Final Project Knowledge Mining

# Start from an empty workspace: delete every object that ls() reports.
# Objects whose names begin with a dot are not listed, so they are kept.
rm(list = ls(all.names = FALSE))

Argentine Aprender Evaluation

Reading in the data

# Load the Aprender extract from its tab-delimited text file.
# NOTE(review): this absolute path only resolves on the original author's machine.
mydata <- read.delim(
  file = "/Users/feder/Desktop/km_project/aprender_cordoba_dataset.txt"
)

Names of variables

# Print the column names of the data frame (identical to names() for a data frame).
colnames(mydata)
##   [1] "ID1"                            "cod_provincia"                 
##   [3] "sector"                         "ambito"                        
##   [5] "claveseccion"                   "idalumno"                      
##   [7] "ap01_01"                        "ap01_02"                       
##   [9] "ap02"                           "gender"                        
##  [11] "female"                         "ap03"                          
##  [13] "ap04"                           "ap05"                          
##  [15] "ap06"                           "ap07"                          
##  [17] "ap08_a"                         "ap08_b"                        
##  [19] "ap08_c"                         "ap08_d"                        
##  [21] "ap08_e"                         "ap08_f"                        
##  [23] "ap08_g"                         "ap08_h"                        
##  [25] "ap08_i"                         "ap09"                          
##  [27] "ap10"                           "ap11_01"                       
##  [29] "ap11_02"                        "ap11_03"                       
##  [31] "ap11_04"                        "ap11_05"                       
##  [33] "ap11_06"                        "ap11_07"                       
##  [35] "ap11_08"                        "ap12"                          
##  [37] "ap13"                           "ap14_01"                       
##  [39] "ap14_02"                        "ap15"                          
##  [41] "ap16"                           "ap17"                          
##  [43] "ap18_01"                        "ap18_02"                       
##  [45] "ap18_03"                        "ap19"                          
##  [47] "ap21"                           "ap22"                          
##  [49] "ap23_01"                        "ap23_02"                       
##  [51] "ap23_03"                        "ap23_04"                       
##  [53] "ap23_05"                        "ap23_06"                       
##  [55] "ap24"                           "ap25_01"                       
##  [57] "ap25_02"                        "ap25_03"                       
##  [59] "ap26"                           "ap27_a"                        
##  [61] "ap27_b"                         "ap27_c"                        
##  [63] "ap27_d"                         "ap27_e"                        
##  [65] "ap27_f"                         "ap28_01"                       
##  [67] "ap28_02"                        "ap28_03"                       
##  [69] "ap28_04"                        "ap29_01"                       
##  [71] "ap29_02"                        "ap29_03"                       
##  [73] "ap29_04"                        "ap29_05"                       
##  [75] "ap29_06"                        "ap29_07"                       
##  [77] "ap30"                           "ap31_01"                       
##  [79] "ap31_02"                        "ap31_03"                       
##  [81] "ap31_04"                        "ap31_05"                       
##  [83] "ap31_06"                        "ap32_a"                        
##  [85] "ap32_b"                         "ap32_c"                        
##  [87] "ap32_d"                         "ap32_e"                        
##  [89] "ap32_f"                         "ap33_01_a"                     
##  [91] "ap33_02_a"                      "ap33_03_a"                     
##  [93] "ap33_04_a"                      "ap33_01_b"                     
##  [95] "ap33_02_b"                      "ap33_03_b"                     
##  [97] "ap33_04_b"                      "ap33_01bis"                    
##  [99] "ap33_02bis"                     "ap33_03bis"                    
## [101] "ap33_04bis"                     "ap33_05bis"                    
## [103] "ap33_06bis"                     "ap33_07bis"                    
## [105] "ap33_08bis"                     "ap33_09bis"                    
## [107] "ap33_10bis"                     "ap33_11bis"                    
## [109] "ap34"                           "ap35_01"                       
## [111] "ap35_02"                        "ap35_03"                       
## [113] "ap35_04"                        "ap35_05"                       
## [115] "ap36_a"                         "ap36_b"                        
## [117] "ap36_c"                         "ap36_d"                        
## [119] "ap36_e"                         "ap36_f"                        
## [121] "ap37_a"                         "ap37_b"                        
## [123] "ap37_c"                         "ap37_d"                        
## [125] "ap37_e"                         "ap37_f"                        
## [127] "ap37_g"                         "ap37_h"                        
## [129] "ap37_i"                         "ap37_j"                        
## [131] "ap37_k"                         "ap37_l"                        
## [133] "ap37_m"                         "ap37_n"                        
## [135] "ap38_a"                         "ap38_b"                        
## [137] "ap38_c"                         "ap38_d"                        
## [139] "ap38_e"                         "ap38_f"                        
## [141] "ap38_g"                         "ap38_h"                        
## [143] "ap38_i"                         "ap38_j"                        
## [145] "ap38_k"                         "ap38_l"                        
## [147] "ap38_m"                         "ap38_n"                        
## [149] "ap39_01"                        "ap39_02"                       
## [151] "ap39_03"                        "ap39_04"                       
## [153] "ap40_01"                        "ap40_02"                       
## [155] "ap40_03"                        "ap40_04"                       
## [157] "ap40_05"                        "ap40_06"                       
## [159] "ap41_01"                        "ap41_02"                       
## [161] "ap41_03"                        "ap41_04"                       
## [163] "ap41_05"                        "ap41_06"                       
## [165] "ap41_07"                        "ap41_08"                       
## [167] "ap41_09"                        "ap41_10"                       
## [169] "ap41_11"                        "ap42_01"                       
## [171] "ap42_02"                        "ap42_03"                       
## [173] "ap42_04"                        "ap43_a"                        
## [175] "ap43_b"                         "ap43_c"                        
## [177] "ap43_d"                         "ap43_e"                        
## [179] "ap43_f"                         "ap43_g"                        
## [181] "ap43_h"                         "ap44_01"                       
## [183] "ap44_02"                        "ap44_03"                       
## [185] "ap44_04"                        "ap44_05"                       
## [187] "ap45"                           "ap46_01"                       
## [189] "ap46_02"                        "ap46_03"                       
## [191] "ap46_04"                        "ap46_05"                       
## [193] "ap47_01"                        "ap47_02"                       
## [195] "ap47_03"                        "ap47_04"                       
## [197] "ap47_05"                        "ap47_06"                       
## [199] "ap47_07"                        "ap47_08"                       
## [201] "ap47_09"                        "ap47_10"                       
## [203] "ap47_11"                        "ap47_12"                       
## [205] "ap48_a"                         "ap48_b"                        
## [207] "ap48_c"                         "ap48_d"                        
## [209] "ap48_e"                         "ap48_f"                        
## [211] "ap48_g"                         "ap48_h"                        
## [213] "ap48_i"                         "ap49_01"                       
## [215] "ap49_02"                        "ap50_01"                       
## [217] "ap50_02"                        "ap50_03"                       
## [219] "ap50_04"                        "ap50_05"                       
## [221] "ap51_01"                        "ap51_02"                       
## [223] "ap51_03"                        "ap51_04"                       
## [225] "ap51_05"                        "ap51_06"                       
## [227] "ap51_07"                        "ap51_08"                       
## [229] "ap51_09"                        "ap51_10"                       
## [231] "ponder"                         "lpondera"                      
## [233] "mpondera"                       "ldesemp"                       
## [235] "mdesemp"                        "TEL"                           
## [237] "TEM"                            "modelo"                        
## [239] "isocioa_puntaje"                "isocioa"                       
## [241] "isocioal_puntaje"               "isocioal"                      
## [243] "isocioam_puntaje"               "isocioam"                      
## [245] "repitencia_dicotomica"          "jardín"                        
## [247] "ap37_dicotomica"                "ap29_01.dico"                  
## [249] "ap29_02.dico"                   "ap29_03.dico"                  
## [251] "ap29_04.dico"                   "ap29_05.dico"                  
## [253] "ap29_06.dico"                   "ap29_07.dico"                  
## [255] "ap26_rec"                       "trabaja_fuera_hogar"           
## [257] "trabaja_fuera_hogar_remunerado" "migración"                     
## [259] "edadA_junio2019"                "sobreedad"                     
## [261] "infraestructura"                "iinfraestructura"              
## [263] "ap42_01rec"                     "ap42_02rec"                    
## [265] "ap42_03rec"                     "ap42_04rec"

Performance by Sector and Ambit

# 2 x 2 panel filled row by row: language results on the top row, math results
# on the bottom row, each split by school sector and by ambit.
par(mfrow = c(2, 2))

boxplot(
  ldesemp ~ sector,
  data = mydata,
  main = "Language Performance by Sector",
  sub = "1= Public  2= Private",
  xlab = "Sector",
  ylab = "Language Performance",
  col = "orange"
)

boxplot(
  ldesemp ~ ambito,
  data = mydata,
  main = "Language Performance by Ambit",
  sub = "1= Urban  2= Rural",
  xlab = "Ambit",
  ylab = "Language Performance",
  col = "orange"
)

boxplot(
  mdesemp ~ sector,
  data = mydata,
  main = "Math Performance by Sector",
  sub = "1= Public  2= Private",
  xlab = "Sector",
  ylab = "Math Performance",
  col = "lightblue"
)

boxplot(
  mdesemp ~ ambito,
  data = mydata,
  main = "Math Performance by Ambit",
  sub = "1= Urban  2= Rural",
  xlab = "Ambit",
  ylab = "Math Performance",
  col = "lightblue"
)

Performance by Gender

# Two side-by-side panels: language on the left, math on the right.
par(mfrow = c(1, 2))

# Cross-tabulate performance level (rows) by gender code (columns); barplot()
# then draws one stacked bar per gender, one colour per performance level.
counts1 <- table(mydata[["ldesemp"]], mydata[["gender"]])
barplot(
  counts1,
  main = "Language Perf. Level by Gender",
  sub = "1= Male 2= Female 3= Other",
  xlab = "Gender",
  col = c("red", "orange", "lightblue", "forestgreen")
)

counts2 <- table(mydata[["mdesemp"]], mydata[["gender"]])
barplot(
  counts2,
  main = "Math Perf. Level by Gender",
  sub = "1= Male 2= Female 3= Other",
  xlab = "Gender",
  col = c("red", "orange", "lightblue", "forestgreen")
)

Performance by School Repetition and Students Who Work

# 2 x 2 panel filled row by row: the top row splits both outcomes by school
# repetition, the bottom row by the trabaja_fuera_hogar (students who work) code.
par(mfrow = c(2, 2))

boxplot(
  ldesemp ~ repitencia_dicotomica,
  data = mydata,
  main = "Language Performance by School Repetition",
  sub = "1= Repeated  2= Non Repeated  3= No answer",
  xlab = "School Repetition",
  ylab = "Language Performance",
  col = "pink"
)

boxplot(
  mdesemp ~ repitencia_dicotomica,
  data = mydata,
  main = "Math Performance by School Repetition",
  sub = "1= Repeated  2= Non Repeated  3= No answer",
  xlab = "School Repetition",
  ylab = "Math Performance",
  col = "pink"
)

boxplot(
  ldesemp ~ trabaja_fuera_hogar,
  data = mydata,
  main = "Language Performance by Students Who Work",
  sub = "1= Yes  2= No  3= No answer",
  xlab = "Students Who Work",
  ylab = "Language Performance",
  col = "darkkhaki"
)

boxplot(
  mdesemp ~ trabaja_fuera_hogar,
  data = mydata,
  main = "Math Performance by Students Who Work",
  sub = "1= Yes  2= No  3= No answer",
  xlab = "Students Who Work",
  ylab = "Math Performance",
  col = "darkkhaki"
)

Performance by Socioeconomic level

# Two side-by-side panels: language on the left, math on the right.
par(mfrow = c(1, 2))

# Performance level (rows) by socioeconomic code (columns). counts1 and counts2
# are overwritten here and reused by the proportion tables that follow.
counts1 <- table(mydata[["ldesemp"]], mydata[["isocioa"]])
barplot(
  counts1,
  main = "Language Perfor. by Socioeconomic",
  sub = "-1=Non Answer, 1= Low 2= Medium 3= High",
  xlab = "Socioeconomic Level",
  col = c("red", "orange", "lightblue", "forestgreen")
)

counts2 <- table(mydata[["mdesemp"]], mydata[["isocioa"]])
barplot(
  counts2,
  main = "Math Perfor. by Socioeconomic",
  sub = "-1=Non Answer, 1= Low 2= Medium 3= High",
  xlab = "Socioeconomic Level",
  col = c("red", "orange", "lightblue", "forestgreen")
)

Cell proportions (share of the grand total) for the two socioeconomic-level tables

# Each cell of counts1 as a share of the table's grand total (no margin, so the
# whole table sums to 1 rather than each row or column).
prop.table(counts1, margin = NULL)
##    
##              -1           1           2           3
##   1 0.007986887 0.029563403 0.070868723 0.011592907
##   2 0.007510058 0.030546863 0.103352705 0.021248696
##   3 0.013857845 0.047921323 0.322455670 0.115601252
##   4 0.004112651 0.006854418 0.121531813 0.084994785
# Each cell of counts2 as a share of the table's grand total (no margin, so the
# whole table sums to 1 rather than each row or column).
prop.table(counts2, margin = NULL)
##    
##               -1            1            2            3
##   1 0.0149030851 0.0637039692 0.1952035361 0.0383776842
##   2 0.0087208434 0.0341068602 0.1878565242 0.0533106352
##   3 0.0081533913 0.0154406714 0.2175133650 0.1222112714
##   4 0.0009855748 0.0006271839 0.0184869934 0.0203984111

Correlations using a subset of variables (up to 15 variables)

#library(ggplot2)
#library(GGally)
#mydata[,c("ldesemp","mdesemp","gender","sector","ambito","isocioa")]

Testing regressions

# Linear model of the math performance level on gender, with sector, ambit and
# socioeconomic level entered as categorical factors.
# NOTE(review): code -1 of isocioa (labelled "Non Answer" in the bar charts) is
# the reference level of factor(isocioa) here — confirm that is intended.
fit1 <- lm(
  mdesemp ~ female + factor(sector) + factor(ambito) + factor(isocioa),
  data = mydata
)
summary(fit1)
## 
## Call:
## lm(formula = mdesemp ~ female + factor(sector) + factor(ambito) + 
##     factor(isocioa), data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7191 -0.7191  0.1510  0.6035  2.6035 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.721431   0.026585  64.753  < 2e-16 ***
## female           -0.122334   0.009206 -13.288  < 2e-16 ***
## factor(sector)2   0.500041   0.009903  50.494  < 2e-16 ***
## factor(ambito)2  -0.095448   0.017572  -5.432 5.62e-08 ***
## factor(isocioa)1 -0.107166   0.029513  -3.631 0.000283 ***
## factor(isocioa)2  0.222974   0.026710   8.348  < 2e-16 ***
## factor(isocioa)3  0.497616   0.027978  17.786  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.829 on 33044 degrees of freedom
##   (1140 observations deleted due to missingness)
## Multiple R-squared:  0.1624, Adjusted R-squared:  0.1623 
## F-statistic:  1068 on 6 and 33044 DF,  p-value: < 2.2e-16
# Same specification as the math model, with the language performance level as
# the outcome.
# NOTE(review): code -1 of isocioa (labelled "Non Answer" in the bar charts) is
# the reference level of factor(isocioa) here — confirm that is intended.
fit2 <- lm(
  ldesemp ~ female + factor(sector) + factor(ambito) + factor(isocioa),
  data = mydata
)
summary(fit2)
## 
## Call:
## lm(formula = ldesemp ~ female + factor(sector) + factor(ambito) + 
##     factor(isocioa), data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.32520 -0.36500  0.03754  0.67480  1.96059 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.250895   0.027003  83.356  < 2e-16 ***
## female            0.114102   0.009393  12.147  < 2e-16 ***
## factor(sector)2   0.386280   0.010104  38.229  < 2e-16 ***
## factor(ambito)2  -0.141506   0.017909  -7.901 2.84e-15 ***
## factor(isocioa)1 -0.069978   0.029977  -2.334   0.0196 *  
## factor(isocioa)2  0.325282   0.027136  11.987  < 2e-16 ***
## factor(isocioa)3  0.573918   0.028438  20.182  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8462 on 33087 degrees of freedom
##   (1097 observations deleted due to missingness)
## Multiple R-squared:  0.1316, Adjusted R-squared:  0.1314 
## F-statistic: 835.6 on 6 and 33087 DF,  p-value: < 2.2e-16

GRADUATE Admissions Case

Clear my memory

# Drop every object that ls() reports before starting the second case study, so
# nothing from the previous analysis is carried over.
rm(list = ls(all.names = FALSE))

Reading in the data

# Load the graduate admissions data from its tab-delimited text file.
# NOTE(review): this absolute path only resolves on the original author's machine.
mydata <- read.delim(
  file = "/Users/feder/Desktop/km_project/admission_predict.txt"
)

Names of variables

# Print the column names of the data frame (identical to names() for a data frame).
colnames(mydata)
## [1] "Serial.No."        "GRE.Score"         "TOEFL.Score"      
## [4] "University.Rating" "SOP"               "LOR"              
## [7] "CGPA"              "Research"          "Chance.of.Admit"

Checking for linearity (each predictor against the dependent variable) with scatterplots

# 3 x 3 grid shared by all predictor-vs-outcome scatterplots in this section.
par(mfrow = c(3, 3))

# Each call draws the points plus a loess smoother. No xlab/ylab is supplied, so
# the axis labels come from the x/y argument expressions; they are deliberately
# kept as mydata$... to leave the plots' axis labels unchanged.
scatter.smooth(
  x = mydata$GRE.Score,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ GRE Scores"  # fixed title typo ("GRES")
)
scatter.smooth(
  x = mydata$TOEFL.Score,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ TOEFL Scores"
)

# NOTE(review): this call emits loess "pseudoinverse" warnings, presumably
# because University.Rating takes only a few distinct values — confirm.
scatter.smooth(
  x = mydata$University.Rating,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ University Ranking"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1
# Remaining predictor-vs-outcome scatterplots with a loess smoother. No
# xlab/ylab is supplied, so the axis labels come from the x/y argument
# expressions; they are deliberately kept as mydata$... to preserve the output.
scatter.smooth(
  x = mydata$SOP,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ Statement of Purpose Strength"
)

scatter.smooth(
  x = mydata$LOR,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ Letter of Recommendation Strength"
)

scatter.smooth(
  x = mydata$CGPA,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ Undergraduate GPA"
)
scatter.smooth(
  x = mydata$Research,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ Research Experience"
)
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 6.1612e-016
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 6.1612e-016
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01

Checking for outliers with boxplots

# Build a single-line caption listing the outlier values of `x`.
#
# x: numeric vector.
# Returns one character string. boxplot.stats()$out holds the VALUES of the
# points beyond the whiskers (not row numbers), so the caption says "values".
outlier_caption <- function(x) {
  outlier_values <- boxplot.stats(x)$out
  if (length(outlier_values) == 0) {
    return("Outlier values: none")
  }
  # collapse = ", " joins everything into one string; without it paste()
  # returns one string per outlier instead of a single subtitle.
  paste("Outlier values:", paste(outlier_values, collapse = ", "))
}

# 2 x 3 grid, one boxplot per variable, in the order listed below.
par(mfrow = c(2, 3))

boxplot_titles <- c(
  Chance.of.Admit = "Chance of Admission",
  GRE.Score = "GRE Scores",
  TOEFL.Score = "TOEFL Scores",
  University.Rating = "University Ranking",
  SOP = "Statement of Purpose Strength",
  LOR = "Letter of Recommendation Strength"
)
for (column in names(boxplot_titles)) {
  boxplot(
    mydata[[column]],
    main = boxplot_titles[[column]],
    sub = outlier_caption(mydata[[column]])
  )
}

Checking for normality with density plots

library(e1071)
## Warning: package 'e1071' was built under R version 4.0.4
# Kernel density plot per variable, with the sample skewness in the subtitle.
# 3 x 3 grid; seven panels are used.
par(mfrow = c(3, 3))

density_titles <- c(
  Chance.of.Admit = "Chance of Admission",
  GRE.Score = "GRE Scores",
  TOEFL.Score = "TOEFL Scores",
  University.Rating = "University Ranking",
  SOP = "Statement of Purpose Strength",
  LOR = "Letter of Recommendation Strength",
  Research = "Research"
)
for (column in names(density_titles)) {
  values <- mydata[[column]]
  # Estimate the density once and reuse it for both the outline and the fill.
  column_density <- density(values)
  plot(
    column_density,
    main = density_titles[[column]],
    # The curve is a density estimate, not a count, so label it accordingly.
    ylab = "Density",
    sub = paste("Skewness:", round(e1071::skewness(values), 2))
  )
  polygon(column_density, col = "lightblue")
}

Checking correlations: scatterplot matrix

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.3
library(GGally)
## Warning: package 'GGally' was built under R version 4.0.4
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Scatterplot/correlation matrix for the analysis variables only.
# Serial.No. is left out: it looks like a row identifier (it is also excluded
# from the regressions that follow) — TODO confirm.
pair_columns <- names(mydata) != "Serial.No."
ggpairs(mydata[, pair_columns, drop = FALSE])

Regression analysis

# Full model: every candidate predictor of the admission chance.
# In its summary University.Rating and SOP are not statistically significant,
# and the F-statistic is lower than for the reduced model fitted next.
fit1 <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + University.Rating + SOP + LOR +
    CGPA + Research,
  data = mydata
)
summary(fit1)
## 
## Call:
## lm(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + University.Rating + 
##     SOP + LOR + CGPA + Research, data = mydata)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.266657 -0.023327  0.009191  0.033714  0.156818 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.2757251  0.1042962 -12.232  < 2e-16 ***
## GRE.Score          0.0018585  0.0005023   3.700 0.000240 ***
## TOEFL.Score        0.0027780  0.0008724   3.184 0.001544 ** 
## University.Rating  0.0059414  0.0038019   1.563 0.118753    
## SOP                0.0015861  0.0045627   0.348 0.728263    
## LOR                0.0168587  0.0041379   4.074 5.38e-05 ***
## CGPA               0.1183851  0.0097051  12.198  < 2e-16 ***
## Research           0.0243075  0.0066057   3.680 0.000259 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05999 on 492 degrees of freedom
## Multiple R-squared:  0.8219, Adjusted R-squared:  0.8194 
## F-statistic: 324.4 on 7 and 492 DF,  p-value: < 2.2e-16
# Reduced model: the full specification without University.Rating and SOP.
fit2 <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
  data = mydata
)
summary(fit2)
## 
## Call:
## lm(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + 
##     CGPA + Research, data = mydata)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.265965 -0.023835  0.008003  0.035543  0.158379 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.3357018  0.0990753 -13.482  < 2e-16 ***
## GRE.Score    0.0018892  0.0005024   3.760 0.000190 ***
## TOEFL.Score  0.0030174  0.0008619   3.501 0.000506 ***
## LOR          0.0193203  0.0037939   5.092 5.04e-07 ***
## CGPA         0.1229798  0.0093018  13.221  < 2e-16 ***
## Research     0.0251649  0.0065988   3.814 0.000154 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06007 on 494 degrees of freedom
## Multiple R-squared:  0.8207, Adjusted R-squared:  0.8188 
## F-statistic: 452.1 on 5 and 494 DF,  p-value: < 2.2e-16

3D visualization

library(plotly)
## Warning: package 'plotly' was built under R version 4.0.3
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# Interactive 3D scatter with CGPA on x, Chance.of.Admit on y and Research on z.
# No `mode` is passed, so plotly falls back to markers and reports that when
# the widget is printed.
# NOTE(review): the object name `plot` is the same as the base graphics
# function plot(); kept as-is in case later code refers to it.
plot <- plot_ly(
  data = mydata,
  x = ~CGPA,
  y = ~Chance.of.Admit,
  z = ~Research,
  type = "scatter3d",
  size = 0.02
)
plot
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode