Final Project Knowledge Mining

# Reset the workspace: remove every visible object from the current environment
rm(list = ls())

Argentine Aprender Evaluation

Reading in the data

# Load the Aprender dataset with read.delim (tab-separated by default).
# The data directory defaults to the original author's location but can be
# redirected with the KM_PROJECT_DIR environment variable, so the script can
# run on another machine without editing the code.
data_dir <- Sys.getenv("KM_PROJECT_DIR", unset = "")
if (!nzchar(data_dir)) {
  # Unset or empty override: fall back to the original hard-coded directory.
  data_dir <- "/Users/feder/Desktop/km_project"
}
mydata <- read.delim(file.path(data_dir, "aprender_cordoba_dataset.txt"))

Names of variables

# Print the column names of the data frame that was just loaded
names(mydata)

##   [1] "ID1"                            "cod_provincia"                 
##   [3] "sector"                         "ambito"                        
##   [5] "claveseccion"                   "idalumno"                      
##   [7] "ap01_01"                        "ap01_02"                       
##   [9] "ap02"                           "gender"                        
##  [11] "female"                         "ap03"                          
##  [13] "ap04"                           "ap05"                          
##  [15] "ap06"                           "ap07"                          
##  [17] "ap08_a"                         "ap08_b"                        
##  [19] "ap08_c"                         "ap08_d"                        
##  [21] "ap08_e"                         "ap08_f"                        
##  [23] "ap08_g"                         "ap08_h"                        
##  [25] "ap08_i"                         "ap09"                          
##  [27] "ap10"                           "ap11_01"                       
##  [29] "ap11_02"                        "ap11_03"                       
##  [31] "ap11_04"                        "ap11_05"                       
##  [33] "ap11_06"                        "ap11_07"                       
##  [35] "ap11_08"                        "ap12"                          
##  [37] "ap13"                           "ap14_01"                       
##  [39] "ap14_02"                        "ap15"                          
##  [41] "ap16"                           "ap17"                          
##  [43] "ap18_01"                        "ap18_02"                       
##  [45] "ap18_03"                        "ap19"                          
##  [47] "ap21"                           "ap22"                          
##  [49] "ap23_01"                        "ap23_02"                       
##  [51] "ap23_03"                        "ap23_04"                       
##  [53] "ap23_05"                        "ap23_06"                       
##  [55] "ap24"                           "ap25_01"                       
##  [57] "ap25_02"                        "ap25_03"                       
##  [59] "ap26"                           "ap27_a"                        
##  [61] "ap27_b"                         "ap27_c"                        
##  [63] "ap27_d"                         "ap27_e"                        
##  [65] "ap27_f"                         "ap28_01"                       
##  [67] "ap28_02"                        "ap28_03"                       
##  [69] "ap28_04"                        "ap29_01"                       
##  [71] "ap29_02"                        "ap29_03"                       
##  [73] "ap29_04"                        "ap29_05"                       
##  [75] "ap29_06"                        "ap29_07"                       
##  [77] "ap30"                           "ap31_01"                       
##  [79] "ap31_02"                        "ap31_03"                       
##  [81] "ap31_04"                        "ap31_05"                       
##  [83] "ap31_06"                        "ap32_a"                        
##  [85] "ap32_b"                         "ap32_c"                        
##  [87] "ap32_d"                         "ap32_e"                        
##  [89] "ap32_f"                         "ap33_01_a"                     
##  [91] "ap33_02_a"                      "ap33_03_a"                     
##  [93] "ap33_04_a"                      "ap33_01_b"                     
##  [95] "ap33_02_b"                      "ap33_03_b"                     
##  [97] "ap33_04_b"                      "ap33_01bis"                    
##  [99] "ap33_02bis"                     "ap33_03bis"                    
## [101] "ap33_04bis"                     "ap33_05bis"                    
## [103] "ap33_06bis"                     "ap33_07bis"                    
## [105] "ap33_08bis"                     "ap33_09bis"                    
## [107] "ap33_10bis"                     "ap33_11bis"                    
## [109] "ap34"                           "ap35_01"                       
## [111] "ap35_02"                        "ap35_03"                       
## [113] "ap35_04"                        "ap35_05"                       
## [115] "ap36_a"                         "ap36_b"                        
## [117] "ap36_c"                         "ap36_d"                        
## [119] "ap36_e"                         "ap36_f"                        
## [121] "ap37_a"                         "ap37_b"                        
## [123] "ap37_c"                         "ap37_d"                        
## [125] "ap37_e"                         "ap37_f"                        
## [127] "ap37_g"                         "ap37_h"                        
## [129] "ap37_i"                         "ap37_j"                        
## [131] "ap37_k"                         "ap37_l"                        
## [133] "ap37_m"                         "ap37_n"                        
## [135] "ap38_a"                         "ap38_b"                        
## [137] "ap38_c"                         "ap38_d"                        
## [139] "ap38_e"                         "ap38_f"                        
## [141] "ap38_g"                         "ap38_h"                        
## [143] "ap38_i"                         "ap38_j"                        
## [145] "ap38_k"                         "ap38_l"                        
## [147] "ap38_m"                         "ap38_n"                        
## [149] "ap39_01"                        "ap39_02"                       
## [151] "ap39_03"                        "ap39_04"                       
## [153] "ap40_01"                        "ap40_02"                       
## [155] "ap40_03"                        "ap40_04"                       
## [157] "ap40_05"                        "ap40_06"                       
## [159] "ap41_01"                        "ap41_02"                       
## [161] "ap41_03"                        "ap41_04"                       
## [163] "ap41_05"                        "ap41_06"                       
## [165] "ap41_07"                        "ap41_08"                       
## [167] "ap41_09"                        "ap41_10"                       
## [169] "ap41_11"                        "ap42_01"                       
## [171] "ap42_02"                        "ap42_03"                       
## [173] "ap42_04"                        "ap43_a"                        
## [175] "ap43_b"                         "ap43_c"                        
## [177] "ap43_d"                         "ap43_e"                        
## [179] "ap43_f"                         "ap43_g"                        
## [181] "ap43_h"                         "ap44_01"                       
## [183] "ap44_02"                        "ap44_03"                       
## [185] "ap44_04"                        "ap44_05"                       
## [187] "ap45"                           "ap46_01"                       
## [189] "ap46_02"                        "ap46_03"                       
## [191] "ap46_04"                        "ap46_05"                       
## [193] "ap47_01"                        "ap47_02"                       
## [195] "ap47_03"                        "ap47_04"                       
## [197] "ap47_05"                        "ap47_06"                       
## [199] "ap47_07"                        "ap47_08"                       
## [201] "ap47_09"                        "ap47_10"                       
## [203] "ap47_11"                        "ap47_12"                       
## [205] "ap48_a"                         "ap48_b"                        
## [207] "ap48_c"                         "ap48_d"                        
## [209] "ap48_e"                         "ap48_f"                        
## [211] "ap48_g"                         "ap48_h"                        
## [213] "ap48_i"                         "ap49_01"                       
## [215] "ap49_02"                        "ap50_01"                       
## [217] "ap50_02"                        "ap50_03"                       
## [219] "ap50_04"                        "ap50_05"                       
## [221] "ap51_01"                        "ap51_02"                       
## [223] "ap51_03"                        "ap51_04"                       
## [225] "ap51_05"                        "ap51_06"                       
## [227] "ap51_07"                        "ap51_08"                       
## [229] "ap51_09"                        "ap51_10"                       
## [231] "ponder"                         "lpondera"                      
## [233] "mpondera"                       "ldesemp"                       
## [235] "mdesemp"                        "TEL"                           
## [237] "TEM"                            "modelo"                        
## [239] "isocioa_puntaje"                "isocioa"                       
## [241] "isocioal_puntaje"               "isocioal"                      
## [243] "isocioam_puntaje"               "isocioam"                      
## [245] "repitencia_dicotomica"          "jardín"                        
## [247] "ap37_dicotomica"                "ap29_01.dico"                  
## [249] "ap29_02.dico"                   "ap29_03.dico"                  
## [251] "ap29_04.dico"                   "ap29_05.dico"                  
## [253] "ap29_06.dico"                   "ap29_07.dico"                  
## [255] "ap26_rec"                       "trabaja_fuera_hogar"           
## [257] "trabaja_fuera_hogar_remunerado" "migración"                     
## [259] "edadA_junio2019"                "sobreedad"                     
## [261] "infraestructura"                "iinfraestructura"              
## [263] "ap42_01rec"                     "ap42_02rec"                    
## [265] "ap42_03rec"                     "ap42_04rec"

Performance by Sector and Ambit

# 2 x 2 panel grid, filled row by row: language performance on the top row,
# math performance on the bottom row, each split by sector and by ambit.
par(mfrow = c(2, 2))

boxplot(
  ldesemp ~ sector,
  data = mydata,
  main = "Language Performance by Sector",
  sub = "1= Public  2= Private",
  xlab = "Sector",
  ylab = "Language Performance",
  col = "orange"
)

boxplot(
  ldesemp ~ ambito,
  data = mydata,
  main = "Language Performance by Ambit",
  sub = "1= Urban  2= Rural",
  xlab = "Ambit",
  ylab = "Language Performance",
  col = "orange"
)

boxplot(
  mdesemp ~ sector,
  data = mydata,
  main = "Math Performance by Sector",
  sub = "1= Public  2= Private",
  xlab = "Sector",
  ylab = "Math Performance",
  col = "lightblue"
)

boxplot(
  mdesemp ~ ambito,
  data = mydata,
  main = "Math Performance by Ambit",
  sub = "1= Urban  2= Rural",
  xlab = "Ambit",
  ylab = "Math Performance",
  col = "lightblue"
)

Performance by Gender

# Two panels side by side: stacked counts of each performance level
# (one colour per level) within every gender code.
par(mfrow = c(1, 2))

# Rows = language performance level, columns = gender code
counts1 <- table(mydata[["ldesemp"]], mydata[["gender"]])
barplot(
  counts1,
  main = "Language Perf. Level by Gender",
  sub = "1= Male 2= Female 3= Other",
  xlab = "Gender",
  col = c("red", "orange", "lightblue", "forestgreen")
)

# Rows = math performance level, columns = gender code
counts2 <- table(mydata[["mdesemp"]], mydata[["gender"]])
barplot(
  counts2,
  main = "Math Perf. Level by Gender",
  sub = "1= Male 2= Female 3= Other",
  xlab = "Gender",
  col = c("red", "orange", "lightblue", "forestgreen")
)

Performance by School Repetition and Students Who Work

# 2 x 2 panel grid, filled row by row: performance by school repetition on
# the top row, performance by whether the student works on the bottom row.
par(mfrow = c(2, 2))

boxplot(
  ldesemp ~ repitencia_dicotomica,
  data = mydata,
  main = "Language Performance by School Repetition",
  sub = "1= Repeated  2= Non Repeated  3= No answer",
  xlab = "School Repetition",
  ylab = "Language Performance",
  col = "pink"
)

boxplot(
  mdesemp ~ repitencia_dicotomica,
  data = mydata,
  main = "Math Performance by School Repetition",
  sub = "1= Repeated  2= Non Repeated  3= No answer",
  xlab = "School Repetition",
  ylab = "Math Performance",
  col = "pink"
)

boxplot(
  ldesemp ~ trabaja_fuera_hogar,
  data = mydata,
  main = "Language Performance by Students Who Work",
  sub = "1= Yes  2= No  3= No answer",
  xlab = "Students Who Work",
  ylab = "Language Performance",
  col = "darkkhaki"
)

boxplot(
  mdesemp ~ trabaja_fuera_hogar,
  data = mydata,
  main = "Math Performance by Students Who Work",
  sub = "1= Yes  2= No  3= No answer",
  xlab = "Students Who Work",
  ylab = "Math Performance",
  col = "darkkhaki"
)

Performance by Socioeconomic level

# Two panels side by side: stacked counts of each performance level
# (one colour per level) within every socioeconomic-level code.
par(mfrow = c(1, 2))

# Rows = language performance level, columns = socioeconomic level.
# counts1 / counts2 are reused by the proportion tables printed afterwards.
counts1 <- table(mydata[["ldesemp"]], mydata[["isocioa"]])
barplot(
  counts1,
  main = "Language Perfor. by Socioeconomic",
  sub = "-1=Non Answer, 1= Low 2= Medium 3= High",
  xlab = "Socioeconomic Level",
  col = c("red", "orange", "lightblue", "forestgreen")
)

# Rows = math performance level, columns = socioeconomic level
counts2 <- table(mydata[["mdesemp"]], mydata[["isocioa"]])
barplot(
  counts2,
  main = "Math Perfor. by Socioeconomic",
  sub = "-1=Non Answer, 1= Low 2= Medium 3= High",
  xlab = "Socioeconomic Level",
  col = c("red", "orange", "lightblue", "forestgreen")
)

Percentages in tables

# Language level x socioeconomic level as proportions. No margin is given, so
# every cell is divided by the grand total of the table (fractions that sum
# to 1 over the whole table, not row or column percentages).
prop.table(counts1)

##    
##              -1           1           2           3
##   1 0.007986887 0.029563403 0.070868723 0.011592907
##   2 0.007510058 0.030546863 0.103352705 0.021248696
##   3 0.013857845 0.047921323 0.322455670 0.115601252
##   4 0.004112651 0.006854418 0.121531813 0.084994785

# Math level x socioeconomic level as proportions. No margin is given, so
# every cell is divided by the grand total of the table (fractions that sum
# to 1 over the whole table, not row or column percentages).
prop.table(counts2)

##    
##               -1            1            2            3
##   1 0.0149030851 0.0637039692 0.1952035361 0.0383776842
##   2 0.0087208434 0.0341068602 0.1878565242 0.0533106352
##   3 0.0081533913 0.0154406714 0.2175133650 0.1222112714
##   4 0.0009855748 0.0006271839 0.0184869934 0.0203984111

Correlations using a subset of variables (up to 15 variables)

#library(ggplot2)
#library(GGally)
#mydata[,c("ldesemp","mdesemp","gender","sector","ambito","isocioa")]

Testing regressions

# OLS fit of the math performance level on gender, sector, ambit and
# socioeconomic level; sector, ambit and isocioa enter as categorical factors.
fit1 <- lm(
  mdesemp ~ female + factor(sector) + factor(ambito) + factor(isocioa),
  data = mydata
)
summary(fit1)

## 
## Call:
## lm(formula = mdesemp ~ female + factor(sector) + factor(ambito) + 
##     factor(isocioa), data = mydata)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.7191 -0.7191  0.1510  0.6035  2.6035 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       1.721431   0.026585  64.753  < 2e-16 ***
## female           -0.122334   0.009206 -13.288  < 2e-16 ***
## factor(sector)2   0.500041   0.009903  50.494  < 2e-16 ***
## factor(ambito)2  -0.095448   0.017572  -5.432 5.62e-08 ***
## factor(isocioa)1 -0.107166   0.029513  -3.631 0.000283 ***
## factor(isocioa)2  0.222974   0.026710   8.348  < 2e-16 ***
## factor(isocioa)3  0.497616   0.027978  17.786  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.829 on 33044 degrees of freedom
##   (1140 observations deleted due to missingness)
## Multiple R-squared:  0.1624, Adjusted R-squared:  0.1623 
## F-statistic:  1068 on 6 and 33044 DF,  p-value: < 2.2e-16

# Same specification as the math model, with the language performance level
# as the response.
fit2 <- lm(
  ldesemp ~ female + factor(sector) + factor(ambito) + factor(isocioa),
  data = mydata
)
summary(fit2)

## 
## Call:
## lm(formula = ldesemp ~ female + factor(sector) + factor(ambito) + 
##     factor(isocioa), data = mydata)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.32520 -0.36500  0.03754  0.67480  1.96059 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       2.250895   0.027003  83.356  < 2e-16 ***
## female            0.114102   0.009393  12.147  < 2e-16 ***
## factor(sector)2   0.386280   0.010104  38.229  < 2e-16 ***
## factor(ambito)2  -0.141506   0.017909  -7.901 2.84e-15 ***
## factor(isocioa)1 -0.069978   0.029977  -2.334   0.0196 *  
## factor(isocioa)2  0.325282   0.027136  11.987  < 2e-16 ***
## factor(isocioa)3  0.573918   0.028438  20.182  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8462 on 33087 degrees of freedom
##   (1097 observations deleted due to missingness)
## Multiple R-squared:  0.1316, Adjusted R-squared:  0.1314 
## F-statistic: 835.6 on 6 and 33087 DF,  p-value: < 2.2e-16

GRADUATE Admissions Case

Clear my memory

# Drop every visible object left over from the previous case study
rm(list = ls())

Reading in the data

# Load the graduate-admissions dataset with read.delim (tab-separated by
# default). The data directory defaults to the original author's location but
# can be redirected with the KM_PROJECT_DIR environment variable, so the
# script can run on another machine without editing the code.
data_dir <- Sys.getenv("KM_PROJECT_DIR", unset = "")
if (!nzchar(data_dir)) {
  # Unset or empty override: fall back to the original hard-coded directory.
  data_dir <- "/Users/feder/Desktop/km_project"
}
mydata <- read.delim(file.path(data_dir, "admission_predict.txt"))

Names of variables

# Print the column names of the admissions data frame
names(mydata)

## [1] "Serial.No."        "GRE.Score"         "TOEFL.Score"      
## [4] "University.Rating" "SOP"               "LOR"              
## [7] "CGPA"              "Research"          "Chance.of.Admit"

Checking for linearity (each predictor against the dependent variable) with scatterplots

# 3 x 3 panel grid for the linearity checks: each panel is a scatterplot of
# one predictor against Chance.of.Admit with a loess smoother on top.
# The `mydata$...` expressions are passed directly because scatter.smooth
# derives its default axis labels from them.
par(mfrow = c(3, 3))

# Title typo fixed: the exam is the GRE, not "GRES".
scatter.smooth(x = mydata$GRE.Score, y = mydata$Chance.of.Admit,
               main = "Chance of Admission ~ GRE Scores")
scatter.smooth(x = mydata$TOEFL.Score, y = mydata$Chance.of.Admit,
               main = "Chance of Admission ~ TOEFL Scores")

# NOTE(review): the rendered output shows loess near-singularity warnings
# after this panel - presumably because the rating takes only a few distinct
# values; confirm whether a smoother is appropriate here.
scatter.smooth(x = mydata$University.Rating, y = mydata$Chance.of.Admit,
               main = "Chance of Admission ~ University Ranking")

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at 3

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1

# Remaining linearity panels (same predictor-vs-Chance.of.Admit layout).
scatter.smooth(
  x = mydata$SOP,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ Statement of Purpose Strength"
)

scatter.smooth(
  x = mydata$LOR,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ Letter of Recommendation Strength"
)

scatter.smooth(
  x = mydata$CGPA,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ Undergraduate GPA"
)
# NOTE(review): the rendered output shows loess near-singularity warnings
# after this panel - presumably because Research has very few distinct values.
scatter.smooth(
  x = mydata$Research,
  y = mydata$Chance.of.Admit,
  main = "Chance of Admission ~ Research Experience"
)

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 6.1612e-016

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## pseudoinverse used at -0.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## neighborhood radius 1.005

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## reciprocal condition number 6.1612e-016

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = FALSE, :
## There are other near singularities as well. 1.01

Checking for outliers with boxplots

# Draw a boxplot of one numeric vector and list its outlying values in the
# subtitle.
#   values: numeric vector to plot
#   title:  panel title
# boxplot.stats()$out holds the outlying VALUES (not row numbers). They are
# collapsed into a single comma-separated string so `sub` always receives
# exactly one caption, however many outliers there are.
boxplot_with_outliers <- function(values, title) {
  outliers <- boxplot.stats(values)$out
  outlier_label <- if (length(outliers) > 0) {
    paste(outliers, collapse = ", ")
  } else {
    "none"
  }
  boxplot(values, main = title, sub = paste("Outlier values:", outlier_label))
  invisible(outliers)
}

# 2 x 3 panel grid: the response first, then five of the predictors.
par(mfrow = c(2, 3))
boxplot_with_outliers(mydata$Chance.of.Admit, "Chance of Admission")
boxplot_with_outliers(mydata$GRE.Score, "GRE Scores")
boxplot_with_outliers(mydata$TOEFL.Score, "TOEFL Scores")
boxplot_with_outliers(mydata$University.Rating, "University Ranking")
boxplot_with_outliers(mydata$SOP, "Statement of Purpose Strength")
boxplot_with_outliers(mydata$LOR, "Letter of Recommendation Strength")

Checking for normality with density plots

library(e1071)

## Warning: package 'e1071' was built under R version 4.0.4

# Draw one filled kernel-density panel for a numeric vector, with its
# skewness (e1071::skewness, rounded to 2 decimals) as the subtitle.
#   values: numeric vector to summarise
#   title:  panel title
# The density is estimated once and reused for both the outline and the fill.
# The y-axis is labelled "Density" because a kernel density curve does not
# show frequencies.
plot_density_panel <- function(values, title) {
  dens <- density(values)
  plot(dens,
       main = title,
       ylab = "Density",
       sub = paste("Skewness:", round(e1071::skewness(values), 2)))
  polygon(dens, col = "lightblue")
  invisible(dens)
}

# 3 x 3 panel grid; seven panels are drawn.
par(mfrow = c(3, 3))
plot_density_panel(mydata$Chance.of.Admit, "Chance of Admission")
plot_density_panel(mydata$GRE.Score, "GRE Scores")
plot_density_panel(mydata$TOEFL.Score, "TOEFL Scores")
plot_density_panel(mydata$University.Rating, "University Ranking")
plot_density_panel(mydata$SOP, "Statement of Purpose Strength")
plot_density_panel(mydata$LOR, "Letter of Recommendation Strength")
plot_density_panel(mydata$Research, "Research")

Checking correlations: scatterplot matrix

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 4.0.3

library(GGally)

## Warning: package 'GGally' was built under R version 4.0.4

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# Scatterplot matrix (GGally) over every column of the data frame.
# NOTE(review): this includes Serial.No., which looks like a row identifier -
# confirm whether it should be left out of the matrix.
ggpairs(mydata)

Regression analysis

# Full model: chance of admission regressed on all seven predictors.
fit1 <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + University.Rating + SOP + LOR +
    CGPA + Research,
  data = mydata
)
# In the summary, University.Rating and SOP are not statistically significant,
# and the F-statistic is lower than that of the reduced model fitted next.
summary(fit1)

## 
## Call:
## lm(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + University.Rating + 
##     SOP + LOR + CGPA + Research, data = mydata)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.266657 -0.023327  0.009191  0.033714  0.156818 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       -1.2757251  0.1042962 -12.232  < 2e-16 ***
## GRE.Score          0.0018585  0.0005023   3.700 0.000240 ***
## TOEFL.Score        0.0027780  0.0008724   3.184 0.001544 ** 
## University.Rating  0.0059414  0.0038019   1.563 0.118753    
## SOP                0.0015861  0.0045627   0.348 0.728263    
## LOR                0.0168587  0.0041379   4.074 5.38e-05 ***
## CGPA               0.1183851  0.0097051  12.198  < 2e-16 ***
## Research           0.0243075  0.0066057   3.680 0.000259 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.05999 on 492 degrees of freedom
## Multiple R-squared:  0.8219, Adjusted R-squared:  0.8194 
## F-statistic: 324.4 on 7 and 492 DF,  p-value: < 2.2e-16

# Reduced model: the full specification without University.Rating and SOP.
fit2 <- lm(
  Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + CGPA + Research,
  data = mydata
)
summary(fit2)

## 
## Call:
## lm(formula = Chance.of.Admit ~ GRE.Score + TOEFL.Score + LOR + 
##     CGPA + Research, data = mydata)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.265965 -0.023835  0.008003  0.035543  0.158379 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.3357018  0.0990753 -13.482  < 2e-16 ***
## GRE.Score    0.0018892  0.0005024   3.760 0.000190 ***
## TOEFL.Score  0.0030174  0.0008619   3.501 0.000506 ***
## LOR          0.0193203  0.0037939   5.092 5.04e-07 ***
## CGPA         0.1229798  0.0093018  13.221  < 2e-16 ***
## Research     0.0251649  0.0065988   3.814 0.000154 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06007 on 494 degrees of freedom
## Multiple R-squared:  0.8207, Adjusted R-squared:  0.8188 
## F-statistic: 452.1 on 5 and 494 DF,  p-value: < 2.2e-16

3D visualization

library(plotly)

## Warning: package 'plotly' was built under R version 4.0.3

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

# Interactive 3D scatter: CGPA on x, chance of admission on y, research
# experience on z. `mode = "markers"` is stated explicitly; without it plotly
# falls back to markers anyway but prints a "No scatter3d mode specifed"
# notice on every render.
# The object keeps the name `plot` (it shadows graphics::plot only as a
# variable; calls to plot() still resolve to the function).
plot <- plot_ly(mydata,
                x = ~CGPA,
                y = ~Chance.of.Admit,
                z = ~Research,
                type = "scatter3d",
                mode = "markers",
                size = 0.02)
plot

## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plotly.com/r/reference/#scatter-mode

Progress Report Project

Federico Ferrero

4/14/2021

Final Project Knowledge Mining

Argentine Aprender Evaluation

Reading in the data

Names of variables

Performance by Sector and Ambit

Performance by Gender

Performance by School Repetition and Students Who Work

Performance by Socioeconomic level

Percentages in tables

Correlations using a subset of variables (up to 15 variables)

Testing regressions

GRADUATE Admissions Case

Clear my memory

Reading in the data

Names of variables

Checking for linearity (each predictor against the dependent variable) with scatterplots

Checking for outliers with boxplots

Checking for normality with density plots

Checking correlations: scatterplot matrix

Regression analysis

3D visualization