R Programming Lab (Logistic Regression)

Adapted from ISLR Chapter 4 Lab

Load ISLR library

require(ISLR)
## Loading required package: ISLR
## Warning: package 'ISLR' was built under R version 4.0.3

Check dataset Smarket

?Smarket
## starting httpd help server ... done
names(Smarket)
## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"
summary(Smarket)
##       Year           Lag1                Lag2                Lag3          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000  
##  Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000  
##       Lag4                Lag5              Volume           Today          
##  Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000  
##  1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500  
##  Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500  
##  Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138  
##  3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750  
##  Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000  
##  Direction 
##  Down:602  
##  Up  :648  
##            
##            
##            
## 

Create a dataframe for data browsing

sm=Smarket

Plot the data with smaller dots

pairs(Smarket,col=Smarket$Direction,cex=.5)

Logistic regression

glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial)
summary(glm.fit)
## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.446  -1.203   1.065   1.145   1.326  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3
glm.probs=predict(glm.fit,type="response") 
glm.probs[1:5]
##         1         2         3         4         5 
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812
glm.pred=ifelse(glm.probs>0.5,"Up","Down")
attach(Smarket)
table(glm.pred,Direction)
##         Direction
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507
mean(glm.pred==Direction)
## [1] 0.5216

Make training and test set

train = Year<2005
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") 
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
Direction.2005=Smarket$Direction[!train]
table(glm.pred,Direction.2005)
##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44
mean(glm.pred==Direction.2005)
## [1] 0.4801587

Fit smaller model

glm.fit=glm(Direction~Lag1+Lag2,
            data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") 
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
table(glm.pred,Direction.2005)
##         Direction.2005
## glm.pred Down  Up
##     Down   35  35
##     Up     76 106
mean(glm.pred==Direction.2005)
## [1] 0.5595238

Check accuracy rate

106/(76+106)
## [1] 0.5824176

Logistic regerssion with TEDS2016

library(haven)
TEDS_2016 <-  read_stata("https://github.com/datageneration/home/blob/master/DataProgramming/data/TEDS_2016.dta?raw=true")
names(TEDS_2016)
##  [1] "District"        "Sex"             "Age"             "Edu"            
##  [5] "Arear"           "Career"          "Career8"         "Ethnic"         
##  [9] "Party"           "PartyID"         "Tondu"           "Tondu3"         
## [13] "nI2"             "votetsai"        "green"           "votetsai_nm"    
## [17] "votetsai_all"    "Independence"    "Unification"     "sq"             
## [21] "Taiwanese"       "edu"             "female"          "whitecollar"    
## [25] "lowincome"       "income"          "income_nm"       "age"            
## [29] "KMT"             "DPP"             "npp"             "noparty"        
## [33] "pfp"             "South"           "north"           "Minnan_father"  
## [37] "Mainland_father" "Econ_worse"      "Inequality"      "inequality5"    
## [41] "econworse5"      "Govt_for_public" "pubwelf5"        "Govt_dont_care" 
## [45] "highincome"      "votekmt"         "votekmt_nm"      "Blue"           
## [49] "Green"           "No_Party"        "voteblue"        "voteblue_nm"    
## [53] "votedpp_1"       "votekmt_1"

Logistic regression

teds.lm.fit=glm(votetsai~female,
            data=TEDS_2016,family=binomial)
summary(teds.lm.fit)
## 
## Call:
## glm(formula = votetsai ~ female, family = binomial, data = TEDS_2016)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4180  -1.3889   0.9546   0.9797   0.9797  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.54971    0.08245   6.667 2.61e-11 ***
## female      -0.06517    0.11644  -0.560    0.576    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1666.5  on 1260  degrees of freedom
## Residual deviance: 1666.2  on 1259  degrees of freedom
##   (429 observations deleted due to missingness)
## AIC: 1670.2
## 
## Number of Fisher Scoring iterations: 4
teds.lm.probs=predict(teds.lm.fit,type="response") 
teds.lm.probs[1:5]
##         2         3         6         7         8 
## 0.6188198 0.6340694 0.6188198 0.6340694 0.6188198

Female voters are not more likely to vote for President Tsai (estimate:- 0.06). It is not statistically significant.

Adding ID variables to improve the model

teds.lm.fit2=glm(votetsai~female+KMT+DPP+Age+edu+income,
                data=TEDS_2016,family=binomial)
summary(teds.lm.fit2)
## 
## Call:
## glm(formula = votetsai ~ female + KMT + DPP + Age + edu + income, 
##     family = binomial, data = TEDS_2016)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7416  -0.3658   0.2370   0.3098   2.5712  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.73673    0.50898   3.412 0.000644 ***
## female       0.04276    0.17769   0.241 0.809828    
## KMT         -3.14616    0.25036 -12.567  < 2e-16 ***
## DPP          2.90604    0.26860  10.819  < 2e-16 ***
## Age         -0.18582    0.08132  -2.285 0.022307 *  
## edu         -0.21355    0.08135  -2.625 0.008660 ** 
## income       0.01534    0.03447   0.445 0.656222    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1661.76  on 1256  degrees of freedom
## Residual deviance:  833.61  on 1250  degrees of freedom
##   (433 observations deleted due to missingness)
## AIC: 847.61
## 
## Number of Fisher Scoring iterations: 6

Female voters are more likely to vote for President Tsai (estimate: 0.04). It is still not statistically significant.

Adding ID variables to improve the model (more variables)

teds.lm.fit3=glm(votetsai~female+Independence+Econ_worse+
                   Govt_dont_care+Minnan_father+Mainland_father+Taiwanese,
                 data=TEDS_2016,family=binomial)
summary(teds.lm.fit3)
## 
## Call:
## glm(formula = votetsai ~ female + Independence + Econ_worse + 
##     Govt_dont_care + Minnan_father + Mainland_father + Taiwanese, 
##     family = binomial, data = TEDS_2016)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4548  -0.8116   0.3845   0.7301   2.2762  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.0608     0.2096  -5.062 4.16e-07 ***
## female           -0.2752     0.1423  -1.934   0.0531 .  
## Independence      1.4493     0.1881   7.705 1.31e-14 ***
## Econ_worse        0.6780     0.1424   4.762 1.92e-06 ***
## Govt_dont_care    0.2258     0.1418   1.593   0.1112    
## Minnan_father     0.1689     0.1815   0.930   0.3523    
## Mainland_father  -1.4518     0.2912  -4.985 6.19e-07 ***
## Taiwanese         1.5014     0.1470  10.213  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1666.5  on 1260  degrees of freedom
## Residual deviance: 1247.6  on 1253  degrees of freedom
##   (429 observations deleted due to missingness)
## AIC: 1263.6
## 
## Number of Fisher Scoring iterations: 5

Now it is statistically significant at 0.05 level of confidence.