R Programming Lab (Logistic Regression)

Adapted from ISLR Chapter 4 Lab

Load ISLR library

require(ISLR)

## Loading required package: ISLR

## Warning: package 'ISLR' was built under R version 4.0.3

Check dataset Smarket

?Smarket

## starting httpd help server ... done

names(Smarket)

## [1] "Year"      "Lag1"      "Lag2"      "Lag3"      "Lag4"      "Lag5"     
## [7] "Volume"    "Today"     "Direction"

summary(Smarket)

##       Year           Lag1                Lag2                Lag3          
##  Min.   :2001   Min.   :-4.922000   Min.   :-4.922000   Min.   :-4.922000  
##  1st Qu.:2002   1st Qu.:-0.639500   1st Qu.:-0.639500   1st Qu.:-0.640000  
##  Median :2003   Median : 0.039000   Median : 0.039000   Median : 0.038500  
##  Mean   :2003   Mean   : 0.003834   Mean   : 0.003919   Mean   : 0.001716  
##  3rd Qu.:2004   3rd Qu.: 0.596750   3rd Qu.: 0.596750   3rd Qu.: 0.596750  
##  Max.   :2005   Max.   : 5.733000   Max.   : 5.733000   Max.   : 5.733000  
##       Lag4                Lag5              Volume           Today          
##  Min.   :-4.922000   Min.   :-4.92200   Min.   :0.3561   Min.   :-4.922000  
##  1st Qu.:-0.640000   1st Qu.:-0.64000   1st Qu.:1.2574   1st Qu.:-0.639500  
##  Median : 0.038500   Median : 0.03850   Median :1.4229   Median : 0.038500  
##  Mean   : 0.001636   Mean   : 0.00561   Mean   :1.4783   Mean   : 0.003138  
##  3rd Qu.: 0.596750   3rd Qu.: 0.59700   3rd Qu.:1.6417   3rd Qu.: 0.596750  
##  Max.   : 5.733000   Max.   : 5.73300   Max.   :3.1525   Max.   : 5.733000  
##  Direction 
##  Down:602  
##  Up  :648  
##            
##            
##            
##

Create a dataframe for data browsing

sm=Smarket

Plot the data with smaller dots

pairs(Smarket,col=Smarket$Direction,cex=.5)

Logistic regression

glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial)
summary(glm.fit)

## 
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 + 
##     Volume, family = binomial, data = Smarket)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.446  -1.203   1.065   1.145   1.326  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000   0.240736  -0.523    0.601
## Lag1        -0.073074   0.050167  -1.457    0.145
## Lag2        -0.042301   0.050086  -0.845    0.398
## Lag3         0.011085   0.049939   0.222    0.824
## Lag4         0.009359   0.049974   0.187    0.851
## Lag5         0.010313   0.049511   0.208    0.835
## Volume       0.135441   0.158360   0.855    0.392
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1731.2  on 1249  degrees of freedom
## Residual deviance: 1727.6  on 1243  degrees of freedom
## AIC: 1741.6
## 
## Number of Fisher Scoring iterations: 3

glm.probs=predict(glm.fit,type="response") 
glm.probs[1:5]

##         1         2         3         4         5 
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812

glm.pred=ifelse(glm.probs>0.5,"Up","Down")
attach(Smarket)
table(glm.pred,Direction)

##         Direction
## glm.pred Down  Up
##     Down  145 141
##     Up    457 507

mean(glm.pred==Direction)

## [1] 0.5216

Make training and test set

train = Year<2005
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
            data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") 
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
Direction.2005=Smarket$Direction[!train]
table(glm.pred,Direction.2005)

##         Direction.2005
## glm.pred Down Up
##     Down   77 97
##     Up     34 44

mean(glm.pred==Direction.2005)

## [1] 0.4801587

Fit smaller model

glm.fit=glm(Direction~Lag1+Lag2,
            data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response") 
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
table(glm.pred,Direction.2005)

##         Direction.2005
## glm.pred Down  Up
##     Down   35  35
##     Up     76 106

mean(glm.pred==Direction.2005)

## [1] 0.5595238

Check accuracy rate

106/(76+106)

## [1] 0.5824176

Logistic regerssion with TEDS2016

library(haven)
TEDS_2016 <-  read_stata("https://github.com/datageneration/home/blob/master/DataProgramming/data/TEDS_2016.dta?raw=true")

names(TEDS_2016)

##  [1] "District"        "Sex"             "Age"             "Edu"            
##  [5] "Arear"           "Career"          "Career8"         "Ethnic"         
##  [9] "Party"           "PartyID"         "Tondu"           "Tondu3"         
## [13] "nI2"             "votetsai"        "green"           "votetsai_nm"    
## [17] "votetsai_all"    "Independence"    "Unification"     "sq"             
## [21] "Taiwanese"       "edu"             "female"          "whitecollar"    
## [25] "lowincome"       "income"          "income_nm"       "age"            
## [29] "KMT"             "DPP"             "npp"             "noparty"        
## [33] "pfp"             "South"           "north"           "Minnan_father"  
## [37] "Mainland_father" "Econ_worse"      "Inequality"      "inequality5"    
## [41] "econworse5"      "Govt_for_public" "pubwelf5"        "Govt_dont_care" 
## [45] "highincome"      "votekmt"         "votekmt_nm"      "Blue"           
## [49] "Green"           "No_Party"        "voteblue"        "voteblue_nm"    
## [53] "votedpp_1"       "votekmt_1"

Logistic regression

teds.lm.fit=glm(votetsai~female,
            data=TEDS_2016,family=binomial)
summary(teds.lm.fit)

## 
## Call:
## glm(formula = votetsai ~ female, family = binomial, data = TEDS_2016)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4180  -1.3889   0.9546   0.9797   0.9797  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  0.54971    0.08245   6.667 2.61e-11 ***
## female      -0.06517    0.11644  -0.560    0.576    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1666.5  on 1260  degrees of freedom
## Residual deviance: 1666.2  on 1259  degrees of freedom
##   (429 observations deleted due to missingness)
## AIC: 1670.2
## 
## Number of Fisher Scoring iterations: 4

teds.lm.probs=predict(teds.lm.fit,type="response") 
teds.lm.probs[1:5]

##         2         3         6         7         8 
## 0.6188198 0.6340694 0.6188198 0.6340694 0.6188198

Female voters are not more likely to vote for President Tsai (estimate:- 0.06). It is not statistically significant.

Adding ID variables to improve the model

teds.lm.fit2=glm(votetsai~female+KMT+DPP+Age+edu+income,
                data=TEDS_2016,family=binomial)
summary(teds.lm.fit2)

## 
## Call:
## glm(formula = votetsai ~ female + KMT + DPP + Age + edu + income, 
##     family = binomial, data = TEDS_2016)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7416  -0.3658   0.2370   0.3098   2.5712  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.73673    0.50898   3.412 0.000644 ***
## female       0.04276    0.17769   0.241 0.809828    
## KMT         -3.14616    0.25036 -12.567  < 2e-16 ***
## DPP          2.90604    0.26860  10.819  < 2e-16 ***
## Age         -0.18582    0.08132  -2.285 0.022307 *  
## edu         -0.21355    0.08135  -2.625 0.008660 ** 
## income       0.01534    0.03447   0.445 0.656222    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1661.76  on 1256  degrees of freedom
## Residual deviance:  833.61  on 1250  degrees of freedom
##   (433 observations deleted due to missingness)
## AIC: 847.61
## 
## Number of Fisher Scoring iterations: 6

Female voters are more likely to vote for President Tsai (estimate: 0.04). It is still not statistically significant.

Adding ID variables to improve the model (more variables)

teds.lm.fit3=glm(votetsai~female+Independence+Econ_worse+
                   Govt_dont_care+Minnan_father+Mainland_father+Taiwanese,
                 data=TEDS_2016,family=binomial)
summary(teds.lm.fit3)

## 
## Call:
## glm(formula = votetsai ~ female + Independence + Econ_worse + 
##     Govt_dont_care + Minnan_father + Mainland_father + Taiwanese, 
##     family = binomial, data = TEDS_2016)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.4548  -0.8116   0.3845   0.7301   2.2762  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -1.0608     0.2096  -5.062 4.16e-07 ***
## female           -0.2752     0.1423  -1.934   0.0531 .  
## Independence      1.4493     0.1881   7.705 1.31e-14 ***
## Econ_worse        0.6780     0.1424   4.762 1.92e-06 ***
## Govt_dont_care    0.2258     0.1418   1.593   0.1112    
## Minnan_father     0.1689     0.1815   0.930   0.3523    
## Mainland_father  -1.4518     0.2912  -4.985 6.19e-07 ***
## Taiwanese         1.5014     0.1470  10.213  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1666.5  on 1260  degrees of freedom
## Residual deviance: 1247.6  on 1253  degrees of freedom
##   (429 observations deleted due to missingness)
## AIC: 1263.6
## 
## Number of Fisher Scoring iterations: 5

Lab 06

Federico Ferrero

3/24/2021

R Programming Lab (Logistic Regression)

Adapted from ISLR Chapter 4 Lab

Load ISLR library

Check dataset Smarket

Create a dataframe for data browsing

Plot the data with smaller dots

Logistic regression

Make training and test set

Fit smaller model

Check accuracy rate

Logistic regerssion with TEDS2016

Logistic regression

Female voters are not more likely to vote for President Tsai (estimate:- 0.06). It is not statistically significant.

Adding ID variables to improve the model

Female voters are more likely to vote for President Tsai (estimate: 0.04). It is still not statistically significant.

Adding ID variables to improve the model (more variables)

Now it is statistically significant at 0.05 level of confidence.