R Programming Lab (Logistic Regression)
Adapted from ISLR Chapter 4 Lab
Load ISLR library
require(ISLR)
## Loading required package: ISLR
## Warning: package 'ISLR' was built under R version 4.0.3
Check dataset Smarket
?Smarket
## starting httpd help server ... done
names(Smarket)
## [1] "Year" "Lag1" "Lag2" "Lag3" "Lag4" "Lag5"
## [7] "Volume" "Today" "Direction"
summary(Smarket)
## Year Lag1 Lag2 Lag3
## Min. :2001 Min. :-4.922000 Min. :-4.922000 Min. :-4.922000
## 1st Qu.:2002 1st Qu.:-0.639500 1st Qu.:-0.639500 1st Qu.:-0.640000
## Median :2003 Median : 0.039000 Median : 0.039000 Median : 0.038500
## Mean :2003 Mean : 0.003834 Mean : 0.003919 Mean : 0.001716
## 3rd Qu.:2004 3rd Qu.: 0.596750 3rd Qu.: 0.596750 3rd Qu.: 0.596750
## Max. :2005 Max. : 5.733000 Max. : 5.733000 Max. : 5.733000
## Lag4 Lag5 Volume Today
## Min. :-4.922000 Min. :-4.92200 Min. :0.3561 Min. :-4.922000
## 1st Qu.:-0.640000 1st Qu.:-0.64000 1st Qu.:1.2574 1st Qu.:-0.639500
## Median : 0.038500 Median : 0.03850 Median :1.4229 Median : 0.038500
## Mean : 0.001636 Mean : 0.00561 Mean :1.4783 Mean : 0.003138
## 3rd Qu.: 0.596750 3rd Qu.: 0.59700 3rd Qu.:1.6417 3rd Qu.: 0.596750
## Max. : 5.733000 Max. : 5.73300 Max. :3.1525 Max. : 5.733000
## Direction
## Down:602
## Up :648
##
##
##
##
Create a dataframe for data browsing
sm=Smarket
Plot the data with smaller dots
pairs(Smarket,col=Smarket$Direction,cex=.5)
Logistic regression
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Smarket,family=binomial)
summary(glm.fit)
##
## Call:
## glm(formula = Direction ~ Lag1 + Lag2 + Lag3 + Lag4 + Lag5 +
## Volume, family = binomial, data = Smarket)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.446 -1.203 1.065 1.145 1.326
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126000 0.240736 -0.523 0.601
## Lag1 -0.073074 0.050167 -1.457 0.145
## Lag2 -0.042301 0.050086 -0.845 0.398
## Lag3 0.011085 0.049939 0.222 0.824
## Lag4 0.009359 0.049974 0.187 0.851
## Lag5 0.010313 0.049511 0.208 0.835
## Volume 0.135441 0.158360 0.855 0.392
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1731.2 on 1249 degrees of freedom
## Residual deviance: 1727.6 on 1243 degrees of freedom
## AIC: 1741.6
##
## Number of Fisher Scoring iterations: 3
glm.probs=predict(glm.fit,type="response")
glm.probs[1:5]
## 1 2 3 4 5
## 0.5070841 0.4814679 0.4811388 0.5152224 0.5107812
glm.pred=ifelse(glm.probs>0.5,"Up","Down")
attach(Smarket)
table(glm.pred,Direction)
## Direction
## glm.pred Down Up
## Down 145 141
## Up 457 507
mean(glm.pred==Direction)
## [1] 0.5216
Make training and test set
train = Year<2005
glm.fit=glm(Direction~Lag1+Lag2+Lag3+Lag4+Lag5+Volume,
data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response")
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
Direction.2005=Smarket$Direction[!train]
table(glm.pred,Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 77 97
## Up 34 44
mean(glm.pred==Direction.2005)
## [1] 0.4801587
Fit smaller model
glm.fit=glm(Direction~Lag1+Lag2,
data=Smarket,family=binomial, subset=train)
glm.probs=predict(glm.fit,newdata=Smarket[!train,],type="response")
glm.pred=ifelse(glm.probs >0.5,"Up","Down")
table(glm.pred,Direction.2005)
## Direction.2005
## glm.pred Down Up
## Down 35 35
## Up 76 106
mean(glm.pred==Direction.2005)
## [1] 0.5595238
Check accuracy rate
106/(76+106)
## [1] 0.5824176
Logistic regression
teds.lm.fit=glm(votetsai~female,
data=TEDS_2016,family=binomial)
summary(teds.lm.fit)
##
## Call:
## glm(formula = votetsai ~ female, family = binomial, data = TEDS_2016)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4180 -1.3889 0.9546 0.9797 0.9797
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.54971 0.08245 6.667 2.61e-11 ***
## female -0.06517 0.11644 -0.560 0.576
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1666.5 on 1260 degrees of freedom
## Residual deviance: 1666.2 on 1259 degrees of freedom
## (429 observations deleted due to missingness)
## AIC: 1670.2
##
## Number of Fisher Scoring iterations: 4
teds.lm.probs=predict(teds.lm.fit,type="response")
teds.lm.probs[1:5]
## 2 3 6 7 8
## 0.6188198 0.6340694 0.6188198 0.6340694 0.6188198
Female voters are not more likely to vote for President Tsai (estimate:- 0.06). It is not statistically significant.
Adding ID variables to improve the model
teds.lm.fit2=glm(votetsai~female+KMT+DPP+Age+edu+income,
data=TEDS_2016,family=binomial)
summary(teds.lm.fit2)
##
## Call:
## glm(formula = votetsai ~ female + KMT + DPP + Age + edu + income,
## family = binomial, data = TEDS_2016)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7416 -0.3658 0.2370 0.3098 2.5712
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.73673 0.50898 3.412 0.000644 ***
## female 0.04276 0.17769 0.241 0.809828
## KMT -3.14616 0.25036 -12.567 < 2e-16 ***
## DPP 2.90604 0.26860 10.819 < 2e-16 ***
## Age -0.18582 0.08132 -2.285 0.022307 *
## edu -0.21355 0.08135 -2.625 0.008660 **
## income 0.01534 0.03447 0.445 0.656222
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1661.76 on 1256 degrees of freedom
## Residual deviance: 833.61 on 1250 degrees of freedom
## (433 observations deleted due to missingness)
## AIC: 847.61
##
## Number of Fisher Scoring iterations: 6
Female voters are more likely to vote for President Tsai (estimate: 0.04). It is still not statistically significant.
Adding ID variables to improve the model (more variables)
teds.lm.fit3=glm(votetsai~female+Independence+Econ_worse+
Govt_dont_care+Minnan_father+Mainland_father+Taiwanese,
data=TEDS_2016,family=binomial)
summary(teds.lm.fit3)
##
## Call:
## glm(formula = votetsai ~ female + Independence + Econ_worse +
## Govt_dont_care + Minnan_father + Mainland_father + Taiwanese,
## family = binomial, data = TEDS_2016)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.4548 -0.8116 0.3845 0.7301 2.2762
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.0608 0.2096 -5.062 4.16e-07 ***
## female -0.2752 0.1423 -1.934 0.0531 .
## Independence 1.4493 0.1881 7.705 1.31e-14 ***
## Econ_worse 0.6780 0.1424 4.762 1.92e-06 ***
## Govt_dont_care 0.2258 0.1418 1.593 0.1112
## Minnan_father 0.1689 0.1815 0.930 0.3523
## Mainland_father -1.4518 0.2912 -4.985 6.19e-07 ***
## Taiwanese 1.5014 0.1470 10.213 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1666.5 on 1260 degrees of freedom
## Residual deviance: 1247.6 on 1253 degrees of freedom
## (429 observations deleted due to missingness)
## AIC: 1263.6
##
## Number of Fisher Scoring iterations: 5
Now it is statistically significant at 0.05 level of confidence.