library("rpart")
## Warning: package 'rpart' was built under R version 4.0.5
library("rpart.plot")
## Warning: package 'rpart.plot' was built under R version 4.0.5
library("rattle")
## Warning: package 'rattle' was built under R version 4.0.5
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 4.0.5
## Loading required package: bitops
## Warning: package 'bitops' was built under R version 4.0.5
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# AER Package (AER: Applied Econometrics with R)
library(AER)
## Warning: package 'AER' was built under R version 4.0.5
## Loading required package: car
## Warning: package 'car' was built under R version 4.0.3
## Loading required package: carData
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.0.5
## Loading required package: survival
# CreditCard dataset
# card: "Was the application for a credit card accepted?"
# reports: Number of major derogatory reports.
# age: Age in years plus twelfths of a year.
# income: Yearly income (in USD 10,000)
# owner: Does the individual own their home?
# months: Months living at current address.
# help("CreditCard") for dataset detail
# Reference: Greene, W.H. (2003). Econometric Analysis, 5th edition.
# Upper Saddle River, NJ: Prentice Hall.
# Link: http://pages.stern.nyu.edu/~wgreene/Text/tables/tablelist5.htm.
# Load the AER CreditCard data and keep only the modelling columns.
data(CreditCard)
bankcard <- CreditCard[, c("card", "reports", "age", "income", "owner", "months")]
# Recode the outcome as numeric: "yes" -> 1, anything else -> 0.
bankcard$card <- as.numeric(bankcard$card == "yes")

# Reproducible 70/30 split: the rows are shuffled first, then 70% of the row
# positions are drawn for training and the remainder is held out for testing.
set.seed(1001)
n_bankcard <- nrow(bankcard)
newbankcard <- bankcard[sample(n_bankcard), ]
t_idx <- sample(seq_len(n_bankcard), size = round(0.70 * n_bankcard))
traindata <- newbankcard[t_idx, ]
testdata <- newbankcard[-t_idx, ]
# Classification tree for card approval, fitted on the training split.
# cp is the complexity parameter passed to rpart.control().
dtree_creditcard <- rpart::rpart(
  formula = card ~ .,
  data = traindata,
  method = "class",
  control = rpart.control(cp = 0.001)
)
rattle::fancyRpartPlot(
  dtree_creditcard,
  type = 1,
  main = "Decision tree",
  caption = "Credit card approval"
)

# Predict class labels for the hold-out rows and cross-tabulate them against
# the observed outcome (rows = actual, columns = predicted).
resultdt <- predict(dtree_creditcard, newdata = testdata, type = "class")
cm_creditcarddt <- table(Actual = testdata$card, Predicted = resultdt)
print(cm_creditcarddt)
## Predicted
## Actual 0 1
## 0 35 57
## 1 18 286
# Cells are addressed by their "0"/"1" labels (rows = actual, columns =
# predicted) rather than by linear position, so each ratio is self-describing
# and a malformed table fails loudly instead of yielding NA.

# Precision (positive predictive value): correctly predicted approvals
# divided by all predicted approvals.
cm_creditcarddt["1", "1"] / sum(cm_creditcarddt[, "1"])
## [1] 0.8338192
# Negative predictive value: correctly predicted rejections divided by all
# predicted rejections.
cm_creditcarddt["0", "0"] / sum(cm_creditcarddt[, "0"])
## [1] 0.6603774
# Overall accuracy: share of hold-out cases on the diagonal.
accuracydt <- sum(diag(cm_creditcarddt)) / sum(cm_creditcarddt)
accuracydt
## [1] 0.8106061
# install.packages("party")
library(party)
## Warning: package 'party' was built under R version 4.0.5
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.0.3
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 4.0.3
## Loading required package: stats4
##
## Attaching package: 'modeltools'
## The following object is masked from 'package:car':
##
## Predict
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.0.5
# Conditional inference tree fitted on the same training split.
cit <- ctree(card ~ ., data = traindata)
plot(cit, main = "Conditional Inference Tree")

# The predictions are numeric, so they are rounded to 0/1 labels. Both levels
# are pinned explicitly so the confusion matrix stays 2 x 2 (rows = actual,
# columns = predicted) even if every hold-out case lands in a single class;
# the metric code that follows indexes into both columns.
cm_creditcardcit <- table(
  testdata$card,
  factor(
    as.vector(round(predict(cit, newdata = testdata))),
    levels = c(0, 1)
  ),
  dnn = c("Actual", "Predicted")
)
cm_creditcardcit
## Predicted
## Actual 0 1
## 0 31 61
## 1 12 292
# Cells are addressed by their "0"/"1" labels (rows = actual, columns =
# predicted) rather than by linear position, so each ratio is self-describing
# and a malformed table fails loudly instead of yielding NA.

# Precision (positive predictive value): correctly predicted approvals
# divided by all predicted approvals.
cm_creditcardcit["1", "1"] / sum(cm_creditcardcit[, "1"])
## [1] 0.8271955
# Negative predictive value: correctly predicted rejections divided by all
# predicted rejections.
cm_creditcardcit["0", "0"] / sum(cm_creditcardcit[, "0"])
## [1] 0.7209302
# Overall accuracy: share of hold-out cases on the diagonal.
accuracycit <- sum(diag(cm_creditcardcit)) / sum(cm_creditcardcit)
accuracycit
## [1] 0.8156566
# install.packages("randomForest")
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
##
## importance
# Random forest on the training split. `card` is numeric 0/1 at this point, so
# randomForest fits a regression forest (see the warning and the out-of-bag MSE
# trace it prints); the numeric predictions are thresholded later.
# The seed is reset so the forest is reproducible on its own.
set.seed(1001)
rf_creditcard <- randomForest(
  card ~ .,
  data = traindata,
  importance = TRUE, # keep permutation importance for importance()
  proximity = TRUE,
  do.trace = 100 # report out-of-bag error every 100 trees
)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## | Out-of-bag |
## Tree | MSE %Var(y) |
## 100 | 0.1254 72.82 |
## 200 | 0.1247 72.41 |
## 300 | 0.1244 72.25 |
## 400 | 0.1244 72.28 |
## 500 | 0.1239 71.97 |
# Plot the fitted forest, then list the variable-importance measures.
# importance() is namespaced because rattle exports a function of the same name.
plot(rf_creditcard)
round(randomForest::importance(rf_creditcard), digits = 3) # three decimal places
## %IncMSE IncNodePurity
## reports 45.173 31.136
## age 7.795 10.158
## income 17.449 12.509
## owner 17.838 3.943
## months 7.410 10.297
# Score the hold-out rows; the forest returns numeric predictions.
resultrf <- predict(rf_creditcard, newdata = testdata)
# Scores strictly above 0.6 are labelled approved (1), everything else 0.
resultrf_Approved <- ifelse(resultrf > 0.6, 1, 0)

# Both levels are pinned explicitly so the confusion matrix stays 2 x 2
# (rows = actual, columns = predicted) even if every score falls on one side
# of the threshold; the metric code that follows indexes into both columns.
cm_creditcardrf <- table(
  testdata$card,
  factor(resultrf_Approved, levels = c(0, 1)),
  dnn = c("Actual", "Predicted")
)
cm_creditcardrf
## Predicted
## Actual 0 1
## 0 32 60
## 1 12 292
# Cells are addressed by their "0"/"1" labels (rows = actual, columns =
# predicted) rather than by linear position, so each ratio is self-describing
# and a malformed table fails loudly instead of yielding NA.

# Precision (positive predictive value): correctly predicted approvals
# divided by all predicted approvals.
cm_creditcardrf["1", "1"] / sum(cm_creditcardrf[, "1"])
## [1] 0.8295455
# Negative predictive value: correctly predicted rejections divided by all
# predicted rejections.
cm_creditcardrf["0", "0"] / sum(cm_creditcardrf[, "0"])
## [1] 0.7272727
# Overall accuracy: share of hold-out cases on the diagonal.
accuracyrf <- sum(diag(cm_creditcardrf)) / sum(cm_creditcardrf)
accuracyrf
## [1] 0.8181818