Gentle Machine Learning

Supervised Learning (Classification): Tree-based models

Decision Tree: rpart

Conditional Inference Tree: party

Random Forest: randomForest

Packages used: AER, rpart, rpart.plot, rattle, party, randomForest

library("rpart")
## Warning: package 'rpart' was built under R version 4.0.5
library("rpart.plot")
## Warning: package 'rpart.plot' was built under R version 4.0.5
library("rattle")
## Warning: package 'rattle' was built under R version 4.0.5
## Loading required package: tibble
## Warning: package 'tibble' was built under R version 4.0.5
## Loading required package: bitops
## Warning: package 'bitops' was built under R version 4.0.5
## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
# AER Package (AER: Applied Econometrics with R)
library(AER)
## Warning: package 'AER' was built under R version 4.0.5
## Loading required package: car
## Warning: package 'car' was built under R version 4.0.3
## Loading required package: carData
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Warning: package 'sandwich' was built under R version 4.0.5
## Loading required package: survival
# CreditCard dataset
# card:   "Was the application for a credit card accepted?"
# reports: Number of major derogatory reports.
# age:     Age in years plus twelfths of a year.
# income:  Yearly income (in USD 10,000)
# owner:   Does the individual own their home?
# months:  Months living at current address.
# help("CreditCard") for dataset detail
# Reference: Greene, W.H. (2003). Econometric Analysis, 5th edition. 
#            Upper Saddle River, NJ: Prentice Hall. 
# Link: http://pages.stern.nyu.edu/~wgreene/Text/tables/tablelist5.htm.

Load dataset

# Load the CreditCard data set described in the comments above into the workspace.
data(CreditCard)

Subset data including predictor variables

# Keep only the response (card) and the five predictors used by the models.
bankcard <- CreditCard[
  c("card", "reports", "age", "income", "owner", "months")
]

Recode card to 0, 1

# Recode the yes/no response as numeric: 1 when card is "yes", 0 otherwise.
bankcard$card <- as.numeric(bankcard$card == "yes")
# Fix the RNG seed so the random row selection that follows is reproducible.
set.seed(1001)

Shuffle the rows into a random order

# Permute the rows: sample.int(n) returns 1..n in random order.
newbankcard <- bankcard[sample.int(nrow(bankcard)), ]

Row index for the training data (a random 70% of the rows)

# Draw 70% of the row positions at random, without replacement, for training.
t_idx <- sample.int(
  nrow(bankcard),
  size = round(0.70 * nrow(bankcard))
)

Build train and test data

# Rows selected by t_idx form the training set; every other row is held out
# as the test set.
traindata <- newbankcard[t_idx, ]
testdata <- newbankcard[-t_idx, ]

Decision tree model

# Fit a classification tree of card on all other columns of the training set.
# cp is rpart's complexity parameter, set explicitly to 0.001 here.
dtree_creditcard <- rpart::rpart(
  formula = card ~ .,
  data = traindata,
  method = "class",
  control = rpart.control(cp = 0.001)
)

Plot Decision tree

# Draw the fitted tree with rattle's plotting helper.
rattle::fancyRpartPlot(
  dtree_creditcard,
  type = 1,
  main = "Decision tree",
  caption = "Credit card approval"
)

# Predict a class label for every row of the held-out test set.
resultdt <- predict(
  dtree_creditcard,
  newdata = testdata,
  type = "class"
)

Confusion matrix

# Cross-tabulate true labels (rows) against the tree's predictions (columns);
# the argument names become the dimension labels of the table.
cm_creditcarddt <- table(
  Actual = testdata$card,
  Predicted = resultdt
)
cm_creditcarddt
##       Predicted
## Actual   0   1
##      0  35  57
##      1  18 286

Predicted Approval rate

# Share of predicted approvals (second column) that were actual approvals.
cm_creditcarddt[2, 2] / sum(cm_creditcarddt[, 2])
## [1] 0.8338192

Predicted Denial rate

# Share of predicted denials (first column) that were actual denials.
cm_creditcarddt[1, 1] / sum(cm_creditcarddt[, 1])
## [1] 0.6603774

Accuracy

# Accuracy: correctly classified test cases (the diagonal of the confusion
# matrix) divided by all tabulated test cases.
accuracydt <- sum(diag(cm_creditcarddt)) / sum(cm_creditcarddt)
accuracydt
## [1] 0.8106061
# install.packages("party") 
library(party)
## Warning: package 'party' was built under R version 4.0.5
## Loading required package: grid
## Loading required package: mvtnorm
## Warning: package 'mvtnorm' was built under R version 4.0.3
## Loading required package: modeltools
## Warning: package 'modeltools' was built under R version 4.0.3
## Loading required package: stats4
## 
## Attaching package: 'modeltools'
## The following object is masked from 'package:car':
## 
##     Predict
## Loading required package: strucchange
## Warning: package 'strucchange' was built under R version 4.0.5

Conditional Inference Tree

# Fit a conditional inference tree of card on all other training columns,
# then draw it.
cit <- party::ctree(formula = card ~ ., data = traindata)
plot(cit, main = "Conditional Inference Tree")

Confusion matrix

# Confusion matrix for the conditional inference tree on the test set.
# card was recoded to numeric 0/1, so the tree's predictions are numeric
# scores; round() converts them to 0/1 labels (an implicit cutoff of about 0.5).
cm_creditcardcit <- table(
  testdata$card,
  round(predict(cit, newdata = testdata)),
  dnn = c("Actual", "Predicted")
)
cm_creditcardcit
##       Predicted
## Actual   0   1
##      0  31  61
##      1  12 292

Predicted Approval rate

# Share of predicted approvals (second column) that were actual approvals.
cm_creditcardcit[2, 2] / sum(cm_creditcardcit[, 2])
## [1] 0.8271955

Predicted Denial rate

# Share of predicted denials (first column) that were actual denials.
cm_creditcardcit[1, 1] / sum(cm_creditcardcit[, 1])
## [1] 0.7209302

Accuracy

# Accuracy: correctly classified test cases (the diagonal of the confusion
# matrix) divided by all tabulated test cases.
accuracycit <- sum(diag(cm_creditcardcit)) / sum(cm_creditcardcit)
accuracycit
## [1] 0.8156566

Random Forest

# install.packages("randomForest")
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.0.5
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:rattle':
## 
##     importance
# Reset the RNG seed so the random forest fitted next is reproducible.
set.seed(1001)

randomForest model

# Fit a random forest of card on all other training columns.
# card is numeric 0/1, so randomForest fits a regression forest (hence the
# warning it emits) and its predictions are numeric scores, not class labels.
# importance / proximity ask for variable-importance and proximity measures;
# do.trace = 100 prints out-of-bag progress every 100 trees.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
rf_creditcard <- randomForest(
  card ~ .,
  data = traindata,
  importance = TRUE,
  proximity = TRUE,
  do.trace = 100
)
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##  100 |   0.1254    72.82 |
##  200 |   0.1247    72.41 |
##  300 |   0.1244    72.25 |
##  400 |   0.1244    72.28 |
##  500 |   0.1239    71.97 |
# Draw the default plot for the fitted random forest object.
plot(rf_creditcard)

# Variable-importance measures of the forest, rounded to three decimal places.
round(importance(rf_creditcard), digits = 3)
##         %IncMSE IncNodePurity
## reports  45.173        31.136
## age       7.795        10.158
## income   17.449        12.509
## owner    17.838         3.943
## months    7.410        10.297
# Score the held-out rows with the forest, then label a case as approved (1)
# only when its score exceeds the hard-coded 0.6 cutoff, and as denied (0)
# otherwise.
resultrf <- predict(rf_creditcard, newdata = testdata)
resultrf_Approved <- ifelse(resultrf > 0.6, 1, 0)

Confusion matrix

# Cross-tabulate true labels (rows) against the forest's thresholded
# predictions (columns); the argument names become the dimension labels.
cm_creditcardrf <- table(
  Actual = testdata$card,
  Predicted = resultrf_Approved
)
cm_creditcardrf
##       Predicted
## Actual   0   1
##      0  32  60
##      1  12 292

Predicted Approval rate

# Share of predicted approvals (second column) that were actual approvals.
cm_creditcardrf[2, 2] / sum(cm_creditcardrf[, 2])
## [1] 0.8295455

Predicted Denial rate

# Share of predicted denials (first column) that were actual denials.
cm_creditcardrf[1, 1] / sum(cm_creditcardrf[, 1])
## [1] 0.7272727

Accuracy

# Accuracy: correctly classified test cases (the diagonal of the confusion
# matrix) divided by all tabulated test cases.
accuracyrf <- sum(diag(cm_creditcardrf)) / sum(cm_creditcardrf)
accuracyrf
## [1] 0.8181818