Gentle Machine Learning

Supervised Learning (Classification): Tree-based models

Decision Tree: rpart

Conditional Inference Tree: party

Random Forest: randomForest

Packages used: AER, rpart, rpart.plot, rattle, party, randomForest

library("rpart")

## Warning: package 'rpart' was built under R version 4.0.5

library("rpart.plot")

## Warning: package 'rpart.plot' was built under R version 4.0.5

library("rattle")

## Warning: package 'rattle' was built under R version 4.0.5

## Loading required package: tibble

## Warning: package 'tibble' was built under R version 4.0.5

## Loading required package: bitops

## Warning: package 'bitops' was built under R version 4.0.5

## Rattle: A free graphical interface for data science with R.
## Version 5.4.0 Copyright (c) 2006-2020 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

# AER Package (AER: Applied Econometrics with R)
library(AER)

## Warning: package 'AER' was built under R version 4.0.5

## Loading required package: car

## Warning: package 'car' was built under R version 4.0.3

## Loading required package: carData

## Loading required package: lmtest

## Warning: package 'lmtest' was built under R version 4.0.5

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## Loading required package: sandwich

## Warning: package 'sandwich' was built under R version 4.0.5

## Loading required package: survival

# CreditCard dataset
# card:   "Was the application for a credit card accepted?"
# reports: Number of major derogatory reports.
# age:     Age in years plus twelfths of a year.
# income:  Yearly income (in USD 10,000)
# owner:   Does the individual own their home?
# months:  Months living at current address.
# help("CreditCard") for dataset detail
# Reference: Greene, W.H. (2003). Econometric Analysis, 5th edition. 
#            Upper Saddle River, NJ: Prentice Hall. 
# Link: http://pages.stern.nyu.edu/~wgreene/Text/tables/tablelist5.htm.

Load dataset

# Load the CreditCard dataset (described in the comments above) into the session
data(CreditCard)

Subset data including predictor variables

# Keep the response (card) and the five predictors used by the models below
bankcard <- CreditCard[
  ,
  c("card", "reports", "age", "income", "owner", "months"),
  drop = FALSE
]

Recode card to 0, 1

# "yes" becomes 1, anything else becomes 0 (stored as a numeric column)
bankcard$card <- as.numeric(bankcard$card == "yes")

# Fix the random seed so the shuffling and sampling below are reproducible
set.seed(1001)

Shuffle the rows into a random order

# Random permutation of all row positions, applied to the whole data frame
newbankcard <- bankcard[sample.int(nrow(bankcard)), ]

Indexing for training data (selection of 70 % of data)

# Draw 70% of the row positions (without replacement) for the training set
t_idx <- sample(
  x = seq_len(nrow(bankcard)),
  size = round(nrow(bankcard) * 0.70)
)

Build train and test data

# Rows selected by t_idx form the training set ...
traindata <- newbankcard[t_idx, ]
# ... and every remaining row forms the test set
testdata <- newbankcard[-t_idx, ]

Decision tree model

# Fit a classification tree for card on all remaining columns of traindata.
# cp = 0.001 is the complexity parameter handed to rpart.control().
dtree_creditcard <- rpart::rpart(
  formula = card ~ .,
  data = traindata,
  method = "class",
  control = rpart::rpart.control(cp = 0.001)
)

Plot Decision tree

# Draw the fitted tree with rattle's fancyRpartPlot()
rattle::fancyRpartPlot(
  dtree_creditcard,
  type = 1,
  main = "Decision tree",
  caption = "Credit card approval"
)

# Predicted class labels for the held-out test rows
resultdt <- predict(
  dtree_creditcard,
  newdata = testdata,
  type = "class"
)

Confusion matrix

# Cross-tabulate actual outcomes (rows) against predicted classes (columns)
cm_creditcarddt <- table(Actual = testdata$card, Predicted = resultdt)
print(cm_creditcarddt)

##       Predicted
## Actual   0   1
##      0  35  57
##      1  18 286

Predicted Approval rate

# Share of predicted approvals (column "1") that were actual approvals.
# Cells are addressed by their "0"/"1" labels instead of linear position so the
# intended cell is explicit and a missing class fails loudly.
cm_creditcarddt["1", "1"] / sum(cm_creditcarddt[, "1"])

## [1] 0.8338192

Predicted Denial rate

# Share of predicted denials (column "0") that were actual denials.
# Cells are addressed by their "0"/"1" labels instead of linear position so the
# intended cell is explicit and a missing class fails loudly.
cm_creditcarddt["0", "0"] / sum(cm_creditcarddt[, "0"])

## [1] 0.6603774

Accuracy

# Accuracy: correctly classified cases (the diagonal) over all test cases
accuracydt <- sum(diag(cm_creditcarddt)) / sum(cm_creditcarddt)
print(accuracydt)

## [1] 0.8106061

# install.packages("party") 
library(party)

## Warning: package 'party' was built under R version 4.0.5

## Loading required package: grid

## Loading required package: mvtnorm

## Warning: package 'mvtnorm' was built under R version 4.0.3

## Loading required package: modeltools

## Warning: package 'modeltools' was built under R version 4.0.3

## Loading required package: stats4

## 
## Attaching package: 'modeltools'

## The following object is masked from 'package:car':
## 
##     Predict

## Loading required package: strucchange

## Warning: package 'strucchange' was built under R version 4.0.5

Conditional Inference Tree

# Fit a conditional inference tree of card on all other columns of traindata
cit <- party::ctree(formula = card ~ ., data = traindata)

# Draw the fitted tree
plot(cit, main = "Conditional Inference Tree")

Confusion matrix

# Cross-tabulate actual outcomes (rows) against the tree's predictions
# (columns). predict() output is passed through round() to obtain 0/1 labels.
# Assigned with `<-` to match the rest of the script.
cm_creditcardcit <- table(
  testdata$card,
  round(predict(cit, newdata = testdata)),
  dnn = c("Actual", "Predicted")
)
cm_creditcardcit

##       Predicted
## Actual   0   1
##      0  31  61
##      1  12 292

Predicted Approval rate

# Share of predicted approvals (column "1") that were actual approvals.
# Cells are addressed by their "0"/"1" labels instead of linear position so the
# intended cell is explicit and a missing class fails loudly.
cm_creditcardcit["1", "1"] / sum(cm_creditcardcit[, "1"])

## [1] 0.8271955

Predicted Denial rate

# Share of predicted denials (column "0") that were actual denials.
# Cells are addressed by their "0"/"1" labels instead of linear position so the
# intended cell is explicit and a missing class fails loudly.
cm_creditcardcit["0", "0"] / sum(cm_creditcardcit[, "0"])

## [1] 0.7209302

Accuracy

# Accuracy: correctly classified cases (the diagonal) over all test cases
accuracycit <- sum(diag(cm_creditcardcit)) / sum(cm_creditcardcit)
print(accuracycit)

## [1] 0.8156566

Random Forest

# install.packages("randomForest")
library(randomForest)

## Warning: package 'randomForest' was built under R version 4.0.5

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:rattle':
## 
##     importance

# Reset the random seed so the forest fitted below is reproducible
set.seed(seed = 1001)

randomForest model

# Fit a random forest of card on all other columns of traindata.
# NOTE(review): card was recoded to numeric 0/1, so randomForest fits a
# regression forest here (hence its "Are you sure you want to do regression?"
# warning); the predictions are scores that get thresholded afterwards.
rf_creditcard <- randomForest(
  card ~ .,
  data = traindata,
  importance = TRUE, # spelled out: T is an ordinary variable and can be reassigned
  proximity = TRUE,
  do.trace = 100 # print progress every 100 trees
)

## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?

##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##  100 |   0.1254    72.82 |
##  200 |   0.1247    72.41 |
##  300 |   0.1244    72.25 |
##  400 |   0.1244    72.28 |
##  500 |   0.1239    71.97 |

# Plot the fitted forest object
plot(rf_creditcard)

# Variable importance, rounded to three decimal places. The namespace is given
# explicitly because rattle also exports a function named importance().
round(randomForest::importance(rf_creditcard), digits = 3)

##         %IncMSE IncNodePurity
## reports  45.173        31.136
## age       7.795        10.158
## income   17.449        12.509
## owner    17.838         3.943
## months    7.410        10.297

# Numeric forest predictions for the held-out test rows
resultrf <- predict(
  rf_creditcard,
  newdata = testdata
)

# Label a case as approved (1) when its prediction exceeds 0.6, else denied (0)
resultrf_Approved <- ifelse(
  test = resultrf > 0.6,
  yes = 1,
  no = 0
)

Confusion matrix

# Cross-tabulate actual outcomes (rows) against thresholded predictions (columns)
cm_creditcardrf <- table(Actual = testdata$card, Predicted = resultrf_Approved)
print(cm_creditcardrf)

##       Predicted
## Actual   0   1
##      0  32  60
##      1  12 292

Predicted Approval rate

# Share of predicted approvals (column "1") that were actual approvals.
# Cells are addressed by their "0"/"1" labels instead of linear position so the
# intended cell is explicit and a missing class fails loudly.
cm_creditcardrf["1", "1"] / sum(cm_creditcardrf[, "1"])

## [1] 0.8295455

Predicted Denial rate

# Share of predicted denials (column "0") that were actual denials.
# Cells are addressed by their "0"/"1" labels instead of linear position so the
# intended cell is explicit and a missing class fails loudly.
cm_creditcardrf["0", "0"] / sum(cm_creditcardrf[, "0"])

## [1] 0.7272727

Accuracy

# Accuracy: correctly classified cases (the diagonal) over all test cases
accuracyrf <- sum(diag(cm_creditcardrf)) / sum(cm_creditcardrf)
print(accuracyrf)

## [1] 0.8181818

Decision Trees

Federico Ferrero

4/29/2021

Gentle Machine Learning

Supervised Learning (Classification): Tree-based models

Decision Tree: rpart

Conditional Inference Tree: party

Random Forest: randomForest

Packages used: AER, rpart, rpart.plot, rattle, party, randomForest

Load dataset

Subset data including predictor variables

Recode card to 0, 1

Shuffle the rows into a random order

Indexing for training data (selection of 70 % of data)

Build train and test data

Decision tree model

Plot Decision tree

Confusion matrix

Predicted Approval rate

Predicted Denial rate

Accuracy

Conditional Inference Tree

Confusion matrix

Predicted Approval rate

Predicted Denial rate

Accuracy

Random Forest

randomForest model

Confusion matrix

Predicted Approval rate

Predicted Denial rate

Accuracy