## Decision Trees pt2
library(dplyr)
library(C50)
library(gmodels)
### Credit Data Frame
# Import data
library(readr)
# Read the credit data set; string columns are converted to factors.
# NOTE(review): this is an absolute, machine-specific path — consider a
# project-relative path so the script runs on other machines.
credit <- read.csv(
  "C:/Users/angel/Documents/RPractice/credit.csv",
  header = TRUE,
  stringsAsFactors = TRUE
)
# View() opens a data viewer, which is only useful in an interactive session;
# skip it when the script is run non-interactively (e.g. via Rscript).
if (interactive()) {
  View(credit)
}
# Exploring the dataset
str(credit)
## 'data.frame': 1000 obs. of 17 variables:
## $ checking_balance : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
## $ months_loan_duration: int 6 48 12 42 24 36 24 36 12 30 ...
## $ credit_history : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
## $ purpose : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ savings_balance : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
## $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
## $ percent_of_income : int 4 2 2 2 3 2 3 2 2 4 ...
## $ years_at_residence : int 4 2 3 4 4 4 4 2 4 2 ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ other_credit : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ housing : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
## $ existing_loans_count: int 2 1 1 1 2 1 1 1 1 2 ...
## $ job : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
## $ dependents : int 1 1 2 2 2 2 1 1 1 1 ...
## $ phone : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
## $ default : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...
# Drop row 1001 if the import produced one. With exactly 1000 rows (as in the
# recorded str() output) the negative index is out of range and this appears
# to be a no-op.
credit <- credit[-1001, ]
# Set the checking balance of row 155 to the existing level "1 - 200 DM".
# Assigning the level label directly avoids building a throwaway one-level
# factor, and naming the column avoids the magic column index 1.
credit[155, "checking_balance"] <- "1 - 200 DM"
credit[155, "checking_balance"]
## [1] 1 - 200 DM
## Levels: < 0 DM > 200 DM 1 - 200 DM unknown
# Counts and shares of each checking-account balance category.
checking_counts <- table(credit[["checking_balance"]])
checking_counts
##
## < 0 DM > 200 DM 1 - 200 DM unknown
## 274 63 269 394
prop.table(checking_counts)
##
## < 0 DM > 200 DM 1 - 200 DM unknown
## 0.274 0.063 0.269 0.394
str(credit[["checking_balance"]])
## Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
# Same breakdown for the savings balance categories.
savings_counts <- table(credit[["savings_balance"]])
savings_counts
##
## < 100 DM > 1000 DM 100 - 500 DM 500 - 1000 DM unknown
## 603 48 103 63 183
prop.table(savings_counts)
##
## < 100 DM > 1000 DM 100 - 500 DM 500 - 1000 DM unknown
## 0.603 0.048 0.103 0.063 0.183
# Distribution summaries of the two integer loan features.
summary(credit[["months_loan_duration"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.0 12.0 18.0 20.9 24.0 72.0
summary(credit[["amount"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 250 1366 2320 3271 3972 18424
# Inspect the outcome column, rebuild it as a factor (the recorded output
# shows the structure is unchanged for this data), then check class balance.
str(credit[["default"]])
## Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...
credit[["default"]] <- factor(credit[["default"]])
str(credit[["default"]])
## Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...
default_counts <- table(credit[["default"]])
default_counts
##
## no yes
## 700 300
prop.table(default_counts)
##
## no yes
## 0.7 0.3
# Create a random training and test dataset
# Fix the RNG seed so the split below, and every model fitted on it, is the
# same on each run.
# NOTE(review): the "##" outputs recorded in this file come from an earlier
# unseeded run, so they will not match until the script is re-run.
set.seed(123)
# Draw 900 distinct row indices for training; the sample is taken from the
# actual row count instead of a hard-coded 1000.
train_sample <- sample(nrow(credit), 900)
str(train_sample)
## int [1:900] 594 506 367 749 413 899 684 236 751 955 ...
# Sampled rows form the training set; all remaining rows form the test set.
credit_train <- credit[train_sample, ]
credit_test <- credit[-train_sample, ]
# Compare the share of defaults in both partitions.
prop.table(table(credit_train$default))
##
## no yes
## 0.6933333 0.3066667
prop.table(table(credit_test$default))
##
## no yes
## 0.76 0.24
# Train the prediction model (a C5.0 decision tree)
library(C50)
# Fit a C5.0 decision tree on the training set. The outcome column is
# excluded from the predictors by name rather than by position (-17), so the
# label cannot slip into the predictors if the column order changes.
# The printed "Call:" will show this expression instead of credit_train[-17].
credit_model <- C5.0(
  x = credit_train[names(credit_train) != "default"],
  y = credit_train$default
)
credit_model
##
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 16
##
## Tree size: 66
##
## Non-standard options: attempt to group attributes
# Print the detailed model report: the decision tree itself, the evaluation
# on the training data (error count and confusion matrix) and attribute usage.
summary(credit_model)
##
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Apr 26 15:35:03 2024
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 900 cases (17 attributes) from undefined.data
##
## Decision tree:
##
## checking_balance = unknown: no (358/42)
## checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
## :...credit_history in {perfect,very good}:
## :...savings_balance in {> 1000 DM,500 - 1000 DM,unknown}:
## : :...dependents <= 1: no (9/1)
## : : dependents > 1: yes (4/1)
## : savings_balance in {< 100 DM,100 - 500 DM}:
## : :...credit_history = very good:
## : :...age <= 23: no (2)
## : : age > 23: yes (25/3)
## : credit_history = perfect:
## : :...housing in {other,rent}: yes (14)
## : housing = own:
## : :...age > 33: yes (4)
## : age <= 33:
## : :...age <= 29: yes (4/1)
## : age > 29: no (6)
## credit_history in {critical,good,poor}:
## :...months_loan_duration <= 11: no (88/16)
## months_loan_duration > 11:
## :...savings_balance in {> 1000 DM,unknown}: no (71/19)
## savings_balance = 500 - 1000 DM:
## :...percent_of_income <= 3: no (10/1)
## : percent_of_income > 3: yes (3)
## savings_balance = 100 - 500 DM:
## :...purpose in {business,car0,education,renovations}: no (9/1)
## : purpose = furniture/appliances: yes (17/4)
## : purpose = car:
## : :...age <= 25: yes (3)
## : age > 25: no (9/1)
## savings_balance = < 100 DM:
## :...months_loan_duration > 47: yes (19/2)
## months_loan_duration <= 47:
## :...checking_balance = > 200 DM:
## :...dependents <= 1: no (23/5)
## : dependents > 1: yes (2)
## checking_balance = < 0 DM:
## :...purpose = education: yes (8)
## : purpose in {business,car,car0,furniture/appliances,
## : : renovations}:
## : :...job = management: no (17/3)
## : job in {skilled,unemployed,unskilled}:
## : :...credit_history = poor: yes (7/1)
## : credit_history = critical:
## : :...housing = other: yes (1)
## : : housing = rent:
## : : :...amount <= 2122: no (3)
## : : : amount > 2122: yes (6)
## : : housing = own:
## : : :...months_loan_duration > 13: no (9/1)
## : : months_loan_duration <= 13:
## : : :...percent_of_income <= 3: yes (2)
## : : percent_of_income > 3: no (5/1)
## : credit_history = good:
## : :...purpose = renovations: yes (0)
## : purpose in {business,car0}: no (3)
## : purpose = car:
## : :...other_credit = store: yes (0)
## : : other_credit = bank: no (1)
## : : other_credit = none:
## : : :...percent_of_income <= 2: no (9/4)
## : : percent_of_income > 2: yes (13/1)
## : purpose = furniture/appliances:
## : :...phone = yes: yes (5)
## : phone = no:
## : :...years_at_residence <= 1: no (9/1)
## : years_at_residence > 1: [S1]
## checking_balance = 1 - 200 DM:
## :...amount > 8133: yes (8)
## amount <= 8133:
## :...amount > 5381: no (13)
## amount <= 5381:
## :...other_credit = store: yes (3)
## other_credit in {bank,none}:
## :...purpose in {car0,education}: no (2)
## purpose = renovations:
## :...years_at_residence <= 3: yes (2)
## : years_at_residence > 3: no (2)
## purpose = business:
## :...credit_history in {critical,
## : : poor}: no (2)
## : credit_history = good:
## : :...years_at_residence <= 1: no (2)
## : years_at_residence > 1: yes (2)
## purpose = car:
## :...other_credit = bank: yes (6)
## : other_credit = none:
## : :...dependents > 1: no (2)
## : dependents <= 1:
## : :...phone = no: yes (7/2)
## : phone = yes: no (4/1)
## purpose = furniture/appliances:
## :...years_at_residence > 3: no (12/1)
## years_at_residence <= 3:
## :...other_credit = bank: no (2)
## other_credit = none:
## :...age > 35: yes (4)
## age <= 35: [S2]
##
## SubTree [S1]
##
## employment_duration in {< 1 year,unemployed}: yes (4)
## employment_duration = > 7 years:
## :...years_at_residence <= 2: yes (2)
## : years_at_residence > 2: no (5/1)
## employment_duration = 4 - 7 years:
## :...percent_of_income <= 3: no (2)
## : percent_of_income > 3: yes (2)
## employment_duration = 1 - 4 years:
## :...housing in {other,rent}: yes (6)
## housing = own:
## :...age <= 26: yes (2)
## age > 26: no (10/1)
##
## SubTree [S2]
##
## credit_history = poor: yes (1)
## credit_history = critical:
## :...months_loan_duration <= 15: no (2)
## : months_loan_duration > 15: yes (4/1)
## credit_history = good:
## :...percent_of_income <= 1: yes (3/1)
## percent_of_income > 1: no (6)
##
##
## Evaluation on training data (900 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 64 117(13.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 607 17 (a): class no
## 100 176 (b): class yes
##
##
## Attribute usage:
##
## 100.00% checking_balance
## 60.22% credit_history
## 52.67% months_loan_duration
## 50.44% savings_balance
## 26.00% purpose
## 13.67% job
## 10.89% amount
## 10.11% other_credit
## 9.44% age
## 9.33% years_at_residence
## 8.00% housing
## 6.44% phone
## 6.11% percent_of_income
## 5.67% dependents
## 3.67% employment_duration
##
##
## Time: 0.0 secs
# Evaluate the model on the held-out test set
library(gmodels)
# Classify the test rows with the fitted tree, then cross-tabulate the true
# labels against the predictions. Chi-square contributions and row/column
# proportions are switched off, leaving counts and table-wide proportions.
credit_pred <- predict(object = credit_model, newdata = credit_test)
CrossTable(
  x = credit_test$default,
  y = credit_pred,
  prop.chisq = FALSE,
  prop.c = FALSE,
  prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | predicted default
## actual default | no | yes | Row Total |
## ---------------|-----------|-----------|-----------|
## no | 64 | 12 | 76 |
## | 0.640 | 0.120 | |
## ---------------|-----------|-----------|-----------|
## yes | 16 | 8 | 24 |
## | 0.160 | 0.080 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 80 | 20 | 100 |
## ---------------|-----------|-----------|-----------|
##
##
# Improving model performance
# Fit a boosted C5.0 model with 10 boosting trials. The outcome column is
# excluded from the predictors by name rather than by position (-17), so the
# label cannot slip into the predictors if the column order changes.
credit_boost10 <- C5.0(
  x = credit_train[names(credit_train) != "default"],
  y = credit_train$default,
  trials = 10
)
credit_boost10
##
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default, trials = 10)
##
## Classification Tree
## Number of samples: 900
## Number of predictors: 16
##
## Number of boosting iterations: 10
## Average tree size: 52.2
##
## Non-standard options: attempt to group attributes
# Predict on the test set with the boosted model
# Classify the test rows with the boosted model and cross-tabulate the true
# labels against the predictions (counts and table-wide proportions only).
credit_boost10_pred <- predict(object = credit_boost10, newdata = credit_test)
CrossTable(
  x = credit_test$default,
  y = credit_boost10_pred,
  prop.chisq = FALSE,
  prop.c = FALSE,
  prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | predicted default
## actual default | no | yes | Row Total |
## ---------------|-----------|-----------|-----------|
## no | 63 | 13 | 76 |
## | 0.630 | 0.130 | |
## ---------------|-----------|-----------|-----------|
## yes | 12 | 12 | 24 |
## | 0.120 | 0.120 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 75 | 25 | 100 |
## ---------------|-----------|-----------|-----------|
##
##
# Cost Matrix
# Dimension labels for the cost matrix: rows are the predicted class,
# columns are the actual class.
matrix_dimensions <- list(
  predicted = c("no", "yes"),
  actual = c("no", "yes")
)
matrix_dimensions
## $predicted
## [1] "no" "yes"
##
## $actual
## [1] "no" "yes"
# Misclassification costs, written row by row so the code reads like the
# printed matrix: predicting "no" for an actual "yes" costs 4, predicting
# "yes" for an actual "no" costs 1, and correct predictions cost 0.
error_cost <- matrix(
  c(
    0, 4,
    1, 0
  ),
  nrow = 2,
  byrow = TRUE,
  dimnames = matrix_dimensions
)
error_cost
## actual
## predicted no yes
## no 0 4
## yes 1 0
# Refit the tree using the misclassification costs in error_cost. The outcome
# column is excluded from the predictors by name rather than by position
# (-17), so the label cannot slip into the predictors if the column order
# changes.
credit_cost <- C5.0(
  x = credit_train[names(credit_train) != "default"],
  y = credit_train$default,
  costs = error_cost
)
# Classify the test rows with the cost-sensitive model and cross-tabulate the
# true labels against the predictions (counts and table-wide proportions only).
credit_cost_pred <- predict(credit_cost, credit_test)
CrossTable(
  credit_test$default, credit_cost_pred,
  prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
  dnn = c("actual default", "predicted default")
)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 100
##
##
## | predicted default
## actual default | no | yes | Row Total |
## ---------------|-----------|-----------|-----------|
## no | 29 | 47 | 76 |
## | 0.290 | 0.470 | |
## ---------------|-----------|-----------|-----------|
## yes | 5 | 19 | 24 |
## | 0.050 | 0.190 | |
## ---------------|-----------|-----------|-----------|
## Column Total | 34 | 66 | 100 |
## ---------------|-----------|-----------|-----------|
##
##