# 032624

## Decision Trees pt2

library(dplyr)
library(C50)
library(gmodels)
### Credit Data Frame
# Import data
# The CSV location defaults to the original local path but can be overridden
# through the CREDIT_CSV environment variable, so the script can run on another
# machine without editing the source.
library(readr)
credit_csv <- Sys.getenv("CREDIT_CSV", unset = "")
if (!nzchar(credit_csv)) {
  credit_csv <- "C:/Users/angel/Documents/RPractice/credit.csv"
}
# Text columns are read in as factors (stringsAsFactors = TRUE).
credit <- read.csv(credit_csv, header = TRUE, stringsAsFactors = TRUE)
# The spreadsheet-style viewer only makes sense in an interactive session, so
# it is skipped when the script is run in batch mode.
if (interactive()) {
  View(credit)
}

# Exploring the dataset
# Print the structure of the data frame: size, column names and column types.
str(credit)

## 'data.frame':    1000 obs. of  17 variables:
##  $ checking_balance    : Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...
##  $ months_loan_duration: int  6 48 12 42 24 36 24 36 12 30 ...
##  $ credit_history      : Factor w/ 5 levels "critical","good",..: 1 2 1 2 4 2 2 2 2 1 ...
##  $ purpose             : Factor w/ 6 levels "business","car",..: 5 5 4 5 2 4 5 2 5 2 ...
##  $ amount              : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ savings_balance     : Factor w/ 5 levels "< 100 DM","> 1000 DM",..: 5 1 1 1 1 5 4 1 2 1 ...
##  $ employment_duration : Factor w/ 5 levels "< 1 year","> 7 years",..: 2 3 4 4 3 3 2 3 4 5 ...
##  $ percent_of_income   : int  4 2 2 2 3 2 3 2 2 4 ...
##  $ years_at_residence  : int  4 2 3 4 4 4 4 2 4 2 ...
##  $ age                 : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ other_credit        : Factor w/ 3 levels "bank","none",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ housing             : Factor w/ 3 levels "other","own",..: 2 2 2 1 1 1 2 3 2 2 ...
##  $ existing_loans_count: int  2 1 1 1 2 1 1 1 1 2 ...
##  $ job                 : Factor w/ 4 levels "management","skilled",..: 2 2 4 2 2 4 2 1 4 1 ...
##  $ dependents          : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ phone               : Factor w/ 2 levels "no","yes": 2 1 1 1 1 2 1 2 1 1 ...
##  $ default             : Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...

# Drop row 1001 if it is present; a negative index past the last row leaves
# the data frame unchanged.
credit <- credit[-1001, ]
# Set the checking balance of record 155 to the existing "1 - 200 DM" level.
# The column is addressed by name rather than by position, and the level label
# is assigned directly instead of through a throw-away one-level factor.
credit[155, "checking_balance"] <- "1 - 200 DM"
# Show the stored value together with the factor levels.
credit[155, "checking_balance"]

## [1] 1 - 200 DM
## Levels: < 0 DM > 200 DM 1 - 200 DM unknown

# Number of records in each checking-account balance category.
checking_counts <- table(credit[["checking_balance"]])
checking_counts

## 
##     < 0 DM   > 200 DM 1 - 200 DM    unknown 
##        274         63        269        394

# The same counts expressed as a share of all records.
prop.table(checking_counts)

## 
##     < 0 DM   > 200 DM 1 - 200 DM    unknown 
##      0.274      0.063      0.269      0.394

# Inspect the storage type and levels of the column.
str(credit[["checking_balance"]])

##  Factor w/ 4 levels "< 0 DM","> 200 DM",..: 1 3 4 1 1 4 4 3 4 3 ...

# Number of records in each savings-account balance category.
savings_counts <- table(credit[["savings_balance"]])
savings_counts

## 
##      < 100 DM     > 1000 DM  100 - 500 DM 500 - 1000 DM       unknown 
##           603            48           103            63           183

# The same counts expressed as a share of all records.
prop.table(savings_counts)

## 
##      < 100 DM     > 1000 DM  100 - 500 DM 500 - 1000 DM       unknown 
##         0.603         0.048         0.103         0.063         0.183

# Five-number summary plus mean of the loan duration column.
summary(credit[["months_loan_duration"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     4.0    12.0    18.0    20.9    24.0    72.0

# Five-number summary plus mean of the loan amount column.
summary(credit[["amount"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

# Structure of the target column before re-encoding.
str(credit[["default"]])

##  Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...

# Re-encode the target as a factor and check its structure again.
credit[["default"]] <- factor(credit[["default"]])
str(credit[["default"]])

##  Factor w/ 2 levels "no","yes": 1 2 1 1 2 1 1 1 1 2 ...

# Class counts of the target.
default_counts <- table(credit[["default"]])
default_counts

## 
##  no yes 
## 700 300

# Class shares of the target.
prop.table(default_counts)

## 
##  no yes 
## 0.7 0.3

# Create a random training and test dataset
# Fix the random seed so the same 900 rows are drawn on every run; without it
# the split, and every model result that follows, changes from run to run.
# Output recorded in this file from an earlier unseeded run will not match.
set.seed(123)
# Draw 900 distinct row numbers from the rows actually present in `credit`
# instead of from a hard-coded 1000.
train_sample <- sample(nrow(credit), 900)
str(train_sample)

##  int [1:900] 594 506 367 749 413 899 684 236 751 955 ...

# Rows listed in train_sample form the training set; every other row is held
# out as the test set.
credit_test <- credit[-train_sample, ]
credit_train <- credit[train_sample, ]

# Share of each class in the training set.
train_default_counts <- table(credit_train[["default"]])
prop.table(train_default_counts)

## 
##        no       yes 
## 0.6933333 0.3066667

# Share of each class in the test set, for comparison.
test_default_counts <- table(credit_test[["default"]])
prop.table(test_default_counts)

## 
##   no  yes 
## 0.76 0.24

# Making prediction model
library(C50)
# Fit a C5.0 tree with `default` as the class. Every other column is used as a
# predictor; the target is excluded by name instead of by position (17) so the
# call stays correct if columns are added, removed or reordered.
credit_model <- C5.0(
  credit_train[names(credit_train) != "default"],
  credit_train$default
)
# Print the short model description.
credit_model

## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 16 
## 
## Tree size: 66 
## 
## Non-standard options: attempt to group attributes

# Print the detailed model summary for the fitted tree.
summary(object = credit_model)

## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Fri Apr 26 15:35:03 2024
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (17 attributes) from undefined.data
## 
## Decision tree:
## 
## checking_balance = unknown: no (358/42)
## checking_balance in {< 0 DM,> 200 DM,1 - 200 DM}:
## :...credit_history in {perfect,very good}:
##     :...savings_balance in {> 1000 DM,500 - 1000 DM,unknown}:
##     :   :...dependents <= 1: no (9/1)
##     :   :   dependents > 1: yes (4/1)
##     :   savings_balance in {< 100 DM,100 - 500 DM}:
##     :   :...credit_history = very good:
##     :       :...age <= 23: no (2)
##     :       :   age > 23: yes (25/3)
##     :       credit_history = perfect:
##     :       :...housing in {other,rent}: yes (14)
##     :           housing = own:
##     :           :...age > 33: yes (4)
##     :               age <= 33:
##     :               :...age <= 29: yes (4/1)
##     :                   age > 29: no (6)
##     credit_history in {critical,good,poor}:
##     :...months_loan_duration <= 11: no (88/16)
##         months_loan_duration > 11:
##         :...savings_balance in {> 1000 DM,unknown}: no (71/19)
##             savings_balance = 500 - 1000 DM:
##             :...percent_of_income <= 3: no (10/1)
##             :   percent_of_income > 3: yes (3)
##             savings_balance = 100 - 500 DM:
##             :...purpose in {business,car0,education,renovations}: no (9/1)
##             :   purpose = furniture/appliances: yes (17/4)
##             :   purpose = car:
##             :   :...age <= 25: yes (3)
##             :       age > 25: no (9/1)
##             savings_balance = < 100 DM:
##             :...months_loan_duration > 47: yes (19/2)
##                 months_loan_duration <= 47:
##                 :...checking_balance = > 200 DM:
##                     :...dependents <= 1: no (23/5)
##                     :   dependents > 1: yes (2)
##                     checking_balance = < 0 DM:
##                     :...purpose = education: yes (8)
##                     :   purpose in {business,car,car0,furniture/appliances,
##                     :   :           renovations}:
##                     :   :...job = management: no (17/3)
##                     :       job in {skilled,unemployed,unskilled}:
##                     :       :...credit_history = poor: yes (7/1)
##                     :           credit_history = critical:
##                     :           :...housing = other: yes (1)
##                     :           :   housing = rent:
##                     :           :   :...amount <= 2122: no (3)
##                     :           :   :   amount > 2122: yes (6)
##                     :           :   housing = own:
##                     :           :   :...months_loan_duration > 13: no (9/1)
##                     :           :       months_loan_duration <= 13:
##                     :           :       :...percent_of_income <= 3: yes (2)
##                     :           :           percent_of_income > 3: no (5/1)
##                     :           credit_history = good:
##                     :           :...purpose = renovations: yes (0)
##                     :               purpose in {business,car0}: no (3)
##                     :               purpose = car:
##                     :               :...other_credit = store: yes (0)
##                     :               :   other_credit = bank: no (1)
##                     :               :   other_credit = none:
##                     :               :   :...percent_of_income <= 2: no (9/4)
##                     :               :       percent_of_income > 2: yes (13/1)
##                     :               purpose = furniture/appliances:
##                     :               :...phone = yes: yes (5)
##                     :                   phone = no:
##                     :                   :...years_at_residence <= 1: no (9/1)
##                     :                       years_at_residence > 1: [S1]
##                     checking_balance = 1 - 200 DM:
##                     :...amount > 8133: yes (8)
##                         amount <= 8133:
##                         :...amount > 5381: no (13)
##                             amount <= 5381:
##                             :...other_credit = store: yes (3)
##                                 other_credit in {bank,none}:
##                                 :...purpose in {car0,education}: no (2)
##                                     purpose = renovations:
##                                     :...years_at_residence <= 3: yes (2)
##                                     :   years_at_residence > 3: no (2)
##                                     purpose = business:
##                                     :...credit_history in {critical,
##                                     :   :                  poor}: no (2)
##                                     :   credit_history = good:
##                                     :   :...years_at_residence <= 1: no (2)
##                                     :       years_at_residence > 1: yes (2)
##                                     purpose = car:
##                                     :...other_credit = bank: yes (6)
##                                     :   other_credit = none:
##                                     :   :...dependents > 1: no (2)
##                                     :       dependents <= 1:
##                                     :       :...phone = no: yes (7/2)
##                                     :           phone = yes: no (4/1)
##                                     purpose = furniture/appliances:
##                                     :...years_at_residence > 3: no (12/1)
##                                         years_at_residence <= 3:
##                                         :...other_credit = bank: no (2)
##                                             other_credit = none:
##                                             :...age > 35: yes (4)
##                                                 age <= 35: [S2]
## 
## SubTree [S1]
## 
## employment_duration in {< 1 year,unemployed}: yes (4)
## employment_duration = > 7 years:
## :...years_at_residence <= 2: yes (2)
## :   years_at_residence > 2: no (5/1)
## employment_duration = 4 - 7 years:
## :...percent_of_income <= 3: no (2)
## :   percent_of_income > 3: yes (2)
## employment_duration = 1 - 4 years:
## :...housing in {other,rent}: yes (6)
##     housing = own:
##     :...age <= 26: yes (2)
##         age > 26: no (10/1)
## 
## SubTree [S2]
## 
## credit_history = poor: yes (1)
## credit_history = critical:
## :...months_loan_duration <= 15: no (2)
## :   months_loan_duration > 15: yes (4/1)
## credit_history = good:
## :...percent_of_income <= 1: yes (3/1)
##     percent_of_income > 1: no (6)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      64  117(13.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     607    17    (a): class no
##     100   176    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% checking_balance
##   60.22% credit_history
##   52.67% months_loan_duration
##   50.44% savings_balance
##   26.00% purpose
##   13.67% job
##   10.89% amount
##   10.11% other_credit
##    9.44% age
##    9.33% years_at_residence
##    8.00% housing
##    6.44% phone
##    6.11% percent_of_income
##    5.67% dependents
##    3.67% employment_duration
## 
## 
## Time: 0.0 secs

# Using the model to evaluate
library(gmodels)
# Predict a class for every held-out record.
credit_pred <- predict(credit_model, newdata = credit_test)
# Cross-tabulate actual against predicted classes. Row, column and chi-square
# proportions are switched off, leaving counts and table-wide proportions.
CrossTable(
  x = credit_test$default,
  y = credit_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("actual default", "predicted default")
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                | predicted default 
## actual default |        no |       yes | Row Total | 
## ---------------|-----------|-----------|-----------|
##             no |        64 |        12 |        76 | 
##                |     0.640 |     0.120 |           | 
## ---------------|-----------|-----------|-----------|
##            yes |        16 |         8 |        24 | 
##                |     0.160 |     0.080 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |        80 |        20 |       100 | 
## ---------------|-----------|-----------|-----------|
## 
##

# Improving model performance
# Fit a boosted C5.0 model with 10 trials. The target column is excluded from
# the predictors by name instead of by position (17), so the call stays
# correct if columns are added, removed or reordered.
credit_boost10 <- C5.0(
  credit_train[names(credit_train) != "default"],
  credit_train$default,
  trials = 10
)
# Print the short model description.
credit_boost10

## 
## Call:
## C5.0.default(x = credit_train[-17], y = credit_train$default, trials = 10)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 16 
## 
## Number of boosting iterations: 10 
## Average tree size: 52.2 
## 
## Non-standard options: attempt to group attributes

# Predicting with boost
# Predict a class for every held-out record using the boosted model.
credit_boost10_pred <- predict(credit_boost10, newdata = credit_test)
# Cross-tabulate actual against predicted classes. Row, column and chi-square
# proportions are switched off, leaving counts and table-wide proportions.
CrossTable(
  x = credit_test$default,
  y = credit_boost10_pred,
  prop.r = FALSE,
  prop.c = FALSE,
  prop.chisq = FALSE,
  dnn = c("actual default", "predicted default")
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                | predicted default 
## actual default |        no |       yes | Row Total | 
## ---------------|-----------|-----------|-----------|
##             no |        63 |        13 |        76 | 
##                |     0.630 |     0.130 |           | 
## ---------------|-----------|-----------|-----------|
##            yes |        12 |        12 |        24 | 
##                |     0.120 |     0.120 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |        75 |        25 |       100 | 
## ---------------|-----------|-----------|-----------|
## 
##

# Cost Matrix
# Both dimensions carry the class labels; rows are the predicted class and
# columns are the actual class.
matrix_dimensions <- list(
  predicted = c("no", "yes"),
  actual = c("no", "yes")
)
matrix_dimensions

## $predicted
## [1] "no"  "yes"
## 
## $actual
## [1] "no"  "yes"

# Written row by row so the code mirrors the printed table: predicting "no"
# for an actual "yes" costs 4, predicting "yes" for an actual "no" costs 1,
# and correct predictions cost nothing.
error_cost <- matrix(
  c(0, 4,
    1, 0),
  nrow = 2,
  byrow = TRUE,
  dimnames = matrix_dimensions
)
error_cost

##          actual
## predicted no yes
##       no   0   4
##       yes  1   0

# Fit a cost-sensitive C5.0 tree using the penalties in error_cost. The target
# column is excluded from the predictors by name instead of by position (17),
# so the call stays correct if columns are added, removed or reordered.
credit_cost <- C5.0(
  credit_train[names(credit_train) != "default"],
  credit_train$default,
  costs = error_cost
)
# Predict a class for every held-out record.
credit_cost_pred <- predict(credit_cost, credit_test)
# Cross-tabulate actual against predicted classes. Row, column and chi-square
# proportions are switched off, leaving counts and table-wide proportions.
CrossTable(credit_test$default, credit_cost_pred,
           prop.chisq = FALSE, prop.c = FALSE, prop.r = FALSE,
           dnn = c('actual default', 'predicted default'))

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                | predicted default 
## actual default |        no |       yes | Row Total | 
## ---------------|-----------|-----------|-----------|
##             no |        29 |        47 |        76 | 
##                |     0.290 |     0.470 |           | 
## ---------------|-----------|-----------|-----------|
##            yes |         5 |        19 |        24 | 
##                |     0.050 |     0.190 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |        34 |        66 |       100 | 
## ---------------|-----------|-----------|-----------|
## 
##

# 032624

# Thu Vu

# 2024-03-26