## Fitting decision tree model
## German credit data
# NOTE: `rm(list = ls())` was removed. It only clears objects from the global
# environment and silently wipes the workspace of anyone who source()s this
# file; restart R when a clean session is needed.
library(tidyverse)
#credit_german_dat <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
# The data file has no header line: its first line is already a credit record.
# Reading it with header = TRUE turned that record into column names (the
# "A11 X6 A34 ..." names in the head() listing recorded below) and left only
# 999 rows. header = FALSE keeps every record; the columns are renamed later.
# The `##` output recorded in this file predates this fix -- re-run to refresh.
credit_german_dat <- read.table(
  "~/Desktop/CAU/AI4OPT/Data Engineering and Mining II/credit_data_German/german.data",
  sep = " ",
  header = FALSE
)
# Rows x columns of the loaded table.
dim(credit_german_dat)
## [1] 999 21
# Show the first few records of the raw table.
credit_german_dat %>% head()
## A11 X6 A34 A43 X1169 A65 A75 X4 A93 A101 X4.1 A121 X67 A143 A152 X2 A173 X1
## 1 A12 48 A32 A43 5951 A61 A73 2 A92 A101 2 A121 22 A143 A152 1 A173 1
## 2 A14 12 A34 A46 2096 A61 A74 2 A93 A101 3 A121 49 A143 A152 1 A172 2
## 3 A11 42 A32 A42 7882 A61 A74 2 A93 A103 4 A122 45 A143 A153 1 A173 2
## 4 A11 24 A33 A40 4870 A61 A73 3 A93 A101 4 A124 53 A143 A153 2 A173 2
## 5 A14 36 A32 A46 9055 A65 A73 2 A93 A101 4 A124 35 A143 A153 1 A172 2
## 6 A14 24 A32 A42 2835 A63 A75 3 A93 A101 4 A122 53 A143 A152 1 A173 1
## A192 A201 X1.1
## 1 A191 A201 2
## 2 A191 A201 1
## 3 A191 A201 1
## 4 A191 A201 2
## 5 A192 A201 1
## 6 A191 A201 1
# Descriptive names for the 21 columns, in file order; the last column is the
# outcome.
names(credit_german_dat) <- c(
  "status_chk_acct", "duration", "credit_history", "purpose",
  "amount", "saving_acct", "present_emp", "installment_rate", "sex",
  "other_debtor", "present_resid", "property", "age", "other_install",
  "housing", "n_credits", "job", "n_people", "telephone", "foreign",
  "response"
)
# View() opens a GUI data viewer, so only call it in an interactive session;
# batch runs (e.g. Rscript) skip it.
if (interactive()) {
  View(credit_german_dat)
}
#glimpse(credit_german_dat)
# Shift the outcome coding from 1/2 to 0/1.
credit_german_dat$response <- credit_german_dat$response - 1
# Columns holding categorical codes (plus the outcome) that become factors.
myCate <- c(
  "status_chk_acct", "credit_history", "purpose", "saving_acct",
  "present_emp", "sex", "other_debtor", "property", "other_install",
  "housing", "job", "telephone", "foreign", "response"
)
# all_of() states explicitly that myCate is an external character vector of
# column names. Passing the bare vector to across() is deprecated in tidyselect
# and produced the warning recorded just below this statement.
credit_german_dat <- credit_german_dat %>%
  mutate(across(all_of(myCate), factor))
## Warning: There was 1 warning in `mutate()`.
## ℹ In argument: `across(myCate, factor)`.
## Caused by warning:
## ! Using an external vector in selections was deprecated in tidyselect 1.1.0.
## ℹ Please use `all_of()` or `any_of()` instead.
## # Was:
## data %>% select(myCate)
##
## # Now:
## data %>% select(all_of(myCate))
##
## See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
# Compact column-by-column overview (type and leading values) after conversion.
credit_german_dat %>% glimpse()
## Rows: 999
## Columns: 21
## $ status_chk_acct <fct> A12, A14, A11, A11, A14, A14, A12, A14, A12, A12, A11…
## $ duration <int> 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48, 12, 24, 1…
## $ credit_history <fct> A32, A34, A32, A33, A32, A32, A32, A32, A34, A32, A32…
## $ purpose <fct> A43, A46, A42, A40, A46, A42, A41, A43, A40, A40, A49…
## $ amount <int> 5951, 2096, 7882, 4870, 9055, 2835, 6948, 3059, 5234,…
## $ saving_acct <fct> A61, A61, A61, A61, A65, A63, A61, A64, A61, A61, A61…
## $ present_emp <fct> A73, A74, A74, A73, A73, A75, A73, A74, A71, A72, A72…
## $ installment_rate <int> 2, 2, 2, 3, 2, 3, 2, 2, 4, 3, 3, 1, 4, 2, 4, 4, 2, 4,…
## $ sex <fct> A92, A93, A93, A93, A93, A93, A93, A91, A94, A92, A92…
## $ other_debtor <fct> A101, A101, A103, A101, A101, A101, A101, A101, A101,…
## $ present_resid <int> 2, 3, 4, 4, 4, 4, 2, 4, 2, 1, 4, 1, 4, 4, 2, 4, 3, 2,…
## $ property <fct> A121, A121, A122, A124, A124, A122, A123, A121, A123,…
## $ age <int> 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 24, 22, 60, 2…
## $ other_install <fct> A143, A143, A143, A143, A143, A143, A143, A143, A143,…
## $ housing <fct> A152, A152, A153, A153, A153, A152, A151, A152, A152,…
## $ n_credits <int> 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 3, 1,…
## $ job <fct> A173, A172, A173, A173, A172, A173, A174, A172, A174,…
## $ n_people <int> 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ telephone <fct> A191, A191, A191, A191, A192, A191, A192, A191, A191,…
## $ foreign <fct> A201, A201, A201, A201, A201, A201, A201, A201, A201,…
## $ response <fct> 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,…
# Per-column summaries: level counts for factors, quantiles for numeric columns.
credit_german_dat %>% summary()
## status_chk_acct duration credit_history purpose amount
## A11:273 Min. : 4.00 A30: 40 A43 :279 Min. : 250
## A12:269 1st Qu.:12.00 A31: 49 A40 :234 1st Qu.: 1368
## A13: 63 Median :18.00 A32:530 A42 :181 Median : 2320
## A14:394 Mean :20.92 A33: 88 A41 :103 Mean : 3273
## 3rd Qu.:24.00 A34:292 A49 : 97 3rd Qu.: 3972
## Max. :72.00 A46 : 50 Max. :18424
## (Other): 55
## saving_acct present_emp installment_rate sex other_debtor
## A61:603 A71: 62 Min. :1.000 A91: 50 A101:906
## A62:103 A72:172 1st Qu.:2.000 A92:310 A102: 41
## A63: 63 A73:339 Median :3.000 A93:547 A103: 52
## A64: 48 A74:174 Mean :2.972 A94: 92
## A65:182 A75:252 3rd Qu.:4.000
## Max. :4.000
##
## present_resid property age other_install housing
## Min. :1.000 A121:281 Min. :19.00 A141:139 A151:179
## 1st Qu.:2.000 A122:232 1st Qu.:27.00 A142: 47 A152:712
## Median :3.000 A123:332 Median :33.00 A143:813 A153:108
## Mean :2.844 A124:154 Mean :35.51
## 3rd Qu.:4.000 3rd Qu.:42.00
## Max. :4.000 Max. :75.00
##
## n_credits job n_people telephone foreign response
## Min. :1.000 A171: 22 Min. :1.000 A191:596 A201:962 0:699
## 1st Qu.:1.000 A172:200 1st Qu.:1.000 A192:403 A202: 37 1:300
## Median :1.000 A173:629 Median :1.000
## Mean :1.406 A174:148 Mean :1.155
## 3rd Qu.:2.000 3rd Qu.:1.000
## Max. :4.000 Max. :2.000
##
## Train/Test data
set.seed(105)
# Shuffle the rows, keep the first 800 for training and the remainder for
# testing. The permutation is sized from the data rather than a hard-coded
# 1000: when the table holds fewer rows (999 in the output recorded above),
# index 1000 points at a non-existent row and indexing inserts an all-NA
# record into the split.
n_obs <- nrow(credit_german_dat)
n_train <- 800
stopifnot(n_obs > n_train)
credit.random.df <- credit_german_dat[order(runif(n_obs)), ]
credit.train <- credit.random.df[seq_len(n_train), ]
credit.test <- credit.random.df[(n_train + 1):n_obs, ]
# Class balance of the outcome in each partition.
prop.table(table(credit.train$response))
##
## 0 1
## 0.6975 0.3025
prop.table(table(credit.test$response))
##
## 0 1
## 0.7085427 0.2914573
## Modeling Decision Tree with C5.0
# One-time setup if the package is missing:
#install.packages("C50")
# C50 supplies the C5.0() decision-tree learner used below.
library(C50)
# Fit the tree on the training rows. Predictors are all columns except the
# 21st (named "response" above); the outcome is the response factor itself.
credit.fit <- C5.0( credit.train[, -21], credit.train$response)
# Print the fitted tree together with its training-set error, confusion
# matrix and attribute usage.
summary(credit.fit)
##
## Call:
## C5.0.default(x = credit.train[, -21], y = credit.train$response)
##
##
## C5.0 [Release 2.07 GPL Edition] Tue Oct 21 16:32:02 2025
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 800 cases (21 attributes) from undefined.data
##
## Decision tree:
##
## status_chk_acct in {A13,A14}: 0 (363/52)
## status_chk_acct in {A11,A12}:
## :...credit_history in {A30,A31}:
## :...property = A122: 1 (9)
## : property = A123:
## : :...housing = A151: 1 (5/1)
## : : housing in {A152,A153}: 0 (16/6)
## : property = A124:
## : :...saving_acct in {A61,A62,A63,A64}: 1 (15/1)
## : : saving_acct = A65: 0 (2)
## : property = A121:
## : :...other_install = A141: 1 (1)
## : other_install = A142: 0 (2)
## : other_install = A143:
## : :...other_debtor in {A101,A102}: 1 (3)
## : other_debtor = A103: 0 (1)
## credit_history in {A32,A33,A34}:
## :...duration > 26:
## :...saving_acct in {A63,A64}: 1 (2)
## : saving_acct = A62:
## : :...credit_history = A32: 1 (8)
## : : credit_history in {A33,A34}: 0 (7/1)
## : saving_acct = A65:
## : :...status_chk_acct = A11: 1 (3)
## : : status_chk_acct = A12: 0 (9/1)
## : saving_acct = A61:
## : :...duration > 47: 1 (17/1)
## : duration <= 47:
## : :...age <= 34: 1 (28/6)
## : age > 34:
## : :...present_resid > 2: 0 (10)
## : present_resid <= 2:
## : :...sex in {A91,A92,A94}: 0 (4)
## : sex = A93:
## : :...other_debtor in {A101,A102}: 1 (5)
## : other_debtor = A103: 0 (1)
## duration <= 26:
## :...other_debtor = A103:
## :...housing = A151: 1 (2)
## : housing in {A152,A153}: 0 (24/1)
## other_debtor = A102:
## :...foreign = A202: 0 (2)
## : foreign = A201:
## : :...sex in {A92,A94}: 0 (2)
## : sex in {A91,A93}: 1 (8/2)
## other_debtor = A101:
## :...sex = A93: 0 (112/23)
## sex in {A91,A92,A94}:
## :...duration <= 7: 0 (13/1)
## duration > 7:
## :...credit_history = A34:
## :...other_install = A141: 1 (3/1)
## : other_install in {A142,A143}: 0 (19/4)
## credit_history = A33:
## :...n_credits <= 1: 1 (3)
## : n_credits > 1:
## : :...other_install = A142: 1 (1)
## : other_install in {A141,A143}: 0 (5)
## credit_history = A32:
## :...other_install = A142: 0 (2)
## other_install = A141:
## :...present_resid <= 1: 0 (2)
## : present_resid > 1: 1 (6)
## other_install = A143:
## :...saving_acct = A64: 0 (3)
## saving_acct = A62:
## :...property = A121: 0 (2)
## : property in {A122,A123,A124}: 1 (6)
## saving_acct = A63:
## :...purpose = A41: 1 (1)
## : purpose in {A40,A410,A42,A43,A44,A45,A46,
## : A48,A49}: 0 (4)
## saving_acct = A65:
## :...installment_rate <= 3: 0 (8/1)
## : installment_rate > 3: 1 (5)
## saving_acct = A61:
## :...purpose in {A41,A410,A44,A46,
## : A48}: 1 (8/2)
## purpose in {A45,A49}: 0 (3/1)
## purpose = A40:
## :...sex in {A91,A92}: 1 (12/1)
## : sex = A94: 0 (2)
## purpose = A42:
## :...telephone = A192: 1 (2)
## : telephone = A191:
## : :...amount <= 1393: 1 (6/1)
## : amount > 1393: 0 (8)
## purpose = A43:
## :...job = A171: 1 (0)
## job in {A172,A174}: 0 (2)
## job = A173:
## :...present_emp in {A71,A72,
## : A73}: 1 (10/2)
## present_emp in {A74,
## A75}: 0 (3)
##
##
## Evaluation on training data (800 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 53 109(13.6%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 540 18 (a): class 0
## 91 151 (b): class 1
##
##
## Attribute usage:
##
## 100.00% status_chk_acct
## 54.62% credit_history
## 47.88% duration
## 37.38% other_debtor
## 33.88% sex
## 24.50% saving_acct
## 16.25% other_install
## 7.75% property
## 7.62% purpose
## 6.00% age
## 5.88% housing
## 3.50% present_resid
## 2.00% telephone
## 1.88% job
## 1.75% amount
## 1.62% present_emp
## 1.62% installment_rate
## 1.50% foreign
## 1.12% n_credits
##
##
## Time: 0.0 secs
## Confusion matrix
# Predicted class labels for the held-out rows.
fitted.class <- predict(credit.fit, newdata = credit.test, type = "class")
head(fitted.class)
## [1] 1 0 1 0 1 0
## Levels: 0 1
# Cross-tabulate predictions (rows) against the observed response (columns),
# store the table and print it.
cm <- table(fitted.class, credit.test$response)
cm
##
## fitted.class 0 1
## 0 122 30
## 1 19 28
# Test-set misclassification rate in percent: the share of cases that fall
# off the diagonal of the confusion matrix.
"The error rate is: "
## [1] "The error rate is: "
100 * (1 - sum(diag(cm)) / sum(cm))
## [1] 24.62312