## Fitting decision tree model

## German credit data

# Load the German credit data and give its 21 columns descriptive names.
#
# `rm(list = ls())` was removed from the top of this step: wiping the caller's
# global environment is a destructive side effect and nothing below needs it.
#
# NOTE(review): recorded output further down this script (glimpse, summary,
# class proportions, the fitted tree, the confusion matrix) came from a run
# that read the file with `header = TRUE`, i.e. with the first record missing.
# Re-run the script to refresh it.
library(tidyverse)
#credit_german_dat <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
data_path <- "~/Desktop/CAU/AI4OPT/Data Engineering and Mining II/credit_data_German/german.data"
# german.data has no header line. Reading it with `header = TRUE` turned the
# first record into column names (A11, X6, A34, ...) and left only 999 rows.
credit_german_dat <- read.table(data_path, sep = " ", header = FALSE)

# Expected output after the header fix, derived from the previously recorded
# run (the old "header" is now row 1, the old rows 1-5 are rows 2-6).
dim(credit_german_dat)
## [1] 1000   21
head(credit_german_dat)
##    V1 V2  V3  V4   V5  V6  V7 V8  V9  V10 V11  V12 V13  V14  V15 V16  V17 V18
## 1 A11  6 A34 A43 1169 A65 A75  4 A93 A101   4 A121  67 A143 A152   2 A173   1
## 2 A12 48 A32 A43 5951 A61 A73  2 A92 A101   2 A121  22 A143 A152   1 A173   1
## 3 A14 12 A34 A46 2096 A61 A74  2 A93 A101   3 A121  49 A143 A152   1 A172   2
## 4 A11 42 A32 A42 7882 A61 A74  2 A93 A103   4 A122  45 A143 A153   1 A173   2
## 5 A11 24 A33 A40 4870 A61 A73  3 A93 A101   4 A124  53 A143 A153   2 A173   2
## 6 A14 36 A32 A46 9055 A65 A73  2 A93 A101   4 A124  35 A143 A153   1 A172   2
##    V19  V20 V21
## 1 A192 A201   1
## 2 A191 A201   2
## 3 A191 A201   1
## 4 A191 A201   1
## 5 A191 A201   2
## 6 A192 A201   1
names(credit_german_dat) <- c("status_chk_acct", "duration", "credit_history", "purpose", 
                             "amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor", 
                             "present_resid", "property", "age", "other_install", "housing", "n_credits", 
                             "job", "n_people", "telephone", "foreign", "response")
# The spreadsheet viewer only makes sense in an interactive session; skip it
# when the script is run non-interactively.
if (interactive()) {
  View(credit_german_dat)
}

#glimpse(credit_german_dat)
# Shift the class label down by one so it is coded 0/1 instead of 1/2.
# NOTE(review): the UCI documentation describes 1 = good, 2 = bad credit;
# confirm before interpreting class 1 below as "bad".
credit_german_dat$response <- credit_german_dat$response - 1

# Convert the categorical columns (and the class label) to factors.
myCate <- c("status_chk_acct", "credit_history", "purpose", "saving_acct", "present_emp",
            "sex", "other_debtor", "property", "other_install", "housing",
            "job", "telephone", "foreign", "response")
# `all_of()` marks `myCate` as an external character vector of column names.
# Passing the bare vector to `across()` is deprecated since tidyselect 1.1.0
# and raised a deprecation warning on every run.
credit_german_dat <- credit_german_dat %>%
  mutate(across(all_of(myCate), factor))
# Compact per-column view: type and leading values of every variable.
dplyr::glimpse(credit_german_dat)
## Rows: 999
## Columns: 21
## $ status_chk_acct  <fct> A12, A14, A11, A11, A14, A14, A12, A14, A12, A12, A11…
## $ duration         <int> 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48, 12, 24, 1…
## $ credit_history   <fct> A32, A34, A32, A33, A32, A32, A32, A32, A34, A32, A32…
## $ purpose          <fct> A43, A46, A42, A40, A46, A42, A41, A43, A40, A40, A49…
## $ amount           <int> 5951, 2096, 7882, 4870, 9055, 2835, 6948, 3059, 5234,…
## $ saving_acct      <fct> A61, A61, A61, A61, A65, A63, A61, A64, A61, A61, A61…
## $ present_emp      <fct> A73, A74, A74, A73, A73, A75, A73, A74, A71, A72, A72…
## $ installment_rate <int> 2, 2, 2, 3, 2, 3, 2, 2, 4, 3, 3, 1, 4, 2, 4, 4, 2, 4,…
## $ sex              <fct> A92, A93, A93, A93, A93, A93, A93, A91, A94, A92, A92…
## $ other_debtor     <fct> A101, A101, A103, A101, A101, A101, A101, A101, A101,…
## $ present_resid    <int> 2, 3, 4, 4, 4, 4, 2, 4, 2, 1, 4, 1, 4, 4, 2, 4, 3, 2,…
## $ property         <fct> A121, A121, A122, A124, A124, A122, A123, A121, A123,…
## $ age              <int> 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 24, 22, 60, 2…
## $ other_install    <fct> A143, A143, A143, A143, A143, A143, A143, A143, A143,…
## $ housing          <fct> A152, A152, A153, A153, A153, A152, A151, A152, A152,…
## $ n_credits        <int> 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2, 3, 1,…
## $ job              <fct> A173, A172, A173, A173, A172, A173, A174, A172, A174,…
## $ n_people         <int> 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,…
## $ telephone        <fct> A191, A191, A191, A191, A192, A191, A192, A191, A191,…
## $ foreign          <fct> A201, A201, A201, A201, A201, A201, A201, A201, A201,…
## $ response         <fct> 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,…
# Per-column summaries: level counts for factors, quantiles for integers.
print(summary(credit_german_dat))
##  status_chk_acct    duration     credit_history    purpose        amount     
##  A11:273         Min.   : 4.00   A30: 40        A43    :279   Min.   :  250  
##  A12:269         1st Qu.:12.00   A31: 49        A40    :234   1st Qu.: 1368  
##  A13: 63         Median :18.00   A32:530        A42    :181   Median : 2320  
##  A14:394         Mean   :20.92   A33: 88        A41    :103   Mean   : 3273  
##                  3rd Qu.:24.00   A34:292        A49    : 97   3rd Qu.: 3972  
##                  Max.   :72.00                  A46    : 50   Max.   :18424  
##                                                 (Other): 55                  
##  saving_acct present_emp installment_rate  sex      other_debtor
##  A61:603     A71: 62     Min.   :1.000    A91: 50   A101:906    
##  A62:103     A72:172     1st Qu.:2.000    A92:310   A102: 41    
##  A63: 63     A73:339     Median :3.000    A93:547   A103: 52    
##  A64: 48     A74:174     Mean   :2.972    A94: 92               
##  A65:182     A75:252     3rd Qu.:4.000                          
##                          Max.   :4.000                          
##                                                                 
##  present_resid   property        age        other_install housing   
##  Min.   :1.000   A121:281   Min.   :19.00   A141:139      A151:179  
##  1st Qu.:2.000   A122:232   1st Qu.:27.00   A142: 47      A152:712  
##  Median :3.000   A123:332   Median :33.00   A143:813      A153:108  
##  Mean   :2.844   A124:154   Mean   :35.51                           
##  3rd Qu.:4.000              3rd Qu.:42.00                           
##  Max.   :4.000              Max.   :75.00                           
##                                                                     
##    n_credits       job         n_people     telephone  foreign    response
##  Min.   :1.000   A171: 22   Min.   :1.000   A191:596   A201:962   0:699   
##  1st Qu.:1.000   A172:200   1st Qu.:1.000   A192:403   A202: 37   1:300   
##  Median :1.000   A173:629   Median :1.000                                 
##  Mean   :1.406   A174:148   Mean   :1.155                                 
##  3rd Qu.:2.000              3rd Qu.:1.000                                 
##  Max.   :4.000              Max.   :2.000                                 
## 
## Train/Test data
# Shuffle the rows reproducibly, use the first 800 shuffled rows for training
# and hold out the remaining rows for testing.
set.seed(105)
# Draw one random sort key per row that actually exists. The earlier
# hard-coded `runif(1000)` yielded an out-of-range row index whenever the data
# had fewer than 1000 rows, which silently added an all-NA row to the shuffled
# frame (it ended up in the test set, so only 199 test labels were counted).
n_obs <- nrow(credit_german_dat)
n_train <- 800
stopifnot(n_obs > n_train)
credit.random.df <- credit_german_dat[order(runif(n_obs)), ]

credit.train <- credit.random.df[seq_len(n_train), ]
credit.test <- credit.random.df[(n_train + 1):n_obs, ]
# Class balance in each partition.
prop.table(table(credit.train$response))
## 
##      0      1 
## 0.6975 0.3025
prop.table(table(credit.test$response))
## 
##         0         1 
## 0.7085427 0.2914573
## Modeling Decision Tree with C5.0
#install.packages("C50")
library(C50)
# Predictors are every column except the 21st (`response`); the response
# factor is passed separately as the class labels.
credit.fit <- C5.0(x = credit.train[, -21], y = credit.train$response)
# Print the fitted tree, training-set confusion matrix and attribute usage.
summary(credit.fit)
## 
## Call:
## C5.0.default(x = credit.train[, -21], y = credit.train$response)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Tue Oct 21 16:32:02 2025
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 800 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## status_chk_acct in {A13,A14}: 0 (363/52)
## status_chk_acct in {A11,A12}:
## :...credit_history in {A30,A31}:
##     :...property = A122: 1 (9)
##     :   property = A123:
##     :   :...housing = A151: 1 (5/1)
##     :   :   housing in {A152,A153}: 0 (16/6)
##     :   property = A124:
##     :   :...saving_acct in {A61,A62,A63,A64}: 1 (15/1)
##     :   :   saving_acct = A65: 0 (2)
##     :   property = A121:
##     :   :...other_install = A141: 1 (1)
##     :       other_install = A142: 0 (2)
##     :       other_install = A143:
##     :       :...other_debtor in {A101,A102}: 1 (3)
##     :           other_debtor = A103: 0 (1)
##     credit_history in {A32,A33,A34}:
##     :...duration > 26:
##         :...saving_acct in {A63,A64}: 1 (2)
##         :   saving_acct = A62:
##         :   :...credit_history = A32: 1 (8)
##         :   :   credit_history in {A33,A34}: 0 (7/1)
##         :   saving_acct = A65:
##         :   :...status_chk_acct = A11: 1 (3)
##         :   :   status_chk_acct = A12: 0 (9/1)
##         :   saving_acct = A61:
##         :   :...duration > 47: 1 (17/1)
##         :       duration <= 47:
##         :       :...age <= 34: 1 (28/6)
##         :           age > 34:
##         :           :...present_resid > 2: 0 (10)
##         :               present_resid <= 2:
##         :               :...sex in {A91,A92,A94}: 0 (4)
##         :                   sex = A93:
##         :                   :...other_debtor in {A101,A102}: 1 (5)
##         :                       other_debtor = A103: 0 (1)
##         duration <= 26:
##         :...other_debtor = A103:
##             :...housing = A151: 1 (2)
##             :   housing in {A152,A153}: 0 (24/1)
##             other_debtor = A102:
##             :...foreign = A202: 0 (2)
##             :   foreign = A201:
##             :   :...sex in {A92,A94}: 0 (2)
##             :       sex in {A91,A93}: 1 (8/2)
##             other_debtor = A101:
##             :...sex = A93: 0 (112/23)
##                 sex in {A91,A92,A94}:
##                 :...duration <= 7: 0 (13/1)
##                     duration > 7:
##                     :...credit_history = A34:
##                         :...other_install = A141: 1 (3/1)
##                         :   other_install in {A142,A143}: 0 (19/4)
##                         credit_history = A33:
##                         :...n_credits <= 1: 1 (3)
##                         :   n_credits > 1:
##                         :   :...other_install = A142: 1 (1)
##                         :       other_install in {A141,A143}: 0 (5)
##                         credit_history = A32:
##                         :...other_install = A142: 0 (2)
##                             other_install = A141:
##                             :...present_resid <= 1: 0 (2)
##                             :   present_resid > 1: 1 (6)
##                             other_install = A143:
##                             :...saving_acct = A64: 0 (3)
##                                 saving_acct = A62:
##                                 :...property = A121: 0 (2)
##                                 :   property in {A122,A123,A124}: 1 (6)
##                                 saving_acct = A63:
##                                 :...purpose = A41: 1 (1)
##                                 :   purpose in {A40,A410,A42,A43,A44,A45,A46,
##                                 :               A48,A49}: 0 (4)
##                                 saving_acct = A65:
##                                 :...installment_rate <= 3: 0 (8/1)
##                                 :   installment_rate > 3: 1 (5)
##                                 saving_acct = A61:
##                                 :...purpose in {A41,A410,A44,A46,
##                                     :           A48}: 1 (8/2)
##                                     purpose in {A45,A49}: 0 (3/1)
##                                     purpose = A40:
##                                     :...sex in {A91,A92}: 1 (12/1)
##                                     :   sex = A94: 0 (2)
##                                     purpose = A42:
##                                     :...telephone = A192: 1 (2)
##                                     :   telephone = A191:
##                                     :   :...amount <= 1393: 1 (6/1)
##                                     :       amount > 1393: 0 (8)
##                                     purpose = A43:
##                                     :...job = A171: 1 (0)
##                                         job in {A172,A174}: 0 (2)
##                                         job = A173:
##                                         :...present_emp in {A71,A72,
##                                             :               A73}: 1 (10/2)
##                                             present_emp in {A74,
##                                                             A75}: 0 (3)
## 
## 
## Evaluation on training data (800 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      53  109(13.6%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     540    18    (a): class 0
##      91   151    (b): class 1
## 
## 
##  Attribute usage:
## 
##  100.00% status_chk_acct
##   54.62% credit_history
##   47.88% duration
##   37.38% other_debtor
##   33.88% sex
##   24.50% saving_acct
##   16.25% other_install
##    7.75% property
##    7.62% purpose
##    6.00% age
##    5.88% housing
##    3.50% present_resid
##    2.00% telephone
##    1.88% job
##    1.75% amount
##    1.62% present_emp
##    1.62% installment_rate
##    1.50% foreign
##    1.12% n_credits
## 
## 
## Time: 0.0 secs
## Confusion matrix
# Predict a class label for every held-out row.
fitted.class <- predict(credit.fit, newdata = credit.test, type = "class")
head(fitted.class)
## [1] 1 0 1 0 1 0
## Levels: 0 1
# Cross-tabulate predictions (rows) against the observed response (columns).
cm <- table(fitted.class, credit.test$response)
print(cm)
##             
## fitted.class   0   1
##            0 122  30
##            1  19  28
print("The error rate is: ")
## [1] "The error rate is: "
# Misclassification rate in percent: 100 * (1 - share of counts on the diagonal).
print(100 * (1 - sum(diag(cm)) / sum(cm)))
## [1] 24.62312