header 1 = chunk

header 2

Cỡ chữ 10 cỡ chữ 1 chữ màu xanh chữ màu đỏ chữ màu vàng

install package

#install.packages("C50")

library("readr")
library("C50")
library("gmodels")

đọc file from web

# Fetch the credit dataset (CSV) directly from its GitHub location.
credit <- read_csv(
  file = "https://raw.githubusercontent.com/stedy/Machine-Learning-with-R-datasets/master/credit.csv"
)

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   .default = col_character(),
##   months_loan_duration = col_double(),
##   amount = col_double(),
##   installment_rate = col_double(),
##   residence_history = col_double(),
##   age = col_double(),
##   existing_credits = col_double(),
##   default = col_double(),
##   dependents = col_double()
## )
## ℹ Use `spec()` for the full column specifications.

# Convert the imported table to a data.frame, then recode the numeric
# `default` column (values 1 and 2) as a labelled factor.
credit <- as.data.frame(credit)
credit[["default"]] <- factor(
  credit[["default"]],
  levels = c(1, 2),
  labels = c("no_apply", "yes_apply")
)

# str(credit)

Xem các phần tử trong một biến và loại (type) gì

# Count how many records fall into each checking_balance category.
table(credit[["checking_balance"]])

## 
##     < 0 DM   > 200 DM 1 - 200 DM    unknown 
##        274         63        269        394

# Report the storage type of the checking_balance column.
typeof(credit[["checking_balance"]])

## [1] "character"

Create a randomly ordered copy of the credit data frame — runif() tạo ra một dãy số ngẫu nhiên trong khoảng từ 0 đến 1, trong trường hợp này là 1000 số như vậy, rồi lệnh order() trả về SỐ THỨ TỰ (vị trí) của các số đó khi xếp từ nhỏ nhất đến lớn nhất; mục tiêu cuối cùng là sắp xếp lại DF credit một cách ngẫu nhiên.
The runif(1000) command generates a list of 1,000 random numbers. We need exactly 1,000 random numbers because there are 1,000 records in the credit data frame. The order() function then returns a vector of numbers indicating the sorted position of the 1,000 random numbers. We then use these positions to select rows in the credit data frame and store in a new data frame named credit_rand.

# Shuffle the rows of `credit`: draw one uniform random number per row and
# index the data frame by the ordering of those draws. The seed is fixed so
# the shuffle is reproducible.
set.seed(12345)
# Use nrow(credit) rather than a hard-coded 1000 so the permutation always
# covers every row exactly once, whatever the size of the dataset (a fixed
# count would drop rows or introduce NA rows if the row count differed).
credit_rand <- credit[order(runif(nrow(credit))), ]

Test – has the content of the data frame changed or not? To confirm that we have the same data frame sorted differently, we’ll compare values on the amount feature across the two data frames. The following code shows the summary statistics, which should be identical, and the first few values, which should differ.

# Summary statistics of `amount` in the original data frame.
summary(credit[["amount"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

# Summary statistics of `amount` in the shuffled data frame.
summary(credit_rand[["amount"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     250    1366    2320    3271    3972   18424

# First few `amount` values in the original row order.
head(credit[["amount"]])

## [1] 1169 5951 2096 7882 4870 9055

# First few `amount` values after shuffling.
head(credit_rand[["amount"]])

## [1] 1199 2576 1103 4020 1501 1568

create train and test set

# Split the shuffled data into a training set (first 90% of the rows) and a
# test set (the remaining rows). The sizes are derived from the data instead
# of being hard-coded; for a 1000-row data frame this yields the same
# 1:900 / 901:1000 split as before.
n_rows <- nrow(credit_rand)
n_train <- (n_rows * 9L) %/% 10L  # integer arithmetic avoids rounding issues
credit_train <- credit_rand[seq_len(n_train), ]
credit_test <- credit_rand[n_train + seq_len(n_rows - n_train), ]

If all went well, we should have about 30 percent of defaulted loans in each of the datasets.

# Class proportions of the outcome in the training set.
prop.table(table(credit_train[["default"]]))

## 
##  no_apply yes_apply 
## 0.7022222 0.2977778

# Class proportions of the outcome in the test set.
prop.table(table(credit_test[["default"]]))

## 
##  no_apply yes_apply 
##      0.68      0.32

Using C5.0() to estimate

# Fit a single C5.0 decision tree on the training set.
# Predictors: every column except column 17 (presumably the `default`
# outcome column — verify against names(credit_train)).
# Outcome: credit_train$default.
credit_mode <- C5.0(credit_train[,-17], credit_train$default)
# Print a short description of the fitted model.
credit_mode

## 
## Call:
## C5.0.default(x = credit_train[, -17], y = credit_train$default)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 20 
## 
## Tree size: 57 
## 
## Non-standard options: attempt to group attributes

# Show the full tree, the training-set confusion matrix and attribute usage.
summary(credit_mode)

## 
## Call:
## C5.0.default(x = credit_train[, -17], y = credit_train$default)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Thu Feb  4 03:35:18 2021
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 900 cases (21 attributes) from undefined.data
## 
## Decision tree:
## 
## checking_balance = unknown: no_apply (358/44)
## checking_balance in {< 0 DM,1 - 200 DM,> 200 DM}:
## :...foreign_worker = no:
##     :...installment_plan in {none,stores}: no_apply (17/1)
##     :   installment_plan = bank:
##     :   :...residence_history <= 3: yes_apply (2)
##     :       residence_history > 3: no_apply (2)
##     foreign_worker = yes:
##     :...credit_history in {fully repaid,
##         :                  fully repaid this bank}: yes_apply (61/20)
##         credit_history in {critical,repaid,delayed}:
##         :...months_loan_duration <= 11: no_apply (76/13)
##             months_loan_duration > 11:
##             :...savings_balance = > 1000 DM: no_apply (13)
##                 savings_balance in {< 100 DM,101 - 500 DM,501 - 1000 DM,
##                 :                   unknown}:
##                 :...checking_balance = > 200 DM:
##                     :...dependents > 1: yes_apply (3)
##                     :   dependents <= 1:
##                     :   :...credit_history in {repaid,
##                     :       :                  delayed}: no_apply (23/3)
##                     :       credit_history = critical:
##                     :       :...amount <= 2337: yes_apply (3)
##                     :           amount > 2337: no_apply (6)
##                     checking_balance = < 0 DM:
##                     :...other_debtors = guarantor:
##                     :   :...credit_history = critical: yes_apply (1)
##                     :   :   credit_history in {repaid,
##                     :   :                      delayed}: no_apply (11/1)
##                     :   other_debtors in {none,co-applicant}:
##                     :   :...job = mangement self-employed: no_apply (26/6)
##                     :       job in {unskilled resident,skilled employee,
##                     :       :       unemployed non-resident}: [S1]
##                     checking_balance = 1 - 200 DM:
##                     :...savings_balance = unknown: no_apply (34/6)
##                         savings_balance in {< 100 DM,101 - 500 DM,
##                         :                   501 - 1000 DM}:
##                         :...months_loan_duration > 45: yes_apply (11/1)
##                             months_loan_duration <= 45:
##                             :...installment_plan = stores:
##                                 :...age <= 35: yes_apply (4)
##                                 :   age > 35: no_apply (2)
##                                 installment_plan = bank:
##                                 :...residence_history <= 1: no_apply (3)
##                                 :   residence_history > 1:
##                                 :   :...existing_credits <= 1: yes_apply (5)
##                                 :       existing_credits > 1: [S2]
##                                 installment_plan = none: [S3]
## 
## SubTree [S1]
## 
## purpose in {radio/tv,others,repairs,domestic appliances,
## :           retraining}: yes_apply (33/10)
## purpose = education:
## :...savings_balance in {< 100 DM,101 - 500 DM,501 - 1000 DM}: yes_apply (6)
## :   savings_balance = unknown: no_apply (2)
## purpose = business:
## :...job in {unskilled resident,unemployed non-resident}: no_apply (3)
## :   job = skilled employee: yes_apply (3)
## purpose = car (new):
## :...savings_balance = 101 - 500 DM: no_apply (1)
## :   savings_balance in {501 - 1000 DM,unknown}: yes_apply (4)
## :   savings_balance = < 100 DM:
## :   :...personal_status in {single male,female,
## :       :                   divorced male}: yes_apply (29/6)
## :       personal_status = married male: no_apply (2)
## purpose = car (used):
## :...amount > 6229: yes_apply (5)
## :   amount <= 6229:
## :   :...job = unskilled resident: yes_apply (1)
## :       job in {skilled employee,unemployed non-resident}: no_apply (8/1)
## purpose = furniture:
## :...months_loan_duration > 27: yes_apply (9/1)
##     months_loan_duration <= 27:
##     :...employment_length in {> 7 yrs,4 - 7 yrs}: no_apply (7/1)
##         employment_length = unemployed: yes_apply (2)
##         employment_length = 0 - 1 yrs:
##         :...job = unskilled resident: yes_apply (1)
##         :   job in {skilled employee,unemployed non-resident}: no_apply (4)
##         employment_length = 1 - 4 yrs:
##         :...property in {building society savings,
##             :            unknown/none}: no_apply (5)
##             property in {other,real estate}:
##             :...residence_history <= 2: no_apply (4/1)
##                 residence_history > 2: yes_apply (5)
## 
## SubTree [S2]
## 
## installment_rate > 2: yes_apply (3)
## installment_rate <= 2:
## :...other_debtors in {none,guarantor}: no_apply (3)
##     other_debtors = co-applicant: yes_apply (1)
## 
## SubTree [S3]
## 
## other_debtors = guarantor: no_apply (7/1)
## other_debtors = co-applicant: yes_apply (3/1)
## other_debtors = none:
## :...employment_length = 4 - 7 yrs:
##     :...age <= 41: no_apply (16)
##     :   age > 41: yes_apply (3/1)
##     employment_length in {> 7 yrs,1 - 4 yrs,0 - 1 yrs,unemployed}:
##     :...amount > 7980: yes_apply (7)
##         amount <= 7980:
##         :...amount > 4746: no_apply (10)
##             amount <= 4746:
##             :...housing = for free: no_apply (2)
##                 housing = rent:
##                 :...credit_history = critical: no_apply (1)
##                 :   credit_history in {repaid,delayed}: yes_apply (10/2)
##                 housing = own:
##                 :...savings_balance = 101 - 500 DM: no_apply (6)
##                     savings_balance in {< 100 DM,501 - 1000 DM}:
##                     :...residence_history <= 1: no_apply (8/1)
##                         residence_history > 1:
##                         :...installment_rate <= 1: no_apply (2)
##                             installment_rate > 1: [S4]
## 
## SubTree [S4]
## 
## employment_length in {> 7 yrs,unemployed}: no_apply (13/6)
## employment_length in {1 - 4 yrs,0 - 1 yrs}: yes_apply (10)
## 
## 
## Evaluation on training data (900 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##      57  127(14.1%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     590    42    (a): class no_apply
##      85   183    (b): class yes_apply
## 
## 
##  Attribute usage:
## 
##  100.00% checking_balance
##   60.22% foreign_worker
##   57.89% credit_history
##   51.11% months_loan_duration
##   42.67% savings_balance
##   30.44% other_debtors
##   17.78% job
##   15.56% installment_plan
##   14.89% purpose
##   12.89% employment_length
##   10.22% amount
##    6.78% residence_history
##    5.78% housing
##    3.89% dependents
##    3.56% installment_rate
##    3.44% personal_status
##    2.78% age
##    1.56% property
##    1.33% existing_credits
## 
## 
## Time: 0.0 secs

predict is a generic function for predictions from the results of various model fitting functions. The function invokes particular methods which depend on the class of the first argument.

Nôm na là predict(object, …) là một hàm dự đoán: object là mô hình đã được huấn luyện (ở đây là cây quyết định C5.0), phần sau là test set; predict dùng mô hình đó để dự đoán nhãn cho từng dòng trong test set.

# Predict the class of every test-set record with the fitted tree.
credit_pred <- predict(object = credit_mode, newdata = credit_test)

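This creates a vector of predicted class values, which we can compare to the actual class values using the CrossTable() function in the gmodels package. Here prop.r, prop.c and prop.t are all set to TRUE, so each cell shows its count followed by the row, column and table proportions, while prop.chisq = FALSE suppresses the chi-square contribution. (Setting the prop.c and prop.r parameters to FALSE would remove the column and row percentages from the table; the remaining percentage, prop.t, indicates the proportion of records in the cell out of the total number of records.)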

# Cross-tabulate actual vs. predicted classes on the test set, showing row,
# column and table proportions but not the chi-square contributions.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
CrossTable(
  credit_test$default, credit_pred,
  prop.r = TRUE, prop.c = TRUE, prop.t = TRUE, prop.chisq = FALSE,
  dnn = c("actual default", "predicted default")
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                | predicted default 
## actual default |  no_apply | yes_apply | Row Total | 
## ---------------|-----------|-----------|-----------|
##       no_apply |        54 |        14 |        68 | 
##                |     0.794 |     0.206 |     0.680 | 
##                |     0.831 |     0.400 |           | 
##                |     0.540 |     0.140 |           | 
## ---------------|-----------|-----------|-----------|
##      yes_apply |        11 |        21 |        32 | 
##                |     0.344 |     0.656 |     0.320 | 
##                |     0.169 |     0.600 |           | 
##                |     0.110 |     0.210 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |        65 |        35 |       100 | 
##                |     0.650 |     0.350 |           | 
## ---------------|-----------|-----------|-----------|
## 
##

Cải thiện thuật toán - boost process

# Refit the C5.0 model with boosting: trials = 10 requests 10 boosting
# iterations. Predictors and outcome are the same as for the single tree
# (column 17 is dropped from the predictors — presumably the `default`
# outcome column; verify against names(credit_train)).
credit_boost10 <- C5.0(credit_train[,-17], credit_train$default, trials = 10)
# Print a short description of the boosted model.
credit_boost10

## 
## Call:
## C5.0.default(x = credit_train[, -17], y = credit_train$default, trials = 10)
## 
## Classification Tree
## Number of samples: 900 
## Number of predictors: 20 
## 
## Number of boosting iterations: 10 
## Average tree size: 47.3 
## 
## Non-standard options: attempt to group attributes

#summary(credit_boost10)

so sánh kết quả sau khi boost

# Predict the test set with the boosted model and cross-tabulate actual vs.
# predicted classes. Row and column proportions are shown, chi-square
# contributions are not; prop.t is left at its default.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
credit_boost_pred10 <- predict(credit_boost10, credit_test)
CrossTable(
  credit_test$default, credit_boost_pred10,
  prop.r = TRUE, prop.c = TRUE, prop.chisq = FALSE,
  dnn = c("actual default", "predicted default")
)

## 
##  
##    Cell Contents
## |-------------------------|
## |                       N |
## |           N / Row Total |
## |           N / Col Total |
## |         N / Table Total |
## |-------------------------|
## 
##  
## Total Observations in Table:  100 
## 
##  
##                | predicted default 
## actual default |  no_apply | yes_apply | Row Total | 
## ---------------|-----------|-----------|-----------|
##       no_apply |        63 |         5 |        68 | 
##                |     0.926 |     0.074 |     0.680 | 
##                |     0.797 |     0.238 |           | 
##                |     0.630 |     0.050 |           | 
## ---------------|-----------|-----------|-----------|
##      yes_apply |        16 |        16 |        32 | 
##                |     0.500 |     0.500 |     0.320 | 
##                |     0.203 |     0.762 |           | 
##                |     0.160 |     0.160 |           | 
## ---------------|-----------|-----------|-----------|
##   Column Total |        79 |        21 |       100 | 
##                |     0.790 |     0.210 |           | 
## ---------------|-----------|-----------|-----------|
## 
##

# Report the internal storage type of the fitted model object.
typeof(credit_mode)

## [1] "list"

# Display the components stored inside the fitted model object.
str(credit_mode)

## List of 16
##  $ names       : chr "| Generated using R version 4.0.3 (2020-10-10)\n| on Thu Feb 04 03:35:18 2021\noutcome.\n\noutcome: no\\_apply,"| __truncated__
##  $ cost        : chr ""
##  $ costMatrix  : NULL
##  $ caseWeights : logi FALSE
##  $ control     :List of 11
##   ..$ subset         : logi TRUE
##   ..$ bands          : num 0
##   ..$ winnow         : logi FALSE
##   ..$ noGlobalPruning: logi FALSE
##   ..$ CF             : num 0.25
##   ..$ minCases       : num 2
##   ..$ fuzzyThreshold : logi FALSE
##   ..$ sample         : num 0
##   ..$ earlyStopping  : logi TRUE
##   ..$ label          : chr "outcome"
##   ..$ seed           : int 999
##  $ trials      : Named num [1:2] 1 1
##   ..- attr(*, "names")= chr [1:2] "Requested" "Actual"
##  $ rbm         : logi FALSE
##  $ boostResults: NULL
##  $ size        : int 57
##  $ dims        : int [1:2] 900 20
##  $ call        : language C5.0.default(x = credit_train[, -17], y = credit_train$default)
##  $ levels      : chr [1:2] "no_apply" "yes_apply"
##  $ output      : chr "\nC5.0 [Release 2.07 GPL Edition]  \tThu Feb  4 03:35:18 2021\n-------------------------------\n\nClass specifi"| __truncated__
##  $ tree        : chr "id=\"See5/C5.0 2.07 GPL Edition 2021-02-04\"\nentries=\"1\"\ntype=\"3\" class=\"no_apply\" freq=\"632,268\" att"| __truncated__
##  $ predictors  : chr [1:20] "checking_balance" "months_loan_duration" "credit_history" "purpose" ...
##  $ rules       : chr ""
##  - attr(*, "class")= chr "C5.0"

Credit analysis built a tree with C5.0 method

Hoang An

2/3/2021

header 1 = chunk

header 2