Tree-Based Methods

Gerry Alfa Dito1

Rahma Anisa2

Package

Silakan instal package berikut jika belum terpasang

# Install the CRAN packages used in this tutorial, but only the ones that are
# not present yet, so re-running the script does not reinstall everything.
cran_packages <- c(
  "tidyverse", "mlr3verse", "ranger", "rpart.plot", "RWeka", "precrec",
  "remotes" # required below for remotes::install_github()
)
# system.file() returns "" for a package that is not installed; unlike
# requireNamespace() it does not load the package.
is_missing <- !nzchar(
  vapply(cran_packages, function(pkg) system.file(package = pkg), character(1))
)
if (any(is_missing)) {
  install.packages(cran_packages[is_missing])
}

# mlr3extralearners is installed from its GitHub repository.
if (!nzchar(system.file(package = "mlr3extralearners"))) {
  remotes::install_github("mlr-org/mlr3extralearners")
}

Memanggil Package

library(tidyverse)
library(mlr3verse)
library(mlr3extralearners)

Import data ke R

# Load the German credit data; character columns are read in as factors.
data_credit <- read.csv(
  file = "german_credit.csv",
  stringsAsFactors = TRUE
)
# Preview the first rows.
head(data_credit)
##   checking_status duration                   credit_history             purpose
## 1            '<0'        6 'critical/other existing credit'            radio/tv
## 2      '0<=X<200'       48                  'existing paid'            radio/tv
## 3   'no checking'       12 'critical/other existing credit'           education
## 4            '<0'       42                  'existing paid' furniture/equipment
## 5            '<0'       24             'delayed previously'           'new car'
## 6   'no checking'       36                  'existing paid'           education
##   credit_amount     savings_status employment installment_commitment
## 1          1169 'no known savings'      '>=7'                      4
## 2          5951             '<100'   '1<=X<4'                      2
## 3          2096             '<100'   '4<=X<7'                      2
## 4          7882             '<100'   '4<=X<7'                      2
## 5          4870             '<100'   '1<=X<4'                      3
## 6          9055 'no known savings'   '1<=X<4'                      2
##        personal_status other_parties residence_since  property_magnitude age
## 1        'male single'          none               4       'real estate'  67
## 2 'female div/dep/mar'          none               2       'real estate'  22
## 3        'male single'          none               3       'real estate'  49
## 4        'male single'     guarantor               4    'life insurance'  45
## 5        'male single'          none               4 'no known property'  53
## 6        'male single'          none               4 'no known property'  35
##   other_payment_plans    housing existing_credits                  job
## 1                none        own                2              skilled
## 2                none        own                1              skilled
## 3                none        own                1 'unskilled resident'
## 4                none 'for free'                1              skilled
## 5                none 'for free'                2              skilled
## 6                none 'for free'                1 'unskilled resident'
##   num_dependents own_telephone foreign_worker class
## 1              1           yes            yes  good
## 2              1          none            yes   bad
## 3              2          none            yes  good
## 4              2          none            yes  good
## 5              2          none            yes   bad
## 6              2           yes            yes  good

mengubah tipe data integer ke numerik

# Convert every integer column of the credit data to double.
# mutate(across(where(...))) replaces the superseded scoped verb mutate_if().
data_credit <- data_credit %>%
  mutate(across(where(is.integer), as.numeric))

# Read the airfoil self-noise data and assign descriptive column names
# (read.table() is called with its default header = FALSE).
data_airfoil <- read.table("airfoil_self_noise.dat")
colnames(data_airfoil) <- c(
  "Frequency",
  "Angle_of_Attack",
  "Chord_length",
  "Free_stream_velocity",
  "Suction_side",
  "Scaled_sound"
)
# Preview the first rows.
head(data_airfoil)
##   Frequency Angle_of_Attack Chord_length Free_stream_velocity Suction_side
## 1       800               0       0.3048                 71.3   0.00266337
## 2      1000               0       0.3048                 71.3   0.00266337
## 3      1250               0       0.3048                 71.3   0.00266337
## 4      1600               0       0.3048                 71.3   0.00266337
## 5      2000               0       0.3048                 71.3   0.00266337
## 6      2500               0       0.3048                 71.3   0.00266337
##   Scaled_sound
## 1      126.201
## 2      125.201
## 3      125.951
## 4      127.591
## 5      127.461
## 6      125.571

Import data ke ekosistem mlr3

# Classification task on the credit data: the target is `class`, with "bad"
# declared as the positive class.
task_credit <- TaskClassif$new(
  id = "credit",
  backend = data_credit,
  target = "class",
  positive = "bad"
)

# Regression task on the airfoil data: the target is `Scaled_sound`.
task_airfoil <- TaskRegr$new(
  id = "airfoil",
  backend = data_airfoil,
  target = "Scaled_sound"
)

Menentukan model yang digunakan

  1. Bagging
# Bagging for classification: 100 rpart trees that predict probabilities,
# combined by the "classifavg" averaging operator.
model_bagging_classif <- pipeline_bagging(
  graph = lrn("classif.rpart", predict_type = "prob"),
  iterations = 100,
  averager = po("classifavg")
)

# Bagging for regression: 100 rpart trees combined by "regravg".
model_bagging_regr <- pipeline_bagging(
  graph = lrn("regr.rpart"),
  iterations = 100,
  averager = po("regravg")
)
  2. Random Forest
# Random forest (ranger) for classification, predicting probabilities and
# recording impurity-based variable importance.
model_rf_classif <- lrn(
  "classif.ranger",
  predict_type = "prob",
  importance = "impurity"
)

# Random forest (ranger) for regression, also recording impurity importance.
model_rf_regr <- lrn(
  "regr.ranger",
  importance = "impurity"
)
  3. AdaBoost
# Install whatever the AdaBoostM1 learner needs (mlr3extralearners helper).
install_learners("classif.AdaBoostM1")

# AdaBoost.M1 classifier, predicting probabilities.
model_ada_classif <- lrn("classif.AdaBoostM1", predict_type = "prob")

# Learners compared on the credit (classification) task.
learner_credit <- list(
  model_bagging_classif,
  model_rf_classif,
  model_ada_classif
)

# Learners compared on the airfoil (regression) task.
learner_airfoil <- list(
  model_bagging_regr,
  model_rf_regr
)

Menentukan cara pembagian data

# 10-fold cross-validation, shared by both benchmark designs.
resample_cv <- rsmp("cv", folds = 10)

Melakukan interpretasi model (jika diperlukan)

# Fit the random forest classifier on the whole credit task, then show the
# impurity importance stored in the fitted ranger model.
model_rf_classif$train(task_credit)
model_rf_classif$model$variable.importance
##                    age        checking_status          credit_amount 
##              31.445540              38.991956              43.563684 
##         credit_history               duration             employment 
##              19.245259              32.637067              14.073071 
##       existing_credits         foreign_worker                housing 
##               5.831078               1.538162               8.256197 
## installment_commitment                    job         num_dependents 
##              11.431669               7.932523               3.797136 
##          other_parties    other_payment_plans          own_telephone 
##               5.986914               9.693468               4.747957 
##        personal_status     property_magnitude                purpose 
##               8.864974              10.998645              17.537670 
##        residence_since         savings_status 
##              10.075722              14.481703
# Fit the random forest regressor on the whole airfoil task, then show the
# importance stored in the fitted ranger model.
model_rf_regr$train(task = task_airfoil)
model_rf_regr$model$variable.importance
##      Angle_of_Attack         Chord_length Free_stream_velocity 
##             7184.127             8450.686             2975.771 
##            Frequency         Suction_side 
##            29066.666            17613.132
# Put the classifier's variable importance into a two-column data frame.
# row.names = NULL stops the predictor names from also becoming row names.
importance_classif <- data.frame(
  Predictors = names(model_rf_classif$model$variable.importance),
  impurity = model_rf_classif$model$variable.importance,
  row.names = NULL
)

# Show predictors from most to least important.
importance_classif %>%
  arrange(desc(impurity))
##                Predictors  impurity
## 1           credit_amount 43.563684
## 2         checking_status 38.991956
## 3                duration 32.637067
## 4                     age 31.445540
## 5          credit_history 19.245259
## 6                 purpose 17.537670
## 7          savings_status 14.481703
## 8              employment 14.073071
## 9  installment_commitment 11.431669
## 10     property_magnitude 10.998645
## 11        residence_since 10.075722
## 12    other_payment_plans  9.693468
## 13        personal_status  8.864974
## 14                housing  8.256197
## 15                    job  7.932523
## 16          other_parties  5.986914
## 17       existing_credits  5.831078
## 18          own_telephone  4.747957
## 19         num_dependents  3.797136
## 20         foreign_worker  1.538162
# Put the regressor's variable importance into a two-column data frame.
# row.names = NULL stops the predictor names from also becoming row names.
importance_regr <- data.frame(
  Predictors = names(model_rf_regr$model$variable.importance),
  variance = model_rf_regr$model$variable.importance,
  row.names = NULL
)

# Show predictors from most to least important.
importance_regr %>%
  arrange(desc(variance))
##             Predictors  variance
## 1            Frequency 29066.666
## 2         Suction_side 17613.132
## 3         Chord_length  8450.686
## 4      Angle_of_Attack  7184.127
## 5 Free_stream_velocity  2975.771

Komparasi model

Komparasi model bisa dilakukan dengan menggunakan fungsi benchmark_grid dan benchmark. Fungsi benchmark_grid digunakan untuk memasukkan informasi-informasi yang dibutuhkan untuk komparasi, seperti data yang digunakan (tasks), model yang ingin dikomparasi (learners), dan metode pembagian data yang digunakan (resamplings).

# Benchmark design for classification: every credit learner is evaluated on
# the credit task with the shared cross-validation resampling.
design_classif <- benchmark_grid(
  tasks = task_credit,
  learners = learner_credit,
  resamplings = resample_cv
)

# Benchmark design for regression: every airfoil learner on the airfoil task.
design_regr <- benchmark_grid(
  tasks = task_airfoil,
  learners = learner_airfoil,
  resamplings = resample_cv
)

# Run both benchmarks. `bmr_classif` and `bmr_regr` are read by the
# $aggregate() calls in the results section, but the visible script never
# created them, so those calls would fail with "object not found".
bmr_classif <- benchmark(design_classif)
bmr_regr <- benchmark(design_regr)

Kemudian fungsi benchmark digunakan untuk menjalankan/ running komparasi model berdasarkan desain yang sudah dirancang.

Karena terdapat 5 model (3 model klasifikasi dan 2 model regresi) dan masing-masing model menjalankan 10-folds cross-validation, maka iterasi yang dilakukan ada sebanyak 50 kali.

Hasil Komparasi model

Hasil komparasi model dapat berupa nilai-nilai ukuran kebaikan model yang ditentukan oleh pengguna.

# Aggregate accuracy and AUC over the resampling iterations of each learner.
result_classif <- bmr_classif$aggregate(
  msrs(c("classif.acc", "classif.auc"))
)
result_classif
##    nr      resample_result task_id
## 1:  1 <ResampleResult[21]>  credit
## 2:  2 <ResampleResult[21]>  credit
## 3:  3 <ResampleResult[21]>  credit
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              learner_id
## 1: subsample_1.subsample_2.subsample_3.subsample_4.subsample_5.subsample_6.subsample_7.subsample_8.subsample_9.subsample_10.subsample_11.subsample_12.subsample_13.subsample_14.subsample_15.subsample_16.subsample_17.subsample_18.subsample_19.subsample_20.subsample_21.subsample_22.subsample_23.subsample_24.subsample_25.subsample_26.subsample_27.subsample_28.subsample_29.subsample_30.subsample_31.subsample_32.subsample_33.subsample_34.subsample_35.subsample_36.subsample_37.subsample_38.subsample_39.subsample_40.subsample_41.subsample_42.subsample_43.subsample_44.subsample_45.subsample_46.subsample_47.subsample_48.subsample_49.subsample_50.subsample_51.subsample_52.subsample_53.subsample_54.subsample_55.subsample_56.subsample_57.subsample_58.subsample_59.subsample_60.subsample_61.subsample_62.subsample_63.subsample_64.subsample_65.subsample_66.subsample_67.subsample_68.subsample_69.subsample_70.subsample_71.subsample_72.subsample_73.subsample_74.subsample_75.subsample_76.subsample_77.subsample_78.subsample_79.subsample_80.subsample_81.subsample_82.subsample_83.subsample_84.subsample_85.subsample_86.subsample_87.subsample_88.subsample_89.subsample_90.subsample_91.subsample_92.subsample_93.subsample_94.subsample_95.subsample_96.subsample_97.subsample_98.subsample_99.subsample_100.classif.rpart_1.classif.rpart_2.classif.rpart_3.classif.rpart_4.classif.rpart_5.classif.rpart_6.classif.rpart_7.classif.rpart_8.classif.rpart_9.classif.rpart_10.classif.rpart_11.classif.rpart_12.classif.rpart_13.classif.rpart_14.classif.rpart_15.classif.rpart_16.classif.rpart_17.classif.rpart_18.classif.rpart_19.classif.rpart_20.classif.rpart_21.classif.rpart_22.classif.rpart_23.classif.rpart_24.classif.rpart_25.classif.rpart_26.classif.rpart_27.classif.rpart_28.classif.rpart_29.classif.rpart_30.classif.rpart_31.classif.rpart_32.classif.rpart_33.classif.rpart_34.classif.rpart_35.classif.rpart_36.classif.rpart_37.classif.rpart_38.classif.rpart_39.classif.rpart_40.classif.rpart_41.classif.rpart_
42.classif.rpart_43.classif.rpart_44.classif.rpart_45.classif.rpart_46.classif.rpart_47.classif.rpart_48.classif.rpart_49.classif.rpart_50.classif.rpart_51.classif.rpart_52.classif.rpart_53.classif.rpart_54.classif.rpart_55.classif.rpart_56.classif.rpart_57.classif.rpart_58.classif.rpart_59.classif.rpart_60.classif.rpart_61.classif.rpart_62.classif.rpart_63.classif.rpart_64.classif.rpart_65.classif.rpart_66.classif.rpart_67.classif.rpart_68.classif.rpart_69.classif.rpart_70.classif.rpart_71.classif.rpart_72.classif.rpart_73.classif.rpart_74.classif.rpart_75.classif.rpart_76.classif.rpart_77.classif.rpart_78.classif.rpart_79.classif.rpart_80.classif.rpart_81.classif.rpart_82.classif.rpart_83.classif.rpart_84.classif.rpart_85.classif.rpart_86.classif.rpart_87.classif.rpart_88.classif.rpart_89.classif.rpart_90.classif.rpart_91.classif.rpart_92.classif.rpart_93.classif.rpart_94.classif.rpart_95.classif.rpart_96.classif.rpart_97.classif.rpart_98.classif.rpart_99.classif.rpart_100.classifavg
## 2:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          classif.ranger
## 3:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      classif.AdaBoostM1
##    resampling_id iters classif.acc classif.auc
## 1:            cv    10       0.757   0.7694201
## 2:            cv    10       0.757   0.7919533
## 3:            cv    10       0.706   0.7257064

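Berdasarkan nilai akurasi, model bagging dan random forest memiliki performa prediksi yang sama baiknya (akurasi 0,757), sedangkan AdaBoost memiliki akurasi terendah (0,706). Jika dilihat dari nilai AUC, model dengan performa terbaik adalah random forest (AUC 0,792).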

# Aggregate the "regr.mse" and "regr.srho" measures over the resampling
# iterations of each learner.
result_regr <- bmr_regr$aggregate(
  msrs(c("regr.mse", "regr.srho"))
)
result_regr
##    nr      resample_result task_id
## 1:  1 <ResampleResult[21]> airfoil
## 2:  2 <ResampleResult[21]> airfoil
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               learner_id
## 1: subsample_1.subsample_2.subsample_3.subsample_4.subsample_5.subsample_6.subsample_7.subsample_8.subsample_9.subsample_10.subsample_11.subsample_12.subsample_13.subsample_14.subsample_15.subsample_16.subsample_17.subsample_18.subsample_19.subsample_20.subsample_21.subsample_22.subsample_23.subsample_24.subsample_25.subsample_26.subsample_27.subsample_28.subsample_29.subsample_30.subsample_31.subsample_32.subsample_33.subsample_34.subsample_35.subsample_36.subsample_37.subsample_38.subsample_39.subsample_40.subsample_41.subsample_42.subsample_43.subsample_44.subsample_45.subsample_46.subsample_47.subsample_48.subsample_49.subsample_50.subsample_51.subsample_52.subsample_53.subsample_54.subsample_55.subsample_56.subsample_57.subsample_58.subsample_59.subsample_60.subsample_61.subsample_62.subsample_63.subsample_64.subsample_65.subsample_66.subsample_67.subsample_68.subsample_69.subsample_70.subsample_71.subsample_72.subsample_73.subsample_74.subsample_75.subsample_76.subsample_77.subsample_78.subsample_79.subsample_80.subsample_81.subsample_82.subsample_83.subsample_84.subsample_85.subsample_86.subsample_87.subsample_88.subsample_89.subsample_90.subsample_91.subsample_92.subsample_93.subsample_94.subsample_95.subsample_96.subsample_97.subsample_98.subsample_99.subsample_100.regr.rpart_1.regr.rpart_2.regr.rpart_3.regr.rpart_4.regr.rpart_5.regr.rpart_6.regr.rpart_7.regr.rpart_8.regr.rpart_9.regr.rpart_10.regr.rpart_11.regr.rpart_12.regr.rpart_13.regr.rpart_14.regr.rpart_15.regr.rpart_16.regr.rpart_17.regr.rpart_18.regr.rpart_19.regr.rpart_20.regr.rpart_21.regr.rpart_22.regr.rpart_23.regr.rpart_24.regr.rpart_25.regr.rpart_26.regr.rpart_27.regr.rpart_28.regr.rpart_29.regr.rpart_30.regr.rpart_31.regr.rpart_32.regr.rpart_33.regr.rpart_34.regr.rpart_35.regr.rpart_36.regr.rpart_37.regr.rpart_38.regr.rpart_39.regr.rpart_40.regr.rpart_41.regr.rpart_42.regr.rpart_43.regr.rpart_44.regr.rpart_45.regr.rpart_46.regr.rpart_47.regr.rpart_48.regr.rpart_49.regr.rpart_50.regr.rpart_
51.regr.rpart_52.regr.rpart_53.regr.rpart_54.regr.rpart_55.regr.rpart_56.regr.rpart_57.regr.rpart_58.regr.rpart_59.regr.rpart_60.regr.rpart_61.regr.rpart_62.regr.rpart_63.regr.rpart_64.regr.rpart_65.regr.rpart_66.regr.rpart_67.regr.rpart_68.regr.rpart_69.regr.rpart_70.regr.rpart_71.regr.rpart_72.regr.rpart_73.regr.rpart_74.regr.rpart_75.regr.rpart_76.regr.rpart_77.regr.rpart_78.regr.rpart_79.regr.rpart_80.regr.rpart_81.regr.rpart_82.regr.rpart_83.regr.rpart_84.regr.rpart_85.regr.rpart_86.regr.rpart_87.regr.rpart_88.regr.rpart_89.regr.rpart_90.regr.rpart_91.regr.rpart_92.regr.rpart_93.regr.rpart_94.regr.rpart_95.regr.rpart_96.regr.rpart_97.regr.rpart_98.regr.rpart_99.regr.rpart_100.regravg
## 2:                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                              regr.ranger
##    resampling_id iters  regr.mse regr.srho
## 1:            cv    10 14.488646 0.8169416
## 2:            cv    10  4.802364 0.9478668