1 Decision Tree For Classification

1.1 Pre-Processing

Dalam pembelajaran klasifikasi, kita akan menggunakan data dari library palmerpenguins yang telah tersedia di R.

1.1.1 View Data

# Attach the palmerpenguins package and preview the first rows of `penguins`.
library(palmerpenguins)
head(penguins)
## # A tibble: 6 × 8
##   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
## 1 Adelie  Torgersen           39.1          18.7               181        3750
## 2 Adelie  Torgersen           39.5          17.4               186        3800
## 3 Adelie  Torgersen           40.3          18                 195        3250
## 4 Adelie  Torgersen           NA            NA                  NA          NA
## 5 Adelie  Torgersen           36.7          19.3               193        3450
## 6 Adelie  Torgersen           39.3          20.6               190        3650
## # ℹ 2 more variables: sex <fct>, year <int>

1.1.2 Data Type

# Show the structure of `penguins`: one line per column with its type and first values.
str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
##  $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...

1.1.3 Summary Data

# Per-column summary statistics; the NA's rows show how many values are missing.
summary(penguins)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :152   Biscoe   :168   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :124   1st Qu.:39.23   1st Qu.:15.60  
##  Gentoo   :124   Torgersen: 52   Median :44.45   Median :17.30  
##                                  Mean   :43.92   Mean   :17.15  
##                                  3rd Qu.:48.50   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##                                  NA's   :2       NA's   :2      
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172.0     Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190.0     1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197.0     Median :4050   NA's  : 11   Median :2008  
##  Mean   :200.9     Mean   :4202                Mean   :2008  
##  3rd Qu.:213.0     3rd Qu.:4750                3rd Qu.:2009  
##  Max.   :231.0     Max.   :6300                Max.   :2009  
##  NA's   :2         NA's   :2

1.1.4 Missing Value

# Count every missing cell in `penguins` (each TRUE from is.na() adds 1).
sum(is.na(penguins))
## [1] 19

Data memiliki missing value, sehingga perlu dilakukan imputasi/pembersihan missing value tersebut.

# Keep only the rows that have no missing value, then display the result.
pg <- na.omit(penguins)
print(pg)
## # A tibble: 333 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           36.7          19.3               193        3450
##  5 Adelie  Torgersen           39.3          20.6               190        3650
##  6 Adelie  Torgersen           38.9          17.8               181        3625
##  7 Adelie  Torgersen           39.2          19.6               195        4675
##  8 Adelie  Torgersen           41.1          17.6               182        3200
##  9 Adelie  Torgersen           38.6          21.2               191        3800
## 10 Adelie  Torgersen           34.6          21.1               198        4400
## # ℹ 323 more rows
## # ℹ 2 more variables: sex <fct>, year <int>

1.1.5 Duplicated Data

# Show the rows of `pg` that exactly repeat an earlier row; zero rows means no duplicates.
pg[duplicated(pg), ]
## # A tibble: 0 × 8
## # ℹ 8 variables: species <fct>, island <fct>, bill_length_mm <dbl>,
## #   bill_depth_mm <dbl>, flipper_length_mm <int>, body_mass_g <int>, sex <fct>,
## #   year <int>

Dataset tidak memiliki data duplikat.

1.2 Exploratory Data

# Exploratory plot: pairwise plot matrix over all columns of `pg`.
library(GGally)
ggpairs(pg)

1.3 Training & Testing

Membagi data menjadi data training & testing. Data training berfungsi untuk melatih algoritma model, sedangkan data pengujian untuk memvalidasi/mengevaluasi hasil pemodelan yang dibangun. Misalkan, digunakan proporsi training 70% dan testing 30%.

# Drop the `year` column before modelling.
library(dplyr)
pg2 <- select(pg, -year)
print(pg2)
## # A tibble: 333 × 7
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           36.7          19.3               193        3450
##  5 Adelie  Torgersen           39.3          20.6               190        3650
##  6 Adelie  Torgersen           38.9          17.8               181        3625
##  7 Adelie  Torgersen           39.2          19.6               195        4675
##  8 Adelie  Torgersen           41.1          17.6               182        3200
##  9 Adelie  Torgersen           38.6          21.2               191        3800
## 10 Adelie  Torgersen           34.6          21.1               198        4400
## # ℹ 323 more rows
## # ℹ 1 more variable: sex <fct>
# Split the rows 70/30 into training and test sets, using the species column
# as the label vector for sample.split(); the seed makes the split repeatable.
library(caTools)
set.seed(9)
split <- sample.split(Y = pg2$species, SplitRatio = 0.7)
# `split` is a logical mask with one entry per row of `pg2`.
train <- pg2[split, ]
test <- pg2[!split, ]
dim(train)
## [1] 233   7
dim(test)
## [1] 100   7

1.4 Build Model Tree

1.4.1 Fit Model

# Fit a tree that predicts `species` from every other column of `train`.
library(rpart)
model1 <- rpart(formula = species ~ ., data = train)

1.4.2 Observed Response Stored in the Model (model1$y)

# NOTE: `model1$y` is the response stored in the fitted object (the observed
# species of the training rows as integer level codes), not model predictions.
# The printed counts (102 / 48 / 83 for codes 1 / 2 / 3) equal the Reference
# column totals of the training confusion matrix, not the Prediction totals.
model1$y
##   [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [223] 2 2 2 2 2 2 2 2 2 2 2

1.4.3 Tree Plot

# Draw the fitted classification tree with automatic box colours.
library(rpart.plot)
rpart.plot(x = model1, box.palette = "auto", cex = 0.8)

1.5 Evaluating Model

1.5.1 For Train Data

# Confusion matrix of the tree on the rows it was trained on.
library(caret)
pred_train <- predict(model1, newdata = train, type = "class")
# `train$species` is already a factor, so it is passed as the reference directly.
confusionMatrix(data = pred_train, reference = train$species)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        98         3      0
##   Chinstrap      4        41      1
##   Gentoo         0         4     82
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9485          
##                  95% CI : (0.9118, 0.9731)
##     No Information Rate : 0.4378          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9193          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 0.9608           0.8542        0.9880
## Specificity                 0.9771           0.9730        0.9733
## Pos Pred Value              0.9703           0.8913        0.9535
## Neg Pred Value              0.9697           0.9626        0.9932
## Prevalence                  0.4378           0.2060        0.3562
## Detection Rate              0.4206           0.1760        0.3519
## Detection Prevalence        0.4335           0.1974        0.3691
## Balanced Accuracy           0.9689           0.9136        0.9806

Hasil model tree untuk data training sudah baik dalam mengklasifikasikan kelas penguin.

1.5.2 For Test Data

# Confusion matrix of the tree on the held-out test rows.
pred_test <- predict(model1, newdata = test, type = "class")
# `test$species` is already a factor, so it is passed as the reference directly.
confusionMatrix(data = pred_test, reference = test$species)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        40         1      0
##   Chinstrap      2        18      0
##   Gentoo         2         1     36
## 
## Overall Statistics
##                                          
##                Accuracy : 0.94           
##                  95% CI : (0.874, 0.9777)
##     No Information Rate : 0.44           
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.9061         
##                                          
##  Mcnemar's Test P-Value : 0.343          
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 0.9091           0.9000        1.0000
## Specificity                 0.9821           0.9750        0.9531
## Pos Pred Value              0.9756           0.9000        0.9231
## Neg Pred Value              0.9322           0.9750        1.0000
## Prevalence                  0.4400           0.2000        0.3600
## Detection Rate              0.4000           0.1800        0.3600
## Detection Prevalence        0.4100           0.2000        0.3900
## Balanced Accuracy           0.9456           0.9375        0.9766

Hasil model tree untuk data testing sudah baik dalam mengklasifikasikan kelas penguin.

1.6 Importance Variable

# Variable importance of the classification tree: raw scores first, then a plot.
library(vip)
model1[["variable.importance"]]
## flipper_length_mm    bill_length_mm     bill_depth_mm       body_mass_g 
##          88.68215          80.88095          65.62114          60.16413 
##            island 
##          50.70759
# Plot the five highest-ranked features.
var_importance <- vip(object = model1, num_features = 5L)
print(var_importance)

1.7 Control Parameters of Tree Model

Pemodelan Decision Tree menggunakan library(rpart) memiliki beberapa parameter yang dapat dikontrol, di antaranya:

1.minsplit : jumlah minimum pengamatan yang harus ada dalam sebuah node agar pemisahan (split) dapat dicoba.

2.minbucket : jumlah minimum pengamatan dalam node leaf.

3.xval : jumlah cross-validation.

4.maxdepth : kedalaman pohon (jumlah keputusan dari atas ke bawah)

# Open the documentation of the tuning parameters (interactive use).
help("rpart.control")

# Refit the classification tree with explicit control parameters.
# The `control` argument is documented as a list built by rpart.control(),
# so the settings are passed through that constructor instead of a bare
# named vector created with c().
#   cp        : complexity parameter
#   xval      : number of cross-validations
#   minsplit  : minimum observations in a node before a split is attempted
#   minbucket : minimum observations in a leaf node
#   maxdepth  : maximum depth of the tree
model2 <- rpart(
  species ~ .,
  data = train,
  control = rpart.control(
    cp = 0.005,
    xval = 5,
    minsplit = 10,
    minbucket = 5,
    maxdepth = 3
  )
)
rpart.plot(model2)

# Confusion matrix of the tuned tree on the training rows.
confusionMatrix(
  data = predict(model2, newdata = train, type = "class"),
  reference = train$species
)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        98         3      0
##   Chinstrap      4        45      2
##   Gentoo         0         0     81
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9614          
##                  95% CI : (0.9279, 0.9822)
##     No Information Rate : 0.4378          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9398          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 0.9608           0.9375        0.9759
## Specificity                 0.9771           0.9676        1.0000
## Pos Pred Value              0.9703           0.8824        1.0000
## Neg Pred Value              0.9697           0.9835        0.9868
## Prevalence                  0.4378           0.2060        0.3562
## Detection Rate              0.4206           0.1931        0.3476
## Detection Prevalence        0.4335           0.2189        0.3476
## Balanced Accuracy           0.9689           0.9525        0.9880
# Confusion matrix of the tuned tree on the held-out test rows.
confusionMatrix(
  data = predict(model2, newdata = test, type = "class"),
  reference = test$species
)
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        40         1      0
##   Chinstrap      4        19      1
##   Gentoo         0         0     35
## 
## Overall Statistics
##                                          
##                Accuracy : 0.94           
##                  95% CI : (0.874, 0.9777)
##     No Information Rate : 0.44           
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.9071         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 0.9091           0.9500        0.9722
## Specificity                 0.9821           0.9375        1.0000
## Pos Pred Value              0.9756           0.7917        1.0000
## Neg Pred Value              0.9322           0.9868        0.9846
## Prevalence                  0.4400           0.2000        0.3600
## Detection Rate              0.4000           0.1900        0.3500
## Detection Prevalence        0.4100           0.2400        0.3500
## Balanced Accuracy           0.9456           0.9437        0.9861

2 Decision Tree for Regression

2.1 Pre-Processing

Dalam pembelajaran regresi, kita akan menggunakan data Boston dari library(MASS) yang telah tersedia di R.

2.1.1 View Data

# Attach MASS, which provides the `Boston` data frame, and preview its first rows.
library(MASS)
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

2.1.2 Data Type

# Show the structure of `Boston`: one line per column with its type and first values.
str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

2.1.3 Summary Data

# Per-column summary statistics of `Boston`.
summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00

2.1.4 Missing Value

# Count every missing cell in `Boston` (each TRUE from is.na() adds 1).
sum(is.na(Boston))
## [1] 0

Data tidak memiliki missing value.

2.1.5 Duplicated Data

# Look for fully duplicated rows in `Boston`, the data set used in this
# section (checking the penguins tibble `pg` here says nothing about Boston).
Boston[duplicated(Boston), ]
## NOTE: re-knit to refresh this output; the result printed here before
## belonged to the penguins tibble `pg`.

Dataset tidak memiliki data duplikat.

2.2 Exploratory Data

# Exploratory plot: pairwise plot matrix over all columns of `Boston`.
library(GGally)
ggpairs(Boston)

2.3 Training & Testing

Membagi data menjadi data training & testing. Misalkan, digunakan proporsi training 70% dan testing 30%.

# Keep only the predictors used for the regression tree plus the response `medv`.
library(dplyr)
boston2 <- Boston[
  ,
  c("crim", "indus", "nox", "rm", "ptratio", "black", "lstat", "medv")
]
head(boston2)
##      crim indus   nox    rm ptratio  black lstat medv
## 1 0.00632  2.31 0.538 6.575    15.3 396.90  4.98 24.0
## 2 0.02731  7.07 0.469 6.421    17.8 396.90  9.14 21.6
## 3 0.02729  7.07 0.469 7.185    17.8 392.83  4.03 34.7
## 4 0.03237  2.18 0.458 6.998    18.7 394.63  2.94 33.4
## 5 0.06905  2.18 0.458 7.147    18.7 396.90  5.33 36.2
## 6 0.02985  2.18 0.458 6.430    18.7 394.12  5.21 28.7
# Split the Boston rows 70/30 into training and test sets.
set.seed(9)
# sample.split() needs a vector with one entry per row, so the response column
# is passed; a whole data frame has length ncol(), which yields a mask with
# one entry per column instead of per row.
split_bs <- sample.split(boston2$medv, SplitRatio = 0.7)
# Subset with `split_bs`. The object `split` is the mask built for the
# penguins data in section 1.3 and has the wrong length for `boston2`.
train_bs <- subset(boston2, split_bs == TRUE)
test_bs <- subset(boston2, split_bs == FALSE)
dim(train_bs)
dim(test_bs)
## NOTE: re-knit to refresh the dimensions and every result printed further
## below; the values shown there were produced with the penguins mask `split`.

2.4 Build Model Tree

2.4.1 Fit Model

# Fit a regression tree that predicts `medv` from every other column of `train_bs`.
library(rpart)
tree_reg <- rpart(formula = medv ~ ., data = train_bs)

2.4.2 Tree Plot

# Draw the fitted regression tree with automatic box colours.
rpart.plot(x = tree_reg, box.palette = "auto", cex = 0.8)

2.5 Evaluating Model

2.5.1 For Train Data

# Error metrics (ME, RMSE, MAE, MPE, MAPE) of the regression tree on the training rows.
library(forecast)
pred_train_reg <- predict(tree_reg, newdata = train_bs)
accuracy(pred_train_reg, train_bs$medv)
##                     ME    RMSE      MAE       MPE     MAPE
## Test set -1.446082e-15 3.58444 2.633462 -3.312994 13.86545

2.5.2 For Test Data

# Error metrics of the regression tree on the held-out test rows.
pred_test_reg <- predict(tree_reg, newdata = test_bs)
accuracy(pred_test_reg, test_bs$medv)
##                 ME     RMSE      MAE      MPE     MAPE
## Test set 0.4088309 5.568173 3.432767 -2.45946 15.54483

Hasil model tree untuk data testing memiliki performa yang tidak jauh berbeda dengan model training.

2.6 Plot Y pred & actual

# Scatter plot of the test rows: actual `medv` on the x-axis, predicted value on the y-axis.
plot(test_bs$medv, pred_test_reg)

2.7 Importance Variable

# Variable importance of the regression tree: raw scores first, then a plot.
library(vip)
tree_reg[["variable.importance"]]
##        rm     lstat   ptratio     indus       nox      crim     black 
## 15486.855 10971.740  5334.476  4644.154  3770.401  3748.424  1779.898
# Plot all seven predictors.
var_importance_reg <- vip(object = tree_reg, num_features = 7L)
print(var_importance_reg)