1 Random Forest for Classification

1.1 Pre-Processing

For the classification example, we will use the penguins data from the palmerpenguins library, which is available in R.

1.1.1 View Data

library(palmerpenguins)
head(penguins)
## # A tibble: 6 × 8
##   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
## 1 Adelie  Torgersen           39.1          18.7               181        3750
## 2 Adelie  Torgersen           39.5          17.4               186        3800
## 3 Adelie  Torgersen           40.3          18                 195        3250
## 4 Adelie  Torgersen           NA            NA                  NA          NA
## 5 Adelie  Torgersen           36.7          19.3               193        3450
## 6 Adelie  Torgersen           39.3          20.6               190        3650
## # ℹ 2 more variables: sex <fct>, year <int>

1.1.2 Data Type

str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
##  $ species          : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ island           : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
##  $ bill_length_mm   : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
##  $ bill_depth_mm    : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
##  $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
##  $ body_mass_g      : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
##  $ sex              : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
##  $ year             : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...

1.1.3 Summary Data

summary(penguins)
##       species          island    bill_length_mm  bill_depth_mm  
##  Adelie   :152   Biscoe   :168   Min.   :32.10   Min.   :13.10  
##  Chinstrap: 68   Dream    :124   1st Qu.:39.23   1st Qu.:15.60  
##  Gentoo   :124   Torgersen: 52   Median :44.45   Median :17.30  
##                                  Mean   :43.92   Mean   :17.15  
##                                  3rd Qu.:48.50   3rd Qu.:18.70  
##                                  Max.   :59.60   Max.   :21.50  
##                                  NA's   :2       NA's   :2      
##  flipper_length_mm  body_mass_g       sex           year     
##  Min.   :172.0     Min.   :2700   female:165   Min.   :2007  
##  1st Qu.:190.0     1st Qu.:3550   male  :168   1st Qu.:2007  
##  Median :197.0     Median :4050   NA's  : 11   Median :2008  
##  Mean   :200.9     Mean   :4202                Mean   :2008  
##  3rd Qu.:213.0     3rd Qu.:4750                3rd Qu.:2009  
##  Max.   :231.0     Max.   :6300                Max.   :2009  
##  NA's   :2         NA's   :2

1.1.4 Missing Values

length(which(is.na(penguins)))
## [1] 19

The data contain missing values, so they need to be imputed or removed; here the incomplete rows are simply dropped with na.omit().
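
Before dropping anything, a per-column count shows where the missing values occur; a minimal base-R sketch:

# Count missing values per column of the penguins data
colSums(is.na(penguins))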

pg <- na.omit(penguins)
pg
## # A tibble: 333 × 8
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           36.7          19.3               193        3450
##  5 Adelie  Torgersen           39.3          20.6               190        3650
##  6 Adelie  Torgersen           38.9          17.8               181        3625
##  7 Adelie  Torgersen           39.2          19.6               195        4675
##  8 Adelie  Torgersen           41.1          17.6               182        3200
##  9 Adelie  Torgersen           38.6          21.2               191        3800
## 10 Adelie  Torgersen           34.6          21.1               198        4400
## # ℹ 323 more rows
## # ℹ 2 more variables: sex <fct>, year <int>

1.1.5 Duplicated Data

pg[duplicated(pg), ]
## # A tibble: 0 × 8
## # ℹ 8 variables: species <fct>, island <fct>, bill_length_mm <dbl>,
## #   bill_depth_mm <dbl>, flipper_length_mm <int>, body_mass_g <int>, sex <fct>,
## #   year <int>

The dataset contains no duplicate rows.

1.2 Exploratory Data Analysis

library(GGally)
ggpairs(pg)
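
The full pairs plot can be dense; one option is to colour the panels by species and restrict them to the numeric measurements. A minimal sketch (the column selection is illustrative):

library(GGally)
library(ggplot2)

# Pairs plot of the numeric measurements, coloured by species
ggpairs(pg,
        columns = c("bill_length_mm", "bill_depth_mm",
                    "flipper_length_mm", "body_mass_g"),
        mapping = aes(colour = species))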

1.3 Training & Testing

Split the data into training and testing sets. The training data are used to fit the model, while the testing data are used to validate and evaluate the fitted model. Here we use a 70% training and 30% testing split.

# drop the year variable
library(dplyr)
pg2 <- pg %>% select(-year)
pg2
## # A tibble: 333 × 7
##    species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
##    <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
##  1 Adelie  Torgersen           39.1          18.7               181        3750
##  2 Adelie  Torgersen           39.5          17.4               186        3800
##  3 Adelie  Torgersen           40.3          18                 195        3250
##  4 Adelie  Torgersen           36.7          19.3               193        3450
##  5 Adelie  Torgersen           39.3          20.6               190        3650
##  6 Adelie  Torgersen           38.9          17.8               181        3625
##  7 Adelie  Torgersen           39.2          19.6               195        4675
##  8 Adelie  Torgersen           41.1          17.6               182        3200
##  9 Adelie  Torgersen           38.6          21.2               191        3800
## 10 Adelie  Torgersen           34.6          21.1               198        4400
## # ℹ 323 more rows
## # ℹ 1 more variable: sex <fct>
library(caTools)
set.seed(9)
split <- sample.split(pg2$species, SplitRatio = 0.7)
train <- subset(pg2, split == TRUE)
test <- subset(pg2, split == FALSE)
dim(train)
## [1] 233   7
dim(test)
## [1] 100   7
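
Because sample.split() stratifies on the label passed to it, the species proportions in the training and testing sets should roughly match the full data. A quick check:

# Compare class proportions across the full, training, and testing sets
round(prop.table(table(pg2$species)), 3)
round(prop.table(table(train$species)), 3)
round(prop.table(table(test$species)), 3)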

1.4 Build the Random Forest Model

1.4.1 Fit Model

library(randomForest)
train$species <- as.factor(train$species)  # species is already a factor; kept as a safeguard
model_rf1 <- randomForest(species ~ ., data = train, ntree = 50)

1.4.2 Stored Response (y)

model_rf1$y
##         1         2         3         4         5         6         7         8 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##         9        10        11        12        13        14        15        16 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        17        18        19        20        21        22        23        24 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        25        26        27        28        29        30        31        32 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        33        34        35        36        37        38        39        40 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        41        42        43        44        45        46        47        48 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        49        50        51        52        53        54        55        56 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        57        58        59        60        61        62        63        64 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        65        66        67        68        69        70        71        72 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        73        74        75        76        77        78        79        80 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        81        82        83        84        85        86        87        88 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        89        90        91        92        93        94        95        96 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie 
##        97        98        99       100       101       102       103       104 
##    Adelie    Adelie    Adelie    Adelie    Adelie    Adelie    Gentoo    Gentoo 
##       105       106       107       108       109       110       111       112 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       113       114       115       116       117       118       119       120 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       121       122       123       124       125       126       127       128 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       129       130       131       132       133       134       135       136 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       137       138       139       140       141       142       143       144 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       145       146       147       148       149       150       151       152 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       153       154       155       156       157       158       159       160 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       161       162       163       164       165       166       167       168 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       169       170       171       172       173       174       175       176 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       177       178       179       180       181       182       183       184 
##    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo    Gentoo 
##       185       186       187       188       189       190       191       192 
##    Gentoo Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap 
##       193       194       195       196       197       198       199       200 
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap 
##       201       202       203       204       205       206       207       208 
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap 
##       209       210       211       212       213       214       215       216 
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap 
##       217       218       219       220       221       222       223       224 
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap 
##       225       226       227       228       229       230       231       232 
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap 
##       233 
## Chinstrap 
## Levels: Adelie Chinstrap Gentoo
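
Note that model_rf1$y stores the actual training labels, not predictions; the out-of-bag (OOB) predictions are kept in model_rf1$predicted. A minimal sketch that cross-tabulates the two:

# OOB predictions vs. actual training labels (an OOB confusion matrix)
table(Predicted = model_rf1$predicted, Actual = model_rf1$y)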

1.4.3 Random Forest Plot

plot(model_rf1) 

The error produced by model_rf1 decreases as the number of trees (ntree) in the Random Forest model grows.
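
The numbers behind this plot are stored in model_rf1$err.rate, one row per tree, with the overall OOB error in the first column and per-class errors in the others. A sketch that inspects them and re-draws the plot with a legend:

# OOB and per-class error rates after each additional tree
head(model_rf1$err.rate)

# Re-draw the error curves with a legend identifying each line
plot(model_rf1)
legend("topright", legend = colnames(model_rf1$err.rate),
       col = 1:ncol(model_rf1$err.rate), lty = 1:ncol(model_rf1$err.rate))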

1.5 Evaluating the Model

1.5.1 For Train Data

library("caret")
pred_train<-predict(model_rf1,train,type = "class")
confusionMatrix(pred_train,as.factor(train$species))
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie       102         0      0
##   Chinstrap      0        48      0
##   Gentoo         0         0     83
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9843, 1)
##     No Information Rate : 0.4378     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 1.0000            1.000        1.0000
## Specificity                 1.0000            1.000        1.0000
## Pos Pred Value              1.0000            1.000        1.0000
## Neg Pred Value              1.0000            1.000        1.0000
## Prevalence                  0.4378            0.206        0.3562
## Detection Rate              0.4378            0.206        0.3562
## Detection Prevalence        0.4378            0.206        0.3562
## Balanced Accuracy           1.0000            1.000        1.0000

The Random Forest model classifies the penguin species in the training data very well. Note, however, that predictions on the training data are optimistic; the test-set evaluation below is more informative.

1.5.2 For Test Data

pred_test <- predict(model_rf1, test, type = "class")
confusionMatrix(pred_test, as.factor(test$species))
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        42         0      0
##   Chinstrap      2        20      0
##   Gentoo         0         0     36
## 
## Overall Statistics
##                                           
##                Accuracy : 0.98            
##                  95% CI : (0.9296, 0.9976)
##     No Information Rate : 0.44            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9688          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 0.9545           1.0000          1.00
## Specificity                 1.0000           0.9750          1.00
## Pos Pred Value              1.0000           0.9091          1.00
## Neg Pred Value              0.9655           1.0000          1.00
## Prevalence                  0.4400           0.2000          0.36
## Detection Rate              0.4200           0.2000          0.36
## Detection Prevalence        0.4200           0.2200          0.36
## Balanced Accuracy           0.9773           0.9875          1.00

The Random Forest model also classifies the penguin species in the testing data very well.
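
If only a few headline numbers are needed, they can be pulled out of the confusionMatrix object directly; a minimal sketch:

cm_test <- confusionMatrix(pred_test, as.factor(test$species))

# Overall accuracy and kappa
cm_test$overall[c("Accuracy", "Kappa")]

# Per-class sensitivity and specificity
cm_test$byClass[, c("Sensitivity", "Specificity")]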

1.6 Variable Importance

library(vip)
model_rf1$importance
##                   MeanDecreaseGini
## island                  22.6678091
## bill_length_mm          46.3996292
## bill_depth_mm           21.7765583
## flipper_length_mm       36.8454398
## body_mass_g             19.4230329
## sex                      0.9530409
var_importance <- vip(model_rf1, num_features = 5)
print(var_importance)
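
The randomForest package also provides its own importance plot, which shows the same MeanDecreaseGini ranking; a sketch:

# Built-in importance plot (Mean Decrease in Gini)
varImpPlot(model_rf1, main = "Variable importance, model_rf1")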

1.7 Control Parameters & Tuning

The randomForest() function used to fit the Random Forest model exposes several parameters that can be controlled, including the following (a usage sketch is given after this list):

1. ntree: the number of trees grown in the Random Forest model.

2. mtry: the number of variables randomly sampled as split candidates at each node.

3. maxnodes: the maximum number of terminal nodes each tree can have.
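
A minimal sketch of how these parameters are passed to randomForest(); the values used here (100 trees, mtry = 3, at most 20 terminal nodes) are purely illustrative, not tuned:

# Illustrative settings only: 100 trees, 3 candidate variables per split,
# and at most 20 terminal nodes per tree
model_rf_ctrl <- randomForest(species ~ ., data = train,
                              ntree = 100, mtry = 3, maxnodes = 20)
model_rf_ctrl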

1.7.1 Tuning Parameter V1

set.seed(123)

tune_grid <- expand.grid(
  .mtry = c(2, 3, 4, 5, 6)  # try several values for mtry
)

train_control <- trainControl(
  method = "cv",  # cross-validation
  number = 5,     # 5-fold cv
  search = "grid"
)

tune1 <- train(species ~ ., data = train, ntree = 50, method = "rf",
               metric = "Accuracy", tuneGrid = tune_grid,
               trControl = train_control)

1.7.2 Tuning Results

tune1
## Random Forest 
## 
## 233 samples
##   6 predictor
##   3 classes: 'Adelie', 'Chinstrap', 'Gentoo' 
## 
## No pre-processing
## Resampling: Cross-Validated (5 fold) 
## Summary of sample sizes: 188, 187, 186, 186, 185 
## Resampling results across tuning parameters:
## 
##   mtry  Accuracy   Kappa    
##   2     0.9827857  0.9730228
##   3     0.9825080  0.9723954
##   4     0.9825080  0.9725165
##   5     0.9825080  0.9723954
##   6     0.9780635  0.9656858
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
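
The selected mtry and the full resampling table can also be read directly off the caret object; a sketch:

# Best mtry chosen by cross-validation
tune1$bestTune

# Accuracy and Kappa across the whole mtry grid
tune1$results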

1.7.3 Evaluating the Tuned Model

model_tune1 <- randomForest(species ~ ., data = train, mtry = 2, ntree = 50)
# mtry = 2 is taken from the best accuracy found in Tuning Parameter V1

1.7.3.1 For Train Data

pred_train_tune1 <- predict(model_tune1, train, type = "class")
confusionMatrix(pred_train_tune1, as.factor(train$species))
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie       102         0      0
##   Chinstrap      0        48      0
##   Gentoo         0         0     83
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9843, 1)
##     No Information Rate : 0.4378     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 1.0000            1.000        1.0000
## Specificity                 1.0000            1.000        1.0000
## Pos Pred Value              1.0000            1.000        1.0000
## Neg Pred Value              1.0000            1.000        1.0000
## Prevalence                  0.4378            0.206        0.3562
## Detection Rate              0.4378            0.206        0.3562
## Detection Prevalence        0.4378            0.206        0.3562
## Balanced Accuracy           1.0000            1.000        1.0000

1.7.3.2 For Test Data

pred_test_tune1 <- predict(model_tune1, test, type = "class")
confusionMatrix(pred_test_tune1, as.factor(test$species))
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Adelie Chinstrap Gentoo
##   Adelie        43         0      0
##   Chinstrap      1        20      0
##   Gentoo         0         0     36
## 
## Overall Statistics
##                                           
##                Accuracy : 0.99            
##                  95% CI : (0.9455, 0.9997)
##     No Information Rate : 0.44            
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9844          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity                 0.9773           1.0000          1.00
## Specificity                 1.0000           0.9875          1.00
## Pos Pred Value              1.0000           0.9524          1.00
## Neg Pred Value              0.9825           1.0000          1.00
## Prevalence                  0.4400           0.2000          0.36
## Detection Rate              0.4300           0.2000          0.36
## Detection Prevalence        0.4300           0.2100          0.36
## Balanced Accuracy           0.9886           0.9938          1.00

1.7.4 Tuning Parameter V2

control <- trainControl(method="repeatedcv", number=10, repeats=3, search="grid")
tunegrid <- expand.grid(.mtry=c(sqrt(6)))
modellist <- list()

for (ntree in c(50, 300, 1000, 2500)) {
    set.seed(123)
    fit <- train(species~., data=train, method="rf", metric="Accuracy", tuneGrid=tunegrid, trControl=control, ntree=ntree)
    key <- toString(ntree)
    modellist[[key]] <- fit
}

results <- resamples(modellist)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: 50, 300, 1000, 2500 
## Number of resamples: 30 
## 
## Accuracy 
##           Min. 1st Qu. Median      Mean 3rd Qu. Max. NA's
## 50   0.9565217 0.95875      1 0.9871329       1    1    0
## 300  0.9565217 0.95875      1 0.9871329       1    1    0
## 1000 0.9565217 0.95875      1 0.9871329       1    1    0
## 2500 0.9565217 0.95875      1 0.9871329       1    1    0
## 
## Kappa 
##           Min.   1st Qu. Median      Mean 3rd Qu. Max. NA's
## 50   0.9283489 0.9360561      1 0.9797078       1    1    0
## 300  0.9283489 0.9360561      1 0.9797078       1    1    0
## 1000 0.9283489 0.9360561      1 0.9797078       1    1    0
## 2500 0.9283489 0.9360561      1 0.9797078       1    1    0

In this case, changing ntree has no significant effect on the accuracy of the RF model.
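
One way to see this at a glance is to compare the resampling distributions side by side; caret's resamples objects support lattice-style plots, for example:

# Accuracy and Kappa distributions for each ntree setting
dotplot(results)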

2 Random Forest for Regression

2.1 Pre-Processing

For the regression example, we will use the Boston data from the MASS library, which is available in R.

2.1.1 View Data

library(MASS)
head(Boston)
##      crim zn indus chas   nox    rm  age    dis rad tax ptratio  black lstat
## 1 0.00632 18  2.31    0 0.538 6.575 65.2 4.0900   1 296    15.3 396.90  4.98
## 2 0.02731  0  7.07    0 0.469 6.421 78.9 4.9671   2 242    17.8 396.90  9.14
## 3 0.02729  0  7.07    0 0.469 7.185 61.1 4.9671   2 242    17.8 392.83  4.03
## 4 0.03237  0  2.18    0 0.458 6.998 45.8 6.0622   3 222    18.7 394.63  2.94
## 5 0.06905  0  2.18    0 0.458 7.147 54.2 6.0622   3 222    18.7 396.90  5.33
## 6 0.02985  0  2.18    0 0.458 6.430 58.7 6.0622   3 222    18.7 394.12  5.21
##   medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7

2.1.2 Data Type

str(Boston)
## 'data.frame':    506 obs. of  14 variables:
##  $ crim   : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn     : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus  : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ chas   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ nox    : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm     : num  6.58 6.42 7.18 7 7.15 ...
##  $ age    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis    : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad    : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ tax    : num  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio: num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ black  : num  397 397 393 395 397 ...
##  $ lstat  : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv   : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

2.1.3 Summary Data

summary(Boston)
##       crim                zn             indus            chas        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.00000  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.00000  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.00000  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.06917  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.00000  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :1.00000  
##       nox               rm             age              dis        
##  Min.   :0.3850   Min.   :3.561   Min.   :  2.90   Min.   : 1.130  
##  1st Qu.:0.4490   1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100  
##  Median :0.5380   Median :6.208   Median : 77.50   Median : 3.207  
##  Mean   :0.5547   Mean   :6.285   Mean   : 68.57   Mean   : 3.795  
##  3rd Qu.:0.6240   3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188  
##  Max.   :0.8710   Max.   :8.780   Max.   :100.00   Max.   :12.127  
##       rad              tax           ptratio          black       
##  Min.   : 1.000   Min.   :187.0   Min.   :12.60   Min.   :  0.32  
##  1st Qu.: 4.000   1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38  
##  Median : 5.000   Median :330.0   Median :19.05   Median :391.44  
##  Mean   : 9.549   Mean   :408.2   Mean   :18.46   Mean   :356.67  
##  3rd Qu.:24.000   3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23  
##  Max.   :24.000   Max.   :711.0   Max.   :22.00   Max.   :396.90  
##      lstat            medv      
##  Min.   : 1.73   Min.   : 5.00  
##  1st Qu.: 6.95   1st Qu.:17.02  
##  Median :11.36   Median :21.20  
##  Mean   :12.65   Mean   :22.53  
##  3rd Qu.:16.95   3rd Qu.:25.00  
##  Max.   :37.97   Max.   :50.00

2.1.4 Missing Values

length(which(is.na(Boston)))
## [1] 0

The data contain no missing values.

2.1.5 Duplicated Data

Boston[duplicated(Boston), ]
##  [1] crim    zn      indus   chas    nox     rm      age     dis     rad    
## [10] tax     ptratio black   lstat   medv   
## <0 rows> (or 0-length row.names)

The dataset contains no duplicate rows.

2.2 Exploratory Data Analysis

library(GGally)
ggpairs(Boston)
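
With 14 numeric variables the full pairs plot is dense; a correlation heatmap is a lighter-weight alternative, for example with GGally::ggcorr:

# Correlation heatmap of the Boston variables, with coefficients printed
ggcorr(Boston, label = TRUE, label_round = 2)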

2.3 Training & Testing

Split the data into training and testing sets. Here we use a 70% training and 30% testing split.

# keep a subset of the variables (drop the rest)
library(dplyr)
boston2 <- Boston[,c('crim', 'indus', 'nox', 'rm', 'ptratio', 'black','lstat','medv')]
head(boston2)
##      crim indus   nox    rm ptratio  black lstat medv
## 1 0.00632  2.31 0.538 6.575    15.3 396.90  4.98 24.0
## 2 0.02731  7.07 0.469 6.421    17.8 396.90  9.14 21.6
## 3 0.02729  7.07 0.469 7.185    17.8 392.83  4.03 34.7
## 4 0.03237  2.18 0.458 6.998    18.7 394.63  2.94 33.4
## 5 0.06905  2.18 0.458 7.147    18.7 396.90  5.33 36.2
## 6 0.02985  2.18 0.458 6.430    18.7 394.12  5.21 28.7
set.seed(9)
split_bs <- sample.split(boston2$medv, SplitRatio = 0.7)
train_bs <- subset(boston2, split_bs == TRUE)
test_bs <- subset(boston2, split_bs == FALSE)
dim(train_bs)
## [1] 349   8
dim(test_bs)
## [1] 157   8

2.4 Build the Random Forest Model

2.4.1 Fit Model

library(randomForest)
model_rf_reg <- randomForest(medv ~ ., data = train_bs, ntree = 50)

2.4.2 Random Forest Plot

plot(model_rf_reg)

The error produced by model_rf_reg decreases as the number of trees (ntree) in the Random Forest model grows.
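
For a regression forest, the OOB error behind this plot is stored as a mean squared error per tree in model_rf_reg$mse, with the corresponding pseudo R-squared in model_rf_reg$rsq. A minimal sketch:

# OOB mean squared error after each additional tree
head(model_rf_reg$mse)

# OOB MSE and pseudo R-squared (proportion of variance explained) with all 50 trees
tail(model_rf_reg$mse, 1)
tail(model_rf_reg$rsq, 1)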

2.5 Evaluating the Model

2.5.1 For Train Data

library(forecast)
pred_train_reg <- predict(model_rf_reg, train_bs)
accuracy(pred_train_reg, train_bs$medv)
##                  ME     RMSE      MAE       MPE     MAPE
## Test set 0.00729784 1.666886 1.115503 -1.806519 5.838764

2.5.2 For Test Data

pred_test_reg <- predict(model_rf_reg, test_bs)
accuracy(pred_test_reg, test_bs$medv)
##                 ME     RMSE      MAE       MPE     MAPE
## Test set 0.4411356 4.840045 2.939951 -2.182561 12.52691

The RF model performs noticeably worse on the testing data than on the training data (test RMSE ≈ 4.84 vs. training RMSE ≈ 1.67), which is expected because training-set predictions from a Random Forest are optimistic.
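
The same error measures can be computed by hand, which makes the train/test gap explicit; a minimal sketch:

# RMSE computed directly for the training and testing sets
rmse <- function(actual, predicted) sqrt(mean((actual - predicted)^2))
rmse(train_bs$medv, pred_train_reg)
rmse(test_bs$medv, pred_test_reg)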

2.6 Plot of Predicted vs Actual Values

plot(test_bs$medv, pred_test_reg)
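
Adding a y = x reference line makes it easier to see where the predictions deviate from the actual values; a sketch:

# Predicted vs. actual medv on the test set, with a y = x reference line
plot(test_bs$medv, pred_test_reg,
     xlab = "Actual medv", ylab = "Predicted medv")
abline(0, 1, col = "red", lwd = 2)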

2.7 Variable Importance

library(vip)
model_rf_reg$importance
##         IncNodePurity
## crim        2207.9271
## indus       1908.8560
## nox         2096.1572
## rm          7902.6551
## ptratio     2566.4053
## black        764.8901
## lstat       7894.2563
var_importance_reg <- vip(model_rf_reg, num_features = 7)
print(var_importance_reg)
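
By default, randomForest only records IncNodePurity for regression; permutation importance (%IncMSE) becomes available if the model is refit with importance = TRUE. A sketch (this refits the model, so the numbers will differ slightly from model_rf_reg):

# Refit with permutation importance enabled, then show the %IncMSE ranking
set.seed(9)
model_rf_reg_imp <- randomForest(medv ~ ., data = train_bs,
                                 ntree = 50, importance = TRUE)
importance(model_rf_reg_imp, type = 1)  # type = 1: %IncMSE
varImpPlot(model_rf_reg_imp)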