Dalam pembelajaran klasifikasi, kita akan menggunakan data penguins dari package palmerpenguins di R (package ini perlu diinstal terlebih dahulu, misalnya dengan install.packages("palmerpenguins")).
library(palmerpenguins)
# Preview the first rows of the penguins data as a quick sanity check.
head(penguins)
## # A tibble: 6 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## # ℹ 2 more variables: sex <fct>, year <int>
# Inspect the structure of the penguins data: dimensions and column types.
str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
## $ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ bill_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ bill_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
## $ year : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
# Per-column summary statistics; the output also reports NA counts per column.
summary(penguins)
## species island bill_length_mm bill_depth_mm
## Adelie :152 Biscoe :168 Min. :32.10 Min. :13.10
## Chinstrap: 68 Dream :124 1st Qu.:39.23 1st Qu.:15.60
## Gentoo :124 Torgersen: 52 Median :44.45 Median :17.30
## Mean :43.92 Mean :17.15
## 3rd Qu.:48.50 3rd Qu.:18.70
## Max. :59.60 Max. :21.50
## NA's :2 NA's :2
## flipper_length_mm body_mass_g sex year
## Min. :172.0 Min. :2700 female:165 Min. :2007
## 1st Qu.:190.0 1st Qu.:3550 male :168 1st Qu.:2007
## Median :197.0 Median :4050 NA's : 11 Median :2008
## Mean :200.9 Mean :4202 Mean :2008
## 3rd Qu.:213.0 3rd Qu.:4750 3rd Qu.:2009
## Max. :231.0 Max. :6300 Max. :2009
## NA's :2 NA's :2
# Total number of missing cells in the table (each TRUE counts as 1 in the sum).
sum(is.na(penguins))
## [1] 19
Data memiliki 19 missing value, sehingga perlu dilakukan imputasi atau pembersihan missing value tersebut. Pada contoh ini, baris yang mengandung missing value dihapus menggunakan na.omit().
# Drop every row that contains at least one NA and display the cleaned data
# (the surrounding parentheses print the assigned value).
(pg <- na.omit(penguins))
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # ℹ 323 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
# List rows that repeat an earlier row exactly; an empty result means the
# cleaned data has no duplicates.
pg[which(duplicated(pg)), ]
## # A tibble: 0 × 8
## # ℹ 8 variables: species <fct>, island <fct>, bill_length_mm <dbl>,
## # bill_depth_mm <dbl>, flipper_length_mm <int>, body_mass_g <int>, sex <fct>,
## # year <int>
Dataset tidak memiliki data duplikat.
library(GGally)
# Pairwise plot matrix (GGally) over all columns of the cleaned penguins data.
ggpairs(pg)
Membagi data menjadi data training & testing. Data training berfungsi untuk melatih algoritma model, sedangkan data pengujian untuk memvalidasi/mengevaluasi hasil pemodelan yang dibangun. Misalkan, digunakan proporsi training 70% dan testing 30%.
# Drop the `year` column before modelling.
# `select()` is namespace-qualified on purpose: MASS, which is attached later
# in this document, also exports a `select()` that masks the dplyr verb, so an
# unqualified call fails if this chunk is re-run in the same session.
library(dplyr)
pg2 <- pg %>% dplyr::select(-year)
pg2
## # A tibble: 333 × 7
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # ℹ 323 more rows
## # ℹ 1 more variable: sex <fct>
library(caTools)
# 70/30 hold-out split driven by the species labels; the fixed seed makes the
# partition reproducible. `split` is a logical mask with one flag per row.
set.seed(9)
split <- sample.split(pg2$species, SplitRatio = 0.7)
train <- pg2[split, ]
test <- pg2[!split, ]
dim(train)
## [1] 233 7
# Dimensions (rows, columns) of the test partition.
dim(test)
## [1] 100 7
library(randomForest)
# Make sure the response is a factor, fit a 50-tree random forest on all other
# columns, then print the response vector stored in the fitted object.
train[["species"]] <- as.factor(train[["species"]])
model_rf1 <- randomForest(
  species ~ .,
  data = train,
  ntree = 50
)
model_rf1$y
## 1 2 3 4 5 6 7 8
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 9 10 11 12 13 14 15 16
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 17 18 19 20 21 22 23 24
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 25 26 27 28 29 30 31 32
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 33 34 35 36 37 38 39 40
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 41 42 43 44 45 46 47 48
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 49 50 51 52 53 54 55 56
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 57 58 59 60 61 62 63 64
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 65 66 67 68 69 70 71 72
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 73 74 75 76 77 78 79 80
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 81 82 83 84 85 86 87 88
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 89 90 91 92 93 94 95 96
## Adelie Adelie Adelie Adelie Adelie Adelie Adelie Adelie
## 97 98 99 100 101 102 103 104
## Adelie Adelie Adelie Adelie Adelie Adelie Gentoo Gentoo
## 105 106 107 108 109 110 111 112
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 113 114 115 116 117 118 119 120
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 121 122 123 124 125 126 127 128
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 129 130 131 132 133 134 135 136
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 137 138 139 140 141 142 143 144
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 145 146 147 148 149 150 151 152
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 153 154 155 156 157 158 159 160
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 161 162 163 164 165 166 167 168
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 169 170 171 172 173 174 175 176
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 177 178 179 180 181 182 183 184
## Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo Gentoo
## 185 186 187 188 189 190 191 192
## Gentoo Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
## 193 194 195 196 197 198 199 200
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
## 201 202 203 204 205 206 207 208
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
## 209 210 211 212 213 214 215 216
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
## 217 218 219 220 221 222 223 224
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
## 225 226 227 228 229 230 231 232
## Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap Chinstrap
## 233
## Chinstrap
## Levels: Adelie Chinstrap Gentoo
# Plot the fitted forest via randomForest's plot method.
# NOTE(review): presumably error versus number of trees, as the accompanying
# text describes; confirm against the rendered figure.
plot(model_rf1)
Error yang dihasilkan model_rf1 cenderung menurun seiring dengan bertambahnya jumlah
pohon (ntree) yang digunakan dalam model Random Forest.
library("caret")
# Predict the training rows with the fitted forest and cross-tabulate the
# predicted labels against the observed species.
pred_train <- predict(model_rf1, newdata = train, type = "class")
confusionMatrix(
  data = pred_train,
  reference = as.factor(train$species)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 102 0 0
## Chinstrap 0 48 0
## Gentoo 0 0 83
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9843, 1)
## No Information Rate : 0.4378
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 1.0000 1.000 1.0000
## Specificity 1.0000 1.000 1.0000
## Pos Pred Value 1.0000 1.000 1.0000
## Neg Pred Value 1.0000 1.000 1.0000
## Prevalence 0.4378 0.206 0.3562
## Detection Rate 0.4378 0.206 0.3562
## Detection Prevalence 0.4378 0.206 0.3562
## Balanced Accuracy 1.0000 1.000 1.0000
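Hasil model Random Forest untuk data training sudah sangat baik dalam mengklasifikasikan kelas penguin (akurasi 100%). Namun, akurasi pada data training cenderung terlalu optimistis, sehingga performa model perlu dikonfirmasi pada data testing.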
# Score the held-out rows and compare the predicted labels with the observed
# species in a confusion matrix.
pred_test <- predict(model_rf1, newdata = test, type = "class")
confusionMatrix(
  data = pred_test,
  reference = as.factor(test$species)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 42 0 0
## Chinstrap 2 20 0
## Gentoo 0 0 36
##
## Overall Statistics
##
## Accuracy : 0.98
## 95% CI : (0.9296, 0.9976)
## No Information Rate : 0.44
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9688
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9545 1.0000 1.00
## Specificity 1.0000 0.9750 1.00
## Pos Pred Value 1.0000 0.9091 1.00
## Neg Pred Value 0.9655 1.0000 1.00
## Prevalence 0.4400 0.2000 0.36
## Detection Rate 0.4200 0.2000 0.36
## Detection Prevalence 0.4200 0.2200 0.36
## Balanced Accuracy 0.9773 0.9875 1.00
Hasil model Random Forest untuk data testing sudah sangat baik dalam mengklasifikasikan kelas penguin.
library(vip)
# Importance matrix stored in the fitted forest (MeanDecreaseGini per predictor).
model_rf1[["importance"]]
## MeanDecreaseGini
## island 22.6678091
## bill_length_mm 46.3996292
## bill_depth_mm 21.7765583
## flipper_length_mm 36.8454398
## body_mass_g 19.4230329
## sex 0.9530409
# Build the variable-importance chart, limited to five features, and render it.
var_importance <- vip(object = model_rf1, num_features = 5)
print(var_importance)
Pemodelan Random Forest menggunakan fungsi randomForest() memiliki jenis parameter yang dapat dikontrol, beberapa diantaranya yaitu :
1. ntree : banyaknya pohon yang dibangun untuk pemodelan Random Forest.
2. mtry : banyaknya variabel yang dipilih secara acak sebagai kandidat pada setiap pemisahan (split) node.
3. maxnodes : jumlah maksimum terminal node yang boleh dimiliki setiap pohon.
# Grid-search mtry with 5-fold cross-validation, ranking candidates by
# accuracy. ntree is held at 50 and forwarded to the underlying forest.
set.seed(123)
tune_grid <- expand.grid(.mtry = seq(2, 6, by = 1)) # candidate mtry values
train_control <- trainControl(method = "cv", number = 5, search = "grid")
tune1 <- train(
  species ~ .,
  data = train,
  method = "rf",
  metric = "Accuracy",
  tuneGrid = tune_grid,
  trControl = train_control,
  ntree = 50
)
tune1
## Random Forest
##
## 233 samples
## 6 predictor
## 3 classes: 'Adelie', 'Chinstrap', 'Gentoo'
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 188, 187, 186, 186, 185
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.9827857 0.9730228
## 3 0.9825080 0.9723954
## 4 0.9825080 0.9725165
## 5 0.9825080 0.9723954
## 6 0.9780635 0.9656858
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
# Refit the forest with mtry = 2, the value with the best cross-validated
# accuracy in the first tuning run (tune1), then evaluate it on the training rows.
model_tune1 <- randomForest(
  species ~ .,
  data = train,
  mtry = 2,
  ntree = 50
)
pred_train_tune1 <- predict(model_tune1, newdata = train, type = "class")
confusionMatrix(
  data = pred_train_tune1,
  reference = as.factor(train$species)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 102 0 0
## Chinstrap 0 48 0
## Gentoo 0 0 83
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9843, 1)
## No Information Rate : 0.4378
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 1.0000 1.000 1.0000
## Specificity 1.0000 1.000 1.0000
## Pos Pred Value 1.0000 1.000 1.0000
## Neg Pred Value 1.0000 1.000 1.0000
## Prevalence 0.4378 0.206 0.3562
## Detection Rate 0.4378 0.206 0.3562
## Detection Prevalence 0.4378 0.206 0.3562
## Balanced Accuracy 1.0000 1.000 1.0000
# Evaluate the tuned forest on the held-out rows.
pred_test_tune1 <- predict(model_tune1, newdata = test, type = "class")
confusionMatrix(
  data = pred_test_tune1,
  reference = as.factor(test$species)
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 43 0 0
## Chinstrap 1 20 0
## Gentoo 0 0 36
##
## Overall Statistics
##
## Accuracy : 0.99
## 95% CI : (0.9455, 0.9997)
## No Information Rate : 0.44
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9844
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9773 1.0000 1.00
## Specificity 1.0000 0.9875 1.00
## Pos Pred Value 1.0000 0.9524 1.00
## Neg Pred Value 0.9825 1.0000 1.00
## Prevalence 0.4400 0.2000 0.36
## Detection Rate 0.4300 0.2000 0.36
## Detection Prevalence 0.4300 0.2100 0.36
## Balanced Accuracy 0.9886 0.9938 1.00
# Compare forests of different sizes (ntree) under repeated 10-fold
# cross-validation (3 repeats) while mtry is held fixed.
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  search = "grid"
)
# mtry counts variables, so it must be a whole number. Derive it from the
# number of predictor columns (all columns except the response) instead of
# passing the fractional sqrt(6), which would only be rounded later on.
tunegrid <- expand.grid(.mtry = floor(sqrt(ncol(train) - 1)))
modellist <- list()
for (ntree in c(50, 300, 1000, 2500)) {
  # Re-seed on every iteration so each forest size is scored on the same
  # resampling folds and the results are directly comparable.
  set.seed(123)
  fit <- train(
    species ~ .,
    data = train,
    method = "rf",
    metric = "Accuracy",
    tuneGrid = tunegrid,
    trControl = control,
    ntree = ntree
  )
  # Store each fit under its tree count so resamples() labels the models.
  key <- toString(ntree)
  modellist[[key]] <- fit
}
results <- resamples(modellist)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: 50, 300, 1000, 2500
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 50 0.9565217 0.95875 1 0.9871329 1 1 0
## 300 0.9565217 0.95875 1 0.9871329 1 1 0
## 1000 0.9565217 0.95875 1 0.9871329 1 1 0
## 2500 0.9565217 0.95875 1 0.9871329 1 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 50 0.9283489 0.9360561 1 0.9797078 1 1 0
## 300 0.9283489 0.9360561 1 0.9797078 1 1 0
## 1000 0.9283489 0.9360561 1 0.9797078 1 1 0
## 2500 0.9283489 0.9360561 1 0.9797078 1 1 0
Pada kasus ini, mengubah ntree tidak berpengaruh signifikan terhadap akurasi model RF
Dalam pembelajaran regresi, kita akan menggunakan data Boston dari library(MASS) yang telah tersedia di R.
library(MASS)
# Preview the first rows of the Boston data.
head(Boston)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Inspect the structure of the Boston data: dimensions and column types.
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Per-column summary statistics for the Boston data.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Total number of missing cells in the Boston data (0 means none).
sum(is.na(Boston))
## [1] 0
Data tidak memiliki missing value.
# List rows that repeat an earlier row exactly; zero rows means no duplicates.
Boston[which(duplicated(Boston)), ]
## [1] crim zn indus chas nox rm age dis rad
## [10] tax ptratio black lstat medv
## <0 rows> (or 0-length row.names)
Dataset tidak memiliki data duplikat.
library(GGally)
# Pairwise plot matrix (GGally) over all columns of the Boston data.
ggpairs(Boston)
Membagi data menjadi data training & testing. Misalkan, digunakan proporsi training 70% dan testing 30%.
# Keep only the columns used for the regression (seven predictors plus the
# response `medv`) and preview the reduced data frame.
library(dplyr)
boston2 <- Boston[
  ,
  c("crim", "indus", "nox", "rm", "ptratio", "black", "lstat", "medv")
]
head(boston2)
## crim indus nox rm ptratio black lstat medv
## 1 0.00632 2.31 0.538 6.575 15.3 396.90 4.98 24.0
## 2 0.02731 7.07 0.469 6.421 17.8 396.90 9.14 21.6
## 3 0.02729 7.07 0.469 7.185 17.8 392.83 4.03 34.7
## 4 0.03237 2.18 0.458 6.998 18.7 394.63 2.94 33.4
## 5 0.06905 2.18 0.458 7.147 18.7 396.90 5.33 36.2
## 6 0.02985 2.18 0.458 6.430 18.7 394.12 5.21 28.7
# Hold out roughly 30% of the Boston rows for testing.
# The mask is built from the response column because sample.split() expects a
# vector with one entry per row; passing the whole data frame yields one flag
# per column instead. Both subsets are then carved out with this same
# `split_bs` mask (not the penguin `split` mask from earlier in the document),
# so every Boston row lands in exactly one of them.
# NOTE: the dimensions and error metrics printed further down were recorded
# before this correction and may differ once the document is re-run.
set.seed(9)
split_bs <- sample.split(boston2$medv, SplitRatio = 0.7)
train_bs <- subset(boston2, split_bs == TRUE)
test_bs <- subset(boston2, split_bs == FALSE)
# Dimensions (rows, columns) of the Boston training partition.
dim(train_bs)
## [1] 349 8
# Dimensions (rows, columns) of the Boston test partition.
dim(test_bs)
## [1] 157 8
library(rpart)
# Fit a 50-tree regression forest for `medv` on all other columns of the
# training partition, then plot the fitted object.
model_rf_reg <- randomForest(
  medv ~ .,
  data = train_bs,
  ntree = 50
)
plot(model_rf_reg)
Error yang dihasilkan model_rf_reg cenderung menurun seiring dengan bertambahnya jumlah
pohon (ntree) yang digunakan dalam model Random Forest.
library(forecast)
# Predict the training rows and summarise the errors against the observed
# `medv` values (ME, RMSE, MAE, MPE, MAPE).
pred_train_reg <- predict(model_rf_reg, newdata = train_bs)
accuracy(pred_train_reg, x = train_bs$medv)
## ME RMSE MAE MPE MAPE
## Test set 0.00729784 1.666886 1.115503 -1.806519 5.838764
# Predict the held-out rows and summarise the errors against the observed
# `medv` values (ME, RMSE, MAE, MPE, MAPE).
pred_test_reg <- predict(model_rf_reg, newdata = test_bs)
accuracy(pred_test_reg, x = test_bs$medv)
## ME RMSE MAE MPE MAPE
## Test set 0.4411356 4.840045 2.939951 -2.182561 12.52691
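Hasil model RF untuk data testing memiliki performa yang lebih rendah dibandingkan data training (RMSE 4,84 berbanding 1,67; MAPE 12,53% berbanding 5,84%), sehingga model lebih sesuai (fit) pada data training daripada pada data baru.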
# Scatter plot of observed `medv` (x-axis) against the forest's predictions
# for the test partition (y-axis).
plot(test_bs$medv, pred_test_reg)
library(vip)
# Importance matrix stored in the regression forest (IncNodePurity per predictor).
model_rf_reg[["importance"]]
## IncNodePurity
## crim 2207.9271
## indus 1908.8560
## nox 2096.1572
## rm 7902.6551
## ptratio 2566.4053
## black 764.8901
## lstat 7894.2563
# Build the variable-importance chart for the regression forest, limited to
# seven features, and render it.
var_importance_reg <- vip(object = model_rf_reg, num_features = 7)
print(var_importance_reg)