Dalam pembelajaran klasifikasi, kita akan menggunakan data dari library palmerpenguins yang telah tersedia di R.
# Attach the package that provides the `penguins` data set, then preview the
# first six rows.
library(palmerpenguins)
head(penguins, n = 6L)
## # A tibble: 6 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen NA NA NA NA
## 5 Adelie Torgersen 36.7 19.3 193 3450
## 6 Adelie Torgersen 39.3 20.6 190 3650
## # ℹ 2 more variables: sex <fct>, year <int>
# Show the structure of the data: column types and leading values.
str(penguins)
## tibble [344 × 8] (S3: tbl_df/tbl/data.frame)
## $ species : Factor w/ 3 levels "Adelie","Chinstrap",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ island : Factor w/ 3 levels "Biscoe","Dream",..: 3 3 3 3 3 3 3 3 3 3 ...
## $ bill_length_mm : num [1:344] 39.1 39.5 40.3 NA 36.7 39.3 38.9 39.2 34.1 42 ...
## $ bill_depth_mm : num [1:344] 18.7 17.4 18 NA 19.3 20.6 17.8 19.6 18.1 20.2 ...
## $ flipper_length_mm: int [1:344] 181 186 195 NA 193 190 181 195 193 190 ...
## $ body_mass_g : int [1:344] 3750 3800 3250 NA 3450 3650 3625 4675 3475 4250 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 1 NA 1 2 1 2 NA NA ...
## $ year : int [1:344] 2007 2007 2007 2007 2007 2007 2007 2007 2007 2007 ...
# Per-column summary statistics; the "NA's" entries report missing values.
summary(penguins)
## species island bill_length_mm bill_depth_mm
## Adelie :152 Biscoe :168 Min. :32.10 Min. :13.10
## Chinstrap: 68 Dream :124 1st Qu.:39.23 1st Qu.:15.60
## Gentoo :124 Torgersen: 52 Median :44.45 Median :17.30
## Mean :43.92 Mean :17.15
## 3rd Qu.:48.50 3rd Qu.:18.70
## Max. :59.60 Max. :21.50
## NA's :2 NA's :2
## flipper_length_mm body_mass_g sex year
## Min. :172.0 Min. :2700 female:165 Min. :2007
## 1st Qu.:190.0 1st Qu.:3550 male :168 1st Qu.:2007
## Median :197.0 Median :4050 NA's : 11 Median :2008
## Mean :200.9 Mean :4202 Mean :2008
## 3rd Qu.:213.0 3rd Qu.:4750 3rd Qu.:2009
## Max. :231.0 Max. :6300 Max. :2009
## NA's :2 NA's :2
# Total number of missing cells in the table (TRUE counts as 1 when summed).
sum(is.na(penguins))
## [1] 19
Data memiliki 19 missing value, sehingga perlu dilakukan imputasi atau pembersihan missing value tersebut. Pada contoh ini, baris yang mengandung missing value dihapus menggunakan na.omit().
# Keep only complete rows: every row containing at least one NA is removed.
pg <- na.omit(penguins)
# Print the cleaned table.
pg
## # A tibble: 333 × 8
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # ℹ 323 more rows
## # ℹ 2 more variables: sex <fct>, year <int>
# List rows that are exact copies of an earlier row; an empty result means
# the data has no duplicates.
pg[duplicated(pg), ]
## # A tibble: 0 × 8
## # ℹ 8 variables: species <fct>, island <fct>, bill_length_mm <dbl>,
## # bill_depth_mm <dbl>, flipper_length_mm <int>, body_mass_g <int>, sex <fct>,
## # year <int>
Dataset tidak memiliki data duplikat.
# Draw a pairwise plot matrix for all columns of the cleaned data.
library(GGally)
ggpairs(pg)
Membagi data menjadi data training & testing. Data training berfungsi untuk melatih algoritma model, sedangkan data testing digunakan untuk memvalidasi/mengevaluasi hasil pemodelan yang dibangun. Misalkan, digunakan proporsi training 70% dan testing 30%.
# Drop the `year` column before modelling.
library(dplyr)
# select() is namespace-qualified on purpose: MASS is attached later in this
# script and also exports select(), which would mask dplyr's version if this
# chunk is run again in the same session.
pg2 <- pg %>% dplyr::select(-year)
pg2
## # A tibble: 333 × 7
## species island bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
## <fct> <fct> <dbl> <dbl> <int> <int>
## 1 Adelie Torgersen 39.1 18.7 181 3750
## 2 Adelie Torgersen 39.5 17.4 186 3800
## 3 Adelie Torgersen 40.3 18 195 3250
## 4 Adelie Torgersen 36.7 19.3 193 3450
## 5 Adelie Torgersen 39.3 20.6 190 3650
## 6 Adelie Torgersen 38.9 17.8 181 3625
## 7 Adelie Torgersen 39.2 19.6 195 4675
## 8 Adelie Torgersen 41.1 17.6 182 3200
## 9 Adelie Torgersen 38.6 21.2 191 3800
## 10 Adelie Torgersen 34.6 21.1 198 4400
## # ℹ 323 more rows
## # ℹ 1 more variable: sex <fct>
library(caTools)
# Fix the RNG seed so the split is reproducible.
set.seed(9)
# Logical mask over the rows of pg2: TRUE marks a training row (70%),
# FALSE a testing row (30%). The mask is drawn on the species labels.
split <- sample.split(pg2$species, SplitRatio = 0.7)
train <- subset(pg2, split)
test <- subset(pg2, !split)
dim(train)
## [1] 233 7
dim(test)
## [1] 100 7
library(rpart)
# Fit a tree that predicts species from all other columns of the training set.
model1 <- rpart(formula = species ~ ., data = train)
# Response values stored on the fitted object (printed as integer codes).
model1$y
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [38] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [75] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 3 3 3 3 3 3 3 3 3
## [112] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [149] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
## [186] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [223] 2 2 2 2 2 2 2 2 2 2 2
library(rpart.plot)
# Visualise the fitted tree.
rpart.plot(model1, cex = 0.8, box.palette = "auto")
library(caret)
# Predicted class labels for the training rows, compared with the true labels.
pred_train <- predict(model1, newdata = train, type = "class")
confusionMatrix(data = pred_train, reference = train$species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 98 3 0
## Chinstrap 4 41 1
## Gentoo 0 4 82
##
## Overall Statistics
##
## Accuracy : 0.9485
## 95% CI : (0.9118, 0.9731)
## No Information Rate : 0.4378
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9193
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9608 0.8542 0.9880
## Specificity 0.9771 0.9730 0.9733
## Pos Pred Value 0.9703 0.8913 0.9535
## Neg Pred Value 0.9697 0.9626 0.9932
## Prevalence 0.4378 0.2060 0.3562
## Detection Rate 0.4206 0.1760 0.3519
## Detection Prevalence 0.4335 0.1974 0.3691
## Balanced Accuracy 0.9689 0.9136 0.9806
Hasil model tree untuk data training sudah baik dalam mengklasifikasikan kelas penguin, dengan akurasi sebesar 94,85%.
# Predicted class labels for the held-out testing rows, compared with the
# true labels.
pred_test <- predict(model1, newdata = test, type = "class")
confusionMatrix(data = pred_test, reference = test$species)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 40 1 0
## Chinstrap 2 18 0
## Gentoo 2 1 36
##
## Overall Statistics
##
## Accuracy : 0.94
## 95% CI : (0.874, 0.9777)
## No Information Rate : 0.44
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.9061
##
## Mcnemar's Test P-Value : 0.343
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9091 0.9000 1.0000
## Specificity 0.9821 0.9750 0.9531
## Pos Pred Value 0.9756 0.9000 0.9231
## Neg Pred Value 0.9322 0.9750 1.0000
## Prevalence 0.4400 0.2000 0.3600
## Detection Rate 0.4000 0.1800 0.3600
## Detection Prevalence 0.4100 0.2000 0.3900
## Balanced Accuracy 0.9456 0.9375 0.9766
Hasil model tree untuk data testing sudah baik dalam mengklasifikasikan kelas penguin, dengan akurasi sebesar 94%. Nilai ini tidak jauh berbeda dari akurasi pada data training (94,85%).
library(vip)
# Variable importance scores stored on the fitted classification tree.
model1$variable.importance
## flipper_length_mm bill_length_mm bill_depth_mm body_mass_g
## 88.68215 80.88095 65.62114 60.16413
## island
## 50.70759
# Build the importance plot for five predictors, then draw it.
var_importance <- vip(model1, num_features = 5)
print(var_importance)
Pemodelan Decision Tree menggunakan library(rpart) memiliki beberapa parameter yang dapat dikontrol melalui rpart.control(), di antaranya:
1. minsplit : jumlah minimum pengamatan yang harus ada dalam sebuah node agar pemisahan (split) dapat dicoba.
2. minbucket : jumlah minimum pengamatan dalam node leaf.
3. xval : jumlah fold cross-validation.
4. maxdepth : kedalaman maksimum pohon (jumlah keputusan dari atas ke bawah).
5. cp : complexity parameter; pemisahan yang tidak menurunkan lack of fit model sebesar faktor cp tidak akan dicoba.
# Open the reference page for the tree-growing control parameters.
help("rpart.control")
# Build the control settings with rpart.control() instead of c(): rpart()
# documents `control` as a list of options, and c() would coerce the settings
# into a plain numeric vector.
model2 <- rpart(
  species ~ .,
  data = train,
  control = rpart.control(
    cp = 0.005,
    xval = 5,
    minsplit = 10,
    minbucket = 5,
    maxdepth = 3
  )
)
# Visualise the tuned tree.
rpart.plot(model2)
# Confusion matrix of the tuned tree on the training rows.
confusionMatrix(
  data = predict(model2, newdata = train, type = "class"),
  reference = train$species
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 98 3 0
## Chinstrap 4 45 2
## Gentoo 0 0 81
##
## Overall Statistics
##
## Accuracy : 0.9614
## 95% CI : (0.9279, 0.9822)
## No Information Rate : 0.4378
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9398
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9608 0.9375 0.9759
## Specificity 0.9771 0.9676 1.0000
## Pos Pred Value 0.9703 0.8824 1.0000
## Neg Pred Value 0.9697 0.9835 0.9868
## Prevalence 0.4378 0.2060 0.3562
## Detection Rate 0.4206 0.1931 0.3476
## Detection Prevalence 0.4335 0.2189 0.3476
## Balanced Accuracy 0.9689 0.9525 0.9880
# Confusion matrix of the tuned tree on the held-out testing rows.
confusionMatrix(
  data = predict(model2, newdata = test, type = "class"),
  reference = test$species
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction Adelie Chinstrap Gentoo
## Adelie 40 1 0
## Chinstrap 4 19 1
## Gentoo 0 0 35
##
## Overall Statistics
##
## Accuracy : 0.94
## 95% CI : (0.874, 0.9777)
## No Information Rate : 0.44
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9071
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Adelie Class: Chinstrap Class: Gentoo
## Sensitivity 0.9091 0.9500 0.9722
## Specificity 0.9821 0.9375 1.0000
## Pos Pred Value 0.9756 0.7917 1.0000
## Neg Pred Value 0.9322 0.9868 0.9846
## Prevalence 0.4400 0.2000 0.3600
## Detection Rate 0.4000 0.1900 0.3500
## Detection Prevalence 0.4100 0.2400 0.3500
## Balanced Accuracy 0.9456 0.9437 0.9861
Dalam pembelajaran regresi, kita akan menggunakan data Boston dari library(MASS) yang telah tersedia di R.
# Attach MASS, which provides the `Boston` data frame, and preview six rows.
library(MASS)
head(Boston, n = 6L)
## crim zn indus chas nox rm age dis rad tax ptratio black lstat
## 1 0.00632 18 2.31 0 0.538 6.575 65.2 4.0900 1 296 15.3 396.90 4.98
## 2 0.02731 0 7.07 0 0.469 6.421 78.9 4.9671 2 242 17.8 396.90 9.14
## 3 0.02729 0 7.07 0 0.469 7.185 61.1 4.9671 2 242 17.8 392.83 4.03
## 4 0.03237 0 2.18 0 0.458 6.998 45.8 6.0622 3 222 18.7 394.63 2.94
## 5 0.06905 0 2.18 0 0.458 7.147 54.2 6.0622 3 222 18.7 396.90 5.33
## 6 0.02985 0 2.18 0 0.458 6.430 58.7 6.0622 3 222 18.7 394.12 5.21
## medv
## 1 24.0
## 2 21.6
## 3 34.7
## 4 33.4
## 5 36.2
## 6 28.7
# Show the structure of the data: column types and leading values.
str(Boston)
## 'data.frame': 506 obs. of 14 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ chas : int 0 0 0 0 0 0 0 0 0 0 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ age : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ tax : num 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio: num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ black : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
# Per-column summary statistics.
summary(Boston)
## crim zn indus chas
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.00000
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.00000
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.00000
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.06917
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.00000
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :1.00000
## nox rm age dis
## Min. :0.3850 Min. :3.561 Min. : 2.90 Min. : 1.130
## 1st Qu.:0.4490 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100
## Median :0.5380 Median :6.208 Median : 77.50 Median : 3.207
## Mean :0.5547 Mean :6.285 Mean : 68.57 Mean : 3.795
## 3rd Qu.:0.6240 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188
## Max. :0.8710 Max. :8.780 Max. :100.00 Max. :12.127
## rad tax ptratio black
## Min. : 1.000 Min. :187.0 Min. :12.60 Min. : 0.32
## 1st Qu.: 4.000 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38
## Median : 5.000 Median :330.0 Median :19.05 Median :391.44
## Mean : 9.549 Mean :408.2 Mean :18.46 Mean :356.67
## 3rd Qu.:24.000 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23
## Max. :24.000 Max. :711.0 Max. :22.00 Max. :396.90
## lstat medv
## Min. : 1.73 Min. : 5.00
## 1st Qu.: 6.95 1st Qu.:17.02
## Median :11.36 Median :21.20
## Mean :12.65 Mean :22.53
## 3rd Qu.:16.95 3rd Qu.:25.00
## Max. :37.97 Max. :50.00
# Total number of missing cells in the table (TRUE counts as 1 when summed).
sum(is.na(Boston))
## [1] 0
Data tidak memiliki missing value.
# Count rows of Boston that are exact copies of an earlier row. The check has
# to run on Boston itself, not on the penguin table `pg`.
sum(duplicated(Boston))
## [1] 0
Dataset tidak memiliki data duplikat.
# Draw a pairwise plot matrix for all columns of Boston.
library(GGally)
ggpairs(Boston)
Membagi data menjadi data training & testing. Misalkan, digunakan proporsi training 70% dan testing 30%.
# Keep only the columns used for modelling; all other columns are dropped.
library(dplyr)
keep_cols <- c(
  "crim", "indus", "nox", "rm", "ptratio", "black", "lstat", "medv"
)
boston2 <- Boston[, keep_cols]
head(boston2)
## crim indus nox rm ptratio black lstat medv
## 1 0.00632 2.31 0.538 6.575 15.3 396.90 4.98 24.0
## 2 0.02731 7.07 0.469 6.421 17.8 396.90 9.14 21.6
## 3 0.02729 7.07 0.469 7.185 17.8 392.83 4.03 34.7
## 4 0.03237 2.18 0.458 6.998 18.7 394.63 2.94 33.4
## 5 0.06905 2.18 0.458 7.147 18.7 396.90 5.33 36.2
## 6 0.02985 2.18 0.458 6.430 18.7 394.12 5.21 28.7
# Reproducible 70/30 random split of the Boston rows.
# The mask must have one entry per row of boston2 and must be the one used
# for filtering; `split` is the mask that belongs to the penguin data.
# sample.split() is meant for class labels, so row indices are drawn directly
# for this numeric response.
# NOTE(review): results printed further down that depend on train_bs/test_bs
# must be regenerated after this change.
set.seed(9)
n_bs <- nrow(boston2)
train_rows <- sample(seq_len(n_bs), size = floor(0.7 * n_bs))
split_bs <- seq_len(n_bs) %in% train_rows
train_bs <- boston2[split_bs, ]
test_bs <- boston2[!split_bs, ]
dim(train_bs)
## [1] 354 8
dim(test_bs)
## [1] 152 8
library(rpart)
# Fit a tree that predicts medv from all other columns of the training set.
tree_reg <- rpart(formula = medv ~ ., data = train_bs)
rpart.plot(tree_reg, cex = 0.8, box.palette = "auto")
library(forecast)
# Error measures of the fitted values against the observed training medv.
pred_train_reg <- predict(tree_reg, newdata = train_bs)
accuracy(pred_train_reg, train_bs$medv)
## ME RMSE MAE MPE MAPE
## Test set -1.446082e-15 3.58444 2.633462 -3.312994 13.86545
# Error measures of the predictions against the observed testing medv.
pred_test_reg <- predict(tree_reg, newdata = test_bs)
accuracy(pred_test_reg, test_bs$medv)
## ME RMSE MAE MPE MAPE
## Test set 0.4088309 5.568173 3.432767 -2.45946 15.54483
Berdasarkan MAPE, performa model tree pada data testing (15,54%) tidak jauh berbeda dengan data training (13,87%), walaupun RMSE naik dari 3,58 menjadi 5,57.
# Scatter plot of observed medv (x-axis) against predicted medv (y-axis)
# for the testing rows.
plot(test_bs$medv, pred_test_reg)
library(vip)
# Variable importance scores stored on the fitted regression tree.
tree_reg$variable.importance
## rm lstat ptratio indus nox crim black
## 15486.855 10971.740 5334.476 4644.154 3770.401 3748.424 1779.898
# Build the importance plot for seven predictors, then draw it.
var_importance_reg <- vip(tree_reg, num_features = 7)
print(var_importance_reg)