121450022 - Sarah Natalia Geraldine

121450094 - Syifa Firnanda

121450105 - Raditia Riandi

121450112 - Christian Arvianus Nathanael Biran

121450157 - Salwa Naqwadisa Madinna

Import Library

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.0     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.1     ✔ tibble    3.1.8
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(mlr3verse)
## Loading required package: mlr3
library(mlr3tuning)
## Loading required package: paradox
library(paradox)
library(kknn)
library(ggpubr)
library(smotefamily)
library(skimr)
## 
## Attaching package: 'skimr'
## 
## The following object is masked from 'package:mlr3':
## 
##     partition
library(mlr3)
library(resample)
## 
## Attaching package: 'resample'
## 
## The following object is masked from 'package:mlr3':
## 
##     resample
library(mlr3learners)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:kknn':
## 
##     contr.dummy
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(lattice)
library(ggplot2)
library(dplyr)
library(rlist)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(gains)
library(leaps)
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(rpart)
library(rpart.plot)
library(ROSE)
## Loaded ROSE 0.0-4

Import Dataset

Mengimport dataset yang ingin digunakan.

library(readxl)
# Location of the raisin CSV file. The default is the original author's
# download folder; set the RAISIN_DATASET_PATH environment variable to read
# the file from another location without editing this script.
raisin_csv_path <- Sys.getenv(
  "RAISIN_DATASET_PATH",
  unset = "C:/Users/sarah/Downloads/Raisin_Dataset.csv"
)
# Read the data with readr; column types are guessed from the file contents.
Raisin_Dataset <- read_csv(raisin_csv_path)
## Rows: 900 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Class
## dbl (7): Area, MajorAxisLength, MinorAxisLength, Eccentricity, ConvexArea, E...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(Raisin_Dataset)
## Rows: 900
## Columns: 8
## $ Area            <dbl> 87524, 75166, 90856, 45928, 79408, 49242, 42492, 60952…
## $ MajorAxisLength <dbl> 442.2460, 406.6907, 442.2670, 286.5406, 352.1908, 318.…
## $ MinorAxisLength <dbl> 253.2912, 243.0324, 266.3283, 208.7600, 290.8275, 200.…
## $ Eccentricity    <dbl> 0.8197384, 0.8018052, 0.7983536, 0.6849892, 0.5640113,…
## $ ConvexArea      <dbl> 90546, 78789, 93717, 47336, 81463, 51368, 43904, 62329…
## $ Extent          <dbl> 0.7586506, 0.6841296, 0.6376128, 0.6995994, 0.7927719,…
## $ Perimeter       <dbl> 1184.040, 1121.786, 1208.575, 844.162, 1073.251, 881.8…
## $ Class           <chr> "Kecimen", "Kecimen", "Kecimen", "Kecimen", "Kecimen",…

Berdasarkan hasil di atas, diketahui bahwa dataset memiliki 900 baris dengan 8 kolom.

EDA

skim_without_charts(Raisin_Dataset)
Data summary
Name Raisin_Dataset
Number of rows 900
Number of columns 8
_______________________
Column type frequency:
character 1
numeric 7
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
Class 0 1 5 7 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Area 0 1 87804.13 39002.11 25387.00 59348.00 78902.00 105028.25 235047.00
MajorAxisLength 0 1 430.93 116.04 225.63 345.44 407.80 494.19 997.29
MinorAxisLength 0 1 254.49 49.99 143.71 219.11 247.85 279.89 492.28
Eccentricity 0 1 0.78 0.09 0.35 0.74 0.80 0.84 0.96
ConvexArea 0 1 91186.09 40769.29 26139.00 61513.25 81651.00 108375.75 278217.00
Extent 0 1 0.70 0.05 0.38 0.67 0.71 0.73 0.84
Perimeter 0 1 1165.91 273.76 619.07 966.41 1119.51 1308.39 2697.75

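Berdasarkan hasil di atas, diketahui bahwa tidak terdapat missing value pada tiap kolom dataset, dengan 1 kolom bertipe character (Class) dan 7 kolom bertipe numeric. Kolom Class selanjutnya diubah menjadi factor agar dapat digunakan sebagai peubah respon klasifikasi.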

# Convert the class label from character to factor so it can be used as a
# classification target, then re-inspect the column types.
Raisin_Dataset[["Class"]] <- as.factor(Raisin_Dataset[["Class"]])
Raisin_Dataset %>% glimpse()
## Rows: 900
## Columns: 8
## $ Area            <dbl> 87524, 75166, 90856, 45928, 79408, 49242, 42492, 60952…
## $ MajorAxisLength <dbl> 442.2460, 406.6907, 442.2670, 286.5406, 352.1908, 318.…
## $ MinorAxisLength <dbl> 253.2912, 243.0324, 266.3283, 208.7600, 290.8275, 200.…
## $ Eccentricity    <dbl> 0.8197384, 0.8018052, 0.7983536, 0.6849892, 0.5640113,…
## $ ConvexArea      <dbl> 90546, 78789, 93717, 47336, 81463, 51368, 43904, 62329…
## $ Extent          <dbl> 0.7586506, 0.6841296, 0.6376128, 0.6995994, 0.7927719,…
## $ Perimeter       <dbl> 1184.040, 1121.786, 1208.575, 844.162, 1073.251, 881.8…
## $ Class           <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …

Task

# Wrap the data in a classification task: "Class" is the target and
# "Kecimen" is treated as the positive class.
task_raisin <- as_task_classif(
  Raisin_Dataset,
  target = "Class",
  id = "Raisin",
  positive = "Kecimen"
)

Mendefinisikan objek task klasifikasi dengan Raisin_Dataset sebagai backend (data yang dimodelkan); target = “Class” dijadikan sebagai peubah respon yang bersifat kategorik, dengan “Kecimen” sebagai kelas positif.

Learner

# Fetch an untrained k-nearest-neighbour classifier ("classif.kknn") from the
# mlr3 learner dictionary and show its configuration.
learner <- mlr_learners$get("classif.kknn")
print(learner)
## <LearnerClassifKKNN:classif.kknn>
## * Model: -
## * Parameters: k=7
## * Packages: mlr3, mlr3learners, kknn
## * Predict Types:  [response], prob
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: multiclass, twoclass

Strategi Resampling

Query

as.data.table(mlr_resamplings)
##            key                         label        params iters
## 1:   bootstrap                     Bootstrap ratio,repeats    30
## 2:      custom                 Custom Splits                  NA
## 3:   custom_cv Custom Split Cross-Validation                  NA
## 4:          cv              Cross-Validation         folds    10
## 5:     holdout                       Holdout         ratio     1
## 6:    insample           Insample Resampling                   1
## 7:         loo                 Leave-One-Out                  NA
## 8: repeated_cv     Repeated Cross-Validation folds,repeats   100
## 9: subsampling                   Subsampling ratio,repeats    30

Holdout

Construction

# Holdout resampling: a single train/test split.
# The seed is fixed first so the random split drawn later is reproducible.
set.seed(123)
resampling_holdout <- mlr_resamplings$get("holdout")
print(resampling_holdout)
## <ResamplingHoldout>: Holdout
## * Iterations: 1
## * Instantiated: FALSE
## * Parameters: ratio=0.6667

Melakukan resampling dengan fungsi rsmp() menghasilkan iterasi 1 dengan parameters ratio sebesar 0.6667.

Instantiation

# Run the holdout resampling. The mlr3:: prefix is required because the
# 'resample' package attached earlier masks mlr3's resample().
resample_holdout <- mlr3::resample(
  task = task_raisin,
  learner = learner,
  resampling = resampling_holdout
)
## INFO  [17:12:28.921] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 1/1)
# Aggregate the classification accuracy over the (single) iteration.
resample_holdout$aggregate(measures = msr("classif.acc"))
## classif.acc 
##   0.8333333

Fungsi resample() di atas melatih model pada data train dan mengujinya pada data test hasil pembagian holdout untuk mengevaluasi performa model; akurasi yang diperoleh sebesar 0.8333.

Inspection

# Flatten the resample result into a table (one row per iteration) and show it.
resample_holdoutdt <- as.data.table(resample_holdout)
print(resample_holdoutdt)
##                 task                  learner              resampling iteration
## 1: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingHoldout[20]>         1
##                 prediction
## 1: <PredictionClassif[20]>

Tabel di atas merupakan informasi dasar yang didapatkan menggunakan fungsi as.data.table, yang menyimpan hasil resampling beserta prediksi pada data yang diuji sehingga informasi tersebut dapat diakses.

resample_holdoutdt$prediction
## [[1]]
## <PredictionClassif> for 300 observations:
##     row_ids   truth response
##           1 Kecimen    Besni
##           2 Kecimen  Kecimen
##           3 Kecimen  Kecimen
## ---                         
##         894   Besni    Besni
##         897   Besni  Kecimen
##         898   Besni    Besni
all.equal(resample_holdoutdt$prediction, resample_holdout$predictions())
## [1] TRUE

Pred

# Extract the combined prediction object of the holdout run and display it.
pred <- resample_holdout$prediction()
print(pred)
## <PredictionClassif> for 300 observations:
##     row_ids   truth response
##           1 Kecimen    Besni
##           2 Kecimen  Kecimen
##           3 Kecimen  Kecimen
## ---                         
##         894   Besni    Besni
##         897   Besni  Kecimen
##         898   Besni    Besni
pred$score(msr("classif.acc"))
## classif.acc 
##   0.8333333

ROC Analysis

Confusion Matrix-based Measures

# Draw a 75/25 train/test split. The mlr3:: prefix is needed because skimr,
# attached earlier, masks partition().
splits <- mlr3::partition(task_raisin, ratio = 0.75)
# Fit on the training rows only, then predict the held-out rows.
learner$train(task = task_raisin, row_ids = splits$train)
pred <- learner$predict(task = task_raisin, row_ids = splits$test)
# Confusion matrix of predicted vs. true class on the held-out rows.
print(pred$confusion)
##          truth
## response  Kecimen Besni
##   Kecimen      90    17
##   Besni        22    95
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
##          truth
## response  Kecimen Besni
##   Kecimen      90    17
##   Besni        22    95
## acc :  0.8259; ce  :  0.1741; dor :  22.8610; f1  :  0.8219 
## fdr :  0.1589; fnr :  0.1964; fomr:  0.1880; fpr :  0.1518 
## mcc :  0.6524; npv :  0.8120; ppv :  0.8411; tnr :  0.8482 
## tpr :  0.8036

Dari hasil analisis, diperoleh akurasi (acc) sebesar 0.8259, artinya sekitar 82.59% data terklasifikasi dengan benar. Nilai Positive Predictive Value (ppv) sebesar 0.8411, menunjukkan proporsi prediksi positif yang memang benar positif. Nilai Negative Predictive Value (npv) sebesar 0.8120, menunjukkan proporsi prediksi negatif yang memang benar negatif.

Standarisasi dan menangani imbalance class

# Preprocessing step 1: the "scale" pipe operator (feature standardisation).
standardize <- po("scale")

# Preprocessing step 2: the "smote" pipe operator with dup_size set to 1.
smote <- po("smote")
smote$param_set$values$dup_size <- 1

Standardize menerapkan proses standardisasi dengan fungsi scale(), yang akan mengubah variabel numerik pada dataset menjadi memiliki mean nol dan variansi satu. Proses standardisasi ini sering dilakukan sebelum melakukan analisis statistik atau machine learning untuk memperbaiki distribusi data.

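Smote menerapkan teknik oversampling SMOTE (Synthetic Minority Over-sampling Technique) dengan dup_size=1. Teknik SMOTE digunakan untuk menangani masalah kelas yang tidak seimbang (imbalanced class) dengan membuat sampel sintetis dari kelas minoritas. dup_size adalah parameter yang menentukan seberapa banyak sampel sintetis akan dibuat. Dalam kode tersebut, dup_size=1 berarti akan membuat jumlah sampel sintetis yang sama dengan jumlah sampel awal dari kelas minoritas. Perlu dicatat bahwa dataset ini sebenarnya sudah seimbang (450 Kecimen dan 450 Besni), sehingga penerapan SMOTE di sini justru menghasilkan kelas yang tidak seimbang (900 Kecimen dan 450 Besni), seperti terlihat pada keluaran di bawah.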

standardize$train(list(task_raisin))[[1]]$data() %>% glimpse
## Rows: 900
## Columns: 8
## $ Class           <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …
## $ Area            <dbl> -0.007182375, -0.324037016, 0.078248898, -1.073688739,…
## $ ConvexArea      <dbl> -0.01570030, -0.30407912, 0.06207883, -1.07556668, -0.…
## $ Eccentricity    <dbl> 0.42290649, 0.22435165, 0.18613557, -1.06902826, -2.40…
## $ Extent          <dbl> 1.106127610, -0.287616870, -1.157606092, 0.001710523, …
## $ MajorAxisLength <dbl> 0.09752272, -0.20889592, 0.09770402, -1.24435939, -0.6…
## $ MinorAxisLength <dbl> -0.02394487, -0.22916480, 0.23685627, -0.91476486, 0.7…
## $ Perimeter       <dbl> 0.06623714, -0.16116284, 0.15585802, -1.17526141, -0.3…
smote$train(list(task_raisin))[[1]]$data() %>% count(Class)
##      Class   n
## 1: Kecimen 900
## 2:   Besni 450
# Chain scaling, SMOTE and the kNN learner into one graph, then wrap the graph
# so it can be trained and used for prediction like an ordinary learner.
pipeline_graph <- standardize %>>% smote %>>% learner
kknn_pipeline <- GraphLearner$new(pipeline_graph)
print(kknn_pipeline)
## <GraphLearner:scale.smote.classif.kknn>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1, classif.kknn.k=7
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, kknn
## * Predict Types:  [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
##   POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
##   importance, loglik, missings, multiclass, oob_error,
##   selected_features, twoclass, weights

Interpretasi Model

# Fit the whole pipeline on every row of the task.
kknn_pipeline$train(task = task_raisin)
# Summarise the fitted kNN model stored inside the trained pipeline.
fitted_kknn <- kknn_pipeline$model$classif.kknn$model
summary(fitted_kknn)
##         Length Class      Mode
## formula 3      formula    call
## data    8      data.table list
## pv      1      -none-     list
## kknn    0      -none-     NULL
# Predict on the full task, i.e. on the same rows the pipeline was trained on
# (no held-out test set is involved in this step).
pred <- kknn_pipeline$predict(task = task_raisin)

# Score these in-sample predictions with classification accuracy.
perf <- pred$score(measures = msr("classif.acc"), task = task_raisin)
print(perf)
## classif.acc 
##   0.9322222
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
##          truth
## response  Kecimen Besni
##   Kecimen     445    56
##   Besni         5   394
## acc :  0.9322; ce  :  0.0678; dor :  626.1786; f1  :  0.9359 
## fdr :  0.1118; fnr :  0.0111; fomr:  0.0125; fpr :  0.1244 
## mcc :  0.8701; npv :  0.9875; ppv :  0.8882; tnr :  0.8756 
## tpr :  0.9889

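Nilai akurasi (acc) dari model yang diperoleh adalah sebesar 0.93, yang berarti model berhasil memprediksi dengan benar sebesar 93% dari total data yang ada. Hal ini menunjukkan bahwa model memiliki kinerja yang sangat baik dalam melakukan klasifikasi pada dataset yang digunakan. Namun, prediksi ini dilakukan pada data yang sama dengan data yang dipakai untuk melatih model, sehingga nilai tersebut merupakan akurasi in-sample yang cenderung optimistis dan bukan akurasi pada data uji.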

Cross-Validation

Construction

# 10-fold cross-validation, stratified by the target class.
# The seed makes the random fold assignment reproducible.
set.seed(321)
resampling_cv <- rsmp("cv", folds = 10)
# Declare the stratification column BEFORE instantiating: the folds are drawn
# at instantiate() time, so assigning the stratum role afterwards (the original
# order) left the folds unstratified.
task_raisin$col_roles$stratum <- "Class"
resampling_cv$instantiate(task_raisin)
# NOTE: fold membership (and hence the CV results) will differ from output
# rendered with the previous statement order; re-knit to refresh it.
print(resampling_cv)
## <ResamplingCV>: Cross-Validation
## * Iterations: 10
## * Instantiated: TRUE
## * Parameters: folds=10

Melakukan resampling dengan fungsi rsmp() menghasilkan iterasi 10 dengan parameters folds sebesar 10.

Instantiation

# Train and evaluate the learner on each of the instantiated CV folds.
# mlr3:: is required because the 'resample' package masks resample().
resample_cv <- mlr3::resample(
  task = task_raisin,
  learner = learner,
  resampling = resampling_cv
)
## INFO  [17:12:33.382] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 1/10)
## INFO  [17:12:33.450] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 2/10)
## INFO  [17:12:33.494] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 3/10)
## INFO  [17:12:33.548] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 4/10)
## INFO  [17:12:34.117] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 5/10)
## INFO  [17:12:34.165] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 6/10)
## INFO  [17:12:34.212] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 7/10)
## INFO  [17:12:34.256] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 8/10)
## INFO  [17:12:34.306] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 9/10)
## INFO  [17:12:34.353] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 10/10)
resample_cv$aggregate(msr("classif.acc"))
## classif.acc 
##   0.8477778

Dari hasil analisis prediksi, diperoleh akurasi klasifikasi sebesar 0.8477778.

print(resample_cv)
## <ResampleResult> of 10 iterations
## * Task: Raisin
## * Learner: classif.kknn
## * Warnings: 0 in 0 iterations
## * Errors: 0 in 0 iterations
as.data.table(resample_cv)
##                  task                  learner         resampling iteration
##  1: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         1
##  2: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         2
##  3: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         3
##  4: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         4
##  5: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         5
##  6: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         6
##  7: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         7
##  8: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         8
##  9: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         9
## 10: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>        10
##                  prediction
##  1: <PredictionClassif[20]>
##  2: <PredictionClassif[20]>
##  3: <PredictionClassif[20]>
##  4: <PredictionClassif[20]>
##  5: <PredictionClassif[20]>
##  6: <PredictionClassif[20]>
##  7: <PredictionClassif[20]>
##  8: <PredictionClassif[20]>
##  9: <PredictionClassif[20]>
## 10: <PredictionClassif[20]>
# Per-fold accuracy: score every iteration, then keep only the fold index
# and the accuracy column for display.
acc <- resample_cv$score(measures = msr("classif.acc"))
acc[, c("iteration", "classif.acc")]
##     iteration classif.acc
##  1:         1   0.8222222
##  2:         2   0.7888889
##  3:         3   0.8444444
##  4:         4   0.8444444
##  5:         5   0.8777778
##  6:         6   0.8333333
##  7:         7   0.9000000
##  8:         8   0.8222222
##  9:         9   0.8555556
## 10:        10   0.8888889
resample_cv$aggregate(msr("classif.acc"))
## classif.acc 
##   0.8477778

Fungsi resample() di atas melatih model pada data train dan mengujinya pada data test di setiap fold cross-validation untuk mengevaluasi performa model; rata-rata akurasi dari 10 fold adalah 0.8478.

Inspection

# Flatten the CV resample result into a table (one row per fold) and show it.
resample_cvdt <- as.data.table(resample_cv)
print(resample_cvdt)
##                  task                  learner         resampling iteration
##  1: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         1
##  2: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         2
##  3: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         3
##  4: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         4
##  5: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         5
##  6: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         6
##  7: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         7
##  8: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         8
##  9: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>         9
## 10: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]>        10
##                  prediction
##  1: <PredictionClassif[20]>
##  2: <PredictionClassif[20]>
##  3: <PredictionClassif[20]>
##  4: <PredictionClassif[20]>
##  5: <PredictionClassif[20]>
##  6: <PredictionClassif[20]>
##  7: <PredictionClassif[20]>
##  8: <PredictionClassif[20]>
##  9: <PredictionClassif[20]>
## 10: <PredictionClassif[20]>

Tabel di atas merupakan informasi dasar yang didapatkan menggunakan fungsi as.data.table, yang menyimpan hasil resampling beserta prediksi pada data yang diuji sehingga informasi tersebut dapat diakses.

resample_cvdt$prediction
## [[1]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##          15 Kecimen  Kecimen
##          36 Kecimen  Kecimen
##          42 Kecimen    Besni
## ---                         
##         862   Besni  Kecimen
##         882   Besni    Besni
##         891   Besni    Besni
## 
## [[2]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##           1 Kecimen    Besni
##          19 Kecimen  Kecimen
##          76 Kecimen  Kecimen
## ---                         
##         874   Besni    Besni
##         895   Besni    Besni
##         898   Besni    Besni
## 
## [[3]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##           2 Kecimen  Kecimen
##           3 Kecimen  Kecimen
##          29 Kecimen  Kecimen
## ---                         
##         885   Besni    Besni
##         886   Besni  Kecimen
##         900   Besni    Besni
## 
## [[4]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##           6 Kecimen  Kecimen
##          22 Kecimen  Kecimen
##          23 Kecimen  Kecimen
## ---                         
##         858   Besni    Besni
##         863   Besni    Besni
##         887   Besni    Besni
## 
## [[5]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##           8 Kecimen  Kecimen
##          17 Kecimen  Kecimen
##          27 Kecimen  Kecimen
## ---                         
##         876   Besni    Besni
##         897   Besni  Kecimen
##         899   Besni    Besni
## 
## [[6]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##           4 Kecimen  Kecimen
##          31 Kecimen    Besni
##          61 Kecimen  Kecimen
## ---                         
##         868   Besni  Kecimen
##         870   Besni    Besni
##         881   Besni    Besni
## 
## [[7]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##          10 Kecimen  Kecimen
##          13 Kecimen  Kecimen
##          43 Kecimen  Kecimen
## ---                         
##         890   Besni    Besni
##         893   Besni    Besni
##         896   Besni    Besni
## 
## [[8]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##           7 Kecimen  Kecimen
##           9 Kecimen  Kecimen
##          11 Kecimen    Besni
## ---                         
##         883   Besni    Besni
##         888   Besni    Besni
##         889   Besni  Kecimen
## 
## [[9]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##          14 Kecimen  Kecimen
##          18 Kecimen  Kecimen
##          34 Kecimen  Kecimen
## ---                         
##         825   Besni    Besni
##         830   Besni    Besni
##         894   Besni    Besni
## 
## [[10]]
## <PredictionClassif> for 90 observations:
##     row_ids   truth response
##           5 Kecimen  Kecimen
##          12 Kecimen  Kecimen
##          16 Kecimen  Kecimen
## ---                         
##         877   Besni    Besni
##         880   Besni    Besni
##         892   Besni    Besni
all.equal(resample_cvdt$prediction, resample_cv$predictions())
## [1] TRUE

pred

# Combine the predictions of all CV folds into one prediction object and show it.
pred <- resample_cv$prediction()
print(pred)
## <PredictionClassif> for 900 observations:
##     row_ids   truth response
##          15 Kecimen  Kecimen
##          36 Kecimen  Kecimen
##          42 Kecimen    Besni
## ---                         
##         877   Besni    Besni
##         880   Besni    Besni
##         892   Besni    Besni
pred$score(msr("classif.acc"))
## classif.acc 
##   0.8477778

Dari hasil analisis prediksi, diperoleh akurasi klasifikasi sebesar 0.8477778.

prop.table(table(task_raisin$data(rows = resampling_cv$test_set(1), cols = "Class")))
## Class
##   Kecimen     Besni 
## 0.5111111 0.4888889

ROC Analysis

# Draw a 75/25 train/test split. The mlr3:: prefix is needed because skimr,
# attached earlier, masks partition().
splits <- mlr3::partition(task_raisin, ratio = 0.75)
# Fit on the training rows only, then predict the held-out rows.
learner$train(task = task_raisin, row_ids = splits$train)
pred <- learner$predict(task = task_raisin, row_ids = splits$test)
# Confusion matrix of predicted vs. true class on the held-out rows.
print(pred$confusion)
##          truth
## response  Kecimen Besni
##   Kecimen      95    15
##   Besni        17    97
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
##          truth
## response  Kecimen Besni
##   Kecimen      95    15
##   Besni        17    97
## acc :  0.8571; ce  :  0.1429; dor :  36.1373; f1  :  0.8559 
## fdr :  0.1364; fnr :  0.1518; fomr:  0.1491; fpr :  0.1339 
## mcc :  0.7144; npv :  0.8509; ppv :  0.8636; tnr :  0.8661 
## tpr :  0.8482

Dari hasil analisis, diperoleh akurasi (acc) sebesar 0.8571, artinya sekitar 85.71% data terklasifikasi dengan benar. Nilai Positive Predictive Value (ppv) sebesar 0.8636, menunjukkan seberapa banyak hasil positif yang sebenarnya benar. Nilai Negative Predictive Value (npv) sebesar 0.8509, menunjukkan seberapa banyak hasil negatif yang sebenarnya benar.

Standarisasi dan menangani imbalance class

# Re-create the preprocessing operators for this section.
# "scale": feature standardisation.
standardize <- po("scale")

# "smote": SMOTE oversampling with dup_size set to 1.
smote <- po("smote")
smote$param_set$values$dup_size <- 1
standardize$train(list(task_raisin))[[1]]$data() %>% glimpse
## Rows: 900
## Columns: 8
## $ Class           <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …
## $ Area            <dbl> -0.007182375, -0.324037016, 0.078248898, -1.073688739,…
## $ ConvexArea      <dbl> -0.01570030, -0.30407912, 0.06207883, -1.07556668, -0.…
## $ Eccentricity    <dbl> 0.42290649, 0.22435165, 0.18613557, -1.06902826, -2.40…
## $ Extent          <dbl> 1.106127610, -0.287616870, -1.157606092, 0.001710523, …
## $ MajorAxisLength <dbl> 0.09752272, -0.20889592, 0.09770402, -1.24435939, -0.6…
## $ MinorAxisLength <dbl> -0.02394487, -0.22916480, 0.23685627, -0.91476486, 0.7…
## $ Perimeter       <dbl> 0.06623714, -0.16116284, 0.15585802, -1.17526141, -0.3…
smote$train(list(task_raisin))[[1]]$data() %>% count(Class)
##      Class   n
## 1: Kecimen 900
## 2:   Besni 450
# Chain scaling, SMOTE and the kNN learner into one graph, then wrap the graph
# so it can be trained and used for prediction like an ordinary learner.
pipeline_graph <- standardize %>>% smote %>>% learner
kknn_pipeline <- GraphLearner$new(pipeline_graph)
print(kknn_pipeline)
## <GraphLearner:scale.smote.classif.kknn>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1, classif.kknn.k=7
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, kknn
## * Predict Types:  [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
##   POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
##   importance, loglik, missings, multiclass, oob_error,
##   selected_features, twoclass, weights

Interpretasi

# Fit the whole pipeline on every row of the task.
kknn_pipeline$train(task = task_raisin)
# Summarise the fitted kNN model stored inside the trained pipeline.
fitted_kknn <- kknn_pipeline$model$classif.kknn$model
summary(fitted_kknn)
##         Length Class      Mode
## formula 3      formula    call
## data    8      data.table list
## pv      1      -none-     list
## kknn    0      -none-     NULL
# Predict on the full task, i.e. on the same rows the pipeline was trained on
# (no held-out test set is involved in this step).
pred <- kknn_pipeline$predict(task = task_raisin)

# Score these in-sample predictions with classification accuracy.
perf <- pred$score(measures = msr("classif.acc"), task = task_raisin)
print(perf)
## classif.acc 
##        0.93
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
##          truth
## response  Kecimen Besni
##   Kecimen     447    60
##   Besni         3   390
## acc :  0.9300; ce  :  0.0700; dor :  968.5000; f1  :  0.9342 
## fdr :  0.1183; fnr :  0.0067; fomr:  0.0076; fpr :  0.1333 
## mcc :  0.8670; npv :  0.9924; ppv :  0.8817; tnr :  0.8667 
## tpr :  0.9933

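Nilai akurasi (acc) dari model yang diperoleh adalah sebesar 0.9300, yang berarti model berhasil memprediksi dengan benar sebesar 93.00% dari total data yang ada. Hal ini menunjukkan bahwa model memiliki kinerja yang sangat baik dalam melakukan klasifikasi pada dataset yang digunakan. Namun, prediksi ini dilakukan pada data yang sama dengan data yang dipakai untuk melatih model, sehingga nilai tersebut merupakan akurasi in-sample yang cenderung optimistis.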

Bootstrap

# Bootstrap resampling with the strategy's default settings; the seed makes
# the random draws reproducible.
set.seed(123)
resampling_bootstrap <- mlr_resamplings$get("bootstrap")

# Run the resampling. mlr3:: is required because the 'resample' package
# masks resample().
rr <- mlr3::resample(
  task = task_raisin,
  learner = learner,
  resampling = resampling_bootstrap
)
## INFO  [17:12:37.678] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 1/30)
## INFO  [17:12:37.755] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 2/30)
## INFO  [17:12:37.814] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 3/30)
## INFO  [17:12:37.870] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 4/30)
## INFO  [17:12:37.926] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 5/30)
## INFO  [17:12:37.982] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 6/30)
## INFO  [17:12:38.041] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 7/30)
## INFO  [17:12:38.099] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 8/30)
## INFO  [17:12:38.169] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 9/30)
## INFO  [17:12:38.234] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 10/30)
## INFO  [17:12:38.301] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 11/30)
## INFO  [17:12:38.360] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 12/30)
## INFO  [17:12:38.421] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 13/30)
## INFO  [17:12:38.478] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 14/30)
## INFO  [17:12:38.535] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 15/30)
## INFO  [17:12:38.593] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 16/30)
## INFO  [17:12:38.648] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 17/30)
## INFO  [17:12:38.704] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 18/30)
## INFO  [17:12:38.762] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 19/30)
## INFO  [17:12:38.818] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 20/30)
## INFO  [17:12:38.879] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 21/30)
## INFO  [17:12:38.944] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 22/30)
## INFO  [17:12:39.009] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 23/30)
## INFO  [17:12:39.065] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 24/30)
## INFO  [17:12:39.121] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 25/30)
## INFO  [17:12:39.181] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 26/30)
## INFO  [17:12:39.234] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 27/30)
## INFO  [17:12:39.299] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 28/30)
## INFO  [17:12:39.355] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 29/30)
## INFO  [17:12:39.414] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 30/30)
# Show the evaluation result: accuracy aggregated over all bootstrap iterations.
rr$aggregate(measures = msr("classif.acc"))
## classif.acc 
##   0.8321325

Berdasarkan hasil di atas, diketahui bahwa hasil dari evaluasi model memiliki akurasi sebesar 0.8321325.

# Combine the predictions of all bootstrap iterations into one object and show it.
pred <- rr$prediction()
print(pred)
## <PredictionClassif> for 9957 observations:
##     row_ids   truth response
##           1 Kecimen    Besni
##           2 Kecimen    Besni
##           3 Kecimen    Besni
## ---                         
##         889   Besni  Kecimen
##         892   Besni    Besni
##         896   Besni    Besni
pred$score(msr("classif.acc"))
## classif.acc 
##   0.8319775
# Draw a 75/25 train/test split. The mlr3:: prefix is needed because skimr,
# attached earlier, masks partition().
splits <- mlr3::partition(task_raisin, ratio = 0.75)
# Fit on the training rows only, then predict the held-out rows.
learner$train(task = task_raisin, row_ids = splits$train)
pred <- learner$predict(task = task_raisin, row_ids = splits$test)
# Confusion matrix of predicted vs. true class on the held-out rows.
print(pred$confusion)
##          truth
## response  Kecimen Besni
##   Kecimen      96    18
##   Besni        16    94
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
##          truth
## response  Kecimen Besni
##   Kecimen      96    18
##   Besni        16    94
## acc :  0.8482; ce  :  0.1518; dor :  31.3333; f1  :  0.8496 
## fdr :  0.1579; fnr :  0.1429; fomr:  0.1455; fpr :  0.1607 
## mcc :  0.6965; npv :  0.8545; ppv :  0.8421; tnr :  0.8393 
## tpr :  0.8571

Dari hasil analisis, diperoleh akurasi (acc) sebesar 0.8482, artinya sekitar 84.82% data terklasifikasi dengan benar. Nilai Positive Predictive Value (ppv) sebesar 0.8421, menunjukkan proporsi prediksi positif yang memang benar positif. Nilai Negative Predictive Value (npv) sebesar 0.8545, menunjukkan proporsi prediksi negatif yang memang benar negatif.

# Re-create the preprocessing operators for this section.
# "scale": feature standardisation.
standardize <- po("scale")

# "smote": SMOTE oversampling with dup_size set to 1.
smote <- po("smote")
smote$param_set$values$dup_size <- 1

standardize$train(list(task_raisin))[[1]]$data() %>% glimpse
## Rows: 900
## Columns: 8
## $ Class           <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …
## $ Area            <dbl> -0.007182375, -0.324037016, 0.078248898, -1.073688739,…
## $ ConvexArea      <dbl> -0.01570030, -0.30407912, 0.06207883, -1.07556668, -0.…
## $ Eccentricity    <dbl> 0.42290649, 0.22435165, 0.18613557, -1.06902826, -2.40…
## $ Extent          <dbl> 1.106127610, -0.287616870, -1.157606092, 0.001710523, …
## $ MajorAxisLength <dbl> 0.09752272, -0.20889592, 0.09770402, -1.24435939, -0.6…
## $ MinorAxisLength <dbl> -0.02394487, -0.22916480, 0.23685627, -0.91476486, 0.7…
## $ Perimeter       <dbl> 0.06623714, -0.16116284, 0.15585802, -1.17526141, -0.3…
smote$train(list(task_raisin))[[1]]$data() %>% count(Class)
##      Class   n
## 1: Kecimen 900
## 2:   Besni 450
# Chain scaling, SMOTE and the kNN learner into one graph, then wrap the graph
# so it can be trained and used for prediction like an ordinary learner.
pipeline_graph <- standardize %>>% smote %>>% learner
kknn_pipeline <- GraphLearner$new(pipeline_graph)
print(kknn_pipeline)
## <GraphLearner:scale.smote.classif.kknn>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1, classif.kknn.k=7
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, kknn
## * Predict Types:  [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
##   POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
##   importance, loglik, missings, multiclass, oob_error,
##   selected_features, twoclass, weights
# Fit the whole pipeline on every row of the task.
kknn_pipeline$train(task = task_raisin)
# Summarise the fitted kNN model stored inside the trained pipeline.
fitted_kknn <- kknn_pipeline$model$classif.kknn$model
summary(fitted_kknn)
##         Length Class      Mode
## formula 3      formula    call
## data    8      data.table list
## pv      1      -none-     list
## kknn    0      -none-     NULL
# Predict on the full task, i.e. on the same rows the pipeline was trained on
# (no held-out test set is involved in this step).
pred <- kknn_pipeline$predict(task = task_raisin)

# Score these in-sample predictions with classification accuracy.
perf <- pred$score(measures = msr("classif.acc"), task = task_raisin)
print(perf)
## classif.acc 
##        0.93
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
##          truth
## response  Kecimen Besni
##   Kecimen     445    58
##   Besni         5   392
## acc :  0.9300; ce  :  0.0700; dor :  601.5172; f1  :  0.9339 
## fdr :  0.1153; fnr :  0.0111; fomr:  0.0126; fpr :  0.1289 
## mcc :  0.8660; npv :  0.9874; ppv :  0.8847; tnr :  0.8711 
## tpr :  0.9889

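Nilai akurasi (acc) dari model yang diperoleh adalah sebesar 0.9300, yang berarti model berhasil memprediksi dengan benar sebesar 93.00% dari total data yang ada. Hal ini menunjukkan bahwa model memiliki kinerja yang sangat baik dalam melakukan klasifikasi pada dataset yang digunakan. Namun, prediksi ini dilakukan pada data yang sama dengan data yang dipakai untuk melatih model, sehingga nilai tersebut merupakan akurasi in-sample yang cenderung optimistis.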