121450022 - Sarah Natalia Geraldine
121450094 - Syifa Firnanda
121450105 - Raditia Riandi
121450112 - Christian Arvianus Nathanael Biran
121450157 - Salwa Naqwadisa Madinna
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlr3verse)
## Loading required package: mlr3
library(mlr3tuning)
## Loading required package: paradox
library(paradox)
library(kknn)
library(ggpubr)
library(smotefamily)
library(skimr)
##
## Attaching package: 'skimr'
##
## The following object is masked from 'package:mlr3':
##
## partition
library(mlr3)
library(resample)
##
## Attaching package: 'resample'
##
## The following object is masked from 'package:mlr3':
##
## resample
library(mlr3learners)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:kknn':
##
## contr.dummy
##
## The following object is masked from 'package:purrr':
##
## lift
library(lattice)
library(ggplot2)
library(dplyr)
library(rlist)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(gains)
library(leaps)
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(rpart)
library(rpart.plot)
library(ROSE)
## Loaded ROSE 0.0-4
Mengimport dataset yang ingin digunakan.
library(readxl)
Raisin_Dataset <- read_csv("C:/Users/sarah/Downloads/Raisin_Dataset.csv")
## Rows: 900 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Class
## dbl (7): Area, MajorAxisLength, MinorAxisLength, Eccentricity, ConvexArea, E...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(Raisin_Dataset)
## Rows: 900
## Columns: 8
## $ Area <dbl> 87524, 75166, 90856, 45928, 79408, 49242, 42492, 60952…
## $ MajorAxisLength <dbl> 442.2460, 406.6907, 442.2670, 286.5406, 352.1908, 318.…
## $ MinorAxisLength <dbl> 253.2912, 243.0324, 266.3283, 208.7600, 290.8275, 200.…
## $ Eccentricity <dbl> 0.8197384, 0.8018052, 0.7983536, 0.6849892, 0.5640113,…
## $ ConvexArea <dbl> 90546, 78789, 93717, 47336, 81463, 51368, 43904, 62329…
## $ Extent <dbl> 0.7586506, 0.6841296, 0.6376128, 0.6995994, 0.7927719,…
## $ Perimeter <dbl> 1184.040, 1121.786, 1208.575, 844.162, 1073.251, 881.8…
## $ Class <chr> "Kecimen", "Kecimen", "Kecimen", "Kecimen", "Kecimen",…
Berdasarkan hasil di atas, diketahui bahwa dataset memiliki 900 baris dengan 8 kolom.
skim_without_charts(Raisin_Dataset)
| Name | Raisin_Dataset |
| Number of rows | 900 |
| Number of columns | 8 |
| _______________________ | |
| Column type frequency: | |
| character | 1 |
| numeric | 7 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| Class | 0 | 1 | 5 | 7 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Area | 0 | 1 | 87804.13 | 39002.11 | 25387.00 | 59348.00 | 78902.00 | 105028.25 | 235047.00 |
| MajorAxisLength | 0 | 1 | 430.93 | 116.04 | 225.63 | 345.44 | 407.80 | 494.19 | 997.29 |
| MinorAxisLength | 0 | 1 | 254.49 | 49.99 | 143.71 | 219.11 | 247.85 | 279.89 | 492.28 |
| Eccentricity | 0 | 1 | 0.78 | 0.09 | 0.35 | 0.74 | 0.80 | 0.84 | 0.96 |
| ConvexArea | 0 | 1 | 91186.09 | 40769.29 | 26139.00 | 61513.25 | 81651.00 | 108375.75 | 278217.00 |
| Extent | 0 | 1 | 0.70 | 0.05 | 0.38 | 0.67 | 0.71 | 0.73 | 0.84 |
| Perimeter | 0 | 1 | 1165.91 | 273.76 | 619.07 | 966.41 | 1119.51 | 1308.39 | 2697.75 |
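Berdasarkan hasil di atas, diketahui bahwa tidak terdapat missing value pada tiap kolom dataset, dengan 1 kolom bertipe character (Class) dan 7 kolom bertipe numeric. Kolom Class selanjutnya diubah menjadi factor agar dapat digunakan sebagai peubah respon klasifikasi.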
Raisin_Dataset$Class <- as.factor(Raisin_Dataset$Class)
glimpse(Raisin_Dataset)
## Rows: 900
## Columns: 8
## $ Area <dbl> 87524, 75166, 90856, 45928, 79408, 49242, 42492, 60952…
## $ MajorAxisLength <dbl> 442.2460, 406.6907, 442.2670, 286.5406, 352.1908, 318.…
## $ MinorAxisLength <dbl> 253.2912, 243.0324, 266.3283, 208.7600, 290.8275, 200.…
## $ Eccentricity <dbl> 0.8197384, 0.8018052, 0.7983536, 0.6849892, 0.5640113,…
## $ ConvexArea <dbl> 90546, 78789, 93717, 47336, 81463, 51368, 43904, 62329…
## $ Extent <dbl> 0.7586506, 0.6841296, 0.6376128, 0.6995994, 0.7927719,…
## $ Perimeter <dbl> 1184.040, 1121.786, 1208.575, 844.162, 1073.251, 881.8…
## $ Class <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …
# Build the binary classification task: `Class` is the target column and
# "Kecimen" is declared as the positive class.
task_raisin <- TaskClassif$new(
  id = "Raisin",
  backend = Raisin_Dataset,
  target = "Class",
  positive = "Kecimen"
)
Mendefinisikan objek task klasifikasi dengan Raisin_Dataset sebagai backend; target = “Class” dijadikan sebagai peubah respon (peubah kategorik dengan dua kelas), dan “Kecimen” ditetapkan sebagai kelas positif.
# create
learner <- lrn("classif.kknn")
learner
## <LearnerClassifKKNN:classif.kknn>
## * Model: -
## * Parameters: k=7
## * Packages: mlr3, mlr3learners, kknn
## * Predict Types: [response], prob
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: multiclass, twoclass
as.data.table(mlr_resamplings)
## key label params iters
## 1: bootstrap Bootstrap ratio,repeats 30
## 2: custom Custom Splits NA
## 3: custom_cv Custom Split Cross-Validation NA
## 4: cv Cross-Validation folds 10
## 5: holdout Holdout ratio 1
## 6: insample Insample Resampling 1
## 7: loo Leave-One-Out NA
## 8: repeated_cv Repeated Cross-Validation folds,repeats 100
## 9: subsampling Subsampling ratio,repeats 30
# Holdout resampling with the default ratio.
# Fix the RNG seed first so later random splitting is reproducible.
set.seed(123)
resampling_holdout <- rsmp("holdout")
print(resampling_holdout)
## <ResamplingHoldout>: Holdout
## * Iterations: 1
## * Instantiated: FALSE
## * Parameters: ratio=0.6667
Melakukan resampling dengan fungsi rsmp() menghasilkan iterasi 1 dengan parameters ratio sebesar 0.6667.
resample_holdout = mlr3::resample(task = task_raisin, learner = learner, resampling = resampling_holdout)
## INFO [17:12:28.921] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 1/1)
resample_holdout$aggregate(msr("classif.acc"))
## classif.acc
## 0.8333333
Fungsi resample() di atas melatih model pada data train dan mengujinya pada data test hasil pembagian holdout; nilai classif.acc yang diperoleh digunakan untuk mengevaluasi performa model pada data test.
resample_holdoutdt = as.data.table(resample_holdout)
resample_holdoutdt
## task learner resampling iteration
## 1: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingHoldout[20]> 1
## prediction
## 1: <PredictionClassif[20]>
Tabel di atas merupakan informasi dasar yang diperoleh dengan fungsi as.data.table(), yang menyimpan hasil resampling, termasuk prediksi pada data yang diuji, sehingga informasi tersebut dapat diakses kembali.
resample_holdoutdt$prediction
## [[1]]
## <PredictionClassif> for 300 observations:
## row_ids truth response
## 1 Kecimen Besni
## 2 Kecimen Kecimen
## 3 Kecimen Kecimen
## ---
## 894 Besni Besni
## 897 Besni Kecimen
## 898 Besni Besni
all.equal(resample_holdoutdt$prediction, resample_holdout$predictions())
## [1] TRUE
pred = resample_holdout$prediction()
pred
## <PredictionClassif> for 300 observations:
## row_ids truth response
## 1 Kecimen Besni
## 2 Kecimen Kecimen
## 3 Kecimen Kecimen
## ---
## 894 Besni Besni
## 897 Besni Kecimen
## 898 Besni Besni
pred$score(msr("classif.acc"))
## classif.acc
## 0.8333333
# Split the task 75/25, fit the learner on the training rows and tabulate
# its predictions on the held-out rows.
splits <- mlr3::partition(task_raisin, ratio = 0.75)
learner$train(task_raisin, row_ids = splits$train)
pred <- learner$predict(task_raisin, row_ids = splits$test)
pred$confusion
## truth
## response Kecimen Besni
## Kecimen 90 17
## Besni 22 95
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
## truth
## response Kecimen Besni
## Kecimen 90 17
## Besni 22 95
## acc : 0.8259; ce : 0.1741; dor : 22.8610; f1 : 0.8219
## fdr : 0.1589; fnr : 0.1964; fomr: 0.1880; fpr : 0.1518
## mcc : 0.6524; npv : 0.8120; ppv : 0.8411; tnr : 0.8482
## tpr : 0.8036
Dari hasil analisis, diperoleh akurasi (acc) sebesar 0.8259, artinya sekitar 82.59% data terklasifikasi dengan benar. Nilai Positive Predictive Value (ppv) sebesar 0.8411, menunjukkan proporsi prediksi positif yang memang benar positif. Nilai Negative Predictive Value (npv) sebesar 0.8120, menunjukkan proporsi prediksi negatif yang memang benar negatif.
standardize <- po("scale")
smote <- po("smote",dup_size=1)
Standardize menerapkan proses standardisasi dengan fungsi scale(), yang akan mengubah variabel numerik pada dataset menjadi memiliki mean nol dan variansi satu. Proses standardisasi ini sering dilakukan sebelum melakukan analisis statistik atau machine learning untuk memperbaiki distribusi data.
Smote menerapkan teknik oversampling SMOTE (Synthetic Minority Over-sampling Technique) dengan dup_size=1. Teknik SMOTE digunakan untuk menangani masalah kelas yang tidak seimbang (imbalanced class) dengan membuat sampel sintetis dari kelas minoritas. dup_size adalah parameter yang menentukan seberapa banyak sampel sintetis akan dibuat. Dalam kode tersebut, dup_size=1 berarti akan membuat jumlah sampel sintetis yang sama dengan jumlah sampel awal dari kelas minoritas.
standardize$train(list(task_raisin))[[1]]$data() %>% glimpse
## Rows: 900
## Columns: 8
## $ Class <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …
## $ Area <dbl> -0.007182375, -0.324037016, 0.078248898, -1.073688739,…
## $ ConvexArea <dbl> -0.01570030, -0.30407912, 0.06207883, -1.07556668, -0.…
## $ Eccentricity <dbl> 0.42290649, 0.22435165, 0.18613557, -1.06902826, -2.40…
## $ Extent <dbl> 1.106127610, -0.287616870, -1.157606092, 0.001710523, …
## $ MajorAxisLength <dbl> 0.09752272, -0.20889592, 0.09770402, -1.24435939, -0.6…
## $ MinorAxisLength <dbl> -0.02394487, -0.22916480, 0.23685627, -0.91476486, 0.7…
## $ Perimeter <dbl> 0.06623714, -0.16116284, 0.15585802, -1.17526141, -0.3…
smote$train(list(task_raisin))[[1]]$data() %>% count(Class)
## Class n
## 1: Kecimen 900
## 2: Besni 450
kknn_pipeline <- GraphLearner$new(standardize %>>% smote %>>% learner)
kknn_pipeline
## <GraphLearner:scale.smote.classif.kknn>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1, classif.kknn.k=7
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, kknn
## * Predict Types: [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
## POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
## importance, loglik, missings, multiclass, oob_error,
## selected_features, twoclass, weights
kknn_pipeline$train(task = task_raisin)
summary(kknn_pipeline$model$classif.kknn$model)
## Length Class Mode
## formula 3 formula call
## data 8 data.table list
## pv 1 -none- list
## kknn 0 -none- NULL
# Predict on the full task. The pipeline was trained on these same rows, so
# the score below is a training-set (resubstitution) accuracy, not a
# held-out test accuracy.
pred <- kknn_pipeline$predict(task = task_raisin)
# Score the predictions with classification accuracy
acc_measure <- msr("classif.acc")
perf <- pred$score(measures = acc_measure, task = task_raisin)
perf
## classif.acc
## 0.9322222
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
## truth
## response Kecimen Besni
## Kecimen 445 56
## Besni 5 394
## acc : 0.9322; ce : 0.0678; dor : 626.1786; f1 : 0.9359
## fdr : 0.1118; fnr : 0.0111; fomr: 0.0125; fpr : 0.1244
## mcc : 0.8701; npv : 0.9875; ppv : 0.8882; tnr : 0.8756
## tpr : 0.9889
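Nilai akurasi (acc) dari model yang diperoleh adalah sebesar 0.9322, yang berarti model berhasil memprediksi dengan benar sebesar 93.22% dari total data yang ada. Hal ini menunjukkan bahwa model memiliki kinerja yang sangat baik dalam melakukan klasifikasi pada dataset yang digunakan. Namun, karena prediksi dilakukan pada data yang sama dengan data yang digunakan untuk melatih model, nilai ini merupakan akurasi pada data latih dan cenderung lebih optimis dibandingkan akurasi pada data baru.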
# 10-fold cross-validation, stratified on the target column.
# The stratum role must be assigned BEFORE the resampling is instantiated:
# instantiate() draws the folds from the task as it is at that moment, so
# assigning the role afterwards leaves the already-drawn folds unstratified.
set.seed(321)
task_raisin$col_roles$stratum <- "Class"
resampling_cv <- rsmp("cv", folds = 10)
resampling_cv$instantiate(task_raisin)
print(resampling_cv)
## <ResamplingCV>: Cross-Validation
## * Iterations: 10
## * Instantiated: TRUE
## * Parameters: folds=10
Melakukan resampling dengan fungsi rsmp() menghasilkan iterasi 10 dengan parameters folds sebesar 10.
resample_cv = mlr3::resample(task_raisin, learner, resampling_cv)
## INFO [17:12:33.382] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 1/10)
## INFO [17:12:33.450] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 2/10)
## INFO [17:12:33.494] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 3/10)
## INFO [17:12:33.548] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 4/10)
## INFO [17:12:34.117] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 5/10)
## INFO [17:12:34.165] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 6/10)
## INFO [17:12:34.212] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 7/10)
## INFO [17:12:34.256] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 8/10)
## INFO [17:12:34.306] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 9/10)
## INFO [17:12:34.353] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 10/10)
resample_cv$aggregate(msr("classif.acc"))
## classif.acc
## 0.8477778
Dari hasil analisis prediksi, diperoleh akurasi klasifikasi sebesar 0.8477778.
print(resample_cv)
## <ResampleResult> of 10 iterations
## * Task: Raisin
## * Learner: classif.kknn
## * Warnings: 0 in 0 iterations
## * Errors: 0 in 0 iterations
as.data.table(resample_cv)
## task learner resampling iteration
## 1: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 1
## 2: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 2
## 3: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 3
## 4: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 4
## 5: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 5
## 6: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 6
## 7: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 7
## 8: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 8
## 9: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 9
## 10: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 10
## prediction
## 1: <PredictionClassif[20]>
## 2: <PredictionClassif[20]>
## 3: <PredictionClassif[20]>
## 4: <PredictionClassif[20]>
## 5: <PredictionClassif[20]>
## 6: <PredictionClassif[20]>
## 7: <PredictionClassif[20]>
## 8: <PredictionClassif[20]>
## 9: <PredictionClassif[20]>
## 10: <PredictionClassif[20]>
acc = resample_cv$score(msr("classif.acc"))
acc[, .(iteration, classif.acc)]
## iteration classif.acc
## 1: 1 0.8222222
## 2: 2 0.7888889
## 3: 3 0.8444444
## 4: 4 0.8444444
## 5: 5 0.8777778
## 6: 6 0.8333333
## 7: 7 0.9000000
## 8: 8 0.8222222
## 9: 9 0.8555556
## 10: 10 0.8888889
resample_cv$aggregate(msr("classif.acc"))
## classif.acc
## 0.8477778
Fungsi resample() di atas melatih dan menguji model pada setiap fold cross-validation; akurasi tiap fold kemudian dirata-ratakan untuk mengevaluasi performa model.
resample_cvdt = as.data.table(resample_cv)
resample_cvdt
## task learner resampling iteration
## 1: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 1
## 2: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 2
## 3: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 3
## 4: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 4
## 5: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 5
## 6: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 6
## 7: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 7
## 8: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 8
## 9: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 9
## 10: <TaskClassif[50]> <LearnerClassifKKNN[36]> <ResamplingCV[20]> 10
## prediction
## 1: <PredictionClassif[20]>
## 2: <PredictionClassif[20]>
## 3: <PredictionClassif[20]>
## 4: <PredictionClassif[20]>
## 5: <PredictionClassif[20]>
## 6: <PredictionClassif[20]>
## 7: <PredictionClassif[20]>
## 8: <PredictionClassif[20]>
## 9: <PredictionClassif[20]>
## 10: <PredictionClassif[20]>
Tabel di atas merupakan informasi dasar yang diperoleh dengan fungsi as.data.table(), yang menyimpan hasil resampling, termasuk prediksi pada data yang diuji, sehingga informasi tersebut dapat diakses kembali.
resample_cvdt$prediction
## [[1]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 15 Kecimen Kecimen
## 36 Kecimen Kecimen
## 42 Kecimen Besni
## ---
## 862 Besni Kecimen
## 882 Besni Besni
## 891 Besni Besni
##
## [[2]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 1 Kecimen Besni
## 19 Kecimen Kecimen
## 76 Kecimen Kecimen
## ---
## 874 Besni Besni
## 895 Besni Besni
## 898 Besni Besni
##
## [[3]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 2 Kecimen Kecimen
## 3 Kecimen Kecimen
## 29 Kecimen Kecimen
## ---
## 885 Besni Besni
## 886 Besni Kecimen
## 900 Besni Besni
##
## [[4]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 6 Kecimen Kecimen
## 22 Kecimen Kecimen
## 23 Kecimen Kecimen
## ---
## 858 Besni Besni
## 863 Besni Besni
## 887 Besni Besni
##
## [[5]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 8 Kecimen Kecimen
## 17 Kecimen Kecimen
## 27 Kecimen Kecimen
## ---
## 876 Besni Besni
## 897 Besni Kecimen
## 899 Besni Besni
##
## [[6]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 4 Kecimen Kecimen
## 31 Kecimen Besni
## 61 Kecimen Kecimen
## ---
## 868 Besni Kecimen
## 870 Besni Besni
## 881 Besni Besni
##
## [[7]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 10 Kecimen Kecimen
## 13 Kecimen Kecimen
## 43 Kecimen Kecimen
## ---
## 890 Besni Besni
## 893 Besni Besni
## 896 Besni Besni
##
## [[8]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 7 Kecimen Kecimen
## 9 Kecimen Kecimen
## 11 Kecimen Besni
## ---
## 883 Besni Besni
## 888 Besni Besni
## 889 Besni Kecimen
##
## [[9]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 14 Kecimen Kecimen
## 18 Kecimen Kecimen
## 34 Kecimen Kecimen
## ---
## 825 Besni Besni
## 830 Besni Besni
## 894 Besni Besni
##
## [[10]]
## <PredictionClassif> for 90 observations:
## row_ids truth response
## 5 Kecimen Kecimen
## 12 Kecimen Kecimen
## 16 Kecimen Kecimen
## ---
## 877 Besni Besni
## 880 Besni Besni
## 892 Besni Besni
all.equal(resample_cvdt$prediction, resample_cv$predictions())
## [1] TRUE
pred = resample_cv$prediction()
pred
## <PredictionClassif> for 900 observations:
## row_ids truth response
## 15 Kecimen Kecimen
## 36 Kecimen Kecimen
## 42 Kecimen Besni
## ---
## 877 Besni Besni
## 880 Besni Besni
## 892 Besni Besni
pred$score(msr("classif.acc"))
## classif.acc
## 0.8477778
Dari hasil analisis prediksi, diperoleh akurasi klasifikasi sebesar 0.8477778.
prop.table(table(task_raisin$data(rows = resampling_cv$test_set(1), cols = "Class")))
## Class
## Kecimen Besni
## 0.5111111 0.4888889
# Split the task 75/25, fit the learner on the training rows and tabulate
# its predictions on the held-out rows.
splits <- mlr3::partition(task_raisin, ratio = 0.75)
learner$train(task_raisin, row_ids = splits$train)
pred <- learner$predict(task_raisin, row_ids = splits$test)
pred$confusion
## truth
## response Kecimen Besni
## Kecimen 95 15
## Besni 17 97
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
## truth
## response Kecimen Besni
## Kecimen 95 15
## Besni 17 97
## acc : 0.8571; ce : 0.1429; dor : 36.1373; f1 : 0.8559
## fdr : 0.1364; fnr : 0.1518; fomr: 0.1491; fpr : 0.1339
## mcc : 0.7144; npv : 0.8509; ppv : 0.8636; tnr : 0.8661
## tpr : 0.8482
Dari hasil analisis, diperoleh akurasi (acc) sebesar 0.8571, artinya sekitar 85.71% data terklasifikasi dengan benar. Nilai Positive Predictive Value (ppv) sebesar 0.8636, menunjukkan seberapa banyak hasil positif yang sebenarnya benar. Nilai Negative Predictive Value (npv) sebesar 0.8509, menunjukkan seberapa banyak hasil negatif yang sebenarnya benar.
standardize <- po("scale")
smote <- po("smote",dup_size=1)
standardize$train(list(task_raisin))[[1]]$data() %>% glimpse
## Rows: 900
## Columns: 8
## $ Class <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …
## $ Area <dbl> -0.007182375, -0.324037016, 0.078248898, -1.073688739,…
## $ ConvexArea <dbl> -0.01570030, -0.30407912, 0.06207883, -1.07556668, -0.…
## $ Eccentricity <dbl> 0.42290649, 0.22435165, 0.18613557, -1.06902826, -2.40…
## $ Extent <dbl> 1.106127610, -0.287616870, -1.157606092, 0.001710523, …
## $ MajorAxisLength <dbl> 0.09752272, -0.20889592, 0.09770402, -1.24435939, -0.6…
## $ MinorAxisLength <dbl> -0.02394487, -0.22916480, 0.23685627, -0.91476486, 0.7…
## $ Perimeter <dbl> 0.06623714, -0.16116284, 0.15585802, -1.17526141, -0.3…
smote$train(list(task_raisin))[[1]]$data() %>% count(Class)
## Class n
## 1: Kecimen 900
## 2: Besni 450
kknn_pipeline <- GraphLearner$new(standardize %>>% smote %>>% learner)
kknn_pipeline
## <GraphLearner:scale.smote.classif.kknn>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1, classif.kknn.k=7
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, kknn
## * Predict Types: [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
## POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
## importance, loglik, missings, multiclass, oob_error,
## selected_features, twoclass, weights
kknn_pipeline$train(task = task_raisin)
summary(kknn_pipeline$model$classif.kknn$model)
## Length Class Mode
## formula 3 formula call
## data 8 data.table list
## pv 1 -none- list
## kknn 0 -none- NULL
# Predict on the full task. The pipeline was trained on these same rows, so
# the score below is a training-set (resubstitution) accuracy, not a
# held-out test accuracy.
pred <- kknn_pipeline$predict(task = task_raisin)
# Score the predictions with classification accuracy
acc_measure <- msr("classif.acc")
perf <- pred$score(measures = acc_measure, task = task_raisin)
perf
## classif.acc
## 0.93
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
## truth
## response Kecimen Besni
## Kecimen 447 60
## Besni 3 390
## acc : 0.9300; ce : 0.0700; dor : 968.5000; f1 : 0.9342
## fdr : 0.1183; fnr : 0.0067; fomr: 0.0076; fpr : 0.1333
## mcc : 0.8670; npv : 0.9924; ppv : 0.8817; tnr : 0.8667
## tpr : 0.9933
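Nilai akurasi (acc) dari model yang diperoleh adalah sebesar 0.9300, yang berarti model berhasil memprediksi dengan benar sebesar 93.00% dari total data yang ada. Hal ini menunjukkan bahwa model memiliki kinerja yang sangat baik dalam melakukan klasifikasi pada dataset yang digunakan. Namun, karena prediksi dilakukan pada data yang sama dengan data yang digunakan untuk melatih model, nilai ini merupakan akurasi pada data latih dan cenderung lebih optimis dibandingkan akurasi pada data baru.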
# Bootstrap resampling with default settings; seed fixed for reproducibility
set.seed(123)
resampling_bootstrap <- rsmp("bootstrap")
# Run the resampling of the learner on the task
rr <- mlr3::resample(
  task = task_raisin,
  learner = learner,
  resampling = resampling_bootstrap
)
## INFO [17:12:37.678] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 1/30)
## INFO [17:12:37.755] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 2/30)
## INFO [17:12:37.814] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 3/30)
## INFO [17:12:37.870] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 4/30)
## INFO [17:12:37.926] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 5/30)
## INFO [17:12:37.982] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 6/30)
## INFO [17:12:38.041] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 7/30)
## INFO [17:12:38.099] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 8/30)
## INFO [17:12:38.169] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 9/30)
## INFO [17:12:38.234] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 10/30)
## INFO [17:12:38.301] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 11/30)
## INFO [17:12:38.360] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 12/30)
## INFO [17:12:38.421] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 13/30)
## INFO [17:12:38.478] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 14/30)
## INFO [17:12:38.535] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 15/30)
## INFO [17:12:38.593] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 16/30)
## INFO [17:12:38.648] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 17/30)
## INFO [17:12:38.704] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 18/30)
## INFO [17:12:38.762] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 19/30)
## INFO [17:12:38.818] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 20/30)
## INFO [17:12:38.879] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 21/30)
## INFO [17:12:38.944] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 22/30)
## INFO [17:12:39.009] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 23/30)
## INFO [17:12:39.065] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 24/30)
## INFO [17:12:39.121] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 25/30)
## INFO [17:12:39.181] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 26/30)
## INFO [17:12:39.234] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 27/30)
## INFO [17:12:39.299] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 28/30)
## INFO [17:12:39.355] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 29/30)
## INFO [17:12:39.414] [mlr3] Applying learner 'classif.kknn' on task 'Raisin' (iter 30/30)
# Melihat hasil evaluasi
rr$aggregate(msr("classif.acc"))
## classif.acc
## 0.8321325
Berdasarkan hasil di atas, diketahui bahwa hasil dari evaluasi model memiliki akurasi sebesar 0.8321325.
pred = rr$prediction()
pred
## <PredictionClassif> for 9957 observations:
## row_ids truth response
## 1 Kecimen Besni
## 2 Kecimen Besni
## 3 Kecimen Besni
## ---
## 889 Besni Kecimen
## 892 Besni Besni
## 896 Besni Besni
pred$score(msr("classif.acc"))
## classif.acc
## 0.8319775
# Split the task 75/25, fit the learner on the training rows and tabulate
# its predictions on the held-out rows.
splits <- mlr3::partition(task_raisin, ratio = 0.75)
learner$train(task_raisin, row_ids = splits$train)
pred <- learner$predict(task_raisin, row_ids = splits$test)
pred$confusion
## truth
## response Kecimen Besni
## Kecimen 96 18
## Besni 16 94
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
## truth
## response Kecimen Besni
## Kecimen 96 18
## Besni 16 94
## acc : 0.8482; ce : 0.1518; dor : 31.3333; f1 : 0.8496
## fdr : 0.1579; fnr : 0.1429; fomr: 0.1455; fpr : 0.1607
## mcc : 0.6965; npv : 0.8545; ppv : 0.8421; tnr : 0.8393
## tpr : 0.8571
Dari hasil analisis, diperoleh akurasi (acc) sebesar 0.8482, artinya sekitar 84.82% data terklasifikasi dengan benar. Nilai Positive Predictive Value (ppv) sebesar 0.8421, menunjukkan proporsi prediksi positif yang memang benar positif. Nilai Negative Predictive Value (npv) sebesar 0.8545, menunjukkan proporsi prediksi negatif yang memang benar negatif.
standardize <- po("scale")
smote <- po("smote",dup_size=1)
standardize$train(list(task_raisin))[[1]]$data() %>% glimpse
## Rows: 900
## Columns: 8
## $ Class <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …
## $ Area <dbl> -0.007182375, -0.324037016, 0.078248898, -1.073688739,…
## $ ConvexArea <dbl> -0.01570030, -0.30407912, 0.06207883, -1.07556668, -0.…
## $ Eccentricity <dbl> 0.42290649, 0.22435165, 0.18613557, -1.06902826, -2.40…
## $ Extent <dbl> 1.106127610, -0.287616870, -1.157606092, 0.001710523, …
## $ MajorAxisLength <dbl> 0.09752272, -0.20889592, 0.09770402, -1.24435939, -0.6…
## $ MinorAxisLength <dbl> -0.02394487, -0.22916480, 0.23685627, -0.91476486, 0.7…
## $ Perimeter <dbl> 0.06623714, -0.16116284, 0.15585802, -1.17526141, -0.3…
smote$train(list(task_raisin))[[1]]$data() %>% count(Class)
## Class n
## 1: Kecimen 900
## 2: Besni 450
kknn_pipeline <- GraphLearner$new(standardize %>>% smote %>>% learner)
kknn_pipeline
## <GraphLearner:scale.smote.classif.kknn>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1, classif.kknn.k=7
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, kknn
## * Predict Types: [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
## POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
## importance, loglik, missings, multiclass, oob_error,
## selected_features, twoclass, weights
kknn_pipeline$train(task = task_raisin)
summary(kknn_pipeline$model$classif.kknn$model)
## Length Class Mode
## formula 3 formula call
## data 8 data.table list
## pv 1 -none- list
## kknn 0 -none- NULL
# Predict on the full task. The pipeline was trained on these same rows, so
# the score below is a training-set (resubstitution) accuracy, not a
# held-out test accuracy.
pred <- kknn_pipeline$predict(task = task_raisin)
# Score the predictions with classification accuracy
acc_measure <- msr("classif.acc")
perf <- pred$score(measures = acc_measure, task = task_raisin)
perf
## classif.acc
## 0.93
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
## truth
## response Kecimen Besni
## Kecimen 445 58
## Besni 5 392
## acc : 0.9300; ce : 0.0700; dor : 601.5172; f1 : 0.9339
## fdr : 0.1153; fnr : 0.0111; fomr: 0.0126; fpr : 0.1289
## mcc : 0.8660; npv : 0.9874; ppv : 0.8847; tnr : 0.8711
## tpr : 0.9889
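Nilai akurasi (acc) dari model yang diperoleh adalah sebesar 0.9300, yang berarti model berhasil memprediksi dengan benar sebesar 93.00% dari total data yang ada. Hal ini menunjukkan bahwa model memiliki kinerja yang sangat baik dalam melakukan klasifikasi pada dataset yang digunakan. Namun, karena prediksi dilakukan pada data yang sama dengan data yang digunakan untuk melatih model, nilai ini merupakan akurasi pada data latih dan cenderung lebih optimis dibandingkan akurasi pada data baru.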