library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.2.2
## Warning: package 'ggplot2' was built under R version 4.2.2
## Warning: package 'tibble' was built under R version 4.2.2
## Warning: package 'tidyr' was built under R version 4.2.2
## Warning: package 'readr' was built under R version 4.2.2
## Warning: package 'purrr' was built under R version 4.2.2
## Warning: package 'dplyr' was built under R version 4.2.2
## Warning: package 'stringr' was built under R version 4.2.2
## Warning: package 'forcats' was built under R version 4.2.2
## Warning: package 'lubridate' was built under R version 4.2.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the ]8;;http://conflicted.r-lib.org/conflicted package]8;; to force all conflicts to become errors
library(mlr3verse)
## Warning: package 'mlr3verse' was built under R version 4.2.2
## Loading required package: mlr3
## Warning: package 'mlr3' was built under R version 4.2.2
library(mlr3tuning)
## Loading required package: paradox
## Warning: package 'paradox' was built under R version 4.2.2
library(smotefamily)
## Warning: package 'smotefamily' was built under R version 4.2.2
library(readr)
audit <- read.csv("C:/Users/zdann!!!/Downloads/audit_risk.csv",stringsAsFactors = TRUE)
head(audit)
## Sector_score LOCATION_ID PARA_A Score_A Risk_A PARA_B Score_B Risk_B TOTAL
## 1 3.89 23 4.18 0.6 2.508 2.50 0.2 0.500 6.68
## 2 3.89 6 0.00 0.2 0.000 4.83 0.2 0.966 4.83
## 3 3.89 6 0.51 0.2 0.102 0.23 0.2 0.046 0.74
## 4 3.89 6 0.00 0.2 0.000 10.80 0.6 6.480 10.80
## 5 3.89 6 0.00 0.2 0.000 0.08 0.2 0.016 0.08
## 6 3.89 6 0.00 0.2 0.000 0.83 0.2 0.166 0.83
## numbers Score_B.1 Risk_C Money_Value Score_MV Risk_D District_Loss PROB
## 1 5 0.2 1.0 3.38 0.2 0.676 2 0.2
## 2 5 0.2 1.0 0.94 0.2 0.188 2 0.2
## 3 5 0.2 1.0 0.00 0.2 0.000 2 0.2
## 4 6 0.6 3.6 11.75 0.6 7.050 2 0.2
## 5 5 0.2 1.0 0.00 0.2 0.000 2 0.2
## 6 5 0.2 1.0 2.95 0.2 0.590 2 0.2
## RiSk_E History Prob Risk_F Score Inherent_Risk CONTROL_RISK Detection_Risk
## 1 0.4 0 0.2 0 2.4 8.574 0.4 0.5
## 2 0.4 0 0.2 0 2.0 2.554 0.4 0.5
## 3 0.4 0 0.2 0 2.0 1.548 0.4 0.5
## 4 0.4 0 0.2 0 4.4 17.530 0.4 0.5
## 5 0.4 0 0.2 0 2.0 1.416 0.4 0.5
## 6 0.4 0 0.2 0 2.0 2.156 0.4 0.5
## Audit_Risk Risk
## 1 1.7148 1
## 2 0.5108 0
## 3 0.3096 0
## 4 3.5060 1
## 5 0.2832 0
## 6 0.4312 0
summary(audit)
## Sector_score LOCATION_ID PARA_A Score_A
## Min. : 1.85 8 : 76 Min. : 0.000 Min. :0.2000
## 1st Qu.: 2.37 19 : 68 1st Qu.: 0.210 1st Qu.:0.2000
## Median : 3.89 9 : 53 Median : 0.875 Median :0.2000
## Mean :20.18 16 : 52 Mean : 2.450 Mean :0.3513
## 3rd Qu.:55.57 12 : 47 3rd Qu.: 2.480 3rd Qu.:0.6000
## Max. :59.85 5 : 44 Max. :85.000 Max. :0.6000
## (Other):436
## Risk_A PARA_B Score_B Risk_B
## Min. : 0.000 Min. : 0.000 Min. :0.2000 Min. : 0.000
## 1st Qu.: 0.042 1st Qu.: 0.000 1st Qu.:0.2000 1st Qu.: 0.000
## Median : 0.175 Median : 0.405 Median :0.2000 Median : 0.081
## Mean : 1.351 Mean : 10.800 Mean :0.3131 Mean : 6.334
## 3rd Qu.: 1.488 3rd Qu.: 4.160 3rd Qu.:0.4000 3rd Qu.: 1.841
## Max. :51.000 Max. :1264.630 Max. :0.6000 Max. :758.778
##
## TOTAL numbers Score_B.1 Risk_C
## Min. : 0.0000 Min. :5.000 Min. :0.2000 Min. :1.000
## 1st Qu.: 0.5375 1st Qu.:5.000 1st Qu.:0.2000 1st Qu.:1.000
## Median : 1.3700 Median :5.000 Median :0.2000 Median :1.000
## Mean : 13.2185 Mean :5.068 Mean :0.2237 Mean :1.153
## 3rd Qu.: 7.7075 3rd Qu.:5.000 3rd Qu.:0.2000 3rd Qu.:1.000
## Max. :1268.9100 Max. :9.000 Max. :0.6000 Max. :5.400
##
## Money_Value Score_MV Risk_D District_Loss
## Min. : 0.000 Min. :0.200 Min. : 0.000 Min. :2.000
## 1st Qu.: 0.000 1st Qu.:0.200 1st Qu.: 0.000 1st Qu.:2.000
## Median : 0.090 Median :0.200 Median : 0.018 Median :2.000
## Mean : 14.138 Mean :0.291 Mean : 8.265 Mean :2.505
## 3rd Qu.: 5.595 3rd Qu.:0.400 3rd Qu.: 2.235 3rd Qu.:2.000
## Max. :935.030 Max. :0.600 Max. :561.018 Max. :6.000
## NA's :1
## PROB RiSk_E History Prob
## Min. :0.2000 Min. :0.4000 Min. :0.0000 Min. :0.2000
## 1st Qu.:0.2000 1st Qu.:0.4000 1st Qu.:0.0000 1st Qu.:0.2000
## Median :0.2000 Median :0.4000 Median :0.0000 Median :0.2000
## Mean :0.2062 Mean :0.5191 Mean :0.1044 Mean :0.2168
## 3rd Qu.:0.2000 3rd Qu.:0.4000 3rd Qu.:0.0000 3rd Qu.:0.2000
## Max. :0.6000 Max. :2.4000 Max. :9.0000 Max. :0.6000
##
## Risk_F Score Inherent_Risk CONTROL_RISK
## Min. :0.00000 Min. :2.000 Min. : 1.400 Min. :0.4000
## 1st Qu.:0.00000 1st Qu.:2.000 1st Qu.: 1.583 1st Qu.:0.4000
## Median :0.00000 Median :2.400 Median : 2.214 Median :0.4000
## Mean :0.05361 Mean :2.703 Mean : 17.681 Mean :0.5727
## 3rd Qu.:0.00000 3rd Qu.:3.250 3rd Qu.: 10.664 3rd Qu.:0.4000
## Max. :5.40000 Max. :5.200 Max. :801.262 Max. :5.8000
##
## Detection_Risk Audit_Risk Risk
## Min. :0.5 Min. : 0.2800 Min. :0.000
## 1st Qu.:0.5 1st Qu.: 0.3167 1st Qu.:0.000
## Median :0.5 Median : 0.5556 Median :0.000
## Mean :0.5 Mean : 7.1682 Mean :0.393
## 3rd Qu.:0.5 3rd Qu.: 3.2499 3rd Qu.:1.000
## Max. :0.5 Max. :961.5144 Max. :1.000
##
Untuk memulai, langkah awal adalah mengimpor dataset dan memverifikasi apakah ada variabel yang bersifat faktor. Setelah itu, ubah kolom Risk menjadi variabel faktor dan tampilkan datanya.
Langkah selanjutnya adalah menghapus kolom Money_Value dan LOCATION_ID yang merupakan variabel karakteristik pada data audit.
audit$Risk=as.factor(audit$Risk)
audit <- audit %>% mutate(across(where(is.integer), as.numeric))
audit <- audit %>% select(-Money_Value , -LOCATION_ID) %>%
na.omit()
glimpse(audit)
## Rows: 776
## Columns: 25
## $ Sector_score <dbl> 3.89, 3.89, 3.89, 3.89, 3.89, 3.89, 3.89, 3.89, 3.89, 3…
## $ PARA_A <dbl> 4.18, 0.00, 0.51, 0.00, 0.00, 0.00, 1.10, 8.50, 8.40, 3…
## $ Score_A <dbl> 0.6, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.6, 0.6, 0.6, 0.6, …
## $ Risk_A <dbl> 2.508, 0.000, 0.102, 0.000, 0.000, 0.000, 0.440, 5.100,…
## $ PARA_B <dbl> 2.50, 4.83, 0.23, 10.80, 0.08, 0.83, 7.41, 12.03, 11.05…
## $ Score_B <dbl> 0.2, 0.2, 0.2, 0.6, 0.2, 0.2, 0.4, 0.6, 0.6, 0.2, 0.6, …
## $ Risk_B <dbl> 0.500, 0.966, 0.046, 6.480, 0.016, 0.166, 2.964, 7.218,…
## $ TOTAL <dbl> 6.68, 4.83, 0.74, 10.80, 0.08, 0.83, 8.51, 20.53, 19.45…
## $ numbers <dbl> 5.0, 5.0, 5.0, 6.0, 5.0, 5.0, 5.0, 5.5, 5.5, 5.0, 5.0, …
## $ Score_B.1 <dbl> 0.2, 0.2, 0.2, 0.6, 0.2, 0.2, 0.2, 0.4, 0.4, 0.2, 0.2, …
## $ Risk_C <dbl> 1.0, 1.0, 1.0, 3.6, 1.0, 1.0, 1.0, 2.2, 2.2, 1.0, 1.0, …
## $ Score_MV <dbl> 0.2, 0.2, 0.2, 0.6, 0.2, 0.2, 0.6, 0.4, 0.4, 0.2, 0.2, …
## $ Risk_D <dbl> 0.6760, 0.1880, 0.0000, 7.0500, 0.0000, 0.5900, 26.9700…
## $ District_Loss <dbl> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2…
## $ PROB <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, …
## $ RiSk_E <dbl> 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, …
## $ History <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0…
## $ Prob <dbl> 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, …
## $ Risk_F <dbl> 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, …
## $ Score <dbl> 2.4, 2.0, 2.0, 4.4, 2.0, 2.0, 3.2, 4.2, 4.2, 2.4, 3.6, …
## $ Inherent_Risk <dbl> 8.5740, 2.5540, 1.5480, 17.5300, 1.4160, 2.1560, 31.774…
## $ CONTROL_RISK <dbl> 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, …
## $ Detection_Risk <dbl> 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, …
## $ Audit_Risk <dbl> 1.71480, 0.51080, 0.30960, 3.50600, 0.28320, 0.43120, 6…
## $ Risk <fct> 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0…
library(skimr)
## Warning: package 'skimr' was built under R version 4.2.2
##
## Attaching package: 'skimr'
## The following object is masked from 'package:mlr3':
##
## partition
skim_without_charts(audit)
| Name | audit |
| Number of rows | 776 |
| Number of columns | 25 |
| _______________________ | |
| Column type frequency: | |
| factor | 1 |
| numeric | 24 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Risk | 0 | 1 | FALSE | 2 | 0: 471, 1: 305 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| Sector_score | 0 | 1 | 20.18 | 24.32 | 1.85 | 2.37 | 3.89 | 55.57 | 59.85 |
| PARA_A | 0 | 1 | 2.45 | 5.68 | 0.00 | 0.21 | 0.88 | 2.48 | 85.00 |
| Score_A | 0 | 1 | 0.35 | 0.17 | 0.20 | 0.20 | 0.20 | 0.60 | 0.60 |
| Risk_A | 0 | 1 | 1.35 | 3.44 | 0.00 | 0.04 | 0.17 | 1.49 | 51.00 |
| PARA_B | 0 | 1 | 10.80 | 50.08 | 0.00 | 0.00 | 0.41 | 4.16 | 1264.63 |
| Score_B | 0 | 1 | 0.31 | 0.17 | 0.20 | 0.20 | 0.20 | 0.40 | 0.60 |
| Risk_B | 0 | 1 | 6.33 | 30.07 | 0.00 | 0.00 | 0.08 | 1.84 | 758.78 |
| TOTAL | 0 | 1 | 13.22 | 51.31 | 0.00 | 0.54 | 1.37 | 7.71 | 1268.91 |
| numbers | 0 | 1 | 5.07 | 0.26 | 5.00 | 5.00 | 5.00 | 5.00 | 9.00 |
| Score_B.1 | 0 | 1 | 0.22 | 0.08 | 0.20 | 0.20 | 0.20 | 0.20 | 0.60 |
| Risk_C | 0 | 1 | 1.15 | 0.54 | 1.00 | 1.00 | 1.00 | 1.00 | 5.40 |
| Score_MV | 0 | 1 | 0.29 | 0.16 | 0.20 | 0.20 | 0.20 | 0.40 | 0.60 |
| Risk_D | 0 | 1 | 8.27 | 39.97 | 0.00 | 0.00 | 0.02 | 2.24 | 561.02 |
| District_Loss | 0 | 1 | 2.51 | 1.23 | 2.00 | 2.00 | 2.00 | 2.00 | 6.00 |
| PROB | 0 | 1 | 0.21 | 0.04 | 0.20 | 0.20 | 0.20 | 0.20 | 0.60 |
| RiSk_E | 0 | 1 | 0.52 | 0.29 | 0.40 | 0.40 | 0.40 | 0.40 | 2.40 |
| History | 0 | 1 | 0.10 | 0.53 | 0.00 | 0.00 | 0.00 | 0.00 | 9.00 |
| Prob | 0 | 1 | 0.22 | 0.07 | 0.20 | 0.20 | 0.20 | 0.20 | 0.60 |
| Risk_F | 0 | 1 | 0.05 | 0.31 | 0.00 | 0.00 | 0.00 | 0.00 | 5.40 |
| Score | 0 | 1 | 2.70 | 0.86 | 2.00 | 2.00 | 2.40 | 3.25 | 5.20 |
| Inherent_Risk | 0 | 1 | 17.68 | 54.74 | 1.40 | 1.58 | 2.21 | 10.66 | 801.26 |
| CONTROL_RISK | 0 | 1 | 0.57 | 0.44 | 0.40 | 0.40 | 0.40 | 0.40 | 5.80 |
| Detection_Risk | 0 | 1 | 0.50 | 0.00 | 0.50 | 0.50 | 0.50 | 0.50 | 0.50 |
| Audit_Risk | 0 | 1 | 7.17 | 38.67 | 0.28 | 0.32 | 0.56 | 3.25 | 961.51 |
task_audit = TaskClassif$new(id="audit",
backend = audit,
target = "Risk",positive ="1")
cara manual untuk membagi data dalam objek Tugas ke dalam satu set pelatihan (untuk melatih model) dan satu set pengujian (untuk mengevaluasi performa generalisasi) disebut metode holdout. Sebagai pengenalan cepat untuk resampling dan benchmarking menggunakan paket mlr3, kami akan menunjukkan contoh singkat menggunakan fungsi praktis resample() dan benchmark().
Argumen utama dalam fungsi TaskClassif$new adalah id, yang merupakan nama tugas (dapat diisi dengan nama apa pun), backend merupakan data yang ingin dimodelkan, dengan catatan bahwa variabel respons harus berupa variabel numerik, dan target adalah nama kolom yang akan dijadikan sebagai variabel respons.
learner1 = lrn("classif.log_reg", predict_type = "prob")
learner1
## <LearnerClassifLogReg:classif.log_reg>
## * Model: -
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, character, factor, ordered
## * Properties: loglik, twoclass
Pada fungsi learner 1 kita akan mengklasifikasikan regresi linier pada data
learner2 = lrn("classif.lda", predict_type = "prob")
learner2
## <LearnerClassifLDA:classif.lda>
## * Model: -
## * Parameters: list()
## * Packages: mlr3, mlr3learners, MASS
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: multiclass, twoclass, weights
Berdasarkan output diatas argumen-argumen yang dapat digunakan dalam classif.log_reg dan classif.lda ada pada kolom id. Selanjutnya, kolom class menunjukkan tipe data argumen. Kolom lower, upper dan levels adalah isi ataupun nilai dari argumen. Informasi tersebut dapat digunakan untuk melakukan tuning hyperparameter.
#memeriksa pengukuran dari klasifikasi ini dapat digunakan perintah berikut:
msr_tbl = as.data.table(mlr_measures)
msr_tbl[1:5, .(key, label, task_type)]
## key label task_type
## 1: aic Akaika Information Criterion <NA>
## 2: bic Bayesian Information Criterion <NA>
## 3: classif.acc Classification Accuracy classif
## 4: classif.auc Area Under the ROC Curve classif
## 5: classif.bacc Balanced Accuracy classif
msr_tbl[1:5, .(key, packages, predict_type, task_properties)]
## key packages predict_type task_properties
## 1: aic mlr3 response
## 2: bic mlr3 response
## 3: classif.acc mlr3,mlr3measures response
## 4: classif.auc mlr3,mlr3measures prob twoclass
## 5: classif.bacc mlr3,mlr3measures response
as.data.table(lrn("classif.log_reg")$param_set)
## id class lower upper levels nlevels is_bounded
## 1: dispersion ParamUty NA NA Inf FALSE
## 2: epsilon ParamDbl -Inf Inf Inf FALSE
## 3: etastart ParamUty NA NA Inf FALSE
## 4: maxit ParamDbl -Inf Inf Inf FALSE
## 5: model ParamLgl NA NA TRUE,FALSE 2 TRUE
## 6: mustart ParamUty NA NA Inf FALSE
## 7: offset ParamUty NA NA Inf FALSE
## 8: singular.ok ParamLgl NA NA TRUE,FALSE 2 TRUE
## 9: start ParamUty NA NA Inf FALSE
## 10: trace ParamLgl NA NA TRUE,FALSE 2 TRUE
## 11: x ParamLgl NA NA TRUE,FALSE 2 TRUE
## 12: y ParamLgl NA NA TRUE,FALSE 2 TRUE
## special_vals default storage_type tags
## 1: <list[0]> list predict
## 2: <list[0]> 1e-08 numeric train,control
## 3: <list[0]> <NoDefault[3]> list train
## 4: <list[0]> 25 numeric train,control
## 5: <list[0]> TRUE logical train
## 6: <list[0]> <NoDefault[3]> list train
## 7: <list[0]> <NoDefault[3]> list train
## 8: <list[0]> TRUE logical train
## 9: <list[0]> list train
## 10: <list[0]> FALSE logical train,control
## 11: <list[0]> FALSE logical train
## 12: <list[0]> TRUE logical train
as.data.table(lrn("classif.lda")$param_set)
## id class lower upper levels nlevels
## 1: dimen ParamUty NA NA Inf
## 2: method ParamFct NA NA moment,mle,mve,t 4
## 3: nu ParamInt -Inf Inf Inf
## 4: predict.method ParamFct NA NA plug-in,predictive,debiased 3
## 5: predict.prior ParamUty NA NA Inf
## 6: prior ParamUty NA NA Inf
## 7: tol ParamDbl -Inf Inf Inf
## is_bounded special_vals default storage_type tags
## 1: FALSE <list[0]> <NoDefault[3]> list predict
## 2: TRUE <list[0]> moment character train
## 3: FALSE <list[0]> <NoDefault[3]> integer train
## 4: TRUE <list[0]> plug-in character predict
## 5: FALSE <list[0]> <NoDefault[3]> list predict
## 6: FALSE <list[0]> <NoDefault[3]> list train
## 7: FALSE <list[0]> <NoDefault[3]> numeric train
Code di atas berfungsi untuk memeriksa pengukuran pada klasifikasi yangg terlah dilakukan oleh fungsi learner
resampling = rsmp("holdout")
rr = resample(task = task_audit, learner = learner1, resampling = resampling)
## INFO [23:24:30.666] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 1/1)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
rr$aggregate(msr("classif.acc"))
## classif.acc
## 0.965251
Kode di atas adalah contoh dari penggunaan holdout (specified using rsmp("holdout")) untuk LDA
lrns = c(learner1, lrn("classif.featureless"))
d = benchmark_grid(task = task_audit, learners = lrns, resampling = resampling)
bmr = benchmark(design = d)
## INFO [23:24:30.948] [mlr3] Running benchmark with 2 resampling iterations
## INFO [23:24:30.967] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 1/1)
## INFO [23:24:31.041] [mlr3] Applying learner 'classif.featureless' on task 'audit' (iter 1/1)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## INFO [23:24:31.064] [mlr3] Finished benchmark
The benchmark() function juga menggunakan resample() function yang digunakan untuk mengestimasi performa berdasarkan strategi resampling.
acc = bmr$aggregate(msr("classif.acc"))
acc[, .(task_id, learner_id, classif.acc)]
## task_id learner_id classif.acc
## 1: audit classif.log_reg 0.9150579
## 2: audit classif.featureless 0.6370656
as.data.table(mlr_resamplings)
## key label params iters
## 1: bootstrap Bootstrap ratio,repeats 30
## 2: custom Custom Splits NA
## 3: custom_cv Custom Split Cross-Validation NA
## 4: cv Cross-Validation folds 10
## 5: holdout Holdout ratio 1
## 6: insample Insample Resampling 1
## 7: loo Leave-One-Out NA
## 8: repeated_cv Repeated Cross-Validation folds,repeats 100
## 9: subsampling Subsampling ratio,repeats 30
resampling = rsmp("holdout")
print(resampling)
## <ResamplingHoldout>: Holdout
## * Iterations: 1
## * Instantiated: FALSE
## * Parameters: ratio=0.6667
Holdout metode akan menggunakan 2/3 data sebagai data training dan 1/3 sebagai data test. Itu dapat mengatur agar lebih spesifik lagi parameter rasui untuk holdout dengan meng-update rasio.
resampling = rsmp("holdout", ratio = 0.8)
resampling$param_set$values = list(ratio = 0.5)
resampling = rsmp("cv", folds = 10)
Pada kode di atas merupakan contoh dari membangun objek resampling yang digunakan untuk holdout dengan berbagai size split
resampling = rsmp("holdout", ratio = 0.8)
resampling$instantiate(task_audit)
train_ids = resampling$train_set(1)
test_ids = resampling$test_set(1)
str(train_ids)
## int [1:621] 1 2 3 4 6 7 8 9 11 12 ...
str(test_ids)
## int [1:155] 5 10 14 17 36 37 63 66 67 70 ...
Melakukan pemanggilan metode \$instantiate() dari objek Resampling yang dibangun sebelumnya pada Task untuk menghasilkan pemisahan uji train untuk task tertentu.
resampling = rsmp("cv", folds = 4)
rr = resample(task_audit, learner1, resampling)
## INFO [23:24:31.608] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 1/4)
## INFO [23:24:31.697] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 2/4)
## INFO [23:24:31.778] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 3/4)
## INFO [23:24:31.879] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 4/4)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
print(rr)
## <ResampleResult> of 4 iterations
## * Task: audit
## * Learner: classif.log_reg
## * Warnings: 0 in 0 iterations
## * Errors: 0 in 0 iterations
as.data.table(rr)
## task learner resampling iteration
## 1: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 1
## 2: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 2
## 3: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 3
## 4: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 4
## prediction
## 1: <PredictionClassif[20]>
## 2: <PredictionClassif[20]>
## 3: <PredictionClassif[20]>
## 4: <PredictionClassif[20]>
Dalam contoh kode di bawah ini, secara langsung itu menggunakan akurasi klasifikasi (classif.acc) sebagai ukuran kinerja dan meneruskan ke metode \$score() untuk mendapatkan perkiraan kinerja setiap iterasi resampling secara terpisah.
acc = rr$score(msr("classif.acc"))
acc[, .(iteration, classif.acc)]
## iteration classif.acc
## 1: 1 0.9948454
## 2: 2 0.9381443
## 3: 3 0.9690722
## 4: 4 0.9536082
rr$aggregate(msr("classif.acc"))
## classif.acc
## 0.9639175
rr$aggregate(msr("classif.acc", average = "micro"))
## classif.acc
## 0.9639175
rrdt = as.data.table(rr)
rrdt
## task learner resampling iteration
## 1: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 1
## 2: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 2
## 3: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 3
## 4: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 4
## prediction
## 1: <PredictionClassif[20]>
## 2: <PredictionClassif[20]>
## 3: <PredictionClassif[20]>
## 4: <PredictionClassif[20]>
rrdt$prediction
## [[1]]
## <PredictionClassif> for 194 observations:
## row_ids truth response prob.1 prob.0
## 5 0 0 2.220446e-16 1
## 6 0 0 2.220446e-16 1
## 10 0 0 2.220446e-16 1
## ---
## 770 0 0 2.220446e-16 1
## 775 0 0 2.220446e-16 1
## 776 0 0 2.220446e-16 1
##
## [[2]]
## <PredictionClassif> for 194 observations:
## row_ids truth response prob.1 prob.0
## 4 1 1 1.000000e+00 2.220446e-16
## 9 1 1 1.000000e+00 2.220446e-16
## 24 0 0 2.220446e-16 1.000000e+00
## ---
## 755 0 0 2.220446e-16 1.000000e+00
## 757 0 0 2.220446e-16 1.000000e+00
## 759 0 0 2.220446e-16 1.000000e+00
##
## [[3]]
## <PredictionClassif> for 194 observations:
## row_ids truth response prob.1 prob.0
## 1 1 1 9.999454e-01 5.464923e-05
## 7 1 1 1.000000e+00 2.220446e-16
## 16 1 1 1.000000e+00 2.220446e-16
## ---
## 769 0 0 2.220446e-16 1.000000e+00
## 771 0 0 2.220446e-16 1.000000e+00
## 773 0 0 2.220446e-16 1.000000e+00
##
## [[4]]
## <PredictionClassif> for 194 observations:
## row_ids truth response prob.1 prob.0
## 2 0 0 2.220446e-16 1.000000e+00
## 3 0 0 2.220446e-16 1.000000e+00
## 8 1 1 1.000000e+00 2.220446e-16
## ---
## 768 0 0 2.220446e-16 1.000000e+00
## 772 0 0 2.220446e-16 1.000000e+00
## 774 0 0 2.220446e-16 1.000000e+00
all.equal(rrdt$prediction, rr$predictions())
## [1] TRUE
pred = rr$prediction()
pred
## <PredictionClassif> for 776 observations:
## row_ids truth response prob.1 prob.0
## 5 0 0 2.220446e-16 1
## 6 0 0 2.220446e-16 1
## 10 0 0 2.220446e-16 1
## ---
## 768 0 0 2.220446e-16 1
## 772 0 0 2.220446e-16 1
## 774 0 0 2.220446e-16 1
pred$score(msr("classif.acc"))
## classif.acc
## 0.9639175
Peran kolom lain yang tersedia di mlr3 adalah “strata”, yang menerapkan pengambilan sampel bertingkat. Stratified sampling memastikan bahwa satu atau lebih fitur diskrit dalam set pelatihan dan pengujian memiliki distribusi yang sama seperti pada tugas awal yang berisi semua pengamatan. Ini sangat berguna ketika fitur diskrit sangat tidak seimbang dan juga ingin memastikan bahwa distribusi fitur tersebut sama pada di setiap iterasi resampling. Stratifikasi umumnya digunakan dalam tugas klasifikasi yang tidak seimbang di mana kelas fitur target tidak seimbang.
Pada contoh di bawah ini akan menerapkam stratum pada dataset audit - Sebelum dilakukan stratum:
prop.table(table(task_audit$data(cols = "Risk")))
## Risk
## 1 0
## 0.3930412 0.6069588
r = rsmp("cv", folds = 3)
r$instantiate(task_audit)
prop.table(table(task_audit$data(rows = r$test_set(1), cols = "Risk")))
## Risk
## 1 0
## 0.3899614 0.6100386
prop.table(table(task_audit$data(rows = r$test_set(2), cols = "Risk")))
## Risk
## 1 0
## 0.4092664 0.5907336
prop.table(table(task_audit$data(rows = r$test_set(3), cols = "Risk")))
## Risk
## 1 0
## 0.379845 0.620155
task_audit$col_roles$stratum = "Risk"
r = rsmp("cv", folds = 3)
r$instantiate(task_audit)
prop.table(table(task_audit$data(rows = r$test_set(1), cols = "Risk")))
## Risk
## 1 0
## 0.3938224 0.6061776
prop.table(table(task_audit$data(rows = r$test_set(2), cols = "Risk")))
## Risk
## 1 0
## 0.3938224 0.6061776
prop.table(table(task_audit$data(rows = r$test_set(3), cols = "Risk")))
## Risk
## 1 0
## 0.3914729 0.6085271
set.seed(1110)
audit_split = sort(sample(nrow(audit), nrow(audit)*0.8)) ## 80% of the dataset randomly selected
train<-audit[audit_split,]
test<-audit[-audit_split,]
audit_logit <- glm(Risk~. ,data = train,family = "binomial"(link="logit"))
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(audit_logit)
##
## Call:
## glm(formula = Risk ~ ., family = binomial(link = "logit"), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -8.49 0.00 0.00 0.00 8.49
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.112e+16 4.770e+08 23308916 <2e-16 ***
## Sector_score -1.271e+13 1.285e+05 -98941235 <2e-16 ***
## PARA_A 8.066e+13 2.241e+07 3599137 <2e-16 ***
## Score_A 1.972e+15 2.310e+07 85359709 <2e-16 ***
## Risk_A -4.627e+14 2.926e+07 -15817104 <2e-16 ***
## PARA_B 8.813e+14 1.912e+07 46089080 <2e-16 ***
## Score_B 4.766e+15 2.406e+07 198100013 <2e-16 ***
## Risk_B -1.986e+15 2.084e+07 -95297301 <2e-16 ***
## TOTAL -4.413e+14 1.796e+07 -24566506 <2e-16 ***
## numbers -3.080e+15 9.730e+07 -31658324 <2e-16 ***
## Score_B.1 -2.313e+16 7.068e+08 -32727009 <2e-16 ***
## Risk_C 3.088e+15 1.493e+08 20691876 <2e-16 ***
## Score_MV 7.727e+15 2.490e+07 310340513 <2e-16 ***
## Risk_D -1.258e+15 1.947e+07 -64597545 <2e-16 ***
## District_Loss 1.476e+14 1.029e+07 14347146 <2e-16 ***
## PROB -6.591e+15 1.470e+08 -44823237 <2e-16 ***
## RiSk_E 1.720e+15 5.202e+07 33067377 <2e-16 ***
## History -3.771e+15 5.977e+07 -63091848 <2e-16 ***
## Prob -1.654e+15 1.009e+08 -16393926 <2e-16 ***
## Risk_F 5.685e+15 9.612e+07 59137398 <2e-16 ***
## Score NA NA NA NA
## Inherent_Risk 1.258e+15 1.947e+07 64624335 <2e-16 ***
## CONTROL_RISK NA NA NA NA
## Detection_Risk NA NA NA NA
## Audit_Risk -9.652e+12 2.759e+05 -34990469 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 832.04 on 619 degrees of freedom
## Residual deviance: 865.05 on 598 degrees of freedom
## AIC: 909.05
##
## Number of Fisher Scoring iterations: 22
r = rsmp("cv", folds = 30)
rr = resample(task_audit, learner1, r)
## INFO [23:24:33.295] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 1/30)
## INFO [23:24:33.382] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 2/30)
## INFO [23:24:33.481] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 3/30)
## INFO [23:24:33.583] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 4/30)
## INFO [23:24:33.664] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 5/30)
## INFO [23:24:33.751] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 6/30)
## INFO [23:24:33.845] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 7/30)
## INFO [23:24:33.940] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 8/30)
## INFO [23:24:34.042] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 9/30)
## INFO [23:24:34.131] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 10/30)
## INFO [23:24:34.243] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 11/30)
## INFO [23:24:34.330] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 12/30)
## INFO [23:24:34.429] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 13/30)
## INFO [23:24:34.524] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 14/30)
## INFO [23:24:34.615] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 15/30)
## INFO [23:24:34.713] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 16/30)
## INFO [23:24:34.805] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 17/30)
## INFO [23:24:34.900] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 18/30)
## INFO [23:24:35.003] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 19/30)
## INFO [23:24:35.091] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 20/30)
## INFO [23:24:35.201] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 21/30)
## INFO [23:24:35.308] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 22/30)
## INFO [23:24:35.415] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 23/30)
## INFO [23:24:35.508] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 24/30)
## INFO [23:24:35.604] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 25/30)
## INFO [23:24:35.702] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 26/30)
## INFO [23:24:35.802] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 27/30)
## INFO [23:24:35.925] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 28/30)
## INFO [23:24:36.022] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 29/30)
## INFO [23:24:36.128] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 30/30)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
autoplot(rr, type = "histogram", bins =30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
task = task_audit$select(c("Risk_A", "Audit_Risk"))
resampling = rsmp("cv", folds = 3)
object = resample(task, learner1, resampling, store_models = TRUE)
## INFO [23:24:37.620] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 1/3)
## INFO [23:24:37.692] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 2/3)
## INFO [23:24:37.758] [mlr3] Applying learner 'classif.log_reg' on task 'audit' (iter 3/3)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
autoplot(object, type = "prediction")
Pada Package mlr3measures memungkinkan untuk menghitung tambahan beberapa pengukuran yang berbasis confusion matrix yang pada umum menggunakan fungsi Confusion_matrix.
mlr3measures::confusion_matrix(truth = pred$truth,
response = pred$response, positive = task_audit$positive)
## truth
## response 1 0
## 1 283 6
## 0 22 465
## acc : 0.9639; ce : 0.0361; dor : 996.9318; f1 : 0.9529
## fdr : 0.0208; fnr : 0.0721; fomr: 0.0452; fpr : 0.0127
## mcc : 0.9245; npv : 0.9548; ppv : 0.9792; tnr : 0.9873
## tpr : 0.9279
pred$set_threshold(0.99)
mlr3measures::confusion_matrix(pred$truth, pred$response, task_audit$positive)
## truth
## response 1 0
## 1 283 6
## 0 22 465
## acc : 0.9639; ce : 0.0361; dor : 996.9318; f1 : 0.9529
## fdr : 0.0208; fnr : 0.0721; fomr: 0.0452; fpr : 0.0127
## mcc : 0.9245; npv : 0.9548; ppv : 0.9792; tnr : 0.9873
## tpr : 0.9279
pred$set_threshold(0.01)
mlr3measures::confusion_matrix(pred$truth, pred$response, task_audit$positive)
## truth
## response 1 0
## 1 283 6
## 0 22 465
## acc : 0.9639; ce : 0.0361; dor : 996.9318; f1 : 0.9529
## fdr : 0.0208; fnr : 0.0721; fomr: 0.0452; fpr : 0.0127
## mcc : 0.9245; npv : 0.9548; ppv : 0.9792; tnr : 0.9873
## tpr : 0.9279
thresholds = sort(pred$prob[,1])
rocvals = data.table::rbindlist(lapply(thresholds, function(t) {
pred$set_threshold(t)
data.frame(
threshold = t,
FPR = pred$score(msr("classif.fpr")),
TPR = pred$score(msr("classif.tpr"))
)
}))
head(rocvals)
## threshold FPR TPR
## 1: 2.220446e-16 0.5307856 0.9606557
## 2: 2.220446e-16 0.5562633 0.9475410
## 3: 2.220446e-16 0.4925690 0.9704918
## 4: 2.220446e-16 0.4946921 0.9672131
## 5: 2.220446e-16 0.5138004 0.9672131
## 6: 2.220446e-16 0.5180467 0.9639344
Ukuran kinerja yang dapat diturunkan dari kurva ROC yaitu the area under the curve (AUC). Semakin baik kinerjanya, semakin tinggi juga nilai AUC, sedangkan pengklasifikasi acak akan menghasilkan AUC sebesar 0,5. AUC dapat di definisikan sebagai probabilitas bahwa instance positif yang dipilih secara acak diberi peringkat lebih tinggi (dalam artian mendapat probabilitas prediksi yang lebih tinggi untuk menjadi kelas positif) oleh model klasifikasi daripada instance negatif yang dipilih secara acak.
standardize <- po("scale")
# Jika dup_size=1, jumlah amatan kelas minoritas
#bertambah sebanyak
#1*(jumlah amatan awal)+jumlah amatan awal
smote <- po("smote",dup_size=1)
standardize$train(list(task_audit))[[1]]$data() %>% glimpse
## Rows: 776
## Columns: 3
## $ Risk <fct> 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1,…
## $ Audit_Risk <dbl> -0.14103211, -0.17216938, -0.17737271, -0.09470897, -0.1780…
## $ Risk_A <dbl> 0.3362853, -0.3926899, -0.3630426, -0.3926899, -0.3926899, …
smote$train(list(task_audit))[[1]]$data() %>% count(Risk)
## Risk n
## 1: 1 610
## 2: 0 471
reglog <- GraphLearner$new(standardize %>>% smote %>>% lrn("classif.log_reg"))
reglog
## <GraphLearner:scale.smote.classif.log_reg>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, stats
## * Predict Types: [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
## POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
## importance, loglik, missings, multiclass, oob_error,
## selected_features, twoclass, weights
lda <- GraphLearner$new(standardize %>>%
smote %>>% lrn("classif.lda",method="moment"))
lda
## <GraphLearner:scale.smote.classif.lda>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1,
## classif.lda.method=moment
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, MASS
## * Predict Types: [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
## POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
## importance, loglik, missings, multiclass, oob_error,
## selected_features, twoclass, weights
Tahap ini biasa nya dilakukan untuk melihat bagaimana pengaruh peubah-peubah prediktor terhadap respon menurut masing-masing model. Conoth nya dalam regresi logistik besarnya nilai koefisien,odds ratio dan p-value bisa menggambarkan bagaimana pengaruh peubah prediktor terhadap respon. ### Regresi Logistik
# train model dengan keseluruhan data
reglog$train(task = task_audit)
## Warning: glm.fit: algorithm did not converge
## This happened PipeOp classif.log_reg's $train()
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## This happened PipeOp classif.log_reg's $train()
summary(reglog$model$classif.log_reg$model)
##
## Call:
## stats::glm(formula = task$formula(), family = "binomial", data = data,
## model = FALSE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.01083 0.00000 0.00000 0.00000 0.01134
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3508.80 17224.91 0.204 0.839
## Audit_Risk 22110.13 108314.56 0.204 0.838
## Risk_A -58.03 832.84 -0.070 0.944
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.4807e+03 on 1080 degrees of freedom
## Residual deviance: 3.0607e-04 on 1078 degrees of freedom
## AIC: 6.0003
##
## Number of Fisher Scoring iterations: 25
Selain fungsi summary, itu juga dapat menggunakan fungsi tidy dari package broom untuk menampilkan hal yang sama. Hanya saja fungsi tidy menampilkan nilai koefisien dan p-value dalam bentuk data.frame
broom::tidy(reglog$model$classif.log_reg$model)
## # A tibble: 3 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3509. 17225. 0.204 0.839
## 2 Audit_Risk 22110. 108315. 0.204 0.838
## 3 Risk_A -58.0 833. -0.0697 0.944
Kemudian bisa menambahkan odds ratio dengan menggunakan sintaks berikut:
broom::tidy(reglog$model$classif.log_reg$model) %>%
mutate(OddsRatio = exp(estimate))
## # A tibble: 3 × 6
## term estimate std.error statistic p.value OddsRatio
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) 3509. 17225. 0.204 0.839 Inf
## 2 Audit_Risk 22110. 108315. 0.204 0.838 Inf
## 3 Risk_A -58.0 833. -0.0697 0.944 6.30e-26
# menampilkan informasi tambahan tentang model
broom::glance(reglog$model$classif.log_reg$model)
## # A tibble: 1 × 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 1481. 1080 -0.000153 6.00 21.0 0.000306 1078 1081
Hasil dari pengujian pada praktikum Statistika sains data modul resampling, dapat dinyatakan bahwa pengujian yang lebih baik dengan menggunakan data audit risk yaitu pengujian regresi logistik, hal itu terbukti karena keberhasilan dalam menjalankan program yang lebih besar daripada pada pengujian lda. Pada dataset Audit Risk terdapat beberapa missing value, yang kemudian dihapus.Autoplot yang telah dibuat dari data resampling yang menunjukkan bagaimana peforma peningkatan program tergambarkan pada barplot yang ada. Pembangian data yang telah dilakukan dengan 80% data ditraining dan sisanya menjadi data testing, lalu data dikelompokan dalam kelas-kelas dengan menggunakan LDA yang dilanjutkan dengan analisis regresi logistik, pada saat dilakukan summary yang terlihat informasi-informasi untuk membuat model regresi. ROC juga dilakukan untuk mengetahui hubungan antara value true-positive dan false-positive, terlihat bahwa True-positive-nya besar yang artinya thresholdnya bagus, yang ditunjukkan oleh luas AUC yang besar. Akurasi prediksi dari pemodelan yang dilakukan pada program adalah 96,77% error, atau 3,23% error dari hasil training pula didapatkan bahwa dapat memprediksi 97 bernilai true-positive, 3 dengan nilai false-positive, 2 bernilai false-negative dan 53 true-negative.