121450022 - Sarah Natalia Geraldine
121450094 - Syifa Firnanda
121450105 - Raditia Riandi
121450112 - Christian Arvianus Nathanael Biran
121450157 - Salwa Naqwadisa Madinna
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.1.8
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlr3verse)
## Loading required package: mlr3
library(mlr3tuning)
## Loading required package: paradox
library(paradox)
library(kknn)
library(ggpubr)
library(smotefamily)
library(readr)
Raisin_Dataset <- read_csv("C:/Users/sarah/Downloads/Raisin_Dataset.csv")
## Rows: 900 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Class
## dbl (7): Area, MajorAxisLength, MinorAxisLength, Eccentricity, ConvexArea, E...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
View(Raisin_Dataset)
# Re-read with base R so Class is stored as a factor, then cast integer columns to doubles
raisin <- read.csv("C:/Users/sarah/Downloads/Raisin_Dataset.csv", stringsAsFactors = TRUE)
raisin <- raisin %>% mutate(across(where(is.integer), as.numeric))
glimpse(raisin)
## Rows: 900
## Columns: 8
## $ Area <dbl> 87524, 75166, 90856, 45928, 79408, 49242, 42492, 60952…
## $ MajorAxisLength <dbl> 442.2460, 406.6907, 442.2670, 286.5406, 352.1908, 318.…
## $ MinorAxisLength <dbl> 253.2912, 243.0324, 266.3283, 208.7600, 290.8275, 200.…
## $ Eccentricity <dbl> 0.8197384, 0.8018052, 0.7983536, 0.6849892, 0.5640113,…
## $ ConvexArea <dbl> 90546, 78789, 93717, 47336, 81463, 51368, 43904, 62329…
## $ Extent <dbl> 0.7586506, 0.6841296, 0.6376128, 0.6995994, 0.7927719,…
## $ Perimeter <dbl> 1184.040, 1121.786, 1208.575, 844.162, 1073.251, 881.8…
## $ Class <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …
task_raisin = TaskClassif$new(id = "raisin", backend = raisin, target = "Class", positive = "Kecimen")
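Before fitting anything, the task can be sanity-checked with its standard accessors; a minimal read-only sketch (nothing here is reused later):

task_raisin$nrow            # 900 observations
task_raisin$feature_names   # the seven morphological features
task_raisin$class_names     # Kecimen (positive class) and Besni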
learner1 = lrn("classif.log_reg", predict_type = "prob")
learner1
## <LearnerClassifLogReg:classif.log_reg>
## * Model: -
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, character, factor, ordered
## * Properties: loglik, twoclass
learner2 = lrn("classif.lda", predict_type = "prob")
learner2
## <LearnerClassifLDA:classif.lda>
## * Model: -
## * Parameters: list()
## * Packages: mlr3, mlr3learners, MASS
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: multiclass, twoclass, weights
msr_tbl = as.data.table(mlr_measures)
msr_tbl[1:5, .(key, label, task_type)]
## key label task_type
## 1: aic Akaike Information Criterion <NA>
## 2: bic Bayesian Information Criterion <NA>
## 3: classif.acc Classification Accuracy classif
## 4: classif.auc Area Under the ROC Curve classif
## 5: classif.bacc Balanced Accuracy classif
msr_tbl[1:5, .(key, packages, predict_type, task_properties)]
## key packages predict_type task_properties
## 1: aic mlr3 response
## 2: bic mlr3 response
## 3: classif.acc mlr3,mlr3measures response
## 4: classif.auc mlr3,mlr3measures prob twoclass
## 5: classif.bacc mlr3,mlr3measures response
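Several measures can be bundled with msrs() and scored in one call; a small sketch (the measures object is illustrative and not used below):

measures = msrs(c("classif.acc", "classif.auc", "classif.bacc"))
# classif.auc requires probability predictions, which both learners above
# were constructed with (predict_type = "prob"); once a ResampleResult rr
# exists, rr$aggregate(measures) returns all three scores at once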
as.data.table(lrn("classif.log_reg")$param_set)
## id class lower upper levels nlevels is_bounded
## 1: dispersion ParamUty NA NA Inf FALSE
## 2: epsilon ParamDbl -Inf Inf Inf FALSE
## 3: etastart ParamUty NA NA Inf FALSE
## 4: maxit ParamDbl -Inf Inf Inf FALSE
## 5: model ParamLgl NA NA TRUE,FALSE 2 TRUE
## 6: mustart ParamUty NA NA Inf FALSE
## 7: offset ParamUty NA NA Inf FALSE
## 8: singular.ok ParamLgl NA NA TRUE,FALSE 2 TRUE
## 9: start ParamUty NA NA Inf FALSE
## 10: trace ParamLgl NA NA TRUE,FALSE 2 TRUE
## 11: x ParamLgl NA NA TRUE,FALSE 2 TRUE
## 12: y ParamLgl NA NA TRUE,FALSE 2 TRUE
## special_vals default storage_type tags
## 1: <list[0]> list predict
## 2: <list[0]> 1e-08 numeric train,control
## 3: <list[0]> <NoDefault[3]> list train
## 4: <list[0]> 25 numeric train,control
## 5: <list[0]> TRUE logical train
## 6: <list[0]> <NoDefault[3]> list train
## 7: <list[0]> <NoDefault[3]> list train
## 8: <list[0]> TRUE logical train
## 9: <list[0]> list train
## 10: <list[0]> FALSE logical train,control
## 11: <list[0]> FALSE logical train
## 12: <list[0]> TRUE logical train
as.data.table(lrn("classif.lda")$param_set)
## id class lower upper levels nlevels
## 1: dimen ParamUty NA NA Inf
## 2: method ParamFct NA NA moment,mle,mve,t 4
## 3: nu ParamInt -Inf Inf Inf
## 4: predict.method ParamFct NA NA plug-in,predictive,debiased 3
## 5: predict.prior ParamUty NA NA Inf
## 6: prior ParamUty NA NA Inf
## 7: tol ParamDbl -Inf Inf Inf
## is_bounded special_vals default storage_type tags
## 1: FALSE <list[0]> <NoDefault[3]> list predict
## 2: TRUE <list[0]> moment character train
## 3: FALSE <list[0]> <NoDefault[3]> integer train
## 4: TRUE <list[0]> plug-in character predict
## 5: FALSE <list[0]> <NoDefault[3]> list predict
## 6: FALSE <list[0]> <NoDefault[3]> list train
## 7: FALSE <list[0]> <NoDefault[3]> numeric train
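Hyperparameters listed in the param_set can be fixed either at construction or afterwards through $values; a sketch on a throwaway copy (lda_mle is a hypothetical name, not used later):

lda_mle = lrn("classif.lda", predict_type = "prob")
lda_mle$param_set$values$method = "mle"   # equivalent to lrn("classif.lda", method = "mle")
lda_mle$param_set$values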
resampling = rsmp("holdout")
rr = resample(task = task_raisin, learner = learner1, resampling = resampling)
## INFO [11:38:38.006] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 1/1)
rr$aggregate(msr("classif.acc"))
## classif.acc
## 0.8766667
resampling = rsmp("holdout")
rr = resample(task = task_raisin, learner = learner2, resampling = resampling)
## INFO [11:38:38.240] [mlr3] Applying learner 'classif.lda' on task 'raisin' (iter 1/1)
rr$aggregate(msr("classif.acc"))
## classif.acc
## 0.8533333
lrns = c(learner1, lrn("classif.featureless"))
d = benchmark_grid(task = task_raisin, learners = lrns, resampling = resampling)
bmr = benchmark(design = d)
## INFO [11:38:38.441] [mlr3] Running benchmark with 2 resampling iterations
## INFO [11:38:38.450] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 1/1)
## INFO [11:38:38.487] [mlr3] Applying learner 'classif.featureless' on task 'raisin' (iter 1/1)
## INFO [11:38:38.517] [mlr3] Finished benchmark
acc = bmr$aggregate(msr("classif.acc"))
acc[, .(task_id, learner_id, classif.acc)]
## task_id learner_id classif.acc
## 1: raisin classif.log_reg 0.8733333
## 2: raisin classif.featureless 0.4666667
lrns = c(learner2, lrn("classif.featureless"))
d = benchmark_grid(task = task_raisin, learners = lrns, resampling = resampling)
bmr = benchmark(design = d)
## INFO [11:38:38.678] [mlr3] Running benchmark with 2 resampling iterations
## INFO [11:38:38.686] [mlr3] Applying learner 'classif.lda' on task 'raisin' (iter 1/1)
## INFO [11:38:38.725] [mlr3] Applying learner 'classif.featureless' on task 'raisin' (iter 1/1)
## INFO [11:38:38.748] [mlr3] Finished benchmark
acc = bmr$aggregate(msr("classif.acc"))
acc[, .(task_id, learner_id, classif.acc)]
## task_id learner_id classif.acc
## 1: raisin classif.lda 0.86
## 2: raisin classif.featureless 0.50
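Both candidate learners can also be placed in a single design against the baseline, so every model is evaluated on identical splits; a sketch under a shared 5-fold CV (design_all and bmr_all are illustrative names):

design_all = benchmark_grid(
  tasks = task_raisin,
  learners = c(learner1, learner2, lrn("classif.featureless")),
  resamplings = rsmp("cv", folds = 5)
)
bmr_all = benchmark(design_all)
bmr_all$aggregate(msr("classif.acc"))[, .(learner_id, classif.acc)]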
as.data.table(mlr_resamplings)
## key label params iters
## 1: bootstrap Bootstrap ratio,repeats 30
## 2: custom Custom Splits NA
## 3: custom_cv Custom Split Cross-Validation NA
## 4: cv Cross-Validation folds 10
## 5: holdout Holdout ratio 1
## 6: insample Insample Resampling 1
## 7: loo Leave-One-Out NA
## 8: repeated_cv Repeated Cross-Validation folds,repeats 100
## 9: subsampling Subsampling ratio,repeats 30
resampling = rsmp("holdout")
print(resampling)
## <ResamplingHoldout>: Holdout
## * Iterations: 1
## * Instantiated: FALSE
## * Parameters: ratio=0.6667
resampling = rsmp("holdout", ratio = 0.8)
resampling$param_set$values = list(ratio = 0.5)
resampling = rsmp("cv", folds = 10)
resampling = rsmp("holdout", ratio = 0.8)
resampling$instantiate(task_raisin)
train_ids = resampling$train_set(1)
test_ids = resampling$test_set(1)
str(train_ids)
## int [1:720] 1 2 4 5 6 7 8 9 10 11 ...
str(test_ids)
## int [1:180] 3 15 18 27 47 50 55 58 70 72 ...
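These id vectors can drive a manual train/predict cycle equivalent to one resample() iteration; a minimal sketch (pred_manual is a hypothetical name):

learner1$train(task_raisin, row_ids = train_ids)
pred_manual = learner1$predict(task_raisin, row_ids = test_ids)
pred_manual$score(msr("classif.acc"))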
resampling = rsmp("cv", folds = 4)
rr = resample(task_raisin, learner1, resampling)
## INFO [11:38:39.266] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 1/4)
## INFO [11:38:39.321] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 2/4)
## INFO [11:38:39.362] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 3/4)
## INFO [11:38:39.401] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 4/4)
print(rr)
## <ResampleResult> of 4 iterations
## * Task: raisin
## * Learner: classif.log_reg
## * Warnings: 0 in 0 iterations
## * Errors: 0 in 0 iterations
as.data.table(rr)
## task learner resampling iteration
## 1: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 1
## 2: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 2
## 3: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 3
## 4: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 4
## prediction
## 1: <PredictionClassif[20]>
## 2: <PredictionClassif[20]>
## 3: <PredictionClassif[20]>
## 4: <PredictionClassif[20]>
acc = rr$score(msr("classif.acc"))
acc[, .(iteration, classif.acc)]
## iteration classif.acc
## 1: 1 0.8711111
## 2: 2 0.8577778
## 3: 3 0.8488889
## 4: 4 0.8488889
rr$aggregate(msr("classif.acc"))
## classif.acc
## 0.8566667
rr$aggregate(msr("classif.acc", average = "micro"))
## classif.acc
## 0.8566667
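The two averaging modes coincide here because every fold has the same size (225 observations); both can be reproduced by hand, as a quick sketch:

mean(rr$score(msr("classif.acc"))$classif.acc)   # macro: average the per-fold scores
rr$prediction()$score(msr("classif.acc"))        # micro: pool all predictions, score once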
rrdt = as.data.table(rr)
rrdt
## task learner resampling iteration
## 1: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 1
## 2: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 2
## 3: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 3
## 4: <TaskClassif[50]> <LearnerClassifLogReg[37]> <ResamplingCV[20]> 4
## prediction
## 1: <PredictionClassif[20]>
## 2: <PredictionClassif[20]>
## 3: <PredictionClassif[20]>
## 4: <PredictionClassif[20]>
rrdt$prediction
## [[1]]
## <PredictionClassif> for 225 observations:
## row_ids truth response prob.Kecimen prob.Besni
## 2 Kecimen Kecimen 0.5115881272 0.48841187
## 5 Kecimen Kecimen 0.8929509469 0.10704905
## 14 Kecimen Kecimen 0.9395626287 0.06043737
## ---
## 890 Besni Besni 0.3944390949 0.60556091
## 893 Besni Besni 0.0007619518 0.99923805
## 898 Besni Besni 0.1624790613 0.83752094
##
## [[2]]
## <PredictionClassif> for 225 observations:
## row_ids truth response prob.Kecimen prob.Besni
## 4 Kecimen Kecimen 0.9521226 0.04787743
## 7 Kecimen Kecimen 0.9310531 0.06894693
## 10 Kecimen Kecimen 0.9148641 0.08513588
## ---
## 886 Besni Kecimen 0.9050845 0.09491550
## 892 Besni Besni 0.3444417 0.65555830
## 897 Besni Besni 0.2400622 0.75993780
##
## [[3]]
## <PredictionClassif> for 225 observations:
## row_ids truth response prob.Kecimen prob.Besni
## 1 Kecimen Besni 2.431414e-01 0.7568586
## 3 Kecimen Besni 2.576259e-01 0.7423741
## 6 Kecimen Kecimen 9.360649e-01 0.0639351
## ---
## 881 Besni Besni 1.476791e-05 0.9999852
## 888 Besni Besni 4.635215e-02 0.9536479
## 899 Besni Besni 1.098976e-01 0.8901024
##
## [[4]]
## <PredictionClassif> for 225 observations:
## row_ids truth response prob.Kecimen prob.Besni
## 9 Kecimen Kecimen 0.94937741 0.05062259
## 12 Kecimen Kecimen 0.96442677 0.03557323
## 15 Kecimen Kecimen 0.59692034 0.40307966
## ---
## 895 Besni Besni 0.01987088 0.98012912
## 896 Besni Kecimen 0.52787520 0.47212480
## 900 Besni Besni 0.03173124 0.96826876
all.equal(rrdt$prediction, rr$predictions())
## [1] TRUE
pred = rr$prediction()
pred
## <PredictionClassif> for 900 observations:
## row_ids truth response prob.Kecimen prob.Besni
## 2 Kecimen Kecimen 0.51158813 0.48841187
## 5 Kecimen Kecimen 0.89295095 0.10704905
## 14 Kecimen Kecimen 0.93956263 0.06043737
## ---
## 895 Besni Besni 0.01987088 0.98012912
## 896 Besni Kecimen 0.52787520 0.47212480
## 900 Besni Besni 0.03173124 0.96826876
prop.table(table(task_raisin$data(cols = "Class")))
## Class
## Kecimen Besni
## 0.5 0.5
r = rsmp("cv", folds = 3)
r$instantiate(task_raisin)
prop.table(table(task_raisin$data(rows = r$test_set(1), cols = "Class")))
## Class
## Kecimen Besni
## 0.5466667 0.4533333
prop.table(table(task_raisin$data(rows = r$test_set(2), cols = "Class")))
## Class
## Kecimen Besni
## 0.47 0.53
prop.table(table(task_raisin$data(rows = r$test_set(3), cols = "Class")))
## Class
## Kecimen Besni
## 0.4833333 0.5166667
task_raisin$col_roles$stratum = "Class"
r = rsmp("cv", folds = 3)
r$instantiate(task_raisin)
prop.table(table(task_raisin$data(rows = r$test_set(1), cols = "Class")))
## Class
## Kecimen Besni
## 0.5 0.5
prop.table(table(task_raisin$data(rows = r$test_set(2), cols = "Class")))
## Class
## Kecimen Besni
## 0.5 0.5
prop.table(table(task_raisin$data(rows = r$test_set(3), cols = "Class")))
## Class
## Kecimen Besni
## 0.5 0.5
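Stratification applies to the complementary training folds as well, which can be verified the same way:

prop.table(table(task_raisin$data(rows = r$train_set(1), cols = "Class")))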
r = rsmp("cv", folds = 30)
rr = resample(task_raisin, learner1, r)
## INFO [11:38:40.630] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 1/30)
## INFO [11:38:40.675] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 2/30)
## INFO [11:38:40.734] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 3/30)
## INFO [11:38:40.780] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 4/30)
## INFO [11:38:40.827] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 5/30)
## INFO [11:38:40.872] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 6/30)
## INFO [11:38:40.919] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 7/30)
## INFO [11:38:40.972] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 8/30)
## INFO [11:38:41.029] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 9/30)
## INFO [11:38:41.084] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 10/30)
## INFO [11:38:41.134] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 11/30)
## INFO [11:38:41.190] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 12/30)
## INFO [11:38:41.238] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 13/30)
## INFO [11:38:41.279] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 14/30)
## INFO [11:38:41.321] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 15/30)
## INFO [11:38:41.359] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 16/30)
## INFO [11:38:41.409] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 17/30)
## INFO [11:38:41.459] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 18/30)
## INFO [11:38:41.499] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 19/30)
## INFO [11:38:41.537] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 20/30)
## INFO [11:38:41.576] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 21/30)
## INFO [11:38:41.627] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 22/30)
## INFO [11:38:41.664] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 23/30)
## INFO [11:38:41.704] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 24/30)
## INFO [11:38:41.744] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 25/30)
## INFO [11:38:41.795] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 26/30)
## INFO [11:38:41.831] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 27/30)
## INFO [11:38:41.871] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 28/30)
## INFO [11:38:41.911] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 29/30)
## INFO [11:38:41.952] [mlr3] Applying learner 'classif.log_reg' on task 'raisin' (iter 30/30)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
autoplot(rr, type = "histogram", bins = 30)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
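mlr3viz can also draw the ROC curve directly from the resample result (assuming the precrec package, which mlr3viz uses for ROC plots, is installed):

autoplot(rr, type = "roc")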
splits = partition(task_raisin, ratio = 0.8)
learner1$train(task_raisin, splits$train)
pred = learner1$predict(task_raisin, splits$test)
pred$confusion
## truth
## response Kecimen Besni
## Kecimen 85 19
## Besni 5 71
mlr3measures::confusion_matrix(truth = pred$truth, response = pred$response, positive = task_raisin$positive)
## truth
## response Kecimen Besni
## Kecimen 85 19
## Besni 5 71
## acc : 0.8667; ce : 0.1333; dor : 63.5263; f1 : 0.8763
## fdr : 0.1827; fnr : 0.0556; fomr: 0.0658; fpr : 0.2111
## mcc : 0.7424; npv : 0.9342; ppv : 0.8173; tnr : 0.7889
## tpr : 0.9444
pred$set_threshold(0.99)
mlr3measures::confusion_matrix(pred$truth, pred$response, task_raisin$positive)
## truth
## response Kecimen Besni
## Kecimen 1 0
## Besni 89 90
## acc : 0.5056; ce : 0.4944; dor : NaN; f1 : 0.0220
## fdr : 0.0000; fnr : 0.9889; fomr: 0.4972; fpr : 0.0000
## mcc : 0.0747; npv : 0.5028; ppv : 1.0000; tnr : 1.0000
## tpr : 0.0111
pred$set_threshold(0.01)
mlr3measures::confusion_matrix(pred$truth, pred$response, task_raisin$positive)
## truth
## response Kecimen Besni
## Kecimen 90 64
## Besni 0 26
## acc : 0.6444; ce : 0.3556; dor : NaN; f1 : 0.7377
## fdr : 0.4156; fnr : 0.0000; fomr: 0.0000; fpr : 0.7111
## mcc : 0.4109; npv : 1.0000; ppv : 0.5844; tnr : 0.2889
## tpr : 1.0000
# Candidate thresholds: every observed probability for the positive class
thresholds = sort(pred$prob[, 1])
rocvals = data.table::rbindlist(lapply(thresholds, function(t) {
  pred$set_threshold(t)
  data.frame(
    threshold = t,
    FPR = pred$score(msr("classif.fpr")),
    TPR = pred$score(msr("classif.tpr"))
  )
}))
head(rocvals)
## threshold FPR TPR
## 1: 5.417940e-06 1.0000000 1
## 2: 1.124654e-05 0.9888889 1
## 3: 1.241649e-05 0.9666667 1
## 4: 2.743087e-05 0.9666667 1
## 5: 9.917029e-05 0.9444444 1
## 6: 1.831313e-04 0.9333333 1
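The table lends itself to a step-style ROC plot; a sketch with ggplot2, adding the chance diagonal for reference (the title is illustrative):

ggplot(rocvals, aes(x = FPR, y = TPR)) +
  geom_step() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "ROC curve, logistic regression (holdout split)")
pred$set_threshold(0.5)   # restore the default threshold after the sweep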
standardize <- po("scale")           # center and scale the numeric features
smote <- po("smote", dup_size = 1)   # SMOTE oversampling; dup_size = 1 doubles the oversampled class
standardize$train(list(task_raisin))[[1]]$data() %>% glimpse
## Rows: 900
## Columns: 8
## $ Class <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, …
## $ Area <dbl> -0.007182375, -0.324037016, 0.078248898, -1.073688739,…
## $ ConvexArea <dbl> -0.01570030, -0.30407912, 0.06207883, -1.07556668, -0.…
## $ Eccentricity <dbl> 0.42290649, 0.22435165, 0.18613557, -1.06902826, -2.40…
## $ Extent <dbl> 1.106127610, -0.287616870, -1.157606092, 0.001710523, …
## $ MajorAxisLength <dbl> 0.09752272, -0.20889592, 0.09770402, -1.24435939, -0.6…
## $ MinorAxisLength <dbl> -0.02394487, -0.22916480, 0.23685627, -0.91476486, 0.7…
## $ Perimeter <dbl> 0.06623714, -0.16116284, 0.15585802, -1.17526141, -0.3…
smote$train(list(task_raisin))[[1]]$data() %>% count(Class)
## Class n
## 1: Kecimen 900
## 2: Besni 450
reglog <- GraphLearner$new(standardize %>>% smote %>>% lrn("classif.log_reg"))
reglog
## <GraphLearner:scale.smote.classif.log_reg>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, stats
## * Predict Types: [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
## POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
## importance, loglik, missings, multiclass, oob_error,
## selected_features, twoclass, weights
lda <- GraphLearner$new(standardize %>>% smote %>>% lrn("classif.lda", method = "moment"))
lda
## <GraphLearner:scale.smote.classif.lda>
## * Model: -
## * Parameters: scale.robust=FALSE, smote.dup_size=1,
## classif.lda.method=moment
## * Packages: mlr3, mlr3pipelines, smotefamily, mlr3learners, MASS
## * Predict Types: [response], prob
## * Feature Types: logical, integer, numeric, character, factor, ordered,
## POSIXct
## * Properties: featureless, hotstart_backward, hotstart_forward,
## importance, loglik, missings, multiclass, oob_error,
## selected_features, twoclass, weights
reglog$train(task = task_raisin)
summary(reglog$model$classif.log_reg$model)
##
## Call:
## stats::glm(formula = task$formula(), family = "binomial", data = data,
## model = FALSE)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1514 -0.1165 0.2326 0.4535 3.2682
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.04220 0.19156 -0.220 0.825655
## Area -24.68210 3.03633 -8.129 4.33e-16 ***
## ConvexArea 22.23466 2.52825 8.794 < 2e-16 ***
## Eccentricity 0.43018 0.40870 1.053 0.292545
## Extent -0.01558 0.12844 -0.121 0.903465
## MajorAxisLength 5.77355 1.66015 3.478 0.000506 ***
## MinorAxisLength 5.02710 1.22341 4.109 3.97e-05 ***
## Perimeter -11.47827 1.53736 -7.466 8.25e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1718.59 on 1349 degrees of freedom
## Residual deviance: 811.82 on 1342 degrees of freedom
## AIC: 827.82
##
## Number of Fisher Scoring iterations: 7
broom::tidy(reglog$model$classif.log_reg$model)
## # A tibble: 8 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.0422 0.192 -0.220 8.26e- 1
## 2 Area -24.7 3.04 -8.13 4.33e-16
## 3 ConvexArea 22.2 2.53 8.79 1.44e-18
## 4 Eccentricity 0.430 0.409 1.05 2.93e- 1
## 5 Extent -0.0156 0.128 -0.121 9.03e- 1
## 6 MajorAxisLength 5.77 1.66 3.48 5.06e- 4
## 7 MinorAxisLength 5.03 1.22 4.11 3.97e- 5
## 8 Perimeter -11.5 1.54 -7.47 8.25e-14
broom::tidy(reglog$model$classif.log_reg$model) %>%
mutate(OddsRatio = exp(estimate))
## # A tibble: 8 × 6
## term estimate std.error statistic p.value OddsRatio
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -0.0422 0.192 -0.220 8.26e- 1 9.59e- 1
## 2 Area -24.7 3.04 -8.13 4.33e-16 1.91e-11
## 3 ConvexArea 22.2 2.53 8.79 1.44e-18 4.53e+ 9
## 4 Eccentricity 0.430 0.409 1.05 2.93e- 1 1.54e+ 0
## 5 Extent -0.0156 0.128 -0.121 9.03e- 1 9.85e- 1
## 6 MajorAxisLength 5.77 1.66 3.48 5.06e- 4 3.22e+ 2
## 7 MinorAxisLength 5.03 1.22 4.11 3.97e- 5 1.52e+ 2
## 8 Perimeter -11.5 1.54 -7.47 8.25e-14 1.04e- 5
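Confidence intervals for the odds ratios can be added in the same broom call; a sketch (conf.int triggers profile-likelihood intervals, which may take a moment):

broom::tidy(reglog$model$classif.log_reg$model, conf.int = TRUE, exponentiate = TRUE)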
broom::glance(reglog$model$classif.log_reg$model)
## # A tibble: 1 × 8
## null.deviance df.null logLik AIC BIC deviance df.residual nobs
## <dbl> <int> <dbl> <dbl> <dbl> <dbl> <int> <int>
## 1 1719. 1349 -406. 828. 869. 812. 1342 1350
lda$train(task = task_raisin)
coef_lda <- coef(lda$model$classif.lda$model)
coef_lda
## LD1
## Area 8.46763887
## ConvexArea -8.99799137
## Eccentricity -0.02226158
## Extent -0.02842566
## MajorAxisLength -1.58923225
## MinorAxisLength -1.22661054
## Perimeter 4.67752093
predictedLD <- predict(lda$model$classif.lda$model, newdata = raisin)
plotLD <- data.frame(predictedLD$x, class = predictedLD$class)
glimpse(plotLD)
## Rows: 900
## Columns: 2
## $ LD1 <dbl> -69085.33, -68161.13, -69305.05, -33789.74, -56501.07, -41871.26…
## $ class <fct> Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, Kecimen, K…
plotLD %>% count(class)
## class n
## 1 Kecimen 900
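That every case is predicted Kecimen appears to be because the pipeline standardizes features before LDA, while predict() above received the raw data, pushing LD1 far outside the trained range. A hedged sketch that standardizes first (raisin_scaled, predictedLD2, and plotLD2 are illustrative names; scale() on the full data should reproduce po("scale") here, since the pipeline was trained on the full task):

raisin_scaled <- raisin %>% mutate(across(where(is.numeric), ~ as.numeric(scale(.x))))
predictedLD2 <- predict(lda$model$classif.lda$model, newdata = raisin_scaled)
plotLD2 <- data.frame(predictedLD2$x, class = predictedLD2$class)
plotLD2 %>% count(class)   # both classes should now appear
ggplot(plotLD2, aes(x = LD1, fill = class)) +
  geom_density(alpha = 0.5) +
  labs(title = "Separation along LD1, standardized inputs")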