library(readxl)
library(caTools)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlr3verse)
## Loading required package: mlr3
library(rpart.plot)
## Loading required package: rpart
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(precrec)
library(kknn)
library(ranger)
library(ggplot2)
library(rpart)
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
library(mlr3learners)
library(mlr3tuning)
## Loading required package: paradox
library(yardstick)
##
## Attaching package: 'yardstick'
##
## The following object is masked from 'package:readr':
##
## spec
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(rsample) # Initial Split
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(caret) # Confusion Matrix
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:kknn':
##
## contr.dummy
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest) # random forest
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:ranger':
##
## importance
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
library(knitr)
library(cowplot)
library(formattable)
##
## Attaching package: 'formattable'
##
## The following object is masked from 'package:plotly':
##
## style
library(iml)
library(rio)
##
## Attaching package: 'rio'
##
## The following object is masked from 'package:plotly':
##
## export
library(tidyverse)
library(ggridges)
library(ROSE)
## Loaded ROSE 0.0-4
library(rpart)
library(ROCit)
library(rpart.plot)
library(caret)
library(ipred)
library(xgboost)
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:plotly':
##
## slice
##
## The following object is masked from 'package:dplyr':
##
## slice
library(Matrix)
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(ISLR)
library(caret) # cross-validation
library(gridExtra) # combining graphs
library(gam) # generalized additive models
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loaded gam 1.22-5
library(tidyverse)
library(splines)
library(rsample)
library(ggplot2)
library(dplyr)
library(purrr)
library(knitr)
library(DT)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(readxl)
library(mlr3)
library(precrec)
library(mlr3verse)
dataUTS <- read_excel("LEVEL RISIKO UTS.xlsx")
dataUTS
## # A tibble: 100 × 16
## Country X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AD 17.5 38675. 173. 0.68 1.22 1.79 -2.08 55 -26.5 2.86
## 2 AE 18.2 40105. 104. 1.77 0.870 2.66 -0.725 103. -13.6 353.
## 3 AE-AZ 18.7 76038. 31.0 2.63 1.49 1.85 -1.90 103. -56.2 200.
## 4 AE-RK NA 27883. 24.8 1.29 1.75 2.23 -1.14 103. 24.8 10.1
## 5 AM 14 4251. 89.6 1.44 0.256 4.75 2.33 167. 47.3 12.6
## 6 AO NA 2034. 57.1 22.4 3.34 -0.878 -5.20 34.8 15.4 62.5
## 7 AR 23.3 9203. 43.3 36.7 0.966 -0.237 -3.73 NA -5.01 375.
## 8 AT 18.6 53174. 159. 1.52 0.726 1.88 -0.300 116. 15.4 430.
## 9 AU 15.7 63972. 122. 1.65 1.48 2.45 0.0306 192. 58.0 1359.
## 10 AW 33.5 24643. 92.8 1.22 0.797 2.06 -4.72 80.5 28.1 2.38
## # ℹ 90 more rows
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # `Risk Level` <chr>
dataUTS$`Risk Level` <- as.factor(dataUTS$`Risk Level`)
dataUTS <- dataUTS %>% select(- Country)
dataUTS
## # A tibble: 100 × 15
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 17.5 38675. 173. 0.68 1.22 1.79 -2.08 55 -26.5 2.86e0 8 23.1
## 2 18.2 40105. 104. 1.77 0.870 2.66 -0.725 103. -13.6 3.53e2 8.15 24.9
## 3 18.7 76038. 31.0 2.63 1.49 1.85 -1.90 103. -56.2 2.00e2 8.15 20.4
## 4 NA 27883. 24.8 1.29 1.75 2.23 -1.14 103. 24.8 1.01e1 NA 21.7
## 5 14 4251. 89.6 1.44 0.256 4.75 2.33 167. 47.3 1.26e1 6.6 19.4
## 6 NA 2034. 57.1 22.4 3.34 -0.878 -5.20 34.8 15.4 6.25e1 10.3 31.1
## 7 23.3 9203. 43.3 36.7 0.966 -0.237 -3.73 NA -5.01 3.75e2 10.6 16.7
## 8 18.6 53174. 159. 1.52 0.726 1.88 -0.300 116. 15.4 4.30e2 2.02 24.8
## 9 15.7 63972. 122. 1.65 1.48 2.45 0.0306 192. 58.0 1.36e3 0.96 24.3
## 10 33.5 24643. 92.8 1.22 0.797 2.06 -4.72 80.5 28.1 2.38e0 5 21.1
## # ℹ 90 more rows
## # ℹ 3 more variables: X13 <dbl>, X14 <dbl>, `Risk Level` <fct>
summary(dataUTS)
## X1 X2 X3 X4
## Min. : 4.20 Min. : 434.5 Min. : 13.63 Min. :-0.151
## 1st Qu.:15.93 1st Qu.: 4265.9 1st Qu.: 42.96 1st Qu.: 0.869
## Median :18.58 Median : 11659.1 Median : 70.42 Median : 1.700
## Mean :18.97 Mean : 22641.6 Mean : 191.94 Mean : 3.263
## 3rd Qu.:21.80 3rd Qu.: 34815.2 3rd Qu.: 130.63 3rd Qu.: 3.939
## Max. :47.50 Max. :124340.4 Max. :6908.35 Max. :36.703
## NA's :12
## X5 X6 X7 X8
## Min. :-0.8862 Min. :-5.135 Min. :-9.84530 Min. : 34.82
## 1st Qu.: 0.4419 1st Qu.: 1.765 1st Qu.:-1.18720 1st Qu.: 76.95
## Median : 1.1402 Median : 2.984 Median : 0.07155 Median : 90.19
## Mean : 1.2019 Mean : 3.076 Mean : 0.10804 Mean : 99.94
## 3rd Qu.: 1.9502 3rd Qu.: 4.305 3rd Qu.: 1.94108 3rd Qu.:113.39
## Max. : 4.4021 Max. :10.076 Max. : 6.07120 Max. :359.14
## NA's :7
## X9 X10 X11 X12
## Min. :-1955.72 Min. : 1.171 Min. : 0.3357 Min. :12.67
## 1st Qu.: -14.11 1st Qu.: 32.813 1st Qu.: 1.9250 1st Qu.:20.79
## Median : 12.67 Median : 106.872 Median : 3.9000 Median :23.40
## Mean : -13.58 Mean : 582.318 Mean : 5.5346 Mean :24.96
## 3rd Qu.: 36.67 3rd Qu.: 366.370 3rd Qu.: 7.9500 3rd Qu.:28.38
## Max. : 456.49 Max. :14866.703 Max. :26.9780 Max. :46.83
## NA's :17
## X13 X14 Risk Level
## Min. :10.95 Min. : 0.120 high:54
## 1st Qu.:19.06 1st Qu.: 4.818 low :46
## Median :24.28 Median : 6.800
## Mean :24.48 Mean : 8.441
## 3rd Qu.:29.36 3rd Qu.:10.500
## Max. :55.09 Max. :24.650
## NA's :11
# Impute the four predictors with missing values using their column medians
dataUTS$X1[is.na(dataUTS$X1)]   <- median(dataUTS$X1, na.rm = TRUE)
dataUTS$X8[is.na(dataUTS$X8)]   <- median(dataUTS$X8, na.rm = TRUE)
dataUTS$X11[is.na(dataUTS$X11)] <- median(dataUTS$X11, na.rm = TRUE)
dataUTS$X14[is.na(dataUTS$X14)] <- median(dataUTS$X14, na.rm = TRUE)
sum(is.na(dataUTS))
## [1] 0
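The four statements above can also be collapsed into a single step. A minimal sketch, assuming every numeric column with remaining NAs should receive its column median:

# Hypothetical one-step alternative: median-impute all numeric columns at once
dataUTS <- dataUTS %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.x), median(.x, na.rm = TRUE), .x)))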
misdata <- as.data.frame(table(dataUTS$`Risk Level`))
misdata
## Var1 Freq
## 1 high 54
## 2 low 46
misdata$persen <- misdata$Freq/sum(misdata$Freq)
ggplot(data = misdata, mapping = aes(x = Var1, y = Freq)) +
geom_col(aes(fill =Var1), alpha = 0.7) +
labs(title = "Investment Risk Level",
x = "Risk Level",
y = "Frekuensi") +
geom_text(aes(label = paste0(round(persen*100, 2), "%")), vjust = -0.25) +
theme(legend.position = "none")
library(cowplot)
library(ggplot2)
x1_y <- ggplot(dataUTS, aes(x = X1, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X1 vs Y")
x2_y <- ggplot(dataUTS, aes(x = X2, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X2 vs Y")
x3_y <- ggplot(dataUTS, aes(x = X3, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X3 vs Y")
x4_y <- ggplot(dataUTS, aes(x = X4, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X4 vs Y")
x5_y <- ggplot(dataUTS, aes(x = X5, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X5 vs Y")
x6_y <- ggplot(dataUTS, aes(x = X6, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X6 vs Y")
x7_y <- ggplot(dataUTS, aes(x = X7, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X7 vs Y")
x8_y <- ggplot(dataUTS, aes(x = X8, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X8 vs Y")
x9_y <- ggplot(dataUTS, aes(x = X9, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X9 vs Y")
x10_y <- ggplot(dataUTS, aes(x = X10, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X10 vs Y")
x11_y <- ggplot(dataUTS, aes(x = X11, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X11 vs Y")
x12_y <- ggplot(dataUTS, aes(x = X12, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X12 vs Y")
x13_y <- ggplot(dataUTS, aes(x = X13, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X13 vs Y")
x14_y <- ggplot(dataUTS, aes(x = X14, y = `Risk Level`)) + geom_boxplot() + theme_minimal() + labs(title = "X14 vs Y")
plot_grid(x1_y, x2_y, x3_y, x4_y, x5_y, x6_y, x7_y, x8_y, x9_y, x10_y, x11_y, x12_y, x13_y, x14_y)
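The fourteen separate plot objects above can also be produced in one pass by reshaping to long format and faceting; a sketch, assuming the default facet layout is acceptable:

# Alternative to the fourteen objects above: pivot to long format and facet
dataUTS %>%
  pivot_longer(cols = X1:X14, names_to = "Predictor", values_to = "Value") %>%
  ggplot(aes(x = Value, y = `Risk Level`)) +
  geom_boxplot() +
  facet_wrap(~ Predictor, scales = "free_x") +
  theme_minimal()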
library(mlr3)
# Ensure no spaces in the column names
colnames(dataUTS) <- gsub(" ", "_", colnames(dataUTS))
# Convert Risk Level to a factor
dataUTS$Risk_Level <- as.factor(dataUTS$Risk_Level)
# Check the data structure to make sure every column has the correct type
str(dataUTS)
## tibble [100 × 15] (S3: tbl_df/tbl/data.frame)
## $ X1 : num [1:100] 17.5 18.2 18.7 18.6 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 3.9 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 6.8 6.8 18.5 ...
## $ Risk_Level: Factor w/ 2 levels "high","low": 2 2 2 2 1 1 1 2 2 1 ...
# Define the classification task, with "low" as the positive class
task_UTS <- TaskClassif$new(id = "investment_risk", backend = dataUTS, target = "Risk_Level", positive = "low")
task_UTS
## <TaskClassif:investment_risk> (100 x 15)
## * Target: Risk_Level
## * Properties: twoclass
## * Features (14):
## - dbl (14): X1, X10, X11, X12, X13, X14, X2, X3, X4, X5, X6, X7, X8,
## X9
set.seed(6)
resampleUTS = rsmp("holdout", ratio = 0.8)
resampleUTS$instantiate(task = task_UTS)
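A quick sanity check on the instantiated split: with ratio = 0.8 on 100 rows, 80 observations go to training and 20 to testing.

# Inspect the instantiated holdout split (row indices of each partition)
length(resampleUTS$train_set(1))  # 80 training rows
length(resampleUTS$test_set(1))   # 20 test rows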
learner_logreg <- lrn("classif.log_reg", predict_type = "prob")
learner_logreg
## <LearnerClassifLogReg:classif.log_reg>: Logistic Regression
## * Model: -
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, character, factor, ordered
## * Properties: loglik, twoclass
learner_logreg$train(task = task_UTS)
summary(learner_logreg$model)
##
## Call:
## stats::glm(formula = task$formula(), family = "binomial", data = data,
## model = FALSE)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.637e+00 5.589e+00 -1.188 0.23499
## X1 1.618e-02 9.059e-02 0.179 0.85820
## X10 -5.464e-05 6.082e-04 -0.090 0.92841
## X11 -3.600e-01 2.340e-01 -1.538 0.12403
## X12 -1.430e-01 1.798e-01 -0.795 0.42657
## X13 1.799e-01 1.825e-01 0.986 0.32404
## X14 -1.537e-01 1.627e-01 -0.945 0.34487
## X2 2.404e-04 9.032e-05 2.662 0.00776 **
## X3 -6.858e-03 7.883e-03 -0.870 0.38433
## X4 1.019e-01 1.897e-01 0.537 0.59099
## X5 -2.090e-01 8.142e-01 -0.257 0.79744
## X6 3.831e-01 5.935e-01 0.645 0.51863
## X7 3.937e-01 4.852e-01 0.812 0.41706
## X8 3.633e-02 3.315e-02 1.096 0.27309
## X9 -1.578e-02 1.304e-02 -1.211 0.22599
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 137.989 on 99 degrees of freedom
## Residual deviance: 38.575 on 85 degrees of freedom
## AIC: 68.575
##
## Number of Fisher Scoring iterations: 8
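Since the glm coefficients are on the log-odds scale, exponentiating them yields odds ratios, which are often easier to interpret:

# Odds ratios from the fitted logistic regression (exponentiated log-odds)
exp(coef(learner_logreg$model))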
train_test_reglog <- resample(task = task_UTS, learner = learner_logreg, resampling = resampleUTS, store_models = TRUE)
## INFO [23:39:31.117] [mlr3] Applying learner 'classif.log_reg' on task 'investment_risk' (iter 1/1)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
train_test_reglog
## <ResampleResult> with 1 resampling iterations
## task_id learner_id resampling_id iteration prediction_test
## investment_risk classif.log_reg holdout 1 <PredictionClassif>
## warnings errors
## 0 0
prediksi_test_reglog = as.data.table(train_test_reglog$prediction())
prediksi_test_reglog
## row_ids truth response prob.low prob.high
## <int> <fctr> <fctr> <num> <num>
## 1: 1 low low 1.000000e+00 2.220446e-16
## 2: 9 low low 1.000000e+00 2.220446e-16
## 3: 11 high low 1.000000e+00 2.220446e-16
## 4: 14 low low 1.000000e+00 2.220446e-16
## 5: 15 high low 1.000000e+00 2.220446e-16
## 6: 37 high high 2.220446e-16 1.000000e+00
## 7: 38 low low 1.000000e+00 2.220446e-16
## 8: 39 high high 2.220446e-16 1.000000e+00
## 9: 44 high high 2.220446e-16 1.000000e+00
## 10: 47 high high 2.220446e-16 1.000000e+00
## 11: 54 high high 2.220446e-16 1.000000e+00
## 12: 55 high high 2.220446e-16 1.000000e+00
## 13: 57 high low 1.000000e+00 2.220446e-16
## 14: 70 low low 1.000000e+00 2.220446e-16
## 15: 72 high high 2.220446e-16 1.000000e+00
## 16: 82 high high 2.220446e-16 1.000000e+00
## 17: 91 low low 1.000000e+00 2.220446e-16
## 18: 94 low low 1.000000e+00 2.220446e-16
## 19: 96 high high 2.220446e-16 1.000000e+00
## 20: 100 high high 2.220446e-16 1.000000e+00
## row_ids truth response prob.low prob.high
train_test_reglog$prediction()$confusion
## truth
## response low high
## low 7 3
## high 0 10
akurasi_reglog <- train_test_reglog$aggregate(list(msr("classif.acc"),msr("classif.specificity"),msr("classif.sensitivity")))
akurasi_reglog
## classif.acc classif.specificity classif.sensitivity
## 0.8500000 0.7692308 1.0000000
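These values can be verified by hand from the confusion matrix above (positive class = "low"):

# Manual check of the measures (confusion matrix: rows = response, cols = truth)
conf <- train_test_reglog$prediction()$confusion
sum(diag(conf)) / sum(conf)                 # accuracy: (7 + 10) / 20 = 0.85
conf["low", "low"] / sum(conf[, "low"])     # sensitivity: 7 / 7 = 1.00
conf["high", "high"] / sum(conf[, "high"])  # specificity: 10 / 13 ~ 0.769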
learner_knn <- lrn("classif.kknn", predict_type = "prob", k =10, kernel = "rectangular")
learner_knn
## <LearnerClassifKKNN:classif.kknn>: k-Nearest-Neighbor
## * Model: -
## * Parameters: k=10, kernel=rectangular
## * Packages: mlr3, mlr3learners, kknn
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: multiclass, twoclass
train_test_knn = resample(task = task_UTS, learner = learner_knn, resampling = resampleUTS, store_models = TRUE)
## INFO [23:39:31.921] [mlr3] Applying learner 'classif.kknn' on task 'investment_risk' (iter 1/1)
prediksi_test_KNN = as.data.table(train_test_knn$prediction())
head(prediksi_test_KNN)
## row_ids truth response prob.low prob.high
## <int> <fctr> <fctr> <num> <num>
## 1: 1 low low 0.9 0.1
## 2: 9 low low 1.0 0.0
## 3: 11 high low 0.6 0.4
## 4: 14 low low 0.6 0.4
## 5: 15 high low 0.6 0.4
## 6: 37 high high 0.1 0.9
train_test_knn$prediction()$confusion
## truth
## response low high
## low 7 3
## high 0 10
akurasi_knn <- train_test_knn$aggregate(list(msr("classif.acc"),msr("classif.specificity"),msr("classif.sensitivity")))
akurasi_knn
## classif.acc classif.specificity classif.sensitivity
## 0.8500000 0.7692308 1.0000000
autoplot(train_test_knn, type = "roc")
learner_tree <- lrn("classif.rpart", cp = 0.001, predict_type="prob")
learner_tree
## <LearnerClassifRpart:classif.rpart>: Classification Tree
## * Model: -
## * Parameters: cp=0.001, xval=0
## * Packages: mlr3, rpart
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: importance, missings, multiclass, selected_features,
## twoclass, weights
learner_tree$train(task = task_UTS)
rpart.plot(learner_tree$model, roundint = F,type = 5,tweak = 2)
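The value cp = 0.001 was fixed a priori; the complexity-parameter table of the fitted tree shows how the relative error changes with each split and can guide pruning:

# CP table of the fitted tree (with xval = 0 it reports relative training error only)
printcp(learner_tree$model)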
## Training
set.seed(6)
train_test_tree = resample(task = task_UTS, learner = learner_tree, resampling = resampleUTS, store_models = TRUE)
## INFO [23:39:33.447] [mlr3] Applying learner 'classif.rpart' on task 'investment_risk' (iter 1/1)
prediksi_test_tree = as.data.table(train_test_tree$prediction())
prediksi_test_tree
## row_ids truth response prob.low prob.high
## <int> <fctr> <fctr> <num> <num>
## 1: 1 low low 0.93750000 0.0625000
## 2: 9 low low 0.93750000 0.0625000
## 3: 11 high low 0.85714286 0.1428571
## 4: 14 low low 0.85714286 0.1428571
## 5: 15 high low 0.93750000 0.0625000
## 6: 37 high high 0.07317073 0.9268293
## 7: 38 low low 0.93750000 0.0625000
## 8: 39 high high 0.07317073 0.9268293
## 9: 44 high high 0.07317073 0.9268293
## 10: 47 high high 0.07317073 0.9268293
## 11: 54 high high 0.07317073 0.9268293
## 12: 55 high high 0.07317073 0.9268293
## 13: 57 high low 0.93750000 0.0625000
## 14: 70 low high 0.07317073 0.9268293
## 15: 72 high high 0.07317073 0.9268293
## 16: 82 high high 0.07317073 0.9268293
## 17: 91 low high 0.07317073 0.9268293
## 18: 94 low low 0.93750000 0.0625000
## 19: 96 high high 0.07317073 0.9268293
## 20: 100 high high 0.07317073 0.9268293
## row_ids truth response prob.low prob.high
train_test_tree$prediction()$confusion
## truth
## response low high
## low 5 3
## high 2 10
akurasi_tree <- train_test_tree$aggregate(list(msr("classif.acc"),msr("classif.specificity"),msr("classif.sensitivity")))
akurasi_tree
## classif.acc classif.specificity classif.sensitivity
## 0.7500000 0.7692308 0.7142857
autoplot(train_test_tree, type = "roc")
## Bagging
# Bagging approximated with ranger. Note: with 14 predictors, textbook bagging
# would set mtry = 14 (all features at every split); mtry = 11 is strictly a
# random forest with a large feature subset.
learner_bagging <- lrn(id = "bagging clf", "classif.ranger", mtry = 11, predict_type = "prob", importance = "impurity")
set.seed(6)
train_test_bagging = resample (task = task_UTS, learner = learner_bagging, resampling = resampleUTS, store_models = TRUE)
## INFO [23:39:34.646] [mlr3] Applying learner 'bagging clf' on task 'investment_risk' (iter 1/1)
train_test_bagging$prediction()$confusion
## truth
## response low high
## low 7 3
## high 0 10
akurasi_bagging <- train_test_bagging$aggregate(list(msr("classif.acc"),msr("classif.specificity"),msr("classif.sensitivity")))
akurasi_bagging
## classif.acc classif.specificity classif.sensitivity
## 0.8500000 0.7692308 1.0000000
autoplot(train_test_bagging, type = "roc")
## RANDOM FOREST
learner_rf <- lrn("classif.ranger", predict_type="prob",importance="impurity")
learner_rf
## <LearnerClassifRanger:classif.ranger>: Random Forest
## * Model: -
## * Parameters: importance=impurity, num.threads=1
## * Packages: mlr3, mlr3learners, ranger
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, character, factor, ordered
## * Properties: hotstart_backward, importance, multiclass, oob_error,
## twoclass, weights
learner_rf$train(task = task_UTS)
learner_rf$model$variable.importance
## X1 X10 X11 X12 X13 X14 X2
## 1.1117695 4.3875232 4.9037478 1.0882446 2.6482023 2.3155506 12.8956327
## X3 X4 X5 X6 X7 X8 X9
## 2.0689538 2.4489693 1.2519776 0.6500289 0.7841348 1.3322588 5.7699944
importance <- data.frame(Predictors = names(learner_rf$model$variable.importance),impurity = learner_rf$model$variable.importance)
rownames(importance) <- NULL
importance %>% arrange (desc(impurity))
## Predictors impurity
## 1 X2 12.8956327
## 2 X9 5.7699944
## 3 X11 4.9037478
## 4 X10 4.3875232
## 5 X13 2.6482023
## 6 X4 2.4489693
## 7 X14 2.3155506
## 8 X3 2.0689538
## 9 X8 1.3322588
## 10 X5 1.2519776
## 11 X1 1.1117695
## 12 X12 1.0882446
## 13 X7 0.7841348
## 14 X6 0.6500289
ggplot(importance,aes(x=impurity,y=reorder(Predictors,impurity))) +geom_col(fill = "steelblue") + geom_text(aes(label=round(impurity,2)),hjust=1.2)
### From the output above, the most influential predictors are X2 and X9, while X6 and X7 have the smallest influence among the variables.
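A hypothetical follow-up based on this ranking: restrict the task to the strongest predictors and refit (keeping the top four is an assumption, not part of the original analysis):

# Sketch: clone the task and keep only the highest-importance predictors
task_top <- task_UTS$clone()
task_top$select(c("X2", "X9", "X11", "X10"))
learner_rf_top <- lrn("classif.ranger", predict_type = "prob", importance = "impurity")
learner_rf_top$train(task_top)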
set.seed(6)
train_test_rf = resample(task = task_UTS, learner = learner_rf, resampling = resampleUTS, store_models = TRUE)
## INFO [23:39:36.634] [mlr3] Applying learner 'classif.ranger' on task 'investment_risk' (iter 1/1)
prediksi_test_rf = as.data.table(train_test_rf$prediction())
head(prediksi_test_rf)
## row_ids truth response prob.low prob.high
## <int> <fctr> <fctr> <num> <num>
## 1: 1 low low 0.7658944 0.23410556
## 2: 9 low low 0.9248135 0.07518651
## 3: 11 high low 0.5309151 0.46908492
## 4: 14 low high 0.4581992 0.54180079
## 5: 15 high low 0.7773500 0.22265000
## 6: 37 high high 0.1895175 0.81048254
train_test_rf$prediction()$confusion
## truth
## response low high
## low 5 3
## high 2 10
akurasi_rf <- train_test_rf$aggregate(list(msr("classif.acc"),msr("classif.specificity"),msr("classif.sensitivity")))
akurasi_rf
## classif.acc classif.specificity classif.sensitivity
## 0.7500000 0.7692308 0.7142857
autoplot(train_test_rf, type = "roc")
## MODEL COMPARISON
# Collect the accuracy measures for each model
akurasi <- list(regresi.logistik = akurasi_reglog,
                knn = akurasi_knn,
                pohon.klasifikasi = akurasi_tree,
                bagging = akurasi_bagging,
                random.forest = akurasi_rf)
df_akurasi <- round(t(data.frame(akurasi)), 4)
df_akurasi <- as.data.frame(df_akurasi)
df_akurasi <- df_akurasi %>%
select(classif.acc) %>%
arrange(desc(classif.acc))
df_akurasi
## classif.acc
## regresi.logistik 0.85
## knn 0.85
## bagging 0.85
## pohon.klasifikasi 0.75
## random.forest 0.75
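Accuracy alone hides the sensitivity/specificity trade-off; the same list can be displayed with all three measures. A small sketch:

# Keep all three measures side by side instead of accuracy only
df_full <- as.data.frame(round(t(data.frame(akurasi)), 4))
df_full %>% arrange(desc(classif.acc))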
set.seed(6)
learner <- list(learner_logreg,learner_tree,learner_rf,learner_knn,learner_bagging)
learner
## [[1]]
## <LearnerClassifLogReg:classif.log_reg>: Logistic Regression
## * Model: glm
## * Parameters: list()
## * Packages: mlr3, mlr3learners, stats
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, character, factor, ordered
## * Properties: loglik, twoclass
##
## [[2]]
## <LearnerClassifRpart:classif.rpart>: Classification Tree
## * Model: rpart
## * Parameters: cp=0.001, xval=0
## * Packages: mlr3, rpart
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: importance, missings, multiclass, selected_features,
## twoclass, weights
##
## [[3]]
## <LearnerClassifRanger:classif.ranger>: Random Forest
## * Model: ranger
## * Parameters: importance=impurity, num.threads=1
## * Packages: mlr3, mlr3learners, ranger
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, character, factor, ordered
## * Properties: hotstart_backward, importance, multiclass, oob_error,
## twoclass, weights
##
## [[4]]
## <LearnerClassifKKNN:classif.kknn>: k-Nearest-Neighbor
## * Model: -
## * Parameters: k=10, kernel=rectangular
## * Packages: mlr3, mlr3learners, kknn
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, factor, ordered
## * Properties: multiclass, twoclass
##
## [[5]]
## <LearnerClassifRanger:bagging clf>: Random Forest
## * Model: -
## * Parameters: importance=impurity, mtry=11, num.threads=1
## * Packages: mlr3, mlr3learners, ranger
## * Predict Types: response, [prob]
## * Feature Types: logical, integer, numeric, character, factor, ordered
## * Properties: hotstart_backward, importance, multiclass, oob_error,
## twoclass, weights
design <- benchmark_grid(tasks = task_UTS,learners = learner,resamplings = resampleUTS )
design
## task learner resampling
## <char> <char> <char>
## 1: investment_risk classif.log_reg holdout
## 2: investment_risk classif.rpart holdout
## 3: investment_risk classif.ranger holdout
## 4: investment_risk classif.kknn holdout
## 5: investment_risk bagging clf holdout
bmr = benchmark(design,store_models = TRUE)
## INFO [23:39:37.879] [mlr3] Running benchmark with 5 resampling iterations
## INFO [23:39:37.893] [mlr3] Applying learner 'classif.log_reg' on task 'investment_risk' (iter 1/1)
## INFO [23:39:37.965] [mlr3] Applying learner 'classif.rpart' on task 'investment_risk' (iter 1/1)
## INFO [23:39:37.993] [mlr3] Applying learner 'classif.ranger' on task 'investment_risk' (iter 1/1)
## INFO [23:39:38.082] [mlr3] Applying learner 'classif.kknn' on task 'investment_risk' (iter 1/1)
## INFO [23:39:38.129] [mlr3] Applying learner 'bagging clf' on task 'investment_risk' (iter 1/1)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## INFO [23:39:38.238] [mlr3] Finished benchmark
bmr
## <BenchmarkResult> of 5 rows with 5 resampling runs
## nr task_id learner_id resampling_id iters warnings errors
## 1 investment_risk classif.log_reg holdout 1 0 0
## 2 investment_risk classif.rpart holdout 1 0 0
## 3 investment_risk classif.ranger holdout 1 0 0
## 4 investment_risk classif.kknn holdout 1 0 0
## 5 investment_risk bagging clf holdout 1 0 0
bmr$aggregate(list(msr("classif.auc"))) %>% arrange(desc(classif.auc))
## nr task_id learner_id resampling_id iters classif.auc
## <int> <char> <char> <char> <int> <num>
## 1: 3 investment_risk classif.ranger holdout 1 0.9010989
## 2: 4 investment_risk classif.kknn holdout 1 0.9010989
## 3: 1 investment_risk classif.log_reg holdout 1 0.8846154
## 4: 5 investment_risk bagging clf holdout 1 0.8791209
## 5: 2 investment_risk classif.rpart holdout 1 0.7527473
## Hidden columns: resample_result
autoplot(bmr, type = "roc")
### AUC (Area Under the Curve) is the area under the ROC curve; the larger the area under the curve, the better the resulting model.
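With a mild class imbalance (54 high vs 46 low), a precision-recall view can complement ROC; mlr3viz draws it from the same benchmark object:

# Precision-recall curves for the same benchmark result
autoplot(bmr, type = "prc")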
# Re-attach the Country identifiers for reporting
dataUTS$Country <- read_excel("LEVEL RISIKO UTS.xlsx")$Country
colSums(is.na(dataUTS))
## X1 X2 X3 X4 X5 X6 X7
## 0 0 0 0 0 0 0
## X8 X9 X10 X11 X12 X13 X14
## 0 0 0 0 0 0 0
## Risk_Level Country
## 0 0
misdata <- as.data.frame(table(dataUTS$Risk_Level))
misdata
##   Var1 Freq
## 1 high   54
## 2  low   46
sum(is.na(dataUTS))
## [1] 0
prediksi <- predict(learner_rf, newdata = dataUTS)
prediksi
## [1] low low low low high high high low low high high high low low low
## [16] high high high high low high low high low high low high high high high
## [31] low low low high high low high low high low low high low high high
## [46] high high low high low high low low high high low high high high low
## [61] high low low low high high high low low low high high high low low
## [76] high high low high high high high low low low high high low low high
## [91] low low high low high high low high low high
## Levels: low high
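To report the results per country, the predicted labels can be paired with the Country column (a sketch; note these are in-sample predictions, since learner_rf was trained on all 100 rows):

# Pair each country with its predicted risk level
hasil_prediksi <- data.frame(Country = dataUTS$Country, Predicted_Risk = prediksi)
head(hasil_prediksi)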