library(readxl)
library(caTools)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlr3verse)
## Loading required package: mlr3
library(rpart.plot)
## Loading required package: rpart
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(precrec)
library(kknn)
library(ranger)
library(ggplot2)
library(rpart)
library(ggpubr)
##
## Attaching package: 'ggpubr'
##
## The following object is masked from 'package:cowplot':
##
## get_legend
library(mlr3learners)
library(mlr3tuning)
## Loading required package: paradox
library(yardstick)
##
## Attaching package: 'yardstick'
##
## The following object is masked from 'package:readr':
##
## spec
library(data.table)
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(plotly)
##
## Attaching package: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
library(rsample) # Initial Split
library(partykit)
## Loading required package: grid
## Loading required package: libcoin
## Loading required package: mvtnorm
library(caret) # Confusion Matrix
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following objects are masked from 'package:yardstick':
##
## precision, recall, sensitivity, specificity
##
## The following object is masked from 'package:kknn':
##
## contr.dummy
##
## The following object is masked from 'package:purrr':
##
## lift
library(randomForest) # random forest
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:ranger':
##
## importance
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:randomForest':
##
## combine
##
## The following object is masked from 'package:dplyr':
##
## combine
library(grid)
library(knitr)
library(cowplot)
library(formattable)
##
## Attaching package: 'formattable'
##
## The following object is masked from 'package:plotly':
##
## style
library(iml)
library(rio)
##
## Attaching package: 'rio'
##
## The following object is masked from 'package:plotly':
##
## export
library(tidyverse)
library(ggridges)
library(ROSE)
## Loaded ROSE 0.0-4
library(rpart)
library(ROCit)
library(rpart.plot)
library(caret)
library(ipred)
library(xgboost)
##
## Attaching package: 'xgboost'
##
## The following object is masked from 'package:plotly':
##
## slice
##
## The following object is masked from 'package:dplyr':
##
## slice
library(Matrix)
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
library(magrittr)
##
## Attaching package: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
library(ISLR)
library(caret) # cross-validation
library(gridExtra) # combining graphs
library(gam) # generalized additive models
## Loading required package: splines
## Loading required package: foreach
##
## Attaching package: 'foreach'
##
## The following objects are masked from 'package:purrr':
##
## accumulate, when
##
## Loaded gam 1.22-5
library(tidyverse)
library(splines)
library(rsample)
library(ggplot2)
library(dplyr)
library(purrr)
library(knitr)
library(DT)
library(kableExtra)
##
## Attaching package: 'kableExtra'
##
## The following object is masked from 'package:dplyr':
##
## group_rows
library(readxl)
library(mlr3)
library(precrec)
library(mlr3verse)
library(skimr)
##
## Attaching package: 'skimr'
##
## The following object is masked from 'package:mlr3':
##
## partition
library(imputeMissings)
##
## Attaching package: 'imputeMissings'
##
## The following object is masked from 'package:dplyr':
##
## compute
library(caret)
library(MLmetrics)
##
## Attaching package: 'MLmetrics'
##
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
##
## The following object is masked from 'package:base':
##
## Recall
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following object is masked from 'package:precrec':
##
## auc
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(visdat)
dataUTS <- read_excel("Level Risiko Investasi UTS.xlsx")
dataUTS
## # A tibble: 100 × 16
## Country X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AD 17.5 38675. 173. 0.68 1.22 1.79 -2.08 55 -26.5 2.86
## 2 AE 18.2 40105. 104. 1.77 0.870 2.66 -0.725 103. -13.6 353.
## 3 AE-AZ 18.7 76038. 31.0 2.63 1.49 1.85 -1.90 103. -56.2 200.
## 4 AE-RK NA 27883. 24.8 1.29 1.75 2.23 -1.14 103. 24.8 10.1
## 5 AM 14 4251. 89.6 1.44 0.256 4.75 2.33 167. 47.3 12.6
## 6 AO NA 2034. 57.1 22.4 3.34 -0.878 -5.20 34.8 15.4 62.5
## 7 AR 23.3 9203. 43.3 36.7 0.966 -0.237 -3.73 NA -5.01 375.
## 8 AT 18.6 53174. 159. 1.52 0.726 1.88 -0.300 116. 15.4 430.
## 9 AU 15.7 63972. 122. 1.65 1.48 2.45 0.0306 192. 58.0 1359.
## 10 AW 33.5 24643. 92.8 1.22 0.797 2.06 -4.72 80.5 28.1 2.38
## # ℹ 90 more rows
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # `Risk Level` <chr>
table(dataUTS$`Risk Level`)
##
## high low
## 54 46
prop.table(table(dataUTS$`Risk Level`))
##
## high low
## 0.54 0.46
risklev <- ggplot(dataUTS, aes(x = "", fill = `Risk Level`)) + geom_bar(width = 1)
pie <- risklev + coord_polar("y", start = 0)
pie + scale_fill_brewer(palette = "Reds") + theme_minimal()
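# An alternative view (a sketch, not in the original output): a plain bar chart of
# the class counts, which is often easier to read than a pie chart.
ggplot(dataUTS, aes(x = `Risk Level`, fill = `Risk Level`)) +
  geom_bar() +
  scale_fill_brewer(palette = "Reds") +
  theme_minimal()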
library(visdat)
vis_miss(dataUTS)
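# Complementary check (a sketch; output not shown): count the missing values per
# column to confirm which variables need imputation.
colSums(is.na(dataUTS))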
# Copy the dataset into a new variable named data6
data6 <- dataUTS
data6
## # A tibble: 100 × 16
## Country X1 X2 X3 X4 X5 X6 X7 X8 X9 X10
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AD 17.5 38675. 173. 0.68 1.22 1.79 -2.08 55 -26.5 2.86
## 2 AE 18.2 40105. 104. 1.77 0.870 2.66 -0.725 103. -13.6 353.
## 3 AE-AZ 18.7 76038. 31.0 2.63 1.49 1.85 -1.90 103. -56.2 200.
## 4 AE-RK NA 27883. 24.8 1.29 1.75 2.23 -1.14 103. 24.8 10.1
## 5 AM 14 4251. 89.6 1.44 0.256 4.75 2.33 167. 47.3 12.6
## 6 AO NA 2034. 57.1 22.4 3.34 -0.878 -5.20 34.8 15.4 62.5
## 7 AR 23.3 9203. 43.3 36.7 0.966 -0.237 -3.73 NA -5.01 375.
## 8 AT 18.6 53174. 159. 1.52 0.726 1.88 -0.300 116. 15.4 430.
## 9 AU 15.7 63972. 122. 1.65 1.48 2.45 0.0306 192. 58.0 1359.
## 10 AW 33.5 24643. 92.8 1.22 0.797 2.06 -4.72 80.5 28.1 2.38
## # ℹ 90 more rows
## # ℹ 5 more variables: X11 <dbl>, X12 <dbl>, X13 <dbl>, X14 <dbl>,
## # `Risk Level` <chr>
# Replace the NA values with the median of each column (variable) in data6 that contains NAs
data6 <- data6 %>% select(-Country)
data6$X1[is.na(data6$X1)] <- median(data6$X1, na.rm = TRUE)
data6$X8[is.na(data6$X8)] <- median(data6$X8, na.rm = TRUE)
data6$X11[is.na(data6$X11)] <- median(data6$X11, na.rm = TRUE)
data6$X14[is.na(data6$X14)] <- median(data6$X14, na.rm = TRUE)
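# A more compact alternative (a sketch; `data6_alt` is an illustrative name, not used
# below): impute the median for every numeric column in one step with dplyr.
data6_alt <- dataUTS %>%
  select(-Country) %>%
  mutate(across(where(is.numeric), ~ replace(.x, is.na(.x), median(.x, na.rm = TRUE))))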
dim(data6)
## [1] 100 15
data6 <- data.frame(data6)
summary(data6)
## X1 X2 X3 X4
## Min. : 4.20 Min. : 434.5 Min. : 13.63 Min. :-0.151
## 1st Qu.:16.10 1st Qu.: 4265.9 1st Qu.: 42.96 1st Qu.: 0.869
## Median :18.58 Median : 11659.1 Median : 70.42 Median : 1.700
## Mean :18.92 Mean : 22641.6 Mean : 191.94 Mean : 3.263
## 3rd Qu.:21.43 3rd Qu.: 34815.2 3rd Qu.: 130.63 3rd Qu.: 3.939
## Max. :47.50 Max. :124340.4 Max. :6908.35 Max. :36.703
## X5 X6 X7 X8
## Min. :-0.8862 Min. :-5.135 Min. :-9.84530 Min. : 34.82
## 1st Qu.: 0.4419 1st Qu.: 1.765 1st Qu.:-1.18720 1st Qu.: 77.78
## Median : 1.1402 Median : 2.984 Median : 0.07155 Median : 90.19
## Mean : 1.2019 Mean : 3.076 Mean : 0.10804 Mean : 99.26
## 3rd Qu.: 1.9502 3rd Qu.: 4.305 3rd Qu.: 1.94108 3rd Qu.:112.27
## Max. : 4.4021 Max. :10.076 Max. : 6.07120 Max. :359.14
## X9 X10 X11 X12
## Min. :-1955.72 Min. : 1.171 Min. : 0.3357 Min. :12.67
## 1st Qu.: -14.11 1st Qu.: 32.813 1st Qu.: 2.3969 1st Qu.:20.79
## Median : 12.67 Median : 106.872 Median : 3.9000 Median :23.40
## Mean : -13.58 Mean : 582.318 Mean : 5.2567 Mean :24.96
## 3rd Qu.: 36.67 3rd Qu.: 366.370 3rd Qu.: 6.4500 3rd Qu.:28.38
## Max. : 456.49 Max. :14866.703 Max. :26.9780 Max. :46.83
## X13 X14 Risk.Level
## Min. :10.95 Min. : 0.12 Length:100
## 1st Qu.:19.06 1st Qu.: 5.00 Class :character
## Median :24.28 Median : 6.80 Mode :character
## Mean :24.48 Mean : 8.26
## 3rd Qu.:29.36 3rd Qu.: 9.70
## Max. :55.09 Max. :24.65
class(data6$Risk.Level)
## [1] "character"
# Convert the response variable to a factor
data6$Risk.Level <- as.factor(data6$Risk.Level)
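# Note: as.factor() orders the levels alphabetically, so "high" becomes the first
# level and is later treated as the positive class by confusionMatrix(). To make that
# choice explicit, the levels can also be set directly (optional sketch):
data6$Risk.Level <- factor(data6$Risk.Level, levels = c("high", "low"))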
set.seed(123)
index6 <- createDataPartition(data6$Risk.Level, p = 0.8, list = FALSE)
index6
## Resample1
## [1,] 2
## [2,] 3
## [3,] 4
## [4,] 7
## [5,] 8
## [6,] 9
## [7,] 10
## [8,] 11
## [9,] 13
## [10,] 14
## [11,] 15
## [12,] 16
## [13,] 17
## [14,] 18
## [15,] 19
## [16,] 20
## [17,] 21
## [18,] 23
## [19,] 24
## [20,] 25
## [21,] 26
## [22,] 27
## [23,] 28
## [24,] 29
## [25,] 31
## [26,] 33
## [27,] 34
## [28,] 35
## [29,] 36
## [30,] 37
## [31,] 38
## [32,] 39
## [33,] 40
## [34,] 41
## [35,] 43
## [36,] 44
## [37,] 45
## [38,] 46
## [39,] 47
## [40,] 48
## [41,] 49
## [42,] 52
## [43,] 53
## [44,] 54
## [45,] 55
## [46,] 56
## [47,] 57
## [48,] 59
## [49,] 60
## [50,] 61
## [51,] 63
## [52,] 64
## [53,] 65
## [54,] 67
## [55,] 69
## [56,] 70
## [57,] 72
## [58,] 74
## [59,] 76
## [60,] 77
## [61,] 78
## [62,] 79
## [63,] 80
## [64,] 81
## [65,] 82
## [66,] 83
## [67,] 84
## [68,] 85
## [69,] 86
## [70,] 87
## [71,] 88
## [72,] 89
## [73,] 90
## [74,] 91
## [75,] 92
## [76,] 95
## [77,] 96
## [78,] 97
## [79,] 98
## [80,] 99
## [81,] 100
# Split the data into training and test sets
train.data6index <- data6[index6,]
dim(train.data6index)
## [1] 81 15
test.data6index <- data6[-index6,]
dim(test.data6index)
## [1] 19 15
glimpse(test.data6index)
## Rows: 19
## Columns: 15
## $ X1 <dbl> 17.5000, 14.0000, 18.5770, 4.2000, 19.3000, 16.0000, 18.580…
## $ X2 <dbl> 38674.616, 4251.398, 2033.900, 2323.559, 89770.852, 30630.2…
## $ X3 <dbl> 172.75400, 89.61882, 57.05566, 19.74352, 275.61690, 1026.49…
## $ X4 <dbl> 0.68000, 1.44000, 22.35646, 5.81200, 0.00116, -0.15102, 1.2…
## $ X5 <dbl> 1.2206, 0.2562, 3.3422, 1.0568, 0.8402, 0.1535, 0.4835, 2.7…
## $ X6 <dbl> 1.78560, 4.74800, -0.87800, 7.39000, 1.88522, 4.62534, 1.62…
## $ X7 <dbl> -2.0843, 2.3318, -5.2032, 6.0712, 0.1733, 2.8078, -0.1226, …
## $ X8 <dbl> 55.00000, 166.80851, 34.81845, 78.40700, 82.50000, 68.15254…
## $ X9 <dbl> -26.52000, 47.27262, 15.44938, 4.91586, -152.44500, 456.486…
## $ X10 <dbl> 2.857862, 12.645460, 62.485865, 347.147671, 749.017673, 23.…
## $ X11 <dbl> 8.0000, 6.6000, 10.3000, 7.7000, 0.7500, 3.9000, 3.9000, 11…
## $ X12 <dbl> 23.08410, 19.40300, 31.12380, 32.70006, 24.41888, 17.90342,…
## $ X13 <dbl> 26.94344, 15.11172, 20.57210, 32.19390, 32.83370, 14.38300,…
## $ X14 <dbl> 3.0000, 18.5000, 10.5000, 5.0000, 3.1728, 7.8000, 4.8177, 1…
## $ Risk.Level <fct> low, high, high, high, low, high, low, high, low, low, high…
table(data6$Risk.Level)
##
## high low
## 54 46
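# Quick check (a sketch; output not shown): createDataPartition() stratifies on the
# outcome, so the class proportions in the train and test sets should be close to the
# full-data proportions above.
prop.table(table(train.data6index$Risk.Level))
prop.table(table(test.data6index$Risk.Level))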
# Fit a classification tree model with the rpart function
Xtree6 <- rpart(Risk.Level ~ ., data = train.data6index, method = "class", control = rpart.control(cp = 0.008, minsplit = 4))
Xtree6
## n= 81
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 81 37 high (0.54320988 0.45679012)
## 2) X2< 10001.82 36 2 high (0.94444444 0.05555556)
## 4) X11>=1.75185 35 1 high (0.97142857 0.02857143) *
## 5) X11< 1.75185 1 0 low (0.00000000 1.00000000) *
## 3) X2>=10001.82 45 10 low (0.22222222 0.77777778)
## 6) X2< 24962.76 18 9 high (0.50000000 0.50000000)
## 12) X10< 250.828 12 3 high (0.75000000 0.25000000)
## 24) X2>=11621.72 10 1 high (0.90000000 0.10000000) *
## 25) X2< 11621.72 2 0 low (0.00000000 1.00000000) *
## 13) X10>=250.828 6 0 low (0.00000000 1.00000000) *
## 7) X2>=24962.76 27 1 low (0.03703704 0.96296296) *
# Visualize the fitted tree
rpart.plot(Xtree6, extra=4,box.palette="RdBu", shadow.col="gray", nn=TRUE)
printcp(Xtree6)
##
## Classification tree:
## rpart(formula = Risk.Level ~ ., data = train.data6index, method = "class",
## control = rpart.control(cp = 0.008, minsplit = 4))
##
## Variables actually used in tree construction:
## [1] X10 X11 X2
##
## Root node error: 37/81 = 0.45679
##
## n= 81
##
## CP nsplit rel error xerror xstd
## 1 0.675676 0 1.000000 1.00000 0.121167
## 2 0.081081 1 0.324324 0.43243 0.096844
## 3 0.054054 3 0.162162 0.40541 0.094487
## 4 0.027027 4 0.108108 0.40541 0.094487
## 5 0.008000 5 0.081081 0.45946 0.099054
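# Optional pruning sketch (not part of the original model; `best.cp6` and
# `Xtree6.pruned` are illustrative names): prune at the CP value with the smallest
# cross-validated error in the table above.
best.cp6 <- Xtree6$cptable[which.min(Xtree6$cptable[, "xerror"]), "CP"]
Xtree6.pruned <- prune(Xtree6, cp = best.cp6)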
pred6 <- predict(Xtree6, test.data6index)
dim(pred6)
## [1] 19 2
head(pred6, n=5)
## high low
## 1 0.03703704 0.96296296
## 5 0.97142857 0.02857143
## 6 0.97142857 0.02857143
## 12 0.97142857 0.02857143
## 22 0.03703704 0.96296296
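# Equivalent shortcut (a sketch; `predClass6` is an illustrative name): predict()
# with type = "class" returns the class labels directly, so the manual threshold
# below is not strictly required.
predClass6 <- predict(Xtree6, test.data6index, type = "class")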
predRL6 <- ifelse(pred6[, 1] > 0.5, "high", "low")
predRL6
## 1 5 6 12 22 30 32 42 50 51 58
## "low" "high" "high" "high" "low" "low" "low" "high" "high" "high" "high"
## 62 66 68 71 73 75 93 94
## "low" "high" "high" "high" "high" "low" "high" "low"
confusionMatrix(as.factor(predRL6), as.factor(test.data6index$Risk.Level))
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low
## high 9 3
## low 1 6
##
## Accuracy : 0.7895
## 95% CI : (0.5443, 0.9395)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : 0.01722
##
## Kappa : 0.573
##
## Mcnemar's Test P-Value : 0.61708
##
## Sensitivity : 0.9000
## Specificity : 0.6667
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.8571
## Prevalence : 0.5263
## Detection Rate : 0.4737
## Detection Prevalence : 0.6316
## Balanced Accuracy : 0.7833
##
## 'Positive' Class : high
##
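# Supplementary evaluation (a sketch; output not shown): an ROC curve and AUC based
# on the predicted probability of the "high" class, using the pROC package loaded
# above.
roc6 <- roc(response = test.data6index$Risk.Level,
            predictor = pred6[, "high"],
            levels = c("low", "high"))
auc(roc6)
plot(roc6)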