library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlr3verse)
## Loading required package: mlr3
library(mlr3learners)
library(rpart.plot)
## Loading required package: rpart
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
#memasukkan semua data,disini saya menggabungkan kedua data yaitu training dan testing jadi jumlahnya ada 117 data
investment_risk <- read.csv("Laporan_risikoinvestasi.csv",stringsAsFactors = TRUE)
glimpse(investment_risk)
## Rows: 117
## Columns: 16
## $ Country <fct> AD, AE, AE-AZ, AE-RK, AM, AO, AR, AT, AU, AW, AZ, BD, BE, B…
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, NA, 14.0000, NA, 23.2527, 18.574…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 2033.…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.0556…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36.7…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0.7…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0.2…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.729…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.81…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.449…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 62.…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, NA, 6.6000, 10.3000, 10.6000, 2.019…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.12380,…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.57210,…
## $ X14 <dbl> 3.0000, 2.4500, NA, NA, 18.5000, 10.5000, 11.0500, 6.0000, …
## $ Risk.Level <fct> low, low, low, low, high, high, high, low, low, high, high,…
#eksplorasi data
skimr::skim(investment_risk)
| Name | investment_risk |
| Number of rows | 117 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| factor | 2 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Country | 1 | 0.99 | FALSE | 115 | LS: 2, AD: 1, AE: 1, AE-: 1 |
| Risk.Level | 0 | 1.00 | FALSE | 2 | hig: 64, low: 53 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 13 | 0.89 | 18.73 | 5.18 | 4.20 | 15.93 | 18.35 | 21.64 | 47.50 | ▁▇▃▁▁ |
| X2 | 0 | 1.00 | 22596.35 | 24598.74 | 434.46 | 4223.46 | 11363.61 | 34641.26 | 124340.38 | ▇▂▂▁▁ |
| X3 | 0 | 1.00 | 177.50 | 646.02 | 13.63 | 43.11 | 70.35 | 117.76 | 6908.35 | ▇▁▁▁▁ |
| X4 | 0 | 1.00 | 3.44 | 4.94 | -0.15 | 0.84 | 1.70 | 4.21 | 36.70 | ▇▁▁▁▁ |
| X5 | 0 | 1.00 | 1.15 | 1.05 | -0.89 | 0.38 | 1.05 | 1.80 | 4.40 | ▃▇▅▂▁ |
| X6 | 0 | 1.00 | 3.06 | 2.20 | -5.14 | 1.75 | 2.84 | 4.26 | 10.08 | ▁▂▇▃▁ |
| X7 | 0 | 1.00 | 0.22 | 2.50 | -9.85 | -1.11 | 0.29 | 1.92 | 6.07 | ▁▁▆▇▂ |
| X8 | 9 | 0.92 | 99.47 | 41.28 | 34.82 | 75.98 | 90.02 | 113.28 | 359.14 | ▇▅▁▁▁ |
| X9 | 0 | 1.00 | -14.34 | 202.88 | -1955.72 | -16.23 | 12.94 | 33.35 | 456.49 | ▁▁▁▂▇ |
| X10 | 0 | 1.00 | 710.34 | 2434.56 | 1.17 | 34.54 | 107.80 | 375.19 | 20935.00 | ▇▁▁▁▁ |
| X11 | 21 | 0.82 | 6.31 | 8.92 | 0.34 | 1.84 | 3.60 | 7.93 | 63.50 | ▇▁▁▁▁ |
| X12 | 0 | 1.00 | 24.52 | 6.52 | 12.67 | 20.16 | 23.08 | 27.97 | 46.83 | ▃▇▃▁▁ |
| X13 | 0 | 1.00 | 24.36 | 8.19 | 8.88 | 18.42 | 24.23 | 29.25 | 55.09 | ▅▇▆▁▁ |
| X14 | 12 | 0.90 | 8.52 | 5.68 | 0.12 | 4.82 | 7.00 | 10.30 | 33.70 | ▇▆▂▁▁ |
#dari output yang didapat ternyata terdapat 55 missing data yaitu
# 13 darivariabel X1
# 9 dari variabel X8
# 21 dari variabel X11
# 12 dari variabel X14
#menghilangkan variabel country
investment_risk2 <- investment_risk %>% select(-Country)
head(investment_risk2)
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 17.5 38674.616 172.75400 0.68000 1.2206 1.78560 -2.0843 55.00000 -26.52000
## 2 18.2 40105.120 103.52280 1.76600 0.8698 2.65884 -0.7254 102.52738 -13.59890
## 3 18.7 76037.997 31.03626 2.63056 1.4893 1.85034 -1.9008 102.52738 -56.24160
## 4 NA 27882.829 24.78532 1.29416 1.7530 2.23192 -1.1355 102.52738 24.78532
## 5 14.0 4251.398 89.61882 1.44000 0.2562 4.74800 2.3318 166.80851 47.27262
## 6 NA 2033.900 57.05566 22.35646 3.3422 -0.87800 -5.2032 34.81845 15.44938
## X10 X11 X12 X13 X14 Risk.Level
## 1 2.857862 8.000 23.08410 26.94344 3.00 low
## 2 352.910575 8.155 24.85976 32.47740 2.45 low
## 3 199.928422 8.155 20.39940 31.03926 NA low
## 4 10.108892 NA 21.69104 17.30888 NA low
## 5 12.645460 6.600 19.40300 15.11172 18.50 high
## 6 62.485865 10.300 31.12380 20.57210 10.50 high
#gambaran boxplot dari 14 variabel
x1_y <- ggplot(data = investment_risk, mapping = aes(x = X1, y = Risk.Level)) +
geom_boxplot()
x2_y <- ggplot(data = investment_risk, mapping = aes(x = X2, y = Risk.Level)) +
geom_boxplot()
x3_y <- ggplot(data = investment_risk, mapping = aes(x = X3, y = Risk.Level)) +
geom_boxplot()
x4_y <- ggplot(data = investment_risk, mapping = aes(x = X4, y = Risk.Level)) +
geom_boxplot()
x5_y <- ggplot(data = investment_risk, mapping = aes(x = X5, y = Risk.Level)) +
geom_boxplot()
x6_y <- ggplot(data = investment_risk, mapping = aes(x = X6, y = Risk.Level)) +
geom_boxplot()
x7_y <- ggplot(data = investment_risk, mapping = aes(x = X7, y = Risk.Level)) +
geom_boxplot()
x8_y <- ggplot(data = investment_risk, mapping = aes(x = X8, y = Risk.Level)) +
geom_boxplot()
x9_y <- ggplot(data = investment_risk, mapping = aes(x = X9, y = Risk.Level)) +
geom_boxplot()
x10_y <- ggplot(data = investment_risk, mapping = aes(x = X10, y = Risk.Level)) +
geom_boxplot()
x11_y <- ggplot(data = investment_risk, mapping = aes(x = X11, y = Risk.Level)) +
geom_boxplot()
x12_y <- ggplot(data = investment_risk, mapping = aes(x = X12, y = Risk.Level)) +
geom_boxplot()
x13_y <- ggplot(data = investment_risk, mapping = aes(x = X13, y = Risk.Level)) +
geom_boxplot()
x14_y <- ggplot(data = investment_risk, mapping = aes(x = X14, y = Risk.Level)) +
geom_boxplot()
plot_grid(x1_y, x2_y, x3_y, x4_y, x5_y, x6_y, x7_y, x8_y, x9_y, x10_y, x11_y, x12_y, x13_y, x14_y)
## Warning: Removed 13 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 21 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
#dilakukan pendugaan dengan mengganti data hilang dengan nilai median pada variabel
investment_risk2$X1 = ifelse(is.na(investment_risk$X1),
ave(investment_risk$X1, FUN = function(x) median(investment_risk$X1, na.rm = TRUE)), investment_risk$X1)
investment_risk2$X8 = ifelse(is.na(investment_risk$X8),
ave(investment_risk$X8, FUN = function(x) median(investment_risk$X8, na.rm = TRUE)), investment_risk$X8)
investment_risk2$X11 = ifelse(is.na(investment_risk$X11),
ave(investment_risk$X11, FUN = function(x) median(investment_risk$X11, na.rm = TRUE)), investment_risk$X11)
investment_risk2$X14 = ifelse(is.na(investment_risk$X14),
ave(investment_risk$X14, FUN = function(x) median(investment_risk$X14, na.rm = TRUE)), investment_risk$X14)
# cek missing data
sum(is.na(investment_risk2))
## [1] 0
#menghitung jumlah low dan high yg terdapat pada data
freqrisk <- as.data.frame(table(investment_risk2$Risk.Level))
freqrisk
## Var1 Freq
## 1 high 64
## 2 low 53
#Visualisasi data dengan menggunakan bar chart
freqrisk$persen <- freqrisk$Freq/sum(freqrisk$Freq)
ggplot(data = freqrisk, mapping = aes(x = Var1, y = Freq)) +
geom_col(aes(fill =Var1), alpha = 0.7) +
labs(title = "Investment Risk Level",
x = "Risk Level",
y = "Frekuensi") +
geom_text(aes(label = paste0(round(persen*100, 2), "%")), vjust = -0.25) +
theme(legend.position = "none")
#Berdasarkan bar chart diatas terlihat bahwa 54.7% atau 64 negara berada pada level risiko investasi yang tinggi (high-risk investment) dan 45.3% atau 53 negara berada pada level risiko investasi yang rendah (low-risk investment).
#prepocessing
#import data ke mlr3
task_risk = TaskClassif$new(id="country",backend = investment_risk2,
target = "Risk.Level",positive ="low")
#spliting data, data dibagi menjadi dua bagian yaitu 85% sebagai data training dan 15% sebagai data testing
set.seed(234)
resample_holdout =rsmp("holdout", ratio = 0.85)
resample_holdout$instantiate(task=task_risk)
#model yang digunakan yaitu random forest
model_rf <- lrn("classif.ranger", predict_type="prob",importance="impurity")
#interpretasi model
model_rf$train(task=task_risk)
importance<- data.frame(Predictors = names(model_rf$model$variable.importance),
impurity = model_rf$model$variable.importance
)
rownames(importance) <- NULL
importance %>% arrange(desc(impurity))
## Predictors impurity
## 1 X2 14.5911504
## 2 X10 5.4538524
## 3 X11 4.9822144
## 4 X9 4.7303949
## 5 X4 4.4428357
## 6 X14 3.6850741
## 7 X13 3.0534512
## 8 X3 2.6734131
## 9 X12 1.5999564
## 10 X5 1.5112317
## 11 X1 1.4038990
## 12 X8 1.1361569
## 13 X7 1.0696745
## 14 X6 0.7992691
#membuat grafik batang (bar plot)
ggplot(importance,
aes(x=impurity,
y=reorder(Predictors,impurity))
) +
geom_col(fill = "steelblue")+
geom_text(aes(label=round(impurity,2)),hjust=1.2)
Variabel X2 yaitu GDP per capita (USD) merupakan variable yang paling
berpengaruh terhadap resiko investasi, sedangkan X6 merupakan variable
yang paling tidak berpengaruh terhadap resiko investasi dibandingkan
variabel lainnya.
#resampling data training dan testing
set.seed(234)
train_test_rf = resample(task = task_risk,
learner = model_rf,
resampling = resample_holdout,
store_models = TRUE
)
## INFO [13:09:51.822] [mlr3] Applying learner 'classif.ranger' on task 'country' (iter 1/1)
#mengonversi hasil prediksi dari proses resampling ke dalam bentuk tabel data agar lebih mudah untuk dianalisis
data_testing = as.data.table(train_test_rf$prediction())
data_testing
## row_ids truth response prob.low prob.high
## <int> <fctr> <fctr> <num> <num>
## 1: 9 low low 0.9237595 0.07624048
## 2: 13 low low 0.9398468 0.06015317
## 3: 17 high high 0.2983238 0.70167619
## 4: 22 low low 0.9560151 0.04398492
## 5: 27 high high 0.1721976 0.82780238
## 6: 37 high high 0.1749087 0.82509127
## 7: 38 low low 0.5970817 0.40291825
## 8: 41 low low 0.8542214 0.14577857
## 9: 44 high high 0.1435373 0.85646270
## 10: 50 low low 0.7980500 0.20195000
## 11: 53 low low 0.8967770 0.10322302
## 12: 62 low low 0.8965794 0.10342063
## 13: 67 high high 0.1755571 0.82444286
## 14: 81 high high 0.1002175 0.89978254
## 15: 86 high high 0.4002913 0.59970873
## 16: 95 high low 0.6796262 0.32037381
## 17: 98 high high 0.1121929 0.88780714
## 18: 112 high high 0.1033683 0.89663175
#evaluasi model menggunakan confusion matrix
train_test_rf$prediction()$confusion
## truth
## response low high
## low 8 1
## high 0 9
True Positif = 8, artinya ada 8 negara yang memiliki kategori investasi rendah dan diprediksi rendah. True Negative = 9, artinya ada 9 negara yang memiliki kategori investasi tinggi dan diprediksi tinggi. False Positive = 0 artinya tidak ada keslahan dalam memprediksi kategori investasi rendah yang seharunya tinggi False Negative = 1, artinya ada 1 nagara yang salah dalam prediksi yaitu kategori investasi tinggi padahal seharusnya rendah
akurasi_rf <- train_test_rf$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
akurasi_rf
## classif.acc classif.specificity classif.sensitivity
## 0.9444444 0.9000000 1.0000000
nilai akurasi yang didapat lumayan tinggi yaitu 94.44%
autoplot(train_test_rf$prediction(), type = "roc")
Kurva ROC menunjukkan trade-off antara True Positive Rate (TPR) dan
False Positive Rate (FPR) pada berbagai ambang batas klasifikasi.
Semakin besar luas area di bawah kurva ROC maka kemampuan prediksi yang
dihasilkan semakin baik.
#nilai AUC atau Area Under Curve didapat dengan menghitung luas area dibawah kurva AUC
train_test_rf$aggregate((msr("classif.auc")))
## classif.auc
## 0.9875
Model dengan nilai AUC 0.9875 menunjukkan performa yang sangat baik.
#hasil prediksi
risklevel_baru <- data.frame(
X1 = 20,
X2 = 9203,
X3 = 90,
X4 = 22,
X5 = 1,
X6 = 3,
X7 = 0,
X8 = 90,
X9 = 15,
X10 = 450,
X11 = 8,
X12 = 23,
X13 = 26,
X14 = 5)
prediksi <- predict(model_rf, newdata = risklevel_baru)
prediksi
## [1] high
## Levels: low high
Hasil prediksi menunjukan bahwa negara tersebut masuk dalam kelompok tingkat investasi risiko yang tinggi