library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlr3verse)
## Loading required package: mlr3
library(mlr3extralearners)
library(rpart.plot)
## Loading required package: rpart
library(cowplot)
##
## Attaching package: 'cowplot'
##
## The following object is masked from 'package:lubridate':
##
## stamp
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:dplyr':
##
## combine
##
## The following object is masked from 'package:ggplot2':
##
## margin
DATA Deskripsi Data Investment Risk adalah kemungkinan atau ketidakpastian kerugian daripada keuntungan yang diharapkan dari investasi karena jatuhnya harga wajar sekuritas seperti obligasi, saham, real estate, dan sebagainya Pada Analisisini digunakan data level risiko investasi dengan 14 peubah bebas yaitu : X1 : capital adequacy ratio (%) average from last 5 years X2 : GDP per capita (USD) X3 : Gross External Debt (% of GDP) average from last 5 years X4 : Growth of consumer price (%) average from last 5 years X5 : Growth of population (%) average from last 5 years X6 : Growth of Real GDP (%) average from last 5 years X7 : Growth of Real GDP per cap. (%) average from last 5 years X8 : Loan-deposit ratio (%) average from last 5 years X9 : Net External Debt (% of GDP) average from last 5 years X10 : Nominal GDP (USD bn) X11 : Non-performing loans (% of gross loans) average from last 5 years X12 : percentage of gross domestic investment to GDP (%) average from last 5 years X13 : Percentage of gross domestic saving to GDP (%) average from last 5 years X14 : Unemployment rate (% labour force) average from last 5 years
dengan perubah respon yaitu risk level terdiri atas dua kategori (high,low)
Input Data
investment_risk <- read.csv("risiko_investasi.csv",stringsAsFactors = TRUE)
glimpse(investment_risk)
## Rows: 117
## Columns: 16
## $ Country <fct> AD, AE, AE-AZ, AE-RK, AM, AO, AR, AT, AU, AW, AZ, BD, BE, B…
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, NA, 14.0000, NA, 23.2527, 18.574…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 2033.…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.0556…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36.7…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0.7…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0.2…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.729…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.81…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.449…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 62.…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, NA, 6.6000, 10.3000, 10.6000, 2.019…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.12380,…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.57210,…
## $ X14 <dbl> 3.0000, 2.4500, NA, NA, 18.5000, 10.5000, 11.0500, 6.0000, …
## $ Risk.Level <fct> low, low, low, low, high, high, high, low, low, high, high,…
Eksplorasi Data
skimr::skim(investment_risk)
| Name | investment_risk |
| Number of rows | 117 |
| Number of columns | 16 |
| _______________________ | |
| Column type frequency: | |
| factor | 2 |
| numeric | 14 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Country | 1 | 0.99 | FALSE | 115 | LS: 2, AD: 1, AE: 1, AE-: 1 |
| Risk.Level | 0 | 1.00 | FALSE | 2 | hig: 64, low: 53 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| X1 | 13 | 0.89 | 18.73 | 5.18 | 4.20 | 15.93 | 18.35 | 21.64 | 47.50 | ▁▇▃▁▁ |
| X2 | 0 | 1.00 | 22596.35 | 24598.74 | 434.46 | 4223.46 | 11363.61 | 34641.26 | 124340.38 | ▇▂▂▁▁ |
| X3 | 0 | 1.00 | 177.50 | 646.02 | 13.63 | 43.11 | 70.35 | 117.76 | 6908.35 | ▇▁▁▁▁ |
| X4 | 0 | 1.00 | 3.44 | 4.94 | -0.15 | 0.84 | 1.70 | 4.21 | 36.70 | ▇▁▁▁▁ |
| X5 | 0 | 1.00 | 1.15 | 1.05 | -0.89 | 0.38 | 1.05 | 1.80 | 4.40 | ▃▇▅▂▁ |
| X6 | 0 | 1.00 | 3.06 | 2.20 | -5.14 | 1.75 | 2.84 | 4.26 | 10.08 | ▁▂▇▃▁ |
| X7 | 0 | 1.00 | 0.22 | 2.50 | -9.85 | -1.11 | 0.29 | 1.92 | 6.07 | ▁▁▆▇▂ |
| X8 | 9 | 0.92 | 99.47 | 41.28 | 34.82 | 75.98 | 90.02 | 113.28 | 359.14 | ▇▅▁▁▁ |
| X9 | 0 | 1.00 | -14.34 | 202.88 | -1955.72 | -16.23 | 12.94 | 33.35 | 456.49 | ▁▁▁▂▇ |
| X10 | 0 | 1.00 | 710.34 | 2434.56 | 1.17 | 34.54 | 107.80 | 375.19 | 20935.00 | ▇▁▁▁▁ |
| X11 | 21 | 0.82 | 6.31 | 8.92 | 0.34 | 1.84 | 3.60 | 7.93 | 63.50 | ▇▁▁▁▁ |
| X12 | 0 | 1.00 | 24.52 | 6.52 | 12.67 | 20.16 | 23.08 | 27.97 | 46.83 | ▃▇▃▁▁ |
| X13 | 0 | 1.00 | 24.36 | 8.19 | 8.88 | 18.42 | 24.23 | 29.25 | 55.09 | ▅▇▆▁▁ |
| X14 | 12 | 0.90 | 8.52 | 5.68 | 0.12 | 4.82 | 7.00 | 10.30 | 33.70 | ▇▆▂▁▁ |
Dari Output diatas terdapat dengan total 55 missing data yang terbagi atas : - Variabel X1 = 13 - Variabel X8 = 9 - Variabel X11 = 21 - Variabel X14 = 12
Menghilangkan Var Country Variabel ini dihilangkan karena tidak digunakan dalam analisis
investment_risk2 <- investment_risk %>% select(-Country)
head(investment_risk2)
## X1 X2 X3 X4 X5 X6 X7 X8 X9
## 1 17.5 38674.616 172.75400 0.68000 1.2206 1.78560 -2.0843 55.00000 -26.52000
## 2 18.2 40105.120 103.52280 1.76600 0.8698 2.65884 -0.7254 102.52738 -13.59890
## 3 18.7 76037.997 31.03626 2.63056 1.4893 1.85034 -1.9008 102.52738 -56.24160
## 4 NA 27882.829 24.78532 1.29416 1.7530 2.23192 -1.1355 102.52738 24.78532
## 5 14.0 4251.398 89.61882 1.44000 0.2562 4.74800 2.3318 166.80851 47.27262
## 6 NA 2033.900 57.05566 22.35646 3.3422 -0.87800 -5.2032 34.81845 15.44938
## X10 X11 X12 X13 X14 Risk.Level
## 1 2.857862 8.000 23.08410 26.94344 3.00 low
## 2 352.910575 8.155 24.85976 32.47740 2.45 low
## 3 199.928422 8.155 20.39940 31.03926 NA low
## 4 10.108892 NA 21.69104 17.30888 NA low
## 5 12.645460 6.600 19.40300 15.11172 18.50 high
## 6 62.485865 10.300 31.12380 20.57210 10.50 high
Boxplot
x1_y <- ggplot(data = investment_risk, mapping = aes(x = X1, y = Risk.Level)) + geom_boxplot()
x2_y <- ggplot(data = investment_risk, mapping = aes(x = X2, y = Risk.Level)) +
geom_boxplot()
x3_y <- ggplot(data = investment_risk, mapping = aes(x = X3, y = Risk.Level)) +
geom_boxplot()
x4_y <- ggplot(data = investment_risk, mapping = aes(x = X4, y = Risk.Level)) +
geom_boxplot()
x5_y <- ggplot(data = investment_risk, mapping = aes(x = X5, y = Risk.Level)) +
geom_boxplot()
x6_y <- ggplot(data = investment_risk, mapping = aes(x = X6, y = Risk.Level)) +
geom_boxplot()
x7_y <- ggplot(data = investment_risk, mapping = aes(x = X7, y = Risk.Level)) +
geom_boxplot()
x8_y <- ggplot(data = investment_risk, mapping = aes(x = X8, y = Risk.Level)) +
geom_boxplot()
x9_y <- ggplot(data = investment_risk, mapping = aes(x = X9, y = Risk.Level)) +
geom_boxplot()
x10_y <- ggplot(data = investment_risk, mapping = aes(x = X10, y = Risk.Level)) +
geom_boxplot()
x11_y <- ggplot(data = investment_risk, mapping = aes(x = X11, y = Risk.Level)) +
geom_boxplot()
x12_y <- ggplot(data = investment_risk, mapping = aes(x = X12, y = Risk.Level)) +
geom_boxplot()
x13_y <- ggplot(data = investment_risk, mapping = aes(x = X13, y = Risk.Level)) +
geom_boxplot()
x14_y <- ggplot(data = investment_risk, mapping = aes(x = X14, y = Risk.Level)) +
geom_boxplot()
plot_grid(x1_y, x2_y, x3_y, x4_y)
## Warning: Removed 13 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
plot_grid(x5_y, x6_y, x7_y, x8_y, x9_y)
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
plot_grid(x10_y, x11_y, x12_y, x13_y, x14_y)
## Warning: Removed 21 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
MANAJEMEN DATA salah satu langkah dalam mengatasi missing data adalah melakukan pendugaan dengan mengganti data hilang tersebut dengan nilai median pada variabel tersebut
investment_risk2$X1 = ifelse(is.na(investment_risk$X1),
ave(investment_risk$X1, FUN = function(x) median(investment_risk$X1, na.rm = TRUE)), investment_risk$X1)
investment_risk2$X8 = ifelse(is.na(investment_risk$X8),
ave(investment_risk$X8, FUN = function(x) median(investment_risk$X8, na.rm = TRUE)), investment_risk$X8)
investment_risk2$X11 = ifelse(is.na(investment_risk$X11),
ave(investment_risk$X11, FUN = function(x) median(investment_risk$X11, na.rm = TRUE)), investment_risk$X11)
investment_risk2$X14 = ifelse(is.na(investment_risk$X14),
ave(investment_risk$X14, FUN = function(x) median(investment_risk$X14, na.rm = TRUE)), investment_risk$X14)
Cek Missing Data
sum(is.na(investment_risk2))
## [1] 0
freqrisk <- as.data.frame(table(investment_risk2$Risk.Level))
freqrisk
## Var1 Freq
## 1 high 64
## 2 low 53
VISUALISASI DATA Bar Chart
freqrisk$persen <- freqrisk$Freq/sum(freqrisk$Freq)
ggplot(data = freqrisk, mapping = aes(x = Var1, y = Freq)) +
geom_col(aes(fill =Var1), alpha = 0.7) +
labs(title = "Level Risiko Investasi",
x = "Level Risiko",
y = "Frekuensi") +
geom_text(aes(label = paste0(round(persen*100, 2), "%")), vjust = -0.25) +
theme(legend.position = "none")
Berdasarkan diagram batang tersebut dapat dilihat bahwa 54,7% atau 64
negara berada pada level risiko investasi tinggi (high-risk investment)
dan 45,3% negara berada pada level risiko investasi yang rendah
(low-risk investment). Artinya lebih banyak negara yang berada pada
level risiko investasi tinggi dibandingkan dengan negara yang berada
pada level risiko investasi rendah
RANDOM FOREST Prepocessing
#import data ke mlr3
task_risk = TaskClassif$new(id="country",backend = investment_risk2,
target = "Risk.Level",positive ="low")
Spliting Data data dibagi menjadi 2 bagian yaitu 80% sebagai data training dan 20% sebagai data latih
set.seed(234)
resample_holdout = rsmp("holdout", ratio = 0.8)
resample_holdout$instantiate(task=task_risk)
Model
model_rf <- lrn("classif.ranger", predict_type="prob",importance="impurity")
Interpretasi Model
model_rf$train(task=task_risk)
importance<- data.frame(Predictors = names(model_rf$model$variable.importance),
impurity = model_rf$model$variable.importance
)
rownames(importance) <- NULL
importance %>% arrange(desc(impurity))
## Predictors impurity
## 1 X2 13.411908
## 2 X10 5.302743
## 3 X11 5.131209
## 4 X9 4.788739
## 5 X4 4.180021
## 6 X14 3.677466
## 7 X13 3.334789
## 8 X3 2.925997
## 9 X12 1.679471
## 10 X1 1.618101
## 11 X5 1.562178
## 12 X8 1.178608
## 13 X7 1.075028
## 14 X6 1.043877
ggplot(importance,
aes(x=impurity,
y=reorder(Predictors,impurity))
) +
geom_col(fill = "pink")+
geom_text(aes(label = round(impurity,2)),hjust=1.2)
Variabel X2 yaitu GDP per capita (USD) merupakan variabel paling
berpengaruh terhadap risiko investasi sedangkan Variabel X6 yaitu Growth
of Real GDP (%) average from last 5 years merupakan variabel paling
sedikit memberikan pengaruh terdapat risiko investasi dibandingkan
variabel yang lain
Training
set.seed(234)
train_test_rf = resample(task = task_risk,
learner = model_rf,
resampling = resample_holdout,
store_models = TRUE)
## INFO [13:27:09.001] [mlr3] Applying learner 'classif.ranger' on task 'country' (iter 1/1)
Testing
data_testing = as.data.table(train_test_rf$prediction())
data_testing
## row_ids truth response prob.low prob.high
## <int> <fctr> <fctr> <num> <num>
## 1: 3 low low 0.78600794 0.21399206
## 2: 9 low low 0.89465476 0.10534524
## 3: 13 low low 0.92694921 0.07305079
## 4: 17 high high 0.30266190 0.69733810
## 5: 22 low low 0.94354365 0.05645635
## 6: 27 high high 0.12660635 0.87339365
## 7: 32 low low 0.94916032 0.05083968
## 8: 37 high high 0.16258810 0.83741190
## 9: 38 low low 0.52935873 0.47064127
## 10: 41 low low 0.83743413 0.16256587
## 11: 44 high high 0.15739841 0.84260159
## 12: 50 low low 0.80437381 0.19562619
## 13: 53 low low 0.87608333 0.12391667
## 14: 59 high high 0.05473730 0.94526270
## 15: 62 low low 0.86980714 0.13019286
## 16: 67 high high 0.18334762 0.81665238
## 17: 81 high high 0.09180794 0.90819206
## 18: 83 low low 0.91432302 0.08567698
## 19: 86 high high 0.38120873 0.61879127
## 20: 95 high low 0.59477460 0.40522540
## 21: 98 high high 0.10395556 0.89604444
## 22: 110 low low 0.87265952 0.12734048
## 23: 112 high high 0.10894921 0.89105079
## row_ids truth response prob.low prob.high
EVALUASI MODEL Confusion Matrix
train_test_rf$prediction()$confusion
## truth
## response low high
## low 12 1
## high 0 10
Dari conclusing matrix dapat disimpulkan bahwa : - True Positive = 12, artinya ada 12 negara yang memiliki kategori investasi rendah dan diprediksi rendah - True Negative = 10, artinya ada 10 negara yang memiliki kategori investasi tinggi dan diprediksi tinggi - False Positive = 0, artinya tidak ada kesalahan dalam memprediksi kategori investasi rendah yang seharusnya tinggi - False Negative = 1, artinya ada 1 negara yang salah dalam prediksi yaitu kategori investasi tinggi padahal seharusnya rendah
Akurasi
akurasi_rf <- train_test_rf$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity"
)))
akurasi_rf
## classif.acc classif.specificity classif.sensitivity
## 0.9565217 0.9090909 1.0000000
Dari output diatas dapat disimpulkan bahwa nilai akurasi yang tinggi sekitar 95.65%
ROC
autoplot(train_test_rf$prediction(), type = "roc")
Kurva ROC menunjukkan trade-off antara True Positive Rate (TPR) dan
False Positive Rate (FPR) pada berbagai ambang batas klasifikasi.
Semakin besar luas area di bawah kurva ROC maka kemampuan prediksi yang
dihasilkan semakin baik
AUC
train_test_rf$aggregate(msr("classif.auc"))
## classif.auc
## 0.9924242
Nilai AUC atau Area Under Curve didapat dengan menghitung luas area dibawah kurva. Model dengan nilai AUC 0.9924242 memiliki arti bahwa performa sangat baik
Prediksi Data Baru
#Misal akan dilakukan prediksi apakah suatu negara memiliki tingkat risiko investasi yang rendah / tinggi dan diketahui negara tersebut memiliki Var x sebagai berikut :
risklevel_baru <- data.frame(
X1 = 20,
X2 = 9203,
X3 = 90,
X4 = 22,
X5 = 1,
X6 = 3,
X7 = 0,
X8 = 90,
X9 = 15,
X10 = 450,
X11 = 8,
X12 = 23,
X13 = 26,
X14 = 5)
prediksi <- predict(model_rf, newdata = risklevel_baru)
prediksi
## [1] high
## Levels: low high
Hasil dari prediksi data tersebut menunjukkan bahwa negara tersebut masuk kedalam kategori level risiko investasi yang tinggi