library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(mlr3verse)
## Loading required package: mlr3
library(mlr3extralearners)
library(rpart.plot)
## Loading required package: rpart
library(cowplot)
## 
## Attaching package: 'cowplot'
## 
## The following object is masked from 'package:lubridate':
## 
##     stamp
library(randomForest)
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine
## 
## The following object is masked from 'package:ggplot2':
## 
##     margin

DATA Deskripsi Data Investment Risk adalah kemungkinan atau ketidakpastian kerugian daripada keuntungan yang diharapkan dari investasi karena jatuhnya harga wajar sekuritas seperti obligasi, saham, real estate, dan sebagainya Pada Analisisini digunakan data level risiko investasi dengan 14 peubah bebas yaitu : X1 : capital adequacy ratio (%) average from last 5 years X2 : GDP per capita (USD) X3 : Gross External Debt (% of GDP) average from last 5 years X4 : Growth of consumer price (%) average from last 5 years X5 : Growth of population (%) average from last 5 years X6 : Growth of Real GDP (%) average from last 5 years X7 : Growth of Real GDP per cap. (%) average from last 5 years X8 : Loan-deposit ratio (%) average from last 5 years X9 : Net External Debt (% of GDP) average from last 5 years X10 : Nominal GDP (USD bn) X11 : Non-performing loans (% of gross loans) average from last 5 years X12 : percentage of gross domestic investment to GDP (%) average from last 5 years X13 : Percentage of gross domestic saving to GDP (%) average from last 5 years X14 : Unemployment rate (% labour force) average from last 5 years

dengan perubah respon yaitu risk level terdiri atas dua kategori (high,low)

Input Data

investment_risk <- read.csv("risiko_investasi.csv",stringsAsFactors = TRUE)
glimpse(investment_risk)
## Rows: 117
## Columns: 16
## $ Country    <fct> AD, AE, AE-AZ, AE-RK, AM, AO, AR, AT, AU, AW, AZ, BD, BE, B…
## $ X1         <dbl> 17.5000, 18.2000, 18.7000, NA, 14.0000, NA, 23.2527, 18.574…
## $ X2         <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 2033.…
## $ X3         <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.0556…
## $ X4         <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36.7…
## $ X5         <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0.7…
## $ X6         <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0.2…
## $ X7         <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.729…
## $ X8         <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.81…
## $ X9         <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.449…
## $ X10        <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 62.…
## $ X11        <dbl> 8.0000, 8.1550, 8.1550, NA, 6.6000, 10.3000, 10.6000, 2.019…
## $ X12        <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.12380,…
## $ X13        <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.57210,…
## $ X14        <dbl> 3.0000, 2.4500, NA, NA, 18.5000, 10.5000, 11.0500, 6.0000, …
## $ Risk.Level <fct> low, low, low, low, high, high, high, low, low, high, high,…

Eksplorasi Data

skimr::skim(investment_risk)
Data summary
Name investment_risk
Number of rows 117
Number of columns 16
_______________________
Column type frequency:
factor 2
numeric 14
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Country 1 0.99 FALSE 115 LS: 2, AD: 1, AE: 1, AE-: 1
Risk.Level 0 1.00 FALSE 2 hig: 64, low: 53

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
X1 13 0.89 18.73 5.18 4.20 15.93 18.35 21.64 47.50 ▁▇▃▁▁
X2 0 1.00 22596.35 24598.74 434.46 4223.46 11363.61 34641.26 124340.38 ▇▂▂▁▁
X3 0 1.00 177.50 646.02 13.63 43.11 70.35 117.76 6908.35 ▇▁▁▁▁
X4 0 1.00 3.44 4.94 -0.15 0.84 1.70 4.21 36.70 ▇▁▁▁▁
X5 0 1.00 1.15 1.05 -0.89 0.38 1.05 1.80 4.40 ▃▇▅▂▁
X6 0 1.00 3.06 2.20 -5.14 1.75 2.84 4.26 10.08 ▁▂▇▃▁
X7 0 1.00 0.22 2.50 -9.85 -1.11 0.29 1.92 6.07 ▁▁▆▇▂
X8 9 0.92 99.47 41.28 34.82 75.98 90.02 113.28 359.14 ▇▅▁▁▁
X9 0 1.00 -14.34 202.88 -1955.72 -16.23 12.94 33.35 456.49 ▁▁▁▂▇
X10 0 1.00 710.34 2434.56 1.17 34.54 107.80 375.19 20935.00 ▇▁▁▁▁
X11 21 0.82 6.31 8.92 0.34 1.84 3.60 7.93 63.50 ▇▁▁▁▁
X12 0 1.00 24.52 6.52 12.67 20.16 23.08 27.97 46.83 ▃▇▃▁▁
X13 0 1.00 24.36 8.19 8.88 18.42 24.23 29.25 55.09 ▅▇▆▁▁
X14 12 0.90 8.52 5.68 0.12 4.82 7.00 10.30 33.70 ▇▆▂▁▁

Dari Output diatas terdapat dengan total 55 missing data yang terbagi atas : - Variabel X1 = 13 - Variabel X8 = 9 - Variabel X11 = 21 - Variabel X14 = 12

Menghilangkan Var Country Variabel ini dihilangkan karena tidak digunakan dalam analisis

investment_risk2 <- investment_risk %>% select(-Country)
head(investment_risk2)
##     X1        X2        X3       X4     X5       X6      X7        X8        X9
## 1 17.5 38674.616 172.75400  0.68000 1.2206  1.78560 -2.0843  55.00000 -26.52000
## 2 18.2 40105.120 103.52280  1.76600 0.8698  2.65884 -0.7254 102.52738 -13.59890
## 3 18.7 76037.997  31.03626  2.63056 1.4893  1.85034 -1.9008 102.52738 -56.24160
## 4   NA 27882.829  24.78532  1.29416 1.7530  2.23192 -1.1355 102.52738  24.78532
## 5 14.0  4251.398  89.61882  1.44000 0.2562  4.74800  2.3318 166.80851  47.27262
## 6   NA  2033.900  57.05566 22.35646 3.3422 -0.87800 -5.2032  34.81845  15.44938
##          X10    X11      X12      X13   X14 Risk.Level
## 1   2.857862  8.000 23.08410 26.94344  3.00        low
## 2 352.910575  8.155 24.85976 32.47740  2.45        low
## 3 199.928422  8.155 20.39940 31.03926    NA        low
## 4  10.108892     NA 21.69104 17.30888    NA        low
## 5  12.645460  6.600 19.40300 15.11172 18.50       high
## 6  62.485865 10.300 31.12380 20.57210 10.50       high

Boxplot

x1_y <- ggplot(data = investment_risk, mapping = aes(x = X1, y = Risk.Level)) + geom_boxplot()
x2_y <- ggplot(data = investment_risk, mapping = aes(x = X2, y = Risk.Level)) + 
  geom_boxplot()

x3_y <- ggplot(data = investment_risk, mapping = aes(x = X3, y = Risk.Level)) + 
  geom_boxplot()

x4_y <- ggplot(data = investment_risk, mapping = aes(x = X4, y = Risk.Level)) + 
  geom_boxplot()

x5_y <- ggplot(data = investment_risk, mapping = aes(x = X5, y = Risk.Level)) + 
  geom_boxplot()

x6_y <- ggplot(data = investment_risk, mapping = aes(x = X6, y = Risk.Level)) + 
  geom_boxplot()

x7_y <- ggplot(data = investment_risk, mapping = aes(x = X7, y = Risk.Level)) + 
  geom_boxplot()

x8_y <- ggplot(data = investment_risk, mapping = aes(x = X8, y = Risk.Level)) + 
  geom_boxplot()

x9_y <- ggplot(data = investment_risk, mapping = aes(x = X9, y = Risk.Level)) + 
  geom_boxplot()

x10_y <- ggplot(data = investment_risk, mapping = aes(x = X10, y = Risk.Level)) + 
  geom_boxplot()

x11_y <- ggplot(data = investment_risk, mapping = aes(x = X11, y = Risk.Level)) + 
  geom_boxplot()

x12_y <- ggplot(data = investment_risk, mapping = aes(x = X12, y = Risk.Level)) + 
  geom_boxplot()

x13_y <- ggplot(data = investment_risk, mapping = aes(x = X13, y = Risk.Level)) + 
  geom_boxplot()

x14_y <- ggplot(data = investment_risk, mapping = aes(x = X14, y = Risk.Level)) + 
  geom_boxplot()

plot_grid(x1_y, x2_y, x3_y, x4_y)
## Warning: Removed 13 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

plot_grid(x5_y, x6_y, x7_y, x8_y, x9_y)
## Warning: Removed 9 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

plot_grid(x10_y, x11_y, x12_y, x13_y, x14_y)
## Warning: Removed 21 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

MANAJEMEN DATA salah satu langkah dalam mengatasi missing data adalah melakukan pendugaan dengan mengganti data hilang tersebut dengan nilai median pada variabel tersebut

investment_risk2$X1 = ifelse(is.na(investment_risk$X1),
                            ave(investment_risk$X1, FUN = function(x) median(investment_risk$X1, na.rm = TRUE)), investment_risk$X1)

investment_risk2$X8 = ifelse(is.na(investment_risk$X8),
                            ave(investment_risk$X8, FUN = function(x) median(investment_risk$X8, na.rm = TRUE)), investment_risk$X8)


investment_risk2$X11 = ifelse(is.na(investment_risk$X11),
                            ave(investment_risk$X11, FUN = function(x) median(investment_risk$X11, na.rm = TRUE)), investment_risk$X11)

investment_risk2$X14 = ifelse(is.na(investment_risk$X14),
                            ave(investment_risk$X14, FUN = function(x) median(investment_risk$X14, na.rm = TRUE)), investment_risk$X14)

Cek Missing Data

sum(is.na(investment_risk2))
## [1] 0
freqrisk <- as.data.frame(table(investment_risk2$Risk.Level))
freqrisk
##   Var1 Freq
## 1 high   64
## 2  low   53

VISUALISASI DATA Bar Chart

freqrisk$persen <- freqrisk$Freq/sum(freqrisk$Freq)

ggplot(data = freqrisk, mapping = aes(x = Var1, y = Freq)) + 
  geom_col(aes(fill =Var1), alpha = 0.7) + 
  labs(title = "Level Risiko Investasi",
       x = "Level Risiko",
       y = "Frekuensi") + 
  geom_text(aes(label = paste0(round(persen*100, 2), "%")), vjust = -0.25) +
  theme(legend.position = "none")

Berdasarkan diagram batang tersebut dapat dilihat bahwa 54,7% atau 64 negara berada pada level risiko investasi tinggi (high-risk investment) dan 45,3% negara berada pada level risiko investasi yang rendah (low-risk investment). Artinya lebih banyak negara yang berada pada level risiko investasi tinggi dibandingkan dengan negara yang berada pada level risiko investasi rendah

RANDOM FOREST Prepocessing

#import data ke mlr3
task_risk = TaskClassif$new(id="country",backend = investment_risk2,
                              target = "Risk.Level",positive ="low")

Spliting Data data dibagi menjadi 2 bagian yaitu 80% sebagai data training dan 20% sebagai data latih

set.seed(234)
resample_holdout = rsmp("holdout", ratio = 0.8)
resample_holdout$instantiate(task=task_risk)

Model

model_rf <- lrn("classif.ranger", predict_type="prob",importance="impurity")

Interpretasi Model

model_rf$train(task=task_risk)

importance<- data.frame(Predictors = names(model_rf$model$variable.importance),
                        impurity = model_rf$model$variable.importance
                        )

rownames(importance) <- NULL

importance %>% arrange(desc(impurity))
##    Predictors  impurity
## 1          X2 13.411908
## 2         X10  5.302743
## 3         X11  5.131209
## 4          X9  4.788739
## 5          X4  4.180021
## 6         X14  3.677466
## 7         X13  3.334789
## 8          X3  2.925997
## 9         X12  1.679471
## 10         X1  1.618101
## 11         X5  1.562178
## 12         X8  1.178608
## 13         X7  1.075028
## 14         X6  1.043877
ggplot(importance,
       aes(x=impurity,
           y=reorder(Predictors,impurity))
       ) +
  geom_col(fill = "pink")+
  geom_text(aes(label = round(impurity,2)),hjust=1.2)

Variabel X2 yaitu GDP per capita (USD) merupakan variabel paling berpengaruh terhadap risiko investasi sedangkan Variabel X6 yaitu Growth of Real GDP (%) average from last 5 years merupakan variabel paling sedikit memberikan pengaruh terdapat risiko investasi dibandingkan variabel yang lain

Training

set.seed(234)
train_test_rf = resample(task = task_risk,
                         learner = model_rf,
                         resampling = resample_holdout,
                         store_models = TRUE)
## INFO  [13:27:09.001] [mlr3] Applying learner 'classif.ranger' on task 'country' (iter 1/1)

Testing

data_testing = as.data.table(train_test_rf$prediction())
data_testing
##     row_ids  truth response   prob.low  prob.high
##       <int> <fctr>   <fctr>      <num>      <num>
##  1:       3    low      low 0.78600794 0.21399206
##  2:       9    low      low 0.89465476 0.10534524
##  3:      13    low      low 0.92694921 0.07305079
##  4:      17   high     high 0.30266190 0.69733810
##  5:      22    low      low 0.94354365 0.05645635
##  6:      27   high     high 0.12660635 0.87339365
##  7:      32    low      low 0.94916032 0.05083968
##  8:      37   high     high 0.16258810 0.83741190
##  9:      38    low      low 0.52935873 0.47064127
## 10:      41    low      low 0.83743413 0.16256587
## 11:      44   high     high 0.15739841 0.84260159
## 12:      50    low      low 0.80437381 0.19562619
## 13:      53    low      low 0.87608333 0.12391667
## 14:      59   high     high 0.05473730 0.94526270
## 15:      62    low      low 0.86980714 0.13019286
## 16:      67   high     high 0.18334762 0.81665238
## 17:      81   high     high 0.09180794 0.90819206
## 18:      83    low      low 0.91432302 0.08567698
## 19:      86   high     high 0.38120873 0.61879127
## 20:      95   high      low 0.59477460 0.40522540
## 21:      98   high     high 0.10395556 0.89604444
## 22:     110    low      low 0.87265952 0.12734048
## 23:     112   high     high 0.10894921 0.89105079
##     row_ids  truth response   prob.low  prob.high

EVALUASI MODEL Confusion Matrix

train_test_rf$prediction()$confusion
##         truth
## response low high
##     low   12    1
##     high   0   10

Dari conclusing matrix dapat disimpulkan bahwa : - True Positive = 12, artinya ada 12 negara yang memiliki kategori investasi rendah dan diprediksi rendah - True Negative = 10, artinya ada 10 negara yang memiliki kategori investasi tinggi dan diprediksi tinggi - False Positive = 0, artinya tidak ada kesalahan dalam memprediksi kategori investasi rendah yang seharusnya tinggi - False Negative = 1, artinya ada 1 negara yang salah dalam prediksi yaitu kategori investasi tinggi padahal seharusnya rendah

Akurasi

akurasi_rf <- train_test_rf$aggregate(list(msr("classif.acc"),
                        msr("classif.specificity"),
                        msr("classif.sensitivity"
                            )))
akurasi_rf
##         classif.acc classif.specificity classif.sensitivity 
##           0.9565217           0.9090909           1.0000000

Dari output diatas dapat disimpulkan bahwa nilai akurasi yang tinggi sekitar 95.65%

ROC

autoplot(train_test_rf$prediction(), type = "roc")

Kurva ROC menunjukkan trade-off antara True Positive Rate (TPR) dan False Positive Rate (FPR) pada berbagai ambang batas klasifikasi. Semakin besar luas area di bawah kurva ROC maka kemampuan prediksi yang dihasilkan semakin baik

AUC

train_test_rf$aggregate(msr("classif.auc"))
## classif.auc 
##   0.9924242

Nilai AUC atau Area Under Curve didapat dengan menghitung luas area dibawah kurva. Model dengan nilai AUC 0.9924242 memiliki arti bahwa performa sangat baik

Prediksi Data Baru

#Misal akan dilakukan prediksi apakah suatu negara memiliki tingkat risiko investasi yang rendah / tinggi dan diketahui negara tersebut memiliki Var x sebagai berikut : 

risklevel_baru <- data.frame(
  X1 = 20,
  X2 = 9203,
  X3 = 90,
  X4 = 22,
  X5 = 1,
  X6 = 3,
  X7 = 0,
  X8 = 90,
  X9 = 15,
  X10 = 450,
  X11 = 8,
  X12 = 23, 
  X13 = 26,
  X14 = 5)
prediksi <- predict(model_rf, newdata = risklevel_baru)
prediksi
## [1] high
## Levels: low high

Hasil dari prediksi data tersebut menunjukkan bahwa negara tersebut masuk kedalam kategori level risiko investasi yang tinggi