Tugas 2 STA581

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --

## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.3     v dplyr   1.0.7
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   2.0.1     v forcats 0.5.1

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(readxl)
library(mlr3)
library(mlr3verse)
library(mlr3learners)
library(ggpubr)

Deskripsi Data

Data tersebut berisikan 117 negara dengan 11 variabel berdasarkan tingkat resiko investasi berdasarkan negara di seluruh dunia. Mereka ingin memperoleh model yang dapat digunakan untuk memprediksi negara mana saja yang berpotensi memilikki tingkat resiko investasi tinggi ataupun rendah.

Read Data

data_invesment <- read_excel("D:/Tugas 2 STA 581.xlsx")

data_invesment$'Risk_Level' <- as.factor(data_invesment$'Risk_Level')

summary(data_invesment)

##        X1              X2                 X3                X4         
##  Min.   : 4.20   Min.   :   434.5   Min.   :  13.63   Min.   :-0.1510  
##  1st Qu.:15.93   1st Qu.:  4223.5   1st Qu.:  43.11   1st Qu.: 0.8435  
##  Median :18.35   Median : 11363.6   Median :  70.35   Median : 1.6986  
##  Mean   :18.73   Mean   : 22596.3   Mean   : 177.50   Mean   : 3.4418  
##  3rd Qu.:21.64   3rd Qu.: 34641.3   3rd Qu.: 117.76   3rd Qu.: 4.2064  
##  Max.   :47.50   Max.   :124340.4   Max.   :6908.35   Max.   :36.7035  
##  NA's   :13                                                            
##        X5                X6               X7                X8        
##  Min.   :-0.8862   Min.   :-5.135   Min.   :-9.8453   Min.   : 34.82  
##  1st Qu.: 0.3751   1st Qu.: 1.754   1st Qu.:-1.1137   1st Qu.: 75.98  
##  Median : 1.0511   Median : 2.844   Median : 0.2912   Median : 90.02  
##  Mean   : 1.1471   Mean   : 3.064   Mean   : 0.2206   Mean   : 99.47  
##  3rd Qu.: 1.8006   3rd Qu.: 4.258   3rd Qu.: 1.9242   3rd Qu.:113.28  
##  Max.   : 4.4021   Max.   :10.076   Max.   : 6.0712   Max.   :359.14  
##                                                       NA's   :9       
##        X9                X10                 X11               X12       
##  Min.   :-1955.72   Min.   :    1.171   Min.   : 0.3357   Min.   :12.67  
##  1st Qu.:  -16.23   1st Qu.:   34.539   1st Qu.: 1.8372   1st Qu.:20.16  
##  Median :   12.94   Median :  107.796   Median : 3.6018   Median :23.08  
##  Mean   :  -14.34   Mean   :  710.336   Mean   : 6.3096   Mean   :24.52  
##  3rd Qu.:   33.35   3rd Qu.:  375.191   3rd Qu.: 7.9250   3rd Qu.:27.97  
##  Max.   :  456.49   Max.   :20935.000   Max.   :63.5000   Max.   :46.83  
##                                         NA's   :21                       
##       X13              X14         Risk_Level
##  Min.   : 8.882   Min.   : 0.120   high:64   
##  1st Qu.:18.419   1st Qu.: 4.818   low :53   
##  Median :24.226   Median : 7.000             
##  Mean   :24.362   Mean   : 8.521             
##  3rd Qu.:29.249   3rd Qu.:10.300             
##  Max.   :55.089   Max.   :33.700             
##                   NA's   :12

Jika dilihat dari summary data terlihat bahwa respon Y tidak terlalu jauh berbeda perbandingan antara high dan low sekitar 54% dan 45%

A <- ggplot(data_invesment, aes(x=X1, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "Capital adequacy ratio (%) average from last 5 years",
       y = "Risk Level")

B <- ggplot(data_invesment, aes(x=X2, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "GDP per capita (USD)",
       y = "Risk Level")

C <- ggplot(data_invesment, aes(x=X3, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "Gross External Debt (% of GDP) average from last 5 years",
       y = "Risk Level")

D <- ggplot(data_invesment, aes(x=X4, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "growth of consumer price (%) average from last 5 years",
       y = "Risk Level")

E <- ggplot(data_invesment, aes(x=X5, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "growth of population (%) average from last 5 years",
       y = "Risk Level")

G <- ggplot(data_invesment, aes(x=X6, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "growth of Real GDP (%) average from last 5 years",
       y = "Risk Level")

H <- ggplot(data_invesment, aes(x=X7, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "growth of Real GDP per cap. (%) average from last 5 years",
       y = "Risk Level")

I <- ggplot(data_invesment, aes(x=X8, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "Loan-deposit ratio (%) average from last 5 years",
       y = "Risk Level")

J <- ggplot(data_invesment, aes(x=X9, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "Net External Debt (% of GDP) average from last 5 years",
       y = "Risk Level")

K <- ggplot(data_invesment, aes(x=X10, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "Nominal GDP (USD bn)",
       y = "Risk Level")

L <- ggplot(data_invesment, aes(x=X11, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "Non-performing loans (% of gross loans) average from last 5 years",
       y = "Risk Level")
M <- ggplot(data_invesment, aes(x=X12, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "percentage of gross domestic investment to GDP (%) average from last 5 years",
       y = "Risk Level")

N <- ggplot(data_invesment, aes(x=X13, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "percentage of gross domestic saving to GDP (%) average from last 5 years",
       y = "Risk Level")
O <- ggplot(data_invesment, aes(x=X14, y= 'Risk_Level')) +
  geom_boxplot()+
  labs(x = "unemployment rate (% labour force) average from last 5 years",
       y = "Risk Level")

ggarrange(A, B, C , D,
          ncol = 2, nrow = 2)

## Warning: Removed 13 rows containing non-finite values (stat_boxplot).

ggarrange(E, G, H , I,
          ncol = 2, nrow = 2)

## Warning: Removed 9 rows containing non-finite values (stat_boxplot).

ggarrange(J, K, L , M,
          ncol = 2, nrow = 2)

## Warning: Removed 21 rows containing non-finite values (stat_boxplot).

ggarrange(N, O,
          ncol = 1, nrow = 2)

## Warning: Removed 12 rows containing non-finite values (stat_boxplot).

Hasil plot memperlihatkan hubungan antara masing-masing variabel x dengan variabel y. Terlihat dari data bahwa seluruh variabel X tersebut mempunyai distribusi data yang cenderung kurang baik, seperti terdapat outlier.

Persiapan Data

Import data ke ekosistem mlr3

task_invesment = TaskClassif$new(id="invesment",
                           backend = data_invesment,
                           target = "Risk_Level",
                           positive ="low")

Impute missing variabel

task_invesment$missings()

## Risk_Level         X1        X10        X11        X12        X13        X14 
##          0         13          0         21          0          0         12 
##         X2         X3         X4         X5         X6         X7         X8 
##          0          0          0          0          0          0          9 
##         X9 
##          0

imp_missind = po("missind")
imp_num = po("imputehist", affect_columns = selector_type("numeric"))

task_ext = imp_missind$train(list(task_invesment))[[1]]
task_ext$data()

##      Risk_Level missing_X1 missing_X11 missing_X14 missing_X8
##   1:        low    present     present     present    present
##   2:        low    present     present     present    present
##   3:        low    present     present     missing    present
##   4:        low    missing     missing     missing    present
##   5:       high    present     present     present    present
##  ---                                                         
## 113:        low    present     present     present    missing
## 114:       high    present     missing     present    present
## 115:       high    present     present     present    missing
## 116:       high    present     present     present    present
## 117:       high    present     missing     present    present

task_ext = imp_num$train(list(task_invesment))[[1]]
task_ext$data()

##      Risk_Level          X10      X12      X13        X2        X3       X4
##   1:        low     2.857862 23.08410 26.94344 38674.616 172.75400  0.68000
##   2:        low   352.910575 24.85976 32.47740 40105.120 103.52280  1.76600
##   3:        low   199.928422 20.39940 31.03926 76037.997  31.03626  2.63056
##   4:        low    10.108892 21.69104 17.30888 27882.829  24.78532  1.29416
##   5:       high    12.645460 19.40300 15.11172  4251.398  89.61882  1.44000
##  ---                                                                       
## 113:        low 20935.000000 17.40650 17.20802 69324.734 104.17110  1.55316
## 114:       high    53.628838 16.44666 17.57796 15968.231  73.01010  8.00348
## 115:       high    57.707193 31.60162 29.24286  1872.670  30.04996 12.29840
## 116:       high   351.683014 23.54764 25.80812  3886.516  34.52492  2.79600
## 117:       high   302.141270 19.11740 16.25316  6404.672  50.79576  4.98278
##          X5      X6      X7        X9       X1      X11        X14        X8
##   1: 1.2206 1.78560 -2.0843 -26.52000 17.50000 8.000000  3.0000000  55.00000
##   2: 0.8698 2.65884 -0.7254 -13.59890 18.20000 8.155000  2.4500000 102.52738
##   3: 1.4893 1.85034 -1.9008 -56.24160 18.70000 8.155000  0.4729452 102.52738
##   4: 1.7530 2.23192 -1.1355  24.78532 45.76723 3.310062 19.0976532 102.52738
##   5: 0.2562 4.74800  2.3318  47.27262 14.00000 6.600000 18.5000000 166.80851
##  ---                                                                        
## 113: 0.6255 2.45554  0.4867  47.70210 16.30000 1.000000  5.6146000 127.91449
## 114: 0.3592 0.82090 -0.7169 -16.23150 17.04000 2.715484 10.3000000  49.05568
## 115: 1.6065 5.84000  3.0735 -45.16010 18.40000 2.100000  6.0000000  95.64314
## 116: 0.8506 6.94570  5.2762   7.39622 12.09770 1.690000  2.5000000  86.56201
## 117: 1.4772 0.78940 -2.3230  15.04496 16.60000 7.560733 33.7000000 107.24859

learner_rf <-  lrn("classif.ranger",predict_type="prob",importance="impurity")

Split Data

Split Data dapat dilakukan dengan beberapa cara. Cara yang dilakukan pada klasifikasi ini adalah holdout yang membagi data train dan data test pada rasio 0.8.

set.seed(913)
resample_invesment=rsmp("holdout", ratio = 0.8) 
resample_invesment$instantiate(task=task_ext)

Intepretasi Model

learner_rf$train(task=task_ext)

learner_rf$model$variable.importance

##        X10        X12        X13         X2         X3         X4         X5 
##  5.8103780  1.7526604  3.0027230 14.9571611  2.5088125  3.8951439  1.6391296 
##         X6         X7         X9         X1        X11        X14         X8 
##  1.0071923  1.2034162  4.3341262  1.2587682  4.8899909  3.6332732  0.9223065

Nilai variable importance untuk Random Forest tersebut adalah nilai Gini Impurity. Semakin besar nilainya maka akan semakin berpengaruh prediktor tersebut. Untuk memudahkan melihat variable importance tersebut, maka dapat dibuat plot

#dataframe
importance <- data.frame(Predictors = names(learner_rf$model$variable.importance),
                         impurity = learner_rf$model$variable.importance)
rownames(importance) <- NULL
importance <- importance %>% arrange(desc(impurity))

#grafik
ggplot(importance,
       aes(x=impurity,
           y=reorder(Predictors,impurity))) +
  geom_col(fill = "steelblue")+
  geom_text(aes(label=round(impurity,2)),hjust=1.2)

set.seed(913)
train_test_invesment_forest = resample(task = task_ext,
                               learner = learner_rf,
                               resampling = resample_invesment,
                               store_models = TRUE
                               )

## INFO  [15:56:14.832] [mlr3]  Applying learner 'classif.ranger' on task 'invesment' (iter 1/1)

Hasil Prediksi

prediksi_test = as.data.table(train_test_invesment_forest$prediction())
head(prediksi_test)

##    row_ids truth response   prob.low  prob.high
## 1:       4   low     high 0.35828968 0.64171032
## 2:       8   low      low 0.94289921 0.05710079
## 3:      11  high     high 0.33506111 0.66493889
## 4:      22   low      low 0.97195873 0.02804127
## 5:      23  high     high 0.09947698 0.90052302
## 6:      24   low     high 0.36632302 0.63367698

Confusion Matrix

train_test_invesment_forest$prediction()$confusion

##         truth
## response low high
##     low    9    1
##     high   3   10

accforest <- train_test_invesment_forest$aggregate(list(msr("classif.acc"),
                                   msr("classif.specificity"),
                                   msr("classif.sensitivity")
                                   ))
accforest

##         classif.acc classif.specificity classif.sensitivity 
##           0.8260870           0.9090909           0.7500000

Plot ROC

autoplot(train_test_invesment_forest$prediction(), type = "roc")