Tugas 2 STA581
library(tidyverse)## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.3 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(readxl)
library(mlr3)
library(mlr3verse)
library(mlr3learners)
library(ggpubr)Deskripsi Data
Data tersebut berisikan 117 negara dengan 11 variabel berdasarkan tingkat resiko investasi berdasarkan negara di seluruh dunia. Mereka ingin memperoleh model yang dapat digunakan untuk memprediksi negara mana saja yang berpotensi memilikki tingkat resiko investasi tinggi ataupun rendah.
Read Data
data_invesment <- read_excel("D:/Tugas 2 STA 581.xlsx")data_invesment$'Risk_Level' <- as.factor(data_invesment$'Risk_Level')summary(data_invesment)## X1 X2 X3 X4
## Min. : 4.20 Min. : 434.5 Min. : 13.63 Min. :-0.1510
## 1st Qu.:15.93 1st Qu.: 4223.5 1st Qu.: 43.11 1st Qu.: 0.8435
## Median :18.35 Median : 11363.6 Median : 70.35 Median : 1.6986
## Mean :18.73 Mean : 22596.3 Mean : 177.50 Mean : 3.4418
## 3rd Qu.:21.64 3rd Qu.: 34641.3 3rd Qu.: 117.76 3rd Qu.: 4.2064
## Max. :47.50 Max. :124340.4 Max. :6908.35 Max. :36.7035
## NA's :13
## X5 X6 X7 X8
## Min. :-0.8862 Min. :-5.135 Min. :-9.8453 Min. : 34.82
## 1st Qu.: 0.3751 1st Qu.: 1.754 1st Qu.:-1.1137 1st Qu.: 75.98
## Median : 1.0511 Median : 2.844 Median : 0.2912 Median : 90.02
## Mean : 1.1471 Mean : 3.064 Mean : 0.2206 Mean : 99.47
## 3rd Qu.: 1.8006 3rd Qu.: 4.258 3rd Qu.: 1.9242 3rd Qu.:113.28
## Max. : 4.4021 Max. :10.076 Max. : 6.0712 Max. :359.14
## NA's :9
## X9 X10 X11 X12
## Min. :-1955.72 Min. : 1.171 Min. : 0.3357 Min. :12.67
## 1st Qu.: -16.23 1st Qu.: 34.539 1st Qu.: 1.8372 1st Qu.:20.16
## Median : 12.94 Median : 107.796 Median : 3.6018 Median :23.08
## Mean : -14.34 Mean : 710.336 Mean : 6.3096 Mean :24.52
## 3rd Qu.: 33.35 3rd Qu.: 375.191 3rd Qu.: 7.9250 3rd Qu.:27.97
## Max. : 456.49 Max. :20935.000 Max. :63.5000 Max. :46.83
## NA's :21
## X13 X14 Risk_Level
## Min. : 8.882 Min. : 0.120 high:64
## 1st Qu.:18.419 1st Qu.: 4.818 low :53
## Median :24.226 Median : 7.000
## Mean :24.362 Mean : 8.521
## 3rd Qu.:29.249 3rd Qu.:10.300
## Max. :55.089 Max. :33.700
## NA's :12
Jika dilihat dari summary data terlihat bahwa respon Y tidak terlalu jauh berbeda perbandingan antara high dan low sekitar 54% dan 45%
A <- ggplot(data_invesment, aes(x=X1, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "Capital adequacy ratio (%) average from last 5 years",
y = "Risk Level")
B <- ggplot(data_invesment, aes(x=X2, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "GDP per capita (USD)",
y = "Risk Level")
C <- ggplot(data_invesment, aes(x=X3, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "Gross External Debt (% of GDP) average from last 5 years",
y = "Risk Level")
D <- ggplot(data_invesment, aes(x=X4, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "growth of consumer price (%) average from last 5 years",
y = "Risk Level")
E <- ggplot(data_invesment, aes(x=X5, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "growth of population (%) average from last 5 years",
y = "Risk Level")
G <- ggplot(data_invesment, aes(x=X6, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "growth of Real GDP (%) average from last 5 years",
y = "Risk Level")
H <- ggplot(data_invesment, aes(x=X7, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "growth of Real GDP per cap. (%) average from last 5 years",
y = "Risk Level")
I <- ggplot(data_invesment, aes(x=X8, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "Loan-deposit ratio (%) average from last 5 years",
y = "Risk Level")
J <- ggplot(data_invesment, aes(x=X9, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "Net External Debt (% of GDP) average from last 5 years",
y = "Risk Level")
K <- ggplot(data_invesment, aes(x=X10, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "Nominal GDP (USD bn)",
y = "Risk Level")
L <- ggplot(data_invesment, aes(x=X11, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "Non-performing loans (% of gross loans) average from last 5 years",
y = "Risk Level")
M <- ggplot(data_invesment, aes(x=X12, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "percentage of gross domestic investment to GDP (%) average from last 5 years",
y = "Risk Level")
N <- ggplot(data_invesment, aes(x=X13, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "percentage of gross domestic saving to GDP (%) average from last 5 years",
y = "Risk Level")
O <- ggplot(data_invesment, aes(x=X14, y= 'Risk_Level')) +
geom_boxplot()+
labs(x = "unemployment rate (% labour force) average from last 5 years",
y = "Risk Level")ggarrange(A, B, C , D,
ncol = 2, nrow = 2)## Warning: Removed 13 rows containing non-finite values (stat_boxplot).
ggarrange(E, G, H , I,
ncol = 2, nrow = 2)## Warning: Removed 9 rows containing non-finite values (stat_boxplot).
ggarrange(J, K, L , M,
ncol = 2, nrow = 2)## Warning: Removed 21 rows containing non-finite values (stat_boxplot).
ggarrange(N, O,
ncol = 1, nrow = 2)## Warning: Removed 12 rows containing non-finite values (stat_boxplot).
Hasil plot memperlihatkan hubungan antara masing-masing variabel x dengan variabel y. Terlihat dari data bahwa seluruh variabel X tersebut mempunyai distribusi data yang cenderung kurang baik, seperti terdapat outlier.
Persiapan Data
Import data ke ekosistem mlr3
task_invesment = TaskClassif$new(id="invesment",
backend = data_invesment,
target = "Risk_Level",
positive ="low")Impute missing variabel
task_invesment$missings()## Risk_Level X1 X10 X11 X12 X13 X14
## 0 13 0 21 0 0 12
## X2 X3 X4 X5 X6 X7 X8
## 0 0 0 0 0 0 9
## X9
## 0
imp_missind = po("missind")
imp_num = po("imputehist", affect_columns = selector_type("numeric"))task_ext = imp_missind$train(list(task_invesment))[[1]]
task_ext$data()## Risk_Level missing_X1 missing_X11 missing_X14 missing_X8
## 1: low present present present present
## 2: low present present present present
## 3: low present present missing present
## 4: low missing missing missing present
## 5: high present present present present
## ---
## 113: low present present present missing
## 114: high present missing present present
## 115: high present present present missing
## 116: high present present present present
## 117: high present missing present present
task_ext = imp_num$train(list(task_invesment))[[1]]
task_ext$data()## Risk_Level X10 X12 X13 X2 X3 X4
## 1: low 2.857862 23.08410 26.94344 38674.616 172.75400 0.68000
## 2: low 352.910575 24.85976 32.47740 40105.120 103.52280 1.76600
## 3: low 199.928422 20.39940 31.03926 76037.997 31.03626 2.63056
## 4: low 10.108892 21.69104 17.30888 27882.829 24.78532 1.29416
## 5: high 12.645460 19.40300 15.11172 4251.398 89.61882 1.44000
## ---
## 113: low 20935.000000 17.40650 17.20802 69324.734 104.17110 1.55316
## 114: high 53.628838 16.44666 17.57796 15968.231 73.01010 8.00348
## 115: high 57.707193 31.60162 29.24286 1872.670 30.04996 12.29840
## 116: high 351.683014 23.54764 25.80812 3886.516 34.52492 2.79600
## 117: high 302.141270 19.11740 16.25316 6404.672 50.79576 4.98278
## X5 X6 X7 X9 X1 X11 X14 X8
## 1: 1.2206 1.78560 -2.0843 -26.52000 17.50000 8.000000 3.0000000 55.00000
## 2: 0.8698 2.65884 -0.7254 -13.59890 18.20000 8.155000 2.4500000 102.52738
## 3: 1.4893 1.85034 -1.9008 -56.24160 18.70000 8.155000 0.4729452 102.52738
## 4: 1.7530 2.23192 -1.1355 24.78532 45.76723 3.310062 19.0976532 102.52738
## 5: 0.2562 4.74800 2.3318 47.27262 14.00000 6.600000 18.5000000 166.80851
## ---
## 113: 0.6255 2.45554 0.4867 47.70210 16.30000 1.000000 5.6146000 127.91449
## 114: 0.3592 0.82090 -0.7169 -16.23150 17.04000 2.715484 10.3000000 49.05568
## 115: 1.6065 5.84000 3.0735 -45.16010 18.40000 2.100000 6.0000000 95.64314
## 116: 0.8506 6.94570 5.2762 7.39622 12.09770 1.690000 2.5000000 86.56201
## 117: 1.4772 0.78940 -2.3230 15.04496 16.60000 7.560733 33.7000000 107.24859
learner_rf <- lrn("classif.ranger",predict_type="prob",importance="impurity")Split Data
Split Data dapat dilakukan dengan beberapa cara. Cara yang dilakukan pada klasifikasi ini adalah holdout yang membagi data train dan data test pada rasio 0.8.
set.seed(913)
resample_invesment=rsmp("holdout", ratio = 0.8)
resample_invesment$instantiate(task=task_ext)Intepretasi Model
learner_rf$train(task=task_ext)learner_rf$model$variable.importance## X10 X12 X13 X2 X3 X4 X5
## 5.8103780 1.7526604 3.0027230 14.9571611 2.5088125 3.8951439 1.6391296
## X6 X7 X9 X1 X11 X14 X8
## 1.0071923 1.2034162 4.3341262 1.2587682 4.8899909 3.6332732 0.9223065
Nilai variable importance untuk Random Forest tersebut adalah nilai Gini Impurity. Semakin besar nilainya maka akan semakin berpengaruh prediktor tersebut. Untuk memudahkan melihat variable importance tersebut, maka dapat dibuat plot
#dataframe
importance <- data.frame(Predictors = names(learner_rf$model$variable.importance),
impurity = learner_rf$model$variable.importance)
rownames(importance) <- NULL
importance <- importance %>% arrange(desc(impurity))
#grafik
ggplot(importance,
aes(x=impurity,
y=reorder(Predictors,impurity))) +
geom_col(fill = "steelblue")+
geom_text(aes(label=round(impurity,2)),hjust=1.2)set.seed(913)
train_test_invesment_forest = resample(task = task_ext,
learner = learner_rf,
resampling = resample_invesment,
store_models = TRUE
)## INFO [15:56:14.832] [mlr3] Applying learner 'classif.ranger' on task 'invesment' (iter 1/1)
Hasil Prediksi
prediksi_test = as.data.table(train_test_invesment_forest$prediction())
head(prediksi_test)## row_ids truth response prob.low prob.high
## 1: 4 low high 0.35828968 0.64171032
## 2: 8 low low 0.94289921 0.05710079
## 3: 11 high high 0.33506111 0.66493889
## 4: 22 low low 0.97195873 0.02804127
## 5: 23 high high 0.09947698 0.90052302
## 6: 24 low high 0.36632302 0.63367698
Confusion Matrix
train_test_invesment_forest$prediction()$confusion## truth
## response low high
## low 9 1
## high 3 10
accforest <- train_test_invesment_forest$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
accforest## classif.acc classif.specificity classif.sensitivity
## 0.8260870 0.9090909 0.7500000
Plot ROC
autoplot(train_test_invesment_forest$prediction(), type = "roc")