#Package yang perlu diinstall
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(ggplot2)
library(cowplot)
library(mlr3learners)
## Loading required package: mlr3
library(mlr3viz)
library(precrec)
library(dplyr)
##Input Data
investment_risk <- read_excel("D:/coolyeah/rstudio/investment_risk.xlsx")
write.csv(investment_risk, "D:/coolyeah/rstudio/investment_risk.csv")
##Eksplorasi Data
investment_risk <- read.csv("D:/coolyeah/rstudio/investment_risk.csv", stringsAsFactors = TRUE)
glimpse(investment_risk)
## Rows: 100
## Columns: 17
## $ X <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
## $ Country <fct> AD, AE, AE-AZ, AE-RK, AM, AO, AR, AT, AU, AW, AZ, BD, BE, B…
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, NA, 14.0000, NA, 23.2527, 18.574…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 2033.…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.0556…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36.7…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0.7…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0.2…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.729…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.81…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.449…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 62.…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, NA, 6.6000, 10.3000, 10.6000, 2.019…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.12380,…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.57210,…
## $ X14 <dbl> 3.0000, 2.4500, NA, NA, 18.5000, 10.5000, 11.0500, 6.0000, …
## $ Risk.Level <fct> low, low, low, low, high, high, high, low, low, high, high,…
skimr::skim(investment_risk)
| Name | investment_risk |
| Number of rows | 100 |
| Number of columns | 17 |
| _______________________ | |
| Column type frequency: | |
| factor | 2 |
| numeric | 15 |
| ________________________ | |
| Group variables | None |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| Country | 1 | 0.99 | FALSE | 98 | LS: 2, AD: 1, AE: 1, AE-: 1 |
| Risk.Level | 0 | 1.00 | FALSE | 2 | hig: 54, low: 46 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| X | 0 | 1.00 | 50.50 | 29.01 | 1.00 | 25.75 | 50.50 | 75.25 | 100.00 | ▇▇▇▇▇ |
| X1 | 12 | 0.88 | 18.97 | 5.42 | 4.20 | 15.93 | 18.58 | 21.80 | 47.50 | ▁▇▃▁▁ |
| X2 | 0 | 1.00 | 22641.57 | 24846.18 | 434.46 | 4265.94 | 11659.10 | 34815.21 | 124340.38 | ▇▂▂▁▁ |
| X3 | 0 | 1.00 | 191.93 | 697.33 | 13.63 | 42.96 | 70.42 | 130.63 | 6908.35 | ▇▁▁▁▁ |
| X4 | 0 | 1.00 | 3.26 | 4.86 | -0.15 | 0.87 | 1.70 | 3.94 | 36.70 | ▇▁▁▁▁ |
| X5 | 0 | 1.00 | 1.20 | 1.06 | -0.89 | 0.44 | 1.14 | 1.95 | 4.40 | ▃▇▅▃▁ |
| X6 | 0 | 1.00 | 3.08 | 2.26 | -5.14 | 1.76 | 2.98 | 4.30 | 10.08 | ▁▂▇▃▁ |
| X7 | 0 | 1.00 | 0.11 | 2.58 | -9.85 | -1.19 | 0.07 | 1.94 | 6.07 | ▁▁▆▇▂ |
| X8 | 7 | 0.93 | 99.94 | 42.61 | 34.82 | 76.95 | 90.19 | 113.39 | 359.14 | ▇▅▁▁▁ |
| X9 | 0 | 1.00 | -13.58 | 217.07 | -1955.72 | -14.11 | 12.67 | 36.67 | 456.49 | ▁▁▁▂▇ |
| X10 | 0 | 1.00 | 582.32 | 1659.10 | 1.17 | 32.81 | 106.87 | 366.37 | 14866.70 | ▇▁▁▁▁ |
| X11 | 17 | 0.83 | 5.53 | 5.14 | 0.34 | 1.93 | 3.90 | 7.95 | 26.98 | ▇▃▁▁▁ |
| X12 | 0 | 1.00 | 24.96 | 6.73 | 12.67 | 20.79 | 23.40 | 28.38 | 46.83 | ▃▇▃▁▁ |
| X13 | 0 | 1.00 | 24.48 | 8.07 | 10.95 | 19.06 | 24.28 | 29.36 | 55.09 | ▅▇▅▁▁ |
| X14 | 11 | 0.89 | 8.44 | 5.29 | 0.12 | 4.82 | 6.80 | 10.50 | 24.65 | ▆▇▂▂▁ |
Dari output diatas ternyata terdapat 55 missing data yang terdiri dari a. 12 dari X1 b. 7 dari X8 c. 17 dari X11 d. 11 dri X14
#Menghilangkan var country Variabel ini dihilangkan karena tidak akan digunakan dalam analisis
investment_risk2 <- investment_risk %>% select(-Country)
head(investment_risk2)
## X X1 X2 X3 X4 X5 X6 X7 X8
## 1 1 17.5 38674.616 172.75400 0.68000 1.2206 1.78560 -2.0843 55.00000
## 2 2 18.2 40105.120 103.52280 1.76600 0.8698 2.65884 -0.7254 102.52738
## 3 3 18.7 76037.997 31.03626 2.63056 1.4893 1.85034 -1.9008 102.52738
## 4 4 NA 27882.829 24.78532 1.29416 1.7530 2.23192 -1.1355 102.52738
## 5 5 14.0 4251.398 89.61882 1.44000 0.2562 4.74800 2.3318 166.80851
## 6 6 NA 2033.900 57.05566 22.35646 3.3422 -0.87800 -5.2032 34.81845
## X9 X10 X11 X12 X13 X14 Risk.Level
## 1 -26.52000 2.857862 8.000 23.08410 26.94344 3.00 low
## 2 -13.59890 352.910575 8.155 24.85976 32.47740 2.45 low
## 3 -56.24160 199.928422 8.155 20.39940 31.03926 NA low
## 4 24.78532 10.108892 NA 21.69104 17.30888 NA low
## 5 47.27262 12.645460 6.600 19.40300 15.11172 18.50 high
## 6 15.44938 62.485865 10.300 31.12380 20.57210 10.50 high
#Boxplot
x1_y <- ggplot(data = investment_risk, mapping = aes(x = X1, y = Risk.Level)) + geom_boxplot()
x2_y <- ggplot(data = investment_risk, mapping = aes(x = X2, y = Risk.Level)) + geom_boxplot()
x3_y <- ggplot(data = investment_risk, mapping = aes(x = X3, y = Risk.Level)) + geom_boxplot()
x4_y <- ggplot(data = investment_risk, mapping = aes(x = X4, y = Risk.Level)) + geom_boxplot()
x5_y <- ggplot(data = investment_risk, mapping = aes(x = X5, y = Risk.Level)) + geom_boxplot()
x6_y <- ggplot(data = investment_risk, mapping = aes(x = X6, y = Risk.Level)) + geom_boxplot()
x7_y <- ggplot(data = investment_risk, mapping = aes(x = X7, y = Risk.Level)) + geom_boxplot()
x8_y <- ggplot(data = investment_risk, mapping = aes(x = X8, y = Risk.Level)) + geom_boxplot()
x9_y <- ggplot(data = investment_risk, mapping = aes(x = X9, y = Risk.Level)) + geom_boxplot()
x10_y <- ggplot(data = investment_risk, mapping = aes(x = X10, y = Risk.Level)) + geom_boxplot()
x11_y <- ggplot(data = investment_risk, mapping = aes(x = X11, y = Risk.Level)) + geom_boxplot()
X12_y <- ggplot(data = investment_risk, mapping = aes(x = X12, y = Risk.Level)) + geom_boxplot()
x13_y <- ggplot(data = investment_risk, mapping = aes(x = X13, y = Risk.Level)) + geom_boxplot()
x14_y <- ggplot(data = investment_risk, mapping = aes(x = X14, y = Risk.Level)) + geom_boxplot()
plot_grid(x1_y, x2_y, x3_y, x4_y)
## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
plot_grid(x5_y, x6_y, x7_y, x8_y)
## Warning: Removed 7 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
plot_grid(x9_y, x10_y, x11_y, X12_y)
## Warning: Removed 17 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
plot_grid(x13_y, x14_y)
## Warning: Removed 11 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
##Manajemen Data Salah satu cara untuk mengatasi missing data dapat dilakukan pendugaan dengan mengganti data hilang tersebut dengan nilai median pada variabel tersebut.
investment_risk2$X1 = ifelse(is.na(investment_risk$X1),
ave(investment_risk$X1, FUN = function(x) median(investment_risk$X1, na.rm = TRUE)), investment_risk$X1)
investment_risk2$X8 = ifelse(is.na(investment_risk$X8),
ave(investment_risk$X8, FUN = function(x) median(investment_risk$X8, na.rm = TRUE)), investment_risk$X8)
investment_risk2$X11 = ifelse(is.na(investment_risk$X11),
ave(investment_risk$X11, FUN = function(x) median(investment_risk$X11, na.rm = TRUE)), investment_risk$X11)
investment_risk2$X14 = ifelse(is.na(investment_risk$X14),
ave(investment_risk$X14, FUN = function(x) median(investment_risk$X14, na.rm = TRUE)), investment_risk$X14)
sum(is.na(investment_risk2))
## [1] 0
freqrisk <- as.data.frame(table(investment_risk2$Risk.Level))
freqrisk
## Var1 Freq
## 1 high 54
## 2 low 46
##Visualisasi Data #Bar Chart
freqrisk$persen <- freqrisk$Freq/sum(freqrisk$Freq)
ggplot(data = freqrisk, mapping = aes(x = Var1, y = Freq)) +
geom_col(aes(fill = Var1), alpha = 0.7) +
labs(title = "Investment Risk Level",
x = "Risk Level",
y = "Frekuensi") +
geom_text(aes(label = paste0(round(persen*100, 2), "%")), vjust = -0.025) +
theme(legend.position = "none")
Berdasarkan bar chart diatas terlihat bahwa 54.7% atau 64 negara berada
pada level risiko investasi yang tinggi (high-risk investment) dan 45.3%
atau 53 negara berada pada level risiko investasi yang rendah (low-risk
investment). Artinya, negara yang memiliki level risiko investasi yang
tinggi sedikit lebih banyak dibandingkan negara yang memiliki level
risiko investasi yang rendah.
##Random Forest #Prepocessing
#import data ke mlr3
task_risk = TaskClassif$new(id="country",backend = investment_risk2, target = "Risk.Level",positive = "low")
#Spliting Data Data dibagi menjadi 2 bagian yaitu 80% sebagai data training dan 20% sebagai data latih
set.seed(234)
resample_holdout =rsmp("holdout", ratio = 0.8)
resample_holdout$instantiate(task=task_risk)
#Model
model_rf <- lrn("classif.ranger", predict_type="prob",importance="impurity")
#Interpretasi Model
model_rf$train(task = task_risk)
importance <- data.frame(Predictors = names(model_rf$model$variable.importance), impurity = model_rf$model$variable.importance)
rownames(importance) <- NULL
importance %>% arrange(desc(impurity))
## Predictors impurity
## 1 X2 11.8724377
## 2 X9 5.2421323
## 3 X10 4.7361754
## 4 X11 4.3777131
## 5 X4 2.5998377
## 6 X14 2.4071723
## 7 X13 2.3934757
## 8 X3 2.1159699
## 9 X8 1.5433392
## 10 X12 1.2681765
## 11 X5 1.2579312
## 12 X1 1.0787028
## 13 X 0.8135966
## 14 X7 0.8020200
## 15 X6 0.7734643
ggplot(importance,
aes(x=impurity,
y=reorder(Predictors,impurity))
) +
geom_col(fill = "steelblue")+
geom_text(aes(label = round(impurity,2)),hjust=1.2)
Variabel X2 yaitu GDP per capita (USD) merupakan variabel yang paling
berpengaruh terhadap risiko investasi, sedangkan X6 merupakan variable
yang paling tidak berpengaruh terhadap risiko investasi dibandingkan
variabel lainnya.
##Training
set.seed(234)
train_test_rf = resample(task = task_risk,
learner = model_rf,
resampling = resample_holdout,
store_models = TRUE
)
## INFO [02:58:08.544] [mlr3] Applying learner 'classif.ranger' on task 'country' (iter 1/1)
##Testing
data_testing = as.data.table(train_test_rf$prediction())
data_testing
## row_ids truth response prob.low prob.high
## <int> <fctr> <fctr> <num> <num>
## 1: 9 low low 0.91592778 0.08407222
## 2: 10 high low 0.57005952 0.42994048
## 3: 12 high high 0.29087778 0.70912222
## 4: 22 low low 0.92773810 0.07226190
## 5: 23 high high 0.08159841 0.91840159
## 6: 27 high high 0.29357857 0.70642143
## 7: 32 low low 0.96166905 0.03833095
## 8: 33 low low 0.92919762 0.07080238
## 9: 36 low low 0.79794841 0.20205159
## 10: 37 high high 0.18384365 0.81615635
## 11: 44 high high 0.19987540 0.80012460
## 12: 47 high high 0.25113651 0.74886349
## 13: 59 high high 0.07966429 0.92033571
## 14: 62 low low 0.88374444 0.11625556
## 15: 64 low high 0.48527619 0.51472381
## 16: 71 high high 0.12314524 0.87685476
## 17: 73 high high 0.07230238 0.92769762
## 18: 81 high high 0.15267698 0.84732302
## 19: 94 low low 0.73786667 0.26213333
## 20: 95 high low 0.63084444 0.36915556
## row_ids truth response prob.low prob.high
##Evaluasi Model #Confusion Matrix
train_test_rf$prediction()$confusion
## truth
## response low high
## low 7 2
## high 1 10
Dari confusion matrix diatas terlihat bahwa diperoleh: 1. True Positif = 7, artinya ada 7 negara yang memiliki kategori investasi rendah dan diprediksi rendah. 2. True Negative = 2, artinya ada 2 negara yang memiliki kategori investasi tinggi dan diprediksi tinggi. 3. False Positive = 1, artinya ada 1 negara yang salah dalam memprediksi kategori investasi rendah yang seharunya tinggi 4. False Negative = 10, artinya ada 10 nagara yang salah dalam prediksi yaitu kategori investasi tinggi padahal seharusnya rendah
#Akurasi
akurasi_rf <- train_test_rf$aggregate(list(msr("classif.acc"),
msr("classif.specificity"),
msr("classif.sensitivity")
))
akurasi_rf
## classif.acc classif.specificity classif.sensitivity
## 0.8500000 0.8333333 0.8750000
Terlihat dari output diatas, diperoleh nilai akurasi yang sedang, yaitu sekitar 85%
#ROC
autoplot(train_test_rf$prediction(), type = "roc")
Kurva ROC menunjukkan trade-off antara True Positive Rate (TPR) dan
False Positive Rate (FPR) pada berbagai ambang batas klasifikasi.
Semakin besar luas area di bawah kurva ROC maka kemampuan prediksi yang
dihasilkan semakin baik.
nilai AUC atau Area Under Curve didapat dengan menghitung luas area bawah kurva AUC sebagai berikut:
train_test_rf$aggregate((msr("classif.auc")))
## classif.auc
## 0.9791667
Model dengan nilai AUC 0.9791667 menunjukkan performa yang sangat baik.