library(caret)
library(readxl)
library(dplyr)
library(mice)
library(corrplot)
library(VIM)
library(ggplot2)
library(xgboost)
Terdapat dua data yaitu Training dan Testing, data Training akan digunakan untuk modeling menggunakan XGBoost dan data Testing akan digunakan untuk memprediksi Risk Level.
Data mempunyai 16 peubah:
dengan peubah respon yaitu Risk Level yang terdiri dari dua kategori (low, high).
Data pada sheet Training dimasukkan dalam train.data dan data pada sheet Testing dimasukkan pada test.data.
train.data <- read_excel("Level Risiko Investasi.xlsx", sheet = "Training")
test.data <- read_excel("Level Risiko Investasi.xlsx", sheet = "Testing")
glimpse(train.data)
## Rows: 100
## Columns: 16
## $ Country <chr> "AD", "AE", "AE-AZ", "AE-RK", "AM", "AO", "AR", "AT", "AU…
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, NA, 14.0000, NA, 23.2527, 18.5…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 203…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.05…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.7…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.4…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 6…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, NA, 6.6000, 10.3000, 10.6000, 2.0…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.1238…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.5721…
## $ X14 <dbl> 3.0000, 2.4500, NA, NA, 18.5000, 10.5000, 11.0500, 6.0000…
## $ `Risk Level` <chr> "low", "low", "low", "low", "high", "high", "high", "low"…
glimpse(test.data)
## Rows: 17
## Columns: 15
## $ Country <chr> "SE", "SG", "SI", "SK", "SM", "SV", "TH", "TN", "TR", "TW", "U…
## $ X1 <dbl> 23.2000, 16.8056, 18.2857, 19.6715, 11.9000, NA, 19.8000, 12.9…
## $ X2 <dbl> 60338.0204, 62432.9952, 28684.1682, 21042.7221, 49356.2618, 39…
## $ X3 <dbl> 175.42230, 409.69700, 103.06040, 102.73060, 60.15464, 65.55750…
## $ X4 <dbl> 1.62000, 0.10510, 0.84352, 1.17400, 0.89594, 0.39400, 0.34500,…
## $ X5 <dbl> 0.6755, 0.9068, 0.0746, 0.0734, 0.5865, 0.5042, 0.3153, 1.1173…
## $ X6 <dbl> 2.47168, 2.77600, 3.55290, 3.21976, 1.75420, 2.44734, 3.44058,…
## $ X7 <dbl> 0.3526, 0.2912, 1.9299, 1.2325, -1.1342, -0.1248, 1.2787, -1.5…
## $ X8 <dbl> 185.64097, 94.00211, 72.30708, 111.78982, 88.60514, 88.88685, …
## $ X9 <dbl> 64.14972, -200.98100, 16.23838, 33.35258, -145.43800, 27.33332…
## $ X10 <dbl> 537.609866, 339.988210, 52.761781, 102.567122, 1.490827, 24.63…
## $ X11 <dbl> 0.5000, 1.3095, 3.0176, 2.5300, 63.5000, 1.5706, 3.2000, 13.60…
## $ X12 <dbl> 25.11320, 26.76784, 19.90742, 22.83084, 17.79208, 16.78238, 23…
## $ X13 <dbl> 27.95256, 47.25374, 25.76882, 20.95780, 23.21144, 14.52982, 32…
## $ X14 <dbl> 8.6000, 3.0000, 5.0000, 7.0000, 7.3000, 9.0000, 2.0000, 17.000…
Pada hasil terlihat Country dan Risk Level merupakan tipe karakter (string), nantinya peubah Country akan dihapus dan Risk Level akan menjadi faktor.
str(train.data)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:100] "AD" "AE" "AE-AZ" "AE-RK" ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk Level: chr [1:100] "low" "low" "low" "low" ...
str(test.data)
## tibble [17 × 15] (S3: tbl_df/tbl/data.frame)
## $ Country: chr [1:17] "SE" "SG" "SI" "SK" ...
## $ X1 : num [1:17] 23.2 16.8 18.3 19.7 11.9 ...
## $ X2 : num [1:17] 60338 62433 28684 21043 49356 ...
## $ X3 : num [1:17] 175.4 409.7 103.1 102.7 60.2 ...
## $ X4 : num [1:17] 1.62 0.105 0.844 1.174 0.896 ...
## $ X5 : num [1:17] 0.6755 0.9068 0.0746 0.0734 0.5865 ...
## $ X6 : num [1:17] 2.47 2.78 3.55 3.22 1.75 ...
## $ X7 : num [1:17] 0.353 0.291 1.93 1.232 -1.134 ...
## $ X8 : num [1:17] 185.6 94 72.3 111.8 88.6 ...
## $ X9 : num [1:17] 64.1 -201 16.2 33.4 -145.4 ...
## $ X10 : num [1:17] 537.61 339.99 52.76 102.57 1.49 ...
## $ X11 : num [1:17] 0.5 1.31 3.02 2.53 63.5 ...
## $ X12 : num [1:17] 25.1 26.8 19.9 22.8 17.8 ...
## $ X13 : num [1:17] 28 47.3 25.8 21 23.2 ...
## $ X14 : num [1:17] 8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...
Ringkasan dari train.data dan test.data, terlihat ada beberapa peubah yang memiliki missing value.
summary(train.data)
## Country X1 X2 X3
## Length:100 Min. : 4.20 Min. : 434.5 Min. : 13.63
## Class :character 1st Qu.:15.93 1st Qu.: 4265.9 1st Qu.: 42.96
## Mode :character Median :18.58 Median : 11659.1 Median : 70.42
## Mean :18.97 Mean : 22641.6 Mean : 191.94
## 3rd Qu.:21.80 3rd Qu.: 34815.2 3rd Qu.: 130.63
## Max. :47.50 Max. :124340.4 Max. :6908.35
## NA's :12
## X4 X5 X6 X7
## Min. :-0.151 Min. :-0.8862 Min. :-5.135 Min. :-9.84530
## 1st Qu.: 0.869 1st Qu.: 0.4419 1st Qu.: 1.765 1st Qu.:-1.18720
## Median : 1.700 Median : 1.1402 Median : 2.984 Median : 0.07155
## Mean : 3.263 Mean : 1.2019 Mean : 3.076 Mean : 0.10804
## 3rd Qu.: 3.939 3rd Qu.: 1.9502 3rd Qu.: 4.305 3rd Qu.: 1.94108
## Max. :36.703 Max. : 4.4021 Max. :10.076 Max. : 6.07120
##
## X8 X9 X10 X11
## Min. : 34.82 Min. :-1955.72 Min. : 1.171 Min. : 0.3357
## 1st Qu.: 76.95 1st Qu.: -14.11 1st Qu.: 32.813 1st Qu.: 1.9250
## Median : 90.19 Median : 12.67 Median : 106.872 Median : 3.9000
## Mean : 99.94 Mean : -13.58 Mean : 582.318 Mean : 5.5346
## 3rd Qu.:113.39 3rd Qu.: 36.67 3rd Qu.: 366.370 3rd Qu.: 7.9500
## Max. :359.14 Max. : 456.49 Max. :14866.703 Max. :26.9780
## NA's :7 NA's :17
## X12 X13 X14 Risk Level
## Min. :12.67 Min. :10.95 Min. : 0.120 Length:100
## 1st Qu.:20.79 1st Qu.:19.06 1st Qu.: 4.818 Class :character
## Median :23.40 Median :24.28 Median : 6.800 Mode :character
## Mean :24.96 Mean :24.48 Mean : 8.441
## 3rd Qu.:28.38 3rd Qu.:29.36 3rd Qu.:10.500
## Max. :46.83 Max. :55.09 Max. :24.650
## NA's :11
summary(test.data)
## Country X1 X2 X3
## Length:17 Min. :11.90 Min. : 786.9 Min. : 30.05
## Class :character 1st Qu.:15.76 1st Qu.: 3955.1 1st Qu.: 48.51
## Mode :character Median :17.52 Median : 8653.0 Median : 65.56
## Mean :17.42 Mean :22330.4 Mean : 92.59
## 3rd Qu.:19.70 3rd Qu.:31854.3 3rd Qu.:103.06
## Max. :23.20 Max. :69324.7 Max. :409.70
## NA's :1
## X4 X5 X6 X7
## Min. : 0.1051 Min. :-0.3906 Min. :0.340 Min. :-2.3230
## 1st Qu.: 0.8435 1st Qu.: 0.3153 1st Qu.:1.754 1st Qu.:-0.1248
## Median : 1.6200 Median : 0.6255 Median :2.539 Median : 0.4867
## Mean : 4.4949 Mean : 0.8249 Mean :2.994 Mean : 0.8826
## 3rd Qu.: 5.5560 3rd Qu.: 1.1173 3rd Qu.:3.553 3rd Qu.: 1.8906
## Max. :19.1730 Max. : 3.6551 Max. :6.946 Max. : 5.2762
##
## X8 X9 X10 X11
## Min. : 49.06 Min. :-200.98 Min. : 1.491 Min. : 0.500
## 1st Qu.: 72.28 1st Qu.: -42.56 1st Qu.: 52.762 1st Qu.: 1.571
## Median : 88.89 Median : 15.04 Median : 155.582 Median : 2.530
## Mean : 96.54 Mean : -18.76 Mean : 1463.386 Mean :11.258
## 3rd Qu.:109.52 3rd Qu.: 28.57 3rd Qu.: 501.644 3rd Qu.: 3.336
## Max. :185.64 Max. : 64.46 Max. :20935.000 Max. :63.500
## NA's :2 NA's :4
## X12 X13 X14
## Min. :16.45 Min. : 8.882 Min. : 2.000
## 1st Qu.:17.79 1st Qu.:17.208 1st Qu.: 4.675
## Median :22.03 Median :23.211 Median : 7.150
## Mean :21.91 Mean :23.693 Mean : 8.963
## 3rd Qu.:24.86 3rd Qu.:27.953 3rd Qu.: 9.700
## Max. :31.60 Max. :47.254 Max. :33.700
## NA's :1
Fungsi ini mengubah nama kolom agar dapat digunakan dalam R. Ini berguna jika nama kolom mengandung spasi, karakter khusus, atau dimulai dengan angka yang dapat membuat suatu variabel menjadi tak valid di R. make.names() akan mengganti karakter yang tidak valid dengan titik (.) contohnya “Risk Level” menjadi “Risk.Level”.
colnames(train.data) <- make.names(colnames(train.data))
Mengubah Risk.Level menjadi faktor dengan 2 = low dan 1 = high
train.data$Risk.Level <- as.factor(train.data$Risk.Level)
str(train.data)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:100] "AD" "AE" "AE-AZ" "AE-RK" ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk.Level: Factor w/ 2 levels "high","low": 2 2 2 2 1 1 1 2 2 1 ...
Menghapus Country dari train.data dan test.data karena tak terpakai pada analisis ini.
train.data <- train.data[, -which(names(train.data) == "Country")]
test.data <- test.data[, -which(names(test.data) == "Country")]
colSums(is.na(train.data))
## X1 X2 X3 X4 X5 X6 X7
## 12 0 0 0 0 0 0
## X8 X9 X10 X11 X12 X13 X14
## 7 0 0 17 0 0 11
## Risk.Level
## 0
colSums(is.na(test.data))
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
## 1 0 0 0 0 0 0 2 0 0 4 0 0 1
Terlihat missing values pada X1, X8, X11, dan X14 dan proporsi terbesar pada X11 dan yang terkecil pada X8.
Keterangan: Merah : Missing Values Biru : Dataset
aggr(train.data)
Menggunakan Multivariate Imputation by Chained Equations (MICE) untuk mengatasi atau mengisi missing values pada dataset. Menggunakan model stat20tik untuk memprediksi nilai yang hilang berdasarkan variabel lain dalam dataset, proses imputasi (mengganti data) dilakukan dalam beberapa langkah (atau iterasi). Pada kasus ini MICE akan menghasilkan 20 dataset yang berbeda dan imputasi yang digunakan adalam pmm (Predictive Mean Matching).
train.data_imp <- mice(train.data, m = 20, method = 'pmm', seed = 500)
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 1 6 X1 X8 X11 X14
## 1 7 X1 X8 X11 X14
## 1 8 X1 X8 X11 X14
## 1 9 X1 X8 X11 X14
## 1 10 X1 X8 X11 X14
## 1 11 X1 X8 X11 X14
## 1 12 X1 X8 X11 X14
## 1 13 X1 X8 X11 X14
## 1 14 X1 X8 X11 X14
## 1 15 X1 X8 X11 X14
## 1 16 X1 X8 X11 X14
## 1 17 X1 X8 X11 X14
## 1 18 X1 X8 X11 X14
## 1 19 X1 X8 X11 X14
## 1 20 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 2 6 X1 X8 X11 X14
## 2 7 X1 X8 X11 X14
## 2 8 X1 X8 X11 X14
## 2 9 X1 X8 X11 X14
## 2 10 X1 X8 X11 X14
## 2 11 X1 X8 X11 X14
## 2 12 X1 X8 X11 X14
## 2 13 X1 X8 X11 X14
## 2 14 X1 X8 X11 X14
## 2 15 X1 X8 X11 X14
## 2 16 X1 X8 X11 X14
## 2 17 X1 X8 X11 X14
## 2 18 X1 X8 X11 X14
## 2 19 X1 X8 X11 X14
## 2 20 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 3 6 X1 X8 X11 X14
## 3 7 X1 X8 X11 X14
## 3 8 X1 X8 X11 X14
## 3 9 X1 X8 X11 X14
## 3 10 X1 X8 X11 X14
## 3 11 X1 X8 X11 X14
## 3 12 X1 X8 X11 X14
## 3 13 X1 X8 X11 X14
## 3 14 X1 X8 X11 X14
## 3 15 X1 X8 X11 X14
## 3 16 X1 X8 X11 X14
## 3 17 X1 X8 X11 X14
## 3 18 X1 X8 X11 X14
## 3 19 X1 X8 X11 X14
## 3 20 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 4 6 X1 X8 X11 X14
## 4 7 X1 X8 X11 X14
## 4 8 X1 X8 X11 X14
## 4 9 X1 X8 X11 X14
## 4 10 X1 X8 X11 X14
## 4 11 X1 X8 X11 X14
## 4 12 X1 X8 X11 X14
## 4 13 X1 X8 X11 X14
## 4 14 X1 X8 X11 X14
## 4 15 X1 X8 X11 X14
## 4 16 X1 X8 X11 X14
## 4 17 X1 X8 X11 X14
## 4 18 X1 X8 X11 X14
## 4 19 X1 X8 X11 X14
## 4 20 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## 5 6 X1 X8 X11 X14
## 5 7 X1 X8 X11 X14
## 5 8 X1 X8 X11 X14
## 5 9 X1 X8 X11 X14
## 5 10 X1 X8 X11 X14
## 5 11 X1 X8 X11 X14
## 5 12 X1 X8 X11 X14
## 5 13 X1 X8 X11 X14
## 5 14 X1 X8 X11 X14
## 5 15 X1 X8 X11 X14
## 5 16 X1 X8 X11 X14
## 5 17 X1 X8 X11 X14
## 5 18 X1 X8 X11 X14
## 5 19 X1 X8 X11 X14
## 5 20 X1 X8 X11 X14
Menggunakan fungsi complete() untuk mengambil dataset yang telah diimputasi dari train.data_imp yang dihasilkan oleh MICE, dataset yang diambil pada tahap ini adalah dataset ke 18.
train.data_comp <- complete(train.data_imp, action = 16)
Pada diagram ini sudah tidak ada missing values pada X1, X8, X11, dan X14.
aggr(train.data_comp)
Berisi perbandingan dataset sebelum dan sesudah dilakukan MICE.
Keterangan: Garis hitam : sebelum Merah : sesudah
plot(density(train.data$X1, na.rm = T), main = "Data X1 With NA")
lines(density(train.data_comp$X1, na.rm = T), col = "red", lty = 3)
plot(density(train.data$X8, na.rm = T), main = "Data X8 With NA")
lines(density(train.data_comp$X8, na.rm = T), col = "red", lty = 3)
plot(density(train.data$X11, na.rm = T), main = "Data X11 With NA")
lines(density(train.data_comp$X11, na.rm = T), col = "red", lty = 3)
plot(density(train.data$X14, na.rm = T), main = "Data X14 With NA")
lines(density(train.data_comp$X14, na.rm = T), col = "red", lty = 3)
Terlihat missing values pada X1, X8, X11, dan X14 dan proporsi terbesar pada X11 dan yang terkecil pada X1 dan X14.
Keterangan: Merah : Missing Values Biru : Dataset
aggr(test.data)
Menggunakan Multivariate Imputation by Chained Equations (MICE) untuk mengatasi atau mengisi missing values pada dataset. Menggunakan model stat20tik untuk memprediksi nilai yang hilang berdasarkan variabel lain dalam dataset, proses imputasi (mengganti data) dilakukan dalam beberapa langkah (atau iterasi). Pada kasus ini MICE akan menghasilkan 20 dataset yang berbeda dan imputasi yang digunakan adalam pmm (Predictive Mean Matching).
test.data_imp <- mice(test.data, m = 20, method = 'pmm', seed = 500)
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 1 6 X1 X8 X11 X14
## 1 7 X1 X8 X11 X14
## 1 8 X1 X8 X11 X14
## 1 9 X1 X8 X11 X14
## 1 10 X1 X8 X11 X14
## 1 11 X1 X8 X11 X14
## 1 12 X1 X8 X11 X14
## 1 13 X1 X8 X11 X14
## 1 14 X1 X8 X11 X14
## 1 15 X1 X8 X11 X14
## 1 16 X1 X8 X11 X14
## 1 17 X1 X8 X11 X14
## 1 18 X1 X8 X11 X14
## 1 19 X1 X8 X11 X14
## 1 20 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 2 6 X1 X8 X11 X14
## 2 7 X1 X8 X11 X14
## 2 8 X1 X8 X11 X14
## 2 9 X1 X8 X11 X14
## 2 10 X1 X8 X11 X14
## 2 11 X1 X8 X11 X14
## 2 12 X1 X8 X11 X14
## 2 13 X1 X8 X11 X14
## 2 14 X1 X8 X11 X14
## 2 15 X1 X8 X11 X14
## 2 16 X1 X8 X11 X14
## 2 17 X1 X8 X11 X14
## 2 18 X1 X8 X11 X14
## 2 19 X1 X8 X11 X14
## 2 20 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 3 6 X1 X8 X11 X14
## 3 7 X1 X8 X11 X14
## 3 8 X1 X8 X11 X14
## 3 9 X1 X8 X11 X14
## 3 10 X1 X8 X11 X14
## 3 11 X1 X8 X11 X14
## 3 12 X1 X8 X11 X14
## 3 13 X1 X8 X11 X14
## 3 14 X1 X8 X11 X14
## 3 15 X1 X8 X11 X14
## 3 16 X1 X8 X11 X14
## 3 17 X1 X8 X11 X14
## 3 18 X1 X8 X11 X14
## 3 19 X1 X8 X11 X14
## 3 20 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 4 6 X1 X8 X11 X14
## 4 7 X1 X8 X11 X14
## 4 8 X1 X8 X11 X14
## 4 9 X1 X8 X11 X14
## 4 10 X1 X8 X11 X14
## 4 11 X1 X8 X11 X14
## 4 12 X1 X8 X11 X14
## 4 13 X1 X8 X11 X14
## 4 14 X1 X8 X11 X14
## 4 15 X1 X8 X11 X14
## 4 16 X1 X8 X11 X14
## 4 17 X1 X8 X11 X14
## 4 18 X1 X8 X11 X14
## 4 19 X1 X8 X11 X14
## 4 20 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## 5 6 X1 X8 X11 X14
## 5 7 X1 X8 X11 X14
## 5 8 X1 X8 X11 X14
## 5 9 X1 X8 X11 X14
## 5 10 X1 X8 X11 X14
## 5 11 X1 X8 X11 X14
## 5 12 X1 X8 X11 X14
## 5 13 X1 X8 X11 X14
## 5 14 X1 X8 X11 X14
## 5 15 X1 X8 X11 X14
## 5 16 X1 X8 X11 X14
## 5 17 X1 X8 X11 X14
## 5 18 X1 X8 X11 X14
## 5 19 X1 X8 X11 X14
## 5 20 X1 X8 X11 X14
## Warning: Number of logged events: 124
Menggunakan fungsi complete() untuk mengambil dataset yang telah diimputasi dari test.data_imp yang dihasilkan oleh MICE, dataset yang diambil pada tahap ini adalah dataset ke 18.
test.data_comp <- complete(test.data_imp, action = 15)
Pada diagram sudah tak ada missing values pada X1, X8, X11, dan X14.
aggr(test.data_comp)
Berisi perbandingan dataset sebelum dan sesudah dilakukan MICE.
Keterangan: Garis hitam : sebelum Merah : sesudah
plot(density(test.data$X1, na.rm = T), main = "Data X1 With NA")
lines(density(test.data_comp$X1, na.rm = T), col = "red", lty = 3)
plot(density(test.data$X8, na.rm = T), main = "Data X8 With NA")
lines(density(test.data_comp$X8, na.rm = T), col = "red", lty = 3)
plot(density(test.data$X11, na.rm = T), main = "Data X11 With NA")
lines(density(test.data_comp$X11, na.rm = T), col = "red", lty = 3)
plot(density(test.data$X14, na.rm = T), main = "Data X14 With NA")
lines(density(test.data_comp$X14, na.rm = T), col = "red", lty = 3)
# Data Tanpa Missing Values
glimpse(train.data_comp)
## Rows: 100
## Columns: 15
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, 16.2500, 14.0000, 21.6000, 23.25…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 2033.…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.0556…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36.7…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0.7…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0.2…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.729…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.81…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.449…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 62.…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, 8.1550, 6.6000, 10.3000, 10.6000, 2…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.12380,…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.57210,…
## $ X14 <dbl> 3.0000, 2.4500, 6.5000, 4.0000, 18.5000, 10.5000, 11.0500, …
## $ Risk.Level <fct> low, low, low, low, high, high, high, low, low, high, high,…
glimpse(test.data_comp)
## Rows: 17
## Columns: 14
## $ X1 <dbl> 23.2000, 16.8056, 18.2857, 19.6715, 11.9000, 14.1400, 19.8000, 12.…
## $ X2 <dbl> 60338.0204, 62432.9952, 28684.1682, 21042.7221, 49356.2618, 3989.1…
## $ X3 <dbl> 175.42230, 409.69700, 103.06040, 102.73060, 60.15464, 65.55750, 33…
## $ X4 <dbl> 1.62000, 0.10510, 0.84352, 1.17400, 0.89594, 0.39400, 0.34500, 5.5…
## $ X5 <dbl> 0.6755, 0.9068, 0.0746, 0.0734, 0.5865, 0.5042, 0.3153, 1.1173, 1.…
## $ X6 <dbl> 2.47168, 2.77600, 3.55290, 3.21976, 1.75420, 2.44734, 3.44058, 1.6…
## $ X7 <dbl> 0.3526, 0.2912, 1.9299, 1.2325, -1.1342, -0.1248, 1.2787, -1.5047,…
## $ X8 <dbl> 185.64097, 94.00211, 72.30708, 111.78982, 88.60514, 88.88685, 100.…
## $ X9 <dbl> 64.14972, -200.98100, 16.23838, 33.35258, -145.43800, 27.33332, -4…
## $ X10 <dbl> 537.609866, 339.988210, 52.761781, 102.567122, 1.490827, 24.638720…
## $ X11 <dbl> 0.5000, 1.3095, 3.0176, 2.5300, 63.5000, 1.5706, 3.2000, 13.6000, …
## $ X12 <dbl> 25.11320, 26.76784, 19.90742, 22.83084, 17.79208, 16.78238, 23.059…
## $ X13 <dbl> 27.95256, 47.25374, 25.76882, 20.95780, 23.21144, 14.52982, 32.479…
## $ X14 <dbl> 8.6000, 3.0000, 5.0000, 7.0000, 7.3000, 9.0000, 2.0000, 17.0000, 1…
# Menentukan metode robust scaling dengan median dan IQR
robust_scaler <- preProcess(train.data_comp, method = c("center", "scale"))
# Menerapkan scaling pada data
train_data_scaled <- predict(robust_scaler, newdata = train.data_comp)
train.data_comp <- train_data_scaled
# Melihat hasil data yang telah di-scale
glimpse(train.data_comp)
## Rows: 100
## Columns: 15
## $ X1 <dbl> -0.25637153, -0.12215162, -0.02628026, -0.49604994, -0.9274…
## $ X2 <dbl> 0.64529235, 0.70286675, 2.14908003, 0.21094844, -0.74016075…
## $ X3 <dbl> -0.02750584, -0.12678694, -0.23073637, -0.23970053, -0.1467…
## $ X4 <dbl> -0.53099855, -0.30772683, -0.12998115, -0.40473284, -0.3747…
## $ X5 <dbl> 0.017649939, -0.312586268, 0.270598825, 0.518840808, -0.890…
## $ X6 <dbl> -0.5700293, -0.1843743, -0.5414378, -0.3729179, 0.7382759, …
## $ X7 <dbl> -0.84946987, -0.32293520, -0.77836889, -0.48183715, 0.86164…
## $ X8 <dbl> -1.04978481, 0.08000816, 0.08000816, 0.08000816, 1.60806117…
## $ X9 <dbl> -0.0595941002, -0.0000680741, -0.1965181034, 0.1767636262, …
## $ X10 <dbl> -0.34926133, -0.13827205, -0.23047986, -0.34489087, -0.3433…
## $ X11 <dbl> 0.52909040, 0.56005959, 0.56005959, 0.56005959, 0.24936870,…
## $ X12 <dbl> -0.27916463, -0.01530186, -0.67811060, -0.48617311, -0.8261…
## $ X13 <dbl> 0.305780462, 0.991479821, 0.813283435, -0.888014148, -1.160…
## $ X14 <dbl> -1.03407141, -1.14038716, -0.35751666, -0.84077005, 1.96209…
## $ Risk.Level <fct> low, low, low, low, high, high, high, low, low, high, high,…
Bar chart untuk Risk.Level pada train.data_comp
risk_counts <- train.data_comp %>%
group_by(Risk.Level) %>%
summarize(count = n())
ggplot(risk_counts, aes(x = Risk.Level, y = count, fill = Risk.Level)) +
geom_bar(stat = "identity") +
geom_text(aes(label = count), vjust = -0.5, size = 5) +
scale_fill_manual(values = c("low" = "skyblue", "high" = "red")) +
labs(title = "Frekuensi Risk Level", x = "Risk Level", y = "Frekuensi") +
theme_minimal()
set.seed(123) # Untuk reproduksibilitas
train_index <- createDataPartition(train.data_comp$Risk.Level, p = 0.8, list = FALSE)
train_data <- train.data_comp[train_index, ] # Data pelatihan
test_data <- train.data_comp[-train_index, ] # Data pengujian
glimpse(train_data)
## Rows: 81
## Columns: 15
## $ X1 <dbl> -0.12215162, -0.02628026, -0.49604994, 0.84666686, -0.05043…
## $ X2 <dbl> 0.70286675, 2.14908003, 0.21094844, -0.54085321, 1.22886785…
## $ X3 <dbl> -0.12678694, -0.23073637, -0.23970053, -0.21321340, -0.0466…
## $ X4 <dbl> -0.30772683, -0.12998115, -0.40473284, 6.87509728, -0.35758…
## $ X5 <dbl> -0.312586268, 0.270598825, 0.518840808, -0.222307898, -0.44…
## $ X6 <dbl> -0.18437432, -0.54143776, -0.37291795, -1.46319584, -0.5281…
## $ X7 <dbl> -0.32293520, -0.77836889, -0.48183715, -1.48701505, -0.1581…
## $ X8 <dbl> 0.08000816, 0.08000816, 0.08000816, -0.48918610, 0.41022590…
## $ X9 <dbl> -0.0000680741, -0.1965181034, 0.1767636262, 0.0394839714, 0…
## $ X10 <dbl> -0.13827205, -0.23047986, -0.34489087, -0.12484299, -0.0918…
## $ X11 <dbl> 0.56005959, 0.56005959, 0.56005959, 1.04857356, -0.66592067…
## $ X12 <dbl> -0.01530186, -0.67811060, -0.48617311, -1.22580796, -0.0267…
## $ X13 <dbl> 0.991479821, 0.813283435, -0.888014148, -1.320414283, 0.300…
## $ X14 <dbl> -1.14038716, -0.35751666, -0.84077005, 0.52200452, -0.45416…
## $ Risk.Level <fct> low, low, low, high, low, low, high, high, low, low, high, …
glimpse(test_data)
## Rows: 19
## Columns: 15
## $ X1 <dbl> -0.256371531, -0.927471075, 0.529773650, -2.806549800, 0.08…
## $ X2 <dbl> 0.6452923487, -0.7401607460, -0.8294097861, -0.8177517087, …
## $ X3 <dbl> -0.02750584, -0.14672596, -0.19342321, -0.24693074, 0.12000…
## $ X4 <dbl> -0.53099855, -0.37474946, 3.92548453, 0.52409399, -0.670561…
## $ X5 <dbl> 0.01764994, -0.89021722, 2.01488237, -0.13654815, -0.340451…
## $ X6 <dbl> -0.570029337, 0.738275866, -1.746373429, 1.905080601, -0.52…
## $ X7 <dbl> -0.84946987, 0.86164189, -2.05795393, 2.31055185, 0.0252852…
## $ X8 <dbl> -1.04978481, 1.60806117, -1.52952871, -0.49336738, -0.39607…
## $ X9 <dbl> -0.059594100, 0.280360039, 0.133754021, 0.085227302, -0.639…
## $ X10 <dbl> -0.3492613, -0.3433620, -0.3133214, -0.1417456, 0.1004758, …
## $ X11 <dbl> 0.52909040, 0.24936870, 0.98863320, 0.46915004, -0.91946841…
## $ X12 <dbl> -0.27916463, -0.82617547, 0.91553342, 1.14976539, -0.080816…
## $ X13 <dbl> 0.3057805, -1.1602588, -0.4836766, 0.9563520, 1.0356281, -1…
## $ X14 <dbl> -1.03407141, 1.96209963, 0.41568877, -0.64746870, -1.000668…
## $ Risk.Level <fct> low, high, high, high, low, high, low, high, low, low, high…
===========================================================================
# Mengonversi variabel target ke numerik
train_data$Risk.Level <- as.numeric(as.factor(train_data$Risk.Level)) - 1
test_data$Risk.Level <- as.numeric(as.factor(test_data$Risk.Level)) - 1
# Mempersiapkan matriks untuk XGBoost
train_matrix <- as.matrix(train_data[, -which(names(train_data) == "Risk.Level")])
train_label <- train_data$Risk.Level
test_matrix <- as.matrix(test_data[, -which(names(test_data) == "Risk.Level")])
test_label <- test_data$Risk.Level
# Mengonversi data menjadi DMatrix untuk XGBoost
dtrain <- xgb.DMatrix(data = train_matrix, label = train_label)
dtest <- xgb.DMatrix(data = test_matrix, label = test_label)
# Melatih model XGBoost
params <- list(
booster = "gbtree",
objective = "binary:logistic", # Untuk klasifikasi biner
eta = 0.1, # Learning rate
max_depth = 6,
gamma = 1,
subsample = 0.8,
colsample_bytree = 0.8,
eval_metric = "error"
)
set.seed(123)
xgb_model <- xgb.train(
params = params,
data = dtrain,
nrounds = 100, # Jumlah iterasi boosting
watchlist = list(train = dtrain, test = dtest),
early_stopping_rounds = 10, # Hentikan lebih awal jika performa tidak meningkat
verbose = 1
)
## [1] train-error:0.123457 test-error:0.210526
## Multiple eval metrics are present. Will use test_error for early stopping.
## Will train until test_error hasn't improved in 10 rounds.
##
## [2] train-error:0.049383 test-error:0.157895
## [3] train-error:0.037037 test-error:0.157895
## [4] train-error:0.049383 test-error:0.052632
## [5] train-error:0.049383 test-error:0.105263
## [6] train-error:0.037037 test-error:0.105263
## [7] train-error:0.024691 test-error:0.105263
## [8] train-error:0.024691 test-error:0.105263
## [9] train-error:0.024691 test-error:0.105263
## [10] train-error:0.012346 test-error:0.105263
## [11] train-error:0.012346 test-error:0.105263
## [12] train-error:0.024691 test-error:0.105263
## [13] train-error:0.012346 test-error:0.105263
## [14] train-error:0.012346 test-error:0.105263
## Stopping. Best iteration:
## [4] train-error:0.049383 test-error:0.052632
xg_pred <- predict(xgb_model, newdata = dtest)
#prediksi
preds_class <- ifelse(xg_pred > 0.5, 1, 0)
preds_class
## [1] 1 0 0 0 1 0 1 0 1 0 0 1 0 1 0 0 1 0 1
xg_pred <- predict(xgb_model, newdata = dtest)
# Menghitung akurasi
preds_class <- ifelse(xg_pred > 0.5, 1, 0)
xg_predictions <- data.frame(Actual = test_data$Risk.Level, Predicted = xg_pred)
print(xg_predictions)
## Actual Predicted
## 1 1 0.5008311
## 2 0 0.4034971
## 3 0 0.3588598
## 4 0 0.3588598
## 5 1 0.6443974
## 6 0 0.4806306
## 7 1 0.6443974
## 8 0 0.3588598
## 9 1 0.5397391
## 10 1 0.3883951
## 11 0 0.3588598
## 12 1 0.6443974
## 13 0 0.3588598
## 14 1 0.5835428
## 15 0 0.3588598
## 16 0 0.3940508
## 17 1 0.5659225
## 18 0 0.3588598
## 19 1 0.5557163
# Mengevaluasi model dengan matriks kebingungan
confusion_matrix <- confusionMatrix(factor(preds_class), factor(test_label))
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 10 1
## 1 0 8
##
## Accuracy : 0.9474
## 95% CI : (0.7397, 0.9987)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : 9.149e-05
##
## Kappa : 0.8939
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.8889
## Pos Pred Value : 0.9091
## Neg Pred Value : 1.0000
## Prevalence : 0.5263
## Detection Rate : 0.5263
## Detection Prevalence : 0.5789
## Balanced Accuracy : 0.9444
##
## 'Positive' Class : 0
##
##Visualisasi data
library(reshape2)
library(pheatmap)
# Membuat confusion matrix
cm <- as.matrix(confusion_matrix$table)
# Mengubah menjadi data frame untuk visualisasi
cm_df <- as.data.frame(cm)
colnames(cm_df) <- c("Prediction", "Actual", "Freq")
# Visualisasi dengan ggplot2
ggplot(data = cm_df, aes(x = Prediction, y = Actual)) +
geom_tile(aes(fill = Freq), color = "white") +
scale_fill_gradient(low = "skyblue", high = "red") +
geom_text(aes(label = Freq), vjust = 1) +
theme_minimal() +
labs(title = "Confusion Matrix", x = "Predicted", y = "Actual")
Pada confusion matrix ini, menunjukkan kinerja dari sebuah model, dimana: - Baris (sumbu x) mempresentasikan nilai aktual dari kelas. - Kolom merepresentasikan nilai prediksi dari model.
Pada matrikx tersebut, dapat kita lihat bahwa: - True Positives (TP) (Prediksi high, Aktual high) = 1 Model memprediksi high dengan benar, dan kondisi sebenarnya juga high.
False Negatives (FN) (Prediksi low, Aktual high) = 8 Model salah memprediksi sebagai low, padahal sebenarnya high.
False Positives (FP) (Prediksi high, Aktual low) = 0 Model tidak salah memprediksi high ketika sebenarnya low, jadi tidak ada kesalahan di sini.
True Negatives (TN) (Prediksi low, Aktual low) = 10 Model memprediksi low dengan benar, dan kondisi sebenarnya juga low.
Jadi, model ini cukup baik dalam memprediksi kondisi low (benar 10 kali), tetapi sangat buruk dalam memprediksi kondisi high (hanya benar 1 kali, dan salah 8 kali).