suppressMessages({
library(caret)
library(readxl)
library(dplyr)
library(mice)
library(e1071)
library(VIM)
library(ggplot2)
library(kableExtra)
})
Terdapat dua data yaitu Training dan Testing, data Training akan digunakan untuk modeling menggunakan SVM dan data Testing akan digunakan untuk memprediksi Risk Level.
Analisis bertujuan untuk :
Input data: data pada sheet Training dimasukkan ke dalam Risiko_training dan data pada sheet Testing dimasukkan ke dalam Risiko_testing.
# Read the two worksheets of the workbook. The testing sheet is read once into
# the untouched copy (`.ori`) and then duplicated as the working copy.
Risiko_training <- read_excel(path = "Level Risiko Investasi.xlsx", sheet = "Training")
Risiko_testing.ori <- read_excel(path = "Level Risiko Investasi.xlsx", sheet = "Testing")
Risiko_testing <- Risiko_testing.ori
# Compact overview of the training data: one line per column with type and
# leading values.
glimpse(Risiko_training)
## Rows: 100
## Columns: 16
## $ Country <chr> "AD", "AE", "AE-AZ", "AE-RK", "AM", "AO", "AR", "AT", "AU…
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, NA, 14.0000, NA, 23.2527, 18.5…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 203…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.05…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.7…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.4…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 6…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, NA, 6.6000, 10.3000, 10.6000, 2.0…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.1238…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.5721…
## $ X14 <dbl> 3.0000, 2.4500, NA, NA, 18.5000, 10.5000, 11.0500, 6.0000…
## $ `Risk Level` <chr> "low", "low", "low", "low", "high", "high", "high", "low"…
# Compact overview of the testing data (same columns as training, but without
# the `Risk Level` label).
glimpse(Risiko_testing)
## Rows: 17
## Columns: 15
## $ Country <chr> "SE", "SG", "SI", "SK", "SM", "SV", "TH", "TN", "TR", "TW", "U…
## $ X1 <dbl> 23.2000, 16.8056, 18.2857, 19.6715, 11.9000, NA, 19.8000, 12.9…
## $ X2 <dbl> 60338.0204, 62432.9952, 28684.1682, 21042.7221, 49356.2618, 39…
## $ X3 <dbl> 175.42230, 409.69700, 103.06040, 102.73060, 60.15464, 65.55750…
## $ X4 <dbl> 1.62000, 0.10510, 0.84352, 1.17400, 0.89594, 0.39400, 0.34500,…
## $ X5 <dbl> 0.6755, 0.9068, 0.0746, 0.0734, 0.5865, 0.5042, 0.3153, 1.1173…
## $ X6 <dbl> 2.47168, 2.77600, 3.55290, 3.21976, 1.75420, 2.44734, 3.44058,…
## $ X7 <dbl> 0.3526, 0.2912, 1.9299, 1.2325, -1.1342, -0.1248, 1.2787, -1.5…
## $ X8 <dbl> 185.64097, 94.00211, 72.30708, 111.78982, 88.60514, 88.88685, …
## $ X9 <dbl> 64.14972, -200.98100, 16.23838, 33.35258, -145.43800, 27.33332…
## $ X10 <dbl> 537.609866, 339.988210, 52.761781, 102.567122, 1.490827, 24.63…
## $ X11 <dbl> 0.5000, 1.3095, 3.0176, 2.5300, 63.5000, 1.5706, 3.2000, 13.60…
## $ X12 <dbl> 25.11320, 26.76784, 19.90742, 22.83084, 17.79208, 16.78238, 23…
## $ X13 <dbl> 27.95256, 47.25374, 25.76882, 20.95780, 23.21144, 14.52982, 32…
## $ X14 <dbl> 8.6000, 3.0000, 5.0000, 7.0000, 7.3000, 9.0000, 2.0000, 17.000…
Melihat struktur data
Pada hasil terlihat Country dan Risk Level merupakan tipe karakter (string), peubah Country nantinya akan dihapus dan Risk Level akan menjadi faktor.
# Show the internal structure (class, dimensions, column types) of the
# training data.
str(Risiko_training)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:100] "AD" "AE" "AE-AZ" "AE-RK" ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk Level: chr [1:100] "low" "low" "low" "low" ...
# Show the internal structure (class, dimensions, column types) of the
# testing data.
str(Risiko_testing)
## tibble [17 × 15] (S3: tbl_df/tbl/data.frame)
## $ Country: chr [1:17] "SE" "SG" "SI" "SK" ...
## $ X1 : num [1:17] 23.2 16.8 18.3 19.7 11.9 ...
## $ X2 : num [1:17] 60338 62433 28684 21043 49356 ...
## $ X3 : num [1:17] 175.4 409.7 103.1 102.7 60.2 ...
## $ X4 : num [1:17] 1.62 0.105 0.844 1.174 0.896 ...
## $ X5 : num [1:17] 0.6755 0.9068 0.0746 0.0734 0.5865 ...
## $ X6 : num [1:17] 2.47 2.78 3.55 3.22 1.75 ...
## $ X7 : num [1:17] 0.353 0.291 1.93 1.232 -1.134 ...
## $ X8 : num [1:17] 185.6 94 72.3 111.8 88.6 ...
## $ X9 : num [1:17] 64.1 -201 16.2 33.4 -145.4 ...
## $ X10 : num [1:17] 537.61 339.99 52.76 102.57 1.49 ...
## $ X11 : num [1:17] 0.5 1.31 3.02 2.53 63.5 ...
## $ X12 : num [1:17] 25.1 26.8 19.9 22.8 17.8 ...
## $ X13 : num [1:17] 28 47.3 25.8 21 23.2 ...
## $ X14 : num [1:17] 8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...
Ringkasan dari Risiko_training dan Risiko_testing terlihat ada beberapa peubah yang memiliki missing value.
# Per-column summary statistics of the training data; the "NA's" entries
# report how many values are missing in a column.
summary(Risiko_training)
## Country X1 X2 X3
## Length:100 Min. : 4.20 Min. : 434.5 Min. : 13.63
## Class :character 1st Qu.:15.93 1st Qu.: 4265.9 1st Qu.: 42.96
## Mode :character Median :18.58 Median : 11659.1 Median : 70.42
## Mean :18.97 Mean : 22641.6 Mean : 191.94
## 3rd Qu.:21.80 3rd Qu.: 34815.2 3rd Qu.: 130.63
## Max. :47.50 Max. :124340.4 Max. :6908.35
## NA's :12
## X4 X5 X6 X7
## Min. :-0.151 Min. :-0.8862 Min. :-5.135 Min. :-9.84530
## 1st Qu.: 0.869 1st Qu.: 0.4419 1st Qu.: 1.765 1st Qu.:-1.18720
## Median : 1.700 Median : 1.1402 Median : 2.984 Median : 0.07155
## Mean : 3.263 Mean : 1.2019 Mean : 3.076 Mean : 0.10804
## 3rd Qu.: 3.939 3rd Qu.: 1.9502 3rd Qu.: 4.305 3rd Qu.: 1.94108
## Max. :36.703 Max. : 4.4021 Max. :10.076 Max. : 6.07120
##
## X8 X9 X10 X11
## Min. : 34.82 Min. :-1955.72 Min. : 1.171 Min. : 0.3357
## 1st Qu.: 76.95 1st Qu.: -14.11 1st Qu.: 32.813 1st Qu.: 1.9250
## Median : 90.19 Median : 12.67 Median : 106.872 Median : 3.9000
## Mean : 99.94 Mean : -13.58 Mean : 582.318 Mean : 5.5346
## 3rd Qu.:113.39 3rd Qu.: 36.67 3rd Qu.: 366.370 3rd Qu.: 7.9500
## Max. :359.14 Max. : 456.49 Max. :14866.703 Max. :26.9780
## NA's :7 NA's :17
## X12 X13 X14 Risk Level
## Min. :12.67 Min. :10.95 Min. : 0.120 Length:100
## 1st Qu.:20.79 1st Qu.:19.06 1st Qu.: 4.818 Class :character
## Median :23.40 Median :24.28 Median : 6.800 Mode :character
## Mean :24.96 Mean :24.48 Mean : 8.441
## 3rd Qu.:28.38 3rd Qu.:29.36 3rd Qu.:10.500
## Max. :46.83 Max. :55.09 Max. :24.650
## NA's :11
# Per-column summary statistics of the testing data; the "NA's" entries
# report how many values are missing in a column.
summary(Risiko_testing)
## Country X1 X2 X3
## Length:17 Min. :11.90 Min. : 786.9 Min. : 30.05
## Class :character 1st Qu.:15.76 1st Qu.: 3955.1 1st Qu.: 48.51
## Mode :character Median :17.52 Median : 8653.0 Median : 65.56
## Mean :17.42 Mean :22330.4 Mean : 92.59
## 3rd Qu.:19.70 3rd Qu.:31854.3 3rd Qu.:103.06
## Max. :23.20 Max. :69324.7 Max. :409.70
## NA's :1
## X4 X5 X6 X7
## Min. : 0.1051 Min. :-0.3906 Min. :0.340 Min. :-2.3230
## 1st Qu.: 0.8435 1st Qu.: 0.3153 1st Qu.:1.754 1st Qu.:-0.1248
## Median : 1.6200 Median : 0.6255 Median :2.539 Median : 0.4867
## Mean : 4.4949 Mean : 0.8249 Mean :2.994 Mean : 0.8826
## 3rd Qu.: 5.5560 3rd Qu.: 1.1173 3rd Qu.:3.553 3rd Qu.: 1.8906
## Max. :19.1730 Max. : 3.6551 Max. :6.946 Max. : 5.2762
##
## X8 X9 X10 X11
## Min. : 49.06 Min. :-200.98 Min. : 1.491 Min. : 0.500
## 1st Qu.: 72.28 1st Qu.: -42.56 1st Qu.: 52.762 1st Qu.: 1.571
## Median : 88.89 Median : 15.04 Median : 155.582 Median : 2.530
## Mean : 96.54 Mean : -18.76 Mean : 1463.386 Mean :11.258
## 3rd Qu.:109.52 3rd Qu.: 28.57 3rd Qu.: 501.644 3rd Qu.: 3.336
## Max. :185.64 Max. : 64.46 Max. :20935.000 Max. :63.500
## NA's :2 NA's :4
## X12 X13 X14
## Min. :16.45 Min. : 8.882 Min. : 2.000
## 1st Qu.:17.79 1st Qu.:17.208 1st Qu.: 4.675
## Median :22.03 Median :23.211 Median : 7.150
## Mean :21.91 Mean :23.693 Mean : 8.963
## 3rd Qu.:24.86 3rd Qu.:27.953 3rd Qu.: 9.700
## Max. :31.60 Max. :47.254 Max. :33.700
## NA's :1
Fungsi make.names() mengubah nama kolom sehingga menjadi valid dalam R. Ini berguna jika nama kolom mengandung spasi, karakter khusus, atau dimulai dengan angka. make.names() akan mengganti karakter yang tidak valid dengan titik (.), contohnya “Risk Level” menjadi “Risk.Level”.
# Make the training column names syntactically valid R names
# (e.g. "Risk Level" becomes "Risk.Level").
names(Risiko_training) <- make.names(names(Risiko_training))
Mengubah Risk.Level menjadi faktor dengan dua level yang diurutkan secara alfabetis: level 1 = high dan level 2 = low.
# Convert the response to a factor. Levels are sorted alphabetically, so the
# resulting factor has the levels "high" and "low", in that order.
Risiko_training[["Risk.Level"]] <- factor(Risiko_training[["Risk.Level"]])
# Confirm the new column type.
str(Risiko_training)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:100] "AD" "AE" "AE-AZ" "AE-RK" ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk.Level: Factor w/ 2 levels "high","low": 2 2 2 2 1 1 1 2 2 1 ...
Menghapus Country dari Risiko_training dan Risiko_testing karena tidak terpakai pada analisis.
# Drop the identifier column `Country`, which is not used as a predictor.
# A logical mask is used instead of `-which(...)`: when no column matches,
# `-which(...)` evaluates to `-integer(0)` and silently drops EVERY column,
# whereas the mask then simply keeps all columns.
Risiko_training <- Risiko_training[, names(Risiko_training) != "Country", drop = FALSE]
Risiko_testing <- Risiko_testing[, names(Risiko_testing) != "Country", drop = FALSE]
# Number of missing values per column in the training data.
colSums(is.na(Risiko_training))
## X1 X2 X3 X4 X5 X6 X7
## 12 0 0 0 0 0 0
## X8 X9 X10 X11 X12 X13 X14
## 7 0 0 17 0 0 11
## Risk.Level
## 0
# Number of missing values per column in the testing data.
colSums(is.na(Risiko_testing))
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
## 1 0 0 0 0 0 0 2 0 0 4 0 0 1
Proporsi Missing Values: terlihat missing values pada X1, X8, X11, dan X14, dengan proporsi terbesar pada X11 dan yang terkecil pada X8.
Keterangan:
Merah : Missing Values
Biru : Dataset
# Plot the missing-value pattern of the training data (VIM::aggr).
aggr(Risiko_training)
Menggunakan Multivariate Imputation by Chained Equations (MICE) untuk mengatasi atau mengisi missing values. MICE menggunakan model statistik untuk memprediksi nilai yang hilang berdasarkan variabel lain dalam dataset, dan proses imputasi dilakukan dalam beberapa langkah (atau iterasi). Pada kasus ini MICE akan menghasilkan 30 dataset yang berbeda dan metode imputasi yang digunakan adalah pmm (Predictive Mean Matching).
# Multiple imputation by chained equations on the training data: 30 imputed
# data sets, predictive mean matching, fixed seed for reproducibility.
Risiko_training_imp <- mice(
  data = Risiko_training,
  m = 30,
  method = "pmm",
  seed = 500
)
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 1 6 X1 X8 X11 X14
## 1 7 X1 X8 X11 X14
## 1 8 X1 X8 X11 X14
## 1 9 X1 X8 X11 X14
## 1 10 X1 X8 X11 X14
## 1 11 X1 X8 X11 X14
## 1 12 X1 X8 X11 X14
## 1 13 X1 X8 X11 X14
## 1 14 X1 X8 X11 X14
## 1 15 X1 X8 X11 X14
## 1 16 X1 X8 X11 X14
## 1 17 X1 X8 X11 X14
## 1 18 X1 X8 X11 X14
## 1 19 X1 X8 X11 X14
## 1 20 X1 X8 X11 X14
## 1 21 X1 X8 X11 X14
## 1 22 X1 X8 X11 X14
## 1 23 X1 X8 X11 X14
## 1 24 X1 X8 X11 X14
## 1 25 X1 X8 X11 X14
## 1 26 X1 X8 X11 X14
## 1 27 X1 X8 X11 X14
## 1 28 X1 X8 X11 X14
## 1 29 X1 X8 X11 X14
## 1 30 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 2 6 X1 X8 X11 X14
## 2 7 X1 X8 X11 X14
## 2 8 X1 X8 X11 X14
## 2 9 X1 X8 X11 X14
## 2 10 X1 X8 X11 X14
## 2 11 X1 X8 X11 X14
## 2 12 X1 X8 X11 X14
## 2 13 X1 X8 X11 X14
## 2 14 X1 X8 X11 X14
## 2 15 X1 X8 X11 X14
## 2 16 X1 X8 X11 X14
## 2 17 X1 X8 X11 X14
## 2 18 X1 X8 X11 X14
## 2 19 X1 X8 X11 X14
## 2 20 X1 X8 X11 X14
## 2 21 X1 X8 X11 X14
## 2 22 X1 X8 X11 X14
## 2 23 X1 X8 X11 X14
## 2 24 X1 X8 X11 X14
## 2 25 X1 X8 X11 X14
## 2 26 X1 X8 X11 X14
## 2 27 X1 X8 X11 X14
## 2 28 X1 X8 X11 X14
## 2 29 X1 X8 X11 X14
## 2 30 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 3 6 X1 X8 X11 X14
## 3 7 X1 X8 X11 X14
## 3 8 X1 X8 X11 X14
## 3 9 X1 X8 X11 X14
## 3 10 X1 X8 X11 X14
## 3 11 X1 X8 X11 X14
## 3 12 X1 X8 X11 X14
## 3 13 X1 X8 X11 X14
## 3 14 X1 X8 X11 X14
## 3 15 X1 X8 X11 X14
## 3 16 X1 X8 X11 X14
## 3 17 X1 X8 X11 X14
## 3 18 X1 X8 X11 X14
## 3 19 X1 X8 X11 X14
## 3 20 X1 X8 X11 X14
## 3 21 X1 X8 X11 X14
## 3 22 X1 X8 X11 X14
## 3 23 X1 X8 X11 X14
## 3 24 X1 X8 X11 X14
## 3 25 X1 X8 X11 X14
## 3 26 X1 X8 X11 X14
## 3 27 X1 X8 X11 X14
## 3 28 X1 X8 X11 X14
## 3 29 X1 X8 X11 X14
## 3 30 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 4 6 X1 X8 X11 X14
## 4 7 X1 X8 X11 X14
## 4 8 X1 X8 X11 X14
## 4 9 X1 X8 X11 X14
## 4 10 X1 X8 X11 X14
## 4 11 X1 X8 X11 X14
## 4 12 X1 X8 X11 X14
## 4 13 X1 X8 X11 X14
## 4 14 X1 X8 X11 X14
## 4 15 X1 X8 X11 X14
## 4 16 X1 X8 X11 X14
## 4 17 X1 X8 X11 X14
## 4 18 X1 X8 X11 X14
## 4 19 X1 X8 X11 X14
## 4 20 X1 X8 X11 X14
## 4 21 X1 X8 X11 X14
## 4 22 X1 X8 X11 X14
## 4 23 X1 X8 X11 X14
## 4 24 X1 X8 X11 X14
## 4 25 X1 X8 X11 X14
## 4 26 X1 X8 X11 X14
## 4 27 X1 X8 X11 X14
## 4 28 X1 X8 X11 X14
## 4 29 X1 X8 X11 X14
## 4 30 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## 5 6 X1 X8 X11 X14
## 5 7 X1 X8 X11 X14
## 5 8 X1 X8 X11 X14
## 5 9 X1 X8 X11 X14
## 5 10 X1 X8 X11 X14
## 5 11 X1 X8 X11 X14
## 5 12 X1 X8 X11 X14
## 5 13 X1 X8 X11 X14
## 5 14 X1 X8 X11 X14
## 5 15 X1 X8 X11 X14
## 5 16 X1 X8 X11 X14
## 5 17 X1 X8 X11 X14
## 5 18 X1 X8 X11 X14
## 5 19 X1 X8 X11 X14
## 5 20 X1 X8 X11 X14
## 5 21 X1 X8 X11 X14
## 5 22 X1 X8 X11 X14
## 5 23 X1 X8 X11 X14
## 5 24 X1 X8 X11 X14
## 5 25 X1 X8 X11 X14
## 5 26 X1 X8 X11 X14
## 5 27 X1 X8 X11 X14
## 5 28 X1 X8 X11 X14
## 5 29 X1 X8 X11 X14
## 5 30 X1 X8 X11 X14
Menggunakan fungsi complete() untuk mengambil dataset yang telah diimputasi dari Risiko_training_imp yang dihasilkan oleh MICE; dataset yang diambil adalah dataset ke-26.
# Extract one completed (imputed) data set from the mice result; `action = 26`
# selects the 26th imputation.
Risiko_training_comp <- complete(Risiko_training_imp, action = 26)
Dapat dilihat pada diagram sudah tidak ada missing values pada X1, X8, X11, dan X14.
# Re-plot the missing-value pattern after imputation to check that no
# missing values remain (VIM::aggr).
aggr(Risiko_training_comp)
Berisi perbandingan dataset sebelum dan sesudah dilakukan MICE.
Keterangan:
Garis hitam : sebelum Hijau : sesudah
# For every variable that had missing values, draw the density of the observed
# values (black, solid) and overlay the density after imputation (green,
# dotted). `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned; the four identical plot/lines pairs are folded into one loop.
for (var_name in c("X1", "X8", "X11", "X14")) {
  plot(
    density(Risiko_training[[var_name]], na.rm = TRUE),
    main = paste("Data", var_name, "With NA")
  )
  lines(
    density(Risiko_training_comp[[var_name]], na.rm = TRUE),
    col = "green",
    lty = 3
  )
}
Proporsi Missing Values: terlihat missing values pada X1, X8, X11, dan X14, dengan proporsi terbesar pada X11 dan yang terkecil pada X1 dan X14.
Keterangan:
Merah : Missing Values
Biru : Dataset
# Plot the missing-value pattern of the testing data (VIM::aggr).
aggr(Risiko_testing)
Menggunakan Multivariate Imputation by Chained Equations (MICE) untuk mengatasi atau mengisi missing values. MICE menggunakan model statistik untuk memprediksi nilai yang hilang berdasarkan variabel lain dalam dataset, dan proses imputasi dilakukan dalam beberapa langkah (atau iterasi). Pada kasus ini MICE akan menghasilkan 20 dataset yang berbeda dan metode imputasi yang digunakan adalah pmm (Predictive Mean Matching).
# Multiple imputation by chained equations on the testing data: 20 imputed
# data sets, predictive mean matching, fixed seed for reproducibility.
Risiko_testing_imp <- mice(
  data = Risiko_testing,
  m = 20,
  method = "pmm",
  seed = 500
)
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 1 6 X1 X8 X11 X14
## 1 7 X1 X8 X11 X14
## 1 8 X1 X8 X11 X14
## 1 9 X1 X8 X11 X14
## 1 10 X1 X8 X11 X14
## 1 11 X1 X8 X11 X14
## 1 12 X1 X8 X11 X14
## 1 13 X1 X8 X11 X14
## 1 14 X1 X8 X11 X14
## 1 15 X1 X8 X11 X14
## 1 16 X1 X8 X11 X14
## 1 17 X1 X8 X11 X14
## 1 18 X1 X8 X11 X14
## 1 19 X1 X8 X11 X14
## 1 20 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 2 6 X1 X8 X11 X14
## 2 7 X1 X8 X11 X14
## 2 8 X1 X8 X11 X14
## 2 9 X1 X8 X11 X14
## 2 10 X1 X8 X11 X14
## 2 11 X1 X8 X11 X14
## 2 12 X1 X8 X11 X14
## 2 13 X1 X8 X11 X14
## 2 14 X1 X8 X11 X14
## 2 15 X1 X8 X11 X14
## 2 16 X1 X8 X11 X14
## 2 17 X1 X8 X11 X14
## 2 18 X1 X8 X11 X14
## 2 19 X1 X8 X11 X14
## 2 20 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 3 6 X1 X8 X11 X14
## 3 7 X1 X8 X11 X14
## 3 8 X1 X8 X11 X14
## 3 9 X1 X8 X11 X14
## 3 10 X1 X8 X11 X14
## 3 11 X1 X8 X11 X14
## 3 12 X1 X8 X11 X14
## 3 13 X1 X8 X11 X14
## 3 14 X1 X8 X11 X14
## 3 15 X1 X8 X11 X14
## 3 16 X1 X8 X11 X14
## 3 17 X1 X8 X11 X14
## 3 18 X1 X8 X11 X14
## 3 19 X1 X8 X11 X14
## 3 20 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 4 6 X1 X8 X11 X14
## 4 7 X1 X8 X11 X14
## 4 8 X1 X8 X11 X14
## 4 9 X1 X8 X11 X14
## 4 10 X1 X8 X11 X14
## 4 11 X1 X8 X11 X14
## 4 12 X1 X8 X11 X14
## 4 13 X1 X8 X11 X14
## 4 14 X1 X8 X11 X14
## 4 15 X1 X8 X11 X14
## 4 16 X1 X8 X11 X14
## 4 17 X1 X8 X11 X14
## 4 18 X1 X8 X11 X14
## 4 19 X1 X8 X11 X14
## 4 20 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## 5 6 X1 X8 X11 X14
## 5 7 X1 X8 X11 X14
## 5 8 X1 X8 X11 X14
## 5 9 X1 X8 X11 X14
## 5 10 X1 X8 X11 X14
## 5 11 X1 X8 X11 X14
## 5 12 X1 X8 X11 X14
## 5 13 X1 X8 X11 X14
## 5 14 X1 X8 X11 X14
## 5 15 X1 X8 X11 X14
## 5 16 X1 X8 X11 X14
## 5 17 X1 X8 X11 X14
## 5 18 X1 X8 X11 X14
## 5 19 X1 X8 X11 X14
## 5 20 X1 X8 X11 X14
## Warning: Number of logged events: 124
Menggunakan fungsi complete() untuk mengambil dataset yang telah diimputasi dari Risiko_testing_imp yang dihasilkan oleh MICE, dataset yang diambil adalah dataset ke 13.
# Extract one completed (imputed) data set from the mice result; `action = 13`
# selects the 13th imputation.
Risiko_testing_comp <- complete(Risiko_testing_imp, action = 13)
Dapat dilihat pada diagram sudah tidak ada missing values pada X1, X8, X11, dan X14.
# Re-plot the missing-value pattern after imputation to check that no
# missing values remain (VIM::aggr).
aggr(Risiko_testing_comp)
Berisi perbandingan dataset sebelum dan sesudah dilakukan MICE.
Keterangan:
Garis hitam : sebelum Hijau : sesudah
# For every variable that had missing values, draw the density of the observed
# values (black, solid) and overlay the density after imputation (green,
# dotted). Building the title from the variable name fixes the X14 plot, which
# was previously labelled "Data X11 With NA". `TRUE` replaces the reassignable
# shorthand `T`.
for (var_name in c("X1", "X8", "X11", "X14")) {
  plot(
    density(Risiko_testing[[var_name]], na.rm = TRUE),
    main = paste("Data", var_name, "With NA")
  )
  lines(
    density(Risiko_testing_comp[[var_name]], na.rm = TRUE),
    col = "green",
    lty = 3
  )
}
# Compact overview of the imputed training data.
glimpse(Risiko_training_comp)
## Rows: 100
## Columns: 15
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, 19.1000, 14.0000, 14.2800, 23.25…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 2033.…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.0556…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36.7…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0.7…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0.2…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.729…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.81…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.449…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 62.…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, 0.9250, 6.6000, 10.3000, 10.6000, 2…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.12380,…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.57210,…
## $ X14 <dbl> 3.0000, 2.4500, 4.0000, 7.1000, 18.5000, 10.5000, 11.0500, …
## $ Risk.Level <fct> low, low, low, low, high, high, high, low, low, high, high,…
# Compact overview of the imputed testing data.
glimpse(Risiko_testing_comp)
## Rows: 17
## Columns: 14
## $ X1 <dbl> 23.2000, 16.8056, 18.2857, 19.6715, 11.9000, 16.3000, 19.8000, 12.…
## $ X2 <dbl> 60338.0204, 62432.9952, 28684.1682, 21042.7221, 49356.2618, 3989.1…
## $ X3 <dbl> 175.42230, 409.69700, 103.06040, 102.73060, 60.15464, 65.55750, 33…
## $ X4 <dbl> 1.62000, 0.10510, 0.84352, 1.17400, 0.89594, 0.39400, 0.34500, 5.5…
## $ X5 <dbl> 0.6755, 0.9068, 0.0746, 0.0734, 0.5865, 0.5042, 0.3153, 1.1173, 1.…
## $ X6 <dbl> 2.47168, 2.77600, 3.55290, 3.21976, 1.75420, 2.44734, 3.44058, 1.6…
## $ X7 <dbl> 0.3526, 0.2912, 1.9299, 1.2325, -1.1342, -0.1248, 1.2787, -1.5047,…
## $ X8 <dbl> 185.64097, 94.00211, 72.30708, 111.78982, 88.60514, 88.88685, 100.…
## $ X9 <dbl> 64.14972, -200.98100, 16.23838, 33.35258, -145.43800, 27.33332, -4…
## $ X10 <dbl> 537.609866, 339.988210, 52.761781, 102.567122, 1.490827, 24.638720…
## $ X11 <dbl> 0.5000, 1.3095, 3.0176, 2.5300, 63.5000, 1.5706, 3.2000, 13.6000, …
## $ X12 <dbl> 25.11320, 26.76784, 19.90742, 22.83084, 17.79208, 16.78238, 23.059…
## $ X13 <dbl> 27.95256, 47.25374, 25.76882, 20.95780, 23.21144, 14.52982, 32.479…
## $ X14 <dbl> 8.6000, 3.0000, 5.0000, 7.0000, 7.3000, 9.0000, 2.0000, 17.0000, 1…
Bar chart untuk Risk.Level pada Risiko_training_comp
# Count the observations in each Risk.Level class of the imputed training data.
risk_counts <- Risiko_training_comp %>%
  group_by(Risk.Level) %>%
  summarize(count = n())
# Bar chart of the class frequencies with the count printed above each bar.
# The manual fill colours are keyed by the real factor levels ("low"/"high").
# The previous keys "0"/"1" matched no level, so ggplot2 warned "No shared
# levels found" and the intended colours were not applied. Risk.Level is
# already a factor, so the redundant as.factor() wrappers are dropped.
ggplot(risk_counts, aes(x = Risk.Level, y = count, fill = Risk.Level)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = count), vjust = -0.25, size = 4) +
  scale_fill_manual(values = c("low" = "gray", "high" = "black")) +
  labs(title = "Frekuensi Risk Level", x = "Risk Level", y = "Frekuensi") +
  theme_minimal()
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
Data Risiko_training_comp dibagi menjadi dua bagian, yaitu 80% sebagai data training dan 20% sebagai data testing.
# Stratified 80/20 split of the imputed training data on Risk.Level.
# The seed is set immediately before the random partition so the split is
# reproducible. Note that the names Risiko_training / Risiko_testing are
# reused here for the two parts of the split.
set.seed(123)
train_index <- createDataPartition(
  y = Risiko_training_comp[["Risk.Level"]],
  p = 0.8,
  list = FALSE
)
# Rows not selected for training form the hold-out set.
Risiko_testing <- Risiko_training_comp[-train_index, ]
Risiko_training <- Risiko_training_comp[train_index, ]
Menggunakan model SVM dengan kernel linear. Melatih model SVM dengan kernel linear menggunakan data pelatihan Risiko_training. Peubah targetnya adalah Risk.Level, dan semua peubah lain digunakan sebagai prediktor.
# Fit an SVM classifier with a linear kernel: Risk.Level is the response and
# every other column of Risiko_training is used as a predictor.
svm_model <- svm(Risk.Level ~ ., data = Risiko_training, kernel = "linear")
# Prediction (testing): predict the class of each row of the hold-out data
# Risiko_testing and store the result in svm_pred.
svm_pred <- predict(svm_model, newdata = Risiko_testing)
Menghitung akurasi dari prediksi model dengan membandingkan prediksi svm_pred dengan nilai aktual pada data uji Risiko_testing$Risk.Level. Akurasi dihitung sebagai persentase dari prediksi yang benar.
# Hold-out accuracy: share of rows whose predicted class equals the actual one.
is_correct <- svm_pred == Risiko_testing$Risk.Level
akurasi_svm <- sum(is_correct) / nrow(Risiko_testing)
# Report the accuracy as a percentage rounded to two decimals.
akurasi_pct <- round(akurasi_svm * 100, 2)
print(paste("SVM Akurasi Model:", akurasi_pct, "%"))
## [1] "SVM Akurasi Model: 94.74 %"
# Confusion matrix and derived statistics for the hold-out predictions.
# Both inputs are already factors, so no conversion is needed.
confusion_matrix <- confusionMatrix(
  data = svm_pred,
  reference = Risiko_testing$Risk.Level
)
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low
## high 10 1
## low 0 8
##
## Accuracy : 0.9474
## 95% CI : (0.7397, 0.9987)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : 9.149e-05
##
## Kappa : 0.8939
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.8889
## Pos Pred Value : 0.9091
## Neg Pred Value : 1.0000
## Prevalence : 0.5263
## Detection Rate : 0.5263
## Detection Prevalence : 0.5789
## Balanced Accuracy : 0.9444
##
## 'Positive' Class : high
##
# Boxplots of individual predictors (here X1 and X8) by Risk.Level.
# The manual fill colours are keyed by the real factor levels ("low"/"high");
# the previous keys "0"/"1" matched no level and triggered ggplot2's
# "No shared levels found" warning.
ggplot(Risiko_training_comp, aes(x = Risk.Level, y = X1, fill = Risk.Level)) +
  geom_boxplot() +
  labs(title = "Boxplot X1 terhadap Risk Level", x = "Risk Level", y = "X1") +
  scale_fill_manual(values = c("low" = "gray", "high" = "black")) +
  theme_minimal()
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
# Boxplot of X8 by Risk.Level. The manual fill colours are keyed by the real
# factor levels ("low"/"high"); the previous keys "0"/"1" matched no level and
# triggered ggplot2's "No shared levels found" warning.
ggplot(Risiko_training_comp, aes(x = Risk.Level, y = X8, fill = Risk.Level)) +
  geom_boxplot() +
  labs(title = "Boxplot X8 terhadap Risk Level", x = "Risk Level", y = "X8") +
  scale_fill_manual(values = c("low" = "gray", "high" = "black")) +
  theme_minimal()
## Warning: No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
## No shared levels found between `names(values)` of the manual scale and the
## data's fill values.
10 prediksi benar untuk kelas high (model memprediksi high dan benar-benar high).
8 prediksi benar untuk kelas low (model memprediksi low dan benar-benar low).
Terdapat 1 prediksi yang salah: satu amatan yang sebenarnya low diprediksi sebagai high, sedangkan tidak ada amatan high yang diprediksi sebagai low. Dengan akurasi 94,74%, model tetap bekerja dengan baik.
# Convert the confusion table to long format (one row per Prediction/Reference
# pair with its frequency) and draw it as a heat map with the counts as labels.
confusion_data <- as.data.frame(confusion_matrix[["table"]])
ggplot(data = confusion_data, mapping = aes(x = Reference, y = Prediction)) +
  geom_tile(mapping = aes(fill = Freq), color = "yellow") +
  # The text layer is added after the tiles so the counts are drawn on top.
  geom_text(mapping = aes(label = Freq), vjust = 1) +
  scale_fill_gradient(low = "yellow", high = "blue") +
  labs(title = "Confusion Matrix", x = "Actual", y = "Predicted") +
  theme_minimal()
# Side-by-side listing of actual and predicted classes for the hold-out rows.
predictions <- data.frame(
  Actual = Risiko_testing[["Risk.Level"]],
  Predicted = svm_pred
)
print(predictions)
## Actual Predicted
## 1 low low
## 5 high high
## 6 high high
## 12 high high
## 22 low low
## 30 high high
## 32 low low
## 42 high high
## 50 low low
## 51 low high
## 58 high high
## 62 low low
## 66 high high
## 68 low low
## 71 high high
## 73 high high
## 75 low low
## 93 high high
## 94 low low
Prediksi dilakukan menggunakan model yang sudah dilatih dan diterapkan untuk memprediksi Risk.Level pada data Risiko_testing_comp.
# Apply the fitted SVM to the imputed rows of the Excel "Testing" sheet.
svm_pred.excel <- predict(
  object = svm_model,
  newdata = Risiko_testing_comp
)
print(svm_pred.excel)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## low low low low high high low high high low high high low high high low
## 17
## high
## Levels: high low
Data yang dihasilkan dari prediksi disimpan dengan nama Prediksi.Risk.Level dan akan masuk ke dalam data frame Risiko_testing_comp.
# Attach the predicted class to the imputed testing data as a new column.
Risiko_testing_comp[["Prediksi.Risk.Level"]] <- svm_pred.excel
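Catatan: data Testing tidak memiliki Risk Level aktual, sehingga akurasi prediksi pada data ini tidak dapat diukur. Membandingkan Risiko_testing_comp$Prediksi.Risk.Level dengan svm_pred.excel berarti membandingkan hasil prediksi dengan dirinya sendiri, yang selalu menghasilkan 100% dan tidak menggambarkan kinerja model. Ukuran kinerja model yang sebenarnya adalah akurasi pada data uji hasil pembagian 80/20 di atas (94,74%).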
# Accuracy on the Excel "Testing" sheet.
# The previous code compared the stored predictions with the very same
# predictions, which is always 100 % and says nothing about model quality.
# An accuracy can only be computed against observed labels, so it is computed
# only when the original testing sheet carries a Risk Level column; otherwise
# the accuracy is NA and a message explains why.
label_col <- intersect(c("Risk Level", "Risk.Level"), names(Risiko_testing.ori))
if (length(label_col) > 0) {
  actual_excel <- as.character(Risiko_testing.ori[[label_col[1]]])
  accuracy_svm.excel <- mean(as.character(svm_pred.excel) == actual_excel)
  print(paste("SVM Akurasi untuk Data Excel:", round(accuracy_svm.excel * 100, 2), "%"))
} else {
  accuracy_svm.excel <- NA_real_
  print("SVM Akurasi untuk Data Excel: tidak dapat dihitung (data Testing tidak memiliki Risk Level aktual)")
}
## [1] "SVM Akurasi untuk Data Excel: 100 %"
# Format the predicted class as a bold, coloured HTML cell: red background for
# "high", green for "low". The "low" background was misspelled "grren", which
# is not a valid colour name, so those cells received no usable background.
Risiko_testing_comp <- Risiko_testing_comp %>%
  mutate(Prediksi.Risk.Level = ifelse(
    Prediksi.Risk.Level == "high",
    cell_spec(Prediksi.Risk.Level, background = "red", bold = TRUE),
    cell_spec(Prediksi.Risk.Level, background = "green", bold = TRUE)
  ))
# Render the table; escape = FALSE keeps the HTML produced by cell_spec()
# from being escaped. Row 0 is the header row.
Risiko_testing_comp %>%
  kbl(escape = FALSE) %>%
  kable_material_dark() %>%
  row_spec(0, bold = TRUE, color = "white") %>%
  kable_styling(font_size = 9)
| X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | X12 | X13 | X14 | Prediksi.Risk.Level |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 23.2000 | 60338.0204 | 175.42230 | 1.62000 | 0.6755 | 2.47168 | 0.3526 | 185.64097 | 64.14972 | 537.609866 | 0.5000 | 25.11320 | 27.95256 | 8.6000 | low |
| 16.8056 | 62432.9952 | 409.69700 | 0.10510 | 0.9068 | 2.77600 | 0.2912 | 94.00211 | -200.98100 | 339.988210 | 1.3095 | 26.76784 | 47.25374 | 3.0000 | low |
| 18.2857 | 28684.1682 | 103.06040 | 0.84352 | 0.0746 | 3.55290 | 1.9299 | 72.30708 | 16.23838 | 52.761781 | 3.0176 | 19.90742 | 25.76882 | 5.0000 | low |
| 19.6715 | 21042.7221 | 102.73060 | 1.17400 | 0.0734 | 3.21976 | 1.2325 | 111.78982 | 33.35258 | 102.567122 | 2.5300 | 22.83084 | 20.95780 | 7.0000 | low |
| 11.9000 | 49356.2618 | 60.15464 | 0.89594 | 0.5865 | 1.75420 | -1.1342 | 88.60514 | -145.43800 | 1.490827 | 63.5000 | 17.79208 | 23.21144 | 7.3000 | high |
| 16.3000 | 3989.1913 | 65.55750 | 0.39400 | 0.5042 | 2.44734 | -0.1248 | 88.88685 | 27.33332 | 24.638720 | 1.5706 | 16.78238 | 14.52982 | 9.0000 | high |
| 19.8000 | 7450.5523 | 33.22256 | 0.34500 | 0.3153 | 3.44058 | 1.2787 | 100.19298 | -42.56340 | 501.644054 | 3.2000 | 23.05990 | 32.47950 | 2.0000 | low |
| 12.9000 | 3616.8650 | 85.26668 | 5.55600 | 1.1173 | 1.60820 | -1.5047 | 134.47988 | 64.46288 | 39.218118 | 13.6000 | 18.80654 | 8.88180 | 17.0000 | high |
| 18.0000 | 8652.9973 | 51.65878 | 11.65444 | 1.4844 | 4.15702 | 1.8070 | 116.52826 | 28.56998 | 720.244499 | 13.6000 | 28.55834 | 26.32778 | 13.2000 | high |
| 14.1400 | 31854.2815 | 48.51016 | 0.72360 | 0.1015 | 2.53870 | 2.7686 | 71.08310 | -189.14000 | 668.122597 | 3.2000 | 22.02936 | 33.80118 | 3.7000 | low |
| 22.0000 | 3955.0704 | 103.90710 | 19.17300 | -0.3906 | 0.34000 | 1.8906 | 72.25639 | -5.46582 | 155.581868 | 49.0000 | 17.79388 | 16.04966 | 9.5000 | high |
| 21.5689 | 786.8776 | 42.26784 | 4.29470 | 3.6551 | 5.73874 | 0.4207 | 69.45587 | 21.88058 | 33.538172 | 3.3357 | 24.85996 | 19.47826 | 9.0000 | high |
| 16.3000 | 69324.7338 | 104.17110 | 1.55316 | 0.6255 | 2.45554 | 0.4867 | 116.52826 | 47.70210 | 20935.000000 | 1.0000 | 17.40650 | 17.20802 | 5.6146 | low |
| 17.0400 | 15968.2306 | 73.01010 | 8.00348 | 0.3592 | 0.82090 | -0.7169 | 49.05568 | -16.23150 | 53.628838 | 3.3357 | 16.44666 | 17.57796 | 10.3000 | high |
| 18.4000 | 1872.6699 | 30.04996 | 12.29840 | 1.6065 | 5.84000 | 3.0735 | 71.08310 | -45.16010 | 57.707193 | 2.1000 | 31.60162 | 29.24286 | 6.0000 | high |
| 12.0977 | 3886.5162 | 34.52492 | 2.79600 | 0.8506 | 6.94570 | 5.2762 | 86.56201 | 7.39622 | 351.683014 | 1.6900 | 23.54764 | 25.80812 | 2.5000 | low |
| 16.6000 | 6404.6725 | 50.79576 | 4.98278 | 1.4772 | 0.78940 | -2.3230 | 107.24859 | 15.04496 | 302.141270 | 3.2000 | 19.11740 | 16.25316 | 33.7000 | high |