library(readxl)
library(dplyr)
library(mice)
library(VIM)
library(ggplot2)
library(caret)
library(randomForest)
library(kableExtra)
Terdapat dua data yaitu Testing dan Training, data Training digunakan untuk modelling Random Forest dan data testing digunakan untuk memprediksi Risk Level
Data pada sheet training dimasukkan dalam data.training dan data pada sheet testing dimasukkan dalam data.testing
# Both sheets come from the same workbook, so its path is spelled out once.
excel_path <- "C:/Wahyu/Kuliah/Semester 3/Statistika Ekonomi dan Industri/UTS/Level Risiko Investasi.xlsx"
# "Training" sheet: the data used to build the Random Forest model.
data.training <- read_excel(excel_path, sheet = "Training")
# "Testing" sheet: the rows whose Risk Level is to be predicted.
data.testing <- read_excel(excel_path, sheet = "Testing")
# Keep an unmodified copy of the testing sheet; its Country column is
# re-attached to the prediction table at the end of the analysis.
test.data.ori <- data.testing
# Preview the column types and leading values of the training sheet.
glimpse(data.training)
## Rows: 100
## Columns: 16
## $ Country <chr> "AD", "AE", "AE-AZ", "AE-RK", "AM", "AO", "AR", "AT", "AU…
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, NA, 14.0000, NA, 23.2527, 18.5…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 203…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.05…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.7…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.4…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 6…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, NA, 6.6000, 10.3000, 10.6000, 2.0…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.1238…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.5721…
## $ X14 <dbl> 3.0000, 2.4500, NA, NA, 18.5000, 10.5000, 11.0500, 6.0000…
## $ `Risk Level` <chr> "low", "low", "low", "low", "high", "high", "high", "low"…
# Preview the column types and leading values of the testing sheet.
glimpse(data.testing)
## Rows: 17
## Columns: 15
## $ Country <chr> "SE", "SG", "SI", "SK", "SM", "SV", "TH", "TN", "TR", "TW", "U…
## $ X1 <dbl> 23.2000, 16.8056, 18.2857, 19.6715, 11.9000, NA, 19.8000, 12.9…
## $ X2 <dbl> 60338.0204, 62432.9952, 28684.1682, 21042.7221, 49356.2618, 39…
## $ X3 <dbl> 175.42230, 409.69700, 103.06040, 102.73060, 60.15464, 65.55750…
## $ X4 <dbl> 1.62000, 0.10510, 0.84352, 1.17400, 0.89594, 0.39400, 0.34500,…
## $ X5 <dbl> 0.6755, 0.9068, 0.0746, 0.0734, 0.5865, 0.5042, 0.3153, 1.1173…
## $ X6 <dbl> 2.47168, 2.77600, 3.55290, 3.21976, 1.75420, 2.44734, 3.44058,…
## $ X7 <dbl> 0.3526, 0.2912, 1.9299, 1.2325, -1.1342, -0.1248, 1.2787, -1.5…
## $ X8 <dbl> 185.64097, 94.00211, 72.30708, 111.78982, 88.60514, 88.88685, …
## $ X9 <dbl> 64.14972, -200.98100, 16.23838, 33.35258, -145.43800, 27.33332…
## $ X10 <dbl> 537.609866, 339.988210, 52.761781, 102.567122, 1.490827, 24.63…
## $ X11 <dbl> 0.5000, 1.3095, 3.0176, 2.5300, 63.5000, 1.5706, 3.2000, 13.60…
## $ X12 <dbl> 25.11320, 26.76784, 19.90742, 22.83084, 17.79208, 16.78238, 23…
## $ X13 <dbl> 27.95256, 47.25374, 25.76882, 20.95780, 23.21144, 14.52982, 32…
## $ X14 <dbl> 8.6000, 3.0000, 5.0000, 7.0000, 7.3000, 9.0000, 2.0000, 17.000…
Pada hasil terlihat Country dan Risk Level merupakan tipe karakter (string), peubah Country nantinya akan dihapus dan Risk Level akan menjadi faktor.
# Show the structure (class, dimensions, column types) of the training data.
str(data.training)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:100] "AD" "AE" "AE-AZ" "AE-RK" ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk Level: chr [1:100] "low" "low" "low" "low" ...
# Show the structure (class, dimensions, column types) of the testing data.
str(data.testing)
## tibble [17 × 15] (S3: tbl_df/tbl/data.frame)
## $ Country: chr [1:17] "SE" "SG" "SI" "SK" ...
## $ X1 : num [1:17] 23.2 16.8 18.3 19.7 11.9 ...
## $ X2 : num [1:17] 60338 62433 28684 21043 49356 ...
## $ X3 : num [1:17] 175.4 409.7 103.1 102.7 60.2 ...
## $ X4 : num [1:17] 1.62 0.105 0.844 1.174 0.896 ...
## $ X5 : num [1:17] 0.6755 0.9068 0.0746 0.0734 0.5865 ...
## $ X6 : num [1:17] 2.47 2.78 3.55 3.22 1.75 ...
## $ X7 : num [1:17] 0.353 0.291 1.93 1.232 -1.134 ...
## $ X8 : num [1:17] 185.6 94 72.3 111.8 88.6 ...
## $ X9 : num [1:17] 64.1 -201 16.2 33.4 -145.4 ...
## $ X10 : num [1:17] 537.61 339.99 52.76 102.57 1.49 ...
## $ X11 : num [1:17] 0.5 1.31 3.02 2.53 63.5 ...
## $ X12 : num [1:17] 25.1 26.8 19.9 22.8 17.8 ...
## $ X13 : num [1:17] 28 47.3 25.8 21 23.2 ...
## $ X14 : num [1:17] 8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...
Ringkasan dari data.training dan data.testing terlihat ada beberapa peubah yang memiliki missing value
# Per-column summary of the training data; NA counts are listed per column.
summary(data.training)
## Country X1 X2 X3
## Length:100 Min. : 4.20 Min. : 434.5 Min. : 13.63
## Class :character 1st Qu.:15.93 1st Qu.: 4265.9 1st Qu.: 42.96
## Mode :character Median :18.58 Median : 11659.1 Median : 70.42
## Mean :18.97 Mean : 22641.6 Mean : 191.94
## 3rd Qu.:21.80 3rd Qu.: 34815.2 3rd Qu.: 130.63
## Max. :47.50 Max. :124340.4 Max. :6908.35
## NA's :12
## X4 X5 X6 X7
## Min. :-0.151 Min. :-0.8862 Min. :-5.135 Min. :-9.84530
## 1st Qu.: 0.869 1st Qu.: 0.4419 1st Qu.: 1.765 1st Qu.:-1.18720
## Median : 1.700 Median : 1.1402 Median : 2.984 Median : 0.07155
## Mean : 3.263 Mean : 1.2019 Mean : 3.076 Mean : 0.10804
## 3rd Qu.: 3.939 3rd Qu.: 1.9502 3rd Qu.: 4.305 3rd Qu.: 1.94108
## Max. :36.703 Max. : 4.4021 Max. :10.076 Max. : 6.07120
##
## X8 X9 X10 X11
## Min. : 34.82 Min. :-1955.72 Min. : 1.171 Min. : 0.3357
## 1st Qu.: 76.95 1st Qu.: -14.11 1st Qu.: 32.813 1st Qu.: 1.9250
## Median : 90.19 Median : 12.67 Median : 106.872 Median : 3.9000
## Mean : 99.94 Mean : -13.58 Mean : 582.318 Mean : 5.5346
## 3rd Qu.:113.39 3rd Qu.: 36.67 3rd Qu.: 366.370 3rd Qu.: 7.9500
## Max. :359.14 Max. : 456.49 Max. :14866.703 Max. :26.9780
## NA's :7 NA's :17
## X12 X13 X14 Risk Level
## Min. :12.67 Min. :10.95 Min. : 0.120 Length:100
## 1st Qu.:20.79 1st Qu.:19.06 1st Qu.: 4.818 Class :character
## Median :23.40 Median :24.28 Median : 6.800 Mode :character
## Mean :24.96 Mean :24.48 Mean : 8.441
## 3rd Qu.:28.38 3rd Qu.:29.36 3rd Qu.:10.500
## Max. :46.83 Max. :55.09 Max. :24.650
## NA's :11
# Per-column summary of the testing data; NA counts are listed per column.
summary(data.testing)
## Country X1 X2 X3
## Length:17 Min. :11.90 Min. : 786.9 Min. : 30.05
## Class :character 1st Qu.:15.76 1st Qu.: 3955.1 1st Qu.: 48.51
## Mode :character Median :17.52 Median : 8653.0 Median : 65.56
## Mean :17.42 Mean :22330.4 Mean : 92.59
## 3rd Qu.:19.70 3rd Qu.:31854.3 3rd Qu.:103.06
## Max. :23.20 Max. :69324.7 Max. :409.70
## NA's :1
## X4 X5 X6 X7
## Min. : 0.1051 Min. :-0.3906 Min. :0.340 Min. :-2.3230
## 1st Qu.: 0.8435 1st Qu.: 0.3153 1st Qu.:1.754 1st Qu.:-0.1248
## Median : 1.6200 Median : 0.6255 Median :2.539 Median : 0.4867
## Mean : 4.4949 Mean : 0.8249 Mean :2.994 Mean : 0.8826
## 3rd Qu.: 5.5560 3rd Qu.: 1.1173 3rd Qu.:3.553 3rd Qu.: 1.8906
## Max. :19.1730 Max. : 3.6551 Max. :6.946 Max. : 5.2762
##
## X8 X9 X10 X11
## Min. : 49.06 Min. :-200.98 Min. : 1.491 Min. : 0.500
## 1st Qu.: 72.28 1st Qu.: -42.56 1st Qu.: 52.762 1st Qu.: 1.571
## Median : 88.89 Median : 15.04 Median : 155.582 Median : 2.530
## Mean : 96.54 Mean : -18.76 Mean : 1463.386 Mean :11.258
## 3rd Qu.:109.52 3rd Qu.: 28.57 3rd Qu.: 501.644 3rd Qu.: 3.336
## Max. :185.64 Max. : 64.46 Max. :20935.000 Max. :63.500
## NA's :2 NA's :4
## X12 X13 X14
## Min. :16.45 Min. : 8.882 Min. : 2.000
## 1st Qu.:17.79 1st Qu.:17.208 1st Qu.: 4.675
## Median :22.03 Median :23.211 Median : 7.150
## Mean :21.91 Mean :23.693 Mean : 8.963
## 3rd Qu.:24.86 3rd Qu.:27.953 3rd Qu.: 9.700
## Max. :31.60 Max. :47.254 Max. :33.700
## NA's :1
Fungsi make.names() mengubah nama kolom sehingga menjadi valid dalam R. Ini berguna jika nama kolom mengandung spasi, karakter khusus, atau dimulai dengan angka. make.names() akan mengganti karakter yang tidak valid dengan titik (.), contohnya “Risk Level” menjadi “Risk.Level”.
# Rewrite the training column names as syntactically valid R names
# (e.g. "Risk Level" becomes "Risk.Level").
names(data.training) <- make.names(names(data.training))
Mengubah variabel dalam kolom Risk Level menjadi faktor
# Turn the character Risk.Level column into a factor so it can serve as the
# classification target.
data.training[["Risk.Level"]] <- factor(data.training[["Risk.Level"]])
# Confirm the new column type.
str(data.training)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:100] "AD" "AE" "AE-AZ" "AE-RK" ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk.Level: Factor w/ 2 levels "high","low": 2 2 2 2 1 1 1 2 2 1 ...
Menghapus Country dari data.training dan data.testing karena tidak terpakai pada analisis.
# Drop the Country identifier from both data sets; it is not used in the
# analysis.
# A logical mask is used instead of `-which(...)`: if no column were named
# "Country", `-which(...)` would produce an empty index and silently select
# ZERO columns, whereas the mask then simply keeps every column.
data.training <- data.training[, names(data.training) != "Country"]
data.testing <- data.testing[, names(data.testing) != "Country"]
# Count the missing values in each training column.
colSums(is.na(data.training))
## X1 X2 X3 X4 X5 X6 X7
## 12 0 0 0 0 0 0
## X8 X9 X10 X11 X12 X13 X14
## 7 0 0 17 0 0 11
## Risk.Level
## 0
# Count the missing values in each testing column.
colSums(is.na(data.testing))
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
## 1 0 0 0 0 0 0 2 0 0 4 0 0 1
# Visualise the missing-value pattern of the training data before imputation.
aggr(data.training)
# Fill the missing values with MICE: 20 imputed data sets, method "pmm",
# with a fixed seed.
data.training.clean <- mice(
  data.training,
  m = 20,
  method = "pmm",
  seed = 500
)
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 1 6 X1 X8 X11 X14
## 1 7 X1 X8 X11 X14
## 1 8 X1 X8 X11 X14
## 1 9 X1 X8 X11 X14
## 1 10 X1 X8 X11 X14
## 1 11 X1 X8 X11 X14
## 1 12 X1 X8 X11 X14
## 1 13 X1 X8 X11 X14
## 1 14 X1 X8 X11 X14
## 1 15 X1 X8 X11 X14
## 1 16 X1 X8 X11 X14
## 1 17 X1 X8 X11 X14
## 1 18 X1 X8 X11 X14
## 1 19 X1 X8 X11 X14
## 1 20 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 2 6 X1 X8 X11 X14
## 2 7 X1 X8 X11 X14
## 2 8 X1 X8 X11 X14
## 2 9 X1 X8 X11 X14
## 2 10 X1 X8 X11 X14
## 2 11 X1 X8 X11 X14
## 2 12 X1 X8 X11 X14
## 2 13 X1 X8 X11 X14
## 2 14 X1 X8 X11 X14
## 2 15 X1 X8 X11 X14
## 2 16 X1 X8 X11 X14
## 2 17 X1 X8 X11 X14
## 2 18 X1 X8 X11 X14
## 2 19 X1 X8 X11 X14
## 2 20 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 3 6 X1 X8 X11 X14
## 3 7 X1 X8 X11 X14
## 3 8 X1 X8 X11 X14
## 3 9 X1 X8 X11 X14
## 3 10 X1 X8 X11 X14
## 3 11 X1 X8 X11 X14
## 3 12 X1 X8 X11 X14
## 3 13 X1 X8 X11 X14
## 3 14 X1 X8 X11 X14
## 3 15 X1 X8 X11 X14
## 3 16 X1 X8 X11 X14
## 3 17 X1 X8 X11 X14
## 3 18 X1 X8 X11 X14
## 3 19 X1 X8 X11 X14
## 3 20 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 4 6 X1 X8 X11 X14
## 4 7 X1 X8 X11 X14
## 4 8 X1 X8 X11 X14
## 4 9 X1 X8 X11 X14
## 4 10 X1 X8 X11 X14
## 4 11 X1 X8 X11 X14
## 4 12 X1 X8 X11 X14
## 4 13 X1 X8 X11 X14
## 4 14 X1 X8 X11 X14
## 4 15 X1 X8 X11 X14
## 4 16 X1 X8 X11 X14
## 4 17 X1 X8 X11 X14
## 4 18 X1 X8 X11 X14
## 4 19 X1 X8 X11 X14
## 4 20 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## 5 6 X1 X8 X11 X14
## 5 7 X1 X8 X11 X14
## 5 8 X1 X8 X11 X14
## 5 9 X1 X8 X11 X14
## 5 10 X1 X8 X11 X14
## 5 11 X1 X8 X11 X14
## 5 12 X1 X8 X11 X14
## 5 13 X1 X8 X11 X14
## 5 14 X1 X8 X11 X14
## 5 15 X1 X8 X11 X14
## 5 16 X1 X8 X11 X14
## 5 17 X1 X8 X11 X14
## 5 18 X1 X8 X11 X14
## 5 19 X1 X8 X11 X14
## 5 20 X1 X8 X11 X14
# Extract imputed data set number 18 (of the 20 generated) as the training
# data used from here on.
# NOTE(review): why imputation 18 was chosen is not shown here — confirm.
data.training.fix <- mice::complete(data.training.clean, action = 18)
# Re-draw the missingness plot on the completed data.
aggr(data.training.fix)
# Plot Density untuk Setiap Variabel yang Memiliki Nilai NA
Keterangan: Hitam = sebelum imputasi, Biru = sesudah imputasi
# Compare the density of every training column that had missing values
# before imputation (default line, NAs removed) and after imputation
# (blue line).
# `TRUE` is used instead of `T`, which is an ordinary variable that can be
# reassigned; the four identical plot/lines pairs are generated by one loop.
for (na_column in c("X1", "X8", "X11", "X14")) {
  plot(
    density(data.training[[na_column]], na.rm = TRUE),
    main = paste("Data", na_column, "With NA")
  )
  lines(
    density(data.training.fix[[na_column]], na.rm = TRUE),
    col = "blue",
    lty = 3
  )
}
# Visualise the missing-value pattern of the testing data before imputation.
aggr(data.testing)
# Fill the missing values of the testing data with MICE, using the same
# settings as for the training data (20 imputations, "pmm", seed 500).
data.testing.clean <- mice(
  data.testing,
  m = 20,
  method = "pmm",
  seed = 500
)
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 1 6 X1 X8 X11 X14
## 1 7 X1 X8 X11 X14
## 1 8 X1 X8 X11 X14
## 1 9 X1 X8 X11 X14
## 1 10 X1 X8 X11 X14
## 1 11 X1 X8 X11 X14
## 1 12 X1 X8 X11 X14
## 1 13 X1 X8 X11 X14
## 1 14 X1 X8 X11 X14
## 1 15 X1 X8 X11 X14
## 1 16 X1 X8 X11 X14
## 1 17 X1 X8 X11 X14
## 1 18 X1 X8 X11 X14
## 1 19 X1 X8 X11 X14
## 1 20 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 2 6 X1 X8 X11 X14
## 2 7 X1 X8 X11 X14
## 2 8 X1 X8 X11 X14
## 2 9 X1 X8 X11 X14
## 2 10 X1 X8 X11 X14
## 2 11 X1 X8 X11 X14
## 2 12 X1 X8 X11 X14
## 2 13 X1 X8 X11 X14
## 2 14 X1 X8 X11 X14
## 2 15 X1 X8 X11 X14
## 2 16 X1 X8 X11 X14
## 2 17 X1 X8 X11 X14
## 2 18 X1 X8 X11 X14
## 2 19 X1 X8 X11 X14
## 2 20 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 3 6 X1 X8 X11 X14
## 3 7 X1 X8 X11 X14
## 3 8 X1 X8 X11 X14
## 3 9 X1 X8 X11 X14
## 3 10 X1 X8 X11 X14
## 3 11 X1 X8 X11 X14
## 3 12 X1 X8 X11 X14
## 3 13 X1 X8 X11 X14
## 3 14 X1 X8 X11 X14
## 3 15 X1 X8 X11 X14
## 3 16 X1 X8 X11 X14
## 3 17 X1 X8 X11 X14
## 3 18 X1 X8 X11 X14
## 3 19 X1 X8 X11 X14
## 3 20 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 4 6 X1 X8 X11 X14
## 4 7 X1 X8 X11 X14
## 4 8 X1 X8 X11 X14
## 4 9 X1 X8 X11 X14
## 4 10 X1 X8 X11 X14
## 4 11 X1 X8 X11 X14
## 4 12 X1 X8 X11 X14
## 4 13 X1 X8 X11 X14
## 4 14 X1 X8 X11 X14
## 4 15 X1 X8 X11 X14
## 4 16 X1 X8 X11 X14
## 4 17 X1 X8 X11 X14
## 4 18 X1 X8 X11 X14
## 4 19 X1 X8 X11 X14
## 4 20 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## 5 6 X1 X8 X11 X14
## 5 7 X1 X8 X11 X14
## 5 8 X1 X8 X11 X14
## 5 9 X1 X8 X11 X14
## 5 10 X1 X8 X11 X14
## 5 11 X1 X8 X11 X14
## 5 12 X1 X8 X11 X14
## 5 13 X1 X8 X11 X14
## 5 14 X1 X8 X11 X14
## 5 15 X1 X8 X11 X14
## 5 16 X1 X8 X11 X14
## 5 17 X1 X8 X11 X14
## 5 18 X1 X8 X11 X14
## 5 19 X1 X8 X11 X14
## 5 20 X1 X8 X11 X14
## Warning: Number of logged events: 124
# Extract imputed data set number 15 (of the 20 generated) as the testing
# data used from here on.
# NOTE(review): why imputation 15 was chosen is not shown here — confirm.
data.testing.fix <- mice::complete(data.testing.clean, action = 15)
# Re-draw the missingness plot on the completed data.
aggr(data.testing.fix)
# Compare the density of every testing column that had missing values
# before imputation (default line, NAs removed) and after imputation
# (blue line).
# `TRUE` is used instead of `T`, which is an ordinary variable that can be
# reassigned; the four identical plot/lines pairs are generated by one loop.
for (na_column in c("X1", "X8", "X11", "X14")) {
  plot(
    density(data.testing[[na_column]], na.rm = TRUE),
    main = paste("Data", na_column, "With NA")
  )
  lines(
    density(data.testing.fix[[na_column]], na.rm = TRUE),
    col = "blue",
    lty = 3
  )
}
# Preview the completed training data.
glimpse(data.training.fix)
## Rows: 100
## Columns: 15
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, 24.8200, 14.0000, 26.0000, 23.25…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 2033.…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.0556…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36.7…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0.7…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0.2…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.729…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.81…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.449…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 62.…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, 0.9250, 6.6000, 10.3000, 10.6000, 2…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.12380,…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.57210,…
## $ X14 <dbl> 3.0000, 2.4500, 4.9000, 15.0000, 18.5000, 10.5000, 11.0500,…
## $ Risk.Level <fct> low, low, low, low, high, high, high, low, low, high, high,…
# Preview the completed testing data.
glimpse(data.testing.fix)
## Rows: 17
## Columns: 14
## $ X1 <dbl> 23.2000, 16.8056, 18.2857, 19.6715, 11.9000, 14.1400, 19.8000, 12.…
## $ X2 <dbl> 60338.0204, 62432.9952, 28684.1682, 21042.7221, 49356.2618, 3989.1…
## $ X3 <dbl> 175.42230, 409.69700, 103.06040, 102.73060, 60.15464, 65.55750, 33…
## $ X4 <dbl> 1.62000, 0.10510, 0.84352, 1.17400, 0.89594, 0.39400, 0.34500, 5.5…
## $ X5 <dbl> 0.6755, 0.9068, 0.0746, 0.0734, 0.5865, 0.5042, 0.3153, 1.1173, 1.…
## $ X6 <dbl> 2.47168, 2.77600, 3.55290, 3.21976, 1.75420, 2.44734, 3.44058, 1.6…
## $ X7 <dbl> 0.3526, 0.2912, 1.9299, 1.2325, -1.1342, -0.1248, 1.2787, -1.5047,…
## $ X8 <dbl> 185.64097, 94.00211, 72.30708, 111.78982, 88.60514, 88.88685, 100.…
## $ X9 <dbl> 64.14972, -200.98100, 16.23838, 33.35258, -145.43800, 27.33332, -4…
## $ X10 <dbl> 537.609866, 339.988210, 52.761781, 102.567122, 1.490827, 24.638720…
## $ X11 <dbl> 0.5000, 1.3095, 3.0176, 2.5300, 63.5000, 1.5706, 3.2000, 13.6000, …
## $ X12 <dbl> 25.11320, 26.76784, 19.90742, 22.83084, 17.79208, 16.78238, 23.059…
## $ X13 <dbl> 27.95256, 47.25374, 25.76882, 20.95780, 23.21144, 14.52982, 32.479…
## $ X14 <dbl> 8.6000, 3.0000, 5.0000, 7.0000, 7.3000, 9.0000, 2.0000, 17.0000, 1…
Bar chart untuk Risk.Level Data.Training.Fix
# Tally the number of observations per risk class in the imputed training data.
risk_counts <- summarise(group_by(data.training.fix, Risk.Level), count = n())
# Bar chart of the class frequencies, with each count printed above its bar.
ggplot(
  data = risk_counts,
  mapping = aes(x = Risk.Level, y = count, fill = Risk.Level)
) +
  geom_col() +
  geom_text(mapping = aes(label = count), vjust = -0.5, size = 5) +
  scale_fill_manual(values = c("high" = "blue", "low" = "pink")) +
  labs(x = "Risk Level", y = "Frekuensi", title = "Frekuensi Risk Level") +
  theme_minimal()
# Random Forest
# Splitting Data
Data Data.Training.Fix dibagi menjadi dua bagian, yaitu 80% sebagai data training dan 20% sebagai data testing.
# Fix the RNG seed before the random partition below.
set.seed(123)
# Row indices for the training portion: 80% of the rows (p = 0.8),
# partitioned on Risk.Level and returned as a matrix (list = FALSE).
train_index <- createDataPartition(data.training.fix$Risk.Level, p = 0.8, list = FALSE)
# Rows picked by the partition are used to fit the model ...
train_data <- data.training.fix[train_index, ]
# ... and the remaining rows are held out to evaluate it.
test_data <- data.training.fix[-train_index, ]
Mengubah komponen dalam kolom Risk Level menjadi faktor
# Make sure the target column is a factor in both subsets so that
# randomForest fits a classification forest.
train_data$Risk.Level <- as.factor(train_data$Risk.Level)
test_data$Risk.Level <- as.factor(test_data$Risk.Level)
# Fit the Random Forest on the training subset, using every other column as
# a predictor; importance = TRUE requests the variable-importance measures.
rf_model <- randomForest(Risk.Level ~ .,
data = train_data,
ntree = 500, # Number of trees
mtry = sqrt(ncol(train_data)-1), # Number of variables tried at each split (square root of the predictor count; the printout below reports 4)
importance = TRUE)
# Print the model summary (OOB error estimate and confusion matrix).
print(rf_model)
##
## Call:
## randomForest(formula = Risk.Level ~ ., data = train_data, ntree = 500, mtry = sqrt(ncol(train_data) - 1), importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 14.81%
## Confusion matrix:
## high low class.error
## high 38 6 0.1363636
## low 6 31 0.1621622
# Predict the risk class of the held-out rows with the fitted forest.
predictions <- predict(object = rf_model, newdata = test_data)
# Cross-tabulate the predictions against the observed classes and print the
# resulting confusion matrix with its statistics.
conf_matrix <- confusionMatrix(
  data = predictions,
  reference = test_data$Risk.Level
)
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low
## high 9 1
## low 1 8
##
## Accuracy : 0.8947
## 95% CI : (0.6686, 0.987)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : 0.0007916
##
## Kappa : 0.7889
##
## Mcnemar's Test P-Value : 1.0000000
##
## Sensitivity : 0.9000
## Specificity : 0.8889
## Pos Pred Value : 0.9000
## Neg Pred Value : 0.8889
## Prevalence : 0.5263
## Detection Rate : 0.4737
## Detection Prevalence : 0.5263
## Balanced Accuracy : 0.8944
##
## 'Positive' Class : high
##
# Tabulate the variable-importance measures of the fitted forest
# (per-class, MeanDecreaseAccuracy and MeanDecreaseGini columns).
importance(rf_model)
## high low MeanDecreaseAccuracy MeanDecreaseGini
## X1 -3.1577044 1.4385362 -1.5236552 0.9187648
## X2 22.2755693 17.7478230 24.3660846 12.0284059
## X3 3.4007146 1.8698506 4.0324868 1.6870668
## X4 3.1062603 3.2238830 4.3072987 1.5915884
## X5 -4.0039111 2.0538208 -1.5472891 1.4400934
## X6 -0.8260667 0.3231983 -0.5488609 0.7106845
## X7 -0.7424006 0.0827424 -0.3085407 0.8719884
## X8 2.3351025 2.1549803 2.9467869 1.3249548
## X9 12.1582934 9.3246875 13.2428353 4.1890692
## X10 7.5510624 6.9625018 9.3146697 3.9599503
## X11 13.1863222 9.4022858 14.5033486 6.3278656
## X12 1.0667609 2.2552777 2.2849631 1.0231860
## X13 3.3301304 6.2315341 6.5339426 1.8624712
## X14 1.7710543 6.1022875 5.3193248 1.8381330
# Plot the variable-importance measures of the fitted forest.
varImpPlot(rf_model)
# Accuracy
# Share of held-out rows whose predicted class equals the observed class.
n_correct <- sum(predictions == test_data$Risk.Level)
accuracy <- n_correct / length(predictions)
accuracy_label <- paste("Accuracy:", round(accuracy * 100, 2), "%")
print(accuracy_label)
## [1] "Accuracy: 89.47 %"
# List the predicted class of every held-out row (row names are the original
# row numbers of the imputed training data).
print(data.frame(predictions))
## predictions
## 1 low
## 5 high
## 6 high
## 12 high
## 22 low
## 30 low
## 32 low
## 42 high
## 50 low
## 51 high
## 58 high
## 62 low
## 66 high
## 68 low
## 71 high
## 73 high
## 75 low
## 93 high
## 94 low
# Predict the risk class for every row of the imputed testing sheet.
prediction.results <- predict(object = rf_model, newdata = data.testing.fix)
# Show the predicted class of each testing row.
print(prediction.results)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## low low low low low high low high high low high high low high high low
## 17
## high
## Levels: high low
Hasil prediksi dimasukkan ke dalam data frame Data.Testing.Fix sebagai kolom baru
# Store the predicted classes as a new column of the imputed testing data.
data.testing.fix$Prediksi.Risk.Level <- prediction.results
Menghitung Akurasi dengan hasil prediksi
# Accuracy requires the observed Risk Level of each testing row. Comparing the
# stored prediction column with `prediction.results` (the very vector it was
# copied from) always yields 100% and measures nothing, so the observed labels
# are looked up in the untouched copy of the testing sheet instead.
actual.risk.level <- test.data.ori[["Risk Level"]]
if (is.null(actual.risk.level)) {
  # The testing sheet carries no labels: record NA instead of a made-up score.
  accuracy_rs.excel <- NA_real_
  message("Data testing tidak memiliki kolom Risk Level; akurasi tidak dapat dihitung.")
} else {
  # Share of testing rows whose predicted class equals the observed class.
  accuracy_rs.excel <- mean(
    as.character(prediction.results) == as.character(actual.risk.level)
  )
  print(paste("Random Forest Akurasi untuk Data Excel:", round(accuracy_rs.excel * 100, 2),"%"))
}
Menambahkan variabel Country ke Tabel
# Put the Country codes from the untouched testing sheet back as the first
# column of the prediction table.
data.testing.fix <- cbind(Country = test.data.ori$Country, data.testing.fix)
# Wrap each predicted class in a bold, colour-coded cell: red background for
# "high", green for everything else.
data.testing.fix <- mutate(
  data.testing.fix,
  Prediksi.Risk.Level = cell_spec(
    Prediksi.Risk.Level,
    background = ifelse(Prediksi.Risk.Level == "high", "red", "green"),
    bold = TRUE
  )
)
# Render the table; escape = FALSE keeps the cell_spec markup intact.
kable_styling(
  row_spec(
    kable_material_dark(kbl(data.testing.fix, escape = FALSE)),
    0,
    bold = TRUE,
    color = "white"
  ),
  font_size = 9
)
| Country | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | X12 | X13 | X14 | Prediksi.Risk.Level |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SE | 23.2000 | 60338.0204 | 175.42230 | 1.62000 | 0.6755 | 2.47168 | 0.3526 | 185.64097 | 64.14972 | 537.609866 | 0.5000 | 25.11320 | 27.95256 | 8.6000 | low |
| SG | 16.8056 | 62432.9952 | 409.69700 | 0.10510 | 0.9068 | 2.77600 | 0.2912 | 94.00211 | -200.98100 | 339.988210 | 1.3095 | 26.76784 | 47.25374 | 3.0000 | low |
| SI | 18.2857 | 28684.1682 | 103.06040 | 0.84352 | 0.0746 | 3.55290 | 1.9299 | 72.30708 | 16.23838 | 52.761781 | 3.0176 | 19.90742 | 25.76882 | 5.0000 | low |
| SK | 19.6715 | 21042.7221 | 102.73060 | 1.17400 | 0.0734 | 3.21976 | 1.2325 | 111.78982 | 33.35258 | 102.567122 | 2.5300 | 22.83084 | 20.95780 | 7.0000 | low |
| SM | 11.9000 | 49356.2618 | 60.15464 | 0.89594 | 0.5865 | 1.75420 | -1.1342 | 88.60514 | -145.43800 | 1.490827 | 63.5000 | 17.79208 | 23.21144 | 7.3000 | low |
| SV | 14.1400 | 3989.1913 | 65.55750 | 0.39400 | 0.5042 | 2.44734 | -0.1248 | 88.88685 | 27.33332 | 24.638720 | 1.5706 | 16.78238 | 14.52982 | 9.0000 | high |
| TH | 19.8000 | 7450.5523 | 33.22256 | 0.34500 | 0.3153 | 3.44058 | 1.2787 | 100.19298 | -42.56340 | 501.644054 | 3.2000 | 23.05990 | 32.47950 | 2.0000 | low |
| TN | 12.9000 | 3616.8650 | 85.26668 | 5.55600 | 1.1173 | 1.60820 | -1.5047 | 134.47988 | 64.46288 | 39.218118 | 13.6000 | 18.80654 | 8.88180 | 17.0000 | high |
| TR | 18.0000 | 8652.9973 | 51.65878 | 11.65444 | 1.4844 | 4.15702 | 1.8070 | 116.52826 | 28.56998 | 720.244499 | 2.5300 | 28.55834 | 26.32778 | 13.2000 | high |
| TW | 14.1400 | 31854.2815 | 48.51016 | 0.72360 | 0.1015 | 2.53870 | 2.7686 | 71.08310 | -189.14000 | 668.122597 | 49.0000 | 22.02936 | 33.80118 | 3.7000 | low |
| UA | 22.0000 | 3955.0704 | 103.90710 | 19.17300 | -0.3906 | 0.34000 | 1.8906 | 72.25639 | -5.46582 | 155.581868 | 49.0000 | 17.79388 | 16.04966 | 9.5000 | high |
| UG | 21.5689 | 786.8776 | 42.26784 | 4.29470 | 3.6551 | 5.73874 | 0.4207 | 69.45587 | 21.88058 | 33.538172 | 3.3357 | 24.85996 | 19.47826 | 17.0000 | high |
| US | 16.3000 | 69324.7338 | 104.17110 | 1.55316 | 0.6255 | 2.45554 | 0.4867 | 134.47988 | 47.70210 | 20935.000000 | 1.0000 | 17.40650 | 17.20802 | 5.6146 | low |
| UY | 17.0400 | 15968.2306 | 73.01010 | 8.00348 | 0.3592 | 0.82090 | -0.7169 | 49.05568 | -16.23150 | 53.628838 | 2.5300 | 16.44666 | 17.57796 | 10.3000 | high |
| UZ | 18.4000 | 1872.6699 | 30.04996 | 12.29840 | 1.6065 | 5.84000 | 3.0735 | 72.25639 | -45.16010 | 57.707193 | 2.1000 | 31.60162 | 29.24286 | 6.0000 | high |
| VN | 12.0977 | 3886.5162 | 34.52492 | 2.79600 | 0.8506 | 6.94570 | 5.2762 | 86.56201 | 7.39622 | 351.683014 | 1.6900 | 23.54764 | 25.80812 | 2.5000 | low |
| ZA | 16.6000 | 6404.6725 | 50.79576 | 4.98278 | 1.4772 | 0.78940 | -2.3230 | 107.24859 | 15.04496 | 302.141270 | 3.3357 | 19.11740 | 16.25316 | 33.7000 | high |