library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(mice)
##
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
##
## filter
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(e1071)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ggplot2)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
Ada dua jenis data, yaitu Training dan Testing. Data Training akan digunakan untuk membangun model dengan SVM, sedangkan data Testing akan digunakan untuk memprediksi Tingkat Risiko
Data mempunyai 16 peubah:
Country
X1: capital adequacy ratio (%) average from last 5 years
X2: GDP per capita (USD)
X3: Gross External Debt (% of GDP) average from last 5 years
X4: growth of consumer price (%) average from last 5 years
X5: growth of population (%) average from last 5 years
X6: growth of Real GDP (%) average from last 5 years
X7: growth of Real GDP per cap. (%) average from last 5 years
X8: Loan-deposit ratio (%) average from last 5 years
X9: Net External Debt (% of GDP) average from last 5 years
X10: Nominal GDP (USD bn)
X11: Non-performing loans (% of gross loans) average from last 5 years
X12: percentage of gross domestic investment to GDP (%) average from last 5 years
X13: percentage of gross domestic saving to GDP (%) average from last 5 years
X14: unemployment rate (% labour force) average from last 5 years
Risk Level
dengan peubah respon yaitu Risk Level yang terdiri dari dua kategori (low, high).
Data pada sheet Training diimpor ke dalam variabel data.train, sedangkan data pada sheet Testing diimpor ke dalam variabel data.test
# Import the two sheets of the workbook: "Training" is used to build the SVM
# model, "Testing" holds the rows whose risk level will be predicted.
data.train <- read_excel(
  path = "Level Risiko Investasi.xlsx",
  sheet = "Training"
)
data.test <- read_excel(
  path = "Level Risiko Investasi.xlsx",
  sheet = "Testing"
)
# Keep an untouched copy of the testing sheet; its Country column is attached
# to the prediction table at the end of the script.
test.data.ori <- data.test
data.train %>% glimpse()
## Rows: 100
## Columns: 16
## $ Country <chr> "AD", "AE", "AE-AZ", "AE-RK", "AM", "AO", "AR", "AT", "AU…
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, NA, 14.0000, NA, 23.2527, 18.5…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 203…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.05…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.7…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.4…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 6…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, NA, 6.6000, 10.3000, 10.6000, 2.0…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.1238…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.5721…
## $ X14 <dbl> 3.0000, 2.4500, NA, NA, 18.5000, 10.5000, 11.0500, 6.0000…
## $ `Risk Level` <chr> "low", "low", "low", "low", "high", "high", "high", "low"…
# Column-wise preview of the testing sheet.
data.test %>% glimpse()
## Rows: 17
## Columns: 15
## $ Country <chr> "SE", "SG", "SI", "SK", "SM", "SV", "TH", "TN", "TR", "TW", "U…
## $ X1 <dbl> 23.2000, 16.8056, 18.2857, 19.6715, 11.9000, NA, 19.8000, 12.9…
## $ X2 <dbl> 60338.0204, 62432.9952, 28684.1682, 21042.7221, 49356.2618, 39…
## $ X3 <dbl> 175.42230, 409.69700, 103.06040, 102.73060, 60.15464, 65.55750…
## $ X4 <dbl> 1.62000, 0.10510, 0.84352, 1.17400, 0.89594, 0.39400, 0.34500,…
## $ X5 <dbl> 0.6755, 0.9068, 0.0746, 0.0734, 0.5865, 0.5042, 0.3153, 1.1173…
## $ X6 <dbl> 2.47168, 2.77600, 3.55290, 3.21976, 1.75420, 2.44734, 3.44058,…
## $ X7 <dbl> 0.3526, 0.2912, 1.9299, 1.2325, -1.1342, -0.1248, 1.2787, -1.5…
## $ X8 <dbl> 185.64097, 94.00211, 72.30708, 111.78982, 88.60514, 88.88685, …
## $ X9 <dbl> 64.14972, -200.98100, 16.23838, 33.35258, -145.43800, 27.33332…
## $ X10 <dbl> 537.609866, 339.988210, 52.761781, 102.567122, 1.490827, 24.63…
## $ X11 <dbl> 0.5000, 1.3095, 3.0176, 2.5300, 63.5000, 1.5706, 3.2000, 13.60…
## $ X12 <dbl> 25.11320, 26.76784, 19.90742, 22.83084, 17.79208, 16.78238, 23…
## $ X13 <dbl> 27.95256, 47.25374, 25.76882, 20.95780, 23.21144, 14.52982, 32…
## $ X14 <dbl> 8.6000, 3.0000, 5.0000, 7.0000, 7.3000, 9.0000, 2.0000, 17.000…
# Inspect the structure (column types and first values) of the training data.
str(object = data.train)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:100] "AD" "AE" "AE-AZ" "AE-RK" ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk Level: chr [1:100] "low" "low" "low" "low" ...
# Inspect the structure (column types and first values) of the testing data.
str(object = data.test)
## tibble [17 × 15] (S3: tbl_df/tbl/data.frame)
## $ Country: chr [1:17] "SE" "SG" "SI" "SK" ...
## $ X1 : num [1:17] 23.2 16.8 18.3 19.7 11.9 ...
## $ X2 : num [1:17] 60338 62433 28684 21043 49356 ...
## $ X3 : num [1:17] 175.4 409.7 103.1 102.7 60.2 ...
## $ X4 : num [1:17] 1.62 0.105 0.844 1.174 0.896 ...
## $ X5 : num [1:17] 0.6755 0.9068 0.0746 0.0734 0.5865 ...
## $ X6 : num [1:17] 2.47 2.78 3.55 3.22 1.75 ...
## $ X7 : num [1:17] 0.353 0.291 1.93 1.232 -1.134 ...
## $ X8 : num [1:17] 185.6 94 72.3 111.8 88.6 ...
## $ X9 : num [1:17] 64.1 -201 16.2 33.4 -145.4 ...
## $ X10 : num [1:17] 537.61 339.99 52.76 102.57 1.49 ...
## $ X11 : num [1:17] 0.5 1.31 3.02 2.53 63.5 ...
## $ X12 : num [1:17] 25.1 26.8 19.9 22.8 17.8 ...
## $ X13 : num [1:17] 28 47.3 25.8 21 23.2 ...
## $ X14 : num [1:17] 8.6 3 5 7 7.3 9 2 17 13.2 3.7 ...
# Per-column summary statistics (including NA counts) of the training data.
summary(object = data.train)
## Country X1 X2 X3
## Length:100 Min. : 4.20 Min. : 434.5 Min. : 13.63
## Class :character 1st Qu.:15.93 1st Qu.: 4265.9 1st Qu.: 42.96
## Mode :character Median :18.58 Median : 11659.1 Median : 70.42
## Mean :18.97 Mean : 22641.6 Mean : 191.94
## 3rd Qu.:21.80 3rd Qu.: 34815.2 3rd Qu.: 130.63
## Max. :47.50 Max. :124340.4 Max. :6908.35
## NA's :12
## X4 X5 X6 X7
## Min. :-0.151 Min. :-0.8862 Min. :-5.135 Min. :-9.84530
## 1st Qu.: 0.869 1st Qu.: 0.4419 1st Qu.: 1.765 1st Qu.:-1.18720
## Median : 1.700 Median : 1.1402 Median : 2.984 Median : 0.07155
## Mean : 3.263 Mean : 1.2019 Mean : 3.076 Mean : 0.10804
## 3rd Qu.: 3.939 3rd Qu.: 1.9502 3rd Qu.: 4.305 3rd Qu.: 1.94108
## Max. :36.703 Max. : 4.4021 Max. :10.076 Max. : 6.07120
##
## X8 X9 X10 X11
## Min. : 34.82 Min. :-1955.72 Min. : 1.171 Min. : 0.3357
## 1st Qu.: 76.95 1st Qu.: -14.11 1st Qu.: 32.813 1st Qu.: 1.9250
## Median : 90.19 Median : 12.67 Median : 106.872 Median : 3.9000
## Mean : 99.94 Mean : -13.58 Mean : 582.318 Mean : 5.5346
## 3rd Qu.:113.39 3rd Qu.: 36.67 3rd Qu.: 366.370 3rd Qu.: 7.9500
## Max. :359.14 Max. : 456.49 Max. :14866.703 Max. :26.9780
## NA's :7 NA's :17
## X12 X13 X14 Risk Level
## Min. :12.67 Min. :10.95 Min. : 0.120 Length:100
## 1st Qu.:20.79 1st Qu.:19.06 1st Qu.: 4.818 Class :character
## Median :23.40 Median :24.28 Median : 6.800 Mode :character
## Mean :24.96 Mean :24.48 Mean : 8.441
## 3rd Qu.:28.38 3rd Qu.:29.36 3rd Qu.:10.500
## Max. :46.83 Max. :55.09 Max. :24.650
## NA's :11
# Per-column summary statistics (including NA counts) of the testing data.
summary(object = data.test)
## Country X1 X2 X3
## Length:17 Min. :11.90 Min. : 786.9 Min. : 30.05
## Class :character 1st Qu.:15.76 1st Qu.: 3955.1 1st Qu.: 48.51
## Mode :character Median :17.52 Median : 8653.0 Median : 65.56
## Mean :17.42 Mean :22330.4 Mean : 92.59
## 3rd Qu.:19.70 3rd Qu.:31854.3 3rd Qu.:103.06
## Max. :23.20 Max. :69324.7 Max. :409.70
## NA's :1
## X4 X5 X6 X7
## Min. : 0.1051 Min. :-0.3906 Min. :0.340 Min. :-2.3230
## 1st Qu.: 0.8435 1st Qu.: 0.3153 1st Qu.:1.754 1st Qu.:-0.1248
## Median : 1.6200 Median : 0.6255 Median :2.539 Median : 0.4867
## Mean : 4.4949 Mean : 0.8249 Mean :2.994 Mean : 0.8826
## 3rd Qu.: 5.5560 3rd Qu.: 1.1173 3rd Qu.:3.553 3rd Qu.: 1.8906
## Max. :19.1730 Max. : 3.6551 Max. :6.946 Max. : 5.2762
##
## X8 X9 X10 X11
## Min. : 49.06 Min. :-200.98 Min. : 1.491 Min. : 0.500
## 1st Qu.: 72.28 1st Qu.: -42.56 1st Qu.: 52.762 1st Qu.: 1.571
## Median : 88.89 Median : 15.04 Median : 155.582 Median : 2.530
## Mean : 96.54 Mean : -18.76 Mean : 1463.386 Mean :11.258
## 3rd Qu.:109.52 3rd Qu.: 28.57 3rd Qu.: 501.644 3rd Qu.: 3.336
## Max. :185.64 Max. : 64.46 Max. :20935.000 Max. :63.500
## NA's :2 NA's :4
## X12 X13 X14
## Min. :16.45 Min. : 8.882 Min. : 2.000
## 1st Qu.:17.79 1st Qu.:17.208 1st Qu.: 4.675
## Median :22.03 Median :23.211 Median : 7.150
## Mean :21.91 Mean :23.693 Mean : 8.963
## 3rd Qu.:24.86 3rd Qu.:27.953 3rd Qu.: 9.700
## Max. :31.60 Max. :47.254 Max. :33.700
## NA's :1
Nama kolom Risk Level diubah menjadi Risk.Level agar menjadi nama peubah yang valid di R.
# Make every column name syntactically valid ("Risk Level" -> "Risk.Level").
names(data.train) <- make.names(names(data.train))
Ubah Risk.Level menjadi faktor (ket: kode 1 = high, kode 2 = low).
# Encode the response as a factor; its levels are sorted alphabetically, so
# "high" is level 1 and "low" is level 2.
data.train[["Risk.Level"]] <- factor(data.train[["Risk.Level"]])
str(object = data.train)
## tibble [100 × 16] (S3: tbl_df/tbl/data.frame)
## $ Country : chr [1:100] "AD" "AE" "AE-AZ" "AE-RK" ...
## $ X1 : num [1:100] 17.5 18.2 18.7 NA 14 ...
## $ X2 : num [1:100] 38675 40105 76038 27883 4251 ...
## $ X3 : num [1:100] 172.8 103.5 31 24.8 89.6 ...
## $ X4 : num [1:100] 0.68 1.77 2.63 1.29 1.44 ...
## $ X5 : num [1:100] 1.221 0.87 1.489 1.753 0.256 ...
## $ X6 : num [1:100] 1.79 2.66 1.85 2.23 4.75 ...
## $ X7 : num [1:100] -2.084 -0.725 -1.901 -1.135 2.332 ...
## $ X8 : num [1:100] 55 103 103 103 167 ...
## $ X9 : num [1:100] -26.5 -13.6 -56.2 24.8 47.3 ...
## $ X10 : num [1:100] 2.86 352.91 199.93 10.11 12.65 ...
## $ X11 : num [1:100] 8 8.15 8.15 NA 6.6 ...
## $ X12 : num [1:100] 23.1 24.9 20.4 21.7 19.4 ...
## $ X13 : num [1:100] 26.9 32.5 31 17.3 15.1 ...
## $ X14 : num [1:100] 3 2.45 NA NA 18.5 ...
## $ Risk.Level: Factor w/ 2 levels "high","low": 2 2 2 2 1 1 1 2 2 1 ...
Country dihapus karena tidak terpakai dalam analisis
# Drop the Country identifier from both data sets; it is not used as a
# predictor. A logical mask is used instead of `-which(...)`: when the column
# is absent, `which()` returns integer(0) and the negative index would select
# zero columns, whereas the mask simply keeps every column.
data.train <- data.train[, names(data.train) != "Country"]
data.test <- data.test[, names(data.test) != "Country"]
# Number of missing values per column in the training data.
colSums(is.na(data.train))
## X1 X2 X3 X4 X5 X6 X7
## 12 0 0 0 0 0 0
## X8 X9 X10 X11 X12 X13 X14
## 7 0 0 17 0 0 11
## Risk.Level
## 0
# Number of missing values per column in the testing data.
data.test %>%
  is.na() %>%
  colSums()
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14
## 1 0 0 0 0 0 0 2 0 0 4 0 0 1
# Proportion of missing values per variable in the training data
# (grey = observed, pink = missing), sorted by amount of missingness.
aggr(
  x = data.train,
  labels = names(data.train),
  col = c("grey", "pink"),
  sortVars = TRUE,
  numbers = TRUE,
  cex.numbers = 0.7,
  cex.axis = 0.7
)
##
## Variables sorted by number of missings:
## Variable Count
## X11 0.17
## X1 0.12
## X14 0.11
## X8 0.07
## X2 0.00
## X3 0.00
## X4 0.00
## X5 0.00
## X6 0.00
## X7 0.00
## X9 0.00
## X10 0.00
## X12 0.00
## X13 0.00
## Risk.Level 0.00
Dalam analisis ini MICE akan menghasilkan 18 dataset berbeda dengan metode pmm (Predictive Mean Matching).
# Multiple imputation of the training data: 18 imputed data sets using
# predictive mean matching, with a fixed seed for reproducibility.
data.train_imp <- mice(
  data = data.train,
  method = "pmm",
  m = 18,
  seed = 288
)
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 1 6 X1 X8 X11 X14
## 1 7 X1 X8 X11 X14
## 1 8 X1 X8 X11 X14
## 1 9 X1 X8 X11 X14
## 1 10 X1 X8 X11 X14
## 1 11 X1 X8 X11 X14
## 1 12 X1 X8 X11 X14
## 1 13 X1 X8 X11 X14
## 1 14 X1 X8 X11 X14
## 1 15 X1 X8 X11 X14
## 1 16 X1 X8 X11 X14
## 1 17 X1 X8 X11 X14
## 1 18 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 2 6 X1 X8 X11 X14
## 2 7 X1 X8 X11 X14
## 2 8 X1 X8 X11 X14
## 2 9 X1 X8 X11 X14
## 2 10 X1 X8 X11 X14
## 2 11 X1 X8 X11 X14
## 2 12 X1 X8 X11 X14
## 2 13 X1 X8 X11 X14
## 2 14 X1 X8 X11 X14
## 2 15 X1 X8 X11 X14
## 2 16 X1 X8 X11 X14
## 2 17 X1 X8 X11 X14
## 2 18 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 3 6 X1 X8 X11 X14
## 3 7 X1 X8 X11 X14
## 3 8 X1 X8 X11 X14
## 3 9 X1 X8 X11 X14
## 3 10 X1 X8 X11 X14
## 3 11 X1 X8 X11 X14
## 3 12 X1 X8 X11 X14
## 3 13 X1 X8 X11 X14
## 3 14 X1 X8 X11 X14
## 3 15 X1 X8 X11 X14
## 3 16 X1 X8 X11 X14
## 3 17 X1 X8 X11 X14
## 3 18 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 4 6 X1 X8 X11 X14
## 4 7 X1 X8 X11 X14
## 4 8 X1 X8 X11 X14
## 4 9 X1 X8 X11 X14
## 4 10 X1 X8 X11 X14
## 4 11 X1 X8 X11 X14
## 4 12 X1 X8 X11 X14
## 4 13 X1 X8 X11 X14
## 4 14 X1 X8 X11 X14
## 4 15 X1 X8 X11 X14
## 4 16 X1 X8 X11 X14
## 4 17 X1 X8 X11 X14
## 4 18 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## 5 6 X1 X8 X11 X14
## 5 7 X1 X8 X11 X14
## 5 8 X1 X8 X11 X14
## 5 9 X1 X8 X11 X14
## 5 10 X1 X8 X11 X14
## 5 11 X1 X8 X11 X14
## 5 12 X1 X8 X11 X14
## 5 13 X1 X8 X11 X14
## 5 14 X1 X8 X11 X14
## 5 15 X1 X8 X11 X14
## 5 16 X1 X8 X11 X14
## 5 17 X1 X8 X11 X14
## 5 18 X1 X8 X11 X14
Ambil dataset yang telah diimputasi dari data.train_imp, yang dihasilkan oleh MICE, yaitu dataset ke-18
# Extract the 18th completed data set from the training imputations.
data.train_comp <- mice::complete(data = data.train_imp, action = 18)
tidak ada missing values
# Re-draw the missingness plot on the completed training data to confirm that
# no missing values remain.
aggr(
  x = data.train_comp,
  labels = names(data.train_comp),
  col = c("grey", "pink"),
  sortVars = TRUE,
  numbers = TRUE,
  cex.numbers = 0.7,
  cex.axis = 0.7
)
##
## Variables sorted by number of missings:
## Variable Count
## X1 0
## X2 0
## X3 0
## X4 0
## X5 0
## X6 0
## X7 0
## X8 0
## X9 0
## X10 0
## X11 0
## X12 0
## X13 0
## X14 0
## Risk.Level 0
ket: garis hitam: sebelum dan garis pink: sesudah
# For each variable that had missing values, overlay the density of the
# observed values (default solid line) with the density after imputation
# (pink, lty = 3). Looping over the names replaces four copy-pasted pairs of
# calls, and `TRUE` is spelled out because `T` can be reassigned.
for (imputed_var in c("X1", "X8", "X11", "X14")) {
  plot(
    density(data.train[[imputed_var]], na.rm = TRUE),
    main = paste("Data", imputed_var, "With NA")
  )
  lines(
    density(data.train_comp[[imputed_var]], na.rm = TRUE),
    col = "pink",
    lty = 3
  )
}
# Missing value Data Testing
Missing values terdapat pada X1, X8, X11, dan X14; proporsi terbesar pada X11, terkecil pada X1 dan X14.
Ket: grey = data yang tersedia, pink = missing values.
# Proportion of missing values per variable in the testing data
# (grey = observed, pink = missing), sorted by amount of missingness.
aggr(
  x = data.test,
  labels = names(data.test),
  col = c("grey", "pink"),
  sortVars = TRUE,
  numbers = TRUE,
  cex.numbers = 0.7,
  cex.axis = 0.7
)
##
## Variables sorted by number of missings:
## Variable Count
## X11 0.23529412
## X8 0.11764706
## X1 0.05882353
## X14 0.05882353
## X2 0.00000000
## X3 0.00000000
## X4 0.00000000
## X5 0.00000000
## X6 0.00000000
## X7 0.00000000
## X9 0.00000000
## X10 0.00000000
## X12 0.00000000
## X13 0.00000000
Dalam analisis ini MICE akan menghasilkan 18 dataset berbeda dengan metode pmm (Predictive Mean Matching)
# Multiple imputation of the testing data: 18 imputed data sets using
# predictive mean matching, with the same seed as for the training data.
# NOTE(review): this run reported "Number of logged events: 111"; the testing
# data has only 17 rows and is imputed on its own — inspect
# data.test_imp$loggedEvents and confirm the imputations are acceptable.
data.test_imp <- mice(
  data = data.test,
  method = "pmm",
  m = 18,
  seed = 288
)
##
## iter imp variable
## 1 1 X1 X8 X11 X14
## 1 2 X1 X8 X11 X14
## 1 3 X1 X8 X11 X14
## 1 4 X1 X8 X11 X14
## 1 5 X1 X8 X11 X14
## 1 6 X1 X8 X11 X14
## 1 7 X1 X8 X11 X14
## 1 8 X1 X8 X11 X14
## 1 9 X1 X8 X11 X14
## 1 10 X1 X8 X11 X14
## 1 11 X1 X8 X11 X14
## 1 12 X1 X8 X11 X14
## 1 13 X1 X8 X11 X14
## 1 14 X1 X8 X11 X14
## 1 15 X1 X8 X11 X14
## 1 16 X1 X8 X11 X14
## 1 17 X1 X8 X11 X14
## 1 18 X1 X8 X11 X14
## 2 1 X1 X8 X11 X14
## 2 2 X1 X8 X11 X14
## 2 3 X1 X8 X11 X14
## 2 4 X1 X8 X11 X14
## 2 5 X1 X8 X11 X14
## 2 6 X1 X8 X11 X14
## 2 7 X1 X8 X11 X14
## 2 8 X1 X8 X11 X14
## 2 9 X1 X8 X11 X14
## 2 10 X1 X8 X11 X14
## 2 11 X1 X8 X11 X14
## 2 12 X1 X8 X11 X14
## 2 13 X1 X8 X11 X14
## 2 14 X1 X8 X11 X14
## 2 15 X1 X8 X11 X14
## 2 16 X1 X8 X11 X14
## 2 17 X1 X8 X11 X14
## 2 18 X1 X8 X11 X14
## 3 1 X1 X8 X11 X14
## 3 2 X1 X8 X11 X14
## 3 3 X1 X8 X11 X14
## 3 4 X1 X8 X11 X14
## 3 5 X1 X8 X11 X14
## 3 6 X1 X8 X11 X14
## 3 7 X1 X8 X11 X14
## 3 8 X1 X8 X11 X14
## 3 9 X1 X8 X11 X14
## 3 10 X1 X8 X11 X14
## 3 11 X1 X8 X11 X14
## 3 12 X1 X8 X11 X14
## 3 13 X1 X8 X11 X14
## 3 14 X1 X8 X11 X14
## 3 15 X1 X8 X11 X14
## 3 16 X1 X8 X11 X14
## 3 17 X1 X8 X11 X14
## 3 18 X1 X8 X11 X14
## 4 1 X1 X8 X11 X14
## 4 2 X1 X8 X11 X14
## 4 3 X1 X8 X11 X14
## 4 4 X1 X8 X11 X14
## 4 5 X1 X8 X11 X14
## 4 6 X1 X8 X11 X14
## 4 7 X1 X8 X11 X14
## 4 8 X1 X8 X11 X14
## 4 9 X1 X8 X11 X14
## 4 10 X1 X8 X11 X14
## 4 11 X1 X8 X11 X14
## 4 12 X1 X8 X11 X14
## 4 13 X1 X8 X11 X14
## 4 14 X1 X8 X11 X14
## 4 15 X1 X8 X11 X14
## 4 16 X1 X8 X11 X14
## 4 17 X1 X8 X11 X14
## 4 18 X1 X8 X11 X14
## 5 1 X1 X8 X11 X14
## 5 2 X1 X8 X11 X14
## 5 3 X1 X8 X11 X14
## 5 4 X1 X8 X11 X14
## 5 5 X1 X8 X11 X14
## 5 6 X1 X8 X11 X14
## 5 7 X1 X8 X11 X14
## 5 8 X1 X8 X11 X14
## 5 9 X1 X8 X11 X14
## 5 10 X1 X8 X11 X14
## 5 11 X1 X8 X11 X14
## 5 12 X1 X8 X11 X14
## 5 13 X1 X8 X11 X14
## 5 14 X1 X8 X11 X14
## 5 15 X1 X8 X11 X14
## 5 16 X1 X8 X11 X14
## 5 17 X1 X8 X11 X14
## 5 18 X1 X8 X11 X14
## Warning: Number of logged events: 111
Ambil dataset yang telah diimputasi dari data.test_imp, yang dihasilkan oleh MICE, yaitu dataset ke-18
# Extract the 18th completed data set from the testing imputations.
data.test_comp <- mice::complete(data = data.test_imp, action = 18)
tidak ada missing values
# Re-draw the missingness plot on the completed testing data to confirm that
# no missing values remain.
aggr(
  x = data.test_comp,
  labels = names(data.test_comp),
  col = c("grey", "pink"),
  sortVars = TRUE,
  numbers = TRUE,
  cex.numbers = 0.7,
  cex.axis = 0.7
)
##
## Variables sorted by number of missings:
## Variable Count
## X1 0
## X2 0
## X3 0
## X4 0
## X5 0
## X6 0
## X7 0
## X8 0
## X9 0
## X10 0
## X11 0
## X12 0
## X13 0
## X14 0
ket: garis hitam: sebelum dan garis pink: sesudah
# For each variable that had missing values, overlay the density of the
# observed testing values (default solid line) with the density after
# imputation (pink, lty = 3). The previous code drew the imputed X1 density on
# top of the X8 plot; looping over the names keeps both curves on the same
# variable. `TRUE` is spelled out because `T` can be reassigned.
for (imputed_var in c("X1", "X8", "X11", "X14")) {
  plot(
    density(data.test[[imputed_var]], na.rm = TRUE),
    main = paste("Data", imputed_var, "With NA")
  )
  lines(
    density(data.test_comp[[imputed_var]], na.rm = TRUE),
    col = "pink",
    lty = 3
  )
}
# Data free of missing values: preview of the completed training data.
data.train_comp %>% glimpse()
## Rows: 100
## Columns: 15
## $ X1 <dbl> 17.5000, 18.2000, 18.7000, 16.2500, 14.0000, 14.0000, 23.25…
## $ X2 <dbl> 38674.616, 40105.120, 76037.997, 27882.829, 4251.398, 2033.…
## $ X3 <dbl> 172.75400, 103.52280, 31.03626, 24.78532, 89.61882, 57.0556…
## $ X4 <dbl> 0.68000, 1.76600, 2.63056, 1.29416, 1.44000, 22.35646, 36.7…
## $ X5 <dbl> 1.2206, 0.8698, 1.4893, 1.7530, 0.2562, 3.3422, 0.9657, 0.7…
## $ X6 <dbl> 1.78560, 2.65884, 1.85034, 2.23192, 4.74800, -0.87800, -0.2…
## $ X7 <dbl> -2.0843, -0.7254, -1.9008, -1.1355, 2.3318, -5.2032, -3.729…
## $ X8 <dbl> 55.00000, 102.52738, 102.52738, 102.52738, 166.80851, 34.81…
## $ X9 <dbl> -26.52000, -13.59890, -56.24160, 24.78532, 47.27262, 15.449…
## $ X10 <dbl> 2.857862, 352.910575, 199.928422, 10.108892, 12.645460, 62.…
## $ X11 <dbl> 8.0000, 8.1550, 8.1550, 0.3357, 6.6000, 10.3000, 10.6000, 2…
## $ X12 <dbl> 23.08410, 24.85976, 20.39940, 21.69104, 19.40300, 31.12380,…
## $ X13 <dbl> 26.94344, 32.47740, 31.03926, 17.30888, 15.11172, 20.57210,…
## $ X14 <dbl> 3.0000, 2.4500, 4.8177, 10.0000, 18.5000, 10.5000, 11.0500,…
## $ Risk.Level <fct> low, low, low, low, high, high, high, low, low, high, high,…
# Preview of the completed testing data.
data.test_comp %>% glimpse()
## Rows: 17
## Columns: 14
## $ X1 <dbl> 23.2000, 16.8056, 18.2857, 19.6715, 11.9000, 14.1400, 19.8000, 12.…
## $ X2 <dbl> 60338.0204, 62432.9952, 28684.1682, 21042.7221, 49356.2618, 3989.1…
## $ X3 <dbl> 175.42230, 409.69700, 103.06040, 102.73060, 60.15464, 65.55750, 33…
## $ X4 <dbl> 1.62000, 0.10510, 0.84352, 1.17400, 0.89594, 0.39400, 0.34500, 5.5…
## $ X5 <dbl> 0.6755, 0.9068, 0.0746, 0.0734, 0.5865, 0.5042, 0.3153, 1.1173, 1.…
## $ X6 <dbl> 2.47168, 2.77600, 3.55290, 3.21976, 1.75420, 2.44734, 3.44058, 1.6…
## $ X7 <dbl> 0.3526, 0.2912, 1.9299, 1.2325, -1.1342, -0.1248, 1.2787, -1.5047,…
## $ X8 <dbl> 185.64097, 94.00211, 72.30708, 111.78982, 88.60514, 88.88685, 100.…
## $ X9 <dbl> 64.14972, -200.98100, 16.23838, 33.35258, -145.43800, 27.33332, -4…
## $ X10 <dbl> 537.609866, 339.988210, 52.761781, 102.567122, 1.490827, 24.638720…
## $ X11 <dbl> 0.5000, 1.3095, 3.0176, 2.5300, 63.5000, 1.5706, 3.2000, 13.6000, …
## $ X12 <dbl> 25.11320, 26.76784, 19.90742, 22.83084, 17.79208, 16.78238, 23.059…
## $ X13 <dbl> 27.95256, 47.25374, 25.76882, 20.95780, 23.21144, 14.52982, 32.479…
## $ X14 <dbl> 8.6000, 3.0000, 5.0000, 7.0000, 7.3000, 9.0000, 2.0000, 17.0000, 1…
# Bar chart of the Risk.Level class frequencies in data.train_comp.
# First tabulate the number of rows per class ...
risk_counts <- data.train_comp %>%
  group_by(Risk.Level) %>%
  summarise(count = n())
# ... then draw one bar per class with its count printed above the bar.
ggplot(
  data = risk_counts,
  mapping = aes(x = Risk.Level, y = count, fill = Risk.Level)
) +
  geom_col() +
  geom_text(mapping = aes(label = count), size = 5, vjust = -0.5) +
  scale_fill_manual(values = c("low" = "pink", "high" = "red")) +
  labs(
    title = "Tingkat Risiko Investasi",
    x = "Risk Level",
    y = "Frekuensi"
  ) +
  theme_minimal()
# SVM
# Splitting Data
Data data.train_comp dibagi menjadi dua bagian, yaitu 80% sebagai data training dan 20% sebagai data testing.
# Hold-out split of the completed training data into 80 % training rows and
# 20 % test rows, partitioned on Risk.Level. The seed makes the partition
# reproducible.
set.seed(seed = 123)
train_index <- createDataPartition(
  y = data.train_comp$Risk.Level,
  p = 0.8,
  list = FALSE
)
data_test <- data.train_comp[-train_index, ]
data_train <- data.train_comp[train_index, ]
model SVM dengan kernel linear menggunakan data pelatihan data_train, dengan Risk.Level sebagai variabel target dan variabel lainnya sebagai prediktor
# Fit a linear-kernel SVM on data_train with Risk.Level as the response and
# all remaining columns as predictors.
svm_model <- svm(
  Risk.Level ~ .,
  kernel = "linear",
  data = data_train
)
lakukan prediksi memakai data data_test dan simpan hasilnya di svm_pred
# Predict the risk level for the held-out rows.
svm_pred <- predict(object = svm_model, newdata = data_test)
hitung akurasi prediksi dengan membandingkan svm_pred dengan data_test$Risk.Level
# Hold-out accuracy: share of held-out rows whose predicted class equals the
# true Risk.Level, reported as a percentage rounded to two decimals.
akurasi_svm <- sum(data_test$Risk.Level == svm_pred) / nrow(data_test)
print(
  paste(
    "SVM Akurasi Model:",
    round(100 * akurasi_svm, digits = 2),
    "%"
  )
)
## [1] "SVM Akurasi Model: 94.74 %"
# Confusion matrix of predicted vs. actual risk level on the held-out rows.
# Both inputs are already factors, so they are passed directly.
confusion_matrix <- confusionMatrix(
  data = svm_pred,
  reference = data_test$Risk.Level
)
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low
## high 10 1
## low 0 8
##
## Accuracy : 0.9474
## 95% CI : (0.7397, 0.9987)
## No Information Rate : 0.5263
## P-Value [Acc > NIR] : 9.149e-05
##
## Kappa : 0.8939
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 1.0000
## Specificity : 0.8889
## Pos Pred Value : 0.9091
## Neg Pred Value : 1.0000
## Prevalence : 0.5263
## Detection Rate : 0.5263
## Detection Prevalence : 0.5789
## Balanced Accuracy : 0.9444
##
## 'Positive' Class : high
##
10 prediksi benar untuk kelas high (model memprediksi high dan benar-benar high).
8 prediksi benar untuk kelas low (model memprediksi low dan benar-benar low).
# Draw the confusion table as a heat map: one tile per (Reference, Prediction)
# pair, filled by frequency and labelled with the count.
confusion_data <- as.data.frame(confusion_matrix$table)
ggplot(data = confusion_data, mapping = aes(x = Reference, y = Prediction)) +
  geom_tile(mapping = aes(fill = Freq), color = "purple") +
  geom_text(mapping = aes(label = Freq), vjust = 1) +
  scale_fill_gradient(low = "purple", high = "yellow") +
  labs(title = "Confusion Matrix", x = "Actual", y = "Predicted") +
  theme_minimal()
# Side-by-side listing of the actual and predicted class for each held-out row.
predictions <- data.frame(
  Actual = data_test$Risk.Level,
  Predicted = svm_pred
)
print(predictions)
## Actual Predicted
## 1 low low
## 5 high high
## 6 high high
## 12 high high
## 22 low low
## 30 high high
## 32 low low
## 42 high high
## 50 low low
## 51 low high
## 58 high high
## 62 low low
## 66 high high
## 68 low low
## 71 high high
## 73 high high
## 75 low low
## 93 high high
## 94 low low
Memprediksi Risk.Level pada data data.test_comp.
# Apply the fitted SVM to the completed testing sheet and show the predicted
# class for each of its rows.
svm_pred.excel <- predict(object = svm_model, newdata = data.test_comp)
print(svm_pred.excel)
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## low low low low high high low high high low high high low low high low
## 17
## high
## Levels: high low
Data disimpan dengan nama Prediksi.Tingkat.Risiko dan masuk dalam data frame data.test_comp
# Attach the predicted class to the completed testing data.
data.test_comp$Prediksi.Tingkat.Risiko <- svm_pred.excel
# The Testing sheet has no Risk Level column, so there is no ground truth to
# score these predictions against. The previous code compared the predictions
# with themselves, which yields 100 % by construction and says nothing about
# the model. The variable is kept for backward compatibility but marked as
# unavailable.
accuracy_svm.excel <- NA_real_
print(paste(
  "SVM Akurasi untuk Data Excel: tidak dapat dihitung",
  "(data Testing tidak memiliki label Risk Level)"
))
# Report the distribution of the predicted classes instead.
print(table(Prediksi = data.test_comp$Prediksi.Tingkat.Risiko))
## [1] "SVM Akurasi untuk Data Excel: tidak dapat dihitung (data Testing tidak memiliki label Risk Level)"
## Prediksi
## high  low
##    8    9
# Put the Country codes from the untouched testing sheet back in front of the
# completed data, then wrap each predicted label in a styled cell: purple
# background for "high", yellow for "low".
data.test_comp <- cbind(Country = test.data.ori$Country, data.test_comp) %>%
  mutate(
    Prediksi.Tingkat.Risiko = ifelse(
      Prediksi.Tingkat.Risiko == "high",
      cell_spec(Prediksi.Tingkat.Risiko, bold = TRUE, background = "purple"),
      cell_spec(Prediksi.Tingkat.Risiko, bold = TRUE, background = "yellow")
    )
  )
# Render the final table. `escape = FALSE` lets the markup produced by
# cell_spec() pass through unescaped.
kbl(data.test_comp, escape = FALSE) %>%
  kable_material_dark() %>%
  row_spec(row = 0, bold = TRUE, color = "pink") %>%
  kable_styling(font_size = 9)
| Country | X1 | X2 | X3 | X4 | X5 | X6 | X7 | X8 | X9 | X10 | X11 | X12 | X13 | X14 | Prediksi.Tingkat.Risiko |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SE | 23.2000 | 60338.0204 | 175.42230 | 1.62000 | 0.6755 | 2.47168 | 0.3526 | 185.64097 | 64.14972 | 537.609866 | 0.5000 | 25.11320 | 27.95256 | 8.6000 | low |
| SG | 16.8056 | 62432.9952 | 409.69700 | 0.10510 | 0.9068 | 2.77600 | 0.2912 | 94.00211 | -200.98100 | 339.988210 | 1.3095 | 26.76784 | 47.25374 | 3.0000 | low |
| SI | 18.2857 | 28684.1682 | 103.06040 | 0.84352 | 0.0746 | 3.55290 | 1.9299 | 72.30708 | 16.23838 | 52.761781 | 3.0176 | 19.90742 | 25.76882 | 5.0000 | low |
| SK | 19.6715 | 21042.7221 | 102.73060 | 1.17400 | 0.0734 | 3.21976 | 1.2325 | 111.78982 | 33.35258 | 102.567122 | 2.5300 | 22.83084 | 20.95780 | 7.0000 | low |
| SM | 11.9000 | 49356.2618 | 60.15464 | 0.89594 | 0.5865 | 1.75420 | -1.1342 | 88.60514 | -145.43800 | 1.490827 | 63.5000 | 17.79208 | 23.21144 | 7.3000 | high |
| SV | 14.1400 | 3989.1913 | 65.55750 | 0.39400 | 0.5042 | 2.44734 | -0.1248 | 88.88685 | 27.33332 | 24.638720 | 1.5706 | 16.78238 | 14.52982 | 9.0000 | high |
| TH | 19.8000 | 7450.5523 | 33.22256 | 0.34500 | 0.3153 | 3.44058 | 1.2787 | 100.19298 | -42.56340 | 501.644054 | 3.2000 | 23.05990 | 32.47950 | 2.0000 | low |
| TN | 12.9000 | 3616.8650 | 85.26668 | 5.55600 | 1.1173 | 1.60820 | -1.5047 | 134.47988 | 64.46288 | 39.218118 | 13.6000 | 18.80654 | 8.88180 | 17.0000 | high |
| TR | 18.0000 | 8652.9973 | 51.65878 | 11.65444 | 1.4844 | 4.15702 | 1.8070 | 116.52826 | 28.56998 | 720.244499 | 3.0176 | 28.55834 | 26.32778 | 13.2000 | high |
| TW | 14.1400 | 31854.2815 | 48.51016 | 0.72360 | 0.1015 | 2.53870 | 2.7686 | 71.08310 | -189.14000 | 668.122597 | 3.2000 | 22.02936 | 33.80118 | 3.7000 | low |
| UA | 22.0000 | 3955.0704 | 103.90710 | 19.17300 | -0.3906 | 0.34000 | 1.8906 | 72.25639 | -5.46582 | 155.581868 | 49.0000 | 17.79388 | 16.04966 | 9.5000 | high |
| UG | 21.5689 | 786.8776 | 42.26784 | 4.29470 | 3.6551 | 5.73874 | 0.4207 | 69.45587 | 21.88058 | 33.538172 | 3.3357 | 24.85996 | 19.47826 | 17.0000 | high |
| US | 16.3000 | 69324.7338 | 104.17110 | 1.55316 | 0.6255 | 2.45554 | 0.4867 | 116.52826 | 47.70210 | 20935.000000 | 1.0000 | 17.40650 | 17.20802 | 5.6146 | low |
| UY | 17.0400 | 15968.2306 | 73.01010 | 8.00348 | 0.3592 | 0.82090 | -0.7169 | 49.05568 | -16.23150 | 53.628838 | 2.5300 | 16.44666 | 17.57796 | 10.3000 | low |
| UZ | 18.4000 | 1872.6699 | 30.04996 | 12.29840 | 1.6065 | 5.84000 | 3.0735 | 49.05568 | -45.16010 | 57.707193 | 2.1000 | 31.60162 | 29.24286 | 6.0000 | high |
| VN | 12.0977 | 3886.5162 | 34.52492 | 2.79600 | 0.8506 | 6.94570 | 5.2762 | 86.56201 | 7.39622 | 351.683014 | 1.6900 | 23.54764 | 25.80812 | 2.5000 | low |
| ZA | 16.6000 | 6404.6725 | 50.79576 | 4.98278 | 1.4772 | 0.78940 | -2.3230 | 107.24859 | 15.04496 | 302.141270 | 3.3357 | 19.11740 | 16.25316 | 33.7000 | high |