`
Dokumen ini melakukan telaah variabel,
statistik deskriptif, karakteristik
data, dan cek multikolinearitas (VIF) pada
dataset Life Expectancy Data Clean.xlsx. Target asli
Life expectancy akan dikonversi menjadi 3
kelas (Low / Medium / High) berdasarkan tertiles.
# Packages used throughout the analysis.
library(readxl)
library(dplyr)
library(ggplot2)
library(janitor)
library(psych)
library(naniar)
library(corrplot)
library(car) # vif
library(caret) # preprocessing
# library() takes a package name; the previous `randomForest()` call form
# only loaded the package by accident of how library() deparses its argument.
library(randomForest)
library(e1071)
# Path to the workbook (adjust if needed).
file_path <- "C:/Users/USER/Downloads/Life Expectancy Data Clean.xlsx"
df <- read_excel(file_path)
# Clean the column names so they are easy to reference (snake_case).
names(df) <- make_clean_names(names(df))
# Preview the column types and first values.
glimpse(df)
## Rows: 2,938
## Columns: 22
## $ country <chr> "Afghanistan", "Afghanistan", "Afghani…
## $ year <dbl> 2015, 2014, 2013, 2012, 2011, 2010, 20…
## $ status <chr> "Developing", "Developing", "Developin…
## $ life_expectancy <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8, 58…
## $ adult_mortality <dbl> 263, 271, 268, 272, 275, 279, 281, 287…
## $ infant_deaths <dbl> 62, 64, 66, 69, 71, 74, 77, 80, 82, 84…
## $ alcohol <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.…
## $ percentage_expenditure <dbl> 71.279624, 73.523582, 73.219243, 78.18…
## $ hepatitis_b <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63, 64…
## $ measles <dbl> 1154, 492, 430, 2787, 3013, 1989, 2861…
## $ bmi <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7, 16…
## $ under_five_deaths <dbl> 83, 86, 89, 93, 97, 102, 106, 110, 113…
## $ polio <dbl> 6, 58, 62, 67, 68, 66, 63, 64, 63, 58,…
## $ total_expenditure <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20, 9.…
## $ diphtheria <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63, 58…
## $ hiv_aids <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1…
## $ gdp <dbl> 584.25921, 612.69651, 631.74498, 669.9…
## $ population <dbl> 33736494, 327582, 31731688, 3696958, 2…
## $ thinness_1_19_years <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4, 18…
## $ thinness_5_9_years <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4, 18…
## $ income_composition_of_resources <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, 0.4…
## $ schooling <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9, 8…
# Print the first rows of the loaded data as a quick sanity check.
head(df)
## # A tibble: 6 × 22
## country year status life_expectancy adult_mortality infant_deaths alcohol
## <chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Afghanistan 2015 Devel… 65 263 62 0.01
## 2 Afghanistan 2014 Devel… 59.9 271 64 0.01
## 3 Afghanistan 2013 Devel… 59.9 268 66 0.01
## 4 Afghanistan 2012 Devel… 59.5 272 69 0.01
## 5 Afghanistan 2011 Devel… 59.2 275 71 0.01
## 6 Afghanistan 2010 Devel… 58.8 279 74 0.01
## # ℹ 15 more variables: percentage_expenditure <dbl>, hepatitis_b <dbl>,
## # measles <dbl>, bmi <dbl>, under_five_deaths <dbl>, polio <dbl>,
## # total_expenditure <dbl>, diphtheria <dbl>, hiv_aids <dbl>, gdp <dbl>,
## # population <dbl>, thinness_1_19_years <dbl>, thinness_5_9_years <dbl>,
## # income_composition_of_resources <dbl>, schooling <dbl>
# List every variable together with its primary class.
# vapply() always yields a character vector (one entry per column), whereas
# sapply() silently changes its return type on empty or ragged input.
vars_tbl <- tibble(
  variable = names(df),
  type = vapply(df, function(x) class(x)[1], character(1))
)
vars_tbl
## # A tibble: 22 × 2
## variable type
## <chr> <chr>
## 1 country character
## 2 year numeric
## 3 status character
## 4 life_expectancy numeric
## 5 adult_mortality numeric
## 6 infant_deaths numeric
## 7 alcohol numeric
## 8 percentage_expenditure numeric
## 9 hepatitis_b numeric
## 10 measles numeric
## # ℹ 12 more rows
Catatan: variabel kategorik pada dataset ini adalah country dan status
(bertipe character), sedangkan variabel lainnya bertipe numeric.
Keputusan: membagi life_expectancy menjadi 3 kelas
berdasarkan tertiles (quantile 33.33% & 66.67%).
# Guard: the target column must exist under the name that
# make_clean_names() normally produces (life_expectancy).
if (!("life_expectancy" %in% names(df))) {
  stop("Kolom 'life_expectancy' tidak ditemukan. Cek nama kolom.")
}
# Tertile break points (min, 1/3, 2/3, max), ignoring missing values.
q <- quantile(
  df[["life_expectancy"]],
  probs = c(0, 1 / 3, 2 / 3, 1),
  na.rm = TRUE
)
q
## 0% 33.33333% 66.66667% 100%
## 36.3 66.6 74.4 89.0
# Bin the target into a three-level factor using the tertile breaks in `q`;
# include.lowest keeps the minimum value inside the first bin.
df$life_cat <- cut(
  df$life_expectancy,
  breaks = q,
  labels = c("Low", "Medium", "High"),
  include.lowest = TRUE
)
# Class counts, showing NA only when present.
table(df$life_cat, useNA = "ifany")
##
## Low Medium High <NA>
## 984 978 966 10
# Keep only the numeric columns for the descriptive statistics.
num_vars <- select(df, where(is.numeric))
# Basic five-number summary (plus mean and NA count) per column.
summary(num_vars)
## year life_expectancy adult_mortality infant_deaths
## Min. :2000 Min. :36.30 Min. : 1.0 Min. : 0.0
## 1st Qu.:2004 1st Qu.:63.10 1st Qu.: 74.0 1st Qu.: 0.0
## Median :2008 Median :72.10 Median :144.0 Median : 3.0
## Mean :2008 Mean :69.22 Mean :164.8 Mean : 30.3
## 3rd Qu.:2012 3rd Qu.:75.70 3rd Qu.:228.0 3rd Qu.: 22.0
## Max. :2015 Max. :89.00 Max. :723.0 Max. :1800.0
## NA's :10 NA's :10
## alcohol percentage_expenditure hepatitis_b measles
## Min. : 0.0100 Min. : 0.000 Min. : 1.00 Min. : 0.0
## 1st Qu.: 0.8775 1st Qu.: 4.685 1st Qu.:77.00 1st Qu.: 0.0
## Median : 3.7550 Median : 64.913 Median :92.00 Median : 17.0
## Mean : 4.6029 Mean : 738.251 Mean :80.94 Mean : 2419.6
## 3rd Qu.: 7.7025 3rd Qu.: 441.534 3rd Qu.:97.00 3rd Qu.: 360.2
## Max. :17.8700 Max. :19479.912 Max. :99.00 Max. :212183.0
## NA's :194 NA's :553
## bmi under_five_deaths polio total_expenditure
## Min. : 1.00 Min. : 0.00 Min. : 3.00 Min. : 0.370
## 1st Qu.:19.30 1st Qu.: 0.00 1st Qu.:78.00 1st Qu.: 4.260
## Median :43.50 Median : 4.00 Median :93.00 Median : 5.755
## Mean :38.32 Mean : 42.04 Mean :82.55 Mean : 5.938
## 3rd Qu.:56.20 3rd Qu.: 28.00 3rd Qu.:97.00 3rd Qu.: 7.492
## Max. :87.30 Max. :2500.00 Max. :99.00 Max. :17.600
## NA's :34 NA's :19 NA's :226
## diphtheria hiv_aids gdp population
## Min. : 2.00 Min. : 0.100 Min. : 1.68 Min. :3.400e+01
## 1st Qu.:78.00 1st Qu.: 0.100 1st Qu.: 463.94 1st Qu.:1.958e+05
## Median :93.00 Median : 0.100 Median : 1766.95 Median :1.387e+06
## Mean :82.32 Mean : 1.742 Mean : 7483.16 Mean :1.275e+07
## 3rd Qu.:97.00 3rd Qu.: 0.800 3rd Qu.: 5910.81 3rd Qu.:7.420e+06
## Max. :99.00 Max. :50.600 Max. :119172.74 Max. :1.294e+09
## NA's :19 NA's :448 NA's :652
## thinness_1_19_years thinness_5_9_years income_composition_of_resources
## Min. : 0.10 Min. : 0.10 Min. :0.0000
## 1st Qu.: 1.60 1st Qu.: 1.50 1st Qu.:0.4930
## Median : 3.30 Median : 3.30 Median :0.6770
## Mean : 4.84 Mean : 4.87 Mean :0.6276
## 3rd Qu.: 7.20 3rd Qu.: 7.20 3rd Qu.:0.7790
## Max. :27.70 Max. :28.60 Max. :0.9480
## NA's :34 NA's :34 NA's :167
## schooling
## Min. : 0.00
## 1st Qu.:10.10
## Median :12.30
## Mean :11.99
## 3rd Qu.:14.30
## Max. :20.70
## NA's :163
# More complete description of the numeric columns
# (n, mean, sd, median, trimmed mean, mad, range, skew, kurtosis, se).
psych::describe(num_vars)
## vars n mean sd median
## year 1 2938 2007.52 4.61 2008.00
## life_expectancy 2 2928 69.22 9.52 72.10
## adult_mortality 3 2928 164.80 124.29 144.00
## infant_deaths 4 2938 30.30 117.93 3.00
## alcohol 5 2744 4.60 4.05 3.76
## percentage_expenditure 6 2938 738.25 1987.91 64.91
## hepatitis_b 7 2385 80.94 25.07 92.00
## measles 8 2938 2419.59 11467.27 17.00
## bmi 9 2904 38.32 20.04 43.50
## under_five_deaths 10 2938 42.04 160.45 4.00
## polio 11 2919 82.55 23.43 93.00
## total_expenditure 12 2712 5.94 2.50 5.76
## diphtheria 13 2919 82.32 23.72 93.00
## hiv_aids 14 2938 1.74 5.08 0.10
## gdp 15 2490 7483.16 14270.17 1766.95
## population 16 2286 12753375.12 61012096.51 1386542.00
## thinness_1_19_years 17 2904 4.84 4.42 3.30
## thinness_5_9_years 18 2904 4.87 4.51 3.30
## income_composition_of_resources 19 2771 0.63 0.21 0.68
## schooling 20 2775 11.99 3.36 12.30
## trimmed mad min max
## year 2007.52 5.93 2000.00 2.015000e+03
## life_expectancy 69.91 8.60 36.30 8.900000e+01
## adult_mortality 150.51 112.68 1.00 7.230000e+02
## infant_deaths 10.20 4.45 0.00 1.800000e+03
## alcohol 4.23 4.81 0.01 1.787000e+01
## percentage_expenditure 230.74 96.24 0.00 1.947991e+04
## hepatitis_b 86.89 8.90 1.00 9.900000e+01
## measles 286.08 25.20 0.00 2.121830e+05
## bmi 39.05 24.17 1.00 8.730000e+01
## under_five_deaths 14.15 5.93 0.00 2.500000e+03
## polio 88.05 8.90 3.00 9.900000e+01
## total_expenditure 5.85 2.36 0.37 1.760000e+01
## diphtheria 87.99 8.90 2.00 9.900000e+01
## hiv_aids 0.54 0.00 0.10 5.060000e+01
## gdp 3751.73 2360.98 1.68 1.191727e+05
## population 3953693.58 2012347.06 34.00 1.293859e+09
## thinness_1_19_years 4.14 3.41 0.10 2.770000e+01
## thinness_5_9_years 4.15 3.41 0.10 2.860000e+01
## income_composition_of_resources 0.65 0.19 0.00 9.500000e-01
## schooling 12.17 3.11 0.00 2.070000e+01
## range skew kurtosis se
## year 1.500000e+01 -0.01 -1.21 0.09
## life_expectancy 5.270000e+01 -0.64 -0.24 0.18
## adult_mortality 7.220000e+02 1.17 1.74 2.30
## infant_deaths 1.800000e+03 9.78 115.76 2.18
## alcohol 1.786000e+01 0.59 -0.81 0.08
## percentage_expenditure 1.947991e+04 4.65 26.51 36.68
## hepatitis_b 9.800000e+01 -1.93 2.76 0.51
## measles 2.121830e+05 9.43 114.58 211.56
## bmi 8.630000e+01 -0.22 -1.29 0.37
## under_five_deaths 2.500000e+03 9.49 109.49 2.96
## polio 9.600000e+01 -2.10 3.76 0.43
## total_expenditure 1.723000e+01 0.62 1.15 0.05
## diphtheria 9.700000e+01 -2.07 3.55 0.44
## hiv_aids 5.050000e+01 5.39 34.80 0.09
## gdp 1.191711e+05 3.20 12.29 285.98
## population 1.293859e+09 15.90 297.09 1276079.80
## thinness_1_19_years 2.760000e+01 1.71 3.96 0.08
## thinness_5_9_years 2.850000e+01 1.78 4.34 0.08
## income_composition_of_resources 9.500000e-01 -1.14 1.38 0.00
## schooling 2.070000e+01 -0.60 0.88 0.06
# Percentage of missing values per column.
# vapply() pins the result to one numeric value per column, unlike sapply().
miss_pct <- vapply(df, function(x) mean(is.na(x)), numeric(1)) * 100
# Sort so the columns with the most missing data come first.
miss_tbl <- tibble(variable = names(miss_pct), missing_pct = miss_pct) %>%
  arrange(desc(missing_pct))
miss_tbl
## # A tibble: 23 × 2
## variable missing_pct
## <chr> <dbl>
## 1 population 22.2
## 2 hepatitis_b 18.8
## 3 gdp 15.2
## 4 total_expenditure 7.69
## 5 alcohol 6.60
## 6 income_composition_of_resources 5.68
## 7 schooling 5.55
## 8 bmi 1.16
## 9 thinness_1_19_years 1.16
## 10 thinness_5_9_years 1.16
## # ℹ 13 more rows
# Missingness map of the whole data frame.
naniar::vis_miss(df)

# Histogram of life_expectancy with the two inner tertile cut points
# (elements 2 and 3 of `q`) drawn as dashed vertical lines.
ggplot(data = df, mapping = aes(x = life_expectancy)) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = as.numeric(q[c(2, 3)]), linetype = "dashed") +
  labs(
    title = "Distribusi Life Expectancy & batas tertiles",
    x = "Life expectancy",
    y = "Count"
  )

# Bar chart of the number of rows per life_cat class.
ggplot(data = df, mapping = aes(x = life_cat)) +
  geom_bar() +
  labs(title = "Jumlah per kelas life_cat")
# Correlation input: numeric columns only, without `year` (optional: it is
# dropped here as not relevant to the correlation view).
num_for_corr <- num_vars %>% select(-year)
# Drop columns that contain no observed values at all.
has_data <- vapply(num_for_corr, function(x) any(!is.na(x)), logical(1))
num_for_corr <- num_for_corr[, has_data]
# Pairwise-complete correlations, so each pair uses all rows where both
# variables are observed.
cors <- cor(num_for_corr, use = "pairwise.complete.obs")
# Show at most the first 10 x 10 corner; bounding by ncol() avoids a
# "subscript out of bounds" error when fewer than 10 columns remain.
n_show <- min(10L, ncol(cors))
round(cors, 2)[seq_len(n_show), seq_len(n_show), drop = FALSE]
## life_expectancy adult_mortality infant_deaths alcohol
## life_expectancy 1.00 -0.70 -0.20 0.40
## adult_mortality -0.70 1.00 0.08 -0.20
## infant_deaths -0.20 0.08 1.00 -0.12
## alcohol 0.40 -0.20 -0.12 1.00
## percentage_expenditure 0.38 -0.24 -0.09 0.34
## hepatitis_b 0.26 -0.16 -0.22 0.09
## measles -0.16 0.03 0.50 -0.05
## bmi 0.57 -0.39 -0.23 0.33
## under_five_deaths -0.22 0.09 1.00 -0.11
## polio 0.47 -0.27 -0.17 0.22
## percentage_expenditure hepatitis_b measles bmi
## life_expectancy 0.38 0.26 -0.16 0.57
## adult_mortality -0.24 -0.16 0.03 -0.39
## infant_deaths -0.09 -0.22 0.50 -0.23
## alcohol 0.34 0.09 -0.05 0.33
## percentage_expenditure 1.00 0.02 -0.06 0.23
## hepatitis_b 0.02 1.00 -0.12 0.15
## measles -0.06 -0.12 1.00 -0.18
## bmi 0.23 0.15 -0.18 1.00
## under_five_deaths -0.09 -0.23 0.51 -0.24
## polio 0.15 0.49 -0.14 0.28
## under_five_deaths polio
## life_expectancy -0.22 0.47
## adult_mortality 0.09 -0.27
## infant_deaths 1.00 -0.17
## alcohol -0.11 0.22
## percentage_expenditure -0.09 0.15
## hepatitis_b -0.23 0.49
## measles 0.51 -0.14
## bmi -0.24 0.28
## under_five_deaths 1.00 -0.19
## polio -0.19 1.00
# Correlation heatmap of the full matrix.
corrplot::corrplot(cors, method = "color", tl.cex = 0.7, number.cex = 0.6)

# A handful of key features, restricted to those actually present in df.
features <- c(
  "adult_mortality", "infant_deaths", "alcohol",
  "percentage_expenditure", "gdp", "schooling"
)
features <- intersect(features, names(df))

library(tidyr)
# Reshape to long format (one row per feature/value pair) for plotting.
long <- pivot_longer(
  select(df, all_of(features)),
  cols = everything(),
  names_to = "feature",
  values_to = "value"
)
# Horizontal boxplots, one per feature.
ggplot(long, aes(x = feature, y = value)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Boxplot beberapa fitur")
Untuk menghitung VIF, kita butuh data numerik lengkap — saya lakukan imputasi sederhana (median) sebelum fitting model linier sementara.
# Numeric data for the VIF computation: all numeric columns (this still
# includes life_expectancy, which serves as the response of the temporary
# model), minus any column that has no observed values.
num_for_vif <- df %>%
  select(where(is.numeric)) %>%
  select(where(function(col) any(!is.na(col))))
# Simple median imputation via caret.
pre <- preProcess(num_for_vif, method = "medianImpute")
num_imp <- predict(pre, num_for_vif)
# Confirm that no missing values remain.
sum(is.na(num_imp))
## [1] 0
# Temporary linear model: life_expectancy regressed on every other numeric
# column, fitted only so car::vif() can be computed from it.
predictors <- setdiff(names(num_imp), "life_expectancy")
formula_vif <- reformulate(predictors, response = "life_expectancy")
lm_vif <- lm(formula_vif, data = num_imp)
vif_vals <- car::vif(lm_vif)
# Tabulate the VIF values, largest first.
vif_tbl <- arrange(
  tibble(variable = names(vif_vals), VIF = as.numeric(vif_vals)),
  desc(VIF)
)
vif_tbl
## # A tibble: 19 × 2
## variable VIF
## <chr> <dbl>
## 1 infant_deaths 177.
## 2 under_five_deaths 176.
## 3 thinness_5_9_years 8.87
## 4 thinness_1_19_years 8.78
## 5 gdp 6.01
## 6 percentage_expenditure 5.79
## 7 schooling 3.32
## 8 income_composition_of_resources 3.04
## 9 diphtheria 2.16
## 10 polio 1.94
## 11 adult_mortality 1.73
## 12 bmi 1.72
## 13 alcohol 1.65
## 14 population 1.49
## 15 hiv_aids 1.44
## 16 measles 1.38
## 17 hepatitis_b 1.31
## 18 total_expenditure 1.20
## 19 year 1.15
# Rule of thumb: flag VIF > 10 as high and VIF > 5 as a cautionary level.
mutate(
  vif_tbl,
  flag = case_when(
    VIF > 10 ~ "High (>10)",
    VIF > 5 ~ "Moderate (>5)",
    TRUE ~ "OK"
  )
)
## # A tibble: 19 × 3
## variable VIF flag
## <chr> <dbl> <chr>
## 1 infant_deaths 177. High (>10)
## 2 under_five_deaths 176. High (>10)
## 3 thinness_5_9_years 8.87 Moderate (>5)
## 4 thinness_1_19_years 8.78 Moderate (>5)
## 5 gdp 6.01 Moderate (>5)
## 6 percentage_expenditure 5.79 Moderate (>5)
## 7 schooling 3.32 OK
## 8 income_composition_of_resources 3.04 OK
## 9 diphtheria 2.16 OK
## 10 polio 1.94 OK
## 11 adult_mortality 1.73 OK
## 12 bmi 1.72 OK
## 13 alcohol 1.65 OK
## 14 population 1.49 OK
## 15 hiv_aids 1.44 OK
## 16 measles 1.38 OK
## 17 hepatitis_b 1.31 OK
## 18 total_expenditure 1.20 OK
## 19 year 1.15 OK
Catatan: VIF dihitung dari model linier sementara. Jika banyak variabel kategorik atau non-linear relationship, interpretasi harus hati-hati.
# Create the classification target `lifeclass`.
if ("lifeclass" %in% names(df)) {
  # Column already exists: just make sure it is a factor.
  df$lifeclass <- as.factor(df$lifeclass)
} else {
  # Otherwise derive it from life_expectancy using tertile breaks.
  q <- quantile(
    df$life_expectancy,
    probs = c(0, 1 / 3, 2 / 3, 1),
    na.rm = TRUE
  )
  df$lifeclass <- cut(
    df$life_expectancy,
    breaks = q,
    labels = c("Low", "Medium", "High"),
    include.lowest = TRUE
  )
}
table(df$lifeclass)
##
## Low Medium High
## 984 978 966
# Build the modelling table.
# * Rows whose target is missing are removed: a label must never be imputed.
# * `life_expectancy` (the variable the classes are cut from) and `life_cat`
#   (the same binning as `lifeclass`) are removed together with `country`.
#   Keeping them would let any model read the label straight from its
#   predictors (target leakage) and report meaningless perfect scores.
df_model <- df %>%
  filter(!is.na(lifeclass)) %>%
  select(-any_of(c("country", "life_expectancy", "life_cat")))
df_model$lifeclass <- as.factor(df_model$lifeclass)

# Train/test split: 80/20 partition on the target via createDataPartition().
set.seed(123)
train_index <- createDataPartition(df_model$lifeclass, p = 0.8, list = FALSE)
train <- df_model[train_index, ]
test <- df_model[-train_index, ]

# Most frequent non-missing value of `x` (first one wins on ties).
get_mode <- function(x) {
  ux <- unique(x[!is.na(x)])
  ux[which.max(tabulate(match(x, ux)))]
}

# Predictor columns by type. The target is excluded from both groups so the
# imputation steps below can never touch it.
num_cols <- names(train)[vapply(train, is.numeric, logical(1))]
cat_cols <- setdiff(names(train), c(num_cols, "lifeclass"))

# Numeric predictors: median imputation learned on the training set only and
# then applied unchanged to the test set.
pre_num <- preProcess(train[, num_cols], method = "medianImpute")
train_num_imp <- predict(pre_num, train[, num_cols])
test_num_imp <- predict(pre_num, test[, num_cols])

# Categorical predictors: fill gaps with the training-set mode.
train_cat_imp <- train[, cat_cols, drop = FALSE]
test_cat_imp <- test[, cat_cols, drop = FALSE]
for (col in names(train_cat_imp)) {
  mode_val <- get_mode(train_cat_imp[[col]])
  train_cat_imp[[col]][is.na(train_cat_imp[[col]])] <- mode_val
  test_cat_imp[[col]][is.na(test_cat_imp[[col]])] <- mode_val
}

# Reassemble the imputed predictors with the untouched target.
train_imp <- bind_cols(train_num_imp, train_cat_imp, train["lifeclass"])
test_imp <- bind_cols(test_num_imp, test_cat_imp, test["lifeclass"])
train_imp$lifeclass <- as.factor(train_imp$lifeclass)
test_imp$lifeclass <- as.factor(test_imp$lifeclass)
# 10. Random forest
# Every column of train_imp except the target is used as a predictor.
predictors <- setdiff(names(train_imp), "lifeclass")
formula_rf <- reformulate(predictors, response = "lifeclass")
set.seed(123)
# 500 trees; mtry is the floor of the square root of the predictor count.
rf_model <- randomForest(
  formula_rf,
  data = train_imp,
  ntree = 500,
  mtry = floor(sqrt(length(predictors))),
  importance = TRUE
)
rf_model
##
## Call:
## randomForest(formula = formula_rf, data = train_imp, ntree = 500, mtry = floor(sqrt(length(predictors))), importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 0%
## Confusion matrix:
## Low Medium High class.error
## Low 796 0 0 0
## Medium 0 783 0 0
## High 0 0 773 0
# Predict on the held-out set and compare against the true classes.
y_pred_rf <- predict(rf_model, newdata = test_imp)
cm_rf <- confusionMatrix(data = y_pred_rf, reference = test_imp$lifeclass)
cm_rf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Medium High
## Low 198 0 0
## Medium 0 195 0
## High 0 0 193
##
## Overall Statistics
##
## Accuracy : 1
## 95% CI : (0.9937, 1)
## No Information Rate : 0.3379
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Low Class: Medium Class: High
## Sensitivity 1.0000 1.0000 1.0000
## Specificity 1.0000 1.0000 1.0000
## Pos Pred Value 1.0000 1.0000 1.0000
## Neg Pred Value 1.0000 1.0000 1.0000
## Prevalence 0.3379 0.3328 0.3294
## Detection Rate 0.3379 0.3328 0.3294
## Detection Prevalence 0.3379 0.3328 0.3294
## Balanced Accuracy 1.0000 1.0000 1.0000
# 11. SVM
# Centre and scale the numeric columns using statistics learned on the
# training set, then apply the same transformation to the test set.
num_cols <- names(train_imp)[vapply(train_imp, is.numeric, logical(1))]
pre_svm <- preProcess(train_imp[, num_cols], method = c("center", "scale"))

train_svm <- train_imp
train_svm[, num_cols] <- predict(pre_svm, train_imp[, num_cols])
test_svm <- test_imp
test_svm[, num_cols] <- predict(pre_svm, test_imp[, num_cols])

# Radial-kernel SVM on all remaining columns, evaluated on the test set.
svm_model <- svm(lifeclass ~ ., data = train_svm, kernel = "radial")
y_pred_svm <- predict(svm_model, newdata = test_svm)
cm_svm <- confusionMatrix(data = y_pred_svm, reference = test_svm$lifeclass)
cm_svm
## Confusion Matrix and Statistics
##
## Reference
## Prediction Low Medium High
## Low 198 2 0
## Medium 0 193 1
## High 0 0 192
##
## Overall Statistics
##
## Accuracy : 0.9949
## 95% CI : (0.9851, 0.9989)
## No Information Rate : 0.3379
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.9923
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Low Class: Medium Class: High
## Sensitivity 1.0000 0.9897 0.9948
## Specificity 0.9948 0.9974 1.0000
## Pos Pred Value 0.9900 0.9948 1.0000
## Neg Pred Value 1.0000 0.9949 0.9975
## Prevalence 0.3379 0.3328 0.3294
## Detection Rate 0.3379 0.3294 0.3276
## Detection Prevalence 0.3413 0.3311 0.3276
## Balanced Accuracy 0.9974 0.9936 0.9974
# 12. Model comparison
# Overall accuracy taken from each confusion matrix.
acc_rf <- cm_rf$overall["Accuracy"]
acc_svm <- cm_svm$overall["Accuracy"]
# Macro F1: unweighted mean of the per-class F1 scores.
f1_rf <- mean(cm_rf$byClass[, "F1"], na.rm = TRUE)
f1_svm <- mean(cm_svm$byClass[, "F1"], na.rm = TRUE)
# Side-by-side summary table.
data.frame(
  Model = c("Random Forest", "SVM RBF"),
  Accuracy = c(acc_rf, acc_svm),
  F1_Macro = c(f1_rf, f1_svm)
)
## Model Accuracy F1_Macro
## 1 Random Forest 1.0000000 1.0000000
## 2 SVM RBF 0.9948805 0.9948885