`

Tujuan

Dokumen ini berisi telaah variabel, statistik deskriptif, karakteristik data, dan pengecekan multikolinearitas (VIF) pada dataset Life Expectancy Data Clean.xlsx. Variabel target asli, life_expectancy, dikonversi menjadi 3 kelas (Low / Medium / High) berdasarkan tertil.


1. Load library & data

# Packages attached for the whole analysis.
library(readxl)
library(dplyr)
library(ggplot2)
library(janitor)
library(psych)
library(naniar)
library(corrplot)
library(car)    # vif
library(caret)  # preprocessing
# Pass the package name itself; the previous `library(randomForest())` handed
# library() a call expression and only loaded the package by accident.
library(randomForest)
library(e1071)

# Location of the workbook (adjust if needed)
file_path <- "C:/Users/USER/Downloads/Life Expectancy Data Clean.xlsx"

# Read the workbook and clean the column names in one pass so every column
# can be referenced by an easy-to-type name
df <- read_excel(file_path) %>%
  rename_with(make_clean_names)

# Structural preview
glimpse(df)
## Rows: 2,938
## Columns: 22
## $ country                         <chr> "Afghanistan", "Afghanistan", "Afghani…
## $ year                            <dbl> 2015, 2014, 2013, 2012, 2011, 2010, 20…
## $ status                          <chr> "Developing", "Developing", "Developin…
## $ life_expectancy                 <dbl> 65.0, 59.9, 59.9, 59.5, 59.2, 58.8, 58…
## $ adult_mortality                 <dbl> 263, 271, 268, 272, 275, 279, 281, 287…
## $ infant_deaths                   <dbl> 62, 64, 66, 69, 71, 74, 77, 80, 82, 84…
## $ alcohol                         <dbl> 0.01, 0.01, 0.01, 0.01, 0.01, 0.01, 0.…
## $ percentage_expenditure          <dbl> 71.279624, 73.523582, 73.219243, 78.18…
## $ hepatitis_b                     <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63, 64…
## $ measles                         <dbl> 1154, 492, 430, 2787, 3013, 1989, 2861…
## $ bmi                             <dbl> 19.1, 18.6, 18.1, 17.6, 17.2, 16.7, 16…
## $ under_five_deaths               <dbl> 83, 86, 89, 93, 97, 102, 106, 110, 113…
## $ polio                           <dbl> 6, 58, 62, 67, 68, 66, 63, 64, 63, 58,…
## $ total_expenditure               <dbl> 8.16, 8.18, 8.13, 8.52, 7.87, 9.20, 9.…
## $ diphtheria                      <dbl> 65, 62, 64, 67, 68, 66, 63, 64, 63, 58…
## $ hiv_aids                        <dbl> 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1…
## $ gdp                             <dbl> 584.25921, 612.69651, 631.74498, 669.9…
## $ population                      <dbl> 33736494, 327582, 31731688, 3696958, 2…
## $ thinness_1_19_years             <dbl> 17.2, 17.5, 17.7, 17.9, 18.2, 18.4, 18…
## $ thinness_5_9_years              <dbl> 17.3, 17.5, 17.7, 18.0, 18.2, 18.4, 18…
## $ income_composition_of_resources <dbl> 0.479, 0.476, 0.470, 0.463, 0.454, 0.4…
## $ schooling                       <dbl> 10.1, 10.0, 9.9, 9.8, 9.5, 9.2, 8.9, 8…
# Show the first rows of the data
head(df)
## # A tibble: 6 × 22
##   country      year status life_expectancy adult_mortality infant_deaths alcohol
##   <chr>       <dbl> <chr>            <dbl>           <dbl>         <dbl>   <dbl>
## 1 Afghanistan  2015 Devel…            65               263            62    0.01
## 2 Afghanistan  2014 Devel…            59.9             271            64    0.01
## 3 Afghanistan  2013 Devel…            59.9             268            66    0.01
## 4 Afghanistan  2012 Devel…            59.5             272            69    0.01
## 5 Afghanistan  2011 Devel…            59.2             275            71    0.01
## 6 Afghanistan  2010 Devel…            58.8             279            74    0.01
## # ℹ 15 more variables: percentage_expenditure <dbl>, hepatitis_b <dbl>,
## #   measles <dbl>, bmi <dbl>, under_five_deaths <dbl>, polio <dbl>,
## #   total_expenditure <dbl>, diphtheria <dbl>, hiv_aids <dbl>, gdp <dbl>,
## #   population <dbl>, thinness_1_19_years <dbl>, thinness_5_9_years <dbl>,
## #   income_composition_of_resources <dbl>, schooling <dbl>

2. Telaah Variabel (daftar variabel & tipe)

# Variable inventory: one row per column, with the first element of its class
vars_tbl <- tibble(
  variable = names(df),
  type = vapply(df, function(col) class(col)[1], character(1))
)
vars_tbl
## # A tibble: 22 × 2
##    variable               type     
##    <chr>                  <chr>    
##  1 country                character
##  2 year                   numeric  
##  3 status                 character
##  4 life_expectancy        numeric  
##  5 adult_mortality        numeric  
##  6 infant_deaths          numeric  
##  7 alcohol                numeric  
##  8 percentage_expenditure numeric  
##  9 hepatitis_b            numeric  
## 10 measles                numeric  
## # ℹ 12 more rows

Tambahkan catatan singkat untuk variabel kategorik (contoh: status, country) dan numeric.


3. Bersihkan dan persiapan target 3 kelas

Keputusan: membagi life_expectancy menjadi 3 kelas berdasarkan tertiles (quantile 33.33% & 66.67%).

# Guard: the target column must exist under its cleaned name
# (make_clean_names usually yields `life_expectancy`)
if (!("life_expectancy" %in% names(df))) {
  stop("Kolom 'life_expectancy' tidak ditemukan. Cek nama kolom.")
}

# Tertile cut points (min, 1/3, 2/3, max), ignoring missing values
q <- quantile(df$life_expectancy, probs = (0:3) / 3, na.rm = TRUE)
q
##        0% 33.33333% 66.66667%      100% 
##      36.3      66.6      74.4      89.0
# Bin the target into a three-level factor using the tertile cut points;
# include.lowest keeps the minimum value inside the first bin
df$life_cat <- cut(
  df$life_expectancy,
  breaks = q,
  labels = c("Low", "Medium", "High"),
  include.lowest = TRUE
)

# Class counts; NA is listed when some rows have no target value
table(df$life_cat, useNA = "ifany")
## 
##    Low Medium   High   <NA> 
##    984    978    966     10

4. Statistik Deskriptif & Missing Value

4.1 Statistik deskriptif numerik

# Keep the numeric columns only
num_vars <- select(df, where(is.numeric))
# Basic per-column summary
summary(num_vars)
##       year      life_expectancy adult_mortality infant_deaths   
##  Min.   :2000   Min.   :36.30   Min.   :  1.0   Min.   :   0.0  
##  1st Qu.:2004   1st Qu.:63.10   1st Qu.: 74.0   1st Qu.:   0.0  
##  Median :2008   Median :72.10   Median :144.0   Median :   3.0  
##  Mean   :2008   Mean   :69.22   Mean   :164.8   Mean   :  30.3  
##  3rd Qu.:2012   3rd Qu.:75.70   3rd Qu.:228.0   3rd Qu.:  22.0  
##  Max.   :2015   Max.   :89.00   Max.   :723.0   Max.   :1800.0  
##                 NA's   :10      NA's   :10                      
##     alcohol        percentage_expenditure  hepatitis_b       measles        
##  Min.   : 0.0100   Min.   :    0.000      Min.   : 1.00   Min.   :     0.0  
##  1st Qu.: 0.8775   1st Qu.:    4.685      1st Qu.:77.00   1st Qu.:     0.0  
##  Median : 3.7550   Median :   64.913      Median :92.00   Median :    17.0  
##  Mean   : 4.6029   Mean   :  738.251      Mean   :80.94   Mean   :  2419.6  
##  3rd Qu.: 7.7025   3rd Qu.:  441.534      3rd Qu.:97.00   3rd Qu.:   360.2  
##  Max.   :17.8700   Max.   :19479.912      Max.   :99.00   Max.   :212183.0  
##  NA's   :194                              NA's   :553                       
##       bmi        under_five_deaths     polio       total_expenditure
##  Min.   : 1.00   Min.   :   0.00   Min.   : 3.00   Min.   : 0.370   
##  1st Qu.:19.30   1st Qu.:   0.00   1st Qu.:78.00   1st Qu.: 4.260   
##  Median :43.50   Median :   4.00   Median :93.00   Median : 5.755   
##  Mean   :38.32   Mean   :  42.04   Mean   :82.55   Mean   : 5.938   
##  3rd Qu.:56.20   3rd Qu.:  28.00   3rd Qu.:97.00   3rd Qu.: 7.492   
##  Max.   :87.30   Max.   :2500.00   Max.   :99.00   Max.   :17.600   
##  NA's   :34                        NA's   :19      NA's   :226      
##    diphtheria       hiv_aids           gdp              population       
##  Min.   : 2.00   Min.   : 0.100   Min.   :     1.68   Min.   :3.400e+01  
##  1st Qu.:78.00   1st Qu.: 0.100   1st Qu.:   463.94   1st Qu.:1.958e+05  
##  Median :93.00   Median : 0.100   Median :  1766.95   Median :1.387e+06  
##  Mean   :82.32   Mean   : 1.742   Mean   :  7483.16   Mean   :1.275e+07  
##  3rd Qu.:97.00   3rd Qu.: 0.800   3rd Qu.:  5910.81   3rd Qu.:7.420e+06  
##  Max.   :99.00   Max.   :50.600   Max.   :119172.74   Max.   :1.294e+09  
##  NA's   :19                       NA's   :448         NA's   :652        
##  thinness_1_19_years thinness_5_9_years income_composition_of_resources
##  Min.   : 0.10       Min.   : 0.10      Min.   :0.0000                 
##  1st Qu.: 1.60       1st Qu.: 1.50      1st Qu.:0.4930                 
##  Median : 3.30       Median : 3.30      Median :0.6770                 
##  Mean   : 4.84       Mean   : 4.87      Mean   :0.6276                 
##  3rd Qu.: 7.20       3rd Qu.: 7.20      3rd Qu.:0.7790                 
##  Max.   :27.70       Max.   :28.60      Max.   :0.9480                 
##  NA's   :34          NA's   :34         NA's   :167                    
##    schooling    
##  Min.   : 0.00  
##  1st Qu.:10.10  
##  Median :12.30  
##  Mean   :11.99  
##  3rd Qu.:14.30  
##  Max.   :20.70  
##  NA's   :163
# More complete descriptive statistics for the numeric columns
psych::describe(num_vars)
##                                 vars    n        mean          sd     median
## year                               1 2938     2007.52        4.61    2008.00
## life_expectancy                    2 2928       69.22        9.52      72.10
## adult_mortality                    3 2928      164.80      124.29     144.00
## infant_deaths                      4 2938       30.30      117.93       3.00
## alcohol                            5 2744        4.60        4.05       3.76
## percentage_expenditure             6 2938      738.25     1987.91      64.91
## hepatitis_b                        7 2385       80.94       25.07      92.00
## measles                            8 2938     2419.59    11467.27      17.00
## bmi                                9 2904       38.32       20.04      43.50
## under_five_deaths                 10 2938       42.04      160.45       4.00
## polio                             11 2919       82.55       23.43      93.00
## total_expenditure                 12 2712        5.94        2.50       5.76
## diphtheria                        13 2919       82.32       23.72      93.00
## hiv_aids                          14 2938        1.74        5.08       0.10
## gdp                               15 2490     7483.16    14270.17    1766.95
## population                        16 2286 12753375.12 61012096.51 1386542.00
## thinness_1_19_years               17 2904        4.84        4.42       3.30
## thinness_5_9_years                18 2904        4.87        4.51       3.30
## income_composition_of_resources   19 2771        0.63        0.21       0.68
## schooling                         20 2775       11.99        3.36      12.30
##                                    trimmed        mad     min          max
## year                               2007.52       5.93 2000.00 2.015000e+03
## life_expectancy                      69.91       8.60   36.30 8.900000e+01
## adult_mortality                     150.51     112.68    1.00 7.230000e+02
## infant_deaths                        10.20       4.45    0.00 1.800000e+03
## alcohol                               4.23       4.81    0.01 1.787000e+01
## percentage_expenditure              230.74      96.24    0.00 1.947991e+04
## hepatitis_b                          86.89       8.90    1.00 9.900000e+01
## measles                             286.08      25.20    0.00 2.121830e+05
## bmi                                  39.05      24.17    1.00 8.730000e+01
## under_five_deaths                    14.15       5.93    0.00 2.500000e+03
## polio                                88.05       8.90    3.00 9.900000e+01
## total_expenditure                     5.85       2.36    0.37 1.760000e+01
## diphtheria                           87.99       8.90    2.00 9.900000e+01
## hiv_aids                              0.54       0.00    0.10 5.060000e+01
## gdp                                3751.73    2360.98    1.68 1.191727e+05
## population                      3953693.58 2012347.06   34.00 1.293859e+09
## thinness_1_19_years                   4.14       3.41    0.10 2.770000e+01
## thinness_5_9_years                    4.15       3.41    0.10 2.860000e+01
## income_composition_of_resources       0.65       0.19    0.00 9.500000e-01
## schooling                            12.17       3.11    0.00 2.070000e+01
##                                        range  skew kurtosis         se
## year                            1.500000e+01 -0.01    -1.21       0.09
## life_expectancy                 5.270000e+01 -0.64    -0.24       0.18
## adult_mortality                 7.220000e+02  1.17     1.74       2.30
## infant_deaths                   1.800000e+03  9.78   115.76       2.18
## alcohol                         1.786000e+01  0.59    -0.81       0.08
## percentage_expenditure          1.947991e+04  4.65    26.51      36.68
## hepatitis_b                     9.800000e+01 -1.93     2.76       0.51
## measles                         2.121830e+05  9.43   114.58     211.56
## bmi                             8.630000e+01 -0.22    -1.29       0.37
## under_five_deaths               2.500000e+03  9.49   109.49       2.96
## polio                           9.600000e+01 -2.10     3.76       0.43
## total_expenditure               1.723000e+01  0.62     1.15       0.05
## diphtheria                      9.700000e+01 -2.07     3.55       0.44
## hiv_aids                        5.050000e+01  5.39    34.80       0.09
## gdp                             1.191711e+05  3.20    12.29     285.98
## population                      1.293859e+09 15.90   297.09 1276079.80
## thinness_1_19_years             2.760000e+01  1.71     3.96       0.08
## thinness_5_9_years              2.850000e+01  1.78     4.34       0.08
## income_composition_of_resources 9.500000e-01 -1.14     1.38       0.00
## schooling                       2.070000e+01 -0.60     0.88       0.06

4.2 Missing value

# Percentage of missing values per column, sorted from most to least missing
miss_pct <- 100 * vapply(df, function(col) mean(is.na(col)), numeric(1))
miss_tbl <- arrange(
  tibble(variable = names(miss_pct), missing_pct = miss_pct),
  desc(missing_pct)
)
miss_tbl
## # A tibble: 23 × 2
##    variable                        missing_pct
##    <chr>                                 <dbl>
##  1 population                            22.2 
##  2 hepatitis_b                           18.8 
##  3 gdp                                   15.2 
##  4 total_expenditure                      7.69
##  5 alcohol                                6.60
##  6 income_composition_of_resources        5.68
##  7 schooling                              5.55
##  8 bmi                                    1.16
##  9 thinness_1_19_years                    1.16
## 10 thinness_5_9_years                     1.16
## # ℹ 13 more rows
# Visualise the missing values of the whole data frame
naniar::vis_miss(df)


5. Karakteristik Data

5.1 Distribusi target & life_expectancy

# Histogram of life_expectancy; dashed lines mark the two inner tertile
# cut points stored in q
ggplot(df, aes(x = life_expectancy)) +
  geom_histogram(bins = 30) +
  geom_vline(xintercept = unname(q[2:3]), linetype = "dashed") +
  labs(
    title = "Distribusi Life Expectancy & batas tertiles",
    x = "Life expectancy",
    y = "Count"
  )

# Bar chart of the number of rows per class
ggplot(df, aes(x = life_cat)) +
  geom_bar() +
  labs(title = "Jumlah per kelas life_cat")

5.2 Korelasi numerik

# Numeric columns for the correlation matrix; `year` is dropped (optional,
# drop it when it is not relevant)
num_for_corr <- select(num_vars, -year)

# Discard columns that contain no observed value at all
num_for_corr <- num_for_corr[
  ,
  vapply(num_for_corr, function(col) any(!is.na(col)), logical(1))
]

# Pairwise-complete correlations, so each pair uses every row where both
# variables are observed
cors <- cor(num_for_corr, use = "pairwise.complete.obs")
round(cors[1:10, 1:10], 2) # show only part of the matrix when there are many columns
##                        life_expectancy adult_mortality infant_deaths alcohol
## life_expectancy                   1.00           -0.70         -0.20    0.40
## adult_mortality                  -0.70            1.00          0.08   -0.20
## infant_deaths                    -0.20            0.08          1.00   -0.12
## alcohol                           0.40           -0.20         -0.12    1.00
## percentage_expenditure            0.38           -0.24         -0.09    0.34
## hepatitis_b                       0.26           -0.16         -0.22    0.09
## measles                          -0.16            0.03          0.50   -0.05
## bmi                               0.57           -0.39         -0.23    0.33
## under_five_deaths                -0.22            0.09          1.00   -0.11
## polio                             0.47           -0.27         -0.17    0.22
##                        percentage_expenditure hepatitis_b measles   bmi
## life_expectancy                          0.38        0.26   -0.16  0.57
## adult_mortality                         -0.24       -0.16    0.03 -0.39
## infant_deaths                           -0.09       -0.22    0.50 -0.23
## alcohol                                  0.34        0.09   -0.05  0.33
## percentage_expenditure                   1.00        0.02   -0.06  0.23
## hepatitis_b                              0.02        1.00   -0.12  0.15
## measles                                 -0.06       -0.12    1.00 -0.18
## bmi                                      0.23        0.15   -0.18  1.00
## under_five_deaths                       -0.09       -0.23    0.51 -0.24
## polio                                    0.15        0.49   -0.14  0.28
##                        under_five_deaths polio
## life_expectancy                    -0.22  0.47
## adult_mortality                     0.09 -0.27
## infant_deaths                       1.00 -0.17
## alcohol                            -0.11  0.22
## percentage_expenditure             -0.09  0.15
## hepatitis_b                        -0.23  0.49
## measles                             0.51 -0.14
## bmi                                -0.24  0.28
## under_five_deaths                   1.00 -0.19
## polio                              -0.19  1.00
# Correlation heatmap
corrplot::corrplot(
  cors,
  method = "color",
  tl.cex = 0.7,
  number.cex = 0.6
)

5.3 Outlier (boxplot contoh beberapa fitur penting)

# A handful of key features, restricted to those actually present in df
features <- intersect(
  c(
    "adult_mortality", "infant_deaths", "alcohol",
    "percentage_expenditure", "gdp", "schooling"
  ),
  names(df)
)

library(tidyr)
# Long format: one row per (feature, value) pair, ready for faceless boxplots
long <- pivot_longer(
  select(df, all_of(features)),
  cols = everything(),
  names_to = "feature",
  values_to = "value"
)

ggplot(long, aes(x = feature, y = value)) +
  geom_boxplot() +
  coord_flip() +
  labs(title = "Boxplot beberapa fitur")


6. Cek Multikolinearitas (VIF)

Untuk menghitung VIF, kita butuh data numerik lengkap — saya lakukan imputasi sederhana (median) sebelum fitting model linier sementara.

# Numeric data for the VIF check. Categorical columns are excluded;
# life_expectancy stays because it is the response of the temporary model.
num_for_vif <- select(df, where(is.numeric))
# Discard columns that are entirely NA
num_for_vif <- num_for_vif[
  ,
  vapply(num_for_vif, function(col) any(!is.na(col)), logical(1))
]

# Simple median imputation through caret
pre <- preProcess(num_for_vif, method = "medianImpute")
num_imp <- predict(pre, num_for_vif)

# Confirm that no NA is left
sum(is.na(num_imp))
## [1] 0
# Temporary linear model: life_expectancy regressed on every other numeric
# column, used only to obtain the variance inflation factors
predictors <- setdiff(names(num_imp), "life_expectancy")
formula_vif <- reformulate(predictors, response = "life_expectancy")

lm_vif <- lm(formula_vif, data = num_imp)
vif_vals <- car::vif(lm_vif)

# Sort from highest to lowest VIF and display
vif_tbl <- arrange(
  tibble(variable = names(vif_vals), VIF = as.numeric(vif_vals)),
  desc(VIF)
)
vif_tbl
## # A tibble: 19 × 2
##    variable                           VIF
##    <chr>                            <dbl>
##  1 infant_deaths                   177.  
##  2 under_five_deaths               176.  
##  3 thinness_5_9_years                8.87
##  4 thinness_1_19_years               8.78
##  5 gdp                               6.01
##  6 percentage_expenditure            5.79
##  7 schooling                         3.32
##  8 income_composition_of_resources   3.04
##  9 diphtheria                        2.16
## 10 polio                             1.94
## 11 adult_mortality                   1.73
## 12 bmi                               1.72
## 13 alcohol                           1.65
## 14 population                        1.49
## 15 hiv_aids                          1.44
## 16 measles                           1.38
## 17 hepatitis_b                       1.31
## 18 total_expenditure                 1.20
## 19 year                              1.15
# Rule of thumb: flag VIF > 10 as high and VIF > 5 as worth a closer look
vif_tbl %>%
  mutate(
    flag = case_when(
      VIF > 10 ~ "High (>10)",
      VIF > 5 ~ "Moderate (>5)",
      TRUE ~ "OK"
    )
  )
## # A tibble: 19 × 3
##    variable                           VIF flag         
##    <chr>                            <dbl> <chr>        
##  1 infant_deaths                   177.   High (>10)   
##  2 under_five_deaths               176.   High (>10)   
##  3 thinness_5_9_years                8.87 Moderate (>5)
##  4 thinness_1_19_years               8.78 Moderate (>5)
##  5 gdp                               6.01 Moderate (>5)
##  6 percentage_expenditure            5.79 Moderate (>5)
##  7 schooling                         3.32 OK           
##  8 income_composition_of_resources   3.04 OK           
##  9 diphtheria                        2.16 OK           
## 10 polio                             1.94 OK           
## 11 adult_mortality                   1.73 OK           
## 12 bmi                               1.72 OK           
## 13 alcohol                           1.65 OK           
## 14 population                        1.49 OK           
## 15 hiv_aids                          1.44 OK           
## 16 measles                           1.38 OK           
## 17 hepatitis_b                       1.31 OK           
## 18 total_expenditure                 1.20 OK           
## 19 year                              1.15 OK

Catatan: VIF dihitung dari model linier sementara. Jika terdapat banyak variabel kategorik atau hubungan non-linear, interpretasinya harus dilakukan dengan hati-hati.

7. Masuk ke pemodelan (Random Forest)

# Create the classification target ----

# Reuse an existing `lifeclass` column (coerced to factor) when there is one;
# otherwise derive it from life_expectancy with a tertile cut.
if ("lifeclass" %in% names(df)) {
  df$lifeclass <- as.factor(df$lifeclass)
} else {
  q <- quantile(df$life_expectancy, probs = (0:3) / 3, na.rm = TRUE)
  df$lifeclass <- cut(
    df$life_expectancy,
    breaks = q,
    labels = c("Low", "Medium", "High"),
    include.lowest = TRUE
  )
}
table(df$lifeclass)
## 
##    Low Medium   High 
##    984    978    966

8. Menyusun dataset pemodelan

# Modelling data set. Besides `country`, two columns must be removed because
# they encode the label itself:
#   - life_expectancy: the numeric variable `lifeclass` is cut from
#   - life_cat: the tertile binning of life_expectancy created earlier, i.e.
#     the same information as `lifeclass`
# Leaving them in is target leakage: any classifier can read the class
# straight off these columns, so its accuracy says nothing about the
# remaining predictors.
df_model <- df %>%
  select(-any_of(c("country", "life_expectancy", "life_cat")))
df_model$lifeclass <- as.factor(df_model$lifeclass)

# Train/test split and imputation ----

# Most frequent non-missing value of `x` (the first one encountered wins
# ties). Returns NA when `x` has no observed value, so callers never receive
# a zero-length replacement.
get_mode <- function(x) {
  ux <- unique(x[!is.na(x)])
  if (length(ux) == 0L) {
    return(NA)
  }
  ux[which.max(tabulate(match(x, ux)))]
}

# Rows without a label cannot be used for supervised learning. Dropping them
# before the split also guarantees that no label is ever filled in by the
# imputation steps below.
df_model <- df_model[!is.na(df_model$lifeclass), ]

set.seed(123)
train_index <- createDataPartition(df_model$lifeclass, p = 0.8, list = FALSE)
train <- df_model[train_index, ]
test <- df_model[-train_index, ]

num_cols <- names(train)[vapply(train, is.numeric, logical(1))]
# Categorical predictors only: the target is deliberately kept out of the
# set of columns that get mode-imputed.
cat_cols <- setdiff(names(train), c(num_cols, "lifeclass"))

# Medians are estimated on the training rows and then applied to both sets
pre_num <- preProcess(train[, num_cols], method = c("medianImpute"))
train_num_imp <- predict(pre_num, train[, num_cols])
test_num_imp <- predict(pre_num, test[, num_cols])

train_cat_imp <- train[, cat_cols, drop = FALSE]
test_cat_imp <- test[, cat_cols, drop = FALSE]

# Fill missing categorical values with the training-set mode of each column
for (col in names(train_cat_imp)) {
  mode_val <- get_mode(train_cat_imp[[col]])
  train_cat_imp[[col]][is.na(train_cat_imp[[col]])] <- mode_val
  test_cat_imp[[col]][is.na(test_cat_imp[[col]])] <- mode_val
}

# Reassemble: imputed numeric predictors, imputed categorical predictors, and
# the untouched target as the last column
train_imp <- bind_cols(train_num_imp, train_cat_imp, train["lifeclass"])
test_imp <- bind_cols(test_num_imp, test_cat_imp, test["lifeclass"])

train_imp$lifeclass <- as.factor(train_imp$lifeclass)
test_imp$lifeclass <- as.factor(test_imp$lifeclass)

# 10. Random forest ----

# Every column of train_imp except the target is used as a predictor
predictors <- setdiff(names(train_imp), "lifeclass")
formula_rf <- reformulate(predictors, response = "lifeclass")

# Fixed seed so the forest is reproducible; mtry is the floor of the square
# root of the number of predictors
set.seed(123)
rf_model <- randomForest(
  formula_rf,
  data = train_imp,
  ntree = 500,
  mtry = floor(sqrt(length(predictors))),
  importance = TRUE
)
rf_model
## 
## Call:
##  randomForest(formula = formula_rf, data = train_imp, ntree = 500,      mtry = floor(sqrt(length(predictors))), importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 4
## 
##         OOB estimate of  error rate: 0%
## Confusion matrix:
##        Low Medium High class.error
## Low    796      0    0           0
## Medium   0    783    0           0
## High     0      0  773           0
# Predict the held-out rows and compare the predictions with the true classes
y_pred_rf <- predict(rf_model, newdata = test_imp)
cm_rf <- confusionMatrix(data = y_pred_rf, reference = test_imp$lifeclass)
cm_rf
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Medium High
##     Low    198      0    0
##     Medium   0    195    0
##     High     0      0  193
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9937, 1)
##     No Information Rate : 0.3379     
##     P-Value [Acc > NIR] : < 2.2e-16  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: Low Class: Medium Class: High
## Sensitivity              1.0000        1.0000      1.0000
## Specificity              1.0000        1.0000      1.0000
## Pos Pred Value           1.0000        1.0000      1.0000
## Neg Pred Value           1.0000        1.0000      1.0000
## Prevalence               0.3379        0.3328      0.3294
## Detection Rate           0.3379        0.3328      0.3294
## Detection Prevalence     0.3379        0.3328      0.3294
## Balanced Accuracy        1.0000        1.0000      1.0000

# 11. SVM ----

# Centering and scaling parameters are estimated on the training rows only
num_cols <- names(train_imp)[vapply(train_imp, is.numeric, logical(1))]
pre_svm <- preProcess(train_imp[, num_cols], method = c("center", "scale"))

# Work on copies so the imputed data sets stay unscaled
train_svm <- train_imp
test_svm <- test_imp
train_svm[, num_cols] <- predict(pre_svm, train_imp[, num_cols])
test_svm[, num_cols] <- predict(pre_svm, test_imp[, num_cols])

# Radial-kernel SVM on all remaining columns, then evaluate on the test rows
svm_model <- svm(lifeclass ~ ., data = train_svm, kernel = "radial")
y_pred_svm <- predict(svm_model, newdata = test_svm)
cm_svm <- confusionMatrix(data = y_pred_svm, reference = test_svm$lifeclass)
cm_svm
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Low Medium High
##     Low    198      2    0
##     Medium   0    193    1
##     High     0      0  192
## 
## Overall Statistics
##                                           
##                Accuracy : 0.9949          
##                  95% CI : (0.9851, 0.9989)
##     No Information Rate : 0.3379          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9923          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Low Class: Medium Class: High
## Sensitivity              1.0000        0.9897      0.9948
## Specificity              0.9948        0.9974      1.0000
## Pos Pred Value           0.9900        0.9948      1.0000
## Neg Pred Value           1.0000        0.9949      0.9975
## Prevalence               0.3379        0.3328      0.3294
## Detection Rate           0.3379        0.3294      0.3276
## Detection Prevalence     0.3413        0.3311      0.3276
## Balanced Accuracy        0.9974        0.9936      0.9974

# 12. Model comparison ----

# Overall accuracy taken from each confusion matrix
acc_rf <- cm_rf$overall["Accuracy"]
acc_svm <- cm_svm$overall["Accuracy"]

# Macro F1: unweighted mean of the per-class F1 scores
f1_rf <- mean(cm_rf$byClass[, "F1"], na.rm = TRUE)
f1_svm <- mean(cm_svm$byClass[, "F1"], na.rm = TRUE)

# Side-by-side summary, one row per model
data.frame(
  Model = c("Random Forest", "SVM RBF"),
  Accuracy = c(acc_rf, acc_svm),
  F1_Macro = c(f1_rf, f1_svm)
)
##           Model  Accuracy  F1_Macro
## 1 Random Forest 1.0000000 1.0000000
## 2       SVM RBF 0.9948805 0.9948885