Link RPubs: https://rpubs.com/TyoUnesa1/MultinomialLogisticRegressionTanpaPCA

1 Load Library dan Data

Dataset yang digunakan dalam proyek ini berasal dari Kaggle dengan judul “Denpasar Bali Historical Weather Data”, yang berisi data cuaca historis Kota Denpasar dari tahun 1990 hingga 2020. Namun, untuk kebutuhan proyek ini, fokus klasifikasi diarahkan pada empat kategori utama yang relevan dengan kondisi cuaca Denpasar tahun 2019, yaitu Clouds, Rain, Thunderstorm, dan Clear.

library(readr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
data_path <- "openweatherdata-denpasar-1990-2020.csv"
data <- read_csv(data_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 264924 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): dt_iso, city_name, weather_main, weather_description, weather_icon
## dbl (17): dt, timezone, lat, lon, temp, temp_min, temp_max, pressure, humidi...
## lgl (10): sea_level, grnd_level, rain_12h, rain_today, snow_1h, snow_3h, sno...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$dt_iso <- gsub(" UTC", "", data$dt_iso)
data$dt_iso <- as_datetime(data$dt_iso)
data_filtered <- data %>% filter(year(dt_iso) == 2019)

if (nrow(data_filtered) == 0) stop("Tidak ada data untuk tahun 2019!")

head(data_filtered)
## # A tibble: 6 × 32
##           dt dt_iso              timezone city_name   lat   lon  temp temp_min
##        <dbl> <dttm>                 <dbl> <chr>     <dbl> <dbl> <dbl>    <dbl>
## 1 1546300800 2019-01-01 00:00:00    28800 Denpasar  -8.65  115.  27.1     27  
## 2 1546304400 2019-01-01 01:00:00    28800 Denpasar  -8.65  115.  27.8     27.4
## 3 1546308000 2019-01-01 02:00:00    28800 Denpasar  -8.65  115.  29.5     29  
## 4 1546311600 2019-01-01 03:00:00    28800 Denpasar  -8.65  115.  30.1     30  
## 5 1546315200 2019-01-01 04:00:00    28800 Denpasar  -8.65  115.  30.2     30  
## 6 1546318800 2019-01-01 05:00:00    28800 Denpasar  -8.65  115.  30       30  
## # ℹ 24 more variables: temp_max <dbl>, pressure <dbl>, sea_level <lgl>,
## #   grnd_level <lgl>, humidity <dbl>, wind_speed <dbl>, wind_deg <dbl>,
## #   rain_1h <dbl>, rain_3h <dbl>, rain_6h <dbl>, rain_12h <lgl>,
## #   rain_24h <dbl>, rain_today <lgl>, snow_1h <lgl>, snow_3h <lgl>,
## #   snow_6h <lgl>, snow_12h <lgl>, snow_24h <lgl>, snow_today <lgl>,
## #   clouds_all <dbl>, weather_id <dbl>, weather_main <chr>,
## #   weather_description <chr>, weather_icon <chr>

2 EXPLORATORY DATA ANALYSIS (EDA)

cat("Jumlah baris:", nrow(data_filtered), "\n")
## Jumlah baris: 8822
cat("Jumlah kolom:", ncol(data_filtered), "\n")
## Jumlah kolom: 32
na_summary <- data.frame(Kolom = names(data_filtered),
                         Jumlah_NA = colSums(is.na(data_filtered))) %>%
  arrange(desc(Jumlah_NA))
na_summary
##                                   Kolom Jumlah_NA
## sea_level                     sea_level      8822
## grnd_level                   grnd_level      8822
## rain_12h                       rain_12h      8822
## rain_today                   rain_today      8822
## snow_1h                         snow_1h      8822
## snow_3h                         snow_3h      8822
## snow_6h                         snow_6h      8822
## snow_12h                       snow_12h      8822
## snow_24h                       snow_24h      8822
## snow_today                   snow_today      8822
## rain_1h                         rain_1h      8813
## rain_24h                       rain_24h      8543
## rain_6h                         rain_6h      8541
## rain_3h                         rain_3h      8275
## dt                                   dt         0
## dt_iso                           dt_iso         0
## timezone                       timezone         0
## city_name                     city_name         0
## lat                                 lat         0
## lon                                 lon         0
## temp                               temp         0
## temp_min                       temp_min         0
## temp_max                       temp_max         0
## pressure                       pressure         0
## humidity                       humidity         0
## wind_speed                   wind_speed         0
## wind_deg                       wind_deg         0
## clouds_all                   clouds_all         0
## weather_id                   weather_id         0
## weather_main               weather_main         0
## weather_description weather_description         0
## weather_icon               weather_icon         0
freq_weather <- table(data_filtered$weather_main, useNA = "ifany")
weather_summary <- as.data.frame(freq_weather)
colnames(weather_summary) <- c("Weather Main", "Jumlah")
weather_summary
##   Weather Main Jumlah
## 1        Clear     22
## 2       Clouds   8075
## 3         Haze      5
## 4         Mist      3
## 5         Rain    571
## 6        Smoke      3
## 7 Thunderstorm    143

3 Preprocessing

3.1 Filter Kolom Target

data_filtered <- data_filtered %>%
  select(temp, temp_min, temp_max, pressure, humidity,
         wind_speed, wind_deg, clouds_all, weather_main) %>%
  filter(weather_main %in% c("Clouds", "Rain", "Thunderstorm", "Clear"))

if (length(unique(data_filtered$weather_main)) < 2) {
  stop("Kolom weather_main hanya memiliki satu kategori setelah filter. Tidak bisa dilakukan split!")
}

data_filtered$weather_main <- as.factor(data_filtered$weather_main)
data_filtered$weather_main <- as.integer(data_filtered$weather_main) - 1

is.ordered(data_filtered$weather_main)
## [1] FALSE
str(data_filtered)
## tibble [8,811 × 9] (S3: tbl_df/tbl/data.frame)
##  $ temp        : num [1:8811] 27.1 27.8 29.5 30.1 30.1 ...
##  $ temp_min    : num [1:8811] 27 27.4 29 30 30 30 30 29 29 29 ...
##  $ temp_max    : num [1:8811] 27.4 28 30.4 30.4 30.4 30 30 30 29.4 29.4 ...
##  $ pressure    : num [1:8811] 1010 1010 1010 1010 1009 ...
##  $ humidity    : num [1:8811] 94 88 83 79 79 79 74 79 83 79 ...
##  $ wind_speed  : num [1:8811] 0.5 2.1 3.1 4.1 4.1 5.1 4.6 5.7 5.7 5.7 ...
##  $ wind_deg    : num [1:8811] 0 240 240 260 260 250 260 260 260 270 ...
##  $ clouds_all  : num [1:8811] 40 20 20 20 20 20 20 20 20 20 ...
##  $ weather_main: num [1:8811] 2 2 1 1 1 1 1 1 1 1 ...
unique(data_filtered$weather_main)
## [1] 2 1 0 3

3.2 Penanganan Missing Values

library(dplyr)

data_filtered <- data_filtered %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))

cat("Ringkasan data setelah penanganan missing values:\n")
## Ringkasan data setelah penanganan missing values:
na_summary_after <- data.frame(
  Kolom = names(data_filtered),
  Jumlah_NA = colSums(is.na(data_filtered))
)
print(na_summary_after)
##                     Kolom Jumlah_NA
## temp                 temp         0
## temp_min         temp_min         0
## temp_max         temp_max         0
## pressure         pressure         0
## humidity         humidity         0
## wind_speed     wind_speed         0
## wind_deg         wind_deg         0
## clouds_all     clouds_all         0
## weather_main weather_main         0

3.3 Handling Missing Value Kolom Numerik

data_filtered <- data_filtered %>%
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))

# Cek ulang jumlah NA setelah imputasi
cat("Jumlah nilai NA setelah imputasi (dengan mean):\n")
## Jumlah nilai NA setelah imputasi (dengan mean):
print(colSums(is.na(data_filtered)))
##         temp     temp_min     temp_max     pressure     humidity   wind_speed 
##            0            0            0            0            0            0 
##     wind_deg   clouds_all weather_main 
##            0            0            0

3.4 Split Data

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
split_index <- createDataPartition(data_filtered$weather_main, p = 0.7, list = FALSE)
train_data <- data_filtered[split_index, ]
test_data <- data_filtered[-split_index, ]

3.5 Penyeimbangan Data

Dilihat berdasarkan distribusi pada kolom target, karena terdapat ketidakseimbangan data maka dilakukan penanganan imbalaced data. pengananan ini dilakukan pada data train.

if (!require(smotefamily)) install.packages("smotefamily")
## Loading required package: smotefamily
## Warning: package 'smotefamily' was built under R version 4.4.3
library(smotefamily)

set.seed(123)
n_total_train <- nrow(train_data)
n_target_final <- n_total_train * 2
n_classes <- length(levels(train_data$weather_main))
n_per_class <- floor(n_target_final / n_classes)

list_per_class <- split(train_data, train_data$weather_main)
balanced_list <- list()

for (cls in names(list_per_class)) {
  data_cls <- list_per_class[[cls]]
  n_cls <- nrow(data_cls)
  balanced_list[[cls]] <- if (n_cls > n_per_class) {
    slice_sample(data_cls, n = n_per_class)
  } else {
    slice_sample(data_cls, n = n_per_class, replace = TRUE)
  }
}
train_balanced <- bind_rows(balanced_list)

cat("Distribusi kelas setelah balancing di train_data:\n")
## Distribusi kelas setelah balancing di train_data:
table(train_balanced$weather_main)
## 
##    0    1    2    3 
## 3085 3085 3085 3085
cat("Total data train setelah balancing:", nrow(train_balanced), "\n")
## Total data train setelah balancing: 12340
cat("Distribusi kelas di test_data (tidak diubah):\n")
## Distribusi kelas di test_data (tidak diubah):
table(test_data$weather_main)
## 
##    0    1    2    3 
##    6 2422  171   42

4 Uji Asumsi (VIF dan Chi-square)

4.1 Uji VIF

library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
temp_vars <- dplyr::select(train_balanced, temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars, scale. = TRUE)
pc_df <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc_df) <- "pca_temp"

train_balanced_pca <- train_balanced %>%
  dplyr::select(-temp, -temp_min, -temp_max) %>%
  bind_cols(pc_df)

lm_model <- lm(as.numeric(weather_main) ~ ., data = train_balanced_pca)
vif_values <- vif(lm_model)
vif_values
##   pressure   humidity wind_speed   wind_deg clouds_all   pca_temp 
##   1.298020   2.626288   1.224317   1.170867   1.406714   2.272122

4.2 Uji Chi-Square

train_balanced_pca$weather_main <- as.factor(train_balanced_pca$weather_main)
num_vars <- names(train_balanced_pca)[sapply(train_balanced_pca, is.numeric)]
chi_sq_test <- function(varname, data, target) {
  binned_var <- cut(data[[varname]], breaks = 5, include.lowest = TRUE)
  contingency_table <- table(binned_var, data[[target]])
  if (all(dim(contingency_table) > 1)) {
    chisq.test(contingency_table)$p.value
  } else NA
}

chi_results <- sapply(num_vars, chi_sq_test, data = train_balanced_pca, target = "weather_main")
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
chi_df <- data.frame(Variable = names(chi_results), P_Value = chi_results)
chi_df
##              Variable       P_Value
## pressure     pressure 3.123660e-210
## humidity     humidity  0.000000e+00
## wind_speed wind_speed  0.000000e+00
## wind_deg     wind_deg  0.000000e+00
## clouds_all clouds_all  0.000000e+00
## pca_temp     pca_temp 1.125748e-121

5 Normalisasi Data

feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")
min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))

train_balanced_norm <- train_balanced
train_balanced_norm[feature_cols] <- lapply(train_balanced[feature_cols], min_max_norm)

test_data_norm <- test_data
for (col in feature_cols) {
  min_val <- min(train_balanced[[col]])
  max_val <- max(train_balanced[[col]])
  test_data_norm[[col]] <- (test_data[[col]] - min_val) / (max_val - min_val)
}

summary(train_balanced_norm)
##       temp           temp_min         temp_max         pressure     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3309   1st Qu.:0.3929   1st Qu.:0.3226   1st Qu.:0.3333  
##  Median :0.4214   Median :0.4857   Median :0.4113   Median :0.4167  
##  Mean   :0.4397   Mean   :0.4932   Mean   :0.4406   Mean   :0.4280  
##  3rd Qu.:0.5276   3rd Qu.:0.5714   3rd Qu.:0.5403   3rd Qu.:0.5833  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     humidity        wind_speed        wind_deg        clouds_all    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5417   1st Qu.:0.1253   1st Qu.:0.3056   1st Qu.:0.0800  
##  Median :0.6458   Median :0.2365   Median :0.3833   Median :0.4000  
##  Mean   :0.6529   Mean   :0.2523   Mean   :0.4816   Mean   :0.3501  
##  3rd Qu.:0.8750   3rd Qu.:0.3477   3rd Qu.:0.7500   3rd Qu.:0.6000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  weather_main
##  0:3085      
##  1:3085      
##  2:3085      
##  3:3085      
##              
## 
summary(test_data_norm)
##       temp             temp_min         temp_max           pressure     
##  Min.   :-0.07819   Min.   :0.0500   Min.   :-0.08065   Min.   :0.0000  
##  1st Qu.: 0.34156   1st Qu.:0.4214   1st Qu.: 0.33065   1st Qu.:0.3333  
##  Median : 0.44527   Median :0.5000   Median : 0.47581   Median :0.5000  
##  Mean   : 0.46144   Mean   :0.5133   Mean   : 0.46195   Mean   :0.4844  
##  3rd Qu.: 0.57778   3rd Qu.:0.6143   3rd Qu.: 0.56452   3rd Qu.:0.5833  
##  Max.   : 1.01070   Max.   :1.0000   Max.   : 1.00000   Max.   :1.0000  
##     humidity        wind_speed         wind_deg        clouds_all    
##  Min.   :0.0625   Min.   :0.00593   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5208   1st Qu.:0.12528   1st Qu.:0.2778   1st Qu.:0.2000  
##  Median :0.6458   Median :0.23647   Median :0.3333   Median :0.2000  
##  Mean   :0.6328   Mean   :0.26084   Mean   :0.4247   Mean   :0.3132  
##  3rd Qu.:0.7500   3rd Qu.:0.39214   3rd Qu.:0.6944   3rd Qu.:0.4000  
##  Max.   :1.0000   Max.   :0.96294   Max.   :1.0000   Max.   :1.0000  
##  weather_main
##  0:   6      
##  1:2422      
##  2: 171      
##  3:  42      
##              
## 

6 Visualisasi Awal

6.1 Visualiasi Proporsi Kategori Cuaca Sebelum Penangan Imbalanced

library(ggplot2)
library(dplyr)
train_weather_summary_1 <- train_data %>%
  group_by(weather_main) %>%
  summarise(Jumlah = n()) %>%
  arrange(desc(Jumlah))

train_weather_summary_1$Proporsi <- train_weather_summary_1$Jumlah / sum(train_weather_summary_1$Jumlah)

ggplot(train_weather_summary_1, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  coord_flip() +
  labs(title = "Distribusi Kategori Cuaca pada Data Train (Sebelum Balanced)",
       x = "Kategori Cuaca", y = "Jumlah")

6.2 Visualiasi Proporsi Kategori Cuaca Setelah Penangan Imbalanced

library(ggplot2)
library(dplyr)
train_weather_summary <- train_balanced %>%
  group_by(weather_main) %>%
  summarise(Jumlah = n()) %>%
  arrange(desc(Jumlah))

train_weather_summary$Proporsi <- train_weather_summary$Jumlah / sum(train_weather_summary$Jumlah)

ggplot(train_weather_summary, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  coord_flip() +
  labs(title = "Distribusi Kategori Cuaca pada Data Train (Balanced)",
       x = "Kategori Cuaca", y = "Jumlah")

6.3 Visualisasi Korelasi

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
correlation_matrix <- cor(train_balanced_norm[, feature_cols])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black", 
         addCoef.col = "black", number.cex = 0.7, diag = FALSE)

7 PEMODELAN - Multinomial Logistic Regression Tanpa PCA

library(nnet)
model_mlr <- multinom(weather_main ~ ., data = train_balanced_norm, trace = FALSE)
pred_mlr <- predict(model_mlr, test_data_norm)
conf_matrix_mlr <- confusionMatrix(pred_mlr, test_data_norm$weather_main)

conf_matrix_mlr
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3
##          0    5    1    0    0
##          1    1 1885   33    8
##          2    0  253   84   11
##          3    0  283   54   23
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7562          
##                  95% CI : (0.7393, 0.7724)
##     No Information Rate : 0.9171          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2383          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity          0.833333   0.7783  0.49123 0.547619
## Specificity          0.999620   0.8082  0.89312 0.870335
## Pos Pred Value       0.833333   0.9782  0.24138 0.063889
## Neg Pred Value       0.999620   0.2479  0.96206 0.991670
## Prevalence           0.002272   0.9171  0.06475 0.015903
## Detection Rate       0.001893   0.7137  0.03181 0.008709
## Detection Prevalence 0.002272   0.7296  0.13177 0.136312
## Balanced Accuracy    0.916477   0.7933  0.69217 0.708977
cat(sprintf("Accuracy MLR: %.4f\n", conf_matrix_mlr$overall['Accuracy']))
## Accuracy MLR: 0.7562

8 Visualisasi Akhir

8.1 Confusion Matrix Heatmap

library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.4.3
conf_matrix_table <- conf_matrix_mlr$table
conf_matrix_prop <- prop.table(conf_matrix_table, margin = 1)  # per baris

pheatmap(conf_matrix_prop, 
         cluster_rows = FALSE, cluster_cols = FALSE,
         display_numbers = TRUE, number_format = "%.2f",
         main = "Confusion Matrix Heatmap (Normalized)",
         color = colorRampPalette(c("white", "blue"))(50))

8.2 Visualisasi Akurasi per Kelas

library(dplyr)
class_accuracy <- diag(conf_matrix_table) / rowSums(conf_matrix_table)
df_class_acc <- data.frame(Class = names(class_accuracy), Accuracy = class_accuracy)

library(ggplot2)
ggplot(df_class_acc, aes(x = reorder(Class, -Accuracy), y = Accuracy, fill = Accuracy)) +
  geom_col() +
  coord_flip() +
  scale_fill_gradient(low = "red", high = "green") +
  labs(title = "Akurasi per Kelas", x = "Kelas Cuaca", y = "Akurasi") +
  theme_minimal()

Grafik akurasi per kelas menunjukkan bahwa model memiliki performa sangat baik pada kelas cuaca 0 dan 1, dengan akurasi mendekati 100%. Namun, performanya menurun drastis pada kelas 2 (akurasi sekitar 25%) dan sangat buruk pada kelas 3 (kurang dari 10%). Hal ini mengindikasikan ketidakseimbangan data atau kurangnya kemampuan model dalam membedakan pola pada kelas-kelas tersebut.

8.3 Multidimensional Scaling (MDS) Plot

Visualisasi ini membantu melihat penyebaran data uji berdasarkan prediksi dan label sebenarnya.

dist_matrix <- dist(test_data_norm[, -which(names(test_data_norm) == "weather_main")])
mds <- cmdscale(dist_matrix, k = 2)

mds_df <- data.frame(Dim1 = mds[,1], Dim2 = mds[,2],
                     Actual = test_data_norm$weather_main,
                     Predicted = pred_mlr)

ggplot(mds_df, aes(x = Dim1, y = Dim2, color = Actual, shape = Predicted)) +
  geom_point(alpha = 0.6, size = 3) +
  labs(title = "MDS Plot - Actual vs Predicted",
       subtitle = "Bentuk = Prediksi, Warna = Label Asli") +
  theme_minimal()

Plot MDS tersebut menunjukkan distribusi hasil prediksi terhadap label sebenarnya, di mana bentuk menunjukkan prediksi dan warna menunjukkan label asli. Terlihat bahwa sebagian besar data diprediksi sebagai kelas 1 (segitiga), meskipun warna-warna lain khususnya biru (kelas 2) dan ungu (kelas 3) masih muncul secara signifikan. Hal ini mengindikasikan adanya kecenderungan model untuk memprediksi sebagian besar sampel sebagai kelas mayoritas (kelas 1), sehingga banyak kesalahan klasifikasi terjadi, terutama pada kelas minoritas. Visualisasi ini mendukung temuan sebelumnya bahwa model mengalami kesulitan dalam mengenali kelas dengan distribusi lebih kecil.

8.4 ROC Curve per Kelas (One-vs-Rest)

library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
classes <- levels(test_data_norm$weather_main)
pred_probs <- predict(model_mlr, test_data_norm, type = "probs")

par(mfrow = c(2, ceiling(length(classes)/2)))
for (cls in classes) {
  roc_obj <- roc(response = as.numeric(test_data_norm$weather_main == cls),
                 predictor = pred_probs[, cls])
  plot(roc_obj, main = paste("ROC Curve -", cls))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases

par(mfrow = c(1,1))

Gambar tersebut menunjukkan empat kurva ROC untuk model klasifikasi yang berbeda. Kurva ROC-0 menunjukkan performa sempurna, dengan sensitivitas dan spesifisitas maksimal. Sementara kurva ROC-1, ROC-2, dan ROC-3 menunjukkan performa yang sangat baik meskipun tidak sempurna, dengan kurva yang tetap jauh di atas garis diagonal acak. Secara keseluruhan, semua model memiliki kemampuan klasifikasi yang baik, dengan model pada ROC-0 sebagai yang paling optimal.

8.5 Top-3 Accuracy

Mengukur apakah label sebenarnya termasuk dalam 3 prediksi probabilitas tertinggi.

pred_probs <- predict(model_mlr, test_data_norm, type = "probs")

top_3_preds <- t(apply(pred_probs, 1, function(x) {
  names(sort(x, decreasing = TRUE))[1:3]
}))

actual_labels <- as.character(test_data_norm$weather_main)
correct_top3 <- mapply(function(true, preds) true %in% preds, actual_labels, split(top_3_preds, row(top_3_preds)))

top3_accuracy <- mean(correct_top3)
cat(sprintf("Top-3 Accuracy: %.4f\n", top3_accuracy))
## Top-3 Accuracy: 1.0000
top_n_hits <- apply(pred_probs, 1, function(prob_row) {
  sorted <- names(sort(prob_row, decreasing = TRUE))
  which(sorted == as.character(test_data_norm$weather_main[which.max(prob_row == prob_row)]))[1]
})

hit_table <- table(factor(pmin(top_n_hits, 3), levels = 1:3))
hit_df <- data.frame(Top_N = c("Top-1", "Top-2", "Top-3"), 
                     Count = as.numeric(hit_table),
                     Accuracy = as.numeric(hit_table) / nrow(test_data_norm))

ggplot(hit_df, aes(x = Top_N, y = Accuracy, fill = Top_N)) +
  geom_col() +
  geom_text(aes(label = sprintf("%.2f%%", Accuracy * 100)), vjust = -0.5) +
  labs(title = "Top-N Accuracy",
       y = "Accuracy", x = "Top-N") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set2")

8.6 K-Fold Cross-Validation (Manual, 5-Fold)

library(caret)

set.seed(123)
folds <- createFolds(train_balanced_norm$weather_main, k = 5)

accuracies <- c()

for (i in 1:5) {
  fold_train <- train_balanced_norm[-folds[[i]], ]
  fold_test <- train_balanced_norm[folds[[i]], ]
  
  model_fold <- multinom(weather_main ~ ., data = fold_train, trace = FALSE)
  pred_fold <- predict(model_fold, fold_test)
  
  acc <- mean(pred_fold == fold_test$weather_main)
  cat(sprintf("Fold %d Accuracy: %.4f\n", i, acc))
  accuracies <- c(accuracies, acc)
}
## Fold 1 Accuracy: 0.7071
## Fold 2 Accuracy: 0.6985
## Fold 3 Accuracy: 0.7071
## Fold 4 Accuracy: 0.6969
## Fold 5 Accuracy: 0.7095
cat(sprintf("Mean CV Accuracy (5-fold): %.4f\n", mean(accuracies)))
## Mean CV Accuracy (5-fold): 0.7038
cv_df <- data.frame(Fold = paste0("Fold-", 1:5), Accuracy = accuracies)

ggplot(cv_df, aes(x = Fold, y = Accuracy, fill = Fold)) +
  geom_col(width = 0.6) +
  geom_text(aes(label = sprintf("%.2f%%", Accuracy * 100)), vjust = -0.5) +
  ylim(0, 1) +
  labs(title = "Accuracy per Fold (5-Fold CV)",
       x = "Fold", y = "Accuracy") +
  theme_minimal() +
  scale_fill_brewer(palette = "Pastel1")

9 Refrensi:

[1] J. S. Sura, R. Panchal, and A. Lather, “Economic value-added (EVA) myths and realities: evidence from the Indian manufacturing sector,” IIM Ranchi journal of management studies, vol. 2, no. 1, 2023, doi: 10.1108/irjms-03-2022-0037.
[2] J. Cuneen and D. Tobar, “Chi-square Tests,” in Sport Industry Research and Analysis, 2021. doi: 10.4324/9781315212944-28.
[3] E. Novitasari and A. Sofro, “Analisis regresi multinomial untuk pemodelan faktor penyebab kekerasan dalam rumah tangga,” MATHunesa, vol. 11, no. 1, 2023. [Online]. Available: https://journal.unesa.ac.id/index.php/mathunesa