Link RPubs: https://rpubs.com/TyoUnesa1/MultinomialLogisticRegressionTanpaPCA
The dataset used in this project comes from Kaggle under the title "Denpasar Bali Historical Weather Data" and contains historical weather observations for the city of Denpasar from 1990 to 2020. For this project, the classification focuses on the 2019 data and on the four main categories relevant to Denpasar's weather conditions: Clouds, Rain, Thunderstorm, and Clear.
library(dplyr)      # provides %>% and filter(), used when subsetting the data below
library(readr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
data_path <- "openweatherdata-denpasar-1990-2020.csv"
data <- read_csv(data_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 264924 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): dt_iso, city_name, weather_main, weather_description, weather_icon
## dbl (17): dt, timezone, lat, lon, temp, temp_min, temp_max, pressure, humidi...
## lgl (10): sea_level, grnd_level, rain_12h, rain_today, snow_1h, snow_3h, sno...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$dt_iso <- gsub(" UTC", "", data$dt_iso)
data$dt_iso <- as_datetime(data$dt_iso)
data_filtered <- data %>% filter(year(dt_iso) == 2019)
if (nrow(data_filtered) == 0) stop("Tidak ada data untuk tahun 2019!")
head(data_filtered)
## # A tibble: 6 × 32
## dt dt_iso timezone city_name lat lon temp temp_min
## <dbl> <dttm> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1546300800 2019-01-01 00:00:00 28800 Denpasar -8.65 115. 27.1 27
## 2 1546304400 2019-01-01 01:00:00 28800 Denpasar -8.65 115. 27.8 27.4
## 3 1546308000 2019-01-01 02:00:00 28800 Denpasar -8.65 115. 29.5 29
## 4 1546311600 2019-01-01 03:00:00 28800 Denpasar -8.65 115. 30.1 30
## 5 1546315200 2019-01-01 04:00:00 28800 Denpasar -8.65 115. 30.2 30
## 6 1546318800 2019-01-01 05:00:00 28800 Denpasar -8.65 115. 30 30
## # ℹ 24 more variables: temp_max <dbl>, pressure <dbl>, sea_level <lgl>,
## # grnd_level <lgl>, humidity <dbl>, wind_speed <dbl>, wind_deg <dbl>,
## # rain_1h <dbl>, rain_3h <dbl>, rain_6h <dbl>, rain_12h <lgl>,
## # rain_24h <dbl>, rain_today <lgl>, snow_1h <lgl>, snow_3h <lgl>,
## # snow_6h <lgl>, snow_12h <lgl>, snow_24h <lgl>, snow_today <lgl>,
## # clouds_all <dbl>, weather_id <dbl>, weather_main <chr>,
## # weather_description <chr>, weather_icon <chr>
cat("Jumlah baris:", nrow(data_filtered), "\n")
## Jumlah baris: 8822
cat("Jumlah kolom:", ncol(data_filtered), "\n")
## Jumlah kolom: 32
na_summary <- data.frame(Kolom = names(data_filtered),
Jumlah_NA = colSums(is.na(data_filtered))) %>%
arrange(desc(Jumlah_NA))
na_summary
## Kolom Jumlah_NA
## sea_level sea_level 8822
## grnd_level grnd_level 8822
## rain_12h rain_12h 8822
## rain_today rain_today 8822
## snow_1h snow_1h 8822
## snow_3h snow_3h 8822
## snow_6h snow_6h 8822
## snow_12h snow_12h 8822
## snow_24h snow_24h 8822
## snow_today snow_today 8822
## rain_1h rain_1h 8813
## rain_24h rain_24h 8543
## rain_6h rain_6h 8541
## rain_3h rain_3h 8275
## dt dt 0
## dt_iso dt_iso 0
## timezone timezone 0
## city_name city_name 0
## lat lat 0
## lon lon 0
## temp temp 0
## temp_min temp_min 0
## temp_max temp_max 0
## pressure pressure 0
## humidity humidity 0
## wind_speed wind_speed 0
## wind_deg wind_deg 0
## clouds_all clouds_all 0
## weather_id weather_id 0
## weather_main weather_main 0
## weather_description weather_description 0
## weather_icon weather_icon 0
freq_weather <- table(data_filtered$weather_main, useNA = "ifany")
weather_summary <- as.data.frame(freq_weather)
colnames(weather_summary) <- c("Weather Main", "Jumlah")
weather_summary
## Weather Main Jumlah
## 1 Clear 22
## 2 Clouds 8075
## 3 Haze 5
## 4 Mist 3
## 5 Rain 571
## 6 Smoke 3
## 7 Thunderstorm 143
data_filtered <- data_filtered %>%
select(temp, temp_min, temp_max, pressure, humidity,
wind_speed, wind_deg, clouds_all, weather_main) %>%
filter(weather_main %in% c("Clouds", "Rain", "Thunderstorm", "Clear"))
if (length(unique(data_filtered$weather_main)) < 2) {
stop("Kolom weather_main hanya memiliki satu kategori setelah filter. Tidak bisa dilakukan split!")
}
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
data_filtered$weather_main <- as.integer(data_filtered$weather_main) - 1  # integer coding: 0 = Clear, 1 = Clouds, 2 = Rain, 3 = Thunderstorm
is.ordered(data_filtered$weather_main)
## [1] FALSE
str(data_filtered)
## tibble [8,811 × 9] (S3: tbl_df/tbl/data.frame)
## $ temp : num [1:8811] 27.1 27.8 29.5 30.1 30.1 ...
## $ temp_min : num [1:8811] 27 27.4 29 30 30 30 30 29 29 29 ...
## $ temp_max : num [1:8811] 27.4 28 30.4 30.4 30.4 30 30 30 29.4 29.4 ...
## $ pressure : num [1:8811] 1010 1010 1010 1010 1009 ...
## $ humidity : num [1:8811] 94 88 83 79 79 79 74 79 83 79 ...
## $ wind_speed : num [1:8811] 0.5 2.1 3.1 4.1 4.1 5.1 4.6 5.7 5.7 5.7 ...
## $ wind_deg : num [1:8811] 0 240 240 260 260 250 260 260 260 270 ...
## $ clouds_all : num [1:8811] 40 20 20 20 20 20 20 20 20 20 ...
## $ weather_main: num [1:8811] 2 2 1 1 1 1 1 1 1 1 ...
unique(data_filtered$weather_main)
## [1] 2 1 0 3
library(dplyr)
data_filtered <- data_filtered %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
cat("Ringkasan data setelah penanganan missing values:\n")
## Ringkasan data setelah penanganan missing values:
na_summary_after <- data.frame(
Kolom = names(data_filtered),
Jumlah_NA = colSums(is.na(data_filtered))
)
print(na_summary_after)
## Kolom Jumlah_NA
## temp temp 0
## temp_min temp_min 0
## temp_max temp_max 0
## pressure pressure 0
## humidity humidity 0
## wind_speed wind_speed 0
## wind_deg wind_deg 0
## clouds_all clouds_all 0
## weather_main weather_main 0
data_filtered <- data_filtered %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
# Cek ulang jumlah NA setelah imputasi
cat("Jumlah nilai NA setelah imputasi (dengan mean):\n")
## Jumlah nilai NA setelah imputasi (dengan mean):
print(colSums(is.na(data_filtered)))
## temp temp_min temp_max pressure humidity wind_speed
## 0 0 0 0 0 0
## wind_deg clouds_all weather_main
## 0 0 0
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
set.seed(123)  # make the train/test split reproducible
split_index <- createDataPartition(data_filtered$weather_main, p = 0.7, list = FALSE)
train_data <- data_filtered[split_index, ]
test_data <- data_filtered[-split_index, ]
Looking at the distribution of the target column, the data are clearly imbalanced, so class balancing is applied. The balancing is performed on the training data only: each class is randomly over- or under-sampled to the same size, while the test set is left untouched. (The chunk below loads smotefamily, but the balancing itself uses plain random resampling; an alternative SMOTE sketch is shown after the balancing output.)
if (!require(smotefamily)) install.packages("smotefamily")
## Loading required package: smotefamily
## Warning: package 'smotefamily' was built under R version 4.4.3
library(smotefamily)
set.seed(123)
n_total_train <- nrow(train_data)
n_target_final <- n_total_train * 2
n_classes <- length(levels(train_data$weather_main))
n_per_class <- floor(n_target_final / n_classes)
list_per_class <- split(train_data, train_data$weather_main)
balanced_list <- list()
# downsample classes larger than n_per_class and upsample smaller ones (with replacement)
for (cls in names(list_per_class)) {
  data_cls <- list_per_class[[cls]]
  n_cls <- nrow(data_cls)
  balanced_list[[cls]] <- if (n_cls > n_per_class) {
    slice_sample(data_cls, n = n_per_class)
  } else {
    slice_sample(data_cls, n = n_per_class, replace = TRUE)
  }
}
train_balanced <- bind_rows(balanced_list)
cat("Distribusi kelas setelah balancing di train_data:\n")
## Distribusi kelas setelah balancing di train_data:
table(train_balanced$weather_main)
##
## 0 1 2 3
## 3085 3085 3085 3085
cat("Total data train setelah balancing:", nrow(train_balanced), "\n")
## Total data train setelah balancing: 12340
cat("Distribusi kelas di test_data (tidak diubah):\n")
## Distribusi kelas di test_data (tidak diubah):
table(test_data$weather_main)
##
## 0 1 2 3
## 6 2422 171 42
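The balancing above relies on plain random over-/under-sampling even though smotefamily is loaded. As an aside, here is a minimal sketch of how smotefamily::SMOTE could instead generate synthetic minority-class rows, treating one class against the rest; the choice of class "3" and K = 5 are illustrative assumptions, not part of the pipeline.
# Sketch only: SMOTE for a single minority class ("class vs. rest"); the pipeline
# itself keeps the random resampling shown above.
num_feats <- c("temp", "temp_min", "temp_max", "pressure", "humidity",
               "wind_speed", "wind_deg", "clouds_all")
minority_label <- "3"   # illustrative: one of the minority classes after integer coding
X <- as.data.frame(train_data[, num_feats])
y <- ifelse(train_data$weather_main == minority_label, minority_label, "rest")
smote_out <- SMOTE(X = X, target = y, K = 5)
table(smote_out$data$class)   # original plus synthetic rows; smotefamily names the label column "class"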
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
temp_vars <- dplyr::select(train_balanced, temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars, scale. = TRUE)
pc_df <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc_df) <- "pca_temp"
train_balanced_pca <- train_balanced %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc_df)
lm_model <- lm(as.numeric(weather_main) ~ ., data = train_balanced_pca)
vif_values <- vif(lm_model)
vif_values
## pressure humidity wind_speed wind_deg clouds_all pca_temp
## 1.298020 2.626288 1.224317 1.170867 1.406714 2.272122
train_balanced_pca$weather_main <- as.factor(train_balanced_pca$weather_main)
num_vars <- names(train_balanced_pca)[sapply(train_balanced_pca, is.numeric)]
# bin each numeric predictor into 5 intervals and test its association with the target
chi_sq_test <- function(varname, data, target) {
  binned_var <- cut(data[[varname]], breaks = 5, include.lowest = TRUE)
  contingency_table <- table(binned_var, data[[target]])
  if (all(dim(contingency_table) > 1)) {
    chisq.test(contingency_table)$p.value
  } else NA
}
chi_results <- sapply(num_vars, chi_sq_test, data = train_balanced_pca, target = "weather_main")
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
chi_df <- data.frame(Variable = names(chi_results), P_Value = chi_results)
chi_df
## Variable P_Value
## pressure pressure 3.123660e-210
## humidity humidity 0.000000e+00
## wind_speed wind_speed 0.000000e+00
## wind_deg wind_deg 0.000000e+00
## clouds_all clouds_all 0.000000e+00
## pca_temp pca_temp 1.125748e-121
feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")  # the model keeps the original temperature variables (no PCA), matching the "Tanpa PCA" setup
min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))
train_balanced_norm <- train_balanced
train_balanced_norm[feature_cols] <- lapply(train_balanced[feature_cols], min_max_norm)
test_data_norm <- test_data
# scale the test set with the training min/max to avoid information leakage
for (col in feature_cols) {
  min_val <- min(train_balanced[[col]])
  max_val <- max(train_balanced[[col]])
  test_data_norm[[col]] <- (test_data[[col]] - min_val) / (max_val - min_val)
}
summary(train_balanced_norm)
## temp temp_min temp_max pressure
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3309 1st Qu.:0.3929 1st Qu.:0.3226 1st Qu.:0.3333
## Median :0.4214 Median :0.4857 Median :0.4113 Median :0.4167
## Mean :0.4397 Mean :0.4932 Mean :0.4406 Mean :0.4280
## 3rd Qu.:0.5276 3rd Qu.:0.5714 3rd Qu.:0.5403 3rd Qu.:0.5833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## humidity wind_speed wind_deg clouds_all
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5417 1st Qu.:0.1253 1st Qu.:0.3056 1st Qu.:0.0800
## Median :0.6458 Median :0.2365 Median :0.3833 Median :0.4000
## Mean :0.6529 Mean :0.2523 Mean :0.4816 Mean :0.3501
## 3rd Qu.:0.8750 3rd Qu.:0.3477 3rd Qu.:0.7500 3rd Qu.:0.6000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## weather_main
## 0:3085
## 1:3085
## 2:3085
## 3:3085
##
##
summary(test_data_norm)
## temp temp_min temp_max pressure
## Min. :-0.07819 Min. :0.0500 Min. :-0.08065 Min. :0.0000
## 1st Qu.: 0.34156 1st Qu.:0.4214 1st Qu.: 0.33065 1st Qu.:0.3333
## Median : 0.44527 Median :0.5000 Median : 0.47581 Median :0.5000
## Mean : 0.46144 Mean :0.5133 Mean : 0.46195 Mean :0.4844
## 3rd Qu.: 0.57778 3rd Qu.:0.6143 3rd Qu.: 0.56452 3rd Qu.:0.5833
## Max. : 1.01070 Max. :1.0000 Max. : 1.00000 Max. :1.0000
## humidity wind_speed wind_deg clouds_all
## Min. :0.0625 Min. :0.00593 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5208 1st Qu.:0.12528 1st Qu.:0.2778 1st Qu.:0.2000
## Median :0.6458 Median :0.23647 Median :0.3333 Median :0.2000
## Mean :0.6328 Mean :0.26084 Mean :0.4247 Mean :0.3132
## 3rd Qu.:0.7500 3rd Qu.:0.39214 3rd Qu.:0.6944 3rd Qu.:0.4000
## Max. :1.0000 Max. :0.96294 Max. :1.0000 Max. :1.0000
## weather_main
## 0: 6
## 1:2422
## 2: 171
## 3: 42
##
##
library(ggplot2)
library(dplyr)
train_weather_summary_1 <- train_data %>%
group_by(weather_main) %>%
summarise(Jumlah = n()) %>%
arrange(desc(Jumlah))
train_weather_summary_1$Proporsi <- train_weather_summary_1$Jumlah / sum(train_weather_summary_1$Jumlah)
ggplot(train_weather_summary_1, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(title = "Distribusi Kategori Cuaca pada Data Train (Sebelum Balanced)",
x = "Kategori Cuaca", y = "Jumlah")
library(ggplot2)
library(dplyr)
train_weather_summary <- train_balanced %>%
group_by(weather_main) %>%
summarise(Jumlah = n()) %>%
arrange(desc(Jumlah))
train_weather_summary$Proporsi <- train_weather_summary$Jumlah / sum(train_weather_summary$Jumlah)
ggplot(train_weather_summary, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(title = "Distribusi Kategori Cuaca pada Data Train (Balanced)",
x = "Kategori Cuaca", y = "Jumlah")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
correlation_matrix <- cor(train_balanced_norm[, feature_cols])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black",
addCoef.col = "black", number.cex = 0.7, diag = FALSE)
library(nnet)
model_mlr <- multinom(weather_main ~ ., data = train_balanced_norm, trace = FALSE)
pred_mlr <- predict(model_mlr, test_data_norm)
conf_matrix_mlr <- confusionMatrix(pred_mlr, test_data_norm$weather_main)
conf_matrix_mlr
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3
## 0 5 1 0 0
## 1 1 1885 33 8
## 2 0 253 84 11
## 3 0 283 54 23
##
## Overall Statistics
##
## Accuracy : 0.7562
## 95% CI : (0.7393, 0.7724)
## No Information Rate : 0.9171
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2383
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity 0.833333 0.7783 0.49123 0.547619
## Specificity 0.999620 0.8082 0.89312 0.870335
## Pos Pred Value 0.833333 0.9782 0.24138 0.063889
## Neg Pred Value 0.999620 0.2479 0.96206 0.991670
## Prevalence 0.002272 0.9171 0.06475 0.015903
## Detection Rate 0.001893 0.7137 0.03181 0.008709
## Detection Prevalence 0.002272 0.7296 0.13177 0.136312
## Balanced Accuracy 0.916477 0.7933 0.69217 0.708977
cat(sprintf("Accuracy MLR: %.4f\n", conf_matrix_mlr$overall['Accuracy']))
## Accuracy MLR: 0.7562
library(pheatmap)
## Warning: package 'pheatmap' was built under R version 4.4.3
conf_matrix_table <- conf_matrix_mlr$table
conf_matrix_prop <- prop.table(conf_matrix_table, margin = 1)  # row-wise: proportions within each predicted class
pheatmap(conf_matrix_prop,
cluster_rows = FALSE, cluster_cols = FALSE,
display_numbers = TRUE, number_format = "%.2f",
main = "Confusion Matrix Heatmap (Normalized)",
color = colorRampPalette(c("white", "blue"))(50))
library(dplyr)
class_accuracy <- diag(conf_matrix_table) / rowSums(conf_matrix_table)  # rows are predictions, so this is precision (PPV) per predicted class
df_class_acc <- data.frame(Class = names(class_accuracy), Accuracy = class_accuracy)
library(ggplot2)
ggplot(df_class_acc, aes(x = reorder(Class, -Accuracy), y = Accuracy, fill = Accuracy)) +
geom_col() +
coord_flip() +
scale_fill_gradient(low = "red", high = "green") +
labs(title = "Akurasi per Kelas", x = "Kelas Cuaca", y = "Akurasi") +
theme_minimal()
The per-class chart (diagonal over row sums, i.e., precision per predicted class) shows that the model performs well on class 0 (Clear, about 83%) and class 1 (Clouds, about 98%), but drops sharply on class 2 (Rain, about 24%) and is very weak on class 3 (Thunderstorm, about 6%). Since the training set was balanced, this reflects the model's difficulty in separating the minority classes from the dominant Clouds class rather than imbalance in the training data; a complementary per-class recall view is sketched below.
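As a complementary check, the sketch below computes per-class recall (the confusion-matrix diagonal divided by the reference column totals); caret already reports the same numbers as Sensitivity, so this only makes the comparison with the precision-style chart explicit.
class_recall <- diag(conf_matrix_table) / colSums(conf_matrix_table)   # recall per actual class
round(class_recall, 3)
# equivalently: conf_matrix_mlr$byClass[, "Sensitivity"]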
The following visualization helps show how the test data are spread out with respect to the predictions and the true labels.
dist_matrix <- dist(test_data_norm[, -which(names(test_data_norm) == "weather_main")])
mds <- cmdscale(dist_matrix, k = 2)
mds_df <- data.frame(Dim1 = mds[,1], Dim2 = mds[,2],
Actual = test_data_norm$weather_main,
Predicted = pred_mlr)
ggplot(mds_df, aes(x = Dim1, y = Dim2, color = Actual, shape = Predicted)) +
geom_point(alpha = 0.6, size = 3) +
labs(title = "MDS Plot - Actual vs Predicted",
subtitle = "Bentuk = Prediksi, Warna = Label Asli") +
theme_minimal()
The MDS plot shows how the predictions relate to the true labels, with shape encoding the predicted class and colour the actual label. Most points are predicted as class 1 (triangles), even though other colours, notably blue (class 2) and purple (class 3), still appear in substantial numbers. This indicates a tendency of the model to assign most samples to the majority class (class 1, Clouds), so many misclassifications occur, especially for the minority classes. The visualization supports the earlier finding that the model struggles to recognize classes with fewer observations.
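To quantify the majority-class tendency visible in the plot, a small sketch comparing the actual and predicted class proportions on the test set:
# Sketch: share of each class among the true labels vs. among the predictions.
round(rbind(Actual = prop.table(table(test_data_norm$weather_main)),
            Predicted = prop.table(table(pred_mlr))), 3)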
library(pROC)
## Warning: package 'pROC' was built under R version 4.4.3
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
classes <- levels(test_data_norm$weather_main)
pred_probs <- predict(model_mlr, test_data_norm, type = "probs")
par(mfrow = c(2, ceiling(length(classes)/2)))
for (cls in classes) {
  roc_obj <- roc(response = as.numeric(test_data_norm$weather_main == cls),
                 predictor = pred_probs[, cls])
  plot(roc_obj, main = paste("ROC Curve -", cls))
}
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
par(mfrow = c(1,1))
The figure shows four one-vs-rest ROC curves, one for each class of the same multinomial model (not four separate models). The curve for class 0 is essentially perfect, with maximal sensitivity and specificity, while the curves for classes 1, 2, and 3 are still clearly above the random-guess diagonal although not perfect. Overall, the model separates each class from the rest reasonably well, with class 0 discriminated most cleanly.
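To put numbers on this visual reading, a short sketch that computes the one-vs-rest AUC per class with pROC, reusing the same predicted probabilities as the curves above:
# Sketch: one-vs-rest AUC for each class.
aucs <- sapply(classes, function(cls) {
  roc_cls <- roc(response = as.numeric(test_data_norm$weather_main == cls),
                 predictor = pred_probs[, cls])
  as.numeric(auc(roc_cls))
})
round(aucs, 3)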
Top-3 accuracy measures whether the true label is among the three classes with the highest predicted probabilities. With only four classes this is a weak criterion, so a value close to 1 is expected.
pred_probs <- predict(model_mlr, test_data_norm, type = "probs")
top_3_preds <- t(apply(pred_probs, 1, function(x) {
  names(sort(x, decreasing = TRUE))[1:3]
}))
actual_labels <- as.character(test_data_norm$weather_main)
correct_top3 <- mapply(function(true, preds) true %in% preds,
                       actual_labels, split(top_3_preds, row(top_3_preds)))
top3_accuracy <- mean(correct_top3)
cat(sprintf("Top-3 Accuracy: %.4f\n", top3_accuracy))
## Top-3 Accuracy: 1.0000
top_n_hits <- sapply(seq_len(nrow(pred_probs)), function(i) {
  # rank of the true label within row i's sorted predicted probabilities (1 = top prediction)
  sorted <- names(sort(pred_probs[i, ], decreasing = TRUE))
  which(sorted == actual_labels[i])[1]
})
hit_table <- table(factor(pmin(top_n_hits, 3), levels = 1:3))
hit_df <- data.frame(Top_N = c("Top-1", "Top-2", "Top-3"),
                     Count = cumsum(as.numeric(hit_table)),
                     Accuracy = cumsum(as.numeric(hit_table)) / nrow(test_data_norm))  # cumulative Top-N accuracy
ggplot(hit_df, aes(x = Top_N, y = Accuracy, fill = Top_N)) +
geom_col() +
geom_text(aes(label = sprintf("%.2f%%", Accuracy * 100)), vjust = -0.5) +
labs(title = "Top-N Accuracy",
y = "Accuracy", x = "Top-N") +
theme_minimal() +
scale_fill_brewer(palette = "Set2")
library(caret)
set.seed(123)
folds <- createFolds(train_balanced_norm$weather_main, k = 5)
accuracies <- c()
for (i in 1:5) {
  fold_train <- train_balanced_norm[-folds[[i]], ]
  fold_test  <- train_balanced_norm[folds[[i]], ]
  model_fold <- multinom(weather_main ~ ., data = fold_train, trace = FALSE)
  pred_fold  <- predict(model_fold, fold_test)
  acc <- mean(pred_fold == fold_test$weather_main)
  cat(sprintf("Fold %d Accuracy: %.4f\n", i, acc))
  accuracies <- c(accuracies, acc)
}
## Fold 1 Accuracy: 0.7071
## Fold 2 Accuracy: 0.6985
## Fold 3 Accuracy: 0.7071
## Fold 4 Accuracy: 0.6969
## Fold 5 Accuracy: 0.7095
cat(sprintf("Mean CV Accuracy (5-fold): %.4f\n", mean(accuracies)))
## Mean CV Accuracy (5-fold): 0.7038
cv_df <- data.frame(Fold = paste0("Fold-", 1:5), Accuracy = accuracies)
ggplot(cv_df, aes(x = Fold, y = Accuracy, fill = Fold)) +
geom_col(width = 0.6) +
geom_text(aes(label = sprintf("%.2f%%", Accuracy * 100)), vjust = -0.5) +
ylim(0, 1) +
labs(title = "Accuracy per Fold (5-Fold CV)",
x = "Fold", y = "Accuracy") +
theme_minimal() +
scale_fill_brewer(palette = "Pastel1")