Dataset: Weather Type Classification
Sumber: https://www.kaggle.com/datasets/nikhil7280/weather-type-classification
Dataset memiliki 13.200 observasi, 10 variabel prediktor (numerik & kategorik), dan variabel respon nominal 4 kelas: Cloudy, Rainy, Snowy, Sunny.
if (!file.exists("weather_classification_data.csv")) {
download.file(
"https://drive.google.com/uc?id=1o0KJzMdPyZsyLdCGV9pY7IanJpxW3Jgg",
"weather_classification_data.csv", mode = "wb"
)
}
df_raw <- read.csv("weather_classification_data.csv",
sep = ",", stringsAsFactors = TRUE)
names(df_raw) <- make.names(names(df_raw))
df_raw$Weather.Type <- as.factor(df_raw$Weather.Type)
cat("Dimensi:", dim(df_raw), "\n")## Dimensi: 13200 11
## Kelas : Cloudy Rainy Snowy Sunny
## Temperature Humidity Wind.Speed Precipitation.... Cloud.Cover
## 1 14 73 9.5 82 partly cloudy
## 2 39 96 8.5 71 partly cloudy
## 3 30 64 7.0 16 clear
## 4 38 83 1.5 82 clear
## 5 27 74 17.0 66 overcast
## 6 32 55 3.5 26 overcast
## Atmospheric.Pressure UV.Index Season Visibility..km. Location Weather.Type
## 1 1010.82 2 Winter 3.5 inland Rainy
## 2 1011.43 7 Spring 10.0 inland Cloudy
## 3 1018.72 5 Spring 5.5 mountain Sunny
## 4 1026.25 7 Spring 1.0 coastal Sunny
## 5 990.67 1 Winter 2.5 mountain Rainy
## 6 1010.03 2 Summer 5.0 inland Cloudy
weather_summary <- df_raw %>%
group_by(Weather.Type) %>%
summarise(Jumlah = n()) %>%
mutate(Persentase = paste0(round(Jumlah / sum(Jumlah) * 100, 2), "%")) %>%
arrange(desc(Jumlah))
kable(weather_summary, caption = "Distribusi Kelas pada Data Mentah")| Weather.Type | Jumlah | Persentase |
|---|---|---|
| Cloudy | 3300 | 25% |
| Rainy | 3300 | 25% |
| Snowy | 3300 | 25% |
| Sunny | 3300 | 25% |
## Temperature Humidity Wind.Speed Precipitation....
## Min. :-25.00 Min. : 20.00 Min. : 0.000 Min. : 0.00
## 1st Qu.: 4.00 1st Qu.: 57.00 1st Qu.: 5.000 1st Qu.: 19.00
## Median : 21.00 Median : 70.00 Median : 9.000 Median : 58.00
## Mean : 19.13 Mean : 68.71 Mean : 9.832 Mean : 53.64
## 3rd Qu.: 31.00 3rd Qu.: 84.00 3rd Qu.:13.500 3rd Qu.: 82.00
## Max. :109.00 Max. :109.00 Max. :48.500 Max. :109.00
## Cloud.Cover Atmospheric.Pressure UV.Index Season
## clear :2139 Min. : 800.1 Min. : 0.000 Autumn:2500
## cloudy : 411 1st Qu.: 994.8 1st Qu.: 1.000 Spring:2598
## overcast :6090 Median :1007.6 Median : 3.000 Summer:2492
## partly cloudy:4560 Mean :1005.8 Mean : 4.006 Winter:5610
## 3rd Qu.:1016.8 3rd Qu.: 7.000
## Max. :1199.2 Max. :14.000
## Visibility..km. Location Weather.Type
## Min. : 0.000 coastal :3571 Cloudy:3300
## 1st Qu.: 3.000 inland :4816 Rainy :3300
## Median : 5.000 mountain:4813 Snowy :3300
## Mean : 5.463 Sunny :3300
## 3rd Qu.: 7.500
## Max. :20.000
ggplot(df_raw, aes(x = Weather.Type, fill = Weather.Type)) +
geom_bar(show.legend = FALSE) +
geom_text(stat = "count",
aes(label = after_stat(count)),
vjust = -0.5, size = 4, fontface = "bold") +
scale_fill_brewer(palette = "RdYlGn") +
expand_limits(y = max(table(df_raw$Weather.Type)) * 1.1) +
labs(title = "Weather Type Raw Data Distribution",
subtitle = "Observing the frequency across weather types",
x = "Weather Type", y = "Frequency")Split dilakukan pada data mentah dengan stratifikasi pada variabel target agar distribusi kelas seimbang di kedua set. Seluruh preprocessing (deteksi outlier, dst.) dilakukan hanya pada data training agar tidak ada kebocoran informasi (data leakage) ke data test.
set.seed(42)
data_split <- initial_split(df_raw, prop = 0.8, strata = Weather.Type)
train_data <- training(data_split)
test_data <- testing(data_split)
cat("Train:", nrow(train_data), "| Test:", nrow(test_data), "\n")## Train: 10560 | Test: 2640
kable(as.data.frame(table(train_data$Weather.Type)),
col.names = c("Kelas", "Frekuensi"),
caption = "Distribusi Kelas — Train")| Kelas | Frekuensi |
|---|---|
| Cloudy | 2640 |
| Rainy | 2640 |
| Snowy | 2640 |
| Sunny | 2640 |
Rafael Hartono (24031554166)
num_vars <- c("Temperature", "Humidity", "Wind.Speed",
"Precipitation....", "Atmospheric.Pressure",
"UV.Index", "Visibility..km.")Outlier multivariat diidentifikasi menggunakan Jarak Mahalanobis. Observasi dengan nilai p < 0.001 pada distribusi Chi-Square (df = jumlah variabel numerik) dikategorikan sebagai outlier (Tabachnick & Fidell, 2022, hal. 65).
X_mat <- as.matrix(train_data[, num_vars])
mah_dist <- mahalanobis(X_mat, colMeans(X_mat), cov(X_mat))
p_mah <- pchisq(mah_dist, df = length(num_vars), lower.tail = FALSE)
outlier_flag <- p_mah < 0.001
cat("Nilai kritis Chi-Square (df=7, p=0.001):",
round(qchisq(0.999, df = length(num_vars)), 3), "\n")## Nilai kritis Chi-Square (df=7, p=0.001): 24.322
## Jumlah outlier: 585 ( 5.54 % )
train_clean <- train_data[!outlier_flag, ]
cat("Sisa data training:", nrow(train_clean), "observasi\n")## Sisa data training: 9975 observasi
kable(as.data.frame(table(train_clean$Weather.Type)),
col.names = c("Kelas", "Frekuensi"),
caption = "Distribusi Kelas setelah Cleaning")| Kelas | Frekuensi |
|---|---|
| Cloudy | 2519 |
| Rainy | 2477 |
| Snowy | 2487 |
| Sunny | 2492 |
chi_kritis <- qchisq(0.999, df = length(num_vars))
plot(mah_dist, pch = 20, cex = 0.4,
col = ifelse(outlier_flag, "#E15759", "#4E79A7"),
main = "Jarak Mahalanobis per Observasi (Train Data)",
xlab = "Indeks Observasi", ylab = "Jarak Mahalanobis")
abline(h = chi_kritis, col = "red", lty = 2, lwd = 1.5)
legend("topright",
legend = c("Normal", "Outlier", "Batas kritis"),
col = c("#4E79A7", "#E15759", "red"),
pch = c(20, 20, NA), lty = c(NA, NA, 2), cex = 0.8)Observasi yang berada di bawah garis batas kritis (titik biru) dinyatakan bukan outlier. Asumsi tidak ada outlier terpenuhi pada data training setelah cleaning.
ggplot(train_clean, aes(x = Weather.Type, fill = Weather.Type)) +
geom_bar(show.legend = FALSE) +
geom_text(stat = "count",
aes(label = after_stat(count)),
vjust = -0.5, size = 4, fontface = "bold") +
scale_fill_brewer(palette = "RdYlGn") +
expand_limits(y = max(table(train_clean$Weather.Type)) * 1.1) +
labs(title = "Weather Type Clean Data Distribution",
subtitle = "Distribusi kelas setelah penghapusan outlier",
x = "Weather Type", y = "Frequency")Uji Durbin-Watson tidak relevan untuk data ini karena data bersifat cross-sectional, bukan time-series. Setiap baris merepresentasikan satu pengamatan cuaca yang berbeda dan independen, tanpa pengukuran berulang pada subjek yang sama (Hosmer & Lemeshow, 2000). Tidak diperlukan uji statistik formal; independensi dijamin oleh desain pengumpulan data.
cat("Duplikasi baris:", sum(duplicated(train_clean)),
"-> Tidak ada duplikasi, asumsi independensi terpenuhi\n")## Duplikasi baris: 0 -> Tidak ada duplikasi, asumsi independensi terpenuhi
VIF > 10 mengindikasikan multikolinieritas serius (Gujarati, 2004).
predictors_train <- train_clean[, num_vars]
dummy_lm <- lm(rnorm(nrow(predictors_train)) ~ ., data = predictors_train)
vif_vals <- vif(dummy_lm)
vif_df <- data.frame(
Variabel = names(vif_vals),
VIF = round(vif_vals, 3),
Status = ifelse(vif_vals >= 10, "Bermasalah", "OK")
)
kable(vif_df, row.names = FALSE, caption = "Nilai VIF (Train Data)")| Variabel | VIF | Status |
|---|---|---|
| Temperature | 1.445 | OK |
| Humidity | 2.065 | OK |
| Wind.Speed | 1.331 | OK |
| Precipitation…. | 2.438 | OK |
| Atmospheric.Pressure | 1.297 | OK |
| UV.Index | 1.413 | OK |
| Visibility..km. | 1.759 | OK |
cat("VIF tertinggi:", round(max(vif_vals), 3), "->",
ifelse(max(vif_vals) < 10,
"Semua VIF < 10, tidak ada multikolinieritas -> Terpenuhi",
"Terdapat VIF >= 10 -> Tidak Terpenuhi"), "\n")## VIF tertinggi: 2.438 -> Semua VIF < 10, tidak ada multikolinieritas -> Terpenuhi
Uji Box-Tidwell menguji apakah hubungan antara log-odds dan prediktor
numerik bersifat linear. Interaksi x * log(x) yang
signifikan (p < 0.05) mengindikasikan pelanggaran linearitas (Box
& Tidwell, 1962).
train_bt <- train_clean
for (v in num_vars) {
train_bt[[paste0(v, "_li")]] <- train_bt[[v]] *
log(abs(train_bt[[v]]) + 0.001)
}
li_vars <- paste0(num_vars, "_li")
bt_formula <- as.formula(
paste("Weather.Type ~", paste(c(num_vars, li_vars), collapse = " + "))
)
bt_model <- multinom(bt_formula, data = train_bt, trace = FALSE)
z_bt <- summary(bt_model)$coefficients / summary(bt_model)$standard.errors
p_bt <- round((1 - pnorm(abs(z_bt), 0, 1)) * 2, 4)
p_bt_li <- p_bt[, li_vars, drop = FALSE]
kable(t(p_bt_li), caption = "P-value Uji Box-Tidwell (Interaksi log)")| Rainy | Snowy | Sunny | |
|---|---|---|---|
| Temperature_li | 0 | 0e+00 | 0 |
| Humidity_li | 0 | 0e+00 | 0 |
| Wind.Speed_li | 0 | 1e-04 | 0 |
| Precipitation…._li | 0 | 0e+00 | 0 |
| Atmospheric.Pressure_li | 0 | 0e+00 | 0 |
| UV.Index_li | 0 | 0e+00 | 0 |
| Visibility..km._li | 0 | 0e+00 | 0 |
sig_flag <- any(p_bt_li < 0.05)
cat("Kesimpulan:", ifelse(sig_flag,
"Ada interaksi signifikan (p < 0.05) -> linearitas logit tidak sepenuhnya terpenuhi.",
"Tidak ada interaksi signifikan -> Terpenuhi"), "\n")## Kesimpulan: Ada interaksi signifikan (p < 0.05) -> linearitas logit tidak sepenuhnya terpenuhi.
if (sig_flag) {
cat("\nCatatan: Pelanggaran linearitas logit dicatat sebagai limitasi model.\n")
cat("Beberapa variabel menunjukkan hubungan non-linear dengan log-odds.\n")
cat("Pemodelan tetap dilanjutkan; interpretasi koefisien dilakukan\n")
cat("dengan kehati-hatian ekstra pada variabel yang bermasalah.\n")
}##
## Catatan: Pelanggaran linearitas logit dicatat sebagai limitasi model.
## Beberapa variabel menunjukkan hubungan non-linear dengan log-odds.
## Pemodelan tetap dilanjutkan; interpretasi koefisien dilakukan
## dengan kehati-hatian ekstra pada variabel yang bermasalah.
rekap_rlm <- data.frame(
No = 1:4,
Asumsi = c("Tidak ada outlier multivariat",
"Independensi observasi",
"Tidak ada multikolinieritas (VIF < 10)",
"Linearitas logit (Box-Tidwell)"),
Metode = c("Jarak Mahalanobis (p < 0.001)",
"Desain cross-sectional + cek duplikasi",
"VIF via regresi auxiliary",
"Box-Tidwell interaction test"),
Status = c("Terpenuhi (setelah cleaning)",
"Terpenuhi",
ifelse(max(vif_vals) < 10, "Terpenuhi", "Tidak Terpenuhi"),
ifelse(sig_flag,
"Tidak Sepenuhnya Terpenuhi (dicatat sebagai limitasi)",
"Terpenuhi"))
)
kable(rekap_rlm, row.names = FALSE, caption = "Rekapitulasi Asumsi RLM")| No | Asumsi | Metode | Status |
|---|---|---|---|
| 1 | Tidak ada outlier multivariat | Jarak Mahalanobis (p < 0.001) | Terpenuhi (setelah cleaning) |
| 2 | Independensi observasi | Desain cross-sectional + cek duplikasi | Terpenuhi |
| 3 | Tidak ada multikolinieritas (VIF < 10) | VIF via regresi auxiliary | Terpenuhi |
| 4 | Linearitas logit (Box-Tidwell) | Box-Tidwell interaction test | Tidak Sepenuhnya Terpenuhi (dicatat sebagai limitasi) |
model_rlm <- multinom(
Weather.Type ~ Temperature + Humidity + Wind.Speed +
Atmospheric.Pressure + UV.Index + Visibility..km. +
Cloud.Cover + Season + Location,
data = train_clean,
trace = FALSE
)
summary(model_rlm)## Call:
## multinom(formula = Weather.Type ~ Temperature + Humidity + Wind.Speed +
## Atmospheric.Pressure + UV.Index + Visibility..km. + Cloud.Cover +
## Season + Location, data = train_clean, trace = FALSE)
##
## Coefficients:
## (Intercept) Temperature Humidity Wind.Speed Atmospheric.Pressure
## Rainy -5.4561624 0.003275057 0.02098346 0.08215216 -0.004557902
## Snowy 0.1208629 -0.200512163 0.01563426 0.04657070 -0.011887351
## Sunny 14.5915619 0.043972052 -0.05244030 -0.09292580 0.004081929
## UV.Index Visibility..km. Cloud.Covercloudy Cloud.Coverovercast
## Rainy -0.17472949 -0.67746712 14.61023 11.87146
## Snowy -0.07187904 -0.34529601 14.58326 11.37699
## Sunny 0.33957329 -0.09668672 -18.30584 -20.26902
## Cloud.Coverpartly cloudy SeasonSpring SeasonSummer SeasonWinter
## Rainy 10.95724 -0.01820207 0.06507842 -0.2995466
## Snowy 10.23109 0.02034332 0.20693941 2.7644988
## Sunny -17.96712 0.12877556 0.24714384 -0.1656402
## Locationinland Locationmountain
## Rainy -0.21210620 -0.17448471
## Snowy 1.37035673 1.46943012
## Sunny -0.08492159 -0.06323801
##
## Std. Errors:
## (Intercept) Temperature Humidity Wind.Speed Atmospheric.Pressure
## Rainy 0.001705872 0.003828446 0.003094422 0.007400909 0.0002888427
## Snowy 0.002276048 0.006820489 0.004688070 0.010602795 0.0004194694
## Sunny 0.003619300 0.004928641 0.003584936 0.012181091 0.0003456917
## UV.Index Visibility..km. Cloud.Covercloudy Cloud.Coverovercast
## Rainy 0.01557710 0.02162012 0.1353541 0.07892715
## Snowy 0.02287084 0.03001069 0.1261274 0.09321632
## Sunny 0.01718510 0.02212676 0.1663561 0.14496815
## Cloud.Coverpartly cloudy SeasonSpring SeasonSummer SeasonWinter
## Rainy 0.08022871 0.1152827 0.1164812 0.1132770
## Snowy 0.09657384 0.1867479 0.1856740 0.1364466
## Sunny 0.11602672 0.1621759 0.1622702 0.1657296
## Locationinland Locationmountain
## Rainy 0.09629132 0.09694065
## Snowy 0.07608328 0.07685020
## Sunny 0.14070450 0.13899724
##
## Residual Deviance: 7898.483
## AIC: 7988.483
z_rlm <- summary(model_rlm)$coefficients / summary(model_rlm)$standard.errors
p_rlm <- round((1 - pnorm(abs(z_rlm), 0, 1)) * 2, 4)
kable(t(p_rlm), caption = "P-value Uji Wald per Parameter")| Rainy | Snowy | Sunny | |
|---|---|---|---|
| (Intercept) | 0.0000 | 0.0000 | 0.0000 |
| Temperature | 0.3923 | 0.0000 | 0.0000 |
| Humidity | 0.0000 | 0.0009 | 0.0000 |
| Wind.Speed | 0.0000 | 0.0000 | 0.0000 |
| Atmospheric.Pressure | 0.0000 | 0.0000 | 0.0000 |
| UV.Index | 0.0000 | 0.0017 | 0.0000 |
| Visibility..km. | 0.0000 | 0.0000 | 0.0000 |
| Cloud.Covercloudy | 0.0000 | 0.0000 | 0.0000 |
| Cloud.Coverovercast | 0.0000 | 0.0000 | 0.0000 |
| Cloud.Coverpartly cloudy | 0.0000 | 0.0000 | 0.0000 |
| SeasonSpring | 0.8745 | 0.9133 | 0.4272 |
| SeasonSummer | 0.5764 | 0.2651 | 0.1277 |
| SeasonWinter | 0.0082 | 0.0000 | 0.3176 |
| Locationinland | 0.0276 | 0.0000 | 0.5461 |
| Locationmountain | 0.0719 | 0.0000 | 0.6491 |
| Rainy | Snowy | Sunny | |
|---|---|---|---|
| (Intercept) | 0.0043 | 1.1285 | 2172875.4428 |
| Temperature | 1.0033 | 0.8183 | 1.0450 |
| Humidity | 1.0212 | 1.0158 | 0.9489 |
| Wind.Speed | 1.0856 | 1.0477 | 0.9113 |
| Atmospheric.Pressure | 0.9955 | 0.9882 | 1.0041 |
| UV.Index | 0.8397 | 0.9306 | 1.4043 |
| Visibility..km. | 0.5079 | 0.7080 | 0.9078 |
| Cloud.Covercloudy | 2213811.2747 | 2154913.2718 | 0.0000 |
| Cloud.Coverovercast | 143122.4789 | 87289.7899 | 0.0000 |
| Cloud.Coverpartly cloudy | 57367.6500 | 27752.6368 | 0.0000 |
| SeasonSpring | 0.9820 | 1.0206 | 1.1374 |
| SeasonSummer | 1.0672 | 1.2299 | 1.2804 |
| SeasonWinter | 0.7412 | 15.8711 | 0.8474 |
| Locationinland | 0.8089 | 3.9368 | 0.9186 |
| Locationmountain | 0.8399 | 4.3468 | 0.9387 |
pred_rlm <- predict(model_rlm, newdata = test_data)
cm_rlm <- table(Aktual = test_data$Weather.Type, Prediksi = pred_rlm)
acc_rlm <- round(mean(pred_rlm == test_data$Weather.Type) * 100, 2)
kable(cm_rlm, caption = "Confusion Matrix — RLM (Test Data)")| Cloudy | Rainy | Snowy | Sunny | |
|---|---|---|---|---|
| Cloudy | 566 | 51 | 17 | 26 |
| Rainy | 55 | 573 | 21 | 11 |
| Snowy | 23 | 7 | 615 | 15 |
| Sunny | 64 | 11 | 18 | 567 |
## Akurasi RLM: 87.92 %
acc_kelas_rlm <- round(diag(cm_rlm) / rowSums(cm_rlm) * 100, 2)
kable(data.frame(Kelas = names(acc_kelas_rlm),
Akurasi = paste0(acc_kelas_rlm, "%")),
row.names = FALSE, caption = "Akurasi per Kelas — RLM")| Kelas | Akurasi |
|---|---|
| Cloudy | 85.76% |
| Rainy | 86.82% |
| Snowy | 93.18% |
| Sunny | 85.91% |
Frendy Zahril Ramadhani (24031554187)
Analisis Diskriminan bertujuan membangun fungsi diskriminan yang
dapat memisahkan objek ke dalam grupnya secara optimal (Sharma, 1996).
Asumsi yang diperlukan hanya dua: normalitas
multivariat per kelas dan homogenitas matriks kovarians (Johnson &
Wichern, 2007). Model LDA dibangun menggunakan train_clean
(data training bersih) dan dievaluasi menggunakan APER
(resubstitusi) serta akurasi pada test_data.
Normalitas multivariat diuji secara visual menggunakan Q-Q Plot Mahalanobis per kelas, didukung oleh uji Henze-Zirkler (HZ) yang robust untuk ukuran sampel besar (Henze & Zirkler, 1990). Nilai HZ mendekati 0 mengindikasikan normalitas terpenuhi. Penolakan pada n besar dapat ditoleransi karena LDA robust berkat Central Limit Theorem selama deviasi hanya terjadi di ekor distribusi.
classes <- levels(train_clean$Weather.Type)
par(mfrow = c(2, 2), mar = c(4, 4, 3, 1))
for (cl in classes) {
sub <- train_clean[train_clean$Weather.Type == cl, num_vars]
m_d <- mahalanobis(sub, colMeans(sub), cov(sub))
chi_q <- qchisq(ppoints(nrow(sub)), df = length(num_vars))
qqplot(chi_q, sort(m_d),
main = paste("Q-Q Mahalanobis —", cl),
xlab = "Kuantil Chi-Square",
ylab = "Jarak Mahalanobis",
pch = 20, cex = 0.4, col = "#4E79A7")
abline(0, 1, col = "red", lwd = 1.5)
}hz_test_manual <- function(data) {
n <- nrow(data)
p <- ncol(data)
data <- scale(data)
beta <- (1 / sqrt(2)) * ((2*p + 1) / 4)^(1/(p+4)) * (n^(1/(p+4)))
dij <- as.matrix(dist(data))^2
HZ <- (1/n^2) * sum(exp(-beta^2/2 * dij)) -
(2 * (1 + beta^2)^(-p/2) / n) *
sum(exp(-beta^2 / (2*(1+beta^2)) * rowSums(data^2))) +
(1 + 2*beta^2)^(-p/2)
return(round(HZ, 4))
}
cat("=== Henze-Zirkler Test (approximation) per Kelas ===\n")## === Henze-Zirkler Test (approximation) per Kelas ===
for (cl in classes) {
sub <- train_clean[train_clean$Weather.Type == cl, num_vars]
hz <- hz_test_manual(sub)
cat(sprintf(" %-10s : HZ = %.4f %s\n", cl, hz,
ifelse(hz < 0.5, "-> mendekati normal", "-> deviasi ada")))
}## Cloudy : HZ = 0.0134 -> mendekati normal
## Rainy : HZ = 0.0059 -> mendekati normal
## Snowy : HZ = 0.0110 -> mendekati normal
## Sunny : HZ = 0.0118 -> mendekati normal
##
## Deviasi di ekor Q-Q Plot bersifat ringan.
## Asumsi normalitas multivariat dianggap cukup terpenuhi.
LDA menggunakan S_pooled sehingga memerlukan matriks
kovarians yang homogen antar kelas. Uji Box’s M
mengevaluasi seluruh matriks kovarians p×p secara simultan (Johnson
& Wichern, 2007). Pada n besar, penolakan hampir selalu terjadi
karena sensitivitas tinggi; struktur korelasi antar kelas
divisualisasikan sebagai konfirmasi praktis.
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <dbl> <chr>
## 1 5718. 0 84 Box's M-test for Homogeneity of Covariance Matric…
if (boxm_result$p.value < 0.05) {
cat("\nBox's M signifikan (p < 0.05) -> matriks kovarians tidak identik.\n")
cat("Pada n =", nrow(train_clean),
"penolakan H0 sensitif terhadap deviasi kecil.\n")
cat("Visualisasi korelasi per kelas dilakukan untuk konfirmasi praktis.\n")
} else {
cat("\nBox's M tidak signifikan -> homogenitas kovarians terpenuhi.\n")
}##
## Box's M signifikan (p < 0.05) -> matriks kovarians tidak identik.
## Pada n = 9975 penolakan H0 sensitif terhadap deviasi kecil.
## Visualisasi korelasi per kelas dilakukan untuk konfirmasi praktis.
par(mfrow = c(1, 4), mar = c(1, 1, 3, 1))
colors_corr <- colorRampPalette(c("#2980b9", "white", "#922b21"))(200)
for (cl in classes) {
sub <- train_clean[train_clean$Weather.Type == cl, num_vars]
cor_mat <- cor(sub)
corrplot(cor_mat, method = "color", type = "lower",
tl.cex = 0.65, tl.col = "black",
col = colors_corr, addCoef.col = "black",
number.cex = 0.5, cl.pos = "n",
title = cl, mar = c(0, 0, 2, 0))
}rekap_da <- data.frame(
No = 1:2,
Asumsi = c("Normalitas multivariat per kelas",
"Homogenitas matriks kovarians (Box's M)"),
Metode = c("Q-Q Plot Mahalanobis + Henze-Zirkler",
"Box's M Test + Corrplot per kelas"),
Status = c("Terpenuhi (deviasi ringan di ekor, CLT berlaku)",
paste0("Formal ditolak (p < 0.05), namun pada n = ",
nrow(train_clean),
" sensitif terhadap deviasi kecil; ",
"struktur korelasi antar kelas relatif konsisten"))
)
kable(rekap_da, row.names = FALSE,
caption = "Rekapitulasi Asumsi Analisis Diskriminan")| No | Asumsi | Metode | Status |
|---|---|---|---|
| 1 | Normalitas multivariat per kelas | Q-Q Plot Mahalanobis + Henze-Zirkler | Terpenuhi (deviasi ringan di ekor, CLT berlaku) |
| 2 | Homogenitas matriks kovarians (Box’s M) | Box’s M Test + Corrplot per kelas | Formal ditolak (p < 0.05), namun pada n = 9975 sensitif terhadap deviasi kecil; struktur korelasi antar kelas relatif konsisten |
model_lda <- lda(
Weather.Type ~ Temperature + Humidity + Wind.Speed +
Atmospheric.Pressure + UV.Index + Visibility..km.,
data = train_clean
)kable(data.frame(Kelas = names(model_lda$prior),
Prior = round(model_lda$prior, 4)),
row.names = FALSE, caption = "Prior Probability per Kelas")| Kelas | Prior |
|---|---|
| Cloudy | 0.2525 |
| Rainy | 0.2483 |
| Snowy | 0.2493 |
| Sunny | 0.2498 |
| Temperature | Humidity | Wind.Speed | Atmospheric.Pressure | UV.Index | Visibility..km. | |
|---|---|---|---|---|---|---|
| Cloudy | 23.3501 | 67.6098 | 8.7114 | 1010.1655 | 3.3930 | 6.8809 |
| Rainy | 22.9847 | 79.4618 | 13.4635 | 1004.6526 | 2.4106 | 3.2606 |
| Snowy | -2.1994 | 79.8368 | 10.8440 | 991.1641 | 1.6015 | 3.2010 |
| Sunny | 32.6661 | 50.8391 | 5.9492 | 1019.6591 | 7.8062 | 7.3642 |
| LD1 | LD2 | LD3 | |
|---|---|---|---|
| Temperature | 0.0646 | -0.0677 | -0.0004 |
| Humidity | -0.0225 | -0.0101 | -0.0221 |
| Wind.Speed | -0.0331 | -0.0656 | -0.0091 |
| Atmospheric.Pressure | 0.0130 | -0.0062 | 0.0003 |
| UV.Index | 0.1325 | 0.1671 | 0.2147 |
| Visibility..km. | 0.1517 | 0.1489 | -0.4234 |
prop_trace <- model_lda$svd^2 / sum(model_lda$svd^2)
names(prop_trace) <- paste0("LD", seq_along(prop_trace))
kable(data.frame(LD = names(prop_trace),
Proportion = round(prop_trace, 4),
Cumulative = round(cumsum(prop_trace), 4)),
row.names = FALSE, caption = "Proportion of Trace")| LD | Proportion | Cumulative |
|---|---|---|
| LD1 | 0.8233 | 0.8233 |
| LD2 | 0.1241 | 0.9474 |
| LD3 | 0.0526 | 1.0000 |
prop_df <- data.frame(
LD = names(prop_trace),
Proportion = as.numeric(prop_trace),
Cumulative = cumsum(as.numeric(prop_trace))
)
ggplot(prop_df, aes(x = LD)) +
geom_bar(aes(y = Proportion), stat = "identity",
fill = "#264653", alpha = 0.85, width = 0.5) +
geom_line(aes(y = Cumulative, group = 1),
color = "#E76F51", linewidth = 1.1) +
geom_point(aes(y = Cumulative), color = "#E76F51", size = 3) +
scale_y_continuous(labels = percent_format()) +
labs(title = "Proportion of Trace per Fungsi Diskriminan",
subtitle = "Bar = variansi individual | Garis = kumulatif",
x = "Fungsi Diskriminan", y = "Proportion of Trace")lda_scores <- as.data.frame(predict(model_lda)$x)
lda_scores$Kelas <- train_clean$Weather.Type
ggplot(lda_scores, aes(x = LD1, y = LD2, color = Kelas, fill = Kelas)) +
geom_point(alpha = 0.2, size = 1) +
stat_ellipse(geom = "polygon", alpha = 0.08, level = 0.95) +
stat_ellipse(level = 0.95, linewidth = 0.9) +
scale_color_manual(values = c("#264653","#2A9D8F","#E9C46A","#E76F51")) +
scale_fill_manual(values = c("#264653","#2A9D8F","#E9C46A","#E76F51")) +
labs(title = "Discriminant Space: LD1 vs LD2",
subtitle = paste0("LD1 (", round(prop_trace[1]*100, 1),
"% of trace) | LD2 (",
round(prop_trace[2]*100, 1), "% of trace)"),
x = paste0("LD1 (", round(prop_trace[1]*100, 1), "% of trace)"),
y = paste0("LD2 (", round(prop_trace[2]*100, 1), "% of trace)"),
color = "Kelas", fill = "Kelas")APER adalah proporsi observasi yang salah diklasifikasikan pada data yang digunakan untuk membangun model (resubstitution). Ini adalah evaluasi standar dalam Analisis Diskriminan klasik (Johnson & Wichern, 2007).
pred_lda_train <- predict(model_lda)$class
cm_lda <- table(Predicted = pred_lda_train,
Actual = train_clean$Weather.Type)
n_total <- sum(cm_lda)
n_correct <- sum(diag(cm_lda))
n_misclassed <- n_total - n_correct
APER <- n_misclassed / n_total
cat("=== Confusion Matrix (Resubstitusi) ===\n")## === Confusion Matrix (Resubstitusi) ===
## Actual
## Predicted Cloudy Rainy Snowy Sunny
## Cloudy 2155 83 34 207
## Rainy 175 2217 9 100
## Snowy 25 104 2381 27
## Sunny 164 73 63 2158
##
## === APER Breakdown ===
## Total observasi : 9975
## Benar klasifikasi : 8911
## Salah klasifikasi : 1064
## APER : 0.1067
## APER (%) : 10.67 %
per_class <- data.frame(
Kelas = names(colSums(cm_lda)),
Total = as.integer(colSums(cm_lda)),
Correct = as.integer(diag(cm_lda)),
Misclassified = as.integer(colSums(cm_lda) - diag(cm_lda))
)
per_class$ClassAPER <- round(per_class$Misclassified / per_class$Total, 4)
kable(per_class, row.names = FALSE,
caption = "APER per Kelas")| Kelas | Total | Correct | Misclassified | ClassAPER |
|---|---|---|---|---|
| Cloudy | 2519 | 2155 | 364 | 0.1445 |
| Rainy | 2477 | 2217 | 260 | 0.1050 |
| Snowy | 2487 | 2381 | 106 | 0.0426 |
| Sunny | 2492 | 2158 | 334 | 0.1340 |
ggplot(per_class, aes(x = Kelas, y = ClassAPER, fill = Kelas)) +
geom_bar(stat = "identity", show.legend = FALSE, width = 0.6) +
geom_hline(yintercept = APER, linetype = "dashed",
color = "#2c3e50", linewidth = 0.9) +
scale_fill_manual(values = c("#264653","#2A9D8F","#E9C46A","#E76F51")) +
scale_y_continuous(labels = percent_format()) +
labs(title = "Per-Class Apparent Error Rate (APER)",
subtitle = paste0("Garis putus-putus = Overall APER ",
round(APER * 100, 2), "%"),
x = "Kelas Cuaca", y = "Misclassification Rate")pred_lda_test <- predict(model_lda, newdata = test_data)$class
cm_lda_test <- table(Aktual = test_data$Weather.Type,
Prediksi = pred_lda_test)
acc_lda_test <- round(mean(pred_lda_test == test_data$Weather.Type) * 100, 2)
kable(cm_lda_test, caption = "Confusion Matrix — LDA (Test Data)")| Cloudy | Rainy | Snowy | Sunny | |
|---|---|---|---|---|
| Cloudy | 536 | 46 | 18 | 60 |
| Rainy | 22 | 578 | 22 | 38 |
| Snowy | 17 | 1 | 620 | 22 |
| Sunny | 64 | 23 | 15 | 558 |
## Akurasi LDA (test data): 86.82 %
acc_kelas_lda <- round(diag(cm_lda_test) / rowSums(cm_lda_test) * 100, 2)
kable(data.frame(Kelas = names(acc_kelas_lda),
Akurasi = paste0(acc_kelas_lda, "%")),
row.names = FALSE, caption = "Akurasi per Kelas — LDA (Test Data)")| Kelas | Akurasi |
|---|---|
| Cloudy | 81.21% |
| Rainy | 87.58% |
| Snowy | 93.94% |
| Sunny | 84.55% |
final_df <- data.frame(
Aspek = c("Metode", "Asumsi Utama", "Evaluasi Utama",
"Akurasi Test Data", "Kelebihan", "Keterbatasan"),
RLM = c(
"Regresi Logistik Multinomial",
"Tidak ada outlier, independensi, tidak ada multikolinieritas, linearitas logit",
"Train/Test Split (80:20) — Akurasi pada test data",
paste0(acc_rlm, "%"),
"Tidak mensyaratkan normalitas; output probabilitas per kelas",
"Linearitas logit tidak sepenuhnya terpenuhi (dicatat sebagai limitasi)"
),
LDA = c(
"Linear Discriminant Analysis",
"Normalitas multivariat per kelas, homogenitas matriks kovarians",
paste0("APER (resubstitusi) = ", round(APER*100, 2), "%"),
paste0(acc_lda_test, "%"),
"Fungsi diskriminan eksplisit; proportion of trace informatif",
"Homogenitas kovarians formal ditolak (namun ditoleransi pada n besar)"
)
)
kable(final_df, row.names = FALSE,
col.names = c("Aspek", "RLM", "LDA"),
caption = "Perbandingan RLM vs LDA")| Aspek | RLM | LDA |
|---|---|---|
| Metode | Regresi Logistik Multinomial | Linear Discriminant Analysis |
| Asumsi Utama | Tidak ada outlier, independensi, tidak ada multikolinieritas, linearitas logit | Normalitas multivariat per kelas, homogenitas matriks kovarians |
| Evaluasi Utama | Train/Test Split (80:20) — Akurasi pada test data | APER (resubstitusi) = 10.67% |
| Akurasi Test Data | 87.92% | 86.82% |
| Kelebihan | Tidak mensyaratkan normalitas; output probabilitas per kelas | Fungsi diskriminan eksplisit; proportion of trace informatif |
| Keterbatasan | Linearitas logit tidak sepenuhnya terpenuhi (dicatat sebagai limitasi) | Homogenitas kovarians formal ditolak (namun ditoleransi pada n besar) |