LINK PUBLIKASI RPUBS : https://rpubs.com/RezaUnesaSada23/RegresiMultinomialWithPCA
Dataset yang digunakan dalam proyek ini berasal dari Kaggle dengan judul “Denpasar Bali Historical Weather Data”, yang berisi data cuaca historis Kota Denpasar dari tahun 1990 hingga 2020. Namun, untuk kebutuhan proyek ini, fokus klasifikasi diarahkan pada empat kategori utama yang relevan dengan kondisi cuaca Denpasar tahun 2019, yaitu Clouds, Rain, Thunderstorm, dan Clear.
library(readr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
data_path <- "D:/TUGAS KAMPUS SEMESTER 4/ANALISIS MULTIVARIAT/PROYEK AKHIR ANMUL/openweatherdata-denpasar-1990-2020.csv"
data <- read_csv(data_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 264924 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): dt_iso, city_name, weather_main, weather_description, weather_icon
## dbl (17): dt, timezone, lat, lon, temp, temp_min, temp_max, pressure, humidi...
## lgl (10): sea_level, grnd_level, rain_12h, rain_today, snow_1h, snow_3h, sno...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$dt_iso <- gsub(" UTC", "", data$dt_iso)
data$dt_iso <- as_datetime(data$dt_iso)
data_filtered <- data %>% filter(year(dt_iso) == 2019)
if (nrow(data_filtered) == 0) stop("Tidak ada data untuk tahun 2019!")
head(data_filtered)
## # A tibble: 6 × 32
## dt dt_iso timezone city_name lat lon temp temp_min
## <dbl> <dttm> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1546300800 2019-01-01 00:00:00 28800 Denpasar -8.65 115. 27.1 27
## 2 1546304400 2019-01-01 01:00:00 28800 Denpasar -8.65 115. 27.8 27.4
## 3 1546308000 2019-01-01 02:00:00 28800 Denpasar -8.65 115. 29.5 29
## 4 1546311600 2019-01-01 03:00:00 28800 Denpasar -8.65 115. 30.1 30
## 5 1546315200 2019-01-01 04:00:00 28800 Denpasar -8.65 115. 30.2 30
## 6 1546318800 2019-01-01 05:00:00 28800 Denpasar -8.65 115. 30 30
## # ℹ 24 more variables: temp_max <dbl>, pressure <dbl>, sea_level <lgl>,
## # grnd_level <lgl>, humidity <dbl>, wind_speed <dbl>, wind_deg <dbl>,
## # rain_1h <dbl>, rain_3h <dbl>, rain_6h <dbl>, rain_12h <lgl>,
## # rain_24h <dbl>, rain_today <lgl>, snow_1h <lgl>, snow_3h <lgl>,
## # snow_6h <lgl>, snow_12h <lgl>, snow_24h <lgl>, snow_today <lgl>,
## # clouds_all <dbl>, weather_id <dbl>, weather_main <chr>,
## # weather_description <chr>, weather_icon <chr>
cat("Jumlah baris:", nrow(data_filtered), "\n")
## Jumlah baris: 8822
cat("Jumlah kolom:", ncol(data_filtered), "\n")
## Jumlah kolom: 32
na_summary <- data.frame(Kolom = names(data_filtered),
Jumlah_NA = colSums(is.na(data_filtered))) %>%
arrange(desc(Jumlah_NA))
na_summary
## Kolom Jumlah_NA
## sea_level sea_level 8822
## grnd_level grnd_level 8822
## rain_12h rain_12h 8822
## rain_today rain_today 8822
## snow_1h snow_1h 8822
## snow_3h snow_3h 8822
## snow_6h snow_6h 8822
## snow_12h snow_12h 8822
## snow_24h snow_24h 8822
## snow_today snow_today 8822
## rain_1h rain_1h 8813
## rain_24h rain_24h 8543
## rain_6h rain_6h 8541
## rain_3h rain_3h 8275
## dt dt 0
## dt_iso dt_iso 0
## timezone timezone 0
## city_name city_name 0
## lat lat 0
## lon lon 0
## temp temp 0
## temp_min temp_min 0
## temp_max temp_max 0
## pressure pressure 0
## humidity humidity 0
## wind_speed wind_speed 0
## wind_deg wind_deg 0
## clouds_all clouds_all 0
## weather_id weather_id 0
## weather_main weather_main 0
## weather_description weather_description 0
## weather_icon weather_icon 0
freq_weather <- table(data_filtered$weather_main, useNA = "ifany")
weather_summary <- as.data.frame(freq_weather)
colnames(weather_summary) <- c("Weather Main", "Jumlah")
weather_summary
## Weather Main Jumlah
## 1 Clear 22
## 2 Clouds 8075
## 3 Haze 5
## 4 Mist 3
## 5 Rain 571
## 6 Smoke 3
## 7 Thunderstorm 143
data_filtered <- data_filtered %>%
select(temp, temp_min, temp_max, pressure, humidity,
wind_speed, wind_deg, clouds_all, weather_main) %>%
filter(weather_main %in% c("Clouds", "Rain", "Thunderstorm", "Clear"))
if (length(unique(data_filtered$weather_main)) < 2) {
stop("Kolom weather_main hanya memiliki satu kategori setelah filter. Tidak bisa dilakukan split!")
}
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
data_filtered$weather_main <- as.integer(data_filtered$weather_main) - 1
is.ordered(data_filtered$weather_main)
## [1] FALSE
str(data_filtered)
## tibble [8,811 × 9] (S3: tbl_df/tbl/data.frame)
## $ temp : num [1:8811] 27.1 27.8 29.5 30.1 30.1 ...
## $ temp_min : num [1:8811] 27 27.4 29 30 30 30 30 29 29 29 ...
## $ temp_max : num [1:8811] 27.4 28 30.4 30.4 30.4 30 30 30 29.4 29.4 ...
## $ pressure : num [1:8811] 1010 1010 1010 1010 1009 ...
## $ humidity : num [1:8811] 94 88 83 79 79 79 74 79 83 79 ...
## $ wind_speed : num [1:8811] 0.5 2.1 3.1 4.1 4.1 5.1 4.6 5.7 5.7 5.7 ...
## $ wind_deg : num [1:8811] 0 240 240 260 260 250 260 260 260 270 ...
## $ clouds_all : num [1:8811] 40 20 20 20 20 20 20 20 20 20 ...
## $ weather_main: num [1:8811] 2 2 1 1 1 1 1 1 1 1 ...
unique(data_filtered$weather_main)
## [1] 2 1 0 3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
split_index <- createDataPartition(data_filtered$weather_main, p = 0.7, list = FALSE)
train_data <- data_filtered[split_index, ]
test_data <- data_filtered[-split_index, ]
cat("Jumlah data Train :", nrow(train_data), "\n")
## Jumlah data Train : 6170
cat("Jumlah data Test :", nrow(test_data), "\n")
## Jumlah data Test : 2641
bisa dilihat berdasarkan distrubusi pada kolom target, terdapat ketidakseimbangan data maka dilakukan penangan imbalaced data. penganan ini dilakukan pada data train.
if (!require(smotefamily)) install.packages("smotefamily")
## Loading required package: smotefamily
## Warning: package 'smotefamily' was built under R version 4.4.3
library(smotefamily)
set.seed(123)
n_total_train <- nrow(train_data)
n_target_final <- n_total_train * 2
n_classes <- length(levels(train_data$weather_main))
n_per_class <- floor(n_target_final / n_classes)
list_per_class <- split(train_data, train_data$weather_main)
balanced_list <- list()
for (cls in names(list_per_class)) {
data_cls <- list_per_class[[cls]]
n_cls <- nrow(data_cls)
balanced_list[[cls]] <- if (n_cls > n_per_class) {
slice_sample(data_cls, n = n_per_class)
} else {
slice_sample(data_cls, n = n_per_class, replace = TRUE)
}
}
train_balanced <- bind_rows(balanced_list)
cat("Distribusi kelas setelah balancing di train_data:\n")
## Distribusi kelas setelah balancing di train_data:
table(train_balanced$weather_main)
##
## 0 1 2 3
## 3085 3085 3085 3085
cat("Total data train setelah balancing:", nrow(train_balanced), "\n")
## Total data train setelah balancing: 12340
cat("Distribusi kelas di test_data (tidak diubah):\n")
## Distribusi kelas di test_data (tidak diubah):
table(test_data$weather_main)
##
## 0 1 2 3
## 6 2422 171 42
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
temp_vars <- dplyr::select(train_balanced, temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars, scale. = TRUE)
pc_df <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc_df) <- "pca_temp"
train_balanced_pca <- train_balanced %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc_df)
lm_model <- lm(as.numeric(weather_main) ~ ., data = train_balanced_pca)
vif_values <- vif(lm_model)
vif_values
## pressure humidity wind_speed wind_deg clouds_all pca_temp
## 1.260702 2.715875 1.195759 1.116160 1.341947 2.465386
train_balanced_pca$weather_main <- as.factor(train_balanced_pca$weather_main)
num_vars <- names(train_balanced_pca)[sapply(train_balanced_pca, is.numeric)]
chi_sq_test <- function(varname, data, target) {
binned_var <- cut(data[[varname]], breaks = 5, include.lowest = TRUE)
contingency_table <- table(binned_var, data[[target]])
if (all(dim(contingency_table) > 1)) {
chisq.test(contingency_table)$p.value
} else NA
}
chi_results <- sapply(num_vars, chi_sq_test, data = train_balanced_pca, target = "weather_main")
chi_df <- data.frame(Variable = names(chi_results), P_Value = chi_results)
chi_df
## Variable P_Value
## pressure pressure 4.916433e-308
## humidity humidity 0.000000e+00
## wind_speed wind_speed 4.055834e-221
## wind_deg wind_deg 0.000000e+00
## clouds_all clouds_all 0.000000e+00
## pca_temp pca_temp 6.137522e-146
feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")
min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))
train_balanced_norm <- train_balanced
train_balanced_norm[feature_cols] <- lapply(train_balanced[feature_cols], min_max_norm)
test_data_norm <- test_data
for (col in feature_cols) {
min_val <- min(train_balanced[[col]])
max_val <- max(train_balanced[[col]])
test_data_norm[[col]] <- (test_data[[col]] - min_val) / (max_val - min_val)
}
summary(train_balanced_norm)
## temp temp_min temp_max pressure
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3875 1st Qu.:0.3824 1st Qu.:0.3923 1st Qu.:0.2818
## Median :0.4760 Median :0.4792 Median :0.4923 Median :0.4545
## Mean :0.4919 Mean :0.4819 Mean :0.5089 Mean :0.4520
## 3rd Qu.:0.5683 3rd Qu.:0.5536 3rd Qu.:0.6154 3rd Qu.:0.6364
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## humidity wind_speed wind_deg clouds_all
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5333 1st Qu.:0.1201 1st Qu.:0.2778 1st Qu.:0.0700
## Median :0.6222 Median :0.2319 Median :0.3889 Median :0.4000
## Mean :0.6381 Mean :0.2411 Mean :0.4789 Mean :0.3484
## 3rd Qu.:0.7778 3rd Qu.:0.3438 3rd Qu.:0.7500 3rd Qu.:0.7500
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## weather_main
## 0:3085
## 1:3085
## 2:3085
## 3:3085
##
##
summary(test_data_norm)
## temp temp_min temp_max pressure
## Min. :0.0007628 Min. :-0.04167 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3836766 1st Qu.: 0.38244 1st Qu.:0.3846 1st Qu.:0.3636
## Median :0.4774981 Median : 0.47917 Median :0.5000 Median :0.5455
## Mean :0.4959084 Mean : 0.48856 Mean :0.5120 Mean :0.5390
## 3rd Qu.:0.6041190 3rd Qu.: 0.58333 3rd Qu.:0.6154 3rd Qu.:0.7273
## Max. :1.0099161 Max. : 1.00000 Max. :1.0308 Max. :1.0000
## humidity wind_speed wind_deg clouds_all
## Min. :-0.06667 Min. :-0.005966 Min. :0.0000 Min. :0.0000
## 1st Qu.: 0.51111 1st Qu.: 0.120060 1st Qu.:0.2778 1st Qu.:0.2000
## Median : 0.62222 Median : 0.231916 Median :0.3333 Median :0.2000
## Mean : 0.61262 Mean : 0.255270 Mean :0.4259 Mean :0.3099
## 3rd Qu.: 0.73333 3rd Qu.: 0.385533 3rd Qu.:0.6944 3rd Qu.:0.4000
## Max. : 1.00000 Max. : 0.925429 Max. :1.0000 Max. :1.0000
## weather_main
## 0: 6
## 1:2422
## 2: 171
## 3: 42
##
##
library(ggplot2)
weather_summary$proporsi <- weather_summary$Jumlah / sum(weather_summary$Jumlah)
ggplot(weather_summary, aes(x = reorder(`Weather Main`, -Jumlah), y = Jumlah, fill = `Weather Main`)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(title = "Distribusi Kategori Cuaca di Denpasar (2019)",
x = "Kategori Cuaca", y = "Jumlah")
library(ggplot2)
library(dplyr)
train_weather_summary <- train_balanced %>%
group_by(weather_main) %>%
summarise(Jumlah = n()) %>%
arrange(desc(Jumlah))
train_weather_summary$Proporsi <- train_weather_summary$Jumlah / sum(train_weather_summary$Jumlah)
ggplot(train_weather_summary, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(title = "Distribusi Kategori Cuaca pada Data Train (Balanced)",
x = "Kategori Cuaca", y = "Jumlah")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
correlation_matrix <- cor(train_balanced_norm[, feature_cols])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black",
addCoef.col = "black", number.cex = 0.7, diag = FALSE)
karena fitur temp, temp_min, & temp_max mempunyai korelasi yang sangat tinggi, maka saya lakukan PCA kepada ketiga fitur menjadi 1 fitur (pca_temp).
library(nnet)
temp_vars_train <- train_balanced_norm %>% dplyr::select(temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars_train, scale. = TRUE)
pc1_train <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc1_train) <- "pca_temp"
train_mlr <- train_balanced_norm %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc1_train)
model_mlr <- multinom(weather_main ~ ., data = train_mlr, trace = FALSE)
temp_vars_test <- test_data_norm %>% dplyr::select(temp, temp_min, temp_max)
pc1_test_mat <- predict(pca_temp, newdata = temp_vars_test)[, 1, drop = FALSE]
pc1_test_df <- as.data.frame(pc1_test_mat)
colnames(pc1_test_df) <- "pca_temp"
test_mlr <- test_data_norm %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc1_test_df)
test_mlr$weather_main <- as.factor(test_mlr$weather_main)
pred_mlr <- predict(model_mlr, test_mlr)
conf_matrix_mlr <- confusionMatrix(pred_mlr, test_mlr$weather_main)
conf_matrix_mlr
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3
## 0 4 0 0 0
## 1 2 1897 34 6
## 2 0 248 84 11
## 3 0 277 53 25
##
## Overall Statistics
##
## Accuracy : 0.7611
## 95% CI : (0.7443, 0.7772)
## No Information Rate : 0.9171
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2442
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity 0.666667 0.7832 0.49123 0.595238
## Specificity 1.000000 0.8082 0.89514 0.873028
## Pos Pred Value 1.000000 0.9783 0.24490 0.070423
## Neg Pred Value 0.999242 0.2521 0.96214 0.992563
## Prevalence 0.002272 0.9171 0.06475 0.015903
## Detection Rate 0.001515 0.7183 0.03181 0.009466
## Detection Prevalence 0.001515 0.7342 0.12988 0.134419
## Balanced Accuracy 0.833333 0.7957 0.69318 0.734133
class_accuracy <- diag(conf_matrix_mlr$table) / rowSums(conf_matrix_mlr$table)
class_accuracy_df <- data.frame(Kelas = names(class_accuracy),
Akurasi = round(class_accuracy, 3))
ggplot(class_accuracy_df, aes(x = reorder(Kelas, -Akurasi), y = Akurasi, fill = Kelas)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Akurasi), vjust = -0.3, size = 3.5) +
theme_minimal() +
labs(title = "Akurasi per Kelas Cuaca",
x = "Kategori Cuaca", y = "Akurasi") +
ylim(0, 1)
bisa dilihat, secara per kelas, performa model sangat bervariasi. Untuk
kelas 0, sensitivitas dan spesifisitas hampir sempurna (1.000 dan
0.9996), namun ini bisa disebabkan oleh prevalensi yang sangat rendah
(0.22%). Kelas 1 menunjukkan performa terbaik dengan sensitivitas 78.41%
dan nilai prediksi positif 97.94%. Sebaliknya, kelas 2 dan 3 memiliki
sensitivitas rendah serta nilai prediksi positif yang juga rendah,
menandakan bahwa model mengalami kesulitan membedakan kedua kelas
ini.
library(ggplot2)
library(caret)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
# Ubah confusion matrix menjadi data frame
cm_df <- as.data.frame(conf_matrix_mlr$table)
# Visualisasi heatmap
ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 4) +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = "Confusion Matrix Multinomial Logistic Regression",
x = "Actual (Reference)",
y = "Predicted") +
theme_minimal()