Dataset yang digunakan dalam proyek ini berasal dari Kaggle dengan judul “Denpasar Bali Historical Weather Data”, yang berisi data cuaca historis Kota Denpasar dari tahun 1990 hingga 2020. Namun, untuk kebutuhan proyek ini, fokus klasifikasi diarahkan pada empat kategori utama yang relevan dengan kondisi cuaca Denpasar tahun 2019, yaitu Clouds, Rain, Thunderstorm, dan Clear.
library(readr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
data_path <- "D:/TUGAS KAMPUS SEMESTER 4/ANALISIS MULTIVARIAT/PROYEK AKHIR ANMUL/openweatherdata-denpasar-1990-2020.csv"
data <- read_csv(data_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 264924 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): dt_iso, city_name, weather_main, weather_description, weather_icon
## dbl (17): dt, timezone, lat, lon, temp, temp_min, temp_max, pressure, humidi...
## lgl (10): sea_level, grnd_level, rain_12h, rain_today, snow_1h, snow_3h, sno...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$dt_iso <- gsub(" UTC", "", data$dt_iso)
data$dt_iso <- as_datetime(data$dt_iso)
data_filtered <- data %>% filter(year(dt_iso) == 2019)
if (nrow(data_filtered) == 0) stop("Tidak ada data untuk tahun 2019!")
head(data_filtered)
## # A tibble: 6 × 32
## dt dt_iso timezone city_name lat lon temp temp_min
## <dbl> <dttm> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1546300800 2019-01-01 00:00:00 28800 Denpasar -8.65 115. 27.1 27
## 2 1546304400 2019-01-01 01:00:00 28800 Denpasar -8.65 115. 27.8 27.4
## 3 1546308000 2019-01-01 02:00:00 28800 Denpasar -8.65 115. 29.5 29
## 4 1546311600 2019-01-01 03:00:00 28800 Denpasar -8.65 115. 30.1 30
## 5 1546315200 2019-01-01 04:00:00 28800 Denpasar -8.65 115. 30.2 30
## 6 1546318800 2019-01-01 05:00:00 28800 Denpasar -8.65 115. 30 30
## # ℹ 24 more variables: temp_max <dbl>, pressure <dbl>, sea_level <lgl>,
## # grnd_level <lgl>, humidity <dbl>, wind_speed <dbl>, wind_deg <dbl>,
## # rain_1h <dbl>, rain_3h <dbl>, rain_6h <dbl>, rain_12h <lgl>,
## # rain_24h <dbl>, rain_today <lgl>, snow_1h <lgl>, snow_3h <lgl>,
## # snow_6h <lgl>, snow_12h <lgl>, snow_24h <lgl>, snow_today <lgl>,
## # clouds_all <dbl>, weather_id <dbl>, weather_main <chr>,
## # weather_description <chr>, weather_icon <chr>
cat("Jumlah baris:", nrow(data_filtered), "\n")
## Jumlah baris: 8822
cat("Jumlah kolom:", ncol(data_filtered), "\n")
## Jumlah kolom: 32
na_summary <- data.frame(Kolom = names(data_filtered),
Jumlah_NA = colSums(is.na(data_filtered))) %>%
arrange(desc(Jumlah_NA))
na_summary
## Kolom Jumlah_NA
## sea_level sea_level 8822
## grnd_level grnd_level 8822
## rain_12h rain_12h 8822
## rain_today rain_today 8822
## snow_1h snow_1h 8822
## snow_3h snow_3h 8822
## snow_6h snow_6h 8822
## snow_12h snow_12h 8822
## snow_24h snow_24h 8822
## snow_today snow_today 8822
## rain_1h rain_1h 8813
## rain_24h rain_24h 8543
## rain_6h rain_6h 8541
## rain_3h rain_3h 8275
## dt dt 0
## dt_iso dt_iso 0
## timezone timezone 0
## city_name city_name 0
## lat lat 0
## lon lon 0
## temp temp 0
## temp_min temp_min 0
## temp_max temp_max 0
## pressure pressure 0
## humidity humidity 0
## wind_speed wind_speed 0
## wind_deg wind_deg 0
## clouds_all clouds_all 0
## weather_id weather_id 0
## weather_main weather_main 0
## weather_description weather_description 0
## weather_icon weather_icon 0
freq_weather <- table(data_filtered$weather_main, useNA = "ifany")
weather_summary <- as.data.frame(freq_weather)
colnames(weather_summary) <- c("Weather Main", "Jumlah")
weather_summary
## Weather Main Jumlah
## 1 Clear 22
## 2 Clouds 8075
## 3 Haze 5
## 4 Mist 3
## 5 Rain 571
## 6 Smoke 3
## 7 Thunderstorm 143
data_filtered <- data_filtered %>%
select(temp, temp_min, temp_max, pressure, humidity,
wind_speed, wind_deg, clouds_all, weather_main) %>%
filter(weather_main %in% c("Clouds", "Rain", "Thunderstorm", "Clear"))
if (length(unique(data_filtered$weather_main)) < 2) {
stop("Kolom weather_main hanya memiliki satu kategori setelah filter. Tidak bisa dilakukan split!")
}
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
data_filtered$weather_main <- as.integer(data_filtered$weather_main) - 1
is.ordered(data_filtered$weather_main)
## [1] FALSE
str(data_filtered)
## tibble [8,811 × 9] (S3: tbl_df/tbl/data.frame)
## $ temp : num [1:8811] 27.1 27.8 29.5 30.1 30.1 ...
## $ temp_min : num [1:8811] 27 27.4 29 30 30 30 30 29 29 29 ...
## $ temp_max : num [1:8811] 27.4 28 30.4 30.4 30.4 30 30 30 29.4 29.4 ...
## $ pressure : num [1:8811] 1010 1010 1010 1010 1009 ...
## $ humidity : num [1:8811] 94 88 83 79 79 79 74 79 83 79 ...
## $ wind_speed : num [1:8811] 0.5 2.1 3.1 4.1 4.1 5.1 4.6 5.7 5.7 5.7 ...
## $ wind_deg : num [1:8811] 0 240 240 260 260 250 260 260 260 270 ...
## $ clouds_all : num [1:8811] 40 20 20 20 20 20 20 20 20 20 ...
## $ weather_main: num [1:8811] 2 2 1 1 1 1 1 1 1 1 ...
unique(data_filtered$weather_main)
## [1] 2 1 0 3
data_filtered <- data_filtered %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
cat("Jumlah nilai NA setelah imputasi (dengan mean):\n")
## Jumlah nilai NA setelah imputasi (dengan mean):
print(colSums(is.na(data_filtered)))
## temp temp_min temp_max pressure humidity wind_speed
## 0 0 0 0 0 0
## wind_deg clouds_all weather_main
## 0 0 0
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
split_index <- createDataPartition(data_filtered$weather_main, p = 0.7, list = FALSE)
train_data <- data_filtered[split_index, ]
test_data <- data_filtered[-split_index, ]
cat("Jumlah data Train :", nrow(train_data), "\n")
## Jumlah data Train : 6170
cat("Jumlah data Test :", nrow(test_data), "\n")
## Jumlah data Test : 2641
bisa dilihat berdasarkan distribusi pada kolom target, terdapat ketidakseimbangan data maka dilakukan penangan imbalaced data. penganan ini dilakukan pada data train.
if (!require(smotefamily)) install.packages("smotefamily")
## Loading required package: smotefamily
## Warning: package 'smotefamily' was built under R version 4.4.3
library(smotefamily)
set.seed(123)
n_total_train <- nrow(train_data)
n_target_final <- n_total_train * 2
n_classes <- length(levels(train_data$weather_main))
n_per_class <- floor(n_target_final / n_classes)
list_per_class <- split(train_data, train_data$weather_main)
balanced_list <- list()
for (cls in names(list_per_class)) {
data_cls <- list_per_class[[cls]]
n_cls <- nrow(data_cls)
balanced_list[[cls]] <- if (n_cls > n_per_class) {
slice_sample(data_cls, n = n_per_class)
} else {
slice_sample(data_cls, n = n_per_class, replace = TRUE)
}
}
train_balanced <- bind_rows(balanced_list)
cat("Distribusi kelas setelah balancing di train_data:\n")
## Distribusi kelas setelah balancing di train_data:
table(train_balanced$weather_main)
##
## 0 1 2 3
## 3085 3085 3085 3085
cat("Total data train setelah balancing:", nrow(train_balanced), "\n")
## Total data train setelah balancing: 12340
cat("Distribusi kelas di test_data (tidak diubah):\n")
## Distribusi kelas di test_data (tidak diubah):
table(test_data$weather_main)
##
## 0 1 2 3
## 6 2422 171 42
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
temp_vars <- dplyr::select(train_balanced, temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars, scale. = TRUE)
pc_df <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc_df) <- "pca_temp"
train_balanced_pca <- train_balanced %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc_df)
lm_model <- lm(as.numeric(weather_main) ~ ., data = train_balanced_pca)
vif_values <- vif(lm_model)
vif_values
## pressure humidity wind_speed wind_deg clouds_all pca_temp
## 1.319953 2.649630 1.193450 1.106895 1.411245 2.327425
train_balanced_pca$weather_main <- as.factor(train_balanced_pca$weather_main)
num_vars <- names(train_balanced_pca)[sapply(train_balanced_pca, is.numeric)]
chi_sq_test <- function(varname, data, target) {
binned_var <- cut(data[[varname]], breaks = 5, include.lowest = TRUE)
contingency_table <- table(binned_var, data[[target]])
if (all(dim(contingency_table) > 1)) {
chisq.test(contingency_table)$p.value
} else NA
}
chi_results <- sapply(num_vars, chi_sq_test, data = train_balanced_pca, target = "weather_main")
## Warning in chisq.test(contingency_table): Chi-squared approximation may be
## incorrect
chi_df <- data.frame(Variable = names(chi_results), P_Value = chi_results)
chi_df
## Variable P_Value
## pressure pressure 2.742739e-305
## humidity humidity 0.000000e+00
## wind_speed wind_speed 0.000000e+00
## wind_deg wind_deg 0.000000e+00
## clouds_all clouds_all 0.000000e+00
## pca_temp pca_temp 1.094242e-219
feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")
min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))
train_balanced_norm <- train_balanced
train_balanced_norm[feature_cols] <- lapply(train_balanced[feature_cols], min_max_norm)
test_data_norm <- test_data
for (col in feature_cols) {
min_val <- min(train_balanced[[col]])
max_val <- max(train_balanced[[col]])
test_data_norm[[col]] <- (test_data[[col]] - min_val) / (max_val - min_val)
}
summary(train_balanced_norm)
## temp temp_min temp_max pressure
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3494 1st Qu.:0.3750 1st Qu.:0.3226 1st Qu.:0.2727
## Median :0.4469 Median :0.4792 Median :0.4355 Median :0.4545
## Mean :0.4610 Mean :0.4820 Mean :0.4532 Mean :0.4463
## 3rd Qu.:0.5697 3rd Qu.:0.5536 3rd Qu.:0.5645 3rd Qu.:0.6364
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## humidity wind_speed wind_deg clouds_all
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5417 1st Qu.:0.1301 1st Qu.:0.3056 1st Qu.:0.1000
## Median :0.6458 Median :0.2841 Median :0.3889 Median :0.4000
## Mean :0.6566 Mean :0.2719 Mean :0.4819 Mean :0.3479
## 3rd Qu.:0.8750 3rd Qu.:0.3610 3rd Qu.:0.7500 3rd Qu.:0.4000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## weather_main
## 0:3085
## 1:3085
## 2:3085
## 3:3085
##
##
summary(test_data_norm)
## temp temp_min temp_max pressure
## Min. :-0.04834 Min. :-0.04167 Min. :-0.08065 Min. :0.0000
## 1st Qu.: 0.35578 1st Qu.: 0.39732 1st Qu.: 0.33065 1st Qu.:0.3636
## Median : 0.45325 Median : 0.47917 Median : 0.46774 Median :0.5455
## Mean : 0.47085 Mean : 0.49226 Mean : 0.46185 Mean :0.5318
## 3rd Qu.: 0.58162 3rd Qu.: 0.59077 3rd Qu.: 0.56452 3rd Qu.:0.6636
## Max. : 0.97464 Max. : 0.96280 Max. : 1.00000 Max. :1.0909
## humidity wind_speed wind_deg clouds_all
## Min. :0.0625 Min. :0.006159 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5417 1st Qu.:0.130100 1st Qu.:0.2778 1st Qu.:0.2000
## Median :0.6458 Median :0.284065 Median :0.3333 Median :0.2000
## Mean :0.6314 Mean :0.275867 Mean :0.4256 Mean :0.3152
## 3rd Qu.:0.7500 3rd Qu.:0.407236 3rd Qu.:0.6944 3rd Qu.:0.4000
## Max. :1.0000 Max. :1.038491 Max. :1.0000 Max. :1.0000
## weather_main
## 0: 6
## 1:2422
## 2: 171
## 3: 42
##
##
library(ggplot2)
library(dplyr)
train_weather_summary_1 <- train_data %>%
group_by(weather_main) %>%
summarise(Jumlah = n()) %>%
arrange(desc(Jumlah))
train_weather_summary_1$Proporsi <- train_weather_summary_1$Jumlah / sum(train_weather_summary_1$Jumlah)
ggplot(train_weather_summary_1, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(title = "Distribusi Kategori Cuaca pada Data Train (Sebelum Balanced)",
x = "Kategori Cuaca", y = "Jumlah")
library(ggplot2)
library(dplyr)
train_weather_summary <- train_balanced %>%
group_by(weather_main) %>%
summarise(Jumlah = n()) %>%
arrange(desc(Jumlah))
train_weather_summary$Proporsi <- train_weather_summary$Jumlah / sum(train_weather_summary$Jumlah)
ggplot(train_weather_summary, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(title = "Distribusi Kategori Cuaca pada Data Train (Balanced)",
x = "Kategori Cuaca", y = "Jumlah")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
correlation_matrix <- cor(train_balanced_norm[, feature_cols])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black",
addCoef.col = "black", number.cex = 0.7, diag = FALSE)
karena fitur temp, temp_min, & temp_max mempunyai korelasi yang sangat tinggi, maka saya lakukan PCA kepada ketiga fitur menjadi 1 fitur (pca_temp).
library(nnet)
temp_vars_train <- train_balanced_norm %>% dplyr::select(temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars_train, scale. = TRUE)
pc1_train <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc1_train) <- "pca_temp"
train_mlr <- train_balanced_norm %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc1_train)
model_mlr <- multinom(weather_main ~ ., data = train_mlr, trace = FALSE)
temp_vars_test <- test_data_norm %>% dplyr::select(temp, temp_min, temp_max)
pc1_test_mat <- predict(pca_temp, newdata = temp_vars_test)[, 1, drop = FALSE]
pc1_test_df <- as.data.frame(pc1_test_mat)
colnames(pc1_test_df) <- "pca_temp"
test_mlr <- test_data_norm %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc1_test_df)
test_mlr$weather_main <- as.factor(test_mlr$weather_main)
pred_mlr <- predict(model_mlr, test_mlr)
conf_matrix_mlr <- confusionMatrix(pred_mlr, test_mlr$weather_main)
conf_matrix_mlr
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3
## 0 4 2 0 0
## 1 2 1873 34 9
## 2 0 257 82 14
## 3 0 290 55 19
##
## Overall Statistics
##
## Accuracy : 0.749
## 95% CI : (0.732, 0.7654)
## No Information Rate : 0.9171
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2231
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity 0.666667 0.7733 0.47953 0.452381
## Specificity 0.999241 0.7945 0.89028 0.867257
## Pos Pred Value 0.666667 0.9765 0.23229 0.052198
## Neg Pred Value 0.999241 0.2407 0.96110 0.989899
## Prevalence 0.002272 0.9171 0.06475 0.015903
## Detection Rate 0.001515 0.7092 0.03105 0.007194
## Detection Prevalence 0.002272 0.7262 0.13366 0.137827
## Balanced Accuracy 0.832954 0.7839 0.68491 0.659819
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
library(corrplot)
cor_matrix <- cor(train_mlr %>% dplyr::select(where(is.numeric)))
cor_df <- melt(cor_matrix)
ggplot(cor_df, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = "Korelasi") +
theme_minimal() +
labs(title = "Heatmap Korelasi Setelah PCA Suhu") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45, addCoef.col = "black",
title = "Korelasi Fitur Setelah PCA Suhu", mar=c(0,0,1,0))
class_accuracy <- diag(conf_matrix_mlr$table) / rowSums(conf_matrix_mlr$table)
class_accuracy_df <- data.frame(Kelas = names(class_accuracy),
Akurasi = round(class_accuracy, 3))
ggplot(class_accuracy_df, aes(x = reorder(Kelas, -Akurasi), y = Akurasi, fill = Kelas)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Akurasi), vjust = -0.3, size = 3.5) +
theme_minimal() +
labs(title = "Akurasi per Kelas Cuaca",
x = "Kategori Cuaca", y = "Akurasi") +
ylim(0, 1)
Grafik menunjukkan bahwa model memiliki akurasi tinggi pada kelas cuaca
1 (97,7%) dan 0 (85,7%), namun sangat rendah pada kelas 2 (21,4%) dan 3
(7,8%). Hal ini mengindikasikan bahwa model cenderung hanya mampu
mengenali cuaca yang umum atau dominan, sementara gagal membedakan kelas
cuaca yang lebih jarang.
ggplot(train_mlr, aes(x = weather_main, y = pca_temp, fill = weather_main)) +
geom_boxplot() +
labs(title = "Distribusi PCA Suhu berdasarkan Weather",
x = "Weather Main", y = "PCA Temp (Komponen 1)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
Boxplot di atas menunjukkan distribusi nilai PCA dari suhu (gabungan
temp, temp_min, dan temp_max) berdasarkan kategori cuaca utama
(weather_main). Terlihat bahwa nilai tengah (median) untuk masing-masing
kategori cuaca berada di sekitar nol, menandakan tidak ada perbedaan
ekstrem antar kelompok.
library(ggplot2)
library(caret)
library(reshape2)
cm_df <- as.data.frame(conf_matrix_mlr$table)
ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 4) +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = "Confusion Matrix Multinomial Logistic Regression",
x = "Actual (Reference)",
y = "Predicted") +
theme_minimal()
test_features_pca <- test_mlr %>% dplyr::select(-weather_main)
dist_matrix_pca <- dist(test_features_pca)
mds_coords_pca <- cmdscale(dist_matrix_pca, k = 2) %>% as.data.frame()
colnames(mds_coords_pca) <- c("Dim1", "Dim2")
mds_coords_pca$Predicted <- pred_mlr
mds_coords_pca$Actual <- test_mlr$weather_main
ggplot(mds_coords_pca, aes(x = Dim1, y = Dim2, color = Predicted, shape = Actual)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "MDS Plot - MLR dengan PCA pada Suhu",
x = "Dimensi 1", y = "Dimensi 2") +
theme_minimal()
Plot ini menunjukkan visualisasi data cuaca hasil reduksi dimensi
menggunakan MDS berdasarkan hasil PCA suhu dan prediksi model MLR.
erlihat bahwa sebagian besar data terkonsentrasi di tengah, terutama
label dengan kode 1 (warna hijau), yang menunjukkan bahwa model lebih
sering memprediksi kelas tersebut dibandingkan yang lain.