Dataset yang digunakan dalam proyek ini berasal dari Kaggle dengan judul “Denpasar Bali Historical Weather Data”, yang berisi data cuaca historis Kota Denpasar dari tahun 1990 hingga 2020. Namun, untuk kebutuhan proyek ini, fokus klasifikasi diarahkan pada empat kategori utama yang relevan dengan kondisi cuaca Denpasar tahun 2019, yaitu Clouds, Rain, Thunderstorm, dan Clear.
library(readr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
data_path <- "D:/TUGAS KAMPUS SEMESTER 4/ANALISIS MULTIVARIAT/PROYEK AKHIR ANMUL/openweatherdata-denpasar-1990-2020.csv"
data <- read_csv(data_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 264924 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): dt_iso, city_name, weather_main, weather_description, weather_icon
## dbl (17): dt, timezone, lat, lon, temp, temp_min, temp_max, pressure, humidi...
## lgl (10): sea_level, grnd_level, rain_12h, rain_today, snow_1h, snow_3h, sno...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$dt_iso <- gsub(" UTC", "", data$dt_iso)
data$dt_iso <- as_datetime(data$dt_iso)
data_filtered <- data %>% filter(year(dt_iso) == 2019)
if (nrow(data_filtered) == 0) stop("Tidak ada data untuk tahun 2019!")
head(data_filtered)
## # A tibble: 6 × 32
## dt dt_iso timezone city_name lat lon temp temp_min
## <dbl> <dttm> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1546300800 2019-01-01 00:00:00 28800 Denpasar -8.65 115. 27.1 27
## 2 1546304400 2019-01-01 01:00:00 28800 Denpasar -8.65 115. 27.8 27.4
## 3 1546308000 2019-01-01 02:00:00 28800 Denpasar -8.65 115. 29.5 29
## 4 1546311600 2019-01-01 03:00:00 28800 Denpasar -8.65 115. 30.1 30
## 5 1546315200 2019-01-01 04:00:00 28800 Denpasar -8.65 115. 30.2 30
## 6 1546318800 2019-01-01 05:00:00 28800 Denpasar -8.65 115. 30 30
## # ℹ 24 more variables: temp_max <dbl>, pressure <dbl>, sea_level <lgl>,
## # grnd_level <lgl>, humidity <dbl>, wind_speed <dbl>, wind_deg <dbl>,
## # rain_1h <dbl>, rain_3h <dbl>, rain_6h <dbl>, rain_12h <lgl>,
## # rain_24h <dbl>, rain_today <lgl>, snow_1h <lgl>, snow_3h <lgl>,
## # snow_6h <lgl>, snow_12h <lgl>, snow_24h <lgl>, snow_today <lgl>,
## # clouds_all <dbl>, weather_id <dbl>, weather_main <chr>,
## # weather_description <chr>, weather_icon <chr>
cat("Jumlah baris:", nrow(data_filtered), "\n")
## Jumlah baris: 8822
cat("Jumlah kolom:", ncol(data_filtered), "\n")
## Jumlah kolom: 32
na_summary <- data.frame(Kolom = names(data_filtered),
Jumlah_NA = colSums(is.na(data_filtered))) %>%
arrange(desc(Jumlah_NA))
na_summary
## Kolom Jumlah_NA
## sea_level sea_level 8822
## grnd_level grnd_level 8822
## rain_12h rain_12h 8822
## rain_today rain_today 8822
## snow_1h snow_1h 8822
## snow_3h snow_3h 8822
## snow_6h snow_6h 8822
## snow_12h snow_12h 8822
## snow_24h snow_24h 8822
## snow_today snow_today 8822
## rain_1h rain_1h 8813
## rain_24h rain_24h 8543
## rain_6h rain_6h 8541
## rain_3h rain_3h 8275
## dt dt 0
## dt_iso dt_iso 0
## timezone timezone 0
## city_name city_name 0
## lat lat 0
## lon lon 0
## temp temp 0
## temp_min temp_min 0
## temp_max temp_max 0
## pressure pressure 0
## humidity humidity 0
## wind_speed wind_speed 0
## wind_deg wind_deg 0
## clouds_all clouds_all 0
## weather_id weather_id 0
## weather_main weather_main 0
## weather_description weather_description 0
## weather_icon weather_icon 0
freq_weather <- table(data_filtered$weather_main, useNA = "ifany")
weather_summary <- as.data.frame(freq_weather)
colnames(weather_summary) <- c("Weather Main", "Jumlah")
weather_summary
## Weather Main Jumlah
## 1 Clear 22
## 2 Clouds 8075
## 3 Haze 5
## 4 Mist 3
## 5 Rain 571
## 6 Smoke 3
## 7 Thunderstorm 143
data_filtered <- data_filtered %>%
select(temp, temp_min, temp_max, pressure, humidity,
wind_speed, wind_deg, clouds_all, weather_main) %>%
filter(weather_main %in% c("Clouds", "Rain", "Thunderstorm", "Clear"))
if (length(unique(data_filtered$weather_main)) < 2) {
stop("Kolom weather_main hanya memiliki satu kategori setelah filter. Tidak bisa dilakukan split!")
}
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
data_filtered$weather_main <- as.integer(data_filtered$weather_main) - 1
is.ordered(data_filtered$weather_main)
## [1] FALSE
str(data_filtered)
## tibble [8,811 × 9] (S3: tbl_df/tbl/data.frame)
## $ temp : num [1:8811] 27.1 27.8 29.5 30.1 30.1 ...
## $ temp_min : num [1:8811] 27 27.4 29 30 30 30 30 29 29 29 ...
## $ temp_max : num [1:8811] 27.4 28 30.4 30.4 30.4 30 30 30 29.4 29.4 ...
## $ pressure : num [1:8811] 1010 1010 1010 1010 1009 ...
## $ humidity : num [1:8811] 94 88 83 79 79 79 74 79 83 79 ...
## $ wind_speed : num [1:8811] 0.5 2.1 3.1 4.1 4.1 5.1 4.6 5.7 5.7 5.7 ...
## $ wind_deg : num [1:8811] 0 240 240 260 260 250 260 260 260 270 ...
## $ clouds_all : num [1:8811] 40 20 20 20 20 20 20 20 20 20 ...
## $ weather_main: num [1:8811] 2 2 1 1 1 1 1 1 1 1 ...
unique(data_filtered$weather_main)
## [1] 2 1 0 3
data_filtered <- data_filtered %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
cat("Jumlah nilai NA setelah imputasi (dengan mean):\n")
## Jumlah nilai NA setelah imputasi (dengan mean):
print(colSums(is.na(data_filtered)))
## temp temp_min temp_max pressure humidity wind_speed
## 0 0 0 0 0 0
## wind_deg clouds_all weather_main
## 0 0 0
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
split_index <- createDataPartition(data_filtered$weather_main, p = 0.7, list = FALSE)
train_data <- data_filtered[split_index, ]
test_data <- data_filtered[-split_index, ]
cat("Jumlah data Train :", nrow(train_data), "\n")
## Jumlah data Train : 6170
cat("Jumlah data Test :", nrow(test_data), "\n")
## Jumlah data Test : 2641
bisa dilihat berdasarkan distribusi pada kolom target, terdapat ketidakseimbangan data maka dilakukan penangan imbalaced data. penganan ini dilakukan pada data train.
if (!require(smotefamily)) install.packages("smotefamily")
## Loading required package: smotefamily
## Warning: package 'smotefamily' was built under R version 4.4.3
library(smotefamily)
set.seed(123)
n_total_train <- nrow(train_data)
n_target_final <- n_total_train * 2
n_classes <- length(levels(train_data$weather_main))
n_per_class <- floor(n_target_final / n_classes)
list_per_class <- split(train_data, train_data$weather_main)
balanced_list <- list()
for (cls in names(list_per_class)) {
data_cls <- list_per_class[[cls]]
n_cls <- nrow(data_cls)
balanced_list[[cls]] <- if (n_cls > n_per_class) {
slice_sample(data_cls, n = n_per_class)
} else {
slice_sample(data_cls, n = n_per_class, replace = TRUE)
}
}
train_balanced <- bind_rows(balanced_list)
cat("Distribusi kelas setelah balancing di train_data:\n")
## Distribusi kelas setelah balancing di train_data:
table(train_balanced$weather_main)
##
## 0 1 2 3
## 3085 3085 3085 3085
cat("Total data train setelah balancing:", nrow(train_balanced), "\n")
## Total data train setelah balancing: 12340
cat("Distribusi kelas di test_data (tidak diubah):\n")
## Distribusi kelas di test_data (tidak diubah):
table(test_data$weather_main)
##
## 0 1 2 3
## 6 2422 171 42
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
temp_vars <- dplyr::select(train_balanced, temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars, scale. = TRUE)
pc_df <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc_df) <- "pca_temp"
train_balanced_pca <- train_balanced %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc_df)
lm_model <- lm(as.numeric(weather_main) ~ ., data = train_balanced_pca)
vif_values <- vif(lm_model)
vif_values
## pressure humidity wind_speed wind_deg clouds_all pca_temp
## 1.289274 2.343378 1.192243 1.159043 1.304914 2.161305
train_balanced_pca$weather_main <- as.factor(train_balanced_pca$weather_main)
num_vars <- names(train_balanced_pca)[sapply(train_balanced_pca, is.numeric)]
chi_sq_test <- function(varname, data, target) {
binned_var <- cut(data[[varname]], breaks = 5, include.lowest = TRUE)
contingency_table <- table(binned_var, data[[target]])
if (all(dim(contingency_table) > 1)) {
chisq.test(contingency_table)$p.value
} else NA
}
chi_results <- sapply(num_vars, chi_sq_test, data = train_balanced_pca, target = "weather_main")
chi_df <- data.frame(Variable = names(chi_results), P_Value = chi_results)
chi_df
## Variable P_Value
## pressure pressure 4.619129e-217
## humidity humidity 0.000000e+00
## wind_speed wind_speed 1.227488e-244
## wind_deg wind_deg 0.000000e+00
## clouds_all clouds_all 0.000000e+00
## pca_temp pca_temp 4.898088e-155
feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")
min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))
train_balanced_norm <- train_balanced
train_balanced_norm[feature_cols] <- lapply(train_balanced[feature_cols], min_max_norm)
test_data_norm <- test_data
for (col in feature_cols) {
min_val <- min(train_balanced[[col]])
max_val <- max(train_balanced[[col]])
test_data_norm[[col]] <- (test_data[[col]] - min_val) / (max_val - min_val)
}
summary(train_balanced_norm)
## temp temp_min temp_max pressure
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3414 1st Qu.:0.3895 1st Qu.:0.3333 1st Qu.:0.2500
## Median :0.4338 Median :0.4900 Median :0.4333 Median :0.4167
## Mean :0.4528 Mean :0.4938 Mean :0.4635 Mean :0.4177
## 3rd Qu.:0.5479 3rd Qu.:0.5750 3rd Qu.:0.5833 3rd Qu.:0.5833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## humidity wind_speed wind_deg clouds_all
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5625 1st Qu.:0.1240 1st Qu.:0.2778 1st Qu.:0.1450
## Median :0.6458 Median :0.2403 Median :0.3889 Median :0.4000
## Mean :0.6741 Mean :0.2614 Mean :0.4789 Mean :0.3484
## 3rd Qu.:0.8750 3rd Qu.:0.3566 3rd Qu.:0.7500 3rd Qu.:0.7500
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## weather_main
## 0:3085
## 1:3085
## 2:3085
## 3:3085
##
##
summary(test_data_norm)
## temp temp_min temp_max pressure
## Min. :-0.07744 Min. :-0.04328 Min. :-0.08333 Min. :0.0000
## 1st Qu.: 0.34555 1st Qu.: 0.41267 1st Qu.: 0.33333 1st Qu.:0.3333
## Median : 0.45629 Median : 0.49768 Median : 0.49167 Median :0.5000
## Mean : 0.46968 Mean : 0.51270 Mean : 0.47693 Mean :0.4796
## 3rd Qu.: 0.58701 3rd Qu.: 0.62133 3rd Qu.: 0.58333 3rd Qu.:0.5833
## Max. : 0.99167 Max. : 0.97682 Max. : 1.03333 Max. :0.9167
## humidity wind_speed wind_deg clouds_all
## Min. :0.0625 Min. :-0.0007752 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.4792 1st Qu.: 0.1240310 1st Qu.:0.2778 1st Qu.:0.2000
## Median :0.6458 Median : 0.2403101 Median :0.3333 Median :0.2000
## Mean :0.6300 Mean : 0.2635606 Mean :0.4253 Mean :0.3108
## 3rd Qu.:0.7500 3rd Qu.: 0.3565891 3rd Qu.:0.6944 3rd Qu.:0.4000
## Max. :1.0000 Max. : 1.0387597 Max. :1.0000 Max. :1.0000
## weather_main
## 0: 6
## 1:2422
## 2: 171
## 3: 42
##
##
library(ggplot2)
library(dplyr)
train_weather_summary_1 <- train_data %>%
group_by(weather_main) %>%
summarise(Jumlah = n()) %>%
arrange(desc(Jumlah))
train_weather_summary_1$Proporsi <- train_weather_summary_1$Jumlah / sum(train_weather_summary_1$Jumlah)
ggplot(train_weather_summary_1, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(title = "Distribusi Kategori Cuaca pada Data Train (Sebelum Balanced)",
x = "Kategori Cuaca", y = "Jumlah")
library(ggplot2)
library(dplyr)
train_weather_summary <- train_balanced %>%
group_by(weather_main) %>%
summarise(Jumlah = n()) %>%
arrange(desc(Jumlah))
train_weather_summary$Proporsi <- train_weather_summary$Jumlah / sum(train_weather_summary$Jumlah)
ggplot(train_weather_summary, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
geom_bar(stat = "identity") +
theme_minimal() +
coord_flip() +
labs(title = "Distribusi Kategori Cuaca pada Data Train (Balanced)",
x = "Kategori Cuaca", y = "Jumlah")
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
correlation_matrix <- cor(train_balanced_norm[, feature_cols])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black",
addCoef.col = "black", number.cex = 0.7, diag = FALSE)
karena fitur temp, temp_min, & temp_max mempunyai korelasi yang sangat tinggi, maka saya lakukan PCA kepada ketiga fitur menjadi 1 fitur (pca_temp).
library(nnet)
temp_vars_train <- train_balanced_norm %>% dplyr::select(temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars_train, scale. = TRUE)
pc1_train <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc1_train) <- "pca_temp"
train_mlr <- train_balanced_norm %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc1_train)
model_mlr <- multinom(weather_main ~ ., data = train_mlr, trace = FALSE)
temp_vars_test <- test_data_norm %>% dplyr::select(temp, temp_min, temp_max)
pc1_test_mat <- predict(pca_temp, newdata = temp_vars_test)[, 1, drop = FALSE]
pc1_test_df <- as.data.frame(pc1_test_mat)
colnames(pc1_test_df) <- "pca_temp"
test_mlr <- test_data_norm %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc1_test_df)
test_mlr$weather_main <- as.factor(test_mlr$weather_main)
pred_mlr <- predict(model_mlr, test_mlr)
conf_matrix_mlr <- confusionMatrix(pred_mlr, test_mlr$weather_main)
conf_matrix_mlr
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3
## 0 6 0 1 0
## 1 0 1886 38 9
## 2 0 258 81 16
## 3 0 278 51 17
##
## Overall Statistics
##
## Accuracy : 0.7535
## 95% CI : (0.7366, 0.7698)
## No Information Rate : 0.9171
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2248
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity 1.000000 0.7787 0.47368 0.404762
## Specificity 0.999620 0.7854 0.88907 0.873413
## Pos Pred Value 0.857143 0.9757 0.22817 0.049133
## Neg Pred Value 1.000000 0.2429 0.96063 0.989107
## Prevalence 0.002272 0.9171 0.06475 0.015903
## Detection Rate 0.002272 0.7141 0.03067 0.006437
## Detection Prevalence 0.002651 0.7319 0.13442 0.131011
## Balanced Accuracy 0.999810 0.7820 0.68138 0.639087
library(ggplot2)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
library(corrplot)
cor_matrix <- cor(train_mlr %>% dplyr::select(where(is.numeric)))
cor_df <- melt(cor_matrix)
ggplot(cor_df, aes(x = Var1, y = Var2, fill = value)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name = "Korelasi") +
theme_minimal() +
labs(title = "Heatmap Korelasi Setelah PCA Suhu") +
theme(axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1))
corrplot(cor_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45, addCoef.col = "black",
title = "Korelasi Fitur Setelah PCA Suhu", mar=c(0,0,1,0))
class_accuracy <- diag(conf_matrix_mlr$table) / rowSums(conf_matrix_mlr$table)
class_accuracy_df <- data.frame(Kelas = names(class_accuracy),
Akurasi = round(class_accuracy, 3))
ggplot(class_accuracy_df, aes(x = reorder(Kelas, -Akurasi), y = Akurasi, fill = Kelas)) +
geom_bar(stat = "identity") +
geom_text(aes(label = Akurasi), vjust = -0.3, size = 3.5) +
theme_minimal() +
labs(title = "Akurasi per Kelas Cuaca",
x = "Kategori Cuaca", y = "Akurasi") +
ylim(0, 1)
Grafik menunjukkan bahwa model memiliki akurasi tinggi pada kelas cuaca
1 (97,7%) dan 0 (85,7%), namun sangat rendah pada kelas 2 (21,4%) dan 3
(7,8%). Hal ini mengindikasikan bahwa model cenderung hanya mampu
mengenali cuaca yang umum atau dominan, sementara gagal membedakan kelas
cuaca yang lebih jarang.
ggplot(train_mlr, aes(x = weather_main, y = pca_temp, fill = weather_main)) +
geom_boxplot() +
labs(title = "Distribusi PCA Suhu berdasarkan Weather",
x = "Weather Main", y = "PCA Temp (Komponen 1)") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "none")
Boxplot di atas menunjukkan distribusi nilai PCA dari suhu (gabungan
temp, temp_min, dan temp_max) berdasarkan kategori cuaca utama
(weather_main). Terlihat bahwa nilai tengah (median) untuk masing-masing
kategori cuaca berada di sekitar nol, menandakan tidak ada perbedaan
ekstrem antar kelompok.
library(ggplot2)
library(caret)
library(reshape2)
cm_df <- as.data.frame(conf_matrix_mlr$table)
ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
geom_tile(color = "white") +
geom_text(aes(label = Freq), size = 4) +
scale_fill_gradient(low = "white", high = "steelblue") +
labs(title = "Confusion Matrix Multinomial Logistic Regression",
x = "Actual (Reference)",
y = "Predicted") +
theme_minimal()
test_features_pca <- test_mlr %>% dplyr::select(-weather_main)
dist_matrix_pca <- dist(test_features_pca)
mds_coords_pca <- cmdscale(dist_matrix_pca, k = 2) %>% as.data.frame()
colnames(mds_coords_pca) <- c("Dim1", "Dim2")
mds_coords_pca$Predicted <- pred_mlr
mds_coords_pca$Actual <- test_mlr$weather_main
ggplot(mds_coords_pca, aes(x = Dim1, y = Dim2, color = Predicted, shape = Actual)) +
geom_point(alpha = 0.7, size = 3) +
labs(title = "MDS Plot - MLR dengan PCA pada Suhu",
x = "Dimensi 1", y = "Dimensi 2") +
theme_minimal()
Plot ini menunjukkan visualisasi data cuaca hasil reduksi dimensi
menggunakan MDS berdasarkan hasil PCA suhu dan prediksi model MLR.
erlihat bahwa sebagian besar data terkonsentrasi di tengah, terutama
label dengan kode 1 (warna hijau), yang menunjukkan bahwa model lebih
sering memprediksi kelas tersebut dibandingkan yang lain.