Load Library dan
Data
library(readr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
data_path <- "D:/TUGAS KAMPUS SEMESTER 4/ANALISIS MULTIVARIAT/PROYEK AKHIR ANMUL/openweatherdata-denpasar-1990-2020.csv"
data <- read_csv(data_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 264924 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): dt_iso, city_name, weather_main, weather_description, weather_icon
## dbl (17): dt, timezone, lat, lon, temp, temp_min, temp_max, pressure, humidi...
## lgl (10): sea_level, grnd_level, rain_12h, rain_today, snow_1h, snow_3h, sno...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$dt_iso <- gsub(" UTC", "", data$dt_iso)
data$dt_iso <- as_datetime(data$dt_iso)
data_filtered <- data %>% filter(year(dt_iso) == 2019)
if (nrow(data_filtered) == 0) stop("Tidak ada data untuk tahun 2019!")
head(data_filtered)
## # A tibble: 6 × 32
## dt dt_iso timezone city_name lat lon temp temp_min
## <dbl> <dttm> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1546300800 2019-01-01 00:00:00 28800 Denpasar -8.65 115. 27.1 27
## 2 1546304400 2019-01-01 01:00:00 28800 Denpasar -8.65 115. 27.8 27.4
## 3 1546308000 2019-01-01 02:00:00 28800 Denpasar -8.65 115. 29.5 29
## 4 1546311600 2019-01-01 03:00:00 28800 Denpasar -8.65 115. 30.1 30
## 5 1546315200 2019-01-01 04:00:00 28800 Denpasar -8.65 115. 30.2 30
## 6 1546318800 2019-01-01 05:00:00 28800 Denpasar -8.65 115. 30 30
## # ℹ 24 more variables: temp_max <dbl>, pressure <dbl>, sea_level <lgl>,
## # grnd_level <lgl>, humidity <dbl>, wind_speed <dbl>, wind_deg <dbl>,
## # rain_1h <dbl>, rain_3h <dbl>, rain_6h <dbl>, rain_12h <lgl>,
## # rain_24h <dbl>, rain_today <lgl>, snow_1h <lgl>, snow_3h <lgl>,
## # snow_6h <lgl>, snow_12h <lgl>, snow_24h <lgl>, snow_today <lgl>,
## # clouds_all <dbl>, weather_id <dbl>, weather_main <chr>,
## # weather_description <chr>, weather_icon <chr>
EXPLORATORY DATA
ANALYSIS
cat("Jumlah baris:", nrow(data_filtered), "\n")
## Jumlah baris: 8822
cat("Jumlah kolom:", ncol(data_filtered), "\n")
## Jumlah kolom: 32
na_summary <- data.frame(Kolom = names(data_filtered),
Jumlah_NA = colSums(is.na(data_filtered))) %>%
arrange(desc(Jumlah_NA))
na_summary
## Kolom Jumlah_NA
## sea_level sea_level 8822
## grnd_level grnd_level 8822
## rain_12h rain_12h 8822
## rain_today rain_today 8822
## snow_1h snow_1h 8822
## snow_3h snow_3h 8822
## snow_6h snow_6h 8822
## snow_12h snow_12h 8822
## snow_24h snow_24h 8822
## snow_today snow_today 8822
## rain_1h rain_1h 8813
## rain_24h rain_24h 8543
## rain_6h rain_6h 8541
## rain_3h rain_3h 8275
## dt dt 0
## dt_iso dt_iso 0
## timezone timezone 0
## city_name city_name 0
## lat lat 0
## lon lon 0
## temp temp 0
## temp_min temp_min 0
## temp_max temp_max 0
## pressure pressure 0
## humidity humidity 0
## wind_speed wind_speed 0
## wind_deg wind_deg 0
## clouds_all clouds_all 0
## weather_id weather_id 0
## weather_main weather_main 0
## weather_description weather_description 0
## weather_icon weather_icon 0
freq_weather <- table(data_filtered$weather_main, useNA = "ifany")
weather_summary <- as.data.frame(freq_weather)
colnames(weather_summary) <- c("Weather Main", "Jumlah")
weather_summary
## Weather Main Jumlah
## 1 Clear 22
## 2 Clouds 8075
## 3 Haze 5
## 4 Mist 3
## 5 Rain 571
## 6 Smoke 3
## 7 Thunderstorm 143
PREPROCESSING
data_filtered <- data_filtered %>%
select(temp, temp_min, temp_max, pressure, humidity,
wind_speed, wind_deg, clouds_all, weather_main) %>%
filter(weather_main %in% c("Clouds", "Rain", "Thunderstorm", "Clear"))
if (length(unique(data_filtered$weather_main)) < 2) {
stop("Kolom weather_main hanya memiliki satu kategori setelah filter. Tidak bisa dilakukan split!")
}
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
data_filtered$weather_main <- as.integer(data_filtered$weather_main) - 1
is.ordered(data_filtered$weather_main)
## [1] FALSE
str(data_filtered)
## tibble [8,811 × 9] (S3: tbl_df/tbl/data.frame)
## $ temp : num [1:8811] 27.1 27.8 29.5 30.1 30.1 ...
## $ temp_min : num [1:8811] 27 27.4 29 30 30 30 30 29 29 29 ...
## $ temp_max : num [1:8811] 27.4 28 30.4 30.4 30.4 30 30 30 29.4 29.4 ...
## $ pressure : num [1:8811] 1010 1010 1010 1010 1009 ...
## $ humidity : num [1:8811] 94 88 83 79 79 79 74 79 83 79 ...
## $ wind_speed : num [1:8811] 0.5 2.1 3.1 4.1 4.1 5.1 4.6 5.7 5.7 5.7 ...
## $ wind_deg : num [1:8811] 0 240 240 260 260 250 260 260 260 270 ...
## $ clouds_all : num [1:8811] 40 20 20 20 20 20 20 20 20 20 ...
## $ weather_main: num [1:8811] 2 2 1 1 1 1 1 1 1 1 ...
unique(data_filtered$weather_main)
## [1] 2 1 0 3
Split Data
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
split_index <- createDataPartition(data_filtered$weather_main, p = 0.7, list = FALSE)
train_data <- data_filtered[split_index, ]
test_data <- data_filtered[-split_index, ]
Penyeimbangan Data
if (!require(smotefamily)) install.packages("smotefamily")
## Loading required package: smotefamily
## Warning: package 'smotefamily' was built under R version 4.4.3
library(smotefamily)
set.seed(123)
n_total_train <- nrow(train_data)
n_target_final <- n_total_train * 2
n_classes <- length(levels(train_data$weather_main))
n_per_class <- floor(n_target_final / n_classes)
list_per_class <- split(train_data, train_data$weather_main)
balanced_list <- list()
for (cls in names(list_per_class)) {
data_cls <- list_per_class[[cls]]
n_cls <- nrow(data_cls)
balanced_list[[cls]] <- if (n_cls > n_per_class) {
slice_sample(data_cls, n = n_per_class)
} else {
slice_sample(data_cls, n = n_per_class, replace = TRUE)
}
}
train_balanced <- bind_rows(balanced_list)
cat("Distribusi kelas setelah balancing di train_data:\n")
## Distribusi kelas setelah balancing di train_data:
table(train_balanced$weather_main)
##
## 0 1 2 3
## 3085 3085 3085 3085
cat("Total data train setelah balancing:", nrow(train_balanced), "\n")
## Total data train setelah balancing: 12340
cat("Distribusi kelas di test_data (tidak diubah):\n")
## Distribusi kelas di test_data (tidak diubah):
table(test_data$weather_main)
##
## 0 1 2 3
## 6 2422 171 42
Uji Asumsi (VIF dan
Chi-square)
Uji VIF
library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
temp_vars <- dplyr::select(train_balanced, temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars, scale. = TRUE)
pc_df <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc_df) <- "pca_temp"
train_balanced_pca <- train_balanced %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc_df)
lm_model <- lm(as.numeric(weather_main) ~ ., data = train_balanced_pca)
vif_values <- vif(lm_model)
vif_values
## pressure humidity wind_speed wind_deg clouds_all pca_temp
## 1.373099 2.356137 1.193936 1.129373 1.289307 2.299876
Uji Chi-Square
train_balanced_pca$weather_main <- as.factor(train_balanced_pca$weather_main)
num_vars <- names(train_balanced_pca)[sapply(train_balanced_pca, is.numeric)]
chi_sq_test <- function(varname, data, target) {
binned_var <- cut(data[[varname]], breaks = 5, include.lowest = TRUE)
contingency_table <- table(binned_var, data[[target]])
if (all(dim(contingency_table) > 1)) {
chisq.test(contingency_table)$p.value
} else NA
}
chi_results <- sapply(num_vars, chi_sq_test, data = train_balanced_pca, target = "weather_main")
chi_df <- data.frame(Variable = names(chi_results), P_Value = chi_results)
chi_df
## Variable P_Value
## pressure pressure 1.513836e-210
## humidity humidity 0.000000e+00
## wind_speed wind_speed 4.301578e-318
## wind_deg wind_deg 0.000000e+00
## clouds_all clouds_all 0.000000e+00
## pca_temp pca_temp 3.883452e-151
Normalisasi Data
feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")
min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))
train_balanced_norm <- train_balanced
train_balanced_norm[feature_cols] <- lapply(train_balanced[feature_cols], min_max_norm)
test_data_norm <- test_data
for (col in feature_cols) {
min_val <- min(train_balanced[[col]])
max_val <- max(train_balanced[[col]])
test_data_norm[[col]] <- (test_data[[col]] - min_val) / (max_val - min_val)
}
summary(train_balanced_norm)
## temp temp_min temp_max pressure
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3384 1st Qu.:0.4167 1st Qu.:0.3333 1st Qu.:0.2583
## Median :0.4316 Median :0.5152 Median :0.4250 Median :0.4167
## Mean :0.4495 Mean :0.5222 Mean :0.4534 Mean :0.4236
## 3rd Qu.:0.5273 3rd Qu.:0.6061 3rd Qu.:0.5500 3rd Qu.:0.5833
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## humidity wind_speed wind_deg clouds_all
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5625 1st Qu.:0.1201 1st Qu.:0.2778 1st Qu.:0.1000
## Median :0.6875 Median :0.2319 Median :0.3833 Median :0.4000
## Mean :0.6750 Mean :0.2464 Mean :0.4780 Mean :0.3487
## 3rd Qu.:0.8750 3rd Qu.:0.3438 3rd Qu.:0.7500 3rd Qu.:0.6000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## weather_main
## 0:3085
## 1:3085
## 2:3085
## 3:3085
##
##
summary(test_data_norm)
## temp temp_min temp_max pressure
## Min. :-0.07809 Min. :0.04242 Min. :-0.08333 Min. :0.0000
## 1st Qu.: 0.34761 1st Qu.:0.43939 1st Qu.: 0.33333 1st Qu.:0.3333
## Median : 0.45676 Median :0.53030 Median : 0.48333 Median :0.5000
## Mean : 0.47575 Mean :0.54638 Mean : 0.47962 Mean :0.4833
## 3rd Qu.: 0.59782 3rd Qu.:0.66667 3rd Qu.: 0.58333 3rd Qu.:0.5833
## Max. : 1.03359 Max. :1.06061 Max. : 1.03333 Max. :0.9167
## humidity wind_speed wind_deg clouds_all
## Min. :0.0625 Min. :-0.005966 Min. :0.0000 Min. :0.0100
## 1st Qu.:0.4583 1st Qu.: 0.120060 1st Qu.:0.2778 1st Qu.:0.2000
## Median :0.6458 Median : 0.231916 Median :0.3333 Median :0.2000
## Mean :0.6260 Mean : 0.257245 Mean :0.4262 Mean :0.3119
## 3rd Qu.:0.7500 3rd Qu.: 0.388516 3rd Qu.:0.6944 3rd Qu.:0.4000
## Max. :1.0000 Max. : 0.888143 Max. :1.0000 Max. :1.0000
## weather_main
## 0: 6
## 1:2422
## 2: 171
## 3: 42
##
##
Visualisasi
Korelasi
library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
correlation_matrix <- cor(train_balanced_norm[, feature_cols])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black",
addCoef.col = "black", number.cex = 0.7, diag = FALSE)

PEMODELAN - Multinomial
Logistic Regression + PCA
karena fitur temp,
temp_min, & temp_max mempunyai korelasi yang sangat tinggi, maka
saya lakukan PCA kepada ketiga fitur menjadi 1 fitur (pca_temp).
library(nnet)
temp_vars_train <- train_balanced_norm %>% dplyr::select(temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars_train, scale. = TRUE)
pc1_train <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc1_train) <- "pca_temp"
train_mlr <- train_balanced_norm %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc1_train)
model_mlr <- multinom(weather_main ~ ., data = train_mlr, trace = FALSE)
temp_vars_test <- test_data_norm %>% dplyr::select(temp, temp_min, temp_max)
pc1_test_mat <- predict(pca_temp, newdata = temp_vars_test)[, 1, drop = FALSE]
pc1_test_df <- as.data.frame(pc1_test_mat)
colnames(pc1_test_df) <- "pca_temp"
test_mlr <- test_data_norm %>%
dplyr::select(-temp, -temp_min, -temp_max) %>%
bind_cols(pc1_test_df)
test_mlr$weather_main <- as.factor(test_mlr$weather_main)
pred_mlr <- predict(model_mlr, test_mlr)
conf_matrix_mlr <- confusionMatrix(pred_mlr, test_mlr$weather_main)
conf_matrix_mlr
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3
## 0 6 8 0 0
## 1 0 1917 41 6
## 2 0 251 79 14
## 3 0 246 51 22
##
## Overall Statistics
##
## Accuracy : 0.7664
## 95% CI : (0.7498, 0.7824)
## No Information Rate : 0.9171
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.2406
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity 1.000000 0.7915 0.46199 0.52381
## Specificity 0.996964 0.7854 0.89271 0.88573
## Pos Pred Value 0.428571 0.9761 0.22965 0.06897
## Neg Pred Value 1.000000 0.2541 0.95995 0.99139
## Prevalence 0.002272 0.9171 0.06475 0.01590
## Detection Rate 0.002272 0.7259 0.02991 0.00833
## Detection Prevalence 0.005301 0.7437 0.13025 0.12079
## Balanced Accuracy 0.998482 0.7884 0.67735 0.70477