LINK PUBLIKASI RPUBS : https://rpubs.com/RezaUnesaSada23/RegresiMultinomialWithPCA

1 LOAD DATA

Dataset yang digunakan dalam proyek ini berasal dari Kaggle dengan judul “Denpasar Bali Historical Weather Data”, yang berisi data cuaca historis Kota Denpasar dari tahun 1990 hingga 2020. Namun, untuk kebutuhan proyek ini, fokus klasifikasi diarahkan pada empat kategori utama yang relevan dengan kondisi cuaca Denpasar tahun 2019, yaitu Clouds, Rain, Thunderstorm, dan Clear.

library(readr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
data_path <- "D:/TUGAS KAMPUS SEMESTER 4/ANALISIS MULTIVARIAT/PROYEK AKHIR ANMUL/openweatherdata-denpasar-1990-2020.csv"
data <- read_csv(data_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 264924 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): dt_iso, city_name, weather_main, weather_description, weather_icon
## dbl (17): dt, timezone, lat, lon, temp, temp_min, temp_max, pressure, humidi...
## lgl (10): sea_level, grnd_level, rain_12h, rain_today, snow_1h, snow_3h, sno...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$dt_iso <- gsub(" UTC", "", data$dt_iso)
data$dt_iso <- as_datetime(data$dt_iso)
data_filtered <- data %>% filter(year(dt_iso) == 2019)

if (nrow(data_filtered) == 0) stop("Tidak ada data untuk tahun 2019!")

head(data_filtered)
## # A tibble: 6 × 32
##           dt dt_iso              timezone city_name   lat   lon  temp temp_min
##        <dbl> <dttm>                 <dbl> <chr>     <dbl> <dbl> <dbl>    <dbl>
## 1 1546300800 2019-01-01 00:00:00    28800 Denpasar  -8.65  115.  27.1     27  
## 2 1546304400 2019-01-01 01:00:00    28800 Denpasar  -8.65  115.  27.8     27.4
## 3 1546308000 2019-01-01 02:00:00    28800 Denpasar  -8.65  115.  29.5     29  
## 4 1546311600 2019-01-01 03:00:00    28800 Denpasar  -8.65  115.  30.1     30  
## 5 1546315200 2019-01-01 04:00:00    28800 Denpasar  -8.65  115.  30.2     30  
## 6 1546318800 2019-01-01 05:00:00    28800 Denpasar  -8.65  115.  30       30  
## # ℹ 24 more variables: temp_max <dbl>, pressure <dbl>, sea_level <lgl>,
## #   grnd_level <lgl>, humidity <dbl>, wind_speed <dbl>, wind_deg <dbl>,
## #   rain_1h <dbl>, rain_3h <dbl>, rain_6h <dbl>, rain_12h <lgl>,
## #   rain_24h <dbl>, rain_today <lgl>, snow_1h <lgl>, snow_3h <lgl>,
## #   snow_6h <lgl>, snow_12h <lgl>, snow_24h <lgl>, snow_today <lgl>,
## #   clouds_all <dbl>, weather_id <dbl>, weather_main <chr>,
## #   weather_description <chr>, weather_icon <chr>

2 EXPLORATORY DATA ANALYSIS

cat("Jumlah baris:", nrow(data_filtered), "\n")
## Jumlah baris: 8822
cat("Jumlah kolom:", ncol(data_filtered), "\n")
## Jumlah kolom: 32
na_summary <- data.frame(Kolom = names(data_filtered),
                         Jumlah_NA = colSums(is.na(data_filtered))) %>%
  arrange(desc(Jumlah_NA))
na_summary
##                                   Kolom Jumlah_NA
## sea_level                     sea_level      8822
## grnd_level                   grnd_level      8822
## rain_12h                       rain_12h      8822
## rain_today                   rain_today      8822
## snow_1h                         snow_1h      8822
## snow_3h                         snow_3h      8822
## snow_6h                         snow_6h      8822
## snow_12h                       snow_12h      8822
## snow_24h                       snow_24h      8822
## snow_today                   snow_today      8822
## rain_1h                         rain_1h      8813
## rain_24h                       rain_24h      8543
## rain_6h                         rain_6h      8541
## rain_3h                         rain_3h      8275
## dt                                   dt         0
## dt_iso                           dt_iso         0
## timezone                       timezone         0
## city_name                     city_name         0
## lat                                 lat         0
## lon                                 lon         0
## temp                               temp         0
## temp_min                       temp_min         0
## temp_max                       temp_max         0
## pressure                       pressure         0
## humidity                       humidity         0
## wind_speed                   wind_speed         0
## wind_deg                       wind_deg         0
## clouds_all                   clouds_all         0
## weather_id                   weather_id         0
## weather_main               weather_main         0
## weather_description weather_description         0
## weather_icon               weather_icon         0
freq_weather <- table(data_filtered$weather_main, useNA = "ifany")
weather_summary <- as.data.frame(freq_weather)
colnames(weather_summary) <- c("Weather Main", "Jumlah")
weather_summary
##   Weather Main Jumlah
## 1        Clear     22
## 2       Clouds   8075
## 3         Haze      5
## 4         Mist      3
## 5         Rain    571
## 6        Smoke      3
## 7 Thunderstorm    143

3 PREPROCESSING

data_filtered <- data_filtered %>%
  select(temp, temp_min, temp_max, pressure, humidity,
         wind_speed, wind_deg, clouds_all, weather_main) %>%
  filter(weather_main %in% c("Clouds", "Rain", "Thunderstorm", "Clear"))

if (length(unique(data_filtered$weather_main)) < 2) {
  stop("Kolom weather_main hanya memiliki satu kategori setelah filter. Tidak bisa dilakukan split!")
}

data_filtered$weather_main <- as.factor(data_filtered$weather_main)
data_filtered$weather_main <- as.integer(data_filtered$weather_main) - 1

is.ordered(data_filtered$weather_main)
## [1] FALSE
str(data_filtered)
## tibble [8,811 × 9] (S3: tbl_df/tbl/data.frame)
##  $ temp        : num [1:8811] 27.1 27.8 29.5 30.1 30.1 ...
##  $ temp_min    : num [1:8811] 27 27.4 29 30 30 30 30 29 29 29 ...
##  $ temp_max    : num [1:8811] 27.4 28 30.4 30.4 30.4 30 30 30 29.4 29.4 ...
##  $ pressure    : num [1:8811] 1010 1010 1010 1010 1009 ...
##  $ humidity    : num [1:8811] 94 88 83 79 79 79 74 79 83 79 ...
##  $ wind_speed  : num [1:8811] 0.5 2.1 3.1 4.1 4.1 5.1 4.6 5.7 5.7 5.7 ...
##  $ wind_deg    : num [1:8811] 0 240 240 260 260 250 260 260 260 270 ...
##  $ clouds_all  : num [1:8811] 40 20 20 20 20 20 20 20 20 20 ...
##  $ weather_main: num [1:8811] 2 2 1 1 1 1 1 1 1 1 ...
unique(data_filtered$weather_main)
## [1] 2 1 0 3

3.1 Split Data

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
split_index <- createDataPartition(data_filtered$weather_main, p = 0.7, list = FALSE)
train_data <- data_filtered[split_index, ]
test_data <- data_filtered[-split_index, ]
cat("Jumlah data Train :", nrow(train_data), "\n")
## Jumlah data Train : 6170
cat("Jumlah data Test  :", nrow(test_data), "\n")
## Jumlah data Test  : 2641

3.2 Penyeimbangan Data

bisa dilihat berdasarkan distrubusi pada kolom target, terdapat ketidakseimbangan data maka dilakukan penangan imbalaced data. penganan ini dilakukan pada data train.

if (!require(smotefamily)) install.packages("smotefamily")
## Loading required package: smotefamily
## Warning: package 'smotefamily' was built under R version 4.4.3
library(smotefamily)

set.seed(123)
n_total_train <- nrow(train_data)
n_target_final <- n_total_train * 2
n_classes <- length(levels(train_data$weather_main))
n_per_class <- floor(n_target_final / n_classes)

list_per_class <- split(train_data, train_data$weather_main)
balanced_list <- list()

for (cls in names(list_per_class)) {
  data_cls <- list_per_class[[cls]]
  n_cls <- nrow(data_cls)
  balanced_list[[cls]] <- if (n_cls > n_per_class) {
    slice_sample(data_cls, n = n_per_class)
  } else {
    slice_sample(data_cls, n = n_per_class, replace = TRUE)
  }
}
train_balanced <- bind_rows(balanced_list)

cat("Distribusi kelas setelah balancing di train_data:\n")
## Distribusi kelas setelah balancing di train_data:
table(train_balanced$weather_main)
## 
##    0    1    2    3 
## 3085 3085 3085 3085
cat("Total data train setelah balancing:", nrow(train_balanced), "\n")
## Total data train setelah balancing: 12340
cat("Distribusi kelas di test_data (tidak diubah):\n")
## Distribusi kelas di test_data (tidak diubah):
table(test_data$weather_main)
## 
##    0    1    2    3 
##    6 2422  171   42

4 Uji Asumsi (VIF dan Chi-square)

4.1 Uji VIF

library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
temp_vars <- dplyr::select(train_balanced, temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars, scale. = TRUE)
pc_df <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc_df) <- "pca_temp"

train_balanced_pca <- train_balanced %>%
  dplyr::select(-temp, -temp_min, -temp_max) %>%
  bind_cols(pc_df)

lm_model <- lm(as.numeric(weather_main) ~ ., data = train_balanced_pca)
vif_values <- vif(lm_model)
vif_values
##   pressure   humidity wind_speed   wind_deg clouds_all   pca_temp 
##   1.260702   2.715875   1.195759   1.116160   1.341947   2.465386

4.2 Uji Chi-Square

train_balanced_pca$weather_main <- as.factor(train_balanced_pca$weather_main)
num_vars <- names(train_balanced_pca)[sapply(train_balanced_pca, is.numeric)]
chi_sq_test <- function(varname, data, target) {
  binned_var <- cut(data[[varname]], breaks = 5, include.lowest = TRUE)
  contingency_table <- table(binned_var, data[[target]])
  if (all(dim(contingency_table) > 1)) {
    chisq.test(contingency_table)$p.value
  } else NA
}

chi_results <- sapply(num_vars, chi_sq_test, data = train_balanced_pca, target = "weather_main")
chi_df <- data.frame(Variable = names(chi_results), P_Value = chi_results)
chi_df
##              Variable       P_Value
## pressure     pressure 4.916433e-308
## humidity     humidity  0.000000e+00
## wind_speed wind_speed 4.055834e-221
## wind_deg     wind_deg  0.000000e+00
## clouds_all clouds_all  0.000000e+00
## pca_temp     pca_temp 6.137522e-146

5 NORMALISASI

feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")
min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))

train_balanced_norm <- train_balanced
train_balanced_norm[feature_cols] <- lapply(train_balanced[feature_cols], min_max_norm)

test_data_norm <- test_data
for (col in feature_cols) {
  min_val <- min(train_balanced[[col]])
  max_val <- max(train_balanced[[col]])
  test_data_norm[[col]] <- (test_data[[col]] - min_val) / (max_val - min_val)
}

summary(train_balanced_norm)
##       temp           temp_min         temp_max         pressure     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3875   1st Qu.:0.3824   1st Qu.:0.3923   1st Qu.:0.2818  
##  Median :0.4760   Median :0.4792   Median :0.4923   Median :0.4545  
##  Mean   :0.4919   Mean   :0.4819   Mean   :0.5089   Mean   :0.4520  
##  3rd Qu.:0.5683   3rd Qu.:0.5536   3rd Qu.:0.6154   3rd Qu.:0.6364  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     humidity        wind_speed        wind_deg        clouds_all    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5333   1st Qu.:0.1201   1st Qu.:0.2778   1st Qu.:0.0700  
##  Median :0.6222   Median :0.2319   Median :0.3889   Median :0.4000  
##  Mean   :0.6381   Mean   :0.2411   Mean   :0.4789   Mean   :0.3484  
##  3rd Qu.:0.7778   3rd Qu.:0.3438   3rd Qu.:0.7500   3rd Qu.:0.7500  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  weather_main
##  0:3085      
##  1:3085      
##  2:3085      
##  3:3085      
##              
## 
summary(test_data_norm)
##       temp              temp_min           temp_max         pressure     
##  Min.   :0.0007628   Min.   :-0.04167   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3836766   1st Qu.: 0.38244   1st Qu.:0.3846   1st Qu.:0.3636  
##  Median :0.4774981   Median : 0.47917   Median :0.5000   Median :0.5455  
##  Mean   :0.4959084   Mean   : 0.48856   Mean   :0.5120   Mean   :0.5390  
##  3rd Qu.:0.6041190   3rd Qu.: 0.58333   3rd Qu.:0.6154   3rd Qu.:0.7273  
##  Max.   :1.0099161   Max.   : 1.00000   Max.   :1.0308   Max.   :1.0000  
##     humidity          wind_speed           wind_deg        clouds_all    
##  Min.   :-0.06667   Min.   :-0.005966   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.: 0.51111   1st Qu.: 0.120060   1st Qu.:0.2778   1st Qu.:0.2000  
##  Median : 0.62222   Median : 0.231916   Median :0.3333   Median :0.2000  
##  Mean   : 0.61262   Mean   : 0.255270   Mean   :0.4259   Mean   :0.3099  
##  3rd Qu.: 0.73333   3rd Qu.: 0.385533   3rd Qu.:0.6944   3rd Qu.:0.4000  
##  Max.   : 1.00000   Max.   : 0.925429   Max.   :1.0000   Max.   :1.0000  
##  weather_main
##  0:   6      
##  1:2422      
##  2: 171      
##  3:  42      
##              
## 

6 VISUALISASI AWAL

6.1 Visualiasi Proporsi Kategori Cuaca Sebelum Penangan Imbalanced

library(ggplot2)
weather_summary$proporsi <- weather_summary$Jumlah / sum(weather_summary$Jumlah)
ggplot(weather_summary, aes(x = reorder(`Weather Main`, -Jumlah), y = Jumlah, fill = `Weather Main`)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  coord_flip() +
  labs(title = "Distribusi Kategori Cuaca di Denpasar (2019)",
       x = "Kategori Cuaca", y = "Jumlah")

6.2 Visualiasi Proporsi Kategori Cuaca Setelah Penangan Imbalanced

library(ggplot2)
library(dplyr)
train_weather_summary <- train_balanced %>%
  group_by(weather_main) %>%
  summarise(Jumlah = n()) %>%
  arrange(desc(Jumlah))

train_weather_summary$Proporsi <- train_weather_summary$Jumlah / sum(train_weather_summary$Jumlah)

ggplot(train_weather_summary, aes(x = reorder(weather_main, -Jumlah), y = Jumlah, fill = weather_main)) +
  geom_bar(stat = "identity") +
  theme_minimal() +
  coord_flip() +
  labs(title = "Distribusi Kategori Cuaca pada Data Train (Balanced)",
       x = "Kategori Cuaca", y = "Jumlah")

6.3 Visualisasi Korelasi

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
correlation_matrix <- cor(train_balanced_norm[, feature_cols])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black", 
         addCoef.col = "black", number.cex = 0.7, diag = FALSE)

7 PEMODELAN - Multinomial Logistic Regression + PCA

karena fitur temp, temp_min, & temp_max mempunyai korelasi yang sangat tinggi, maka saya lakukan PCA kepada ketiga fitur menjadi 1 fitur (pca_temp).

library(nnet)

temp_vars_train <- train_balanced_norm %>% dplyr::select(temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars_train, scale. = TRUE)
pc1_train <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc1_train) <- "pca_temp"

train_mlr <- train_balanced_norm %>%
  dplyr::select(-temp, -temp_min, -temp_max) %>%
  bind_cols(pc1_train)

model_mlr <- multinom(weather_main ~ ., data = train_mlr, trace = FALSE)

temp_vars_test <- test_data_norm %>% dplyr::select(temp, temp_min, temp_max)
pc1_test_mat <- predict(pca_temp, newdata = temp_vars_test)[, 1, drop = FALSE]
pc1_test_df <- as.data.frame(pc1_test_mat)
colnames(pc1_test_df) <- "pca_temp"

test_mlr <- test_data_norm %>%
  dplyr::select(-temp, -temp_min, -temp_max) %>%
  bind_cols(pc1_test_df)
test_mlr$weather_main <- as.factor(test_mlr$weather_main)

pred_mlr <- predict(model_mlr, test_mlr)
conf_matrix_mlr <- confusionMatrix(pred_mlr, test_mlr$weather_main)

conf_matrix_mlr
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3
##          0    4    0    0    0
##          1    2 1897   34    6
##          2    0  248   84   11
##          3    0  277   53   25
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7611          
##                  95% CI : (0.7443, 0.7772)
##     No Information Rate : 0.9171          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2442          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity          0.666667   0.7832  0.49123 0.595238
## Specificity          1.000000   0.8082  0.89514 0.873028
## Pos Pred Value       1.000000   0.9783  0.24490 0.070423
## Neg Pred Value       0.999242   0.2521  0.96214 0.992563
## Prevalence           0.002272   0.9171  0.06475 0.015903
## Detection Rate       0.001515   0.7183  0.03181 0.009466
## Detection Prevalence 0.001515   0.7342  0.12988 0.134419
## Balanced Accuracy    0.833333   0.7957  0.69318 0.734133

8 VISUALIASI AKHIR

8.1 Visualiasi Akurasi Perkelas

class_accuracy <- diag(conf_matrix_mlr$table) / rowSums(conf_matrix_mlr$table)
class_accuracy_df <- data.frame(Kelas = names(class_accuracy),
                                Akurasi = round(class_accuracy, 3))

ggplot(class_accuracy_df, aes(x = reorder(Kelas, -Akurasi), y = Akurasi, fill = Kelas)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = Akurasi), vjust = -0.3, size = 3.5) +
  theme_minimal() +
  labs(title = "Akurasi per Kelas Cuaca",
       x = "Kategori Cuaca", y = "Akurasi") +
  ylim(0, 1)

bisa dilihat, secara per kelas, performa model sangat bervariasi. Untuk kelas 0, sensitivitas dan spesifisitas hampir sempurna (1.000 dan 0.9996), namun ini bisa disebabkan oleh prevalensi yang sangat rendah (0.22%). Kelas 1 menunjukkan performa terbaik dengan sensitivitas 78.41% dan nilai prediksi positif 97.94%. Sebaliknya, kelas 2 dan 3 memiliki sensitivitas rendah serta nilai prediksi positif yang juga rendah, menandakan bahwa model mengalami kesulitan membedakan kedua kelas ini.

8.2 Confusion Matrix Heatmap

library(ggplot2)
library(caret)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.4.3
# Ubah confusion matrix menjadi data frame
cm_df <- as.data.frame(conf_matrix_mlr$table)

# Visualisasi heatmap
ggplot(cm_df, aes(x = Reference, y = Prediction, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(aes(label = Freq), size = 4) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(title = "Confusion Matrix Multinomial Logistic Regression",
       x = "Actual (Reference)",
       y = "Predicted") +
  theme_minimal()

9 REFERENSI

  • [1] I. Qutab, K. I. Malik, and H. Arooj, “Sentiment classification using multinomial logistic regression on Roman Urdu text,” Int. J. Innov. Sci. Technol., vol. 4, no. 2, pp. 323–335, 2022, doi: 10.33411/IJIST/2022040204.
  • [2] N. Anusha, M. Sai Chaithanya, and G. Jithendranath Reddy, “Weather prediction using multi linear regression algorithm,” in IOP Conference Series: Materials Science and Engineering, vol. 590, 2019, Art. no. 012034, doi: 10.1088/1757-899X/590/1/012034.
  • [3] E. Novitasari and A. Sofro, “Analisis regresi multinomial untuk pemodelan faktor penyebab kekerasan dalam rumah tangga,” MATHunesa, vol. 11, no. 1, 2023. [Online]. Available: https://journal.unesa.ac.id/index.php/mathunesa