1 Load Library dan Data

library(readr)
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
data_path <- "D:/TUGAS KAMPUS SEMESTER 4/ANALISIS MULTIVARIAT/PROYEK AKHIR ANMUL/openweatherdata-denpasar-1990-2020.csv"
data <- read_csv(data_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 264924 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (5): dt_iso, city_name, weather_main, weather_description, weather_icon
## dbl (17): dt, timezone, lat, lon, temp, temp_min, temp_max, pressure, humidi...
## lgl (10): sea_level, grnd_level, rain_12h, rain_today, snow_1h, snow_3h, sno...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$dt_iso <- gsub(" UTC", "", data$dt_iso)
data$dt_iso <- as_datetime(data$dt_iso)
data_filtered <- data %>% filter(year(dt_iso) == 2019)

if (nrow(data_filtered) == 0) stop("Tidak ada data untuk tahun 2019!")

head(data_filtered)
## # A tibble: 6 × 32
##           dt dt_iso              timezone city_name   lat   lon  temp temp_min
##        <dbl> <dttm>                 <dbl> <chr>     <dbl> <dbl> <dbl>    <dbl>
## 1 1546300800 2019-01-01 00:00:00    28800 Denpasar  -8.65  115.  27.1     27  
## 2 1546304400 2019-01-01 01:00:00    28800 Denpasar  -8.65  115.  27.8     27.4
## 3 1546308000 2019-01-01 02:00:00    28800 Denpasar  -8.65  115.  29.5     29  
## 4 1546311600 2019-01-01 03:00:00    28800 Denpasar  -8.65  115.  30.1     30  
## 5 1546315200 2019-01-01 04:00:00    28800 Denpasar  -8.65  115.  30.2     30  
## 6 1546318800 2019-01-01 05:00:00    28800 Denpasar  -8.65  115.  30       30  
## # ℹ 24 more variables: temp_max <dbl>, pressure <dbl>, sea_level <lgl>,
## #   grnd_level <lgl>, humidity <dbl>, wind_speed <dbl>, wind_deg <dbl>,
## #   rain_1h <dbl>, rain_3h <dbl>, rain_6h <dbl>, rain_12h <lgl>,
## #   rain_24h <dbl>, rain_today <lgl>, snow_1h <lgl>, snow_3h <lgl>,
## #   snow_6h <lgl>, snow_12h <lgl>, snow_24h <lgl>, snow_today <lgl>,
## #   clouds_all <dbl>, weather_id <dbl>, weather_main <chr>,
## #   weather_description <chr>, weather_icon <chr>

2 EXPLORATORY DATA ANALYSIS

cat("Jumlah baris:", nrow(data_filtered), "\n")
## Jumlah baris: 8822
cat("Jumlah kolom:", ncol(data_filtered), "\n")
## Jumlah kolom: 32
na_summary <- data.frame(Kolom = names(data_filtered),
                         Jumlah_NA = colSums(is.na(data_filtered))) %>%
  arrange(desc(Jumlah_NA))
na_summary
##                                   Kolom Jumlah_NA
## sea_level                     sea_level      8822
## grnd_level                   grnd_level      8822
## rain_12h                       rain_12h      8822
## rain_today                   rain_today      8822
## snow_1h                         snow_1h      8822
## snow_3h                         snow_3h      8822
## snow_6h                         snow_6h      8822
## snow_12h                       snow_12h      8822
## snow_24h                       snow_24h      8822
## snow_today                   snow_today      8822
## rain_1h                         rain_1h      8813
## rain_24h                       rain_24h      8543
## rain_6h                         rain_6h      8541
## rain_3h                         rain_3h      8275
## dt                                   dt         0
## dt_iso                           dt_iso         0
## timezone                       timezone         0
## city_name                     city_name         0
## lat                                 lat         0
## lon                                 lon         0
## temp                               temp         0
## temp_min                       temp_min         0
## temp_max                       temp_max         0
## pressure                       pressure         0
## humidity                       humidity         0
## wind_speed                   wind_speed         0
## wind_deg                       wind_deg         0
## clouds_all                   clouds_all         0
## weather_id                   weather_id         0
## weather_main               weather_main         0
## weather_description weather_description         0
## weather_icon               weather_icon         0
freq_weather <- table(data_filtered$weather_main, useNA = "ifany")
weather_summary <- as.data.frame(freq_weather)
colnames(weather_summary) <- c("Weather Main", "Jumlah")
weather_summary
##   Weather Main Jumlah
## 1        Clear     22
## 2       Clouds   8075
## 3         Haze      5
## 4         Mist      3
## 5         Rain    571
## 6        Smoke      3
## 7 Thunderstorm    143

3 PREPROCESSING

data_filtered <- data_filtered %>%
  select(temp, temp_min, temp_max, pressure, humidity,
         wind_speed, wind_deg, clouds_all, weather_main) %>%
  filter(weather_main %in% c("Clouds", "Rain", "Thunderstorm", "Clear"))

if (length(unique(data_filtered$weather_main)) < 2) {
  stop("Kolom weather_main hanya memiliki satu kategori setelah filter. Tidak bisa dilakukan split!")
}

data_filtered$weather_main <- as.factor(data_filtered$weather_main)
data_filtered$weather_main <- as.integer(data_filtered$weather_main) - 1

is.ordered(data_filtered$weather_main)
## [1] FALSE
str(data_filtered)
## tibble [8,811 × 9] (S3: tbl_df/tbl/data.frame)
##  $ temp        : num [1:8811] 27.1 27.8 29.5 30.1 30.1 ...
##  $ temp_min    : num [1:8811] 27 27.4 29 30 30 30 30 29 29 29 ...
##  $ temp_max    : num [1:8811] 27.4 28 30.4 30.4 30.4 30 30 30 29.4 29.4 ...
##  $ pressure    : num [1:8811] 1010 1010 1010 1010 1009 ...
##  $ humidity    : num [1:8811] 94 88 83 79 79 79 74 79 83 79 ...
##  $ wind_speed  : num [1:8811] 0.5 2.1 3.1 4.1 4.1 5.1 4.6 5.7 5.7 5.7 ...
##  $ wind_deg    : num [1:8811] 0 240 240 260 260 250 260 260 260 270 ...
##  $ clouds_all  : num [1:8811] 40 20 20 20 20 20 20 20 20 20 ...
##  $ weather_main: num [1:8811] 2 2 1 1 1 1 1 1 1 1 ...
unique(data_filtered$weather_main)
## [1] 2 1 0 3

4 Split Data

library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
split_index <- createDataPartition(data_filtered$weather_main, p = 0.7, list = FALSE)
train_data <- data_filtered[split_index, ]
test_data <- data_filtered[-split_index, ]

5 Penyeimbangan Data

if (!require(smotefamily)) install.packages("smotefamily")
## Loading required package: smotefamily
## Warning: package 'smotefamily' was built under R version 4.4.3
library(smotefamily)

set.seed(123)
n_total_train <- nrow(train_data)
n_target_final <- n_total_train * 2
n_classes <- length(levels(train_data$weather_main))
n_per_class <- floor(n_target_final / n_classes)

list_per_class <- split(train_data, train_data$weather_main)
balanced_list <- list()

for (cls in names(list_per_class)) {
  data_cls <- list_per_class[[cls]]
  n_cls <- nrow(data_cls)
  balanced_list[[cls]] <- if (n_cls > n_per_class) {
    slice_sample(data_cls, n = n_per_class)
  } else {
    slice_sample(data_cls, n = n_per_class, replace = TRUE)
  }
}
train_balanced <- bind_rows(balanced_list)

cat("Distribusi kelas setelah balancing di train_data:\n")
## Distribusi kelas setelah balancing di train_data:
table(train_balanced$weather_main)
## 
##    0    1    2    3 
## 3085 3085 3085 3085
cat("Total data train setelah balancing:", nrow(train_balanced), "\n")
## Total data train setelah balancing: 12340
cat("Distribusi kelas di test_data (tidak diubah):\n")
## Distribusi kelas di test_data (tidak diubah):
table(test_data$weather_main)
## 
##    0    1    2    3 
##    6 2422  171   42

6 Uji Asumsi (VIF dan Chi-square)

6.1 Uji VIF

library(car)
## Warning: package 'car' was built under R version 4.4.3
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.4.3
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
temp_vars <- dplyr::select(train_balanced, temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars, scale. = TRUE)
pc_df <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc_df) <- "pca_temp"

train_balanced_pca <- train_balanced %>%
  dplyr::select(-temp, -temp_min, -temp_max) %>%
  bind_cols(pc_df)

lm_model <- lm(as.numeric(weather_main) ~ ., data = train_balanced_pca)
vif_values <- vif(lm_model)
vif_values
##   pressure   humidity wind_speed   wind_deg clouds_all   pca_temp 
##   1.373099   2.356137   1.193936   1.129373   1.289307   2.299876

6.2 Uji Chi-Square

train_balanced_pca$weather_main <- as.factor(train_balanced_pca$weather_main)
num_vars <- names(train_balanced_pca)[sapply(train_balanced_pca, is.numeric)]
chi_sq_test <- function(varname, data, target) {
  binned_var <- cut(data[[varname]], breaks = 5, include.lowest = TRUE)
  contingency_table <- table(binned_var, data[[target]])
  if (all(dim(contingency_table) > 1)) {
    chisq.test(contingency_table)$p.value
  } else NA
}

chi_results <- sapply(num_vars, chi_sq_test, data = train_balanced_pca, target = "weather_main")
chi_df <- data.frame(Variable = names(chi_results), P_Value = chi_results)
chi_df
##              Variable       P_Value
## pressure     pressure 1.513836e-210
## humidity     humidity  0.000000e+00
## wind_speed wind_speed 4.301578e-318
## wind_deg     wind_deg  0.000000e+00
## clouds_all clouds_all  0.000000e+00
## pca_temp     pca_temp 3.883452e-151

7 Normalisasi Data

feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")
min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))

train_balanced_norm <- train_balanced
train_balanced_norm[feature_cols] <- lapply(train_balanced[feature_cols], min_max_norm)

test_data_norm <- test_data
for (col in feature_cols) {
  min_val <- min(train_balanced[[col]])
  max_val <- max(train_balanced[[col]])
  test_data_norm[[col]] <- (test_data[[col]] - min_val) / (max_val - min_val)
}

summary(train_balanced_norm)
##       temp           temp_min         temp_max         pressure     
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.3384   1st Qu.:0.4167   1st Qu.:0.3333   1st Qu.:0.2583  
##  Median :0.4316   Median :0.5152   Median :0.4250   Median :0.4167  
##  Mean   :0.4495   Mean   :0.5222   Mean   :0.4534   Mean   :0.4236  
##  3rd Qu.:0.5273   3rd Qu.:0.6061   3rd Qu.:0.5500   3rd Qu.:0.5833  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##     humidity        wind_speed        wind_deg        clouds_all    
##  Min.   :0.0000   Min.   :0.0000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5625   1st Qu.:0.1201   1st Qu.:0.2778   1st Qu.:0.1000  
##  Median :0.6875   Median :0.2319   Median :0.3833   Median :0.4000  
##  Mean   :0.6750   Mean   :0.2464   Mean   :0.4780   Mean   :0.3487  
##  3rd Qu.:0.8750   3rd Qu.:0.3438   3rd Qu.:0.7500   3rd Qu.:0.6000  
##  Max.   :1.0000   Max.   :1.0000   Max.   :1.0000   Max.   :1.0000  
##  weather_main
##  0:3085      
##  1:3085      
##  2:3085      
##  3:3085      
##              
## 
summary(test_data_norm)
##       temp             temp_min          temp_max           pressure     
##  Min.   :-0.07809   Min.   :0.04242   Min.   :-0.08333   Min.   :0.0000  
##  1st Qu.: 0.34761   1st Qu.:0.43939   1st Qu.: 0.33333   1st Qu.:0.3333  
##  Median : 0.45676   Median :0.53030   Median : 0.48333   Median :0.5000  
##  Mean   : 0.47575   Mean   :0.54638   Mean   : 0.47962   Mean   :0.4833  
##  3rd Qu.: 0.59782   3rd Qu.:0.66667   3rd Qu.: 0.58333   3rd Qu.:0.5833  
##  Max.   : 1.03359   Max.   :1.06061   Max.   : 1.03333   Max.   :0.9167  
##     humidity        wind_speed           wind_deg        clouds_all    
##  Min.   :0.0625   Min.   :-0.005966   Min.   :0.0000   Min.   :0.0100  
##  1st Qu.:0.4583   1st Qu.: 0.120060   1st Qu.:0.2778   1st Qu.:0.2000  
##  Median :0.6458   Median : 0.231916   Median :0.3333   Median :0.2000  
##  Mean   :0.6260   Mean   : 0.257245   Mean   :0.4262   Mean   :0.3119  
##  3rd Qu.:0.7500   3rd Qu.: 0.388516   3rd Qu.:0.6944   3rd Qu.:0.4000  
##  Max.   :1.0000   Max.   : 0.888143   Max.   :1.0000   Max.   :1.0000  
##  weather_main
##  0:   6      
##  1:2422      
##  2: 171      
##  3:  42      
##              
## 

8 Visualisasi Korelasi

library(corrplot)
## Warning: package 'corrplot' was built under R version 4.4.3
## corrplot 0.95 loaded
correlation_matrix <- cor(train_balanced_norm[, feature_cols])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black", 
         addCoef.col = "black", number.cex = 0.7, diag = FALSE)

9 PEMODELAN - Multinomial Logistic Regression + PCA

10 karena fitur temp, temp_min, & temp_max mempunyai korelasi yang sangat tinggi, maka saya lakukan PCA kepada ketiga fitur menjadi 1 fitur (pca_temp).

library(nnet)

temp_vars_train <- train_balanced_norm %>% dplyr::select(temp, temp_min, temp_max)
pca_temp <- prcomp(temp_vars_train, scale. = TRUE)
pc1_train <- as.data.frame(pca_temp$x[,1, drop=FALSE])
colnames(pc1_train) <- "pca_temp"

train_mlr <- train_balanced_norm %>%
  dplyr::select(-temp, -temp_min, -temp_max) %>%
  bind_cols(pc1_train)

model_mlr <- multinom(weather_main ~ ., data = train_mlr, trace = FALSE)

temp_vars_test <- test_data_norm %>% dplyr::select(temp, temp_min, temp_max)
pc1_test_mat <- predict(pca_temp, newdata = temp_vars_test)[, 1, drop = FALSE]
pc1_test_df <- as.data.frame(pc1_test_mat)
colnames(pc1_test_df) <- "pca_temp"

test_mlr <- test_data_norm %>%
  dplyr::select(-temp, -temp_min, -temp_max) %>%
  bind_cols(pc1_test_df)
test_mlr$weather_main <- as.factor(test_mlr$weather_main)


pred_mlr <- predict(model_mlr, test_mlr)
conf_matrix_mlr <- confusionMatrix(pred_mlr, test_mlr$weather_main)

conf_matrix_mlr
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3
##          0    6    8    0    0
##          1    0 1917   41    6
##          2    0  251   79   14
##          3    0  246   51   22
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7664          
##                  95% CI : (0.7498, 0.7824)
##     No Information Rate : 0.9171          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.2406          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity          1.000000   0.7915  0.46199  0.52381
## Specificity          0.996964   0.7854  0.89271  0.88573
## Pos Pred Value       0.428571   0.9761  0.22965  0.06897
## Neg Pred Value       1.000000   0.2541  0.95995  0.99139
## Prevalence           0.002272   0.9171  0.06475  0.01590
## Detection Rate       0.002272   0.7259  0.02991  0.00833
## Detection Prevalence 0.005301   0.7437  0.13025  0.12079
## Balanced Accuracy    0.998482   0.7884  0.67735  0.70477