Dosen Pengampu: Ike Fitriyaningsih, M.Si
Mata Kuliah: Analisis Multivariat
Kelas: Sains Data 2023 E
library(readr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
data_path <- "openweatherdata-denpasar-1990-2020.csv"
data <- read_csv(data_path)
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 264924 Columns: 32
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): dt_iso, city_name, weather_main, weather_description, weather_icon
## dbl (17): dt, timezone, lat, lon, temp, temp_min, temp_max, pressure, humidi...
## lgl (10): sea_level, grnd_level, rain_12h, rain_today, snow_1h, snow_3h, sno...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
data$dt_iso <- gsub(" UTC", "", data$dt_iso)
data$dt_iso <- as_datetime(data$dt_iso)
data_filtered <- data %>% filter(year(dt_iso) == 2019)
if (nrow(data_filtered) == 0) stop("Tidak ada data untuk tahun 2019!")
head(data_filtered)
## # A tibble: 6 × 32
## dt dt_iso timezone city_name lat lon temp temp_min
## <dbl> <dttm> <dbl> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1546300800 2019-01-01 00:00:00 28800 Denpasar -8.65 115. 27.1 27
## 2 1546304400 2019-01-01 01:00:00 28800 Denpasar -8.65 115. 27.8 27.4
## 3 1546308000 2019-01-01 02:00:00 28800 Denpasar -8.65 115. 29.5 29
## 4 1546311600 2019-01-01 03:00:00 28800 Denpasar -8.65 115. 30.1 30
## 5 1546315200 2019-01-01 04:00:00 28800 Denpasar -8.65 115. 30.2 30
## 6 1546318800 2019-01-01 05:00:00 28800 Denpasar -8.65 115. 30 30
## # ℹ 24 more variables: temp_max <dbl>, pressure <dbl>, sea_level <lgl>,
## # grnd_level <lgl>, humidity <dbl>, wind_speed <dbl>, wind_deg <dbl>,
## # rain_1h <dbl>, rain_3h <dbl>, rain_6h <dbl>, rain_12h <lgl>,
## # rain_24h <dbl>, rain_today <lgl>, snow_1h <lgl>, snow_3h <lgl>,
## # snow_6h <lgl>, snow_12h <lgl>, snow_24h <lgl>, snow_today <lgl>,
## # clouds_all <dbl>, weather_id <dbl>, weather_main <chr>,
## # weather_description <chr>, weather_icon <chr>
cat("Jumlah baris:", nrow(data_filtered), "\n")
## Jumlah baris: 8822
cat("Jumlah kolom:", ncol(data_filtered), "\n")
## Jumlah kolom: 32
na_summary <- data.frame(Kolom = names(data_filtered),
Jumlah_NA = colSums(is.na(data_filtered))) %>%
arrange(desc(Jumlah_NA))
na_summary
## Kolom Jumlah_NA
## sea_level sea_level 8822
## grnd_level grnd_level 8822
## rain_12h rain_12h 8822
## rain_today rain_today 8822
## snow_1h snow_1h 8822
## snow_3h snow_3h 8822
## snow_6h snow_6h 8822
## snow_12h snow_12h 8822
## snow_24h snow_24h 8822
## snow_today snow_today 8822
## rain_1h rain_1h 8813
## rain_24h rain_24h 8543
## rain_6h rain_6h 8541
## rain_3h rain_3h 8275
## dt dt 0
## dt_iso dt_iso 0
## timezone timezone 0
## city_name city_name 0
## lat lat 0
## lon lon 0
## temp temp 0
## temp_min temp_min 0
## temp_max temp_max 0
## pressure pressure 0
## humidity humidity 0
## wind_speed wind_speed 0
## wind_deg wind_deg 0
## clouds_all clouds_all 0
## weather_id weather_id 0
## weather_main weather_main 0
## weather_description weather_description 0
## weather_icon weather_icon 0
freq_weather <- table(data_filtered$weather_main, useNA = "ifany")
weather_summary <- as.data.frame(freq_weather)
colnames(weather_summary) <- c("Weather Main", "Jumlah")
weather_summary
## Weather Main Jumlah
## 1 Clear 22
## 2 Clouds 8075
## 3 Haze 5
## 4 Mist 3
## 5 Rain 571
## 6 Smoke 3
## 7 Thunderstorm 143
library(ggplot2)
ggplot(weather_summary, aes(x = `Weather Main`, y = Jumlah, fill = `Weather Main`)) +
geom_bar(stat = "identity") +
theme_minimal() +
labs(title = "Distribusi Frekuensi Kategori Cuaca (2019)",
x = "Jenis Cuaca",
y = "Jumlah Observasi") +
geom_text(aes(label = Jumlah), vjust = -0.5)
numeric_cols_to_plot <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")
data_for_hist <- data_filtered %>% select(all_of(numeric_cols_to_plot))
library(ggplot2)
library(tidyr)
data_for_hist %>%
pivot_longer(cols = everything(), names_to = "variable", values_to = "value") %>%
ggplot(aes(x = value)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "steelblue", color = "black", alpha = 0.7) +
geom_density(color = "red") +
facet_wrap(~variable, scales = "free") +
theme_minimal() +
labs(title = "Distribusi Variabel Numerik",
x = "Nilai",
y = "Densitas")
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
data_filtered <- data_filtered %>%
select(temp, temp_min, temp_max, pressure, humidity,
wind_speed, wind_deg, clouds_all, weather_main) %>%
filter(weather_main %in% c("Clouds", "Rain", "Thunderstorm", "Clear"))
if (length(unique(data_filtered$weather_main)) < 2) {
stop("Kolom weather_main hanya memiliki satu kategori setelah filter. Tidak bisa dilakukan split!")
}
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
data_filtered$weather_main <- as.integer(data_filtered$weather_main) - 1
is.ordered(data_filtered$weather_main)
## [1] FALSE
str(data_filtered)
## tibble [8,811 × 9] (S3: tbl_df/tbl/data.frame)
## $ temp : num [1:8811] 27.1 27.8 29.5 30.1 30.1 ...
## $ temp_min : num [1:8811] 27 27.4 29 30 30 30 30 29 29 29 ...
## $ temp_max : num [1:8811] 27.4 28 30.4 30.4 30.4 30 30 30 29.4 29.4 ...
## $ pressure : num [1:8811] 1010 1010 1010 1010 1009 ...
## $ humidity : num [1:8811] 94 88 83 79 79 79 74 79 83 79 ...
## $ wind_speed : num [1:8811] 0.5 2.1 3.1 4.1 4.1 5.1 4.6 5.7 5.7 5.7 ...
## $ wind_deg : num [1:8811] 0 240 240 260 260 250 260 260 260 270 ...
## $ clouds_all : num [1:8811] 40 20 20 20 20 20 20 20 20 20 ...
## $ weather_main: num [1:8811] 2 2 1 1 1 1 1 1 1 1 ...
unique(data_filtered$weather_main)
## [1] 2 1 0 3
data_filtered <- data_filtered %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), mean(., na.rm = TRUE), .)))
cat("Jumlah nilai NA setelah imputasi (dengan mean):\n")
## Jumlah nilai NA setelah imputasi (dengan mean):
print(colSums(is.na(data_filtered)))
## temp temp_min temp_max pressure humidity wind_speed
## 0 0 0 0 0 0
## wind_deg clouds_all weather_main
## 0 0 0
library(caret)
## Loading required package: lattice
data_filtered$weather_main <- as.factor(data_filtered$weather_main)
split_index <- createDataPartition(data_filtered$weather_main, p = 0.7, list = FALSE)
train_data <- data_filtered[split_index, ]
test_data <- data_filtered[-split_index, ]
if (!require(smotefamily)) install.packages("smotefamily")
## Loading required package: smotefamily
library(smotefamily)
set.seed(123)
n_total_train <- nrow(train_data)
n_target_final <- n_total_train * 2
n_classes <- length(levels(train_data$weather_main))
n_per_class <- floor(n_target_final / n_classes)
list_per_class <- split(train_data, train_data$weather_main)
balanced_list <- list()
for (cls in names(list_per_class)) {
data_cls <- list_per_class[[cls]]
n_cls <- nrow(data_cls)
balanced_list[[cls]] <- if (n_cls > n_per_class) {
slice_sample(data_cls, n = n_per_class)
} else {
slice_sample(data_cls, n = n_per_class, replace = TRUE)
}
}
train_balanced <- bind_rows(balanced_list)
cat("Total data train setelah balancing:", nrow(train_balanced), "\n")
## Total data train setelah balancing: 12340
# 1. Data distribusi kelas SEBELUM balancing (dari train_data)
dist_train_asli <- as.data.frame(table(train_data$weather_main))
colnames(dist_train_asli) <- c("Kelas_Cuaca", "Jumlah")
dist_train_asli$Keterangan <- "Data Latih (Sebelum Balancing)"
# 2. Data distribusi kelas SETELAH balancing (dari train_balanced)
dist_train_seimbang <- as.data.frame(table(train_balanced$weather_main))
colnames(dist_train_seimbang) <- c("Kelas_Cuaca", "Jumlah")
dist_train_seimbang$Keterangan <- "Data Latih (Setelah Balancing)"
# 3. Data distribusi kelas pada data TEST (dari test_data)
dist_test <- as.data.frame(table(test_data$weather_main))
colnames(dist_test) <- c("Kelas_Cuaca", "Jumlah")
dist_test$Keterangan <- "Data Uji (Tidak Diubah)"
data_plot_distribusi <- rbind(dist_train_asli, dist_train_seimbang, dist_test)
ggplot(data_plot_distribusi, aes(x = Kelas_Cuaca, y = Jumlah, fill = Keterangan)) +
geom_bar(stat = "identity", position = position_dodge(width = 0.9)) +
geom_text(aes(label = Jumlah),
position = position_dodge(width = 0.9),
vjust = -0.3,
size = 3.5) +
theme_minimal(base_size = 12) +
labs(title = "Perbandingan Distribusi Kelas Cuaca",
x = "Jenis Cuaca",
y = "Jumlah Sampel",
fill = "Keterangan Data") +
theme(axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, face = "bold"),
legend.position = "top") +
scale_fill_brewer(palette = "Set2")
feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity", "wind_speed", "wind_deg", "clouds_all")
min_max_norm <- function(x) (x - min(x)) / (max(x) - min(x))
train_balanced_norm <- train_balanced
train_balanced_norm[feature_cols] <- lapply(train_balanced[feature_cols], min_max_norm)
test_data_norm <- test_data
for (col in feature_cols) {
min_val <- min(train_balanced[[col]])
max_val <- max(train_balanced[[col]])
test_data_norm[[col]] <- (test_data[[col]] - min_val) / (max_val - min_val)
}
summary(train_balanced_norm)
## temp temp_min temp_max pressure
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3761 1st Qu.:0.3676 1st Qu.:0.3731 1st Qu.:0.3636
## Median :0.4615 Median :0.4643 Median :0.4627 Median :0.4545
## Mean :0.4750 Mean :0.4702 Mean :0.4833 Mean :0.4705
## 3rd Qu.:0.5544 3rd Qu.:0.5536 3rd Qu.:0.5746 3rd Qu.:0.6364
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## humidity wind_speed wind_deg clouds_all
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5333 1st Qu.:0.1201 1st Qu.:0.2778 1st Qu.:0.0800
## Median :0.6222 Median :0.2319 Median :0.3889 Median :0.4000
## Mean :0.6425 Mean :0.2535 Mean :0.4832 Mean :0.3529
## 3rd Qu.:0.8667 3rd Qu.:0.3438 3rd Qu.:0.7500 3rd Qu.:0.7500
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## weather_main
## 0:3085
## 1:3085
## 2:3085
## 3:3085
##
##
summary(test_data_norm)
## temp temp_min temp_max pressure
## Min. :0.0007553 Min. :-0.04167 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3836858 1st Qu.: 0.39732 1st Qu.:0.3731 1st Qu.:0.3636
## Median :0.4773414 Median : 0.47917 Median :0.5000 Median :0.5455
## Mean :0.4938370 Mean : 0.49112 Mean :0.4995 Mean :0.5293
## 3rd Qu.:0.6027190 3rd Qu.: 0.59821 3rd Qu.:0.5970 3rd Qu.:0.6364
## Max. :0.9773414 Max. : 0.96280 Max. :0.9701 Max. :1.0909
## humidity wind_speed wind_deg clouds_all
## Min. :0.0000 Min. :0.0007457 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.4889 1st Qu.:0.1200597 1st Qu.:0.2778 1st Qu.:0.2000
## Median :0.6222 Median :0.2319165 Median :0.3333 Median :0.2000
## Mean :0.6030 Mean :0.2553001 Mean :0.4263 Mean :0.3102
## 3rd Qu.:0.7333 3rd Qu.:0.3437733 3rd Qu.:0.6944 3rd Qu.:0.4000
## Max. :1.0000 Max. :0.9254288 Max. :1.0000 Max. :1.0000
## weather_main
## 0: 6
## 1:2422
## 2: 171
## 3: 42
##
##
library(corrplot)
## corrplot 0.95 loaded
correlation_matrix <- cor(train_balanced_norm[, feature_cols])
corrplot(correlation_matrix, method = "color", type = "upper", tl.col = "black",
addCoef.col = "black", number.cex = 0.7, diag = FALSE)
library(biotools) # untuk Box's M test
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked _by_ '.GlobalEnv':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
## ---
## biotools version 4.3
library(car) # untuk VIF
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
library(nnet) # untuk multinomial logistic regression
# Daftar fitur numerik
feature_cols <- c("temp", "temp_min", "temp_max", "pressure", "humidity",
"wind_speed", "wind_deg", "clouds_all")
cat("== Uji Normalitas Tiap Kelas (Shapiro-Wilk) ==\n")
## == Uji Normalitas Tiap Kelas (Shapiro-Wilk) ==
# Uji normalitas univariat untuk tiap fitur per kelas
for (cls in levels(train_balanced$weather_main)) {
cat("Kelas:", cls, "\n")
subset_data <- train_balanced[train_balanced$weather_main == cls, feature_cols]
for (feature in feature_cols) {
shapiro_test <- shapiro.test(subset_data[[feature]])
cat(sprintf(" %s: W=%.4f, p=%.4f\n", feature, shapiro_test$statistic, shapiro_test$p.value))
}
cat("\n")
}
## Kelas: 0
## temp: W=0.9280, p=0.0000
## temp_min: W=0.8951, p=0.0000
## temp_max: W=0.9370, p=0.0000
## pressure: W=0.8809, p=0.0000
## humidity: W=0.8845, p=0.0000
## wind_speed: W=0.9531, p=0.0000
## wind_deg: W=0.6410, p=0.0000
## clouds_all: W=0.6887, p=0.0000
##
## Kelas: 1
## temp: W=0.9910, p=0.0000
## temp_min: W=0.9906, p=0.0000
## temp_max: W=0.9870, p=0.0000
## pressure: W=0.9784, p=0.0000
## humidity: W=0.9660, p=0.0000
## wind_speed: W=0.9696, p=0.0000
## wind_deg: W=0.8954, p=0.0000
## clouds_all: W=0.6177, p=0.0000
##
## Kelas: 2
## temp: W=0.9891, p=0.0000
## temp_min: W=0.9796, p=0.0000
## temp_max: W=0.9795, p=0.0000
## pressure: W=0.9684, p=0.0000
## humidity: W=0.9273, p=0.0000
## wind_speed: W=0.8844, p=0.0000
## wind_deg: W=0.8555, p=0.0000
## clouds_all: W=0.8263, p=0.0000
##
## Kelas: 3
## temp: W=0.9662, p=0.0000
## temp_min: W=0.9629, p=0.0000
## temp_max: W=0.9563, p=0.0000
## pressure: W=0.9717, p=0.0000
## humidity: W=0.9206, p=0.0000
## wind_speed: W=0.8877, p=0.0000
## wind_deg: W=0.8762, p=0.0000
## clouds_all: W=0.7882, p=0.0000
cat("== Uji Homogenitas Matriks Kovarian (Box's M) ==\n")
## == Uji Homogenitas Matriks Kovarian (Box's M) ==
boxM_result <- boxM(train_balanced[, feature_cols], train_balanced$weather_main)
boxM_result
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: train_balanced[, feature_cols]
## Chi-Sq (approx.) = 25173, df = 108, p-value < 2.2e-16
cat("== Uji Normalitas Multivariat ==\n")
## == Uji Normalitas Multivariat ==
library(MVN)
# Uji normalitas multivariat dengan metode Mardia
result <- mvn(data = train_balanced %>% dplyr::select(-weather_main), mvnTest = "mardia")
result$multivariateNormality
## Test Statistic p value Result
## 1 Mardia Skewness 43175.1969984824 0 NO
## 2 Mardia Kurtosis 79.7638388317432 0 NO
## 3 MVN <NA> <NA> NO
library(MASS)
model_lda <- lda(weather_main ~ ., data = train_balanced_norm)
pred_lda <- predict(model_lda, test_data_norm)$class
conf_matrix_lda <- confusionMatrix(pred_lda, test_data_norm$weather_main)
cat("=== Evaluasi LDA ===\n")
## === Evaluasi LDA ===
conf_matrix_lda
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3
## 0 5 300 1 0
## 1 1 1678 39 12
## 2 0 251 81 11
## 3 0 193 50 19
##
## Overall Statistics
##
## Accuracy : 0.6751
## 95% CI : (0.6569, 0.693)
## No Information Rate : 0.9171
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.1649
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3
## Sensitivity 0.833333 0.6928 0.47368 0.452381
## Specificity 0.885769 0.7626 0.89393 0.906503
## Pos Pred Value 0.016340 0.9699 0.23615 0.072519
## Neg Pred Value 0.999572 0.1833 0.96084 0.990332
## Prevalence 0.002272 0.9171 0.06475 0.015903
## Detection Rate 0.001893 0.6354 0.03067 0.007194
## Detection Prevalence 0.115865 0.6551 0.12988 0.099205
## Balanced Accuracy 0.859551 0.7277 0.68381 0.679442
accuracy_lda <- conf_matrix_lda$overall['Accuracy']
cat(sprintf("Accuracy LDA: %.4f\n", accuracy_lda))
## Accuracy LDA: 0.6751
library(ggplot2)
conf_table <- as.data.frame(conf_matrix_lda$table)
colnames(conf_table) <- c("Prediction", "Reference", "Freq")
ggplot(data = conf_table, aes(x = Prediction, y = Reference, fill = Freq)) +
geom_tile() +
geom_text(aes(label = Freq), vjust = 0.5, color = "white", size = 5) +
scale_fill_gradient(low = "lightblue", high = "darkblue") +
theme_minimal() +
labs(title = "Matriks Konfusi LDA",
x = "Kelas Prediksi",
y = "Kelas Aktual")
lda_predictions_test <- predict(model_lda, test_data_norm)
lda_plot_data_test <- data.frame(LD1 = lda_predictions_test$x[,1],
LD2 = if(ncol(lda_predictions_test$x) >= 2) lda_predictions_test$x[,2] else NA,
weather_main = test_data_norm$weather_main)
if (ncol(lda_predictions_test$x) >= 2) {
library(ggplot2)
ggplot(lda_plot_data_test, aes(x = LD1, y = LD2, color = weather_main)) +
geom_point(alpha = 0.7) +
stat_ellipse(aes(group = weather_main), type = "norm", level = 0.68, linetype = 2) +
theme_minimal() +
labs(title = "Proyeksi Data Test pada Fungsi Diskriminan Linier",
x = "LD1", y = "LD2", color = "Jenis Cuaca Aktual")
} else if (ncol(lda_predictions_test$x) == 1) {
library(ggplot2)
ggplot(lda_plot_data_test, aes(x = LD1, fill = weather_main)) +
geom_density(alpha = 0.7) +
theme_minimal() +
labs(title = "Densitas LD1 per Jenis Cuaca Aktual (Data Test)",
x = "LD1", fill = "Jenis Cuaca Aktual")
}
lda_coefficients <- as.data.frame(model_lda$scaling)
lda_coefficients$variable <- rownames(lda_coefficients)
library(ggplot2)
library(tidyr)
lda_coefficients_long <- lda_coefficients %>%
pivot_longer(cols = -variable, names_to = "LD", values_to = "Coefficient")
# Plot koefisien
ggplot(lda_coefficients_long, aes(x = reorder(variable, Coefficient), y = Coefficient, fill = LD)) +
geom_bar(stat = "identity", position = position_dodge()) +
coord_flip() +
facet_wrap(~LD) +
theme_minimal() +
labs(title = "Koefisien Fungsi Diskriminan Linier (LDA Loadings)",
x = "Variabel",
y = "Koefisien (Bobot)") +
scale_fill_brewer(palette = "Set1")
[1] M. S. Tjahaya, Raupong, and G. M. Tinungki, “Analisis diskriminan linear robust dengan metode Winsorized modified one-step M-estimator,” Estimasi, vol. 3, no. 1, pp. 1–13, 2022, doi: 10.20956/ejsa.vi.11302.
[2] M. S. Irwanto, F. A. Bachtiar, and N. Yudistira, “Klasifikasi aktivitas manusia menggunakan algoritme computed-input-weight extreme learning machine dengan reduksi dimensi principal component analysis,” J. Teknol. Inf. Ilmu Komput., vol. 9, no. 6, pp. 1195–1202, 2022, doi: 10.25126/jtiik.202295504.
[3] C. Herdian, A. Kamila, and I. G. A. M. Budidarma, “Studi kasus feature engineering untuk data teks: perbandingan label encoding dan one-hot encoding pada metode linear regresi,” Technologia: J. Ilmiah, vol. 15, no. 1, Art. no. e13457, 2024, doi: 10.31602/tji.v15i1.13457.
[4] P. J. Muhammad Ali, “Investigating the impact of min-max data normalization on the regression performance of k-nearest neighbor with different similarity measurements,” ARO Sci. J. Koya Univ., vol. 10, pp. 85–91, 2022, doi: 10.14500/aro.10955.