Input Data
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(naniar)
## Warning: package 'naniar' was built under R version 4.5.3
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ggplot2)
library(rpart)
library(missForest)
## Warning: package 'missForest' was built under R version 4.5.3
##
## Attaching package: 'missForest'
## The following object is masked from 'package:VIM':
##
## nrmse
# Ambil Data
data_mentah = read_excel("C:\\Users\\MUTHI'AH IFFA\\Downloads\\deska_baru\\variabel lain yg bisa digunakan.xlsx")
colnames(data_mentah) = c("Tanggal", "Tn", "Tx", "Tavg", "RH", "RR", "ss", "ff")
str(data_mentah)
## tibble [12,053 Ć 8] (S3: tbl_df/tbl/data.frame)
## $ Tanggal: POSIXct[1:12053], format: "1990-01-01" "1990-01-02" ...
## $ Tn : num [1:12053] 19 19 20 19 19 20 19 19 20 20 ...
## $ Tx : num [1:12053] 28.4 27 26.8 28.4 28.6 22 27.8 28.2 28.2 24.8 ...
## $ Tavg : num [1:12053] 22.2 22.5 23.1 22.5 21.5 20.3 22.3 23.3 23.2 21.8 ...
## $ RH : num [1:12053] 86 83 85 85 86 94 84 72 79 87 ...
## $ RR : num [1:12053] 6 47 0 12 16.5 1 13 1.5 2 0 ...
## $ ss : num [1:12053] 4.1 0.4 6.2 4.3 NA 0.1 5.2 2.8 5.1 0.4 ...
## $ ff : num [1:12053] 0 0 1 2 2 1 1 2 2 2 ...
Eksplorasi Data
summary(data_mentah$RR)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.0 1.3 603.1 11.3 8888.0 571
summary(data_mentah$Tn)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.00 19.00 20.00 19.38 20.00 29.00 578
summary(data_mentah$Tx)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.80 28.10 29.00 29.01 30.00 36.00 326
summary(data_mentah$RH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 42.00 73.00 79.00 77.99 84.00 96.00 330
summary(data_mentah$ss)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 3.300 5.100 4.924 6.800 10.000 492
# Cek Data Kosong
colSums(is.na(data_mentah))
## Tanggal Tn Tx Tavg RH RR ss ff
## 0 578 326 156 330 571 492 114
# Visualisasi Peta
vis_miss(data_mentah) +
ggtitle("Peta Missing Value")

# Visualisasi Kombinasi
aggr(data_mentah,
col = c('navyblue','red'),
numbers = TRUE,
sortVars = TRUE,
labels = names(data_mentah),
cex.axis = 0.7,
gap = 3,
ylab = c("Proporsi Missing Value", "Pola Kombinasi Missing Value"))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

##
## Variables sorted by number of missings:
## Variable Count
## Tn 0.047954866
## RR 0.047374098
## ss 0.040819713
## RH 0.027379076
## Tx 0.027047208
## Tavg 0.012942836
## ff 0.009458226
## Tanggal 0.000000000
Pre-Processing Data
Cleaning Data
# Ubah kode NA
data_bersih = data_mentah %>%
mutate(`RR` = ifelse(`RR` >= 8888, NA, `RR`))
Pengujian Data
Skenario (10% Data Kosong)
Data Baseline
# Data Utuh sebagai pembanding
data_lengkap = na.omit(data_bersih) # Hapus data kosong
# Skenario Pengujian (10% Data Hilang)
set.seed(123)
n_hilang_10 = round(0.10 * nrow(data_lengkap))
indeks_hilang_10 = sample(1:nrow(data_lengkap), n_hilang_10)
nilai_aktual_10 = data_lengkap$RR[indeks_hilang_10]
# Data Pengujian
data_pengujian_10 = data_lengkap
data_pengujian_10$RR[indeks_hilang_10] = NA
data_pengujian_model_10 = data_pengujian_10 %>% select(-Tanggal)
# Imputasi Baseline (Mean)
mean_RR_10 = mean(data_pengujian_model_10$RR, na.rm = TRUE)
data_baseline_10 = data_pengujian_model_10 %>%
mutate(RR = ifelse(is.na(RR), mean_RR_10, RR))
Eksekusi Model
# 1. Regresi
mod_reg_10 = lm(RR ~ ., data = data_baseline_10)
pred_reg_10 = predict(mod_reg_10, newdata = data_baseline_10)[indeks_hilang_10]
# 2. CART
mod_cart_10 = rpart(RR ~ ., data = data_baseline_10, method = "anova")
pred_cart_10 = predict(mod_cart_10, newdata = data_baseline_10)[indeks_hilang_10]
# 3. MissForest
set.seed(123)
res_mf_10 = missForest(as.data.frame(data_pengujian_model_10))
pred_mf_10 = res_mf_10$ximp$RR[indeks_hilang_10]
Evaluasi Model
library(Metrics)
tabel_10 = data.frame(
Metode = c("Regresi Linear", "CART", "MissForest"),
RMSE_10 = c(rmse(nilai_aktual_10, pred_reg_10),
rmse(nilai_aktual_10, pred_cart_10),
rmse(nilai_aktual_10, pred_mf_10))
)
print(tabel_10)
## Metode RMSE_10
## 1 Regresi Linear 11.15630
## 2 CART 11.02645
## 3 MissForest 10.88774
Skenario (20%)
Data Baseline
set.seed(123)
n_hilang_20 = round(0.20 * nrow(data_lengkap))
indeks_hilang_20 = sample(1:nrow(data_lengkap), n_hilang_20)
nilai_aktual_20 = data_lengkap$RR[indeks_hilang_20]
# Buat Data Pengujian 20%
data_pengujian_20 = data_lengkap
data_pengujian_20$RR[indeks_hilang_20] = NA
data_pengujian_model_20 = data_pengujian_20 %>% select(-Tanggal)
# Imputasi Baseline (Mean)
mean_RR_20 = mean(data_pengujian_model_20$RR, na.rm = TRUE)
data_baseline_20 = data_pengujian_model_20 %>%
mutate(RR = ifelse(is.na(RR), mean_RR_20, RR))
Eksekusi Model
# Regresi
mod_reg_20 = lm(RR ~ ., data = data_baseline_20)
pred_reg_20 = predict(mod_reg_20, newdata = data_baseline_20)[indeks_hilang_20]
# CART
mod_cart_20 = rpart(RR ~ ., data = data_baseline_20, method = "anova")
pred_cart_20 = predict(mod_cart_20, newdata = data_baseline_20)[indeks_hilang_20]
# MissForest
set.seed(123)
res_mf_20 = missForest(as.data.frame(data_pengujian_model_20))
pred_mf_20 = res_mf_20$ximp$RR[indeks_hilang_20]
Evaluasi Model
# Nilai RMSE
tabel_20 = data.frame(
Metode = c("Regresi Linear", "CART", "MissForest"),
RMSE_20 = c(rmse(nilai_aktual_20, pred_reg_20),
rmse(nilai_aktual_20, pred_cart_20),
rmse(nilai_aktual_20, pred_mf_20))
)
tabel_20
## Metode RMSE_20
## 1 Regresi Linear 11.28918
## 2 CART 11.25141
## 3 MissForest 11.17510
Skenario 30%
Bangun Baseline
set.seed(123)
n_hilang_30 = round(0.30 * nrow(data_lengkap))
indeks_hilang_30 = sample(1:nrow(data_lengkap), n_hilang_30)
nilai_aktual_30 = data_lengkap$RR[indeks_hilang_30]
# Buat Data Pengujian 30%
data_pengujian_30 = data_lengkap
data_pengujian_30$RR[indeks_hilang_30] = NA
data_pengujian_model_30 = data_pengujian_30 %>% select(-Tanggal)
# Imputasi Baseline (Mean)
mean_RR_30 = mean(data_pengujian_model_30$RR, na.rm = TRUE)
data_baseline_30 = data_pengujian_model_30 %>%
mutate(RR = ifelse(is.na(RR), mean_RR_30, RR))
Eksekusi Model
# Regresi
mod_reg_30 = lm(RR ~ ., data = data_baseline_30)
pred_reg_30 = predict(mod_reg_30, newdata = data_baseline_30)[indeks_hilang_30]
# CART
mod_cart_30 = rpart(RR ~ ., data = data_baseline_30, method = "anova")
pred_cart_30 = predict(mod_cart_30, newdata = data_baseline_30)[indeks_hilang_30]
# MissForest
set.seed(123)
res_mf_30 = missForest(as.data.frame(data_pengujian_model_30))
pred_mf_30 = res_mf_30$ximp$RR[indeks_hilang_30]
Evaluasi Model
# Evaluasi
tabel_30 = data.frame(
Metode = c("Regresi Linear", "CART", "MissForest"),
RMSE_30 = c(rmse(nilai_aktual_30, pred_reg_30),
rmse(nilai_aktual_30, pred_cart_30),
rmse(nilai_aktual_30, pred_mf_30))
)
tabel_30
## Metode RMSE_30
## 1 Regresi Linear 11.16582
## 2 CART 11.24811
## 3 MissForest 11.04306
Skenario 70%
Data Baseline
set.seed(123)
n_hilang_70 = round(0.70 * nrow(data_lengkap))
indeks_hilang_70 = sample(1:nrow(data_lengkap), n_hilang_70)
nilai_aktual_70 = data_lengkap$RR[indeks_hilang_70]
# Buat Data Pengujian 70%
data_pengujian_70 = data_lengkap
data_pengujian_70$RR[indeks_hilang_70] = NA
data_pengujian_model_70 = data_pengujian_70 %>% select(-Tanggal)
# Imputasi Baseline (Mean)
mean_RR_70 = mean(data_pengujian_model_70$RR, na.rm = TRUE)
data_baseline_70 = data_pengujian_model_70 %>%
mutate(RR = ifelse(is.na(RR), mean_RR_70, RR))
Eksekusi Model
# 1. Regresi
mod_reg_70 = lm(RR ~ ., data = data_baseline_70)
pred_reg_70 = predict(mod_reg_70, newdata = data_baseline_70)[indeks_hilang_70]
# 2. CART
mod_cart_70 = rpart(RR ~ ., data = data_baseline_70, method = "anova")
pred_cart_70 = predict(mod_cart_70, newdata = data_baseline_70)[indeks_hilang_70]
# 3. MissForest
set.seed(123)
res_mf_70 = missForest(as.data.frame(data_pengujian_model_70))
pred_mf_70 = res_mf_70$ximp$RR[indeks_hilang_70]
Evaluasi Model
tabel_70 = data.frame(
Metode = c("Regresi Linear", "CART", "MissForest"),
RMSE_70 = c(rmse(nilai_aktual_70, pred_reg_70),
rmse(nilai_aktual_70, pred_cart_70),
rmse(nilai_aktual_70, pred_mf_70))
)
tabel_70
## Metode RMSE_70
## 1 Regresi Linear 11.78416
## 2 CART 11.87934
## 3 MissForest 11.20408
Perbandingan Pengujian Data
# Rekap Hasil Evaluasi
rekap_final = tabel_10 %>%
left_join(tabel_20, by = "Metode") %>%
left_join(tabel_30, by = "Metode") %>%
left_join(tabel_70, by = "Metode")
# Visualisasi
library(ggplot2)
library(tidyr)
rekap_long = rekap_final %>%
pivot_longer(cols = starts_with("RMSE"),
names_to = "Skenario",
values_to = "RMSE")
ggplot(rekap_long, aes(x = Skenario, y = RMSE, group = Metode, color = Metode)) +
geom_line(size = 1.2) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "Perbandingan Kinerja Metode Imputasi",
subtitle = "Semakin rendah nilai RMSE, semakin akurat metode tersebut",
x = "Skenario Missing Value",
y = "Nilai RMSE") +
scale_color_manual(values = c("MissForest" = "firebrick",
"CART" = "orange",
"Regresi Linear" = "navyblue"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ā¹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

tabel_rmse = tabel_10 %>%
left_join(tabel_20, by = "Metode") %>%
left_join(tabel_30, by = "Metode") %>%
left_join(tabel_70, by = "Metode")
tabel_rmse
## Metode RMSE_10 RMSE_20 RMSE_30 RMSE_70
## 1 Regresi Linear 11.15630 11.28918 11.16582 11.78416
## 2 CART 11.02645 11.25141 11.24811 11.87934
## 3 MissForest 10.88774 11.17510 11.04306 11.20408
Imputasi Miss Forest
# Imputasi Data
data_siap = data_bersih %>%
select(-Tanggal) # Tanggal dikeluarkan dulu karena bukan angka
# 2. Proses Imputasi Total
set.seed(123)
data_imputed = missForest(as.data.frame(data_siap))
# 3. Masukkan kembali kolom Tanggal ke data yang sudah lengkap
data_final= data.frame(
Tanggal = data_bersih$Tanggal,
data_imputed$ximp
)
# Cek apakah NA sudah hilang semua
colSums(is.na(data_final))
## Tanggal Tn Tx Tavg RH RR ss ff
## 0 0 0 0 0 0 0 0
Korelasi
# Contoh untuk Curah Hujan (RR)
library(ggplot2)
ggplot() +
geom_density(data = data_bersih, aes(x = RR), color = "blue", size = 1, linetype = "dashed") +
geom_density(data = data_final, aes(x = RR), color = "red", alpha = 0.5) +
labs(title = "Perbandingan Distribusi RR: Asli (Biru Putus-putus) vs Imputasi (Merah)",
x = "Curah Hujan (mm)", y = "Density") +
theme_minimal()
## Warning: Removed 1342 rows containing non-finite outside the scale range
## (`stat_density()`).

# Density Plot untuk Suhu Minimum (Tn)
ggplot() +
geom_density(data = data_bersih, aes(x = Tn), color = "blue", size = 1, linetype = "dashed") +
geom_density(data = data_final, aes(x = Tn), color = "red", alpha = 0.5) +
labs(title = "Perbandingan Distribusi Suhu Minimum (Tn)",
subtitle = "Biru (Putus-putus): Data Asli | Merah: Hasil Imputasi",
x = "Suhu Minimum (°C)", y = "Density") +
theme_minimal()
## Warning: Removed 578 rows containing non-finite outside the scale range
## (`stat_density()`).

# Density Plot untuk Suhu Maksimum (Tx)
ggplot() +
geom_density(data = data_bersih, aes(x = Tx), color = "blue", size = 1, linetype = "dashed") +
geom_density(data = data_final, aes(x = Tx), color = "red", alpha = 0.5) +
labs(title = "Perbandingan Distribusi Suhu Maksimum (Tx)",
subtitle = "Biru (Putus-putus): Data Asli | Merah: Hasil Imputasi",
x = "Suhu Maksimum (°C)", y = "Density") +
theme_minimal()
## Warning: Removed 326 rows containing non-finite outside the scale range
## (`stat_density()`).

# Perbandingan Mean
summary_asli = colMeans(data_siap, na.rm = TRUE)
summary_imputasi = colMeans(data_final %>% select(-Tanggal))
tabel_perbandingan = data.frame(
Variabel = names(summary_asli),
Mean_Asli = round(summary_asli, 3),
Mean_Imputasi = round(summary_imputasi, 3),
Selisih = round(summary_asli - summary_imputasi, 4)
)
tabel_perbandingan
## Variabel Mean_Asli Mean_Imputasi Selisih
## Tn Tn 19.380 19.382 -0.0020
## Tx Tx 29.012 29.027 -0.0154
## Tavg Tavg 23.391 23.398 -0.0063
## RH RH 77.986 77.956 0.0306
## RR RR 6.738 6.565 0.1732
## ss ss 4.924 4.881 0.0428
## ff ff 1.736 1.736 0.0001
ggplot(data_final, aes(x = ss, y = Tx)) +
geom_point(alpha = 0.3, color = "darkgreen") +
geom_smooth(method = "lm", color = "red") +
labs(title = "Konsistensi Hubungan ss vs Tx",
x = "Penyinaran Matahari", y = "Suhu Maksimum") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# 1. Hitung Matriks Korelasi
# korelasi_asli: menggunakan data yang masih ada NA (pairwise)
# korelasi_hasil: menggunakan data yang sudah lengkap 100%
korelasi_asli = cor(data_siap, use = "pairwise.complete.obs")
korelasi_hasil = cor(data_final %>% select(-Tanggal))
# 2. Tampilkan Matriks Korelasi Hasil Imputasi (Dibulatkan 3 angka)
print("--- MATRIKS KORELASI DATA FINAL (SETELAH IMPUTASI) ---")
## [1] "--- MATRIKS KORELASI DATA FINAL (SETELAH IMPUTASI) ---"
round(korelasi_hasil, 3)
## Tn Tx Tavg RH RR ss ff
## Tn 1.000 -0.095 0.383 0.369 0.106 -0.322 0.058
## Tx -0.095 1.000 0.606 -0.607 -0.190 0.556 0.176
## Tavg 0.383 0.606 1.000 -0.391 -0.177 0.253 0.173
## RH 0.369 -0.607 -0.391 1.000 0.399 -0.576 -0.247
## RR 0.106 -0.190 -0.177 0.399 1.000 -0.236 -0.111
## ss -0.322 0.556 0.253 -0.576 -0.236 1.000 0.132
## ff 0.058 0.176 0.173 -0.247 -0.111 0.132 1.000
# 3. Hitung Selisih/Bias (Untuk melihat apakah ada perubahan pola yang ekstrem)
selisih_kor = korelasi_asli - korelasi_hasil
print("--- SELISIH KORELASI (ASLI - HASIL) ---")
## [1] "--- SELISIH KORELASI (ASLI - HASIL) ---"
round(selisih_kor, 4)
## Tn Tx Tavg RH RR ss ff
## Tn 0.0000 -0.0068 -0.0069 0.0008 0.0036 -0.0017 -0.0071
## Tx -0.0068 0.0000 -0.0031 0.0108 0.0042 -0.0135 0.0033
## Tavg -0.0069 -0.0031 0.0000 0.0069 0.0045 -0.0151 0.0007
## RH 0.0008 0.0108 0.0069 0.0000 -0.0098 0.0144 0.0078
## RR 0.0036 0.0042 0.0045 -0.0098 0.0000 0.0047 0.0034
## ss -0.0017 -0.0135 -0.0151 0.0144 0.0047 0.0000 -0.0012
## ff -0.0071 0.0033 0.0007 0.0078 0.0034 -0.0012 0.0000
# 4. Visualisasi Korelasi Berdampingan
library(corrplot)
## corrplot 0.95 loaded
par(mfrow = c(1, 2))
corrplot(korelasi_asli, method = "circle", type = "upper",
title = "Asli (Ada NA)", mar = c(0,0,2,0), tl.cex = 0.7)
corrplot(korelasi_hasil, method = "circle", type = "upper",
title = "Final (Lengkap)", mar = c(0,0,2,0), tl.cex = 0.7)
