Input Data
library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(naniar)
## Warning: package 'naniar' was built under R version 4.5.3
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ggplot2)
library(rpart)
library(missForest)
## Warning: package 'missForest' was built under R version 4.5.3
##
## Attaching package: 'missForest'
## The following object is masked from 'package:VIM':
##
## nrmse
# Ambil Data
data_mentah = read_excel("C:\\Users\\MUTHI'AH IFFA\\Downloads\\deska_baru\\variabel lain yg bisa digunakan.xlsx")
colnames(data_mentah) = c("Tanggal", "Tn", "Tx", "Tavg", "RH", "RR", "ss", "ff")
str(data_mentah)
## tibble [12,053 Ć 8] (S3: tbl_df/tbl/data.frame)
## $ Tanggal: POSIXct[1:12053], format: "1990-01-01" "1990-01-02" ...
## $ Tn : num [1:12053] 19 19 20 19 19 20 19 19 20 20 ...
## $ Tx : num [1:12053] 28.4 27 26.8 28.4 28.6 22 27.8 28.2 28.2 24.8 ...
## $ Tavg : num [1:12053] 22.2 22.5 23.1 22.5 21.5 20.3 22.3 23.3 23.2 21.8 ...
## $ RH : num [1:12053] 86 83 85 85 86 94 84 72 79 87 ...
## $ RR : num [1:12053] 6 47 0 12 16.5 1 13 1.5 2 0 ...
## $ ss : num [1:12053] 4.1 0.4 6.2 4.3 NA 0.1 5.2 2.8 5.1 0.4 ...
## $ ff : num [1:12053] 0 0 1 2 2 1 1 2 2 2 ...
Eksplorasi Data
summary(data_mentah$RR)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0 0.0 1.3 603.1 11.3 8888.0 571
summary(data_mentah$Tn)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 13.00 19.00 20.00 19.38 20.00 29.00 578
summary(data_mentah$Tx)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 18.80 28.10 29.00 29.01 30.00 36.00 326
summary(data_mentah$RH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 42.00 73.00 79.00 77.99 84.00 96.00 330
summary(data_mentah$ss)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 3.300 5.100 4.924 6.800 10.000 492
# Cek Data Kosong
colSums(is.na(data_mentah))
## Tanggal Tn Tx Tavg RH RR ss ff
## 0 578 326 156 330 571 492 114
# Visualisasi Peta
vis_miss(data_mentah) +
ggtitle("Peta Missing Value")

# Visualisasi Kombinasi
aggr(data_mentah,
col = c('navyblue','red'),
numbers = TRUE,
sortVars = TRUE,
labels = names(data_mentah),
cex.axis = 0.7,
gap = 3,
ylab = c("Proporsi Missing Value", "Pola Kombinasi Missing Value"))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

##
## Variables sorted by number of missings:
## Variable Count
## Tn 0.047954866
## RR 0.047374098
## ss 0.040819713
## RH 0.027379076
## Tx 0.027047208
## Tavg 0.012942836
## ff 0.009458226
## Tanggal 0.000000000
Pre-Processing Data
Cleaning Data
# Ubah kode NA
data_bersih = data_mentah %>%
mutate(`RR` = ifelse(`RR` >= 8888, NA, `RR`))
Pengujian Data
Skenario (10% Data Kosong)
Data Baseline
# Data Utuh sebagai pembanding
data_lengkap = na.omit(data_bersih) # Hapus data kosong
# Skenario Pengujian (10% Data Hilang)
set.seed(123)
n_hilang_10 = round(0.10 * nrow(data_lengkap))
indeks_hilang_10 = sample(1:nrow(data_lengkap), n_hilang_10)
nilai_aktual_10 = data_lengkap$RR[indeks_hilang_10]
# Data Pengujian
data_pengujian_10 = data_lengkap
data_pengujian_10$RR[indeks_hilang_10] = NA
data_pengujian_model_10 = data_pengujian_10 %>% select(-Tanggal)
# Imputasi Baseline (Mean)
mean_RR_10 = mean(data_pengujian_model_10$RR, na.rm = TRUE)
data_baseline_10 = data_pengujian_model_10 %>%
mutate(RR = ifelse(is.na(RR), mean_RR_10, RR))
Eksekusi Model
# 1. Regresi
mod_reg_10 = lm(RR ~ ., data = data_baseline_10)
pred_reg_10 = predict(mod_reg_10, newdata = data_baseline_10)[indeks_hilang_10]
# 2. CART
mod_cart_10 = rpart(RR ~ ., data = data_baseline_10, method = "anova")
pred_cart_10 = predict(mod_cart_10, newdata = data_baseline_10)[indeks_hilang_10]
# 3. MissForest
set.seed(123)
res_mf_10 = missForest(as.data.frame(data_pengujian_model_10))
pred_mf_10 = res_mf_10$ximp$RR[indeks_hilang_10]
Evaluasi Model
library(Metrics)
tabel_10 = data.frame(
Metode = c("Regresi Linear", "CART", "MissForest"),
RMSE_10 = c(rmse(nilai_aktual_10, pred_reg_10),
rmse(nilai_aktual_10, pred_cart_10),
rmse(nilai_aktual_10, pred_mf_10))
)
print(tabel_10)
## Metode RMSE_10
## 1 Regresi Linear 11.15630
## 2 CART 11.02645
## 3 MissForest 10.88774
Skenario (20%)
Data Baseline
set.seed(123)
n_hilang_20 = round(0.20 * nrow(data_lengkap))
indeks_hilang_20 = sample(1:nrow(data_lengkap), n_hilang_20)
nilai_aktual_20 = data_lengkap$RR[indeks_hilang_20]
# Buat Data Pengujian 20%
data_pengujian_20 = data_lengkap
data_pengujian_20$RR[indeks_hilang_20] = NA
data_pengujian_model_20 = data_pengujian_20 %>% select(-Tanggal)
# Imputasi Baseline (Mean)
mean_RR_20 = mean(data_pengujian_model_20$RR, na.rm = TRUE)
data_baseline_20 = data_pengujian_model_20 %>%
mutate(RR = ifelse(is.na(RR), mean_RR_20, RR))
Eksekusi Model
# Regresi
mod_reg_20 = lm(RR ~ ., data = data_baseline_20)
pred_reg_20 = predict(mod_reg_20, newdata = data_baseline_20)[indeks_hilang_20]
# CART
mod_cart_20 = rpart(RR ~ ., data = data_baseline_20, method = "anova")
pred_cart_20 = predict(mod_cart_20, newdata = data_baseline_20)[indeks_hilang_20]
# MissForest
set.seed(123)
res_mf_20 = missForest(as.data.frame(data_pengujian_model_20))
pred_mf_20 = res_mf_20$ximp$RR[indeks_hilang_20]
Evaluasi Model
# Nilai RMSE
tabel_20 = data.frame(
Metode = c("Regresi Linear", "CART", "MissForest"),
RMSE_20 = c(rmse(nilai_aktual_20, pred_reg_20),
rmse(nilai_aktual_20, pred_cart_20),
rmse(nilai_aktual_20, pred_mf_20))
)
tabel_20
## Metode RMSE_20
## 1 Regresi Linear 11.28918
## 2 CART 11.25141
## 3 MissForest 11.17510
Skenario 30%
Bangun Baseline
set.seed(123)
n_hilang_30 = round(0.30 * nrow(data_lengkap))
indeks_hilang_30 = sample(1:nrow(data_lengkap), n_hilang_30)
nilai_aktual_30 = data_lengkap$RR[indeks_hilang_30]
# Buat Data Pengujian 30%
data_pengujian_30 = data_lengkap
data_pengujian_30$RR[indeks_hilang_30] = NA
data_pengujian_model_30 = data_pengujian_30 %>% select(-Tanggal)
# Imputasi Baseline (Mean)
mean_RR_30 = mean(data_pengujian_model_30$RR, na.rm = TRUE)
data_baseline_30 = data_pengujian_model_30 %>%
mutate(RR = ifelse(is.na(RR), mean_RR_30, RR))
Eksekusi Model
# Regresi
mod_reg_30 = lm(RR ~ ., data = data_baseline_30)
pred_reg_30 = predict(mod_reg_30, newdata = data_baseline_30)[indeks_hilang_30]
# CART
mod_cart_30 = rpart(RR ~ ., data = data_baseline_30, method = "anova")
pred_cart_30 = predict(mod_cart_30, newdata = data_baseline_30)[indeks_hilang_30]
# MissForest
set.seed(123)
res_mf_30 = missForest(as.data.frame(data_pengujian_model_30))
pred_mf_30 = res_mf_30$ximp$RR[indeks_hilang_30]
Evaluasi Model
# Evaluasi
tabel_30 = data.frame(
Metode = c("Regresi Linear", "CART", "MissForest"),
RMSE_30 = c(rmse(nilai_aktual_30, pred_reg_30),
rmse(nilai_aktual_30, pred_cart_30),
rmse(nilai_aktual_30, pred_mf_30))
)
tabel_30
## Metode RMSE_30
## 1 Regresi Linear 11.16582
## 2 CART 11.24811
## 3 MissForest 11.04306
Skenario 70%
Data Baseline
set.seed(123)
n_hilang_70 = round(0.70 * nrow(data_lengkap))
indeks_hilang_70 = sample(1:nrow(data_lengkap), n_hilang_70)
nilai_aktual_70 = data_lengkap$RR[indeks_hilang_70]
# Buat Data Pengujian 70%
data_pengujian_70 = data_lengkap
data_pengujian_70$RR[indeks_hilang_70] = NA
data_pengujian_model_70 = data_pengujian_70 %>% select(-Tanggal)
# Imputasi Baseline (Mean)
mean_RR_70 = mean(data_pengujian_model_70$RR, na.rm = TRUE)
data_baseline_70 = data_pengujian_model_70 %>%
mutate(RR = ifelse(is.na(RR), mean_RR_70, RR))
Eksekusi Model
# 1. Regresi
mod_reg_70 = lm(RR ~ ., data = data_baseline_70)
pred_reg_70 = predict(mod_reg_70, newdata = data_baseline_70)[indeks_hilang_70]
# 2. CART
mod_cart_70 = rpart(RR ~ ., data = data_baseline_70, method = "anova")
pred_cart_70 = predict(mod_cart_70, newdata = data_baseline_70)[indeks_hilang_70]
# 3. MissForest
set.seed(123)
res_mf_70 = missForest(as.data.frame(data_pengujian_model_70))
pred_mf_70 = res_mf_70$ximp$RR[indeks_hilang_70]
Evaluasi Model
tabel_70 = data.frame(
Metode = c("Regresi Linear", "CART", "MissForest"),
RMSE_70 = c(rmse(nilai_aktual_70, pred_reg_70),
rmse(nilai_aktual_70, pred_cart_70),
rmse(nilai_aktual_70, pred_mf_70))
)
tabel_70
## Metode RMSE_70
## 1 Regresi Linear 11.78416
## 2 CART 11.87934
## 3 MissForest 11.20408
Perbandingan Pengujian Data
# Rekap Hasil Evaluasi
rekap_final = tabel_10 %>%
left_join(tabel_20, by = "Metode") %>%
left_join(tabel_30, by = "Metode") %>%
left_join(tabel_70, by = "Metode")
# Visualisasi
library(ggplot2)
library(tidyr)
rekap_long = rekap_final %>%
pivot_longer(cols = starts_with("RMSE"),
names_to = "Skenario",
values_to = "RMSE")
ggplot(rekap_long, aes(x = Skenario, y = RMSE, group = Metode, color = Metode)) +
geom_line(size = 1.2) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "Perbandingan Kinerja Metode Imputasi",
subtitle = "Semakin rendah nilai RMSE, semakin akurat metode tersebut",
x = "Skenario Missing Value",
y = "Nilai RMSE") +
scale_color_manual(values = c("MissForest" = "firebrick",
"CART" = "orange",
"Regresi Linear" = "navyblue"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ā¹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

tabel_rmse = tabel_10 %>%
left_join(tabel_20, by = "Metode") %>%
left_join(tabel_30, by = "Metode") %>%
left_join(tabel_70, by = "Metode")
tabel_rmse
## Metode RMSE_10 RMSE_20 RMSE_30 RMSE_70
## 1 Regresi Linear 11.15630 11.28918 11.16582 11.78416
## 2 CART 11.02645 11.25141 11.24811 11.87934
## 3 MissForest 10.88774 11.17510 11.04306 11.20408
Imputasi Miss Forest
# Imputasi Data
data_siap = data_bersih %>%
select(-Tanggal) # Tanggal dikeluarkan dulu karena bukan angka
# 2. Proses Imputasi Total
set.seed(123)
data_imputed = missForest(as.data.frame(data_siap))
# 3. Masukkan kembali kolom Tanggal ke data yang sudah lengkap
data_final= data.frame(
Tanggal = data_bersih$Tanggal,
data_imputed$ximp
)
# Cek apakah NA sudah hilang semua
colSums(is.na(data_final))
## Tanggal Tn Tx Tavg RH RR ss ff
## 0 0 0 0 0 0 0 0
Hasil
# Contoh untuk Curah Hujan (RR)
library(ggplot2)
ggplot() +
geom_density(data = data_bersih, aes(x = RR), color = "blue", size = 1, linetype = "dashed") +
geom_density(data = data_final, aes(x = RR), color = "red", alpha = 0.5) +
labs(title = "Perbandingan Distribusi RR: Asli (Biru Putus-putus) vs Imputasi (Merah)",
x = "Curah Hujan (mm)", y = "Density") +
theme_minimal()
## Warning: Removed 1342 rows containing non-finite outside the scale range
## (`stat_density()`).

# Density Plot untuk Suhu Minimum (Tn)
ggplot() +
geom_density(data = data_bersih, aes(x = Tn), color = "blue", size = 1, linetype = "dashed") +
geom_density(data = data_final, aes(x = Tn), color = "red", alpha = 0.5) +
labs(title = "Perbandingan Distribusi Suhu Minimum (Tn)",
subtitle = "Biru (Putus-putus): Data Asli | Merah: Hasil Imputasi",
x = "Suhu Minimum (°C)", y = "Density") +
theme_minimal()
## Warning: Removed 578 rows containing non-finite outside the scale range
## (`stat_density()`).

# Density Plot untuk Suhu Maksimum (Tx)
ggplot() +
geom_density(data = data_bersih, aes(x = Tx), color = "blue", size = 1, linetype = "dashed") +
geom_density(data = data_final, aes(x = Tx), color = "red", alpha = 0.5) +
labs(title = "Perbandingan Distribusi Suhu Maksimum (Tx)",
subtitle = "Biru (Putus-putus): Data Asli | Merah: Hasil Imputasi",
x = "Suhu Maksimum (°C)", y = "Density") +
theme_minimal()
## Warning: Removed 326 rows containing non-finite outside the scale range
## (`stat_density()`).

# Perbandingan Mean
summary_asli = colMeans(data_siap, na.rm = TRUE)
summary_imputasi = colMeans(data_final %>% select(-Tanggal))
tabel_perbandingan = data.frame(
Variabel = names(summary_asli),
Mean_Asli = round(summary_asli, 3),
Mean_Imputasi = round(summary_imputasi, 3),
Selisih = round(summary_asli - summary_imputasi, 4)
)
tabel_perbandingan
## Variabel Mean_Asli Mean_Imputasi Selisih
## Tn Tn 19.380 19.382 -0.0020
## Tx Tx 29.012 29.027 -0.0154
## Tavg Tavg 23.391 23.398 -0.0063
## RH RH 77.986 77.956 0.0306
## RR RR 6.738 6.565 0.1732
## ss ss 4.924 4.881 0.0428
## ff ff 1.736 1.736 0.0001
ggplot(data_final, aes(x = ss, y = Tx)) +
geom_point(alpha = 0.3, color = "darkgreen") +
geom_smooth(method = "lm", color = "red") +
labs(title = "Konsistensi Hubungan ss vs Tx",
x = "Penyinaran Matahari", y = "Suhu Maksimum") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# Korelasi Antar Variabel
library(corrplot)
## corrplot 0.95 loaded
korelasi_asli = cor(data_siap, use = "pairwise.complete.obs")
korelasi_hasil = cor(data_final %>% select(-Tanggal))
# Bagi layar jadi 2 menyamping
par(mfrow = c(1, 2))
# 1. Plot Data Asli
corrplot(korelasi_asli,
method = "circle",
type = "upper",
addCoef.col = "black", # Menambahkan angka berwarna hitam
number.cex = 0.6, # Mengatur ukuran huruf angka biar gak tabrakan
title = "Asli (Ada NA)",
mar = c(0,0,2,0),
tl.cex = 0.8) # Mengatur ukuran teks nama variabel (Tn, Tx, dll)
# 2. Plot Data Hasil Imputasi
corrplot(korelasi_hasil,
method = "circle",
type = "upper",
addCoef.col = "black", # Menambahkan angka berwarna hitam
number.cex = 0.6, # Mengatur ukuran huruf angka
title = "Final (Lengkap)",
mar = c(0,0,2,0),
tl.cex = 0.8)

# 1. Tentukan variabel yang jadi fokus utama Kak Deska
var_uji = c("Tn", "Tx", "RR")
tabel_distribusi = data.frame(
Variabel = var_uji,
# Hitung Korelasi
Korelasi_Kurva = sapply(var_uji, function(v) {
batas_bawah = min(data_final[[v]])
batas_atas = max(data_final[[v]])
kurva_asli = density(data_bersih[[v]], na.rm = TRUE, from = batas_bawah, to = batas_atas, n = 512)$y
kurva_imputasi = density(data_final[[v]], from = batas_bawah, to = batas_atas, n = 512)$y
return(round(cor(kurva_asli, kurva_imputasi), 5))
}),
# Hitung Jarak Distribusi dengan Uji Kolmogorov-Smirnov
D_Statistic_KS = sapply(var_uji, function(v) {
uji_ks = ks.test(data_bersih[[v]], data_final[[v]])
return(round(uji_ks$statistic, 5))
})
)
## Warning in ks.test.default(data_bersih[[v]], data_final[[v]]): p-value will be
## approximate in the presence of ties
## Warning in ks.test.default(data_bersih[[v]], data_final[[v]]): p-value will be
## approximate in the presence of ties
## Warning in ks.test.default(data_bersih[[v]], data_final[[v]]): p-value will be
## approximate in the presence of ties
print(tabel_distribusi)
## Variabel Korelasi_Kurva D_Statistic_KS
## Tn Tn 0.99961 0.01445
## Tx Tx 0.99954 0.00846
## RR RR 0.99674 0.04676