Input Data

library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(naniar)
## Warning: package 'naniar' was built under R version 4.5.3
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(ggplot2)
library(rpart)
library(missForest)
## Warning: package 'missForest' was built under R version 4.5.3
## 
## Attaching package: 'missForest'
## The following object is masked from 'package:VIM':
## 
##     nrmse
# Ambil Data 
data_mentah = read_excel("C:\\Users\\MUTHI'AH IFFA\\Downloads\\deska_baru\\variabel lain yg bisa digunakan.xlsx")
colnames(data_mentah) = c("Tanggal", "Tn", "Tx", "Tavg", "RH", "RR", "ss", "ff")
str(data_mentah)
## tibble [12,053 Ɨ 8] (S3: tbl_df/tbl/data.frame)
##  $ Tanggal: POSIXct[1:12053], format: "1990-01-01" "1990-01-02" ...
##  $ Tn     : num [1:12053] 19 19 20 19 19 20 19 19 20 20 ...
##  $ Tx     : num [1:12053] 28.4 27 26.8 28.4 28.6 22 27.8 28.2 28.2 24.8 ...
##  $ Tavg   : num [1:12053] 22.2 22.5 23.1 22.5 21.5 20.3 22.3 23.3 23.2 21.8 ...
##  $ RH     : num [1:12053] 86 83 85 85 86 94 84 72 79 87 ...
##  $ RR     : num [1:12053] 6 47 0 12 16.5 1 13 1.5 2 0 ...
##  $ ss     : num [1:12053] 4.1 0.4 6.2 4.3 NA 0.1 5.2 2.8 5.1 0.4 ...
##  $ ff     : num [1:12053] 0 0 1 2 2 1 1 2 2 2 ...

Eksplorasi Data

summary(data_mentah$RR)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##     0.0     0.0     1.3   603.1    11.3  8888.0     571
summary(data_mentah$Tn)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   13.00   19.00   20.00   19.38   20.00   29.00     578
summary(data_mentah$Tx)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   18.80   28.10   29.00   29.01   30.00   36.00     326
summary(data_mentah$RH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   42.00   73.00   79.00   77.99   84.00   96.00     330
summary(data_mentah$ss)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.000   3.300   5.100   4.924   6.800  10.000     492
# Cek Data Kosong 
colSums(is.na(data_mentah))
## Tanggal      Tn      Tx    Tavg      RH      RR      ss      ff 
##       0     578     326     156     330     571     492     114
# Visualisasi Peta 
vis_miss(data_mentah) + 
  ggtitle("Peta Missing Value")

# Visualisasi Kombinasi 
aggr(data_mentah, 
     col = c('navyblue','red'), 
     numbers = TRUE, 
     sortVars = TRUE, 
     labels = names(data_mentah), 
     cex.axis = 0.7, 
     gap = 3, 
     ylab = c("Proporsi Missing Value", "Pola Kombinasi Missing Value"))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

## 
##  Variables sorted by number of missings: 
##  Variable       Count
##        Tn 0.047954866
##        RR 0.047374098
##        ss 0.040819713
##        RH 0.027379076
##        Tx 0.027047208
##      Tavg 0.012942836
##        ff 0.009458226
##   Tanggal 0.000000000

Pre-Processing Data

Cleaning Data

# Ubah kode NA 
data_bersih = data_mentah %>%
  mutate(`RR` = ifelse(`RR` >= 8888, NA, `RR`))

Pengujian Data

Skenario (10% Data Kosong)

Data Baseline

# Data Utuh sebagai pembanding 
data_lengkap = na.omit(data_bersih) # Hapus data kosong  

# Skenario Pengujian (10% Data Hilang)
set.seed(123)
n_hilang_10 = round(0.10 * nrow(data_lengkap))
indeks_hilang_10 = sample(1:nrow(data_lengkap), n_hilang_10)
nilai_aktual_10 = data_lengkap$RR[indeks_hilang_10]

# Data Pengujian 
data_pengujian_10 = data_lengkap
data_pengujian_10$RR[indeks_hilang_10] = NA
data_pengujian_model_10 = data_pengujian_10 %>% select(-Tanggal)

# Imputasi Baseline (Mean)
mean_RR_10 = mean(data_pengujian_model_10$RR, na.rm = TRUE)
data_baseline_10 = data_pengujian_model_10 %>%
  mutate(RR = ifelse(is.na(RR), mean_RR_10, RR))

Eksekusi Model

# 1. Regresi
mod_reg_10 = lm(RR ~ ., data = data_baseline_10)
pred_reg_10 = predict(mod_reg_10, newdata = data_baseline_10)[indeks_hilang_10]

# 2. CART
mod_cart_10 = rpart(RR ~ ., data = data_baseline_10, method = "anova")
pred_cart_10 = predict(mod_cart_10, newdata = data_baseline_10)[indeks_hilang_10]

# 3. MissForest
set.seed(123)
res_mf_10 = missForest(as.data.frame(data_pengujian_model_10))
pred_mf_10 = res_mf_10$ximp$RR[indeks_hilang_10]

Evaluasi Model

library(Metrics)
tabel_10 = data.frame(
  Metode = c("Regresi Linear", "CART", "MissForest"),
  RMSE_10 = c(rmse(nilai_aktual_10, pred_reg_10), 
              rmse(nilai_aktual_10, pred_cart_10), 
              rmse(nilai_aktual_10, pred_mf_10))
)
print(tabel_10)
##           Metode  RMSE_10
## 1 Regresi Linear 11.15630
## 2           CART 11.02645
## 3     MissForest 10.88774

Skenario (20%)

Data Baseline

set.seed(123)
n_hilang_20 = round(0.20 * nrow(data_lengkap))
indeks_hilang_20 = sample(1:nrow(data_lengkap), n_hilang_20)
nilai_aktual_20 = data_lengkap$RR[indeks_hilang_20]

# Buat Data Pengujian 20%
data_pengujian_20 = data_lengkap
data_pengujian_20$RR[indeks_hilang_20] = NA
data_pengujian_model_20 = data_pengujian_20 %>% select(-Tanggal)

# Imputasi Baseline (Mean)
mean_RR_20 = mean(data_pengujian_model_20$RR, na.rm = TRUE)
data_baseline_20 = data_pengujian_model_20 %>%
  mutate(RR = ifelse(is.na(RR), mean_RR_20, RR))

Eksekusi Model

# Regresi
mod_reg_20 = lm(RR ~ ., data = data_baseline_20)
pred_reg_20 = predict(mod_reg_20, newdata = data_baseline_20)[indeks_hilang_20]

# CART
mod_cart_20 = rpart(RR ~ ., data = data_baseline_20, method = "anova")
pred_cart_20 = predict(mod_cart_20, newdata = data_baseline_20)[indeks_hilang_20]

# MissForest
set.seed(123)
res_mf_20 = missForest(as.data.frame(data_pengujian_model_20))
pred_mf_20 = res_mf_20$ximp$RR[indeks_hilang_20]

Evaluasi Model

# Nilai RMSE
tabel_20 = data.frame(
  Metode = c("Regresi Linear", "CART", "MissForest"),
  RMSE_20 = c(rmse(nilai_aktual_20, pred_reg_20), 
              rmse(nilai_aktual_20, pred_cart_20), 
              rmse(nilai_aktual_20, pred_mf_20))
)
tabel_20
##           Metode  RMSE_20
## 1 Regresi Linear 11.28918
## 2           CART 11.25141
## 3     MissForest 11.17510

Skenario 30%

Bangun Baseline

set.seed(123)
n_hilang_30 = round(0.30 * nrow(data_lengkap))
indeks_hilang_30 = sample(1:nrow(data_lengkap), n_hilang_30)
nilai_aktual_30 = data_lengkap$RR[indeks_hilang_30]

# Buat Data Pengujian 30%
data_pengujian_30 = data_lengkap
data_pengujian_30$RR[indeks_hilang_30] = NA
data_pengujian_model_30 = data_pengujian_30 %>% select(-Tanggal)

# Imputasi Baseline (Mean)
mean_RR_30 = mean(data_pengujian_model_30$RR, na.rm = TRUE)
data_baseline_30 = data_pengujian_model_30 %>%
  mutate(RR = ifelse(is.na(RR), mean_RR_30, RR))

Eksekusi Model

# Regresi
mod_reg_30 = lm(RR ~ ., data = data_baseline_30)
pred_reg_30 = predict(mod_reg_30, newdata = data_baseline_30)[indeks_hilang_30]

# CART
mod_cart_30 = rpart(RR ~ ., data = data_baseline_30, method = "anova")
pred_cart_30 = predict(mod_cart_30, newdata = data_baseline_30)[indeks_hilang_30]

# MissForest
set.seed(123)
res_mf_30 = missForest(as.data.frame(data_pengujian_model_30))
pred_mf_30 = res_mf_30$ximp$RR[indeks_hilang_30]

Evaluasi Model

# Evaluasi 
tabel_30 = data.frame(
  Metode = c("Regresi Linear", "CART", "MissForest"),
  RMSE_30 = c(rmse(nilai_aktual_30, pred_reg_30), 
              rmse(nilai_aktual_30, pred_cart_30), 
              rmse(nilai_aktual_30, pred_mf_30))
)
tabel_30
##           Metode  RMSE_30
## 1 Regresi Linear 11.16582
## 2           CART 11.24811
## 3     MissForest 11.04306

Skenario 70%

Data Baseline

set.seed(123)
n_hilang_70 = round(0.70 * nrow(data_lengkap))
indeks_hilang_70 = sample(1:nrow(data_lengkap), n_hilang_70)
nilai_aktual_70 = data_lengkap$RR[indeks_hilang_70]

# Buat Data Pengujian 70%
data_pengujian_70 = data_lengkap
data_pengujian_70$RR[indeks_hilang_70] = NA
data_pengujian_model_70 = data_pengujian_70 %>% select(-Tanggal)

# Imputasi Baseline (Mean)
mean_RR_70 = mean(data_pengujian_model_70$RR, na.rm = TRUE)
data_baseline_70 = data_pengujian_model_70 %>%
  mutate(RR = ifelse(is.na(RR), mean_RR_70, RR))

Eksekusi Model

# 1. Regresi
mod_reg_70 = lm(RR ~ ., data = data_baseline_70)
pred_reg_70 = predict(mod_reg_70, newdata = data_baseline_70)[indeks_hilang_70]

# 2. CART
mod_cart_70 = rpart(RR ~ ., data = data_baseline_70, method = "anova")
pred_cart_70 = predict(mod_cart_70, newdata = data_baseline_70)[indeks_hilang_70]

# 3. MissForest
set.seed(123)
res_mf_70 = missForest(as.data.frame(data_pengujian_model_70))
pred_mf_70 = res_mf_70$ximp$RR[indeks_hilang_70]

Evaluasi Model

tabel_70 = data.frame(
  Metode = c("Regresi Linear", "CART", "MissForest"),
  RMSE_70 = c(rmse(nilai_aktual_70, pred_reg_70), 
              rmse(nilai_aktual_70, pred_cart_70), 
              rmse(nilai_aktual_70, pred_mf_70))
)
tabel_70
##           Metode  RMSE_70
## 1 Regresi Linear 11.78416
## 2           CART 11.87934
## 3     MissForest 11.20408

Perbandingan Pengujian Data

# Rekap Hasil Evaluasi 
rekap_final = tabel_10 %>%
  left_join(tabel_20, by = "Metode") %>%
  left_join(tabel_30, by = "Metode") %>%
  left_join(tabel_70, by = "Metode")

# Visualisasi
library(ggplot2)
library(tidyr)

rekap_long = rekap_final %>%
  pivot_longer(cols = starts_with("RMSE"), 
               names_to = "Skenario", 
               values_to = "RMSE")

ggplot(rekap_long, aes(x = Skenario, y = RMSE, group = Metode, color = Metode)) +
  geom_line(size = 1.2) +
  geom_point(size = 3) +
  theme_minimal() +
  labs(title = "Perbandingan Kinerja Metode Imputasi",
       subtitle = "Semakin rendah nilai RMSE, semakin akurat metode tersebut",
       x = "Skenario Missing Value",
       y = "Nilai RMSE") +
  scale_color_manual(values = c("MissForest" = "firebrick", 
                                "CART" = "orange", 
                                "Regresi Linear" = "navyblue"))
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

tabel_rmse = tabel_10 %>%
  left_join(tabel_20, by = "Metode") %>%
  left_join(tabel_30, by = "Metode") %>%
  left_join(tabel_70, by = "Metode")
tabel_rmse
##           Metode  RMSE_10  RMSE_20  RMSE_30  RMSE_70
## 1 Regresi Linear 11.15630 11.28918 11.16582 11.78416
## 2           CART 11.02645 11.25141 11.24811 11.87934
## 3     MissForest 10.88774 11.17510 11.04306 11.20408

Imputasi Miss Forest

# Imputasi Data 
data_siap = data_bersih %>%
  select(-Tanggal) # Tanggal dikeluarkan dulu karena bukan angka

# 2. Proses Imputasi Total
set.seed(123)
data_imputed = missForest(as.data.frame(data_siap))

# 3. Masukkan kembali kolom Tanggal ke data yang sudah lengkap
data_final= data.frame(
  Tanggal = data_bersih$Tanggal,
  data_imputed$ximp
)

# Cek apakah NA sudah hilang semua
colSums(is.na(data_final))
## Tanggal      Tn      Tx    Tavg      RH      RR      ss      ff 
##       0       0       0       0       0       0       0       0

Korelasi

# Contoh untuk Curah Hujan (RR)
library(ggplot2)

ggplot() +
  geom_density(data = data_bersih, aes(x = RR), color = "blue", size = 1, linetype = "dashed") +
  geom_density(data = data_final, aes(x = RR), color = "red", alpha = 0.5) +
  labs(title = "Perbandingan Distribusi RR: Asli (Biru Putus-putus) vs Imputasi (Merah)",
       x = "Curah Hujan (mm)", y = "Density") +
  theme_minimal()
## Warning: Removed 1342 rows containing non-finite outside the scale range
## (`stat_density()`).

# Density Plot untuk Suhu Minimum (Tn)
ggplot() +
  geom_density(data = data_bersih, aes(x = Tn), color = "blue", size = 1, linetype = "dashed") +
  geom_density(data = data_final, aes(x = Tn), color = "red", alpha = 0.5) +
  labs(title = "Perbandingan Distribusi Suhu Minimum (Tn)",
       subtitle = "Biru (Putus-putus): Data Asli | Merah: Hasil Imputasi",
       x = "Suhu Minimum (°C)", y = "Density") +
  theme_minimal()
## Warning: Removed 578 rows containing non-finite outside the scale range
## (`stat_density()`).

# Density Plot untuk Suhu Maksimum (Tx)
ggplot() +
  geom_density(data = data_bersih, aes(x = Tx), color = "blue", size = 1, linetype = "dashed") +
  geom_density(data = data_final, aes(x = Tx), color = "red", alpha = 0.5) +
  labs(title = "Perbandingan Distribusi Suhu Maksimum (Tx)",
       subtitle = "Biru (Putus-putus): Data Asli | Merah: Hasil Imputasi",
       x = "Suhu Maksimum (°C)", y = "Density") +
  theme_minimal()
## Warning: Removed 326 rows containing non-finite outside the scale range
## (`stat_density()`).

# Perbandingan Mean
summary_asli = colMeans(data_siap, na.rm = TRUE)
summary_imputasi = colMeans(data_final %>% select(-Tanggal))

tabel_perbandingan = data.frame(
  Variabel = names(summary_asli),
  Mean_Asli = round(summary_asli, 3),
  Mean_Imputasi = round(summary_imputasi, 3),
  Selisih = round(summary_asli - summary_imputasi, 4)
)

tabel_perbandingan
##      Variabel Mean_Asli Mean_Imputasi Selisih
## Tn         Tn    19.380        19.382 -0.0020
## Tx         Tx    29.012        29.027 -0.0154
## Tavg     Tavg    23.391        23.398 -0.0063
## RH         RH    77.986        77.956  0.0306
## RR         RR     6.738         6.565  0.1732
## ss         ss     4.924         4.881  0.0428
## ff         ff     1.736         1.736  0.0001
ggplot(data_final, aes(x = ss, y = Tx)) +
  geom_point(alpha = 0.3, color = "darkgreen") +
  geom_smooth(method = "lm", color = "red") +
  labs(title = "Konsistensi Hubungan ss vs Tx",
       x = "Penyinaran Matahari", y = "Suhu Maksimum") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

# 1. Hitung Matriks Korelasi
# korelasi_asli: menggunakan data yang masih ada NA (pairwise)
# korelasi_hasil: menggunakan data yang sudah lengkap 100%
korelasi_asli = cor(data_siap, use = "pairwise.complete.obs")
korelasi_hasil = cor(data_final %>% select(-Tanggal))

# 2. Tampilkan Matriks Korelasi Hasil Imputasi (Dibulatkan 3 angka)
print("--- MATRIKS KORELASI DATA FINAL (SETELAH IMPUTASI) ---")
## [1] "--- MATRIKS KORELASI DATA FINAL (SETELAH IMPUTASI) ---"
round(korelasi_hasil, 3)
##          Tn     Tx   Tavg     RH     RR     ss     ff
## Tn    1.000 -0.095  0.383  0.369  0.106 -0.322  0.058
## Tx   -0.095  1.000  0.606 -0.607 -0.190  0.556  0.176
## Tavg  0.383  0.606  1.000 -0.391 -0.177  0.253  0.173
## RH    0.369 -0.607 -0.391  1.000  0.399 -0.576 -0.247
## RR    0.106 -0.190 -0.177  0.399  1.000 -0.236 -0.111
## ss   -0.322  0.556  0.253 -0.576 -0.236  1.000  0.132
## ff    0.058  0.176  0.173 -0.247 -0.111  0.132  1.000
# 3. Hitung Selisih/Bias (Untuk melihat apakah ada perubahan pola yang ekstrem)
selisih_kor = korelasi_asli - korelasi_hasil
print("--- SELISIH KORELASI (ASLI - HASIL) ---")
## [1] "--- SELISIH KORELASI (ASLI - HASIL) ---"
round(selisih_kor, 4)
##           Tn      Tx    Tavg      RH      RR      ss      ff
## Tn    0.0000 -0.0068 -0.0069  0.0008  0.0036 -0.0017 -0.0071
## Tx   -0.0068  0.0000 -0.0031  0.0108  0.0042 -0.0135  0.0033
## Tavg -0.0069 -0.0031  0.0000  0.0069  0.0045 -0.0151  0.0007
## RH    0.0008  0.0108  0.0069  0.0000 -0.0098  0.0144  0.0078
## RR    0.0036  0.0042  0.0045 -0.0098  0.0000  0.0047  0.0034
## ss   -0.0017 -0.0135 -0.0151  0.0144  0.0047  0.0000 -0.0012
## ff   -0.0071  0.0033  0.0007  0.0078  0.0034 -0.0012  0.0000
# 4. Visualisasi Korelasi Berdampingan
library(corrplot)
## corrplot 0.95 loaded
par(mfrow = c(1, 2))

corrplot(korelasi_asli, method = "circle", type = "upper", 
         title = "Asli (Ada NA)", mar = c(0,0,2,0), tl.cex = 0.7)

corrplot(korelasi_hasil, method = "circle", type = "upper", 
         title = "Final (Lengkap)", mar = c(0,0,2,0), tl.cex = 0.7)