# Library Bawaan & Manipulasi Data
library(readxl)
library(readr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(stringr)
# Library Analisis Missing Value
library(naniar)
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Suggestions and bug-reports can be submitted at: https://github.com/statistikat/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
# Library Visualisasi & Tabel Interaktif
library(ggplot2)
library(ggcorrplot)
library(DT)
# Input Data
data_met= read_xlsx("D:\\deska_baru2\\Data Meteotologi.xlsx")
str(data_met)
## tibble [12,053 × 8] (S3: tbl_df/tbl/data.frame)
## $ Tanggal : POSIXct[1:12053], format: "1990-01-01" "1990-01-02" ...
## $ Tn (suhu minimum) : num [1:12053] 19 19 20 19 19 20 19 19 20 20 ...
## $ Tx ( suhu maksimum) : num [1:12053] 28.4 27 26.8 28.4 28.6 22 27.8 28.2 28.2 24.8 ...
## $ Tavg (rata rata suhu) : num [1:12053] 22.2 22.5 23.1 22.5 21.5 20.3 22.3 23.3 23.2 21.8 ...
## $ RH_avg(kelembaban) : num [1:12053] 86 83 85 85 86 94 84 72 79 87 ...
## $ RR (curah hujan ) : num [1:12053] 6 47 0 12 16.5 1 13 1.5 2 0 ...
## $ ss ( penyinaran) : num [1:12053] 4.1 0.4 6.2 4.3 NA 0.1 5.2 2.8 5.1 0.4 ...
## $ ff_avg (angin rata rata): num [1:12053] 0 0 1 2 2 1 1 2 2 2 ...
head(data_met)
## # A tibble: 6 × 8
## Tanggal `Tn (suhu minimum)` `Tx ( suhu maksimum)`
## <dttm> <dbl> <dbl>
## 1 1990-01-01 00:00:00 19 28.4
## 2 1990-01-02 00:00:00 19 27
## 3 1990-01-03 00:00:00 20 26.8
## 4 1990-01-04 00:00:00 19 28.4
## 5 1990-01-05 00:00:00 19 28.6
## 6 1990-01-06 00:00:00 20 22
## # ℹ 5 more variables: `Tavg (rata rata suhu)` <dbl>,
## # `RH_avg(kelembaban)` <dbl>, `RR (curah hujan )` <dbl>,
## # `ss ( penyinaran)` <dbl>, `ff_avg (angin rata rata)` <dbl>
# Ganti nama kolom
data_met = data_met %>%
rename_with(~ str_remove(., "\\s*\\(.*\\)"))
str(data_met)
## tibble [12,053 × 8] (S3: tbl_df/tbl/data.frame)
## $ Tanggal: POSIXct[1:12053], format: "1990-01-01" "1990-01-02" ...
## $ Tn : num [1:12053] 19 19 20 19 19 20 19 19 20 20 ...
## $ Tx : num [1:12053] 28.4 27 26.8 28.4 28.6 22 27.8 28.2 28.2 24.8 ...
## $ Tavg : num [1:12053] 22.2 22.5 23.1 22.5 21.5 20.3 22.3 23.3 23.2 21.8 ...
## $ RH_avg : num [1:12053] 86 83 85 85 86 94 84 72 79 87 ...
## $ RR : num [1:12053] 6 47 0 12 16.5 1 13 1.5 2 0 ...
## $ ss : num [1:12053] 4.1 0.4 6.2 4.3 NA 0.1 5.2 2.8 5.1 0.4 ...
## $ ff_avg : num [1:12053] 0 0 1 2 2 1 1 2 2 2 ...
# Mengubah kode 99.9 dalam data menjadi NA
data_met[data_met==8888] = NA
summary(data_met)
## Tanggal Tn Tx Tavg
## Min. :1990-01-01 Min. :13.00 Min. :18.80 Min. :18.40
## 1st Qu.:1998-04-02 1st Qu.:19.00 1st Qu.:28.10 1st Qu.:22.80
## Median :2006-07-02 Median :20.00 Median :29.00 Median :23.40
## Mean :2006-07-02 Mean :19.38 Mean :29.01 Mean :23.39
## 3rd Qu.:2014-10-01 3rd Qu.:20.00 3rd Qu.:30.00 3rd Qu.:24.00
## Max. :2022-12-31 Max. :29.00 Max. :36.00 Max. :28.90
## NAs :578 NAs :326 NAs :156
## RH_avg RR ss ff_avg
## Min. :42.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:73.00 1st Qu.: 0.000 1st Qu.: 3.300 1st Qu.: 1.000
## Median :79.00 Median : 0.800 Median : 5.100 Median : 2.000
## Mean :77.99 Mean : 6.738 Mean : 4.924 Mean : 1.736
## 3rd Qu.:84.00 3rd Qu.: 7.800 3rd Qu.: 6.800 3rd Qu.: 2.000
## Max. :96.00 Max. :160.000 Max. :10.000 Max. :15.000
## NAs :330 NAs :1342 NAs :492 NAs :114
# Missing Value
data_met = data_met %>%
mutate(tahun = year(Tanggal))
missing_tahunan = data_met %>%
group_by(tahun) %>%
summarise(
RR_na = sum(is.na(RR)),
Tn_na = sum(is.na(Tn)),
Tx_na = sum(is.na(Tx))
)
library(DT)
# Membuat tabel interaktif
datatable(missing_tahunan,
options = list(pageLength = 33),
caption = 'Tabel Missing Value Tahunan')
# Visualisasi
# A. Menggunakan naniar: Melihat peta sebaran data kosong
# Hitam = Data ada, Abu-abu = Data kosong (NA)
vis_miss(data_met) +
labs(title = "Peta Sebaran Missing Value Data Meteorologi")
# B. Menggunakan VIM: Melihat kombinasi variabel yang sering kosong bersamaan
aggr(data_met,
col = c('navyblue','red'),
numbers = TRUE,
sortVars = TRUE,
labels = names(data_met),
cex.axis = .7,
gap = 3,
ylab = c("Proporsi Missing Data", "Pola Kombinasi Kosong"))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)
##
## Variables sorted by number of missings:
## Variable Count
## RR 0.111341575
## Tn 0.047954866
## ss 0.040819713
## RH_avg 0.027379076
## Tx 0.027047208
## Tavg 0.012942836
## ff_avg 0.009458226
## Tanggal 0.000000000
## tahun 0.000000000
# Korelasi
data_numeric = na.omit(data_met[, 2:8]) # Buang kolom Tanggal dan hapus NA sementara khusus untuk hitung korelasi
data_numeric = na.omit(data_met[, 2:8])
matriks_korelasi = cor(data_numeric)
# Visualisasi
ggcorrplot(matriks_korelasi,
method = "square",
type = "lower", # Hanya menampilkan separuh bawah matriks
lab = TRUE, # Menampilkan angka korelasi
colors = c("red", "white", "blue"),
title = "Matriks Korelasi Variabel Meteorologi Bandung")
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the ggcorrplot package.
## Please report the issue at <https://github.com/kassambara/ggcorrplot/issues>.
## This warning is displayed once per session.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
matriks_korelasi = cor(data_numeric)
# Histogram
data_long = data_met %>%
select(Tn, Tx, RR) %>%
pivot_longer(cols = everything(), names_to = "Variabel", values_to = "Nilai")
# Visualisasi
ggplot(data_long, aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot(alpha = 0.7, outlier.colour = "red", outlier.shape = 16) +
coord_flip() +
theme_minimal() +
labs(title = "Distribusi Suhu (Tn, Tx) & Curah Hujan (RR)",
x = "Variabel Iklim",
y = "Nilai Pengukuran")
## Warning: Removed 2246 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Line chart untuk Curah Hujan (RR)
ggplot(data_met, aes(x = Tanggal, y = RR)) +
geom_line(color = "dodgerblue", alpha = 0.8) +
theme_minimal() +
labs(title = "Tren Runtun Waktu Curah Hujan (RR) Kota Bandung 1990-2022",
x = "Tahun",
y = "Curah Hujan (mm)")
# Line chart untuk Suhu Rata-rata (Tavg)
ggplot(data_met, aes(x = Tanggal, y = Tavg)) +
geom_line(color = "firebrick", alpha = 0.8) +
theme_minimal() +
labs(title = "Tren Runtun Waktu Suhu Rata-rata (Tavg) Kota Bandung 1990-2022",
x = "Tahun",
y = "Suhu Rata-rata (°C)")
# Data Ground Truth
gt_met = na.omit(data_met[,2:9])
anyNA(gt_met) # Cek data sudah bersih NA
## [1] FALSE
Duplikasi data dan hilangkan 10% nilai dalam data
set.seed(123)
uji_1 = gt_met
for (col in names(uji_1)) {
n_hilang = round(0.1 * nrow(uji_1))
idx = sample(1:nrow(uji_1), n_hilang)
uji_1[idx, col] = NA
}
# Simpan nilai asli yang nanti akan kita tebak
mask_1 = is.na(uji_1)
nilai_asli_1 = gt_met[mask_1]
Duplikasi data dan hilangkan 40% nilai dalam data
set.seed(123)
uji_2 = gt_met
for (col in names(uji_2)) {
n_hilang = round(0.4 * nrow(uji_2))
idx = sample(1:nrow(uji_2), n_hilang)
uji_2[idx, col] = NA
}
# Simpan nilai asli yang nanti akan kita tebak
mask_2 = is.na(uji_2)
nilai_asli_2 = gt_met[mask_2]
Duplikasi data dan hilangkan 40% nilai dalam data
set.seed(123)
uji_3 = gt_met
for (col in names(uji_3)) {
n_hilang = round(0.4 * nrow(uji_3))
idx = sample(1:nrow(uji_3), n_hilang)
uji_3[idx, col] = NA
}
# Simpan nilai asli yang nanti akan kita tebak
mask_3 = is.na(uji_3)
nilai_asli_3 = gt_met[mask_3]
library(missForest)
##
## Attaching package: 'missForest'
## The following object is masked from 'package:VIM':
##
## nrmse
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ tibble 3.3.1
## ✔ purrr 1.2.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
MF_1 = missForest(as.matrix(uji_1))
MF_1 = as.data.frame(MF_1$ximp)
MF_2 = missForest(as.matrix(uji_2))
MF_2 = as.data.frame(MF_2$ximp)
MF_3 = missForest(as.matrix(uji_3))
MF_3 = as.data.frame(MF_3$ximp)
prediksi_mf_1 = MF_1[mask_1]
prediksi_mf_2 = MF_2[mask_2]
prediksi_mf_3 = MF_3[mask_3]
# RMSE
rmse = function(asli, prediksi) sqrt(mean((asli - prediksi)^2, na.rm = TRUE))
rmse.mf_1 = rmse(nilai_asli_1, prediksi_mf_1)
rmse.mf_2 = rmse(nilai_asli_2, prediksi_mf_2)
rmse.mf_3 = rmse(nilai_asli_3, prediksi_mf_3)
# standar deviasi
sd_asli_1 = sd(nilai_asli_1, na.rm = TRUE)
sd_asli_2 = sd(nilai_asli_2, na.rm = TRUE)
sd_asli_3 = sd(nilai_asli_3, na.rm = TRUE)
# Output
results = data.frame(
Skema = c("Rendah", "Sedang", "Tinggi"),
missForest = c(rmse.mf_1, rmse.mf_2, rmse.mf_3),
SD = c(sd_asli_1, sd_asli_2, sd_asli_3)
)
results
## Skema missForest SD
## 1 Rendah 4.878047 656.0550
## 2 Sedang 5.762946 655.8715
## 3 Tinggi 5.750961 655.8715
# Siapkan keranjang kosong
eval_met_nrmse = data.frame(
Skema = character(),
Variabel = character(),
NRMSE_SD = numeric(),
stringsAsFactors = FALSE
)
# Daftar skema (Stasiun)
skema_met = list(
"10%" = list(uji = uji_1, mf = MF_1),
"40%" = list(uji = uji_2, mf = MF_2),
"70%" = list(uji = uji_3, mf = MF_3)
)
# Looping perhitungan khusus NRMSE (Metode SD)
for (nama_skema in names(skema_met)) {
d_uji = skema_met[[nama_skema]]$uji
d_mf = skema_met[[nama_skema]]$mf
for (var in names(d_uji)) {
idx_na = which(is.na(d_uji[[var]]))
if(length(idx_na) > 0) {
aktual = gt_met[[var]][idx_na]
prediksi = d_mf[[var]][idx_na]
# 1. Hitung error absolut (RMSE)
rmse_val = rmse(aktual, prediksi)
# 2. Hitung variasi alami (SD Aktual)
sd_aktual = sd(aktual, na.rm = TRUE)
# 3. Hitung NRMSE (RMSE dibagi SD)
nrmse_sd_val = rmse_val / sd_aktual
eval_met_nrmse = rbind(eval_met_nrmse, data.frame(
Skema = nama_skema,
Variabel = var,
NRMSE_SD = round(nrmse_sd_val, 4) # Dibulatkan 4 desimal
))
}
}
}
tabel_nrmse_stasiun = eval_met_nrmse %>%
pivot_wider(names_from = Skema, values_from = NRMSE_SD, names_prefix = "NRMSE_Skema_")
# Tampilkan hasil
print(tabel_nrmse_stasiun)
## # A tibble: 8 × 4
## Variabel `NRMSE_Skema_10%` `NRMSE_Skema_40%` `NRMSE_Skema_70%`
## <chr> <dbl> <dbl> <dbl>
## 1 Tn 0.634 0.833 0.823
## 2 Tx 0.595 0.767 0.758
## 3 Tavg 0.637 0.792 0.786
## 4 RH_avg 0.553 0.712 0.706
## 5 RR 0.907 1.02 1.01
## 6 ss 0.724 0.822 0.819
## 7 ff_avg 0.854 0.965 0.961
## 8 tahun 0.782 0.927 0.929
# Dataframe
eval_met_r2 = data.frame(
Skema = character(),
Variabel = character(),
R_Squared = numeric(),
stringsAsFactors = FALSE
)
# Daftar skema (Stasiun)
skema_met = list(
"10%" = list(uji = uji_1, mf = MF_1),
"40%" = list(uji = uji_2, mf = MF_2),
"70%" = list(uji = uji_3, mf = MF_3)
)
# Looping perhitungan khusus R-Squared
for (nama_skema in names(skema_met)) {
d_uji = skema_met[[nama_skema]]$uji
d_mf = skema_met[[nama_skema]]$mf
for (var in names(d_uji)) {
idx_na = which(is.na(d_uji[[var]]))
if(length(idx_na) > 0) {
aktual = gt_met[[var]][idx_na]
prediksi = d_mf[[var]][idx_na]
eval_met_r2 = rbind(eval_met_r2, data.frame(
Skema = nama_skema,
Variabel = var,
R_Squared = round(cor(aktual, prediksi)^2, 4) # Dibulatkan 4 desimal
))
}
}
}
tabel_r2_stasiun = eval_met_r2 %>%
pivot_wider(names_from = Skema, values_from = R_Squared, names_prefix = "R2_Skema_")
# Tampilkan hasil
print(tabel_r2_stasiun)
## # A tibble: 8 × 4
## Variabel `R2_Skema_10%` `R2_Skema_40%` `R2_Skema_70%`
## <chr> <dbl> <dbl> <dbl>
## 1 Tn 0.600 0.354 0.363
## 2 Tx 0.651 0.433 0.442
## 3 Tavg 0.594 0.406 0.407
## 4 RH_avg 0.696 0.508 0.516
## 5 RR 0.186 0.0967 0.100
## 6 ss 0.477 0.351 0.354
## 7 ff_avg 0.271 0.119 0.124
## 8 tahun 0.389 0.188 0.181
# EVALUASI RMSE SPESIFIK PER VARIABEL
hitung_rmse_per_var = function(data_asli, data_uji, data_imputasi) {
rmse_list = list() # Buat list kosong untuk menyimpan hasil
# Looping untuk setiap nama kolom di dataset
for (kolom in names(data_asli)) {
# Cari posisi baris yang nilainya sengaja dihilangkan (NA) HANYA untuk kolom ini
mask_kolom = is.na(data_uji[[kolom]])
# Cek apakah ada data yang dihilangkan di kolom tersebut
if (sum(mask_kolom) > 0) {
# Ambil nilai asli dan tebakan MissForest khusus di posisi yang di-NA-kan
asli_kolom = data_asli[[kolom]][mask_kolom]
pred_kolom = data_imputasi[[kolom]][mask_kolom]
# Hitung RMSE khusus kolom ini
rmse_val = sqrt(mean((asli_kolom - pred_kolom)^2, na.rm = TRUE))
rmse_list[[kolom]] = rmse_val
} else {
# Jika tidak ada yang di-NA-kan, kembalikan NA
rmse_list[[kolom]] = NA
}
}
# Ubah list menjadi dataframe agar rapi
return(as.data.frame(rmse_list))
}
# EVALUASI RMSE PER VARIABEL (Meteorologi)
rmse_met_10 = hitung_rmse_per_var(gt_met, uji_1, MF_1)
rmse_met_40 = hitung_rmse_per_var(gt_met, uji_2, MF_2)
rmse_met_70 = hitung_rmse_per_var(gt_met, uji_3, MF_3)
# Gabungkan hasil
tabel_rmse_met = rbind(rmse_met_10, rmse_met_40, rmse_met_70)
# Tabel
tabel_rmse_met$Skema = c("Rendah (10%)", "Sedang (40%)", "Tinggi (70%)")
tabel_rmse_met = tabel_rmse_met %>% select(Skema, everything())
# Hasil
print(tabel_rmse_met)
## Skema Tn Tx Tavg RH_avg RR ss
## 1 Rendah (10%) 0.9790773 0.9213993 0.5457545 4.214455 10.69447 1.685544
## 2 Sedang (40%) 1.2712601 1.1398173 0.7302706 5.488930 12.31773 1.879277
## 3 Tinggi (70%) 1.2559587 1.1257265 0.7249954 5.436895 12.28511 1.872205
## ff_avg tahun
## 1 0.9633813 7.235672
## 2 1.0963608 8.698002
## 3 1.0917019 8.719935
# Data Meteorologi (variabel-variabel lain)
data_met = data_met %>%
select(-tahun)
str(data_met)
## tibble [12,053 × 8] (S3: tbl_df/tbl/data.frame)
## $ Tanggal: POSIXct[1:12053], format: "1990-01-01" "1990-01-02" ...
## $ Tn : num [1:12053] 19 19 20 19 19 20 19 19 20 20 ...
## $ Tx : num [1:12053] 28.4 27 26.8 28.4 28.6 22 27.8 28.2 28.2 24.8 ...
## $ Tavg : num [1:12053] 22.2 22.5 23.1 22.5 21.5 20.3 22.3 23.3 23.2 21.8 ...
## $ RH_avg : num [1:12053] 86 83 85 85 86 94 84 72 79 87 ...
## $ RR : num [1:12053] 6 47 0 12 16.5 1 13 1.5 2 0 ...
## $ ss : num [1:12053] 4.1 0.4 6.2 4.3 NA 0.1 5.2 2.8 5.1 0.4 ...
## $ ff_avg : num [1:12053] 0 0 1 2 2 1 1 2 2 2 ...
# Data Chirps (satelit)
data_chrp = read_xlsx("D:\\deska_baru2\\DATA CH Bandung CHRIPS.xlsx")
# Ubah nama kolom ke-2 agar lebih mudah
data_chrp = data_chrp %>%
rename(RR_chrp = 2)
str(data_chrp)
## tibble [12,053 × 2] (S3: tbl_df/tbl/data.frame)
## $ Tanggal: POSIXct[1:12053], format: "1990-01-01" "1990-01-02" ...
## $ RR_chrp: num [1:12053] 13.22 3.86 4.41 11.57 3.86 ...
head(data_chrp)
## # A tibble: 6 × 2
## Tanggal RR_chrp
## <dttm> <dbl>
## 1 1990-01-01 00:00:00 13.2
## 2 1990-01-02 00:00:00 3.86
## 3 1990-01-03 00:00:00 4.41
## 4 1990-01-04 00:00:00 11.6
## 5 1990-01-05 00:00:00 3.86
## 6 1990-01-06 00:00:00 9.17
# Gabungkan data Meteorologi dengan data Satelit
data_gab = data_met %>%
left_join(data_chrp, by = "Tanggal")
# Ubah kode 8888 jadi NA
data_gab[data_gab==8888] = NA
str(data_gab)
## tibble [12,053 × 9] (S3: tbl_df/tbl/data.frame)
## $ Tanggal: POSIXct[1:12053], format: "1990-01-01" "1990-01-02" ...
## $ Tn : num [1:12053] 19 19 20 19 19 20 19 19 20 20 ...
## $ Tx : num [1:12053] 28.4 27 26.8 28.4 28.6 22 27.8 28.2 28.2 24.8 ...
## $ Tavg : num [1:12053] 22.2 22.5 23.1 22.5 21.5 20.3 22.3 23.3 23.2 21.8 ...
## $ RH_avg : num [1:12053] 86 83 85 85 86 94 84 72 79 87 ...
## $ RR : num [1:12053] 6 47 0 12 16.5 1 13 1.5 2 0 ...
## $ ss : num [1:12053] 4.1 0.4 6.2 4.3 NA 0.1 5.2 2.8 5.1 0.4 ...
## $ ff_avg : num [1:12053] 0 0 1 2 2 1 1 2 2 2 ...
## $ RR_chrp: num [1:12053] 13.22 3.86 4.41 11.57 3.86 ...
ncol(data_gab)
## [1] 9
summary(data_gab)
## Tanggal Tn Tx Tavg
## Min. :1990-01-01 Min. :13.00 Min. :18.80 Min. :18.40
## 1st Qu.:1998-04-02 1st Qu.:19.00 1st Qu.:28.10 1st Qu.:22.80
## Median :2006-07-02 Median :20.00 Median :29.00 Median :23.40
## Mean :2006-07-02 Mean :19.38 Mean :29.01 Mean :23.39
## 3rd Qu.:2014-10-01 3rd Qu.:20.00 3rd Qu.:30.00 3rd Qu.:24.00
## Max. :2022-12-31 Max. :29.00 Max. :36.00 Max. :28.90
## NAs :578 NAs :326 NAs :156
## RH_avg RR ss ff_avg
## Min. :42.00 Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.:73.00 1st Qu.: 0.000 1st Qu.: 3.300 1st Qu.: 1.000
## Median :79.00 Median : 0.800 Median : 5.100 Median : 2.000
## Mean :77.99 Mean : 6.738 Mean : 4.924 Mean : 1.736
## 3rd Qu.:84.00 3rd Qu.: 7.800 3rd Qu.: 6.800 3rd Qu.: 2.000
## Max. :96.00 Max. :160.000 Max. :10.000 Max. :15.000
## NAs :330 NAs :1342 NAs :492 NAs :114
## RR_chrp
## Min. : 0.000
## 1st Qu.: 0.000
## Median : 3.079
## Mean : 6.063
## 3rd Qu.: 9.865
## Max. :90.572
##
# Missing Value
data_gab = data_gab %>%
mutate(tahun = year(Tanggal))
missing_tahunan2 = data_gab %>%
group_by(tahun) %>%
summarise(
RR_na = sum(is.na(RR)),
Tn_na = sum(is.na(Tn)),
Tx_na = sum(is.na(Tx))
)
library(DT)
# Membuat tabel interaktif
datatable(missing_tahunan2,
options = list(pageLength = 33),
caption = 'Tabel Missing Value Tahunan')
# Visualisasi
# A. Menggunakan naniar: Melihat peta sebaran data kosong
# Hitam = Data ada, Abu-abu = Data kosong (NA)
vis_miss(data_gab) +
labs(title = "Peta Sebaran Missing Value Data Meteorologi")
# B. Menggunakan VIM: Melihat kombinasi variabel yang sering kosong bersamaan
aggr(data_gab,
col = c('navyblue','red'),
numbers = TRUE,
sortVars = TRUE,
labels = names(data_gab),
cex.axis = .7,
gap = 3,
ylab = c("Proporsi Missing Data", "Pola Kombinasi Kosong"))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)
##
## Variables sorted by number of missings:
## Variable Count
## RR 0.111341575
## Tn 0.047954866
## ss 0.040819713
## RH_avg 0.027379076
## Tx 0.027047208
## Tavg 0.012942836
## ff_avg 0.009458226
## Tanggal 0.000000000
## RR_chrp 0.000000000
## tahun 0.000000000
# Korelasi
data_numeric2 = na.omit(data_gab[, 2:8]) # Buang kolom Tanggal dan hapus NA sementara khusus untuk hitung korelasi
data_numeric2 = na.omit(data_gab[, 2:8])
matriks_korelasi = cor(data_numeric2)
# Visualisasi
ggcorrplot(matriks_korelasi,
method = "square",
type = "lower", # Hanya menampilkan separuh bawah matriks
lab = TRUE, # Menampilkan angka korelasi
colors = c("red", "white", "blue"),
title = "Matriks Korelasi Variabel Meteorologi Bandung")
matriks_korelasi = cor(data_numeric2)
# Histogram
data_long2 = data_gab %>%
select(Tn, Tx, RR, RR_chrp) %>% # <--- RR_chrp ditambahkan di sini
pivot_longer(cols = everything(), names_to = "Variabel", values_to = "Nilai")
# Visualisasi
ggplot(data_long2, aes(x = Variabel, y = Nilai, fill = Variabel)) +
geom_boxplot(alpha = 0.7, outlier.colour = "red", outlier.shape = 16) +
coord_flip() +
theme_minimal() +
labs(title = "Distribusi Suhu & Curah Hujan (Stasiun vs CHIRPS)",
x = "Variabel Iklim",
y = "Nilai Pengukuran")
## Warning: Removed 2246 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# Line chart Perbandingan Curah Hujan (RR vs RR_chrp)
ggplot(data_gab, aes(x = Tanggal)) +
geom_line(aes(y = RR, color = "Stasiun (RR)"), alpha = 0.8) +
geom_line(aes(y = RR_chrp, color = "Satelit (CHIRPS)"), alpha = 0.6) +
scale_color_manual(values = c("Stasiun (RR)" = "dodgerblue", "Satelit (CHIRPS)" = "firebrick")) +
theme_minimal() +
labs(title = "Perbandingan Tren Curah Hujan: Stasiun vs CHIRPS Satelit (1990-2022)",
x = "Tahun",
y = "Curah Hujan (mm)",
color = "Sumber Data") +
theme(legend.position = "bottom")
# Data Ground Truth
gt_gab = na.omit(data_gab[,2:9])
anyNA(gt_gab) # Cek data sudah bersih NA
## [1] FALSE
Duplikasi data dan hilangkan 10% nilai dalam data
set.seed(123)
uji1 = gt_gab
for (col in names(uji1)) {
n_hilang = round(0.1 * nrow(uji1))
idx = sample(1:nrow(uji1), n_hilang)
uji1[idx, col] = NA
}
# Simpan nilai asli 10%
mask1 = is.na(uji1)
nilai_asli1 = gt_gab[mask1]
Duplikasi data dan hilangkan 40% nilai dalam data
set.seed(123)
uji2 = gt_gab
for (col in names(uji2)) {
n_hilang = round(0.4 * nrow(uji2))
idx = sample(1:nrow(uji2), n_hilang)
uji2[idx, col] = NA
}
# Simpan nilai asli 40%
mask2 = is.na(uji2)
nilai_asli2 = gt_gab[mask2]
Duplikasi data dan hilangkan 70% nilai dalam data
set.seed(123)
uji3 = gt_gab
for (col in names(uji3)) {
n_hilang = round(0.7 * nrow(uji3))
idx = sample(1:nrow(uji3), n_hilang)
uji3[idx, col] = NA
}
# Simpan nilai asli 70%
mask3 = is.na(uji3)
nilai_asli3 = gt_gab[mask3]
MF1 = missForest(as.matrix(uji1))
MF1 = as.data.frame(MF1$ximp)
MF2 = missForest(as.matrix(uji2))
MF2 = as.data.frame(MF2$ximp)
MF3 = missForest(as.matrix(uji3))
MF3 = as.data.frame(MF3$ximp)
prediksi_mf1 = MF1[mask1]
prediksi_mf2 = MF2[mask2]
prediksi_mf3 = MF3[mask3]
# RMSE
rmse = function(asli, prediksi) sqrt(mean((asli - prediksi)^2, na.rm = TRUE))
rmse.mf1 = rmse(nilai_asli1, prediksi_mf1)
rmse.mf2 = rmse(nilai_asli2, prediksi_mf2)
rmse.mf3 = rmse(nilai_asli3, prediksi_mf3)
# standar deviasi
sd_asli1 = sd(nilai_asli1, na.rm = TRUE)
sd_asli2 = sd(nilai_asli2, na.rm = TRUE)
sd_asli3 = sd(nilai_asli3, na.rm = TRUE)
# Output
results = data.frame(
Skema = c("Rendah", "Sedang", "Tinggi"),
RMSE = c(rmse.mf1, rmse.mf2, rmse.mf3),
SD = c(sd_asli1, sd_asli2, sd_asli3)
)
results
## Skema RMSE SD
## 1 Rendah 5.167922 24.14472
## 2 Sedang 5.597681 24.15131
## 3 Tinggi 6.572972 24.17520
# Dataframe
eval_gab_nrmse = data.frame(
Skema = character(),
Variabel = character(),
NRMSE_SD = numeric(),
stringsAsFactors = FALSE
)
# Daftar skema (Gabungan)
skema_gab = list(
"10%" = list(uji = uji1, mf = MF1),
"40%" = list(uji = uji2, mf = MF2),
"70%" = list(uji = uji3, mf = MF3)
)
# Looping perhitungan khusus NRMSE (Metode SD)
for (nama_skema in names(skema_gab)) {
d_uji = skema_gab[[nama_skema]]$uji
d_mf = skema_gab[[nama_skema]]$mf
for (var in names(d_uji)) {
idx_na = which(is.na(d_uji[[var]]))
if(length(idx_na) > 0) {
aktual = gt_gab[[var]][idx_na]
prediksi = d_mf[[var]][idx_na]
# 1. Hitung error absolut (RMSE)
rmse_val = rmse(aktual, prediksi)
# 2. Hitung variasi alami (SD Aktual)
sd_aktual = sd(aktual, na.rm = TRUE)
# 3. Hitung NRMSE (RMSE dibagi SD)
nrmse_sd_val = rmse_val / sd_aktual
eval_gab_nrmse = rbind(eval_gab_nrmse, data.frame(
Skema = nama_skema,
Variabel = var,
NRMSE_SD = round(nrmse_sd_val, 4) # Dibulatkan 4 desimal
))
}
}
}
tabel_nrmse_gabungan = eval_gab_nrmse %>%
pivot_wider(names_from = Skema, values_from = NRMSE_SD, names_prefix = "NRMSE_Skema_")
# Tampilkan hasil
print(tabel_nrmse_gabungan)
## # A tibble: 8 × 4
## Variabel `NRMSE_Skema_10%` `NRMSE_Skema_40%` `NRMSE_Skema_70%`
## <chr> <dbl> <dbl> <dbl>
## 1 Tn 0.690 0.841 1.01
## 2 Tx 0.616 0.780 0.906
## 3 Tavg 0.654 0.798 0.944
## 4 RH_avg 0.570 0.698 0.863
## 5 RR 0.933 1.01 1.17
## 6 ss 0.729 0.826 0.976
## 7 ff_avg 0.972 1.04 1.15
## 8 RR_chrp 0.902 0.974 1.07
# Dataframe
eval_gab_r2 = data.frame(
Skema = character(),
Variabel = character(),
R_Squared = numeric(),
stringsAsFactors = FALSE
)
# Daftar skema (Gabungan)
skema_gab = list(
"10%" = list(uji = uji1, mf = MF1),
"40%" = list(uji = uji2, mf = MF2),
"70%" = list(uji = uji3, mf = MF3)
)
# Looping perhitungan khusus R-Squared
for (nama_skema in names(skema_gab)) {
d_uji = skema_gab[[nama_skema]]$uji
d_mf = skema_gab[[nama_skema]]$mf
for (var in names(d_uji)) {
idx_na = which(is.na(d_uji[[var]]))
if(length(idx_na) > 0) {
aktual = gt_gab[[var]][idx_na]
prediksi = d_mf[[var]][idx_na]
eval_gab_r2 = rbind(eval_gab_r2, data.frame(
Skema = nama_skema,
Variabel = var,
R_Squared = round(cor(aktual, prediksi)^2, 4) # Dibulatkan 4 desimal
))
}
}
}
tabel_r2_gabungan = eval_gab_r2 %>%
pivot_wider(names_from = Skema, values_from = R_Squared, names_prefix = "R2_Skema_")
# Tampilkan hasil
print(tabel_r2_gabungan)
## # A tibble: 8 × 4
## Variabel `R2_Skema_10%` `R2_Skema_40%` `R2_Skema_70%`
## <chr> <dbl> <dbl> <dbl>
## 1 Tn 0.525 0.335 0.152
## 2 Tx 0.623 0.414 0.233
## 3 Tavg 0.573 0.396 0.179
## 4 RH_avg 0.675 0.531 0.301
## 5 RR 0.150 0.102 0.0242
## 6 ss 0.469 0.351 0.176
## 7 ff_avg 0.066 0.0353 0.0072
## 8 RR_chrp 0.189 0.119 0.0535
# Data
rmse_var_10 = hitung_rmse_per_var(gt_gab, uji1, MF1)
rmse_var_40 = hitung_rmse_per_var(gt_gab, uji2, MF2)
rmse_var_70 = hitung_rmse_per_var(gt_gab, uji3, MF3)
# Gabungkan Hasil
tabel_rmse_variabel = rbind(rmse_var_10, rmse_var_40, rmse_var_70)
# Tabel
tabel_rmse_variabel$Skema = c("Rendah (10%)", "Sedang (40%)", "Tinggi (70%)")
tabel_rmse_variabel = tabel_rmse_variabel %>% select(Skema, everything())
# Hasil
print(tabel_rmse_variabel)
## Skema Tn Tx Tavg RH_avg RR ss ff_avg
## 1 Rendah (10%) 1.065085 0.9546227 0.5601458 4.349404 11.00737 1.698263 1.096608
## 2 Sedang (40%) 1.284404 1.1591392 0.7353767 5.381213 12.24037 1.887367 1.176456
## 3 Tinggi (70%) 1.506655 1.3282911 0.8715822 6.593176 14.57583 2.249996 1.296153
## RR_chrp
## 1 8.193519
## 2 7.962900
## 3 8.841401
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
# 1. Ekstraksi Metrik Data Meteorologi (Stasiun)
df_met = data.frame()
for (nama_skema in names(skema_met)) {
d_uji = skema_met[[nama_skema]]$uji
d_mf = skema_met[[nama_skema]]$mf
for (var in names(d_uji)) {
idx_na = which(is.na(d_uji[[var]]))
if(length(idx_na) > 0) {
aktual = gt_met[[var]][idx_na]
prediksi = d_mf[[var]][idx_na]
rmse_val = rmse(aktual, prediksi)
sd_val = sd(aktual, na.rm = TRUE)
nrmse_val = rmse_val / sd_val
r2_val = cor(aktual, prediksi)^2
df_met = rbind(df_met, data.frame(
Variabel = var,
Skema = nama_skema,
RMSE_Met = round(rmse_val, 4),
SD_Met = round(sd_val, 4),
NRMSE_Met = round(nrmse_val, 4),
R2_Met = round(r2_val, 4)
))
}
}
}
# 2. Ekstraksi Metrik Data Gabungan (Stasiun + CHIRPS)
df_gab = data.frame()
for (nama_skema in names(skema_gab)) {
d_uji = skema_gab[[nama_skema]]$uji
d_mf = skema_gab[[nama_skema]]$mf
for (var in names(d_uji)) {
idx_na = which(is.na(d_uji[[var]]))
if(length(idx_na) > 0) {
aktual = gt_gab[[var]][idx_na]
prediksi = d_mf[[var]][idx_na]
rmse_val = rmse(aktual, prediksi)
sd_val = sd(aktual, na.rm = TRUE)
nrmse_val = rmse_val / sd_val
r2_val = cor(aktual, prediksi)^2
df_gab = rbind(df_gab, data.frame(
Variabel = var,
Skema = nama_skema,
RMSE_Gab = round(rmse_val, 4),
SD_Gab = round(sd_val, 4),
NRMSE_Gab = round(nrmse_val, 4),
R2_Gab = round(r2_val, 4)
))
}
}
}
# 3. Gabungkan Menjadi Master Data dan Bersihkan
df_master = full_join(df_met, df_gab, by = c("Variabel", "Skema")) %>%
arrange(Variabel, Skema)
# Hapus variabel yang tidak perlu (tahun, Tanggal)
df_master_bersih = df_master %>%
filter(!Variabel %in% c("tahun", "Tanggal"))
# 4. Siapkan Tabel 1 (Error dan Variasi)
tabel_error_sd_bersih = df_master_bersih %>%
select(Variabel, Skema, RMSE_Met, RMSE_Gab, SD_Met, SD_Gab, NRMSE_Met, NRMSE_Gab)
# 5. Siapkan Tabel 2 (R-Squared)
tabel_rsquare_bersih = df_master_bersih %>%
select(Variabel, Skema, R2_Met, R2_Gab)
# 6. RENDER TABEL 1 YANG ESTETIK
tabel_error_sd_bersih %>%
kable(caption = "<b>Tabel 1: Perbandingan Metrik Error dan Variasi (Stasiun vs Gabungan)</b>",
align = "c", escape = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center") %>%
add_header_above(c(" " = 2, "RMSE" = 2, "Standar Deviasi" = 2, "NRMSE" = 2)) %>%
collapse_rows(columns = 1, valign = "middle")
| Variabel | Skema | RMSE_Met | RMSE_Gab | SD_Met | SD_Gab | NRMSE_Met | NRMSE_Gab |
|---|---|---|---|---|---|---|---|
| RH_avg | 10% | 4.2145 | 4.3494 | 7.6250 | 7.6250 | 0.5527 | 0.5704 |
| 40% | 5.4889 | 5.3812 | 7.7043 | 7.7043 | 0.7125 | 0.6985 | |
| 70% | 5.4369 | 6.5932 | 7.7043 | 7.6391 | 0.7057 | 0.8631 | |
| RR | 10% | 10.6945 | 11.0074 | 11.7941 | 11.7941 | 0.9068 | 0.9333 |
| 40% | 12.3177 | 12.2404 | 12.1162 | 12.1162 | 1.0166 | 1.0102 | |
| 70% | 12.2851 | 14.5758 | 12.1162 | 12.4749 | 1.0139 | 1.1684 | |
| RR_chrp | 10% | NA | 8.1935 | NA | 9.0882 | NA | 0.9016 |
| 40% | NA | 7.9629 | NA | 8.1724 | NA | 0.9744 | |
| 70% | NA | 8.8414 | NA | 8.2925 | NA | 1.0662 | |
| Tavg | 10% | 0.5458 | 0.5601 | 0.8566 | 0.8566 | 0.6371 | 0.6539 |
| 40% | 0.7303 | 0.7354 | 0.9220 | 0.9220 | 0.7920 | 0.7976 | |
| 70% | 0.7250 | 0.8716 | 0.9220 | 0.9236 | 0.7863 | 0.9437 | |
| Tn | 10% | 0.9791 | 1.0651 | 1.5445 | 1.5445 | 0.6339 | 0.6896 |
| 40% | 1.2713 | 1.2844 | 1.5266 | 1.5266 | 0.8327 | 0.8414 | |
| 70% | 1.2560 | 1.5067 | 1.5266 | 1.4905 | 0.8227 | 1.0108 | |
| Tx | 10% | 0.9214 | 0.9546 | 1.5484 | 1.5484 | 0.5951 | 0.6165 |
| 40% | 1.1398 | 1.1591 | 1.4853 | 1.4853 | 0.7674 | 0.7804 | |
| 70% | 1.1257 | 1.3283 | 1.4853 | 1.4658 | 0.7579 | 0.9062 | |
| ff_avg | 10% | 0.9634 | 1.0966 | 1.1282 | 1.1282 | 0.8539 | 0.9720 |
| 40% | 1.0964 | 1.1765 | 1.1357 | 1.1357 | 0.9654 | 1.0359 | |
| 70% | 1.0917 | 1.2962 | 1.1357 | 1.1260 | 0.9613 | 1.1511 | |
| ss | 10% | 1.6855 | 1.6983 | 2.3298 | 2.3298 | 0.7235 | 0.7289 |
| 40% | 1.8793 | 1.8874 | 2.2859 | 2.2859 | 0.8221 | 0.8257 | |
| 70% | 1.8722 | 2.2500 | 2.2859 | 2.3050 | 0.8190 | 0.9761 |
# 7. RENDER TABEL 2 YANG ESTETIK
tabel_rsquare_bersih %>%
kable(caption = "<b>Tabel 2: Perbandingan Kesesuaian Pola / R-Squared</b>",
align = "c", escape = FALSE) %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
full_width = FALSE,
position = "center") %>%
collapse_rows(columns = 1, valign = "middle")
| Variabel | Skema | R2_Met | R2_Gab |
|---|---|---|---|
| RH_avg | 10% | 0.6961 | 0.6749 |
| 40% | 0.5085 | 0.5312 | |
| 70% | 0.5156 | 0.3009 | |
| RR | 10% | 0.1863 | 0.1499 |
| 40% | 0.0967 | 0.1019 | |
| 70% | 0.1001 | 0.0242 | |
| RR_chrp | 10% | NA | 0.1892 |
| 40% | NA | 0.1187 | |
| 70% | NA | 0.0535 | |
| Tavg | 10% | 0.5939 | 0.5730 |
| 40% | 0.4060 | 0.3963 | |
| 70% | 0.4069 | 0.1793 | |
| Tn | 10% | 0.5996 | 0.5247 |
| 40% | 0.3537 | 0.3354 | |
| 70% | 0.3634 | 0.1517 | |
| Tx | 10% | 0.6509 | 0.6226 |
| 40% | 0.4334 | 0.4135 | |
| 70% | 0.4424 | 0.2327 | |
| ff_avg | 10% | 0.2706 | 0.0660 |
| 40% | 0.1186 | 0.0353 | |
| 70% | 0.1237 | 0.0072 | |
| ss | 10% | 0.4768 | 0.4686 |
| 40% | 0.3514 | 0.3512 | |
| 70% | 0.3536 | 0.1755 |
# Imputasi Meteorologi
#data1 = data_met %>%
#select(-Tanggal)
#MF.1 = missForest(as.matrix(data1))
#MF.1 = as.data.frame(MF.1$ximp)
# Imputasi Gabungan (Meteorologi + Chirps)
#data2 = data_gab %>%
#select(-Tanggal, -tahun)
#MF.2 = missForest(as.matrix(data2))
#MF.2 = as.data.frame(MF.2$ximp)
# Meteorologi
#final_met = data.frame(
#Tanggal = data_met$Tanggal,
#CH = MF.1$RR,
#Tn = MF.1$Tn,
#Tx = MF.1$Tx
#)
#head(final_met)
#anyNA(final_met)
# Gabungan (Meteorologi + Satelit)
#final_gab = data.frame(
#Tanggal = data_gab$Tanggal,
#CH = MF.2$RR,
#Tn = MF.2$Tn,
#Tx = MF.2$Tx
#)
#head(final_gab)
#anyNA(final_gab)
# Excel
#library(writexl)
#path_met = "D:\\deska_baru2\\Meteorologi.xlsx"
#path_gab = "D:\\deska_baru2\\Gabungan.xlsx"
# Simpan hasil imputasi stasiun saja ke folder Downloads
#write_xlsx(final_met, path_met)
# Simpan hasil imputasi gabungan (stasiun + satelit) ke folder Downloads
#write_xlsx(final_gab, path_gab)
# CSV
#library(readr)
#path_met_csv = "D:\\deska_baru2\\Meteorologi.csv"
#path_gab_csv = "D:\\deska_baru2\\Gabungan.csv"
# Meteorologi
#write_csv(final_met, path_met_csv)
# Gabungan
#write_csv(final_gab, path_gab_csv)