# Nama : Lailli Arifia Dewi
# NRP : 5003221085
# Kelas : Data Mining dan Visualisasi K
# Import Library
library(imputeTS)
## Warning: package 'imputeTS' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
# Load the dataset that contains the missing observations
data <- read.csv(
  "C:\\Users\\Laillli Arifia\\OneDrive\\Documents\\Kuliah\\Semester 7\\datmin\\missing_data.csv"
)
# Report how many cells in the whole data frame are NA
sum(is.na(data))
## [1] 64
# Wrap the second column in a ts object; frequency = 144 sets 144 observations
# per seasonal cycle, treated here as one day (presumably 10-minute readings;
# confirm against the data)
ts_data <- ts(data[[2]], frequency = 144)
# Show where the missing values fall along the series
ggplot_na_distribution(ts_data)

# ====== Imputation using various methods
# Each imp_* object is the result of running one imputeTS method on ts_data.
# 1. Mean / Median / Mode
imp_mean <- na_mean(ts_data, option = "mean")
imp_median <- na_mean(ts_data, option = "median")
imp_mode <- na_mean(ts_data, option = "mode")
# 2. LOCF & NOCB
imp_locf <- na_locf(ts_data, option = "locf")
imp_nocb <- na_locf(ts_data, option = "nocb")
# 3. Interpolation
imp_linear <- na_interpolation(ts_data, option = "linear")
imp_spline <- na_interpolation(ts_data, option = "spline")
imp_stine <- na_interpolation(ts_data, option = "stine")
# 4. Moving Average
imp_ma_simple <- na_ma(ts_data, weighting = "simple")
imp_ma_linear <- na_ma(ts_data, weighting = "linear")
imp_ma_exp <- na_ma(ts_data, weighting = "exponential")
# 5. Random Imputation
# Fix the RNG seed first: na_random() draws random replacement values, so
# without a seed the imputed series (and any score computed from it) changes
# on every run and cannot be reproduced.
set.seed(123)
imp_random <- na_random(ts_data)
# 6. Seasonal Decomposition
imp_seadec_interpolation <- na_seadec(ts_data, algorithm = "interpolation")
# Read the complete dataset (the reference values the imputations are
# compared against)
data_complete <- read.csv("C:\\Users\\Laillli Arifia\\OneDrive\\Documents\\Kuliah\\Semester 7\\datmin\\complete_data.csv")
# Convert to time series as well, with the same frequency as ts_data
ts_complete <- ts(data_complete[, 2], frequency = 144)
# Both series are compared position by position, so they must have the same
# length. Otherwise indexing past the end of the shorter one yields NA, which
# the RMSE computation would silently drop.
stopifnot(length(ts_complete) == length(ts_data))
# Identify the positions of NA in the original data
na_index <- which(is.na(ts_data))
# Root mean squared error between two numeric vectors.
#
# actual:    numeric vector of reference values.
# predicted: numeric vector of estimates with the same length as `actual`
#            (a single value is also accepted and compared to every element).
# Returns a single number. Pairs in which either value is NA are ignored.
# Stops with an error when the lengths are incompatible, because R would
# otherwise recycle the shorter vector and return a misleading score.
rmse <- function(actual, predicted) {
  n_actual <- length(actual)
  n_predicted <- length(predicted)
  if (n_actual != n_predicted && n_actual != 1L && n_predicted != 1L) {
    stop(
      "`actual` and `predicted` must have the same length (got ",
      n_actual, " and ", n_predicted, ")",
      call. = FALSE
    )
  }
  sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}
# Score every imputation against the complete series, looking only at the
# positions that were NA in ts_data
rmse_results <- local({
  truth_at_gaps <- ts_complete[na_index]
  # Method label -> imputed series, in the order the table is reported
  candidates <- list(
    Mean = imp_mean,
    Median = imp_median,
    Mode = imp_mode,
    LOCF = imp_locf,
    NOCB = imp_nocb,
    Linear = imp_linear,
    Spline = imp_spline,
    Stine = imp_stine,
    MA_Simple = imp_ma_simple,
    MA_Linear = imp_ma_linear,
    MA_Exp = imp_ma_exp,
    Random = imp_random,
    SeaDec_Interp = imp_seadec_interpolation
  )
  # USE.NAMES = FALSE keeps the scores unnamed so the data frame gets plain
  # integer row names
  scores <- vapply(
    candidates,
    function(series) rmse(truth_at_gaps, series[na_index]),
    numeric(1),
    USE.NAMES = FALSE
  )
  data.frame(Method = names(candidates), RMSE = scores)
})
print(rmse_results)
## Method RMSE
## 1 Mean 52.111118
## 2 Median 51.646333
## 3 Mode 62.371117
## 4 LOCF 8.181534
## 5 NOCB 12.253826
## 6 Linear 8.832111
## 7 Spline 18.239169
## 8 Stine 8.882046
## 9 MA_Simple 8.230348
## 10 MA_Linear 7.859050
## 11 MA_Exp 7.687848
## 12 Random 81.527877
## 13 SeaDec_Interp 9.424880
# Create visualization
# One plot per imputation method: the series with gaps, the imputed values,
# and the complete series as ground truth. Each plot carries the method name
# in its title so the figures can be told apart. print() is called explicitly
# because ggplot objects created inside a loop are not auto-printed.
local({
  plot_inputs <- list(
    Mean = imp_mean,
    Median = imp_median,
    Mode = imp_mode,
    LOCF = imp_locf,
    NOCB = imp_nocb,
    Linear = imp_linear,
    Spline = imp_spline,
    Stine = imp_stine,
    MA_Simple = imp_ma_simple,
    MA_Linear = imp_ma_linear,
    MA_Exp = imp_ma_exp,
    Random = imp_random,
    SeaDec_Interp = imp_seadec_interpolation
  )
  for (method in names(plot_inputs)) {
    print(
      ggplot_na_imputations(
        ts_data,
        plot_inputs[[method]],
        ts_complete,
        title = paste("Imputed Values:", method)
      )
    )
  }
})
