# Nama : Lailli Arifia Dewi
# NRP : 5003221085
# Kelas : Data Mining dan Visualisasi K

# Import Library
library(imputeTS)
## Warning: package 'imputeTS' was built under R version 4.3.3
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
# Load the data set that contains the gaps (absolute path on the author's
# machine; adjust it when running elsewhere)
data <- read.csv(
  file = "C:\\Users\\Laillli Arifia\\OneDrive\\Documents\\Kuliah\\Semester 7\\datmin\\missing_data.csv"
)

# Total number of NA cells across the whole data frame
sum(is.na(data))
## [1] 64
# Build a time series from the second column; frequency = 144 is the number
# of observations per cycle (described by the author as a daily pattern)
ts_data <- ts(data = data[[2]], frequency = 144)

# Show where the missing values are located along the series
ggplot_na_distribution(x = ts_data)

# ====== Imputation using various methods
# Each call below takes the series with gaps (`ts_data`) and returns an
# imputed version of it. The results are compared against the complete
# data further down in the script.

# 1. Mean / Median / Mode
# A single summary statistic of the series is used for every gap.
imp_mean   <- na_mean(ts_data, option = "mean")
imp_median <- na_mean(ts_data, option = "median")
imp_mode   <- na_mean(ts_data, option = "mode")

# 2. LOCF & NOCB
# LOCF = last observation carried forward,
# NOCB = next observation carried backward.
imp_locf <- na_locf(ts_data, option = "locf")
imp_nocb <- na_locf(ts_data, option = "nocb")

# 3. Interpolation
imp_linear <- na_interpolation(ts_data, option = "linear")
imp_spline <- na_interpolation(ts_data, option = "spline")
imp_stine  <- na_interpolation(ts_data, option = "stine")

# 4. Moving Average
# Same window, three different weighting schemes for the neighbours.
imp_ma_simple <- na_ma(ts_data, weighting = "simple")
imp_ma_linear <- na_ma(ts_data, weighting = "linear")
imp_ma_exp    <- na_ma(ts_data, weighting = "exponential")

# 5. Random Imputation
# na_random() fills the gaps with random values, so without a fixed seed the
# imputed series (and every metric computed from it) changes on each run.
# Seed the generator right before the call to make the result reproducible.
set.seed(123)
imp_random <- na_random(ts_data)

# 6. Seasonal Decomposition
# Uses the seasonal period given by the frequency of `ts_data`; the
# de-seasonalised series is imputed by interpolation.
imp_seadec_interpolation <- na_seadec(ts_data, algorithm = "interpolation")

# Read the complete dataset (the reference values without gaps; absolute
# path on the author's machine)
data_complete <- read.csv("C:\\Users\\Laillli Arifia\\OneDrive\\Documents\\Kuliah\\Semester 7\\datmin\\complete_data.csv")

# Convert to a time series with the same layout as ts_data
ts_complete <- ts(data_complete[, 2], frequency = 144)

# The evaluation compares both series position by position. If their lengths
# differ, positions past the end of the shorter one index to NA and would be
# dropped silently when the error is averaged, so fail early instead.
stopifnot(length(ts_complete) == length(ts_data))

# Positions that are missing in the series with gaps
na_index <- which(is.na(ts_data))

# Root-mean-square error between reference values and estimates.
#
# Args:
#   actual:    numeric vector of reference (true) values.
#   predicted: numeric vector of estimates; must have the same length as
#              `actual`, or length 1 to compare against a constant.
#
# Returns:
#   A single numeric value. Pairs in which either value is NA are ignored;
#   the result is NaN when no complete pair is left.
#
# Errors:
#   Stops when the lengths differ and neither input has length 1, because R
#   would otherwise recycle the shorter vector (silently when the lengths
#   divide evenly) and return a meaningless score.
rmse <- function(actual, predicted) {
  n_actual <- length(actual)
  n_predicted <- length(predicted)
  if (n_actual != n_predicted && n_actual != 1L && n_predicted != 1L) {
    stop(
      "`actual` and `predicted` must have the same length (got ",
      n_actual, " and ", n_predicted, ").",
      call. = FALSE
    )
  }
  sqrt(mean((actual - predicted)^2, na.rm = TRUE))
}

# Score every imputation method on the gap positions only: the imputed
# values at `na_index` are compared with the true values at the same
# positions. local() keeps the helper objects out of the workspace.
rmse_results <- local({
  truth_at_gaps <- ts_complete[na_index]

  # List names double as the labels in the Method column
  imputed_series <- list(
    Mean          = imp_mean,
    Median        = imp_median,
    Mode          = imp_mode,
    LOCF          = imp_locf,
    NOCB          = imp_nocb,
    Linear        = imp_linear,
    Spline        = imp_spline,
    Stine         = imp_stine,
    MA_Simple     = imp_ma_simple,
    MA_Linear     = imp_ma_linear,
    MA_Exp        = imp_ma_exp,
    Random        = imp_random,
    SeaDec_Interp = imp_seadec_interpolation
  )

  scores <- vapply(
    imputed_series,
    function(series) rmse(truth_at_gaps, series[na_index]),
    numeric(1)
  )

  # unname() so the data frame keeps the default 1..n row names
  data.frame(Method = names(imputed_series), RMSE = unname(scores))
})

print(rmse_results)
# Output recorded from one run (the Random row depends on the RNG state):
##           Method      RMSE
## 1           Mean 52.111118
## 2         Median 51.646333
## 3           Mode 62.371117
## 4           LOCF  8.181534
## 5           NOCB 12.253826
## 6         Linear  8.832111
## 7         Spline 18.239169
## 8          Stine  8.882046
## 9      MA_Simple  8.230348
## 10     MA_Linear  7.859050
## 11        MA_Exp  7.687848
## 12        Random 81.527877
## 13 SeaDec_Interp  9.424880
# One plot per method: the series with gaps, the imputed values, and the
# true values from the complete data for comparison. Each call is left as a
# top-level expression so the plot is auto-printed.

# Mean / median / mode
ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_mean,
  x_with_truth = ts_complete
)

ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_median,
  x_with_truth = ts_complete
)

ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_mode,
  x_with_truth = ts_complete
)

# LOCF / NOCB
ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_locf,
  x_with_truth = ts_complete
)

ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_nocb,
  x_with_truth = ts_complete
)

# Interpolation (linear, spline, Stineman)
ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_linear,
  x_with_truth = ts_complete
)

ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_spline,
  x_with_truth = ts_complete
)

ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_stine,
  x_with_truth = ts_complete
)

# Moving average (simple, linear, exponential weighting)
ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_ma_simple,
  x_with_truth = ts_complete
)

ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_ma_linear,
  x_with_truth = ts_complete
)

ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_ma_exp,
  x_with_truth = ts_complete
)

# Random imputation
ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_random,
  x_with_truth = ts_complete
)

# Seasonal decomposition + interpolation
ggplot_na_imputations(
  x_with_na = ts_data, x_with_imputations = imp_seadec_interpolation,
  x_with_truth = ts_complete
)