Import library

library(tidyverse)
library(gridExtra)
library(readxl)
library(lubridate)
library(hms)
library(forecast)
library(fastDummies)
library(imputeTS)
library(scales)
library(kableExtra)
library(modeltime)
library(Metrics)

Load dataset

df_raw <- read_excel("Data Awal dan Hasil Imputasi rev.xlsx", sheet = 2) %>% 
          mutate(Jam=as_hms(Jam), week = rep(1:51,each=336)[1:16992],
                 time=seq(ymd_hm("2018-1-1 0:00"), ymd_hm("2018-12-20 23:30"), by = 30*60),
                 month=month(time), wday=wday(time),day=day(time), 
                 hour=as_hms(time), yday=yday(time),
                 wtime=rep(seq(ymd_hm("2018-1-1 0:00"), 
                               ymd_hm("2018-1-7 23:30"), 
                               by = 30*60),51)[1:16992]) 
head(df_raw)
tail(df_raw)
dim(df_raw)
[1] 16992    23

Number of missing value for each dataset

Missing value
SUF1_CO 1930
SUF7_CO 628
SUF1_NO2 2426
SUF7_NO2 866
SUF1_O3 474
SUF7_O3 665
SUF1_PM10 7621
SUF7_PM10 2198
SUF1_SO2 1318

Percentage of missing value in SUF 1 dataset

SUF1_CO 11.4%
SUF1_NO2 14.3%
SUF1_O3 2.8%
SUF1_PM10 44.9%
SUF1_SO2 7.8%

Percentage of missing value in SUF 7 dataset

SUF7_CO 3.70%
SUF7_NO2 5.10%
SUF7_O3 3.91%
SUF7_PM10 12.94%
SUF7_SO2 9.58%
var_pollutant <- colnames(df_raw)[6:15]
var_pollutant
 [1] "SUF1_CO"   "SUF7_CO"   "SUF1_NO2"  "SUF7_NO2"  "SUF1_O3"   "SUF7_O3"   "SUF1_PM10" "SUF7_PM10" "SUF1_SO2"  "SUF7_SO2" 

Exploratory Analysis

Time Series Plot

p <- list()
for (i in 1:10){
  p[[i]] <- ggplot(df_raw, aes_(x=~time, y=as.name(var_pollutant[i]))) +
              geom_line() + xlab("") +
              scale_x_datetime(date_breaks = "2 month", date_labels = "%b")
}
grid.arrange(p[[1]], p[[3]], p[[5]],
             p[[7]], p[[9]], ncol=2, top="Time Series Plot on SUF 1")

grid.arrange(p[[2]], p[[4]], p[[6]],
             p[[8]], p[[10]], ncol=2, top="Time Series Plot on SUF 7")

Daily Time Series

p <- list()
for (i in 1:10){
  p[[i]] <- ggplot(df_raw, aes_(x=~hour, y=as.name(var_pollutant[i]), group=~yday)) +
              geom_line() + xlab("") + 
              scale_x_time(labels = label_time(format = '%H:%M'), 
                           breaks=hms(hours = seq(0, 24, 4)))
}
grid.arrange(p[[1]], p[[3]], p[[5]],
             p[[7]], p[[9]], ncol=2, top="Daily Time Series Plot on SUF 1")

grid.arrange(p[[2]], p[[4]], p[[6]],
             p[[8]], p[[10]], ncol=2, top="Daily Time Series Plot on SUF 7")

Weekly Time Series

p <- list()
for (i in 1:10){
  p[[i]] <- ggplot(df_raw, aes_(x=~wtime, y=as.name(var_pollutant[i]), group=~week)) +
              geom_line() + xlab("") + scale_x_datetime(date_breaks = "1 day", date_labels = "%a")
}
grid.arrange(p[[1]], p[[3]], p[[5]],
             p[[7]], p[[10]], ncol=2, top="Weekly Time Series Plot on SUF 1")

grid.arrange(p[[2]], p[[4]], p[[6]],
             p[[8]], p[[10]], ncol=2, top="Weekly Time Series Plot on SUF 7")

Imputation

df <- list()
df[["na"]] <- data.frame(is.na(df_raw))
df[["impute_1"]] <- df_raw %>% 
                    group_by(Hari, J) %>% 
                    mutate_if(is.double,~replace_na(.,median(., na.rm = TRUE))) %>% 
                    ungroup()
`mutate_if()` ignored the following grouping variables:
• Columns `Hari`, `J`
df[["impute_2"]] <- df_raw %>% 
                    group_by(Hari, J) %>% 
                    mutate_if(is.double,~replace_na(.,mean(., na.rm = TRUE))) %>% 
                    ungroup()
`mutate_if()` ignored the following grouping variables:
• Columns `Hari`, `J`
df[["impute_3"]] <- df_raw %>% 
                    group_by(Hari, J) %>% 
                    mutate_if(is.double,~na_interpolation(., option="linear")) %>% 
                    ungroup()
`mutate_if()` ignored the following grouping variables:
• Columns `Hari`, `J`
df[["impute_4"]] <- df_raw %>% 
                    group_by(Hari, J) %>% 
                    mutate_if(is.double,~na_ma(.)) %>% 
                    ungroup()
`mutate_if()` ignored the following grouping variables:
• Columns `Hari`, `J`

CO

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_CO), 
             aes(x=time,y=SUF1_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_CO), 
             aes(x=time,y=SUF1_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_CO), 
             aes(x=time,y=SUF1_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_CO), 
             aes(x=time,y=SUF1_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_CO), 
             aes(x=time,y=SUF7_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_CO), 
             aes(x=time,y=SUF7_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_CO), 
             aes(x=time,y=SUF7_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_CO), 
             aes(x=time,y=SUF7_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

NO2

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_NO2), 
             aes(x=time,y=SUF1_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_NO2), 
             aes(x=time,y=SUF1_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_NO2), 
             aes(x=time,y=SUF1_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_NO2), 
             aes(x=time,y=SUF1_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_NO2), 
             aes(x=time,y=SUF7_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_NO2), 
             aes(x=time,y=SUF7_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_NO2), 
             aes(x=time,y=SUF7_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_NO2), 
             aes(x=time,y=SUF7_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

O3

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_O3), 
             aes(x=time,y=SUF1_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_O3), 
             aes(x=time,y=SUF1_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_O3), 
             aes(x=time,y=SUF1_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_O3), 
             aes(x=time,y=SUF1_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_O3), 
             aes(x=time,y=SUF7_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_O3), 
             aes(x=time,y=SUF7_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_O3), 
             aes(x=time,y=SUF7_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_O3), 
             aes(x=time,y=SUF7_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

PM10

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_PM10), 
             aes(x=time,y=SUF1_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_PM10), 
             aes(x=time,y=SUF1_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_PM10), 
             aes(x=time,y=SUF1_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_PM10), 
             aes(x=time,y=SUF1_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_PM10), 
             aes(x=time,y=SUF7_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_PM10), 
             aes(x=time,y=SUF7_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_PM10), 
             aes(x=time,y=SUF7_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_PM10), 
             aes(x=time,y=SUF7_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

SO2

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_SO2), 
             aes(x=time,y=SUF1_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_SO2), 
             aes(x=time,y=SUF1_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_SO2), 
             aes(x=time,y=SUF1_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_SO2), 
             aes(x=time,y=SUF1_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_SO2), 
             aes(x=time,y=SUF7_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_SO2), 
             aes(x=time,y=SUF7_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_SO2), 
             aes(x=time,y=SUF7_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_SO2), 
             aes(x=time,y=SUF7_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)

Modelling

Dummy variables

df[["dummy"]] <- dummy_cols(df_raw, select_columns = c("J","Hari")) %>% 
                 mutate(t=seq_along(time)) %>% 
                 select(starts_with("J_"), starts_with("Hari_"),t)
for (k in var_pollutant){
  name <- paste0(k,"_imp_1")
  df[[name]] <- bind_cols(df[["impute_1"]], df[["dummy"]]) %>% 
                select(time,starts_with(k), starts_with("J_"), starts_with("Hari_"),t)
  name <- paste0(k,"_imp_2")
  df[[name]] <- bind_cols(df[["impute_2"]], df[["dummy"]]) %>% 
                select(time,starts_with(k), starts_with("J_"), starts_with("Hari_"),t)
  name <- paste0(k,"_imp_3")
  df[[name]] <- bind_cols(df[["impute_3"]], df[["dummy"]]) %>% 
                select(time,starts_with(k), starts_with("J_"), starts_with("Hari_"),t)
  name <- paste0(k,"_imp_4")
  df[[name]] <- bind_cols(df[["impute_4"]], df[["dummy"]]) %>% 
                select(time,starts_with(k), starts_with("J_"), starts_with("Hari_"),t)
}
ind <- df_raw %>% make_ts_splits(.length_test = 48)
ind
split [16944|48|16992]

Model Fitting and Forecasting

applyForecast <- function(data){
  ind <- data %>% make_ts_splits(.length_test = 48)
  train <- data[ind$idx_train,]
  test  <- data[ind$idx_test,]
  yy <- colnames(data)[2]
  f <- paste0(yy,"~.-1-time-Hari_1")
  m <- lm(f, data = train)
  r.squared <- summary(m)$r.squared
  pred <- m %>% predict(test)
  RMSE <- rmse(test[yy] %>% pull(),pred)
  MAPE <- mape(test[yy] %>% pull(),pred)
  sMAPE <- smape(test[yy] %>% pull(),pred)
  p_train <- ggplot(train %>% mutate(predicted=m$fitted.values)) + 
             geom_line(aes_(x=~time,y=as.name(yy))) +  
             geom_line(aes(x=time,y=predicted), color="red")
  p_test  <- ggplot(test %>% mutate(predicted=pred)) + 
             geom_line(aes_(x=~time,y=as.name(yy))) +  
             geom_line(aes(x=time,y=predicted), color="red")
  results <- list(prediction=pred, 
                  r.squared=r.squared,
                  RMSE=RMSE,
                  MAPE=MAPE,
                  sMAPE=sMAPE,
                  plot_train=p_train,
                  plot_test=p_test)
}
df_pollutant <- names(df)[grepl("imp_",names(df))]
res <- list()
for (name in df_pollutant){res[[name]] <- applyForecast(df[[name]])}

Forecast Comparison

dd <- as_tibble(res[[1]][2:5])
for (name in df_pollutant[-1]){
  dd <- dd %>% add_row(as_tibble(res[[name]][2:5]))
}
results <- dd %>% mutate(model=df_pollutant)

CO

results[1:8,]

NO2

results[9:16,]

O3

results[17:24,]

PM10

results[25:32,]

SO2

results[33:40,]
---
title: "Time Series Imputation"
output:
  html_notebook: 
    toc: yes
    toc_float:
      toc_collapsed: yes
editor_options: 
  chunk_output_type: inline
---

## Import library
```{r echo=TRUE, message=FALSE, warning=FALSE}
library(tidyverse)
library(gridExtra)
library(readxl)
library(lubridate)
library(hms)
library(forecast)
library(fastDummies)
library(imputeTS)
library(scales)
library(kableExtra)
library(modeltime)
library(Metrics)
```

## Load dataset

```{r}
df_raw <- read_excel("Data Awal dan Hasil Imputasi rev.xlsx", sheet = 2) %>% 
          mutate(Jam=as_hms(Jam), week = rep(1:51,each=336)[1:16992],
                 time=seq(ymd_hm("2018-1-1 0:00"), ymd_hm("2018-12-20 23:30"), by = 30*60),
                 month=month(time), wday=wday(time),day=day(time), 
                 hour=as_hms(time), yday=yday(time),
                 wtime=rep(seq(ymd_hm("2018-1-1 0:00"), 
                               ymd_hm("2018-1-7 23:30"), 
                               by = 30*60),51)[1:16992]) 
head(df_raw)
```

```{r}
tail(df_raw)
```


```{r}
dim(df_raw)
```

Number of missing value for each dataset
```{r echo=FALSE}
colSums(is.na(df_raw))[6:14] %>%
  kbl(caption = "Missing value", col.names = "") %>%
   kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                 full_width = F, position = "left")
```


Percentage of missing value in SUF 1 dataset
```{r echo=FALSE}
label_percent() (colSums(is.na(df_raw))[c(6,8,10,12,14)]/dim(df_raw)[1]) %>%
  kbl(col.names = "") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                full_width = F, position = "left")
```

Percentage of missing value in SUF 7 dataset
```{r echo=FALSE}
label_percent() (colSums(is.na(df_raw))[c(7,9,11,13,15)]/dim(df_raw)[1]) %>%
  kbl(col.names = "") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), 
                full_width = F, position = "left")
```


```{r}
var_pollutant <- colnames(df_raw)[6:15]
var_pollutant
```

## Exploratory Analysis

### Time Series Plot
```{r}
p <- list()
for (i in 1:10){
  p[[i]] <- ggplot(df_raw, aes_(x=~time, y=as.name(var_pollutant[i]))) +
              geom_line() + xlab("") +
              scale_x_datetime(date_breaks = "2 month", date_labels = "%b")
}
```


```{r warning=FALSE}
grid.arrange(p[[1]], p[[3]], p[[5]],
             p[[7]], p[[9]], ncol=2, top="Time Series Plot on SUF 1")
```

```{r warning=FALSE}
grid.arrange(p[[2]], p[[4]], p[[6]],
             p[[8]], p[[10]], ncol=2, top="Time Series Plot on SUF 7")
```

### Daily Time Series
```{r}
p <- list()
for (i in 1:10){
  p[[i]] <- ggplot(df_raw, aes_(x=~hour, y=as.name(var_pollutant[i]), group=~yday)) +
              geom_line() + xlab("") + 
              scale_x_time(labels = label_time(format = '%H:%M'), 
                           breaks=hms(hours = seq(0, 24, 4)))
}
```


```{r warning=FALSE}
grid.arrange(p[[1]], p[[3]], p[[5]],
             p[[7]], p[[9]], ncol=2, top="Daily Time Series Plot on SUF 1")
```

```{r warning=FALSE}
grid.arrange(p[[2]], p[[4]], p[[6]],
             p[[8]], p[[10]], ncol=2, top="Daily Time Series Plot on SUF 7")
```


### Weekly Time Series
```{r}
p <- list()
for (i in 1:10){
  p[[i]] <- ggplot(df_raw, aes_(x=~wtime, y=as.name(var_pollutant[i]), group=~week)) +
              geom_line() + xlab("") + scale_x_datetime(date_breaks = "1 day", date_labels = "%a")
}
```


```{r warning=FALSE}
grid.arrange(p[[1]], p[[3]], p[[5]],
             p[[7]], p[[10]], ncol=2, top="Weekly Time Series Plot on SUF 1")
```



```{r warning=FALSE}
grid.arrange(p[[2]], p[[4]], p[[6]],
             p[[8]], p[[10]], ncol=2, top="Weekly Time Series Plot on SUF 7")
```


## Imputation


```{r}
df <- list()
df[["na"]] <- data.frame(is.na(df_raw))
df[["impute_1"]] <- df_raw %>% 
                    group_by(Hari, J) %>% 
                    mutate_if(is.double,~replace_na(.,median(., na.rm = TRUE))) %>% 
                    ungroup()
df[["impute_2"]] <- df_raw %>% 
                    group_by(Hari, J) %>% 
                    mutate_if(is.double,~replace_na(.,mean(., na.rm = TRUE))) %>% 
                    ungroup()
df[["impute_3"]] <- df_raw %>% 
                    group_by(Hari, J) %>% 
                    mutate_if(is.double,~na_interpolation(., option="linear")) %>% 
                    ungroup()
df[["impute_4"]] <- df_raw %>% 
                    group_by(Hari, J) %>% 
                    mutate_if(is.double,~na_ma(.)) %>% 
                    ungroup()
```

### CO

```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_CO), 
             aes(x=time,y=SUF1_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_CO), 
             aes(x=time,y=SUF1_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_CO), 
             aes(x=time,y=SUF1_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_CO), 
             aes(x=time,y=SUF1_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```


```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_CO), 
             aes(x=time,y=SUF7_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_CO), 
             aes(x=time,y=SUF7_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_CO), 
             aes(x=time,y=SUF7_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_CO), 
             aes(x=time,y=SUF7_CO, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```



### NO2

```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_NO2), 
             aes(x=time,y=SUF1_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_NO2), 
             aes(x=time,y=SUF1_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_NO2), 
             aes(x=time,y=SUF1_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_NO2), 
             aes(x=time,y=SUF1_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```


```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_NO2), 
             aes(x=time,y=SUF7_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_NO2), 
             aes(x=time,y=SUF7_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_NO2), 
             aes(x=time,y=SUF7_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_NO2), 
             aes(x=time,y=SUF7_NO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```

### O3

```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_O3), 
             aes(x=time,y=SUF1_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_O3), 
             aes(x=time,y=SUF1_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_O3), 
             aes(x=time,y=SUF1_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_O3), 
             aes(x=time,y=SUF1_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```


```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_O3), 
             aes(x=time,y=SUF7_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_O3), 
             aes(x=time,y=SUF7_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_O3), 
             aes(x=time,y=SUF7_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_O3), 
             aes(x=time,y=SUF7_O3, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```

### PM10
```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_PM10), 
             aes(x=time,y=SUF1_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_PM10), 
             aes(x=time,y=SUF1_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_PM10), 
             aes(x=time,y=SUF1_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_PM10), 
             aes(x=time,y=SUF1_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```


```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_PM10), 
             aes(x=time,y=SUF7_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_PM10), 
             aes(x=time,y=SUF7_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_PM10), 
             aes(x=time,y=SUF7_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_PM10), 
             aes(x=time,y=SUF7_PM10, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```

### SO2

```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF1_SO2), 
             aes(x=time,y=SUF1_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF1_SO2), 
             aes(x=time,y=SUF1_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF1_SO2), 
             aes(x=time,y=SUF1_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF1_SO2), 
             aes(x=time,y=SUF1_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```


```{r, fig.width=5}
p1 <- ggplot(data = df[["impute_1"]] %>% mutate(missing=df[["na"]]$SUF7_SO2), 
             aes(x=time,y=SUF7_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Median Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p2 <- ggplot(data = df[["impute_2"]] %>% mutate(missing=df[["na"]]$SUF7_SO2), 
             aes(x=time,y=SUF7_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Mean Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p3 <- ggplot(data = df[["impute_3"]] %>% mutate(missing=df[["na"]]$SUF7_SO2), 
             aes(x=time,y=SUF7_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Linear Interpolation Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
p4 <- ggplot(data = df[["impute_4"]] %>% mutate(missing=df[["na"]]$SUF7_SO2), 
             aes(x=time,y=SUF7_SO2, color=missing)) + 
      geom_line(aes(group=1)) + xlab("") + ggtitle("Moving Average Imputation") +
      scale_x_datetime(date_breaks = "1 month", date_labels = "%b")
grid.arrange(p1,p2,p3,p4,ncol=1)
```

## Modelling

### Dummy variables
```{r warning=FALSE}
df[["dummy"]] <- dummy_cols(df_raw, select_columns = c("J","Hari")) %>% 
                 mutate(t=seq_along(time)) %>% 
                 select(starts_with("J_"), starts_with("Hari_"),t)
```



```{r}
for (k in var_pollutant){
  name <- paste0(k,"_imp_1")
  df[[name]] <- bind_cols(df[["impute_1"]], df[["dummy"]]) %>% 
                select(time,starts_with(k), starts_with("J_"), starts_with("Hari_"),t)
  name <- paste0(k,"_imp_2")
  df[[name]] <- bind_cols(df[["impute_2"]], df[["dummy"]]) %>% 
                select(time,starts_with(k), starts_with("J_"), starts_with("Hari_"),t)
  name <- paste0(k,"_imp_3")
  df[[name]] <- bind_cols(df[["impute_3"]], df[["dummy"]]) %>% 
                select(time,starts_with(k), starts_with("J_"), starts_with("Hari_"),t)
  name <- paste0(k,"_imp_4")
  df[[name]] <- bind_cols(df[["impute_4"]], df[["dummy"]]) %>% 
                select(time,starts_with(k), starts_with("J_"), starts_with("Hari_"),t)
}
```


```{r}
ind <- df_raw %>% make_ts_splits(.length_test = 48)
ind
```


### Model Fitting and Forecasting
```{r}
applyForecast <- function(data){
  ind <- data %>% make_ts_splits(.length_test = 48)
  train <- data[ind$idx_train,]
  test  <- data[ind$idx_test,]
  yy <- colnames(data)[2]
  f <- paste0(yy,"~.-1-time-Hari_1")
  m <- lm(f, data = train)
  r.squared <- summary(m)$r.squared
  pred <- m %>% predict(test)
  RMSE <- rmse(test[yy] %>% pull(),pred)
  MAPE <- mape(test[yy] %>% pull(),pred)
  sMAPE <- smape(test[yy] %>% pull(),pred)
  p_train <- ggplot(train %>% mutate(predicted=m$fitted.values)) + 
             geom_line(aes_(x=~time,y=as.name(yy))) +  
             geom_line(aes(x=time,y=predicted), color="red")
  p_test  <- ggplot(test %>% mutate(predicted=pred)) + 
             geom_line(aes_(x=~time,y=as.name(yy))) +  
             geom_line(aes(x=time,y=predicted), color="red")
  results <- list(prediction=pred, 
                  r.squared=r.squared,
                  RMSE=RMSE,
                  MAPE=MAPE,
                  sMAPE=sMAPE,
                  plot_train=p_train,
                  plot_test=p_test)
}
```




```{r}
df_pollutant <- names(df)[grepl("imp_",names(df))]
res <- list()
for (name in df_pollutant){res[[name]] <- applyForecast(df[[name]])}
```


## Forecast Comparison
```{r}
dd <- as_tibble(res[[1]][2:5])
for (name in df_pollutant[-1]){
  dd <- dd %>% add_row(as_tibble(res[[name]][2:5]))
}
results <- dd %>% mutate(model=df_pollutant)
```


### CO
```{r}
results[1:8,]
```

### NO2
```{r}
results[9:16,]
```


### O3
```{r}
results[17:24,]
```


### PM10
```{r}
results[25:32,]
```


### SO2
```{r}
results[33:40,]
```











