Libraries

library(tidyr)
library(readr)
library(ggplot2)
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(naniar)

Filling Missing Values in a Time Series: An Example

Read data

# Read the air-quality example CSV, then convert the result of read_csv()
# into a base data.frame.
Air_Quality <- read_csv("../data/examples-data/Air_Quality.csv")
Air_Quality <- data.frame(Air_Quality)
# Preview the first six rows as a formatted table.
kable(head(Air_Quality, n = 6))
Date Ozone Solar Wind Temp
1976-05-01 41 190 7.4 67
1976-05-02 36 118 8.0 72
1976-05-03 12 149 12.6 74
1976-05-04 18 313 11.5 62
1976-05-05 NA NA 14.3 56
1976-05-06 28 NA 14.9 66

Missing Values

# Overview of missingness per column, with the columns that have the most
# missing values shown first. X-axis labels are rotated so they do not overlap.
vis_miss(Air_Quality, sort_miss = TRUE) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Missing values in Air Quality")

Different examples of how to fill missing data

1. Forward fill (ffill): fill each gap with the last observed value

# Forward fill: carry the last observed Ozone value forward into each gap.
# In tidyr::fill() this is .direction = "down"; "up" would instead pull the
# next observed value backwards, which is a backfill.
df1 <- Air_Quality %>% fill(Ozone, .direction = "down")
# Flag the rows whose Ozone was missing in the raw data (the imputed points).
df1$Ozone_Is_imputed <- is.na(Air_Quality$Ozone)

df1 <- data.frame(df1)
df1$Date <- as.Date(df1$Date)
# Line through the filled series; imputed points in green, observed in black.
ggplot(df1, aes(x = Date, y = Ozone)) +
  geom_line(color = "black") + xlab("") +
  geom_point(color = ifelse(df1$Ozone_Is_imputed, "#69b3a2", "black")) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(
    title = "Forwardfill - Ffill",
    subtitle = "according to last observed value"
  )

2. Backfill (bfill): fill each gap with the next observed value

# Backfill: pull the next observed Ozone value backwards into each gap.
# In tidyr::fill() this is .direction = "up"; "down" would be a forward fill.
df2 <- Air_Quality %>% fill(Ozone, .direction = "up")
# Flag the rows whose Ozone was missing in the raw data (the imputed points).
df2$Ozone_Is_imputed <- is.na(Air_Quality$Ozone)

df2 <- data.frame(df2)
df2$Date <- as.Date(df2$Date)
# Line through the filled series; imputed points in green, observed in black.
# The colours are taken from df2 itself so this section does not rely on df1.
ggplot(df2, aes(x = Date, y = Ozone)) +
  geom_line(color = "black") + xlab("") +
  geom_point(color = ifelse(df2$Ozone_Is_imputed, "#69b3a2", "black")) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(
    title = "Backfill - Bfill",
    subtitle = "According to next observed value"
  )

3. Interpolation

library(imputeTS) # provides na_interpolation()

# Rows whose Ozone was missing in the raw data. The flag is computed here from
# Air_Quality so this section does not depend on objects built in other
# sections.
ozone_was_missing <- is.na(Air_Quality$Ozone)

# Plot one interpolated series: a black line through all points, with the
# imputed points highlighted in green and the observed points in black.
#   data    - data frame with Date and (already imputed) Ozone columns
#   imputed - logical vector, TRUE where Ozone was originally missing
#   method  - label of the interpolation method, used as the plot subtitle
plot_interpolated_ozone <- function(data, imputed, method) {
  data$Date <- as.Date(data$Date)
  ggplot(data, aes(x = Date, y = Ozone)) +
    geom_line(color = "black") + xlab("") +
    geom_point(color = ifelse(imputed, "#69b3a2", "black")) +
    theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
    labs(title = "Interpolation", subtitle = method)
}

df3 <- Air_Quality
df4 <- Air_Quality
df5 <- Air_Quality

# Spline: fits a smooth curve through the observed points. Because the curve
# bends, it can over- or undershoot the neighbouring values in wide gaps.
df3$Ozone <- na_interpolation(df3$Ozone, option = "spline")

# Linear: draws a straight line between the two observed values on either
# side of a gap and fills the gap with equally spaced values along it.
df4$Ozone <- na_interpolation(df4$Ozone, option = "linear")

# Stine: interpolation through the observed points following the algorithm
# of Stineman (1980).
df5$Ozone <- na_interpolation(df5$Ozone, option = "stine")

plot_interpolated_ozone(df3, ozone_was_missing, "Spline")

plot_interpolated_ozone(df4, ozone_was_missing, "Linear")

plot_interpolated_ozone(df5, ozone_was_missing, "Stine")

4. KNN Impute

library(caret)
library(RANN)
# Fit a k-nearest-neighbour imputation model on the data, then apply it to
# the same data to fill the gaps.
# NOTE(review): per the caret documentation, "knnImpute" also centers and
# scales the numeric columns, so the plotted Ozone values are presumably
# standardized rather than in the original units - confirm before comparing
# this plot with the other methods.
df6 <- preProcess(Air_Quality, method = "knnImpute")
df6_pred <- predict(df6, Air_Quality)
# Flag the rows whose Ozone was missing in the raw data (the imputed points).
df6_pred$Ozone_Is_imputed <- is.na(Air_Quality$Ozone)
# Take the dates from the source data so this section does not rely on df1.
df6_pred$Date <- as.Date(Air_Quality$Date)
# Line through the imputed series; imputed points in green, observed in black.
ggplot(df6_pred, aes(x = Date, y = Ozone)) +
  geom_line(color = "black") + xlab("") +
  geom_point(color = ifelse(df6_pred$Ozone_Is_imputed, "#69b3a2", "black")) +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "KNN Impute", subtitle = "Knn")