Libraries
library(tidyr)
library(readr)
library(ggplot2)
library(knitr)
library(readxl)
library(xlsx)
library(openxlsx)
library(naniar)
Read data
# Load the air-quality example data and turn the tibble returned by
# read_csv() into a plain data.frame.
Air_Quality <- "../data/examples-data/Air_Quality.csv" %>%
  read_csv() %>%
  data.frame()

# Show the first rows as a formatted table.
Air_Quality %>%
  head() %>%
  kable()
Date | Ozone | Solar | Wind | Temp |
---|---|---|---|---|
1976-05-01 | 41 | 190 | 7.4 | 67 |
1976-05-02 | 36 | 118 | 8.0 | 72 |
1976-05-03 | 12 | 149 | 12.6 | 74 |
1976-05-04 | 18 | 313 | 11.5 | 62 |
1976-05-05 | NA | NA | 14.3 | 56 |
1976-05-06 | 28 | NA | 14.9 | 66 |
Missing Values
# Visualise which cells of Air_Quality are missing. sort_miss = TRUE orders
# the columns by how much missing data they contain.
vis_miss(Air_Quality, sort_miss = TRUE) +
  # Rotate the column labels so they do not overlap.
  theme(axis.text.x = element_text(angle = 90)) +
  # The title names the data set that is actually plotted; the previous
  # "Heart Rate" wording was left over from another example.
  labs(title = "Missing values in Air Quality")
Different examples of how to fill missing data
# Backward fill: with .direction = "up" every missing Ozone value takes the
# NEXT observed value further down the column. Missing values at the very end
# of the series have no later observation and stay NA.
df1 <- Air_Quality %>% fill(Ozone, .direction = "up")
# Flag the rows that were missing in the raw data. This must be computed from
# Air_Quality, because df1 no longer contains those NAs.
df1$Ozone_Is_imputed <- is.na(Air_Quality$Ozone)
df1$Date <- as.Date(df1$Date)

# Line through the filled series; imputed points in green, observed points in
# black. Both layers use constant colours, so no colour aesthetic is mapped
# (a mapped colour would be overridden by the constants anyway).
ggplot(df1, aes(x = Date, y = Ozone)) +
  geom_line(color = "black") +
  geom_point(color = ifelse(df1$Ozone_Is_imputed, "#69b3a2", "black")) +
  xlab("") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  # Filling "up" carries the next observation backwards, i.e. a backfill.
  labs(title = "Backfill - Bfill", subtitle = "according to next observed value")
# Forward fill: with .direction = "down" every missing Ozone value takes the
# LAST observed value above it. Missing values at the very start of the
# series have no earlier observation and stay NA.
df2 <- Air_Quality %>% fill(Ozone, .direction = "down")
# Flag the rows that were missing in the raw data. This must be computed from
# Air_Quality, because df2 no longer contains those NAs.
df2$Ozone_Is_imputed <- is.na(Air_Quality$Ozone)
df2$Date <- as.Date(df2$Date)

# Line through the filled series; imputed points in green, observed points in
# black. The point colours come from df2's own flag column so this section
# does not depend on objects created by other sections.
ggplot(df2, aes(x = Date, y = Ozone)) +
  geom_line(color = "black") +
  geom_point(color = ifelse(df2$Ozone_Is_imputed, "#69b3a2", "black")) +
  xlab("") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  # Filling "down" carries the last observation forwards, i.e. a forward fill.
  labs(title = "Forwardfill - Ffill", subtitle = "According to last observed value")
library(imputeTS) # provides na_interpolation()

# Prepare one frame with a proper Date column and a flag marking the rows
# whose Ozone is missing, taken BEFORE any interpolation overwrites the NAs.
# The three copies then differ only in the interpolation method applied.
df3 <- Air_Quality
df3$Date <- as.Date(df3$Date)
df3$Ozone_Is_imputed <- is.na(df3$Ozone)
df4 <- df3
df5 <- df3

# Spline: fits a smooth curve through the observed points. The curve may
# overshoot between points, so imputed values can fall outside the observed
# range.
df3$Ozone <- na_interpolation(df3$Ozone, option = "spline")
# Linear: joins the two observed values on either side of a gap with a
# straight line and reads the missing values off that line.
df4$Ozone <- na_interpolation(df4$Ozone, option = "linear")
# Stine: interpolation following the algorithm of Stineman (1980).
df5$Ozone <- na_interpolation(df5$Ozone, option = "stine")

# Each plot draws the interpolated series as a black line and highlights the
# imputed points in green, using the flag stored in the same data frame.
ggplot(df3, aes(x = Date, y = Ozone)) +
  geom_line(color = "black") +
  geom_point(color = ifelse(df3$Ozone_Is_imputed, "#69b3a2", "black")) +
  xlab("") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Interpolation", subtitle = "Spline")

ggplot(df4, aes(x = Date, y = Ozone)) +
  geom_line(color = "black") +
  geom_point(color = ifelse(df4$Ozone_Is_imputed, "#69b3a2", "black")) +
  xlab("") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Interpolation", subtitle = "Linear")

ggplot(df5, aes(x = Date, y = Ozone)) +
  geom_line(color = "black") +
  geom_point(color = ifelse(df5$Ozone_Is_imputed, "#69b3a2", "black")) +
  xlab("") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "Interpolation", subtitle = "Stine")
library(caret) # preProcess() and its predict() method
library(RANN)  # NOTE(review): loaded for caret's knnImpute; confirm it is required

# Fit the k-nearest-neighbour imputation model. df6 is the fitted preProcess
# object, not a data frame. Per the caret documentation, knnImpute also
# centres and scales the numeric columns, so the Ozone values plotted below
# are standardised rather than in the original units.
df6 <- preProcess(Air_Quality, method = "knnImpute")
# Apply the model: missing values are replaced using the nearest neighbours.
df6_pred <- predict(df6, Air_Quality)
# Flag and dates are taken from the raw data so this section does not depend
# on objects created by the earlier fill examples.
df6_pred$Ozone_Is_imputed <- is.na(Air_Quality$Ozone)
df6_pred$Date <- as.Date(Air_Quality$Date)

# Line through the imputed series; imputed points in green, observed points
# in black.
ggplot(df6_pred, aes(x = Date, y = Ozone)) +
  geom_line(color = "black") +
  geom_point(color = ifelse(df6_pred$Ozone_Is_imputed, "#69b3a2", "black")) +
  xlab("") +
  theme(axis.text.x = element_text(angle = 60, hjust = 1)) +
  labs(title = "KNN Impute", subtitle = "Knn")