# Load libraries ####
library(readr)
library(openair)
library(ggplot2)
# Quiet down warnings
oldw <- getOption("warn")
options(warn = -1)
# Paths and constants ####
# ODIN data folder
data_folder <- "~/data/ODIN_data/Tasmania/"
# Load data ####
odin_data <- read_delim(paste0(data_folder,"colocation_post.txt"), 
                        "\t",
                        escape_double = FALSE,
                        col_types = cols(DateTime = col_datetime(format = "%Y-%m-%d %H:%M"), 
                                         OutRH = col_double(), PM10_EPA = col_double(), 
                                         PM25_EPA = col_double(), PM25_H1in = col_double(), 
                                         Rain = col_double(), RainRate = col_double(), 
                                         WD = col_double()), trim_ws = TRUE)
odin_data$date <- odin_data$DateTime
odin_data$DateTime <- NULL
# Move the data to 1 hour averages for analysis
odin_data.1hr <- timeAverage(odin_data,avg.time = '1 hour')
# Extract only EPA PM2.5 data
epa_data <- odin_data.1hr[,c(1:10)]
# Extract only ODIN data
odin_only_data <- odin_data.1hr[,c(1,(11:17))]
# A cross correlation analysis points to ODIN data being 1 hour behind of EPA data 
ccf(odin_data.1hr$PM25_EPA,odin_data.1hr$PM25_H1in,na.action = na.omit)

# Correct the ODIN data
odin_only_data$date <- odin_only_data$date + 3600
odin_data.1hr.fixed <- merge(epa_data,odin_only_data,by = 'date',all = TRUE)
# Check the cross correlation analysis
ccf(odin_data.1hr.fixed$PM25_EPA,odin_data.1hr.fixed$PM25_H1in,na.action = na.omit)

# Plot summaries
timePlot(odin_data.1hr.fixed,
         pollutant = names(odin_data.1hr.fixed)[10:17],
         avg.time = '1 hour',
         group = TRUE)

serials <- names(odin_data.1hr.fixed)[11:17]
correction.coeffs <- data.frame(serialn=names(odin_data.1hr.fixed)[11:17])
correction.coeffs$PM25_EPA.slope <- NA
correction.coeffs$PM25_EPA.intrcpt <- NA
correction.coeffs$PM25_EPA.rsqrd <- NA
nfiles <- length(serials)
# Compare ODIN data against EPA data
for (i in (1:nfiles)){
  c_idx <- which(correction.coeffs$serialn==serials[i])
  od_idx <- which(names(odin_data.1hr.fixed)==serials[i])

  # PM2.5
  pm <- 'PM25_EPA'
  # Correction factors
  summary(eval(parse(text=paste0("correction_fit <- lm(data=odin_data.1hr.fixed, ",pm," ~`",serials[i],"`)"))))
  sum_fit <- summary(correction_fit)
  slope <- sprintf("%0.2f",correction_fit$coefficients[2])
  intrcpt <- sprintf("%0.2f",correction_fit$coefficients[1])
  rsq <- sprintf("%0.2f",sum_fit$r.squared)
  eval(parse(text = paste0("correction.coeffs$",pm,".slope[c_idx] <- correction_fit$coefficients[2]")))
  eval(parse(text = paste0("correction.coeffs$",pm,".intrcpt[c_idx] <- correction_fit$coefficients[1]")))
  eval(parse(text = paste0("correction.coeffs$",pm,".rsqrd[c_idx] <- sum_fit$r.squared")))
  eval(parse(text=paste0("odin_data.1hr.fixed$",serials[i],".fitted <- predict(correction_fit,odin_data.1hr.fixed)")))
  
  timePlot(odin_data.1hr.fixed,
           pollutant = c(pm,serials[i],paste0(serials[i],".fitted")),
           group = TRUE,
           avg.time = '1 hour',
           main = paste0(serials[i],
                         ' (R2 = ',rsq,')\n',
                         slope,' * ODIN + ',
                         intrcpt))
}

knitr::kable(correction.coeffs[,c('serialn',
                                  'PM25_EPA.rsqrd',
                                  'PM25_EPA.slope',
                                  'PM25_EPA.intrcpt')], digits = 2)
serialn PM25_EPA.rsqrd PM25_EPA.slope PM25_EPA.intrcpt
PM25_H1in 0.87 0.65 -1.63
PM25_H1out 0.82 0.81 -0.61
PM25_H2out 0.85 0.56 -0.51
PM25_H3in 0.87 0.45 0.45
PM25_H3out 0.81 0.86 0.38
PM25_H4in 0.76 1.04 0.30
PM25_H4out 0.82 0.94 0.00
readr::write_csv(correction.coeffs[,c('serialn',
                                      'PM25_EPA.rsqrd',
                                      'PM25_EPA.slope',
                                      'PM25_EPA.intrcpt')],path = paste0(data_folder,'odin_corrections_pm2.5.csv'))
timePlot(odin_data.1hr.fixed,
         pollutant = names(odin_data.1hr.fixed)[c(18:24)],
         avg.time = '1 hour',
         group = TRUE)

timePlot(odin_data.1hr.fixed,
         pollutant = names(odin_data.1hr.fixed)[c(10,18:24)],
         avg.time = '1 hour',
         group = TRUE)

#re-enable warnings to continue the session
options(warn = oldw)