# Set the seed ####
set.seed(2001)
##### Load relevant packages #####
library(RPostgreSQL)
## Loading required package: DBI
library(reshape2)
library(ggplot2)
library(openair)
## Loading required package: maps
## 
##  # maps v3.1: updated 'world': all lakes moved to separate new #
##  # 'lakes' database. Type '?world' or 'news(package="maps")'.  #
##### Set the working directory DB ####
setwd("~/repositories/cona/DB")
##### Set the path where location info is #####
filepath <- '~/data/CONA/2016'
##### Read the credentials file (hidden file not on the GIT repository) ####
access <- read.delim("./.cona_login", stringsAsFactors = FALSE)
##### Open DATA connection to the DB ####
p <- dbDriver("PostgreSQL")
con<-dbConnect(p,
               user=access$user[1],
               password=access$pwd[1],
               host='penap-data.dyndns.org',
               dbname='cona',
               port=5432)
# Load only data from ODINs at ECan's site
siteid = 18 # ECan site
data <- dbGetQuery(con," SELECT d.recordtime at time zone 'NZST' as date,
                          d.value::numeric as pm25, i.serialn as instrument
                         FROM data.fixed_data as d, admin.sensor as s, admin.instrument as i
                         WHERE s.id = d.sensorid AND
                           s.instrumentid = i.id AND
                           i.name = 'ODIN-SD-3' AND
                           d.siteid = 18 AND
                           s.name = 'PM2.5';")
data$ODIN.PM2.5 <- data$pm25
data$pm25 <- NULL
wide_data <- dcast(data,date~instrument,value.var = 'ODIN.PM2.5',fun.aggregate = mean)

# Load ECan's data
ecan.data <- read.csv(paste0(filepath,'/RangioraWinter2016.csv'))
ecan.data$date <- as.POSIXct(ecan.data$Date,format = '%d/%m/%Y %H:%M', tz='Etc/GMT-13')
ecan.data$X <- NULL
names(ecan.data) <- c('Date','Time','ws','wd','wdsd','wssd','wmx','co','dT','T2m','T6m','PM10.FDMS','PM2.5.FDMS','PMcoarse','date')

# Select time frames
min_date <- max(min(data$date),min(ecan.data$date))
max_date <- min(max(data$date),max(ecan.data$date))

# Merge ECan data
alldata <- subset(merge(wide_data,ecan.data,by='date',all = TRUE),(date >= min_date & date <= max_date))
wide_data.1hr <- timeAverage(alldata,avg.time = '1 hour')
## Warning in checkPrep(mydata, vars, type = "default", remove.calm = FALSE, :
## Detected data with Daylight Saving Time, converting to UTC/GMT
timePlot(wide_data.1hr,pollutant = names(wide_data.1hr)[c(2:18,29)],group = TRUE, avg.time = '1 hour')

fit_coeffs <- data.frame(snumber=NA*(1:17),inter=NA,slope=NA)

# Linear fit for ODIN-109
summary(lm_109 <- lm(data = wide_data.1hr,PM2.5.FDMS ~ ODIN.109))
## 
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = wide_data.1hr)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.948  -2.261  -0.309   1.699  48.375 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.5740     0.1368    11.5   <2e-16 ***
## ODIN.109      0.7630     0.0071   107.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.031 on 2699 degrees of freedom
##   (59 observations deleted due to missingness)
## Multiple R-squared:  0.8106, Adjusted R-squared:  0.8105 
## F-statistic: 1.155e+04 on 1 and 2699 DF,  p-value: < 2.2e-16
fit_coeffs$snumber[1]<-'ODIN-109'
intr109 <- coefficients(lm_109)[1]
slp109 <- coefficients(lm_109)[2]
fit_coeffs$inter[1] <- coefficients(lm_109)[1]
fit_coeffs$slope[1] <- coefficients(lm_109)[2]

# Linear fit for all ODINs
j=2
for (i in c('100',
            '101',
            '102',
            '103',
            '104',
            '105',
            '106',
            '107',
            '108',
            '110',
            '111',
            '112',
            '113',
            '114',
            '115',
            '117'))
{
  eval(parse(text=paste0('summary(lm_',i,' <- lm(data = wide_data.1hr,ODIN.109 ~ ODIN.',i,'))')))
  fit_coeffs$snumber[j]<-paste0('ODIN-',i)
  eval(parse(text=paste0('inter <- coefficients(lm_',i,')[1]')))
  eval(parse(text=paste0('slope <- coefficients(lm_',i,')[2]')))
  eval(parse(text=paste0('fit_coeffs$inter[j] <- intr109 + slp109*inter')))
  eval(parse(text=paste0('fit_coeffs$slope[j] <- slope * slp109')))
  j=j+1
}


# Linear fit of ODIN-109 against FDMS ####
data.fit109 <- as.data.frame(wide_data.1hr[,c('date','PM2.5.FDMS','ODIN.109')])
# Whole dataset
summary(lm_109.whole <- lm(data = timeAverage(data.fit109,avg.time = '1 hour'),PM2.5.FDMS ~ ODIN.109))
## 
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = timeAverage(data.fit109, 
##     avg.time = "1 hour"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -40.948  -2.261  -0.309   1.699  48.375 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.5740     0.1368    11.5   <2e-16 ***
## ODIN.109      0.7630     0.0071   107.5   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.031 on 2699 degrees of freedom
##   (59 observations deleted due to missingness)
## Multiple R-squared:  0.8106, Adjusted R-squared:  0.8105 
## F-statistic: 1.155e+04 on 1 and 2699 DF,  p-value: < 2.2e-16
# First 1/4
summary(lm_109.first3 <- lm(data = timeAverage(data.fit109[1:920,],avg.time = '1 hour'),PM2.5.FDMS ~ ODIN.109))
## 
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = timeAverage(data.fit109[1:920, 
##     ], avg.time = "1 hour"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -42.682  -3.335  -0.789   2.331  47.237 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   2.0198     0.3505   5.763 1.14e-08 ***
## ODIN.109      0.7753     0.0139  55.776  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 8.329 on 895 degrees of freedom
##   (23 observations deleted due to missingness)
## Multiple R-squared:  0.7766, Adjusted R-squared:  0.7763 
## F-statistic:  3111 on 1 and 895 DF,  p-value: < 2.2e-16
# Third 1/4
summary(lm_109.last3 <- lm(data = timeAverage(data.fit109[1840:2760,],avg.time = '1 hour'),PM2.5.FDMS ~ ODIN.109))
## 
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = timeAverage(data.fit109[1840:2760, 
##     ], avg.time = "1 hour"))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -9.601 -1.577 -0.285  1.174 40.864 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.29861    0.13644   16.85   <2e-16 ***
## ODIN.109     0.49803    0.01855   26.85   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.143 on 907 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.4428, Adjusted R-squared:  0.4422 
## F-statistic: 720.8 on 1 and 907 DF,  p-value: < 2.2e-16
# Random 30%
summary(lm_109.rnd3 <- lm(data = timeAverage(data.fit109[sample(nrow(wide_data.1hr),920),],avg.time = '1 hour'),PM2.5.FDMS ~ ODIN.109))
## 
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = timeAverage(data.fit109[sample(nrow(wide_data.1hr), 
##     920), ], avg.time = "1 hour"))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -41.934  -2.295  -0.352   1.616  47.889 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.47947    0.24356   6.074 1.84e-09 ***
## ODIN.109     0.77335    0.01289  60.010  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.185 on 899 degrees of freedom
##   (1859 observations deleted due to missingness)
## Multiple R-squared:  0.8002, Adjusted R-squared:    0.8 
## F-statistic:  3601 on 1 and 899 DF,  p-value: < 2.2e-16
# Calculate fit and confidence interval estimates according to the previous segmentation

ODIN.109.whole <- predict(lm_109.whole,newdata = data.fit109,interval = 'confidence')
ODIN.109.first3 <- predict(lm_109.first3,newdata = data.fit109,interval = 'confidence')
ODIN.109.last3 <- predict(lm_109.last3,newdata = data.fit109,interval = 'confidence')
ODIN.109.rnd3 <- predict(lm_109.rnd3,newdata = data.fit109,interval = 'confidence')

data.fit109$ODIN.109.whole <- ODIN.109.whole[,1]
data.fit109$ODIN.109.whole.lwr <- ODIN.109.whole[,2]
data.fit109$ODIN.109.whole.upr <- ODIN.109.whole[,3]
data.fit109$ODIN.109.first3 <- ODIN.109.first3[,1]
data.fit109$ODIN.109.first3.lwr <- ODIN.109.first3[,2]
data.fit109$ODIN.109.first3.upr <- ODIN.109.first3[,3]
data.fit109$ODIN.109.last3 <- ODIN.109.last3[,1]
data.fit109$ODIN.109.last3.lwr <- ODIN.109.last3[,2]
data.fit109$ODIN.109.last3.upr <- ODIN.109.last3[,3]
data.fit109$ODIN.109.rnd3 <- ODIN.109.rnd3[,1]
data.fit109$ODIN.109.rnd3.lwr <- ODIN.109.rnd3[,2]
data.fit109$ODIN.109.rnd3.upr <- ODIN.109.rnd3[,3]

ggplot(data.fit109,aes(x=date)) +
  geom_line(aes(y=ODIN.109.last3),colour='red')+
  geom_line(aes(y=PM2.5.FDMS),colour='black')+
  geom_line(aes(y=ODIN.109.last3.lwr),colour='red',linetype=3)+
  geom_line(aes(y=ODIN.109.last3.upr),colour='red',linetype=3)
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_path).

## Warning: Removed 5 rows containing missing values (geom_path).

# Plot the corrections ####
# Hourly data
timePlot(data.fit109,pollutant = names(data.fit109)[2:7],group = TRUE,avg.time = '1 hour')

# Daily data
timePlot(data.fit109,pollutant = names(data.fit109)[2:7],group = TRUE,avg.time = '1 day')

timePlot(wide_data.1hr[sample(nrow(wide_data.1hr),920),],pollutant = c('PM2.5.FDMS','ODIN.109'),group = TRUE, avg.time = '1 day')

ggplot(data)+
  geom_line(aes(x=date,y=ODIN.PM2.5,colour = instrument))+
  ggtitle('ODIN colocation')+
  xlab('Local Date')+
  ylab(expression(paste('PM2.5 [',mu,'g/',m^3)))

# Update metadata table with calibration data ####
sensor_id <- dbGetQuery(con, "SELECT i.serialn, s.id
                              FROM admin.sensor s,
                                admin.instrument i
                              WHERE s.instrumentid = i.id AND
                                i.name='ODIN-SD-3' AND
                                s.name = 'PM2.5';")
for (i in (1:nrow(sensor_id))){
  iserial <- sensor_id$serialn[i]
  sid <- sensor_id$id[i]
  idx_coeffs <- which(fit_coeffs$snumber==iserial)
  if (length(idx_coeffs)==1) {
    st <- dbGetQuery(con,paste0("INSERT INTO metadata.odin_sd_calibration(sensorid, slope, intercept, valid_from)
VALUES (",sid,",", fit_coeffs$slope[idx_coeffs],",",fit_coeffs$inter[idx_coeffs],",'2015-01-01 00:00:00 NZST');"))
  }
}