# Set the seed ####
set.seed(2001)
##### Load relevant packages #####
library(RPostgreSQL)
## Loading required package: DBI
library(reshape2)
library(ggplot2)
library(openair)
## Loading required package: maps
##
## # maps v3.1: updated 'world': all lakes moved to separate new #
## # 'lakes' database. Type '?world' or 'news(package="maps")'. #
##### Set the working directory DB ####
setwd("~/repositories/cona/DB")
##### Set the path where location info is #####
filepath <- '~/data/CONA/2016'
##### Read the credentials file (hidden file not on the GIT repository) ####
access <- read.delim("./.cona_login", stringsAsFactors = FALSE)
##### Open DATA connection to the DB ####
p <- dbDriver("PostgreSQL")
con<-dbConnect(p,
user=access$user[1],
password=access$pwd[1],
host='penap-data.dyndns.org',
dbname='cona',
port=5432)
# Load only data from ODINs at ECan's site
siteid = 18 # ECan site
data <- dbGetQuery(con," SELECT d.recordtime at time zone 'NZST' as date,
d.value::numeric as pm25, i.serialn as instrument
FROM data.fixed_data as d, admin.sensor as s, admin.instrument as i
WHERE s.id = d.sensorid AND
s.instrumentid = i.id AND
i.name = 'ODIN-SD-3' AND
d.siteid = 18 AND
s.name = 'PM2.5';")
data$ODIN.PM2.5 <- data$pm25
data$pm25 <- NULL
wide_data <- dcast(data,date~instrument,value.var = 'ODIN.PM2.5',fun.aggregate = mean)
# Load ECan's data
ecan.data <- read.csv(paste0(filepath,'/RangioraWinter2016.csv'))
ecan.data$date <- as.POSIXct(ecan.data$Date,format = '%d/%m/%Y %H:%M', tz='Etc/GMT-13')
ecan.data$X <- NULL
names(ecan.data) <- c('Date','Time','ws','wd','wdsd','wssd','wmx','co','dT','T2m','T6m','PM10.FDMS','PM2.5.FDMS','PMcoarse','date')
# Select time frames
min_date <- max(min(data$date),min(ecan.data$date))
max_date <- min(max(data$date),max(ecan.data$date))
# Merge ECan data
alldata <- subset(merge(wide_data,ecan.data,by='date',all = TRUE),(date >= min_date & date <= max_date))
wide_data.1hr <- timeAverage(alldata,avg.time = '1 hour')
## Warning in checkPrep(mydata, vars, type = "default", remove.calm = FALSE, :
## Detected data with Daylight Saving Time, converting to UTC/GMT
timePlot(wide_data.1hr,pollutant = names(wide_data.1hr)[c(2:18,29)],group = TRUE, avg.time = '1 hour')

fit_coeffs <- data.frame(snumber=NA*(1:17),inter=NA,slope=NA)
# Linear fit for ODIN-109
summary(lm_109 <- lm(data = wide_data.1hr,PM2.5.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = wide_data.1hr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.948 -2.261 -0.309 1.699 48.375
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.5740 0.1368 11.5 <2e-16 ***
## ODIN.109 0.7630 0.0071 107.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.031 on 2699 degrees of freedom
## (59 observations deleted due to missingness)
## Multiple R-squared: 0.8106, Adjusted R-squared: 0.8105
## F-statistic: 1.155e+04 on 1 and 2699 DF, p-value: < 2.2e-16
fit_coeffs$snumber[1]<-'ODIN-109'
intr109 <- coefficients(lm_109)[1]
slp109 <- coefficients(lm_109)[2]
fit_coeffs$inter[1] <- coefficients(lm_109)[1]
fit_coeffs$slope[1] <- coefficients(lm_109)[2]
# Linear fit for all ODINs
j=2
for (i in c('100',
'101',
'102',
'103',
'104',
'105',
'106',
'107',
'108',
'110',
'111',
'112',
'113',
'114',
'115',
'117'))
{
eval(parse(text=paste0('summary(lm_',i,' <- lm(data = wide_data.1hr,ODIN.109 ~ ODIN.',i,'))')))
fit_coeffs$snumber[j]<-paste0('ODIN-',i)
eval(parse(text=paste0('inter <- coefficients(lm_',i,')[1]')))
eval(parse(text=paste0('slope <- coefficients(lm_',i,')[2]')))
eval(parse(text=paste0('fit_coeffs$inter[j] <- intr109 + slp109*inter')))
eval(parse(text=paste0('fit_coeffs$slope[j] <- slope * slp109')))
j=j+1
}
# Linear fit of ODIN-109 against FDMS ####
data.fit109 <- as.data.frame(wide_data.1hr[,c('date','PM2.5.FDMS','ODIN.109')])
# Whole dataset
summary(lm_109.whole <- lm(data = timeAverage(data.fit109,avg.time = '1 hour'),PM2.5.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = timeAverage(data.fit109,
## avg.time = "1 hour"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.948 -2.261 -0.309 1.699 48.375
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.5740 0.1368 11.5 <2e-16 ***
## ODIN.109 0.7630 0.0071 107.5 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.031 on 2699 degrees of freedom
## (59 observations deleted due to missingness)
## Multiple R-squared: 0.8106, Adjusted R-squared: 0.8105
## F-statistic: 1.155e+04 on 1 and 2699 DF, p-value: < 2.2e-16
# First 1/4
summary(lm_109.first3 <- lm(data = timeAverage(data.fit109[1:920,],avg.time = '1 hour'),PM2.5.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = timeAverage(data.fit109[1:920,
## ], avg.time = "1 hour"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -42.682 -3.335 -0.789 2.331 47.237
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.0198 0.3505 5.763 1.14e-08 ***
## ODIN.109 0.7753 0.0139 55.776 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.329 on 895 degrees of freedom
## (23 observations deleted due to missingness)
## Multiple R-squared: 0.7766, Adjusted R-squared: 0.7763
## F-statistic: 3111 on 1 and 895 DF, p-value: < 2.2e-16
# Third 1/4
summary(lm_109.last3 <- lm(data = timeAverage(data.fit109[1840:2760,],avg.time = '1 hour'),PM2.5.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = timeAverage(data.fit109[1840:2760,
## ], avg.time = "1 hour"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.601 -1.577 -0.285 1.174 40.864
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.29861 0.13644 16.85 <2e-16 ***
## ODIN.109 0.49803 0.01855 26.85 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.143 on 907 degrees of freedom
## (12 observations deleted due to missingness)
## Multiple R-squared: 0.4428, Adjusted R-squared: 0.4422
## F-statistic: 720.8 on 1 and 907 DF, p-value: < 2.2e-16
# Random 30%
summary(lm_109.rnd3 <- lm(data = timeAverage(data.fit109[sample(nrow(wide_data.1hr),920),],avg.time = '1 hour'),PM2.5.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN.109, data = timeAverage(data.fit109[sample(nrow(wide_data.1hr),
## 920), ], avg.time = "1 hour"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -41.934 -2.295 -0.352 1.616 47.889
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.47947 0.24356 6.074 1.84e-09 ***
## ODIN.109 0.77335 0.01289 60.010 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.185 on 899 degrees of freedom
## (1859 observations deleted due to missingness)
## Multiple R-squared: 0.8002, Adjusted R-squared: 0.8
## F-statistic: 3601 on 1 and 899 DF, p-value: < 2.2e-16
# Calculate fit and confidence interval estimates according to the previous segmentation
ODIN.109.whole <- predict(lm_109.whole,newdata = data.fit109,interval = 'confidence')
ODIN.109.first3 <- predict(lm_109.first3,newdata = data.fit109,interval = 'confidence')
ODIN.109.last3 <- predict(lm_109.last3,newdata = data.fit109,interval = 'confidence')
ODIN.109.rnd3 <- predict(lm_109.rnd3,newdata = data.fit109,interval = 'confidence')
data.fit109$ODIN.109.whole <- ODIN.109.whole[,1]
data.fit109$ODIN.109.whole.lwr <- ODIN.109.whole[,2]
data.fit109$ODIN.109.whole.upr <- ODIN.109.whole[,3]
data.fit109$ODIN.109.first3 <- ODIN.109.first3[,1]
data.fit109$ODIN.109.first3.lwr <- ODIN.109.first3[,2]
data.fit109$ODIN.109.first3.upr <- ODIN.109.first3[,3]
data.fit109$ODIN.109.last3 <- ODIN.109.last3[,1]
data.fit109$ODIN.109.last3.lwr <- ODIN.109.last3[,2]
data.fit109$ODIN.109.last3.upr <- ODIN.109.last3[,3]
data.fit109$ODIN.109.rnd3 <- ODIN.109.rnd3[,1]
data.fit109$ODIN.109.rnd3.lwr <- ODIN.109.rnd3[,2]
data.fit109$ODIN.109.rnd3.upr <- ODIN.109.rnd3[,3]
ggplot(data.fit109,aes(x=date)) +
geom_line(aes(y=ODIN.109.last3),colour='red')+
geom_line(aes(y=PM2.5.FDMS),colour='black')+
geom_line(aes(y=ODIN.109.last3.lwr),colour='red',linetype=3)+
geom_line(aes(y=ODIN.109.last3.upr),colour='red',linetype=3)
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_path).

# Plot the corrections ####
# Hourly data
timePlot(data.fit109,pollutant = names(data.fit109)[2:7],group = TRUE,avg.time = '1 hour')

# Daily data
timePlot(data.fit109,pollutant = names(data.fit109)[2:7],group = TRUE,avg.time = '1 day')

timePlot(wide_data.1hr[sample(nrow(wide_data.1hr),920),],pollutant = c('PM2.5.FDMS','ODIN.109'),group = TRUE, avg.time = '1 day')

ggplot(data)+
geom_line(aes(x=date,y=ODIN.PM2.5,colour = instrument))+
ggtitle('ODIN colocation')+
xlab('Local Date')+
ylab(expression(paste('PM2.5 [',mu,'g/',m^3)))

# Update metadata table with calibration data ####
sensor_id <- dbGetQuery(con, "SELECT i.serialn, s.id
FROM admin.sensor s,
admin.instrument i
WHERE s.instrumentid = i.id AND
i.name='ODIN-SD-3' AND
s.name = 'PM2.5';")
for (i in (1:nrow(sensor_id))){
iserial <- sensor_id$serialn[i]
sid <- sensor_id$id[i]
idx_coeffs <- which(fit_coeffs$snumber==iserial)
if (length(idx_coeffs)==1) {
st <- dbGetQuery(con,paste0("INSERT INTO metadata.odin_sd_calibration(sensorid, slope, intercept, valid_from)
VALUES (",sid,",", fit_coeffs$slope[idx_coeffs],",",fit_coeffs$inter[idx_coeffs],",'2015-01-01 00:00:00 NZST');"))
}
}