# Set the seed ####
set.seed(2001)
##### Load relevant packages #####
library(RPostgreSQL)
## Loading required package: DBI
library(reshape2)
library(ggplot2)
library(openair)
## Loading required package: maps
##
## # maps v3.1: updated 'world': all lakes moved to separate new #
## # 'lakes' database. Type '?world' or 'news(package="maps")'. #
##### Set the working directory DB ####
setwd("~/repositories/cona/DB")
##### Set the path where location info is #####
filepath <- '~/data/CONA/2016'
##### Read the credentials file (hidden file not on the GIT repository) ####
access <- read.delim("./.cona_login", stringsAsFactors = FALSE)
##### Open DATA connection to the DB ####
p <- dbDriver("PostgreSQL")
con<-dbConnect(p,
user=access$user[1],
password=access$pwd[1],
host='penap-data.dyndns.org',
dbname='cona',
port=5432)
# Load only data from ODINs at ECan's site
siteid = 18 # ECan site
data <- dbGetQuery(con," SELECT d.recordtime at time zone 'NZST' as date,
d.value::numeric as pm25, i.serialn as instrument
FROM data.fixed_data as d, admin.sensor as s, admin.instrument as i
WHERE s.id = d.sensorid AND
s.instrumentid = i.id AND
i.name = 'ODIN-SD-3' AND
d.siteid = 18 AND
s.name = 'PM10';")
data$ODIN.PM10 <- data$pm25
data$pm25 <- NULL
wide_data <- dcast(data,date~instrument,value.var = 'ODIN.PM10',fun.aggregate = mean)
# Load ECan's data
ecan.data <- read.csv(paste0(filepath,'/RangioraWinter2016.csv'))
ecan.data$date <- as.POSIXct(ecan.data$Date,format = '%d/%m/%Y %H:%M', tz='Etc/GMT-13')
ecan.data$X <- NULL
names(ecan.data) <- c('Date','Time','ws','wd','wdsd','wssd','wmx','co','dT','T2m','T6m','PM10.FDMS','PM2.5.FDMS','PMcoarse','date')
# Select time frames
min_date <- max(min(data$date),min(ecan.data$date))
max_date <- min(max(data$date),max(ecan.data$date))
# Merge ECan data
alldata <- subset(merge(wide_data,ecan.data,by='date',all = TRUE),(date >= min_date & date <= max_date))
wide_data.1hr <- timeAverage(alldata,avg.time = '1 hour')
## Warning in checkPrep(mydata, vars, type = "default", remove.calm = FALSE, :
## Detected data with Daylight Saving Time, converting to UTC/GMT
timePlot(wide_data.1hr,pollutant = names(wide_data.1hr)[c(2:18,29)],group = TRUE, avg.time = '1 hour')

fit_coeffs <- data.frame(snumber=NA*(1:17),inter=NA,slope=NA)
# Linear fit for ODIN-109
summary(lm_109 <- lm(data = wide_data.1hr,PM10.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM10.FDMS ~ ODIN.109, data = wide_data.1hr)
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.920 -4.443 -0.979 3.615 58.345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.999168 0.229289 17.44 <2e-16 ***
## ODIN.109 0.533678 0.006211 85.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.487 on 2697 degrees of freedom
## (61 observations deleted due to missingness)
## Multiple R-squared: 0.7325, Adjusted R-squared: 0.7324
## F-statistic: 7384 on 1 and 2697 DF, p-value: < 2.2e-16
fit_coeffs$snumber[1]<-'ODIN-109'
intr109 <- coefficients(lm_109)[1]
slp109 <- coefficients(lm_109)[2]
fit_coeffs$inter[1] <- coefficients(lm_109)[1]
fit_coeffs$slope[1] <- coefficients(lm_109)[2]
# Linear fit for all ODINs
j=2
for (i in c('100',
'101',
'102',
'103',
'104',
'105',
'106',
'107',
'108',
'110',
'111',
'112',
'113',
'114',
'115',
'117'))
{
eval(parse(text=paste0('summary(lm_',i,' <- lm(data = wide_data.1hr,ODIN.109 ~ ODIN.',i,'))')))
fit_coeffs$snumber[j]<-paste0('ODIN-',i)
eval(parse(text=paste0('inter <- coefficients(lm_',i,')[1]')))
eval(parse(text=paste0('slope <- coefficients(lm_',i,')[2]')))
eval(parse(text=paste0('fit_coeffs$inter[j] <- intr109 + slp109*inter')))
eval(parse(text=paste0('fit_coeffs$slope[j] <- slope * slp109')))
j=j+1
}
# Linear fit of ODIN-109 against FDMS ####
data.fit109 <- as.data.frame(wide_data.1hr[,c('date','PM10.FDMS','ODIN.109')])
# Whole dataset
summary(lm_109.whole <- lm(data = timeAverage(data.fit109,avg.time = '1 hour'),PM10.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM10.FDMS ~ ODIN.109, data = timeAverage(data.fit109,
## avg.time = "1 hour"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -51.920 -4.443 -0.979 3.615 58.345
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.999168 0.229289 17.44 <2e-16 ***
## ODIN.109 0.533678 0.006211 85.93 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.487 on 2697 degrees of freedom
## (61 observations deleted due to missingness)
## Multiple R-squared: 0.7325, Adjusted R-squared: 0.7324
## F-statistic: 7384 on 1 and 2697 DF, p-value: < 2.2e-16
# First 1/4
summary(lm_109.first3 <- lm(data = timeAverage(data.fit109[1:920,],avg.time = '1 hour'),PM10.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM10.FDMS ~ ODIN.109, data = timeAverage(data.fit109[1:920,
## ], avg.time = "1 hour"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.145 -5.082 -1.351 3.793 57.905
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.63968 0.53094 8.739 <2e-16 ***
## ODIN.109 0.53138 0.01137 46.743 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.96 on 894 degrees of freedom
## (24 observations deleted due to missingness)
## Multiple R-squared: 0.7096, Adjusted R-squared: 0.7093
## F-statistic: 2185 on 1 and 894 DF, p-value: < 2.2e-16
# Third 1/4
summary(lm_109.last3 <- lm(data = timeAverage(data.fit109[1840:2760,],avg.time = '1 hour'),PM10.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM10.FDMS ~ ODIN.109, data = timeAverage(data.fit109[1840:2760,
## ], avg.time = "1 hour"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -17.149 -3.815 -1.063 3.263 37.726
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.49937 0.39383 13.96 <2e-16 ***
## ODIN.109 0.40480 0.01962 20.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.816 on 905 degrees of freedom
## (14 observations deleted due to missingness)
## Multiple R-squared: 0.3198, Adjusted R-squared: 0.3191
## F-statistic: 425.6 on 1 and 905 DF, p-value: < 2.2e-16
# Random 30%
summary(lm_109.rnd3 <- lm(data = timeAverage(data.fit109[sample(nrow(wide_data.1hr),920),],avg.time = '1 hour'),PM10.FDMS ~ ODIN.109))
##
## Call:
## lm(formula = PM10.FDMS ~ ODIN.109, data = timeAverage(data.fit109[sample(nrow(wide_data.1hr),
## 920), ], avg.time = "1 hour"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.675 -4.305 -0.924 3.316 51.940
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.93323 0.39820 9.877 <2e-16 ***
## ODIN.109 0.53820 0.01097 49.057 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.454 on 897 degrees of freedom
## (1861 observations deleted due to missingness)
## Multiple R-squared: 0.7285, Adjusted R-squared: 0.7282
## F-statistic: 2407 on 1 and 897 DF, p-value: < 2.2e-16
# Calculate fit and confidence interval estimates according to the previous segmentation
ODIN.109.whole <- predict(lm_109.whole,newdata = data.fit109,interval = 'confidence')
ODIN.109.first3 <- predict(lm_109.first3,newdata = data.fit109,interval = 'confidence')
ODIN.109.last3 <- predict(lm_109.last3,newdata = data.fit109,interval = 'confidence')
ODIN.109.rnd3 <- predict(lm_109.rnd3,newdata = data.fit109,interval = 'confidence')
data.fit109$ODIN.109.whole <- ODIN.109.whole[,1]
data.fit109$ODIN.109.whole.lwr <- ODIN.109.whole[,2]
data.fit109$ODIN.109.whole.upr <- ODIN.109.whole[,3]
data.fit109$ODIN.109.first3 <- ODIN.109.first3[,1]
data.fit109$ODIN.109.first3.lwr <- ODIN.109.first3[,2]
data.fit109$ODIN.109.first3.upr <- ODIN.109.first3[,3]
data.fit109$ODIN.109.last3 <- ODIN.109.last3[,1]
data.fit109$ODIN.109.last3.lwr <- ODIN.109.last3[,2]
data.fit109$ODIN.109.last3.upr <- ODIN.109.last3[,3]
data.fit109$ODIN.109.rnd3 <- ODIN.109.rnd3[,1]
data.fit109$ODIN.109.rnd3.lwr <- ODIN.109.rnd3[,2]
data.fit109$ODIN.109.rnd3.upr <- ODIN.109.rnd3[,3]
ggplot(data.fit109,aes(x=date)) +
geom_line(aes(y=ODIN.109.last3),colour='red')+
geom_line(aes(y=PM10.FDMS),colour='black')+
geom_line(aes(y=ODIN.109.last3.lwr),colour='red',linetype=3)+
geom_line(aes(y=ODIN.109.last3.upr),colour='red',linetype=3)
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_path).
## Warning: Removed 5 rows containing missing values (geom_path).

# Plot the corrections ####
# Hourly data
timePlot(data.fit109,pollutant = names(data.fit109)[2:7],group = TRUE,avg.time = '1 hour')

# Daily data
timePlot(data.fit109,pollutant = names(data.fit109)[2:7],group = TRUE,avg.time = '1 day')

timePlot(wide_data.1hr[sample(nrow(wide_data.1hr),920),],pollutant = c('PM10.FDMS','ODIN.109'),group = TRUE, avg.time = '1 day')

ggplot(data)+
geom_line(aes(x=date,y=ODIN.PM10,colour = instrument))+
ggtitle('ODIN colocation')+
xlab('Local Date')+
ylab(expression(paste('PM10 [',mu,'g/',m^3)))

# Update metadata table with calibration data ####
sensor_id <- dbGetQuery(con, "SELECT i.serialn, s.id
FROM admin.sensor s,
admin.instrument i
WHERE s.instrumentid = i.id AND
i.name='ODIN-SD-3' AND
s.name = 'PM10';")
for (i in (1:nrow(sensor_id))){
iserial <- sensor_id$serialn[i]
sid <- sensor_id$id[i]
idx_coeffs <- which(fit_coeffs$snumber==iserial)
if (length(idx_coeffs)==1) {
st <- dbGetQuery(con,paste0("INSERT INTO metadata.odin_sd_calibration(sensorid, slope, intercept, valid_from)
VALUES (",sid,",", fit_coeffs$slope[idx_coeffs],",",fit_coeffs$inter[idx_coeffs],",'2015-01-01 00:00:00 NZST');"))
}
}