Load data

library(RPostgreSQL)
## Loading required package: DBI
library(rpart)
library(reshape2)
# library(openair)
# library(lubridate)
library(zoo)
## Warning: package 'zoo' was built under R version 3.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)

# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\repo")

# Database login details (not in GitHub repo)
load("access.Rda")

# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
                 user=access$user,
                 password=access$pwd,
                 host='penap-data.dyndns.org',
                 dbname='cona',
                 port=5432)
# Load ECan data (stored locally)
ecan <- read.csv('RangioraWinter2016.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','time','wind.speed','wind.dir','wind.dir.std','wind.speed.std',
                 'wind.max','co','temp.ground','temp.2m','temp.6m','pm10',
                 'pm2.5','pm.course')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
# ecan$date <- ecan$date - 60*60
ecan$time <- NULL # this is included in date
# There are some NA's in PM2.5. Set these to zero.
# ecan$pm2.5[which(is.na(ecan$pm2.5))] <- 0
head(ecan)
##                  date wind.speed wind.dir wind.dir.std wind.speed.std
## 1 2016-04-01 00:10:00       0.72    353.9         43.0           0.24
## 2 2016-04-01 00:20:00       0.71    319.6         19.7           0.27
## 3 2016-04-01 00:30:00       0.63    305.4          4.0           0.36
## 4 2016-04-01 00:40:00       1.39    302.9         11.5           0.33
## 5 2016-04-01 00:50:00       0.91    312.4         21.3           0.45
## 6 2016-04-01 01:00:00       1.12    311.6         15.0           0.42
##   wind.max   co temp.ground temp.2m temp.6m  pm10 pm2.5 pm.course
## 1     1.35 0.16       -0.01   11.12   11.13 12.54  4.61      7.93
## 2     1.35 0.15        0.03   11.08   11.06 13.05  5.92      7.13
## 3     1.55 0.14        0.06   11.05   11.00 13.91  6.98      6.92
## 4     2.25 0.10        0.08   11.04   10.95 14.46  6.18      8.28
## 5     2.25 0.10        0.10   10.99   10.90 15.17  5.59      9.58
## 6     2.25 0.09        0.09   10.99   10.90 15.56  6.12      9.45
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
                            i.serialn as serial, s.name as label,
                            d.value::numeric as val
                           FROM data.fixed_data as d,  
                            admin.sensor as s, admin.instrument as i
                           WHERE s.id = d.sensorid
                            AND s.instrumentid = i.id
                            AND i.name = 'ODIN-SD-3'
                            AND d.siteid = 18
                            AND (i.serialn = 'ODIN-109')
                            AND (d.recordtime BETWEEN '2016-09-25 01:50 NZST'
                             AND '2016-09-25 03:10 NZST')
                            AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
                             AND '2016-09-25 03:00 NZST')
                            AND (s.name = 'PM2.5'
                             OR s.name = 'PM10'
                             OR s.name = 'Temperature'
                             OR s.name = 'RH')
                           ORDER BY date;") 
odin.raw$date <- trunc(odin.raw$date,'min') # get rid of the seconds
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt

odin <- dcast(odin.raw, date~label, value.var='val', fun.aggregate=mean)
x <- odin$date
min <- x[2] - x[1]
y <- ecan[ecan$date >= x[1]-10*min & ecan$date <= x[length(x)]+10*min,'date']
x
##  [1] "2016-09-25 01:50:00 NZST" "2016-09-25 01:51:00 NZST"
##  [3] "2016-09-25 01:52:00 NZST" "2016-09-25 01:53:00 NZST"
##  [5] "2016-09-25 01:54:00 NZST" "2016-09-25 01:55:00 NZST"
##  [7] "2016-09-25 01:56:00 NZST" "2016-09-25 01:57:00 NZST"
##  [9] "2016-09-25 01:58:00 NZST" "2016-09-25 01:59:00 NZST"
## [11] "2016-09-25 03:00:00 NZDT" "2016-09-25 03:01:00 NZDT"
## [13] "2016-09-25 03:02:00 NZDT" "2016-09-25 03:03:00 NZDT"
## [15] "2016-09-25 03:04:00 NZDT" "2016-09-25 03:05:00 NZDT"
## [17] "2016-09-25 03:06:00 NZDT" "2016-09-25 03:07:00 NZDT"
## [19] "2016-09-25 03:08:00 NZDT" "2016-09-25 03:09:00 NZDT"
y
## [1] "2016-09-25 01:40:00 +12" "2016-09-25 01:50:00 +12"
## [3] "2016-09-25 02:00:00 +12" "2016-09-25 02:10:00 +12"
i <- x[11] # 3:00 NZDT
j <- y[3] # 2:00 GMT+12
i == j # TRUE
## [1] TRUE
# everything seems fine
# Load ODIN measurements from June 2017
siteid = 18 # ecan site
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
                            i.serialn as serial, s.name as label,
                            d.value::numeric as val
                           FROM data.fixed_data as d,  
                            admin.sensor as s, admin.instrument as i
                           WHERE s.id = d.sensorid
                            AND s.instrumentid = i.id
                            AND i.name = 'ODIN-SD-3'
                            AND d.siteid = 18
                            AND (i.serialn = 'ODIN-109')
                            AND (d.recordtime BETWEEN '2016-01-01 00:00 NZST'
                             AND '2017-01-01 00:00 NZST')
                            AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
                             AND '2016-09-25 03:00 NZST')
                            AND (s.name = 'PM2.5'
                             OR s.name = 'PM10'
                             OR s.name = 'Temperature'
                             OR s.name = 'RH')
                           ORDER BY date;") 

# Time periods including timestamps between 2016-09-25 02:01 and 02:59 (daylight savings)
# daylight savings should also have weirdness in 3 april (3am went to 2am)
# data starts in 
# will truncate timestamps at 'day'.

odin.raw$date <- trunc(odin.raw$date,'min') # get rid of the seconds
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt
# Create a data frame for each ODIN
serials <- unique(odin.raw$serial)
odins <- list()
for (i in 1:length(serials)) {
  this.odin <- dcast(odin.raw[odin.raw$serial==serials[i],], 
                     date~label, value.var='val', fun.aggregate=mean)
  # Make column names easier to type
  names(this.odin) <- tolower(names(this.odin))
  names(this.odin)[which(names(this.odin)=="temperature")] <- "temp"
  odins[[i]] <- this.odin
}
# change ODIN serials from ODIN-101 to odin.101
# the hyphens cause problems by being interpreted as minuses
tidy.serial <- function(serialn) paste0('odin.',substring(serialn,6,8))
names(odins) <- sapply(serials,FUN=tidy.serial)

odin <- odins[[1]]
# change the ODIN measurements from 1 min averages to 1 hour averages

# make sure that there is a odin entry for each minute
dates <- data.frame(date=seq(odin$date[1],odin$date[nrow(odin)],by='mins'))
odin <- merge(odin,dates,by="date",all=TRUE)
if (nrow(odin) != nrow(dates))
  print("Oh noes! Gaps in ODIN data!")
# if there are gaps merge "dates" with odin

odin.zoo <- zoo( odin[,2:ncol(odin)] ) # remove date because rollapply only works on numerics
odin.roll.apply <- rollapply(odin.zoo,width=60,by=1,FUN=mean,align="left",na.rm=TRUE)
# NB I don't know what the 'align' parameter does
# Throw out data for which there is not a complete hour window
odin.avgs <- odin[1:nrow(odin.roll.apply),]
odin.avgs[,2:ncol(odin)] <- odin.roll.apply
# NB if there are gaps in the data then this will be screwed up
odin.avgs$date <- odin.avgs$date + 60*60 # take timestamps at the end of the hour average following ECan convention

# split into months
odin <- odin.avgs
odin$datel <- as.POSIXlt(odin$date)
months <- unique(odin$datel$mon)

# Find best correlation between ECan and ODIN

offsets <- -60:60 # 0:120
blank <- rep(NA,length(offset))

for (month in months) {
  odin.month <- odin[odin$datel$mon==month,]
  corr.df <- data.frame(offset=offsets,temp=blank,pm2.5=blank,pm10=blank)
  for (i in 1:nrow(corr.df)) {
    offset <- corr.df[i,'offset']
    odin.i <- odin.month
    odin.i$date <- odin.i$date + offset*60
    names(odin.i) <- c('date',paste0(names(odin.i),'.odin.109')[2:ncol(odin)])
    data.i <- merge(odin.i,ecan,by='date',all=FALSE)
    corr.df[i,'temp'] <- cor(data.i$temp.2m,data.i$temp.odin.109,
                             method="pearson",use="complete.obs")
    corr.df[i,'pm2.5'] <- cor(data.i$pm2.5,data.i$pm2.5.odin.109,
                             method="pearson",use="complete.obs")
    corr.df[i,'pm10'] <- cor(data.i$pm10,data.i$pm10.odin.109,
                             method="pearson",use="complete.obs")
  }
  corr.melt <- melt(corr.df,id='offset')
  plt <- ggplot(corr.melt) + 
    geom_line(aes(x=offset,y=value,colour=variable)) +
    scale_colour_manual(values=c("red","green","blue")) + 
    ylab("Pearson Correlation") +
    xlab("ODIN-109 Clock Offset (m)") +
    ggtitle(paste("Correlation Between ECan and ODIN Measurements in month",month+1))
  print(plt)
}

Minimal replication of problem

# odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
#                             i.serialn as serial, s.name as label,
#                             d.value::numeric as val
#                            FROM data.fixed_data as d,  
#                             admin.sensor as s, admin.instrument as i
#                            WHERE s.id = d.sensorid
#                             AND s.instrumentid = i.id
#                             AND i.name = 'ODIN-SD-3'
#                             AND d.siteid = 18
#                             AND (i.serialn = 'ODIN-109')
#                             AND (d.recordtime BETWEEN '2016-09-24 00:00 NZST'
#                              AND '2017-09-26 00:00 NZST')
#                             AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
#                              AND '2016-09-25 03:00 NZST')
#                             AND (s.name = 'PM2.5'
#                              OR s.name = 'PM10'
#                              OR s.name = 'Temperature'
#                              OR s.name = 'RH')
#                            ORDER BY date;")