Load data

library(RPostgreSQL)
## Loading required package: DBI
library(rpart)
library(reshape2)
library(zoo)
## Warning: package 'zoo' was built under R version 3.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)
library(MASS)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.4.3
# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\code_final")

# Database login details (not in GitHub repo)
load("access.Rda")

# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
                 user=access$user,
                 password=access$pwd,
                 host='penap-data.dyndns.org',
                 dbname='cona',
                 port=5432)
# Load ECan data (stored locally)
ecan <- read.csv('RangioraWinter2016.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','time','wind.speed','wind.dir','wind.dir.std','wind.speed.std',
                 'wind.max','co','temp.ground','temp.2m','temp.6m','pm10',
                 'pm2.5','pm.course')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
ecan$time <- NULL # this is included in date
# ECan site has siteid=18
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
                            i.serialn as serial, s.name as label,
                            d.value::numeric as val
                           FROM data.fixed_data as d,  
                            admin.sensor as s, admin.instrument as i
                           WHERE s.id = d.sensorid
                            AND s.instrumentid = i.id
                            AND i.name = 'ODIN-SD-3'
                            AND d.siteid = 18
                            AND (i.serialn = 'ODIN-109')
                            AND (d.recordtime BETWEEN '2016-07-11 00:00 NZST'
                              AND '2017-07-26 00:00 NZST')
                            AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
                             AND '2016-09-25 03:00 NZST')
                            AND (s.name = 'PM2.5'
                             OR s.name = 'Temperature'
                             OR s.name = 'RH')
                           ORDER BY date;") 

# 02:01 to 02:59 on 2016-09-25 is excluded because this is when the switch to NZST occurs.
# Without the exclusion clause the timestamps will truncate at 'day'.

# Truncate seconds from timestamps
odin.raw$date <- trunc(odin.raw$date,'min')
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt

# Fix daylight savings bug
# NB: the anomalously high error in late september remains even when the following code is removed
# In fact, the month MSE around the anomaly gets considerably worse.
ds.time <- as.POSIXct('09/25/2016 02:00',format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
odin.raw[odin.raw$date > ds.time, 'date'] <- 
  odin.raw[odin.raw$date > ds.time, 'date'] + 60*60

Preprocessing

odin <- dcast(odin.raw, date~label, value.var='val', fun.aggregate=mean)

# Make column names easier to type
names(odin) <- tolower(names(odin))
names(odin)[which(names(odin)=="temperature")] <- "temp"
data <- ecan

# Make sure that there is a odin entry for each minute
dates <- data.frame(date=seq(odin$date[1],odin$date[nrow(odin)],by='mins'))
odin <- merge(odin,dates,by="date",all=TRUE)

# Change the ODIN measurements from 1 min averages to 1 hour averages
odin.zoo <- zoo( odin[,2:ncol(odin)] )
odin.roll.apply <- rollapply(odin.zoo,width=60,by=1,FUN=mean,align="left",na.rm=TRUE)
odin.avgs <- odin[1:nrow(odin.roll.apply),]
odin.avgs[,2:ncol(odin)] <- odin.roll.apply

# Take timestamps at the end of the hour average following ECan convention
odin.avgs$date <- odin.avgs$date + 60*60 

# Purge NaNs
odin.avgs <- odin.avgs[!is.nan(odin.avgs$pm2.5),]
odin.avgs <- odin.avgs[!is.nan(odin.avgs$rh),]
odin.avgs <- odin.avgs[!is.nan(odin.avgs$temp),]
names(odin.avgs) <- c('date',paste0(names(odin.avgs)[2:ncol(odin.avgs)],'.odin.109'))
data <- merge(data,odin.avgs,by='date',all=FALSE)

Calibration duration

max.days <- 40

# Use 24 hour periods starting and ending at noon
date.lt <- as.POSIXlt(data$date)
noons <- data[which(date.lt$hour==12 & date.lt$min==0),'date']
start.date <- noons[1]
end.date <- noons[length(noons)]
day <- as.POSIXct('1/1/2') - as.POSIXct('1/1/1')
week <- day * 7
month <- week*4

mse <- data.frame(days=NA,mse=NA)
mse[1:max.days,'days'] <- 1:max.days

mse2 <- matrix(NA,ncol=31,nrow=max.days)
for (i in 1:max.days) {
  for (j in 0:30) {
    train.inds <- which(start.date + day*i + day*j < data$date &
                          data$date < start.date + day*i + day*j + week)
    test.inds <- which(start.date + day*i + day*j + week < data$date &
                          data$date < start.date + day*i + day*j + week + month)
    train.data <- data[train.inds,]
    test.data <- data[-train.inds]
    model <- rlm(pm2.5~pm2.5.odin.109+temp.odin.109+rh.odin.109,data=train.data)
    y.hat <- predict(model,test.data)
    mse2[i,j+1] <- mean((test.data$pm2.5-y.hat)^2,na.rm=TRUE)
  }
  
  mse[i,'mse'] <- mean(mse2[i,])
  
}

mse
##    days      mse
## 1     1 43.66873
## 2     2 43.97243
## 3     3 43.81383
## 4     4 43.70995
## 5     5 43.64379
## 6     6 43.72078
## 7     7 43.60855
## 8     8 43.29194
## 9     9 42.99449
## 10   10 42.76474
## 11   11 42.54987
## 12   12 42.17640
## 13   13 41.78863
## 14   14 41.25146
## 15   15 40.77021
## 16   16 40.38781
## 17   17 40.22111
## 18   18 40.21803
## 19   19 40.55752
## 20   20 40.30728
## 21   21 40.02517
## 22   22 39.53073
## 23   23 38.72995
## 24   24 38.23758
## 25   25 38.27011
## 26   26 38.27408
## 27   27 38.00581
## 28   28 37.32154
## 29   29 36.95887
## 30   30 36.44492
## 31   31 36.45919
## 32   32 36.12669
## 33   33 36.01598
## 34   34 36.51642
## 35   35 36.46868
## 36   36 36.50012
## 37   37 36.62679
## 38   38 36.90622
## 39   39 37.38854
## 40   40 38.55908
summary(lm(mse~days,data=mse))$coefficients
##               Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 44.4830900  0.2656526 167.4483 4.043082e-56
## days        -0.2250483  0.0112916 -19.9306 1.019182e-21
# Plot

ggplot(mse,aes(days,mse)) +
  geom_line() +
  ylab("Error Estimation") +
  xlab("Days of Training Data Used") +
  ggtitle("MSE of ODIN-109 Model with Varying-Sized Window of Training Data") +
  geom_smooth(method="lm")

Calibration Timing

# Calibrating using one week of data
mse.week <- data.frame(date=noons[1],mse=NA)
mse.month <- data.frame(date=noons[1],mse=NA)
no.days <- floor( (data$date[nrow(data)]-data$date[1]) )-30

for (i in 1:no.days) {
  # Week long window
  train.inds <- which(start.date + i*day < data$date & 
                        data$date < start.date + i*day + week)
  train.data <- data[train.inds,]
  test.data <- data[-train.inds,]
  model <- rlm(pm2.5~pm2.5.odin.109+temp.odin.109+rh.odin.109,data=train.data)
  y.hat <- predict(model,test.data)
  mse.week[i,'mse'] <- mean((test.data$pm2.5-y.hat)^2,na.rm=TRUE)
  mse.week[i,'date'] <- train.data[1,'date']
  
  # Month long window
  train.inds <- which(start.date + i*day < data$date & 
                        data$date < start.date + i*day + month)
  train.data <- data[train.inds,]
  test.data <- data[-train.inds,]
  model <- rlm(pm2.5~pm2.5.odin.109+temp.odin.109+rh.odin.109,data=train.data)
  y.hat <- predict(model,test.data)
  mse.month[i,'mse'] <- mean((test.data$pm2.5-y.hat)^2,na.rm=TRUE)
  mse.month[i,'date'] <- train.data[1,'date']
}

# Plot the month and week data together
mse.week[,'duration'] <- rep("Week",nrow(mse.week))
mse.month[,'duration'] <- rep("Month",nrow(mse.month))

ggplot(rbind(mse.week,mse.month)) +
  geom_line(aes(date,mse,colour=duration)) +
  xlab("Start Date of Training Data") +
  ylab("Error Estimation") +
  ggtitle("MSE of ODIN-109 Model with Sliding Window of Training Data") +
  scale_colour_manual("Training Duration",values=c("red","blue")) +
  theme(legend.position = c(0.2, 0.7))

# Look at linear models

# Convert date from seconds to days with 0 = initial date
mse.week$date2 <- (mse.week$date-mse.week$date[1])/24/60/60 
mse.month$date2 <- (mse.week$date-mse.month$date[1])/24/60/60 
summary(lm(mse~date2,data=mse.week))$coefficients
##               Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 33.3587099  5.0139688 6.653155 2.992623e-09
## date2        0.3486068  0.1043182 3.341765 1.255606e-03
summary(lm(mse~date2,data=mse.month))$coefficients
##               Estimate Std. Error  t value     Pr(>|t|)
## (Intercept) 23.8573833  1.6518993 14.44240 3.061938e-24
## date2        0.3519374  0.0343686 10.24008 2.508827e-16
# Start date is...
mse.week$date[1]
## [1] "2016-07-14 12:10:00 +12"
mse.month$date[1]
## [1] "2016-07-14 12:10:00 +12"