Load data

library(RPostgreSQL)
## Loading required package: DBI
library(rpart)
library(reshape2)
library(zoo)
## Warning: package 'zoo' was built under R version 3.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)
library(MASS)
library(rpart)
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.4.3
# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\code_final")

# Database login details (not in GitHub repo)
load("access.Rda")

# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
                 user=access$user,
                 password=access$pwd,
                 host='penap-data.dyndns.org',
                 dbname='cona',
                 port=5432)
# Load ECan data (stored locally)
ecan <- read.csv('RangioraWinter2016.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','time','wind.speed','wind.dir','wind.dir.std','wind.speed.std',
                 'wind.max','co','temp.ground','temp.2m','temp.6m','pm10',
                 'pm2.5','pm.course')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
ecan$time <- NULL # this is included in date
# ECan site has siteid=18
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
                            i.serialn as serial, s.name as label,
                            d.value::numeric as val
                           FROM data.fixed_data as d,  
                            admin.sensor as s, admin.instrument as i
                           WHERE s.id = d.sensorid
                            AND s.instrumentid = i.id
                            AND i.name = 'ODIN-SD-3'
                            AND d.siteid = 18
                            AND (i.serialn = 'ODIN-109')
                            AND (d.recordtime BETWEEN '2016-01-01 00:00 NZST'
                              AND '2017-01-01 00:00 NZST')
                            AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
                             AND '2016-09-25 03:00 NZST')
                            AND (s.name = 'PM2.5'
                             OR s.name = 'Temperature'
                             OR s.name = 'RH')
                           ORDER BY date;") 

# 02:01 to 02:59 on 2016-09-25 is excluded because this is when the switch to NZST occurs.
# Without the exclusion clause the timestamps will truncate at 'day'.

# Truncate seconds from timestamps
odin.raw$date <- trunc(odin.raw$date,'min')
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt

# Fix daylight savings bug
ds.time <- as.POSIXct('09/25/2016 02:00',format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
odin.raw[odin.raw$date > ds.time, 'date'] <- 
  odin.raw[odin.raw$date > ds.time, 'date'] + 60*60

Preprocessing

odin <- dcast(odin.raw, date~label, value.var='val', fun.aggregate=mean)

# Make column names easier to type
names(odin) <- tolower(names(odin))
names(odin)[which(names(odin)=="temperature")] <- "temp"
data <- ecan

# Make sure that there is a odin entry for each minute
dates <- data.frame(date=seq(odin$date[1],odin$date[nrow(odin)],by='mins'))
odin <- merge(odin,dates,by="date",all=TRUE)

# Change the ODIN measurements from 1 min averages to 1 hour averages
odin.zoo <- zoo( odin[,2:ncol(odin)] )
odin.roll.apply <- rollapply(odin.zoo,width=60,by=1,FUN=mean,align="left",na.rm=TRUE)
odin.avgs <- odin[1:nrow(odin.roll.apply),]
odin.avgs[,2:ncol(odin)] <- odin.roll.apply

# Take timestamps at the end of the hour average following ECan convention
odin.avgs$date <- odin.avgs$date + 60*60 

# Purge NaNs
odin.avgs <- odin.avgs[!is.nan(odin.avgs$pm2.5), ]
odin.avgs <- odin.avgs[!is.nan(odin.avgs$rh), ]
odin.avgs <- odin.avgs[!is.nan(odin.avgs$temp), ]

names(odin.avgs) <- c('date', paste0(names(odin.avgs)[2:ncol(odin.avgs)], '.odin.109'))
data <- merge(data,odin.avgs, by='date', all=FALSE)

data.backup <- data

# Get rid of unneeded columns
desired.cols <- c('date', 'pm2.5', 'pm2.5.odin.109', 'rh.odin.109', 'temp.odin.109')
data <- data[, desired.cols]
# Get rid of NA's
for (col in desired.cols) {
  data <- data[!is.na(data[, col]), ]
}

Build Models

# Set start and end times for initial and final colocations
init.start.time <- as.POSIXct('07/11/2016 00:00',format = '%m/%d/%Y %H:%M',tz='Etc/GMT-12')
init.end.time <- as.POSIXct('07/26/2016 00:00',format = '%m/%d/%Y %H:%M',tz='Etc/GMT-12')
final.start.time <- as.POSIXct('10/03/2016 00:00',format = '%m/%d/%Y %H:%M',tz='Etc/GMT-12')
final.end.time <- as.POSIXct('10/19/2016 00:00',format = '%m/%d/%Y %H:%M',tz='Etc/GMT-12')

# Get initial and final datasets
data.init <- data[init.start.time < data$date & data$date < init.end.time, ]
data.final <- data[final.start.time < data$date & data$date < final.end.time, ]

# Check the lengths
nrow(data.init)
## [1] 1850
nrow(data.final)
## [1] 2272
# Generate formula
odin.vars <- paste0(c('pm2.5','temp','rh'),'.odin.109')
eval(parse(text=paste0('form <- pm2.5 ~ ', paste(odin.vars,collapse='+'))))

# Generate linear models
init.lm <- lm(form, data=data.init)
final.lm <- lm(form, data=data.final)
both.lm <- lm(form, data=rbind(data.init,data.final))

# Extract coefficients
init.coefs <- (summary(init.lm)$coefficients)[, "Estimate"]
final.coefs <- (summary(final.lm)$coefficients)[, "Estimate"]
both.coefs <- (summary(both.lm)$coefficients)[, "Estimate"]

Testing Modeling Techniques

# Models must have parameters (date, pm2.5.odin, temp.odin.109, rh.odin.109)

# Generate the initial model
init.model <- function(line) {
  answer <- init.coefs['(Intercept)']
  for (var in odin.vars) {
    answer <- answer + init.coefs[var] * line[var]
  }
  return(answer)
}

# Generate the final model
final.model <- function(line) {
  answer <- final.coefs['(Intercept)']
  for (var in odin.vars) {
    answer <- answer + final.coefs[var] * line[var]
  }
  return(answer)
}

# Generate a single model using both colocation periods
both.model <- function(line) {
  answer <- both.coefs['(Intercept)']
  for (var in odin.vars) {
    answer <- answer + both.coefs[var] * line[var]
  }
  return(answer)
}

# Create a time dependent weighted-average model
data.start <- data$date[1]
data.end <- data$date[nrow(data)]

weighted.model <- function(line) {
  init.y <- init.model(line)
  final.y <- final.model(line)
  # The proportion of the way through the duration of the experiment
  prop <- as.numeric(line$date - data.start) / as.numeric(data.end - data.start)
  init.y * (1-prop) + final.y * prop
}

# Create a time dependent parametric-interpolation model
init.mid.time  <- mean(c(init.start.time, init.end.time))
final.mid.time <- mean(c(final.start.time, final.end.time))

interpolate.coef <- function(line, var) {
  x.0 <- init.mid.time
  x.1 <- final.mid.time
  x <- line$date
  y.0 <- init.coefs[var]
  y.1 <- final.coefs[var]
  y.0 + as.numeric(x-x.0) * (y.1-y.0) / as.numeric(x.1 - x.0)
}

interpolation.model <- function(line) {
  answer <- interpolate.coef(line, '(Intercept)')
  for (var in odin.vars) {
    answer <- answer + interpolate.coef(line, var) * line[var]
  }
  return(answer)
}

# Function for testing models
get.MSE <- function(model) {
  SE.sum <- 0
#  SE.df <- data.frame(date=data$date, SE=NA)
  for (i in 1:nrow(data)) {
    line <- data[i, ]
    y.hat <- model(line)
    SE <- (line$pm2.5 - y.hat)^2
    SE.sum <- SE.sum + SE
#    SE.df[i, 'SE'] <- SE
  }
#  plt <- ggplot(SE.df) +
#    geom_point(aes(date, SE)) # +
    # ggtitle(names(models)[which(models==model)])
#  print(plt)
  return(SE.sum/nrow(data))
}

# Test models
models <- list(init.model, final.model, both.model, 
               weighted.model, interpolation.model)
names(models) <- c("Initial Model", "Final Model", "Both model", 
                   "Weighted Average Model", "Parameter Interpolation Model")
model.MSEs <- rep(NA, length(models))
names(model.MSEs) <- names(models)
for (i in 1:length(models)) {
  print(i)
  model.MSEs[i] <- get.MSE(models[[i]])
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
## [1] 5
model.MSEs
##                 Initial Model                   Final Model 
##                      40.87922                      30.60962 
##                    Both model        Weighted Average Model 
##                      37.04403                      31.22628 
## Parameter Interpolation Model 
##                      30.23946

The Binning Method

bin.breaks <- seq(-10,100,length.out=50)
pm.bin.init <- cut(data.init[, odin.vars[1]], breaks=bin.breaks)
pm.bin.final <- cut(data.final[, odin.vars[1]], breaks=bin.breaks)

# The number of occurances in each bin
pm.num.bin.init <- summary(pm.bin.init)
pm.num.bin.final <- summary(pm.bin.final)

# The density of pm concentratinos from the two samples (binned)
pm.dist.init <- summary(pm.bin.init)/sum(pm.num.bin.init)
pm.dist.final <- summary(pm.bin.final)/sum(pm.num.bin.final)

# Calculate weights
weights.final <- pm.dist.init[pm.bin.final]
weights.init <- pm.dist.final[pm.bin.init]

# Generate linear models
init.lm <- lm(form, data=data.init, weights=weights.init)
final.lm <- lm(form, data=data.final, weights=weights.final)
#both.lm <- lm(form, data=rbind(data.init,data.final), weights=rbind(weights.init,
#                                                                    weights.final))

# Extract coefficients
init.coefs <- (summary(init.lm)$coefficients)[, "Estimate"]
final.coefs <- (summary(final.lm)$coefficients)[, "Estimate"]

# Test models
models <- list(init.model, final.model, #both.model, 
               weighted.model, interpolation.model)
names(models) <- c("Initial Model", "Final Model", #"Both model", 
                   "Weighted Average Model", "Parameter Interpolation Model")
model.MSEs <- rep(NA, length(models))
names(model.MSEs) <- names(models)
for (i in 1:length(models)) {
  print(i)
  model.MSEs[i] <- get.MSE(models[[i]])
}
## [1] 1
## [1] 2
## [1] 3
## [1] 4
model.MSEs
##                 Initial Model                   Final Model 
##                      74.84050                      34.37344 
##        Weighted Average Model Parameter Interpolation Model 
##                      45.96280                      43.43127