Load data

library(RPostgreSQL)
## Loading required package: DBI
library(rpart)
library(reshape2)
library(zoo)
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)
library(MASS)

# Set working directory
setwd("\\\\files.auckland.ac.nz\\MyHome\\Documents")

# Database login details (not in GitHub repo)
load("access.Rda")

# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
                 user=access$user,
                 password=access$pwd,
                 host='penap-data.dyndns.org',
                 dbname='cona',
                 port=5432)
# Load ECan data (stored locally)
ecan <- read.csv('RangioraWinter2016.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','time','wind.speed','wind.dir','wind.dir.std','wind.speed.std',
                 'wind.max','co','temp.ground','temp.2m','temp.6m','pm10',
                 'pm2.5','pm.course')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
ecan$time <- NULL # this is included in date
# ECan site has siteid=18
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
                            i.serialn as serial, s.name as label,
                            d.value::numeric as val
                           FROM data.fixed_data as d,  
                            admin.sensor as s, admin.instrument as i
                           WHERE s.id = d.sensorid
                            AND s.instrumentid = i.id
                            AND i.name = 'ODIN-SD-3'
                            AND d.siteid = 18
                            AND (i.serialn = 'ODIN-109')
                            AND (d.recordtime BETWEEN '2016-07-11 00:00 NZST'
                              AND '2017-07-26 00:00 NZST')
                            AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
                             AND '2016-09-25 03:00 NZST')
                            AND (s.name = 'PM2.5'
                             OR s.name = 'Temperature'
                             OR s.name = 'RH')
                           ORDER BY date;") 

# 02:01 to 02:59 on 2016-09-25 is excluded because this is when the switch to NZST occurs.
# Without the exclusion clause the timestamps will truncate at 'day'.

# Truncate seconds from timestamps
odin.raw$date <- trunc(odin.raw$date,'min')
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt

# Fix daylight savings bug
ds.time <- as.POSIXct('09/25/2016 02:00',format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
odin.raw[odin.raw$date > ds.time, 'date'] <- 
  odin.raw[odin.raw$date > ds.time, 'date'] + 60*60

Preprocessing

odin <- dcast(odin.raw, date~label, value.var='val', fun.aggregate=mean)

# Make column names easier to type
names(odin) <- tolower(names(odin))
names(odin)[which(names(odin)=="temperature")] <- "temp"
data <- ecan

# Make sure that there is a odin entry for each minute
dates <- data.frame(date=seq(odin$date[1],odin$date[nrow(odin)],by='mins'))
odin <- merge(odin,dates,by="date",all=TRUE)

# Change the ODIN measurements from 1 min averages to 1 hour averages
odin.zoo <- zoo( odin[,2:ncol(odin)] )
odin.roll.apply <- rollapply(odin.zoo,width=60,by=1,FUN=mean,align="left",na.rm=TRUE)
odin.avgs <- odin[1:nrow(odin.roll.apply),]
odin.avgs[,2:ncol(odin)] <- odin.roll.apply

# Take timestamps at the end of the hour average following ECan convention
odin.avgs$date <- odin.avgs$date + 60*60 

# Purge NaNs
odin.avgs <- odin.avgs[!is.nan(odin.avgs$pm2.5),]
odin.avgs <- odin.avgs[!is.nan(odin.avgs$rh),]
odin.avgs <- odin.avgs[!is.nan(odin.avgs$temp),]
names(odin.avgs) <- c('date',paste0(names(odin.avgs)[2:ncol(odin.avgs)],'.odin.109'))
data <- merge(data,odin.avgs,by='date',all=FALSE)

Daily Models with lm

# Use 24 hour periods starting and ending at noon
date.lt <- as.POSIXlt(data$date)
noons <- data[which(date.lt$hour==12 & date.lt$min==0),'date']

daily.model <- data.frame(day=noons[1])

for (i in 2:length(noons)) {
  
  # Build the daily model and record coefficients
  day.data <- data[data$date >= noons[i-1] & data$date <= noons[i],]
  day <- day.data$date[1]
  day.coefs <- summary(lm(pm2.5~pm2.5.odin.109+rh.odin.109+temp.odin.109,
                          data=day.data))$coefficients
  daily.model[i,'day'] <- day
  daily.model[i,'pm2.5'] <- day.coefs['pm2.5.odin.109',1]
  daily.model[i,'temp'] <- day.coefs['temp.odin.109',1]
  daily.model[i,'rh'] <- day.coefs['rh.odin.109',1]
  daily.model[i,'cnst'] <- day.coefs['(Intercept)',1]
  # 95% CI is two std's
  daily.model[i,'pm2.5.err'] <- day.coefs['pm2.5.odin.109',2] * 2
  daily.model[i,'temp.err'] <- day.coefs['temp.odin.109',2] * 2
  daily.model[i,'rh.err'] <- day.coefs['rh.odin.109',2] * 2
  daily.model[i,'cnst.err'] <- day.coefs['(Intercept)',2] * 2
  # Weight is inverse of error
  daily.model[i,'pm2.5.wt'] <- 1 / day.coefs['pm2.5.odin.109',2] 
  daily.model[i,'temp.wt'] <- 1 / day.coefs['temp.odin.109',2] 
  daily.model[i,'rh.wt'] <- 1 / day.coefs['rh.odin.109',2] 
  daily.model[i,'cnst.wt'] <- 1 / day.coefs['(Intercept)',2] 
  
  # Record some facts about the day's data
  daily.model[i,'mean.pm2.5'] <- mean(day.data$pm2.5)
  daily.model[i,'mean.temp'] <- mean(day.data$temp.2m)
  
}

# Plot linear models over time

y.lab <- c(pm2.5="expression(PM['2.5,ODIN']*' Coefficient')",
               rh="expression(RH['ODIN']*' Coefficient')",
               temp="expression(Temp['ODIN']*' Coefficient')",
               cnst="'Constant'")
for (var in c('pm2.5','rh','temp','cnst')) {
  var.std <- paste0(var,'.err')
  eval(parse(text=paste0("p <- ggplot(daily.model, aes(x=day,y=",var,",weight=",var,".wt)) +",
    "geom_errorbar(aes(ymin=",var,"-",var.std,", ymax=",var,"+",var.std,")) + ",
    "geom_point() + ",
    'scale_colour_manual(values=c("red","blue")) + ',
    'xlab("Date") +',
    'ylab(',y.lab[var],') +',
    'ggtitle("Model Parameters from Successive 24-Hour Periods of ODIN-109 Data") +',
    'geom_smooth(method="lm")'
    )))
  print(p)
}
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbar).
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbar).
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbar).
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbar).
## Warning: Removed 1 rows containing missing values (geom_point).

headers <- c(pm2.5="PM2.5 Coefficient Predictors",
             rh="Relative Humidity Coefficient Predictors",
             temp="Temperature Coefficient Predictors",
             cnst="Constant Term Predictors")
for (var in c('pm2.5','rh','temp','cnst')) {
  eval(parse(text=paste0("coefs <- summary(lm(",var,"~day + mean.pm2.5,",
                         "data=daily.model,weight=",var,".wt))$coefficients")))
  writeLines(paste0('\n',headers[var],'\n'))
  print(coefs)
}
## 
## PM2.5 Coefficient Predictors
## 
##                  Estimate   Std. Error   t value   Pr(>|t|)
## (Intercept)  5.629998e+01 2.334991e+01  2.411143 0.01788970
## day         -3.771471e-08 1.583608e-08 -2.381568 0.01929900
## mean.pm2.5   8.122348e-03 4.536997e-03  1.790248 0.07670371
## 
## Relative Humidity Coefficient Predictors
## 
##                  Estimate   Std. Error    t value  Pr(>|t|)
## (Intercept) -1.345895e+01 1.277219e+01 -1.0537699 0.2947479
## day          9.100073e-09 8.654579e-09  1.0514750 0.2957945
## mean.pm2.5  -7.197288e-04 4.552369e-03 -0.1580998 0.8747246
## 
## Temperature Coefficient Predictors
## 
##                  Estimate   Std. Error    t value  Pr(>|t|)
## (Intercept) -2.598470e+01 2.872136e+01 -0.9047166 0.3679786
## day          1.762172e-08 1.946128e-08  0.9054759 0.3675785
## mean.pm2.5  -1.260038e-02 8.751902e-03 -1.4397308 0.1533377
## 
## Constant Term Predictors
## 
##                  Estimate   Std. Error    t value  Pr(>|t|)
## (Intercept)  5.904002e+02 1.067199e+03  0.5532243 0.5814517
## day         -3.988450e-07 7.231623e-07 -0.5515291 0.5826082
## mean.pm2.5   2.764645e-01 3.628251e-01  0.7619773 0.4480226

Daily Models with rlm

# Use 24 hour periods starting and ending at noon
date.lt <- as.POSIXlt(data$date)
noons <- data[which(date.lt$hour==12 & date.lt$min==0),'date']

daily.model <- data.frame(day=noons[1])

for (i in 2:length(noons)) {
  
  # Build the daily model and record coefficients
  day.data <- data[data$date >= noons[i-1] & data$date <= noons[i],]
  day <- day.data$date[1]
  day.coefs <- summary(rlm(pm2.5~pm2.5.odin.109+rh.odin.109+temp.odin.109,
                          data=day.data))$coefficients
  daily.model[i,'day'] <- day
  daily.model[i,'pm2.5'] <- day.coefs['pm2.5.odin.109',1]
  daily.model[i,'temp'] <- day.coefs['temp.odin.109',1]
  daily.model[i,'rh'] <- day.coefs['rh.odin.109',1]
  daily.model[i,'cnst'] <- day.coefs['(Intercept)',1]
  # 95% CI is two std's
  daily.model[i,'pm2.5.err'] <- day.coefs['pm2.5.odin.109',2] * 2
  daily.model[i,'temp.err'] <- day.coefs['temp.odin.109',2] * 2
  daily.model[i,'rh.err'] <- day.coefs['rh.odin.109',2] * 2
  daily.model[i,'cnst.err'] <- day.coefs['(Intercept)',2] * 2
  # Weight is inverse of error
  daily.model[i,'pm2.5.wt'] <- 1 / day.coefs['pm2.5.odin.109',2] 
  daily.model[i,'temp.wt'] <- 1 / day.coefs['temp.odin.109',2] 
  daily.model[i,'rh.wt'] <- 1 / day.coefs['rh.odin.109',2] 
  daily.model[i,'cnst.wt'] <- 1 / day.coefs['(Intercept)',2] 
  
  # Record some facts about the day's data
  daily.model[i,'mean.pm2.5'] <- mean(day.data$pm2.5)
  daily.model[i,'mean.temp'] <- mean(day.data$temp.2m)
  
}

# Plot linear models over time

y.lab <- c(pm2.5="expression(PM['2.5,ODIN']*' Coefficient')",
               rh="expression(RH['ODIN']*' Coefficient')",
               temp="expression(Temp['ODIN']*' Coefficient')",
               cnst="'Constant'")
for (var in c('pm2.5','rh','temp','cnst')) {
  var.std <- paste0(var,'.err')
  eval(parse(text=paste0("p <- ggplot(daily.model, aes(x=day,y=",var,",weight=",var,".wt)) +",
    "geom_errorbar(aes(ymin=",var,"-",var.std,", ymax=",var,"+",var.std,")) + ",
    "geom_point() + ",
    'scale_colour_manual(values=c("red","blue")) + ',
    'xlab("Date") +',
    'ylab(',y.lab[var],') +',
    'ggtitle("Model Parameters from Successive 24-Hour Periods of ODIN-109 Data") +',
    'geom_smooth(method="lm")'
    )))
  print(p)
}
## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbar).
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbar).
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbar).
## Warning: Removed 1 rows containing missing values (geom_point).

## Warning: Removed 1 rows containing non-finite values (stat_smooth).
## Warning: Removed 1 rows containing missing values (geom_errorbar).
## Warning: Removed 1 rows containing missing values (geom_point).

headers <- c(pm2.5="PM2.5 Coefficient Predictors",
             rh="Relative Humidity Coefficient Predictors",
             temp="Temperature Coefficient Predictors",
             cnst="Constant Term Predictors")
for (var in c('pm2.5','rh','temp','cnst')) {
  eval(parse(text=paste0("coefs <- summary(lm(",var,"~day + mean.pm2.5,",
                         "data=daily.model,weight=",var,".wt))$coefficients")))
  writeLines(paste0('\n',headers[var],'\n'))
  print(coefs)
}
## 
## PM2.5 Coefficient Predictors
## 
##                  Estimate   Std. Error   t value   Pr(>|t|)
## (Intercept)  5.673976e+01 2.363353e+01  2.400816 0.01837102
## day         -3.800811e-08 1.602923e-08 -2.371176 0.01981715
## mean.pm2.5   8.607333e-03 4.488827e-03  1.917502 0.05827659
## 
## Relative Humidity Coefficient Predictors
## 
##                  Estimate   Std. Error     t value  Pr(>|t|)
## (Intercept) -1.111238e+01 1.235236e+01 -0.89961527 0.3706737
## day          7.508885e-09 8.370146e-09  0.89710326 0.3720054
## mean.pm2.5  -4.172472e-04 4.246769e-03 -0.09825051 0.9219471
## 
## Temperature Coefficient Predictors
## 
##                  Estimate   Std. Error    t value  Pr(>|t|)
## (Intercept) -2.265448e+01 2.751008e+01 -0.8234976 0.4123547
## day          1.535763e-08 1.864075e-08  0.8238739 0.4121420
## mean.pm2.5  -1.021413e-02 8.138588e-03 -1.2550252 0.2126483
## 
## Constant Term Predictors
## 
##                  Estimate   Std. Error    t value  Pr(>|t|)
## (Intercept)  4.471493e+02 1.037704e+03  0.4309026 0.6675471
## day         -3.016223e-07 7.031793e-07 -0.4289408 0.6689690
## mean.pm2.5   2.104689e-01 3.407850e-01  0.6176004 0.5383645