Load data

library(RPostgreSQL)
## Loading required package: DBI
library(rpart)
library(reshape2)
library(zoo)
## Warning: package 'zoo' was built under R version 3.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)
library(MASS)

# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\code_final")

# Database login details (not in GitHub repo)
load("access.Rda")

# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
                 user=access$user,
                 password=access$pwd,
                 host='penap-data.dyndns.org',
                 dbname='cona',
                 port=5432)
# Load ECan data (stored locally)
ecan <- read.csv('RangioraWinter2016.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','time','wind.speed','wind.dir','wind.dir.std','wind.speed.std',
                 'wind.max','co','temp.ground','temp.2m','temp.6m','pm10',
                 'pm2.5','pm.course')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
ecan$time <- NULL # this is included in date
# ECan site has siteid=18
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
                            i.serialn as serial, s.name as label,
                            d.value::numeric as val
                           FROM data.fixed_data as d,  
                            admin.sensor as s, admin.instrument as i
                           WHERE s.id = d.sensorid
                            AND s.instrumentid = i.id
                            AND i.name = 'ODIN-SD-3'
                            AND d.siteid = 18
                            AND (i.serialn = 'ODIN-109')
                            AND (d.recordtime BETWEEN '2016-07-11 00:00 NZST'
                              AND '2017-07-26 00:00 NZST')
                            AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
                             AND '2016-09-25 03:00 NZST')
                            AND (s.name = 'PM2.5'
                             OR s.name = 'Temperature'
                             OR s.name = 'RH')
                           ORDER BY date;") 

# 02:01 to 02:59 on 2016-09-25 is excluded because this is when the switch to NZST occurs.
# Without the exclusion clause the timestamps will truncate at 'day'.

# Truncate seconds from timestamps
odin.raw$date <- trunc(odin.raw$date,'min')
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt

# Fix daylight savings bug
ds.time <- as.POSIXct('09/25/2016 02:00',format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
odin.raw[odin.raw$date > ds.time, 'date'] <- 
  odin.raw[odin.raw$date > ds.time, 'date'] + 60*60

Preprocessing

odin <- dcast(odin.raw, date~label, value.var='val', fun.aggregate=mean)

# Make column names easier to type
names(odin) <- tolower(names(odin))
names(odin)[which(names(odin)=="temperature")] <- "temp"
data <- ecan

# Make sure that there is a odin entry for each minute
dates <- data.frame(date=seq(odin$date[1],odin$date[nrow(odin)],by='mins'))
odin <- merge(odin,dates,by="date",all=TRUE)

# Change the ODIN measurements from 1 min averages to 1 hour averages
odin.zoo <- zoo( odin[,2:ncol(odin)] )
odin.roll.apply <- rollapply(odin.zoo,width=60,by=1,FUN=mean,align="left",na.rm=TRUE)
odin.avgs <- odin[1:nrow(odin.roll.apply),]
odin.avgs[,2:ncol(odin)] <- odin.roll.apply

# Take timestamps at the end of the hour average following ECan convention
odin.avgs$date <- odin.avgs$date + 60*60 

# Purge NaNs
odin.avgs <- odin.avgs[!is.nan(odin.avgs$pm2.5),]
odin.avgs <- odin.avgs[!is.nan(odin.avgs$rh),]
odin.avgs <- odin.avgs[!is.nan(odin.avgs$temp),]
names(odin.avgs) <- c('date',paste0(names(odin.avgs)[2:ncol(odin.avgs)],'.odin.109'))
data <- merge(data,odin.avgs,by='date',all=FALSE)

I’m just wondering whether it’s better to model ODIN as having additive or multiplicative noise

data.backup <- data
data <- data[!is.na(data$pm2.5),]
pm.odin <- data$pm2.5.odin.109
pm.ecan <- data$pm2.5
df <- data.frame(x=seq(from=min(pm.odin),to=max(pm.odin),length.out=100),y=NA,num=NA)
for (i in 2:length(df$x)) {
  inds <- which(pm.odin<df$x[i] & pm.odin>df$x[i-1])
  odin.i <- pm.odin[inds]
  ecan.i <- pm.ecan[inds]
  df[i,'y'] <- mean(abs(odin.i-ecan.i),na.rm=TRUE)
  df[i,'num'] <- length(odin.i)
}

ggplot(df,aes(x,y,weight=num)) +
  geom_line() +
  geom_smooth(method='lm') +
  ylab("MAE") +
  xlab(expression(PM[2.5]*"-ODIN-109")) +
  ggtitle(expression("MAE as a function of"~PM[2.5]*"-ODIN-109 (+ lm weighted by frequency)"))
## Warning: Removed 12 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_path).

ggplot(df,aes(x,y)) +
  geom_line() +
  geom_smooth(method='lm') +
  ylab("MAE") +
  xlab(expression(PM[2.5]*"-ODIN-109")) +
  ggtitle(expression("MAE as a function of"~PM[2.5]*"-ODIN-109"))
## Warning: Removed 12 rows containing non-finite values (stat_smooth).

## Warning: Removed 3 rows containing missing values (geom_path).

summary(lm(y~x,df,weights=num))
## 
## Call:
## lm(formula = y ~ x, data = df, weights = num)
## 
## Weighted Residuals:
##     Min      1Q  Median      3Q     Max 
## -24.041 -10.892  -3.143  10.274  25.531 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1.104477   0.139790   7.901 8.39e-12 ***
## x           0.242197   0.008715  27.790  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.76 on 86 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.8998, Adjusted R-squared:  0.8986 
## F-statistic: 772.3 on 1 and 86 DF,  p-value: < 2.2e-16
summary(lm(y~x,df))
## 
## Call:
## lm(formula = y ~ x, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -16.5285  -2.3320  -0.3586   1.0597  27.0335 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.83442    1.26665   3.027  0.00326 ** 
## x            0.16870    0.01888   8.935 6.67e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.849 on 86 degrees of freedom
##   (12 observations deleted due to missingness)
## Multiple R-squared:  0.4814, Adjusted R-squared:  0.4754 
## F-statistic: 79.84 on 1 and 86 DF,  p-value: 6.67e-14
summary(pm.odin)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.574   2.757   5.186   9.388  10.044 126.839

because the linear models have non-zero intercepts (significant at 95%), but also non-zero gradients (significant at 95%), we should probably think of the ODIN as having both additive and multiplicative noise. The multiplicative noise is something like 20%, and the additive noise is something like 2.

Visualize Data

ggplot(data) +
  geom_point(aes(temp.odin.109,pm2.5),alpha=0.5) +
  ylab(expression(PM[2.5]*"-ECan"~(mu*'g'~m^-3))) +
  xlab(expression("Temp-ODIN-109 ("*degree*"C)")) +
  ggtitle("Full ODIN-109 2016 Dataset")

ggplot(data) +
  geom_point(aes(rh.odin.109,pm2.5),alpha=0.5) +
  ylab(expression(PM[2.5]*"-ECan"~(mu*'g'~m^-3))) +
  xlab(expression("RH-ODIN-109 (%)")) +
  ggtitle("Full ODIN-109 2016 Dataset")

ggplot(data) +
  geom_point(aes(pm2.5.odin.109,pm2.5),alpha=0.5) +
  ylab(expression(PM[2.5]*"-ECan"~(mu*'g'~m^-3))) +
  xlab(expression(PM[2.5]*"-ODIN-109"~(mu*'g'~m^-3))) +
  ggtitle("Full ODIN-109 2016 Dataset")

# look at correlations
cor(data$pm2.5.odin.109,data$rh.odin.109,use="pairwise.complete.obs")
## [1] 0.2016953
cor(data$pm2.5.odin.109,data$temp.odin.109,use="pairwise.complete.obs")
## [1] -0.3994703
cor(data$temp.odin.109,data$rh.odin.109,use="pairwise.complete.obs")
## [1] -0.6141244