Load the data

We load the ODIN and ECan data and store them in data frames named odin and ecan, respectively.

library(RPostgreSQL)
library(rpart)
library(reshape2)
library(openair)
library(lubridate)

# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\repo")

# Database login details (not in GitHub repo)
load("access.Rda")

# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
                 user=access$user,
                 password=access$pwd,
                 host='penap-data.dyndns.org',
                 dbname='cona',
                 port=5432)

# Load PM10, PM2.5, and temperature data from odin 109 at ecan site in June 2017
siteid = 18 # ecan site
odin.fresh <- dbGetQuery(con," SELECT d.recordtime AT TIME ZONE 'NZT' AS date,
                         s.name as label, d.value::numeric as val
                         FROM data.fixed_data as d,  admin.sensor as s, admin.instrument as i
                         WHERE s.id = d.sensorid AND
                         s.instrumentid = i.id AND
                         i.name = 'ODIN-SD-3' AND
                         d.siteid = 18 AND
                         i.serialn = 'ODIN-109' AND
                         (d.recordtime BETWEEN '2017-06-01 00:00' AND
                         '2017-07-01 00:00') AND
                         (s.name = 'PM10' OR s.name = 'PM2.5' OR s.name = 'Temperature')
                         ORDER BY date;")
odin <- dcast(odin.fresh, date~label, value.var='val')
# make the names easier to type
odin$temp <- odin$Temperature
odin$pm10 <- odin$PM10
odin$pm2.5 <- odin$PM2.5
odin$Temperature <- NULL
odin$PM10 <- NULL
odin$PM2.5 <- NULL
odin$date <- trunc(odin$date,'min') # get rid of the seconds
odin$date <- as.POSIXct(odin$date) # the above converted it to POSIXlt

# Load ECan data (stored locally)
ecan <- read.csv('EcanJune2017.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','wind.max','wind.speed','wind.direction','co','pm10',
                 'pm2.5','pm.course','temp.2m','temp.6m','temp.ground')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
# There are some NAs in PM2.5. Set these to zero.
ecan$pm2.5[which(is.na(ecan$pm2.5))] <- 0

Let’s have a look at the first few rows of each.

odin[1:10,]
##                   date temp     pm10     pm2.5
## 1  2017-06-07 23:56:00  7.3 33.51744 14.477783
## 2  2017-06-07 23:57:00  7.2 31.51744 10.477783
## 3  2017-06-07 23:58:00  7.2 21.51744  2.477783
## 4  2017-06-07 23:59:00  7.1 25.51744  8.477783
## 5  2017-06-08 00:00:00  7.2 11.51744 10.477783
## 6  2017-06-08 00:01:00  7.1 15.51744 14.477783
## 7  2017-06-08 00:02:00  7.2 25.51744  6.477783
## 8  2017-06-08 00:03:00  7.2 37.51744  4.477783
## 9  2017-06-08 00:04:00  7.1 21.51744  2.477783
## 10 2017-06-08 00:05:00  7.2 35.51744 34.477783
ecan[1:10,]
##                   date wind.max wind.speed wind.direction co pm10 pm2.5
## 1  2017-06-01 00:10:00      1.6        0.8            286  0   17    17
## 2  2017-06-01 00:20:00      1.4        0.7            303  0   17    17
## 3  2017-06-01 00:30:00      1.1        0.7            298  0   17    17
## 4  2017-06-01 00:40:00      1.5        0.7             NA  0   17    17
## 5  2017-06-01 00:50:00      1.6        1.0            295  0   17    17
## 6  2017-06-01 01:00:00      1.3        0.8            284  0   17    17
## 7  2017-06-01 01:10:00      1.5        0.7            313  0   15    11
## 8  2017-06-01 01:20:00      2.0        1.2            304  0   15    11
## 9  2017-06-01 01:30:00      2.0        1.1            304  0   15    11
## 10 2017-06-01 01:40:00      1.3        0.5             NA  0   15    11
##    pm.course temp.2m temp.6m temp.ground
## 1          0       9       9           0
## 2          0       9       9           0
## 3          0       9       9           0
## 4          0       9       9           0
## 5          0       9       9           0
## 6          0       9       9           0
## 7          4       8       8           0
## 8          4       8       8           0
## 9          4       8       8           0
## 10         4       8       8           0

ODIN PM10 and PM2.5 Measurements

Let’s make sure that the PM10 and PM2.5 measurements are doing something sensible.

plot(odin$pm2.5,odin$pm10,xlab="PM2.5",ylab="PM10",main="ODIN PM Measurements")
lm.odin <- lm(pm10 ~ pm2.5, data=odin)
abline(lm.odin,col="red")

The relationship appears linear with some scatter. Plotting the error term around the linear fit, we get

error <- odin$pm10 - predict(lm.odin,odin)
hist(error,
     xlab="Difference", main="Error around Linear Model",
     breaks=100, xlim=c(-100,100))

The error term is not Gaussian, as one might have expected, but bimodal. I don’t know what to make of this.
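As a quick check (a sketch, assuming the odin data frame and the error vector computed above), plotting the error term against time shows whether the two modes correspond to different periods of the record.

# Sketch: look at the error term over time to see whether the two modes
# correspond to different periods of the record
plot(odin$date, error, pch=".",
     xlab="Date", ylab="PM10 - fitted", main="Error around Linear Model over Time")
abline(h=0, col="red")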

Synchronizing ODIN and ECan Clocks

# Where do the times overlap
min_date <- max(min(odin$date),min(ecan$date)) - 60*60 # include an hour's buffer for alignment
max_date <- min(max(odin$date),max(ecan$date)) + 60*60 
# Note that these are in GMT+13 (NZDT), not NZST (GMT+12)
ecan.indices <- which(ecan$date > min_date & ecan$date < max_date)
odin.indices <- which(odin$date > min_date & odin$date < max_date)
ecan <- ecan[ecan.indices,]
odin <- odin[odin.indices,]

# Time alignment
align.ecan <- function(ecan.series,odin.series,var.name,corr.plot=FALSE) {
  lag_test=ccf(ecan.series,
               odin.series,
               na.action=na.pass,
               lag.max=200,
               plot=corr.plot,
               type='correlation',
               ylab='Correlation',
               main=paste(var.name,'correlation as function of clock lag'))
  offset <- lag_test$lag[which.max(lag_test$acf)]
  corr <- max(lag_test$acf)
  c(offset,corr)
}
# Where does starting the average window (for ODIN) get the best correlation?
ecan.alignments <- data.frame(offset=NA,corr=NA)
for (i in 1:9) {
  odin.avg <- timeAverage(odin[i:nrow(odin),], avg.time='10 min')
  ecan.alignments[i,] <- align.ecan(ecan$temp.2m,odin.avg$temp,'Temperature (2m)')
}
i <- which.max(ecan.alignments$corr)
odin.avg <- timeAverage(odin[i:nrow(odin),], avg.time='10 min')
offset <- ecan.alignments$offset[i]
ecan.aligned <- ecan[(offset+1):nrow(ecan),]

The two time series should now be aligned. To verify correctness, we will check that temperature, PM2.5 and PM10 are all well correlated.

# Verify correctness
align.ecan(ecan.aligned$temp.2m,odin.avg$temp,'Temperature (2m)',TRUE)

## [1] 0.0000000 0.9757069
# How far off are the two clocks from each other?
ecan.aligned$date[1]-odin.avg$date[1]
## Time difference of 4 mins
# Check that PM10 and PM2.5 also line up
align.ecan(ecan.aligned$pm10,odin.avg$pm10,'PM10',TRUE)

## [1] 1.000000 0.896719
align.ecan(ecan.aligned$pm2.5,odin.avg$pm2.5,'PM2.5',TRUE)

## [1] 2.0000000 0.9180543

These are all well correlated, so let’s go ahead and merge the data and plot it.

# Merge data
data <- ecan.aligned[1:nrow(odin.avg),] # use the ECan clock
data$pm10.odin <- odin.avg$pm10
data$pm2.5.odin <- odin.avg$pm2.5
data$temp.odin <- odin.avg$temp

# Plot the pm2.5 data
plot(data$pm2.5.odin,data$pm2.5,ylab="TEOM",xlab="ODIN",main="PM2.5 Measurements")
lm.pm2.5 <- lm(pm2.5 ~ pm2.5.odin, data=data)
abline(lm.pm2.5,col="red")

The TEOM measurements seem to cluster around certain values (as if they’ve been discretized). To investigate further, let’s look at histograms of the data.

hist(data$pm2.5,xlab="PM2.5",main="TEOM",breaks=100)

# hist(data$pm2.5.odin,xlab="PM2.5",main="ODIN",breaks=100)
hist(predict(lm.pm2.5,data),xlab="PM2.5",main="ODIN (via linear calibration)",breaks=100)

The ODIN histogram is a well-behaved, Poisson-like distribution. The TEOM histogram, on the other hand, looks very jagged (as would be expected from discretization).
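Another way to see this, as a sketch (assuming the merged data frame and lm.pm2.5 from above): compare the empirical CDFs of the TEOM series and the linearly calibrated ODIN series. The TEOM curve should show large steps where the ODIN curve rises smoothly.

# Sketch: empirical CDFs of the TEOM and calibrated ODIN PM2.5 series
plot(ecdf(data$pm2.5), main="Empirical CDFs of PM2.5", xlab="PM2.5")
lines(ecdf(predict(lm.pm2.5, data)), col="red")
legend("bottomright", legend=c("TEOM", "ODIN (linear calibration)"),
       col=c("black", "red"), lty=1)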

Models for ODIN-109 vs ECan

Let’s now try using some models to predict the TEOM measurements from the ODIN measurements. The models we will use are: a linear model, a regression tree, a pruned regression tree, and kernel regression (with several grid spacings). We will also use the following training sets (the test set in each case being the rest of the data): the entire data set, the first third (chronologically), the second third, the third third, and a random segment of the data. A mean squared error is calculated for each combination of model and training set.
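As a minimal sketch of the procedure (hypothetical names train.sketch and test.sketch; the loops below cover all the models and splits), train a linear model on the first third of the merged data and compute the mean squared error on the remainder.

# Sketch: train on the first third, evaluate mean squared error on the rest
third.sketch <- nrow(data) %/% 3
train.sketch <- data[1:third.sketch, ]
test.sketch  <- data[-(1:third.sketch), ]
fit.sketch   <- lm(pm2.5 ~ pm2.5.odin, data=train.sketch)
mean((predict(fit.sketch, test.sketch) - test.sketch$pm2.5)^2, na.rm=TRUE)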

# Trying out models
# Functions for predicting test data based on training data
lm.model <- function(train,test,form=y~x) {
  this.lm <- lm(form,train)
  predict(this.lm, test)
}
# double.lm.model <- function(train,test,form=y~x) {
#   midway <- max(train.x)/2
#   train.lower.half <- which(train < midway)
#   test.lower.half <- which(test < midway)
#   
#   this.lm <- lm(form,train)
#   predictions <- predict(this.lm, test)
# }
tree.model <- function(train,test,form=y~x) {
  fit <- rpart(form, method="anova", data=train)
  predict(fit, test, "vector")
}
ptree.model <- function(train,test,form=y~x) {
  fit <- rpart(form, method="anova", data=train)
  # prune back to the complexity parameter with the lowest cross-validated error
  pfit <- prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"])
  predict(pfit, test, "vector")
}
kernel.n.model <- function(n,train,test,form=y~x) {
  kern <- ksmooth(train$x,train$y,kernel='normal',x.points=seq(1,500,n))
  # map each test x onto the nearest point of the smoothing grid (1, 1+n, 1+2n, ...)
  idx <- pmin(pmax(round((test$x - 1)/n) + 1, 1), length(kern$y))
  kern$y[idx]
}
kernel.2.model <- function(train,test) kernel.n.model(2,train,test)
kernel.5.model <- function(train,test) kernel.n.model(5,train,test)
kernel.10.model <- function(train,test) kernel.n.model(10,train,test)
# Setting up the array which will store error terms
column.names <- c("Whole","1st Third","2nd Third","3rd Third","Random","Avg")
row.names <- c("linear","tree","ptree","kernel.2","kernel.5","kernel.10")
matrix.names <- c("error","na","error (ecan inputs)","error (odin inputs)")
dim.names <- list(row.names,column.names,matrix.names)
model.succ.dim <- c(length(row.names),length(column.names),length(matrix.names))
model.success <- array(0, dim=model.succ.dim, dimnames=dim.names)
len <- nrow(data)
third <- as.integer(len/3)
rand.inds <- sort(sample(1:len, 2)) # endpoints of a random contiguous segment

subsets <- list(1:len, 1:third, third:(2*third), 
                (2*third):len, rand.inds[1]:rand.inds[2])
models <- list(lm.model,tree.model,ptree.model,kernel.2.model,kernel.5.model,
               kernel.10.model)
# Model with only pm2.5.odin as the input variable
for (s in 1:length(subsets)) {
  train.data <- data[subsets[[s]],]
  train <- data.frame(x=train.data$pm2.5.odin,y=train.data$pm2.5)
  if (s==1)
    test.data <- data
  else
    test.data <- data[-subsets[[s]],]
  test <- data.frame(x=test.data$pm2.5.odin,y=test.data$pm2.5)
  for (m in 1:length(models)) {
    predictions <- models[[m]](train,test)
    model.success[m,s,'error'] <- mean( (predictions-test$y)^2, na.rm=TRUE )
    model.success[m,s,'na'] <- length(which(is.na(predictions)))
  }
}
# Model with pm2.5.odin + a bunch of ecan measurements for input variables
for (s in 1:length(subsets)) {
  train.data <- data[subsets[[s]],]
  train <- data.frame(pm2.5=train.data$pm2.5.odin,temp=train.data$temp.2m,
                      wind.dir=train.data$wind.direction,
                      wind.speed=train.data$wind.speed,
                      co=train.data$co,date=train.data$date,
                      y=train.data$pm2.5)
  all.ecan.form <- y~pm2.5+temp+wind.dir+wind.speed+co+date
  if (s==1)
    test.data <- data
  else
    test.data <- data[-subsets[[s]],]
  test <- data.frame(pm2.5=test.data$pm2.5.odin,temp=test.data$temp.2m,
                                wind.dir=test.data$wind.direction,
                                wind.speed=test.data$wind.speed,
                                co=test.data$co,date=test.data$date,
                                y=test.data$pm2.5)
  for (m in 1:3) { # kernel regression algorithm is univariate
    predictions <- models[[m]](train,test,all.ecan.form)
    model.success[m,s,"error (ecan inputs)"] <- mean( (predictions-test$y)^2, na.rm=TRUE )
  }
}
# Model with pm2.5.odin + a bunch of other odin measurements for input variables
for (s in 1:length(subsets)) {
  train.data <- data[subsets[[s]],]
  train <- data.frame(pm2.5=train.data$pm2.5.odin,temp=train.data$temp.odin,
                      pm10=train.data$pm10.odin,
                      y=train.data$pm2.5)
  all.odin.form <- y~pm2.5+temp+pm10
  if (s==1)
    test.data <- data
  else
    test.data <- data[-subsets[[s]],]
  test <- data.frame(pm2.5=test.data$pm2.5.odin,temp=test.data$temp.odin,
                     pm10=test.data$pm10.odin,
                     y=test.data$pm2.5)
  for (m in 1:3) { # kernel regression algorithm is univariate
    predictions <- models[[m]](train,test,all.odin.form)
    model.success[m,s,"error (odin inputs)"] <- mean( (predictions-test$y)^2, na.rm=TRUE )
  }
}
# Calculate the average error across the five training sets
for (i in 1:length(matrix.names)) {
  for (m in 1:length(models)) {
    model.success[m,length(subsets)+1,i] <- mean(
      model.success[m,1:length(subsets),i]
    )
  }
}

model.success
## , , error
## 
##               Whole 1st Third 2nd Third 3rd Third    Random       Avg
## linear     141.3317  143.3928  102.2716  202.4948 779.40000  141.3317
## tree       119.5021  138.4277  107.1176  731.0127 779.40000  119.5021
## ptree      119.5021  138.4277  107.1176  731.0127 779.40000  119.5021
## kernel.2   196.0532  183.3788  195.9612  171.7990  38.69444  196.0532
## kernel.5  1159.4367  542.8625  779.9787 1524.9871 687.01869 1159.4367
## kernel.10 1286.2763 1396.4148  588.0427 1272.2710 489.99095 1286.2763
## 
## , , na
## 
##           Whole 1st Third 2nd Third 3rd Third Random Avg
## linear        0         0         0         0      0   0
## tree          0         0         0         0      0   0
## ptree         0         0         0         0      0   0
## kernel.2    266       216       178       344   1044 266
## kernel.5    218       201       184       230    689 218
## kernel.10   195       165       140       237    451 195
## 
## , , error (ecan inputs)
## 
##               Whole 1st Third 2nd Third 3rd Third Random       Avg
## linear     84.19085  156.5740  141.0009  171.2742  779.4  84.19085
## tree      119.50212  325.5384  223.0751 1199.5791  779.4 119.50212
## ptree     119.50212  325.5384  223.0751 1199.5791  779.4 119.50212
## kernel.2    0.00000    0.0000    0.0000    0.0000    0.0   0.00000
## kernel.5    0.00000    0.0000    0.0000    0.0000    0.0   0.00000
## kernel.10   0.00000    0.0000    0.0000    0.0000    0.0   0.00000
## 
## , , error (odin inputs)
## 
##              Whole 1st Third 2nd Third 3rd Third Random      Avg
## linear    130.3390  135.4021  137.1118  197.7943  779.4 130.3390
## tree      124.6756  146.2764  117.8063  968.4693  779.4 124.6756
## ptree     124.6756  146.2764  117.8063  968.4693  779.4 124.6756
## kernel.2    0.0000    0.0000    0.0000    0.0000    0.0   0.0000
## kernel.5    0.0000    0.0000    0.0000    0.0000    0.0   0.0000
## kernel.10   0.0000    0.0000    0.0000    0.0000    0.0   0.0000

Only the first kernel regression model gives a decent error, but it also has a lot of “can’t tells” (seen in the “na” matrix), which may prevent it from being useful. The regression tree seems promising: it performs slightly better than the linear model. It doesn’t look like it matters whether the regression tree has the pruning step.
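To dig into the regression tree a little further, here is a minimal sketch (assuming the merged data frame from above, and fitting on the whole data set) that prints the complexity table and draws the fitted tree.

# Sketch: fit a regression tree on all the data and inspect it
fit.all <- rpart(pm2.5 ~ pm2.5.odin, data=data, method="anova")
printcp(fit.all)                         # complexity table with cross-validated error
plot(fit.all); text(fit.all, use.n=TRUE) # draw the tree, labelling nodes with counts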

Algebraic models

Are there any simple algebraic models that improve the correlation between the ODIN and TEOM PM2.5 measurements?

# linear
cor(data$pm2.5.odin,data$pm2.5) # TODO: figure out where the NAs are coming from
## [1] 0.9024725
# For logs we need to get rid of zeros
data.non.zero <- data[which(data$pm2.5 > 0 & data$pm2.5.odin > 0),]
# power law (correlation on the log-log scale)
cor(log(data.non.zero$pm2.5),log(data.non.zero$pm2.5.odin))
## [1] 0.8913095
# log
cor(log(data.non.zero$pm2.5),data.non.zero$pm2.5.odin)
## [1] 0.7046857
# exp
cor(exp(data$pm2.5),data$pm2.5.odin)
## [1] 0.4060808

TODO