We initially load and store the ODIN and ECan data in dataframes named odin and ecan, respectively.
library(RPostgreSQL)
library(rpart)
library(reshape2)
library(openair)
library(lubridate)
# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\repo")
# Database login details (not in GitHub repo)
load("access.Rda")
# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
user=access$user,
password=access$pwd,
host='penap-data.dyndns.org',
dbname='cona',
port=5432)
# Load PM10, PM2.5, and temperature data from odin 109 at ecan site in June 2017
siteid = 18 # ecan site
odin.fresh <- dbGetQuery(con," SELECT d.recordtime AT TIME ZONE 'NZT' AS date,
s.name as label, d.value::numeric as val
FROM data.fixed_data as d, admin.sensor as s, admin.instrument as i
WHERE s.id = d.sensorid AND
s.instrumentid = i.id AND
i.name = 'ODIN-SD-3' AND
d.siteid = 18 AND
i.serialn = 'ODIN-109' AND
(d.recordtime BETWEEN '2017-06-01 00:00' AND
'2017-07-01 00:00') AND
(s.name = 'PM10' OR s.name = 'PM2.5' OR s.name = 'Temperature')
ORDER BY date;")
odin <- dcast(odin.fresh, date~label, value.var='val')
# make the names easier to type
odin$temp <- odin$Temperature
odin$pm10 <- odin$PM10
odin$pm2.5 <- odin$PM2.5
odin$Temperature <- NULL
odin$PM10 <- NULL
odin$PM2.5 <- NULL
odin$date <- trunc(odin$date,'min') # get rid of the seconds
odin$date <- as.POSIXct(odin$date) # the above converted it to POSIXlt
# Load ECan data (stored locally)
ecan <- read.csv('EcanJune2017.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','wind.max','wind.speed','wind.direction','co','pm10',
'pm2.5','pm.course','temp.2m','temp.6m','temp.ground')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
# There are some NA's in PM2.5. Set these to zero.
ecan$pm2.5[which(is.na(ecan$pm2.5))] <- 0
Let’s have a look
odin[1:10,]
## date temp pm10 pm2.5
## 1 2017-06-07 23:56:00 7.3 33.51744 14.477783
## 2 2017-06-07 23:57:00 7.2 31.51744 10.477783
## 3 2017-06-07 23:58:00 7.2 21.51744 2.477783
## 4 2017-06-07 23:59:00 7.1 25.51744 8.477783
## 5 2017-06-08 00:00:00 7.2 11.51744 10.477783
## 6 2017-06-08 00:01:00 7.1 15.51744 14.477783
## 7 2017-06-08 00:02:00 7.2 25.51744 6.477783
## 8 2017-06-08 00:03:00 7.2 37.51744 4.477783
## 9 2017-06-08 00:04:00 7.1 21.51744 2.477783
## 10 2017-06-08 00:05:00 7.2 35.51744 34.477783
ecan[1:10,]
## date wind.max wind.speed wind.direction co pm10 pm2.5
## 1 2017-06-01 00:10:00 1.6 0.8 286 0 17 17
## 2 2017-06-01 00:20:00 1.4 0.7 303 0 17 17
## 3 2017-06-01 00:30:00 1.1 0.7 298 0 17 17
## 4 2017-06-01 00:40:00 1.5 0.7 NA 0 17 17
## 5 2017-06-01 00:50:00 1.6 1.0 295 0 17 17
## 6 2017-06-01 01:00:00 1.3 0.8 284 0 17 17
## 7 2017-06-01 01:10:00 1.5 0.7 313 0 15 11
## 8 2017-06-01 01:20:00 2.0 1.2 304 0 15 11
## 9 2017-06-01 01:30:00 2.0 1.1 304 0 15 11
## 10 2017-06-01 01:40:00 1.3 0.5 NA 0 15 11
## pm.course temp.2m temp.6m temp.ground
## 1 0 9 9 0
## 2 0 9 9 0
## 3 0 9 9 0
## 4 0 9 9 0
## 5 0 9 9 0
## 6 0 9 9 0
## 7 4 8 8 0
## 8 4 8 8 0
## 9 4 8 8 0
## 10 4 8 8 0
Let’s make sure that the PM10 and PM2.5 measurements are doing something sensible.
plot(odin$pm2.5,odin$pm10,xlab="PM2.5",ylab="PM10",main="ODIN PM Measurements")
lm.odin <- lm(odin$pm10~odin$pm2.5)
abline(lm.odin,col="red")
The relationship appears linear with some error term. When we plot the error term we get
error <- odin$pm10 - predict(lm.odin,odin)
hist(error,
xlab="Difference", main="Error around Linear Model",
breaks=100, xlim=c(-100,100))
The error term is not Gaussian as expected, but bimodal. I don’t know what to make of this.
# Where do the times overlap
min_date <- max(min(odin$date),min(ecan$date)) - 60*60 # include an hour's buffer for alignment
max_date <- min(max(odin$date),max(ecan$date)) + 60*60
# Note that these are in GMT+13, not NZST (which is in DST: GMT+12)
ecan.indices <- which(ecan$date > min_date & ecan$date < max_date)
odin.indices <- which(odin$date > min_date & odin$date < max_date)
ecan <- ecan[ecan.indices,]
odin <- odin[odin.indices,]
# Time alignment
align.ecan <- function(ecan.series,odin.series,var.name,corr.plot=FALSE) {
lag_test=ccf(ecan.series,
odin.series,
na.action=na.pass,
lag.max=200,
plot=corr.plot,
type='correlation',
ylab='Correlation',
main=paste(var.name,'correlation as function of clock lag'))
offset <- lag_test$lag[which.max(lag_test$acf)]
corr <- max(lag_test$acf)
c(offset,corr)
}
# Where does starting the average window (for ODIN) get the best correlation?
ecan.alignments <- data.frame(offset=NA,corr=NA)
for (i in 1:9) {
odin.avg <- timeAverage(odin[i:nrow(odin),], avg.time='10 min')
ecan.alignments[i,] <- align.ecan(ecan$temp.2m,odin.avg$temp,'Temperature (2m)')
}
i <- which.max(ecan.alignments$corr)
odin.avg <- timeAverage(odin[i:nrow(odin),], avg.time='10 min')
offset <- ecan.alignments$offset[i]
ecan.aligned <- ecan[(offset+1):nrow(ecan),]
The two time series should now be aligned. To verify correctness, we will check that temperature, PM2.5 and PM10 are all well correlated.
# Verify correctness
align.ecan(ecan.aligned$temp.2m,odin.avg$temp,'Temperature (2m)',TRUE)
## [1] 0.0000000 0.9757069
# How far off are the two clocks from each other?
ecan.aligned$date[1]-odin.avg$date[1]
## Time difference of 4 mins
# Check that PM10 and PM2.5 also line up
align.ecan(ecan.aligned$pm10,odin.avg$pm10,'PM10',TRUE)
## [1] 1.000000 0.896719
align.ecan(ecan.aligned$pm2.5,odin.avg$pm2.5,'PM2.5',TRUE)
## [1] 2.0000000 0.9180543
These are all well correlated, so let’s go ahead and merge the data. Also plot it.
# Merge data
data <- ecan.aligned[1:nrow(odin.avg),] # use the ECan clock
data$pm10.odin <- odin.avg$pm10
data$pm2.5.odin <- odin.avg$pm2.5
data$temp.odin <- odin.avg$temp
# Plot the pm2.5 data
plot(data$pm2.5.odin,data$pm2.5,ylab="TEOM",xlab="ODIN",main="PM2.5 Measurements")
lm.pm2.5 <- lm(data$pm2.5~data$pm2.5.odin)
abline(lm.pm2.5,col="red")
The TEOM measurements seem to cluster around certain values (as if they’ve been discretized). To investigate further, let’s look at histograms of the data.
hist(data$pm2.5,xlab="PM2.5",main="TEOM",breaks=100)
# hist(data$pm2.5.odin,xlab="PM2.5",main="ODIN",breaks=100)
hist(predict(lm.pm2.5,data),xlab="PM2.5",main="ODIN (via linear callibration)",breaks=100)
The ODIN histogram is a very well behaved Poisson-like distribution. The TEOM histogram on the other hand looks very jagged (as would be expected from discretization).
Let’s now try using some models to predict the TEOM measurements from the ODIN measurements. The models we will use are: linear, regression tree, pruned regression tree, kernel regression (with various parameters). We will also use the following training sets (the test sets then being the rest of the data): the entire data set, the first third (chronologically), the second third, the third third, and some random segment of the data. An error term is calculated for each model.
# Trying out models
# Functions for predicting test data based on training data
lm.model <- function(train,test,form=y~x) {
this.lm <- lm(form,train)
predictions <- predict(this.lm, test)
}
# double.lm.model <- function(train,test,form=y~x) {
# midway <- max(train.x)/2
# train.lower.half <- which(train < midway)
# test.lower.half <- which(test < midway)
#
# this.lm <- lm(form,train)
# predictions <- predict(this.lm, test)
# }
tree.model <- function(train,test,form=y~x) {
fit <- rpart(form, method="anova", data=train)
predict(fit, test, "vector")
}
ptree.model <- function(train,test,form=y~x) {
fit <- rpart(form, method="anova", data=train)
pfit<- prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"])
predict(fit, test, "vector")
}
kernel.n.model <- function(n,train,test,form=y~x) {
kern <- ksmooth(train$x,train$y,kernel='normal',x.points=seq(1,500,n))
kern$y[ as.integer(test$x/n) ]
}
kernel.2.model <- function(train,test) kernel.n.model(2,train,test)
kernel.5.model <- function(train,test) kernel.n.model(5,train,test)
kernel.10.model <- function(train,test) kernel.n.model(10,train,test)
# Setting up the array which will store error terms
column.names <- c("Whole","1st Third","2nd Third","3rd Third","Random","Avg")
row.names <- c("linear","tree","ptree","kernel.2","kernel.5","kernel.10")
matrix.names <- c("error","na","error (ecan inputs)","error (odin inputs)")
dim.names <- list(row.names,column.names,matrix.names)
model.succ.dim <- c(length(column.names),length(row.names),length(matrix.names))
model.success <- array(0,dim=lapply(dim.names,length),
dimnames = dim.names)
len <- nrow(data)
third <- as.integer(len/3)
rand.inds <- sample(1,len, 2)
subsets <- list(1:len, 1:third, third:(2*third),
(2*third):len, rand.inds[1]:rand.inds[2])
models <- list(lm.model,tree.model,ptree.model,kernel.2.model,kernel.5.model,
kernel.10.model)
# Model with onlin pm2.5.odin as input variable
for (s in 1:length(subsets)) {
train.data <- data[subsets[[s]],]
train <- data.frame(x=train.data$pm2.5.odin,y=train.data$pm2.5)
if (s==1)
test.data <- data
else
test.data <- data[-subsets[[s]],]
test <- data.frame(x=test.data$pm2.5.odin,y=test.data$pm2.5)
for (m in 1:length(models)) {
predictions <- models[[m]](train,test)
model.success[m,s,'error'] <- mean( (predictions-test$y)^2, na.rm=TRUE )
model.success[m,s,'na'] <- length(which(is.na(predictions)))
}
}
# Model with pm2.5.odin + a bunch of ecan measurments for input variables
for (s in 1:length(subsets)) {
train.data <- data[subsets[[s]],]
train <- data.frame(pm2.5=train.data$pm2.5.odin,temp=train.data$temp.2m,
wind.dir=train.data$wind.direction,
wind.speed=train.data$wind.speed,
co=train.data$co,date=train.data$date,
y=train.data$pm2.5)
all.ecan.form <- y~pm2.5+temp+wind.dir+wind.speed+co+date
if (s==1)
test.data <- data
else
test.data <- data[-subsets[[s]],]
test <- data.frame(pm2.5=test.data$pm2.5.odin,temp=test.data$temp.2m,
wind.dir=test.data$wind.direction,
wind.speed=test.data$wind.speed,
co=test.data$co,date=test.data$date,
y=test.data$pm2.5)
for (m in 1:3) { # kernel regression algorithm is univariate
predictions <- models[[m]](train,test,all.ecan.form)
model.success[m,s,"error (ecan inputs)"] <- mean( (predictions-test$y)^2, na.rm=TRUE )
}
}
# Model with pm2.5.odin + a bunch of other odin measurments for input variables
for (s in 1:length(subsets)) {
train.data <- data[subsets[[s]],]
train <- data.frame(pm2.5=train.data$pm2.5.odin,temp=train.data$temp.odin,
pm10=train.data$pm10.odin,
y=train.data$pm2.5)
all.odin.form <- y~pm2.5+temp+pm10
if (s==1)
test.data <- data
else
test.data <- data[-subsets[[s]],]
test <- data.frame(pm2.5=test.data$pm2.5.odin,temp=test.data$temp.odin,
pm10=test.data$pm10.odin,
y=test.data$pm2.5)
for (m in 1:3) { # kernel regression algorithm is univariate
predictions <- models[[m]](train,test,all.odin.form)
model.success[m,s,"error (odin inputs)"] <- mean( (predictions-test$y)^2, na.rm=TRUE )
}
}
# Calculate averages
for (i in 1:length(matrix.names)) {
for (m in 1:length(models)) {
model.success[m,length(subsets)+1,i] <- mean(
model.success[m,1:(length(subset)-1),i]
)
}
}
model.success
## , , error
##
## Whole 1st Third 2nd Third 3rd Third Random Avg
## linear 141.3317 143.3928 102.2716 202.4948 779.40000 141.3317
## tree 119.5021 138.4277 107.1176 731.0127 779.40000 119.5021
## ptree 119.5021 138.4277 107.1176 731.0127 779.40000 119.5021
## kernel.2 196.0532 183.3788 195.9612 171.7990 38.69444 196.0532
## kernel.5 1159.4367 542.8625 779.9787 1524.9871 687.01869 1159.4367
## kernel.10 1286.2763 1396.4148 588.0427 1272.2710 489.99095 1286.2763
##
## , , na
##
## Whole 1st Third 2nd Third 3rd Third Random Avg
## linear 0 0 0 0 0 0
## tree 0 0 0 0 0 0
## ptree 0 0 0 0 0 0
## kernel.2 266 216 178 344 1044 266
## kernel.5 218 201 184 230 689 218
## kernel.10 195 165 140 237 451 195
##
## , , error (ecan inputs)
##
## Whole 1st Third 2nd Third 3rd Third Random Avg
## linear 84.19085 156.5740 141.0009 171.2742 779.4 84.19085
## tree 119.50212 325.5384 223.0751 1199.5791 779.4 119.50212
## ptree 119.50212 325.5384 223.0751 1199.5791 779.4 119.50212
## kernel.2 0.00000 0.0000 0.0000 0.0000 0.0 0.00000
## kernel.5 0.00000 0.0000 0.0000 0.0000 0.0 0.00000
## kernel.10 0.00000 0.0000 0.0000 0.0000 0.0 0.00000
##
## , , error (odin inputs)
##
## Whole 1st Third 2nd Third 3rd Third Random Avg
## linear 130.3390 135.4021 137.1118 197.7943 779.4 130.3390
## tree 124.6756 146.2764 117.8063 968.4693 779.4 124.6756
## ptree 124.6756 146.2764 117.8063 968.4693 779.4 124.6756
## kernel.2 0.0000 0.0000 0.0000 0.0000 0.0 0.0000
## kernel.5 0.0000 0.0000 0.0000 0.0000 0.0 0.0000
## kernel.10 0.0000 0.0000 0.0000 0.0000 0.0 0.0000
Only first kernel regression model gives a decent error, but it also has a lot of “can’t tells” (seen in the “na” matrix), which may prevent it from being useful. The regression tree seems promising. It performs slighly better than the linear model. It doesn’t look like it matters whether the regression tree has the pruning step.
Are there any simple algebraic models which improve correlation between ODIN and TEOM PM2.5 measurements?
# linear
cor(data$pm2.5.odin,data$pm2.5) # TODO: figure out where the NAs are coming from
## [1] 0.9024725
# For logs we need to get rid of zeros
data.non.zero <- data[which(data$pm2.5 > 0 & data$pm2.5.odin > 0),]
# polynomial
cor(log(data.non.zero$pm2.5),log(data.non.zero$pm2.5.odin))
## [1] 0.8913095
# log
cor(log(data.non.zero$pm2.5),data.non.zero$pm2.5.odin)
## [1] 0.7046857
# exp
cor(exp(data$pm2.5),data$pm2.5.odin)
## [1] 0.4060808