library(RPostgreSQL)
## Loading required package: DBI
library(rpart)
library(reshape2)
library(zoo)
## Warning: package 'zoo' was built under R version 3.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggplot2)
library(mblm)
library(MASS)
library(rpart)
# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\repo")
# Database login details (not in GitHub repo)
load("access.Rda")
# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
user=access$user,
password=access$pwd,
host='penap-data.dyndns.org',
dbname='cona',
port=5432)
# Load ECan data (stored locally)
ecan <- read.csv('RangioraWinter2016.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','time','wind.speed','wind.dir','wind.dir.std','wind.speed.std',
'wind.max','co','temp.ground','temp.2m','temp.6m','pm10',
'pm2.5','pm.course')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
ecan$time <- NULL # this is included in date
# Load ODIN data
# Siteid = 18 is ECan site
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
i.serialn as serial, s.name as label,
d.value::numeric as val
FROM data.fixed_data as d,
admin.sensor as s, admin.instrument as i
WHERE s.id = d.sensorid
AND s.instrumentid = i.id
AND i.name = 'ODIN-SD-3'
AND d.siteid = 18
AND (i.serialn = 'ODIN-102'
OR i.serialn = 'ODIN-103'
OR i.serialn = 'ODIN-105'
OR i.serialn = 'ODIN-107'
OR i.serialn = 'ODIN-108'
OR i.serialn = 'ODIN-109'
OR i.serialn = 'ODIN-111'
OR i.serialn = 'ODIN-113'
OR i.serialn = 'ODIN-114'
OR i.serialn = 'ODIN-115')
AND (d.recordtime BETWEEN '2016-07-01 00:00 NZST'
AND '2016-08-01 00:00 NZST')
AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
AND '2016-09-25 03:00 NZST')
AND (s.name = 'PM2.5'
OR s.name = 'Temperature'
OR s.name = 'RH')
ORDER BY date;")
# Time periods including timestamps between 2016-09-25 02:01 and 02:59
# will truncate timestamps at 'day'.
# Get rid of the seconds
odin.raw$date <- trunc(odin.raw$date,'min')
# The above converted it to POSIXlt
odin.raw$date <- as.POSIXct(odin.raw$date)
# Fix daylight savings bug
ds.time <- as.POSIXct('09/25/2016 02:00',format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
odin.raw[odin.raw$date > ds.time, 'date'] <- odin.raw[odin.raw$date > ds.time, 'date'] + 60*60
serials <- unique(odin.raw$serial)
data <- ecan
for (i in 1:length(serials)) {
# Get data for one ODIN in wide format
odin <- dcast(odin.raw[odin.raw$serial==serials[i],],
date~label, value.var='val', fun.aggregate=mean)
# Make column names easier to type
names(odin) <- tolower(names(odin))
names(odin)[which(names(odin)=="temperature")] <- "temp"
# Change columns from "temp"" to "temp.odin.100", etc
serial <- substring(serials[i],6,8)
names(odin) <- paste0(names(odin),'.odin.',serial)
# The 'date.odin.100' back to just 'date'
date.odin.100 <- paste0('date.odin.',serial)
odin[,'date'] <- odin[,date.odin.100]
odin[,date.odin.100] <- NULL
# Make sure that there is a odin entry for each minute
dates <- data.frame(date=seq(odin$date[1],odin$date[nrow(odin)],by='mins'))
odin <- merge(odin,dates,by="date",all=TRUE)
# Change the ODIN measurements from 1 min averages to 1 hour averages
date.col <- which(names(odin)=='date')
odin.zoo <- zoo( odin[,-date.col] ) # remove date because rollapply only works on numerics
odin.zoo <- rollapply(odin.zoo,width=60,by=1,FUN=mean,align="left",na.rm=TRUE)
# NB I don't know what the 'align' parameter does
odin <- odin[1:nrow(odin.zoo),]
odin[,-date.col] <- odin.zoo
# Take timestamps at the end of the hour average following ECan convention
odin$date <- odin$date + 60*60
# Merge with the rest of the data
data <- merge(data,odin,by='date',all=FALSE)
}
# Split dataset into halves
pivot <- as.integer(nrow(data)/2)
data.1 <- data[1:pivot,]
data.2 <- data[(pivot+1):nrow(data),]
# How many data points?
nrow(data.1)
## [1] 773
nrow(data.2)
## [1] 774
# Perform Kolmogorov-Smirnov Test
ks.test(data.1$pm2.5,data.2$pm2.5)$p.value
## Warning in ks.test(data.1$pm2.5, data.2$pm2.5): p-value will be approximate
## in the presence of ties
## [1] 0
ks.test(data.1$temp.2m,data.2$temp.2m)$p.value
## Warning in ks.test(data.1$temp.2m, data.2$temp.2m): p-value will be
## approximate in the presence of ties
## [1] 0
ks.test(data.1$wind.speed,data.2$wind.speed)$p.value
## Warning in ks.test(data.1$wind.speed, data.2$wind.speed): p-value will be
## approximate in the presence of ties
## [1] 0
# Plot the result
for (var in c('pm2.5','wind.speed','temp.2m')) {
dens.1 <- density(data.1[,var],na.rm=TRUE)
df.1 <- data.frame(x=dens.1$x,y=dens.1$y)
df.1[,'data.set'] <- rep('data.1',nrow(df.1))
dens.2 <- density(data.2[,var],na.rm=TRUE)
df.2 <- data.frame(x=dens.2$x,y=dens.2$y)
df.2[,'data.set'] <- rep('data.2',nrow(df.2))
densities <- rbind(df.1,df.2)
plt <- ggplot(densities) +
geom_line(aes(x=x,y=y,colour=data.set)) +
scale_colour_manual(values=c("red","blue")) +
ggtitle(var)
print(plt)
}



ggplot(data.1,aes(pm2.5.odin.102,pm2.5)) +
geom_point(alpha=0.05)
## Warning: Removed 64 rows containing missing values (geom_point).

ggplot(data.1,aes(rh.odin.102,pm2.5)) +
geom_point(alpha=0.05)
## Warning: Removed 64 rows containing missing values (geom_point).

ggplot(data.1,aes(temp.odin.102,pm2.5)) +
geom_point(alpha=0.05)
## Warning: Removed 64 rows containing missing values (geom_point).

models <- list(NA,NA,NA,NA,NA)
names(models) <- c("linear","robust linear","quadratic","cubic","reg tree")
mse <- array(NA,dim=c(length(serials)+1,length(models),3),
dimnames=list(c(serials,'avg'),names(models),c('data.1','data.2','avg')))
for (j in 1:2) {
for (i in 1:length(serials)) {
# Does test data = data.1 or data.2?
test <- 3-j
# Linear formula
in.vars <- paste0(c('pm2.5','temp','rh'),'.odin.',substring(serials[i],6,8))
eval(parse(text=paste0("form <- pm2.5 ~ ",paste0(in.vars,collapse='+'))))
# Quadratic formula
quad.in.vars <- paste0("poly(",in.vars,",2)")
eval(parse(text=paste0("quad.form <- pm2.5 ~ ",paste0(quad.in.vars,collapse='+'))))
# Cubic formula
cub.in.vars <- paste0("poly(",in.vars,",3)")
eval(parse(text=paste0("cub.form <- pm2.5 ~ ",paste0(cub.in.vars,collapse='+'))))
# Test and training data
eval(parse(text=paste0("test.data <- data.",j)))
eval(parse(text=paste0("train.data <- data.",3-j)))
# Linear model
models[['linear']] <- lm(form,data=train.data)
y.hat <- predict(models[['linear']],test.data)
mse[i,'linear',j] <- mean((test.data$pm2.5-y.hat)^2,na.rm=TRUE)
# Robust linear model
models[['robust linear']] <- rlm(form,data=train.data)
y.hat <- predict(models[['robust linear']],test.data)
mse[i,'robust linear',j] <- mean((test.data$pm2.5-y.hat)^2,na.rm=TRUE)
# Quadratic model
models[['quadratic']] <- lm(quad.form,data=train.data)
y.hat <- predict(models[['quadratic']],test.data)
mse[i,'quadratic',j] <- mean((test.data$pm2.5-y.hat)^2,na.rm=TRUE)
# Cubic model
models[['cubic']] <- lm(cub.form,data=train.data)
y.hat <- predict(models[['cubic']],test.data)
mse[i,'cubic',j] <- mean((test.data$pm2.5-y.hat)^2,na.rm=TRUE)
# Regression Tree
models[['reg tree']] <- rpart(form,data=train.data,method='anova')
y.hat <- predict(models[['reg tree']],test.data,'vector')
mse[i,'reg tree',j] <- mean((test.data$pm2.5-y.hat)^2,na.rm=TRUE)
# Plotting models (finish if I have time)
# # Datapoints for plot lines
# pm2.5 <- data[,in.vars[1]]
# pm2.5.range <- min(pm2.5):max(pm2.5)
# plot.df <- data.frame(x=NA,y=NA,model=NA)
# for (k in 1:length(models)) {
# y <- predict(models[[k]],)
# x <- data.frame(x)
# plot.df <- rbind(plot.df,x)
# }
}
# Find average accross ODINs
for (i in 1:length(models)) {
mse['avg',i,j] <- mean(mse[,i,j],na.rm=TRUE)
}
}
# Fill in the averages matrix
for (i in 1:(length(serials)+1)) {
for (j in 1:length(models)) {
mse[i,j,3] <- mean(c(mse[i,j,1],mse[i,j,2]))
}
}
mse <- round(mse,2)
mse
## , , data.1
##
## linear robust linear quadratic cubic reg tree
## ODIN-102 57.86 61.03 59.15 53.24 105.99
## ODIN-103 41.63 42.34 42.04 43.06 85.71
## ODIN-115 39.06 41.57 38.27 36.03 92.11
## ODIN-105 106.05 116.22 107.19 112.18 124.62
## ODIN-107 64.27 64.76 61.81 65.19 80.05
## ODIN-108 55.33 56.89 61.78 66.21 99.70
## ODIN-114 65.36 57.55 53.33 87.94 71.09
## ODIN-111 58.14 54.23 61.96 66.77 76.67
## ODIN-113 84.52 84.66 81.40 84.05 92.54
## ODIN-109 53.01 50.89 51.41 46.36 78.98
## avg 62.52 63.02 61.83 66.10 90.75
##
## , , data.2
##
## linear robust linear quadratic cubic reg tree
## ODIN-102 46.41 39.15 53.79 66.56 90.42
## ODIN-103 37.56 34.27 38.70 42.80 79.47
## ODIN-115 37.56 33.79 40.23 45.54 64.64
## ODIN-105 94.40 68.84 156.71 157.42 134.71
## ODIN-107 84.32 72.17 93.22 94.67 108.53
## ODIN-108 50.66 47.87 60.07 70.13 96.43
## ODIN-114 78.05 76.86 73.09 63.95 92.10
## ODIN-111 71.26 58.23 84.01 93.30 95.79
## ODIN-113 91.63 78.19 98.23 86.95 123.64
## ODIN-109 48.40 41.79 45.48 41.62 70.18
## avg 64.03 55.11 74.35 76.29 95.59
##
## , , avg
##
## linear robust linear quadratic cubic reg tree
## ODIN-102 52.14 50.09 56.47 59.90 98.21
## ODIN-103 39.59 38.31 40.37 42.93 82.59
## ODIN-115 38.31 37.68 39.25 40.79 78.37
## ODIN-105 100.23 92.53 131.95 134.80 129.66
## ODIN-107 74.30 68.46 77.51 79.93 94.29
## ODIN-108 52.99 52.38 60.93 68.17 98.06
## ODIN-114 71.71 67.21 63.21 75.95 81.59
## ODIN-111 64.70 56.23 72.98 80.04 86.23
## ODIN-113 88.07 81.43 89.81 85.50 108.09
## ODIN-109 50.71 46.34 48.45 43.99 74.58
## avg 63.27 59.06 68.09 71.20 93.17
# ggplot(data.1,aes(pm2.5.odin.102,pm2.5)) +
# geom_point(alpha=0.05) +
# geom_smooth(method='lm',formula=y~x)
#
# ggplot(data.1,aes(pm2.5.odin.102,pm2.5)) +
# geom_point(alpha=0.05) +
# geom_smooth(method='rlm',formula=y~x)
#
# ggplot(data.1,aes(pm2.5.odin.102,pm2.5)) +
# geom_point(alpha=0.05) +
# geom_smooth(method='mblm',formula=y~x)