library(RPostgreSQL)
## Loading required package: DBI
library(rpart)
library(reshape2)
# library(openair)
# library(lubridate)
library(zoo)
## Warning: package 'zoo' was built under R version 3.4.3
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(ggplot2)
# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\repo")
# Database login details (not in GitHub repo)
load("access.Rda")
# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
user=access$user,
password=access$pwd,
host='penap-data.dyndns.org',
dbname='cona',
port=5432)
# Load ECan data (stored locally)
ecan <- read.csv('RangioraWinter2016.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','time','wind.speed','wind.dir','wind.dir.std','wind.speed.std',
'wind.max','co','temp.ground','temp.2m','temp.6m','pm10',
'pm2.5','pm.course')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12')
# ecan$date <- ecan$date - 60*60
ecan$time <- NULL # this is included in date
# There are some NA's in PM2.5. Set these to zero.
# ecan$pm2.5[which(is.na(ecan$pm2.5))] <- 0
head(ecan)
## date wind.speed wind.dir wind.dir.std wind.speed.std
## 1 2016-04-01 00:10:00 0.72 353.9 43.0 0.24
## 2 2016-04-01 00:20:00 0.71 319.6 19.7 0.27
## 3 2016-04-01 00:30:00 0.63 305.4 4.0 0.36
## 4 2016-04-01 00:40:00 1.39 302.9 11.5 0.33
## 5 2016-04-01 00:50:00 0.91 312.4 21.3 0.45
## 6 2016-04-01 01:00:00 1.12 311.6 15.0 0.42
## wind.max co temp.ground temp.2m temp.6m pm10 pm2.5 pm.course
## 1 1.35 0.16 -0.01 11.12 11.13 12.54 4.61 7.93
## 2 1.35 0.15 0.03 11.08 11.06 13.05 5.92 7.13
## 3 1.55 0.14 0.06 11.05 11.00 13.91 6.98 6.92
## 4 2.25 0.10 0.08 11.04 10.95 14.46 6.18 8.28
## 5 2.25 0.10 0.10 10.99 10.90 15.17 5.59 9.58
## 6 2.25 0.09 0.09 10.99 10.90 15.56 6.12 9.45
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
i.serialn as serial, s.name as label,
d.value::numeric as val
FROM data.fixed_data as d,
admin.sensor as s, admin.instrument as i
WHERE s.id = d.sensorid
AND s.instrumentid = i.id
AND i.name = 'ODIN-SD-3'
AND d.siteid = 18
AND (i.serialn = 'ODIN-109')
AND (d.recordtime BETWEEN '2016-09-25 01:50 NZST'
AND '2016-09-25 03:10 NZST')
AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
AND '2016-09-25 03:00 NZST')
AND (s.name = 'PM2.5'
OR s.name = 'PM10'
OR s.name = 'Temperature'
OR s.name = 'RH')
ORDER BY date;")
odin.raw$date <- trunc(odin.raw$date,'min') # get rid of the seconds
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt
odin <- dcast(odin.raw, date~label, value.var='val', fun.aggregate=mean)
x <- odin$date
min <- x[2] - x[1]
y <- ecan[ecan$date >= x[1]-10*min & ecan$date <= x[length(x)]+10*min,'date']
x
## [1] "2016-09-25 01:50:00 NZST" "2016-09-25 01:51:00 NZST"
## [3] "2016-09-25 01:52:00 NZST" "2016-09-25 01:53:00 NZST"
## [5] "2016-09-25 01:54:00 NZST" "2016-09-25 01:55:00 NZST"
## [7] "2016-09-25 01:56:00 NZST" "2016-09-25 01:57:00 NZST"
## [9] "2016-09-25 01:58:00 NZST" "2016-09-25 01:59:00 NZST"
## [11] "2016-09-25 03:00:00 NZDT" "2016-09-25 03:01:00 NZDT"
## [13] "2016-09-25 03:02:00 NZDT" "2016-09-25 03:03:00 NZDT"
## [15] "2016-09-25 03:04:00 NZDT" "2016-09-25 03:05:00 NZDT"
## [17] "2016-09-25 03:06:00 NZDT" "2016-09-25 03:07:00 NZDT"
## [19] "2016-09-25 03:08:00 NZDT" "2016-09-25 03:09:00 NZDT"
y
## [1] "2016-09-25 01:40:00 +12" "2016-09-25 01:50:00 +12"
## [3] "2016-09-25 02:00:00 +12" "2016-09-25 02:10:00 +12"
i <- x[11] # 3:00 NZDT
j <- y[3] # 2:00 GMT+12
i == j # TRUE
## [1] TRUE
# everything seems fine
# Load ODIN measurements from June 2017
siteid = 18 # ecan site
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
i.serialn as serial, s.name as label,
d.value::numeric as val
FROM data.fixed_data as d,
admin.sensor as s, admin.instrument as i
WHERE s.id = d.sensorid
AND s.instrumentid = i.id
AND i.name = 'ODIN-SD-3'
AND d.siteid = 18
AND (i.serialn = 'ODIN-109')
AND (d.recordtime BETWEEN '2016-01-01 00:00 NZST'
AND '2017-01-01 00:00 NZST')
AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
AND '2016-09-25 03:00 NZST')
AND (s.name = 'PM2.5'
OR s.name = 'PM10'
OR s.name = 'Temperature'
OR s.name = 'RH')
ORDER BY date;")
# Time periods including timestamps between 2016-09-25 02:01 and 02:59 (daylight savings)
# daylight savings should also have weirdness in 3 april (3am went to 2am)
# data starts in
# will truncate timestamps at 'day'.
odin.raw$date <- trunc(odin.raw$date,'min') # get rid of the seconds
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt
# Create a data frame for each ODIN
serials <- unique(odin.raw$serial)
odins <- list()
for (i in 1:length(serials)) {
this.odin <- dcast(odin.raw[odin.raw$serial==serials[i],],
date~label, value.var='val', fun.aggregate=mean)
# Make column names easier to type
names(this.odin) <- tolower(names(this.odin))
names(this.odin)[which(names(this.odin)=="temperature")] <- "temp"
odins[[i]] <- this.odin
}
# change ODIN serials from ODIN-101 to odin.101
# the hyphens cause problems by being interpreted as minuses
tidy.serial <- function(serialn) paste0('odin.',substring(serialn,6,8))
names(odins) <- sapply(serials,FUN=tidy.serial)
odin <- odins[[1]]
# change the ODIN measurements from 1 min averages to 1 hour averages
# make sure that there is a odin entry for each minute
dates <- data.frame(date=seq(odin$date[1],odin$date[nrow(odin)],by='mins'))
odin <- merge(odin,dates,by="date",all=TRUE)
if (nrow(odin) != nrow(dates))
print("Oh noes! Gaps in ODIN data!")
# if there are gaps merge "dates" with odin
odin.zoo <- zoo( odin[,2:ncol(odin)] ) # remove date because rollapply only works on numerics
odin.roll.apply <- rollapply(odin.zoo,width=60,by=1,FUN=mean,align="left",na.rm=TRUE)
# NB I don't know what the 'align' parameter does
# Throw out data for which there is not a complete hour window
odin.avgs <- odin[1:nrow(odin.roll.apply),]
odin.avgs[,2:ncol(odin)] <- odin.roll.apply
# NB if there are gaps in the data then this will be screwed up
odin.avgs$date <- odin.avgs$date + 60*60 # take timestamps at the end of the hour average following ECan convention
# Find best correlation between ECan and ODIN
offsets <- -60:60
blank <- rep(NA,length(offset))
corr.df <- data.frame(offset=offsets,temp=blank,pm2.5=blank,pm10=blank)
for (i in 1:nrow(corr.df)) {
offset <- corr.df[i,'offset']
odin.i <- odin.avgs
odin.i$date <- odin.i$date + offset*60
names(odin.i) <- c('date',paste0(names(odin.i),'.odin.109')[2:ncol(odin)])
data.i <- merge(odin.i,ecan,by='date',all=FALSE)
corr.df[i,'temp'] <- cor(data.i$temp.2m,data.i$temp.odin.109,
method="pearson",use="complete.obs")
corr.df[i,'pm2.5'] <- cor(data.i$pm2.5,data.i$pm2.5.odin.109,
method="pearson",use="complete.obs")
corr.df[i,'pm10'] <- cor(data.i$pm10,data.i$pm10.odin.109,
method="pearson",use="complete.obs")
}
print(paste0("Best temperature correlation: ",
corr.df[which.max(corr.df[,'temp']),'offset']))
## [1] "Best temperature correlation: 8"
print(paste0("Best PM2.5 correlation: ",
corr.df[which.max(corr.df[,'pm2.5']),'offset']))
## [1] "Best PM2.5 correlation: 11"
print(paste0("Best PM10 correlation: ",
corr.df[which.max(corr.df[,'pm10']),'offset']))
## [1] "Best PM10 correlation: 10"
corr.melt <- melt(corr.df,id='offset')
ggplot(corr.melt) +
geom_line(aes(x=offset,y=value,colour=variable)) +
scale_colour_manual(values=c("red","green","blue")) +
ylab("Pearson Correlation") +
xlab("ODIN-109 Clock Offset (m)") +
ggtitle("Correlation Between ECan and ODIN Measurements")
# # NB there's a bunch of ECan data missing from 2016-07-14 09:10:00 to 16:40:00
best.offset <- corr.df[which.max(corr.df[,'temp']),'offset']
odin.bo <- odin.avgs # bo = best offset
odin.bo$date <- odin.bo$date + best.offset*60
names(odin.bo) <- c('date',paste0(names(odin.bo),'.odin.109')[2:ncol(odin)])
data <- merge(odin.bo,ecan,by='date',all=FALSE)
head(data) #
## date pm10.odin.109 pm2.5.odin.109 rh.odin.109
## 1 2016-07-12 16:00:00 11.06150 2.744034 41.99333
## 2 2016-07-12 16:10:00 10.54561 2.820338 42.13167
## 3 2016-07-12 16:20:00 10.90140 3.329028 42.33000
## 4 2016-07-12 16:30:00 11.02592 3.456201 42.48833
## 5 2016-07-12 16:40:00 10.93698 3.888588 42.56333
## 6 2016-07-12 16:50:00 12.30675 5.109446 42.70000
## temp.odin.109 wind.speed wind.dir wind.dir.std wind.speed.std wind.max
## 1 16.89500 0.95 75.4 22.9 0.49 2.35
## 2 16.70000 0.67 104.5 46.5 0.38 1.85
## 3 16.47667 1.22 78.3 22.5 0.44 2.35
## 4 16.25833 0.32 37.2 25.4 0.27 0.85
## 5 15.89167 0.56 33.6 1.9 0.22 0.95
## 6 15.30000 0.50 66.7 13.8 0.24 1.15
## co temp.ground temp.2m temp.6m pm10 pm2.5 pm.course
## 1 0.10 4.96 15.80 10.84 13.25 2.84 10.41
## 2 0.09 5.01 15.68 10.67 13.70 3.08 10.62
## 3 0.12 4.72 14.92 10.21 13.91 2.88 11.03
## 4 0.12 3.65 13.72 10.07 14.71 3.28 11.44
## 5 0.19 3.06 12.40 9.34 16.41 3.48 12.93
## 6 0.66 1.38 9.94 8.56 19.73 4.68 15.05
tail(data)
## date pm10.odin.109 pm2.5.odin.109 rh.odin.109
## 16505 2016-11-04 07:40:00 10.314354 2.693165 72.77667
## 16506 2016-11-04 07:50:00 9.780676 2.642296 71.85333
## 16507 2016-11-04 08:00:00 9.834044 2.184475 70.66500
## 16508 2016-11-04 08:10:00 9.638362 2.133606 69.45333
## 16509 2016-11-04 08:20:00 9.140263 1.930129 68.15000
## 16510 2016-11-04 08:30:00 8.944581 1.904695 66.47167
## temp.odin.109 wind.speed wind.dir wind.dir.std wind.speed.std
## 16505 11.09500 1.53 297.2 17.0 0.41
## 16506 11.54667 1.26 313.1 19.4 0.54
## 16507 12.07833 1.62 308.0 19.0 0.45
## 16508 12.67500 1.79 292.2 17.0 0.46
## 16509 13.43167 1.55 294.9 10.8 0.40
## 16510 14.27667 1.65 277.8 22.4 0.65
## wind.max co temp.ground temp.2m temp.6m pm10 pm2.5 pm.course
## 16505 2.75 NA 0.17 9.41 9.24 4.12 2.01 2.11
## 16506 3.05 NA 0.19 9.57 9.38 4.14 2.36 1.78
## 16507 2.95 NA 0.23 9.74 9.52 4.41 2.76 1.65
## 16508 3.15 NA 0.22 9.85 9.63 5.06 3.06 2.01
## 16509 2.65 NA 0.22 10.02 9.80 5.53 2.91 2.62
## 16510 3.15 NA 0.26 10.32 10.06 5.75 2.94 2.81
# add daily mean to plot
# use 24 hour periods starting and ending at noon
date.lt <- as.POSIXlt(data$date)
noons <- data[which(date.lt$hour==12 & date.lt$min==0),'date']
daily.model <- data.frame()
for (i in 2:length(noons)) {
day.data <- data[data$date >= noons[i-1] & data$date <= noons[i],]
day.pm2.5 <- day.data$pm2.5
lower.bound <- quantile(day.pm2.5,na.rm=TRUE)[2] # 25% quantile
upper.bound <- quantile(day.pm2.5,na.rm=TRUE)[4] # 75% quantile
day.data <- day.data[day.data$pm2.5<=upper.bound & day.data$pm2.5>=lower.bound,]
day <- day.data$date[1]
# day <- trunc(day.data$date[1],'day')
# day <- as.POSIXct(day)
day.coefs <- summary(lm(pm2.5~pm2.5.odin.109+rh.odin.109+temp.odin.109,
data=day.data))$coefficients
daily.model[i,'day'] <- day
daily.model[i,'mean'] <- mean(day.data$pm2.5)
daily.model[i,'max'] <- max(day.data$pm2.5)
# daily.model[i,c('mean','max')] <- c(mean(day.data$pm2.5), max(day.data$pm2.5))
daily.model[i,'pm2.5'] <- day.coefs['pm2.5.odin.109',1]
daily.model[i,'temp'] <- day.coefs['temp.odin.109',1]
daily.model[i,'rh'] <- day.coefs['rh.odin.109',1]
daily.model[i,'cnst'] <- day.coefs['(Intercept)',1]
daily.model[i,'pm2.5.std'] <- day.coefs['pm2.5.odin.109',2]
daily.model[i,'temp.std'] <- day.coefs['temp.odin.109',2]
daily.model[i,'rh.std'] <- day.coefs['rh.odin.109',2]
daily.model[i,'cnst.std'] <- day.coefs['(Intercept)',2]
}
# plot linear models over time
for (var in c('pm2.5','rh','temp','cnst')) {
var.std <- paste0(var,'.std')
eval(parse(text=paste("p <- ggplot(daily.model, aes(x=day,y=",var,")) +",
"geom_errorbar(aes(ymin=",var,"-",var.std,", ymax=",var,"+",var.std,")) + ",
"geom_point() + ",
'scale_colour_manual(values=c("red","blue")) + ',
'ylab("Coefficient") +',
'xlab("Day") +',
'ggtitle("Coefficients for',var,'over 24 Hour Periods")'
)))
print(p)
}
## Warning: Removed 9 rows containing missing values (geom_errorbar).
## Warning: Removed 9 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_errorbar).
## Warning: Removed 9 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_errorbar).
## Warning: Removed 9 rows containing missing values (geom_point).
## Warning: Removed 9 rows containing missing values (geom_errorbar).
## Warning: Removed 9 rows containing missing values (geom_point).
# plot mean and max
for (var in c('mean','max')) {
eval(parse(text=paste("p <- ggplot(daily.model, aes(x=day,y=",var,")) +",
"geom_point() + ",
'scale_colour_manual(values=c("red","blue")) + ',
'ylab("Coefficient") +',
'xlab("Day") +',
'ggtitle("PM2.5',var,'over 24 Hour Periods")'
)))
print(p)
}
## Warning: Removed 20 rows containing missing values (geom_point).
## Warning: Removed 20 rows containing missing values (geom_point).
# dly.mdl.cnt <- daily.model
# dly.mdl.cnt[,'pm2.5'] <- dly.mdl.cnt[,'pm2.5'] - mean(dly.mdl.cnt[,'pm2.5'])
# dly.mdl.cnt[,'rh'] <- dly.mdl.cnt[,'rh'] - mean(dly.mdl.cnt[,'rh'])
# dly.mdl.cnt[,'temp'] <- dly.mdl.cnt[,'temp'] - mean(dly.mdl.cnt[,'temp'])
#
# cor(daily.model$day,daily.model$pm2.5,method='pearson',use="complete.obs")
# cor(daily.model$mean,daily.model$pm2.5,method='pearson',use="complete.obs")
# cor(daily.model$max,daily.model$pm2.5,method='pearson',use="complete.obs")
# cor(daily.model$day,daily.model$mean,method='pearson',use="complete.obs")
# cor(daily.model$day,daily.model$max,method='pearson',use="complete.obs")
# build linear model of means
daily.model.model <- daily.model[,c('mean','pm2.5','day','max')]
daily.model.model[,c('mean','day','max')] <- scale(daily.model.model[,c('mean','day','max')])
summary(lm(pm2.5~day+mean,data=daily.model.model))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3927134 0.02935974 13.375916 3.339767e-23
## day -0.2342782 0.03189908 -7.344356 8.524852e-11
## mean 0.1353568 0.03167196 4.273711 4.725248e-05
summary(lm(pm2.5~day+max,data=daily.model.model))$coefficients
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.3912651 0.02801613 13.965708 2.350292e-24
## day -0.2036112 0.03202740 -6.357407 8.037320e-09
## max 0.1713337 0.03179936 5.387961 5.553032e-07
# normal distribution?
# maybe take exp to get from log normal to normal
# try exp and log see which looks normal
# normalize data
data.norm <- data
data.norm[,2:ncol(data)] <- scale(data[,2:ncol(data)])
# different combinations of input variables to try
# without co
form.odin <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109
just.pm2.5.rh <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109
just.pm2.5.temp <- pm2.5 ~ pm2.5.odin.109 + temp.odin.109
form.ecan <- pm2.5 ~ pm2.5.odin.109 + wind.speed + wind.dir + temp.2m
form.all <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109 +
wind.speed + wind.dir + temp.2m
# with co
form.ecan.co <- pm2.5 ~ pm2.5.odin.109 + wind.speed + wind.dir + co + temp.2m
form.all.co <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109 +
wind.speed + wind.dir + co + temp.2m
# is co a better predictor than pm2.5.odin.109?
just.co <- pm2.5 ~ co
just.pm2.5 <- pm2.5 ~ pm2.5.odin.109
just.co.pm2.5 <- pm2.5 ~ co + pm2.5.odin.109
forms <- c(form.odin,just.pm2.5.rh,just.pm2.5.temp,form.ecan,form.all,
form.ecan.co,form.all.co,just.co,just.pm2.5,just.co.pm2.5)
names(forms) <- c("odin","just.pm2.5.rh","just.pm2.5.temp","ecan","all","ecan.co",
"all.co","just.co","just.pm2.5","just.co.pm2.5")
r.sqr <- data.frame(form=names(forms), r.sqr=rep(NA,length(forms)))
for (i in 1:length(forms)) {
form <- forms[[i]]
# linear
this.lm <- summary(lm(form,data=data.norm))
r.sqr[i,2] <- this.lm$r.squared
coefs <- this.lm$coefficients[,1]
coefs <- coefs[2:length(coefs)] # get rid of intercept
var.import <- abs(coefs) # could also do square
var.import.df <- data.frame(linear=var.import,tree=NA)
# regression tree
tree <- rpart(form,method="anova",data=data)$variable.importance
var.import.df[names(tree),'tree'] <- tree
# turn into percentages
sums <- colSums(var.import.df,na.rm=TRUE)
var.import.df[,1] <- round(var.import.df[,1] / sums[1] * 100, digits=1)
var.import.df[,2] <- round(var.import.df[,2] / sums[2] * 100, digits=1)
print(var.import.df)
}
## linear tree
## pm2.5.odin.109 69.0 93.5
## rh.odin.109 15.9 4.2
## temp.odin.109 15.1 2.4
## linear tree
## pm2.5.odin.109 90.7 94.6
## rh.odin.109 9.3 5.4
## linear tree
## pm2.5.odin.109 93.2 97.6
## temp.odin.109 6.8 2.4
## linear tree
## pm2.5.odin.109 87.7 96.6
## wind.speed 4.5 0.5
## wind.dir 2.0 0.7
## temp.2m 5.8 2.2
## linear tree
## pm2.5.odin.109 65.4 90.0
## rh.odin.109 15.2 4.0
## temp.odin.109 10.6 2.3
## wind.speed 2.3 0.8
## wind.dir 2.5 0.8
## temp.2m 4.0 2.2
## linear tree
## pm2.5.odin.109 64.7 77.9
## wind.speed 0.4 1.4
## wind.dir 2.0 0.8
## co 32.8 18.2
## temp.2m 0.0 1.8
## linear tree
## pm2.5.odin.109 52.4 74.2
## rh.odin.109 9.3 3.6
## temp.odin.109 11.0 1.6
## wind.speed 0.1 1.2
## wind.dir 2.2 0.8
## co 21.2 17.1
## temp.2m 3.8 1.6
## linear tree
## co 100 100
## linear tree
## pm2.5.odin.109 100 100
## linear tree
## co 33.7 18.9
## pm2.5.odin.109 66.3 81.1
r.sqr
## form r.sqr
## 1 odin 0.8592077
## 2 just.pm2.5.rh 0.8393297
## 3 just.pm2.5.temp 0.8342428
## 4 ecan 0.8363499
## 5 all 0.8611697
## 6 ecan.co 0.8781256
## 7 all.co 0.8865197
## 8 just.co 0.6947977
## 9 just.pm2.5 0.8308080
## 10 just.co.pm2.5 0.8777196
# Models
lm.model <- function(train,test,form=y~x) {
this.lm <- lm(form,train)
predictions <- predict(this.lm, test)
}
tree.model <- function(train,test,form=y~x) {
fit <- rpart(form, method="anova", data=train)
predict(fit, test, "vector")
}
ptree.model <- function(train,test,form=y~x) {
fit <- rpart(form, method="anova", data=train)
pfit<- prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"])
predict(fit, test, "vector")
}
# kernel.n.model <- function(n,train,test,form=y~x) {
# kern <- ksmooth(train$x,train$y,kernel='normal',x.points=seq(1,500,n))
# kern$y[ as.integer(test$x/n) ]
# }
# kernel.2.model <- function(train,test) kernel.n.model(2,train,test)
# kernel.5.model <- function(train,test) kernel.n.model(5,train,test)
# kernel.10.model <- function(train,test) kernel.n.model(10,train,test)
models <- list(lm.model,tree.model,ptree.model)#,kernel.2.model,kernel.5.model,
# kernel.10.model)
names(models) <- c("lm.model","tree.model","ptree.model")#,"kernel.2.model",
# "kernel.5.model","kernel.10.model")
# Combinations of input variables (formulae)
form.pm2.5 <- pm2.5 ~ pm2.5.odin.109
form.odin <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109
form.ecan <- pm2.5 ~ pm2.5.odin.109 + wind.speed + wind.dir + temp.2m + co
form.all <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109 +
wind.speed + wind.dir + temp.2m + co
forms <- c(form.odin,form.ecan,form.all)
names(forms) <- c("odin","ecan","all")
# Divisions into training and testing (subsets)
len <- nrow(data)
third <- as.integer(len/3)
day <- 6*24 # data is in 10 minute intervals
week <- day*7
subsets <- list(1:len, 1:third, third:(2*third), 1:week,
(2*third):len,1)
names(subsets) <- c("Whole","1st Third","2nd Third","3rd Third","1st Week","Avg")
# Setting up the array which will store error terms
dim.names <- list(names(models),names(subsets),names(forms))
# dim.dim <- c(length(models),length(subsets),length(forms))
error.array <- array(0,dim=lapply(dim.names,length),
dimnames = dim.names)
na.array <- array(0,dim=lapply(dim.names,length),
dimnames = dim.names)
# Crunch los numeros
for (j in 1:(length(subsets)-1)) {
train.data <- data[subsets[[j]],]
if (j==1)
test.data <- data
else
test.data <- data[-subsets[[j]],]
for (i in 1:length(models)) {
for (k in 1:length(forms)) {
predictions <- models[[i]](train.data,test.data,form=forms[[k]])
error.array[i,j,k] <- mean( (predictions-test.data$pm2.5)^2, na.rm=TRUE )
na.array[i,j,k] <- length(which(is.na(predictions)))
}
}
}
error.array
## , , odin
##
## Whole 1st Third 2nd Third 3rd Third 1st Week Avg
## lm.model 28.22503 32.61005 33.50414 57.35282 83.94223 0
## tree.model 28.43287 32.23724 50.48160 72.01138 165.07290 0
## ptree.model 28.43287 32.23724 50.48160 72.01138 165.07290 0
##
## , , ecan
##
## Whole 1st Third 2nd Third 3rd Third 1st Week Avg
## lm.model 24.58595 19.02540 29.43263 62.17012 53.87741 0
## tree.model 31.02946 23.09258 43.51766 72.07228 134.93300 0
## ptree.model 31.02946 23.09258 43.51766 72.07228 134.93300 0
##
## , , all
##
## Whole 1st Third 2nd Third 3rd Third 1st Week Avg
## lm.model 22.89259 25.19581 27.80165 53.42867 50.69385 0
## tree.model 26.44629 27.59459 43.51766 72.07228 145.87999 0
## ptree.model 26.44629 27.59459 43.51766 72.07228 145.87999 0
na.array
## , , odin
##
## Whole 1st Third 2nd Third 3rd Third 1st Week Avg
## lm.model 174 132 108 174 108 0
## tree.model 0 0 0 0 0 0
## ptree.model 0 0 0 0 0 0
##
## , , ecan
##
## Whole 1st Third 2nd Third 3rd Third 1st Week Avg
## lm.model 437 330 303 385 241 0
## tree.model 0 0 0 0 0 0
## ptree.model 0 0 0 0 0 0
##
## , , all
##
## Whole 1st Third 2nd Third 3rd Third 1st Week Avg
## lm.model 437 330 303 385 241 0
## tree.model 0 0 0 0 0 0
## ptree.model 0 0 0 0 0 0
try taking out first and last quartiles to remove outliers, errors should get closer to “2nd third” values, as this is the most mediumish period
days <- 1:as.integer(nrow(data)/day)
lm.success <- data.frame(days=days,r.sqr=rep(NA,length(days)),err=rep(NA,length(days)))
for (d in days) {
d.lm <- lm(form.odin,data=data[1:d,])
}
ggplot(lm.success,aes(x=days,y=r.sqr))
# plot(data$wind.speed,data$pm2.5)
# plot(data$co,data$pm2.5)
# # look for gaps in data
# to.ranges <- function(l) {
# # convert "1 2 3 4" to "1 - 4" or something
# l2 <- c()
# for (i in 2:(length(l)-1)) {
# if (l[i]==l[i+1]) {
# if (l[i-1]!=0)
# }
# }
# }
# for (i in 2:ncol(data)) {
# print(names(data)[i])
# print(which(is.na(data[,i])))
# }