Load data

library(RPostgreSQL)
## Loading required package: DBI
library(rpart)
library(reshape2)
# library(openair)
# library(lubridate)
library(zoo)
## Warning: package 'zoo' was built under R version 3.4.3
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(ggplot2)

# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\repo")

# Database login details (not in GitHub repo)
load("access.Rda")

# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
                 user=access$user,
                 password=access$pwd,
                 host='penap-data.dyndns.org',
                 dbname='cona',
                 port=5432)
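# (Release the connection with dbDisconnect(con) once the queries below
# have run.)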
# Load ECan data (stored locally)
ecan <- read.csv('RangioraWinter2016.csv',stringsAsFactors=FALSE)
names(ecan) <- c('date','time','wind.speed','wind.dir','wind.dir.std','wind.speed.std',
                 'wind.max','co','temp.ground','temp.2m','temp.6m','pm10',
                 'pm2.5','pm.course')
ecan$date <- as.POSIXct(ecan$date,format = '%m/%d/%Y %H:%M', tz='Etc/GMT-12') # NB POSIX sign convention: Etc/GMT-12 means UTC+12 (NZST)
# ecan$date <- ecan$date - 60*60
ecan$time <- NULL # this is included in date
# There are some NAs in PM2.5. They are left as NA; the line below would zero them.
# ecan$pm2.5[which(is.na(ecan$pm2.5))] <- 0
head(ecan)
##                  date wind.speed wind.dir wind.dir.std wind.speed.std
## 1 2016-04-01 00:10:00       0.72    353.9         43.0           0.24
## 2 2016-04-01 00:20:00       0.71    319.6         19.7           0.27
## 3 2016-04-01 00:30:00       0.63    305.4          4.0           0.36
## 4 2016-04-01 00:40:00       1.39    302.9         11.5           0.33
## 5 2016-04-01 00:50:00       0.91    312.4         21.3           0.45
## 6 2016-04-01 01:00:00       1.12    311.6         15.0           0.42
##   wind.max   co temp.ground temp.2m temp.6m  pm10 pm2.5 pm.course
## 1     1.35 0.16       -0.01   11.12   11.13 12.54  4.61      7.93
## 2     1.35 0.15        0.03   11.08   11.06 13.05  5.92      7.13
## 3     1.55 0.14        0.06   11.05   11.00 13.91  6.98      6.92
## 4     2.25 0.10        0.08   11.04   10.95 14.46  6.18      8.28
## 5     2.25 0.10        0.10   10.99   10.90 15.17  5.59      9.58
## 6     2.25 0.09        0.09   10.99   10.90 15.56  6.12      9.45
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
                            i.serialn as serial, s.name as label,
                            d.value::numeric as val
                           FROM data.fixed_data as d,  
                            admin.sensor as s, admin.instrument as i
                           WHERE s.id = d.sensorid
                            AND s.instrumentid = i.id
                            AND i.name = 'ODIN-SD-3'
                            AND d.siteid = 18
                            AND (i.serialn = 'ODIN-109')
                            AND (d.recordtime BETWEEN '2016-09-25 01:50 NZST'
                             AND '2016-09-25 03:10 NZST')
                            AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
                             AND '2016-09-25 03:00 NZST')
                            AND (s.name = 'PM2.5'
                             OR s.name = 'PM10'
                             OR s.name = 'Temperature'
                             OR s.name = 'RH')
                           ORDER BY date;") 
odin.raw$date <- trunc(odin.raw$date,'min') # get rid of the seconds
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt
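# A quick toy check of that round trip (hypothetical timestamp, not run here):
# t1 <- as.POSIXct("2016-09-25 01:50:37", tz="Etc/GMT-12")
# class(trunc(t1,'min')) # "POSIXlt" "POSIXt" -- hence the re-cast above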

odin <- dcast(odin.raw, date~label, value.var='val', fun.aggregate=mean)
x <- odin$date
step <- x[2] - x[1] # one-minute spacing (avoids shadowing base::min)
y <- ecan[ecan$date >= x[1]-10*step & ecan$date <= x[length(x)]+10*step,'date']
x
##  [1] "2016-09-25 01:50:00 NZST" "2016-09-25 01:51:00 NZST"
##  [3] "2016-09-25 01:52:00 NZST" "2016-09-25 01:53:00 NZST"
##  [5] "2016-09-25 01:54:00 NZST" "2016-09-25 01:55:00 NZST"
##  [7] "2016-09-25 01:56:00 NZST" "2016-09-25 01:57:00 NZST"
##  [9] "2016-09-25 01:58:00 NZST" "2016-09-25 01:59:00 NZST"
## [11] "2016-09-25 03:00:00 NZDT" "2016-09-25 03:01:00 NZDT"
## [13] "2016-09-25 03:02:00 NZDT" "2016-09-25 03:03:00 NZDT"
## [15] "2016-09-25 03:04:00 NZDT" "2016-09-25 03:05:00 NZDT"
## [17] "2016-09-25 03:06:00 NZDT" "2016-09-25 03:07:00 NZDT"
## [19] "2016-09-25 03:08:00 NZDT" "2016-09-25 03:09:00 NZDT"
y
## [1] "2016-09-25 01:40:00 +12" "2016-09-25 01:50:00 +12"
## [3] "2016-09-25 02:00:00 +12" "2016-09-25 02:10:00 +12"
i <- x[11] # 3:00 NZDT
j <- y[3] # 2:00 GMT+12
i == j # TRUE
## [1] TRUE
# everything seems fine
# Load ODIN measurements for the whole of 2016
siteid = 18 # ECan site (hardcoded as d.siteid = 18 in the query below)
odin.raw <-dbGetQuery(con,"SELECT d.recordtime AT TIME ZONE 'NZST' AS date,
                            i.serialn as serial, s.name as label,
                            d.value::numeric as val
                           FROM data.fixed_data as d,  
                            admin.sensor as s, admin.instrument as i
                           WHERE s.id = d.sensorid
                            AND s.instrumentid = i.id
                            AND i.name = 'ODIN-SD-3'
                            AND d.siteid = 18
                            AND (i.serialn = 'ODIN-109')
                            AND (d.recordtime BETWEEN '2016-01-01 00:00 NZST'
                             AND '2017-01-01 00:00 NZST')
                            AND NOT (d.recordtime BETWEEN '2016-09-25 02:00 NZST'
                             AND '2016-09-25 03:00 NZST')
                            AND (s.name = 'PM2.5'
                             OR s.name = 'PM10'
                             OR s.name = 'Temperature'
                             OR s.name = 'RH')
                           ORDER BY date;") 

# The query above excludes timestamps between 2016-09-25 02:00 and 03:00 NZST
# (the daylight saving transition checked earlier). The 3 April transition
# (03:00 NZDT back to 02:00 NZST) may also produce oddities.
# The merged record runs from 2016-07-12 to 2016-11-04 (see head/tail below).
# Timestamps are truncated to the minute below.

odin.raw$date <- trunc(odin.raw$date,'min') # get rid of the seconds
odin.raw$date <- as.POSIXct(odin.raw$date) # the above converted it to POSIXlt
# Create a data frame for each ODIN
serials <- unique(odin.raw$serial)
odins <- list()
for (i in 1:length(serials)) {
  this.odin <- dcast(odin.raw[odin.raw$serial==serials[i],], 
                     date~label, value.var='val', fun.aggregate=mean)
  # Make column names easier to type
  names(this.odin) <- tolower(names(this.odin))
  names(this.odin)[which(names(this.odin)=="temperature")] <- "temp"
  odins[[i]] <- this.odin
}
# change ODIN serials from ODIN-101 to odin.101
# the hyphens cause problems by being interpreted as minuses
tidy.serial <- function(serialn) paste0('odin.',substring(serialn,6,8))
names(odins) <- sapply(serials,FUN=tidy.serial)
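# For example: tidy.serial('ODIN-109') returns "odin.109"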

odin <- odins[[1]]
# change the ODIN measurements from 1 min averages to 1 hour averages

# make sure that there is an ODIN entry for each minute
dates <- data.frame(date=seq(odin$date[1],odin$date[nrow(odin)],by='mins'))
odin <- merge(odin,dates,by="date",all=TRUE)
# after the merge every minute is present, so gaps show up as all-NA rows
if (any(is.na(odin$pm2.5)))
  print("Oh noes! Gaps in ODIN data!")

odin.zoo <- zoo( odin[,2:ncol(odin)] ) # drop the date column; rollapply works on numerics
odin.roll.apply <- rollapply(odin.zoo,width=60,by=1,FUN=mean,align="left",na.rm=TRUE)
# align="left" indexes each 60-minute window by its first observation
# Throw out the tail for which there is not a complete hour window
odin.avgs <- odin[1:nrow(odin.roll.apply),]
odin.avgs[,2:ncol(odin)] <- odin.roll.apply
# NB windows spanning gaps are averaged over fewer points (na.rm=TRUE)
odin.avgs$date <- odin.avgs$date + 60*60 # stamp averages at the end of the hour, following ECan convention
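# Toy check of the alignment (independent of the ODIN data):
# rollapply(zoo(1:5), width=3, FUN=mean, align="left")
# returns 2 3 4 at indices 1 2 3, i.e. mean(1:3), mean(2:4), mean(3:5)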

# Find best correlation between ECan and ODIN

offsets <- -60:60 # candidate clock offsets, in minutes
blank <- rep(NA,length(offsets))
corr.df <- data.frame(offset=offsets,temp=blank,pm2.5=blank,pm10=blank)
for (i in 1:nrow(corr.df)) {
  offset <- corr.df[i,'offset']
  odin.i <- odin.avgs
  odin.i$date <- odin.i$date + offset*60
  names(odin.i) <- c('date',paste0(names(odin.i),'.odin.109')[2:ncol(odin.i)])
  data.i <- merge(odin.i,ecan,by='date',all=FALSE)
  corr.df[i,'temp'] <- cor(data.i$temp.2m,data.i$temp.odin.109,
                           method="pearson",use="complete.obs")
  corr.df[i,'pm2.5'] <- cor(data.i$pm2.5,data.i$pm2.5.odin.109,
                           method="pearson",use="complete.obs")
  corr.df[i,'pm10'] <- cor(data.i$pm10,data.i$pm10.odin.109,
                           method="pearson",use="complete.obs")
}
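# As a coarser cross-check, base R's ccf() computes the same kind of lagged
# correlation directly; on the merged series each lag is one 10-minute step
# rather than one minute (a sketch, not run here):
# both <- merge(odin.avgs, ecan, by='date')
# ccf(both$temp, both$temp.2m, lag.max=12, na.action=na.pass)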

print(paste0("Best temperature correlation: ",
             corr.df[which.max(corr.df[,'temp']),'offset']))
## [1] "Best temperature correlation: 8"
print(paste0("Best PM2.5 correlation: ",
             corr.df[which.max(corr.df[,'pm2.5']),'offset']))
## [1] "Best PM2.5 correlation: 11"
print(paste0("Best PM10 correlation: ",
             corr.df[which.max(corr.df[,'pm10']),'offset']))
## [1] "Best PM10 correlation: 10"
corr.melt <- melt(corr.df,id='offset')
ggplot(corr.melt) + 
  geom_line(aes(x=offset,y=value,colour=variable)) +
  scale_colour_manual(values=c("red","green","blue")) + 
  ylab("Pearson Correlation") +
  xlab("ODIN-109 Clock Offset (m)") +
  ggtitle("Correlation Between ECan and ODIN Measurements")

# NB there's a bunch of ECan data missing from 2016-07-14 09:10:00 to 16:40:00
best.offset <- corr.df[which.max(corr.df[,'temp']),'offset'] 

odin.bo <- odin.avgs # bo = best offset
odin.bo$date <- odin.bo$date + best.offset*60
names(odin.bo) <- c('date',paste0(names(odin.bo),'.odin.109')[2:ncol(odin.bo)])
data <- merge(odin.bo,ecan,by='date',all=FALSE)

head(data)
##                  date pm10.odin.109 pm2.5.odin.109 rh.odin.109
## 1 2016-07-12 16:00:00      11.06150       2.744034    41.99333
## 2 2016-07-12 16:10:00      10.54561       2.820338    42.13167
## 3 2016-07-12 16:20:00      10.90140       3.329028    42.33000
## 4 2016-07-12 16:30:00      11.02592       3.456201    42.48833
## 5 2016-07-12 16:40:00      10.93698       3.888588    42.56333
## 6 2016-07-12 16:50:00      12.30675       5.109446    42.70000
##   temp.odin.109 wind.speed wind.dir wind.dir.std wind.speed.std wind.max
## 1      16.89500       0.95     75.4         22.9           0.49     2.35
## 2      16.70000       0.67    104.5         46.5           0.38     1.85
## 3      16.47667       1.22     78.3         22.5           0.44     2.35
## 4      16.25833       0.32     37.2         25.4           0.27     0.85
## 5      15.89167       0.56     33.6          1.9           0.22     0.95
## 6      15.30000       0.50     66.7         13.8           0.24     1.15
##     co temp.ground temp.2m temp.6m  pm10 pm2.5 pm.course
## 1 0.10        4.96   15.80   10.84 13.25  2.84     10.41
## 2 0.09        5.01   15.68   10.67 13.70  3.08     10.62
## 3 0.12        4.72   14.92   10.21 13.91  2.88     11.03
## 4 0.12        3.65   13.72   10.07 14.71  3.28     11.44
## 5 0.19        3.06   12.40    9.34 16.41  3.48     12.93
## 6 0.66        1.38    9.94    8.56 19.73  4.68     15.05
tail(data)
##                      date pm10.odin.109 pm2.5.odin.109 rh.odin.109
## 16505 2016-11-04 07:40:00     10.314354       2.693165    72.77667
## 16506 2016-11-04 07:50:00      9.780676       2.642296    71.85333
## 16507 2016-11-04 08:00:00      9.834044       2.184475    70.66500
## 16508 2016-11-04 08:10:00      9.638362       2.133606    69.45333
## 16509 2016-11-04 08:20:00      9.140263       1.930129    68.15000
## 16510 2016-11-04 08:30:00      8.944581       1.904695    66.47167
##       temp.odin.109 wind.speed wind.dir wind.dir.std wind.speed.std
## 16505      11.09500       1.53    297.2         17.0           0.41
## 16506      11.54667       1.26    313.1         19.4           0.54
## 16507      12.07833       1.62    308.0         19.0           0.45
## 16508      12.67500       1.79    292.2         17.0           0.46
## 16509      13.43167       1.55    294.9         10.8           0.40
## 16510      14.27667       1.65    277.8         22.4           0.65
##       wind.max co temp.ground temp.2m temp.6m pm10 pm2.5 pm.course
## 16505     2.75 NA        0.17    9.41    9.24 4.12  2.01      2.11
## 16506     3.05 NA        0.19    9.57    9.38 4.14  2.36      1.78
## 16507     2.95 NA        0.23    9.74    9.52 4.41  2.76      1.65
## 16508     3.15 NA        0.22    9.85    9.63 5.06  3.06      2.01
## 16509     2.65 NA        0.22   10.02    9.80 5.53  2.91      2.62
## 16510     3.15 NA        0.26   10.32   10.06 5.75  2.94      2.81
# add daily mean to plot

Model Stability

# use 24 hour periods starting and ending at noon
date.lt <- as.POSIXlt(data$date)
noons <- data[which(date.lt$hour==12 & date.lt$min==0),'date']

daily.model <- data.frame()

for (i in 2:length(noons)) {
  day.data <- data[data$date >= noons[i-1] & data$date <= noons[i],]
  day.pm2.5 <- day.data$pm2.5
  lower.bound <- quantile(day.pm2.5,na.rm=TRUE)[2] # 25th percentile
  upper.bound <- quantile(day.pm2.5,na.rm=TRUE)[4] # 75th percentile
  # keep only the interquartile range of PM2.5 for the day's fit
  day.data <- day.data[day.data$pm2.5<=upper.bound & day.data$pm2.5>=lower.bound,]
  day <- day.data$date[1]
  # day <- trunc(day.data$date[1],'day')
  # day <- as.POSIXct(day)
  day.coefs <- summary(lm(pm2.5~pm2.5.odin.109+rh.odin.109+temp.odin.109,
                          data=day.data))$coefficients
  daily.model[i,'day'] <- day
  daily.model[i,'mean'] <- mean(day.data$pm2.5)
  daily.model[i,'max'] <- max(day.data$pm2.5)
  # daily.model[i,c('mean','max')] <- c(mean(day.data$pm2.5), max(day.data$pm2.5))
  daily.model[i,'pm2.5'] <- day.coefs['pm2.5.odin.109',1]
  daily.model[i,'temp'] <- day.coefs['temp.odin.109',1]
  daily.model[i,'rh'] <- day.coefs['rh.odin.109',1]
  daily.model[i,'cnst'] <- day.coefs['(Intercept)',1]
  daily.model[i,'pm2.5.std'] <- day.coefs['pm2.5.odin.109',2]
  daily.model[i,'temp.std'] <- day.coefs['temp.odin.109',2]
  daily.model[i,'rh.std'] <- day.coefs['rh.odin.109',2]
  daily.model[i,'cnst.std'] <- day.coefs['(Intercept)',2]
}

# plot linear model coefficients over time
# (aes_string builds the mappings from variable names, avoiding eval/parse)
for (var in c('pm2.5','rh','temp','cnst')) {
  var.std <- paste0(var,'.std')
  p <- ggplot(daily.model, aes_string(x='day', y=var)) +
    geom_errorbar(aes_string(ymin=paste0(var,'-',var.std),
                             ymax=paste0(var,'+',var.std))) +
    geom_point() +
    ylab("Coefficient") +
    xlab("Day") +
    ggtitle(paste("Coefficients for", var, "over 24 Hour Periods"))
  print(p)
}
## Warning: Removed 9 rows containing missing values (geom_errorbar).
## Warning: Removed 9 rows containing missing values (geom_point).

## Warning: Removed 9 rows containing missing values (geom_errorbar).

## Warning: Removed 9 rows containing missing values (geom_point).

## Warning: Removed 9 rows containing missing values (geom_errorbar).

## Warning: Removed 9 rows containing missing values (geom_point).

## Warning: Removed 9 rows containing missing values (geom_errorbar).

## Warning: Removed 9 rows containing missing values (geom_point).

# plot daily mean and max PM2.5
for (var in c('mean','max')) {
  p <- ggplot(daily.model, aes_string(x='day', y=var)) +
    geom_point() +
    ylab("PM2.5") +
    xlab("Day") +
    ggtitle(paste("PM2.5", var, "over 24 Hour Periods"))
  print(p)
}
## Warning: Removed 20 rows containing missing values (geom_point).

## Warning: Removed 20 rows containing missing values (geom_point).

# dly.mdl.cnt <- daily.model
# dly.mdl.cnt[,'pm2.5'] <- dly.mdl.cnt[,'pm2.5'] - mean(dly.mdl.cnt[,'pm2.5'])
# dly.mdl.cnt[,'rh'] <- dly.mdl.cnt[,'rh'] - mean(dly.mdl.cnt[,'rh'])
# dly.mdl.cnt[,'temp'] <- dly.mdl.cnt[,'temp'] - mean(dly.mdl.cnt[,'temp'])
# 
# cor(daily.model$day,daily.model$pm2.5,method='pearson',use="complete.obs")
# cor(daily.model$mean,daily.model$pm2.5,method='pearson',use="complete.obs")
# cor(daily.model$max,daily.model$pm2.5,method='pearson',use="complete.obs")
# cor(daily.model$day,daily.model$mean,method='pearson',use="complete.obs")
# cor(daily.model$day,daily.model$max,method='pearson',use="complete.obs")
# build linear model of means
daily.model.model <- daily.model[,c('mean','pm2.5','day','max')]
daily.model.model[,c('mean','day','max')] <- scale(daily.model.model[,c('mean','day','max')])

summary(lm(pm2.5~day+mean,data=daily.model.model))$coefficients
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  0.3927134 0.02935974 13.375916 3.339767e-23
## day         -0.2342782 0.03189908 -7.344356 8.524852e-11
## mean         0.1353568 0.03167196  4.273711 4.725248e-05
summary(lm(pm2.5~day+max,data=daily.model.model))$coefficients
##               Estimate Std. Error   t value     Pr(>|t|)
## (Intercept)  0.3912651 0.02801613 13.965708 2.350292e-24
## day         -0.2036112 0.03202740 -6.357407 8.037320e-09
## max          0.1713337 0.03179936  5.387961 5.553032e-07

Variable Significance

# Are the variables normally distributed?
# PM concentrations are typically log-normal, so taking the log should make
# them roughly normal; plot both transforms and see which looks normal.
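# A quick way to eyeball that (a sketch; the +1 is just a guard against zeros):
# par(mfrow=c(1,2))
# hist(data$pm2.5, main="raw", xlab="pm2.5")
# hist(log(data$pm2.5 + 1), main="log", xlab="log(pm2.5 + 1)")
# par(mfrow=c(1,1))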

# normalize data
data.norm <- data
data.norm[,2:ncol(data)] <- scale(data[,2:ncol(data)])

# different combinations of input variables to try
# without co
form.odin <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109
just.pm2.5.rh <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109
just.pm2.5.temp <- pm2.5 ~ pm2.5.odin.109 + temp.odin.109
form.ecan <- pm2.5 ~ pm2.5.odin.109 + wind.speed + wind.dir + temp.2m
form.all <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109 + 
                            wind.speed + wind.dir + temp.2m
# with co
form.ecan.co <- pm2.5 ~ pm2.5.odin.109 + wind.speed + wind.dir + co + temp.2m
form.all.co <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109 + 
                            wind.speed + wind.dir + co + temp.2m

# is co a better predictor than pm2.5.odin.109?
just.co <- pm2.5 ~ co
just.pm2.5 <- pm2.5 ~ pm2.5.odin.109
just.co.pm2.5 <- pm2.5 ~ co + pm2.5.odin.109

forms <- c(form.odin,just.pm2.5.rh,just.pm2.5.temp,form.ecan,form.all,
           form.ecan.co,form.all.co,just.co,just.pm2.5,just.co.pm2.5)
names(forms) <- c("odin","just.pm2.5.rh","just.pm2.5.temp","ecan","all","ecan.co",
                           "all.co","just.co","just.pm2.5","just.co.pm2.5")
r.sqr <- data.frame(form=names(forms), r.sqr=rep(NA,length(forms)))

for (i in 1:length(forms)) {
  form <- forms[[i]]
  # linear
  this.lm <- summary(lm(form,data=data.norm))
  r.sqr[i,2] <- this.lm$r.squared
  
  coefs <- this.lm$coefficients[,1]
  coefs <- coefs[2:length(coefs)] # get rid of intercept
  var.import <- abs(coefs) # could also do square
  var.import.df <- data.frame(linear=var.import,tree=NA)
  
  # regression tree (fit on the unscaled data; the percentage shares below
  # are unaffected by the scaling)
  tree <- rpart(form,method="anova",data=data)$variable.importance
  var.import.df[names(tree),'tree'] <- tree

  # turn into percentages
  sums <- colSums(var.import.df,na.rm=TRUE)
  var.import.df[,1] <- round(var.import.df[,1] / sums[1] * 100, digits=1)
  var.import.df[,2] <- round(var.import.df[,2] / sums[2] * 100, digits=1)
  
  print(var.import.df)
}
##                linear tree
## pm2.5.odin.109   69.0 93.5
## rh.odin.109      15.9  4.2
## temp.odin.109    15.1  2.4
##                linear tree
## pm2.5.odin.109   90.7 94.6
## rh.odin.109       9.3  5.4
##                linear tree
## pm2.5.odin.109   93.2 97.6
## temp.odin.109     6.8  2.4
##                linear tree
## pm2.5.odin.109   87.7 96.6
## wind.speed        4.5  0.5
## wind.dir          2.0  0.7
## temp.2m           5.8  2.2
##                linear tree
## pm2.5.odin.109   65.4 90.0
## rh.odin.109      15.2  4.0
## temp.odin.109    10.6  2.3
## wind.speed        2.3  0.8
## wind.dir          2.5  0.8
## temp.2m           4.0  2.2
##                linear tree
## pm2.5.odin.109   64.7 77.9
## wind.speed        0.4  1.4
## wind.dir          2.0  0.8
## co               32.8 18.2
## temp.2m           0.0  1.8
##                linear tree
## pm2.5.odin.109   52.4 74.2
## rh.odin.109       9.3  3.6
## temp.odin.109    11.0  1.6
## wind.speed        0.1  1.2
## wind.dir          2.2  0.8
## co               21.2 17.1
## temp.2m           3.8  1.6
##    linear tree
## co    100  100
##                linear tree
## pm2.5.odin.109    100  100
##                linear tree
## co               33.7 18.9
## pm2.5.odin.109   66.3 81.1
r.sqr
##               form     r.sqr
## 1             odin 0.8592077
## 2    just.pm2.5.rh 0.8393297
## 3  just.pm2.5.temp 0.8342428
## 4             ecan 0.8363499
## 5              all 0.8611697
## 6          ecan.co 0.8781256
## 7           all.co 0.8865197
## 8          just.co 0.6947977
## 9       just.pm2.5 0.8308080
## 10   just.co.pm2.5 0.8777196

Comparing Models

# Models

lm.model <- function(train,test,form=y~x) {
  this.lm <- lm(form,train)
  predict(this.lm, test)
}
tree.model <- function(train,test,form=y~x) {
  fit <- rpart(form, method="anova", data=train)
  predict(fit, test, "vector")
}
ptree.model <- function(train,test,form=y~x) {
  fit <- rpart(form, method="anova", data=train)
  # prune at the complexity parameter with the lowest cross-validated error
  pfit <- prune(fit, cp=fit$cptable[which.min(fit$cptable[,"xerror"]),"CP"])
  predict(pfit, test, "vector") # predict from the pruned tree, not the full one
}
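# The cptable lookup above picks the CP with the smallest cross-validated
# error; printcp() shows the table being indexed (not run here):
# printcp(rpart(form.odin, method="anova", data=data))
# columns: CP, nsplit, rel error, xerror, xstd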
# kernel.n.model <- function(n,train,test,form=y~x) {
#   kern <- ksmooth(train$x,train$y,kernel='normal',x.points=seq(1,500,n))
#   kern$y[ as.integer(test$x/n) ]
# }
# kernel.2.model <- function(train,test) kernel.n.model(2,train,test)
# kernel.5.model <- function(train,test) kernel.n.model(5,train,test)
# kernel.10.model <- function(train,test) kernel.n.model(10,train,test)

models <- list(lm.model,tree.model,ptree.model)#,kernel.2.model,kernel.5.model,
               # kernel.10.model)
names(models) <- c("lm.model","tree.model","ptree.model")#,"kernel.2.model",
                   # "kernel.5.model","kernel.10.model")


# Combinations of input variables (formulae)

form.pm2.5 <- pm2.5 ~ pm2.5.odin.109
form.odin <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109
form.ecan <- pm2.5 ~ pm2.5.odin.109 + wind.speed + wind.dir + temp.2m + co
form.all <- pm2.5 ~ pm2.5.odin.109 + rh.odin.109 + temp.odin.109 + 
                            wind.speed + wind.dir + temp.2m + co
forms <- c(form.odin,form.ecan,form.all)
names(forms) <- c("odin","ecan","all")


# Divisions into training and testing (subsets)

len <- nrow(data)
third <- as.integer(len/3)
day <- 6*24 # data is in 10 minute intervals
week <- day*7
subsets <- list(1:len, 1:third, third:(2*third), (2*third):len, 1:week, 1)
names(subsets) <- c("Whole","1st Third","2nd Third","3rd Third","1st Week","Avg")
# "Avg" is a placeholder; the loop below skips it, so its columns stay zero


# Setting up the array which will store error terms

dim.names <- list(names(models),names(subsets),names(forms))
# dim.dim <- c(length(models),length(subsets),length(forms))
error.array <- array(0,dim=lapply(dim.names,length),
                       dimnames = dim.names)
na.array <- array(0,dim=lapply(dim.names,length),
                       dimnames = dim.names)


# Crunch the numbers

for (j in 1:(length(subsets)-1)) { # skip the "Avg" placeholder
  train.data <- data[subsets[[j]],]
  if (j==1)
    test.data <- data # trained on the whole record, so test in-sample
  else
    test.data <- data[-subsets[[j]],] # otherwise test on the held-out rows
  for (i in 1:length(models)) {
    for (k in 1:length(forms)) {
      predictions <- models[[i]](train.data,test.data,form=forms[[k]])
      error.array[i,j,k] <- mean( (predictions-test.data$pm2.5)^2, na.rm=TRUE ) # mean squared error
      na.array[i,j,k] <- length(which(is.na(predictions)))
    }
  }
}

error.array
## , , odin
## 
##                Whole 1st Third 2nd Third 3rd Third  1st Week Avg
## lm.model    28.22503  32.61005  33.50414  57.35282  83.94223   0
## tree.model  28.43287  32.23724  50.48160  72.01138 165.07290   0
## ptree.model 28.43287  32.23724  50.48160  72.01138 165.07290   0
## 
## , , ecan
## 
##                Whole 1st Third 2nd Third 3rd Third  1st Week Avg
## lm.model    24.58595  19.02540  29.43263  62.17012  53.87741   0
## tree.model  31.02946  23.09258  43.51766  72.07228 134.93300   0
## ptree.model 31.02946  23.09258  43.51766  72.07228 134.93300   0
## 
## , , all
## 
##                Whole 1st Third 2nd Third 3rd Third  1st Week Avg
## lm.model    22.89259  25.19581  27.80165  53.42867  50.69385   0
## tree.model  26.44629  27.59459  43.51766  72.07228 145.87999   0
## ptree.model 26.44629  27.59459  43.51766  72.07228 145.87999   0
na.array
## , , odin
## 
##             Whole 1st Third 2nd Third 3rd Third 1st Week Avg
## lm.model      174       132       108       174      108   0
## tree.model      0         0         0         0        0   0
## ptree.model     0         0         0         0        0   0
## 
## , , ecan
## 
##             Whole 1st Third 2nd Third 3rd Third 1st Week Avg
## lm.model      437       330       303       385      241   0
## tree.model      0         0         0         0        0   0
## ptree.model     0         0         0         0        0   0
## 
## , , all
## 
##             Whole 1st Third 2nd Third 3rd Third 1st Week Avg
## lm.model      437       330       303       385      241   0
## tree.model      0         0         0         0        0   0
## ptree.model     0         0         0         0        0   0

To do: try dropping the first and last quartiles to remove outliers; the errors should then move closer to the "2nd Third" values, since that is the most typical period. A rough sketch of the trimming step follows.
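A minimal sketch of that trimming, assuming the cut is made on the ECan pm2.5 column:

# Keep only the middle 50% of ECan PM2.5 readings, then re-run the model
# comparison above on data.trim (hypothetical name)
q <- quantile(data$pm2.5, na.rm=TRUE)
data.trim <- data[!is.na(data$pm2.5) & data$pm2.5 >= q[2] & data$pm2.5 <= q[4],]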

How long to build a decent model?

days <- 1:as.integer(nrow(data)/day)
lm.success <- data.frame(days=days,r.sqr=rep(NA,length(days)),err=rep(NA,length(days)))
for (d in days) {
  # train on the first d days (data is in 10-minute rows, so d days = d*day rows)
  train <- data[1:(d*day),]
  test <- data[-(1:(d*day)),]
  d.lm <- lm(form.odin,data=train)
  lm.success[d,'r.sqr'] <- summary(d.lm)$r.squared
  lm.success[d,'err'] <- mean((predict(d.lm,test)-test$pm2.5)^2,na.rm=TRUE)
}

ggplot(lm.success,aes(x=days,y=r.sqr)) + geom_point()

Misc

# plot(data$wind.speed,data$pm2.5)
# plot(data$co,data$pm2.5)
# # look for gaps in data
# to.ranges <- function(l) {
#   # convert "1 2 3 4" to "1 - 4" or something
#   l2 <- c()
#   for (i in 2:(length(l)-1)) {
#     if (l[i]==l[i+1]) {
#       if (l[i-1]!=0)
#     }
#   }
# }
# for (i in 2:ncol(data)) {
#   print(names(data)[i])
#   print(which(is.na(data[,i])))
# }
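The commented to.ranges() sketch above is incomplete; a minimal working version of the idea (collapse sorted indices into range strings) could look like this:

# Collapse a sorted vector of indices into range strings,
# e.g. c(1,2,3,4,10,11) -> "1-4" "10-11"
to.ranges <- function(l) {
  if (length(l) == 0) return(character(0))
  breaks <- which(diff(l) != 1)     # positions where a consecutive run ends
  starts <- l[c(1, breaks + 1)]
  ends   <- l[c(breaks, length(l))]
  ifelse(starts == ends, as.character(starts), paste0(starts, "-", ends))
}
# e.g. to.ranges(which(is.na(data$co))) gives the ranges of rows missing CO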