Load up some tasty datas

library(RPostgreSQL)
## Loading required package: DBI
library(reshape2)
library(rpart)

# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\repo")
# Database login details (not in GitHub repo)
load("access.Rda")
# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
                 user=access$user,
                 password=access$pwd,
                 host='penap-data.dyndns.org',
                 dbname='cona',
                 port=5432)

# Load all data from ODIN-109
siteid = 18 # ecan site
odin.fresh <- dbGetQuery(con," SELECT d.recordtime AT TIME ZONE 'NZT' AS date,
                         s.name as label, d.value::numeric as val
                         FROM data.fixed_data as d,  admin.sensor as s, admin.instrument as i
                         WHERE s.id = d.sensorid AND
                         s.instrumentid = i.id AND
                         i.name = 'ODIN-SD-3' AND
                         d.siteid = 18 AND
                         i.serialn = 'ODIN-109' AND
                         (d.recordtime BETWEEN '2017-06-01 00:00' AND
                         '2017-07-01 00:00') AND
                         (s.name = 'PM1' OR
                          s.name = 'PM2.5' OR
                          s.name = 'PM10' OR
                          s.name = 'xPM1' OR
                          s.name = 'XPM2.5' OR
                          s.name = 'xPM10' OR
                          s.name = 'Data7' OR
                          s.name = 'Data8' OR
                          s.name = 'Data9' OR
                          s.name = 'Temperature' OR
                          s.name = 'RH')
                         ORDER BY date;")
odin <- dcast(odin.fresh, date~label, value.var='val')
# make the names easier to type
odin$temp <- odin$Temperature
odin$pm10 <- odin$PM10
odin$pm2.5 <- odin$PM2.5
odin$Temperature <- NULL
odin$PM10 <- NULL
odin$PM2.5 <- NULL
odin$date <- trunc(odin$date,'min') # get rid of the seconds
odin$date <- as.POSIXct(odin$date) # the above converted it to POSIXlt

odin[1:5,]
##                  date Data7 Data8 Data9 PM1   RH xPM1 xPM10 XPM2.5 temp
## 1 2017-06-07 23:56:00     0     0 28928  12 34.9   12    30     12  7.3
## 2 2017-06-07 23:57:00     0     0 28928   6 34.7    6    28      8  7.2
## 3 2017-06-07 23:58:00     0     0 28928   0 34.7    0    18      0  7.2
## 4 2017-06-07 23:59:00     0     0 28928   2 34.5    2    22      6  7.1
## 5 2017-06-08 00:00:00     0     0 28928   8 34.6    8     8      8  7.2
##       pm10     pm2.5
## 1 33.51744 14.477783
## 2 31.51744 10.477783
## 3 21.51744  2.477783
## 4 25.51744  8.477783
## 5 11.51744 10.477783

Let’s make sure that the PM10 and PM2.5 measurements are doing something sensible.

plot(odin$pm2.5,odin$pm10,xlab="PM2.5",ylab="PM10",main="ODIN PM Measurements")
lm.odin <- lm(odin$pm10~odin$pm2.5)
abline(lm.odin,col="red")

The relationship appears linear with some error term. When we plot the error term we get

error <- odin$pm10 - predict(lm.odin,odin)
hist(error,
     xlab="Difference", main="Error around Linear Model",
     breaks=100, xlim=c(-100,100))

The error term is not Gaussian as expected, but bimodal. I don’t know what to make of this.

Which measurements correlate?

odin.cors <- matrix(nrow=ncol(odin),ncol=ncol(odin),
                    dimnames=list(names(odin),names(odin)))
for (i in 2:ncol(odin)) {
  for (j in 2:ncol(odin)) {
    odin.cors[i,j] <- cor(odin[,i],odin[,j])
  }
}
odin.cors
##        date        Data7        Data8 Data9         PM1           RH
## date     NA           NA           NA    NA          NA           NA
## Data7    NA  1.000000000  0.966914335    NA  0.02247100  0.004870463
## Data8    NA  0.966914335  1.000000000    NA  0.02191596  0.004211632
## Data9    NA           NA           NA    NA          NA           NA
## PM1      NA  0.022471005  0.021915964    NA  1.00000000  0.168711936
## RH       NA  0.004870463  0.004211632    NA  0.16871194  1.000000000
## xPM1     NA  0.025344927  0.024439221    NA  0.98566391  0.201318630
## xPM10    NA  0.021432285  0.022081354    NA  0.92001874  0.206589932
## XPM2.5   NA  0.026255952  0.025614613    NA  0.97634315  0.204241268
## temp     NA -0.010756529 -0.011843102    NA -0.28974196 -0.749623479
## pm10     NA  0.019074158  0.019648274    NA  0.93485095  0.172546118
## pm2.5    NA  0.023427507  0.023048723    NA  0.99021890  0.171486657
##               xPM1       xPM10      XPM2.5        temp        pm10
## date            NA          NA          NA          NA          NA
## Data7   0.02534493  0.02143228  0.02625595 -0.01075653  0.01907416
## Data8   0.02443922  0.02208135  0.02561461 -0.01184310  0.01964827
## Data9           NA          NA          NA          NA          NA
## PM1     0.98566391  0.92001874  0.97634315 -0.28974196  0.93485095
## RH      0.20131863  0.20658993  0.20424127 -0.74962348  0.17254612
## xPM1    1.00000000  0.92201323  0.98912030 -0.33533346  0.91454426
## xPM10   0.92201323  1.00000000  0.93324938 -0.33936011  0.98594340
## XPM2.5  0.98912030  0.93324938  1.00000000 -0.33738703  0.92480567
## temp   -0.33533346 -0.33936011 -0.33738703  1.00000000 -0.29238207
## pm10    0.91454426  0.98594340  0.92480567 -0.29238207  1.00000000
## pm2.5   0.97492598  0.92918403  0.98505044 -0.29130772  0.94432139
##              pm2.5
## date            NA
## Data7   0.02342751
## Data8   0.02304872
## Data9           NA
## PM1     0.99021890
## RH      0.17148666
## xPM1    0.97492598
## xPM10   0.92918403
## XPM2.5  0.98505044
## temp   -0.29130772
## pm10    0.94432139
## pm2.5   1.00000000

PM10 is strongly correlated with all the other pm measurements. As these measurements are more of less measuring the same thing, we will exclude these from further investigation.

Are any other measurements bimodal?

test.vars <- c('Data7', 'Data8', 'Data9', 'RH', 'temp')
for (test.var in test.vars) {
  hist(odin[,test.var],
     xlab="Units", main=test.var,
     breaks=100)
}

Temp and RH are both bimodal.

What are Data7, Data8 and Data9 even doing?

for (data789 in c('Data7', 'Data8', 'Data9')) {
  print(data789)
  print(eval(parse(text=paste("summary(odin$",data789,")"))))
  print("has this many non-zero elements:")
  print(length(which(data789 != 0)))
}
## [1] "Data7"
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##    0.0000    0.0000    0.0000    0.5155    0.0000 2048.0000 
## [1] "has this many non-zero elements:"
## [1] 1
## [1] "Data8"
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##  0.00000  0.00000  0.00000  0.01508  0.00000 64.00000 
## [1] "has this many non-zero elements:"
## [1] 1
## [1] "Data9"
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   28928   28928   28928   28928   28928   28928 
## [1] "has this many non-zero elements:"
## [1] 1

Nothing useful by the looks of it.

So it looks like just RH and temp are useful.

Regression Tree

# First just with pm2.5
tree <- rpart(pm10~pm2.5,method="class",data=odin)
mean( (predict(tree,odin)-odin$pm10)^2 )
## [1] 7023.416
# Now with pm2.5 plus temp and RH
tree <- rpart(pm10~pm2.5+RH+temp,method="class",data=odin)
tree$variable.importance
##      pm2.5         RH       temp 
## 1452.16231  132.81630   68.85046
mean( (predict(tree,odin)-odin$pm10)^2 )
## [1] 7023.416

No improvement using RH and temp.

Linear Model

# First just with pm2.5
linear <- lm(pm10~pm2.5,method="class",data=odin)
mean( (predict(linear,odin)-odin$pm10)^2 )
## [1] 501.7384
# Now with pm2.5 plus temp and RH
linear <- lm(pm10~pm2.5+RH+temp,method="class",data=odin)
linear$coefficients
## (Intercept)       pm2.5          RH        temp 
## 13.86814466  1.52080779 -0.03721341 -0.22461145
mean( (predict(linear,odin)-odin$pm10)^2 )
## [1] 500.151

Again, virtually no improvement. RH and temp probably aren’t going into the pm10 measurement then.

Different Times of Day

Let’s look at how PM10 and PM2.5 are related at different times of day.

par(mfrow=c(1,2))
h.lms <- data.frame(const=NA,grad=NA)
odin$date.lt <- as.POSIXlt(odin$date)
for (h in seq(0,22,2)) {
  h.inds <- which(odin$date.lt$hour > h & odin$date.lt$hour < h+2)
  h.pm10 <- odin$pm10[h.inds]
  h.pm2.5 <- odin$pm2.5[h.inds]
  plot(h.pm2.5,h.pm10,xlab="PM2.5",ylab="PM10",main=paste("ODIN PM Measurements between",h,'and',h+2))
  h.lm <- lm(h.pm10~h.pm2.5)
  abline(h.lm,col="red")
  h.lms[h/2,] <- h.lm$coefficients
  error <- h.pm10 - predict(h.lm,data.frame(h.pm2.5=h.pm2.5))
  hist(error,
     xlab="Difference", main=paste("Error around Linear Model",h,'and',h+2),
     breaks=100, xlim=c(-100,100))
  
}

odin$date.lt <- NULL
h.lms
##        const     grad
## 1  13.981492 1.291617
## 2  12.966912 1.254484
## 3  10.210401 1.585852
## 4  11.903660 1.345083
## 5  13.796009 1.079531
## 6  12.875611 1.058815
## 7  11.739360 1.375669
## 8   7.959759 1.538148
## 9  13.545376 1.535757
## 10  9.173851 1.557684
## 11 13.723334 1.496565