library(RPostgreSQL)
## Loading required package: DBI
library(reshape2)
library(rpart)
# Set working directory
setwd("c:\\Users\\Hammo\\Documents\\smoke emissions project\\repo")
# Database login details (not in GitHub repo)
load("access.Rda")
# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
user=access$user,
password=access$pwd,
host='penap-data.dyndns.org',
dbname='cona',
port=5432)
# Load all data from ODIN-109
siteid = 18 # ecan site
odin.fresh <- dbGetQuery(con," SELECT d.recordtime AT TIME ZONE 'NZT' AS date,
s.name as label, d.value::numeric as val
FROM data.fixed_data as d, admin.sensor as s, admin.instrument as i
WHERE s.id = d.sensorid AND
s.instrumentid = i.id AND
i.name = 'ODIN-SD-3' AND
d.siteid = 18 AND
i.serialn = 'ODIN-109' AND
(d.recordtime BETWEEN '2017-06-01 00:00' AND
'2017-07-01 00:00') AND
(s.name = 'PM1' OR
s.name = 'PM2.5' OR
s.name = 'PM10' OR
s.name = 'xPM1' OR
s.name = 'XPM2.5' OR
s.name = 'xPM10' OR
s.name = 'Data7' OR
s.name = 'Data8' OR
s.name = 'Data9' OR
s.name = 'Temperature' OR
s.name = 'RH')
ORDER BY date;")
odin <- dcast(odin.fresh, date~label, value.var='val')
# make the names easier to type
odin$temp <- odin$Temperature
odin$pm10 <- odin$PM10
odin$pm2.5 <- odin$PM2.5
odin$Temperature <- NULL
odin$PM10 <- NULL
odin$PM2.5 <- NULL
odin$date <- trunc(odin$date,'min') # get rid of the seconds
odin$date <- as.POSIXct(odin$date) # the above converted it to POSIXlt
odin[1:5,]
## date Data7 Data8 Data9 PM1 RH xPM1 xPM10 XPM2.5 temp
## 1 2017-06-07 23:56:00 0 0 28928 12 34.9 12 30 12 7.3
## 2 2017-06-07 23:57:00 0 0 28928 6 34.7 6 28 8 7.2
## 3 2017-06-07 23:58:00 0 0 28928 0 34.7 0 18 0 7.2
## 4 2017-06-07 23:59:00 0 0 28928 2 34.5 2 22 6 7.1
## 5 2017-06-08 00:00:00 0 0 28928 8 34.6 8 8 8 7.2
## pm10 pm2.5
## 1 33.51744 14.477783
## 2 31.51744 10.477783
## 3 21.51744 2.477783
## 4 25.51744 8.477783
## 5 11.51744 10.477783
Let’s make sure that the PM10 and PM2.5 measurements are doing something sensible.
plot(odin$pm2.5,odin$pm10,xlab="PM2.5",ylab="PM10",main="ODIN PM Measurements")
lm.odin <- lm(odin$pm10~odin$pm2.5)
abline(lm.odin,col="red")
The relationship appears linear with some error term. When we plot the error term we get
error <- odin$pm10 - predict(lm.odin,odin)
hist(error,
xlab="Difference", main="Error around Linear Model",
breaks=100, xlim=c(-100,100))
The error term is not Gaussian as expected, but bimodal. I don’t know what to make of this.
odin.cors <- matrix(nrow=ncol(odin),ncol=ncol(odin),
dimnames=list(names(odin),names(odin)))
for (i in 2:ncol(odin)) {
for (j in 2:ncol(odin)) {
odin.cors[i,j] <- cor(odin[,i],odin[,j])
}
}
odin.cors
## date Data7 Data8 Data9 PM1 RH
## date NA NA NA NA NA NA
## Data7 NA 1.000000000 0.966914335 NA 0.02247100 0.004870463
## Data8 NA 0.966914335 1.000000000 NA 0.02191596 0.004211632
## Data9 NA NA NA NA NA NA
## PM1 NA 0.022471005 0.021915964 NA 1.00000000 0.168711936
## RH NA 0.004870463 0.004211632 NA 0.16871194 1.000000000
## xPM1 NA 0.025344927 0.024439221 NA 0.98566391 0.201318630
## xPM10 NA 0.021432285 0.022081354 NA 0.92001874 0.206589932
## XPM2.5 NA 0.026255952 0.025614613 NA 0.97634315 0.204241268
## temp NA -0.010756529 -0.011843102 NA -0.28974196 -0.749623479
## pm10 NA 0.019074158 0.019648274 NA 0.93485095 0.172546118
## pm2.5 NA 0.023427507 0.023048723 NA 0.99021890 0.171486657
## xPM1 xPM10 XPM2.5 temp pm10
## date NA NA NA NA NA
## Data7 0.02534493 0.02143228 0.02625595 -0.01075653 0.01907416
## Data8 0.02443922 0.02208135 0.02561461 -0.01184310 0.01964827
## Data9 NA NA NA NA NA
## PM1 0.98566391 0.92001874 0.97634315 -0.28974196 0.93485095
## RH 0.20131863 0.20658993 0.20424127 -0.74962348 0.17254612
## xPM1 1.00000000 0.92201323 0.98912030 -0.33533346 0.91454426
## xPM10 0.92201323 1.00000000 0.93324938 -0.33936011 0.98594340
## XPM2.5 0.98912030 0.93324938 1.00000000 -0.33738703 0.92480567
## temp -0.33533346 -0.33936011 -0.33738703 1.00000000 -0.29238207
## pm10 0.91454426 0.98594340 0.92480567 -0.29238207 1.00000000
## pm2.5 0.97492598 0.92918403 0.98505044 -0.29130772 0.94432139
## pm2.5
## date NA
## Data7 0.02342751
## Data8 0.02304872
## Data9 NA
## PM1 0.99021890
## RH 0.17148666
## xPM1 0.97492598
## xPM10 0.92918403
## XPM2.5 0.98505044
## temp -0.29130772
## pm10 0.94432139
## pm2.5 1.00000000
PM10 is strongly correlated with all the other pm measurements. As these measurements are more of less measuring the same thing, we will exclude these from further investigation.
test.vars <- c('Data7', 'Data8', 'Data9', 'RH', 'temp')
for (test.var in test.vars) {
hist(odin[,test.var],
xlab="Units", main=test.var,
breaks=100)
}
Temp and RH are both bimodal.
What are Data7, Data8 and Data9 even doing?
for (data789 in c('Data7', 'Data8', 'Data9')) {
print(data789)
print(eval(parse(text=paste("summary(odin$",data789,")"))))
print("has this many non-zero elements:")
print(length(which(data789 != 0)))
}
## [1] "Data7"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.5155 0.0000 2048.0000
## [1] "has this many non-zero elements:"
## [1] 1
## [1] "Data8"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00000 0.00000 0.00000 0.01508 0.00000 64.00000
## [1] "has this many non-zero elements:"
## [1] 1
## [1] "Data9"
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 28928 28928 28928 28928 28928 28928
## [1] "has this many non-zero elements:"
## [1] 1
Nothing useful by the looks of it.
So it looks like just RH and temp are useful.
# First just with pm2.5
tree <- rpart(pm10~pm2.5,method="class",data=odin)
mean( (predict(tree,odin)-odin$pm10)^2 )
## [1] 7023.416
# Now with pm2.5 plus temp and RH
tree <- rpart(pm10~pm2.5+RH+temp,method="class",data=odin)
tree$variable.importance
## pm2.5 RH temp
## 1452.16231 132.81630 68.85046
mean( (predict(tree,odin)-odin$pm10)^2 )
## [1] 7023.416
No improvement using RH and temp.
# First just with pm2.5
linear <- lm(pm10~pm2.5,method="class",data=odin)
mean( (predict(linear,odin)-odin$pm10)^2 )
## [1] 501.7384
# Now with pm2.5 plus temp and RH
linear <- lm(pm10~pm2.5+RH+temp,method="class",data=odin)
linear$coefficients
## (Intercept) pm2.5 RH temp
## 13.86814466 1.52080779 -0.03721341 -0.22461145
mean( (predict(linear,odin)-odin$pm10)^2 )
## [1] 500.151
Again, virtually no improvement. RH and temp probably aren’t going into the pm10 measurement then.
Let’s look at how PM10 and PM2.5 are related at different times of day.
par(mfrow=c(1,2))
h.lms <- data.frame(const=NA,grad=NA)
odin$date.lt <- as.POSIXlt(odin$date)
for (h in seq(0,22,2)) {
h.inds <- which(odin$date.lt$hour > h & odin$date.lt$hour < h+2)
h.pm10 <- odin$pm10[h.inds]
h.pm2.5 <- odin$pm2.5[h.inds]
plot(h.pm2.5,h.pm10,xlab="PM2.5",ylab="PM10",main=paste("ODIN PM Measurements between",h,'and',h+2))
h.lm <- lm(h.pm10~h.pm2.5)
abline(h.lm,col="red")
h.lms[h/2,] <- h.lm$coefficients
error <- h.pm10 - predict(h.lm,data.frame(h.pm2.5=h.pm2.5))
hist(error,
xlab="Difference", main=paste("Error around Linear Model",h,'and',h+2),
breaks=100, xlim=c(-100,100))
}
odin$date.lt <- NULL
h.lms
## const grad
## 1 13.981492 1.291617
## 2 12.966912 1.254484
## 3 10.210401 1.585852
## 4 11.903660 1.345083
## 5 13.796009 1.079531
## 6 12.875611 1.058815
## 7 11.739360 1.375669
## 8 7.959759 1.538148
## 9 13.545376 1.535757
## 10 9.173851 1.557684
## 11 13.723334 1.496565