library(RPostgreSQL)
## Loading required package: DBI
# Database login details (not in GitHub repo)
load("access.Rda")
# Connect to database
p <- dbDriver("PostgreSQL")
con <- dbConnect(p,
user=access$user,
password=access$pwd,
host='penap-data.dyndns.org',
dbname='cona',
port=5432)
locs <- dbGetQuery(con,"SELECT fs.id,
ST_X(ST_TRANSFORM(fs.geom::geometry,4326)) as lon,
ST_Y(ST_TRANSFORM(fs.geom::geometry,4326)) as lat
FROM admin.fixedsites as fs
ORDER BY fs.id;")
library(ggplot2)
load("with_ecan.RData")
ids <- c(102, 109, 108)
sites <- c(10, 18, 16)
data <- data.frame(date = with.ecan$date)
for (i in 1:3) {
id <- ids[i]
site <- sites[i]
train.data <- data.frame(x = with.ecan[, paste0('pm2.5.odin.', id, '.site.18')],
y = with.ecan[, 'pm2.5'])
linear.model <- lm(y ~ x, train.data, na.rm=TRUE)
deployed.data <- data.frame(date = with.ecan$date,
x = with.ecan[, paste0('pm2.5.odin.', id, '.site.', site)])
results <- predict(linear.model, deployed.data)
deployed.data[, paste0('pm2.5.odin.', id)] <- predict(linear.model, deployed.data)
deployed.data$x <- NULL
data <- merge(data, deployed.data, by='date')
}
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
# Remove rows with NAs
data <- na.omit(data)
nrow(data)
## [1] 93291
# Comparing 102, 109, 108
# At sites 10, 18, 16, resp.
lat <- c(172.5907, 172.5953, 172.6046) # in spatial order 10, 18, 16, resp.
long <- c(-43.31509, -43.30752, -43.29752)
dist <- function(x1, y1, x2, y2)
sqrt((x1-x2)^2 + (y1-y2)^2)
d1 <- dist(lat[1], long[1], lat[2], long[2])
d2 <- dist(lat[2], long[2], lat[3], long[3])
odin.x <- c(0, d1, d1+d2)
linear.interpolate <- function(x.1, y.1, x.2, y.2, x) {
y.1 + (x - x.1) * (y.2 - y.1) / (x.2 - x.1)
}
y.1 <- data[, paste0('pm2.5.odin.', ids[1])]
y.2 <- data[, paste0('pm2.5.odin.',ids[3])]
y <- data[, paste0('pm2.5.odin.', ids[2])]
y.hat <- linear.interpolate(odin.x[1], y.1, odin.x[3], y.2, odin.x[2])
# Absolute errors
sqr.err <- (y - y.hat)^2
mean(sqr.err)
## [1] 76.87962
# Relative errors
mean(sqr.err / y^2)
## [1] 1.177295
print(odin.x[2] / odin.x[3])
## [1] 0.3934429
divs <- 100
offset.df <- data.frame(percentage = NA, dist = NA, rel.err =- NA)
for (i in 1:divs) {
odin.x[2] <- odin.x[3] * i / divs
y.1 <- data[, paste0('pm2.5.odin.', ids[1])]
y.2 <- data[, paste0('pm2.5.odin.',ids[3])]
y <- data[, paste0('pm2.5.odin.', ids[2])]
y.hat <- linear.interpolate(odin.x[1], y.1, odin.x[3], y.2, odin.x[2])
# Absolute errors
sqr.err <- (y - y.hat)^2
mean(sqr.err)
# Relative errors
offset.df[i, 1:3] <- c(i / divs * 100, odin.x[2], mean(sqr.err / y^2))
}
ggplot(offset.df) +
geom_point(aes(percentage, rel.err))

offset.df$percentage[ which.min(offset.df$rel.err) ]
## [1] 79
trial.1 <- data.frame(ids = c(102, 109, 108),
sites = c(10, 18, 16))
trial.2 <- data.frame(ids = c(107, 115, 102),
sites = c(15, 2, 10))
trial.3 <- data.frame(ids = c(105, 113, 114),
sites = c(1, 9, 8))
trials <- list(trial.1, trial.2, trial.3)
for (trial in trials) {
for (i in 1:3) {
trial[i, 'lat'] <- locs[which(locs$id==trial[i, 'sites']), 'lat']
trial[i, 'lon'] <- locs[which(locs$id==trial[i, 'sites']), 'lon']
}
ids <- trial$ids
data <- data.frame(date = with.ecan$date)
for (i in 1:3) {
id <- trial$ids[i]
site <- trial$sites[i]
train.data <- data.frame(x = with.ecan[, paste0('pm2.5.odin.', id, '.site.18')],
y = with.ecan[, 'pm2.5'])
linear.model <- lm(y ~ x, train.data, na.rm=TRUE)
deployed.data <- data.frame(date = with.ecan$date,
x = with.ecan[, paste0('pm2.5.odin.', id, '.site.', site)])
results <- predict(linear.model, deployed.data)
deployed.data[, paste0('pm2.5.odin.', id)] <- predict(linear.model, deployed.data)
deployed.data$x <- NULL
data <- merge(data, deployed.data, by='date')
}
# Remove rows with NAs
data <- na.omit(data)
print(trial$ids)
print('nrow')
print(nrow(data))
d1 <- dist(trial[1, 'lat'], trial[1, 'lon'], trial[2, 'lat'], trial[2, 'lon'])
d2 <- dist(trial[2, 'lat'], trial[2, 'lon'], trial[3, 'lat'], trial[3, 'lon'])
odin.x <- c(0, d1, d1+d2)
y.1 <- data[, paste0('pm2.5.odin.', ids[1])]
y.2 <- data[, paste0('pm2.5.odin.',ids[3])]
y <- data[, paste0('pm2.5.odin.', ids[2])]
y.hat <- linear.interpolate(odin.x[1], y.1, odin.x[3], y.2, odin.x[2])
# Absolute errors
sqr.err <- (y - y.hat)^2
mean(sqr.err)
# Relative errors
print('mean abs rel err')
print(mean(abs((y-y.hat)/y)))
print('percentage length')
print(odin.x[2] / odin.x[3])
divs <- 100
offset.df <- data.frame(percentage = NA, dist = NA, rel.err = NA)
offset.df.2 <- data.frame(percentage = NA, rel.err = NA)
for (i in 1:divs) {
odin.x[2] <- odin.x[3] * i / divs
y.1 <- data[, paste0('pm2.5.odin.', ids[1])]
y.2 <- data[, paste0('pm2.5.odin.',ids[3])]
y <- data[, paste0('pm2.5.odin.', ids[2])]
y.hat <- linear.interpolate(odin.x[1], y.1, odin.x[3], y.2, odin.x[2])
# Absolute errors
abs.rel.err <- abs( (y - y.hat)/y )
# Relative errors
offset.df[i, 1:3] <- c(i / divs * 100, odin.x[2], mean(abs.rel.err))
to.append <- data.frame(rel.err = abs.rel.err)
to.append[, 'percentage'] <- rep(i/divs*100, nrow(to.append))
offset.df.2 <- rbind(offset.df.2, to.append)
}
plt <- ggplot(offset.df) +
geom_point(aes(percentage, rel.err)) +
ggtitle(paste(trial$ids[1], trial$ids[2], trial$ids[3]))
print(plt)
#
# plt <- ggplot(offset.df.2) +
# geom_boxplot(aes(percentage, rel.err)) +
# ggtitle(paste(trial$ids[1], trial$ids[2], trial$ids[3]))
#
# print(plt)
print('best length percentage')
print(offset.df$percentage[ which.min(offset.df$rel.err) ])
print('############')
# DIVIDE IT UP INTO THIRDS
third <- as.integer(nrow(data)/3)
for (l in 1:3) {
ddata <- data[((l-1)*third):(l*third), ]
offset.df <- data.frame(percentage = NA, dist = NA, rel.err = NA)
for (i in 1:divs) {
odin.x[2] <- odin.x[3] * i / divs
y.1 <- ddata[, paste0('pm2.5.odin.', ids[1])]
y.2 <- ddata[, paste0('pm2.5.odin.',ids[3])]
y <- ddata[, paste0('pm2.5.odin.', ids[2])]
y.hat <- linear.interpolate(odin.x[1], y.1, odin.x[3], y.2, odin.x[2])
# Absolute errors
abs.rel.err <- abs( (y - y.hat)/y )
# Relative errors
offset.df[i, 1:3] <- c(i / divs * 100, odin.x[2], mean(abs.rel.err))
}
plt <- ggplot(offset.df) +
geom_point(aes(percentage, rel.err)) +
ggtitle(paste(trial$ids[1], trial$ids[2], trial$ids[3], "third", l))
print(plt)
}
}
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
## [1] 102 109 108
## [1] "nrow"
## [1] 93291
## [1] "mean abs rel err"
## [1] 0.6212454
## [1] "percentage length"
## [1] 0.39391

## [1] "best length percentage"
## [1] 74
## [1] "############"


## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded

## [1] 107 115 102
## [1] "nrow"
## [1] 68971
## [1] "mean abs rel err"
## [1] 0.568692
## [1] "percentage length"
## [1] 0.3595254

## [1] "best length percentage"
## [1] 5
## [1] "############"


## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'na.rm' will be disregarded

## [1] 105 113 114
## [1] "nrow"
## [1] 82966
## [1] "mean abs rel err"
## [1] 0.7101958
## [1] "percentage length"
## [1] 0.5324599

## [1] "best length percentage"
## [1] 93
## [1] "############"


