# Load libraries
library(readr)
library(openair)
library(ggplot2)
library(parallel)
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators

The point of this is to explore the 2021 HauHau data from Cromwell schools

# Load data
data_cromwell <- read_csv("~/data/CONA/2021/HauHau/all Cromwell school HH data 2020.csv", 
                         col_types = cols(time = col_character()))
# Fix timestamp to use openair UTC is forced for simplicity and used only as label
data_cromwell$date_orig <- data_cromwell$date
data_cromwell$date <- as.POSIXct(paste(data_cromwell$date_orig,
                                       data_cromwell$time),
                                 format = "%d-%B-%y %H:%M",
                                 tz= "UTC")
names(data_cromwell)
##  [1] "TimeStamp"   "date"        "time"        "Temperature" "Humidity"   
##  [6] "CO2"         "PM2.5"       "PM10"        "pressure"    "school"     
## [11] "room"        "unit"        "date_orig"
unique(data_cromwell$unit)
##  [1] "HH400" "HH399" "HH238" "HH388" "HH389" "HH396" "HH387" "HH225" "HH394"
## [10] "HH392" "HH221" "HH393"
# Subset one device HH400
data_play <- selectByDate(subset(data_cromwell,unit=="HH400"),
                          start = "2020-7-12",
                          end = "2020-12-12")
data_play$datenum <- as.numeric(data_play$date)
#What does it look like
timePlot(data_play,pollutant = "CO2")

Let’s see what a log transformation does

data_play$logCO2 <- log(data_play$CO2)
data_play$logdate <- 1000000 * (log(as.numeric(data_play$date)) - log(as.numeric(data_play$date[1])))
ggplot(data_play, aes(x=logdate, y = logCO2)) + 
  geom_line(colour = "red")

Let’s try to find some exponential decays in the first couple of days of data

# Walk the timeseries with a 30min window and try to fit an exponential decay.
nobs <- length(data_play$date)
#setup parallel backend to use many processors
cores <- detectCores()
cl <- makeCluster(cores - 2) #not to overload your computer
registerDoParallel(cl)
r2_array <- foreach(i=1:(nobs-30),
                    .errorhandling = 'remove',
                    .combine = rbind) %dopar%
  {
    out_tmp <- data.frame(id = i)
    out_tmp$adj.r2 <- NA
    #for (i in (1:(nobs-30))){
    window_data <- data_play[(i:(i+30)),]
    # Fit linear model CO2 ~ date
    curr_model <- lm(data = window_data,CO2 ~ logdate)
    # exponential_coeffs[[i]] <- curr_model
    #r2_array[i] <- summary(curr_model)$adj.r.squared
    #curr_model
    out_tmp$adj.r2 <- summary(curr_model)$adj.r.squared
    out_tmp$slope <- curr_model$coefficients[2]
    out_tmp
  }
stopCluster(cl)

What does this look plotted?

ggplot(data = r2_array, aes(x = data_play$date[id])) +
  geom_point(aes(y = adj.r2))

OK, now focusing on the “high r2” … what do the coefficients look like

# Selecting only adj.r2 > 0.9
high_r <- subset(r2_array,adj.r2 > 0.90)
# Plot the coefficients
ggplot(data = high_r, aes(x = data_play$date[id])) +
  geom_point(aes(y = slope))

And the timeseries of dC/dt

data_play$dC.dt <- NA
npoints <- length(data_play$date)
lwindow <- 1

data_play$dC.dt[(lwindow+1):npoints] <- (data_play$CO2[(lwindow+1):npoints] -
                                           data_play$CO2[1:(npoints - (lwindow))]) / (data_play$datenum[(lwindow+1):npoints] - data_play$datenum[1:(npoints - (lwindow))])

timePlot(data_play,pollutant = "dC.dt")

Now let’s start playing with water!

# From RH to absolute humidity
coeffA <- 6.116441
coeffm <- 7.591386
coeffTn <- 240.7263

data_play$P.ws <- coeffA * 10^(coeffm*data_play$Temperature
                               /(data_play$Temperature - coeffTn))
data_play$P.w <- data_play$Humidity * data_play$P.ws /100
data_play$MRw <- 621.9907 * data_play$P.w / (data_play$pressure - data_play$P.w)
timePlot(data_play, pollutant = c("CO2", "MRw", "Temperature", "Humidity"), y.relation = "free")

Playing with the mass balance equation we can write it like this: \[ F(t)=\frac{\frac{dC}{dt}}{C_{out}-C(t)}-\frac{S(t)}{V(C_{out}-C(t))} \] So, let’s plot the right hand side of that equation

Volume <- 180 #Volume of the classroom [m3]
ppm2Kgm3 <- 0.0018
S_rate <- (0.005*3.6) * 1.98 * 30 #[Kg/hr]
min_CO2 <- min(data_play$CO2, na.rm = TRUE)-100
# CO2 generation in (l/s * to m3/hr) * density [Kg/m3] * Npeople
data_play$F_rate <- (data_play$dC.dt*ppm2Kgm3)/(min_CO2*ppm2Kgm3 - data_play$CO2*ppm2Kgm3) - S_rate/(Volume*(min_CO2*ppm2Kgm3 - data_play$CO2*ppm2Kgm3))
timePlot(data_play, pollutant = c("CO2", "F_rate", "dC.dt"), y.relation = "free")

Trying to turn this into an optimisation problem: \[ \frac{\frac{dC(t)}{dt}}{C_{out}-C(t)}-\frac{S^*(t)}{V(C_{out}-C(t))}-F^*(t)=0 \] Given that we know everything except \(S^*(t)\) and \(F^*(t)\) we can work with chunks of the data and solve for the source and exchange rate terms, as long as they are constant in that period.

Now the problem turns to identify periods where S and F are constant. S will be constant if the number of people in the room doesn’t change and their activity level is also constant. F will be constant if the ventilation condition of the room is not changed by opening doors or windows.

Another approach could be to select periods where \(\frac{dC}{dt}=0\) in which case ventilation and emission are linearly anti correlated.

Let’s explore this a little further selecting periods where \(\frac{dC}{dt}\) is very small and then the mass balance equation can be written as: \[ \frac{S^*}{F^*}=-V(C_{out}-C(t)) \] Plotting \(\frac{S^*}{F^*}\) results in:

data_play$S.F <- -1 * Volume * (min_CO2*ppm2Kgm3 - data_play$CO2*ppm2Kgm3)
dC_notzero <- abs(data_play$dC.dt) > 0.2
data_play$S.F[dC_notzero] <- NA
timePlot(data_play, pollutant = c("CO2", "F_rate", "dC.dt","S.F"), y.relation = "free")

And only looking at one day:

timePlot(selectByDate(data_play,
                      start = "2020-10-12",
                      end = "2020-10-12"),
         pollutant = c("CO2", "F_rate", "dC.dt","S.F"), y.relation = "free")