Exploring HauHau data

# Load libraries
library(curl)

## Using libcurl 7.76.1 with OpenSSL/1.1.1l-fips

library(RJSONIO)
library(readr)

## 
## Attaching package: 'readr'

## The following object is masked from 'package:curl':
## 
##     parse_date

library(openair)
library(ggplot2)
library(parallel)
library(doParallel)

## Loading required package: foreach

## Loading required package: iterators

library(pspline)

The point of this is to explore the 2021 HauHau Ventilation experiment

# Get list of devices from Google Drive
aa <- curl_fetch_memory("https://docs.google.com/spreadsheets/d/e/2PACX-1vSNExRg6Q44L6m0Zet55Bh8lFVWxmCLtMF_csLP-zr4hY0l-fLth9cqN8EasIwA8L5De_1S8kInFwuW/pub?output=csv")
hauhau_devices <- read.csv(textConnection(rawToChar(aa$content)))

names(hauhau_devices) <- c("name", "devID", "type", "location", "user", "guestflag")

devices <- hauhau_devices$devID
# Let's focus on one device
device <- subset(hauhau_devices, name == "HAUHAU-406")

# Fetch data
# HauHau API stuff

hhbase_url <- "http://stats.iaqsensors.com/hauhau/api/requestDeviceData?"
hhapiKey <- "0652896a-cf03-4a4f-8487-ae3ef8634bb8"

# ==================================
# Make sure that x_now is in LOCAL TIME (NZDT for the current experiment)
x_now <- Sys.time()
# The beginning of this summary ... 14 hours in the past
x_start <- x_now - 8 * 3600

#x_start <- as.POSIXct("2021-11-30 08:00:00")
#x_now <- as.POSIXct("2021-11-30 16:00:00")


# Turn it to the format that the API wants
startDate <- strftime(x_start, format = "%Y-%m-%d%%20%H:%M:%S")
endDate <- strftime(x_now, format = "%Y-%m-%d%%20%H:%M:%S")

hhbuilt_url <- paste0(hhbase_url,
                            "apiKey=",hhapiKey,"&",
                            "devID=",device$devID,"&",
                            "startDate=",startDate,"&",
                            "endDate=",endDate)

req1 <- curl_fetch_memory(hhbuilt_url)
reqOK <- req1$status_code==200
if (!reqOK){
    print("Request error")
}

#Check we've got some data

server_response <- fromJSON(rawToChar(req1$content))
data_available <- server_response$status==1

# There is data in sensorlog so parse it
jreq1 <- server_response$sensorlog
npoints <- length(jreq1)
dataHH.raw <- data.frame(id = (1:npoints))
dataHH.raw$date <- as.POSIXct(jreq1[[1]][[1]])
dataHH.raw$Temperature <- NA
dataHH.raw$Humidity <- NA
dataHH.raw$CO2 <- NA
dataHH.raw$PM2.5 <- NA
dataHH.raw$sensorName <- device$name

for (i in (1:npoints)){
  dataHH.raw$date[i] <- as.POSIXct(jreq1[[i]][[1]])
  dataHH.raw$Temperature[i] <- as.numeric(jreq1[[i]][[2]])
  dataHH.raw$Humidity[i] <- as.numeric(jreq1[[i]][[3]])
  dataHH.raw$CO2[i] <- as.numeric(jreq1[[i]][[4]])
  dataHH.raw$PM2.5[i] <- as.numeric(jreq1[[i]][[5]])
}
dataHH.raw$datenum <- as.numeric(dataHH.raw$date)

dataHH <- timeAverage(dataHH.raw, avg.time = '15 sec', fill = TRUE)

## Detected data with Daylight Saving Time.

dataHH$sensorName <- dataHH.raw$sensorName[1]

names(dataHH)

## [1] "date"        "id"          "Temperature" "Humidity"    "CO2"        
## [6] "PM2.5"       "datenum"     "sensorName"

#What does it look like
ggplot(dataHH, aes(x=date, y = CO2)) + 
  geom_line(colour = "red")

Let’s see what a log transformation does

dataHH$logCO2 <- log(dataHH$CO2)
ggplot(dataHH, aes(x=date, y = logCO2)) + 
  geom_line(colour = "red")

Let’s try to find some exponential decays in the first couple of days of data

# Walk the timeseries with a X minutes window and try to fit an exponential decay.
nobs <- length(dataHH$date)
obs_span <- 20 # Number of observations that make X  minutes
#setup parallel backend to use many processors
cores <- detectCores()
cl <- makeCluster(cores - 2) #not to overload your computer
registerDoParallel(cl)
r2_array <- foreach(i=1:(nobs-obs_span),
                    .errorhandling = 'remove',
                    .combine = rbind) %dopar%
  {
    out_tmp <- data.frame(id = i)
    out_tmp$adj.r2 <- NA
    #for (i in (1:(nobs-30))){
    window_data <- dataHH[(i:(i+obs_span)),]
    window_data$hours <- (window_data$datenum - min(window_data$datenum,na.rm = TRUE)) / 3600
    
    # Fit linear model logCO2 ~ hours
    curr_model <- lm(data = window_data,logCO2 ~ hours)
    
    out_tmp$adj.r2 <- summary(curr_model)$adj.r.squared
    out_tmp$exp_coeff <- curr_model$coefficients[2]
    out_tmp$intercept <- exp(curr_model$coefficients[1])
    out_tmp
  }
stopCluster(cl)

# Flag "high R2"
r2_array$highR2 <- r2_array$adj.r2
r2_array$highR2[r2_array$adj.r2<=0.85] <- NA

ggplot(data = r2_array, aes(x = dataHH$date[id])) +
  geom_point(aes(y = adj.r2), colour = 'black') +
  geom_point(aes(y=highR2), colour = 'red') +
  geom_line(aes(y = dataHH$CO2[id]/max(dataHH$CO2[id],na.rm = TRUE)), colour = 'blue') +
  ylim(0,1)

Let’s try to fit the full model \(ln(\frac{C_{ss}-C_{in}}{C_{ss}-C(t)})= Ft\)

With \(C_{ss}=C_{out} + \frac{S}{FV}\)

# Define functions
# Constants for the fitting
CO2ppm2Kgm3 <- 0.0018
min_CO2 <- min(dataHH$CO2)
Volume <- 225

# Mass balance model
f_model <- function(f.BG,f.S,f.ACH,f.Vol,f.C,f.t){
  # Parameters:
  # f.BG = CO2 background concentration [Kg/m3]
  # f.S = CO2 emission rate TOTAL [Kg/hr]
  # f.ACH = Air Changes per Hour [1/hr]
  # f.Vol = Classroom Volume [m3]
  # f.C = Initial CO2 concentration [Kg/m3]
  # f.t = Time since the start of the increase/decrease [hr]
  ndata <- length(f.t)
  start_time <- f.t[1]
  C_start <- f.C[1] - f.BG
  term1 <-(f.S/(f.ACH*f.Vol)) * (1 - exp(-1 * f.ACH*(f.t - start_time)))
  term2 <- (C_start - f.BG) * exp(-1 * f.ACH*(f.t - start_time))
  f.C <- f.BG + term1 + term2
  f.C
}

# Model error function
f_optim <- function(params,C,t){
  # params = c(CO2 Emission Rate, ACH)
  # C = Array of CO2 concentrations [Kg/m3]
  # t = Array of time [hr]
  
  # Note BACK differences so first datapoint is not used
  f_optim <- 0
  npoints <- length(C)
  model_full <- f_model(min(C,na.rm = TRUE),
                        params[1],
                        params[2],
                        Volume,
                        C[1],
                        t)
  sum((model_full - C[2:npoints])^2)
}

# Running the fitting over the whole timeseries
n_R <- length(r2_array$id)

cl <- makeCluster(cores - 2) #not to overload your computer
registerDoParallel(cl)
model_coeffs <- foreach(i=1:(n_R),
                    .errorhandling = 'remove',
                    .combine = rbind) %dopar%
  {
    
    out_tmp <- data.frame(id = r2_array$id[i])
    out_tmp$ACH <- NA
    out_tmp$ER <- NA
    out_tmp$error <- NA
    try({
      # Select this segment of data (30 minutes)
      c_data <- dataHH[(r2_array$id[i]:(r2_array$id[i]+obs_span)),]
      c_data$hours <- (c_data$datenum - min(c_data$datenum,na.rm = TRUE)) / 3600
      # Optimisation 
      fit_result <- optim(c(1e-5,1),
                          f_optim,
                          C=c_data$CO2*CO2ppm2Kgm3,
                          t=c_data$hours)
      out_tmp$ACH <- fit_result$par[2]
      out_tmp$ER <- fit_result$par[1]
      out_tmp$error <- fit_result$value
    }, silent = TRUE)
    out_tmp
  }
stopCluster(cl)

Now plotting

ggplot()+
  geom_point(aes(x = dataHH$date[model_coeffs$id],y = (model_coeffs$ER)))

ggplot()+
  geom_point(aes(x = dataHH$date[model_coeffs$id],y = model_coeffs$ACH/max(model_coeffs$ACH)), colour = "black") +
  geom_point(aes(x = dataHH$date[model_coeffs$id],y = r2_array$adj.r2/max(r2_array$adj.r2)), colour = "red") +
  geom_line(aes(x = dataHH$date,y = dataHH$CO2/max(dataHH$CO2)), colour = "blue")

ggplot()+
  geom_point(aes(x = dataHH$date[model_coeffs$id],y = (model_coeffs$ACH)))

Now let’s try “nls”

# The model function is the same "f_model"

# Running the fitting over the whole timeseries
n_R <- length(r2_array$id)

cl <- makeCluster(cores - 2) #not to overload your computer
registerDoParallel(cl)
model_coeffs <- foreach(i=1:(n_R),
                    .errorhandling = 'remove',
                    .combine = rbind) %dopar%
  {
    
    out_tmp <- data.frame(id = r2_array$id[i])
    out_tmp$ACH <- NA
    out_tmp$ER <- NA
    out_tmp$error <- NA
    try({
      # Select this segment of data (30 minutes)
      c_data <- dataHH[(r2_array$id[i]:(r2_array$id[i]+obs_span)),]
      c_data$hours <- (c_data$datenum - min(c_data$datenum,na.rm = TRUE)) / 3600
      c_data$CO2 <- c_data$CO2 * CO2ppm2Kgm3
      # Optimisation
      model.nls <- nls(data = c_data,
                       formula = (CO2-CO2[1])*CO2ppm2Kgm3 ~ f_model(min(CO2),
                                                                    EmRate,
                                                                    ACH,
                                                                    Volume,
                                                                    CO2[1],
                                                                    hours),
                       start = list(EmRate = 0,
                                    ACH = 0.1),
                       trace = TRUE,
                       algorithm = "port")
      summary_model <- summary(model.nls)
      out_tmp$ACH <- summary_model$coefficients[2,1]
      out_tmp$ER <- summary_model$coefficients[1,1]
      
      RSS.p <- sum(residuals(model.nls)^2)  # Residual sum of squares
      TSS <- sum((c_data$CO2 - mean(c_data$CO2))^2)  # Total sum of squares
      out_tmp$FitMeas <- 1 - RSS.p/TSS
    }, silent = false)
    out_tmp
  }
stopCluster(cl)

Now plotting

ggplot()+
  geom_point(aes(x = dataHH$date[model_coeffs$id],y = (model_coeffs$ER)))

ggplot()+
  geom_point(aes(x = dataHH$date[model_coeffs$id],y = (model_coeffs$ACH)))

ggplot()+
  geom_point(aes(x = dataHH$date[model_coeffs$id],y = (model_coeffs$FitMeas)))

ggplot(data = subset(model_coeffs, FitMeas>0.85))+
  geom_point(aes(x = dataHH$date[id],y = (ER)))

ggplot(data = subset(model_coeffs, FitMeas>0.85))+
  geom_point(aes(x = dataHH$date[id],y = (ACH)))

Exploring HauHau data

Gustavo Olivares