The calibration workflow for MiCS-4514 sensors is given below (see the manufacturer's datasheet for the SGX MiCS series). The calibration routine provides software corrections for low-cost metal oxide gas sensors; the R code is available on request from M. Emre Sener (see GitHub).
Implementation is done in R; session information, comments and implementation notes are given in the margins.
---
title: "MiCS-4514 Calibration"
author: "M. Emre Sener"
output:
tufte::tufte_handout: default
tufte::tufte_html: default
---
version
## _
## platform x86_64-apple-darwin13.4.0
## arch x86_64
## os darwin13.4.0
## system x86_64, darwin13.4.0
## status
## major 3
## minor 3.2
## year 2016
## month 10
## day 31
## svn rev 71607
## language R
## version.string R version 3.3.2 (2016-10-31)
## nickname Sincere Pumpkin Patch
There are three goals of this vignette:
The following libraries are used in the workflow; all are available from the CRAN repository.
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tidyr)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(anytime)
library(dyn)
library(neuralnet)
##
## Attaching package: 'neuralnet'
## The following object is masked from 'package:dplyr':
##
## compute
library(openair)
## Loading required package: maps
This section describes the import and reshaping of raw data from the low-cost sensor module.
Chemduino measurements for the entire measurement period are loaded manually in CSV format; this could also be achieved through a suitable API.
chemduino<-read.csv("feeds8.csv",header=TRUE)
names(chemduino)<-c("Time","ID","C.Temp","C.Humidity","C.PM10","C.PM2.5","C.NO2","C.CO","C.Batt")
chemduino$Time<-as.POSIXct(strptime(as.character(chemduino$Time),format="%Y-%m-%d %H:%M:%S"))
The data set is split into contiguous subsets at periods of inactivity (the splitting period is given by the cutoff parameter, in seconds). Subsets spanning less than an optional threshold (in days) are discarded.
##Declare utility functions
# Split a data frame with a POSIXct "Time" column into contiguous windows,
# breaking wherever two consecutive readings are more than 'cutoff' seconds apart.
# Windows spanning fewer than 'threshold' days are discarded.
return_windows<-function(x,cutoff=21600, threshold=0){
  indices<-x$Time
  l<-c(0)
  # Record the index of every gap longer than the cutoff
  for(i in 1:(length(indices)-1)){
    init<-indices[[i]]
    sec<-indices[[i+1]]
    if(sec-cutoff>init){
      l<-c(l,i)
    }
  }
  l<-c(l,length(indices))
  # Slice the data frame between successive break points
  windowed.data<-list()
  no.windows<-length(l)
  if(no.windows>0){
    for(j in 1:(no.windows-1)){
      windowed.data[[j]]<-x[c((l[j]+1):(l[j+1])),]
    }
  }
  # Optionally drop windows shorter than the threshold (in days)
  if(threshold){
    keep<-sapply(windowed.data,function(w) difftime(w[nrow(w),1],w[1,1],units="days")>threshold)
    windowed.data<-windowed.data[keep]
  }
  windowed.data
}
# Convert a raw ADC count to the sensing-element resistance (ohms) by
# inverting the voltage divider against the auxiliary resistor raux
return_resistance<-function(x,vin=5,raux=22000,scale_max=1023){
  (vin*raux)/(x*vin/scale_max)-raux
}
# Return the lag at which the cross-correlation between two series peaks,
# interpolating over missing values before computing the CCF
find_max_CCF<-function(a,b){
  d <- ccf(a, b, plot = FALSE, na.action = na.approx, lag.max = 1000)
  res <- data.frame(cor = d$acf[,,1], lag = d$lag[,,1])
  res[which.max(res$cor),]
}
##Applied to dataset
chemduino<-return_windows(chemduino,21600,6)
summary(chemduino)
## Length Class Mode
## [1,] 9 data.frame list
## [2,] 9 data.frame list
Reference data is sourced from the London Air Quality Network using the openair R package. The reference location is the LAQN London Bloomsbury site (site code BL0), chosen for its proximity. Wind speed and direction data are also collected to aid analysis. A summary plot of selected pollutants and temperature is given below. Chemduino data labels are prefixed with “C.”
The data is then coerced into the “zoo” class and merged with the windowed time series from the Chemduino. Imputation can be done using spline or linear approximations; the spline method is chosen to allow smooth transitions between low-resolution data points. After the merge, the object is converted back into a time-indexed data frame.
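The difference between the two imputation options can be seen on a toy zoo series (a minimal sketch on synthetic data; the actual merge below applies na.spline to the merged Chemduino and reference series):
# Linear vs. spline imputation in zoo; na.spline() is what the merge below uses
library(zoo)
z <- zoo(c(1, NA, NA, 4, NA, 9), order.by = as.POSIXct("2017-01-01 00:00") + 3600*(0:5))
na.approx(z)  # straight-line interpolation between known points
na.spline(z)  # cubic spline, smoother across coarse reference data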
library(openair)
ccl2<-importKCL(site="BL0",year=2016:2017,met=TRUE)
## NOTE - mass units are used
## ug/m3 for NOx, NO2, SO2, O3; mg/m3 for CO
## PM10_raw is raw data multiplied by 1.3
# Clip the reference data to the overall span of the Chemduino windows
lastday<-as.character(as.Date(chemduino[[length(chemduino)]][length(chemduino[[length(chemduino)]][,1]),1]))
firstday<-as.character(as.Date(chemduino[[1]][1,1]))
ccl2<-selectByDate(ccl2,start=firstday,end=lastday)
cclZoo<-read.zoo(ccl2)
mergedData_<-data.frame()
for(i in chemduino){
  # Clip the reference data to the span of this Chemduino window
  lastday_<-as.character(as.Date(i[length(i[,1]),1]))
  firstday_<-as.character(as.Date(i[1,1]))
  temp_<-selectByDate(ccl2,start=firstday_,end=lastday_)
  # Drop reference columns that are not used in the analysis
  temp_$site<-NULL;temp_$code<-NULL;temp_$solar<-NULL;temp_$so2<-NULL;temp_$v10<-NULL;temp_$v2.5<-NULL;temp_$nv10<-NULL;
  temp_$nv2.5<-NULL; temp_$nox<-NULL;
  # Merge, spline-impute onto the Chemduino timestamps and append
  tempZoo<-read.zoo(temp_)
  tempi<-read.zoo(i)
  mergedData_<-rbind(mergedData_,(as.data.frame(window(na.spline(merge(tempi,tempZoo)),index(tempi)))))
}
date<-as.POSIXct(as.character(rownames(mergedData_)))
mergedData_<-cbind(date,mergedData_)
mergedData_<-mergedData_[,c("date","wd","ws","o3","C.CO","C.PM10","pm10","C.PM2.5","pm25","C.NO2","no2","C.Temp","C.Humidity")]
summaryPlot(mergedData_,print.datacap=FALSE,period="months",type="density")
## date1 date2 wd ws o3 C.CO
## "POSIXct" "POSIXt" "numeric" "numeric" "numeric" "numeric"
## C.PM10 pm10 C.PM2.5 pm25 C.NO2 no2
## "numeric" "numeric" "numeric" "numeric" "numeric" "numeric"
## C.Temp C.Humidity
## "numeric" "numeric"
LAQN Data Over Collection Period and Density Summary
There appears to be no correlation between the NO2 figures from the two data sets; calibration is required to remove the confounding effects of temperature, humidity and input-voltage ripple.
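A quick numerical check of this claim (a sketch, assuming mergedData_ has been built as above):
# Pearson correlation between raw Chemduino NO2 and the LAQN reference;
# a value near zero supports the need for calibration
cor(mergedData_$C.NO2, mergedData_$no2, use = "pairwise.complete.obs")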
#Import
calibcurve<-read.csv("calibcurves3.csv",sep=",")
##Differentially amplified against 2.5V virtual ground
calibcurve$micsc<-2500+calibcurve$mics
#To volts
calibcurve$micsc<-calibcurve$micsc/1000
#Resistance
calibcurve$micsr<-(1/(calibcurve$micsc/(5*22000)))-22000
#Fit concentration (X, ppb) against resistance, excluding rows 450:480
secdeg<-lm(formula = X ~ micsr, data = calibcurve[-(450:480), ])
summary(secdeg)
##
## Call:
## lm(formula = X ~ micsr, data = calibcurve[-(450:480), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.4074 -2.7900 0.6172 3.6777 11.9834
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5.697e+01 1.000e+00 -56.97 <2e-16 ***
## micsr 1.680e-03 1.890e-05 88.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.587 on 477 degrees of freedom
## Multiple R-squared: 0.9431, Adjusted R-squared: 0.943
## F-statistic: 7906 on 1 and 477 DF, p-value: < 2.2e-16
##Not shown: Transformation of raw differential ADC reads to voltages and to resistance values
long<-read.csv("highrange.csv")
long<-long[-(1:300),-c(3,4,5,6)]
names(long)<-c("noxraw","aerosense","noxmv","noxR","concppb")
long$concppb<-long$concppb*100
#Models against aerosense
summary(lm(formula = aerosense ~ noxR + I(noxR^2), data = long))
##
## Call:
## lm(formula = aerosense ~ noxR + I(noxR^2), data = long)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.18640 -0.06713 -0.00984 0.06444 0.23103
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.278e+00 2.010e-01 -21.285 < 2e-16 ***
## noxR 1.275e-04 1.035e-05 12.321 < 2e-16 ***
## I(noxR^2) -6.465e-10 1.308e-10 -4.942 9.61e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08482 on 729 degrees of freedom
## Multiple R-squared: 0.9525, Adjusted R-squared: 0.9524
## F-statistic: 7309 on 2 and 729 DF, p-value: < 2.2e-16
summary(lm(formula = aerosense ~ noxR, data = long))
##
## Call:
## lm(formula = aerosense ~ noxR, data = long)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.20095 -0.06512 -0.01405 0.06145 0.25085
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.291e+00 2.368e-02 -139.0 <2e-16 ***
## noxR 7.649e-05 6.432e-07 118.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.08617 on 730 degrees of freedom
## Multiple R-squared: 0.9509, Adjusted R-squared: 0.9508
## F-statistic: 1.414e+04 on 1 and 730 DF, p-value: < 2.2e-16
We know the voltage divider equation for two resistors in series, with \(R_{aux}\) connected to ground (the greatest output range is obtained when the nominal \(R_{1}\) equals \(R_{aux}\)):
\[V_{out}=V_{in} * \frac{R_{aux}}{R_{1}+R_{aux}}\]
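Rearranging for the unknown sensor resistance gives \(R_{1}=R_{aux}\left(\frac{V_{in}}{V_{out}}-1\right)\), which is what return_resistance() implements; a worked sketch for a single mid-scale ADC reading:
# Worked inversion of the divider equation (illustrative values only)
adc<-512                   # example 10-bit ADC count
vin<-5; raux<-22000
vout<-adc*vin/1023         # ADC count back to volts
raux*(vin/vout-1)          # R1, approx. 21957 ohm; matches return_resistance(512)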
library(ggplot2)
ggplot(data=long,aes(x=concppb,y=noxR,colour=2))+
geom_point()+
geom_smooth(method="lm",formula = y ~ I(x) + I((x)^2))+
labs(title="Resistance vs. Concentration at high concentrations")+
xlab("Concentration /10*ppb")+ ylab("Resistance / kOhm")+
theme(legend.position = "none")
summary(lm(formula = concppb ~ noxR + I(noxR^2), data = long))
##
## Call:
## lm(formula = concppb ~ noxR + I(noxR^2), data = long)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.5395 -1.8977 -0.4108 1.4757 8.4527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.506e+02 6.947e+00 -21.68 <2e-16 ***
## noxR 7.149e-03 3.578e-04 19.98 <2e-16 ***
## I(noxR^2) -6.618e-08 4.523e-09 -14.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.932 on 729 degrees of freedom
## Multiple R-squared: 0.9158, Adjusted R-squared: 0.9156
## F-statistic: 3964 on 2 and 729 DF, p-value: < 2.2e-16
summary(lm(formula = concppb ~ noxR, data = long))
##
## Call:
## lm(formula = concppb ~ noxR, data = long)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.8937 -2.2405 -0.7954 1.4178 9.8314
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4.961e+01 9.160e-01 -54.16 <2e-16 ***
## noxR 1.922e-03 2.488e-05 77.27 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.333 on 730 degrees of freedom
## Multiple R-squared: 0.8911, Adjusted R-squared: 0.8909
## F-statistic: 5971 on 1 and 730 DF, p-value: < 2.2e-16
##Candidate transfer functions built from the fitted coefficients above
#Fits against the aerosense reference (quadratic, then linear)
predictNO2_1<-function(x){-4.27 + 1.2754e-4*x - 6.465494e-10*x^2}
predictNO2_2<-function(x){-3.29 + 7.649e-5*x}
#Fits against concentration in ppb (quadratic, then linear)
predictNO2_3<-function(x){-1.506e2 + 7.149e-3*x - 6.618e-8*x^2}
predictNO2_4<-function(x){-4.961e1 + 1.922e-3*x}
predictNO2_hand<-function(x){-1.506e2 + 7.149e-3*x - 6.618e-8*x^2}
predictNO2_tri<-function(x){-1.506e2 + 7.149e-3*x - 8.618e-3*x^2 + 8.618e-4*x^3}
To check the linearity and zero point of the sensor and to obtain the form of the transfer function, it is tested against a calibrated reference under a controlled synthetic atmosphere with varying NO2 concentrations. As the chemoresistive element is in series with a constant 22 k\(\Omega\) (1%) resistor (labelled aux), the output relation is given by the voltage divider equation. Linear and polynomial fits are extracted from the data set.
The overall relationship between sensor resistance and NO2 concentration
library(ggplot2)
ggplot(data=calibcurve[-(450:470),],aes(x=X,y=I(micsr/1000),colour=2))+
geom_point()+
geom_smooth(method="lm",formula = y ~ I(x) + I((x)^2))+
labs(title="Resistance vs. Concentration at low concentrations")+
xlab("Concentration /ppb")+ ylab("Resistance / kOhm")+
theme(legend.position = "none")
It is evident that the second-degree term is statistically significant and that the correlation with the aerosense reference is considerably higher, owing to the similar response times (time to reach 50% of the stable value \(\approx\) 2.5 min per 0.1 ppm).
Because generating controlled environments with set temperatures and humidities is complex, statistical corrections based on passively collected data are more practical.
The largest chunk of data, spanning December 2016 and January 2017, will be split into training and testing sets. Two approaches are possible: slicing the time series in half, or random sampling; here the data will be randomly sampled. Despite their predictive power, PM10 and PM2.5 are not included, as their inclusion in some models might cause a significant loss of generality.
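For reference, the chronological alternative would simply slice the time-ordered data frame in two; a minimal sketch (split_chrono is a hypothetical helper, not part of the original workflow):
# Hypothetical chronological split (not used below; the analysis samples randomly)
split_chrono<-function(d, frac = 0.5){
  n<-floor(frac*nrow(d))
  list(train = d[1:n, ], test = d[(n+1):nrow(d), ])
}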
library(neuralnet)
library(stats)
firstday_<-as.Date(chemduino[[2]][1,1])
mergedDatashort<-mergedData_[mergedData_$date>(firstday_+3) & mergedData_$date<(firstday_+21) ,]
#Extract relevant predictors
data<-mergedDatashort[,c(1,8,10,11,12,13)]
#Convert the raw ADC reading to resistance and apply the fitted transfer function
data$C.NO2<-predictNO2_3(return_resistance(data$C.NO2,5,22000,1023))
##Random sampling of 3/4 of data indices
index <- sample(1:nrow(data),round(0.75*nrow(data)))
#Split into two sets. Only the non-linear transformation from raw readings to resistance is applied, as the second-degree term is negligible in the ambient range (<0.5 ppm).
train <- data[index,]
test <- data[-index,]
#Linear model fit as a benchmark; remove the date column so it is not used as a predictor.
lm.fit <- glm(data=train[,-1], no2 ~.)
summary(lm.fit)
##
## Call:
## glm(formula = no2 ~ ., data = train[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -61.332 -12.269 -0.039 11.024 189.346
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 165.845365 23.081970 7.185 7.67e-13 ***
## C.PM2.5 0.197506 0.005214 37.881 < 2e-16 ***
## C.NO2 -0.111858 0.087612 -1.277 0.202
## C.Temp 0.174051 0.213093 0.817 0.414
## C.Humidity -1.554007 0.034881 -44.552 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 395.1089)
##
## Null deviance: 3699874 on 5209 degrees of freedom
## Residual deviance: 2056542 on 5205 degrees of freedom
## AIC: 45944
##
## Number of Fisher Scoring iterations: 2
#Predict the NO2 values based on the linear model on the test set.
pr.lm <- predict(lm.fit,test)
ggplot()+
  geom_smooth(aes(y=scale(test$C.NO2),x=test$date,colour="Raw Data"),method="loess",span=0.05)+
  geom_smooth(aes(y=scale(test$no2),x=test$date,colour="Reference"),method="loess",span=0.05)+
  labs(title="Scaled Raw Data vs Reference (R=0.21)")+
  xlab("Date")+ ylab("Scaled concentration")+
  scale_colour_discrete(name = "")
ggplot()+
  geom_smooth(aes(y=pr.lm,x=test$date,colour="Linear Pred."),method="loess",span=0.05)+
  geom_smooth(aes(y=test$no2,x=test$date,colour="Reference"),method="loess",span=0.05)+
  labs(title="Linear Model vs Reference (R=0.65)")+
  xlab("Date")+ ylab("Concentration /ppb")+
  scale_colour_discrete(name = "")
#Find the max and min of each column for min-max scaling
max_<- apply(data[,-1], 2, max)
min_<- apply(data[,-1], 2, min)
dates_<-data[,1]
##Scale the data to ease the life of the neural network
scaled <- as.data.frame(scale(data[,-1], center = min_, scale = max_ - min_))
train_ <- scaled[index,]
test_ <- scaled[-index,]
library(neuralnet)
library(NeuralNetTools)
##2 hidden layers with 5 and 3 nodes, linear output as we don't require classification. Might be a good idea for binning purposes later on.
# nn <- neuralnet(no2 ~ (C.PM2.5 + C.NO2 + C.Temp + C.Humidity), data=train_, hidden=c(5,3), linear.output=TRUE, threshold=0.05)
#The fitted network is loaded from file rather than retrained here
nn<-readRDS("nn3.rda")
pr.nn <- neuralnet::compute(nn,test_[-3])
plotnet(nn)
Neural Network Diagram and Evaluation
#Rescale the network output from the [0,1] training scale back to concentration units
pr.nn_ <- pr.nn$net.result*(max(data$no2)-min(data$no2))+min(data$no2)
ggplot()+
  geom_smooth(aes(y=pr.lm,x=test$date,colour="Linear Pred."),size=0.5,method="loess",span=0.05)+
  geom_smooth(aes(y=test$no2,x=test$date,colour="Reference"),size=0.5,method="loess",span=0.05)+
  geom_smooth(aes(y=pr.nn_,x=test$date,colour="Neural Network"),size=0.5,method="loess",span=0.05)+
  labs(title="NN vs LM vs Reference (NN R=0.86)")+
  xlab("Date")+ ylab("Concentration /ppb")+
  scale_colour_discrete(name = "")
Neural Network Diagram and Evaluation
#Rescale the held-out reference values and compute the test-set MSE of the network
test.r <- (test_$no2)*(max(data$no2)-min(data$no2))+min(data$no2)
MSE.nn <- sum((test.r - pr.nn_)^2)/nrow(test_)
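For a like-for-like comparison, the same mean squared error can be computed for the linear-model predictions on the held-out rows (a sketch; MSE.lm is a name introduced here, not part of the original script):
#Test-set MSE of the benchmark linear model, to put MSE.nn in context
MSE.lm <- sum((test$no2 - pr.lm)^2)/nrow(test)
c(MSE.lm = MSE.lm, MSE.nn = MSE.nn)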
An interesting metric for both the linear and neural-network models is their performance on discontinuous data sets, and whether there is any lag between predictor and response due to the distance between the sensor and the reference site.
Binned Performance and CCF
ccf(as.vector(pr.nn_),as.vector(test$no2),lag.max=150)
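The find_max_CCF() helper defined at the start of the workflow condenses this plot into the single lag of maximum cross-correlation; a usage sketch:
#Lag (in samples) at which the NN prediction and the reference correlate most strongly
find_max_CCF(as.vector(pr.nn_), as.vector(test$no2))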
Binned Performance and CCF
##Bootstrap results go here!
#Bin the reference and NN-predicted concentrations into 20 ppb bins and plot their coincidence as a 2D density
bins<-data.frame(test$no2, bin=cut(test$no2, c(1,20,40,60,80,100,120,140), include.lowest=TRUE))
bins2<-data.frame(as.vector(pr.nn_),bin=cut(as.vector(pr.nn_), c(1,20,40,60,80,100,120,140), include.lowest=TRUE))
mat<-bins[,2]
mat<-cbind(bins2[,2],mat)
df<-as.data.frame(mat)
ggplot(data=df)+
  stat_density_2d(aes(x=V1,y=mat,fill=..level..),geom="polygon")+
  labs(title="Coincidence of binned measurements in 20 ppb bins")+
  xlab("NN Prediction Bin")+ylab("Reference Bin")
## Warning: Removed 9 rows containing non-finite values (stat_density2d).
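The same coincidence can be tabulated directly (a sketch using the binned vectors built above):
#Cross-tabulation of reference bins against NN-prediction bins;
#mass on the diagonal indicates agreement to within 20 ppb
table(reference = bins$bin, predicted = bins2$bin)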
plot(lm.fit)
Diagnostic Plots (four panels produced by plot(lm.fit))
##Bootstrap results go here!