###############################################################################
# Analysis script for the manuscript "The Outdoor Dust Information Node (ODIN)
# - Development and performance assessment of a low cost ambient dust sensor" 
# submitted to Atmospheric Measurement Techniques. June 2015
###############################################################################
## Preamble ####
# Libraries
require(ggplot2)
## Loading required package: ggplot2
require(openair)
## Loading required package: openair
## Loading required package: lazyeval
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## Loading required package: maps
require(reshape2)
## Loading required package: reshape2
require(gridExtra)
## Loading required package: gridExtra
## Loading required package: grid
# Seed
set.seed(1)
###############################################################################

## Baseline ####
raw_dust_data_baseline <- read.table("http://files.figshare.com/2099394/sharp_baseline_test.tsv",
                            header = T,
                            sep = "",
                            quote = "\"'",
                            dec = ".",
                            row.names=NULL)
# Scale the Dust measurements to mV (the 0 - 1024 digital scale maps to 0 - 5000 mV)
for (i in (3:10)){
  raw_dust_data_baseline[,i]<-raw_dust_data_baseline[,i]*5000/1024
}
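# Example of the conversion: a mid-scale reading of 512 counts corresponds to
# 512 * 5000 / 1024 = 2500 mV (illustrative value only, not taken from the data)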
# force GMT as the time zone to avoid openair issues with daylight saving switches
# The actual time zone is 'NZST'
raw_dust_data_baseline$date=as.POSIXct(paste(raw_dust_data_baseline$Date,raw_dust_data_baseline$Time),tz='GMT')
raw_dust_data_baseline$Time<-NULL
raw_dust_data_baseline$Date<-NULL

# Average to 10-minute intervals
dust_data_baseline<-timeAverage(mydata = raw_dust_data_baseline,avg.time = '10 min',statistic = 'mean')
dust_summary_baseline <- lapply( dust_data_baseline[,(2:9)] , function(x) rbind( mean = mean(x),
                                                               sd = sd(x) ,
                                                               median = median(x),
                                                               minimum = min(x),
                                                               maximum = max(x),
                                                               s.size = length(x)))
dust_summary_baseline <- data.frame( dust_summary_baseline )
dust_summary_baseline
##               Dust1       Dust2       Dust3       Dust4       Dust5
## mean      0.2429041   0.9865411  27.9243672   0.9608271   8.6837857
## sd        0.1525655   0.3008985   0.9117687   0.2750282   0.3684347
## median    0.2077793   0.9350066  27.9463098   0.9350066   8.7267287
## minimum   0.0000000   0.2077793  25.0374003   0.2077793   7.2722739
## maximum   0.7272274   1.7661237  30.4396609   1.9739029   9.6617354
## s.size  678.0000000 678.0000000 678.0000000 678.0000000 678.0000000
##               Dust6       Dust7       Dust8
## mean      4.4550723  10.9696956  10.6632976
## sd        0.3009904   0.3966578   0.4504248
## median    4.4672540  10.9084109  10.7006316
## minimum   3.5603841   9.7656250   9.3500665
## maximum   5.5061503  12.0511968  12.1053060
## s.size  678.0000000 678.0000000 678.0000000
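# Summary statistics (mV) of the 10-minute averaged baseline signals for the
# eight sensors; the 'mean' row is re-used below as the per-sensor baseline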
###############################################################################

## Temperature dependence ####
raw_dust_data_temperature <- read.table("http://files.figshare.com/2099396/sharp_baseline_temperature_Nov2014.tsv",
                            header = T,
                            sep = "",
                            quote = "\"'",
                            dec = ".",
                            row.names=NULL)
# Baseline estimates from the baseline test above
baseline<-as.numeric(dust_summary_baseline['mean',])
# Scale the Dust measurements to mV (the 0 - 1024 digital scale maps to 0 - 5000 mV)
# Subtract the baseline estimated above from each signal.
for (i in (3:10)){
  raw_dust_data_temperature[,i]<-raw_dust_data_temperature[,i]*5000/1024-baseline[i-2]
}
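# Columns 3:10 hold the eight dust signals (as in the baseline file), so
# baseline[i-2] subtracts each sensor's own baseline estimate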
# force GMT as the time zone to avoid openair issues with daylight saving switches
# The actual time zone is 'NZST'
raw_dust_data_temperature$date=as.POSIXct(paste(raw_dust_data_temperature$Date,raw_dust_data_temperature$Time),tz='GMT')
raw_dust_data_temperature$Time<-NULL
raw_dust_data_temperature$Date<-NULL

# Average to 10-minute intervals
dust_data_temperature<-timeAverage(mydata = raw_dust_data_temperature,avg.time = '10 min',statistic = 'mean')
long_scatter_temperature=reshape(dust_data_temperature[,c(2,3,4,5,6,7,8,9,10,11)],
                                 direction = 'long',
                                 varying = names(dust_data_temperature[2:9]),
                                 v.name='Dust',
                                 timevar = 'Sensor')
long_scatter_temperature$Sensor=as.character(long_scatter_temperature$Sensor)
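# For reference, the same long format could also be produced with reshape2::melt
# (already attached). Not run here: the Sensor labels would come out as the
# column names (e.g. Dust1-Dust8) rather than the numeric codes used above.
# long_alt <- melt(as.data.frame(dust_data_temperature[, 2:11]),
#                  measure.vars = names(dust_data_temperature)[2:9],
#                  variable.name = 'Sensor', value.name = 'Dust')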
png("./sensor_temperature_scatter.png",height = 620,width = 1280)
long_scatter_temperature_plot<-ggplot(long_scatter_temperature, aes(x=Temperature, y=Dust, color=Sensor)) +
  geom_point(shape=19,alpha=1/3,size = 10)+
  theme_bw()+
  theme(text = element_text(size = 30))+
  xlab('Temperature (C)')+
  ylab('Sensor response (mV)')
long_scatter_temperature_plot
## Warning in loop_apply(n, do.ply): Removed 744 rows containing missing
## values (geom_point).
dev.off()
## png 
##   2
long_scatter_temperature_plot
## Warning in loop_apply(n, do.ply): Removed 744 rows containing missing
## values (geom_point).

summary(Temp.lm<-lm(data=long_scatter_temperature,Dust~Temperature))
## 
## Call:
## lm(formula = Dust ~ Temperature, data = long_scatter_temperature)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4725 -2.9810 -0.4161  2.5320  9.4704 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.25219    0.18366   17.71   <2e-16 ***
## Temperature  0.27018    0.01145   23.61   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.462 on 2742 degrees of freedom
##   (744 observations deleted due to missingness)
## Multiple R-squared:  0.1689, Adjusted R-squared:  0.1686 
## F-statistic: 557.3 on 1 and 2742 DF,  p-value: < 2.2e-16
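# Pooling all eight sensors, the baseline-corrected signal rises by ~0.27 mV per
# degree C, but temperature alone explains only ~17% of the variance (R^2 = 0.169)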
###############################################################################

## ODIN's performance evaluation ####
odin_01 <- read.table("http://files.figshare.com/2099373/odin01_24July_14August_2014.tsv",
                      header=TRUE,
                      quote="")
ecan_data<-read.table("http://files.figshare.com/2099539/StAlbans24July_14August_2014.tsv",
                      header = TRUE,
                      sep="\t",
                      quote = "\"'",
                      dec=".")
ecan_data$date<-as.POSIXct(ecan_data$date,format = "%m/%d/%Y %H:%M:%S")
odin_01$date=as.POSIXct(paste(odin_01$Date,odin_01$Time),tz='GMT')
odin_01$Time<-NULL
odin_01$Date<-NULL
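# Convert the battery ADC reading to volts (the 0 - 1024 scale maps to 0 - 5 V)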
odin_01$Battery<-5*odin_01$Battery/1024
# Merging the data
all_merged_FULL<-merge(odin_01,ecan_data,by = 'date',all = TRUE)
all_merged.10min<-timeAverage(all_merged_FULL,avg.time = '10 min')
# Time sync
# Check the time difference, correct the data and re-merge.
lag_test=ccf(all_merged.10min$Temperature,
             all_merged.10min$Temperature.1m,
             na.action=na.pass,
             lag.max=100,
             type='correlation',
             ylab='Correlation',
             main='Temperature correlation as function of clock lag')

odin01_lag=lag_test$lag[which.max(lag_test$acf)]
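# lag_test$lag is expressed in 10-minute averaging steps, so convert it to
# seconds (x 10 x 60) before shifting ODIN's clock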
odin_01$date=odin_01$date-odin01_lag*10*60
odin_01$ODIN = odin_01$Dust
odin_01$Dust<-NULL
all_merged_FULL<-merge(odin_01,ecan_data,by = 'date',all = TRUE)
all_merged.10min<-timeAverage(all_merged_FULL,avg.time = '10 min')
# Average the full dataset to 1-hour intervals
all_merged.1hr<-timeAverage(all_merged.10min,avg.time='1 hour')
all_merged.1hr$PMcoarse<-all_merged.1hr$PM10.FDMS - all_merged.1hr$PM2.5.FDMS
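# PMcoarse is the coarse particle fraction: FDMS PM10 minus FDMS PM2.5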

# Remove drift from ODIN raw data
# Estimate the baseline from a simple linear regression
all_merged.1hr$ODIN_drift<-predict(lm(all_merged.1hr$ODIN~seq(all_merged.1hr$ODIN)),newdata = all_merged.1hr)
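# (the regression is against the observation index, i.e. a straight-line trend in time)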

all_merged.1hr$ODIN_drift[1] # Baseline at the beginning
## [1] 403.3056
all_merged.1hr$ODIN_drift[length(all_merged.1hr$ODIN_drift)] # Baseline at the end 
## [1] 287.552
# Remove the baseline drift from the raw ODIN data
all_merged.1hr$ODIN_raw <- all_merged.1hr$ODIN
all_merged.1hr$ODIN_detrend<-all_merged.1hr$ODIN_raw - all_merged.1hr$ODIN_drift
# Temperature correction
summary(odin_T<-lm(data=all_merged.1hr,ODIN_detrend~Temperature,subset = Temperature>10))
## 
## Call:
## lm(formula = ODIN_detrend ~ Temperature, data = all_merged.1hr, 
##     subset = Temperature > 10)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -62.750  -7.120  -0.781   9.946  28.070 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -49.2798     3.8994  -12.64   <2e-16 ***
## Temperature   4.3821     0.2553   17.16   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13.49 on 189 degrees of freedom
##   (22 observations deleted due to missingness)
## Multiple R-squared:  0.6092, Adjusted R-squared:  0.6071 
## F-statistic: 294.6 on 1 and 189 DF,  p-value: < 2.2e-16
format(coef(odin_T),digits = 2)
## (Intercept) Temperature 
##     "-49.3"     "  4.4"
format((confint(odin_T)[,2]-confint(odin_T)[,1])/2,digits = 1)
## (Intercept) Temperature 
##       "7.7"       "0.5"
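# For temperatures above 10 C the detrended ODIN signal rises by roughly
# 4.4 +/- 0.5 mV per degree C (R^2 = 0.61); this fit is used below to remove
# the temperature dependence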
png('./odin_T_scatter.png',width = 750,height = 750)
odin_T_scatter<-qplot(Temperature, ODIN_detrend,data = all_merged.1hr)+
  #geom_point(colour = 'blue')
  geom_abline(intercept = coef(odin_T)[1],slope = coef(odin_T)[2])+
  ylab(bquote(ODIN[detrended]~'(mV)'))+
  xlab('Temperature (C)')+
  theme_bw()+
  theme(text=element_text(size=30))
odin_T_scatter
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_point).
dev.off()
## png 
##   2
odin_T_scatter
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_point).

all_merged.1hr$ODIN <- all_merged.1hr$ODIN_detrend - predict(odin_T,newdata = all_merged.1hr)
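# 'ODIN' now holds the detrended, temperature-corrected signal used for calibration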

# Dust performance using ECan data for calibration
# Calibration expression:
#  $ODIN_{calibrated}=A*ODIN_{raw}+B$
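# A and B are estimated below by regressing the FDMS PM2.5 reference on the
# corrected ODIN signal; predictions from that fit give the calibrated ODIN values.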

# Use (roughly) the first third of the data to fit the models.
# Note: the random 30% selection below is immediately overwritten by the
# deterministic "first third" split on the next line.
data_selection.1hr <- rbinom(length(all_merged.1hr$date),1,0.3)
data_selection.1hr <- (seq(length(all_merged.1hr$date))<(length(all_merged.1hr$date)/3))
# PM$_{2.5}$ FDMS
summary(odin1.lm.1hr.pm2.5.fdms<-
          lm(data=all_merged.1hr,PM2.5.FDMS~ODIN,
             subset = data_selection.1hr == 1))
## 
## Call:
## lm(formula = PM2.5.FDMS ~ ODIN, data = all_merged.1hr, subset = data_selection.1hr == 
##     1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -34.692  -6.487  -0.585   4.417  41.715 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 18.73469    0.89114   21.02   <2e-16 ***
## ODIN         0.63465    0.02067   30.70   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 10.95 on 158 degrees of freedom
##   (15 observations deleted due to missingness)
## Multiple R-squared:  0.8564, Adjusted R-squared:  0.8555 
## F-statistic: 942.6 on 1 and 158 DF,  p-value: < 2.2e-16
format(coef(odin1.lm.1hr.pm2.5.fdms),digits = 1)
## (Intercept)        ODIN 
##      "18.7"      " 0.6"
format((confint(odin1.lm.1hr.pm2.5.fdms)[,2]-confint(odin1.lm.1hr.pm2.5.fdms)[,1])/2,digits = 1)
## (Intercept)        ODIN 
##      "1.76"      "0.04"
# Calculate the corrected (calibrated) ODIN data
all_merged.1hr$ODIN.2.5f<-predict(odin1.lm.1hr.pm2.5.fdms,
                                  newdata = all_merged.1hr)
sqrt( mean( (all_merged.1hr$ODIN.2.5f-all_merged.1hr$PM2.5.FDMS)^2 , na.rm = TRUE ) ) #RMSE for the whole dataset
## [1] 15.41
cor(all_merged.1hr$ODIN.2.5f,all_merged.1hr$PM2.5.FDMS,use = "pairwise")^2 # R^2 for the whole dataset
## [1] 0.7242653
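# The two scores above can be bundled into a small helper; illustrative only,
# not used elsewhere in this script.
skill_scores <- function(estimate, reference) {
  c(RMSE = sqrt(mean((estimate - reference)^2, na.rm = TRUE)),
    R2   = cor(estimate, reference, use = "pairwise")^2)
}
# e.g. skill_scores(all_merged.1hr$ODIN.2.5f, all_merged.1hr$PM2.5.FDMS)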
# Average to 24-hour intervals
merged.24hr <- timeAverage(all_merged.1hr,avg.time = '24 hour',statistic = 'mean')

png('./raw_odin_fdms.png',width = 1500,height = 1500)
tseries_pm2.5<-ggplot(data = all_merged.1hr, aes(x=date,y=PM2.5.FDMS))+
  geom_line(colour = 'red')+
  ylab(bquote(PM[2.5]~'('*mu~'g'~m^-3~')'))+
  theme_bw()+
  theme(text=element_text(size=30))
tseries_ODIN<-ggplot(data = all_merged.1hr, aes(x=date,y=ODIN_raw))+
  geom_line(colour = 'black')+
  theme_bw()+
  ylab('ODIN(mV)')+
  theme(text=element_text(size=30))
grid.arrange(tseries_pm2.5,tseries_ODIN,ncol = 1)
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_path).
dev.off()
## png 
##   2
grid.arrange(tseries_pm2.5,tseries_ODIN,ncol = 1)
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_path).

png('./corrected_odin_fdms_PM2.5.png',width = 2400,height = 1200)
tseries_pm2.5_1hr<-ggplot(data = all_merged.1hr, aes(x=date))+
  geom_line(aes(y=PM2.5.FDMS,colour = '0'))+
  geom_line(aes(y=ODIN.2.5f,colour = '1'))+
  ylab(bquote(PM[2.5]~'('*mu~'g'~m^-3~')'))+
  theme_bw()+
  ggtitle('a)')+
  scale_colour_manual(values = c('red','black'),name='',labels = c(expression(PM[2.5]),'ODIN'))+
  theme(text=element_text(size=30),legend.position=c(0.5,0.9))
tseries_pm2.5_1dy<-ggplot(data = merged.24hr, aes(x=date))+
  geom_line(aes(y=PM2.5.FDMS,colour = '0'))+
  geom_line(aes(y=ODIN.2.5f,colour = '1'))+
  ylab(bquote(PM[2.5]~'('*mu~'g'~m^-3~')'))+
  theme_bw()+
  ggtitle('b)')+
  scale_colour_manual(values = c('red','black'),name='',labels = c(expression(PM[2.5]),'ODIN'))+
  theme(text=element_text(size=30),legend.position=c(0.5,0.9))
grid.arrange(tseries_pm2.5_1hr,tseries_pm2.5_1dy,ncol = 1)
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_path).
dev.off()
## png 
##   2
grid.arrange(tseries_pm2.5_1hr,tseries_pm2.5_1dy,ncol = 1)
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_path).

png('./corrected_odin_fdms_PM.png',width = 1500,height = 1500)
grid.arrange(tseries_pm2.5_1hr,tseries_pm2.5_1dy,ncol=2)
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_path).
dev.off()
## png 
##   2
grid.arrange(tseries_pm2.5_1hr,tseries_pm2.5_1dy,ncol=2)
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_path).

png('./corrected_odin_scatter_hr.png',width = 1500,height = 1500)
scatter_fitted_PM2.5_hour<-ggplot(data = all_merged.1hr,aes(x=PM2.5.FDMS,y = ODIN.2.5f))+
  geom_point(shape = 1)+
  geom_abline(intercept = 0, slope = 1)+
  theme_bw()+
  ggtitle('c)')+
  theme(text=element_text(size=30))+
  ylab(bquote(ODIN~estimate~'('*mu~'g'~m^-3~')'))+
  xlab(bquote(PM[2.5]~'('*mu~'g'~m^-3~')'))
scatter_fitted_PM2.5_hour
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_point).
dev.off()
## png 
##   2
scatter_fitted_PM2.5_hour
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_point).

png('./corrected_odin_scatter_dy.png',width = 1500,height = 1500)
scatter_fitted_PM2.5_day<-ggplot(data = merged.24hr,aes(x=PM2.5.FDMS,y = ODIN.2.5f))+
  geom_point(shape = 1)+
  geom_abline(intercept = 0, slope = 1)+
  theme_bw()+
  ggtitle('d)')+
  theme(text=element_text(size=30))+
  ylab(bquote(ODIN~estimate~'('*mu~'g'~m^-3~')'))+
  xlab(bquote(PM[2.5]~'('*mu~'g'~m^-3~')'))
scatter_fitted_PM2.5_day
dev.off()
## png 
##   2
scatter_fitted_PM2.5_day

png('./tseries_and_scatter.png',width = 1500, height = 1500)
grid.arrange(tseries_pm2.5_1hr,tseries_pm2.5_1dy,scatter_fitted_PM2.5_hour,scatter_fitted_PM2.5_day,ncol=2)
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_point).
dev.off()
## png 
##   2
grid.arrange(tseries_pm2.5_1hr,tseries_pm2.5_1dy,scatter_fitted_PM2.5_hour,scatter_fitted_PM2.5_day,ncol=2)
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_path).
## Warning in loop_apply(n, do.ply): Removed 22 rows containing missing values
## (geom_point).

# System information
sessionInfo()
## R version 3.2.0 (2015-04-16)
## Platform: x86_64-redhat-linux-gnu (64-bit)
## Running under: Fedora 21 (Twenty One)
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] grid      stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] gridExtra_0.9.1 reshape2_1.4.1  openair_1.5     maps_2.3-9     
## [5] dplyr_0.4.1     lazyeval_0.1.10 ggplot2_1.0.1  
## 
## loaded via a namespace (and not attached):
##  [1] Rcpp_0.11.6         mapdata_2.2-3       RColorBrewer_1.1-2 
##  [4] plyr_1.8.2          tools_3.2.0         digest_0.6.8       
##  [7] evaluate_0.7        gtable_0.1.2        nlme_3.1-120       
## [10] lattice_0.20-31     mgcv_1.8-6          png_0.1-7          
## [13] Matrix_1.2-0        DBI_0.3.1           mapproj_1.2-2      
## [16] yaml_2.1.13         parallel_3.2.0      hexbin_1.27.0      
## [19] proto_0.3-10        stringr_1.0.0       knitr_1.10.5       
## [22] cluster_2.0.1       RgoogleMaps_1.2.0.7 rmarkdown_0.6.1    
## [25] RJSONIO_1.3-0       latticeExtra_0.6-26 magrittr_1.5       
## [28] scales_0.2.4        htmltools_0.2.6     MASS_7.3-40        
## [31] assertthat_0.1      colorspace_1.2-6    labeling_0.3       
## [34] stringi_0.4-1       munsell_0.4.2