Vanderbilt / CU Boulder Microbial Sensor Analysis V2

This R Publication is intended to provide updates on current and ongoing data analysis to the researches involved in the deployment of CU Boulder’s PHBV sensors (gen2) by Vanderbilt researchers. Using this approach provides transparency regarding the data analysis methods used, and improves communication between researchers.

Four loggers were provided with 3 sensors per logger (parallelized PHBV on long stake PCB). Loggers were installed at four sites, referred to here as grass_burn, grass_no_burn, juniper_burn and juniper_no_burn. Note that the logger at the juniper_burn sites intially had an unseated SD card, so data does not start from that logger until later in the experiment.

Data Analysis, using the R programming language:

# Load Libraries ----------------------------------------------------------
library(tidyverse); library(dplyr); library(lubridate);  library(scales)
library(ggplot2); library(cowplot); library(RColorBrewer); library(ggpubr); 
library(DescTools); library(stringr); library(zoo)
library(changepoint)
library(reshape)

Import and process data from parallelized sensor loggers (Gen1)

# Import and Process Text Files -------------------------------------------
setwd(dirname(rstudioapi::getActiveDocumentContext()$path))

dev1 <- read.table('grass site 1 burn september 2025.txt',sep=',',header=TRUE)
names(dev1) <- c('device','time','temp','ch1','ch2','ch3','ch4','ch5','ch6')
dev1$site <- 'grass_burn'

dev3 <- read.table('grass site 3 control no burn september 2025.txt',sep=',',header=TRUE)
names(dev3) <- c('device','time','temp','ch1','ch2','ch3','ch4','ch5','ch6')
dev3$site <- 'grass_no_burn'

dev4 <- read.table('juniper site 4 control no burn september 2025.txt',sep=',',header=TRUE)
names(dev4) <- c('device','time','temp','ch1','ch2','ch3','ch4','ch5','ch6')
dev4$site <- 'juniper_no_burn'

#Adding Device 2, which had a bad SD card installation initially
dev2 <- read.table('juniper site 2 burn september 2025.txt', sep=',',header=TRUE)
names(dev2) <- c('device','time','temp','ch1','ch2','ch3','ch4','ch5','ch6')
dev2$site <- 'juniper_burn'

####Combine the data####
wideData <- rbind(dev1,dev3,dev4,dev2)
rm(dev3,dev4)

#Format dates
wideData$time <- mdy_hms(wideData$time)

#Because the wakeup schedule is set by the timer, not the RTC, we can interpolate
#timestamps, where row = time order:
#Interpolate corrupted timestamps (NA after parsing) within each device
wideData <- wideData %>%
  group_by(device) %>%
  mutate(time = as.POSIXct(na.approx(as.numeric(time), na.rm = FALSE),
                            origin = "1970-01-01", tz = "UTC")) %>%
  ungroup() %>%
  as.data.frame()

#Remove early dates to only include good sensor port data
wideData <- subset(wideData, time > as.POSIXct("2024-06-12 15:00:00"))
#Filter out pre-SD datapoint from Device 2
preData <- subset(wideData, device=="Device 2")
preData <- subset(preData, time < "2024-10-01")
wideData <- setdiff(wideData,preData)

####Convert to long format####
allData <- melt(wideData,id=c('site','device','time'))

#Remove logger temperature data
tempData <- subset(allData, variable == 'temp')
allData <- setdiff(allData,tempData)

####Clean up the data a bit####
names(allData) <- c('site','sensor','time','variable','reading')
allData$sensor <- paste(allData$sensor,allData$variable)
allData <- select(allData, -'variable')

####Normalize the data by channel####
#Get an averaged initial resistance
#r_0 will be the mean resistance value between these dateTimes:
rStart <- as.POSIXct("2024-06-13 00:00:00")

r_0 <- allData %>%
  group_by(sensor) %>%
  arrange(time) %>%
  filter(time < pmax(min(time), rStart) + hours(12)) %>%
  dplyr::summarise(r0 = mean(reading))

allData <- merge(allData, r_0, by="sensor")

allData$r_norm <- allData$reading / allData$r0

####Include only channels that were hooked up to sensors####
allData <- subset(allData, sensor %in% 
                    c('Device 1 ch1', 'Device 1 ch2', 'Device 1 ch3',
                      'Device 2 ch1', 'Device 2 ch2', 'Device 2 ch3',
                      'Device 3 ch1', 'Device 3 ch2', 'Device 3 ch3',
                      'Device 4 ch1', 'Device 4 ch2', 'Device 4 ch3'))

Plot the raw (normalized) data

####Plot raw sensor data in a facet grid####
rawplot <- ggplot(allData) + theme_bw() +
  geom_point(aes(x=time,y=r_norm,col=site),shape=21,size=1.5) +
  xlab("") + ylab(expression("Normalized Resistance (R/R"[o]*")")) +
  scale_fill_brewer(palette = "Set1") + scale_color_brewer(palette = "Set1")

rawplot + facet_wrap(~sensor) + ylim(0.5,1.5) + ggtitle("Raw Data (set y axis)")

rawplot + facet_wrap(~sensor, scales='free_y') + ggtitle("Raw Data (free y axis)")

Simplify data by removing unused sensor channels and failed sensor

Based on the above plot, it looks like some channels were unused in this experiment. We’ll use brute force to only retain sensors that were on active channels.

Additionally, it looks like there’s a sensor sudden failure August 9th for Device 4, channel 2. We’ll remove any data past the failure for that sensor channel.

####Remove very high normalized resistances####
#This removes the data from Device 4 ch2 due to a failed sensor,
#and the data from Device 4 ch 3 later due to a failed sensor.
allData <- subset(allData, r_norm <= 50)

#Replot
ggplot(allData) + theme_bw() +
  geom_point(aes(x=time,y=r_norm,col=site),shape=21,size=1.5) +
  facet_wrap(~sensor, scale='free_y') + ylim(0.5,3) +
  xlab("") + ylab(expression("Normalized Resistance (R/R"[o]*")")) +
  scale_fill_brewer(palette = "Set1") + scale_color_brewer(palette = "Set1")

#Define data types
allData$time <- as.POSIXct(allData$time)
allData$site <- as.factor(allData$site); allData$sensor <- as.factor(allData$sensor)

Look at overlaid sensor signals

Based on raw sensor signals, do we see significant differences in sensor response as a function of the field in which the sensor is deployed?

ggplot(allData) + theme_bw() +
  geom_point(aes(x=time,y=r_norm,fill=site),shape=21,col="black") +
  scale_color_brewer(palette = "Set1") + scale_fill_brewer(palette = "Set1") +
  xlab("") + ylab(expression("Normalized Resistance (R/R"[o]*")"))

We often see larger sensor signal variability in healthy soil. Here, we see high variability between sensors in the juniper site, medium variability in the grass_burn site, and low variability in the grass_no_burn site. We also see generally higher normalized resistance in the juniper site, but across all of the sites, we see a negative slope dominating the response. We most often see this negative slope when the biodegradable polymer is not being colonized by bacteria. We see some interesting moments of events where the sensor sleet as a whole responds. We should examine what might be responsible for these impacts - for example, large rainfall events that impacted all three sites.

Look at averaged sensor signals

We often average sensor signal by treatment, examining these data by looking at both the mean and standard deviation of the sensor signal within each treatment (or field site).

####Let's average sensor signals by site to look for differences####
aveData <- allData %>%
  mutate(hourTime = cut(time, breaks='2 hours')) %>% 
  group_by(site,hourTime) %>%
  dplyr::summarise(stdev = sd(r_norm), r_norm = mean(r_norm, na.rm=T))

aveData$hourTime <- as.POSIXct(aveData$hourTime)
aveData$site <- as.factor(aveData$site)

sitePlot <- ggplot(aveData) + theme_bw() +
  geom_point(aes(x=hourTime,y=r_norm,fill=site),shape=21,col="black") +
  scale_color_brewer(palette = "Set1") + scale_fill_brewer(palette = "Set1") +
  xlab("") + ylab(expression("Normalized Resistance (R/R"[o]*")"))

devPlot <- sitePlot + 
  geom_ribbon(aes(x=hourTime,ymax=r_norm+stdev,ymin=r_norm-stdev,
                                      col=site,fill=site), alpha=0.3)

show(sitePlot); show(devPlot)

### Examine sensor response by soil type and treatment

#Embarrassingly brute force factor assignment
aveData <- data.frame(aveData)
aveData$soil <- "NA"; aveData$treatment <- "NA"
aveData[aveData$site=="grass_burn",]$soil <- "grass"
aveData[aveData$site=="grass_no_burn",]$soil <- "grass"
aveData[aveData$site=="grass_burn",]$treatment <- "burn"
aveData[aveData$site=="grass_no_burn",]$treatment <- "no_burn"
aveData[aveData$site=="juniper_burn",]$soil <- "juniper"
aveData[aveData$site=="juniper_no_burn",]$soil <- "juniper"
aveData[aveData$site=="juniper_burn",]$treatment <- "burn"
aveData[aveData$site=="juniper_no_burn",]$treatment <- "no_burn"

devPlot <- ggplot(aveData) + theme_bw() +
  geom_point(aes(x=hourTime,y=r_norm,fill=site),shape=21,col="black") +
  geom_ribbon(aes(x=hourTime,ymax=r_norm+stdev,ymin=r_norm-stdev,
                                      col=site,fill=site), alpha=0.3) +
  scale_color_brewer(palette = "Set1") + scale_fill_brewer(palette = "Set1") +
  xlab("") + ylab(expression("Normalized Resistance (R/R"[o]*")"))
  

devPlot + facet_wrap(~site)

devPlot + facet_wrap(~soil, scales="free_y")

These initital results seem to suggest higher sensor response variability in the burn sites than in the non-burn sites, along with a much more clear sensor response in the juniper plots than in the grass plots.