Calidad de aire

This exercise uses data from two Purple Air monitors and the SENAMHI ATE (CRS) station. The data spans August 27, 2023, to January 31, 2024. For the analysis, we will select data downloaded with the ALT cf=3 setting, as Purple Air recommends this as the best option.

pm25_alt_ATE1: Hourly average of the two ATE10 sensors downloaded from Purpleair Data Download.
pm25_alt_a_ATE1: ATE10 monitor sensor A
pm25_alt_b_ATE1: ATE10 monitor sensor B
pm25_alt__ATE9: Hourly average of the two ATE9 sensors downloaded from Purpleair Data Download.
pm25_alt_a_ATE9: ATE9 monitor sensor A
pm25_alt_b_ATE9: ATE9 monitor sensor B
ATE_SENAMHI_CRS: Hourly average of SENAMHI monitor in Ate district (CRS monitor).
Fecha: date and time

library(readxl)
data <- read_excel("ATE-Purpleair.xlsx")

## New names:
## • `pm25_atm_b_ATE9` -> `pm25_atm_b_ATE9...11`
## • `pm25_atm_b_ATE9` -> `pm25_atm_b_ATE9...22`

View(data)

Missing data

The data from the monitors is not complete. About 8% of our monitors’ data and 12% of SENAMHIs data are missing. The database shows that the missing data coincides with the data from SENAMHI, so it may be a power supply problem.

porcentaje_datos_faltantes <- colSums((is.na(data[, c("pm25_alt__ATE9", "pm25_alt_ATE1", "ATE_SENAMHI_CRS")])) / nrow(data) * 100)
print(porcentaje_datos_faltantes)

##  pm25_alt__ATE9   pm25_alt_ATE1 ATE_SENAMHI_CRS 
##        8.280678        8.413701       12.371134

Pre-analysis of sensors

Each Purple Air has two sensors, and according to the literature, for the purple air monitors to be considered in the analysis, the sensor readings had to coincide in at least 30% of their measurements with a threshold precision of ≤ 0.130. This is done with the formula: [abs (A-B)/(A +B)] Link: https://www.sciencedirect.com/science/article/abs/pii/S135223102100251X

Preliminary results indicate 100% agreement of the Purple Air monitor sensors ATE10 with the validation criteria set with a mean of 0.03 (SD = 0.02) and for the ATE9 a mean of 0.06 (SD = 0.03).

#ATE10 
data$precision_ATE1 <- abs(data$"pm25_alt_a_ATE1" - data$"pm25_alt_b_ATE1") / (data$"pm25_alt_a_ATE1" + data$"pm25_alt_b_ATE1")
num_precision_mayor_013_ATE1 <- sum(data$precision_ATE1 > 0.130, na.rm = TRUE)
print(num_precision_mayor_013_ATE1)

## [1] 0

#ATE10 - mean 
media_precision_ATE1 <- mean(data$precision_ATE1, na.rm = TRUE)
desviacion_std_precision_ATE1 <- sd(data$precision_ATE1, na.rm = TRUE)
print(media_precision_ATE1)

## [1] 0.02550067

print(desviacion_std_precision_ATE1)

## [1] 0.01622399

#ATE9 
data$precision_ATE9 <- abs(data$"pm25_alt_a_ATE9" - data$"pm25_alt_b_ATE9") / (data$"pm25_alt_a_ATE9" + data$"pm25_alt_b_ATE9")
num_precision_mayor_013_ATE9 <- sum(data$precision_ATE9 > 0.130, na.rm = TRUE)
print(num_precision_mayor_013_ATE9)

## [1] 0

#ATE9 - mean 
media_precision_ATE9 <- mean(data$precision_ATE9, na.rm = TRUE)
desviacion_std_precision_ATE9 <- sd(data$precision_ATE9, na.rm = TRUE)
print(media_precision_ATE9)

## [1] 0.05693449

print(desviacion_std_precision_ATE9)

## [1] 0.02661307

Descriptive analysis

The graphs were plotted over time and averages were taken, showing differences and that the senamhi values are higher than the purple air values.

mean_pm25_alt_ATE9 <- mean(data$pm25_alt__ATE9, na.rm = TRUE)
mean_pm25_alt_ATE1 <- mean(data$pm25_alt_ATE1, na.rm = TRUE)
mean_ATE_SENAMHI_CRS <- mean(data$ATE_SENAMHI_CRS, na.rm = TRUE)

print (mean_pm25_alt_ATE9)

## [1] 29.46885

print (mean_pm25_alt_ATE1)

## [1] 30.42251

print(mean_ATE_SENAMHI_CRS)

## [1] 42.41088

# Mean, max, count: 

average_pm25_alt_ATE1 <- mean(data$pm25_alt_ATE1, na.rm = TRUE)
max_pm25_alt_ATE1 <- max(data$pm25_alt_ATE1, na.rm = TRUE)
count_pm25_alt_ATE1 <- sum(!is.na(data$pm25_alt_ATE1))

average_pm25_alt__ATE9 <- mean(data$pm25_alt__ATE9, na.rm = TRUE)
max_pm25_alt__ATE9 <- max(data$pm25_alt__ATE9, na.rm = TRUE)
count_pm25_alt__ATE9 <- sum(!is.na(data$pm25_alt__ATE9))

average_ATE_SENAMHI_CRS <- mean(data$ATE_SENAMHI_CRS, na.rm = TRUE)
max_ATE_SENAMHI_CRS <- max(data$ATE_SENAMHI_CRS, na.rm = TRUE)
count_ATE_SENAMHI_CRS <- sum(!is.na(data$ATE_SENAMHI_CRS))

statistics_table <- data.frame(
  Variable = c("pm25_alt_ATE1", "pm25_alt__ATE9", "ATE_SENAMHI_CRS"),
  Average = c(average_pm25_alt_ATE1, average_pm25_alt__ATE9, average_ATE_SENAMHI_CRS),
  Maximum = c(max_pm25_alt_ATE1, max_pm25_alt__ATE9, max_ATE_SENAMHI_CRS),
  Count = c(count_pm25_alt_ATE1, count_pm25_alt__ATE9, count_ATE_SENAMHI_CRS)
)

print(statistics_table)

##          Variable  Average Maximum Count
## 1   pm25_alt_ATE1 30.42251   204.0  2754
## 2  pm25_alt__ATE9 29.46885   199.8  2758
## 3 ATE_SENAMHI_CRS 42.41088   184.1  2635

# ATE9 vs Senamhi 
library(ggplot2)
# Crear un gráfico de puntos (scatter plot) de ATE_SENAMHI_CRS a lo largo del tiempo
ggplot(data, aes(x = Fecha, y = ATE_SENAMHI_CRS)) +
  geom_point(aes(color = "ATE_SENAMHI_CRS"), size = 0.2) +
  geom_line(aes(color = "ATE_SENAMHI_CRS"), size = 0.1) +
  geom_point(aes(y = pm25_alt__ATE9, color = "pm25_alt__ATE9"), size = 0.2) +
  geom_line(aes(y = pm25_alt__ATE9, color = "pm25_alt__ATE9"), size =0.1) +
  ggtitle("Time Series of PM2.5 Measurements") +
  xlab("Date") +
  ylab("Concentration") +
  labs(color = "Legend") +
  scale_color_manual(values = c("ATE_SENAMHI_CRS" = "red", "pm25_alt__ATE9" = "blue")) +
  theme_minimal()

## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.

## Warning: Removed 372 rows containing missing values (`geom_point()`).

## Warning: Removed 249 rows containing missing values (`geom_point()`).

# ATE10 vs Senamhi 
ggplot(data, aes(x = Fecha, y = ATE_SENAMHI_CRS)) +
  geom_point(aes(color = "ATE_SENAMHI_CRS"), size = 0.2) +
  geom_line(aes(color = "ATE_SENAMHI_CRS"), linewidth = 0.1) +
  geom_point(aes(y = pm25_alt_ATE1, color = "pm25_alt_ATE1"), size = 0.2) +
  geom_line(aes(y = pm25_alt_ATE1, color = "pm25_alt_ATE1"), linewidth = 0.1) +
  ggtitle("Time Series of PM2.5 Measurements") +
  xlab("Date") +
  ylab("Concentration") +
  labs(color = "Legend") +
  scale_color_manual(values = c("ATE_SENAMHI_CRS" = "red", "pm25_alt_ATE1" = "green")) +
  theme_minimal()

## Warning: Removed 372 rows containing missing values (`geom_point()`).

## Warning: Removed 253 rows containing missing values (`geom_point()`).

Calibration factors

“The two data sets (the low-cost sensors versus the reference monitor’s data) were plotted against each other in a scatter plot. The slope and offset of the linear least squares fit line of the data was then calculated, and these parameters were used to calculate the new gain and offset calibration factors” Link: https://www.mdpi.com/1660-4601/19/3/1647

library(ggplot2)
ggplot(data, aes(x = pm25_alt__ATE9, y = pm25_alt_ATE1)) +
  geom_point(color = "blue") +
  ggtitle("Scatter Plot of PM2.5 Measurements Purple Air monitors ATE9 vs Senamhi") +
  xlab("pm2.5_alt_a_ATE9") +
  ylab("pm2.5_alt_a_ATE10")

## Warning: Removed 253 rows containing missing values (`geom_point()`).

# Realizar regresión lineal
model1 <- lm(pm25_alt_ATE1 ~ pm25_alt__ATE9, data = data)

# Resumen del modelo para revisión general
summary_model1 <- summary(model1)

# Extraer la pendiente directamente desde los coeficientes del modelo
slope <- coef(model1)["pm2.5_alt_a_ATE9"]

# R^2 desde el resumen del modelo
r_squared <- summary_model1$r.squared

# Error estándar de la estimación desde el resumen del modelo
std_error <- summary_model1$sigma

# Imprimir los resultados
print(paste("Slope:", slope))

## [1] "Slope: NA"

print(paste("R-squared:", r_squared))

## [1] "R-squared: 0.997349887091694"

print(paste("Standard Error of the Estimate:", std_error))

## [1] "Standard Error of the Estimate: 0.839438270485047"

Scatter Plot of PM2.5 ATE9 Measurements Purple Air monitors vs Senamhi

These results show that there is a weak positive linear correlation between pm25_alt__ATE9 and ATE_SENAMHI_CRS, with a slope of 0.313 and an R^2 of 0.141. This indicates that for each unit increase in pm25_alt__ATE9, ATE_SENAMHI_CRS increases by an average of 0.313 units, but only 14.1% of the variability in ATE_SENAMHI_CRS is explained by pm25_alt__ATE9.

ggplot(data, aes(x = ATE_SENAMHI_CRS, y = pm25_alt__ATE9)) +
  geom_point(color = "blue") +
  ggtitle("Scatter Plot of PM2.5 Measurements Purple Air monitors ATE9 vs Senamhi") +
  xlab("ATE_SENAMHI_(CRS)") +
  ylab("pm2.5_alt_a_ATE9")

## Warning: Removed 390 rows containing missing values (`geom_point()`).

# Realizar regresión lineal
model2 <- lm(pm25_alt__ATE9 ~ ATE_SENAMHI_CRS, data = data)

# Resumen del modelo para revisión general
summary_model2 <- summary(model2)

# Extraer la pendiente directamente desde los coeficientes del modelo
slope <- coef(model2)["ATE_SENAMHI_CRS"]

# R^2 desde el resumen del modelo
r_squared <- summary_model2$r.squared

# Error estándar de la estimación desde el resumen del modelo
std_error <- summary_model2$sigma

# Imprimir los resultados
print(paste("Slope:", slope))

## [1] "Slope: 0.313338159025196"

print(paste("R-squared:", r_squared))

## [1] "R-squared: 0.14122455697681"

print(paste("Standard Error of the Estimate:", std_error))

## [1] "Standard Error of the Estimate: 14.6566211470367"

Scatter Plot of PM2.5 ATE10 Measurements Purple Air monitors vs Senamhi

These results show that there is a weak positive linear correlation between pm25_alt_ATE10 and ATE_SENAMHI_CRS, with a slope of 0.322 and an R^2 of 0.141. This indicates that for each unit increase in pm25_alt_ATE10, ATE_SENAMHI_CRS increases by an average of 0.322 units, but only 14.1% of the variability in ATE_SENAMHI_CRS is explained by pm25_alt__ATE10.

ggplot(data, aes(x = ATE_SENAMHI_CRS, y = pm25_alt_ATE1)) +
  geom_point(color = "blue") +
  ggtitle("Scatter Plot of PM2.5 Measurements Purple Air monitors ATE10 vs Senamhi") +
  xlab("ATE_SENAMHI_(CRS)") +
  ylab("pm2.5_alt_a_ATE10")

## Warning: Removed 393 rows containing missing values (`geom_point()`).

# Realizar regresión lineal
model3 <- lm(pm25_alt_ATE1 ~ ATE_SENAMHI_CRS, data = data)

# Resumen del modelo para revisión general
summary_model3 <- summary(model3)

# Extraer la pendiente directamente desde los coeficientes del modelo
slope <- coef(model3)["ATE_SENAMHI_CRS"]

# R^2 desde el resumen del modelo
r_squared <- summary_model3$r.squared

# Error estándar de la estimación desde el resumen del modelo
std_error <- summary_model3$sigma

# Imprimir los resultados
print(paste("Slope:", slope))

## [1] "Slope: 0.322879813377353"

print(paste("R-squared:", r_squared))

## [1] "R-squared: 0.141446121754965"

print(paste("Standard Error of the Estimate:", std_error))

## [1] "Standard Error of the Estimate: 15.0940750851515"

Class III Acceptance Criteria methodology

link: https://www.4cleanair.org/wp-content/uploads/Documents/Purple_Air_Calibration_3_5_18.pdf

# New variable 

data$average_pm25 <- rowMeans(data[, c("pm25_alt_ATE1", "pm25_alt__ATE9")], na.rm = TRUE)
average_of_average_pm25 <- mean(data$average_pm25, na.rm = TRUE)
max_average_pm25 <- max(data$average_pm25, na.rm = TRUE)
count_average_pm25 <- sum(!is.na(data$average_pm25))

statistics_table2 <- data.frame(
  Variable = c("average_pm25", "ATE_SENAMHI_CRS"),
  Average = c(average_of_average_pm25, average_ATE_SENAMHI_CRS), 
  Maximum = c(max_average_pm25, max_ATE_SENAMHI_CRS),
  Count = c(count_average_pm25, count_ATE_SENAMHI_CRS)
)

print(statistics_table2)

##          Variable  Average Maximum Count
## 1    average_pm25 29.93544   201.9  2758
## 2 ATE_SENAMHI_CRS 42.41088   184.1  2635

Plot 24 hr avg continuous vs FRM and add the regression line.The FRM data in this case is SENAMHI data

# Variable with 24 avg daily_average_pm25 PURPLEAIR 

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Primero, extraemos la parte de la fecha de la columna datetime
data$date <- as.Date(data$Fecha)

data_daily <- data %>%
  group_by(date) %>%
  summarise(
    count_hours = n(),
    average_pm25 = mean(average_pm25, na.rm = TRUE),
    avg_ATE_SENAMHI_CRS = mean(ATE_SENAMHI_CRS, na.rm = TRUE)
  ) %>%
  filter(count_hours > 14)


# Primero, extraemos la parte de la fecha de la columna datetime
data$date <- as.Date(data$Fecha)

# Luego, agrupamos por esta nueva columna de fecha, calculamos el promedio de PM2.5,
# y filtramos solo aquellos días que tienen más de 14 horas de datos
data_daily_average <- data %>%
  group_by(date) %>%
  summarise(
    hours_recorded = n(),
    daily_average_pm25 = mean(average_pm25, na.rm = TRUE)
  ) %>%
  filter(hours_recorded > 14) %>%
  select(-hours_recorded) # Opcionalmente, quitar la columna hours_recorded si no la necesitas

# Visualizar el resultado
print(data_daily_average)

## # A tibble: 125 × 2
##    date       daily_average_pm25
##    <date>                  <dbl>
##  1 2023-09-28               34.1
##  2 2023-09-29               28.6
##  3 2023-09-30               36.2
##  4 2023-10-01               36.0
##  5 2023-10-02               38.1
##  6 2023-10-03               37.5
##  7 2023-10-04               52.2
##  8 2023-10-05               38.1
##  9 2023-10-06               30.9
## 10 2023-10-07               37.4
## # … with 115 more rows

# Variable with 24 avg SENAMHI 

data_daily_average_SENAMHI <- data %>%
  group_by(date) %>%
  summarise(
    hours_recorded = n(),
    daily_average_SENAMHI_CRS = mean(ATE_SENAMHI_CRS, na.rm = TRUE)
  ) %>%
  filter(hours_recorded > 14) %>%
  select(-hours_recorded)
print(data_daily_average_SENAMHI)

## # A tibble: 125 × 2
##    date       daily_average_SENAMHI_CRS
##    <date>                         <dbl>
##  1 2023-09-28                      47.7
##  2 2023-09-29                      30.7
##  3 2023-09-30                      39.0
##  4 2023-10-01                      44.7
##  5 2023-10-02                      43.2
##  6 2023-10-03                      47.7
##  7 2023-10-04                      63.9
##  8 2023-10-05                      45.0
##  9 2023-10-06                      43.2
## 10 2023-10-07                      46.1
## # … with 115 more rows

# Missing data 19% 

# De 156 días, se tiene 125 
total_dias <- 156
dias_con_datos <- 125
dias_sin_datos <- total_dias - dias_con_datos
porcentaje_perdida <- (dias_sin_datos / total_dias) * 100
print(porcentaje_perdida)

## [1] 19.87179

Scatter plot

ggplot(data_daily, aes(x = average_pm25, y = avg_ATE_SENAMHI_CRS)) +
  geom_point(color = "blue") +
  ggtitle("Scatter Plot of PM2.5 Measurements Purple Air monitors ATE9 vs Senamhi") +
  xlab("pm2.5 avg ") +
  ylab("ATE_SENAMHI_(CRS)")

## Warning: Removed 8 rows containing missing values (`geom_point()`).