This exercise uses data from two Purple Air monitors and the SENAMHI ATE (CRS) station. The data spans August 27, 2023, to January 31, 2024. For the analysis, we will select data downloaded with the ALT cf=3 setting, as Purple Air recommends this as the best option.
library(readxl)
data <- read_excel("ATE-Purpleair.xlsx")
## New names:
## • `pm25_atm_b_ATE9` -> `pm25_atm_b_ATE9...11`
## • `pm25_atm_b_ATE9` -> `pm25_atm_b_ATE9...22`
View(data)
The data from the monitors is not complete. About 8% of our monitors’ data and 12% of SENAMHIs data are missing. The database shows that the missing data coincides with the data from SENAMHI, so it may be a power supply problem.
porcentaje_datos_faltantes <- colSums((is.na(data[, c("pm25_alt__ATE9", "pm25_alt_ATE1", "ATE_SENAMHI_CRS")])) / nrow(data) * 100)
print(porcentaje_datos_faltantes)
## pm25_alt__ATE9 pm25_alt_ATE1 ATE_SENAMHI_CRS
## 8.280678 8.413701 12.371134
Each Purple Air has two sensors, and according to the literature, for the purple air monitors to be considered in the analysis, the sensor readings had to coincide in at least 30% of their measurements with a threshold precision of ≤ 0.130. This is done with the formula: [abs (A-B)/(A +B)] Link: https://www.sciencedirect.com/science/article/abs/pii/S135223102100251X
Preliminary results indicate 100% agreement of the Purple Air monitor sensors ATE10 with the validation criteria set with a mean of 0.03 (SD = 0.02) and for the ATE9 a mean of 0.06 (SD = 0.03).
#ATE10
data$precision_ATE1 <- abs(data$"pm25_alt_a_ATE1" - data$"pm25_alt_b_ATE1") / (data$"pm25_alt_a_ATE1" + data$"pm25_alt_b_ATE1")
num_precision_mayor_013_ATE1 <- sum(data$precision_ATE1 > 0.130, na.rm = TRUE)
print(num_precision_mayor_013_ATE1)
## [1] 0
#ATE10 - mean
media_precision_ATE1 <- mean(data$precision_ATE1, na.rm = TRUE)
desviacion_std_precision_ATE1 <- sd(data$precision_ATE1, na.rm = TRUE)
print(media_precision_ATE1)
## [1] 0.02550067
print(desviacion_std_precision_ATE1)
## [1] 0.01622399
#ATE9
data$precision_ATE9 <- abs(data$"pm25_alt_a_ATE9" - data$"pm25_alt_b_ATE9") / (data$"pm25_alt_a_ATE9" + data$"pm25_alt_b_ATE9")
num_precision_mayor_013_ATE9 <- sum(data$precision_ATE9 > 0.130, na.rm = TRUE)
print(num_precision_mayor_013_ATE9)
## [1] 0
#ATE9 - mean
media_precision_ATE9 <- mean(data$precision_ATE9, na.rm = TRUE)
desviacion_std_precision_ATE9 <- sd(data$precision_ATE9, na.rm = TRUE)
print(media_precision_ATE9)
## [1] 0.05693449
print(desviacion_std_precision_ATE9)
## [1] 0.02661307
The graphs were plotted over time and averages were taken, showing differences and that the senamhi values are higher than the purple air values.
mean_pm25_alt_ATE9 <- mean(data$pm25_alt__ATE9, na.rm = TRUE)
mean_pm25_alt_ATE1 <- mean(data$pm25_alt_ATE1, na.rm = TRUE)
mean_ATE_SENAMHI_CRS <- mean(data$ATE_SENAMHI_CRS, na.rm = TRUE)
print (mean_pm25_alt_ATE9)
## [1] 29.46885
print (mean_pm25_alt_ATE1)
## [1] 30.42251
print(mean_ATE_SENAMHI_CRS)
## [1] 42.41088
# Mean, max, count:
average_pm25_alt_ATE1 <- mean(data$pm25_alt_ATE1, na.rm = TRUE)
max_pm25_alt_ATE1 <- max(data$pm25_alt_ATE1, na.rm = TRUE)
count_pm25_alt_ATE1 <- sum(!is.na(data$pm25_alt_ATE1))
average_pm25_alt__ATE9 <- mean(data$pm25_alt__ATE9, na.rm = TRUE)
max_pm25_alt__ATE9 <- max(data$pm25_alt__ATE9, na.rm = TRUE)
count_pm25_alt__ATE9 <- sum(!is.na(data$pm25_alt__ATE9))
average_ATE_SENAMHI_CRS <- mean(data$ATE_SENAMHI_CRS, na.rm = TRUE)
max_ATE_SENAMHI_CRS <- max(data$ATE_SENAMHI_CRS, na.rm = TRUE)
count_ATE_SENAMHI_CRS <- sum(!is.na(data$ATE_SENAMHI_CRS))
statistics_table <- data.frame(
Variable = c("pm25_alt_ATE1", "pm25_alt__ATE9", "ATE_SENAMHI_CRS"),
Average = c(average_pm25_alt_ATE1, average_pm25_alt__ATE9, average_ATE_SENAMHI_CRS),
Maximum = c(max_pm25_alt_ATE1, max_pm25_alt__ATE9, max_ATE_SENAMHI_CRS),
Count = c(count_pm25_alt_ATE1, count_pm25_alt__ATE9, count_ATE_SENAMHI_CRS)
)
print(statistics_table)
## Variable Average Maximum Count
## 1 pm25_alt_ATE1 30.42251 204.0 2754
## 2 pm25_alt__ATE9 29.46885 199.8 2758
## 3 ATE_SENAMHI_CRS 42.41088 184.1 2635
# ATE9 vs Senamhi
library(ggplot2)
# Crear un gráfico de puntos (scatter plot) de ATE_SENAMHI_CRS a lo largo del tiempo
ggplot(data, aes(x = Fecha, y = ATE_SENAMHI_CRS)) +
geom_point(aes(color = "ATE_SENAMHI_CRS"), size = 0.2) +
geom_line(aes(color = "ATE_SENAMHI_CRS"), size = 0.1) +
geom_point(aes(y = pm25_alt__ATE9, color = "pm25_alt__ATE9"), size = 0.2) +
geom_line(aes(y = pm25_alt__ATE9, color = "pm25_alt__ATE9"), size =0.1) +
ggtitle("Time Series of PM2.5 Measurements") +
xlab("Date") +
ylab("Concentration") +
labs(color = "Legend") +
scale_color_manual(values = c("ATE_SENAMHI_CRS" = "red", "pm25_alt__ATE9" = "blue")) +
theme_minimal()
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## Warning: Removed 372 rows containing missing values (`geom_point()`).
## Warning: Removed 249 rows containing missing values (`geom_point()`).
# ATE10 vs Senamhi
ggplot(data, aes(x = Fecha, y = ATE_SENAMHI_CRS)) +
geom_point(aes(color = "ATE_SENAMHI_CRS"), size = 0.2) +
geom_line(aes(color = "ATE_SENAMHI_CRS"), linewidth = 0.1) +
geom_point(aes(y = pm25_alt_ATE1, color = "pm25_alt_ATE1"), size = 0.2) +
geom_line(aes(y = pm25_alt_ATE1, color = "pm25_alt_ATE1"), linewidth = 0.1) +
ggtitle("Time Series of PM2.5 Measurements") +
xlab("Date") +
ylab("Concentration") +
labs(color = "Legend") +
scale_color_manual(values = c("ATE_SENAMHI_CRS" = "red", "pm25_alt_ATE1" = "green")) +
theme_minimal()
## Warning: Removed 372 rows containing missing values (`geom_point()`).
## Warning: Removed 253 rows containing missing values (`geom_point()`).
“The two data sets (the low-cost sensors versus the reference monitor’s data) were plotted against each other in a scatter plot. The slope and offset of the linear least squares fit line of the data was then calculated, and these parameters were used to calculate the new gain and offset calibration factors” Link: https://www.mdpi.com/1660-4601/19/3/1647
library(ggplot2)
ggplot(data, aes(x = pm25_alt__ATE9, y = pm25_alt_ATE1)) +
geom_point(color = "blue") +
ggtitle("Scatter Plot of PM2.5 Measurements Purple Air monitors ATE9 vs Senamhi") +
xlab("pm2.5_alt_a_ATE9") +
ylab("pm2.5_alt_a_ATE10")
## Warning: Removed 253 rows containing missing values (`geom_point()`).
# Realizar regresión lineal
model1 <- lm(pm25_alt_ATE1 ~ pm25_alt__ATE9, data = data)
# Resumen del modelo para revisión general
summary_model1 <- summary(model1)
# Extraer la pendiente directamente desde los coeficientes del modelo
slope <- coef(model1)["pm2.5_alt_a_ATE9"]
# R^2 desde el resumen del modelo
r_squared <- summary_model1$r.squared
# Error estándar de la estimación desde el resumen del modelo
std_error <- summary_model1$sigma
# Imprimir los resultados
print(paste("Slope:", slope))
## [1] "Slope: NA"
print(paste("R-squared:", r_squared))
## [1] "R-squared: 0.997349887091694"
print(paste("Standard Error of the Estimate:", std_error))
## [1] "Standard Error of the Estimate: 0.839438270485047"
These results show that there is a weak positive linear correlation between pm25_alt__ATE9 and ATE_SENAMHI_CRS, with a slope of 0.313 and an R^2 of 0.141. This indicates that for each unit increase in pm25_alt__ATE9, ATE_SENAMHI_CRS increases by an average of 0.313 units, but only 14.1% of the variability in ATE_SENAMHI_CRS is explained by pm25_alt__ATE9.
ggplot(data, aes(x = ATE_SENAMHI_CRS, y = pm25_alt__ATE9)) +
geom_point(color = "blue") +
ggtitle("Scatter Plot of PM2.5 Measurements Purple Air monitors ATE9 vs Senamhi") +
xlab("ATE_SENAMHI_(CRS)") +
ylab("pm2.5_alt_a_ATE9")
## Warning: Removed 390 rows containing missing values (`geom_point()`).
# Realizar regresión lineal
model2 <- lm(pm25_alt__ATE9 ~ ATE_SENAMHI_CRS, data = data)
# Resumen del modelo para revisión general
summary_model2 <- summary(model2)
# Extraer la pendiente directamente desde los coeficientes del modelo
slope <- coef(model2)["ATE_SENAMHI_CRS"]
# R^2 desde el resumen del modelo
r_squared <- summary_model2$r.squared
# Error estándar de la estimación desde el resumen del modelo
std_error <- summary_model2$sigma
# Imprimir los resultados
print(paste("Slope:", slope))
## [1] "Slope: 0.313338159025196"
print(paste("R-squared:", r_squared))
## [1] "R-squared: 0.14122455697681"
print(paste("Standard Error of the Estimate:", std_error))
## [1] "Standard Error of the Estimate: 14.6566211470367"
These results show that there is a weak positive linear correlation between pm25_alt_ATE10 and ATE_SENAMHI_CRS, with a slope of 0.322 and an R^2 of 0.141. This indicates that for each unit increase in pm25_alt_ATE10, ATE_SENAMHI_CRS increases by an average of 0.322 units, but only 14.1% of the variability in ATE_SENAMHI_CRS is explained by pm25_alt__ATE10.
ggplot(data, aes(x = ATE_SENAMHI_CRS, y = pm25_alt_ATE1)) +
geom_point(color = "blue") +
ggtitle("Scatter Plot of PM2.5 Measurements Purple Air monitors ATE10 vs Senamhi") +
xlab("ATE_SENAMHI_(CRS)") +
ylab("pm2.5_alt_a_ATE10")
## Warning: Removed 393 rows containing missing values (`geom_point()`).
# Realizar regresión lineal
model3 <- lm(pm25_alt_ATE1 ~ ATE_SENAMHI_CRS, data = data)
# Resumen del modelo para revisión general
summary_model3 <- summary(model3)
# Extraer la pendiente directamente desde los coeficientes del modelo
slope <- coef(model3)["ATE_SENAMHI_CRS"]
# R^2 desde el resumen del modelo
r_squared <- summary_model3$r.squared
# Error estándar de la estimación desde el resumen del modelo
std_error <- summary_model3$sigma
# Imprimir los resultados
print(paste("Slope:", slope))
## [1] "Slope: 0.322879813377353"
print(paste("R-squared:", r_squared))
## [1] "R-squared: 0.141446121754965"
print(paste("Standard Error of the Estimate:", std_error))
## [1] "Standard Error of the Estimate: 15.0940750851515"
link: https://www.4cleanair.org/wp-content/uploads/Documents/Purple_Air_Calibration_3_5_18.pdf
# New variable
data$average_pm25 <- rowMeans(data[, c("pm25_alt_ATE1", "pm25_alt__ATE9")], na.rm = TRUE)
average_of_average_pm25 <- mean(data$average_pm25, na.rm = TRUE)
max_average_pm25 <- max(data$average_pm25, na.rm = TRUE)
count_average_pm25 <- sum(!is.na(data$average_pm25))
statistics_table2 <- data.frame(
Variable = c("average_pm25", "ATE_SENAMHI_CRS"),
Average = c(average_of_average_pm25, average_ATE_SENAMHI_CRS),
Maximum = c(max_average_pm25, max_ATE_SENAMHI_CRS),
Count = c(count_average_pm25, count_ATE_SENAMHI_CRS)
)
print(statistics_table2)
## Variable Average Maximum Count
## 1 average_pm25 29.93544 201.9 2758
## 2 ATE_SENAMHI_CRS 42.41088 184.1 2635
# Variable with 24 avg daily_average_pm25 PURPLEAIR
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Primero, extraemos la parte de la fecha de la columna datetime
data$date <- as.Date(data$Fecha)
data_daily <- data %>%
group_by(date) %>%
summarise(
count_hours = n(),
average_pm25 = mean(average_pm25, na.rm = TRUE),
avg_ATE_SENAMHI_CRS = mean(ATE_SENAMHI_CRS, na.rm = TRUE)
) %>%
filter(count_hours > 14)
# Primero, extraemos la parte de la fecha de la columna datetime
data$date <- as.Date(data$Fecha)
# Luego, agrupamos por esta nueva columna de fecha, calculamos el promedio de PM2.5,
# y filtramos solo aquellos días que tienen más de 14 horas de datos
data_daily_average <- data %>%
group_by(date) %>%
summarise(
hours_recorded = n(),
daily_average_pm25 = mean(average_pm25, na.rm = TRUE)
) %>%
filter(hours_recorded > 14) %>%
select(-hours_recorded) # Opcionalmente, quitar la columna hours_recorded si no la necesitas
# Visualizar el resultado
print(data_daily_average)
## # A tibble: 125 × 2
## date daily_average_pm25
## <date> <dbl>
## 1 2023-09-28 34.1
## 2 2023-09-29 28.6
## 3 2023-09-30 36.2
## 4 2023-10-01 36.0
## 5 2023-10-02 38.1
## 6 2023-10-03 37.5
## 7 2023-10-04 52.2
## 8 2023-10-05 38.1
## 9 2023-10-06 30.9
## 10 2023-10-07 37.4
## # … with 115 more rows
# Variable with 24 avg SENAMHI
data_daily_average_SENAMHI <- data %>%
group_by(date) %>%
summarise(
hours_recorded = n(),
daily_average_SENAMHI_CRS = mean(ATE_SENAMHI_CRS, na.rm = TRUE)
) %>%
filter(hours_recorded > 14) %>%
select(-hours_recorded)
print(data_daily_average_SENAMHI)
## # A tibble: 125 × 2
## date daily_average_SENAMHI_CRS
## <date> <dbl>
## 1 2023-09-28 47.7
## 2 2023-09-29 30.7
## 3 2023-09-30 39.0
## 4 2023-10-01 44.7
## 5 2023-10-02 43.2
## 6 2023-10-03 47.7
## 7 2023-10-04 63.9
## 8 2023-10-05 45.0
## 9 2023-10-06 43.2
## 10 2023-10-07 46.1
## # … with 115 more rows
# Missing data 19%
# De 156 días, se tiene 125
total_dias <- 156
dias_con_datos <- 125
dias_sin_datos <- total_dias - dias_con_datos
porcentaje_perdida <- (dias_sin_datos / total_dias) * 100
print(porcentaje_perdida)
## [1] 19.87179
Scatter plot
ggplot(data_daily, aes(x = average_pm25, y = avg_ATE_SENAMHI_CRS)) +
geom_point(color = "blue") +
ggtitle("Scatter Plot of PM2.5 Measurements Purple Air monitors ATE9 vs Senamhi") +
xlab("pm2.5 avg ") +
ylab("ATE_SENAMHI_(CRS)")
## Warning: Removed 8 rows containing missing values (`geom_point()`).