# Title: Hybrid Regression Analysis for Electrochemical Air Quality Sensors

# Abstract
# In this project, I explored hybrid regression approaches to calibrating low-cost electrochemical SO2 sensors. Using data patterned on the Pahala and Hilo air quality stations, I trained and validated models for predicting SO2 levels under varying environmental conditions. The analysis combined linear regression and kNN regression to address dynamic changes in pollutant levels and environmental variability. The findings highlight two complementary limitations: linear models struggle to capture non-linearities, while kNN cannot extrapolate beyond its training range. A hybrid of the two is designed to address both, with RMSE as low as 6.9 ppb reported for relocated sensors in the work this analysis is based on.

# Load Libraries
# I loaded essential libraries for data wrangling, visualization, and model building.
library(tidyverse)  # Data manipulation and visualization
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)      # Machine learning and cross-validation
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(Metrics)    # Error metrics calculation
## 
## Attaching package: 'Metrics'
## 
## The following objects are masked from 'package:caret':
## 
##     precision, recall
# Data Preparation
# I simulated sensor data based on the figure's description. The data represent working-electrode (VWE) and auxiliary-electrode (VAE) voltages, temperature, and reference SO2 levels from Pahala and Hilo.
set.seed(123)
n <- 1500  # Number of observations
sensor_data <- data.frame(
  VWE = rnorm(n, mean = 0.5, sd = 0.1),    # working-electrode voltage
  VAE = rnorm(n, mean = 0.2, sd = 0.05),   # auxiliary-electrode voltage
  Temp = rnorm(n, mean = 25, sd = 5),      # ambient temperature
  # 70/30 mixture of high- and low-concentration regimes; note that in this simulation
  # SO2_ref is drawn independently of the voltage and temperature columns
  SO2_ref = c(rnorm(n * 0.7, mean = 50, sd = 15), rnorm(n * 0.3, mean = 10, sd = 5))
)
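
# As an optional sanity check, I summarized the simulated columns to confirm they sit on the
# expected scales before modeling.
summary(sensor_data)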

# Splitting into Training and Testing Sets
# I split the data into training and testing subsets (standing in for Pahala and Hilo, respectively) to simulate relocation effects.
train_index <- createDataPartition(sensor_data$SO2_ref, p = 0.7, list = FALSE)
train_data <- sensor_data[train_index, ]
test_data <- sensor_data[-train_index, ]
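
# A quick check that the stratified split preserved roughly 70/30 proportions and similar
# SO2_ref distributions in both subsets.
nrow(train_data)
nrow(test_data)
summary(train_data$SO2_ref)
summary(test_data$SO2_ref)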

# Linear Regression Model
# I fitted a simple linear regression to capture the overall relationship between features and SO2 levels.
linear_model <- lm(SO2_ref ~ VWE + VAE + Temp, data = train_data)
summary(linear_model)  # Examined coefficients for significance
## 
## Call:
## lm(formula = SO2_ref ~ VWE + VAE + Temp, data = train_data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -43.18 -23.39   4.04  17.29  60.63 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  38.64082    5.51887   7.002 4.51e-12 ***
## VWE           4.87770    6.89582   0.707    0.480    
## VAE         -10.53341   13.65051  -0.772    0.440    
## Temp         -0.03716    0.13713  -0.271    0.786    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.04 on 1048 degrees of freedom
## Multiple R-squared:  0.001079,   Adjusted R-squared:  -0.00178 
## F-statistic: 0.3775 on 3 and 1048 DF,  p-value: 0.7693
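# As an optional diagnostic, I plotted residuals against fitted values. Because SO2_ref was
# simulated independently of the predictors, the fitted values are nearly constant and the
# residuals mirror the bimodal concentration distribution.
plot(fitted(linear_model), resid(linear_model),
     xlab = "Fitted SO2 (ppb)", ylab = "Residual (ppb)",
     main = "Linear model residual diagnostics")
abline(h = 0, lty = 2)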
# Nonparametric kNN Regression
# I used kNN for its ability to model non-linear relationships, tuning the number of neighbors k with 10-fold cross-validation.
train_control <- trainControl(method = "cv", number = 10)
knn_model <- train(
  SO2_ref ~ VWE + VAE + Temp,
  data = train_data,
  method = "knn",
  trControl = train_control,
  tuneLength = 10
)
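
# Inspecting the tuning results shows which k the cross-validation selected and the resampled
# error at each candidate value.
knn_model$bestTune   # selected number of neighbors
knn_model$results    # RMSE and R-squared across the tuning grid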

# Hybrid Regression Approach
# I developed a hybrid model to leverage the strengths of linear regression (extrapolation) and kNN (non-linear modeling).
threshold <- 50  # SO2 threshold (ppb) for switching between models
hybrid_predict <- function(new_data) {
  pred_linear <- predict(linear_model, new_data)
  pred_knn <- predict(knn_model, new_data)
  ifelse(new_data$SO2_ref <= threshold, pred_knn, pred_linear)
}
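
# The switch above relies on the reference concentration SO2_ref, which is only available
# offline. A deployable variant (a sketch, not used for the evaluations below) could switch
# on the linear model's own prediction instead, since no reference value exists at run time.
hybrid_predict_deployed <- function(new_data) {
  pred_linear <- predict(linear_model, new_data)
  pred_knn <- predict(knn_model, new_data)
  ifelse(pred_linear <= threshold, pred_knn, pred_linear)
}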

# Predictions and Evaluations
# Linear Regression Evaluation
pred_linear <- predict(linear_model, test_data)
test_data$pred_linear <- pred_linear
rmse_linear <- rmse(test_data$SO2_ref, pred_linear)
mae_linear <- mae(test_data$SO2_ref, pred_linear)

# kNN Regression Evaluation
pred_knn <- predict(knn_model, test_data)
test_data$pred_knn <- pred_knn
rmse_knn <- rmse(test_data$SO2_ref, pred_knn)
mae_knn <- mae(test_data$SO2_ref, pred_knn)

# Hybrid Regression Evaluation
pred_hybrid <- hybrid_predict(test_data)
test_data$pred_hybrid <- pred_hybrid
rmse_hybrid <- rmse(test_data$SO2_ref, pred_hybrid)
mae_hybrid <- mae(test_data$SO2_ref, pred_hybrid)

# Results Comparison
cat("Linear Regression RMSE:", rmse_linear, "MAE:", mae_linear, "\n")
## Linear Regression RMSE: 22.55533 MAE: 19.5523
cat("kNN Regression RMSE:", rmse_knn, "MAE:", mae_knn, "\n")
## kNN Regression RMSE: 23.16766 MAE: 19.82964
cat("Hybrid Regression RMSE:", rmse_hybrid, "MAE:", mae_hybrid, "\n")
## Hybrid Regression RMSE: 23.08242 MAE: 20.0125
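# Collecting the same metrics into a single data frame makes the comparison easier to read
# and to export alongside the figures.
results_table <- data.frame(
  Model = c("Linear", "kNN", "Hybrid"),
  RMSE  = c(rmse_linear, rmse_knn, rmse_hybrid),
  MAE   = c(mae_linear, mae_knn, mae_hybrid)
)
results_table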
# Visualization
# I visualized predictions to compare model performance across the SO2 range.
ggplot(test_data, aes(x = SO2_ref)) +
  geom_point(aes(y = pred_linear, color = "Linear"), alpha = 0.6) +
  geom_point(aes(y = pred_knn, color = "kNN"), alpha = 0.6) +
  geom_point(aes(y = pred_hybrid, color = "Hybrid"), alpha = 0.6) +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "black") +
  labs(
    title = "Comparison of Regression Models",
    x = "True SO2 Concentration (ppb)",
    y = "Predicted SO2 Concentration (ppb)"
  ) +
  scale_color_manual(values = c("Linear" = "blue", "kNN" = "green", "Hybrid" = "red")) +
  theme_minimal()
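
# Because the hybrid model switches at 50 ppb, it is also informative to compare errors below
# and above that threshold (a supplementary check using dplyr and Metrics).
test_data %>%
  mutate(range = ifelse(SO2_ref <= threshold, "Low (<= 50 ppb)", "High (> 50 ppb)")) %>%
  group_by(range) %>%
  summarise(
    RMSE_linear = rmse(SO2_ref, pred_linear),
    RMSE_knn    = rmse(SO2_ref, pred_knn),
    RMSE_hybrid = rmse(SO2_ref, pred_hybrid)
  )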

# Interpretation of Results
# - Linear regression: RMSE 22.56 ppb, MAE 19.55 ppb.
# - kNN regression: RMSE 23.17 ppb, MAE 19.83 ppb.
# - Hybrid regression: RMSE 23.08 ppb, MAE 20.01 ppb.
# On this simulated data the three models performed comparably, largely because SO2_ref was
# drawn independently of the voltage and temperature predictors. With real co-location data,
# where the electrode voltages track SO2, the hybrid design is intended to combine kNN's
# non-linear fit within the training range with the linear model's ability to extrapolate,
# addressing the challenges of sensor relocation and environmental variability.