Code
# Loading required packages
library(tidyverse)
library(nortest)
library(ggplot2)
library(Hmisc)
library(GGally)
library(erikmisc)
library(lmtest)
library(car)
library(MASS)
library(knitr)
library(kableExtra)This study analyzes ozone concentrations in Bernalillo, New Mexico, to identify their meteorological and seasonal drivers through multiple regression analysis. The final model, comprising average temperature, average wind speed, precipitation, and snow depth, explained approximately 55% of ozone concentration variation with all predictors being statistically significant (p-values < 0.01). Model diagnostics indicate satisfaction of regression assumptions, providing a reliable framework for predicting ozone concentration levels using weather and seasonal conditions.
Ground-level ozone is a significant air pollutant that poses health risks and environmental concerns. Understanding the meteorological factors that influence ozone concentrations is crucial for air quality forecasting and public health planning. This analysis explores the relationships between ozone levels and various weather parameters in Bernalillo, New Mexico, using multiple regression techniques.
The study follows a systematic approach:
All statistical tests were performed using R at α = 0.05 significance level.
# Loading required packages
library(tidyverse)
library(nortest)
library(ggplot2)
library(Hmisc)
library(GGally)
library(erikmisc)
library(lmtest)
library(car)
library(MASS)
library(knitr)
library(kableExtra)# Loading the data
ozone_data <- read.csv("C:/Users/ekubu/Desktop/Take Home Qual/unm_exam_202501_stat_qual-takehome_dat2.csv")
head(ozone_data)# Custom theme for plots
MyTheme4 <- theme(
axis.title.x = element_text(size = 14, face = "bold"),
axis.title.y = element_text(size = 14, face = "bold"),
axis.text.x = element_text(size = 12, face = "bold"),
axis.text.y = element_text(size = 12, face = "bold"),
legend.text = element_text(size = 12),
plot.title = element_text(size = 16, face = "bold", hjust = 0.5)
)The dataset spans 1,096 daily observations from July 1, 2021, to June 30, 2024. After removing missing ozone measurements, 1,076 complete cases remained for analysis.
# Data cleaning - removing DATE column and missing values
ozone_final <- ozone_data[,-1] %>%
filter(!is.na(X2ZJ.Bernalillo))
str(ozone_final)'data.frame': 1076 obs. of 9 variables:
$ AWND : num 4.92 10.29 7.38 6.26 10.51 ...
$ PRCP : num 0 0.17 0.28 0 0.12 0.02 0 0 0.06 0 ...
$ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
$ SNWD : num 0 0 0 0 0 0 0 0 0 0 ...
$ TAVG : int 71 78 74 76 80 76 75 80 85 80 ...
$ TMAX : int 87 88 90 92 92 91 89 96 100 101 ...
$ TMIN : int 64 66 65 67 68 65 66 70 73 68 ...
$ X2ZJ.Bernalillo: num 0.056 0.052 0.055 0.062 0.06 0.056 0.057 0.059 0.059 0.055 ...
$ DaysJuly1 : int 0 1 2 3 4 5 6 7 8 9 ...
# Summary statistics
summary(ozone_final) %>%
kable(caption = "Summary Statistics of Variables") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| AWND | PRCP | SNOW | SNWD | TAVG | TMAX | TMIN | X2ZJ.Bernalillo | DaysJuly1 | |
|---|---|---|---|---|---|---|---|---|---|
| Min. : 2.010 | Min. :0.00000 | Min. :0.00000 | Min. :0.00000 | Min. :18.00 | Min. : 24.0 | Min. : 9.00 | Min. :0.01500 | Min. : 0.00 | |
| 1st Qu.: 5.370 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:0.00000 | 1st Qu.:43.00 | 1st Qu.: 55.0 | 1st Qu.:32.00 | 1st Qu.:0.03900 | 1st Qu.: 45.00 | |
| Median : 7.380 | Median :0.00000 | Median :0.00000 | Median :0.00000 | Median :60.00 | Median : 73.0 | Median :46.00 | Median :0.04700 | Median : 91.00 | |
| Mean : 8.179 | Mean :0.02021 | Mean :0.01626 | Mean :0.01004 | Mean :58.86 | Mean : 71.5 | Mean :46.59 | Mean :0.04644 | Mean : 91.14 | |
| 3rd Qu.:10.070 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:0.00000 | 3rd Qu.:74.00 | 3rd Qu.: 87.0 | 3rd Qu.:62.00 | 3rd Qu.:0.05300 | 3rd Qu.:138.00 | |
| Max. :34.000 | Max. :1.48000 | Max. :2.70000 | Max. :2.00000 | Max. :90.00 | Max. :104.0 | Max. :79.00 | Max. :0.07500 | Max. :183.00 |
# Distribution plots
par(mfrow = c(3, 3))
hist(ozone_data$X2ZJ.Bernalillo, main = "Ozone Concentration", xlab = "Ozone (ppm)")
hist(ozone_data$TAVG, main = "Average Temperature", xlab = "Temperature (°F)")
hist(ozone_data$TMAX, main = "Maximum Temperature", xlab = "Temperature (°F)")
hist(ozone_data$TMIN, main = "Minimum Temperature", xlab = "Temperature (°F)")
hist(ozone_data$PRCP, main = "Precipitation", xlab = "Precipitation (inches)")
hist(sqrt(ozone_data$AWND), main = "Sqrt(Wind Speed)", xlab = "Sqrt(Wind Speed)")
hist(ozone_data$DaysJuly1, main = "Days Since July 1", xlab = "Days")
hist(ozone_data$SNOW, main = "Snowfall", xlab = "Snow (inches)")
hist(ozone_data$SNWD, main = "Snow Depth", xlab = "Snow Depth (inches)")# Boxplot of ozone concentration
boxplot(ozone_data$X2ZJ.Bernalillo,
main = "Distribution of Ozone Concentrations",
ylab = "Ozone Concentration (ppm)")# Correlation analysis
correlation_matrix <- cor(ozone_final)
correlation_matrix %>%
kable(digits = 3, caption = "Correlation Matrix") %>%
kable_styling(bootstrap_options = c("striped", "hover"),
font_size = 10)| AWND | PRCP | SNOW | SNWD | TAVG | TMAX | TMIN | X2ZJ.Bernalillo | DaysJuly1 | |
|---|---|---|---|---|---|---|---|---|---|
| AWND | 1.000 | 0.034 | 0.041 | 0.006 | 0.151 | 0.056 | 0.146 | 0.188 | -0.194 |
| PRCP | 0.034 | 1.000 | 0.146 | 0.027 | 0.056 | 0.007 | 0.091 | -0.046 | -0.094 |
| SNOW | 0.041 | 0.146 | 1.000 | 0.411 | -0.136 | -0.159 | -0.126 | -0.072 | 0.084 |
| SNWD | 0.006 | 0.027 | 0.411 | 1.000 | -0.129 | -0.131 | -0.114 | -0.032 | 0.064 |
| TAVG | 0.151 | 0.056 | -0.136 | -0.129 | 1.000 | 0.974 | 0.978 | 0.725 | -0.896 |
| TMAX | 0.056 | 0.007 | -0.159 | -0.131 | 0.974 | 1.000 | 0.936 | 0.733 | -0.888 |
| TMIN | 0.146 | 0.091 | -0.126 | -0.114 | 0.978 | 0.936 | 1.000 | 0.691 | -0.879 |
| X2ZJ.Bernalillo | 0.188 | -0.046 | -0.072 | -0.032 | 0.725 | 0.733 | 0.691 | 1.000 | -0.799 |
| DaysJuly1 | -0.194 | -0.094 | 0.084 | 0.064 | -0.896 | -0.888 | -0.879 | -0.799 | 1.000 |
# Scatter plot matrix
pairs(ozone_final, main = "Scatter Plot Matrix of All Variables")# Enhanced scatter plot matrix
ggscatmat(ozone_final) +
ggtitle("Enhanced Scatter Plot Matrix with Correlations")# Initial model with all relevant predictors
initial_model <- lm(X2ZJ.Bernalillo ~ TAVG + SNWD + SNOW + DaysJuly1 + log(AWND) + PRCP,
data = ozone_final)
summary(initial_model)
Call:
lm(formula = X2ZJ.Bernalillo ~ TAVG + SNWD + SNOW + DaysJuly1 +
log(AWND) + PRCP, data = ozone_final)
Residuals:
Min 1Q Median 3Q Max
-0.0256253 -0.0034765 0.0006427 0.0036454 0.0207118
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.491e-02 2.276e-03 24.122 < 2e-16 ***
TAVG 2.621e-05 2.413e-05 1.086 0.277637
SNWD 1.753e-03 1.520e-03 1.153 0.249092
SNOW 2.135e-04 1.197e-03 0.178 0.858482
DaysJuly1 -1.396e-04 7.623e-06 -18.307 < 2e-16 ***
log(AWND) 1.481e-03 4.059e-04 3.648 0.000277 ***
PRCP -1.359e-02 2.000e-03 -6.795 1.79e-11 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.005776 on 1069 degrees of freedom
Multiple R-squared: 0.6585, Adjusted R-squared: 0.6566
F-statistic: 343.5 on 6 and 1069 DF, p-value: < 2.2e-16
# Check for multicollinearity
vif(initial_model) %>%
kable(col.names = "VIF", caption = "Variance Inflation Factors - Initial Model") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| VIF | |
|---|---|
| TAVG | 5.244968 |
| SNWD | 1.222768 |
| SNOW | 1.247951 |
| DaysJuly1 | 5.302450 |
| log(AWND) | 1.093199 |
| PRCP | 1.037968 |
Step 1: Addressing Multicollinearity
# Removing DaysJuly1 due to high VIF
m1 <- lm(X2ZJ.Bernalillo ~ TAVG + log(AWND) + PRCP + SNOW + SNWD, data = ozone_final)
summary(m1)
Call:
lm(formula = X2ZJ.Bernalillo ~ TAVG + log(AWND) + PRCP + SNOW +
SNWD, data = ozone_final)
Residuals:
Min 1Q Median 3Q Max
-0.0275052 -0.0040831 0.0001674 0.0046044 0.0205752
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.676e-02 1.049e-03 15.976 < 2e-16 ***
TAVG 4.190e-04 1.264e-05 33.144 < 2e-16 ***
log(AWND) 2.592e-03 4.597e-04 5.638 2.21e-08 ***
PRCP -1.042e-02 2.283e-03 -4.567 5.53e-06 ***
SNOW 6.487e-04 1.371e-03 0.473 0.6361
SNWD 4.298e-03 1.734e-03 2.479 0.0133 *
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.006617 on 1070 degrees of freedom
Multiple R-squared: 0.5514, Adjusted R-squared: 0.5493
F-statistic: 263.1 on 5 and 1070 DF, p-value: < 2.2e-16
vif(m1) %>%
kable(col.names = "VIF", caption = "VIF After Removing DaysJuly1") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| VIF | |
|---|---|
| TAVG | 1.097294 |
| log(AWND) | 1.068755 |
| PRCP | 1.030203 |
| SNOW | 1.247458 |
| SNWD | 1.212540 |
Step 2: Variable Selection
# Removing non-significant SNOW variable
m2 <- lm(X2ZJ.Bernalillo ~ TAVG + log(AWND) + PRCP + SNWD, data = ozone_final)
summary(m2)
Call:
lm(formula = X2ZJ.Bernalillo ~ TAVG + log(AWND) + PRCP + SNWD,
data = ozone_final)
Residuals:
Min 1Q Median 3Q Max
-0.0275384 -0.0040972 0.0001767 0.0046088 0.0205880
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.678e-02 1.048e-03 16.010 < 2e-16 ***
TAVG 4.184e-04 1.256e-05 33.317 < 2e-16 ***
log(AWND) 2.604e-03 4.588e-04 5.677 1.76e-08 ***
PRCP -1.026e-02 2.255e-03 -4.549 5.99e-06 ***
SNWD 4.625e-03 1.590e-03 2.908 0.00371 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.006614 on 1071 degrees of freedom
Multiple R-squared: 0.5513, Adjusted R-squared: 0.5497
F-statistic: 329 on 4 and 1071 DF, p-value: < 2.2e-16
vif(m2) %>%
kable(col.names = "VIF", caption = "Final Model VIF Values") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| VIF | |
|---|---|
| TAVG | 1.083163 |
| log(AWND) | 1.065084 |
| PRCP | 1.006215 |
| SNWD | 1.020375 |
# Comprehensive model diagnostics
e_plot_lm_diagnostics(m2)Non-constant Variance Score Test
Variance formula: ~ fitted.values
Chisquare = 4.866419, Df = 1, p = 0.027384
# Residuals vs Fitted plot
par(mfrow = c(1, 2))
plot(fitted(m2), resid(m2),
xlab = "Fitted Values", ylab = "Residuals",
main = "Residuals vs Fitted Values",
pch = 16, col = "black", cex = 0.7)
abline(h = 0, col = "red", lty = 2)
# Q-Q plot for normality assessment
qqPlot(m2, xlab = "Theoretical Quantiles", ylab = "Studentized Residuals",
main = "Q-Q Plot of Residuals", col = "grey9", col.lines = "grey")[1] 522 876
# Statistical tests for assumptions
bp_test <- bptest(m2) # Breusch-Pagan test for homoscedasticity
sw_test <- shapiro.test(resid(m2)) # Shapiro-Wilk test for normality
# Create results table
test_results <- data.frame(
Test = c("Breusch-Pagan (Homoscedasticity)", "Shapiro-Wilk (Normality)"),
Statistic = c(bp_test$statistic, sw_test$statistic),
P_Value = c(bp_test$p.value, sw_test$p.value),
Conclusion = c(
ifelse(bp_test$p.value > 0.05, "Homoscedastic", "Heteroscedastic"),
ifelse(sw_test$p.value > 0.05, "Normal", "Non-normal")
)
)
test_results %>%
kable(digits = 4, caption = "Diagnostic Test Results") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Test | Statistic | P_Value | Conclusion | |
|---|---|---|---|---|
| BP | Breusch-Pagan (Homoscedasticity) | 8.5346 | 0.0738 | Homoscedastic |
| W | Shapiro-Wilk (Normality) | 0.9939 | 0.0002 | Non-normal |
# Cook's distance analysis
cooks_d <- cooks.distance(m2)
plot(cooks_d, type = "h",
ylim = c(0, max(cooks_d)+0.02),
xlab = "Observation Number", ylab = "Cook's Distance",
main = "Cook's Distance Plot",
col = "black", lwd = 0.5)
abline(h = 4/length(cooks_d), col = "red", lty = 2)
# Add observation numbers for high Cook's distance points
high_cooks <- which(cooks_d > 4/length(cooks_d))
text(x = high_cooks, y = cooks_d[high_cooks],
labels = high_cooks, pos = 3, cex = 0.7, col = "red")# Bonferroni outlier test
outlier_test <- outlierTest(m2)
print(outlier_test) rstudent unadjusted p-value Bonferroni p
876 -4.215487 2.7028e-05 0.029082
The final model equation:
Ŷ = 0.01678 + 0.000418(TAVG) + 0.00264(log(AWND)) - 0.01026(PRCP) + 0.004625(SNWD)
# Create formatted results table
model_summary <- summary(m2)
coef_table <- data.frame(
Predictor = rownames(model_summary$coefficients),
Estimate = model_summary$coefficients[,1],
Std_Error = model_summary$coefficients[,2],
t_value = model_summary$coefficients[,3],
p_value = model_summary$coefficients[,4],
VIF = c(NA, vif(m2))
)
coef_table %>%
kable(digits = 6, caption = "Final Model Summary Statistics") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Predictor | Estimate | Std_Error | t_value | p_value | VIF | |
|---|---|---|---|---|---|---|
| (Intercept) | (Intercept) | 0.016775 | 0.001048 | 16.010268 | 0.000000 | NA |
| TAVG | TAVG | 0.000418 | 0.000013 | 33.317083 | 0.000000 | 1.083163 |
| log(AWND) | log(AWND) | 0.002604 | 0.000459 | 5.677069 | 0.000000 | 1.065084 |
| PRCP | PRCP | -0.010260 | 0.002255 | -4.549417 | 0.000006 | 1.006215 |
| SNWD | SNWD | 0.004625 | 0.001590 | 2.908397 | 0.003708 | 1.020375 |
Model Performance: - Adjusted R² = 0.5497 (explaining ~55% of ozone variation) - F-statistic = 329.02 on 4 and 1071 DF - p-value < 0.0001 (highly significant overall model)
# Testing model with outliers removed
ozone_removed <- ozone_final[-c(1075, 876),]
m3 <- lm(X2ZJ.Bernalillo ~ TAVG + log(AWND) + PRCP + SNWD, data = ozone_removed)
summary(m3)
Call:
lm(formula = X2ZJ.Bernalillo ~ TAVG + log(AWND) + PRCP + SNWD,
data = ozone_removed)
Residuals:
Min 1Q Median 3Q Max
-0.022901 -0.004151 0.000161 0.004545 0.020629
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.664e-02 1.040e-03 16.008 < 2e-16 ***
TAVG 4.155e-04 1.246e-05 33.341 < 2e-16 ***
log(AWND) 2.786e-03 4.565e-04 6.102 1.46e-09 ***
PRCP -1.253e-02 2.574e-03 -4.869 1.29e-06 ***
SNWD 4.601e-03 1.577e-03 2.919 0.00359 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.006556 on 1069 degrees of freedom
Multiple R-squared: 0.5559, Adjusted R-squared: 0.5542
F-statistic: 334.5 on 4 and 1069 DF, p-value: < 2.2e-16
# Alternative model for comparison
m5 <- lm(X2ZJ.Bernalillo ~ AWND + PRCP + DaysJuly1, data = ozone_final)
summary(m5)
Call:
lm(formula = X2ZJ.Bernalillo ~ AWND + PRCP + DaysJuly1, data = ozone_final)
Residuals:
Min 1Q Median 3Q Max
-0.0255816 -0.0034466 0.0005715 0.0037472 0.0205911
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5.954e-02 5.571e-04 106.878 < 2e-16 ***
AWND 8.954e-05 4.505e-05 1.988 0.0471 *
PRCP -1.342e-02 1.981e-03 -6.776 2.04e-11 ***
DaysJuly1 -1.488e-04 3.403e-06 -43.744 < 2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 0.005801 on 1072 degrees of freedom
Multiple R-squared: 0.6546, Adjusted R-squared: 0.6537
F-statistic: 677.3 on 3 and 1072 DF, p-value: < 2.2e-16
# Model comparison table
model_comp <- data.frame(
Model = c("Final Model (m2)", "Without Outliers (m3)", "Alternative (m5)"),
Adj_R_Squared = c(summary(m2)$adj.r.squared,
summary(m3)$adj.r.squared,
summary(m5)$adj.r.squared),
AIC = c(AIC(m2), AIC(m3), AIC(m5)),
Variables = c("TAVG, log(AWND), PRCP, SNWD",
"TAVG, log(AWND), PRCP, SNWD",
"AWND, PRCP, DaysJuly1")
)
model_comp %>%
kable(digits = 4, caption = "Model Comparison") %>%
kable_styling(bootstrap_options = c("striped", "hover"))| Model | Adj_R_Squared | AIC | Variables |
|---|---|---|---|
| Final Model (m2) | 0.5497 | -7739.307 | TAVG, log(AWND), PRCP, SNWD |
| Without Outliers (m3) | 0.5542 | -7743.806 | TAVG, log(AWND), PRCP, SNWD |
| Alternative (m5) | 0.6537 | -8022.839 | AWND, PRCP, DaysJuly1 |
This analysis successfully identified four key meteorological drivers of ozone concentrations in Bernalillo, New Mexico:
This analysis demonstrates rigorous application of multiple regression techniques to environmental data, providing insights into the meteorological factors driving ozone concentrations in Bernalillo, New Mexico.