Global Air Quality Analysis & Prediction (1999–2025)

WQD7004 – Programming for Data Science · Group Project

Author: Group 4
University of Malaya · Faculty of Computer Science and Information Technology
Date: 2025-11-21




1. Introduction

This project analyses monthly PM2.5 and NO₂ data for 20 major global cities (1999–2025).
Objectives:

  • Clean and prepare the dataset (tidy form).
  • Exploratory Data Analysis (EDA): temporal, spatial, and relationship analysis.
  • Build predictive models: regression (PM2.5) and classification (data_quality).
  • Provide model comparison, feature importance, limitations & future work.

2. Setup & Required Packages
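
The setup chunk is not echoed in the rendered report. The sketch below reconstructs the likely package loads from the attached packages listed in sessionInfo() (see Appendix):

# Data wrangling, dates, and summaries
library(tidyverse)    # dplyr, tidyr, ggplot2, readr, ...
library(lubridate)    # make_date() for year/month -> Date
library(skimr)        # skim() overview tables
library(corrplot)     # correlation heatmap

# Modeling and evaluation
library(caret)        # createDataPartition(), RMSE(), MAE(), R2(), confusionMatrix()
library(rpart)        # decision trees
library(rpart.plot)   # tree diagrams
library(randomForest) # random forests

# Mapping
library(maps)
library(mapdata)
library(scales)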


3. Load & Inspect Data

df <- read.csv("air_quality_global.csv", stringsAsFactors = FALSE)
cat("Rows:", nrow(df), " Columns:", ncol(df), "\n")
## Rows: 6480  Columns: 11
glimpse(df)
## Rows: 6,480
## Columns: 11
## $ city               <chr> "New York", "New York", "New York", "New York", "Ne…
## $ country            <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "U…
## $ latitude           <dbl> 40.7128, 40.7128, 40.7128, 40.7128, 40.7128, 40.712…
## $ longitude          <dbl> -74.006, -74.006, -74.006, -74.006, -74.006, -74.00…
## $ year               <int> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 199…
## $ month              <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, …
## $ pm25_ugm3          <dbl> 18.11, 27.79, 12.05, 35.25, 38.39, 14.89, 19.66, 10…
## $ no2_ugm3           <dbl> 35.98, 17.71, 40.99, 17.18, 25.07, 28.95, 27.85, 26…
## $ data_quality       <chr> "Moderate", "Good", "Moderate", "Poor", "Good", "Go…
## $ measurement_method <chr> "Reference/Equivalent Method", "Reference/Equivalen…
## $ data_source        <chr> "EPA_AQS", "EPA_AQS", "EPA_AQS", "EPA_AQS", "EPA_AQ…
skim(df)
Data summary

Name                     df
Number of rows           6480
Number of columns        11
Column type frequency:
  character              5
  numeric                6
Group variables          None

Variable type: character

skim_variable       n_missing  complete_rate  min  max  empty  n_unique  whitespace
city                        0              1    5   12      0        20           0
country                     0              1    2    7      0        10           0
data_quality                0              1    4    8      0         3           0
measurement_method          0              1   24   27      0         2           0
data_source                 0              1    7   12      0         2           0

Variable type: numeric

skim_variable  n_missing  complete_rate     mean     sd       p0      p25      p50      p75     p100  hist
latitude               0              1    31.54  16.60   -23.55    29.24    33.75    40.14    52.52  ▁▁▂▇▇
longitude              0              1   -35.88  81.51  -121.89   -98.65   -74.59     5.89   139.65  ▇▁▃▂▂
year                   0              1  2012.00   7.79  1999.00  2005.00  2012.00  2019.00  2025.00  ▇▇▇▇▇
month                  0              1     6.50   3.45     1.00     3.75     6.50     9.25    12.00  ▇▅▅▅▇
pm25_ugm3              0              1    40.97  36.30     5.10    19.34    29.23    46.08   274.18  ▇▁▁▁▁
no2_ugm3               0              1    39.62  16.71    10.25    27.08    36.84    48.92   110.27  ▆▇▃▁▁

4. Data Cleaning & Feature Engineering

clean_df <- df %>%
  mutate(
    date = lubridate::make_date(year, month, 1),
    city = factor(city),
    country = factor(country),
    measurement_method = factor(measurement_method),
    data_source = factor(data_source),
    season = case_when(
      month %in% c(12,1,2) ~ "Winter",
      month %in% c(3,4,5) ~ "Spring",
      month %in% c(6,7,8) ~ "Summer",
      TRUE ~ "Autumn"
    ) %>% factor(levels = c("Winter","Spring","Summer","Autumn")),
    data_quality = factor(data_quality, levels = c("Good","Moderate","Poor"), ordered = TRUE)
  ) %>%
  group_by(city, month) %>%
  mutate(
    pm25_ugm3 = ifelse(is.na(pm25_ugm3), mean(pm25_ugm3, na.rm=TRUE), pm25_ugm3),
    no2_ugm3 = ifelse(is.na(no2_ugm3), mean(no2_ugm3, na.rm=TRUE), no2_ugm3)
  ) %>%
  ungroup()

# Quick checks
sum(is.na(clean_df$pm25_ugm3)); sum(is.na(clean_df$no2_ugm3))
## [1] 0
## [1] 0
table(clean_df$data_quality, useNA="ifany")
## 
##     Good Moderate     Poor 
##     4917     1169      394
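
A few extra sanity checks on the engineered features can be run at this point (a small sketch; output omitted):

# Engineered features: season balance, date coverage, ordered quality levels
table(clean_df$season)
range(clean_df$date)
levels(clean_df$data_quality)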

5. Exploratory Data Analysis (EDA)

5.2 City-level Distribution

ggplot(clean_df, aes(reorder(city, pm25_ugm3, FUN=median), pm25_ugm3, fill=city)) +
  geom_boxplot(outlier.size=0.8, alpha=0.8) +
  coord_flip() +
  theme_minimal() +
  theme(legend.position="none") +
  labs(title="PM2.5 Distribution by City (median order)", x="", y="PM2.5 (µg/m³)")

5.3 PM2.5 vs NO₂ Relationship

ggplot(clean_df, aes(no2_ugm3, pm25_ugm3)) +
  geom_point(alpha=0.35) +
  geom_smooth(method="lm", se=FALSE, color="black") +
  theme_minimal() +
  labs(title="PM2.5 vs NO₂ (monthly)", x="NO₂ (µg/m³)", y="PM2.5 (µg/m³)")

5.4 Seasonality

ggplot(clean_df, aes(season, pm25_ugm3, fill=season)) +
  geom_boxplot() +
  theme_minimal() +
  theme(legend.position="none") +
  labs(title="Seasonal Variation of PM2.5", x="Season", y="PM2.5 (µg/m³)")

5.5 Correlation Matrix

cor_mat <- clean_df %>% select(pm25_ugm3, no2_ugm3, latitude, longitude) %>% cor(use="pairwise.complete.obs")
corrplot(cor_mat, method="color", addCoef.col="black", tl.col="black")

5.6 Spatial (Map) — City mean PM2.5 on world map

city_mean <- clean_df %>%
  group_by(city, country, latitude, longitude) %>%
  summarize(pm25_avg = mean(pm25_ugm3, na.rm=TRUE)) %>%
  ungroup()

world_df <- ggplot2::map_data("world")  # world polygons as a data frame
ggplot() +
  geom_polygon(data = world_df, aes(x = long, y = lat, group = group),
               fill = "#f0f0f0", color = "#d9d9d9") +
  geom_point(data = city_mean, aes(x = longitude, y = latitude, color = pm25_avg, size = pm25_avg), alpha = 0.9) +
  scale_color_viridis_c(option = "plasma", name = "PM2.5 avg") +
  scale_size(range = c(2, 8), guide = "none") +
  coord_quickmap() +
  theme_minimal() +
  labs(title="City Average PM2.5 (1999–2025)", x="", y="")


6. Modeling — Train/Test Split

clean_df <- clean_df %>% filter(!is.na(data_quality))
set.seed(123)
train_idx <- createDataPartition(clean_df$data_quality, p=0.8, list=FALSE)
train <- clean_df[train_idx, ] %>% droplevels()
test  <- clean_df[-train_idx, ] %>% droplevels()
cat("Train / Test sizes:", nrow(train), "/", nrow(test), "\n")
## Train / Test sizes: 5186 / 1294
table(train$data_quality); table(test$data_quality)
## 
##     Good Moderate     Poor 
##     3934      936      316
## 
##     Good Moderate     Poor 
##      983      233       78

6.1 Regression: Linear Model (baseline)

lm_fit <- lm(pm25_ugm3 ~ no2_ugm3 + latitude + season + city, data=train)
summary(lm_fit)
## 
## Call:
## lm(formula = pm25_ugm3 ~ no2_ugm3 + latitude + season + city, 
##     data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -86.960  -9.206   0.406   9.023 129.196 
## 
## Coefficients: (1 not defined because of singularities)
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      -329.27469   15.84574 -20.780  < 2e-16 ***
## no2_ugm3            0.10015    0.02287   4.378 1.22e-05 ***
## latitude            9.96572    0.42539  23.427  < 2e-16 ***
## seasonSpring       10.55571    0.79967  13.200  < 2e-16 ***
## seasonSummer       -0.24278    0.81926  -0.296    0.767    
## seasonAutumn      -11.63281    0.76453 -15.216  < 2e-16 ***
## cityBerlin       -172.25148    6.52073 -26.416  < 2e-16 ***
## cityChicago       -66.51658    2.36116 -28.171  < 2e-16 ***
## cityDallas         23.52521    2.53712   9.272  < 2e-16 ***
## cityDelhi         170.36328    4.27334  39.866  < 2e-16 ***
## cityHouston        57.23679    3.67380  15.580  < 2e-16 ***
## cityLagos         326.65356   13.36556  24.440  < 2e-16 ***
## cityLondon       -158.95266    6.03690 -26.330  < 2e-16 ***
## cityLos Angeles    17.18419    2.14176   8.023 1.26e-15 ***
## cityMexico City   183.26172    8.01249  22.872  < 2e-16 ***
## cityMumbai        232.99353    8.12917  28.661  < 2e-16 ***
## cityNew York      -57.43163    2.05359 -27.966  < 2e-16 ***
## cityParis        -133.87052    4.97687 -26.899  < 2e-16 ***
## cityPhiladelphia  -48.81827    1.83019 -26.674  < 2e-16 ***
## cityPhoenix        14.39198    2.30133   6.254 4.33e-10 ***
## citySan Antonio    57.98366    3.77160  15.374  < 2e-16 ***
## citySan Diego      20.43272    2.53275   8.067 8.86e-16 ***
## citySan Jose      -22.50785    1.50922 -14.914  < 2e-16 ***
## citySão Paulo     600.18093   26.14867  22.953  < 2e-16 ***
## cityTokyo                NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19.47 on 5162 degrees of freedom
## Multiple R-squared:  0.7091, Adjusted R-squared:  0.7078 
## F-statistic:   547 on 23 and 5162 DF,  p-value: < 2.2e-16
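
Note: cityTokyo is reported as NA ("1 not defined because of singularities") because latitude is constant within each city; once all city dummies are in the model, latitude is perfectly collinear with them, and R drops one redundant coefficient.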

6.1.1 Regression evaluation (test set)

pred_lm <- predict(lm_fit, test)
lm_rmse <- RMSE(pred_lm, test$pm25_ugm3)
lm_mae  <- MAE(pred_lm, test$pm25_ugm3)
lm_r2   <- R2(pred_lm, test$pm25_ugm3)
data.frame(Model="Linear Regression", RMSE=lm_rmse, MAE=lm_mae, R2=lm_r2)
##               Model     RMSE      MAE        R2
## 1 Linear Regression 20.30573 13.66486 0.7062653
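
A predicted-versus-actual plot is a quick way to see where the linear model struggles (a sketch, not part of the original output):

# Points far from the dashed y = x line are poorly predicted
ggplot(data.frame(actual = test$pm25_ugm3, predicted = pred_lm),
       aes(actual, predicted)) +
  geom_point(alpha = 0.3) +
  geom_abline(linetype = "dashed") +   # perfect-prediction line (slope 1, intercept 0)
  theme_minimal() +
  labs(title = "Linear model: predicted vs actual PM2.5 (test set)",
       x = "Actual PM2.5 (µg/m³)", y = "Predicted PM2.5 (µg/m³)")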

6.2 Regression: Random Forest (improved)

set.seed(123)
rf_reg <- randomForest(pm25_ugm3 ~ no2_ugm3 + latitude + season + city, data=train, ntree=300, importance=TRUE)
print(rf_reg)
## 
## Call:
##  randomForest(formula = pm25_ugm3 ~ no2_ugm3 + latitude + season +      city, data = train, ntree = 300, importance = TRUE) 
##                Type of random forest: regression
##                      Number of trees: 300
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 364.5039
##                     % Var explained: 71.88
varImpReg <- importance(rf_reg)
varImpPlot(rf_reg, main="Random Forest: Regression Variable Importance")
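
The same scores can be tabulated for the report (a sketch built on the importance() matrix computed above; %IncMSE is the permutation importance):

# Rank predictors by % increase in MSE when the variable is permuted
importance(rf_reg) %>%
  as.data.frame() %>%
  tibble::rownames_to_column("variable") %>%
  arrange(desc(`%IncMSE`))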

6.2.1 RF regression evaluation

pred_rf <- predict(rf_reg, test)
rf_rmse <- RMSE(pred_rf, test$pm25_ugm3)
rf_mae  <- MAE(pred_rf, test$pm25_ugm3)
rf_r2   <- R2(pred_rf, test$pm25_ugm3)
data.frame(Model=c("Linear Regression","Random Forest Regression"),
           RMSE=c(lm_rmse, rf_rmse),
           MAE =c(lm_mae, rf_mae),
           R2  =c(lm_r2, rf_r2))
##                      Model     RMSE      MAE        R2
## 1        Linear Regression 20.30573 13.66486 0.7062653
## 2 Random Forest Regression 20.02125 13.24618 0.7182331

6.3 Classification: Decision Tree (data_quality)

train_cl <- train %>% select(data_quality, pm25_ugm3, no2_ugm3, latitude, season, city) %>% drop_na() %>% droplevels()
test_cl  <- test  %>% select(data_quality, pm25_ugm3, no2_ugm3, latitude, season, city) %>% drop_na() %>% droplevels()

tree_fit <- rpart(data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city, data=train_cl, method="class")
rpart.plot(tree_fit, main="Decision Tree: Predicting data_quality", box.palette="Greens", nn=TRUE, shadow.col="gray")

6.3.1 Tree evaluation

pred_tree <- predict(tree_fit, test_cl, type="class")
conf_tree <- confusionMatrix(pred_tree, test_cl$data_quality)
conf_tree
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Good Moderate Poor
##   Good      983      233   78
##   Moderate    0        0    0
##   Poor        0        0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7597          
##                  95% CI : (0.7354, 0.7827)
##     No Information Rate : 0.7597          
##     P-Value [Acc > NIR] : 0.5152          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: Good Class: Moderate Class: Poor
## Sensitivity               1.0000          0.0000     0.00000
## Specificity               0.0000          1.0000     1.00000
## Pos Pred Value            0.7597             NaN         NaN
## Neg Pred Value               NaN          0.8199     0.93972
## Prevalence                0.7597          0.1801     0.06028
## Detection Rate            0.7597          0.0000     0.00000
## Detection Prevalence      1.0000          0.0000     0.00000
## Balanced Accuracy         0.5000          0.5000     0.50000
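
The tree has effectively collapsed to the majority class: accuracy equals the no-information rate (0.760) and Kappa is 0. One standard counter-measure is to make errors on the minority classes costlier via class priors; the sketch below (not run here) re-fits with equal priors:

# Equal priors weight Moderate/Poor errors as heavily as Good errors
tree_bal <- rpart(data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city,
                  data = train_cl, method = "class",
                  parms = list(prior = c(1, 1, 1) / 3))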

6.4 Classification: Random Forest Classifier (improved)

set.seed(123)
rf_clf <- randomForest(data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city, data=train_cl, ntree=300, importance=TRUE)
print(rf_clf)
## 
## Call:
##  randomForest(formula = data_quality ~ pm25_ugm3 + no2_ugm3 +      latitude + season + city, data = train_cl, ntree = 300, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 300
## No. of variables tried at each split: 2
## 
##         OOB estimate of  error rate: 25.18%
## Confusion matrix:
##          Good Moderate Poor class.error
## Good     3876       51    7  0.01474326
## Moderate  930        4    2  0.99572650
## Poor      311        5    0  1.00000000
# Variable importance for classifier
varImpPlot(rf_clf, main="Random Forest: Classification Variable Importance")

6.4.1 RF classification evaluation

pred_rf_cl <- predict(rf_clf, test_cl)
conf_rf_cl <- confusionMatrix(pred_rf_cl, test_cl$data_quality)
conf_rf_cl
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction Good Moderate Poor
##   Good      972      229   75
##   Moderate   10        4    3
##   Poor        1        0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7543          
##                  95% CI : (0.7298, 0.7775)
##     No Information Rate : 0.7597          
##     P-Value [Acc > NIR] : 0.6887          
##                                           
##                   Kappa : 0.011           
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: Good Class: Moderate Class: Poor
## Sensitivity              0.98881        0.017167   0.0000000
## Specificity              0.02251        0.987747   0.9991776
## Pos Pred Value           0.76176        0.235294   0.0000000
## Neg Pred Value           0.38889        0.820673   0.9396752
## Prevalence               0.75966        0.180062   0.0602782
## Detection Rate           0.75116        0.003091   0.0000000
## Detection Prevalence     0.98609        0.013138   0.0007728
## Balanced Accuracy        0.50566        0.502457   0.4995888
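
The forest inherits the same imbalance problem: accuracy (0.754) sits below the no-information rate and Kappa is near zero. A stratified, downsampled forest is one common remedy (again a sketch, not run here):

# Balanced bootstrap: draw the same number of rows from each class per tree
n_min  <- min(table(train_cl$data_quality))
rf_bal <- randomForest(data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city,
                       data = train_cl, ntree = 300,
                       strata = train_cl$data_quality,
                       sampsize = rep(n_min, nlevels(train_cl$data_quality)))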

7. Model Comparison Summary

reg_table <- data.frame(
  Model=c("Linear Regression","Random Forest (reg)"),
  RMSE=c(lm_rmse, rf_rmse),
  MAE=c(lm_mae, rf_mae),
  R2=c(lm_r2, rf_r2)
)

reg_table %>% knitr::kable(digits=3, caption="Regression model comparison (test set)")
Regression model comparison (test set)

Model                  RMSE     MAE     R2
Linear Regression    20.306  13.665  0.706
Random Forest (reg)  20.021  13.246  0.718

class_summary <- tibble(
  Model = c("Decision Tree","Random Forest (clf)"),
  Accuracy = c(conf_tree$overall["Accuracy"], conf_rf_cl$overall["Accuracy"])
)
class_summary %>% knitr::kable(digits=3, caption="Classification model comparison (test set)")
Classification model comparison (test set)

Model                Accuracy
Decision Tree           0.760
Random Forest (clf)     0.754
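
Because 76% of test rows are labelled Good, accuracy alone flatters both classifiers. Adding Kappa, already computed by confusionMatrix(), makes the comparison more honest (a small sketch):

# Accuracy plus chance-corrected agreement for both classifiers
tibble(
  Model    = c("Decision Tree", "Random Forest (clf)"),
  Accuracy = c(conf_tree$overall["Accuracy"], conf_rf_cl$overall["Accuracy"]),
  Kappa    = c(conf_tree$overall["Kappa"],    conf_rf_cl$overall["Kappa"])
) %>% knitr::kable(digits = 3)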

8. Discussion (Interpretation)

  • PM2.5 & NO₂: EDA and the correlation matrix show a positive association, consistent with shared emission sources (traffic, combustion); in the linear model the NO₂ coefficient is small (0.10) but highly significant.
  • Spatial patterns: city-level averages are highly heterogeneous, with some cities consistently recording far higher PM2.5.
  • Regression: the Random Forest modestly outperforms the linear baseline (test RMSE 20.02 vs 20.31; R² 0.718 vs 0.706), suggesting mild non-linear effects and interactions (city, season).
  • Classification: neither model beats the no-information rate of 0.760. The decision tree collapses to predicting Good for every observation, and the Random Forest is marginally worse (accuracy 0.754, Kappa 0.011); the available features carry little signal about data_quality beyond the class imbalance.
  • Feature importance: the importance plots show which predictors each model leans on, but the classifiers' near-baseline performance means their rankings should be read cautiously; the regression importances, where the model has genuine skill, are more informative.

9. Limitations & Future Work

Limitations

  • Data are monthly averages, masking daily peaks and episodic events.
  • Meteorological variables (wind, temperature, humidity) are not included — they strongly affect pollutant dispersion.
  • Measurement and data source heterogeneity (EPA vs WHO methods) may introduce bias.
  • Some cities may have shorter monitoring histories (uneven time-series length).

Future Work

  • Incorporate meteorological and emissions inventory data (ERA5, local emissions).
  • Perform time-series forecasting with LSTM or Prophet for short-term predictions.
  • Use spatial interpolation (kriging) or satellite AOD to increase spatial coverage.
  • Conduct causal analysis (instrumental variables) to separate meteorology vs emissions effects.

10. Conclusion

This project provides a complete pipeline, from data cleaning and EDA through modeling (regression and classification) to interpretation. The Random Forest regression modestly improves on the linear baseline, and the analyses highlight clear spatial and seasonal patterns in PM2.5 and NO₂ across major cities. The data_quality classification task proved harder: neither classifier beat the majority-class baseline, pointing to limited signal in the available features and motivating the imbalance-aware refits sketched above.


Appendix: Code & Session Info

sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
## 
## Matrix products: default
## BLAS:   /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib 
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib;  LAPACK version 3.12.1
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## time zone: Asia/Shanghai
## tzcode source: internal
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] scales_1.4.0         mapdata_2.3.1        maps_3.4.3          
##  [4] randomForest_4.7-1.2 rpart.plot_3.1.3     rpart_4.1.24        
##  [7] caret_7.0-1          lattice_0.22-7       corrplot_0.95       
## [10] skimr_2.2.1          lubridate_1.9.4      forcats_1.0.1       
## [13] stringr_1.5.2        dplyr_1.1.4          purrr_1.2.0         
## [16] readr_2.1.6          tidyr_1.3.1          tibble_3.3.0        
## [19] ggplot2_4.0.1        tidyverse_2.0.0     
## 
## loaded via a namespace (and not attached):
##  [1] tidyselect_1.2.1     viridisLite_0.4.2    timeDate_4051.111   
##  [4] farver_2.1.2         S7_0.2.1             fastmap_1.2.0       
##  [7] pROC_1.19.0.1        digest_0.6.37        timechange_0.3.0    
## [10] lifecycle_1.0.4      survival_3.8-3       magrittr_2.0.4      
## [13] compiler_4.5.1       rlang_1.1.6          sass_0.4.10         
## [16] tools_4.5.1          yaml_2.3.10          data.table_1.17.8   
## [19] knitr_1.50           labeling_0.4.3       plyr_1.8.9          
## [22] repr_1.1.7           RColorBrewer_1.1-3   withr_3.0.2         
## [25] nnet_7.3-20          grid_4.5.1           stats4_4.5.1        
## [28] e1071_1.7-16         future_1.68.0        globals_0.18.0      
## [31] iterators_1.0.14     MASS_7.3-65          cli_3.6.5           
## [34] rmarkdown_2.30       generics_0.1.4       rstudioapi_0.17.1   
## [37] future.apply_1.20.0  reshape2_1.4.5       tzdb_0.5.0          
## [40] proxy_0.4-27         cachem_1.1.0         splines_4.5.1       
## [43] parallel_4.5.1       base64enc_0.1-3      vctrs_0.6.5         
## [46] hardhat_1.4.2        Matrix_1.7-3         jsonlite_2.0.0      
## [49] hms_1.1.4            listenv_0.10.0       foreach_1.5.2       
## [52] gower_1.0.2          jquerylib_0.1.4      recipes_1.3.1       
## [55] glue_1.8.0           parallelly_1.45.1    codetools_0.2-20    
## [58] stringi_1.8.7        gtable_0.3.6         pillar_1.11.1       
## [61] htmltools_0.5.8.1    ipred_0.9-15         lava_1.8.2          
## [64] R6_2.6.1             evaluate_1.0.5       bslib_0.9.0         
## [67] class_7.3-23         Rcpp_1.1.0           nlme_3.1-168        
## [70] prodlim_2025.04.28   mgcv_1.9-3           xfun_0.53           
## [73] pkgconfig_2.0.3      ModelMetrics_1.2.2.2