WQD7004 – Programming for Data Science · Group Project
This project analyses monthly PM2.5 and NO₂ concentrations for 20 major global cities (1999–2025).
Objectives:
- Load, inspect, and clean the monthly air-quality dataset.
- Explore spatial, temporal, and seasonal patterns of PM2.5 and NO₂.
- Model PM2.5 with linear regression and random forest regression.
- Classify the data_quality label with a decision tree and a random forest, and compare all models on a held-out test set.
df <- read.csv("air_quality_global.csv", stringsAsFactors = FALSE)
cat("Rows:", nrow(df), " Columns:", ncol(df), "\n")
## Rows: 6480 Columns: 11
glimpse(df)
## Rows: 6,480
## Columns: 11
## $ city <chr> "New York", "New York", "New York", "New York", "Ne…
## $ country <chr> "USA", "USA", "USA", "USA", "USA", "USA", "USA", "U…
## $ latitude <dbl> 40.7128, 40.7128, 40.7128, 40.7128, 40.7128, 40.712…
## $ longitude <dbl> -74.006, -74.006, -74.006, -74.006, -74.006, -74.00…
## $ year <int> 1999, 1999, 1999, 1999, 1999, 1999, 1999, 1999, 199…
## $ month <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 1, 2, 3, 4, …
## $ pm25_ugm3 <dbl> 18.11, 27.79, 12.05, 35.25, 38.39, 14.89, 19.66, 10…
## $ no2_ugm3 <dbl> 35.98, 17.71, 40.99, 17.18, 25.07, 28.95, 27.85, 26…
## $ data_quality <chr> "Moderate", "Good", "Moderate", "Poor", "Good", "Go…
## $ measurement_method <chr> "Reference/Equivalent Method", "Reference/Equivalen…
## $ data_source <chr> "EPA_AQS", "EPA_AQS", "EPA_AQS", "EPA_AQS", "EPA_AQ…
skim(df)
| Data summary | |
|---|---|
| Name | df |
| Number of rows | 6480 |
| Number of columns | 11 |
| Column type frequency: character | 5 |
| Column type frequency: numeric | 6 |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| city | 0 | 1 | 5 | 12 | 0 | 20 | 0 |
| country | 0 | 1 | 2 | 7 | 0 | 10 | 0 |
| data_quality | 0 | 1 | 4 | 8 | 0 | 3 | 0 |
| measurement_method | 0 | 1 | 24 | 27 | 0 | 2 | 0 |
| data_source | 0 | 1 | 7 | 12 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| latitude | 0 | 1 | 31.54 | 16.60 | -23.55 | 29.24 | 33.75 | 40.14 | 52.52 | ▁▁▂▇▇ |
| longitude | 0 | 1 | -35.88 | 81.51 | -121.89 | -98.65 | -74.59 | 5.89 | 139.65 | ▇▁▃▂▂ |
| year | 0 | 1 | 2012.00 | 7.79 | 1999.00 | 2005.00 | 2012.00 | 2019.00 | 2025.00 | ▇▇▇▇▇ |
| month | 0 | 1 | 6.50 | 3.45 | 1.00 | 3.75 | 6.50 | 9.25 | 12.00 | ▇▅▅▅▇ |
| pm25_ugm3 | 0 | 1 | 40.97 | 36.30 | 5.10 | 19.34 | 29.23 | 46.08 | 274.18 | ▇▁▁▁▁ |
| no2_ugm3 | 0 | 1 | 39.62 | 16.71 | 10.25 | 27.08 | 36.84 | 48.92 | 110.27 | ▆▇▃▁▁ |
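Both pollutant variables are heavily right-skewed (PM2.5: median 29.2 vs. max 274.2 µg/m³). A log-scale histogram makes the bulk of the distribution easier to inspect; a quick sketch, assuming a log10 axis is acceptable here:

# Sketch: log-scale view of the skewed PM2.5 distribution (not in the original analysis)
ggplot(df, aes(pm25_ugm3)) +
  geom_histogram(bins = 40, fill = "#1A73E8", color = "white") +
  scale_x_log10() +
  theme_minimal() +
  labs(title = "PM2.5 distribution (log10 scale)", x = "PM2.5 (µg/m³)", y = "Count")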
clean_df <- df %>%
  mutate(
    # Derive a proper Date column and encode categorical fields as factors
    date = lubridate::make_date(year, month, 1),
    city = factor(city),
    country = factor(country),
    measurement_method = factor(measurement_method),
    data_source = factor(data_source),
    # Map calendar months to meteorological (Northern Hemisphere) seasons
    season = case_when(
      month %in% c(12, 1, 2) ~ "Winter",
      month %in% c(3, 4, 5)  ~ "Spring",
      month %in% c(6, 7, 8)  ~ "Summer",
      TRUE                   ~ "Autumn"
    ) %>% factor(levels = c("Winter", "Spring", "Summer", "Autumn")),
    # data_quality is ordinal: Good < Moderate < Poor
    data_quality = factor(data_quality, levels = c("Good", "Moderate", "Poor"), ordered = TRUE)
  ) %>%
  # Impute any missing pollutant readings with the city-month mean
  group_by(city, month) %>%
  mutate(
    pm25_ugm3 = ifelse(is.na(pm25_ugm3), mean(pm25_ugm3, na.rm = TRUE), pm25_ugm3),
    no2_ugm3  = ifelse(is.na(no2_ugm3),  mean(no2_ugm3,  na.rm = TRUE), no2_ugm3)
  ) %>%
  ungroup()
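Note that the city-month group mean is itself NA when every observation in a group is missing. The quick checks below show that does not occur in this dataset, but a defensive fallback (a sketch, not part of the original pipeline) could cascade to the city-wide mean and then the global mean:

# Hypothetical fallback imputation; a no-op on this dataset
clean_df <- clean_df %>%
  group_by(city) %>%
  mutate(pm25_ugm3 = ifelse(is.na(pm25_ugm3), mean(pm25_ugm3, na.rm = TRUE), pm25_ugm3)) %>%
  ungroup() %>%
  mutate(pm25_ugm3 = ifelse(is.na(pm25_ugm3), mean(pm25_ugm3, na.rm = TRUE), pm25_ugm3))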
# Quick checks
sum(is.na(clean_df$pm25_ugm3)); sum(is.na(clean_df$no2_ugm3))
## [1] 0
## [1] 0
table(clean_df$data_quality, useNA="ifany")
##
## Good Moderate Poor
## 4917 1169 394
clean_df %>%
group_by(date) %>%
summarize(pm25_mean = mean(pm25_ugm3, na.rm=TRUE)) %>%
ggplot(aes(date, pm25_mean)) +
  geom_line(linewidth = 1.1, color = "#1A73E8") +
theme_minimal() +
labs(title="Global Monthly Mean PM2.5 (1999–2025)", x="Year", y="Mean PM2.5 (µg/m³)")
ggplot(clean_df, aes(reorder(city, pm25_ugm3, FUN=median), pm25_ugm3, fill=city)) +
geom_boxplot(outlier.size=0.8, alpha=0.8) +
coord_flip() +
theme_minimal() +
theme(legend.position="none") +
labs(title="PM2.5 Distribution by City (median order)", x="", y="PM2.5 (µg/m³)")
ggplot(clean_df, aes(no2_ugm3, pm25_ugm3)) +
geom_point(alpha=0.35) +
geom_smooth(method="lm", se=FALSE, color="black") +
theme_minimal() +
labs(title="PM2.5 vs NO₂ (monthly)", x="NO₂ (µg/m³)", y="PM2.5 (µg/m³)")
ggplot(clean_df, aes(season, pm25_ugm3, fill=season)) +
geom_boxplot() +
theme_minimal() +
theme(legend.position="none") +
labs(title="Seasonal Variation of PM2.5", x="Season", y="PM2.5 (µg/m³)")
cor_mat <- clean_df %>%
  select(pm25_ugm3, no2_ugm3, latitude, longitude) %>%
  cor(use = "pairwise.complete.obs")
corrplot(cor_mat, method="color", addCoef.col="black", tl.col="black")
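Because pm25_ugm3 is strongly skewed, Pearson correlations can be pulled by extreme values; a Spearman (rank-based) matrix is a cheap robustness check (an addition, not in the original analysis):

# Rank-based correlations as a robustness check against skew
clean_df %>%
  select(pm25_ugm3, no2_ugm3, latitude, longitude) %>%
  cor(method = "spearman", use = "pairwise.complete.obs") %>%
  round(2)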
city_mean <- clean_df %>%
group_by(city, country, latitude, longitude) %>%
summarize(pm25_avg = mean(pm25_ugm3, na.rm=TRUE)) %>%
ungroup()
map_df <- map_data("world")
ggplot() +
  geom_polygon(data = map_df, aes(x = long, y = lat, group = group), fill = "#f0f0f0", color = "#d9d9d9") +
geom_point(data=city_mean, aes(x=longitude, y=latitude, color=pm25_avg, size=pm25_avg), alpha=0.9) +
scale_color_viridis_c(option="plasma", name="PM2.5 avg") +
  scale_size(range = c(2, 8), guide = "none") +
coord_quickmap() +
theme_minimal() +
labs(title="City Average PM2.5 (1999–2025)", x="", y="")
clean_df <- clean_df %>% filter(!is.na(data_quality))
set.seed(123)
train_idx <- createDataPartition(clean_df$data_quality, p=0.8, list=FALSE)
train <- clean_df[train_idx, ] %>% droplevels()
test <- clean_df[-train_idx, ] %>% droplevels()
cat("Train / Test sizes:", nrow(train), "/", nrow(test), "\n")
## Train / Test sizes: 5186 / 1294
table(train$data_quality); table(test$data_quality)
##
## Good Moderate Poor
## 3934 936 316
##
## Good Moderate Poor
## 983 233 78
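Since createDataPartition stratifies on the outcome, the class mix should be nearly identical across the two splits; comparing proportions directly confirms this (a quick check, not in the original script):

# Class proportions should match between train and test
round(prop.table(table(train$data_quality)), 3)
round(prop.table(table(test$data_quality)), 3)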
lm_fit <- lm(pm25_ugm3 ~ no2_ugm3 + latitude + season + city, data=train)
summary(lm_fit)
##
## Call:
## lm(formula = pm25_ugm3 ~ no2_ugm3 + latitude + season + city,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86.960 -9.206 0.406 9.023 129.196
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -329.27469 15.84574 -20.780 < 2e-16 ***
## no2_ugm3 0.10015 0.02287 4.378 1.22e-05 ***
## latitude 9.96572 0.42539 23.427 < 2e-16 ***
## seasonSpring 10.55571 0.79967 13.200 < 2e-16 ***
## seasonSummer -0.24278 0.81926 -0.296 0.767
## seasonAutumn -11.63281 0.76453 -15.216 < 2e-16 ***
## cityBerlin -172.25148 6.52073 -26.416 < 2e-16 ***
## cityChicago -66.51658 2.36116 -28.171 < 2e-16 ***
## cityDallas 23.52521 2.53712 9.272 < 2e-16 ***
## cityDelhi 170.36328 4.27334 39.866 < 2e-16 ***
## cityHouston 57.23679 3.67380 15.580 < 2e-16 ***
## cityLagos 326.65356 13.36556 24.440 < 2e-16 ***
## cityLondon -158.95266 6.03690 -26.330 < 2e-16 ***
## cityLos Angeles 17.18419 2.14176 8.023 1.26e-15 ***
## cityMexico City 183.26172 8.01249 22.872 < 2e-16 ***
## cityMumbai 232.99353 8.12917 28.661 < 2e-16 ***
## cityNew York -57.43163 2.05359 -27.966 < 2e-16 ***
## cityParis -133.87052 4.97687 -26.899 < 2e-16 ***
## cityPhiladelphia -48.81827 1.83019 -26.674 < 2e-16 ***
## cityPhoenix 14.39198 2.30133 6.254 4.33e-10 ***
## citySan Antonio 57.98366 3.77160 15.374 < 2e-16 ***
## citySan Diego 20.43272 2.53275 8.067 8.86e-16 ***
## citySan Jose -22.50785 1.50922 -14.914 < 2e-16 ***
## citySão Paulo 600.18093 26.14867 22.953 < 2e-16 ***
## cityTokyo NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.47 on 5162 degrees of freedom
## Multiple R-squared: 0.7091, Adjusted R-squared: 0.7078
## F-statistic: 547 on 23 and 5162 DF, p-value: < 2.2e-16
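The NA coefficient for cityTokyo is expected: latitude is constant within each city, so the full set of city dummies plus latitude is linearly dependent, and lm() drops one term. The aliased term can be inspected with base R's alias() (a diagnostic, not part of the original script):

# Show which coefficient is a linear combination of the others
alias(lm_fit)$Complete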
pred_lm <- predict(lm_fit, test)
lm_rmse <- RMSE(pred_lm, test$pm25_ugm3)
lm_mae <- MAE(pred_lm, test$pm25_ugm3)
lm_r2 <- R2(pred_lm, test$pm25_ugm3)
data.frame(Model="Linear Regression", RMSE=lm_rmse, MAE=lm_mae, R2=lm_r2)
## Model RMSE MAE R2
## 1 Linear Regression 20.30573 13.66486 0.7062653
set.seed(123)
rf_reg <- randomForest(pm25_ugm3 ~ no2_ugm3 + latitude + season + city, data=train, ntree=300, importance=TRUE)
print(rf_reg)
##
## Call:
## randomForest(formula = pm25_ugm3 ~ no2_ugm3 + latitude + season + city, data = train, ntree = 300, importance = TRUE)
## Type of random forest: regression
## Number of trees: 300
## No. of variables tried at each split: 1
##
## Mean of squared residuals: 364.5039
## % Var explained: 71.88
varImpReg <- importance(rf_reg)
varImpPlot(rf_reg, main="Random Forest: Regression Variable Importance")
pred_rf <- predict(rf_reg, test)
rf_rmse <- RMSE(pred_rf, test$pm25_ugm3)
rf_mae <- MAE(pred_rf, test$pm25_ugm3)
rf_r2 <- R2(pred_rf, test$pm25_ugm3)
data.frame(Model=c("Linear Regression","Random Forest Regression"),
RMSE=c(lm_rmse, rf_rmse),
MAE =c(lm_mae, rf_mae),
R2 =c(lm_r2, rf_r2))
## Model RMSE MAE R2
## 1 Linear Regression 20.30573 13.66486 0.7062653
## 2 Random Forest Regression 20.02125 13.24618 0.7182331
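With only four predictors, the default regression mtry is floor(4/3) = 1, which the printout above confirms. A small grid search over mtry via randomForest::tuneRF might recover a little more accuracy; a sketch, assuming the same four predictors (the tuning parameters below are untuned guesses):

# Tune mtry on the training predictors
set.seed(123)
tune_res <- randomForest::tuneRF(
  x = train[, c("no2_ugm3", "latitude", "season", "city")],
  y = train$pm25_ugm3,
  mtryStart = 2, ntreeTry = 200, stepFactor = 1.5, improve = 0.01
)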
train_cl <- train %>% select(data_quality, pm25_ugm3, no2_ugm3, latitude, season, city) %>% drop_na() %>% droplevels()
test_cl <- test %>% select(data_quality, pm25_ugm3, no2_ugm3, latitude, season, city) %>% drop_na() %>% droplevels()
tree_fit <- rpart(data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city, data=train_cl, method="class")
rpart.plot(tree_fit, main="Decision Tree: Predicting data_quality", box.palette="Greens", nn=TRUE, shadow.col="gray")
pred_tree <- predict(tree_fit, test_cl, type="class")
conf_tree <- confusionMatrix(pred_tree, test_cl$data_quality)
conf_tree
## Confusion Matrix and Statistics
##
## Reference
## Prediction Good Moderate Poor
## Good 983 233 78
## Moderate 0 0 0
## Poor 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7597
## 95% CI : (0.7354, 0.7827)
## No Information Rate : 0.7597
## P-Value [Acc > NIR] : 0.5152
##
## Kappa : 0
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Good Class: Moderate Class: Poor
## Sensitivity 1.0000 0.0000 0.00000
## Specificity 0.0000 1.0000 1.00000
## Pos Pred Value 0.7597 NaN NaN
## Neg Pred Value NaN 0.8199 0.93972
## Prevalence 0.7597 0.1801 0.06028
## Detection Rate 0.7597 0.0000 0.00000
## Detection Prevalence 1.0000 0.0000 0.00000
## Balanced Accuracy 0.5000 0.5000 0.50000
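The tree degenerates to the majority class: accuracy (0.7597) equals the no-information rate and Kappa is 0, so it has learned nothing beyond "predict Good". One standard remedy in rpart is to rebalance via class priors; a sketch, with the flat priors below an untuned assumption:

# Rebalanced tree with equal class priors (order: Good, Moderate, Poor)
tree_bal <- rpart(
  data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city,
  data = train_cl, method = "class",
  parms = list(prior = c(1, 1, 1) / 3)
)
pred_bal <- predict(tree_bal, test_cl, type = "class")
confusionMatrix(pred_bal, test_cl$data_quality)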
set.seed(123)
rf_clf <- randomForest(data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city, data=train_cl, ntree=300, importance=TRUE)
print(rf_clf)
##
## Call:
## randomForest(formula = data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city, data = train_cl, ntree = 300, importance = TRUE)
## Type of random forest: classification
## Number of trees: 300
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 25.18%
## Confusion matrix:
## Good Moderate Poor class.error
## Good 3876 51 7 0.01474326
## Moderate 930 4 2 0.99572650
## Poor 311 5 0 1.00000000
# Variable importance for classifier
varImpPlot(rf_clf, main="Random Forest: Classification Variable Importance")
pred_rf_cl <- predict(rf_clf, test_cl)
conf_rf_cl <- confusionMatrix(pred_rf_cl, test_cl$data_quality)
conf_rf_cl
## Confusion Matrix and Statistics
##
## Reference
## Prediction Good Moderate Poor
## Good 972 229 75
## Moderate 10 4 3
## Poor 1 0 0
##
## Overall Statistics
##
## Accuracy : 0.7543
## 95% CI : (0.7298, 0.7775)
## No Information Rate : 0.7597
## P-Value [Acc > NIR] : 0.6887
##
## Kappa : 0.011
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: Good Class: Moderate Class: Poor
## Sensitivity 0.98881 0.017167 0.0000000
## Specificity 0.02251 0.987747 0.9991776
## Pos Pred Value 0.76176 0.235294 0.0000000
## Neg Pred Value 0.38889 0.820673 0.9396752
## Prevalence 0.75966 0.180062 0.0602782
## Detection Rate 0.75116 0.003091 0.0000000
## Detection Prevalence 0.98609 0.013138 0.0007728
## Balanced Accuracy 0.50566 0.502457 0.4995888
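The forest fares little better (Kappa ≈ 0.011): with roughly 76% Good labels, its bootstrap samples rarely contain enough Moderate or Poor cases. Stratified sampling with an equal per-class draw is a low-cost remedy; a sketch, where the per-class draw simply follows the rarest class:

# Balanced bootstrap draws: sample the rarest-class count from every class
set.seed(123)
n_min <- min(table(train_cl$data_quality))
rf_bal <- randomForest(
  data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city,
  data = train_cl, ntree = 300,
  strata = train_cl$data_quality,
  sampsize = rep(n_min, nlevels(train_cl$data_quality))
)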
reg_table <- data.frame(
Model=c("Linear Regression","Random Forest (reg)"),
RMSE=c(lm_rmse, rf_rmse),
MAE=c(lm_mae, rf_mae),
R2=c(lm_r2, rf_r2)
)
reg_table %>% knitr::kable(digits=3, caption="Regression model comparison (test set)")
| Model | RMSE | MAE | R2 |
|---|---|---|---|
| Linear Regression | 20.306 | 13.665 | 0.706 |
| Random Forest (reg) | 20.021 | 13.246 | 0.718 |
class_summary <- tibble(
Model = c("Decision Tree","Random Forest (clf)"),
Accuracy = c(conf_tree$overall["Accuracy"], conf_rf_cl$overall["Accuracy"])
)
class_summary %>% knitr::kable(digits=3, caption="Classification model comparison (test set)")
| Model | Accuracy |
|---|---|
| Decision Tree | 0.760 |
| Random Forest (clf) | 0.754 |
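Accuracy alone flatters both classifiers, since always predicting Good already scores 0.76. Kappa and the no-information rate are stored in the confusionMatrix objects and can be appended to the comparison (a small extension of the table above):

# Add chance-corrected agreement and the no-information rate
tibble(
  Model    = c("Decision Tree", "Random Forest (clf)"),
  Accuracy = c(conf_tree$overall["Accuracy"], conf_rf_cl$overall["Accuracy"]),
  Kappa    = c(conf_tree$overall["Kappa"], conf_rf_cl$overall["Kappa"]),
  NIR      = c(conf_tree$overall["AccuracyNull"], conf_rf_cl$overall["AccuracyNull"])
) %>% knitr::kable(digits = 3, caption = "Classification metrics beyond accuracy (test set)")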
Limitations
- The data_quality classes are highly imbalanced (about 76% Good), so both classifiers collapse toward the majority class (Kappa ≈ 0) and fail to beat the no-information rate.
- latitude is constant within each city and therefore collinear with the city dummies; the linear model drops one coefficient (cityTokyo reported as NA).
- The season mapping assumes Northern Hemisphere meteorological seasons, which misrepresents Southern Hemisphere and tropical cities such as São Paulo and Lagos.
- City-month mean imputation smooths within-group variance and can mask genuine gaps in the record.
Future Work
- Rebalance the classification task (class priors, stratified sampsize, or up/down-sampling) and report Kappa and per-class metrics rather than accuracy alone.
- Tune model hyperparameters (e.g., mtry, ntree) with cross-validation.
- Model the temporal structure explicitly (trend and seasonality) rather than treating monthly observations as independent.
This project provides a complete pipeline, from data cleaning and EDA through regression and classification modeling to interpretation. The random forest modestly outperforms linear regression on the PM2.5 prediction task (RMSE 20.02 vs. 20.31; R² 0.718 vs. 0.706), while both data_quality classifiers are dominated by the majority class and do not beat the no-information rate. The analyses nevertheless highlight clear spatial, temporal, and seasonal patterns of PM2.5 and NO₂ across major cities.
sessionInfo()
## R version 4.5.1 (2025-06-13)
## Platform: aarch64-apple-darwin20
## Running under: macOS Sequoia 15.6.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/4.5-arm64/Resources/lib/libRlapack.dylib; LAPACK version 3.12.1
##
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
##
## time zone: Asia/Shanghai
## tzcode source: internal
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] scales_1.4.0 mapdata_2.3.1 maps_3.4.3
## [4] randomForest_4.7-1.2 rpart.plot_3.1.3 rpart_4.1.24
## [7] caret_7.0-1 lattice_0.22-7 corrplot_0.95
## [10] skimr_2.2.1 lubridate_1.9.4 forcats_1.0.1
## [13] stringr_1.5.2 dplyr_1.1.4 purrr_1.2.0
## [16] readr_2.1.6 tidyr_1.3.1 tibble_3.3.0
## [19] ggplot2_4.0.1 tidyverse_2.0.0
##
## loaded via a namespace (and not attached):
## [1] tidyselect_1.2.1 viridisLite_0.4.2 timeDate_4051.111
## [4] farver_2.1.2 S7_0.2.1 fastmap_1.2.0
## [7] pROC_1.19.0.1 digest_0.6.37 timechange_0.3.0
## [10] lifecycle_1.0.4 survival_3.8-3 magrittr_2.0.4
## [13] compiler_4.5.1 rlang_1.1.6 sass_0.4.10
## [16] tools_4.5.1 yaml_2.3.10 data.table_1.17.8
## [19] knitr_1.50 labeling_0.4.3 plyr_1.8.9
## [22] repr_1.1.7 RColorBrewer_1.1-3 withr_3.0.2
## [25] nnet_7.3-20 grid_4.5.1 stats4_4.5.1
## [28] e1071_1.7-16 future_1.68.0 globals_0.18.0
## [31] iterators_1.0.14 MASS_7.3-65 cli_3.6.5
## [34] rmarkdown_2.30 generics_0.1.4 rstudioapi_0.17.1
## [37] future.apply_1.20.0 reshape2_1.4.5 tzdb_0.5.0
## [40] proxy_0.4-27 cachem_1.1.0 splines_4.5.1
## [43] parallel_4.5.1 base64enc_0.1-3 vctrs_0.6.5
## [46] hardhat_1.4.2 Matrix_1.7-3 jsonlite_2.0.0
## [49] hms_1.1.4 listenv_0.10.0 foreach_1.5.2
## [52] gower_1.0.2 jquerylib_0.1.4 recipes_1.3.1
## [55] glue_1.8.0 parallelly_1.45.1 codetools_0.2-20
## [58] stringi_1.8.7 gtable_0.3.6 pillar_1.11.1
## [61] htmltools_0.5.8.1 ipred_0.9-15 lava_1.8.2
## [64] R6_2.6.1 evaluate_1.0.5 bslib_0.9.0
## [67] class_7.3-23 Rcpp_1.1.0 nlme_3.1-168
## [70] prodlim_2025.04.28 mgcv_1.9-3 xfun_0.53
## [73] pkgconfig_2.0.3 ModelMetrics_1.2.2.2