Global Air Quality Analysis & Prediction (1999–2025)
WQD7004 – Programming for Data Science
Group Project Report
This project analyzes monthly PM2.5 and NO₂ levels across
20 major global cities (1999–2025) using EPA & WHO datasets.
options(repos = c(CRAN = "https://cloud.r-project.org"))
required <- c(
"tidyverse","lubridate","skimr","corrplot",
"caret","rpart","rpart.plot","randomForest"
)
to_install <- required[!(required %in% installed.packages()[,"Package"])]
if(length(to_install) > 0){
install.packages(to_install, dependencies = TRUE)
}
lapply(required, library, character.only = TRUE)
## [[1]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[2]]
## [1] "lubridate" "forcats" "stringr" "dplyr" "purrr" "readr"
## [7] "tidyr" "tibble" "ggplot2" "tidyverse" "stats" "graphics"
## [13] "grDevices" "utils" "datasets" "methods" "base"
##
## [[3]]
## [1] "skimr" "lubridate" "forcats" "stringr" "dplyr" "purrr"
## [7] "readr" "tidyr" "tibble" "ggplot2" "tidyverse" "stats"
## [13] "graphics" "grDevices" "utils" "datasets" "methods" "base"
##
## [[4]]
## [1] "corrplot" "skimr" "lubridate" "forcats" "stringr" "dplyr"
## [7] "purrr" "readr" "tidyr" "tibble" "ggplot2" "tidyverse"
## [13] "stats" "graphics" "grDevices" "utils" "datasets" "methods"
## [19] "base"
##
## [[5]]
## [1] "caret" "lattice" "corrplot" "skimr" "lubridate" "forcats"
## [7] "stringr" "dplyr" "purrr" "readr" "tidyr" "tibble"
## [13] "ggplot2" "tidyverse" "stats" "graphics" "grDevices" "utils"
## [19] "datasets" "methods" "base"
##
## [[6]]
## [1] "rpart" "caret" "lattice" "corrplot" "skimr" "lubridate"
## [7] "forcats" "stringr" "dplyr" "purrr" "readr" "tidyr"
## [13] "tibble" "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [19] "utils" "datasets" "methods" "base"
##
## [[7]]
## [1] "rpart.plot" "rpart" "caret" "lattice" "corrplot"
## [6] "skimr" "lubridate" "forcats" "stringr" "dplyr"
## [11] "purrr" "readr" "tidyr" "tibble" "ggplot2"
## [16] "tidyverse" "stats" "graphics" "grDevices" "utils"
## [21] "datasets" "methods" "base"
##
## [[8]]
## [1] "randomForest" "rpart.plot" "rpart" "caret" "lattice"
## [6] "corrplot" "skimr" "lubridate" "forcats" "stringr"
## [11] "dplyr" "purrr" "readr" "tidyr" "tibble"
## [16] "ggplot2" "tidyverse" "stats" "graphics" "grDevices"
## [21] "utils" "datasets" "methods" "base"
df <- read.csv("air_quality_global.csv")
cat("Rows:", nrow(df), "\nColumns:", ncol(df), "\n")
## Rows: 6480
## Columns: 11
head(df)
## city country latitude longitude year month pm25_ugm3 no2_ugm3
## 1 New York USA 40.7128 -74.006 1999 1 18.11 35.98
## 2 New York USA 40.7128 -74.006 1999 2 27.79 17.71
## 3 New York USA 40.7128 -74.006 1999 3 12.05 40.99
## 4 New York USA 40.7128 -74.006 1999 4 35.25 17.18
## 5 New York USA 40.7128 -74.006 1999 5 38.39 25.07
## 6 New York USA 40.7128 -74.006 1999 6 14.89 28.95
## data_quality measurement_method data_source
## 1 Moderate Reference/Equivalent Method EPA_AQS
## 2 Good Reference/Equivalent Method EPA_AQS
## 3 Moderate Reference/Equivalent Method EPA_AQS
## 4 Poor Reference/Equivalent Method EPA_AQS
## 5 Good Reference/Equivalent Method EPA_AQS
## 6 Good Reference/Equivalent Method EPA_AQS
str(df)
## 'data.frame': 6480 obs. of 11 variables:
## $ city : chr "New York" "New York" "New York" "New York" ...
## $ country : chr "USA" "USA" "USA" "USA" ...
## $ latitude : num 40.7 40.7 40.7 40.7 40.7 ...
## $ longitude : num -74 -74 -74 -74 -74 ...
## $ year : int 1999 1999 1999 1999 1999 1999 1999 1999 1999 1999 ...
## $ month : int 1 2 3 4 5 6 7 8 9 10 ...
## $ pm25_ugm3 : num 18.1 27.8 12.1 35.2 38.4 ...
## $ no2_ugm3 : num 36 17.7 41 17.2 25.1 ...
## $ data_quality : chr "Moderate" "Good" "Moderate" "Poor" ...
## $ measurement_method: chr "Reference/Equivalent Method" "Reference/Equivalent Method" "Reference/Equivalent Method" "Reference/Equivalent Method" ...
## $ data_source : chr "EPA_AQS" "EPA_AQS" "EPA_AQS" "EPA_AQS" ...
skim(df)
| Name | df |
| Number of rows | 6480 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| city | 0 | 1 | 5 | 12 | 0 | 20 | 0 |
| country | 0 | 1 | 2 | 7 | 0 | 10 | 0 |
| data_quality | 0 | 1 | 4 | 8 | 0 | 3 | 0 |
| measurement_method | 0 | 1 | 24 | 27 | 0 | 2 | 0 |
| data_source | 0 | 1 | 7 | 12 | 0 | 2 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| latitude | 0 | 1 | 31.54 | 16.60 | -23.55 | 29.24 | 33.75 | 40.14 | 52.52 | ▁▁▂▇▇ |
| longitude | 0 | 1 | -35.88 | 81.51 | -121.89 | -98.65 | -74.59 | 5.89 | 139.65 | ▇▁▃▂▂ |
| year | 0 | 1 | 2012.00 | 7.79 | 1999.00 | 2005.00 | 2012.00 | 2019.00 | 2025.00 | ▇▇▇▇▇ |
| month | 0 | 1 | 6.50 | 3.45 | 1.00 | 3.75 | 6.50 | 9.25 | 12.00 | ▇▅▅▅▇ |
| pm25_ugm3 | 0 | 1 | 40.97 | 36.30 | 5.10 | 19.34 | 29.23 | 46.08 | 274.18 | ▇▁▁▁▁ |
| no2_ugm3 | 0 | 1 | 39.62 | 16.71 | 10.25 | 27.08 | 36.84 | 48.92 | 110.27 | ▆▇▃▁▁ |
clean_df <- df %>%
mutate(
date = make_date(year, month, 1),
city = factor(city),
country = factor(country),
measurement_method = factor(measurement_method),
data_source = factor(data_source),
season = case_when(
month %in% c(12,1,2) ~ "Winter",
month %in% c(3,4,5) ~ "Spring",
month %in% c(6,7,8) ~ "Summer",
TRUE ~ "Autumn"
) |> factor(),
data_quality = factor(data_quality,
levels = c("Good","Moderate","Poor"),
ordered = TRUE)
) %>%
group_by(city, month) %>%
mutate(
pm25_ugm3 = ifelse(is.na(pm25_ugm3),
mean(pm25_ugm3, na.rm = TRUE),
pm25_ugm3),
no2_ugm3 = ifelse(is.na(no2_ugm3),
mean(no2_ugm3, na.rm = TRUE),
no2_ugm3)
) %>%
ungroup()
skim(clean_df)
| Name | clean_df |
| Number of rows | 6480 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| Date | 1 |
| factor | 6 |
| numeric | 6 |
| ________________________ | |
| Group variables | None |
Variable type: Date
| skim_variable | n_missing | complete_rate | min | max | median | n_unique |
|---|---|---|---|---|---|---|
| date | 0 | 1 | 1999-01-01 | 2025-12-01 | 2012-06-16 | 324 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| city | 0 | 1 | FALSE | 20 | Bei: 324, Ber: 324, Chi: 324, Dal: 324 |
| country | 0 | 1 | FALSE | 10 | USA: 3240, Ind: 648, Bra: 324, Chi: 324 |
| data_quality | 0 | 1 | TRUE | 3 | Goo: 4917, Mod: 1169, Poo: 394 |
| measurement_method | 0 | 1 | FALSE | 2 | Fed: 5040, Ref: 1440 |
| data_source | 0 | 1 | FALSE | 2 | EPA: 3240, WHO: 3240 |
| season | 0 | 1 | FALSE | 4 | Aut: 1620, Spr: 1620, Sum: 1620, Win: 1620 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| latitude | 0 | 1 | 31.54 | 16.60 | -23.55 | 29.24 | 33.75 | 40.14 | 52.52 | ▁▁▂▇▇ |
| longitude | 0 | 1 | -35.88 | 81.51 | -121.89 | -98.65 | -74.59 | 5.89 | 139.65 | ▇▁▃▂▂ |
| year | 0 | 1 | 2012.00 | 7.79 | 1999.00 | 2005.00 | 2012.00 | 2019.00 | 2025.00 | ▇▇▇▇▇ |
| month | 0 | 1 | 6.50 | 3.45 | 1.00 | 3.75 | 6.50 | 9.25 | 12.00 | ▇▅▅▅▇ |
| pm25_ugm3 | 0 | 1 | 40.97 | 36.30 | 5.10 | 19.34 | 29.23 | 46.08 | 274.18 | ▇▁▁▁▁ |
| no2_ugm3 | 0 | 1 | 39.62 | 16.71 | 10.25 | 27.08 | 36.84 | 48.92 | 110.27 | ▆▇▃▁▁ |
clean_df %>%
group_by(date) %>%
summarize(pm25 = mean(pm25_ugm3)) %>%
ggplot(aes(date, pm25)) +
geom_line(size=1.1, color="#1A73E8") +
theme_minimal() +
labs(title="Global PM2.5 Trend (1999–2025)",
y="Mean PM2.5 (µg/m³)",
x="Year")
ggplot(clean_df, aes(city, pm25_ugm3, fill=city)) +
geom_boxplot(alpha=.7) +
theme_minimal() +
theme(legend.position = "none") +
labs(title="PM2.5 Distribution Across Cities")
ggplot(clean_df, aes(no2_ugm3, pm25_ugm3, color=city)) +
geom_point(alpha=.45) +
geom_smooth(method="lm", se=FALSE, color="black") +
theme_minimal() +
labs(title="PM2.5 vs NO₂ Levels")
cor_matrix <- clean_df %>%
select(pm25_ugm3, no2_ugm3, latitude, longitude) %>%
cor()
corrplot(cor_matrix, method="color", addCoef.col="black")
set.seed(123)
clean_df <- clean_df %>% filter(!is.na(data_quality))
train_idx <- createDataPartition(clean_df$data_quality, p=0.8, list=FALSE)
train <- clean_df[train_idx, ] %>% droplevels()
test <- clean_df[-train_idx, ] %>% droplevels()
model_reg <- lm(pm25_ugm3 ~ no2_ugm3 + latitude + season + city,
data=train)
summary(model_reg)
##
## Call:
## lm(formula = pm25_ugm3 ~ no2_ugm3 + latitude + season + city,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -86.960 -9.206 0.406 9.023 129.196
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -340.90751 15.83840 -21.524 < 2e-16 ***
## no2_ugm3 0.10015 0.02287 4.378 1.22e-05 ***
## latitude 9.96572 0.42539 23.427 < 2e-16 ***
## seasonSpring 22.18852 0.78926 28.113 < 2e-16 ***
## seasonSummer 11.39003 0.80466 14.155 < 2e-16 ***
## seasonWinter 11.63281 0.76453 15.216 < 2e-16 ***
## cityBerlin -172.25148 6.52073 -26.416 < 2e-16 ***
## cityChicago -66.51658 2.36116 -28.171 < 2e-16 ***
## cityDallas 23.52521 2.53712 9.272 < 2e-16 ***
## cityDelhi 170.36328 4.27334 39.866 < 2e-16 ***
## cityHouston 57.23679 3.67380 15.580 < 2e-16 ***
## cityLagos 326.65356 13.36556 24.440 < 2e-16 ***
## cityLondon -158.95266 6.03690 -26.330 < 2e-16 ***
## cityLos Angeles 17.18419 2.14176 8.023 1.26e-15 ***
## cityMexico City 183.26172 8.01249 22.872 < 2e-16 ***
## cityMumbai 232.99353 8.12917 28.661 < 2e-16 ***
## cityNew York -57.43163 2.05359 -27.966 < 2e-16 ***
## cityParis -133.87052 4.97687 -26.899 < 2e-16 ***
## cityPhiladelphia -48.81827 1.83019 -26.674 < 2e-16 ***
## cityPhoenix 14.39198 2.30133 6.254 4.33e-10 ***
## citySan Antonio 57.98366 3.77160 15.374 < 2e-16 ***
## citySan Diego 20.43272 2.53275 8.067 8.86e-16 ***
## citySan Jose -22.50785 1.50922 -14.914 < 2e-16 ***
## citySão Paulo 600.18093 26.14867 22.953 < 2e-16 ***
## cityTokyo NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19.47 on 5162 degrees of freedom
## Multiple R-squared: 0.7091, Adjusted R-squared: 0.7078
## F-statistic: 547 on 23 and 5162 DF, p-value: < 2.2e-16
pred_reg <- predict(model_reg, test)
cat("RMSE:", RMSE(pred_reg, test$pm25_ugm3), "\n")
## RMSE: 20.30573
cat("MAE :", MAE(pred_reg, test$pm25_ugm3), "\n")
## MAE : 13.66486
cat("R² :", R2(pred_reg, test$pm25_ugm3), "\n")
## R² : 0.7062653
train_mod <- train %>%
select(data_quality, pm25_ugm3, no2_ugm3, latitude, season, city) %>%
drop_na() %>% droplevels()
test_mod <- test %>%
select(data_quality, pm25_ugm3, no2_ugm3, latitude, season, city) %>%
drop_na() %>% droplevels()
model_tree <- rpart(
data_quality ~ pm25_ugm3 + no2_ugm3 + latitude + season + city,
data=train_mod,
method="class"
)
rpart.plot(
model_tree,
main="Decision Tree for Air Quality",
box.palette="Greens",
nn=TRUE,
shadow.col="gray"
)
pred_class <- predict(model_tree, test_mod, type="class")
conf <- confusionMatrix(pred_class, test_mod$data_quality)
conf
## Confusion Matrix and Statistics
##
## Reference
## Prediction Good Moderate Poor
## Good 983 233 78
## Moderate 0 0 0
## Poor 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7597
## 95% CI : (0.7354, 0.7827)
## No Information Rate : 0.7597
## P-Value [Acc > NIR] : 0.5152
##
## Kappa : 0
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Good Class: Moderate Class: Poor
## Sensitivity 1.0000 0.0000 0.00000
## Specificity 0.0000 1.0000 1.00000
## Pos Pred Value 0.7597 NaN NaN
## Neg Pred Value NaN 0.8199 0.93972
## Prevalence 0.7597 0.1801 0.06028
## Detection Rate 0.7597 0.0000 0.00000
## Detection Prevalence 1.0000 0.0000 0.00000
## Balanced Accuracy 0.5000 0.5000 0.50000