Introduction

This project, “Air Quality Trend Analysis and Pollution Level Prediction,” analyzes air quality data across Indian cities (2015–2020). It merges, cleans, and visualizes the data, then applies correlation analysis, linear and polynomial regression, ANOVA, K-Means clustering, classification (KNN and logistic regression), and association rule mining (Apriori) to study pollution trends and predict air quality levels.

Load Required Libraries

options(warn = -1)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(class)
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## 
## Attaching package: 'arules'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
library(lubridate)
library(corrplot)
## corrplot 0.95 loaded

Load All Datasets

city_day <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/city_day.csv")
city_hour <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/city_hour.csv")
station_day <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/station_day.csv")
station_hour <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/station_hour.csv")
stations <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/stations.csv")
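
The absolute Windows paths above are machine-specific. A more portable pattern (a sketch, assuming the CSVs are copied into a local data/ folder relative to the project) would be:

data_dir <- "data"   # hypothetical folder; adjust to wherever the CSVs live
city_day <- read.csv(file.path(data_dir, "city_day.csv"))
stations <- read.csv(file.path(data_dir, "stations.csv"))
# ...and likewise for city_hour.csv, station_day.csv, and station_hour.csv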

Inspect Datasets

cat("City-Day:\n"); str(city_day)
## City-Day:
## 'data.frame':    29531 obs. of  16 variables:
##  $ City      : chr  "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
##  $ Date      : chr  "2015-01-01" "2015-01-02" "2015-01-03" "2015-01-04" ...
##  $ PM2.5     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PM10      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NO        : num  0.92 0.97 17.4 1.7 22.1 ...
##  $ NO2       : num  18.2 15.7 19.3 18.5 21.4 ...
##  $ NOx       : num  17.1 16.5 29.7 18 37.8 ...
##  $ NH3       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CO        : num  0.92 0.97 17.4 1.7 22.1 ...
##  $ SO2       : num  27.6 24.6 29.1 18.6 39.3 ...
##  $ O3        : num  133.4 34.1 30.7 36.1 39.3 ...
##  $ Benzene   : num  0 3.68 6.8 4.43 7.01 5.42 0 0 0 0 ...
##  $ Toluene   : num  0.02 5.5 16.4 10.14 18.89 ...
##  $ Xylene    : num  0 3.77 2.25 1 2.78 1.93 0 0 0 0 ...
##  $ AQI       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AQI_Bucket: chr  "" "" "" "" ...
cat("\nCity-Hour:\n"); str(city_hour)
## 
## City-Hour:
## 'data.frame':    707875 obs. of  16 variables:
##  $ City      : chr  "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
##  $ Datetime  : chr  "2015-01-01 01:00:00" "2015-01-01 02:00:00" "2015-01-01 03:00:00" "2015-01-01 04:00:00" ...
##  $ PM2.5     : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ PM10      : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ NO        : num  1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
##  $ NO2       : num  40 27.8 19.3 16.4 14.9 ...
##  $ NOx       : num  36.37 19.73 11.08 9.2 7.85 ...
##  $ NH3       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ CO        : num  1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
##  $ SO2       : num  122.1 85.9 52.8 39.5 32.6 ...
##  $ O3        : num  NA NA NA 154 NA ...
##  $ Benzene   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Toluene   : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ Xylene    : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ AQI       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AQI_Bucket: chr  "" "" "" "" ...
cat("\nStation-Day:\n"); str(station_day)
## 
## Station-Day:
## 'data.frame':    108035 obs. of  16 variables:
##  $ StationId : chr  "AP001" "AP001" "AP001" "AP001" ...
##  $ Date      : chr  "2017-11-24" "2017-11-25" "2017-11-26" "2017-11-27" ...
##  $ PM2.5     : num  71.4 81.4 78.3 88.8 64.2 ...
##  $ PM10      : num  116 124 129 135 104 ...
##  $ NO        : num  1.75 1.44 1.26 6.6 2.56 5.23 4.69 4.58 7.71 0.97 ...
##  $ NO2       : num  20.6 20.5 26 30.9 28.1 ...
##  $ NOx       : num  12.4 12.1 14.8 21.8 17 ...
##  $ NH3       : num  12.2 10.7 10.3 12.9 11.4 ...
##  $ CO        : num  0.1 0.12 0.14 0.11 0.09 0.16 0.12 0.1 0.1 0.15 ...
##  $ SO2       : num  10.8 15.2 27 33.6 19 ...
##  $ O3        : num  109 127 117 112 138 ...
##  $ Benzene   : num  0.17 0.2 0.22 0.29 0.17 0.21 0.16 0.17 0.25 0.23 ...
##  $ Toluene   : num  5.92 6.5 7.95 7.63 5.02 4.71 3.52 2.85 2.79 3.82 ...
##  $ Xylene    : num  0.1 0.06 0.08 0.12 0.07 0.08 0.06 0.04 0.07 0.04 ...
##  $ AQI       : num  NA 184 197 198 188 173 165 191 191 227 ...
##  $ AQI_Bucket: chr  "" "Moderate" "Moderate" "Moderate" ...
cat("\nStation-Hour:\n"); str(station_hour)
## 
## Station-Hour:
## 'data.frame':    2589083 obs. of  16 variables:
##  $ StationId : chr  "AP001" "AP001" "AP001" "AP001" ...
##  $ Datetime  : chr  "2017-11-24 17:00:00" "2017-11-24 18:00:00" "2017-11-24 19:00:00" "2017-11-24 20:00:00" ...
##  $ PM2.5     : num  60.5 65.5 80 81.5 75.2 ...
##  $ PM10      : num  98 111 132 133 116 ...
##  $ NO        : num  2.35 2.7 2.1 1.95 1.43 0.7 1.05 1.25 0.3 0.8 ...
##  $ NO2       : num  30.8 24.2 25.2 16.2 17.5 ...
##  $ NOx       : num  18.2 15.1 15.2 10.2 10.4 ...
##  $ NH3       : num  8.5 9.77 12.02 11.58 12.03 ...
##  $ CO        : num  0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.3 0.1 ...
##  $ SO2       : num  11.85 13.17 12.08 10.47 9.12 ...
##  $ O3        : num  126 117 99 112 106 ...
##  $ Benzene   : num  0.1 0.1 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.23 ...
##  $ Toluene   : num  6.1 6.25 5.98 6.72 5.75 5.02 5.6 5.55 6.6 6.77 ...
##  $ Xylene    : num  0.1 0.15 0.18 0.1 0.08 0 0.1 0.05 0 0.1 ...
##  $ AQI       : num  NA NA NA NA NA NA NA NA NA NA ...
##  $ AQI_Bucket: chr  "" "" "" "" ...
cat("\nStations:\n"); str(stations)
## 
## Stations:
## 'data.frame':    230 obs. of  5 variables:
##  $ StationId  : chr  "AP001" "AP002" "AP003" "AP004" ...
##  $ StationName: chr  "Secretariat, Amaravati - APPCB" "Anand Kala Kshetram, Rajamahendravaram - APPCB" "Tirumala, Tirupati - APPCB" "PWD Grounds, Vijayawada - APPCB" ...
##  $ City       : chr  "Amaravati" "Rajamahendravaram" "Tirupati" "Vijayawada" ...
##  $ State      : chr  "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
##  $ Status     : chr  "Active" "" "" "" ...

Merge and Clean Datasets

# Convert column names to lowercase for consistency
names(city_day)    <- tolower(names(city_day))
names(city_hour)   <- tolower(names(city_hour))
names(station_day) <- tolower(names(station_day))
names(station_hour) <- tolower(names(station_hour))
names(stations)    <- tolower(names(stations))

# Merge station and city-level data
merged_data <- station_day %>%
  left_join(stations, by = "stationid") %>% 
  drop_na()
  
cat("\nMerged dataset dimensions:\n")
## 
## Merged dataset dimensions:
dim(merged_data)
## [1] 10314    20
glimpse(merged_data)
## Rows: 10,314
## Columns: 20
## $ stationid   <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP0…
## $ date        <chr> "2017-11-25", "2017-11-26", "2017-11-27", "2017-11-28", "2…
## $ pm2.5       <dbl> 81.40, 78.32, 88.76, 64.18, 72.47, 69.80, 73.96, 89.90, 87…
## $ pm10        <dbl> 124.50, 129.06, 135.32, 104.09, 114.84, 114.86, 113.56, 14…
## $ no          <dbl> 1.44, 1.26, 6.60, 2.56, 5.23, 4.69, 4.58, 7.71, 0.97, 4.02…
## $ no2         <dbl> 20.50, 26.00, 30.85, 28.07, 23.20, 20.17, 19.29, 26.19, 21…
## $ nox         <dbl> 12.08, 14.85, 21.77, 17.01, 16.59, 14.54, 13.97, 19.87, 12…
## $ nh3         <dbl> 10.72, 10.28, 12.91, 11.42, 12.25, 10.95, 10.95, 13.12, 14…
## $ co          <dbl> 0.12, 0.14, 0.11, 0.09, 0.16, 0.12, 0.10, 0.10, 0.15, 0.18…
## $ so2         <dbl> 15.24, 26.96, 33.59, 19.00, 10.55, 14.07, 13.90, 19.37, 11…
## $ o3          <dbl> 127.09, 117.44, 111.81, 138.18, 109.74, 118.09, 123.80, 12…
## $ benzene     <dbl> 0.20, 0.22, 0.29, 0.17, 0.21, 0.16, 0.17, 0.25, 0.23, 0.31…
## $ toluene     <dbl> 6.50, 7.95, 7.63, 5.02, 4.71, 3.52, 2.85, 2.79, 3.82, 3.53…
## $ xylene      <dbl> 0.06, 0.08, 0.12, 0.07, 0.08, 0.06, 0.04, 0.07, 0.04, 0.09…
## $ aqi         <dbl> 184, 197, 198, 188, 173, 165, 191, 191, 227, 168, 198, 201…
## $ aqi_bucket  <chr> "Moderate", "Moderate", "Moderate", "Moderate", "Moderate"…
## $ stationname <chr> "Secretariat, Amaravati - APPCB", "Secretariat, Amaravati …
## $ city        <chr> "Amaravati", "Amaravati", "Amaravati", "Amaravati", "Amara…
## $ state       <chr> "Andhra Pradesh", "Andhra Pradesh", "Andhra Pradesh", "And…
## $ status      <chr> "Active", "Active", "Active", "Active", "Active", "Active"…
# Select Useful Columns
df <- merged_data %>%
  select(stationid, city, state, 
         pm2.5, pm10, no2, so2, co, o3, date)

Descriptive Statistics

str(df)
## 'data.frame':    10314 obs. of  10 variables:
##  $ stationid: chr  "AP001" "AP001" "AP001" "AP001" ...
##  $ city     : chr  "Amaravati" "Amaravati" "Amaravati" "Amaravati" ...
##  $ state    : chr  "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
##  $ pm2.5    : num  81.4 78.3 88.8 64.2 72.5 ...
##  $ pm10     : num  124 129 135 104 115 ...
##  $ no2      : num  20.5 26 30.9 28.1 23.2 ...
##  $ so2      : num  15.2 27 33.6 19 10.6 ...
##  $ co       : num  0.12 0.14 0.11 0.09 0.16 0.12 0.1 0.1 0.15 0.18 ...
##  $ o3       : num  127 117 112 138 110 ...
##  $ date     : chr  "2017-11-25" "2017-11-26" "2017-11-27" "2017-11-28" ...
summary(df)
##   stationid             city              state               pm2.5       
##  Length:10314       Length:10314       Length:10314       Min.   :  1.09  
##  Class :character   Class :character   Class :character   1st Qu.: 25.74  
##  Mode  :character   Mode  :character   Mode  :character   Median : 43.40  
##                                                           Mean   : 52.48  
##                                                           3rd Qu.: 66.21  
##                                                           Max.   :734.56  
##       pm10             no2              so2               co        
##  Min.   :  5.77   Min.   :  0.01   Min.   : 0.100   Min.   :0.0000  
##  1st Qu.: 62.69   1st Qu.: 15.22   1st Qu.: 4.370   1st Qu.:0.4200  
##  Median : 98.81   Median : 28.57   Median : 7.860   Median :0.6300  
##  Mean   :108.49   Mean   : 33.19   Mean   : 9.907   Mean   :0.6992  
##  3rd Qu.:137.84   3rd Qu.: 46.17   3rd Qu.:12.810   3rd Qu.:0.9000  
##  Max.   :830.10   Max.   :254.78   Max.   :67.260   Max.   :4.7400  
##        o3             date          
##  Min.   :  0.03   Length:10314      
##  1st Qu.: 18.38   Class :character  
##  Median : 28.24   Mode  :character  
##  Mean   : 32.22                     
##  3rd Qu.: 41.41                     
##  Max.   :162.33
cat("Rows:", nrow(df), " Columns:", ncol(df), "\n")
## Rows: 10314  Columns: 10
# Mean, Median, Mode, SD for PM2.5
mean_pm25 <- mean(df$pm2.5, na.rm = TRUE)
median_pm25 <- median(df$pm2.5, na.rm = TRUE)
sd_pm25 <- sd(df$pm2.5, na.rm = TRUE)
mode_pm25 <- names(sort(table(df$pm2.5), decreasing = TRUE))[1]

cat("Mean PM2.5:", mean_pm25, "\n")
## Mean PM2.5: 52.4821
cat("Median PM2.5:", median_pm25, "\n")
## Median PM2.5: 43.4
cat("Mode PM2.5:", mode_pm25, "\n")
## Mode PM2.5: 34
cat("SD PM2.5:", sd_pm25, "\n")
## SD PM2.5: 43.10114
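
# The same descriptive statistics can be computed for every pollutant in one pass.
# A minimal dplyr sketch (same columns as selected above):
df %>%
  summarise(across(c(pm2.5, pm10, no2, so2, co, o3),
                   list(mean   = ~ mean(.x, na.rm = TRUE),
                        median = ~ median(.x, na.rm = TRUE),
                        sd     = ~ sd(.x, na.rm = TRUE))))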
# Handle Missing Data
colSums(is.na(df))
## stationid      city     state     pm2.5      pm10       no2       so2        co 
##         0         0         0         0         0         0         0         0 
##        o3      date 
##         0         0
# No NAs remain after drop_na() above, so this replacement is only a safeguard
df[is.na(df)] <- 0

# Min–Max Normalization for numeric pollutants
normalize <- function(x){ (x - min(x)) / (max(x) - min(x)) }
df <- df %>%
  mutate(across(c(pm2.5, pm10, no2, so2, co, o3), normalize))
head(df)
##   stationid      city          state      pm2.5      pm10        no2       so2
## 1     AP001 Amaravati Andhra Pradesh 0.10949323 0.1440321 0.08042548 0.2254318
## 2     AP001 Amaravati Andhra Pradesh 0.10529401 0.1495639 0.10201358 0.3999404
## 3     AP001 Amaravati Andhra Pradesh 0.11952772 0.1571579 0.12105036 0.4986599
## 4     AP001 Amaravati Andhra Pradesh 0.08601579 0.1192726 0.11013856 0.2814175
## 5     AP001 Amaravati Andhra Pradesh 0.09731823 0.1323135 0.09102328 0.1555986
## 6     AP001 Amaravati Andhra Pradesh 0.09367800 0.1323378 0.07913020 0.2080107
##           co        o3       date
## 1 0.02531646 0.7828712 2017-11-25
## 2 0.02953586 0.7234134 2017-11-26
## 3 0.02320675 0.6887246 2017-11-27
## 4 0.01898734 0.8512015 2017-11-28
## 5 0.03375527 0.6759704 2017-11-29
## 6 0.02531646 0.7274184 2017-11-30
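
A quick sanity check on the scaling: after min–max normalization each pollutant column should span exactly 0 to 1. A small sketch:

# Range of every normalized pollutant column (expected: 0 and 1)
sapply(df[, c("pm2.5","pm10","no2","so2","co","o3")], range)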

Correlation & Visualization

# Correlation between pollutants
corr_matrix <- cor(df[, c("pm2.5","pm10","no2","so2","co","o3")], use="complete.obs")
print(corr_matrix)
##            pm2.5       pm10        no2       so2          co          o3
## pm2.5 1.00000000 0.88947188 0.55518254 0.1763268 0.608159593 0.046705607
## pm10  0.88947188 1.00000000 0.56060830 0.2098844 0.577965605 0.043487262
## no2   0.55518254 0.56060830 1.00000000 0.1468857 0.482896416 0.006454650
## so2   0.17632683 0.20988439 0.14688573 1.0000000 0.138754697 0.147865878
## co    0.60815959 0.57796560 0.48289642 0.1387547 1.000000000 0.001114814
## o3    0.04670561 0.04348726 0.00645465 0.1478659 0.001114814 1.000000000
corrplot(corr_matrix, method = "number")
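
To read the strongest relationships off the matrix programmatically, the pollutant pairs can be ranked by absolute correlation. A minimal sketch using base R plus dplyr:

# Flatten the correlation matrix, keep each pair once, and sort by strength
corr_pairs <- as.data.frame(as.table(corr_matrix)) %>%
  filter(as.integer(Var1) < as.integer(Var2)) %>%
  arrange(desc(abs(Freq)))
head(corr_pairs)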

Top 10 Polluted Cities by Mean PM2.5 (only nine cities remain after cleaning)

top10 <- df %>%
  group_by(city) %>%
  summarise(mean_pm25 = mean(pm2.5, na.rm = TRUE)) %>%
  arrange(desc(mean_pm25)) %>%
  head(10)
print(top10)
## # A tibble: 9 × 2
##   city          mean_pm25
##   <chr>             <dbl>
## 1 Delhi            0.143 
## 2 Patna            0.0878
## 3 Amritsar         0.0754
## 4 Gurugram         0.0726
## 5 Kolkata          0.0717
## 6 Visakhapatnam    0.0631
## 7 Hyderabad        0.0561
## 8 Amaravati        0.0554
## 9 Chandigarh       0.0524

Bar Chart (Top 10 Polluted Cities)

ggplot(top10, aes(x = reorder(city, -mean_pm25), y = mean_pm25, fill = city)) +
  geom_bar(stat = "identity") +
  labs(title = "Top 10 Polluted Cities (Mean PM2.5)", x = "City", y = "Mean PM2.5") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Boxplot by city

boxplot(pm2.5 ~ city, data = df, main = "PM2.5 by City", las = 2)

PM2.5 Trend Over Time (single city)

ggplot(df %>% filter(city == unique(city)[1]),
       aes(x = as.Date(date), y = pm2.5)) +
  geom_line(color = "blue") + ggtitle("PM2.5 Trend Over Time")

Scatter Plot PM2.5 vs PM10

ggplot(df, aes(x = pm10, y = pm2.5)) +
  geom_point(alpha = 0.3, color = "darkgreen") +
  ggtitle("PM10 vs PM2.5")

Statistical Analysis

# Linear Regression
lm_model <- lm(pm2.5 ~ pm10, data = df)
summary(lm_model)
## 
## Call:
## lm(formula = pm2.5 ~ pm10, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.50744 -0.01100  0.00056  0.01016  0.42074 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0090306  0.0004797  -18.83   <2e-16 ***
## pm10         0.6347307  0.0032114  197.65   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02686 on 10312 degrees of freedom
## Multiple R-squared:  0.7912, Adjusted R-squared:  0.7911 
## F-statistic: 3.907e+04 on 1 and 10312 DF,  p-value: < 2.2e-16
plot(df$pm10, df$pm2.5, col="blue", main="PM10 vs PM2.5")
abline(lm_model, col="red")
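
The fitted line can also be used for prediction on the normalized scale. A small sketch (0.5 is an arbitrary illustrative PM10 value, not taken from the data):

# Predicted normalized PM2.5, with a prediction interval, for a hypothetical PM10 of 0.5
predict(lm_model, newdata = data.frame(pm10 = 0.5), interval = "prediction")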

# Multiple Regression
multi_model <- lm(pm2.5 ~ pm10 + no2 + so2 + co + o3, data = df)
summary(multi_model)
## 
## Call:
## lm(formula = pm2.5 ~ pm10 + no2 + so2 + co + o3, data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.44707 -0.01148  0.00043  0.01048  0.43551 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.0168412  0.0006897 -24.417  < 2e-16 ***
## pm10         0.5620024  0.0041613 135.055  < 2e-16 ***
## no2          0.0349970  0.0035188   9.946  < 2e-16 ***
## so2         -0.0084331  0.0022302  -3.781 0.000157 ***
## co           0.0825269  0.0034825  23.698  < 2e-16 ***
## o3           0.0069227  0.0020965   3.302 0.000963 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.02584 on 10308 degrees of freedom
## Multiple R-squared:  0.8067, Adjusted R-squared:  0.8066 
## F-statistic:  8601 on 5 and 10308 DF,  p-value: < 2.2e-16
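
Because several predictors are correlated with each other (see the correlation matrix above), a multicollinearity check is worthwhile. A sketch, assuming the car package is installed:

# Variance inflation factors; values well above 5 would suggest problematic collinearity
car::vif(multi_model)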

ANOVA

anova_model <- aov(pm2.5 ~ city, data = df)
summary(anova_model)
##                Df Sum Sq Mean Sq F value Pr(>F)    
## city            8  7.416  0.9270   338.8 <2e-16 ***
## Residuals   10305 28.196  0.0027                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
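
The ANOVA only indicates that at least one city's mean PM2.5 differs. A post-hoc Tukey HSD test (base R; city is coerced to a factor when the model is fit) shows which city pairs differ:

# Pairwise differences in mean normalized PM2.5 between cities
tukey_res <- TukeyHSD(anova_model)
head(tukey_res$city)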

Polynomial Regression

poly_model <- lm(pm2.5 ~ poly(pm10, 2), data = df)
summary(poly_model)
## 
## Call:
## lm(formula = pm2.5 ~ poly(pm10, 2), data = df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.58490 -0.01034  0.00014  0.00940  0.42295 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    0.0700671  0.0002629  266.51   <2e-16 ***
## poly(pm10, 2)1 5.3080029  0.0267003  198.80   <2e-16 ***
## poly(pm10, 2)2 0.2940440  0.0267003   11.01   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0267 on 10311 degrees of freedom
## Multiple R-squared:  0.7936, Adjusted R-squared:  0.7935 
## F-statistic: 1.982e+04 on 2 and 10311 DF,  p-value: < 2.2e-16
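
Whether the quadratic term is worth keeping can be tested directly against the simple linear fit with a nested-model F test. A short sketch:

# Compare the linear and quadratic fits of PM2.5 on PM10
anova(lm_model, poly_model)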

Logistic Regression

# Logistic Regression (High vs Low Pollution)
# Label as High (1) when normalized PM2.5 exceeds ~0.07, the mean of the normalized series
df$level_binary <- ifelse(df$pm2.5 > 0.07007, 1, 0)
model_vars <- c("level_binary", "pm10", "no2", "so2", "co", "o3")
df_clean <- df[complete.cases(df[, model_vars]), ]
cat("Class Distribution in Clean Data (1=High, 0=Low):\n")
## Class Distribution in Clean Data (1=High, 0=Low):
print(table(df_clean$level_binary))
## 
##    0    1 
## 6212 4102
log_model <- glm(level_binary ~ pm10 + no2 + so2 + co + o3, 
                 data = df_clean,         # Use the clean, filtered dataset
                 family = binomial(link = "logit"))
summary(log_model)
## 
## Call:
## glm(formula = level_binary ~ pm10 + no2 + so2 + co + o3, family = binomial(link = "logit"), 
##     data = df_clean)
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -8.2672     0.1750 -47.239  < 2e-16 ***
## pm10         46.5406     1.0953  42.493  < 2e-16 ***
## no2           1.9154     0.4534   4.225 2.39e-05 ***
## so2           0.2851     0.2635   1.082    0.279    
## co            7.0440     0.4827  14.593  < 2e-16 ***
## o3            3.0329     0.2533  11.972  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 13863.5  on 10313  degrees of freedom
## Residual deviance:  6101.5  on 10308  degrees of freedom
## AIC: 6113.5
## 
## Number of Fisher Scoring iterations: 7
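
The logistic coefficients are on the log-odds scale; exponentiating them gives odds ratios, which are easier to interpret. A small sketch using Wald confidence intervals:

# Odds ratios with approximate 95% confidence intervals
exp(cbind(OR = coef(log_model), confint.default(log_model)))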

K-Means Clustering

# Cluster the normalized pollutant columns into 3 groups
# (no seed is set here, so cluster assignments can vary between runs)
num_df <- na.omit(df[, c("pm2.5","pm10","no2","so2","co","o3")])
km <- kmeans(num_df, centers = 3)
table(km$cluster)
## 
##    1    2    3 
## 2227 6006 2081
plot(num_df$pm2.5, num_df$pm10, col = km$cluster,
     main = "K-Means Clustering (PM2.5 vs PM10)",
     xlab = "PM2.5", ylab = "PM10")

KNN Classification

# KNN Classification (High vs Low)
df$level_binary <- as.factor(df$level_binary)
index <- createDataPartition(df$level_binary, p = 0.7, list = FALSE)
train <- df[index, ]
test  <- df[-index, ]
train_x <- train[, c("pm2.5","pm10","no2","so2","co","o3")]
test_x  <- test[, c("pm2.5","pm10","no2","so2","co","o3")]
train_y <- train$level_binary
test_y  <- test$level_binary

pred <- knn(train_x, test_x, train_y, k = 3)
table(Predicted=pred, Actual=test_y)
##          Actual
## Predicted    0    1
##         0 1746  100
##         1  117 1130
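
caret's confusionMatrix wraps the same table with accuracy, sensitivity, specificity, and kappa. A sketch (exact numbers will vary because no seed was set before the train/test split):

# Overall accuracy and class-wise metrics for the KNN predictions
confusionMatrix(pred, test_y)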

Seasonal Analysis

pollutants <- c("pm2.5","pm10","no2","so2","co","o3")
df$month <- as.numeric(format(as.Date(df$date), "%m"))
df$season <- ifelse(df$month %in% c(12,1,2), "Winter",
                    ifelse(df$month %in% c(3,4,5), "Summer",
                    ifelse(df$month %in% c(6,7,8,9), "Monsoon", "PostMonsoon")))
df %>%
  group_by(season) %>%
  summarise(across(all_of(pollutants), ~ mean(.x, na.rm = TRUE)))
## # A tibble: 4 × 7
##   season       pm2.5   pm10    no2   so2    co    o3
##   <chr>        <dbl>  <dbl>  <dbl> <dbl> <dbl> <dbl>
## 1 Monsoon     0.0348 0.0730 0.0894 0.121 0.101 0.158
## 2 PostMonsoon 0.0792 0.130  0.133  0.155 0.157 0.186
## 3 Summer      0.0525 0.113  0.110  0.141 0.130 0.218
## 4 Winter      0.104  0.168  0.176  0.162 0.190 0.217
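
A visual complement to the seasonal means is the distribution of PM2.5 within each season. A short sketch:

ggplot(df, aes(x = season, y = pm2.5, fill = season)) +
  geom_boxplot() +
  labs(title = "PM2.5 Distribution by Season", x = "Season", y = "Normalized PM2.5") +
  theme_minimal()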

Association Rule Mining (Apriori)

# Use categorical columns for rules
# Convert continuous PM2.5 values into categorical levels
df$PollutionLevel <- cut(df$pm2.5,
                         breaks = c(-Inf, 0.03, 0.06, Inf),
                         labels = c("Low", "Moderate", "High"))

rules_data <- df[, c("city","state","PollutionLevel")]
trans <- as(rules_data, "transactions")
rules <- apriori(trans, parameter=list(supp=0.05, conf=0.6))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.6    0.1    1 none FALSE            TRUE       5    0.05      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 515 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[20 item(s), 10314 transaction(s)] done [0.00s].
## sorting and recoding items ... [14 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [24 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
inspect(head(rules))
##     lhs                    rhs                    support    confidence
## [1] {state=Punjab}      => {city=Amritsar}        0.06166376 1         
## [2] {city=Amritsar}     => {state=Punjab}         0.06166376 1         
## [3] {city=Amaravati}    => {state=Andhra Pradesh} 0.06263331 1         
## [4] {city=Kolkata}      => {state=West Bengal}    0.10480900 1         
## [5] {state=West Bengal} => {city=Kolkata}         0.10480900 1         
## [6] {state=Delhi}       => {city=Delhi}           0.10917200 1         
##     coverage   lift      count
## [1] 0.06166376 16.216981  636 
## [2] 0.06166376 16.216981  636 
## [3] 0.06263331  5.804164  646 
## [4] 0.10480900  9.541166 1081 
## [5] 0.10480900  9.541166 1081 
## [6] 0.10917200  9.159858 1126
plot(rules, method="graph")
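
The city–state rules above are tautological (each city maps to exactly one state), so the more informative rules are those whose right-hand side is a pollution level. A sketch that filters and ranks them by lift (assuming at least one such rule was found at these thresholds):

# Keep only rules predicting a pollution level, ranked by lift
level_rules <- subset(rules, rhs %pin% "PollutionLevel=")
inspect(head(sort(level_rules, by = "lift"), 5))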