This project, “Air Quality Trend Analysis and Pollution Level Prediction,” analyzes air quality data from Indian cities (2015–2020). It covers data merging, cleaning, and visualization, followed by correlation analysis, regression, ANOVA, K-Means clustering, classification (KNN and logistic regression), and association rule mining (Apriori) to study pollution trends and predict air quality levels.
options(warn = -1)  # suppress warnings in the knitted output
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(class)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
##
## Attaching package: 'arules'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(dplyr)
library(lubridate)
library(corrplot)
## corrplot 0.95 loaded
# Load All Datasets
city_day <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/city_day.csv")
city_hour <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/city_hour.csv")
station_day <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/station_day.csv")
station_hour <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/station_hour.csv")
stations <- read.csv("C:/Users/shant/Downloads/Air Quality Index - Datasets/stations.csv")
cat("City-Day:\n"); str(city_day)
## City-Day:
## 'data.frame': 29531 obs. of 16 variables:
## $ City : chr "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
## $ Date : chr "2015-01-01" "2015-01-02" "2015-01-03" "2015-01-04" ...
## $ PM2.5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ PM10 : num NA NA NA NA NA NA NA NA NA NA ...
## $ NO : num 0.92 0.97 17.4 1.7 22.1 ...
## $ NO2 : num 18.2 15.7 19.3 18.5 21.4 ...
## $ NOx : num 17.1 16.5 29.7 18 37.8 ...
## $ NH3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CO : num 0.92 0.97 17.4 1.7 22.1 ...
## $ SO2 : num 27.6 24.6 29.1 18.6 39.3 ...
## $ O3 : num 133.4 34.1 30.7 36.1 39.3 ...
## $ Benzene : num 0 3.68 6.8 4.43 7.01 5.42 0 0 0 0 ...
## $ Toluene : num 0.02 5.5 16.4 10.14 18.89 ...
## $ Xylene : num 0 3.77 2.25 1 2.78 1.93 0 0 0 0 ...
## $ AQI : num NA NA NA NA NA NA NA NA NA NA ...
## $ AQI_Bucket: chr "" "" "" "" ...
cat("\nCity-Hour:\n"); str(city_hour)
##
## City-Hour:
## 'data.frame': 707875 obs. of 16 variables:
## $ City : chr "Ahmedabad" "Ahmedabad" "Ahmedabad" "Ahmedabad" ...
## $ Datetime : chr "2015-01-01 01:00:00" "2015-01-01 02:00:00" "2015-01-01 03:00:00" "2015-01-01 04:00:00" ...
## $ PM2.5 : num NA NA NA NA NA NA NA NA NA NA ...
## $ PM10 : num NA NA NA NA NA NA NA NA NA NA ...
## $ NO : num 1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
## $ NO2 : num 40 27.8 19.3 16.4 14.9 ...
## $ NOx : num 36.37 19.73 11.08 9.2 7.85 ...
## $ NH3 : num NA NA NA NA NA NA NA NA NA NA ...
## $ CO : num 1 0.02 0.08 0.3 0.12 0.33 0.45 1.03 1.47 2.05 ...
## $ SO2 : num 122.1 85.9 52.8 39.5 32.6 ...
## $ O3 : num NA NA NA 154 NA ...
## $ Benzene : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Toluene : num 0 0 0 0 0 0 0 0 0 0 ...
## $ Xylene : num 0 0 0 0 0 0 0 0 0 0 ...
## $ AQI : num NA NA NA NA NA NA NA NA NA NA ...
## $ AQI_Bucket: chr "" "" "" "" ...
cat("\nStation-Day:\n"); str(station_day)
##
## Station-Day:
## 'data.frame': 108035 obs. of 16 variables:
## $ StationId : chr "AP001" "AP001" "AP001" "AP001" ...
## $ Date : chr "2017-11-24" "2017-11-25" "2017-11-26" "2017-11-27" ...
## $ PM2.5 : num 71.4 81.4 78.3 88.8 64.2 ...
## $ PM10 : num 116 124 129 135 104 ...
## $ NO : num 1.75 1.44 1.26 6.6 2.56 5.23 4.69 4.58 7.71 0.97 ...
## $ NO2 : num 20.6 20.5 26 30.9 28.1 ...
## $ NOx : num 12.4 12.1 14.8 21.8 17 ...
## $ NH3 : num 12.2 10.7 10.3 12.9 11.4 ...
## $ CO : num 0.1 0.12 0.14 0.11 0.09 0.16 0.12 0.1 0.1 0.15 ...
## $ SO2 : num 10.8 15.2 27 33.6 19 ...
## $ O3 : num 109 127 117 112 138 ...
## $ Benzene : num 0.17 0.2 0.22 0.29 0.17 0.21 0.16 0.17 0.25 0.23 ...
## $ Toluene : num 5.92 6.5 7.95 7.63 5.02 4.71 3.52 2.85 2.79 3.82 ...
## $ Xylene : num 0.1 0.06 0.08 0.12 0.07 0.08 0.06 0.04 0.07 0.04 ...
## $ AQI : num NA 184 197 198 188 173 165 191 191 227 ...
## $ AQI_Bucket: chr "" "Moderate" "Moderate" "Moderate" ...
cat("\nStation-Hour:\n"); str(station_hour)
##
## Station-Hour:
## 'data.frame': 2589083 obs. of 16 variables:
## $ StationId : chr "AP001" "AP001" "AP001" "AP001" ...
## $ Datetime : chr "2017-11-24 17:00:00" "2017-11-24 18:00:00" "2017-11-24 19:00:00" "2017-11-24 20:00:00" ...
## $ PM2.5 : num 60.5 65.5 80 81.5 75.2 ...
## $ PM10 : num 98 111 132 133 116 ...
## $ NO : num 2.35 2.7 2.1 1.95 1.43 0.7 1.05 1.25 0.3 0.8 ...
## $ NO2 : num 30.8 24.2 25.2 16.2 17.5 ...
## $ NOx : num 18.2 15.1 15.2 10.2 10.4 ...
## $ NH3 : num 8.5 9.77 12.02 11.58 12.03 ...
## $ CO : num 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.1 0.3 0.1 ...
## $ SO2 : num 11.85 13.17 12.08 10.47 9.12 ...
## $ O3 : num 126 117 99 112 106 ...
## $ Benzene : num 0.1 0.1 0.2 0.2 0.2 0.2 0.2 0.2 0.2 0.23 ...
## $ Toluene : num 6.1 6.25 5.98 6.72 5.75 5.02 5.6 5.55 6.6 6.77 ...
## $ Xylene : num 0.1 0.15 0.18 0.1 0.08 0 0.1 0.05 0 0.1 ...
## $ AQI : num NA NA NA NA NA NA NA NA NA NA ...
## $ AQI_Bucket: chr "" "" "" "" ...
cat("\nStations:\n"); str(stations)
##
## Stations:
## 'data.frame': 230 obs. of 5 variables:
## $ StationId : chr "AP001" "AP002" "AP003" "AP004" ...
## $ StationName: chr "Secretariat, Amaravati - APPCB" "Anand Kala Kshetram, Rajamahendravaram - APPCB" "Tirumala, Tirupati - APPCB" "PWD Grounds, Vijayawada - APPCB" ...
## $ City : chr "Amaravati" "Rajamahendravaram" "Tirupati" "Vijayawada" ...
## $ State : chr "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
## $ Status : chr "Active" "" "" "" ...
# Convert column names to lowercase for consistency
names(city_day) <- tolower(names(city_day))
names(city_hour) <- tolower(names(city_hour))
names(station_day) <- tolower(names(station_day))
names(station_hour)<- tolower(names(station_hour))
names(stations) <- tolower(names(stations))
# Merge daily station-level readings with station metadata (city, state), then drop incomplete rows
merged_data <- station_day %>%
left_join(stations, by = "stationid") %>%
drop_na()
cat("\nMerged dataset dimensions:\n")
##
## Merged dataset dimensions:
dim(merged_data)
## [1] 10314 20
glimpse(merged_data)
## Rows: 10,314
## Columns: 20
## $ stationid <chr> "AP001", "AP001", "AP001", "AP001", "AP001", "AP001", "AP0…
## $ date <chr> "2017-11-25", "2017-11-26", "2017-11-27", "2017-11-28", "2…
## $ pm2.5 <dbl> 81.40, 78.32, 88.76, 64.18, 72.47, 69.80, 73.96, 89.90, 87…
## $ pm10 <dbl> 124.50, 129.06, 135.32, 104.09, 114.84, 114.86, 113.56, 14…
## $ no <dbl> 1.44, 1.26, 6.60, 2.56, 5.23, 4.69, 4.58, 7.71, 0.97, 4.02…
## $ no2 <dbl> 20.50, 26.00, 30.85, 28.07, 23.20, 20.17, 19.29, 26.19, 21…
## $ nox <dbl> 12.08, 14.85, 21.77, 17.01, 16.59, 14.54, 13.97, 19.87, 12…
## $ nh3 <dbl> 10.72, 10.28, 12.91, 11.42, 12.25, 10.95, 10.95, 13.12, 14…
## $ co <dbl> 0.12, 0.14, 0.11, 0.09, 0.16, 0.12, 0.10, 0.10, 0.15, 0.18…
## $ so2 <dbl> 15.24, 26.96, 33.59, 19.00, 10.55, 14.07, 13.90, 19.37, 11…
## $ o3 <dbl> 127.09, 117.44, 111.81, 138.18, 109.74, 118.09, 123.80, 12…
## $ benzene <dbl> 0.20, 0.22, 0.29, 0.17, 0.21, 0.16, 0.17, 0.25, 0.23, 0.31…
## $ toluene <dbl> 6.50, 7.95, 7.63, 5.02, 4.71, 3.52, 2.85, 2.79, 3.82, 3.53…
## $ xylene <dbl> 0.06, 0.08, 0.12, 0.07, 0.08, 0.06, 0.04, 0.07, 0.04, 0.09…
## $ aqi <dbl> 184, 197, 198, 188, 173, 165, 191, 191, 227, 168, 198, 201…
## $ aqi_bucket <chr> "Moderate", "Moderate", "Moderate", "Moderate", "Moderate"…
## $ stationname <chr> "Secretariat, Amaravati - APPCB", "Secretariat, Amaravati …
## $ city <chr> "Amaravati", "Amaravati", "Amaravati", "Amaravati", "Amara…
## $ state <chr> "Andhra Pradesh", "Andhra Pradesh", "Andhra Pradesh", "And…
## $ status <chr> "Active", "Active", "Active", "Active", "Active", "Active"…
# Select Useful Columns
df <- merged_data %>%
select(stationid, city, state,
pm2.5, pm10, no2, so2, co, o3, date)
str(df)
## 'data.frame': 10314 obs. of 10 variables:
## $ stationid: chr "AP001" "AP001" "AP001" "AP001" ...
## $ city : chr "Amaravati" "Amaravati" "Amaravati" "Amaravati" ...
## $ state : chr "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" "Andhra Pradesh" ...
## $ pm2.5 : num 81.4 78.3 88.8 64.2 72.5 ...
## $ pm10 : num 124 129 135 104 115 ...
## $ no2 : num 20.5 26 30.9 28.1 23.2 ...
## $ so2 : num 15.2 27 33.6 19 10.6 ...
## $ co : num 0.12 0.14 0.11 0.09 0.16 0.12 0.1 0.1 0.15 0.18 ...
## $ o3 : num 127 117 112 138 110 ...
## $ date : chr "2017-11-25" "2017-11-26" "2017-11-27" "2017-11-28" ...
summary(df)
## stationid city state pm2.5
## Length:10314 Length:10314 Length:10314 Min. : 1.09
## Class :character Class :character Class :character 1st Qu.: 25.74
## Mode :character Mode :character Mode :character Median : 43.40
## Mean : 52.48
## 3rd Qu.: 66.21
## Max. :734.56
## pm10 no2 so2 co
## Min. : 5.77 Min. : 0.01 Min. : 0.100 Min. :0.0000
## 1st Qu.: 62.69 1st Qu.: 15.22 1st Qu.: 4.370 1st Qu.:0.4200
## Median : 98.81 Median : 28.57 Median : 7.860 Median :0.6300
## Mean :108.49 Mean : 33.19 Mean : 9.907 Mean :0.6992
## 3rd Qu.:137.84 3rd Qu.: 46.17 3rd Qu.:12.810 3rd Qu.:0.9000
## Max. :830.10 Max. :254.78 Max. :67.260 Max. :4.7400
## o3 date
## Min. : 0.03 Length:10314
## 1st Qu.: 18.38 Class :character
## Median : 28.24 Mode :character
## Mean : 32.22
## 3rd Qu.: 41.41
## Max. :162.33
cat("Rows:", nrow(df), " Columns:", ncol(df), "\n")
## Rows: 10314 Columns: 10
# Mean, Median, Mode, SD for PM2.5
mean_pm25 <- mean(df$pm2.5, na.rm = TRUE)
median_pm25 <- median(df$pm2.5, na.rm = TRUE)
sd_pm25 <- sd(df$pm2.5, na.rm = TRUE)
mode_pm25 <- names(sort(table(df$pm2.5), decreasing = TRUE))[1]  # most frequent value, returned as a character string
cat("Mean PM2.5:", mean_pm25, "\n")
## Mean PM2.5: 52.4821
cat("Median PM2.5:", median_pm25, "\n")
## Median PM2.5: 43.4
cat("Mode PM2.5:", mode_pm25, "\n")
## Mode PM2.5: 34
cat("SD PM2.5:", sd_pm25, "\n")
## SD PM2.5: 43.10114
# Handle missing data (drop_na() above already removed incomplete rows, so the zero-replacement below is only a safeguard)
colSums(is.na(df))
## stationid city state pm2.5 pm10 no2 so2 co
## 0 0 0 0 0 0 0 0
## o3 date
## 0 0
df[is.na(df)] <- 0
# Min–Max normalization: rescale each pollutant column to the [0, 1] range
normalize <- function(x){ (x - min(x, na.rm = TRUE)) / (max(x, na.rm = TRUE) - min(x, na.rm = TRUE)) }
df <- df %>%
mutate(across(c(pm2.5, pm10, no2, so2, co, o3), normalize))
head(df)
## stationid city state pm2.5 pm10 no2 so2
## 1 AP001 Amaravati Andhra Pradesh 0.10949323 0.1440321 0.08042548 0.2254318
## 2 AP001 Amaravati Andhra Pradesh 0.10529401 0.1495639 0.10201358 0.3999404
## 3 AP001 Amaravati Andhra Pradesh 0.11952772 0.1571579 0.12105036 0.4986599
## 4 AP001 Amaravati Andhra Pradesh 0.08601579 0.1192726 0.11013856 0.2814175
## 5 AP001 Amaravati Andhra Pradesh 0.09731823 0.1323135 0.09102328 0.1555986
## 6 AP001 Amaravati Andhra Pradesh 0.09367800 0.1323378 0.07913020 0.2080107
## co o3 date
## 1 0.02531646 0.7828712 2017-11-25
## 2 0.02953586 0.7234134 2017-11-26
## 3 0.02320675 0.6887246 2017-11-27
## 4 0.01898734 0.8512015 2017-11-28
## 5 0.03375527 0.6759704 2017-11-29
## 6 0.02531646 0.7274184 2017-11-30
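A quick sanity check (a sketch, not part of the original pipeline): after min–max scaling, every pollutant column should span exactly [0, 1].

# Sanity check (sketch): each normalized pollutant column should range from 0 to 1
sapply(df[, c("pm2.5","pm10","no2","so2","co","o3")], range)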
# Correlation between pollutants
corr_matrix <- cor(df[, c("pm2.5","pm10","no2","so2","co","o3")], use="complete.obs")
print(corr_matrix)
## pm2.5 pm10 no2 so2 co o3
## pm2.5 1.00000000 0.88947188 0.55518254 0.1763268 0.608159593 0.046705607
## pm10 0.88947188 1.00000000 0.56060830 0.2098844 0.577965605 0.043487262
## no2 0.55518254 0.56060830 1.00000000 0.1468857 0.482896416 0.006454650
## so2 0.17632683 0.20988439 0.14688573 1.0000000 0.138754697 0.147865878
## co 0.60815959 0.57796560 0.48289642 0.1387547 1.000000000 0.001114814
## o3 0.04670561 0.04348726 0.00645465 0.1478659 0.001114814 1.000000000
corrplot(corr_matrix, method = "number")
# Mean PM2.5 by city (only 9 cities survive drop_na(), so fewer than 10 appear despite the "Top 10" title)
top10 <- df %>%
group_by(city) %>%
summarise(mean_pm25 = mean(pm2.5, na.rm = TRUE)) %>%
arrange(desc(mean_pm25)) %>%
head(10)
print(top10)
## # A tibble: 9 × 2
## city mean_pm25
## <chr> <dbl>
## 1 Delhi 0.143
## 2 Patna 0.0878
## 3 Amritsar 0.0754
## 4 Gurugram 0.0726
## 5 Kolkata 0.0717
## 6 Visakhapatnam 0.0631
## 7 Hyderabad 0.0561
## 8 Amaravati 0.0554
## 9 Chandigarh 0.0524
ggplot(top10, aes(x = reorder(city, -mean_pm25), y = mean_pm25, fill = city)) +
geom_bar(stat = "identity") +
labs(title = "Top 10 Polluted Cities (Mean PM2.5)", x = "City", y = "Mean PM2.5") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
boxplot(pm2.5 ~ city, data = df, main = "PM2.5 by City", las = 2)
# PM2.5 time series for the first city in the merged data (Amaravati)
ggplot(df %>% filter(city == unique(city)[1]),
aes(x = as.Date(date), y = pm2.5)) +
geom_line(color = "blue") + ggtitle("PM2.5 Trend Over Time")
ggplot(df, aes(x = pm10, y = pm2.5)) +
geom_point(alpha = 0.3, color = "darkgreen") +
ggtitle("PM10 vs PM2.5")
# Linear Regression
lm_model <- lm(pm2.5 ~ pm10, data = df)
summary(lm_model)
##
## Call:
## lm(formula = pm2.5 ~ pm10, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.50744 -0.01100 0.00056 0.01016 0.42074
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0090306 0.0004797 -18.83 <2e-16 ***
## pm10 0.6347307 0.0032114 197.65 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02686 on 10312 degrees of freedom
## Multiple R-squared: 0.7912, Adjusted R-squared: 0.7911
## F-statistic: 3.907e+04 on 1 and 10312 DF, p-value: < 2.2e-16
plot(df$pm10, df$pm2.5, col="blue", main="PM10 vs PM2.5")
abline(lm_model, col="red")
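As an optional follow-up (a sketch, not in the original workflow), base R's diagnostic plots for an lm object help judge whether a single straight-line fit is adequate.

# Residual diagnostics for the simple linear model (sketch)
par(mfrow = c(2, 2))
plot(lm_model)            # residuals vs fitted, normal Q-Q, scale-location, leverage
par(mfrow = c(1, 1))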
# Multiple Regression
multi_model <- lm(pm2.5 ~ pm10 + no2 + so2 + co + o3, data = df)
summary(multi_model)
##
## Call:
## lm(formula = pm2.5 ~ pm10 + no2 + so2 + co + o3, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.44707 -0.01148 0.00043 0.01048 0.43551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.0168412 0.0006897 -24.417 < 2e-16 ***
## pm10 0.5620024 0.0041613 135.055 < 2e-16 ***
## no2 0.0349970 0.0035188 9.946 < 2e-16 ***
## so2 -0.0084331 0.0022302 -3.781 0.000157 ***
## co 0.0825269 0.0034825 23.698 < 2e-16 ***
## o3 0.0069227 0.0020965 3.302 0.000963 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.02584 on 10308 degrees of freedom
## Multiple R-squared: 0.8067, Adjusted R-squared: 0.8066
## F-statistic: 8601 on 5 and 10308 DF, p-value: < 2.2e-16
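Because PM10, NO2, and CO are themselves correlated (see the correlation matrix above), a multicollinearity check is a sensible addition; the sketch below assumes the car package is installed (it is not loaded earlier).

# Variance inflation factors for the multiple regression (sketch; assumes the car package is available)
# VIFs well above 5-10 would indicate problematic multicollinearity
car::vif(multi_model)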
# One-way ANOVA: does mean PM2.5 differ across cities?
anova_model <- aov(pm2.5 ~ city, data = df)
summary(anova_model)
## Df Sum Sq Mean Sq F value Pr(>F)
## city 8 7.416 0.9270 338.8 <2e-16 ***
## Residuals 10305 28.196 0.0027
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
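Since the ANOVA shows a significant city effect, a post-hoc test indicates which city pairs actually differ; a minimal sketch using base R's TukeyHSD().

# Post-hoc pairwise comparisons of city means (sketch)
tukey_res <- TukeyHSD(anova_model)
head(tukey_res$city)      # first few city pairs with adjusted p-values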
# Polynomial Regression (degree-2 fit of PM2.5 on PM10)
poly_model <- lm(pm2.5 ~ poly(pm10, 2), data = df)
summary(poly_model)
##
## Call:
## lm(formula = pm2.5 ~ poly(pm10, 2), data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58490 -0.01034 0.00014 0.00940 0.42295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0700671 0.0002629 266.51 <2e-16 ***
## poly(pm10, 2)1 5.3080029 0.0267003 198.80 <2e-16 ***
## poly(pm10, 2)2 0.2940440 0.0267003 11.01 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0267 on 10311 degrees of freedom
## Multiple R-squared: 0.7936, Adjusted R-squared: 0.7935
## F-statistic: 1.982e+04 on 2 and 10311 DF, p-value: < 2.2e-16
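The linear model is nested inside the quadratic one, so an F-test via anova() shows whether the extra polynomial term is worth keeping; a short sketch.

# Nested-model F-test: does the quadratic term improve on the straight-line fit? (sketch)
anova(lm_model, poly_model)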
# Logistic Regression (High vs Low Pollution)
# Cut-off 0.07007 is the mean of the normalized PM2.5 values: 1 = above-average pollution, 0 = below-average
df$level_binary <- ifelse(df$pm2.5 > 0.07007, 1, 0)
model_vars <- c("level_binary", "pm10", "no2", "so2", "co", "o3")
df_clean <- df[complete.cases(df[, model_vars]), ]
cat("Class Distribution in Clean Data (1=High, 0=Low):\n")
## Class Distribution in Clean Data (1=High, 0=Low):
print(table(df_clean$level_binary))
##
## 0 1
## 6212 4102
log_model <- glm(level_binary ~ pm10 + no2 + so2 + co + o3,
data = df_clean, # Use the clean, filtered dataset
family = binomial(link = "logit"))
summary(log_model)
##
## Call:
## glm(formula = level_binary ~ pm10 + no2 + so2 + co + o3, family = binomial(link = "logit"),
## data = df_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.2672 0.1750 -47.239 < 2e-16 ***
## pm10 46.5406 1.0953 42.493 < 2e-16 ***
## no2 1.9154 0.4534 4.225 2.39e-05 ***
## so2 0.2851 0.2635 1.082 0.279
## co 7.0440 0.4827 14.593 < 2e-16 ***
## o3 3.0329 0.2533 11.972 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13863.5 on 10313 degrees of freedom
## Residual deviance: 6101.5 on 10308 degrees of freedom
## AIC: 6113.5
##
## Number of Fisher Scoring iterations: 7
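The logistic model above is fitted but never evaluated; a minimal in-sample evaluation sketch, assuming a 0.5 probability cut-off.

# In-sample evaluation of the logistic model (sketch; 0.5 cut-off is an assumption)
pred_prob  <- predict(log_model, type = "response")
pred_class <- ifelse(pred_prob > 0.5, 1, 0)
table(Predicted = pred_class, Actual = df_clean$level_binary)
mean(pred_class == df_clean$level_binary)    # overall accuracy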
# K-Means clustering (k = 3) on the normalized pollutant columns
# kmeans() starts from random centers, so cluster labels and sizes can change between runs unless a seed is set
num_df <- na.omit(df[, c("pm2.5","pm10","no2","so2","co","o3")])
km <- kmeans(num_df, centers = 3)
table(km$cluster)
##
## 1 2 3
## 2227 6006 2081
plot(num_df$pm2.5, num_df$pm10, col = km$cluster,
main = "K-Means Clustering (PM2.5 vs PM10)",
xlab = "PM2.5", ylab = "PM10")
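factoextra is loaded at the top but never used; one natural use is checking how many clusters the data support before fixing k = 3. The sketch below applies the elbow (within-cluster sum of squares) method to a subsample purely to keep the plot fast; the seed and sample size are assumptions.

# Elbow plot to help choose k (sketch; factoextra is already loaded)
set.seed(123)                                  # assumed seed, for reproducibility
sample_rows <- sample(nrow(num_df), 2000)      # assumed subsample size, to keep this quick
fviz_nbclust(num_df[sample_rows, ], kmeans, method = "wss", k.max = 8)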
# KNN Classification (High vs Low)
df$level_binary <- as.factor(df$level_binary)
index <- createDataPartition(df$level_binary, p = 0.7, list = FALSE)
train <- df[index, ]
test <- df[-index, ]
# Note: pm2.5 itself is kept as a predictor even though level_binary was derived from it, so the accuracy below is optimistic
train_x <- train[, c("pm2.5","pm10","no2","so2","co","o3")]
test_x <- test[, c("pm2.5","pm10","no2","so2","co","o3")]
train_y <- train$level_binary
test_y <- test$level_binary
pred <- knn(train_x, test_x, train_y, k = 3)
table(Predicted=pred, Actual=test_y)
## Actual
## Predicted 0 1
## 0 1746 100
## 1 117 1130
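A brief evaluation sketch for the KNN output, using caret's confusionMatrix() (caret is already loaded); a simple overall accuracy figure follows from the same objects.

# Accuracy and class-level metrics for the KNN predictions (sketch)
confusionMatrix(pred, test_y)
mean(pred == test_y)      # overall accuracy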
pollutants <- c("pm2.5","pm10","no2","so2","co","o3")
# Derive month and season from the date (Winter = Dec-Feb, Summer = Mar-May, Monsoon = Jun-Sep, PostMonsoon = Oct-Nov)
df$month <- as.numeric(format(as.Date(df$date), "%m"))
df$season <- ifelse(df$month %in% c(12,1,2), "Winter",
ifelse(df$month %in% c(3,4,5), "Summer",
ifelse(df$month %in% c(6,7,8,9), "Monsoon", "PostMonsoon")))
df %>%
group_by(season) %>%
summarise(across(all_of(pollutants), ~ mean(.x, na.rm = TRUE)))
## # A tibble: 4 × 7
## season pm2.5 pm10 no2 so2 co o3
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Monsoon 0.0348 0.0730 0.0894 0.121 0.101 0.158
## 2 PostMonsoon 0.0792 0.130 0.133 0.155 0.157 0.186
## 3 Summer 0.0525 0.113 0.110 0.141 0.130 0.218
## 4 Winter 0.104 0.168 0.176 0.162 0.190 0.217
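A quick visual of the seasonal pattern (a sketch; plotting only PM2.5 is an illustrative choice).

# Seasonal mean PM2.5 (normalized scale) as a bar chart (sketch)
df %>%
group_by(season) %>%
summarise(mean_pm25 = mean(pm2.5, na.rm = TRUE)) %>%
ggplot(aes(x = season, y = mean_pm25, fill = season)) +
geom_col() +
labs(title = "Mean PM2.5 by Season (Normalized)", x = "Season", y = "Mean PM2.5") +
theme_minimal()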
# Use categorical columns for rules
# Convert the continuous (normalized) PM2.5 values into categorical levels; breaks are on the 0–1 normalized scale
df$PollutionLevel <- cut(df$pm2.5,
breaks = c(-Inf, 0.03, 0.06, Inf),
labels = c("Low", "Moderate", "High"))
rules_data <- df[, c("city","state","PollutionLevel")]
rules_data[] <- lapply(rules_data, as.factor)   # arules expects factor columns
trans <- as(rules_data, "transactions")
rules <- apriori(trans, parameter=list(supp=0.05, conf=0.6))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.6 0.1 1 none FALSE TRUE 5 0.05 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 515
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[20 item(s), 10314 transaction(s)] done [0.00s].
## sorting and recoding items ... [14 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [24 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
inspect(head(rules))
## lhs rhs support confidence
## [1] {state=Punjab} => {city=Amritsar} 0.06166376 1
## [2] {city=Amritsar} => {state=Punjab} 0.06166376 1
## [3] {city=Amaravati} => {state=Andhra Pradesh} 0.06263331 1
## [4] {city=Kolkata} => {state=West Bengal} 0.10480900 1
## [5] {state=West Bengal} => {city=Kolkata} 0.10480900 1
## [6] {state=Delhi} => {city=Delhi} 0.10917200 1
## coverage lift count
## [1] 0.06166376 16.216981 636
## [2] 0.06166376 16.216981 636
## [3] 0.06263331 5.804164 646
## [4] 0.10480900 9.541166 1081
## [5] 0.10480900 9.541166 1081
## [6] 0.10917200 9.159858 1126
plot(rules, method="graph")
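The rules above are listed in the order Apriori generated them; sorting by lift surfaces the strongest associations, and subsetting on the consequent isolates rules that actually predict a pollution level (if any clear the support and confidence thresholds). A sketch:

# Strongest rules by lift, then rules whose right-hand side is a pollution level (sketch)
inspect(head(sort(rules, by = "lift"), 5))
pollution_rules <- subset(rules, rhs %pin% "PollutionLevel")
inspect(head(sort(pollution_rules, by = "confidence"), 5))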