##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyselect)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.2 ✔ tidyr 1.3.2
## ✔ readr 2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(readxl)
library(corrplot)
## corrplot 0.95 loaded
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(GGally)
#Level 1: Basic importing and cleaning
db<- read_excel("C:\\Users\\VICTUS\\Desktop\\airdataset.xlsx")
## New names:
## • `` -> `...17`
## # A tibble: 6 × 25
## City AQI PM2.5 PM10 NO2 CO SO2 O3 `Temperature (°C)`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Rajkot 59 227. 160. 44.8 5.99 37.1 58.1 11.9
## 2 Bangalore 443 220. 280. 78.2 5.06 27.1 40.8 16.7
## 3 Bhopal 56 94.8 262. 70.0 6.59 34.6 42.7 30.4
## 4 Srinagar 342 121. 213. 71.8 4.44 32.3 97.0 32.7
## 5 Hyderabad 492 184. 165. 16.2 0.554 6.72 22.8 12.5
## 6 Bangalore 235 73.9 116. 68.1 7.37 36.7 44.8 41.2
## # ℹ 16 more variables: `Humidity (%)` <dbl>, `Wind Speed (km/h)` <dbl>,
## # `Rainfall (mm)` <dbl>, `Pressure (hPa)` <dbl>, `Vehicle Count` <dbl>,
## # `Industrial Activity Index` <dbl>, `Health Impact Score` <dbl>,
## # ...17 <lgl>, age <dbl>, age_group <chr>, gender <chr>, disease_name <chr>,
## # disease_category <chr>, severity <chr>, smoking_status <chr>,
## # blood_group <chr>
## tibble [1,194 × 25] (S3: tbl_df/tbl/data.frame)
## $ City : chr [1:1194] "Rajkot" "Bangalore" "Bhopal" "Srinagar" ...
## $ AQI : num [1:1194] 59 443 56 342 492 235 421 476 183 259 ...
## $ PM2.5 : num [1:1194] 227.4 219.7 94.8 121 184.3 ...
## $ PM10 : num [1:1194] 160 280 262 213 165 ...
## $ NO2 : num [1:1194] 44.8 78.2 70 71.8 16.2 ...
## $ CO : num [1:1194] 5.986 5.062 6.589 4.443 0.554 ...
## $ SO2 : num [1:1194] 37.13 27.14 34.63 32.3 6.72 ...
## $ O3 : num [1:1194] 58.1 40.8 42.7 97 22.8 ...
## $ Temperature (°C) : num [1:1194] 11.9 16.7 30.4 32.7 12.5 ...
## $ Humidity (%) : num [1:1194] 64.4 23.1 63.6 69.7 93.2 ...
## $ Wind Speed (km/h) : num [1:1194] 12.9 19.4 13.4 19.2 10.4 ...
## $ Rainfall (mm) : num [1:1194] 14.5 34.8 194.4 233.9 91.8 ...
## $ Pressure (hPa) : num [1:1194] 976 1022 985 950 1021 ...
## $ Vehicle Count : num [1:1194] 198941 35588 430785 209382 257650 ...
## $ Industrial Activity Index: num [1:1194] 4.86 5.26 3.8 4.43 9.35 ...
## $ Health Impact Score : num [1:1194] 10 10 10 10 10 10 10 10 10 10 ...
## $ ...17 : logi [1:1194] NA NA NA NA NA NA ...
## $ age : num [1:1194] 41 51 39 56 26 43 73 71 52 29 ...
## $ age_group : chr [1:1194] "40-49" "50-59" "30-39" "50-59" ...
## $ gender : chr [1:1194] "Male" "Female" "Female" "Male" ...
## $ disease_name : chr [1:1194] "Swine Flu (H1N1)" "Swine Flu (H1N1)" "COPD" "Stroke" ...
## $ disease_category : chr [1:1194] "Respiratory" "Respiratory" "Respiratory" "Non-Communicable" ...
## $ severity : chr [1:1194] "Critical" "Severe" "Severe" "Severe" ...
## $ smoking_status : chr [1:1194] "Former" "Current" "Never" "Current" ...
## $ blood_group : chr [1:1194] "O+" "B+" "A+" "A+" ...
## City AQI PM2.5 PM10
## Length:1194 Min. : 50.0 Min. : 10.18 Min. : 20.10
## Class :character 1st Qu.:151.0 1st Qu.: 67.84 1st Qu.: 94.71
## Mode :character Median :276.0 Median :125.80 Median :165.70
## Mean :271.4 Mean :128.06 Mean :163.14
## 3rd Qu.:377.8 3rd Qu.:187.68 3rd Qu.:230.02
## Max. :499.0 Max. :249.99 Max. :299.94
## NO2 CO SO2 O3
## Min. : 5.038 Min. : 0.1017 Min. : 2.037 Min. : 5.147
## 1st Qu.:22.251 1st Qu.: 2.8110 1st Qu.:12.983 1st Qu.:29.983
## Median :43.100 Median : 5.1492 Median :25.466 Median :53.548
## Mean :42.696 Mean : 5.1015 Mean :25.555 Mean :53.387
## 3rd Qu.:62.056 3rd Qu.: 7.3267 3rd Qu.:37.304 3rd Qu.:77.049
## Max. :79.959 Max. : 9.9999 Max. :49.973 Max. :99.978
## Temperature (°C) Humidity (%) Wind Speed (km/h) Rainfall (mm)
## Min. :10.05 Min. :20.03 Min. : 0.5004 Min. : 0.5863
## 1st Qu.:17.88 1st Qu.:43.64 1st Qu.: 5.2455 1st Qu.: 75.5070
## Median :27.34 Median :63.21 Median :10.1733 Median :150.6521
## Mean :27.19 Mean :61.65 Mean :10.1204 Mean :151.3860
## 3rd Qu.:36.30 3rd Qu.:81.05 3rd Qu.:14.8117 3rd Qu.:227.7400
## Max. :44.99 Max. :99.93 Max. :19.9749 Max. :299.0146
## Pressure (hPa) Vehicle Count Industrial Activity Index
## Min. : 950.1 Min. : 1653 Min. :0.0000586
## 1st Qu.: 975.8 1st Qu.:116002 1st Qu.:2.1675619
## Median :1001.1 Median :247869 Median :4.7733041
## Mean :1000.7 Mean :248505 Mean :4.8326865
## 3rd Qu.:1026.4 3rd Qu.:377682 3rd Qu.:7.3468181
## Max. :1049.8 Max. :499353 Max. :9.9805916
## Health Impact Score ...17 age age_group
## Min. :10 Mode:logical Min. : 0.00 Length:1194
## 1st Qu.:10 NA's:1194 1st Qu.:40.00 Class :character
## Median :10 Median :54.00 Mode :character
## Mean :10 Mean :53.19
## 3rd Qu.:10 3rd Qu.:68.00
## Max. :10 Max. :95.00
## gender disease_name disease_category severity
## Length:1194 Length:1194 Length:1194 Length:1194
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## smoking_status blood_group
## Length:1194 Length:1194
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## [1] "City" "AQI"
## [3] "PM2.5" "PM10"
## [5] "NO2" "CO"
## [7] "SO2" "O3"
## [9] "Temperature (°C)" "Humidity (%)"
## [11] "Wind Speed (km/h)" "Rainfall (mm)"
## [13] "Pressure (hPa)" "Vehicle Count"
## [15] "Industrial Activity Index" "Health Impact Score"
## [17] "...17" "age"
## [19] "age_group" "gender"
## [21] "disease_name" "disease_category"
## [23] "severity" "smoking_status"
## [25] "blood_group"
db_clean <- db %>%
select(!contains("Unnamed")) %>%
rename(
temp_c = `Temperature (°C)`,
humidity_pct = `Humidity (%)`,
wind_speed_kmh = `Wind Speed (km/h)`,
rainfall_mm = `Rainfall (mm)`,
pressure_hpa = `Pressure (hPa)`
) %>%
rename_with(tolower)
db_clean
## # A tibble: 1,194 × 25
## city aqi pm2.5 pm10 no2 co so2 o3 temp_c humidity_pct
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Rajkot 59 227. 160. 44.8 5.99 37.1 58.1 11.9 64.4
## 2 Bangalore 443 220. 280. 78.2 5.06 27.1 40.8 16.7 23.1
## 3 Bhopal 56 94.8 262. 70.0 6.59 34.6 42.7 30.4 63.6
## 4 Srinagar 342 121. 213. 71.8 4.44 32.3 97.0 32.7 69.7
## 5 Hyderabad 492 184. 165. 16.2 0.554 6.72 22.8 12.5 93.2
## 6 Bangalore 235 73.9 116. 68.1 7.37 36.7 44.8 41.2 65.3
## 7 Jaipur 421 146. 180. 26.2 8.98 4.55 72.1 14.9 74.7
## 8 Nashik 476 126. 43.9 61.0 9.91 9.94 8.63 29.0 74.9
## 9 Chennai 183 220. 145. 55.0 4.57 13.3 54.4 14.6 22.3
## 10 Srinagar 259 24.8 150. 41.9 5.27 39.2 91.4 15.2 28.1
## # ℹ 1,184 more rows
## # ℹ 15 more variables: wind_speed_kmh <dbl>, rainfall_mm <dbl>,
## # pressure_hpa <dbl>, `vehicle count` <dbl>,
## # `industrial activity index` <dbl>, `health impact score` <dbl>,
## # ...17 <lgl>, age <dbl>, age_group <chr>, gender <chr>, disease_name <chr>,
## # disease_category <chr>, severity <chr>, smoking_status <chr>,
## # blood_group <chr>
#Removing missing values
db_clean <- db_clean %>%
drop_na(city, aqi, pm2.5, age, disease_name)
#Converting Categorical Data to Factors
db_final <- db_clean %>%
mutate(
gender = as.factor(gender),
disease_category = as.factor(disease_category),
severity = factor(severity, levels = c("Mild", "Moderate", "Severe", "Critical"), ordered = TRUE),
smoking_status = as.factor(smoking_status)
)
db_final <-db_clean %>%
select(where(~!all(is.na(.))))
summary(db_final)
## city aqi pm2.5 pm10
## Length:1194 Min. : 50.0 Min. : 10.18 Min. : 20.10
## Class :character 1st Qu.:151.0 1st Qu.: 67.84 1st Qu.: 94.71
## Mode :character Median :276.0 Median :125.80 Median :165.70
## Mean :271.4 Mean :128.06 Mean :163.14
## 3rd Qu.:377.8 3rd Qu.:187.68 3rd Qu.:230.02
## Max. :499.0 Max. :249.99 Max. :299.94
## no2 co so2 o3
## Min. : 5.038 Min. : 0.1017 Min. : 2.037 Min. : 5.147
## 1st Qu.:22.251 1st Qu.: 2.8110 1st Qu.:12.983 1st Qu.:29.983
## Median :43.100 Median : 5.1492 Median :25.466 Median :53.548
## Mean :42.696 Mean : 5.1015 Mean :25.555 Mean :53.387
## 3rd Qu.:62.056 3rd Qu.: 7.3267 3rd Qu.:37.304 3rd Qu.:77.049
## Max. :79.959 Max. : 9.9999 Max. :49.973 Max. :99.978
## temp_c humidity_pct wind_speed_kmh rainfall_mm
## Min. :10.05 Min. :20.03 Min. : 0.5004 Min. : 0.5863
## 1st Qu.:17.88 1st Qu.:43.64 1st Qu.: 5.2455 1st Qu.: 75.5070
## Median :27.34 Median :63.21 Median :10.1733 Median :150.6521
## Mean :27.19 Mean :61.65 Mean :10.1204 Mean :151.3860
## 3rd Qu.:36.30 3rd Qu.:81.05 3rd Qu.:14.8117 3rd Qu.:227.7400
## Max. :44.99 Max. :99.93 Max. :19.9749 Max. :299.0146
## pressure_hpa vehicle count industrial activity index
## Min. : 950.1 Min. : 1653 Min. :0.0000586
## 1st Qu.: 975.8 1st Qu.:116002 1st Qu.:2.1675619
## Median :1001.1 Median :247869 Median :4.7733041
## Mean :1000.7 Mean :248505 Mean :4.8326865
## 3rd Qu.:1026.4 3rd Qu.:377682 3rd Qu.:7.3468181
## Max. :1049.8 Max. :499353 Max. :9.9805916
## health impact score age age_group gender
## Min. :10 Min. : 0.00 Length:1194 Length:1194
## 1st Qu.:10 1st Qu.:40.00 Class :character Class :character
## Median :10 Median :54.00 Mode :character Mode :character
## Mean :10 Mean :53.19
## 3rd Qu.:10 3rd Qu.:68.00
## Max. :10 Max. :95.00
## disease_name disease_category severity smoking_status
## Length:1194 Length:1194 Length:1194 Length:1194
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## blood_group
## Length:1194
## Class :character
## Mode :character
##
##
##
## [1] "city" "aqi"
## [3] "pm2.5" "pm10"
## [5] "no2" "co"
## [7] "so2" "o3"
## [9] "temp_c" "humidity_pct"
## [11] "wind_speed_kmh" "rainfall_mm"
## [13] "pressure_hpa" "vehicle count"
## [15] "industrial activity index" "health impact score"
## [17] "age" "age_group"
## [19] "gender" "disease_name"
## [21] "disease_category" "severity"
## [23] "smoking_status" "blood_group"
#Level 2
##1 Average AQI across all cities
#We calculate the average AQI to understand the overall pollution level across all cities. This gives us a general idea of how clean or polluted the air is on average.
mean(db_final$aqi, na.rm = TRUE)
## [1] 271.4456
##2 Distribution of Temp and Humidity
#We plot how temperature and humidity values are spread out. This helps us see whether most days are similar or if there are extreme conditions.
ggplot(db_final) + geom_density(aes(x = temp_c), fill="red", alpha=0.3) + labs(title="Temp Distribution")

ggplot(db_final) + geom_density(aes(x = humidity_pct), fill="blue", alpha=0.3) + labs(title="Humidity Distribution")

##3 Cities with "Good" AQI (< 50)
#We filter cities where AQI is less than 50 to find places with clean air. This helps us identify healthier environments.
db_final %>% filter(aqi < 50) %>% select(city) %>% unique()
## # A tibble: 0 × 1
## # ℹ 1 variable: city <chr>
##4 AQI Skewness (Histogram)
#We draw a histogram to see how AQI values are distributed. This shows whether most cities have low, moderate, or high pollution.
ggplot(db_final, aes(x = aqi)) + geom_histogram(bins = 30, fill = "steelblue", color = "white") + labs(title="AQI Histogram")

##5 City with highest Ozone (O3)
#We find the city with the highest ozone level. This helps identify areas where harmful gases are most concentrated.
db_final %>% slice_max(o3, n = 1) %>% select(city, o3)
## # A tibble: 1 × 2
## city o3
## <chr> <dbl>
## 1 Thane 100.0
##6 Most common disease
#We count which disease appears most often. This helps us understand which health issue is most linked with pollution.
db_final %>% count(disease_name, sort = TRUE) %>% head(1)
## # A tibble: 1 × 2
## disease_name n
## <chr> <int>
## 1 Lung Cancer 274
##7 Gender distribution
#We check how many males and females are in the dataset. This ensures our data is balanced and helps in further comparisons
db_final %>% count(gender)
## # A tibble: 3 × 2
## gender n
## <chr> <int>
## 1 Female 573
## 2 Male 605
## 3 Other 16
##8 Most frequent age group
#We find which age group appears the most. This tells us which group is most represented or possibly most affected.
db_final %>% count(age_group, sort = TRUE) %>% head(1)
## # A tibble: 1 × 2
## age_group n
## <chr> <int>
## 1 50-59 228
##9 PM10 vs AQI Variation by City
#We compare PM10 levels with AQI for each city. This helps us see if higher dust particles lead to worse air quality.
ggplot(db_final, aes(x = pm10, y = aqi, color = city)) + geom_point() + facet_wrap(~city)

##10 Former Smokers: Disease Category distribution
#We look at diseases among former smokers. This helps us understand long-term health effects even after quitting smoking.
db_final %>% filter(smoking_status == "Former") %>% count(disease_category)
## # A tibble: 2 × 2
## disease_category n
## <chr> <int>
## 1 Non-Communicable 113
## 2 Respiratory 123
##11 AQI vs Disease Severity (Boxplot)
#We compare AQI levels across severity levels. This helps us see if worse air quality leads to more serious diseases.
ggplot(db_final, aes(x = severity, y = aqi, fill = severity)) + geom_boxplot() + theme_minimal()

##12 PM2.5 vs Health Impact Score (Scatter)
#We check if higher PM2.5 leads to worse health scores. This helps confirm if fine particles are dangerous.
ggplot(db_final, aes(x = pm2.5, y = `health impact score`)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

##13 Which city shows the highest "Vehicle Count" but relatively lower NO2?
#We compare number of vehicles with NO₂ levels. This helps us see if traffic is a major pollution source.
b <- db_final %>%
group_by(city) %>%
summarise(
avg_vehicles = mean(`vehicle count`, na.rm = TRUE),
avg_no2 = mean(no2, na.rm = TRUE)
) %>%
mutate(efficiency_ratio = avg_vehicles / avg_no2) %>%
arrange(desc(efficiency_ratio))
head(b, 1)
## # A tibble: 1 × 4
## city avg_vehicles avg_no2 efficiency_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 Varanasi 264021. 36.8 7167.
#visualization
ggplot(b, aes(x = avg_vehicles, y = avg_no2, label = city)) +
geom_point(color = "darkgreen", size = 3)+
labs(title = "Vehicle Count vs NO2 Levels by City",
x = "Average Vehicle Count",
y = "Average NO2 Level") +
theme_minimal()

##14 Top 5 Cities: Worst Air & Highest Severity
#We identify cities with both high AQI and high disease severity. This highlights the most dangerous locations.
db_final %>%
group_by(city) %>%
summarise(
avg_aqi = mean(aqi, na.rm = TRUE),
avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)
) %>%
arrange(desc(avg_aqi))
## Warning: There were 24 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)`.
## ℹ In group 1: `city = "Agra"`.
## Caused by warning in `mean()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 23 remaining warnings.
## # A tibble: 24 × 3
## city avg_aqi avg_severity_score
## <chr> <dbl> <dbl>
## 1 Ahmedabad 317. NaN
## 2 Nagpur 296. NaN
## 3 Jaipur 293. NaN
## 4 Indore 286. NaN
## 5 Thane 283. NaN
## 6 Patna 280. NaN
## 7 Mumbai 277. NaN
## 8 Hyderabad 274. NaN
## 9 Surat 274. NaN
## 10 Nashik 271. NaN
## # ℹ 14 more rows
##15 Pie Chart of Health Severity
#We visualize how many people fall into each severity category. This gives a quick overview of health conditions.
db_final %>% count(severity) %>%
ggplot(aes(x = "", y = n, fill = severity)) +
geom_bar(stat = "identity") + coord_polar("y") + theme_void()

##16 Summary Table by City
#We summarize key stats like average AQI and patient count for each city. This helps compare cities easily.
summary_table <- db_final %>%
group_by(city) %>%
summarise(mean_aqi = mean(aqi), max_severity = max(severity), total_patients = n())
print(summary_table)
## # A tibble: 24 × 4
## city mean_aqi max_severity total_patients
## <chr> <dbl> <chr> <int>
## 1 Agra 263. Severe 54
## 2 Ahmedabad 317. Severe 48
## 3 Bangalore 264. Severe 52
## 4 Bhopal 255. Severe 53
## 5 Chennai 267. Severe 46
## 6 Delhi 233. Severe 39
## 7 Hyderabad 274. Severe 51
## 8 Indore 286. Severe 40
## 9 Jaipur 293. Severe 42
## 10 Kolkata 265. Severe 55
## # ℹ 14 more rows
##17 Young patients (<30) in high AQI showing "Severe"
#We check if young people are affected by high pollution. This helps understand if pollution impacts all age groups.
db_final %>% filter(age < 30, aqi > 300, severity == "Severe")
## # A tibble: 12 × 24
## city aqi pm2.5 pm10 no2 co so2 o3 temp_c humidity_pct
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Pune 359 90.0 145. 29.6 6.78 12.7 75.2 21.0 49.7
## 2 Nagpur 375 12.5 114. 40.2 2.60 34.8 44.7 22.0 49.6
## 3 Thane 417 71.4 109. 39.1 0.966 11.1 93.3 31.9 61.3
## 4 Thane 430 191. 39.1 51.4 3.19 46.6 16.9 28.5 66.1
## 5 Srinagar 338 193. 44.6 79.7 7.19 12.8 97.5 34.0 63.3
## 6 Patna 454 31.0 167. 62.1 5.88 45.5 27.8 10.6 43.7
## 7 Rajkot 433 196. 63.9 67.6 0.346 22.4 45.1 10.4 28.1
## 8 Vadodara 359 44.4 43.4 65.3 6.23 23.6 37.9 20.5 87.0
## 9 Bangalore 316 60.0 135. 71.4 2.03 4.29 98.8 31.6 35.6
## 10 Delhi 333 219. 106. 8.52 7.56 35.9 52.3 13.6 95.1
## 11 Nagpur 390 44.5 166. 78.9 8.84 35.7 36.5 38.7 46.7
## 12 Ahmedabad 458 162. 267. 7.84 5.19 30.5 91.8 26.3 93.7
## # ℹ 14 more variables: wind_speed_kmh <dbl>, rainfall_mm <dbl>,
## # pressure_hpa <dbl>, `vehicle count` <dbl>,
## # `industrial activity index` <dbl>, `health impact score` <dbl>, age <dbl>,
## # age_group <chr>, gender <chr>, disease_name <chr>, disease_category <chr>,
## # severity <chr>, smoking_status <chr>, blood_group <chr>
##18 City AQI Ranking vs. 300 Threshold
#We rank cities by AQI and compare them to a danger level (300). This helps identify critically polluted cities.
db_final %>% group_by(city) %>% summarise(avg_aqi = mean(aqi)) %>%
ggplot(aes(x = reorder(city, avg_aqi), y = avg_aqi)) +
geom_col(fill="blue") + geom_hline(yintercept = 300, linetype="dashed") + coord_flip()

##19 Vehicle Density vs NO2 Correlation
#We analyze whether more vehicles lead to more NO₂. This confirms the role of traffic in pollution.
ggplot(db_final, aes(x = `vehicle count`, y = no2)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

##20 Relative Frequency of Diseases
#We show how often each disease occurs. This helps identify common health problems
ggplot(db_final, aes(x = fct_infreq(disease_name))) + geom_bar(fill="orange") + coord_flip()

##21 Severity Distribution across Genders
#We compare disease severity between genders. This helps check if one gender is more affected.
ggplot(db_final, aes(x = severity, fill = gender)) + geom_bar(position = "dodge") + theme_bw()

#Level 3
##22 Which pollutant has the strongest linear correlation with the health impact score?
#In this step, we are trying to find out which pollutant (like PM2.5, NO₂, SO₂, etc.) is most strongly related to the health impact score. We calculate correlation between all numeric variables and display it using a correlation plot. Correlation tells us how closely two variables move together: A value close to +1 means strong positive relationship (as pollution increases, health impact worsens). A value close to 0 means little or no relationship
#Why we did this: We want to identify the most harmful pollutant affecting health.
#Insight:
#The pollutant with the highest correlation with health impact score is likely the biggest contributor to health problems, and should be the main focus for control.
num_data <- db_final %>% select(where(is.numeric))
cor_matrix <- cor(num_data, use = "complete.obs")
## Warning in cor(num_data, use = "complete.obs"): the standard deviation is zero
corrplot(cor_matrix, method = "color", addCoef.col = "black", number.cex = 0.7)

##23 Does the mean aqi differ significantly across different severity levels?
#Here, we are checking whether people with different disease severity (Mild, Moderate, Severe, Critical) are exposed to different levels of air pollution (AQI).
#We use ANOVA because: We are comparing more than two groups, we want to see if their average AQI is significantly different.
#The result shows a p-value = 0.695 (> 0.05) and F-value = 0.482, which means the model is not statistically significant. This indicates that AQI levels do not differ meaningfully across severity groups in this dataset, suggesting that pollution alone may not explain disease severity and other factors might be influencing it.
anovapply <- aov(aqi ~ severity, data = db_final)
summary(anovapply)
## Df Sum Sq Mean Sq F value Pr(>F)
## severity 3 24476 8159 0.482 0.695
## Residuals 1190 20140617 16925
# Post-hoc test to see which groups differ
TukeyHSD(anovapply)
## Tukey multiple comparisons of means
## 95% family-wise confidence level
##
## Fit: aov(formula = aqi ~ severity, data = db_final)
##
## $severity
## diff lwr upr p adj
## Mild-Critical 9.017778 -24.00580 42.04136 0.8961049
## Moderate-Critical -2.709402 -31.73589 26.31709 0.9951295
## Severe-Critical -4.093578 -31.37102 23.18386 0.9804369
## Moderate-Mild -11.727179 -42.34365 18.88929 0.7578055
## Severe-Mild -13.111355 -42.07495 15.85224 0.6493499
## Severe-Moderate -1.384176 -25.69200 22.92365 0.9988791
##24 Is industrial activity significantly higher for patients with specific disease categories? Using anova
#The result shows a p-value = 0.98 (> 0.05), meaning the model is not significant. This suggests that industrial activity levels are almost the same across different disease categories, indicating no strong relationship between industrial pollution and specific diseases in this dataset.
industry <- aov(`industrial activity index` ~ disease_category, data = db_final)
summary(industry)
## Df Sum Sq Mean Sq F value Pr(>F)
## disease_category 1 0 0.006 0.001 0.98
## Residuals 1192 10111 8.482
##25 What is the 90th percentile of aqi, and which cities exceed this value?
#To identify extreme pollution cases, not just average conditions.
#Insight:
#Cities above this threshold represent: Highly dangerous pollution zones needing urgent attention
thresh_90 <- quantile(db_final$aqi, 0.90)
db_final %>% filter(aqi > thresh_90) %>% group_by(city) %>% summarise(count = n())
## # A tibble: 24 × 2
## city count
## <chr> <int>
## 1 Agra 2
## 2 Ahmedabad 6
## 3 Bangalore 3
## 4 Bhopal 5
## 5 Chennai 5
## 6 Delhi 4
## 7 Hyderabad 8
## 8 Indore 9
## 9 Jaipur 3
## 10 Kolkata 4
## # ℹ 14 more rows
##26 Can we predict the health impact score based on pm2.5? (Independent: pm2.5)
#We build a model to see how PM2.5 affects health impact score.
#Linear regression creates a relationship like: “If PM2.5 increases, how much does health impact increase?”
#If PM2.5 is significant:It is a strong predictor of health damage
#The model shows an overall significant F-statistic (p < 2.2e-16), but the PM2.5 coefficient itself has a p-value = 0.155 (> 0.05), meaning it is not individually significant. Although the model shows R² = 0.50, indicating 50% variation explained, the PM2.5 variable itself does not strongly predict health impact. This suggests the relationship might be influenced by other hidden factors or data issues.
lr_model <- lm(`health impact score` ~ pm2.5, data = db_final)
summary(lr_model)
##
## Call:
## lm(formula = `health impact score` ~ pm2.5, data = db_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.286e-12 -1.400e-15 5.800e-15 1.350e-14 2.130e-14
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+01 1.277e-14 7.828e+14 <2e-16 ***
## pm2.5 -1.247e-16 8.758e-17 -1.424e+00 0.155
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.113e-13 on 1192 degrees of freedom
## Multiple R-squared: 0.5001, Adjusted R-squared: 0.4996
## F-statistic: 1192 on 1 and 1192 DF, p-value: < 2.2e-16
# Plotting the regression line
ggplot(db_final, aes(x = pm2.5, y = `health impact score`)) +
geom_point() + geom_smooth(method = "lm", col = "red")
## `geom_smooth()` using formula = 'y ~ x'

##27 Does wind speed show a non-linear relationship with aqi? (Independent: wind speed)
#Here, the results show p-values = 0.871 and 0.732 for polynomial terms and overall model p-value = 0.9308 (> 0.05) with R² ≈ 0.0001, meaning the model is not significant. This indicates that wind speed does not have a meaningful effect on AQI in this dataset, and adding a non-linear term does not improve understanding
poly_model <- lm(aqi ~ poly(wind_speed_kmh, 2), data = db_final)
summary(poly_model)
##
## Call:
## lm(formula = aqi ~ poly(wind_speed_kmh, 2), data = db_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -223.094 -120.825 4.031 106.069 229.026
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 271.446 3.765 72.089 <2e-16 ***
## poly(wind_speed_kmh, 2)1 -21.144 130.112 -0.163 0.871
## poly(wind_speed_kmh, 2)2 44.515 130.112 0.342 0.732
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 130.1 on 1191 degrees of freedom
## Multiple R-squared: 0.0001204, Adjusted R-squared: -0.001559
## F-statistic: 0.07173 on 2 and 1191 DF, p-value: 0.9308
# Visualization
ggplot(db_final, aes(x = wind_speed_kmh, y = aqi)) +
geom_point() + geom_smooth(method = "lm", formula = y ~ poly(x, 2), col = "blue")

##28 How do vehicle count and industrial_activity_index combined impact no2 levels?
#We used multiple regression to study the combined effect of vehicle count and industrial activity on NO₂ levels. The results show p-values = 0.561 (vehicle) and 0.242 (industry) and overall model p-value = 0.4341 (> 0.05) with R² = 0.0014, meaning the model is not significant. This suggests that neither vehicles nor industrial activity strongly explain NO₂ levels in this dataset, and other pollution sources may be more important.
mlr_model <- lm(no2 ~ `vehicle count` + `industrial activity index`, data = db_final)
summary(mlr_model)
##
## Call:
## lm(formula = no2 ~ `vehicle count` + `industrial activity index`,
## data = db_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.239 -20.219 0.545 19.142 38.572
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.081e+01 1.679e+00 24.311 <2e-16 ***
## `vehicle count` 2.558e-06 4.400e-06 0.581 0.561
## `industrial activity index` 2.587e-01 2.209e-01 1.171 0.242
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22.2 on 1191 degrees of freedom
## Multiple R-squared: 0.0014, Adjusted R-squared: -0.0002765
## F-statistic: 0.8351 on 2 and 1191 DF, p-value: 0.4341
##29 Which city has the highest variability (Standard Deviation) in its air quality?
#AQI standard deviation calculation to measure stability across different cities.Pinpoints the city with the most unpredictable air quality, highlighting where management is most difficult.
db_final %>% group_by(city) %>% summarise(sd_aqi = sd(aqi)) %>% arrange(desc(sd_aqi))
## # A tibble: 24 × 2
## city sd_aqi
## <chr> <dbl>
## 1 Chennai 143.
## 2 Nashik 142.
## 3 Indore 141.
## 4 Bhopal 140.
## 5 Mumbai 139.
## 6 Rajkot 139.
## 7 Patna 139.
## 8 Vadodara 137.
## 9 Nagpur 135.
## 10 Ludhiana 135.
## # ℹ 14 more rows
##30 Visualize the distribution and scatter relationships between all primary pollutants.
#Pair plot visualization to detect correlations between major pollutants.Reveals shared emission sources and identifies pollutant clusters that can be regulated together.
ggpairs(db_final, columns = c("pm2.5", "pm10", "no2", "co", "so2"))

##31 Create a 3-tier risk category (Low, Mid, High) based on percentiles of the health impact score
#Grouping health impact scores into Low, Medium, and High risk tiers. Simplifies complex data to identify vulnerable populations for targeted medical intervention.
db_final <- db_final %>%
mutate(risk_tier = ntile(`health impact score`, 3))
table(db_final$risk_tier)
##
## 1 2 3
## 398 398 398
##32 What is the frequency distribution of severity among different smoking_status groups?
#Comparison of disease severity across smoking demographics (smokers, non-smokers, former).Confirms whether smoking exacerbates the seriousness of pollution-related health conditions.
table(db_final$smoking_status, db_final$severity)
##
## Critical Mild Moderate Severe
## Current 46 40 76 89
## Former 59 38 43 96
## Never 120 111 206 270
##33 Does smoking_status change the effect of aqi on health impact score?
#The overall model is statistically significant (p < 0.05), but AQI alone is not significant (p = 1.000). However, the interaction term for former smokers is significant (p = 0.01097), showing that AQI affects health only for this group.
interaction_model <- lm(`health impact score` ~ aqi * smoking_status, data = db_final)
summary(interaction_model)
##
## Call:
## lm(formula = `health impact score` ~ aqi * smoking_status, data = db_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.192e-12 0.000e+00 0.000e+00 0.000e+00 1.157e-13
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.000e+01 3.140e-14 3.185e+14 < 2e-16 ***
## aqi 1.081e-28 1.062e-16 0.000e+00 1.00000
## smoking_statusFormer -1.348e-13 4.441e-14 -3.036e+00 0.00245 **
## smoking_statusNever 4.800e-26 3.632e-14 0.000e+00 1.00000
## aqi:smoking_statusFormer 3.755e-16 1.474e-16 2.548e+00 0.01097 *
## aqi:smoking_statusNever -1.446e-28 1.223e-16 0.000e+00 1.00000
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.103e-13 on 1188 degrees of freedom
## Multiple R-squared: 0.5001, Adjusted R-squared: 0.498
## F-statistic: 237.7 on 5 and 1188 DF, p-value: < 2.2e-16
#The result for (AQI ~ PM2.5) shows p-value = 0.353 (> 0.05) and R² ≈ 0.0007, meaning the model is not significant.
##34 training. testing and prediction
#Data partitioning to build and then validate the predictive model.Provides accuracy metrics and ensures the model is reliable for predicting future pollution impacts.
#The model shows extremely low error (RMSE ≈ 0) but R² is NA, indicating a near-perfect or unstable prediction. This suggests the model may be overfitting or the data may have very little variation, meaning predictions may not be reliable in real-world scenarios.
##creating Partition 80:20
trainIndex <- createDataPartition(db_final$aqi, p = .8, list = FALSE)
train_data <- db_final[trainIndex,]
test_data <- db_final[-trainIndex,]
##Train Model (AQI predicted by PM2.5)
model_pm25 <- lm(aqi ~ pm2.5, data = train_data)
##Prediction
predictions_pm25 <- predict(model_pm25, newdata = test_data)
##Evaluate
metrics_pm25 <- postResample(pred = predictions_pm25, obs = test_data$aqi)
print(metrics_pm25)
## RMSE Rsquared MAE
## 1.306434e+02 6.498142e-04 1.123021e+02
##Visualization
ggplot(test_data, aes(x = aqi, y = predictions_pm25)) +
geom_point(color = "darkorange", alpha = 0.5) +
geom_smooth(method = "lm", color = "darkred", linetype = "dashed") +
labs(title = "Refined Regression: AQI vs. PM2.5",
subtitle = "Testing core pollutant correlation",
x = "Actual AQI",
y = "Predicted AQI") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

##35 Applying polynomial regression
#We applied a polynomial model to check if a curved relationship improves prediction. The results show p-values = 0.406 and 0.739 and overall model p-value = 0.6701 (> 0.05), meaning the model is not significant. This indicates that even a non-linear approach does not improve prediction, and PM2.5 is not a strong predictor of AQI here.
##Train Polynomial Model
## We use poly(x, 2) to include both linear and squared terms
poly_model <- lm(aqi ~ poly(pm2.5, 2), data = train_data)
##Summary
summary(poly_model)
##
## Call:
## lm(formula = aqi ~ poly(pm2.5, 2), data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -231.29 -120.18 -0.08 107.13 231.93
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 271.586 4.199 64.684 <2e-16 ***
## poly(pm2.5, 2)1 109.611 129.886 0.844 0.399
## poly(pm2.5, 2)2 105.025 129.886 0.809 0.419
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 129.9 on 954 degrees of freedom
## Multiple R-squared: 0.00143, Adjusted R-squared: -0.0006636
## F-statistic: 0.683 on 2 and 954 DF, p-value: 0.5053
##Predict on Test Data
predictions_poly <- predict(poly_model, newdata = test_data)
##Performance Metrics
metrics_poly <- postResample(pred = predictions_poly, obs = test_data$aqi)
print(metrics_poly)
## RMSE Rsquared MAE
## 1.307783e+02 1.109301e-06 1.125074e+02
##Visualization
ggplot(test_data, aes(x = aqi, y = predictions_poly)) +
geom_point(color = "orange", alpha = 0.5) +
geom_smooth(method = "lm", formula = y ~ poly(x, 2), color = "purple", linetype = "dashed") +
labs(title = "Polynomial Regression: AQI vs. PM2.5 (Degree 2)",
subtitle = "Exploring non-linear air quality trends",
x = "Actual AQI",
y = "Predicted AQI") +
theme_minimal()

##36 Is the Polynomial model significantly better than the Linear model?
#We compared linear and polynomial models using ANOVA. The result shows p-value = 0.7387 (> 0.05), meaning the polynomial model is not significantly better than the linear model. This suggests that adding complexity does not improve the model, so the simpler linear model is sufficient.
anova(model_pm25, poly_model)
## Analysis of Variance Table
##
## Model 1: aqi ~ pm2.5
## Model 2: aqi ~ poly(pm2.5, 2)
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 955 16105489
## 2 954 16094459 1 11030 0.6538 0.419
##37 Numeric Exploration: Identifying Skewness in Pollutants, using standard deviation and mean to check for high-variability pollutants
#Calculates the average and standard deviation for each pollutant to identify those that are both high and unstable. The outcome highlights which pollutants fluctuate most, making them more dangerous and difficult for authorities to regulate.
db_final %>%
summarise(across(c(pm2.5, pm10, no2, so2, o3), list(mean = mean, sd = sd)))
## # A tibble: 1 × 10
## pm2.5_mean pm2.5_sd pm10_mean pm10_sd no2_mean no2_sd so2_mean so2_sd o3_mean
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 128. 69.8 163. 80.6 42.7 22.2 25.6 13.9 53.4
## # ℹ 1 more variable: o3_sd <dbl>
##38 Percentile Analysis: The "Top 5% Toxic" cases
#Focuses on the extreme 5% of AQI values to analyze the most severe pollution events. The outcome identifies specific high-risk cities frequently experiencing extreme toxicity, signaling a need for urgent emergency intervention.
top_toxic <- quantile(db_final$aqi, 0.95)
db_final %>% filter(aqi >= top_toxic) %>% count(city, sort = TRUE)
## # A tibble: 23 × 2
## city n
## <chr> <int>
## 1 Nashik 6
## 2 Ahmedabad 4
## 3 Hyderabad 4
## 4 Patna 4
## 5 Vadodara 4
## 6 Indore 3
## 7 Jaipur 3
## 8 Kolkata 3
## 9 Lucknow 3
## 10 Nagpur 3
## # ℹ 13 more rows
##39 Standard Deviation: Which city has the most unpredictable air?
#Uses standard deviation to find the specific location where air quality changes most frequently. The outcome identifies the city with the least stable AQI, pinpointing where pollution management and long-term planning are most difficult
db_final %>%
group_by(city) %>%
summarise(aqi_stability = sd(aqi)) %>%
arrange(desc(aqi_stability))
## # A tibble: 24 × 2
## city aqi_stability
## <chr> <dbl>
## 1 Chennai 143.
## 2 Nashik 142.
## 3 Indore 141.
## 4 Bhopal 140.
## 5 Mumbai 139.
## 6 Rajkot 139.
## 7 Patna 139.
## 8 Vadodara 137.
## 9 Nagpur 135.
## 10 Ludhiana 135.
## # ℹ 14 more rows
##40 Does rain actually mitigate the impact of industry on AQI?
#The results show all p-values > 0.05 (interaction p = 0.759) and overall model p-value = 0.7485, meaning the model is not significant. This indicates that rainfall does not significantly reduce the effect of industrial activity on AQI in this dataset.
rain <- lm(aqi ~ rainfall_mm * `industrial activity index`, data = db_final)
summary(rain)
##
## Call:
## lm(formula = aqi ~ rainfall_mm * `industrial activity index`,
## data = db_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -226.771 -119.676 5.066 106.236 231.133
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 269.034851 14.701483 18.300 <2e-16
## rainfall_mm 0.023053 0.084082 0.274 0.784
## `industrial activity index` -0.922411 2.609441 -0.353 0.724
## rainfall_mm:`industrial activity index` 0.004661 0.015193 0.307 0.759
##
## (Intercept) ***
## rainfall_mm
## `industrial activity index`
## rainfall_mm:`industrial activity index`
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 130.1 on 1190 degrees of freedom
## Multiple R-squared: 0.001023, Adjusted R-squared: -0.001495
## F-statistic: 0.4063 on 3 and 1190 DF, p-value: 0.7485