library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyselect)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.2     ✔ tidyr     1.3.2
## ✔ readr     2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(readxl)
library(corrplot)
## corrplot 0.95 loaded
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(GGally)

#Level 1: Basic importing and cleaning 
db<- read_excel("C:\\Users\\VICTUS\\Desktop\\airdataset.xlsx")
## New names:
## • `` -> `...17`
head(db)
## # A tibble: 6 × 25
##   City        AQI PM2.5  PM10   NO2    CO   SO2    O3 `Temperature (°C)`
##   <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>              <dbl>
## 1 Rajkot       59 227.   160.  44.8 5.99  37.1   58.1               11.9
## 2 Bangalore   443 220.   280.  78.2 5.06  27.1   40.8               16.7
## 3 Bhopal       56  94.8  262.  70.0 6.59  34.6   42.7               30.4
## 4 Srinagar    342 121.   213.  71.8 4.44  32.3   97.0               32.7
## 5 Hyderabad   492 184.   165.  16.2 0.554  6.72  22.8               12.5
## 6 Bangalore   235  73.9  116.  68.1 7.37  36.7   44.8               41.2
## # ℹ 16 more variables: `Humidity (%)` <dbl>, `Wind Speed (km/h)` <dbl>,
## #   `Rainfall (mm)` <dbl>, `Pressure (hPa)` <dbl>, `Vehicle Count` <dbl>,
## #   `Industrial Activity Index` <dbl>, `Health Impact Score` <dbl>,
## #   ...17 <lgl>, age <dbl>, age_group <chr>, gender <chr>, disease_name <chr>,
## #   disease_category <chr>, severity <chr>, smoking_status <chr>,
## #   blood_group <chr>
str(db)
## tibble [1,194 × 25] (S3: tbl_df/tbl/data.frame)
##  $ City                     : chr [1:1194] "Rajkot" "Bangalore" "Bhopal" "Srinagar" ...
##  $ AQI                      : num [1:1194] 59 443 56 342 492 235 421 476 183 259 ...
##  $ PM2.5                    : num [1:1194] 227.4 219.7 94.8 121 184.3 ...
##  $ PM10                     : num [1:1194] 160 280 262 213 165 ...
##  $ NO2                      : num [1:1194] 44.8 78.2 70 71.8 16.2 ...
##  $ CO                       : num [1:1194] 5.986 5.062 6.589 4.443 0.554 ...
##  $ SO2                      : num [1:1194] 37.13 27.14 34.63 32.3 6.72 ...
##  $ O3                       : num [1:1194] 58.1 40.8 42.7 97 22.8 ...
##  $ Temperature (°C)         : num [1:1194] 11.9 16.7 30.4 32.7 12.5 ...
##  $ Humidity (%)             : num [1:1194] 64.4 23.1 63.6 69.7 93.2 ...
##  $ Wind Speed (km/h)        : num [1:1194] 12.9 19.4 13.4 19.2 10.4 ...
##  $ Rainfall (mm)            : num [1:1194] 14.5 34.8 194.4 233.9 91.8 ...
##  $ Pressure (hPa)           : num [1:1194] 976 1022 985 950 1021 ...
##  $ Vehicle Count            : num [1:1194] 198941 35588 430785 209382 257650 ...
##  $ Industrial Activity Index: num [1:1194] 4.86 5.26 3.8 4.43 9.35 ...
##  $ Health Impact Score      : num [1:1194] 10 10 10 10 10 10 10 10 10 10 ...
##  $ ...17                    : logi [1:1194] NA NA NA NA NA NA ...
##  $ age                      : num [1:1194] 41 51 39 56 26 43 73 71 52 29 ...
##  $ age_group                : chr [1:1194] "40-49" "50-59" "30-39" "50-59" ...
##  $ gender                   : chr [1:1194] "Male" "Female" "Female" "Male" ...
##  $ disease_name             : chr [1:1194] "Swine Flu (H1N1)" "Swine Flu (H1N1)" "COPD" "Stroke" ...
##  $ disease_category         : chr [1:1194] "Respiratory" "Respiratory" "Respiratory" "Non-Communicable" ...
##  $ severity                 : chr [1:1194] "Critical" "Severe" "Severe" "Severe" ...
##  $ smoking_status           : chr [1:1194] "Former" "Current" "Never" "Current" ...
##  $ blood_group              : chr [1:1194] "O+" "B+" "A+" "A+" ...
summary(db)
##      City                AQI            PM2.5             PM10       
##  Length:1194        Min.   : 50.0   Min.   : 10.18   Min.   : 20.10  
##  Class :character   1st Qu.:151.0   1st Qu.: 67.84   1st Qu.: 94.71  
##  Mode  :character   Median :276.0   Median :125.80   Median :165.70  
##                     Mean   :271.4   Mean   :128.06   Mean   :163.14  
##                     3rd Qu.:377.8   3rd Qu.:187.68   3rd Qu.:230.02  
##                     Max.   :499.0   Max.   :249.99   Max.   :299.94  
##       NO2               CO               SO2               O3        
##  Min.   : 5.038   Min.   : 0.1017   Min.   : 2.037   Min.   : 5.147  
##  1st Qu.:22.251   1st Qu.: 2.8110   1st Qu.:12.983   1st Qu.:29.983  
##  Median :43.100   Median : 5.1492   Median :25.466   Median :53.548  
##  Mean   :42.696   Mean   : 5.1015   Mean   :25.555   Mean   :53.387  
##  3rd Qu.:62.056   3rd Qu.: 7.3267   3rd Qu.:37.304   3rd Qu.:77.049  
##  Max.   :79.959   Max.   : 9.9999   Max.   :49.973   Max.   :99.978  
##  Temperature (°C)  Humidity (%)   Wind Speed (km/h) Rainfall (mm)     
##  Min.   :10.05    Min.   :20.03   Min.   : 0.5004   Min.   :  0.5863  
##  1st Qu.:17.88    1st Qu.:43.64   1st Qu.: 5.2455   1st Qu.: 75.5070  
##  Median :27.34    Median :63.21   Median :10.1733   Median :150.6521  
##  Mean   :27.19    Mean   :61.65   Mean   :10.1204   Mean   :151.3860  
##  3rd Qu.:36.30    3rd Qu.:81.05   3rd Qu.:14.8117   3rd Qu.:227.7400  
##  Max.   :44.99    Max.   :99.93   Max.   :19.9749   Max.   :299.0146  
##  Pressure (hPa)   Vehicle Count    Industrial Activity Index
##  Min.   : 950.1   Min.   :  1653   Min.   :0.0000586        
##  1st Qu.: 975.8   1st Qu.:116002   1st Qu.:2.1675619        
##  Median :1001.1   Median :247869   Median :4.7733041        
##  Mean   :1000.7   Mean   :248505   Mean   :4.8326865        
##  3rd Qu.:1026.4   3rd Qu.:377682   3rd Qu.:7.3468181        
##  Max.   :1049.8   Max.   :499353   Max.   :9.9805916        
##  Health Impact Score  ...17              age         age_group        
##  Min.   :10          Mode:logical   Min.   : 0.00   Length:1194       
##  1st Qu.:10          NA's:1194      1st Qu.:40.00   Class :character  
##  Median :10                         Median :54.00   Mode  :character  
##  Mean   :10                         Mean   :53.19                     
##  3rd Qu.:10                         3rd Qu.:68.00                     
##  Max.   :10                         Max.   :95.00                     
##     gender          disease_name       disease_category     severity        
##  Length:1194        Length:1194        Length:1194        Length:1194       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  smoking_status     blood_group       
##  Length:1194        Length:1194       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
colnames(db)
##  [1] "City"                      "AQI"                      
##  [3] "PM2.5"                     "PM10"                     
##  [5] "NO2"                       "CO"                       
##  [7] "SO2"                       "O3"                       
##  [9] "Temperature (°C)"          "Humidity (%)"             
## [11] "Wind Speed (km/h)"         "Rainfall (mm)"            
## [13] "Pressure (hPa)"            "Vehicle Count"            
## [15] "Industrial Activity Index" "Health Impact Score"      
## [17] "...17"                     "age"                      
## [19] "age_group"                 "gender"                   
## [21] "disease_name"              "disease_category"         
## [23] "severity"                  "smoking_status"           
## [25] "blood_group"
db_clean <- db %>%
  select(!contains("Unnamed")) %>%
  rename(
    temp_c = `Temperature (°C)`,
    humidity_pct = `Humidity (%)`,
    wind_speed_kmh = `Wind Speed (km/h)`,
    rainfall_mm = `Rainfall (mm)`,
    pressure_hpa = `Pressure (hPa)`
  ) %>%
  rename_with(tolower)

db_clean
## # A tibble: 1,194 × 25
##    city        aqi pm2.5  pm10   no2    co   so2    o3 temp_c humidity_pct
##    <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>        <dbl>
##  1 Rajkot       59 227.  160.   44.8 5.99  37.1  58.1    11.9         64.4
##  2 Bangalore   443 220.  280.   78.2 5.06  27.1  40.8    16.7         23.1
##  3 Bhopal       56  94.8 262.   70.0 6.59  34.6  42.7    30.4         63.6
##  4 Srinagar    342 121.  213.   71.8 4.44  32.3  97.0    32.7         69.7
##  5 Hyderabad   492 184.  165.   16.2 0.554  6.72 22.8    12.5         93.2
##  6 Bangalore   235  73.9 116.   68.1 7.37  36.7  44.8    41.2         65.3
##  7 Jaipur      421 146.  180.   26.2 8.98   4.55 72.1    14.9         74.7
##  8 Nashik      476 126.   43.9  61.0 9.91   9.94  8.63   29.0         74.9
##  9 Chennai     183 220.  145.   55.0 4.57  13.3  54.4    14.6         22.3
## 10 Srinagar    259  24.8 150.   41.9 5.27  39.2  91.4    15.2         28.1
## # ℹ 1,184 more rows
## # ℹ 15 more variables: wind_speed_kmh <dbl>, rainfall_mm <dbl>,
## #   pressure_hpa <dbl>, `vehicle count` <dbl>,
## #   `industrial activity index` <dbl>, `health impact score` <dbl>,
## #   ...17 <lgl>, age <dbl>, age_group <chr>, gender <chr>, disease_name <chr>,
## #   disease_category <chr>, severity <chr>, smoking_status <chr>,
## #   blood_group <chr>
#Removing missing values
db_clean <- db_clean %>%
  drop_na(city, aqi, pm2.5, age, disease_name)

#Converting Categorical Data to Factors

db_final <- db_clean %>%
  mutate(
    gender = as.factor(gender),
    disease_category = as.factor(disease_category),
    severity = factor(severity, levels = c("Mild", "Moderate", "Severe", "Critical"), ordered = TRUE),
    smoking_status = as.factor(smoking_status)
  )

db_final <-db_clean %>%
  select(where(~!all(is.na(.))))

summary(db_final)
##      city                aqi            pm2.5             pm10       
##  Length:1194        Min.   : 50.0   Min.   : 10.18   Min.   : 20.10  
##  Class :character   1st Qu.:151.0   1st Qu.: 67.84   1st Qu.: 94.71  
##  Mode  :character   Median :276.0   Median :125.80   Median :165.70  
##                     Mean   :271.4   Mean   :128.06   Mean   :163.14  
##                     3rd Qu.:377.8   3rd Qu.:187.68   3rd Qu.:230.02  
##                     Max.   :499.0   Max.   :249.99   Max.   :299.94  
##       no2               co               so2               o3        
##  Min.   : 5.038   Min.   : 0.1017   Min.   : 2.037   Min.   : 5.147  
##  1st Qu.:22.251   1st Qu.: 2.8110   1st Qu.:12.983   1st Qu.:29.983  
##  Median :43.100   Median : 5.1492   Median :25.466   Median :53.548  
##  Mean   :42.696   Mean   : 5.1015   Mean   :25.555   Mean   :53.387  
##  3rd Qu.:62.056   3rd Qu.: 7.3267   3rd Qu.:37.304   3rd Qu.:77.049  
##  Max.   :79.959   Max.   : 9.9999   Max.   :49.973   Max.   :99.978  
##      temp_c       humidity_pct   wind_speed_kmh     rainfall_mm      
##  Min.   :10.05   Min.   :20.03   Min.   : 0.5004   Min.   :  0.5863  
##  1st Qu.:17.88   1st Qu.:43.64   1st Qu.: 5.2455   1st Qu.: 75.5070  
##  Median :27.34   Median :63.21   Median :10.1733   Median :150.6521  
##  Mean   :27.19   Mean   :61.65   Mean   :10.1204   Mean   :151.3860  
##  3rd Qu.:36.30   3rd Qu.:81.05   3rd Qu.:14.8117   3rd Qu.:227.7400  
##  Max.   :44.99   Max.   :99.93   Max.   :19.9749   Max.   :299.0146  
##   pressure_hpa    vehicle count    industrial activity index
##  Min.   : 950.1   Min.   :  1653   Min.   :0.0000586        
##  1st Qu.: 975.8   1st Qu.:116002   1st Qu.:2.1675619        
##  Median :1001.1   Median :247869   Median :4.7733041        
##  Mean   :1000.7   Mean   :248505   Mean   :4.8326865        
##  3rd Qu.:1026.4   3rd Qu.:377682   3rd Qu.:7.3468181        
##  Max.   :1049.8   Max.   :499353   Max.   :9.9805916        
##  health impact score      age         age_group            gender         
##  Min.   :10          Min.   : 0.00   Length:1194        Length:1194       
##  1st Qu.:10          1st Qu.:40.00   Class :character   Class :character  
##  Median :10          Median :54.00   Mode  :character   Mode  :character  
##  Mean   :10          Mean   :53.19                                        
##  3rd Qu.:10          3rd Qu.:68.00                                        
##  Max.   :10          Max.   :95.00                                        
##  disease_name       disease_category     severity         smoking_status    
##  Length:1194        Length:1194        Length:1194        Length:1194       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  blood_group       
##  Length:1194       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
names(db_final)
##  [1] "city"                      "aqi"                      
##  [3] "pm2.5"                     "pm10"                     
##  [5] "no2"                       "co"                       
##  [7] "so2"                       "o3"                       
##  [9] "temp_c"                    "humidity_pct"             
## [11] "wind_speed_kmh"            "rainfall_mm"              
## [13] "pressure_hpa"              "vehicle count"            
## [15] "industrial activity index" "health impact score"      
## [17] "age"                       "age_group"                
## [19] "gender"                    "disease_name"             
## [21] "disease_category"          "severity"                 
## [23] "smoking_status"            "blood_group"
#Level 2

##1 Average AQI across all cities
#We calculate the average AQI to understand the overall pollution level across all cities. This gives us a general idea of how clean or polluted the air is on average.

mean(db_final$aqi, na.rm = TRUE)
## [1] 271.4456
##2 Distribution of Temp and Humidity
#We plot how temperature and humidity values are spread out. This helps us see whether most days are similar or if there are extreme conditions.

ggplot(db_final) + geom_density(aes(x = temp_c), fill="red", alpha=0.3) + labs(title="Temp Distribution")

ggplot(db_final) + geom_density(aes(x = humidity_pct), fill="blue", alpha=0.3) + labs(title="Humidity Distribution")

##3 Cities with "Good" AQI (< 50)
#We filter cities where AQI is less than 50 to find places with clean air. This helps us identify healthier environments.

db_final %>% filter(aqi < 50) %>% select(city) %>% unique()
## # A tibble: 0 × 1
## # ℹ 1 variable: city <chr>
##4 AQI Skewness (Histogram)
#We draw a histogram to see how AQI values are distributed. This shows whether most cities have low, moderate, or high pollution.

ggplot(db_final, aes(x = aqi)) + geom_histogram(bins = 30, fill = "steelblue", color = "white") + labs(title="AQI Histogram")

##5 City with highest Ozone (O3)
#We find the city with the highest ozone level. This helps identify areas where harmful gases are most concentrated.

db_final %>% slice_max(o3, n = 1) %>% select(city, o3)
## # A tibble: 1 × 2
##   city     o3
##   <chr> <dbl>
## 1 Thane 100.0
##6 Most common disease
#We count which disease appears most often. This helps us understand which health issue is most linked with pollution.

db_final %>% count(disease_name, sort = TRUE) %>% head(1)
## # A tibble: 1 × 2
##   disease_name     n
##   <chr>        <int>
## 1 Lung Cancer    274
##7 Gender distribution
#We check how many males and females are in the dataset. This ensures our data is balanced and helps in further comparisons

db_final %>% count(gender)
## # A tibble: 3 × 2
##   gender     n
##   <chr>  <int>
## 1 Female   573
## 2 Male     605
## 3 Other     16
##8 Most frequent age group
#We find which age group appears the most. This tells us which group is most represented or possibly most affected.

db_final %>% count(age_group, sort = TRUE) %>% head(1)
## # A tibble: 1 × 2
##   age_group     n
##   <chr>     <int>
## 1 50-59       228
##9 PM10 vs AQI Variation by City
#We compare PM10 levels with AQI for each city. This helps us see if higher dust particles lead to worse air quality.

ggplot(db_final, aes(x = pm10, y = aqi, color = city)) + geom_point() + facet_wrap(~city)

##10 Former Smokers: Disease Category distribution
#We look at diseases among former smokers. This helps us understand long-term health effects even after quitting smoking.

db_final %>% filter(smoking_status == "Former") %>% count(disease_category)
## # A tibble: 2 × 2
##   disease_category     n
##   <chr>            <int>
## 1 Non-Communicable   113
## 2 Respiratory        123
##11 AQI vs Disease Severity (Boxplot)
#We compare AQI levels across severity levels. This helps us see if worse air quality leads to more serious diseases.

ggplot(db_final, aes(x = severity, y = aqi, fill = severity)) + geom_boxplot() + theme_minimal()

##12 PM2.5 vs Health Impact Score (Scatter)
#We check if higher PM2.5 leads to worse health scores. This helps confirm if fine particles are dangerous.

ggplot(db_final, aes(x = pm2.5, y = `health impact score`)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

##13 Which city shows the highest "Vehicle Count" but relatively lower NO2?
#We compare number of vehicles with NO₂ levels. This helps us see if traffic is a major pollution source.

b <- db_final %>%
  group_by(city) %>%
  summarise(
    avg_vehicles = mean(`vehicle count`, na.rm = TRUE),
    avg_no2 = mean(no2, na.rm = TRUE)
  ) %>%
  mutate(efficiency_ratio = avg_vehicles / avg_no2) %>%
  arrange(desc(efficiency_ratio))
head(b, 1)
## # A tibble: 1 × 4
##   city     avg_vehicles avg_no2 efficiency_ratio
##   <chr>           <dbl>   <dbl>            <dbl>
## 1 Varanasi      264021.    36.8            7167.
#visualization
ggplot(b, aes(x = avg_vehicles, y = avg_no2, label = city)) +
  geom_point(color = "darkgreen", size = 3)+
  labs(title = "Vehicle Count vs NO2 Levels by City",
       x = "Average Vehicle Count",
       y = "Average NO2 Level") +
  theme_minimal()

##14 Top 5 Cities: Worst Air & Highest Severity
#We identify cities with both high AQI and high disease severity. This highlights the most dangerous locations.

db_final %>%
  group_by(city) %>%
  summarise(
    avg_aqi = mean(aqi, na.rm = TRUE),
    avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)
  ) %>%
  arrange(desc(avg_aqi))
## Warning: There were 24 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)`.
## ℹ In group 1: `city = "Agra"`.
## Caused by warning in `mean()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 23 remaining warnings.
## # A tibble: 24 × 3
##    city      avg_aqi avg_severity_score
##    <chr>       <dbl>              <dbl>
##  1 Ahmedabad    317.                NaN
##  2 Nagpur       296.                NaN
##  3 Jaipur       293.                NaN
##  4 Indore       286.                NaN
##  5 Thane        283.                NaN
##  6 Patna        280.                NaN
##  7 Mumbai       277.                NaN
##  8 Hyderabad    274.                NaN
##  9 Surat        274.                NaN
## 10 Nashik       271.                NaN
## # ℹ 14 more rows
##15 Pie Chart of Health Severity
#We visualize how many people fall into each severity category. This gives a quick overview of health conditions.

db_final %>% count(severity) %>% 
  ggplot(aes(x = "", y = n, fill = severity)) + 
  geom_bar(stat = "identity") + coord_polar("y") + theme_void()

##16 Summary Table by City
#We summarize key stats like average AQI and patient count for each city. This helps compare cities easily.

summary_table <- db_final %>% 
  group_by(city) %>% 
  summarise(mean_aqi = mean(aqi), max_severity = max(severity), total_patients = n())
print(summary_table)
## # A tibble: 24 × 4
##    city      mean_aqi max_severity total_patients
##    <chr>        <dbl> <chr>                 <int>
##  1 Agra          263. Severe                   54
##  2 Ahmedabad     317. Severe                   48
##  3 Bangalore     264. Severe                   52
##  4 Bhopal        255. Severe                   53
##  5 Chennai       267. Severe                   46
##  6 Delhi         233. Severe                   39
##  7 Hyderabad     274. Severe                   51
##  8 Indore        286. Severe                   40
##  9 Jaipur        293. Severe                   42
## 10 Kolkata       265. Severe                   55
## # ℹ 14 more rows
##17 Young patients (<30) in high AQI showing "Severe"
#We check if young people are affected by high pollution. This helps understand if pollution impacts all age groups.

db_final %>% filter(age < 30, aqi > 300, severity == "Severe")
## # A tibble: 12 × 24
##    city        aqi pm2.5  pm10   no2    co   so2    o3 temp_c humidity_pct
##    <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>        <dbl>
##  1 Pune        359  90.0 145.  29.6  6.78  12.7   75.2   21.0         49.7
##  2 Nagpur      375  12.5 114.  40.2  2.60  34.8   44.7   22.0         49.6
##  3 Thane       417  71.4 109.  39.1  0.966 11.1   93.3   31.9         61.3
##  4 Thane       430 191.   39.1 51.4  3.19  46.6   16.9   28.5         66.1
##  5 Srinagar    338 193.   44.6 79.7  7.19  12.8   97.5   34.0         63.3
##  6 Patna       454  31.0 167.  62.1  5.88  45.5   27.8   10.6         43.7
##  7 Rajkot      433 196.   63.9 67.6  0.346 22.4   45.1   10.4         28.1
##  8 Vadodara    359  44.4  43.4 65.3  6.23  23.6   37.9   20.5         87.0
##  9 Bangalore   316  60.0 135.  71.4  2.03   4.29  98.8   31.6         35.6
## 10 Delhi       333 219.  106.   8.52 7.56  35.9   52.3   13.6         95.1
## 11 Nagpur      390  44.5 166.  78.9  8.84  35.7   36.5   38.7         46.7
## 12 Ahmedabad   458 162.  267.   7.84 5.19  30.5   91.8   26.3         93.7
## # ℹ 14 more variables: wind_speed_kmh <dbl>, rainfall_mm <dbl>,
## #   pressure_hpa <dbl>, `vehicle count` <dbl>,
## #   `industrial activity index` <dbl>, `health impact score` <dbl>, age <dbl>,
## #   age_group <chr>, gender <chr>, disease_name <chr>, disease_category <chr>,
## #   severity <chr>, smoking_status <chr>, blood_group <chr>
##18 City AQI Ranking vs. 300 Threshold
#We rank cities by AQI and compare them to a danger level (300). This helps identify critically polluted cities.

db_final %>% group_by(city) %>% summarise(avg_aqi = mean(aqi)) %>% 
  ggplot(aes(x = reorder(city, avg_aqi), y = avg_aqi)) + 
  geom_col(fill="blue") + geom_hline(yintercept = 300, linetype="dashed") + coord_flip()

##19 Vehicle Density vs NO2 Correlation
#We analyze whether more vehicles lead to more NO₂. This confirms the role of traffic in pollution.

ggplot(db_final, aes(x = `vehicle count`, y = no2)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

##20 Relative Frequency of Diseases
#We show how often each disease occurs. This helps identify common health problems
ggplot(db_final, aes(x = fct_infreq(disease_name))) + geom_bar(fill="orange") + coord_flip()

##21 Severity Distribution across Genders
#We compare disease severity between genders. This helps check if one gender is more affected.
ggplot(db_final, aes(x = severity, fill = gender)) + geom_bar(position = "dodge") + theme_bw()

#Level 3

##22 Which pollutant has the strongest linear correlation with the health impact score?
#In this step, we are trying to find out which pollutant (like PM2.5, NO₂, SO₂, etc.) is most strongly related to the health impact score. We calculate correlation between all numeric variables and display it using a correlation plot. Correlation tells us how closely two variables move together: A value close to +1 means strong positive relationship (as pollution increases, health impact worsens). A value close to 0 means little or no relationship

#Why we did this: We want to identify the most harmful pollutant affecting health.

#Insight:
#The pollutant with the highest correlation with health impact score is likely the biggest contributor to health problems, and should be the main focus for control.

num_data <- db_final %>% select(where(is.numeric))
cor_matrix <- cor(num_data, use = "complete.obs")
## Warning in cor(num_data, use = "complete.obs"): the standard deviation is zero
corrplot(cor_matrix, method = "color", addCoef.col = "black", number.cex = 0.7)

##23 Does the mean aqi differ significantly across different severity levels?
#Here, we are checking whether people with different disease severity (Mild, Moderate, Severe, Critical) are exposed to different levels of air pollution (AQI).

#We use ANOVA because: We are comparing more than two groups, we want to see if their average AQI is significantly different. 
#The result shows a p-value = 0.695 (> 0.05) and F-value = 0.482, which means the model is not statistically significant. This indicates that AQI levels do not differ meaningfully across severity groups in this dataset, suggesting that pollution alone may not explain disease severity and other factors might be influencing it.

anovapply <- aov(aqi ~ severity, data = db_final)
summary(anovapply)
##               Df   Sum Sq Mean Sq F value Pr(>F)
## severity       3    24476    8159   0.482  0.695
## Residuals   1190 20140617   16925
# Post-hoc test to see which groups differ
TukeyHSD(anovapply)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = aqi ~ severity, data = db_final)
## 
## $severity
##                         diff       lwr      upr     p adj
## Mild-Critical       9.017778 -24.00580 42.04136 0.8961049
## Moderate-Critical  -2.709402 -31.73589 26.31709 0.9951295
## Severe-Critical    -4.093578 -31.37102 23.18386 0.9804369
## Moderate-Mild     -11.727179 -42.34365 18.88929 0.7578055
## Severe-Mild       -13.111355 -42.07495 15.85224 0.6493499
## Severe-Moderate    -1.384176 -25.69200 22.92365 0.9988791
##24 Is industrial activity significantly higher for patients with specific disease categories? Using anova
#The result shows a p-value = 0.98 (> 0.05), meaning the model is not significant. This suggests that industrial activity levels are almost the same across different disease categories, indicating no strong relationship between industrial pollution and specific diseases in this dataset.

industry <- aov(`industrial activity index` ~ disease_category, data = db_final)
summary(industry)
##                    Df Sum Sq Mean Sq F value Pr(>F)
## disease_category    1      0   0.006   0.001   0.98
## Residuals        1192  10111   8.482
##25 What is the 90th percentile of aqi, and which cities exceed this value?
#To identify extreme pollution cases, not just average conditions.

#Insight:
#Cities above this threshold represent: Highly dangerous pollution zones needing urgent attention
thresh_90 <- quantile(db_final$aqi, 0.90)
db_final %>% filter(aqi > thresh_90) %>% group_by(city) %>% summarise(count = n())
## # A tibble: 24 × 2
##    city      count
##    <chr>     <int>
##  1 Agra          2
##  2 Ahmedabad     6
##  3 Bangalore     3
##  4 Bhopal        5
##  5 Chennai       5
##  6 Delhi         4
##  7 Hyderabad     8
##  8 Indore        9
##  9 Jaipur        3
## 10 Kolkata       4
## # ℹ 14 more rows
##26 Can we predict the health impact score based on pm2.5? (Independent: pm2.5)
#We build a model to see how PM2.5 affects health impact score.

#Linear regression creates a relationship like: “If PM2.5 increases, how much does health impact increase?”
#If PM2.5 is significant:It is a strong predictor of health damage
#The model shows an overall significant F-statistic (p < 2.2e-16), but the PM2.5 coefficient itself has a p-value = 0.155 (> 0.05), meaning it is not individually significant. Although the model shows R² = 0.50, indicating 50% variation explained, the PM2.5 variable itself does not strongly predict health impact. This suggests the relationship might be influenced by other hidden factors or data issues.

lr_model <- lm(`health impact score` ~ pm2.5, data = db_final)
summary(lr_model)
## 
## Call:
## lm(formula = `health impact score` ~ pm2.5, data = db_final)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.286e-12 -1.400e-15  5.800e-15  1.350e-14  2.130e-14 
## 
## Coefficients:
##               Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)  1.000e+01  1.277e-14  7.828e+14   <2e-16 ***
## pm2.5       -1.247e-16  8.758e-17 -1.424e+00    0.155    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.113e-13 on 1192 degrees of freedom
## Multiple R-squared:  0.5001, Adjusted R-squared:  0.4996 
## F-statistic:  1192 on 1 and 1192 DF,  p-value: < 2.2e-16
# Plotting the regression line
ggplot(db_final, aes(x = pm2.5, y = `health impact score`)) + 
  geom_point() + geom_smooth(method = "lm", col = "red")
## `geom_smooth()` using formula = 'y ~ x'

##27 Does wind speed show a non-linear relationship with aqi? (Independent: wind speed)
#Here, the results show p-values = 0.871 and 0.732 for polynomial terms and overall model p-value = 0.9308 (> 0.05) with R² ≈ 0.0001, meaning the model is not significant. This indicates that wind speed does not have a meaningful effect on AQI in this dataset, and adding a non-linear term does not improve understanding

poly_model <- lm(aqi ~ poly(wind_speed_kmh, 2), data = db_final)
summary(poly_model)
## 
## Call:
## lm(formula = aqi ~ poly(wind_speed_kmh, 2), data = db_final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -223.094 -120.825    4.031  106.069  229.026 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               271.446      3.765  72.089   <2e-16 ***
## poly(wind_speed_kmh, 2)1  -21.144    130.112  -0.163    0.871    
## poly(wind_speed_kmh, 2)2   44.515    130.112   0.342    0.732    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 130.1 on 1191 degrees of freedom
## Multiple R-squared:  0.0001204,  Adjusted R-squared:  -0.001559 
## F-statistic: 0.07173 on 2 and 1191 DF,  p-value: 0.9308
# Visualization
ggplot(db_final, aes(x = wind_speed_kmh, y = aqi)) + 
  geom_point() + geom_smooth(method = "lm", formula = y ~ poly(x, 2), col = "blue")

##28 How do vehicle count and industrial_activity_index combined impact no2 levels?
#We used multiple regression to study the combined effect of vehicle count and industrial activity on NO₂ levels. The results show p-values = 0.561 (vehicle) and 0.242 (industry) and overall model p-value = 0.4341 (> 0.05) with R² = 0.0014, meaning the model is not significant. This suggests that neither vehicles nor industrial activity strongly explain NO₂ levels in this dataset, and other pollution sources may be more important.

mlr_model <- lm(no2 ~ `vehicle count` + `industrial activity index`, data = db_final)
summary(mlr_model)
## 
## Call:
## lm(formula = no2 ~ `vehicle count` + `industrial activity index`, 
##     data = db_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -39.239 -20.219   0.545  19.142  38.572 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                 4.081e+01  1.679e+00  24.311   <2e-16 ***
## `vehicle count`             2.558e-06  4.400e-06   0.581    0.561    
## `industrial activity index` 2.587e-01  2.209e-01   1.171    0.242    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 22.2 on 1191 degrees of freedom
## Multiple R-squared:  0.0014, Adjusted R-squared:  -0.0002765 
## F-statistic: 0.8351 on 2 and 1191 DF,  p-value: 0.4341
##29 Which city has the highest variability (Standard Deviation) in its air quality?
#AQI standard deviation calculation to measure stability across different cities.Pinpoints the city with the most unpredictable air quality, highlighting where management is most difficult.

db_final %>% group_by(city) %>% summarise(sd_aqi = sd(aqi)) %>% arrange(desc(sd_aqi))
## # A tibble: 24 × 2
##    city     sd_aqi
##    <chr>     <dbl>
##  1 Chennai    143.
##  2 Nashik     142.
##  3 Indore     141.
##  4 Bhopal     140.
##  5 Mumbai     139.
##  6 Rajkot     139.
##  7 Patna      139.
##  8 Vadodara   137.
##  9 Nagpur     135.
## 10 Ludhiana   135.
## # ℹ 14 more rows
##30 Visualize the distribution and scatter relationships between all primary pollutants.
#Pair plot visualization to detect correlations between major pollutants.Reveals shared emission sources and identifies pollutant clusters that can be regulated together.

ggpairs(db_final, columns = c("pm2.5", "pm10", "no2", "co", "so2"))

##31 Create a 3-tier risk category (Low, Mid, High) based on percentiles of the health impact score
#Grouping health impact scores into Low, Medium, and High risk tiers. Simplifies complex data to identify vulnerable populations for targeted medical intervention.

db_final <- db_final %>% 
  mutate(risk_tier = ntile(`health impact score`, 3))
table(db_final$risk_tier)
## 
##   1   2   3 
## 398 398 398
##32 What is the frequency distribution of severity among different smoking_status groups?
#Comparison of disease severity across smoking demographics (smokers, non-smokers, former).Confirms whether smoking exacerbates the seriousness of pollution-related health conditions.

table(db_final$smoking_status, db_final$severity)
##          
##           Critical Mild Moderate Severe
##   Current       46   40       76     89
##   Former        59   38       43     96
##   Never        120  111      206    270
##33 Does smoking_status change the effect of aqi on health impact score?

#The overall model is statistically significant (p < 0.05), but AQI alone is not significant (p = 1.000). However, the interaction term for former smokers is significant (p = 0.01097), showing that AQI affects health only for this group.

interaction_model <- lm(`health impact score` ~ aqi * smoking_status, data = db_final)
summary(interaction_model)
## 
## Call:
## lm(formula = `health impact score` ~ aqi * smoking_status, data = db_final)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.192e-12  0.000e+00  0.000e+00  0.000e+00  1.157e-13 
## 
## Coefficients:
##                            Estimate Std. Error    t value Pr(>|t|)    
## (Intercept)               1.000e+01  3.140e-14  3.185e+14  < 2e-16 ***
## aqi                       1.081e-28  1.062e-16  0.000e+00  1.00000    
## smoking_statusFormer     -1.348e-13  4.441e-14 -3.036e+00  0.00245 ** 
## smoking_statusNever       4.800e-26  3.632e-14  0.000e+00  1.00000    
## aqi:smoking_statusFormer  3.755e-16  1.474e-16  2.548e+00  0.01097 *  
## aqi:smoking_statusNever  -1.446e-28  1.223e-16  0.000e+00  1.00000    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.103e-13 on 1188 degrees of freedom
## Multiple R-squared:  0.5001, Adjusted R-squared:  0.498 
## F-statistic: 237.7 on 5 and 1188 DF,  p-value: < 2.2e-16
#The result for (AQI ~ PM2.5) shows p-value = 0.353 (> 0.05) and R² ≈ 0.0007, meaning the model is not significant.


##34 training. testing and prediction
#Data partitioning to build and then validate the predictive model.Provides accuracy metrics and ensures the model is reliable for predicting future pollution impacts.

#The model shows extremely low error (RMSE ≈ 0) but R² is NA, indicating a near-perfect or unstable prediction. This suggests the model may be overfitting or the data may have very little variation, meaning predictions may not be reliable in real-world scenarios.

##creating Partition 80:20
trainIndex <- createDataPartition(db_final$aqi, p = .8, list = FALSE)
train_data <- db_final[trainIndex,]
test_data  <- db_final[-trainIndex,]

##Train Model (AQI predicted by PM2.5)
model_pm25 <- lm(aqi ~ pm2.5, data = train_data)

##Prediction
predictions_pm25 <- predict(model_pm25, newdata = test_data)

##Evaluate
metrics_pm25 <- postResample(pred = predictions_pm25, obs = test_data$aqi)
print(metrics_pm25)
##         RMSE     Rsquared          MAE 
## 1.306434e+02 6.498142e-04 1.123021e+02
##Visualization
ggplot(test_data, aes(x = aqi, y = predictions_pm25)) +
  geom_point(color = "darkorange", alpha = 0.5) +
  geom_smooth(method = "lm", color = "darkred", linetype = "dashed") +
  labs(title = "Refined Regression: AQI vs. PM2.5",
       subtitle = "Testing core pollutant correlation",
       x = "Actual AQI",
       y = "Predicted AQI") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

##35 Applying polynomial regression
#We applied a polynomial model to check if a curved relationship improves prediction. The results show p-values = 0.406 and 0.739 and overall model p-value = 0.6701 (> 0.05), meaning the model is not significant. This indicates that even a non-linear approach does not improve prediction, and PM2.5 is not a strong predictor of AQI here.


##Train Polynomial Model
## We use poly(x, 2) to include both linear and squared terms
poly_model <- lm(aqi ~ poly(pm2.5, 2), data = train_data)

##Summary
summary(poly_model)
## 
## Call:
## lm(formula = aqi ~ poly(pm2.5, 2), data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -231.29 -120.18   -0.08  107.13  231.93 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      271.586      4.199  64.684   <2e-16 ***
## poly(pm2.5, 2)1  109.611    129.886   0.844    0.399    
## poly(pm2.5, 2)2  105.025    129.886   0.809    0.419    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 129.9 on 954 degrees of freedom
## Multiple R-squared:  0.00143,    Adjusted R-squared:  -0.0006636 
## F-statistic: 0.683 on 2 and 954 DF,  p-value: 0.5053
##Predict on Test Data
predictions_poly <- predict(poly_model, newdata = test_data)

##Performance Metrics
metrics_poly <- postResample(pred = predictions_poly, obs = test_data$aqi)
print(metrics_poly)
##         RMSE     Rsquared          MAE 
## 1.307783e+02 1.109301e-06 1.125074e+02
##Visualization
ggplot(test_data, aes(x = aqi, y = predictions_poly)) +
  geom_point(color = "orange", alpha = 0.5) +
  geom_smooth(method = "lm", formula = y ~ poly(x, 2), color = "purple", linetype = "dashed") +
  labs(title = "Polynomial Regression: AQI vs. PM2.5 (Degree 2)",
       subtitle = "Exploring non-linear air quality trends",
       x = "Actual AQI",
       y = "Predicted AQI") +
  theme_minimal()

##36 Is the Polynomial model significantly better than the Linear model?
#We compared linear and polynomial models using ANOVA. The result shows p-value = 0.7387 (> 0.05), meaning the polynomial model is not significantly better than the linear model. This suggests that adding complexity does not improve the model, so the simpler linear model is sufficient.

anova(model_pm25, poly_model)
## Analysis of Variance Table
## 
## Model 1: aqi ~ pm2.5
## Model 2: aqi ~ poly(pm2.5, 2)
##   Res.Df      RSS Df Sum of Sq      F Pr(>F)
## 1    955 16105489                           
## 2    954 16094459  1     11030 0.6538  0.419
##37 Numeric Exploration: Identifying Skewness in Pollutants, using standard deviation and mean to check for high-variability pollutants
#Calculates the average and standard deviation for each pollutant to identify those that are both high and unstable. The outcome highlights which pollutants fluctuate most, making them more dangerous and difficult for authorities to regulate.
db_final %>% 
  summarise(across(c(pm2.5, pm10, no2, so2, o3), list(mean = mean, sd = sd)))
## # A tibble: 1 × 10
##   pm2.5_mean pm2.5_sd pm10_mean pm10_sd no2_mean no2_sd so2_mean so2_sd o3_mean
##        <dbl>    <dbl>     <dbl>   <dbl>    <dbl>  <dbl>    <dbl>  <dbl>   <dbl>
## 1       128.     69.8      163.    80.6     42.7   22.2     25.6   13.9    53.4
## # ℹ 1 more variable: o3_sd <dbl>
##38 Percentile Analysis: The "Top 5% Toxic" cases
#Focuses on the extreme 5% of AQI values to analyze the most severe pollution events. The outcome identifies specific high-risk cities frequently experiencing extreme toxicity, signaling a need for urgent emergency intervention.

top_toxic <- quantile(db_final$aqi, 0.95)
db_final %>% filter(aqi >= top_toxic) %>% count(city, sort = TRUE)
## # A tibble: 23 × 2
##    city          n
##    <chr>     <int>
##  1 Nashik        6
##  2 Ahmedabad     4
##  3 Hyderabad     4
##  4 Patna         4
##  5 Vadodara      4
##  6 Indore        3
##  7 Jaipur        3
##  8 Kolkata       3
##  9 Lucknow       3
## 10 Nagpur        3
## # ℹ 13 more rows
##39 Standard Deviation: Which city has the most unpredictable air?
#Uses standard deviation to find the specific location where air quality changes most frequently. The outcome identifies the city with the least stable AQI, pinpointing where pollution management and long-term planning are most difficult

db_final %>% 
  group_by(city) %>% 
  summarise(aqi_stability = sd(aqi)) %>% 
  arrange(desc(aqi_stability))
## # A tibble: 24 × 2
##    city     aqi_stability
##    <chr>            <dbl>
##  1 Chennai           143.
##  2 Nashik            142.
##  3 Indore            141.
##  4 Bhopal            140.
##  5 Mumbai            139.
##  6 Rajkot            139.
##  7 Patna             139.
##  8 Vadodara          137.
##  9 Nagpur            135.
## 10 Ludhiana          135.
## # ℹ 14 more rows
##40 Does rain actually mitigate the impact of industry on AQI?
#The results show all p-values > 0.05 (interaction p = 0.759) and overall model p-value = 0.7485, meaning the model is not significant. This indicates that rainfall does not significantly reduce the effect of industrial activity on AQI in this dataset.
rain <- lm(aqi ~ rainfall_mm * `industrial activity index`, data = db_final)
summary(rain)
## 
## Call:
## lm(formula = aqi ~ rainfall_mm * `industrial activity index`, 
##     data = db_final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -226.771 -119.676    5.066  106.236  231.133 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                             269.034851  14.701483  18.300   <2e-16
## rainfall_mm                               0.023053   0.084082   0.274    0.784
## `industrial activity index`              -0.922411   2.609441  -0.353    0.724
## rainfall_mm:`industrial activity index`   0.004661   0.015193   0.307    0.759
##                                            
## (Intercept)                             ***
## rainfall_mm                                
## `industrial activity index`                
## rainfall_mm:`industrial activity index`    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 130.1 on 1190 degrees of freedom
## Multiple R-squared:  0.001023,   Adjusted R-squared:  -0.001495 
## F-statistic: 0.4063 on 3 and 1190 DF,  p-value: 0.7485