library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyselect)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.2     ✔ tidyr     1.3.2
## ✔ readr     2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(readxl)

db<- read_excel("C:\\Users\\VICTUS\\Desktop\\airdataset.xlsx")
## New names:
## • `` -> `...17`
head(db)
## # A tibble: 6 × 25
##   City        AQI PM2.5  PM10   NO2    CO   SO2    O3 `Temperature (°C)`
##   <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>              <dbl>
## 1 Rajkot       59 227.   160.  44.8 5.99  37.1   58.1               11.9
## 2 Bangalore   443 220.   280.  78.2 5.06  27.1   40.8               16.7
## 3 Bhopal       56  94.8  262.  70.0 6.59  34.6   42.7               30.4
## 4 Srinagar    342 121.   213.  71.8 4.44  32.3   97.0               32.7
## 5 Hyderabad   492 184.   165.  16.2 0.554  6.72  22.8               12.5
## 6 Bangalore   235  73.9  116.  68.1 7.37  36.7   44.8               41.2
## # ℹ 16 more variables: `Humidity (%)` <dbl>, `Wind Speed (km/h)` <dbl>,
## #   `Rainfall (mm)` <dbl>, `Pressure (hPa)` <dbl>, `Vehicle Count` <dbl>,
## #   `Industrial Activity Index` <dbl>, `Health Impact Score` <dbl>,
## #   ...17 <lgl>, age <dbl>, age_group <chr>, gender <chr>, disease_name <chr>,
## #   disease_category <chr>, severity <chr>, smoking_status <chr>,
## #   blood_group <chr>
str(db)
## tibble [1,194 × 25] (S3: tbl_df/tbl/data.frame)
##  $ City                     : chr [1:1194] "Rajkot" "Bangalore" "Bhopal" "Srinagar" ...
##  $ AQI                      : num [1:1194] 59 443 56 342 492 235 421 476 183 259 ...
##  $ PM2.5                    : num [1:1194] 227.4 219.7 94.8 121 184.3 ...
##  $ PM10                     : num [1:1194] 160 280 262 213 165 ...
##  $ NO2                      : num [1:1194] 44.8 78.2 70 71.8 16.2 ...
##  $ CO                       : num [1:1194] 5.986 5.062 6.589 4.443 0.554 ...
##  $ SO2                      : num [1:1194] 37.13 27.14 34.63 32.3 6.72 ...
##  $ O3                       : num [1:1194] 58.1 40.8 42.7 97 22.8 ...
##  $ Temperature (°C)         : num [1:1194] 11.9 16.7 30.4 32.7 12.5 ...
##  $ Humidity (%)             : num [1:1194] 64.4 23.1 63.6 69.7 93.2 ...
##  $ Wind Speed (km/h)        : num [1:1194] 12.9 19.4 13.4 19.2 10.4 ...
##  $ Rainfall (mm)            : num [1:1194] 14.5 34.8 194.4 233.9 91.8 ...
##  $ Pressure (hPa)           : num [1:1194] 976 1022 985 950 1021 ...
##  $ Vehicle Count            : num [1:1194] 198941 35588 430785 209382 257650 ...
##  $ Industrial Activity Index: num [1:1194] 4.86 5.26 3.8 4.43 9.35 ...
##  $ Health Impact Score      : num [1:1194] 10 10 10 10 10 10 10 10 10 10 ...
##  $ ...17                    : logi [1:1194] NA NA NA NA NA NA ...
##  $ age                      : num [1:1194] 41 51 39 56 26 43 73 71 52 29 ...
##  $ age_group                : chr [1:1194] "40-49" "50-59" "30-39" "50-59" ...
##  $ gender                   : chr [1:1194] "Male" "Female" "Female" "Male" ...
##  $ disease_name             : chr [1:1194] "Swine Flu (H1N1)" "Swine Flu (H1N1)" "COPD" "Stroke" ...
##  $ disease_category         : chr [1:1194] "Respiratory" "Respiratory" "Respiratory" "Non-Communicable" ...
##  $ severity                 : chr [1:1194] "Critical" "Severe" "Severe" "Severe" ...
##  $ smoking_status           : chr [1:1194] "Former" "Current" "Never" "Current" ...
##  $ blood_group              : chr [1:1194] "O+" "B+" "A+" "A+" ...
summary(db)
##      City                AQI            PM2.5             PM10       
##  Length:1194        Min.   : 50.0   Min.   : 10.18   Min.   : 20.10  
##  Class :character   1st Qu.:151.0   1st Qu.: 67.84   1st Qu.: 94.71  
##  Mode  :character   Median :276.0   Median :125.80   Median :165.70  
##                     Mean   :271.4   Mean   :128.06   Mean   :163.14  
##                     3rd Qu.:377.8   3rd Qu.:187.68   3rd Qu.:230.02  
##                     Max.   :499.0   Max.   :249.99   Max.   :299.94  
##       NO2               CO               SO2               O3        
##  Min.   : 5.038   Min.   : 0.1017   Min.   : 2.037   Min.   : 5.147  
##  1st Qu.:22.251   1st Qu.: 2.8110   1st Qu.:12.983   1st Qu.:29.983  
##  Median :43.100   Median : 5.1492   Median :25.466   Median :53.548  
##  Mean   :42.696   Mean   : 5.1015   Mean   :25.555   Mean   :53.387  
##  3rd Qu.:62.056   3rd Qu.: 7.3267   3rd Qu.:37.304   3rd Qu.:77.049  
##  Max.   :79.959   Max.   : 9.9999   Max.   :49.973   Max.   :99.978  
##  Temperature (°C)  Humidity (%)   Wind Speed (km/h) Rainfall (mm)     
##  Min.   :10.05    Min.   :20.03   Min.   : 0.5004   Min.   :  0.5863  
##  1st Qu.:17.88    1st Qu.:43.64   1st Qu.: 5.2455   1st Qu.: 75.5070  
##  Median :27.34    Median :63.21   Median :10.1733   Median :150.6521  
##  Mean   :27.19    Mean   :61.65   Mean   :10.1204   Mean   :151.3860  
##  3rd Qu.:36.30    3rd Qu.:81.05   3rd Qu.:14.8117   3rd Qu.:227.7400  
##  Max.   :44.99    Max.   :99.93   Max.   :19.9749   Max.   :299.0146  
##  Pressure (hPa)   Vehicle Count    Industrial Activity Index
##  Min.   : 950.1   Min.   :  1653   Min.   :0.0000586        
##  1st Qu.: 975.8   1st Qu.:116002   1st Qu.:2.1675619        
##  Median :1001.1   Median :247869   Median :4.7733041        
##  Mean   :1000.7   Mean   :248505   Mean   :4.8326865        
##  3rd Qu.:1026.4   3rd Qu.:377682   3rd Qu.:7.3468181        
##  Max.   :1049.8   Max.   :499353   Max.   :9.9805916        
##  Health Impact Score  ...17              age         age_group        
##  Min.   :10          Mode:logical   Min.   : 0.00   Length:1194       
##  1st Qu.:10          NA's:1194      1st Qu.:40.00   Class :character  
##  Median :10                         Median :54.00   Mode  :character  
##  Mean   :10                         Mean   :53.19                     
##  3rd Qu.:10                         3rd Qu.:68.00                     
##  Max.   :10                         Max.   :95.00                     
##     gender          disease_name       disease_category     severity        
##  Length:1194        Length:1194        Length:1194        Length:1194       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  smoking_status     blood_group       
##  Length:1194        Length:1194       
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
colnames(db)
##  [1] "City"                      "AQI"                      
##  [3] "PM2.5"                     "PM10"                     
##  [5] "NO2"                       "CO"                       
##  [7] "SO2"                       "O3"                       
##  [9] "Temperature (°C)"          "Humidity (%)"             
## [11] "Wind Speed (km/h)"         "Rainfall (mm)"            
## [13] "Pressure (hPa)"            "Vehicle Count"            
## [15] "Industrial Activity Index" "Health Impact Score"      
## [17] "...17"                     "age"                      
## [19] "age_group"                 "gender"                   
## [21] "disease_name"              "disease_category"         
## [23] "severity"                  "smoking_status"           
## [25] "blood_group"
db_clean <- db %>%
  select(!contains("Unnamed")) %>%
  rename(
    temp_c = `Temperature (°C)`,
    humidity_pct = `Humidity (%)`,
    wind_speed_kmh = `Wind Speed (km/h)`,
    rainfall_mm = `Rainfall (mm)`,
    pressure_hpa = `Pressure (hPa)`
  ) %>%
  rename_with(tolower)

db_clean
## # A tibble: 1,194 × 25
##    city        aqi pm2.5  pm10   no2    co   so2    o3 temp_c humidity_pct
##    <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>        <dbl>
##  1 Rajkot       59 227.  160.   44.8 5.99  37.1  58.1    11.9         64.4
##  2 Bangalore   443 220.  280.   78.2 5.06  27.1  40.8    16.7         23.1
##  3 Bhopal       56  94.8 262.   70.0 6.59  34.6  42.7    30.4         63.6
##  4 Srinagar    342 121.  213.   71.8 4.44  32.3  97.0    32.7         69.7
##  5 Hyderabad   492 184.  165.   16.2 0.554  6.72 22.8    12.5         93.2
##  6 Bangalore   235  73.9 116.   68.1 7.37  36.7  44.8    41.2         65.3
##  7 Jaipur      421 146.  180.   26.2 8.98   4.55 72.1    14.9         74.7
##  8 Nashik      476 126.   43.9  61.0 9.91   9.94  8.63   29.0         74.9
##  9 Chennai     183 220.  145.   55.0 4.57  13.3  54.4    14.6         22.3
## 10 Srinagar    259  24.8 150.   41.9 5.27  39.2  91.4    15.2         28.1
## # ℹ 1,184 more rows
## # ℹ 15 more variables: wind_speed_kmh <dbl>, rainfall_mm <dbl>,
## #   pressure_hpa <dbl>, `vehicle count` <dbl>,
## #   `industrial activity index` <dbl>, `health impact score` <dbl>,
## #   ...17 <lgl>, age <dbl>, age_group <chr>, gender <chr>, disease_name <chr>,
## #   disease_category <chr>, severity <chr>, smoking_status <chr>,
## #   blood_group <chr>
#Removing missing values
db_clean <- db_clean %>%
  drop_na(city, aqi, pm2.5, age, disease_name)

#Converting Categorical Data to Factors

db_final <- db_clean %>%
  mutate(
    gender = as.factor(gender),
    disease_category = as.factor(disease_category),
    severity = factor(severity, levels = c("Mild", "Moderate", "Severe", "Critical"), ordered = TRUE),
    smoking_status = as.factor(smoking_status)
  )

db_final <-db_clean %>%
  select(where(~!all(is.na(.))))

summary(db_final)
##      city                aqi            pm2.5             pm10       
##  Length:1194        Min.   : 50.0   Min.   : 10.18   Min.   : 20.10  
##  Class :character   1st Qu.:151.0   1st Qu.: 67.84   1st Qu.: 94.71  
##  Mode  :character   Median :276.0   Median :125.80   Median :165.70  
##                     Mean   :271.4   Mean   :128.06   Mean   :163.14  
##                     3rd Qu.:377.8   3rd Qu.:187.68   3rd Qu.:230.02  
##                     Max.   :499.0   Max.   :249.99   Max.   :299.94  
##       no2               co               so2               o3        
##  Min.   : 5.038   Min.   : 0.1017   Min.   : 2.037   Min.   : 5.147  
##  1st Qu.:22.251   1st Qu.: 2.8110   1st Qu.:12.983   1st Qu.:29.983  
##  Median :43.100   Median : 5.1492   Median :25.466   Median :53.548  
##  Mean   :42.696   Mean   : 5.1015   Mean   :25.555   Mean   :53.387  
##  3rd Qu.:62.056   3rd Qu.: 7.3267   3rd Qu.:37.304   3rd Qu.:77.049  
##  Max.   :79.959   Max.   : 9.9999   Max.   :49.973   Max.   :99.978  
##      temp_c       humidity_pct   wind_speed_kmh     rainfall_mm      
##  Min.   :10.05   Min.   :20.03   Min.   : 0.5004   Min.   :  0.5863  
##  1st Qu.:17.88   1st Qu.:43.64   1st Qu.: 5.2455   1st Qu.: 75.5070  
##  Median :27.34   Median :63.21   Median :10.1733   Median :150.6521  
##  Mean   :27.19   Mean   :61.65   Mean   :10.1204   Mean   :151.3860  
##  3rd Qu.:36.30   3rd Qu.:81.05   3rd Qu.:14.8117   3rd Qu.:227.7400  
##  Max.   :44.99   Max.   :99.93   Max.   :19.9749   Max.   :299.0146  
##   pressure_hpa    vehicle count    industrial activity index
##  Min.   : 950.1   Min.   :  1653   Min.   :0.0000586        
##  1st Qu.: 975.8   1st Qu.:116002   1st Qu.:2.1675619        
##  Median :1001.1   Median :247869   Median :4.7733041        
##  Mean   :1000.7   Mean   :248505   Mean   :4.8326865        
##  3rd Qu.:1026.4   3rd Qu.:377682   3rd Qu.:7.3468181        
##  Max.   :1049.8   Max.   :499353   Max.   :9.9805916        
##  health impact score      age         age_group            gender         
##  Min.   :10          Min.   : 0.00   Length:1194        Length:1194       
##  1st Qu.:10          1st Qu.:40.00   Class :character   Class :character  
##  Median :10          Median :54.00   Mode  :character   Mode  :character  
##  Mean   :10          Mean   :53.19                                        
##  3rd Qu.:10          3rd Qu.:68.00                                        
##  Max.   :10          Max.   :95.00                                        
##  disease_name       disease_category     severity         smoking_status    
##  Length:1194        Length:1194        Length:1194        Length:1194       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  blood_group       
##  Length:1194       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
names(db_final)
##  [1] "city"                      "aqi"                      
##  [3] "pm2.5"                     "pm10"                     
##  [5] "no2"                       "co"                       
##  [7] "so2"                       "o3"                       
##  [9] "temp_c"                    "humidity_pct"             
## [11] "wind_speed_kmh"            "rainfall_mm"              
## [13] "pressure_hpa"              "vehicle count"            
## [15] "industrial activity index" "health impact score"      
## [17] "age"                       "age_group"                
## [19] "gender"                    "disease_name"             
## [21] "disease_category"          "severity"                 
## [23] "smoking_status"            "blood_group"
#1 Average AQI across all cities
mean(db_final$aqi, na.rm = TRUE)
## [1] 271.4456
#2 Distribution of Temp and Humidity
ggplot(db_final) + geom_density(aes(x = temp_c), fill="red", alpha=0.3) + labs(title="Temp Distribution")

ggplot(db_final) + geom_density(aes(x = humidity_pct), fill="blue", alpha=0.3) + labs(title="Humidity Distribution")

#3 Cities with "Good" AQI (< 50)
db_final %>% filter(aqi < 50) %>% select(city) %>% unique()
## # A tibble: 0 × 1
## # ℹ 1 variable: city <chr>
#4 AQI Skewness (Histogram)
ggplot(db_final, aes(x = aqi)) + geom_histogram(bins = 30, fill = "steelblue", color = "white") + labs(title="AQI Histogram")

#5 City with highest Ozone (O3)
db_final %>% slice_max(o3, n = 1) %>% select(city, o3)
## # A tibble: 1 × 2
##   city     o3
##   <chr> <dbl>
## 1 Thane 100.0
#6 Most common disease
db_final %>% count(disease_name, sort = TRUE) %>% head(1)
## # A tibble: 1 × 2
##   disease_name     n
##   <chr>        <int>
## 1 Lung Cancer    274
#7 Gender distribution
db_final %>% count(gender)
## # A tibble: 3 × 2
##   gender     n
##   <chr>  <int>
## 1 Female   573
## 2 Male     605
## 3 Other     16
#8 Most frequent age group
db_final %>% count(age_group, sort = TRUE) %>% head(1)
## # A tibble: 1 × 2
##   age_group     n
##   <chr>     <int>
## 1 50-59       228
#9 PM10 vs AQI Variation by City
ggplot(db_final, aes(x = pm10, y = aqi, color = city)) + geom_point() + facet_wrap(~city)

#10 Former Smokers: Disease Category distribution
db_final %>% filter(smoking_status == "Former") %>% count(disease_category)
## # A tibble: 2 × 2
##   disease_category     n
##   <chr>            <int>
## 1 Non-Communicable   113
## 2 Respiratory        123
#11 AQI vs Disease Severity (Boxplot)
ggplot(db_final, aes(x = severity, y = aqi, fill = severity)) + geom_boxplot() + theme_minimal()

#12 PM2.5 vs Health Impact Score (Scatter)
ggplot(db_final, aes(x = pm2.5, y = `health impact score`)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#13 Which city shows the highest "Vehicle Count" but relatively lower NO2?
b <- db_final %>%
  group_by(city) %>%
  summarise(
    avg_vehicles = mean(`vehicle count`, na.rm = TRUE),
    avg_no2 = mean(no2, na.rm = TRUE)
  ) %>%
  mutate(efficiency_ratio = avg_vehicles / avg_no2) %>%
  arrange(desc(efficiency_ratio))
head(b, 1)
## # A tibble: 1 × 4
##   city     avg_vehicles avg_no2 efficiency_ratio
##   <chr>           <dbl>   <dbl>            <dbl>
## 1 Varanasi      264021.    36.8            7167.
ggplot(b, aes(x = avg_vehicles, y = avg_no2, label = city)) +
  geom_point(color = "darkgreen", size = 3)+
  labs(title = "Vehicle Count vs NO2 Levels by City",
       x = "Average Vehicle Count",
       y = "Average NO2 Level") +
  theme_minimal()

#14 Top 5 Cities: Worst Air & Highest Severity
db_final %>%
  group_by(city) %>%
  summarise(
    avg_aqi = mean(aqi, na.rm = TRUE),
    avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)
  ) %>%
  arrange(desc(avg_aqi))
## Warning: There were 24 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)`.
## ℹ In group 1: `city = "Agra"`.
## Caused by warning in `mean()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 23 remaining warnings.
## # A tibble: 24 × 3
##    city      avg_aqi avg_severity_score
##    <chr>       <dbl>              <dbl>
##  1 Ahmedabad    317.                NaN
##  2 Nagpur       296.                NaN
##  3 Jaipur       293.                NaN
##  4 Indore       286.                NaN
##  5 Thane        283.                NaN
##  6 Patna        280.                NaN
##  7 Mumbai       277.                NaN
##  8 Hyderabad    274.                NaN
##  9 Surat        274.                NaN
## 10 Nashik       271.                NaN
## # ℹ 14 more rows
#15 Pie Chart of Health Severity
db_final %>% count(severity) %>% 
  ggplot(aes(x = "", y = n, fill = severity)) + 
  geom_bar(stat = "identity") + coord_polar("y") + theme_void()

#16 Summary Table by City
summary_table <- db_final %>% 
  group_by(city) %>% 
  summarise(mean_aqi = mean(aqi), max_severity = max(severity), total_patients = n())
print(summary_table)
## # A tibble: 24 × 4
##    city      mean_aqi max_severity total_patients
##    <chr>        <dbl> <chr>                 <int>
##  1 Agra          263. Severe                   54
##  2 Ahmedabad     317. Severe                   48
##  3 Bangalore     264. Severe                   52
##  4 Bhopal        255. Severe                   53
##  5 Chennai       267. Severe                   46
##  6 Delhi         233. Severe                   39
##  7 Hyderabad     274. Severe                   51
##  8 Indore        286. Severe                   40
##  9 Jaipur        293. Severe                   42
## 10 Kolkata       265. Severe                   55
## # ℹ 14 more rows
#17 Young patients (<30) in high AQI showing "Severe"
db_final %>% filter(age < 30, aqi > 300, severity == "Severe")
## # A tibble: 12 × 24
##    city        aqi pm2.5  pm10   no2    co   so2    o3 temp_c humidity_pct
##    <chr>     <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>        <dbl>
##  1 Pune        359  90.0 145.  29.6  6.78  12.7   75.2   21.0         49.7
##  2 Nagpur      375  12.5 114.  40.2  2.60  34.8   44.7   22.0         49.6
##  3 Thane       417  71.4 109.  39.1  0.966 11.1   93.3   31.9         61.3
##  4 Thane       430 191.   39.1 51.4  3.19  46.6   16.9   28.5         66.1
##  5 Srinagar    338 193.   44.6 79.7  7.19  12.8   97.5   34.0         63.3
##  6 Patna       454  31.0 167.  62.1  5.88  45.5   27.8   10.6         43.7
##  7 Rajkot      433 196.   63.9 67.6  0.346 22.4   45.1   10.4         28.1
##  8 Vadodara    359  44.4  43.4 65.3  6.23  23.6   37.9   20.5         87.0
##  9 Bangalore   316  60.0 135.  71.4  2.03   4.29  98.8   31.6         35.6
## 10 Delhi       333 219.  106.   8.52 7.56  35.9   52.3   13.6         95.1
## 11 Nagpur      390  44.5 166.  78.9  8.84  35.7   36.5   38.7         46.7
## 12 Ahmedabad   458 162.  267.   7.84 5.19  30.5   91.8   26.3         93.7
## # ℹ 14 more variables: wind_speed_kmh <dbl>, rainfall_mm <dbl>,
## #   pressure_hpa <dbl>, `vehicle count` <dbl>,
## #   `industrial activity index` <dbl>, `health impact score` <dbl>, age <dbl>,
## #   age_group <chr>, gender <chr>, disease_name <chr>, disease_category <chr>,
## #   severity <chr>, smoking_status <chr>, blood_group <chr>
#18 City AQI Ranking vs. 300 Threshold
db_final %>% group_by(city) %>% summarise(avg_aqi = mean(aqi)) %>% 
  ggplot(aes(x = reorder(city, avg_aqi), y = avg_aqi)) + 
  geom_col(fill="blue") + geom_hline(yintercept = 300, linetype="dashed") + coord_flip()

#19 Vehicle Density vs NO2 Correlation
ggplot(db_final, aes(x = `vehicle count`, y = no2)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#20 Relative Frequency of Diseases
ggplot(db_final, aes(x = fct_infreq(disease_name))) + geom_bar(fill="orange") + coord_flip()

#21 Severity Distribution across Genders
ggplot(db_final, aes(x = severity, fill = gender)) + geom_bar(position = "dodge") + theme_bw()