library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(tidyselect)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats   1.0.1     ✔ stringr   1.6.0
## ✔ lubridate 1.9.5     ✔ tibble    3.3.1
## ✔ purrr     1.2.2     ✔ tidyr     1.3.2
## ✔ readr     2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(readxl)

db<- read_excel("C:\\Users\\riya\\Downloads\\Book 1.xlsx")
## New names:
## • `` -> `...17`
db
head(db)
str(db)
## tibble [599 × 25] (S3: tbl_df/tbl/data.frame)
##  $ City                     : chr [1:599] "Rajkot" "Bangalore" "Bhopal" "Srinagar" ...
##  $ AQI                      : num [1:599] 59 443 56 342 492 235 421 476 183 259 ...
##  $ PM2.5                    : num [1:599] 227.4 219.7 94.8 121 184.3 ...
##  $ PM10                     : num [1:599] 160 280 262 213 165 ...
##  $ NO2                      : num [1:599] 44.8 78.2 70 71.8 16.2 ...
##  $ CO                       : num [1:599] 5.986 5.062 6.589 4.443 0.554 ...
##  $ SO2                      : num [1:599] 37.13 27.14 34.63 32.3 6.72 ...
##  $ O3                       : num [1:599] 58.1 40.8 42.7 97 22.8 ...
##  $ Temperature (°C)         : num [1:599] 11.9 16.7 30.4 32.7 12.5 ...
##  $ Humidity (%)             : num [1:599] 64.4 23.1 63.6 69.7 93.2 ...
##  $ Wind Speed (km/h)        : num [1:599] 12.9 19.4 13.4 19.2 10.4 ...
##  $ Rainfall (mm)            : num [1:599] 14.5 34.8 194.4 233.9 91.8 ...
##  $ Pressure (hPa)           : num [1:599] 976 1022 985 950 1021 ...
##  $ Vehicle Count            : num [1:599] 198941 35588 430785 209382 257650 ...
##  $ Industrial Activity Index: num [1:599] 4.86 5.26 3.8 4.43 9.35 ...
##  $ Health Impact Score      : num [1:599] 10 10 10 10 10 10 10 10 10 10 ...
##  $ ...17                    : logi [1:599] NA NA NA NA NA NA ...
##  $ age                      : num [1:599] 41 51 39 56 26 43 73 71 52 29 ...
##  $ age_group                : chr [1:599] "40-49" "50-59" "30-39" "50-59" ...
##  $ gender                   : chr [1:599] "Male" "Female" "Female" "Male" ...
##  $ disease_name             : chr [1:599] "Swine Flu (H1N1)" "Swine Flu (H1N1)" "COPD" "Stroke" ...
##  $ disease_category         : chr [1:599] "Respiratory" "Respiratory" "Respiratory" "Non-Communicable" ...
##  $ severity                 : chr [1:599] "Critical" "Severe" "Severe" "Severe" ...
##  $ smoking_status           : chr [1:599] "Former" "Current" "Never" "Current" ...
##  $ blood_group              : chr [1:599] "O+" "B+" "A+" "A+" ...
summary(db)
##      City                AQI            PM2.5             PM10       
##  Length:599         Min.   : 52.0   Min.   : 10.18   Min.   : 20.26  
##  Class :character   1st Qu.:152.0   1st Qu.: 65.52   1st Qu.: 92.62  
##  Mode  :character   Median :272.0   Median :127.43   Median :172.38  
##                     Mean   :272.2   Mean   :128.11   Mean   :164.57  
##                     3rd Qu.:379.0   3rd Qu.:187.42   3rd Qu.:230.89  
##                     Max.   :498.0   Max.   :249.64   Max.   :299.94  
##       NO2               CO              SO2               O3        
##  Min.   : 5.038   Min.   :0.1017   Min.   : 2.047   Min.   : 5.147  
##  1st Qu.:23.431   1st Qu.:2.8473   1st Qu.:12.968   1st Qu.:29.615  
##  Median :42.904   Median :5.1307   Median :24.500   Median :52.690  
##  Mean   :42.872   Mean   :5.1133   Mean   :24.919   Mean   :52.656  
##  3rd Qu.:61.002   3rd Qu.:7.3941   3rd Qu.:36.622   3rd Qu.:75.876  
##  Max.   :79.931   Max.   :9.9600   Max.   :49.801   Max.   :99.487  
##  Temperature (°C)  Humidity (%)   Wind Speed (km/h) Rainfall (mm)     
##  Min.   :10.05    Min.   :20.03   Min.   : 0.5178   Min.   :  0.7514  
##  1st Qu.:18.37    1st Qu.:44.45   1st Qu.: 5.1055   1st Qu.: 75.1218  
##  Median :27.27    Median :64.44   Median : 9.9258   Median :152.1604  
##  Mean   :27.32    Mean   :62.36   Mean   :10.0604   Mean   :152.1370  
##  3rd Qu.:36.34    3rd Qu.:81.04   3rd Qu.:14.7584   3rd Qu.:229.8298  
##  Max.   :44.98    Max.   :99.93   Max.   :19.9243   Max.   :299.0146  
##  Pressure (hPa)   Vehicle Count    Industrial Activity Index
##  Min.   : 950.2   Min.   :  1653   Min.   :0.0774           
##  1st Qu.: 975.9   1st Qu.:111060   1st Qu.:2.0299           
##  Median :1000.4   Median :245636   Median :4.7810           
##  Mean   :1001.1   Mean   :249754   Mean   :4.8443           
##  3rd Qu.:1027.4   3rd Qu.:380836   3rd Qu.:7.3941           
##  Max.   :1049.8   Max.   :499353   Max.   :9.9806           
##  Health Impact Score  ...17              age         age_group        
##  Min.   :10          Mode:logical   Min.   : 0.00   Length:599        
##  1st Qu.:10          NA's:599       1st Qu.:40.00   Class :character  
##  Median :10                         Median :53.00   Mode  :character  
##  Mean   :10                         Mean   :52.93                     
##  3rd Qu.:10                         3rd Qu.:68.00                     
##  Max.   :10                         Max.   :95.00                     
##     gender          disease_name       disease_category     severity        
##  Length:599         Length:599         Length:599         Length:599        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  smoking_status     blood_group       
##  Length:599         Length:599        
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
colnames(db)
##  [1] "City"                      "AQI"                      
##  [3] "PM2.5"                     "PM10"                     
##  [5] "NO2"                       "CO"                       
##  [7] "SO2"                       "O3"                       
##  [9] "Temperature (°C)"          "Humidity (%)"             
## [11] "Wind Speed (km/h)"         "Rainfall (mm)"            
## [13] "Pressure (hPa)"            "Vehicle Count"            
## [15] "Industrial Activity Index" "Health Impact Score"      
## [17] "...17"                     "age"                      
## [19] "age_group"                 "gender"                   
## [21] "disease_name"              "disease_category"         
## [23] "severity"                  "smoking_status"           
## [25] "blood_group"
#Data preprocessing i.e. removing empty columns and NA values
db_clean <- db %>%
  select(!contains("Unnamed")) %>%
  rename(
    temp_c = `Temperature (°C)`,
    humidity_pct = `Humidity (%)`,
    wind_speed_kmh = `Wind Speed (km/h)`,
    rainfall_mm = `Rainfall (mm)`,
    pressure_hpa = `Pressure (hPa)`
  ) %>%
  rename_with(tolower)

db_clean
#Removing missing values
db_clean <- db_clean %>%
  drop_na(city, aqi, pm2.5, age, disease_name)

#Converting Categorical Data to Factors

db_final <- db_clean %>%
  mutate(
    gender = as.factor(gender),
    disease_category = as.factor(disease_category),
    severity = factor(severity, levels = c("Mild", "Moderate", "Severe", "Critical"), ordered = TRUE),
    smoking_status = as.factor(smoking_status)
  )

db_final <-db_clean %>%
  select(where(~!all(is.na(.))))

summary(db_final)
##      city                aqi            pm2.5             pm10       
##  Length:599         Min.   : 52.0   Min.   : 10.18   Min.   : 20.26  
##  Class :character   1st Qu.:152.0   1st Qu.: 65.52   1st Qu.: 92.62  
##  Mode  :character   Median :272.0   Median :127.43   Median :172.38  
##                     Mean   :272.2   Mean   :128.11   Mean   :164.57  
##                     3rd Qu.:379.0   3rd Qu.:187.42   3rd Qu.:230.89  
##                     Max.   :498.0   Max.   :249.64   Max.   :299.94  
##       no2               co              so2               o3        
##  Min.   : 5.038   Min.   :0.1017   Min.   : 2.047   Min.   : 5.147  
##  1st Qu.:23.431   1st Qu.:2.8473   1st Qu.:12.968   1st Qu.:29.615  
##  Median :42.904   Median :5.1307   Median :24.500   Median :52.690  
##  Mean   :42.872   Mean   :5.1133   Mean   :24.919   Mean   :52.656  
##  3rd Qu.:61.002   3rd Qu.:7.3941   3rd Qu.:36.622   3rd Qu.:75.876  
##  Max.   :79.931   Max.   :9.9600   Max.   :49.801   Max.   :99.487  
##      temp_c       humidity_pct   wind_speed_kmh     rainfall_mm      
##  Min.   :10.05   Min.   :20.03   Min.   : 0.5178   Min.   :  0.7514  
##  1st Qu.:18.37   1st Qu.:44.45   1st Qu.: 5.1055   1st Qu.: 75.1218  
##  Median :27.27   Median :64.44   Median : 9.9258   Median :152.1604  
##  Mean   :27.32   Mean   :62.36   Mean   :10.0604   Mean   :152.1370  
##  3rd Qu.:36.34   3rd Qu.:81.04   3rd Qu.:14.7584   3rd Qu.:229.8298  
##  Max.   :44.98   Max.   :99.93   Max.   :19.9243   Max.   :299.0146  
##   pressure_hpa    vehicle count    industrial activity index
##  Min.   : 950.2   Min.   :  1653   Min.   :0.0774           
##  1st Qu.: 975.9   1st Qu.:111060   1st Qu.:2.0299           
##  Median :1000.4   Median :245636   Median :4.7810           
##  Mean   :1001.1   Mean   :249754   Mean   :4.8443           
##  3rd Qu.:1027.4   3rd Qu.:380836   3rd Qu.:7.3941           
##  Max.   :1049.8   Max.   :499353   Max.   :9.9806           
##  health impact score      age         age_group            gender         
##  Min.   :10          Min.   : 0.00   Length:599         Length:599        
##  1st Qu.:10          1st Qu.:40.00   Class :character   Class :character  
##  Median :10          Median :53.00   Mode  :character   Mode  :character  
##  Mean   :10          Mean   :52.93                                        
##  3rd Qu.:10          3rd Qu.:68.00                                        
##  Max.   :10          Max.   :95.00                                        
##  disease_name       disease_category     severity         smoking_status    
##  Length:599         Length:599         Length:599         Length:599        
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  blood_group       
##  Length:599        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
names(db_final)
##  [1] "city"                      "aqi"                      
##  [3] "pm2.5"                     "pm10"                     
##  [5] "no2"                       "co"                       
##  [7] "so2"                       "o3"                       
##  [9] "temp_c"                    "humidity_pct"             
## [11] "wind_speed_kmh"            "rainfall_mm"              
## [13] "pressure_hpa"              "vehicle count"            
## [15] "industrial activity index" "health impact score"      
## [17] "age"                       "age_group"                
## [19] "gender"                    "disease_name"             
## [21] "disease_category"          "severity"                 
## [23] "smoking_status"            "blood_group"
#1 Average AQI across all cities
mean(db_final$aqi, na.rm = TRUE)
## [1] 272.2003
#2 Distribution of Temp and Humidity
ggplot(db_final) + geom_density(aes(x = temp_c), fill="red", alpha=0.3) + labs(title="Temp Distribution")

ggplot(db_final) + geom_density(aes(x = humidity_pct), fill="blue", alpha=0.3) + labs(title="Humidity Distribution")

#3 Cities with "Good" AQI (< 50)
db_final %>% filter(aqi < 50) %>% select(city) %>% unique()
#4 AQI Skewness (Histogram)
ggplot(db_final, aes(x = aqi)) + geom_histogram(bins = 30, fill = "steelblue", color = "white") + labs(title="AQI Histogram")

#5 City with highest Ozone (O3)
db_final %>% slice_max(o3, n = 1) %>% select(city, o3)
#6 Most common disease
db_final %>% count(disease_name, sort = TRUE) %>% head(1)
#7 Gender distribution
db_final %>% count(gender)
#8 Most frequent age group
db_final %>% count(age_group, sort = TRUE) %>% head(1)
#9 PM10 vs AQI Variation by City
ggplot(db_final, aes(x = pm10, y = aqi, color = city)) + geom_point() + facet_wrap(~city)

#10 Former Smokers: Disease Category distribution
db_final %>% filter(smoking_status == "Former") %>% count(disease_category)
#11 AQI vs Disease Severity (Boxplot)
ggplot(db_final, aes(x = severity, y = aqi, fill = severity)) + geom_boxplot() + theme_minimal()

#12 PM2.5 vs Health Impact Score (Scatter)
ggplot(db_final, aes(x = pm2.5, y = `health impact score`)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#13 Which city shows the highest "Vehicle Count" but relatively lower NO2?
b <- db_final %>%
  group_by(city) %>%
  summarise(
    avg_vehicles = mean(`vehicle count`, na.rm = TRUE),
    avg_no2 = mean(no2, na.rm = TRUE)
  ) %>%
  mutate(efficiency_ratio = avg_vehicles / avg_no2) %>%
  arrange(desc(efficiency_ratio))
head(b, 1)
ggplot(b, aes(x = avg_vehicles, y = avg_no2, label = city)) +
  geom_point(color = "darkgreen", size = 3)+
  labs(title = "Vehicle Count vs NO2 Levels by City",
       x = "Average Vehicle Count",
       y = "Average NO2 Level") +
  theme_minimal()

#14 Top 5 Cities: Worst Air & Highest Severity
db_final %>%
    group_by(city) %>%
    summarise(
      avg_aqi = mean(aqi, na.rm = TRUE),
      avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)
    ) %>%
    arrange(desc(avg_aqi))
## Warning: There were 24 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)`.
## ℹ In group 1: `city = "Agra"`.
## Caused by warning in `mean()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 23 remaining warnings.
#15 Pie Chart of Health Severity
db_final %>% count(severity) %>% 
  ggplot(aes(x = "", y = n, fill = severity)) + 
  geom_bar(stat = "identity") + coord_polar("y") + theme_void()

#16 Summary Table by City
summary_table <- db_final %>% 
  group_by(city) %>% 
  summarise(mean_aqi = mean(aqi), max_severity = max(severity), total_patients = n())
print(summary_table)
## # A tibble: 24 × 4
##    city      mean_aqi max_severity total_patients
##    <chr>        <dbl> <chr>                 <int>
##  1 Agra          250. Severe                   26
##  2 Ahmedabad     304. Severe                   24
##  3 Bangalore     270. Severe                   23
##  4 Bhopal        252. Severe                   26
##  5 Chennai       236. Severe                   30
##  6 Delhi         247. Severe                   22
##  7 Hyderabad     282. Severe                   23
##  8 Indore        298. Severe                   15
##  9 Jaipur        297. Severe                   27
## 10 Kolkata       258. Severe                   26
## # ℹ 14 more rows
#17 Young patients (<30) in high AQI showing "Severe"
db_final %>% filter(age < 30, aqi > 300, severity == "Severe")
#18 City AQI Ranking vs. 300 Threshold
db_final %>% group_by(city) %>% summarise(avg_aqi = mean(aqi)) %>% 
  ggplot(aes(x = reorder(city, avg_aqi), y = avg_aqi)) + 
  geom_col(fill="blue") + geom_hline(yintercept = 300, linetype="dashed") + coord_flip()

#19 Vehicle Density vs NO2 Correlation
ggplot(db_final, aes(x = `vehicle count`, y = no2)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#20 Relative Frequency of Diseases
ggplot(db_final, aes(x = fct_infreq(disease_name))) + geom_bar(fill="orange") + coord_flip()

#21 Severity Distribution across Genders
ggplot(db_final, aes(x = severity, fill = gender)) + geom_bar(position = "dodge") + theme_bw()

View(db_final)