library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyselect)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.2 ✔ tidyr 1.3.2
## ✔ readr 2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(readxl)
db<- read_excel("C:\\Users\\VICTUS\\Desktop\\airdataset.xlsx")
## New names:
## • `` -> `...17`
head(db)
## # A tibble: 6 × 25
## City AQI PM2.5 PM10 NO2 CO SO2 O3 `Temperature (°C)`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Rajkot 59 227. 160. 44.8 5.99 37.1 58.1 11.9
## 2 Bangalore 443 220. 280. 78.2 5.06 27.1 40.8 16.7
## 3 Bhopal 56 94.8 262. 70.0 6.59 34.6 42.7 30.4
## 4 Srinagar 342 121. 213. 71.8 4.44 32.3 97.0 32.7
## 5 Hyderabad 492 184. 165. 16.2 0.554 6.72 22.8 12.5
## 6 Bangalore 235 73.9 116. 68.1 7.37 36.7 44.8 41.2
## # ℹ 16 more variables: `Humidity (%)` <dbl>, `Wind Speed (km/h)` <dbl>,
## # `Rainfall (mm)` <dbl>, `Pressure (hPa)` <dbl>, `Vehicle Count` <dbl>,
## # `Industrial Activity Index` <dbl>, `Health Impact Score` <dbl>,
## # ...17 <lgl>, age <dbl>, age_group <chr>, gender <chr>, disease_name <chr>,
## # disease_category <chr>, severity <chr>, smoking_status <chr>,
## # blood_group <chr>
str(db)
## tibble [1,194 × 25] (S3: tbl_df/tbl/data.frame)
## $ City : chr [1:1194] "Rajkot" "Bangalore" "Bhopal" "Srinagar" ...
## $ AQI : num [1:1194] 59 443 56 342 492 235 421 476 183 259 ...
## $ PM2.5 : num [1:1194] 227.4 219.7 94.8 121 184.3 ...
## $ PM10 : num [1:1194] 160 280 262 213 165 ...
## $ NO2 : num [1:1194] 44.8 78.2 70 71.8 16.2 ...
## $ CO : num [1:1194] 5.986 5.062 6.589 4.443 0.554 ...
## $ SO2 : num [1:1194] 37.13 27.14 34.63 32.3 6.72 ...
## $ O3 : num [1:1194] 58.1 40.8 42.7 97 22.8 ...
## $ Temperature (°C) : num [1:1194] 11.9 16.7 30.4 32.7 12.5 ...
## $ Humidity (%) : num [1:1194] 64.4 23.1 63.6 69.7 93.2 ...
## $ Wind Speed (km/h) : num [1:1194] 12.9 19.4 13.4 19.2 10.4 ...
## $ Rainfall (mm) : num [1:1194] 14.5 34.8 194.4 233.9 91.8 ...
## $ Pressure (hPa) : num [1:1194] 976 1022 985 950 1021 ...
## $ Vehicle Count : num [1:1194] 198941 35588 430785 209382 257650 ...
## $ Industrial Activity Index: num [1:1194] 4.86 5.26 3.8 4.43 9.35 ...
## $ Health Impact Score : num [1:1194] 10 10 10 10 10 10 10 10 10 10 ...
## $ ...17 : logi [1:1194] NA NA NA NA NA NA ...
## $ age : num [1:1194] 41 51 39 56 26 43 73 71 52 29 ...
## $ age_group : chr [1:1194] "40-49" "50-59" "30-39" "50-59" ...
## $ gender : chr [1:1194] "Male" "Female" "Female" "Male" ...
## $ disease_name : chr [1:1194] "Swine Flu (H1N1)" "Swine Flu (H1N1)" "COPD" "Stroke" ...
## $ disease_category : chr [1:1194] "Respiratory" "Respiratory" "Respiratory" "Non-Communicable" ...
## $ severity : chr [1:1194] "Critical" "Severe" "Severe" "Severe" ...
## $ smoking_status : chr [1:1194] "Former" "Current" "Never" "Current" ...
## $ blood_group : chr [1:1194] "O+" "B+" "A+" "A+" ...
summary(db)
## City AQI PM2.5 PM10
## Length:1194 Min. : 50.0 Min. : 10.18 Min. : 20.10
## Class :character 1st Qu.:151.0 1st Qu.: 67.84 1st Qu.: 94.71
## Mode :character Median :276.0 Median :125.80 Median :165.70
## Mean :271.4 Mean :128.06 Mean :163.14
## 3rd Qu.:377.8 3rd Qu.:187.68 3rd Qu.:230.02
## Max. :499.0 Max. :249.99 Max. :299.94
## NO2 CO SO2 O3
## Min. : 5.038 Min. : 0.1017 Min. : 2.037 Min. : 5.147
## 1st Qu.:22.251 1st Qu.: 2.8110 1st Qu.:12.983 1st Qu.:29.983
## Median :43.100 Median : 5.1492 Median :25.466 Median :53.548
## Mean :42.696 Mean : 5.1015 Mean :25.555 Mean :53.387
## 3rd Qu.:62.056 3rd Qu.: 7.3267 3rd Qu.:37.304 3rd Qu.:77.049
## Max. :79.959 Max. : 9.9999 Max. :49.973 Max. :99.978
## Temperature (°C) Humidity (%) Wind Speed (km/h) Rainfall (mm)
## Min. :10.05 Min. :20.03 Min. : 0.5004 Min. : 0.5863
## 1st Qu.:17.88 1st Qu.:43.64 1st Qu.: 5.2455 1st Qu.: 75.5070
## Median :27.34 Median :63.21 Median :10.1733 Median :150.6521
## Mean :27.19 Mean :61.65 Mean :10.1204 Mean :151.3860
## 3rd Qu.:36.30 3rd Qu.:81.05 3rd Qu.:14.8117 3rd Qu.:227.7400
## Max. :44.99 Max. :99.93 Max. :19.9749 Max. :299.0146
## Pressure (hPa) Vehicle Count Industrial Activity Index
## Min. : 950.1 Min. : 1653 Min. :0.0000586
## 1st Qu.: 975.8 1st Qu.:116002 1st Qu.:2.1675619
## Median :1001.1 Median :247869 Median :4.7733041
## Mean :1000.7 Mean :248505 Mean :4.8326865
## 3rd Qu.:1026.4 3rd Qu.:377682 3rd Qu.:7.3468181
## Max. :1049.8 Max. :499353 Max. :9.9805916
## Health Impact Score ...17 age age_group
## Min. :10 Mode:logical Min. : 0.00 Length:1194
## 1st Qu.:10 NA's:1194 1st Qu.:40.00 Class :character
## Median :10 Median :54.00 Mode :character
## Mean :10 Mean :53.19
## 3rd Qu.:10 3rd Qu.:68.00
## Max. :10 Max. :95.00
## gender disease_name disease_category severity
## Length:1194 Length:1194 Length:1194 Length:1194
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## smoking_status blood_group
## Length:1194 Length:1194
## Class :character Class :character
## Mode :character Mode :character
##
##
##
colnames(db)
## [1] "City" "AQI"
## [3] "PM2.5" "PM10"
## [5] "NO2" "CO"
## [7] "SO2" "O3"
## [9] "Temperature (°C)" "Humidity (%)"
## [11] "Wind Speed (km/h)" "Rainfall (mm)"
## [13] "Pressure (hPa)" "Vehicle Count"
## [15] "Industrial Activity Index" "Health Impact Score"
## [17] "...17" "age"
## [19] "age_group" "gender"
## [21] "disease_name" "disease_category"
## [23] "severity" "smoking_status"
## [25] "blood_group"
db_clean <- db %>%
select(!contains("Unnamed")) %>%
rename(
temp_c = `Temperature (°C)`,
humidity_pct = `Humidity (%)`,
wind_speed_kmh = `Wind Speed (km/h)`,
rainfall_mm = `Rainfall (mm)`,
pressure_hpa = `Pressure (hPa)`
) %>%
rename_with(tolower)
db_clean
## # A tibble: 1,194 × 25
## city aqi pm2.5 pm10 no2 co so2 o3 temp_c humidity_pct
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Rajkot 59 227. 160. 44.8 5.99 37.1 58.1 11.9 64.4
## 2 Bangalore 443 220. 280. 78.2 5.06 27.1 40.8 16.7 23.1
## 3 Bhopal 56 94.8 262. 70.0 6.59 34.6 42.7 30.4 63.6
## 4 Srinagar 342 121. 213. 71.8 4.44 32.3 97.0 32.7 69.7
## 5 Hyderabad 492 184. 165. 16.2 0.554 6.72 22.8 12.5 93.2
## 6 Bangalore 235 73.9 116. 68.1 7.37 36.7 44.8 41.2 65.3
## 7 Jaipur 421 146. 180. 26.2 8.98 4.55 72.1 14.9 74.7
## 8 Nashik 476 126. 43.9 61.0 9.91 9.94 8.63 29.0 74.9
## 9 Chennai 183 220. 145. 55.0 4.57 13.3 54.4 14.6 22.3
## 10 Srinagar 259 24.8 150. 41.9 5.27 39.2 91.4 15.2 28.1
## # ℹ 1,184 more rows
## # ℹ 15 more variables: wind_speed_kmh <dbl>, rainfall_mm <dbl>,
## # pressure_hpa <dbl>, `vehicle count` <dbl>,
## # `industrial activity index` <dbl>, `health impact score` <dbl>,
## # ...17 <lgl>, age <dbl>, age_group <chr>, gender <chr>, disease_name <chr>,
## # disease_category <chr>, severity <chr>, smoking_status <chr>,
## # blood_group <chr>
#Removing missing values
db_clean <- db_clean %>%
drop_na(city, aqi, pm2.5, age, disease_name)
#Converting Categorical Data to Factors
db_final <- db_clean %>%
mutate(
gender = as.factor(gender),
disease_category = as.factor(disease_category),
severity = factor(severity, levels = c("Mild", "Moderate", "Severe", "Critical"), ordered = TRUE),
smoking_status = as.factor(smoking_status)
)
db_final <-db_clean %>%
select(where(~!all(is.na(.))))
summary(db_final)
## city aqi pm2.5 pm10
## Length:1194 Min. : 50.0 Min. : 10.18 Min. : 20.10
## Class :character 1st Qu.:151.0 1st Qu.: 67.84 1st Qu.: 94.71
## Mode :character Median :276.0 Median :125.80 Median :165.70
## Mean :271.4 Mean :128.06 Mean :163.14
## 3rd Qu.:377.8 3rd Qu.:187.68 3rd Qu.:230.02
## Max. :499.0 Max. :249.99 Max. :299.94
## no2 co so2 o3
## Min. : 5.038 Min. : 0.1017 Min. : 2.037 Min. : 5.147
## 1st Qu.:22.251 1st Qu.: 2.8110 1st Qu.:12.983 1st Qu.:29.983
## Median :43.100 Median : 5.1492 Median :25.466 Median :53.548
## Mean :42.696 Mean : 5.1015 Mean :25.555 Mean :53.387
## 3rd Qu.:62.056 3rd Qu.: 7.3267 3rd Qu.:37.304 3rd Qu.:77.049
## Max. :79.959 Max. : 9.9999 Max. :49.973 Max. :99.978
## temp_c humidity_pct wind_speed_kmh rainfall_mm
## Min. :10.05 Min. :20.03 Min. : 0.5004 Min. : 0.5863
## 1st Qu.:17.88 1st Qu.:43.64 1st Qu.: 5.2455 1st Qu.: 75.5070
## Median :27.34 Median :63.21 Median :10.1733 Median :150.6521
## Mean :27.19 Mean :61.65 Mean :10.1204 Mean :151.3860
## 3rd Qu.:36.30 3rd Qu.:81.05 3rd Qu.:14.8117 3rd Qu.:227.7400
## Max. :44.99 Max. :99.93 Max. :19.9749 Max. :299.0146
## pressure_hpa vehicle count industrial activity index
## Min. : 950.1 Min. : 1653 Min. :0.0000586
## 1st Qu.: 975.8 1st Qu.:116002 1st Qu.:2.1675619
## Median :1001.1 Median :247869 Median :4.7733041
## Mean :1000.7 Mean :248505 Mean :4.8326865
## 3rd Qu.:1026.4 3rd Qu.:377682 3rd Qu.:7.3468181
## Max. :1049.8 Max. :499353 Max. :9.9805916
## health impact score age age_group gender
## Min. :10 Min. : 0.00 Length:1194 Length:1194
## 1st Qu.:10 1st Qu.:40.00 Class :character Class :character
## Median :10 Median :54.00 Mode :character Mode :character
## Mean :10 Mean :53.19
## 3rd Qu.:10 3rd Qu.:68.00
## Max. :10 Max. :95.00
## disease_name disease_category severity smoking_status
## Length:1194 Length:1194 Length:1194 Length:1194
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## blood_group
## Length:1194
## Class :character
## Mode :character
##
##
##
names(db_final)
## [1] "city" "aqi"
## [3] "pm2.5" "pm10"
## [5] "no2" "co"
## [7] "so2" "o3"
## [9] "temp_c" "humidity_pct"
## [11] "wind_speed_kmh" "rainfall_mm"
## [13] "pressure_hpa" "vehicle count"
## [15] "industrial activity index" "health impact score"
## [17] "age" "age_group"
## [19] "gender" "disease_name"
## [21] "disease_category" "severity"
## [23] "smoking_status" "blood_group"
#1 Average AQI across all cities
mean(db_final$aqi, na.rm = TRUE)
## [1] 271.4456
#2 Distribution of Temp and Humidity
ggplot(db_final) + geom_density(aes(x = temp_c), fill="red", alpha=0.3) + labs(title="Temp Distribution")

ggplot(db_final) + geom_density(aes(x = humidity_pct), fill="blue", alpha=0.3) + labs(title="Humidity Distribution")

#3 Cities with "Good" AQI (< 50)
db_final %>% filter(aqi < 50) %>% select(city) %>% unique()
## # A tibble: 0 × 1
## # ℹ 1 variable: city <chr>
#4 AQI Skewness (Histogram)
ggplot(db_final, aes(x = aqi)) + geom_histogram(bins = 30, fill = "steelblue", color = "white") + labs(title="AQI Histogram")

#5 City with highest Ozone (O3)
db_final %>% slice_max(o3, n = 1) %>% select(city, o3)
## # A tibble: 1 × 2
## city o3
## <chr> <dbl>
## 1 Thane 100.0
#6 Most common disease
db_final %>% count(disease_name, sort = TRUE) %>% head(1)
## # A tibble: 1 × 2
## disease_name n
## <chr> <int>
## 1 Lung Cancer 274
#7 Gender distribution
db_final %>% count(gender)
## # A tibble: 3 × 2
## gender n
## <chr> <int>
## 1 Female 573
## 2 Male 605
## 3 Other 16
#8 Most frequent age group
db_final %>% count(age_group, sort = TRUE) %>% head(1)
## # A tibble: 1 × 2
## age_group n
## <chr> <int>
## 1 50-59 228
#9 PM10 vs AQI Variation by City
ggplot(db_final, aes(x = pm10, y = aqi, color = city)) + geom_point() + facet_wrap(~city)

#10 Former Smokers: Disease Category distribution
db_final %>% filter(smoking_status == "Former") %>% count(disease_category)
## # A tibble: 2 × 2
## disease_category n
## <chr> <int>
## 1 Non-Communicable 113
## 2 Respiratory 123
#11 AQI vs Disease Severity (Boxplot)
ggplot(db_final, aes(x = severity, y = aqi, fill = severity)) + geom_boxplot() + theme_minimal()

#12 PM2.5 vs Health Impact Score (Scatter)
ggplot(db_final, aes(x = pm2.5, y = `health impact score`)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#13 Which city shows the highest "Vehicle Count" but relatively lower NO2?
b <- db_final %>%
group_by(city) %>%
summarise(
avg_vehicles = mean(`vehicle count`, na.rm = TRUE),
avg_no2 = mean(no2, na.rm = TRUE)
) %>%
mutate(efficiency_ratio = avg_vehicles / avg_no2) %>%
arrange(desc(efficiency_ratio))
head(b, 1)
## # A tibble: 1 × 4
## city avg_vehicles avg_no2 efficiency_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 Varanasi 264021. 36.8 7167.
ggplot(b, aes(x = avg_vehicles, y = avg_no2, label = city)) +
geom_point(color = "darkgreen", size = 3)+
labs(title = "Vehicle Count vs NO2 Levels by City",
x = "Average Vehicle Count",
y = "Average NO2 Level") +
theme_minimal()

#14 Top 5 Cities: Worst Air & Highest Severity
db_final %>%
group_by(city) %>%
summarise(
avg_aqi = mean(aqi, na.rm = TRUE),
avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)
) %>%
arrange(desc(avg_aqi))
## Warning: There were 24 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)`.
## ℹ In group 1: `city = "Agra"`.
## Caused by warning in `mean()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 23 remaining warnings.
## # A tibble: 24 × 3
## city avg_aqi avg_severity_score
## <chr> <dbl> <dbl>
## 1 Ahmedabad 317. NaN
## 2 Nagpur 296. NaN
## 3 Jaipur 293. NaN
## 4 Indore 286. NaN
## 5 Thane 283. NaN
## 6 Patna 280. NaN
## 7 Mumbai 277. NaN
## 8 Hyderabad 274. NaN
## 9 Surat 274. NaN
## 10 Nashik 271. NaN
## # ℹ 14 more rows
#15 Pie Chart of Health Severity
db_final %>% count(severity) %>%
ggplot(aes(x = "", y = n, fill = severity)) +
geom_bar(stat = "identity") + coord_polar("y") + theme_void()

#16 Summary Table by City
summary_table <- db_final %>%
group_by(city) %>%
summarise(mean_aqi = mean(aqi), max_severity = max(severity), total_patients = n())
print(summary_table)
## # A tibble: 24 × 4
## city mean_aqi max_severity total_patients
## <chr> <dbl> <chr> <int>
## 1 Agra 263. Severe 54
## 2 Ahmedabad 317. Severe 48
## 3 Bangalore 264. Severe 52
## 4 Bhopal 255. Severe 53
## 5 Chennai 267. Severe 46
## 6 Delhi 233. Severe 39
## 7 Hyderabad 274. Severe 51
## 8 Indore 286. Severe 40
## 9 Jaipur 293. Severe 42
## 10 Kolkata 265. Severe 55
## # ℹ 14 more rows
#17 Young patients (<30) in high AQI showing "Severe"
db_final %>% filter(age < 30, aqi > 300, severity == "Severe")
## # A tibble: 12 × 24
## city aqi pm2.5 pm10 no2 co so2 o3 temp_c humidity_pct
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Pune 359 90.0 145. 29.6 6.78 12.7 75.2 21.0 49.7
## 2 Nagpur 375 12.5 114. 40.2 2.60 34.8 44.7 22.0 49.6
## 3 Thane 417 71.4 109. 39.1 0.966 11.1 93.3 31.9 61.3
## 4 Thane 430 191. 39.1 51.4 3.19 46.6 16.9 28.5 66.1
## 5 Srinagar 338 193. 44.6 79.7 7.19 12.8 97.5 34.0 63.3
## 6 Patna 454 31.0 167. 62.1 5.88 45.5 27.8 10.6 43.7
## 7 Rajkot 433 196. 63.9 67.6 0.346 22.4 45.1 10.4 28.1
## 8 Vadodara 359 44.4 43.4 65.3 6.23 23.6 37.9 20.5 87.0
## 9 Bangalore 316 60.0 135. 71.4 2.03 4.29 98.8 31.6 35.6
## 10 Delhi 333 219. 106. 8.52 7.56 35.9 52.3 13.6 95.1
## 11 Nagpur 390 44.5 166. 78.9 8.84 35.7 36.5 38.7 46.7
## 12 Ahmedabad 458 162. 267. 7.84 5.19 30.5 91.8 26.3 93.7
## # ℹ 14 more variables: wind_speed_kmh <dbl>, rainfall_mm <dbl>,
## # pressure_hpa <dbl>, `vehicle count` <dbl>,
## # `industrial activity index` <dbl>, `health impact score` <dbl>, age <dbl>,
## # age_group <chr>, gender <chr>, disease_name <chr>, disease_category <chr>,
## # severity <chr>, smoking_status <chr>, blood_group <chr>
#18 City AQI Ranking vs. 300 Threshold
db_final %>% group_by(city) %>% summarise(avg_aqi = mean(aqi)) %>%
ggplot(aes(x = reorder(city, avg_aqi), y = avg_aqi)) +
geom_col(fill="blue") + geom_hline(yintercept = 300, linetype="dashed") + coord_flip()

#19 Vehicle Density vs NO2 Correlation
ggplot(db_final, aes(x = `vehicle count`, y = no2)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#20 Relative Frequency of Diseases
ggplot(db_final, aes(x = fct_infreq(disease_name))) + geom_bar(fill="orange") + coord_flip()

#21 Severity Distribution across Genders
ggplot(db_final, aes(x = severity, fill = gender)) + geom_bar(position = "dodge") + theme_bw()
