library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyselect)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.1 ✔ stringr 1.6.0
## ✔ lubridate 1.9.5 ✔ tibble 3.3.1
## ✔ purrr 1.2.2 ✔ tidyr 1.3.2
## ✔ readr 2.2.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidyr)
library(readxl)
db<- read_excel("C:\\Users\\riya\\Downloads\\Book 1.xlsx")
## New names:
## • `` -> `...17`
db
head(db)
str(db)
## tibble [599 × 25] (S3: tbl_df/tbl/data.frame)
## $ City : chr [1:599] "Rajkot" "Bangalore" "Bhopal" "Srinagar" ...
## $ AQI : num [1:599] 59 443 56 342 492 235 421 476 183 259 ...
## $ PM2.5 : num [1:599] 227.4 219.7 94.8 121 184.3 ...
## $ PM10 : num [1:599] 160 280 262 213 165 ...
## $ NO2 : num [1:599] 44.8 78.2 70 71.8 16.2 ...
## $ CO : num [1:599] 5.986 5.062 6.589 4.443 0.554 ...
## $ SO2 : num [1:599] 37.13 27.14 34.63 32.3 6.72 ...
## $ O3 : num [1:599] 58.1 40.8 42.7 97 22.8 ...
## $ Temperature (°C) : num [1:599] 11.9 16.7 30.4 32.7 12.5 ...
## $ Humidity (%) : num [1:599] 64.4 23.1 63.6 69.7 93.2 ...
## $ Wind Speed (km/h) : num [1:599] 12.9 19.4 13.4 19.2 10.4 ...
## $ Rainfall (mm) : num [1:599] 14.5 34.8 194.4 233.9 91.8 ...
## $ Pressure (hPa) : num [1:599] 976 1022 985 950 1021 ...
## $ Vehicle Count : num [1:599] 198941 35588 430785 209382 257650 ...
## $ Industrial Activity Index: num [1:599] 4.86 5.26 3.8 4.43 9.35 ...
## $ Health Impact Score : num [1:599] 10 10 10 10 10 10 10 10 10 10 ...
## $ ...17 : logi [1:599] NA NA NA NA NA NA ...
## $ age : num [1:599] 41 51 39 56 26 43 73 71 52 29 ...
## $ age_group : chr [1:599] "40-49" "50-59" "30-39" "50-59" ...
## $ gender : chr [1:599] "Male" "Female" "Female" "Male" ...
## $ disease_name : chr [1:599] "Swine Flu (H1N1)" "Swine Flu (H1N1)" "COPD" "Stroke" ...
## $ disease_category : chr [1:599] "Respiratory" "Respiratory" "Respiratory" "Non-Communicable" ...
## $ severity : chr [1:599] "Critical" "Severe" "Severe" "Severe" ...
## $ smoking_status : chr [1:599] "Former" "Current" "Never" "Current" ...
## $ blood_group : chr [1:599] "O+" "B+" "A+" "A+" ...
summary(db)
## City AQI PM2.5 PM10
## Length:599 Min. : 52.0 Min. : 10.18 Min. : 20.26
## Class :character 1st Qu.:152.0 1st Qu.: 65.52 1st Qu.: 92.62
## Mode :character Median :272.0 Median :127.43 Median :172.38
## Mean :272.2 Mean :128.11 Mean :164.57
## 3rd Qu.:379.0 3rd Qu.:187.42 3rd Qu.:230.89
## Max. :498.0 Max. :249.64 Max. :299.94
## NO2 CO SO2 O3
## Min. : 5.038 Min. :0.1017 Min. : 2.047 Min. : 5.147
## 1st Qu.:23.431 1st Qu.:2.8473 1st Qu.:12.968 1st Qu.:29.615
## Median :42.904 Median :5.1307 Median :24.500 Median :52.690
## Mean :42.872 Mean :5.1133 Mean :24.919 Mean :52.656
## 3rd Qu.:61.002 3rd Qu.:7.3941 3rd Qu.:36.622 3rd Qu.:75.876
## Max. :79.931 Max. :9.9600 Max. :49.801 Max. :99.487
## Temperature (°C) Humidity (%) Wind Speed (km/h) Rainfall (mm)
## Min. :10.05 Min. :20.03 Min. : 0.5178 Min. : 0.7514
## 1st Qu.:18.37 1st Qu.:44.45 1st Qu.: 5.1055 1st Qu.: 75.1218
## Median :27.27 Median :64.44 Median : 9.9258 Median :152.1604
## Mean :27.32 Mean :62.36 Mean :10.0604 Mean :152.1370
## 3rd Qu.:36.34 3rd Qu.:81.04 3rd Qu.:14.7584 3rd Qu.:229.8298
## Max. :44.98 Max. :99.93 Max. :19.9243 Max. :299.0146
## Pressure (hPa) Vehicle Count Industrial Activity Index
## Min. : 950.2 Min. : 1653 Min. :0.0774
## 1st Qu.: 975.9 1st Qu.:111060 1st Qu.:2.0299
## Median :1000.4 Median :245636 Median :4.7810
## Mean :1001.1 Mean :249754 Mean :4.8443
## 3rd Qu.:1027.4 3rd Qu.:380836 3rd Qu.:7.3941
## Max. :1049.8 Max. :499353 Max. :9.9806
## Health Impact Score ...17 age age_group
## Min. :10 Mode:logical Min. : 0.00 Length:599
## 1st Qu.:10 NA's:599 1st Qu.:40.00 Class :character
## Median :10 Median :53.00 Mode :character
## Mean :10 Mean :52.93
## 3rd Qu.:10 3rd Qu.:68.00
## Max. :10 Max. :95.00
## gender disease_name disease_category severity
## Length:599 Length:599 Length:599 Length:599
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## smoking_status blood_group
## Length:599 Length:599
## Class :character Class :character
## Mode :character Mode :character
##
##
##
colnames(db)
## [1] "City" "AQI"
## [3] "PM2.5" "PM10"
## [5] "NO2" "CO"
## [7] "SO2" "O3"
## [9] "Temperature (°C)" "Humidity (%)"
## [11] "Wind Speed (km/h)" "Rainfall (mm)"
## [13] "Pressure (hPa)" "Vehicle Count"
## [15] "Industrial Activity Index" "Health Impact Score"
## [17] "...17" "age"
## [19] "age_group" "gender"
## [21] "disease_name" "disease_category"
## [23] "severity" "smoking_status"
## [25] "blood_group"
#Data preprocessing i.e. removing empty columns and NA values
db_clean <- db %>%
select(!contains("Unnamed")) %>%
rename(
temp_c = `Temperature (°C)`,
humidity_pct = `Humidity (%)`,
wind_speed_kmh = `Wind Speed (km/h)`,
rainfall_mm = `Rainfall (mm)`,
pressure_hpa = `Pressure (hPa)`
) %>%
rename_with(tolower)
db_clean
#Removing missing values
db_clean <- db_clean %>%
drop_na(city, aqi, pm2.5, age, disease_name)
#Converting Categorical Data to Factors
db_final <- db_clean %>%
mutate(
gender = as.factor(gender),
disease_category = as.factor(disease_category),
severity = factor(severity, levels = c("Mild", "Moderate", "Severe", "Critical"), ordered = TRUE),
smoking_status = as.factor(smoking_status)
)
db_final <-db_clean %>%
select(where(~!all(is.na(.))))
summary(db_final)
## city aqi pm2.5 pm10
## Length:599 Min. : 52.0 Min. : 10.18 Min. : 20.26
## Class :character 1st Qu.:152.0 1st Qu.: 65.52 1st Qu.: 92.62
## Mode :character Median :272.0 Median :127.43 Median :172.38
## Mean :272.2 Mean :128.11 Mean :164.57
## 3rd Qu.:379.0 3rd Qu.:187.42 3rd Qu.:230.89
## Max. :498.0 Max. :249.64 Max. :299.94
## no2 co so2 o3
## Min. : 5.038 Min. :0.1017 Min. : 2.047 Min. : 5.147
## 1st Qu.:23.431 1st Qu.:2.8473 1st Qu.:12.968 1st Qu.:29.615
## Median :42.904 Median :5.1307 Median :24.500 Median :52.690
## Mean :42.872 Mean :5.1133 Mean :24.919 Mean :52.656
## 3rd Qu.:61.002 3rd Qu.:7.3941 3rd Qu.:36.622 3rd Qu.:75.876
## Max. :79.931 Max. :9.9600 Max. :49.801 Max. :99.487
## temp_c humidity_pct wind_speed_kmh rainfall_mm
## Min. :10.05 Min. :20.03 Min. : 0.5178 Min. : 0.7514
## 1st Qu.:18.37 1st Qu.:44.45 1st Qu.: 5.1055 1st Qu.: 75.1218
## Median :27.27 Median :64.44 Median : 9.9258 Median :152.1604
## Mean :27.32 Mean :62.36 Mean :10.0604 Mean :152.1370
## 3rd Qu.:36.34 3rd Qu.:81.04 3rd Qu.:14.7584 3rd Qu.:229.8298
## Max. :44.98 Max. :99.93 Max. :19.9243 Max. :299.0146
## pressure_hpa vehicle count industrial activity index
## Min. : 950.2 Min. : 1653 Min. :0.0774
## 1st Qu.: 975.9 1st Qu.:111060 1st Qu.:2.0299
## Median :1000.4 Median :245636 Median :4.7810
## Mean :1001.1 Mean :249754 Mean :4.8443
## 3rd Qu.:1027.4 3rd Qu.:380836 3rd Qu.:7.3941
## Max. :1049.8 Max. :499353 Max. :9.9806
## health impact score age age_group gender
## Min. :10 Min. : 0.00 Length:599 Length:599
## 1st Qu.:10 1st Qu.:40.00 Class :character Class :character
## Median :10 Median :53.00 Mode :character Mode :character
## Mean :10 Mean :52.93
## 3rd Qu.:10 3rd Qu.:68.00
## Max. :10 Max. :95.00
## disease_name disease_category severity smoking_status
## Length:599 Length:599 Length:599 Length:599
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## blood_group
## Length:599
## Class :character
## Mode :character
##
##
##
names(db_final)
## [1] "city" "aqi"
## [3] "pm2.5" "pm10"
## [5] "no2" "co"
## [7] "so2" "o3"
## [9] "temp_c" "humidity_pct"
## [11] "wind_speed_kmh" "rainfall_mm"
## [13] "pressure_hpa" "vehicle count"
## [15] "industrial activity index" "health impact score"
## [17] "age" "age_group"
## [19] "gender" "disease_name"
## [21] "disease_category" "severity"
## [23] "smoking_status" "blood_group"
#1 Average AQI across all cities
mean(db_final$aqi, na.rm = TRUE)
## [1] 272.2003
#2 Distribution of Temp and Humidity
ggplot(db_final) + geom_density(aes(x = temp_c), fill="red", alpha=0.3) + labs(title="Temp Distribution")

ggplot(db_final) + geom_density(aes(x = humidity_pct), fill="blue", alpha=0.3) + labs(title="Humidity Distribution")

#3 Cities with "Good" AQI (< 50)
db_final %>% filter(aqi < 50) %>% select(city) %>% unique()
#4 AQI Skewness (Histogram)
ggplot(db_final, aes(x = aqi)) + geom_histogram(bins = 30, fill = "steelblue", color = "white") + labs(title="AQI Histogram")

#5 City with highest Ozone (O3)
db_final %>% slice_max(o3, n = 1) %>% select(city, o3)
#6 Most common disease
db_final %>% count(disease_name, sort = TRUE) %>% head(1)
#7 Gender distribution
db_final %>% count(gender)
#8 Most frequent age group
db_final %>% count(age_group, sort = TRUE) %>% head(1)
#9 PM10 vs AQI Variation by City
ggplot(db_final, aes(x = pm10, y = aqi, color = city)) + geom_point() + facet_wrap(~city)

#10 Former Smokers: Disease Category distribution
db_final %>% filter(smoking_status == "Former") %>% count(disease_category)
#11 AQI vs Disease Severity (Boxplot)
ggplot(db_final, aes(x = severity, y = aqi, fill = severity)) + geom_boxplot() + theme_minimal()

#12 PM2.5 vs Health Impact Score (Scatter)
ggplot(db_final, aes(x = pm2.5, y = `health impact score`)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#13 Which city shows the highest "Vehicle Count" but relatively lower NO2?
b <- db_final %>%
group_by(city) %>%
summarise(
avg_vehicles = mean(`vehicle count`, na.rm = TRUE),
avg_no2 = mean(no2, na.rm = TRUE)
) %>%
mutate(efficiency_ratio = avg_vehicles / avg_no2) %>%
arrange(desc(efficiency_ratio))
head(b, 1)
ggplot(b, aes(x = avg_vehicles, y = avg_no2, label = city)) +
geom_point(color = "darkgreen", size = 3)+
labs(title = "Vehicle Count vs NO2 Levels by City",
x = "Average Vehicle Count",
y = "Average NO2 Level") +
theme_minimal()

#14 Top 5 Cities: Worst Air & Highest Severity
db_final %>%
group_by(city) %>%
summarise(
avg_aqi = mean(aqi, na.rm = TRUE),
avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)
) %>%
arrange(desc(avg_aqi))
## Warning: There were 24 warnings in `summarise()`.
## The first warning was:
## ℹ In argument: `avg_severity_score = mean(as.numeric(severity), na.rm = TRUE)`.
## ℹ In group 1: `city = "Agra"`.
## Caused by warning in `mean()`:
## ! NAs introduced by coercion
## ℹ Run `dplyr::last_dplyr_warnings()` to see the 23 remaining warnings.
#15 Pie Chart of Health Severity
db_final %>% count(severity) %>%
ggplot(aes(x = "", y = n, fill = severity)) +
geom_bar(stat = "identity") + coord_polar("y") + theme_void()

#16 Summary Table by City
summary_table <- db_final %>%
group_by(city) %>%
summarise(mean_aqi = mean(aqi), max_severity = max(severity), total_patients = n())
print(summary_table)
## # A tibble: 24 × 4
## city mean_aqi max_severity total_patients
## <chr> <dbl> <chr> <int>
## 1 Agra 250. Severe 26
## 2 Ahmedabad 304. Severe 24
## 3 Bangalore 270. Severe 23
## 4 Bhopal 252. Severe 26
## 5 Chennai 236. Severe 30
## 6 Delhi 247. Severe 22
## 7 Hyderabad 282. Severe 23
## 8 Indore 298. Severe 15
## 9 Jaipur 297. Severe 27
## 10 Kolkata 258. Severe 26
## # ℹ 14 more rows
#17 Young patients (<30) in high AQI showing "Severe"
db_final %>% filter(age < 30, aqi > 300, severity == "Severe")
#18 City AQI Ranking vs. 300 Threshold
db_final %>% group_by(city) %>% summarise(avg_aqi = mean(aqi)) %>%
ggplot(aes(x = reorder(city, avg_aqi), y = avg_aqi)) +
geom_col(fill="blue") + geom_hline(yintercept = 300, linetype="dashed") + coord_flip()

#19 Vehicle Density vs NO2 Correlation
ggplot(db_final, aes(x = `vehicle count`, y = no2)) + geom_point() + geom_smooth(method = "lm")
## `geom_smooth()` using formula = 'y ~ x'

#20 Relative Frequency of Diseases
ggplot(db_final, aes(x = fct_infreq(disease_name))) + geom_bar(fill="orange") + coord_flip()

#21 Severity Distribution across Genders
ggplot(db_final, aes(x = severity, fill = gender)) + geom_bar(position = "dodge") + theme_bw()

View(db_final)