This analysis explores air quality data across multiple cities and examines relationships between air pollution and health outcomes.
# Load necessary libraries
library(tidyverse)
library(readr)
library(reshape2)
library(ggplot2)
library(dplyr)
library(corrplot)
library(gridExtra)
library(knitr)
library(DT)
# Install and load Amelia if needed
if (!require(Amelia)) {
install.packages("Amelia")
library(Amelia)
}
# Load the dataset
data <- read.csv("air_quality_health_dataset.csv")
# Display basic information about the dataset
kable(head(data), caption = "First 6 rows of the dataset")
| city | date | aqi | pm2_5 | pm10 | no2 | o3 | temperature | humidity | hospital_admissions | population_density | hospital_capacity |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Los Angeles | 1/1/2020 | 65 | 34.0 | 52.7 | 2.2 | 38.5 | 33.5 | 33 | 5 | Rural | 1337 |
| Beijing | 1/2/2020 | 137 | 33.7 | 31.5 | 36.7 | 27.5 | -1.6 | 32 | 4 | Urban | 1545 |
| London | 1/3/2020 | 266 | 43.0 | 59.6 | 30.4 | 57.3 | 36.4 | 25 | 10 | Suburban | 1539 |
| Mexico City | 1/4/2020 | 293 | 33.7 | 37.9 | 12.3 | 42.7 | -1.0 | 67 | 10 | Urban | 552 |
| Delhi | 1/5/2020 | 493 | 50.3 | 34.8 | 31.2 | 35.6 | 33.5 | 72 | 9 | Suburban | 1631 |
| Cairo | 1/6/2020 | 28 | 67.2 | 44.9 | 41.9 | 47.8 | 7.9 | 89 | 11 | Urban | 1291 |
# Dataset dimensions
cat("Dataset dimensions:", dim(data)[1], "rows,", dim(data)[2], "columns\n")
## Dataset dimensions: 88489 rows, 12 columns
# Column names
cat("Column names:", paste(names(data), collapse = ", "))
## Column names: city, date, aqi, pm2_5, pm10, no2, o3, temperature, humidity, hospital_admissions, population_density, hospital_capacity
# Check for missing values
missing_summary <- data.frame(
Column = names(data),
Missing_Count = colSums(is.na(data)),
Missing_Percentage = round(colSums(is.na(data))/nrow(data) * 100, 2)
)
kable(missing_summary, caption = "Missing Values Summary")
| Column | Missing_Count | Missing_Percentage | |
|---|---|---|---|
| city | city | 0 | 0.00 |
| date | date | 0 | 0.00 |
| aqi | aqi | 2 | 0.00 |
| pm2_5 | pm2_5 | 1 | 0.00 |
| pm10 | pm10 | 1 | 0.00 |
| no2 | no2 | 3 | 0.00 |
| o3 | o3 | 0 | 0.00 |
| temperature | temperature | 8 | 0.01 |
| humidity | humidity | 0 | 0.00 |
| hospital_admissions | hospital_admissions | 1 | 0.00 |
| population_density | population_density | 0 | 0.00 |
| hospital_capacity | hospital_capacity | 1 | 0.00 |
# Visualize missing data
missmap(data, main = "Missing Values Heatmap")
Missing Values Heatmap
# Handle missing data: Replace NA values with median
data_imputed <- data %>%
mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))
# Treat outliers by Winsorizing
data_treated <- data_imputed %>%
mutate(across(where(is.numeric), ~ {
Q1 <- quantile(., 0.25, na.rm = TRUE)
Q3 <- quantile(., 0.75, na.rm = TRUE)
IQR_val <- IQR(., na.rm = TRUE)
pmax(pmin(., Q3 + 1.5 * IQR_val), Q1 - 1.5 * IQR_val)
}))
# Create new variable: Hospital Admission Percentage
data_treated <- data_treated %>%
mutate(HospitalAdmissionPercent = (hospital_admissions / hospital_capacity) * 100)
# Use population_density as area_type
data_treated$area_type <- data_treated$population_density
cat("Data preprocessing completed successfully!")
## Data preprocessing completed successfully!
# Five-number summary function
five_summary <- function(df) {
numeric_cols <- names(df)[sapply(df, is.numeric)]
numeric_data <- df %>% select(all_of(numeric_cols))
if (ncol(numeric_data) == 0) {
return(data.frame(Variable = character(), Min = numeric(), Max = numeric(),
Mean = numeric(), Median = numeric(), SD = numeric()))
}
summary_stats <- numeric_data %>%
summarise(across(everything(), list(
Min = ~min(., na.rm = TRUE),
Max = ~max(., na.rm = TRUE),
Mean = ~mean(., na.rm = TRUE),
Median = ~median(., na.rm = TRUE),
SD = ~sd(., na.rm = TRUE)
), .names = "{.col}_{.fn}"))
summary_long <- summary_stats %>%
pivot_longer(everything(), names_to = "Variable_Metric", values_to = "Value") %>%
separate(Variable_Metric, into = c("Variable", "Metric"), sep = "_(?=[^_]+$)") %>%
pivot_wider(names_from = Metric, values_from = Value)
return(summary_long)
}
# Display summary statistics
summary_table <- five_summary(data_treated)
kable(summary_table, digits = 2, caption = "Five-Number Summary Statistics")
| Variable | Min | Max | Mean | Median | SD |
|---|---|---|---|---|---|
| aqi | 0.0 | 499.00 | 249.37 | 249.00 | 144.48 |
| pm2_5 | 0.0 | 75.65 | 35.13 | 35.10 | 14.72 |
| pm10 | 0.0 | 103.85 | 50.09 | 50.00 | 19.72 |
| no2 | 3.2 | 56.80 | 30.00 | 30.00 | 9.92 |
| o3 | 7.6 | 72.40 | 39.98 | 40.00 | 11.94 |
| temperature | -5.0 | 40.00 | 17.52 | 17.50 | 12.96 |
| humidity | 20.0 | 94.00 | 56.95 | 57.00 | 21.63 |
| hospital_admissions | 0.0 | 16.00 | 8.02 | 8.00 | 3.63 |
| hospital_capacity | 50.0 | 1999.00 | 1024.45 | 1026.00 | 561.97 |
| HospitalAdmissionPercent | 0.0 | 32.00 | 1.51 | 0.79 | 2.32 |
# Hospital admissions by city
case_counts <- data_treated %>%
group_by(city) %>%
summarise(
TotalAdmissions = sum(hospital_admissions, na.rm = TRUE),
AvgAQI = round(mean(aqi, na.rm = TRUE), 2),
AvgPM25 = round(mean(pm2_5, na.rm = TRUE), 2),
AvgPM10 = round(mean(pm10, na.rm = TRUE), 2),
.groups = 'drop'
) %>%
arrange(desc(TotalAdmissions))
kable(case_counts, caption = "Hospital Admissions and Air Quality by City")
| city | TotalAdmissions | AvgAQI | AvgPM25 | AvgPM10 |
|---|---|---|---|---|
| Delhi | 211993 | 248.91 | 35.10 | 50.06 |
| Beijing | 177929 | 249.32 | 35.10 | 50.09 |
| Mexico City | 106690 | 248.61 | 35.04 | 50.05 |
| Los Angeles | 71934 | 253.07 | 35.08 | 50.58 |
| London | 56039 | 248.43 | 35.39 | 49.67 |
| Tokyo | 49164 | 247.56 | 35.36 | 49.96 |
| Cairo | 21809 | 252.60 | 35.19 | 50.13 |
| São Paulo | 13847 | 248.89 | 34.98 | 50.55 |
temp_plot <- ggplot(data_treated, aes(x = area_type, y = temperature, fill = area_type)) +
geom_boxplot(alpha = 0.7) +
theme_minimal() +
labs(title = "Temperature Comparison by Area Type",
y = "Temperature (°C)",
x = "Area Type") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
) +
scale_fill_brewer(palette = "Set3")
print(temp_plot)
Temperature Distribution Across Different Area Types
# Calculate averages
pm_avg <- data_treated %>%
group_by(city, area_type) %>%
summarise(avg_pm2_5 = mean(pm2_5, na.rm = TRUE),
avg_pm10 = mean(pm10, na.rm = TRUE),
.groups = 'drop')
# PM2.5 plot
pm25_plot <- ggplot(pm_avg, aes(x = city, y = avg_pm2_5, fill = area_type)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
theme_minimal() +
labs(title = "Average PM2.5 by City and Area Type",
y = "Average PM2.5 (μg/m³)",
x = "City") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
) +
scale_fill_brewer(palette = "Set2")
# PM10 plot
pm10_plot <- ggplot(pm_avg, aes(x = city, y = avg_pm10, fill = area_type)) +
geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
theme_minimal() +
labs(title = "Average PM10 by City and Area Type",
y = "Average PM10 (μg/m³)",
x = "City") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
) +
scale_fill_brewer(palette = "Set2")
# Display plots
print(pm25_plot)
PM2.5 and PM10 Levels by City and Area Type
print(pm10_plot)
PM2.5 and PM10 Levels by City and Area Type
aqi_hist <- ggplot(data_treated, aes(x = aqi)) +
geom_histogram(bins = 30, fill = "skyblue", alpha = 0.7, color = "black") +
theme_minimal() +
labs(title = "Distribution of AQI Values",
x = "Air Quality Index (AQI)",
y = "Frequency") +
theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))
print(aqi_hist)
Distribution of Air Quality Index Values
admission_plot <- ggplot(data_treated, aes(x = city, y = HospitalAdmissionPercent)) +
geom_boxplot(fill = "lightcoral", alpha = 0.7) +
theme_minimal() +
labs(title = "Hospital Admission Percentage by City",
x = "City",
y = "Hospital Admission Percentage (%)") +
theme(
axis.text.x = element_text(angle = 45, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
)
print(admission_plot)
Hospital Admission Percentage by City
# Select numeric variables for correlation
numeric_vars <- c("aqi", "pm2_5", "pm10", "no2", "o3", "temperature", "humidity",
"hospital_admissions", "hospital_capacity", "HospitalAdmissionPercent")
existing_vars <- numeric_vars[numeric_vars %in% names(data_treated)]
correlation_matrix <- cor(data_treated[existing_vars], use = "complete.obs")
# Correlation heatmap using corrplot
corrplot(correlation_matrix,
method = "color",
type = "upper",
order = "hclust",
tl.cex = 0.8,
tl.col = "black",
tl.srt = 45,
title = "Correlation Heatmap: Air Quality Variables")
Correlation Matrix of Air Quality Variables
# ggplot version
correlation_df <- as.data.frame(as.table(correlation_matrix))
names(correlation_df) <- c("Var1", "Var2", "Correlation")
corr_ggplot <- ggplot(correlation_df, aes(Var1, Var2, fill = Correlation)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 0, limit = c(-1,1), space = "Lab",
name="Correlation") +
theme_minimal() +
theme(
axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
) +
labs(title = "Correlation Heatmap (ggplot2)", x = "", y = "") +
coord_fixed()
print(corr_ggplot)
Correlation Matrix of Air Quality Variables
# Create an interactive table showing key metrics by city
city_summary <- data_treated %>%
group_by(city, area_type) %>%
summarise(
Records = n(),
Avg_AQI = round(mean(aqi, na.rm = TRUE), 1),
Avg_PM2.5 = round(mean(pm2_5, na.rm = TRUE), 1),
Avg_PM10 = round(mean(pm10, na.rm = TRUE), 1),
Avg_NO2 = round(mean(no2, na.rm = TRUE), 1),
Avg_O3 = round(mean(o3, na.rm = TRUE), 1),
Avg_Temperature = round(mean(temperature, na.rm = TRUE), 1),
Avg_Humidity = round(mean(humidity, na.rm = TRUE), 1),
Total_Admissions = sum(hospital_admissions, na.rm = TRUE),
Avg_Admission_Percent = round(mean(HospitalAdmissionPercent, na.rm = TRUE), 2),
.groups = 'drop'
) %>%
arrange(desc(Avg_AQI))
DT::datatable(city_summary,
caption = "Interactive Summary Table - Air Quality Metrics by City",
options = list(scrollX = TRUE, pageLength = 10))
# Calculate some key statistics for findings
max_aqi_city <- data_treated %>%
group_by(city) %>%
summarise(avg_aqi = mean(aqi, na.rm = TRUE), .groups = 'drop') %>%
arrange(desc(avg_aqi)) %>%
slice(1)
max_admissions_city <- case_counts %>% slice(1)
cat("Key Findings:\n")
## Key Findings:
cat("1. City with highest average AQI:", max_aqi_city$city, "with AQI of", round(max_aqi_city$avg_aqi, 1), "\n")
## 1. City with highest average AQI: Los Angeles with AQI of 253.1
cat("2. City with most hospital admissions:", max_admissions_city$city, "with", max_admissions_city$TotalAdmissions, "admissions\n")
## 2. City with most hospital admissions: Delhi with 211993 admissions
cat("3. Total number of cities analyzed:", length(unique(data_treated$city)), "\n")
## 3. Total number of cities analyzed: 8
cat("4. Total number of records:", nrow(data_treated), "\n")
## 4. Total number of records: 88489
This analysis provides a comprehensive overview of air quality conditions across multiple cities and their relationship with health outcomes. The visualizations reveal patterns in pollution levels across different area types and highlight cities with the most concerning air quality and health metrics.
Analysis completed on 2025-07-03