Air Quality Analysis Dashboard

This analysis explores air quality data across multiple cities and examines relationships between air pollution and health outcomes.

Load Libraries and Data

# Load necessary libraries
library(tidyverse)
library(readr)
library(reshape2)
library(ggplot2)
library(dplyr)
library(corrplot)
library(gridExtra)
library(knitr)
library(DT)

# Install and load Amelia if needed
if (!require(Amelia)) {
  install.packages("Amelia")
  library(Amelia)
}

# Load the dataset
data <- read.csv("air_quality_health_dataset.csv")

Data Overview

# Display basic information about the dataset
kable(head(data), caption = "First 6 rows of the dataset")
First 6 rows of the dataset
city date aqi pm2_5 pm10 no2 o3 temperature humidity hospital_admissions population_density hospital_capacity
Los Angeles 1/1/2020 65 34.0 52.7 2.2 38.5 33.5 33 5 Rural 1337
Beijing 1/2/2020 137 33.7 31.5 36.7 27.5 -1.6 32 4 Urban 1545
London 1/3/2020 266 43.0 59.6 30.4 57.3 36.4 25 10 Suburban 1539
Mexico City 1/4/2020 293 33.7 37.9 12.3 42.7 -1.0 67 10 Urban 552
Delhi 1/5/2020 493 50.3 34.8 31.2 35.6 33.5 72 9 Suburban 1631
Cairo 1/6/2020 28 67.2 44.9 41.9 47.8 7.9 89 11 Urban 1291
# Dataset dimensions
cat("Dataset dimensions:", dim(data)[1], "rows,", dim(data)[2], "columns\n")
## Dataset dimensions: 88489 rows, 12 columns
# Column names
cat("Column names:", paste(names(data), collapse = ", "))
## Column names: city, date, aqi, pm2_5, pm10, no2, o3, temperature, humidity, hospital_admissions, population_density, hospital_capacity

Missing Data Analysis

# Check for missing values
missing_summary <- data.frame(
  Column = names(data),
  Missing_Count = colSums(is.na(data)),
  Missing_Percentage = round(colSums(is.na(data))/nrow(data) * 100, 2)
)

kable(missing_summary, caption = "Missing Values Summary")
Missing Values Summary
Column Missing_Count Missing_Percentage
city city 0 0.00
date date 0 0.00
aqi aqi 2 0.00
pm2_5 pm2_5 1 0.00
pm10 pm10 1 0.00
no2 no2 3 0.00
o3 o3 0 0.00
temperature temperature 8 0.01
humidity humidity 0 0.00
hospital_admissions hospital_admissions 1 0.00
population_density population_density 0 0.00
hospital_capacity hospital_capacity 1 0.00
# Visualize missing data
missmap(data, main = "Missing Values Heatmap")
Missing Values Heatmap

Missing Values Heatmap

Data Preprocessing

# Handle missing data: Replace NA values with median
data_imputed <- data %>% 
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

# Treat outliers by Winsorizing
data_treated <- data_imputed %>% 
  mutate(across(where(is.numeric), ~ {
    Q1 <- quantile(., 0.25, na.rm = TRUE)
    Q3 <- quantile(., 0.75, na.rm = TRUE)
    IQR_val <- IQR(., na.rm = TRUE)
    pmax(pmin(., Q3 + 1.5 * IQR_val), Q1 - 1.5 * IQR_val)
  }))

# Create new variable: Hospital Admission Percentage
data_treated <- data_treated %>% 
  mutate(HospitalAdmissionPercent = (hospital_admissions / hospital_capacity) * 100)

# Use population_density as area_type
data_treated$area_type <- data_treated$population_density

cat("Data preprocessing completed successfully!")
## Data preprocessing completed successfully!

Summary Statistics

# Five-number summary function
five_summary <- function(df) {
  numeric_cols <- names(df)[sapply(df, is.numeric)]
  numeric_data <- df %>% select(all_of(numeric_cols))
  
  if (ncol(numeric_data) == 0) {
    return(data.frame(Variable = character(), Min = numeric(), Max = numeric(), 
                     Mean = numeric(), Median = numeric(), SD = numeric()))
  }
  
  summary_stats <- numeric_data %>% 
    summarise(across(everything(), list(
      Min = ~min(., na.rm = TRUE),
      Max = ~max(., na.rm = TRUE),
      Mean = ~mean(., na.rm = TRUE),
      Median = ~median(., na.rm = TRUE),
      SD = ~sd(., na.rm = TRUE)
    ), .names = "{.col}_{.fn}"))
  
  summary_long <- summary_stats %>% 
    pivot_longer(everything(), names_to = "Variable_Metric", values_to = "Value") %>%
    separate(Variable_Metric, into = c("Variable", "Metric"), sep = "_(?=[^_]+$)") %>%
    pivot_wider(names_from = Metric, values_from = Value)
  
  return(summary_long)
}

# Display summary statistics
summary_table <- five_summary(data_treated)
kable(summary_table, digits = 2, caption = "Five-Number Summary Statistics")
Five-Number Summary Statistics
Variable Min Max Mean Median SD
aqi 0.0 499.00 249.37 249.00 144.48
pm2_5 0.0 75.65 35.13 35.10 14.72
pm10 0.0 103.85 50.09 50.00 19.72
no2 3.2 56.80 30.00 30.00 9.92
o3 7.6 72.40 39.98 40.00 11.94
temperature -5.0 40.00 17.52 17.50 12.96
humidity 20.0 94.00 56.95 57.00 21.63
hospital_admissions 0.0 16.00 8.02 8.00 3.63
hospital_capacity 50.0 1999.00 1024.45 1026.00 561.97
HospitalAdmissionPercent 0.0 32.00 1.51 0.79 2.32

City Analysis

# Hospital admissions by city
case_counts <- data_treated %>% 
  group_by(city) %>%
  summarise(
    TotalAdmissions = sum(hospital_admissions, na.rm = TRUE),
    AvgAQI = round(mean(aqi, na.rm = TRUE), 2),
    AvgPM25 = round(mean(pm2_5, na.rm = TRUE), 2),
    AvgPM10 = round(mean(pm10, na.rm = TRUE), 2),
    .groups = 'drop'
  ) %>% 
  arrange(desc(TotalAdmissions))

kable(case_counts, caption = "Hospital Admissions and Air Quality by City")
Hospital Admissions and Air Quality by City
city TotalAdmissions AvgAQI AvgPM25 AvgPM10
Delhi 211993 248.91 35.10 50.06
Beijing 177929 249.32 35.10 50.09
Mexico City 106690 248.61 35.04 50.05
Los Angeles 71934 253.07 35.08 50.58
London 56039 248.43 35.39 49.67
Tokyo 49164 247.56 35.36 49.96
Cairo 21809 252.60 35.19 50.13
São Paulo 13847 248.89 34.98 50.55

Visualizations

Temperature Comparison by Area Type

temp_plot <- ggplot(data_treated, aes(x = area_type, y = temperature, fill = area_type)) + 
  geom_boxplot(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Temperature Comparison by Area Type", 
       y = "Temperature (°C)", 
       x = "Area Type") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  scale_fill_brewer(palette = "Set3")

print(temp_plot)
Temperature Distribution Across Different Area Types

Temperature Distribution Across Different Area Types

PM2.5 and PM10 Comparison

# Calculate averages
pm_avg <- data_treated %>% 
  group_by(city, area_type) %>% 
  summarise(avg_pm2_5 = mean(pm2_5, na.rm = TRUE),
            avg_pm10 = mean(pm10, na.rm = TRUE), 
            .groups = 'drop')

# PM2.5 plot
pm25_plot <- ggplot(pm_avg, aes(x = city, y = avg_pm2_5, fill = area_type)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Average PM2.5 by City and Area Type", 
       y = "Average PM2.5 (μg/m³)",
       x = "City") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  scale_fill_brewer(palette = "Set2")

# PM10 plot
pm10_plot <- ggplot(pm_avg, aes(x = city, y = avg_pm10, fill = area_type)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Average PM10 by City and Area Type", 
       y = "Average PM10 (μg/m³)",
       x = "City") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  scale_fill_brewer(palette = "Set2")

# Display plots
print(pm25_plot)
PM2.5 and PM10 Levels by City and Area Type

PM2.5 and PM10 Levels by City and Area Type

print(pm10_plot)
PM2.5 and PM10 Levels by City and Area Type

PM2.5 and PM10 Levels by City and Area Type

AQI Distribution

aqi_hist <- ggplot(data_treated, aes(x = aqi)) +
  geom_histogram(bins = 30, fill = "skyblue", alpha = 0.7, color = "black") +
  theme_minimal() +
  labs(title = "Distribution of AQI Values",
       x = "Air Quality Index (AQI)", 
       y = "Frequency") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

print(aqi_hist)
Distribution of Air Quality Index Values

Distribution of Air Quality Index Values

Hospital Admissions Analysis

admission_plot <- ggplot(data_treated, aes(x = city, y = HospitalAdmissionPercent)) +
  geom_boxplot(fill = "lightcoral", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Hospital Admission Percentage by City",
       x = "City", 
       y = "Hospital Admission Percentage (%)") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  )

print(admission_plot)
Hospital Admission Percentage by City

Hospital Admission Percentage by City

Correlation Analysis

# Select numeric variables for correlation
numeric_vars <- c("aqi", "pm2_5", "pm10", "no2", "o3", "temperature", "humidity", 
                 "hospital_admissions", "hospital_capacity", "HospitalAdmissionPercent")

existing_vars <- numeric_vars[numeric_vars %in% names(data_treated)]
correlation_matrix <- cor(data_treated[existing_vars], use = "complete.obs")

# Correlation heatmap using corrplot
corrplot(correlation_matrix, 
         method = "color",
         type = "upper",
         order = "hclust",
         tl.cex = 0.8,
         tl.col = "black",
         tl.srt = 45,
         title = "Correlation Heatmap: Air Quality Variables")
Correlation Matrix of Air Quality Variables

Correlation Matrix of Air Quality Variables

# ggplot version
correlation_df <- as.data.frame(as.table(correlation_matrix))
names(correlation_df) <- c("Var1", "Var2", "Correlation")

corr_ggplot <- ggplot(correlation_df, aes(Var1, Var2, fill = Correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  labs(title = "Correlation Heatmap (ggplot2)", x = "", y = "") +
  coord_fixed()

print(corr_ggplot)
Correlation Matrix of Air Quality Variables

Correlation Matrix of Air Quality Variables

Interactive Data Table

# Create an interactive table showing key metrics by city
city_summary <- data_treated %>% 
  group_by(city, area_type) %>% 
  summarise(
    Records = n(),
    Avg_AQI = round(mean(aqi, na.rm = TRUE), 1),
    Avg_PM2.5 = round(mean(pm2_5, na.rm = TRUE), 1),
    Avg_PM10 = round(mean(pm10, na.rm = TRUE), 1),
    Avg_NO2 = round(mean(no2, na.rm = TRUE), 1),
    Avg_O3 = round(mean(o3, na.rm = TRUE), 1),
    Avg_Temperature = round(mean(temperature, na.rm = TRUE), 1),
    Avg_Humidity = round(mean(humidity, na.rm = TRUE), 1),
    Total_Admissions = sum(hospital_admissions, na.rm = TRUE),
    Avg_Admission_Percent = round(mean(HospitalAdmissionPercent, na.rm = TRUE), 2),
    .groups = 'drop'
  ) %>%
  arrange(desc(Avg_AQI))

DT::datatable(city_summary, 
              caption = "Interactive Summary Table - Air Quality Metrics by City",
              options = list(scrollX = TRUE, pageLength = 10))

Key Findings

# Calculate some key statistics for findings
max_aqi_city <- data_treated %>% 
  group_by(city) %>% 
  summarise(avg_aqi = mean(aqi, na.rm = TRUE), .groups = 'drop') %>% 
  arrange(desc(avg_aqi)) %>% 
  slice(1)

max_admissions_city <- case_counts %>% slice(1)

cat("Key Findings:\n")
## Key Findings:
cat("1. City with highest average AQI:", max_aqi_city$city, "with AQI of", round(max_aqi_city$avg_aqi, 1), "\n")
## 1. City with highest average AQI: Los Angeles with AQI of 253.1
cat("2. City with most hospital admissions:", max_admissions_city$city, "with", max_admissions_city$TotalAdmissions, "admissions\n")
## 2. City with most hospital admissions: Delhi with 211993 admissions
cat("3. Total number of cities analyzed:", length(unique(data_treated$city)), "\n")
## 3. Total number of cities analyzed: 8
cat("4. Total number of records:", nrow(data_treated), "\n")
## 4. Total number of records: 88489

Conclusion

This analysis provides a comprehensive overview of air quality conditions across multiple cities and their relationship with health outcomes. The visualizations reveal patterns in pollution levels across different area types and highlight cities with the most concerning air quality and health metrics.


Analysis completed on 2025-07-03