Air Quality Analysis Dashboard

This analysis explores air quality data across multiple cities and examines relationships between air pollution and health outcomes.

Load Libraries and Data

# Load necessary libraries
library(tidyverse)
library(readr)
library(reshape2)
library(ggplot2)
library(dplyr)
library(corrplot)
library(gridExtra)
library(knitr)
library(DT)

# Install and load Amelia if needed
if (!require(Amelia)) {
  install.packages("Amelia")
  library(Amelia)
}

# Load the dataset
data <- read.csv("air_quality_health_dataset.csv")

Data Overview

# Display basic information about the dataset
kable(head(data), caption = "First 6 rows of the dataset")

First 6 rows of the dataset
city	date	aqi	pm2_5	pm10	no2	o3	temperature	humidity	hospital_admissions	population_density	hospital_capacity
Los Angeles	1/1/2020	65	34.0	52.7	2.2	38.5	33.5	33	5	Rural	1337
Beijing	1/2/2020	137	33.7	31.5	36.7	27.5	-1.6	32	4	Urban	1545
London	1/3/2020	266	43.0	59.6	30.4	57.3	36.4	25	10	Suburban	1539
Mexico City	1/4/2020	293	33.7	37.9	12.3	42.7	-1.0	67	10	Urban	552
Delhi	1/5/2020	493	50.3	34.8	31.2	35.6	33.5	72	9	Suburban	1631
Cairo	1/6/2020	28	67.2	44.9	41.9	47.8	7.9	89	11	Urban	1291

# Dataset dimensions
cat("Dataset dimensions:", dim(data)[1], "rows,", dim(data)[2], "columns\n")

## Dataset dimensions: 88489 rows, 12 columns

# Column names
cat("Column names:", paste(names(data), collapse = ", "))

## Column names: city, date, aqi, pm2_5, pm10, no2, o3, temperature, humidity, hospital_admissions, population_density, hospital_capacity

Missing Data Analysis

# Check for missing values
missing_summary <- data.frame(
  Column = names(data),
  Missing_Count = colSums(is.na(data)),
  Missing_Percentage = round(colSums(is.na(data))/nrow(data) * 100, 2)
)

kable(missing_summary, caption = "Missing Values Summary")

Missing Values Summary
	Column	Missing_Count	Missing_Percentage
city	city	0	0.00
date	date	0	0.00
aqi	aqi	2	0.00
pm2_5	pm2_5	1	0.00
pm10	pm10	1	0.00
no2	no2	3	0.00
o3	o3	0	0.00
temperature	temperature	8	0.01
humidity	humidity	0	0.00
hospital_admissions	hospital_admissions	1	0.00
population_density	population_density	0	0.00
hospital_capacity	hospital_capacity	1	0.00

# Visualize missing data
missmap(data, main = "Missing Values Heatmap")

Missing Values Heatmap

Data Preprocessing

# Handle missing data: Replace NA values with median
data_imputed <- data %>% 
  mutate(across(where(is.numeric), ~ ifelse(is.na(.), median(., na.rm = TRUE), .)))

# Treat outliers by Winsorizing
data_treated <- data_imputed %>% 
  mutate(across(where(is.numeric), ~ {
    Q1 <- quantile(., 0.25, na.rm = TRUE)
    Q3 <- quantile(., 0.75, na.rm = TRUE)
    IQR_val <- IQR(., na.rm = TRUE)
    pmax(pmin(., Q3 + 1.5 * IQR_val), Q1 - 1.5 * IQR_val)
  }))

# Create new variable: Hospital Admission Percentage
data_treated <- data_treated %>% 
  mutate(HospitalAdmissionPercent = (hospital_admissions / hospital_capacity) * 100)

# Use population_density as area_type
data_treated$area_type <- data_treated$population_density

cat("Data preprocessing completed successfully!")

## Data preprocessing completed successfully!

Summary Statistics

# Five-number summary function
five_summary <- function(df) {
  numeric_cols <- names(df)[sapply(df, is.numeric)]
  numeric_data <- df %>% select(all_of(numeric_cols))
  
  if (ncol(numeric_data) == 0) {
    return(data.frame(Variable = character(), Min = numeric(), Max = numeric(), 
                     Mean = numeric(), Median = numeric(), SD = numeric()))
  }
  
  summary_stats <- numeric_data %>% 
    summarise(across(everything(), list(
      Min = ~min(., na.rm = TRUE),
      Max = ~max(., na.rm = TRUE),
      Mean = ~mean(., na.rm = TRUE),
      Median = ~median(., na.rm = TRUE),
      SD = ~sd(., na.rm = TRUE)
    ), .names = "{.col}_{.fn}"))
  
  summary_long <- summary_stats %>% 
    pivot_longer(everything(), names_to = "Variable_Metric", values_to = "Value") %>%
    separate(Variable_Metric, into = c("Variable", "Metric"), sep = "_(?=[^_]+$)") %>%
    pivot_wider(names_from = Metric, values_from = Value)
  
  return(summary_long)
}

# Display summary statistics
summary_table <- five_summary(data_treated)
kable(summary_table, digits = 2, caption = "Five-Number Summary Statistics")

Five-Number Summary Statistics
Variable	Min	Max	Mean	Median	SD
aqi	0.0	499.00	249.37	249.00	144.48
pm2_5	0.0	75.65	35.13	35.10	14.72
pm10	0.0	103.85	50.09	50.00	19.72
no2	3.2	56.80	30.00	30.00	9.92
o3	7.6	72.40	39.98	40.00	11.94
temperature	-5.0	40.00	17.52	17.50	12.96
humidity	20.0	94.00	56.95	57.00	21.63
hospital_admissions	0.0	16.00	8.02	8.00	3.63
hospital_capacity	50.0	1999.00	1024.45	1026.00	561.97
HospitalAdmissionPercent	0.0	32.00	1.51	0.79	2.32

City Analysis

# Hospital admissions by city
case_counts <- data_treated %>% 
  group_by(city) %>%
  summarise(
    TotalAdmissions = sum(hospital_admissions, na.rm = TRUE),
    AvgAQI = round(mean(aqi, na.rm = TRUE), 2),
    AvgPM25 = round(mean(pm2_5, na.rm = TRUE), 2),
    AvgPM10 = round(mean(pm10, na.rm = TRUE), 2),
    .groups = 'drop'
  ) %>% 
  arrange(desc(TotalAdmissions))

kable(case_counts, caption = "Hospital Admissions and Air Quality by City")

Hospital Admissions and Air Quality by City
city	TotalAdmissions	AvgAQI	AvgPM25	AvgPM10
Delhi	211993	248.91	35.10	50.06
Beijing	177929	249.32	35.10	50.09
Mexico City	106690	248.61	35.04	50.05
Los Angeles	71934	253.07	35.08	50.58
London	56039	248.43	35.39	49.67
Tokyo	49164	247.56	35.36	49.96
Cairo	21809	252.60	35.19	50.13
São Paulo	13847	248.89	34.98	50.55

Visualizations

Temperature Comparison by Area Type

temp_plot <- ggplot(data_treated, aes(x = area_type, y = temperature, fill = area_type)) + 
  geom_boxplot(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Temperature Comparison by Area Type", 
       y = "Temperature (°C)", 
       x = "Area Type") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  scale_fill_brewer(palette = "Set3")

print(temp_plot)

Temperature Distribution Across Different Area Types

PM2.5 and PM10 Comparison

# Calculate averages
pm_avg <- data_treated %>% 
  group_by(city, area_type) %>% 
  summarise(avg_pm2_5 = mean(pm2_5, na.rm = TRUE),
            avg_pm10 = mean(pm10, na.rm = TRUE), 
            .groups = 'drop')

# PM2.5 plot
pm25_plot <- ggplot(pm_avg, aes(x = city, y = avg_pm2_5, fill = area_type)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Average PM2.5 by City and Area Type", 
       y = "Average PM2.5 (μg/m³)",
       x = "City") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  scale_fill_brewer(palette = "Set2")

# PM10 plot
pm10_plot <- ggplot(pm_avg, aes(x = city, y = avg_pm10, fill = area_type)) +
  geom_bar(stat = "identity", position = "dodge", alpha = 0.8) +
  theme_minimal() +
  labs(title = "Average PM10 by City and Area Type", 
       y = "Average PM10 (μg/m³)",
       x = "City") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  scale_fill_brewer(palette = "Set2")

# Display plots
print(pm25_plot)

PM2.5 and PM10 Levels by City and Area Type

print(pm10_plot)

PM2.5 and PM10 Levels by City and Area Type

AQI Distribution

aqi_hist <- ggplot(data_treated, aes(x = aqi)) +
  geom_histogram(bins = 30, fill = "skyblue", alpha = 0.7, color = "black") +
  theme_minimal() +
  labs(title = "Distribution of AQI Values",
       x = "Air Quality Index (AQI)", 
       y = "Frequency") +
  theme(plot.title = element_text(hjust = 0.5, size = 14, face = "bold"))

print(aqi_hist)

Distribution of Air Quality Index Values

Hospital Admissions Analysis

admission_plot <- ggplot(data_treated, aes(x = city, y = HospitalAdmissionPercent)) +
  geom_boxplot(fill = "lightcoral", alpha = 0.7) +
  theme_minimal() +
  labs(title = "Hospital Admission Percentage by City",
       x = "City", 
       y = "Hospital Admission Percentage (%)") +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  )

print(admission_plot)

Hospital Admission Percentage by City

Correlation Analysis

# Select numeric variables for correlation
numeric_vars <- c("aqi", "pm2_5", "pm10", "no2", "o3", "temperature", "humidity", 
                 "hospital_admissions", "hospital_capacity", "HospitalAdmissionPercent")

existing_vars <- numeric_vars[numeric_vars %in% names(data_treated)]
correlation_matrix <- cor(data_treated[existing_vars], use = "complete.obs")

# Correlation heatmap using corrplot
corrplot(correlation_matrix, 
         method = "color",
         type = "upper",
         order = "hclust",
         tl.cex = 0.8,
         tl.col = "black",
         tl.srt = 45,
         title = "Correlation Heatmap: Air Quality Variables")

Correlation Matrix of Air Quality Variables

# ggplot version
correlation_df <- as.data.frame(as.table(correlation_matrix))
names(correlation_df) <- c("Var1", "Var2", "Correlation")

corr_ggplot <- ggplot(correlation_df, aes(Var1, Var2, fill = Correlation)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = "blue", high = "red", mid = "white", 
                       midpoint = 0, limit = c(-1,1), space = "Lab", 
                       name="Correlation") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, hjust = 1),
    plot.title = element_text(hjust = 0.5, size = 14, face = "bold")
  ) +
  labs(title = "Correlation Heatmap (ggplot2)", x = "", y = "") +
  coord_fixed()

print(corr_ggplot)

Correlation Matrix of Air Quality Variables

Interactive Data Table

# Create an interactive table showing key metrics by city
city_summary <- data_treated %>% 
  group_by(city, area_type) %>% 
  summarise(
    Records = n(),
    Avg_AQI = round(mean(aqi, na.rm = TRUE), 1),
    Avg_PM2.5 = round(mean(pm2_5, na.rm = TRUE), 1),
    Avg_PM10 = round(mean(pm10, na.rm = TRUE), 1),
    Avg_NO2 = round(mean(no2, na.rm = TRUE), 1),
    Avg_O3 = round(mean(o3, na.rm = TRUE), 1),
    Avg_Temperature = round(mean(temperature, na.rm = TRUE), 1),
    Avg_Humidity = round(mean(humidity, na.rm = TRUE), 1),
    Total_Admissions = sum(hospital_admissions, na.rm = TRUE),
    Avg_Admission_Percent = round(mean(HospitalAdmissionPercent, na.rm = TRUE), 2),
    .groups = 'drop'
  ) %>%
  arrange(desc(Avg_AQI))

DT::datatable(city_summary, 
              caption = "Interactive Summary Table - Air Quality Metrics by City",
              options = list(scrollX = TRUE, pageLength = 10))

Key Findings

# Calculate some key statistics for findings
max_aqi_city <- data_treated %>% 
  group_by(city) %>% 
  summarise(avg_aqi = mean(aqi, na.rm = TRUE), .groups = 'drop') %>% 
  arrange(desc(avg_aqi)) %>% 
  slice(1)

max_admissions_city <- case_counts %>% slice(1)

cat("Key Findings:\n")

## Key Findings:

cat("1. City with highest average AQI:", max_aqi_city$city, "with AQI of", round(max_aqi_city$avg_aqi, 1), "\n")

## 1. City with highest average AQI: Los Angeles with AQI of 253.1

cat("2. City with most hospital admissions:", max_admissions_city$city, "with", max_admissions_city$TotalAdmissions, "admissions\n")

## 2. City with most hospital admissions: Delhi with 211993 admissions

cat("3. Total number of cities analyzed:", length(unique(data_treated$city)), "\n")

## 3. Total number of cities analyzed: 8

cat("4. Total number of records:", nrow(data_treated), "\n")

## 4. Total number of records: 88489

Conclusion

This analysis provides a comprehensive overview of air quality conditions across multiple cities and their relationship with health outcomes. The visualizations reveal patterns in pollution levels across different area types and highlight cities with the most concerning air quality and health metrics.

Analysis completed on 2025-07-03

Air Quality Analysis Dashboard

M Ali Haider

2025-07-03

Air Quality Analysis Dashboard

Load Libraries and Data

Data Overview

Missing Data Analysis

Data Preprocessing

Summary Statistics

City Analysis

Visualizations

Temperature Comparison by Area Type

PM2.5 and PM10 Comparison

AQI Distribution

Hospital Admissions Analysis

Correlation Analysis

Interactive Data Table

Key Findings

Conclusion