Note: This analysis requires the following R
packages: readxl, dplyr, ggplot2,
tidyr, lubridate, and GGally.
Ensure they are installed before running the code.
library(readxl)
library(dplyr)
library(ggplot2)
library(tidyr)
library(lubridate)
library(GGally)
data_aqi <- read_excel(file.choose())
str(data_aqi)
## tibble [18,776 × 9] (S3: tbl_df/tbl/data.frame)
## $ date : POSIXct[1:18776], format: "2020-11-25 01:00:00" "2020-11-25 02:00:00" ...
## $ co : num [1:18776] 2617 3632 4539 4539 4379 ...
## $ no : num [1:18776] 2.18 23.25 52.75 50.96 42.92 ...
## $ no2 : num [1:18776] 70.6 89.1 100.1 111 117.9 ...
## $ o3 : num [1:18776] 13.59 0.33 1.11 6.44 17.17 ...
## $ so2 : num [1:18776] 38.6 54.4 68.7 78.2 87.7 ...
## $ pm2_5: num [1:18776] 365 421 464 455 448 ...
## $ pm10 : num [1:18776] 412 486 542 534 529 ...
## $ nh3 : num [1:18776] 28.6 41 49.1 48.1 46.6 ...
head(data_aqi)
## # A tibble: 6 × 9
## date co no no2 o3 so2 pm2_5 pm10 nh3
## <dttm> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2020-11-25 01:00:00 2617. 2.18 70.6 13.6 38.6 365. 412. 28.6
## 2 2020-11-25 02:00:00 3632. 23.2 89.1 0.33 54.4 421. 486. 41.0
## 3 2020-11-25 03:00:00 4539. 52.8 100. 1.11 68.7 464. 542. 49.1
## 4 2020-11-25 04:00:00 4539. 51.0 111. 6.44 78.2 455. 534 48.1
## 5 2020-11-25 05:00:00 4379. 42.9 118. 17.2 87.7 448. 529. 46.6
## 6 2020-11-25 06:00:00 3899. 28.4 118. 40.0 101. 437. 512. 42.0
## **3. Convert Date Column**
data_aqi <- data_aqi %>%
mutate(date = as.Date(date, format = "%Y-%m-%d"))
colSums(is.na(data_aqi))
## date co no no2 o3 so2 pm2_5 pm10 nh3
## 0 0 0 0 0 0 0 0 0
data_aqi %>%
summarise(across(where(is.numeric), ~ sum(abs(scale(.)) > 3, na.rm = TRUE)))
## # A tibble: 1 × 8
## co no no2 o3 so2 pm2_5 pm10 nh3
## <int> <int> <int> <int> <int> <int> <int> <int>
## 1 432 530 355 297 351 412 398 411
data_aqi %>%
pivot_longer(cols = -date, names_to = "pollutant", values_to = "value") %>%
ggplot(aes(x = pollutant, y = value)) +
geom_boxplot(outlier.color = "red") +
theme_minimal() +
labs(title = "Boxplot of Pollutants with Outliers Highlighted")
data_aqi_long <- data_aqi %>%
pivot_longer(cols = -date, names_to = "Pollutant", values_to = "Value")
ggplot(data_aqi_long, aes(x = Pollutant, y = Value)) +
geom_boxplot(outlier.colour = "red", outlier.shape = 16, outlier.size = 2) +
scale_y_log10() +
theme_minimal() +
labs(title = "Boxplot of Pollutants (Log-Scaled)", y = "Log-Scaled Value", x = "Pollutant") +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
data_aqi %>%
ggplot(aes(x = date, y = pm2_5)) +
geom_line(color = "red") +
geom_smooth(method = "loess", color = "blue") +
geom_vline(xintercept = as.Date(c("2021-11-01", "2022-01-01", "2022-11-01", "2023-01-01")),
linetype = "dashed", color = "black") +
labs(title = "Delhi PM2.5 Trend Over Time with Key Pollution Peaks",
x = "Date", y = "PM2.5 Levels") +
theme_minimal()
These patterns repeat annually, confirming that pollution in Delhi follows a predictable cycle.
data_aqi %>%
mutate(month = lubridate::floor_date(date, "month")) %>%
group_by(month) %>%
summarise(pm2_5_avg = mean(pm2_5, na.rm = TRUE)) %>%
ggplot(aes(x = month, y = pm2_5_avg, fill = pm2_5_avg)) +
geom_tile(width = 20, height = 6) +
scale_fill_gradient(low = "green", high = "red") +
labs(title = "Monthly PM2.5 Levels in Delhi", x = "Month", y = "Average PM2.5") +
theme_minimal() +
theme(plot.title = element_text(hjust = 0.5))
df_top10 <- data_aqi %>%
group_by(date) %>%
summarise(pm2_5 = max(pm2_5)) %>%
arrange(desc(pm2_5)) %>%
head(10)
ggplot(df_top10, aes(x = reorder(date, pm2_5), y = pm2_5, fill = pm2_5)) +
geom_col(color = "black") +
scale_fill_gradient(low = "yellow", high = "red") +
labs(title = "Top 10 PM2.5 Pollution Days in Delhi",
x = "Date", y = "PM2.5 Level") +
geom_text(aes(label = round(pm2_5, 2)), vjust = -0.5, size = 5, fontface = "bold") +
theme_minimal() +
theme(
plot.title = element_text(hjust = 0.5, size = 14, face = "bold"),
axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1, size = 10),
axis.text.y = element_text(size = 10),
legend.position = "right"
)