Dataset 3: New York City Air Quality Data

Analysis: Compare air pollutant levels by location and by year.

Conclusion:

I can observe here that the pollution levels for Nitrogen dioxide (NO2) and Fine particles (PM 2.5) have decreased over the years.

The Midtown (CD5) location has the highest pollution levels for Nitrogen dioxide (NO2) and Fine particles (PM 2.5) in 2021.

library(dplyr,warn.conflicts = FALSE)
library(tidyr,warn.conflicts = FALSE)
library(readr,warn.conflicts = FALSE)
library(ggplot2)

# Load the NYC air quality export; the file is read with latin1 encoding.
pollutant_levels <- read.csv(
  file = "https://raw.githubusercontent.com/datanerddhanya/DATA607/main/Air_Quality_20240302.csv",
  fileEncoding = "latin1"
)

# Restrict the analysis to rows whose Time.Period label contains "Annual".
pollutant_levels_summary <- subset(
  pollutant_levels,
  grepl("Annual", Time.Period)
)

# Clean up Time.Period: strip the "Annual Average" prefix and any surrounding
# whitespace, then convert the remaining year to an integer so it sorts and
# compares as a number (e.g. "Annual Average 2021" becomes 2021L).
# A label that is not a plain year would become NA with a coercion warning.
pollutant_levels_summary$Time.Period <- as.integer(
  trimws(
    gsub(
      "Annual Average", "",
      pollutant_levels_summary$Time.Period,
      fixed = TRUE
    )
  )
)


# Summarise to pollutant and year: mean Data.Value per (Name, Time.Period).
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# silently applied per group and summarise() emits no grouping message.
pollutant_levels_name <- pollutant_levels_summary |>
  group_by(Name, Time.Period) |>
  summarise(mean = mean(Data.Value), .groups = "drop")
# Summarise to location and year: mean Data.Value per
# (Geo.Place.Name, Time.Period), averaged over all pollutants in the data.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# silently applied per group and summarise() emits no grouping message.
pollutant_levels_location <- pollutant_levels_summary |>
  group_by(Geo.Place.Name, Time.Period) |>
  summarise(mean = mean(Data.Value), .groups = "drop")
# Summarise to pollutant, location and year: mean Data.Value per
# (Name, Geo.Place.Name, Time.Period).
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# silently applied per group and summarise() emits no grouping message.
pollutant_levels_name_location <- pollutant_levels_summary |>
  group_by(Name, Geo.Place.Name, Time.Period) |>
  summarise(mean = mean(Data.Value), .groups = "drop")
# Visualise the yearly mean of each pollutant.
# Observation: the pollution levels for Nitrogen dioxide (NO2) and
# Fine particles (PM 2.5) have decreased over the years.
#
# `fill` (not `colour`, which only outlines the bars) distinguishes the
# pollutants, and the bars are dodged rather than stacked so the means of
# different pollutants are not added together. The year is trimmed and turned
# into a factor so each year gets its own discrete position on the x axis,
# whether Time.Period is stored as a number or as a string.
ggplot(
  data = pollutant_levels_name,
  aes(
    x = factor(trimws(as.character(Time.Period))),
    y = mean,
    fill = Name
  )
) +
  geom_col(position = "dodge") +
  labs(x = "Year", y = "Mean annual average", fill = "Pollutant")

# Which location has the highest 2021 levels of Nitrogen dioxide (NO2) and
# Fine particles (PM 2.5)?

# Keep only the 2021 rows. The year is compared exactly after trimming, which
# works whether Time.Period is stored as a number or as a (possibly
# space-padded) string, and cannot match a longer label that merely contains
# "2021". ungroup() makes the ranking independent of any grouping left on the
# summary table.
pollutant_levels_2021 <- pollutant_levels_name_location |>
  ungroup() |>
  filter(trimws(as.character(Time.Period)) == "2021")

# Overall ranking across all pollutants, highest mean first.
pollutant_levels_2021 |>
  arrange(desc(mean))
## # A tibble: 228 × 4
##    Name                   Geo.Place.Name                       Time.Period  mean
##    <chr>                  <chr>                                <chr>       <dbl>
##  1 Nitrogen dioxide (NO2) Midtown (CD5)                        " 2021"      25.2
##  2 Nitrogen dioxide (NO2) Gramercy Park - Murray Hill          " 2021"      23.5
##  3 Nitrogen dioxide (NO2) Chelsea - Clinton                    " 2021"      23.3
##  4 Nitrogen dioxide (NO2) Chelsea-Village                      " 2021"      22.6
##  5 Nitrogen dioxide (NO2) Clinton and Chelsea (CD4)            " 2021"      22.3
##  6 Nitrogen dioxide (NO2) Stuyvesant Town and Turtle Bay (CD6) " 2021"      22.1
##  7 Nitrogen dioxide (NO2) Upper East Side-Gramercy             " 2021"      21.8
##  8 Nitrogen dioxide (NO2) Financial District (CD1)             " 2021"      21.6
##  9 Nitrogen dioxide (NO2) Lower Manhattan                      " 2021"      21.5
## 10 Nitrogen dioxide (NO2) Greenwich Village - SoHo             " 2021"      21.3
## # ℹ 218 more rows

# The first rows of the overall ranking are all NO2, so that table alone does
# not show which location leads for PM 2.5. Rank within each pollutant and
# keep the top location of each (ties, if any, are all returned).
pollutant_levels_2021 |>
  group_by(Name) |>
  slice_max(mean, n = 1) |>
  ungroup()

# Midtown (CD5) has the highest pollution levels for Nitrogen dioxide (NO2) and
# Fine particles (PM 2.5) for 2021.
# NOTE(review): the recorded output above only confirms this for NO2; verify
# the PM 2.5 part against the per-pollutant ranking.