This report analyzes daily ozone levels across the United States in 2020, focusing on trends, rankings, and outlier detection.
# 1. Load Libraries and Data
# Load necessary libraries
library(readr)
## Warning: package 'readr' was built under R version 4.3.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Import the 2020 daily ozone CSV into a tibble.
# NOTE(review): the absolute path below is machine-specific; confirm the
# location before re-running on another computer.
ozone <- read_csv(
  file = "C:/Users/wrahm/OneDrive/Desktop/ANLC 801/Dataset/daily_44201_2020_Ozone/daily_44201_2020.csv"
)
## Rows: 391923 Columns: 29
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (16): State Code, County Code, Site Num, Datum, Parameter Name, Sample ...
## dbl (10): Parameter Code, POC, Latitude, Longitude, Observation Count, Obse...
## lgl (1): Method Code
## date (2): Date Local, Date of Last Change
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Convert column names to syntactically valid R identifiers
# (e.g. `State Code` becomes State.Code, `1st Max Value` becomes X1st.Max.Value)
ozone <- setNames(ozone, make.names(names(ozone)))
2.1 Dimensions and Structure
# Report the size of the dataset: row count first, then column count
cat("Number of Rows:", dim(ozone)[1L], "\n")
## Number of Rows: 391923
cat("Number of Columns:", dim(ozone)[2L], "\n")
## Number of Columns: 29
# Preview the first six rows of the dataset
head(ozone, n = 6L)
## # A tibble: 6 × 29
## State.Code County.Code Site.Num Parameter.Code POC Latitude Longitude Datum
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 01 003 0010 44201 1 30.5 -87.9 NAD83
## 2 01 003 0010 44201 1 30.5 -87.9 NAD83
## 3 01 003 0010 44201 1 30.5 -87.9 NAD83
## 4 01 003 0010 44201 1 30.5 -87.9 NAD83
## 5 01 003 0010 44201 1 30.5 -87.9 NAD83
## 6 01 003 0010 44201 1 30.5 -87.9 NAD83
## # ℹ 21 more variables: Parameter.Name <chr>, Sample.Duration <chr>,
## # Pollutant.Standard <chr>, Date.Local <date>, Units.of.Measure <chr>,
## # Event.Type <chr>, Observation.Count <dbl>, Observation.Percent <dbl>,
## # Arithmetic.Mean <dbl>, X1st.Max.Value <dbl>, X1st.Max.Hour <dbl>,
## # AQI <dbl>, Method.Code <lgl>, Method.Name <chr>, Local.Site.Name <chr>,
## # Address <chr>, State.Name <chr>, County.Name <chr>, City.Name <chr>,
## # CBSA.Name <chr>, Date.of.Last.Change <date>
# Preview the last six rows of the dataset
tail(ozone, n = 6L)
## # A tibble: 6 × 29
## State.Code County.Code Site.Num Parameter.Code POC Latitude Longitude Datum
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 80 026 8012 44201 1 32.5 -115. WGS84
## 2 80 026 8012 44201 1 32.5 -115. WGS84
## 3 80 026 8012 44201 1 32.5 -115. WGS84
## 4 80 026 8012 44201 1 32.5 -115. WGS84
## 5 80 026 8012 44201 1 32.5 -115. WGS84
## 6 80 026 8012 44201 1 32.5 -115. WGS84
## # ℹ 21 more variables: Parameter.Name <chr>, Sample.Duration <chr>,
## # Pollutant.Standard <chr>, Date.Local <date>, Units.of.Measure <chr>,
## # Event.Type <chr>, Observation.Count <dbl>, Observation.Percent <dbl>,
## # Arithmetic.Mean <dbl>, X1st.Max.Value <dbl>, X1st.Max.Hour <dbl>,
## # AQI <dbl>, Method.Code <lgl>, Method.Name <chr>, Local.Site.Name <chr>,
## # Address <chr>, State.Name <chr>, County.Name <chr>, City.Name <chr>,
## # CBSA.Name <chr>, Date.of.Last.Change <date>
# Inspect each column's type and its first few values
str(object = ozone)
## spc_tbl_ [391,923 × 29] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ State.Code : chr [1:391923] "01" "01" "01" "01" ...
## $ County.Code : chr [1:391923] "003" "003" "003" "003" ...
## $ Site.Num : chr [1:391923] "0010" "0010" "0010" "0010" ...
## $ Parameter.Code : num [1:391923] 44201 44201 44201 44201 44201 ...
## $ POC : num [1:391923] 1 1 1 1 1 1 1 1 1 1 ...
## $ Latitude : num [1:391923] 30.5 30.5 30.5 30.5 30.5 ...
## $ Longitude : num [1:391923] -87.9 -87.9 -87.9 -87.9 -87.9 ...
## $ Datum : chr [1:391923] "NAD83" "NAD83" "NAD83" "NAD83" ...
## $ Parameter.Name : chr [1:391923] "Ozone" "Ozone" "Ozone" "Ozone" ...
## $ Sample.Duration : chr [1:391923] "8-HR RUN AVG BEGIN HOUR" "8-HR RUN AVG BEGIN HOUR" "8-HR RUN AVG BEGIN HOUR" "8-HR RUN AVG BEGIN HOUR" ...
## $ Pollutant.Standard : chr [1:391923] "Ozone 8-hour 2015" "Ozone 8-hour 2015" "Ozone 8-hour 2015" "Ozone 8-hour 2015" ...
## $ Date.Local : Date[1:391923], format: "2020-02-29" "2020-03-01" ...
## $ Units.of.Measure : chr [1:391923] "Parts per million" "Parts per million" "Parts per million" "Parts per million" ...
## $ Event.Type : chr [1:391923] "None" "None" "None" "None" ...
## $ Observation.Count : num [1:391923] 1 17 12 17 17 17 17 17 17 17 ...
## $ Observation.Percent: num [1:391923] 6 100 71 100 100 100 100 100 100 100 ...
## $ Arithmetic.Mean : num [1:391923] 0.005 0.0469 0.0401 0.0341 0.0279 ...
## $ X1st.Max.Value : num [1:391923] 0.005 0.051 0.043 0.042 0.035 0.035 0.041 0.041 0.044 0.04 ...
## $ X1st.Max.Hour : num [1:391923] 23 10 12 7 19 14 9 9 10 10 ...
## $ AQI : num [1:391923] 5 47 40 39 32 32 38 38 41 37 ...
## $ Method.Code : logi [1:391923] NA NA NA NA NA NA ...
## $ Method.Name : chr [1:391923] "-" "-" "-" "-" ...
## $ Local.Site.Name : chr [1:391923] "FAIRHOPE, Alabama" "FAIRHOPE, Alabama" "FAIRHOPE, Alabama" "FAIRHOPE, Alabama" ...
## $ Address : chr [1:391923] "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" "FAIRHOPE HIGH SCHOOL, 1 PIRATE DRIVE, FAIRHOPE, ALABAMA" ...
## $ State.Name : chr [1:391923] "Alabama" "Alabama" "Alabama" "Alabama" ...
## $ County.Name : chr [1:391923] "Baldwin" "Baldwin" "Baldwin" "Baldwin" ...
## $ City.Name : chr [1:391923] "Fairhope" "Fairhope" "Fairhope" "Fairhope" ...
## $ CBSA.Name : chr [1:391923] "Daphne-Fairhope-Foley, AL" "Daphne-Fairhope-Foley, AL" "Daphne-Fairhope-Foley, AL" "Daphne-Fairhope-Foley, AL" ...
## $ Date.of.Last.Change: Date[1:391923], format: "2021-02-25" "2021-02-25" ...
## - attr(*, "spec")=
## .. cols(
## .. `State Code` = col_character(),
## .. `County Code` = col_character(),
## .. `Site Num` = col_character(),
## .. `Parameter Code` = col_double(),
## .. POC = col_double(),
## .. Latitude = col_double(),
## .. Longitude = col_double(),
## .. Datum = col_character(),
## .. `Parameter Name` = col_character(),
## .. `Sample Duration` = col_character(),
## .. `Pollutant Standard` = col_character(),
## .. `Date Local` = col_date(format = ""),
## .. `Units of Measure` = col_character(),
## .. `Event Type` = col_character(),
## .. `Observation Count` = col_double(),
## .. `Observation Percent` = col_double(),
## .. `Arithmetic Mean` = col_double(),
## .. `1st Max Value` = col_double(),
## .. `1st Max Hour` = col_double(),
## .. AQI = col_double(),
## .. `Method Code` = col_logical(),
## .. `Method Name` = col_character(),
## .. `Local Site Name` = col_character(),
## .. Address = col_character(),
## .. `State Name` = col_character(),
## .. `County Name` = col_character(),
## .. `City Name` = col_character(),
## .. `CBSA Name` = col_character(),
## .. `Date of Last Change` = col_date(format = "")
## .. )
## - attr(*, "problems")=<externalptr>
2.2 Missing Values
# Count the missing (NA) entries in every column
vapply(ozone, function(column) sum(is.na(column)), numeric(1))
## State.Code County.Code Site.Num Parameter.Code
## 0 0 0 0
## POC Latitude Longitude Datum
## 0 0 0 0
## Parameter.Name Sample.Duration Pollutant.Standard Date.Local
## 0 0 0 0
## Units.of.Measure Event.Type Observation.Count Observation.Percent
## 0 0 0 0
## Arithmetic.Mean X1st.Max.Value X1st.Max.Hour AQI
## 0 0 0 0
## Method.Code Method.Name Local.Site.Name Address
## 391923 0 19612 0
## State.Name County.Name City.Name CBSA.Name
## 0 0 0 40854
## Date.of.Last.Change
## 0
3.1 Descriptive Statistics
# Five-number summary plus mean of the daily mean ozone reading
summary(ozone[["Arithmetic.Mean"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.00330 0.02323 0.03071 0.03082 0.03806 0.13553
# Deciles (0%, 10%, ..., 100%) of the daily mean ozone reading
quantile(ozone[["Arithmetic.Mean"]], probs = seq(0, 1, by = 0.1), na.rm = TRUE)
## 0% 10% 20% 30% 40% 50% 60% 70%
## -0.003300 0.017000 0.021471 0.024882 0.027882 0.030706 0.033471 0.036412
## 80% 90% 100%
## 0.039824 0.044412 0.135529
4.1 Average Ozone Levels by County
# Rank counties by their mean daily ozone reading, highest first.
# Counties are keyed by (State.Name, County.Name) because a county name alone
# may repeat across states.
# `.groups = "drop"` returns an ungrouped tibble: without it the result stays
# grouped by State.Name (and summarise() prints a message about it), which
# would silently turn later verbs on `ranking` into per-state operations.
ranking <- ozone %>%
  group_by(State.Name, County.Name) %>%
  summarize(
    average_ozone = mean(Arithmetic.Mean, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  arrange(desc(average_ozone))
# Display top 10 counties
head(ranking, 10)
## # A tibble: 10 × 3
## State.Name County.Name average_ozone
## <chr> <chr> <dbl>
## 1 Texas Culberson 0.0503
## 2 Colorado Clear Creek 0.0485
## 3 California Mariposa 0.0468
## 4 Wyoming Albany 0.0467
## 5 Colorado Gilpin 0.0453
## 6 Wyoming Uinta 0.0443
## 7 Nevada White Pine 0.0443
## 8 Colorado Gunnison 0.0438
## 9 Arizona Gila 0.0432
## 10 Utah San Juan 0.0431
5.1 Distribution of Ozone Levels
# Histogram of the daily mean ozone reading, using bins 0.005 wide
ggplot(data = ozone) +
  geom_histogram(
    mapping = aes(x = Arithmetic.Mean),
    binwidth = 0.005,
    colour = "black",
    fill = "blue"
  ) +
  theme_minimal() +
  labs(
    title = "Distribution of Daily Ozone Levels",
    x = "Ozone Level (Arithmetic Mean)",
    y = "Frequency"
  )
5.2 Boxplot of Ozone Levels by State
# One boxplot per state of the daily mean ozone reading; points beyond the
# whiskers are drawn as solid red dots. State labels are rotated 90 degrees
# so they do not overlap along the x axis.
ggplot(data = ozone, mapping = aes(x = State.Name, y = Arithmetic.Mean)) +
  geom_boxplot(
    fill = "skyblue",
    outlier.colour = "red",
    outlier.shape = 16,
    outlier.size = 2
  ) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(
    title = "Ozone Levels by State (2020)",
    x = "State",
    y = "Ozone Level (Arithmetic Mean)"
  )
# Flag rows whose daily mean exceeds the 95th percentile of the whole dataset.
# The cutoff is computed over all rows together, not separately per state.
outliers <- filter(
  ozone,
  Arithmetic.Mean > quantile(Arithmetic.Mean, probs = 0.95, na.rm = TRUE)
)
# Per-state boxplots with every observation jittered in gray and the rows
# held in `outliers` overlaid in red.
# The boxplot's own outlier glyphs are suppressed (outlier.shape = NA) because
# the jitter layer already draws every observation.
ggplot(ozone, aes(x = State.Name, y = Arithmetic.Mean)) +
  # show.legend = FALSE: the fill colour only repeats the x axis, and a legend
  # with one key per state would crowd out the plot panel.
  geom_boxplot(
    aes(fill = State.Name),
    outlier.shape = NA,
    alpha = 0.5,
    show.legend = FALSE
  ) +
  geom_jitter(aes(color = "Data Points"), width = 0.2, alpha = 0.5) +
  # x and y are inherited from the top-level aes(); only the colour key differs.
  geom_point(data = outliers, aes(color = "Outliers"), size = 2, shape = 16) +
  labs(title = "Ozone Levels by State with Highlighted Outliers",
       x = "State",
       y = "Ozone Level (Arithmetic Mean)") +
  scale_color_manual(name = "Legend",
                     values = c("Data Points" = "gray", "Outliers" = "red")) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))
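Distribution of Ozone Levels: Daily mean ozone ranges from -0.0033 to 0.1355 ppm; the median (0.0307 ppm) and mean (0.0308 ppm) are nearly equal, and 80% of readings fall between 0.0170 and 0.0444 ppm.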
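Regional Patterns: All ten counties with the highest average ozone are in western and southwestern states (Texas, Colorado, California, Wyoming, Nevada, Arizona, and Utah).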
Monthly Trends:
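Outliers: An observation is flagged as an outlier when its daily mean exceeds the 95th percentile of all readings in the dataset, so roughly 5% of rows are flagged by construction.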
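Ranking of Counties: Culberson County, Texas has the highest average daily ozone (0.0503 ppm), followed by Clear Creek County, Colorado (0.0485 ppm) and Mariposa County, California (0.0468 ppm).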
Next Steps: