Data Analysis Case Study: Changes in Fine Particle Air Pollution in the U.S.

Synopsis

In this chapter we aim to describe the changes in fine particle (PM2.5) outdoor air pollution in the United States between the years 1999 and 2012. Our overall hypothesis is that out door PM2.5 has decreased on average across the U.S. due to nationwide regulatory requirements arising from the Clean Air Act. To investigate this hypothesis, we obtained PM2.5 data from the U.S. Environmental Protection Agency which is collected from monitors sited across the U.S. We specifically obtained data for the years 1999 and 2012 (the most recent complete year available). From these data, we found that, on average across the U.S., levels of PM2.5 have decreased between 1999 and 2012. At one individual monitor, we found that levels have decreased and that the variability of PM2.5 has decreased. Most individual states also experienced decreases in PM2.5, although some states saw increases.

Loading and Processing the Raw Data

Reading in the 1999 data

# Read in the Table from the TXT file:
pm0 <- read.table("RD_501_88101_1999-0.txt", comment.char = "#", header = FALSE, sep = "|", na.strings = "")

# Spot Check Dimmensions and head of DF
dim(pm0)
## [1] 117421     28
head(pm0[, 1:13])
##   V1 V2 V3 V4 V5    V6 V7 V8  V9 V10      V11   V12    V13
## 1 RD  I  1 27  1 88101  1  7 105 120 19990103 00:00     NA
## 2 RD  I  1 27  1 88101  1  7 105 120 19990106 00:00     NA
## 3 RD  I  1 27  1 88101  1  7 105 120 19990109 00:00     NA
## 4 RD  I  1 27  1 88101  1  7 105 120 19990112 00:00  8.841
## 5 RD  I  1 27  1 88101  1  7 105 120 19990115 00:00 14.920
## 6 RD  I  1 27  1 88101  1  7 105 120 19990118 00:00  3.878

We then attach the column headers to the dataset and make sure that they are properly formated for R data frames.

# Attach Column Headers:
cnames <- readLines("RD_501_88101_1999-0.txt", 1)
cnames <- strsplit(cnames, "|", fixed = TRUE)


## Ensure names are properly formatted
names(pm0) <- make.names(cnames[[1]])
head(pm0[, 1:13])
##   X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1    RD           I          1          27       1     88101   1
## 2    RD           I          1          27       1     88101   1
## 3    RD           I          1          27       1     88101   1
## 4    RD           I          1          27       1     88101   1
## 5    RD           I          1          27       1     88101   1
## 6    RD           I          1          27       1     88101   1
##   Sample.Duration Unit Method     Date Start.Time Sample.Value
## 1               7  105    120 19990103      00:00           NA
## 2               7  105    120 19990106      00:00           NA
## 3               7  105    120 19990109      00:00           NA
## 4               7  105    120 19990112      00:00        8.841
## 5               7  105    120 19990115      00:00       14.920
## 6               7  105    120 19990118      00:00        3.878
x0 <- pm0$Sample.Value
summary(x0)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    7.20   11.50   13.74   17.90  157.10   13217
## Are missing values important here?
mean(is.na(x0))
## [1] 0.1125608

Reading in the 2012 data

pm1 <- read.table("RD_501_88101_2012-0.txt",
 comment.char = "#",
 header = FALSE, sep = "|",
 na.strings = "",
 nrow = 1304290)
# Attach Column Headers:
cnames1 <- readLines("RD_501_88101_2012-0.txt", 1)
cnames1 <- strsplit(cnames1, "|", fixed = TRUE)


## Ensure names are properly formatted
names(pm1) <- make.names(cnames1[[1]])
head(pm1[, 1:13])
##   X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## 1    RD           I          1           3      10     88101   1
## 2    RD           I          1           3      10     88101   1
## 3    RD           I          1           3      10     88101   1
## 4    RD           I          1           3      10     88101   1
## 5    RD           I          1           3      10     88101   1
## 6    RD           I          1           3      10     88101   1
##   Sample.Duration Unit Method     Date Start.Time Sample.Value
## 1               7  105    118 20120101      00:00          6.7
## 2               7  105    118 20120104      00:00          9.0
## 3               7  105    118 20120107      00:00          6.5
## 4               7  105    118 20120110      00:00          7.0
## 5               7  105    118 20120113      00:00          5.8
## 6               7  105    118 20120116      00:00          8.0
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
pm <- rbind(pm0, pm1)
pm <- mutate(pm, year = factor(rep(c(1999, 2012), c(nrow(pm0), nrow(pm1))))) %>%
 rename(PM = Sample.Value)

Results

Entire U.S. analysis

library(ggplot2)


## Take a random sample because it's faster
set.seed(2015)

# Get the Index and plot the boxplots
idx <- sample(nrow(pm), 1000)
qplot(year, log2(PM), data = pm[idx, ], geom = "boxplot")
## Warning: NaNs produced

## Warning: NaNs produced
## Warning: Removed 102 rows containing non-finite values (stat_boxplot).

with(pm, tapply(PM, year, summary))
## $`1999`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    7.20   11.50   13.74   17.90  157.10   13217 
## 
## $`2012`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -10.00    4.00    7.63    9.14   12.00  909.00   73133
filter(pm, year == "2012") %>% summarize(negative = mean(PM < 0, na.rm = TRUE))
##    negative
## 1 0.0215034
dates <- filter(pm, year == "2012")$Date
dates <- as.Date(as.character(dates), "%Y%m%d")
missing.months <- month.name[as.POSIXlt(dates)$mon + 1]
tab <- table(factor(missing.months, levels = month.name))
round(100 * tab / sum(tab))
## 
##   January  February     March     April       May      June      July 
##        15        13        15        13        14        13         8 
##    August September   October  November  December 
##         6         3         0         0         0

Changes in PM levels at an individual monitor

sites <- filter(pm, State.Code == 36) %>% select(County.Code, Site.ID, year) %>% unique
sites <- mutate(sites, site.code = paste(County.Code, Site.ID, sep = "."))

str(sites)
## 'data.frame':    51 obs. of  4 variables:
##  $ County.Code: int  1 1 5 5 5 5 13 27 29 29 ...
##  $ Site.ID    : int  5 12 73 80 83 110 11 1004 2 5 ...
##  $ year       : Factor w/ 2 levels "1999","2012": 1 1 1 1 1 1 1 1 1 1 ...
##  $ site.code  : chr  "1.5" "1.12" "5.73" "5.80" ...
site.year <- with(sites, split(site.code, year))
both <- intersect(site.year[[1]], site.year[[2]])
print(both)
##  [1] "1.5"     "1.12"    "5.80"    "13.11"   "29.5"    "31.3"    "63.2008"
##  [8] "67.1015" "85.55"   "101.3"
count <- mutate(pm, site.code = paste(County.Code, Site.ID, sep = ".")) %>%
 filter(site.code %in% both)
group_by(count, site.code) %>% summarize(n = n())
## Source: local data frame [10 x 2]
## 
##    site.code     n
##        (chr) (int)
## 1       1.12    92
## 2        1.5   186
## 3      101.3   527
## 4      13.11   213
## 5       29.5    94
## 6       31.3   198
## 7       5.80    92
## 8    63.2008   152
## 9    67.1015   153
## 10     85.55    38
pmsub <- filter(pm, State.Code == 36 & County.Code == 63 & Site.ID == 2008)

pmsub <- mutate(pmsub, date = as.Date(as.character(Date), "%Y%m%d"))
rng <- range(pmsub$PM, na.rm = TRUE)

par(mfrow = c(1, 2), mar = c(4, 5, 2, 1))

with(filter(pmsub, year == "1999"), {
 plot(date, PM, ylim = rng)
 abline(h = median(PM, na.rm = TRUE))
 })

 with(filter(pmsub, year == "2012"), {
 plot(date, PM, ylim = rng)
 abline(h = median(PM, na.rm = TRUE))
 })

Changes in state-wide PM levels

mn <- group_by(pm, year, State.Code) %>% summarize(PM = mean(PM, na.rm = TRUE))
head(mn)
## Source: local data frame [6 x 3]
## Groups: year [1]
## 
##     year State.Code        PM
##   (fctr)      (int)     (dbl)
## 1   1999          1 19.956391
## 2   1999          2  6.665929
## 3   1999          4 10.795547
## 4   1999          5 15.676067
## 5   1999          6 17.655412
## 6   1999          8  7.533304
tail(mn)
## Source: local data frame [6 x 3]
## Groups: year [1]
## 
##     year State.Code       PM
##   (fctr)      (int)    (dbl)
## 1   2012         51 8.708080
## 2   2012         53 6.364667
## 3   2012         54 9.821294
## 4   2012         55 7.914545
## 5   2012         56 4.005564
## 6   2012         72 6.048045
qplot(xyear, PM, data = mutate(mn, xyear = as.numeric(as.character(year))),
 color = factor(State.Code),
 geom = c("point", "line"))