Get the data. The raw files are pipe-delimited, with comment lines (beginning with "#") that list the column names for each record type. The first few lines of the 1999 file look like this:

# RD|Action Code|State Code|County Code|Site ID|Parameter|POC|Sample Duration|Unit|Method|Date|Start Time|Sample Value|Null Data Code|Sampling Frequency|Monitor Protocol (MP) ID|Qualifier - 1|Qualifier - 2|Qualifier - 3|Qualifier - 4|Qualifier - 5|Qualifier - 6|Qualifier - 7|Qualifier - 8|Qualifier - 9|Qualifier - 10|Alternate Method Detectable Limit|Uncertainty
# RC|Action Code|State Code|County Code|Site ID|Parameter|POC|Unit|Method|Year|Period|Number of Samples|Composite Type|Sample Value|Monitor Protocol (MP) ID|Qualifier - 1|Qualifier - 2|Qualifier - 3|Qualifier - 4|Qualifier - 5|Qualifier - 6|Qualifier - 7|Qualifier - 8|Qualifier - 9|Qualifier - 10|Alternate Method Detectable Limit|Uncertainty
RD|I|01|027|0001|88101|1|7|105|120|19990103|00:00||AS|3|||||||||||||
RD|I|01|027|0001|88101|1|7|105|120|19990106|00:00||AS|3|||||||||||||
RD|I|01|027|0001|88101|1|7|105|120|19990109|00:00||AS|3|||||||||||||
RD|I|01|027|0001|88101|1|7|105|120|19990112|00:00|8.841||3|||||||||||||
RD|I|01|027|0001|88101|1|7|105|120|19990115|00:00|14.92||3|||||||||||||
RD|I|01|027|0001|88101|1|7|105|120|19990118|00:00|3.878||3|||||||||||||

Read 1999 data:

library(readr)

pm0 <- read_delim("Data/RD_501_88101_1999-0.txt", 
                  delim = "|",
                  comment = "#", 
                  col_names = FALSE, 
                  na = "")
## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character(),
##   X4 = col_character(),
##   X5 = col_character(),
##   X6 = col_double(),
##   X7 = col_double(),
##   X8 = col_double(),
##   X9 = col_double(),
##   X10 = col_double(),
##   X11 = col_double(),
##   X12 = col_time(format = ""),
##   X13 = col_double(),
##   X14 = col_character(),
##   X15 = col_double(),
##   X17 = col_character()
## )
## See spec(...) for full column specifications.
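## As the message says, spec() prints the full column specification guessed
## by readr, including the qualifier columns that defaulted to col_logical()
## (a quick check; output not shown)
spec(pm0)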
dim(pm0)
## [1] 117421     28
head(pm0[, 1:13])
## # A tibble: 6 x 13
##   X1    X2    X3    X4    X5       X6    X7    X8    X9   X10    X11 X12   
##   <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl> <time>
## 1 RD    I     01    027   0001  88101     1     7   105   120 2.00e7 00'00"
## 2 RD    I     01    027   0001  88101     1     7   105   120 2.00e7 00'00"
## 3 RD    I     01    027   0001  88101     1     7   105   120 2.00e7 00'00"
## 4 RD    I     01    027   0001  88101     1     7   105   120 2.00e7 00'00"
## 5 RD    I     01    027   0001  88101     1     7   105   120 2.00e7 00'00"
## 6 RD    I     01    027   0001  88101     1     7   105   120 2.00e7 00'00"
## # ... with 1 more variable: X13 <dbl>
## Read the header line and build valid column names from it
cnames <- readLines("Data/RD_501_88101_1999-0.txt", 1)
cnames <- strsplit(cnames, "|", fixed = TRUE)
names(pm0) <- make.names(cnames[[1]])  ## e.g. "Site ID" becomes "Site.ID"
head(pm0[, 1:13])
## # A tibble: 6 x 13
##   X..RD Action.Code State.Code County.Code Site.ID Parameter   POC
##   <chr> <chr>       <chr>      <chr>       <chr>       <dbl> <dbl>
## 1 RD    I           01         027         0001        88101     1
## 2 RD    I           01         027         0001        88101     1
## 3 RD    I           01         027         0001        88101     1
## 4 RD    I           01         027         0001        88101     1
## 5 RD    I           01         027         0001        88101     1
## 6 RD    I           01         027         0001        88101     1
## # ... with 6 more variables: Sample.Duration <dbl>, Unit <dbl>, Method <dbl>,
## #   Date <dbl>, Start.Time <time>, Sample.Value <dbl>

The column we are interested in is Sample.Value, which contains the PM2.5 measurements. Here we extract that column and print a brief summary.

x0 <- pm0$Sample.Value

summary(x0)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    7.20   11.50   13.74   17.90  157.10   13217


## Are missing values important here?
mean(is.na(x0))
## [1] 0.1125608
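
About 11% of the values are missing. One way to probe whether that matters is to tabulate the Null.Data.Code column, which in the RD record format should record why a sample value is absent. A minimal sketch, assuming the column parsed as character (X14 in the spec above):

## Tabulate the documented reasons for missing sample values (sketch)
table(pm0$Null.Data.Code[is.na(x0)], useNA = "ifany")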

Read 2012 data:

pm1 <- read_delim("Data/RD_501_88101_2012-0.txt", 
                   comment = "#", 
                   col_names = FALSE, 
                   delim = "|", 
                   na = "")
## Parsed with column specification:
## cols(
##   .default = col_logical(),
##   X1 = col_character(),
##   X2 = col_character(),
##   X3 = col_character(),
##   X4 = col_character(),
##   X5 = col_character(),
##   X6 = col_double(),
##   X7 = col_double(),
##   X8 = col_double(),
##   X9 = col_double(),
##   X10 = col_double(),
##   X11 = col_double(),
##   X12 = col_time(format = ""),
##   X13 = col_double(),
##   X14 = col_character(),
##   X15 = col_double(),
##   X17 = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 20271 parsing failures.
##    row col           expected actual                           file
##   1082 X18 1/0/T/F/TRUE/FALSE      5 'Data/RD_501_88101_2012-0.txt'
##   1091 X18 1/0/T/F/TRUE/FALSE      5 'Data/RD_501_88101_2012-0.txt'
## 353841 X27 1/0/T/F/TRUE/FALSE      2 'Data/RD_501_88101_2012-0.txt'
## 353842 X27 1/0/T/F/TRUE/FALSE      2 'Data/RD_501_88101_2012-0.txt'
## 353843 X27 1/0/T/F/TRUE/FALSE      2 'Data/RD_501_88101_2012-0.txt'
## ...... ... .................. ...... ..............................
## See problems(...) for more details.
names(pm1) <- make.names(cnames[[1]])
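
The warnings above come from qualifier columns that readr guessed as logical but that occasionally contain codes. readr's problems() function returns the parsing failures as a data frame, which makes them easy to inspect; a minimal sketch:

## Inspect the parsing failures flagged by read_delim (sketch)
probs <- problems(pm1)
head(probs)
table(probs$col)  ## which columns account for the failures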

Since we will be comparing the two years of data, it makes sense to combine them into a single data frame.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
pm <- rbind(pm0, pm1)

Create a factor variable indicating which year the data come from, and rename the Sample.Value variable to the more sensible PM.

pm <- mutate(pm, year = factor(rep(c(1999, 2012), c(nrow(pm0), nrow(pm1))))) %>% 
        rename(PM = Sample.Value)


In order to show aggregate changes in PM across the entire monitoring network, we can make boxplots of all monitor values in 1999 and 2012. Here, we take the log (base 2) of the PM values to adjust for the skew in the data.

library(ggplot2)

## Take a random sample because it's faster
set.seed(2015)
idx <- sample(nrow(pm), 1000)
qplot(year, log2(PM), data = pm[idx, ], geom = "boxplot")
## Warning in FUN(X[[i]], ...): NaNs produced

## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 89 rows containing non-finite values (stat_boxplot).

The warnings about NaNs come from taking log2 of the negative PM values in the 2012 data (more on those below). Make some summaries of the two years’ worth of data to get at the actual numbers:

with(pm, tapply(PM, year, summary))
## $`1999`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    0.00    7.20   11.50   13.74   17.90  157.10   13217 
## 
## $`2012`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -10.00    4.00    7.63    9.14   12.00  908.97   73133

The minimum value in 2012 is negative, which is odd since PM concentrations cannot be negative. Check whether negative values occur more often in some parts of the year than in others:

library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
negative <- filter(pm, year == "2012") %>% 
        mutate(negative = PM < 0, date = ymd(Date)) %>%
        select(date, negative)


Extract the month from each of the dates with negative values and attempt to identify when negative values occur most often.

filter(pm, year == "2012") %>% summarize(negative = mean(PM < 0, na.rm = TRUE))
## # A tibble: 1 x 1
##   negative
##      <dbl>
## 1   0.0215
mutate(negative, month = factor(month.name[month(date)], levels = month.name)) %>%
         group_by(month) %>%
         summarize(pct.negative = mean(negative, na.rm = TRUE) * 100)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 10 x 2
##    month     pct.negative
##    <fct>            <dbl>
##  1 January          2.97 
##  2 February         2.43 
##  3 March            2.41 
##  4 April            2.18 
##  5 May              1.73 
##  6 June             2.56 
##  7 July             0.792
##  8 August           1.06 
##  9 September        1.52 
## 10 October          0

The bulk of the negative values seem to occur in the first four months of the year (January–April). One hypothesis is that PM levels tend to be lower in the colder months, so measurements near the instrument’s detection limit, and hence negative recorded values, are more likely then.

Next we look at a single monitor in New York State to see whether PM levels at that monitor decreased from 1999 to 2012.

Subset the data frame to include only data from New York (State.Code == 36), keeping just the County.Code, Site.ID (i.e. monitor number), and year variables.

sites <- filter(pm, State.Code == "36") %>% select(County.Code, Site.ID, year) %>% unique

sites <- mutate(sites, site.code = paste(County.Code, Site.ID, sep = "."))
str(sites)
## tibble [51 x 4] (S3: tbl_df/tbl/data.frame)
##  $ County.Code: chr [1:51] "001" "001" "005" "005" ...
##  $ Site.ID    : chr [1:51] "0005" "0012" "0073" "0080" ...
##  $ year       : Factor w/ 2 levels "1999","2012": 1 1 1 1 1 1 1 1 1 1 ...
##  $ site.code  : chr [1:51] "001.0005" "001.0012" "005.0073" "005.0080" ...

Finally, we want the intersection between the sites present in 1999 and 2012 so that we might choose a monitor that has data in both periods.

site.year <- with(sites, split(site.code, year))
both <- intersect(site.year[[1]], site.year[[2]])
print(both)
##  [1] "001.0005" "001.0012" "005.0080" "013.0011" "029.0005" "031.0003"
##  [7] "063.2008" "067.1015" "085.0055" "101.0003"

Choose a monitor that had a reasonable amount of data in each year:

count <- mutate(pm, site.code = paste(County.Code, Site.ID, sep = ".")) %>% 
         filter(site.code %in% both)
group_by(count, site.code) %>% summarize(n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 10 x 2
##    site.code     n
##    <chr>     <int>
##  1 001.0005    186
##  2 001.0012     92
##  3 005.0080     92
##  4 013.0011    213
##  5 029.0005     94
##  6 031.0003    198
##  7 063.2008    152
##  8 067.1015    153
##  9 085.0055     38
## 10 101.0003    527
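
These are totals across both years. Before settling on a site, a sketch breaking the counts down by year can confirm coverage in both periods:

## Count observations per site and year to check coverage in both periods
group_by(count, site.code, year) %>% summarize(n = n())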

Let’s focus on county code 063 and site ID 2008.

pmsub <- filter(pm, State.Code == "36" & County.Code == "063" & Site.ID == "2008") %>%
         select(Date, year, PM) %>%
         mutate(Date = ymd(Date), yday = yday(Date))

qplot(yday, PM, data = pmsub, facets = . ~ year, xlab = "Day of the year")
## Warning: Removed 35 rows containing missing values (geom_point).

The variation (spread) in the PM values in 2012 is much smaller than it was in 1999. This suggests that not only are median levels of PM lower in 2012, but that there are fewer large spikes from day to day.
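
We can back up this visual impression with numbers. A minimal sketch comparing the median and interquartile range of PM by year at this monitor:

## Compare center and spread of PM by year for this monitor (sketch)
group_by(pmsub, year) %>%
        summarize(median = median(PM, na.rm = TRUE),
                  IQR = IQR(PM, na.rm = TRUE))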

Changes in state-wide PM levels:

This analysis falls somewhere in between looking at the entire country all at once and looking at an individual monitor.

mn <- group_by(pm, year, State.Code) %>% summarize(PM = mean(PM, na.rm = TRUE))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
head(mn)
## # A tibble: 6 x 3
## # Groups:   year [1]
##   year  State.Code    PM
##   <fct> <chr>      <dbl>
## 1 1999  01         20.0 
## 2 1999  02          6.67
## 3 1999  04         10.8 
## 4 1999  05         15.7 
## 5 1999  06         17.7 
## 6 1999  08          7.53
qplot(xyear, PM, data = mutate(mn, xyear = as.numeric(as.character(year))), 
       color = factor(State.Code), 
       geom = c("point", "line"))

This plot still needs a bit of work, and it’s hard to read by eye, but it looks like many states decreased their average PM levels from 1999 to 2012, while a few states actually increased their levels.
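
One possible cleanup, sketched below: call ggplot() directly, draw one semi-transparent line per state, and drop the legend, which makes the overall direction of change easier to see.

## Sketch: one semi-transparent line per state, no legend
mutate(mn, xyear = as.numeric(as.character(year))) %>%
        ggplot(aes(xyear, PM, group = State.Code)) +
        geom_line(alpha = 0.5) +
        geom_point() +
        scale_x_continuous(breaks = c(1999, 2012)) +
        labs(x = "Year", y = "Average PM")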