get data # RD|Action Code|State Code|County Code|Site ID|Parameter|POC|Sample Duration|Unit|Method|Date|Start Time|Sample Value|Null Data Code|Sampling Frequency|Monitor Protocol (MP) ID|Qualifier - 1|Qualifier - 2|Qualifier - 3|Qualifier - 4|Qualifier - 5|Qualifier - 6|Qualifier - 7|Qualifier - 8|Qualifier - 9|Qualifier - 10|Alternate Method Detectable Limit|Uncertainty # RC|Action Code|State Code|County Code|Site ID|Parameter|POC|Unit|Method|Year|Period|Number of Samples|Composite Type|Sample Value|Monitor Protocol (MP) ID|Qualifier - 1|Qualifier - 2|Qualifier - 3|Qualifier - 4|Qualifier - 5|Qualifier - 6|Qualifier - 7|Qualifier - 8|Qualifier - 9|Qualifier - 10|Alternate Method Detectable Limit|Uncertainty RD|I|01|027|0001|88101|1|7|105|120|19990103|00:00||AS|3||||||||||||| RD|I|01|027|0001|88101|1|7|105|120|19990106|00:00||AS|3||||||||||||| RD|I|01|027|0001|88101|1|7|105|120|19990109|00:00||AS|3||||||||||||| RD|I|01|027|0001|88101|1|7|105|120|19990112|00:00|8.841||3||||||||||||| RD|I|01|027|0001|88101|1|7|105|120|19990115|00:00|14.92||3||||||||||||| RD|I|01|027|0001|88101|1|7|105|120|19990118|00:00|3.878||3|||||||||||||
library(readr)
pm0 <- read_delim("data/RD_501_88101_1999-0.txt",
delim = "|",
comment = "#",
col_names = FALSE,
na = "")
## Parsed with column specification:
## cols(
## .default = col_logical(),
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_double(),
## X7 = col_double(),
## X8 = col_double(),
## X9 = col_double(),
## X10 = col_double(),
## X11 = col_double(),
## X12 = col_time(format = ""),
## X13 = col_double(),
## X14 = col_character(),
## X15 = col_double(),
## X17 = col_character()
## )
## See spec(...) for full column specifications.
dim(pm0)
## [1] 117421 28
head(pm0[, 1:13])
## # A tibble: 6 x 13
## X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <time>
## 1 RD I 01 027 0001 88101 1 7 105 120 2.00e7 00'00"
## 2 RD I 01 027 0001 88101 1 7 105 120 2.00e7 00'00"
## 3 RD I 01 027 0001 88101 1 7 105 120 2.00e7 00'00"
## 4 RD I 01 027 0001 88101 1 7 105 120 2.00e7 00'00"
## 5 RD I 01 027 0001 88101 1 7 105 120 2.00e7 00'00"
## 6 RD I 01 027 0001 88101 1 7 105 120 2.00e7 00'00"
## # ... with 1 more variable: X13 <dbl>
cnames <- readLines("Data/RD_501_88101_1999-0.txt", 1)
cnames <- strsplit(cnames, "|", fixed = TRUE)
names(pm0) <- make.names(cnames[[1]])
head(pm0[, 1:13])
## # A tibble: 6 x 13
## X..RD Action.Code State.Code County.Code Site.ID Parameter POC
## <chr> <chr> <chr> <chr> <chr> <dbl> <dbl>
## 1 RD I 01 027 0001 88101 1
## 2 RD I 01 027 0001 88101 1
## 3 RD I 01 027 0001 88101 1
## 4 RD I 01 027 0001 88101 1
## 5 RD I 01 027 0001 88101 1
## 6 RD I 01 027 0001 88101 1
## # ... with 6 more variables: Sample.Duration <dbl>, Unit <dbl>, Method <dbl>,
## # Date <dbl>, Start.Time <time>, Sample.Value <dbl>
x0 <- pm0$Sample.Value
summary(x0)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 7.20 11.50 13.74 17.90 157.10 13217
## Are missing values important here?
mean(is.na(x0))
## [1] 0.1125608
pm1 <- read_delim("Data/RD_501_88101_2012-0.txt",
comment = "#",
col_names = FALSE,
delim = "|",
na = "")
## Parsed with column specification:
## cols(
## .default = col_logical(),
## X1 = col_character(),
## X2 = col_character(),
## X3 = col_character(),
## X4 = col_character(),
## X5 = col_character(),
## X6 = col_double(),
## X7 = col_double(),
## X8 = col_double(),
## X9 = col_double(),
## X10 = col_double(),
## X11 = col_double(),
## X12 = col_time(format = ""),
## X13 = col_double(),
## X14 = col_character(),
## X15 = col_double(),
## X17 = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 20271 parsing failures.
## row col expected actual file
## 1082 X18 1/0/T/F/TRUE/FALSE 5 'Data/RD_501_88101_2012-0.txt'
## 1091 X18 1/0/T/F/TRUE/FALSE 5 'Data/RD_501_88101_2012-0.txt'
## 353841 X27 1/0/T/F/TRUE/FALSE 2 'Data/RD_501_88101_2012-0.txt'
## 353842 X27 1/0/T/F/TRUE/FALSE 2 'Data/RD_501_88101_2012-0.txt'
## 353843 X27 1/0/T/F/TRUE/FALSE 2 'Data/RD_501_88101_2012-0.txt'
## ...... ... .................. ...... ..............................
## See problems(...) for more details.
names(pm1) <- make.names(cnames[[1]])
Since we will be comparing the two years of data, it makes sense to combine them into a ## single data frame
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
pm <- rbind(pm0, pm1)
pm <- mutate(pm, year = factor(rep(c(1999, 2012), c(nrow(pm0), nrow(pm1))))) %>% rename(PM = Sample.Value)
#Take a random sample:
In order to show aggregate changes in PM across the entire monitoring network, we can make boxplots of all monitor values in 1999 and 2012. Here, we take the log of the PM values to adjust for the skew in the data.
library(ggplot2)
## Take a random sample because it's faster
set.seed(2015)
idx <- sample(nrow(pm), 1000)
qplot(year, log2(PM), data = pm[idx, ], geom = "boxplot")
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning in FUN(X[[i]], ...): NaNs produced
## Warning: Removed 89 rows containing non-finite values (stat_boxplot).
with(pm, tapply(PM, year, summary))
## $`1999`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.00 7.20 11.50 13.74 17.90 157.10 13217
##
## $`2012`
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -10.00 4.00 7.63 9.14 12.00 908.97 73133
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
negative <- filter(pm, year == "2012") %>%
mutate(negative = PM < 0, date = ymd(Date)) %>%
select(date, negative)
Cluster 3 is a mix of laying, sitting and standing, the rest are for single factors, repeating the clustering should give similar results.
filter(pm, year == "2012") %>% summarize(negative = mean(PM < 0, na.rm = TRUE))
## # A tibble: 1 x 1
## negative
## <dbl>
## 1 0.0215
mutate(negative, month = factor(month.name[month(date)], levels = month.name)) %>%
group_by(month) %>%
summarize(pct.negative = mean(negative, na.rm = TRUE) * 100)
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 10 x 2
## month pct.negative
## <fct> <dbl>
## 1 January 2.97
## 2 February 2.43
## 3 March 2.41
## 4 April 2.18
## 5 May 1.73
## 6 June 2.56
## 7 July 0.792
## 8 August 1.06
## 9 September 1.52
## 10 October 0
The bulk of the negative values seem to occur in the first four months of the year (January–April)
Subset the data frames to only include data from New York (State.Code == 36) and only include the County.Code and the Site.ID (i.e. monitor number) variables.
sites <- filter(pm, State.Code == 36) %>% select(County.Code, Site.ID, year) %>% unique
sites <- mutate(sites, site.code = paste(County.Code, Site.ID, sep = "."))
str(sites)
## tibble [51 x 4] (S3: tbl_df/tbl/data.frame)
## $ County.Code: chr [1:51] "001" "001" "005" "005" ...
## $ Site.ID : chr [1:51] "0005" "0012" "0073" "0080" ...
## $ year : Factor w/ 2 levels "1999","2012": 1 1 1 1 1 1 1 1 1 1 ...
## $ site.code : chr [1:51] "001.0005" "001.0012" "005.0073" "005.0080" ...
site.year <- with(sites, split(site.code, year))
both <- intersect(site.year[[1]], site.year[[2]])
print(both)
## [1] "001.0005" "001.0012" "005.0080" "013.0011" "029.0005" "031.0003"
## [7] "063.2008" "067.1015" "085.0055" "101.0003"
##choose one that had a reasonable amount of data in each year.
count <- mutate(pm, site.code = paste(County.Code, Site.ID, sep = ".")) %>%
filter(site.code %in% both)
group_by(count, site.code) %>% summarize(n = n())
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 10 x 2
## site.code n
## <chr> <int>
## 1 001.0005 186
## 2 001.0012 92
## 3 005.0080 92
## 4 013.0011 213
## 5 029.0005 94
## 6 031.0003 198
## 7 063.2008 152
## 8 067.1015 153
## 9 085.0055 38
## 10 101.0003 527
pmsub <- filter(pm, State.Code == "36" & County.Code == "063" & Site.ID == "2008") %>%
select(Date, year, PM) %>%
mutate(Date = ymd(Date), yday = yday(Date))
qplot(yday, PM, data = pmsub, facets = . ~ year, xlab = "Day of the year")
## Warning: Removed 35 rows containing missing values (geom_point).
The variation (spread) in the PM values in 2012 is much smaller than it was in 1999. This suggest that not only are median levels of PM lower in 2012, but that there are fewer large spikes from day to day.
This analysis falls somewhere in between looking at the entire country all at once and looking at an individual monitor.
mn <- group_by(pm, year, State.Code) %>% summarize(PM = mean(PM, na.rm = TRUE))
## `summarise()` regrouping output by 'year' (override with `.groups` argument)
head(mn)
## # A tibble: 6 x 3
## # Groups: year [1]
## year State.Code PM
## <fct> <chr> <dbl>
## 1 1999 01 20.0
## 2 1999 02 6.67
## 3 1999 04 10.8
## 4 1999 05 15.7
## 5 1999 06 17.7
## 6 1999 08 7.53
qplot(xyear, PM, data = mutate(mn, xyear = as.numeric(as.character(year))),
color = factor(State.Code),
geom = c("point", "line"))
This plot needs a bit of work still, it’s not easy to see by eye, it looks but that many states have decreased the average PM levels from 1999 to 2012, and a few states actually increased their levels …