Mastering software development in R by Johns Hopkins University, R programming environment course, week 4 Reading and Summarizing Data Quiz
Read in data and modify variable names
library(readr)
data <- read_csv("data/daily_SPEC_2014.csv.bz2")
## Parsed with column specification:
## cols(
## .default = col_character(),
## `Parameter Code` = col_integer(),
## POC = col_integer(),
## Latitude = col_double(),
## Longitude = col_double(),
## `Date Local` = col_date(format = ""),
## `Observation Count` = col_integer(),
## `Observation Percent` = col_double(),
## `Arithmetic Mean` = col_double(),
## `1st Max Value` = col_double(),
## `1st Max Hour` = col_integer(),
## `Method Code` = col_integer(),
## `Date of Last Change` = col_date(format = "")
## )
## See spec(...) for full column specifications.
colnames(data) <- gsub(" +", ".", colnames(data))
01 What is average Arithmetic.Mean for “Bromine PM2.5 LC” in the state of Wisconsin in this dataset?
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
data %>% select(State.Name, Parameter.Name, Arithmetic.Mean) %>%
group_by(State.Name) %>%
filter(State.Name=="Wisconsin") %>%
group_by(Parameter.Name) %>%
filter(Parameter.Name=="Bromine PM2.5 LC")%>%
summarize(mean=mean(Arithmetic.Mean,na.rm = TRUE))
## # A tibble: 1 × 2
## Parameter.Name mean
## <chr> <dbl>
## 1 Bromine PM2.5 LC 0.003960482
02 Calculate the average of each chemical constituent across all states, monitoring sites and all time points.
require(dplyr)
data %>% select(State.Name, Site.Num,Parameter.Name, Arithmetic.Mean,Date.Local)%>%
group_by(State.Name, Site.Num, Date.Local,Parameter.Name) %>%
summarize(mean=mean(Arithmetic.Mean,na.rm = TRUE)) %>%
arrange(desc(mean))
## Source: local data frame [1,690,291 x 5]
## Groups: State.Name, Site.Num, Date.Local [111,096]
##
## State.Name Site.Num Date.Local Parameter.Name
## <chr> <chr> <date> <chr>
## 1 District Of Columbia 0043 2014-01-12 OC CSN Unadjusted PM2.5 LC TOT
## 2 District Of Columbia 0043 2014-01-13 OC CSN Unadjusted PM2.5 LC TOT
## 3 District Of Columbia 0043 2014-01-14 OC CSN Unadjusted PM2.5 LC TOT
## 4 District Of Columbia 0043 2014-01-11 OC CSN Unadjusted PM2.5 LC TOT
## 5 District Of Columbia 0043 2014-03-02 OC CSN Unadjusted PM2.5 LC TOT
## 6 District Of Columbia 0043 2014-02-09 OC CSN Unadjusted PM2.5 LC TOT
## 7 District Of Columbia 0043 2014-01-10 OC CSN Unadjusted PM2.5 LC TOT
## 8 District Of Columbia 0043 2014-03-11 OC CSN Unadjusted PM2.5 LC TOT
## 9 District Of Columbia 0043 2014-01-09 OC CSN Unadjusted PM2.5 LC TOT
## 10 District Of Columbia 0043 2014-03-12 OC CSN Unadjusted PM2.5 LC TOT
## # ... with 1,690,281 more rows, and 1 more variables: mean <dbl>
03 Which monitoring site has the highest average level of “Sulfate PM2.5 LC” across all time?
require(dplyr)
data %>% filter(Parameter.Name=="Sulfate PM2.5 LC")%>%
group_by(State.Code, County.Code,Site.Num) %>%
summarize(mean=mean(Arithmetic.Mean,na.rm = TRUE)) %>%
arrange(desc(mean))
## Source: local data frame [358 x 4]
## Groups: State.Code, County.Code [313]
##
## State.Code County.Code Site.Num mean
## <chr> <chr> <chr> <dbl>
## 1 39 081 0017 3.182189
## 2 42 003 0064 3.055483
## 3 54 039 1005 2.938800
## 4 18 019 0006 2.738700
## 5 39 153 0023 2.706449
## 6 39 035 0060 2.640185
## 7 39 087 0012 2.638311
## 8 54 051 1002 2.619043
## 9 21 111 0067 2.549750
## 10 18 037 2001 2.516367
## # ... with 348 more rows
04 What is the absolute difference in the average levels of “EC PM2.5 LC TOR” between the states California and Arizona, across all time and all monitoring sites?
require(dplyr)
data %>% filter(State.Name %in% c("California","Arizona") & Parameter.Name == "EC PM2.5 LC TOR") %>%
group_by(State.Name) %>%
summarize(mean=mean(Arithmetic.Mean,na.rm = TRUE))%>%
tidyr::spread(State.Name, mean) %>%
mutate(diff = Arizona - California)
## # A tibble: 1 × 3
## Arizona California diff
## <dbl> <dbl> <dbl>
## 1 0.1791704 0.1977374 -0.01856696
06 How many monitoring sites are labelled as both RESIDENTIAL for “Land Use” and SUBURBAN for “Location Setting”?
require(dplyr)
library(readxl)
site <- read_excel("data/aqs_sites.xlsx")
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in A20237 / R20237C1: got 'CC'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in A20238 / R20238C1: got 'CC'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in A20239 / R20239C1: got 'CC'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in A20240 / R20240C1: got 'CC'
colnames(site) <- gsub(" +", ".", colnames(site))
site %>% filter(Land.Use=="RESIDENTIAL" & Location.Setting=="SUBURBAN")%>%
summarize(N=n())
## # A tibble: 1 × 1
## N
## <int>
## 1 3527
with(site, table(Land.Use, Location.Setting))
## Location.Setting
## Land.Use RURAL SUBURBAN UNKNOWN URBAN AND CENTER CITY
## AGRICULTURAL 2233 62 5 10
## BLIGHTED AREAS 5 0 0 3
## COMMERCIAL 353 1610 26 3208
## DESERT 140 2 1 1
## FOREST 620 15 1 1
## INDUSTRIAL 1330 1207 3 1008
## MILITARY RESERVATION 7 6 0 1
## MOBILE 20 110 0 130
## RESIDENTIAL 753 3527 12 1625
## UNKNOWN 145 0 896 0
07 What is the median level of “EC PM2.5 LC TOR” amongst monitoring sites that are labelled as both “RESIDENTIAL” and “SUBURBAN” in the eastern U.S., where eastern is defined as Longitude greater than or equal to -100?
require(dplyr)
site <- rename(site, Site.Num = Site.Number) %>%
select(State.Code, County.Code, Site.Num, Longitude, Land.Use,
Location.Setting)
str(site)
## Classes 'tbl_df', 'tbl' and 'data.frame': 20239 obs. of 6 variables:
## $ State.Code : num 1 1 1 1 1 1 1 1 1 1 ...
## $ County.Code : num 1 1 1 3 3 3 3 5 5 7 ...
## $ Site.Num : num 1 2 3 1 2 3 10 1 2 1 ...
## $ Longitude : num -86.5 -86.4 -86.8 0 -87.7 ...
## $ Land.Use : chr "RESIDENTIAL" "AGRICULTURAL" "FOREST" "UNKNOWN" ...
## $ Location.Setting: chr "SUBURBAN" "RURAL" "RURAL" "RURAL" ...
subdata <- mutate(data, State.Code = as.numeric(State.Code),
County.Code = as.numeric(County.Code),
Site.Num = as.numeric(Site.Num)) %>%
select(State.Code, County.Code, Site.Num, Parameter.Name, Arithmetic.Mean, Date.Local)
str(subdata) # make sure variables are in the same class
## Classes 'tbl_df', 'tbl' and 'data.frame': 2108467 obs. of 6 variables:
## $ State.Code : num 1 1 1 1 1 1 1 1 1 1 ...
## $ County.Code : num 3 3 3 3 3 3 3 3 3 3 ...
## $ Site.Num : num 10 10 10 10 10 10 10 10 10 10 ...
## $ Parameter.Name : chr "Ambient Temperature" "Ambient Temperature" "Ambient Temperature" "Ambient Temperature" ...
## $ Arithmetic.Mean: num 10.9 14.1 0.7 17.9 12.7 11 9.1 6.3 8.6 -4.1 ...
## $ Date.Local : Date, format: "2014-01-02" "2014-01-05" ...
m <- left_join(subdata, site, by = c("State.Code", "County.Code", "Site.Num"))
str(m)
## Classes 'tbl_df', 'tbl' and 'data.frame': 2108467 obs. of 9 variables:
## $ State.Code : num 1 1 1 1 1 1 1 1 1 1 ...
## $ County.Code : num 3 3 3 3 3 3 3 3 3 3 ...
## $ Site.Num : num 10 10 10 10 10 10 10 10 10 10 ...
## $ Parameter.Name : chr "Ambient Temperature" "Ambient Temperature" "Ambient Temperature" "Ambient Temperature" ...
## $ Arithmetic.Mean : num 10.9 14.1 0.7 17.9 12.7 11 9.1 6.3 8.6 -4.1 ...
## $ Date.Local : Date, format: "2014-01-02" "2014-01-05" ...
## $ Longitude : num -87.9 -87.9 -87.9 -87.9 -87.9 ...
## $ Land.Use : chr "COMMERCIAL" "COMMERCIAL" "COMMERCIAL" "COMMERCIAL" ...
## $ Location.Setting: chr "SUBURBAN" "SUBURBAN" "SUBURBAN" "SUBURBAN" ...
m %>% filter(Parameter.Name == "EC PM2.5 LC TOR" & Land.Use == "RESIDENTIAL" &
Location.Setting == "SUBURBAN" & Longitude >= -100) %>%
group_by(Parameter.Name)%>%
summarize(median=median(Arithmetic.Mean,na.rm = TRUE))
## # A tibble: 1 × 2
## Parameter.Name median
## <chr> <dbl>
## 1 EC PM2.5 LC TOR 0.61
08 Amongst monitoring sites that are labeled as COMMERCIAL for “Land Use”, which month of the year has the highest average levels of “Sulfate PM2.5 LC”?
require(dplyr)
subdata %>% left_join(site,by = c("State.Code", "County.Code", "Site.Num")) %>%
filter(Land.Use == "COMMERCIAL",
Parameter.Name == "Sulfate PM2.5 LC") %>%
mutate(month = lubridate::month(Date.Local,label=TRUE)) %>%
group_by(month) %>%
summarise(mean = mean(Arithmetic.Mean, na.rm = TRUE)) %>%
arrange(desc(mean))
## # A tibble: 12 × 2
## month mean
## <ord> <dbl>
## 1 Feb 2.021325
## 2 Mar 1.805260
## 3 Jul 1.777605
## 4 Aug 1.761226
## 5 Jun 1.750571
## 6 Sep 1.645010
## 7 Apr 1.567614
## 8 May 1.558096
## 9 Dec 1.537649
## 10 Jan 1.316738
## 11 Oct 1.313770
## 12 Nov 1.295837
09 Take a look at the data for the monitoring site identified by State Code 6, County Code 65, and Site Number 8001 (this monitor is in California). At this monitor, for how many days is the sum of “Sulfate PM2.5 LC” and “Total Nitrate PM2.5 LC” greater than 10?
require(dplyr)
data %>% filter(State.Code == "06" & County.Code == "065" & Site.Num == "8001" &
Parameter.Name %in% c("Sulfate PM2.5 LC", "Total Nitrate PM2.5 LC"))%>%
group_by(Parameter.Name, Date.Local) %>%
select(State.Code, County.Code, Site.Num, Date.Local, Parameter.Name,
Arithmetic.Mean) %>%
summarise(mean = mean(Arithmetic.Mean, na.rm = TRUE)) %>%
group_by(Date.Local) %>%
summarise(Total = sum(mean, na.rm = TRUE)) %>%
filter(Total > 10)
## # A tibble: 11 × 2
## Date.Local Total
## <date> <dbl>
## 1 2014-01-11 13.60467
## 2 2014-01-29 17.16333
## 3 2014-02-10 11.82500
## 4 2014-02-19 17.69000
## 5 2014-02-25 16.90000
## 6 2014-03-06 11.37667
## 7 2014-03-24 10.05667
## 8 2014-10-11 15.81000
## 9 2014-10-14 11.49500
## 10 2014-10-20 10.83500
## 11 2014-11-10 21.01000
10 Which monitoring site in the dataset has the highest correlation between “Sulfate PM2.5 LC” and “Total Nitrate PM2.5 LC” across all dates? Identify the monitoring site by it’s State, County, and Site Number code.
require(dplyr)
data %>%
filter(Parameter.Name %in% c("Sulfate PM2.5 LC", "Total Nitrate PM2.5 LC")) %>%
group_by(State.Code, County.Code, Site.Num, Parameter.Name, Date.Local) %>%
select(State.Code, County.Code, Site.Num, Date.Local, Parameter.Name,
Arithmetic.Mean) %>%
summarise(mean = mean(Arithmetic.Mean, na.rm = TRUE)) %>%
tidyr::spread(Parameter.Name, mean) %>%
group_by(State.Code, County.Code, Site.Num) %>%
summarise(correlation = cor(`Sulfate PM2.5 LC`, `Total Nitrate PM2.5 LC`)) %>%
arrange(desc(correlation))
## Source: local data frame [358 x 4]
## Groups: State.Code, County.Code [313]
##
## State.Code County.Code Site.Num correlation
## <chr> <chr> <chr> <dbl>
## 1 02 090 0035 0.8978038
## 2 08 001 0006 0.8956944
## 3 34 001 0006 0.8812428
## 4 42 045 0002 0.8739804
## 5 02 090 0010 0.8637321
## 6 53 033 0030 0.8471710
## 7 02 090 0034 0.8412910
## 8 41 033 0010 0.7920220
## 9 16 037 0002 0.7907279
## 10 38 017 1004 0.7900868
## # ... with 348 more rows