Mastering software development in R by Johns Hopkins University, R programming environment course, week 4 Reading and Summarizing Data Quiz

Read in data and modify variable names

library(readr)
data <- read_csv("data/daily_SPEC_2014.csv.bz2")
## Parsed with column specification:
## cols(
##   .default = col_character(),
##   `Parameter Code` = col_integer(),
##   POC = col_integer(),
##   Latitude = col_double(),
##   Longitude = col_double(),
##   `Date Local` = col_date(format = ""),
##   `Observation Count` = col_integer(),
##   `Observation Percent` = col_double(),
##   `Arithmetic Mean` = col_double(),
##   `1st Max Value` = col_double(),
##   `1st Max Hour` = col_integer(),
##   `Method Code` = col_integer(),
##   `Date of Last Change` = col_date(format = "")
## )
## See spec(...) for full column specifications.
colnames(data) <- gsub(" +", ".", colnames(data))

01 What is average Arithmetic.Mean for “Bromine PM2.5 LC” in the state of Wisconsin in this dataset?

require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
data %>% select(State.Name, Parameter.Name, Arithmetic.Mean) %>%
         group_by(State.Name) %>%
         filter(State.Name=="Wisconsin") %>% 
         group_by(Parameter.Name) %>%
         filter(Parameter.Name=="Bromine PM2.5 LC")%>%
         summarize(mean=mean(Arithmetic.Mean,na.rm = TRUE))
## # A tibble: 1 × 2
##     Parameter.Name        mean
##              <chr>       <dbl>
## 1 Bromine PM2.5 LC 0.003960482

02 Calculate the average of each chemical constituent across all states, monitoring sites and all time points.

require(dplyr)
data %>% select(State.Name, Site.Num,Parameter.Name, Arithmetic.Mean,Date.Local)%>%
  group_by(State.Name, Site.Num, Date.Local,Parameter.Name) %>%
  summarize(mean=mean(Arithmetic.Mean,na.rm = TRUE)) %>%
  arrange(desc(mean))
## Source: local data frame [1,690,291 x 5]
## Groups: State.Name, Site.Num, Date.Local [111,096]
## 
##              State.Name Site.Num Date.Local                 Parameter.Name
##                   <chr>    <chr>     <date>                          <chr>
## 1  District Of Columbia     0043 2014-01-12 OC CSN Unadjusted PM2.5 LC TOT
## 2  District Of Columbia     0043 2014-01-13 OC CSN Unadjusted PM2.5 LC TOT
## 3  District Of Columbia     0043 2014-01-14 OC CSN Unadjusted PM2.5 LC TOT
## 4  District Of Columbia     0043 2014-01-11 OC CSN Unadjusted PM2.5 LC TOT
## 5  District Of Columbia     0043 2014-03-02 OC CSN Unadjusted PM2.5 LC TOT
## 6  District Of Columbia     0043 2014-02-09 OC CSN Unadjusted PM2.5 LC TOT
## 7  District Of Columbia     0043 2014-01-10 OC CSN Unadjusted PM2.5 LC TOT
## 8  District Of Columbia     0043 2014-03-11 OC CSN Unadjusted PM2.5 LC TOT
## 9  District Of Columbia     0043 2014-01-09 OC CSN Unadjusted PM2.5 LC TOT
## 10 District Of Columbia     0043 2014-03-12 OC CSN Unadjusted PM2.5 LC TOT
## # ... with 1,690,281 more rows, and 1 more variables: mean <dbl>

03 Which monitoring site has the highest average level of “Sulfate PM2.5 LC” across all time?

require(dplyr)
data %>% filter(Parameter.Name=="Sulfate PM2.5 LC")%>%
  group_by(State.Code, County.Code,Site.Num) %>%
  summarize(mean=mean(Arithmetic.Mean,na.rm = TRUE)) %>%
  arrange(desc(mean))
## Source: local data frame [358 x 4]
## Groups: State.Code, County.Code [313]
## 
##    State.Code County.Code Site.Num     mean
##         <chr>       <chr>    <chr>    <dbl>
## 1          39         081     0017 3.182189
## 2          42         003     0064 3.055483
## 3          54         039     1005 2.938800
## 4          18         019     0006 2.738700
## 5          39         153     0023 2.706449
## 6          39         035     0060 2.640185
## 7          39         087     0012 2.638311
## 8          54         051     1002 2.619043
## 9          21         111     0067 2.549750
## 10         18         037     2001 2.516367
## # ... with 348 more rows

04 What is the absolute difference in the average levels of “EC PM2.5 LC TOR” between the states California and Arizona, across all time and all monitoring sites?

require(dplyr)
data %>% filter(State.Name %in% c("California","Arizona") & Parameter.Name == "EC PM2.5 LC TOR") %>%
  group_by(State.Name) %>%
  summarize(mean=mean(Arithmetic.Mean,na.rm = TRUE))%>%
  tidyr::spread(State.Name, mean) %>%
  mutate(diff = Arizona - California)
## # A tibble: 1 × 3
##     Arizona California        diff
##       <dbl>      <dbl>       <dbl>
## 1 0.1791704  0.1977374 -0.01856696

05 What is the median level of “OC PM2.5 LC TOR” in the western United States, across all time? Define western as any monitoring location that has a Longitude LESS THAN -100

require(dplyr)
data %>% filter(Parameter.Name == ("OC PM2.5 LC TOR")) %>%
  mutate(region=ifelse(Longitude < -100, "west","east"))%>%
  group_by(Parameter.Name,region)%>%
  summarize(median=median(Arithmetic.Mean,na.rm = TRUE))%>%
  tidyr::spread(region, median)
## Source: local data frame [1 x 3]
## Groups: Parameter.Name [1]
## 
##    Parameter.Name  east  west
## *           <chr> <dbl> <dbl>
## 1 OC PM2.5 LC TOR  0.88  0.43

06 How many monitoring sites are labelled as both RESIDENTIAL for “Land Use” and SUBURBAN for “Location Setting”?

require(dplyr)
library(readxl)

site <- read_excel("data/aqs_sites.xlsx")
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in A20237 / R20237C1: got 'CC'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in A20238 / R20238C1: got 'CC'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in A20239 / R20239C1: got 'CC'
## Warning in read_fun(path = path, sheet = sheet, limits = limits, shim =
## shim, : Expecting numeric in A20240 / R20240C1: got 'CC'
colnames(site) <- gsub(" +", ".", colnames(site))

site %>% filter(Land.Use=="RESIDENTIAL" &  Location.Setting=="SUBURBAN")%>%
  summarize(N=n())
## # A tibble: 1 × 1
##       N
##   <int>
## 1  3527
with(site, table(Land.Use, Location.Setting))
##                       Location.Setting
## Land.Use               RURAL SUBURBAN UNKNOWN URBAN AND CENTER CITY
##   AGRICULTURAL          2233       62       5                    10
##   BLIGHTED AREAS           5        0       0                     3
##   COMMERCIAL             353     1610      26                  3208
##   DESERT                 140        2       1                     1
##   FOREST                 620       15       1                     1
##   INDUSTRIAL            1330     1207       3                  1008
##   MILITARY RESERVATION     7        6       0                     1
##   MOBILE                  20      110       0                   130
##   RESIDENTIAL            753     3527      12                  1625
##   UNKNOWN                145        0     896                     0

07 What is the median level of “EC PM2.5 LC TOR” amongst monitoring sites that are labelled as both “RESIDENTIAL” and “SUBURBAN” in the eastern U.S., where eastern is defined as Longitude greater than or equal to -100?

require(dplyr)
site <- rename(site, Site.Num = Site.Number) %>%
  select(State.Code, County.Code, Site.Num, Longitude, Land.Use, 
         Location.Setting)
str(site)
## Classes 'tbl_df', 'tbl' and 'data.frame':    20239 obs. of  6 variables:
##  $ State.Code      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ County.Code     : num  1 1 1 3 3 3 3 5 5 7 ...
##  $ Site.Num        : num  1 2 3 1 2 3 10 1 2 1 ...
##  $ Longitude       : num  -86.5 -86.4 -86.8 0 -87.7 ...
##  $ Land.Use        : chr  "RESIDENTIAL" "AGRICULTURAL" "FOREST" "UNKNOWN" ...
##  $ Location.Setting: chr  "SUBURBAN" "RURAL" "RURAL" "RURAL" ...
subdata <- mutate(data, State.Code = as.numeric(State.Code),
               County.Code = as.numeric(County.Code),
               Site.Num = as.numeric(Site.Num)) %>%
        select(State.Code, County.Code, Site.Num, Parameter.Name, Arithmetic.Mean, Date.Local)
str(subdata) # make sure variables are in the same class
## Classes 'tbl_df', 'tbl' and 'data.frame':    2108467 obs. of  6 variables:
##  $ State.Code     : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ County.Code    : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ Site.Num       : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ Parameter.Name : chr  "Ambient Temperature" "Ambient Temperature" "Ambient Temperature" "Ambient Temperature" ...
##  $ Arithmetic.Mean: num  10.9 14.1 0.7 17.9 12.7 11 9.1 6.3 8.6 -4.1 ...
##  $ Date.Local     : Date, format: "2014-01-02" "2014-01-05" ...
m <- left_join(subdata, site, by = c("State.Code", "County.Code", "Site.Num"))
str(m)
## Classes 'tbl_df', 'tbl' and 'data.frame':    2108467 obs. of  9 variables:
##  $ State.Code      : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ County.Code     : num  3 3 3 3 3 3 3 3 3 3 ...
##  $ Site.Num        : num  10 10 10 10 10 10 10 10 10 10 ...
##  $ Parameter.Name  : chr  "Ambient Temperature" "Ambient Temperature" "Ambient Temperature" "Ambient Temperature" ...
##  $ Arithmetic.Mean : num  10.9 14.1 0.7 17.9 12.7 11 9.1 6.3 8.6 -4.1 ...
##  $ Date.Local      : Date, format: "2014-01-02" "2014-01-05" ...
##  $ Longitude       : num  -87.9 -87.9 -87.9 -87.9 -87.9 ...
##  $ Land.Use        : chr  "COMMERCIAL" "COMMERCIAL" "COMMERCIAL" "COMMERCIAL" ...
##  $ Location.Setting: chr  "SUBURBAN" "SUBURBAN" "SUBURBAN" "SUBURBAN" ...
m %>% filter(Parameter.Name == "EC PM2.5 LC TOR" & Land.Use == "RESIDENTIAL" &  
            Location.Setting == "SUBURBAN" & Longitude >= -100) %>%
      group_by(Parameter.Name)%>%
      summarize(median=median(Arithmetic.Mean,na.rm = TRUE))
## # A tibble: 1 × 2
##    Parameter.Name median
##             <chr>  <dbl>
## 1 EC PM2.5 LC TOR   0.61

08 Amongst monitoring sites that are labeled as COMMERCIAL for “Land Use”, which month of the year has the highest average levels of “Sulfate PM2.5 LC”?

require(dplyr)
subdata %>% left_join(site,by = c("State.Code", "County.Code", "Site.Num")) %>%
  filter(Land.Use == "COMMERCIAL",
         Parameter.Name == "Sulfate PM2.5 LC") %>%
  mutate(month = lubridate::month(Date.Local,label=TRUE)) %>%
  group_by(month) %>%
  summarise(mean = mean(Arithmetic.Mean, na.rm = TRUE)) %>%
  arrange(desc(mean))
## # A tibble: 12 × 2
##    month     mean
##    <ord>    <dbl>
## 1    Feb 2.021325
## 2    Mar 1.805260
## 3    Jul 1.777605
## 4    Aug 1.761226
## 5    Jun 1.750571
## 6    Sep 1.645010
## 7    Apr 1.567614
## 8    May 1.558096
## 9    Dec 1.537649
## 10   Jan 1.316738
## 11   Oct 1.313770
## 12   Nov 1.295837

09 Take a look at the data for the monitoring site identified by State Code 6, County Code 65, and Site Number 8001 (this monitor is in California). At this monitor, for how many days is the sum of “Sulfate PM2.5 LC” and “Total Nitrate PM2.5 LC” greater than 10?

require(dplyr)
data %>% filter(State.Code == "06" & County.Code == "065" & Site.Num == "8001" &
                Parameter.Name %in% c("Sulfate PM2.5 LC", "Total Nitrate PM2.5 LC"))%>%
         group_by(Parameter.Name, Date.Local) %>%
         select(State.Code, County.Code, Site.Num, Date.Local, Parameter.Name,
                Arithmetic.Mean) %>%
         summarise(mean = mean(Arithmetic.Mean, na.rm = TRUE)) %>%
         group_by(Date.Local) %>%
         summarise(Total = sum(mean, na.rm = TRUE)) %>%
         filter(Total > 10)
## # A tibble: 11 × 2
##    Date.Local    Total
##        <date>    <dbl>
## 1  2014-01-11 13.60467
## 2  2014-01-29 17.16333
## 3  2014-02-10 11.82500
## 4  2014-02-19 17.69000
## 5  2014-02-25 16.90000
## 6  2014-03-06 11.37667
## 7  2014-03-24 10.05667
## 8  2014-10-11 15.81000
## 9  2014-10-14 11.49500
## 10 2014-10-20 10.83500
## 11 2014-11-10 21.01000

10 Which monitoring site in the dataset has the highest correlation between “Sulfate PM2.5 LC” and “Total Nitrate PM2.5 LC” across all dates? Identify the monitoring site by it’s State, County, and Site Number code.

require(dplyr)
data %>%
  filter(Parameter.Name %in% c("Sulfate PM2.5 LC", "Total Nitrate PM2.5 LC")) %>%
  group_by(State.Code, County.Code, Site.Num, Parameter.Name, Date.Local) %>%
  select(State.Code, County.Code, Site.Num, Date.Local, Parameter.Name,
         Arithmetic.Mean) %>%
  summarise(mean = mean(Arithmetic.Mean, na.rm = TRUE)) %>%
  tidyr::spread(Parameter.Name, mean) %>%
  group_by(State.Code, County.Code, Site.Num) %>%
  summarise(correlation = cor(`Sulfate PM2.5 LC`, `Total Nitrate PM2.5 LC`)) %>%
  arrange(desc(correlation))
## Source: local data frame [358 x 4]
## Groups: State.Code, County.Code [313]
## 
##    State.Code County.Code Site.Num correlation
##         <chr>       <chr>    <chr>       <dbl>
## 1          02         090     0035   0.8978038
## 2          08         001     0006   0.8956944
## 3          34         001     0006   0.8812428
## 4          42         045     0002   0.8739804
## 5          02         090     0010   0.8637321
## 6          53         033     0030   0.8471710
## 7          02         090     0034   0.8412910
## 8          41         033     0010   0.7920220
## 9          16         037     0002   0.7907279
## 10         38         017     1004   0.7900868
## # ... with 348 more rows