Exercises

These exercises accompany the Merging tutorial.

  1. Use the merge() function to merge airdata with the AQS units table found at https://aqs.epa.gov/aqsweb/codes/data/Units.csv.

Solution

  1. Do the same merge described in Exercise 1 but use the dplyr function inner_join().

Solution

  1. Subset the AQS State and County code table (https://aqs.epa.gov/aqsweb/codes/data/StateCountyCodes.csv) down to just the counties that have monitors in airdata using the semi_join() function.

Solution

  1. Subset the AQS Criteria table (https://aqs.epa.gov/aqsweb/codes/data/Parameters-CRITERIA.csv) down to parameters that are not in airdata using the anti_join() function.

Solution

  1. Do a full merge of the pollutants and monitors data frames from the Merging tutorial. Do a semi join with the table below, but only keep records where the Violation column is TRUE.
criteria <- read.table(header=T, text='
  Parameter   Duration    Monitor   Violation              
      ozone         1h          1       FALSE
        so2         1h          1        TRUE
      ozone         8h          2        TRUE
        no2         1h          4       FALSE
                       ')

Solution

Solutions

Solution 1

library(region5air)
library(dplyr)
data(airdata)
as.tbl(airdata)
## Source: local data frame [367,595 x 20]
## 
##            site data_status action_code           datetime parameter
##           (chr)       (int)       (int)              (chr)     (int)
## 1  840170890005           0          10 20141231T0100-0600     44201
## 2  840170311601           0          10 20141231T0100-0600     44201
## 3  840170314002           0          10 20141231T0100-0600     44201
## 4  840170310001           0          10 20141231T0100-0600     44201
## 5  840171110001           0          10 20141231T0100-0600     44201
## 6  840170971007           0          10 20141231T0100-0600     44201
## 7  840170314201           0          10 20141231T0100-0600     44201
## 8  840170310076           0          10 20141231T0100-0600     44201
## 9  840170313103           0          10 20141231T0100-0600     44201
## 10 840171971011           0          10 20141231T0100-0600     44201
## ..          ...         ...         ...                ...       ...
## Variables not shown: duration (int), frequency (int), value (dbl), unit
##   (int), qc (int), poc (int), lat (dbl), lon (dbl), GISDatum (chr), elev
##   (int), method_code (int), mpc (chr), mpc_value (chr), uncertainty (lgl),
##   qualifiers (chr)
aqs_units <- read.csv("https://aqs.epa.gov/aqsweb/codes/data/Units.csv",
                      skip = 1, stringsAsFactors = FALSE)
head(aqs_units)
##   Unit                     Unit.Desc
## 1    1 Micrograms/cubic meter (25 C)
## 2    2  Micrograms/cubic meter (0 C)
## 3    3  Nanograms/cubic meter (25 C)
## 4    4   Nanograms/cubic meter (0 C)
## 5    5 Milligrams/cubic meter (25 C)
## 6    6  Milligrams/cubic meter (0 C)
air <- merge(airdata, aqs_units, by.x = "unit", by.y = "Unit")
as.tbl(air)
## Source: local data frame [367,595 x 21]
## 
##     unit         site data_status action_code           datetime parameter
##    (int)        (chr)       (int)       (int)              (chr)     (int)
## 1      7 840170890005           0          10 20141231T0100-0600     44201
## 2      7 840170311601           0          10 20141231T0100-0600     44201
## 3      7 840170314002           0          10 20141231T0100-0600     44201
## 4      7 840170310001           0          10 20141231T0100-0600     44201
## 5      7 840171110001           0          10 20141231T0100-0600     44201
## 6      7 840170971007           0          10 20141231T0100-0600     44201
## 7      7 840170314201           0          10 20141231T0100-0600     44201
## 8      7 840170310076           0          10 20141231T0100-0600     44201
## 9      7 840170313103           0          10 20141231T0100-0600     44201
## 10     7 840171971011           0          10 20141231T0100-0600     44201
## ..   ...          ...         ...         ...                ...       ...
## Variables not shown: duration (int), frequency (int), value (dbl), qc
##   (int), poc (int), lat (dbl), lon (dbl), GISDatum (chr), elev (int),
##   method_code (int), mpc (chr), mpc_value (chr), uncertainty (lgl),
##   qualifiers (chr), Unit.Desc (chr)

Back to exercises

Solution 2

air <- inner_join(airdata, aqs_units, by = c("unit" = "Unit"))
as.tbl(air)
## Source: local data frame [367,595 x 21]
## 
##            site data_status action_code           datetime parameter
##           (chr)       (int)       (int)              (chr)     (int)
## 1  840170890005           0          10 20141231T0100-0600     44201
## 2  840170311601           0          10 20141231T0100-0600     44201
## 3  840170314002           0          10 20141231T0100-0600     44201
## 4  840170310001           0          10 20141231T0100-0600     44201
## 5  840171110001           0          10 20141231T0100-0600     44201
## 6  840170971007           0          10 20141231T0100-0600     44201
## 7  840170314201           0          10 20141231T0100-0600     44201
## 8  840170310076           0          10 20141231T0100-0600     44201
## 9  840170313103           0          10 20141231T0100-0600     44201
## 10 840171971011           0          10 20141231T0100-0600     44201
## ..          ...         ...         ...                ...       ...
## Variables not shown: duration (int), frequency (int), value (dbl), unit
##   (int), qc (int), poc (int), lat (dbl), lon (dbl), GISDatum (chr), elev
##   (int), method_code (int), mpc (chr), mpc_value (chr), uncertainty (lgl),
##   qualifiers (chr), Unit.Desc (chr)

Back to exercises

Solution 3

aqs_counties <- read.csv("https://aqs.epa.gov/aqsweb/codes/data/StateCountyCodes.csv",
                       skip = 1, stringsAsFactors = FALSE)
as.tbl(aqs_counties)
## Source: local data frame [3,279 x 6]
## 
##    State.Name State.Abbr State.Code Region County.Name County.Code
##         (chr)      (chr)      (chr)  (int)       (chr)       (int)
## 1     Alabama         AL         01      4     Autauga           1
## 2     Alabama         AL         01      4     Baldwin           3
## 3     Alabama         AL         01      4     Barbour           5
## 4     Alabama         AL         01      4        Bibb           7
## 5     Alabama         AL         01      4      Blount           9
## 6     Alabama         AL         01      4     Bullock          11
## 7     Alabama         AL         01      4      Butler          13
## 8     Alabama         AL         01      4     Calhoun          15
## 9     Alabama         AL         01      4    Chambers          17
## 10    Alabama         AL         01      4    Cherokee          19
## ..        ...        ...        ...    ...         ...         ...
airdata$State.Code <- substr(airdata$site, 4, 5)
airdata$County.Code <- as.integer(substr(airdata$site, 6, 8))
my_counties <- semi_join(aqs_counties, airdata)
## Joining by: c("State.Code", "County.Code")
my_counties
##    State.Name State.Abbr State.Code Region County.Name County.Code
## 1    Illinois         IL         17      5        Kane          89
## 2    Illinois         IL         17      5        Cook          31
## 3    Illinois         IL         17      5     McHenry         111
## 4    Illinois         IL         17      5        Lake          97
## 5    Illinois         IL         17      5        Will         197
## 6   Wisconsin         WI         55      5     Kenosha          59
## 7    Illinois         IL         17      5      DuPage          43
## 8     Indiana         IN         18      5        Lake          89
## 9     Indiana         IN         18      5      Porter         127
## 10    Indiana         IN         18      5      Jasper          73

Back to exercises

Solution 4

First we read in the AQS table and narrow it down to just a few unique columns

aqs_criteria <- read.csv("https://aqs.epa.gov/aqsweb/codes/data/Parameters-CRITERIA.csv",
                       skip = 1, stringsAsFactors = FALSE)
as.tbl(aqs_criteria)
## Source: local data frame [1,228 x 17]
## 
##    Parameter.Code Parameter.Desc Standard.Unit Meth.Code Duration.Code
##             (int)          (chr)         (int)     (int)         (chr)
## 1           12128 Lead (TSP) STP             1        43             7
## 2           12128 Lead (TSP) STP             1        43             7
## 3           12128 Lead (TSP) STP             1        43             7
## 4           12128 Lead (TSP) STP             1        43             7
## 5           12128 Lead (TSP) STP             1        43             7
## 6           12128 Lead (TSP) STP             1        43             8
## 7           12128 Lead (TSP) STP             1        43             8
## 8           12128 Lead (TSP) STP             1        43             8
## 9           12128 Lead (TSP) STP             1        43             8
## 10          12128 Lead (TSP) STP             1        43             8
## ..            ...            ...           ...       ...           ...
## Variables not shown: Duration.Desc (chr), Reported.Unit (int),
##   Reported.Unit.Desc (chr), Collection.Desc (chr), Sample.Analysis.Disc
##   (chr), FED.MDL (dbl), Summary.Scale (int), Designated.Method.Desc (chr),
##   Designated.Method.ID (chr), ABS.Max.Sample.Value (chr),
##   ABS.Min.Sample.Value (chr), Conversion.Ind (chr)
aqs_criteria <- distinct(select(aqs_criteria, Parameter.Code, 
                                Parameter.Desc, Duration.Desc))
as.tbl(aqs_criteria)
## Source: local data frame [21 x 3]
## 
##    Parameter.Code         Parameter.Desc               Duration.Desc
##             (int)                  (chr)                       (chr)
## 1           12128         Lead (TSP) STP                     24 HOUR
## 2           12128         Lead (TSP) STP                     1 MONTH
## 3           12128         Lead (TSP) STP              COMPOSITE DATA
## 4           14129          Lead (TSP) LC                     24 HOUR
## 5           42101        Carbon monoxide                      1 HOUR
## 6           42401         Sulfur dioxide                      1 HOUR
## 7           42401         Sulfur dioxide                    5 MINUTE
## 8           42401         Sulfur dioxide                     24 HOUR
## 9           42401         Sulfur dioxide                      1 WEEK
## 10          42602 Nitrogen dioxide (NO2) INTEGREATED PASSIVE 2-WEEKS
## ..            ...                    ...                         ...

Now we filter down to rows with parameters that are not in airdata.

missing_criteria <- anti_join(aqs_criteria, airdata, 
                              by = c("Parameter.Code" = "parameter"))
missing_criteria
##    Parameter.Code         Parameter.Desc               Duration.Desc
## 1           42401         Sulfur dioxide                      1 HOUR
## 2           42401         Sulfur dioxide                    5 MINUTE
## 3           42401         Sulfur dioxide                     24 HOUR
## 4           42401         Sulfur dioxide                      1 WEEK
## 5           42602 Nitrogen dioxide (NO2) INTEGREATED PASSIVE 2-WEEKS
## 6           42602 Nitrogen dioxide (NO2) INTEGREATED PASSIVE 3-WEEKS
## 7           42602 Nitrogen dioxide (NO2)  INTEGRATED PASSIVE 4-WEEKS
## 8           42602 Nitrogen dioxide (NO2)                      1 HOUR
## 9           42602 Nitrogen dioxide (NO2)                     24 HOUR
## 10          85129   Lead PM10 LC FRM/FEM                     24 HOUR
## 11          81102  PM10 Total 0-10um STP                     24 HOUR
## 12          81102  PM10 Total 0-10um STP                      1 HOUR
## 13          14129          Lead (TSP) LC                     24 HOUR
## 14          12128         Lead (TSP) STP                     24 HOUR
## 15          12128         Lead (TSP) STP                     1 MONTH
## 16          12128         Lead (TSP) STP              COMPOSITE DATA
## 17          42101        Carbon monoxide                      1 HOUR

Back to exercises

Solution 5

monitors <- read.table(header=T, text='
  monitorid        lat       long                 
          1  42.467573 -87.810047     
          2  42.049148 -88.273029
          3  39.110539 -90.324080
                       ')

pollutants <- read.table(header=T, text='
  pollutant   duration    monitorid                 
      ozone         1h            1
        so2         1h            1
      ozone         8h            2
        no2         1h            4
                       ')

mon_pol <- full_join(monitors, pollutants)
## Joining by: "monitorid"
criteria <- filter(criteria, Violation == TRUE)

mon_pol <- semi_join(mon_pol, criteria, 
                     by = c("pollutant" = "Parameter",
                            "duration" = "Duration",
                            "monitorid" = "Monitor"))

mon_pol
##   monitorid      lat      long pollutant duration
## 1         1 42.46757 -87.81005       so2       1h
## 2         2 42.04915 -88.27303     ozone       8h

Back to exercises