These exercises accompany the Merging tutorial.
merge()
function to merge airdata
with the AQS units table found at https://aqs.epa.gov/aqsweb/codes/data/Units.csv.dplyr
function inner_join()
.airdata
using the semi_join()
function.airdata
using the anti_join()
function.pollutants
and monitors
data frames from the Merging tutorial. Do a semi join with the table below, but only keep records where the Violation column is TRUE.criteria <- read.table(header=T, text='
Parameter Duration Monitor Violation
ozone 1h 1 FALSE
so2 1h 1 TRUE
ozone 8h 2 TRUE
no2 1h 4 FALSE
')
library(region5air)
library(dplyr)
data(airdata)
as.tbl(airdata)
## Source: local data frame [367,595 x 20]
##
## site data_status action_code datetime parameter
## (chr) (int) (int) (chr) (int)
## 1 840170890005 0 10 20141231T0100-0600 44201
## 2 840170311601 0 10 20141231T0100-0600 44201
## 3 840170314002 0 10 20141231T0100-0600 44201
## 4 840170310001 0 10 20141231T0100-0600 44201
## 5 840171110001 0 10 20141231T0100-0600 44201
## 6 840170971007 0 10 20141231T0100-0600 44201
## 7 840170314201 0 10 20141231T0100-0600 44201
## 8 840170310076 0 10 20141231T0100-0600 44201
## 9 840170313103 0 10 20141231T0100-0600 44201
## 10 840171971011 0 10 20141231T0100-0600 44201
## .. ... ... ... ... ...
## Variables not shown: duration (int), frequency (int), value (dbl), unit
## (int), qc (int), poc (int), lat (dbl), lon (dbl), GISDatum (chr), elev
## (int), method_code (int), mpc (chr), mpc_value (chr), uncertainty (lgl),
## qualifiers (chr)
aqs_units <- read.csv("https://aqs.epa.gov/aqsweb/codes/data/Units.csv",
skip = 1, stringsAsFactors = FALSE)
head(aqs_units)
## Unit Unit.Desc
## 1 1 Micrograms/cubic meter (25 C)
## 2 2 Micrograms/cubic meter (0 C)
## 3 3 Nanograms/cubic meter (25 C)
## 4 4 Nanograms/cubic meter (0 C)
## 5 5 Milligrams/cubic meter (25 C)
## 6 6 Milligrams/cubic meter (0 C)
air <- merge(airdata, aqs_units, by.x = "unit", by.y = "Unit")
as.tbl(air)
## Source: local data frame [367,595 x 21]
##
## unit site data_status action_code datetime parameter
## (int) (chr) (int) (int) (chr) (int)
## 1 7 840170890005 0 10 20141231T0100-0600 44201
## 2 7 840170311601 0 10 20141231T0100-0600 44201
## 3 7 840170314002 0 10 20141231T0100-0600 44201
## 4 7 840170310001 0 10 20141231T0100-0600 44201
## 5 7 840171110001 0 10 20141231T0100-0600 44201
## 6 7 840170971007 0 10 20141231T0100-0600 44201
## 7 7 840170314201 0 10 20141231T0100-0600 44201
## 8 7 840170310076 0 10 20141231T0100-0600 44201
## 9 7 840170313103 0 10 20141231T0100-0600 44201
## 10 7 840171971011 0 10 20141231T0100-0600 44201
## .. ... ... ... ... ... ...
## Variables not shown: duration (int), frequency (int), value (dbl), qc
## (int), poc (int), lat (dbl), lon (dbl), GISDatum (chr), elev (int),
## method_code (int), mpc (chr), mpc_value (chr), uncertainty (lgl),
## qualifiers (chr), Unit.Desc (chr)
air <- inner_join(airdata, aqs_units, by = c("unit" = "Unit"))
as.tbl(air)
## Source: local data frame [367,595 x 21]
##
## site data_status action_code datetime parameter
## (chr) (int) (int) (chr) (int)
## 1 840170890005 0 10 20141231T0100-0600 44201
## 2 840170311601 0 10 20141231T0100-0600 44201
## 3 840170314002 0 10 20141231T0100-0600 44201
## 4 840170310001 0 10 20141231T0100-0600 44201
## 5 840171110001 0 10 20141231T0100-0600 44201
## 6 840170971007 0 10 20141231T0100-0600 44201
## 7 840170314201 0 10 20141231T0100-0600 44201
## 8 840170310076 0 10 20141231T0100-0600 44201
## 9 840170313103 0 10 20141231T0100-0600 44201
## 10 840171971011 0 10 20141231T0100-0600 44201
## .. ... ... ... ... ...
## Variables not shown: duration (int), frequency (int), value (dbl), unit
## (int), qc (int), poc (int), lat (dbl), lon (dbl), GISDatum (chr), elev
## (int), method_code (int), mpc (chr), mpc_value (chr), uncertainty (lgl),
## qualifiers (chr), Unit.Desc (chr)
aqs_counties <- read.csv("https://aqs.epa.gov/aqsweb/codes/data/StateCountyCodes.csv",
skip = 1, stringsAsFactors = FALSE)
as.tbl(aqs_counties)
## Source: local data frame [3,279 x 6]
##
## State.Name State.Abbr State.Code Region County.Name County.Code
## (chr) (chr) (chr) (int) (chr) (int)
## 1 Alabama AL 01 4 Autauga 1
## 2 Alabama AL 01 4 Baldwin 3
## 3 Alabama AL 01 4 Barbour 5
## 4 Alabama AL 01 4 Bibb 7
## 5 Alabama AL 01 4 Blount 9
## 6 Alabama AL 01 4 Bullock 11
## 7 Alabama AL 01 4 Butler 13
## 8 Alabama AL 01 4 Calhoun 15
## 9 Alabama AL 01 4 Chambers 17
## 10 Alabama AL 01 4 Cherokee 19
## .. ... ... ... ... ... ...
airdata$State.Code <- substr(airdata$site, 4, 5)
airdata$County.Code <- as.integer(substr(airdata$site, 6, 8))
my_counties <- semi_join(aqs_counties, airdata)
## Joining by: c("State.Code", "County.Code")
my_counties
## State.Name State.Abbr State.Code Region County.Name County.Code
## 1 Illinois IL 17 5 Kane 89
## 2 Illinois IL 17 5 Cook 31
## 3 Illinois IL 17 5 McHenry 111
## 4 Illinois IL 17 5 Lake 97
## 5 Illinois IL 17 5 Will 197
## 6 Wisconsin WI 55 5 Kenosha 59
## 7 Illinois IL 17 5 DuPage 43
## 8 Indiana IN 18 5 Lake 89
## 9 Indiana IN 18 5 Porter 127
## 10 Indiana IN 18 5 Jasper 73
First we read in the AQS table and narrow it down to just a few unique columns
aqs_criteria <- read.csv("https://aqs.epa.gov/aqsweb/codes/data/Parameters-CRITERIA.csv",
skip = 1, stringsAsFactors = FALSE)
as.tbl(aqs_criteria)
## Source: local data frame [1,228 x 17]
##
## Parameter.Code Parameter.Desc Standard.Unit Meth.Code Duration.Code
## (int) (chr) (int) (int) (chr)
## 1 12128 Lead (TSP) STP 1 43 7
## 2 12128 Lead (TSP) STP 1 43 7
## 3 12128 Lead (TSP) STP 1 43 7
## 4 12128 Lead (TSP) STP 1 43 7
## 5 12128 Lead (TSP) STP 1 43 7
## 6 12128 Lead (TSP) STP 1 43 8
## 7 12128 Lead (TSP) STP 1 43 8
## 8 12128 Lead (TSP) STP 1 43 8
## 9 12128 Lead (TSP) STP 1 43 8
## 10 12128 Lead (TSP) STP 1 43 8
## .. ... ... ... ... ...
## Variables not shown: Duration.Desc (chr), Reported.Unit (int),
## Reported.Unit.Desc (chr), Collection.Desc (chr), Sample.Analysis.Disc
## (chr), FED.MDL (dbl), Summary.Scale (int), Designated.Method.Desc (chr),
## Designated.Method.ID (chr), ABS.Max.Sample.Value (chr),
## ABS.Min.Sample.Value (chr), Conversion.Ind (chr)
aqs_criteria <- distinct(select(aqs_criteria, Parameter.Code,
Parameter.Desc, Duration.Desc))
as.tbl(aqs_criteria)
## Source: local data frame [21 x 3]
##
## Parameter.Code Parameter.Desc Duration.Desc
## (int) (chr) (chr)
## 1 12128 Lead (TSP) STP 24 HOUR
## 2 12128 Lead (TSP) STP 1 MONTH
## 3 12128 Lead (TSP) STP COMPOSITE DATA
## 4 14129 Lead (TSP) LC 24 HOUR
## 5 42101 Carbon monoxide 1 HOUR
## 6 42401 Sulfur dioxide 1 HOUR
## 7 42401 Sulfur dioxide 5 MINUTE
## 8 42401 Sulfur dioxide 24 HOUR
## 9 42401 Sulfur dioxide 1 WEEK
## 10 42602 Nitrogen dioxide (NO2) INTEGREATED PASSIVE 2-WEEKS
## .. ... ... ...
Now we filter down to rows with parameters that are not in airdata
.
missing_criteria <- anti_join(aqs_criteria, airdata,
by = c("Parameter.Code" = "parameter"))
missing_criteria
## Parameter.Code Parameter.Desc Duration.Desc
## 1 42401 Sulfur dioxide 1 HOUR
## 2 42401 Sulfur dioxide 5 MINUTE
## 3 42401 Sulfur dioxide 24 HOUR
## 4 42401 Sulfur dioxide 1 WEEK
## 5 42602 Nitrogen dioxide (NO2) INTEGREATED PASSIVE 2-WEEKS
## 6 42602 Nitrogen dioxide (NO2) INTEGREATED PASSIVE 3-WEEKS
## 7 42602 Nitrogen dioxide (NO2) INTEGRATED PASSIVE 4-WEEKS
## 8 42602 Nitrogen dioxide (NO2) 1 HOUR
## 9 42602 Nitrogen dioxide (NO2) 24 HOUR
## 10 85129 Lead PM10 LC FRM/FEM 24 HOUR
## 11 81102 PM10 Total 0-10um STP 24 HOUR
## 12 81102 PM10 Total 0-10um STP 1 HOUR
## 13 14129 Lead (TSP) LC 24 HOUR
## 14 12128 Lead (TSP) STP 24 HOUR
## 15 12128 Lead (TSP) STP 1 MONTH
## 16 12128 Lead (TSP) STP COMPOSITE DATA
## 17 42101 Carbon monoxide 1 HOUR
monitors <- read.table(header=T, text='
monitorid lat long
1 42.467573 -87.810047
2 42.049148 -88.273029
3 39.110539 -90.324080
')
pollutants <- read.table(header=T, text='
pollutant duration monitorid
ozone 1h 1
so2 1h 1
ozone 8h 2
no2 1h 4
')
mon_pol <- full_join(monitors, pollutants)
## Joining by: "monitorid"
criteria <- filter(criteria, Violation == TRUE)
mon_pol <- semi_join(mon_pol, criteria,
by = c("pollutant" = "Parameter",
"duration" = "Duration",
"monitorid" = "Monitor"))
mon_pol
## monitorid lat long pollutant duration
## 1 1 42.46757 -87.81005 so2 1h
## 2 2 42.04915 -88.27303 ozone 8h