dplyr can be used to transform and manipulate data (see https://rpubs.com/pjozefek/643143) and it can be used to join or filter several datasets (relational data).

In this report I will be exploring the three ways in which this is achieved.

  • Mutating Joins - add new variables to one dataset by matching observations in another.
  • Filtering Joins - filter observations in one dataset based on matches in another.
  • Set Operations - Treat observations as set elements.

The Datasets


The nycflights13 package contains the flights dataset along with four related datasets: airlines, airports, planes, weather.

> library(tidyverse) #load dplyr
> library(nycflights13) #load datasets
> library(knitr) #load library for table formatting
> library(kableExtra) #load library for table formatting
> 
> # flights dataset
> kable(head(flights)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 8,position="left")%>%
+                 row_spec(0,background="peachpuff") 
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour
2013 1 1 517 515 2 830 819 11 UA 1545 N14228 EWR IAH 227 1400 5 15 2013-01-01 05:00:00
2013 1 1 533 529 4 850 830 20 UA 1714 N24211 LGA IAH 227 1416 5 29 2013-01-01 05:00:00
2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5 40 2013-01-01 05:00:00
2013 1 1 544 545 -1 1004 1022 -18 B6 725 N804JB JFK BQN 183 1576 5 45 2013-01-01 05:00:00
2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6 0 2013-01-01 06:00:00
2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5 58 2013-01-01 05:00:00

The flights dataset has 336776 rows and 19 columns.

> # airlines dataset
> kable(head(airlines)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
carrier name
9E Endeavor Air Inc. 
AA American Airlines Inc. 
AS Alaska Airlines Inc. 
B6 JetBlue Airways
DL Delta Air Lines Inc. 
EV ExpressJet Airlines Inc. 

The airlines dataset has 16 rows and 2 columns.

> # airports dataset
> kable(head(airports)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
faa name lat lon alt tz dst tzone
04G Lansdowne Airport 41.13047 -80.61958 1044 -5 A America/New_York
06A Moton Field Municipal Airport 32.46057 -85.68003 264 -6 A America/Chicago
06C Schaumburg Regional 41.98934 -88.10124 801 -6 A America/Chicago
06N Randall Airport 41.43191 -74.39156 523 -5 A America/New_York
09J Jekyll Island Airport 31.07447 -81.42778 11 -5 A America/New_York
0A9 Elizabethton Municipal Airport 36.37122 -82.17342 1593 -5 A America/New_York

The airports dataset has 1458 rows and 8 columns.

> # planes dataset
> kable(head(planes)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
tailnum year type manufacturer model engines seats speed engine
N10156 2004 Fixed wing multi engine EMBRAER EMB-145XR 2 55 NA Turbo-fan
N102UW 1998 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
N103US 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
N104UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan
N10575 2002 Fixed wing multi engine EMBRAER EMB-145LR 2 55 NA Turbo-fan
N105UW 1999 Fixed wing multi engine AIRBUS INDUSTRIE A320-214 2 182 NA Turbo-fan

The planes dataset has 3322 rows and 9 columns.

> # weather dataset
> kable(head(weather)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
origin year month day hour temp dewp humid wind_dir wind_speed wind_gust precip pressure visib time_hour
EWR 2013 1 1 1 39.02 26.06 59.37 270 10.35702 NA 0 1012.0 10 2013-01-01 01:00:00
EWR 2013 1 1 2 39.02 26.96 61.63 250 8.05546 NA 0 1012.3 10 2013-01-01 02:00:00
EWR 2013 1 1 3 39.02 28.04 64.43 240 11.50780 NA 0 1012.5 10 2013-01-01 03:00:00
EWR 2013 1 1 4 39.92 28.04 62.21 250 12.65858 NA 0 1012.2 10 2013-01-01 04:00:00
EWR 2013 1 1 5 39.02 28.04 64.43 260 12.65858 NA 0 1011.9 10 2013-01-01 05:00:00
EWR 2013 1 1 6 37.94 28.04 67.21 240 11.50780 NA 0 1012.4 10 2013-01-01 06:00:00

The weather dataset has 26115 rows and 15 columns.

The Datasets - Exercises

  1. Imagine you wanted to draw the route each plane flies from its origin to its destination. What variables would you need? What tables would you need to combine?
  • Variables - longitude and latitude for the origin and destination.
  • Tables - combine flights with airports.
> library(tidyverse)
> library(nycflights13)
> 
> flights_route <- flights %>%
+   inner_join(airports,by = c("origin"="faa")) %>% 
+   inner_join(airports,by = c("dest"="faa"))
> 
> (flights_route <- flights_route %>% 
+   select(origin,lat.x,lon.x,dest,lat.y,lon.y) %>% 
+   head(100))
# A tibble: 100 x 6
   origin lat.x lon.x dest  lat.y lon.y
   <chr>  <dbl> <dbl> <chr> <dbl> <dbl>
 1 EWR     40.7 -74.2 IAH    30.0 -95.3
 2 LGA     40.8 -73.9 IAH    30.0 -95.3
 3 JFK     40.6 -73.8 MIA    25.8 -80.3
 4 LGA     40.8 -73.9 ATL    33.6 -84.4
 5 EWR     40.7 -74.2 ORD    42.0 -87.9
 6 EWR     40.7 -74.2 FLL    26.1 -80.2
 7 LGA     40.8 -73.9 IAD    38.9 -77.5
 8 JFK     40.6 -73.8 MCO    28.4 -81.3
 9 LGA     40.8 -73.9 ORD    42.0 -87.9
10 JFK     40.6 -73.8 PBI    26.7 -80.1
# ... with 90 more rows
> flights_route %>%
+   ggplot() +
+   borders("state") + coord_quickmap() +
+   geom_segment(aes(
+     x = lon.x, xend = lon.y,
+     y = lat.x, yend = lat.y,
+     color=dest),
+     arrow = arrow(length = unit(0.1, "cm"))) +
+   labs(x="longitude",y="latitude")+ 
+   theme_minimal()+
+   theme(legend.position = "none") 

  1. I forgot to draw the relationship between weather and airports. What is the relationship and how should it appear in the diagram?
  • They would be connected by weather$origin = airports$faa.
> left_join(weather, airports,by=c("origin"="faa")) 
# A tibble: 26,115 x 22
   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4 
 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06
 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5 
 4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7 
 5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
 6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
 7 EWR     2013     1     1     7  39.0  28.0  64.4      240      15.0 
 8 EWR     2013     1     1     8  39.9  28.0  62.2      250      10.4 
 9 EWR     2013     1     1     9  39.9  28.0  62.2      260      15.0 
10 EWR     2013     1     1    10  41    28.0  59.6      260      13.8 
# ... with 26,105 more rows, and 12 more variables: wind_gust <dbl>,
#   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, name <chr>,
#   lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>
  1. weather only contains information for the origin (NYC) airports. If it contained weather records for all airports in the USA, what additional relation would it define with flights?
  • Right now you can only connect it to flights by year, month, day, origin.
  • With records for all airports you could connect it by year, month, day, dest.
  1. We know that some of the days of the year are “special,” and fewer than usual fly on them. How might you represent that data as a data frame? What would be the primary keys of that table? How would it connect to the existing tables?
  • Primary keys - year, month, day.
> (special_days <- tribble(
+   ~year, ~month, ~day, ~holiday,
+   2013, 01, 01, "New Years Day",
+   2013, 07, 04, "Independence Day",
+   2013, 11, 29, "Thanksgiving Day",
+   2013, 12, 25, "Christmas Day"
+ ))
# A tibble: 4 x 4
   year month   day holiday         
  <dbl> <dbl> <dbl> <chr>           
1  2013     1     1 New Years Day   
2  2013     7     4 Independence Day
3  2013    11    29 Thanksgiving Day
4  2013    12    25 Christmas Day   

Keys


Keys are used to connect each table and they uniquely identify an observation (example: every US citizen has a unique Social Security number).

  • Primary Key - uniquely identifies an observation in its own table
  • Foreign Key - uniquely identifies an observation in another table

One way to verify that a key is unique is by checking that the count() is not greater than 1.

> #count tailnum
> TailnumKey <- planes %>% 
+   count(tailnum) %>% filter(n>1)
> nrow(TailnumKey)
[1] 0

tailnum uniquely identifies a plane.

> #count by specific day/time and airport
> WeatherKey <- weather %>% count(year, month, day, hour, origin) %>% 
+   filter(n>1) 
> nrow(WeatherKey)
[1] 3

A specific time should have a unique weather recording for a specific airport, but it seems that there are 3 records with more than one. It could be an error. In this instance I will explore those 3 further and adjust the data accordingly.

> #view records with more than 1
> weather %>% count(year, month, day, hour, origin) %>% 
+   filter(n>1) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
year month day hour origin n
2013 11 3 1 EWR 2
2013 11 3 1 JFK 2
2013 11 3 1 LGA 2
> weather %>% 
+   filter(month==11,day==3,hour==1,origin %in% c("EWR","JFK","LGA"))%>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
origin year month day hour temp dewp humid wind_dir wind_speed wind_gust precip pressure visib time_hour
EWR 2013 11 3 1 51.98 39.02 61.15 310 6.90468 NA 0 1009.8 10 2013-11-03 01:00:00
EWR 2013 11 3 1 50.00 39.02 65.80 290 5.75390 NA 0 1010.5 10 2013-11-03 01:00:00
JFK 2013 11 3 1 53.96 37.94 54.51 320 9.20624 NA 0 1009.8 10 2013-11-03 01:00:00
JFK 2013 11 3 1 51.98 37.94 58.62 310 6.90468 NA 0 1010.5 10 2013-11-03 01:00:00
LGA 2013 11 3 1 55.04 39.02 54.67 330 9.20624 NA 0 1009.3 10 2013-11-03 01:00:00
LGA 2013 11 3 1 53.96 39.92 58.89 310 8.05546 NA 0 1010.2 10 2013-11-03 01:00:00

I decided to average those 3 days for each airport. Year, month, day, hour, and origin will now be a unique identifier.

> weather2 <- weather %>% 
+   group_by(year,month,day,hour,origin) %>% 
+   summarize(
+     across(temp:visib, mean),
+     across(time_hour, first)
+   )
> 
> weather2 %>% 
+   filter(month==11,day==3,hour==1,origin %in% c("EWR","JFK","LGA"))%>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
year month day hour origin temp dewp humid wind_dir wind_speed wind_gust precip pressure visib time_hour
2013 11 3 1 EWR 50.99 39.02 63.475 300 6.32929 NA 0 1010.15 10 2013-11-03 01:00:00
2013 11 3 1 JFK 52.97 37.94 56.565 315 8.05546 NA 0 1010.15 10 2013-11-03 01:00:00
2013 11 3 1 LGA 54.50 39.47 56.780 320 8.63085 NA 0 1009.75 10 2013-11-03 01:00:00
> WeatherKey <- weather2 %>% count(year, month, day, hour, origin) %>% 
+   filter(n>1)
> nrow(WeatherKey)
[1] 0

Sometimes there is no combination of variables that uniquely identifies an observation. In the flights data set neither the specific day and flight number nor the specific day and tailnumber are unique.

> # day and flight
> flights %>% 
+   count(year,month,day,flight) %>% 
+   filter(n>1) %>% head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
year month day flight n
2013 1 1 1 2
2013 1 1 3 2
2013 1 1 4 2
2013 1 1 11 3
2013 1 1 15 2
2013 1 1 21 2
> #day and tailnum
> flights %>% 
+   count(year,month,day,tailnum) %>% 
+   filter(n>1) %>% head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
year month day tailnum n
2013 1 1 N0EGMQ 2
2013 1 1 N11189 2
2013 1 1 N11536 2
2013 1 1 N11544 3
2013 1 1 N11551 2
2013 1 1 N12540 2

Keys - Exercises

  1. Add a surrogate key to flights.
> flights %>% mutate(id=row_number()) %>% 
+   select(month,day,flight,origin,dest,id)
# A tibble: 336,776 x 6
   month   day flight origin dest     id
   <int> <int>  <int> <chr>  <chr> <int>
 1     1     1   1545 EWR    IAH       1
 2     1     1   1714 LGA    IAH       2
 3     1     1   1141 JFK    MIA       3
 4     1     1    725 JFK    BQN       4
 5     1     1    461 LGA    ATL       5
 6     1     1   1696 EWR    ORD       6
 7     1     1    507 EWR    FLL       7
 8     1     1   5708 LGA    IAD       8
 9     1     1     79 JFK    MCO       9
10     1     1    301 LGA    ORD      10
# ... with 336,766 more rows
  1. Identify the keys in the following datasets:
  1. Lahman::Batting

  2. babyname::babynames

  3. nasaweather::atmos

  4. fueleconomy::vehicles

  5. ggplot2::diamonds

  • Each call of count() contains the keys. Each call of nrow() results in \(0\).
  • ggplot2 has no unique keys. Some rows are duplicates.
> Lahman::Batting %>%
+   count(playerID, yearID, stint) %>%
+   filter(n > 1) %>%
+   nrow()
> 
> babynames::babynames %>%
+   count(year, sex, name, wt=n()) %>%
+   filter(n > 1) %>%
+   nrow()
> 
> nasaweather::atmos %>%
+   count(lat, long, year, month) %>%
+   filter(n > 1) %>%
+   nrow()
> 
> fueleconomy::vehicles %>%
+   count(id) %>%
+   filter(n > 1) %>%
+   nrow()
> #No keys
> ggplot2::diamonds %>% distinct() %>%nrow()
[1] 53794
> nrow(ggplot2::diamonds)
[1] 53940

Mutating Joins


Similar to mutate(), a mutating join adds new variables to the right of a table. It does so by:

  • Matching keys between tables
  • Copying variables from one table to another

For example, if you want to add carrier name to the flights dataset you would join it to the airlines table using the carrier variable as your key.

It is easier to see when starting with a smaller dataset.

> #make a smaller data set
> flights2 <- flights %>% 
+   select(year:day,hour,origin,dest,tailnum,carrier)
> 
> #display first 6
> flights2 %>% head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
year month day hour origin dest tailnum carrier
2013 1 1 5 EWR IAH N14228 UA
2013 1 1 5 LGA IAH N24211 UA
2013 1 1 5 JFK MIA N619AA AA
2013 1 1 5 JFK BQN N804JB B6
2013 1 1 6 LGA ATL N668DN DL
2013 1 1 5 EWR ORD N39463 UA

We can see that the column name is now added to the right of the flights2 table.

> #add name using join
> flights2 %>% 
+   select(-origin,-dest) %>% 
+   left_join(airlines,by="carrier") %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
year month day hour tailnum carrier name
2013 1 1 5 N14228 UA United Air Lines Inc. 
2013 1 1 5 N24211 UA United Air Lines Inc. 
2013 1 1 5 N619AA AA American Airlines Inc. 
2013 1 1 5 N804JB B6 JetBlue Airways
2013 1 1 6 N668DN DL Delta Air Lines Inc. 
2013 1 1 5 N39463 UA United Air Lines Inc. 

We could achieve the same result using mutate(), but it is more difficult.

> #add name using mutate
> flights2 %>% 
+   select(-origin,-dest) %>% 
+   mutate(name=airlines$name[match(carrier,airlines$carrier)]) %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
year month day hour tailnum carrier name
2013 1 1 5 N14228 UA United Air Lines Inc. 
2013 1 1 5 N24211 UA United Air Lines Inc. 
2013 1 1 5 N619AA AA American Airlines Inc. 
2013 1 1 5 N804JB B6 JetBlue Airways
2013 1 1 6 N668DN DL Delta Air Lines Inc. 
2013 1 1 5 N39463 UA United Air Lines Inc. 

Join Types

There are four kinds of mutating joins

  • Inner Join - Matches observations where the keys are equal. Unmatched rows are not included in the result.
  • Left Join - Keeps all observations from the left table.
  • Right Join - Keeps all observations from the right table.
  • Full Join - Keeps all observations from both the left and right tables.

Some examples are provided below:

> #Sample tables
> x <- tribble(
+   ~key,~val_x,
+      1,"x1",
+      2,"x2",
+      3,"x3"
+ )
>   
> y <- tribble(
+   ~key,~val_y,
+      1,"y1",
+      2,"y2",
+      4,"y3"
+ )
key val_x
1 x1
2 x2
3 x3
key val_y
1 y1
2 y2
4 y3

Inner Join

> x %>% 
+   inner_join(y, by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff") 
key val_x val_y
1 x1 y1
2 x2 y2

Left Join

> x %>% 
+   left_join(y, by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
key val_x val_y
1 x1 y1
2 x2 y2
3 x3 NA

Right Join

> x %>% 
+   right_join(y, by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
key val_x val_y
1 x1 y1
2 x2 y2
4 NA y3

Full Join

> x %>% 
+   full_join(y, by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
key val_x val_y
1 x1 y1
2 x2 y2
3 x3 NA
4 NA y3

Duplicate Keys

It is possible to have duplicate keys in one or both tables. When both tables have duplicate keys the result includes all combinations (Cartesian product).

Some examples are provided below:

Left Join - Left table with duplicate key

> x <- tribble(
+   ~key,~val_x,
+      1,"x1",
+      2,"x2",
+      2,"x3",
+      1,"x4"
+ )
> 
> y <- tribble(
+   ~key,~val_y,
+      1,"y1",
+      2,"y2"
+ )
key val_x
1 x1
2 x2
2 x3
1 x4
key val_y
1 y1
2 y2



Left Join

> left_join(x,y,by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
key val_x val_y
1 x1 y1
2 x2 y2
2 x3 y2
1 x4 y1

Left Join - Both tables with duplicate key

> x <- tribble(
+   ~key,~val_x,
+      1,"x1",
+      2,"x2",
+      2,"x3",
+      3,"x4"
+ )
> 
> y <- tribble(
+   ~key,~val_y,
+      1,"y1",
+      2,"y2",
+      2,"y3",
+      3,"y4"
+ )
key val_x
1 x1
2 x2
2 x3
3 x4
key val_y
1 y1
2 y2
2 y3
3 y4

Left Join

> left_join(x,y,by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
key val_x val_y
1 x1 y1
2 x2 y2
2 x2 y3
2 x3 y2
2 x3 y3
3 x4 y4

Defining Key Columns

There are several ways to specify the key:

  • Default - when no key is specified it will match the tables using their common variables
  • Single Character Vector - when by = "x" is used the tables will be matched on that common variable.
  • Named Character Vector - when by = c("a" = "b") is used the variable a in table x will be matched to the variable b in table y.

Some examples are provided below:

Default - a message displays the common variables. In this case year, month, day, hour, origin.

> flights2 %>% 
+   left_join(weather2) %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
Joining, by = c("year", "month", "day", "hour", "origin")
year month day hour origin dest tailnum carrier temp dewp humid wind_dir wind_speed wind_gust precip pressure visib time_hour
2013 1 1 5 EWR IAH N14228 UA 39.02 28.04 64.43 260 12.65858 NA 0 1011.9 10 2013-01-01 05:00:00
2013 1 1 5 LGA IAH N24211 UA 39.92 24.98 54.81 250 14.96014 21.86482 0 1011.4 10 2013-01-01 05:00:00
2013 1 1 5 JFK MIA N619AA AA 39.02 26.96 61.63 260 14.96014 NA 0 1012.1 10 2013-01-01 05:00:00
2013 1 1 5 JFK BQN N804JB B6 39.02 26.96 61.63 260 14.96014 NA 0 1012.1 10 2013-01-01 05:00:00
2013 1 1 6 LGA ATL N668DN DL 39.92 24.98 54.81 260 16.11092 23.01560 0 1011.7 10 2013-01-01 06:00:00
2013 1 1 5 EWR ORD N39463 UA 39.02 28.04 64.43 260 12.65858 NA 0 1011.9 10 2013-01-01 05:00:00

Single Character Vector - Each table has a year variable, but they are not constrained to be equal. They output labels them as year.x and year.y.

> flights2 %>% 
+   left_join(planes, by="tailnum") %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
year.x month day hour origin dest tailnum carrier year.y type manufacturer model engines seats speed engine
2013 1 1 5 EWR IAH N14228 UA 1999 Fixed wing multi engine BOEING 737-824 2 149 NA Turbo-fan
2013 1 1 5 LGA IAH N24211 UA 1998 Fixed wing multi engine BOEING 737-824 2 149 NA Turbo-fan
2013 1 1 5 JFK MIA N619AA AA 1990 Fixed wing multi engine BOEING 757-223 2 178 NA Turbo-fan
2013 1 1 5 JFK BQN N804JB B6 2012 Fixed wing multi engine AIRBUS A320-232 2 200 NA Turbo-fan
2013 1 1 6 LGA ATL N668DN DL 1991 Fixed wing multi engine BOEING 757-232 2 178 NA Turbo-fan
2013 1 1 5 EWR ORD N39463 UA 2012 Fixed wing multi engine BOEING 737-924ER 2 191 NA Turbo-fan

Named Character Vector - the airports table includes latitude and longitude. You would need to specify which airport you are interested in - the destination or the origin.

> flights2 %>% 
+   left_join(airports, c("dest" = "faa")) %>% 
+   head %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
year month day hour origin dest tailnum carrier name lat lon alt tz dst tzone
2013 1 1 5 EWR IAH N14228 UA George Bush Intercontinental 29.98443 -95.34144 97 -6 A America/Chicago
2013 1 1 5 LGA IAH N24211 UA George Bush Intercontinental 29.98443 -95.34144 97 -6 A America/Chicago
2013 1 1 5 JFK MIA N619AA AA Miami Intl 25.79325 -80.29056 8 -5 A America/New_York
2013 1 1 5 JFK BQN N804JB B6 NA NA NA NA NA NA NA
2013 1 1 6 LGA ATL N668DN DL Hartsfield Jackson Atlanta Intl 33.63672 -84.42807 1026 -5 A America/New_York
2013 1 1 5 EWR ORD N39463 UA Chicago Ohare Intl 41.97860 -87.90484 668 -6 A America/Chicago
> flights2 %>% 
+   left_join(airports, c("origin" = "faa")) %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
year month day hour origin dest tailnum carrier name lat lon alt tz dst tzone
2013 1 1 5 EWR IAH N14228 UA Newark Liberty Intl 40.69250 -74.16867 18 -5 A America/New_York
2013 1 1 5 LGA IAH N24211 UA La Guardia 40.77725 -73.87261 22 -5 A America/New_York
2013 1 1 5 JFK MIA N619AA AA John F Kennedy Intl 40.63975 -73.77893 13 -5 A America/New_York
2013 1 1 5 JFK BQN N804JB B6 John F Kennedy Intl 40.63975 -73.77893 13 -5 A America/New_York
2013 1 1 6 LGA ATL N668DN DL La Guardia 40.77725 -73.87261 22 -5 A America/New_York
2013 1 1 5 EWR ORD N39463 UA Newark Liberty Intl 40.69250 -74.16867 18 -5 A America/New_York

Mutating Joins - Exercises

  1. Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays.
> (avg_delay <- 
+ flights %>% filter(!is.na(arr_delay)) %>% group_by(dest) %>% 
+   summarize(delay_mean=mean(arr_delay)) %>% 
+   inner_join(airports,by=c("dest"="faa")))
# A tibble: 100 x 9
   dest  delay_mean name                 lat    lon   alt    tz dst   tzone     
   <chr>      <dbl> <chr>              <dbl>  <dbl> <dbl> <dbl> <chr> <chr>     
 1 ABQ         4.38 Albuquerque Inter~  35.0 -107.   5355    -7 A     America/D~
 2 ACK         4.85 Nantucket Mem       41.3  -70.1    48    -5 A     America/N~
 3 ALB        14.4  Albany Intl         42.7  -73.8   285    -5 A     America/N~
 4 ANC        -2.5  Ted Stevens Ancho~  61.2 -150.    152    -9 A     America/A~
 5 ATL        11.3  Hartsfield Jackso~  33.6  -84.4  1026    -5 A     America/N~
 6 AUS         6.02 Austin Bergstrom ~  30.2  -97.7   542    -6 A     America/C~
 7 AVL         8.00 Asheville Regiona~  35.4  -82.5  2165    -5 A     America/N~
 8 BDL         7.05 Bradley Intl        41.9  -72.7   173    -5 A     America/N~
 9 BGR         8.03 Bangor Intl         44.8  -68.8   192    -5 A     America/N~
10 BHM        16.9  Birmingham Intl     33.6  -86.8   644    -6 A     America/C~
# ... with 90 more rows
> #remove Hawaii and Alaska, for  better plotting 
> avg_delay <- avg_delay %>% filter(lat<55 & lat>22)
> 
> avg_delay %>% ggplot()+
+   borders("state")+coord_quickmap()+
+   geom_point(aes(x=lon, y=lat, color=delay_mean))+ 
+   scale_color_gradient(low="blue", high="red")+
+   theme_minimal()

  1. Add the location of the origin and destination to flights.
> flights %>%
+   left_join(airports,by = c("origin" = "faa")) %>%
+   left_join(airports,by = c("dest" = "faa"),
+     suffix = c("_origin", "_dest")) %>% 
+   select(year:day, hour, origin, dest, starts_with("lat"),
+          starts_with("lon"))
# A tibble: 336,776 x 10
    year month   day  hour origin dest  lat_origin lat_dest lon_origin lon_dest
   <int> <int> <int> <dbl> <chr>  <chr>      <dbl>    <dbl>      <dbl>    <dbl>
 1  2013     1     1     5 EWR    IAH         40.7     30.0      -74.2    -95.3
 2  2013     1     1     5 LGA    IAH         40.8     30.0      -73.9    -95.3
 3  2013     1     1     5 JFK    MIA         40.6     25.8      -73.8    -80.3
 4  2013     1     1     5 JFK    BQN         40.6     NA        -73.8     NA  
 5  2013     1     1     6 LGA    ATL         40.8     33.6      -73.9    -84.4
 6  2013     1     1     5 EWR    ORD         40.7     42.0      -74.2    -87.9
 7  2013     1     1     6 EWR    FLL         40.7     26.1      -74.2    -80.2
 8  2013     1     1     6 LGA    IAD         40.8     38.9      -73.9    -77.5
 9  2013     1     1     6 JFK    MCO         40.6     28.4      -73.8    -81.3
10  2013     1     1     6 LGA    ORD         40.8     42.0      -73.9    -87.9
# ... with 336,766 more rows
  1. Is there a relationship between the age of planes and its delays?
  • The average delay increases until a plane is about 10 years old and then it declines.
> (plane_age <- flights %>% filter(!is.na(arr_delay)) %>% 
+   left_join(planes, by="tailnum",suffix = c("_flights","_planes")) %>% 
+     mutate(age=2013-year_planes) %>% 
+     group_by(tailnum) %>% 
+     summarize(mean_delay=mean(arr_delay, na.rm=TRUE),
+               year=first(year_planes),age=first(age)) %>% 
+     filter(!is.na(age)) %>% 
+     group_by(age) %>% 
+     summarize(plane_delay=mean(mean_delay), n=n())
+   )
# A tibble: 46 x 3
     age plane_delay     n
   <dbl>       <dbl> <int>
 1     0       2.53     92
 2     1       0.732    95
 3     2       4.24     66
 4     3       3.06     48
 5     4       5.88     84
 6     5       4.59    147
 7     6       7.88    123
 8     7       7.64    126
 9     8      11.1     160
10     9      12.4     192
# ... with 36 more rows
> plane_age %>% ggplot()+
+   geom_point(aes(x=age,y=plane_delay, alpha=n, size=n))+
+   scale_x_continuous("Age of plane", breaks = seq(0, 60, by = 10))+
+   scale_y_continuous("Mean delay", breaks = seq(-20, 15, by = 5))

  1. What weather conditions make it more likely to see a delay?
  • The average delay tends to increase with precipitation, although most precip data points are below 0.121
  • The average delay increases as visibility decreases.
> (flight_weather <-
+     flights %>%
+     inner_join(weather, by = c("origin","year","month","day","hour")) %>%
+     mutate(precip_decile=cut_interval(precip,10)) %>% 
+     select(origin,month,day,dep_delay,precip,precip_decile) %>% 
+     group_by(precip_decile) %>% 
+     summarize(mean_delay=mean(dep_delay, na.rm=TRUE),
+               mean_precip=mean(precip,na.rm=TRUE),n=n())
+ )
# A tibble: 9 x 4
  precip_decile mean_delay mean_precip      n
  <fct>              <dbl>       <dbl>  <int>
1 [0,0.121]           12.4     0.00207 331848
2 (0.121,0.242]       40.8     0.175     2285
3 (0.242,0.363]       35.5     0.303      611
4 (0.363,0.484]       51.9     0.425      292
5 (0.484,0.605]       29.5     0.55        69
6 (0.605,0.726]       41.3     0.661       55
7 (0.726,0.847]       94.7     0.82        19
8 (0.847,0.968]       27.8     0.94        20
9 (1.09,1.21]        113.      1.21        21
> (flight_weather <-
+     flights %>%
+     inner_join(weather, by = c("origin","year","month","day","hour")) %>%
+     mutate(visib_decile=cut_interval(visib,n=10)) %>% 
+     select(origin,month,day,dep_delay,visib,visib_decile) %>% 
+     group_by(visib_decile) %>% 
+     summarize(mean_delay=mean(dep_delay, na.rm=TRUE),
+               mean_visib=mean(visib,na.rm=TRUE),
+               n=n())
+ )
# A tibble: 10 x 4
   visib_decile mean_delay mean_visib      n
   <fct>             <dbl>      <dbl>  <int>
 1 [0,1]              30.4      0.547   5449
 2 (1,2]              25.5      1.80    5255
 3 (2,3]              21.2      2.80    6058
 4 (3,4]              19.5      4       2253
 5 (4,5]              21.1      5       4809
 6 (5,6]              19.8      6       6260
 7 (6,7]              19.2      7       7347
 8 (7,8]              20.0      8       7614
 9 (8,9]              18.8      9      11436
10 (9,10]             11.0     10     278739
> ggplot(flight_weather)+ 
+   geom_point(aes(x=mean_visib, y=mean_delay, color=visib_decile),
+              size=3)+
+   theme(legend.position = "none") 

  1. What happened on June 13, 2013? Display the spatial pattern of delays.
  • Model and actress Joanna Krupa (34) weds businessman Romain Zago in Southern California, but I don’t think that affected flights😄.
  • There were major storms.
> (june_weather <- flights %>%
+   filter(year == 2013, month == 6, day == 13, !is.na(arr_delay)) %>%
+   group_by(dest) %>%
+   summarize(delay = mean(arr_delay)) %>%
+   inner_join(airports, by = c("dest" = "faa")))
# A tibble: 82 x 9
   dest  delay name                     lat    lon   alt    tz dst   tzone      
   <chr> <dbl> <chr>                  <dbl>  <dbl> <dbl> <dbl> <chr> <chr>      
 1 ABQ    64   Albuquerque Internati~  35.0 -107.   5355    -7 A     America/De~
 2 ACK     8   Nantucket Mem           41.3  -70.1    48    -5 A     America/Ne~
 3 ALB   -19   Albany Intl             42.7  -73.8   285    -5 A     America/Ne~
 4 ATL   109.  Hartsfield Jackson At~  33.6  -84.4  1026    -5 A     America/Ne~
 5 AUS    64.3 Austin Bergstrom Intl   30.2  -97.7   542    -6 A     America/Ch~
 6 AVL    90   Asheville Regional Ai~  35.4  -82.5  2165    -5 A     America/Ne~
 7 BDL    18.5 Bradley Intl            41.9  -72.7   173    -5 A     America/Ne~
 8 BGR   102   Bangor Intl             44.8  -68.8   192    -5 A     America/Ne~
 9 BHM   158   Birmingham Intl         33.6  -86.8   644    -6 A     America/Ch~
10 BNA    95.9 Nashville Intl          36.1  -86.7   599    -6 A     America/Ch~
# ... with 72 more rows
> june_weather <- june_weather %>% filter(lat<55 & lat>22)
>   
>   ggplot(june_weather) +
+   borders("state") + coord_quickmap() +
+   geom_point(aes(y = lat, x = lon, color = delay, size=delay)) +
+   scale_color_gradientn(colours = rainbow(5))

Other Implementations

dplyr can accomplish many of the same things as SQL

dplyr SQL
inner_join(x, y, by = “z”) SELECT * FROM x INNER JOIN y USING (z)
left_join(x, y, by = “z”) SELECT * FROM x LEFT OUTER JOIN y USING (z)
right_join(x, y, by = “z”) SELECT * FROM x RIGHT OUTER JOIN y USING (z)
full_join(x, y, by = “z”) SELECT * FROM x FULL OUTER JOIN y USING (z)
inner_join(x, y, by = c(“a” = “b”)) SELECT * FROM x INNER JOIN y ON x.a = y.b

Mutating Joins can also be accomplished using base R

dplyr merge
inner_join(x, y) merge(x, y)
left_join(x, y) merge(x, y, all.x = TRUE)
right_join(x, y) merge(x, y, all.y = TRUE)
full_join(x, y) merge(x, y, all.x = TRUE, all.y = TRUE)

Filtering Joins


Filtering Joins allow you to filter observations in one table based on matches in another table. There are two types:

  • semi_join(x, y) - keeps all observations from x that have a match in y
  • anti_join(x, y) - drops all observations from x that have a match in y

If we wanted to view all the flights to the top 10 destinations we could:

  • Create a table with the top 10 destinations.
  • Filter the flights dataset based on the newly created table.
> # Top 10 destinations
> top_dest <- flights %>% 
+   count(dest, sort=TRUE) %>% 
+   head(10)
> kable(top_dest) %>% 
+ kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
dest n
ORD 17283
ATL 17215
LAX 16174
BOS 15508
MCO 14082
CLT 14064
SFO 13331
FLL 12055
MIA 11728
DCA 9705
> # Filter flights based on top_dest
> flights %>% 
+   semi_join(top_dest) %>% 
+   head(10) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 8,position="left")%>%
+                 row_spec(0,background="peachpuff")
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time arr_delay carrier flight tailnum origin dest air_time distance hour minute time_hour
2013 1 1 542 540 2 923 850 33 AA 1141 N619AA JFK MIA 160 1089 5 40 2013-01-01 05:00:00
2013 1 1 554 600 -6 812 837 -25 DL 461 N668DN LGA ATL 116 762 6 0 2013-01-01 06:00:00
2013 1 1 554 558 -4 740 728 12 UA 1696 N39463 EWR ORD 150 719 5 58 2013-01-01 05:00:00
2013 1 1 555 600 -5 913 854 19 B6 507 N516JB EWR FLL 158 1065 6 0 2013-01-01 06:00:00
2013 1 1 557 600 -3 838 846 -8 B6 79 N593JB JFK MCO 140 944 6 0 2013-01-01 06:00:00
2013 1 1 558 600 -2 753 745 8 AA 301 N3ALAA LGA ORD 138 733 6 0 2013-01-01 06:00:00
2013 1 1 558 600 -2 924 917 7 UA 194 N29129 JFK LAX 345 2475 6 0 2013-01-01 06:00:00
2013 1 1 558 600 -2 923 937 -14 UA 1124 N53441 EWR SFO 361 2565 6 0 2013-01-01 06:00:00
2013 1 1 559 559 0 702 706 -4 B6 1806 N708JB JFK BOS 44 187 5 59 2013-01-01 05:00:00
2013 1 1 600 600 0 851 858 -7 B6 371 N595JB LGA FLL 152 1076 6 0 2013-01-01 06:00:00

Anti-Joins are helpful in determining which values do not match. There are many flights that don’t have a match in planes.

> flights %>% 
+   anti_join(planes, by="tailnum") %>% 
+   count(tailnum, sort=TRUE) %>% 
+   head(10) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
tailnum n
NA 2512
N725MQ 575
N722MQ 513
N723MQ 507
N713MQ 483
N735MQ 396
N0EGMQ 371
N534MQ 364
N542MQ 363
N531MQ 349

Filtering Joins - Exercises

  1. What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common?
  • You can see from the columns with NAs that these are canceled flights.
> flights %>% filter(is.na(tailnum)) %>% glimpse()
Rows: 2,512
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ day            <int> 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 9, 10, 10, 11, 12, 12,...
$ dep_time       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ sched_dep_time <int> 1545, 1601, 857, 645, 845, 1830, 840, 820, 1645, 755...
$ dep_delay      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ arr_time       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ sched_arr_time <int> 1910, 1735, 1209, 952, 1015, 2044, 1001, 958, 1838, ...
$ arr_delay      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ carrier        <chr> "AA", "UA", "UA", "UA", "9E", "9E", "9E", "9E", "US"...
$ flight         <int> 133, 623, 714, 719, 3405, 3716, 3422, 3317, 123, 402...
$ tailnum        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ origin         <chr> "JFK", "EWR", "EWR", "EWR", "JFK", "EWR", "JFK", "JF...
$ dest           <chr> "LAX", "ORD", "MIA", "DFW", "DCA", "DTW", "BOS", "BU...
$ air_time       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ distance       <dbl> 2475, 719, 1085, 1372, 213, 488, 187, 301, 529, 569,...
$ hour           <dbl> 15, 16, 8, 6, 8, 18, 8, 8, 16, 7, 12, 15, 7, 17, 9, ...
$ minute         <dbl> 45, 1, 57, 45, 45, 30, 40, 20, 45, 55, 51, 0, 0, 0, ...
$ time_hour      <dttm> 2013-01-02 15:00:00, 2013-01-02 16:00:00, 2013-01-0...
  1. Filter flights to only show flights with planes that have flown at least 100 flights.
> flights %>% filter(!is.na(tailnum)) %>% 
+   group_by(tailnum) %>% mutate(total=n()) %>% 
+   filter(total>=100)
# A tibble: 228,390 x 20
# Groups:   tailnum [1,217]
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      544            545        -1     1004           1022
 4  2013     1     1      554            558        -4      740            728
 5  2013     1     1      555            600        -5      913            854
 6  2013     1     1      557            600        -3      709            723
 7  2013     1     1      557            600        -3      838            846
 8  2013     1     1      558            600        -2      849            851
 9  2013     1     1      558            600        -2      853            856
10  2013     1     1      558            600        -2      923            937
# ... with 228,380 more rows, and 12 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
#   total <int>
  1. Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.
> library(fueleconomy)
> vehicles %>% semi_join(common, by = c("make", "model"))
# A tibble: 14,531 x 12
      id make  model   year class   trans  drive     cyl displ fuel    hwy   cty
   <dbl> <chr> <chr>  <dbl> <chr>   <chr>  <chr>   <dbl> <dbl> <chr> <dbl> <dbl>
 1  1833 Acura Integ~  1986 Subcom~ Autom~ Front-~     4   1.6 Regu~    28    22
 2  1834 Acura Integ~  1986 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
 3  3037 Acura Integ~  1987 Subcom~ Autom~ Front-~     4   1.6 Regu~    28    22
 4  3038 Acura Integ~  1987 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
 5  4183 Acura Integ~  1988 Subcom~ Autom~ Front-~     4   1.6 Regu~    27    22
 6  4184 Acura Integ~  1988 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
 7  5303 Acura Integ~  1989 Subcom~ Autom~ Front-~     4   1.6 Regu~    27    22
 8  5304 Acura Integ~  1989 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
 9  6442 Acura Integ~  1990 Subcom~ Autom~ Front-~     4   1.8 Regu~    24    20
10  6443 Acura Integ~  1990 Subcom~ Manua~ Front-~     4   1.8 Regu~    26    21
# ... with 14,521 more rows
  1. Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patters?
  • Rather than just taking any 48 hours, I chose to look at the two consecutive days with the greatest average delays.
  • July 22nd and 23rd had the largest delays, but it wasn’t a weather issue.
  • Southwest Airlines Flight 345 was a scheduled flight from Nashville International Airport, Tennessee, to New York City’s LaGuardia Airport. On July 22, 2013, the Boeing 737 operating the route suffered a front landing gear collapse while landing at LaGuardia Airport, injuring 9 people on board.
> (flights_48 <- flights %>% filter(!is.na(arr_delay)) %>% 
+   arrange(year,month,day) %>% 
+   group_by(year,month,day) %>% 
+   summarize(mean_delay=mean(arr_delay)) %>% 
+   mutate(mean_lag=lag(mean_delay),two_day=mean_lag+mean_delay,
+          dupe=lead(two_day)))
# A tibble: 365 x 7
# Groups:   year, month [12]
    year month   day mean_delay mean_lag two_day    dupe
   <int> <int> <int>      <dbl>    <dbl>   <dbl>   <dbl>
 1  2013     1     1     12.7     NA      NA      25.3  
 2  2013     1     2     12.7     12.7    25.3    18.4  
 3  2013     1     3      5.73    12.7    18.4     3.80 
 4  2013     1     4     -1.93     5.73    3.80   -3.46 
 5  2013     1     5     -1.53    -1.93   -3.46    2.71 
 6  2013     1     6      4.24    -1.53    2.71   -0.711
 7  2013     1     7     -4.95     4.24   -0.711  -8.17 
 8  2013     1     8     -3.23    -4.95   -8.17   -3.49 
 9  2013     1     9     -0.264   -3.23   -3.49   -6.16 
10  2013     1    10     -5.90    -0.264  -6.16  -10.7  
# ... with 355 more rows
> (one <- flights_48 %>% arrange(desc(two_day)) %>% head(1))
# A tibble: 1 x 7
# Groups:   year, month [1]
   year month   day mean_delay mean_lag two_day  dupe
  <int> <int> <int>      <dbl>    <dbl>   <dbl> <dbl>
1  2013     7    23       45.0     62.8    108.  59.0
> (two <- flights_48 %>% arrange(desc(dupe))%>% head(1))
# A tibble: 1 x 7
# Groups:   year, month [1]
   year month   day mean_delay mean_lag two_day  dupe
  <int> <int> <int>      <dbl>    <dbl>   <dbl> <dbl>
1  2013     7    22       62.8     15.5    78.2  108.
> (three <- bind_rows(two,one))
# A tibble: 2 x 7
# Groups:   year, month [1]
   year month   day mean_delay mean_lag two_day  dupe
  <int> <int> <int>      <dbl>    <dbl>   <dbl> <dbl>
1  2013     7    22       62.8     15.5    78.2 108. 
2  2013     7    23       45.0     62.8   108.   59.0
> weather %>% semi_join(three) %>% group_by(day) %>% 
+   summarize(avg_temp=mean(temp),
+             avg_wind_speed=mean(wind_speed),
+             avg_precip=mean(precip),
+             avg_visib=mean(visib))
# A tibble: 2 x 5
    day avg_temp avg_wind_speed avg_precip avg_visib
  <int>    <dbl>          <dbl>      <dbl>     <dbl>
1    22     78.5           7.96    0.00597      8.91
2    23     80.6          10.1     0.0154       9.33
  1. What does anti_join(flights,airports,by=c("dest"="faa")) tell you? What does anti_join(airports,flights,by=c("faa"="dest")) tell you?
> #flights that have a destination not listed in airports
> anti_join(flights,airports, by=c("dest"="faa")) %>% 
+   distinct(dest)
# A tibble: 4 x 1
  dest 
  <chr>
1 BQN  
2 SJU  
3 STT  
4 PSE  
> #airports that are not a destination in flights
> anti_join(airports,flights, by=c("faa"="dest")) 
# A tibble: 1,357 x 8
   faa   name                       lat    lon   alt    tz dst   tzone          
   <chr> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>          
 1 04G   Lansdowne Airport         41.1  -80.6  1044    -5 A     America/New_Yo~
 2 06A   Moton Field Municipal A~  32.5  -85.7   264    -6 A     America/Chicago
 3 06C   Schaumburg Regional       42.0  -88.1   801    -6 A     America/Chicago
 4 06N   Randall Airport           41.4  -74.4   523    -5 A     America/New_Yo~
 5 09J   Jekyll Island Airport     31.1  -81.4    11    -5 A     America/New_Yo~
 6 0A9   Elizabethton Municipal ~  36.4  -82.2  1593    -5 A     America/New_Yo~
 7 0G6   Williams County Airport   41.5  -84.5   730    -5 A     America/New_Yo~
 8 0G7   Finger Lakes Regional A~  42.9  -76.8   492    -5 A     America/New_Yo~
 9 0P2   Shoestring Aviation Air~  39.8  -76.6  1000    -5 U     America/New_Yo~
10 0S9   Jefferson County Intl     48.1 -123.    108    -8 A     America/Los_An~
# ... with 1,347 more rows
  1. You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis.
  • 17 planes were flown by more than one carrier
> (planes_carriers <- flights %>%
+   filter(!is.na(tailnum)) %>%
+   distinct(tailnum, carrier))
# A tibble: 4,060 x 2
   carrier tailnum
   <chr>   <chr>  
 1 UA      N14228 
 2 UA      N24211 
 3 AA      N619AA 
 4 B6      N804JB 
 5 DL      N668DN 
 6 UA      N39463 
 7 B6      N516JB 
 8 EV      N829AS 
 9 B6      N593JB 
10 AA      N3ALAA 
# ... with 4,050 more rows
> (planes_carriers %>% count(tailnum) %>%
+   filter(n > 1) %>% nrow())
[1] 17

Set Operations


There are also several set functions that can be helpful.

  • intersect(x, y) - only observations in both x and y
  • union(x, y) - unique observations in x and y
  • setdiff(x, y) - observations in x but not y
> df1 <- tribble(
+   ~x, ~y,
+    1,  1,
+    2,  1
+ )
> 
> df2 <- tribble(
+   ~x, ~y,
+    1,  1,
+    1,  2
+ )
x y
1 1
2 1
x y
1 1
1 2
> intersect(df1,df2) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
x y
1 1
> union(df1,df2) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
x y
1 1
2 1
1 2
> setdiff(df1,df2) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
x y
2 1
> setdiff(df2,df1) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")
x y
1 2