The Datasets

The nycflights13 package contains the flights dataset along with four related datasets: airlines, airports, planes, weather.

> library(tidyverse) #load dplyr
> library(nycflights13) #load datasets
> library(knitr) #load library for table formatting
> library(kableExtra) #load library for table formatting
> 
> # flights dataset
> kable(head(flights)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 8,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	dep_time	sched_dep_time	dep_delay	arr_time	sched_arr_time	arr_delay	carrier	flight	tailnum	origin	dest	air_time	distance	hour	minute	time_hour
2013	1	1	517	515	2	830	819	11	UA	1545	N14228	EWR	IAH	227	1400	5	15	2013-01-01 05:00:00
2013	1	1	533	529	4	850	830	20	UA	1714	N24211	LGA	IAH	227	1416	5	29	2013-01-01 05:00:00
2013	1	1	542	540	2	923	850	33	AA	1141	N619AA	JFK	MIA	160	1089	5	40	2013-01-01 05:00:00
2013	1	1	544	545	-1	1004	1022	-18	B6	725	N804JB	JFK	BQN	183	1576	5	45	2013-01-01 05:00:00
2013	1	1	554	600	-6	812	837	-25	DL	461	N668DN	LGA	ATL	116	762	6	0	2013-01-01 06:00:00
2013	1	1	554	558	-4	740	728	12	UA	1696	N39463	EWR	ORD	150	719	5	58	2013-01-01 05:00:00

The flights dataset has 336776 rows and 19 columns.

> # airlines dataset
> kable(head(airlines)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

carrier	name
9E	Endeavor Air Inc.
AA	American Airlines Inc.
AS	Alaska Airlines Inc.
B6	JetBlue Airways
DL	Delta Air Lines Inc.
EV	ExpressJet Airlines Inc.

The airlines dataset has 16 rows and 2 columns.

> # airports dataset
> kable(head(airports)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

faa	name	lat	lon	alt	tz	dst	tzone
04G	Lansdowne Airport	41.13047	-80.61958	1044	-5	A	America/New_York
06A	Moton Field Municipal Airport	32.46057	-85.68003	264	-6	A	America/Chicago
06C	Schaumburg Regional	41.98934	-88.10124	801	-6	A	America/Chicago
06N	Randall Airport	41.43191	-74.39156	523	-5	A	America/New_York
09J	Jekyll Island Airport	31.07447	-81.42778	11	-5	A	America/New_York
0A9	Elizabethton Municipal Airport	36.37122	-82.17342	1593	-5	A	America/New_York

The airports dataset has 1458 rows and 8 columns.

> # planes dataset
> kable(head(planes)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

tailnum	year	type	manufacturer	model	engines	seats	speed	engine
N10156	2004	Fixed wing multi engine	EMBRAER	EMB-145XR	2	55	NA	Turbo-fan
N102UW	1998	Fixed wing multi engine	AIRBUS INDUSTRIE	A320-214	2	182	NA	Turbo-fan
N103US	1999	Fixed wing multi engine	AIRBUS INDUSTRIE	A320-214	2	182	NA	Turbo-fan
N104UW	1999	Fixed wing multi engine	AIRBUS INDUSTRIE	A320-214	2	182	NA	Turbo-fan
N10575	2002	Fixed wing multi engine	EMBRAER	EMB-145LR	2	55	NA	Turbo-fan
N105UW	1999	Fixed wing multi engine	AIRBUS INDUSTRIE	A320-214	2	182	NA	Turbo-fan

The planes dataset has 3322 rows and 9 columns.

> # weather dataset
> kable(head(weather)) %>%
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

origin	year	month	day	hour	temp	dewp	humid	wind_dir	wind_speed	wind_gust	pressure	visib	time_hour
EWR	2013	1	1	1	39.02	26.06	59.37	270	10.35702	NA	1012.0	10	2013-01-01 01:00:00
EWR	2013	1	1	2	39.02	26.96	61.63	250	8.05546	NA	1012.3	10	2013-01-01 02:00:00
EWR	2013	1	1	3	39.02	28.04	64.43	240	11.50780	NA	1012.5	10	2013-01-01 03:00:00
EWR	2013	1	1	4	39.92	28.04	62.21	250	12.65858	NA	1012.2	10	2013-01-01 04:00:00
EWR	2013	1	1	5	39.02	28.04	64.43	260	12.65858	NA	1011.9	10	2013-01-01 05:00:00
EWR	2013	1	1	6	37.94	28.04	67.21	240	11.50780	NA	1012.4	10	2013-01-01 06:00:00

The weather dataset has 26115 rows and 15 columns.

The Datasets - Exercises

Imagine you wanted to draw the route each plane flies from its origin to its destination. What variables would you need? What tables would you need to combine?

Variables - longitude and latitude for the origin and destination.
Tables - combine flights with airports.

> library(tidyverse)
> library(nycflights13)
> 
> flights_route <- flights %>%
+   inner_join(airports,by = c("origin"="faa")) %>% 
+   inner_join(airports,by = c("dest"="faa"))
> 
> (flights_route <- flights_route %>% 
+   select(origin,lat.x,lon.x,dest,lat.y,lon.y) %>% 
+   head(100))

# A tibble: 100 x 6
   origin lat.x lon.x dest  lat.y lon.y
   <chr>  <dbl> <dbl> <chr> <dbl> <dbl>
 1 EWR     40.7 -74.2 IAH    30.0 -95.3
 2 LGA     40.8 -73.9 IAH    30.0 -95.3
 3 JFK     40.6 -73.8 MIA    25.8 -80.3
 4 LGA     40.8 -73.9 ATL    33.6 -84.4
 5 EWR     40.7 -74.2 ORD    42.0 -87.9
 6 EWR     40.7 -74.2 FLL    26.1 -80.2
 7 LGA     40.8 -73.9 IAD    38.9 -77.5
 8 JFK     40.6 -73.8 MCO    28.4 -81.3
 9 LGA     40.8 -73.9 ORD    42.0 -87.9
10 JFK     40.6 -73.8 PBI    26.7 -80.1
# ... with 90 more rows

> flights_route %>%
+   ggplot() +
+   borders("state") + coord_quickmap() +
+   geom_segment(aes(
+     x = lon.x, xend = lon.y,
+     y = lat.x, yend = lat.y,
+     color=dest),
+     arrow = arrow(length = unit(0.1, "cm"))) +
+   labs(x="longitude",y="latitude")+ 
+   theme_minimal()+
+   theme(legend.position = "none")

I forgot to draw the relationship between weather and airports. What is the relationship and how should it appear in the diagram?

They would be connected by weather$origin = airports$faa.

> left_join(weather, airports,by=c("origin"="faa"))

# A tibble: 26,115 x 22
   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed
   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>
 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4 
 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06
 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5 
 4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7 
 5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7 
 6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5 
 7 EWR     2013     1     1     7  39.0  28.0  64.4      240      15.0 
 8 EWR     2013     1     1     8  39.9  28.0  62.2      250      10.4 
 9 EWR     2013     1     1     9  39.9  28.0  62.2      260      15.0 
10 EWR     2013     1     1    10  41    28.0  59.6      260      13.8 
# ... with 26,105 more rows, and 12 more variables: wind_gust <dbl>,
#   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, name <chr>,
#   lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>

weather only contains information for the origin (NYC) airports. If it contained weather records for all airports in the USA, what additional relation would it define with flights?

Right now you can only connect it to flights by year, month, day, origin.
With records for all airports you could connect it by year, month, day, dest.

We know that some of the days of the year are “special,” and fewer than usual fly on them. How might you represent that data as a data frame? What would be the primary keys of that table? How would it connect to the existing tables?

Primary keys - year, month, day.

> (special_days <- tribble(
+   ~year, ~month, ~day, ~holiday,
+   2013, 01, 01, "New Years Day",
+   2013, 07, 04, "Independence Day",
+   2013, 11, 29, "Thanksgiving Day",
+   2013, 12, 25, "Christmas Day"
+ ))

# A tibble: 4 x 4
   year month   day holiday         
  <dbl> <dbl> <dbl> <chr>           
1  2013     1     1 New Years Day   
2  2013     7     4 Independence Day
3  2013    11    29 Thanksgiving Day
4  2013    12    25 Christmas Day

Keys

Keys are used to connect each table and they uniquely identify an observation (example: every US citizen has a unique Social Security number).

Primary Key - uniquely identifies an observation in its own table
Foreign Key - uniquely identifies an observation in another table

One way to verify that a key is unique is by checking that the count() is not greater than 1.

> #count tailnum
> TailnumKey <- planes %>% 
+   count(tailnum) %>% filter(n>1)
> nrow(TailnumKey)

[1] 0

tailnum uniquely identifies a plane.

> #count by specific day/time and airport
> WeatherKey <- weather %>% count(year, month, day, hour, origin) %>% 
+   filter(n>1) 
> nrow(WeatherKey)

[1] 3

A specific time should have a unique weather recording for a specific airport, but it seems that there are 3 records with more than one. It could be an error. In this instance I will explore those 3 further and adjust the data accordingly.

> #view records with more than 1
> weather %>% count(year, month, day, hour, origin) %>% 
+   filter(n>1) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	hour	origin	n
2013	11	3	1	EWR	2
2013	11	3	1	JFK	2
2013	11	3	1	LGA	2

> weather %>% 
+   filter(month==11,day==3,hour==1,origin %in% c("EWR","JFK","LGA"))%>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

origin	year	month	day	hour	temp	dewp	humid	wind_dir	wind_speed	wind_gust	pressure	visib	time_hour
EWR	2013	11	3	1	51.98	39.02	61.15	310	6.90468	NA	1009.8	10	2013-11-03 01:00:00
EWR	2013	11	3	1	50.00	39.02	65.80	290	5.75390	NA	1010.5	10	2013-11-03 01:00:00
JFK	2013	11	3	1	53.96	37.94	54.51	320	9.20624	NA	1009.8	10	2013-11-03 01:00:00
JFK	2013	11	3	1	51.98	37.94	58.62	310	6.90468	NA	1010.5	10	2013-11-03 01:00:00
LGA	2013	11	3	1	55.04	39.02	54.67	330	9.20624	NA	1009.3	10	2013-11-03 01:00:00
LGA	2013	11	3	1	53.96	39.92	58.89	310	8.05546	NA	1010.2	10	2013-11-03 01:00:00

I decided to average those 3 days for each airport. Year, month, day, hour, and origin will now be a unique identifier.

> weather2 <- weather %>% 
+   group_by(year,month,day,hour,origin) %>% 
+   summarize(
+     across(temp:visib, mean),
+     across(time_hour, first)
+   )
> 
> weather2 %>% 
+   filter(month==11,day==3,hour==1,origin %in% c("EWR","JFK","LGA"))%>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	hour	origin	temp	dewp	humid	wind_dir	wind_speed	wind_gust	pressure	visib	time_hour
2013	11	3	1	EWR	50.99	39.02	63.475	300	6.32929	NA	1010.15	10	2013-11-03 01:00:00
2013	11	3	1	JFK	52.97	37.94	56.565	315	8.05546	NA	1010.15	10	2013-11-03 01:00:00
2013	11	3	1	LGA	54.50	39.47	56.780	320	8.63085	NA	1009.75	10	2013-11-03 01:00:00

> WeatherKey <- weather2 %>% count(year, month, day, hour, origin) %>% 
+   filter(n>1)
> nrow(WeatherKey)

[1] 0

Sometimes there is no combination of variables that uniquely identifies an observation. In the flights data set neither the specific day and flight number nor the specific day and tailnumber are unique.

> # day and flight
> flights %>% 
+   count(year,month,day,flight) %>% 
+   filter(n>1) %>% head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	flight	n
2013	1	1	1	2
2013	1	1	3	2
2013	1	1	4	2
2013	1	1	11	3
2013	1	1	15	2
2013	1	1	21	2

> #day and tailnum
> flights %>% 
+   count(year,month,day,tailnum) %>% 
+   filter(n>1) %>% head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	tailnum	n
2013	1	1	N0EGMQ	2
2013	1	1	N11189	2
2013	1	1	N11536	2
2013	1	1	N11544	3
2013	1	1	N11551	2
2013	1	1	N12540	2

Keys - Exercises

Add a surrogate key to flights.

> flights %>% mutate(id=row_number()) %>% 
+   select(month,day,flight,origin,dest,id)

# A tibble: 336,776 x 6
   month   day flight origin dest     id
   <int> <int>  <int> <chr>  <chr> <int>
 1     1     1   1545 EWR    IAH       1
 2     1     1   1714 LGA    IAH       2
 3     1     1   1141 JFK    MIA       3
 4     1     1    725 JFK    BQN       4
 5     1     1    461 LGA    ATL       5
 6     1     1   1696 EWR    ORD       6
 7     1     1    507 EWR    FLL       7
 8     1     1   5708 LGA    IAD       8
 9     1     1     79 JFK    MCO       9
10     1     1    301 LGA    ORD      10
# ... with 336,766 more rows

Identify the keys in the following datasets:

Lahman::Batting
babyname::babynames
nasaweather::atmos
fueleconomy::vehicles
ggplot2::diamonds

Each call of count() contains the keys. Each call of nrow() results in $0$.
ggplot2 has no unique keys. Some rows are duplicates.

> Lahman::Batting %>%
+   count(playerID, yearID, stint) %>%
+   filter(n > 1) %>%
+   nrow()
> 
> babynames::babynames %>%
+   count(year, sex, name, wt=n()) %>%
+   filter(n > 1) %>%
+   nrow()
> 
> nasaweather::atmos %>%
+   count(lat, long, year, month) %>%
+   filter(n > 1) %>%
+   nrow()
> 
> fueleconomy::vehicles %>%
+   count(id) %>%
+   filter(n > 1) %>%
+   nrow()

> #No keys
> ggplot2::diamonds %>% distinct() %>%nrow()

[1] 53794

> nrow(ggplot2::diamonds)

[1] 53940

Mutating Joins

Similar to mutate(), a mutating join adds new variables to the right of a table. It does so by:

Matching keys between tables
Copying variables from one table to another

For example, if you want to add carrier name to the flights dataset you would join it to the airlines table using the carrier variable as your key.

It is easier to see when starting with a smaller dataset.

> #make a smaller data set
> flights2 <- flights %>% 
+   select(year:day,hour,origin,dest,tailnum,carrier)
> 
> #display first 6
> flights2 %>% head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	hour	origin	dest	tailnum	carrier
2013	1	1	5	EWR	IAH	N14228	UA
2013	1	1	5	LGA	IAH	N24211	UA
2013	1	1	5	JFK	MIA	N619AA	AA
2013	1	1	5	JFK	BQN	N804JB	B6
2013	1	1	6	LGA	ATL	N668DN	DL
2013	1	1	5	EWR	ORD	N39463	UA

We can see that the column name is now added to the right of the flights2 table.

> #add name using join
> flights2 %>% 
+   select(-origin,-dest) %>% 
+   left_join(airlines,by="carrier") %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	hour	tailnum	carrier	name
2013	1	1	5	N14228	UA	United Air Lines Inc.
2013	1	1	5	N24211	UA	United Air Lines Inc.
2013	1	1	5	N619AA	AA	American Airlines Inc.
2013	1	1	5	N804JB	B6	JetBlue Airways
2013	1	1	6	N668DN	DL	Delta Air Lines Inc.
2013	1	1	5	N39463	UA	United Air Lines Inc.

We could achieve the same result using mutate(), but it is more difficult.

> #add name using mutate
> flights2 %>% 
+   select(-origin,-dest) %>% 
+   mutate(name=airlines$name[match(carrier,airlines$carrier)]) %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	hour	tailnum	carrier	name
2013	1	1	5	N14228	UA	United Air Lines Inc.
2013	1	1	5	N24211	UA	United Air Lines Inc.
2013	1	1	5	N619AA	AA	American Airlines Inc.
2013	1	1	5	N804JB	B6	JetBlue Airways
2013	1	1	6	N668DN	DL	Delta Air Lines Inc.
2013	1	1	5	N39463	UA	United Air Lines Inc.

Join Types

There are four kinds of mutating joins

Inner Join - Matches observations where the keys are equal. Unmatched rows are not included in the result.
Left Join - Keeps all observations from the left table.
Right Join - Keeps all observations from the right table.
Full Join - Keeps all observations from both the left and right tables.

Some examples are provided below:

> #Sample tables
> x <- tribble(
+   ~key,~val_x,
+      1,"x1",
+      2,"x2",
+      3,"x3"
+ )
>   
> y <- tribble(
+   ~key,~val_y,
+      1,"y1",
+      2,"y2",
+      4,"y3"
+ )

key	val_x
1	x1
2	x2
3	x3

key	val_y
1	y1
2	y2
4	y3

Inner Join

> x %>% 
+   inner_join(y, by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

key	val_x	val_y
1	x1	y1
2	x2	y2

Left Join

> x %>% 
+   left_join(y, by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

key	val_x	val_y
1	x1	y1
2	x2	y2
3	x3	NA

Right Join

> x %>% 
+   right_join(y, by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

key	val_x	val_y
1	x1	y1
2	x2	y2
4	NA	y3

Full Join

> x %>% 
+   full_join(y, by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

key	val_x	val_y
1	x1	y1
2	x2	y2
3	x3	NA
4	NA	y3

Duplicate Keys

It is possible to have duplicate keys in one or both tables. When both tables have duplicate keys the result includes all combinations (Cartesian product).

Some examples are provided below:

Left Join - Left table with duplicate key

> x <- tribble(
+   ~key,~val_x,
+      1,"x1",
+      2,"x2",
+      2,"x3",
+      1,"x4"
+ )
> 
> y <- tribble(
+   ~key,~val_y,
+      1,"y1",
+      2,"y2"
+ )

key	val_x
1	x1
2	x2
2	x3
1	x4

key	val_y
1	y1
2	y2

Left Join

> left_join(x,y,by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

key	val_x	val_y
1	x1	y1
2	x2	y2
2	x3	y2
1	x4	y1

Left Join - Both tables with duplicate key

> x <- tribble(
+   ~key,~val_x,
+      1,"x1",
+      2,"x2",
+      2,"x3",
+      3,"x4"
+ )
> 
> y <- tribble(
+   ~key,~val_y,
+      1,"y1",
+      2,"y2",
+      2,"y3",
+      3,"y4"
+ )

key	val_x
1	x1
2	x2
2	x3
3	x4

key	val_y
1	y1
2	y2
2	y3
3	y4

Left Join

> left_join(x,y,by="key") %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

key	val_x	val_y
1	x1	y1
2	x2	y2
2	x2	y3
2	x3	y2
2	x3	y3
3	x4	y4

Defining Key Columns

There are several ways to specify the key:

Default - when no key is specified it will match the tables using their common variables
Single Character Vector - when by = "x" is used the tables will be matched on that common variable.
Named Character Vector - when by = c("a" = "b") is used the variable a in table x will be matched to the variable b in table y.

Some examples are provided below:

Default - a message displays the common variables. In this case year, month, day, hour, origin.

> flights2 %>% 
+   left_join(weather2) %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

Joining, by = c("year", "month", "day", "hour", "origin")

year	month	day	hour	origin	dest	tailnum	carrier	temp	dewp	humid	wind_dir	wind_speed	wind_gust	pressure	visib	time_hour
2013	1	1	5	EWR	IAH	N14228	UA	39.02	28.04	64.43	260	12.65858	NA	1011.9	10	2013-01-01 05:00:00
2013	1	1	5	LGA	IAH	N24211	UA	39.92	24.98	54.81	250	14.96014	21.86482	1011.4	10	2013-01-01 05:00:00
2013	1	1	5	JFK	MIA	N619AA	AA	39.02	26.96	61.63	260	14.96014	NA	1012.1	10	2013-01-01 05:00:00
2013	1	1	5	JFK	BQN	N804JB	B6	39.02	26.96	61.63	260	14.96014	NA	1012.1	10	2013-01-01 05:00:00
2013	1	1	6	LGA	ATL	N668DN	DL	39.92	24.98	54.81	260	16.11092	23.01560	1011.7	10	2013-01-01 06:00:00
2013	1	1	5	EWR	ORD	N39463	UA	39.02	28.04	64.43	260	12.65858	NA	1011.9	10	2013-01-01 05:00:00

Single Character Vector - Each table has a year variable, but they are not constrained to be equal. They output labels them as year.x and year.y.

> flights2 %>% 
+   left_join(planes, by="tailnum") %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year.x	month	day	hour	origin	dest	tailnum	carrier	year.y	type	manufacturer	model	engines	seats	speed	engine
2013	1	1	5	EWR	IAH	N14228	UA	1999	Fixed wing multi engine	BOEING	737-824	2	149	NA	Turbo-fan
2013	1	1	5	LGA	IAH	N24211	UA	1998	Fixed wing multi engine	BOEING	737-824	2	149	NA	Turbo-fan
2013	1	1	5	JFK	MIA	N619AA	AA	1990	Fixed wing multi engine	BOEING	757-223	2	178	NA	Turbo-fan
2013	1	1	5	JFK	BQN	N804JB	B6	2012	Fixed wing multi engine	AIRBUS	A320-232	2	200	NA	Turbo-fan
2013	1	1	6	LGA	ATL	N668DN	DL	1991	Fixed wing multi engine	BOEING	757-232	2	178	NA	Turbo-fan
2013	1	1	5	EWR	ORD	N39463	UA	2012	Fixed wing multi engine	BOEING	737-924ER	2	191	NA	Turbo-fan

Named Character Vector - the airports table includes latitude and longitude. You would need to specify which airport you are interested in - the destination or the origin.

> flights2 %>% 
+   left_join(airports, c("dest" = "faa")) %>% 
+   head %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	hour	origin	dest	tailnum	carrier	name	lat	lon	alt	tz	dst	tzone
2013	1	1	5	EWR	IAH	N14228	UA	George Bush Intercontinental	29.98443	-95.34144	97	-6	A	America/Chicago
2013	1	1	5	LGA	IAH	N24211	UA	George Bush Intercontinental	29.98443	-95.34144	97	-6	A	America/Chicago
2013	1	1	5	JFK	MIA	N619AA	AA	Miami Intl	25.79325	-80.29056	8	-5	A	America/New_York
2013	1	1	5	JFK	BQN	N804JB	B6	NA	NA	NA	NA	NA	NA	NA
2013	1	1	6	LGA	ATL	N668DN	DL	Hartsfield Jackson Atlanta Intl	33.63672	-84.42807	1026	-5	A	America/New_York
2013	1	1	5	EWR	ORD	N39463	UA	Chicago Ohare Intl	41.97860	-87.90484	668	-6	A	America/Chicago

> flights2 %>% 
+   left_join(airports, c("origin" = "faa")) %>% 
+   head() %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	hour	origin	dest	tailnum	carrier	name	lat	lon	alt	tz	dst	tzone
2013	1	1	5	EWR	IAH	N14228	UA	Newark Liberty Intl	40.69250	-74.16867	18	-5	A	America/New_York
2013	1	1	5	LGA	IAH	N24211	UA	La Guardia	40.77725	-73.87261	22	-5	A	America/New_York
2013	1	1	5	JFK	MIA	N619AA	AA	John F Kennedy Intl	40.63975	-73.77893	13	-5	A	America/New_York
2013	1	1	5	JFK	BQN	N804JB	B6	John F Kennedy Intl	40.63975	-73.77893	13	-5	A	America/New_York
2013	1	1	6	LGA	ATL	N668DN	DL	La Guardia	40.77725	-73.87261	22	-5	A	America/New_York
2013	1	1	5	EWR	ORD	N39463	UA	Newark Liberty Intl	40.69250	-74.16867	18	-5	A	America/New_York

Mutating Joins - Exercises

Compute the average delay by destination, then join on the airports data frame so you can show the spatial distribution of delays.

> (avg_delay <- 
+ flights %>% filter(!is.na(arr_delay)) %>% group_by(dest) %>% 
+   summarize(delay_mean=mean(arr_delay)) %>% 
+   inner_join(airports,by=c("dest"="faa")))

# A tibble: 100 x 9
   dest  delay_mean name                 lat    lon   alt    tz dst   tzone     
   <chr>      <dbl> <chr>              <dbl>  <dbl> <dbl> <dbl> <chr> <chr>     
 1 ABQ         4.38 Albuquerque Inter~  35.0 -107.   5355    -7 A     America/D~
 2 ACK         4.85 Nantucket Mem       41.3  -70.1    48    -5 A     America/N~
 3 ALB        14.4  Albany Intl         42.7  -73.8   285    -5 A     America/N~
 4 ANC        -2.5  Ted Stevens Ancho~  61.2 -150.    152    -9 A     America/A~
 5 ATL        11.3  Hartsfield Jackso~  33.6  -84.4  1026    -5 A     America/N~
 6 AUS         6.02 Austin Bergstrom ~  30.2  -97.7   542    -6 A     America/C~
 7 AVL         8.00 Asheville Regiona~  35.4  -82.5  2165    -5 A     America/N~
 8 BDL         7.05 Bradley Intl        41.9  -72.7   173    -5 A     America/N~
 9 BGR         8.03 Bangor Intl         44.8  -68.8   192    -5 A     America/N~
10 BHM        16.9  Birmingham Intl     33.6  -86.8   644    -6 A     America/C~
# ... with 90 more rows

> #remove Hawaii and Alaska, for  better plotting 
> avg_delay <- avg_delay %>% filter(lat<55 & lat>22)
> 
> avg_delay %>% ggplot()+
+   borders("state")+coord_quickmap()+
+   geom_point(aes(x=lon, y=lat, color=delay_mean))+ 
+   scale_color_gradient(low="blue", high="red")+
+   theme_minimal()

Add the location of the origin and destination to flights.

> flights %>%
+   left_join(airports,by = c("origin" = "faa")) %>%
+   left_join(airports,by = c("dest" = "faa"),
+     suffix = c("_origin", "_dest")) %>% 
+   select(year:day, hour, origin, dest, starts_with("lat"),
+          starts_with("lon"))

# A tibble: 336,776 x 10
    year month   day  hour origin dest  lat_origin lat_dest lon_origin lon_dest
   <int> <int> <int> <dbl> <chr>  <chr>      <dbl>    <dbl>      <dbl>    <dbl>
 1  2013     1     1     5 EWR    IAH         40.7     30.0      -74.2    -95.3
 2  2013     1     1     5 LGA    IAH         40.8     30.0      -73.9    -95.3
 3  2013     1     1     5 JFK    MIA         40.6     25.8      -73.8    -80.3
 4  2013     1     1     5 JFK    BQN         40.6     NA        -73.8     NA  
 5  2013     1     1     6 LGA    ATL         40.8     33.6      -73.9    -84.4
 6  2013     1     1     5 EWR    ORD         40.7     42.0      -74.2    -87.9
 7  2013     1     1     6 EWR    FLL         40.7     26.1      -74.2    -80.2
 8  2013     1     1     6 LGA    IAD         40.8     38.9      -73.9    -77.5
 9  2013     1     1     6 JFK    MCO         40.6     28.4      -73.8    -81.3
10  2013     1     1     6 LGA    ORD         40.8     42.0      -73.9    -87.9
# ... with 336,766 more rows

Is there a relationship between the age of planes and its delays?

The average delay increases until a plane is about 10 years old and then it declines.

> (plane_age <- flights %>% filter(!is.na(arr_delay)) %>% 
+   left_join(planes, by="tailnum",suffix = c("_flights","_planes")) %>% 
+     mutate(age=2013-year_planes) %>% 
+     group_by(tailnum) %>% 
+     summarize(mean_delay=mean(arr_delay, na.rm=TRUE),
+               year=first(year_planes),age=first(age)) %>% 
+     filter(!is.na(age)) %>% 
+     group_by(age) %>% 
+     summarize(plane_delay=mean(mean_delay), n=n())
+   )

# A tibble: 46 x 3
     age plane_delay     n
   <dbl>       <dbl> <int>
 1     0       2.53     92
 2     1       0.732    95
 3     2       4.24     66
 4     3       3.06     48
 5     4       5.88     84
 6     5       4.59    147
 7     6       7.88    123
 8     7       7.64    126
 9     8      11.1     160
10     9      12.4     192
# ... with 36 more rows

> plane_age %>% ggplot()+
+   geom_point(aes(x=age,y=plane_delay, alpha=n, size=n))+
+   scale_x_continuous("Age of plane", breaks = seq(0, 60, by = 10))+
+   scale_y_continuous("Mean delay", breaks = seq(-20, 15, by = 5))

What weather conditions make it more likely to see a delay?

The average delay tends to increase with precipitation, although most precip data points are below 0.121
The average delay increases as visibility decreases.

> (flight_weather <-
+     flights %>%
+     inner_join(weather, by = c("origin","year","month","day","hour")) %>%
+     mutate(precip_decile=cut_interval(precip,10)) %>% 
+     select(origin,month,day,dep_delay,precip,precip_decile) %>% 
+     group_by(precip_decile) %>% 
+     summarize(mean_delay=mean(dep_delay, na.rm=TRUE),
+               mean_precip=mean(precip,na.rm=TRUE),n=n())
+ )

# A tibble: 9 x 4
  precip_decile mean_delay mean_precip      n
  <fct>              <dbl>       <dbl>  <int>
1 [0,0.121]           12.4     0.00207 331848
2 (0.121,0.242]       40.8     0.175     2285
3 (0.242,0.363]       35.5     0.303      611
4 (0.363,0.484]       51.9     0.425      292
5 (0.484,0.605]       29.5     0.55        69
6 (0.605,0.726]       41.3     0.661       55
7 (0.726,0.847]       94.7     0.82        19
8 (0.847,0.968]       27.8     0.94        20
9 (1.09,1.21]        113.      1.21        21

> (flight_weather <-
+     flights %>%
+     inner_join(weather, by = c("origin","year","month","day","hour")) %>%
+     mutate(visib_decile=cut_interval(visib,n=10)) %>% 
+     select(origin,month,day,dep_delay,visib,visib_decile) %>% 
+     group_by(visib_decile) %>% 
+     summarize(mean_delay=mean(dep_delay, na.rm=TRUE),
+               mean_visib=mean(visib,na.rm=TRUE),
+               n=n())
+ )

# A tibble: 10 x 4
   visib_decile mean_delay mean_visib      n
   <fct>             <dbl>      <dbl>  <int>
 1 [0,1]              30.4      0.547   5449
 2 (1,2]              25.5      1.80    5255
 3 (2,3]              21.2      2.80    6058
 4 (3,4]              19.5      4       2253
 5 (4,5]              21.1      5       4809
 6 (5,6]              19.8      6       6260
 7 (6,7]              19.2      7       7347
 8 (7,8]              20.0      8       7614
 9 (8,9]              18.8      9      11436
10 (9,10]             11.0     10     278739

> ggplot(flight_weather)+ 
+   geom_point(aes(x=mean_visib, y=mean_delay, color=visib_decile),
+              size=3)+
+   theme(legend.position = "none")

What happened on June 13, 2013? Display the spatial pattern of delays.

Model and actress Joanna Krupa (34) weds businessman Romain Zago in Southern California, but I don’t think that affected flights😄.
There were major storms.

> (june_weather <- flights %>%
+   filter(year == 2013, month == 6, day == 13, !is.na(arr_delay)) %>%
+   group_by(dest) %>%
+   summarize(delay = mean(arr_delay)) %>%
+   inner_join(airports, by = c("dest" = "faa")))

# A tibble: 82 x 9
   dest  delay name                     lat    lon   alt    tz dst   tzone      
   <chr> <dbl> <chr>                  <dbl>  <dbl> <dbl> <dbl> <chr> <chr>      
 1 ABQ    64   Albuquerque Internati~  35.0 -107.   5355    -7 A     America/De~
 2 ACK     8   Nantucket Mem           41.3  -70.1    48    -5 A     America/Ne~
 3 ALB   -19   Albany Intl             42.7  -73.8   285    -5 A     America/Ne~
 4 ATL   109.  Hartsfield Jackson At~  33.6  -84.4  1026    -5 A     America/Ne~
 5 AUS    64.3 Austin Bergstrom Intl   30.2  -97.7   542    -6 A     America/Ch~
 6 AVL    90   Asheville Regional Ai~  35.4  -82.5  2165    -5 A     America/Ne~
 7 BDL    18.5 Bradley Intl            41.9  -72.7   173    -5 A     America/Ne~
 8 BGR   102   Bangor Intl             44.8  -68.8   192    -5 A     America/Ne~
 9 BHM   158   Birmingham Intl         33.6  -86.8   644    -6 A     America/Ch~
10 BNA    95.9 Nashville Intl          36.1  -86.7   599    -6 A     America/Ch~
# ... with 72 more rows

> june_weather <- june_weather %>% filter(lat<55 & lat>22)
>   
>   ggplot(june_weather) +
+   borders("state") + coord_quickmap() +
+   geom_point(aes(y = lat, x = lon, color = delay, size=delay)) +
+   scale_color_gradientn(colours = rainbow(5))

Other Implementations

dplyr can accomplish many of the same things as SQL

dplyr	SQL
inner_join(x, y, by = “z”)	SELECT * FROM x INNER JOIN y USING (z)
left_join(x, y, by = “z”)	SELECT * FROM x LEFT OUTER JOIN y USING (z)
right_join(x, y, by = “z”)	SELECT * FROM x RIGHT OUTER JOIN y USING (z)
full_join(x, y, by = “z”)	SELECT * FROM x FULL OUTER JOIN y USING (z)
inner_join(x, y, by = c(“a” = “b”))	SELECT * FROM x INNER JOIN y ON x.a = y.b

Mutating Joins can also be accomplished using base R

dplyr	merge
inner_join(x, y)	merge(x, y)
left_join(x, y)	merge(x, y, all.x = TRUE)
right_join(x, y)	merge(x, y, all.y = TRUE)
full_join(x, y)	merge(x, y, all.x = TRUE, all.y = TRUE)

Filtering Joins

Filtering Joins allow you to filter observations in one table based on matches in another table. There are two types:

semi_join(x, y) - keeps all observations from x that have a match in y
anti_join(x, y) - drops all observations from x that have a match in y

If we wanted to view all the flights to the top 10 destinations we could:

Create a table with the top 10 destinations.
Filter the flights dataset based on the newly created table.

> # Top 10 destinations
> top_dest <- flights %>% 
+   count(dest, sort=TRUE) %>% 
+   head(10)
> kable(top_dest) %>% 
+ kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

dest	n
ORD	17283
ATL	17215
LAX	16174
BOS	15508
MCO	14082
CLT	14064
SFO	13331
FLL	12055
MIA	11728
DCA	9705

> # Filter flights based on top_dest
> flights %>% 
+   semi_join(top_dest) %>% 
+   head(10) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 8,position="left")%>%
+                 row_spec(0,background="peachpuff")

year	month	day	dep_time	sched_dep_time	dep_delay	arr_time	sched_arr_time	arr_delay	carrier	flight	tailnum	origin	dest	air_time	distance	hour	minute	time_hour
2013	1	1	542	540	2	923	850	33	AA	1141	N619AA	JFK	MIA	160	1089	5	40	2013-01-01 05:00:00
2013	1	1	554	600	-6	812	837	-25	DL	461	N668DN	LGA	ATL	116	762	6	0	2013-01-01 06:00:00
2013	1	1	554	558	-4	740	728	12	UA	1696	N39463	EWR	ORD	150	719	5	58	2013-01-01 05:00:00
2013	1	1	555	600	-5	913	854	19	B6	507	N516JB	EWR	FLL	158	1065	6	0	2013-01-01 06:00:00
2013	1	1	557	600	-3	838	846	-8	B6	79	N593JB	JFK	MCO	140	944	6	0	2013-01-01 06:00:00
2013	1	1	558	600	-2	753	745	8	AA	301	N3ALAA	LGA	ORD	138	733	6	0	2013-01-01 06:00:00
2013	1	1	558	600	-2	924	917	7	UA	194	N29129	JFK	LAX	345	2475	6	0	2013-01-01 06:00:00
2013	1	1	558	600	-2	923	937	-14	UA	1124	N53441	EWR	SFO	361	2565	6	0	2013-01-01 06:00:00
2013	1	1	559	559	0	702	706	-4	B6	1806	N708JB	JFK	BOS	44	187	5	59	2013-01-01 05:00:00
2013	1	1	600	600	0	851	858	-7	B6	371	N595JB	LGA	FLL	152	1076	6	0	2013-01-01 06:00:00

Anti-Joins are helpful in determining which values do not match. There are many flights that don’t have a match in planes.

> flights %>% 
+   anti_join(planes, by="tailnum") %>% 
+   count(tailnum, sort=TRUE) %>% 
+   head(10) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

tailnum	n
NA	2512
N725MQ	575
N722MQ	513
N723MQ	507
N713MQ	483
N735MQ	396
N0EGMQ	371
N534MQ	364
N542MQ	363
N531MQ	349

Filtering Joins - Exercises

What does it mean for a flight to have a missing tailnum? What do the tail numbers that don’t have a matching record in planes have in common?

You can see from the columns with NAs that these are canceled flights.

> flights %>% filter(is.na(tailnum)) %>% glimpse()

Rows: 2,512
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ day            <int> 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 9, 10, 10, 11, 12, 12,...
$ dep_time       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ sched_dep_time <int> 1545, 1601, 857, 645, 845, 1830, 840, 820, 1645, 755...
$ dep_delay      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ arr_time       <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ sched_arr_time <int> 1910, 1735, 1209, 952, 1015, 2044, 1001, 958, 1838, ...
$ arr_delay      <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ carrier        <chr> "AA", "UA", "UA", "UA", "9E", "9E", "9E", "9E", "US"...
$ flight         <int> 133, 623, 714, 719, 3405, 3716, 3422, 3317, 123, 402...
$ tailnum        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ origin         <chr> "JFK", "EWR", "EWR", "EWR", "JFK", "EWR", "JFK", "JF...
$ dest           <chr> "LAX", "ORD", "MIA", "DFW", "DCA", "DTW", "BOS", "BU...
$ air_time       <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ distance       <dbl> 2475, 719, 1085, 1372, 213, 488, 187, 301, 529, 569,...
$ hour           <dbl> 15, 16, 8, 6, 8, 18, 8, 8, 16, 7, 12, 15, 7, 17, 9, ...
$ minute         <dbl> 45, 1, 57, 45, 45, 30, 40, 20, 45, 55, 51, 0, 0, 0, ...
$ time_hour      <dttm> 2013-01-02 15:00:00, 2013-01-02 16:00:00, 2013-01-0...

Filter flights to only show flights with planes that have flown at least 100 flights.

> flights %>% filter(!is.na(tailnum)) %>% 
+   group_by(tailnum) %>% mutate(total=n()) %>% 
+   filter(total>=100)

# A tibble: 228,390 x 20
# Groups:   tailnum [1,217]
    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
 1  2013     1     1      517            515         2      830            819
 2  2013     1     1      533            529         4      850            830
 3  2013     1     1      544            545        -1     1004           1022
 4  2013     1     1      554            558        -4      740            728
 5  2013     1     1      555            600        -5      913            854
 6  2013     1     1      557            600        -3      709            723
 7  2013     1     1      557            600        -3      838            846
 8  2013     1     1      558            600        -2      849            851
 9  2013     1     1      558            600        -2      853            856
10  2013     1     1      558            600        -2      923            937
# ... with 228,380 more rows, and 12 more variables: arr_delay <dbl>,
#   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
#   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
#   total <int>

Combine fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.

> library(fueleconomy)
> vehicles %>% semi_join(common, by = c("make", "model"))

# A tibble: 14,531 x 12
      id make  model   year class   trans  drive     cyl displ fuel    hwy   cty
   <dbl> <chr> <chr>  <dbl> <chr>   <chr>  <chr>   <dbl> <dbl> <chr> <dbl> <dbl>
 1  1833 Acura Integ~  1986 Subcom~ Autom~ Front-~     4   1.6 Regu~    28    22
 2  1834 Acura Integ~  1986 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
 3  3037 Acura Integ~  1987 Subcom~ Autom~ Front-~     4   1.6 Regu~    28    22
 4  3038 Acura Integ~  1987 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
 5  4183 Acura Integ~  1988 Subcom~ Autom~ Front-~     4   1.6 Regu~    27    22
 6  4184 Acura Integ~  1988 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
 7  5303 Acura Integ~  1989 Subcom~ Autom~ Front-~     4   1.6 Regu~    27    22
 8  5304 Acura Integ~  1989 Subcom~ Manua~ Front-~     4   1.6 Regu~    28    23
 9  6442 Acura Integ~  1990 Subcom~ Autom~ Front-~     4   1.8 Regu~    24    20
10  6443 Acura Integ~  1990 Subcom~ Manua~ Front-~     4   1.8 Regu~    26    21
# ... with 14,521 more rows

Find the 48 hours (over the course of the whole year) that have the worst delays. Cross-reference it with the weather data. Can you see any patters?

Rather than just taking any 48 hours, I chose to look at the two consecutive days with the greatest average delays.
July 22nd and 23rd had the largest delays, but it wasn’t a weather issue.
Southwest Airlines Flight 345 was a scheduled flight from Nashville International Airport, Tennessee, to New York City’s LaGuardia Airport. On July 22, 2013, the Boeing 737 operating the route suffered a front landing gear collapse while landing at LaGuardia Airport, injuring 9 people on board.

> (flights_48 <- flights %>% filter(!is.na(arr_delay)) %>% 
+   arrange(year,month,day) %>% 
+   group_by(year,month,day) %>% 
+   summarize(mean_delay=mean(arr_delay)) %>% 
+   mutate(mean_lag=lag(mean_delay),two_day=mean_lag+mean_delay,
+          dupe=lead(two_day)))

# A tibble: 365 x 7
# Groups:   year, month [12]
    year month   day mean_delay mean_lag two_day    dupe
   <int> <int> <int>      <dbl>    <dbl>   <dbl>   <dbl>
 1  2013     1     1     12.7     NA      NA      25.3  
 2  2013     1     2     12.7     12.7    25.3    18.4  
 3  2013     1     3      5.73    12.7    18.4     3.80 
 4  2013     1     4     -1.93     5.73    3.80   -3.46 
 5  2013     1     5     -1.53    -1.93   -3.46    2.71 
 6  2013     1     6      4.24    -1.53    2.71   -0.711
 7  2013     1     7     -4.95     4.24   -0.711  -8.17 
 8  2013     1     8     -3.23    -4.95   -8.17   -3.49 
 9  2013     1     9     -0.264   -3.23   -3.49   -6.16 
10  2013     1    10     -5.90    -0.264  -6.16  -10.7  
# ... with 355 more rows

> (one <- flights_48 %>% arrange(desc(two_day)) %>% head(1))

# A tibble: 1 x 7
# Groups:   year, month [1]
   year month   day mean_delay mean_lag two_day  dupe
  <int> <int> <int>      <dbl>    <dbl>   <dbl> <dbl>
1  2013     7    23       45.0     62.8    108.  59.0

> (two <- flights_48 %>% arrange(desc(dupe))%>% head(1))

# A tibble: 1 x 7
# Groups:   year, month [1]
   year month   day mean_delay mean_lag two_day  dupe
  <int> <int> <int>      <dbl>    <dbl>   <dbl> <dbl>
1  2013     7    22       62.8     15.5    78.2  108.

> (three <- bind_rows(two,one))

# A tibble: 2 x 7
# Groups:   year, month [1]
   year month   day mean_delay mean_lag two_day  dupe
  <int> <int> <int>      <dbl>    <dbl>   <dbl> <dbl>
1  2013     7    22       62.8     15.5    78.2 108. 
2  2013     7    23       45.0     62.8   108.   59.0

> weather %>% semi_join(three) %>% group_by(day) %>% 
+   summarize(avg_temp=mean(temp),
+             avg_wind_speed=mean(wind_speed),
+             avg_precip=mean(precip),
+             avg_visib=mean(visib))

# A tibble: 2 x 5
    day avg_temp avg_wind_speed avg_precip avg_visib
  <int>    <dbl>          <dbl>      <dbl>     <dbl>
1    22     78.5           7.96    0.00597      8.91
2    23     80.6          10.1     0.0154       9.33

What does anti_join(flights,airports,by=c("dest"="faa")) tell you? What does anti_join(airports,flights,by=c("faa"="dest")) tell you?

> #flights that have a destination not listed in airports
> anti_join(flights,airports, by=c("dest"="faa")) %>% 
+   distinct(dest)

# A tibble: 4 x 1
  dest 
  <chr>
1 BQN  
2 SJU  
3 STT  
4 PSE

> #airports that are not a destination in flights
> anti_join(airports,flights, by=c("faa"="dest"))

# A tibble: 1,357 x 8
   faa   name                       lat    lon   alt    tz dst   tzone          
   <chr> <chr>                    <dbl>  <dbl> <dbl> <dbl> <chr> <chr>          
 1 04G   Lansdowne Airport         41.1  -80.6  1044    -5 A     America/New_Yo~
 2 06A   Moton Field Municipal A~  32.5  -85.7   264    -6 A     America/Chicago
 3 06C   Schaumburg Regional       42.0  -88.1   801    -6 A     America/Chicago
 4 06N   Randall Airport           41.4  -74.4   523    -5 A     America/New_Yo~
 5 09J   Jekyll Island Airport     31.1  -81.4    11    -5 A     America/New_Yo~
 6 0A9   Elizabethton Municipal ~  36.4  -82.2  1593    -5 A     America/New_Yo~
 7 0G6   Williams County Airport   41.5  -84.5   730    -5 A     America/New_Yo~
 8 0G7   Finger Lakes Regional A~  42.9  -76.8   492    -5 A     America/New_Yo~
 9 0P2   Shoestring Aviation Air~  39.8  -76.6  1000    -5 U     America/New_Yo~
10 0S9   Jefferson County Intl     48.1 -123.    108    -8 A     America/Los_An~
# ... with 1,347 more rows

You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis.

17 planes were flown by more than one carrier

> (planes_carriers <- flights %>%
+   filter(!is.na(tailnum)) %>%
+   distinct(tailnum, carrier))

# A tibble: 4,060 x 2
   carrier tailnum
   <chr>   <chr>  
 1 UA      N14228 
 2 UA      N24211 
 3 AA      N619AA 
 4 B6      N804JB 
 5 DL      N668DN 
 6 UA      N39463 
 7 B6      N516JB 
 8 EV      N829AS 
 9 B6      N593JB 
10 AA      N3ALAA 
# ... with 4,050 more rows

> (planes_carriers %>% count(tailnum) %>%
+   filter(n > 1) %>% nrow())

[1] 17

Set Operations

There are also several set functions that can be helpful.

intersect(x, y) - only observations in both x and y
union(x, y) - unique observations in x and y
setdiff(x, y) - observations in x but not y

> df1 <- tribble(
+   ~x, ~y,
+    1,  1,
+    2,  1
+ )
> 
> df2 <- tribble(
+   ~x, ~y,
+    1,  1,
+    1,  2
+ )

x	y
1	1
2	1

x	y
1	1
1	2

> intersect(df1,df2) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

x	y
1	1

> union(df1,df2) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

x	y
1	1
2	1
1	2

> setdiff(df1,df2) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

x	y
2	1

> setdiff(df2,df1) %>% 
+   kable() %>% 
+   kable_styling(bootstrap_options = c("striped","condensed"),
+                 full_width = F, font_size = 10,position="left")%>%
+                 row_spec(0,background="peachpuff")

x	y
1	2

Joining Data with DPLYR

R Markdown

Paul Jozefek

2020-07-28

The Datasets

The Datasets - Exercises

Keys

Keys - Exercises

Mutating Joins

Join Types

Duplicate Keys

Defining Key Columns

Mutating Joins - Exercises

Other Implementations

Filtering Joins

Filtering Joins - Exercises

Set Operations