dplyr can be used to transform and manipulate data (see https://rpubs.com/pjozefek/643143) and it can be used to join or filter several datasets (relational data).
In this report I will be exploring the three ways in which this is achieved.
The nycflights13 package contains the flights dataset along with four related datasets: airlines, airports, planes, weather.
> library(tidyverse) #load dplyr
> library(nycflights13) #load datasets
> library(knitr) #load library for table formatting
> library(kableExtra) #load library for table formatting
>
> # flights dataset
> kable(head(flights)) %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 8,position="left")%>%
+ row_spec(0,background="peachpuff") | year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 517 | 515 | 2 | 830 | 819 | 11 | UA | 1545 | N14228 | EWR | IAH | 227 | 1400 | 5 | 15 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 533 | 529 | 4 | 850 | 830 | 20 | UA | 1714 | N24211 | LGA | IAH | 227 | 1416 | 5 | 29 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 542 | 540 | 2 | 923 | 850 | 33 | AA | 1141 | N619AA | JFK | MIA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 544 | 545 | -1 | 1004 | 1022 | -18 | B6 | 725 | N804JB | JFK | BQN | 183 | 1576 | 5 | 45 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 554 | 600 | -6 | 812 | 837 | -25 | DL | 461 | N668DN | LGA | ATL | 116 | 762 | 6 | 0 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 554 | 558 | -4 | 740 | 728 | 12 | UA | 1696 | N39463 | EWR | ORD | 150 | 719 | 5 | 58 | 2013-01-01 05:00:00 |
The flights dataset has 336776 rows and 19 columns.
> # airlines dataset
> kable(head(airlines)) %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | carrier | name |
|---|---|
| 9E | Endeavor Air Inc. |
| AA | American Airlines Inc. |
| AS | Alaska Airlines Inc. |
| B6 | JetBlue Airways |
| DL | Delta Air Lines Inc. |
| EV | ExpressJet Airlines Inc. |
The airlines dataset has 16 rows and 2 columns.
> # airports dataset
> kable(head(airports)) %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | faa | name | lat | lon | alt | tz | dst | tzone |
|---|---|---|---|---|---|---|---|
| 04G | Lansdowne Airport | 41.13047 | -80.61958 | 1044 | -5 | A | America/New_York |
| 06A | Moton Field Municipal Airport | 32.46057 | -85.68003 | 264 | -6 | A | America/Chicago |
| 06C | Schaumburg Regional | 41.98934 | -88.10124 | 801 | -6 | A | America/Chicago |
| 06N | Randall Airport | 41.43191 | -74.39156 | 523 | -5 | A | America/New_York |
| 09J | Jekyll Island Airport | 31.07447 | -81.42778 | 11 | -5 | A | America/New_York |
| 0A9 | Elizabethton Municipal Airport | 36.37122 | -82.17342 | 1593 | -5 | A | America/New_York |
The airports dataset has 1458 rows and 8 columns.
> # planes dataset
> kable(head(planes)) %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | tailnum | year | type | manufacturer | model | engines | seats | speed | engine |
|---|---|---|---|---|---|---|---|---|
| N10156 | 2004 | Fixed wing multi engine | EMBRAER | EMB-145XR | 2 | 55 | NA | Turbo-fan |
| N102UW | 1998 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
| N103US | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
| N104UW | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
| N10575 | 2002 | Fixed wing multi engine | EMBRAER | EMB-145LR | 2 | 55 | NA | Turbo-fan |
| N105UW | 1999 | Fixed wing multi engine | AIRBUS INDUSTRIE | A320-214 | 2 | 182 | NA | Turbo-fan |
The planes dataset has 3322 rows and 9 columns.
> # weather dataset
> kable(head(weather)) %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | origin | year | month | day | hour | temp | dewp | humid | wind_dir | wind_speed | wind_gust | precip | pressure | visib | time_hour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| EWR | 2013 | 1 | 1 | 1 | 39.02 | 26.06 | 59.37 | 270 | 10.35702 | NA | 0 | 1012.0 | 10 | 2013-01-01 01:00:00 |
| EWR | 2013 | 1 | 1 | 2 | 39.02 | 26.96 | 61.63 | 250 | 8.05546 | NA | 0 | 1012.3 | 10 | 2013-01-01 02:00:00 |
| EWR | 2013 | 1 | 1 | 3 | 39.02 | 28.04 | 64.43 | 240 | 11.50780 | NA | 0 | 1012.5 | 10 | 2013-01-01 03:00:00 |
| EWR | 2013 | 1 | 1 | 4 | 39.92 | 28.04 | 62.21 | 250 | 12.65858 | NA | 0 | 1012.2 | 10 | 2013-01-01 04:00:00 |
| EWR | 2013 | 1 | 1 | 5 | 39.02 | 28.04 | 64.43 | 260 | 12.65858 | NA | 0 | 1011.9 | 10 | 2013-01-01 05:00:00 |
| EWR | 2013 | 1 | 1 | 6 | 37.94 | 28.04 | 67.21 | 240 | 11.50780 | NA | 0 | 1012.4 | 10 | 2013-01-01 06:00:00 |
The weather dataset has 26115 rows and 15 columns.
flights with airports.> library(tidyverse)
> library(nycflights13)
>
> flights_route <- flights %>%
+ inner_join(airports,by = c("origin"="faa")) %>%
+ inner_join(airports,by = c("dest"="faa"))
>
> (flights_route <- flights_route %>%
+ select(origin,lat.x,lon.x,dest,lat.y,lon.y) %>%
+ head(100))# A tibble: 100 x 6
origin lat.x lon.x dest lat.y lon.y
<chr> <dbl> <dbl> <chr> <dbl> <dbl>
1 EWR 40.7 -74.2 IAH 30.0 -95.3
2 LGA 40.8 -73.9 IAH 30.0 -95.3
3 JFK 40.6 -73.8 MIA 25.8 -80.3
4 LGA 40.8 -73.9 ATL 33.6 -84.4
5 EWR 40.7 -74.2 ORD 42.0 -87.9
6 EWR 40.7 -74.2 FLL 26.1 -80.2
7 LGA 40.8 -73.9 IAD 38.9 -77.5
8 JFK 40.6 -73.8 MCO 28.4 -81.3
9 LGA 40.8 -73.9 ORD 42.0 -87.9
10 JFK 40.6 -73.8 PBI 26.7 -80.1
# ... with 90 more rows
> flights_route %>%
+ ggplot() +
+ borders("state") + coord_quickmap() +
+ geom_segment(aes(
+ x = lon.x, xend = lon.y,
+ y = lat.x, yend = lat.y,
+ color=dest),
+ arrow = arrow(length = unit(0.1, "cm"))) +
+ labs(x="longitude",y="latitude")+
+ theme_minimal()+
+ theme(legend.position = "none") weather and airports. What is the relationship and how should it appear in the diagram?weather$origin = airports$faa.# A tibble: 26,115 x 22
origin year month day hour temp dewp humid wind_dir wind_speed
<chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
# ... with 26,105 more rows, and 12 more variables: wind_gust <dbl>,
# precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, name <chr>,
# lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>
weather only contains information for the origin (NYC) airports. If it contained weather records for all airports in the USA, what additional relation would it define with flights?year, month, day, origin.year, month, day, dest.year, month, day.> (special_days <- tribble(
+ ~year, ~month, ~day, ~holiday,
+ 2013, 01, 01, "New Years Day",
+ 2013, 07, 04, "Independence Day",
+ 2013, 11, 29, "Thanksgiving Day",
+ 2013, 12, 25, "Christmas Day"
+ ))# A tibble: 4 x 4
year month day holiday
<dbl> <dbl> <dbl> <chr>
1 2013 1 1 New Years Day
2 2013 7 4 Independence Day
3 2013 11 29 Thanksgiving Day
4 2013 12 25 Christmas Day
Keys are used to connect each table and they uniquely identify an observation (example: every US citizen has a unique Social Security number).
One way to verify that a key is unique is by checking that the count() is not greater than 1.
[1] 0
tailnum uniquely identifies a plane.
> #count by specific day/time and airport
> WeatherKey <- weather %>% count(year, month, day, hour, origin) %>%
+ filter(n>1)
> nrow(WeatherKey)[1] 3
A specific time should have a unique weather recording for a specific airport, but it seems that there are 3 records with more than one. It could be an error. In this instance I will explore those 3 further and adjust the data accordingly.
> #view records with more than 1
> weather %>% count(year, month, day, hour, origin) %>%
+ filter(n>1) %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | year | month | day | hour | origin | n |
|---|---|---|---|---|---|
| 2013 | 11 | 3 | 1 | EWR | 2 |
| 2013 | 11 | 3 | 1 | JFK | 2 |
| 2013 | 11 | 3 | 1 | LGA | 2 |
> weather %>%
+ filter(month==11,day==3,hour==1,origin %in% c("EWR","JFK","LGA"))%>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | origin | year | month | day | hour | temp | dewp | humid | wind_dir | wind_speed | wind_gust | precip | pressure | visib | time_hour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| EWR | 2013 | 11 | 3 | 1 | 51.98 | 39.02 | 61.15 | 310 | 6.90468 | NA | 0 | 1009.8 | 10 | 2013-11-03 01:00:00 |
| EWR | 2013 | 11 | 3 | 1 | 50.00 | 39.02 | 65.80 | 290 | 5.75390 | NA | 0 | 1010.5 | 10 | 2013-11-03 01:00:00 |
| JFK | 2013 | 11 | 3 | 1 | 53.96 | 37.94 | 54.51 | 320 | 9.20624 | NA | 0 | 1009.8 | 10 | 2013-11-03 01:00:00 |
| JFK | 2013 | 11 | 3 | 1 | 51.98 | 37.94 | 58.62 | 310 | 6.90468 | NA | 0 | 1010.5 | 10 | 2013-11-03 01:00:00 |
| LGA | 2013 | 11 | 3 | 1 | 55.04 | 39.02 | 54.67 | 330 | 9.20624 | NA | 0 | 1009.3 | 10 | 2013-11-03 01:00:00 |
| LGA | 2013 | 11 | 3 | 1 | 53.96 | 39.92 | 58.89 | 310 | 8.05546 | NA | 0 | 1010.2 | 10 | 2013-11-03 01:00:00 |
I decided to average those 3 days for each airport. Year, month, day, hour, and origin will now be a unique identifier.
> weather2 <- weather %>%
+ group_by(year,month,day,hour,origin) %>%
+ summarize(
+ across(temp:visib, mean),
+ across(time_hour, first)
+ )
>
> weather2 %>%
+ filter(month==11,day==3,hour==1,origin %in% c("EWR","JFK","LGA"))%>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | year | month | day | hour | origin | temp | dewp | humid | wind_dir | wind_speed | wind_gust | precip | pressure | visib | time_hour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 11 | 3 | 1 | EWR | 50.99 | 39.02 | 63.475 | 300 | 6.32929 | NA | 0 | 1010.15 | 10 | 2013-11-03 01:00:00 |
| 2013 | 11 | 3 | 1 | JFK | 52.97 | 37.94 | 56.565 | 315 | 8.05546 | NA | 0 | 1010.15 | 10 | 2013-11-03 01:00:00 |
| 2013 | 11 | 3 | 1 | LGA | 54.50 | 39.47 | 56.780 | 320 | 8.63085 | NA | 0 | 1009.75 | 10 | 2013-11-03 01:00:00 |
> WeatherKey <- weather2 %>% count(year, month, day, hour, origin) %>%
+ filter(n>1)
> nrow(WeatherKey)[1] 0
Sometimes there is no combination of variables that uniquely identifies an observation. In the flights data set neither the specific day and flight number nor the specific day and tailnumber are unique.
> # day and flight
> flights %>%
+ count(year,month,day,flight) %>%
+ filter(n>1) %>% head() %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | year | month | day | flight | n |
|---|---|---|---|---|
| 2013 | 1 | 1 | 1 | 2 |
| 2013 | 1 | 1 | 3 | 2 |
| 2013 | 1 | 1 | 4 | 2 |
| 2013 | 1 | 1 | 11 | 3 |
| 2013 | 1 | 1 | 15 | 2 |
| 2013 | 1 | 1 | 21 | 2 |
> #day and tailnum
> flights %>%
+ count(year,month,day,tailnum) %>%
+ filter(n>1) %>% head() %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | year | month | day | tailnum | n |
|---|---|---|---|---|
| 2013 | 1 | 1 | N0EGMQ | 2 |
| 2013 | 1 | 1 | N11189 | 2 |
| 2013 | 1 | 1 | N11536 | 2 |
| 2013 | 1 | 1 | N11544 | 3 |
| 2013 | 1 | 1 | N11551 | 2 |
| 2013 | 1 | 1 | N12540 | 2 |
# A tibble: 336,776 x 6
month day flight origin dest id
<int> <int> <int> <chr> <chr> <int>
1 1 1 1545 EWR IAH 1
2 1 1 1714 LGA IAH 2
3 1 1 1141 JFK MIA 3
4 1 1 725 JFK BQN 4
5 1 1 461 LGA ATL 5
6 1 1 1696 EWR ORD 6
7 1 1 507 EWR FLL 7
8 1 1 5708 LGA IAD 8
9 1 1 79 JFK MCO 9
10 1 1 301 LGA ORD 10
# ... with 336,766 more rows
Lahman::Batting
babyname::babynames
nasaweather::atmos
fueleconomy::vehicles
ggplot2::diamonds
count() contains the keys. Each call of nrow() results in \(0\).ggplot2 has no unique keys. Some rows are duplicates.> Lahman::Batting %>%
+ count(playerID, yearID, stint) %>%
+ filter(n > 1) %>%
+ nrow()
>
> babynames::babynames %>%
+ count(year, sex, name, wt=n()) %>%
+ filter(n > 1) %>%
+ nrow()
>
> nasaweather::atmos %>%
+ count(lat, long, year, month) %>%
+ filter(n > 1) %>%
+ nrow()
>
> fueleconomy::vehicles %>%
+ count(id) %>%
+ filter(n > 1) %>%
+ nrow()[1] 53794
[1] 53940
Similar to mutate(), a mutating join adds new variables to the right of a table. It does so by:
For example, if you want to add carrier name to the flights dataset you would join it to the airlines table using the carrier variable as your key.
It is easier to see when starting with a smaller dataset.
> #make a smaller data set
> flights2 <- flights %>%
+ select(year:day,hour,origin,dest,tailnum,carrier)
>
> #display first 6
> flights2 %>% head() %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | year | month | day | hour | origin | dest | tailnum | carrier |
|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 5 | EWR | IAH | N14228 | UA |
| 2013 | 1 | 1 | 5 | LGA | IAH | N24211 | UA |
| 2013 | 1 | 1 | 5 | JFK | MIA | N619AA | AA |
| 2013 | 1 | 1 | 5 | JFK | BQN | N804JB | B6 |
| 2013 | 1 | 1 | 6 | LGA | ATL | N668DN | DL |
| 2013 | 1 | 1 | 5 | EWR | ORD | N39463 | UA |
We can see that the column name is now added to the right of the flights2 table.
> #add name using join
> flights2 %>%
+ select(-origin,-dest) %>%
+ left_join(airlines,by="carrier") %>%
+ head() %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | year | month | day | hour | tailnum | carrier | name |
|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 5 | N14228 | UA | United Air Lines Inc. |
| 2013 | 1 | 1 | 5 | N24211 | UA | United Air Lines Inc. |
| 2013 | 1 | 1 | 5 | N619AA | AA | American Airlines Inc. |
| 2013 | 1 | 1 | 5 | N804JB | B6 | JetBlue Airways |
| 2013 | 1 | 1 | 6 | N668DN | DL | Delta Air Lines Inc. |
| 2013 | 1 | 1 | 5 | N39463 | UA | United Air Lines Inc. |
We could achieve the same result using mutate(), but it is more difficult.
> #add name using mutate
> flights2 %>%
+ select(-origin,-dest) %>%
+ mutate(name=airlines$name[match(carrier,airlines$carrier)]) %>%
+ head() %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | year | month | day | hour | tailnum | carrier | name |
|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 5 | N14228 | UA | United Air Lines Inc. |
| 2013 | 1 | 1 | 5 | N24211 | UA | United Air Lines Inc. |
| 2013 | 1 | 1 | 5 | N619AA | AA | American Airlines Inc. |
| 2013 | 1 | 1 | 5 | N804JB | B6 | JetBlue Airways |
| 2013 | 1 | 1 | 6 | N668DN | DL | Delta Air Lines Inc. |
| 2013 | 1 | 1 | 5 | N39463 | UA | United Air Lines Inc. |
There are four kinds of mutating joins
Some examples are provided below:
> #Sample tables
> x <- tribble(
+ ~key,~val_x,
+ 1,"x1",
+ 2,"x2",
+ 3,"x3"
+ )
>
> y <- tribble(
+ ~key,~val_y,
+ 1,"y1",
+ 2,"y2",
+ 4,"y3"
+ )| key | val_x |
|---|---|
| 1 | x1 |
| 2 | x2 |
| 3 | x3 |
| key | val_y |
|---|---|
| 1 | y1 |
| 2 | y2 |
| 4 | y3 |
Inner Join
> x %>%
+ inner_join(y, by="key") %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff") | key | val_x | val_y |
|---|---|---|
| 1 | x1 | y1 |
| 2 | x2 | y2 |
Left Join
> x %>%
+ left_join(y, by="key") %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| key | val_x | val_y |
|---|---|---|
| 1 | x1 | y1 |
| 2 | x2 | y2 |
| 3 | x3 | NA |
Right Join
> x %>%
+ right_join(y, by="key") %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| key | val_x | val_y |
|---|---|---|
| 1 | x1 | y1 |
| 2 | x2 | y2 |
| 4 | NA | y3 |
Full Join
> x %>%
+ full_join(y, by="key") %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| key | val_x | val_y |
|---|---|---|
| 1 | x1 | y1 |
| 2 | x2 | y2 |
| 3 | x3 | NA |
| 4 | NA | y3 |
It is possible to have duplicate keys in one or both tables. When both tables have duplicate keys the result includes all combinations (Cartesian product).
Some examples are provided below:
Left Join - Left table with duplicate key
> x <- tribble(
+ ~key,~val_x,
+ 1,"x1",
+ 2,"x2",
+ 2,"x3",
+ 1,"x4"
+ )
>
> y <- tribble(
+ ~key,~val_y,
+ 1,"y1",
+ 2,"y2"
+ )| key | val_x |
|---|---|
| 1 | x1 |
| 2 | x2 |
| 2 | x3 |
| 1 | x4 |
| key | val_y |
|---|---|
| 1 | y1 |
| 2 | y2 |
Left Join
> left_join(x,y,by="key") %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| key | val_x | val_y |
|---|---|---|
| 1 | x1 | y1 |
| 2 | x2 | y2 |
| 2 | x3 | y2 |
| 1 | x4 | y1 |
Left Join - Both tables with duplicate key
> x <- tribble(
+ ~key,~val_x,
+ 1,"x1",
+ 2,"x2",
+ 2,"x3",
+ 3,"x4"
+ )
>
> y <- tribble(
+ ~key,~val_y,
+ 1,"y1",
+ 2,"y2",
+ 2,"y3",
+ 3,"y4"
+ )| key | val_x |
|---|---|
| 1 | x1 |
| 2 | x2 |
| 2 | x3 |
| 3 | x4 |
| key | val_y |
|---|---|
| 1 | y1 |
| 2 | y2 |
| 2 | y3 |
| 3 | y4 |
Left Join
> left_join(x,y,by="key") %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| key | val_x | val_y |
|---|---|---|
| 1 | x1 | y1 |
| 2 | x2 | y2 |
| 2 | x2 | y3 |
| 2 | x3 | y2 |
| 2 | x3 | y3 |
| 3 | x4 | y4 |
There are several ways to specify the key:
by = "x" is used the tables will be matched on that common variable.by = c("a" = "b") is used the variable a in table x will be matched to the variable b in table y.Some examples are provided below:
Default - a message displays the common variables. In this case year, month, day, hour, origin.
> flights2 %>%
+ left_join(weather2) %>%
+ head() %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")Joining, by = c("year", "month", "day", "hour", "origin")
| year | month | day | hour | origin | dest | tailnum | carrier | temp | dewp | humid | wind_dir | wind_speed | wind_gust | precip | pressure | visib | time_hour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 5 | EWR | IAH | N14228 | UA | 39.02 | 28.04 | 64.43 | 260 | 12.65858 | NA | 0 | 1011.9 | 10 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 5 | LGA | IAH | N24211 | UA | 39.92 | 24.98 | 54.81 | 250 | 14.96014 | 21.86482 | 0 | 1011.4 | 10 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 5 | JFK | MIA | N619AA | AA | 39.02 | 26.96 | 61.63 | 260 | 14.96014 | NA | 0 | 1012.1 | 10 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 5 | JFK | BQN | N804JB | B6 | 39.02 | 26.96 | 61.63 | 260 | 14.96014 | NA | 0 | 1012.1 | 10 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 6 | LGA | ATL | N668DN | DL | 39.92 | 24.98 | 54.81 | 260 | 16.11092 | 23.01560 | 0 | 1011.7 | 10 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 5 | EWR | ORD | N39463 | UA | 39.02 | 28.04 | 64.43 | 260 | 12.65858 | NA | 0 | 1011.9 | 10 | 2013-01-01 05:00:00 |
Single Character Vector - Each table has a year variable, but they are not constrained to be equal. They output labels them as year.x and year.y.
> flights2 %>%
+ left_join(planes, by="tailnum") %>%
+ head() %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| year.x | month | day | hour | origin | dest | tailnum | carrier | year.y | type | manufacturer | model | engines | seats | speed | engine |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 5 | EWR | IAH | N14228 | UA | 1999 | Fixed wing multi engine | BOEING | 737-824 | 2 | 149 | NA | Turbo-fan |
| 2013 | 1 | 1 | 5 | LGA | IAH | N24211 | UA | 1998 | Fixed wing multi engine | BOEING | 737-824 | 2 | 149 | NA | Turbo-fan |
| 2013 | 1 | 1 | 5 | JFK | MIA | N619AA | AA | 1990 | Fixed wing multi engine | BOEING | 757-223 | 2 | 178 | NA | Turbo-fan |
| 2013 | 1 | 1 | 5 | JFK | BQN | N804JB | B6 | 2012 | Fixed wing multi engine | AIRBUS | A320-232 | 2 | 200 | NA | Turbo-fan |
| 2013 | 1 | 1 | 6 | LGA | ATL | N668DN | DL | 1991 | Fixed wing multi engine | BOEING | 757-232 | 2 | 178 | NA | Turbo-fan |
| 2013 | 1 | 1 | 5 | EWR | ORD | N39463 | UA | 2012 | Fixed wing multi engine | BOEING | 737-924ER | 2 | 191 | NA | Turbo-fan |
Named Character Vector - the airports table includes latitude and longitude. You would need to specify which airport you are interested in - the destination or the origin.
> flights2 %>%
+ left_join(airports, c("dest" = "faa")) %>%
+ head %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| year | month | day | hour | origin | dest | tailnum | carrier | name | lat | lon | alt | tz | dst | tzone |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 5 | EWR | IAH | N14228 | UA | George Bush Intercontinental | 29.98443 | -95.34144 | 97 | -6 | A | America/Chicago |
| 2013 | 1 | 1 | 5 | LGA | IAH | N24211 | UA | George Bush Intercontinental | 29.98443 | -95.34144 | 97 | -6 | A | America/Chicago |
| 2013 | 1 | 1 | 5 | JFK | MIA | N619AA | AA | Miami Intl | 25.79325 | -80.29056 | 8 | -5 | A | America/New_York |
| 2013 | 1 | 1 | 5 | JFK | BQN | N804JB | B6 | NA | NA | NA | NA | NA | NA | NA |
| 2013 | 1 | 1 | 6 | LGA | ATL | N668DN | DL | Hartsfield Jackson Atlanta Intl | 33.63672 | -84.42807 | 1026 | -5 | A | America/New_York |
| 2013 | 1 | 1 | 5 | EWR | ORD | N39463 | UA | Chicago Ohare Intl | 41.97860 | -87.90484 | 668 | -6 | A | America/Chicago |
> flights2 %>%
+ left_join(airports, c("origin" = "faa")) %>%
+ head() %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| year | month | day | hour | origin | dest | tailnum | carrier | name | lat | lon | alt | tz | dst | tzone |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 5 | EWR | IAH | N14228 | UA | Newark Liberty Intl | 40.69250 | -74.16867 | 18 | -5 | A | America/New_York |
| 2013 | 1 | 1 | 5 | LGA | IAH | N24211 | UA | La Guardia | 40.77725 | -73.87261 | 22 | -5 | A | America/New_York |
| 2013 | 1 | 1 | 5 | JFK | MIA | N619AA | AA | John F Kennedy Intl | 40.63975 | -73.77893 | 13 | -5 | A | America/New_York |
| 2013 | 1 | 1 | 5 | JFK | BQN | N804JB | B6 | John F Kennedy Intl | 40.63975 | -73.77893 | 13 | -5 | A | America/New_York |
| 2013 | 1 | 1 | 6 | LGA | ATL | N668DN | DL | La Guardia | 40.77725 | -73.87261 | 22 | -5 | A | America/New_York |
| 2013 | 1 | 1 | 5 | EWR | ORD | N39463 | UA | Newark Liberty Intl | 40.69250 | -74.16867 | 18 | -5 | A | America/New_York |
airports data frame so you can show the spatial distribution of delays.> (avg_delay <-
+ flights %>% filter(!is.na(arr_delay)) %>% group_by(dest) %>%
+ summarize(delay_mean=mean(arr_delay)) %>%
+ inner_join(airports,by=c("dest"="faa")))# A tibble: 100 x 9
dest delay_mean name lat lon alt tz dst tzone
<chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 ABQ 4.38 Albuquerque Inter~ 35.0 -107. 5355 -7 A America/D~
2 ACK 4.85 Nantucket Mem 41.3 -70.1 48 -5 A America/N~
3 ALB 14.4 Albany Intl 42.7 -73.8 285 -5 A America/N~
4 ANC -2.5 Ted Stevens Ancho~ 61.2 -150. 152 -9 A America/A~
5 ATL 11.3 Hartsfield Jackso~ 33.6 -84.4 1026 -5 A America/N~
6 AUS 6.02 Austin Bergstrom ~ 30.2 -97.7 542 -6 A America/C~
7 AVL 8.00 Asheville Regiona~ 35.4 -82.5 2165 -5 A America/N~
8 BDL 7.05 Bradley Intl 41.9 -72.7 173 -5 A America/N~
9 BGR 8.03 Bangor Intl 44.8 -68.8 192 -5 A America/N~
10 BHM 16.9 Birmingham Intl 33.6 -86.8 644 -6 A America/C~
# ... with 90 more rows
> #remove Hawaii and Alaska, for better plotting
> avg_delay <- avg_delay %>% filter(lat<55 & lat>22)
>
> avg_delay %>% ggplot()+
+ borders("state")+coord_quickmap()+
+ geom_point(aes(x=lon, y=lat, color=delay_mean))+
+ scale_color_gradient(low="blue", high="red")+
+ theme_minimal()> flights %>%
+ left_join(airports,by = c("origin" = "faa")) %>%
+ left_join(airports,by = c("dest" = "faa"),
+ suffix = c("_origin", "_dest")) %>%
+ select(year:day, hour, origin, dest, starts_with("lat"),
+ starts_with("lon"))# A tibble: 336,776 x 10
year month day hour origin dest lat_origin lat_dest lon_origin lon_dest
<int> <int> <int> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 5 EWR IAH 40.7 30.0 -74.2 -95.3
2 2013 1 1 5 LGA IAH 40.8 30.0 -73.9 -95.3
3 2013 1 1 5 JFK MIA 40.6 25.8 -73.8 -80.3
4 2013 1 1 5 JFK BQN 40.6 NA -73.8 NA
5 2013 1 1 6 LGA ATL 40.8 33.6 -73.9 -84.4
6 2013 1 1 5 EWR ORD 40.7 42.0 -74.2 -87.9
7 2013 1 1 6 EWR FLL 40.7 26.1 -74.2 -80.2
8 2013 1 1 6 LGA IAD 40.8 38.9 -73.9 -77.5
9 2013 1 1 6 JFK MCO 40.6 28.4 -73.8 -81.3
10 2013 1 1 6 LGA ORD 40.8 42.0 -73.9 -87.9
# ... with 336,766 more rows
> (plane_age <- flights %>% filter(!is.na(arr_delay)) %>%
+ left_join(planes, by="tailnum",suffix = c("_flights","_planes")) %>%
+ mutate(age=2013-year_planes) %>%
+ group_by(tailnum) %>%
+ summarize(mean_delay=mean(arr_delay, na.rm=TRUE),
+ year=first(year_planes),age=first(age)) %>%
+ filter(!is.na(age)) %>%
+ group_by(age) %>%
+ summarize(plane_delay=mean(mean_delay), n=n())
+ )# A tibble: 46 x 3
age plane_delay n
<dbl> <dbl> <int>
1 0 2.53 92
2 1 0.732 95
3 2 4.24 66
4 3 3.06 48
5 4 5.88 84
6 5 4.59 147
7 6 7.88 123
8 7 7.64 126
9 8 11.1 160
10 9 12.4 192
# ... with 36 more rows
> plane_age %>% ggplot()+
+ geom_point(aes(x=age,y=plane_delay, alpha=n, size=n))+
+ scale_x_continuous("Age of plane", breaks = seq(0, 60, by = 10))+
+ scale_y_continuous("Mean delay", breaks = seq(-20, 15, by = 5))precip data points are below 0.121> (flight_weather <-
+ flights %>%
+ inner_join(weather, by = c("origin","year","month","day","hour")) %>%
+ mutate(precip_decile=cut_interval(precip,10)) %>%
+ select(origin,month,day,dep_delay,precip,precip_decile) %>%
+ group_by(precip_decile) %>%
+ summarize(mean_delay=mean(dep_delay, na.rm=TRUE),
+ mean_precip=mean(precip,na.rm=TRUE),n=n())
+ )# A tibble: 9 x 4
precip_decile mean_delay mean_precip n
<fct> <dbl> <dbl> <int>
1 [0,0.121] 12.4 0.00207 331848
2 (0.121,0.242] 40.8 0.175 2285
3 (0.242,0.363] 35.5 0.303 611
4 (0.363,0.484] 51.9 0.425 292
5 (0.484,0.605] 29.5 0.55 69
6 (0.605,0.726] 41.3 0.661 55
7 (0.726,0.847] 94.7 0.82 19
8 (0.847,0.968] 27.8 0.94 20
9 (1.09,1.21] 113. 1.21 21
> (flight_weather <-
+ flights %>%
+ inner_join(weather, by = c("origin","year","month","day","hour")) %>%
+ mutate(visib_decile=cut_interval(visib,n=10)) %>%
+ select(origin,month,day,dep_delay,visib,visib_decile) %>%
+ group_by(visib_decile) %>%
+ summarize(mean_delay=mean(dep_delay, na.rm=TRUE),
+ mean_visib=mean(visib,na.rm=TRUE),
+ n=n())
+ )# A tibble: 10 x 4
visib_decile mean_delay mean_visib n
<fct> <dbl> <dbl> <int>
1 [0,1] 30.4 0.547 5449
2 (1,2] 25.5 1.80 5255
3 (2,3] 21.2 2.80 6058
4 (3,4] 19.5 4 2253
5 (4,5] 21.1 5 4809
6 (5,6] 19.8 6 6260
7 (6,7] 19.2 7 7347
8 (7,8] 20.0 8 7614
9 (8,9] 18.8 9 11436
10 (9,10] 11.0 10 278739
> ggplot(flight_weather)+
+ geom_point(aes(x=mean_visib, y=mean_delay, color=visib_decile),
+ size=3)+
+ theme(legend.position = "none") > (june_weather <- flights %>%
+ filter(year == 2013, month == 6, day == 13, !is.na(arr_delay)) %>%
+ group_by(dest) %>%
+ summarize(delay = mean(arr_delay)) %>%
+ inner_join(airports, by = c("dest" = "faa")))# A tibble: 82 x 9
dest delay name lat lon alt tz dst tzone
<chr> <dbl> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 ABQ 64 Albuquerque Internati~ 35.0 -107. 5355 -7 A America/De~
2 ACK 8 Nantucket Mem 41.3 -70.1 48 -5 A America/Ne~
3 ALB -19 Albany Intl 42.7 -73.8 285 -5 A America/Ne~
4 ATL 109. Hartsfield Jackson At~ 33.6 -84.4 1026 -5 A America/Ne~
5 AUS 64.3 Austin Bergstrom Intl 30.2 -97.7 542 -6 A America/Ch~
6 AVL 90 Asheville Regional Ai~ 35.4 -82.5 2165 -5 A America/Ne~
7 BDL 18.5 Bradley Intl 41.9 -72.7 173 -5 A America/Ne~
8 BGR 102 Bangor Intl 44.8 -68.8 192 -5 A America/Ne~
9 BHM 158 Birmingham Intl 33.6 -86.8 644 -6 A America/Ch~
10 BNA 95.9 Nashville Intl 36.1 -86.7 599 -6 A America/Ch~
# ... with 72 more rows
> june_weather <- june_weather %>% filter(lat<55 & lat>22)
>
> ggplot(june_weather) +
+ borders("state") + coord_quickmap() +
+ geom_point(aes(y = lat, x = lon, color = delay, size=delay)) +
+ scale_color_gradientn(colours = rainbow(5))dplyr can accomplish many of the same things as SQL
| dplyr | SQL |
|---|---|
| inner_join(x, y, by = “z”) | SELECT * FROM x INNER JOIN y USING (z) |
| left_join(x, y, by = “z”) | SELECT * FROM x LEFT OUTER JOIN y USING (z) |
| right_join(x, y, by = “z”) | SELECT * FROM x RIGHT OUTER JOIN y USING (z) |
| full_join(x, y, by = “z”) | SELECT * FROM x FULL OUTER JOIN y USING (z) |
| inner_join(x, y, by = c(“a” = “b”)) | SELECT * FROM x INNER JOIN y ON x.a = y.b |
Mutating Joins can also be accomplished using base R
| dplyr | merge |
|---|---|
| inner_join(x, y) | merge(x, y) |
| left_join(x, y) | merge(x, y, all.x = TRUE) |
| right_join(x, y) | merge(x, y, all.y = TRUE) |
| full_join(x, y) | merge(x, y, all.x = TRUE, all.y = TRUE) |
Filtering Joins allow you to filter observations in one table based on matches in another table. There are two types:
If we wanted to view all the flights to the top 10 destinations we could:
flights dataset based on the newly created table.> # Top 10 destinations
> top_dest <- flights %>%
+ count(dest, sort=TRUE) %>%
+ head(10)
> kable(top_dest) %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| dest | n |
|---|---|
| ORD | 17283 |
| ATL | 17215 |
| LAX | 16174 |
| BOS | 15508 |
| MCO | 14082 |
| CLT | 14064 |
| SFO | 13331 |
| FLL | 12055 |
| MIA | 11728 |
| DCA | 9705 |
> # Filter flights based on top_dest
> flights %>%
+ semi_join(top_dest) %>%
+ head(10) %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 8,position="left")%>%
+ row_spec(0,background="peachpuff")| year | month | day | dep_time | sched_dep_time | dep_delay | arr_time | sched_arr_time | arr_delay | carrier | flight | tailnum | origin | dest | air_time | distance | hour | minute | time_hour |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 2013 | 1 | 1 | 542 | 540 | 2 | 923 | 850 | 33 | AA | 1141 | N619AA | JFK | MIA | 160 | 1089 | 5 | 40 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 554 | 600 | -6 | 812 | 837 | -25 | DL | 461 | N668DN | LGA | ATL | 116 | 762 | 6 | 0 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 554 | 558 | -4 | 740 | 728 | 12 | UA | 1696 | N39463 | EWR | ORD | 150 | 719 | 5 | 58 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 555 | 600 | -5 | 913 | 854 | 19 | B6 | 507 | N516JB | EWR | FLL | 158 | 1065 | 6 | 0 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 557 | 600 | -3 | 838 | 846 | -8 | B6 | 79 | N593JB | JFK | MCO | 140 | 944 | 6 | 0 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 558 | 600 | -2 | 753 | 745 | 8 | AA | 301 | N3ALAA | LGA | ORD | 138 | 733 | 6 | 0 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 558 | 600 | -2 | 924 | 917 | 7 | UA | 194 | N29129 | JFK | LAX | 345 | 2475 | 6 | 0 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 558 | 600 | -2 | 923 | 937 | -14 | UA | 1124 | N53441 | EWR | SFO | 361 | 2565 | 6 | 0 | 2013-01-01 06:00:00 |
| 2013 | 1 | 1 | 559 | 559 | 0 | 702 | 706 | -4 | B6 | 1806 | N708JB | JFK | BOS | 44 | 187 | 5 | 59 | 2013-01-01 05:00:00 |
| 2013 | 1 | 1 | 600 | 600 | 0 | 851 | 858 | -7 | B6 | 371 | N595JB | LGA | FLL | 152 | 1076 | 6 | 0 | 2013-01-01 06:00:00 |
Anti-Joins are helpful in determining which values do not match. There are many flights that don’t have a match in planes.
> flights %>%
+ anti_join(planes, by="tailnum") %>%
+ count(tailnum, sort=TRUE) %>%
+ head(10) %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| tailnum | n |
|---|---|
| NA | 2512 |
| N725MQ | 575 |
| N722MQ | 513 |
| N723MQ | 507 |
| N713MQ | 483 |
| N735MQ | 396 |
| N0EGMQ | 371 |
| N534MQ | 364 |
| N542MQ | 363 |
| N531MQ | 349 |
tailnum? What do the tail numbers that don’t have a matching record in planes have in common?Rows: 2,512
Columns: 19
$ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013...
$ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
$ day <int> 2, 2, 3, 3, 4, 4, 5, 7, 8, 9, 9, 10, 10, 11, 12, 12,...
$ dep_time <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ sched_dep_time <int> 1545, 1601, 857, 645, 845, 1830, 840, 820, 1645, 755...
$ dep_delay <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ arr_time <int> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ sched_arr_time <int> 1910, 1735, 1209, 952, 1015, 2044, 1001, 958, 1838, ...
$ arr_delay <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ carrier <chr> "AA", "UA", "UA", "UA", "9E", "9E", "9E", "9E", "US"...
$ flight <int> 133, 623, 714, 719, 3405, 3716, 3422, 3317, 123, 402...
$ tailnum <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ origin <chr> "JFK", "EWR", "EWR", "EWR", "JFK", "EWR", "JFK", "JF...
$ dest <chr> "LAX", "ORD", "MIA", "DFW", "DCA", "DTW", "BOS", "BU...
$ air_time <dbl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
$ distance <dbl> 2475, 719, 1085, 1372, 213, 488, 187, 301, 529, 569,...
$ hour <dbl> 15, 16, 8, 6, 8, 18, 8, 8, 16, 7, 12, 15, 7, 17, 9, ...
$ minute <dbl> 45, 1, 57, 45, 45, 30, 40, 20, 45, 55, 51, 0, 0, 0, ...
$ time_hour <dttm> 2013-01-02 15:00:00, 2013-01-02 16:00:00, 2013-01-0...
flights to only show flights with planes that have flown at least 100 flights.> flights %>% filter(!is.na(tailnum)) %>%
+ group_by(tailnum) %>% mutate(total=n()) %>%
+ filter(total>=100)# A tibble: 228,390 x 20
# Groups: tailnum [1,217]
year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
<int> <int> <int> <int> <int> <dbl> <int> <int>
1 2013 1 1 517 515 2 830 819
2 2013 1 1 533 529 4 850 830
3 2013 1 1 544 545 -1 1004 1022
4 2013 1 1 554 558 -4 740 728
5 2013 1 1 555 600 -5 913 854
6 2013 1 1 557 600 -3 709 723
7 2013 1 1 557 600 -3 838 846
8 2013 1 1 558 600 -2 849 851
9 2013 1 1 558 600 -2 853 856
10 2013 1 1 558 600 -2 923 937
# ... with 228,380 more rows, and 12 more variables: arr_delay <dbl>,
# carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
# air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
# total <int>
fueleconomy::vehicles and fueleconomy::common to find only the records for the most common models.# A tibble: 14,531 x 12
id make model year class trans drive cyl displ fuel hwy cty
<dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
1 1833 Acura Integ~ 1986 Subcom~ Autom~ Front-~ 4 1.6 Regu~ 28 22
2 1834 Acura Integ~ 1986 Subcom~ Manua~ Front-~ 4 1.6 Regu~ 28 23
3 3037 Acura Integ~ 1987 Subcom~ Autom~ Front-~ 4 1.6 Regu~ 28 22
4 3038 Acura Integ~ 1987 Subcom~ Manua~ Front-~ 4 1.6 Regu~ 28 23
5 4183 Acura Integ~ 1988 Subcom~ Autom~ Front-~ 4 1.6 Regu~ 27 22
6 4184 Acura Integ~ 1988 Subcom~ Manua~ Front-~ 4 1.6 Regu~ 28 23
7 5303 Acura Integ~ 1989 Subcom~ Autom~ Front-~ 4 1.6 Regu~ 27 22
8 5304 Acura Integ~ 1989 Subcom~ Manua~ Front-~ 4 1.6 Regu~ 28 23
9 6442 Acura Integ~ 1990 Subcom~ Autom~ Front-~ 4 1.8 Regu~ 24 20
10 6443 Acura Integ~ 1990 Subcom~ Manua~ Front-~ 4 1.8 Regu~ 26 21
# ... with 14,521 more rows
weather data. Can you see any patters?> (flights_48 <- flights %>% filter(!is.na(arr_delay)) %>%
+ arrange(year,month,day) %>%
+ group_by(year,month,day) %>%
+ summarize(mean_delay=mean(arr_delay)) %>%
+ mutate(mean_lag=lag(mean_delay),two_day=mean_lag+mean_delay,
+ dupe=lead(two_day)))# A tibble: 365 x 7
# Groups: year, month [12]
year month day mean_delay mean_lag two_day dupe
<int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 2013 1 1 12.7 NA NA 25.3
2 2013 1 2 12.7 12.7 25.3 18.4
3 2013 1 3 5.73 12.7 18.4 3.80
4 2013 1 4 -1.93 5.73 3.80 -3.46
5 2013 1 5 -1.53 -1.93 -3.46 2.71
6 2013 1 6 4.24 -1.53 2.71 -0.711
7 2013 1 7 -4.95 4.24 -0.711 -8.17
8 2013 1 8 -3.23 -4.95 -8.17 -3.49
9 2013 1 9 -0.264 -3.23 -3.49 -6.16
10 2013 1 10 -5.90 -0.264 -6.16 -10.7
# ... with 355 more rows
# A tibble: 1 x 7
# Groups: year, month [1]
year month day mean_delay mean_lag two_day dupe
<int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 2013 7 23 45.0 62.8 108. 59.0
# A tibble: 1 x 7
# Groups: year, month [1]
year month day mean_delay mean_lag two_day dupe
<int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 2013 7 22 62.8 15.5 78.2 108.
# A tibble: 2 x 7
# Groups: year, month [1]
year month day mean_delay mean_lag two_day dupe
<int> <int> <int> <dbl> <dbl> <dbl> <dbl>
1 2013 7 22 62.8 15.5 78.2 108.
2 2013 7 23 45.0 62.8 108. 59.0
> weather %>% semi_join(three) %>% group_by(day) %>%
+ summarize(avg_temp=mean(temp),
+ avg_wind_speed=mean(wind_speed),
+ avg_precip=mean(precip),
+ avg_visib=mean(visib))# A tibble: 2 x 5
day avg_temp avg_wind_speed avg_precip avg_visib
<int> <dbl> <dbl> <dbl> <dbl>
1 22 78.5 7.96 0.00597 8.91
2 23 80.6 10.1 0.0154 9.33
anti_join(flights,airports,by=c("dest"="faa")) tell you? What does anti_join(airports,flights,by=c("faa"="dest")) tell you?> #flights that have a destination not listed in airports
> anti_join(flights,airports, by=c("dest"="faa")) %>%
+ distinct(dest)# A tibble: 4 x 1
dest
<chr>
1 BQN
2 SJU
3 STT
4 PSE
# A tibble: 1,357 x 8
faa name lat lon alt tz dst tzone
<chr> <chr> <dbl> <dbl> <dbl> <dbl> <chr> <chr>
1 04G Lansdowne Airport 41.1 -80.6 1044 -5 A America/New_Yo~
2 06A Moton Field Municipal A~ 32.5 -85.7 264 -6 A America/Chicago
3 06C Schaumburg Regional 42.0 -88.1 801 -6 A America/Chicago
4 06N Randall Airport 41.4 -74.4 523 -5 A America/New_Yo~
5 09J Jekyll Island Airport 31.1 -81.4 11 -5 A America/New_Yo~
6 0A9 Elizabethton Municipal ~ 36.4 -82.2 1593 -5 A America/New_Yo~
7 0G6 Williams County Airport 41.5 -84.5 730 -5 A America/New_Yo~
8 0G7 Finger Lakes Regional A~ 42.9 -76.8 492 -5 A America/New_Yo~
9 0P2 Shoestring Aviation Air~ 39.8 -76.6 1000 -5 U America/New_Yo~
10 0S9 Jefferson County Intl 48.1 -123. 108 -8 A America/Los_An~
# ... with 1,347 more rows
# A tibble: 4,060 x 2
carrier tailnum
<chr> <chr>
1 UA N14228
2 UA N24211
3 AA N619AA
4 B6 N804JB
5 DL N668DN
6 UA N39463
7 B6 N516JB
8 EV N829AS
9 B6 N593JB
10 AA N3ALAA
# ... with 4,050 more rows
[1] 17
There are also several set functions that can be helpful.
| x | y |
|---|---|
| 1 | 1 |
| 2 | 1 |
| x | y |
|---|---|
| 1 | 1 |
| 1 | 2 |
> intersect(df1,df2) %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| x | y |
|---|---|
| 1 | 1 |
> union(df1,df2) %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| x | y |
|---|---|
| 1 | 1 |
| 2 | 1 |
| 1 | 2 |
> setdiff(df1,df2) %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| x | y |
|---|---|
| 2 | 1 |
> setdiff(df2,df1) %>%
+ kable() %>%
+ kable_styling(bootstrap_options = c("striped","condensed"),
+ full_width = F, font_size = 10,position="left")%>%
+ row_spec(0,background="peachpuff")| x | y |
|---|---|
| 1 | 2 |