tidyverse and the
nycflights13 datasetlibrary(tidyverse)
library(nycflights13)
library(glue)
Before dealing with the data, I will create a *_dt
(datetime) column with precise date and time of the flight departure and
arrival dates and times, for both scheduled and real ones.
# function to create datetimes from a military time column (522 --> 17:22 hours)
create_dt <- function(dataframe, time){
dataframe |>
mutate(datetime = make_datetime(
year = year, month = month, day = day,
hour = {{time}} %/% 100,
min = {{time}} %% 100)
) |>
pull(datetime)
}
I will make some changes and reorder the columns.
flights <- flights |>
mutate(dep_dt = create_dt(flights, dep_time),
arr_dt = create_dt(flights, arr_time),
sched_dep_dt = create_dt(flights, sched_dep_time),
sched_arr_dt = create_dt(flights, sched_arr_time),
.before=dep_time) |>
select(-c(dep_time, arr_time, sched_arr_time, sched_dep_time, time_hour)) |>
select(origin, dest, flight, tailnum, carrier, year:day, hour, minute,
sched_dep_dt, dep_dt, sched_arr_dt, arr_dt,
dep_delay:distance)
These focus on adding information from one table to another using exact key matches.
Use a join to add the full name of the airline (from the airlines table) to the flights table. Which carrier has the highest average departure delay?
flights |> left_join(airlines) |>
rename(carrier_code = carrier, carrier = name) |>
group_by(carrier) |>
summarise(avg_dep_delay = round( mean(dep_delay, na.rm = T), 2)) |>
arrange(-avg_dep_delay) |>
head(1)
## # A tibble: 1 × 2
## carrier avg_dep_delay
## <chr> <dbl>
## 1 Frontier Airlines Inc. 20.2
Join flights and airports so that you only keep flights where the destination airport has a corresponding entry in the airports table. Why might some flights be dropped? (Hint: check the dest and faa columns).
flights |> inner_join(airports, join_by(dest==faa))
## # A tibble: 329,174 × 25
## origin dest flight tailnum carrier year month day hour minute
## <chr> <chr> <int> <chr> <chr> <int> <int> <int> <dbl> <dbl>
## 1 EWR IAH 1545 N14228 UA 2013 1 1 5 15
## 2 LGA IAH 1714 N24211 UA 2013 1 1 5 29
## 3 JFK MIA 1141 N619AA AA 2013 1 1 5 40
## 4 LGA ATL 461 N668DN DL 2013 1 1 6 0
## 5 EWR ORD 1696 N39463 UA 2013 1 1 5 58
## 6 EWR FLL 507 N516JB B6 2013 1 1 6 0
## 7 LGA IAD 5708 N829AS EV 2013 1 1 6 0
## 8 JFK MCO 79 N593JB B6 2013 1 1 6 0
## 9 LGA ORD 301 N3ALAA AA 2013 1 1 6 0
## 10 JFK PBI 49 N793JB B6 2013 1 1 6 0
## # ℹ 329,164 more rows
## # ℹ 15 more variables: sched_dep_dt <dttm>, dep_dt <dttm>, sched_arr_dt <dttm>,
## # arr_dt <dttm>, dep_delay <dbl>, arr_delay <dbl>, air_time <dbl>,
## # distance <dbl>, name <chr>, lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>,
## # dst <chr>, tzone <chr>
total_flights <- nrow(flights)
valid_faa_destinations_count <- nrow(flights |> inner_join(airports, join_by(dest == faa)))
glue("There are a total of {total_flights} flights.")
## There are a total of 336776 flights.
glue("Only {valid_faa_destinations_count} ({round(valid_faa_destinations_count / total_flights * 100,2)}%) of them have a valid destination according to the FAA code.")
## Only 329174 (97.74%) of them have a valid destination according to the FAA code.
Join flights with the planes table. Calculate the age of the plane at the time of each flight (Flight Year - Manufacture Year) and find the correlation between plane age and arr_delay.
data <- flights |> left_join(planes, join_by(tailnum),
suffix=c("_flight", "_plane"))|>
select(flight, dep_dt, arr_delay, tailnum, year_flight, year_plane) |>
mutate(plane_age = year_flight - year_plane) |>
select(-c(year_plane, year_flight)) |>
relocate(arr_delay, .after=last_col()) |>
arrange(dep_dt, flight)
data
## # A tibble: 336,776 × 5
## flight dep_dt tailnum plane_age arr_delay
## <int> <dttm> <chr> <int> <dbl>
## 1 1545 2013-01-01 05:17:00 N14228 14 11
## 2 1714 2013-01-01 05:33:00 N24211 15 20
## 3 1141 2013-01-01 05:42:00 N619AA 23 33
## 4 725 2013-01-01 05:44:00 N804JB 1 -18
## 5 461 2013-01-01 05:54:00 N668DN 22 -25
## 6 1696 2013-01-01 05:54:00 N39463 1 12
## 7 507 2013-01-01 05:55:00 N516JB 13 19
## 8 79 2013-01-01 05:57:00 N593JB 9 -8
## 9 5708 2013-01-01 05:57:00 N829AS 15 -14
## 10 49 2013-01-01 05:58:00 N793JB 2 -2
## # ℹ 336,766 more rows
data |> group_by(plane_age) |>
summarise(avg_arrival_delay = mean(arr_delay, na.rm=T)) |>
ggplot(aes(x=plane_age, y=avg_arrival_delay))+
geom_point(color="blue") +
geom_smooth(method="lm", se=F, color="red") +
scale_y_continuous(limits = c(-5,15)) +
labs(x="Plane age", y="average arrival delay, minutes") +
theme_light()
Use a full_join between a subset of weather (e.g., only Jan 1st) and a subset of flights (only Jan 1st). Identify if there are any hours where we have weather data but no flights, or flights but no weather data.
flights |> filter(month==1 & day==1) |>
select(flight, month, day, dep_dt) |>
full_join(weather|> filter(month==1 & day==1)) |>
select( -year, -hour) |>
filter(if_any(everything(), is.na)) |>
select(-c(month, day, wind_gust))
## # A tibble: 34,626 × 12
## flight dep_dt origin temp dewp humid wind_dir wind_speed
## <int> <dttm> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1545 2013-01-01 05:17:00 EWR 39.0 26.1 59.4 270 10.4
## 2 1545 2013-01-01 05:17:00 EWR 39.0 27.0 61.6 250 8.06
## 3 1545 2013-01-01 05:17:00 EWR 39.0 28.0 64.4 240 11.5
## 4 1545 2013-01-01 05:17:00 EWR 39.9 28.0 62.2 250 12.7
## 5 1545 2013-01-01 05:17:00 EWR 39.0 28.0 64.4 260 12.7
## 6 1545 2013-01-01 05:17:00 EWR 37.9 28.0 67.2 240 11.5
## 7 1545 2013-01-01 05:17:00 EWR 39.0 28.0 64.4 240 15.0
## 8 1545 2013-01-01 05:17:00 EWR 39.9 28.0 62.2 250 10.4
## 9 1545 2013-01-01 05:17:00 EWR 39.9 28.0 62.2 260 15.0
## 10 1545 2013-01-01 05:17:00 EWR 41 28.0 59.6 260 13.8
## # ℹ 34,616 more rows
## # ℹ 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>
Join flights and weather using all common time variables (year, month, day, hour) and the origin airport. Calculate the average delay for flights departing in “high wind” conditions (e.g., wind speed > 20).
flights |> inner_join(weather, join_by(year, month, day, hour, origin)) |>
filter(wind_speed > 20) |>
summarise(avg_dep_delay_minutes = round(mean(dep_delay, na.rm=T),2) )
## # A tibble: 1 × 1
## avg_dep_delay_minutes
## <dbl>
## 1 16.9
These focus on using one table to filter another without adding new columns.
Create a table of the top 10 most frequent destination airports. Use a semi_join to filter the flights table to only include trips to these top 10 destinations.
most_frequent_destinations <- flights |> count(dest, sort=T, name="flights") |> head(10)
most_frequent_destinations
## # A tibble: 10 × 2
## dest flights
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705
flights |> semi_join(most_frequent_destinations) |>
select(flight, origin, dest, tailnum, dep_dt) |>
arrange(dep_dt, flight)
## # A tibble: 141,145 × 5
## flight origin dest tailnum dep_dt
## <int> <chr> <chr> <chr> <dttm>
## 1 1141 JFK MIA N619AA 2013-01-01 05:42:00
## 2 461 LGA ATL N668DN 2013-01-01 05:54:00
## 3 1696 EWR ORD N39463 2013-01-01 05:54:00
## 4 507 EWR FLL N516JB 2013-01-01 05:55:00
## 5 79 JFK MCO N593JB 2013-01-01 05:57:00
## 6 194 JFK LAX N29129 2013-01-01 05:58:00
## 7 301 LGA ORD N3ALAA 2013-01-01 05:58:00
## 8 1124 EWR SFO N53441 2013-01-01 05:58:00
## 9 1806 JFK BOS N708JB 2013-01-01 05:59:00
## 10 371 LGA FLL N595JB 2013-01-01 06:00:00
## # ℹ 141,135 more rows
Use an anti_join to find all flights that go to a destination (dest) that is not listed in the airports table. How many unique destinations are missing from the metadata?
flights |> anti_join(airports, join_by(dest==faa)) |>
select(flight, tailnum, origin, dest, dep_dt) |>
arrange(dep_dt, flight)
## # A tibble: 7,602 × 5
## flight tailnum origin dest dep_dt
## <int> <chr> <chr> <chr> <dttm>
## 1 725 N804JB JFK BQN 2013-01-01 05:44:00
## 2 709 N794JB JFK SJU 2013-01-01 06:15:00
## 3 413 N3BAAA JFK SJU 2013-01-01 06:28:00
## 4 1203 N77296 EWR SJU 2013-01-01 07:01:00
## 5 715 N651JB JFK SJU 2013-01-01 07:11:00
## 6 301 N900PC JFK SJU 2013-01-01 08:20:00
## 7 717 N527JB JFK SJU 2013-01-01 08:20:00
## 8 1357 N5FSAA JFK SJU 2013-01-01 08:40:00
## 9 655 N5EXAA JFK STT 2013-01-01 09:09:00
## 10 1519 N24715 EWR STT 2013-01-01 09:13:00
## # ℹ 7,592 more rows
How many unique destinations are missing from the metadata? (destinations which are not listed in the airports dataset).
flights |> anti_join(airports, join_by(dest == faa)) |>
distinct(dest)
## # A tibble: 4 × 1
## dest
## <chr>
## 1 BQN
## 2 SJU
## 3 STT
## 4 PSE
Note: These require dplyr 1.1.0 or later using the
join_by() function.
For every flight, find all weather observations at the origin airport
that occurred up to 5 hours before the flight’s departure. (Use
join_by(origin, time_hour > time_hour_weather)).
# weather dataset only records the datetime rounded to hours
# I need to mutate the dep_dt from flights to round to hours
flights |> mutate(dep_dt = round_date(dep_dt, unit="hour"),
last_48h = dep_dt - hours(5)) |>
select(flight, dep_dt, origin, last_48h) |>
left_join(weather, join_by(origin,
dep_dt >= time_hour,
last_48h <= time_hour)
) |>
select(-c(last_48h, year, month, day, hour)) |>
select(flight, origin, dep_dt, weather_dt = time_hour,
temp:last_col())
## # A tibble: 1,970,678 × 13
## flight origin dep_dt weather_dt temp dewp humid
## <int> <chr> <dttm> <dttm> <dbl> <dbl> <dbl>
## 1 1545 EWR 2013-01-01 05:00:00 NA NA NA NA
## 2 1714 LGA 2013-01-01 06:00:00 2013-01-01 01:00:00 39.9 26.1 57.3
## 3 1141 JFK 2013-01-01 06:00:00 2013-01-01 01:00:00 39.0 26.1 59.4
## 4 725 JFK 2013-01-01 06:00:00 2013-01-01 01:00:00 39.0 26.1 59.4
## 5 461 LGA 2013-01-01 06:00:00 2013-01-01 01:00:00 39.9 26.1 57.3
## 6 1696 EWR 2013-01-01 06:00:00 2013-01-01 01:00:00 39.0 26.1 59.4
## 7 507 EWR 2013-01-01 06:00:00 2013-01-01 01:00:00 39.0 26.1 59.4
## 8 5708 LGA 2013-01-01 06:00:00 2013-01-01 01:00:00 39.9 26.1 57.3
## 9 79 JFK 2013-01-01 06:00:00 2013-01-01 01:00:00 39.0 26.1 59.4
## 10 301 LGA 2013-01-01 06:00:00 2013-01-01 01:00:00 39.9 26.1 57.3
## # ℹ 1,970,668 more rows
## # ℹ 6 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>
Sometimes weather isn’t recorded exactly on the hour. For each flight, join the weather data where the weather timestamp is within a 30-minute window (before or after) of the flight’s scheduled time_hour.
flights |>
mutate(
# Create a proper datetime object
before_30min = sched_dep_dt - minutes(30),
after_30min = sched_dep_dt + minutes(30)) |>
left_join(weather,
join_by(origin,
before_30min <= time_hour,
after_30min >= time_hour)) |>
select(flight, dep_dt, weather_dt = time_hour,
!ends_with(c(".x", ".y", "time", "min")))
## # A tibble: 370,484 × 23
## flight dep_dt weather_dt origin dest tailnum carrier
## <int> <dttm> <dttm> <chr> <chr> <chr> <chr>
## 1 1545 2013-01-01 05:17:00 NA EWR IAH N14228 UA
## 2 1714 2013-01-01 05:33:00 NA LGA IAH N24211 UA
## 3 1141 2013-01-01 05:42:00 2013-01-01 01:00:00 JFK MIA N619AA AA
## 4 725 2013-01-01 05:44:00 2013-01-01 01:00:00 JFK BQN N804JB B6
## 5 461 2013-01-01 05:54:00 2013-01-01 01:00:00 LGA ATL N668DN DL
## 6 1696 2013-01-01 05:54:00 2013-01-01 01:00:00 EWR ORD N39463 UA
## 7 507 2013-01-01 05:55:00 2013-01-01 01:00:00 EWR FLL N516JB B6
## 8 5708 2013-01-01 05:57:00 2013-01-01 01:00:00 LGA IAD N829AS EV
## 9 79 2013-01-01 05:57:00 2013-01-01 01:00:00 JFK MCO N593JB B6
## 10 301 2013-01-01 05:58:00 2013-01-01 01:00:00 LGA ORD N3ALAA AA
## # ℹ 370,474 more rows
## # ℹ 16 more variables: minute <dbl>, sched_dep_dt <dttm>, sched_arr_dt <dttm>,
## # arr_dt <dttm>, dep_delay <dbl>, arr_delay <dbl>, distance <dbl>,
## # temp <dbl>, dewp <dbl>, humid <dbl>, wind_dir <dbl>, wind_speed <dbl>,
## # wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>
For each flight, find the next flight departing from the same origin.
Use a non-equi join where
flight_A$time_hour < flight_B$time_hour.
flights |>
mutate(
next_flight_within_15min = dep_dt + minutes(15)
) |>
select(origin, flight, tailnum, dep_dt, next_flight_within_15min) |>
left_join(flights, join_by(origin,
dep_dt < dep_dt,
next_flight_within_15min > dep_dt),
suffix=c("", "_next")) |>
select(origin, flight, tailnum, dep_dt,
flight_next, tailnum_next, dep_dt_next)
## # A tibble: 1,468,999 × 7
## origin flight tailnum dep_dt flight_next tailnum_next
## <chr> <int> <chr> <dttm> <int> <chr>
## 1 EWR 1545 N14228 2013-01-01 05:17:00 NA <NA>
## 2 LGA 1714 N24211 2013-01-01 05:33:00 NA <NA>
## 3 JFK 1141 N619AA 2013-01-01 05:42:00 725 N804JB
## 4 JFK 725 N804JB 2013-01-01 05:44:00 79 N593JB
## 5 JFK 725 N804JB 2013-01-01 05:44:00 49 N793JB
## 6 JFK 725 N804JB 2013-01-01 05:44:00 71 N657JB
## 7 JFK 725 N804JB 2013-01-01 05:44:00 194 N29129
## 8 LGA 461 N668DN 2013-01-01 05:54:00 5708 N829AS
## 9 LGA 461 N668DN 2013-01-01 05:54:00 301 N3ALAA
## 10 LGA 461 N668DN 2013-01-01 05:54:00 707 N3DUAA
## # ℹ 1,468,989 more rows
## # ℹ 1 more variable: dep_dt_next <dttm>
Calculate the time difference between consecutive departures at JFK.
flights_JFK <- flights |> filter(origin == "JFK") |>
mutate(
next_dt_within_15min = dep_dt + minutes(30)
) |>
select(flight, tailnum, dep_dt, next_dt_within_15min)
flights_JFK |> left_join(flights_JFK, join_by(dep_dt < dep_dt,
next_dt_within_15min >= dep_dt),
suffix=c("", "_next")) |>
arrange(dep_dt, flight, dep_dt_next) |>
select(!starts_with("next_dt_within_15min"))
## # A tibble: 1,010,452 × 6
## flight tailnum dep_dt flight_next tailnum_next
## <int> <chr> <dttm> <int> <chr>
## 1 1141 N619AA 2013-01-01 05:42:00 725 N804JB
## 2 1141 N619AA 2013-01-01 05:42:00 79 N593JB
## 3 1141 N619AA 2013-01-01 05:42:00 49 N793JB
## 4 1141 N619AA 2013-01-01 05:42:00 71 N657JB
## 5 1141 N619AA 2013-01-01 05:42:00 194 N29129
## 6 1141 N619AA 2013-01-01 05:42:00 1806 N708JB
## 7 1141 N619AA 2013-01-01 05:42:00 1743 N3739P
## 8 1141 N619AA 2013-01-01 05:42:00 303 N532UA
## 9 725 N804JB 2013-01-01 05:44:00 79 N593JB
## 10 725 N804JB 2013-01-01 05:44:00 49 N793JB
## # ℹ 1,010,442 more rows
## # ℹ 1 more variable: dep_dt_next <dttm>