## Warning: package 'viridis' was built under R version 4.2.1
## Warning: package 'fueleconomy' was built under R version 4.2.1
airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point() +
coord_quickmap()
avg_dest_delays <-
flights %>%
group_by(dest) %>%
# arrival delay NA's are cancelled flights
summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c(dest = "faa"))
avg_dest_delays %>%
ggplot(aes(lon, lat, colour = delay)) +
borders("state") +
geom_point() +
coord_quickmap()
airport_locations <- airports %>%
select(faa, lat, lon)
flights %>%
select(year:day, hour, origin, dest) %>%
left_join(
airport_locations,
by = c("origin" = "faa")
) %>%
left_join(
airport_locations,
by = c("dest" = "faa"),
suffix = c("_origin", "_dest")
# existing lat and lon variables in tibble gain the _origin suffix
# new lat and lon variables are given _dest suffix
)
## # A tibble: 336,776 × 10
## year month day hour origin dest lat_origin lon_origin lat_dest lon_dest
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH 40.7 -74.2 30.0 -95.3
## 2 2013 1 1 5 LGA IAH 40.8 -73.9 30.0 -95.3
## 3 2013 1 1 5 JFK MIA 40.6 -73.8 25.8 -80.3
## 4 2013 1 1 5 JFK BQN 40.6 -73.8 NA NA
## 5 2013 1 1 6 LGA ATL 40.8 -73.9 33.6 -84.4
## 6 2013 1 1 5 EWR ORD 40.7 -74.2 42.0 -87.9
## 7 2013 1 1 6 EWR FLL 40.7 -74.2 26.1 -80.2
## 8 2013 1 1 6 LGA IAD 40.8 -73.9 38.9 -77.5
## 9 2013 1 1 6 JFK MCO 40.6 -73.8 28.4 -81.3
## 10 2013 1 1 6 LGA ORD 40.8 -73.9 42.0 -87.9
## # … with 336,766 more rows
flights %>%
filter(year == 2013, month == 6, day == 13) %>%
group_by(dest) %>%
summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c("dest" = "faa")) %>%
ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) +
borders("state") +
geom_point() +
coord_quickmap() +
scale_colour_viridis()
## Warning: Removed 3 rows containing missing values (geom_point).
#Find all planes that have flown at least 100 flights.
planes_gte100 <- flights %>%
filter(!is.na(tailnum)) %>%
group_by(tailnum) %>%
count() %>%
filter(n >= 100)
#semi join the data frame of planes that have flown at least 100 flights to the data frame of flights to select the flights by those planes.
flights %>%
semi_join(planes_gte100, by = "tailnum")
## # A tibble: 228,390 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 555 600 -5 913 854
## 6 2013 1 1 557 600 -3 709 723
## 7 2013 1 1 557 600 -3 838 846
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 923 937
## # … with 228,380 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
fueleconomy::vehicles %>%
semi_join(fueleconomy::common, by = c("make", "model"))
## # A tibble: 14,531 × 12
## id make model year class trans drive cyl displ fuel hwy cty
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1833 Acura Integra 1986 Subcompa… Auto… Fron… 4 1.6 Regu… 28 22
## 2 1834 Acura Integra 1986 Subcompa… Manu… Fron… 4 1.6 Regu… 28 23
## 3 3037 Acura Integra 1987 Subcompa… Auto… Fron… 4 1.6 Regu… 28 22
## 4 3038 Acura Integra 1987 Subcompa… Manu… Fron… 4 1.6 Regu… 28 23
## 5 4183 Acura Integra 1988 Subcompa… Auto… Fron… 4 1.6 Regu… 27 22
## 6 4184 Acura Integra 1988 Subcompa… Manu… Fron… 4 1.6 Regu… 28 23
## 7 5303 Acura Integra 1989 Subcompa… Auto… Fron… 4 1.6 Regu… 27 22
## 8 5304 Acura Integra 1989 Subcompa… Manu… Fron… 4 1.6 Regu… 28 23
## 9 6442 Acura Integra 1990 Subcompa… Auto… Fron… 4 1.8 Regu… 24 20
## 10 6443 Acura Integra 1990 Subcompa… Manu… Fron… 4 1.8 Regu… 26 21
## # … with 14,521 more rows
#Group flights by hour of scheduled departure time and calculate the average delay. Then select the 48 observations (hours) with the highest average delay.
worst_hours <- flights %>%
mutate(hour = sched_dep_time %/% 100) %>%
group_by(origin, year, month, day, hour) %>%
summarise(dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
ungroup() %>%
arrange(desc(dep_delay)) %>%
slice(1:48)
## `summarise()` has grouped output by 'origin', 'year', 'month', 'day'. You can
## override using the `.groups` argument.
#Then use semi_join() to get the weather for these hours.
weather_most_delayed <- semi_join(weather, worst_hours,
by = c("origin", "year",
"month", "day", "hour"))
select(weather_most_delayed, temp, wind_speed, precip) %>%
print(n = 48)
## # A tibble: 48 × 3
## temp wind_speed precip
## <dbl> <dbl> <dbl>
## 1 27.0 13.8 0
## 2 28.0 19.6 0
## 3 28.9 28.8 0
## 4 33.8 9.21 0.06
## 5 34.0 8.06 0.05
## 6 80.1 8.06 0
## 7 86 13.8 0
## 8 73.4 6.90 0.08
## 9 84.0 5.75 0
## 10 78.8 18.4 0.23
## 11 53.6 0 0
## 12 60.8 31.1 0.11
## 13 55.4 17.3 0.14
## 14 53.1 9.21 0.01
## 15 55.9 11.5 0.1
## 16 55.4 8.06 0.15
## 17 57.0 29.9 0
## 18 33.8 20.7 0.02
## 19 34.0 19.6 0.01
## 20 36.0 21.9 0.01
## 21 37.9 16.1 0
## 22 32 13.8 0.12
## 23 60.1 33.4 0.14
## 24 60.8 11.5 0.02
## 25 62.1 17.3 0
## 26 66.9 10.4 0
## 27 66.9 13.8 0
## 28 79.0 10.4 0
## 29 77 16.1 0.07
## 30 75.9 13.8 0
## 31 82.4 8.06 0
## 32 86 9.21 0
## 33 80.1 9.21 0
## 34 80.6 11.5 0
## 35 78.1 6.90 0
## 36 75.2 10.4 0.01
## 37 73.9 5.75 0.03
## 38 73.9 8.06 0
## 39 75.0 4.60 0
## 40 75.0 4.60 0.01
## 41 80.1 0 0.01
## 42 80.1 0 0
## 43 77 10.4 0
## 44 82.0 10.4 0
## 45 72.0 13.8 0.3
## 46 72.0 4.60 0.03
## 47 51.1 4.60 0
## 48 54.0 6.90 0
ggplot(weather_most_delayed, aes(x = precip, y = wind_speed, color = temp)) +
geom_point()