library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3 ✓ purrr 0.3.4
## ✓ tibble 3.1.0 ✓ dplyr 1.0.4
## ✓ tidyr 1.1.2 ✓ stringr 1.4.0
## ✓ readr 1.4.0 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(viridis)
## Loading required package: viridisLite
planes
## # A tibble: 3,322 x 9
## tailnum year type manufacturer model engines seats speed engine
## <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
## 1 N10156 2004 Fixed wing m… EMBRAER EMB-1… 2 55 NA Turbo-…
## 2 N102UW 1998 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 3 N103US 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 4 N104UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 5 N10575 2002 Fixed wing m… EMBRAER EMB-1… 2 55 NA Turbo-…
## 6 N105UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 7 N107US 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 8 N108UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 9 N109UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## 10 N110UW 1999 Fixed wing m… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
## # … with 3,312 more rows
#> # A tibble: 3,322 x 9
#> tailnum year type manufacturer model engines seats speed engine
#> <chr> <int> <chr> <chr> <chr> <int> <int> <int> <chr>
#> 1 N10156 2004 Fixed wing mu… EMBRAER EMB-1… 2 55 NA Turbo-…
#> 2 N102UW 1998 Fixed wing mu… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
#> 3 N103US 1999 Fixed wing mu… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
#> 4 N104UW 1999 Fixed wing mu… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
#> 5 N10575 2002 Fixed wing mu… EMBRAER EMB-1… 2 55 NA Turbo-…
#> 6 N105UW 1999 Fixed wing mu… AIRBUS INDUST… A320-… 2 182 NA Turbo-…
#> # … with 3,316 more rows
weather
## # A tibble: 26,115 x 15
## origin year month day hour temp dewp humid wind_dir wind_speed
## <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4
## 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06
## 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5
## 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7
## 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7
## 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5
## 7 EWR 2013 1 1 7 39.0 28.0 64.4 240 15.0
## 8 EWR 2013 1 1 8 39.9 28.0 62.2 250 10.4
## 9 EWR 2013 1 1 9 39.9 28.0 62.2 260 15.0
## 10 EWR 2013 1 1 10 41 28.0 59.6 260 13.8
## # … with 26,105 more rows, and 5 more variables: wind_gust <dbl>, precip <dbl>,
## # pressure <dbl>, visib <dbl>, time_hour <dttm>
#> # A tibble: 26,115 x 15
#> origin year month day hour temp dewp humid wind_dir wind_speed wind_gust
#> <chr> <int> <int> <int> <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
#> 1 EWR 2013 1 1 1 39.0 26.1 59.4 270 10.4 NA
#> 2 EWR 2013 1 1 2 39.0 27.0 61.6 250 8.06 NA
#> 3 EWR 2013 1 1 3 39.0 28.0 64.4 240 11.5 NA
#> 4 EWR 2013 1 1 4 39.9 28.0 62.2 250 12.7 NA
#> 5 EWR 2013 1 1 5 39.0 28.0 64.4 260 12.7 NA
#> 6 EWR 2013 1 1 6 37.9 28.0 67.2 240 11.5 NA
#> # … with 26,109 more rows, and 4 more variables: precip <dbl>, pressure <dbl>,
#> # visib <dbl>, time_hour <dttm>
flights_latlon <- flights %>%
inner_join(select(airports, origin = faa, origin_lat = lat, origin_lon = lon),
by = "origin"
) %>%
inner_join(select(airports, dest = faa, dest_lat = lat, dest_lon = lon),
by = "dest"
)
flights_latlon %>%
slice(1:100) %>%
ggplot(aes(
x = origin_lon, xend = dest_lon,
y = origin_lat, yend = dest_lat
)) +
borders("state") +
geom_segment(arrow = arrow(length = unit(0.1, "cm"))) +
coord_quickmap() +
labs(y = "Latitude", x = "Longitude")
airports %>%
semi_join(flights, c("faa" = "dest")) %>%
ggplot(aes(lon, lat)) +
borders("state") +
geom_point() +
coord_quickmap()
## 1.a Adding color of the points to display the average delay for each airport.
avg_dest_delays <-
flights %>%
group_by(dest) %>%
summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c(dest = "faa"))
avg_dest_delays %>%
ggplot(aes(lon, lat, colour = delay)) +
borders("state") +
geom_point() +
coord_quickmap()
## 2.Add the location of the origin and destination (i.e. the lat and lon) to flights
airport_locations <- airports %>%
select(faa, lat, lon)
flights %>%
select(year:day, hour, origin, dest) %>%
left_join(
airport_locations,
by = c("origin" = "faa")
) %>%
left_join(
airport_locations,
by = c("dest" = "faa")
)
## # A tibble: 336,776 x 10
## year month day hour origin dest lat.x lon.x lat.y lon.y
## <int> <int> <int> <dbl> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH 40.7 -74.2 30.0 -95.3
## 2 2013 1 1 5 LGA IAH 40.8 -73.9 30.0 -95.3
## 3 2013 1 1 5 JFK MIA 40.6 -73.8 25.8 -80.3
## 4 2013 1 1 5 JFK BQN 40.6 -73.8 NA NA
## 5 2013 1 1 6 LGA ATL 40.8 -73.9 33.6 -84.4
## 6 2013 1 1 5 EWR ORD 40.7 -74.2 42.0 -87.9
## 7 2013 1 1 6 EWR FLL 40.7 -74.2 26.1 -80.2
## 8 2013 1 1 6 LGA IAD 40.8 -73.9 38.9 -77.5
## 9 2013 1 1 6 JFK MCO 40.6 -73.8 28.4 -81.3
## 10 2013 1 1 6 LGA ORD 40.8 -73.9 42.0 -87.9
## # … with 336,766 more rows
Will do analysis on the planes age >25
plane_cohorts <- inner_join(flights,
select(planes, tailnum, plane_year = year),
by = "tailnum"
) %>%
mutate(age = year - plane_year) %>%
filter(!is.na(age)) %>%
mutate(age = if_else(age > 25, 25L, age)) %>%
group_by(age) %>%
summarise(
dep_delay_mean = mean(dep_delay, na.rm = TRUE),
dep_delay_sd = sd(dep_delay, na.rm = TRUE),
arr_delay_mean = mean(arr_delay, na.rm = TRUE),
arr_delay_sd = sd(arr_delay, na.rm = TRUE),
n_arr_delay = sum(!is.na(arr_delay)),
n_dep_delay = sum(!is.na(dep_delay))
)
ggplot(plane_cohorts, aes(x = age, y = dep_delay_mean)) +
geom_point() +
scale_x_continuous("Age of plane (years)", breaks = seq(0, 30, by = 10)) +
scale_y_continuous("Mean Departure Delay (minutes)")
ggplot(plane_cohorts, aes(x = age, y = arr_delay_mean)) +
geom_point() +
scale_x_continuous("Age of Plane (years)", breaks = seq(0, 30, by = 10)) +
scale_y_continuous("Mean Arrival Delay (minutes)")
## 4. What weather conditions make it more likely to see a delay?
flight_weather <-
flights %>%
inner_join(weather, by = c(
"origin" = "origin",
"year" = "year",
"month" = "month",
"day" = "day",
"hour" = "hour"
))
flight_weather %>%
group_by(precip) %>%
summarise(delay = mean(dep_delay, na.rm = TRUE)) %>%
ggplot(aes(x = precip, y = delay)) +
geom_line() + geom_point()
## 5.What happened on June 13, 2013? Display the spatial pattern of delays, and then use Google to cross-reference with the weather.
flights %>%
filter(year == 2013, month == 6, day == 13) %>%
group_by(dest) %>%
summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
inner_join(airports, by = c("dest" = "faa")) %>%
ggplot(aes(y = lat, x = lon, size = delay, colour = delay)) +
borders("state") +
geom_point() +
coord_quickmap() +
scale_colour_viridis()
## Warning: Removed 3 rows containing missing values (geom_point).
Flights that have a missing tailnum all have missing values of arr_time, meaning that the flight was canceled.
flights %>%
filter(is.na(tailnum), !is.na(arr_time)) %>%
nrow()
## [1] 0
#> [1] 0
flights %>%
filter(!is.na(tailnum)) %>%
group_by(tailnum) %>%
mutate(n = n()) %>%
filter(n >= 100)
## # A tibble: 228,390 x 20
## # Groups: tailnum [1,217]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 544 545 -1 1004 1022
## 4 2013 1 1 554 558 -4 740 728
## 5 2013 1 1 555 600 -5 913 854
## 6 2013 1 1 557 600 -3 709 723
## 7 2013 1 1 557 600 -3 838 846
## 8 2013 1 1 558 600 -2 849 851
## 9 2013 1 1 558 600 -2 853 856
## 10 2013 1 1 558 600 -2 923 937
## # … with 228,380 more rows, and 12 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## # n <int>
fueleconomy::vehicles %>%
semi_join(fueleconomy::common, by = c('make', 'model'))
## # A tibble: 14,531 x 12
## id make model year class trans drive cyl displ fuel hwy cty
## <dbl> <chr> <chr> <dbl> <chr> <chr> <chr> <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1833 Acura Integ… 1986 Subcom… Autom… Front-… 4 1.6 Regu… 28 22
## 2 1834 Acura Integ… 1986 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 3 3037 Acura Integ… 1987 Subcom… Autom… Front-… 4 1.6 Regu… 28 22
## 4 3038 Acura Integ… 1987 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 5 4183 Acura Integ… 1988 Subcom… Autom… Front-… 4 1.6 Regu… 27 22
## 6 4184 Acura Integ… 1988 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 7 5303 Acura Integ… 1989 Subcom… Autom… Front-… 4 1.6 Regu… 27 22
## 8 5304 Acura Integ… 1989 Subcom… Manua… Front-… 4 1.6 Regu… 28 23
## 9 6442 Acura Integ… 1990 Subcom… Autom… Front-… 4 1.8 Regu… 24 20
## 10 6443 Acura Integ… 1990 Subcom… Manua… Front-… 4 1.8 Regu… 26 21
## # … with 14,521 more rows
library(zoo)
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
flights %>%
group_by(year, month, day, hour) %>%
summarise(delay = mean(arr_delay, na.rm = TRUE)) %>%
ungroup() %>%
mutate(mov_ave = rollapply(delay, 48, mean, na.rm = TRUE, align = "right", fill = NA)) %>%
arrange(mov_ave %>% desc) %>%
select(-delay) %>%
head(10) %>%
kable()
## `summarise()` has grouped output by 'year', 'month', 'day'. You can override using the `.groups` argument.
| year | month | day | hour | mov_ave |
|---|---|---|---|---|
| 2013 | 7 | 23 | 23 | 54.29261 |
| 2013 | 7 | 24 | 5 | 54.07572 |
| 2013 | 6 | 28 | 23 | 53.97791 |
| 2013 | 7 | 24 | 6 | 53.93745 |
| 2013 | 6 | 29 | 5 | 53.38781 |
| 2013 | 7 | 24 | 7 | 53.37115 |
| 2013 | 7 | 23 | 22 | 53.23787 |
| 2013 | 7 | 10 | 23 | 53.03761 |
| 2013 | 7 | 24 | 8 | 52.79645 |
| 2013 | 6 | 29 | 6 | 52.76755 |
###5.What does anti_join(flights, airports, by = c(“dest” = “faa”)) tell you? What does anti_join(airports, flights, by = c(“faa” = “dest”)) tell you?
anti_join(flights, airports, by = c(“dest” = “faa”)) shows flight whose destinations are not included in the airports database.
anti_join(airports, flights, by = c(“faa” = “dest”)) shows airport names and locations that flights from flights are not flying to.
###6.You might expect that there’s an implicit relationship between plane and airline, because each plane is flown by a single airline. Confirm or reject this hypothesis using the tools you’ve learned above.
flights %>%
select(carrier, tailnum) %>%
group_by(tailnum) %>%
summarize(n = length(unique(carrier))) %>%
filter(n > 1)
## # A tibble: 18 x 2
## tailnum n
## <chr> <int>
## 1 N146PQ 2
## 2 N153PQ 2
## 3 N176PQ 2
## 4 N181PQ 2
## 5 N197PQ 2
## 6 N200PQ 2
## 7 N228PQ 2
## 8 N232PQ 2
## 9 N933AT 2
## 10 N935AT 2
## 11 N977AT 2
## 12 N978AT 2
## 13 N979AT 2
## 14 N981AT 2
## 15 N989AT 2
## 16 N990AT 2
## 17 N994AT 2
## 18 <NA> 7