knitr::opts_chunk$set(warning = FALSE)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.5 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.0.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
data(flights)
data(airlines)
data(airports)
data(weather)
#1.
flights %>% filter(month ==1, day==1)
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ... with 832 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#2.
flights %>%
select(month,day,flight, arr_delay) %>%
filter(arr_delay > 120)
## # A tibble: 10,034 x 4
## month day flight arr_delay
## <int> <int> <int> <dbl>
## 1 1 1 4576 137
## 2 1 1 3944 851
## 3 1 1 856 123
## 4 1 1 1086 145
## 5 1 1 4497 127
## 6 1 1 525 125
## 7 1 1 4181 136
## 8 1 1 5712 123
## 9 1 1 4092 123
## 10 1 1 4622 138
## # ... with 10,024 more rows
#3.
arrange(flights,desc(arr_delay))
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## 2 2013 6 15 1432 1935 1137 1607 2120
## 3 2013 1 10 1121 1635 1126 1239 1810
## 4 2013 9 20 1139 1845 1014 1457 2210
## 5 2013 7 22 845 1600 1005 1044 1815
## 6 2013 4 10 1100 1900 960 1342 2211
## 7 2013 3 17 2321 810 911 135 1020
## 8 2013 7 22 2257 759 898 121 1026
## 9 2013 12 5 756 1700 896 1058 2020
## 10 2013 5 3 1133 2055 878 1250 2215
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#The worst arrival delay time
flights[which.max(flights$arr_delay),]
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
#4.
flights %>%
select(carrier,arr_delay) %>%
group_by(carrier) %>%
summarise(delay_time = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(delay_time))
## # A tibble: 16 x 2
## carrier delay_time
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
#5
flights %>%
group_by(hour) %>%
summarise(average_delay_time=mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(average_delay_time))
## # A tibble: 20 x 2
## hour average_delay_time
## <dbl> <dbl>
## 1 21 18.4
## 2 20 16.7
## 3 19 16.7
## 4 17 16.0
## 5 22 16.0
## 6 18 14.8
## 7 16 12.6
## 8 15 12.3
## 9 23 11.8
## 10 14 9.20
## 11 13 6.54
## 12 12 3.49
## 13 11 1.48
## 14 10 0.954
## 15 8 -1.11
## 16 9 -1.45
## 17 6 -3.38
## 18 5 -4.80
## 19 7 -5.30
## 20 1 NaN
#6.
flights %>% slice(1:100) %>%
select(year:day, hour, origin, dest, tailnum, carrier) %>%
left_join(y=airlines,by="carrier") %>% head()
## # A tibble: 6 x 9
## year month day hour origin dest tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 LGA IAH N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 JFK MIA N619AA AA American Airlines Inc.
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlue Airways
## 5 2013 1 1 6 LGA ATL N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 EWR ORD N39463 UA United Air Lines Inc.
answer06 <- flights %>% slice(1:100) %>%
select(year:day, hour, origin, dest, tailnum, carrier) %>%
left_join(y=airlines,by="carrier") %>% head()
#7.
answer06 %>% left_join(weather) %>% left_join(airports, by = c("dest" = "faa"))
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 6 x 26
## year month day hour origin dest tailnum carrier name.x temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA United~ 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA United~ 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA Americ~ 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlu~ 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL Delta ~ 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA United~ 39.0 28.0 64.4
## # ... with 14 more variables: wind_dir <dbl>, wind_speed <dbl>,
## # wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>, name.y <chr>, lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>,
## # dst <chr>, tzone <chr>
#8.
flights %>% filter(dest%in% c("ALB", "BDL", "BTV")) %>%
group_by(dest, month) %>%
summarise(quantity_flights=n())
## `summarise()` has grouped output by 'dest'. You can override using the `.groups` argument.
## # A tibble: 36 x 3
## # Groups: dest [3]
## dest month quantity_flights
## <chr> <int> <int>
## 1 ALB 1 64
## 2 ALB 2 58
## 3 ALB 3 57
## 4 ALB 4 13
## 5 ALB 5 59
## 6 ALB 6 34
## 7 ALB 7 15
## 8 ALB 8 20
## 9 ALB 9 20
## 10 ALB 10 1
## # ... with 26 more rows
#9.
flights %>% select(carrier, month, origin, dep_delay) %>%
group_by(carrier, month, origin) %>%
summarise(average_delay=mean(dep_delay, na.rm =TRUE))
## `summarise()` has grouped output by 'carrier', 'month'. You can override using the `.groups` argument.
## # A tibble: 399 x 4
## # Groups: carrier, month [185]
## carrier month origin average_delay
## <chr> <int> <chr> <dbl>
## 1 9E 1 EWR 12.9
## 2 9E 1 JFK 17.1
## 3 9E 1 LGA 17.4
## 4 9E 2 EWR -1.18
## 5 9E 2 JFK 18.0
## 6 9E 2 LGA 6.08
## 7 9E 3 EWR 5.6
## 8 9E 3 JFK 14.4
## 9 9E 3 LGA 6.95
## 10 9E 4 EWR 5.87
## # ... with 389 more rows