rm(list=ls())
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.4 ✓ stringr 1.4.0
## ✓ readr 2.1.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
data(flights)
data(airlines)
data(airports)
data(weather)
force(flights)
## # A tibble: 336,776 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#Q1: we should use filter
flights %>% filter(month == 1, day == 1)
## # A tibble: 842 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#Q2:
flights %>% filter(arr_delay>=120) %>%
select(year:day, arr_delay, everything())
## # A tibble: 10,200 × 19
## year month day arr_delay dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dbl> <int> <int> <dbl> <int>
## 1 2013 1 1 137 811 630 101 1047
## 2 2013 1 1 851 848 1835 853 1001
## 3 2013 1 1 123 957 733 144 1056
## 4 2013 1 1 145 1114 900 134 1447
## 5 2013 1 1 127 1505 1310 115 1638
## 6 2013 1 1 125 1525 1340 105 1831
## 7 2013 1 1 136 1549 1445 64 1912
## 8 2013 1 1 123 1558 1359 119 1718
## 9 2013 1 1 123 1732 1630 62 2028
## 10 2013 1 1 138 1803 1620 103 2008
## # … with 10,190 more rows, and 11 more variables: sched_arr_time <int>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#Q3:
flights %>% arrange(desc(arr_delay)) %>%
select(carrier, flight, arr_delay, everything()) %>%
slice(1)
## # A tibble: 1 × 19
## carrier flight arr_delay year month day dep_time sched_dep_time dep_delay
## <chr> <int> <dbl> <int> <int> <int> <int> <int> <dbl>
## 1 HA 51 1272 2013 1 9 641 900 1301
## # … with 10 more variables: arr_time <int>, sched_arr_time <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
#Q4:
flights %>% group_by(carrier) %>%
summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(arr_delay))
## # A tibble: 16 × 2
## carrier arr_delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
#Q5:
flights %>% group_by(hour) %>%
summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
arrange(desc(arr_delay))
## # A tibble: 20 × 2
## hour arr_delay
## <dbl> <dbl>
## 1 21 18.4
## 2 20 16.7
## 3 19 16.7
## 4 17 16.0
## 5 22 16.0
## 6 18 14.8
## 7 16 12.6
## 8 15 12.3
## 9 23 11.8
## 10 14 9.20
## 11 13 6.54
## 12 12 3.49
## 13 11 1.48
## 14 10 0.954
## 15 8 -1.11
## 16 9 -1.45
## 17 6 -3.38
## 18 5 -4.80
## 19 7 -5.30
## 20 1 NaN
#Q6
flights %>% slice(1:100) %>%
select(year:day, hour, origin, dest, tailnum, carrier) %>%
left_join(y = airlines, by = "carrier") %>%
head()
## # A tibble: 6 × 9
## year month day hour origin dest tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 LGA IAH N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 JFK MIA N619AA AA American Airlines Inc.
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlue Airways
## 5 2013 1 1 6 LGA ATL N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 EWR ORD N39463 UA United Air Lines Inc.
answer06 <- flights %>% slice(1:100) %>%
select(year:day, hour, origin, dest, tailnum, carrier) %>%
left_join(y = airlines, by = "carrier") %>%
head()
#Q7:
answer06 %>% left_join(weather) %>%
left_join(airports, by = c("dest" = "faa"))
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 6 × 26
## year month day hour origin dest tailnum carrier name.x temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA United… 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA United… 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA Americ… 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlu… 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL Delta … 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA United… 39.0 28.0 64.4
## # … with 14 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## # precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, name.y <chr>,
## # lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>
#Q8:
flights %>% filter(dest %in% c("ALB", "BDL", "BTV")) %>%
group_by(year, month, dest) %>%
summarise(count = n())
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
## # A tibble: 36 × 4
## # Groups: year, month [12]
## year month dest count
## <int> <int> <chr> <int>
## 1 2013 1 ALB 64
## 2 2013 1 BDL 37
## 3 2013 1 BTV 223
## 4 2013 2 ALB 58
## 5 2013 2 BDL 46
## 6 2013 2 BTV 189
## 7 2013 3 ALB 57
## 8 2013 3 BDL 62
## 9 2013 3 BTV 257
## 10 2013 4 ALB 13
## # … with 26 more rows
#Q9:
flights %>% select(carrier, month, origin, dep_delay) %>%
group_by(carrier, month, origin) %>%
mutate(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
arrange(desc(avg_dep_delay))
## # A tibble: 336,776 × 5
## # Groups: carrier, month, origin [399]
## carrier month origin dep_delay avg_dep_delay
## <chr> <int> <chr> <dbl> <dbl>
## 1 OO 1 LGA 67 67
## 2 OO 8 LGA -10 64
## 3 OO 8 LGA 154 64
## 4 OO 8 LGA 27 64
## 5 OO 8 LGA 85 64
## 6 OO 6 EWR -9 61
## 7 OO 6 EWR 131 61
## 8 HA 1 JFK -3 54.4
## 9 HA 1 JFK 9 54.4
## 10 HA 1 JFK 14 54.4
## # … with 336,766 more rows