library(pacman)
p_load(nycflights13, tidyverse)
data(flights)
data(airlines)
data(weather)
data(airports)
data(flights)
Q1. 請利用filter找出1月1號所有航班資料。
filter(flights,month==1,day==1)
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Q2. 利用filter找出抵達誤點超過二個小時以上所有航班資料,並將arr_delay顯示在第四欄。
flights %>% filter(arr_delay >= 120) %>%
select(year:day, arr_delay, everything())
## # A tibble: 10,200 x 19
## year month day arr_delay dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dbl> <int> <int> <dbl> <int>
## 1 2013 1 1 137 811 630 101 1047
## 2 2013 1 1 851 848 1835 853 1001
## 3 2013 1 1 123 957 733 144 1056
## 4 2013 1 1 145 1114 900 134 1447
## 5 2013 1 1 127 1505 1310 115 1638
## 6 2013 1 1 125 1525 1340 105 1831
## 7 2013 1 1 136 1549 1445 64 1912
## 8 2013 1 1 123 1558 1359 119 1718
## 9 2013 1 1 123 1732 1630 62 2028
## 10 2013 1 1 138 1803 1620 103 2008
## # … with 10,190 more rows, and 11 more variables: sched_arr_time <int>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Q3. 利用arrange, slice將抵達誤點時間, 以遞減的順序進行排列,並找出誤點最久的班機資訊(carrier, flight)。
flights %>%
arrange(desc(arr_delay)) %>%
select(year:day, carrier,flight,arr_delay,dep_time,sched_dep_time,dep_delay) %>%
slice(1)
## # A tibble: 1 x 9
## year month day carrier flight arr_delay dep_time sched_dep_time dep_delay
## <int> <int> <int> <chr> <int> <dbl> <int> <int> <dbl>
## 1 2013 1 9 HA 51 1272 641 900 1301
Q4. 那家航空公司誤點(arr_delay)最嚴重?平均時間為多少?
flights %>%
group_by(carrier) %>%
summarise(arr_delay=mean(arr_delay,na.rm = TRUE)) %>%
arrange(desc(arr_delay))
## # A tibble: 16 x 2
## carrier arr_delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
Q5. 如果你想要盡可能避開誤點,應該搭乘一天中那個時間的航班?
flights %>%
group_by(hour) %>%
summarise(arr_delay=mean(arr_delay,na.rm = TRUE)) %>%
arrange(desc(arr_delay))
## # A tibble: 20 x 2
## hour arr_delay
## <dbl> <dbl>
## 1 21 18.4
## 2 20 16.7
## 3 19 16.7
## 4 17 16.0
## 5 22 16.0
## 6 18 14.8
## 7 16 12.6
## 8 15 12.3
## 9 23 11.8
## 10 14 9.20
## 11 13 6.54
## 12 12 3.49
## 13 11 1.48
## 14 10 0.954
## 15 8 -1.11
## 16 9 -1.45
## 17 6 -3.38
## 18 5 -4.80
## 19 7 -5.30
## 20 1 NaN
Q6. 請從flights中「依序」選出year:day, hour, origin, dest, tailnum, carrier等欄位,再抽出前100筆記資料,最後和airlines做left_join;換言之,就是把airlines
select(flights,year:day,hour,origin,dest,tailnum,carrier)
## # A tibble: 336,776 x 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # … with 336,766 more rows
Q7. 利用left_join將answer06與weather結合,再與airports結合。
Q8. 請計算每個月抵達ALB, BDL, BTV三個機場的班機總次數。
flights %>%
group_by(year,month) %>%
mutate()
## # A tibble: 336,776 x 19
## # Groups: year, month [12]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Q9. 請依始發地、航空公司、及月份(origin, carrier, month) 計算平均起飛延誤(dep_delay)的時間。
flights %>%
group_by(carrier,month,origin) %>%
summarise(dep_delay=mean(dep_delay,na.rm = TRUE)) %>%
arrange(desc(dep_delay))
## # A tibble: 399 x 4
## # Groups: carrier, month [185]
## carrier month origin dep_delay
## <chr> <int> <chr> <dbl>
## 1 OO 1 LGA 67
## 2 OO 8 LGA 64
## 3 OO 6 EWR 61
## 4 HA 1 JFK 54.4
## 5 YV 6 LGA 42.8
## 6 FL 7 LGA 41.2
## 7 VX 7 JFK 39.8
## 8 FL 6 LGA 38.8
## 9 WN 6 EWR 36.2
## 10 F9 5 LGA 35.9
## # … with 389 more rows
Q10. 請將weather中temp以直方圖畫出分配 (hist())
hist(weather$temp)