install.packages("pacman")
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
library(pacman)
install.packages("nycflights13")
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
library(nycflights13)
p_load(nycflights13, tidyverse)
Q1. 請利用filter找出1月1號所有航班資料。
filter(flights,month==1,day==1)
## # A tibble: 842 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Q2. 利用filter找出抵達誤點超過二個小時以上所有航班資料,並將arr_delay顯示在第四欄。(提示:arr_delay >= 120, select(year:day, arr_delay, everything()))
filter(flights,arr_delay>=120,month==1,day==1)
## # A tibble: 24 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 811 630 101 1047 830
## 2 2013 1 1 848 1835 853 1001 1950
## 3 2013 1 1 957 733 144 1056 853
## 4 2013 1 1 1114 900 134 1447 1222
## 5 2013 1 1 1505 1310 115 1638 1431
## 6 2013 1 1 1525 1340 105 1831 1626
## 7 2013 1 1 1549 1445 64 1912 1656
## 8 2013 1 1 1558 1359 119 1718 1515
## 9 2013 1 1 1732 1630 62 2028 1825
## 10 2013 1 1 1803 1620 103 2008 1750
## # … with 14 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
select(flights,year:day, arr_delay, everything())
## # A tibble: 336,776 x 19
## year month day arr_delay dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <dbl> <int> <int> <dbl> <int>
## 1 2013 1 1 11 517 515 2 830
## 2 2013 1 1 20 533 529 4 850
## 3 2013 1 1 33 542 540 2 923
## 4 2013 1 1 -18 544 545 -1 1004
## 5 2013 1 1 -25 554 600 -6 812
## 6 2013 1 1 12 554 558 -4 740
## 7 2013 1 1 19 555 600 -5 913
## 8 2013 1 1 -14 557 600 -3 709
## 9 2013 1 1 -8 557 600 -3 838
## 10 2013 1 1 8 558 600 -2 753
## # … with 336,766 more rows, and 11 more variables: sched_arr_time <int>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Q3. 利用arrange, slice將抵達誤點時間, 以遞減的順序進行排列,並找出誤點最久的班機資訊(carrier, flight)。(提示:desc())
arrange(flights,desc(arr_delay))%>% slice(1:1)
## # A tibble: 1 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 9 641 900 1301 1242 1530
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>
Q4. 那家航空公司誤點(arr_delay)最嚴重?平均時間為多少? (提示:group_by, summarise, arrange, 計算平均數時,na.rm 要設定為TRUE)
flights %>% group_by(carrier) %>%
summarise_at(vars(arr_delay), funs(mean), na.rm = TRUE) %>%
arrange(desc(arr_delay)) %>% rename(arr_delay = arr_delay)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas:
##
## # Simple named list:
## list(mean = mean, median = median)
##
## # Auto named with `tibble::lst()`:
## tibble::lst(mean, median)
##
## # Using lambdas
## list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
## # A tibble: 16 x 2
## carrier arr_delay
## <chr> <dbl>
## 1 F9 21.9
## 2 FL 20.1
## 3 EV 15.8
## 4 YV 15.6
## 5 OO 11.9
## 6 MQ 10.8
## 7 WN 9.65
## 8 B6 9.46
## 9 9E 7.38
## 10 UA 3.56
## 11 US 2.13
## 12 VX 1.76
## 13 DL 1.64
## 14 AA 0.364
## 15 HA -6.92
## 16 AS -9.93
Q5. 如果你想要盡可能避開誤點,應該搭乘一天中那個時間的航班?(group_by(hour), summarise, arrange)
flights %>% group_by(hour) %>%
summarise_at(vars(arr_delay), funs(mean), na.rm = TRUE) %>%
arrange(desc(arr_delay)) %>% rename(arr_delay = arr_delay)
## # A tibble: 20 x 2
## hour arr_delay
## <dbl> <dbl>
## 1 21 18.4
## 2 20 16.7
## 3 19 16.7
## 4 17 16.0
## 5 22 16.0
## 6 18 14.8
## 7 16 12.6
## 8 15 12.3
## 9 23 11.8
## 10 14 9.20
## 11 13 6.54
## 12 12 3.49
## 13 11 1.48
## 14 10 0.954
## 15 8 -1.11
## 16 9 -1.45
## 17 6 -3.38
## 18 5 -4.80
## 19 7 -5.30
## 20 1 NaN
Q6. 請從flights中「依序」選出year:day, hour, origin, dest, tailnum, carrier等欄位,再抽出前100筆記資料,最後和airlines做left_join;換言之,就是把airlines
flights %>% select(year:day,hour, origin, dest, tailnum, carrier)
## # A tibble: 336,776 x 8
## year month day hour origin dest tailnum carrier
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA
## 2 2013 1 1 5 LGA IAH N24211 UA
## 3 2013 1 1 5 JFK MIA N619AA AA
## 4 2013 1 1 5 JFK BQN N804JB B6
## 5 2013 1 1 6 LGA ATL N668DN DL
## 6 2013 1 1 5 EWR ORD N39463 UA
## 7 2013 1 1 6 EWR FLL N516JB B6
## 8 2013 1 1 6 LGA IAD N829AS EV
## 9 2013 1 1 6 JFK MCO N593JB B6
## 10 2013 1 1 6 LGA ORD N3ALAA AA
## # … with 336,766 more rows
answer06 <- local({
slice(flights, 1:100) %>%
select(year:day, hour, origin, dest, tailnum, carrier) %>%
left_join(y = airlines, by = "carrier")
})
head(answer06)
## # A tibble: 6 x 9
## year month day hour origin dest tailnum carrier name
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr>
## 1 2013 1 1 5 EWR IAH N14228 UA United Air Lines Inc.
## 2 2013 1 1 5 LGA IAH N24211 UA United Air Lines Inc.
## 3 2013 1 1 5 JFK MIA N619AA AA American Airlines Inc.
## 4 2013 1 1 5 JFK BQN N804JB B6 JetBlue Airways
## 5 2013 1 1 6 LGA ATL N668DN DL Delta Air Lines Inc.
## 6 2013 1 1 5 EWR ORD N39463 UA United Air Lines Inc.
Q7. 利用left_join將answer06與weather結合,再與airports結合。(提示:left_join(…, by = c(“dest” = “faa”)))
left_join(left_join(answer06,weather,by =NULL,copy=FALSE,suffix= c("dest","faa")),airports,by =NULL,copy=FALSE,suffix= c("dest","faa"))
## Joining, by = c("year", "month", "day", "hour", "origin")Joining, by = "name"
## # A tibble: 100 x 26
## year month day hour origin dest tailnum carrier name temp dewp humid
## <int> <int> <int> <dbl> <chr> <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2013 1 1 5 EWR IAH N14228 UA Unit… 39.0 28.0 64.4
## 2 2013 1 1 5 LGA IAH N24211 UA Unit… 39.9 25.0 54.8
## 3 2013 1 1 5 JFK MIA N619AA AA Amer… 39.0 27.0 61.6
## 4 2013 1 1 5 JFK BQN N804JB B6 JetB… 39.0 27.0 61.6
## 5 2013 1 1 6 LGA ATL N668DN DL Delt… 39.9 25.0 54.8
## 6 2013 1 1 5 EWR ORD N39463 UA Unit… 39.0 28.0 64.4
## 7 2013 1 1 6 EWR FLL N516JB B6 JetB… 37.9 28.0 67.2
## 8 2013 1 1 6 LGA IAD N829AS EV Expr… 39.9 25.0 54.8
## 9 2013 1 1 6 JFK MCO N593JB B6 JetB… 37.9 27.0 64.3
## 10 2013 1 1 6 LGA ORD N3ALAA AA Amer… 39.9 25.0 54.8
## # … with 90 more rows, and 14 more variables: wind_dir <dbl>, wind_speed <dbl>,
## # wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>,
## # time_hour <dttm>, faa <chr>, lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>,
## # dst <chr>, tzone <chr>
Q8. 請計算每個月抵達ALB, BDL, BTV三個機場的班機總次數。(n()可以計算次數)
by_dest=group_by(flights,year,month,dest)
summarise(by_dest,count = n())
## # A tibble: 1,113 x 4
## # Groups: year, month [12]
## year month dest count
## <int> <int> <chr> <int>
## 1 2013 1 ALB 64
## 2 2013 1 ATL 1396
## 3 2013 1 AUS 169
## 4 2013 1 AVL 2
## 5 2013 1 BDL 37
## 6 2013 1 BHM 25
## 7 2013 1 BNA 399
## 8 2013 1 BOS 1245
## 9 2013 1 BQN 93
## 10 2013 1 BTV 223
## # … with 1,103 more rows
Q9. 請依始發地、航空公司、及月份(origin, carrier, month) 計算平均起飛延誤(dep_delay)的時間。
flights %>%
group_by(carrier,month,origin) %>%
summarise(dep_delay=mean(dep_delay,na.rm = TRUE)) %>%
arrange(desc(dep_delay))
## # A tibble: 399 x 4
## # Groups: carrier, month [185]
## carrier month origin dep_delay
## <chr> <int> <chr> <dbl>
## 1 OO 1 LGA 67
## 2 OO 8 LGA 64
## 3 OO 6 EWR 61
## 4 HA 1 JFK 54.4
## 5 YV 6 LGA 42.8
## 6 FL 7 LGA 41.2
## 7 VX 7 JFK 39.8
## 8 FL 6 LGA 38.8
## 9 WN 6 EWR 36.2
## 10 F9 5 LGA 35.9
## # … with 389 more rows
Q10. 請將weather中temp以直方圖畫出分配 (hist())。
hist(main="2013 New York Airport Temperature",
xlab="temperature",weather$temp)