install.packages("pacman")
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
library(pacman)
install.packages("nycflights13")
## Installing package into '/usr/local/lib/R/site-library'
## (as 'lib' is unspecified)
library(nycflights13)
p_load(nycflights13, tidyverse)

Q1. 請利用filter找出1月1號所有航班資料。

filter(flights,month==1,day==1)
## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Q2. 利用filter找出抵達誤點超過二個小時以上所有航班資料,並將arr_delay顯示在第四欄。(提示:arr_delay >= 120, select(year:day, arr_delay, everything()))

filter(flights,arr_delay>=120,month==1,day==1)
## # A tibble: 24 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      811            630       101     1047            830
##  2  2013     1     1      848           1835       853     1001           1950
##  3  2013     1     1      957            733       144     1056            853
##  4  2013     1     1     1114            900       134     1447           1222
##  5  2013     1     1     1505           1310       115     1638           1431
##  6  2013     1     1     1525           1340       105     1831           1626
##  7  2013     1     1     1549           1445        64     1912           1656
##  8  2013     1     1     1558           1359       119     1718           1515
##  9  2013     1     1     1732           1630        62     2028           1825
## 10  2013     1     1     1803           1620       103     2008           1750
## # … with 14 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
select(flights,year:day, arr_delay, everything())
## # A tibble: 336,776 x 19
##     year month   day arr_delay dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>     <dbl>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1        11      517            515         2      830
##  2  2013     1     1        20      533            529         4      850
##  3  2013     1     1        33      542            540         2      923
##  4  2013     1     1       -18      544            545        -1     1004
##  5  2013     1     1       -25      554            600        -6      812
##  6  2013     1     1        12      554            558        -4      740
##  7  2013     1     1        19      555            600        -5      913
##  8  2013     1     1       -14      557            600        -3      709
##  9  2013     1     1        -8      557            600        -3      838
## 10  2013     1     1         8      558            600        -2      753
## # … with 336,766 more rows, and 11 more variables: sched_arr_time <int>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Q3. 利用arrange, slice將抵達誤點時間, 以遞減的順序進行排列,並找出誤點最久的班機資訊(carrier, flight)。(提示:desc())

arrange(flights,desc(arr_delay))%>% slice(1:1)
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     9      641            900      1301     1242           1530
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Q4. 那家航空公司誤點(arr_delay)最嚴重?平均時間為多少? (提示:group_by, summarise, arrange, 計算平均數時,na.rm 要設定為TRUE)

flights %>% group_by(carrier) %>% 
            summarise_at(vars(arr_delay), funs(mean), na.rm = TRUE) %>% 
            arrange(desc(arr_delay)) %>% rename(arr_delay = arr_delay)
## Warning: funs() is soft deprecated as of dplyr 0.8.0
## Please use a list of either functions or lambdas: 
## 
##   # Simple named list: 
##   list(mean = mean, median = median)
## 
##   # Auto named with `tibble::lst()`: 
##   tibble::lst(mean, median)
## 
##   # Using lambdas
##   list(~ mean(., trim = .2), ~ median(., na.rm = TRUE))
## This warning is displayed once per session.
## # A tibble: 16 x 2
##    carrier arr_delay
##    <chr>       <dbl>
##  1 F9         21.9  
##  2 FL         20.1  
##  3 EV         15.8  
##  4 YV         15.6  
##  5 OO         11.9  
##  6 MQ         10.8  
##  7 WN          9.65 
##  8 B6          9.46 
##  9 9E          7.38 
## 10 UA          3.56 
## 11 US          2.13 
## 12 VX          1.76 
## 13 DL          1.64 
## 14 AA          0.364
## 15 HA         -6.92 
## 16 AS         -9.93

Q5. 如果你想要盡可能避開誤點,應該搭乘一天中那個時間的航班?(group_by(hour), summarise, arrange)

flights %>% group_by(hour) %>% 
            summarise_at(vars(arr_delay), funs(mean), na.rm = TRUE) %>% 
            arrange(desc(arr_delay)) %>% rename(arr_delay = arr_delay)
## # A tibble: 20 x 2
##     hour arr_delay
##    <dbl>     <dbl>
##  1    21    18.4  
##  2    20    16.7  
##  3    19    16.7  
##  4    17    16.0  
##  5    22    16.0  
##  6    18    14.8  
##  7    16    12.6  
##  8    15    12.3  
##  9    23    11.8  
## 10    14     9.20 
## 11    13     6.54 
## 12    12     3.49 
## 13    11     1.48 
## 14    10     0.954
## 15     8    -1.11 
## 16     9    -1.45 
## 17     6    -3.38 
## 18     5    -4.80 
## 19     7    -5.30 
## 20     1   NaN

Q6. 請從flights中「依序」選出year:day, hour, origin, dest, tailnum, carrier等欄位,再抽出前100筆記資料,最後和airlines做left_join;換言之,就是把airlines

flights %>% select(year:day,hour, origin, dest, tailnum, carrier)
## # A tibble: 336,776 x 8
##     year month   day  hour origin dest  tailnum carrier
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>  
##  1  2013     1     1     5 EWR    IAH   N14228  UA     
##  2  2013     1     1     5 LGA    IAH   N24211  UA     
##  3  2013     1     1     5 JFK    MIA   N619AA  AA     
##  4  2013     1     1     5 JFK    BQN   N804JB  B6     
##  5  2013     1     1     6 LGA    ATL   N668DN  DL     
##  6  2013     1     1     5 EWR    ORD   N39463  UA     
##  7  2013     1     1     6 EWR    FLL   N516JB  B6     
##  8  2013     1     1     6 LGA    IAD   N829AS  EV     
##  9  2013     1     1     6 JFK    MCO   N593JB  B6     
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA     
## # … with 336,766 more rows
answer06 <- local({
  slice(flights, 1:100) %>%
    select(year:day, hour, origin, dest, tailnum, carrier) %>%
    left_join(y = airlines, by = "carrier")
})
head(answer06)
## # A tibble: 6 x 9
##    year month   day  hour origin dest  tailnum carrier name                  
##   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>                 
## 1  2013     1     1     5 EWR    IAH   N14228  UA      United Air Lines Inc. 
## 2  2013     1     1     5 LGA    IAH   N24211  UA      United Air Lines Inc. 
## 3  2013     1     1     5 JFK    MIA   N619AA  AA      American Airlines Inc.
## 4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlue Airways       
## 5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta Air Lines Inc.  
## 6  2013     1     1     5 EWR    ORD   N39463  UA      United Air Lines Inc.

Q7. 利用left_join將answer06與weather結合,再與airports結合。(提示:left_join(…, by = c(“dest” = “faa”)))

 left_join(left_join(answer06,weather,by =NULL,copy=FALSE,suffix= c("dest","faa")),airports,by =NULL,copy=FALSE,suffix= c("dest","faa"))
## Joining, by = c("year", "month", "day", "hour", "origin")Joining, by = "name"
## # A tibble: 100 x 26
##     year month   day  hour origin dest  tailnum carrier name   temp  dewp humid
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr> <dbl> <dbl> <dbl>
##  1  2013     1     1     5 EWR    IAH   N14228  UA      Unit…  39.0  28.0  64.4
##  2  2013     1     1     5 LGA    IAH   N24211  UA      Unit…  39.9  25.0  54.8
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      Amer…  39.0  27.0  61.6
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      JetB…  39.0  27.0  61.6
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      Delt…  39.9  25.0  54.8
##  6  2013     1     1     5 EWR    ORD   N39463  UA      Unit…  39.0  28.0  64.4
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      JetB…  37.9  28.0  67.2
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      Expr…  39.9  25.0  54.8
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      JetB…  37.9  27.0  64.3
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      Amer…  39.9  25.0  54.8
## # … with 90 more rows, and 14 more variables: wind_dir <dbl>, wind_speed <dbl>,
## #   wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>, faa <chr>, lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>,
## #   dst <chr>, tzone <chr>

Q8. 請計算每個月抵達ALB, BDL, BTV三個機場的班機總次數。(n()可以計算次數)

by_dest=group_by(flights,year,month,dest)
summarise(by_dest,count = n())
## # A tibble: 1,113 x 4
## # Groups:   year, month [12]
##     year month dest  count
##    <int> <int> <chr> <int>
##  1  2013     1 ALB      64
##  2  2013     1 ATL    1396
##  3  2013     1 AUS     169
##  4  2013     1 AVL       2
##  5  2013     1 BDL      37
##  6  2013     1 BHM      25
##  7  2013     1 BNA     399
##  8  2013     1 BOS    1245
##  9  2013     1 BQN      93
## 10  2013     1 BTV     223
## # … with 1,103 more rows

Q9. 請依始發地、航空公司、及月份(origin, carrier, month) 計算平均起飛延誤(dep_delay)的時間。

flights %>%
  group_by(carrier,month,origin) %>%
  summarise(dep_delay=mean(dep_delay,na.rm = TRUE)) %>%
  arrange(desc(dep_delay))
## # A tibble: 399 x 4
## # Groups:   carrier, month [185]
##    carrier month origin dep_delay
##    <chr>   <int> <chr>      <dbl>
##  1 OO          1 LGA         67  
##  2 OO          8 LGA         64  
##  3 OO          6 EWR         61  
##  4 HA          1 JFK         54.4
##  5 YV          6 LGA         42.8
##  6 FL          7 LGA         41.2
##  7 VX          7 JFK         39.8
##  8 FL          6 LGA         38.8
##  9 WN          6 EWR         36.2
## 10 F9          5 LGA         35.9
## # … with 389 more rows

Q10. 請將weather中temp以直方圖畫出分配 (hist())。

hist(main="2013 New York Airport Temperature",
     xlab="temperature",weather$temp)