install.packages(“pacman”) library(pacman) p_load(nycflights13, tidyverse)

library(pacman)
p_load(nycflights13, tidyverse)
data(flights)
data(airlines)
data(weather)
data(airports)
head(flights)
## # A tibble: 6 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     1      517            515         2      830            819
## 2  2013     1     1      533            529         4      850            830
## 3  2013     1     1      542            540         2      923            850
## 4  2013     1     1      544            545        -1     1004           1022
## 5  2013     1     1      554            600        -6      812            837
## 6  2013     1     1      554            558        -4      740            728
## # … with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
head(airlines)
## # A tibble: 6 x 2
##   carrier name                    
##   <chr>   <chr>                   
## 1 9E      Endeavor Air Inc.       
## 2 AA      American Airlines Inc.  
## 3 AS      Alaska Airlines Inc.    
## 4 B6      JetBlue Airways         
## 5 DL      Delta Air Lines Inc.    
## 6 EV      ExpressJet Airlines Inc.
head(weather)
## # A tibble: 6 x 15
##   origin  year month   day  hour  temp  dewp humid wind_dir wind_speed wind_gust
##   <chr>  <int> <int> <int> <int> <dbl> <dbl> <dbl>    <dbl>      <dbl>     <dbl>
## 1 EWR     2013     1     1     1  39.0  26.1  59.4      270      10.4         NA
## 2 EWR     2013     1     1     2  39.0  27.0  61.6      250       8.06        NA
## 3 EWR     2013     1     1     3  39.0  28.0  64.4      240      11.5         NA
## 4 EWR     2013     1     1     4  39.9  28.0  62.2      250      12.7         NA
## 5 EWR     2013     1     1     5  39.0  28.0  64.4      260      12.7         NA
## 6 EWR     2013     1     1     6  37.9  28.0  67.2      240      11.5         NA
## # … with 4 more variables: precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>

Q1.找出1月1號所有航班資料

head(flights)[flights$month==1&flights$day==1,]
## Warning: Length of logical index must be 1 or 6, not 336776
## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7    NA    NA    NA       NA             NA        NA       NA             NA
##  8    NA    NA    NA       NA             NA        NA       NA             NA
##  9    NA    NA    NA       NA             NA        NA       NA             NA
## 10    NA    NA    NA       NA             NA        NA       NA             NA
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Q2. 找出抵達誤點超過二個小時以上所有航班資料,並將arr_delay顯示在第四欄

flights %>% filter(arr_delay >= 120) %>%
select(year:day, arr_delay, dep_time, sched_dep_time, dep_delay, arr_time)
## # A tibble: 10,200 x 8
##     year month   day arr_delay dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>     <dbl>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1       137      811            630       101     1047
##  2  2013     1     1       851      848           1835       853     1001
##  3  2013     1     1       123      957            733       144     1056
##  4  2013     1     1       145     1114            900       134     1447
##  5  2013     1     1       127     1505           1310       115     1638
##  6  2013     1     1       125     1525           1340       105     1831
##  7  2013     1     1       136     1549           1445        64     1912
##  8  2013     1     1       123     1558           1359       119     1718
##  9  2013     1     1       123     1732           1630        62     2028
## 10  2013     1     1       138     1803           1620       103     2008
## # … with 10,190 more rows

Q3. 將抵達誤點時間, 以遞減的順序進行排列,並找出誤點最久的班機資訊

flights %>%
 arrange(desc(arr_delay)) %>% 
 select(year:day, arr_delay, everything()) %>%
 slice(1)
## # A tibble: 1 x 19
##    year month   day arr_delay dep_time sched_dep_time dep_delay arr_time
##   <int> <int> <int>     <dbl>    <int>          <int>     <dbl>    <int>
## 1  2013     1     9      1272      641            900      1301     1242
## # … with 11 more variables: sched_arr_time <int>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Q4. 那家航空公司誤點(arr_delay)最嚴重?平均時間為多少?

flights %>% 
  group_by(carrier) %>%
  summarize(avg_arr_delay = mean(arr_delay,na.rm = T)) %>%
  arrange(desc(avg_arr_delay)) 
## # A tibble: 16 x 2
##    carrier avg_arr_delay
##    <chr>           <dbl>
##  1 F9             21.9  
##  2 FL             20.1  
##  3 EV             15.8  
##  4 YV             15.6  
##  5 OO             11.9  
##  6 MQ             10.8  
##  7 WN              9.65 
##  8 B6              9.46 
##  9 9E              7.38 
## 10 UA              3.56 
## 11 US              2.13 
## 12 VX              1.76 
## 13 DL              1.64 
## 14 AA              0.364
## 15 HA             -6.92 
## 16 AS             -9.93

Q5. 要盡可能避開誤點,應該搭乘一天中那個時間的航班?

flights %>% 
  group_by(hour) %>%
  summarize(arr_delay = mean(arr_delay,na.rm = T)) %>%
  arrange(desc(arr_delay)) 
## # A tibble: 20 x 2
##     hour arr_delay
##    <dbl>     <dbl>
##  1    21    18.4  
##  2    20    16.7  
##  3    19    16.7  
##  4    17    16.0  
##  5    22    16.0  
##  6    18    14.8  
##  7    16    12.6  
##  8    15    12.3  
##  9    23    11.8  
## 10    14     9.20 
## 11    13     6.54 
## 12    12     3.49 
## 13    11     1.48 
## 14    10     0.954
## 15     8    -1.11 
## 16     9    -1.45 
## 17     6    -3.38 
## 18     5    -4.80 
## 19     7    -5.30 
## 20     1   NaN

Q6. flights中「依序」選出year:day, hour, origin, dest, tailnum, carrier等欄位,再抽出前100筆記資料,最後和airlines做left_join

  slice(flights, 1:100) %>%
  select(year:day, hour, origin, dest, tailnum, carrier) %>%
  left_join( y=airlines, by ="carrier")
## # A tibble: 100 x 9
##     year month   day  hour origin dest  tailnum carrier name                    
##    <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>                   
##  1  2013     1     1     5 EWR    IAH   N14228  UA      United Air Lines Inc.   
##  2  2013     1     1     5 LGA    IAH   N24211  UA      United Air Lines Inc.   
##  3  2013     1     1     5 JFK    MIA   N619AA  AA      American Airlines Inc.  
##  4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlue Airways         
##  5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta Air Lines Inc.    
##  6  2013     1     1     5 EWR    ORD   N39463  UA      United Air Lines Inc.   
##  7  2013     1     1     6 EWR    FLL   N516JB  B6      JetBlue Airways         
##  8  2013     1     1     6 LGA    IAD   N829AS  EV      ExpressJet Airlines Inc.
##  9  2013     1     1     6 JFK    MCO   N593JB  B6      JetBlue Airways         
## 10  2013     1     1     6 LGA    ORD   N3ALAA  AA      American Airlines Inc.  
## # … with 90 more rows

Q7. 將answer06與weather結合,再與airports結合

Q8. 計算每個月抵達ALB, BDL, BTV三個機場的班機總次數

Q9. 依始發地、航空公司、及月份(origin, carrier, month) 計算平均起飛延誤(dep_delay)的時間

flights %>% 
  group_by(origin, carrier, month) %>%
  summarize(avg_dep_delay = mean(dep_delay,na.rm = T)) %>%
  arrange(desc(avg_dep_delay)) 
## # A tibble: 399 x 4
## # Groups:   origin, carrier [35]
##    origin carrier month avg_dep_delay
##    <chr>  <chr>   <int>         <dbl>
##  1 LGA    OO          1          67  
##  2 LGA    OO          8          64  
##  3 EWR    OO          6          61  
##  4 JFK    HA          1          54.4
##  5 LGA    YV          6          42.8
##  6 LGA    FL          7          41.2
##  7 JFK    VX          7          39.8
##  8 LGA    FL          6          38.8
##  9 EWR    WN          6          36.2
## 10 LGA    F9          5          35.9
## # … with 389 more rows

Q10. 以直方圖畫出分配

hist(weather$temp)