rm(list=ls())
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.0     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
data(flights)
data(airlines)
data(airports)
data(weather)

# Q1: Find all flights on Jan 1. select, arrange, filter, mutate, transmutate, slice which one should we use?
flights %>% filter(month == 1, day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Q2: Try to find arrival delay (arr_delay) more than 120 minutes and show arr_delay in the fourth column.
 flights %>% filter(arr_delay >= 120) %>% 
    select(year:day, arr_delay, everything())
## # A tibble: 10,200 × 19
##     year month   day arr_delay dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>     <dbl>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1       137      811            630       101     1047
##  2  2013     1     1       851      848           1835       853     1001
##  3  2013     1     1       123      957            733       144     1056
##  4  2013     1     1       145     1114            900       134     1447
##  5  2013     1     1       127     1505           1310       115     1638
##  6  2013     1     1       125     1525           1340       105     1831
##  7  2013     1     1       136     1549           1445        64     1912
##  8  2013     1     1       123     1558           1359       119     1718
##  9  2013     1     1       123     1732           1630        62     2028
## 10  2013     1     1       138     1803           1620       103     2008
## # … with 10,190 more rows, and 11 more variables: sched_arr_time <int>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Q3: We need to find the longest arr_delay and its carrier, flight information. We need to think which function can rank data in descending order. Use arrange!
 flights %>% arrange(desc(arr_delay)) %>% 
    select(carrier, flight, arr_delay, everything()) %>% 
    slice(1)
## # A tibble: 1 × 19
##   carrier flight arr_delay  year month   day dep_time sched_dep_time dep_delay
##   <chr>    <int>     <dbl> <int> <int> <int>    <int>          <int>     <dbl>
## 1 HA          51      1272  2013     1     9      641            900      1301
## # … with 10 more variables: arr_time <int>, sched_arr_time <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
# Q4: Which carrier has the worst average arr_delay? its value? Use group_by, summarise/mutate and arrange
flights %>% group_by(carrier) %>% 
     # mutate(arr_delay_avg = mean(arr.delay,na.rm = TRUE )
     summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>% 
     arrange(desc(arr_delay))
## # A tibble: 16 × 2
##    carrier arr_delay
##    <chr>       <dbl>
##  1 F9         21.9  
##  2 FL         20.1  
##  3 EV         15.8  
##  4 YV         15.6  
##  5 OO         11.9  
##  6 MQ         10.8  
##  7 WN          9.65 
##  8 B6          9.46 
##  9 9E          7.38 
## 10 UA          3.56 
## 11 US          2.13 
## 12 VX          1.76 
## 13 DL          1.64 
## 14 AA          0.364
## 15 HA         -6.92 
## 16 AS         -9.93
# Q5: which hour should we take by avoiding the possible arr_delay (average arr_delay)? group_by(hour)
 flights %>% group_by(hour) %>% summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>% arrange(desc(arr_delay))
## # A tibble: 20 × 2
##     hour arr_delay
##    <dbl>     <dbl>
##  1    21    18.4  
##  2    20    16.7  
##  3    19    16.7  
##  4    17    16.0  
##  5    22    16.0  
##  6    18    14.8  
##  7    16    12.6  
##  8    15    12.3  
##  9    23    11.8  
## 10    14     9.20 
## 11    13     6.54 
## 12    12     3.49 
## 13    11     1.48 
## 14    10     0.954
## 15     8    -1.11 
## 16     9    -1.45 
## 17     6    -3.38 
## 18     5    -4.80 
## 19     7    -5.30 
## 20     1   NaN
# Q6: Based on flights, select the first 100 data of year, month, day, hour, origin, dest, tailnum, carrier.
 flights %>% slice(1:100) %>% 
     select(year:day, hour, origin, dest, tailnum, carrier) %>% 
     left_join(y = airlines, by = "carrier") %>% head()
## # A tibble: 6 × 9
##    year month   day  hour origin dest  tailnum carrier name                  
##   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>                 
## 1  2013     1     1     5 EWR    IAH   N14228  UA      United Air Lines Inc. 
## 2  2013     1     1     5 LGA    IAH   N24211  UA      United Air Lines Inc. 
## 3  2013     1     1     5 JFK    MIA   N619AA  AA      American Airlines Inc.
## 4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlue Airways       
## 5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta Air Lines Inc.  
## 6  2013     1     1     5 EWR    ORD   N39463  UA      United Air Lines Inc.
 answer06 <- flights %>% slice(1:100) %>% 
     select(year:day, hour, origin, dest, tailnum, carrier) %>% 
     left_join(y = airlines, by = "carrier") %>% head()

# Q7: Use left_join() to combine 'answer06' and 'weather', and then combine with 'airports'.
 answer06 %>% left_join(weather) %>% 
     left_join(airports, by = c("dest" = "faa"))
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 6 × 26
##    year month   day  hour origin dest  tailnum carrier name.x   temp  dewp humid
##   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
## 1  2013     1     1     5 EWR    IAH   N14228  UA      United…  39.0  28.0  64.4
## 2  2013     1     1     5 LGA    IAH   N24211  UA      United…  39.9  25.0  54.8
## 3  2013     1     1     5 JFK    MIA   N619AA  AA      Americ…  39.0  27.0  61.6
## 4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlu…  39.0  27.0  61.6
## 5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta …  39.9  25.0  54.8
## 6  2013     1     1     5 EWR    ORD   N39463  UA      United…  39.0  28.0  64.4
## # … with 14 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, name.y <chr>,
## #   lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>
# Q8: Count the number of flights for dest = ALB, BDL and BTV in each month. (use n())
 flights %>% filter(dest %in% c("ALB", "BDL", "BTV")) %>% 
     group_by(year, month, dest) %>% 
     summarise(count = n())
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
## # A tibble: 36 × 4
## # Groups:   year, month [12]
##     year month dest  count
##    <int> <int> <chr> <int>
##  1  2013     1 ALB      64
##  2  2013     1 BDL      37
##  3  2013     1 BTV     223
##  4  2013     2 ALB      58
##  5  2013     2 BDL      46
##  6  2013     2 BTV     189
##  7  2013     3 ALB      57
##  8  2013     3 BDL      62
##  9  2013     3 BTV     257
## 10  2013     4 ALB      13
## # … with 26 more rows
# Q9: Use carrier, month, origin to compute average departure delay time (dep_delay).
 flights %>% select(carrier, month, origin, dep_delay) %>% 
     group_by(carrier, month, origin) %>% 
     mutate(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>% 
        arrange(desc(avg_dep_delay))
## # A tibble: 336,776 × 5
## # Groups:   carrier, month, origin [399]
##    carrier month origin dep_delay avg_dep_delay
##    <chr>   <int> <chr>      <dbl>         <dbl>
##  1 OO          1 LGA           67          67  
##  2 OO          8 LGA          -10          64  
##  3 OO          8 LGA          154          64  
##  4 OO          8 LGA           27          64  
##  5 OO          8 LGA           85          64  
##  6 OO          6 EWR           -9          61  
##  7 OO          6 EWR          131          61  
##  8 HA          1 JFK           -3          54.4
##  9 HA          1 JFK            9          54.4
## 10 HA          1 JFK           14          54.4
## # … with 336,766 more rows