rm(list=ls())
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.1.4     ✓ stringr 1.4.0
## ✓ readr   2.1.1     ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
data(flights)
data(airlines)
data(airports)
data(weather)
force(flights)
## # A tibble: 336,776 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#Q1: we should use filter
flights %>% filter(month == 1, day == 1)
## # A tibble: 842 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # … with 832 more rows, and 11 more variables: arr_delay <dbl>, carrier <chr>,
## #   flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#Q2: 
flights %>% filter(arr_delay>=120) %>%
  select(year:day, arr_delay, everything())
## # A tibble: 10,200 × 19
##     year month   day arr_delay dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>     <dbl>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1       137      811            630       101     1047
##  2  2013     1     1       851      848           1835       853     1001
##  3  2013     1     1       123      957            733       144     1056
##  4  2013     1     1       145     1114            900       134     1447
##  5  2013     1     1       127     1505           1310       115     1638
##  6  2013     1     1       125     1525           1340       105     1831
##  7  2013     1     1       136     1549           1445        64     1912
##  8  2013     1     1       123     1558           1359       119     1718
##  9  2013     1     1       123     1732           1630        62     2028
## 10  2013     1     1       138     1803           1620       103     2008
## # … with 10,190 more rows, and 11 more variables: sched_arr_time <int>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#Q3: 
flights %>% arrange(desc(arr_delay)) %>%
  select(carrier, flight, arr_delay, everything()) %>%
  slice(1)
## # A tibble: 1 × 19
##   carrier flight arr_delay  year month   day dep_time sched_dep_time dep_delay
##   <chr>    <int>     <dbl> <int> <int> <int>    <int>          <int>     <dbl>
## 1 HA          51      1272  2013     1     9      641            900      1301
## # … with 10 more variables: arr_time <int>, sched_arr_time <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#Q4: 
flights %>% group_by(carrier) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(arr_delay))
## # A tibble: 16 × 2
##    carrier arr_delay
##    <chr>       <dbl>
##  1 F9         21.9  
##  2 FL         20.1  
##  3 EV         15.8  
##  4 YV         15.6  
##  5 OO         11.9  
##  6 MQ         10.8  
##  7 WN          9.65 
##  8 B6          9.46 
##  9 9E          7.38 
## 10 UA          3.56 
## 11 US          2.13 
## 12 VX          1.76 
## 13 DL          1.64 
## 14 AA          0.364
## 15 HA         -6.92 
## 16 AS         -9.93
#Q5:
flights %>% group_by(hour) %>%
  summarise(arr_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(desc(arr_delay))
## # A tibble: 20 × 2
##     hour arr_delay
##    <dbl>     <dbl>
##  1    21    18.4  
##  2    20    16.7  
##  3    19    16.7  
##  4    17    16.0  
##  5    22    16.0  
##  6    18    14.8  
##  7    16    12.6  
##  8    15    12.3  
##  9    23    11.8  
## 10    14     9.20 
## 11    13     6.54 
## 12    12     3.49 
## 13    11     1.48 
## 14    10     0.954
## 15     8    -1.11 
## 16     9    -1.45 
## 17     6    -3.38 
## 18     5    -4.80 
## 19     7    -5.30 
## 20     1   NaN
#Q6
flights %>% slice(1:100) %>%
  select(year:day, hour, origin, dest, tailnum, carrier) %>%
  left_join(y = airlines, by = "carrier") %>% 
  head()
## # A tibble: 6 × 9
##    year month   day  hour origin dest  tailnum carrier name                  
##   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>                 
## 1  2013     1     1     5 EWR    IAH   N14228  UA      United Air Lines Inc. 
## 2  2013     1     1     5 LGA    IAH   N24211  UA      United Air Lines Inc. 
## 3  2013     1     1     5 JFK    MIA   N619AA  AA      American Airlines Inc.
## 4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlue Airways       
## 5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta Air Lines Inc.  
## 6  2013     1     1     5 EWR    ORD   N39463  UA      United Air Lines Inc.
answer06 <- flights %>% slice(1:100) %>%
  select(year:day, hour, origin, dest, tailnum, carrier) %>%
  left_join(y = airlines, by = "carrier") %>%
  head()
#Q7:
answer06 %>% left_join(weather) %>%
  left_join(airports, by = c("dest" = "faa"))
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 6 × 26
##    year month   day  hour origin dest  tailnum carrier name.x   temp  dewp humid
##   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
## 1  2013     1     1     5 EWR    IAH   N14228  UA      United…  39.0  28.0  64.4
## 2  2013     1     1     5 LGA    IAH   N24211  UA      United…  39.9  25.0  54.8
## 3  2013     1     1     5 JFK    MIA   N619AA  AA      Americ…  39.0  27.0  61.6
## 4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlu…  39.0  27.0  61.6
## 5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta …  39.9  25.0  54.8
## 6  2013     1     1     5 EWR    ORD   N39463  UA      United…  39.0  28.0  64.4
## # … with 14 more variables: wind_dir <dbl>, wind_speed <dbl>, wind_gust <dbl>,
## #   precip <dbl>, pressure <dbl>, visib <dbl>, time_hour <dttm>, name.y <chr>,
## #   lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>, dst <chr>, tzone <chr>
#Q8:
flights %>% filter(dest %in% c("ALB", "BDL", "BTV")) %>%
  group_by(year, month, dest) %>%
  summarise(count = n())
## `summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
## # A tibble: 36 × 4
## # Groups:   year, month [12]
##     year month dest  count
##    <int> <int> <chr> <int>
##  1  2013     1 ALB      64
##  2  2013     1 BDL      37
##  3  2013     1 BTV     223
##  4  2013     2 ALB      58
##  5  2013     2 BDL      46
##  6  2013     2 BTV     189
##  7  2013     3 ALB      57
##  8  2013     3 BDL      62
##  9  2013     3 BTV     257
## 10  2013     4 ALB      13
## # … with 26 more rows
#Q9:
flights %>% select(carrier, month, origin, dep_delay) %>%
  group_by(carrier, month, origin) %>%
  mutate(avg_dep_delay = mean(dep_delay, na.rm = TRUE)) %>%
  arrange(desc(avg_dep_delay))
## # A tibble: 336,776 × 5
## # Groups:   carrier, month, origin [399]
##    carrier month origin dep_delay avg_dep_delay
##    <chr>   <int> <chr>      <dbl>         <dbl>
##  1 OO          1 LGA           67          67  
##  2 OO          8 LGA          -10          64  
##  3 OO          8 LGA          154          64  
##  4 OO          8 LGA           27          64  
##  5 OO          8 LGA           85          64  
##  6 OO          6 EWR           -9          61  
##  7 OO          6 EWR          131          61  
##  8 HA          1 JFK           -3          54.4
##  9 HA          1 JFK            9          54.4
## 10 HA          1 JFK           14          54.4
## # … with 336,766 more rows