knitr::opts_chunk$set(warning = FALSE)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
data(flights)
data(airlines)
data(airports)
data(weather)
#1.
flights %>% filter(month ==1, day==1)
## # A tibble: 842 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ... with 832 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#2.
flights %>% 
  select(month,day,flight, arr_delay) %>% 
  filter(arr_delay > 120)
## # A tibble: 10,034 x 4
##    month   day flight arr_delay
##    <int> <int>  <int>     <dbl>
##  1     1     1   4576       137
##  2     1     1   3944       851
##  3     1     1    856       123
##  4     1     1   1086       145
##  5     1     1   4497       127
##  6     1     1    525       125
##  7     1     1   4181       136
##  8     1     1   5712       123
##  9     1     1   4092       123
## 10     1     1   4622       138
## # ... with 10,024 more rows
#3.
arrange(flights,desc(arr_delay))
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     9      641            900      1301     1242           1530
##  2  2013     6    15     1432           1935      1137     1607           2120
##  3  2013     1    10     1121           1635      1126     1239           1810
##  4  2013     9    20     1139           1845      1014     1457           2210
##  5  2013     7    22      845           1600      1005     1044           1815
##  6  2013     4    10     1100           1900       960     1342           2211
##  7  2013     3    17     2321            810       911      135           1020
##  8  2013     7    22     2257            759       898      121           1026
##  9  2013    12     5      756           1700       896     1058           2020
## 10  2013     5     3     1133           2055       878     1250           2215
## # ... with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
#The worst arrival delay time
flights[which.max(flights$arr_delay),]
## # A tibble: 1 x 19
##    year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##   <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
## 1  2013     1     9      641            900      1301     1242           1530
## # ... with 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>
#4.
flights %>%
  select(carrier,arr_delay) %>% 
  group_by(carrier) %>%
  summarise(delay_time = mean(arr_delay, na.rm = TRUE)) %>% 
  arrange(desc(delay_time))
## # A tibble: 16 x 2
##    carrier delay_time
##    <chr>        <dbl>
##  1 F9          21.9  
##  2 FL          20.1  
##  3 EV          15.8  
##  4 YV          15.6  
##  5 OO          11.9  
##  6 MQ          10.8  
##  7 WN           9.65 
##  8 B6           9.46 
##  9 9E           7.38 
## 10 UA           3.56 
## 11 US           2.13 
## 12 VX           1.76 
## 13 DL           1.64 
## 14 AA           0.364
## 15 HA          -6.92 
## 16 AS          -9.93
#5
flights %>%
  group_by(hour) %>% 
  summarise(average_delay_time=mean(arr_delay, na.rm = TRUE)) %>% 
  arrange(desc(average_delay_time))
## # A tibble: 20 x 2
##     hour average_delay_time
##    <dbl>              <dbl>
##  1    21             18.4  
##  2    20             16.7  
##  3    19             16.7  
##  4    17             16.0  
##  5    22             16.0  
##  6    18             14.8  
##  7    16             12.6  
##  8    15             12.3  
##  9    23             11.8  
## 10    14              9.20 
## 11    13              6.54 
## 12    12              3.49 
## 13    11              1.48 
## 14    10              0.954
## 15     8             -1.11 
## 16     9             -1.45 
## 17     6             -3.38 
## 18     5             -4.80 
## 19     7             -5.30 
## 20     1            NaN
#6.
flights %>% slice(1:100) %>% 
   select(year:day, hour, origin, dest, tailnum, carrier) %>%
   left_join(y=airlines,by="carrier") %>% head()
## # A tibble: 6 x 9
##    year month   day  hour origin dest  tailnum carrier name                  
##   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>                 
## 1  2013     1     1     5 EWR    IAH   N14228  UA      United Air Lines Inc. 
## 2  2013     1     1     5 LGA    IAH   N24211  UA      United Air Lines Inc. 
## 3  2013     1     1     5 JFK    MIA   N619AA  AA      American Airlines Inc.
## 4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlue Airways       
## 5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta Air Lines Inc.  
## 6  2013     1     1     5 EWR    ORD   N39463  UA      United Air Lines Inc.
 answer06 <- flights %>% slice(1:100) %>% 
   select(year:day, hour, origin, dest, tailnum, carrier) %>%
   left_join(y=airlines,by="carrier") %>% head()
 
 #7. 
 answer06 %>% left_join(weather) %>% left_join(airports, by = c("dest" = "faa")) 
## Joining, by = c("year", "month", "day", "hour", "origin")
## # A tibble: 6 x 26
##    year month   day  hour origin dest  tailnum carrier name.x   temp  dewp humid
##   <int> <int> <int> <dbl> <chr>  <chr> <chr>   <chr>   <chr>   <dbl> <dbl> <dbl>
## 1  2013     1     1     5 EWR    IAH   N14228  UA      United~  39.0  28.0  64.4
## 2  2013     1     1     5 LGA    IAH   N24211  UA      United~  39.9  25.0  54.8
## 3  2013     1     1     5 JFK    MIA   N619AA  AA      Americ~  39.0  27.0  61.6
## 4  2013     1     1     5 JFK    BQN   N804JB  B6      JetBlu~  39.0  27.0  61.6
## 5  2013     1     1     6 LGA    ATL   N668DN  DL      Delta ~  39.9  25.0  54.8
## 6  2013     1     1     5 EWR    ORD   N39463  UA      United~  39.0  28.0  64.4
## # ... with 14 more variables: wind_dir <dbl>, wind_speed <dbl>,
## #   wind_gust <dbl>, precip <dbl>, pressure <dbl>, visib <dbl>,
## #   time_hour <dttm>, name.y <chr>, lat <dbl>, lon <dbl>, alt <dbl>, tz <dbl>,
## #   dst <chr>, tzone <chr>
 #8.
flights %>% filter(dest%in% c("ALB", "BDL", "BTV")) %>% 
  group_by(dest, month) %>% 
  summarise(quantity_flights=n())
## `summarise()` has grouped output by 'dest'. You can override using the `.groups` argument.
## # A tibble: 36 x 3
## # Groups:   dest [3]
##    dest  month quantity_flights
##    <chr> <int>            <int>
##  1 ALB       1               64
##  2 ALB       2               58
##  3 ALB       3               57
##  4 ALB       4               13
##  5 ALB       5               59
##  6 ALB       6               34
##  7 ALB       7               15
##  8 ALB       8               20
##  9 ALB       9               20
## 10 ALB      10                1
## # ... with 26 more rows
#9.
flights %>% select(carrier, month, origin, dep_delay) %>%
  group_by(carrier, month, origin) %>%
  summarise(average_delay=mean(dep_delay, na.rm =TRUE))
## `summarise()` has grouped output by 'carrier', 'month'. You can override using the `.groups` argument.
## # A tibble: 399 x 4
## # Groups:   carrier, month [185]
##    carrier month origin average_delay
##    <chr>   <int> <chr>          <dbl>
##  1 9E          1 EWR            12.9 
##  2 9E          1 JFK            17.1 
##  3 9E          1 LGA            17.4 
##  4 9E          2 EWR            -1.18
##  5 9E          2 JFK            18.0 
##  6 9E          2 LGA             6.08
##  7 9E          3 EWR             5.6 
##  8 9E          3 JFK            14.4 
##  9 9E          3 LGA             6.95
## 10 9E          4 EWR             5.87
## # ... with 389 more rows