pacman::p_load(nycflights13, dplyr, ggplot2)

Question 1:

How many records and fields are there?
View(flights)
summary(flights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00.00  
##  Median :29.00   Median :2013-07-03 10:00:00.00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54.64  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00  
## 
str(flights)
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
maxdep <- max(flights$dep_delay, na.rm=TRUE)

maxdep_id <- which(flights$dep_delay==maxdep)

flights[maxdep_id, 10:12]

Question 2

Which flight has largest departure delay?
maxdep <- max(flights$dep_delay, na.rm=TRUE)

maxdep_id <- which(flights$dep_delay==maxdep)

flights[maxdep_id, 10:12]
sortf <- arrange(flights,desc(dep_delay)) 

select(sortf, carrier, flight, tailnum, everything())

Question 3

Provide correct average departure delay values
flights %>% 

 group_by(year, month, day) %>% 

 summarise(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
not_cancelled <- flights %>% 
 filter(!is.na(dep_delay))

not_cancelled %>% 
 group_by(year, month, day) %>% 
 summarise(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.

Question 4

How to get tailnum for lowest average arrival delay?
delays <- flights %>%
  filter(!is.na(arr_delay)) %>%
  group_by(tailnum) %>%
  summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE))

lowest_tailnum <- delays %>%
  filter(avg_arr_delay == min(avg_arr_delay))

print(lowest_tailnum)
## # A tibble: 1 × 2
##   tailnum avg_arr_delay
##   <chr>           <dbl>
## 1 N560AS            -53

Question 5

Get first and last departure times
not_cancelled %>% 

 group_by(year, month, day) %>% 

 summarise(

  first = min(dep_time),

  last = max(dep_time)

 )
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.

Question 6

Proportion of flights w/departure delays >one hour
delay_proportion <- flights %>%
  filter(!is.na(dep_delay)) %>%  # Remove canceled flights
  group_by(month) %>%
  summarise(
    total_flights = n(),  
    delayed_flights = sum(dep_delay > 60),  
    proportion_delayed = delayed_flights / total_flights
  ) %>%
  arrange(desc(proportion_delayed))  # Sort by highest proportion

print(delay_proportion)
## # A tibble: 12 × 4
##    month total_flights delayed_flights proportion_delayed
##    <int>         <int>           <int>              <dbl>
##  1     7         28485            3820             0.134 
##  2     6         27234            3494             0.128 
##  3    12         27110            2553             0.0942
##  4     4         27662            2535             0.0916
##  5     3         27973            2340             0.0837
##  6     5         28233            2309             0.0818
##  7     8         28841            2295             0.0796
##  8     2         23690            1654             0.0698
##  9     1         26483            1821             0.0688
## 10     9         27122            1330             0.0490
## 11    10         28653            1344             0.0469
## 12    11         27035            1086             0.0402

Question 7

What destinations have the most carriers?
dest_carrier_count <- flights %>%
  group_by(dest) %>%
  summarise(num_carriers = n_distinct(carrier)) %>%
  arrange(desc(num_carriers))  # Sort in descending order

print(dest_carrier_count)
## # A tibble: 105 × 2
##    dest  num_carriers
##    <chr>        <int>
##  1 ATL              7
##  2 BOS              7
##  3 CLT              7
##  4 ORD              7
##  5 TPA              7
##  6 AUS              6
##  7 DCA              6
##  8 DTW              6
##  9 IAD              6
## 10 MSP              6
## # ℹ 95 more rows

Question 9

Different way of coding question 8
delays <- flights %>% 
 group_by(dest) %>% 
 summarise(
  count = n(),
  dist = mean(distance, na.rm = TRUE),
  delay = mean(arr_delay, na.rm = TRUE)
 ) %>% 
 filter(count > 20, dest != "HNL")
delays