##Q1

pacman::p_load(nycflights13, dplyr)

# View(flights) # the View() function lets you directly view the whole dataset

# glimpse(flights) # this glimpse() function provides a quick overview of the dataset

# Column-by-column summary of the flights table (quartiles, means, NA counts).
flights %>% summary()
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                     
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00.00  
##  Median :29.00   Median :2013-07-03 10:00:00.00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54.64  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00.00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00.00  
## 

##Q3

# Mean departure delay for every calendar day; missing dep_delay values are
# ignored inside mean() via na.rm = TRUE.
group_by(flights, year, month, day) %>%
  summarize(mean = mean(dep_delay, na.rm = TRUE))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows
# Keep only flights that have a recorded departure delay (drops rows where
# dep_delay is NA). `not_cancelled` is reused by later questions.
not_cancelled <- filter(flights, !is.na(dep_delay))

# Same daily mean as above; no na.rm needed because the NAs were filtered out.
group_by(not_cancelled, year, month, day) %>%
  summarize(mean = mean(dep_delay))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 4
## # Groups:   year, month [12]
##     year month   day  mean
##    <int> <int> <int> <dbl>
##  1  2013     1     1 11.5 
##  2  2013     1     2 13.9 
##  3  2013     1     3 11.0 
##  4  2013     1     4  8.95
##  5  2013     1     5  5.73
##  6  2013     1     6  7.15
##  7  2013     1     7  5.42
##  8  2013     1     8  2.55
##  9  2013     1     9  2.28
## 10  2013     1    10  2.84
## # ℹ 355 more rows

##Q4

# Average arrival delay per aircraft (tail number), sorted ascending so the
# planes that arrive earliest on average come first.
#
# arr_delay contains missing values, so na.rm = TRUE is required: without it a
# single NA makes that plane's whole average NA, and the plane silently drops
# out of the ranking. A tail number whose arr_delay values are all missing
# yields NaN (mean of an empty vector).
#
# NOTE: any printed output recorded below this chunk predates the na.rm fix;
# re-run the chunk to refresh it.
average_delays <- flights %>%
  group_by(tailnum) %>%
  summarize(avg_delay = mean(arr_delay, na.rm = TRUE)) %>%
  arrange(avg_delay)

average_delays
## # A tibble: 4,044 × 2
##    tailnum avg_delay
##    <chr>       <dbl>
##  1 N560AS      -53  
##  2 N315AS      -51  
##  3 N517AS      -40.5
##  4 N7AYAA      -35  
##  5 N585AS      -34.5
##  6 N915DN      -34  
##  7 N512AS      -31.5
##  8 N594AS      -31.3
##  9 N564AS      -31  
## 10 N593AS      -30.5
## # ℹ 4,034 more rows

##Q5

# Smallest and largest recorded dep_time for each calendar day, computed on
# the flights that have a departure delay recorded (`not_cancelled`).
group_by(not_cancelled, year, month, day) %>%
  summarize(first = min(dep_time), last = max(dep_time))
## `summarise()` has grouped output by 'year', 'month'. You can override using the
## `.groups` argument.
## # A tibble: 365 × 5
## # Groups:   year, month [12]
##     year month   day first  last
##    <int> <int> <int> <int> <int>
##  1  2013     1     1   517  2356
##  2  2013     1     2    42  2354
##  3  2013     1     3    32  2349
##  4  2013     1     4    25  2358
##  5  2013     1     5    14  2357
##  6  2013     1     6    16  2355
##  7  2013     1     7    49  2359
##  8  2013     1     8   454  2351
##  9  2013     1     9     2  2252
## 10  2013     1    10     3  2320
## # ℹ 355 more rows

##Q6

# Per month: number of flights, number whose departure delay exceeded 60
# minutes (NA delays are not counted as delayed), and the share of the latter
# among all of that month's flights. Months are listed from the highest share
# to the lowest.
delays_over_hour <- flights %>%
  group_by(month) %>%
  summarize(
    total_flights = n(),
    delayed_flights = sum(dep_delay > 60, na.rm = TRUE)
  ) %>%
  mutate(proportion_delayed = delayed_flights / total_flights) %>%
  arrange(desc(proportion_delayed))

# Show the monthly table
print(delays_over_hour)
## # A tibble: 12 × 4
##    month total_flights delayed_flights proportion_delayed
##    <int>         <int>           <int>              <dbl>
##  1     7         29425            3820             0.130 
##  2     6         28243            3494             0.124 
##  3    12         28135            2553             0.0907
##  4     4         28330            2535             0.0895
##  5     3         28834            2340             0.0812
##  6     5         28796            2309             0.0802
##  7     8         29327            2295             0.0783
##  8     1         27004            1821             0.0674
##  9     2         24951            1654             0.0663
## 10     9         27574            1330             0.0482
## 11    10         28889            1344             0.0465
## 12    11         27268            1086             0.0398

##Q7: which destinations have the most carriers?

# Count how many different carriers fly to each destination, then list the
# destinations from most carriers to fewest.
group_by(flights, dest) %>%
  summarize(disc_count = length(unique(carrier))) %>%
  arrange(desc(disc_count))
## # A tibble: 105 × 2
##    dest  disc_count
##    <chr>      <int>
##  1 ATL            7
##  2 BOS            7
##  3 CLT            7
##  4 ORD            7
##  5 TPA            7
##  6 AUS            6
##  7 DCA            6
##  8 DTW            6
##  9 IAD            6
## 10 MSP            6
## # ℹ 95 more rows