library(dplyr)
library(ggplot2)

Question 1.

pacman::p_load(nycflights13)

View(flights) # this View() function opens lets you directly view the whole dataset

glimpse(flights) # this glimpse() function provides a quick overview of the dataset
Rows: 336,776
Columns: 19
$ year           <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
$ month          <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ day            <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, 558, 558, 558, 559, 559, 559, 600, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, 600, 600, 600, 600, 559, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1, 0, -1, 0, 0, 1, -8, -3, -4, -4, 0…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849, 853, 924, 923, 941, 702, 854, 851,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851, 856, 917, 937, 910, 706, 902, 858,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -14, 31, -4, -8, -7, 12, -6, -8, 16, …
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "AA", "B6", "B6", "UA", "UA", "AA", …
$ flight         <int> 1545, 1714, 1141, 725, 461, 1696, 507, 5708, 79, 301, 49, 71, 194, 1124, 707, 1806, 1187, …
$ tailnum        <chr> "N14228", "N24211", "N619AA", "N804JB", "N668DN", "N39463", "N516JB", "N829AS", "N593JB", …
$ origin         <chr> "EWR", "LGA", "JFK", "JFK", "LGA", "EWR", "EWR", "LGA", "JFK", "LGA", "JFK", "JFK", "JFK",…
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD", "MCO", "ORD", "PBI", "TPA", "LAX",…
$ air_time       <dbl> 227, 227, 160, 183, 116, 150, 158, 53, 140, 138, 149, 158, 345, 361, 257, 44, 337, 152, 13…
$ distance       <dbl> 1400, 1416, 1089, 1576, 762, 719, 1065, 229, 944, 733, 1028, 1005, 2475, 2565, 1389, 187, …
$ hour           <dbl> 5, 5, 5, 5, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, …
$ minute         <dbl> 15, 29, 40, 45, 0, 58, 0, 0, 0, 0, 0, 0, 0, 0, 0, 59, 0, 0, 0, 0, 10, 5, 10, 10, 7, 0, 0, …
$ time_hour      <dttm> 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-01-01 05:00:00, 2013-…
summary(flights)
      year          month             day           dep_time    sched_dep_time   dep_delay          arr_time   
 Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106   Min.   : -43.00   Min.   :   1  
 1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906   1st Qu.:  -5.00   1st Qu.:1104  
 Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359   Median :  -2.00   Median :1535  
 Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344   Mean   :  12.64   Mean   :1502  
 3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729   3rd Qu.:  11.00   3rd Qu.:1940  
 Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359   Max.   :1301.00   Max.   :2400  
                                                 NA's   :8255                  NA's   :8255      NA's   :8713  
 sched_arr_time   arr_delay          carrier              flight       tailnum             origin         
 Min.   :   1   Min.   : -86.000   Length:336776      Min.   :   1   Length:336776      Length:336776     
 1st Qu.:1124   1st Qu.: -17.000   Class :character   1st Qu.: 553   Class :character   Class :character  
 Median :1556   Median :  -5.000   Mode  :character   Median :1496   Mode  :character   Mode  :character  
 Mean   :1536   Mean   :   6.895                      Mean   :1972                                        
 3rd Qu.:1945   3rd Qu.:  14.000                      3rd Qu.:3465                                        
 Max.   :2359   Max.   :1272.000                      Max.   :8500                                        
                NA's   :9430                                                                              
     dest              air_time        distance         hour           minute     
 Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00   Min.   : 0.00  
 Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00   1st Qu.: 8.00  
 Mode  :character   Median :129.0   Median : 872   Median :13.00   Median :29.00  
                    Mean   :150.7   Mean   :1040   Mean   :13.18   Mean   :26.23  
                    3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00   3rd Qu.:44.00  
                    Max.   :695.0   Max.   :4983   Max.   :23.00   Max.   :59.00  
                    NA's   :9430                                                  
   time_hour                     
 Min.   :2013-01-01 05:00:00.00  
 1st Qu.:2013-04-04 13:00:00.00  
 Median :2013-07-03 10:00:00.00  
 Mean   :2013-07-03 05:22:54.64  
 3rd Qu.:2013-10-01 07:00:00.00  
 Max.   :2013-12-31 23:00:00.00  
                                 

Question 2.

summarise(flights, delay=mean(dep_delay,na.rm=TRUE))
maxdep <- max(flights$dep_delay, na.rm=TRUE)

maxdep_id <- which(flights$dep_delay==maxdep)

flights[maxdep_id, 10:12]
sortf <- arrange(flights,desc(dep_delay)) 

select(sortf, carrier, flight, tailnum, everything())
select(flights, starts_with("dep"))

Question 3.

flights %>% 

 group_by(year, month, day) %>% 

 summarise(mean = mean(dep_delay))
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
delays <- not_cancelled %>% 

 group_by(tailnum) %>% 

 summarise(

  delay = mean(arr_delay)

 )
Error: object 'not_cancelled' not found

not_cancelled <- flights %>% 

 filter(!is.na(dep_delay))

not_cancelled %>% 

 group_by(year, month, day) %>% 

 summarise(mean = mean(dep_delay))
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.
not_cancelled <- flights %>% 

 filter(!is.na(dep_delay), !is.na(arr_delay))
head(not_cancelled)
flights %>% 

 group_by(year, month, day) %>% 

 summarise(mean = mean(dep_delay, na.rm = TRUE))
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

Question 4.

# Step 1: Remove flights with missing arrival delays
not_cancelled <- flights %>%
  filter(!is.na(arr_delay))

# Step 2: Compute the average arrival delay per tailnum
avg_delays <- not_cancelled %>%
  group_by(tailnum) %>%
  summarise(avg_arr_delay = mean(arr_delay, na.rm = TRUE))

# Step 3: Get the tailnum with the lowest average arrival delay
lowest_delay_tailnum <- avg_delays %>%
  filter(avg_arr_delay == min(avg_arr_delay, na.rm = TRUE))

# Print result
lowest_delay_tailnum

Question 5.

not_cancelled %>% 

 group_by(year, month, day) %>% 

 summarise(

  first = min(dep_time),

  last = max(dep_time)

 )
`summarise()` has grouped output by 'year', 'month'. You can override using the `.groups` argument.

Question 6.

not_cancelled1 <- flights %>%
  filter(!is.na(dep_delay))

# Step 2: Calculate the proportion of flights with dep_delay > 60 minutes per month
monthly_delays <- not_cancelled1 %>%
  group_by(month) %>%
  summarise(
    total_flights = n(),
    delayed_flights = sum(dep_delay > 60),
    proportion_delayed = delayed_flights / total_flights
  ) %>%
  arrange(desc(proportion_delayed))  # Sort by highest proportion of delays

# Print result
monthly_delays

Quetsion 7.

flights %>%
  filter(dest %in% c("TPA", "BOS", "ORD", "CLT", "ATL")) %>%
  group_by(dest) %>%
  summarise(num_carriers = n_distinct(carrier)) %>%
  arrange(desc(num_carriers))

Question 8.

by_dest <- group_by(flights, dest)
by_dest
delay <- summarise(by_dest,
                   count = n(),
                   dist = mean(distance, na.rm = TRUE),
                   delay = mean(arr_delay, na.rm = TRUE)
)
delay <- filter(delay, count> 20, dest != "HNL")
ggplot(data = delay, mapping = aes(x = dist, y = delay)) + 
  geom_point(aes(size = count), alpha = 1/3) + 
  geom_smooth(se = FALSE)

Question 9.

delays <- flights %>% 

 group_by(dest) %>% 

 summarise(

  count = n(),

  dist = mean(distance, na.rm = TRUE),

  delay = mean(arr_delay, na.rm = TRUE)

 ) %>% 

 filter(count > 20, dest != "HNL")

delays
LS0tCnRpdGxlOiAiQXNzaWdubWVudCA2IH4gS2F0aWUgUXVpbm4iCm91dHB1dDogaHRtbF9ub3RlYm9vawotLS0KCmBgYHtyfQpsaWJyYXJ5KGRwbHlyKQpsaWJyYXJ5KGdncGxvdDIpCmBgYAoKIyMjIFF1ZXN0aW9uIDEuIApgYGB7cn0KcGFjbWFuOjpwX2xvYWQobnljZmxpZ2h0czEzKQoKVmlldyhmbGlnaHRzKSAjIHRoaXMgVmlldygpIGZ1bmN0aW9uIG9wZW5zIGxldHMgeW91IGRpcmVjdGx5IHZpZXcgdGhlIHdob2xlIGRhdGFzZXQKCmdsaW1wc2UoZmxpZ2h0cykgIyB0aGlzIGdsaW1wc2UoKSBmdW5jdGlvbiBwcm92aWRlcyBhIHF1aWNrIG92ZXJ2aWV3IG9mIHRoZSBkYXRhc2V0CgpzdW1tYXJ5KGZsaWdodHMpCmBgYAoKIyMjIFF1ZXN0aW9uIDIuIApgYGB7cn0Kc3VtbWFyaXNlKGZsaWdodHMsIGRlbGF5PW1lYW4oZGVwX2RlbGF5LG5hLnJtPVRSVUUpKQpgYGAKCmBgYHtyfQptYXhkZXAgPC0gbWF4KGZsaWdodHMkZGVwX2RlbGF5LCBuYS5ybT1UUlVFKQoKbWF4ZGVwX2lkIDwtIHdoaWNoKGZsaWdodHMkZGVwX2RlbGF5PT1tYXhkZXApCgpmbGlnaHRzW21heGRlcF9pZCwgMTA6MTJdCmBgYAoKYGBge3J9CnNvcnRmIDwtIGFycmFuZ2UoZmxpZ2h0cyxkZXNjKGRlcF9kZWxheSkpIAoKc2VsZWN0KHNvcnRmLCBjYXJyaWVyLCBmbGlnaHQsIHRhaWxudW0sIGV2ZXJ5dGhpbmcoKSkKYGBgCgpgYGB7cn0Kc2VsZWN0KGZsaWdodHMsIHN0YXJ0c193aXRoKCJkZXAiKSkKYGBgCgojIyMgUXVlc3Rpb24gMy4gCmBgYHtyfQpmbGlnaHRzICU+JSAKCiBncm91cF9ieSh5ZWFyLCBtb250aCwgZGF5KSAlPiUgCgogc3VtbWFyaXNlKG1lYW4gPSBtZWFuKGRlcF9kZWxheSkpCmBgYAoKYGBge3J9CmRlbGF5cyA8LSBub3RfY2FuY2VsbGVkICU+JSAKCiBncm91cF9ieSh0YWlsbnVtKSAlPiUgCgogc3VtbWFyaXNlKAoKICBkZWxheSA9IG1lYW4oYXJyX2RlbGF5KQoKICkKYGBgCgpgYGB7cn0KCm5vdF9jYW5jZWxsZWQgPC0gZmxpZ2h0cyAlPiUgCgogZmlsdGVyKCFpcy5uYShkZXBfZGVsYXkpKQoKbm90X2NhbmNlbGxlZCAlPiUgCgogZ3JvdXBfYnkoeWVhciwgbW9udGgsIGRheSkgJT4lIAoKIHN1bW1hcmlzZShtZWFuID0gbWVhbihkZXBfZGVsYXkpKQpgYGAKYGBge3J9Cm5vdF9jYW5jZWxsZWQgPC0gZmxpZ2h0cyAlPiUgCgogZmlsdGVyKCFpcy5uYShkZXBfZGVsYXkpLCAhaXMubmEoYXJyX2RlbGF5KSkKaGVhZChub3RfY2FuY2VsbGVkKQpgYGAKYGBge3J9CmZsaWdodHMgJT4lIAoKIGdyb3VwX2J5KHllYXIsIG1vbnRoLCBkYXkpICU+JSAKCiBzdW1tYXJpc2UobWVhbiA9IG1lYW4oZGVwX2RlbGF5LCBuYS5ybSA9IFRSVUUpKQpgYGAKIyMjIFF1ZXN0aW9uIDQuIApgYGB7cn0KIyBTdGVwIDE6IFJlbW92ZSBmbGlnaHRzIHdpdGggbWlzc2luZyBhcnJpdmFsIGRlbGF5cwpub3RfY2FuY2VsbGVkIDwtIGZsaWdodHMgJT4lCiAgZmlsdGVyKCFpcy5uYShhcnJfZGVsYXkpKQoKIyBTdGVwIDI6IENvbXB1dGUgdGhlIGF2ZXJhZ2UgYXJyaXZhbCBkZWxheSBwZXIgdGFpbG51bQphdmdfZGVsYXlzIDwtIG5vdF9jYW5jZWxsZWQgJT4lCiAgZ3JvdXBfYnkodGFpbG51bSkgJT4lCiAgc3VtbWFyaXNlKGF2Z19hcnJfZGVsYXkgPSBtZWFuKGFycl9kZWxheSwgbmEucm0gPSBUUlVFKSkKCiMgU3RlcCAzOiBHZXQgdGhlIHRhaWxudW0gd2l0aCB0aGUgbG93ZXN0IGF2ZXJhZ2UgYXJyaXZhbCBkZWxheQpsb3dlc3RfZGVsYXlfdGFpbG51bSA8LSBhdmdfZGVsYXlzICU+JQogIGZpbHRlcihhdmdfYXJyX2RlbGF5ID09IG1pbihhdmdfYXJyX2RlbGF5LCBuYS5ybSA9IFRSVUUpKQoKIyBQcmludCByZXN1bHQKbG93ZXN0X2RlbGF5X3RhaWxudW0KYGBgCgojIyMgUXVlc3Rpb24gNS4gCmBgYHtyfQpub3RfY2FuY2VsbGVkICU+JSAKCiBncm91cF9ieSh5ZWFyLCBtb250aCwgZGF5KSAlPiUgCgogc3VtbWFyaXNlKAoKICBmaXJzdCA9IG1pbihkZXBfdGltZSksCgogIGxhc3QgPSBtYXgoZGVwX3RpbWUpCgogKQpgYGAKCiMjIyBRdWVzdGlvbiA2LiAKYGBge3J9Cm5vdF9jYW5jZWxsZWQxIDwtIGZsaWdodHMgJT4lCiAgZmlsdGVyKCFpcy5uYShkZXBfZGVsYXkpKQoKIyBTdGVwIDI6IENhbGN1bGF0ZSB0aGUgcHJvcG9ydGlvbiBvZiBmbGlnaHRzIHdpdGggZGVwX2RlbGF5ID4gNjAgbWludXRlcyBwZXIgbW9udGgKbW9udGhseV9kZWxheXMgPC0gbm90X2NhbmNlbGxlZDEgJT4lCiAgZ3JvdXBfYnkobW9udGgpICU+JQogIHN1bW1hcmlzZSgKICAgIHRvdGFsX2ZsaWdodHMgPSBuKCksCiAgICBkZWxheWVkX2ZsaWdodHMgPSBzdW0oZGVwX2RlbGF5ID4gNjApLAogICAgcHJvcG9ydGlvbl9kZWxheWVkID0gZGVsYXllZF9mbGlnaHRzIC8gdG90YWxfZmxpZ2h0cwogICkgJT4lCiAgYXJyYW5nZShkZXNjKHByb3BvcnRpb25fZGVsYXllZCkpICAjIFNvcnQgYnkgaGlnaGVzdCBwcm9wb3J0aW9uIG9mIGRlbGF5cwoKIyBQcmludCByZXN1bHQKbW9udGhseV9kZWxheXMKYGBgCgojIyMgUXVldHNpb24gNy4gCmBgYHtyfQpmbGlnaHRzICU+JQogIGZpbHRlcihkZXN0ICVpbiUgYygiVFBBIiwgIkJPUyIsICJPUkQiLCAiQ0xUIiwgIkFUTCIpKSAlPiUKICBncm91cF9ieShkZXN0KSAlPiUKICBzdW1tYXJpc2UobnVtX2NhcnJpZXJzID0gbl9kaXN0aW5jdChjYXJyaWVyKSkgJT4lCiAgYXJyYW5nZShkZXNjKG51bV9jYXJyaWVycykpCmBgYAoKCgojIyMgUXVlc3Rpb24gOC4gCmBgYHtyfQpieV9kZXN0IDwtIGdyb3VwX2J5KGZsaWdodHMsIGRlc3QpCmJ5X2Rlc3QKYGBgCgpgYGB7cn0KZGVsYXkgPC0gc3VtbWFyaXNlKGJ5X2Rlc3QsCiAgICAgICAgICAgICAgICAgICBjb3VudCA9IG4oKSwKICAgICAgICAgICAgICAgICAgIGRpc3QgPSBtZWFuKGRpc3RhbmNlLCBuYS5ybSA9IFRSVUUpLAogICAgICAgICAgICAgICAgICAgZGVsYXkgPSBtZWFuKGFycl9kZWxheSwgbmEucm0gPSBUUlVFKQopCgpgYGAKCgpgYGB7cn0KZGVsYXkgPC0gZmlsdGVyKGRlbGF5LCBjb3VudD4gMjAsIGRlc3QgIT0gIkhOTCIpCgpgYGAKCgpgYGB7cn0KZ2dwbG90KGRhdGEgPSBkZWxheSwgbWFwcGluZyA9IGFlcyh4ID0gZGlzdCwgeSA9IGRlbGF5KSkgKyAKICBnZW9tX3BvaW50KGFlcyhzaXplID0gY291bnQpLCBhbHBoYSA9IDEvMykgKyAKICBnZW9tX3Ntb290aChzZSA9IEZBTFNFKQpgYGAKCgojIyMgUXVlc3Rpb24gOS4gCmBgYHtyfQpkZWxheXMgPC0gZmxpZ2h0cyAlPiUgCgogZ3JvdXBfYnkoZGVzdCkgJT4lIAoKIHN1bW1hcmlzZSgKCiAgY291bnQgPSBuKCksCgogIGRpc3QgPSBtZWFuKGRpc3RhbmNlLCBuYS5ybSA9IFRSVUUpLAoKICBkZWxheSA9IG1lYW4oYXJyX2RlbGF5LCBuYS5ybSA9IFRSVUUpCgogKSAlPiUgCgogZmlsdGVyKGNvdW50ID4gMjAsIGRlc3QgIT0gIkhOTCIpCgpkZWxheXMKYGBgCgoKCgoKCg==