HW 4

library(nycflights13)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ISLR2)
library(gt)

Problem 1

# Reduce `flights` to the destination, the carrier, and the six departure /
# arrival timing columns, then print a column-wise preview of the result.
flights_sm <- select(
  flights,
  dest, carrier,
  dep_time, sched_dep_time, dep_delay,
  arr_time, sched_arr_time, arr_delay
)
flights_sm %>% glimpse()
Rows: 336,776
Columns: 8
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…

Problem 2

# Keep only the flights whose destination code is "BOS" or "ATL" and preview
# the matching rows.
glimpse(
  filter(flights_sm, dest %in% c("BOS", "ATL"))
)
Rows: 32,723
Columns: 8
$ dest           <chr> "ATL", "BOS", "ATL", "ATL", "ATL", "BOS", "ATL", "ATL",…
$ carrier        <chr> "DL", "B6", "MQ", "DL", "DL", "B6", "DL", "DL", "B6", "…
$ dep_time       <int> 554, 559, 600, 606, 615, 639, 658, 754, 801, 803, 807, …
$ sched_dep_time <int> 600, 559, 600, 610, 615, 640, 700, 759, 805, 810, 810, …
$ dep_delay      <dbl> -6, 0, 0, -4, 0, -1, -2, -5, -4, -7, -3, 4, -10, -5, -4…
$ arr_time       <int> 812, 702, 837, 837, 833, 739, 944, 1039, 900, 903, 1043…
$ sched_arr_time <int> 837, 706, 825, 845, 842, 749, 939, 1041, 919, 925, 1043…
$ arr_delay      <dbl> -25, -4, 12, -8, -9, -10, 5, -2, -19, -22, 0, 17, -14, …
# Flights whose departure delay was 60 or more while the arrival delay was
# negative. Rows where either delay is NA are dropped by filter().
flights_sm %>%
  filter(dep_delay >= 60 & arr_delay < 0) %>%
  glimpse()
Rows: 3
Columns: 8
$ dest           <chr> "HNL", "LAX", "LAS"
$ carrier        <chr> "HA", "UA", "DL"
$ dep_time       <int> 1000, 932, 2018
$ sched_dep_time <int> 900, 831, 1915
$ dep_delay      <dbl> 60, 61, 63
$ arr_time       <int> 1513, 1149, 2210
$ sched_arr_time <int> 1540, 1151, 2211
$ arr_delay      <dbl> -27, -2, -1
# Flights whose arr_time lies in the closed interval [100, 700].
# NOTE(review): arr_time looks like a clock time encoded as an HHMM integer
# (i.e. 1:00 to 7:00) — confirm against ?nycflights13::flights.
flights_sm %>%
  filter(arr_time >= 100, arr_time <= 700) %>%
  glimpse()
Rows: 6,178
Columns: 8
$ dest           <chr> "SJU", "SJU", "SJU", "TPA", "FLL", "LAS", "MCI", "PSE",…
$ carrier        <chr> "UA", "B6", "B6", "B6", "B6", "B6", "EV", "B6", "B6", "…
$ dep_time       <int> 2102, 2140, 2217, 2217, 2229, 2326, 2343, 2353, 2353, 2…
$ sched_dep_time <int> 2108, 2135, 2229, 2130, 2159, 2130, 1724, 2359, 2359, 2…
$ dep_delay      <dbl> -6, 5, -12, 47, 30, 116, 379, -6, -6, -3, 43, 156, 2, 1…
$ arr_time       <int> 146, 210, 249, 140, 149, 131, 314, 425, 418, 425, 518, …
$ sched_arr_time <int> 158, 224, 315, 27, 100, 18, 1938, 445, 442, 437, 442, 2…
$ arr_delay      <dbl> -12, -14, -26, 73, 49, 73, 456, -20, -24, -12, 36, 154,…

Problem 3

# Sort from the largest departure delay downwards; flights with the same
# departure delay are ordered by arrival delay, also largest first.
glimpse(
  arrange(flights_sm, desc(dep_delay), desc(arr_delay))
)
Rows: 336,776
Columns: 8
$ dest           <chr> "HNL", "CMH", "ORD", "SFO", "CVG", "TPA", "MSP", "PDX",…
$ carrier        <chr> "HA", "MQ", "MQ", "AA", "MQ", "DL", "DL", "DL", "DL", "…
$ dep_time       <int> 641, 1432, 1121, 1139, 845, 1100, 2321, 959, 2257, 756,…
$ sched_dep_time <int> 900, 1935, 1635, 1845, 1600, 1900, 810, 1900, 759, 1700…
$ dep_delay      <dbl> 1301, 1137, 1126, 1014, 1005, 960, 911, 899, 898, 896, …
$ arr_time       <int> 1242, 1607, 1239, 1457, 1044, 1342, 135, 1236, 121, 105…
$ sched_arr_time <int> 1530, 2120, 1810, 2210, 1815, 2211, 1020, 2226, 1026, 2…
$ arr_delay      <dbl> 1272, 1127, 1109, 1007, 989, 931, 915, 850, 895, 878, 8…
# Sort on whether dep_time is missing. is.na() gives FALSE for recorded times
# and TRUE for missing ones, and FALSE sorts before TRUE, so rows with a
# missing dep_time are moved to the bottom of the table.
# NOTE(review): if the intent was to list the missing departures FIRST, the
# key must be desc(is.na(dep_time)) — confirm against the assignment.
glimpse(
  arrange(flights_sm, is.na(dep_time))
)
Rows: 336,776
Columns: 8
$ dest           <chr> "IAH", "IAH", "MIA", "BQN", "ATL", "ORD", "FLL", "IAD",…
$ carrier        <chr> "UA", "UA", "AA", "B6", "DL", "UA", "B6", "EV", "B6", "…
$ dep_time       <int> 517, 533, 542, 544, 554, 554, 555, 557, 557, 558, 558, …
$ sched_dep_time <int> 515, 529, 540, 545, 600, 558, 600, 600, 600, 600, 600, …
$ dep_delay      <dbl> 2, 4, 2, -1, -6, -4, -5, -3, -3, -2, -2, -2, -2, -2, -1…
$ arr_time       <int> 830, 850, 923, 1004, 812, 740, 913, 709, 838, 753, 849,…
$ sched_arr_time <int> 819, 830, 850, 1022, 837, 728, 854, 723, 846, 745, 851,…
$ arr_delay      <dbl> 11, 20, 33, -18, -25, 12, 19, -14, -8, 8, -2, -3, 7, -1…

Problem 4

# Per-carrier summary: mean departure delay (ignoring NA delays) and the
# number of rows for that carrier, ordered from the highest average delay
# to the lowest.
# Note: num_flights counts every row of the carrier, including rows whose
# dep_delay is NA and therefore does not contribute to the mean.
carrier_delays <- flights %>%
  group_by(carrier) %>%
  summarise(
    avg_dep_delay = mean(dep_delay, na.rm = TRUE),
    num_flights   = n()
  ) %>%
  arrange(desc(avg_dep_delay))

# The three carriers with the highest average departure delay.
head(carrier_delays, n = 3)
# A tibble: 3 × 3
  carrier avg_dep_delay num_flights
  <chr>           <dbl>       <int>
1 F9               20.2         685
2 EV               20.0       54173
3 YV               19.0         601

Problem 5

# Add three derived columns to Carseats (the result replaces the name
# `Carseats` in the workspace):
#   Price_CAD, CompPrice_CAD : Price and CompPrice multiplied by 1.34
#     (presumably a USD-to-CAD conversion rate — confirm the rate to use).
#   Ad_per_person            : Advertising divided by Population.
Carseats <- Carseats %>%
  mutate(
    # One multiplication applied to both price columns; `.names` appends the
    # "_CAD" suffix to each source column name.
    across(c(Price, CompPrice), ~ .x * 1.34, .names = "{.col}_CAD"),
    Ad_per_person = Advertising / Population
  )

Carseats %>% glimpse()
Rows: 400
Columns: 14
$ Sales         <dbl> 9.50, 11.22, 10.06, 7.40, 4.15, 10.81, 6.63, 11.85, 6.54…
$ CompPrice     <dbl> 138, 111, 113, 117, 141, 124, 115, 136, 132, 132, 121, 1…
$ Income        <dbl> 73, 48, 35, 100, 64, 113, 105, 81, 110, 113, 78, 94, 35,…
$ Advertising   <dbl> 11, 16, 10, 4, 3, 13, 0, 15, 0, 0, 9, 4, 2, 11, 11, 5, 0…
$ Population    <dbl> 276, 260, 269, 466, 340, 501, 45, 425, 108, 131, 150, 50…
$ Price         <dbl> 120, 83, 80, 97, 128, 72, 108, 120, 124, 124, 100, 94, 1…
$ ShelveLoc     <fct> Bad, Good, Medium, Medium, Bad, Bad, Medium, Good, Mediu…
$ Age           <dbl> 42, 65, 59, 55, 38, 78, 71, 67, 76, 76, 26, 50, 62, 53, …
$ Education     <dbl> 17, 10, 12, 14, 13, 16, 15, 10, 10, 17, 10, 13, 18, 18, …
$ Urban         <fct> Yes, Yes, Yes, Yes, Yes, No, Yes, Yes, No, No, No, Yes, …
$ US            <fct> Yes, Yes, Yes, Yes, No, Yes, No, Yes, No, Yes, Yes, Yes,…
$ Price_CAD     <dbl> 160.80, 111.22, 107.20, 129.98, 171.52, 96.48, 144.72, 1…
$ CompPrice_CAD <dbl> 184.92, 148.74, 151.42, 156.78, 188.94, 166.16, 154.10, …
$ Ad_per_person <dbl> 0.039855072, 0.061538462, 0.037174721, 0.008583691, 0.00…

Problem 6

# Mean Price and mean CompPrice for every combination of US and ShelveLoc.
# `.groups = "drop"` makes summarise() return an ungrouped tibble, which is
# then previewed column-wise.
Carseats %>%
  group_by(US, ShelveLoc) %>%
  summarise(
    avg_price      = mean(Price, na.rm = TRUE),
    avg_comp_price = mean(CompPrice, na.rm = TRUE),
    .groups        = "drop"
  ) %>%
  glimpse()
Rows: 6
Columns: 4
$ US             <fct> No, No, No, Yes, Yes, Yes
$ ShelveLoc      <fct> Bad, Good, Medium, Bad, Good, Medium
$ avg_price      <dbl> 111.7647, 118.0833, 113.6548, 115.6452, 117.8033, 116.8…
$ avg_comp_price <dbl> 123.2059, 126.8333, 124.5714, 124.4516, 125.3279, 125.4…

Problem 7

# Mean Price and mean CompPrice for every combination of US and ShelveLoc,
# rendered as a gt table.
# `.groups = "drop"` already makes summarize() return an ungrouped tibble, so
# no separate ungroup() step is needed before handing the result to gt().
Carseats %>%
  group_by(US, ShelveLoc) %>%
  summarize(
    avg_price = mean(Price, na.rm = TRUE),
    avg_comp_price = mean(CompPrice, na.rm = TRUE),
    .groups = "drop"
  ) %>%
  gt()
US ShelveLoc avg_price avg_comp_price
No Bad 111.7647 123.2059
No Good 118.0833 126.8333
No Medium 113.6548 124.5714
Yes Bad 115.6452 124.4516
Yes Good 117.8033 125.3279
Yes Medium 116.8963 125.4222