Exercise 1:

# List the column names of the flights data.
nycflights %>% names()
##  [1] "year"      "month"     "day"       "dep_time"  "dep_delay" "arr_time" 
##  [7] "arr_delay" "carrier"   "tailnum"   "flight"    "origin"    "dest"     
## [13] "air_time"  "distance"  "hour"      "minute"
# Compact overview: row/column counts plus the type and first values of each column.
nycflights %>% glimpse()
## Rows: 32,735
## Columns: 16
## $ year      <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, ~
## $ month     <int> 6, 5, 12, 5, 7, 1, 12, 8, 9, 4, 6, 11, 4, 3, 10, 1, 2, 8, 10~
## $ day       <int> 30, 7, 8, 14, 21, 1, 9, 13, 26, 30, 17, 22, 26, 25, 21, 23, ~
## $ dep_time  <int> 940, 1657, 859, 1841, 1102, 1817, 1259, 1920, 725, 1323, 940~
## $ dep_delay <int> 15, -3, -1, -4, -3, -3, 14, 85, -10, 62, 5, 5, -2, 115, -4, ~
## $ arr_time  <int> 1216, 2104, 1238, 2122, 1230, 2008, 1617, 2032, 1027, 1549, ~
## $ arr_delay <int> -4, 10, 11, -34, -8, 3, 22, 71, -8, 60, -4, -2, 22, 91, -6, ~
## $ carrier   <chr> "VX", "DL", "DL", "DL", "9E", "AA", "WN", "B6", "AA", "EV", ~
## $ tailnum   <chr> "N626VA", "N3760C", "N712TW", "N914DL", "N823AY", "N3AXAA", ~
## $ flight    <int> 407, 329, 422, 2391, 3652, 353, 1428, 1407, 2279, 4162, 20, ~
## $ origin    <chr> "JFK", "JFK", "JFK", "JFK", "LGA", "LGA", "EWR", "JFK", "LGA~
## $ dest      <chr> "LAX", "SJU", "LAX", "TPA", "ORF", "ORD", "HOU", "IAD", "MIA~
## $ air_time  <int> 313, 216, 376, 135, 50, 138, 240, 48, 148, 110, 50, 161, 87,~
## $ distance  <int> 2475, 1598, 2475, 1005, 296, 733, 1411, 228, 1096, 820, 264,~
## $ hour      <int> 9, 16, 8, 18, 11, 18, 12, 19, 7, 13, 9, 13, 8, 20, 12, 20, 6~
## $ minute    <int> 40, 57, 59, 41, 2, 17, 59, 20, 25, 23, 40, 20, 9, 54, 17, 24~

The unit of observation is an individual flight (32735 flights total)

Exercise 2:

 # Departure-delay histograms: default bins first, then binwidths of 15 and 150.
 nycflights %>%
   ggplot(aes(x = dep_delay)) +
   geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 15)

nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 150)

# Keep only the flights whose destination is LAX, then inspect the subset.
lax_flights <- filter(nycflights, dest == "LAX")
lax_flights %>% glimpse()
## Rows: 1,583
## Columns: 16
## $ year      <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, ~
## $ month     <int> 6, 12, 7, 8, 3, 6, 11, 3, 9, 3, 1, 10, 4, 9, 2, 3, 3, 4, 12,~
## $ day       <int> 30, 8, 5, 22, 27, 9, 26, 24, 17, 5, 8, 4, 8, 26, 7, 31, 24, ~
## $ dep_time  <int> 940, 859, 920, 1108, 1158, 1914, 1545, 2005, 1437, 1153, 185~
## $ dep_delay <int> 15, -1, 5, -7, -2, -2, 0, 5, -8, -7, -6, -3, -10, -2, -4, -2~
## $ arr_time  <int> 1216, 1238, 1204, 1352, 1455, 2234, 1900, 2248, 1736, 1526, ~
## $ arr_delay <int> -4, 11, -6, -12, -16, 9, -20, -37, -22, 15, -27, -11, -3, -1~
## $ carrier   <chr> "VX", "DL", "AA", "UA", "DL", "UA", "AA", "UA", "UA", "DL", ~
## $ tailnum   <chr> "N626VA", "N712TW", "N328AA", "N597UA", "N721TW", "N26208", ~
## $ flight    <int> 407, 422, 1, 703, 863, 1439, 133, 1466, 841, 863, 21, 398, 3~
## $ origin    <chr> "JFK", "JFK", "JFK", "JFK", "JFK", "EWR", "JFK", "EWR", "JFK~
## $ dest      <chr> "LAX", "LAX", "LAX", "LAX", "LAX", "LAX", "LAX", "LAX", "LAX~
## $ air_time  <int> 313, 376, 302, 292, 336, 317, 334, 315, 325, 343, 337, 323, ~
## $ distance  <int> 2475, 2475, 2475, 2475, 2475, 2454, 2475, 2454, 2475, 2475, ~
## $ hour      <int> 9, 8, 9, 11, 11, 19, 15, 20, 14, 11, 18, 11, 11, 10, 5, 13, ~
## $ minute    <int> 40, 59, 20, 8, 58, 14, 45, 5, 37, 53, 59, 22, 50, 58, 57, 23~
# Histogram of departure delays for the LAX-bound flights (default bins).
lax_flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Mean and median departure delay, plus the number of LAX-bound flights.
summarize(
  lax_flights,
  mean_dd = mean(dep_delay),
  median_dd = median(dep_delay),
  sample_size = n()
)
##    mean_dd median_dd sample_size
## 1 9.782059        -1        1583
# Flights to SFO in month 2 (February), followed by a summary of their
# arrival delays: mean, median and the number of flights in the subset.
sfo_feb_flights <- filter(nycflights, dest == "SFO", month == 2)
summarise(
  sfo_feb_flights,
  mean_ad = mean(arr_delay),
  median_ad = median(arr_delay),
  sample_size = n()
)
##   mean_ad median_ad sample_size
## 1    -4.5       -11          68

The dataset included 68 flights headed to SFO in February.

Exercise 3:

# Histogram of arrival delays for the February SFO flights, using a binwidth of 60.
sfo_feb_flights %>%
  ggplot(aes(x = arr_delay)) +
  geom_histogram(binwidth = 60)

The distribution is right-skewed, so the mean is likely higher than the median.

Exercise 4:

# Median and mean arrival delay for the February SFO flights;
# `ad` holds the number of rows (flights) in the subset.
summarise(
  sfo_feb_flights,
  median_ad = median(arr_delay),
  mean_ad = mean(arr_delay),
  ad = n()
)
##   median_ad mean_ad ad
## 1       -11    -4.5 68

Mean arrival delay (-4.5 minutes) is larger than the median arrival delay (-11 minutes). This matches the Exercise 3 conclusion: the right skew predicted that the mean of the arrival delay would lie to the right of the median (more positive, i.e. pulled toward the later arrivals in the long right tail).

Exercise 5:

# Median arrival delay and flight count per month, largest median first.
group_by(nycflights, month) %>%
  summarize(
    median_ad = median(arr_delay),
    n_flights = n()
  ) %>%
  arrange(desc(median_ad))
## # A tibble: 12 x 3
##    month median_ad n_flights
##    <int>     <dbl>     <int>
##  1    12         4      2716
##  2     7        -1      2742
##  3     4        -2      2781
##  4     6        -2      2732
##  5     1        -3      2610
##  6     2        -3      2286
##  7     8        -5      2880
##  8    11        -5      2733
##  9     3        -6      2869
## 10     5        -7      2821
## 11    10        -7      2884
## 12     9       -12      2681

December (12) has the highest median arrival delay (4 minutes). Because that median is positive, at least 50% of December flights arrived 4 or more minutes late.

Exercise 6:

# Mean arrival delay and flight count per carrier, largest mean first.
group_by(nycflights, carrier) %>%
  summarize(
    mean_ad = mean(arr_delay),
    n_flights = n()
  ) %>%
  arrange(desc(mean_ad))
## # A tibble: 16 x 3
##    carrier mean_ad n_flights
##    <chr>     <dbl>     <int>
##  1 HA       28.1          34
##  2 FL       19.6         307
##  3 OO       16.7           3
##  4 EV       16.2        5142
##  5 YV       15.1          53
##  6 F9       12.5          69
##  7 MQ       10.1        2507
##  8 B6        9.73       5376
##  9 WN        8.88       1261
## 10 9E        8.04       1696
## 11 UA        4.36       5770
## 12 VX        2.41        497
## 13 US        1.80       2015
## 14 AA        1.40       3188
## 15 DL        0.907      4751
## 16 AS      -11.3          66
# (group by, summarize by)

Based on mean, Hawaiian Airlines (HA) has the longest arrival delays.

Exercise 7:

# Add a speed column: distance divided by air time expressed in hours
# (assumes air_time is recorded in minutes -- TODO confirm against the data docs).
nycflights <- mutate(nycflights, speed = distance / (air_time / 60))

# Mean speed and flight count per carrier, fastest first.
group_by(nycflights, carrier) %>%
  summarize(
    mean_speed = mean(speed),
    n_flights = n()
  ) %>%
  arrange(desc(mean_speed))
## # A tibble: 16 x 3
##    carrier mean_speed n_flights
##    <chr>        <dbl>     <int>
##  1 HA            481.        34
##  2 VX            445.       497
##  3 AS            443.        66
##  4 F9            427.        69
##  5 UA            421.      5770
##  6 DL            419.      4751
##  7 AA            417.      3188
##  8 B6            400.      5376
##  9 WN            399.      1261
## 10 FL            392.       307
## 11 MQ            367.      2507
## 12 OO            364.         3
## 13 EV            362.      5142
## 14 9E            347.      1696
## 15 US            343.      2015
## 16 YV            332.        53

Hawaiian Airlines (HA) operates flights with the highest speeds (according to this data).

Exercise 8:

# Scatterplot of speed against distance, one point per flight.
ggplot(data = nycflights, aes(x = distance, y = speed)) +
  geom_point()

Travel distance and flight speed seem to have a positive monotonic relationship based on this data.

Exercise 9:

# Mean flight distance and flight count per carrier, longest mean distance first.
group_by(nycflights, carrier) %>%
  summarize(
    mean_distance = mean(distance),
    n_flights = n()
  ) %>%
  arrange(desc(mean_distance))
## # A tibble: 16 x 3
##    carrier mean_distance n_flights
##    <chr>           <dbl>     <int>
##  1 HA              4983         34
##  2 VX              2501.       497
##  3 AS              2402         66
##  4 F9              1620         69
##  5 UA              1528.      5770
##  6 AA              1350.      3188
##  7 DL              1245.      4751
##  8 B6              1063.      5376
##  9 WN               995.      1261
## 10 FL               651.       307
## 11 OO               615.         3
## 12 MQ               565.      2507
## 13 EV               562.      5142
## 14 US               557.      2015
## 15 9E               538.      1696
## 16 YV               395.        53

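Hawaiian Airlines (HA) likely operates at the highest speeds because, on average, it is the airline with the longest arrival delays, so flying faster hopefully compensates at least a little for late arrival. The table above also shows that HA has by far the longest mean flight distance (4983), which fits the positive relationship between distance and speed seen in Exercise 8.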