library(tidyverse)
library(openintro)
# Load the nycflights data set and list its column names.
data("nycflights")
nycflights %>% names()
##  [1] "year"      "month"     "day"       "dep_time"  "dep_delay" "arr_time" 
##  [7] "arr_delay" "carrier"   "tailnum"   "flight"    "origin"    "dest"     
## [13] "air_time"  "distance"  "hour"      "minute"
# Open the data set's help page, then preview each column's type and values.
?nycflights
nycflights %>% glimpse()
## Rows: 32,735
## Columns: 16
## $ year      <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
## $ month     <int> 6, 5, 12, 5, 7, 1, 12, 8, 9, 4, 6, 11, 4, 3, 10, 1, 2, 8, 10…
## $ day       <int> 30, 7, 8, 14, 21, 1, 9, 13, 26, 30, 17, 22, 26, 25, 21, 23, …
## $ dep_time  <int> 940, 1657, 859, 1841, 1102, 1817, 1259, 1920, 725, 1323, 940…
## $ dep_delay <dbl> 15, -3, -1, -4, -3, -3, 14, 85, -10, 62, 5, 5, -2, 115, -4, …
## $ arr_time  <int> 1216, 2104, 1238, 2122, 1230, 2008, 1617, 2032, 1027, 1549, …
## $ arr_delay <dbl> -4, 10, 11, -34, -8, 3, 22, 71, -8, 60, -4, -2, 22, 91, -6, …
## $ carrier   <chr> "VX", "DL", "DL", "DL", "9E", "AA", "WN", "B6", "AA", "EV", …
## $ tailnum   <chr> "N626VA", "N3760C", "N712TW", "N914DL", "N823AY", "N3AXAA", …
## $ flight    <int> 407, 329, 422, 2391, 3652, 353, 1428, 1407, 2279, 4162, 20, …
## $ origin    <chr> "JFK", "JFK", "JFK", "JFK", "LGA", "LGA", "EWR", "JFK", "LGA…
## $ dest      <chr> "LAX", "SJU", "LAX", "TPA", "ORF", "ORD", "HOU", "IAD", "MIA…
## $ air_time  <dbl> 313, 216, 376, 135, 50, 138, 240, 48, 148, 110, 50, 161, 87,…
## $ distance  <dbl> 2475, 1598, 2475, 1005, 296, 733, 1411, 228, 1096, 820, 264,…
## $ hour      <dbl> 9, 16, 8, 18, 11, 18, 12, 19, 7, 13, 9, 13, 8, 20, 12, 20, 6…
## $ minute    <dbl> 40, 57, 59, 41, 2, 17, 59, 20, 25, 23, 40, 20, 9, 54, 17, 24…

Exercise 1

#The three histograms show that a larger “binwidth” (150) groups the delayed flights into fewer, wider bars, so each bar counts a broader range of delays, while the smaller “binwidth” (15) appears less accurate because of how squished and small its bars are.

# Histograms of departure delay for all flights, drawn three times:
# default binning, then binwidth 15, then binwidth 150.
nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 15)

nycflights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram(binwidth = 150)

Exercise 2

#There were 68 flights departing New York and heading to SFO in February.

# Flights whose destination is LAX, with a default-binned histogram of their
# departure delays.
lax_flights <- filter(nycflights, dest == "LAX")
lax_flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Flights whose destination is SFO and whose month is 2, plotted the same way.
sfo_feb_flights <- filter(nycflights, dest == "SFO" & month == 2)
sfo_feb_flights %>%
  ggplot(aes(x = dep_delay)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Preview the subset; the reported row count is the number of such flights.
sfo_feb_flights %>% glimpse()
## Rows: 68
## Columns: 16
## $ year      <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, …
## $ month     <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, …
## $ day       <int> 18, 3, 15, 18, 24, 25, 7, 15, 13, 8, 11, 13, 25, 20, 12, 27,…
## $ dep_time  <int> 1527, 613, 955, 1928, 1340, 1415, 1032, 1805, 1056, 656, 191…
## $ dep_delay <dbl> 57, 14, -5, 15, 2, -10, 1, 20, -4, -4, 40, -2, -1, -6, -7, 2…
## $ arr_time  <int> 1903, 1008, 1313, 2239, 1644, 1737, 1352, 2122, 1412, 1039, …
## $ arr_delay <dbl> 48, 38, -28, -6, -21, -13, -10, 2, -13, -6, 2, -5, -30, -22,…
## $ carrier   <chr> "DL", "UA", "DL", "UA", "UA", "UA", "B6", "AA", "UA", "DL", …
## $ tailnum   <chr> "N711ZX", "N502UA", "N717TW", "N24212", "N76269", "N532UA", …
## $ flight    <int> 1322, 691, 1765, 1214, 1111, 394, 641, 177, 642, 1865, 272, …
## $ origin    <chr> "JFK", "JFK", "JFK", "EWR", "EWR", "JFK", "JFK", "JFK", "JFK…
## $ dest      <chr> "SFO", "SFO", "SFO", "SFO", "SFO", "SFO", "SFO", "SFO", "SFO…
## $ air_time  <dbl> 358, 367, 338, 353, 341, 355, 359, 338, 347, 361, 332, 351, …
## $ distance  <dbl> 2586, 2586, 2586, 2565, 2565, 2586, 2586, 2586, 2586, 2586, …
## $ hour      <dbl> 15, 6, 9, 19, 13, 14, 10, 18, 10, 6, 19, 8, 10, 18, 7, 17, 1…
## $ minute    <dbl> 27, 13, 55, 28, 40, 15, 32, 5, 56, 56, 10, 33, 48, 49, 23, 2…

Exercise 3

#The arrival delays show that it was more common for flights to SFO in February to arrive early or on time than to arrive late. Most of the distribution sits to the left, at or below zero delay.

# Print the arrival-delay column of the SFO/February subset as a plain vector.
sfo_feb_flights[["arr_delay"]]
##  [1]  48  38 -28  -6 -21 -13 -10   2 -13  -6   2  -5 -30 -22 -40  -1 -17 -24  21
## [20] -13  -5  -6  34 -45 -18 -14 -11  45 -48   8  -3  -3 -23 -35  99 -18 -17  18
## [39]  -5 -20  11   3  -2   8 -27 -30 -30  16 -66 196  76 -10 -11   1  -7 -13 -26
## [58]  -9   9   7 -15 -35 -20 -14 -20 -20 -34 -39
# Histogram of the same arrival delays with a bin width of 10.
sfo_feb_flights %>%
  ggplot(aes(x = arr_delay)) +
  geom_histogram(binwidth = 10)

Exercise 4

#DL has the most variable delays

#February is expected to have the highest average delay departing from a NYC airport.

# For each origin airport in the SFO/February subset: median and IQR of
# departure delay, plus the number of flights.
sfo_feb_flights %>%
  group_by(origin) %>%
  summarise(
    median_dd = median(dep_delay),
    iqr_dd = IQR(dep_delay),
    n_flights = n()
  )
## # A tibble: 2 × 4
##   origin median_dd iqr_dd n_flights
##   <chr>      <dbl>  <dbl>     <int>
## 1 EWR          0.5   5.75         8
## 2 JFK         -2.5  15.2         60
library(dplyr)

# Mean and IQR of arrival delay across the whole SFO/February subset.
summarise(
  sfo_feb_flights,
  mean_ard = mean(arr_delay),
  iqr_ard = IQR(arr_delay)
)
## # A tibble: 1 × 2
##   mean_ard iqr_ard
##      <dbl>   <dbl>
## 1     -4.5    23.2
# Sort the SFO/February subset by carrier and display it.
# The variable name `q` is kept unchanged from the original script.
q <- sfo_feb_flights %>%
  arrange(carrier)
show(q)
## # A tibble: 68 × 16
##     year month   day dep_time dep_delay arr_time arr_delay carrier tailnum
##    <int> <int> <int>    <int>     <dbl>    <int>     <dbl> <chr>   <chr>  
##  1  2013     2    15     1805        20     2122         2 AA      N335AA 
##  2  2013     2     4     1107        37     1440        45 AA      N343AA 
##  3  2013     2    27     1830        45     2128         8 AA      N329AA 
##  4  2013     2     7     1741        -4     2117        -3 AA      N335AA 
##  5  2013     2    24     1547        17     1928        18 AA      N381AA 
##  6  2013     2     5      744        -1     1133         8 AA      N383AA 
##  7  2013     2    25      916        91     1241        76 AA      N335AA 
##  8  2013     2    25     1030         0     1356         1 AA      N367AA 
##  9  2013     2    11     1539         9     1844       -26 AA      N352AA 
## 10  2013     2    21     1745         0     2106       -14 AA      N329AA 
## # ℹ 58 more rows
## # ℹ 7 more variables: flight <int>, origin <chr>, dest <chr>, air_time <dbl>,
## #   distance <dbl>, hour <dbl>, minute <dbl>
# Mean departure delay for each month, listed from largest mean to smallest.
nycflights %>%
  group_by(month) %>%
  summarise(mean_dd = mean(dep_delay)) %>%
  arrange(-mean_dd)
## # A tibble: 12 × 2
##    month mean_dd
##    <int>   <dbl>
##  1     7   20.8 
##  2     6   20.4 
##  3    12   17.4 
##  4     4   14.6 
##  5     3   13.5 
##  6     5   13.3 
##  7     8   12.6 
##  8     2   10.7 
##  9     1   10.2 
## 10     9    6.87
## 11    11    6.10
## 12    10    5.88

Exercise 5

#If I were to make my decision based on the month with the lowest mean, I would be at a disadvantage because the mean is not the best representation of the data itself. Choosing the month with the lowest median would be the more advantageous decision because the median is more representative, thus painting a better picture.

# Label every flight: "on time" when dep_delay is below 5, otherwise "delayed".
# The new dep_type column is stored back on nycflights.
nycflights <- mutate(
  nycflights,
  dep_type = ifelse(dep_delay < 5, "on time", "delayed")
)

# Fraction of "on time" departures per origin airport, highest rate first.
on_time_share <- function(labels) sum(labels == "on time") / length(labels)
nycflights %>%
  group_by(origin) %>%
  summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>%
  arrange(desc(ot_dep_rate))
rm(on_time_share)
## # A tibble: 3 × 2
##   origin ot_dep_rate
##   <chr>        <dbl>
## 1 LGA          0.728
## 2 JFK          0.694
## 3 EWR          0.637

Exercise 6

#I would choose LGA based on its on-time departure percentage.

# dep_type is already stored on nycflights by the Exercise 5 code, using the
# same `dep_delay < 5` rule, so it is reused here instead of being recomputed.
# Keeping the rule in one place avoids the copies drifting apart.
# Fraction of "on time" departures per origin airport, highest rate first.
nycflights %>%
  group_by(origin) %>%
  summarise(ot_dep_rate = sum(dep_type == "on time") / n()) %>%
  arrange(desc(ot_dep_rate))
## # A tibble: 3 × 2
##   origin ot_dep_rate
##   <chr>        <dbl>
## 1 LGA          0.728
## 2 JFK          0.694
## 3 EWR          0.637
# Bar chart with one bar per origin airport, filled by dep_type.
nycflights %>%
  ggplot(aes(x = origin, fill = dep_type)) +
  geom_bar()

Exercise 7

# Add avg_speed: distance divided by (air_time / 60).
# NOTE(review): presumably air_time is recorded in minutes, which would make
# avg_speed a per-hour figure. Confirm against ?nycflights.
# dep_type is not recomputed here: the Exercise 5 code already stored it on
# nycflights with the identical `dep_delay < 5` rule.
nycflights <- nycflights %>%
  mutate(avg_speed = distance / (air_time / 60))

Exercise 8

#For shorter distances, the average speed is lower; once the distance reaches about 500, the average speed levels out and stays about the same as the distance grows.

# Scatter plot of avg_speed (y axis) against distance (x axis) for all flights.
nycflights %>%
  ggplot(aes(x = distance, y = avg_speed)) +
  geom_point()

Exercise 9

# Keep only flights operated by carriers AA, DL or UA.
# The variable name is kept unchanged from the original script.
nycflights_of_AA_DA_UA <- nycflights %>%
  filter(carrier %in% c("AA", "DL", "UA"))

# Scatter plot of arrival delay against departure delay, coloured by carrier.
nycflights_of_AA_DA_UA %>%
  ggplot(aes(x = dep_delay, y = arr_delay, color = carrier)) +
  geom_point()

LS0tCnRpdGxlOiAiTGFiIE5hbWUiCmF1dGhvcjogIkFiaWdhaWwgTWFjYWxhZ3VpbSIKZGF0ZTogImBPY3RvYmVyIDEsIDIwMjRgIgpvdXRwdXQ6IG9wZW5pbnRybzo6bGFiX3JlcG9ydAotLS0KCmBgYHtyIGxvYWQtcGFja2FnZXMsIG1lc3NhZ2U9RkFMU0V9CmxpYnJhcnkodGlkeXZlcnNlKQpsaWJyYXJ5KG9wZW5pbnRybykKZGF0YShueWNmbGlnaHRzKQpuYW1lcyhueWNmbGlnaHRzKQo/bnljZmxpZ2h0cwpnbGltcHNlKG55Y2ZsaWdodHMpCmBgYAoKCgojIyMgRXhlcmNpc2UgMQoKI1RoZSB0aHJlZSBoaXN0b2dyYW1zIHNob3cgdGhhdCBhIGxhcmdlciAiYmlud2lkdGgiICgxNTApIHdpbGwgZ2l2ZSBhIHdpZGVyIGNvdW50IG9mIHRoZSBkZWxheWVkIGZsaWdodHMgb24gdGhlIGhpc3RvZ3JhbSBhbmQgdGhlIHNtYWxsZXIgImJpbndpZHRoIiAoMTUpIGFwcGVhcnMgbGVzcyBhY2N1cmF0ZSBiZWNhdXNlIG9mIGhvdyBzcXVpc2hlZCBhbmQgc21hbGwgaXQgaXMuIAoKYGBge3IgY29kZS1jaHVuay1sYWJlbDF9CmdncGxvdChkYXRhID0gbnljZmxpZ2h0cywgYWVzKHggPSBkZXBfZGVsYXkpKSArCiAgZ2VvbV9oaXN0b2dyYW0oKQpnZ3Bsb3QoZGF0YSA9IG55Y2ZsaWdodHMsIGFlcyh4ID0gZGVwX2RlbGF5KSkgKwogIGdlb21faGlzdG9ncmFtKGJpbndpZHRoID0gMTUpCmdncGxvdChkYXRhID0gbnljZmxpZ2h0cywgYWVzKHggPSBkZXBfZGVsYXkpKSArCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSAxNTApCmBgYAoKIyMjIEV4ZXJjaXNlIDIKI3RoZXJlIHdlcmUgNjggZmxpZ2h0cyBkZXBhcnRpbmcgbmV3IHlvcmsgYW5kIGhlYWRpbmcgdG8gU0ZPIGluIGZlYnJ1YXJ5LiAKYGBge3IgY29kZS1jaHVuay1sYWJlbDJ9CmxheF9mbGlnaHRzIDwtIG55Y2ZsaWdodHMgJT4lCiAgZmlsdGVyKGRlc3QgPT0gIkxBWCIpCmdncGxvdChkYXRhID0gbGF4X2ZsaWdodHMsIGFlcyh4ID0gZGVwX2RlbGF5KSkgKwogIGdlb21faGlzdG9ncmFtKCkKc2ZvX2ZlYl9mbGlnaHRzIDwtIG55Y2ZsaWdodHMgJT4lCiAgZmlsdGVyKGRlc3QgPT0gIlNGTyIsIG1vbnRoID09IDIpCmdncGxvdChkYXRhID0gc2ZvX2ZlYl9mbGlnaHRzLCBhZXMoeCA9IGRlcF9kZWxheSkpICsKICBnZW9tX2hpc3RvZ3JhbSgpCmdsaW1wc2Uoc2ZvX2ZlYl9mbGlnaHRzKQpgYGAKIyMjIEV4ZXJjaXNlIDMKCiNUaGUgYXJyaXZhbCBkZWxheXMgc2hvdyB0aGF0IGl0IHdhcyBtb3JlIGNvbW1vbiBmb3IgZmxpZ2h0cyB0byBTRk8gaW4gRmVicnVhcnkgd2VyZSBlYXJsaWVyIGFuZCBvbiB0aW1lIHRoYW4gYmVpbmcgbGF0ZS4gSXQgZGV2aWF0ZXMgdG8gdGhlIGxlZnQgbW9yZS4gCmBgYHtyIGNvZGUtY2h1bmstbGFiZWwzfQpzZm9fZmViX2ZsaWdodHMkYXJyX2RlbGF5CgpnZ3Bsb3QoZGF0YSA9IHNmb19mZWJfZmxpZ2h0cywgYWVzKHggPSBhcnJfZGVsYXkpKSArCiAgZ2VvbV9oaXN0b2dyYW0oYmlud2lkdGggPSAxMCkKYGBgCgoKIyMjIEV4ZXJjaXNlIDQKCiNETCBoYXMgdGhl
IG1vc3QgdmFyaWFibGUgZGVsYXlzIAoKI0ZlYnJ1YXJ5IGlzIGV4cGVjdGVkIHRvIGhhdmUgdGhlIGhpZ2hlc3QgYXZlcmFnZSBkZWxheSBkZXBhcnRpbmcgZnJvbSBhIE5ZQyBhaXJwb3J0LgpgYGB7ciBjb2RlLWNodW5rLWxhYmVsNH0Kc2ZvX2ZlYl9mbGlnaHRzICU+JQogIGdyb3VwX2J5KG9yaWdpbikgJT4lCiAgc3VtbWFyaXNlKG1lZGlhbl9kZCA9IG1lZGlhbihkZXBfZGVsYXkpLCBpcXJfZGQgPSBJUVIoZGVwX2RlbGF5KSwgbl9mbGlnaHRzID0gbigpKQoKbGlicmFyeShkcGx5cikKCnNmb19mZWJfZmxpZ2h0cyAlPiUgc3VtbWFyaXNlKG1lYW5fYXJkID0gbWVhbihhcnJfZGVsYXkpLAppcXJfYXJkID0gSVFSKGFycl9kZWxheSkpCgpxIDwtIGFycmFuZ2Uoc2ZvX2ZlYl9mbGlnaHRzLCBjYXJyaWVyKQpzaG93KHEpCgpueWNmbGlnaHRzICU+JQogIGdyb3VwX2J5KG1vbnRoKSAlPiUKICBzdW1tYXJpc2UobWVhbl9kZCA9IG1lYW4oZGVwX2RlbGF5KSkgJT4lCiAgYXJyYW5nZShkZXNjKG1lYW5fZGQpKQpgYGAKIyMjIEV4ZXJjaXNlIDUKCiNJZiBJIHdlcmUgdG8gbWFrZSBteSBkZWNpc2lvbiBiYXNlZCBmcm9tIHRoZSBtb250aCB3aXRoIHRoZSBsb3dlc3QgbWVhbiwgSSB3b3VsZCBoYXZlIGEgZGlzYWR2YW50YWdlIGJlY2F1c2UgdGhlIG1lYW4gaXMgbm90IHRoZSBiZXN0IHJlcHJlc2VudGF0aW9uIG9mIHRoZSBkYXRhIGl0c2VsZi4gQ2hvb3NpbmcgdGhlIGxvd2VzdCBtZWRpYW4gd291bGQgYmUgdGhlIG1vcmUgYWR2YW50YWdlb3VzIGRlY2lzaW9uIGJlY2F1c2UgaXQgaXMgbW9yZSByZXByZXNlbnRhdGl2ZSwgdGh1cyBwYWludGluZyBhIGJldHRlciBwaWN0dXJlLgpgYGB7ciBjb2RlLWNodW5rLWxhYmVsNX0KbnljZmxpZ2h0cyA8LSBueWNmbGlnaHRzICU+JQogIG11dGF0ZShkZXBfdHlwZSA9IGlmZWxzZShkZXBfZGVsYXkgPCA1LCAib24gdGltZSIsICJkZWxheWVkIikpCm55Y2ZsaWdodHMgJT4lCiAgZ3JvdXBfYnkob3JpZ2luKSAlPiUKICBzdW1tYXJpc2Uob3RfZGVwX3JhdGUgPSBzdW0oZGVwX3R5cGUgPT0gIm9uIHRpbWUiKSAvIG4oKSkgJT4lCiAgYXJyYW5nZShkZXNjKG90X2RlcF9yYXRlKSkKYGBgCgoKIyMjIEV4ZXJjaXNlIDYKCiNJIHdvdWxkIGNob29zZSBMR0EgYmFzZWQgb24gdGltZSBkZXBhcnR1cmUgcGVyY2VudGFnZS4gCmBgYHtyIGNvZGUtY2h1bmstbGFiZWw2fQpueWNmbGlnaHRzIDwtIG55Y2ZsaWdodHMgJT4lCiAgbXV0YXRlKGRlcF90eXBlID0gaWZlbHNlKGRlcF9kZWxheSA8IDUsICJvbiB0aW1lIiwgImRlbGF5ZWQiKSkKbnljZmxpZ2h0cyAlPiUKICBncm91cF9ieShvcmlnaW4pICU+JQogIHN1bW1hcmlzZShvdF9kZXBfcmF0ZSA9IHN1bShkZXBfdHlwZSA9PSAib24gdGltZSIpIC8gbigpKSAlPiUKICBhcnJhbmdlKGRlc2Mob3RfZGVwX3JhdGUpKQoKZ2dwbG90KGRhdGEgPSBueWNmbGlnaHRzLCBhZXMoeCA9IG9yaWdpbiwgZmlsbCA9IGRlcF90eXBlKSkgKwogIGdlb21fYmFyKCkKYGBg
CgoKIyMjIEV4ZXJjaXNlIDcKCiMKYGBge3IgY29kZS1jaHVuay1sYWJlbDd9Cm55Y2ZsaWdodHMgPC0gbnljZmxpZ2h0cyAlPiUKICBtdXRhdGUoYXZnX3NwZWVkID0gZGlzdGFuY2UvKGFpcl90aW1lLzYwKSkKbnljZmxpZ2h0cyA8LSBueWNmbGlnaHRzICU+JQogIG11dGF0ZShkZXBfdHlwZSA9IGlmZWxzZShkZXBfZGVsYXkgPCA1LCAib24gdGltZSIsICJkZWxheWVkIikpCgpgYGAKCgojIyMgRXhlcmNpc2UgOAoKI0ZvciBzaG9ydGVyIGRpc3RhbmNlcywgdGhlIGF2ZXJhZ2UgaXMgbG93ZXIgYW5kIGFzIHRoZSBtaW5pbXVtIGRpc3RhbmNlIHJlYWNoZXMgYWJvdXQgNTAwLCB0aGVuIHRoZSBhdmVyYWdlIHNwZWVkIGxldmVscyBvdXQgYW5kIHN0YXlzIHRoZSBzYW1lIGFzIHRoZSBkaXN0YW5jZSBncm93cy4gIApgYGB7ciBjb2RlLWNodW5rLWxhYmVsOH0KZ2dwbG90KGRhdGE9bnljZmxpZ2h0cyAsIGFlcyh5PWF2Z19zcGVlZCAsIHggPSBkaXN0YW5jZSkpKwogIGdlb21fcG9pbnQoKQpgYGAKCgojIyMgRXhlcmNpc2UgOQoKIwpgYGB7ciBjb2RlLWNodW5rLWxhYmVsOX0KbnljZmxpZ2h0c19vZl9BQV9EQV9VQSA8LSBueWNmbGlnaHRzICU+JQogIGZpbHRlcihjYXJyaWVyID09ICJBQSIgfCBjYXJyaWVyID09ICJETCIgfCBjYXJyaWVyID09ICJVQSIpCmdncGxvdChkYXRhID0gbnljZmxpZ2h0c19vZl9BQV9EQV9VQSwgYWVzKHggPSBkZXBfZGVsYXksIHkgPSBhcnJfZGVsYXksIGNvbG9yPSBjYXJyaWVyKSkgKyBnZW9tX3BvaW50KCkKYGBg