NYC Flights Homework

Load the libraries and view the “flights” dataset

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Open the flights dataset in the data viewer for manual inspection.
view(flights)
# Per-column summary statistics (n, mean, sd, median, ... as shown in the
# output below). Every column is passed in, including time_hour, which is
# the row that comes back as NaN/Inf and triggers the min/max warnings.
describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                vars      n    mean      sd median trimmed     mad  min  max
## year              1 336776 2013.00    0.00   2013 2013.00    0.00 2013 2013
## month             2 336776    6.55    3.41      7    6.56    4.45    1   12
## day               3 336776   15.71    8.77     16   15.70   11.86    1   31
## dep_time          4 328521 1349.11  488.28   1401 1346.82  634.55    1 2400
## sched_dep_time    5 336776 1344.25  467.34   1359 1341.60  613.80  106 2359
## dep_delay         6 328521   12.64   40.21     -2    3.32    5.93  -43 1301
## arr_time          7 328063 1502.05  533.26   1535 1526.42  619.73    1 2400
## sched_arr_time    8 336776 1536.38  497.46   1556 1550.67  618.24    1 2359
## arr_delay         9 327346    6.90   44.63     -5   -1.03   20.76  -86 1272
## carrier*         10 336776    7.14    4.14      6    7.00    5.93    1   16
## flight           11 336776 1971.92 1632.47   1496 1830.51 1608.62    1 8500
## tailnum*         12 334264 1814.32 1199.75   1798 1778.21 1587.86    1 4043
## origin*          13 336776    1.95    0.82      2    1.94    1.48    1    3
## dest*            14 336776   50.03   28.12     50   49.56   32.62    1  105
## air_time         15 327346  150.69   93.69    129  140.03   75.61   20  695
## distance         16 336776 1039.91  733.23    872  955.27  569.32   17 4983
## hour             17 336776   13.18    4.66     13   13.15    5.93    1   23
## minute           18 336776   26.23   19.30     29   25.64   23.72    0   59
## time_hour        19 336776     NaN      NA     NA     NaN      NA  Inf -Inf
##                range  skew kurtosis   se
## year               0   NaN      NaN 0.00
## month             11 -0.01    -1.19 0.01
## day               30  0.01    -1.19 0.02
## dep_time        2399 -0.02    -1.09 0.85
## sched_dep_time  2253 -0.01    -1.20 0.81
## dep_delay       1344  4.80    43.95 0.07
## arr_time        2399 -0.47    -0.19 0.93
## sched_arr_time  2358 -0.35    -0.38 0.86
## arr_delay       1358  3.72    29.23 0.08
## carrier*          15  0.36    -1.21 0.01
## flight          8499  0.66    -0.85 2.81
## tailnum*        4042  0.17    -1.24 2.08
## origin*            2  0.09    -1.50 0.00
## dest*            104  0.13    -1.08 0.05
## air_time         675  1.07     0.86 0.16
## distance        4966  1.13     1.19 1.26
## hour              22  0.00    -1.21 0.01
## minute            59  0.09    -1.24 0.03
## time_hour       -Inf    NA       NA   NA
# Plot 1: departure delay against arrival time, with the most extreme
# delays highlighted by origin airport.
#
# Despite its name, `average_dep_delay` holds no average: it is the set of
# outlier flights, i.e. every flight with dep_delay above 550, plus flights
# with dep_delay above 450 whose arr_time lies strictly between 600 and 1620.
# The parentheses spell out the grouping that `&`-before-`|` precedence
# already produced, so the selected rows are unchanged.
average_dep_delay <- filter(
  flights,
  (dep_delay > 450 & arr_time > 600 & arr_time < 1620) | dep_delay > 550
)

# Base layer: every flight. Second layer: the outliers drawn on top,
# coloured by the airport they departed from.
flight_delay <- ggplot(flights) +
  geom_point(mapping = aes(x = arr_time, y = dep_delay)) +
  geom_point(
    data = average_dep_delay,
    mapping = aes(x = arr_time, y = dep_delay, color = origin)
  )

# Colours are keyed by airport code rather than by position, so each origin
# keeps its colour (and the legend stays correct) even if one of the
# airports has no outliers. The legend text comes from the data itself.
flight_delay +
  ggtitle("flight delay") +
  xlab("arrival time") +
  ylab("departure delay") +
  scale_color_manual(
    name = "departure location",
    values = c(
      EWR = "orangered2",
      JFK = "slateblue2",
      LGA = "springgreen3"
    )
  )
## Warning: Removed 8713 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing missing values (geom_point).

# Plot 2: JetBlue flights highlighted against the full dataset.
#
# Despite its name, `average_dep_delay2` holds no average: it is every
# flight operated by carrier code "B6" (JetBlue).
average_dep_delay2 <- filter(flights, carrier == "B6")

# First layer: the JetBlue flights. The colour aesthetic is a constant key
# ("JetBlue") so that a legend entry is produced; the actual colour is
# assigned by the manual scale below.
# Second layer: all flights overlaid at 1/20 opacity, so dense regions show
# up darker. Rows with missing arr_time or dep_delay are dropped silently
# (na.rm = TRUE) in both layers.
flight_delay2 <- ggplot(average_dep_delay2) +
  geom_point(
    mapping = aes(x = arr_time, y = dep_delay, color = "JetBlue"),
    na.rm = TRUE
  ) +
  geom_point(
    data = flights,
    mapping = aes(x = arr_time, y = dep_delay),
    alpha = 1 / 20,
    na.rm = TRUE
  )

# The named `values` vector ties the colour to the "JetBlue" key, which also
# supplies the legend text (previously mislabelled "BlueJet").
flight_delay2 +
  ggtitle("flight delay") +
  xlab("arrival time") +
  ylab("departure delay") +
  scale_color_manual(name = "carrier", values = c(JetBlue = "gold2"))

## Conclusion

I made two graphs. The main purpose of the first graph is to show the outliers. I looked into those outliers and found that more of them come from JFK than from any other single airport, although JFK still does not account for more than half of the outliers. While trying to figure out why JFK has so many outliers, I looked up “why is JFK”, and the sixth autocomplete suggestion was “why is JFK airport so bad”, which, while not conclusive, does suggest that JFK is notoriously bad. I made the second graph to illustrate something I noticed while trying to explain the outliers: a lot of the flights arriving between 3 AM and 6 AM were from the same carrier. I filtered the data down to that carrier's flights and plotted them in yellow, then overlaid the entire dataset with greatly reduced opacity. After looking into it, “B6” is the carrier code for JetBlue, and I could not figure out why they might be so prevalent in the early morning.