NYC Flights Homework

I load the libraries and take a first look at the “flights” dataset.

#install.packages("psych")

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(psych)
## Warning: package 'psych' was built under R version 4.0.4
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Open the raw flights table in the data viewer, then print per-column
# summary statistics. Both calls are namespace-qualified so it is clear
# which package provides them.
tibble::view(flights)
psych::describe(flights)
## Warning in FUN(newX[, i], ...): no non-missing arguments to min; returning Inf
## Warning in FUN(newX[, i], ...): no non-missing arguments to max; returning -Inf
##                vars      n    mean      sd median trimmed     mad  min  max
## year              1 336776 2013.00    0.00   2013 2013.00    0.00 2013 2013
## month             2 336776    6.55    3.41      7    6.56    4.45    1   12
## day               3 336776   15.71    8.77     16   15.70   11.86    1   31
## dep_time          4 328521 1349.11  488.28   1401 1346.82  634.55    1 2400
## sched_dep_time    5 336776 1344.25  467.34   1359 1341.60  613.80  106 2359
## dep_delay         6 328521   12.64   40.21     -2    3.32    5.93  -43 1301
## arr_time          7 328063 1502.05  533.26   1535 1526.42  619.73    1 2400
## sched_arr_time    8 336776 1536.38  497.46   1556 1550.67  618.24    1 2359
## arr_delay         9 327346    6.90   44.63     -5   -1.03   20.76  -86 1272
## carrier*         10 336776    7.14    4.14      6    7.00    5.93    1   16
## flight           11 336776 1971.92 1632.47   1496 1830.51 1608.62    1 8500
## tailnum*         12 334264 1814.32 1199.75   1798 1778.21 1587.86    1 4043
## origin*          13 336776    1.95    0.82      2    1.94    1.48    1    3
## dest*            14 336776   50.03   28.12     50   49.56   32.62    1  105
## air_time         15 327346  150.69   93.69    129  140.03   75.61   20  695
## distance         16 336776 1039.91  733.23    872  955.27  569.32   17 4983
## hour             17 336776   13.18    4.66     13   13.15    5.93    1   23
## minute           18 336776   26.23   19.30     29   25.64   23.72    0   59
## time_hour        19 336776     NaN      NA     NA     NaN      NA  Inf -Inf
##                range  skew kurtosis   se
## year               0   NaN      NaN 0.00
## month             11 -0.01    -1.19 0.01
## day               30  0.01    -1.19 0.02
## dep_time        2399 -0.02    -1.09 0.85
## sched_dep_time  2253 -0.01    -1.20 0.81
## dep_delay       1344  4.80    43.95 0.07
## arr_time        2399 -0.47    -0.19 0.93
## sched_arr_time  2358 -0.35    -0.38 0.86
## arr_delay       1358  3.72    29.23 0.08
## carrier*          15  0.36    -1.21 0.01
## flight          8499  0.66    -0.85 2.81
## tailnum*        4042  0.17    -1.24 2.08
## origin*            2  0.09    -1.50 0.00
## dest*            104  0.13    -1.08 0.05
## air_time         675  1.07     0.86 0.16
## distance        4966  1.13     1.19 1.26
## hour              22  0.00    -1.21 0.01
## minute            59  0.09    -1.24 0.03
## time_hour       -Inf    NA       NA   NA

In this analysis, I want to find out whether planes usually make up, while in the air, the time they lost to departure delays.

Additionally, I want to see if there is a difference in this aspect between summer and winter months.

I rename the months, replacing the numeric codes with month names.

# Replace the numeric month codes (1-12) with English month names.
# `month.name` is base R's built-in vector of month names, so a single
# vectorised lookup does the work of twelve separate assignments and avoids
# comparing a column that has already become character against numbers.
# The guard keeps this step safe to re-run once `month` is already text.
if (is.numeric(flights$month)) {
  flights$month <- month.name[flights$month]
}

view(flights)

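I create a new variable (“gain”) equal to the arrival delay minus the departure delay, so a negative value means the plane made up time in the air and a positive value means it lost more time. Then I sort the data by the new variable from its minimum to its maximum value.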

# gain = arrival delay minus departure delay. A negative gain means the
# arrival delay was smaller than the departure delay; a positive gain means
# the delay grew. Rows are then ordered by gain, smallest first.
flights_gain <- arrange(
  mutate(flights, gain = arr_delay - dep_delay),
  gain
)
print(flights_gain)
## # A tibble: 336,776 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <chr> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013 June     13     1907           1512       235     2134           1928
##  2  2013 Febr~    26     1000            900        60     1513           1540
##  3  2013 Febr~    23     1226            900       206     1746           1540
##  4  2013 May      13     1917           1900        17     2149           2251
##  5  2013 Febr~    27      924            900        24     1448           1540
##  6  2013 July     14     1917           1829        48     2109           2135
##  7  2013 July     17     2004           1930        34     2224           2304
##  8  2013 Dece~    27     1719           1648        31     1956           2038
##  9  2013 May       2     1947           1949        -2     2209           2324
## 10  2013 Nove~    13     2024           2015         9     2251           2354
## # ... with 336,766 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   gain <dbl>
# Turn `month` into a factor whose levels follow the calendar, so plots list
# the months January-December. `month.name` is base R's built-in vector of
# the twelve English month names, in calendar order.
flights_gain$month <- factor(flights_gain$month, levels = month.name)

# Boxplot of gain (arr_delay - dep_delay) for every month.
# The y axis shows the gain values themselves, so it is labelled as gain
# rather than as a frequency. No fill scale is added because the default
# discrete fill scale is what an argument-less scale_fill_discrete() gives.
p1 <- flights_gain %>%
  ggplot(aes(month, gain, fill = month)) +
  geom_boxplot() +
  ggtitle("Difference in delays") +
  xlab("Months") +
  ylab("Gain (minutes)")
p1
## Warning: Removed 9430 rows containing non-finite values (stat_boxplot).

I make a histogram to see how often particular values of the “gain” variable occur in the dataset. We can see that the histogram is slightly negatively skewed. This means that planes frequently make up time in the air when they depart late.

# Histogram of gain over all flights, with bars filled by month.
# Built with ggplot() + geom_histogram() because qplot() is deprecated in
# current ggplot2 releases; the x/fill mapping and the 25 bins are the same
# as in the qplot() call this replaces.
p5 <- ggplot(flights_gain, aes(x = gain, fill = month)) +
  geom_histogram(bins = 25) +
  ggtitle("Difference in delays")
p5
## Warning: Removed 9430 rows containing non-finite values (stat_bin).

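In the analysis below, I look at the winter months (December, January and February), keeping only the flights whose “gain” is greater than or equal to 30 minutes, i.e. flights whose arrival delay was at least 30 minutes larger than their departure delay. To visualize this, I use a boxplot and a histogram.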

# Winter subset: rows from January, February or December whose gain is at
# least 30. Conditions passed to filter() as separate arguments are combined
# with AND.
flights_winter_delay <- flights_gain %>%
  filter(
    month %in% c("January", "February", "December"),
    gain >= 30
  )
print(flights_winter_delay)
## # A tibble: 2,901 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <fct> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013 Janu~     1     1424           1349        35     1701           1556
##  2  2013 Janu~     1     1603           1605        -2     1818           1750
##  3  2013 Janu~     1     1840           1845        -5     2055           2030
##  4  2013 Janu~     2      933            929         4     1305           1231
##  5  2013 Janu~     2     1454           1450         4     1755           1721
##  6  2013 Janu~     3      850            855        -5     1126           1101
##  7  2013 Janu~     3     1519           1459        20     1855           1805
##  8  2013 Janu~     3     1706           1710        -4     2041           2015
##  9  2013 Janu~     5      859            905        -6     1253           1229
## 10  2013 Janu~     5     1443           1450        -7     1649           1626
## # ... with 2,891 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   gain <dbl>
# Boxplot of gain per winter month (rows already restricted to gain >= 30).
# NOTE: this reuses the name `p1`, replacing the all-months boxplot.
# The y axis shows gain values, so it is labelled as gain, not frequency.
# Legend labels are left to the factor levels present in the data instead
# of a hard-coded vector, which would mislabel the legend if the selected
# months ever changed.
p1 <- flights_winter_delay %>%
  ggplot(aes(month, gain, fill = month)) +
  geom_boxplot() +
  ggtitle("Difference in delays_winter months") +
  xlab("Months") +
  ylab("Gain (minutes)") +
  scale_fill_discrete(name = "month")
p1

# Overlaid histograms of gain for the winter months: one semi-transparent
# layer per month, drawn in place (position = "identity") rather than
# stacked, with 5-unit-wide bins.
# Legend labels come from the factor levels present in the data instead of
# a hard-coded vector, which would mislabel the legend if the selected
# months ever changed.
p2 <- flights_winter_delay %>%
  ggplot(aes(x = gain, fill = month)) +
  geom_histogram(
    position = "identity",
    alpha = 0.5,
    binwidth = 5,
    color = "white"
  ) +
  ggtitle("Difference in delays_winter months") +
  scale_fill_discrete(name = "Month")
p2

Next, I look at the summer months (June, July and August), again keeping only the flights whose “gain” is greater than or equal to 30 minutes.

# Summer subset: rows from June, July or August whose gain is at least 30.
# Conditions passed to filter() as separate arguments are combined with AND.
flights_summer_delay <- flights_gain %>%
  filter(
    month %in% c("June", "July", "August"),
    gain >= 30
  )
print(flights_summer_delay)
## # A tibble: 4,521 x 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <fct> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013 June      2     1917           1735       102     2149           1937
##  2  2013 June      3      935            923        12     1208           1126
##  3  2013 June      3     1155           1200        -5     1535           1510
##  4  2013 June      3     1216           1220        -4     1440           1414
##  5  2013 June      3     1825           1830        -5     2225           2200
##  6  2013 June      5     1608           1600         8     1915           1837
##  7  2013 June      6     1551           1553        -2     1737           1709
##  8  2013 June      7     1733           1735        -2     2133           2105
##  9  2013 June      7     1928           1835        53     2336           2213
## 10  2013 June      7     1939           1800        99     2135           1926
## # ... with 4,511 more rows, and 12 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>,
## #   gain <dbl>

Here is a visualization of the summer months, using a boxplot and a histogram.

# Boxplot of gain per summer month (rows already restricted to gain >= 30).
# The y axis shows gain values, so it is labelled as gain, not frequency.
# Legend labels are left to the factor levels present in the data instead
# of a hard-coded vector, which would mislabel the legend if the selected
# months ever changed.
p3 <- flights_summer_delay %>%
  ggplot(aes(month, gain, fill = month)) +
  geom_boxplot() +
  ggtitle("Difference in delays_summer months") +
  xlab("Months") +
  ylab("Gain (minutes)") +
  scale_fill_discrete(name = "month")
p3

# Overlaid histograms of gain for the summer months: one semi-transparent
# layer per month, drawn in place (position = "identity") rather than
# stacked, with 5-unit-wide bins.
# Legend labels come from the factor levels present in the data instead of
# a hard-coded vector, which would mislabel the legend if the selected
# months ever changed.
p4 <- flights_summer_delay %>%
  ggplot(aes(x = gain, fill = month)) +
  geom_histogram(
    position = "identity",
    alpha = 0.5,
    binwidth = 5,
    color = "white"
  ) +
  ggtitle("Difference in delays_summer months") +
  scale_fill_discrete(name = "Month")
p4

The visualizations of gain for the winter and summer months show that planes have more difficulty making up time in the air in the summer than in the winter. This surprised me, as I expected the winter months to be worse. Summer is probably worse for catching up in the air because it is the high season for travel: there are more planes in the air, and air traffic may be congested.