# install.packages("nycflights13")
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(RColorBrewer)
## Warning: package 'RColorBrewer' was built under R version 4.0.3
# Take a working copy of the flights table from nycflights13. Naming the
# package explicitly (instead of the self-assignment `flights <- flights`)
# shows where the data comes from and always yields the pristine dataset,
# even if a modified `flights` already exists in the workspace.
flights <- nycflights13::flights
#view(flights)
# Per-column summary statistics (quartiles, mean, NA counts).
summary(flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :29.00 Median :2013-07-03 10:00:00
## Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00
##
# nycflights13::flights
# Show the compact structure of `flights`: one line per column with its type
# and first few values.
str(flights)
## tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Number of flights and mean departure delay for every origin/hour pair.
# `.groups = "drop"` returns an ungrouped tibble instead of relying on
# summarise()'s implicit "drop the last group" default (which also prints a
# message); every later step that needs grouping calls group_by() itself.
summary_by_group <- flights %>%
  group_by(origin, hour) %>%
  summarise(
    count = n(),
    dep_delay = mean(dep_delay, na.rm = TRUE),
    .groups = "drop"
  )
#view(summary_by_group)
# Largest hourly mean departure delay for each origin airport.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently change how missing values are handled.
summary_by_group %>%
  group_by(origin) %>%
  summarise(max_delay = max(dep_delay, na.rm = TRUE))
## # A tibble: 3 x 2
## origin max_delay
## <chr> <dbl>
## 1 EWR 31.1
## 2 JFK 26.1
## 3 LGA 25.8
# Column-wise summary statistics of the per-origin, per-hour table.
summary(summary_by_group)
## origin hour count dep_delay
## Length:57 Min. : 1.00 Min. : 1 Min. :-0.3908
## Class :character 1st Qu.: 9.00 1st Qu.: 4427 1st Qu.: 5.5696
## Mode :character Median :14.00 Median : 6752 Median :12.9168
## Mean :13.61 Mean : 5908 Mean :13.0991
## 3rd Qu.:18.00 3rd Qu.: 7836 3rd Qu.:20.3002
## Max. :23.00 Max. :11133 Max. :31.0891
## NA's :1
# For each origin airport, keep the row (hour, count, mean delay) at which
# the mean departure delay is largest.
summary_by_group %>%
  group_by(origin) %>%
  slice(which.max(dep_delay))
## # A tibble: 3 x 4
## # Groups: origin [3]
## origin hour count dep_delay
## <chr> <dbl> <int> <dbl>
## 1 EWR 19 5976 31.1
## 2 JFK 21 3461 26.1
## 3 LGA 22 221 25.8
# summary_by_group %>% group_by(origin) %>% slice(which.max(count))
# I don't think a delay of 3 or 4 minutes is meaningful.
# Bar chart of mean departure delay by hour, one panel per origin airport.
summary_by_group %>%
  ggplot(aes(x = hour, y = dep_delay, fill = origin)) +
  geom_col() +
  facet_wrap(vars(origin), ncol = 3) +
  labs(
    title = "Daily Pattern of Departure Delay",
    x = "departure hour",
    y = "average departure delay"
  )
## Warning: Removed 1 rows containing missing values (position_stack).
I did a simple analysis of the origin airports in order to obtain meaningful information from this data. According to the Bureau of Transportation Statistics, about 20% of all flights were delayed in 2021. Flight delays occur for different reasons, depending on the circumstances at the time of departure and arrival at the airport. This data shows the actual arrival and departure times and the destinations of flights departing from New York in 2013. However, judging from much of the published research, flight time, distance and the other variables in the data can hardly be regarded as causes of delay. This means that it is difficult to determine what causes flight delays from this data alone. We can, however, find out at which times of day departures from each airport are delayed the most.
I looked at departure delay times by hour at three major airports. The ‘origin’ variable identifies the three major airports serving New York: EWR, JFK and LGA stand for Newark Liberty International Airport, John F. Kennedy International Airport and LaGuardia Airport, respectively. The variable ‘hour’ uses the 24-hour clock, and the departure hours range from 5 to 23, in the local time of the origin airports. Using the group_by() function, I averaged the departure delay by hour at each airport. Looking at the chart, EWR has more departure delay occurrences and a longer average delay time compared to the other airports. You can see that there are few delayed departures at 5 am, the first departure hour, and that the longest departure delays occur around 20:00 in the evening. Passengers who plan to use one of these airports can get a rough estimate of their flight's departure delay and choose a flight time by referring to the graph.
Thank you!