# install.packages("nycflights13")
library(nycflights13)
## Warning: package 'nycflights13' was built under R version 4.0.5
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.0.5
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.0.5
## Warning: package 'tibble' was built under R version 4.0.5
## Warning: package 'tidyr' was built under R version 4.0.5
## Warning: package 'readr' was built under R version 4.0.5
## Warning: package 'dplyr' was built under R version 4.0.5
## Warning: package 'forcats' was built under R version 4.0.5
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(RColorBrewer)
## Warning: package 'RColorBrewer' was built under R version 4.0.3
# Bind the packaged dataset to a workspace variable so it appears as a regular
# object, then print per-column summary statistics.
flights <- nycflights13::flights
# view(flights)  # uncomment for an interactive look at the table
summary(flights)
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                  
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 
# Structural overview of nycflights13::flights: dimensions plus the type and
# first few values of every column.
str(flights)
## tibble [336,776 x 19] (S3: tbl_df/tbl/data.frame)
##  $ year          : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
##  $ month         : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ day           : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
##  $ dep_time      : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
##  $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
##  $ dep_delay     : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
##  $ arr_time      : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
##  $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
##  $ arr_delay     : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
##  $ carrier       : chr [1:336776] "UA" "UA" "AA" "B6" ...
##  $ flight        : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
##  $ tailnum       : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
##  $ origin        : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
##  $ dest          : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
##  $ air_time      : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
##  $ distance      : num [1:336776] 1400 1416 1089 1576 762 ...
##  $ hour          : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
##  $ minute        : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
##  $ time_hour     : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...

Create a new data set grouped by origin (the three airports) and hour (the scheduled departure hour) using group_by(), in which each row holds the number of flights and the mean departure delay for that group.

# Flight count and mean departure delay for every origin airport / departure
# hour combination. Missing delays are ignored when averaging, so a group whose
# delays are all missing ends up with a missing average. The result stays
# grouped by origin, as the summarise() message reports.
summary_by_group <- summarise(
  group_by(flights, origin, hour),
  count = n(),
  dep_delay = mean(dep_delay, na.rm = TRUE)
)
## `summarise()` has grouped output by 'origin'. You can override using the
## `.groups` argument.
#view(summary_by_group)

Find the maximum hourly average delay for each origin using group_by().

# Largest hourly average departure delay for each origin airport.
# na.rm = TRUE skips hour groups whose average is missing. The logical is
# spelled out because `T` is an ordinary variable that can be reassigned.
summary_by_group %>%
  group_by(origin) %>%
  summarise(max_delay = max(dep_delay, na.rm = TRUE))
## # A tibble: 3 x 2
##   origin max_delay
##   <chr>      <dbl>
## 1 EWR         31.1
## 2 JFK         26.1
## 3 LGA         25.8
# Per-column summary of the hourly table; it also reports how many hourly
# averages are missing.
summary(summary_by_group)
##     origin               hour           count         dep_delay      
##  Length:57          Min.   : 1.00   Min.   :    1   Min.   :-0.3908  
##  Class :character   1st Qu.: 9.00   1st Qu.: 4427   1st Qu.: 5.5696  
##  Mode  :character   Median :14.00   Median : 6752   Median :12.9168  
##                     Mean   :13.61   Mean   : 5908   Mean   :13.0991  
##                     3rd Qu.:18.00   3rd Qu.: 7836   3rd Qu.:20.3002  
##                     Max.   :23.00   Max.   :11133   Max.   :31.0891  
##                                                     NA's   :1

Find the rows that contain the maximum departure delay for each origin using which.max() and slice().

# For each origin airport, keep the one hour row with the largest average
# departure delay. which.max() ignores missing averages and returns the
# position of the first maximum, so exactly one row per airport is kept.
summary_by_group %>%
  group_by(origin) %>%
  slice(which.max(dep_delay))
## # A tibble: 3 x 4
## # Groups:   origin [3]
##   origin  hour count dep_delay
##   <chr>  <dbl> <int>     <dbl>
## 1 EWR       19  5976      31.1
## 2 JFK       21  3461      26.1
## 3 LGA       22   221      25.8
# summary_by_group %>% group_by(origin) %>% slice(which.max(count))
# I don't think a delay of 3 or 4 minutes is meaningful.

Create a plot that shows the result of the data frame ‘summary_by_group’.

# Bar chart of the hourly average departure delay, one facet per origin
# airport. Bars are filled by airport, which also produces the colour legend.
ggplot(
  data = summary_by_group,
  mapping = aes(x = hour, y = dep_delay, fill = origin)
) +
  geom_col() +
  facet_wrap(~origin, ncol = 3) +
  labs(
    title = "Daily Pattern of Departure Delay",
    x = "departure hour",
    y = "average departure delay"
  )
## Warning: Removed 1 rows containing missing values (position_stack).

Requirements for the plot:

  1. Include at least one dplyr command (filter, sort, summarize, group_by, select, mutate, ….)
  2. Include labels for the x- and y-axes
  3. Include a title
  4. Your plot must incorporate at least 2 colors
  5. Include a legend that indicates what the colors represent
  6. Write a brief paragraph that describes the visualization you have created and at least one aspect of the plot that you would like to highlight.

#5

I did a simple analysis of the origin airports in order to obtain meaningful information from this data. According to the Bureau of Transportation Statistics, about 20% of all flights were delayed in 2021. Flight delays occur for different reasons, depending on the circumstances at the time of departure and arrival at the airport. This data shows the actual arrival and departure times and the destinations of flights departing from New York in 2013. However, according to much of the published research, flight time, distance, and the other variables in this data can hardly be regarded as causes of delay. This means that it is difficult to determine what causes flight delays from this data alone. We can, however, use this data to find out at which time of day departures from each airport are delayed the most.

I looked at departure delays by hour at three major airports. The ‘origin’ variable indicates the three major airports serving New York: EWR, JFK and LGA stand for Newark Liberty International Airport, John F. Kennedy International Airport and LaGuardia Airport, respectively. The variable ‘hour’ is measured on a 24-hour clock, and the departure hours range from 5 to 23 in the local time of the origin airports. Using the group_by() function, I averaged the departure delay by hour at each airport. Looking at the chart, EWR has more departure delay occurrences and a longer average delay time compared to the other airports. You can see that there are few delayed departures at 5 am, the first departure hour, and that the most departure delays occur around 20:00 at night. Passengers who plan to use one of these airports can get a rough estimate of the departure delay of their flight and schedule their flight by referring to the graph.

Thank you!