library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.1.2     v dplyr   1.0.6
## v tidyr   1.1.3     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
# Print per-column summary statistics for the flights data
flights %>% summary()
##       year          month             day           dep_time    sched_dep_time
##  Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
##  1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
##  Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
##  Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
##  3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
##  Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
##                                                  NA's   :8255                 
##    dep_delay          arr_time    sched_arr_time   arr_delay       
##  Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
##  1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
##  Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
##  Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
##  3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
##  Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
##  NA's   :8255      NA's   :8713                  NA's   :9430      
##    carrier              flight       tailnum             origin         
##  Length:336776      Min.   :   1   Length:336776      Length:336776     
##  Class :character   1st Qu.: 553   Class :character   Class :character  
##  Mode  :character   Median :1496   Mode  :character   Mode  :character  
##                     Mean   :1972                                        
##                     3rd Qu.:3465                                        
##                     Max.   :8500                                        
##                                                                         
##      dest              air_time        distance         hour      
##  Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
##  Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
##  Mode  :character   Median :129.0   Median : 872   Median :13.00  
##                     Mean   :150.7   Mean   :1040   Mean   :13.18  
##                     3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
##                     Max.   :695.0   Max.   :4983   Max.   :23.00  
##                     NA's   :9430                                  
##      minute        time_hour                  
##  Min.   : 0.00   Min.   :2013-01-01 05:00:00  
##  1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00  
##  Median :29.00   Median :2013-07-03 10:00:00  
##  Mean   :26.23   Mean   :2013-07-03 05:22:54  
##  3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00  
##  Max.   :59.00   Max.   :2013-12-31 23:00:00  
## 
#view(flights)

Changing the numerical months to their corresponding names

Although not all months need to be changed, since only certain ones are being observed, all the month values were converted anyway

# Replace month numbers (1-12) with their English names by indexing base R's
# built-in `month.name` vector with the month number. This converts the whole
# column in one vectorised step instead of twelve assignments that depend on
# implicit integer-to-character coercion.
# The guard keeps the step safe to re-run: once the column holds names it is
# no longer numeric, and indexing `month.name` by those strings would give NA.
if (is.numeric(flights$month)) {
  flights$month <- month.name[flights$month]
}

Filtering the entire dataset for just the desired data

Since the goal is to examine the flight delay differences by season, only 4 months will be observed. And since a positive “dep_delay” value indicates a delayed flight, only flights that had dep_delay > 0 will be used.

# Keep only flights that departed late (dep_delay > 0) in the four months
# chosen to represent the seasons. Conditions passed as separate arguments to
# filter() are combined with AND; rows where a condition evaluates to NA
# (e.g. a missing dep_delay) are dropped.
defining_delays1 <- flights %>%
  filter(
    month %in% c("January", "April", "August", "October"),
    dep_delay > 0
  )
defining_delays1
## # A tibble: 40,640 x 19
##     year month     day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <chr>   <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013 January     1      517            515         2      830            819
##  2  2013 January     1      533            529         4      850            830
##  3  2013 January     1      542            540         2      923            850
##  4  2013 January     1      601            600         1      844            850
##  5  2013 January     1      608            600         8      807            735
##  6  2013 January     1      611            600        11      945            931
##  7  2013 January     1      613            610         3      925            921
##  8  2013 January     1      623            610        13      920            915
##  9  2013 January     1      632            608        24      740            728
## 10  2013 January     1      644            636         8      931            940
## # ... with 40,630 more rows, and 11 more variables: arr_delay <dbl>,
## #   carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## #   air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>

Creating a new dataframe with only the desired data

Once the desired data is filtered, a dataframe containing only those variables will need to be created. The data will be grouped by month and carrier, and the count will indicate the number of delays for each month and carrier combination.

# Group the delayed flights by month and carrier so they can be counted
# per combination.
by_month <- defining_delays1 %>%
  group_by(month, carrier)

# One row per month/carrier pair; `count` is the number of delayed flights.
# `.groups = "drop_last"` spells out what summarise() does by default here
# (the result stays grouped by month) and avoids the informational message
# about grouped output.
delay <- summarize(by_month, count = n(), .groups = "drop_last")
delay
## # A tibble: 62 x 3
## # Groups:   month [4]
##    month carrier count
##    <chr> <chr>   <int>
##  1 April 9E        471
##  2 April AA        786
##  3 April AS         21
##  4 April B6       1889
##  5 April DL       1165
##  6 April EV       2007
##  7 April F9         18
##  8 April FL        138
##  9 April HA          4
## 10 April MQ        699
## # ... with 52 more rows

Since the months are currently arranged in alphabetical order, this will need to be changed to follow calendar order

# Turn month into a factor whose levels follow the calendar, so that output
# ordered by level (such as the plot axis) is no longer alphabetical.
delay$month <- factor(
  delay$month,
  levels = c("January", "April", "August", "October")
)

Creating the visualization

# Dodged bar chart of delayed-departure counts: one cluster of bars per month,
# one bar per carrier. geom_col() takes bar heights directly from `count`
# (the equivalent of geom_bar(stat = "identity")).
plot1 <- delay %>%
  ggplot() +
  geom_col(
    aes(x = month, y = count, fill = carrier),
    position = "dodge"
  ) +
  labs(
    title = "Number of Delays by Carrier for Selected Months in 2013",
    x = "Months",
    y = "Number of Delays",
    fill = "Flight Carrier"
  ) +
  # Center the title horizontally above the panel
  theme(plot.title = element_text(hjust = 0.50))
plot1