library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.1.2 v dplyr 1.0.6
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(nycflights13)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
# Print per-column summary statistics for the flights data before any changes are made to it.
summary(flights)
## year month day dep_time sched_dep_time
## Min. :2013 Min. : 1.000 Min. : 1.00 Min. : 1 Min. : 106
## 1st Qu.:2013 1st Qu.: 4.000 1st Qu.: 8.00 1st Qu.: 907 1st Qu.: 906
## Median :2013 Median : 7.000 Median :16.00 Median :1401 Median :1359
## Mean :2013 Mean : 6.549 Mean :15.71 Mean :1349 Mean :1344
## 3rd Qu.:2013 3rd Qu.:10.000 3rd Qu.:23.00 3rd Qu.:1744 3rd Qu.:1729
## Max. :2013 Max. :12.000 Max. :31.00 Max. :2400 Max. :2359
## NA's :8255
## dep_delay arr_time sched_arr_time arr_delay
## Min. : -43.00 Min. : 1 Min. : 1 Min. : -86.000
## 1st Qu.: -5.00 1st Qu.:1104 1st Qu.:1124 1st Qu.: -17.000
## Median : -2.00 Median :1535 Median :1556 Median : -5.000
## Mean : 12.64 Mean :1502 Mean :1536 Mean : 6.895
## 3rd Qu.: 11.00 3rd Qu.:1940 3rd Qu.:1945 3rd Qu.: 14.000
## Max. :1301.00 Max. :2400 Max. :2359 Max. :1272.000
## NA's :8255 NA's :8713 NA's :9430
## carrier flight tailnum origin
## Length:336776 Min. : 1 Length:336776 Length:336776
## Class :character 1st Qu.: 553 Class :character Class :character
## Mode :character Median :1496 Mode :character Mode :character
## Mean :1972
## 3rd Qu.:3465
## Max. :8500
##
## dest air_time distance hour
## Length:336776 Min. : 20.0 Min. : 17 Min. : 1.00
## Class :character 1st Qu.: 82.0 1st Qu.: 502 1st Qu.: 9.00
## Mode :character Median :129.0 Median : 872 Median :13.00
## Mean :150.7 Mean :1040 Mean :13.18
## 3rd Qu.:192.0 3rd Qu.:1389 3rd Qu.:17.00
## Max. :695.0 Max. :4983 Max. :23.00
## NA's :9430
## minute time_hour
## Min. : 0.00 Min. :2013-01-01 05:00:00
## 1st Qu.: 8.00 1st Qu.:2013-04-04 13:00:00
## Median :29.00 Median :2013-07-03 10:00:00
## Mean :26.23 Mean :2013-07-03 05:22:54
## 3rd Qu.:44.00 3rd Qu.:2013-10-01 07:00:00
## Max. :59.00 Max. :2013-12-31 23:00:00
##
#view(flights)
Changing the numerical months to their corresponding names
Although not every month needs to be changed, since only certain ones are being observed, all of the month values were converted anyway.
# Replace the numeric month codes (1-12) with the corresponding English month
# names via base R's built-in `month.name` lookup vector, in one vectorized
# step. This avoids twelve separate assignments that only work because the
# first one silently coerces the whole column from integer to character.
# The is.numeric() guard keeps the step safe to re-run: once the column holds
# names, indexing `month.name` with it again would yield only NA.
if (is.numeric(flights$month)) {
  flights$month <- month.name[flights$month]
}
Filtering the entire dataset for just the desired data
Since the goal is to examine the flight delay differences by season, only four months (one per season: January, April, August, and October) will be observed. Because a positive “dep_delay” value indicates a delayed flight, only flights that had dep_delay > 0 will be used.
# Keep only flights that left late (positive dep_delay) in the four months
# under study; rows with a missing dep_delay are dropped by filter().
defining_delays1 <- flights %>%
  filter(
    month %in% c("January", "April", "August", "October"),
    dep_delay > 0
  )
defining_delays1
## # A tibble: 40,640 x 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <chr> <int> <int> <int> <dbl> <int> <int>
## 1 2013 January 1 517 515 2 830 819
## 2 2013 January 1 533 529 4 850 830
## 3 2013 January 1 542 540 2 923 850
## 4 2013 January 1 601 600 1 844 850
## 5 2013 January 1 608 600 8 807 735
## 6 2013 January 1 611 600 11 945 931
## 7 2013 January 1 613 610 3 925 921
## 8 2013 January 1 623 610 13 920 915
## 9 2013 January 1 632 608 24 740 728
## 10 2013 January 1 644 636 8 931 940
## # ... with 40,630 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
Creating a new dataframe with only the desired data
Once the desired data is filtered, a dataframe containing only the variables of interest needs to be created. The data will be grouped by month and carrier, and the count will give the number of delayed flights for each month/carrier combination.
# Tally delayed flights for every month/carrier pair. The grouped table is
# kept in `by_month`; `delay` holds one row per pair with its flight count.
by_month <- group_by(defining_delays1, month, carrier)
delay <- by_month %>%
  summarize(count = n())
## `summarise()` has grouped output by 'month'. You can override using the `.groups` argument.
delay
## # A tibble: 62 x 3
## # Groups: month [4]
## month carrier count
## <chr> <chr> <int>
## 1 April 9E 471
## 2 April AA 786
## 3 April AS 21
## 4 April B6 1889
## 5 April DL 1165
## 6 April EV 2007
## 7 April F9 18
## 8 April FL 138
## 9 April HA 4
## 10 April MQ 699
## # ... with 52 more rows
Since the months are currently arranged in alphabetical order, the factor levels need to be set explicitly so that they follow calendar order.
# Turn month into a factor whose levels follow calendar order, so that
# anything ordering by factor level uses that order instead of alphabetical.
delay[["month"]] <- factor(
  delay[["month"]],
  levels = c("January", "April", "August", "October")
)
Creating the visualization
# Dodged bar chart: one cluster of bars per month, one bar per carrier, with
# bar height equal to the number of delayed departures.
plot1 <- ggplot(data = delay) +
  geom_col(
    mapping = aes(x = month, y = count, fill = carrier),
    position = "dodge"
  ) +
  labs(
    title = "Number of Delays by Carrier for Selected Months in 2013",
    x = "Months",
    y = "Number of Delays",
    fill = "Flight Carrier"
  ) +
  # Center the title above the panel.
  theme(plot.title = element_text(hjust = 0.50))
plot1
