# Load necessary packages
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(trelliscopejs)
library(openintro)
## Loading required package: airports
## Loading required package: cherryblossom
## Loading required package: usdata
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(purrr)
# Explore pairwise correlations between the departure/arrival time, delay,
# air-time, and distance columns (selected by name rather than by position)
nycflights %>%
  select(dep_time, dep_delay, arr_time, arr_delay, air_time, distance) %>%
  cor()
##              dep_time   dep_delay   arr_time   arr_delay    air_time
## dep_time   1.00000000  0.25929359 0.66391209  0.23196328 -0.02110843
## dep_delay  0.25929359  1.00000000 0.04225206  0.91606217 -0.01342772
## arr_time   0.66391209  0.04225206 1.00000000  0.03443377  0.05413819
## arr_delay  0.23196328  0.91606217 0.03443377  1.00000000 -0.02824045
## air_time  -0.02110843 -0.01342772 0.05413819 -0.02824045  1.00000000
## distance  -0.02056249 -0.01269835 0.04737048 -0.05445608  0.99072420
##              distance
## dep_time  -0.02056249
## dep_delay -0.01269835
## arr_time   0.04737048
## arr_delay -0.05445608
## air_time   0.99072420
## distance   1.00000000
# Build a single Date column out of the separate year, month, and day columns
# with lubridate's make_date()
nycflights <- mutate(
  nycflights,
  date = make_date(year, month, day)
)
# Trelliscope display of departure delay (minutes, y-axis) against date
# (x-axis), with one scatterplot panel per airline carrier and hour of the day
# (24-hour clock). Points are colored by origin airport, which adds
# information without crowding the plot.
#
# Steps:
#   1. Within each hour/carrier group, compute the upper outlier bound
#      `cutoff` = 75th percentile + 1.5 * IQR of the departure delays.
#   2. Store, as a cognostic shown on the panels, the number of flights in the
#      group whose delay exceeds that bound (i.e. the points that are hidden).
#   3. Keep only flights at or below the bound. The comparison is `<=` so that
#      it is the exact complement of the `>` used for the count; with a strict
#      `<`, any group whose IQR is 0 (a single flight, or identical delays)
#      has cutoff equal to its own delay and would vanish entirely.
#   4. Add a link cognostic built by appending the carrier abbreviation to the
#      Wikipedia base URL.
#      NOTE(review): confirm each two-letter code resolves to the intended
#      airline article rather than some other page.
#   5. Plot with free axes so each panel uses its own range without extra
#      empty space.
nycflights %>%
  group_by(hour, carrier) %>%
  mutate(
    cutoff = quantile(dep_delay, 0.75) + 1.5 * IQR(dep_delay),
    num_outliers = cog(
      val = sum(dep_delay > cutoff),
      desc = "Number of flights with departure delay above Q3 + 1.5*IQR of group",
      default_label = TRUE
    )
  ) %>%
  filter(dep_delay <= cutoff) %>%
  mutate(
    # cog_href() marks the value as a hyperlink cognostic
    carrier_wiki_page = cog_href(
      paste0("https://en.wikipedia.org/wiki/", carrier),
      desc = "Airline Carrier Info",
      default_label = TRUE
    )
  ) %>%
  # Grouping is only needed for the per-group statistics above
  ungroup() %>%
  ggplot(aes(date, dep_delay, color = origin)) +
  geom_point(alpha = 0.5) +
  facet_trelliscope(
    ~ carrier + hour,
    scales = "free",
    name = "Departure Delay in Minutes by Date, faceted by hour of the day and airline carrier",
    desc = "Random sample of 32,735 flights that departed from NYC in 2013 (Negative Departures = Early)",
    nrow = 1,
    ncol = 2,
    path = ".",
    self_contained = TRUE
  )

Description

The dataset I am using initially contains 32,735 rows, each corresponding to a flight that departed from a New York airport in 2013. The original dataset contains 16 variables, including the year, month, day, and hour of each flight, as well as the departure and arrival delays in minutes. Note that the departure and arrival delay columns contain negative values; these are valid and indicate flights that departed or arrived earlier than scheduled. Each flight has a two-letter abbreviation identifying its airline carrier, of which there are 16. The data also record how much time each flight spent in the air and the total distance flown, as well as the origin and destination of each flight.