Wings to Fly

library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ forcats   1.0.0     ✔ readr     2.1.4
✔ ggplot2   3.4.3     ✔ stringr   1.5.0
✔ lubridate 1.9.2     ✔ tibble    3.2.1
✔ purrr     1.0.1     ✔ tidyr     1.3.0
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(nycflights13)
library(pander)
library(plotly)

Attaching package: 'plotly'

The following object is masked from 'package:ggplot2':

    last_plot

The following object is masked from 'package:stats':

    filter

The following object is masked from 'package:graphics':

    layout
# Add a 0/1 `winter` indicator to every flight. Months 12, 1, 2 and 3
# (December through March) are the ones flagged as winter.
flights1 <- mutate(
  flights,
  winter = as.numeric(month %in% c(12, 1, 2, 3))
)

Raw Data Visualization

# Scatter plot of distance against air time for every flight, with points
# coloured by the winter flag (blue = 0, red = 1).
flights1 %>%
  ggplot(mapping = aes(x = air_time, y = distance, colour = factor(winter))) +
  geom_point() +
  scale_colour_manual(values = c("0" = "blue", "1" = "red")) +
  labs(colour = "Winter") + # Legend title for the colour scale
  theme_minimal()

Manager Questions

1) For each origin airport (JFK, EWR, LGA), which airline has the lowest 75th percentile of departure delay for flights scheduled to leave earlier than noon?

# Question 1: for each origin airport, find the carrier(s) with the lowest
# 75th-percentile departure delay among flights scheduled before noon.

# Keep only flights whose scheduled departure hour is earlier than 12.
noon <- flights %>%
  filter(hour < 12)

percentile <- noon %>%
  group_by(origin, carrier) %>%
  summarise(
    lower_departure_delay = quantile(dep_delay, 0.75, na.rm = TRUE),
    # Stay grouped by origin only, so the minimum below is taken within each
    # airport rather than across the whole table.
    .groups = "drop_last"
  ) %>%
  # Ties are kept, so an origin can list more than one carrier.
  slice_min(lower_departure_delay) %>%
  # Return a plain (ungrouped) table so later steps are not silently grouped.
  ungroup()

pander(percentile)
origin carrier lower_departure_delay
EWR 9E -2
EWR US -2
JFK DL -1
JFK HA -1
LGA US -3

2) Which origin airport is best to minimize my chances of a late arrival when I am using Delta Airlines?

# Question 2: compare the three origin airports for Delta (carrier "DL")
# flights. The question is about arriving late, so the summary reports
# arrival-based measures; the departure-delay mean is kept alongside them.
# NOTE: any previously rendered table for this chunk must be re-rendered.
best_airport <- flights %>%
  filter(carrier == "DL") %>%
  group_by(origin) %>%
  summarise(
    Mean_Departure_Delay = mean(dep_delay, na.rm = TRUE),
    Mean_Arrival_Delay = mean(arr_delay, na.rm = TRUE),
    # Share of flights with a recorded arrival delay that arrived late
    # (arr_delay > 0); flights with a missing arr_delay are excluded.
    Prop_Late_Arrival = mean(arr_delay > 0, na.rm = TRUE)
  ) %>%
  # Best origin (lowest chance of a late arrival) first.
  arrange(Prop_Late_Arrival)

pander(best_airport)
origin Mean_Departure_Delay
EWR 12.08
JFK 8.333
LGA 9.573
# Distribution of departure delays for Delta (carrier "DL") flights, one box
# per origin airport.
data1 <- flights %>%
  filter(carrier == "DL")

# The y-axis is limited to -20..30, so delays outside that window are not
# drawn.
boxplot(dep_delay ~ origin, data = data1,
        ylim = c(-20, 30),
        # The plot shows the spread of departure delays (a box plot draws
        # medians and quartiles, not means), so the title says exactly that.
        main = "Delta Departure Delays by Origin",
        xlab = "Place of Origin",
        ylab = "Departure Delay",
        col = c("lightblue", "lightgreen", "lightpink"))

3) Which destination airport is the worst airport for arrival delays? You decide on the metric for “worst.”

# Question 3: rank destination airports by arrival delay across all carriers.
# "Worst" is defined here as the highest mean arrival delay. The flight count
# is reported too, since a mean over very few flights is unreliable.
# NOTE: any previously rendered table for this chunk must be re-rendered.
worst_airport <- flights %>%
  # Drop flights with no recorded arrival delay so every mean is defined.
  filter(!is.na(arr_delay)) %>%
  group_by(dest) %>%
  summarise(
    Mean_Arrival_Delay = mean(arr_delay),
    Flights = n()
  ) %>%
  # Worst destination first.
  arrange(desc(Mean_Arrival_Delay))

# Show only the ten worst destinations to keep the table readable.
pander(head(worst_airport, 10))
origin Mean_Arrival_Delay
EWR 8.78
JFK -2.379
LGA 3.928
# Distribution of arrival delays for Delta (carrier "DL") flights, one box per
# origin airport.
# NOTE(review): the section heading asks about destination airports, while
# this plot groups Delta flights by origin; confirm which view is intended.
data2 <- flights %>%
  filter(carrier == "DL")

# The y-axis is limited to -80..60, so delays outside that window are not
# drawn.
boxplot(arr_delay ~ origin, data = data2,
        ylim = c(-80, 60),
        # A box plot draws medians and quartiles, not means, so the title
        # describes the data actually shown.
        main = "Delta Arrival Delays by Origin",
        xlab = "Place of Origin",
        ylab = "Arrival Delay",
        col = c("lightblue", "lightgreen", "lightpink"))