NYC Flights Homework

Author

Daniel B

Load Libraries and Dataset

Load libraries

library(tidyverse)
library(nycflights13)

Load the dataset into your global environment

Load in flights dataset. Two more datasets are added in case they need to be joined for labeling.

# Copy the three nycflights13 tables used below into the global environment
# in a single call: flights is analysed, airlines/airports are kept for labels.
data(list = c("flights", "airlines", "airports"))

View data

flights |> head()
# A tibble: 6 × 19
   year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
  <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
1  2013     1     1      517            515         2      830            819
2  2013     1     1      533            529         4      850            830
3  2013     1     1      542            540         2      923            850
4  2013     1     1      544            545        -1     1004           1022
5  2013     1     1      554            600        -6      812            837
6  2013     1     1      554            558        -4      740            728
# ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
#   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
#   hour <dbl>, minute <dbl>, time_hour <dttm>

Calculate Summary Statistics

Individual stats I wanted to pick out

# Departure delay: average, smallest and largest value (missing values ignored)
mean(flights[["dep_delay"]], na.rm = TRUE)
[1] 12.63907
min(flights[["dep_delay"]], na.rm = TRUE)
[1] -43
max(flights[["dep_delay"]], na.rm = TRUE)
[1] 1301
# Departure time: smallest and largest recorded value. dep_time is an integer
# clock reading (e.g. 517 in the preview above), presumably HHMM -- confirm
# against the nycflights13 documentation.
min(flights[["dep_time"]], na.rm = TRUE)
[1] 1
max(flights[["dep_time"]], na.rm = TRUE)
[1] 2400
# Number of distinct carrier codes in the dataset
n_distinct(flights[["carrier"]])
[1] 16

Summarize

flights |> summary()
      year          month             day           dep_time    sched_dep_time
 Min.   :2013   Min.   : 1.000   Min.   : 1.00   Min.   :   1   Min.   : 106  
 1st Qu.:2013   1st Qu.: 4.000   1st Qu.: 8.00   1st Qu.: 907   1st Qu.: 906  
 Median :2013   Median : 7.000   Median :16.00   Median :1401   Median :1359  
 Mean   :2013   Mean   : 6.549   Mean   :15.71   Mean   :1349   Mean   :1344  
 3rd Qu.:2013   3rd Qu.:10.000   3rd Qu.:23.00   3rd Qu.:1744   3rd Qu.:1729  
 Max.   :2013   Max.   :12.000   Max.   :31.00   Max.   :2400   Max.   :2359  
                                                 NA's   :8255                 
   dep_delay          arr_time    sched_arr_time   arr_delay       
 Min.   : -43.00   Min.   :   1   Min.   :   1   Min.   : -86.000  
 1st Qu.:  -5.00   1st Qu.:1104   1st Qu.:1124   1st Qu.: -17.000  
 Median :  -2.00   Median :1535   Median :1556   Median :  -5.000  
 Mean   :  12.64   Mean   :1502   Mean   :1536   Mean   :   6.895  
 3rd Qu.:  11.00   3rd Qu.:1940   3rd Qu.:1945   3rd Qu.:  14.000  
 Max.   :1301.00   Max.   :2400   Max.   :2359   Max.   :1272.000  
 NA's   :8255      NA's   :8713                  NA's   :9430      
   carrier              flight       tailnum             origin         
 Length:336776      Min.   :   1   Length:336776      Length:336776     
 Class :character   1st Qu.: 553   Class :character   Class :character  
 Mode  :character   Median :1496   Mode  :character   Mode  :character  
                    Mean   :1972                                        
                    3rd Qu.:3465                                        
                    Max.   :8500                                        
                                                                        
     dest              air_time        distance         hour      
 Length:336776      Min.   : 20.0   Min.   :  17   Min.   : 1.00  
 Class :character   1st Qu.: 82.0   1st Qu.: 502   1st Qu.: 9.00  
 Mode  :character   Median :129.0   Median : 872   Median :13.00  
                    Mean   :150.7   Mean   :1040   Mean   :13.18  
                    3rd Qu.:192.0   3rd Qu.:1389   3rd Qu.:17.00  
                    Max.   :695.0   Max.   :4983   Max.   :23.00  
                    NA's   :9430                                  
     minute        time_hour                     
 Min.   : 0.00   Min.   :2013-01-01 05:00:00.00  
 1st Qu.: 8.00   1st Qu.:2013-04-04 13:00:00.00  
 Median :29.00   Median :2013-07-03 10:00:00.00  
 Mean   :26.23   Mean   :2013-07-03 05:22:54.64  
 3rd Qu.:44.00   3rd Qu.:2013-10-01 07:00:00.00  
 Max.   :59.00   Max.   :2013-12-31 23:00:00.00  
                                                 

Graph On-Time Performance using Departure Delay and Arrival Delay

I work with a team that develops software for the transit industry, namely the taxi, sedan, and paratransit markets. Much of the data we collect for reporting is used to analyze key performance indicators (KPIs), and the one that agencies look at the most is “On-Time Performance”, which is usually defined as arriving at the origin location within 15 minutes of the requested/scheduled pickup time. After googling to determine whether the airline industry uses the same metric, it turned out they do. So I worked on making a bidirectional bar graph that shows both the departure and the arrival on-time percentage for each carrier (16 in total). I tried exporting the dataset to make sure the formula was correct, but it was taking too long, so I’m not sure if it’s correct. I’m more familiar with SQL, but I guess it’s a learning process.

# Calculate each carrier's on-time performance (OTP-Punctuality): the
# percentage of its flights that departed / arrived less than 15 minutes late.
#
# The comparison is strict (`< 15`) so the numbers agree with the chart
# caption "(% of Flights < 15 Minutes Delay)"; a delay of exactly 15 minutes
# is not on time.
#
# Flights with a missing delay are dropped from the numerator by
# `na.rm = TRUE` but are still counted by `n()`, so they lower the percentage
# (they are treated as not on time).
delay_punctuality <- flights |>
  group_by(carrier) |>
  summarize(
    Departure_Percentage = sum(dep_delay < 15, na.rm = TRUE) / n() * 100,
    Arrival_Percentage = sum(arr_delay < 15, na.rm = TRUE) / n() * 100
  )

# Join the delay_punctuality dataset with the airlines dataset so each carrier
# code gets its full name for the axis labels.
delay_punctuality_labels <- left_join(delay_punctuality, airlines, by = "carrier")

# Strip the corporate suffixes "Inc." and "Co." from the names. Removing the
# suffix leaves behind the space that preceded it, so trim the result;
# otherwise some labels carry a trailing blank and sit misaligned on the axis.
delay_punctuality_labels$name <- trimws(
  gsub("Inc\\.|Co\\.", "", delay_punctuality_labels$name)
)

# Create a bidirectional horizontal bar chart: departure percentages are
# plotted as negative x values (bars grow to the left of zero), arrival
# percentages as positive x values (bars grow to the right).
delay_punctuality_labels |>
  ggplot(
    aes(
      x = -Departure_Percentage,
      y = reorder(name, Departure_Percentage)  # carriers ordered by departure %
    )
  ) +
  # Departure side: percentage label placed just beyond the bar's left end
  geom_text(
    aes(label = paste0(round(Departure_Percentage), "%")),
    hjust = 1.1,
    size = 4
  ) +
  geom_col(aes(fill = "Departure_Percentage"), width = 0.75) +
  # Arrival side: bar first, then its label just beyond the right end
  geom_col(
    aes(x = Arrival_Percentage, fill = "Arrival_Percentage"),
    width = 0.75
  ) +
  geom_text(
    aes(
      x = Arrival_Percentage,
      label = paste0(round(Arrival_Percentage), "%")
    ),
    hjust = -0.1,
    size = 4
  ) +
  labs(
    x = "Departures < On-Time Performance > Arrivals",
    y = "Carrier",
    title = "On-Time Performance of Airline Carriers",
    caption = "(% of Flights < 15 Minutes Delay)"
  ) +
  # The fill aesthetics above are constant strings; map them to colours and
  # readable legend entries here.
  scale_fill_manual(
    name = "Performance",
    breaks = c("Departure_Percentage", "Arrival_Percentage"),  # legend order
    values = c(
      "Departure_Percentage" = "#8bd3c7",
      "Arrival_Percentage" = "#beb9db"
    ),
    labels = c(
      "Departure_Percentage" = "Departure",
      "Arrival_Percentage" = "Arrival"
    )
  ) +
  # Show absolute values on the axis so the left-hand (negative) side reads as
  # positive percentages; limits leave room for the text labels.
  scale_x_continuous(labels = abs, limits = c(-120, 120)) +
  theme_minimal() +
  theme(
    # No grid lines in either direction
    panel.grid.major.x = element_blank(),
    panel.grid.minor.x = element_blank(),
    panel.grid.major.y = element_blank(),
    panel.grid.minor.y = element_blank(),
    axis.text = element_text(size = 12),
    # The carrier names label the y axis, so its title is hidden
    axis.title.y = element_blank(),
    axis.title.x = element_text(hjust = 0, size = 12, vjust = -0.75),
    plot.title = element_text(hjust = 0.36, size = 14),
    plot.caption = element_text(hjust = 0.5, size = 10, vjust = -0.75),
    plot.margin = margin(20, 10, 20, 10)
  )