library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.1     ✔ purrr   1.0.1
## ✔ tibble  3.1.8     ✔ dplyr   1.1.0
## ✔ tidyr   1.3.0     ✔ stringr 1.5.0
## ✔ readr   2.1.4     ✔ forcats 1.0.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

Importing Dataset

To import the CSV file, I used readr. The file is saved as Arrival_Delays_Data.

df <- read_csv("~/Documents/Arrival_Delays_Data.csv")
## New names:
## Rows: 5 Columns: 7
## ── Column specification
## ──────────────────────────────────────────────────────── Delimiter: "," chr
## (2): ...1, ...2 dbl (3): Los Angeles, San Diego, San Francisco num (2):
## Phoenix, Seattle
## ℹ Use `spec()` to retrieve the full column specification for this data. ℹ
## Specify the column types or set `show_col_types = FALSE` to quiet this message.
## • `` -> `...1`
## • `` -> `...2`
df
## # A tibble: 5 Ă— 7
##   ...1    ...2    `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 <NA>    delayed            62      12          20             102     305
## 3 <NA>    <NA>               NA      NA          NA              NA      NA
## 4 AM WEST on time           694    4840         383             320     201
## 5 <NA>    delayed           117     415          65             129      61
glimpse(df)
## Rows: 5
## Columns: 7
## $ ...1            <chr> "ALASKA", NA, NA, "AM WEST", NA
## $ ...2            <chr> "on time", "delayed", NA, "on time", "delayed"
## $ `Los Angeles`   <dbl> 497, 62, NA, 694, 117
## $ Phoenix         <dbl> 221, 12, NA, 4840, 415
## $ `San Diego`     <dbl> 212, 20, NA, 383, 65
## $ `San Francisco` <dbl> 503, 102, NA, 320, 129
## $ Seattle         <dbl> 1841, 305, NA, 201, 61

Removing NAs

I removed the row of NAs that separated Alaska and Am West Airlines:

df <- df[rowSums(is.na(df)) != ncol(df), ]
df
## # A tibble: 4 Ă— 7
##   ...1    ...2    `Los Angeles` Phoenix `San Diego` `San Francisco` Seattle
##   <chr>   <chr>           <dbl>   <dbl>       <dbl>           <dbl>   <dbl>
## 1 ALASKA  on time           497     221         212             503    1841
## 2 <NA>    delayed            62      12          20             102     305
## 3 AM WEST on time           694    4840         383             320     201
## 4 <NA>    delayed           117     415          65             129      61

Renaming Columns

I renamed the first and second columns to Airline and Arrival_Status, respectively:

colnames(df) [1] <- "Airline"
colnames(df) [2] <- "Arrival_Status"

I filled in the remaining NAs with the Airline representing delayed arrival flights:

df[c(2,4),1] <- df[c(1,3),1]
df
## # A tibble: 4 Ă— 7
##   Airline Arrival_Status `Los Angeles` Phoenix `San Diego` San Francis…¹ Seattle
##   <chr>   <chr>                  <dbl>   <dbl>       <dbl>         <dbl>   <dbl>
## 1 ALASKA  on time                  497     221         212           503    1841
## 2 ALASKA  delayed                   62      12          20           102     305
## 3 AM WEST on time                  694    4840         383           320     201
## 4 AM WEST delayed                  117     415          65           129      61
## # … with abbreviated variable name ¹​`San Francisco`

Using pivot_longer()

To change the table wide to long, I used pivot_longer(), renaming the cities as Destination and values to Total Number of Flights:

df_longer <- df %>%
  pivot_longer(cols = c(3:7), names_to = "Destination", values_to = "Total_Number_of_Flights")
df_longer
## # A tibble: 20 Ă— 4
##    Airline Arrival_Status Destination   Total_Number_of_Flights
##    <chr>   <chr>          <chr>                           <dbl>
##  1 ALASKA  on time        Los Angeles                       497
##  2 ALASKA  on time        Phoenix                           221
##  3 ALASKA  on time        San Diego                         212
##  4 ALASKA  on time        San Francisco                     503
##  5 ALASKA  on time        Seattle                          1841
##  6 ALASKA  delayed        Los Angeles                        62
##  7 ALASKA  delayed        Phoenix                            12
##  8 ALASKA  delayed        San Diego                          20
##  9 ALASKA  delayed        San Francisco                     102
## 10 ALASKA  delayed        Seattle                           305
## 11 AM WEST on time        Los Angeles                       694
## 12 AM WEST on time        Phoenix                          4840
## 13 AM WEST on time        San Diego                         383
## 14 AM WEST on time        San Francisco                     320
## 15 AM WEST on time        Seattle                           201
## 16 AM WEST delayed        Los Angeles                       117
## 17 AM WEST delayed        Phoenix                           415
## 18 AM WEST delayed        San Diego                          65
## 19 AM WEST delayed        San Francisco                     129
## 20 AM WEST delayed        Seattle                            61

I stored the arrival delays into a new variable, df_delays, with the total number of delayed flights in descending order:

df_delays <- df_longer %>%
  filter(Arrival_Status == "delayed") %>%
  group_by(Destination) %>%
  arrange(desc(Total_Number_of_Flights))
df_delays
## # A tibble: 10 Ă— 4
## # Groups:   Destination [5]
##    Airline Arrival_Status Destination   Total_Number_of_Flights
##    <chr>   <chr>          <chr>                           <dbl>
##  1 AM WEST delayed        Phoenix                           415
##  2 ALASKA  delayed        Seattle                           305
##  3 AM WEST delayed        San Francisco                     129
##  4 AM WEST delayed        Los Angeles                       117
##  5 ALASKA  delayed        San Francisco                     102
##  6 AM WEST delayed        San Diego                          65
##  7 ALASKA  delayed        Los Angeles                        62
##  8 AM WEST delayed        Seattle                            61
##  9 ALASKA  delayed        San Diego                          20
## 10 ALASKA  delayed        Phoenix                            12

Average Number of Delays for Each Airline:

mean_delays <- df_delays %>%
  group_by(Airline) %>%
  summarise(Average_Delays = round(mean(Total_Number_of_Flights)), Number_of_Flights = n()) %>%
  arrange(desc(Average_Delays))
mean_delays
## # A tibble: 2 Ă— 3
##   Airline Average_Delays Number_of_Flights
##   <chr>            <dbl>             <int>
## 1 AM WEST            157                 5
## 2 ALASKA             100                 5

Using pivot_wider()

I used pivot_wider in order to separate the arrival ontime and delay data. I stored the results in a new variable, dfdelays_ontime:

dfdelays_ontime <- df_longer %>%
  pivot_wider(names_from = Arrival_Status, values_from = Total_Number_of_Flights)
dfdelays_ontime
## # A tibble: 10 Ă— 4
##    Airline Destination   `on time` delayed
##    <chr>   <chr>             <dbl>   <dbl>
##  1 ALASKA  Los Angeles         497      62
##  2 ALASKA  Phoenix             221      12
##  3 ALASKA  San Diego           212      20
##  4 ALASKA  San Francisco       503     102
##  5 ALASKA  Seattle            1841     305
##  6 AM WEST Los Angeles         694     117
##  7 AM WEST Phoenix            4840     415
##  8 AM WEST San Diego           383      65
##  9 AM WEST San Francisco       320     129
## 10 AM WEST Seattle             201      61

Renaming Column Name:

colnames(dfdelays_ontime) [3] = "on_time"

Analyzing Flight Percentages

I transformed the arrival on-time and delay data into percentages for each city, using kable to format the newly created table:

flight_percentages <- dfdelays_ontime %>% 
  mutate(total = delayed + on_time,
         On_Time_Percentage = round(on_time/total*100),
         Delayed_Percentage = round(delayed/total*100)) %>%
  arrange(desc(Delayed_Percentage))

knitr::kable(flight_percentages, "pipe", align=c("l", "c", "c"))
Airline Destination on_time delayed total On_Time_Percentage Delayed_Percentage
AM WEST San Francisco 320 129 449 71 29
AM WEST Seattle 201 61 262 77 23
ALASKA San Francisco 503 102 605 83 17
AM WEST San Diego 383 65 448 85 15
ALASKA Seattle 1841 305 2146 86 14
AM WEST Los Angeles 694 117 811 86 14
ALASKA Los Angeles 497 62 559 89 11
ALASKA San Diego 212 20 232 91 9
AM WEST Phoenix 4840 415 5255 92 8
ALASKA Phoenix 221 12 233 95 5

Arrival Ontime Percentages

Looking at ontime and delay percentages separately:

Arrival_Ontime <- flight_percentages %>%
  select(Airline, Destination, On_Time_Percentage) %>%
  arrange(desc(On_Time_Percentage))
knitr::kable(Arrival_Ontime, "pipe", align=c("l", "c", "c"))
Airline Destination On_Time_Percentage
ALASKA Phoenix 95
AM WEST Phoenix 92
ALASKA San Diego 91
ALASKA Los Angeles 89
ALASKA Seattle 86
AM WEST Los Angeles 86
AM WEST San Diego 85
ALASKA San Francisco 83
AM WEST Seattle 77
AM WEST San Francisco 71

Barplot of Arrival Ontime Percentages for each airline and city

ggplot(Arrival_Ontime, aes(x=Destination, y=On_Time_Percentage, fill=Airline)) +
    geom_bar(stat='identity', position='dodge', width=0.7) +
  theme_minimal()

Arrival Delay Percentages

Arrival_Delayed <- flight_percentages %>%
  select(Airline, Destination, Delayed_Percentage) %>%
  arrange(desc(Delayed_Percentage))
knitr::kable(Arrival_Delayed, "pipe", align=c("l", "c", "c"))
Airline Destination Delayed_Percentage
AM WEST San Francisco 29
AM WEST Seattle 23
ALASKA San Francisco 17
AM WEST San Diego 15
ALASKA Seattle 14
AM WEST Los Angeles 14
ALASKA Los Angeles 11
ALASKA San Diego 9
AM WEST Phoenix 8
ALASKA Phoenix 5

Barplot of Arrival Delay Percentages

ggplot(Arrival_Delayed, aes(x=Destination, y=Delayed_Percentage, fill=Airline)) +
    geom_bar(stat='identity', position='dodge') +
  theme_minimal()

Further Analysis of Arrival Ontime and Delay Data

I looked at the arrival ontime and delay data summary of each Airline, creating a boxplot that compares each Airline:

Alaska Airlines Delay Summary

Arrival_Delayed %>% 
  filter(Airline == "ALASKA") %>% 
  select(Delayed_Percentage) %>% 
  summary()
##  Delayed_Percentage
##  Min.   : 5.0      
##  1st Qu.: 9.0      
##  Median :11.0      
##  Mean   :11.2      
##  3rd Qu.:14.0      
##  Max.   :17.0

AM West Airlines Delay Summary

Arrival_Delayed %>% 
  filter(Airline == "AM WEST") %>% 
  select(Delayed_Percentage) %>% 
  summary()
##  Delayed_Percentage
##  Min.   : 8.0      
##  1st Qu.:14.0      
##  Median :15.0      
##  Mean   :17.8      
##  3rd Qu.:23.0      
##  Max.   :29.0

Arrival Delay Boxplot

ggplot(Arrival_Delayed, aes(Airline, Delayed_Percentage)) + 
  geom_boxplot()

Alaska Airlines Ontime Summary

Arrival_Ontime %>% 
  filter(Airline == "ALASKA") %>% 
  select(On_Time_Percentage) %>% 
  summary()
##  On_Time_Percentage
##  Min.   :83.0      
##  1st Qu.:86.0      
##  Median :89.0      
##  Mean   :88.8      
##  3rd Qu.:91.0      
##  Max.   :95.0

Am West Ontime Summary

Arrival_Ontime %>% 
  filter(Airline == "AM WEST") %>% 
  select(On_Time_Percentage) %>% 
  summary()
##  On_Time_Percentage
##  Min.   :71.0      
##  1st Qu.:77.0      
##  Median :85.0      
##  Mean   :82.2      
##  3rd Qu.:86.0      
##  Max.   :92.0

Arrival Ontime Boxplot

ggplot(Arrival_Ontime, aes(Airline, On_Time_Percentage)) + 
  geom_boxplot()

Conclusion

When comparing the arrival on-time and delay performances between Alaska and Am West Airlines, Alaska performed better than Am West overall. Alaska has fewer delays than Am West, with an average of 11% compared to Am West which has an average of 17%. When comparing the on-time and delay performance of each city, each Airline’s highest average of delays and lowest arrival on-time performance occurred in San Francisco, which could indicate that its airport terminal may have a high frequency of plane traffic coming in and out of the city. Conversely, the highest average of ontime performance and lowest average of delays each airline occurred in Phoenix, which could indicate that there is less plane traffic in that city.