Possibly needed libraries

library(tidyverse)
library(openintro)
library(stringr)
library(readr)
library(tidyr)
library(dplyr)
library(ggplot2)

Load in data

flights_url <- "https://raw.githubusercontent.com/RonBalaban/CUNY-SPS-R/main/Delays.csv"
flights_raw <- read.csv(flights_url, header = TRUE, stringsAsFactors = FALSE)
flights_raw
##        X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1                         NA      NA        NA            NA      NA
## 2 ALASKA on time         497     221       212           503    1841
## 3        delayed          62      12        20           102     305
## 4                         NA      NA        NA            NA      NA
## 5 AMWEST on time         694    4840       383           320     201
## 6        delayed         117     415        65           129      61

Remove both un-needed blank lines

Flights <- na.omit(flights_raw)
Flights
##        X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 2 ALASKA on time         497     221       212           503    1841
## 3        delayed          62      12        20           102     305
## 5 AMWEST on time         694    4840       383           320     201
## 6        delayed         117     415        65           129      61

Add missing column names

colnames(Flights)[1] <- "Airline"
colnames(Flights)[2] <- "Status" 
Flights <- dplyr::rename(Flights, 'Los Angeles' = Los.Angeles)
Flights <- dplyr::rename(Flights, 'Phoenix' = Phoenix)
Flights <- dplyr::rename(Flights, 'San Diego' = San.Diego)
Flights <- dplyr::rename(Flights, 'San Francisco'= San.Francisco)
Flights <- dplyr::rename(Flights, 'Seattle' = Seattle)
Flights
##   Airline  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 2  ALASKA on time         497     221       212           503    1841
## 3         delayed          62      12        20           102     305
## 5  AMWEST on time         694    4840       383           320     201
## 6         delayed         117     415        65           129      61

Add the flight origin to missing rows

Flights_clean <- tidyr::fill(Flights, Airline)
# Fill doesn't work here oddly

Flights_clean[2,1] <- Flights_clean[1,1]
# Adds Alaska to missing space
Flights_clean[4,1] <- Flights_clean[3,1]
# Adds AMWEST to missing space
Flights_clean 
##   Airline  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 2  ALASKA on time         497     221       212           503    1841
## 3  ALASKA delayed          62      12        20           102     305
## 5  AMWEST on time         694    4840       383           320     201
## 6  AMWEST delayed         117     415        65           129      61

Analysis to compare arrival delays for the two airlines

dplyr::glimpse(Flights_clean)
## Rows: 4
## Columns: 7
## $ Airline         <chr> "ALASKA", "ALASKA", "AMWEST", "AMWEST"
## $ Status          <chr> "on time", "delayed", "on time", "delayed"
## $ `Los Angeles`   <int> 497, 62, 694, 117
## $ Phoenix         <int> 221, 12, 4840, 415
## $ `San Diego`     <int> 212, 20, 383, 65
## $ `San Francisco` <int> 503, 102, 320, 129
## $ Seattle         <int> 1841, 305, 201, 61
# Now data is formatted properly, can analyze
# ------------------------------------------------------------------------------
Alaska_ontime_avg <- rowMeans(Flights_clean[1,3:length(Flights_clean)])
Alaska_ontime_avg # 654.8
##     2 
## 654.8
Alaska_delayed_avg <- rowMeans(Flights_clean[2,3:length(Flights_clean)])
Alaska_delayed_avg # 100.2
##     3 
## 100.2
# ------------------------------------------------------------------------------
Amwest_ontime_avg <- rowMeans(Flights_clean[3,3:length(Flights_clean)])
Amwest_ontime_avg # 1287.6 
##      5 
## 1287.6
Amwest_delayed_avg <- rowMeans(Flights_clean[4,3:length(Flights_clean)])
Amwest_delayed_avg # 157.4
##     6 
## 157.4

On average, there are more almost twice as many flights that are on-time with Amwest Airlines, but 50% more average delays as well.

dplyr::glimpse(Flights_clean)
## Rows: 4
## Columns: 7
## $ Airline         <chr> "ALASKA", "ALASKA", "AMWEST", "AMWEST"
## $ Status          <chr> "on time", "delayed", "on time", "delayed"
## $ `Los Angeles`   <int> 497, 62, 694, 117
## $ Phoenix         <int> 221, 12, 4840, 415
## $ `San Diego`     <int> 212, 20, 383, 65
## $ `San Francisco` <int> 503, 102, 320, 129
## $ Seattle         <int> 1841, 305, 201, 61
# ------------------------------------------------------------------------------
Alaska_ontime_total <- rowSums(Flights_clean[1,3:length(Flights_clean)])
Alaska_ontime_total # 3274
##    2 
## 3274
Alaska_delayed_total <- rowSums(Flights_clean[2,3:length(Flights_clean)])
Alaska_delayed_total # 501 
##   3 
## 501
# ------------------------------------------------------------------------------
Amwest_ontime_total <- rowSums(Flights_clean[3,3:length(Flights_clean)])
Amwest_ontime_total # 6438
##    5 
## 6438
Amwest_delayed_total<- rowSums(Flights_clean[4,3:length(Flights_clean)])
Amwest_delayed_total # 787
##   6 
## 787

Thus we can see that AmWest is actually a betetr airline overall, as it had a greater ratio of on-time flights compared to delayed-flights, both in raw amounts and averages. Let’s confirm with Dplyr

Better version; Make the data wide for cleaner analysis

Flights_Wide <- Flights_clean %>% 
  # change format from wide to long to tidy up data
  pivot_longer(`Los Angeles`:`Seattle`, names_to = "Location", values_to = "Count")

Flights_Wide
## # A tibble: 20 × 4
##    Airline Status  Location      Count
##    <chr>   <chr>   <chr>         <int>
##  1 ALASKA  on time Los Angeles     497
##  2 ALASKA  on time Phoenix         221
##  3 ALASKA  on time San Diego       212
##  4 ALASKA  on time San Francisco   503
##  5 ALASKA  on time Seattle        1841
##  6 ALASKA  delayed Los Angeles      62
##  7 ALASKA  delayed Phoenix          12
##  8 ALASKA  delayed San Diego        20
##  9 ALASKA  delayed San Francisco   102
## 10 ALASKA  delayed Seattle         305
## 11 AMWEST  on time Los Angeles     694
## 12 AMWEST  on time Phoenix        4840
## 13 AMWEST  on time San Diego       383
## 14 AMWEST  on time San Francisco   320
## 15 AMWEST  on time Seattle         201
## 16 AMWEST  delayed Los Angeles     117
## 17 AMWEST  delayed Phoenix         415
## 18 AMWEST  delayed San Diego        65
## 19 AMWEST  delayed San Francisco   129
## 20 AMWEST  delayed Seattle          61

Delayed Flights

Delayed_flights <- Flights_Wide %>%
  filter(Status  == "delayed")

Delayed_flights
## # A tibble: 10 × 4
##    Airline Status  Location      Count
##    <chr>   <chr>   <chr>         <int>
##  1 ALASKA  delayed Los Angeles      62
##  2 ALASKA  delayed Phoenix          12
##  3 ALASKA  delayed San Diego        20
##  4 ALASKA  delayed San Francisco   102
##  5 ALASKA  delayed Seattle         305
##  6 AMWEST  delayed Los Angeles     117
##  7 AMWEST  delayed Phoenix         415
##  8 AMWEST  delayed San Diego        65
##  9 AMWEST  delayed San Francisco   129
## 10 AMWEST  delayed Seattle          61
# ------------------------------------------------------------------------------
# By flight location
Delays_per_location <- Delayed_flights %>% 
  group_by(Airline, Location) %>% 
    summarise(Average_Delay = mean(Count), 
              Total_Delays = sum(Count)) %>% 
        arrange(desc(Average_Delay))

Delays_per_location
## # A tibble: 10 × 4
## # Groups:   Airline [2]
##    Airline Location      Average_Delay Total_Delays
##    <chr>   <chr>                 <dbl>        <int>
##  1 AMWEST  Phoenix                 415          415
##  2 ALASKA  Seattle                 305          305
##  3 AMWEST  San Francisco           129          129
##  4 AMWEST  Los Angeles             117          117
##  5 ALASKA  San Francisco           102          102
##  6 AMWEST  San Diego                65           65
##  7 ALASKA  Los Angeles              62           62
##  8 AMWEST  Seattle                  61           61
##  9 ALASKA  San Diego                20           20
## 10 ALASKA  Phoenix                  12           12
# ------------------------------------------------------------------------------
# By airline
Delays_per_airline <- Delayed_flights %>%
  group_by(Airline) %>%
    summarise(Average_Delay = mean(Count),
              Total_Delays = sum(Count)) %>%
    arrange(desc(Total_Delays))

Delays_per_airline
## # A tibble: 2 × 3
##   Airline Average_Delay Total_Delays
##   <chr>           <dbl>        <int>
## 1 AMWEST           157.          787
## 2 ALASKA           100.          501

Total Delays by Airline

ggplot(Delays_per_airline, aes(x= Airline, y = Total_Delays, color = Airline, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Total Delays Per Airline") +
  labs(x = "Airline Carrier", y = "Total Delays") +
    geom_text(aes(label = Total_Delays), vjust = 1.5, colour = "black")

Total Delays by location by Airline

ggplot(Delays_per_location, aes(x = Location, y = Average_Delay, color = Airline, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge") +
  ggtitle("Total Delays by Location by Airline") +
  labs(x = "Airport Destinations", y = "Total Delays") +
    geom_text(aes(label = Average_Delay), colour = "black")