Load libraries

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(data.table)
## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose

Read created cvs from github

url <- "https://raw.githubusercontent.com/folushoa/Data-Science/Data-607/Assignment%205/Assignment%205.csv"
airline_data <- fread(url, na.strings = "")
airline_data
##    Airline  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1:  Alaska On Time         497     221       212           503    1841
## 2:    <NA> Delayed          62      12        20           102     305
## 3: AM West On Time         694    4840       383           320     201
## 4:    <NA> Delayed         117     415        65           129      61

Tidy data

airline_data <- airline_data %>% 
  fill(Airline, .direction = "down")
airline_data
##    Airline  Status Los Angeles Phoenix San Diego San Francisco Seattle
## 1:  Alaska On Time         497     221       212           503    1841
## 2:  Alaska Delayed          62      12        20           102     305
## 3: AM West On Time         694    4840       383           320     201
## 4: AM West Delayed         117     415        65           129      61
airline_data <- airline_data %>%
  pivot_longer(
    cols = !c(Airline, Status),
    names_to = "City",
    values_to = "Count"
  )
airline_data
## # A tibble: 20 × 4
##    Airline Status  City          Count
##    <chr>   <chr>   <chr>         <int>
##  1 Alaska  On Time Los Angeles     497
##  2 Alaska  On Time Phoenix         221
##  3 Alaska  On Time San Diego       212
##  4 Alaska  On Time San Francisco   503
##  5 Alaska  On Time Seattle        1841
##  6 Alaska  Delayed Los Angeles      62
##  7 Alaska  Delayed Phoenix          12
##  8 Alaska  Delayed San Diego        20
##  9 Alaska  Delayed San Francisco   102
## 10 Alaska  Delayed Seattle         305
## 11 AM West On Time Los Angeles     694
## 12 AM West On Time Phoenix        4840
## 13 AM West On Time San Diego       383
## 14 AM West On Time San Francisco   320
## 15 AM West On Time Seattle         201
## 16 AM West Delayed Los Angeles     117
## 17 AM West Delayed Phoenix         415
## 18 AM West Delayed San Diego        65
## 19 AM West Delayed San Francisco   129
## 20 AM West Delayed Seattle          61

Analyze data

Compare arrival delays of the two airlines

airline_data_delay <- airline_data %>% 
  filter(Status == "Delayed") %>% 
  group_by(Airline) %>% 
  summarise(
    "Total Count" = sum(Count),
  ) %>% 
  mutate(
    "Proportion" = signif(`Total Count`/sum(`Total Count`), digits = 4)
  )
airline_data_delay
## # A tibble: 2 × 3
##   Airline `Total Count` Proportion
##   <chr>           <int>      <dbl>
## 1 AM West           787      0.611
## 2 Alaska            501      0.389
airline_data_delay %>%
  ggplot(aes(x = `Airline`, y = `Proportion`)) +
  geom_bar(stat = "identity") 

Conclusion: From the table and the plot we see that Alaska airline had less delays than AM West, 0.389 to 0.611

Compare arrival delays by city

airline_data_delay <- airline_data %>% 
  filter(Status == "Delayed") %>% 
  group_by(Airline) %>% 
  mutate(
    "Proportion" = signif(`Count`/sum(`Count`), digits = 3)
  )
airline_data_delay
## # A tibble: 10 × 5
## # Groups:   Airline [2]
##    Airline Status  City          Count Proportion
##    <chr>   <chr>   <chr>         <int>      <dbl>
##  1 Alaska  Delayed Los Angeles      62     0.124 
##  2 Alaska  Delayed Phoenix          12     0.024 
##  3 Alaska  Delayed San Diego        20     0.0399
##  4 Alaska  Delayed San Francisco   102     0.204 
##  5 Alaska  Delayed Seattle         305     0.609 
##  6 AM West Delayed Los Angeles     117     0.149 
##  7 AM West Delayed Phoenix         415     0.527 
##  8 AM West Delayed San Diego        65     0.0826
##  9 AM West Delayed San Francisco   129     0.164 
## 10 AM West Delayed Seattle          61     0.0775
airline_data_delay %>%
  ggplot(aes(x = `Airline`, y = `Proportion`)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(~City)

Conclusion: From the plots by City, AM West arrives late more times than Alaska airline. So I would conclude that it is better to travel by AM West than Alaska.