1. Create a .CSV file (or optionally, a MySQL database!) that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.

  2. Read the information from your .CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.

# Tidyverse packages includes ggplot2, dplyr, tidyr already
library(tidyverse)

table <- read.csv("https://raw.githubusercontent.com/tonyCUNY/tonyCUNY/main/assignment5.csv")

table
##         X     X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2         delayed          62      12        20           102     305
## 3                          NA      NA        NA            NA      NA
## 4 AM WEST on time         694    4840       383           320     201
## 5         delayed         117     415        65           129      61
  1. Tidy and transform the data.
# Remove row with NA value
# Rename X and X.1 to Airline and Status
table_1 <- table |> 
              slice(-3) |>
              rename(Airline = X, Status = X.1)
# Fill in the missing values under Airline column
table_1[2, 1] <- 'ALASKA'
table_1[4, 1] <- 'AM WEST'

table_1
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61
# Use pivot_longer() to transform all location into Destination column
# arrange by Destination column

table_2 <- table_1 |> 
  pivot_longer(
    cols = c("Los.Angeles", "Phoenix", "San.Diego", "San.Francisco", "Seattle"),
    names_to = "Destination",
    values_to = "Count"
  ) |> 
    arrange(Destination)

table_2
## # A tibble: 20 × 4
##    Airline Status  Destination   Count
##    <chr>   <chr>   <chr>         <int>
##  1 ALASKA  on time Los.Angeles     497
##  2 ALASKA  delayed Los.Angeles      62
##  3 AM WEST on time Los.Angeles     694
##  4 AM WEST delayed Los.Angeles     117
##  5 ALASKA  on time Phoenix         221
##  6 ALASKA  delayed Phoenix          12
##  7 AM WEST on time Phoenix        4840
##  8 AM WEST delayed Phoenix         415
##  9 ALASKA  on time San.Diego       212
## 10 ALASKA  delayed San.Diego        20
## 11 AM WEST on time San.Diego       383
## 12 AM WEST delayed San.Diego        65
## 13 ALASKA  on time San.Francisco   503
## 14 ALASKA  delayed San.Francisco   102
## 15 AM WEST on time San.Francisco   320
## 16 AM WEST delayed San.Francisco   129
## 17 ALASKA  on time Seattle        1841
## 18 ALASKA  delayed Seattle         305
## 19 AM WEST on time Seattle         201
## 20 AM WEST delayed Seattle          61
# Use pivot_wider() to transform values under Status into two columns: On_time, Delayed
# value will be from Count column
# rename the new column

table_3 <- table_2 |> 
  pivot_wider(
    names_from = Status,
    values_from = Count,
  ) |> 
    rename(On_time = 'on time', Delayed = delayed)

table_3
## # A tibble: 10 × 4
##    Airline Destination   On_time Delayed
##    <chr>   <chr>           <int>   <int>
##  1 ALASKA  Los.Angeles       497      62
##  2 AM WEST Los.Angeles       694     117
##  3 ALASKA  Phoenix           221      12
##  4 AM WEST Phoenix          4840     415
##  5 ALASKA  San.Diego         212      20
##  6 AM WEST San.Diego         383      65
##  7 ALASKA  San.Francisco     503     102
##  8 AM WEST San.Francisco     320     129
##  9 ALASKA  Seattle          1841     305
## 10 AM WEST Seattle           201      61
# Create a column: Delay_rate

table_4 <- table_3 |> 
            mutate(Delay_rate = round((Delayed / (On_time + Delayed)) * 100, 2))
table_4
## # A tibble: 10 × 5
##    Airline Destination   On_time Delayed Delay_rate
##    <chr>   <chr>           <int>   <int>      <dbl>
##  1 ALASKA  Los.Angeles       497      62      11.1 
##  2 AM WEST Los.Angeles       694     117      14.4 
##  3 ALASKA  Phoenix           221      12       5.15
##  4 AM WEST Phoenix          4840     415       7.9 
##  5 ALASKA  San.Diego         212      20       8.62
##  6 AM WEST San.Diego         383      65      14.5 
##  7 ALASKA  San.Francisco     503     102      16.9 
##  8 AM WEST San.Francisco     320     129      28.7 
##  9 ALASKA  Seattle          1841     305      14.2 
## 10 AM WEST Seattle           201      61      23.3
  1. Perform analysis to compare the arrival delays for the two airlines
# Create a table showing two airlines Delay_rate for each Destination

table_5 <- table_4 |> 
            group_by(Destination, Airline) |> 
            summarize(Delay_rate) |> 
            spread(Airline, Delay_rate) |> 
            rename(AM_WEST = 'AM WEST')
## `summarise()` has grouped output by 'Destination'. You can override using the
## `.groups` argument.
table_5
## # A tibble: 5 × 3
## # Groups:   Destination [5]
##   Destination   ALASKA AM_WEST
##   <chr>          <dbl>   <dbl>
## 1 Los.Angeles    11.1     14.4
## 2 Phoenix         5.15     7.9
## 3 San.Diego       8.62    14.5
## 4 San.Francisco  16.9     28.7
## 5 Seattle        14.2     23.3

Conclusion: AM WEST has higher delay_rate than ALASKA

# Visualize the results

table_4 |>
  group_by(Airline) |> 
      ggplot(aes(x = Destination, y = Delay_rate, fill = Airline)) +
      geom_bar(stat = "identity", position = "dodge") +
      labs(title = "Airline Arrival Delay",
        x = "Destination",
        y = "Delayrate (%)")