Loading the Data

I loaded in the data, making sure to include na.strings so that any empty character cell would be filled with NA.

untidy_flights <- read.csv(url("https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/Assignment5/flights.csv"), na.strings=c(""))

untidy_flights
##         X     X.1 Los.Angeles Pheonix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503   1,841
## 2    <NA> delayed          62      12        20           102     305
## 3    <NA>    <NA>          NA    <NA>        NA            NA    <NA>
## 4 AM WEST on time         694   4,840       383           320     201
## 5    <NA> delayed         117     415        65           129      61

Cleaning

I made a tidy_flights data frame from untidy_flights and renamed the first two columns to be more descriptive.

tidy_flights <- untidy_flights

tidy_flights <- tidy_flights %>% 
  rename("airline" = X,
         "arrival_status" = X.1)

I removed the empty row from the data frame and used fill() to populate the empty airline cells with their respective airline.

# remove empty row
tidy_flights <- tidy_flights[!apply(is.na(tidy_flights), 1, all),]

# populate empty airlines
tidy_flights <- tidy_flights %>% 
  fill(airline, .direction = "down")

tidy_flights
##   airline arrival_status Los.Angeles Pheonix San.Diego San.Francisco Seattle
## 1  ALASKA        on time         497     221       212           503   1,841
## 2  ALASKA        delayed          62      12        20           102     305
## 4 AM WEST        on time         694   4,840       383           320     201
## 5 AM WEST        delayed         117     415        65           129      61

I found the code for removing the empty row here.

Documentation on fill() function found here.

I then changed from wide to long format:

# make long format
tidy_flights <- tidy_flights %>%
  pivot_longer(cols = c("Los.Angeles":"Seattle"),
               names_to = "city",
               values_to = "num_flights",
               values_transform = list(num_flights = as.character))

# remove any comma from num_flights and change to integer
tidy_flights$num_flights <- str_remove(tidy_flights$num_flights, ",")
tidy_flights$num_flights <- as.integer(tidy_flights$num_flights)

# remove . from city names
tidy_flights$city <- str_replace(tidy_flights$city, "\\.", " ")

# change arrival_status values to snake case
tidy_flights$arrival_status <- to_snake_case(tidy_flights$arrival_status)

tidy_flights
## # A tibble: 20 × 4
##    airline arrival_status city          num_flights
##    <chr>   <chr>          <chr>               <int>
##  1 ALASKA  on_time        Los Angeles           497
##  2 ALASKA  on_time        Pheonix               221
##  3 ALASKA  on_time        San Diego             212
##  4 ALASKA  on_time        San Francisco         503
##  5 ALASKA  on_time        Seattle              1841
##  6 ALASKA  delayed        Los Angeles            62
##  7 ALASKA  delayed        Pheonix                12
##  8 ALASKA  delayed        San Diego              20
##  9 ALASKA  delayed        San Francisco         102
## 10 ALASKA  delayed        Seattle               305
## 11 AM WEST on_time        Los Angeles           694
## 12 AM WEST on_time        Pheonix              4840
## 13 AM WEST on_time        San Diego             383
## 14 AM WEST on_time        San Francisco         320
## 15 AM WEST on_time        Seattle               201
## 16 AM WEST delayed        Los Angeles           117
## 17 AM WEST delayed        Pheonix               415
## 18 AM WEST delayed        San Diego              65
## 19 AM WEST delayed        San Francisco         129
## 20 AM WEST delayed        Seattle                61

I then split based on “on time” vs. “delayed”:

tidy_flights <- tidy_flights %>%
  pivot_wider(names_from = "arrival_status",
              values_from = "num_flights")

Comparing Arrival Delays

tidy_flights <- tidy_flights %>%
  arrange(desc(delayed))

knitr::kable(tidy_flights)
airline city on_time delayed
AM WEST Pheonix 4840 415
ALASKA Seattle 1841 305
AM WEST San Francisco 320 129
AM WEST Los Angeles 694 117
ALASKA San Francisco 503 102
AM WEST San Diego 383 65
ALASKA Los Angeles 497 62
AM WEST Seattle 201 61
ALASKA San Diego 212 20
ALASKA Pheonix 221 12
tidy_flights %>%
  ggplot(aes(x = city, y = delayed, fill = airline)) +
    geom_bar(stat = "identity", position = "dodge") +
    geom_text(aes(label = delayed), position = position_dodge(.9))

AM West flights to Pheonix had the most arrival delays (415 flights) while Alaska flights to Pheonix had the least arrival delays (12 flights).

What was the proportion of delayed flights for each airline?

tidy_flights <- tidy_flights %>%
  mutate("total" = on_time + delayed)

delayed_per_airline <- tidy_flights %>% 
  group_by(airline) %>%
  summarize("delayed" = sum(delayed), "total" = sum(total)) %>%
  mutate("prop_delayed" = delayed/total) 

knitr::kable(delayed_per_airline)
airline delayed total prop_delayed
ALASKA 501 3775 0.1327152
AM WEST 787 7225 0.1089273

About 13% of Alaska airline flights were delayed while about 11% of AM West flights were delayed.

What was the proportion of delayed flights for each city?

delayed_per_city <- tidy_flights %>%
  group_by(city) %>%
  summarize("delayed" = sum(delayed), "total" = sum(total)) %>%
  mutate("prop_delayed" = delayed/total) %>%
  arrange(desc(prop_delayed))

knitr::kable(delayed_per_city)
city delayed total prop_delayed
San Francisco 231 1054 0.2191651
Seattle 366 2408 0.1519934
Los Angeles 179 1370 0.1306569
San Diego 85 680 0.1250000
Pheonix 427 5488 0.0778061

Flights to San Francisco had the most delays overall (22%).

What was the proportion of delayed flights for each city according to each airline?

delayed_per_city_airline <- tidy_flights %>%
  mutate("prop_delayed" = delayed / total) %>%
  arrange(city, airline)

knitr::kable(delayed_per_city_airline)
airline city on_time delayed total prop_delayed
ALASKA Los Angeles 497 62 559 0.1109123
AM WEST Los Angeles 694 117 811 0.1442663
ALASKA Pheonix 221 12 233 0.0515021
AM WEST Pheonix 4840 415 5255 0.0789724
ALASKA San Diego 212 20 232 0.0862069
AM WEST San Diego 383 65 448 0.1450893
ALASKA San Francisco 503 102 605 0.1685950
AM WEST San Francisco 320 129 449 0.2873051
ALASKA Seattle 1841 305 2146 0.1421249
AM WEST Seattle 201 61 262 0.2328244
delayed_per_city_airline %>%
  ggplot(aes(x = city, y = prop_delayed, color = airline)) +
    geom_point() +
    scale_y_continuous(labels = scales::percent)

We can see that AM West has a greater percentage of delayed flights for each city. AM West flights to San Francisco had the largest percentage of delayed flights (29%), while Alaska flights to Phoenix had the least (5%).

When comparing this to the overall percentage of delayed flights for each airline, it appears that Alaska Airlines outperformed AM West in every city, yet under-performed overall. This is because AM West has a greater overall number of flights relative to Alaska Airlines, and the calculation for delay percentage did not include a weighting of the delay percentage for each city relative to the percentage that each city’s flights make of the total flights for each airline.

Let us now perform the weighted calculations:

# data frame for each airline's total flights
airline_totals <- tidy_flights %>%
  group_by(airline) %>%
  summarize("airline_total_flights" = sum(total))

# join to table of flight information
weighted_flights <- tidy_flights %>%
  left_join(airline_totals, on = "airline")
## Joining, by = "airline"
# calculate the percentage of flights to each city out of total airline flights
# calculate percentage of delayed flights out of total flights to city
weighted_flights <- weighted_flights %>%
  mutate("prop_of_flights" = total / airline_total_flights,
         "city_prop_delayed" = delayed / total)

# multiply percentages to get true percentage delay per airline to each city
weighted_flights <- weighted_flights %>%
  mutate("adj_prop_delayed" = prop_of_flights * city_prop_delayed)

# check that percentages still valid
weighted_flights %>% 
  group_by(airline) %>%
  summarize("prop_delayed" = sum(adj_prop_delayed))
## # A tibble: 2 × 2
##   airline prop_delayed
##   <chr>          <dbl>
## 1 ALASKA         0.133
## 2 AM WEST        0.109

The percentage of overall delays remains the same as above. We can now visualize the adjusted delay percentage of each airline’s flights to each city:

weighted_flights %>%
  ggplot(aes(x = city, y = adj_prop_delayed, color = airline)) +
    geom_point() +
    scale_y_continuous(labels = scales::percent)

la_diff <- weighted_flights %>% 
  filter(city == "Los Angeles") %>%
  summarize(la_diff = diff(adj_prop_delayed))

la_diff <- la_diff$la_diff[1]

We can now compare the true proportion of delayed flights for each city. AM West outperforms Alaska Airline in flights to San Francisco and Seattle, while Alaska outperforms AM West in flights to Phoenix and San Diego. The percentage of delayed flights to Los Angeles is similar between airlines, with AM West having a tiny edge on Alaska Airlines (a difference of about 0.023%)

We can compare the two plots below to see how the percentage changes when the proportion is weighted correctly:

delayed_per_city_airline %>%
  ggplot(aes(x = city, y = prop_delayed, color = airline)) +
    geom_point() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = "Delay Percentage of Each Airline Per City")

weighted_flights %>%
  ggplot(aes(x = city, y = adj_prop_delayed, color = airline)) +
    geom_point() +
    scale_y_continuous(labels = scales::percent) +
    labs(title = "Adjusted Delay Percentage of Each Airline Per City")