I loaded in the data, making sure to include na.strings
so that any empty character cell would be filled with NA.
untidy_flights <- read.csv(url("https://raw.githubusercontent.com/ShanaFarber/cuny-sps/master/DATA_607/Assignment5/flights.csv"), na.strings=c(""))
untidy_flights
## X X.1 Los.Angeles Pheonix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 <NA> delayed 62 12 20 102 305
## 3 <NA> <NA> NA <NA> NA NA <NA>
## 4 AM WEST on time 694 4,840 383 320 201
## 5 <NA> delayed 117 415 65 129 61
I made a tidy_flights data frame from
untidy_flights and renamed the first two columns to be more
descriptive.
tidy_flights <- untidy_flights
tidy_flights <- tidy_flights %>%
rename("airline" = X,
"arrival_status" = X.1)
I removed the empty row from the data frame and used
fill() to populate the empty airline cells with their
respective airline.
# remove empty row
tidy_flights <- tidy_flights[!apply(is.na(tidy_flights), 1, all),]
# populate empty airlines
tidy_flights <- tidy_flights %>%
fill(airline, .direction = "down")
tidy_flights
## airline arrival_status Los.Angeles Pheonix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 ALASKA delayed 62 12 20 102 305
## 4 AM WEST on time 694 4,840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
I found the code for removing the empty row here.
Documentation on fill() function found here.
I then changed from wide to long format:
# make long format
tidy_flights <- tidy_flights %>%
pivot_longer(cols = c("Los.Angeles":"Seattle"),
names_to = "city",
values_to = "num_flights",
values_transform = list(num_flights = as.character))
# remove any comma from num_flights and change to integer
tidy_flights$num_flights <- str_remove(tidy_flights$num_flights, ",")
tidy_flights$num_flights <- as.integer(tidy_flights$num_flights)
# remove . from city names
tidy_flights$city <- str_replace(tidy_flights$city, "\\.", " ")
# change arrival_status values to snake case
tidy_flights$arrival_status <- to_snake_case(tidy_flights$arrival_status)
tidy_flights
## # A tibble: 20 × 4
## airline arrival_status city num_flights
## <chr> <chr> <chr> <int>
## 1 ALASKA on_time Los Angeles 497
## 2 ALASKA on_time Pheonix 221
## 3 ALASKA on_time San Diego 212
## 4 ALASKA on_time San Francisco 503
## 5 ALASKA on_time Seattle 1841
## 6 ALASKA delayed Los Angeles 62
## 7 ALASKA delayed Pheonix 12
## 8 ALASKA delayed San Diego 20
## 9 ALASKA delayed San Francisco 102
## 10 ALASKA delayed Seattle 305
## 11 AM WEST on_time Los Angeles 694
## 12 AM WEST on_time Pheonix 4840
## 13 AM WEST on_time San Diego 383
## 14 AM WEST on_time San Francisco 320
## 15 AM WEST on_time Seattle 201
## 16 AM WEST delayed Los Angeles 117
## 17 AM WEST delayed Pheonix 415
## 18 AM WEST delayed San Diego 65
## 19 AM WEST delayed San Francisco 129
## 20 AM WEST delayed Seattle 61
I then split based on “on time” vs. “delayed”:
tidy_flights <- tidy_flights %>%
pivot_wider(names_from = "arrival_status",
values_from = "num_flights")
tidy_flights <- tidy_flights %>%
arrange(desc(delayed))
knitr::kable(tidy_flights)
| airline | city | on_time | delayed |
|---|---|---|---|
| AM WEST | Pheonix | 4840 | 415 |
| ALASKA | Seattle | 1841 | 305 |
| AM WEST | San Francisco | 320 | 129 |
| AM WEST | Los Angeles | 694 | 117 |
| ALASKA | San Francisco | 503 | 102 |
| AM WEST | San Diego | 383 | 65 |
| ALASKA | Los Angeles | 497 | 62 |
| AM WEST | Seattle | 201 | 61 |
| ALASKA | San Diego | 212 | 20 |
| ALASKA | Pheonix | 221 | 12 |
tidy_flights %>%
ggplot(aes(x = city, y = delayed, fill = airline)) +
geom_bar(stat = "identity", position = "dodge") +
geom_text(aes(label = delayed), position = position_dodge(.9))
AM West flights to Pheonix had the most arrival delays (415 flights) while Alaska flights to Pheonix had the least arrival delays (12 flights).
What was the proportion of delayed flights for each airline?
tidy_flights <- tidy_flights %>%
mutate("total" = on_time + delayed)
delayed_per_airline <- tidy_flights %>%
group_by(airline) %>%
summarize("delayed" = sum(delayed), "total" = sum(total)) %>%
mutate("prop_delayed" = delayed/total)
knitr::kable(delayed_per_airline)
| airline | delayed | total | prop_delayed |
|---|---|---|---|
| ALASKA | 501 | 3775 | 0.1327152 |
| AM WEST | 787 | 7225 | 0.1089273 |
About 13% of Alaska airline flights were delayed while about 11% of AM West flights were delayed.
What was the proportion of delayed flights for each city?
delayed_per_city <- tidy_flights %>%
group_by(city) %>%
summarize("delayed" = sum(delayed), "total" = sum(total)) %>%
mutate("prop_delayed" = delayed/total) %>%
arrange(desc(prop_delayed))
knitr::kable(delayed_per_city)
| city | delayed | total | prop_delayed |
|---|---|---|---|
| San Francisco | 231 | 1054 | 0.2191651 |
| Seattle | 366 | 2408 | 0.1519934 |
| Los Angeles | 179 | 1370 | 0.1306569 |
| San Diego | 85 | 680 | 0.1250000 |
| Pheonix | 427 | 5488 | 0.0778061 |
Flights to San Francisco had the most delays overall (22%).
What was the proportion of delayed flights for each city according to each airline?
delayed_per_city_airline <- tidy_flights %>%
mutate("prop_delayed" = delayed / total) %>%
arrange(city, airline)
knitr::kable(delayed_per_city_airline)
| airline | city | on_time | delayed | total | prop_delayed |
|---|---|---|---|---|---|
| ALASKA | Los Angeles | 497 | 62 | 559 | 0.1109123 |
| AM WEST | Los Angeles | 694 | 117 | 811 | 0.1442663 |
| ALASKA | Pheonix | 221 | 12 | 233 | 0.0515021 |
| AM WEST | Pheonix | 4840 | 415 | 5255 | 0.0789724 |
| ALASKA | San Diego | 212 | 20 | 232 | 0.0862069 |
| AM WEST | San Diego | 383 | 65 | 448 | 0.1450893 |
| ALASKA | San Francisco | 503 | 102 | 605 | 0.1685950 |
| AM WEST | San Francisco | 320 | 129 | 449 | 0.2873051 |
| ALASKA | Seattle | 1841 | 305 | 2146 | 0.1421249 |
| AM WEST | Seattle | 201 | 61 | 262 | 0.2328244 |
delayed_per_city_airline %>%
ggplot(aes(x = city, y = prop_delayed, color = airline)) +
geom_point() +
scale_y_continuous(labels = scales::percent)
We can see that AM West has a greater percentage of delayed flights for each city. AM West flights to San Francisco had the largest percentage of delayed flights (29%), while Alaska flights to Phoenix had the least (5%).
When comparing this to the overall percentage of delayed flights for each airline, it appears that Alaska Airlines outperformed AM West in every city, yet under-performed overall. This is because AM West has a greater overall number of flights relative to Alaska Airlines, and the calculation for delay percentage did not include a weighting of the delay percentage for each city relative to the percentage that each city’s flights make of the total flights for each airline.
Let us now perform the weighted calculations:
# data frame for each airline's total flights
airline_totals <- tidy_flights %>%
group_by(airline) %>%
summarize("airline_total_flights" = sum(total))
# join to table of flight information
weighted_flights <- tidy_flights %>%
left_join(airline_totals, on = "airline")
## Joining, by = "airline"
# calculate the percentage of flights to each city out of total airline flights
# calculate percentage of delayed flights out of total flights to city
weighted_flights <- weighted_flights %>%
mutate("prop_of_flights" = total / airline_total_flights,
"city_prop_delayed" = delayed / total)
# multiply percentages to get true percentage delay per airline to each city
weighted_flights <- weighted_flights %>%
mutate("adj_prop_delayed" = prop_of_flights * city_prop_delayed)
# check that percentages still valid
weighted_flights %>%
group_by(airline) %>%
summarize("prop_delayed" = sum(adj_prop_delayed))
## # A tibble: 2 × 2
## airline prop_delayed
## <chr> <dbl>
## 1 ALASKA 0.133
## 2 AM WEST 0.109
The percentage of overall delays remains the same as above. We can now visualize the adjusted delay percentage of each airline’s flights to each city:
weighted_flights %>%
ggplot(aes(x = city, y = adj_prop_delayed, color = airline)) +
geom_point() +
scale_y_continuous(labels = scales::percent)
la_diff <- weighted_flights %>%
filter(city == "Los Angeles") %>%
summarize(la_diff = diff(adj_prop_delayed))
la_diff <- la_diff$la_diff[1]
We can now compare the true proportion of delayed flights for each city. AM West outperforms Alaska Airline in flights to San Francisco and Seattle, while Alaska outperforms AM West in flights to Phoenix and San Diego. The percentage of delayed flights to Los Angeles is similar between airlines, with AM West having a tiny edge on Alaska Airlines (a difference of about 0.023%)
We can compare the two plots below to see how the percentage changes when the proportion is weighted correctly:
delayed_per_city_airline %>%
ggplot(aes(x = city, y = prop_delayed, color = airline)) +
geom_point() +
scale_y_continuous(labels = scales::percent) +
labs(title = "Delay Percentage of Each Airline Per City")
weighted_flights %>%
ggplot(aes(x = city, y = adj_prop_delayed, color = airline)) +
geom_point() +
scale_y_continuous(labels = scales::percent) +
labs(title = "Adjusted Delay Percentage of Each Airline Per City")