airlines <- read.csv("https://raw.githubusercontent.com/Shayaeng/Data607/main/Assignment5/airlines.csv")
print(airlines)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 ALASKA on time 497 221 212 503 1,841
## 2 delayed 62 12 20 102 305
## 3 AM WEST on time 694 4,840 383 320 201
## 4 delayed 117 415 65 129 61
#name the columns without names and add values to the missing fields
airlines <- airlines %>%
rename('Airline' = X, 'ArrivalStatus' = X.1) %>%
mutate_all(~if_else(. == "", NA, .)) %>%
fill(Airline, .direction = "down")
#some columns had commas and are character columns
columns_with_commas <- c('Phoenix', 'Seattle')
for (column in columns_with_commas)
airlines[[column]] <- as.integer(gsub(",", "", airlines[[column]]))
#lengthen and widen the dataframe to make it easier to calculate the delays
cities <- c('Los.Angeles', 'Phoenix', 'San.Diego', 'San.Francisco', 'Seattle')
airlines_long <- airlines %>%
pivot_longer(cols = all_of(cities), names_to = "Destination", values_to = "Flights")
airlines_wide <- airlines_long %>%
pivot_wider(names_from = "ArrivalStatus", values_from = "Flights")
#calculate delay ratios per airline
airlines_wide <- airlines_wide %>%
mutate(DelayRatio = delayed/(delayed + `on time`))
ggplot(airlines_wide, aes(x = Destination, y = DelayRatio, fill = Airline)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Delay Ratio by Airline",
y = "Delay Ratio",
x = "City")
As is clear from the above bar plot, Am West has consistently higher ratios of flights being delayed. This held true across all the cities measured.