Load libraries

library(tidyverse)
library(knitr)

Read in csv file to clean

flights_raw <- as.data.frame(read.delim("https://github.com/cassandra-coste/CUNY607/raw/main/Flights%20Data.csv", header = TRUE, stringsAsFactors = FALSE, na.strings=c("","NA"), sep = ",", fileEncoding = "UTF-8-BOM"))

Tidying the dataframe

# rename nonsensical columns and use fill to fill missing airline data from Airline column

flights_clean <- flights_raw %>% rename(Airline = X, Status = X.1) %>% fill(Airline)

# delete row 3 which contains no data

flights_clean <- flights_clean [-3,]

# gather data using City as the key to separate city variable from count data on on-time versus delayed flights 

flights_clean <- gather(flights_clean, City, Count, 3:7)

# covert Count column to numeric accounting for untidy numbers that came with CSV file 

flights_clean[, 4]  <- as.numeric(gsub(",","",flights_clean[, 4]))

# use spread to separate out on-time variable from delayed variable

flights_clean <- flights_clean %>% spread(Status, Count)

# rename delayed and on-time columns and replace period with space in city column

flights_clean <- flights_clean %>% rename(Delayed = delayed, On_time = "on time")

flights_clean$City <- str_replace(flights_clean$City,"\\."," ")

View tidy data

kable(flights_clean , col.names = c('Airline', 'Departing City', 'Delayed', 'On Time'), align = "ccrr", caption = "Flights Data by City")
Flights Data by City
Airline Departing City Delayed On Time
ALASKA Los Angeles 62 497
ALASKA Phoenix 12 221
ALASKA San Diego 20 212
ALASKA San Francisco 102 503
ALASKA Seattle 305 1841
AM WEST Los Angeles 117 694
AM WEST Phoenix 415 4840
AM WEST San Diego 65 383
AM WEST San Francisco 129 320
AM WEST Seattle 61 201

Visualize Data

# Compute percentage of flights delayed

flights_Visualize <- flights_clean %>% mutate(total = rowSums(across(where(is.numeric)))) %>% mutate(percent_delayed = Delayed/total * 100)

## Look at percentage of flights delayed by airline by city

ggplot(flights_Visualize, aes(fill = Airline, x = reorder(City,-percent_delayed), y = percent_delayed)) + geom_bar(position = "dodge", stat="identity") + coord_flip() + ggtitle("Percentage of Flights Delayed by Airline") + ylab("Percentage of Flights Delayed") + xlab("Departing City") + scale_fill_discrete(name = "Airline", labels = c("Alaska", "America West"))

Takeaway

After interpreting this visualization, it’s easier to see that across the board, America West airlines experiences a higher percentage of flights delayed than Alaska airlines across all cities evaluated. You can also see that certain cities experience a greater number of delays all around for both airlines than other cities (i.e. if you are flying out of Phoenix on Alaska or America West Airlines, you might not need to worry about delays are much as if you are flying out of San Francisco).