Assignment 5

Overview


In this assignment, the following tasks are performed:

Figure 1. Overview flowchart

Import


Import csv file from Githup into an initial format.

# Download the flight info CSV from Github
csvfile <- getURL("https://raw.githubusercontent.com/mmippolito/cuny/main/data607/assignment5/flightinfo.csv")
rawdata <- read.csv(text = csvfile)
kable(rawdata, caption = "<i><font color=#000000><b>Table 1.</b> Raw flight data read from CSV</font></i>") %>% 
  kable_styling(latex_options = "striped")
Table 1. Raw flight data read from CSV
Airline Flight.Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
ALASKA on time 497 221 212 503 1,841
delayed 62 12 20 102 305
AM WEST on time 694 4,840 383 320 201
delayed 117 415 65 129 61

Gather


Preliminary tidying:

* Populate the missing airline names.
* Gather the data keyed off city name.
* Remove commas from flight counts.
* Convert flight counts to numeric values.

# Data will look like this:
#       Airline   Flight.Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
# [1,] "ALASKA"  "on time"     "497"       "221"   "212"     "503"         "1,841"
# [2,] ""        "delayed"     " 62"       "12"    " 20"     "102"         "305"  
# [3,] "AM WEST" "on time"     "694"       "4,840" "383"     "320"         "201"  
# [4,] ""        "delayed"     "117"       "415"   " 65"     "129"         "61"   

# Populate missing values in Airline column
rawdata[[1]][2] <- "ALASKA"
rawdata[[1]][4] <- "AM WEST"

# Gather into this format:
#       Airline   Flight.Status city            num_flights
#  [1,] "ALASKA"  "on time"     "Los.Angeles"   "497"    
#  [2,] "ALASKA"  "delayed"     "Los.Angeles"   "62"    
#  [3,] "AM WEST" "on time"     "Los.Angeles"   "694"    
#  [4,] "AM WEST" "delayed"     "Los.Angeles"   "117"    
flights <- gather(rawdata, "Los.Angeles", "Phoenix", "San.Diego", "San.Francisco", "Seattle", 
                  key = "city", value="num_flights")

# Remove commas from flight counts and convert to numeric
flights$num_flights <- str_remove_all(flights$num_flights, ",")
flights$num_flights <- as.numeric(flights$num_flights)

Children


Create child tables for airlines, status, and cities.

# Create airlines dataframe
airlines <- data.frame(airline_id = c(1, 2), airline = c("ALASKA", "AM WEST"))

# Create status dataframe
statuses <- data.frame(status_id = c(1, 2), status = c("on time", "delayed"))

# Create cities dataframe
cities <- group_by(flights, city) %>% summarize()
cities <- mutate(cities, city_id = row_number()) %>% relocate(city_id, .before = city)

# Add airport codes for readability on graphs
cities <- mutate(cities, code = c("LAX", "PHX", "SAN", "SFO", "SEA"))

# Display tables
kable(list(airlines, statuses, cities), 
  caption = "<i><font color=#000000><b>Table 2.</b> Child tables: airlines, statuses, and cities</font></i>") %>% 
  kable_styling(latex_options = "striped")
Table 2. Child tables: airlines, statuses, and cities
airline_id airline
1 ALASKA
2 AM WEST
status_id status
1 on time
2 delayed
city_id city code
1 Los.Angeles LAX
2 Phoenix PHX
3 San.Diego SAN
4 San.Francisco SFO
5 Seattle SEA

Normalize


Normalize the fights dataframe into third normal form (or maybe even Boyce Codd?). Entity relationship diagram:

Image of an entity relationship diagram

# Join airlines dataframe to flights
flights <- merge(flights, airlines, by.x = "Airline", by.y = "airline")

# Join statuses dataframe to flights
flights <- merge(flights, statuses, by.x = "Flight.Status", by.y = "status")

# Join cities dataframe to flights
flights <- merge(flights, cities, by = "city")

# Change to standard variable names
flights <- mutate(flights, city_fk = city_id)
flights <- mutate(flights, status_fk = status_id)
flights <- mutate(flights, airline_fk = airline_id)

# Add a unique id to flights table based on row number
flights <- mutate(flights, flight_id = row_number())

Cleanup


Perform the following cleanup work:

* Remove old variables.
* Rearrange the columns.
* Replaces dots in city names with spaces.
* Change airline names to title case.

# Remove old variables
flights <- select(flights, -city)
flights <- select(flights, -Flight.Status)
flights <- select(flights, -Airline)

# Rearrange columns into an order that actually makes sense
flights <- select(flights, flight_id, airline_fk, status_fk, city_fk, num_flights)

# Replace dots in city names with spaces
cities$city <- str_replace_all(cities$city, "\\.", " ")

# Change airline names to title case
airlines.airline <- str_to_title(airlines$airline)

# Show the tables
kable(head(flights), 
  caption = "<i><font color=#000000><b>Table 3.</b> Normalized Flights Dataframe (first 6 rows)</font></i>") %>% 
  kable_styling(latex_options = "striped")
Table 3. Normalized Flights Dataframe (first 6 rows)
flight_id airline_fk status_fk city_fk num_flights
1 1 2 1 62
2 2 2 1 117
3 1 1 1 497
4 2 1 1 694
5 2 1 2 4840
6 1 2 2 12

Analysis


# Analyze by city:
# 
# 1. Join flights data to the the cities and statuses frames
# 2. Group by city and status
# 3. Spread flights into two status columns (delayed and on time)
# 4. Calculate total flights and percentage of delayed flights
# 5. Order by city name

by_city <- flights %>% 
  merge(cities, by.x = "city_fk", by.y = "city_id") %>%
  merge(statuses, by.x = "status_fk", by.y = "status_id") %>%
  group_by(city, status) %>% summarize(count = sum(num_flights)) %>%
  spread(key = status, value = count) %>%
  mutate(total_flights = delayed + `on time`) %>%
  mutate(pct_delayed = round(100 * (delayed / total_flights), 1)) %>%
  arrange(desc(pct_delayed))
kable(by_city, 
  caption = "<i><font color=#000000><b>Table 4.</b> Flight status by city arranged by percentage of delayed flights</font></i>") %>% 
  kable_styling(latex_options = "striped")
Table 4. Flight status by city arranged by percentage of delayed flights
city delayed on time total_flights pct_delayed
San Francisco 231 823 1054 21.9
Seattle 366 2042 2408 15.2
Los Angeles 179 1191 1370 13.1
San Diego 85 595 680 12.5
Phoenix 427 5061 5488 7.8


As shown, San Francisco had the most flights delayed (21.9%), while Phoenix had the fewest (7.8%).

# Analyze by airline:
# 
# 1. Join flights data to the the airline and statuses frames
# 2. Group by airline and status
# 3. Spread flights into two status columns (delayed and on time)
# 4. Calculate total flights and percentage of delayed flights
# 5. Order by city name

overall_flights <- flights %>% 
  merge(statuses, by.x = "status_fk", by.y = "status_id") %>%
  group_by(status) %>% summarize(count = sum(num_flights)) %>%
  spread(key = status, value = count) %>%
  mutate(total_flights = delayed + `on time`) %>%
  mutate(pct_delayed = round(100 * (delayed / total_flights), 1)) %>%
  arrange(desc(pct_delayed))
kable(overall_flights, 
  caption = "<i><font color=#000000><b>Table 5.</b> Overall flight status arranged by percentage of delayed flights</font></i>") %>% 
  kable_styling(latex_options = "striped")
Table 5. Overall flight status arranged by percentage of delayed flights
delayed on time total_flights pct_delayed
1288 9712 11000 11.7


Overll, 11.7% of flights were delayed for both airlines at all cities in the data set. The discrepancy between the overall performance and the per-city performance might be explained in terms of weather; more delays would be expected at the airports with the worst weather conditions. The total number of passengers served by the airport annually may also contribute to the number of delays. Qualitatively, that San Francisco and Seattle match well with that assumption, but further analysis would be necessary to provide any kind of quantitative evaluation.

Further Analysis


Based on the above data, I was curious to see if I could find quantitative evidence to support my hypothesis that bad weather and busier airports contribute to more flight delays.

# Import data from Github; some notes about the variables:
#
# 1. passengers = number of passengers served in 2013, with the exception of San Diego, which data is from 2014
# 2. cloudy = average annual days that the city was cloudy, based on data from 1961-1990
# 3. foggy = average annual days that the city was foggy, based on data from 1961-1990
# 4. rainy = average annual days that had rainfall, based on data from 1981-2010
# 5. precip = average annual precipitation in inches, based on data from 1981-2010

csvfile <- getURL("https://raw.githubusercontent.com/mmippolito/cuny/main/data607/assignment5/airportinfo.csv")
airportdata <- read.csv(text = csvfile)

# Convert passengers to millions of passengers for readability
airportdata <- mutate(airportdata, Mpassengers = round(passengers / 1000000, 2)) %>% 
  select(-passengers) %>% arrange(city)

# Join table to the by_city table, get the airport code, and display the table
airportdata <- airportdata %>% merge(by_city, by = "city") %>%
  merge(cities, by = "city") %>%
  select(code, cloudy, foggy, rainy, precip, Mpassengers, pct_delayed) %>%
  mutate(code = factor(code, levels = unique(code))) %>%
  arrange(pct_delayed) %>% mutate(row = row_number()) %>% relocate(row, .before = code)
kable(airportdata, caption = "<i><font color=#000000><b>Table 6.</b> Selected airport data</font></i>") %>% 
  kable_styling(latex_options = "striped")
Table 6. Selected airport data
row code cloudy foggy rainy precip Mpassengers pct_delayed
1 PHX 70 7 30 8.2 40.32 7.8
2 SAN 102 97 42 10.3 18.77 12.5
3 LAX 103 92 36 12.8 66.70 13.1
4 SEA 226 165 149 37.7 34.82 15.2
5 SFO 105 108 68 20.7 44.94 21.9


# 
plotdata <- gather(airportdata, key = "stat", value = "value", 
  c("foggy", "rainy", "precip", "Mpassengers"))
ggplot() +
  geom_bar(data = airportdata, aes(x = code, y = pct_delayed), stat = "identity") +
  geom_point(data = plotdata, aes(x = code, y = value, color = stat, group = stat)) +
  geom_line(data = plotdata, aes(x = code, y = value, color = stat, group = stat)) + 
  ylab("value") + 
  ggtitle("Figure 2. Comparison of selected airport stats to percentage of flights delayed")

Conclusion


As shown, there seems to be a general correlation between weather and the number of delayed flights. There did not seem to be a similar correlation with passenger throughput. Therefore, based on this limited analysis, there is some quantitative evidence to suggest that bad weather seems to play a significant factor in flight delays.

One thing note that may seem paradoxicl is that, while Phoenix saw the most number of total flights, it also had the fewest percentage of delayed flights–only 7.8%. However, this can be explained by its hot and dry climate, leading to fewer delays due to weather.