DATA607 - Assignment 5

Assignment 5

Overview

In this assignment, the following tasks are performed:

Figure 1. Overview flowchart

Import

Import csv file from Githup into an initial format.

# Download the flight info CSV from Github
csvfile <- getURL("https://raw.githubusercontent.com/mmippolito/cuny/main/data607/assignment5/flightinfo.csv")
rawdata <- read.csv(text = csvfile)
kable(rawdata, caption = "<i><font color=#000000><b>Table 1.</b> Raw flight data read from CSV</font></i>") %>% 
  kable_styling(latex_options = "striped")

**Table 1.** Raw flight data read from CSV
Airline	Flight.Status	Los.Angeles	Phoenix	San.Diego	San.Francisco	Seattle
ALASKA	on time	497	221	212	503	1,841
	delayed	62	12	20	102	305
AM WEST	on time	694	4,840	383	320	201
	delayed	117	415	65	129	61

Gather

Preliminary tidying:

* Populate the missing airline names.
* Gather the data keyed off city name.
* Remove commas from flight counts.
* Convert flight counts to numeric values.

# Data will look like this:
#       Airline   Flight.Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
# [1,] "ALASKA"  "on time"     "497"       "221"   "212"     "503"         "1,841"
# [2,] ""        "delayed"     " 62"       "12"    " 20"     "102"         "305"  
# [3,] "AM WEST" "on time"     "694"       "4,840" "383"     "320"         "201"  
# [4,] ""        "delayed"     "117"       "415"   " 65"     "129"         "61"   

# Populate missing values in Airline column
rawdata[[1]][2] <- "ALASKA"
rawdata[[1]][4] <- "AM WEST"

# Gather into this format:
#       Airline   Flight.Status city            num_flights
#  [1,] "ALASKA"  "on time"     "Los.Angeles"   "497"    
#  [2,] "ALASKA"  "delayed"     "Los.Angeles"   "62"    
#  [3,] "AM WEST" "on time"     "Los.Angeles"   "694"    
#  [4,] "AM WEST" "delayed"     "Los.Angeles"   "117"    
flights <- gather(rawdata, "Los.Angeles", "Phoenix", "San.Diego", "San.Francisco", "Seattle", 
                  key = "city", value="num_flights")

# Remove commas from flight counts and convert to numeric
flights$num_flights <- str_remove_all(flights$num_flights, ",")
flights$num_flights <- as.numeric(flights$num_flights)

Children

Create child tables for airlines, status, and cities.

# Create airlines dataframe
airlines <- data.frame(airline_id = c(1, 2), airline = c("ALASKA", "AM WEST"))

# Create status dataframe
statuses <- data.frame(status_id = c(1, 2), status = c("on time", "delayed"))

# Create cities dataframe
cities <- group_by(flights, city) %>% summarize()
cities <- mutate(cities, city_id = row_number()) %>% relocate(city_id, .before = city)

# Add airport codes for readability on graphs
cities <- mutate(cities, code = c("LAX", "PHX", "SAN", "SFO", "SEA"))

# Display tables
kable(list(airlines, statuses, cities), 
  caption = "<i><font color=#000000><b>Table 2.</b> Child tables: airlines, statuses, and cities</font></i>") %>% 
  kable_styling(latex_options = "striped")

Table 2. Child tables: airlines, statuses, and cities

airline_id	airline
1	ALASKA
2	AM WEST

status_id	status
1	on time
2	delayed

city_id	city	code
1	Los.Angeles	LAX
2	Phoenix	PHX
3	San.Diego	SAN
4	San.Francisco	SFO
5	Seattle	SEA

Normalize

Normalize the fights dataframe into third normal form (or maybe even Boyce Codd?). Entity relationship diagram:

Image of an entity relationship diagram

# Join airlines dataframe to flights
flights <- merge(flights, airlines, by.x = "Airline", by.y = "airline")

# Join statuses dataframe to flights
flights <- merge(flights, statuses, by.x = "Flight.Status", by.y = "status")

# Join cities dataframe to flights
flights <- merge(flights, cities, by = "city")

# Change to standard variable names
flights <- mutate(flights, city_fk = city_id)
flights <- mutate(flights, status_fk = status_id)
flights <- mutate(flights, airline_fk = airline_id)

# Add a unique id to flights table based on row number
flights <- mutate(flights, flight_id = row_number())

Cleanup

Perform the following cleanup work:

* Remove old variables.
* Rearrange the columns.
* Replaces dots in city names with spaces.
* Change airline names to title case.

# Remove old variables
flights <- select(flights, -city)
flights <- select(flights, -Flight.Status)
flights <- select(flights, -Airline)

# Rearrange columns into an order that actually makes sense
flights <- select(flights, flight_id, airline_fk, status_fk, city_fk, num_flights)

# Replace dots in city names with spaces
cities$city <- str_replace_all(cities$city, "\\.", " ")

# Change airline names to title case
airlines.airline <- str_to_title(airlines$airline)

# Show the tables
kable(head(flights), 
  caption = "<i><font color=#000000><b>Table 3.</b> Normalized Flights Dataframe (first 6 rows)</font></i>") %>% 
  kable_styling(latex_options = "striped")

**Table 3.** Normalized Flights Dataframe (first 6 rows)
flight_id	airline_fk	status_fk	city_fk	num_flights
1	1	2	1	62
2	2	2	1	117
3	1	1	1	497
4	2	1	1	694
5	2	1	2	4840
6	1	2	2	12

Analysis

# Analyze by city:
# 
# 1. Join flights data to the the cities and statuses frames
# 2. Group by city and status
# 3. Spread flights into two status columns (delayed and on time)
# 4. Calculate total flights and percentage of delayed flights
# 5. Order by city name

by_city <- flights %>% 
  merge(cities, by.x = "city_fk", by.y = "city_id") %>%
  merge(statuses, by.x = "status_fk", by.y = "status_id") %>%
  group_by(city, status) %>% summarize(count = sum(num_flights)) %>%
  spread(key = status, value = count) %>%
  mutate(total_flights = delayed + `on time`) %>%
  mutate(pct_delayed = round(100 * (delayed / total_flights), 1)) %>%
  arrange(desc(pct_delayed))
kable(by_city, 
  caption = "<i><font color=#000000><b>Table 4.</b> Flight status by city arranged by percentage of delayed flights</font></i>") %>% 
  kable_styling(latex_options = "striped")

**Table 4.** Flight status by city arranged by percentage of delayed flights
city	delayed	on time	total_flights	pct_delayed
San Francisco	231	823	1054	21.9
Seattle	366	2042	2408	15.2
Los Angeles	179	1191	1370	13.1
San Diego	85	595	680	12.5
Phoenix	427	5061	5488	7.8

As shown, San Francisco had the most flights delayed (21.9%), while Phoenix had the fewest (7.8%).

# Analyze by airline:
# 
# 1. Join flights data to the the airline and statuses frames
# 2. Group by airline and status
# 3. Spread flights into two status columns (delayed and on time)
# 4. Calculate total flights and percentage of delayed flights
# 5. Order by city name

overall_flights <- flights %>% 
  merge(statuses, by.x = "status_fk", by.y = "status_id") %>%
  group_by(status) %>% summarize(count = sum(num_flights)) %>%
  spread(key = status, value = count) %>%
  mutate(total_flights = delayed + `on time`) %>%
  mutate(pct_delayed = round(100 * (delayed / total_flights), 1)) %>%
  arrange(desc(pct_delayed))
kable(overall_flights, 
  caption = "<i><font color=#000000><b>Table 5.</b> Overall flight status arranged by percentage of delayed flights</font></i>") %>% 
  kable_styling(latex_options = "striped")

**Table 5.** Overall flight status arranged by percentage of delayed flights
delayed	on time	total_flights	pct_delayed
1288	9712	11000	11.7

Overll, 11.7% of flights were delayed for both airlines at all cities in the data set. The discrepancy between the overall performance and the per-city performance might be explained in terms of weather; more delays would be expected at the airports with the worst weather conditions. The total number of passengers served by the airport annually may also contribute to the number of delays. Qualitatively, that San Francisco and Seattle match well with that assumption, but further analysis would be necessary to provide any kind of quantitative evaluation.

Further Analysis

Based on the above data, I was curious to see if I could find quantitative evidence to support my hypothesis that bad weather and busier airports contribute to more flight delays.

# Import data from Github; some notes about the variables:
#
# 1. passengers = number of passengers served in 2013, with the exception of San Diego, which data is from 2014
# 2. cloudy = average annual days that the city was cloudy, based on data from 1961-1990
# 3. foggy = average annual days that the city was foggy, based on data from 1961-1990
# 4. rainy = average annual days that had rainfall, based on data from 1981-2010
# 5. precip = average annual precipitation in inches, based on data from 1981-2010

csvfile <- getURL("https://raw.githubusercontent.com/mmippolito/cuny/main/data607/assignment5/airportinfo.csv")
airportdata <- read.csv(text = csvfile)

# Convert passengers to millions of passengers for readability
airportdata <- mutate(airportdata, Mpassengers = round(passengers / 1000000, 2)) %>% 
  select(-passengers) %>% arrange(city)

# Join table to the by_city table, get the airport code, and display the table
airportdata <- airportdata %>% merge(by_city, by = "city") %>%
  merge(cities, by = "city") %>%
  select(code, cloudy, foggy, rainy, precip, Mpassengers, pct_delayed) %>%
  mutate(code = factor(code, levels = unique(code))) %>%
  arrange(pct_delayed) %>% mutate(row = row_number()) %>% relocate(row, .before = code)
kable(airportdata, caption = "<i><font color=#000000><b>Table 6.</b> Selected airport data</font></i>") %>% 
  kable_styling(latex_options = "striped")

**Table 6.** Selected airport data
row	code	cloudy	foggy	rainy	precip	Mpassengers	pct_delayed
1	PHX	70	7	30	8.2	40.32	7.8
2	SAN	102	97	42	10.3	18.77	12.5
3	LAX	103	92	36	12.8	66.70	13.1
4	SEA	226	165	149	37.7	34.82	15.2
5	SFO	105	108	68	20.7	44.94	21.9

# 
plotdata <- gather(airportdata, key = "stat", value = "value", 
  c("foggy", "rainy", "precip", "Mpassengers"))
ggplot() +
  geom_bar(data = airportdata, aes(x = code, y = pct_delayed), stat = "identity") +
  geom_point(data = plotdata, aes(x = code, y = value, color = stat, group = stat)) +
  geom_line(data = plotdata, aes(x = code, y = value, color = stat, group = stat)) + 
  ylab("value") + 
  ggtitle("Figure 2. Comparison of selected airport stats to percentage of flights delayed")

Conclusion

As shown, there seems to be a general correlation between weather and the number of delayed flights. There did not seem to be a similar correlation with passenger throughput. Therefore, based on this limited analysis, there is some quantitative evidence to suggest that bad weather seems to play a significant factor in flight delays.

One thing note that may seem paradoxicl is that, while Phoenix saw the most number of total flights, it also had the fewest percentage of delayed flights–only 7.8%. However, this can be explained by its hot and dry climate, leading to fewer delays due to weather.