Contents

  1. Project overview
  2. Load libraries and import data
  3. Tidy Data
  4. Analyze Data
  5. Visualize Data

Assignment Overview

The chart below describes arrival delays for two airlines across five destinations.

knitr::include_graphics('/Users/katieevers/Desktop/airlineTable.png')

Your task is to:
1. Create a .CSV file that includes all of the information above. You’re encouraged to use a “wide” structure similar to how the information appears above, so that you can practice tidying and transformations as described below.
2. Read the information from your.CSV file into R, and use tidyr and dplyr as needed to tidy and transform your data.
3. Perform analysis to compare the arrival delays for the two airlines.

Load libraries and import data

library("tidyr")
library("dplyr")
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library("kableExtra")
## Warning: package 'kableExtra' was built under R version 3.5.2
library("ggplot2")

airlines <- read.csv("https://raw.githubusercontent.com/KatherineEvers/airlines/master/airlines.csv", header = TRUE)

airlines
##   Airline  Status Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        54           129      61

Tidy Data

Tidy data so that each variable forms a column and each observation forms a row:

tidyDf <- airlines %>%
  #separate data into destination and count columns
  gather(Destination, Count, 3:7) %>%
  #spread flight status into separate columns
  spread(2,4)

#manipulate table style
tidyDf  %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Airline Destination delayed on time
ALASKA Los.Angeles 62 497
ALASKA Phoenix 12 221
ALASKA San.Diego 20 212
ALASKA San.Francisco 102 503
ALASKA Seattle 305 1841
AM WEST Los.Angeles 117 694
AM WEST Phoenix 415 4840
AM WEST San.Diego 54 383
AM WEST San.Francisco 129 320
AM WEST Seattle 61 201
#arrange rows by destination 
tidyDf2 <- tidyDf %>%
  arrange(Destination, Airline)

tidyDf2  %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Airline Destination delayed on time
ALASKA Los.Angeles 62 497
AM WEST Los.Angeles 117 694
ALASKA Phoenix 12 221
AM WEST Phoenix 415 4840
ALASKA San.Diego 20 212
AM WEST San.Diego 54 383
ALASKA San.Francisco 102 503
AM WEST San.Francisco 129 320
ALASKA Seattle 305 1841
AM WEST Seattle 61 201

Analyze data

#calculate the percentages of delayed and on time flights for each airline and destination
tidyDf3 <- tidyDf2 %>% 
        mutate(total = delayed + `on time`, percentDelayed = round(delayed / total * 100, 2), 
               percentOnTime = round(100 * `on time` / total, 2)) 
        
tidyDf3  %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Airline Destination delayed on time total percentDelayed percentOnTime
ALASKA Los.Angeles 62 497 559 11.09 88.91
AM WEST Los.Angeles 117 694 811 14.43 85.57
ALASKA Phoenix 12 221 233 5.15 94.85
AM WEST Phoenix 415 4840 5255 7.90 92.10
ALASKA San.Diego 20 212 232 8.62 91.38
AM WEST San.Diego 54 383 437 12.36 87.64
ALASKA San.Francisco 102 503 605 16.86 83.14
AM WEST San.Francisco 129 320 449 28.73 71.27
ALASKA Seattle 305 1841 2146 14.21 85.79
AM WEST Seattle 61 201 262 23.28 76.72
#create summary table comparing percentage of total delayed flights for each airline
summaryDf <- tidyDf3 %>% 
  group_by(Airline) %>% 
  summarise(totalDelayed = sum(delayed), totalOnTime = sum(`on time`), totalFlights = sum(total)) %>% 
  mutate(percentDelayed = round(totalDelayed/totalFlights * 100, 2))

summaryDf  %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Airline totalDelayed totalOnTime totalFlights percentDelayed
ALASKA 501 3274 3775 13.27
AM WEST 776 6438 7214 10.76
#order destinations by decreasing percentage of total delayed flights
destinationDelays <- tidyDf3 %>% 
  group_by(Destination) %>% 
  summarise(averageDelayPercent = round(mean(percentDelayed),2)) %>% 
  arrange(desc(averageDelayPercent))

destinationDelays  %>% 
  kable(row.names = TRUE) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"))
Destination averageDelayPercent
1 San.Francisco 22.80
2 Seattle 18.75
3 Los.Angeles 12.76
4 San.Diego 10.49
5 Phoenix 6.53

Visualize data

#create bar graphs comparing the number of flights for each airline by destination
plot <- ggplot(tidyDf3, aes(y = total, x = Airline, color = Airline, fill = Airline)) + 
    geom_bar( stat = "identity") +
    geom_text(aes(label = total), vjust = -.3) +
    facet_wrap(~Destination) +
    ylim(0, 5500) +
    ylab("Number of Flights") +
    ggtitle("Number of Fights by Airline and Destination")

plot

plot1 <- ggplot(summaryDf, aes(Airline, percentDelayed)) + 
  geom_bar(aes(fill = Airline), position = "dodge", stat = "identity") + 
  geom_text(aes(label = percentDelayed), vjust = -.3) +
  ylab("Percentage of Delayed Flights") +
  ggtitle("Total Percentage of Delayed Flights by Airline")

plot1

#create bar graphs comparing the percentages of delayed flights for each airline by destination
plot2 <- ggplot(tidyDf3, aes(y=percentDelayed, x = Airline, color = Airline, fill = Airline)) + 
    geom_bar( stat = "identity") +
    geom_text(aes(label = percentDelayed), vjust = -.3) +
    facet_wrap(~Destination) +
    ylim(0, 35) +
    ylab("Percentage of Delayed Flights") +
    ggtitle("Percentage of Delayed Flights by Airline and Destination")
    
plot2

#create a line graph comparing the percentages of delayed flights for each airline by destination
plot3 <- ggplot(tidyDf3, aes(x = Destination, y = percentDelayed, group = Airline, color = Airline, shape = Airline)) + 
  geom_point(aes(shape = Airline)) + 
  geom_line() + 
  geom_text(aes(label = percentDelayed), vjust = -.7) +
  labs(x = "Destination", y = "Percentage of Delayed flights") + 
  theme(legend.title=element_blank()) +
  ggtitle("Percentage of Delayed Flights by Airline and Destination")

plot3

Based on this analysis, America West Airlines has a smaller percentage of total delayed flights than Alaska Airlines. However, Alaska has smaller percentages of delayed flights than America West by destination. It is notable that American West has 5255 flights to Phoenix with 7.9% of those flights being delayed, while Alaska has only 233 Phoenix flights with 5.15% of flights delayed. In addition, America West has only 262 flights to Seattle with 23.28% of flights delayed, while Alaska has 2146 Seattle flights with 14.21% delayed.