Introduction

The purpose of this assignment is to create a wide dataset and practice tidying and transforming the dataset using the tidyr and dplyr libraries.

library(RMySQL)
## Loading required package: DBI
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.6     v dplyr   1.0.3
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.4.0     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(ggplot2)
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(wesanderson)
library(keyring)
library(tidyr)

Importing Data

key_set('user', 'test')
## Please enter password in TK window (Alt+Tab)
db = dbConnect(MySQL(), user='test', password=key_get('user', 'test'), dbname='hw5_607', host='localhost')
airline_data <- dbGetQuery(db, "SELECT * FROM Airlines")
colnames(airline_data)[1] <- gsub('^...','',colnames(airline_data)[1])
airline_data %>%
  kbl(caption = "Wide Airline Data Exported from SQL") %>%
  kable_styling(bootstrap_options = "striped")
Wide Airline Data Exported from SQL
Airline Category Los Angeles Phoenix San Diego San Francisco Seattle
ALASKA on time 497 221 212 503 1841
ALASKA delayed 62 12 20 102 305
AM WEST on time 694 4840 383 320 201
AM WEST delayed 117 415 65 129 61

Tidying Up the Data

Converting the data from a wide format to a long format

airline_long <- pivot_longer(airline_data, cols=3:7, names_to = "Destination", values_to = "NumFlights" )
airline_long %>%
  kbl(caption = "Long Airline Data") %>%
  kable_styling(bootstrap_options = "striped")
Long Airline Data
Airline Category Destination NumFlights
ALASKA on time Los Angeles 497
ALASKA on time Phoenix 221
ALASKA on time San Diego 212
ALASKA on time San Francisco 503
ALASKA on time Seattle 1841
ALASKA delayed Los Angeles 62
ALASKA delayed Phoenix 12
ALASKA delayed San Diego 20
ALASKA delayed San Francisco 102
ALASKA delayed Seattle 305
AM WEST on time Los Angeles 694
AM WEST on time Phoenix 4840
AM WEST on time San Diego 383
AM WEST on time San Francisco 320
AM WEST on time Seattle 201
AM WEST delayed Los Angeles 117
AM WEST delayed Phoenix 415
AM WEST delayed San Diego 65
AM WEST delayed San Francisco 129
AM WEST delayed Seattle 61

Analysis

Let’s take a look at the total number of on-time and delayed flights per airline, both for each destination and summed across all destinations.

total <- pivot_wider(airline_long, names_from = Category, values_from = NumFlights)
total <- rename(total, ontime = "on time")
total <- total %>% mutate(percentage_ontime = ontime / (delayed + ontime)) 
total %>%
  kbl(caption = "Percentage of Flights Arriving On Time Per Destination") %>%
  kable_styling(bootstrap_options = "striped")
Percentage of Flights Arriving On Time Per Destination
Airline Destination ontime delayed percentage_ontime
ALASKA Los Angeles 497 62 0.8890877
ALASKA Phoenix 221 12 0.9484979
ALASKA San Diego 212 20 0.9137931
ALASKA San Francisco 503 102 0.8314050
ALASKA Seattle 1841 305 0.8578751
AM WEST Los Angeles 694 117 0.8557337
AM WEST Phoenix 4840 415 0.9210276
AM WEST San Diego 383 65 0.8549107
AM WEST San Francisco 320 129 0.7126949
AM WEST Seattle 201 61 0.7671756
airline_fewer <- select(airline_long, Airline, Category, NumFlights)
new <- airline_fewer %>% group_by(Airline, Category) %>% summarise(total = sum(NumFlights))
## `summarise()` has grouped output by 'Airline'. You can override using the `.groups` argument.
newnew <- pivot_wider(new, names_from = Category, values_from = total)
newnew <- rename(newnew, ontime = "on time")
newnew %>% mutate(percentage_ontime = ontime / (delayed + ontime)) %>%
  kbl(caption = "Percentage of Flights Arriving On Time") %>%
  kable_styling(bootstrap_options = "striped")
Percentage of Flights Arriving On Time
Airline delayed ontime percentage_ontime
ALASKA 501 3274 0.8672848
AM WEST 787 6438 0.8910727
ggplot(total, aes(x=Destination, y=as.factor(percentage_ontime), fill=Airline)) + geom_bar(stat="identity", position="dodge") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(axis.line = element_line(colour = "black"), axis.text.y=element_blank()) + scale_fill_manual(values=wes_palette( name="Zissou1", 2, type="continuous")) +labs(title= "Airline Arrival/Delay")

ggplot(airline_fewer, aes(x=Airline, y=Category, fill=Category)) + geom_bar(stat="identity") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(axis.line = element_line(colour = "black"), axis.text.y=element_blank()) + scale_fill_manual(values=wes_palette( name="Zissou1", 2)) +labs(title= "Airline Arrival/Delay Overall Totals")

## Conclusions

While Alaska has better performance on on-time arrivals on a per-city basis, AM West has a better overall performance. This is likely because the majority of AM West’s flights fly to Phoenix, which has the highest number of on time AM West arrivals (92.10%).

Reproducibility

Github link to Airlines Data: Click Here