The purpose of this assignment is to create a wide dataset and practice tidying and transforming the dataset using the tidyr and dplyr libraries.
library(RMySQL)
## Loading required package: DBI
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.6 v dplyr 1.0.3
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.4.0 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(ggplot2)
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(wesanderson)
library(keyring)
library(tidyr)
key_set('user', 'test')
## Please enter password in TK window (Alt+Tab)
db = dbConnect(MySQL(), user='test', password=key_get('user', 'test'), dbname='hw5_607', host='localhost')
airline_data <- dbGetQuery(db, "SELECT * FROM Airlines")
colnames(airline_data)[1] <- gsub('^...','',colnames(airline_data)[1])
airline_data %>%
kbl(caption = "Wide Airline Data Exported from SQL") %>%
kable_styling(bootstrap_options = "striped")
| Airline | Category | Los Angeles | Phoenix | San Diego | San Francisco | Seattle |
|---|---|---|---|---|---|---|
| ALASKA | on time | 497 | 221 | 212 | 503 | 1841 |
| ALASKA | delayed | 62 | 12 | 20 | 102 | 305 |
| AM WEST | on time | 694 | 4840 | 383 | 320 | 201 |
| AM WEST | delayed | 117 | 415 | 65 | 129 | 61 |
Converting the data from a wide format to a long format
airline_long <- pivot_longer(airline_data, cols=3:7, names_to = "Destination", values_to = "NumFlights" )
airline_long %>%
kbl(caption = "Long Airline Data") %>%
kable_styling(bootstrap_options = "striped")
| Airline | Category | Destination | NumFlights |
|---|---|---|---|
| ALASKA | on time | Los Angeles | 497 |
| ALASKA | on time | Phoenix | 221 |
| ALASKA | on time | San Diego | 212 |
| ALASKA | on time | San Francisco | 503 |
| ALASKA | on time | Seattle | 1841 |
| ALASKA | delayed | Los Angeles | 62 |
| ALASKA | delayed | Phoenix | 12 |
| ALASKA | delayed | San Diego | 20 |
| ALASKA | delayed | San Francisco | 102 |
| ALASKA | delayed | Seattle | 305 |
| AM WEST | on time | Los Angeles | 694 |
| AM WEST | on time | Phoenix | 4840 |
| AM WEST | on time | San Diego | 383 |
| AM WEST | on time | San Francisco | 320 |
| AM WEST | on time | Seattle | 201 |
| AM WEST | delayed | Los Angeles | 117 |
| AM WEST | delayed | Phoenix | 415 |
| AM WEST | delayed | San Diego | 65 |
| AM WEST | delayed | San Francisco | 129 |
| AM WEST | delayed | Seattle | 61 |
Let’s take a look at the total number of on-time and delayed flights per airline, both for each destination and summed across all destinations.
total <- pivot_wider(airline_long, names_from = Category, values_from = NumFlights)
total <- rename(total, ontime = "on time")
total <- total %>% mutate(percentage_ontime = ontime / (delayed + ontime))
total %>%
kbl(caption = "Percentage of Flights Arriving On Time Per Destination") %>%
kable_styling(bootstrap_options = "striped")
| Airline | Destination | ontime | delayed | percentage_ontime |
|---|---|---|---|---|
| ALASKA | Los Angeles | 497 | 62 | 0.8890877 |
| ALASKA | Phoenix | 221 | 12 | 0.9484979 |
| ALASKA | San Diego | 212 | 20 | 0.9137931 |
| ALASKA | San Francisco | 503 | 102 | 0.8314050 |
| ALASKA | Seattle | 1841 | 305 | 0.8578751 |
| AM WEST | Los Angeles | 694 | 117 | 0.8557337 |
| AM WEST | Phoenix | 4840 | 415 | 0.9210276 |
| AM WEST | San Diego | 383 | 65 | 0.8549107 |
| AM WEST | San Francisco | 320 | 129 | 0.7126949 |
| AM WEST | Seattle | 201 | 61 | 0.7671756 |
airline_fewer <- select(airline_long, Airline, Category, NumFlights)
new <- airline_fewer %>% group_by(Airline, Category) %>% summarise(total = sum(NumFlights))
## `summarise()` has grouped output by 'Airline'. You can override using the `.groups` argument.
newnew <- pivot_wider(new, names_from = Category, values_from = total)
newnew <- rename(newnew, ontime = "on time")
newnew %>% mutate(percentage_ontime = ontime / (delayed + ontime)) %>%
kbl(caption = "Percentage of Flights Arriving On Time") %>%
kable_styling(bootstrap_options = "striped")
| Airline | delayed | ontime | percentage_ontime |
|---|---|---|---|
| ALASKA | 501 | 3274 | 0.8672848 |
| AM WEST | 787 | 6438 | 0.8910727 |
ggplot(total, aes(x=Destination, y=as.factor(percentage_ontime), fill=Airline)) + geom_bar(stat="identity", position="dodge") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(axis.line = element_line(colour = "black"), axis.text.y=element_blank()) + scale_fill_manual(values=wes_palette( name="Zissou1", 2, type="continuous")) +labs(title= "Airline Arrival/Delay")
ggplot(airline_fewer, aes(x=Airline, y=Category, fill=Category)) + geom_bar(stat="identity") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank()) + theme(axis.line = element_line(colour = "black"), axis.text.y=element_blank()) + scale_fill_manual(values=wes_palette( name="Zissou1", 2)) +labs(title= "Airline Arrival/Delay Overall Totals")
## Conclusions
While Alaska has better performance on on-time arrivals on a per-city basis, AM West has a better overall performance. This is likely because the majority of AM West’s flights fly to Phoenix, which has the highest number of on time AM West arrivals (92.10%).
Github link to Airlines Data: Click Here