Task:

Link to instructions

Import Necessary Libraries

library(dplyr, warn.conflicts = F)
library(tidyr)
library(ggplot2)
library(reshape2)
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

Import the .csv file

# Rename the first two columns
# After researching how to rename the missing rows, I came across the 'fill' function. I couldn't get it to work, until I tried replacing the empty cells with NAs. This did the trick! :)

flights_raw <- read.csv("https://raw.githubusercontent.com/JoshuaSturm/CUNY_MSDA/Public/Fall_2017/DATA_607/DATA_607_Homework_5/flights.csv", na.strings ="") %>%
  rename(Airline = X, Arrived = X.1) %>%
  fill(Airline) %>%
  na.omit()
tbl_df(flights_raw)
## # A tibble: 4 x 7
##   Airline Arrived Los.Angeles Phoenix San.Diego San.Francisco Seattle
## *  <fctr>  <fctr>       <int>   <int>     <int>         <int>   <int>
## 1  ALASKA on time         497     221       212           503    1841
## 2  ALASKA delayed          62      12        20           102     305
## 3 AM WEST on time         694    4840       383           320     201
## 4 AM WEST delayed         117     415        65           129      61

Tidy the table

# The table is now in wide format

flights <- flights_raw %>%
  gather("City", "Number", 3:7) %>%
  spread("Arrived", "Number", 3:7) %>%
  rename(Delayed = delayed, On_time = 'on time') %>%
  arrange(Airline)
## Warning in if (!is.na(fill)) {: the condition has length > 1 and only the
## first element will be used
flights[,c(1,2,4,3)]
##    Airline          City On_time Delayed
## 1   ALASKA   Los.Angeles     497      62
## 2   ALASKA       Phoenix     221      12
## 3   ALASKA     San.Diego     212      20
## 4   ALASKA San.Francisco     503     102
## 5   ALASKA       Seattle    1841     305
## 6  AM WEST   Los.Angeles     694     117
## 7  AM WEST       Phoenix    4840     415
## 8  AM WEST     San.Diego     383      65
## 9  AM WEST San.Francisco     320     129
## 10 AM WEST       Seattle     201      61

Add columns

# Using the existing data, add column for total flights to that city, as well as percentage that were on time
flights %>%
  group_by(Airline, City) %>%
  mutate(Total = sum(On_time, Delayed), Pct_on_time = On_time / sum(On_time, Delayed))
## # A tibble: 10 x 6
## # Groups:   Airline, City [10]
##    Airline          City Delayed On_time Total Pct_on_time
##     <fctr>         <chr>   <int>   <int> <int>       <dbl>
##  1  ALASKA   Los.Angeles      62     497   559   0.8890877
##  2  ALASKA       Phoenix      12     221   233   0.9484979
##  3  ALASKA     San.Diego      20     212   232   0.9137931
##  4  ALASKA San.Francisco     102     503   605   0.8314050
##  5  ALASKA       Seattle     305    1841  2146   0.8578751
##  6 AM WEST   Los.Angeles     117     694   811   0.8557337
##  7 AM WEST       Phoenix     415    4840  5255   0.9210276
##  8 AM WEST     San.Diego      65     383   448   0.8549107
##  9 AM WEST San.Francisco     129     320   449   0.7126949
## 10 AM WEST       Seattle      61     201   262   0.7671756

Analyze the data

Comparison of arrival stats for each airline

# Using ggplot, I charted the arrival status for each airline, by city

alaska_ttl <- flights %>%
  melt() %>%
  ggplot(aes(x = City, y = value, fill = variable)) +
   geom_bar(stat = 'identity') +
   geom_text(size = 3, aes(label = value),  position = position_stack(vjust = 0.5)) +
   labs(y = "flights") +
   facet_grid(~ Airline)
## Using Airline, City as id variables
alaska_ttl