Task:
Link to instructions
Import Necessary Libraries
library(dplyr, warn.conflicts = F)
library(tidyr)
library(ggplot2)
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
Import the .csv file
# Rename the first two columns
# After researching how to rename the missing rows, I came across the 'fill' function. I couldn't get it to work, until I tried replacing the empty cells with NAs. This did the trick! :)
flights_raw <- read.csv("https://raw.githubusercontent.com/JoshuaSturm/CUNY_MSDA/Public/Fall_2017/DATA_607/DATA_607_Homework_5/flights.csv", na.strings ="") %>%
rename(Airline = X, Arrived = X.1) %>%
fill(Airline) %>%
na.omit()
tbl_df(flights_raw)
## # A tibble: 4 x 7
## Airline Arrived Los.Angeles Phoenix San.Diego San.Francisco Seattle
## * <fctr> <fctr> <int> <int> <int> <int> <int>
## 1 ALASKA on time 497 221 212 503 1841
## 2 ALASKA delayed 62 12 20 102 305
## 3 AM WEST on time 694 4840 383 320 201
## 4 AM WEST delayed 117 415 65 129 61
Tidy the table
# The table is now in wide format
flights <- flights_raw %>%
gather("City", "Number", 3:7) %>%
spread("Arrived", "Number", 3:7) %>%
rename(Delayed = delayed, On_time = 'on time') %>%
arrange(Airline)
## Warning in if (!is.na(fill)) {: the condition has length > 1 and only the
## first element will be used
flights[,c(1,2,4,3)]
## Airline City On_time Delayed
## 1 ALASKA Los.Angeles 497 62
## 2 ALASKA Phoenix 221 12
## 3 ALASKA San.Diego 212 20
## 4 ALASKA San.Francisco 503 102
## 5 ALASKA Seattle 1841 305
## 6 AM WEST Los.Angeles 694 117
## 7 AM WEST Phoenix 4840 415
## 8 AM WEST San.Diego 383 65
## 9 AM WEST San.Francisco 320 129
## 10 AM WEST Seattle 201 61
Add columns
# Using the existing data, add column for total flights to that city, as well as percentage that were on time
flights %>%
group_by(Airline, City) %>%
mutate(Total = sum(On_time, Delayed), Pct_on_time = On_time / sum(On_time, Delayed))
## # A tibble: 10 x 6
## # Groups: Airline, City [10]
## Airline City Delayed On_time Total Pct_on_time
## <fctr> <chr> <int> <int> <int> <dbl>
## 1 ALASKA Los.Angeles 62 497 559 0.8890877
## 2 ALASKA Phoenix 12 221 233 0.9484979
## 3 ALASKA San.Diego 20 212 232 0.9137931
## 4 ALASKA San.Francisco 102 503 605 0.8314050
## 5 ALASKA Seattle 305 1841 2146 0.8578751
## 6 AM WEST Los.Angeles 117 694 811 0.8557337
## 7 AM WEST Phoenix 415 4840 5255 0.9210276
## 8 AM WEST San.Diego 65 383 448 0.8549107
## 9 AM WEST San.Francisco 129 320 449 0.7126949
## 10 AM WEST Seattle 61 201 262 0.7671756
Analyze the data
Comparison of arrival stats for each airline
# Using ggplot, I charted the arrival status for each airline, by city
alaska_ttl <- flights %>%
melt() %>%
ggplot(aes(x = City, y = value, fill = variable)) +
geom_bar(stat = 'identity') +
geom_text(size = 3, aes(label = value), position = position_stack(vjust = 0.5)) +
labs(y = "flights") +
facet_grid(~ Airline)
## Using Airline, City as id variables
alaska_ttl
