Load packages
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(reshape2)
#Read in .csv
source <- read.csv("https://raw.githubusercontent.com/danielhong98/MSDA-Spring-2016/f59001030c617a62564cc2a09937c4376afc317c/Tidying%20and%20Transforming%20Data.csv", header = TRUE, sep=",", stringsAsFactors = FALSE)
source
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
head(source)
## X X.1 Los.Angeles Phoenix San.Diego San.Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 delayed 117 415 65 129 61
#Clean the dataframe by removing columns/rows and assign column names for airlines and status, repeat row names
#source <- subset(source, c(TRUE, TRUE, FALSE, TRUE, TRUE), 2:8)
colnames(source)[1] <- "Carrier"
colnames(source)[2] <- "Status"
source[2,1] <- "Alaska"
source[5,1] <- "AM WEST"
upsource <- source[-(3), ]
#Cleaned dataframe is easier to read but more work to be done, tidy to have 4 columns and alternating rows
list <- gather(upsource, "City", "Frequency", 3:7)
head(list)
## Carrier Status City Frequency
## 1 Alaska on time Los.Angeles 497
## 2 Alaska delayed Los.Angeles 62
## 3 AM WEST on time Los.Angeles 694
## 4 AM WEST delayed Los.Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
#Some of the city names have extra characters, let's remove
list$City <- gsub("\\."," ",list$City)
head(list)
## Carrier Status City Frequency
## 1 Alaska on time Los Angeles 497
## 2 Alaska delayed Los Angeles 62
## 3 AM WEST on time Los Angeles 694
## 4 AM WEST delayed Los Angeles 117
## 5 Alaska on time Phoenix 221
## 6 Alaska delayed Phoenix 12
#Perform analysis to compare the arrival delays for the two airlines
##At first glance it appears that AM WEST has a higher percentage of delays in each city
list <- spread(list, Status, Frequency)
head(list)
## Carrier City delayed on time
## 1 Alaska Los Angeles 62 497
## 2 Alaska Phoenix 12 221
## 3 Alaska San Diego 20 212
## 4 Alaska San Francisco 102 503
## 5 Alaska Seattle 305 1841
## 6 AM WEST Los Angeles 117 694
list$delayed <- as.numeric(as.character(list$delayed))
list$`on time` <- as.numeric(as.character(list$`on time`))
list <- mutate(list, total = delayed + `on time`, pctdelay = delayed / total * 100)
head(list)
## Carrier City delayed on time total pctdelay
## 1 Alaska Los Angeles 62 497 559 11.091234
## 2 Alaska Phoenix 12 221 233 5.150215
## 3 Alaska San Diego 20 212 232 8.620690
## 4 Alaska San Francisco 102 503 605 16.859504
## 5 Alaska Seattle 305 1841 2146 14.212488
## 6 AM WEST Los Angeles 117 694 811 14.426634
list <- arrange(list, City, pctdelay)
list
## Carrier City delayed on time total pctdelay
## 1 Alaska Los Angeles 62 497 559 11.091234
## 2 AM WEST Los Angeles 117 694 811 14.426634
## 3 Alaska Phoenix 12 221 233 5.150215
## 4 AM WEST Phoenix 415 4840 5255 7.897241
## 5 Alaska San Diego 20 212 232 8.620690
## 6 AM WEST San Diego 65 383 448 14.508929
## 7 Alaska San Francisco 102 503 605 16.859504
## 8 AM WEST San Francisco 129 320 449 28.730512
## 9 Alaska Seattle 305 1841 2146 14.212488
## 10 AM WEST Seattle 61 201 262 23.282443
##Boxplot pct delays by city
qplot(City, pctdelay, data = list, geom = c("boxplot", "jitter"), fill = City, main = "Pct Delays by City", xlab = "", ylab = "Pct Delayed")

#Group by City
##It appears that most delays, ~22% of the flights, are delayed in San Francisoco and the least delays, ~8%, are in Phoenix
list %>% group_by(City) %>% summarise(aggdelay = sum(delayed), aggontime = sum(`on time`), aggpctdelay = (aggdelay / (aggdelay + aggontime) * 100))
## Source: local data frame [5 x 4]
##
## City aggdelay aggontime aggpctdelay
## (chr) (dbl) (dbl) (dbl)
## 1 Los Angeles 179 1191 13.065693
## 2 Phoenix 427 5061 7.780612
## 3 San Diego 85 595 12.500000
## 4 San Francisco 231 823 21.916509
## 5 Seattle 366 2042 15.199336
#Group by Carrier
##When we look at the aggregate totals by carrier, Alaska (~13%) has a higher percentage of delays compared to AM WEST (~11%)
list %>% group_by(Carrier) %>% summarise(aggdelay = sum(delayed), aggontime = sum(`on time`), aggpctdelay = (aggdelay / (aggdelay + aggontime) * 100))
## Source: local data frame [2 x 4]
##
## Carrier aggdelay aggontime aggpctdelay
## (chr) (dbl) (dbl) (dbl)
## 1 Alaska 501 3274 13.27152
## 2 AM WEST 787 6438 10.89273
#I could not create boxplots after grouping by City or Carrier. There was a post on Stackoverflow that used melt but it did not work when I tried.