#load data
setwd('C:/Users/Christina/Documents/GitHub/IS')
df<-read.csv("airline2.csv", na.strings = "")
data<-df[rowSums(is.na(df))!=7, ]
names(data)[1:2]<-c('airline','status')
library('zoo')
data$airline<-na.locf(data$airline)
#transform wide to long format
library(tidyr)
dflong <- data %>% gather(airport, number, -airline, -status)
library(dplyr)
#high lever statistics
delayed<-select(filter(dflong, status == 'DELAYED'),-status)
arrange(delayed, airline,desc(number))
## airline airport number
## 1 ALASKA SEA 305
## 2 ALASKA SFO 102
## 3 ALASKA LAX 62
## 4 ALASKA SAN 20
## 5 ALASKA PHX 12
## 6 AM WEST PHX 415
## 7 AM WEST SFO 129
## 8 AM WEST LAX 117
## 9 AM WEST SAN 65
## 10 AM WEST SEA 61
#average number of delays by airline
summarise(group_by(delayed, airline),airports = n(),avg = mean(number))
## Source: local data frame [2 x 3]
##
## airline airports avg
## 1 ALASKA 5 100.2
## 2 AM WEST 5 157.4
sum<-summarise(group_by(dflong, airline, status),airports = n(),total = sum(number))
#total delay rate
stats<-sum %>% group_by(airline) %>% mutate(pct=(100*total)/sum(total))
select(filter(stats, status=='DELAYED'),-status, -airports)
## Source: local data frame [2 x 3]
## Groups: airline
##
## airline total pct
## 1 ALASKA 501 13.27152
## 2 AM WEST 787 10.89273
Alaska appears to have less number of delayed flights, but overall higher delay rate.
#calculate percentage of delays at each airport
airports<-filter(dflong %>% group_by(airline, airport) %>% mutate(pct=(100*number)/sum(number)), status == 'DELAYED')
Barplot of delays by airline:
## Warning: package 'ggplot2' was built under R version 3.2.2
## Warning: package 'ggthemes' was built under R version 3.2.1
** It appears that at every airport, Alaska is less likely to be delayed. Recall that overall, Alaska has a highter delay rate! How is this possible?**
Are some airports notorious for congestion?
arrange(summarise(group_by(delayed, airport),total = sum(number)),desc(total))
## Source: local data frame [5 x 2]
##
## airport total
## 1 PHX 427
## 2 SEA 366
## 3 SFO 231
## 4 LAX 179
## 5 SAN 85
Phoenix and Seattle have large volumes of delayed flights.
AM West has a lower delayed rate overall, because it performed fairly well in Phonenix - about 75% of its total flights are on time.