require(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 3.1.3
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 3.1.3
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#import .csv and rename columns
air_arr <- read.csv("C:/users/Andrew/Documents/R/win-library/3.1/is607w6.csv", header = TRUE, as.is = TRUE,
col.names = c("airlines", "status", "Los_Angeles", "Phoenix", "San_Diego", "San_Francisco", "Seattle"))
#coerce city vectors to numeric and remove commas
air_arr[,3:7] <- apply(air_arr[,3:7], 2, function(air_arr) as.numeric(gsub(",","", air_arr)))
#Fill in the blank airlines slot. Had trouble finding an eloquent solution for larger datafiles with this problem. With time...
air_arr[2,1] <- "Alaska"
air_arr[5,1] <- "AM WEST"
air_arr
## airlines status Los_Angeles Phoenix San_Diego San_Francisco Seattle
## 1 Alaska on time 497 221 212 503 1841
## 2 Alaska delayed 62 12 20 102 305
## 3 NA NA NA NA NA
## 4 AM WEST on time 694 4840 383 320 201
## 5 AM WEST delayed 117 415 65 129 61
#Tidy up data.
air_delays <- air_arr %>%
gather(city, value, Los_Angeles:Seattle) %>%
filter(status != "") %>% #At this point, I believe it is technically tidy, but now to make it easier to analyze and pass on...
spread(status, value) %>%
mutate(tot_delayed = delayed / (delayed + `on time`))
AM WEST has a higher proportion of delays (over 6%) than Alaska
#Create and plot delays by airline
total_air_delays <- air_delays %>%
group_by(airlines) %>%
summarise(tpropdelays = mean(tot_delayed))
total_air_delays
## Source: local data frame [2 x 2]
##
## airlines tpropdelays
## 1 AM WEST 0.1776915
## 2 Alaska 0.1118683
barplot(total_air_delays$tpropdelays, names = total_air_delays$airlines, main = "Proportion of Delayed Flights by Airline")
AM WEST is especially bad in San Francisco (12% more than Alaska) and Seattle (9%)
#Create and plot delays by city and airline
city_air_delays <- air_delays %>%
group_by(airlines, city) %>%
summarise(cpropdelays = mean(tot_delayed)) %>%
spread(airlines, cpropdelays) %>%
mutate(AM_WEST_Diff = (`AM WEST` - Alaska))
city_air_delays
## Source: local data frame [5 x 4]
##
## city Alaska AM WEST AM_WEST_Diff
## 1 Los_Angeles 0.11091234 0.14426634 0.03335399
## 2 Phoenix 0.05150215 0.07897241 0.02747026
## 3 San_Diego 0.08620690 0.14508929 0.05888239
## 4 San_Francisco 0.16859504 0.28730512 0.11871008
## 5 Seattle 0.14212488 0.23282443 0.09069954
barplot(city_air_delays$AM_WEST_Diff, names = city_air_delays$city, main = "AM WEST delays minus Alaska delays")