# Packages used by this script: dplyr supplies the data-frame verbs and the
# %>% pipe, knitr supplies kable(), ggvis supplies the plots.
library(dplyr)
library(knitr)
library(ggvis)

# Load the flight records. Text columns are kept as character vectors rather
# than being converted to factors.
Flights <- read.csv(
  "/Users/wendyhayes/Desktop/MBA 676-R/domestic_flights_jan_2016.csv",
  stringsAsFactors = FALSE
)

# Rows that contain at least one missing value.
IncompleteCases <- Flights %>% filter(!complete.cases(.))

# Number of incomplete rows belonging to flights that were neither cancelled
# nor diverted. The count has to be taken over rows: indexing a data frame
# with a single logical vector and no comma selects columns, and length() of
# a data frame is its column count.
nrow(IncompleteCases %>% filter(Cancelled == 0, Diverted == 0))
## [1] 0
# Parse FlightDate (month/day/year text) into a Date, then turn each HHMM
# clock-time column into a POSIXct timestamp on that date. sprintf("%04d")
# left-pads the clock value with zeros to four digits so that "%H%M" can
# parse it. Every timestamp is anchored to FlightDate itself.
Flights <- Flights %>%
  mutate(
    FlightDate = as.Date(FlightDate, format = "%m/%d/%Y"),
    New_CRSDepTime = as.POSIXct(
      paste(FlightDate, sprintf("%04d", CRSDepTime)),
      format = "%Y-%m-%d %H%M"
    ),
    New_CRSArrTime = as.POSIXct(
      paste(FlightDate, sprintf("%04d", CRSArrTime)),
      format = "%Y-%m-%d %H%M"
    ),
    New_DepTime = as.POSIXct(
      paste(FlightDate, sprintf("%04d", DepTime)),
      format = "%Y-%m-%d %H%M"
    ),
    New_ArrTime = as.POSIXct(
      paste(FlightDate, sprintf("%04d", ArrTime)),
      format = "%Y-%m-%d %H%M"
    ),
    New_WheelsOff = as.POSIXct(
      paste(FlightDate, sprintf("%04d", WheelsOff)),
      format = "%Y-%m-%d %H%M"
    ),
    New_WheelsOn = as.POSIXct(
      paste(FlightDate, sprintf("%04d", WheelsOn)),
      format = "%Y-%m-%d %H%M"
    )
  )
# Restrict to flights that were neither cancelled nor diverted and whose
# arrival clock time is later than the departure clock time, then compute the
# arrival delay in minutes and keep only flights that arrived on time or late.
#
# The delay is taken with difftime(units = "mins") so the unit is fixed. A
# plain POSIXct subtraction picks its unit automatically from the data, so
# converting it with as.integer() and dividing by 60 is only correct when the
# automatic unit happens to be seconds.
#
# filter() is used for the final step because it drops rows whose ArrDelay is
# NA, whereas Flights[Flights$ArrDelay >= 0, ] would keep them as all-NA rows.
Flights <- Flights %>%
  filter(Cancelled == 0, Diverted == 0) %>%
  filter(ArrTime > DepTime) %>%
  mutate(
    ArrDelay = as.numeric(
      difftime(New_ArrTime, New_CRSArrTime, units = "mins")
    )
  ) %>%
  filter(ArrDelay >= 0)
# Histogram of arrival delays in 50-minute-wide bins. The x property names
# the ArrDelay column of the piped data frame instead of reaching back to the
# global Flights object, so the plot always describes the data it is given.
Flights %>%
  ggvis(~ArrDelay) %>%
  layer_histograms(width = 50, fill := "green") %>%
  add_axis("y", title = "Frequency of Delays", title_offset = 80) %>%
  add_axis("x", title = "Minutes Delayed")
# Quantile levels 1.0, 0.9, ..., 0.1 (one minus each step of 0 to 0.9).
percent <- seq(0, 0.9, 0.10)
quants <- 1 - percent

# Arrival delay at each quantile level, stripped of the quantile names.
delayvalues <- quantile(Flights$ArrDelay, quants)
dvalues <- as.numeric(delayvalues)

# One row per quantile level: the level and the delay value at that level.
qdif <- data.frame(MinutesDelayed = dvalues, Probability = quants)

# Scatter plot of quantile level against delay. `:=` sets the point colour to
# the literal value "blue"; with `=` ggvis would treat the string as data and
# map it through a fill scale instead.
pcity <- qdif %>% ggvis(x = ~MinutesDelayed, y = ~Probability)
pcity %>% layer_points(fill := "blue")
# Median arrival delay for each destination city.
medbycity <- by(Flights$ArrDelay, Flights$DestCityName, median)

# The twenty destinations with the largest median delay, largest first.
TopTwentyDelayed <- sort(medbycity, decreasing = TRUE)[1:20]

# Tabulate city name against median delay. stringsAsFactors = FALSE keeps the
# city names as character, matching how the source data was read, so the
# column joins cleanly against other character city columns.
DelayedDepart <- data.frame(
  DestinationCity = rownames(TopTwentyDelayed),
  DestinationMedianDelayed = as.numeric(TopTwentyDelayed),
  stringsAsFactors = FALSE
)
kable(DelayedDepart)
| DestinationCity | DestinationMedianDelayed |
|---|---|
| Plattsburgh, NY | 131.0 |
| Fairbanks, AK | 61.0 |
| North Bend/Coos Bay, OR | 47.0 |
| Rock Springs, WY | 40.0 |
| Cody, WY | 36.5 |
| Alpena, MI | 35.0 |
| Santa Maria, CA | 33.5 |
| Aguadilla, PR | 32.0 |
| Bend/Redmond, OR | 28.0 |
| Trenton, NJ | 27.5 |
| Muskegon, MI | 27.0 |
| Roswell, NM | 27.0 |
| San Francisco, CA | 27.0 |
| Bellingham, WA | 26.0 |
| Latrobe, PA | 25.5 |
| Christiansted, VI | 24.0 |
| Paducah, KY | 24.0 |
| Adak Island, AK | 23.5 |
| Devils Lake, ND | 23.0 |
| San Luis Obispo, CA | 23.0 |
# Median arrival delay for each origin city.
MedOriginCity <- by(Flights$ArrDelay, Flights$OriginCityName, median)

# The twenty origins with the largest median delay, largest first.
DelayedOrigin <- sort(MedOriginCity, decreasing = TRUE)[1:20]

# Tabulate city name against median delay. stringsAsFactors = FALSE keeps the
# city names as character, matching how the source data was read, so the
# column joins cleanly against other character city columns.
DelayedbyOrigin <- data.frame(
  OriginCity = rownames(DelayedOrigin),
  OriginMedianDelayed = as.numeric(DelayedOrigin),
  stringsAsFactors = FALSE
)
kable(DelayedbyOrigin)
| OriginCity | OriginMedianDelayed |
|---|---|
| Escanaba, MI | 95.5 |
| Cordova, AK | 83.0 |
| Yakutat, AK | 67.0 |
| Santa Maria, CA | 38.0 |
| Iron Mountain/Kingsfd, MI | 32.0 |
| Key West, FL | 31.0 |
| Meridian, MS | 31.0 |
| Niagara Falls, NY | 31.0 |
| North Bend/Coos Bay, OR | 31.0 |
| Columbus, GA | 30.0 |
| Eau Claire, WI | 28.5 |
| Eugene, OR | 28.0 |
| Daytona Beach, FL | 27.0 |
| Medford, OR | 27.0 |
| Muskegon, MI | 27.0 |
| Sitka, AK | 27.0 |
| Brownsville, TX | 26.0 |
| Plattsburgh, NY | 26.0 |
| Montgomery, AL | 25.0 |
| Arcata/Eureka, CA | 24.0 |
# Cities that appear in both top-twenty tables: match the destination table
# (DelayedDepart) to the origin table on city name. The left-hand table must
# be DelayedDepart, the data frame that carries the DestinationCity column;
# no object named TopDelayed is created anywhere in this script.
DelayOrigandDest <- inner_join(
  DelayedDepart,
  DelayedbyOrigin,
  by = c("DestinationCity" = "OriginCity")
)

# Rename to neutral column names now that both delays sit side by side.
colnames(DelayOrigandDest) <- c("City", "MedianDestDelay", "MedianOriginDelay")
kable(DelayOrigandDest)
| City | MedianDestDelay | MedianOriginDelay |
|---|---|---|
| Plattsburgh, NY | 131.0 | 26 |
| North Bend/Coos Bay, OR | 47.0 | 31 |
| Santa Maria, CA | 33.5 | 38 |
| Muskegon, MI | 27.0 | 27 |
# Stack the two delay columns into long form: each city appears twice, once
# with its destination median and once with its origin median. The label
# column is sized from nrow(DelayOrigandDest) rather than a fixed count, so
# the frame stays consistent however many cities the join returns.
PlotDataTogether <- data.frame(
  City = rep(DelayOrigandDest$City, times = 2),
  Delay = c(
    DelayOrigandDest$MedianDestDelay,
    DelayOrigandDest$MedianOriginDelay
  ),
  DelayType = rep(
    c("Destination", "Origination"),
    each = nrow(DelayOrigandDest)
  )
)

# Bar chart of median delay per city, coloured by delay type.
Bothplotted <- PlotDataTogether %>%
  ggvis(x = ~City, y = ~Delay, fill = ~factor(DelayType))
Bothplotted %>% layer_bars()