For this dataset, I examined all flights in the United States during November 2014, and explored a few lines of inquiry.
#Loads the flight records used by every section below; the relative path means
#flights.csv is looked up in the current working directory
flights <- read.csv(file = "flights.csv", header = TRUE, sep = ",")
library(plyr)
library(broom)
library(ggplot2)
library(boot)
#Keeps only the flights that have a recorded carrier delay
has.carrier.delay <- !is.na(flights[["CARRIER_DELAY"]])
carrier.delays1 <- flights[has.carrier.delay, ]
#One row per carrier: number of flights and mean carrier delay
carrier.summary <- ddply(
  carrier.delays1,
  "CARRIER_ID",
  summarise,
  carrier.sum = length(CARRIER),
  avg.delay = mean(CARRIER_DELAY)
)
#Orders the carrier levels by mean delay so the plot axis comes out sorted
carrier.summary$CARRIER_ID <- with(carrier.summary, reorder(CARRIER_ID, avg.delay))
#Replaces the long US Airways label with a short one
carrier.summary$CARRIER_ID <- mapvalues(
  carrier.summary$CARRIER_ID,
  from = "US Airways Inc. (Merged with America West 9/05. Reporting for both starting 10/07.)",
  to = "US Airways Inc."
)
#Dot plot of mean delay per carrier; point size encodes the number of flights
carrier.delays.plot <- ggplot(
  data = carrier.summary,
  aes(x = avg.delay, y = CARRIER_ID, size = carrier.sum)
) +
  geom_point() +
  labs(
    x = "Average Delay (mins)",
    y = "Carrier",
    title = "Average Delay by Carrier",
    size = "Flight Volume"
  )
carrier.delays.plot
**Does Southwest Airlines have longer carrier delays than Delta Airlines?**
#Changes NA values to 0 in the numeric columns only.
#Assigning 0 into a factor column is not a valid level: it leaves the NA in
#place and raises an "invalid factor level" warning, so non-numeric columns
#are left untouched here.
numeric.cols <- vapply(flights, is.numeric, logical(1))
flights[numeric.cols] <- lapply(flights[numeric.cols], function(column) {
  column[is.na(column)] <- 0
  column
})
#Creates a smaller dataframe holding only the Southwest (WN) and Delta (DL) flights
carrier.delays <- flights[flights$CARRIER %in% c("WN", "DL"), ]
#Q-Q plot of Southwest's raw carrier delay minutes against a normal distribution
wn.raw.delay <- carrier.delays$CARRIER_DELAY[carrier.delays$CARRIER == "WN"]
qqnorm(wn.raw.delay)
qqline(wn.raw.delay, col = "blue")
#Adds the square root of the carrier delay minutes as a new column
carrier.delays$sqrt.CARRIER_DELAY <- sqrt(carrier.delays$CARRIER_DELAY)
#Same Q-Q plot on the square-root scale, for comparison with the raw one
wn.sqrt.delay <- carrier.delays$sqrt.CARRIER_DELAY[carrier.delays$CARRIER == "WN"]
qqnorm(wn.sqrt.delay)
qqline(wn.sqrt.delay, col = "blue")
#Two-sample t-test of Delta vs. Southwest carrier delays.
#The test is run on the raw minutes; the square-root column is only used by the
#Q-Q plot above.
with(
  carrier.delays,
  t.test(x = CARRIER_DELAY[CARRIER == "DL"], y = CARRIER_DELAY[CARRIER == "WN"])
)
##
## Welch Two Sample t-test
##
## data: CARRIER_DELAY[CARRIER == "DL"] and CARRIER_DELAY[CARRIER == "WN"]
## t = -0.22, df = 108555, p-value = 0.8259
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.2123855 0.1695152
## sample estimates:
## mean of x mean of y
## 2.631062 2.652497
#Point-range plot of the mean carrier delay for each of the two airlines, with
#the interval computed by mean_cl_normal
ggplot(data = carrier.delays, mapping = aes(x = CARRIER_ID, y = CARRIER_DELAY)) +
  stat_summary(fun.data = mean_cl_normal, geom = "pointrange", color = "red") +
  labs(
    x = "",
    y = "Average Carrier Delay (min)",
    title = "Mean and 95% CI, Average Carrier Delay"
  )
##### Summary The p-value (0.83) indicates that we cannot reject the null hypothesis of equal mean carrier delays. We cannot conclude that Southwest Airlines, on average, experienced longer carrier delays than Delta Airlines during November 2014.
#Creates dataset with values for Weather Delay
#NOTE(review): the numeric NAs in flights were already replaced by 0 earlier in
#the script, so this filter presumably keeps every row -- confirm that is intended
weather.delays1 <- flights[complete.cases(flights[, "WEATHER_DELAY"]), ]
#Summarizes data by origin airport with flight volume and average weather delay
weather.delays.summary <- ddply(
  weather.delays1,
  "ORIGIN_NAME",
  summarise,
  origin.sum = length(ORIGIN),
  avg.delay = mean(WEATHER_DELAY)
)
#Reorders the airport levels by average delay so the plot axis comes out sorted
weather.delays.summary$ORIGIN_NAME <- reorder(
  weather.delays.summary$ORIGIN_NAME,
  weather.delays.summary$avg.delay
)
#Keeps only the airports with more than 2000 flights in the dataset
weather.delays.summary <- weather.delays.summary[weather.delays.summary$origin.sum > 2000, ]
#Creates plot of average weather delay by origin airport; point size encodes
#flight volume. The title and x-axis label are corrected from the original,
#which read "by at" and omitted the word "Delay".
weather.plot <- ggplot(
  data = weather.delays.summary,
  mapping = aes(x = avg.delay, y = ORIGIN_NAME, size = origin.sum)
) +
  geom_point() +
  labs(
    x = "Average Weather Delay (mins)",
    y = "Origin Airport",
    title = "Average Weather Delay by Originating Airport",
    size = "Flight Volume"
  )
weather.plot
Does Detroit have significantly more weather delays than Minneapolis?
#Subsets the flights to those departing from Minneapolis (MSP) or Detroit (DTW)
weather.delays <- flights[flights$ORIGIN %in% c("MSP", "DTW"), ]
#Two-sample t-test comparing the weather delays of flights leaving the two airports
with(
  weather.delays,
  t.test(x = WEATHER_DELAY[ORIGIN == "MSP"], y = WEATHER_DELAY[ORIGIN == "DTW"])
)
##
## Welch Two Sample t-test
##
## data: WEATHER_DELAY[ORIGIN == "MSP"] and WEATHER_DELAY[ORIGIN == "DTW"]
## t = -3.7203, df = 12716.25, p-value = 0.0001999
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.7941705 -0.2460796
## sample estimates:
## mean of x mean of y
## 0.3521466 0.8722717
#Point-range plot of the mean weather delay for flights originating from
#Minneapolis and Detroit, with the interval computed by mean_cl_normal
ggplot(data = weather.delays, mapping = aes(x = ORIGIN_NAME, y = WEATHER_DELAY)) +
  stat_summary(fun.data = mean_cl_normal, geom = "pointrange", color = "steelblue") +
  labs(
    x = "",
    y = "Average Weather Delay (min)",
    title = "Mean and 95% CI, Average Weather Delay"
  )
##### Summary Based on the evidence, we can say that flights departing Detroit Metropolitan Airport experienced longer weather delays on average than flights departing Minneapolis during November 2014 (about 0.87 vs. 0.35 minutes per flight, p < 0.001).
Examination of total flight volume during the course of a day for Wednesdays in November 2014
#The four Wednesdays of November 2014, in calendar order
wednesday.dates <- c("11/5/2014", "11/12/2014", "11/19/2014", "11/26/2014")
#Creates a dataframe of flights that occur on those Wednesdays
wednesdays <- flights[flights$FL_DATE %in% wednesday.dates, ]
#Creates the hourly bin edges (hhmm clock values 0, 100, ..., 2400)
time.bins <- seq(from = 0, to = 2400, by = 100)
#Creates time bin labels for each hour of the day
time.labels <- c(
  "12AM", "1AM", "2AM", "3AM", "4AM", "5AM", "6AM", "7AM", "8AM", "9AM", "10AM", "11AM",
  "12PM", "1PM", "2PM", "3PM", "4PM", "5PM", "6PM", "7PM", "8PM", "9PM", "10PM", "11PM"
)
#Numeric NAs in flights were replaced by 0 earlier in the script, so a departure
#time of 0 is treated as missing here; the original right-closed bins excluded
#these rows as well.
dep.time <- wednesdays$DEP_TIME
dep.time[which(dep.time == 0)] <- NA
#Assigns each flight to its hour. Bins are closed on the left so an on-the-hour
#departure (e.g. 700) is labelled with its own hour ("7AM") instead of the
#previous one; include.lowest = TRUE keeps a value of exactly 2400 in the last bin.
wednesdays$DEP_BINS <- cut(
  dep.time,
  breaks = time.bins,
  labels = time.labels,
  right = FALSE,
  include.lowest = TRUE
)
#Removes flights that don't fall into time bins
wednesdays <- wednesdays[!is.na(wednesdays$DEP_BINS), ]
#Creates a summary table divided by Date and Time, with aggregated flight volume
wednesdays.summary <- ddply(
  wednesdays,
  c("FL_DATE", "DEP_BINS"),
  summarise,
  flight.volume = length(DEP_BINS)
)
#Orders the dates chronologically rather than alphabetically
wednesdays.summary[["FL_DATE"]] <- factor(wednesdays.summary[["FL_DATE"]], levels = wednesday.dates)
#Creates plot showing flight volume across 4 Wednesdays in November 2014
wed.plot <- ggplot(
  data = wednesdays.summary,
  mapping = aes(x = DEP_BINS, y = flight.volume, color = FL_DATE, group = FL_DATE)
) +
  geom_line() +
  theme(axis.text.x = element_text(angle = 90, vjust = 1, hjust = 1)) +
  labs(
    x = "Time of Day",
    y = "Total Flight Volume",
    title = "Flight Volume over course of Day for Wednesdays in November 2014",
    color = "Date"
  )
wed.plot
##### Summary I chose the four Wednesdays in November 2014 in order to compare flight volume patterns on the day before Thanksgiving (11/26/2014) with those on ordinary Wednesdays. I had initially postulated that there would be heightened flight volume throughout the day on 11/26. However, the plot shows that through most of the day, the flight volume is typical for Wednesdays in November. A notable divergence begins at 7PM: from then on, flight volume on the day before Thanksgiving is markedly higher than on the other Wednesdays.
Examine Weather Delays as a Function of Latitude
#Removes flights with no origin latitude
latitude <- flights[complete.cases(flights[, "ORIGIN_LAT"]), ]
#Removes unwanted characters and transforms latitudes to numeric.
#NOTE(review): the pattern also strips a minus sign, so a negative latitude
#would be read as positive.
latitude$ORIGIN_LAT_CLEAN <- as.numeric(gsub("[^0-9.]", "", as.character(latitude$ORIGIN_LAT)))
#Keeps rows with a usable positive latitude. The NA check is needed because an
#NA in a logical row subscript does not drop the row: it inserts an all-NA row,
#which later surfaces as "rows containing missing values" warnings in the plots.
valid.lat <- !is.na(latitude$ORIGIN_LAT_CLEAN) & latitude$ORIGIN_LAT_CLEAN > 0
latitude <- latitude[valid.lat, ]
#Reorders latitudes from south to north
latitude <- latitude[order(latitude$ORIGIN_LAT_CLEAN), ]
#Plots weather delays by latitude of originating airport, zoomed to 0-200 minutes
lat.plot <- ggplot(data = latitude, mapping = aes(x = ORIGIN_LAT_CLEAN, y = WEATHER_DELAY)) +
  geom_point(size = .5, alpha = .1) +
  coord_cartesian(ylim = c(0, 200)) +
  labs(x = "Latitude", y = "Weather Delay (mins)", title = "Flights by Weather Delay")
lat.plot
#Adds a linear trendline and zooms the y axis to 0-10 minutes so its slope is visible
lat.plot + geom_smooth(method = "lm") + coord_cartesian(ylim = c(0, 10))
#Fits a linear model of weather delay on the cleaned origin latitude and prints
#the fitted coefficients
lm.model <- lm(formula = WEATHER_DELAY ~ ORIGIN_LAT_CLEAN, data = latitude)
lm.model
##
## Call:
## lm(formula = WEATHER_DELAY ~ ORIGIN_LAT_CLEAN, data = latitude)
##
## Coefficients:
## (Intercept) ORIGIN_LAT_CLEAN
## -0.47261 0.02214
#Displays the residual summary, coefficient table and fit statistics of the model
#The slope is the estimated change in weather delay (minutes) per additional degree of latitude north;
#the recorded fit gives about 0.022 minutes (roughly 1.3 seconds) per degree.
summary(lm.model)
##
## Call:
## lm(formula = WEATHER_DELAY ~ ORIGIN_LAT_CLEAN, data = latitude)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.11 -0.43 -0.33 -0.26 689.53
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.472609 0.055428 -8.526 <2e-16 ***
## ORIGIN_LAT_CLEAN 0.022141 0.001497 14.787 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 5.97 on 461735 degrees of freedom
## (317 observations deleted due to missingness)
## Multiple R-squared: 0.0004733, Adjusted R-squared: 0.0004712
## F-statistic: 218.7 on 1 and 461735 DF, p-value: < 2.2e-16
#Displays the confidence intervals (2.5% and 97.5% bounds) for the intercept and slope
confint(lm.model)
## 2.5 % 97.5 %
## (Intercept) -0.58124695 -0.36397084
## ORIGIN_LAT_CLEAN 0.01920603 0.02507542
#Attaches the model's per-observation columns (including the residuals) to the data
latitude.augmented <- augment(lm.model, data = latitude)
#Scatter plot of the residuals against latitude
res.plot <- ggplot(
  latitude.augmented,
  mapping = aes(x = ORIGIN_LAT_CLEAN, y = .resid)
) +
  geom_point(size = .5, alpha = .1) +
  labs(x = "Latitude", y = "Residuals", title = "Residual Plot")
res.plot
##### Summary There is a small but statistically significant positive relationship between weather delays and latitude. We would expect a rather weak relationship, because climate is driven by much more than latitude alone; indeed, latitude explains well under 1% of the variation in weather delays (R-squared of about 0.0005). Still, on average, the farther north the airport of origin, the longer the anticipated weather delay.