load("~/Documents/School/STA 141/Assignment 1/winterDelays.rda")
library(calibrate) # to label data points on some plots
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.1.1
library(RColorBrewer) # for purty colors
# Report the size of the data set as c(number of rows, number of columns).
c(nrow(winterDelays), ncol(winterDelays))
There are 1,961,489 flights in this data set.
# Count flights per carrier and show the largest count together with its carrier code.
local({
  carrierTally <- table(winterDelays$UNIQUE_CARRIER)
  carrierTally[order(carrierTally, decreasing = TRUE)][1]
})
With 354,963 flights, Southwest Airlines (WN) has the most flights in this data set.
# Rank carriers and origin airports by number of flights, busiest first. The top 10 carriers
# and top 20 origins are read off the front of these two tables.
carrierCount <- table(winterDelays$UNIQUE_CARRIER)
carrierCount <- carrierCount[order(carrierCount, decreasing = TRUE)]
originCount <- table(winterDelays$ORIGIN)
originCount <- originCount[order(originCount, decreasing = TRUE)]
The 20 airports with the largest number of flights are ATL, ORD, DFW, DEN, LAX, IAH, PHX, SFO, CLT, LAS, DTW, EWR, MSP, MCO, SLC, JFK, BOS, BWI, LGA, SEA
The 10 airlines with the largest number of flights are WN, EV, DL, OO, AA, UA, MQ, US, B6, FL
# Two-way table of flight counts: one row per origin airport, one column per carrier.
# Origin/carrier combinations that never occur in the data come out as NA.
threeTable <- tapply(
  winterDelays$FLIGHTS,
  list(winterDelays$ORIGIN, winterDelays$UNIQUE_CARRIER),
  sum
)
# Keep only the 20 busiest origins (rows) and the 10 busiest carriers (columns),
# in the ranked order established by originCount and carrierCount.
threeTableNew <- threeTable[names(originCount)[1:20], names(carrierCount)[1:10]]
## WN EV DL OO AA UA MQ US B6 FL
## ATL 3369 29393 63957 558 1656 260 1992 1734 NA 17897
## ORD NA 17570 1852 8740 15653 18113 25434 2319 479 NA
## DFW NA 1633 1638 2005 50114 1326 29051 2163 337 NA
## DEN 17944 4975 2094 15603 1636 14383 756 1498 324 285
## LAX 11797 NA 6104 19803 9582 9268 2756 1921 1032 293
## IAH NA 25998 772 6463 1363 21311 793 1809 NA NA
## PHX 18698 15 2249 7006 1906 2065 56 18814 229 64
## SFO 5063 NA 2490 16838 3457 14453 NA 1658 1276 295
## CLT NA 1732 1600 193 664 99 1713 28284 479 644
## LAS 23477 NA 3731 2446 3115 3931 NA 2081 1152 421
## DTW 1874 7449 14912 1104 743 231 1561 1098 NA 676
## EWR 2045 15783 1312 NA 1136 14783 848 1413 2232 NA
## MSP 2334 2350 16429 8173 1037 882 1094 1334 NA 482
## MCO 9471 50 5465 NA 3168 3869 NA 2942 6148 5506
## SLC 3572 123 9700 18149 579 343 492 664 360 NA
## JFK NA 426 6375 NA 4674 1429 2279 901 13363 NA
## BOS 2175 860 3061 NA 3376 3697 NA 5693 11041 1172
## BWI 18762 920 2167 NA 934 1081 574 1510 491 4518
## LGA 1834 876 7885 1 4918 2446 5571 3985 2039 1284
## SEA 3442 NA 2636 1863 1523 2980 NA 1022 451 NA
# Mean arrival delay for November (MONTH 11) and December (MONTH 12); missing delays ignored.
with(winterDelays, mean(ARR_DELAY[MONTH == 11], na.rm = TRUE))
with(winterDelays, mean(ARR_DELAY[MONTH == 12], na.rm = TRUE))
Yes. The mean arrival delay for November was -0.1247 minutes, whereas the mean arrival delay for December was 6.893 minutes.
# Three measures of the centre of the arrival-delay distribution; missing delays ignored.
with(winterDelays, mean(ARR_DELAY, na.rm = TRUE))
with(winterDelays, median(ARR_DELAY, na.rm = TRUE))
# mode: the delay value that occurs most often (returned as a character label)
local({
  delayTally <- table(winterDelays$ARR_DELAY)
  names(delayTally)[order(delayTally, decreasing = TRUE)[1]]
})
The mean arrival delay in this dataset is 3.3255 minutes. The median arrival delay is -5 minutes. The mode is -9 minutes.
# Kernel density estimate of arrival delays (missing delays dropped), then plot the curve.
delayDensity <- density(winterDelays$ARR_DELAY, na.rm = TRUE)
plot(
  delayDensity,
  main = "5. Density Distribution of Arrival Delays",
  xlab = "Arrival Delay (minutes)"
)
It’s clear looking at the density function’s plot that most of the data centers around 0, so most flights had little to no delay. However, there are some substantial arrival delays in the data, so I would say that the mean is the better measure for characterizing the center of distribution. The mean is fairly close to zero while respecting the presence of larger values.
# Arrival delays of carrier UA flights departing SFO on days 6 and 7 of the week.
# The subset is extracted once and reused for both summary statistics.
uaSfoWeekendDelay <- with(
  winterDelays,
  ARR_DELAY[UNIQUE_CARRIER == 'UA' & ORIGIN == 'SFO' & DAY_OF_WEEK %in% c(6, 7)]
)
mean(uaSfoWeekendDelay, na.rm = TRUE)
sd(uaSfoWeekendDelay, na.rm = TRUE)
The mean is .9391 minutes with a standard deviation of 36.7625 minutes.
The best way to display the distributions of arrival delays for each month is by using a boxplot.
# Group the arrival delays by month; split() names each group after its MONTH value.
delaysByMonth <- with(winterDelays, split(ARR_DELAY, MONTH))
# Label every group from its own month number instead of by position, so the labels stay
# correct whatever order split() returns the groups in (e.g. if MONTH is stored as text).
names(delaysByMonth) <- month.name[as.integer(names(delaysByMonth))]
# Show the months in winter-season order, skipping any month absent from the data.
winterOrder <- intersect(c("November", "December", "January", "February"), names(delaysByMonth))
# ylim clips the view to +/- 50 minutes; delays outside that range are simply not drawn.
boxplot(delaysByMonth[winterOrder], ylim = c(-50, 50), ylab = "Arrival Delay (minutes)",
        main = "7. Arrival Delays per Month")
Because the distribution of the arrival delays is so right-skewed, it makes sense to “zoom in” on the box plot in order to see the monthly distributions more accurately (see the plot in question 5). The plot’s y-axis is limited to the range from 50 minutes early to 50 minutes late. From this, we can see that flights in December tend to have longer delays than flights in the other months.
An array of the number of flights for each origin airport was already created for question 3. The array is sorted in descending order. Recreating the array here:
# Flights per origin airport, ordered from the busiest airport to the least busy.
originCount <- table(winterDelays$ORIGIN)
originCount <- originCount[order(originCount, decreasing = TRUE)]
Because there are 307 airports in the data, we should highlight a select number of them for clarity, so let’s highlight the top five airports, every 20 up to rank 100, then every 50 airports after that.
# Ranks of the airports to annotate, and the airport codes found at those ranks.
eightLabel <- c(1:5, 20, 40, 60, 80, 100, 150, 200, 250, 300)
eightLabelNames <- names(originCount)[eightLabel]
# TRUE for the airports that get a label; those are drawn as red filled dots (pch 19),
# every other airport as a small black point (pch 46).
isLabeled <- names(originCount) %in% eightLabelNames
plot(
  originCount,
  type = "o",
  pch = ifelse(isLabeled, 19, 46),
  col = ifelse(isLabeled, "red", "black"),
  main = "8. Number of Flights for Each Airport",
  xlab = "Origin Airports, Ranked by Number of Flights",
  ylab = "Number of Flights"
)
# Write the airport code next to each highlighted point.
textxy(eightLabel, originCount[eightLabel], eightLabelNames, cex = 0.6, offset = .8, col = "red")
Something to note about this plot is that, when ranking the airports by number of flights, it resembles an exponential function - if you looked at it in the mirror. The top airports have a massive share of the total flights.
# Number of flights on each day of the week.
dayCount <- table(winterDelays$DAY_OF_WEEK)
# DAY_OF_WEEK holds integers, so relabel the table entries with day abbreviations
# (assigned in table order: first entry MON, last entry SUN).
names(dayCount) <- c("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
# Ascending order, so the quietest day prints first and the busiest last.
sort(dayCount)
## SAT SUN TUE MON WED FRI THU
## 235456 274382 275181 286929 288557 292745 308239
# One colour per day of the week; the same palette is reused by the later per-day plots.
plotNineColors <- c(
  "skyblue2", "chartreuse4", "darkseagreen3", "yellow4", "orange", "lightpink", "grey60"
)
# Thick vertical lines (type "h", lwd 10) act as bars, one per day.
plot(
  dayCount,
  type = "h",
  lwd = 10,
  col = plotNineColors,
  main = "9. Flights by Day",
  xlab = "Day of Week",
  ylab = "Number of Flights"
)
Looking at the raw numbers as well as at the graph, the number of flights on weekdays is not much higher than on weekends, even though Saturday and Sunday are the two days with the lowest number of flights. The day with the highest number of flights is Thursday. Of course, this depends on your definition of “many more” flights.
# A flight is considered delayed when ARR_DELAY is greater than zero. Count the delayed
# flights for each day of the week.
# subset() is used for the filter because it drops records whose ARR_DELAY is missing;
# indexing rows with a logical vector that contains NA would instead add all-NA rows.
delayPerDay <- with(subset(winterDelays, ARR_DELAY > 0), table(DAY_OF_WEEK))
# Relabel the integer day codes with day abbreviations (table order: MON first, SUN last).
names(delayPerDay) <- c("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
# Largest number of delayed flights first.
sort(delayPerDay, decreasing = TRUE)
## THU FRI MON WED SUN TUE SAT
## 115142 108652 106163 104861 97596 94570 76932
# Bar-style plot of delayed flights per day. The default y axis is suppressed (yaxt = "n")
# so that it can be drawn by hand below with plain, non-scientific tick labels.
plot(
  delayPerDay,
  type = "h",
  lwd = 10,
  col = plotNineColors,
  yaxt = "n",
  main = "10. Delayed Flights by Day",
  xlab = "Day of Week",
  ylab = "Number of Delayed Flights"
)
# Raise the penalty for scientific notation so the axis labels print as ordinary numbers.
options(scipen = 10)
axis(2, at = seq(0, 100000, by = 20000))
Thursday has the highest number of delayed flights. Of course, this may be deceptive as Thursday is also the day with the highest number of flights (delayed or not). It’s better to express this as a percentage.
# Share of each day's flights that arrived late, expressed as a percentage (0-100) so the
# values agree with the variable name and with the y-axis label of the plot below.
# The denominator is every flight on that day, as counted in dayCount.
delayPerDayPct <- 100 * delayPerDay / dayCount
# Put the counts and the percentages side by side, one row per day.
delayPerDayNew <- cbind(delayPerDay, delayPerDayPct)
# Bar-style plot of the percentage of delayed flights per day.
plot(delayPerDayPct, type = "h", col = plotNineColors, lwd = 10, xlab = "Day of Week",
     ylab = "Percentage of Flights Delayed", main = "10a. Percentage of Delayed Flights on Each Day")
With 37.3548% of flights delayed, Thursday has the highest percentage of delayed flights. In fact, the ranking of days with the most delayed flights is exactly the same as the ranking of days with the highest percentage of delayed flights. But it doesn’t hurt to be thorough. However, the percentages of delayed flights are much closer to each other than the counts of delayed flights are.
# Median arrival delay for each day of the week (missing delays ignored).
medDelayPerDay <- tapply(
  winterDelays$ARR_DELAY,
  list(winterDelays$DAY_OF_WEEK),
  median,
  na.rm = TRUE
)
# Relabel the integer day codes with day abbreviations, then list the largest median first.
names(medDelayPerDay) <- c("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
sort(medDelayPerDay, decreasing = TRUE)
At -4 minutes delayed (or 4 minutes early), Monday, Thursday, and Friday have the largest median overall delay.
# 0.9 quantile of arrival delay for each day of the week (missing delays ignored):
# the delay that 90% of that day's flights did not exceed.
ninetyDelayPerDay <- tapply(
  winterDelays$ARR_DELAY,
  list(winterDelays$DAY_OF_WEEK),
  quantile,
  probs = 0.9,
  na.rm = TRUE
)
# Relabel the integer day codes with day abbreviations, then list the largest value first.
names(ninetyDelayPerDay) <- c("MON", "TUE", "WED", "THU", "FRI", "SAT", "SUN")
sort(ninetyDelayPerDay, decreasing = TRUE)
Wednesday has the highest overall delay (34 minutes late) when comparing the 90th quantiles for each day.
# Ten busiest airports, once ranked as an origin and once ranked as a destination.
# originCount is the busiest-first table of flights per origin built earlier.
originTopTen <- names(originCount)[1:10]
destCount <- table(winterDelays$DEST)
destCount <- destCount[order(destCount, decreasing = TRUE)]
destTopTen <- names(destCount)[1:10]
## [1] "ATL" "ORD" "DFW" "DEN" "LAX" "IAH" "PHX" "SFO" "CLT" "LAS"
## [1] "ATL" "ORD" "DFW" "DEN" "LAX" "IAH" "PHX" "SFO" "CLT" "LAS"
Note that the top ten origin and destination airports are the same.
# Flights whose origin is a top-ten origin airport or whose destination is a top-ten
# destination airport (either end is enough).
busyAirportFlights <- subset(winterDelays, ORIGIN %in% originTopTen | DEST %in% destTopTen)
# Median arrival delay for every origin-destination pair among those flights.
medianDelayPerPair <- aggregate(
  busyAirportFlights$ARR_DELAY,
  list(busyAirportFlights$ORIGIN, busyAirportFlights$DEST),
  median,
  na.rm = TRUE
)
names(medianDelayPerPair) <- c("ORIGIN", "DEST", "medianDelay")
# Show the routes with the largest median delay (head() prints the first six).
worstFirst <- order(medianDelayPerPair$medianDelay, decreasing = TRUE)
head(medianDelayPerPair[worstFirst, ])
## ORIGIN DEST medianDelay
## 838 ORD GJT 124
## 895 CLT HSV 120
## 214 LAX BDL 79
## 355 HSV CLT 34
## 623 COU DFW 22
## 837 LAX GJT 22
Of all routes involving at least one of the ten airports with the highest number of flights, the above six routes have the worst median delay.
# Smoothed density scatterplot of arrival delay against flight distance.
# ylim restricts the visible delays to the range -30 to 300 minutes.
with(
  winterDelays,
  smoothScatter(
    DISTANCE, ARR_DELAY,
    nbin = 200,
    ylim = c(-30, 300),
    main = "13. Relationship Between Distance and Delay",
    xlab = "Flight Distance (miles)",
    ylab = "Arrival Delay (minutes)"
  )
)
Looking at this smoothed scatterplot, there does not seem to be a significant relationship between flight distance and delay time. Any significant delays are outliers of the data. Because of the high variance in arrival delays, we “zoom in” on this scatter plot to better see any relationship between the two variables.
# Correlation between distance and arrival delay, using only the flights where both
# values are present ("complete.obs").
corDistDelay <- with(winterDelays, cor(DISTANCE, ARR_DELAY, use = "complete.obs"))
The correlation coefficient between distance and arrival delay is -0.0313591, which implies a minuscule negative correlation.
Using mean since really long delays should have some more weight than they would in a median, as those are the delays people notice more than a flight that gets delayed by, say, two minutes.
# Mean arrival delay for each scheduled-departure time block (missing delays ignored).
meanDelayPerHour <- with(winterDelays, tapply(ARR_DELAY, list(DEP_TIME_BLK), mean, na.rm = TRUE))
# Draw the means in time-block order; the default x axis is suppressed so the block
# names can be used as tick labels instead of index numbers.
plot(meanDelayPerHour, type = "o", xaxt = "n", xlab = "Time",
     ylab = "Mean Arrival Delay (minutes)", main = "14. Mean Arrival Delay Per Hour")
# One tick per time block actually present, rather than a hard-coded count of blocks,
# so the positions and the labels always have the same length.
axis(1, at = seq_along(meanDelayPerHour), labels = names(meanDelayPerHour))
Based on the mean arrival delay, the five departure time blocks with the worst delays are the 6 PM hour (7.7134 minutes), the 8 PM hour (7.1976 minutes), the 5 PM hour (6.7434 minutes), the 7 PM hour (6.3294 minutes), and the 4 PM hour (6.1779 minutes). Basically, flights that leave in the evening have longer mean delay times.
As found in question 5, the mean arrival delay in this dataset is 3.3255 minutes. Using mean for reasons stated in question 14.
# Thanksgiving falls on a different date each year, so list the dates in the data first.
unique(winterDelays$FL_DATE)
# Holiday dates, looked up by hand: Thanksgiving 2012-11-22 and Christmas 2012-12-25.
thankChrist <- c("2012-11-22", "2012-12-25")
# Mean arrival delay for every date in the data set (missing delays ignored).
meanDelayPerDay <- tapply(
  winterDelays$ARR_DELAY,
  list(winterDelays$FL_DATE),
  mean,
  na.rm = TRUE
)
# Pick out the two holiday means by date.
meanDelayPerDay[thankChrist]
## 2012-11-22 2012-12-25
## -8.859218 14.125010
The mean delay on Thanksgiving was -8.8592 minutes, and the mean delay on Christmas was 14.125 minutes.
# Daily mean arrival delay over the whole period, with the two holidays highlighted as
# filled maroon dots; the default x axis is suppressed so dates can be used as tick labels.
isHoliday <- names(meanDelayPerDay) %in% thankChrist
plot(meanDelayPerDay, type = "o", pch = ifelse(isHoliday, 19, 46),
     col = ifelse(isHoliday, "maroon4", "slategray"), xaxt = "n",
     xlab = "Flight Date", ylab = "Mean Arrival Delay (minutes)",
     main = "15. Mean Arrival Delay By Date")
# Horizontal reference line at the overall mean arrival delay, computed from the data
# rather than typed in as a constant.
overallMeanDelay <- mean(winterDelays$ARR_DELAY, na.rm = TRUE)
abline(h = overallMeanDelay)
# One tick per date present in the data, so positions and labels always match in length.
axis(1, at = seq_along(meanDelayPerDay), labels = names(meanDelayPerDay))
# Find the holidays' x positions by date instead of assuming fixed index numbers.
holidayPos <- match(thankChrist, names(meanDelayPerDay))
textxy(holidayPos, meanDelayPerDay[holidayPos], c("Thanksgiving", "Christmas"), cex = .9,
       offset = -0.6, col = "maroon4")
# Caption for the reference line, placed slightly above it at the left edge.
textxy(0, overallMeanDelay + 0.6, "Overall Mean Delay", cex = .8, offset = 0.4, col = "darkblue")
Compared to the mean arrival delay overall, the delays are worse on Christmas, but better on Thanksgiving. In fact, flights tend to arrive early on Thanksgiving.
…You asked for it:
# Number of missing (NA) values in every column of the data set, as a named integer vector.
vapply(winterDelays, function(column) sum(is.na(column)), integer(1))
## YEAR QUARTER MONTH
## 0 0 0
## DAY_OF_MONTH DAY_OF_WEEK FL_DATE
## 0 0 0
## UNIQUE_CARRIER AIRLINE_ID CARRIER
## 0 0 0
## TAIL_NUM FL_NUM ORIGIN_AIRPORT_ID
## 0 0 0
## ORIGIN_AIRPORT_SEQ_ID ORIGIN_CITY_MARKET_ID ORIGIN
## 0 0 0
## ORIGIN_CITY_NAME ORIGIN_STATE_ABR ORIGIN_STATE_FIPS
## 0 0 0
## ORIGIN_STATE_NM ORIGIN_WAC DEST_AIRPORT_ID
## 0 0 0
## DEST_AIRPORT_SEQ_ID DEST_CITY_MARKET_ID DEST
## 0 0 0
## DEST_CITY_NAME DEST_STATE_ABR DEST_STATE_FIPS
## 0 0 0
## DEST_STATE_NM DEST_WAC CRS_DEP_TIME
## 0 0 0
## DEP_TIME DEP_DELAY DEP_DELAY_NEW
## 30721 30721 30721
## DEP_DEL15 DEP_DELAY_GROUP DEP_TIME_BLK
## 30721 30721 0
## TAXI_OUT WHEELS_OFF WHEELS_ON
## 31540 31540 32950
## TAXI_IN CRS_ARR_TIME ARR_TIME
## 32950 0 32950
## ARR_DELAY ARR_DELAY_NEW ARR_DEL15
## 35780 35780 35780
## ARR_DELAY_GROUP ARR_TIME_BLK CANCELLED
## 35780 0 0
## CANCELLATION_CODE DIVERTED CRS_ELAPSED_TIME
## 0 0 0
## ACTUAL_ELAPSED_TIME AIR_TIME FLIGHTS
## 35780 35780 0
## DISTANCE DISTANCE_GROUP CARRIER_DELAY
## 0 0 1619153
## WEATHER_DELAY NAS_DELAY SECURITY_DELAY
## 1619153 1619153 1619153
## LATE_AIRCRAFT_DELAY FIRST_DEP_TIME TOTAL_ADD_GTIME
## 1619153 1950485 1950485
## LONGEST_ADD_GTIME DIV_AIRPORT_LANDINGS DIV_REACHED_DEST
## 1950485 0 1957674
## DIV_ACTUAL_ELAPSED_TIME DIV_ARR_DELAY DIV_DISTANCE
## 1958659 1958659 1957755
## DIV1_AIRPORT DIV1_AIRPORT_ID DIV1_AIRPORT_SEQ_ID
## 0 1957250 1957250
## DIV1_WHEELS_ON DIV1_TOTAL_GTIME DIV1_LONGEST_GTIME
## 1957249 1957249 1957249
## DIV1_WHEELS_OFF DIV1_TAIL_NUM DIV2_AIRPORT
## 1958597 0 0
## DIV2_AIRPORT_ID DIV2_AIRPORT_SEQ_ID DIV2_WHEELS_ON
## 1961419 1961419 1961419
## DIV2_TOTAL_GTIME DIV2_LONGEST_GTIME DIV2_WHEELS_OFF
## 1961419 1961419 1961479
## DIV2_TAIL_NUM DIV3_AIRPORT DIV3_AIRPORT_ID
## 0 0 1961487
## DIV3_AIRPORT_SEQ_ID DIV3_WHEELS_ON DIV3_TOTAL_GTIME
## 1961487 1961487 1961487
## DIV3_LONGEST_GTIME DIV3_WHEELS_OFF DIV3_TAIL_NUM
## 1961487 1961489 1961489
## DIV4_AIRPORT DIV4_AIRPORT_ID DIV4_AIRPORT_SEQ_ID
## 1961489 1961489 1961489
## DIV4_WHEELS_ON DIV4_TOTAL_GTIME DIV4_LONGEST_GTIME
## 1961489 1961489 1961489
## DIV4_WHEELS_OFF DIV4_TAIL_NUM DIV5_AIRPORT
## 1961489 1961489 1961489
## DIV5_AIRPORT_ID DIV5_AIRPORT_SEQ_ID DIV5_WHEELS_ON
## 1961489 1961489 1961489
## DIV5_TOTAL_GTIME DIV5_LONGEST_GTIME DIV5_WHEELS_OFF
## 1961489 1961489 1961489
## DIV5_TAIL_NUM X
## 1961489 1961489
There are 30,721 NA values for each of the three variables.
# Check whether DEP_TIME, DEP_DELAY and DEP_DELAY_NEW are missing on the same records.
# Missing values have to be detected with is.na(): a test such as DEP_TIME == "NA" compares
# against the literal string "NA" and evaluates to NA for a missing value, so subset()
# never keeps the missing records it was meant to select.
depTimeMissing <- is.na(winterDelays$DEP_TIME)
depDelayMissing <- is.na(winterDelays$DEP_DELAY)
depDelayNewMissing <- is.na(winterDelays$DEP_DELAY_NEW)
# Each mask has one TRUE/FALSE per record. identical() returns TRUE only when the two
# variables are missing on exactly the same records. It is called twice because it
# compares two objects at a time.
identical(depTimeMissing, depDelayMissing)
identical(depDelayMissing, depDelayNewMissing)
# NOTE(review): the results previously recorded for these two checks ([1] FALSE and
# [1] TRUE) were produced by the == "NA" comparison; re-run to obtain the real answers.
Not every entry where DEP_TIME is missing is also missing DEP_DELAY (or the other way around). Every entry where DEP_DELAY is missing is also missing DEP_DELAY_NEW. Regardless, these missing variables do NOT correspond to the same records for each of these variables.
The best way to show the distributions of the delays is with a density function, so we have to create a density function for each time block in the data.
# One density curve of arrival delays per arrival time block, all drawn on the same axes.
timeBlock <- levels(winterDelays$ARR_TIME_BLK)
# Fixed palette with one colour per curve; the legend reuses it so curve and key agree.
# (rainbow(19) was an earlier alternative.)
colorList <- c(
  "#C47A72", "#66D13F", "#CA52CC", "#76BCC6", "#536C34", "#5C325F", "#C4AA48", "#CD3F38",
  "#CB4782", "#C3BF99", "#CAA5C6", "#404E50", "#623324", "#846ECD", "#C7752F", "#6283BA",
  "#74C872", "#76D7B1", "#C9D949"
)
# The first time block's curve opens the plot; both axis ranges are fixed by hand.
hourDensity1 <- with(subset(winterDelays, ARR_TIME_BLK == timeBlock[1]), density(ARR_DELAY, na.rm = TRUE))
plot(
  hourDensity1,
  col = colorList[1],
  xlim = c(-40, 200),
  ylim = c(0, 0.043),
  main = "18. Distributions of Arrival Delays, Subset by Time of Day",
  xlab = "Arrival Delay (minutes)"
)
# Overlay the curve of every remaining time block, each in its own palette colour.
for (i in 2:length(timeBlock)) {
  blockFlights <- subset(winterDelays, ARR_TIME_BLK == timeBlock[i])
  lines(density(blockFlights$ARR_DELAY, na.rm = TRUE), col = colorList[i])
}
# Key mapping each colour to its time block.
legend("topright", legend = timeBlock, col = colorList, lwd = 2, cex = 0.6, title = "Time of Day")
Because of the extreme right skewing of the distributions, this plot is “zoomed in” a fair amount in order to increase clarity. By looking at the peaks of the density distribution graphs, some times of day tend to be closer to being on time than others. The higher the peak, the more flights had arrival delays close to zero.
However, zero or near-zero values of arrival delays were the most common for all times of the day so the distributions are pretty similar to one another.
To get the proportion, we find the number of flights that took off late and divide by the number of flights that took off.
If DEP_DELAY == NA for a flight, we don’t include it because it could make the proportion smaller than it should be. More accurately, this is the proportion of flights we know took off late over the number of flights that we know took off. Flights that took off on time have a value DEP_DELAY == 0.
# Denominator: flights whose departure delay is known. is.na() is the reliable test for
# a missing value; comparing against the string "NA" only works through type coercion.
flightDepDenom <- subset(winterDelays, !is.na(DEP_DELAY))
# Numerator: flights that took off late. subset() drops records where DEP_DELAY is missing.
flightDepLate <- subset(winterDelays, DEP_DELAY > 0)
# Proportion of flights with a known departure delay that took off late.
nrow(flightDepLate) / nrow(flightDepDenom)
The proportion of flights that took off late was 0.3717.
# Same approach as for departures, applied to arrival delays.
# Denominator: flights whose arrival delay is known. is.na() is the reliable test for a
# missing value; comparing against the string "NA" only works through type coercion.
flightArrDenom <- subset(winterDelays, !is.na(ARR_DELAY))
flightArrLate <- subset(winterDelays, ARR_DELAY > 0)   # flights that arrived late
flightArrEarly <- subset(winterDelays, ARR_DELAY < 0)  # flights that arrived early
nrow(flightArrLate) / nrow(flightArrDenom)   # proportion of flights that arrived late
nrow(flightArrEarly) / nrow(flightArrDenom)  # proportion of flights that arrived early
The proportion of flights that arrived late was 0.3655. The proportion of flights that arrived early was 0.6113.
# Denominator: flights for which both the departure delay and the arrival delay are known.
# is.na() is the reliable test for a missing value.
flightArrDepDenom <- subset(winterDelays, !is.na(DEP_DELAY) & !is.na(ARR_DELAY))
# Numerator: flights that both took off late and arrived late.
# Only flightArrDepLate is assigned here; flightDepLate is deliberately left alone so it
# keeps meaning "every flight that took off late" for the code that uses it afterwards.
flightArrDepLate <- subset(winterDelays, DEP_DELAY > 0 & ARR_DELAY > 0)
nrow(flightArrDepLate) / nrow(flightArrDepDenom) # get proportion
The proportion of flights that both took off and arrived late was 0.259.
# The question is about planes that leave late, i.e. all flights with DEP_DELAY > 0.
# Build that subset explicitly here so the medians do not depend on whatever an earlier
# section last stored in flightDepLate.
flightDepLate <- subset(winterDelays, DEP_DELAY > 0)
with(flightDepLate, median(DEP_DELAY, na.rm = TRUE))              # median departure delay
with(flightDepLate, median(ARR_DELAY, na.rm = TRUE))              # median arrival delay
# Median of the per-flight difference: minutes made up between departure and arrival.
with(flightDepLate, median(DEP_DELAY - ARR_DELAY, na.rm = TRUE))
The median departure delay of flights that leave late is 13 minutes and the median arrival delay of those flights is 10 minutes. The median of the per-flight difference (departure delay minus arrival delay) is 6 minutes, so planes leaving late tend to arrive late, but not as late as they left: the typical late-departing flight makes up about six minutes.
# Compare the median speed (miles per minute of air time) of flights that took off exactly
# on time (DEP_DELAY equal to 0) with that of flights that took off late.
flightDepOnTime <- subset(winterDelays, DEP_DELAY == 0)
median(flightDepOnTime$DISTANCE / flightDepOnTime$AIR_TIME, na.rm = TRUE)
median(flightDepLate$DISTANCE / flightDepLate$AIR_TIME, na.rm = TRUE)
The median speed of on time flights is 6.757 miles per minute, whereas the median speed of flights that take off late is 6.7941 miles per minute. So yes, flights that take off late do fly faster to make up time, but not by much.
# All flights departing from SFO, and the codes of their five most frequent destinations
# (most frequent first).
flightFromSFO <- subset(winterDelays, ORIGIN == "SFO")
topFiveSFO <- local({
  destTally <- table(flightFromSFO$DEST)
  names(destTally)[order(destTally, decreasing = TRUE)[1:5]]
})
The five most popular destination airports for flights leaving San Francisco (SFO) are as follows: Los Angeles (LAX, 5292 flights), Las Vegas (LAS, 2732), New York City (JFK, 2634), San Diego (SAN, 2508), and Chicago (ORD, 2434).
# Restrict the SFO departures to flights bound for the five most frequent destinations.
topFiveFlightFromSFO <- subset(flightFromSFO, DEST %in% topFiveSFO, drop = TRUE)
# Flight counts by destination (rows) and distance (columns); the column name of the
# non-NA cell in a destination's row is that destination's distance from SFO.
twentyFiveTable <- tapply(
  topFiveFlightFromSFO$FLIGHTS,
  list(topFiveFlightFromSFO$DEST, topFiveFlightFromSFO$DISTANCE),
  sum
)
# Keep only the five destinations, in popularity order.
twentyFiveTable <- twentyFiveTable[topFiveSFO, ]
The distances from SFO to the top five airports are as follows:
To LAX: 337 miles. To LAS: 414 miles. To SAN: 447 miles. To ORD: 1846 miles. To JFK: 2586 miles.
These were computed by using the DISTANCE variable given by the data set. This can be done for all pairs of airports that have flights from one to another, one just has to subset the data differently.
# Density of average flight speed (DISTANCE / AIR_TIME) from SFO to its most popular
# destination; this first curve opens the plot.
speedDensity1 <- density(with(subset(topFiveFlightFromSFO, DEST == topFiveSFO[1]), (DISTANCE / AIR_TIME)),
                         na.rm = TRUE)
# Fixed colours and line types, one per curve; the route-delay plot further down reuses them.
colorList26 <- c("black", "red", "blue", "green4", "orange", "purple")
lineList26 <- c(1:6)
plot(
  speedDensity1,
  lwd = 2,
  col = colorList26[1],
  lty = lineList26[1],
  xlim = c(4, 10.5),
  main = "26. Density Distributions of Average Speeds, Subset by Destination",
  xlab = "Average Flight Speed (miles per minute)"
)
# Overlay the speed densities for the other four destinations, each with its own
# colour and line type.
for (i in 2:5) {
  flightsToDest <- subset(topFiveFlightFromSFO, DEST == topFiveSFO[i])
  destSpeed <- flightsToDest$DISTANCE / flightsToDest$AIR_TIME
  lines(density(destSpeed, na.rm = TRUE), lwd = 3, col = colorList26[i], lty = lineList26[i])
}
# Key mapping each colour / line type to its destination airport.
legend("topright", legend = topFiveSFO, col = colorList26, lty = lineList26, lwd = 2, cex = 0.7,
       title = "Destination Airport")
Looking at the density distributions of the average flight speeds for the five destination airports, there is a marked difference between the average flight speeds for the airports LAX, LAS, and SAN [Diego] and the average flight speeds for the airports JFK and ORD [Chicago]. It is probably not coincidental that JFK and ORD are much farther away from SFO than LAX, LAS, or SAN are (see question 25). JFK, the farthest airport from SFO in this set, tends to have flights with a higher speed as well. Perhaps a longer flight means being able to keep the airplane at its top speed for longer than a shorter flight.
# Airports paired with SFO (EWR is Newark, NJ). The three codes are listed twice so that
# entries 1-3 index the routes leaving SFO and entries 4-6 the routes arriving at SFO,
# which lines the loop counter up with the six colours / line types.
destSFO27 <- c("LAX", "JFK", "EWR", "LAX", "JFK", "EWR")
# Flights from SFO to those airports, and flights from those airports to SFO.
flightFromSFO27 <- subset(flightFromSFO, DEST %in% destSFO27, drop = TRUE)
flightToSFO27 <- subset(winterDelays, ORIGIN %in% destSFO27 & DEST == "SFO", drop = TRUE)
# Arrival-delay density for the first route (SFO to LAX); this curve opens the plot.
delayDensity27_1 <- density(with(subset(flightFromSFO27, DEST == destSFO27[1]), ARR_DELAY), na.rm = TRUE)
plot(
  delayDensity27_1,
  lwd = 3,
  col = colorList26[1],
  lty = lineList26[1],
  xlim = c(-50, 100),
  main = "27. Density Distributions of Arrival Delays for Certain Routes",
  xlab = "Arrival Delay (minutes)"
)
# Remaining two routes out of SFO (to JFK and to EWR).
for (i in 2:3) {
  outbound <- subset(flightFromSFO27, DEST == destSFO27[i])
  lines(density(outbound$ARR_DELAY, na.rm = TRUE),
        lwd = 3, col = colorList26[i], lty = lineList26[i])
}
# The three routes into SFO (from LAX, JFK and EWR).
for (i in 4:6) {
  inbound <- subset(flightToSFO27, ORIGIN == destSFO27[i])
  lines(density(inbound$ARR_DELAY, na.rm = TRUE),
        lwd = 3, col = colorList26[i], lty = lineList26[i])
}
# Key listing the six routes in the order they were drawn.
legend("topright",
       legend = c("SFO to LAX", "SFO to JFK", "SFO to EWR", "LAX to SFO", "JFK to SFO", "EWR to SFO"),
       col = colorList26, lty = lineList26, lwd = 2, cex = 0.7, title = "Flight Route")
Because the distributions are skewed so far to the right, the plots are “zoomed in” by a fair amount in order to clearly see the differences in distributions.
The distributions of arrival delays for commuter flights (SFO to/from LAX) are slightly different from the distributions of arrival delays for longer flights (SFO to/from JFK or EWR). A larger share of the flights between SFO and LAX were on time than of the longer flights. Also, the long flights that were going to SFO had more on-time arrivals than the long flights that were leaving SFO.
With this data, we could see which routes have more significant delays (at least 15 minutes) than others. There are dates included in the data, so we could perhaps see how the mean/median/90th quantile of arrival delays changes over time. Also, there is some data on whether flights were cancelled or not. That would be interesting to look at by seeing which times/days/etc have more cancelled flights than others.
It would be nice to have data for flights year-round, so we can see which season or which month has the worst delays. Also, if we had data on the number of passengers on each flight, we could potentially see if flights with more passengers take off or land later than flights with fewer passengers. The possibilities are endless.