# Load the 2013 NYC flights data, keeping only the variables used in the
# analysis: departure delay, arrival delay, distance and carrier.
flights <- nycflights13::flights %>%
  select(dep_delay, arr_delay, distance, carrier) %>%
  # Optional: restrict the data to the six carriers with the most flights.
  # filter(carrier %in% c('UA','B6','EV','DL','AA','MQ')) %>%
  # `as.is` is given explicitly: since R 4.0.0, omitting it raises a warning
  # and leaves strings as character. `as.is = FALSE` converts `carrier` to a
  # factor, which the per-carrier summaries rely on.
  type.convert(as.is = FALSE)
What is the expected arrival delay as a function of the departure delay, flight distance, and airline?
Each case represents a flight leaving from an airport in the New York City area (EWR, JFK and LGA) during 2013. There are 336776 cases in total.
The data is collected by the Bureau of Transportation Statistics.
This is an observational study since we have no control over the variables.
The data is collected by the Bureau of Transportation Statistics. The raw data for all available years can be found here. The 2013 data used in this study is available here.
Arrival delay (minutes): Quantitative
Airline: Qualitative
Distance (miles): Quantitative
Departure delay (minutes): Quantitative
# Column-wise summary of every variable kept in `flights`.
flights %>%
  summary()
## dep_delay arr_delay distance carrier
## Min. : -43.00 Min. : -86.000 Min. : 17 UA :58665
## 1st Qu.: -5.00 1st Qu.: -17.000 1st Qu.: 502 B6 :54635
## Median : -2.00 Median : -5.000 Median : 872 EV :54173
## Mean : 12.64 Mean : 6.895 Mean :1040 DL :48110
## 3rd Qu.: 11.00 3rd Qu.: 14.000 3rd Qu.:1389 AA :32729
## Max. :1301.00 Max. :1272.000 Max. :4983 MQ :26397
## NA's :8255 NA's :9430 (Other):62067
# Number of flights per carrier. Wrapping the column in factor() gives the
# per-level counts whether `carrier` is stored as a factor or as character
# (summary() of a character vector reports only Length/Class/Mode).
summary(factor(flights$carrier))
## 9E AA AS B6 DL EV F9 FL HA MQ OO UA
## 18460 32729 714 54635 48110 54173 685 3260 342 26397 32 58665
## US VX WN YV
## 20536 5162 12275 601
# Flights per carrier, sorted from fewest to most flights.
# Result: a data frame with columns `carrier` and `fcount`.
# count() tallies the rows directly, so this does not depend on `carrier`
# being a factor or on the auto-generated column name produced by
# as.data.frame(summary(...)).
carrsum <- flights %>%
  count(carrier, name = "fcount") %>%
  arrange(fcount) %>%
  as.data.frame()
# Horizontal bar chart of flights per carrier, one bar per row of `carrsum`.
barplot(
  height = carrsum$fcount,
  names.arg = carrsum$carrier,
  horiz = TRUE
)
# Histograms of the three quantitative variables, using hist()'s defaults.
# The calls are left exactly as written: hist() builds its default title and
# x-axis label from the argument expression (e.g. "flights$dep_delay").
hist(flights$dep_delay)
hist(flights$arr_delay)
hist(flights$distance)
# Scatterplot matrix: pairwise plots of the three quantitative variables.
pairs(
  ~ dep_delay + arr_delay + distance,
  data = flights,
  main = "Scatterplot Matrix"
)