# Data dictionary ----
# Four parallel vectors (one entry per column of the flights data) that are
# combined into the FieldDefinitions lookup table.
Field <- c(
  "year",
  "month",
  "carrier",
  "carrier_name",
  "airport",
  "airport_name",
  "arr_flights",
  "arr_del15",
  "carrier_ct",
  "weather_ct",
  "nas_ct",
  "security_ct",
  "late_aircraft_ct",
  "arr_cancelled",
  "arr_diverted",
  "arr_delay",
  "carrier_delay",
  "weather_delay",
  "nas_delay",
  "security_delay",
  "late_aircraft_delay"
)

Description <- c(
  "Year (yyyy)",
  "Month (mm)",
  "Airline carrier abbreviation",
  "Airline carrier name",
  "Airport Code",
  "Airport Name",
  "Total number of arriving flights in the observation",
  "Total number of delayed flights in the observation",
  "Number of flights delayed due to air carrier (subset of arr_del15)",
  "Number of flights delayed due to weather (subset of arr_del15)",
  "Number of flights delayed due to National Aviation System (subset of arr_del15)",
  "Number of flights delayed due to airport security (subset of arr_del15)",
  "Number of flights delayed due to a previous flight using the same aircraft being late",
  "Number of cancelled flights",
  "Number of flights diverted",
  "Arrival delay in minutes",
  "Carrier delay in minutes (subset of arr_delay)",
  "Weather delayed in minutes (subset of arr_delay)",
  "National Aviation System in minutes (subset of arr_delay)",
  "Security delay in minutes (subset of arr_delay)",
  "Aircraft delay in minutes (subset of arr_delay)"
)

# The first 6 fields are labelled qualitative, the remaining 15 quantitative.
VariableType <- rep(
  c("Qualitative", "Quantitative"),
  times = c(6, 15)
)

# Role of each field, listed as consecutive runs in field order:
# 7 independent, arr_del15 (response), 5 explanatory counts,
# 2 independent, arr_delay (response), 5 explanatory delay-minute fields.
VariableMeasure <- rep(
  c(
    "Independent", "Response", "Explanatory",
    "Independent", "Response", "Explanatory"
  ),
  times = c(7, 1, 5, 2, 1, 5)
)

FieldDefinitions <- data.frame(
  Field = Field,
  VariableType = VariableType,
  VariableMeasure = VariableMeasure,
  Description = Description
)
# Load raw data ----
# Download the flight-delay CSV from GitHub.
flights_raw <- read.csv(
  "https://raw.githubusercontent.com/dhairavc/DATA606/master/flights_delays.csv"
)

# Keep only the first 21 columns (presumably any trailing columns in the CSV
# are unused -- verify against the file), then give them clean names.
flights_raw <- flights_raw[, seq_len(21L)]
names(flights_raw) <- c(
  "year",
  "month",
  "carrier",
  "carrier_name",
  "airport",
  "airport_name",
  "arr_flights",
  "arr_del15",
  "carrier_ct",
  "weather_ct",
  "nas_ct",
  "security_ct",
  "late_aircraft_ct",
  "arr_cancelled",
  "arr_diverted",
  "arr_delay",
  "carrier_delay",
  "weather_delay",
  "nas_delay",
  "security_delay",
  "late_aircraft_delay"
)
You should phrase your research question in a way that matches up with the scope of inference your dataset allows for.
The arrival delay data provide for some interesting questions:
1. What is the biggest contributor to flight delays?
2. Which holiday month (July, November, or December) is the worst for travel?
3. What is the most efficient airport?
4. Are airports generally getting more efficient over time?
5. What is the most efficient airline?
6. Which airline has degraded performance over time? Which has improved over time?
What are the cases, and how many are there?
Each case is one airline at one airport in a given month and year, with several data points describing that combination.
n=68,153
Describe the method of data collection.
The Bureau of Transportation Statistics provides a CSV of the raw data for download.
This CSV was uploaded to GitHub and is read from there for analysis in R.
What type of study is this (observational/experiment)?
This is an observational study. The creators of the data set observed flight arrivals and recorded the total number of flights, along with several data points on the flights that arrived late.
If you collected the data, state self-collected. If not, provide a citation/link.
BUREAU OF TRANSPORTATION STATISTICS
https://www.transtats.bts.gov/OT_Delay/OT_DelayCause1.asp?pn=1
What is the response variable? Is it quantitative or qualitative?
library(dplyr)
library(kableExtra)
# Show the data-dictionary rows flagged as response variables as a styled table
FieldDefinitions %>%
  filter(VariableMeasure == "Response") %>%
  kable() %>%
  kable_styling()
| Field | VariableType | VariableMeasure | Description |
|---|---|---|---|
| arr_del15 | Quantitative | Response | Total number of delayed flights in the observation |
| arr_delay | Quantitative | Response | Arrival delay in minutes |
You should have two independent variables, one quantitative and one qualitative.
# Show the data-dictionary rows flagged as independent variables as a styled table
FieldDefinitions %>%
  filter(VariableMeasure == "Independent") %>%
  kable() %>%
  kable_styling()
| Field | VariableType | VariableMeasure | Description |
|---|---|---|---|
| year | Qualitative | Independent | Year (yyyy) |
| month | Qualitative | Independent | Month (mm) |
| carrier | Qualitative | Independent | Airline carrier abbreviation |
| carrier_name | Qualitative | Independent | Airline carrier name |
| airport | Qualitative | Independent | Airport Code |
| airport_name | Qualitative | Independent | Airport Name |
| arr_flights | Quantitative | Independent | Total number of arriving flights in the observation |
| arr_cancelled | Quantitative | Independent | Number of cancelled flights |
| arr_diverted | Quantitative | Independent | Number of flights diverted |
Provide summary statistics for each of the variables. Also include appropriate visualizations related to your research question (e.g. scatter plot, boxplots, etc). This step requires the use of R, hence a code chunk is provided below. Insert more code chunks as needed.
library(dplyr)
library(ggplot2)
library(tidyr)
# Summary statistics (min, quartiles, mean, max, NA count) for every
# quantitative column of the flights data
flights_raw %>%
  select(
    arr_flights,
    arr_del15,
    carrier_ct,
    weather_ct,
    nas_ct,
    security_ct,
    late_aircraft_ct,
    arr_cancelled,
    arr_diverted,
    arr_delay,
    carrier_delay,
    weather_delay,
    nas_delay,
    security_delay,
    late_aircraft_delay
  ) %>%
  summary()
## arr_flights arr_del15 carrier_ct weather_ct
## Min. : 1 Min. : 0.0 Min. : 0.00 Min. : 0.000
## 1st Qu.: 126 1st Qu.: 27.0 1st Qu.: 8.01 1st Qu.: 0.000
## Median : 333 Median : 71.0 Median : 20.63 Median : 1.500
## Mean : 1002 Mean : 198.1 Mean : 48.13 Mean : 6.386
## 3rd Qu.: 861 3rd Qu.: 180.0 3rd Qu.: 49.65 3rd Qu.: 5.630
## Max. :21977 Max. :6377.0 Max. :1792.07 Max. :641.540
## NA's :36 NA's :40 NA's :36 NA's :36
## nas_ct security_ct late_aircraft_ct arr_cancelled
## Min. : -0.01 Min. : 0.0000 Min. : 0.00 Min. : 0.00
## 1st Qu.: 8.60 1st Qu.: 0.0000 1st Qu.: 4.91 1st Qu.: 0.00
## Median : 25.20 Median : 0.0000 Median : 16.67 Median : 3.00
## Mean : 77.24 Mean : 0.4423 Mean : 65.91 Mean : 17.35
## 3rd Qu.: 68.55 3rd Qu.: 0.1300 3rd Qu.: 53.65 3rd Qu.: 12.00
## Max. :4091.27 Max. :80.5600 Max. :1885.47 Max. :1389.00
## NA's :36 NA's :36 NA's :36 NA's :36
## arr_diverted arr_delay carrier_delay weather_delay
## Min. : 0.000 Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 0.000 1st Qu.: 1320 1st Qu.: 397 1st Qu.: 0.0
## Median : 0.000 Median : 3745 Median : 1111 Median : 98.0
## Mean : 2.375 Mean : 11674 Mean : 3105 Mean : 541.6
## 3rd Qu.: 2.000 3rd Qu.: 10218 3rd Qu.: 2874 3rd Qu.: 442.0
## Max. :256.000 Max. :433687 Max. :196944 Max. :57707.0
## NA's :36 NA's :36 NA's :36 NA's :36
## nas_delay security_delay late_aircraft_delay
## Min. : -19 Min. : 0.00 Min. : 0
## 1st Qu.: 323 1st Qu.: 0.00 1st Qu.: 276
## Median : 1010 Median : 0.00 Median : 1057
## Mean : 3729 Mean : 17.72 Mean : 4280
## 3rd Qu.: 2996 3rd Qu.: 5.00 3rd Qu.: 3565
## Max. :238440 Max. :3194.00 Max. :148181
## NA's :36 NA's :36 NA's :36
# Bar chart of arriving vs. delayed flights per year ----
# Rows with a missing year or count are dropped before summing, so the yearly
# totals only cover complete observations.
flights_raw %>%
  select(year, arr_flights, arr_del15) %>%
  drop_na() %>%
  group_by(year) %>%
  summarize(
    ArrivingFlights = sum(arr_flights),
    DelayedFlights = sum(arr_del15)
  ) %>%
  # Reshape to long form (one row per year and count type) so the two totals
  # can be drawn as side-by-side bars. pivot_longer() replaces the superseded
  # gather(), and the columns are named explicitly instead of by position.
  pivot_longer(
    cols = c(ArrivingFlights, DelayedFlights),
    names_to = "Type",
    values_to = "NumCount"
  ) %>%
  ggplot(aes(x = year, y = NumCount, fill = Type)) +
  geom_col(position = "dodge")
# Horizontal bar chart of total arriving flights per airport ----
# Rows with a missing airport or flight count are dropped before summing.
flights_raw %>%
  select(airport, arr_flights) %>%
  drop_na() %>%
  group_by(airport) %>%
  summarise(TotalFlights = sum(arr_flights)) %>%
  ggplot(aes(x = airport, y = TotalFlights, fill = TotalFlights)) +
  geom_col() +
  # Flip the axes so the airport codes run down the vertical axis
  coord_flip()