Data Project Proposal

Data Preparation

This project uses the nycflights13 R package, created by Hadley Wickham. It contains information about all flights that departed from NYC in 2013, together with hourly weather data for the NYC airports.

library(nycflights13)
library(dplyr)
library(psych)

# Short local aliases for the two package datasets used in this analysis
nycflights <- flights
nycweather <- weather

colnames(nycflights)
##  [1] "year"           "month"          "day"            "dep_time"      
##  [5] "sched_dep_time" "dep_delay"      "arr_time"       "sched_arr_time"
##  [9] "arr_delay"      "carrier"        "flight"         "tailnum"       
## [13] "origin"         "dest"           "air_time"       "distance"      
## [17] "hour"           "minute"         "time_hour"
# We only care about the performances of airports, not individual airlines or airplanes.
# Columns are kept by name rather than by position, so the selection cannot
# silently shift if the package adds or reorders columns.
nycflights <- dplyr::select(
  nycflights,
  year, month, day,
  dep_time, sched_dep_time, dep_delay,
  arr_time, sched_arr_time, arr_delay,
  origin, dest, air_time, distance, hour
)

# We do not care about dew point, nor humidity, nor the direction of the wind.
nycweather <- dplyr::select(
  nycweather,
  origin, year, month, day, hour, temp,
  wind_speed, wind_gust, precip, pressure, visib, time_hour
)

# Combine the datasets into one large dataset: every flight picks up the
# weather row recorded for its origin airport in the same year/month/day/hour.
nycfinal <- left_join(
  nycflights,
  nycweather,
  by = c("origin", "year", "month", "day", "hour")
)

# Then filter any incomplete rows from the merging.
# complete.cases() drops a row when ANY retained column is NA, so this removes
# flights with no matching weather row as well as rows that were already
# missing a flight or weather value before the join.
nycfinal <- nycfinal[complete.cases(nycfinal), ]

Research Question

Does weather affect flight delays at the NYC airports? If so, which weather factor is associated with the most delay: wind, precipitation, or visibility?

Cases

Each case is a flight from a NYC airport (Newark, JFK, or LaGuardia) matched with the hourly weather report at that airport for the flight’s scheduled departure hour. After removing incomplete rows, there are 291,675 cases.

Data Collection

According to Hadley Wickham, data is collected from the Bureau of Transportation Statistics.

Type of Study

This is an observational study.

Data Source

The Bureau of Transportation Statistics for the year 2013.

Response

The response variable is flight delay in minutes, measured both as departure delay (dep_delay) and as arrival delay (arr_delay). It is numerical.

Explanatory

The explanatory variables are wind (wind_speed and wind_gust), precipitation (precip), and visibility (visib). All are numeric.

Relevant Summary Statistics

# Departure delay, all flights
describe(nycfinal[["dep_delay"]])
##    vars      n  mean    sd median trimmed  mad min  max range skew
## X1    1 291675 10.99 37.59     -2    2.38 5.93 -43 1301  1344 5.09
##    kurtosis   se
## X1    50.69 0.07
# Arrival delay, all flights
describe(nycfinal[["arr_delay"]])
##    vars      n mean    sd median trimmed   mad min  max range skew
## X1    1 291675 4.97 42.14     -6    -2.3 20.76 -86 1272  1358 3.84
##    kurtosis   se
## X1    32.44 0.08
# Wind gust
describe(nycfinal[["wind_gust"]])
##    vars      n  mean    sd median trimmed  mad min     max   range  skew
## X1    1 291675 12.06 14.25  11.92   11.61 5.89   0 1206.43 1206.43 66.63
##    kurtosis   se
## X1  5577.33 0.03
# Wind speed
describe(nycfinal[["wind_speed"]])
##    vars      n  mean    sd median trimmed  mad min     max   range  skew
## X1    1 291675 10.48 12.38  10.36   10.09 5.12   0 1048.36 1048.36 66.63
##    kurtosis   se
## X1  5577.33 0.02
# Precipitation
describe(nycfinal[["precip"]])
##    vars      n mean   sd median trimmed mad min  max range skew kurtosis
## X1    1 291675    0 0.01      0       0   0   0 0.38  0.38   15   283.17
##    se
## X1  0
# Visibility
describe(nycfinal[["visib"]])
##    vars      n mean   sd median trimmed mad min max range  skew kurtosis
## X1    1 291675 9.55 1.56     10    9.98   0   0  10    10 -4.11     17.5
##    se
## X1  0
# On time/early vs delayed departures
# Only a strictly positive value means a delayed flight; a delay of exactly 0
# is on time, so the test is > 0 (the same split the subsets below use).
# FALSE = on time/early, TRUE = delayed
table(nycfinal$dep_delay > 0)
## 
##  FALSE   TRUE 
## 182345 109330
# On time/early vs delayed arrivals
# Same rule as for departures: strictly positive means delayed.
# FALSE = on time/early, TRUE = delayed
table(nycfinal$arr_delay > 0)
## 
##  FALSE   TRUE 
## 177974 113701
# Departures that left late (delay strictly above zero)
nycfinal %>%
  filter(dep_delay > 0) %>%
  pull(dep_delay) %>%
  describe()
##    vars      n mean    sd median trimmed   mad min  max range skew
## X1    1 109330 36.9 51.78     18   25.94 22.24   1 1301  1300 3.62
##    kurtosis   se
## X1    27.97 0.16
# Departures that left on time or early (delay of zero or less)
nycfinal %>%
  filter(dep_delay <= 0) %>%
  pull(dep_delay) %>%
  describe()
##    vars      n  mean   sd median trimmed  mad min max range  skew kurtosis
## X1    1 182345 -4.54 3.04     -4   -4.35 2.97 -43   0    43 -0.67     0.76
##      se
## X1 0.01
# Arrivals that came in late (delay strictly above zero)
nycfinal %>%
  filter(arr_delay > 0) %>%
  pull(arr_delay) %>%
  describe()
##    vars      n  mean    sd median trimmed   mad min  max range skew
## X1    1 113701 37.92 50.91     19   27.31 20.76   1 1272  1271 3.59
##    kurtosis   se
## X1    27.56 0.15
# Arrivals that came in on time or early (delay of zero or less)
nycfinal %>%
  filter(arr_delay <= 0) %>%
  pull(arr_delay) %>%
  describe()
##    vars      n   mean    sd median trimmed   mad min max range skew
## X1    1 177974 -16.08 10.89    -15  -15.16 11.86 -86   0    86 -0.8
##    kurtosis   se
## X1     0.61 0.03

The Beaufort Scale categorizes wind types for both land and sea. According to this scale, wind of 32 MPH or more is classified as at least High Wind (also called moderate gale or near gale).

# High wind speed according to Beaufort Scale (32 MPH or more)
# NOTE(review): the maximum of 1048.36 MPH in the output below is not a
# plausible wind speed and looks like a bad reading in the weather data; it
# inflates the mean and sd of this subset. Confirm and consider excluding it.
describe(subset(nycfinal, wind_speed >= 32)$wind_speed)
##    vars   n   mean     sd median trimmed  mad   min     max   range skew
## X1    1 421 113.78 272.89  34.52   34.42 1.71 32.22 1048.36 1016.14 3.13
##    kurtosis   se
## X1     7.79 13.3
# High wind gusts: the same 32 MPH threshold applied to wind_gust
# (this is the high-gust subset, not light wind)
describe(subset(nycfinal, wind_gust >= 32)$wind_gust)
##    vars    n  mean     sd median trimmed  mad   min     max   range skew
## X1    1 1360 63.93 180.26  34.43    35.2 1.96 33.11 1206.43 1173.33 6.17
##    kurtosis   se
## X1    36.16 4.89

According to this Ask the Captain column, pilots need 600 ft, or approximately 0.11 miles, of visibility for takeoffs. Other sources found by Googling report 0.5 miles, which is the threshold used below.

# Flights with visibility at or above the 0.5 mile threshold
nycfinal %>%
  filter(visib >= 0.5) %>%
  pull(visib) %>%
  describe()
##    vars      n mean   sd median trimmed mad min max range  skew kurtosis
## X1    1 290212 9.59 1.41     10    9.98   0 0.5  10   9.5 -4.18    18.32
##    se
## X1  0
# Flights with visibility below the 0.5 mile threshold
nycfinal %>%
  filter(visib < 0.5) %>%
  pull(visib) %>%
  describe()
##    vars    n mean   sd median trimmed mad min  max range  skew kurtosis se
## X1    1 1463  0.2 0.07   0.25    0.22   0   0 0.25  0.25 -1.26     0.27  0