Workshop

Andrew Mosbo

Flights Data

library(tidyverse)

flights <- read.csv("flights.csv")
planes <- read.csv("planes.csv")
airlines <- read.csv("airlines.csv")
weather <- read.csv("weather.csv")
airports <- read.csv("airports.csv")

Merging Datasets by a Single Key

First, let’s start with adding some of the plane metadata to our flights dataset. The only common key is tail number. Be careful, as these have different names in the two datasets.

flights <- flights %>% 
  left_join(select(planes,
                   Tailnumber,
                   age,
                   seats), 
            by = c("tailnum" = "Tailnumber"))

Merging Datasets by Multiple Keys

  • Some data sets are linked by multiple keys.

  • Weather readings are taken at each airport every hour for every day of every month.

  • Each flight leaves an airport at a certain hour on a certain day in a certain month.

flights <- flights %>% 
  left_join(select(weather,
                   origin:hour,
                   wind_speed,
                   visib),
            by = c("origin","year","month","day","hour"))

Training-Testing Split

sample <- sample(c(TRUE, FALSE), nrow(flights),
                 replace=TRUE, prob=c(0.8,0.2))
train  <- flights[sample, ]
test   <- flights[!sample, ]

Logistic Regression

mod <- glm(ONTIME ~ wind_speed + seats + age + carrier + origin + 
             visib + distance + factor(month) + dest, 
           family = "binomial", data = train)

summary(mod)

Call:
glm(formula = ONTIME ~ wind_speed + seats + age + carrier + origin + 
    visib + distance + factor(month) + dest, family = "binomial", 
    data = train)

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.3544776  1.9958125  -1.180  0.23812    
wind_speed      -0.0339933  0.0008506 -39.962  < 2e-16 ***
seats           -0.0003010  0.0001056  -2.850  0.00438 ** 
age             -0.0119191  0.0010606 -11.238  < 2e-16 ***
carrierAA        0.5683144  0.0422164  13.462  < 2e-16 ***
carrierAS        0.5296095  0.1121496   4.722 2.33e-06 ***
carrierB6       -0.1966310  0.0251234  -7.827 5.01e-15 ***
carrierDL        0.4359896  0.0286972  15.193  < 2e-16 ***
carrierEV       -0.3192677  0.0267163 -11.950  < 2e-16 ***
carrierF9       -0.4909822  0.0996174  -4.929 8.28e-07 ***
carrierFL       -0.5855631  0.0618183  -9.472  < 2e-16 ***
carrierHA        0.6085308  0.1931599   3.150  0.00163 ** 
carrierMQ        0.0918150  0.0867987   1.058  0.29015    
carrierOO        0.0363974  0.4927673   0.074  0.94112    
carrierUA        0.2160660  0.0311088   6.946 3.77e-12 ***
carrierUS        0.1616788  0.0353953   4.568 4.93e-06 ***
carrierVX        0.2897841  0.0456180   6.352 2.12e-10 ***
carrierWN        0.0129393  0.0406573   0.318  0.75029    
carrierYV       -0.2850171  0.1042579  -2.734  0.00626 ** 
originJFK        0.1230425  0.0213275   5.769 7.97e-09 ***
originLGA        0.1036158  0.0208150   4.978 6.43e-07 ***
visib            0.1250408  0.0023342  53.570  < 2e-16 ***
distance         0.0011836  0.0010968   1.079  0.28055    
factor(month)2  -0.0236070  0.0228833  -1.032  0.30225    
factor(month)3   0.1013334  0.0221176   4.582 4.62e-06 ***
factor(month)4  -0.2219444  0.0219395 -10.116  < 2e-16 ***
factor(month)5   0.2276342  0.0222254  10.242  < 2e-16 ***
factor(month)6  -0.2801607  0.0219673 -12.754  < 2e-16 ***
factor(month)7  -0.3707311  0.0217846 -17.018  < 2e-16 ***
factor(month)8  -0.1410179  0.0219481  -6.425 1.32e-10 ***
factor(month)9   0.6264644  0.0237423  26.386  < 2e-16 ***
factor(month)10  0.2242118  0.0222968  10.056  < 2e-16 ***
factor(month)11  0.2156539  0.0224474   9.607  < 2e-16 ***
factor(month)12 -0.5343432  0.0222460 -24.020  < 2e-16 ***
destACK          1.9154594  1.7946759   1.067  0.28584    
destALB          2.0816990  1.8410860   1.131  0.25819    
destANC         -1.6863135  1.9439173  -0.867  0.38568    
destATL          0.8262853  1.1802273   0.700  0.48386    
destAUS          0.3165726  0.3758489   0.842  0.39963    
destAVL          1.3783152  1.3644002   1.010  0.31240    
destBDL          2.5393591  1.8710596   1.357  0.17472    
destBGR          2.1073140  1.6008231   1.316  0.18804    
destBHM          1.0145805  1.0741247   0.945  0.34488    
destBNA          1.2789133  1.1771263   1.086  0.27727    
destBOS          2.1300439  1.7921831   1.189  0.23463    
destBQN         -0.0047767  0.3203244  -0.015  0.98810    
destBTV          1.9585991  1.7128905   1.143  0.25285    
destBUF          1.8983037  1.6806582   1.130  0.25869    
destBUR         -0.8999350  0.7286727  -1.235  0.21682    
destBWI          1.8988421  1.8094498   1.049  0.29399    
destBZN         -0.4765748  0.4410424  -1.081  0.27989    
destCAE         -0.0651013  1.3660635  -0.048  0.96199    
destCAK          1.6232223  1.5804449   1.027  0.30439    
destCHO          1.9364420  1.7262358   1.122  0.26196    
destCHS          1.4563900  1.3106317   1.111  0.26648    
destCLE          1.6025232  1.5530398   1.032  0.30214    
destCLT          1.3475143  1.4135797   0.953  0.34046    
destCMH          1.8633849  1.4895091   1.251  0.21093    
destCRW          1.6288815  1.6283326   1.000  0.31715    
destCVG          1.4470572  1.3724770   1.054  0.29173    
destDAY          1.5221980  1.4129228   1.077  0.28133    
destDCA          1.6239015  1.7728241   0.916  0.35967    
destDEN         -0.1163944  0.2786694  -0.418  0.67618    
destDFW          0.4636676  0.5111671   0.907  0.36437    
destDSM          0.7134808  0.8921471   0.800  0.42386    
destDTW          1.6017290  1.4605990   1.097  0.27281    
destEGE          0.0983937  0.2713974   0.363  0.71695    
destEYW         -0.2393052  0.9339377  -0.256  0.79777    
destFLL          0.6440851  0.8408011   0.766  0.44365    
destGRR          1.1620979  1.3361366   0.870  0.38444    
destGSO          1.5810666  1.5079766   1.048  0.29442    
destGSP          1.2973588  1.3476719   0.963  0.33571    
destHDN         -0.4379989  0.6142525  -0.713  0.47581    
destHNL         -3.9184714  3.4581552  -1.133  0.25717    
destHOU          0.2940594  0.4680312   0.628  0.52981    
destIAD          1.8908753  1.7590010   1.075  0.28239    
destIAH          0.2897909  0.4810385   0.602  0.54689    
destILM          1.5492579  1.4811934   1.046  0.29558    
destIND          1.4812100  1.2898361   1.148  0.25082    
destJAC         -2.1294926  0.6564142  -3.244  0.00118 ** 
destJAX          0.9898429  1.1028006   0.898  0.36941    
destLAS         -0.4870440  0.4871093  -1.000  0.31738    
destLAX         -0.9001742  0.7264647  -1.239  0.21530    
destLEX          8.3304915 26.6986095   0.312  0.75503    
destLGB         -0.4147626  0.7246899  -0.572  0.56710    
destMCI          0.6253065  0.8061460   0.776  0.43794    
destMCO          0.9945294  0.9771929   1.018  0.30880    
destMDW          0.9115183  1.2181000   0.748  0.45427    
destMEM          0.8949714  0.9631543   0.929  0.35278    
destMHT          2.0370087  1.7703899   1.151  0.24990    
destMIA          0.7013288  0.8197303   0.856  0.39224    
destMKE          0.9813521  1.2033362   0.816  0.41477    
destMSN          0.9459269  1.1269226   0.839  0.40125    
destMSP          0.9034802  0.8968555   1.007  0.31375    
destMSY          0.5968978  0.7248384   0.823  0.41023    
destMTJ          0.5190114  0.8693080   0.597  0.55048    
destMVY          2.5115308  1.8261249   1.375  0.16903    
destMYR          2.1058418  1.4284912   1.474  0.14044    
destOAK         -0.7404328  0.8488198  -0.872  0.38304    
destOKC         -0.2148155  0.5738258  -0.374  0.70814    
destOMA          0.6805137  0.7646681   0.890  0.37349    
destORD          1.1563500  1.2095692   0.956  0.33907    
destORF          1.7642771  1.6880158   1.045  0.29594    
destPBI          0.6597384  0.8850313   0.745  0.45601    
destPDX         -1.0137702  0.7062896  -1.435  0.15119    
destPHL          1.7099359  1.9038039   0.898  0.36910    
destPHX         -0.5126188  0.3919070  -1.308  0.19087    
destPIT          1.8864305  1.6422297   1.149  0.25068    
destPSE         -0.0933305  0.3040984  -0.307  0.75891    
destPSP         -0.6499558  0.8641174  -0.752  0.45195    
destPVD          1.6889737  1.8228366   0.927  0.35415    
destPWM          1.7866043  1.7015836   1.050  0.29373    
destRDU          1.6448492  1.5406782   1.068  0.28569    
destRIC          1.5570387  1.6917183   0.920  0.35737    
destROC          1.9457165  1.7216562   1.130  0.25842    
destRSW          0.8955244  0.8370918   1.070  0.28471    
destSAN         -0.8435967  0.6956629  -1.213  0.22526    
destSAT          0.1013231  0.3230161   0.314  0.75377    
destSAV          1.2066040  1.2241135   0.986  0.32428    
destSBN          1.1479210  1.4543493   0.789  0.42994    
destSDF          1.3837362  1.2943255   1.069  0.28503    
destSEA         -0.6666294  0.6709239  -0.994  0.32042    
destSFO         -1.0323118  0.8450454  -1.222  0.22186    
destSJC         -0.5221374  0.8413022  -0.621  0.53484    
destSJU          0.2251598  0.2937428   0.767  0.44337    
destSLC         -0.2468997  0.2443601  -1.010  0.31231    
destSMF         -1.1573469  0.7914975  -1.462  0.14368    
destSNA         -0.3869076  0.7046445  -0.549  0.58295    
destSRQ          0.9807126  0.8747311   1.121  0.26222    
destSTL          0.8969032  1.0430718   0.860  0.38986    
destSTT          0.2173324  0.3024390   0.719  0.47239    
destSYR          1.9899992  1.7816718   1.117  0.26403    
destTPA          0.7996869  0.9119871   0.877  0.38056    
destTUL         -0.2730749  0.6884025  -0.397  0.69160    
destTVC          1.9002239  1.3247542   1.434  0.15146    
destTYS          0.8937593  1.3083328   0.683  0.49453    
destXNA          0.7793279  0.7731967   1.008  0.31349    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 294533  on 218102  degrees of freedom
Residual deviance: 281140  on 217966  degrees of freedom
  (51407 observations deleted due to missingness)
AIC: 281414

Number of Fisher Scoring iterations: 6

Variable Selection

anova(mod)
Analysis of Deviance Table

Model: binomial, link: logit

Response: ONTIME

Terms added sequentially (first to last)

               Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
NULL                          218102     294533              
wind_speed      1   1056.5    218101     293476 < 2.2e-16 ***
seats           1    331.8    218100     293144 < 2.2e-16 ***
age             1     12.6    218099     293132 0.0003827 ***
carrier        15   2738.1    218084     290394 < 2.2e-16 ***
origin          2    154.2    218082     290239 < 2.2e-16 ***
visib           1   3402.3    218081     286837 < 2.2e-16 ***
distance        1     12.0    218080     286825 0.0005265 ***
factor(month)  11   4490.1    218069     282335 < 2.2e-16 ***
dest          103   1195.0    217966     281140 < 2.2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Variable Selection

mod <- update(mod, .~. - dest - seats)
summary(mod)

Call:
glm(formula = ONTIME ~ wind_speed + age + carrier + origin + 
    visib + distance + factor(month), family = "binomial", data = train)

Coefficients:
                  Estimate Std. Error z value Pr(>|z|)    
(Intercept)     -2.901e-01  3.498e-02  -8.292  < 2e-16 ***
wind_speed      -3.368e-02  8.472e-04 -39.757  < 2e-16 ***
age             -1.442e-02  1.033e-03 -13.955  < 2e-16 ***
carrierAA        5.924e-01  3.751e-02  15.794  < 2e-16 ***
carrierAS        7.245e-01  1.016e-01   7.132 9.90e-13 ***
carrierB6       -1.851e-01  2.104e-02  -8.799  < 2e-16 ***
carrierDL        3.842e-01  2.465e-02  15.586  < 2e-16 ***
carrierEV       -2.759e-01  2.426e-02 -11.373  < 2e-16 ***
carrierF9       -7.084e-01  9.419e-02  -7.520 5.46e-14 ***
carrierFL       -7.809e-01  4.864e-02 -16.054  < 2e-16 ***
carrierHA        6.091e-01  1.500e-01   4.061 4.88e-05 ***
carrierMQ        2.897e-01  8.294e-02   3.493 0.000478 ***
carrierOO        1.054e-01  4.930e-01   0.214 0.830725    
carrierUA        2.332e-01  2.615e-02   8.919  < 2e-16 ***
carrierUS        1.125e-01  2.643e-02   4.258 2.06e-05 ***
carrierVX        2.979e-01  4.180e-02   7.127 1.02e-12 ***
carrierWN       -1.262e-01  3.045e-02  -4.144 3.41e-05 ***
carrierYV       -2.568e-01  1.013e-01  -2.535 0.011256 *  
originJFK        1.950e-01  1.548e-02  12.599  < 2e-16 ***
originLGA        1.032e-01  1.392e-02   7.415 1.22e-13 ***
visib            1.241e-01  2.327e-03  53.339  < 2e-16 ***
distance        -5.378e-05  7.803e-06  -6.893 5.48e-12 ***
factor(month)2  -2.461e-02  2.281e-02  -1.079 0.280516    
factor(month)3   1.022e-01  2.203e-02   4.642 3.45e-06 ***
factor(month)4  -2.144e-01  2.184e-02  -9.818  < 2e-16 ***
factor(month)5   2.324e-01  2.211e-02  10.510  < 2e-16 ***
factor(month)6  -2.698e-01  2.186e-02 -12.343  < 2e-16 ***
factor(month)7  -3.606e-01  2.166e-02 -16.648  < 2e-16 ***
factor(month)8  -1.307e-01  2.183e-02  -5.987 2.13e-09 ***
factor(month)9   6.324e-01  2.362e-02  26.769  < 2e-16 ***
factor(month)10  2.317e-01  2.218e-02  10.446  < 2e-16 ***
factor(month)11  2.182e-01  2.233e-02   9.770  < 2e-16 ***
factor(month)12 -5.327e-01  2.213e-02 -24.070  < 2e-16 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for binomial family taken to be 1)

    Null deviance: 294533  on 218102  degrees of freedom
Residual deviance: 282405  on 218070  degrees of freedom
  (51407 observations deleted due to missingness)
AIC: 282471

Number of Fisher Scoring iterations: 4

Evaluation

  • We can evaluate the model’s predictive accuracy with the testing data set.

  • The model returns a predicted probability.

  • Any predicted probability greater than 0.5 will be labeled as a 1 (a success) and 0 otherwise.

Prediction Accuracy 0.5 Threshold

test <- test %>% 
  mutate(pred_prob = predict(mod, test, type = "response")) %>% 
  mutate(prediction = if_else(pred_prob > 0.5,1,0))

(res <- table(test$ONTIME, test$prediction))
   
        0     1
  0  5939 16116
  1  4475 27849
sum(diag(res)) / sum(res)
[1] 0.6213428

Prediction Accuracy a priori

threshold <- sum(!train$ONTIME, na.rm = T) / sum(!is.na(train$ONTIME))

test <- test %>% 
  mutate(pred_prob = predict(mod, test, type = "response")) %>% 
  mutate(prediction = if_else(pred_prob > threshold,1,0))

(res <- table(test$ONTIME, test$prediction))
   
        0     1
  0  2137 19918
  1  1085 31239
sum(diag(res)) / sum(res)
[1] 0.6137663