FAA Analysis Part2 Analysis

Background: Flight landing.

Motivation: To reduce the risk of landing overrun.

Goal: To study what factors and how they would impact the landing distance of a commercial flight - using logistic regression

Data: Landing data from 950 commercial flights

Libraries Used

library(tidyverse)  #to visualize, transform, input, tidy and join data
library(haven)      #to input data from SAS
library(dplyr)      #data wrangling
library(stringr)    #string related functions
library(kableExtra) #to create HTML Table
library(DT)         #to preview the data sets
library(ggplot2)    #to vizualize data
library(MASS)       #step function
library(ROCR)       #to plot ROCR

Step0: Getting data ready

  • After cleaning and tidying data, we get:
## # A tibble: 6 x 8
##   aircraft duration no_pasg speed_ground speed_air height pitch distance
##   <fct>       <dbl>   <int>        <dbl>     <dbl>  <dbl> <dbl>    <dbl>
## 1 boeing       98.5      53        108.       109.   27.4  4.04    3370.
## 2 boeing      126.       69        102.       103.   27.8  4.12    2988.
## 3 boeing      112.       61         71.1       NA    18.6  4.43    1145.
## 4 boeing      197.       56         85.8       NA    30.7  3.88    1664.
## 5 boeing       90.1      70         59.9       NA    32.4  4.03    1050.
## 6 boeing      138.       55         75.0       NA    41.2  4.20    1627.

Step1: Creating Binary Responses

Created two binary variables below:

long.landing = 1 if distance > 2500; =0 otherwise
risky.landing = 1 if distance > 3000; =0 otherwise

# creating long.landing and risky.landing variables and removing distance variable
FAA_merged3 <- FAA_merged2 %>% mutate(long.landing = ifelse(distance>2500,1,0)) %>% 
          mutate(risky.landing =ifelse(distance>3000,1,0)) %>% dplyr::select(-distance)

str(FAA_merged3)
## Classes 'tbl_df', 'tbl' and 'data.frame':    831 obs. of  9 variables:
##  $ aircraft     : Factor w/ 2 levels "airbus","boeing": 2 2 2 2 2 2 2 2 2 2 ...
##  $ duration     : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg      : int  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground : num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air    : num  109 103 NA NA NA ...
##  $ height       : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch        : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ long.landing : num  1 1 0 0 0 0 0 0 0 0 ...
##  $ risky.landing: num  1 0 0 0 0 0 0 0 0 0 ...
head(FAA_merged3)
## # A tibble: 6 x 9
##   aircraft duration no_pasg speed_ground speed_air height pitch
##   <fct>       <dbl>   <int>        <dbl>     <dbl>  <dbl> <dbl>
## 1 boeing       98.5      53        108.       109.   27.4  4.04
## 2 boeing      126.       69        102.       103.   27.8  4.12
## 3 boeing      112.       61         71.1       NA    18.6  4.43
## 4 boeing      197.       56         85.8       NA    30.7  3.88
## 5 boeing       90.1      70         59.9       NA    32.4  4.03
## 6 boeing      138.       55         75.0       NA    41.2  4.20
## # ... with 2 more variables: long.landing <dbl>, risky.landing <dbl>


Step2: Histogram of long.landing
# creating long.landing and risky.landing variables and removing distance variable
FAA_merged3 %>% ggplot(aes(x=long.landing, fill=as.factor(long.landing))) +geom_bar() +theme_classic() +
  geom_text(stat = "count", aes(label = ..count.., y = ..count..),position = position_dodge(width = 1),
            vjust = -0.5, size = 6)


Step3: Single-factor regression analysis
variables <- c("aircraft", "duration","no_pasg" ,"speed_ground", "speed_air" ,"height" ,"pitch", "risky.landing")   
coeff3 <- rep(NA,length(variables))
p_val3 <- rep(NA,length(variables))

fit_all3 <- data.frame(variables, coeff3,p_val3)

for (i in seq_along(variables))
{
  fit_all3$coeff3[i] <- summary(glm(FAA_merged3$long.landing ~  FAA_merged3[[variables[i]]],family=binomial))$coefficients[,1][2]
  fit_all3$p_val3[i] <- summary(glm(FAA_merged3$long.landing ~  FAA_merged3[[variables[i]]],family=binomial))$coefficients[,4][2]
}

table1 <- fit_all3 %>% 
          mutate(sign_coef3 = ifelse(coeff3>0,"+","-")) %>%
          mutate(odds_ratio= exp(coeff3)) %>% 
          arrange(p_val3) 

table1
##       variables       coeff3       p_val3 sign_coef3   odds_ratio
## 1  speed_ground  0.472345752 3.935339e-14          + 1.603752e+00
## 2     speed_air  0.512321765 4.334124e-11          + 1.669162e+00
## 3      aircraft  0.864119860 8.398591e-05          + 2.372917e+00
## 4         pitch  0.400527824 4.664982e-02          + 1.492612e+00
## 5        height  0.008623997 4.218576e-01          + 1.008661e+00
## 6       no_pasg -0.007256406 6.058565e-01          - 9.927699e-01
## 7      duration -0.001070492 6.305122e-01          - 9.989301e-01
## 8 risky.landing 21.418699942 9.795390e-01          + 2.004579e+09


Step4: Visualizing Association of variables with “long.landing”
# speed_ground
plot(jitter(long.landing,0.1)~jitter(speed_ground),FAA_merged3,xlab="Speed Ground",ylab="Long Landing",pch=".")

ggplot(FAA_merged3,aes(x=speed_ground,fill=as.factor(long.landing))) + geom_histogram(position="dodge",binwidth=1)

# speed_air
plot(jitter(long.landing,0.1)~jitter(speed_air),FAA_merged3,xlab="Speed Air",ylab="Long Landing",pch=".")

ggplot(FAA_merged3,aes(x=speed_air,fill=as.factor(long.landing))) + geom_histogram(position="dodge",binwidth=1)

# pitch
plot(jitter(long.landing,0.1)~jitter(pitch),FAA_merged3,xlab="Pitch",ylab="Long Landing",pch=".")

ggplot(FAA_merged3,aes(x=pitch,fill=as.factor(long.landing))) + geom_histogram(position="dodge",binwidth=1)

# aircraft
ggplot(FAA_merged3, aes(as.factor(aircraft), fill=as.factor(long.landing))) +
  geom_bar(position = "fill") + xlab("Aircraft")



Step5: Fitting logisitic regression model
fit5 <- glm(long.landing ~  speed_ground + aircraft + pitch, 
            family=binomial, data = FAA_merged3)

summary(fit5)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + pitch, 
##     family = binomial, data = FAA_merged3)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.11589  -0.01116  -0.00026   0.00000   2.40741  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -67.92855   10.48408  -6.479 9.22e-11 ***
## speed_ground     0.61471    0.09184   6.694 2.18e-11 ***
## aircraftboeing   3.04348    0.73345   4.150 3.33e-05 ***
## pitch            1.06599    0.60389   1.765   0.0775 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  81.309  on 827  degrees of freedom
## AIC: 89.309
## 
## Number of Fisher Scoring iterations: 10



Step6: Forward selection using step AIC
fit_base <- glm(long.landing ~ 1, family=binomial, data = FAA_merged3[,-5])
fit_max <- glm(long.landing ~., family=binomial, data = FAA_merged3[,-5])



aic_fit6 <- step(fit_base, scope=list(lower=fit_base, upper=fit_max), direction = "forward") 
## Start:  AIC=624.78
## long.landing ~ 1
## 
##                 Df Deviance    AIC
## + speed_ground   1   107.40 136.49
## + risky.landing  1   309.08 338.17
## + aircraft       1   583.49 612.58
## + pitch          1   595.08 624.16
## <none>               597.69 624.78
## + height         1   597.29 626.37
## + no_pasg        1   597.46 626.54
## + duration       1   597.46 626.55
## 
## Step:  AIC=119.47
## long.landing ~ speed_ground
## 
##                 Df Deviance     AIC
## + aircraft       1   78.164  92.233
## + height         1   95.059 109.129
## + pitch          1   97.006 111.076
## + risky.landing  1  104.660 118.729
## <none>              107.401 119.470
## + duration       1  107.296 121.365
## + no_pasg        1  107.375 121.444
## 
## Step:  AIC=90.66
## long.landing ~ speed_ground + aircraft
## 
##                 Df Deviance    AIC
## + height         1   54.401 68.902
## + pitch          1   75.176 89.677
## <none>               78.164 90.665
## + duration       1   76.635 91.136
## + risky.landing  1   77.646 92.147
## + no_pasg        1   77.824 92.325
## 
## Step:  AIC=65.05
## long.landing ~ speed_ground + aircraft + height
## 
##                 Df Deviance    AIC
## + pitch          1   51.580 64.225
## <none>               54.401 65.047
## + risky.landing  1   53.634 66.279
## + duration       1   53.680 66.325
## + no_pasg        1   54.401 67.047
## 
## Step:  AIC=63.2
## long.landing ~ speed_ground + aircraft + height + pitch
## 
##                 Df Deviance    AIC
## <none>               51.580 63.204
## + duration       1   51.102 64.726
## + risky.landing  1   51.235 64.859
## + no_pasg        1   51.575 65.199
summary(aic_fit6)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height + 
##     pitch, family = binomial, data = FAA_merged3[, -5])
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.20284  -0.00054   0.00000   0.00000   2.35719  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -119.77598   24.41821  -4.905 9.33e-07 ***
## speed_ground      1.02266    0.20290   5.040 4.65e-07 ***
## aircraftboeing    5.13443    1.18091   4.348 1.37e-05 ***
## height            0.25795    0.06861   3.760  0.00017 ***
## pitch             1.53751    0.84109   1.828  0.06755 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  53.204  on 826  degrees of freedom
## AIC: 63.204
## 
## Number of Fisher Scoring iterations: 12
  • The model is different from the one obtained in step 5. Model in step 6 also has height and and shows pitch as not significant just like in model in step 5. We kept pitch in step 5 after visually observing the relationship between pitch and long landing.



Step7: Forward selection using step BIC
aic_fit7 <- step(fit_base, scope=list(lower=fit_base, upper=fit_max), direction = "forward",
                 k= log(nrow(FAA_merged3)), trace=0) 
summary(aic_fit7)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height, 
##     family = binomial, data = FAA_merged3[, -5])
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.43442  -0.00117   0.00000   0.00000   2.57435  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -102.95437   19.22882  -5.354 8.59e-08 ***
## speed_ground      0.92657    0.17242   5.374 7.70e-08 ***
## aircraftboeing    5.04813    1.11520   4.527 5.99e-06 ***
## height            0.23106    0.05959   3.877 0.000106 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  57.047  on 827  degrees of freedom
## AIC: 65.047
## 
## Number of Fisher Scoring iterations: 11
  • Model in step 7 does not have pitch in it but has other variables included in step 6.



Step8: Summary

Model: long.landing ~ speed_ground + aircraft + pitch

##       variables8    coeff8       p_val8 sign_coef8 odds_ratio8
## 1   speed ground 0.6147075 2.178523e-11          +    1.849116
## 2 aircraftboeing 3.0434815 3.331297e-05          +   20.978153
## 3          pitch 1.0659869 7.753041e-02          +    2.903703

AIC value: 89.309

  • Clearly, in the figure1, we can see that high speed ground results in long landing distance, which is a potential threat to landing safely.

  • Speed ground also gives a similar result. But since we have more data points for speed ground, we will keep speed ground in model instead of speed air

  • As pitch increases and is close to 5, the proportion of long landing increases compared with previous proportions for lower pitch. It might be interesting to explore pitch variable further.

  • The proportion of long landing distance= 1 is higher for boeing.



Step9.3: Single factor regression analysis for risky landing as response variable
variables <- c("aircraft", "duration","no_pasg" ,"speed_ground", "speed_air" ,"height" ,"pitch", "long.landing")   
coeff9.3 <- rep(NA,length(variables))
p_val9.3 <- rep(NA,length(variables))

fit_all9.3 <- data.frame(variables, coeff9.3,p_val9.3)

for (i in seq_along(variables))
{
  fit_all9.3$coeff9.3[i] <- summary(glm(FAA_merged3$risky.landing ~  FAA_merged3[[variables[i]]],family=binomial))$coefficients[,1][2]
  fit_all9.3$p_val9.3[i] <- summary(glm(FAA_merged3$risky.landing ~  FAA_merged3[[variables[i]]],family=binomial))$coefficients[,4][2]
}

table2 <- fit_all9.3 %>% 
  mutate(sign_coef9.3 = ifelse(coeff9.3>0,"+","-")) %>%
  mutate(odds_ratio= exp(coeff9.3)) %>% 
  arrange(p_val9.3)



Step9.4: Visualizing Association of variables with “risky.landing”
# speed_ground
plot(jitter(risky.landing,0.1)~jitter(speed_ground),FAA_merged3,xlab="Speed Ground",ylab="Risky Landing",pch=".")

ggplot(FAA_merged3,aes(x=speed_ground,fill=as.factor(risky.landing))) + geom_histogram(position="dodge",binwidth=1)

# speed_air
plot(jitter(risky.landing,0.1)~jitter(speed_air),FAA_merged3,xlab="Speed Air",ylab="Risky Landing",pch=".")

ggplot(FAA_merged3,aes(x=speed_air,fill=as.factor(risky.landing))) + geom_histogram(position="dodge",binwidth=1)

# pitch
plot(jitter(risky.landing,0.1)~jitter(pitch),FAA_merged3,xlab="Pitch",ylab="Risky Landing",pch=".")

ggplot(FAA_merged3,aes(x=pitch,fill=as.factor(risky.landing))) + geom_histogram(position="dodge",binwidth=1)

# aircraft
ggplot(FAA_merged3, aes(as.factor(aircraft), fill=as.factor(risky.landing))) +
  geom_bar(position = "fill") + xlab("Aircraft")



Step9.5: Fitting logisitic regression model
fit9.5 <- glm(risky.landing ~  speed_ground + aircraft, 
              family=binomial, data = FAA_merged3)

summary(fit9.5)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial, 
##     data = FAA_merged3)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.24398  -0.00011   0.00000   0.00000   1.61021  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -102.0772    24.7751  -4.120 3.79e-05 ***
## speed_ground      0.9263     0.2248   4.121 3.78e-05 ***
## aircraftboeing    4.0190     1.2494   3.217   0.0013 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  40.097  on 828  degrees of freedom
## AIC: 46.097
## 
## Number of Fisher Scoring iterations: 12



Step9.6: Forward selection using step AIC
fit_base9.6 <- glm(risky.landing ~ 1, family=binomial, data = FAA_merged3[,-5])
fit_max9.6 <- glm(risky.landing ~., family=binomial, data = FAA_merged3[,-5])



aic_fit9.6 <- step(fit_base9.6, scope=list(lower=fit_base9.6, upper=fit_max9.6), direction = "forward") 
## Start:  AIC=438.04
## risky.landing ~ 1
## 
##                Df Deviance    AIC
## + speed_ground  1    57.99  74.82
## + long.landing  1   134.60 151.43
## + aircraft      1   412.07 428.90
## + no_pasg       1   421.18 438.01
## <none>              423.22 438.04
## + pitch         1   421.54 438.37
## + duration      1   423.04 439.87
## + height        1   423.13 439.95
## 
## Step:  AIC=62.93
## risky.landing ~ speed_ground
## 
##                Df Deviance    AIC
## + aircraft      1   39.955 46.898
## + pitch         1   51.634 58.576
## + long.landing  1   53.529 60.471
## <none>              57.988 62.931
## + no_pasg       1   57.178 64.121
## + height        1   57.787 64.729
## + duration      1   57.951 64.893
## 
## Step:  AIC=46.1
## risky.landing ~ speed_ground + aircraft
## 
##                Df Deviance    AIC
## + no_pasg       1   37.559 45.700
## <none>              39.955 46.097
## + height        1   39.295 47.436
## + long.landing  1   39.461 47.602
## + duration      1   39.757 47.898
## + pitch         1   39.783 47.924
## 
## Step:  AIC=45.71
## risky.landing ~ speed_ground + aircraft + no_pasg
## 
##                Df Deviance    AIC
## <none>              37.559 45.707
## + height        1   36.985 47.133
## + long.landing  1   37.222 47.370
## + pitch         1   37.304 47.452
## + duration      1   37.548 47.696
summary(aic_fit9.6)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg, 
##     family = binomial, data = FAA_merged3[, -5])
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.33913  -0.00009   0.00000   0.00000   1.87810  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -99.90780   25.57993  -3.906 9.39e-05 ***
## speed_ground     0.94963    0.23559   4.031 5.56e-05 ***
## aircraftboeing   4.64188    1.47520   3.147  0.00165 ** 
## no_pasg         -0.08462    0.05732  -1.476  0.13987    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  37.707  on 827  degrees of freedom
## AIC: 45.707
## 
## Number of Fisher Scoring iterations: 12
  • The model in step 9.6 has no_pasg included, although it is shown as not significant. The other 2 variables in step 9.5 model are also in step 9.6 model.



Step9.7: Forward selection using step BIC
aic_fit9.7 <- step(fit_base9.6, scope=list(lower=fit_base9.6, upper=fit_max9.6), direction = "forward",
                   k= log(nrow(FAA_merged3)), trace=0) 
summary(aic_fit9.7)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial, 
##     data = FAA_merged3[, -5])
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.24398  -0.00011   0.00000   0.00000   1.61021  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -102.0772    24.7751  -4.120 3.79e-05 ***
## speed_ground      0.9263     0.2248   4.121 3.78e-05 ***
## aircraftboeing    4.0190     1.2494   3.217   0.0013 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  40.097  on 828  degrees of freedom
## AIC: 46.097
## 
## Number of Fisher Scoring iterations: 12
  • Model in step 9.7 is exactly the same as the model in step 9.5. We are finalizing this model.



Step10: Summary

Model: risky.landing ~ speed_ground + aircraft

##     variables9.8  coeff9.8     p_val9.8 sign_coef9.8 odds_ratio9.8
## 1   speed ground 0.9262743 3.778843e-05            +      2.525084
## 2 aircraftboeing 4.0190312 1.296316e-03            +     55.647168

AIC value: 46.097

  • Clearly, in the figure, we can see that high speed ground results in risky landing=1, which is a potential threat to landing safely.

  • Speed ground also gives a similar result. But since we have more data points for speed ground, we will keep speed ground in model instead of speed air

  • Pitch does not seem to play a significant role when we see the histogram of pitch for risky landing = 0 and 1. We will drop it from our model.

  • The proportion of risky landing distance= 1 is higher for boeing. Aircraft type seem to play a role.

  • We get the same model with step BIC as the model that we selected manually in step 9.5



Step11: Difference between models obtained by response= risky landing vs long landing
  • Variable pitch is included in the model with response = long landing but not in the model with response = risky landing

  • The AIC value of the model with response variable = risky landing is 46.097 which is almost half of the AIC value of model with response variable = long landing: 89.309. This indicates that the former model is better.



Step12: ROC
pred1 <- predict(fit9.5, type = "response")
pred_ROC <- prediction(pred1, FAA_merged3$risky.landing)
perf1 <- performance(pred_ROC, "tpr", "fpr")
plot(perf1, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred_ROC, "auc"), "y.values"))
## [1] 0.9986161
pred2 <- predict(fit5, type = "response")
pred_ROC2 <- prediction(pred2, FAA_merged3$long.landing)
perf2 <- performance(pred_ROC2, "tpr", "fpr")
plot(perf2, colorize=TRUE)

#Get the AUC
unlist(slot(performance(pred_ROC2, "auc"), "y.values"))
## [1] 0.996666
  • Since we are plotting the ROC curve on the same data that we used in our model, the area under the curve is high. We need test data to actually gauge the prediction performance of our model

  • Among the two models, the AUC is higher for the model: risky landing ~speed_ground + aircraft

Step13: Predicting probability of a new observation
ilogit <- function (x) {
  exp(x)/(1+exp(x))
}

# predicting probability of risky landing
new.ind_risky<-data.frame(aircraft="boeing", duration=200, no_pasg=80, speed_ground=115, speed_air=120, height=40, pitch=4)

#  Predicted probability for risky landing
predict(fit9.5,newdata=new.ind_risky,type="link",se=T)
## $fit
##        1 
## 8.463332 
## 
## $se.fit
## [1] 2.089367
## 
## $residual.scale
## [1] 1
# CI
round(ilogit(c(8.463332-1.96*2.089367,8.463332+1.96*2.089367)),3)
## [1] 0.987 1.000
# predicting probability of long landing
new.ind_long<-data.frame(aircraft="boeing", duration=200, no_pasg=80, speed_ground=115, speed_air=120, height=40, pitch=4)

#  Predicted probability for long landing
predict(fit5,newdata=new.ind_long,type="link",se=T)
## $fit
##        1 
## 10.07025 
## 
## $se.fit
## [1] 1.616338
## 
## $residual.scale
## [1] 1
# CI
round(ilogit(c(10.07025-1.96*1.616338,10.07025+1.96*1.616338)),3)
## [1] 0.999 1.000



Step14: Comparing Logit, Probit, and Complimentary Log Log Models
lmod.probit<-glm(risky.landing ~  speed_ground + aircraft,family=binomial(link=probit),FAA_merged3)

lmod.cloglog<-glm(risky.landing ~  speed_ground + aircraft,family=binomial(link=cloglog),FAA_merged3)

round(coef(fit9.5),3)
##    (Intercept)   speed_ground aircraftboeing 
##       -102.077          0.926          4.019
round(coef(lmod.probit),3)
##    (Intercept)   speed_ground aircraftboeing 
##        -58.693          0.532          2.357
round(coef(lmod.cloglog),3)
##    (Intercept)   speed_ground aircraftboeing 
##        -69.265          0.622          2.898
  • The coefficients of logit model is almost double that of probit and complimentary log log model



Step15: Plotting ROC for different models
# ROC for probit
pred15.1 <- predict(lmod.probit, type = "response")
pred_ROC15.1 <- prediction(pred15.1, FAA_merged3$risky.landing)
perf15.1 <- performance(pred_ROC15.1, "tpr", "fpr")
#Get the AUC
unlist(slot(performance(pred_ROC15.1, "auc"), "y.values"))
## [1] 0.9986161
# ROC for complimentary log log
pred15.2 <- predict(lmod.cloglog, type = "response")
pred_ROC15.2 <- prediction(pred15.2, FAA_merged3$risky.landing)
perf15.2 <- performance(pred_ROC15.2, "tpr", "fpr")
#Get the AUC
unlist(slot(performance(pred_ROC15.2, "auc"), "y.values"))
## [1] 0.9985736
plot(perf1, type="l", lty=1)
plot(perf15.1,  type="l", lty=2,add=TRUE)
plot(perf15.2, type="l", lty=3, add=TRUE)

  • The ROC curves are very simmilar and cannot be differentiated clearly



Step16: Top 5 risky landings
top_risky_observations <- data.frame(logit_prob=fit9.5$fitted.values,
                             probit_prob = lmod.probit$fitted.values,
                             clog_prob = lmod.cloglog$fitted.values)

top_risky_observations$index <- 1:nrow(top_risky_observations)
head(top_risky_observations)
##     logit_prob  probit_prob    clog_prob index
## 1 8.700396e-01 8.646452e-01 8.827953e-01     1
## 2 1.989379e-02 1.285756e-02 4.271187e-02     2
## 3 2.220446e-16 2.220446e-16 2.356424e-10     3
## 4 8.599751e-09 2.220446e-16 2.291489e-06     4
## 5 2.220446e-16 2.220446e-16 2.271906e-13     5
## 6 3.893281e-13 2.220446e-16 2.771489e-09     6
top5risky_logit <- top_risky_observations %>% arrange(desc(logit_prob)) %>% 
  top_n(n=5, wt=logit_prob) %>% dplyr::select(logit_prob,index)

top5risky_probit <- top_risky_observations %>% arrange(desc(probit_prob)) %>% 
  top_n(n=5, wt=probit_prob) %>% dplyr::select(probit_prob,index)

top5risky_clog <- top_risky_observations %>% arrange(desc(clog_prob)) %>% 
  top_n(n=5, wt=clog_prob) %>% dplyr::select(clog_prob,index)

index_compare <- data.frame(logit=head(top5risky_logit,5), probit=head(top5risky_probit,5),
                            clog=head(top5risky_clog,5))

index_compare
##   logit.logit_prob logit.index probit.probit_prob probit.index
## 1                1         362                  1           56
## 2                1         307                  1           64
## 3                1          64                  1          134
## 4                1         387                  1          176
## 5                1         408                  1          179
##   clog.clog_prob clog.index
## 1              1         19
## 2              1         29
## 3              1         30
## 4              1         56
## 5              1         64
  • The top probability for all models is approximated to 1. However, we can see that the indexes vary if we want high precision. Some values such as value with index 64 appear in all 3.



Step17: Predicting probability and comparing CI for new observation
#  Predicted probability for risky landing - probit
predict(lmod.probit,newdata=new.ind_risky,type="link",se=T)
## $fit
##        1 
## 4.872041 
## 
## $se.fit
## [1] 1.127892
## 
## $residual.scale
## [1] 1
# CI
round(ilogit(c(4.872041-1.96*1.127892,4.872041+1.96*1.127892)),3)
## [1] 0.935 0.999
# predicting probability of risky landing - cloglog

predict(lmod.cloglog ,newdata=new.ind_long,type="link",se=T)
## $fit
##       1 
## 5.16944 
## 
## $se.fit
## [1] 1.173423
## 
## $residual.scale
## [1] 1
# CI
round(ilogit(c(5.16944-1.96*1.173423,5.16944+1.96*1.173423)),3)
## [1] 0.946 0.999
# redicting probability of risky landing - cloglog 
round(ilogit(c(8.463332-1.96*2.089367,8.463332+1.96*2.089367)),3)
## [1] 0.987 1.000
  • The CI for probit and cloglog is wider than that of logit.