Dataset

#read excel files
library("readxl")
faa1 <- read_excel('faa1.xls')
faa2 <- read_excel('faa2.xls')
library('plyr')
faa <- rbind.fill(faa1,faa2)
library('tidyverse')
faa.no.dup <- distinct(faa,aircraft,no_pasg,speed_ground,speed_air,height,pitch,distance,.keep_all = T)
height_abnormal <- sum(faa.no.dup$height<6)
duration_abnormal <- sum(faa.no.dup$duration<40)
speed_ground_abnormal <- sum(faa.no.dup$speed_ground<30 | faa.no.dup$speed_ground > 140)
speed_air_abnormal <- sum(faa.no.dup$speed_air<30 | faa.no.dup$speed_air > 140)
faa.no.dup <- faa.no.dup[faa.no.dup$height > 6,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground > 30,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground < 140,]

faa.clean <- faa.no.dup
str(faa.clean)
## 'data.frame':    837 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...

Creating Binary Responses

Step 1. Create binary variables long.landing and risky.landing and drop distance variable

faa.clean$long.landing <- ifelse(faa.clean$distance > 2500,1,0)
faa.clean$risky.landing <- ifelse(faa.clean$distance > 3000,1,0)
faa.clean$distance <- NULL

Identifying important factors using the binary data of “long.landing”

Step 2. Use a pie chart or a histogram to show the distribution of “long.landing”

# pie chart for distribution of long.landing
pct <- round(table(faa.clean$long.landing)/length(faa.clean$long.landing)*100,1)
labs <- c("No","Yes")
labs <- paste(labs,pct)
labs <- paste(labs,"%",sep="")
pie(table(faa.clean$long.landing),labels= labs,col = rainbow(length(labs)),main = "Pie chart of Long Landing")

Step 3. Logistic Regression using a single factor each time

faa.clean$aircraft <- ifelse(faa.clean$aircraft == 'boeing', 1, 0)
vars <- c('aircraft','duration','no_pasg','speed_ground','speed_air','height','pitch')
fittedpvalues <- vector()
fittedcoeffs <- vector()
fittedoddratios <- vector()

lmod1 <- glm(long.landing~aircraft,data = faa.clean)
fittedpvalues[1] <- summary(lmod1)$coefficients[2,4] 
fittedcoeffs[1] <- summary(lmod1)$coefficients[2,1] 
fittedoddratios[1] <- exp(coef(lmod1))[2]

lmod2 <- glm(long.landing~duration,data = faa.clean)
fittedpvalues[2] <- summary(lmod2)$coefficients[2,4] 
fittedcoeffs[2] <- summary(lmod2)$coefficients[2,1] 
fittedoddratios[2] <- exp(coef(lmod2))[2]

lmod3 <- glm(long.landing~no_pasg,data = faa.clean)
fittedpvalues[3] <- summary(lmod3)$coefficients[2,4] 
fittedcoeffs[3] <- summary(lmod3)$coefficients[2,1] 
fittedoddratios[3] <- exp(coef(lmod3))[2]

lmod4 <- glm(long.landing~speed_ground,data = faa.clean)
fittedpvalues[4] <- summary(lmod4)$coefficients[2,4] 
fittedcoeffs[4] <- summary(lmod4)$coefficients[2,1] 
fittedoddratios[4] <- exp(coef(lmod4))[2]

lmod5 <- glm(long.landing~speed_air,data = faa.clean)
fittedpvalues[5] <- summary(lmod5)$coefficients[2,4] 
fittedcoeffs[5] <- summary(lmod5)$coefficients[2,1] 
fittedoddratios[5] <- exp(coef(lmod5))[2]

lmod6 <- glm(long.landing~height,data = faa.clean)
fittedpvalues[6] <- summary(lmod6)$coefficients[2,4] 
fittedcoeffs[6] <- summary(lmod6)$coefficients[2,1] 
fittedoddratios[6] <- exp(coef(lmod6))[2]

lmod7 <- glm(long.landing~pitch,data = faa.clean)
fittedpvalues[7] <- summary(lmod7)$coefficients[2,4] 
fittedcoeffs[7] <- summary(lmod7)$coefficients[2,1] 
fittedoddratios[7] <- exp(coef(lmod7))[2]

table <- tibble(
  variable = vars, 
  coefficient = fittedcoeffs,
  odds_ratio = fittedoddratios,
  pvalues = fittedpvalues[1:7]
)
table$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table %>% arrange(abs(pvalues))
## # A tibble: 7 x 5
##   variable     coefficient odds_ratio  pvalues dir     
##   <chr>              <dbl>      <dbl>    <dbl> <chr>   
## 1 speed_ground    0.0111        1.01  3.00e-92 positive
## 2 speed_air       0.0365        1.04  5.73e-35 positive
## 3 aircraft        0.0983        1.10  1.84e- 5 positive
## 4 pitch           0.0478        1.05  2.87e- 2 positive
## 5 height          0.00149       1.00  2.05e- 1 positive
## 6 duration       -0.000222      1.000 3.64e- 1 negative
## 7 no_pasg        -0.000895      0.999 5.61e- 1 negative

Based on the above analysis, we observe that speed_ground, speed_air and aircraft are significant variables at alpha = 0.05. We will still keep all variables to build full model except speed_air(which has high number of missing values and has high correlation with speed_ground variable as we will see below)

Step 4. Visualize the association of the significant variables

par(mfrow = c(2,2))
plot(faa.clean$long.landing ~ faa.clean$speed_ground,xlab = 'speed_ground',ylab = 'long.landing')
plot(faa.clean$long.landing ~ faa.clean$speed_air,xlab = 'speed_air',ylab = 'long.landing')
plot(faa.clean$long.landing ~ faa.clean$aircraft,xlab = 'aircraft',ylab = 'long.landing')
plot(faa.clean$long.landing ~ faa.clean$pitch,xlab = 'pitch',ylab = 'long.landing')

Step 5. Check models with speed_ground, speed_air and speed_ground+speed_air as the variables and landing

cat('Regression Coefficient with speed_ground as predictor is',summary(lm(long.landing~speed_ground,data = faa.clean))$coefficients[2,1],'\n')
## Regression Coefficient with speed_ground as predictor is 0.01106496
cat('Regression Coefficient with speed_air as predictor is',summary(lm(long.landing~speed_air,data = faa.clean))$coefficients[2,1],'\n')
## Regression Coefficient with speed_air as predictor is 0.03653113
cat('Regression Coefficient with both speed_ground and speed_air as predictors are',summary(lm(long.landing~speed_ground+speed_air,data = faa.clean))$coefficients[2,1],'and',summary(lm(long.landing~speed_ground+speed_air,data = faa.clean))$coefficients[3,1],'respectively')
## Regression Coefficient with both speed_ground and speed_air as predictors are 0.004556364 and 0.03195473 respectively
# check correlation between speed_ground and speed_air variables
cor(faa.clean$speed_ground,faa.clean$speed_air,use = "pairwise.complete.obs")
## [1] 0.9885722

This is a very high correlation and will lead to multicollinearity issue in the model if both variables are used. In our case, we will use the speed_ground variable because it has no missing values while the speed_air has almost 80% missing values. Also in the model where both variables are used, p-value for speed_ground is 0.004556364 making it insignificant which is not correct.

Step 6. Initiate a full model for predicting long.landing(Removing speed_air variable based on collinearity analysis)

lmodnull <- glm(long.landing ~ 1, family = binomial, data = faa.clean)
lmodfull <- glm(long.landing ~.-(speed_air + risky.landing), family = binomial, data = faa.clean)

Step 6a. Forward selection using AIC

model1.AIC <- step(lmodnull, scope=list(lower=lmodnull, upper=lmodfull), direction='forward')
## Start:  AIC=638.04
## long.landing ~ 1
## 
##                Df Deviance    AIC
## + speed_ground  1   110.89 140.14
## + aircraft      1   594.65 623.90
## + pitch         1   607.48 636.74
## <none>              610.79 638.04
## + height        1   609.56 638.81
## + duration      1   609.96 639.21
## + no_pasg       1   610.48 639.74
## 
## Step:  AIC=123.09
## long.landing ~ speed_ground
## 
##            Df Deviance     AIC
## + aircraft  1   79.563  93.764
## + height    1   95.938 110.139
## + pitch     1  100.202 114.403
## <none>         110.888 123.089
## + no_pasg   1  110.718 124.919
## + duration  1  110.860 125.061
## 
## Step:  AIC=92.07
## long.landing ~ speed_ground + aircraft
## 
##            Df Deviance    AIC
## + height    1   54.418 68.921
## + pitch     1   76.598 91.101
## <none>          79.563 92.066
## + duration  1   78.833 93.336
## + no_pasg   1   79.414 93.917
## 
## Step:  AIC=65.06
## long.landing ~ speed_ground + aircraft + height
## 
##            Df Deviance    AIC
## + pitch     1   51.591 64.233
## <none>          54.418 65.060
## + duration  1   53.711 66.353
## + no_pasg   1   54.418 67.060
## 
## Step:  AIC=63.21
## long.landing ~ speed_ground + aircraft + height + pitch
## 
##            Df Deviance    AIC
## <none>          51.591 63.212
## + duration  1   51.122 64.743
## + no_pasg   1   51.587 65.208
summary(model1.AIC)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height + 
##     pitch, family = binomial, data = faa.clean)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.20389  -0.00053   0.00000   0.00000   2.35854  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -119.90246   24.37354  -4.919 8.68e-07 ***
## speed_ground    1.02370    0.20253   5.055 4.32e-07 ***
## aircraft        5.14051    1.17848   4.362 1.29e-05 ***
## height          0.25834    0.06842   3.776  0.00016 ***
## pitch           1.53924    0.84130   1.830  0.06731 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 636.044  on 836  degrees of freedom
## Residual deviance:  53.212  on 832  degrees of freedom
## AIC: 63.212
## 
## Number of Fisher Scoring iterations: 12

Using forward selection by AIC , the final model includes the following variables - speed_ground,aircraft,height and pitch which is different from the analysis using p-values where pitch and height variables were not significant

Step 7. Forward Selection using BIC

model1.BIC <- step(lmodnull, scope=list(lower=lmodnull, upper=lmodfull), direction='forward',k = log(nrow(faa.clean)))
## Start:  AIC=642.77
## long.landing ~ 1
## 
##                Df Deviance    AIC
## + speed_ground  1   110.89 149.60
## + aircraft      1   594.65 633.36
## <none>              610.79 642.77
## + pitch         1   607.48 646.20
## + height        1   609.56 648.27
## + duration      1   609.96 648.67
## + no_pasg       1   610.48 649.20
## 
## Step:  AIC=132.55
## long.landing ~ speed_ground
## 
##            Df Deviance    AIC
## + aircraft  1   79.563 107.95
## + height    1   95.938 124.33
## + pitch     1  100.202 128.59
## <none>         110.888 132.55
## + no_pasg   1  110.718 139.11
## + duration  1  110.860 139.25
## 
## Step:  AIC=106.26
## long.landing ~ speed_ground + aircraft
## 
##            Df Deviance    AIC
## + height    1   54.418  87.84
## <none>          79.563 106.25
## + pitch     1   76.598 110.02
## + duration  1   78.833 112.25
## + no_pasg   1   79.414 112.84
## 
## Step:  AIC=83.98
## long.landing ~ speed_ground + aircraft + height
## 
##            Df Deviance    AIC
## <none>          54.418 83.979
## + pitch     1   51.591 87.882
## + duration  1   53.711 90.002
## + no_pasg   1   54.418 90.709
summary(model1.BIC)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height, 
##     family = binomial, data = faa.clean)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.43613  -0.00113   0.00000   0.00000   2.57645  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -103.09744   19.18549  -5.374 7.71e-08 ***
## speed_ground    0.92781    0.17207   5.392 6.96e-08 ***
## aircraft        5.05631    1.11257   4.545 5.50e-06 ***
## height          0.23154    0.05939   3.899 9.66e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 636.04  on 836  degrees of freedom
## Residual deviance:  57.06  on 833  degrees of freedom
## AIC: 65.06
## 
## Number of Fisher Scoring iterations: 11

Using forward selection by BIC, the final model includes the following variables - speed_ground,aircraft and height which is different from the analysis using p-values where the height variable was not significant. This is also different from model built using AIC as it does not include pitch variable in the final model

Step 8. Final Model for long landing

  1. We select the model developed using AIC which includes speed_ground, aircraft, height and pitch
  2. Speed_air was not used in building the model because it has high number of missing values
vars = vars <- c('aircraft','speed_ground','height','pitch')
fittedcoeffs <- vector()
fittedoddratios <- vector()

fittedcoeffs[1] <- summary(model1.AIC)$coefficients[2,1] 
fittedcoeffs[2] <- summary(model1.AIC)$coefficients[3,1] 
fittedcoeffs[3] <- summary(model1.AIC)$coefficients[4,1] 
fittedcoeffs[4] <- summary(model1.AIC)$coefficients[5,1] 
fittedoddratios[1] <- exp(coef(model1.AIC))[2]
fittedoddratios[2] <- exp(coef(model1.AIC))[3]
fittedoddratios[3] <- exp(coef(model1.AIC))[4]
fittedoddratios[4] <- exp(coef(model1.AIC))[5]

table <- tibble(
  variable = vars, 
  coefficient = fittedcoeffs,
  odds_ratio = fittedoddratios,
)
table$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table %>% arrange(desc(coefficient))
## # A tibble: 4 x 4
##   variable     coefficient odds_ratio dir     
##   <chr>              <dbl>      <dbl> <chr>   
## 1 speed_ground       5.14      171.   positive
## 2 pitch              1.54        4.66 positive
## 3 aircraft           1.02        2.78 positive
## 4 height             0.258       1.29 positive
  1. THe odds of long landing increases by a ratio of 170 when there is an increase in speed_ground by 1 unit
  2. The odds of long landing increase by a ratio of 2.783 when boeing aircraft is used

Step 9. Repeat steps 1-7 for risky landing

# pie chart for distribution of risky
pct <- round(table(faa.clean$risky.landing)/length(faa.clean$risky.landing)*100,1)
labs <- c("No","Yes")
labs <- paste(labs,pct)
labs <- paste(labs,"%",sep="")
pie(table(faa.clean$risky.landing),labels= labs,col = rainbow(length(labs)),main = "Pie chart of Risky Landing")

#Logistic Regression using single factor

vars <- c('aircraft','duration','no_pasg','speed_ground','speed_air','height','pitch')
fittedpvalues <- vector()
fittedcoeffs <- vector()
fittedoddratios <- vector()

lmod1 <- glm(risky.landing~aircraft,data = faa.clean)
fittedpvalues[1] <- summary(lmod1)$coefficients[2,4] 
fittedcoeffs[1] <- summary(lmod1)$coefficients[2,1] 
fittedoddratios[1] <- exp(coef(lmod1))[2]

lmod2 <- glm(risky.landing~duration,data = faa.clean)
fittedpvalues[2] <- summary(lmod2)$coefficients[2,4] 
fittedcoeffs[2] <- summary(lmod2)$coefficients[2,1] 
fittedoddratios[2] <- exp(coef(lmod2))[2]

lmod3 <- glm(risky.landing~no_pasg,data = faa.clean)
fittedpvalues[3] <- summary(lmod3)$coefficients[2,4] 
fittedcoeffs[3] <- summary(lmod3)$coefficients[2,1] 
fittedoddratios[3] <- exp(coef(lmod3))[2]

lmod4 <- glm(risky.landing~speed_ground,data = faa.clean)
fittedpvalues[4] <- summary(lmod4)$coefficients[2,4] 
fittedcoeffs[4] <- summary(lmod4)$coefficients[2,1] 
fittedoddratios[4] <- exp(coef(lmod4))[2]

lmod5 <- glm(risky.landing~speed_air,data = faa.clean)
fittedpvalues[5] <- summary(lmod5)$coefficients[2,4] 
fittedcoeffs[5] <- summary(lmod5)$coefficients[2,1] 
fittedoddratios[5] <- exp(coef(lmod5))[2]

lmod6 <- glm(risky.landing~height,data = faa.clean)
fittedpvalues[6] <- summary(lmod6)$coefficients[2,4] 
fittedcoeffs[6] <- summary(lmod6)$coefficients[2,1] 
fittedoddratios[6] <- exp(coef(lmod6))[2]

lmod7 <- glm(risky.landing~pitch,data = faa.clean)
fittedpvalues[7] <- summary(lmod7)$coefficients[2,4] 
fittedcoeffs[7] <- summary(lmod7)$coefficients[2,1] 
fittedoddratios[7] <- exp(coef(lmod7))[2]

exp(coef(lmod7))
## (Intercept)       pitch 
##   0.9594595   1.0295539
table <- tibble(
  variable = vars, 
  coefficient = fittedcoeffs,
  odds_ratio = fittedoddratios,
  pvalues = fittedpvalues[1:7]
)
table$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table %>% arrange(abs(pvalues))
## # A tibble: 7 x 5
##   variable     coefficient odds_ratio  pvalues dir     
##   <chr>              <dbl>      <dbl>    <dbl> <chr>   
## 1 speed_ground    0.00768       1.01  1.15e-66 positive
## 2 speed_air       0.0376        1.04  1.47e-49 positive
## 3 aircraft        0.0699        1.07  1.24e- 4 positive
## 4 pitch           0.0291        1.03  9.27e- 2 positive
## 5 no_pasg        -0.00164       0.998 1.78e- 1 negative
## 6 duration       -0.000140      1.000 4.72e- 1 negative
## 7 height          0.000174      1.00  8.52e- 1 positive

We observe from the above table that speed_ground, speed_air and aircraft are the significant variables at alpha = 0.05

# Visualize the association of the significant variables

par(mfrow = c(2,2))
plot(faa.clean$risky.landing ~ faa.clean$speed_ground,xlab = 'speed_ground',ylab = 'risky.landing')
plot(faa.clean$risky.landing ~ faa.clean$speed_air,xlab = 'speed_air',ylab = 'risky.landing')
plot(faa.clean$risky.landing ~ faa.clean$aircraft,xlab = 'aircraft',ylab = 'risky.landing')
plot(faa.clean$risky.landing ~ faa.clean$pitch,xlab = 'pitch',ylab = 'risky.landing')

# initiate null and full models
lmodnull <- glm(risky.landing ~ 1, family = binomial, data = faa.clean)
lmodfull <- glm(risky.landing ~.-(speed_air + long.landing), family = binomial, data = faa.clean)
model2.AIC <- step(lmodnull, scope=list(lower=lmodnull, upper=lmodfull), direction='forward')
## Start:  AIC=449.06
## risky.landing ~ 1
## 
##                Df Deviance    AIC
## + speed_ground  1    58.99  75.97
## + aircraft      1   421.54 438.52
## + pitch         1   431.79 448.77
## <none>              434.08 449.06
## + no_pasg       1   432.26 449.24
## + duration      1   433.56 450.54
## + height        1   434.07 451.05
## 
## Step:  AIC=63.95
## risky.landing ~ speed_ground
## 
##            Df Deviance    AIC
## + aircraft  1   40.159 47.118
## + pitch     1   52.021 58.980
## <none>          58.991 63.950
## + no_pasg   1   58.160 65.119
## + height    1   58.563 65.522
## + duration  1   58.714 65.673
## 
## Step:  AIC=46.29
## risky.landing ~ speed_ground + aircraft
## 
##            Df Deviance    AIC
## + no_pasg   1   37.711 45.846
## <none>          40.159 46.294
## + height    1   39.411 47.546
## + pitch     1   39.963 48.098
## + duration  1   40.063 48.198
## 
## Step:  AIC=45.85
## risky.landing ~ speed_ground + aircraft + no_pasg
## 
##            Df Deviance    AIC
## <none>          37.711 45.853
## + height    1   37.078 47.220
## + pitch     1   37.434 47.576
## + duration  1   37.711 47.853
summary(model2.AIC)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg, 
##     family = binomial, data = faa.clean)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.35062  -0.00008   0.00000   0.00000   1.87851  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -101.01866   25.61661  -3.943 8.03e-05 ***
## speed_ground    0.96032    0.23587   4.071 4.67e-05 ***
## aircraft        4.71633    1.47244   3.203  0.00136 ** 
## no_pasg        -0.08585    0.05753  -1.492  0.13561    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 447.057  on 836  degrees of freedom
## Residual deviance:  37.853  on 833  degrees of freedom
## AIC: 45.853
## 
## Number of Fisher Scoring iterations: 12

Using forward selection by AIC , the final model includes the following variables - speed_ground, aircraft and no_pasg

model2.BIC <- step(lmodnull, scope=list(lower=lmodnull, upper=lmodfull), direction='forward',k = log(nrow(faa.clean)))
## Start:  AIC=453.79
## risky.landing ~ 1
## 
##                Df Deviance    AIC
## + speed_ground  1    58.99  85.43
## + aircraft      1   421.54 447.98
## <none>              434.08 453.79
## + pitch         1   431.79 458.23
## + no_pasg       1   432.26 458.70
## + duration      1   433.56 460.00
## + height        1   434.07 460.51
## 
## Step:  AIC=73.41
## risky.landing ~ speed_ground
## 
##            Df Deviance    AIC
## + aircraft  1   40.159 61.307
## + pitch     1   52.021 73.170
## <none>          58.991 73.410
## + no_pasg   1   58.160 79.309
## + height    1   58.563 79.712
## + duration  1   58.714 79.863
## 
## Step:  AIC=60.48
## risky.landing ~ speed_ground + aircraft
## 
##            Df Deviance    AIC
## <none>          40.159 60.483
## + no_pasg   1   37.711 64.766
## + height    1   39.411 66.466
## + pitch     1   39.963 67.017
## + duration  1   40.063 67.117
summary(model2.BIC)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial, 
##     data = faa.clean)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2561  -0.0001   0.0000   0.0000   1.6054  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -103.3066    24.8837  -4.152 3.30e-05 ***
## speed_ground    0.9374     0.2258   4.152 3.29e-05 ***
## aircraft        4.0896     1.2485   3.276  0.00105 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 447.057  on 836  degrees of freedom
## Residual deviance:  40.294  on 834  degrees of freedom
## AIC: 46.294
## 
## Number of Fisher Scoring iterations: 12

Using forward selection by BIC, the final model includes the following variables - speed_ground and aircraft which is same as the analysis using p-values. However, this is different from model built using AIC as it does not include no_pasg variable in the final model

Final Model for risky.landing

  1. We select the model developed using BIC which includes speed_ground and aircraft.(We do not choose AIC model has it included no_apsg variable which is not significant)
  2. Speed_air was not used in building the model because it has high number of missing values
vars = vars <- c('aircraft','speed_ground')
fittedcoeffs <- vector()
fittedoddratios <- vector()

fittedcoeffs[1] <- summary(model2.BIC)$coefficients[2,1] 
fittedcoeffs[2] <- summary(model2.BIC)$coefficients[3,1] 
fittedoddratios[1] <- exp(coef(model2.BIC))[2]
fittedoddratios[2] <- exp(coef(model2.BIC))[3]

table <- tibble(
  variable = vars, 
  coefficient = fittedcoeffs,
  odds_ratio = fittedoddratios,
)
table$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table %>% arrange(desc(coefficient))
## # A tibble: 2 x 4
##   variable     coefficient odds_ratio dir     
##   <chr>              <dbl>      <dbl> <chr>   
## 1 speed_ground       4.09       59.7  positive
## 2 aircraft           0.937       2.55 positive
  1. The risky landing is higher in case of boeing planes
  2. The odds of risky landing increases by 7.7 times when speed ground increases by 1 unit

Compare the two models built for “long.landing” and “risky.landing”

Step 11. Difference between the two models

  1. Speed_ground, aircraft , pitch and height are important variables indetermining long landing while speed_ground and aircraft are important variables in determining risky landing
  2. The likelihood of risky landing is higher than long landing is boeing planes
  3. The increase in speed_ground increase the likelihood of long landing more than risky landing

Step 12. Plot ROC curve

long.landing.final.model <- model1.AIC
predprob1 <- predict(long.landing.final.model,type='response')
predout1 <- ifelse(predprob1<0.5,'no','yes')
faa.clean.m1 <- data.frame(faa.clean, predprob1,predout1)
thresh <- seq(0.01,0.5,0.01)
sensitivity1 <- specificity1 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m1$predprob1<thresh[j],"no","yes")
xx <- xtabs(~long.landing+pp,faa.clean.m1)
specificity1[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity1[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}

risky.landing.final.model <- model2.BIC
predprob2 <- predict(risky.landing.final.model,type='response')
predout2 <- ifelse(predprob2<0.5,'no','yes')
faa.clean.m2 <- data.frame(faa.clean, predprob2,predout2)
thresh <- seq(0.01,0.5,0.01)
sensitivity2 <- specificity2 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m2$predprob2<thresh[j],"no","yes")
xx <- xtabs(~risky.landing+pp,faa.clean.m2)
specificity2[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity2[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}

#matplot(thresh,cbind(sensitivity1,specificity1),type ='l',xlab="Threshold",ylab='Proportion',lty=1:2)
plot(1-specificity1,sensitivity1,type = 'l',col = 'red',xlab = '1- specificity', ylab = 'sensitivity');#abline(0,1,lty=2)
lines(1-specificity2,sensitivity2,col="green")
legend(0.04, 0.98, legend=c("Long Landing Model", "Risky Landing model"),
       col=c("red", "green"), lty=1:2, cex=0.8)

#install.packages('pROC')
# area under curve for long landing model
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
pred1 <- predict(long.landing.final.model)
roc_obj <- roc(faa.clean$long.landing, pred1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.9985
# area under curve for risky landing model
pred2 <- predict(risky.landing.final.model)
roc_obj2 <- roc(faa.clean$risky.landing, pred2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj2)
## Area under the curve: 0.9986

Step 13. Predict probability of long.landing and risky.landing

#long.landing
new.ind <- data.frame(aircraft = 1, duration = 200,no_pasg = 80, speed_ground = 115,speed_air = 120,height = 40,pitch = 4)
predict(long.landing.final.model,newdata = new.ind,type = 'response',se = T)
## $fit
## 1 
## 1 
## 
## $se.fit
##            1 
## 1.402991e-08 
## 
## $residual.scale
## [1] 1
round(c(1-1.96*1.402991e-08,1+ 1.96*1.402991e-08),3)
## [1] 1 1
##risky.landing
predict(risky.landing.final.model,newdata = new.ind,type = 'response',se = T)
## $fit
##         1 
## 0.9998136 
## 
## $se.fit
##            1 
## 0.0003894828 
## 
## $residual.scale
## [1] 1
round(c(0.9998136-1.96*1.071988e-13,0.9998136 + 1.96*1.071988e-13),3)
## [1] 1 1

Step 14.

#Using the risk factors identified using the model2.BIC(speed_ground + aircraft ) we build a probit model and complementary log-log link 
glm.probit <- glm(risky.landing~speed_ground + aircraft ,family = binomial(link = probit),data = faa.clean)
glm.clog <- glm(risky.landing~speed_ground + aircraft ,family = binomial(link = cloglog),data = faa.clean)
round(coef(model2.BIC),3)
##  (Intercept) speed_ground     aircraft 
##     -103.307        0.937        4.090
round(coef(glm.probit),3)
##  (Intercept) speed_ground     aircraft 
##      -59.420        0.539        2.402
round(coef(glm.clog),3)
##  (Intercept) speed_ground     aircraft 
##      -70.009        0.629        2.950

The two models- probit and cloglog are different from the logit model in terms of magnitudes of coefficients but same in sign.

risky.landing.logit.model <- model2.BIC
predprob2 <- predict(risky.landing.logit.model,type='response')
predout2 <- ifelse(predprob2<0.5,'no','yes')
faa.clean.m2 <- data.frame(faa.clean, predprob2,predout2)
thresh <- seq(0.01,0.5,0.01)
sensitivity2 <- specificity2 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m2$predprob2<thresh[j],"no","yes")
xx <- xtabs(~risky.landing+pp,faa.clean.m2)
specificity2[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity2[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}

risky.landing.probit.model <- glm.probit
predprob3 <- predict(risky.landing.probit.model,type='response')
predout3 <- ifelse(predprob3<0.5,'no','yes')
faa.clean.m3 <- data.frame(faa.clean, predprob3,predout3)
thresh <- seq(0.01,0.5,0.01)
sensitivity3 <- specificity3 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m3$predprob3<thresh[j],"no","yes")
xx <- xtabs(~risky.landing+pp,faa.clean.m3)
specificity3[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity3[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}

risky.landing.clog.model <- glm.clog
predprob4 <- predict(risky.landing.clog.model,type='response')
predout4 <- ifelse(predprob4<0.5,'no','yes')
faa.clean.m4 <- data.frame(faa.clean, predprob4,predout4)
thresh <- seq(0.01,0.5,0.01)
sensitivity4 <- specificity4 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m4$predprob4<thresh[j],"no","yes")
xx <- xtabs(~risky.landing+pp,faa.clean.m4)
specificity4[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity4[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}

#par(mfrow =c(1,2))
plot(1-specificity2,sensitivity2,type = 'l',col = 'red');
lines(1-specificity3,sensitivity3,col="green")
lines(1-specificity4,sensitivity4,col="blue")
legend(0.025, 0.98, legend=c("Logit model", "Probit Model", "CLog model"),
       col=c("red", "green","blue"), lty=1:2, cex=0.8)

We can see tha both the clog model and logit model overlap and have idential ROC curves

Step 16. Identify the top 5 risky landings(ran the below code with the given output. Knitting in R caused issues hence the code is commented below)

# logit <- predict(model2.BIC,type="response")
# probit <-  predict(glm.probit, type = "response")
# cloglog <- predict(glm.clog, type = "response")
# cat('logit top 5 records:\n ')
# tail(sort(p_logit),5);
# cat('probit top 5 records:\n ')
# tail(sort(p_probit),5);
# cat('clog top 5 records:\n ')
# tail(sort(p_cloglog),5)

#Output
# logit top 5 records:
#  400  66 317 373 166 
#   1   1   1   1   1 
# probit top 5 records:
#  373 396 400 421 662 
#   1   1   1   1   1 
# clog top 5 records:
#  662 688 770 784 788 
#   1   1   1   1   1 

The three models do not point to the same flights. Both logit and probit models predict flight#400 and 373 as risky landings

Step 17. Predictions on new data using probit and cloglog models

predict(glm.probit,newdata = new.ind,type = 'response',se = T)
## $fit
##         1 
## 0.9999996 
## 
## $se.fit
##            1 
## 2.179702e-06 
## 
## $residual.scale
## [1] 1
round(c(0.9999832-1.96*2.179702e-06,0.9999832 + 1.96*2.179702e-06 ),3)
## [1] 1 1
predict(glm.clog,newdata = new.ind,type = 'response',se = T)
## $fit
## 1 
## 1 
## 
## $se.fit
##            1 
## 2.574406e-16 
## 
## $residual.scale
## [1] 1
round(c(1-1.96*2.747415e-16,1 + 1.96*2.747415e-16),3)
## [1] 1 1

The fitted values in each of the logit, probit and clog models are equal to 1.