INTRODUCTION

  • Background: Flight landing.
  • Motivation: To reduce the risk of landing overrun.
  • Goal: To study what factors and how they would impact the landing distance of a commercial flight.

INITIAL EXPLORATION OF THE DATA

Step 1,2 - Reading the files

library(readxl)
library(dplyr)
library(ggplot2)
library(plyr)
library(tidyr)

#Importing Dataset(Excel)
FAA1 <- read_excel("C:/Users/Swagatam/Desktop/Statistical_Modeling/Week_1/FAA1.xls")
FAA2 <- read_excel("C:/Users/Swagatam/Desktop/Statistical_Modeling/Week_1/FAA2.xls")

#Checking Structure
str(FAA1)
## Classes 'tbl_df', 'tbl' and 'data.frame':    800 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
str(FAA2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    150 obs. of  7 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
#Merging the datasets
FAA<-bind_rows(FAA1,FAA2)
FAA.uni<-unique(FAA[,-2],incomparables = F)
FAA.final<-join(FAA.uni,FAA[,c("pitch","duration")] , type = "left",by="pitch",match="first")
#Structure of Combined Dataset
str(FAA.final)
## 'data.frame':    850 obs. of  8 variables:
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
summary(FAA.final)
##      pitch         aircraft            no_pasg      speed_ground   
##  Min.   :2.284   Length:850         Min.   :29.0   Min.   : 27.74  
##  1st Qu.:3.642   Class :character   1st Qu.:55.0   1st Qu.: 65.90  
##  Median :4.008   Mode  :character   Median :60.0   Median : 79.64  
##  Mean   :4.009                      Mean   :60.1   Mean   : 79.45  
##  3rd Qu.:4.377                      3rd Qu.:65.0   3rd Qu.: 92.06  
##  Max.   :5.927                      Max.   :87.0   Max.   :141.22  
##                                                                    
##    speed_air          height          distance          duration     
##  Min.   : 90.00   Min.   :-3.546   Min.   :  34.08   Min.   : 14.76  
##  1st Qu.: 96.25   1st Qu.:23.314   1st Qu.: 883.79   1st Qu.:119.49  
##  Median :101.15   Median :30.093   Median :1258.09   Median :153.95  
##  Mean   :103.80   Mean   :30.144   Mean   :1526.02   Mean   :154.01  
##  3rd Qu.:109.40   3rd Qu.:36.993   3rd Qu.:1936.95   3rd Qu.:188.91  
##  Max.   :141.72   Max.   :59.946   Max.   :6533.05   Max.   :305.62  
##  NA's   :642                                         NA's   :50
FAA.final<-FAA.final%>%
  filter(duration>40|is.na(duration))

FAA.final<-FAA.final%>%
  filter(height>=6|is.na(height))

FAA.final<-FAA.final%>%
  filter(speed_ground>=30 & speed_ground<=140)

FAA.final<-FAA.final%>%
  filter(distance<6000)
str(FAA.final)
## 'data.frame':    831 obs. of  8 variables:
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
summary(FAA.final)
##      pitch         aircraft            no_pasg       speed_ground   
##  Min.   :2.284   Length:831         Min.   :29.00   Min.   : 33.57  
##  1st Qu.:3.640   Class :character   1st Qu.:55.00   1st Qu.: 66.20  
##  Median :4.001   Mode  :character   Median :60.00   Median : 79.79  
##  Mean   :4.005                      Mean   :60.06   Mean   : 79.54  
##  3rd Qu.:4.370                      3rd Qu.:65.00   3rd Qu.: 91.91  
##  Max.   :5.927                      Max.   :87.00   Max.   :132.78  
##                                                                     
##    speed_air          height          distance          duration     
##  Min.   : 90.00   Min.   : 6.228   Min.   :  41.72   Min.   : 41.95  
##  1st Qu.: 96.23   1st Qu.:23.530   1st Qu.: 893.28   1st Qu.:119.63  
##  Median :101.12   Median :30.167   Median :1262.15   Median :154.28  
##  Mean   :103.48   Mean   :30.458   Mean   :1522.48   Mean   :154.78  
##  3rd Qu.:109.36   3rd Qu.:37.004   3rd Qu.:1936.63   3rd Qu.:189.66  
##  Max.   :132.91   Max.   :59.946   Max.   :5381.96   Max.   :305.62  
##  NA's   :628                                         NA's   :50
#Step 1
FAA.final<-FAA.final%>%
  mutate(FAA.final, long.landing = ifelse(distance > 2500, 1, 0))

FAA.final<-FAA.final%>%
  mutate(FAA.final, risky.landing = ifelse(distance > 3000, 1, 0))

FAA.final<-FAA.final%>%
  select(-distance)
  • Used the ‘unique’ function to remove duplicate values. Hence the final dataset had 850 observations
  • The are 6 variables that might affect the landing distance of major air carriers
  • The distance averages 1500 feet and has a median value of 1258 feet
  • In our further analysis we will try to filter out values that cannot be right for practical analysis. Some of them are mentioned below:
    • Duration being less than 40 min
    • It should be at least 6m high at the threshold of the runway
    • Ground speed should be between 30 and 140 kmph
  • After the removal of abnormal values, the ranges and means have changed. The ground speed variable now falls in between 34 to 133.

DATA CLEANING AND FURTHER EXPLORATION

Step 2, 3

ggplot(FAA.final,aes(long.landing))+geom_histogram(bins=3,fill="red")

l<-lapply( FAA.final[,c(-8,-9)], function(x) summary(glm(long.landing ~ x,data=FAA.final,family=binomial))$coefficients[2,c(1,4)])
l1<-data.frame(l)
l2<-t(l1)
l3<-data.frame(l2)
l3<-l3%>%
  mutate(Direction=ifelse(Estimate>=0,"Positive","Negative"))
l3<-l3%>%
  mutate(OddsRatio=exp(Estimate))
l3
##                  Estimate     Pr...z.. Direction OddsRatio
## pitch         0.400527824 4.664982e-02  Positive 1.4926123
## aircraft      0.864119860 8.398591e-05  Positive 2.3729167
## no_pasg      -0.007256406 6.058565e-01  Negative 0.9927699
## speed_ground  0.472345752 3.935339e-14  Positive 1.6037518
## speed_air     0.512321766 4.334124e-11  Positive 1.6691621
## height        0.008623997 4.218576e-01  Positive 1.0086613
## duration     -0.001070492 6.305122e-01  Negative 0.9989301
  • The significant factors are ground speed, air speed,pitch and aircraft make

HISTOGRAMS

Step 4

#Significant Factors: Speed_Ground,Speed_Air,Aircraft
#Pitch
ggplot(FAA.final, aes(x=pitch, fill=as.factor(long.landing))) +
  geom_histogram(position="dodge")+
  theme(legend.position="top")

#Speed_Air
ggplot(FAA.final, aes(x=speed_air, fill=as.factor(long.landing))) +
  geom_histogram(position="dodge")+
  theme(legend.position="top")

#Speed_Ground
ggplot(FAA.final, aes(x=speed_ground, fill=as.factor(long.landing))) +
  geom_histogram(position="dodge")+
  theme(legend.position="top")

* The long landing =0 data for pitch has a normal distribution * The long landing =0 data for speed_air has a right skewed distribution * The long landing =0 data for speed_ground has a right skewed distribution and follows speed_air due to high collinearity

FULL MODEL

Step 5

full.model <- glm(long.landing ~ .,data=FAA.final,family=binomial)
summary(full.model)     
## 
## Call:
## glm(formula = long.landing ~ ., family = binomial, data = FAA.final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.48513  -0.01382   0.00000   0.00000   1.56909  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -1.959e+02  5.627e+01  -3.481 0.000499 ***
## pitch           1.463e+00  1.057e+00   1.384 0.166281    
## aircraftboeing  8.766e+00  2.628e+00   3.335 0.000852 ***
## no_pasg        -7.327e-02  7.015e-02  -1.044 0.296317    
## speed_ground   -2.247e-01  3.842e-01  -0.585 0.558636    
## speed_air       1.980e+00  7.098e-01   2.790 0.005277 ** 
## height          4.216e-01  1.431e-01   2.946 0.003221 ** 
## duration        3.121e-04  1.046e-02   0.030 0.976209    
## risky.landing   1.113e+01  2.124e+03   0.005 0.995819    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 270.199  on 194  degrees of freedom
## Residual deviance:  32.898  on 186  degrees of freedom
##   (636 observations deleted due to missingness)
## AIC: 50.898
## 
## Number of Fisher Scoring iterations: 20
#Encoding
FAA.final<-FAA.final%>%
  mutate(aircraft.binary=ifelse(aircraft=="airbus",0,1))
#Remove Speed_Air
FAA.final.new<-FAA.final%>%
  select(-speed_air)
good.model<-glm(long.landing ~pitch+aircraft.binary+speed_ground,data=FAA.final.new,family=binomial)
summary(good.model)
## 
## Call:
## glm(formula = long.landing ~ pitch + aircraft.binary + speed_ground, 
##     family = binomial, data = FAA.final.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.11589  -0.01116  -0.00026   0.00000   2.40741  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -67.92855   10.48408  -6.479 9.22e-11 ***
## pitch             1.06599    0.60389   1.765   0.0775 .  
## aircraft.binary   3.04348    0.73345   4.150 3.33e-05 ***
## speed_ground      0.61471    0.09184   6.694 2.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  81.309  on 827  degrees of freedom
## AIC: 89.309
## 
## Number of Fisher Scoring iterations: 10
  • Due to high number of missing values in the speed_air variable and its high collinearity with speed_ground,we decided to perform further cleaning and removal of this variable

STEP MODEL AIC

Step 6

null.model.new<- glm(long.landing ~ 1,data=FAA.final.new,family=binomial)
full.model.new <- glm(long.landing ~ .,data=FAA.final.new,family=binomial)
model.AIC<-step(null.model.new,scope=list(lower=null.model.new, upper=full.model.new),trace=0,direction = "forward")
summary(model.AIC)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height + 
##     pitch, family = binomial, data = FAA.final.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.20284  -0.00054   0.00000   0.00000   2.35719  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -119.77598   24.41821  -4.905 9.33e-07 ***
## speed_ground      1.02266    0.20290   5.040 4.65e-07 ***
## aircraftboeing    5.13443    1.18091   4.348 1.37e-05 ***
## height            0.25795    0.06861   3.760  0.00017 ***
## pitch             1.53751    0.84109   1.828  0.06755 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  53.204  on 826  degrees of freedom
## AIC: 63.204
## 
## Number of Fisher Scoring iterations: 12
#only aircraft and speed_ground

STEP MODEL BIC

Step 7

summary(full.model.new)
## 
## Call:
## glm(formula = long.landing ~ ., family = binomial, data = FAA.final.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.10283  -0.00089   0.00000   0.00000   2.21181  
## 
## Coefficients: (1 not defined because of singularities)
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -1.131e+02  2.399e+01  -4.715 2.42e-06 ***
## pitch            1.197e+00  8.521e-01   1.404  0.16019    
## aircraftboeing   4.994e+00  1.189e+00   4.200 2.67e-05 ***
## no_pasg          9.929e-03  5.550e-02   0.179  0.85803    
## speed_ground     9.632e-01  2.001e-01   4.815 1.47e-06 ***
## height           2.356e-01  7.174e-02   3.284  0.00102 ** 
## duration         5.393e-03  7.649e-03   0.705  0.48077    
## risky.landing    1.522e+01  2.566e+03   0.006  0.99527    
## aircraft.binary         NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 597.692  on 780  degrees of freedom
## Residual deviance:  50.718  on 773  degrees of freedom
##   (50 observations deleted due to missingness)
## AIC: 66.718
## 
## Number of Fisher Scoring iterations: 20
model.BIC<-step(null.model.new,scope=list(lower=null.model.new, upper=full.model.new),k=log(nrow(FAA.final.new)))
## Start:  AIC=629.5
## long.landing ~ 1
## 
##                   Df Deviance    AIC
## + speed_ground     1   107.40 145.93
## + risky.landing    1   309.08 347.61
## + aircraft         1   583.49 622.02
## + aircraft.binary  1   583.49 622.02
## <none>                 597.69 629.50
## + pitch            1   595.08 633.61
## + height           1   597.29 635.82
## + no_pasg          1   597.46 635.99
## + duration         1   597.46 635.99
## 
## Step:  AIC=128.92
## long.landing ~ speed_ground
## 
##                   Df Deviance    AIC
## + aircraft         1    78.16 106.40
## + aircraft.binary  1    78.16 106.40
## + height           1    95.06 123.30
## + pitch            1    97.01 125.24
## <none>                 115.47 128.92
## + risky.landing    1   104.66 132.90
## + duration         1   107.30 135.53
## + no_pasg          1   107.37 135.61
## - speed_ground     1   622.78 629.50
## 
## Step:  AIC=104.83
## long.landing ~ speed_ground + aircraft
## 
##                 Df Deviance    AIC
## + height         1    54.40  87.79
## <none>                84.66 104.83
## + pitch          1    75.18 108.57
## + duration       1    76.64 110.03
## + risky.landing  1    77.65 111.04
## + no_pasg        1    77.82 111.22
## - aircraft       1   115.47 128.92
## - speed_ground   1   606.55 620.00
## 
## Step:  AIC=83.94
## long.landing ~ speed_ground + aircraft + height
## 
##                 Df Deviance    AIC
## <none>                57.05  83.94
## + pitch          1    51.58  87.84
## + risky.landing  1    53.63  89.89
## + duration       1    53.68  89.94
## + no_pasg        1    54.40  90.66
## - height         1    84.66 104.83
## - aircraft       1   100.46 120.63
## - speed_ground   1   605.79 625.96
summary(model.BIC)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height, 
##     family = binomial, data = FAA.final.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.43442  -0.00117   0.00000   0.00000   2.57435  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -102.95437   19.22882  -5.354 8.59e-08 ***
## speed_ground      0.92657    0.17242   5.374 7.70e-08 ***
## aircraftboeing    5.04813    1.11520   4.527 5.99e-06 ***
## height            0.23106    0.05959   3.877 0.000106 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  57.047  on 827  degrees of freedom
## AIC: 65.047
## 
## Number of Fisher Scoring iterations: 11
#Keep aircraft,speed_ground,height
  • Model: long.landing =-102.95+0.93(speed_ground)+5.04(aircraft)+0.23(height)
  • AIC - 65.047
  • Fisher Scoring Iterations - 11
  • No. of Variables - 3
  • The slopes of the predictors are positive showing an relation in same direction to the response
  • This model has the lowest AIC and also eliminates too many predictors
  • Selecting this as the Best Model

RISKY LANDINGS

Step 2

ggplot(FAA.final,aes(risky.landing))+geom_histogram(bins=3,fill="red")

RISKY LANDINGS

Step 3

l.risk<-lapply( FAA.final[,c(-8,-9,-10)], function(x) summary(glm(risky.landing ~ x,data=FAA.final,family=binomial))$coefficients[2,c(1,4)])
l1.risk<-data.frame(l.risk)
l2.risk<-t(l1.risk)
l3.risk<-data.frame(l2.risk)
l3.risk<-l3.risk%>%
  mutate(Direction=ifelse(Estimate>=0,"Positive","Negative"))
l3.risk<-l3.risk%>%
  mutate(OddsRatio=exp(Estimate))
l3.risk
##                  Estimate     Pr...z.. Direction OddsRatio
## pitch         0.371071969 1.432961e-01  Positive 1.4492874
## aircraft      1.001775330 4.560563e-04  Positive 2.7231120
## no_pasg      -0.025379344 1.536237e-01  Negative 0.9749400
## speed_ground  0.614218747 6.898006e-08  Positive 1.8482121
## speed_air     0.870401902 3.728032e-06  Positive 2.3878703
## height       -0.002218606 8.705917e-01  Negative 0.9977839
## duration     -0.001151836 6.801987e-01  Negative 0.9988488
  • Note: The significant factors for risky landings are: aircraft make,ground and air speed

RISKY LANDINGS

Step 4

#Significant Factors:Speed_Ground,Speed_Air,Aircraft

#Speed_Air
ggplot(FAA.final, aes(x=speed_air, fill=as.factor(risky.landing))) +
  geom_histogram(position="dodge")+
  theme(legend.position="top")

#Speed_Ground
ggplot(FAA.final, aes(x=speed_ground, fill=as.factor(risky.landing))) +
  geom_histogram(position="dodge")+
  theme(legend.position="top")

* The Speed_Air Variable is Right Skewed * The Speed_Air Variable is Normally distributed for risky.landing=0

FULL MODEL RISKY

Step 5

full.model.risk <- glm(risky.landing ~ .,data=FAA.final,family=binomial)
summary(full.model.risk)    
## 
## Call:
## glm(formula = risky.landing ~ ., family = binomial, data = FAA.final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -1.97055   0.00000   0.00000   0.00001   2.22865  
## 
## Coefficients: (1 not defined because of singularities)
##                   Estimate Std. Error z value Pr(>|z|)  
## (Intercept)     -1.613e+02  3.289e+03  -0.049   0.9609  
## pitch           -1.328e+00  1.435e+00  -0.925   0.3549  
## aircraftboeing   7.217e+00  3.034e+00   2.378   0.0174 *
## no_pasg         -1.171e-01  9.731e-02  -1.203   0.2289  
## speed_ground    -1.770e-01  5.059e-01  -0.350   0.7264  
## speed_air        1.615e+00  6.586e-01   2.452   0.0142 *
## height           4.372e-02  5.844e-02   0.748   0.4543  
## duration         2.012e-03  1.587e-02   0.127   0.8991  
## long.landing     1.358e+01  3.289e+03   0.004   0.9967  
## aircraft.binary         NA         NA      NA       NA  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 240.724  on 194  degrees of freedom
## Residual deviance:  22.095  on 186  degrees of freedom
##   (636 observations deleted due to missingness)
## AIC: 40.095
## 
## Number of Fisher Scoring iterations: 21
good.model.risk<-glm(risky.landing ~ aircraft.binary+speed_ground,data=FAA.final.new,family=binomial)
summary(good.model)
## 
## Call:
## glm(formula = long.landing ~ pitch + aircraft.binary + speed_ground, 
##     family = binomial, data = FAA.final.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.11589  -0.01116  -0.00026   0.00000   2.40741  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -67.92855   10.48408  -6.479 9.22e-11 ***
## pitch             1.06599    0.60389   1.765   0.0775 .  
## aircraft.binary   3.04348    0.73345   4.150 3.33e-05 ***
## speed_ground      0.61471    0.09184   6.694 2.18e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  81.309  on 827  degrees of freedom
## AIC: 89.309
## 
## Number of Fisher Scoring iterations: 10
  • Note: We would be removing the speed_air variable due to high number of missing values and high collinearity with speed_ground

AIC RISKY

Step 6

null.model.new.risk<- glm(risky.landing ~ 1,data=FAA.final.new,family=binomial)
full.model.new.risk<- glm(risky.landing ~ .,data=FAA.final.new,family=binomial)
model.AIC.risk<-step(null.model.new.risk,scope=list(lower=null.model.new.risk, upper=full.model.new.risk),trace=0,direction = "forward")
summary(model.AIC.risk)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg, 
##     family = binomial, data = FAA.final.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.33913  -0.00009   0.00000   0.00000   1.87810  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -99.90780   25.57993  -3.906 9.39e-05 ***
## speed_ground     0.94963    0.23559   4.031 5.56e-05 ***
## aircraftboeing   4.64188    1.47520   3.147  0.00165 ** 
## no_pasg         -0.08462    0.05732  -1.476  0.13987    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  37.707  on 827  degrees of freedom
## AIC: 45.707
## 
## Number of Fisher Scoring iterations: 12
#only aircraft and speed_ground

BIC RISKY

Step 7

model.BIC.risk<-step(null.model.new.risk,scope=list(lower=null.model.new.risk, upper=full.model.new.risk),k=log(nrow(FAA.final.new)))
## Start:  AIC=442.77
## risky.landing ~ 1
## 
##                   Df Deviance    AIC
## + speed_ground     1    57.99  84.26
## + long.landing     1   134.60 160.87
## + aircraft         1   412.07 438.34
## + aircraft.binary  1   412.07 438.34
## <none>                 423.22 442.77
## + no_pasg          1   421.18 447.45
## + pitch            1   421.54 447.82
## + duration         1   423.04 449.32
## + height           1   423.13 449.40
## 
## Step:  AIC=72.38
## risky.landing ~ speed_ground
## 
##                   Df Deviance    AIC
## + aircraft         1    39.96  61.07
## + aircraft.binary  1    39.96  61.07
## <none>                  58.93  72.38
## + pitch            1    51.63  72.74
## + long.landing     1    53.53  74.64
## + no_pasg          1    57.18  78.29
## + height           1    57.79  78.90
## + duration         1    57.95  79.06
## - speed_ground     1   436.04 442.77
## 
## Step:  AIC=60.26
## risky.landing ~ speed_ground + aircraft
## 
##                Df Deviance    AIC
## <none>               40.10  60.26
## + no_pasg       1    37.56  64.59
## + height        1    39.30  66.33
## + long.landing  1    39.46  66.49
## + duration      1    39.76  66.79
## + pitch         1    39.78  66.81
## - aircraft      1    58.93  72.38
## - speed_ground  1   422.74 436.18
summary(model.BIC.risk)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial, 
##     data = FAA.final.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.24398  -0.00011   0.00000   0.00000   1.61021  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -102.0772    24.7751  -4.120 3.79e-05 ***
## speed_ground      0.9263     0.2248   4.121 3.78e-05 ***
## aircraftboeing    4.0190     1.2494   3.217   0.0013 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  40.097  on 828  degrees of freedom
## AIC: 46.097
## 
## Number of Fisher Scoring iterations: 12
#only aircraft and speed_ground
  • Model: risky.landing =-102.07+0.93(speed_ground)+4.02(aircraft)
  • AIC: 46.1
  • Fisher Scoring Iterations - 12
  • Number of Variables - 2
  • The slopes of the predictors are positive showing an relation in same direction to the response
  • This model has the lowest AIC and also eliminates too many predictors (It has only 2)
  • Selecting this as the best model
  • Aircraft make plays a big role in estimation as it has a larger coefficient

COMPARING MODELS FOR RISKY AND LONG LANDINGS

Step 12

mylogit <- glm(formula = long.landing ~ speed_ground + aircraft, family = binomial, 
               data = FAA.final.new)
mylogit.risky <- glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial, 
               data = FAA.final.new)
summary(mylogit)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft, family = binomial, 
##     data = FAA.final.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.28368  -0.01418  -0.00039   0.00000   2.56541  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -60.77049    8.67075  -7.009 2.41e-12 ***
## speed_ground     0.58534    0.08441   6.934 4.08e-12 ***
## aircraftboeing   3.23679    0.71189   4.547 5.45e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  84.665  on 828  degrees of freedom
## AIC: 90.665
## 
## Number of Fisher Scoring iterations: 10
summary(mylogit.risky)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial, 
##     data = FAA.final.new)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.24398  -0.00011   0.00000   0.00000   1.61021  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -102.0772    24.7751  -4.120 3.79e-05 ***
## speed_ground      0.9263     0.2248   4.121 3.78e-05 ***
## aircraftboeing    4.0190     1.2494   3.217   0.0013 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  40.097  on 828  degrees of freedom
## AIC: 46.097
## 
## Number of Fisher Scoring iterations: 12
library(ROCR)
#Long_AUC
pred1 <- prediction(predict(mylogit), FAA.final.new$long.landing)
perf1 <- performance(pred1,"tpr","fpr")
plot(perf1,colorize=TRUE)

unlist(slot(performance(pred1, "auc"), "y.values"))
## [1] 0.9964526
#Risky_AUC
pred.risky <- prediction(predict(mylogit.risky ), FAA.final.new$risky.landing)
perf.risky <- performance(pred.risky ,"tpr","fpr")
plot(perf.risky ,colorize=TRUE)

unlist(slot(performance(pred.risky , "auc"), "y.values"))
## [1] 0.9986161
  • The Risky Landing Model has a lower AIC value
  • It also has lesser number of variables(2)
  • The Risky and Long Landings AUC are almost same and are equal to 1
  • The True Positive Rate determination is almost accurate suggesting a good model

PREDICTIONS and CI

Step 13

new.data <- data.frame(aircraft = 'boeing', duration=200, no_pasg=80, speed_ground=115,
                 speed_air=120,
                 height=40, pitch=4)
#Long_Prob
prob.long <- predict(mylogit, newdata=new.data, type="response",se=T)
CI.long <- c((prob.long$fit-
                (1.96*prob.long$se.fit)),(prob.long$fit+(1.96*prob.long$se.fit)))
#Risky_Prob
prob.risky <- predict(mylogit.risky, newdata=new.data, type="response",se=T)
CI.risky <- c((prob.risky$fit-
                (1.96*prob.risky$se.fit)),(prob.risky$fit+(1.96*prob.risky$se.fit)))
prob.long
## $fit
##         1 
## 0.9999434 
## 
## $se.fit
##            1 
## 8.630536e-05 
## 
## $residual.scale
## [1] 1
CI.long
##         1         1 
## 0.9997743 1.0001126
prob.risky
## $fit
##        1 
## 0.999789 
## 
## $se.fit
##            1 
## 0.0004408114 
## 
## $residual.scale
## [1] 1
CI.risky
##        1        1 
## 0.998925 1.000653

Note: * The new data point was fitted into the model. * The results for the Probabilities , Standard Errors and Confidence Intervals are reported below:

For LONG LANDINGS * $fit - 0.9999434 * $se.fit - 8.630536e-05 * CI.long: 0.9997743 to 1.0001126

For RISKY LANDINGS * $fit - 0.999789 * $se.fit - 0.0004408114 * CI.risky: 0.998925 to 1.000653