#Loading required libraries

library(readxl)
library(tidyverse)
library(plyr)
library(dplyr)
library(MASS)
library(faraway)

Initial exploration of the data

getwd()
## [1] "/Users/puneetbhatia/Desktop/UC spring sem/Stat computing"
# Importing data

FAA1 <- read_excel("FAA1-1.xls") 
FAA2 <- read_excel("FAA2-1.xls") 

# Structure of datasets

str(FAA1)
## Classes 'tbl_df', 'tbl' and 'data.frame':    800 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
dim(unique(FAA1))
## [1] 800   8

FAA1 has 800 rows and 8 columns

str(FAA2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    150 obs. of  7 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
dim(unique(FAA2))
## [1] 150   7

FAA2 has 150 rows and 7 columns. ‘duration’ column is not present in FAA2, but it is present in FAA1.

# Appending the data

combined <- rbind.fill(FAA1,FAA2)

# Checking duplicates

dim(combined[,-2])
## [1] 950   7
dim(unique(combined[,-2]))
## [1] 850   7

We have 100 duplicate records.

# Removing duplicate records

unique_combined <- unique(combined[,-2]) %>% left_join(FAA1)
## Joining, by = c("aircraft", "no_pasg", "speed_ground", "speed_air", "height",
## "pitch", "distance")
# Structure of the new data set

str(unique_combined)
## 'data.frame':    850 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...

Dataset has 850 rows and 8 variables

#unique_combined$aircraft <- as.factor(unique_combined$aircraft)
# Summary of the combined data

summary(unique_combined)
##    aircraft            no_pasg      speed_ground      speed_air     
##  Length:850         Min.   :29.0   Min.   : 27.74   Min.   : 90.00  
##  Class :character   1st Qu.:55.0   1st Qu.: 65.90   1st Qu.: 96.25  
##  Mode  :character   Median :60.0   Median : 79.64   Median :101.15  
##                     Mean   :60.1   Mean   : 79.45   Mean   :103.80  
##                     3rd Qu.:65.0   3rd Qu.: 92.06   3rd Qu.:109.40  
##                     Max.   :87.0   Max.   :141.22   Max.   :141.72  
##                                                     NA's   :642     
##      height           pitch          distance          duration     
##  Min.   :-3.546   Min.   :2.284   Min.   :  34.08   Min.   : 14.76  
##  1st Qu.:23.314   1st Qu.:3.642   1st Qu.: 883.79   1st Qu.:119.49  
##  Median :30.093   Median :4.008   Median :1258.09   Median :153.95  
##  Mean   :30.144   Mean   :4.009   Mean   :1526.02   Mean   :154.01  
##  3rd Qu.:36.993   3rd Qu.:4.377   3rd Qu.:1936.95   3rd Qu.:188.91  
##  Max.   :59.946   Max.   :5.927   Max.   :6533.05   Max.   :305.62  
##                                                     NA's   :50
boxplot(unique_combined$duration~unique_combined$aircraft)

boxplot(unique_combined$speed_ground~unique_combined$aircraft)

boxplot(unique_combined$speed_air~unique_combined$aircraft)

boxplot(unique_combined$height~unique_combined$aircraft)

boxplot(unique_combined$pitch~unique_combined$aircraft)

boxplot(unique_combined$distance~unique_combined$aircraft)

boxplot(unique_combined$no_pasg~unique_combined$aircraft)

hist(unique_combined$distance)

Summary of findings

Data Cleaning and further exploration

# Checking abnormal values

unique_combined %>% filter(duration<40)  #5 observations with duration less than 40
unique_combined %>% filter(!(speed_ground >= 30 & speed_ground <= 140)) # 3 observations with abnormal speed_ground
unique_combined %>% filter(!(speed_air >= 30 & speed_air <= 140| is.na(speed_air))) # 1 observation with abnormal speed_air
unique_combined %>% filter(height<6)  #10 observations with abnormal height
unique_combined %>% filter(distance>6000)  #2 observations with abnormal distance
#Cleaning data based on requirements

FAA_final <- unique_combined %>% filter(duration>40 | is.na(duration),speed_ground >= 30 & speed_ground <= 140, speed_air >= 30 & speed_air <= 140 | is.na(speed_air), height >= 6, distance < 6000)

dim(FAA_final)
## [1] 831   8
# Encoding aircraft (airbus is coded as 1 and boeing as 0)
for(i in 1:831)
{
if(FAA_final$aircraft[i] == 'airbus') 
{
  FAA_final$aircraft[i] = 1
}

if(FAA_final$aircraft[i] == 'boeing') 
{
  FAA_final$aircraft[i] = 0
}

}

FAA_final$aircraft <- as.numeric(FAA_final$aircraft)

The dataset has 831 rows after removing the abnormal values. 19 rows have been removed.

Part 2. Practice of modeling a binary response using logistic regression.

Create binary responses

FAA_final$long.landing <- with(FAA_final, ifelse(distance > 2500,1, 0))
FAA_final$risky.landing <- with(FAA_final, ifelse(distance > 3000,1, 0))

FAA_final <- FAA_final[,-7]

Identifying important factors using the binary data of “long.landing”

hist(FAA_final$long.landing)

l1 <- glm(long.landing~aircraft,family = binomial,FAA_final)
summary(l1)
## 
## Call:
## glm(formula = long.landing ~ aircraft, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.6166  -0.6166  -0.4112  -0.4112   2.2416  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.5636     0.1344 -11.638  < 2e-16 ***
## aircraft     -0.8641     0.2197  -3.933  8.4e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.78  on 830  degrees of freedom
## Residual deviance: 606.55  on 829  degrees of freedom
## AIC: 610.55
## 
## Number of Fisher Scoring iterations: 5
d_1 <- 'negative'
c_1 <- summary(l1)$coefficients[,1][2]
o_1 <- exp(c_1)
p_1 <- summary(l1)$coefficients[,4][2]


l2 <- glm(long.landing~no_pasg,family = binomial,FAA_final)
summary(l2)
## 
## Call:
## glm(formula = long.landing ~ no_pasg, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5710  -0.5213  -0.5125  -0.5005   2.1056  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -1.520896   0.846824  -1.796   0.0725 .
## no_pasg     -0.007256   0.014063  -0.516   0.6059  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.78  on 830  degrees of freedom
## Residual deviance: 622.51  on 829  degrees of freedom
## AIC: 626.51
## 
## Number of Fisher Scoring iterations: 4
d_2 <- 'negative'
c_2 <- summary(l2)$coefficients[,1][2]
o_2 <- exp(c_2)
p_2 <- summary(l2)$coefficients[,4][2]


l3 <- glm(long.landing~speed_ground,family = binomial,FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(l3)
## 
## Call:
## glm(formula = long.landing ~ speed_ground, family = binomial, 
##     data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.51572  -0.03478  -0.00180  -0.00004   2.37848  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -47.96055    6.28136  -7.635 2.25e-14 ***
## speed_ground   0.47235    0.06245   7.563 3.94e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.78  on 830  degrees of freedom
## Residual deviance: 115.47  on 829  degrees of freedom
## AIC: 119.47
## 
## Number of Fisher Scoring iterations: 10
d_3 <- 'positive'
c_3 <- summary(l3)$coefficients[,1][2]
o_3 <- exp(c_3)
p_3 <- summary(l3)$coefficients[,4][2]


l4 <- glm(long.landing~speed_air,family = binomial,FAA_final)
summary(l4)
## 
## Call:
## glm(formula = long.landing ~ speed_air, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.55980  -0.35662   0.00067   0.20985   2.19936  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -51.98365    7.84323  -6.628 3.41e-11 ***
## speed_air     0.51232    0.07772   6.592 4.33e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 281.37  on 202  degrees of freedom
## Residual deviance: 102.33  on 201  degrees of freedom
##   (628 observations deleted due to missingness)
## AIC: 106.33
## 
## Number of Fisher Scoring iterations: 7
d_4 <- 'positive'
c_4 <- summary(l4)$coefficients[,1][2]
o_4 <- exp(c_4)
p_4 <- summary(l4)$coefficients[,4][2]


l5 <- glm(long.landing~height,family = binomial,FAA_final)
summary(l5)
## 
## Call:
## glm(formula = long.landing ~ height, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5782  -0.5244  -0.5097  -0.4916   2.1193  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.220918   0.349916  -6.347  2.2e-10 ***
## height       0.008624   0.010737   0.803    0.422    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.78  on 830  degrees of freedom
## Residual deviance: 622.13  on 829  degrees of freedom
## AIC: 626.13
## 
## Number of Fisher Scoring iterations: 4
d_5 <- 'positive'
c_5 <- summary(l5)$coefficients[,1][2]
o_5 <- exp(c_5)
p_5 <- summary(l5)$coefficients[,4][2]

l6 <- glm(long.landing~pitch,family = binomial,FAA_final)
summary(l6)
## 
## Call:
## glm(formula = long.landing ~ pitch, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.7248  -0.5381  -0.5006  -0.4580   2.2687  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -3.5764     0.8297  -4.310 1.63e-05 ***
## pitch         0.4005     0.2013   1.989   0.0466 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.78  on 830  degrees of freedom
## Residual deviance: 618.79  on 829  degrees of freedom
## AIC: 622.79
## 
## Number of Fisher Scoring iterations: 4
d_6 <- 'positive'
c_6 <- summary(l6)$coefficients[,1][2]
o_6 <- exp(c_6)
p_6 <- summary(l6)$coefficients[,4][2]


l7 <- glm(long.landing~duration,family = binomial,FAA_final)
summary(l7)
## 
## Call:
## glm(formula = long.landing ~ duration, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5535  -0.5312  -0.5210  -0.5097   2.0840  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.753699   0.356787  -4.915 8.87e-07 ***
## duration    -0.001070   0.002226  -0.481    0.631    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 597.69  on 780  degrees of freedom
## Residual deviance: 597.46  on 779  degrees of freedom
##   (50 observations deleted due to missingness)
## AIC: 601.46
## 
## Number of Fisher Scoring iterations: 4
d_7 <- 'negative'
c_7 <- summary(l7)$coefficients[,1][2]
o_7 <- exp(c_7)
p_7 <- summary(l7)$coefficients[,4][2]

names = colnames(FAA_final[c(1:7)])
coff_size = c(c_1,c_2,c_3,c_4,c_5,c_6,c_7)
odds_ratio = c(o_1,o_2,o_3,o_4,o_5,o_6,o_7)
dir=c(d_1,d_2,d_3,d_4,d_5,d_6,d_7)
pval=c(p_1,p_2,p_3,p_4,p_5,p_6,p_7)

Table1 <- data.frame(names,coff_size,odds_ratio,dir,pval)

Important factors - speed_ground, speed_air, aircraft, pitch

# Visualising the association of important factors

plot(jitter(long.landing,0.1)~jitter(speed_ground),FAA_final,xlab="speed_ground",ylab="long.landing",pch=".")

plot(jitter(long.landing,0.1)~jitter(speed_air),FAA_final,xlab="speed_air",ylab="long.landing",pch=".")

plot(jitter(long.landing,0.1)~jitter(aircraft),FAA_final,xlab="aircraft",ylab="long.landing",pch=".")

plot(long.landing~aircraft, FAA_final)

library(ggplot2)

ggplot(FAA_final,aes(x=pitch,fill=long.landing))+geom_histogram(position="dodge",binwidth=1)

plot(jitter(long.landing,0.1)~jitter(pitch),FAA_final,xlab="pitch",ylab="long.landing",pch=".")

plot(long.landing~pitch, FAA_final)

speed_ground and speed_air are positively associated with the response. It is not clear if aircraft and pitch are associated with the response or not

# Model with all the variables except speed_air and duration (speed_air has collinearity with speed_ground, duration has the least p value as seen from Table1. Also both have missing values so removing them from the model)

glm0 <- glm(long.landing~speed_ground + aircraft + pitch + no_pasg + height,family = binomial,FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm0)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + pitch + 
##     no_pasg + height, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.20264  -0.00054   0.00000   0.00000   2.35660  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.146e+02  2.355e+01  -4.869 1.12e-06 ***
## speed_ground  1.023e+00  2.036e-01   5.023 5.09e-07 ***
## aircraft     -5.134e+00  1.181e+00  -4.348 1.38e-05 ***
## pitch         1.538e+00  8.429e-01   1.825 0.068073 .  
## no_pasg       3.975e-04  5.400e-02   0.007 0.994127    
## height        2.578e-01  7.177e-02   3.592 0.000328 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  53.204  on 825  degrees of freedom
## AIC: 65.204
## 
## Number of Fisher Scoring iterations: 12
nothing <- glm(long.landing~1,family = binomial,FAA_final)




# Stepwise forward selection using AIC

model.AIC<-stepAIC(nothing,direction="forward",scope=list(upper=glm0,lower=nothing))
## Start:  AIC=624.78
## long.landing ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                Df Deviance    AIC
## + speed_ground  1   115.47 119.47
## + aircraft      1   606.55 610.55
## + pitch         1   618.79 622.79
## <none>              622.78 624.78
## + height        1   622.13 626.13
## + no_pasg       1   622.51 626.51
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=119.47
## long.landing ~ speed_ground
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##            Df Deviance     AIC
## + aircraft  1   84.665  90.665
## + height    1  100.459 106.459
## + pitch     1  105.527 111.527
## <none>         115.470 119.470
## + no_pasg   1  115.468 121.468
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=90.66
## long.landing ~ speed_ground + aircraft
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##           Df Deviance    AIC
## + height   1   57.047 65.047
## + pitch    1   81.309 89.309
## <none>         84.665 90.665
## + no_pasg  1   84.219 92.219
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=65.05
## long.landing ~ speed_ground + aircraft + height
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##           Df Deviance    AIC
## + pitch    1   53.204 63.204
## <none>         57.047 65.047
## + no_pasg  1   57.031 67.031
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=63.2
## long.landing ~ speed_ground + aircraft + height + pitch
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##           Df Deviance    AIC
## <none>         53.204 63.204
## + no_pasg  1   53.204 65.204
# Model selected - long.landing ~ speed_ground + aircraft + height + pitch
summary(model.AIC)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height + 
##     pitch, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.20284  -0.00054   0.00000   0.00000   2.35719  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -114.64155   23.52202  -4.874 1.09e-06 ***
## speed_ground    1.02266    0.20290   5.040 4.65e-07 ***
## aircraft       -5.13443    1.18091  -4.348 1.37e-05 ***
## height          0.25795    0.06861   3.760  0.00017 ***
## pitch           1.53751    0.84109   1.828  0.06755 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  53.204  on 826  degrees of freedom
## AIC: 63.204
## 
## Number of Fisher Scoring iterations: 12
# Stepwise forward selection using BIC

model.BIC<-step(nothing,direction="forward",scope=list(upper=glm0,lower=nothing), criterion = "BIC", k=log(nrow(FAA_final)))
## Start:  AIC=629.5
## long.landing ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                Df Deviance    AIC
## + speed_ground  1   115.47 128.92
## + aircraft      1   606.55 620.00
## <none>              622.78 629.50
## + pitch         1   618.79 632.24
## + height        1   622.13 635.58
## + no_pasg       1   622.51 635.96
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=128.92
## long.landing ~ speed_ground
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##            Df Deviance    AIC
## + aircraft  1   84.665 104.83
## + height    1  100.459 120.63
## + pitch     1  105.527 125.69
## <none>         115.470 128.92
## + no_pasg   1  115.468 135.64
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=104.83
## long.landing ~ speed_ground + aircraft
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##           Df Deviance     AIC
## + height   1   57.047  83.937
## <none>         84.665 104.832
## + pitch    1   81.309 108.200
## + no_pasg  1   84.219 111.110
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=83.94
## long.landing ~ speed_ground + aircraft + height
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##           Df Deviance    AIC
## <none>         57.047 83.937
## + pitch    1   53.204 86.817
## + no_pasg  1   57.031 90.644
# Model selected - long.landing ~ speed_ground + aircraft + height

summary(model.BIC)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height, 
##     family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.43442  -0.00117   0.00000   0.00000   2.57435  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -97.90623   18.34950  -5.336 9.52e-08 ***
## speed_ground   0.92657    0.17242   5.374 7.70e-08 ***
## aircraft      -5.04813    1.11520  -4.527 5.99e-06 ***
## height         0.23106    0.05959   3.877 0.000106 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  57.047  on 827  degrees of freedom
## AIC: 65.047
## 
## Number of Fisher Scoring iterations: 11

EXECUTIVE SUMMARY

Final Model for Long Landing

# Summary of the final model

summary(model.AIC)
## 
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height + 
##     pitch, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.20284  -0.00054   0.00000   0.00000   2.35719  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -114.64155   23.52202  -4.874 1.09e-06 ***
## speed_ground    1.02266    0.20290   5.040 4.65e-07 ***
## aircraft       -5.13443    1.18091  -4.348 1.37e-05 ***
## height          0.25795    0.06861   3.760  0.00017 ***
## pitch           1.53751    0.84109   1.828  0.06755 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  53.204  on 826  degrees of freedom
## AIC: 63.204
## 
## Number of Fisher Scoring iterations: 12
c_1 <- summary(model.AIC)$coefficients[,1][2]
o_1 <- exp(c_1)
p_1 <- summary(model.AIC)$coefficients[,4][2]

c_2 <- summary(model.AIC)$coefficients[,1][3]
o_2 <- exp(c_2)
p_2 <- summary(model.AIC)$coefficients[,4][3]

c_3 <- summary(model.AIC)$coefficients[,1][4]
o_3 <- exp(c_3)
p_3 <- summary(model.AIC)$coefficients[,4][4]

c_4 <- summary(model.AIC)$coefficients[,1][5]
o_4 <- exp(c_4)
p_4 <- summary(model.AIC)$coefficients[,4][5]

names = c('speed_ground','aircraft','height','pitch')
coff_size = c(c_1,c_2,c_3,c_4)
odds_ratio = c(o_1,o_2,o_3,o_4)
pval=c(p_1,p_2,p_3,p_4)

Long_Landing_Table <- data.frame(names,coff_size,odds_ratio,pval)
Long_Landing_Table
#Plots

plot(jitter(long.landing,0.1)~jitter(speed_ground),FAA_final,xlab="speed_ground",ylab="long.landing",pch=".")

plot(jitter(long.landing,0.1)~jitter(aircraft),FAA_final,xlab="aircraft",ylab="long.landing",pch=".")

plot(jitter(long.landing,0.1)~jitter(pitch),FAA_final,xlab="pitch",ylab="long.landing",pch=".")

Identifying important factors using the binary data of “risky.landing”.

hist(FAA_final$risky.landing)

l1 <- glm(risky.landing~aircraft,family = binomial,FAA_final)
summary(l1)
## 
## Call:
## glm(formula = risky.landing ~ aircraft, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4793  -0.4793  -0.2958  -0.2958   2.5105  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -2.1059     0.1634 -12.886  < 2e-16 ***
## aircraft     -1.0018     0.2858  -3.505 0.000456 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.04  on 830  degrees of freedom
## Residual deviance: 422.74  on 829  degrees of freedom
## AIC: 426.74
## 
## Number of Fisher Scoring iterations: 5
d_1 <- 'negative'
c_1 <- summary(l1)$coefficients[,1][2]
o_1 <- exp(c_1)
p_1 <- summary(l1)$coefficients[,4][2]


l2 <- glm(risky.landing~no_pasg,family = binomial,FAA_final)
summary(l2)
## 
## Call:
## glm(formula = risky.landing ~ no_pasg, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5628  -0.4073  -0.3831  -0.3582   2.4815  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.02673    1.05514  -0.973    0.331
## no_pasg     -0.02538    0.01779  -1.427    0.154
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.04  on 830  degrees of freedom
## Residual deviance: 434.00  on 829  degrees of freedom
## AIC: 438
## 
## Number of Fisher Scoring iterations: 5
d_2 <- 'negative'
c_2 <- summary(l2)$coefficients[,1][2]
o_2 <- exp(c_2)
p_2 <- summary(l2)$coefficients[,4][2]


l3 <- glm(risky.landing~speed_ground,family = binomial,FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(l3)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground, family = binomial, 
##     data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.53709  -0.00383  -0.00009   0.00000   1.95417  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -66.1243    12.2025  -5.419  6.0e-08 ***
## speed_ground   0.6142     0.1139   5.394  6.9e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  58.931  on 829  degrees of freedom
## AIC: 62.931
## 
## Number of Fisher Scoring iterations: 11
d_3 <- 'positive'
c_3 <- summary(l3)$coefficients[,1][2]
o_3 <- exp(c_3)
p_3 <- summary(l3)$coefficients[,4][2]


l4 <- glm(risky.landing~speed_air,family = binomial,FAA_final)
summary(l4)
## 
## Call:
## glm(formula = risky.landing ~ speed_air, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.15041  -0.05679  -0.00616   0.00102   2.69736  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -93.5700    20.2180  -4.628 3.69e-06 ***
## speed_air     0.8704     0.1882   4.626 3.73e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 248.18  on 202  degrees of freedom
## Residual deviance:  44.58  on 201  degrees of freedom
##   (628 observations deleted due to missingness)
## AIC: 48.58
## 
## Number of Fisher Scoring iterations: 9
d_4 <- 'positive'
c_4 <- summary(l4)$coefficients[,1][2]
o_4 <- exp(c_4)
p_4 <- summary(l4)$coefficients[,4][2]


l5 <- glm(risky.landing~height,family = binomial,FAA_final)
summary(l5)
## 
## Call:
## glm(formula = risky.landing ~ height, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4007  -0.3931  -0.3901  -0.3868   2.3105  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.468143   0.433262  -5.697 1.22e-08 ***
## height      -0.002219   0.013619  -0.163    0.871    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.04  on 830  degrees of freedom
## Residual deviance: 436.02  on 829  degrees of freedom
## AIC: 440.02
## 
## Number of Fisher Scoring iterations: 5
d_5 <- 'negative'
c_5 <- summary(l5)$coefficients[,1][2]
o_5 <- exp(c_5)
p_5 <- summary(l5)$coefficients[,4][2]

l6 <- glm(risky.landing~pitch,family = binomial,FAA_final)
summary(l6)
## 
## Call:
## glm(formula = risky.landing ~ pitch, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5433  -0.4094  -0.3828  -0.3559   2.4829  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -4.0380     1.0461  -3.860 0.000113 ***
## pitch         0.3711     0.2535   1.464 0.143296    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.04  on 830  degrees of freedom
## Residual deviance: 433.89  on 829  degrees of freedom
## AIC: 437.89
## 
## Number of Fisher Scoring iterations: 5
d_6 <- 'positive'
c_6 <- summary(l6)$coefficients[,1][2]
o_6 <- exp(c_6)
p_6 <- summary(l6)$coefficients[,4][2]


l7 <- glm(risky.landing~duration,family = binomial,FAA_final)
summary(l7)
## 
## Call:
## glm(formula = risky.landing ~ duration, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.4253  -0.4067  -0.3985  -0.3898   2.3230  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -2.309326   0.446857  -5.168 2.37e-07 ***
## duration    -0.001152   0.002794  -0.412     0.68    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 423.22  on 780  degrees of freedom
## Residual deviance: 423.04  on 779  degrees of freedom
##   (50 observations deleted due to missingness)
## AIC: 427.04
## 
## Number of Fisher Scoring iterations: 5
d_7 <- 'negative'
c_7 <- summary(l7)$coefficients[,1][2]
o_7 <- exp(c_7)
p_7 <- summary(l7)$coefficients[,4][2]

names = colnames(FAA_final[c(1:7)])
coff_size = c(c_1,c_2,c_3,c_4,c_5,c_6,c_7)
odds_ratio = c(o_1,o_2,o_3,o_4,o_5,o_6,o_7)
dir=c(d_1,d_2,d_3,d_4,d_5,d_6,d_7)
pval=c(p_1,p_2,p_3,p_4,p_5,p_6,p_7)

Table2 <- data.frame(names,coff_size,odds_ratio,dir,pval)

Important factors - speed_ground, speed_air, aircraft, pitch

# Visualising the association of important factors

plot(jitter(risky.landing,0.1)~jitter(speed_ground),FAA_final,xlab="speed_ground",ylab="long.landing",pch=".")

plot(jitter(risky.landing,0.1)~jitter(speed_air),FAA_final,xlab="speed_air",ylab="long.landing",pch=".")

plot(jitter(risky.landing,0.1)~jitter(aircraft),FAA_final,xlab="aircraft",ylab="long.landing",pch=".")

plot(risky.landing~aircraft, FAA_final)

ggplot(FAA_final,aes(x=pitch,fill=risky.landing))+geom_histogram(position="dodge",binwidth=1)

plot(jitter(risky.landing,0.1)~jitter(pitch),FAA_final,xlab="pitch",ylab="long.landing",pch=".")

plot(risky.landing~pitch, FAA_final)

speed_ground and speed_air are positively associated with the response. It is not clear if aircraft and pitch are associated with the response or not

# Model with all the variables except speed_air and duration (speed_air has collinearity with speed_ground, duration has insignificant p value as seen from Table1. Also both have missing values so removing them from the model)

glm0 <- glm(risky.landing~speed_ground + aircraft + pitch + no_pasg + height,family = binomial,FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm0)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + pitch + 
##     no_pasg + height, family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.45728  -0.00009   0.00000   0.00000   1.87948  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -98.59616   25.69380  -3.837 0.000124 ***
## speed_ground   0.94620    0.24175   3.914 9.08e-05 ***
## aircraft      -4.45430    1.54406  -2.885 0.003917 ** 
## pitch          0.60040    0.77084   0.779 0.436041    
## no_pasg       -0.08726    0.05824  -1.498 0.134025    
## height         0.04376    0.04509   0.970 0.331814    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  36.478  on 825  degrees of freedom
## AIC: 48.478
## 
## Number of Fisher Scoring iterations: 12
nothing <- glm(risky.landing~1,family = binomial,FAA_final)




# Stepwise forward selection using AIC

model.AIC2<-stepAIC(nothing,direction="forward",scope=list(upper=glm0,lower=nothing))
## Start:  AIC=438.04
## risky.landing ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                Df Deviance    AIC
## + speed_ground  1    58.93  62.93
## + aircraft      1   422.74 426.74
## + pitch         1   433.89 437.89
## + no_pasg       1   434.00 438.00
## <none>              436.04 438.04
## + height        1   436.02 440.02
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=62.93
## risky.landing ~ speed_ground
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##            Df Deviance    AIC
## + aircraft  1   40.097 46.097
## + pitch     1   53.079 59.079
## <none>          58.931 62.931
## + no_pasg   1   58.318 64.318
## + height    1   58.667 64.667
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=46.1
## risky.landing ~ speed_ground + aircraft
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##           Df Deviance    AIC
## + no_pasg  1   37.707 45.707
## <none>         40.097 46.097
## + height   1   39.402 47.402
## + pitch    1   39.928 47.928
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=45.71
## risky.landing ~ speed_ground + aircraft + no_pasg
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##          Df Deviance    AIC
## <none>        37.707 45.707
## + height  1   37.099 47.099
## + pitch   1   37.449 47.449
# Model selected - risky.landing ~ speed_ground + aircraft + no_pasg
summary(model.AIC2)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg, 
##     family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.33913  -0.00009   0.00000   0.00000   1.87810  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -95.26592   24.49990  -3.888 0.000101 ***
## speed_ground   0.94963    0.23559   4.031 5.56e-05 ***
## aircraft      -4.64188    1.47520  -3.147 0.001652 ** 
## no_pasg       -0.08462    0.05732  -1.476 0.139866    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  37.707  on 827  degrees of freedom
## AIC: 45.707
## 
## Number of Fisher Scoring iterations: 12
# Stepwise forward selection using BIC

model.BIC2<-step(nothing,direction="forward",scope=list(upper=glm0,lower=nothing), criterion = "BIC", k=log(nrow(FAA_final)))
## Start:  AIC=442.77
## risky.landing ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##                Df Deviance    AIC
## + speed_ground  1    58.93  72.38
## + aircraft      1   422.74 436.18
## <none>              436.04 442.77
## + pitch         1   433.89 447.34
## + no_pasg       1   434.00 447.45
## + height        1   436.02 449.46
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=72.38
## risky.landing ~ speed_ground
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##            Df Deviance    AIC
## + aircraft  1   40.097 60.264
## <none>          58.931 72.376
## + pitch     1   53.079 73.247
## + no_pasg   1   58.318 78.486
## + height    1   58.667 78.835
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## 
## Step:  AIC=60.26
## risky.landing ~ speed_ground + aircraft
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##           Df Deviance    AIC
## <none>         40.097 60.264
## + no_pasg  1   37.707 64.597
## + height   1   39.402 66.292
## + pitch    1   39.928 66.819
# Model selected - risky.landing ~ speed_ground + aircraft 

summary(model.BIC2)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial, 
##     data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.24398  -0.00011   0.00000   0.00000   1.61021  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -98.0582    23.8303  -4.115 3.87e-05 ***
## speed_ground   0.9263     0.2248   4.121 3.78e-05 ***
## aircraft      -4.0190     1.2494  -3.217   0.0013 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  40.097  on 828  degrees of freedom
## AIC: 46.097
## 
## Number of Fisher Scoring iterations: 12

EXECUTIVE SUMMARY

Final Model for Risky Landing

# Summary of the final model

summary(model.AIC2)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg, 
##     family = binomial, data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.33913  -0.00009   0.00000   0.00000   1.87810  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -95.26592   24.49990  -3.888 0.000101 ***
## speed_ground   0.94963    0.23559   4.031 5.56e-05 ***
## aircraft      -4.64188    1.47520  -3.147 0.001652 ** 
## no_pasg       -0.08462    0.05732  -1.476 0.139866    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  37.707  on 827  degrees of freedom
## AIC: 45.707
## 
## Number of Fisher Scoring iterations: 12
c_1 <- summary(model.AIC2)$coefficients[,1][2]
o_1 <- exp(c_1)
p_1 <- summary(model.AIC2)$coefficients[,4][2]

c_2 <- summary(model.AIC2)$coefficients[,1][3]
o_2 <- exp(c_2)
p_2 <- summary(model.AIC2)$coefficients[,4][3]

c_3 <- summary(model.AIC2)$coefficients[,1][4]
o_3 <- exp(c_3)
p_3 <- summary(model.AIC2)$coefficients[,4][4]

names = c('speed_ground','aircraft','no_pasg')
coff_size = c(c_1,c_2,c_3)
odds_ratio = c(o_1,o_2,o_3)
pval=c(p_1,p_2,p_3)

Risky_Landing_Table <- data.frame(names,coff_size,odds_ratio,pval)
Risky_Landing_Table
#Plots

plot(jitter(risky.landing,0.1)~jitter(speed_ground),FAA_final,xlab="speed_ground",ylab="risky.landing",pch=".")

plot(jitter(risky.landing,0.1)~jitter(aircraft),FAA_final,xlab="aircraft",ylab="risky.landing",pch=".")

plot(jitter(risky.landing,0.1)~jitter(no_pasg),FAA_final,xlab="no_pasg",ylab="risky.landing",pch=".")

Comparing long.landing and risky.landing models

# ROC curve for long.landing

linpred_long.landing<-predict(model.AIC) ### Linear predictor
predprob_long.landing<-predict(model.AIC,type="response") ### predicted probabilities
predout_long.landing<-ifelse(predprob_long.landing<0.5,"0","1") ### Predicted outcomes using 0.5 as the threshold
FAA_final<-data.frame(FAA_final,predprob_long.landing,predout_long.landing)
xtabs(~long.landing+predout_long.landing,FAA_final)
##             predout_long.landing
## long.landing   0   1
##            0 723   5
##            1   6  97
thresh<-seq(0.01,0.5,0.01)
sensitivity<-specificity<-rep(NA,length(thresh))
for(j in seq(along=thresh)){
pp<-ifelse(FAA_final$predprob_long.landing<thresh[j],"0","1")
xx<-xtabs(~long.landing+pp,FAA_final)
specificity[j]<-xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity[j]<-xx[2,2]/(xx[2,1]+xx[2,2])
}
par(mfrow=c(1,2))
plot(1-specificity,sensitivity,type="l", col="red");abline(0,1,lty=2)

# ROC curve for risky.landing

linpred_risky.landing<-predict(model.AIC2) ### Linear predictor
predprob_risky.landing<-predict(model.AIC2,type="response") ### predicted probabilities
predout_risky.landing<-ifelse(predprob_risky.landing<0.5,"0","1") ### Predicted outcomes using 0.5 as the threshold
FAA_final<-data.frame(FAA_final,predprob_risky.landing,predout_risky.landing)
xtabs(~risky.landing+predout_risky.landing,FAA_final)
##              predout_risky.landing
## risky.landing   0   1
##             0 766   4
##             1   5  56
thresh<-seq(0.01,0.5,0.01)
sensitivity<-specificity<-rep(NA,length(thresh))
for(j in seq(along=thresh)){
pp<-ifelse(FAA_final$predprob_risky.landing<thresh[j],"0","1")
xx<-xtabs(~risky.landing+pp,FAA_final)
specificity[j]<-xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity[j]<-xx[2,2]/(xx[2,1]+xx[2,2])
}

lines(1-specificity,sensitivity,type="l", col="green")

We observe area under the curve (AUC) is more for risky landing model than long landing model.

Predicting Probabilities

new.ind<-
data.frame(aircraft=0,duration=200,no_pasg=80, speed_ground=115, speed_air=120,
height=40, pitch=4)

# Long Landing
### Predict the value for long landing
predict(model.AIC,newdata=new.ind,type="link") ### Linear predictor
##       1 
## 19.4327
predict(model.AIC,newdata=new.ind,type="response") ### Probability
## 1 
## 1
### Predict the value with its standard error
predict(model.AIC,newdata=new.ind,type="link",se=T)
## $fit
##       1 
## 19.4327 
## 
## $se.fit
## [1] 3.954009
## 
## $residual.scale
## [1] 1
round(ilogit(c(19.4327-1.96*3.954,19.4327+1.96*3.954)),3) ### Confidence interval
## [1] 1 1
predict(model.AIC,newdata=new.ind,type="response",se=T)
## $fit
## 1 
## 1 
## 
## $se.fit
##            1 
## 1.437223e-08 
## 
## $residual.scale
## [1] 1
round(c(1-1.96*1.437223e-08,1+1.96*1.437223e-08),3) ### Confidence interval
## [1] 1 1
# Risky Landing
### Predict the value for risky landing
predict(model.AIC2,newdata=new.ind,type="link") ### Linear predictor
##        1 
## 7.171796
predict(model.AIC2,newdata=new.ind,type="response") ### Probability
##         1 
## 0.9992326
### Predict the value with its standard error
predict(model.AIC2,newdata=new.ind,type="link",se=T)
## $fit
##        1 
## 7.171796 
## 
## $se.fit
## [1] 2.255343
## 
## $residual.scale
## [1] 1
round(ilogit(c(7.1717-1.96*2.255,7.1717+1.96*2.255)),3) ### Confidence interval
## [1] 0.94 1.00
predict(model.AIC2,newdata=new.ind,type="response",se=T)
## $fit
##         1 
## 0.9992326 
## 
## $se.fit
##           1 
## 0.001729316 
## 
## $residual.scale
## [1] 1
round(c(0.9992326-1.96*0.001729316 ,0.9992326+1.96*0.001729316),3) ### Confidence interval
## [1] 0.996 1.003

Compare models with different link functions

# Risky landing Probit Model

risky.probit<-
glm(risky.landing~speed_ground+aircraft+no_pasg,family=binomial(link=probit),FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(risky.probit)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg, 
##     family = binomial(link = probit), data = FAA_final)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.302   0.000   0.000   0.000   1.834  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -54.30276   12.86637  -4.221 2.44e-05 ***
## speed_ground   0.54152    0.12373   4.377 1.21e-05 ***
## aircraft      -2.70787    0.82130  -3.297 0.000977 ***
## no_pasg       -0.04849    0.03267  -1.484 0.137803    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  37.223  on 827  degrees of freedom
## AIC: 45.223
## 
## Number of Fisher Scoring iterations: 14
# Risky landing Hazard model with complementary log-log link

risky.cloglog<-
glm(risky.landing~speed_ground+aircraft+no_pasg,family=binomial(link=cloglog),FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(risky.cloglog)
## 
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg, 
##     family = binomial(link = cloglog), data = FAA_final)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.34112  -0.00177  -0.00004   0.00000   1.83591  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -63.03523   13.53092  -4.659 3.18e-06 ***
## speed_ground   0.62410    0.12931   4.826 1.39e-06 ***
## aircraft      -3.27406    0.89730  -3.649 0.000263 ***
## no_pasg       -0.05739    0.03608  -1.591 0.111648    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  39.313  on 827  degrees of freedom
## AIC: 47.313
## 
## Number of Fisher Scoring iterations: 13
# Comparing the coefficients of the 3 models

round(coef(model.AIC2),3)
##  (Intercept) speed_ground     aircraft      no_pasg 
##      -95.266        0.950       -4.642       -0.085
round(coef(risky.probit),3)
##  (Intercept) speed_ground     aircraft      no_pasg 
##      -54.303        0.542       -2.708       -0.048
round(coef(risky.cloglog),3)
##  (Intercept) speed_ground     aircraft      no_pasg 
##      -63.035        0.624       -3.274       -0.057
# ROC curve of the 3 models

#Logit


plot(1-specificity,sensitivity,type="l", col="green")

# Probit

linpred_risky.landing_2<-predict(risky.probit) ### Linear predictor
predprob_risky.landing_2<-predict(risky.probit,type="response") ### predicted probabilities
predout_risky.landing_2<-ifelse(predprob_risky.landing_2<0.5,"0","1") ### Predicted outcomes using 0.5 as the threshold
FAA_final<-data.frame(FAA_final,predprob_risky.landing_2,predout_risky.landing_2)
xtabs(~risky.landing+predout_risky.landing_2,FAA_final)
##              predout_risky.landing_2
## risky.landing   0   1
##             0 766   4
##             1   6  55
thresh<-seq(0.01,0.5,0.01)
sensitivity<-specificity<-rep(NA,length(thresh))
for(j in seq(along=thresh)){
pp<-ifelse(FAA_final$predprob_risky.landing_2<thresh[j],"0","1")
xx<-xtabs(~risky.landing+pp,FAA_final)
specificity[j]<-xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity[j]<-xx[2,2]/(xx[2,1]+xx[2,2])
}

lines(1-specificity,sensitivity,type="l", col="red")


# cloglog

linpred_risky.landing_3<-predict(risky.cloglog) ### Linear predictor
predprob_risky.landing_3<-predict(risky.cloglog,type="response") ### predicted probabilities
predout_risky.landing_3<-ifelse(predprob_risky.landing_3<0.5,"0","1") ### Predicted outcomes using 0.5 as the threshold
FAA_final<-data.frame(FAA_final,predprob_risky.landing_3,predout_risky.landing_3)
xtabs(~risky.landing+predout_risky.landing_3,FAA_final)
##              predout_risky.landing_3
## risky.landing   0   1
##             0 766   4
##             1   7  54
thresh<-seq(0.01,0.5,0.01)
sensitivity<-specificity<-rep(NA,length(thresh))
for(j in seq(along=thresh)){
pp<-ifelse(FAA_final$predprob_risky.landing_3<thresh[j],"0","1")
xx<-xtabs(~risky.landing+pp,FAA_final)
specificity[j]<-xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity[j]<-xx[2,2]/(xx[2,1]+xx[2,2])
}

lines(1-specificity,sensitivity,type="l", col="yellow")

Top Risky Landings

sort(-model.AIC2$fitted.values)[1:5]
## 362 307  64 387 408 
##  -1  -1  -1  -1  -1
# Top 5 risky landing using logit are Flights on row number 362,307,64,387,408

sort(-risky.probit$fitted.values)[1:5]
##  56  64 134 176 179 
##  -1  -1  -1  -1  -1
# Top 5 risky landing using probit are Flights on row number 56,64,134,176,179

sort(-risky.cloglog$fitted.values)[1:5]
## 19 29 30 56 64 
## -1 -1 -1 -1 -1
# Top 5 risky landing using cloglog are Flights on row number 19,29,30,56,64

# Flight on row number 64 is present in top 5 risky landings of all the 3 models

Prediction using probit model

# Risky Landing
### Predict the value for risky landing
predict(risky.probit,newdata=new.ind,type="link") ### Linear predictor
##        1 
## 4.093217
predict(risky.probit,newdata=new.ind,type="response") ### Probability
##         1 
## 0.9999787
### Predict the value with its standard error
predict(risky.probit,newdata=new.ind,type="link",se=T)
## $fit
##        1 
## 4.093217 
## 
## $se.fit
## [1] 1.219844
## 
## $residual.scale
## [1] 1
round(ilogit(c(4.093217-1.96*1.219844,4.093217+1.96*1.219844)),3) ### Confidence interval
## [1] 0.846 0.998
predict(risky.probit,newdata=new.ind,type="response",se=T)
## $fit
##         1 
## 0.9999787 
## 
## $se.fit
##            1 
## 0.0001119533 
## 
## $residual.scale
## [1] 1
round(c(0.9999787-1.96*0.0001119533 ,0.9999787+1.96*0.0001119533),3) ### Confidence interval
## [1] 1 1

Prediction using hazard model

# Risky Landing
### Predict the value for risky landing
predict(risky.cloglog,newdata=new.ind,type="link") ### Linear predictor
##        1 
## 4.144707
predict(risky.cloglog,newdata=new.ind,type="response") ### Probability
## 1 
## 1
### Predict the value with its standard error
predict(risky.cloglog,newdata=new.ind,type="link",se=T)
## $fit
##        1 
## 4.144707 
## 
## $se.fit
## [1] 1.248206
## 
## $residual.scale
## [1] 1
round(ilogit(c(4.144707 -1.96*1.248206,4.144707 +1.96*1.248206)),3) ### Confidence interval
## [1] 0.845 0.999
predict(risky.cloglog,newdata=new.ind,type="response",se=T)
## $fit
## 1 
## 1 
## 
## $se.fit
##            1 
## 2.771575e-16 
## 
## $residual.scale
## [1] 1
round(c(1-1.96*2.771575e-16 ,1+1.96*2.771575e-16),3) ### Confidence interval
## [1] 1 1