#Loading required libraries
library(readxl)
library(tidyverse)
library(plyr)
library(dplyr)
library(MASS)
library(faraway)
Initial exploration of the data
getwd()
## [1] "/Users/puneetbhatia/Desktop/UC spring sem/Stat computing"
# Importing data
FAA1 <- read_excel("FAA1-1.xls")
FAA2 <- read_excel("FAA2-1.xls")
# Structure of datasets
str(FAA1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 800 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
dim(unique(FAA1))
## [1] 800 8
FAA1 has 800 rows and 8 columns
str(FAA2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 150 obs. of 7 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
dim(unique(FAA2))
## [1] 150 7
FAA2 has 150 rows and 7 columns. ‘duration’ column is not present in FAA2, but it is present in FAA1.
# Appending the data
combined <- rbind.fill(FAA1,FAA2)
# Checking duplicates
dim(combined[,-2])
## [1] 950 7
dim(unique(combined[,-2]))
## [1] 850 7
We have 100 duplicate records.
# Removing duplicate records
unique_combined <- unique(combined[,-2]) %>% left_join(FAA1)
## Joining, by = c("aircraft", "no_pasg", "speed_ground", "speed_air", "height",
## "pitch", "distance")
# Structure of the new data set
str(unique_combined)
## 'data.frame': 850 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
Dataset has 850 rows and 8 variables
#unique_combined$aircraft <- as.factor(unique_combined$aircraft)
# Summary of the combined data
summary(unique_combined)
## aircraft no_pasg speed_ground speed_air
## Length:850 Min. :29.0 Min. : 27.74 Min. : 90.00
## Class :character 1st Qu.:55.0 1st Qu.: 65.90 1st Qu.: 96.25
## Mode :character Median :60.0 Median : 79.64 Median :101.15
## Mean :60.1 Mean : 79.45 Mean :103.80
## 3rd Qu.:65.0 3rd Qu.: 92.06 3rd Qu.:109.40
## Max. :87.0 Max. :141.22 Max. :141.72
## NA's :642
## height pitch distance duration
## Min. :-3.546 Min. :2.284 Min. : 34.08 Min. : 14.76
## 1st Qu.:23.314 1st Qu.:3.642 1st Qu.: 883.79 1st Qu.:119.49
## Median :30.093 Median :4.008 Median :1258.09 Median :153.95
## Mean :30.144 Mean :4.009 Mean :1526.02 Mean :154.01
## 3rd Qu.:36.993 3rd Qu.:4.377 3rd Qu.:1936.95 3rd Qu.:188.91
## Max. :59.946 Max. :5.927 Max. :6533.05 Max. :305.62
## NA's :50
boxplot(unique_combined$duration~unique_combined$aircraft)
boxplot(unique_combined$speed_ground~unique_combined$aircraft)
boxplot(unique_combined$speed_air~unique_combined$aircraft)
boxplot(unique_combined$height~unique_combined$aircraft)
boxplot(unique_combined$pitch~unique_combined$aircraft)
boxplot(unique_combined$distance~unique_combined$aircraft)
boxplot(unique_combined$no_pasg~unique_combined$aircraft)
hist(unique_combined$distance)
Summary of findings
Data Cleaning and further exploration
# Checking abnormal values
unique_combined %>% filter(duration<40) #5 observations with duration less than 40
unique_combined %>% filter(!(speed_ground >= 30 & speed_ground <= 140)) # 3 observations with abnormal speed_ground
unique_combined %>% filter(!(speed_air >= 30 & speed_air <= 140| is.na(speed_air))) # 1 observation with abnormal speed_air
unique_combined %>% filter(height<6) #10 observations with abnormal height
unique_combined %>% filter(distance>6000) #2 observations with abnormal distance
#Cleaning data based on requirements
FAA_final <- unique_combined %>% filter(duration>40 | is.na(duration),speed_ground >= 30 & speed_ground <= 140, speed_air >= 30 & speed_air <= 140 | is.na(speed_air), height >= 6, distance < 6000)
dim(FAA_final)
## [1] 831 8
# Encoding aircraft (airbus is coded as 1 and boeing as 0)
for(i in 1:831)
{
if(FAA_final$aircraft[i] == 'airbus')
{
FAA_final$aircraft[i] = 1
}
if(FAA_final$aircraft[i] == 'boeing')
{
FAA_final$aircraft[i] = 0
}
}
FAA_final$aircraft <- as.numeric(FAA_final$aircraft)
The dataset has 831 rows after removing the abnormal values. 19 rows have been removed.
Part 2. Practice of modeling a binary response using logistic regression.
Create binary responses
FAA_final$long.landing <- with(FAA_final, ifelse(distance > 2500,1, 0))
FAA_final$risky.landing <- with(FAA_final, ifelse(distance > 3000,1, 0))
FAA_final <- FAA_final[,-7]
Identifying important factors using the binary data of “long.landing”
hist(FAA_final$long.landing)
l1 <- glm(long.landing~aircraft,family = binomial,FAA_final)
summary(l1)
##
## Call:
## glm(formula = long.landing ~ aircraft, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.6166 -0.6166 -0.4112 -0.4112 2.2416
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.5636 0.1344 -11.638 < 2e-16 ***
## aircraft -0.8641 0.2197 -3.933 8.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.78 on 830 degrees of freedom
## Residual deviance: 606.55 on 829 degrees of freedom
## AIC: 610.55
##
## Number of Fisher Scoring iterations: 5
d_1 <- 'negative'
c_1 <- summary(l1)$coefficients[,1][2]
o_1 <- exp(c_1)
p_1 <- summary(l1)$coefficients[,4][2]
l2 <- glm(long.landing~no_pasg,family = binomial,FAA_final)
summary(l2)
##
## Call:
## glm(formula = long.landing ~ no_pasg, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5710 -0.5213 -0.5125 -0.5005 2.1056
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.520896 0.846824 -1.796 0.0725 .
## no_pasg -0.007256 0.014063 -0.516 0.6059
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.78 on 830 degrees of freedom
## Residual deviance: 622.51 on 829 degrees of freedom
## AIC: 626.51
##
## Number of Fisher Scoring iterations: 4
d_2 <- 'negative'
c_2 <- summary(l2)$coefficients[,1][2]
o_2 <- exp(c_2)
p_2 <- summary(l2)$coefficients[,4][2]
l3 <- glm(long.landing~speed_ground,family = binomial,FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(l3)
##
## Call:
## glm(formula = long.landing ~ speed_ground, family = binomial,
## data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.51572 -0.03478 -0.00180 -0.00004 2.37848
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -47.96055 6.28136 -7.635 2.25e-14 ***
## speed_ground 0.47235 0.06245 7.563 3.94e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.78 on 830 degrees of freedom
## Residual deviance: 115.47 on 829 degrees of freedom
## AIC: 119.47
##
## Number of Fisher Scoring iterations: 10
d_3 <- 'positive'
c_3 <- summary(l3)$coefficients[,1][2]
o_3 <- exp(c_3)
p_3 <- summary(l3)$coefficients[,4][2]
l4 <- glm(long.landing~speed_air,family = binomial,FAA_final)
summary(l4)
##
## Call:
## glm(formula = long.landing ~ speed_air, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.55980 -0.35662 0.00067 0.20985 2.19936
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -51.98365 7.84323 -6.628 3.41e-11 ***
## speed_air 0.51232 0.07772 6.592 4.33e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 281.37 on 202 degrees of freedom
## Residual deviance: 102.33 on 201 degrees of freedom
## (628 observations deleted due to missingness)
## AIC: 106.33
##
## Number of Fisher Scoring iterations: 7
d_4 <- 'positive'
c_4 <- summary(l4)$coefficients[,1][2]
o_4 <- exp(c_4)
p_4 <- summary(l4)$coefficients[,4][2]
l5 <- glm(long.landing~height,family = binomial,FAA_final)
summary(l5)
##
## Call:
## glm(formula = long.landing ~ height, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5782 -0.5244 -0.5097 -0.4916 2.1193
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.220918 0.349916 -6.347 2.2e-10 ***
## height 0.008624 0.010737 0.803 0.422
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.78 on 830 degrees of freedom
## Residual deviance: 622.13 on 829 degrees of freedom
## AIC: 626.13
##
## Number of Fisher Scoring iterations: 4
d_5 <- 'positive'
c_5 <- summary(l5)$coefficients[,1][2]
o_5 <- exp(c_5)
p_5 <- summary(l5)$coefficients[,4][2]
l6 <- glm(long.landing~pitch,family = binomial,FAA_final)
summary(l6)
##
## Call:
## glm(formula = long.landing ~ pitch, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.7248 -0.5381 -0.5006 -0.4580 2.2687
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.5764 0.8297 -4.310 1.63e-05 ***
## pitch 0.4005 0.2013 1.989 0.0466 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.78 on 830 degrees of freedom
## Residual deviance: 618.79 on 829 degrees of freedom
## AIC: 622.79
##
## Number of Fisher Scoring iterations: 4
d_6 <- 'positive'
c_6 <- summary(l6)$coefficients[,1][2]
o_6 <- exp(c_6)
p_6 <- summary(l6)$coefficients[,4][2]
l7 <- glm(long.landing~duration,family = binomial,FAA_final)
summary(l7)
##
## Call:
## glm(formula = long.landing ~ duration, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5535 -0.5312 -0.5210 -0.5097 2.0840
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.753699 0.356787 -4.915 8.87e-07 ***
## duration -0.001070 0.002226 -0.481 0.631
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 597.69 on 780 degrees of freedom
## Residual deviance: 597.46 on 779 degrees of freedom
## (50 observations deleted due to missingness)
## AIC: 601.46
##
## Number of Fisher Scoring iterations: 4
d_7 <- 'negative'
c_7 <- summary(l7)$coefficients[,1][2]
o_7 <- exp(c_7)
p_7 <- summary(l7)$coefficients[,4][2]
names = colnames(FAA_final[c(1:7)])
coff_size = c(c_1,c_2,c_3,c_4,c_5,c_6,c_7)
odds_ratio = c(o_1,o_2,o_3,o_4,o_5,o_6,o_7)
dir=c(d_1,d_2,d_3,d_4,d_5,d_6,d_7)
pval=c(p_1,p_2,p_3,p_4,p_5,p_6,p_7)
Table1 <- data.frame(names,coff_size,odds_ratio,dir,pval)
Important factors - speed_ground, speed_air, aircraft, pitch
# Visualising the association of important factors
plot(jitter(long.landing,0.1)~jitter(speed_ground),FAA_final,xlab="speed_ground",ylab="long.landing",pch=".")
plot(jitter(long.landing,0.1)~jitter(speed_air),FAA_final,xlab="speed_air",ylab="long.landing",pch=".")
plot(jitter(long.landing,0.1)~jitter(aircraft),FAA_final,xlab="aircraft",ylab="long.landing",pch=".")
plot(long.landing~aircraft, FAA_final)
library(ggplot2)
ggplot(FAA_final,aes(x=pitch,fill=long.landing))+geom_histogram(position="dodge",binwidth=1)
plot(jitter(long.landing,0.1)~jitter(pitch),FAA_final,xlab="pitch",ylab="long.landing",pch=".")
plot(long.landing~pitch, FAA_final)
speed_ground and speed_air are positively associated with the response. It is not clear if aircraft and pitch are associated with the response or not
# Model with all the variables except speed_air and duration (speed_air has collinearity with speed_ground, duration has the least p value as seen from Table1. Also both have missing values so removing them from the model)
glm0 <- glm(long.landing~speed_ground + aircraft + pitch + no_pasg + height,family = binomial,FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm0)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + pitch +
## no_pasg + height, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.20264 -0.00054 0.00000 0.00000 2.35660
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.146e+02 2.355e+01 -4.869 1.12e-06 ***
## speed_ground 1.023e+00 2.036e-01 5.023 5.09e-07 ***
## aircraft -5.134e+00 1.181e+00 -4.348 1.38e-05 ***
## pitch 1.538e+00 8.429e-01 1.825 0.068073 .
## no_pasg 3.975e-04 5.400e-02 0.007 0.994127
## height 2.578e-01 7.177e-02 3.592 0.000328 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 53.204 on 825 degrees of freedom
## AIC: 65.204
##
## Number of Fisher Scoring iterations: 12
nothing <- glm(long.landing~1,family = binomial,FAA_final)
# Stepwise forward selection using AIC
model.AIC<-stepAIC(nothing,direction="forward",scope=list(upper=glm0,lower=nothing))
## Start: AIC=624.78
## long.landing ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + speed_ground 1 115.47 119.47
## + aircraft 1 606.55 610.55
## + pitch 1 618.79 622.79
## <none> 622.78 624.78
## + height 1 622.13 626.13
## + no_pasg 1 622.51 626.51
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=119.47
## long.landing ~ speed_ground
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + aircraft 1 84.665 90.665
## + height 1 100.459 106.459
## + pitch 1 105.527 111.527
## <none> 115.470 119.470
## + no_pasg 1 115.468 121.468
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=90.66
## long.landing ~ speed_ground + aircraft
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + height 1 57.047 65.047
## + pitch 1 81.309 89.309
## <none> 84.665 90.665
## + no_pasg 1 84.219 92.219
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=65.05
## long.landing ~ speed_ground + aircraft + height
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + pitch 1 53.204 63.204
## <none> 57.047 65.047
## + no_pasg 1 57.031 67.031
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=63.2
## long.landing ~ speed_ground + aircraft + height + pitch
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 53.204 63.204
## + no_pasg 1 53.204 65.204
# Model selected - long.landing ~ speed_ground + aircraft + height + pitch
summary(model.AIC)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height +
## pitch, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.20284 -0.00054 0.00000 0.00000 2.35719
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -114.64155 23.52202 -4.874 1.09e-06 ***
## speed_ground 1.02266 0.20290 5.040 4.65e-07 ***
## aircraft -5.13443 1.18091 -4.348 1.37e-05 ***
## height 0.25795 0.06861 3.760 0.00017 ***
## pitch 1.53751 0.84109 1.828 0.06755 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 53.204 on 826 degrees of freedom
## AIC: 63.204
##
## Number of Fisher Scoring iterations: 12
# Stepwise forward selection using BIC
model.BIC<-step(nothing,direction="forward",scope=list(upper=glm0,lower=nothing), criterion = "BIC", k=log(nrow(FAA_final)))
## Start: AIC=629.5
## long.landing ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + speed_ground 1 115.47 128.92
## + aircraft 1 606.55 620.00
## <none> 622.78 629.50
## + pitch 1 618.79 632.24
## + height 1 622.13 635.58
## + no_pasg 1 622.51 635.96
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=128.92
## long.landing ~ speed_ground
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + aircraft 1 84.665 104.83
## + height 1 100.459 120.63
## + pitch 1 105.527 125.69
## <none> 115.470 128.92
## + no_pasg 1 115.468 135.64
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=104.83
## long.landing ~ speed_ground + aircraft
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + height 1 57.047 83.937
## <none> 84.665 104.832
## + pitch 1 81.309 108.200
## + no_pasg 1 84.219 111.110
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=83.94
## long.landing ~ speed_ground + aircraft + height
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 57.047 83.937
## + pitch 1 53.204 86.817
## + no_pasg 1 57.031 90.644
# Model selected - long.landing ~ speed_ground + aircraft + height
summary(model.BIC)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height,
## family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.43442 -0.00117 0.00000 0.00000 2.57435
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -97.90623 18.34950 -5.336 9.52e-08 ***
## speed_ground 0.92657 0.17242 5.374 7.70e-08 ***
## aircraft -5.04813 1.11520 -4.527 5.99e-06 ***
## height 0.23106 0.05959 3.877 0.000106 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 57.047 on 827 degrees of freedom
## AIC: 65.047
##
## Number of Fisher Scoring iterations: 11
EXECUTIVE SUMMARY
Final Model for Long Landing
# Summary of the final model
summary(model.AIC)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height +
## pitch, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.20284 -0.00054 0.00000 0.00000 2.35719
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -114.64155 23.52202 -4.874 1.09e-06 ***
## speed_ground 1.02266 0.20290 5.040 4.65e-07 ***
## aircraft -5.13443 1.18091 -4.348 1.37e-05 ***
## height 0.25795 0.06861 3.760 0.00017 ***
## pitch 1.53751 0.84109 1.828 0.06755 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 53.204 on 826 degrees of freedom
## AIC: 63.204
##
## Number of Fisher Scoring iterations: 12
c_1 <- summary(model.AIC)$coefficients[,1][2]
o_1 <- exp(c_1)
p_1 <- summary(model.AIC)$coefficients[,4][2]
c_2 <- summary(model.AIC)$coefficients[,1][3]
o_2 <- exp(c_2)
p_2 <- summary(model.AIC)$coefficients[,4][3]
c_3 <- summary(model.AIC)$coefficients[,1][4]
o_3 <- exp(c_3)
p_3 <- summary(model.AIC)$coefficients[,4][4]
c_4 <- summary(model.AIC)$coefficients[,1][5]
o_4 <- exp(c_4)
p_4 <- summary(model.AIC)$coefficients[,4][5]
names = c('speed_ground','aircraft','height','pitch')
coff_size = c(c_1,c_2,c_3,c_4)
odds_ratio = c(o_1,o_2,o_3,o_4)
pval=c(p_1,p_2,p_3,p_4)
Long_Landing_Table <- data.frame(names,coff_size,odds_ratio,pval)
Long_Landing_Table
#Plots
plot(jitter(long.landing,0.1)~jitter(speed_ground),FAA_final,xlab="speed_ground",ylab="long.landing",pch=".")
plot(jitter(long.landing,0.1)~jitter(aircraft),FAA_final,xlab="aircraft",ylab="long.landing",pch=".")
plot(jitter(long.landing,0.1)~jitter(pitch),FAA_final,xlab="pitch",ylab="long.landing",pch=".")
Identifying important factors using the binary data of “risky.landing”.
hist(FAA_final$risky.landing)
l1 <- glm(risky.landing~aircraft,family = binomial,FAA_final)
summary(l1)
##
## Call:
## glm(formula = risky.landing ~ aircraft, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4793 -0.4793 -0.2958 -0.2958 2.5105
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.1059 0.1634 -12.886 < 2e-16 ***
## aircraft -1.0018 0.2858 -3.505 0.000456 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.04 on 830 degrees of freedom
## Residual deviance: 422.74 on 829 degrees of freedom
## AIC: 426.74
##
## Number of Fisher Scoring iterations: 5
d_1 <- 'negative'
c_1 <- summary(l1)$coefficients[,1][2]
o_1 <- exp(c_1)
p_1 <- summary(l1)$coefficients[,4][2]
l2 <- glm(risky.landing~no_pasg,family = binomial,FAA_final)
summary(l2)
##
## Call:
## glm(formula = risky.landing ~ no_pasg, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5628 -0.4073 -0.3831 -0.3582 2.4815
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.02673 1.05514 -0.973 0.331
## no_pasg -0.02538 0.01779 -1.427 0.154
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.04 on 830 degrees of freedom
## Residual deviance: 434.00 on 829 degrees of freedom
## AIC: 438
##
## Number of Fisher Scoring iterations: 5
d_2 <- 'negative'
c_2 <- summary(l2)$coefficients[,1][2]
o_2 <- exp(c_2)
p_2 <- summary(l2)$coefficients[,4][2]
l3 <- glm(risky.landing~speed_ground,family = binomial,FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(l3)
##
## Call:
## glm(formula = risky.landing ~ speed_ground, family = binomial,
## data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.53709 -0.00383 -0.00009 0.00000 1.95417
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -66.1243 12.2025 -5.419 6.0e-08 ***
## speed_ground 0.6142 0.1139 5.394 6.9e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 58.931 on 829 degrees of freedom
## AIC: 62.931
##
## Number of Fisher Scoring iterations: 11
d_3 <- 'positive'
c_3 <- summary(l3)$coefficients[,1][2]
o_3 <- exp(c_3)
p_3 <- summary(l3)$coefficients[,4][2]
l4 <- glm(risky.landing~speed_air,family = binomial,FAA_final)
summary(l4)
##
## Call:
## glm(formula = risky.landing ~ speed_air, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.15041 -0.05679 -0.00616 0.00102 2.69736
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -93.5700 20.2180 -4.628 3.69e-06 ***
## speed_air 0.8704 0.1882 4.626 3.73e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 248.18 on 202 degrees of freedom
## Residual deviance: 44.58 on 201 degrees of freedom
## (628 observations deleted due to missingness)
## AIC: 48.58
##
## Number of Fisher Scoring iterations: 9
d_4 <- 'positive'
c_4 <- summary(l4)$coefficients[,1][2]
o_4 <- exp(c_4)
p_4 <- summary(l4)$coefficients[,4][2]
l5 <- glm(risky.landing~height,family = binomial,FAA_final)
summary(l5)
##
## Call:
## glm(formula = risky.landing ~ height, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4007 -0.3931 -0.3901 -0.3868 2.3105
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.468143 0.433262 -5.697 1.22e-08 ***
## height -0.002219 0.013619 -0.163 0.871
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.04 on 830 degrees of freedom
## Residual deviance: 436.02 on 829 degrees of freedom
## AIC: 440.02
##
## Number of Fisher Scoring iterations: 5
d_5 <- 'negative'
c_5 <- summary(l5)$coefficients[,1][2]
o_5 <- exp(c_5)
p_5 <- summary(l5)$coefficients[,4][2]
l6 <- glm(risky.landing~pitch,family = binomial,FAA_final)
summary(l6)
##
## Call:
## glm(formula = risky.landing ~ pitch, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5433 -0.4094 -0.3828 -0.3559 2.4829
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.0380 1.0461 -3.860 0.000113 ***
## pitch 0.3711 0.2535 1.464 0.143296
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.04 on 830 degrees of freedom
## Residual deviance: 433.89 on 829 degrees of freedom
## AIC: 437.89
##
## Number of Fisher Scoring iterations: 5
d_6 <- 'positive'
c_6 <- summary(l6)$coefficients[,1][2]
o_6 <- exp(c_6)
p_6 <- summary(l6)$coefficients[,4][2]
l7 <- glm(risky.landing~duration,family = binomial,FAA_final)
summary(l7)
##
## Call:
## glm(formula = risky.landing ~ duration, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.4253 -0.4067 -0.3985 -0.3898 2.3230
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.309326 0.446857 -5.168 2.37e-07 ***
## duration -0.001152 0.002794 -0.412 0.68
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 423.22 on 780 degrees of freedom
## Residual deviance: 423.04 on 779 degrees of freedom
## (50 observations deleted due to missingness)
## AIC: 427.04
##
## Number of Fisher Scoring iterations: 5
d_7 <- 'negative'
c_7 <- summary(l7)$coefficients[,1][2]
o_7 <- exp(c_7)
p_7 <- summary(l7)$coefficients[,4][2]
names = colnames(FAA_final[c(1:7)])
coff_size = c(c_1,c_2,c_3,c_4,c_5,c_6,c_7)
odds_ratio = c(o_1,o_2,o_3,o_4,o_5,o_6,o_7)
dir=c(d_1,d_2,d_3,d_4,d_5,d_6,d_7)
pval=c(p_1,p_2,p_3,p_4,p_5,p_6,p_7)
Table2 <- data.frame(names,coff_size,odds_ratio,dir,pval)
Important factors - speed_ground, speed_air, aircraft, pitch
# Visualising the association of important factors
plot(jitter(risky.landing,0.1)~jitter(speed_ground),FAA_final,xlab="speed_ground",ylab="long.landing",pch=".")
plot(jitter(risky.landing,0.1)~jitter(speed_air),FAA_final,xlab="speed_air",ylab="long.landing",pch=".")
plot(jitter(risky.landing,0.1)~jitter(aircraft),FAA_final,xlab="aircraft",ylab="long.landing",pch=".")
plot(risky.landing~aircraft, FAA_final)
ggplot(FAA_final,aes(x=pitch,fill=risky.landing))+geom_histogram(position="dodge",binwidth=1)
plot(jitter(risky.landing,0.1)~jitter(pitch),FAA_final,xlab="pitch",ylab="long.landing",pch=".")
plot(risky.landing~pitch, FAA_final)
speed_ground and speed_air are positively associated with the response. It is not clear if aircraft and pitch are associated with the response or not
# Model with all the variables except speed_air and duration (speed_air has collinearity with speed_ground, duration has insignificant p value as seen from Table1. Also both have missing values so removing them from the model)
glm0 <- glm(risky.landing~speed_ground + aircraft + pitch + no_pasg + height,family = binomial,FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(glm0)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + pitch +
## no_pasg + height, family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.45728 -0.00009 0.00000 0.00000 1.87948
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -98.59616 25.69380 -3.837 0.000124 ***
## speed_ground 0.94620 0.24175 3.914 9.08e-05 ***
## aircraft -4.45430 1.54406 -2.885 0.003917 **
## pitch 0.60040 0.77084 0.779 0.436041
## no_pasg -0.08726 0.05824 -1.498 0.134025
## height 0.04376 0.04509 0.970 0.331814
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 36.478 on 825 degrees of freedom
## AIC: 48.478
##
## Number of Fisher Scoring iterations: 12
nothing <- glm(risky.landing~1,family = binomial,FAA_final)
# Stepwise forward selection using AIC
model.AIC2<-stepAIC(nothing,direction="forward",scope=list(upper=glm0,lower=nothing))
## Start: AIC=438.04
## risky.landing ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + speed_ground 1 58.93 62.93
## + aircraft 1 422.74 426.74
## + pitch 1 433.89 437.89
## + no_pasg 1 434.00 438.00
## <none> 436.04 438.04
## + height 1 436.02 440.02
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=62.93
## risky.landing ~ speed_ground
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + aircraft 1 40.097 46.097
## + pitch 1 53.079 59.079
## <none> 58.931 62.931
## + no_pasg 1 58.318 64.318
## + height 1 58.667 64.667
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=46.1
## risky.landing ~ speed_ground + aircraft
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + no_pasg 1 37.707 45.707
## <none> 40.097 46.097
## + height 1 39.402 47.402
## + pitch 1 39.928 47.928
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=45.71
## risky.landing ~ speed_ground + aircraft + no_pasg
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 37.707 45.707
## + height 1 37.099 47.099
## + pitch 1 37.449 47.449
# Model selected - risky.landing ~ speed_ground + aircraft + no_pasg
summary(model.AIC2)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg,
## family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.33913 -0.00009 0.00000 0.00000 1.87810
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -95.26592 24.49990 -3.888 0.000101 ***
## speed_ground 0.94963 0.23559 4.031 5.56e-05 ***
## aircraft -4.64188 1.47520 -3.147 0.001652 **
## no_pasg -0.08462 0.05732 -1.476 0.139866
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 37.707 on 827 degrees of freedom
## AIC: 45.707
##
## Number of Fisher Scoring iterations: 12
# Stepwise forward selection using BIC
model.BIC2<-step(nothing,direction="forward",scope=list(upper=glm0,lower=nothing), criterion = "BIC", k=log(nrow(FAA_final)))
## Start: AIC=442.77
## risky.landing ~ 1
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + speed_ground 1 58.93 72.38
## + aircraft 1 422.74 436.18
## <none> 436.04 442.77
## + pitch 1 433.89 447.34
## + no_pasg 1 434.00 447.45
## + height 1 436.02 449.46
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=72.38
## risky.landing ~ speed_ground
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## + aircraft 1 40.097 60.264
## <none> 58.931 72.376
## + pitch 1 53.079 73.247
## + no_pasg 1 58.318 78.486
## + height 1 58.667 78.835
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##
## Step: AIC=60.26
## risky.landing ~ speed_ground + aircraft
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Df Deviance AIC
## <none> 40.097 60.264
## + no_pasg 1 37.707 64.597
## + height 1 39.402 66.292
## + pitch 1 39.928 66.819
# Model selected - risky.landing ~ speed_ground + aircraft
summary(model.BIC2)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial,
## data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.24398 -0.00011 0.00000 0.00000 1.61021
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -98.0582 23.8303 -4.115 3.87e-05 ***
## speed_ground 0.9263 0.2248 4.121 3.78e-05 ***
## aircraft -4.0190 1.2494 -3.217 0.0013 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 40.097 on 828 degrees of freedom
## AIC: 46.097
##
## Number of Fisher Scoring iterations: 12
EXECUTIVE SUMMARY
Final Model for Risky Landing
# Summary of the final model
summary(model.AIC2)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg,
## family = binomial, data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.33913 -0.00009 0.00000 0.00000 1.87810
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -95.26592 24.49990 -3.888 0.000101 ***
## speed_ground 0.94963 0.23559 4.031 5.56e-05 ***
## aircraft -4.64188 1.47520 -3.147 0.001652 **
## no_pasg -0.08462 0.05732 -1.476 0.139866
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 37.707 on 827 degrees of freedom
## AIC: 45.707
##
## Number of Fisher Scoring iterations: 12
c_1 <- summary(model.AIC2)$coefficients[,1][2]
o_1 <- exp(c_1)
p_1 <- summary(model.AIC2)$coefficients[,4][2]
c_2 <- summary(model.AIC2)$coefficients[,1][3]
o_2 <- exp(c_2)
p_2 <- summary(model.AIC2)$coefficients[,4][3]
c_3 <- summary(model.AIC2)$coefficients[,1][4]
o_3 <- exp(c_3)
p_3 <- summary(model.AIC2)$coefficients[,4][4]
names = c('speed_ground','aircraft','no_pasg')
coff_size = c(c_1,c_2,c_3)
odds_ratio = c(o_1,o_2,o_3)
pval=c(p_1,p_2,p_3)
Risky_Landing_Table <- data.frame(names,coff_size,odds_ratio,pval)
Risky_Landing_Table
#Plots
plot(jitter(risky.landing,0.1)~jitter(speed_ground),FAA_final,xlab="speed_ground",ylab="risky.landing",pch=".")
plot(jitter(risky.landing,0.1)~jitter(aircraft),FAA_final,xlab="aircraft",ylab="risky.landing",pch=".")
plot(jitter(risky.landing,0.1)~jitter(no_pasg),FAA_final,xlab="no_pasg",ylab="risky.landing",pch=".")
Comparing long.landing and risky.landing models
# ROC curve for long.landing
linpred_long.landing<-predict(model.AIC) ### Linear predictor
predprob_long.landing<-predict(model.AIC,type="response") ### predicted probabilities
predout_long.landing<-ifelse(predprob_long.landing<0.5,"0","1") ### Predicted outcomes using 0.5 as the threshold
FAA_final<-data.frame(FAA_final,predprob_long.landing,predout_long.landing)
xtabs(~long.landing+predout_long.landing,FAA_final)
## predout_long.landing
## long.landing 0 1
## 0 723 5
## 1 6 97
thresh<-seq(0.01,0.5,0.01)
sensitivity<-specificity<-rep(NA,length(thresh))
for(j in seq(along=thresh)){
pp<-ifelse(FAA_final$predprob_long.landing<thresh[j],"0","1")
xx<-xtabs(~long.landing+pp,FAA_final)
specificity[j]<-xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity[j]<-xx[2,2]/(xx[2,1]+xx[2,2])
}
par(mfrow=c(1,2))
plot(1-specificity,sensitivity,type="l", col="red");abline(0,1,lty=2)
# ROC curve for risky.landing
linpred_risky.landing<-predict(model.AIC2) ### Linear predictor
predprob_risky.landing<-predict(model.AIC2,type="response") ### predicted probabilities
predout_risky.landing<-ifelse(predprob_risky.landing<0.5,"0","1") ### Predicted outcomes using 0.5 as the threshold
FAA_final<-data.frame(FAA_final,predprob_risky.landing,predout_risky.landing)
xtabs(~risky.landing+predout_risky.landing,FAA_final)
## predout_risky.landing
## risky.landing 0 1
## 0 766 4
## 1 5 56
thresh<-seq(0.01,0.5,0.01)
sensitivity<-specificity<-rep(NA,length(thresh))
for(j in seq(along=thresh)){
pp<-ifelse(FAA_final$predprob_risky.landing<thresh[j],"0","1")
xx<-xtabs(~risky.landing+pp,FAA_final)
specificity[j]<-xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity[j]<-xx[2,2]/(xx[2,1]+xx[2,2])
}
lines(1-specificity,sensitivity,type="l", col="green")
We observe area under the curve (AUC) is more for risky landing model than long landing model.
Predicting Probabilities
new.ind<-
data.frame(aircraft=0,duration=200,no_pasg=80, speed_ground=115, speed_air=120,
height=40, pitch=4)
# Long Landing
### Predict the value for long landing
predict(model.AIC,newdata=new.ind,type="link") ### Linear predictor
## 1
## 19.4327
predict(model.AIC,newdata=new.ind,type="response") ### Probability
## 1
## 1
### Predict the value with its standard error
predict(model.AIC,newdata=new.ind,type="link",se=T)
## $fit
## 1
## 19.4327
##
## $se.fit
## [1] 3.954009
##
## $residual.scale
## [1] 1
round(ilogit(c(19.4327-1.96*3.954,19.4327+1.96*3.954)),3) ### Confidence interval
## [1] 1 1
predict(model.AIC,newdata=new.ind,type="response",se=T)
## $fit
## 1
## 1
##
## $se.fit
## 1
## 1.437223e-08
##
## $residual.scale
## [1] 1
round(c(1-1.96*1.437223e-08,1+1.96*1.437223e-08),3) ### Confidence interval
## [1] 1 1
# Risky Landing
### Predict the value for risky landing
predict(model.AIC2,newdata=new.ind,type="link") ### Linear predictor
## 1
## 7.171796
predict(model.AIC2,newdata=new.ind,type="response") ### Probability
## 1
## 0.9992326
### Predict the value with its standard error
predict(model.AIC2,newdata=new.ind,type="link",se=T)
## $fit
## 1
## 7.171796
##
## $se.fit
## [1] 2.255343
##
## $residual.scale
## [1] 1
round(ilogit(c(7.1717-1.96*2.255,7.1717+1.96*2.255)),3) ### Confidence interval
## [1] 0.94 1.00
predict(model.AIC2,newdata=new.ind,type="response",se=T)
## $fit
## 1
## 0.9992326
##
## $se.fit
## 1
## 0.001729316
##
## $residual.scale
## [1] 1
round(c(0.9992326-1.96*0.001729316 ,0.9992326+1.96*0.001729316),3) ### Confidence interval
## [1] 0.996 1.003
Compare models with different link functions
# Risky landing Probit Model
risky.probit<-
glm(risky.landing~speed_ground+aircraft+no_pasg,family=binomial(link=probit),FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(risky.probit)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg,
## family = binomial(link = probit), data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.302 0.000 0.000 0.000 1.834
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -54.30276 12.86637 -4.221 2.44e-05 ***
## speed_ground 0.54152 0.12373 4.377 1.21e-05 ***
## aircraft -2.70787 0.82130 -3.297 0.000977 ***
## no_pasg -0.04849 0.03267 -1.484 0.137803
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 37.223 on 827 degrees of freedom
## AIC: 45.223
##
## Number of Fisher Scoring iterations: 14
# Risky landing Hazard model with complementary log-log link
risky.cloglog<-
glm(risky.landing~speed_ground+aircraft+no_pasg,family=binomial(link=cloglog),FAA_final)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(risky.cloglog)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg,
## family = binomial(link = cloglog), data = FAA_final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.34112 -0.00177 -0.00004 0.00000 1.83591
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -63.03523 13.53092 -4.659 3.18e-06 ***
## speed_ground 0.62410 0.12931 4.826 1.39e-06 ***
## aircraft -3.27406 0.89730 -3.649 0.000263 ***
## no_pasg -0.05739 0.03608 -1.591 0.111648
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 39.313 on 827 degrees of freedom
## AIC: 47.313
##
## Number of Fisher Scoring iterations: 13
# Comparing the coefficients of the 3 models
round(coef(model.AIC2),3)
## (Intercept) speed_ground aircraft no_pasg
## -95.266 0.950 -4.642 -0.085
round(coef(risky.probit),3)
## (Intercept) speed_ground aircraft no_pasg
## -54.303 0.542 -2.708 -0.048
round(coef(risky.cloglog),3)
## (Intercept) speed_ground aircraft no_pasg
## -63.035 0.624 -3.274 -0.057
# ROC curve of the 3 models
#Logit
plot(1-specificity,sensitivity,type="l", col="green")
# Probit
linpred_risky.landing_2<-predict(risky.probit) ### Linear predictor
predprob_risky.landing_2<-predict(risky.probit,type="response") ### predicted probabilities
predout_risky.landing_2<-ifelse(predprob_risky.landing_2<0.5,"0","1") ### Predicted outcomes using 0.5 as the threshold
FAA_final<-data.frame(FAA_final,predprob_risky.landing_2,predout_risky.landing_2)
xtabs(~risky.landing+predout_risky.landing_2,FAA_final)
## predout_risky.landing_2
## risky.landing 0 1
## 0 766 4
## 1 6 55
thresh<-seq(0.01,0.5,0.01)
sensitivity<-specificity<-rep(NA,length(thresh))
for(j in seq(along=thresh)){
pp<-ifelse(FAA_final$predprob_risky.landing_2<thresh[j],"0","1")
xx<-xtabs(~risky.landing+pp,FAA_final)
specificity[j]<-xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity[j]<-xx[2,2]/(xx[2,1]+xx[2,2])
}
lines(1-specificity,sensitivity,type="l", col="red")
# cloglog
linpred_risky.landing_3<-predict(risky.cloglog) ### Linear predictor
predprob_risky.landing_3<-predict(risky.cloglog,type="response") ### predicted probabilities
predout_risky.landing_3<-ifelse(predprob_risky.landing_3<0.5,"0","1") ### Predicted outcomes using 0.5 as the threshold
FAA_final<-data.frame(FAA_final,predprob_risky.landing_3,predout_risky.landing_3)
xtabs(~risky.landing+predout_risky.landing_3,FAA_final)
## predout_risky.landing_3
## risky.landing 0 1
## 0 766 4
## 1 7 54
thresh<-seq(0.01,0.5,0.01)
sensitivity<-specificity<-rep(NA,length(thresh))
for(j in seq(along=thresh)){
pp<-ifelse(FAA_final$predprob_risky.landing_3<thresh[j],"0","1")
xx<-xtabs(~risky.landing+pp,FAA_final)
specificity[j]<-xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity[j]<-xx[2,2]/(xx[2,1]+xx[2,2])
}
lines(1-specificity,sensitivity,type="l", col="yellow")
Top Risky Landings
sort(-model.AIC2$fitted.values)[1:5]
## 362 307 64 387 408
## -1 -1 -1 -1 -1
# Top 5 risky landing using logit are Flights on row number 362,307,64,387,408
sort(-risky.probit$fitted.values)[1:5]
## 56 64 134 176 179
## -1 -1 -1 -1 -1
# Top 5 risky landing using probit are Flights on row number 56,64,134,176,179
sort(-risky.cloglog$fitted.values)[1:5]
## 19 29 30 56 64
## -1 -1 -1 -1 -1
# Top 5 risky landing using cloglog are Flights on row number 19,29,30,56,64
# Flight on row number 64 is present in top 5 risky landings of all the 3 models
Prediction using probit model
# Risky Landing
### Predict the value for risky landing
predict(risky.probit,newdata=new.ind,type="link") ### Linear predictor
## 1
## 4.093217
predict(risky.probit,newdata=new.ind,type="response") ### Probability
## 1
## 0.9999787
### Predict the value with its standard error
predict(risky.probit,newdata=new.ind,type="link",se=T)
## $fit
## 1
## 4.093217
##
## $se.fit
## [1] 1.219844
##
## $residual.scale
## [1] 1
round(ilogit(c(4.093217-1.96*1.219844,4.093217+1.96*1.219844)),3) ### Confidence interval
## [1] 0.846 0.998
predict(risky.probit,newdata=new.ind,type="response",se=T)
## $fit
## 1
## 0.9999787
##
## $se.fit
## 1
## 0.0001119533
##
## $residual.scale
## [1] 1
round(c(0.9999787-1.96*0.0001119533 ,0.9999787+1.96*0.0001119533),3) ### Confidence interval
## [1] 1 1
Prediction using hazard model
# Risky Landing
### Predict the value for risky landing
predict(risky.cloglog,newdata=new.ind,type="link") ### Linear predictor
## 1
## 4.144707
predict(risky.cloglog,newdata=new.ind,type="response") ### Probability
## 1
## 1
### Predict the value with its standard error
predict(risky.cloglog,newdata=new.ind,type="link",se=T)
## $fit
## 1
## 4.144707
##
## $se.fit
## [1] 1.248206
##
## $residual.scale
## [1] 1
round(ilogit(c(4.144707 -1.96*1.248206,4.144707 +1.96*1.248206)),3) ### Confidence interval
## [1] 0.845 0.999
predict(risky.cloglog,newdata=new.ind,type="response",se=T)
## $fit
## 1
## 1
##
## $se.fit
## 1
## 2.771575e-16
##
## $residual.scale
## [1] 1
round(c(1-1.96*2.771575e-16 ,1+1.96*2.771575e-16),3) ### Confidence interval
## [1] 1 1