#read excel files
library("readxl")
faa1 <- read_excel('faa1.xls')
faa2 <- read_excel('faa2.xls')
library('plyr')
faa <- rbind.fill(faa1,faa2)
library('tidyverse')
faa.no.dup <- distinct(faa,aircraft,no_pasg,speed_ground,speed_air,height,pitch,distance,.keep_all = T)
height_abnormal <- sum(faa.no.dup$height<6)
duration_abnormal <- sum(faa.no.dup$duration<40)
speed_ground_abnormal <- sum(faa.no.dup$speed_ground<30 | faa.no.dup$speed_ground > 140)
speed_air_abnormal <- sum(faa.no.dup$speed_air<30 | faa.no.dup$speed_air > 140)
faa.no.dup <- faa.no.dup[faa.no.dup$height > 6,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground > 30,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground < 140,]
faa.clean <- faa.no.dup
str(faa.clean)
## 'data.frame': 837 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
faa.clean$long.landing <- ifelse(faa.clean$distance > 2500,1,0)
faa.clean$risky.landing <- ifelse(faa.clean$distance > 3000,1,0)
faa.clean$distance <- NULL
# pie chart for distribution of long.landing
pct <- round(table(faa.clean$long.landing)/length(faa.clean$long.landing)*100,1)
labs <- c("No","Yes")
labs <- paste(labs,pct)
labs <- paste(labs,"%",sep="")
pie(table(faa.clean$long.landing),labels= labs,col = rainbow(length(labs)),main = "Pie chart of Long Landing")
faa.clean$aircraft <- ifelse(faa.clean$aircraft == 'boeing', 1, 0)
vars <- c('aircraft','duration','no_pasg','speed_ground','speed_air','height','pitch')
fittedpvalues <- vector()
fittedcoeffs <- vector()
fittedoddratios <- vector()
lmod1 <- glm(long.landing~aircraft,data = faa.clean)
fittedpvalues[1] <- summary(lmod1)$coefficients[2,4]
fittedcoeffs[1] <- summary(lmod1)$coefficients[2,1]
fittedoddratios[1] <- exp(coef(lmod1))[2]
lmod2 <- glm(long.landing~duration,data = faa.clean)
fittedpvalues[2] <- summary(lmod2)$coefficients[2,4]
fittedcoeffs[2] <- summary(lmod2)$coefficients[2,1]
fittedoddratios[2] <- exp(coef(lmod2))[2]
lmod3 <- glm(long.landing~no_pasg,data = faa.clean)
fittedpvalues[3] <- summary(lmod3)$coefficients[2,4]
fittedcoeffs[3] <- summary(lmod3)$coefficients[2,1]
fittedoddratios[3] <- exp(coef(lmod3))[2]
lmod4 <- glm(long.landing~speed_ground,data = faa.clean)
fittedpvalues[4] <- summary(lmod4)$coefficients[2,4]
fittedcoeffs[4] <- summary(lmod4)$coefficients[2,1]
fittedoddratios[4] <- exp(coef(lmod4))[2]
lmod5 <- glm(long.landing~speed_air,data = faa.clean)
fittedpvalues[5] <- summary(lmod5)$coefficients[2,4]
fittedcoeffs[5] <- summary(lmod5)$coefficients[2,1]
fittedoddratios[5] <- exp(coef(lmod5))[2]
lmod6 <- glm(long.landing~height,data = faa.clean)
fittedpvalues[6] <- summary(lmod6)$coefficients[2,4]
fittedcoeffs[6] <- summary(lmod6)$coefficients[2,1]
fittedoddratios[6] <- exp(coef(lmod6))[2]
lmod7 <- glm(long.landing~pitch,data = faa.clean)
fittedpvalues[7] <- summary(lmod7)$coefficients[2,4]
fittedcoeffs[7] <- summary(lmod7)$coefficients[2,1]
fittedoddratios[7] <- exp(coef(lmod7))[2]
table <- tibble(
variable = vars,
coefficient = fittedcoeffs,
odds_ratio = fittedoddratios,
pvalues = fittedpvalues[1:7]
)
table$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table %>% arrange(abs(pvalues))
## # A tibble: 7 x 5
## variable coefficient odds_ratio pvalues dir
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 speed_ground 0.0111 1.01 3.00e-92 positive
## 2 speed_air 0.0365 1.04 5.73e-35 positive
## 3 aircraft 0.0983 1.10 1.84e- 5 positive
## 4 pitch 0.0478 1.05 2.87e- 2 positive
## 5 height 0.00149 1.00 2.05e- 1 positive
## 6 duration -0.000222 1.000 3.64e- 1 negative
## 7 no_pasg -0.000895 0.999 5.61e- 1 negative
par(mfrow = c(2,2))
plot(faa.clean$long.landing ~ faa.clean$speed_ground,xlab = 'speed_ground',ylab = 'long.landing')
plot(faa.clean$long.landing ~ faa.clean$speed_air,xlab = 'speed_air',ylab = 'long.landing')
plot(faa.clean$long.landing ~ faa.clean$aircraft,xlab = 'aircraft',ylab = 'long.landing')
plot(faa.clean$long.landing ~ faa.clean$pitch,xlab = 'pitch',ylab = 'long.landing')
cat('Regression Coefficient with speed_ground as predictor is',summary(lm(long.landing~speed_ground,data = faa.clean))$coefficients[2,1],'\n')
## Regression Coefficient with speed_ground as predictor is 0.01106496
cat('Regression Coefficient with speed_air as predictor is',summary(lm(long.landing~speed_air,data = faa.clean))$coefficients[2,1],'\n')
## Regression Coefficient with speed_air as predictor is 0.03653113
cat('Regression Coefficient with both speed_ground and speed_air as predictors are',summary(lm(long.landing~speed_ground+speed_air,data = faa.clean))$coefficients[2,1],'and',summary(lm(long.landing~speed_ground+speed_air,data = faa.clean))$coefficients[3,1],'respectively')
## Regression Coefficient with both speed_ground and speed_air as predictors are 0.004556364 and 0.03195473 respectively
# check correlation between speed_ground and speed_air variables
cor(faa.clean$speed_ground,faa.clean$speed_air,use = "pairwise.complete.obs")
## [1] 0.9885722
This is a very high correlation and will lead to multicollinearity issue in the model if both variables are used. In our case, we will use the speed_ground variable because it has no missing values while the speed_air has almost 80% missing values. Also in the model where both variables are used, p-value for speed_ground is 0.004556364 making it insignificant which is not correct.
lmodnull <- glm(long.landing ~ 1, family = binomial, data = faa.clean)
lmodfull <- glm(long.landing ~.-(speed_air + risky.landing), family = binomial, data = faa.clean)
model1.AIC <- step(lmodnull, scope=list(lower=lmodnull, upper=lmodfull), direction='forward')
## Start: AIC=638.04
## long.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 110.89 140.14
## + aircraft 1 594.65 623.90
## + pitch 1 607.48 636.74
## <none> 610.79 638.04
## + height 1 609.56 638.81
## + duration 1 609.96 639.21
## + no_pasg 1 610.48 639.74
##
## Step: AIC=123.09
## long.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 79.563 93.764
## + height 1 95.938 110.139
## + pitch 1 100.202 114.403
## <none> 110.888 123.089
## + no_pasg 1 110.718 124.919
## + duration 1 110.860 125.061
##
## Step: AIC=92.07
## long.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## + height 1 54.418 68.921
## + pitch 1 76.598 91.101
## <none> 79.563 92.066
## + duration 1 78.833 93.336
## + no_pasg 1 79.414 93.917
##
## Step: AIC=65.06
## long.landing ~ speed_ground + aircraft + height
##
## Df Deviance AIC
## + pitch 1 51.591 64.233
## <none> 54.418 65.060
## + duration 1 53.711 66.353
## + no_pasg 1 54.418 67.060
##
## Step: AIC=63.21
## long.landing ~ speed_ground + aircraft + height + pitch
##
## Df Deviance AIC
## <none> 51.591 63.212
## + duration 1 51.122 64.743
## + no_pasg 1 51.587 65.208
summary(model1.AIC)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height +
## pitch, family = binomial, data = faa.clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.20389 -0.00053 0.00000 0.00000 2.35854
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -119.90246 24.37354 -4.919 8.68e-07 ***
## speed_ground 1.02370 0.20253 5.055 4.32e-07 ***
## aircraft 5.14051 1.17848 4.362 1.29e-05 ***
## height 0.25834 0.06842 3.776 0.00016 ***
## pitch 1.53924 0.84130 1.830 0.06731 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 636.044 on 836 degrees of freedom
## Residual deviance: 53.212 on 832 degrees of freedom
## AIC: 63.212
##
## Number of Fisher Scoring iterations: 12
Using forward selection by AIC , the final model includes the following variables - speed_ground,aircraft,height and pitch which is different from the analysis using p-values where pitch and height variables were not significant
model1.BIC <- step(lmodnull, scope=list(lower=lmodnull, upper=lmodfull), direction='forward',k = log(nrow(faa.clean)))
## Start: AIC=642.77
## long.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 110.89 149.60
## + aircraft 1 594.65 633.36
## <none> 610.79 642.77
## + pitch 1 607.48 646.20
## + height 1 609.56 648.27
## + duration 1 609.96 648.67
## + no_pasg 1 610.48 649.20
##
## Step: AIC=132.55
## long.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 79.563 107.95
## + height 1 95.938 124.33
## + pitch 1 100.202 128.59
## <none> 110.888 132.55
## + no_pasg 1 110.718 139.11
## + duration 1 110.860 139.25
##
## Step: AIC=106.26
## long.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## + height 1 54.418 87.84
## <none> 79.563 106.25
## + pitch 1 76.598 110.02
## + duration 1 78.833 112.25
## + no_pasg 1 79.414 112.84
##
## Step: AIC=83.98
## long.landing ~ speed_ground + aircraft + height
##
## Df Deviance AIC
## <none> 54.418 83.979
## + pitch 1 51.591 87.882
## + duration 1 53.711 90.002
## + no_pasg 1 54.418 90.709
summary(model1.BIC)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height,
## family = binomial, data = faa.clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.43613 -0.00113 0.00000 0.00000 2.57645
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -103.09744 19.18549 -5.374 7.71e-08 ***
## speed_ground 0.92781 0.17207 5.392 6.96e-08 ***
## aircraft 5.05631 1.11257 4.545 5.50e-06 ***
## height 0.23154 0.05939 3.899 9.66e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 636.04 on 836 degrees of freedom
## Residual deviance: 57.06 on 833 degrees of freedom
## AIC: 65.06
##
## Number of Fisher Scoring iterations: 11
Using forward selection by BIC, the final model includes the following variables - speed_ground,aircraft and height which is different from the analysis using p-values where the height variable was not significant. This is also different from model built using AIC as it does not include pitch variable in the final model
vars = vars <- c('aircraft','speed_ground','height','pitch')
fittedcoeffs <- vector()
fittedoddratios <- vector()
fittedcoeffs[1] <- summary(model1.AIC)$coefficients[2,1]
fittedcoeffs[2] <- summary(model1.AIC)$coefficients[3,1]
fittedcoeffs[3] <- summary(model1.AIC)$coefficients[4,1]
fittedcoeffs[4] <- summary(model1.AIC)$coefficients[5,1]
fittedoddratios[1] <- exp(coef(model1.AIC))[2]
fittedoddratios[2] <- exp(coef(model1.AIC))[3]
fittedoddratios[3] <- exp(coef(model1.AIC))[4]
fittedoddratios[4] <- exp(coef(model1.AIC))[5]
table <- tibble(
variable = vars,
coefficient = fittedcoeffs,
odds_ratio = fittedoddratios,
)
table$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table %>% arrange(desc(coefficient))
## # A tibble: 4 x 4
## variable coefficient odds_ratio dir
## <chr> <dbl> <dbl> <chr>
## 1 speed_ground 5.14 171. positive
## 2 pitch 1.54 4.66 positive
## 3 aircraft 1.02 2.78 positive
## 4 height 0.258 1.29 positive
# pie chart for distribution of risky
pct <- round(table(faa.clean$risky.landing)/length(faa.clean$risky.landing)*100,1)
labs <- c("No","Yes")
labs <- paste(labs,pct)
labs <- paste(labs,"%",sep="")
pie(table(faa.clean$risky.landing),labels= labs,col = rainbow(length(labs)),main = "Pie chart of Risky Landing")
#Logistic Regression using single factor
vars <- c('aircraft','duration','no_pasg','speed_ground','speed_air','height','pitch')
fittedpvalues <- vector()
fittedcoeffs <- vector()
fittedoddratios <- vector()
lmod1 <- glm(risky.landing~aircraft,data = faa.clean)
fittedpvalues[1] <- summary(lmod1)$coefficients[2,4]
fittedcoeffs[1] <- summary(lmod1)$coefficients[2,1]
fittedoddratios[1] <- exp(coef(lmod1))[2]
lmod2 <- glm(risky.landing~duration,data = faa.clean)
fittedpvalues[2] <- summary(lmod2)$coefficients[2,4]
fittedcoeffs[2] <- summary(lmod2)$coefficients[2,1]
fittedoddratios[2] <- exp(coef(lmod2))[2]
lmod3 <- glm(risky.landing~no_pasg,data = faa.clean)
fittedpvalues[3] <- summary(lmod3)$coefficients[2,4]
fittedcoeffs[3] <- summary(lmod3)$coefficients[2,1]
fittedoddratios[3] <- exp(coef(lmod3))[2]
lmod4 <- glm(risky.landing~speed_ground,data = faa.clean)
fittedpvalues[4] <- summary(lmod4)$coefficients[2,4]
fittedcoeffs[4] <- summary(lmod4)$coefficients[2,1]
fittedoddratios[4] <- exp(coef(lmod4))[2]
lmod5 <- glm(risky.landing~speed_air,data = faa.clean)
fittedpvalues[5] <- summary(lmod5)$coefficients[2,4]
fittedcoeffs[5] <- summary(lmod5)$coefficients[2,1]
fittedoddratios[5] <- exp(coef(lmod5))[2]
lmod6 <- glm(risky.landing~height,data = faa.clean)
fittedpvalues[6] <- summary(lmod6)$coefficients[2,4]
fittedcoeffs[6] <- summary(lmod6)$coefficients[2,1]
fittedoddratios[6] <- exp(coef(lmod6))[2]
lmod7 <- glm(risky.landing~pitch,data = faa.clean)
fittedpvalues[7] <- summary(lmod7)$coefficients[2,4]
fittedcoeffs[7] <- summary(lmod7)$coefficients[2,1]
fittedoddratios[7] <- exp(coef(lmod7))[2]
exp(coef(lmod7))
## (Intercept) pitch
## 0.9594595 1.0295539
table <- tibble(
variable = vars,
coefficient = fittedcoeffs,
odds_ratio = fittedoddratios,
pvalues = fittedpvalues[1:7]
)
table$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table %>% arrange(abs(pvalues))
## # A tibble: 7 x 5
## variable coefficient odds_ratio pvalues dir
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 speed_ground 0.00768 1.01 1.15e-66 positive
## 2 speed_air 0.0376 1.04 1.47e-49 positive
## 3 aircraft 0.0699 1.07 1.24e- 4 positive
## 4 pitch 0.0291 1.03 9.27e- 2 positive
## 5 no_pasg -0.00164 0.998 1.78e- 1 negative
## 6 duration -0.000140 1.000 4.72e- 1 negative
## 7 height 0.000174 1.00 8.52e- 1 positive
We observe from the above table that speed_ground, speed_air and aircraft are the significant variables at alpha = 0.05
# Visualize the association of the significant variables
par(mfrow = c(2,2))
plot(faa.clean$risky.landing ~ faa.clean$speed_ground,xlab = 'speed_ground',ylab = 'risky.landing')
plot(faa.clean$risky.landing ~ faa.clean$speed_air,xlab = 'speed_air',ylab = 'risky.landing')
plot(faa.clean$risky.landing ~ faa.clean$aircraft,xlab = 'aircraft',ylab = 'risky.landing')
plot(faa.clean$risky.landing ~ faa.clean$pitch,xlab = 'pitch',ylab = 'risky.landing')
# initiate null and full models
lmodnull <- glm(risky.landing ~ 1, family = binomial, data = faa.clean)
lmodfull <- glm(risky.landing ~.-(speed_air + long.landing), family = binomial, data = faa.clean)
model2.AIC <- step(lmodnull, scope=list(lower=lmodnull, upper=lmodfull), direction='forward')
## Start: AIC=449.06
## risky.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 58.99 75.97
## + aircraft 1 421.54 438.52
## + pitch 1 431.79 448.77
## <none> 434.08 449.06
## + no_pasg 1 432.26 449.24
## + duration 1 433.56 450.54
## + height 1 434.07 451.05
##
## Step: AIC=63.95
## risky.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 40.159 47.118
## + pitch 1 52.021 58.980
## <none> 58.991 63.950
## + no_pasg 1 58.160 65.119
## + height 1 58.563 65.522
## + duration 1 58.714 65.673
##
## Step: AIC=46.29
## risky.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## + no_pasg 1 37.711 45.846
## <none> 40.159 46.294
## + height 1 39.411 47.546
## + pitch 1 39.963 48.098
## + duration 1 40.063 48.198
##
## Step: AIC=45.85
## risky.landing ~ speed_ground + aircraft + no_pasg
##
## Df Deviance AIC
## <none> 37.711 45.853
## + height 1 37.078 47.220
## + pitch 1 37.434 47.576
## + duration 1 37.711 47.853
summary(model2.AIC)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg,
## family = binomial, data = faa.clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.35062 -0.00008 0.00000 0.00000 1.87851
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -101.01866 25.61661 -3.943 8.03e-05 ***
## speed_ground 0.96032 0.23587 4.071 4.67e-05 ***
## aircraft 4.71633 1.47244 3.203 0.00136 **
## no_pasg -0.08585 0.05753 -1.492 0.13561
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 447.057 on 836 degrees of freedom
## Residual deviance: 37.853 on 833 degrees of freedom
## AIC: 45.853
##
## Number of Fisher Scoring iterations: 12
Using forward selection by AIC , the final model includes the following variables - speed_ground, aircraft and no_pasg
model2.BIC <- step(lmodnull, scope=list(lower=lmodnull, upper=lmodfull), direction='forward',k = log(nrow(faa.clean)))
## Start: AIC=453.79
## risky.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 58.99 85.43
## + aircraft 1 421.54 447.98
## <none> 434.08 453.79
## + pitch 1 431.79 458.23
## + no_pasg 1 432.26 458.70
## + duration 1 433.56 460.00
## + height 1 434.07 460.51
##
## Step: AIC=73.41
## risky.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 40.159 61.307
## + pitch 1 52.021 73.170
## <none> 58.991 73.410
## + no_pasg 1 58.160 79.309
## + height 1 58.563 79.712
## + duration 1 58.714 79.863
##
## Step: AIC=60.48
## risky.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## <none> 40.159 60.483
## + no_pasg 1 37.711 64.766
## + height 1 39.411 66.466
## + pitch 1 39.963 67.017
## + duration 1 40.063 67.117
summary(model2.BIC)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial,
## data = faa.clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2561 -0.0001 0.0000 0.0000 1.6054
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -103.3066 24.8837 -4.152 3.30e-05 ***
## speed_ground 0.9374 0.2258 4.152 3.29e-05 ***
## aircraft 4.0896 1.2485 3.276 0.00105 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 447.057 on 836 degrees of freedom
## Residual deviance: 40.294 on 834 degrees of freedom
## AIC: 46.294
##
## Number of Fisher Scoring iterations: 12
Using forward selection by BIC, the final model includes the following variables - speed_ground and aircraft which is same as the analysis using p-values. However, this is different from model built using AIC as it does not include no_pasg variable in the final model
vars = vars <- c('aircraft','speed_ground')
fittedcoeffs <- vector()
fittedoddratios <- vector()
fittedcoeffs[1] <- summary(model2.BIC)$coefficients[2,1]
fittedcoeffs[2] <- summary(model2.BIC)$coefficients[3,1]
fittedoddratios[1] <- exp(coef(model2.BIC))[2]
fittedoddratios[2] <- exp(coef(model2.BIC))[3]
table <- tibble(
variable = vars,
coefficient = fittedcoeffs,
odds_ratio = fittedoddratios,
)
table$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table %>% arrange(desc(coefficient))
## # A tibble: 2 x 4
## variable coefficient odds_ratio dir
## <chr> <dbl> <dbl> <chr>
## 1 speed_ground 4.09 59.7 positive
## 2 aircraft 0.937 2.55 positive
long.landing.final.model <- model1.AIC
predprob1 <- predict(long.landing.final.model,type='response')
predout1 <- ifelse(predprob1<0.5,'no','yes')
faa.clean.m1 <- data.frame(faa.clean, predprob1,predout1)
thresh <- seq(0.01,0.5,0.01)
sensitivity1 <- specificity1 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m1$predprob1<thresh[j],"no","yes")
xx <- xtabs(~long.landing+pp,faa.clean.m1)
specificity1[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity1[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}
risky.landing.final.model <- model2.BIC
predprob2 <- predict(risky.landing.final.model,type='response')
predout2 <- ifelse(predprob2<0.5,'no','yes')
faa.clean.m2 <- data.frame(faa.clean, predprob2,predout2)
thresh <- seq(0.01,0.5,0.01)
sensitivity2 <- specificity2 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m2$predprob2<thresh[j],"no","yes")
xx <- xtabs(~risky.landing+pp,faa.clean.m2)
specificity2[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity2[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}
#matplot(thresh,cbind(sensitivity1,specificity1),type ='l',xlab="Threshold",ylab='Proportion',lty=1:2)
plot(1-specificity1,sensitivity1,type = 'l',col = 'red',xlab = '1- specificity', ylab = 'sensitivity');#abline(0,1,lty=2)
lines(1-specificity2,sensitivity2,col="green")
legend(0.04, 0.98, legend=c("Long Landing Model", "Risky Landing model"),
col=c("red", "green"), lty=1:2, cex=0.8)
#install.packages('pROC')
# area under curve for long landing model
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
pred1 <- predict(long.landing.final.model)
roc_obj <- roc(faa.clean$long.landing, pred1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj)
## Area under the curve: 0.9985
# area under curve for risky landing model
pred2 <- predict(risky.landing.final.model)
roc_obj2 <- roc(faa.clean$risky.landing, pred2)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
auc(roc_obj2)
## Area under the curve: 0.9986
#long.landing
new.ind <- data.frame(aircraft = 1, duration = 200,no_pasg = 80, speed_ground = 115,speed_air = 120,height = 40,pitch = 4)
predict(long.landing.final.model,newdata = new.ind,type = 'response',se = T)
## $fit
## 1
## 1
##
## $se.fit
## 1
## 1.402991e-08
##
## $residual.scale
## [1] 1
round(c(1-1.96*1.402991e-08,1+ 1.96*1.402991e-08),3)
## [1] 1 1
##risky.landing
predict(risky.landing.final.model,newdata = new.ind,type = 'response',se = T)
## $fit
## 1
## 0.9998136
##
## $se.fit
## 1
## 0.0003894828
##
## $residual.scale
## [1] 1
round(c(0.9998136-1.96*1.071988e-13,0.9998136 + 1.96*1.071988e-13),3)
## [1] 1 1
#Using the risk factors identified using the model2.BIC(speed_ground + aircraft ) we build a probit model and complementary log-log link
glm.probit <- glm(risky.landing~speed_ground + aircraft ,family = binomial(link = probit),data = faa.clean)
glm.clog <- glm(risky.landing~speed_ground + aircraft ,family = binomial(link = cloglog),data = faa.clean)
round(coef(model2.BIC),3)
## (Intercept) speed_ground aircraft
## -103.307 0.937 4.090
round(coef(glm.probit),3)
## (Intercept) speed_ground aircraft
## -59.420 0.539 2.402
round(coef(glm.clog),3)
## (Intercept) speed_ground aircraft
## -70.009 0.629 2.950
The two models- probit and cloglog are different from the logit model in terms of magnitudes of coefficients but same in sign.
risky.landing.logit.model <- model2.BIC
predprob2 <- predict(risky.landing.logit.model,type='response')
predout2 <- ifelse(predprob2<0.5,'no','yes')
faa.clean.m2 <- data.frame(faa.clean, predprob2,predout2)
thresh <- seq(0.01,0.5,0.01)
sensitivity2 <- specificity2 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m2$predprob2<thresh[j],"no","yes")
xx <- xtabs(~risky.landing+pp,faa.clean.m2)
specificity2[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity2[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}
risky.landing.probit.model <- glm.probit
predprob3 <- predict(risky.landing.probit.model,type='response')
predout3 <- ifelse(predprob3<0.5,'no','yes')
faa.clean.m3 <- data.frame(faa.clean, predprob3,predout3)
thresh <- seq(0.01,0.5,0.01)
sensitivity3 <- specificity3 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m3$predprob3<thresh[j],"no","yes")
xx <- xtabs(~risky.landing+pp,faa.clean.m3)
specificity3[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity3[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}
risky.landing.clog.model <- glm.clog
predprob4 <- predict(risky.landing.clog.model,type='response')
predout4 <- ifelse(predprob4<0.5,'no','yes')
faa.clean.m4 <- data.frame(faa.clean, predprob4,predout4)
thresh <- seq(0.01,0.5,0.01)
sensitivity4 <- specificity4 <- rep(NA,length(thresh))
for ( j in seq(along = thresh)){
pp <- ifelse(faa.clean.m4$predprob4<thresh[j],"no","yes")
xx <- xtabs(~risky.landing+pp,faa.clean.m4)
specificity4[j]<- xx[1,1]/(xx[1,1]+xx[1,2])
sensitivity4[j]<- xx[2,2]/(xx[2,1]+xx[2,2])
}
#par(mfrow =c(1,2))
plot(1-specificity2,sensitivity2,type = 'l',col = 'red');
lines(1-specificity3,sensitivity3,col="green")
lines(1-specificity4,sensitivity4,col="blue")
legend(0.025, 0.98, legend=c("Logit model", "Probit Model", "CLog model"),
col=c("red", "green","blue"), lty=1:2, cex=0.8)
We can see tha both the clog model and logit model overlap and have idential ROC curves
# logit <- predict(model2.BIC,type="response")
# probit <- predict(glm.probit, type = "response")
# cloglog <- predict(glm.clog, type = "response")
# cat('logit top 5 records:\n ')
# tail(sort(p_logit),5);
# cat('probit top 5 records:\n ')
# tail(sort(p_probit),5);
# cat('clog top 5 records:\n ')
# tail(sort(p_cloglog),5)
#Output
# logit top 5 records:
# 400 66 317 373 166
# 1 1 1 1 1
# probit top 5 records:
# 373 396 400 421 662
# 1 1 1 1 1
# clog top 5 records:
# 662 688 770 784 788
# 1 1 1 1 1
The three models do not point to the same flights. Both logit and probit models predict flight#400 and 373 as risky landings
predict(glm.probit,newdata = new.ind,type = 'response',se = T)
## $fit
## 1
## 0.9999996
##
## $se.fit
## 1
## 2.179702e-06
##
## $residual.scale
## [1] 1
round(c(0.9999832-1.96*2.179702e-06,0.9999832 + 1.96*2.179702e-06 ),3)
## [1] 1 1
predict(glm.clog,newdata = new.ind,type = 'response',se = T)
## $fit
## 1
## 1
##
## $se.fit
## 1
## 2.574406e-16
##
## $residual.scale
## [1] 1
round(c(1-1.96*2.747415e-16,1 + 1.96*2.747415e-16),3)
## [1] 1 1
The fitted values in each of the logit, probit and clog models are equal to 1.