Background: Flight landing.
Motivation: To reduce the risk of landing overrun.
Goal: To study what factors and how they would impact the landing distance of a commercial flight - using logistic regression
Data: Landing data from 950 commercial flights
library(tidyverse) #to visualize, transform, input, tidy and join data
library(haven) #to input data from SAS
library(dplyr) #data wrangling
library(stringr) #string related functions
library(kableExtra) #to create HTML Table
library(DT) #to preview the data sets
library(ggplot2) #to vizualize data
library(MASS) #step function
library(ROCR) #to plot ROCR
## # A tibble: 6 x 8
## aircraft duration no_pasg speed_ground speed_air height pitch distance
## <fct> <dbl> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 boeing 98.5 53 108. 109. 27.4 4.04 3370.
## 2 boeing 126. 69 102. 103. 27.8 4.12 2988.
## 3 boeing 112. 61 71.1 NA 18.6 4.43 1145.
## 4 boeing 197. 56 85.8 NA 30.7 3.88 1664.
## 5 boeing 90.1 70 59.9 NA 32.4 4.03 1050.
## 6 boeing 138. 55 75.0 NA 41.2 4.20 1627.
Created two binary variables below:
long.landing = 1 if distance > 2500; =0 otherwise
risky.landing = 1 if distance > 3000; =0 otherwise
# creating long.landing and risky.landing variables and removing distance variable
FAA_merged3 <- FAA_merged2 %>% mutate(long.landing = ifelse(distance>2500,1,0)) %>%
mutate(risky.landing =ifelse(distance>3000,1,0)) %>% dplyr::select(-distance)
str(FAA_merged3)
## Classes 'tbl_df', 'tbl' and 'data.frame': 831 obs. of 9 variables:
## $ aircraft : Factor w/ 2 levels "airbus","boeing": 2 2 2 2 2 2 2 2 2 2 ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : int 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground : num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ long.landing : num 1 1 0 0 0 0 0 0 0 0 ...
## $ risky.landing: num 1 0 0 0 0 0 0 0 0 0 ...
head(FAA_merged3)
## # A tibble: 6 x 9
## aircraft duration no_pasg speed_ground speed_air height pitch
## <fct> <dbl> <int> <dbl> <dbl> <dbl> <dbl>
## 1 boeing 98.5 53 108. 109. 27.4 4.04
## 2 boeing 126. 69 102. 103. 27.8 4.12
## 3 boeing 112. 61 71.1 NA 18.6 4.43
## 4 boeing 197. 56 85.8 NA 30.7 3.88
## 5 boeing 90.1 70 59.9 NA 32.4 4.03
## 6 boeing 138. 55 75.0 NA 41.2 4.20
## # ... with 2 more variables: long.landing <dbl>, risky.landing <dbl>
# creating long.landing and risky.landing variables and removing distance variable
FAA_merged3 %>% ggplot(aes(x=long.landing, fill=as.factor(long.landing))) +geom_bar() +theme_classic() +
geom_text(stat = "count", aes(label = ..count.., y = ..count..),position = position_dodge(width = 1),
vjust = -0.5, size = 6)
variables <- c("aircraft", "duration","no_pasg" ,"speed_ground", "speed_air" ,"height" ,"pitch", "risky.landing")
coeff3 <- rep(NA,length(variables))
p_val3 <- rep(NA,length(variables))
fit_all3 <- data.frame(variables, coeff3,p_val3)
for (i in seq_along(variables))
{
fit_all3$coeff3[i] <- summary(glm(FAA_merged3$long.landing ~ FAA_merged3[[variables[i]]],family=binomial))$coefficients[,1][2]
fit_all3$p_val3[i] <- summary(glm(FAA_merged3$long.landing ~ FAA_merged3[[variables[i]]],family=binomial))$coefficients[,4][2]
}
table1 <- fit_all3 %>%
mutate(sign_coef3 = ifelse(coeff3>0,"+","-")) %>%
mutate(odds_ratio= exp(coeff3)) %>%
arrange(p_val3)
table1
## variables coeff3 p_val3 sign_coef3 odds_ratio
## 1 speed_ground 0.472345752 3.935339e-14 + 1.603752e+00
## 2 speed_air 0.512321765 4.334124e-11 + 1.669162e+00
## 3 aircraft 0.864119860 8.398591e-05 + 2.372917e+00
## 4 pitch 0.400527824 4.664982e-02 + 1.492612e+00
## 5 height 0.008623997 4.218576e-01 + 1.008661e+00
## 6 no_pasg -0.007256406 6.058565e-01 - 9.927699e-01
## 7 duration -0.001070492 6.305122e-01 - 9.989301e-01
## 8 risky.landing 21.418699942 9.795390e-01 + 2.004579e+09
# speed_ground
plot(jitter(long.landing,0.1)~jitter(speed_ground),FAA_merged3,xlab="Speed Ground",ylab="Long Landing",pch=".")
ggplot(FAA_merged3,aes(x=speed_ground,fill=as.factor(long.landing))) + geom_histogram(position="dodge",binwidth=1)
# speed_air
plot(jitter(long.landing,0.1)~jitter(speed_air),FAA_merged3,xlab="Speed Air",ylab="Long Landing",pch=".")
ggplot(FAA_merged3,aes(x=speed_air,fill=as.factor(long.landing))) + geom_histogram(position="dodge",binwidth=1)
# pitch
plot(jitter(long.landing,0.1)~jitter(pitch),FAA_merged3,xlab="Pitch",ylab="Long Landing",pch=".")
ggplot(FAA_merged3,aes(x=pitch,fill=as.factor(long.landing))) + geom_histogram(position="dodge",binwidth=1)
# aircraft
ggplot(FAA_merged3, aes(as.factor(aircraft), fill=as.factor(long.landing))) +
geom_bar(position = "fill") + xlab("Aircraft")
fit5 <- glm(long.landing ~ speed_ground + aircraft + pitch,
family=binomial, data = FAA_merged3)
summary(fit5)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + pitch,
## family = binomial, data = FAA_merged3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.11589 -0.01116 -0.00026 0.00000 2.40741
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -67.92855 10.48408 -6.479 9.22e-11 ***
## speed_ground 0.61471 0.09184 6.694 2.18e-11 ***
## aircraftboeing 3.04348 0.73345 4.150 3.33e-05 ***
## pitch 1.06599 0.60389 1.765 0.0775 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 81.309 on 827 degrees of freedom
## AIC: 89.309
##
## Number of Fisher Scoring iterations: 10
fit_base <- glm(long.landing ~ 1, family=binomial, data = FAA_merged3[,-5])
fit_max <- glm(long.landing ~., family=binomial, data = FAA_merged3[,-5])
aic_fit6 <- step(fit_base, scope=list(lower=fit_base, upper=fit_max), direction = "forward")
## Start: AIC=624.78
## long.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 107.40 136.49
## + risky.landing 1 309.08 338.17
## + aircraft 1 583.49 612.58
## + pitch 1 595.08 624.16
## <none> 597.69 624.78
## + height 1 597.29 626.37
## + no_pasg 1 597.46 626.54
## + duration 1 597.46 626.55
##
## Step: AIC=119.47
## long.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 78.164 92.233
## + height 1 95.059 109.129
## + pitch 1 97.006 111.076
## + risky.landing 1 104.660 118.729
## <none> 107.401 119.470
## + duration 1 107.296 121.365
## + no_pasg 1 107.375 121.444
##
## Step: AIC=90.66
## long.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## + height 1 54.401 68.902
## + pitch 1 75.176 89.677
## <none> 78.164 90.665
## + duration 1 76.635 91.136
## + risky.landing 1 77.646 92.147
## + no_pasg 1 77.824 92.325
##
## Step: AIC=65.05
## long.landing ~ speed_ground + aircraft + height
##
## Df Deviance AIC
## + pitch 1 51.580 64.225
## <none> 54.401 65.047
## + risky.landing 1 53.634 66.279
## + duration 1 53.680 66.325
## + no_pasg 1 54.401 67.047
##
## Step: AIC=63.2
## long.landing ~ speed_ground + aircraft + height + pitch
##
## Df Deviance AIC
## <none> 51.580 63.204
## + duration 1 51.102 64.726
## + risky.landing 1 51.235 64.859
## + no_pasg 1 51.575 65.199
summary(aic_fit6)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height +
## pitch, family = binomial, data = FAA_merged3[, -5])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.20284 -0.00054 0.00000 0.00000 2.35719
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -119.77598 24.41821 -4.905 9.33e-07 ***
## speed_ground 1.02266 0.20290 5.040 4.65e-07 ***
## aircraftboeing 5.13443 1.18091 4.348 1.37e-05 ***
## height 0.25795 0.06861 3.760 0.00017 ***
## pitch 1.53751 0.84109 1.828 0.06755 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 53.204 on 826 degrees of freedom
## AIC: 63.204
##
## Number of Fisher Scoring iterations: 12
aic_fit7 <- step(fit_base, scope=list(lower=fit_base, upper=fit_max), direction = "forward",
k= log(nrow(FAA_merged3)), trace=0)
summary(aic_fit7)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height,
## family = binomial, data = FAA_merged3[, -5])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.43442 -0.00117 0.00000 0.00000 2.57435
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -102.95437 19.22882 -5.354 8.59e-08 ***
## speed_ground 0.92657 0.17242 5.374 7.70e-08 ***
## aircraftboeing 5.04813 1.11520 4.527 5.99e-06 ***
## height 0.23106 0.05959 3.877 0.000106 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 57.047 on 827 degrees of freedom
## AIC: 65.047
##
## Number of Fisher Scoring iterations: 11
Model: long.landing ~ speed_ground + aircraft + pitch
## variables8 coeff8 p_val8 sign_coef8 odds_ratio8
## 1 speed ground 0.6147075 2.178523e-11 + 1.849116
## 2 aircraftboeing 3.0434815 3.331297e-05 + 20.978153
## 3 pitch 1.0659869 7.753041e-02 + 2.903703
AIC value: 89.309
Clearly, in the figure1, we can see that high speed ground results in long landing distance, which is a potential threat to landing safely.
Speed ground also gives a similar result. But since we have more data points for speed ground, we will keep speed ground in model instead of speed air
As pitch increases and is close to 5, the proportion of long landing increases compared with previous proportions for lower pitch. It might be interesting to explore pitch variable further.
The proportion of long landing distance= 1 is higher for boeing.
variables <- c("aircraft", "duration","no_pasg" ,"speed_ground", "speed_air" ,"height" ,"pitch", "long.landing")
coeff9.3 <- rep(NA,length(variables))
p_val9.3 <- rep(NA,length(variables))
fit_all9.3 <- data.frame(variables, coeff9.3,p_val9.3)
for (i in seq_along(variables))
{
fit_all9.3$coeff9.3[i] <- summary(glm(FAA_merged3$risky.landing ~ FAA_merged3[[variables[i]]],family=binomial))$coefficients[,1][2]
fit_all9.3$p_val9.3[i] <- summary(glm(FAA_merged3$risky.landing ~ FAA_merged3[[variables[i]]],family=binomial))$coefficients[,4][2]
}
table2 <- fit_all9.3 %>%
mutate(sign_coef9.3 = ifelse(coeff9.3>0,"+","-")) %>%
mutate(odds_ratio= exp(coeff9.3)) %>%
arrange(p_val9.3)
# speed_ground
plot(jitter(risky.landing,0.1)~jitter(speed_ground),FAA_merged3,xlab="Speed Ground",ylab="Risky Landing",pch=".")
ggplot(FAA_merged3,aes(x=speed_ground,fill=as.factor(risky.landing))) + geom_histogram(position="dodge",binwidth=1)
# speed_air
plot(jitter(risky.landing,0.1)~jitter(speed_air),FAA_merged3,xlab="Speed Air",ylab="Risky Landing",pch=".")
ggplot(FAA_merged3,aes(x=speed_air,fill=as.factor(risky.landing))) + geom_histogram(position="dodge",binwidth=1)
# pitch
plot(jitter(risky.landing,0.1)~jitter(pitch),FAA_merged3,xlab="Pitch",ylab="Risky Landing",pch=".")
ggplot(FAA_merged3,aes(x=pitch,fill=as.factor(risky.landing))) + geom_histogram(position="dodge",binwidth=1)
# aircraft
ggplot(FAA_merged3, aes(as.factor(aircraft), fill=as.factor(risky.landing))) +
geom_bar(position = "fill") + xlab("Aircraft")
fit9.5 <- glm(risky.landing ~ speed_ground + aircraft,
family=binomial, data = FAA_merged3)
summary(fit9.5)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial,
## data = FAA_merged3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.24398 -0.00011 0.00000 0.00000 1.61021
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -102.0772 24.7751 -4.120 3.79e-05 ***
## speed_ground 0.9263 0.2248 4.121 3.78e-05 ***
## aircraftboeing 4.0190 1.2494 3.217 0.0013 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 40.097 on 828 degrees of freedom
## AIC: 46.097
##
## Number of Fisher Scoring iterations: 12
fit_base9.6 <- glm(risky.landing ~ 1, family=binomial, data = FAA_merged3[,-5])
fit_max9.6 <- glm(risky.landing ~., family=binomial, data = FAA_merged3[,-5])
aic_fit9.6 <- step(fit_base9.6, scope=list(lower=fit_base9.6, upper=fit_max9.6), direction = "forward")
## Start: AIC=438.04
## risky.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 57.99 74.82
## + long.landing 1 134.60 151.43
## + aircraft 1 412.07 428.90
## + no_pasg 1 421.18 438.01
## <none> 423.22 438.04
## + pitch 1 421.54 438.37
## + duration 1 423.04 439.87
## + height 1 423.13 439.95
##
## Step: AIC=62.93
## risky.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 39.955 46.898
## + pitch 1 51.634 58.576
## + long.landing 1 53.529 60.471
## <none> 57.988 62.931
## + no_pasg 1 57.178 64.121
## + height 1 57.787 64.729
## + duration 1 57.951 64.893
##
## Step: AIC=46.1
## risky.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## + no_pasg 1 37.559 45.700
## <none> 39.955 46.097
## + height 1 39.295 47.436
## + long.landing 1 39.461 47.602
## + duration 1 39.757 47.898
## + pitch 1 39.783 47.924
##
## Step: AIC=45.71
## risky.landing ~ speed_ground + aircraft + no_pasg
##
## Df Deviance AIC
## <none> 37.559 45.707
## + height 1 36.985 47.133
## + long.landing 1 37.222 47.370
## + pitch 1 37.304 47.452
## + duration 1 37.548 47.696
summary(aic_fit9.6)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg,
## family = binomial, data = FAA_merged3[, -5])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.33913 -0.00009 0.00000 0.00000 1.87810
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -99.90780 25.57993 -3.906 9.39e-05 ***
## speed_ground 0.94963 0.23559 4.031 5.56e-05 ***
## aircraftboeing 4.64188 1.47520 3.147 0.00165 **
## no_pasg -0.08462 0.05732 -1.476 0.13987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 37.707 on 827 degrees of freedom
## AIC: 45.707
##
## Number of Fisher Scoring iterations: 12
aic_fit9.7 <- step(fit_base9.6, scope=list(lower=fit_base9.6, upper=fit_max9.6), direction = "forward",
k= log(nrow(FAA_merged3)), trace=0)
summary(aic_fit9.7)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = binomial,
## data = FAA_merged3[, -5])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.24398 -0.00011 0.00000 0.00000 1.61021
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -102.0772 24.7751 -4.120 3.79e-05 ***
## speed_ground 0.9263 0.2248 4.121 3.78e-05 ***
## aircraftboeing 4.0190 1.2494 3.217 0.0013 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 40.097 on 828 degrees of freedom
## AIC: 46.097
##
## Number of Fisher Scoring iterations: 12
Model: risky.landing ~ speed_ground + aircraft
## variables9.8 coeff9.8 p_val9.8 sign_coef9.8 odds_ratio9.8
## 1 speed ground 0.9262743 3.778843e-05 + 2.525084
## 2 aircraftboeing 4.0190312 1.296316e-03 + 55.647168
AIC value: 46.097
Clearly, in the figure, we can see that high speed ground results in risky landing=1, which is a potential threat to landing safely.
Speed ground also gives a similar result. But since we have more data points for speed ground, we will keep speed ground in model instead of speed air
Pitch does not seem to play a significant role when we see the histogram of pitch for risky landing = 0 and 1. We will drop it from our model.
The proportion of risky landing distance= 1 is higher for boeing. Aircraft type seem to play a role.
We get the same model with step BIC as the model that we selected manually in step 9.5
Variable pitch is included in the model with response = long landing but not in the model with response = risky landing
The AIC value of the model with response variable = risky landing is 46.097 which is almost half of the AIC value of model with response variable = long landing: 89.309. This indicates that the former model is better.
pred1 <- predict(fit9.5, type = "response")
pred_ROC <- prediction(pred1, FAA_merged3$risky.landing)
perf1 <- performance(pred_ROC, "tpr", "fpr")
plot(perf1, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred_ROC, "auc"), "y.values"))
## [1] 0.9986161
pred2 <- predict(fit5, type = "response")
pred_ROC2 <- prediction(pred2, FAA_merged3$long.landing)
perf2 <- performance(pred_ROC2, "tpr", "fpr")
plot(perf2, colorize=TRUE)
#Get the AUC
unlist(slot(performance(pred_ROC2, "auc"), "y.values"))
## [1] 0.996666
Since we are plotting the ROC curve on the same data that we used in our model, the area under the curve is high. We need test data to actually gauge the prediction performance of our model
Among the two models, the AUC is higher for the model: risky landing ~speed_ground + aircraft
ilogit <- function (x) {
exp(x)/(1+exp(x))
}
# predicting probability of risky landing
new.ind_risky<-data.frame(aircraft="boeing", duration=200, no_pasg=80, speed_ground=115, speed_air=120, height=40, pitch=4)
# Predicted probability for risky landing
predict(fit9.5,newdata=new.ind_risky,type="link",se=T)
## $fit
## 1
## 8.463332
##
## $se.fit
## [1] 2.089367
##
## $residual.scale
## [1] 1
# CI
round(ilogit(c(8.463332-1.96*2.089367,8.463332+1.96*2.089367)),3)
## [1] 0.987 1.000
# predicting probability of long landing
new.ind_long<-data.frame(aircraft="boeing", duration=200, no_pasg=80, speed_ground=115, speed_air=120, height=40, pitch=4)
# Predicted probability for long landing
predict(fit5,newdata=new.ind_long,type="link",se=T)
## $fit
## 1
## 10.07025
##
## $se.fit
## [1] 1.616338
##
## $residual.scale
## [1] 1
# CI
round(ilogit(c(10.07025-1.96*1.616338,10.07025+1.96*1.616338)),3)
## [1] 0.999 1.000
lmod.probit<-glm(risky.landing ~ speed_ground + aircraft,family=binomial(link=probit),FAA_merged3)
lmod.cloglog<-glm(risky.landing ~ speed_ground + aircraft,family=binomial(link=cloglog),FAA_merged3)
round(coef(fit9.5),3)
## (Intercept) speed_ground aircraftboeing
## -102.077 0.926 4.019
round(coef(lmod.probit),3)
## (Intercept) speed_ground aircraftboeing
## -58.693 0.532 2.357
round(coef(lmod.cloglog),3)
## (Intercept) speed_ground aircraftboeing
## -69.265 0.622 2.898
# ROC for probit
pred15.1 <- predict(lmod.probit, type = "response")
pred_ROC15.1 <- prediction(pred15.1, FAA_merged3$risky.landing)
perf15.1 <- performance(pred_ROC15.1, "tpr", "fpr")
#Get the AUC
unlist(slot(performance(pred_ROC15.1, "auc"), "y.values"))
## [1] 0.9986161
# ROC for complimentary log log
pred15.2 <- predict(lmod.cloglog, type = "response")
pred_ROC15.2 <- prediction(pred15.2, FAA_merged3$risky.landing)
perf15.2 <- performance(pred_ROC15.2, "tpr", "fpr")
#Get the AUC
unlist(slot(performance(pred_ROC15.2, "auc"), "y.values"))
## [1] 0.9985736
plot(perf1, type="l", lty=1)
plot(perf15.1, type="l", lty=2,add=TRUE)
plot(perf15.2, type="l", lty=3, add=TRUE)
top_risky_observations <- data.frame(logit_prob=fit9.5$fitted.values,
probit_prob = lmod.probit$fitted.values,
clog_prob = lmod.cloglog$fitted.values)
top_risky_observations$index <- 1:nrow(top_risky_observations)
head(top_risky_observations)
## logit_prob probit_prob clog_prob index
## 1 8.700396e-01 8.646452e-01 8.827953e-01 1
## 2 1.989379e-02 1.285756e-02 4.271187e-02 2
## 3 2.220446e-16 2.220446e-16 2.356424e-10 3
## 4 8.599751e-09 2.220446e-16 2.291489e-06 4
## 5 2.220446e-16 2.220446e-16 2.271906e-13 5
## 6 3.893281e-13 2.220446e-16 2.771489e-09 6
top5risky_logit <- top_risky_observations %>% arrange(desc(logit_prob)) %>%
top_n(n=5, wt=logit_prob) %>% dplyr::select(logit_prob,index)
top5risky_probit <- top_risky_observations %>% arrange(desc(probit_prob)) %>%
top_n(n=5, wt=probit_prob) %>% dplyr::select(probit_prob,index)
top5risky_clog <- top_risky_observations %>% arrange(desc(clog_prob)) %>%
top_n(n=5, wt=clog_prob) %>% dplyr::select(clog_prob,index)
index_compare <- data.frame(logit=head(top5risky_logit,5), probit=head(top5risky_probit,5),
clog=head(top5risky_clog,5))
index_compare
## logit.logit_prob logit.index probit.probit_prob probit.index
## 1 1 362 1 56
## 2 1 307 1 64
## 3 1 64 1 134
## 4 1 387 1 176
## 5 1 408 1 179
## clog.clog_prob clog.index
## 1 1 19
## 2 1 29
## 3 1 30
## 4 1 56
## 5 1 64
# Predicted probability for risky landing - probit
predict(lmod.probit,newdata=new.ind_risky,type="link",se=T)
## $fit
## 1
## 4.872041
##
## $se.fit
## [1] 1.127892
##
## $residual.scale
## [1] 1
# CI
round(ilogit(c(4.872041-1.96*1.127892,4.872041+1.96*1.127892)),3)
## [1] 0.935 0.999
# predicting probability of risky landing - cloglog
predict(lmod.cloglog ,newdata=new.ind_long,type="link",se=T)
## $fit
## 1
## 5.16944
##
## $se.fit
## [1] 1.173423
##
## $residual.scale
## [1] 1
# CI
round(ilogit(c(5.16944-1.96*1.173423,5.16944+1.96*1.173423)),3)
## [1] 0.946 0.999
# redicting probability of risky landing - cloglog
round(ilogit(c(8.463332-1.96*2.089367,8.463332+1.96*2.089367)),3)
## [1] 0.987 1.000