library(tidyverse) #to visualize, transform, input, tidy and join data
library(dplyr) #data wrangling
library(stringr) #string related functions
library(kableExtra) #to create HTML Table
library(DT) #to preview the data sets
library(lubridate) #to apply the date functions
library(xlsx) #to load excel files
library(ROCR) #to use ROC curves
library(faraway) #to use the ilogit function
Loaded the datasets and did the initial data cleaning (detailed steps were performed in the first assignment).
# Read the two FAA landing workbooks and stack them row-wise.
faa1 <- read.xlsx("FAA1.xls", sheetName = "FAA1")
faa2 <- read.xlsx("FAA2_2.xls", sheetName = "Sheet1")
faa <- bind_rows(faa1, faa2)

# Flag rows that repeat an earlier row on every column except duration.
is_repeat <- duplicated(select(faa, -duration))
check <- which(is_repeat)
# Drop them with a logical mask: `faa[-check, ]` would return zero rows when
# `check` is empty, because negative indexing with integer(0) selects nothing.
faa <- faa[!is_repeat, ]

# Keep only rows inside the accepted ranges; a missing duration is tolerated
# here because it is imputed below.
faa_check <- faa %>%
  filter(
    (duration > 40 | is.na(duration)) &
      (speed_ground >= 30) & (speed_ground <= 140) &
      (height >= 6) & (distance < 6000)
  )
faa <- faa_check

# Mean-impute missing durations into a separate column.
faa$duration_corrected <- ifelse(
  is.na(faa$duration),
  mean(faa$duration, na.rm = TRUE),
  faa$duration
)
Step 1:
# Derive the two binary responses from landing distance (> 2500 and > 3000),
# turn aircraft into a factor, then drop the raw duration and distance columns.
faa <- mutate(
  faa,
  long.landing = as.factor(ifelse(distance > 2500, 1, 0)),
  risky.landing = as.factor(ifelse(distance > 3000, 1, 0)),
  aircraft = as.factor(aircraft)
)
faa <- select(faa, -duration, -distance)
Step 2: Bar chart to show the distribution of “long.landing”
# Bar chart of the number of flights in each long.landing class.
ggplot(data = faa, mapping = aes(x = long.landing)) +
  geom_bar()
Only 12% of the flights are long landings.
# Share of flights in each long.landing class, to two decimals.
round(x = prop.table(table(faa$long.landing)), digits = 2)
##
## 0 1
## 0.88 0.12
Step 3:
# Single-predictor logistic regressions of long.landing, one per candidate
# variable. The data frame is passed via `data =` rather than writing `faa$`
# inside the formula, so the term names stay clean and the fits can be used
# with predict(newdata = ...).
mdl_duration <- glm(long.landing ~ duration_corrected, family = "binomial", data = faa)
mdl_speedgrnd <- glm(long.landing ~ speed_ground, family = "binomial", data = faa)
mdl_height <- glm(long.landing ~ height, family = "binomial", data = faa)
mdl_pitch <- glm(long.landing ~ pitch, family = "binomial", data = faa)
mdl_nopasg <- glm(long.landing ~ no_pasg, family = "binomial", data = faa)
mdl_speedair <- glm(long.landing ~ speed_air, family = "binomial", data = faa)
mdl_aircraft <- glm(long.landing ~ aircraft, family = "binomial", data = faa)

# Return the estimate (column 1) and p-value (column 4) of one coefficient
# row. `$coefficients` is spelled out in full to avoid `$` partial matching.
extract_est_p <- function(model, row = 2) {
  summary(model)$coefficients[row, c(1, 4)]
}

duration <- extract_est_p(mdl_duration)
speed_ground <- extract_est_p(mdl_speedgrnd)
height <- extract_est_p(mdl_height)
pitch <- extract_est_p(mdl_pitch)
no_pasg <- extract_est_p(mdl_nopasg)
speed_air <- extract_est_p(mdl_speedair)
aircraft_boeing <- extract_est_p(mdl_aircraft)
# Row 1 of the aircraft model is the intercept, i.e. the log-odds for the
# reference aircraft level (labelled "Aircraft-Airbus" below). Its exponent is
# an odds, not an odds ratio, so read that row differently from the slopes.
aircraft_airbus <- extract_est_p(mdl_aircraft, row = 1)

raw_coefficients <- c(
  duration[1], speed_ground[1], height[1], pitch[1],
  no_pasg[1], speed_air[1], aircraft_boeing[1], aircraft_airbus[1]
)
# Exponentiate the unrounded estimates so rounding is applied only once.
odds_ratio <- round(exp(raw_coefficients), 3)
coefficients <- round(raw_coefficients, digits = 3)

p_value <- c(
  duration[2], speed_ground[2], height[2], pitch[2],
  no_pasg[2], speed_air[2], aircraft_boeing[2], aircraft_airbus[2]
)
p_value <- round(p_value, digits = 3)

variable_names <- c("Duration","Ground Speed","Height","Pitch","No. of Passengers","Air Speed","Aircraft-Boeing", "Aircraft-Airbus")
table_2 <- data.frame(variable_names, coefficients, odds_ratio, p_value)
# Take the sign from the unrounded estimate so a tiny positive slope that
# rounds to 0 is not reported as "Negative".
table_2$slope_direction <- ifelse(raw_coefficients > 0, "Positive", "Negative")
# Most significant predictors first.
table_2 <- arrange(table_2, p_value)
table_2
## variable_names coefficients odds_ratio p_value slope_direction
## 1 Ground Speed 0.472 1.603 0.000 Positive
## 2 Air Speed 0.512 1.669 0.000 Positive
## 3 Aircraft-Boeing 0.864 2.373 0.000 Positive
## 4 Aircraft-Airbus -2.428 0.088 0.000 Negative
## 5 Pitch 0.401 1.493 0.047 Positive
## 6 Height 0.009 1.009 0.422 Positive
## 7 No. of Passengers -0.007 0.993 0.606 Negative
## 8 Duration -0.001 0.999 0.626 Negative
Step 4:
We see that speed_ground, speed_air, aircraft type (Boeing), pitch and height have positive coefficients for long.landing, although the height coefficient is not statistically significant (p = 0.422).
Let’s visualize it -
# Draw dodged density histograms of a vector, one series per long.landing
# class.
#
# x: vector aligned with the rows of the global `faa` (e.g. faa$speed_ground).
# Returns the ggplot object; the fill variable is read from `faa`.
# NOTE(review): the `..density..` notation is deprecated in recent ggplot2
# releases in favour of after_stat(density); switch once the minimum ggplot2
# version for this report is confirmed.
check_plot <- function(x) {
  density_bars <- geom_histogram(mapping = aes(y = ..density..), position = "dodge")
  ggplot(data = faa, mapping = aes(x = x, fill = long.landing)) +
    density_bars
}
The probability of long landing increases with the increase in speed_ground
# Density histogram of ground speed, split by long.landing class.
check_plot(faa$speed_ground)
Probability of long landing increases with increase in speed of air
# Density histogram of air speed, split by long.landing class.
check_plot(faa$speed_air)
Long landing isn’t affected by pitch of aircraft
# Density histogram of pitch, split by long.landing class.
check_plot(faa$pitch)
Long landing seems to be unaffected by the height of the aircraft.
# Density histogram of height, split by long.landing class.
check_plot(faa$height)
Step 5:
In the full model, ground speed, aircraft type and height are significant. Pitch is no longer significant at the 5% level (p = 0.079), unlike in the single-variable table above (p = 0.047).
# Logistic regression of long.landing on all six predictors used in this step.
full_model <- glm(
  formula = long.landing ~ aircraft + no_pasg + speed_ground + height +
    pitch + duration_corrected,
  family = "binomial",
  data = faa
)
summary(full_model)
##
## Call:
## glm(formula = long.landing ~ aircraft + no_pasg + speed_ground +
## height + pitch + duration_corrected, family = "binomial",
## data = faa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.16087 -0.00052 0.00000 0.00000 2.32238
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.215e+02 2.500e+01 -4.858 1.19e-06 ***
## aircraftboeing 5.192e+00 1.200e+00 4.328 1.51e-05 ***
## no_pasg 3.423e-03 5.461e-02 0.063 0.950023
## speed_ground 1.033e+00 2.082e-01 4.960 7.03e-07 ***
## height 2.531e-01 7.253e-02 3.490 0.000483 ***
## pitch 1.484e+00 8.454e-01 1.755 0.079247 .
## duration_corrected 5.287e-03 7.864e-03 0.672 0.501355
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 52.746 on 824 degrees of freedom
## AIC: 66.746
##
## Number of Fisher Scoring iterations: 12
Step 6:
Forward selection based on AIC gives results consistent with the full model above: speed_ground, aircraft, height and pitch are selected.
# Forward stepwise selection by AIC for long.landing, growing from the
# intercept-only model (model01) towards the six-predictor model (model61).
faa_clean <- faa %>%
  select(-c(risky.landing, speed_air))
model01 <- glm(long.landing ~ 1, family = "binomial", data = faa_clean)
model61 <- glm(
  long.landing ~ speed_ground + aircraft + height + no_pasg + duration_corrected + pitch,
  family = "binomial",
  data = faa_clean
)
model_1 <- MASS::stepAIC(
  model01,
  scope = list(lower = model01, upper = model61),
  direction = "forward"
)
## Start: AIC=624.78
## long.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 115.47 119.47
## + aircraft 1 606.55 610.55
## + pitch 1 618.79 622.79
## <none> 622.78 624.78
## + height 1 622.13 626.13
## + no_pasg 1 622.51 626.51
## + duration_corrected 1 622.54 626.54
##
## Step: AIC=119.47
## long.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 84.665 90.665
## + height 1 100.459 106.459
## + pitch 1 105.527 111.527
## <none> 115.470 119.470
## + duration_corrected 1 115.378 121.378
## + no_pasg 1 115.468 121.468
##
## Step: AIC=90.66
## long.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## + height 1 57.047 65.047
## + pitch 1 81.309 89.309
## <none> 84.665 90.665
## + duration_corrected 1 83.164 91.164
## + no_pasg 1 84.219 92.219
##
## Step: AIC=65.05
## long.landing ~ speed_ground + aircraft + height
##
## Df Deviance AIC
## + pitch 1 53.204 63.204
## <none> 57.047 65.047
## + duration_corrected 1 56.288 66.288
## + no_pasg 1 57.031 67.031
##
## Step: AIC=63.2
## long.landing ~ speed_ground + aircraft + height + pitch
##
## Df Deviance AIC
## <none> 53.204 63.204
## + duration_corrected 1 52.750 64.750
## + no_pasg 1 53.204 65.204
# Coefficients and fit statistics of the AIC-selected long-landing model.
summary(model_1)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height +
## pitch, family = "binomial", data = faa_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.20284 -0.00054 0.00000 0.00000 2.35719
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -119.77598 24.41821 -4.905 9.33e-07 ***
## speed_ground 1.02266 0.20290 5.040 4.65e-07 ***
## aircraftboeing 5.13443 1.18091 4.348 1.37e-05 ***
## height 0.25795 0.06861 3.760 0.00017 ***
## pitch 1.53751 0.84109 1.828 0.06755 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 53.204 on 826 degrees of freedom
## AIC: 63.204
##
## Number of Fisher Scoring iterations: 12
Step 7:
We next use the BIC criterion (forward stepwise selection with penalty k = log(n)) to choose the model. Pitch is not chosen by this model, probably because BIC penalizes additional parameters more strongly than AIC.
# Same forward search, but with penalty k = log(n), i.e. the BIC criterion.
model_2 <- MASS::stepAIC(
  model01,
  scope = list(lower = model01, upper = model61),
  direction = "forward",
  k = log(nrow(faa_clean))
)
## Start: AIC=629.5
## long.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 115.47 128.92
## + aircraft 1 606.55 620.00
## <none> 622.78 629.50
## + pitch 1 618.79 632.24
## + height 1 622.13 635.58
## + no_pasg 1 622.51 635.96
## + duration_corrected 1 622.54 635.98
##
## Step: AIC=128.92
## long.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 84.665 104.83
## + height 1 100.459 120.63
## + pitch 1 105.527 125.69
## <none> 115.470 128.92
## + duration_corrected 1 115.378 135.54
## + no_pasg 1 115.468 135.64
##
## Step: AIC=104.83
## long.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## + height 1 57.047 83.937
## <none> 84.665 104.832
## + pitch 1 81.309 108.200
## + duration_corrected 1 83.164 110.054
## + no_pasg 1 84.219 111.110
##
## Step: AIC=83.94
## long.landing ~ speed_ground + aircraft + height
##
## Df Deviance AIC
## <none> 57.047 83.937
## + pitch 1 53.204 86.817
## + duration_corrected 1 56.288 89.901
## + no_pasg 1 57.031 90.644
# Coefficients and fit statistics of the BIC-selected long-landing model.
summary(model_2)
##
## Call:
## glm(formula = long.landing ~ speed_ground + aircraft + height,
## family = "binomial", data = faa_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.43442 -0.00117 0.00000 0.00000 2.57435
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -102.95437 19.22882 -5.354 8.59e-08 ***
## speed_ground 0.92657 0.17242 5.374 7.70e-08 ***
## aircraftboeing 5.04813 1.11520 4.527 5.99e-06 ***
## height 0.23106 0.05959 3.877 0.000106 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 622.778 on 830 degrees of freedom
## Residual deviance: 57.047 on 827 degrees of freedom
## AIC: 65.047
##
## Number of Fisher Scoring iterations: 11
Step 8:
The variables and their contributions to the prediction of long landing are -
# Re-print the single-predictor summary table built earlier.
table_2
## variable_names coefficients odds_ratio p_value slope_direction
## 1 Ground Speed 0.472 1.603 0.000 Positive
## 2 Air Speed 0.512 1.669 0.000 Positive
## 3 Aircraft-Boeing 0.864 2.373 0.000 Positive
## 4 Aircraft-Airbus -2.428 0.088 0.000 Negative
## 5 Pitch 0.401 1.493 0.047 Positive
## 6 Height 0.009 1.009 0.422 Positive
## 7 No. of Passengers -0.007 0.993 0.606 Negative
## 8 Duration -0.001 0.999 0.626 Negative
The various plots that helped us understand the relationship between the variables better-
The probability of long landing increases with the increase in speed_ground
# Density histogram of ground speed, split by long.landing class.
check_plot(faa$speed_ground)
Probability of long landing increases with increase in speed of air
# Density histogram of air speed, split by long.landing class.
check_plot(faa$speed_air)
Long landing isn’t affected by pitch of aircraft
# Density histogram of pitch, split by long.landing class.
check_plot(faa$pitch)
Long landing seems to be unaffected by the height of the aircraft.
# Density histogram of height, split by long.landing class.
check_plot(faa$height)
Based on our analysis, our final model is: long.landing ~ speed_ground + aircraft + height
Step 9: Repeating all the steps for risky landing—
# Bar chart of the number of flights in each risky.landing class.
ggplot(data = faa, mapping = aes(x = risky.landing)) +
  geom_bar()
Only 7% of the flights are risky landings.
# Share of flights in each risky.landing class, to two decimals.
round(x = prop.table(table(faa$risky.landing)), digits = 2)
##
## 0 1
## 0.93 0.07
Ground speed, air speed and aircraft make appear likely to affect risky landing.
## variable_names coefficients odds_ratio p_value slope_direction
## 1 Ground Speed 0.614 1.848 0.000 Positive
## 2 Air Speed 0.870 2.387 0.000 Positive
## 3 Aircraft-Boeing 1.002 2.724 0.000 Positive
## 4 Aircraft-Airbus -3.108 0.045 0.000 Negative
## 5 Pitch 0.371 1.449 0.143 Positive
## 6 No. of Passengers -0.025 0.975 0.154 Negative
## 7 Duration -0.001 0.999 0.674 Negative
## 8 Height -0.002 0.998 0.871 Negative
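We see that speed_ground, speed_air, aircraft type (Boeing) and pitch have positive coefficients for risky.landing, while height has a slightly negative one; only ground speed, air speed and aircraft type are statistically significant.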
Let’s visualize it -
# Draw dodged density histograms of a vector, one series per risky.landing
# class.
#
# x: vector aligned with the rows of the global `faa` (e.g. faa$speed_ground).
# Returns the ggplot object; the fill variable is read from `faa`.
check_plot_risky <- function(x) {
  density_bars <- geom_histogram(mapping = aes(y = ..density..), position = "dodge")
  ggplot(data = faa, mapping = aes(x = x, fill = risky.landing)) +
    density_bars
}
The probability of risky landing increases with the increase in speed_ground
# Density histogram of ground speed, split by risky.landing class.
check_plot_risky(faa$speed_ground)
Probability of risky landing increases with increase in speed of air
# Density histogram of air speed, split by risky.landing class.
check_plot_risky(faa$speed_air)
Risky landing isn’t affected by pitch of aircraft
# Density histogram of pitch, split by risky.landing class.
check_plot_risky(faa$pitch)
Risky landing seems to be unaffected by the height of the aircraft.
# Density histogram of height, split by risky.landing class.
check_plot_risky(faa$height)
I observed that speed of ground and make of aircraft are significant. Results seem to be consistent with our observations before
# Logistic regression of risky.landing on the same six predictors; this
# reuses (overwrites) the `full_model` name.
full_model <- glm(
  formula = risky.landing ~ aircraft + no_pasg + speed_ground + height +
    pitch + duration_corrected,
  family = "binomial",
  data = faa
)
summary(full_model)
##
## Call:
## glm(formula = risky.landing ~ aircraft + no_pasg + speed_ground +
## height + pitch + duration_corrected, family = "binomial",
## data = faa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.46375 -0.00009 0.00000 0.00000 1.85765
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.035e+02 2.801e+01 -3.697 0.000218 ***
## aircraftboeing 4.457e+00 1.547e+00 2.881 0.003970 **
## no_pasg -8.620e-02 6.035e-02 -1.428 0.153201
## speed_ground 9.488e-01 2.465e-01 3.848 0.000119 ***
## height 4.310e-02 4.624e-02 0.932 0.351382
## pitch 6.139e-01 7.982e-01 0.769 0.441878
## duration_corrected 7.952e-04 1.224e-02 0.065 0.948209
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 36.473 on 824 degrees of freedom
## AIC: 50.473
##
## Number of Fisher Scoring iterations: 12
The results are partly inconsistent with our earlier observation. Forward selection by AIC adds the number of passengers to the model, but its p-value (0.14) shows that it is not statistically significant.
# Forward stepwise selection by AIC for risky.landing, growing from the
# intercept-only model towards the six-predictor model.
faa_clean2 <- faa %>%
  select(-c(long.landing, speed_air))
model_null2 <- glm(risky.landing ~ 1, family = "binomial", data = faa_clean2)
model_full2 <- glm(
  risky.landing ~ speed_ground + aircraft + height + no_pasg + duration_corrected + pitch,
  family = "binomial",
  data = faa_clean2
)
model_3 <- MASS::stepAIC(
  model_null2,
  scope = list(lower = model_null2, upper = model_full2),
  direction = "forward"
)
## Start: AIC=438.04
## risky.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 58.93 62.93
## + aircraft 1 422.74 426.74
## + pitch 1 433.89 437.89
## + no_pasg 1 434.00 438.00
## <none> 436.04 438.04
## + duration_corrected 1 435.86 439.86
## + height 1 436.02 440.02
##
## Step: AIC=62.93
## risky.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 40.097 46.097
## + pitch 1 53.079 59.079
## <none> 58.931 62.931
## + no_pasg 1 58.318 64.318
## + height 1 58.667 64.667
## + duration_corrected 1 58.883 64.883
##
## Step: AIC=46.1
## risky.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## + no_pasg 1 37.707 45.707
## <none> 40.097 46.097
## + height 1 39.402 47.402
## + duration_corrected 1 39.884 47.884
## + pitch 1 39.928 47.928
##
## Step: AIC=45.71
## risky.landing ~ speed_ground + aircraft + no_pasg
##
## Df Deviance AIC
## <none> 37.707 45.707
## + height 1 37.099 47.099
## + pitch 1 37.449 47.449
## + duration_corrected 1 37.693 47.693
# Coefficients and fit statistics of the AIC-selected risky-landing model.
summary(model_3)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft + no_pasg,
## family = "binomial", data = faa_clean2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.33913 -0.00009 0.00000 0.00000 1.87810
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -99.90780 25.57993 -3.906 9.39e-05 ***
## speed_ground 0.94963 0.23559 4.031 5.56e-05 ***
## aircraftboeing 4.64188 1.47520 3.147 0.00165 **
## no_pasg -0.08462 0.05732 -1.476 0.13987
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 37.707 on 827 degrees of freedom
## AIC: 45.707
##
## Number of Fisher Scoring iterations: 12
The number of passengers, which was selected when using AIC for variable selection, is not selected when using BIC. This is probably because BIC penalizes additional parameters more strongly.
# Same forward search for risky.landing with penalty k = log(n) (BIC).
model_4 <- MASS::stepAIC(
  model_null2,
  scope = list(lower = model_null2, upper = model_full2),
  direction = "forward",
  k = log(nrow(faa_clean2))
)
## Start: AIC=442.77
## risky.landing ~ 1
##
## Df Deviance AIC
## + speed_ground 1 58.93 72.38
## + aircraft 1 422.74 436.18
## <none> 436.04 442.77
## + pitch 1 433.89 447.34
## + no_pasg 1 434.00 447.45
## + duration_corrected 1 435.86 449.31
## + height 1 436.02 449.46
##
## Step: AIC=72.38
## risky.landing ~ speed_ground
##
## Df Deviance AIC
## + aircraft 1 40.097 60.264
## <none> 58.931 72.376
## + pitch 1 53.079 73.247
## + no_pasg 1 58.318 78.486
## + height 1 58.667 78.835
## + duration_corrected 1 58.883 79.051
##
## Step: AIC=60.26
## risky.landing ~ speed_ground + aircraft
##
## Df Deviance AIC
## <none> 40.097 60.264
## + no_pasg 1 37.707 64.597
## + height 1 39.402 66.292
## + duration_corrected 1 39.884 66.775
## + pitch 1 39.928 66.819
# Coefficients and fit statistics of the BIC-selected risky-landing model.
summary(model_4)
##
## Call:
## glm(formula = risky.landing ~ speed_ground + aircraft, family = "binomial",
## data = faa_clean2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.24398 -0.00011 0.00000 0.00000 1.61021
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -102.0772 24.7751 -4.120 3.79e-05 ***
## speed_ground 0.9263 0.2248 4.121 3.78e-05 ***
## aircraftboeing 4.0190 1.2494 3.217 0.0013 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 436.043 on 830 degrees of freedom
## Residual deviance: 40.097 on 828 degrees of freedom
## AIC: 46.097
##
## Number of Fisher Scoring iterations: 12
Step 11: Conclusions for both the type of landings -
Speed of ground, height and type of aircraft are significant predictors of long landing.
Only speed of ground and type of aircraft are significant predictors of risky landing and height of aircraft seems to be not playing a role when landing is risky.
For the BIC-selected models, BIC is 83.937 for long landing and 60.264 for risky landing (their AIC values are 65.047 and 46.097).
For the AIC-selected models, AIC is 63.204 for long landing and 45.707 for risky landing.
Step 12: ROC Curves
Plot for risky landing is marginally smoother than that for long landing.
# Per-response views of the data: each drops the other response and speed_air.
data1 <- select(faa, -risky.landing, -speed_air)
data2 <- select(faa, -long.landing, -speed_air)

# ROC curve of the long-landing model, scored on the link scale.
pred1 <- prediction(predict(model_2), data1$long.landing)
roc1 <- performance(pred1, "tpr", "fpr")
# `main` is a plotting argument, so the title is given to the plot() call that
# opens the figure rather than to performance() or to the add = TRUE overlay.
plot(roc1, main = "ROC for long landing and risky landing")

# Overlay the ROC curve of the risky-landing model, coloured by cutoff.
pred2 <- prediction(predict(model_4), data2$risky.landing)
roc2 <- performance(pred2, "tpr", "fpr")
plot(roc2, add = TRUE, colorize = TRUE)
The AUC in the case of long landing is 99.8% and that for risky landing is 99.9%.
# AUC of the long-landing ROC curve
auc_ROCR1 <- performance(pred1, measure = "auc")
slot(auc_ROCR1, "y.values")[[1]]
## [1] 0.998333
# AUC of the risky-landing ROC curve
auc_ROCR2 <- performance(pred2, measure = "auc")
slot(auc_ROCR2, "y.values")[[1]]
## [1] 0.9986161
Step 13: Prediction
Given few parameters,
The long landing probability -
# Hypothetical flight to score with the long-landing model.
new.ind <- data.frame(
  aircraft = "boeing",
  duration_corrected = 200,
  no_pasg = 80,
  speed_ground = 115,
  speed_air = 120,
  height = 40,
  pitch = 4
)
# Predict on the link (log-odds) scale with standard errors, build the
# +/- 1.96 SE interval there, then map all three values to probabilities with
# the inverse logit. `se.fit = TRUE` is spelled out instead of `se = T`, which
# relied on partial argument matching and on the reassignable `T`.
pred1 <- predict(model_2, newdata = new.ind, type = "link", se.fit = TRUE)
fit <- ilogit(pred1$fit)
upper <- ilogit(pred1$fit + (1.96 * pred1$se.fit))
lower <- ilogit(pred1$fit - (1.96 * pred1$se.fit))
cat("The confidence interval for long landing-",lower,"||", fit, "||", upper)
## The confidence interval for long landing- 0.999985 || 1 || 1
The risky landing probability -
# Hypothetical flight to score with the risky-landing model.
new.ind <- data.frame(
  aircraft = "boeing",
  duration_corrected = 200,
  no_pasg = 80,
  speed_ground = 115,
  speed_air = 120,
  height = 40,
  pitch = 4
)
# Predict on the link (log-odds) scale with standard errors, build the
# +/- 1.96 SE interval there, then map all three values to probabilities with
# the inverse logit. `se.fit = TRUE` is spelled out instead of `se = T`, which
# relied on partial argument matching and on the reassignable `T`.
pred1 <- predict(model_4, newdata = new.ind, type = "link", se.fit = TRUE)
fit <- ilogit(pred1$fit)
upper <- ilogit(pred1$fit + (1.96 * pred1$se.fit))
lower <- ilogit(pred1$fit - (1.96 * pred1$se.fit))
cat("The confidence interval for risky landing-",lower,"||", fit, "||", upper)
## The confidence interval for risky landing- 0.9874843 || 0.999789 || 0.9999965
Step 14: Comparing models with different link functions
The coefficients are greater for logit model
The std. error is minimum for probit model
The slope direction is same for all the three models
# Refit the risky-landing model with probit and complementary log-log links.
probit <- glm(
  risky.landing ~ aircraft + speed_ground,
  family = binomial(link = "probit"),
  data = data2
)
cloglog <- glm(
  risky.landing ~ aircraft + speed_ground,
  family = binomial(link = "cloglog"),
  data = data2
)
# Coefficient matrix of model_4 for comparison, rounded to 3 decimals.
round(summary(model_4)$coefficients, digits = 3)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -102.077 24.775 -4.120 0.000
## speed_ground 0.926 0.225 4.121 0.000
## aircraftboeing 4.019 1.249 3.217 0.001
# Probit fit: coefficient matrix rounded to 3 decimals.
round(summary(probit)$coefficients,3)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -58.693 13.313 -4.409 0.000
## aircraftboeing 2.357 0.702 3.359 0.001
## speed_ground 0.532 0.121 4.411 0.000
# Cloglog fit: coefficient matrix rounded to 3 decimals.
round(summary(cloglog)$coefficients,3)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -69.265 14.740 -4.699 0
## aircraftboeing 2.898 0.800 3.622 0
## speed_ground 0.622 0.133 4.690 0
Step 15: ROC curves for the three models
# One ROC panel per link function, side by side.
par(mfrow = c(1, 3))

# Logit (model_4)
pred1 <- prediction(predict(model_4), data2$risky.landing)
perf <- performance(pred1, measure = "tpr", x.measure = "fpr")
plot(perf, main = "Logit Model")

# Probit
pred2 <- prediction(predict(probit), data2$risky.landing)
perf <- performance(pred2, measure = "tpr", x.measure = "fpr")
plot(perf, main = "Probit Model")

# Complementary log-log
pred3 <- prediction(predict(cloglog), data2$risky.landing)
perf <- performance(pred3, measure = "tpr", x.measure = "fpr")
plot(perf, main = "Cloglog Model")
The AUC is essentially the same for all three models, about 99.9%.
# Area under the ROC curve for each link, extracted as a plain number.
auc_logit <- performance(pred1, measure = "auc")@y.values[[1]]
auc_probit <- performance(pred2, measure = "auc")@y.values[[1]]
auc_cloglog <- performance(pred3, measure = "auc")@y.values[[1]]
cat(
  "AUC for logit is:", auc_logit,
  " AUC for Probit is: ", auc_probit,
  " AUC for Loglog is: ", auc_cloglog
)
## AUC for logit is: 0.9986161 AUC for Probit is: 0.9986161 AUC for Loglog is: 0.9985736
Step 16:
The flight in observation 64 appears among the top predictions of all three models and is risky.
The flights in observations 307 and 176 appear in two of the models and also have risky landings.
# Fitted risky-landing probabilities under each link.
pred_logit <- predict(model_4, type = "response")
pred_probit <- predict(probit, type = "response")
pred_cloglog <- predict(cloglog, type = "response")
# Six largest fitted probabilities for the logit model (`TRUE` spelled out;
# `T` can be reassigned).
# NOTE(review): if several fitted values are tied at the maximum, which six
# are shown depends on how sort() orders ties - confirm before interpreting.
head(sort(pred_logit, decreasing = TRUE))
## 362 307 64 387 408 176
## 1 1 1 1 1 1
# Six largest fitted probabilities for the probit model (`TRUE` spelled out;
# `T` can be reassigned).
head(sort(pred_probit, decreasing = TRUE))
## 56 64 134 176 179 307
## 1 1 1 1 1 1
# Six largest fitted probabilities for the cloglog model (`TRUE` spelled out;
# `T` can be reassigned).
head(sort(pred_cloglog, decreasing = TRUE))
## 19 29 30 56 64 90
## 1 1 1 1 1 1
Step 17:
The confidence interval for the probability of risky landing is asymmetric around the point estimate for all three models.
# Prints whatever lower/fit/upper currently hold.
# NOTE(review): this relies on them still containing the model_4 (logit)
# interval from the risky-landing prediction step - confirm nothing in
# between has overwritten them.
cat("The confidence interval for logit:",lower,"||", fit, "||", upper)
## The confidence interval for logit: 0.9874843 || 0.999789 || 0.9999965
# Interval for the probit model: predict on the link scale, then map back
# with the model's own inverse link. ilogit() is only the inverse of the
# logit link, so using it here would return the wrong probabilities.
pred2 <- predict(probit, new.ind, type = "link", se.fit = TRUE)
fit <- probit$family$linkinv(pred2$fit)
upper <- probit$family$linkinv(pred2$fit + (1.96 * pred2$se.fit))
lower <- probit$family$linkinv(pred2$fit - (1.96 * pred2$se.fit))
cat("The confidence interval for probit:",lower,"||", fit, "||", upper)
## The confidence interval for probit: 0.9347085 || 0.9924005 || 0.9991612
# Interval for the cloglog model: predict on the link scale, then map back
# with the model's own inverse link. ilogit() is only the inverse of the
# logit link, so using it here would return the wrong probabilities.
pred3 <- predict(cloglog, new.ind, type = "link", se.fit = TRUE)
fit <- cloglog$family$linkinv(pred3$fit)
upper <- cloglog$family$linkinv(pred3$fit + (1.96 * pred3$se.fit))
lower <- cloglog$family$linkinv(pred3$fit - (1.96 * pred3$se.fit))
cat("The confidence interval for cloglog:",lower,"||", fit, "||", upper)
## The confidence interval for cloglog: 0.9463195 || 0.9943444 || 0.99943