FAA1 <- read.csv("FAA1.csv", header = TRUE)
FAA2 <- read.csv("FAA2.csv", header = TRUE)
FAA2 <- FAA2[-c(151:200), ]
FAA <- merge(FAA1, FAA2, by = c("aircraft","no_pasg", "speed_ground","speed_air","height", "pitch", "distance"), all = TRUE)
FAA$aircraft_num <- ifelse(FAA$aircraft == "airbus", 1, 0)
FAA<- subset(FAA, duration > 40)
#remove speed_ground that's less than 30 or greater than 140
FAA<- subset(FAA, speed_ground >=30 & speed_ground <=140)
#remove speed_air that's less than 30 or greater than 140
FAA<- subset(FAA, speed_air >=30 & speed_ground <=140)
#remove height that's less than 6 meters
FAA<- subset(FAA, height >=6)
#remove distance that's over 6000 feet
FAA <- subset(FAA, distance < 6000)
#export the wrangled data to csv for
write.csv(FAA, "FAA_wrangled.csv", row.names = FALSE)
#create additional variables
FAA$long.landing <- ifelse(FAA$distance > 2500, 1, 0)
FAA$risky.landing <- ifelse(FAA$distance > 3000, 1, 0)
#discard variable distance
FAA <- FAA[, -7]
attach(FAA)
library(tidyverse)
library(ggplot2)
#fit the model with probit link
mod_risky_probit <- glm(risky.landing ~ speed_air + aircraft_num,
family = binomial (link = probit), data = FAA)
summary(mod_risky_probit)
Call:
glm(formula = risky.landing ~ speed_air + aircraft_num, family = binomial(link = probit),
data = FAA)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -71.5094 16.2455 -4.402 1.07e-05 ***
speed_air 0.6767 0.1538 4.400 1.08e-05 ***
aircraft_num -2.6410 0.7940 -3.326 0.00088 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 240.724 on 194 degrees of freedom
Residual deviance: 26.133 on 192 degrees of freedom
AIC: 32.133
Number of Fisher Scoring iterations: 11
#fit the model with clog-log link
mod_risky_cloglog <- glm(risky.landing ~ speed_air + aircraft_num,
family = binomial (link = cloglog), data = FAA)
summary(mod_risky_cloglog)
Call:
glm(formula = risky.landing ~ speed_air + aircraft_num, family = binomial(link = cloglog),
data = FAA)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -99.9646 24.9309 -4.010 6.08e-05 ***
speed_air 0.9380 0.2338 4.012 6.03e-05 ***
aircraft_num -3.2427 0.9996 -3.244 0.00118 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 240.724 on 194 degrees of freedom
Residual deviance: 24.333 on 192 degrees of freedom
AIC: 30.333
Number of Fisher Scoring iterations: 12
#logistic model from previous analysis
mod_risky_0 <- glm(risky.landing ~ speed_air + aircraft_num,
family = binomial, data = FAA)
summary(mod_risky_0)
Call:
glm(formula = risky.landing ~ speed_air + aircraft_num, family = binomial,
data = FAA)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -129.1675 32.4018 -3.986 6.71e-05 ***
speed_air 1.2206 0.3064 3.984 6.78e-05 ***
aircraft_num -4.5499 1.5124 -3.008 0.00263 **
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 240.724 on 194 degrees of freedom
Residual deviance: 26.281 on 192 degrees of freedom
AIC: 32.281
Number of Fisher Scoring iterations: 9
#compare the coefficients
round(coef(mod_risky_0), 3)
(Intercept) speed_air aircraft_num
-129.168 1.221 -4.550
round(coef(mod_risky_probit), 3)
(Intercept) speed_air aircraft_num
-71.509 0.677 -2.641
round(coef(mod_risky_cloglog), 3)
(Intercept) speed_air aircraft_num
-99.965 0.938 -3.243
#compare the predicted values
predval <- sapply(list(mod_risky_0, mod_risky_probit, mod_risky_cloglog), fitted)
colnames(predval) <- c("Logit", "Probit", "Cloglog")
round(predval[1:10,], 3)
Logit Probit Cloglog
4 0.000 0.000 0.000
6 0.000 0.000 0.000
15 0.000 0.000 0.000
22 0.001 0.000 0.002
23 0.502 0.493 0.475
24 1.000 1.000 1.000
29 0.592 0.573 0.574
30 0.962 0.961 1.000
31 1.000 1.000 1.000
51 0.777 0.748 0.811
round(predval[fitted(mod_risky_0) > 0.5,] ,3)
Logit Probit Cloglog
23 0.502 0.493 0.475
24 1.000 1.000 1.000
29 0.592 0.573 0.574
30 0.962 0.961 1.000
31 1.000 1.000 1.000
51 0.777 0.748 0.811
63 0.898 0.882 0.967
117 0.999 1.000 1.000
179 0.974 0.977 1.000
228 0.645 0.621 0.637
229 1.000 1.000 1.000
254 0.993 0.997 1.000
255 1.000 1.000 1.000
279 1.000 1.000 1.000
344 0.996 0.999 1.000
362 1.000 1.000 1.000
363 1.000 1.000 1.000
376 0.988 0.992 1.000
441 1.000 1.000 1.000
463 0.904 0.909 0.937
464 0.997 1.000 1.000
465 0.999 1.000 1.000
477 1.000 1.000 1.000
487 0.888 0.894 0.913
518 0.992 0.997 1.000
519 1.000 1.000 1.000
520 1.000 1.000 1.000
521 1.000 1.000 1.000
530 0.986 0.993 1.000
561 0.997 1.000 1.000
562 1.000 1.000 1.000
577 0.992 0.997 1.000
578 1.000 1.000 1.000
579 1.000 1.000 1.000
607 0.963 0.972 0.998
608 0.987 0.994 1.000
609 1.000 1.000 1.000
610 1.000 1.000 1.000
628 0.924 0.931 0.967
630 1.000 1.000 1.000
631 1.000 1.000 1.000
632 1.000 1.000 1.000
655 0.995 0.999 1.000
656 1.000 1.000 1.000
657 1.000 1.000 1.000
681 0.878 0.884 0.897
684 0.528 0.563 0.418
685 1.000 1.000 1.000
686 1.000 1.000 1.000
699 0.997 1.000 1.000
757 1.000 1.000 1.000
773 0.947 0.955 0.989
774 0.957 0.965 0.995
775 1.000 1.000 1.000
787 1.000 1.000 1.000
788 1.000 1.000 1.000
800 1.000 1.000 1.000
821 0.988 0.995 1.000
832 1.000 1.000 1.000
840 0.753 0.762 0.689
848 0.960 0.969 0.997
#plot the predicted probability for each model
FAA$logit_pred <- predict(mod_risky_0, type = "response")
FAA$probit_pred <- predict(mod_risky_probit, type = "response")
FAA$cloglog_pred <- predict(mod_risky_cloglog, type = "response")
ggplot(FAA, aes(x = speed_air)) +
geom_point(aes(y = risky.landing), alpha = 0.3) +
geom_smooth(aes(y = logit_pred, color = "Logit"), method = "loess") +
geom_smooth(aes(y = probit_pred, color = "Probit"), method = "loess") +
geom_smooth(aes(y = cloglog_pred, color = "Cloglog"), method = "loess") +
labs(title = "Comparison of Logistic, Probit, and Cloglog Models",
x = "Speed Air",
y = "Predicted Probability of Risky Landing") +
scale_color_manual(values = c("Logit" = "blue", "Probit" = "green", "Cloglog" = "red")) +
theme_minimal()
Based on the analysis, the cloglog model has a lower AIC score of 30.33. The logit and cloglog models give similar results, but probit is different, the coefficients are almost half of logit model. Based on the plot, we can see that the probability distribution is not symmetric, indicates that the cloglog is probably a better fit.
library(pROC)
roc_logit <- roc(FAA$risky.landing, FAA$logit_pred)
roc_probit <- roc(FAA$risky.landing, FAA$probit_pred)
roc_cloglog <- roc(FAA$risky.landing, FAA$cloglog_pred)
# Plot the ROC curves
plot(roc_logit, col = "blue", lwd = 2, main = "ROC Curves for Logit, Probit, and Cloglog Models", xlab = "1 - Specificity")
plot(roc_probit, col = "green", lwd = 2, add = TRUE)
plot(roc_cloglog, col = "red", lwd = 2, add = TRUE)
# Add legend
legend("bottomright", legend = c("Logit", "Probit", "Cloglog"),
col = c("blue", "green", "red"), lwd = 2)
auc(roc_logit)
Area under the curve: 0.9962
auc(roc_probit)
Area under the curve: 0.996
auc(roc_cloglog)
Area under the curve: 0.9962
We can see based on the ROC curve, the logit curve and cloglog has the same AUC score, and the line is basically overlaid. Those two models have the similar results.
#create an identifier for each flight
FAA$flight_id <- 1:nrow(FAA)
#sort flight based on the predicted probability
top5_logit <- FAA[order(-FAA$logit_pred), ][1:5, c("flight_id", "logit_pred")]
top5_probit <- FAA[order(-FAA$probit_pred), ][1:5, c("flight_id", "probit_pred")]
top5_cloglog <- FAA[order(-FAA$cloglog_pred), ][1:5, c("flight_id", "cloglog_pred")]
top5_summary <- data.frame(
Rank = 1:5,
Logit = paste("ID:", top5_logit$flight_id, "| Risk:", round(top5_logit$logit_pred, 3)),
Probit = paste("ID:", top5_probit$flight_id, "| Risk:", round(top5_probit$probit_pred, 3)),
Cloglog = paste("ID:", top5_cloglog$flight_id, "| Risk:", round(top5_cloglog$cloglog_pred, 3))
)
# Print top 5 risky landings
print(top5_summary, row.names = FALSE)
Each model shows different results as what flights are the riskest. However, flight 35 consistently showed on all three models, indicating that the flight is risky across different assumptions about the data. Flight 9 also showed up on probit and cloglog, also indicates high chance of being a risky flight.
new_airplane <- data.frame(speed_air = 120,aircraft_num = 0,height = 40,pitch = 4,no_pasg = 80,duration = 200,speed_ground = 115)
# Predict the probability for risky landing probit link
risky_landing_pred_probit <- predict(mod_risky_probit, new_airplane, type = "response", se.fit = TRUE)
# Calculate the 95% confidence intervals for risky landing
risky_landing_probit_ci_lower <- risky_landing_pred_probit$fit - 1.96 * risky_landing_pred_probit$se.fit
risky_landing_probit_ci_upper <- risky_landing_pred_probit$fit + 1.96 * risky_landing_pred_probit$se.fit
cat("\nPredicted probability for Risky Landing according to probit link:\n")
Predicted probability for Risky Landing according to probit link:
cat("Probability: ", risky_landing_pred_probit$fit, "\n")
Probability: 1
cat("95% CI: [", risky_landing_probit_ci_lower, ", ", risky_landing_probit_ci_upper, "]\n")
95% CI: [ 1 , 1 ]
# Predict the probability for risky landing clogclog link
risky_landing_pred_cloglog <- predict(mod_risky_cloglog, new_airplane, type = "response", se.fit = TRUE)
# Calculate the 95% confidence intervals for risky landing
risky_landing_cloglog_ci_lower <- risky_landing_pred_cloglog$fit - 1.96 * risky_landing_pred_cloglog$se.fit
risky_landing_cloglog_ci_upper <- risky_landing_pred_cloglog$fit + 1.96 * risky_landing_pred_cloglog$se.fit
cat("\nPredicted probability for Risky Landing according to probit link:\n")
Predicted probability for Risky Landing according to probit link:
cat("Probability: ", risky_landing_pred_cloglog$fit, "\n")
Probability: 1
cat("95% CI: [", risky_landing_cloglog_ci_lower, ", ", risky_landing_cloglog_ci_upper, "]\n")
95% CI: [ 1 , 1 ]
Based on the prediction, the probability for this new airplan to have a risky or long landing is both 100%, it’s even more accurate then the logit model we analyzed in step 13.