To reduce the risk of landing overrun.
Use various GLM (generalized linear models) to study what factors and how they would impact the landing distance of a commercial flight.
Landing data (landing distance and other parameters) from 950 commercial flights.
library(readxl)
library(dplyr)
library(GGally)
library(tidyverse)
library(data.table)
library(ggplot2)
library(knitr)
library(stringr)
library(ROCR)
library(faraway)
library(DT)
library(nnet)
library(knitr)
# reading input data
df1 <- read_excel("data/FAA1.xls", sheet = "FAA1")
df2 <- read_excel("data/FAA2.xls", sheet = "FAA2")
# structure of data
str(df1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 800 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# structure of data
str(df2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 150 obs. of 7 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# checking how many rows are duplicated
FAA <- bind_rows(df1,df2)
FAA %>%
select(-duration) %>%
duplicated() %>%
sum() %>%
print()
## [1] 100
# removing duplicated rows
index <-
FAA %>%
select(-duration) %>%
duplicated() %>%
which()
FAA <- FAA[-index,]
# structure of data
str(FAA)
## Classes 'tbl_df', 'tbl' and 'data.frame': 850 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# summary statistics
summary(select(FAA, -aircraft))
## duration no_pasg speed_ground speed_air
## Min. : 14.76 Min. :29.0 Min. : 27.74 Min. : 90.00
## 1st Qu.:119.49 1st Qu.:55.0 1st Qu.: 65.90 1st Qu.: 96.25
## Median :153.95 Median :60.0 Median : 79.64 Median :101.15
## Mean :154.01 Mean :60.1 Mean : 79.45 Mean :103.80
## 3rd Qu.:188.91 3rd Qu.:65.0 3rd Qu.: 92.06 3rd Qu.:109.40
## Max. :305.62 Max. :87.0 Max. :141.22 Max. :141.72
## NA's :50 NA's :642
## height pitch distance
## Min. :-3.546 Min. :2.284 Min. : 34.08
## 1st Qu.:23.314 1st Qu.:3.642 1st Qu.: 883.79
## Median :30.093 Median :4.008 Median :1258.09
## Mean :30.144 Mean :4.009 Mean :1526.02
## 3rd Qu.:36.993 3rd Qu.:4.377 3rd Qu.:1936.95
## Max. :59.946 Max. :5.927 Max. :6533.05
##
# distribution of aircraft variable
table(FAA$aircraft)
##
## airbus boeing
## 450 400
# removing abnormal values
FAA1<-
FAA %>%
filter((is.na(duration) | duration >40),
(speed_ground >=30 | speed_ground <=140),
(is.na(speed_air) | speed_air >=30 | speed_air <=140),
height >=6, distance < 6000)
print(nrow(FAA) - nrow(FAA1))
## [1] 17
# structure of data
str(FAA1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 833 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# summary statistics
summary(select(FAA1, -aircraft))
## duration no_pasg speed_ground speed_air
## Min. : 41.95 Min. :29.00 Min. : 27.74 Min. : 90.00
## 1st Qu.:119.67 1st Qu.:55.00 1st Qu.: 66.08 1st Qu.: 96.23
## Median :154.28 Median :60.00 Median : 79.75 Median :101.12
## Mean :154.83 Mean :60.04 Mean : 79.42 Mean :103.48
## 3rd Qu.:189.75 3rd Qu.:65.00 3rd Qu.: 91.87 3rd Qu.:109.36
## Max. :305.62 Max. :87.00 Max. :132.78 Max. :132.91
## NA's :50 NA's :630
## height pitch distance
## Min. : 6.228 Min. :2.284 Min. : 41.72
## 1st Qu.:23.530 1st Qu.:3.641 1st Qu.: 893.58
## Median :30.159 Median :4.004 Median :1262.15
## Mean :30.442 Mean :4.006 Mean :1521.71
## 3rd Qu.:36.995 3rd Qu.:4.371 3rd Qu.:1936.01
## Max. :59.946 Max. :5.927 Max. :5381.96
##
# distribution of aircraft variable
table(FAA1$aircraft)
##
## airbus boeing
## 444 389
# pairwise plots
ggpairs(FAA1)
# calculating average pitch by aircraft
tapply(FAA1$pitch, FAA1$aircraft, mean)
## airbus boeing
## 3.831139 4.205725
# calculating average distance by aircraft
tapply(FAA1$distance, FAA1$aircraft, mean)
## airbus boeing
## 1323.317 1748.152
# calculating magnitude and sign of correlation of all variables with distance
table1 <-
cor(select(FAA1, -aircraft), use="complete.obs") %>%
as.data.frame() %>%
mutate(variable = rownames(.)) %>%
select(variable, correlation = distance) %>%
filter(variable != "distance") %>%
mutate(sign = ifelse(correlation > 0, "positive", "negative"),
correlation = abs(correlation)) %>%
arrange(desc(correlation))
table1
## variable correlation sign
## 1 speed_air 0.94321897 positive
## 2 speed_ground 0.92877195 positive
## 3 height 0.05775639 positive
## 4 duration 0.05241698 positive
## 5 pitch 0.03402263 positive
## 6 no_pasg 0.03258255 negative
# pairwise plots
ggpairs(FAA1)
# calculating magnitude and sign of correlation of all variables with distance for aircraft = airbus
corr_airbus <-
cor(select(filter(FAA1, aircraft == "airbus"), -aircraft)
, use="complete.obs") %>%
as.data.frame() %>%
mutate(variable = rownames(.)) %>%
select(variable, correlation = distance) %>%
filter(variable != "distance") %>%
mutate(sign = ifelse(correlation > 0, "positive", "negative"),
correlation = abs(correlation),
make = "airbus") %>%
arrange(desc(correlation)) %>%
select(make, everything())
# calculating magnitude and sign of correlation of all variables with distance for aircraft = boeing
corr_boeing <-
cor(select(filter(FAA1, aircraft == "boeing"), -aircraft)
, use="complete.obs") %>%
as.data.frame() %>%
mutate(variable = rownames(.)) %>%
select(variable, correlation = distance) %>%
filter(variable != "distance") %>%
mutate(sign = ifelse(correlation > 0, "positive", "negative"),
correlation = abs(correlation),
make="boeing") %>%
arrange(desc(correlation)) %>%
select(make, everything())
# appending the above two tables
bind_rows(corr_airbus,corr_boeing)
## make variable correlation sign
## 1 airbus speed_air 0.96526576 positive
## 2 airbus speed_ground 0.93993656 positive
## 3 airbus height 0.16388434 positive
## 4 airbus no_pasg 0.09282119 negative
## 5 airbus pitch 0.07153580 positive
## 6 airbus duration 0.04167093 positive
## 7 boeing speed_air 0.97759838 positive
## 8 boeing speed_ground 0.96809860 positive
## 9 boeing pitch 0.08529579 negative
## 10 boeing duration 0.07121179 positive
## 11 boeing height 0.01637134 positive
## 12 boeing no_pasg 0.01578596 positive
# Regressing distance on all other variables
model <- lm(distance ~ ., data=FAA1)
# calculating the p-value and sign of correlation of all variables in the above regression model
table2 <-
summary(model)$coefficients[-1,c(1,4)] %>%
as.data.frame() %>%
mutate(variable = rownames(.),
direction_coeff = ifelse(Estimate >0, "positive", "negative")) %>%
select(variable, 'Pr(>|t|)', direction_coeff) %>%
arrange(.[,2])
table2
## variable Pr(>|t|) direction_coeff
## 1 aircraftboeing 5.616468e-50 positive
## 2 height 1.907468e-28 positive
## 3 speed_air 2.714575e-28 positive
## 4 no_pasg 1.521782e-01 negative
## 5 pitch 4.693869e-01 negative
## 6 duration 5.321979e-01 positive
## 7 speed_ground 5.811038e-01 negative
# normalizing numerical variables to have mean=0 and sd=1
FAA_normalized <-
apply(FAA1[,-1], 2, function(x) (x-mean(x, na.rm = TRUE))/sd(x, na.rm = TRUE)) %>%
as.data.frame() %>%
cbind(FAA1[,1])
# Regressing distance on all normalized variables
model <- lm(distance ~ ., data=FAA_normalized)
# calculating the coefficient and sign of coefficient in the above regression model
table3 <-
summary(model)$coefficients[-1,c(1), drop=FALSE] %>%
as.data.frame() %>%
mutate(variable = rownames(.),
direction_coeff = ifelse(Estimate >0, "positive", "negative"),
Estimate = abs(Estimate)) %>%
select(variable, coefficient = Estimate, direction_coeff) %>%
arrange(desc(coefficient))
table3
## variable coefficient direction_coeff
## 1 speed_air 0.930187613 positive
## 2 aircraftboeing 0.489092199 positive
## 3 height 0.149343800 positive
## 4 speed_ground 0.074773453 negative
## 5 no_pasg 0.016590213 negative
## 6 pitch 0.007928255 negative
## 7 duration 0.006889602 positive
We can see that table2 and table3 agree with each other. However, results from table1 are not consistent with the other two:
table1
## variable correlation sign
## 1 speed_air 0.94321897 positive
## 2 speed_ground 0.92877195 positive
## 3 height 0.05775639 positive
## 4 duration 0.05241698 positive
## 5 pitch 0.03402263 positive
## 6 no_pasg 0.03258255 negative
table2
## variable Pr(>|t|) direction_coeff
## 1 aircraftboeing 5.616468e-50 positive
## 2 height 1.907468e-28 positive
## 3 speed_air 2.714575e-28 positive
## 4 no_pasg 1.521782e-01 negative
## 5 pitch 4.693869e-01 negative
## 6 duration 5.321979e-01 positive
## 7 speed_ground 5.811038e-01 negative
table3
## variable coefficient direction_coeff
## 1 speed_air 0.930187613 positive
## 2 aircraftboeing 0.489092199 positive
## 3 height 0.149343800 positive
## 4 speed_ground 0.074773453 negative
## 5 no_pasg 0.016590213 negative
## 6 pitch 0.007928255 negative
## 7 duration 0.006889602 positive
# running 3 separate regression models with different input variables
model1 <- lm(distance ~ speed_ground, data=FAA1)
model2 <- lm(distance ~ speed_air, data=FAA1)
model3 <- lm(distance ~ speed_ground + speed_air, data=FAA1)
summary(model1)
##
## Call:
## lm(formula = distance ~ speed_ground, data = FAA1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -904.18 -319.13 -75.69 213.51 1912.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1720.6284 68.3579 -25.17 <2e-16 ***
## speed_ground 40.8252 0.8374 48.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 456 on 831 degrees of freedom
## Multiple R-squared: 0.7409, Adjusted R-squared: 0.7406
## F-statistic: 2377 on 1 and 831 DF, p-value: < 2.2e-16
summary(model2)
##
## Call:
## lm(formula = distance ~ speed_air, data = FAA1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -776.21 -196.39 8.72 209.17 624.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5455.709 207.547 -26.29 <2e-16 ***
## speed_air 79.532 1.997 39.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.3 on 201 degrees of freedom
## (630 observations deleted due to missingness)
## Multiple R-squared: 0.8875, Adjusted R-squared: 0.887
## F-statistic: 1586 on 1 and 201 DF, p-value: < 2.2e-16
summary(model3)
##
## Call:
## lm(formula = distance ~ speed_ground + speed_air, data = FAA1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -819.74 -202.02 3.52 211.25 636.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5462.28 207.48 -26.327 < 2e-16 ***
## speed_ground -14.37 12.68 -1.133 0.258
## speed_air 93.96 12.89 7.291 6.99e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 200 degrees of freedom
## (630 observations deleted due to missingness)
## Multiple R-squared: 0.8883, Adjusted R-squared: 0.8871
## F-statistic: 795 on 2 and 200 DF, p-value: < 2.2e-16
# correlation between speed_ground and speed_air
cor(FAA1$speed_ground, FAA1$speed_air, use = "complete.obs")
## [1] 0.9879383
# running 6 separate regression models with different input variables
model1 <- lm(distance ~ aircraft, data=FAA1)
model2 <- lm(distance ~ aircraft+height, data=FAA1)
model3 <- lm(distance ~ aircraft+height+speed_air, data=FAA1)
model4 <- lm(distance ~ aircraft+height+speed_air+no_pasg, data=FAA1)
model5 <- lm(distance ~ aircraft+height+speed_air+no_pasg+duration, data=FAA1)
model6 <- lm(distance ~ aircraft+height+speed_air+no_pasg+duration+speed_ground
, data=FAA1)
# a function that takes a regression model (model1, model2 etc.) and criteria string (r square, adjusted r-square, aic etc. ) as input and provides the the criteria value and the number of input variables in the model
n_criteria_value <- function(model, criteria)
{
n = dim(summary(model)$coefficients)[1]-1
if(criteria == "r_squared")
{
criteria_value <- summary(model)$r.squared
} else if(criteria == "adj_r_squared")
{
criteria_value <- summary(model)$adj.r.squared
} else if(criteria == "aic")
{
criteria_value <- AIC(model)
}
return(c(criteria_value, n))
}
# calculating r-square and number of input variables in model1 till model6
r_square <-
rbind(n_criteria_value(model1,"r_squared"),
n_criteria_value(model2,"r_squared"),
n_criteria_value(model3,"r_squared"),
n_criteria_value(model4,"r_squared"),
n_criteria_value(model5,"r_squared"),
n_criteria_value(model6,"r_squared")) %>%
as.data.frame() %>%
select(r_square = V1, n = V2)
r_square
## r_square n
## 1 0.05609854 1
## 2 0.06686502 2
## 3 0.97372092 3
## 4 0.97411172 4
## 5 0.97460905 5
## 6 0.97463989 6
# plotting r-square vs number of input variables
ggplot(data=r_square, aes(x=n, y=r_square)) +
geom_line() +
scale_x_continuous(breaks=1:6)
# calculating adjusted r-square and number of input variables in model1 till model6
adj_r_square <-
rbind(n_criteria_value(model1,"adj_r_squared"),
n_criteria_value(model2,"adj_r_squared"),
n_criteria_value(model3,"adj_r_squared"),
n_criteria_value(model4,"adj_r_squared"),
n_criteria_value(model5,"adj_r_squared"),
n_criteria_value(model6,"adj_r_squared")) %>%
as.data.frame() %>%
select(adj_r_square = V1, n = V2)
adj_r_square
## adj_r_square n
## 1 0.05496268 1
## 2 0.06461650 2
## 3 0.97332476 3
## 4 0.97358872 4
## 5 0.97393733 5
## 6 0.97383052 6
# plotting adjusted r-square vs number of input variables
ggplot(data=adj_r_square, aes(x=n, y=adj_r_square)) +
geom_line() +
scale_x_continuous(breaks=1:6)
# calculating aic and number of input variables in model1 till model6
aic <-
rbind(n_criteria_value(model1,"aic"),
n_criteria_value(model2,"aic"),
n_criteria_value(model3,"aic"),
n_criteria_value(model4,"aic"),
n_criteria_value(model5,"aic"),
n_criteria_value(model6,"aic")) %>%
as.data.frame() %>%
select(aic = V1, n = V2)
aic
## aic n
## 1 13645.148 1
## 2 13637.592 2
## 3 2571.310 3
## 4 2570.268 4
## 5 2471.476 5
## 6 2473.239 6
# plotting aic vs number of input variables
ggplot(data=aic, aes(x=n, y=aic)) +
geom_line() +
scale_x_continuous(breaks=1:6)
This is because post n=3, the adjusted r-square and aic remains almost constant. Thus, there is no use of adding more variables to the model.
library(SignifReg)
# The range of variables that the forward-selection algorithm will examine
scope <- distance ~ aircraft+height+speed_air+no_pasg+duration+speed_ground
# Building the final model using forward-selection algorithm
model <- SignifReg(scope=scope,
data=FAA1,
alpha=0.05,
direction="forward",
criterion="r-adj",
correction="FDR")
summary(model)
##
## Call:
## lm(formula = reg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -300.74 -94.78 15.47 97.09 330.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6390.376 109.839 -58.18 <2e-16 ***
## speed_air 82.148 0.976 84.17 <2e-16 ***
## aircraftboeing 427.442 19.173 22.29 <2e-16 ***
## height 13.702 1.007 13.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 134.2 on 199 degrees of freedom
## (630 observations deleted due to missingness)
## Multiple R-squared: 0.9737, Adjusted R-squared: 0.9733
## F-statistic: 2458 on 3 and 199 DF, p-value: < 2.2e-16
data <- fread("data/FAA_Clean.csv") # reading data
data <-
data %>%
mutate(long.landing = as.factor(ifelse(distance>2500, 1, 0)), # long.landing
risky.landing = as.factor(ifelse(distance>3000, 1, 0)), # risky.landing
aircraft = as.factor(aircraft)) %>%
select(-distance) # discarding distance variable
round(prop.table(table(data$long.landing)),2)
##
## 0 1
## 0.88 0.12
ggplot(data=data, aes(x=long.landing)) +
geom_bar()
table1 <- data.frame()
for(i in 1:7)
{
model <- glm(long.landing~data[,i], data, family=binomial)
magnitude_coeff <- abs(model$coefficients[2])
odds_ratio <- exp(model$coefficients[2])
direction_coeff <- ifelse(sign(model$coefficients[2])>0,"+","-")
p_value <- summary(model)$coefficients[-1,c(1,4)][2]
df <- data.frame(variable = names(data[,i,drop=FALSE]),
magnitude_coeff=magnitude_coeff,
direction_coeff=direction_coeff,
odds_ratio=odds_ratio,
p_value=p_value,
row.names = NULL)
table1 <- rbind(table1, df)
}
table1 <-
table1 %>%
mutate(variable = str_replace(variable, "aircraft", "aircraft_boeing")) %>%
arrange(p_value)
kable(table1)
| variable | magnitude_coeff | direction_coeff | odds_ratio | p_value |
|---|---|---|---|---|
| speed_ground | 0.4723458 | + | 1.6037518 | 0.0000000 |
| speed_air | 0.5123218 | + | 1.6691621 | 0.0000000 |
| aircraft_boeing | 0.8578893 | + | 2.3581781 | 0.0000942 |
| pitch | 0.3972083 | + | 1.4876657 | 0.0486333 |
| height | 0.0088190 | + | 1.0088580 | 0.4115887 |
| duration | 0.0010994 | - | 0.9989012 | 0.6213848 |
| no_pasg | 0.0069205 | - | 0.9931034 | 0.6222435 |
edaPlot <- function(x)
{
ggplot(data, aes(x=x, fill=long.landing)) +
geom_histogram(position = 'dodge', aes(y=..density..))
}
edaPlot(data$speed_ground)
edaPlot(data$speed_air)
edaPlot(data$pitch)
table(data$aircraft, data$long.landing) %>%
prop.table(margin = 1) %>%
round(2)
##
## 0 1
## airbus 0.92 0.08
## boeing 0.83 0.17
clean_df <- select(data, -risky.landing)
full_model <- glm(long.landing ~ .,
data=clean_df,
family=binomial)
summary(full_model)
##
## Call:
## glm(formula = long.landing ~ ., family = binomial, data = clean_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.48853 -0.01367 0.00000 0.00047 1.56917
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.964e+02 5.607e+01 -3.502 0.000462 ***
## aircraftboeing 8.784e+00 2.623e+00 3.349 0.000811 ***
## duration 3.031e-04 1.048e-02 0.029 0.976919
## no_pasg -7.359e-02 7.009e-02 -1.050 0.293744
## speed_ground -2.255e-01 3.845e-01 -0.587 0.557471
## speed_air 1.985e+00 7.080e-01 2.804 0.005051 **
## height 4.226e-01 1.429e-01 2.956 0.003116 **
## pitch 1.469e+00 1.055e+00 1.392 0.163818
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 270.199 on 194 degrees of freedom
## Residual deviance: 32.909 on 187 degrees of freedom
## (638 observations deleted due to missingness)
## AIC: 48.909
##
## Number of Fisher Scoring iterations: 10
clean_df <- na.omit(select(data, -risky.landing, -speed_air))
full_model <- glm(long.landing ~ .,
data=clean_df,
family=binomial)
forward_selection_aic_long <- step(full_model)
## Start: AIC=65.08
## long.landing ~ aircraft + duration + no_pasg + speed_ground +
## height + pitch
##
## Df Deviance AIC
## - no_pasg 1 51.10 63.10
## - duration 1 51.57 63.57
## <none> 51.08 65.08
## - pitch 1 53.67 65.67
## - height 1 72.65 84.65
## - aircraft 1 85.82 97.82
## - speed_ground 1 583.37 595.37
##
## Step: AIC=63.1
## long.landing ~ aircraft + duration + speed_ground + height +
## pitch
##
## Df Deviance AIC
## - duration 1 51.58 61.58
## <none> 51.10 63.10
## - pitch 1 53.68 63.68
## - height 1 73.47 83.47
## - aircraft 1 85.86 95.86
## - speed_ground 1 583.54 593.54
##
## Step: AIC=61.58
## long.landing ~ aircraft + speed_ground + height + pitch
##
## Df Deviance AIC
## <none> 51.58 61.58
## - pitch 1 54.40 62.40
## - height 1 75.18 83.18
## - aircraft 1 86.06 94.06
## - speed_ground 1 583.66 591.66
forward_selection_bic_long <- step(full_model, k=log(nrow(clean_df)))
## Start: AIC=97.73
## long.landing ~ aircraft + duration + no_pasg + speed_ground +
## height + pitch
##
## Df Deviance AIC
## - no_pasg 1 51.10 91.08
## - duration 1 51.57 91.55
## - pitch 1 53.67 93.65
## <none> 51.08 97.73
## - height 1 72.65 112.63
## - aircraft 1 85.82 125.79
## - speed_ground 1 583.37 623.35
##
## Step: AIC=91.08
## long.landing ~ aircraft + duration + speed_ground + height +
## pitch
##
## Df Deviance AIC
## - duration 1 51.58 84.90
## - pitch 1 53.68 87.00
## <none> 51.10 91.08
## - height 1 73.47 106.78
## - aircraft 1 85.86 119.18
## - speed_ground 1 583.54 616.86
##
## Step: AIC=84.9
## long.landing ~ aircraft + speed_ground + height + pitch
##
## Df Deviance AIC
## - pitch 1 54.40 81.05
## <none> 51.58 84.90
## - height 1 75.18 101.83
## - aircraft 1 86.06 112.71
## - speed_ground 1 583.66 610.32
##
## Step: AIC=81.05
## long.landing ~ aircraft + speed_ground + height
##
## Df Deviance AIC
## <none> 54.40 81.05
## - height 1 78.16 98.15
## - aircraft 1 95.06 115.05
## - speed_ground 1 583.73 603.72
Model
summary(forward_selection_bic_long)
##
## Call:
## glm(formula = long.landing ~ aircraft + speed_ground + height,
## family = binomial, data = clean_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.35721 -0.00160 -0.00001 0.00000 2.55053
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -98.86483 18.64743 -5.302 1.15e-07 ***
## aircraftboeing 4.91354 1.10438 4.449 8.62e-06 ***
## speed_ground 0.88939 0.16684 5.331 9.79e-08 ***
## height 0.22063 0.06028 3.660 0.000252 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 598.240 on 782 degrees of freedom
## Residual deviance: 54.401 on 779 degrees of freedom
## AIC: 62.401
##
## Number of Fisher Scoring iterations: 11
Table
kable(table1)
| variable | magnitude_coeff | direction_coeff | odds_ratio | p_value |
|---|---|---|---|---|
| speed_ground | 0.4723458 | + | 1.6037518 | 0.0000000 |
| speed_air | 0.5123218 | + | 1.6691621 | 0.0000000 |
| aircraft_boeing | 0.8578893 | + | 2.3581781 | 0.0000942 |
| pitch | 0.3972083 | + | 1.4876657 | 0.0486333 |
| height | 0.0088190 | + | 1.0088580 | 0.4115887 |
| duration | 0.0010994 | - | 0.9989012 | 0.6213848 |
| no_pasg | 0.0069205 | - | 0.9931034 | 0.6222435 |
Figures and Plots
edaPlot(data$speed_ground)
edaPlot(data$speed_air)
table(data$aircraft, data$long.landing) %>%
prop.table(margin = 1) %>%
round(2)
##
## 0 1
## airbus 0.92 0.08
## boeing 0.83 0.17
Repeating steps 1-9 for landing risky variable
Model
summary(forward_selection_bic_risky)
##
## Call:
## glm(formula = risky.landing ~ aircraft + speed_ground, family = binomial,
## data = clean_df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.22465 -0.00014 0.00000 0.00000 1.60326
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -100.6448 24.8834 -4.045 5.24e-05 ***
## aircraftboeing 3.9763 1.2520 3.176 0.00149 **
## speed_ground 0.9132 0.2258 4.044 5.25e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 423.535 on 782 degrees of freedom
## Residual deviance: 39.955 on 780 degrees of freedom
## AIC: 45.955
##
## Number of Fisher Scoring iterations: 12
Table
kable(table1)
| variable | magnitude_coeff | direction_coeff | odds_ratio | p_value |
|---|---|---|---|---|
| speed_ground | 0.6142187 | + | 1.8482121 | 0.0000001 |
| speed_air | 0.8704019 | + | 2.3878703 | 0.0000037 |
| aircraft_boeing | 0.9959950 | + | 2.7074168 | 0.0004913 |
| pitch | 0.3680104 | + | 1.4448571 | 0.1469328 |
| no_pasg | 0.0250218 | - | 0.9752886 | 0.1590139 |
| duration | 0.0011794 | - | 0.9988213 | 0.6730755 |
| height | 0.0020422 | - | 0.9979599 | 0.8808610 |
Figures and Plots
edaPlot <- function(x)
{
ggplot(data, aes(x=x, fill=risky.landing)) +
geom_histogram(position = 'dodge', aes(y=..density..))
}
edaPlot(data$speed_ground)
Figures and Plots
edaPlot(data$speed_air)
Figures and Plots
table(data$aircraft, data$risky.landing) %>%
prop.table(margin = 1) %>%
round(2)
##
## 0 1
## airbus 0.96 0.04
## boeing 0.89 0.11
data1 <- na.omit(select(data, -risky.landing, -speed_air))
data2 <- na.omit(select(data, -long.landing, -speed_air))
par(mfrow=c(1,2))
pred1 <- prediction(predict(forward_selection_bic_long), data1$long.landing)
perf <- performance(pred1, "tpr", "fpr")
plot(perf, colorize=TRUE)
pred2 <- prediction(predict(forward_selection_bic_risky), data2$risky.landing)
perf <- performance(pred2, "tpr", "fpr")
plot(perf, colorize=TRUE)
# auc1 <- as.numeric(unlist(slot(performance(pred1, "auc"), "y.values")))
# auc2 <- as.numeric(unlist(slot(performance(pred2, "auc"), "y.values")))
## AUC value of 1st Model (Long Landing): 0.9983309
## AUC value of 2nd Model (Risky Landing): 0.9985016
Long Landing - Probability
df <- data.frame(aircraft="boeing",
duration=200,
no_pasg=80,
speed_ground=115,
speed_air=120,
height=40,
pitch=4)
pred1 <- predict(forward_selection_bic_long, df, se.fit = TRUE)
critval <- 1.96 ## approx 95% CI
upr1 <- ilogit(pred1$fit + (critval * pred1$se.fit))
lwr1 <- ilogit(pred1$fit - (critval * pred1$se.fit))
fit1 <- ilogit(pred1$fit)
cat("lower:", lwr1, "| fit:" , fit1, "| upper:", upr1)
## lower: 0.999974 | fit: 1 | upper: 1
Risky Landing - Probability
pred2 <- predict(forward_selection_bic_risky, df, se.fit = TRUE)
critval <- 1.96 ## approx 95% CI
upr2 <- ilogit(pred2$fit + (critval * pred2$se.fit))
lwr2 <- ilogit(pred2$fit - (critval * pred2$se.fit))
fit2 <- ilogit(pred2$fit)
cat("lower:", lwr2, "| fit:", fit2, "| upper:", upr2)
## lower: 0.9856821 | fit: 0.9997625 | upper: 0.9999961
data_risky <- na.omit(select(data, -long.landing, -speed_air))
model_probit <- glm(risky.landing ~ aircraft+speed_ground,
family=binomial (link = "probit"),
data=data_risky)
model_hazard <- glm(risky.landing ~ aircraft+speed_ground,
family=binomial (link = "cloglog"),
data=data_risky)
summary(forward_selection_bic_risky)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -100.6447983 24.8834113 -4.044654 5.240037e-05
## aircraftboeing 3.9763497 1.2520073 3.175980 1.493315e-03
## speed_ground 0.9131614 0.2258071 4.043989 5.254930e-05
summary(model_probit)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -57.9659503 13.4209151 -4.319076 1.566839e-05
## aircraftboeing 2.3378746 0.7039672 3.320999 8.969583e-04
## speed_ground 0.5255633 0.1216794 4.319246 1.565635e-05
summary(model_hazard)$coefficients
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -68.2607539 14.9333418 -4.571030 4.853327e-06
## aircraftboeing 2.8611935 0.8032538 3.562004 3.680345e-04
## speed_ground 0.6129655 0.1345006 4.557345 5.180422e-06
par(mfrow=c(1,3))
pred1 <- prediction(predict(forward_selection_bic_risky), data_risky$risky.landing)
perf <- performance(pred1, "tpr", "fpr")
plot(perf, colorize=TRUE)
pred2 <- prediction(predict(model_probit), data_risky$risky.landing)
perf <- performance(pred2, "tpr", "fpr")
plot(perf, colorize=TRUE)
pred3 <- prediction(predict(model_hazard), data_risky$risky.landing)
perf <- performance(pred3, "tpr", "fpr")
plot(perf, colorize=TRUE)
auc_logit <- as.numeric(unlist(slot(performance(pred1, "auc"), "y.values")))
auc_probit <- as.numeric(unlist(slot(performance(pred2, "auc"), "y.values")))
auc_hazard <- as.numeric(unlist(slot(performance(pred3, "auc"), "y.values")))
cat("auc_logit: ",auc_logit, "| auc_probit: ", auc_probit, "| auc_hazard: ",auc_hazard)
## auc_logit: 0.9985016 | auc_probit: 0.9985016 | auc_hazard: 0.9984555
pred_logit <- predict(forward_selection_bic_risky, type = "response")
pred_probit <- predict(model_probit, type = "response")
pred_hazard <- predict(model_hazard, type = "response")
head(sort(pred_logit, decreasing = TRUE),6)
## 364 309 64 389 410 178
## 1 1 1 1 1 1
head(sort(pred_probit, decreasing = TRUE),6)
## 56 64 136 178 181 309
## 1 1 1 1 1 1
head(sort(pred_hazard, decreasing = TRUE),6)
## 19 29 30 56 64 91
## 1 1 1 1 1 1
df <- data.frame(aircraft="boeing",
duration=200,
no_pasg=80,
speed_ground=115,
speed_air=120,
height=40,
pitch=4)
pred1 <- predict(model_probit, df, se.fit = TRUE)
critval <- 1.96 ## approx 95% CI
upr1 <- ilogit(pred1$fit + (critval * pred1$se.fit))
lwr1 <- ilogit(pred1$fit - (critval * pred1$se.fit))
fit1 <- ilogit(pred1$fit)
cat("lower:", lwr1, "| fit:" , fit1, "| upper:", upr1)
## lower: 0.9297331 | fit: 0.9919316 | upper: 0.9991254
pred2 <- predict(model_hazard, df, se.fit = TRUE)
critval <- 1.96 ## approx 95% CI
upr2 <- ilogit(pred2$fit + (critval * pred2$se.fit))
lwr2 <- ilogit(pred2$fit - (critval * pred2$se.fit))
fit2 <- ilogit(pred2$fit)
cat("lower:", lwr2, "| fit:", fit2, "| upper:", upr2)
## lower: 0.9404881 | fit: 0.9938886 | upper: 0.9994028
Data Setup
setwd("C:/Users/Akshay Kher/Desktop/Spring Sem/Statistical Modeling/Homework2")
data <- fread("FAA_Clean.csv") # reading data
data$dist <- ifelse(data$distance < 1000, 1, # creating new variable
ifelse(data$distance < 2500, 2, 3))
data$distance <- NULL # removing distance variable
data$dist <- as.factor(data$dist) # making output as factor
data$aircraft <- as.factor(data$aircraft) # making aircraft as factor
Building Model
Used multinomial logistic regression model to regress dist on to all input variables.
data <- select(data, -speed_air) %>%
na.omit()
multinomial_model <- multinom(dist ~ duration +
no_pasg +
speed_ground +
height +
aircraft +
pitch, data )
## # weights: 24 (14 variable)
## initial value 860.213422
## iter 10 value 535.322398
## iter 20 value 233.215870
## iter 30 value 215.490327
## iter 40 value 215.306686
## iter 50 value 215.095098
## final value 215.033762
## converged
Performed stepwise variable selection (AIC) to choose the best model.
step_multinomial_model <- step(multinomial_model) # stepwise variable selection
The summary of the model is as follows:
summary(step_multinomial_model) # summary of model
## Call:
## multinom(formula = dist ~ speed_ground + height + aircraft +
## pitch, data = data)
##
## Coefficients:
## (Intercept) speed_ground height aircraftboeing pitch
## 2 -19.41428 0.2167328 0.1390243 3.723276 -0.3293647
## 3 -132.77008 1.1895559 0.3810112 8.645437 1.0100623
##
## Std. Errors:
## (Intercept) speed_ground height aircraftboeing pitch
## 2 1.88154920 0.01765206 0.01703552 0.3975984 0.2659644
## 3 0.03364901 0.02846458 0.04038931 0.8568989 0.7434756
##
## Residual Deviance: 433.3108
## AIC: 453.3108
Table
The table below shows the mean value of all the input variables for dist =1, 2 and 3:
duration <- tapply(data$duration, data$dist, mean, na.rm=TRUE)
no_pasg <- tapply(data$no_pasg, data$dist, mean, na.rm=TRUE)
speed_ground <- tapply(data$speed_ground, data$dist, mean, na.rm=TRUE)
height <- tapply(data$height, data$dist, mean, na.rm=TRUE)
pitch <- tapply(data$pitch, data$dist, mean, na.rm=TRUE)
df <- data.frame(duration, no_pasg, speed_ground, height, pitch) %>%
t() %>%
as.data.frame()
names(df) <- c('dist1', 'dist2', 'dist3')
df$variable <- rownames(df)
rownames(df) <- NULL
df <- select(df, variable, everything())
kable(df)
| variable | dist1 | dist2 | dist3 |
|---|---|---|---|
| duration | 160.889530 | 151.954956 | 152.605081 |
| no_pasg | 60.232653 | 60.045662 | 59.740000 |
| speed_ground | 62.177997 | 82.107324 | 110.589909 |
| height | 27.908240 | 31.717500 | 31.032841 |
| pitch | 3.979672 | 4.017098 | 4.092904 |
Plots
Visualizing aircraft vs dist. Boeing has higher proportion of flights with higher dist as compared to Airbus.
ggplot(data, aes(x=aircraft, fill=dist)) +
geom_bar(position = 'dodge')
Visualizing speed_ground vs dist. As dist increases, speed_ground increases.
ggplot(data, aes(x=dist, y=speed_ground)) +
geom_boxplot()
Visualizing height vs dist. As dist increases, height seems to be unaffected.
ggplot(data, aes(x=dist, y=height)) +
geom_boxplot()
Conclusion