part2:Practice of modeling a binary response using logistic regression.

Create binary response

Step 1. From now on, please work on the cleaned FAA data set you prepared by carrying out Steps 1-9 in Part 1 of the project. Create two binary variables below and attach them to your data set.

Use the data in the Project PartI,write the data to a new Excel file named “FAA” create two binary variables and attach to the FAA dataset.

## 'data.frame':    831 obs. of  8 variables:
##  $ ï..aircraft : Factor w/ 2 levels "airbus","boeing": 1 1 1 1 1 1 1 1 1 1 ...
##  $ no_pasg     : int  36 38 40 41 43 44 45 45 45 45 ...
##  $ speed_ground: num  47.5 85.2 80.6 97.6 82.5 ...
##  $ speed_air   : num  49.5 86.4 81.9 97 83.8 ...
##  $ height      : num  14 37 28.6 38.4 30.1 ...
##  $ pitch       : num  4.3 4.12 3.62 3.53 4.09 ...
##  $ distance    : num  251 1257 1021 2168 1321 ...
##  $ duration    : num  172 188 93.5 123.3 109.2 ...

the cleaned dataset has 831 observation and 8 variables.

long.landing <- ifelse(FAA$distance>2500,1,0)
risky.landing <- ifelse(FAA$distance>3000,1,0)
aircraft <- FAA$ï..aircraft
FAA <- cbind(FAA[c(-1,-7)],long.landing,risky.landing,aircraft)

there are 103 samples of long.landing and 61 samples of risky.landing.

step2 Use a pie chart or a histogram to show the distribution of “long.landing”.

long.landing is binary, the hist only shows two value 0 or 1.

library(ggplot2)
ggplot(FAA,aes(x=long.landing))+geom_bar()

mytable <- table(FAA$long.landing)
lbls <- paste(names(mytable),"\n",mytable,sep="")
pie(mytable,labels = lbls,main = "pie chart of long.landing")

step3 Perform single-factor regression analysis for each of the potential risk factors, in a similar way to what you did in Steps 13-15 of Part 1. But here the response “long.landing” is binary. You may consider using logistic regression. Provide a table that ranks the factors from the most important to the least. This table contains 5 columns: the names of variables, the size of the regression coefficient, the odds ratio, the direction of the regression coefficient (positive or negative), and the p-value.

reg1 <- glm(FAA$long.landing~FAA$aircraft,family=binomial)
coef1 <- summary(reg1)$coefficients[2,1]
oddsRatio1 <-exp(coef1)
direction1 <- ifelse(coef1>0,"positive","negative")
pr1 <- summary(reg1)$coefficients[2,4]

reg2 <- glm(FAA$long.landing~FAA$no_pasg,family=binomial)
coef2 <- summary(reg2)$coefficients[2,1]
oddsRatio2 <-exp(coef2)
direction2 <- ifelse(coef2>0,"positive","negative")
pr2 <- summary(reg2)$coefficients[2,4]

reg3 <- glm(FAA$long.landing~FAA$speed_ground,family=binomial)
coef3 <- summary(reg3)$coefficients[2,1]
oddsRatio3 <-exp(coef3)
direction3 <- ifelse(coef3>0,"positive","negative")
pr3 <- summary(reg3)$coefficients[2,4]

reg4 <- glm(FAA$long.landing~FAA$speed_air,family=binomial)
coef4 <- summary(reg4)$coefficients[2,1]
oddsRatio4 <-exp(coef4)
direction4 <- ifelse(coef4>0,"positive","negative")
pr4 <- summary(reg4)$coefficients[2,4]

reg5 <- glm(FAA$long.landing~FAA$height,family=binomial)
coef5 <- summary(reg5)$coefficients[2,1]
oddsRatio5 <-exp(coef5)
direction5 <- ifelse(coef5>0,"positive","negative")
pr5 <- summary(reg5)$coefficients[2,4]

reg6 <- glm(FAA$long.landing~FAA$pitch,family=binomial)
coef6 <- summary(reg6)$coefficients[2,1]
oddsRatio6 <-exp(coef6)
direction6 <- ifelse(coef6>0,"positive","negative")
pr6 <- summary(reg6)$coefficients[2,4]

reg7 <- glm(FAA$long.landing~FAA$duration,family=binomial)
coef7 <- summary(reg7)$coefficients[2,1]
oddsRatio7 <-exp(coef7)
direction7 <- ifelse(coef7>0,"positive","negative")
pr7 <- summary(reg7)$coefficients[2,4]

names <- c("aircraft","no_passager","speed_ground","speed_air","height","pitch","duration")
reg_coef <- c(coef1,coef2,coef3,coef4,coef5,coef6,coef7)
odds_ratio <- c(oddsRatio1,oddsRatio2,oddsRatio3,oddsRatio4,oddsRatio5,oddsRatio6,oddsRatio7)
direction <- c(direction1,direction2,direction3,direction4,direction5,direction6,direction7)
pr <- c(pr1,pr2,pr3,pr4,pr5,pr6,pr7)
table_temp=data.frame(names,reg_coef,odds_ratio,direction,pr)
table0<-table_temp[order(table_temp$pr), , drop = FALSE]
table0
##          names     reg_coef odds_ratio direction           pr
## 3 speed_ground  0.472345752  1.6037518  positive 3.935339e-14
## 4    speed_air  0.547823337  1.7294844  positive 2.742794e-13
## 1     aircraft  0.864119860  2.3729167  positive 8.398591e-05
## 6        pitch  0.400527824  1.4926123  positive 4.664982e-02
## 5       height  0.008623997  1.0086613  positive 4.218576e-01
## 2  no_passager -0.007256406  0.9927699  negative 6.058565e-01
## 7     duration -0.001100801  0.9988998  negative 6.257257e-01

The factors are scaled in order to assess their impact on the response variable on the basis of the coefficient estimates.

FAA_0 <- FAA
FAA_0$duration_n <- 
  (FAA$duration - mean(FAA$duration, na.rm = T))/sd(FAA$duration, na.rm = T)
FAA_0$no_pasg_n <- 
  (FAA$no_pasg - mean(FAA$no_pasg, na.rm = T))/sd(FAA$no_pasg, na.rm = T)
FAA_0$speed_ground_n <- 
  (FAA$speed_ground - mean(FAA$speed_ground, na.rm = T))/sd(FAA$speed_ground, na.rm = T)
FAA_0$speed_air_n <- 
  (FAA$speed_air - mean(FAA$speed_air, na.rm = T))/sd(FAA$speed_air, na.rm = T)
FAA_0$height_n <- 
  (FAA$height - mean(FAA$height, na.rm = T))/sd(FAA$height, na.rm = T)
FAA_0$pitch_n <- 
  (FAA$pitch - mean(FAA$pitch, na.rm = T))/sd(FAA$pitch, na.rm = T)

reg1g <- glm(FAA$long.landing~FAA$aircraft,family=binomial)
coef1g <- summary(reg1g)$coefficients[2,1]
oddsRatio1g <-exp(coef1g)
direction1g <- ifelse(coef1g>0,"positive","negative")
pr1g <- summary(reg1g)$coefficients[2,4]

reg2g <- glm(FAA$long.landing~FAA_0$no_pasg_n,family=binomial)
coef2g <- summary(reg2g)$coefficients[2,1]
oddsRatio2g <-exp(coef2g)
direction2g <- ifelse(coef2g>0,"positive","negative")
pr2g <- summary(reg2g)$coefficients[2,4]

reg3g <- glm(FAA$long.landing~FAA_0$speed_ground_n,family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
coef3g <- summary(reg3g)$coefficients[2,1]
oddsRatio3g <-exp(coef3g)
direction3g <- ifelse(coef3g>0,"positive","negative")
pr3g <- summary(reg3g)$coefficients[2,4]

reg4g <- glm(FAA$long.landing~FAA_0$speed_air_n,family=binomial)
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
coef4g <- summary(reg4g)$coefficients[2,1]
oddsRatio4g <-exp(coef4g)
direction4g <- ifelse(coef4g>0,"positive","negative")
pr4g <- summary(reg4g)$coefficients[2,4]

reg5g <- glm(FAA$long.landing~FAA_0$height_n,family=binomial)
coef5g <- summary(reg5g)$coefficients[2,1]
oddsRatio5g <-exp(coef5g)
direction5g <- ifelse(coef5g>0,"positive","negative")
pr5g <- summary(reg5g)$coefficients[2,4]

reg6g <- glm(FAA$long.landing~FAA_0$pitch_n,family=binomial)
coef6g <- summary(reg6g)$coefficients[2,1]
oddsRatio6g <-exp(coef6g)
direction6g <- ifelse(coef6g>0,"positive","negative")
pr6g <- summary(reg6g)$coefficients[2,4]

reg7g <- glm(FAA$long.landing~FAA_0$duration_n,family=binomial)
coef7g <- summary(reg7)$coefficients[2,1]
oddsRatio7g <-exp(coef7)
direction7g <- ifelse(coef7>0,"positive","negative")
pr7g <- summary(reg7)$coefficients[2,4]

names <- c("aircraft_n","no_passager_n","speed_ground_n","speed_air_n","height_n","pitch_n","duration_n")
reg_coef_n <- c(coef1g,coef2g,coef3g,coef4g,coef5g,coef6g,coef7g)
odds_ratio_n <- c(oddsRatio1g,oddsRatio2g,oddsRatio3g,oddsRatio4g,oddsRatio5g,oddsRatio6g,oddsRatio7g)
direction_n <- c(direction1g,direction2g,direction3g,direction4g,direction5g,direction6g,direction7g)
pr_n <- c(pr1g,pr2g,pr3g,pr4g,pr5g,pr6g,pr7g)
table_temp_n=data.frame(names,reg_coef_n,odds_ratio_n,direction_n,pr_n)
table0_n<-table_temp_n[order(table_temp_n$pr), , drop = FALSE]
table0_n
##            names   reg_coef_n odds_ratio_n direction_n         pr_n
## 3 speed_ground_n  8.849716699 6.972413e+03    positive 3.935339e-14
## 4    speed_air_n  9.927933499 2.049494e+04    positive 2.742794e-13
## 1     aircraft_n  0.864119860 2.372917e+00    positive 8.398591e-05
## 6        pitch_n  0.210905554 1.234796e+00    positive 4.664982e-02
## 5       height_n  0.084384182 1.088047e+00    positive 4.218576e-01
## 2  no_passager_n -0.054360033 9.470911e-01    negative 6.058565e-01
## 7     duration_n -0.001100801 9.988998e-01    negative 6.257257e-01

the factors speed_ground, speed_air, aircraft and pitch were identified as important factors for analysis.

step4 For those significant factors identified in Step 3, visualize its association with “long.landing”. See the slides (pp. 12-21) for Lecture 3.

plot(jitter(long.landing,0.1)~jitter(speed_ground),FAA,xlab="Speed Ground",
     ylab="Long Landing",pch=".")

plot(jitter(long.landing,0.1)~jitter(speed_air),FAA,xlab="Speed Air",
     ylab="Long Landing",pch=".")

plot(jitter(long.landing,0.1)~jitter(pitch),FAA,xlab="Pitch",
     ylab="Long Landing",pch=".")

plot(jitter(long.landing,0.1)~jitter(height),FAA,xlab="Pitch",
     ylab="Long Landing",pch=".")

FAA$aircraft <- as.numeric(FAA$aircraft)
round(cor(FAA),2)
##               no_pasg speed_ground speed_air height pitch duration long.landing
## no_pasg          1.00         0.00      0.00   0.05 -0.02    -0.04        -0.02
## speed_ground     0.00         1.00      1.00  -0.06 -0.04    -0.05         0.62
## speed_air        0.00         1.00      1.00  -0.06 -0.04    -0.04         0.62
## height           0.05        -0.06     -0.06   1.00  0.02     0.01         0.03
## pitch           -0.02        -0.04     -0.04   0.02  1.00    -0.04         0.07
## duration        -0.04        -0.05     -0.04   0.01 -0.04     1.00        -0.02
## long.landing    -0.02         0.62      0.62   0.03  0.07    -0.02         1.00
## risky.landing   -0.05         0.54      0.54  -0.01  0.05    -0.01         0.75
## aircraft        -0.02        -0.04     -0.04  -0.01  0.35    -0.04         0.14
##               risky.landing aircraft
## no_pasg               -0.05    -0.02
## speed_ground           0.54    -0.04
## speed_air              0.54    -0.04
## height                -0.01    -0.01
## pitch                  0.05     0.35
## duration              -0.01    -0.04
## long.landing           0.75     0.14
## risky.landing          1.00     0.13
## aircraft               0.13     1.00
library(pheatmap)
pheatmap(as.matrix(cor(FAA)),display_numbers = T,main = "Correlation of FAA")

As we can see from the correlation, long.landing is highly correlated with speed_air and speed_ground followed by the factors aircraft, pitch, height,no_passager and duration. it is the same order as the significant facotors identified in the step3(table0)

step5 Based on the analysis results in Steps 3-4 and the collinearity result seen in Step 16 of Part 1, initiate a “full” model. Fit your model to the data and present your result.

## [1] 0.9990797
## 
## Call:
## glm(formula = long.landing ~ speed_ground + speed_air, family = "binomial", 
##     data = FAA)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.73137  -0.02700  -0.00091  -0.00001   2.37323  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -56.17381    7.69464  -7.300 2.87e-13 ***
## speed_ground  -0.08531    0.18403  -0.464  0.64298    
## speed_air      0.63840    0.21192   3.012  0.00259 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.78  on 830  degrees of freedom
## Residual deviance: 104.04  on 828  degrees of freedom
## AIC: 110.04
## 
## Number of Fisher Scoring iterations: 10

AIC value for speed_ground: 119.47 AIC value for speed_air: 108.26 AIC value for speed_ground + speed_air: 110.04

From the partI, we imputed the speed_air from the speed_ground. the correlation of the speed_air and speed_ground is very large. we keep the original data of speed_ground, remove the speed_air from our analysis.although the AIC for the speed_air is lower than speed_ground we pick up the factors speed_ground,aircraft,pitch,height into our model.

model_whole <- glm(FAA$long.landing ~., family = binomial,FAA)
summary(model_whole)
## 
## Call:
## glm(formula = FAA$long.landing ~ ., family = binomial, data = FAA)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
##  -2.56    0.00    0.00    0.00    1.58  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.063e+02  5.578e+01  -3.699 0.000217 ***
## no_pasg       -7.842e-02  6.998e-02  -1.121 0.262438    
## speed_ground  -2.284e-01  3.887e-01  -0.588 0.556771    
## speed_air      2.076e+00  7.108e-01   2.920 0.003498 ** 
## height         4.468e-01  1.414e-01   3.161 0.001575 ** 
## pitch          1.606e+00  1.039e+00   1.546 0.122023    
## duration       1.923e-04  1.071e-02   0.018 0.985677    
## risky.landing  1.105e+01  2.401e+03   0.005 0.996329    
## aircraft2      9.161e+00  2.632e+00   3.481 0.000499 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  33.334  on 822  degrees of freedom
## AIC: 51.334
## 
## Number of Fisher Scoring iterations: 20

the model_whole with every factors included except speed_air. the significant factor is speed_ground,height,pitch,aircraft. the AIC=68.43.

pick up factors: speed_ground,height,pitch,aircraft to build the full model_1. the AIC=63.204 pick up facrors: speed_ground,height,aircraft to build the full model_2. the AIC=65.04 AIC of the full model_1 is lower, so we decide the factor in the full model is speed_ground,height,pitch,aircraft

modfull <- glm(FAA$long.landing ~ FAA$speed_ground+ FAA$aircraft+FAA$pitch+FAA$height,family = binomial,FAA)
modfull2 <- glm(FAA$long.landing ~ FAA$speed_ground+ FAA$aircraft+FAA$height,family = binomial,FAA)
summary(modfull)
## 
## Call:
## glm(formula = FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + 
##     FAA$pitch + FAA$height, family = binomial, data = FAA)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.20284  -0.00054   0.00000   0.00000   2.35719  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -119.77598   24.41821  -4.905 9.33e-07 ***
## FAA$speed_ground    1.02266    0.20290   5.040 4.65e-07 ***
## FAA$aircraft2       5.13443    1.18091   4.348 1.37e-05 ***
## FAA$pitch           1.53751    0.84109   1.828  0.06755 .  
## FAA$height          0.25795    0.06861   3.760  0.00017 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  53.204  on 826  degrees of freedom
## AIC: 63.204
## 
## Number of Fisher Scoring iterations: 12
summary(modfull2)
## 
## Call:
## glm(formula = FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + 
##     FAA$height, family = binomial, data = FAA)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.43442  -0.00117   0.00000   0.00000   2.57435  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -102.95437   19.22882  -5.354 8.59e-08 ***
## FAA$speed_ground    0.92657    0.17242   5.374 7.70e-08 ***
## FAA$aircraft2       5.04813    1.11520   4.527 5.99e-06 ***
## FAA$height          0.23106    0.05959   3.877 0.000106 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 622.778  on 830  degrees of freedom
## Residual deviance:  57.047  on 827  degrees of freedom
## AIC: 65.047
## 
## Number of Fisher Scoring iterations: 11

Step 6. Use the R function “Step” to perform forward variable selection using AIC. Compare the result with the table obtained in Step 3. Are the results consistent?

modstart <- glm(FAA$long.landing~1,family = binomial,FAA)
step(modstart,scope = formula(modfull),direction = "forward")
## Start:  AIC=624.78
## FAA$long.landing ~ 1
## 
##                    Df Deviance    AIC
## + FAA$speed_ground  1   115.47 119.47
## + FAA$aircraft      1   606.55 610.55
## + FAA$pitch         1   618.79 622.79
## <none>                  622.78 624.78
## + FAA$height        1   622.13 626.13
## 
## Step:  AIC=119.47
## FAA$long.landing ~ FAA$speed_ground
## 
##                Df Deviance     AIC
## + FAA$aircraft  1   84.665  90.665
## + FAA$height    1  100.459 106.459
## + FAA$pitch     1  105.527 111.527
## <none>             115.470 119.470
## 
## Step:  AIC=90.66
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft
## 
##              Df Deviance    AIC
## + FAA$height  1   57.047 65.047
## + FAA$pitch   1   81.309 89.309
## <none>            84.665 90.665
## 
## Step:  AIC=65.05
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + FAA$height
## 
##             Df Deviance    AIC
## + FAA$pitch  1   53.204 63.204
## <none>           57.047 65.047
## 
## Step:  AIC=63.2
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + FAA$height + 
##     FAA$pitch
## 
## Call:  glm(formula = FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + 
##     FAA$height + FAA$pitch, family = binomial, data = FAA)
## 
## Coefficients:
##      (Intercept)  FAA$speed_ground     FAA$aircraft2        FAA$height  
##         -119.776             1.023             5.134             0.258  
##        FAA$pitch  
##            1.538  
## 
## Degrees of Freedom: 830 Total (i.e. Null);  826 Residual
## Null Deviance:       622.8 
## Residual Deviance: 53.2  AIC: 63.2

Use the step AIC for the variable selection. the AIC is 63.2. the best model from this step is with the factors speed_ground, aircraft, height and pitch. it is consistent with the step3.

step7 Use the R function “Step” to perform forward variable selection using BIC. Compare the result with that from the previous step.

step(modstart,criterion = "BIC",scope = formula(modfull),direction = "forward", k=log(831))
## Start:  AIC=629.5
## FAA$long.landing ~ 1
## 
##                    Df Deviance    AIC
## + FAA$speed_ground  1   115.47 128.92
## + FAA$aircraft      1   606.55 620.00
## <none>                  622.78 629.50
## + FAA$pitch         1   618.79 632.24
## + FAA$height        1   622.13 635.58
## 
## Step:  AIC=128.92
## FAA$long.landing ~ FAA$speed_ground
## 
##                Df Deviance    AIC
## + FAA$aircraft  1   84.665 104.83
## + FAA$height    1  100.459 120.63
## + FAA$pitch     1  105.527 125.69
## <none>             115.470 128.92
## 
## Step:  AIC=104.83
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft
## 
##              Df Deviance     AIC
## + FAA$height  1   57.047  83.937
## <none>            84.665 104.832
## + FAA$pitch   1   81.309 108.200
## 
## Step:  AIC=83.94
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + FAA$height
## 
##             Df Deviance    AIC
## <none>           57.047 83.937
## + FAA$pitch  1   53.204 86.817
## 
## Call:  glm(formula = FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + 
##     FAA$height, family = binomial, data = FAA)
## 
## Coefficients:
##      (Intercept)  FAA$speed_ground     FAA$aircraft2        FAA$height  
##        -102.9544            0.9266            5.0481            0.2311  
## 
## Degrees of Freedom: 830 Total (i.e. Null);  827 Residual
## Null Deviance:       622.8 
## Residual Deviance: 57.05     AIC: 65.05

the result from the step BIC is not accordance with step AIC, the stepwise of BIC does not pick up the pitch as a significant factor. the model with speed_ground,aircraft and height and the AIC=65.05, little higher compare to step6 (AIC=63.2). so the better model stay with speed_ground,aircraft,height, pitch)

Step 8. You are scheduled to meet with an FAA agent who wants to know “what are risk factors for long landings and how do they influence its occurrence?”. For your presentation, you are only allowed to show:

Use the step AIC to display the result. there are four significant factors, speed_ground,aircraft,height and pitch.

  1. the higher the speed_ground the longer the landing distance.
  2. Boeing is more related to long landing.
  3. the higher the pitch the longer the landing.
  4. the higher the height the longer the landing
## Start:  AIC=624.78
## FAA$long.landing ~ 1
## 
##                    Df Deviance    AIC
## + FAA$speed_ground  1   115.47 119.47
## + FAA$aircraft      1   606.55 610.55
## + FAA$pitch         1   618.79 622.79
## <none>                  622.78 624.78
## + FAA$height        1   622.13 626.13
## 
## Step:  AIC=119.47
## FAA$long.landing ~ FAA$speed_ground
## 
##                Df Deviance     AIC
## + FAA$aircraft  1   84.665  90.665
## + FAA$height    1  100.459 106.459
## + FAA$pitch     1  105.527 111.527
## <none>             115.470 119.470
## 
## Step:  AIC=90.66
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft
## 
##              Df Deviance    AIC
## + FAA$height  1   57.047 65.047
## + FAA$pitch   1   81.309 89.309
## <none>            84.665 90.665
## 
## Step:  AIC=65.05
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + FAA$height
## 
##             Df Deviance    AIC
## + FAA$pitch  1   53.204 63.204
## <none>           57.047 65.047
## 
## Step:  AIC=63.2
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + FAA$height + 
##     FAA$pitch
## 
## Call:  glm(formula = FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + 
##     FAA$height + FAA$pitch, family = binomial, data = FAA)
## 
## Coefficients:
##      (Intercept)  FAA$speed_ground     FAA$aircraft2        FAA$height  
##         -119.776             1.023             5.134             0.258  
##        FAA$pitch  
##            1.538  
## 
## Degrees of Freedom: 830 Total (i.e. Null);  826 Residual
## Null Deviance:       622.8 
## Residual Deviance: 53.2  AIC: 63.2
##          names    reg_coef odds_ratio direction           pr
## 3 speed_ground 0.472345752   1.603752  positive 3.935339e-14
## 1     aircraft 0.864119860   2.372917  positive 8.398591e-05
## 6        pitch 0.400527824   1.492612  positive 4.664982e-02
## 5       height 0.008623997   1.008661  positive 4.218576e-01

Identifying important factors using the binary data of “risky.landing”.

Step 9. Repeat Steps 1-7 but using “risky.landing” as the binary response.

Use a pie chart or a histogram to show the distribution of “risky.landing”.

(risky.landing step 2)

ggplot(FAA, aes(x = risky.landing)) + geom_bar()

mytable1 <- table(FAA$risky.landing)
lbls <- paste(names(mytable1),"\n",mytable1,sep="")
pie(mytable1,labels = lbls,main = "pie chart of risky.landing")

Perform single-factor regression analysis for each of the potential risk factors, in a similar way to what you did in Steps 13-15 of Part 1. But here the response “risky.landing” is binary. You may consider using logistic regression. Provide a table that ranks the factors from the most important to the least. This table contains 5 columns: the names of variables, the size of the regression coefficient, the odds ratio, the direction of the regression coefficient (positive or negative), and the p-value.

(risky.landing step 3)

reg1_1 <- glm(FAA$risky.landing~FAA$aircraft,family=binomial)
coef1_1  <- summary(reg1_1 )$coefficients[2,1]
oddsRatio1_1  <-exp(coef1_1 )
direction1_1  <- ifelse(coef1_1 >0,"positive","negative")
pr1_1  <- summary(reg1_1 )$coefficients[2,4]

reg2_2 <- glm(FAA$risky.landing~FAA$no_pasg,family=binomial)
coef2_2  <- summary(reg2_2 )$coefficients[2,1]
oddsRatio2_2  <-exp(coef2_2 )
direction2_2  <- ifelse(coef2_2 >0,"positive","negative")
pr2_2  <- summary(reg2_2 )$coefficients[2,4]

reg3_3 <- glm(FAA$risky.landing~FAA$speed_ground,family=binomial)
coef3_3 <- summary(reg3_3)$coefficients[2,1]
oddsRatio3_3 <-exp(coef3_3)
direction3_3 <- ifelse(coef3_3>0,"positive","negative")
pr3_3 <- summary(reg3_3)$coefficients[2,4]

reg4_4 <- glm(FAA$risky.landing~FAA$speed_air,family=binomial)
coef4_4 <- summary(reg4_4)$coefficients[2,1]
oddsRatio4_4 <-exp(coef4_4)
direction4_4 <- ifelse(coef4_4>0,"positive","negative")
pr4_4 <- summary(reg4_4)$coefficients[2,4]

reg5_5 <- glm(FAA$risky.landing~FAA$height,family=binomial)
coef5_5 <- summary(reg5_5)$coefficients[2,1]
oddsRatio5_5 <-exp(coef5_5)
direction5_5 <- ifelse(coef5_5>0,"positive","negative")
pr5_5 <- summary(reg5_5)$coefficients[2,4]

reg6_6 <- glm(FAA$risky.landing~FAA$pitch,family=binomial)
coef6_6 <- summary(reg6_6)$coefficients[2,1]
oddsRatio6_6 <-exp(coef6_6)
direction6_6 <- ifelse(coef6_6>0,"positive","negative")
pr6_6 <- summary(reg6_6)$coefficients[2,4]

reg7_7 <- glm(FAA$risky.landing~FAA$duration,family=binomial)
coef7_7 <- summary(reg7_7)$coefficients[2,1]
oddsRatio7_7 <-exp(coef7_7)
direction7_7 <- ifelse(coef7_7>0,"positive","negative")
pr7_7 <- summary(reg7_7)$coefficients[2,4]

names <- c("aircraft","no_passager","speed_ground","speed_air","height","pitch","duration")
reg_coef1 <- c(coef1_1,coef2_2,coef3_3,coef4_4,coef5_5,coef6_6,coef7_7)
odds_ratio1 <- c(oddsRatio1_1,oddsRatio2_2,oddsRatio3_3,oddsRatio4_4,oddsRatio5_5,oddsRatio6_6,oddsRatio7_7)
direction1 <- c(direction1_1,direction2_2,direction3_3,direction4_4,direction5_5,direction6_6,direction7_7)
pr1 <- c(pr1_1,pr2_2,pr3_3,pr4_4,pr5_5,pr6_6,pr7_7)
table_temp1=data.frame(names,reg_coef1,odds_ratio1,direction1,pr1)
table2<-table_temp1[order(table_temp1$pr1), , drop = FALSE]
table2
##          names    reg_coef1 odds_ratio1 direction1          pr1
## 3 speed_ground  0.614218747   1.8482121   positive 6.898006e-08
## 4    speed_air  0.870661285   2.3884898   positive 3.615180e-06
## 1     aircraft  1.001775330   2.7231120   positive 4.560563e-04
## 6        pitch  0.371071969   1.4492874   positive 1.432961e-01
## 2  no_passager -0.025379344   0.9749400   negative 1.536237e-01
## 7     duration -0.001201245   0.9987995   negative 6.738168e-01
## 5       height -0.002218606   0.9977839   negative 8.705917e-01
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
##            names   reg_coef_n odds_ratio_n direction_n         pr_n
## 3 speed_ground_n 11.507803083 9.948907e+04    positive 6.898006e-08
## 4    speed_air_n 15.778567188 7.121060e+06    positive 3.615180e-06
## 1     aircraft_n  1.001775330 2.723112e+00    positive 4.560563e-04
## 6        pitch_n  0.195395013 1.215791e+00    positive 1.432961e-01
## 2  no_passager_n -0.190124702 8.268560e-01    negative 1.536237e-01
## 7     duration_n -0.001100801 9.988998e-01    negative 6.257257e-01
## 5       height_n -0.021708638 9.785253e-01    negative 8.705917e-01

(risky.landing step 4)

##               no_pasg speed_ground speed_air height pitch duration long.landing
## no_pasg          1.00         0.00      0.00   0.05 -0.02    -0.04        -0.02
## speed_ground     0.00         1.00      1.00  -0.06 -0.04    -0.05         0.62
## speed_air        0.00         1.00      1.00  -0.06 -0.04    -0.04         0.62
## height           0.05        -0.06     -0.06   1.00  0.02     0.01         0.03
## pitch           -0.02        -0.04     -0.04   0.02  1.00    -0.04         0.07
## duration        -0.04        -0.05     -0.04   0.01 -0.04     1.00        -0.02
## long.landing    -0.02         0.62      0.62   0.03  0.07    -0.02         1.00
## risky.landing   -0.05         0.54      0.54  -0.01  0.05    -0.01         0.75
## aircraft        -0.02        -0.04     -0.04  -0.01  0.35    -0.04         0.14
##               risky.landing aircraft
## no_pasg               -0.05    -0.02
## speed_ground           0.54    -0.04
## speed_air              0.54    -0.04
## height                -0.01    -0.01
## pitch                  0.05     0.35
## duration              -0.01    -0.04
## long.landing           0.75     0.14
## risky.landing          1.00     0.13
## aircraft               0.13     1.00

As we can see from the correlation, risky.landing is highly correlated with speed_air and speed_ground followed by the factors aircraft, pitch. height,no_passager and duration have a negative coef. it is the same order as the significant facotors identified in the table1.

(risky.landing step 5)

Based on the analysis results in Steps 3-4 and the collinearity result seen in Step 16 of Part 1, initiate a “full” model. Fit your model to the data and present your result.

cor(FAA$speed_air,FAA$speed_ground)
## [1] 0.9990797
model_whole1 <- glm(FAA1$risky.landing ~., family = binomial,FAA1)
summary(model_whole1)
## 
## Call:
## glm(formula = FAA1$risky.landing ~ ., family = binomial, data = FAA1)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.439   0.000   0.000   0.000   1.852  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.148e+02  2.605e+03  -0.044 0.964854    
## no_pasg      -8.539e-02  6.025e-02  -1.417 0.156400    
## speed_ground  9.212e-01  2.512e-01   3.667 0.000246 ***
## height        4.098e-02  4.624e-02   0.886 0.375535    
## pitch         5.798e-01  7.980e-01   0.727 0.467515    
## duration      3.513e-04  1.214e-02   0.029 0.976917    
## long.landing  1.453e+01  2.605e+03   0.006 0.995548    
## aircraft2     4.319e+00  1.570e+00   2.751 0.005949 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  36.271  on 823  degrees of freedom
## AIC: 52.271
## 
## Number of Fisher Scoring iterations: 23

For the correlation of the speed_air and speed_ground is very large. we keep the original data of speed_ground, remove the speed_air from our analysis. the AIC of the model_whole without speed_air is 52.271, the significant factor: aircraft, speed_ground.

summary(modfull_1)
## 
## Call:
## glm(formula = FAA$risky.landing ~ FAA$speed_ground + FAA$aircraft, 
##     family = binomial)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.24398  -0.00011   0.00000   0.00000   1.61021  
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -106.0963    25.7459  -4.121 3.77e-05 ***
## FAA$speed_ground    0.9263     0.2248   4.121 3.78e-05 ***
## FAA$aircraft        4.0190     1.2494   3.217   0.0013 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  40.097  on 828  degrees of freedom
## AIC: 46.097
## 
## Number of Fisher Scoring iterations: 12

the AIC of the full model is 46.09. factors speed_ground and aircraft only.

(risky.landing step 6)

Use the R function “Step” to perform forward variable selection using AIC. Compare the result with the table obtained in Step 3. Are the results consistent?

modstart_1 <- glm(FAA$risky.landing~1,family = binomial,FAA)
step(modstart_1,scope = formula(modfull_1),direction = "forward")
## Start:  AIC=438.04
## FAA$risky.landing ~ 1
## 
##                    Df Deviance    AIC
## + FAA$speed_ground  1    58.93  62.93
## + FAA$aircraft      1   422.74 426.74
## <none>                  436.04 438.04
## 
## Step:  AIC=62.93
## FAA$risky.landing ~ FAA$speed_ground
## 
##                Df Deviance    AIC
## + FAA$aircraft  1   40.097 46.097
## <none>              58.931 62.931
## 
## Step:  AIC=46.1
## FAA$risky.landing ~ FAA$speed_ground + FAA$aircraft
## 
## Call:  glm(formula = FAA$risky.landing ~ FAA$speed_ground + FAA$aircraft, 
##     family = binomial, data = FAA)
## 
## Coefficients:
##      (Intercept)  FAA$speed_ground      FAA$aircraft  
##        -106.0963            0.9263            4.0190  
## 
## Degrees of Freedom: 830 Total (i.e. Null);  828 Residual
## Null Deviance:       436 
## Residual Deviance: 40.1  AIC: 46.1

Use the step AIC for the variable selection. the best model from this step is with the factors speed_ground,aircraft. it is consistent with the step3.

Use the R function “Step” to perform forward variable selection using BIC. Compare the result with that from the previous step.

step(modstart_1,criterion = "BIC",scope = formula(modfull_1),direction = "forward", k=log(831))
## Start:  AIC=442.77
## FAA$risky.landing ~ 1
## 
##                    Df Deviance    AIC
## + FAA$speed_ground  1    58.93  72.38
## + FAA$aircraft      1   422.74 436.18
## <none>                  436.04 442.77
## 
## Step:  AIC=72.38
## FAA$risky.landing ~ FAA$speed_ground
## 
##                Df Deviance    AIC
## + FAA$aircraft  1   40.097 60.264
## <none>              58.931 72.376
## 
## Step:  AIC=60.26
## FAA$risky.landing ~ FAA$speed_ground + FAA$aircraft
## 
## Call:  glm(formula = FAA$risky.landing ~ FAA$speed_ground + FAA$aircraft, 
##     family = binomial, data = FAA)
## 
## Coefficients:
##      (Intercept)  FAA$speed_ground      FAA$aircraft  
##        -106.0963            0.9263            4.0190  
## 
## Degrees of Freedom: 830 Total (i.e. Null);  828 Residual
## Null Deviance:       436 
## Residual Deviance: 40.1  AIC: 46.1

the result from the step BIC is accordance with step AIC, the model goes with speed_ground,aircraft.

Step 10. You are scheduled to meet with an FAA agent who wants to know “what are risk factors for risky landings and how do they influence its occurrence?”. For your presentation, you are only allowed to show:

Use the step AIC to display the result. there are only two significant factors, speed_ground,aircraft

  1. the higher the speed_ground the risker the landing.
  2. Boeing is more related to risky landing.
## Start:  AIC=624.78
## FAA$long.landing ~ 1
## 
##                    Df Deviance    AIC
## + FAA$speed_ground  1   115.47 119.47
## + FAA$aircraft      1   606.55 610.55
## + FAA$pitch         1   618.79 622.79
## <none>                  622.78 624.78
## + FAA$height        1   622.13 626.13
## 
## Step:  AIC=119.47
## FAA$long.landing ~ FAA$speed_ground
## 
##                Df Deviance     AIC
## + FAA$aircraft  1   84.665  90.665
## + FAA$height    1  100.459 106.459
## + FAA$pitch     1  105.527 111.527
## <none>             115.470 119.470
## 
## Step:  AIC=90.66
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft
## 
##              Df Deviance    AIC
## + FAA$height  1   57.047 65.047
## + FAA$pitch   1   81.309 89.309
## <none>            84.665 90.665
## 
## Step:  AIC=65.05
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + FAA$height
## 
##             Df Deviance    AIC
## + FAA$pitch  1   53.204 63.204
## <none>           57.047 65.047
## 
## Step:  AIC=63.2
## FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + FAA$height + 
##     FAA$pitch
## 
## Call:  glm(formula = FAA$long.landing ~ FAA$speed_ground + FAA$aircraft + 
##     FAA$height + FAA$pitch, family = binomial, data = FAA)
## 
## Coefficients:
##      (Intercept)  FAA$speed_ground      FAA$aircraft        FAA$height  
##         -124.910             1.023             5.134             0.258  
##        FAA$pitch  
##            1.538  
## 
## Degrees of Freedom: 830 Total (i.e. Null);  826 Residual
## Null Deviance:       622.8 
## Residual Deviance: 53.2  AIC: 63.2
table3 <- table2[c(1,3),]
table3
##          names reg_coef1 odds_ratio1 direction1          pr1
## 3 speed_ground 0.6142187    1.848212   positive 6.898006e-08
## 1     aircraft 1.0017753    2.723112   positive 4.560563e-04

Compare the two models built for “long.landing” and “risky.landing” Step 11. Use no more than three bullet statements to summarize the difference between the two models.

table1
##          names    reg_coef odds_ratio direction           pr
## 3 speed_ground 0.472345752   1.603752  positive 3.935339e-14
## 1     aircraft 0.864119860   2.372917  positive 8.398591e-05
## 6        pitch 0.400527824   1.492612  positive 4.664982e-02
## 5       height 0.008623997   1.008661  positive 4.218576e-01
table3
##          names reg_coef1 odds_ratio1 direction1          pr1
## 3 speed_ground 0.6142187    1.848212   positive 6.898006e-08
## 1     aircraft 1.0017753    2.723112   positive 4.560563e-04
  1. long.landing correlates with speed_ground, aircraft, pitch and height. risky.landing correlates with speed_ground and aircraft.
  2. speed_ground and aircraft play more important parts in risky.landing because the coef are bigger in the risky.landing model.

Step 12. Plot the ROC curve (sensitivity versus 1-specificity) for each model (see pp.32-33 in Lecture 4 slides). Draw the two curves in the same plot. Do you have any comment?

for long.landing model:

pred.model.l<- predict(modfull, type="response")
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
pred <- prediction(pred.model.l,FAA$long.landing)
perf <- performance(pred, "tpr", "fpr")
plot(perf,colorize=TRUE)

unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.9984663
thresh <- seq(0.01,0.5,0.01)
pred.model.l <- predict(modfull, type = "response")
pred.model.r <- predict(modfull_1, type = "response")

sensitivity <- specificity<-rep(NA,length(thresh))

for(j in seq(along=thresh)) {
  pp<-ifelse(pred.model.l < thresh[j], "no", "yes")
  xx<-xtabs(~long.landing + pp, FAA)
  specificity[j] <- xx[1,1]/(xx[1,1] + xx[1,2])
  sensitivity[j] <- xx[2,2]/(xx[2,1] + xx[2,2])
}

par(mfrow= c(1,2))
matplot(thresh,cbind(sensitivity, specificity), type="l", xlab="Threshold", ylab="Proportion", lty=1:2)
plot(1-specificity, sensitivity, type="l");abline(0, 1, lty=2)

for risky.landing model:

pred.model.r<- predict(modfull_1, type="response")
library(ROCR)
pred.r <- prediction(pred.model.r,FAA$risky.landing)
perf.r <- performance(pred.r, "tpr", "fpr")
plot(perf.r,colorize=TRUE)

unlist(slot(performance(pred.r, "auc"), "y.values"))
## [1] 0.9986161
thresh <- seq(0.01,0.5,0.01)
pred.model.l <- predict(modfull, type = "response")
pred.model.r <- predict(modfull_1, type = "response")

sensitivity.r <- specificity.r<-rep(NA,length(thresh))

for(j in seq(along=thresh)) {
  pp<-ifelse(pred.model.r < thresh[j], "no", "yes")
  xx<-xtabs(~risky.landing + pp, FAA)
  specificity.r[j] <- xx[1,1]/(xx[1,1] + xx[1,2])
  sensitivity.r[j] <- xx[2,2]/(xx[2,1] + xx[2,2])
}

par(mfrow= c(1,2))
matplot(thresh,cbind(sensitivity.r, specificity.r), type="l", xlab="Threshold", ylab="Proportion", lty=1:2)
plot(1-specificity.r, sensitivity.r, type="l");abline(0, 1, lty=2)

plot(1-specificity,sensitivity, type="l", col="blue")
points(1-specificity.r,sensitivity.r,type="l",col="red")
lines(1-specificity.r,sensitivity.r, col="red",lty=2)

unlist(slot(performance(pred, "auc"), "y.values"))
## [1] 0.9984663
unlist(slot(performance(pred.r, "auc"), "y.values"))
## [1] 0.9986161

the ROC curves for the two models for long.landing and risky.landing. the plot indicates that when the sensitivity increases, the specificity decreases. the area under the ROC curve indicates the predictive power of the model. the more area under the ROC curve the better the model.

Step 13. A commercial airplane is passing over the threshold of the runway, at this moment we have its basic information and measures of its airborne performance (Boeing, duration=200, no_pasg=80, speed_ground=115, speed_air=120, height=40, pitch=4). Predict its probability of being a long landing and a risky landing, respectively. Report the predicted probability as well as its 95% confidence interval.

mod.l <- glm(long.landing ~ speed_ground+ aircraft+pitch+height,family = binomial,FAA)
mod.r <- glm(risky.landing ~ speed_ground+ aircraft,family = binomial,FAA)
newdata <- data.frame(speed_ground=115,aircraft="boeing",pitch=4,height=40)
predict(mod.l,newdata = newdata,type = "response",se=T)
## $fit
## 1 
## 1 
## 
## $se.fit
##            1 
## 1.437223e-08 
## 
## $residual.scale
## [1] 1
predict(mod.r,newdata = newdata,type = "response",se=T)
## $fit
##        1 
## 0.999789 
## 
## $se.fit
##            1 
## 0.0004408114 
## 
## $residual.scale
## [1] 1
round((c(1-1.96*1.437223e-08 ,1+1.96*1.437223e-08 )),3)
## [1] 1 1
round((c(0.99-1.96*0.00044,0.99+1.96*0.00044)),3)
## [1] 0.989 0.991

The probability of long landing and risky landing are very high, both approximately 1. the 95%confidence interval for long landing and risky landing are very narrow.

Compare models with different link functions Step 14.

For the binary response “risky landing”, fit the following models using the risk factors identified in Steps 9-10: Probit model Hazard model with complementary log-log link Compare these two models with the logistic model. Do you have any comments?

model_r_logit <- glm(FAA$risky.landing ~ FAA$aircraft + FAA$speed_ground,FAA,family = binomial(link=logit))
summary(model_r_logit)
## 
## Call:
## glm(formula = FAA$risky.landing ~ FAA$aircraft + FAA$speed_ground, 
##     family = binomial(link = logit), data = FAA)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.24398  -0.00011   0.00000   0.00000   1.61021  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -102.0772    24.7751  -4.120 3.79e-05 ***
## FAA$aircraftboeing    4.0190     1.2494   3.217   0.0013 ** 
## FAA$speed_ground      0.9263     0.2248   4.121 3.78e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  40.097  on 828  degrees of freedom
## AIC: 46.097
## 
## Number of Fisher Scoring iterations: 12
model_r_probit <- glm(FAA$risky.landing ~ FAA$aircraft + FAA$speed_ground,FAA,family = binomial(link=probit))
summary(model_r_probit)
## 
## Call:
## glm(formula = FAA$risky.landing ~ FAA$aircraft + FAA$speed_ground, 
##     family = binomial(link = probit), data = FAA)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.210   0.000   0.000   0.000   1.573  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -58.6931    13.3133  -4.409 1.04e-05 ***
## FAA$aircraftboeing   2.3567     0.7016   3.359 0.000782 ***
## FAA$speed_ground     0.5322     0.1207   4.411 1.03e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  39.436  on 828  degrees of freedom
## AIC: 45.436
## 
## Number of Fisher Scoring iterations: 14
model_r_loglog <- glm(FAA$risky.landing ~ FAA$aircraft + FAA$speed_ground,FAA,family = binomial(link=cloglog))
summary(model_r_loglog)
## 
## Call:
## glm(formula = FAA$risky.landing ~ FAA$aircraft + FAA$speed_ground, 
##     family = binomial(link = cloglog), data = FAA)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.24103  -0.00183  -0.00004   0.00000   1.67963  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -69.2654    14.7396  -4.699 2.61e-06 ***
## FAA$aircraftboeing   2.8984     0.8002   3.622 0.000292 ***
## FAA$speed_ground     0.6221     0.1326   4.690 2.74e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 436.043  on 830  degrees of freedom
## Residual deviance:  41.443  on 828  degrees of freedom
## AIC: 47.443
## 
## Number of Fisher Scoring iterations: 13

Compare with these three models: 1) the cloglog model has the lowest AIC. 2) the logit model has the highest coefficient, probit model has the lowest coefficient. 3) the probit model has a little better residual deviance.

Step 15. Compare the three models by showing their ROC curves in the same plot (see Step 12).

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
pred_logit <- predict(model_r_logit, type = "response")
pred_probit <- predict(model_r_probit, type = "response")
pred_cloglog <- predict(model_r_loglog, type = "response")

ROC_logit <- roc(FAA$risky.landing[!is.na(FAA$speed_air)], pred_logit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_probit <- roc(FAA$risky.landing[!is.na(FAA$speed_air)], pred_probit)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
ROC_cloglog <- roc(FAA$risky.landing[!is.na(FAA$speed_air)], pred_cloglog)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ROC_logit)
lines(ROC_probit, col= 'blue')
lines(ROC_cloglog, col= "red")

The three ROC curves are very close to each other. it is difficult to compare in this case.

step16 Use each model to identify the top 5 risky landings. Do they point to the same flights?

pred_logit.index <- sort(as.numeric(names(tail(sort(pred_logit), 5))))
pred_logit.sort <- FAA[pred_logit.index,1:7]
pred_logit.sort
##     no_pasg speed_ground speed_air   height    pitch  duration long.landing
## 227      60     131.0352  131.3379 28.27797 3.660194 131.73110            1
## 513      52     132.7847  132.9115 18.17703 4.110664  63.32952            1
## 675      61     126.8393  126.1186 20.54783 4.334558 153.83445            1
## 773      67     129.3072  127.5933 23.97850 5.154699 154.52460            1
## 814      72     129.2649  128.4177 33.94900 4.139951 161.89247            1
pred_probit.index <- sort(as.numeric(names(tail(sort(pred_probit), 5))))
pred_probit.sort <- FAA[pred_probit.index,1:7]
pred_probit.sort
##     no_pasg speed_ground speed_air   height    pitch duration long.landing
## 675      61     126.8393  126.1186 20.54783 4.334558 153.8345            1
## 772      67     122.7566  123.8826 30.21657 3.213703 116.9845            1
## 773      67     129.3072  127.5933 23.97850 5.154699 154.5246            1
## 784      68     126.6692  127.9641 23.76423 2.993151 197.5464            1
## 814      72     129.2649  128.4177 33.94900 4.139951 161.8925            1
pred_cloglog.index <- sort(as.numeric(names(tail(sort(pred_cloglog), 5))))
pred_cloglog.sort <- FAA[pred_cloglog.index,1:7]
pred_cloglog.sort
##     no_pasg speed_ground speed_air   height    pitch duration long.landing
## 760      66     117.6406  112.2650 35.91004 4.058218 109.4517            1
## 772      67     122.7566  123.8826 30.21657 3.213703 116.9845            1
## 773      67     129.3072  127.5933 23.97850 5.154699 154.5246            1
## 784      68     126.6692  127.9641 23.76423 2.993151 197.5464            1
## 814      72     129.2649  128.4177 33.94900 4.139951 161.8925            1

We can refer from the models that the higest risky landing flight in probit model and cloglog model has 4 in common: they are 772,773,784,814. but only 814 and 773 appears in all three models.

step17 Use the probit model and hazard model to make prediction for the flight described in Step 13. Report the predicted probability as well as its 95% confidence interval. Compare the results with that from Step 13.

model_L_logit <- glm(long.landing ~ aircraft + speed_ground + height + pitch,FAA,family = binomial(link=logit))
model_L_probit <- glm(long.landing ~ aircraft + speed_ground+ height + pitch,FAA,family = binomial(link=probit))
model_L_loglog <- glm(long.landing ~ aircraft + speed_ground+ height + pitch,FAA,family = binomial(link=cloglog))

model_r_logit <- glm(risky.landing ~ aircraft + speed_ground,FAA,family = binomial(link=logit))
model_r_probit <- glm(risky.landing ~ aircraft + speed_ground,FAA,family = binomial(link=probit))
model_r_loglog <- glm(risky.landing ~ aircraft + speed_ground,FAA,family = binomial(link=cloglog)) 

newdata <- data.frame(speed_ground=115,aircraft="boeing",pitch=4,height=40)

pred_L_logit <- predict(model_L_logit,newdata, type = "response", se.fit = T) 
pred_L_probit <- predict(model_L_probit,newdata, type = "response", se.fit = T)
pred_L_loglog <- predict(model_L_loglog,newdata, type = "response", se.fit = T) 

pred_r_logit <- predict(model_r_logit,newdata, type = "response", se.fit = T)
pred_r_probit <- predict(model_r_probit,newdata, type = "response", se.fit = T)
pred_r_cloglog <- predict(model_r_loglog,newdata, type = "response", se.fit = T)

logit_inter_L <- round((c(1-1.96*1.437223e-08 ,1+1.96*1.437223e-08 )),3)
logit_inter_R <-round((c(0.99-1.96*0.00044,0.99+1.96*0.00044)),3)
probit_inter_L <- round((c(1-1.96*4.603419e-16 ,1+1.96*4.603419e-16 )),3)
probit_inter_R <- round((c(0.99-1.96*3.153557e-06 ,0.99+1.96*3.153557e-06 )),3)
loglog_inter_L <- round((c(1-1.96*5.343748e-16  ,1+1.96*5.343748e-16)),3)
loglog_inter_R <- round((c(1-1.96*2.605522e-16 ,1+1.96*2.605522e-16)),3)

pred_L_logitP <- predict(model_L_logit,newdata, type = "response") 
pred_L_probitP <- predict(model_L_probit,newdata, type = "response")
pred_L_loglogP <- predict(model_L_loglog,newdata, type = "response") 

pred_r_logitP <- predict(model_r_logit,newdata, type = "response")
pred_r_probitP <- predict(model_r_probit,newdata, type = "response")
pred_r_loglogP <- predict(model_r_loglog,newdata, type = "response")

name <- c("logit","probit","cloglog")
probability_L <- c(pred_L_logitP,pred_L_probitP,pred_L_loglogP)
probability_R <- c(pred_r_logitP,pred_r_probitP,pred_r_loglogP)
interval_L <- c(logit_inter_L,probit_inter_L,loglog_inter_L)
interval_R <- c(logit_inter_R,probit_inter_R,loglog_inter_R)
table_P <- data.frame(name,probability_L,probability_R)
table_P
##      name probability_L probability_R
## 1   logit             1     0.9997890
## 2  probit             1     0.9999994
## 3 cloglog             1     1.0000000

the probability for the three modesl of long.landing are all 1. the probability of the risky.landing for logit-model is 0.9998,probit-model is 0.999 and cloglog-model is 1. the interval for all the three models of the long.landing are from 1 to 1. the interval of the logit-model is from 0.989 to 0.991. the interval of the probit-model is from 0.99 to 0.99. the interval of the clonglong-model is from 1 to 1.

Again, please work on the cleaned FAA data set you prepared by carrying out Steps 1‐9 in Part 1 of the project. Create a multinomial variable and attach it to your data set.***

From the dataset from the part1, get the clean dataset. create multinomial variable Y which contains three factors:1,2,3. attach to the orgianl dataset. Discard the continuous data for “distance” and “speed_air”. The new dataset FAA contains 831 observations and 7 variables.

FAA_clean <- read.csv(file="C:/Users/yizha/Desktop/Spring 2020/Statistical Modeling/week-3/FAA.csv")
Y.distance <- ifelse(FAA_clean$distance<1000,1,ifelse(FAA_clean$distance>=1000 & FAA_clean$distance <2500,2,3))
FAA <- cbind(FAA_clean,Y.distance)
aircraft <- FAA$ï..aircraft
FAA <- cbind(FAA[,-c(1,4,7)],aircraft)
FAA$Y.distance <- as.factor(FAA$Y.distance)
str(FAA)
## 'data.frame':    831 obs. of  7 variables:
##  $ no_pasg     : int  36 38 40 41 43 44 45 45 45 45 ...
##  $ speed_ground: num  47.5 85.2 80.6 97.6 82.5 ...
##  $ height      : num  14 37 28.6 38.4 30.1 ...
##  $ pitch       : num  4.3 4.12 3.62 3.53 4.09 ...
##  $ duration    : num  172 188 93.5 123.3 109.2 ...
##  $ Y.distance  : Factor w/ 3 levels "1","2","3": 1 2 2 2 2 2 1 1 2 2 ...
##  $ aircraft    : Factor w/ 2 levels "airbus","boeing": 1 1 1 1 1 1 1 1 1 1 ...
summary(FAA)
##     no_pasg       speed_ground        height           pitch      
##  Min.   :29.00   Min.   : 33.57   Min.   : 6.228   Min.   :2.284  
##  1st Qu.:55.00   1st Qu.: 66.20   1st Qu.:23.530   1st Qu.:3.640  
##  Median :60.00   Median : 79.79   Median :30.167   Median :4.001  
##  Mean   :60.06   Mean   : 79.54   Mean   :30.458   Mean   :4.005  
##  3rd Qu.:65.00   3rd Qu.: 91.91   3rd Qu.:37.004   3rd Qu.:4.370  
##  Max.   :87.00   Max.   :132.78   Max.   :59.946   Max.   :5.927  
##     duration      Y.distance   aircraft  
##  Min.   : 41.95   1:269      airbus:444  
##  1st Qu.:122.67   2:459      boeing:387  
##  Median :154.78   3:103                  
##  Mean   :154.78                          
##  3rd Qu.:186.37                          
##  Max.   :305.62

Fit a multinomial model using the FAA dataset. we model “Y.distance” as an multinomial variable without an order, build a model with the rest of variables as expanatory variables. the significant factors are speed_ground, height, aircraft. the AIC is 450.0933

Use step(FAA.multi) to decide the best model, the AIC of the best model is 446.25. the significant variables are speed_ground, height, pitch, aircraft. The difference between diviances is 4.16. difference between degrees of freedom is 4,because the model has two sets of beta. chisq test is 0.616>0.5. there is no difference between full model and small model

Conclusion, build the final model(mod.final) which AIC=446.95. factors: speed_ground, height, aircraft.

library(rattle.data)
FAA.multi <- multinom(Y.distance ~.,data=FAA)
## # weights:  24 (14 variable)
## initial  value 912.946812 
## iter  10 value 536.529296
## iter  20 value 230.708682
## iter  30 value 212.146484
## iter  40 value 211.724448
## iter  50 value 211.271644
## final  value 211.046645 
## converged
summary(FAA.multi)
## Call:
## multinom(formula = Y.distance ~ ., data = FAA)
## 
## Coefficients:
##   (Intercept)     no_pasg speed_ground    height      pitch     duration
## 2   -20.56921 -0.02514318    0.2501309 0.1501214 -0.2512152 -0.003558517
## 3  -142.01342 -0.02168823    1.2826558 0.4030957  1.2326369  0.001730862
##   aircraftboeing
## 2       4.116231
## 3       9.305422
## 
## Std. Errors:
##   (Intercept)    no_pasg speed_ground     height     pitch    duration
## 2  2.28688389 0.01751070   0.02022332 0.01744858 0.2645913 0.002811905
## 3  0.03815991 0.05725845   0.04040976 0.04783759 0.7561328 0.008245808
##   aircraftboeing
## 2      0.4264189
## 3      0.8588740
## 
## Residual Deviance: 422.0933 
## AIC: 450.0933
step.FAA.multi <- step(FAA.multi)
## Start:  AIC=450.09
## Y.distance ~ no_pasg + speed_ground + height + pitch + duration + 
##     aircraft
## 
## trying - no_pasg 
## # weights:  21 (12 variable)
## initial  value 912.946812 
## iter  10 value 555.618888
## iter  20 value 230.885714
## iter  30 value 217.403847
## iter  40 value 215.867871
## iter  50 value 212.096386
## final  value 212.096381 
## converged
## trying - speed_ground 
## # weights:  21 (12 variable)
## initial  value 912.946812 
## iter  10 value 764.747199
## final  value 752.698851 
## converged
## trying - height 
## # weights:  21 (12 variable)
## initial  value 912.946812 
## iter  10 value 513.914601
## iter  20 value 287.856236
## iter  30 value 277.621068
## iter  40 value 277.531183
## final  value 277.526512 
## converged
## trying - pitch 
## # weights:  21 (12 variable)
## initial  value 912.946812 
## iter  10 value 540.109242
## iter  20 value 229.862098
## iter  30 value 216.002072
## iter  40 value 214.662260
## final  value 213.268358 
## converged
## trying - duration 
## # weights:  21 (12 variable)
## initial  value 912.946812 
## iter  10 value 555.798593
## iter  20 value 230.227634
## iter  30 value 215.237197
## iter  40 value 213.791085
## iter  50 value 212.082977
## iter  50 value 212.082976
## iter  50 value 212.082976
## final  value 212.082976 
## converged
## trying - aircraft 
## # weights:  21 (12 variable)
## initial  value 912.946812 
## iter  10 value 555.759252
## iter  20 value 313.605502
## iter  30 value 309.171696
## iter  40 value 309.153144
## final  value 309.151364 
## converged
##                Df       AIC
## - duration     12  448.1660
## - no_pasg      12  448.1928
## <none>         14  450.0933
## - pitch        12  450.5367
## - height       12  579.0530
## - aircraft     12  642.3027
## - speed_ground 12 1529.3977
## # weights:  21 (12 variable)
## initial  value 912.946812 
## iter  10 value 555.798593
## iter  20 value 230.227634
## iter  30 value 215.237197
## iter  40 value 213.791085
## iter  50 value 212.082977
## iter  50 value 212.082976
## iter  50 value 212.082976
## final  value 212.082976 
## converged
## 
## Step:  AIC=448.17
## Y.distance ~ no_pasg + speed_ground + height + pitch + aircraft
## 
## trying - no_pasg 
## # weights:  18 (10 variable)
## initial  value 912.946812 
## iter  10 value 381.601980
## iter  20 value 226.920380
## iter  30 value 218.193791
## iter  40 value 213.134854
## final  value 213.129113 
## converged
## trying - speed_ground 
## # weights:  18 (10 variable)
## initial  value 912.946812 
## iter  10 value 759.229933
## final  value 755.295778 
## converged
## trying - height 
## # weights:  18 (10 variable)
## initial  value 912.946812 
## iter  10 value 436.645262
## iter  20 value 280.500281
## iter  30 value 279.405907
## iter  40 value 279.375824
## iter  40 value 279.375822
## iter  40 value 279.375822
## final  value 279.375822 
## converged
## trying - pitch 
## # weights:  18 (10 variable)
## initial  value 912.946812 
## iter  10 value 439.090617
## iter  20 value 226.673256
## iter  30 value 219.383694
## iter  40 value 214.442879
## final  value 214.431068 
## converged
## trying - aircraft 
## # weights:  18 (10 variable)
## initial  value 912.946812 
## iter  10 value 498.723154
## iter  20 value 311.411199
## iter  30 value 310.489234
## final  value 310.455712 
## converged
##                Df       AIC
## - no_pasg      10  446.2582
## <none>         12  448.1660
## - pitch        10  448.8621
## - height       10  578.7516
## - aircraft     10  640.9114
## - speed_ground 10 1530.5916
## # weights:  18 (10 variable)
## initial  value 912.946812 
## iter  10 value 381.601980
## iter  20 value 226.920380
## iter  30 value 218.193791
## iter  40 value 213.134854
## final  value 213.129113 
## converged
## 
## Step:  AIC=446.26
## Y.distance ~ speed_ground + height + pitch + aircraft
## 
## trying - speed_ground 
## # weights:  15 (8 variable)
## initial  value 912.946812 
## iter  10 value 755.930587
## final  value 755.576400 
## converged
## trying - height 
## # weights:  15 (8 variable)
## initial  value 912.946812 
## iter  10 value 338.609900
## iter  20 value 283.368322
## iter  30 value 280.417649
## final  value 280.107902 
## converged
## trying - pitch 
## # weights:  15 (8 variable)
## initial  value 912.946812 
## iter  10 value 358.336150
## iter  20 value 230.468155
## iter  30 value 219.198168
## iter  40 value 215.476362
## iter  40 value 215.476360
## iter  40 value 215.476360
## final  value 215.476360 
## converged
## trying - aircraft 
## # weights:  15 (8 variable)
## initial  value 912.946812 
## iter  10 value 371.812672
## iter  20 value 311.876265
## iter  30 value 311.256957
## final  value 311.208398 
## converged
##                Df       AIC
## <none>         10  446.2582
## - pitch         8  446.9527
## - height        8  576.2158
## - aircraft      8  638.4168
## - speed_ground  8 1527.1528
summary(step.FAA.multi)
## Call:
## multinom(formula = Y.distance ~ speed_ground + height + pitch + 
##     aircraft, data = FAA)
## 
## Coefficients:
##   (Intercept) speed_ground    height      pitch aircraftboeing
## 2   -22.47693    0.2483347 0.1483717 -0.2432098       4.089318
## 3  -142.24754    1.2709771 0.4062396  1.2946709       9.220361
## 
## Std. Errors:
##   (Intercept) speed_ground     height     pitch aircraftboeing
## 2  2.06661064   0.01997638 0.01731625 0.2638094      0.4225622
## 3  0.03615591   0.02862813 0.03922743 0.7353315      0.8463685
## 
## Residual Deviance: 426.2582 
## AIC: 446.2582
deviance(step.FAA.multi)-deviance(FAA.multi)
## [1] 4.164935
FAA.multi$edf-step.FAA.multi$edf
## [1] 4
pchisq(deviance(step.FAA.multi)-deviance(FAA.multi),FAA.multi$edf-step.FAA.multi$edf,lower = F)
## [1] 0.3841443
mod.final <- multinom(Y.distance ~ speed_ground+height+aircraft,data=FAA)
## # weights:  15 (8 variable)
## initial  value 912.946812 
## iter  10 value 358.336150
## iter  20 value 230.468155
## iter  30 value 219.198168
## iter  40 value 215.476362
## iter  40 value 215.476360
## iter  40 value 215.476360
## final  value 215.476360 
## converged
summary(mod.final)
## Call:
## multinom(formula = Y.distance ~ speed_ground + height + aircraft, 
##     data = FAA)
## 
## Coefficients:
##   (Intercept) speed_ground    height aircraftboeing
## 2   -23.28484    0.2472743 0.1467859       3.982905
## 3  -126.43265    1.1756019 0.3782799       9.040905
## 
## Std. Errors:
##   (Intercept) speed_ground     height aircraftboeing
## 2  1.88720542   0.01980816 0.01714538      0.4027433
## 3  0.04519312   0.01276020 0.03604886      0.7502719
## 
## Residual Deviance: 430.9527 
## AIC: 446.9527

Check where the mean value lie for all the variables at different Y.

variables <- c("no_pasg","speed_groud*","height*","pitch","duration")
means <- matrix(c(no_pasg.mean,speedground.mean,height.mean,pitch.mean,duration.mean),nr=5,byrow = TRUE)
table.0 <- data.frame(variables,means)
colnames(table.0) <- c("variables","Y.distance=1","Y.distance=2","Y.distance=3")
table.0
##      variables Y.distance=1 Y.distance=2 Y.distance=3
## 1      no_pasg    60.256506    60.017429    59.699029
## 2 speed_groud*    62.440660    82.623547   110.478035
## 3      height*    28.143213    31.651764    31.182580
## 4        pitch     3.960774     4.009451     4.101969
## 5     duration   160.344060   151.985262   152.668304
ggplot(FAA, aes(x=aircraft, fill=Y.distance)) + 
  geom_bar(position = 'dodge')

ggplot(FAA, aes(x=Y.distance, y=speed_ground)) +
  geom_boxplot()

  1. the model with factors speed_ground, height, pitch, aircraft are significant.

  2. table.0 shows the mean of each group. table.0 makes sense that the higher speed_groud the longer distance the aircraft needs to land. same as the height and pitch. but when the distance >2500, the height and pitch do not have a significant impact on the distance.

3)boeing has higher pproportion of flights with longer distance compare to airbus.

  1. there is a significant increase in speed_ground with the distance.

  2. one-unit increase in speed_ground

    increases the odds of Y=2 by 1.28 relative to Y=1

    increases the odds of Y=3 by 3.24 relative to Y=1

    one-unit increase in height

    increases the odds of Y=2 by 1.15 relative to Y=1

    increases the odds of Y=3 by 1.45 relative to Y=1

    When looking at Boeing aircraft as compared to Airbus,

    odds of Y=2 increases by 53.67 relative to Y=1

    odds of Y=3 increases by 8441 relative to Y=1

The number of passengers is often of interest of airlines. What distribution would you use to model this variable? Do we have any variables (in the FAA data set) that are useful for predicting the number of passengers on board?

The distribution of the no_pasg is normally distributed. The poisson distribution will be use to model the number of passengers. use the variables to bulid a model to predict the number of passengers. from the model.poisson the variables are not significant, the information in the FAA dataset is not enough for predicting the number of passengers on board.

table(FAA$no_pasg)
## 
## 29 36 38 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 
##  1  1  1  1  1  1  2  5  6  9  6 16 17 20 17 28 28 33 32 45 27 38 43 52 53 35 
## 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 82 87 
## 50 41 29 37 26 26 26 17 13  8  7  7  7  6  5  2  1  2  2  1
plot(table(FAA$no_pasg))

model.possion <- glm(FAA$no_pasg~.,family = poisson, FAA)
summary(model.possion)
## 
## Call:
## glm(formula = FAA$no_pasg ~ ., family = poisson, data = FAA)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.3682  -0.6733   0.0186   0.6354   3.1512  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     4.073e+00  5.349e-02  76.156   <2e-16 ***
## speed_ground    4.910e-04  4.379e-04   1.121   0.2622    
## height          8.188e-04  4.859e-04   1.685   0.0919 .  
## pitch          -3.038e-03  9.123e-03  -0.333   0.7391    
## duration       -1.045e-04  9.605e-05  -1.088   0.2766    
## Y.distance2    -1.773e-02  1.440e-02  -1.231   0.2183    
## Y.distance3    -3.616e-02  2.750e-02  -1.315   0.1885    
## aircraftboeing  1.253e-03  1.054e-02   0.119   0.9054    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 783.06  on 830  degrees of freedom
## Residual deviance: 777.92  on 823  degrees of freedom
## AIC: 5720.1
## 
## Number of Fisher Scoring iterations: 4