Dataset

#read excel files
library("readxl")
faa1 <- read_excel('faa1.xls')
faa2 <- read_excel('faa2.xls')

#merge datasets
library('plyr')
faa <- rbind.fill(faa1,faa2)

#remove duplicated records
library('tidyverse')
faa.no.dup <- distinct(faa,aircraft,no_pasg,speed_ground,speed_air,height,pitch,distance,.keep_all = T)

#check abnormal values

height_abnormal <- sum(faa.no.dup$height<6)
duration_abnormal <- sum(faa.no.dup$duration<40)
speed_ground_abnormal <- sum(faa.no.dup$speed_ground<30 | faa.no.dup$speed_ground > 140)
speed_air_abnormal <- sum(faa.no.dup$speed_air<30 | faa.no.dup$speed_air > 140)

#remove rows with abnormal values
faa.no.dup <- faa.no.dup[faa.no.dup$height > 6,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground > 30,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground < 140,]
# Remove rows for blank values in duration column
faa.no.dup <- faa.no.dup[!is.na(faa.no.dup$duration), ]
str(faa.no.dup)

## 'data.frame':    787 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...

Create multinomial variable

faa.no.dup$Y <- ifelse(faa.no.dup$distance < 1000 , 1 , (ifelse(faa.no.dup$distance >=1000 & faa.no.dup$distance < 2500,2,3)))

# discard distance variable 
faa.no.dup$distance <- NULL
faa.no.dup$Y <- as.factor(faa.no.dup$Y)

#COnvert aircraft column to factor column
faa.no.dup$aircraft <- ifelse(faa.no.dup$aircraft == 'boeing', 1, 0)
faa.no.dup$aircraft <- as.factor(faa.no.dup$aircraft)

Create multinomial model without order

Note: We have not used speed_air variable as the model is not built correctly when using this variable. Also, we know that speed_air is highly correlated with speed_ground from previous exercises, hence this variable can be removed from the model

#install.packages("nnet")
library('nnet')
mmod <- multinom(Y ~ aircraft + duration + no_pasg + speed_ground + height +pitch , data = faa.no.dup)

## # weights:  24 (14 variable)
## initial  value 864.607871 
## iter  10 value 539.676454
## iter  20 value 217.648130
## iter  30 value 201.426407
## iter  40 value 201.003238
## iter  50 value 200.495995
## final  value 200.328849 
## converged

mmodsummary <- summary(mmod)
mmodsummary

## Call:
## multinom(formula = Y ~ aircraft + duration + no_pasg + speed_ground + 
##     height + pitch, data = faa.no.dup)
## 
## Coefficients:
##   (Intercept) aircraft1     duration     no_pasg speed_ground    height
## 2   -20.33682  4.090601 -0.003107794 -0.01856740     0.243910 0.1557553
## 3  -135.46203  9.086665  0.002174269 -0.01166523     1.225738 0.3915918
##        pitch
## 2 -0.3523123
## 3  0.9348272
## 
## Std. Errors:
##   (Intercept) aircraft1    duration    no_pasg speed_ground     height
## 2  2.33117597 0.4357412 0.002718939 0.01786958   0.02036963 0.01853599
## 3  0.04016127 0.8808222 0.008026612 0.05800040   0.04056543 0.04851647
##       pitch
## 2 0.2764486
## 3 0.7664359
## 
## Residual Deviance: 400.6577 
## AIC: 428.6577

Based on the above model, the significant variables are - speed_ground, aircraft, height and pitch.(The coefficients for these variables are outside the corresponding standard error range indicating that these are significant). Although the pitch is within the standard error range, its coefficient value is significant hence it may be significant which we will test using AIC

Lets perform variable selection using AIC to confirm the above results-

mmodi <- step(mmod)

## Start:  AIC=428.66
## Y ~ aircraft + duration + no_pasg + speed_ground + height + pitch
## 
## trying - aircraft 
## # weights:  21 (12 variable)
## initial  value 864.607871 
## iter  10 value 560.203043
## iter  20 value 299.447634
## iter  30 value 292.730755
## iter  40 value 292.713500
## final  value 292.712030 
## converged
## trying - duration 
## # weights:  21 (12 variable)
## initial  value 864.607871 
## iter  10 value 432.537959
## iter  20 value 216.594823
## iter  30 value 205.154985
## iter  40 value 203.711084
## iter  50 value 201.227632
## final  value 201.227610 
## converged
## trying - no_pasg 
## # weights:  21 (12 variable)
## initial  value 864.607871 
## iter  10 value 587.455079
## iter  20 value 218.840138
## iter  30 value 207.103381
## iter  40 value 204.037010
## iter  50 value 200.882016
## final  value 200.882014 
## converged
## trying - speed_ground 
## # weights:  21 (12 variable)
## initial  value 864.607871 
## iter  10 value 725.848413
## final  value 713.701466 
## converged
## trying - height 
## # weights:  21 (12 variable)
## initial  value 864.607871 
## iter  10 value 499.487557
## iter  20 value 273.737951
## iter  30 value 264.760545
## iter  40 value 264.644165
## final  value 264.643562 
## converged
## trying - pitch 
## # weights:  21 (12 variable)
## initial  value 864.607871 
## iter  10 value 544.567226
## iter  20 value 217.566142
## iter  30 value 205.268576
## iter  40 value 204.128188
## iter  50 value 202.445576
## iter  50 value 202.445575
## iter  50 value 202.445575
## final  value 202.445575 
## converged
##                Df       AIC
## - no_pasg      12  425.7640
## - duration     12  426.4552
## <none>         14  428.6577
## - pitch        12  428.8911
## - height       12  553.2871
## - aircraft     12  609.4241
## - speed_ground 12 1451.4029
## # weights:  21 (12 variable)
## initial  value 864.607871 
## iter  10 value 587.455079
## iter  20 value 218.840138
## iter  30 value 207.103381
## iter  40 value 204.037010
## iter  50 value 200.882016
## final  value 200.882014 
## converged
## 
## Step:  AIC=425.76
## Y ~ aircraft + duration + speed_ground + height + pitch
## 
## trying - aircraft 
## # weights:  18 (10 variable)
## initial  value 864.607871 
## iter  10 value 456.479180
## iter  20 value 296.035817
## iter  30 value 293.501212
## iter  40 value 293.230403
## final  value 293.230393 
## converged
## trying - duration 
## # weights:  18 (10 variable)
## initial  value 864.607871 
## iter  10 value 366.599898
## iter  20 value 213.575621
## iter  30 value 206.062874
## iter  40 value 201.783411
## final  value 201.768256 
## converged
## trying - speed_ground 
## # weights:  18 (10 variable)
## initial  value 864.607871 
## iter  10 value 717.233819
## final  value 713.942845 
## converged
## trying - height 
## # weights:  18 (10 variable)
## initial  value 864.607871 
## iter  10 value 452.233294
## iter  20 value 270.297430
## iter  30 value 265.287258
## iter  40 value 264.953367
## final  value 264.953361 
## converged
## trying - pitch 
## # weights:  18 (10 variable)
## initial  value 864.607871 
## iter  10 value 432.938740
## iter  20 value 217.194508
## iter  30 value 206.484038
## iter  40 value 202.988776
## final  value 202.988373 
## converged
##                Df       AIC
## - duration     10  423.5365
## <none>         12  425.7640
## - pitch        10  425.9767
## - height       10  549.9067
## - aircraft     10  606.4608
## - speed_ground 10 1447.8857
## # weights:  18 (10 variable)
## initial  value 864.607871 
## iter  10 value 366.599898
## iter  20 value 213.575621
## iter  30 value 206.062874
## iter  40 value 201.783411
## final  value 201.768256 
## converged
## 
## Step:  AIC=423.54
## Y ~ aircraft + speed_ground + height + pitch
## 
## trying - aircraft 
## # weights:  15 (8 variable)
## initial  value 864.607871 
## iter  10 value 352.205797
## iter  20 value 294.916685
## iter  30 value 294.436044
## final  value 294.420330 
## converged
## trying - speed_ground 
## # weights:  15 (8 variable)
## initial  value 864.607871 
## iter  10 value 716.633672
## final  value 716.572509 
## converged
## trying - height 
## # weights:  15 (8 variable)
## initial  value 864.607871 
## iter  10 value 321.686900
## iter  20 value 268.119010
## iter  30 value 266.439633
## final  value 266.221923 
## converged
## trying - pitch 
## # weights:  15 (8 variable)
## initial  value 864.607871 
## iter  10 value 340.652542
## iter  20 value 219.046480
## iter  30 value 206.742675
## iter  40 value 204.013064
## final  value 204.013034 
## converged
##                Df       AIC
## <none>         10  423.5365
## - pitch         8  424.0261
## - height        8  548.4438
## - aircraft      8  604.8407
## - speed_ground  8 1449.1450

The variable selection using AIC confirms that the significant variables are speed_ground, aircraft,height and pitch.

Model comparison

deviance(mmodi) - deviance(mmod)

## [1] 2.878813

mmod$edf - mmodi$edf

## [1] 4

pchisq(deviance(mmodi)-deviance(mmod),mmod$edf - mmodi$edf,lower = F)

## [1] 0.5783063

Create Table for significant variables

model <- c('Model1(p2/p1)','Model1(p2/p1)','Model1(p2/p1)','Model1(p2/p1)','Model2(p3/p1)','Model2(p3/p1)','Model2(p3/p1)','Model2(p3/p1)')
vars <- c('aircraft','speed_ground','height','pitch','aircraft','speed_ground','height','pitch')
varindex <- c(2,5,6,7,2,5,6,7)
fittedcoeffs <- vector()
fittedoddratios <- vector()
fittedse <- vector()

for(i in 1:length(vars)){
if(i <=4){
fittedcoeffs[i] <- mmodsummary$coefficients[1,varindex[i]]
fittedse[i] <- mmodsummary$standard.errors[1,varindex[i]]
fittedoddratios[i] <- exp(fittedcoeffs[i])
}
else{
fittedcoeffs[i] <- mmodsummary$coefficients[2,varindex[i]]
fittedse[i] <- mmodsummary$standard.errors[2,varindex[i]]
fittedoddratios[i] <- exp(fittedcoeffs[i])
}
}

table <- tibble(
  Model = model,
  variable = vars, 
  coefficient = fittedcoeffs,
  standarderror = fittedse,
  odds_ratio = fittedoddratios
)

table

## # A tibble: 8 x 5
##   Model         variable     coefficient standarderror odds_ratio
##   <chr>         <chr>              <dbl>         <dbl>      <dbl>
## 1 Model1(p2/p1) aircraft           4.09         0.436      59.8  
## 2 Model1(p2/p1) speed_ground       0.244        0.0204      1.28 
## 3 Model1(p2/p1) height             0.156        0.0185      1.17 
## 4 Model1(p2/p1) pitch             -0.352        0.276       0.703
## 5 Model2(p3/p1) aircraft           9.09         0.881    8837.   
## 6 Model2(p3/p1) speed_ground       1.23         0.0406      3.41 
## 7 Model2(p3/p1) height             0.392        0.0485      1.48 
## 8 Model2(p3/p1) pitch              0.935        0.766       2.55

When the aircraft maker is boeing, The odds of Y = 2(1000< landing distance < 2500) increases by 59 times and almost 8.8k times for Y=3(landing distance > 2500)
The odds of Y = 2(1000< landing distance < 2500) increases by 1.27 times with speed_ground and 3.4 times for Y=3(landing distance > 2500) with speed_ground.
The odds of Y = 2(1000< landing distance < 2500) increases by 1.16 times with height and 1.47 times for Y=3(landing distance > 2500) with height
The odds of Y = 2(1000< landing distance < 2500) increases by 0.7 times with pitch and 2.54 times for Y=3(landing distance > 2500) with pitch

Plotting graphs

#install.packages('dplyr')
library('dplyr')
library('ggplot2')
lds <-mutate(faa.no.dup, speedground =cut_number(speed_ground,10)) %>% group_by(speedground, Y) %>% summarise(count=n()) %>% group_by(speedground) %>% mutate(etotal=sum(count), proportion=count/etotal)

ldh <-mutate(faa.no.dup, height =cut_number(height,10)) %>% group_by(height, Y) %>% summarise(count=n()) %>% group_by(height) %>% mutate(etotal=sum(count), proportion=count/etotal)

lda <-group_by(faa.no.dup,aircraft, Y) %>% summarise(count=n()) %>% group_by(aircraft) %>% mutate(etotal=sum(count), proportion=count/etotal)

ldp <-mutate(faa.no.dup, pitch =cut_number(pitch,5)) %>% group_by(pitch, Y) %>% summarise(count=n()) %>% group_by(pitch) %>% mutate(etotal=sum(count), proportion=count/etotal)

ggplot(lds, aes(x=speedground, y=proportion, group=Y, linetype=Y)) + geom_line()

The proportion of Y=1(landing distance < 1000) decreases with increasing speed while proportion for Y=2(1000 < distance < 2500) increases until speeds of around 90 and starts to decrease as speed increases further. The proportion of landing Y=3(distance >2500) starts for speeds around 95mph and continously increases with increase in speed.

ggplot(ldh, aes(x=height, y=proportion, group=Y, linetype=Y)) + geom_line()

The proportion of Y=1(landing distance < 1000) overall decreases with increase in height while proportion for Y=2(1000 < landing distance <2500) and proportion of Y=3(landing distance >2500) overall increases with increase in height.

ggplot(lda, aes(x=aircraft, y=proportion, group=Y, linetype=Y)) + geom_line()

Aircraft variable is an important variable in determining the landing distance. The proportion of Y=1(landing distance < 1000) is lesser when the aircraft maker is boeing while proportion for Y=2(1000< landing distance < 2500) and proportion of Y=3(landing distance >2500) is higher for boeing

ggplot(ldp, aes(x=pitch, y=proportion, group=Y, linetype=Y)) + geom_line()

The proportion of Y=1(landing distance < 1000 overall decreases with increase in pitch while proportion for Y=2(1000 < landing distance < 2500) does not vary much with increase in pitch.The proportion of Y=3(landing distance >2500) increases with increase in pitch. Although it can be seen there is not a strong relation between pitch and Y

Conclusions:

speed_ground, aircraft type, height and pitch are important variables in determining the probabilities of landing distance being less than 1000, between 1000 and 2500 and greater than 2500.
When the aircraft maker is boeing, The odds of landing distance between 1000 and 2500 increases by 59 times and almost 8.8k times for landing distance greater than 2500

Q2. Number Of Passengers

Poisson Distribution should be used to model the no_pasg variable as this is a count data

modelp <- glm(no_pasg ~ aircraft + duration + speed_ground + height + 
    pitch + Y, family = poisson ,faa.no.dup)
summary(modelp)

## 
## Call:
## glm(formula = no_pasg ~ aircraft + duration + speed_ground + 
##     height + pitch + Y, family = poisson, data = faa.no.dup)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.3922  -0.6597   0.0274   0.6186   3.1690  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   4.084e+00  5.438e-02  75.105   <2e-16 ***
## aircraft1    -1.946e-03  1.085e-02  -0.179    0.858    
## duration     -8.455e-05  9.379e-05  -0.902    0.367    
## speed_ground  3.297e-04  4.430e-04   0.744    0.457    
## height        5.749e-04  5.000e-04   1.150    0.250    
## pitch        -2.122e-03  9.479e-03  -0.224    0.823    
## Y2           -1.191e-02  1.477e-02  -0.806    0.420    
## Y3           -2.713e-02  2.807e-02  -0.966    0.334    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 745.26  on 786  degrees of freedom
## Residual deviance: 742.16  on 779  degrees of freedom
## AIC: 5423.6
## 
## Number of Fisher Scoring iterations: 4

We observe from the above model that there are no significant variables which can predict the no_pasg data

Lets confirm this using the AIC method

step(modelp)

## Start:  AIC=5423.64
## no_pasg ~ aircraft + duration + speed_ground + height + pitch + 
##     Y
## 
##                Df Deviance    AIC
## - Y             2   743.11 5420.6
## - aircraft      1   742.19 5421.7
## - pitch         1   742.21 5421.7
## - speed_ground  1   742.71 5422.2
## - duration      1   742.97 5422.5
## - height        1   743.48 5423.0
## <none>              742.16 5423.6
## 
## Step:  AIC=5420.59
## no_pasg ~ aircraft + duration + speed_ground + height + pitch
## 
##                Df Deviance    AIC
## - speed_ground  1   743.12 5418.6
## - pitch         1   743.16 5418.6
## - aircraft      1   743.50 5419.0
## - duration      1   743.87 5419.3
## - height        1   743.91 5419.4
## <none>              743.11 5420.6
## 
## Step:  AIC=5418.6
## no_pasg ~ aircraft + duration + height + pitch
## 
##            Df Deviance    AIC
## - pitch     1   743.18 5416.7
## - aircraft  1   743.52 5417.0
## - duration  1   743.87 5417.4
## - height    1   743.94 5417.4
## <none>          743.12 5418.6
## 
## Step:  AIC=5416.66
## no_pasg ~ aircraft + duration + height
## 
##            Df Deviance    AIC
## - aircraft  1   743.76 5415.2
## - duration  1   743.91 5415.4
## - height    1   743.97 5415.5
## <none>          743.18 5416.7
## 
## Step:  AIC=5415.25
## no_pasg ~ duration + height
## 
##            Df Deviance    AIC
## - duration  1   744.44 5413.9
## - height    1   744.57 5414.1
## <none>          743.76 5415.2
## 
## Step:  AIC=5413.92
## no_pasg ~ height
## 
##          Df Deviance    AIC
## - height  1   745.26 5412.7
## <none>        744.44 5413.9
## 
## Step:  AIC=5412.74
## no_pasg ~ 1

## 
## Call:  glm(formula = no_pasg ~ 1, family = poisson, data = faa.no.dup)
## 
## Coefficients:
## (Intercept)  
##       4.096  
## 
## Degrees of Freedom: 786 Total (i.e. Null);  786 Residual
## Null Deviance:       745.3 
## Residual Deviance: 745.3     AIC: 5413

AIC variable selection chooses no variable significant in determing the no_pasg data

drop1(modelp,test="Chisq")

## Single term deletions
## 
## Model:
## no_pasg ~ aircraft + duration + speed_ground + height + pitch + 
##     Y
##              Df Deviance    AIC     LRT Pr(>Chi)
## <none>            742.16 5423.6                 
## aircraft      1   742.19 5421.7 0.03217   0.8577
## duration      1   742.97 5422.5 0.81283   0.3673
## speed_ground  1   742.71 5422.2 0.55408   0.4567
## height        1   743.48 5423.0 1.32201   0.2502
## pitch         1   742.21 5421.7 0.05014   0.8228
## Y             2   743.11 5420.6 0.94599   0.6231

We can see there is a very slight change in AIC when we delete individual variables which means that these variables are not significant

Multinomial Logistic Regression in R

Anupam Shukla

03/01/2020