Packages Required

library(tidyverse)  #to visualize, transform, input, tidy and join data
library(dplyr)      #data wrangling
library(kableExtra) #to create HTML Table
library(DT)         #to preview the data sets
library(xlsx)       #to load excel files
library(nnet)       #to implement multinomial function

Taking Distance as a multinomial factor

Load the datasets and perform the initial data cleaning (the detailed steps were covered in the first assignment).

# Read the two FAA landing workbooks and stack them into one data frame.
# bind_rows() fills any column that is missing from one input with NA.
faa1 <- read.xlsx("FAA1.xls", sheetName = "FAA1")
faa2 <- read.xlsx("FAA2_2.xls", sheetName = "Sheet1")
faa <- bind_rows(faa1, faa2)

# Row positions that repeat an earlier row once `duration` is ignored.
check <- faa %>%
  select(-duration) %>%
  duplicated() %>%
  which()

# Negative indexing with an empty index vector selects *no* rows, so only
# drop rows when at least one duplicate was actually found.
if (length(check) > 0) {
  faa <- faa[-check, ]
}

# Keep only plausible landings: duration above 40 (or missing), ground speed
# within [30, 140], height of at least 6, and distance below 6000.
# Comma-separated conditions in filter() are combined with AND.
faa_check <- faa %>%
  filter(
    duration > 40 | is.na(duration),
    speed_ground >= 30,
    speed_ground <= 140,
    height >= 6,
    distance < 6000
  )
faa <- faa_check

We will create a multinomial variable for distance.

# Bin landing distance into three categories stored in Y, then drop distance:
#   Y = 1 for distance < 1000, Y = 2 for 1000 <= distance < 2000, Y = 3 otherwise.
# findInterval() uses left-closed intervals and counts from 0, hence the + 1.
faa1 <- faa %>%
  mutate(Y = findInterval(distance, c(1000, 2000)) + 1) %>%
  select(-distance)

Now, we will use multinomial model to fit Y. We treat the new variable Y as categorical under the assumption that the levels of Y have no natural ordering.

# Prepare the modelling frame: Y becomes a factor, speed_air is dropped,
# and every row that still contains a missing value is removed.
faa1 <- faa1 %>%
  mutate(Y = as.factor(Y)) %>%
  select(-speed_air) %>%
  na.omit()

# Multinomial logit model of the distance category on all remaining predictors.
mmod <- multinom(
  Y ~ aircraft + duration + no_pasg + speed_ground + pitch + height,
  data = faa1
)
## # weights:  24 (14 variable)
## initial  value 858.016197 
## iter  10 value 581.199724
## iter  20 value 236.239538
## iter  30 value 226.272595
## iter  40 value 226.221087
## iter  50 value 226.174784
## final  value 226.164124 
## converged
#summary(mmod)

Backward stepwise selection based on AIC gives the following model:

# Stepwise selection by AIC starting from the full multinomial model; the
# trace below shows one term being dropped per step until AIC stops improving.
mmodi <- step(mmod)
## Start:  AIC=480.33
## Y ~ aircraft + duration + no_pasg + speed_ground + pitch + height
## 
## trying - aircraft 
## # weights:  21 (12 variable)
## initial  value 858.016197 
## iter  10 value 599.571323
## iter  20 value 336.226736
## iter  30 value 334.049994
## iter  40 value 334.045290
## final  value 334.043139 
## converged
## trying - duration 
## # weights:  21 (12 variable)
## initial  value 858.016197 
## iter  10 value 518.225762
## iter  20 value 239.417331
## iter  30 value 228.258109
## iter  40 value 227.608269
## final  value 227.113768 
## converged
## trying - no_pasg 
## # weights:  21 (12 variable)
## initial  value 858.016197 
## iter  10 value 602.083949
## iter  20 value 240.843149
## iter  30 value 229.631112
## iter  40 value 228.963807
## final  value 228.210500 
## converged
## trying - speed_ground 
## # weights:  21 (12 variable)
## initial  value 858.016197 
## iter  10 value 804.987828
## final  value 794.030499 
## converged
## trying - pitch 
## # weights:  21 (12 variable)
## initial  value 858.016197 
## iter  10 value 582.954258
## iter  20 value 237.697711
## iter  30 value 228.361531
## iter  40 value 227.746783
## final  value 227.327445 
## converged
## trying - height 
## # weights:  21 (12 variable)
## initial  value 858.016197 
## iter  10 value 535.918721
## iter  20 value 302.074403
## iter  30 value 298.921547
## iter  40 value 298.913287
## final  value 298.909431 
## converged
##                Df       AIC
## - duration     12  478.2275
## - pitch        12  478.6549
## <none>         14  480.3282
## - no_pasg      12  480.4210
## - height       12  621.8189
## - aircraft     12  692.0863
## - speed_ground 12 1612.0610
## # weights:  21 (12 variable)
## initial  value 858.016197 
## iter  10 value 518.225762
## iter  20 value 239.417331
## iter  30 value 228.258109
## iter  40 value 227.608269
## final  value 227.113768 
## converged
## 
## Step:  AIC=478.23
## Y ~ aircraft + no_pasg + speed_ground + pitch + height
## 
## trying - aircraft 
## # weights:  18 (10 variable)
## initial  value 858.016197 
## iter  10 value 465.319936
## iter  20 value 335.496129
## iter  30 value 335.298663
## final  value 335.293692 
## converged
## trying - no_pasg 
## # weights:  18 (10 variable)
## initial  value 858.016197 
## iter  10 value 444.634863
## iter  20 value 238.604117
## iter  30 value 230.208561
## iter  40 value 229.083783
## iter  40 value 229.083781
## iter  40 value 229.083781
## final  value 229.083781 
## converged
## trying - speed_ground 
## # weights:  18 (10 variable)
## initial  value 858.016197 
## iter  10 value 799.384355
## final  value 796.687299 
## converged
## trying - pitch 
## # weights:  18 (10 variable)
## initial  value 858.016197 
## iter  10 value 488.119245
## iter  20 value 236.788749
## iter  30 value 228.715466
## iter  40 value 228.221740
## final  value 228.220496 
## converged
## trying - height 
## # weights:  18 (10 variable)
## initial  value 858.016197 
## iter  10 value 454.961171
## iter  20 value 300.709209
## iter  30 value 300.281087
## final  value 300.277702 
## converged
##                Df       AIC
## - pitch        10  476.4410
## - no_pasg      10  478.1676
## <none>         12  478.2275
## - height       10  620.5554
## - aircraft     10  690.5874
## - speed_ground 10 1613.3746
## # weights:  18 (10 variable)
## initial  value 858.016197 
## iter  10 value 488.119245
## iter  20 value 236.788749
## iter  30 value 228.715466
## iter  40 value 228.221740
## final  value 228.220496 
## converged
## 
## Step:  AIC=476.44
## Y ~ aircraft + no_pasg + speed_ground + height
## 
## trying - aircraft 
## # weights:  15 (8 variable)
## initial  value 858.016197 
## iter  10 value 394.729448
## iter  20 value 347.225190
## iter  30 value 347.118971
## final  value 347.118955 
## converged
## trying - no_pasg 
## # weights:  15 (8 variable)
## initial  value 858.016197 
## iter  10 value 354.469061
## iter  20 value 242.186395
## iter  30 value 231.398332
## final  value 230.124753 
## converged
## trying - speed_ground 
## # weights:  15 (8 variable)
## initial  value 858.016197 
## iter  10 value 797.491849
## final  value 797.489048 
## converged
## trying - height 
## # weights:  15 (8 variable)
## initial  value 858.016197 
## iter  10 value 363.948352
## iter  20 value 301.923434
## iter  30 value 301.017662
## final  value 301.004927 
## converged
##                Df       AIC
## - no_pasg       8  476.2495
## <none>         10  476.4410
## - height        8  618.0099
## - aircraft      8  710.2379
## - speed_ground  8 1610.9781
## # weights:  15 (8 variable)
## initial  value 858.016197 
## iter  10 value 354.469061
## iter  20 value 242.186395
## iter  30 value 231.398332
## final  value 230.124753 
## converged
## 
## Step:  AIC=476.25
## Y ~ aircraft + speed_ground + height
## 
## trying - aircraft 
## # weights:  12 (6 variable)
## initial  value 858.016197 
## iter  10 value 378.591341
## iter  20 value 351.025155
## final  value 350.780781 
## converged
## trying - speed_ground 
## # weights:  12 (6 variable)
## initial  value 858.016197 
## iter  10 value 797.741960
## final  value 797.741923 
## converged
## trying - height 
## # weights:  12 (6 variable)
## initial  value 858.016197 
## iter  10 value 345.462031
## iter  20 value 304.390327
## final  value 301.955008 
## converged
##                Df       AIC
## <none>          8  476.2495
## - height        6  615.9100
## - aircraft      6  713.5616
## - speed_ground  6 1607.4838
# Coefficients and standard errors of the AIC-selected multinomial model.
summary(mmodi)
## Call:
## multinom(formula = Y ~ aircraft + speed_ground + height, data = faa1)
## 
## Coefficients:
##   (Intercept) aircraftboeing speed_ground    height
## 2   -22.80493       3.863325    0.2393831 0.1516421
## 3  -103.34903       9.209321    1.0112370 0.3441978
## 
## Std. Errors:
##   (Intercept) aircraftboeing speed_ground     height
## 2  1.91262330      0.4056023  0.019916715 0.01813334
## 3  0.08472895      0.5723513  0.009967598 0.02808654
## 
## Residual Deviance: 460.2495 
## AIC: 476.2495

Now I want to see where the mean values lie for all the variables at different Y.

# Mean of each numeric predictor within every level of Y.
duration <- with(faa1, tapply(duration, Y, mean, na.rm = TRUE))
no_pasg <- with(faa1, tapply(no_pasg, Y, mean, na.rm = TRUE))
speed_ground <- with(faa1, tapply(speed_ground, Y, mean, na.rm = TRUE))
height <- with(faa1, tapply(height, Y, mean, na.rm = TRUE))
pitch <- with(faa1, tapply(pitch, Y, mean, na.rm = TRUE))

# Round to 3 decimals and transpose so that each predictor is a row and
# each level of Y is a column.
table <- as.data.frame(
  t(round(data.frame(duration, no_pasg, speed_ground, height, pitch), 3))
)
names(table) <- c("Y=1", "Y=2", "Y=3")

# Turn the row names into a regular leading `variable` column.
table$variable <- rownames(table)
rownames(table) <- NULL
table <- table %>%
  select(variable, everything())

table
##       variable     Y=1     Y=2     Y=3
## 1     duration 160.890 151.935 152.070
## 2      no_pasg  60.233  60.162  59.728
## 3 speed_ground  62.178  79.243 103.649
## 4       height  27.908  31.677  31.509
## 5        pitch   3.980   4.012   4.064
  1. A one-unit increase in speed_ground, holding the other predictors fixed,
  • multiplies the odds of Y=2 relative to Y=1 by about 1.27 (exp(0.239))

  • multiplies the odds of Y=3 relative to Y=1 by about 2.75 (exp(1.011))

  1. A one-unit increase in height, holding the other predictors fixed,
  • multiplies the odds of Y=2 relative to Y=1 by about 1.16 (exp(0.152))

  • multiplies the odds of Y=3 relative to Y=1 by about 1.41 (exp(0.344))

  1. For Boeing aircraft compared with Airbus, holding the other predictors fixed,
  • the odds of Y=2 relative to Y=1 are multiplied by about 47.6 (exp(3.863))

  • the odds of Y=3 relative to Y=1 are multiplied by about 9990 (exp(9.209))

  1. Aircraft, ground speed, and height are the variables retained in the final model.

  2. Ground speed increases markedly from one distance category to the next.

# Box plot of ground speed within each distance category Y.
faa1 %>%
  ggplot(aes(x = Y, y = speed_ground)) +
  geom_boxplot()

  1. There is a slight increase in the height of aircraft as distance increases.
# Box plot of height within each distance category Y.
faa1 %>%
  ggplot(aes(x = Y, y = height)) +
  geom_boxplot()

Question 2

Since the number of passengers is a count, I model it with a Poisson distribution.

# Drop speed_air, then remove every row that still has a missing value.
faa <- faa %>%
  select(-speed_air) %>%
  na.omit()

# Poisson regression of the passenger count on all remaining predictors.
modp <- glm(
  no_pasg ~ aircraft + speed_ground + height + pitch + duration + distance,
  family = poisson,
  data = faa
)
#summary(modp)

# Stepwise selection by AIC starting from the full model.
step_modp <- step(modp)
## Start:  AIC=5383.21
## no_pasg ~ aircraft + speed_ground + height + pitch + duration + 
##     distance
## 
##                Df Deviance    AIC
## - aircraft      1   739.18 5381.2
## - pitch         1   739.21 5381.2
## - speed_ground  1   739.88 5381.9
## - distance      1   740.08 5382.1
## - duration      1   740.25 5382.3
## - height        1   740.85 5382.9
## <none>              739.18 5383.2
## 
## Step:  AIC=5381.21
## no_pasg ~ speed_ground + height + pitch + duration + distance
## 
##                Df Deviance    AIC
## - pitch         1   739.22 5379.2
## - speed_ground  1   740.16 5380.2
## - duration      1   740.25 5380.3
## - distance      1   740.50 5380.5
## - height        1   740.96 5381.0
## <none>              739.18 5381.2
## 
## Step:  AIC=5379.25
## no_pasg ~ speed_ground + height + duration + distance
## 
##                Df Deviance    AIC
## - duration      1   740.27 5378.3
## - speed_ground  1   740.35 5378.4
## - distance      1   740.73 5378.8
## - height        1   741.02 5379.0
## <none>              739.22 5379.2
## 
## Step:  AIC=5378.3
## no_pasg ~ speed_ground + height + distance
## 
##                Df Deviance    AIC
## - speed_ground  1   741.41 5377.4
## - distance      1   741.73 5377.8
## - height        1   742.03 5378.1
## <none>              740.27 5378.3
## 
## Step:  AIC=5377.44
## no_pasg ~ height + distance
## 
##            Df Deviance    AIC
## - distance  1   741.73 5375.8
## - height    1   742.54 5376.6
## <none>          741.41 5377.4
## 
## Step:  AIC=5375.76
## no_pasg ~ height
## 
##          Df Deviance    AIC
## - height  1   742.75 5374.8
## <none>        741.73 5375.8
## 
## Step:  AIC=5374.78
## no_pasg ~ 1
# Summary of the model retained by the stepwise search.
summary(step_modp)
## 
## Call:
## glm(formula = no_pasg ~ 1, family = poisson, data = faa)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.4627  -0.6652  -0.0106   0.6261   3.2525  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 4.095709   0.004616   887.2   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 742.75  on 780  degrees of freedom
## Residual deviance: 742.75  on 780  degrees of freedom
## AIC: 5374.8
## 
## Number of Fisher Scoring iterations: 4

Thus, none of the variables appears useful for predicting the number of passengers: the AIC-selected model contains only an intercept.