library(tidyverse) #to visualize, transform, input, tidy and join data
library(dplyr) #data wrangling
library(kableExtra) #to create HTML Table
library(DT) #to preview the data sets
library(xlsx) #to load excel files
library(nnet) #to implement multinomial function
Load the datasets and perform the initial data cleaning (the detailed steps were covered in the first assignment).
# Read the two FAA workbooks and stack them into a single data frame.
faa1 <- read.xlsx("FAA1.xls", sheetName = "FAA1")
faa2 <- read.xlsx("FAA2_2.xls", sheetName = "Sheet1")
faa <- bind_rows(faa1, faa2)

# Flag rows that repeat an earlier row on every column except `duration`
# (`duration` is excluded from the comparison).
is_dup <- faa %>%
  select(-duration) %>%
  duplicated()
check <- which(is_dup)
# Drop the duplicates with a logical mask. Negative indexing with
# `faa[-check, ]` would return ZERO rows whenever `check` is empty,
# i.e. whenever the data contain no duplicates at all.
faa <- faa[!is_dup, ]

# Keep only rows inside the accepted ranges. A missing `duration` is
# tolerated; a missing value in any other tested column drops the row.
faa_check <- faa %>%
  filter(
    duration > 40 | is.na(duration),
    speed_ground >= 30,
    speed_ground <= 140,
    height >= 6,
    distance < 6000
  )
faa <- faa_check
We will create a multinomial variable Y from distance.
# Bin `distance` into three classes and store the class in Y:
#   Y = 1 for distance < 1000, Y = 2 for 1000 <= distance < 2000, Y = 3 otherwise.
# findInterval() counts how many cut-points are <= distance (0, 1 or 2),
# so adding 1 yields the class label. `distance` itself is then dropped.
faa1 <- faa %>%
  mutate(Y = findInterval(distance, c(1000, 2000)) + 1) %>%
  select(-distance)
Next, we fit a multinomial model to Y. We treat the new variable Y as categorical, under the assumption that its levels have no natural ordering.
# Prepare the modelling frame: Y becomes a factor (nominal response),
# speed_air is dropped, and rows with any remaining NA are removed.
faa1 <- faa1 %>%
  mutate(Y = as.factor(Y)) %>%
  select(-speed_air) %>%
  na.omit()
# Multinomial model of the distance class on all remaining predictors.
mmod <- multinom(
  Y ~ aircraft + duration + no_pasg + speed_ground + pitch + height,
  data = faa1
)
## # weights: 24 (14 variable)
## initial value 858.016197
## iter 10 value 581.199724
## iter 20 value 236.239538
## iter 30 value 226.272595
## iter 40 value 226.221087
## iter 50 value 226.174784
## final value 226.164124
## converged
#summary(mmod)
Stepwise selection based on AIC gives the following model:
# Stepwise search on AIC, starting from the full multinomial fit `mmod`;
# the selected model is kept in `mmodi`.
mmodi <- step(object = mmod)
## Start: AIC=480.33
## Y ~ aircraft + duration + no_pasg + speed_ground + pitch + height
##
## trying - aircraft
## # weights: 21 (12 variable)
## initial value 858.016197
## iter 10 value 599.571323
## iter 20 value 336.226736
## iter 30 value 334.049994
## iter 40 value 334.045290
## final value 334.043139
## converged
## trying - duration
## # weights: 21 (12 variable)
## initial value 858.016197
## iter 10 value 518.225762
## iter 20 value 239.417331
## iter 30 value 228.258109
## iter 40 value 227.608269
## final value 227.113768
## converged
## trying - no_pasg
## # weights: 21 (12 variable)
## initial value 858.016197
## iter 10 value 602.083949
## iter 20 value 240.843149
## iter 30 value 229.631112
## iter 40 value 228.963807
## final value 228.210500
## converged
## trying - speed_ground
## # weights: 21 (12 variable)
## initial value 858.016197
## iter 10 value 804.987828
## final value 794.030499
## converged
## trying - pitch
## # weights: 21 (12 variable)
## initial value 858.016197
## iter 10 value 582.954258
## iter 20 value 237.697711
## iter 30 value 228.361531
## iter 40 value 227.746783
## final value 227.327445
## converged
## trying - height
## # weights: 21 (12 variable)
## initial value 858.016197
## iter 10 value 535.918721
## iter 20 value 302.074403
## iter 30 value 298.921547
## iter 40 value 298.913287
## final value 298.909431
## converged
## Df AIC
## - duration 12 478.2275
## - pitch 12 478.6549
## <none> 14 480.3282
## - no_pasg 12 480.4210
## - height 12 621.8189
## - aircraft 12 692.0863
## - speed_ground 12 1612.0610
## # weights: 21 (12 variable)
## initial value 858.016197
## iter 10 value 518.225762
## iter 20 value 239.417331
## iter 30 value 228.258109
## iter 40 value 227.608269
## final value 227.113768
## converged
##
## Step: AIC=478.23
## Y ~ aircraft + no_pasg + speed_ground + pitch + height
##
## trying - aircraft
## # weights: 18 (10 variable)
## initial value 858.016197
## iter 10 value 465.319936
## iter 20 value 335.496129
## iter 30 value 335.298663
## final value 335.293692
## converged
## trying - no_pasg
## # weights: 18 (10 variable)
## initial value 858.016197
## iter 10 value 444.634863
## iter 20 value 238.604117
## iter 30 value 230.208561
## iter 40 value 229.083783
## iter 40 value 229.083781
## iter 40 value 229.083781
## final value 229.083781
## converged
## trying - speed_ground
## # weights: 18 (10 variable)
## initial value 858.016197
## iter 10 value 799.384355
## final value 796.687299
## converged
## trying - pitch
## # weights: 18 (10 variable)
## initial value 858.016197
## iter 10 value 488.119245
## iter 20 value 236.788749
## iter 30 value 228.715466
## iter 40 value 228.221740
## final value 228.220496
## converged
## trying - height
## # weights: 18 (10 variable)
## initial value 858.016197
## iter 10 value 454.961171
## iter 20 value 300.709209
## iter 30 value 300.281087
## final value 300.277702
## converged
## Df AIC
## - pitch 10 476.4410
## - no_pasg 10 478.1676
## <none> 12 478.2275
## - height 10 620.5554
## - aircraft 10 690.5874
## - speed_ground 10 1613.3746
## # weights: 18 (10 variable)
## initial value 858.016197
## iter 10 value 488.119245
## iter 20 value 236.788749
## iter 30 value 228.715466
## iter 40 value 228.221740
## final value 228.220496
## converged
##
## Step: AIC=476.44
## Y ~ aircraft + no_pasg + speed_ground + height
##
## trying - aircraft
## # weights: 15 (8 variable)
## initial value 858.016197
## iter 10 value 394.729448
## iter 20 value 347.225190
## iter 30 value 347.118971
## final value 347.118955
## converged
## trying - no_pasg
## # weights: 15 (8 variable)
## initial value 858.016197
## iter 10 value 354.469061
## iter 20 value 242.186395
## iter 30 value 231.398332
## final value 230.124753
## converged
## trying - speed_ground
## # weights: 15 (8 variable)
## initial value 858.016197
## iter 10 value 797.491849
## final value 797.489048
## converged
## trying - height
## # weights: 15 (8 variable)
## initial value 858.016197
## iter 10 value 363.948352
## iter 20 value 301.923434
## iter 30 value 301.017662
## final value 301.004927
## converged
## Df AIC
## - no_pasg 8 476.2495
## <none> 10 476.4410
## - height 8 618.0099
## - aircraft 8 710.2379
## - speed_ground 8 1610.9781
## # weights: 15 (8 variable)
## initial value 858.016197
## iter 10 value 354.469061
## iter 20 value 242.186395
## iter 30 value 231.398332
## final value 230.124753
## converged
##
## Step: AIC=476.25
## Y ~ aircraft + speed_ground + height
##
## trying - aircraft
## # weights: 12 (6 variable)
## initial value 858.016197
## iter 10 value 378.591341
## iter 20 value 351.025155
## final value 350.780781
## converged
## trying - speed_ground
## # weights: 12 (6 variable)
## initial value 858.016197
## iter 10 value 797.741960
## final value 797.741923
## converged
## trying - height
## # weights: 12 (6 variable)
## initial value 858.016197
## iter 10 value 345.462031
## iter 20 value 304.390327
## final value 301.955008
## converged
## Df AIC
## <none> 8 476.2495
## - height 6 615.9100
## - aircraft 6 713.5616
## - speed_ground 6 1607.4838
# Print coefficients, standard errors, deviance and AIC of the selected model.
summary(object = mmodi)
## Call:
## multinom(formula = Y ~ aircraft + speed_ground + height, data = faa1)
##
## Coefficients:
## (Intercept) aircraftboeing speed_ground height
## 2 -22.80493 3.863325 0.2393831 0.1516421
## 3 -103.34903 9.209321 1.0112370 0.3441978
##
## Std. Errors:
## (Intercept) aircraftboeing speed_ground height
## 2 1.91262330 0.4056023 0.019916715 0.01813334
## 3 0.08472895 0.5723513 0.009967598 0.02808654
##
## Residual Deviance: 460.2495
## AIC: 476.2495
Next, I compare the mean of each numeric variable across the three levels of Y.
# Mean of one faa1 column within each level of Y (NAs ignored).
mean_by_y <- function(column) {
  tapply(faa1[[column]], faa1$Y, mean, na.rm = TRUE)
}
duration <- mean_by_y("duration")
no_pasg <- mean_by_y("no_pasg")
speed_ground <- mean_by_y("speed_ground")
height <- mean_by_y("height")
pitch <- mean_by_y("pitch")
# One row per variable, one column per level of Y, rounded to 3 decimals.
class_means <- round(rbind(duration, no_pasg, speed_ground, height, pitch), 3)
# Move the variable names from the row names into a leading column.
table <- data.frame(
  variable = rownames(class_means),
  class_means,
  row.names = NULL,
  check.names = FALSE,
  stringsAsFactors = FALSE
)
names(table)[-1] <- c("Y=1", "Y=2", "Y=3")
table
## variable Y=1 Y=2 Y=3
## 1 duration 160.890 151.935 152.070
## 2 no_pasg 60.233 60.162 59.728
## 3 speed_ground 62.178 79.243 103.649
## 4 height 27.908 31.677 31.509
## 5 pitch 3.980 4.012 4.064
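A 1-unit increase in speed_ground multiplies the odds of Y=2 relative to Y=1 by about 1.27 (exp(0.239))
A 1-unit increase in speed_ground multiplies the odds of Y=3 relative to Y=1 by about 2.75 (exp(1.011))
A 1-unit increase in height multiplies the odds of Y=2 relative to Y=1 by about 1.16 (exp(0.152))
A 1-unit increase in height multiplies the odds of Y=3 relative to Y=1 by about 1.41 (exp(0.344))
For a Boeing aircraft (versus the reference aircraft type), the odds of Y=2 relative to Y=1 are multiplied by about 47.6 (exp(3.863))
For a Boeing aircraft (versus the reference aircraft type), the odds of Y=3 relative to Y=1 are multiplied by about 9990 (exp(9.209))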
Aircraft, ground speed, and height are the final variables retained in the model.
There is a significant increase in speed of ground with distance.
# Box plots of ground speed and of height, one box per level of Y.
faa1 %>%
  ggplot(aes(x = Y, y = speed_ground)) +
  geom_boxplot()
faa1 %>%
  ggplot(aes(x = Y, y = height)) +
  geom_boxplot()
Question 2
Because the number of passengers is a count, I model it with a Poisson distribution.
# Drop speed_air, then keep only rows without missing values.
faa <- faa %>%
  select(-speed_air) %>%
  na.omit()
# Poisson regression of the passenger count on all remaining variables.
modp <- glm(
  no_pasg ~ aircraft + speed_ground + height + pitch + duration + distance,
  family = poisson,
  data = faa
)
#summary(modp)
# Stepwise search on AIC, starting from the full Poisson fit `modp`.
step_modp <- step(object = modp)
## Start: AIC=5383.21
## no_pasg ~ aircraft + speed_ground + height + pitch + duration +
## distance
##
## Df Deviance AIC
## - aircraft 1 739.18 5381.2
## - pitch 1 739.21 5381.2
## - speed_ground 1 739.88 5381.9
## - distance 1 740.08 5382.1
## - duration 1 740.25 5382.3
## - height 1 740.85 5382.9
## <none> 739.18 5383.2
##
## Step: AIC=5381.21
## no_pasg ~ speed_ground + height + pitch + duration + distance
##
## Df Deviance AIC
## - pitch 1 739.22 5379.2
## - speed_ground 1 740.16 5380.2
## - duration 1 740.25 5380.3
## - distance 1 740.50 5380.5
## - height 1 740.96 5381.0
## <none> 739.18 5381.2
##
## Step: AIC=5379.25
## no_pasg ~ speed_ground + height + duration + distance
##
## Df Deviance AIC
## - duration 1 740.27 5378.3
## - speed_ground 1 740.35 5378.4
## - distance 1 740.73 5378.8
## - height 1 741.02 5379.0
## <none> 739.22 5379.2
##
## Step: AIC=5378.3
## no_pasg ~ speed_ground + height + distance
##
## Df Deviance AIC
## - speed_ground 1 741.41 5377.4
## - distance 1 741.73 5377.8
## - height 1 742.03 5378.1
## <none> 740.27 5378.3
##
## Step: AIC=5377.44
## no_pasg ~ height + distance
##
## Df Deviance AIC
## - distance 1 741.73 5375.8
## - height 1 742.54 5376.6
## <none> 741.41 5377.4
##
## Step: AIC=5375.76
## no_pasg ~ height
##
## Df Deviance AIC
## - height 1 742.75 5374.8
## <none> 741.73 5375.8
##
## Step: AIC=5374.78
## no_pasg ~ 1
# Print the coefficient table and deviances of the selected Poisson model.
summary(object = step_modp)
##
## Call:
## glm(formula = no_pasg ~ 1, family = poisson, data = faa)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.4627 -0.6652 -0.0106 0.6261 3.2525
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.095709 0.004616 887.2 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 742.75 on 780 degrees of freedom
## Residual deviance: 742.75 on 780 degrees of freedom
## AIC: 5374.8
##
## Number of Fisher Scoring iterations: 4
Thus, none of the variables appears useful for predicting the number of passengers; the selected model contains only an intercept.