#read excel files
library("readxl")
faa1 <- read_excel('faa1.xls')
faa2 <- read_excel('faa2.xls')
#merge datasets
library('plyr')
faa <- rbind.fill(faa1,faa2)
#remove duplicated records
library('tidyverse')
faa.no.dup <- distinct(faa,aircraft,no_pasg,speed_ground,speed_air,height,pitch,distance,.keep_all = T)
#check abnormal values
height_abnormal <- sum(faa.no.dup$height<6)
duration_abnormal <- sum(faa.no.dup$duration<40)
speed_ground_abnormal <- sum(faa.no.dup$speed_ground<30 | faa.no.dup$speed_ground > 140)
speed_air_abnormal <- sum(faa.no.dup$speed_air<30 | faa.no.dup$speed_air > 140)
#remove rows with abnormal values
faa.no.dup <- faa.no.dup[faa.no.dup$height > 6,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground > 30,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground < 140,]
# Remove rows for blank values in duration column
faa.no.dup <- faa.no.dup[!is.na(faa.no.dup$duration), ]
str(faa.no.dup)
## 'data.frame': 787 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
faa.no.dup$Y <- ifelse(faa.no.dup$distance < 1000 , 1 , (ifelse(faa.no.dup$distance >=1000 & faa.no.dup$distance < 2500,2,3)))
# discard distance variable
faa.no.dup$distance <- NULL
faa.no.dup$Y <- as.factor(faa.no.dup$Y)
#COnvert aircraft column to factor column
faa.no.dup$aircraft <- ifelse(faa.no.dup$aircraft == 'boeing', 1, 0)
faa.no.dup$aircraft <- as.factor(faa.no.dup$aircraft)
deviance(mmodi) - deviance(mmod)
## [1] 2.878813
mmod$edf - mmodi$edf
## [1] 4
pchisq(deviance(mmodi)-deviance(mmod),mmod$edf - mmodi$edf,lower = F)
## [1] 0.5783063
model <- c('Model1(p2/p1)','Model1(p2/p1)','Model1(p2/p1)','Model1(p2/p1)','Model2(p3/p1)','Model2(p3/p1)','Model2(p3/p1)','Model2(p3/p1)')
vars <- c('aircraft','speed_ground','height','pitch','aircraft','speed_ground','height','pitch')
varindex <- c(2,5,6,7,2,5,6,7)
fittedcoeffs <- vector()
fittedoddratios <- vector()
fittedse <- vector()
for(i in 1:length(vars)){
if(i <=4){
fittedcoeffs[i] <- mmodsummary$coefficients[1,varindex[i]]
fittedse[i] <- mmodsummary$standard.errors[1,varindex[i]]
fittedoddratios[i] <- exp(fittedcoeffs[i])
}
else{
fittedcoeffs[i] <- mmodsummary$coefficients[2,varindex[i]]
fittedse[i] <- mmodsummary$standard.errors[2,varindex[i]]
fittedoddratios[i] <- exp(fittedcoeffs[i])
}
}
table <- tibble(
Model = model,
variable = vars,
coefficient = fittedcoeffs,
standarderror = fittedse,
odds_ratio = fittedoddratios
)
table
## # A tibble: 8 x 5
## Model variable coefficient standarderror odds_ratio
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 Model1(p2/p1) aircraft 4.09 0.436 59.8
## 2 Model1(p2/p1) speed_ground 0.244 0.0204 1.28
## 3 Model1(p2/p1) height 0.156 0.0185 1.17
## 4 Model1(p2/p1) pitch -0.352 0.276 0.703
## 5 Model2(p3/p1) aircraft 9.09 0.881 8837.
## 6 Model2(p3/p1) speed_ground 1.23 0.0406 3.41
## 7 Model2(p3/p1) height 0.392 0.0485 1.48
## 8 Model2(p3/p1) pitch 0.935 0.766 2.55
When the aircraft maker is boeing, The odds of Y = 2(1000< landing distance < 2500) increases by 59 times and almost 8.8k times for Y=3(landing distance > 2500)
The odds of Y = 2(1000< landing distance < 2500) increases by 1.27 times with speed_ground and 3.4 times for Y=3(landing distance > 2500) with speed_ground.
The odds of Y = 2(1000< landing distance < 2500) increases by 1.16 times with height and 1.47 times for Y=3(landing distance > 2500) with height
The odds of Y = 2(1000< landing distance < 2500) increases by 0.7 times with pitch and 2.54 times for Y=3(landing distance > 2500) with pitch
#install.packages('dplyr')
library('dplyr')
library('ggplot2')
lds <-mutate(faa.no.dup, speedground =cut_number(speed_ground,10)) %>% group_by(speedground, Y) %>% summarise(count=n()) %>% group_by(speedground) %>% mutate(etotal=sum(count), proportion=count/etotal)
ldh <-mutate(faa.no.dup, height =cut_number(height,10)) %>% group_by(height, Y) %>% summarise(count=n()) %>% group_by(height) %>% mutate(etotal=sum(count), proportion=count/etotal)
lda <-group_by(faa.no.dup,aircraft, Y) %>% summarise(count=n()) %>% group_by(aircraft) %>% mutate(etotal=sum(count), proportion=count/etotal)
ldp <-mutate(faa.no.dup, pitch =cut_number(pitch,5)) %>% group_by(pitch, Y) %>% summarise(count=n()) %>% group_by(pitch) %>% mutate(etotal=sum(count), proportion=count/etotal)
ggplot(lds, aes(x=speedground, y=proportion, group=Y, linetype=Y)) + geom_line()
The proportion of Y=1(landing distance < 1000) decreases with increasing speed while proportion for Y=2(1000 < distance < 2500) increases until speeds of around 90 and starts to decrease as speed increases further. The proportion of landing Y=3(distance >2500) starts for speeds around 95mph and continously increases with increase in speed.
ggplot(ldh, aes(x=height, y=proportion, group=Y, linetype=Y)) + geom_line()
The proportion of Y=1(landing distance < 1000) overall decreases with increase in height while proportion for Y=2(1000 < landing distance <2500) and proportion of Y=3(landing distance >2500) overall increases with increase in height.
ggplot(lda, aes(x=aircraft, y=proportion, group=Y, linetype=Y)) + geom_line()
Aircraft variable is an important variable in determining the landing distance. The proportion of Y=1(landing distance < 1000) is lesser when the aircraft maker is boeing while proportion for Y=2(1000< landing distance < 2500) and proportion of Y=3(landing distance >2500) is higher for boeing
ggplot(ldp, aes(x=pitch, y=proportion, group=Y, linetype=Y)) + geom_line()
The proportion of Y=1(landing distance < 1000 overall decreases with increase in pitch while proportion for Y=2(1000 < landing distance < 2500) does not vary much with increase in pitch.The proportion of Y=3(landing distance >2500) increases with increase in pitch. Although it can be seen there is not a strong relation between pitch and Y
speed_ground, aircraft type, height and pitch are important variables in determining the probabilities of landing distance being less than 1000, between 1000 and 2500 and greater than 2500.
When the aircraft maker is boeing, The odds of landing distance between 1000 and 2500 increases by 59 times and almost 8.8k times for landing distance greater than 2500
Poisson Distribution should be used to model the no_pasg variable as this is a count data
modelp <- glm(no_pasg ~ aircraft + duration + speed_ground + height +
pitch + Y, family = poisson ,faa.no.dup)
summary(modelp)
##
## Call:
## glm(formula = no_pasg ~ aircraft + duration + speed_ground +
## height + pitch + Y, family = poisson, data = faa.no.dup)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.3922 -0.6597 0.0274 0.6186 3.1690
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.084e+00 5.438e-02 75.105 <2e-16 ***
## aircraft1 -1.946e-03 1.085e-02 -0.179 0.858
## duration -8.455e-05 9.379e-05 -0.902 0.367
## speed_ground 3.297e-04 4.430e-04 0.744 0.457
## height 5.749e-04 5.000e-04 1.150 0.250
## pitch -2.122e-03 9.479e-03 -0.224 0.823
## Y2 -1.191e-02 1.477e-02 -0.806 0.420
## Y3 -2.713e-02 2.807e-02 -0.966 0.334
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 745.26 on 786 degrees of freedom
## Residual deviance: 742.16 on 779 degrees of freedom
## AIC: 5423.6
##
## Number of Fisher Scoring iterations: 4
We observe from the above model that there are no significant variables which can predict the no_pasg data
Lets confirm this using the AIC method
step(modelp)
## Start: AIC=5423.64
## no_pasg ~ aircraft + duration + speed_ground + height + pitch +
## Y
##
## Df Deviance AIC
## - Y 2 743.11 5420.6
## - aircraft 1 742.19 5421.7
## - pitch 1 742.21 5421.7
## - speed_ground 1 742.71 5422.2
## - duration 1 742.97 5422.5
## - height 1 743.48 5423.0
## <none> 742.16 5423.6
##
## Step: AIC=5420.59
## no_pasg ~ aircraft + duration + speed_ground + height + pitch
##
## Df Deviance AIC
## - speed_ground 1 743.12 5418.6
## - pitch 1 743.16 5418.6
## - aircraft 1 743.50 5419.0
## - duration 1 743.87 5419.3
## - height 1 743.91 5419.4
## <none> 743.11 5420.6
##
## Step: AIC=5418.6
## no_pasg ~ aircraft + duration + height + pitch
##
## Df Deviance AIC
## - pitch 1 743.18 5416.7
## - aircraft 1 743.52 5417.0
## - duration 1 743.87 5417.4
## - height 1 743.94 5417.4
## <none> 743.12 5418.6
##
## Step: AIC=5416.66
## no_pasg ~ aircraft + duration + height
##
## Df Deviance AIC
## - aircraft 1 743.76 5415.2
## - duration 1 743.91 5415.4
## - height 1 743.97 5415.5
## <none> 743.18 5416.7
##
## Step: AIC=5415.25
## no_pasg ~ duration + height
##
## Df Deviance AIC
## - duration 1 744.44 5413.9
## - height 1 744.57 5414.1
## <none> 743.76 5415.2
##
## Step: AIC=5413.92
## no_pasg ~ height
##
## Df Deviance AIC
## - height 1 745.26 5412.7
## <none> 744.44 5413.9
##
## Step: AIC=5412.74
## no_pasg ~ 1
##
## Call: glm(formula = no_pasg ~ 1, family = poisson, data = faa.no.dup)
##
## Coefficients:
## (Intercept)
## 4.096
##
## Degrees of Freedom: 786 Total (i.e. Null); 786 Residual
## Null Deviance: 745.3
## Residual Deviance: 745.3 AIC: 5413
AIC variable selection chooses no variable significant in determing the no_pasg data
drop1(modelp,test="Chisq")
## Single term deletions
##
## Model:
## no_pasg ~ aircraft + duration + speed_ground + height + pitch +
## Y
## Df Deviance AIC LRT Pr(>Chi)
## <none> 742.16 5423.6
## aircraft 1 742.19 5421.7 0.03217 0.8577
## duration 1 742.97 5422.5 0.81283 0.3673
## speed_ground 1 742.71 5422.2 0.55408 0.4567
## height 1 743.48 5423.0 1.32201 0.2502
## pitch 1 742.21 5421.7 0.05014 0.8228
## Y 2 743.11 5420.6 0.94599 0.6231
We can see there is a very slight change in AIC when we delete individual variables which means that these variables are not significant