Initial exploration of the data

#read excel files
library("readxl")
faa1 <- read_excel('faa1.xls')
faa2 <- read_excel('faa2.xls')
str(faa1)
## Classes 'tbl_df', 'tbl' and 'data.frame':    800 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...

FAA1 data has sample size = 800 and 8 variables/features

str(faa2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    150 obs. of  7 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...

FAA2 data has sample size = 150 and 7 variables/features. It does not contain the ‘duration’ variable which is present in FAA1

#merge datasets
library('plyr')
faa <- rbind.fill(faa1,faa2)
str(faa)
## 'data.frame':    950 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
#check for duplicates - the duplicates are calculated without including the duration column as this was not present in faa1
sum(duplicated(faa[,c(1,3:8)]))
## [1] 100

100 duplicated records are found. We will remove these records

library('tidyverse')
faa.no.dup <- distinct(faa,aircraft,no_pasg,speed_ground,speed_air,height,pitch,distance,.keep_all = T)
str(faa.no.dup)
## 'data.frame':    850 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...

The merged dataset has sample size = 850 and 8 variables/features

summary(faa.no.dup)
##    aircraft            duration         no_pasg      speed_ground   
##  Length:850         Min.   : 14.76   Min.   :29.0   Min.   : 27.74  
##  Class :character   1st Qu.:119.49   1st Qu.:55.0   1st Qu.: 65.90  
##  Mode  :character   Median :153.95   Median :60.0   Median : 79.64  
##                     Mean   :154.01   Mean   :60.1   Mean   : 79.45  
##                     3rd Qu.:188.91   3rd Qu.:65.0   3rd Qu.: 92.06  
##                     Max.   :305.62   Max.   :87.0   Max.   :141.22  
##                     NA's   :50                                      
##    speed_air          height           pitch          distance      
##  Min.   : 90.00   Min.   :-3.546   Min.   :2.284   Min.   :  34.08  
##  1st Qu.: 96.25   1st Qu.:23.314   1st Qu.:3.642   1st Qu.: 883.79  
##  Median :101.15   Median :30.093   Median :4.008   Median :1258.09  
##  Mean   :103.80   Mean   :30.144   Mean   :4.009   Mean   :1526.02  
##  3rd Qu.:109.40   3rd Qu.:36.993   3rd Qu.:4.377   3rd Qu.:1936.95  
##  Max.   :141.72   Max.   :59.946   Max.   :5.927   Max.   :6533.05  
##  NA's   :642

Important observations

  1. On combining FAA1 and FAA2 datasets, 950 records were found of which 100 were duplicate
  2. Duration column was missing in FAA2 dataset
  3. Speed_air column has 642 missing values in the combined dataset
  4. Duration column has 50 missing values in the combined dataset
  5. One sample in the records has negative height which is not correct

Data Cleaning and further exploration

#check abnormal values

height_abnormal <- sum(faa.no.dup$height<6)
duration_abnormal <- sum(faa.no.dup$duration<40)
speed_ground_abnormal <- sum(faa.no.dup$speed_ground<30 | faa.no.dup$speed_ground > 140)
speed_air_abnormal <- sum(faa.no.dup$speed_air<30 | faa.no.dup$speed_air > 140)
height_abnormal;duration_abnormal;speed_ground_abnormal;speed_air_abnormal
## [1] 10
## [1] NA
## [1] 3
## [1] NA

10 rows were found that contained abnormal values for height and 3 rows contained abnormal values for speed_ground

#remove rows with abnormal values
faa.no.dup <- faa.no.dup[faa.no.dup$height > 6,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground > 30,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground < 140,]
str(faa.no.dup)
## 'data.frame':    837 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
#plot histograms for the datasets
par(mfrow = c(2,4))
numericcols <- c('duration','no_pasg','speed_ground','speed_air','height','pitch','distance')
for (i in numericcols){
   hist(faa.no.dup[[i]], xlab = i,main='')
}

Important observations

Initial analysis for identifying important factors that impact the response variable “landing distance”

#encoding aircarft column a 0/1
faa.temp <- faa.no.dup
faa.temp$aircraft <- ifelse(faa.temp$aircraft == 'boeing', 1, 0)
#compute pairwise correlation
corr <- cor(faa.no.dup$distance,faa.temp[,-8],use = "pairwise.complete.obs")
table1 <- tibble(
  variable = c('aircraft','duration','no_pasg','speed_ground','speed_air','height','pitch'), 
  corr_coef = corr[1:7], 
)
table1$dir <- ifelse(table1$corr_coef > 0, "positive", "negative")
table1 %>% arrange(desc(abs(corr_coef)))
## # A tibble: 7 x 3
##   variable     corr_coef dir     
##   <chr>            <dbl> <chr>   
## 1 speed_air       0.944  positive
## 2 speed_ground    0.867  positive
## 3 aircraft        0.243  positive
## 4 height          0.115  positive
## 5 pitch           0.0935 positive
## 6 duration       -0.0645 negative
## 7 no_pasg        -0.0175 negative
#plot scatterplots for the datasets
par(mfrow = c(2,4))

plot(faa.temp$aircraft, faa.temp$distance,xlab = 'aircraft',ylab = 'distance')
plot(faa.temp$duration, faa.temp$distance,xlab = 'duration',ylab = 'distance')
plot(faa.temp$no_pasg, faa.temp$distance,xlab = 'no_pasg',ylab = 'distance')
plot(faa.temp$speed_ground, faa.temp$distance,xlab = 'speed_ground',ylab = 'distance')
plot(faa.temp$speed_air, faa.temp$distance,xlab = 'speed_air',ylab = 'distance')
plot(faa.temp$height, faa.temp$distance,xlab = 'height',ylab = 'distance')
plot(faa.temp$pitch, faa.temp$distance,xlab = 'pitch',ylab = 'distance')

As can be observed from both the table and the scatterplot the correlations of distance with speed_Air and speed_groud are significantly high

Regression using a single factor each time

vars <- c('aircraft','duration','no_pasg','speed_ground','speed_air','height','pitch')
fittedpvalues <- vector()
fittedcoeffs <- vector()

fittedpvalues[1] <- summary(lm(distance~aircraft,data = faa.temp))$coefficients[2,4] 
fittedcoeffs[1] <- summary(lm(distance~aircraft,data = faa.temp))$coefficients[2,1] 

fittedpvalues[2] <- summary(lm(distance~duration,data = faa.temp))$coefficients[2,4] 
fittedcoeffs[2] <- summary(lm(distance~duration,data = faa.temp))$coefficients[2,1] 

fittedpvalues[3] <- summary(lm(distance~no_pasg,data = faa.temp))$coefficients[2,4] 
fittedcoeffs[3] <- summary(lm(distance~no_pasg,data = faa.temp))$coefficients[2,1] 

fittedpvalues[4] <- summary(lm(distance~speed_ground,data = faa.temp))$coefficients[2,4] 
fittedcoeffs[4] <- summary(lm(distance~speed_ground,data = faa.temp))$coefficients[2,1] 

fittedpvalues[5] <- summary(lm(distance~speed_air,data = faa.temp))$coefficients[2,4]
fittedcoeffs[5] <- summary(lm(distance~speed_air,data = faa.temp))$coefficients[2,1] 

fittedpvalues[6] <- summary(lm(distance~height,data = faa.temp))$coefficients[2,4] 
fittedcoeffs[6] <- summary(lm(distance~height,data = faa.temp))$coefficients[2,1] 

fittedpvalues[7] <- summary(lm(distance~pitch,data = faa.temp))$coefficients[2,4]
fittedcoeffs[7] <- summary(lm(distance~pitch,data = faa.temp))$coefficients[2,1] 

table2 <- tibble(
  variable = vars, 
  pvalues = fittedpvalues[1:7]
)
table2$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table2 %>% arrange(abs(pvalues))
## # A tibble: 7 x 3
##   variable       pvalues dir     
##   <chr>            <dbl> <chr>   
## 1 speed_ground 2.90e-254 positive
## 2 speed_air    2.18e-100 positive
## 3 aircraft     1.08e- 12 positive
## 4 height       8.99e-  4 positive
## 5 pitch        6.80e-  3 positive
## 6 duration     7.04e-  2 negative
## 7 no_pasg      6.14e-  1 negative
#standardize each variable
faa.std <- faa.temp
distance <- faa.std[,8]
faa.std <- scale(faa.temp[,-8])
faa.std <- data.frame(cbind(faa.std,distance))
summary(faa.std)
##     aircraft          duration           no_pasg         
##  Min.   :-0.9358   Min.   :-2.82092   Min.   :-4.152686  
##  1st Qu.:-0.9358   1st Qu.:-0.71008   1st Qu.:-0.674881  
##  Median :-0.9358   Median : 0.00347   Median :-0.006073  
##  Mean   : 0.0000   Mean   : 0.00000   Mean   : 0.000000  
##  3rd Qu.: 1.0674   3rd Qu.: 0.71700   3rd Qu.: 0.662736  
##  Max.   : 1.0674   Max.   : 3.07649   Max.   : 3.605494  
##                    NA's   :50                            
##   speed_ground        speed_air           height        
##  Min.   :-2.44824   Min.   :-1.3698   Min.   :-2.47685  
##  1st Qu.:-0.71483   1st Qu.:-0.7432   1st Qu.:-0.70663  
##  Median : 0.01055   Median :-0.2511   Median :-0.03164  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.66202   3rd Qu.: 0.5777   3rd Qu.: 0.66634  
##  Max.   : 3.02769   Max.   : 3.3017   Max.   : 2.99870  
##                     NA's   :630                         
##      pitch              distance      
##  Min.   :-3.264440   Min.   :  41.72  
##  1st Qu.:-0.691831   1st Qu.: 896.58  
##  Median :-0.003058   Median :1264.93  
##  Mean   : 0.000000   Mean   :1531.77  
##  3rd Qu.: 0.690761   3rd Qu.:1943.13  
##  Max.   : 3.645477   Max.   :6309.95  
## 
#regress distance on standardized variables
fittedpvalues <- vector()
fittedcoeffs <- vector()

fittedpvalues[1] <- summary(lm(distance~aircraft,data = faa.std))$coefficients[2,4] 
fittedcoeffs[1] <- summary(lm(distance~aircraft,data = faa.std))$coefficients[2,1] 

fittedpvalues[2] <- summary(lm(distance~duration,data = faa.std))$coefficients[2,4] 
fittedcoeffs[2] <- summary(lm(distance~duration,data = faa.std))$coefficients[2,1] 

fittedpvalues[3] <- summary(lm(distance~no_pasg,data = faa.std))$coefficients[2,4] 
fittedcoeffs[3] <- summary(lm(distance~no_pasg,data = faa.std))$coefficients[2,1] 

fittedpvalues[4] <- summary(lm(distance~speed_ground,data = faa.std))$coefficients[2,4] 
fittedcoeffs[4] <- summary(lm(distance~speed_ground,data = faa.std))$coefficients[2,1] 

fittedpvalues[5] <- summary(lm(distance~speed_air,data = faa.std))$coefficients[2,4]
fittedcoeffs[5] <- summary(lm(distance~speed_air,data = faa.std))$coefficients[2,1] 

fittedpvalues[6] <- summary(lm(distance~height,data = faa.std))$coefficients[2,4] 
fittedcoeffs[6] <- summary(lm(distance~height,data = faa.std))$coefficients[2,1] 

fittedpvalues[7] <- summary(lm(distance~pitch,data = faa.std))$coefficients[2,4]
fittedcoeffs[7] <- summary(lm(distance~pitch,data = faa.std))$coefficients[2,1] 

table3 <- tibble(
  variable = vars, 
  pvalues = fittedpvalues[1:7]
)
table3$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table3 %>% arrange(abs(pvalues))
## # A tibble: 7 x 3
##   variable       pvalues dir     
##   <chr>            <dbl> <chr>   
## 1 speed_ground 2.90e-254 positive
## 2 speed_air    2.18e-100 positive
## 3 aircraft     1.08e- 12 positive
## 4 height       8.99e-  4 positive
## 5 pitch        6.80e-  3 positive
## 6 duration     7.04e-  2 negative
## 7 no_pasg      6.14e-  1 negative

On comparing the three tables 1,2 and 3, we find that the results are consistent except in the case of Table1 where speed_air is ranked above speed_ground while in Table 2 and Table 3 , speed_ground is ranked above speed_air in terms of importance.We will create a table that follows the order of ranking in the above tables without keeping speed_air since there are almost 80% missing values in this variable

table0 <- tibble(
  var_by_importance = c('speed_ground','aircraft','height','pitch','duration','no_pasg')
)
table0
## # A tibble: 6 x 1
##   var_by_importance
##   <chr>            
## 1 speed_ground     
## 2 aircraft         
## 3 height           
## 4 pitch            
## 5 duration         
## 6 no_pasg

Check collinearity

#Compare regression coefficients for distance vs speed_ground, distance vs speed_air, distance vs speed_ground+speed_air
cat('Regression Coefficient with speed_ground as predictor is',summary(lm(distance~speed_ground,data = faa.std))$coefficients[2,1],'\n')
## Regression Coefficient with speed_ground as predictor is 791.1548
cat('Regression Coefficient with speed_air as predictor is',summary(lm(distance~speed_air,data = faa.std))$coefficients[2,1],'\n')
## Regression Coefficient with speed_air as predictor is 805.2946
cat('Regression Coefficient with both speed_ground and speed_air as predictors are',summary(lm(distance~speed_ground+speed_air,data = faa.std))$coefficients[2,1],'and',summary(lm(distance~speed_ground+speed_air,data = faa.std))$coefficients[3,1],'respectively')
## Regression Coefficient with both speed_ground and speed_air as predictors are -288.4484 and 958.2226 respectively

There is a significant change in the 3 models. when only speed_ground is used as a variable, regression coefficient is 791.1548 while it is 805.2946 when speed_air is selected. When both are selected, coefficient for speed_ground becomes negative while speed_air coefficient further increases

# check correlation between speed_ground and speed_air variables
cor(faa.std$speed_ground,faa.std$speed_air,use = "pairwise.complete.obs")
## [1] 0.9885722

This ia a very high correlation and will lead to multicollinearity issue in the model if both variables are used. In our case, we will use the speed_ground variable because it has no missing values while the speed_air has almost 80% missing values. Also in the model where both variables are used, p-value for speed_ground is 0.238 making it insignificant which is not correct.

Variable selection based on our ranking in Table 0.

rsquared.model1 <- summary(lm(distance ~ speed_ground,data = faa.std))$r.squared
rsquared.model2 <- summary(lm(distance ~ speed_ground+aircraft,data = faa.std))$r.squared
rsquared.model3 <- summary(lm(distance ~ speed_ground+aircraft+height,data = faa.std))$r.squared
rsquared.model4 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch,data = faa.std))$r.squared
rsquared.model5 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch+duration,data = faa.std))$r.squared
rsquared.model6 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch+duration+no_pasg,data = faa.std))$r.squared

plot(c(1,2,3,4,5,6),c(rsquared.model1,rsquared.model2,rsquared.model3,rsquared.model4,rsquared.model5,rsquared.model6),type="b",ylab ="Rsquared", xlab ="The number of predictors")

The R-squared value keeps on increasing until we add 4 variables(speed_ground+speed_air+aircraft+height) after which it becomes constant.

adj.rsquared.model1 <- summary(lm(distance ~ speed_ground,data = faa.no.dup))$adj.r.squared
adj.rsquared.model2 <- summary(lm(distance ~ speed_ground+aircraft,data = faa.no.dup))$adj.r.squared
adj.rsquared.model3 <- summary(lm(distance ~ speed_ground+aircraft+height,data = faa.no.dup))$adj.r.squared
adj.rsquared.model4 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch,data = faa.no.dup))$adj.r.squared
adj.rsquared.model5 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch+duration,data = faa.no.dup))$adj.r.squared
adj.rsquared.model6 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch+duration+no_pasg,data = faa.no.dup))$adj.r.squared

plot(c(1,2,3,4,5,6),c(adj.rsquared.model1,adj.rsquared.model2,adj.rsquared.model3,adj.rsquared.model4,adj.rsquared.model5,adj.rsquared.model6),type="b",ylab ="Adjusted Rsquared", xlab ="The number of predictors")

The adjusted R square model follows a similar pattern as the R squared statistic.

aic.model1 <- AIC(lm(distance ~ speed_ground,data = faa.std))
aic.model2 <- AIC(lm(distance ~ speed_ground+aircraft,data = faa.std))
aic.model3 <- AIC(lm(distance ~ speed_ground+aircraft+height,data = faa.std))
aic.model4 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch,data = faa.std))
aic.model5 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch+duration,data = faa.std))
aic.model6 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch+duration + no_pasg,data = faa.std))

plot(c(1,2,3,4,5,6),c(aic.model1,aic.model2,aic.model3,aic.model4,aic.model5,aic.model6),type="b",ylab ="Adjusted Rsquared", xlab ="The number of predictors")

The above chart does not look quite right. There is a sudden drop in AIC when 5th predictor(duration) is added. Let’s remove this variable and draw AIC plot again.

aic.model1 <- AIC(lm(distance ~ speed_ground,data = faa.std))
aic.model2 <- AIC(lm(distance ~ speed_ground+aircraft,data = faa.std))
aic.model3 <- AIC(lm(distance ~ speed_ground+aircraft+height,data = faa.std))
aic.model4 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch,data = faa.std))
aic.model5 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch+no_pasg,data = faa.std))

plot(c(1,2,3,4,5),c(aic.model1,aic.model2,aic.model3,aic.model4,aic.model5),type="b",ylab ="Adjusted Rsquared", xlab ="The number of predictors")

Important Observation

Using the R-squared, adjusted R Squared as well as AIC, we can conclude that the following 3 variables can be used for building the linear regression model - speed_ground, aircraft and height

Variable selection based on automate algorithm.

require(MASS)
full.model <- lm(distance ~ speed_ground + aircraft + height + pitch + no_pasg,data= faa.std)
step.model <- stepAIC(full.model, direction = "forward", trace = FALSE)
summary(step.model)
## 
## Call:
## lm(formula = distance ~ speed_ground + aircraft + height + pitch + 
##     no_pasg, data = faa.std)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -696.92 -226.28  -91.59  123.89 1877.20 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1531.77      12.23 125.268   <2e-16 ***
## speed_ground   806.73      12.26  65.813   <2e-16 ***
## aircraft       242.75      13.10  18.529   <2e-16 ***
## height         143.12      12.27  11.668   <2e-16 ***
## pitch           21.73      13.10   1.659   0.0976 .  
## no_pasg        -15.42      12.25  -1.258   0.2086    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 353.8 on 831 degrees of freedom
## Multiple R-squared:  0.8508, Adjusted R-squared:  0.8499 
## F-statistic: 947.4 on 5 and 831 DF,  p-value: < 2.2e-16

Using the automated model, we can again see the 3 most important variables are : speed_ground, aircraft and height