#Loading required libraries

library(readxl)
library(tidyverse)
library(plyr)
library(dplyr)
library(MASS)

Initial exploration of the data

# Importing data

# Read the two landing-data workbooks from the working directory.
FAA1 <- read_excel(path = "FAA1-1.xls")
FAA2 <- read_excel(path = "FAA2-1.xls")

# Structure of datasets

# Column types and a preview of the first values of FAA1.
str(object = FAA1)
## Classes 'tbl_df', 'tbl' and 'data.frame':    800 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
# Dimensions of FAA1 after dropping exact duplicate rows.
FAA1 %>% unique() %>% dim()
## [1] 800   8

FAA1 has 800 rows and 8 columns

# Column types and a preview of the first values of FAA2.
str(object = FAA2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    150 obs. of  7 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
# Dimensions of FAA2 after dropping exact duplicate rows.
FAA2 %>% unique() %>% dim()
## [1] 150   7

FAA2 has 150 rows and 7 columns. ‘duration’ column is not present in FAA2, but it is present in FAA1.

# Appending the data

# Stack both data sets; columns missing from one input are filled with NA.
combined <- plyr::rbind.fill(FAA1, FAA2)

# Checking duplicates

# Compare row counts with and without duplicates, ignoring `duration`.
# The column is dropped by name so the check does not depend on `duration`
# happening to sit in position 2 of the combined frame.
dim(combined[, names(combined) != "duration"])
## [1] 950   7
dim(unique(combined[, names(combined) != "duration"]))
## [1] 850   7

We have 100 duplicate records.

# Removing duplicate records

# De-duplicate on every column except `duration`, then bring `duration` back
# from FAA1. `duration` is excluded by name (not by position) and the join
# key is stated explicitly, so dplyr no longer has to guess it and no
# "Joining, by = ..." message is printed.
key_cols <- setdiff(names(combined), "duration")
unique_combined <- unique(combined[, key_cols, drop = FALSE]) %>%
  left_join(FAA1, by = key_cols)
# Structure of the new data set

# Column types and a preview of the de-duplicated data.
str(object = unique_combined)
## 'data.frame':    850 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...

Dataset has 850 rows and 8 variables

#unique_combined$aircraft <- as.factor(unique_combined$aircraft)
# Summary of the combined data

# Per-column summary statistics, including NA counts.
summary(object = unique_combined)
##    aircraft            no_pasg      speed_ground      speed_air     
##  Length:850         Min.   :29.0   Min.   : 27.74   Min.   : 90.00  
##  Class :character   1st Qu.:55.0   1st Qu.: 65.90   1st Qu.: 96.25  
##  Mode  :character   Median :60.0   Median : 79.64   Median :101.15  
##                     Mean   :60.1   Mean   : 79.45   Mean   :103.80  
##                     3rd Qu.:65.0   3rd Qu.: 92.06   3rd Qu.:109.40  
##                     Max.   :87.0   Max.   :141.22   Max.   :141.72  
##                                                     NA's   :642     
##      height           pitch          distance          duration     
##  Min.   :-3.546   Min.   :2.284   Min.   :  34.08   Min.   : 14.76  
##  1st Qu.:23.314   1st Qu.:3.642   1st Qu.: 883.79   1st Qu.:119.49  
##  Median :30.093   Median :4.008   Median :1258.09   Median :153.95  
##  Mean   :30.144   Mean   :4.009   Mean   :1526.02   Mean   :154.01  
##  3rd Qu.:36.993   3rd Qu.:4.377   3rd Qu.:1936.95   3rd Qu.:188.91  
##  Max.   :59.946   Max.   :5.927   Max.   :6533.05   Max.   :305.62  
##                                                     NA's   :50
# One boxplot per numeric variable, split by aircraft make. Using the
# formula interface with `data =` gives plain variable names on the axes
# instead of `unique_combined$...` and removes the copy-pasted calls.
for (v in c("duration", "speed_ground", "speed_air", "height", "pitch",
            "distance", "no_pasg")) {
  boxplot(
    reformulate("aircraft", response = v),
    data = unique_combined,
    main = paste(v, "by aircraft"),
    xlab = "aircraft",
    ylab = v
  )
}

# Distribution of the response variable before cleaning.
hist(unique_combined$distance,
     main = "Histogram of distance", xlab = "distance")

Summary of findings

Data Cleaning and further exploration

# Checking abnormal values

# List the rows that break each plausibility rule (one query per rule).
filter(unique_combined, duration < 40) # 5 observations with duration less than 40
filter(unique_combined, speed_ground < 30 | speed_ground > 140) # 3 observations with abnormal speed_ground
filter(unique_combined, !is.na(speed_air) & (speed_air < 30 | speed_air > 140)) # 1 observation with abnormal speed_air
filter(unique_combined, height < 6) # 10 observations with abnormal height
filter(unique_combined, distance > 6000) # 2 observations with abnormal distance
#Cleaning data based on requirements

# Keep only rows that satisfy every plausibility rule. Missing `duration`
# and `speed_air` values are retained rather than treated as violations.
FAA_final <- unique_combined %>%
  filter(
    is.na(duration) | duration > 40,
    between(speed_ground, 30, 140),
    is.na(speed_air) | between(speed_air, 30, 140),
    height >= 6,
    distance < 6000
  )

# Rows and columns remaining after cleaning.
FAA_final %>% dim()
## [1] 831   8

The dataset has 831 rows after removing the abnormal values. 19 rows have been removed.

# Structure of cleaned data

# Column types and a preview of the cleaned data.
str(object = FAA_final)
## 'data.frame':    831 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
# Per-column summary statistics of the cleaned data.
summary(object = FAA_final)
##    aircraft            no_pasg       speed_ground      speed_air     
##  Length:831         Min.   :29.00   Min.   : 33.57   Min.   : 90.00  
##  Class :character   1st Qu.:55.00   1st Qu.: 66.20   1st Qu.: 96.23  
##  Mode  :character   Median :60.00   Median : 79.79   Median :101.12  
##                     Mean   :60.06   Mean   : 79.54   Mean   :103.48  
##                     3rd Qu.:65.00   3rd Qu.: 91.91   3rd Qu.:109.36  
##                     Max.   :87.00   Max.   :132.78   Max.   :132.91  
##                                                      NA's   :628     
##      height           pitch          distance          duration     
##  Min.   : 6.228   Min.   :2.284   Min.   :  41.72   Min.   : 41.95  
##  1st Qu.:23.530   1st Qu.:3.640   1st Qu.: 893.28   1st Qu.:119.63  
##  Median :30.167   Median :4.001   Median :1262.15   Median :154.28  
##  Mean   :30.458   Mean   :4.005   Mean   :1522.48   Mean   :154.78  
##  3rd Qu.:37.004   3rd Qu.:4.370   3rd Qu.:1936.63   3rd Qu.:189.66  
##  Max.   :59.946   Max.   :5.927   Max.   :5381.96   Max.   :305.62  
##                                                     NA's   :50
# Histogram of the variables to visualize the distribution

# One histogram per numeric variable. Looping over the column names avoids
# seven copy-pasted calls and gives each plot a readable title and x label.
for (v in c("no_pasg", "speed_ground", "speed_air", "height", "pitch",
            "distance", "duration")) {
  hist(FAA_final[[v]], main = paste("Histogram of", v), xlab = v)
}

# Boxplots of the numerical variables

# One boxplot per numeric variable of the cleaned data, split by aircraft
# make. The formula interface with `data =` labels the axes with plain
# variable names instead of `FAA_final$...`.
for (v in c("duration", "speed_ground", "speed_air", "height", "pitch",
            "distance", "no_pasg")) {
  boxplot(
    reformulate("aircraft", response = v),
    data = FAA_final,
    main = paste(v, "by aircraft"),
    xlab = "aircraft",
    ylab = v
  )
}

Summary of findings

Initial analysis for identifying important factors that impact the response variable “landing distance”

# Compute the pairwise correlation between the landing distance and each factor X and storing it in Table1
# Pearson correlation of landing distance with each numeric factor.
# Columns are addressed by name, rows with a missing value in the factor are
# dropped per pair, and the direction label is derived from the sign of the
# correlation instead of being typed in by hand. No local object named `cor`
# is created, so `stats::cor` is not shadowed.
names_variables <- c("no_pasg", "speed_ground", "height", "pitch",
                     "speed_air", "duration")

signed_cor <- vapply(
  names_variables,
  function(v) cor(FAA_final$distance, FAA_final[[v]], use = "complete.obs"),
  numeric(1),
  USE.NAMES = FALSE
)

size_of_cor <- abs(signed_cor)
direction <- ifelse(signed_cor < 0, "negative", "positive")

# Rank the factors from strongest to weakest linear association.
Table1 <- data.frame(names_variables, size_of_cor, direction)
Table1 <- Table1[order(Table1$size_of_cor, decreasing = TRUE), ]

Table1
# Creating XY plots

# Scatter plot of landing distance against each factor. The response
# (distance) goes on the y-axis and the factor on the x-axis, with plain
# variable names as axis labels.
for (v in c("duration", "no_pasg", "speed_ground", "speed_air", "height",
            "pitch")) {
  plot(FAA_final[[v]], FAA_final$distance, xlab = v, ylab = "distance")
}

On the basis of the plots, we observe a clear positive relationship between distance and speed_ground, and between distance and speed_air.

These relationships are consistent with the high correlations computed for these variables. The other variables show no apparent linear relationship with distance, which is consistent with their low correlations.

# Encoding aircraft (airbus is coded as 1 and boeing as 0)
# Recode the aircraft make as a 0/1 indicator (airbus = 1, boeing = 0).
# Vectorized, so it works for any number of rows (the previous loop was
# hard-coded to 831) and does not stop on a missing make. As before, any
# label other than "airbus"/"boeing" becomes NA in the numeric conversion.
aircraft_label <- as.character(FAA_final$aircraft)
aircraft_label[aircraft_label %in% "airbus"] <- "1"
aircraft_label[aircraft_label %in% "boeing"] <- "0"

FAA_final$aircraft <- as.numeric(aircraft_label)
# Correlation between the 0/1 aircraft indicator and landing distance,
# followed by a plot of distance for each indicator value.
cor_aircraft <- cor(x = FAA_final$aircraft, y = FAA_final$distance)

plot(x = FAA_final$aircraft, y = FAA_final$distance)

From the negative correlation and the plot, we can conclude that Boeing aircraft (coded 0) tend to have longer landing distances than Airbus aircraft (coded 1).

Regression using a single factor each time

# Single-factor regression: distance on the aircraft indicator.
lm_1 <- lm(formula = distance ~ aircraft, data = FAA_final)
summary(lm_1)
## 
## Call:
## lm(formula = distance ~ aircraft, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1281.6  -630.3  -229.4   388.2  3631.0 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1750.98      44.28   39.54  < 2e-16 ***
## aircraft     -427.67      60.58   -7.06 3.53e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 871.1 on 829 degrees of freedom
## Multiple R-squared:  0.05671,    Adjusted R-squared:  0.05557 
## F-statistic: 49.84 on 1 and 829 DF,  p-value: 3.526e-12
# Direction (from the sign of the fitted slope, not hand-typed) and p-value
# of the aircraft effect in lm_1.
d_1 <- if (coef(lm_1)[[2]] < 0) "negative" else "positive"
p_1 <- coef(summary(lm_1))[, "Pr(>|t|)"][2]

# Single-factor regression: distance on number of passengers.
lm_2 <- lm(formula = distance ~ no_pasg, data = FAA_final)
summary(lm_2)
## 
## Call:
## lm(formula = distance ~ no_pasg, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1444.8  -622.8  -271.3   414.6  3884.9 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1650.076    251.460   6.562 9.35e-11 ***
## no_pasg       -2.125      4.155  -0.511    0.609    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 896.7 on 829 degrees of freedom
## Multiple R-squared:  0.0003153,  Adjusted R-squared:  -0.0008906 
## F-statistic: 0.2615 on 1 and 829 DF,  p-value: 0.6093
# Direction (from the sign of the fitted slope, not hand-typed) and p-value
# of the no_pasg effect in lm_2.
d_2 <- if (coef(lm_2)[[2]] < 0) "negative" else "positive"
p_2 <- coef(summary(lm_2))[, "Pr(>|t|)"][2]

# Single-factor regression: distance on ground speed.
lm_3 <- lm(formula = distance ~ speed_ground, data = FAA_final)
summary(lm_3)
## 
## Call:
## lm(formula = distance ~ speed_ground, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -897.09 -319.16  -72.09  210.83 1798.88 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1773.9407    67.8388  -26.15   <2e-16 ***
## speed_ground    41.4422     0.8302   49.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 448.1 on 829 degrees of freedom
## Multiple R-squared:  0.7504, Adjusted R-squared:  0.7501 
## F-statistic:  2492 on 1 and 829 DF,  p-value: < 2.2e-16
# Direction (from the sign of the fitted slope, not hand-typed) and p-value
# of the speed_ground effect in lm_3.
d_3 <- if (coef(lm_3)[[2]] < 0) "negative" else "positive"
p_3 <- coef(summary(lm_3))[, "Pr(>|t|)"][2]

# Single-factor regression: distance on air speed (rows with NA are dropped
# by lm's default NA handling).
lm_4 <- lm(formula = distance ~ speed_air, data = FAA_final)
summary(lm_4)
## 
## Call:
## lm(formula = distance ~ speed_air, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -776.21 -196.39    8.72  209.17  624.34 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5455.709    207.547  -26.29   <2e-16 ***
## speed_air      79.532      1.997   39.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 276.3 on 201 degrees of freedom
##   (628 observations deleted due to missingness)
## Multiple R-squared:  0.8875, Adjusted R-squared:  0.887 
## F-statistic:  1586 on 1 and 201 DF,  p-value: < 2.2e-16
# Direction (from the sign of the fitted slope, not hand-typed) and p-value
# of the speed_air effect in lm_4.
d_4 <- if (coef(lm_4)[[2]] < 0) "negative" else "positive"
p_4 <- coef(summary(lm_4))[, "Pr(>|t|)"][2]

# Single-factor regression: distance on height.
lm_5 <- lm(formula = distance ~ height, data = FAA_final)
summary(lm_5)
## 
## Call:
## lm(formula = distance ~ height, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1338.5  -606.9  -253.8   388.5  3932.6 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1245.116    101.272  12.295  < 2e-16 ***
## height         9.107      3.166   2.877  0.00412 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 892.4 on 829 degrees of freedom
## Multiple R-squared:  0.009883,   Adjusted R-squared:  0.008688 
## F-statistic: 8.274 on 1 and 829 DF,  p-value: 0.004124
# Direction (from the sign of the fitted slope, not hand-typed) and p-value
# of the height effect in lm_5.
d_5 <- if (coef(lm_5)[[2]] < 0) "negative" else "positive"
p_5 <- coef(summary(lm_5))[, "Pr(>|t|)"][2]

# Single-factor regression: distance on pitch.
lm_6 <- lm(formula = distance ~ pitch, data = FAA_final)
summary(lm_6)
## 
## Call:
## lm(formula = distance ~ pitch, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1338.1  -644.3  -241.2   402.5  3839.5 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    929.1      237.9   3.905 0.000102 ***
## pitch          148.1       58.9   2.515 0.012081 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 893.5 on 829 degrees of freedom
## Multiple R-squared:  0.007574,   Adjusted R-squared:  0.006377 
## F-statistic: 6.327 on 1 and 829 DF,  p-value: 0.01208
# Direction (from the sign of the fitted slope, not hand-typed) and p-value
# of the pitch effect in lm_6.
d_6 <- if (coef(lm_6)[[2]] < 0) "negative" else "positive"
p_6 <- coef(summary(lm_6))[, "Pr(>|t|)"][2]

# Single-factor regression: distance on flight duration (rows with NA are
# dropped by lm's default NA handling).
lm_7 <- lm(formula = distance ~ duration, data = FAA_final)
summary(lm_7)
## 
## Call:
## lm(formula = distance ~ duration, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1464.9  -615.6  -274.7   408.5  3847.6 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1689.9942   108.5452  15.569   <2e-16 ***
## duration      -0.9613     0.6694  -1.436    0.151    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 904 on 779 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.00264,    Adjusted R-squared:  0.00136 
## F-statistic: 2.062 on 1 and 779 DF,  p-value: 0.1514
# Direction (from the sign of the fitted slope, not hand-typed) and p-value
# of the duration effect in lm_7.
d_7 <- if (coef(lm_7)[[2]] < 0) "negative" else "positive"
p_7 <- coef(summary(lm_7))[, "Pr(>|t|)"][2]

# Collect the seven single-factor results. Each p-value carries the name of
# its predictor, so the labels are taken from the p-value vector itself
# rather than from column positions in FAA_final.
p_value <- c(p_1, p_2, p_3, p_4, p_5, p_6, p_7)
dir <- c(d_1, d_2, d_3, d_4, d_5, d_6, d_7)
names <- names(p_value)

# Rank the factors from most to least significant.
Table2 <- data.frame(names, p_value, dir)
Table2 <- Table2[order(Table2[, 2]), ]
Table2
# Standardizing each X variable

# Mean and standard deviation of every predictor. `speed_air` and `duration`
# contain missing values, so those are ignored when computing the moments.
m1 <- mean(FAA_final$aircraft)
m2 <- mean(FAA_final$no_pasg)
m3 <- mean(FAA_final$speed_ground)
m4 <- mean(FAA_final$speed_air, na.rm = TRUE)
m5 <- mean(FAA_final$height)
m6 <- mean(FAA_final$pitch)
m7 <- mean(FAA_final$duration, na.rm = TRUE)

s1 <- sd(FAA_final$aircraft)
s2 <- sd(FAA_final$no_pasg)
s3 <- sd(FAA_final$speed_ground)
s4 <- sd(FAA_final$speed_air, na.rm = TRUE)
s5 <- sd(FAA_final$height)
s6 <- sd(FAA_final$pitch)
s7 <- sd(FAA_final$duration, na.rm = TRUE)

# Append one z-scored `_s` column per predictor, in the same order.
FAA_final <- FAA_final %>%
  mutate(
    aircraft_s = (aircraft - m1) / s1,
    no_pasg_s = (no_pasg - m2) / s2,
    speed_ground_s = (speed_ground - m3) / s3,
    speed_air_s = (speed_air - m4) / s4,
    height_s = (height - m5) / s5,
    pitch_s = (pitch - m6) / s6,
    duration_s = (duration - m7) / s7
  )


# Single-factor regressions of distance on each *standardized* predictor.
# With z-scored predictors every slope is "change in distance per one
# standard deviation of the factor", so the slopes can be ranked against
# each other.
#
# Models 2-7 previously regressed on the raw columns (no_pasg, speed_ground,
# ...) even though Table3 labels them with the `_s` names, which mixed
# measurement units in the ranking. Every model now uses its `_s` column.
# The console output recorded for the earlier raw-scale fits has been
# removed because it no longer matches these models; re-knit to regenerate.
lmm_1 <- lm(distance ~ aircraft_s, data = FAA_final)
lmm_2 <- lm(distance ~ no_pasg_s, data = FAA_final)
lmm_3 <- lm(distance ~ speed_ground_s, data = FAA_final)
lmm_4 <- lm(distance ~ speed_air_s, data = FAA_final)
lmm_5 <- lm(distance ~ height_s, data = FAA_final)
lmm_6 <- lm(distance ~ pitch_s, data = FAA_final)
lmm_7 <- lm(distance ~ duration_s, data = FAA_final)

std_fits <- list(lmm_1, lmm_2, lmm_3, lmm_4, lmm_5, lmm_6, lmm_7)
for (fit in std_fits) {
  print(summary(fit))
}

# Slope of each model (its second coefficient). The element names are the
# predictor names, which also become the labels and row names of Table3.
# The direction is derived from the sign of the slope. Unlike before, the
# p-values p_1..p_7 are not overwritten with slopes here.
size_coff <- unlist(lapply(std_fits, function(fit) coef(fit)[2]))
dir <- unname(ifelse(size_coff < 0, "negative", "positive"))
names <- names(size_coff)

# Rank the factors by the magnitude of the standardized slope.
Table3 <- data.frame(names, abs(size_coff), dir)
Table3 <- Table3[order(Table3[, 2], decreasing = TRUE), ]
Table3

Relative order of importance on the basis of Table1,2 and 3

Table1 and Table2 are consistent. However we observe some differences in the order in Table3.

# Creating Table0 (Ranking based on all the factors)

# Overall ranking of the factors (1 = most important), combining the
# evidence from the three tables above.
names <- c(
  "speed_ground", "speed_air", "aircraft", "pitch",
  "height", "no_pasg", "duration"
)
importance <- seq_len(7)
Table0 <- data.frame(names, importance)

Check collinearity

# Distance on ground speed alone.
l1 <- lm(formula = distance ~ speed_ground, data = FAA_final)
summary(l1)
## 
## Call:
## lm(formula = distance ~ speed_ground, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -897.09 -319.16  -72.09  210.83 1798.88 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1773.9407    67.8388  -26.15   <2e-16 ***
## speed_ground    41.4422     0.8302   49.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 448.1 on 829 degrees of freedom
## Multiple R-squared:  0.7504, Adjusted R-squared:  0.7501 
## F-statistic:  2492 on 1 and 829 DF,  p-value: < 2.2e-16
# Distance on air speed alone.
l2 <- lm(formula = distance ~ speed_air, data = FAA_final)
summary(l2)
## 
## Call:
## lm(formula = distance ~ speed_air, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -776.21 -196.39    8.72  209.17  624.34 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -5455.709    207.547  -26.29   <2e-16 ***
## speed_air      79.532      1.997   39.83   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 276.3 on 201 degrees of freedom
##   (628 observations deleted due to missingness)
## Multiple R-squared:  0.8875, Adjusted R-squared:  0.887 
## F-statistic:  1586 on 1 and 201 DF,  p-value: < 2.2e-16
# Distance on both speed measures together.
l3 <- lm(formula = distance ~ speed_ground + speed_air, data = FAA_final)
summary(l3)
## 
## Call:
## lm(formula = distance ~ speed_ground + speed_air, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -819.74 -202.02    3.52  211.25  636.25 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -5462.28     207.48 -26.327  < 2e-16 ***
## speed_ground   -14.37      12.68  -1.133    0.258    
## speed_air       93.96      12.89   7.291 6.99e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 276.1 on 200 degrees of freedom
##   (628 observations deleted due to missingness)
## Multiple R-squared:  0.8883, Adjusted R-squared:  0.8871 
## F-statistic:   795 on 2 and 200 DF,  p-value: < 2.2e-16

The sign of the speed_ground coefficient changes when it is fitted together with speed_air, compared with when it is fitted alone.

# Cor between speed_air and speed_ground

# Correlate the two speed measures on the rows where speed_air is recorded.
has_speed_air <- !is.na(FAA_final$speed_air)
cor(FAA_final$speed_air[has_speed_air], FAA_final$speed_ground[has_speed_air])
## [1] 0.9879383

The correlation is very high. I would keep only speed_ground in the model because it has no missing values, whereas speed_air has many.

Variable selection based on our ranking in Table 0

# Model with the top-ranked factor only.
L1 <- lm(formula = distance ~ speed_ground, data = FAA_final)
summary(L1)
## 
## Call:
## lm(formula = distance ~ speed_ground, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -897.09 -319.16  -72.09  210.83 1798.88 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1773.9407    67.8388  -26.15   <2e-16 ***
## speed_ground    41.4422     0.8302   49.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 448.1 on 829 degrees of freedom
## Multiple R-squared:  0.7504, Adjusted R-squared:  0.7501 
## F-statistic:  2492 on 1 and 829 DF,  p-value: < 2.2e-16
# R squared of the one-factor model.
r1 <- summary(L1)[["r.squared"]]

# Add the aircraft indicator.
L2 <- lm(formula = distance ~ speed_ground + aircraft, data = FAA_final)
summary(L2)
## 
## Call:
## lm(formula = distance ~ speed_ground + aircraft, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -749.29 -256.28  -67.79  150.40 1541.85 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1553.5112    58.0094  -26.78   <2e-16 ***
## speed_ground    41.9718     0.6958   60.32   <2e-16 ***
## aircraft      -491.4009    26.1192  -18.81   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 375.3 on 828 degrees of freedom
## Multiple R-squared:  0.8251, Adjusted R-squared:  0.8247 
## F-statistic:  1953 on 2 and 828 DF,  p-value: < 2.2e-16
# R squared of the two-factor model.
r2 <- summary(L2)[["r.squared"]]

# Add pitch.
L3 <- lm(
  formula = distance ~ speed_ground + aircraft + pitch,
  data = FAA_final
)
summary(L3)
## 
## Call:
## lm(formula = distance ~ speed_ground + aircraft + pitch, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -753.66 -255.01  -68.03  146.27 1594.06 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -1756.2213   126.6005 -13.872   <2e-16 ***
## speed_ground    42.0050     0.6951  60.428   <2e-16 ***
## aircraft      -473.6692    27.8806 -16.989   <2e-16 ***
## pitch           47.5871    26.4260   1.801   0.0721 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 374.8 on 827 degrees of freedom
## Multiple R-squared:  0.8258, Adjusted R-squared:  0.8252 
## F-statistic:  1307 on 3 and 827 DF,  p-value: < 2.2e-16
# R squared of the three-factor model.
r3 <- summary(L3)[["r.squared"]]

# Add height.
L4 <- lm(
  formula = distance ~ speed_ground + aircraft + pitch + height,
  data = FAA_final
)
summary(L4)
## 
## Call:
## lm(formula = distance ~ speed_ground + aircraft + pitch + height, 
##     data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -716.81 -224.12  -93.24  127.80 1500.95 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2183.0542   123.6415  -17.66   <2e-16 ***
## speed_ground    42.4283     0.6479   65.49   <2e-16 ***
## aircraft      -481.2682    25.9512  -18.55   <2e-16 ***
## pitch           39.6076    24.5991    1.61    0.108    
## height          14.0909     1.2398   11.37   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 348.7 on 826 degrees of freedom
## Multiple R-squared:  0.8494, Adjusted R-squared:  0.8486 
## F-statistic:  1164 on 4 and 826 DF,  p-value: < 2.2e-16
# R squared of the four-factor model.
r4 <- summary(L4)[["r.squared"]]

# Add number of passengers.
L5 <- lm(
  formula = distance ~ speed_ground + aircraft + pitch + height + no_pasg,
  data = FAA_final
)
summary(L5)
## 
## Call:
## lm(formula = distance ~ speed_ground + aircraft + pitch + height + 
##     no_pasg, data = FAA_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -686.17 -224.30  -91.72  124.71 1512.83 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2051.9159   156.6244 -13.101   <2e-16 ***
## speed_ground    42.4295     0.6475  65.524   <2e-16 ***
## aircraft      -480.6917    25.9412 -18.530   <2e-16 ***
## pitch           39.2066    24.5881   1.595    0.111    
## height          14.1703     1.2405  11.423   <2e-16 ***
## no_pasg         -2.2039     1.6172  -1.363    0.173    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 348.5 on 825 degrees of freedom
## Multiple R-squared:  0.8497, Adjusted R-squared:  0.8488 
## F-statistic: 932.9 on 5 and 825 DF,  p-value: < 2.2e-16
# R squared of the five-factor model.
r5 <- summary(L5)[["r.squared"]]

# Add duration.
L6 <- lm(
  formula = distance ~ speed_ground + aircraft + pitch + height + no_pasg +
    duration,
  data = FAA_final
)
summary(L6)
## 
## Call:
## lm(formula = distance ~ speed_ground + aircraft + pitch + height + 
##     no_pasg + duration, data = FAA_final)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -690.6 -224.5  -91.5  123.0 1481.6 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -2.025e+03  1.704e+02 -11.884   <2e-16 ***
## speed_ground  4.257e+01  6.680e-01  63.719   <2e-16 ***
## aircraft     -4.888e+02  2.699e+01 -18.106   <2e-16 ***
## pitch         1.965e+01  2.587e+01   0.760    0.448    
## height        1.429e+01  1.294e+00  11.038   <2e-16 ***
## no_pasg      -1.633e+00  1.673e+00  -0.976    0.329    
## duration      4.676e-02  2.609e-01   0.179    0.858    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 351 on 774 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.8506, Adjusted R-squared:  0.8494 
## F-statistic: 734.5 on 6 and 774 DF,  p-value: < 2.2e-16
# R squared of the six-factor model.
r6 <- summary(L6)[["r.squared"]]

# Number of predictors in models L1..L6.
num <- seq_len(6)

# R squared as a function of model size: model size goes on the x-axis and
# the fit statistic on the y-axis (the axes were previously swapped).
plot(num, c(r1, r2, r3, r4, r5, r6),
     type = "b", xlab = "Number of predictors", ylab = "R squared")

R squared increases as the number of variables increases.

# Adjusted R squared of each nested model L1..L6.
adj.r1 <- summary(L1)[["adj.r.squared"]]
adj.r2 <- summary(L2)[["adj.r.squared"]]
adj.r3 <- summary(L3)[["adj.r.squared"]]
adj.r4 <- summary(L4)[["adj.r.squared"]]
adj.r5 <- summary(L5)[["adj.r.squared"]]
adj.r6 <- summary(L6)[["adj.r.squared"]]

c(adj.r1, adj.r2, adj.r3, adj.r4, adj.r5, adj.r6)
## [1] 0.7500773 0.8247095 0.8251830 0.8486423 0.8487992 0.8494442
# Adjusted R squared as a function of model size: model size goes on the
# x-axis and the fit statistic on the y-axis (the axes were previously
# swapped).
plot(num, c(adj.r1, adj.r2, adj.r3, adj.r4, adj.r5, adj.r6),
     type = "b", xlab = "Number of predictors", ylab = "Adjusted R squared")

Adjusted R squared also increases, but each additional variable contributes a smaller increase.

# AIC

# AIC is only comparable between models fitted to the same observations.
# L6 includes `duration`, so lm drops the rows where it is missing and L6 is
# fitted on fewer rows than L1..L5 (781 vs 831 per the summaries above).
# That alone made its AIC look far smaller in the earlier recorded output
# (11379.88 vs about 12095 for L4 and L5). All six models are therefore
# refitted on the rows that are complete for every variable used by L6
# before AIC is computed; L1..L6 themselves are left untouched.
aic_vars <- all.vars(formula(L6))
aic_data <- FAA_final[complete.cases(FAA_final[, aic_vars]), ]

AIC1 <- AIC(update(L1, data = aic_data))
AIC2 <- AIC(update(L2, data = aic_data))
AIC3 <- AIC(update(L3, data = aic_data))
AIC4 <- AIC(update(L4, data = aic_data))
AIC5 <- AIC(update(L5, data = aic_data))
AIC6 <- AIC(update(L6, data = aic_data))

c(AIC1, AIC2, AIC3, AIC4, AIC5, AIC6)
# (re-knit to record the values for the common sample)

# AIC as a function of model size, with model size on the x-axis.
plot(num, c(AIC1, AIC2, AIC3, AIC4, AIC5, AIC6),
     type = "b", xlab = "Number of predictors", ylab = "AIC")

On the basis of Adj R Squared and AIC I would select model L4. That is the model with 4 predictors - (speed_ground,aircraft,pitch and height)

Variable selection based on an automated algorithm

# Forward selection starting from the intercept-only model.
# stepAIC can only add terms that appear in `scope`; without it the forward
# search stops immediately at `distance ~ 1` (as the earlier recorded output
# showed). The upper scope lists the candidate predictors present in the
# data passed to lm. Re-knit to record the selection trace.
Model1_LM <- lm(distance ~ 1, data = FAA_final[,c(1:3,5:7)])
fit1_LM <- stepAIC(
  Model1_LM,
  direction = "forward",
  scope = list(
    lower = ~ 1,
    upper = ~ aircraft + no_pasg + speed_ground + height + pitch
  )
)
# Full model (all candidate predictors) and null model (intercept only),
# both fitted on the same column subset of FAA_final.
fit1 <- lm(formula = distance ~ ., data = FAA_final[,c(1:3,5:7)])
fit2 <- lm(formula = distance ~ 1, data = FAA_final[,c(1:3,5:7)])

# Backward elimination from the full model.
stepAIC(object = fit1, direction = "backward")
## Start:  AIC=9734.9
## distance ~ aircraft + no_pasg + speed_ground + height + pitch
## 
##                Df Sum of Sq       RSS     AIC
## - no_pasg       1    225608 100445017  9734.8
## <none>                      100219409  9734.9
## - pitch         1    308863 100528273  9735.5
## - height        1  15851243 116070653  9854.9
## - aircraft      1  41711151 141930560 10022.1
## - speed_ground  1 521552033 621771442 11249.7
## 
## Step:  AIC=9734.77
## distance ~ aircraft + speed_ground + height + pitch
## 
##                Df Sum of Sq       RSS     AIC
## <none>                      100445017  9734.8
## - pitch         1    315259 100760276  9735.4
## - height        1  15708637 116153653  9853.5
## - aircraft      1  41822381 142267398 10022.0
## - speed_ground  1 521523226 621968242 11247.9
## 
## Call:
## lm(formula = distance ~ aircraft + speed_ground + height + pitch, 
##     data = FAA_final[, c(1:3, 5:7)])
## 
## Coefficients:
##  (Intercept)      aircraft  speed_ground        height         pitch  
##     -2183.05       -481.27         42.43         14.09         39.61
# Forward selection by AIC: start from the intercept-only model and add terms
# up to the full model.
stepAIC(
  fit2,
  scope = list(lower = fit2, upper = fit1),
  direction = "forward"
)
## Start:  AIC=11299.8
## distance ~ 1
## 
##                Df Sum of Sq       RSS   AIC
## + speed_ground  1 500382567 166457762 10148
## + aircraft      1  37818390 629021939 11253
## + height        1   6590108 660250221 11294
## + pitch         1   5050617 661789712 11296
## <none>                      666840329 11300
## + no_pasg       1    210253 666630076 11302
## 
## Step:  AIC=10148.53
## distance ~ speed_ground
## 
##            Df Sum of Sq       RSS     AIC
## + aircraft  1  49848656 116609106  9854.8
## + height    1  14916377 151541385 10072.5
## + pitch     1   9765095 156692668 10100.3
## <none>                  166457762 10148.5
## + no_pasg   1    207528 166250234 10149.5
## 
## Step:  AIC=9854.77
## distance ~ speed_ground + aircraft
## 
##           Df Sum of Sq       RSS    AIC
## + height   1  15848830 100760276 9735.4
## + pitch    1    455453 116153653 9853.5
## <none>                 116609106 9854.8
## + no_pasg  1     87171 116521935 9856.1
## 
## Step:  AIC=9735.37
## distance ~ speed_ground + aircraft + height
## 
##           Df Sum of Sq       RSS    AIC
## + pitch    1    315259 100445017 9734.8
## <none>                 100760276 9735.4
## + no_pasg  1    232003 100528273 9735.5
## 
## Step:  AIC=9734.77
## distance ~ speed_ground + aircraft + height + pitch
## 
##           Df Sum of Sq       RSS    AIC
## <none>                 100445017 9734.8
## + no_pasg  1    225608 100219409 9734.9
## 
## Call:
## lm(formula = distance ~ speed_ground + aircraft + height + pitch, 
##     data = FAA_final[, c(1:3, 5:7)])
## 
## Coefficients:
##  (Intercept)  speed_ground      aircraft        height         pitch  
##     -2183.05         42.43       -481.27         14.09         39.61
# Stepwise selection by AIC in both directions: start from the intercept-only
# model; at each step a term may be added or dropped, bounded by the full
# model.
stepAIC(
  fit2,
  scope = list(lower = fit2, upper = fit1),
  direction = "both"
)
## Start:  AIC=11299.8
## distance ~ 1
## 
##                Df Sum of Sq       RSS   AIC
## + speed_ground  1 500382567 166457762 10148
## + aircraft      1  37818390 629021939 11253
## + height        1   6590108 660250221 11294
## + pitch         1   5050617 661789712 11296
## <none>                      666840329 11300
## + no_pasg       1    210253 666630076 11302
## 
## Step:  AIC=10148.53
## distance ~ speed_ground
## 
##                Df Sum of Sq       RSS     AIC
## + aircraft      1  49848656 116609106  9854.8
## + height        1  14916377 151541385 10072.5
## + pitch         1   9765095 156692668 10100.3
## <none>                      166457762 10148.5
## + no_pasg       1    207528 166250234 10149.5
## - speed_ground  1 500382567 666840329 11299.8
## 
## Step:  AIC=9854.77
## distance ~ speed_ground + aircraft
## 
##                Df Sum of Sq       RSS     AIC
## + height        1  15848830 100760276  9735.4
## + pitch         1    455453 116153653  9853.5
## <none>                      116609106  9854.8
## + no_pasg       1     87171 116521935  9856.1
## - aircraft      1  49848656 166457762 10148.5
## - speed_ground  1 512412832 629021939 11253.3
## 
## Step:  AIC=9735.37
## distance ~ speed_ground + aircraft + height
## 
##                Df Sum of Sq       RSS     AIC
## + pitch         1    315259 100445017  9734.8
## <none>                      100760276  9735.4
## + no_pasg       1    232003 100528273  9735.5
## - height        1  15848830 116609106  9854.8
## - aircraft      1  50781109 151541385 10072.5
## - speed_ground  1 521208000 621968276 11245.9
## 
## Step:  AIC=9734.77
## distance ~ speed_ground + aircraft + height + pitch
## 
##                Df Sum of Sq       RSS     AIC
## <none>                      100445017  9734.8
## + no_pasg       1    225608 100219409  9734.9
## - pitch         1    315259 100760276  9735.4
## - height        1  15708637 116153653  9853.5
## - aircraft      1  41822381 142267398 10022.0
## - speed_ground  1 521523226 621968242 11247.9
## 
## Call:
## lm(formula = distance ~ speed_ground + aircraft + height + pitch, 
##     data = FAA_final[, c(1:3, 5:7)])
## 
## Coefficients:
##  (Intercept)  speed_ground      aircraft        height         pitch  
##     -2183.05         42.43       -481.27         14.09         39.61

On the basis of stepAIC forward variable selection, I would select the model with 4 predictors (speed_ground, aircraft, height and pitch), which is the model the forward search stops at above.