#Loading required libraries
library(readxl)
library(tidyverse)
library(plyr)
library(dplyr)
library(MASS)
Initial exploration of the data
# Read both FAA landing workbooks (paths are relative to the working directory)
FAA1 <- read_excel(path = "FAA1-1.xls")
FAA2 <- read_excel(path = "FAA2-1.xls")
# Column names and types of the first dataset
str(object = FAA1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 800 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# Dimensions of FAA1 once exact duplicate rows are dropped
FAA1 %>% unique() %>% dim()
## [1] 800 8
FAA1 has 800 rows and 8 columns
# Column names and types of the second dataset
str(object = FAA2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 150 obs. of 7 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# Dimensions of FAA2 once exact duplicate rows are dropped
FAA2 %>% unique() %>% dim()
## [1] 150 7
FAA2 has 150 rows and 7 columns. ‘duration’ column is not present in FAA2, but it is present in FAA1.
# Stack the two datasets; rbind.fill fills columns absent from one input with NA
combined <- rbind.fill(FAA1, FAA2)
# Row counts with and without duplicates, ignoring column 2
# (duration, going by the FAA1 column order printed by str(FAA1))
combined[, -2] %>% dim()
## [1] 950 7
combined[, -2] %>% unique() %>% dim()
## [1] 850 7
We have 100 duplicate records.
# De-duplicate on every column except column 2, then bring FAA1's remaining
# column back with a natural join on all shared columns
unique_combined <- combined[, -2] %>%
  unique() %>%
  left_join(y = FAA1)
## Joining, by = c("aircraft", "no_pasg", "speed_ground", "speed_air", "height",
## "pitch", "distance")
# Column names and types of the de-duplicated data
str(object = unique_combined)
## 'data.frame': 850 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
Dataset has 850 rows and 8 variables
#unique_combined$aircraft <- as.factor(unique_combined$aircraft)
# Per-column summaries (quartiles, means and NA counts) of the combined data
summary(object = unique_combined)
## aircraft no_pasg speed_ground speed_air
## Length:850 Min. :29.0 Min. : 27.74 Min. : 90.00
## Class :character 1st Qu.:55.0 1st Qu.: 65.90 1st Qu.: 96.25
## Mode :character Median :60.0 Median : 79.64 Median :101.15
## Mean :60.1 Mean : 79.45 Mean :103.80
## 3rd Qu.:65.0 3rd Qu.: 92.06 3rd Qu.:109.40
## Max. :87.0 Max. :141.22 Max. :141.72
## NA's :642
## height pitch distance duration
## Min. :-3.546 Min. :2.284 Min. : 34.08 Min. : 14.76
## 1st Qu.:23.314 1st Qu.:3.642 1st Qu.: 883.79 1st Qu.:119.49
## Median :30.093 Median :4.008 Median :1258.09 Median :153.95
## Mean :30.144 Mean :4.009 Mean :1526.02 Mean :154.01
## 3rd Qu.:36.993 3rd Qu.:4.377 3rd Qu.:1936.95 3rd Qu.:188.91
## Max. :59.946 Max. :5.927 Max. :6533.05 Max. :305.62
## NA's :50
# Boxplots of every numeric variable split by aircraft make, followed by the
# distribution of the response. Looping over column names replaces seven
# copy-pasted calls and labels the axes with the plain column names instead
# of the `unique_combined$...` expressions.
numeric_vars <- c(
  "duration", "speed_ground", "speed_air", "height",
  "pitch", "distance", "no_pasg"
)
for (v in numeric_vars) {
  boxplot(
    reformulate("aircraft", response = v),
    data = unique_combined,
    xlab = "aircraft",
    ylab = v
  )
}
hist(
  unique_combined$distance,
  main = "Histogram of distance",
  xlab = "distance"
)
Summary of findings
Data Cleaning and further exploration
# Checking abnormal values (each call prints the offending rows)
filter(unique_combined, duration < 40) # 5 observations with duration less than 40
filter(unique_combined, speed_ground < 30 | speed_ground > 140) # 3 observations with abnormal speed_ground
filter(unique_combined, !is.na(speed_air) & (speed_air < 30 | speed_air > 140)) # 1 observation with abnormal speed_air
filter(unique_combined, height < 6) # 10 observations with abnormal height
filter(unique_combined, distance > 6000) # 2 observations with abnormal distance
# Cleaning data based on requirements: comma-separated conditions are ANDed;
# a missing duration or speed_air is allowed through
FAA_final <- filter(
  unique_combined,
  is.na(duration) | duration > 40,
  speed_ground >= 30,
  speed_ground <= 140,
  is.na(speed_air) | (speed_air >= 30 & speed_air <= 140),
  height >= 6,
  distance < 6000
)
dim(x = FAA_final)
## [1] 831 8
The dataset has 831 rows after removing the abnormal values. 19 rows have been removed.
# Column names and types after cleaning
str(object = FAA_final)
## 'data.frame': 831 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
# Per-column summaries of the cleaned data
summary(object = FAA_final)
## aircraft no_pasg speed_ground speed_air
## Length:831 Min. :29.00 Min. : 33.57 Min. : 90.00
## Class :character 1st Qu.:55.00 1st Qu.: 66.20 1st Qu.: 96.23
## Mode :character Median :60.00 Median : 79.79 Median :101.12
## Mean :60.06 Mean : 79.54 Mean :103.48
## 3rd Qu.:65.00 3rd Qu.: 91.91 3rd Qu.:109.36
## Max. :87.00 Max. :132.78 Max. :132.91
## NA's :628
## height pitch distance duration
## Min. : 6.228 Min. :2.284 Min. : 41.72 Min. : 41.95
## 1st Qu.:23.530 1st Qu.:3.640 1st Qu.: 893.28 1st Qu.:119.63
## Median :30.167 Median :4.001 Median :1262.15 Median :154.28
## Mean :30.458 Mean :4.005 Mean :1522.48 Mean :154.78
## 3rd Qu.:37.004 3rd Qu.:4.370 3rd Qu.:1936.63 3rd Qu.:189.66
## Max. :59.946 Max. :5.927 Max. :5381.96 Max. :305.62
## NA's :50
# Histogram of the variables to visualize the distribution.
# A loop over column names replaces the repeated calls and gives each plot a
# readable title and axis label.
hist_vars <- c(
  "no_pasg", "speed_ground", "speed_air", "height",
  "pitch", "distance", "duration"
)
for (v in hist_vars) {
  hist(FAA_final[[v]], main = paste("Histogram of", v), xlab = v)
}
# Boxplots of the numerical variables, split by aircraft make
box_vars <- c(
  "duration", "speed_ground", "speed_air", "height",
  "pitch", "distance", "no_pasg"
)
for (v in box_vars) {
  boxplot(
    reformulate("aircraft", response = v),
    data = FAA_final,
    xlab = "aircraft",
    ylab = v
  )
}
Summary of findings
Initial analysis for identifying important factors that impact the response variable “landing distance”
# Compute the pairwise correlation between the landing distance and each
# factor X and store it in Table1, ranked by absolute size.
names_variables <- colnames(FAA_final)[c(2, 3, 5, 6, 4, 8)]
# speed_air and duration contain NAs, so each correlation is computed on the
# rows that are complete for that particular pair.
cor_values <- as.numeric(cor(
  FAA_final$distance,
  FAA_final[, names_variables],
  use = "pairwise.complete.obs"
))
size_of_cor <- abs(cor_values)
# Take the sign from the computed value instead of typing the labels by hand,
# so the table cannot drift from the data.
direction <- ifelse(cor_values < 0, "negative", "positive")
Table1 <- data.frame(names_variables, size_of_cor, direction)
Table1 <- Table1[order(Table1$size_of_cor, decreasing = TRUE), ]
Table1
# Creating XY plots: landing distance (x axis) against each candidate
# predictor (y axis), one plot per predictor, with plain column names as labels
xy_vars <- c(
  "duration", "no_pasg", "speed_ground",
  "speed_air", "height", "pitch"
)
for (v in xy_vars) {
  plot(FAA_final$distance, FAA_final[[v]], xlab = "distance", ylab = v)
}
On the basis of the plots we observe a clear relationship between distance and speed_ground, and between distance and speed_air.
These relationships are consistent with the high correlations computed for these variables. The other variables do not show any linear relationship with distance, which is consistent with their small correlations.
# Encoding aircraft (airbus is coded as 1 and boeing as 0)
# A vectorised lookup replaces the row loop that was hard-coded to 831 rows
# and that errored on a missing aircraft value. Any label other than
# "airbus"/"boeing" becomes NA. The guard makes the chunk safe to re-run once
# the column is already numeric.
if (!is.numeric(FAA_final$aircraft)) {
  aircraft_codes <- c(boeing = 0, airbus = 1)
  FAA_final$aircraft <- unname(aircraft_codes[as.character(FAA_final$aircraft)])
}
# Correlation of the 0/1 indicator with landing distance, and the matching plot
cor_aircraft <- cor(FAA_final$aircraft, FAA_final$distance)
plot(FAA_final$aircraft, FAA_final$distance)
From the negative correlation and the plot we can conclude that Boeing aircraft tend to have longer landing distances than Airbus aircraft.
Regression using a single factor each time
# Simple regression of distance on the aircraft indicator
lm_1 <- lm(formula = distance ~ aircraft, data = FAA_final)
summary(object = lm_1)
##
## Call:
## lm(formula = distance ~ aircraft, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1281.6 -630.3 -229.4 388.2 3631.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1750.98 44.28 39.54 < 2e-16 ***
## aircraft -427.67 60.58 -7.06 3.53e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 871.1 on 829 degrees of freedom
## Multiple R-squared: 0.05671, Adjusted R-squared: 0.05557
## F-statistic: 49.84 on 1 and 829 DF, p-value: 3.526e-12
# Sign and p-value (4th coefficient column) of the aircraft slope, kept for Table2
d_1 <- "negative"
p_1 <- coef(summary(lm_1))[, 4][2]
# Simple regression of distance on no_pasg
lm_2 <- lm(formula = distance ~ no_pasg, data = FAA_final)
summary(object = lm_2)
##
## Call:
## lm(formula = distance ~ no_pasg, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1444.8 -622.8 -271.3 414.6 3884.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1650.076 251.460 6.562 9.35e-11 ***
## no_pasg -2.125 4.155 -0.511 0.609
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 896.7 on 829 degrees of freedom
## Multiple R-squared: 0.0003153, Adjusted R-squared: -0.0008906
## F-statistic: 0.2615 on 1 and 829 DF, p-value: 0.6093
# Sign and p-value of the no_pasg slope, kept for Table2
d_2 <- "negative"
p_2 <- coef(summary(lm_2))[, 4][2]
# Simple regression of distance on speed_ground
lm_3 <- lm(formula = distance ~ speed_ground, data = FAA_final)
summary(object = lm_3)
##
## Call:
## lm(formula = distance ~ speed_ground, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -897.09 -319.16 -72.09 210.83 1798.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1773.9407 67.8388 -26.15 <2e-16 ***
## speed_ground 41.4422 0.8302 49.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 448.1 on 829 degrees of freedom
## Multiple R-squared: 0.7504, Adjusted R-squared: 0.7501
## F-statistic: 2492 on 1 and 829 DF, p-value: < 2.2e-16
# Sign and p-value of the speed_ground slope, kept for Table2
d_3 <- "positive"
p_3 <- coef(summary(lm_3))[, 4][2]
# Simple regression of distance on speed_air (rows with NA speed_air are dropped by lm)
lm_4 <- lm(formula = distance ~ speed_air, data = FAA_final)
summary(object = lm_4)
##
## Call:
## lm(formula = distance ~ speed_air, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -776.21 -196.39 8.72 209.17 624.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5455.709 207.547 -26.29 <2e-16 ***
## speed_air 79.532 1.997 39.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.3 on 201 degrees of freedom
## (628 observations deleted due to missingness)
## Multiple R-squared: 0.8875, Adjusted R-squared: 0.887
## F-statistic: 1586 on 1 and 201 DF, p-value: < 2.2e-16
# Sign and p-value of the speed_air slope, kept for Table2
d_4 <- "positive"
p_4 <- coef(summary(lm_4))[, 4][2]
# Simple regression of distance on height
lm_5 <- lm(formula = distance ~ height, data = FAA_final)
summary(object = lm_5)
##
## Call:
## lm(formula = distance ~ height, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1338.5 -606.9 -253.8 388.5 3932.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1245.116 101.272 12.295 < 2e-16 ***
## height 9.107 3.166 2.877 0.00412 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 892.4 on 829 degrees of freedom
## Multiple R-squared: 0.009883, Adjusted R-squared: 0.008688
## F-statistic: 8.274 on 1 and 829 DF, p-value: 0.004124
# Sign and p-value of the height slope, kept for Table2
d_5 <- "positive"
p_5 <- coef(summary(lm_5))[, 4][2]
# Simple regression of distance on pitch
lm_6 <- lm(formula = distance ~ pitch, data = FAA_final)
summary(object = lm_6)
##
## Call:
## lm(formula = distance ~ pitch, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1338.1 -644.3 -241.2 402.5 3839.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 929.1 237.9 3.905 0.000102 ***
## pitch 148.1 58.9 2.515 0.012081 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 893.5 on 829 degrees of freedom
## Multiple R-squared: 0.007574, Adjusted R-squared: 0.006377
## F-statistic: 6.327 on 1 and 829 DF, p-value: 0.01208
# Sign and p-value of the pitch slope, kept for Table2
d_6 <- "positive"
p_6 <- coef(summary(lm_6))[, 4][2]
# Simple regression of distance on duration (rows with NA duration are dropped by lm)
lm_7 <- lm(formula = distance ~ duration, data = FAA_final)
summary(object = lm_7)
##
## Call:
## lm(formula = distance ~ duration, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1464.9 -615.6 -274.7 408.5 3847.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1689.9942 108.5452 15.569 <2e-16 ***
## duration -0.9613 0.6694 -1.436 0.151
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 904 on 779 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.00264, Adjusted R-squared: 0.00136
## F-statistic: 2.062 on 1 and 779 DF, p-value: 0.1514
# Sign and p-value of the duration slope
d_7 <- "negative"
p_7 <- coef(summary(lm_7))[, 4][2]
# Table2: one row per predictor, ranked from smallest to largest p-value
p_value <- c(p_1, p_2, p_3, p_4, p_5, p_6, p_7)
dir <- c(d_1, d_2, d_3, d_4, d_5, d_6, d_7)
names <- colnames(FAA_final)[c(1:6, 8)]
Table2 <- data.frame(names, p_value, dir)
Table2 <- Table2[order(Table2$p_value), ]
Table2
# Standardizing each X variable
# Every predictor is centred and scaled by its own mean and standard deviation
# (computed over its non-missing values) and stored as <name>_s, so each slope
# below is "change in distance per one standard deviation of X" and the
# slopes can be compared across predictors.
predictors <- c(
  "aircraft", "no_pasg", "speed_ground", "speed_air",
  "height", "pitch", "duration"
)
for (v in predictors) {
  x <- FAA_final[[v]]
  FAA_final[[paste0(v, "_s")]] <- (x - mean(x, na.rm = TRUE)) / sd(x, na.rm = TRUE)
}

# One simple regression per standardized predictor.
# FIX: only lmm_1 used its *_s column before; lmm_2..lmm_7 were fitted on the
# raw variables, so Table3 ranked coefficients measured in different units.
lmm_1 <- lm(distance ~ aircraft_s, FAA_final)
summary(lmm_1)
##
## Call:
## lm(formula = distance ~ aircraft_s, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1281.6 -630.3 -229.4 388.2 3631.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1522.48 30.22 50.38 < 2e-16 ***
## aircraft_s -213.46 30.24 -7.06 3.53e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 871.1 on 829 degrees of freedom
## Multiple R-squared: 0.05671, Adjusted R-squared: 0.05557
## F-statistic: 49.84 on 1 and 829 DF, p-value: 3.526e-12
lmm_2 <- lm(distance ~ no_pasg_s, FAA_final)
summary(lmm_2)
lmm_3 <- lm(distance ~ speed_ground_s, FAA_final)
summary(lmm_3)
lmm_4 <- lm(distance ~ speed_air_s, FAA_final)
summary(lmm_4)
lmm_5 <- lm(distance ~ height_s, FAA_final)
summary(lmm_5)
lmm_6 <- lm(distance ~ pitch_s, FAA_final)
summary(lmm_6)
lmm_7 <- lm(distance ~ duration_s, FAA_final)
summary(lmm_7)
## NOTE: the summaries previously recorded for lmm_2..lmm_7 came from the
## raw-scale fits and are omitted here; re-knit to regenerate them. Slope
## t values, p-values and R-squared are unaffected by standardizing; only the
## estimates and standard errors are rescaled.

# Table3: standardized slope of each predictor, ranked by absolute size.
# The slopes are read straight from the fits (element 2 = the predictor's
# coefficient) and the direction is taken from the sign, instead of reusing
# p_1..p_7 and hand-typed labels.
fits <- list(lmm_1, lmm_2, lmm_3, lmm_4, lmm_5, lmm_6, lmm_7)
size_coff <- unlist(lapply(fits, function(fit) coef(fit)[2]))
dir <- unname(ifelse(size_coff < 0, "negative", "positive"))
names <- paste0(predictors, "_s")
Table3 <- data.frame(names, abs(size_coff), dir)
Table3 <- Table3[order(Table3[, 2], decreasing = TRUE), ]
# NOTE(review): with all seven slopes on a common scale the ordering may
# differ from earlier renders; re-check the written comparison of Tables 1-3.
Table3
Relative order of importance on the basis of Table1,2 and 3
Table1 and Table2 are consistent. However we observe some differences in the order in Table3.
# Creating Table0 (Ranking based on all the factors): predictors listed from
# most to least important, numbered 1..7
names <- c(
  "speed_ground", "speed_air", "aircraft", "pitch",
  "height", "no_pasg", "duration"
)
importance <- seq_along(names)
Table0 <- data.frame(names, importance)
Check collinearity
# distance on speed_ground alone
l1 <- lm(formula = distance ~ speed_ground, data = FAA_final)
summary(object = l1)
##
## Call:
## lm(formula = distance ~ speed_ground, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -897.09 -319.16 -72.09 210.83 1798.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1773.9407 67.8388 -26.15 <2e-16 ***
## speed_ground 41.4422 0.8302 49.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 448.1 on 829 degrees of freedom
## Multiple R-squared: 0.7504, Adjusted R-squared: 0.7501
## F-statistic: 2492 on 1 and 829 DF, p-value: < 2.2e-16
# distance on speed_air alone
l2 <- lm(formula = distance ~ speed_air, data = FAA_final)
summary(object = l2)
##
## Call:
## lm(formula = distance ~ speed_air, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -776.21 -196.39 8.72 209.17 624.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5455.709 207.547 -26.29 <2e-16 ***
## speed_air 79.532 1.997 39.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.3 on 201 degrees of freedom
## (628 observations deleted due to missingness)
## Multiple R-squared: 0.8875, Adjusted R-squared: 0.887
## F-statistic: 1586 on 1 and 201 DF, p-value: < 2.2e-16
# distance on both speed measures together
l3 <- lm(formula = distance ~ speed_ground + speed_air, data = FAA_final)
summary(object = l3)
##
## Call:
## lm(formula = distance ~ speed_ground + speed_air, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -819.74 -202.02 3.52 211.25 636.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5462.28 207.48 -26.327 < 2e-16 ***
## speed_ground -14.37 12.68 -1.133 0.258
## speed_air 93.96 12.89 7.291 6.99e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 200 degrees of freedom
## (628 observations deleted due to missingness)
## Multiple R-squared: 0.8883, Adjusted R-squared: 0.8871
## F-statistic: 795 on 2 and 200 DF, p-value: < 2.2e-16
We observe that the sign of the speed_ground coefficient changes when it is fitted together with speed_air, compared with when it is fitted alone.
# Correlation between speed_air and speed_ground, using only the rows where
# both values are present
cor(FAA_final$speed_air, FAA_final$speed_ground, use = "complete.obs")
## [1] 0.9879383
The correlation is very high. I would only keep speed_ground in the model as it does not have NAs. On the other hand speed_air has a lot of NAs.
Variable selection based on our ranking in Table 0
# Model with the top-ranked predictor only
L1 <- lm(formula = distance ~ speed_ground, data = FAA_final)
summary(object = L1)
##
## Call:
## lm(formula = distance ~ speed_ground, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -897.09 -319.16 -72.09 210.83 1798.88
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1773.9407 67.8388 -26.15 <2e-16 ***
## speed_ground 41.4422 0.8302 49.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 448.1 on 829 degrees of freedom
## Multiple R-squared: 0.7504, Adjusted R-squared: 0.7501
## F-statistic: 2492 on 1 and 829 DF, p-value: < 2.2e-16
# R-squared of L1, then add aircraft
r1 <- summary(L1)[["r.squared"]]
L2 <- lm(formula = distance ~ speed_ground + aircraft, data = FAA_final)
summary(object = L2)
##
## Call:
## lm(formula = distance ~ speed_ground + aircraft, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -749.29 -256.28 -67.79 150.40 1541.85
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1553.5112 58.0094 -26.78 <2e-16 ***
## speed_ground 41.9718 0.6958 60.32 <2e-16 ***
## aircraft -491.4009 26.1192 -18.81 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 375.3 on 828 degrees of freedom
## Multiple R-squared: 0.8251, Adjusted R-squared: 0.8247
## F-statistic: 1953 on 2 and 828 DF, p-value: < 2.2e-16
# R-squared of L2, then add pitch
r2 <- summary(L2)[["r.squared"]]
L3 <- lm(formula = distance ~ speed_ground + aircraft + pitch, data = FAA_final)
summary(object = L3)
##
## Call:
## lm(formula = distance ~ speed_ground + aircraft + pitch, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -753.66 -255.01 -68.03 146.27 1594.06
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1756.2213 126.6005 -13.872 <2e-16 ***
## speed_ground 42.0050 0.6951 60.428 <2e-16 ***
## aircraft -473.6692 27.8806 -16.989 <2e-16 ***
## pitch 47.5871 26.4260 1.801 0.0721 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 374.8 on 827 degrees of freedom
## Multiple R-squared: 0.8258, Adjusted R-squared: 0.8252
## F-statistic: 1307 on 3 and 827 DF, p-value: < 2.2e-16
# R-squared of L3, then add height
r3 <- summary(L3)[["r.squared"]]
L4 <- lm(
  formula = distance ~ speed_ground + aircraft + pitch + height,
  data = FAA_final
)
summary(object = L4)
##
## Call:
## lm(formula = distance ~ speed_ground + aircraft + pitch + height,
## data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -716.81 -224.12 -93.24 127.80 1500.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2183.0542 123.6415 -17.66 <2e-16 ***
## speed_ground 42.4283 0.6479 65.49 <2e-16 ***
## aircraft -481.2682 25.9512 -18.55 <2e-16 ***
## pitch 39.6076 24.5991 1.61 0.108
## height 14.0909 1.2398 11.37 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 348.7 on 826 degrees of freedom
## Multiple R-squared: 0.8494, Adjusted R-squared: 0.8486
## F-statistic: 1164 on 4 and 826 DF, p-value: < 2.2e-16
# R-squared of L4, then add no_pasg
r4 <- summary(L4)[["r.squared"]]
L5 <- lm(
  formula = distance ~ speed_ground + aircraft + pitch + height + no_pasg,
  data = FAA_final
)
summary(object = L5)
##
## Call:
## lm(formula = distance ~ speed_ground + aircraft + pitch + height +
## no_pasg, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -686.17 -224.30 -91.72 124.71 1512.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2051.9159 156.6244 -13.101 <2e-16 ***
## speed_ground 42.4295 0.6475 65.524 <2e-16 ***
## aircraft -480.6917 25.9412 -18.530 <2e-16 ***
## pitch 39.2066 24.5881 1.595 0.111
## height 14.1703 1.2405 11.423 <2e-16 ***
## no_pasg -2.2039 1.6172 -1.363 0.173
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 348.5 on 825 degrees of freedom
## Multiple R-squared: 0.8497, Adjusted R-squared: 0.8488
## F-statistic: 932.9 on 5 and 825 DF, p-value: < 2.2e-16
# R-squared of L5, then add duration (rows with NA duration are dropped by lm)
r5 <- summary(L5)[["r.squared"]]
L6 <- lm(
  formula = distance ~ speed_ground + aircraft + pitch + height + no_pasg +
    duration,
  data = FAA_final
)
summary(object = L6)
##
## Call:
## lm(formula = distance ~ speed_ground + aircraft + pitch + height +
## no_pasg + duration, data = FAA_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -690.6 -224.5 -91.5 123.0 1481.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2.025e+03 1.704e+02 -11.884 <2e-16 ***
## speed_ground 4.257e+01 6.680e-01 63.719 <2e-16 ***
## aircraft -4.888e+02 2.699e+01 -18.106 <2e-16 ***
## pitch 1.965e+01 2.587e+01 0.760 0.448
## height 1.429e+01 1.294e+00 11.038 <2e-16 ***
## no_pasg -1.633e+00 1.673e+00 -0.976 0.329
## duration 4.676e-02 2.609e-01 0.179 0.858
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 351 on 774 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.8506, Adjusted R-squared: 0.8494
## F-statistic: 734.5 on 6 and 774 DF, p-value: < 2.2e-16
# R-squared of L6, then plot R-squared (x axis) against model size (y axis)
r6 <- summary(L6)[["r.squared"]]
num <- seq_len(6)
plot(x = c(r1, r2, r3, r4, r5, r6), y = num)
R squared increases as the number of variables increases.
# Adjusted R-squared of the six nested models, printed and then plotted
# (x axis) against model size (y axis)
adj.r1 <- summary(L1)[["adj.r.squared"]]
adj.r2 <- summary(L2)[["adj.r.squared"]]
adj.r3 <- summary(L3)[["adj.r.squared"]]
adj.r4 <- summary(L4)[["adj.r.squared"]]
adj.r5 <- summary(L5)[["adj.r.squared"]]
adj.r6 <- summary(L6)[["adj.r.squared"]]
c(adj.r1, adj.r2, adj.r3, adj.r4, adj.r5, adj.r6)
## [1] 0.7500773 0.8247095 0.8251830 0.8486423 0.8487992 0.8494442
plot(x = c(adj.r1, adj.r2, adj.r3, adj.r4, adj.r5, adj.r6), y = num)
Adjusted R squared also increases, but the gain from each additional variable is smaller.
# AIC
# L6 contains duration, so lm() dropped the 50 rows where duration is NA and
# fitted on 781 rows, while L1..L5 used all 831. AIC depends on the number of
# observations, so the earlier AIC6 was not comparable with AIC1..AIC5.
# Every candidate is therefore refitted on the rows L6 actually used.
complete_rows <- FAA_final[!is.na(FAA_final$duration), ]
AIC1 <- AIC(update(L1, data = complete_rows))
AIC2 <- AIC(update(L2, data = complete_rows))
AIC3 <- AIC(update(L3, data = complete_rows))
AIC4 <- AIC(update(L4, data = complete_rows))
AIC5 <- AIC(update(L5, data = complete_rows))
AIC6 <- AIC(L6)
c(AIC1, AIC2, AIC3, AIC4, AIC5, AIC6)
## (AIC1..AIC5 change with this fix; re-knit to regenerate the printed vector)
plot(c(AIC1, AIC2, AIC3, AIC4, AIC5, AIC6), num)
On the basis of Adj R Squared and AIC I would select model L4. That is the model with 4 predictors - (speed_ground,aircraft,pitch and height)
Variable selection based on an automated algorithm
# Forward selection starting from the intercept-only model.
# When no scope is given, stepAIC() treats the starting model as the upper
# bound, so a forward search has nothing to add and stops immediately (the
# trace recorded earlier ended at the start line). The explicit upper formula
# lists the five predictors present in the selected columns.
Model1_LM <- lm(distance ~ 1, data = FAA_final[, c(1:3, 5:7)])
fit1_LM <- stepAIC(
  Model1_LM,
  direction = "forward",
  scope = list(
    lower = ~ 1,
    upper = ~ aircraft + no_pasg + speed_ground + height + pitch
  )
)
## Start: AIC=11299.8
## distance ~ 1
## (the trace now continues through the forward steps; re-knit to regenerate)
# Full model (every remaining column as predictor) and intercept-only model,
# both on the same column subset
fit1 <- lm(formula = distance ~ ., data = FAA_final[, c(1:3, 5:7)])
fit2 <- lm(formula = distance ~ 1, data = FAA_final[, c(1:3, 5:7)])
# Backward elimination starting from the full model
stepAIC(object = fit1, direction = "backward")
## Start: AIC=9734.9
## distance ~ aircraft + no_pasg + speed_ground + height + pitch
##
## Df Sum of Sq RSS AIC
## - no_pasg 1 225608 100445017 9734.8
## <none> 100219409 9734.9
## - pitch 1 308863 100528273 9735.5
## - height 1 15851243 116070653 9854.9
## - aircraft 1 41711151 141930560 10022.1
## - speed_ground 1 521552033 621771442 11249.7
##
## Step: AIC=9734.77
## distance ~ aircraft + speed_ground + height + pitch
##
## Df Sum of Sq RSS AIC
## <none> 100445017 9734.8
## - pitch 1 315259 100760276 9735.4
## - height 1 15708637 116153653 9853.5
## - aircraft 1 41822381 142267398 10022.0
## - speed_ground 1 521523226 621968242 11247.9
##
## Call:
## lm(formula = distance ~ aircraft + speed_ground + height + pitch,
## data = FAA_final[, c(1:3, 5:7)])
##
## Coefficients:
## (Intercept) aircraft speed_ground height pitch
## -2183.05 -481.27 42.43 14.09 39.61
# Forward selection from the intercept-only model up to the full model
stepAIC(
  object = fit2,
  scope = list(lower = fit2, upper = fit1),
  direction = "forward"
)
## Start: AIC=11299.8
## distance ~ 1
##
## Df Sum of Sq RSS AIC
## + speed_ground 1 500382567 166457762 10148
## + aircraft 1 37818390 629021939 11253
## + height 1 6590108 660250221 11294
## + pitch 1 5050617 661789712 11296
## <none> 666840329 11300
## + no_pasg 1 210253 666630076 11302
##
## Step: AIC=10148.53
## distance ~ speed_ground
##
## Df Sum of Sq RSS AIC
## + aircraft 1 49848656 116609106 9854.8
## + height 1 14916377 151541385 10072.5
## + pitch 1 9765095 156692668 10100.3
## <none> 166457762 10148.5
## + no_pasg 1 207528 166250234 10149.5
##
## Step: AIC=9854.77
## distance ~ speed_ground + aircraft
##
## Df Sum of Sq RSS AIC
## + height 1 15848830 100760276 9735.4
## + pitch 1 455453 116153653 9853.5
## <none> 116609106 9854.8
## + no_pasg 1 87171 116521935 9856.1
##
## Step: AIC=9735.37
## distance ~ speed_ground + aircraft + height
##
## Df Sum of Sq RSS AIC
## + pitch 1 315259 100445017 9734.8
## <none> 100760276 9735.4
## + no_pasg 1 232003 100528273 9735.5
##
## Step: AIC=9734.77
## distance ~ speed_ground + aircraft + height + pitch
##
## Df Sum of Sq RSS AIC
## <none> 100445017 9734.8
## + no_pasg 1 225608 100219409 9734.9
##
## Call:
## lm(formula = distance ~ speed_ground + aircraft + height + pitch,
## data = FAA_final[, c(1:3, 5:7)])
##
## Coefficients:
## (Intercept) speed_ground aircraft height pitch
## -2183.05 42.43 -481.27 14.09 39.61
# Stepwise selection (terms may be added or dropped at each step) between the
# intercept-only model and the full model
stepAIC(
  object = fit2,
  scope = list(lower = fit2, upper = fit1),
  direction = "both"
)
## Start: AIC=11299.8
## distance ~ 1
##
## Df Sum of Sq RSS AIC
## + speed_ground 1 500382567 166457762 10148
## + aircraft 1 37818390 629021939 11253
## + height 1 6590108 660250221 11294
## + pitch 1 5050617 661789712 11296
## <none> 666840329 11300
## + no_pasg 1 210253 666630076 11302
##
## Step: AIC=10148.53
## distance ~ speed_ground
##
## Df Sum of Sq RSS AIC
## + aircraft 1 49848656 116609106 9854.8
## + height 1 14916377 151541385 10072.5
## + pitch 1 9765095 156692668 10100.3
## <none> 166457762 10148.5
## + no_pasg 1 207528 166250234 10149.5
## - speed_ground 1 500382567 666840329 11299.8
##
## Step: AIC=9854.77
## distance ~ speed_ground + aircraft
##
## Df Sum of Sq RSS AIC
## + height 1 15848830 100760276 9735.4
## + pitch 1 455453 116153653 9853.5
## <none> 116609106 9854.8
## + no_pasg 1 87171 116521935 9856.1
## - aircraft 1 49848656 166457762 10148.5
## - speed_ground 1 512412832 629021939 11253.3
##
## Step: AIC=9735.37
## distance ~ speed_ground + aircraft + height
##
## Df Sum of Sq RSS AIC
## + pitch 1 315259 100445017 9734.8
## <none> 100760276 9735.4
## + no_pasg 1 232003 100528273 9735.5
## - height 1 15848830 116609106 9854.8
## - aircraft 1 50781109 151541385 10072.5
## - speed_ground 1 521208000 621968276 11245.9
##
## Step: AIC=9734.77
## distance ~ speed_ground + aircraft + height + pitch
##
## Df Sum of Sq RSS AIC
## <none> 100445017 9734.8
## + no_pasg 1 225608 100219409 9734.9
## - pitch 1 315259 100760276 9735.4
## - height 1 15708637 116153653 9853.5
## - aircraft 1 41822381 142267398 10022.0
## - speed_ground 1 521523226 621968242 11247.9
##
## Call:
## lm(formula = distance ~ speed_ground + aircraft + height + pitch,
## data = FAA_final[, c(1:3, 5:7)])
##
## Coefficients:
## (Intercept) speed_ground aircraft height pitch
## -2183.05 42.43 -481.27 14.09 39.61
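On the basis of stepAIC forward variable selection I would select the model with 4 predictors (speed_ground, aircraft, height and pitch), which is the model at which the forward, backward and stepwise searches all stop.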