#read excel files
library("readxl")
faa1 <- read_excel('faa1.xls')
faa2 <- read_excel('faa2.xls')
str(faa1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 800 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
FAA1 data has sample size = 800 and 8 variables/features
str(faa2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 150 obs. of 7 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
FAA2 data has sample size = 150 and 7 variables/features. It does not contain the ‘duration’ variable which is present in FAA1
#merge datasets
library('plyr')
faa <- rbind.fill(faa1,faa2)
str(faa)
## 'data.frame': 950 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
#check for duplicates - the duplicates are calculated without including the duration column as this was not present in faa1
sum(duplicated(faa[,c(1,3:8)]))
## [1] 100
100 duplicated records are found. We will remove these records
library('tidyverse')
faa.no.dup <- distinct(faa,aircraft,no_pasg,speed_ground,speed_air,height,pitch,distance,.keep_all = T)
str(faa.no.dup)
## 'data.frame': 850 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
The merged dataset has sample size = 850 and 8 variables/features
summary(faa.no.dup)
## aircraft duration no_pasg speed_ground
## Length:850 Min. : 14.76 Min. :29.0 Min. : 27.74
## Class :character 1st Qu.:119.49 1st Qu.:55.0 1st Qu.: 65.90
## Mode :character Median :153.95 Median :60.0 Median : 79.64
## Mean :154.01 Mean :60.1 Mean : 79.45
## 3rd Qu.:188.91 3rd Qu.:65.0 3rd Qu.: 92.06
## Max. :305.62 Max. :87.0 Max. :141.22
## NA's :50
## speed_air height pitch distance
## Min. : 90.00 Min. :-3.546 Min. :2.284 Min. : 34.08
## 1st Qu.: 96.25 1st Qu.:23.314 1st Qu.:3.642 1st Qu.: 883.79
## Median :101.15 Median :30.093 Median :4.008 Median :1258.09
## Mean :103.80 Mean :30.144 Mean :4.009 Mean :1526.02
## 3rd Qu.:109.40 3rd Qu.:36.993 3rd Qu.:4.377 3rd Qu.:1936.95
## Max. :141.72 Max. :59.946 Max. :5.927 Max. :6533.05
## NA's :642
#check abnormal values
height_abnormal <- sum(faa.no.dup$height<6)
duration_abnormal <- sum(faa.no.dup$duration<40)
speed_ground_abnormal <- sum(faa.no.dup$speed_ground<30 | faa.no.dup$speed_ground > 140)
speed_air_abnormal <- sum(faa.no.dup$speed_air<30 | faa.no.dup$speed_air > 140)
height_abnormal;duration_abnormal;speed_ground_abnormal;speed_air_abnormal
## [1] 10
## [1] NA
## [1] 3
## [1] NA
10 rows were found that contained abnormal values for height and 3 rows contained abnormal values for speed_ground
#remove rows with abnormal values
faa.no.dup <- faa.no.dup[faa.no.dup$height > 6,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground > 30,]
faa.no.dup <- faa.no.dup[faa.no.dup$speed_ground < 140,]
str(faa.no.dup)
## 'data.frame': 837 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
#plot histograms for the datasets
par(mfrow = c(2,4))
numericcols <- c('duration','no_pasg','speed_ground','speed_air','height','pitch','distance')
for (i in numericcols){
hist(faa.no.dup[[i]], xlab = i,main='')
}
#encoding aircarft column a 0/1
faa.temp <- faa.no.dup
faa.temp$aircraft <- ifelse(faa.temp$aircraft == 'boeing', 1, 0)
#compute pairwise correlation
corr <- cor(faa.no.dup$distance,faa.temp[,-8],use = "pairwise.complete.obs")
table1 <- tibble(
variable = c('aircraft','duration','no_pasg','speed_ground','speed_air','height','pitch'),
corr_coef = corr[1:7],
)
table1$dir <- ifelse(table1$corr_coef > 0, "positive", "negative")
table1 %>% arrange(desc(abs(corr_coef)))
## # A tibble: 7 x 3
## variable corr_coef dir
## <chr> <dbl> <chr>
## 1 speed_air 0.944 positive
## 2 speed_ground 0.867 positive
## 3 aircraft 0.243 positive
## 4 height 0.115 positive
## 5 pitch 0.0935 positive
## 6 duration -0.0645 negative
## 7 no_pasg -0.0175 negative
#plot scatterplots for the datasets
par(mfrow = c(2,4))
plot(faa.temp$aircraft, faa.temp$distance,xlab = 'aircraft',ylab = 'distance')
plot(faa.temp$duration, faa.temp$distance,xlab = 'duration',ylab = 'distance')
plot(faa.temp$no_pasg, faa.temp$distance,xlab = 'no_pasg',ylab = 'distance')
plot(faa.temp$speed_ground, faa.temp$distance,xlab = 'speed_ground',ylab = 'distance')
plot(faa.temp$speed_air, faa.temp$distance,xlab = 'speed_air',ylab = 'distance')
plot(faa.temp$height, faa.temp$distance,xlab = 'height',ylab = 'distance')
plot(faa.temp$pitch, faa.temp$distance,xlab = 'pitch',ylab = 'distance')
As can be observed from both the table and the scatterplot the correlations of distance with speed_Air and speed_groud are significantly high
vars <- c('aircraft','duration','no_pasg','speed_ground','speed_air','height','pitch')
fittedpvalues <- vector()
fittedcoeffs <- vector()
fittedpvalues[1] <- summary(lm(distance~aircraft,data = faa.temp))$coefficients[2,4]
fittedcoeffs[1] <- summary(lm(distance~aircraft,data = faa.temp))$coefficients[2,1]
fittedpvalues[2] <- summary(lm(distance~duration,data = faa.temp))$coefficients[2,4]
fittedcoeffs[2] <- summary(lm(distance~duration,data = faa.temp))$coefficients[2,1]
fittedpvalues[3] <- summary(lm(distance~no_pasg,data = faa.temp))$coefficients[2,4]
fittedcoeffs[3] <- summary(lm(distance~no_pasg,data = faa.temp))$coefficients[2,1]
fittedpvalues[4] <- summary(lm(distance~speed_ground,data = faa.temp))$coefficients[2,4]
fittedcoeffs[4] <- summary(lm(distance~speed_ground,data = faa.temp))$coefficients[2,1]
fittedpvalues[5] <- summary(lm(distance~speed_air,data = faa.temp))$coefficients[2,4]
fittedcoeffs[5] <- summary(lm(distance~speed_air,data = faa.temp))$coefficients[2,1]
fittedpvalues[6] <- summary(lm(distance~height,data = faa.temp))$coefficients[2,4]
fittedcoeffs[6] <- summary(lm(distance~height,data = faa.temp))$coefficients[2,1]
fittedpvalues[7] <- summary(lm(distance~pitch,data = faa.temp))$coefficients[2,4]
fittedcoeffs[7] <- summary(lm(distance~pitch,data = faa.temp))$coefficients[2,1]
table2 <- tibble(
variable = vars,
pvalues = fittedpvalues[1:7]
)
table2$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table2 %>% arrange(abs(pvalues))
## # A tibble: 7 x 3
## variable pvalues dir
## <chr> <dbl> <chr>
## 1 speed_ground 2.90e-254 positive
## 2 speed_air 2.18e-100 positive
## 3 aircraft 1.08e- 12 positive
## 4 height 8.99e- 4 positive
## 5 pitch 6.80e- 3 positive
## 6 duration 7.04e- 2 negative
## 7 no_pasg 6.14e- 1 negative
#standardize each variable
faa.std <- faa.temp
distance <- faa.std[,8]
faa.std <- scale(faa.temp[,-8])
faa.std <- data.frame(cbind(faa.std,distance))
summary(faa.std)
## aircraft duration no_pasg
## Min. :-0.9358 Min. :-2.82092 Min. :-4.152686
## 1st Qu.:-0.9358 1st Qu.:-0.71008 1st Qu.:-0.674881
## Median :-0.9358 Median : 0.00347 Median :-0.006073
## Mean : 0.0000 Mean : 0.00000 Mean : 0.000000
## 3rd Qu.: 1.0674 3rd Qu.: 0.71700 3rd Qu.: 0.662736
## Max. : 1.0674 Max. : 3.07649 Max. : 3.605494
## NA's :50
## speed_ground speed_air height
## Min. :-2.44824 Min. :-1.3698 Min. :-2.47685
## 1st Qu.:-0.71483 1st Qu.:-0.7432 1st Qu.:-0.70663
## Median : 0.01055 Median :-0.2511 Median :-0.03164
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.66202 3rd Qu.: 0.5777 3rd Qu.: 0.66634
## Max. : 3.02769 Max. : 3.3017 Max. : 2.99870
## NA's :630
## pitch distance
## Min. :-3.264440 Min. : 41.72
## 1st Qu.:-0.691831 1st Qu.: 896.58
## Median :-0.003058 Median :1264.93
## Mean : 0.000000 Mean :1531.77
## 3rd Qu.: 0.690761 3rd Qu.:1943.13
## Max. : 3.645477 Max. :6309.95
##
#regress distance on standardized variables
fittedpvalues <- vector()
fittedcoeffs <- vector()
fittedpvalues[1] <- summary(lm(distance~aircraft,data = faa.std))$coefficients[2,4]
fittedcoeffs[1] <- summary(lm(distance~aircraft,data = faa.std))$coefficients[2,1]
fittedpvalues[2] <- summary(lm(distance~duration,data = faa.std))$coefficients[2,4]
fittedcoeffs[2] <- summary(lm(distance~duration,data = faa.std))$coefficients[2,1]
fittedpvalues[3] <- summary(lm(distance~no_pasg,data = faa.std))$coefficients[2,4]
fittedcoeffs[3] <- summary(lm(distance~no_pasg,data = faa.std))$coefficients[2,1]
fittedpvalues[4] <- summary(lm(distance~speed_ground,data = faa.std))$coefficients[2,4]
fittedcoeffs[4] <- summary(lm(distance~speed_ground,data = faa.std))$coefficients[2,1]
fittedpvalues[5] <- summary(lm(distance~speed_air,data = faa.std))$coefficients[2,4]
fittedcoeffs[5] <- summary(lm(distance~speed_air,data = faa.std))$coefficients[2,1]
fittedpvalues[6] <- summary(lm(distance~height,data = faa.std))$coefficients[2,4]
fittedcoeffs[6] <- summary(lm(distance~height,data = faa.std))$coefficients[2,1]
fittedpvalues[7] <- summary(lm(distance~pitch,data = faa.std))$coefficients[2,4]
fittedcoeffs[7] <- summary(lm(distance~pitch,data = faa.std))$coefficients[2,1]
table3 <- tibble(
variable = vars,
pvalues = fittedpvalues[1:7]
)
table3$dir <- ifelse(fittedcoeffs >= 0, "positive", "negative")
table3 %>% arrange(abs(pvalues))
## # A tibble: 7 x 3
## variable pvalues dir
## <chr> <dbl> <chr>
## 1 speed_ground 2.90e-254 positive
## 2 speed_air 2.18e-100 positive
## 3 aircraft 1.08e- 12 positive
## 4 height 8.99e- 4 positive
## 5 pitch 6.80e- 3 positive
## 6 duration 7.04e- 2 negative
## 7 no_pasg 6.14e- 1 negative
On comparing the three tables 1,2 and 3, we find that the results are consistent except in the case of Table1 where speed_air is ranked above speed_ground while in Table 2 and Table 3 , speed_ground is ranked above speed_air in terms of importance.We will create a table that follows the order of ranking in the above tables without keeping speed_air since there are almost 80% missing values in this variable
table0 <- tibble(
var_by_importance = c('speed_ground','aircraft','height','pitch','duration','no_pasg')
)
table0
## # A tibble: 6 x 1
## var_by_importance
## <chr>
## 1 speed_ground
## 2 aircraft
## 3 height
## 4 pitch
## 5 duration
## 6 no_pasg
#Compare regression coefficients for distance vs speed_ground, distance vs speed_air, distance vs speed_ground+speed_air
cat('Regression Coefficient with speed_ground as predictor is',summary(lm(distance~speed_ground,data = faa.std))$coefficients[2,1],'\n')
## Regression Coefficient with speed_ground as predictor is 791.1548
cat('Regression Coefficient with speed_air as predictor is',summary(lm(distance~speed_air,data = faa.std))$coefficients[2,1],'\n')
## Regression Coefficient with speed_air as predictor is 805.2946
cat('Regression Coefficient with both speed_ground and speed_air as predictors are',summary(lm(distance~speed_ground+speed_air,data = faa.std))$coefficients[2,1],'and',summary(lm(distance~speed_ground+speed_air,data = faa.std))$coefficients[3,1],'respectively')
## Regression Coefficient with both speed_ground and speed_air as predictors are -288.4484 and 958.2226 respectively
There is a significant change in the 3 models. when only speed_ground is used as a variable, regression coefficient is 791.1548 while it is 805.2946 when speed_air is selected. When both are selected, coefficient for speed_ground becomes negative while speed_air coefficient further increases
# check correlation between speed_ground and speed_air variables
cor(faa.std$speed_ground,faa.std$speed_air,use = "pairwise.complete.obs")
## [1] 0.9885722
This ia a very high correlation and will lead to multicollinearity issue in the model if both variables are used. In our case, we will use the speed_ground variable because it has no missing values while the speed_air has almost 80% missing values. Also in the model where both variables are used, p-value for speed_ground is 0.238 making it insignificant which is not correct.
rsquared.model1 <- summary(lm(distance ~ speed_ground,data = faa.std))$r.squared
rsquared.model2 <- summary(lm(distance ~ speed_ground+aircraft,data = faa.std))$r.squared
rsquared.model3 <- summary(lm(distance ~ speed_ground+aircraft+height,data = faa.std))$r.squared
rsquared.model4 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch,data = faa.std))$r.squared
rsquared.model5 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch+duration,data = faa.std))$r.squared
rsquared.model6 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch+duration+no_pasg,data = faa.std))$r.squared
plot(c(1,2,3,4,5,6),c(rsquared.model1,rsquared.model2,rsquared.model3,rsquared.model4,rsquared.model5,rsquared.model6),type="b",ylab ="Rsquared", xlab ="The number of predictors")
The R-squared value keeps on increasing until we add 4 variables(speed_ground+speed_air+aircraft+height) after which it becomes constant.
adj.rsquared.model1 <- summary(lm(distance ~ speed_ground,data = faa.no.dup))$adj.r.squared
adj.rsquared.model2 <- summary(lm(distance ~ speed_ground+aircraft,data = faa.no.dup))$adj.r.squared
adj.rsquared.model3 <- summary(lm(distance ~ speed_ground+aircraft+height,data = faa.no.dup))$adj.r.squared
adj.rsquared.model4 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch,data = faa.no.dup))$adj.r.squared
adj.rsquared.model5 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch+duration,data = faa.no.dup))$adj.r.squared
adj.rsquared.model6 <- summary(lm(distance ~ speed_ground+aircraft+height+pitch+duration+no_pasg,data = faa.no.dup))$adj.r.squared
plot(c(1,2,3,4,5,6),c(adj.rsquared.model1,adj.rsquared.model2,adj.rsquared.model3,adj.rsquared.model4,adj.rsquared.model5,adj.rsquared.model6),type="b",ylab ="Adjusted Rsquared", xlab ="The number of predictors")
The adjusted R square model follows a similar pattern as the R squared statistic.
aic.model1 <- AIC(lm(distance ~ speed_ground,data = faa.std))
aic.model2 <- AIC(lm(distance ~ speed_ground+aircraft,data = faa.std))
aic.model3 <- AIC(lm(distance ~ speed_ground+aircraft+height,data = faa.std))
aic.model4 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch,data = faa.std))
aic.model5 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch+duration,data = faa.std))
aic.model6 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch+duration + no_pasg,data = faa.std))
plot(c(1,2,3,4,5,6),c(aic.model1,aic.model2,aic.model3,aic.model4,aic.model5,aic.model6),type="b",ylab ="Adjusted Rsquared", xlab ="The number of predictors")
The above chart does not look quite right. There is a sudden drop in AIC when 5th predictor(duration) is added. Let’s remove this variable and draw AIC plot again.
aic.model1 <- AIC(lm(distance ~ speed_ground,data = faa.std))
aic.model2 <- AIC(lm(distance ~ speed_ground+aircraft,data = faa.std))
aic.model3 <- AIC(lm(distance ~ speed_ground+aircraft+height,data = faa.std))
aic.model4 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch,data = faa.std))
aic.model5 <- AIC(lm(distance ~ speed_ground+aircraft+height+pitch+no_pasg,data = faa.std))
plot(c(1,2,3,4,5),c(aic.model1,aic.model2,aic.model3,aic.model4,aic.model5),type="b",ylab ="Adjusted Rsquared", xlab ="The number of predictors")
Using the R-squared, adjusted R Squared as well as AIC, we can conclude that the following 3 variables can be used for building the linear regression model - speed_ground, aircraft and height
require(MASS)
full.model <- lm(distance ~ speed_ground + aircraft + height + pitch + no_pasg,data= faa.std)
step.model <- stepAIC(full.model, direction = "forward", trace = FALSE)
summary(step.model)
##
## Call:
## lm(formula = distance ~ speed_ground + aircraft + height + pitch +
## no_pasg, data = faa.std)
##
## Residuals:
## Min 1Q Median 3Q Max
## -696.92 -226.28 -91.59 123.89 1877.20
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1531.77 12.23 125.268 <2e-16 ***
## speed_ground 806.73 12.26 65.813 <2e-16 ***
## aircraft 242.75 13.10 18.529 <2e-16 ***
## height 143.12 12.27 11.668 <2e-16 ***
## pitch 21.73 13.10 1.659 0.0976 .
## no_pasg -15.42 12.25 -1.258 0.2086
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 353.8 on 831 degrees of freedom
## Multiple R-squared: 0.8508, Adjusted R-squared: 0.8499
## F-statistic: 947.4 on 5 and 831 DF, p-value: < 2.2e-16
Using the automated model, we can again see the 3 most important variables are : speed_ground, aircraft and height