# reading input data
df1 <- read_excel("FAA1.xls", sheet = "FAA1")
df2 <- read_excel("FAA2.xls", sheet = "FAA2")
# structure of data
str(df1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 800 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# structure of data
str(df2)
## Classes 'tbl_df', 'tbl' and 'data.frame': 150 obs. of 7 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# checking how many rows are duplicated
FAA <- bind_rows(df1,df2)
FAA %>%
select(-duration) %>%
duplicated() %>%
sum() %>%
print()
## [1] 100
# removing duplicated rows
index <-
FAA %>%
select(-duration) %>%
duplicated() %>%
which()
FAA <- FAA[-index,]
# structure of data
str(FAA)
## Classes 'tbl_df', 'tbl' and 'data.frame': 850 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# summary statistics
summary(select(FAA, -aircraft))
## duration no_pasg speed_ground speed_air
## Min. : 14.76 Min. :29.0 Min. : 27.74 Min. : 90.00
## 1st Qu.:119.49 1st Qu.:55.0 1st Qu.: 65.90 1st Qu.: 96.25
## Median :153.95 Median :60.0 Median : 79.64 Median :101.15
## Mean :154.01 Mean :60.1 Mean : 79.45 Mean :103.80
## 3rd Qu.:188.91 3rd Qu.:65.0 3rd Qu.: 92.06 3rd Qu.:109.40
## Max. :305.62 Max. :87.0 Max. :141.22 Max. :141.72
## NA's :50 NA's :642
## height pitch distance
## Min. :-3.546 Min. :2.284 Min. : 34.08
## 1st Qu.:23.314 1st Qu.:3.642 1st Qu.: 883.79
## Median :30.093 Median :4.008 Median :1258.09
## Mean :30.144 Mean :4.009 Mean :1526.02
## 3rd Qu.:36.993 3rd Qu.:4.377 3rd Qu.:1936.95
## Max. :59.946 Max. :5.927 Max. :6533.05
##
# distribution of aircraft variable
table(FAA$aircraft)
##
## airbus boeing
## 450 400
# removing abnormal values
FAA1<-
FAA %>%
filter((is.na(duration) | duration >40),
(speed_ground >=30 | speed_ground <=140),
(is.na(speed_air) | speed_air >=30 | speed_air <=140),
height >=6, distance < 6000)
print(nrow(FAA) - nrow(FAA1))
## [1] 17
# structure of data
str(FAA1)
## Classes 'tbl_df', 'tbl' and 'data.frame': 833 obs. of 8 variables:
## $ aircraft : chr "boeing" "boeing" "boeing" "boeing" ...
## $ duration : num 98.5 125.7 112 196.8 90.1 ...
## $ no_pasg : num 53 69 61 56 70 55 54 57 61 56 ...
## $ speed_ground: num 107.9 101.7 71.1 85.8 59.9 ...
## $ speed_air : num 109 103 NA NA NA ...
## $ height : num 27.4 27.8 18.6 30.7 32.4 ...
## $ pitch : num 4.04 4.12 4.43 3.88 4.03 ...
## $ distance : num 3370 2988 1145 1664 1050 ...
# summary statistics
summary(select(FAA1, -aircraft))
## duration no_pasg speed_ground speed_air
## Min. : 41.95 Min. :29.00 Min. : 27.74 Min. : 90.00
## 1st Qu.:119.67 1st Qu.:55.00 1st Qu.: 66.08 1st Qu.: 96.23
## Median :154.28 Median :60.00 Median : 79.75 Median :101.12
## Mean :154.83 Mean :60.04 Mean : 79.42 Mean :103.48
## 3rd Qu.:189.75 3rd Qu.:65.00 3rd Qu.: 91.87 3rd Qu.:109.36
## Max. :305.62 Max. :87.00 Max. :132.78 Max. :132.91
## NA's :50 NA's :630
## height pitch distance
## Min. : 6.228 Min. :2.284 Min. : 41.72
## 1st Qu.:23.530 1st Qu.:3.641 1st Qu.: 893.58
## Median :30.159 Median :4.004 Median :1262.15
## Mean :30.442 Mean :4.006 Mean :1521.71
## 3rd Qu.:36.995 3rd Qu.:4.371 3rd Qu.:1936.01
## Max. :59.946 Max. :5.927 Max. :5381.96
##
# distribution of aircraft variable
table(FAA1$aircraft)
##
## airbus boeing
## 444 389
# pairwise plots
ggpairs(FAA1)
# calculating average pitch by aircraft
tapply(FAA1$pitch, FAA1$aircraft, mean)
## airbus boeing
## 3.831139 4.205725
# calculating average distance by aircraft
tapply(FAA1$distance, FAA1$aircraft, mean)
## airbus boeing
## 1323.317 1748.152
# calculating magnitude and sign of correlation of all variables with distance
table1 <-
cor(select(FAA1, -aircraft), use="complete.obs") %>%
as.data.frame() %>%
mutate(variable = rownames(.)) %>%
select(variable, correlation = distance) %>%
filter(variable != "distance") %>%
mutate(sign = ifelse(correlation > 0, "positive", "negative"),
correlation = abs(correlation)) %>%
arrange(desc(correlation))
table1
## variable correlation sign
## 1 speed_air 0.94321897 positive
## 2 speed_ground 0.92877195 positive
## 3 height 0.05775639 positive
## 4 duration 0.05241698 positive
## 5 pitch 0.03402263 positive
## 6 no_pasg 0.03258255 negative
# pairwise plots
ggpairs(FAA1)
# calculating magnitude and sign of correlation of all variables with distance for aircraft = airbus
corr_airbus <-
cor(select(filter(FAA1, aircraft == "airbus"), -aircraft)
, use="complete.obs") %>%
as.data.frame() %>%
mutate(variable = rownames(.)) %>%
select(variable, correlation = distance) %>%
filter(variable != "distance") %>%
mutate(sign = ifelse(correlation > 0, "positive", "negative"),
correlation = abs(correlation),
make = "airbus") %>%
arrange(desc(correlation)) %>%
select(make, everything())
# calculating magnitude and sign of correlation of all variables with distance for aircraft = boeing
corr_boeing <-
cor(select(filter(FAA1, aircraft == "boeing"), -aircraft)
, use="complete.obs") %>%
as.data.frame() %>%
mutate(variable = rownames(.)) %>%
select(variable, correlation = distance) %>%
filter(variable != "distance") %>%
mutate(sign = ifelse(correlation > 0, "positive", "negative"),
correlation = abs(correlation),
make="boeing") %>%
arrange(desc(correlation)) %>%
select(make, everything())
# appending the above two tables
bind_rows(corr_airbus,corr_boeing)
## make variable correlation sign
## 1 airbus speed_air 0.96526576 positive
## 2 airbus speed_ground 0.93993656 positive
## 3 airbus height 0.16388434 positive
## 4 airbus no_pasg 0.09282119 negative
## 5 airbus pitch 0.07153580 positive
## 6 airbus duration 0.04167093 positive
## 7 boeing speed_air 0.97759838 positive
## 8 boeing speed_ground 0.96809860 positive
## 9 boeing pitch 0.08529579 negative
## 10 boeing duration 0.07121179 positive
## 11 boeing height 0.01637134 positive
## 12 boeing no_pasg 0.01578596 positive
# Regressing distance on all other variables
model <- lm(distance ~ ., data=FAA1)
# calculating the p-value and sign of correlation of all variables in the above regression model
table2 <-
summary(model)$coefficients[-1,c(1,4)] %>%
as.data.frame() %>%
mutate(variable = rownames(.),
direction_coeff = ifelse(Estimate >0, "positive", "negative")) %>%
select(variable, 'Pr(>|t|)', direction_coeff) %>%
arrange(.[,2])
table2
## variable Pr(>|t|) direction_coeff
## 1 aircraftboeing 5.616468e-50 positive
## 2 height 1.907468e-28 positive
## 3 speed_air 2.714575e-28 positive
## 4 no_pasg 1.521782e-01 negative
## 5 pitch 4.693869e-01 negative
## 6 duration 5.321979e-01 positive
## 7 speed_ground 5.811038e-01 negative
# normalizing numerical variables to have mean=0 and sd=1
FAA_normalized <-
apply(FAA1[,-1], 2, function(x) (x-mean(x, na.rm = TRUE))/sd(x, na.rm = TRUE)) %>%
as.data.frame() %>%
cbind(FAA1[,1])
# Regressing distance on all normalized variables
model <- lm(distance ~ ., data=FAA_normalized)
# calculating the coefficient and sign of coefficient in the above regression model
table3 <-
summary(model)$coefficients[-1,c(1), drop=FALSE] %>%
as.data.frame() %>%
mutate(variable = rownames(.),
direction_coeff = ifelse(Estimate >0, "positive", "negative"),
Estimate = abs(Estimate)) %>%
select(variable, coefficient = Estimate, direction_coeff) %>%
arrange(desc(coefficient))
table3
## variable coefficient direction_coeff
## 1 speed_air 0.930187613 positive
## 2 aircraftboeing 0.489092199 positive
## 3 height 0.149343800 positive
## 4 speed_ground 0.074773453 negative
## 5 no_pasg 0.016590213 negative
## 6 pitch 0.007928255 negative
## 7 duration 0.006889602 positive
We can see that table2 and table3 agree with each other. However, results from table1 are not consistent with the other two:
table1
## variable correlation sign
## 1 speed_air 0.94321897 positive
## 2 speed_ground 0.92877195 positive
## 3 height 0.05775639 positive
## 4 duration 0.05241698 positive
## 5 pitch 0.03402263 positive
## 6 no_pasg 0.03258255 negative
table2
## variable Pr(>|t|) direction_coeff
## 1 aircraftboeing 5.616468e-50 positive
## 2 height 1.907468e-28 positive
## 3 speed_air 2.714575e-28 positive
## 4 no_pasg 1.521782e-01 negative
## 5 pitch 4.693869e-01 negative
## 6 duration 5.321979e-01 positive
## 7 speed_ground 5.811038e-01 negative
table3
## variable coefficient direction_coeff
## 1 speed_air 0.930187613 positive
## 2 aircraftboeing 0.489092199 positive
## 3 height 0.149343800 positive
## 4 speed_ground 0.074773453 negative
## 5 no_pasg 0.016590213 negative
## 6 pitch 0.007928255 negative
## 7 duration 0.006889602 positive
# running 3 separate regression models with different input variables
model1 <- lm(distance ~ speed_ground, data=FAA1)
model2 <- lm(distance ~ speed_air, data=FAA1)
model3 <- lm(distance ~ speed_ground + speed_air, data=FAA1)
summary(model1)
##
## Call:
## lm(formula = distance ~ speed_ground, data = FAA1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -904.18 -319.13 -75.69 213.51 1912.03
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1720.6284 68.3579 -25.17 <2e-16 ***
## speed_ground 40.8252 0.8374 48.75 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 456 on 831 degrees of freedom
## Multiple R-squared: 0.7409, Adjusted R-squared: 0.7406
## F-statistic: 2377 on 1 and 831 DF, p-value: < 2.2e-16
summary(model2)
##
## Call:
## lm(formula = distance ~ speed_air, data = FAA1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -776.21 -196.39 8.72 209.17 624.34
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5455.709 207.547 -26.29 <2e-16 ***
## speed_air 79.532 1.997 39.83 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.3 on 201 degrees of freedom
## (630 observations deleted due to missingness)
## Multiple R-squared: 0.8875, Adjusted R-squared: 0.887
## F-statistic: 1586 on 1 and 201 DF, p-value: < 2.2e-16
summary(model3)
##
## Call:
## lm(formula = distance ~ speed_ground + speed_air, data = FAA1)
##
## Residuals:
## Min 1Q Median 3Q Max
## -819.74 -202.02 3.52 211.25 636.25
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -5462.28 207.48 -26.327 < 2e-16 ***
## speed_ground -14.37 12.68 -1.133 0.258
## speed_air 93.96 12.89 7.291 6.99e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 276.1 on 200 degrees of freedom
## (630 observations deleted due to missingness)
## Multiple R-squared: 0.8883, Adjusted R-squared: 0.8871
## F-statistic: 795 on 2 and 200 DF, p-value: < 2.2e-16
# correlation between speed_ground and speed_air
cor(FAA1$speed_ground, FAA1$speed_air, use = "complete.obs")
## [1] 0.9879383
# running 6 separate regression models with different input variables
model1 <- lm(distance ~ aircraft, data=FAA1)
model2 <- lm(distance ~ aircraft+height, data=FAA1)
model3 <- lm(distance ~ aircraft+height+speed_air, data=FAA1)
model4 <- lm(distance ~ aircraft+height+speed_air+no_pasg, data=FAA1)
model5 <- lm(distance ~ aircraft+height+speed_air+no_pasg+duration, data=FAA1)
model6 <- lm(distance ~ aircraft+height+speed_air+no_pasg+duration+speed_ground
, data=FAA1)
# a function that takes a regression model (model1, model2 etc.) and criteria string (r square, adjusted r-square, aic etc. ) as input and provides the the criteria value and the number of input variables in the model
n_criteria_value <- function(model, criteria)
{
n = dim(summary(model)$coefficients)[1]-1
if(criteria == "r_squared")
{
criteria_value <- summary(model)$r.squared
} else if(criteria == "adj_r_squared")
{
criteria_value <- summary(model)$adj.r.squared
} else if(criteria == "aic")
{
criteria_value <- AIC(model)
}
return(c(criteria_value, n))
}
# calculating r-square and number of input variables in model1 till model6
r_square <-
rbind(n_criteria_value(model1,"r_squared"),
n_criteria_value(model2,"r_squared"),
n_criteria_value(model3,"r_squared"),
n_criteria_value(model4,"r_squared"),
n_criteria_value(model5,"r_squared"),
n_criteria_value(model6,"r_squared")) %>%
as.data.frame() %>%
select(r_square = V1, n = V2)
r_square
## r_square n
## 1 0.05609854 1
## 2 0.06686502 2
## 3 0.97372092 3
## 4 0.97411172 4
## 5 0.97460905 5
## 6 0.97463989 6
# plotting r-square vs number of input variables
ggplot(data=r_square, aes(x=n, y=r_square)) +
geom_line() +
scale_x_continuous(breaks=1:6)
# calculating adjusted r-square and number of input variables in model1 till model6
adj_r_square <-
rbind(n_criteria_value(model1,"adj_r_squared"),
n_criteria_value(model2,"adj_r_squared"),
n_criteria_value(model3,"adj_r_squared"),
n_criteria_value(model4,"adj_r_squared"),
n_criteria_value(model5,"adj_r_squared"),
n_criteria_value(model6,"adj_r_squared")) %>%
as.data.frame() %>%
select(adj_r_square = V1, n = V2)
adj_r_square
## adj_r_square n
## 1 0.05496268 1
## 2 0.06461650 2
## 3 0.97332476 3
## 4 0.97358872 4
## 5 0.97393733 5
## 6 0.97383052 6
# plotting adjusted r-square vs number of input variables
ggplot(data=adj_r_square, aes(x=n, y=adj_r_square)) +
geom_line() +
scale_x_continuous(breaks=1:6)
# calculating aic and number of input variables in model1 till model6
aic <-
rbind(n_criteria_value(model1,"aic"),
n_criteria_value(model2,"aic"),
n_criteria_value(model3,"aic"),
n_criteria_value(model4,"aic"),
n_criteria_value(model5,"aic"),
n_criteria_value(model6,"aic")) %>%
as.data.frame() %>%
select(aic = V1, n = V2)
aic
## aic n
## 1 13645.148 1
## 2 13637.592 2
## 3 2571.310 3
## 4 2570.268 4
## 5 2471.476 5
## 6 2473.239 6
# plotting aic vs number of input variables
ggplot(data=aic, aes(x=n, y=aic)) +
geom_line() +
scale_x_continuous(breaks=1:6)
This is because post n=3, the adjusted r-square and aic remains almost constant. Thus, there is no use of adding more variables to the model.
library(SignifReg)
# The range of variables that the forward-selection algorithm will examine
scope <- distance ~ aircraft+height+speed_air+no_pasg+duration+speed_ground
# Building the final model using forward-selection algorithm
model <- SignifReg(scope=scope,
data=FAA1,
alpha=0.05,
direction="forward",
criterion="r-adj",
correction="FDR")
summary(model)
##
## Call:
## lm(formula = reg, data = data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -300.74 -94.78 15.47 97.09 330.41
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6390.376 109.839 -58.18 <2e-16 ***
## speed_air 82.148 0.976 84.17 <2e-16 ***
## aircraftboeing 427.442 19.173 22.29 <2e-16 ***
## height 13.702 1.007 13.60 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 134.2 on 199 degrees of freedom
## (630 observations deleted due to missingness)
## Multiple R-squared: 0.9737, Adjusted R-squared: 0.9733
## F-statistic: 2458 on 3 and 199 DF, p-value: < 2.2e-16