Goal: To study what factors and how they impact the landing distance of a commercial flight

Summary:

  1. The first step is to import and clean the data so that it is ready for the next step i.e. statistical analysis. So, I imported the files, explored the data, identified extreme values (abnormal), merged the datasets and removed duplicate observations. Along with this, I also looked at the statistical summary and explored missing data. The number of observations left after merging, removing the duplicate rows and removing the abnormal values was 840.

  2. Next step was to analyze the data using data visualization, correlation and regression techniques. The goal of this step was to find the relationship between the response variable (distance) and the predictors like speed_ground, speed_air, etc. (if any) using correlation and regression analysis. Also, I visualized the dataset using the scatter plot. Visualization is not enough as we need to see if the relationship is statistically significant. So, I moved on to Correlation analysis. Correlation helps to identify if there is any relationship between the predictor variable and the response. One thing where correlation fails is that it doesn’t explain the relationship between the response variable and all the predictor variables together. So, the next step I did was multiple regression. After performing multiple regression and considering the issue of multicollinearity, I found that speed_ground was not significant.

  3. Finally, I applied StepAIC to perform forward variable selection based. Speed_air, aircraft, height and no_pasg predictors were selected based on lowest AIC value.

# Loading the relevant libraries

library(readxl)
library(dplyr)
library(ggplot2)
library(tidyr)
library(purrr)
library(MASS)
# Importing the Excel files -----------------------------------------------


faa1 <- read_excel("C:/Users/ashis/Downloads/Stat Modelling/Week 1/FAA1.xls")
faa2 <- read_excel("C:/Users/ashis/Downloads/Stat Modelling/Week 1/FAA2.xls")


# Exploring the data ------------------------------------------------------


str(faa1)
## Classes 'tbl_df', 'tbl' and 'data.frame':    800 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
str(faa2)
## Classes 'tbl_df', 'tbl' and 'data.frame':    150 obs. of  7 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
head(faa1)
## # A tibble: 6 x 8
##   aircraft duration no_pasg speed_ground speed_air height pitch distance
##   <chr>       <dbl>   <dbl>        <dbl>     <dbl>  <dbl> <dbl>    <dbl>
## 1 boeing       98.5      53        108.       109.   27.4  4.04    3370.
## 2 boeing      126.       69        102.       103.   27.8  4.12    2988.
## 3 boeing      112.       61         71.1       NA    18.6  4.43    1145.
## 4 boeing      197.       56         85.8       NA    30.7  3.88    1664.
## 5 boeing       90.1      70         59.9       NA    32.4  4.03    1050.
## 6 boeing      138.       55         75.0       NA    41.2  4.20    1627.
head(faa2)
## # A tibble: 6 x 7
##   aircraft no_pasg speed_ground speed_air height pitch distance
##   <chr>      <dbl>        <dbl>     <dbl>  <dbl> <dbl>    <dbl>
## 1 boeing        53        108.       109.   27.4  4.04    3370.
## 2 boeing        69        102.       103.   27.8  4.12    2988.
## 3 boeing        61         71.1       NA    18.6  4.43    1145.
## 4 boeing        56         85.8       NA    30.7  3.88    1664.
## 5 boeing        70         59.9       NA    32.4  4.03    1050.
## 6 boeing        55         75.0       NA    41.2  4.20    1627.
dim(faa1)
## [1] 800   8
dim(faa2)
## [1] 150   7
identical(faa1,faa2)
## [1] FALSE
# Creating an additional column 'duration' in FAA2 --------------------------



faa2 <- faa2 %>% mutate(duration = NA)


# Converting the data type of 'duration' column to numeric --------------


faa2$duration <- as.numeric(faa2$duration)



# Arranging the columns of faa2 in the same order as faa1 -----------------


faa2 <- faa2 %>% dplyr::select(aircraft,duration,no_pasg,speed_ground,speed_air,
                        height,pitch,distance)

# Merging the datasets faa1 and faa2 --------------------------------------


faa_merged <- rbind(faa1,faa2)



# Fetching  the index of duplicate rows after removing the duration column. We can see that 100 rows are duplicate


which(duplicated(faa_merged[,-2]))
##   [1] 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817
##  [18] 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834
##  [35] 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851
##  [52] 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868
##  [69] 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885
##  [86] 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900
faa_duplicate <- faa_merged[which(duplicated(faa_merged[,-2])),]
faa_duplicate
## # A tibble: 100 x 8
##    aircraft duration no_pasg speed_ground speed_air height pitch distance
##    <chr>       <dbl>   <dbl>        <dbl>     <dbl>  <dbl> <dbl>    <dbl>
##  1 boeing         NA      53        108.       109.   27.4  4.04    3370.
##  2 boeing         NA      69        102.       103.   27.8  4.12    2988.
##  3 boeing         NA      61         71.1       NA    18.6  4.43    1145.
##  4 boeing         NA      56         85.8       NA    30.7  3.88    1664.
##  5 boeing         NA      70         59.9       NA    32.4  4.03    1050.
##  6 boeing         NA      55         75.0       NA    41.2  4.20    1627.
##  7 boeing         NA      54         54.4       NA    24.0  3.84     805.
##  8 boeing         NA      57         57.1       NA    19.4  4.64     574.
##  9 boeing         NA      61         85.4       NA    35.4  4.23    1699.
## 10 boeing         NA      56         61.8       NA    36.7  4.18    1138.
## # ... with 90 more rows
# Removing the duplicate rows from the merged dataset ---------------------


faa_merged <- faa_merged[-801:-900,]


# Summary statistic of the merged data. We are left with 850 rows

str(faa_merged)
## Classes 'tbl_df', 'tbl' and 'data.frame':    850 obs. of  8 variables:
##  $ aircraft    : chr  "boeing" "boeing" "boeing" "boeing" ...
##  $ duration    : num  98.5 125.7 112 196.8 90.1 ...
##  $ no_pasg     : num  53 69 61 56 70 55 54 57 61 56 ...
##  $ speed_ground: num  107.9 101.7 71.1 85.8 59.9 ...
##  $ speed_air   : num  109 103 NA NA NA ...
##  $ height      : num  27.4 27.8 18.6 30.7 32.4 ...
##  $ pitch       : num  4.04 4.12 4.43 3.88 4.03 ...
##  $ distance    : num  3370 2988 1145 1664 1050 ...
summary(faa_merged)
##    aircraft            duration         no_pasg      speed_ground   
##  Length:850         Min.   : 14.76   Min.   :29.0   Min.   : 27.74  
##  Class :character   1st Qu.:119.49   1st Qu.:55.0   1st Qu.: 65.90  
##  Mode  :character   Median :153.95   Median :60.0   Median : 79.64  
##                     Mean   :154.01   Mean   :60.1   Mean   : 79.45  
##                     3rd Qu.:188.91   3rd Qu.:65.0   3rd Qu.: 92.06  
##                     Max.   :305.62   Max.   :87.0   Max.   :141.22  
##                     NA's   :50                                      
##    speed_air          height           pitch          distance      
##  Min.   : 90.00   Min.   :-3.546   Min.   :2.284   Min.   :  34.08  
##  1st Qu.: 96.25   1st Qu.:23.314   1st Qu.:3.642   1st Qu.: 883.79  
##  Median :101.15   Median :30.093   Median :4.008   Median :1258.09  
##  Mean   :103.80   Mean   :30.144   Mean   :4.009   Mean   :1526.02  
##  3rd Qu.:109.40   3rd Qu.:36.993   3rd Qu.:4.377   3rd Qu.:1936.95  
##  Max.   :141.72   Max.   :59.946   Max.   :5.927   Max.   :6533.05  
##  NA's   :642
faa_merged$aircraft <- as.factor(faa_merged$aircraft)


# Removing the abnormal values in height and duration columns -------------


faa_merged <- faa_merged %>% filter((is.na(duration) | duration > 40) &  height > 0)


# Plotting density plot for all the variables except aircraft variable ---------


faa_merged[,-1] %>% 
  gather() %>% 
  ggplot(aes(value)) +
  facet_wrap(~ key, scales = "free") +
  geom_density() + 
  theme_minimal()

# Computing the correlation with distance for all the variables but aircraft


correlation_vector <- vector("numeric", 6)
for (i in 2:(ncol(faa_merged) - 1)) {
  
  
  correlation_vector[i - 1] <- as.numeric(cor(faa_merged[,i], faa_merged[ncol(faa_merged)], 
                                   use = "complete.obs"))
  
}
names(correlation_vector) <- c("duration","no_pasg","speed_ground","speed_air",
                               "height", "pitch")
correlation_vector
##     duration      no_pasg speed_ground    speed_air       height 
##  -0.05287377  -0.02457070   0.86166762   0.94786540   0.11543232 
##        pitch 
##   0.09726243
# Plotting the scatter matrix ---------------------------------------------



GGally::ggpairs(faa_merged[, ])

# Applying the linear regression model to all the variables one by one -----


list_model <- list()
summary_model <- list()
for (i in names(faa_merged[,1:7])) {

 
  list_model[[i]] <- lm(paste("distance ~", i), data = faa_merged) 
  summary_model[[i]] <- summary( list_model[[i]])
 
}




merged_coeff <- map(summary_model,coefficients)
pvalue_merged <- sort(unlist(map(merged_coeff,8)))


# Scaling the predictors and applying lm model ----------------------------



faa_scaled <- data.frame(scale(faa_merged[,2:7]))
class(faa_scaled)
## [1] "data.frame"
faa_scaled[,c(7,8)] <- faa_merged[,c(1,8)]


scaled_model <- list()
scaled_summary <- list()
for (i in names(faa_scaled[,1:7])) {
  
  
  scaled_model[[i]] <- lm(paste("distance ~", i), data = faa_scaled) 
  scaled_summary[[i]] <- summary( scaled_model[[i]])
  
}

scaled_coeff <- map(scaled_summary,coefficients)
scaled_beta <- sort(unlist(map(scaled_coeff,2)), decreasing = TRUE)



# Printing correlation, p-value and slope coefficient. In terms of correlation, speed_air is highly correlated with distance, then speed_ground and so on. Looking at p values, all the p values are less than 5% but speed_ground is the lowest, then speed_air, etc. In terms of coefficient, speed_air is the highest, then speed_ground, etc.


sort(correlation_vector, decreasing = TRUE)
##    speed_air speed_ground       height        pitch      no_pasg 
##   0.94786540   0.86166762   0.11543232   0.09726243  -0.02457070 
##     duration 
##  -0.05287377
pvalue_merged
##  speed_ground     speed_air      aircraft        height         pitch 
## 4.372864e-249 8.896065e-103  4.011232e-12  8.029532e-04  4.781048e-03 
##      duration       no_pasg 
##  1.375954e-01  4.769768e-01
scaled_beta
##    speed_air speed_ground     aircraft       height        pitch 
##    846.84021    798.14333    438.26834    106.92236     90.09200 
##      no_pasg     duration 
##    -22.75928    -49.48541
# Applying lm model (speed_air, speed_ground and both simultaneously) to check multicollinearity. 


model_colli1 <- lm(distance~speed_ground, data = faa_merged)
model_colli2 <- lm(distance~speed_air, data = faa_merged)
model_colli3 <- lm(distance~speed_air + speed_ground, data = faa_merged)

summ_colli1 <- summary(model_colli1)
summ_colli2 <- summary(model_colli2)
summ_colli3 <- summary(model_colli3)


# Printing p values for speed_ground, speed_air & speed_ground+speed_air



summ_colli1$coefficients[8]
## [1] 4.372864e-249
summ_colli2$coefficients[8]
## [1] 8.896065e-103
summ_colli3$coefficients[11]
## [1] 6.321636e-12
summ_colli3$coefficients[12]
## [1] 0.2605788
# Applying forward selection based on variables selected after step 16. Considered speed_air as the best variable based on p-value, correlation and slope coefficient, then speed_ground, etc. Adding the variable one at time and looking at R2, Adjusted R2 and AIC value.




model_forward_sel <- list()
model_forward_sel[[1]] <- lm(distance~speed_air, data = faa_merged)
model_forward_sel[[2]] <- lm(distance~speed_air+speed_ground, data = faa_merged)
model_forward_sel[[3]] <- lm(distance~speed_air+speed_ground+aircraft, data = faa_merged)
model_forward_sel[[4]] <- lm(distance~speed_air+speed_ground+aircraft+height, 
                         data = faa_merged)
model_forward_sel[[5]] <- lm(distance~speed_air+speed_ground+aircraft+height+pitch, 
                         data = faa_merged)
model_forward_sel[[6]] <- lm(distance~speed_air+speed_ground+aircraft+height+pitch+no_pasg, 
                         data = faa_merged)

model_forward <- list()

for (i in 1:length(model_forward_sel)) {
  model_forward[[i]] <- summary(model_forward_sel[[i]])
}



r_square <- map(model_forward,"r.squared")
adjusted_rsquare <- map(model_forward,"adj.r.squared")

# Printing R square, Adjusted R Square & AIC. We can see that R2 increases as the number of variables are increasing. Adjusted R2 is also increasing even though it is applying penalty for adding the variable. AIC is lowest for speed_air+speed_ground+aircraft+height


r_square
## [[1]]
## [1] 0.8984488
## 
## [[2]]
## [1] 0.8990847
## 
## [[3]]
## [1] 0.9540371
## 
## [[4]]
## [1] 0.9754762
## 
## [[5]]
## [1] 0.975483
## 
## [[6]]
## [1] 0.9758141
adjusted_rsquare
## [[1]]
## [1] 0.8979486
## 
## [[2]]
## [1] 0.8980855
## 
## [[3]]
## [1] 0.9533511
## 
## [[4]]
## [1] 0.9749857
## 
## [[5]]
## [1] 0.974867
## 
## [[6]]
## [1] 0.9750812
map(model_forward_sel, AIC)
## [[1]]
## [1] 2903.86
## 
## [[2]]
## [1] 2904.573
## 
## [[3]]
## [1] 2745.351
## 
## [[4]]
## [1] 2618.572
## 
## [[5]]
## [1] 2620.515
## 
## [[6]]
## [1] 2619.728
# Performing forward variable selection using stepAIC. We found that AIC is lowest for speed_air + aircraft + height + no_pasg i.e. 1955.48



entire_model <- lm(distance~., data = na.omit(faa_merged))
null_model <- lm(distance~1, data = na.omit(faa_merged))

stepAIC(null_model, scope = list(lower = null_model,upper = entire_model),
        direction = "forward")
## Start:  AIC=2682.54
## distance ~ 1
## 
##                Df Sum of Sq       RSS    AIC
## + speed_air     1 143944957  15942768 2230.4
## + speed_ground  1 140048296  19839429 2273.4
## + aircraft      1   5775391 154112334 2677.3
## <none>                      159887725 2682.5
## + pitch         1    991324 158896401 2683.3
## + height        1    707080 159180645 2683.7
## + duration      1    370042 159517683 2684.1
## + no_pasg       1    200305 159687420 2684.3
## 
## Step:  AIC=2230.36
## distance ~ speed_air
## 
##                Df Sum of Sq      RSS    AIC
## + aircraft      1   8642406  7300363 2078.5
## + height        1   2868393 13074375 2193.3
## + pitch         1   1062502 14880266 2218.8
## <none>                      15942768 2230.4
## + no_pasg       1    142329 15800439 2230.6
## + speed_ground  1     75985 15866783 2231.4
## + duration      1      8648 15934120 2232.2
## 
## Step:  AIC=2078.49
## distance ~ speed_air + aircraft
## 
##                Df Sum of Sq     RSS    AIC
## + height        1   3426505 3873858 1955.7
## <none>                      7300363 2078.5
## + duration      1     52208 7248155 2079.1
## + no_pasg       1     44258 7256104 2079.3
## + speed_ground  1     42073 7258290 2079.3
## + pitch         1      3705 7296658 2080.4
## 
## Step:  AIC=1955.65
## distance ~ speed_air + aircraft + height
## 
##                Df Sum of Sq     RSS    AIC
## + no_pasg       1     42386 3831472 1955.5
## <none>                      3873858 1955.7
## + duration      1     10970 3862888 1957.1
## + speed_ground  1      7715 3866143 1957.3
## + pitch         1       488 3873370 1957.6
## 
## Step:  AIC=1955.48
## distance ~ speed_air + aircraft + height + no_pasg
## 
##                Df Sum of Sq     RSS    AIC
## <none>                      3831472 1955.5
## + duration      1    7980.7 3823491 1957.1
## + speed_ground  1    6881.7 3824590 1957.1
## + pitch         1     755.5 3830716 1957.5
## 
## Call:
## lm(formula = distance ~ speed_air + aircraft + height + no_pasg, 
##     data = na.omit(faa_merged))
## 
## Coefficients:
##    (Intercept)       speed_air  aircraftboeing          height  
##      -6446.140          83.713         442.235          14.105  
##        no_pasg  
##         -2.098