Read Data

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.5.1

# Load BOMDELBOM.csv (path is relative to the working directory) into airlines.df.
airlines.df <- read.csv(file = "BOMDELBOM.csv")

Q1

Linear-linear Model

# Linear-linear specification: Price regressed on every predictor.
model1 <- lm(
  formula = Price ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
    IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = airlines.df
)
summary(model1)

## 
## Call:
## lm(formula = Price ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth, data = airlines.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2671.2 -1266.2  -456.4   517.4 11953.9 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4292.94    8897.87  -0.482   0.6298    
## AdvancedBookingDays    -87.70      12.47  -7.033 1.43e-11 ***
## AirlineIndiGo         -577.17     778.64  -0.741   0.4591    
## AirlineJet            -120.75     436.69  -0.277   0.7823    
## AirlineSpice Jet     -1118.38     697.85  -1.603   0.1101    
## DeparturePM           -589.79     275.23  -2.143   0.0329 *  
## IsWeekendYes          -345.92     408.06  -0.848   0.3973    
## IsDiwaliYes           4346.80     568.14   7.651 2.90e-13 ***
## DepartureCityCodeDEL -1413.46     351.54  -4.021 7.38e-05 ***
## FlyingMinutes           38.97      29.27   1.331   0.1841    
## SeatPitch             -279.19     226.64  -1.232   0.2190    
## SeatWidth              868.58     507.54   1.711   0.0881 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2079 on 293 degrees of freedom
## Multiple R-squared:  0.2695, Adjusted R-squared:  0.2421 
## F-statistic: 9.828 on 11 and 293 DF,  p-value: 3.604e-15

Q2

Log-Linear Model

# Log-linear specification: same predictors as model1, response is log(Price).
model2 <- lm(
  formula = log(Price) ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
    IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = airlines.df
)
summary(model2)

## 
## Call:
## lm(formula = log(Price) ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth, data = airlines.df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57006 -0.19770 -0.05792  0.12935  1.24672 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.549474   1.243788   5.266 2.71e-07 ***
## AdvancedBookingDays  -0.014639   0.001743  -8.399 1.97e-15 ***
## AirlineIndiGo        -0.098622   0.108842  -0.906   0.3656    
## AirlineJet            0.001113   0.061043   0.018   0.9855    
## AirlineSpice Jet     -0.127169   0.097548  -1.304   0.1934    
## DeparturePM          -0.055844   0.038473  -1.452   0.1477    
## IsWeekendYes         -0.036748   0.057041  -0.644   0.5199    
## IsDiwaliYes           0.744738   0.079418   9.377  < 2e-16 ***
## DepartureCityCodeDEL -0.264017   0.049140  -5.373 1.58e-07 ***
## FlyingMinutes         0.008717   0.004092   2.131   0.0340 *  
## SeatPitch            -0.032824   0.031681  -1.036   0.3010    
## SeatWidth             0.122364   0.070947   1.725   0.0856 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2906 on 293 degrees of freedom
## Multiple R-squared:  0.3671, Adjusted R-squared:  0.3433 
## F-statistic: 15.45 on 11 and 293 DF,  p-value: < 2.2e-16

Q3

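The log-linear model is preferred over the linear model because the Adjusted R-squared increases from 24.21% to 34.33%. (Note that the two R-squared values are computed on different response scales, Price versus log(Price), so this comparison is only indicative.)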

Q4a

# Diagnostic plot number 2 of plot.lm (normal Q-Q) for each fitted model.
plot(model1, which = 2)

plot(model2, which = 2)

The residuals are not normally distributed. As seen in the Q-Q plots, some points deviate widely from the 45-degree reference line.

Q4b(i) Shapiro-Wilk Test for Normality

library(nortest)
# Shapiro-Wilk test on the untransformed Price column.
# NOTE(review): this tests the response itself, not residuals(model1) -
# confirm that this is what the question intends.
shapiro.test(x = airlines.df$Price)

## 
##  Shapiro-Wilk normality test
## 
## data:  airlines.df$Price
## W = 0.77653, p-value < 2.2e-16

# Repeat the Shapiro-Wilk test on the log-transformed Price.
log_var <- log(airlines.df$Price)
shapiro.test(x = log_var)

## 
##  Shapiro-Wilk normality test
## 
## data:  log_var
## W = 0.93966, p-value = 8.002e-10

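The p-value is below 0.05 for both Price and log(Price), implying that the distribution of each differs significantly from a normal distribution. (Note that these tests are applied to the response variable, not to the residuals of the two models.)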

Q4b(ii) Anderson-Darling Test for Normality

library(nortest)
# Anderson-Darling test on the untransformed Price column.
# NOTE(review): this tests the response itself, not residuals(model1) -
# confirm that this is what the question intends.
ad.test(x = airlines.df$Price)

## 
##  Anderson-Darling normality test
## 
## data:  airlines.df$Price
## A = 19.412, p-value < 2.2e-16

# Repeat the Anderson-Darling test on the log-transformed Price.
log_var <- log(airlines.df$Price)
ad.test(x = log_var)

## 
##  Anderson-Darling normality test
## 
## data:  log_var
## A = 5.8513, p-value = 1.978e-14

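The p-value is below 0.05 for both Price and log(Price), implying that the distribution of each differs significantly from a normal distribution. (Note that these tests are applied to the response variable, not to the residuals of the two models.)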

Q5

# Diagnostic plot number 1 of plot.lm (residuals vs fitted) for each fitted model.
plot(model1, which = 1)

plot(model2, which = 1)

From visual inspection of the residuals-versus-fitted plots, the relationship in both models can be assumed to be fairly linear.

Q6

library(caret)

## Warning: package 'caret' was built under R version 3.5.1

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 3.5.1

# Estimate the Box-Cox transformation for Price and display the fitted object.
PriceTrans <- BoxCoxTrans(airlines.df$Price)
print(PriceTrans)

## Box-Cox Transformation
## 
## 305 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2607    4051    4681    5395    5725   18015 
## 
## Largest/Smallest: 6.91 
## Sample Skewness: 2.26 
## 
## Estimated Lambda: -0.8

# Apply the fitted Box-Cox transformation to Price and store it as PriceNew.
# Assigning by name (instead of cbind) overwrites the column when this chunk is
# re-run, rather than appending a second PriceNew column to airlines.df.
airlines.df$PriceNew <- predict(PriceTrans, airlines.df$Price)

# Same predictors as model1 and model2, with the Box-Cox transformed response.
model3 <- lm(
  formula = PriceNew ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
    IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = airlines.df
)
summary(model3)

## 
## Call:
## lm(formula = PriceNew ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth, data = airlines.df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.868e-04 -1.930e-04 -2.246e-05  1.777e-04  9.443e-04 
## 
## Coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           1.246e+00  1.218e-03 1022.900  < 2e-16 ***
## AdvancedBookingDays  -1.551e-05  1.707e-06   -9.084  < 2e-16 ***
## AirlineIndiGo        -1.190e-04  1.066e-04   -1.117  0.26509    
## AirlineJet            6.273e-06  5.979e-05    0.105  0.91652    
## AirlineSpice Jet     -1.075e-04  9.555e-05   -1.125  0.26147    
## DeparturePM          -3.419e-05  3.769e-05   -0.907  0.36506    
## IsWeekendYes         -2.680e-05  5.587e-05   -0.480  0.63183    
## IsDiwaliYes           7.987e-04  7.779e-05   10.267  < 2e-16 ***
## DepartureCityCodeDEL -2.844e-04  4.813e-05   -5.909 9.53e-09 ***
## FlyingMinutes         1.056e-05  4.008e-06    2.635  0.00887 ** 
## SeatPitch            -2.475e-05  3.103e-05   -0.798  0.42579    
## SeatWidth             1.143e-04  6.950e-05    1.645  0.10106    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002847 on 293 degrees of freedom
## Multiple R-squared:  0.4183, Adjusted R-squared:  0.3965 
## F-statistic: 19.16 on 11 and 293 DF,  p-value: < 2.2e-16

The Box-Cox transformation gives an estimated lambda of -0.8.

Q7a

# Diagnostic plot number 2 of plot.lm (normal Q-Q) for the Box-Cox model.
plot(model3, which = 2)

Comparing the Q-Q plots of model1 and model2 with that of the new model, we can infer that the residuals of model3 are closer to being normally distributed, as the points lie closer to the 45-degree reference line.

Q7b(i) Shapiro-Wilk Test for Normality

library(nortest)
# Shapiro-Wilk test on the Box-Cox transformed price column.
# NOTE(review): this tests PriceNew itself, not residuals(model3) -
# confirm that this is what the question intends.
shapiro.test(x = airlines.df$PriceNew)

## 
##  Shapiro-Wilk normality test
## 
## data:  airlines.df$PriceNew
## W = 0.98545, p-value = 0.003551

As the p-value (0.003551) is less than 0.05, we can’t assume normality of the transformed price.

Q7b(ii) Anderson-Darling Test for Normality

library(nortest)
# Anderson-Darling test on the Box-Cox transformed price column.
# NOTE(review): this tests PriceNew itself, not residuals(model3) -
# confirm that this is what the question intends.
ad.test(x = airlines.df$PriceNew)

## 
##  Anderson-Darling normality test
## 
## data:  airlines.df$PriceNew
## A = 1.6763, p-value = 0.0002619

The p-value is less than 5%, so the transformed price (PriceNew), which is what the test was applied to, is not normally distributed. Hence we can’t assume normality.

Q7c

# Diagnostic plot number 1 of plot.lm (residuals vs fitted) for the Box-Cox model.
plot(model3, which = 1)

Based on visual inspection of the residual plots for the linear model (model1) and the Box-Cox transformed model (model3), we can conclude that the residuals of model3 lie closer to the horizontal line, so model3 is more nearly linear.