# Load the airline pricing data; the path is resolved relative to the
# current working directory.
air <- read.csv("AirlinePricingData.csv")
# List the columns available for modelling.
colnames(air)

##  [1] "FlightNumber"        "Airline"             "DepartureCityCode"  
##  [4] "ArrivalCityCode"     "DepartureTime"       "ArrivalTime"        
##  [7] "Departure"           "FlyingMinutes"       "Aircraft"           
## [10] "PlaneModel"          "Capacity"            "SeatPitch"          
## [13] "SeatWidth"           "DataCollectionDate"  "DateDeparture"      
## [16] "IsWeekend"           "Price"               "AdvancedBookingDays"
## [19] "IsDiwali"            "DayBeforeDiwali"     "DayAfterDiwali"     
## [22] "MetroDeparture"      "MetroArrival"        "MarketShare"        
## [25] "LoadFactor"

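Model 1: $\text{Price} = \beta_0 + \beta_1\,\text{AdvancedBookingDays} + \beta_2\,\text{Airline} + \beta_3\,\text{Departure} + \beta_4\,\text{IsWeekend} + \beta_5\,\text{IsDiwali} + \beta_6\,\text{DepartureCityCode} + \beta_7\,\text{FlyingMinutes} + \beta_8\,\text{SeatPitch} + \beta_9\,\text{SeatWidth} + \varepsilon$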

# Model 1 (linear-linear): regress the raw ticket price on the booking,
# airline, schedule, route and seat attributes.
Model1 <- lm(
  Price ~ AdvancedBookingDays + Airline + Departure + IsWeekend + IsDiwali +
    DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = air
)
summary(Model1)

## 
## Call:
## lm(formula = Price ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth, data = air)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2671.2 -1266.2  -456.4   517.4 11953.9 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4292.94    8897.87  -0.482   0.6298    
## AdvancedBookingDays    -87.70      12.47  -7.033 1.43e-11 ***
## AirlineIndiGo         -577.17     778.64  -0.741   0.4591    
## AirlineJet            -120.75     436.69  -0.277   0.7823    
## AirlineSpice Jet     -1118.38     697.85  -1.603   0.1101    
## DeparturePM           -589.79     275.23  -2.143   0.0329 *  
## IsWeekendYes          -345.92     408.06  -0.848   0.3973    
## IsDiwali              4346.80     568.14   7.651 2.90e-13 ***
## DepartureCityCodeDEL -1413.46     351.54  -4.021 7.38e-05 ***
## FlyingMinutes           38.97      29.27   1.331   0.1841    
## SeatPitch             -279.19     226.64  -1.232   0.2190    
## SeatWidth              868.58     507.54   1.711   0.0881 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2079 on 293 degrees of freedom
## Multiple R-squared:  0.2695, Adjusted R-squared:  0.2421 
## F-statistic: 9.828 on 11 and 293 DF,  p-value: 3.604e-15

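Model 2: $\log(\text{Price}) = \beta_0 + \beta_1\,\text{AdvancedBookingDays} + \beta_2\,\text{Airline} + \beta_3\,\text{Departure} + \beta_4\,\text{IsWeekend} + \beta_5\,\text{IsDiwali} + \beta_6\,\text{DepartureCityCode} + \beta_7\,\text{FlyingMinutes} + \beta_8\,\text{SeatPitch} + \beta_9\,\text{SeatWidth} + \varepsilon$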

# Model 2 (log-linear): same predictors as Model1, with the natural log of
# Price as the response.
Model2 <- lm(
  log(Price) ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
    IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = air
)
summary(Model2)

## 
## Call:
## lm(formula = log(Price) ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth, data = air)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57006 -0.19770 -0.05792  0.12935  1.24672 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.549474   1.243788   5.266 2.71e-07 ***
## AdvancedBookingDays  -0.014639   0.001743  -8.399 1.97e-15 ***
## AirlineIndiGo        -0.098622   0.108842  -0.906   0.3656    
## AirlineJet            0.001113   0.061043   0.018   0.9855    
## AirlineSpice Jet     -0.127169   0.097548  -1.304   0.1934    
## DeparturePM          -0.055844   0.038473  -1.452   0.1477    
## IsWeekendYes         -0.036748   0.057041  -0.644   0.5199    
## IsDiwali              0.744738   0.079418   9.377  < 2e-16 ***
## DepartureCityCodeDEL -0.264017   0.049140  -5.373 1.58e-07 ***
## FlyingMinutes         0.008717   0.004092   2.131   0.0340 *  
## SeatPitch            -0.032824   0.031681  -1.036   0.3010    
## SeatWidth             0.122364   0.070947   1.725   0.0856 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2906 on 293 degrees of freedom
## Multiple R-squared:  0.3671, Adjusted R-squared:  0.3433 
## F-statistic: 15.45 on 11 and 293 DF,  p-value: < 2.2e-16

3. Compare the models. Which one is preferred? Why?

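The log-linear model is preferred over the linear model because the Adjusted R-squared increases from 24.21% to 34.33%, so it is a better-fitting model. (Note that the two R-squared values are computed on different response scales, Price versus log(Price), so this comparison is only indicative.)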

4a. Test the above linear-linear and log-linear models for normality, based on visual examination of their qqplots

# Normal Q-Q plot of the residuals of the linear-linear model.
plot(Model1, which = 2)

# Normal Q-Q plot of the residuals of the log-linear model.
plot(Model2, which = 2)

In both models, not all of the points fall approximately along the reference line, so we cannot assume normality. The plots flag observations 182 and 183 as outliers.

4b. Test the above model for normality, based on (i) Shapiro-Wilks test; (ii) Anderson-Darling test?

Is the normality assumption violated in the linear-linear model? Is the normality assumption violated in the log-linear model?

# Model1 (linear-linear)
# Shapiro-Wilk normality test on the model residuals: the regression
# normality assumption concerns the errors, not the raw response variable.
# NOTE(review): the printed result below was produced from air$Price;
# re-knit so it reflects the residual-based test.
shapiro.test(residuals(Model1))

## 
##  Shapiro-Wilk normality test
## 
## data:  air$Price
## W = 0.77653, p-value < 2.2e-16

## Anderson-Darling normality test on the Model1 residuals (the normality
## assumption applies to the errors, not to the raw Price values).
## NOTE(review): the printed result below was produced from air$Price;
## re-knit so it reflects the residual-based test.
library(nortest)
ad.test(residuals(Model1))

## 
##  Anderson-Darling normality test
## 
## data:  air$Price
## A = 19.412, p-value < 2.2e-16

From the output, the p-value < 0.05 implies that the tested distribution is significantly different from a normal distribution. In other words, we cannot assume normality in the linear-linear model.

# Model2 (log-linear)
# Shapiro-Wilk normality test on the model residuals: the regression
# normality assumption concerns the errors, not the log-transformed response.
# NOTE(review): the printed result below was produced from log(air$Price);
# re-knit so it reflects the residual-based test.
shapiro.test(residuals(Model2))

## 
##  Shapiro-Wilk normality test
## 
## data:  log(air$Price)
## W = 0.93966, p-value = 8.002e-10

## Anderson-Darling normality test on the Model2 residuals (the normality
## assumption applies to the errors, not to log(Price) itself).
## NOTE(review): the printed result below was produced from log(air$Price);
## re-knit so it reflects the residual-based test.
library(nortest)
ad.test(residuals(Model2))

## 
##  Anderson-Darling normality test
## 
## data:  log(air$Price)
## A = 5.8513, p-value = 1.978e-14

5. Test the above models for linearity, based on visual examination of the Residual versus Fitted plot.

Is the linearity assumption violated in the linear-linear model?

Is the linearity assumption violated in the log-linear model?

# Residuals vs Fitted plot for the linear-linear model (linearity check).
plot(Model1, which = 1)

# Residuals vs Fitted plot for the log-linear model (linearity check).
plot(Model2, which = 1)

In both models the residuals are not spread evenly around the horizontal zero line in the Residuals vs Fitted plot, so we cannot assume linearity.

6. Run a suitable Box-Cox transformation of the dependent variable (Price in the BOMDELBOM dataset);

What “lambda” value does Box-Cox transformation indicate?

Redo the regression model using the transformed Price variable.

library(caret)

## Loading required package: lattice

## Loading required package: ggplot2

# Estimate the Box-Cox lambda for Price with caret and display the result.
priceTrans <- BoxCoxTrans(air$Price)
print(priceTrans)

## Box-Cox Transformation
## 
## 305 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2607    4051    4681    5395    5725   18015 
## 
## Largest/Smallest: 6.91 
## Sample Skewness: 2.26 
## 
## Estimated Lambda: -0.8

The lambda value indicated by the transformation is -0.8

# Add the Box-Cox transformed price to air. Assigning the column directly
# (rather than cbind) overwrites priceNew when this chunk is re-run instead
# of appending a duplicate column.
air$priceNew <- predict(priceTrans, air$Price)

# Refit the regression with the transformed price as the response.
TransformedModel <- lm(
  priceNew ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
    IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = air
)
summary(TransformedModel)

## 
## Call:
## lm(formula = priceNew ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth, data = air)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.868e-04 -1.930e-04 -2.246e-05  1.777e-04  9.443e-04 
## 
## Coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           1.246e+00  1.218e-03 1022.900  < 2e-16 ***
## AdvancedBookingDays  -1.551e-05  1.707e-06   -9.084  < 2e-16 ***
## AirlineIndiGo        -1.190e-04  1.066e-04   -1.117  0.26509    
## AirlineJet            6.273e-06  5.979e-05    0.105  0.91652    
## AirlineSpice Jet     -1.075e-04  9.555e-05   -1.125  0.26147    
## DeparturePM          -3.419e-05  3.769e-05   -0.907  0.36506    
## IsWeekendYes         -2.680e-05  5.587e-05   -0.480  0.63183    
## IsDiwali              7.987e-04  7.779e-05   10.267  < 2e-16 ***
## DepartureCityCodeDEL -2.844e-04  4.813e-05   -5.909 9.53e-09 ***
## FlyingMinutes         1.056e-05  4.008e-06    2.635  0.00887 ** 
## SeatPitch            -2.475e-05  3.103e-05   -0.798  0.42579    
## SeatWidth             1.143e-04  6.950e-05    1.645  0.10106    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002847 on 293 degrees of freedom
## Multiple R-squared:  0.4183, Adjusted R-squared:  0.3965 
## F-statistic: 19.16 on 11 and 293 DF,  p-value: < 2.2e-16

7a. Test the Box-Cox transformed model for normality, based on visual examination of a qqplot (and compare it with the linear-linear qqplot)

# Normal Q-Q plot of the residuals of the Box-Cox transformed model.
plot(TransformedModel, which = 2)

Comparing the Q-Q plots of Model 1 and Model 2 with that of the Box-Cox transformed model (Model 3), we can infer that the residuals of Model 3 are closer to normally distributed, as the points lie closer to the 45-degree reference line.

7b. Test the Box-Cox transformed model for normality, based on (i) Shapiro-Wilks test; (ii) Anderson-Darling test? Is the normality assumption violated?

# Box-Cox transformed model
# Shapiro-Wilk normality test on the model residuals. priceNew is already
# Box-Cox transformed, so it must not be log-transformed again, and the
# normality assumption concerns the errors rather than the response.
# NOTE(review): the printed result below was produced from log(air$priceNew);
# re-knit so it reflects the residual-based test.
shapiro.test(residuals(TransformedModel))

## 
##  Shapiro-Wilk normality test
## 
## data:  log(air$priceNew)
## W = 0.98546, p-value = 0.00356

## Anderson-Darling normality test on the residuals of the Box-Cox
## transformed model (priceNew is already transformed; no extra log).
## NOTE(review): the printed result below was produced from
## log(air$priceNew); re-knit so it reflects the residual-based test.
library(nortest)
ad.test(residuals(TransformedModel))

## 
##  Anderson-Darling normality test
## 
## data:  log(air$priceNew)
## A = 1.6751, p-value = 0.0002637

From the output, the p-value < 0.05 implies that the tested distribution is significantly different from a normal distribution. In other words, we cannot assume normality.

7c. Test the Box-Cox transformed model for linearity, based on visual examination of the Residual versus Fitted plot (and compare it with the Residual versus Fitted plot based on the linear-linear model)

# Residuals vs Fitted plot for the Box-Cox transformed model.
plot(TransformedModel, which = 1)

# Residuals vs Fitted plot for the linear-linear model, for comparison.
plot(Model1, which = 1)

Based on visual inspection of the Residual versus Fitted plots for the linear model (Model 1) and the Box-Cox transformed model (Model 3), we can conclude that the residuals of Model 3 lie closer to the horizontal line, so the linearity assumption is better satisfied by Model 3.

Box-Cox Plot | Team D

Adarsh Adwait | PGP33209

3 November 2018