# Load the airline pricing data set (path is relative to the working
# directory) and list its column names.
air <- read.csv("AirlinePricingData.csv")
colnames(air)
## [1] "FlightNumber" "Airline" "DepartureCityCode"
## [4] "ArrivalCityCode" "DepartureTime" "ArrivalTime"
## [7] "Departure" "FlyingMinutes" "Aircraft"
## [10] "PlaneModel" "Capacity" "SeatPitch"
## [13] "SeatWidth" "DataCollectionDate" "DateDeparture"
## [16] "IsWeekend" "Price" "AdvancedBookingDays"
## [19] "IsDiwali" "DayBeforeDiwali" "DayAfterDiwali"
## [22] "MetroDeparture" "MetroArrival" "MarketShare"
## [25] "LoadFactor"
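Model 1: $\text{Price} = \beta_0 + \beta_1\,\text{AdvancedBookingDays} + \beta_2\,\text{Airline} + \beta_3\,\text{Departure} + \beta_4\,\text{IsWeekend} + \beta_5\,\text{IsDiwali} + \beta_6\,\text{DepartureCityCode} + \beta_7\,\text{FlyingMinutes} + \beta_8\,\text{SeatPitch} + \beta_9\,\text{SeatWidth} + \varepsilon$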
# Model 1 (linear-linear): ordinary least squares fit of the raw Price on the
# booking, airline, schedule, festival, route and seat predictors.
Model1 <- lm(
  formula = Price ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
    IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = air
)
summary(Model1)
##
## Call:
## lm(formula = Price ~ AdvancedBookingDays + Airline + Departure +
## IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes +
## SeatPitch + SeatWidth, data = air)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2671.2 -1266.2 -456.4 517.4 11953.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4292.94 8897.87 -0.482 0.6298
## AdvancedBookingDays -87.70 12.47 -7.033 1.43e-11 ***
## AirlineIndiGo -577.17 778.64 -0.741 0.4591
## AirlineJet -120.75 436.69 -0.277 0.7823
## AirlineSpice Jet -1118.38 697.85 -1.603 0.1101
## DeparturePM -589.79 275.23 -2.143 0.0329 *
## IsWeekendYes -345.92 408.06 -0.848 0.3973
## IsDiwali 4346.80 568.14 7.651 2.90e-13 ***
## DepartureCityCodeDEL -1413.46 351.54 -4.021 7.38e-05 ***
## FlyingMinutes 38.97 29.27 1.331 0.1841
## SeatPitch -279.19 226.64 -1.232 0.2190
## SeatWidth 868.58 507.54 1.711 0.0881 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2079 on 293 degrees of freedom
## Multiple R-squared: 0.2695, Adjusted R-squared: 0.2421
## F-statistic: 9.828 on 11 and 293 DF, p-value: 3.604e-15
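Model 2: $\log(\text{Price}) = \beta_0 + \beta_1\,\text{AdvancedBookingDays} + \beta_2\,\text{Airline} + \beta_3\,\text{Departure} + \beta_4\,\text{IsWeekend} + \beta_5\,\text{IsDiwali} + \beta_6\,\text{DepartureCityCode} + \beta_7\,\text{FlyingMinutes} + \beta_8\,\text{SeatPitch} + \beta_9\,\text{SeatWidth} + \varepsilon$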
# Model 2 (log-linear): same predictors as Model1, with log(Price) as the
# response.
Model2 <- lm(
  formula = log(Price) ~ AdvancedBookingDays + Airline + Departure +
    IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch +
    SeatWidth,
  data = air
)
summary(Model2)
##
## Call:
## lm(formula = log(Price) ~ AdvancedBookingDays + Airline + Departure +
## IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes +
## SeatPitch + SeatWidth, data = air)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57006 -0.19770 -0.05792 0.12935 1.24672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.549474 1.243788 5.266 2.71e-07 ***
## AdvancedBookingDays -0.014639 0.001743 -8.399 1.97e-15 ***
## AirlineIndiGo -0.098622 0.108842 -0.906 0.3656
## AirlineJet 0.001113 0.061043 0.018 0.9855
## AirlineSpice Jet -0.127169 0.097548 -1.304 0.1934
## DeparturePM -0.055844 0.038473 -1.452 0.1477
## IsWeekendYes -0.036748 0.057041 -0.644 0.5199
## IsDiwali 0.744738 0.079418 9.377 < 2e-16 ***
## DepartureCityCodeDEL -0.264017 0.049140 -5.373 1.58e-07 ***
## FlyingMinutes 0.008717 0.004092 2.131 0.0340 *
## SeatPitch -0.032824 0.031681 -1.036 0.3010
## SeatWidth 0.122364 0.070947 1.725 0.0856 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2906 on 293 degrees of freedom
## Multiple R-squared: 0.3671, Adjusted R-squared: 0.3433
## F-statistic: 15.45 on 11 and 293 DF, p-value: < 2.2e-16
The log-linear model is preferred over the linear model because the adjusted R-squared increases from 24.21% to 34.33%, so it is a slightly better-fitting model.
# Normal Q-Q plots of the residuals (diagnostic panel 2 of plot.lm),
# first for the linear model, then for the log-linear model.
plot(Model1, which = 2)
plot(Model2, which = 2)
In both models, the points do not fall approximately along the reference line, so we cannot assume normality. The plots flag observations 182 and 183 as outliers.
Is the normality assumption violated in the linear-linear model? Is the normality assumption violated in the log-linear model?
# Model1 (linear-linear): the response variable is Price
# Shapiro-Wilk's normality test
# NOTE(review): this tests the raw Price column, not residuals(Model1); the
# regression normality assumption concerns the residuals -- confirm intent.
shapiro.test(air$Price)
##
## Shapiro-Wilk normality test
##
## data: air$Price
## W = 0.77653, p-value < 2.2e-16
## Anderson-Darling normality test (ad.test is provided by the nortest package)
## NOTE(review): applied to the raw Price column rather than to
## residuals(Model1) -- confirm intent.
library(nortest)
ad.test(air$Price)
##
## Anderson-Darling normality test
##
## data: air$Price
## A = 19.412, p-value < 2.2e-16
From the output, the p-value < 0.05 implies that the distribution of the data is significantly different from a normal distribution. In other words, we cannot assume normality in the linear-linear model.
# Model2 (log-linear): the response variable is log(Price)
# Shapiro-Wilk's normality test
# NOTE(review): this tests log(Price) itself, not residuals(Model2); the
# regression normality assumption concerns the residuals -- confirm intent.
shapiro.test(log(air$Price))
##
## Shapiro-Wilk normality test
##
## data: log(air$Price)
## W = 0.93966, p-value = 8.002e-10
## Anderson-Darling normality test on log(Price)
## NOTE(review): applied to log(Price) rather than to residuals(Model2) --
## confirm intent.
## nortest was already attached for the Model1 test; repeating library() is harmless.
library(nortest)
ad.test(log(air$Price))
##
## Anderson-Darling normality test
##
## data: log(air$Price)
## A = 5.8513, p-value = 1.978e-14
From the output, the p-value < 0.05 implies that the distribution of the data is significantly different from a normal distribution. In other words, we cannot assume normality in the log-linear model either.
# Normal probability (Q-Q) plot of the Model1 residuals.
# In plot.lm, panel 2 is the normal Q-Q plot; panel 1 (used here previously)
# is the residuals-vs-fitted plot, which has no normality reference line.
plot(Model1, which = 2)
# Normal probability (Q-Q) plot of the Model2 residuals.
plot(Model2, which = 2)
In both models, the points do not fall approximately along the reference line, so we cannot assume normality.
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
# Estimate the Box-Cox power parameter (lambda) for the raw Price values with
# caret::BoxCoxTrans, then display the fitted transformation.
priceTrans <- BoxCoxTrans(air$Price)
print(priceTrans)
## Box-Cox Transformation
##
## 305 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2607 4051 4681 5395 5725 18015
##
## Largest/Smallest: 6.91
## Sample Skewness: 2.26
##
## Estimated Lambda: -0.8
The lambda value estimated by the Box-Cox transformation is -0.8.
# Store the Box-Cox-transformed price in `air` as column priceNew.
# Assigning the column (rather than cbind-ing it on) keeps exactly one
# priceNew column even when this chunk is re-run.
air$priceNew <- predict(priceTrans, air$Price)
# Refit the same specification as Model1 with the transformed response.
TransformedModel <- lm(
  formula = priceNew ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
    IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = air
)
summary(TransformedModel)
##
## Call:
## lm(formula = priceNew ~ AdvancedBookingDays + Airline + Departure +
## IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes +
## SeatPitch + SeatWidth, data = air)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.868e-04 -1.930e-04 -2.246e-05 1.777e-04 9.443e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.246e+00 1.218e-03 1022.900 < 2e-16 ***
## AdvancedBookingDays -1.551e-05 1.707e-06 -9.084 < 2e-16 ***
## AirlineIndiGo -1.190e-04 1.066e-04 -1.117 0.26509
## AirlineJet 6.273e-06 5.979e-05 0.105 0.91652
## AirlineSpice Jet -1.075e-04 9.555e-05 -1.125 0.26147
## DeparturePM -3.419e-05 3.769e-05 -0.907 0.36506
## IsWeekendYes -2.680e-05 5.587e-05 -0.480 0.63183
## IsDiwali 7.987e-04 7.779e-05 10.267 < 2e-16 ***
## DepartureCityCodeDEL -2.844e-04 4.813e-05 -5.909 9.53e-09 ***
## FlyingMinutes 1.056e-05 4.008e-06 2.635 0.00887 **
## SeatPitch -2.475e-05 3.103e-05 -0.798 0.42579
## SeatWidth 1.143e-04 6.950e-05 1.645 0.10106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0002847 on 293 degrees of freedom
## Multiple R-squared: 0.4183, Adjusted R-squared: 0.3965
## F-statistic: 19.16 on 11 and 293 DF, p-value: < 2.2e-16
# Normal Q-Q plot of the residuals of the Box-Cox-transformed model
# (diagnostic panel 2 of plot.lm).
plot(TransformedModel, which = 2)
Comparing the Q-Q plots of Model1 and Model2 with that of the new model (TransformedModel, referred to here as model3), we can infer that the residuals of model3 are closer to normally distributed, as the points lie closer to the 45-degree line.
# Box-Cox transformed response (priceNew), not Model1
# Shapiro-Wilk's normality test
# NOTE(review): priceNew is already the Box-Cox-transformed Price (it is
# created with predict(priceTrans, air$Price)), so log() here transforms it a
# second time; the residuals of TransformedModel are not what is tested --
# confirm intent.
shapiro.test(log(air$priceNew))
##
## Shapiro-Wilk normality test
##
## data: log(air$priceNew)
## W = 0.98546, p-value = 0.00356
## Anderson-Darling normality test on log(priceNew)
## NOTE(review): priceNew is already the Box-Cox-transformed Price, so log()
## is a second transformation; residuals(TransformedModel) are not tested --
## confirm intent.
library(nortest)
ad.test(log(air$priceNew))
##
## Anderson-Darling normality test
##
## data: log(air$priceNew)
## A = 1.6751, p-value = 0.0002637
From the output, the p-value < 0.05 implies that the distribution of the data is significantly different from a normal distribution. In other words, we cannot assume normality.
# Residuals-vs-fitted plots (diagnostic panel 1 of plot.lm): the Box-Cox
# model first, then the untransformed linear model for comparison.
plot(TransformedModel, which = 1)
plot(Model1, which = 1)
Based on visual inspection of the residuals-vs-fitted plots for the linear model (Model1) and the Box-Cox-transformed model (model3, i.e. TransformedModel), we can conclude that the residuals of model3 lie closer to the horizontal line, so the linearity assumption is better satisfied by model3.