library(nortest)
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
library(car)
## Loading required package: carData
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# Load the airline pricing data and fit the linear-linear price model.
airline.df <- read.csv(file = "AirlinePricingData.csv")
# NOTE(review): attach() is kept because later statements refer to bare
# column names (e.g. Price); prefer airline.df$Price in new code.
attach(airline.df)

# Full model: Price regressed on every candidate predictor.
model <- lm(
  Price ~ AdvancedBookingDays + Airline + Departure + IsWeekend + IsDiwali +
    DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = airline.df
)
# summary(model)

# Stepwise selection starting from the full model; trace = 0 silences the
# per-step log.
stepmodel <- step(model, trace = 0, steps = 1000)
summary(stepmodel)
##
## Call:
## lm(formula = Price ~ AdvancedBookingDays + Departure + IsDiwali +
## DepartureCityCode + FlyingMinutes + SeatWidth, data = airline.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2943.6 -1295.4 -402.5 554.3 12176.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11267.41 6975.81 -1.615 0.1073
## AdvancedBookingDays -85.88 12.40 -6.926 2.67e-11 ***
## DeparturePM -439.34 254.41 -1.727 0.0852 .
## IsDiwali 4353.97 562.35 7.742 1.53e-13 ***
## DepartureCityCodeDEL -1558.31 273.40 -5.700 2.88e-08 ***
## FlyingMinutes 53.32 27.69 1.925 0.0551 .
## SeatWidth 646.53 283.64 2.279 0.0233 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2076 on 298 degrees of freedom
## Multiple R-squared: 0.2593, Adjusted R-squared: 0.2444
## F-statistic: 17.39 on 6 and 298 DF, p-value: < 2.2e-16
# Log-linear model: same candidate predictors, response is log(Price).
logmodel <- lm(
  log(Price) ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
    IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = airline.df
)
# summary(logmodel)

# Stepwise selection on the log-linear model; trace = 0 silences the
# per-step log.
steplogmodel <- step(logmodel, trace = 0, steps = 1000)
summary(steplogmodel)
##
## Call:
## lm(formula = log(Price) ~ AdvancedBookingDays + IsDiwali + DepartureCityCode +
## FlyingMinutes + SeatWidth, data = airline.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.60061 -0.19536 -0.05364 0.12451 1.28510
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.373460 0.943614 6.754 7.48e-11 ***
## AdvancedBookingDays -0.014373 0.001728 -8.316 3.25e-15 ***
## IsDiwali 0.743241 0.078436 9.476 < 2e-16 ***
## DepartureCityCodeDEL -0.290431 0.038150 -7.613 3.54e-13 ***
## FlyingMinutes 0.010290 0.003850 2.673 0.00793 **
## SeatWidth 0.059465 0.037728 1.576 0.11605
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2898 on 299 degrees of freedom
## Multiple R-squared: 0.3577, Adjusted R-squared: 0.3469
## F-statistic: 33.3 on 5 and 299 DF, p-value: < 2.2e-16
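The log-linear model is preferred because its adjusted R-squared (0.3469) is higher than that of the linear-linear model (0.2444). Note that the two values are computed on different response scales (Price vs. log(Price)), so the comparison is indicative rather than exact.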
# Use a 2 x 2 plotting grid, then draw the normal Q-Q diagnostic (which = 2)
# for the linear-linear and the log-linear stepwise models.
par(mfrow = c(2, 2))
plot(stepmodel, which = 2)
plot(steplogmodel, which = 2)
# Shapiro-Wilk normality test on Price (resolved through attach(airline.df)).
# NOTE(review): this tests the response variable itself, not the residuals of
# stepmodel. The regression normality assumption concerns the residuals, so
# shapiro.test(residuals(stepmodel)) may be what is intended - confirm.
shapiro.test(Price)
##
## Shapiro-Wilk normality test
##
## data: Price
## W = 0.77653, p-value < 2.2e-16
# Anderson-Darling normality test on Price (resolved through
# attach(airline.df)).
# NOTE(review): tests the raw response rather than residuals(stepmodel) -
# confirm this is the intended check of the model's normality assumption.
ad.test(Price)
##
## Anderson-Darling normality test
##
## data: Price
## A = 19.412, p-value < 2.2e-16
Yes, the normality assumption is violated in the linear-linear model, as the p-value is < 0.01 for both tests.
# Shapiro-Wilk normality test on the log-transformed Price.
# NOTE(review): tests the transformed response rather than
# residuals(steplogmodel) - confirm this is the intended check.
shapiro.test(log(Price))
##
## Shapiro-Wilk normality test
##
## data: log(Price)
## W = 0.93966, p-value = 8.002e-10
# Anderson-Darling normality test on the log-transformed Price.
# NOTE(review): tests the transformed response rather than
# residuals(steplogmodel) - confirm this is the intended check.
ad.test(log(Price))
##
## Anderson-Darling normality test
##
## data: log(Price)
## A = 5.8513, p-value = 1.978e-14
Although the p-values are larger than before, the normality assumption is still violated in the log-linear model: the p-value is < 0.01 for both tests.
# Residuals-vs-fitted diagnostic (which = 1) for the linear-linear model.
plot(stepmodel, which = 1)
The model adheres to the linearity assumption if the residuals-vs-fitted plot shows a close-to-horizontal line. This model does not show a horizontal line.
# Residuals-vs-fitted diagnostic (which = 1) for the log-linear model.
plot(steplogmodel, which = 1)
## The Linearity assumption is violated for the log-linear model.
Although there is an improvement over the linear-linear model, this model still does not have a horizontal line.
# Estimate a Box-Cox transformation for Price with caret::BoxCoxTrans and
# show the fitted transformation (including the estimated lambda).
priceTrans <- BoxCoxTrans(Price)
print(priceTrans)
## Box-Cox Transformation
##
## 305 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2607 4051 4681 5395 5725 18015
##
## Largest/Smallest: 6.91
## Sample Skewness: 2.26
##
## Estimated Lambda: -0.8
The estimated lambda value is -0.8.
# Apply the estimated Box-Cox transformation to Price and preview the
# first six transformed values.
PriceNew <- predict(priceTrans, Price)
head(PriceNew, n = 6L)
## [1] 1.248375 1.249299 1.248351 1.248431 1.248931 1.248889
# Add the Box-Cox transformed price to airline.df as column PriceNew.
airline.df <- cbind(airline.df, PriceNew = PriceNew)

# Refit the full model with the transformed price as the response.
modelnew <- lm(
  PriceNew ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
    IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = airline.df
)

# Stepwise selection on the transformed model; trace = 0 silences the
# per-step log.
stepmodelnew <- step(modelnew, trace = 0, steps = 1000)
summary(stepmodelnew)
##
## Call:
## lm(formula = PriceNew ~ AdvancedBookingDays + IsDiwali + DepartureCityCode +
## FlyingMinutes, data = airline.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.0008027 -0.0001893 -0.0000232 0.0001689 0.0009734
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.247e+00 4.861e-04 2566.062 < 2e-16 ***
## AdvancedBookingDays -1.523e-05 1.689e-06 -9.014 < 2e-16 ***
## IsDiwali 7.941e-04 7.670e-05 10.353 < 2e-16 ***
## DepartureCityCodeDEL -3.001e-04 3.449e-05 -8.701 2.23e-16 ***
## FlyingMinutes 1.090e-05 3.601e-06 3.027 0.00268 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0002841 on 300 degrees of freedom
## Multiple R-squared: 0.4071, Adjusted R-squared: 0.3992
## F-statistic: 51.51 on 4 and 300 DF, p-value: < 2.2e-16
# Normal Q-Q diagnostic (which = 2): base linear-linear model first, then the
# Box-Cox transformed model, for side-by-side comparison.
plot(stepmodel, which = 2)
plot(stepmodelnew, which = 2)
The transformed model appears to have improved over the base linear-linear model. However, we cannot conclusively say from the plot alone whether the normality assumption holds.
# Shapiro-Wilk normality test on the Box-Cox transformed price.
# NOTE(review): tests the transformed response rather than
# residuals(stepmodelnew) - confirm this is the intended check.
shapiro.test(PriceNew)
##
## Shapiro-Wilk normality test
##
## data: PriceNew
## W = 0.98545, p-value = 0.003551
# Anderson-Darling normality test on the Box-Cox transformed price.
# NOTE(review): tests the transformed response rather than
# residuals(stepmodelnew) - confirm this is the intended check.
ad.test(PriceNew)
##
## Anderson-Darling normality test
##
## data: PriceNew
## A = 1.6763, p-value = 0.0002619
The normality assumption is violated, as both tests have a p-value < 0.05. The Box-Cox transformed model also does not adhere to the normality assumption.
# Residuals-vs-fitted diagnostic (which = 1): base linear-linear model first,
# then the Box-Cox transformed model.
plot(stepmodel, which = 1)
plot(stepmodelnew, which = 1)
The transformed model appears to have drastically improved over the base linear-linear model. However, the improvement does not appear sufficient to produce a horizontal line.
# Scale-location diagnostic (which = 3) for the Box-Cox transformed model,
# followed by car's score test for non-constant error variance.
plot(stepmodelnew, which = 3)
ncvTest(stepmodelnew)
## Non-constant Variance Score Test
## Variance formula: ~ fitted.values
## Chisquare = 0.04804043, Df = 1, p = 0.82651
# Studentized Breusch-Pagan heteroskedasticity test (lmtest) on the Box-Cox
# transformed model.
# NOTE(review): the reported df = 4 matches the four predictors kept in
# stepmodelnew, whereas the ncvTest() call in this script reports a variance
# formula of ~ fitted.values (Df = 1). The two tests therefore check different
# variance specifications, which may explain why they disagree
# (p = 0.83 vs p = 0.0009) - reconcile before drawing a conclusion.
bptest(stepmodelnew)
##
## studentized Breusch-Pagan test
##
## data: stepmodelnew
## BP = 18.644, df = 4, p-value = 0.0009232