# Setup: set the working directory, load packages and read the flight-price data.
# NOTE(review): the absolute path below is machine-specific; the relative CSV
# path passed to read.csv() is resolved against it.
setwd("C:/Users/Anshumaan/Desktop/2018 DAM/session 11")
library(stats)
library(graphics)
dataflight<-read.csv("BOMDELBOM.csv")#import the flight data from CSV
attach(dataflight)#put the data columns on the search path; later code uses bare names such as Price
library(nortest)#provides ad.test (Anderson-Darling); shapiro.test (Shapiro-Wilk) comes from stats
library(caret)#provides BoxCoxTrans for the Box-Cox transformation
## Loading required package: lattice
## Loading required package: ggplot2
# **************************
# Linear-linear model: Price regressed on the booking, airline, schedule,
# route, flight-time and seat predictors listed in the formula.
fit <- lm(
  Price ~ AdvancedBookingDays + Airline + Departure + IsWeekend + IsDiwali +
    DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = dataflight
)
summary(fit)
##
## Call:
## lm(formula = Price ~ AdvancedBookingDays + Airline + Departure +
## IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes +
## SeatPitch + SeatWidth, data = dataflight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2671.2 -1266.2 -456.4 517.4 11953.9
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -4292.94 8897.87 -0.482 0.6298
## AdvancedBookingDays -87.70 12.47 -7.033 1.43e-11 ***
## AirlineIndiGo -577.17 778.64 -0.741 0.4591
## AirlineJet -120.75 436.69 -0.277 0.7823
## AirlineSpice Jet -1118.38 697.85 -1.603 0.1101
## DeparturePM -589.79 275.23 -2.143 0.0329 *
## IsWeekendYes -345.92 408.06 -0.848 0.3973
## IsDiwaliYes 4346.80 568.14 7.651 2.90e-13 ***
## DepartureCityCodeDEL -1413.46 351.54 -4.021 7.38e-05 ***
## FlyingMinutes 38.97 29.27 1.331 0.1841
## SeatPitch -279.19 226.64 -1.232 0.2190
## SeatWidth 868.58 507.54 1.711 0.0881 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2079 on 293 degrees of freedom
## Multiple R-squared: 0.2695, Adjusted R-squared: 0.2421
## F-statistic: 9.828 on 11 and 293 DF, p-value: 3.604e-15
# **************************
#log linear regression
#Take the log of Price straight from the data frame so this step does not
#depend on the attach()'d copy of the columns on the search path.
pricelog <- log(dataflight$Price)
#pricelog is a free-standing vector (not a column of dataflight): lm() finds it
#in the calling environment, while the predictors are taken from dataflight.
fit1 <- lm(
  pricelog ~ AdvancedBookingDays + Airline + Departure + IsWeekend + IsDiwali +
    DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = dataflight
)
summary(fit1)
##
## Call:
## lm(formula = pricelog ~ AdvancedBookingDays + Airline + Departure +
## IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes +
## SeatPitch + SeatWidth, data = dataflight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57006 -0.19770 -0.05792 0.12935 1.24672
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.549474 1.243788 5.266 2.71e-07 ***
## AdvancedBookingDays -0.014639 0.001743 -8.399 1.97e-15 ***
## AirlineIndiGo -0.098622 0.108842 -0.906 0.3656
## AirlineJet 0.001113 0.061043 0.018 0.9855
## AirlineSpice Jet -0.127169 0.097548 -1.304 0.1934
## DeparturePM -0.055844 0.038473 -1.452 0.1477
## IsWeekendYes -0.036748 0.057041 -0.644 0.5199
## IsDiwaliYes 0.744738 0.079418 9.377 < 2e-16 ***
## DepartureCityCodeDEL -0.264017 0.049140 -5.373 1.58e-07 ***
## FlyingMinutes 0.008717 0.004092 2.131 0.0340 *
## SeatPitch -0.032824 0.031681 -1.036 0.3010
## SeatWidth 0.122364 0.070947 1.725 0.0856 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2906 on 293 degrees of freedom
## Multiple R-squared: 0.3671, Adjusted R-squared: 0.3433
## F-statistic: 15.45 on 11 and 293 DF, p-value: < 2.2e-16
# **************************
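#Which model is better?
#Linear model has an adjusted R sq. of 24.21%
#Log-linear model has an adjusted R sq. of 34.33%, so it appears to fit better
#Caveat: the two models have different response variables (Price vs log(Price)), so their R sq. values are not strictly comparable; treat this comparison as indicative only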
# **************************
# Visual normality check: normal Q-Q plot of the residuals of each model
plot(fit, which = 2)

# Linear-linear model: the points on the right-hand side of the graph fall away from the 45 degree line
plot(fit1, which = 2)

# Log-linear model: the points lie much closer to the 45 degree line than in the previous model
# **************************
#Shapiro-Wilk test and Anderson-Darling test
#linear-linear model
#Tests on the raw response Price (the output shown is from the original run)
ad.test(Price)
##
## Anderson-Darling normality test
##
## data: Price
## A = 19.412, p-value < 2.2e-16
shapiro.test(Price)
##
## Shapiro-Wilk normality test
##
## data: Price
## W = 0.77653, p-value < 2.2e-16
#The regression normality assumption is about the error terms, not the raw
#response, so the residuals of the fitted linear-linear model are tested too.
ad.test(residuals(fit))
shapiro.test(residuals(fit))
#Is the normality assumption violated in the linear-linear model?
#Yes
#The p-value for Price is less than .05, so we cannot assume normality: the distribution of the data is significantly different from a normal distribution
#NOTE(review): this conclusion was drawn from the tests on raw Price; confirm it against the residual tests above
#log linear model
#Tests on the log-transformed response pricelog (the output shown is from the original run)
ad.test(pricelog)
##
## Anderson-Darling normality test
##
## data: pricelog
## A = 5.8513, p-value = 1.978e-14
shapiro.test(pricelog)
##
## Shapiro-Wilk normality test
##
## data: pricelog
## W = 0.93966, p-value = 8.002e-10
#The regression normality assumption is about the error terms, not the
#response, so the residuals of the fitted log-linear model are tested too.
ad.test(residuals(fit1))
shapiro.test(residuals(fit1))
#Is the normality assumption violated in the log-linear model?
#Yes, the p-value for pricelog is less than .05, so we cannot assume normality: the distribution of the data is significantly different from a normal distribution
#NOTE(review): this conclusion was drawn from the tests on pricelog itself; confirm it against the residual tests above
# **************************
# Visual linearity check: residuals-versus-fitted plot of each model
plot(fit, which = 1)

# Is the linearity assumption violated in the linear-linear model?
# Yes. The smoothed line in the residuals-versus-fitted plot is curved, which suggests that a straight-line relationship does not fully describe the data (this plot speaks to linearity, not to normality of the errors)
plot(fit1, which = 1)

# Is the linearity assumption violated in the log-linear model?
# Compared with the linear-linear model, the log-linear plot looks much closer to the ideal. However, the red line still shows curvature, so the linearity assumption is still violated
# **************************
#Box-Cox transformation
#Estimate the Box-Cox parameter (lambda) for Price. The column is read straight
#from the data frame so this step does not depend on the attach()'d copy.
PriceTrans <- BoxCoxTrans(dataflight$Price)
#Print the estimation summary, including the estimated lambda
PriceTrans
## Box-Cox Transformation
##
## 305 data points used to estimate Lambda
##
## Input data summary:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2607 4051 4681 5395 5725 18015
##
## Largest/Smallest: 6.91
## Sample Skewness: 2.26
##
## Estimated Lambda: -0.8
#What "lambda" value does Box-Cox transformation indicate?
#-0.8
#Append the Box-Cox transformed price to the data frame as column PriceNew.
#Column assignment (rather than cbind) keeps this step idempotent: re-running
#it overwrites PriceNew instead of appending a second column of the same name.
dataflight$PriceNew <- predict(PriceTrans, dataflight$Price)
head(dataflight)
## FlightNumber Airline DepartureCityCode ArrivalCityCode DepartureTime
## 1 9W 313 Jet DEL BOM 225
## 2 9W 339 Jet BOM DEL 300
## 3 SG 161 Spice Jet DEL BOM 350
## 4 6E 171 IndiGo DEL BOM 455
## 5 SG 160 Spice Jet BOM DEL 555
## 6 9W 762 Jet BOM DEL 605
## ArrivalTime Departure FlyingMinutes Aircraft PlaneModel Capacity
## 1 435 AM 130 Boeing 738 156
## 2 505 AM 125 Boeing 738 156
## 3 605 AM 135 Boeing 738 189
## 4 710 AM 135 Airbus A320 180
## 5 805 AM 130 Boeing 738 189
## 6 815 AM 130 Boeing 738 156
## SeatPitch SeatWidth DataCollectionDate DateDeparture IsWeekend Price
## 1 30 17 Sep 13 2018 Nov 6 2018 No 4051
## 2 30 17 Sep 15 2018 Nov 6 2018 No 11587
## 3 29 17 Sep 19 2018 Nov 6 2018 No 3977
## 4 30 18 Sep 8 2018 Nov 6 2018 No 4234
## 5 29 17 Sep 19 2018 Nov 6 2018 No 6837
## 6 30 17 Sep 15 2018 Nov 6 2018 No 6518
## AdvancedBookingDays IsDiwali DayBeforeDiwali DayAfterDiwali MarketShare
## 1 54 Yes Yes No 15.4
## 2 52 Yes Yes No 15.4
## 3 48 Yes Yes No 13.2
## 4 59 Yes Yes No 39.6
## 5 48 Yes Yes No 13.2
## 6 52 Yes Yes No 15.4
## LoadFactor PriceNew
## 1 83.32 1.248375
## 2 83.32 1.249299
## 3 94.06 1.248351
## 4 87.20 1.248431
## 5 94.06 1.248931
## 6 83.32 1.248889
#Expose the transformed price under the bare name PriceNew for the normality
#tests further down. A second attach(dataflight) would stack another full copy
#of the data frame on the search path and mask every column of the first copy
#(the original run printed that masking notice here), so only the one new
#vector is bound instead.
PriceNew <- dataflight$PriceNew
#Box-Cox transformed model: same predictors as the earlier fits, with the
#transformed price as the response (taken from dataflight via data =).
fit_trans <- lm(
  PriceNew ~ AdvancedBookingDays + Airline + Departure + IsWeekend + IsDiwali +
    DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = dataflight
)
summary(fit_trans)
##
## Call:
## lm(formula = PriceNew ~ AdvancedBookingDays + Airline + Departure +
## IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes +
## SeatPitch + SeatWidth, data = dataflight)
##
## Residuals:
## Min 1Q Median 3Q Max
## -7.868e-04 -1.930e-04 -2.246e-05 1.777e-04 9.443e-04
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.246e+00 1.218e-03 1022.900 < 2e-16 ***
## AdvancedBookingDays -1.551e-05 1.707e-06 -9.084 < 2e-16 ***
## AirlineIndiGo -1.190e-04 1.066e-04 -1.117 0.26509
## AirlineJet 6.273e-06 5.979e-05 0.105 0.91652
## AirlineSpice Jet -1.075e-04 9.555e-05 -1.125 0.26147
## DeparturePM -3.419e-05 3.769e-05 -0.907 0.36506
## IsWeekendYes -2.680e-05 5.587e-05 -0.480 0.63183
## IsDiwaliYes 7.987e-04 7.779e-05 10.267 < 2e-16 ***
## DepartureCityCodeDEL -2.844e-04 4.813e-05 -5.909 9.53e-09 ***
## FlyingMinutes 1.056e-05 4.008e-06 2.635 0.00887 **
## SeatPitch -2.475e-05 3.103e-05 -0.798 0.42579
## SeatWidth 1.143e-04 6.950e-05 1.645 0.10106
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0002847 on 293 degrees of freedom
## Multiple R-squared: 0.4183, Adjusted R-squared: 0.3965
## F-statistic: 19.16 on 11 and 293 DF, p-value: < 2.2e-16
# **************************
# Normality check for the Box-Cox transformed model: normal Q-Q plot of its residuals (to be compared with the linear-linear Q-Q plot)
plot(fit_trans, which = 2)

# Compared with the linear-linear model, the Q-Q plot looks much closer to normality, as the points lie much closer to the 45 degree line; however, no firm conclusion can be drawn from the plot alone
#Test the Box-Cox transformed model for normality, based on (i) Shapiro-Wilk test; (ii) Anderson-Darling test. Is the normality assumption violated?
#Tests on the transformed response PriceNew (the output shown is from the original run)
ad.test(PriceNew)
##
## Anderson-Darling normality test
##
## data: PriceNew
## A = 1.6763, p-value = 0.0002619
shapiro.test(PriceNew)
##
## Shapiro-Wilk normality test
##
## data: PriceNew
## W = 0.98545, p-value = 0.003551
#The regression normality assumption is about the error terms, not the
#response, so the residuals of the Box-Cox transformed model are tested too.
ad.test(residuals(fit_trans))
shapiro.test(residuals(fit_trans))
#The p-value for PriceNew is still less than 0.05, hence we cannot assume the distribution to be normal
#NOTE(review): this conclusion was drawn from the tests on PriceNew itself; confirm it against the residual tests above
# **************************
# Linearity check for the Box-Cox transformed model: residuals-versus-fitted plot (to be compared with the same plot for the linear-linear model)
plot(fit_trans, which = 1)

# The smoothed line in the residual plot shows curvature similar to that of the linear-linear model, which suggests the linearity assumption is still violated (this plot speaks to linearity, not to normality of the errors)