# Script setup: point R at the data folder, load the flight data and the
# packages used by the diagnostics further down.
# NOTE(review): the working directory is an absolute, machine-specific path;
# the script runs unchanged only where this folder exists.
setwd("C:/Users/Anshumaan/Desktop/2018 DAM/session 11")
library(stats)
library(graphics)
dataflight<-read.csv("BOMDELBOM.csv")# import the flight data from the working directory
# attach() puts the data frame's columns on the search path, which is why
# later statements refer to Price, Airline, ... without a dataflight$ prefix.
attach(dataflight)# attaching data columns
library(nortest)# provides ad.test() (Anderson-Darling); shapiro.test() comes from stats
library(caret)# provides BoxCoxTrans() for the Box-Cox transformation
## Loading required package: lattice
## Loading required package: ggplot2

# **************************

# Linear-linear model: regress the raw Price on booking lead time, airline,
# departure slot, weekend/Diwali indicators, departure city, flying time and
# seat dimensions.
fit <- lm(
  Price ~ AdvancedBookingDays + Airline + Departure + IsWeekend + IsDiwali +
    DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = dataflight
)
summary(fit)
## 
## Call:
## lm(formula = Price ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth, data = dataflight)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2671.2 -1266.2  -456.4   517.4 11953.9 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4292.94    8897.87  -0.482   0.6298    
## AdvancedBookingDays    -87.70      12.47  -7.033 1.43e-11 ***
## AirlineIndiGo         -577.17     778.64  -0.741   0.4591    
## AirlineJet            -120.75     436.69  -0.277   0.7823    
## AirlineSpice Jet     -1118.38     697.85  -1.603   0.1101    
## DeparturePM           -589.79     275.23  -2.143   0.0329 *  
## IsWeekendYes          -345.92     408.06  -0.848   0.3973    
## IsDiwaliYes           4346.80     568.14   7.651 2.90e-13 ***
## DepartureCityCodeDEL -1413.46     351.54  -4.021 7.38e-05 ***
## FlyingMinutes           38.97      29.27   1.331   0.1841    
## SeatPitch             -279.19     226.64  -1.232   0.2190    
## SeatWidth              868.58     507.54   1.711   0.0881 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2079 on 293 degrees of freedom
## Multiple R-squared:  0.2695, Adjusted R-squared:  0.2421 
## F-statistic: 9.828 on 11 and 293 DF,  p-value: 3.604e-15

# **************************

# Log-linear model: same predictors as the linear model, but the response is
# the natural log of Price.
# pricelog stays a standalone vector because the normality tests further
# down refer to it by name.
pricelog <- log(Price)
fit1 <- lm(
  pricelog ~ AdvancedBookingDays + Airline + Departure + IsWeekend + IsDiwali +
    DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = dataflight
)
summary(fit1)
## 
## Call:
## lm(formula = pricelog ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth, data = dataflight)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57006 -0.19770 -0.05792  0.12935  1.24672 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.549474   1.243788   5.266 2.71e-07 ***
## AdvancedBookingDays  -0.014639   0.001743  -8.399 1.97e-15 ***
## AirlineIndiGo        -0.098622   0.108842  -0.906   0.3656    
## AirlineJet            0.001113   0.061043   0.018   0.9855    
## AirlineSpice Jet     -0.127169   0.097548  -1.304   0.1934    
## DeparturePM          -0.055844   0.038473  -1.452   0.1477    
## IsWeekendYes         -0.036748   0.057041  -0.644   0.5199    
## IsDiwaliYes           0.744738   0.079418   9.377  < 2e-16 ***
## DepartureCityCodeDEL -0.264017   0.049140  -5.373 1.58e-07 ***
## FlyingMinutes         0.008717   0.004092   2.131   0.0340 *  
## SeatPitch            -0.032824   0.031681  -1.036   0.3010    
## SeatWidth             0.122364   0.070947   1.725   0.0856 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2906 on 293 degrees of freedom
## Multiple R-squared:  0.3671, Adjusted R-squared:  0.3433 
## F-statistic: 15.45 on 11 and 293 DF,  p-value: < 2.2e-16

# **************************

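#Which model is better?
#The linear model has an adjusted R sq. of 24.21%.
#The log-linear model has an adjusted R sq. of 34.33%, so it appears to fit better.
#Caveat: the two values are measured on different response scales (Price vs. log(Price)), so this comparison is only indicative.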

# **************************

# Visual normality check: normal Q-Q plot of the residuals of each model.
plot(fit, which = 2)

# Linear-linear model: the points on the right-hand side of the graph are not close to the reference line.
plot(fit1, which = 2)

# Log-linear model: the points lie much closer to the reference line than in the previous model.

# **************************

# Shapiro-Wilk test and Anderson-Darling test
# linear-linear model
# The two calls on Price describe the raw response only; their recorded
# output follows each call.
ad.test(Price)
## 
##  Anderson-Darling normality test
## 
## data:  Price
## A = 19.412, p-value < 2.2e-16
shapiro.test(Price)
## 
##  Shapiro-Wilk normality test
## 
## data:  Price
## W = 0.77653, p-value < 2.2e-16
# The regression normality assumption is about the error term, not the raw
# response, so the residuals of fit are tested as well.
# No output is recorded for these two calls; re-run the script to obtain it.
ad.test(residuals(fit))
shapiro.test(residuals(fit))
# Is the normality assumption violated in the linear-linear model?
# The raw Price is clearly non-normal: both p-values are less than .05, so its distribution differs significantly from a normal distribution.
# For the model itself the answer rests on the residual tests above: a p-value below .05 there means the assumption is violated.
# log-linear model
# The two calls on pricelog describe the log-transformed response only;
# their recorded output follows each call.
ad.test(pricelog)
## 
##  Anderson-Darling normality test
## 
## data:  pricelog
## A = 5.8513, p-value = 1.978e-14
shapiro.test(pricelog)
## 
##  Shapiro-Wilk normality test
## 
## data:  pricelog
## W = 0.93966, p-value = 8.002e-10
# The regression normality assumption is about the error term, not the
# response, so the residuals of fit1 are tested as well.
# No output is recorded for these two calls; re-run the script to obtain it.
ad.test(residuals(fit1))
shapiro.test(residuals(fit1))
# Is the normality assumption violated in the log-linear model?
# pricelog itself is still non-normal: both p-values are less than .05, so its distribution differs significantly from a normal distribution.
# For the model itself the answer rests on the residual tests above: a p-value below .05 there means the assumption is violated.

# **************************

# Visual inspection using the Residuals vs Fitted plot of each model.
# This plot diagnoses linearity (and constant variance), not normality.
plot(fit, which = 1)

# Is the linearity assumption violated in the linear-linear model?
# Yes. The smoothed line in the residual plot shows curvature, which suggests the relationship between Price and the predictors is not linear.
plot(fit1, which = 1)

# Is the linearity assumption violated in the log-linear model?
# Compared with the linear-linear model the plot looks better behaved. However, the red smoothed line still shows curvature, so the linearity assumption is still violated.

# **************************

# Box-Cox transformation: estimate the power parameter (lambda) for Price
# and print the fitted transformation summary.
PriceTrans <- BoxCoxTrans(Price)
print(PriceTrans)
## Box-Cox Transformation
## 
## 305 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2607    4051    4681    5395    5725   18015 
## 
## Largest/Smallest: 6.91 
## Sample Skewness: 2.26 
## 
## Estimated Lambda: -0.8
# What "lambda" value does the Box-Cox transformation indicate?
# -0.8
# Apply the estimated transformation to Price and append the result to
# dataflight as a new last column named PriceNew, then show the first rows.
dataflight[["PriceNew"]] <- predict(PriceTrans, Price)
head(dataflight, n = 6L)
##   FlightNumber   Airline DepartureCityCode ArrivalCityCode DepartureTime
## 1       9W 313       Jet               DEL             BOM           225
## 2       9W 339       Jet               BOM             DEL           300
## 3       SG 161 Spice Jet               DEL             BOM           350
## 4       6E 171    IndiGo               DEL             BOM           455
## 5       SG 160 Spice Jet               BOM             DEL           555
## 6       9W 762       Jet               BOM             DEL           605
##   ArrivalTime Departure FlyingMinutes Aircraft PlaneModel Capacity
## 1         435        AM           130   Boeing        738      156
## 2         505        AM           125   Boeing        738      156
## 3         605        AM           135   Boeing        738      189
## 4         710        AM           135   Airbus       A320      180
## 5         805        AM           130   Boeing        738      189
## 6         815        AM           130   Boeing        738      156
##   SeatPitch SeatWidth DataCollectionDate DateDeparture IsWeekend Price
## 1        30        17        Sep 13 2018    Nov 6 2018        No  4051
## 2        30        17        Sep 15 2018    Nov 6 2018        No 11587
## 3        29        17        Sep 19 2018    Nov 6 2018        No  3977
## 4        30        18         Sep 8 2018    Nov 6 2018        No  4234
## 5        29        17        Sep 19 2018    Nov 6 2018        No  6837
## 6        30        17        Sep 15 2018    Nov 6 2018        No  6518
##   AdvancedBookingDays IsDiwali DayBeforeDiwali DayAfterDiwali MarketShare
## 1                  54      Yes             Yes             No        15.4
## 2                  52      Yes             Yes             No        15.4
## 3                  48      Yes             Yes             No        13.2
## 4                  59      Yes             Yes             No        39.6
## 5                  48      Yes             Yes             No        13.2
## 6                  52      Yes             Yes             No        15.4
##   LoadFactor PriceNew
## 1      83.32 1.248375
## 2      83.32 1.249299
## 3      94.06 1.248351
## 4      87.20 1.248431
## 5      94.06 1.248931
## 6      83.32 1.248889
# Expose the new column as a standalone vector for the tests further down,
# instead of attaching dataflight a second time: a second attach() leaves two
# copies of every column on the search path, with one masking the other.
PriceNew <- dataflight$PriceNew
# Box-Cox model: same predictors as the earlier models, with the
# Box-Cox-transformed price (PriceNew) as the response.
fit_trans <- lm(
  PriceNew ~ AdvancedBookingDays + Airline + Departure + IsWeekend + IsDiwali +
    DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth,
  data = dataflight
)
summary(fit_trans)
## 
## Call:
## lm(formula = PriceNew ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth, data = dataflight)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.868e-04 -1.930e-04 -2.246e-05  1.777e-04  9.443e-04 
## 
## Coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           1.246e+00  1.218e-03 1022.900  < 2e-16 ***
## AdvancedBookingDays  -1.551e-05  1.707e-06   -9.084  < 2e-16 ***
## AirlineIndiGo        -1.190e-04  1.066e-04   -1.117  0.26509    
## AirlineJet            6.273e-06  5.979e-05    0.105  0.91652    
## AirlineSpice Jet     -1.075e-04  9.555e-05   -1.125  0.26147    
## DeparturePM          -3.419e-05  3.769e-05   -0.907  0.36506    
## IsWeekendYes         -2.680e-05  5.587e-05   -0.480  0.63183    
## IsDiwaliYes           7.987e-04  7.779e-05   10.267  < 2e-16 ***
## DepartureCityCodeDEL -2.844e-04  4.813e-05   -5.909 9.53e-09 ***
## FlyingMinutes         1.056e-05  4.008e-06    2.635  0.00887 ** 
## SeatPitch            -2.475e-05  3.103e-05   -0.798  0.42579    
## SeatWidth             1.143e-04  6.950e-05    1.645  0.10106    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002847 on 293 degrees of freedom
## Multiple R-squared:  0.4183, Adjusted R-squared:  0.3965 
## F-statistic: 19.16 on 11 and 293 DF,  p-value: < 2.2e-16

# **************************

# Test the Box-Cox transformed model for normality by visual examination of its Q-Q plot (compare with the Q-Q plot of the linear-linear model).
plot(fit_trans, which = 2)

# Compared with the linear-linear model, the Q-Q plot looks much closer to normality: the points lie much closer to the reference line. A firm conclusion cannot be drawn from the plot alone.
# Test the Box-Cox transformed model for normality, based on (i) Shapiro-Wilk test; (ii) Anderson-Darling test. Is the normality assumption violated?
# The two calls on PriceNew describe the transformed response only; their
# recorded output follows each call.
ad.test(PriceNew)
## 
##  Anderson-Darling normality test
## 
## data:  PriceNew
## A = 1.6763, p-value = 0.0002619
shapiro.test(PriceNew)
## 
##  Shapiro-Wilk normality test
## 
## data:  PriceNew
## W = 0.98545, p-value = 0.003551
# The p-value is still less than 0.05, hence the transformed response cannot be assumed to be normally distributed.
# The regression normality assumption is about the error term, so the
# residuals of fit_trans are tested as well.
# No output is recorded for these two calls; re-run the script to obtain it.
ad.test(residuals(fit_trans))
shapiro.test(residuals(fit_trans))

# **************************

# Test the Box-Cox transformed model for linearity by visual examination of the Residuals vs Fitted plot (compare with the same plot for the linear-linear model).
plot(fit_trans, which = 1)

# The smoothed line in the residual plot shows curvature similar to the linear-linear model, which suggests the relationship between the response and the predictors is still not linear.