# **************************

#A.1. Linear regression
# Linear-linear specification: Price regressed on the booking, airline,
# schedule, route and seat covariates. No data argument is passed to lm(), so
# the variables are looked up from the formula's environment / search path.
model0 <- Price ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
  IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth
linearlinearfit <- lm(formula = model0)
summary(linearlinearfit)
## 
## Call:
## lm(formula = model0)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2671.2 -1266.2  -456.4   517.4 11953.9 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -4292.94    8897.87  -0.482   0.6298    
## AdvancedBookingDays    -87.70      12.47  -7.033 1.43e-11 ***
## AirlineIndiGo         -577.17     778.64  -0.741   0.4591    
## AirlineJet            -120.75     436.69  -0.277   0.7823    
## AirlineSpice Jet     -1118.38     697.85  -1.603   0.1101    
## DeparturePM           -589.79     275.23  -2.143   0.0329 *  
## IsWeekendYes          -345.92     408.06  -0.848   0.3973    
## IsDiwaliYes           4346.80     568.14   7.651 2.90e-13 ***
## DepartureCityCodeDEL -1413.46     351.54  -4.021 7.38e-05 ***
## FlyingMinutes           38.97      29.27   1.331   0.1841    
## SeatPitch             -279.19     226.64  -1.232   0.2190    
## SeatWidth              868.58     507.54   1.711   0.0881 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2079 on 293 degrees of freedom
## Multiple R-squared:  0.2695, Adjusted R-squared:  0.2421 
## F-statistic: 9.828 on 11 and 293 DF,  p-value: 3.604e-15

# **************************

#A.2. Log-linear regression
# Same right-hand side as model0, but the response is the natural log of Price.
logPrice <- log(Price)
model1 <- logPrice ~ AdvancedBookingDays + Airline + Departure + IsWeekend +
  IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch + SeatWidth
loglinearfit <- lm(formula = model1)
summary(loglinearfit)
## 
## Call:
## lm(formula = model1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.57006 -0.19770 -0.05792  0.12935  1.24672 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           6.549474   1.243788   5.266 2.71e-07 ***
## AdvancedBookingDays  -0.014639   0.001743  -8.399 1.97e-15 ***
## AirlineIndiGo        -0.098622   0.108842  -0.906   0.3656    
## AirlineJet            0.001113   0.061043   0.018   0.9855    
## AirlineSpice Jet     -0.127169   0.097548  -1.304   0.1934    
## DeparturePM          -0.055844   0.038473  -1.452   0.1477    
## IsWeekendYes         -0.036748   0.057041  -0.644   0.5199    
## IsDiwaliYes           0.744738   0.079418   9.377  < 2e-16 ***
## DepartureCityCodeDEL -0.264017   0.049140  -5.373 1.58e-07 ***
## FlyingMinutes         0.008717   0.004092   2.131   0.0340 *  
## SeatPitch            -0.032824   0.031681  -1.036   0.3010    
## SeatWidth             0.122364   0.070947   1.725   0.0856 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2906 on 293 degrees of freedom
## Multiple R-squared:  0.3671, Adjusted R-squared:  0.3433 
## F-statistic: 15.45 on 11 and 293 DF,  p-value: < 2.2e-16

# **************************

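#A.3. Which model is better?
#The linear-linear model has an adjusted R-squared of 0.2421 (24.21%).
#The log-linear model has an adjusted R-squared of 0.3433 (34.33%), so it explains more of the variation in its response and looks like the better fit.
#Caveat: the two models have different response variables (Price vs. log(Price)), so their R-squared values are not strictly comparable; the residual diagnostics below should be considered as well.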

# **************************

#A.4a. Visual inspection test for normality using Q-Q plots
# Linear-linear model: Q-Q plot of the model residuals, followed by a Q-Q plot
# of the raw response with a red reference line
plot(linearlinearfit, which = 2)

qqnorm(y = Price)
qqline(y = Price, col = "red")

#Data points on the right hand side of the graph are not close to the 45 degree line
# Log-linear model: the same two plots for its residuals and for logPrice
plot(loglinearfit, which = 2)

qqnorm(y = logPrice)
qqline(y = logPrice, col = "red")

#Data points are much closer to the 45 degree line as compared to the previous model

# **************************

#A.4b. Shapiro-Wilk test and Anderson-Darling test
#linear linear model
# Marginal distribution of the response variable
ad.test(Price)
## 
##  Anderson-Darling normality test
## 
## data:  Price
## A = 19.412, p-value < 2.2e-16
shapiro.test(Price)
## 
##  Shapiro-Wilk normality test
## 
## data:  Price
## W = 0.77653, p-value < 2.2e-16
# The normality assumption of a linear regression is about the error terms,
# not the raw response, so the residuals of the fitted model are tested too
# (output for these two calls is not recorded in this transcript).
ad.test(residuals(linearlinearfit))
shapiro.test(residuals(linearlinearfit))
#Is the normality assumption violated in the linear-linear model?
#Yes
#The p-value for Price is less than .05 in both tests, so the distribution of Price is significantly different from a normal distribution. The residual tests above check the model assumption directly and should be used to confirm this conclusion.
#log linear model
# Marginal distribution of the log-transformed response
ad.test(logPrice)
## 
##  Anderson-Darling normality test
## 
## data:  logPrice
## A = 5.8513, p-value = 1.978e-14
shapiro.test(logPrice)
## 
##  Shapiro-Wilk normality test
## 
## data:  logPrice
## W = 0.93966, p-value = 8.002e-10
# Residual-based tests for the log-linear fit (output not recorded in this
# transcript)
ad.test(residuals(loglinearfit))
shapiro.test(residuals(loglinearfit))
#Is the normality assumption violated in the log-linear model?
#Yes, the p-value for logPrice is less than .05 in both tests, so the distribution of logPrice is significantly different from a normal distribution. The residual tests above check the model assumption directly and should be used to confirm this conclusion.

# **************************

#A.5. Visual inspection using residual plots
# Residuals vs Fitted plot for the linear-linear model
plot(linearlinearfit, which = 1)

#Is the linearity assumption violated in the linear-linear model?
#Yes. The smoothed line in the Residuals vs Fitted plot shows curvature, which suggests that the relationship between Price and the predictors is not linear.
# Residuals vs Fitted plot for the log-linear model
plot(loglinearfit, which = 1)

#Is the linearity assumption violated in the log-linear model?
#Compared with the linear-linear model, the log-linear plot looks closer to a flat line. However, the red line still shows curvature, which means the linearity assumption is violated.

# **************************

#A.6. Box-Cox transformation
# Estimate the Box-Cox transformation for Price and print its summary
PriceTrans <- BoxCoxTrans(Price)
print(PriceTrans)
## Box-Cox Transformation
## 
## 305 data points used to estimate Lambda
## 
## Input data summary:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2607    4051    4681    5395    5725   18015 
## 
## Largest/Smallest: 6.91 
## Sample Skewness: 2.26 
## 
## Estimated Lambda: -0.8
#What "lambda" value does Box-Cox transformation indicate?
#-0.8
# Store the Box-Cox transformed price as column PriceNew. Assigning the column
# directly (instead of cbind()) keeps this step idempotent: re-running it
# overwrites PriceNew rather than appending a duplicate column.
airline.df$PriceNew <- predict(PriceTrans, Price)
head(airline.df)
##   FlightNumber   Airline DepartureCityCode ArrivalCityCode DepartureTime
## 1       9W 313       Jet               DEL             BOM           225
## 2       9W 339       Jet               BOM             DEL           300
## 3       SG 161 Spice Jet               DEL             BOM           350
## 4       6E 171    IndiGo               DEL             BOM           455
## 5       SG 160 Spice Jet               BOM             DEL           555
## 6       9W 762       Jet               BOM             DEL           605
##   ArrivalTime Departure FlyingMinutes Aircraft PlaneModel Capacity
## 1         435        AM           130   Boeing        738      156
## 2         505        AM           125   Boeing        738      156
## 3         605        AM           135   Boeing        738      189
## 4         710        AM           135   Airbus       A320      180
## 5         805        AM           130   Boeing        738      189
## 6         815        AM           130   Boeing        738      156
##   SeatPitch SeatWidth DataCollectionDate DateDeparture IsWeekend Price
## 1        30        17        Sep 13 2018    Nov 6 2018        No  4051
## 2        30        17        Sep 15 2018    Nov 6 2018        No 11587
## 3        29        17        Sep 19 2018    Nov 6 2018        No  3977
## 4        30        18         Sep 8 2018    Nov 6 2018        No  4234
## 5        29        17        Sep 19 2018    Nov 6 2018        No  6837
## 6        30        17        Sep 15 2018    Nov 6 2018        No  6518
##   AdvancedBookingDays IsDiwali DayBeforeDiwali DayAfterDiwali MarketShare
## 1                  54      Yes             Yes             No        15.4
## 2                  52      Yes             Yes             No        15.4
## 3                  48      Yes             Yes             No        13.2
## 4                  59      Yes             Yes             No        39.6
## 5                  48      Yes             Yes             No        13.2
## 6                  52      Yes             Yes             No        15.4
##   LoadFactor PriceNew
## 1      83.32 1.248375
## 2      83.32 1.249299
## 3      94.06 1.248351
## 4      87.20 1.248431
## 5      94.06 1.248931
## 6      83.32 1.248889
# Expose the new column as a plain variable instead of attaching airline.df a
# second time: a repeated attach() stacks another copy of the data frame on the
# search path and masks the columns of the copy that is already attached.
PriceNew <- airline.df$PriceNew
# Refit the same specification with the Box-Cox transformed price as response
fit_trans <- lm(formula = PriceNew ~ AdvancedBookingDays + Airline + Departure +
  IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + SeatPitch +
  SeatWidth)
summary(fit_trans)
## 
## Call:
## lm(formula = PriceNew ~ AdvancedBookingDays + Airline + Departure + 
##     IsWeekend + IsDiwali + DepartureCityCode + FlyingMinutes + 
##     SeatPitch + SeatWidth)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -7.868e-04 -1.930e-04 -2.246e-05  1.777e-04  9.443e-04 
## 
## Coefficients:
##                        Estimate Std. Error  t value Pr(>|t|)    
## (Intercept)           1.246e+00  1.218e-03 1022.900  < 2e-16 ***
## AdvancedBookingDays  -1.551e-05  1.707e-06   -9.084  < 2e-16 ***
## AirlineIndiGo        -1.190e-04  1.066e-04   -1.117  0.26509    
## AirlineJet            6.273e-06  5.979e-05    0.105  0.91652    
## AirlineSpice Jet     -1.075e-04  9.555e-05   -1.125  0.26147    
## DeparturePM          -3.419e-05  3.769e-05   -0.907  0.36506    
## IsWeekendYes         -2.680e-05  5.587e-05   -0.480  0.63183    
## IsDiwaliYes           7.987e-04  7.779e-05   10.267  < 2e-16 ***
## DepartureCityCodeDEL -2.844e-04  4.813e-05   -5.909 9.53e-09 ***
## FlyingMinutes         1.056e-05  4.008e-06    2.635  0.00887 ** 
## SeatPitch            -2.475e-05  3.103e-05   -0.798  0.42579    
## SeatWidth             1.143e-04  6.950e-05    1.645  0.10106    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0002847 on 293 degrees of freedom
## Multiple R-squared:  0.4183, Adjusted R-squared:  0.3965 
## F-statistic: 19.16 on 11 and 293 DF,  p-value: < 2.2e-16

# **************************

#A.7a. Test the Box-Cox transformed model for normality, based on visual examination of a Q-Q plot (and compare it with the linear-linear Q-Q plot)
# Q-Q plot of the residuals of the Box-Cox transformed model
plot(fit_trans, which = 2)

#Compared with the linear-linear model, the Q-Q plot seems much closer to normality, as the data points are much closer to the 45 degree line; however, a concrete conclusion cannot be derived from the plot alone.

# **************************

#A.7b. Test the Box-Cox transformed model for normality, based on (i) Shapiro-Wilk test; (ii) Anderson-Darling test. Is the normality assumption violated?
# Marginal distribution of the transformed response
ad.test(PriceNew)
## 
##  Anderson-Darling normality test
## 
## data:  PriceNew
## A = 1.6763, p-value = 0.0002619
shapiro.test(PriceNew)
## 
##  Shapiro-Wilk normality test
## 
## data:  PriceNew
## W = 0.98545, p-value = 0.003551
# The normality assumption of the model is about its error terms, so the
# residuals of the transformed fit are tested as well (output for these two
# calls is not recorded in this transcript).
ad.test(residuals(fit_trans))
shapiro.test(residuals(fit_trans))
#The p-value for PriceNew is still less than 0.05 in both tests, hence we cannot assume its distribution to be normal. The residual tests above check the model assumption directly and should be used to confirm this conclusion.

# **************************

#A.7c. Test the Box-Cox transformed model for linearity, based on visual examination of the Residuals vs Fitted plot (and compare it with the Residuals vs Fitted plot based on the linear-linear model)
# Residuals vs Fitted plot for the Box-Cox transformed model
plot(fit_trans, which = 1)

# Residuals vs Fitted plot for the linear-linear model, for comparison
plot(linearlinearfit, which = 1)

#The Residuals vs Fitted line shows curvature similar to the linear-linear model, which suggests the linearity assumption is still violated. However, the distance from the 0 line is smaller.