# Load the six-airline fare data set and print a per-column summary.
# stringsAsFactors = TRUE is set explicitly: since R 4.0.0 read.csv() keeps
# text columns as character by default, whereas the rest of this script
# treats Airline, Aircraft, TravelMonth and IsInternational as factors
# (per-level counts in summary(), plot() of TravelMonth).
# NOTE(review): the path is machine-specific; point it at your copy of the CSV.
air.df <- read.csv(
  "C:/Users/here_is_sachin/Downloads/R udemy/SixAirlinesDataV2.csv",
  stringsAsFactors = TRUE
)
summary(air.df)
## Airline Aircraft FlightDuration TravelMonth
## AirFrance: 74 AirBus:151 Min. : 1.250 Aug:127
## British :175 Boeing:307 1st Qu.: 4.260 Jul: 75
## Delta : 46 Median : 7.790 Oct:127
## Jet : 61 Mean : 7.578 Sep:129
## Singapore: 40 3rd Qu.:10.620
## Virgin : 62 Max. :14.660
## IsInternational SeatsEconomy SeatsPremium PitchEconomy
## Domestic : 40 Min. : 78.0 Min. : 8.00 Min. :30.00
## International:418 1st Qu.:133.0 1st Qu.:21.00 1st Qu.:31.00
## Median :185.0 Median :36.00 Median :31.00
## Mean :202.3 Mean :33.65 Mean :31.22
## 3rd Qu.:243.0 3rd Qu.:40.00 3rd Qu.:32.00
## Max. :389.0 Max. :66.00 Max. :33.00
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## Min. :34.00 Min. :17.00 Min. :17.00 Min. : 65
## 1st Qu.:38.00 1st Qu.:18.00 1st Qu.:19.00 1st Qu.: 413
## Median :38.00 Median :18.00 Median :19.00 Median :1242
## Mean :37.91 Mean :17.84 Mean :19.47 Mean :1327
## 3rd Qu.:38.00 3rd Qu.:18.00 3rd Qu.:21.00 3rd Qu.:1909
## Max. :40.00 Max. :19.00 Max. :21.00 Max. :3593
## PricePremium PriceRelative SeatsTotal PitchDifference
## Min. : 86.0 Min. :0.0200 Min. : 98 Min. : 2.000
## 1st Qu.: 528.8 1st Qu.:0.1000 1st Qu.:166 1st Qu.: 6.000
## Median :1737.0 Median :0.3650 Median :227 Median : 7.000
## Mean :1845.3 Mean :0.4872 Mean :236 Mean : 6.688
## 3rd Qu.:2989.0 3rd Qu.:0.7400 3rd Qu.:279 3rd Qu.: 7.000
## Max. :7414.0 Max. :1.8900 Max. :441 Max. :10.000
## WidthDifference PercentPremiumSeats
## Min. :0.000 Min. : 4.71
## 1st Qu.:1.000 1st Qu.:12.28
## Median :1.000 Median :13.21
## Mean :1.633 Mean :14.65
## 3rd Qu.:3.000 3rd Qu.:15.36
## Max. :4.000 Max. :24.69
Different Airlines
# Count the rows belonging to each airline and display the resulting table.
Dif <- table(air.df[["Airline"]])
print(Dif)
##
## AirFrance British Delta Jet Singapore Virgin
## 74 175 46 61 40 62
# Bar chart of the per-airline counts held in `Dif`.
barplot(Dif)
# Bar chart of the number of rows per travel month.
# The months get an explicit chronological level order (the default
# alphabetical order would draw Aug, Jul, Oct, Sep), and wrapping the column
# in factor() keeps plot() working when TravelMonth was read as character
# (the read.csv() default since R 4.0). The misspelled title ("Frequecy")
# is corrected as well.
# NOTE(review): only Jul-Oct occur in the summary of air.df; any other month
# value would become NA here and be left out of the chart.
month_order <- c("Jul", "Aug", "Sep", "Oct")
plot(
  factor(air.df$TravelMonth, levels = month_order),
  main = "Frequency of flights in months"
)
Histogram
# Side-by-side histograms of the economy and premium-economy ticket prices.
par(mfrow = c(1, 2))  # one row, two panels
hist(
  air.df[["PriceEconomy"]],
  breaks = 6,
  ylim = c(0, 150),
  xlab = "Price Economy",
  main = "Economy Price"
)
hist(
  air.df[["PricePremium"]],
  breaks = 6,
  xlab = "Price Premium",
  main = "Premium Price"
)
# Side-by-side box plots of the economy and premium-economy seat counts.
# `breaks` is a hist() argument, not a documented boxplot() argument, so it
# is no longer passed. The layout is reset to a single panel afterwards so
# that later plots are not squeezed into a leftover 1 x 2 grid.
par(mfrow = c(1, 2))
boxplot(air.df$SeatsEconomy, main = "Economy Seats")
boxplot(air.df$SeatsPremium, main = "Premium Seats")
par(mfrow = c(1, 1))
Airline-Wise Economy Price Distribution
# Horizontal box plots of the economy ticket price, one box per airline.
# The y axis is switched off in the boxplot() call (yaxt = "n") and then
# drawn by hand with abbreviated airline labels at positions 1 to 6.
boxplot(
  PriceEconomy ~ Airline,
  data = air.df,
  horizontal = TRUE,
  yaxt = "n",
  main = "Airline-wise Economy class-price distribution",
  xlab = "Price in USD"
)
axis(
  side = 2,
  at = 1:6,
  labels = c("Aif", "Bts", "Delt", "Jet", "Sigp", "Vg")
)
Price-difference Vs Flight Duration
library(car)
# Scatter plot (scatterplot() from the car package) of the relative price
# difference against the flight duration, using solid points (pch = 19)
# drawn at 90% of the default size.
scatterplot(
  PriceRelative ~ FlightDuration,
  data = air.df,
  pch = 19,
  cex = 0.9,
  main = " Relative price difference vs Flight Duration"
)
Correlation
# Pairwise correlation matrix of every numeric column of air.df, rounded to
# two decimals; non-numeric columns are dropped before calling cor().
round(
  cor(air.df[vapply(air.df, is.numeric, logical(1))]),
  digits = 2
)
## FlightDuration SeatsEconomy SeatsPremium PitchEconomy
## FlightDuration 1.00 0.20 0.16 0.29
## SeatsEconomy 0.20 1.00 0.63 0.14
## SeatsPremium 0.16 0.63 1.00 -0.03
## PitchEconomy 0.29 0.14 -0.03 1.00
## PitchPremium 0.10 0.12 0.00 -0.55
## WidthEconomy 0.46 0.37 0.46 0.29
## WidthPremium 0.10 0.10 0.00 -0.54
## PriceEconomy 0.57 0.13 0.11 0.37
## PricePremium 0.65 0.18 0.22 0.23
## PriceRelative 0.12 0.00 -0.10 -0.42
## SeatsTotal 0.20 0.99 0.72 0.12
## PitchDifference -0.04 0.04 0.02 -0.78
## WidthDifference -0.12 -0.08 -0.22 -0.64
## PercentPremiumSeats 0.06 -0.33 0.49 -0.10
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## FlightDuration 0.10 0.46 0.10 0.57
## SeatsEconomy 0.12 0.37 0.10 0.13
## SeatsPremium 0.00 0.46 0.00 0.11
## PitchEconomy -0.55 0.29 -0.54 0.37
## PitchPremium 1.00 -0.02 0.75 0.05
## WidthEconomy -0.02 1.00 0.08 0.07
## WidthPremium 0.75 0.08 1.00 -0.06
## PriceEconomy 0.05 0.07 -0.06 1.00
## PricePremium 0.09 0.15 0.06 0.90
## PriceRelative 0.42 -0.04 0.50 -0.29
## SeatsTotal 0.11 0.41 0.09 0.13
## PitchDifference 0.95 -0.13 0.76 -0.10
## WidthDifference 0.70 -0.39 0.88 -0.08
## PercentPremiumSeats -0.18 0.23 -0.18 0.07
## PricePremium PriceRelative SeatsTotal PitchDifference
## FlightDuration 0.65 0.12 0.20 -0.04
## SeatsEconomy 0.18 0.00 0.99 0.04
## SeatsPremium 0.22 -0.10 0.72 0.02
## PitchEconomy 0.23 -0.42 0.12 -0.78
## PitchPremium 0.09 0.42 0.11 0.95
## WidthEconomy 0.15 -0.04 0.41 -0.13
## WidthPremium 0.06 0.50 0.09 0.76
## PriceEconomy 0.90 -0.29 0.13 -0.10
## PricePremium 1.00 0.03 0.19 -0.02
## PriceRelative 0.03 1.00 -0.01 0.47
## SeatsTotal 0.19 -0.01 1.00 0.03
## PitchDifference -0.02 0.47 0.03 1.00
## WidthDifference -0.01 0.49 -0.11 0.76
## PercentPremiumSeats 0.12 -0.16 -0.22 -0.09
## WidthDifference PercentPremiumSeats
## FlightDuration -0.12 0.06
## SeatsEconomy -0.08 -0.33
## SeatsPremium -0.22 0.49
## PitchEconomy -0.64 -0.10
## PitchPremium 0.70 -0.18
## WidthEconomy -0.39 0.23
## WidthPremium 0.88 -0.18
## PriceEconomy -0.08 0.07
## PricePremium -0.01 0.12
## PriceRelative 0.49 -0.16
## SeatsTotal -0.11 -0.22
## PitchDifference 0.76 -0.09
## WidthDifference 1.00 -0.28
## PercentPremiumSeats -0.28 1.00
Regression Analysis
# Linear model of the absolute price gap (premium minus economy) on the
# pitch difference, the width difference and the flight duration.
# Columns are now looked up through `data = air.df` instead of being written
# as `air.df$...` inside the formula. The estimates are unchanged, but the
# term labels lose the "air.df$" prefix (any printed output produced with the
# old formula still shows it) and predict(fit, newdata = ...) becomes usable,
# which it is not when the formula hard-wires vectors of one data frame.
# Assignment uses `<-` rather than `=`.
fit <- lm(
  I(PricePremium - PriceEconomy) ~
    PitchDifference + WidthDifference + FlightDuration,
  data = air.df
)
summary(fit)
##
## Call:
## lm(formula = (air.df$PricePremium - air.df$PriceEconomy) ~ air.df$PitchDifference +
## air.df$WidthDifference + air.df$FlightDuration)
##
## Residuals:
## Min 1Q Median 3Q Max
## -859.4 -324.7 -62.7 150.1 3331.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -286.933 117.833 -2.435 0.0153 *
## air.df$PitchDifference 10.387 20.779 0.500 0.6174
## air.df$WidthDifference 74.641 30.977 2.410 0.0164 *
## air.df$FlightDuration 80.992 6.754 11.992 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 506.1 on 454 degrees of freedom
## Multiple R-squared: 0.2538, Adjusted R-squared: 0.2489
## F-statistic: 51.48 on 3 and 454 DF, p-value: < 2.2e-16
T-test. Null hypothesis: there is no difference between the mean price of an economy class ticket and the mean price of a premium economy class ticket.
# Unpaired two-sample t-test with the equal-variance assumption, comparing
# the mean economy price with the mean premium-economy price.
# NOTE(review): both prices come from the same rows of air.df, so a paired
# test may be the more appropriate choice -- confirm before relying on this.
t.test(
  x = air.df$PriceEconomy,
  y = air.df$PricePremium,
  paired = FALSE,
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: air.df$PriceEconomy and air.df$PricePremium
## t = -6.8304, df = 914, p-value = 1.544e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -667.0699 -369.2926
## sample estimates:
## mean of x mean of y
## 1327.076 1845.258
The null hypothesis is rejected because the t-test gives a very low p-value (p = 1.544e-11, far below the 0.05 significance level).
Pearson’s Correlation Tests: between the price difference and the width difference
# Pearson correlation test between the price gap (premium minus economy)
# and the seat-width difference.
cor.test(
  x = (air.df$PricePremium - air.df$PriceEconomy),
  y = air.df$WidthDifference,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: (air.df$PricePremium - air.df$PriceEconomy) and air.df$WidthDifference
## t = 2.5291, df = 456, p-value = 0.01177
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02627012 0.20700978
## sample estimates:
## cor
## 0.1176138
Between the price difference and the pitch difference
# Pearson correlation test between the price gap (premium minus economy)
# and the seat-pitch difference.
cor.test(
  x = (air.df$PricePremium - air.df$PriceEconomy),
  y = air.df$PitchDifference,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: (air.df$PricePremium - air.df$PriceEconomy) and air.df$PitchDifference
## t = 2.7688, df = 456, p-value = 0.005855
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03739893 0.21764764
## sample estimates:
## cor
## 0.1285851
Regression Model
# Linear model of the relative price on every other column of air.df.
# SeatsTotal, PitchDifference and WidthDifference are excluded explicitly:
# with the plain `~ .` formula lm() reports them as "not defined because of
# singularities" (NA coefficients) because they are exact linear
# combinations of other predictors. Dropping them leaves the fitted values
# and the remaining coefficients unchanged and removes the NA rows from the
# summary.
# NOTE(review): PriceRelative is presumably derived from PriceEconomy and
# PricePremium; if so, keeping both as predictors inflates the fit -- confirm
# whether they should stay in the model.
model <- lm(
  PriceRelative ~ . - SeatsTotal - PitchDifference - WidthDifference,
  data = air.df
)
summary(model)
##
## Call:
## lm(formula = PriceRelative ~ ., data = air.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.76373 -0.08269 0.00438 0.08002 0.84672
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.993e-01 2.948e+00 -0.135 0.892302
## AirlineBritish -3.971e-01 1.107e-01 -3.586 0.000373 ***
## AirlineDelta -3.865e-01 2.203e-01 -1.755 0.080020 .
## AirlineJet -2.584e-01 9.594e-02 -2.693 0.007354 **
## AirlineSingapore -3.535e-01 1.297e-01 -2.725 0.006685 **
## AirlineVirgin -3.575e-01 2.031e-01 -1.761 0.078997 .
## AircraftBoeing 4.003e-02 2.968e-02 1.349 0.178089
## FlightDuration 2.613e-02 4.727e-03 5.526 5.63e-08 ***
## TravelMonthJul 2.111e-02 3.145e-02 0.671 0.502475
## TravelMonthOct 2.778e-02 2.670e-02 1.041 0.298619
## TravelMonthSep -6.617e-03 2.664e-02 -0.248 0.803924
## IsInternationalInternational 2.785e-02 2.502e-01 0.111 0.911400
## SeatsEconomy 8.090e-04 5.462e-04 1.481 0.139313
## SeatsPremium -7.374e-03 3.615e-03 -2.040 0.041967 *
## PitchEconomy -1.756e-02 7.994e-02 -0.220 0.826207
## PitchPremium 5.960e-02 9.165e-02 0.650 0.515823
## WidthEconomy -9.207e-02 5.266e-02 -1.748 0.081085 .
## WidthPremium 4.904e-02 1.365e-01 0.359 0.719527
## PriceEconomy -9.325e-04 3.318e-05 -28.105 < 2e-16 ***
## PricePremium 5.781e-04 2.294e-05 25.197 < 2e-16 ***
## SeatsTotal NA NA NA NA
## PitchDifference NA NA NA NA
## WidthDifference NA NA NA NA
## PercentPremiumSeats 1.114e-02 7.653e-03 1.456 0.146197
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2123 on 437 degrees of freedom
## Multiple R-squared: 0.7878, Adjusted R-squared: 0.7781
## F-statistic: 81.12 on 20 and 437 DF, p-value: < 2.2e-16
1. In international flights, the variation in prices is greater. 2. The pitch and width differences are larger in international flights. 3. The relative price and the number of premium class seats are higher in international flights. 4. The intercept b0 of the equation y = b0 + b1x1 + b2x2 + ... is estimated at -0.3993 (its p-value is 0.892302, so it is not significantly different from zero), where y is the relative premium price and the x are all the other variables.
The multiple R-squared value is 0.7878, so the model accounts for 78.78% of the variance in the relative premium price. The adjusted R-squared (0.7781) is only slightly lower; it is never higher than the multiple R-squared because it penalizes the number of predictors, and the small gap indicates that the fit is not merely an artifact of including many predictors.
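Since the p-value of the overall F-test (< 2.2e-16) is less than 0.05, the model as a whole is statistically significant, i.e. at least one predictor is related to the relative price.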