# Load the airline pricing data set into `airline.df`.
# stringsAsFactors = TRUE is spelled out because R >= 4.0.0 no longer converts
# text columns to factors by default; the factor-style counts in the summary
# below and the later plots of categorical columns rely on factors.
airline.df <- read.csv("SixAirlinesDataV2.csv", stringsAsFactors = TRUE)
# View() opens a data viewer window, so only call it in an interactive session.
if (interactive()) {
  View(airline.df)
}
# Per-column summary (recorded output follows).
summary(airline.df)
## Airline Aircraft FlightDuration TravelMonth
## AirFrance: 74 AirBus:151 Min. : 1.250 Aug:127
## British :175 Boeing:307 1st Qu.: 4.260 Jul: 75
## Delta : 46 Median : 7.790 Oct:127
## Jet : 61 Mean : 7.578 Sep:129
## Singapore: 40 3rd Qu.:10.620
## Virgin : 62 Max. :14.660
## IsInternational SeatsEconomy SeatsPremium PitchEconomy
## Domestic : 40 Min. : 78.0 Min. : 8.00 Min. :30.00
## International:418 1st Qu.:133.0 1st Qu.:21.00 1st Qu.:31.00
## Median :185.0 Median :36.00 Median :31.00
## Mean :202.3 Mean :33.65 Mean :31.22
## 3rd Qu.:243.0 3rd Qu.:40.00 3rd Qu.:32.00
## Max. :389.0 Max. :66.00 Max. :33.00
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## Min. :34.00 Min. :17.00 Min. :17.00 Min. : 65
## 1st Qu.:38.00 1st Qu.:18.00 1st Qu.:19.00 1st Qu.: 413
## Median :38.00 Median :18.00 Median :19.00 Median :1242
## Mean :37.91 Mean :17.84 Mean :19.47 Mean :1327
## 3rd Qu.:38.00 3rd Qu.:18.00 3rd Qu.:21.00 3rd Qu.:1909
## Max. :40.00 Max. :19.00 Max. :21.00 Max. :3593
## PricePremium PriceRelative SeatsTotal PitchDifference
## Min. : 86.0 Min. :0.0200 Min. : 98 Min. : 2.000
## 1st Qu.: 528.8 1st Qu.:0.1000 1st Qu.:166 1st Qu.: 6.000
## Median :1737.0 Median :0.3650 Median :227 Median : 7.000
## Mean :1845.3 Mean :0.4872 Mean :236 Mean : 6.688
## 3rd Qu.:2989.0 3rd Qu.:0.7400 3rd Qu.:279 3rd Qu.: 7.000
## Max. :7414.0 Max. :1.8900 Max. :441 Max. :10.000
## WidthDifference PercentPremiumSeats
## Min. :0.000 Min. : 4.71
## 1st Qu.:1.000 1st Qu.:12.28
## Median :1.000 Median :13.21
## Mean :1.633 Mean :14.65
## 3rd Qu.:3.000 3rd Qu.:15.36
## Max. :4.000 Max. :24.69
# Count the rows belonging to each airline, print the counts and draw them
# as a bar chart.
air <- table(airline.df[["Airline"]])
print(air)
##
## AirFrance British Delta Jet Singapore Virgin
## 74 175 46 61 40 62
barplot(height = air)
# Count the rows recorded for each travel month and print the counts.
mth <- table(airline.df$TravelMonth)
mth
##
## Aug Jul Oct Sep
## 127 75 127 129
# Bar chart of the same counts. The table is drawn directly (rather than
# calling plot() on the column) so this also works when TravelMonth was read
# as character instead of factor. The bars are reordered into calendar order
# via month.abb; names not found in month.abb sort last.
barplot(
  mth[order(match(names(mth), month.abb))],
  col = "lightblue",
  main = "Frequency of flights in travel months"
)
# Flight duration shown two ways in a 1 x 2 panel layout:
# a histogram (left) and a box plot (right).
par(mfrow = c(1, 2))
hist(
  airline.df$FlightDuration,
  breaks = 8,
  main = "Flight Duration",
  xlab = "Flight Duration",
  col = "lightblue"
)
boxplot(
  airline.df$FlightDuration,
  main = "Flight Duration",
  col = "lightblue"
)
## Plot: number of domestic vs international flights.
# barplot(table(...)) draws the same bar chart that plot() produces for a
# factor, but also works when IsInternational was read as a character column
# (the read.csv default since R 4.0.0), where plot() would fail.
barplot(table(airline.df$IsInternational), ylim = c(0, 450))
# Side-by-side histograms of seat pitch: economy (left) and premium (right).
par(mfrow = c(1, 2))
hist(
  airline.df$PitchEconomy,
  main = "pitchEconomy",
  xlab = "Pitch Economy",
  col = "lightblue"
)
hist(
  airline.df$PitchPremium,
  main = "PitchPremium",
  ylim = c(0, 450),
  col = "orange"
)
# Horizontal box plots of the economy-class price, one box per airline.
# The default y axis is suppressed (yaxt = "n") and replaced below with
# short hand-written airline labels at positions 1 to 6.
boxplot(
  PriceEconomy ~ Airline,
  data = airline.df,
  horizontal = TRUE,
  yaxt = "n",
  col = c("red", "purple", "yellow", "orange", "lightblue", "green"),
  main = "Airline-wise Economy class-price distribution",
  xlab = "Price in USD"
)
axis(
  side = 2,
  at = 1:6,
  labels = c("Aif", "Bts", "Delt", "Jet", "Sigp", "Vg")
)
## Seat width: economy class (rows) cross-tabulated against premium class (columns).
wd <- table(airline.df[["WidthEconomy"]], airline.df[["WidthPremium"]])
print(wd)
##
## 17 18 19 20 21
## 17 28 0 32 0 54
## 18 0 12 224 0 68
## 19 0 0 0 40 0
# Side-by-side histograms of seat width: economy (left) and premium (right).
# Both panels share the same axis limits so they can be compared directly.
# The x range extends to 22 because premium widths reach 21; the previous
# upper limit of 20 cut those observations out of the premium panel.
# The titles name the plotted quantity (seat width, not the WidthDifference
# column).
par(mfrow = c(1, 2))
hist(
  airline.df$WidthEconomy,
  col = "lightblue",
  main = "Seat Width in Economy Class",
  xlim = c(15, 22),
  ylim = c(0, 300)
)
hist(
  airline.df$WidthPremium,
  col = "orange",
  main = "Seat Width in Premium Class",
  xlim = c(15, 22),
  ylim = c(0, 300)
)
## Pitch difference (columns) for domestic vs international flights (rows).
print(
  table(airline.df[["IsInternational"]], airline.df[["PitchDifference"]])
)
##
## 2 3 6 7 10
## Domestic 24 16 0 0 0
## International 0 0 121 243 54
# Side-by-side histograms of seat pitch: economy (left) and premium (right).
# The titles name the plotted quantity: these are the raw PitchEconomy and
# PitchPremium columns, not the PitchDifference column.
par(mfrow = c(1, 2))
hist(
  airline.df$PitchEconomy,
  col = "lightblue",
  main = "Seat Pitch in Economy Class"
)
hist(
  airline.df$PitchPremium,
  col = "orange",
  main = "Seat Pitch in Premium Class"
)
# Horizontal box plots of the premium-economy price, one box per airline.
# The title names the premium class: this plot shows PricePremium, whereas
# the earlier, otherwise identical plot shows PriceEconomy.
# The default y axis is suppressed (yaxt = "n") and replaced below with
# short hand-written airline labels at positions 1 to 6.
boxplot(
  PricePremium ~ Airline,
  data = airline.df,
  horizontal = TRUE,
  col = c("red", "purple", "yellow", "orange", "lightblue", "green"),
  yaxt = "n",
  xlab = "Price in USD",
  main = "Airline-wise Premium Economy class-price distribution"
)
axis(side = 2, at = c(1:6), labels = c("Aif", "Bts", "Delt", "Jet", "Sigp", "Vg"))
# Scatter plots of the relative price difference against flight duration,
# width difference and pitch difference, followed by a scatter plot matrix of
# prices, flight duration and seat counts.
# car is attached once here; a single library() call serves all four plots.
library(car)
scatterplot(
  PriceRelative ~ FlightDuration,
  data = airline.df,
  main = " Relative price difference vs Flight Duration",
  pch = 19,
  cex = 0.9
)
scatterplot(
  PriceRelative ~ WidthDifference,
  data = airline.df,
  main = "RELATIVE PRICE DIFFERECE VS WIDTH DIFFERENCE",
  pch = 19,
  cex = 0.9
)
scatterplot(
  PriceRelative ~ PitchDifference,
  data = airline.df,
  main = "RELATIVE PRICE DIFFERECE VS PITCH DIFFERENCE"
)
scatterplotMatrix(
  airline.df[c(
    "PriceEconomy", "FlightDuration", "SeatsEconomy",
    "SeatsPremium", "PricePremium"
  )],
  main = "SCATTERPLOT MATRIX"
)
Observations (signs checked against the correlation matrix below): 1. PriceEconomy is positively correlated with FlightDuration (0.57) and, weakly, with SeatsEconomy (0.13). 2. PriceEconomy is weakly positively correlated with SeatsPremium (0.11). 3. PricePremium is positively correlated with FlightDuration (0.65) and, weakly, with SeatsEconomy (0.18). 4. PricePremium is strongly positively correlated with PriceEconomy (0.90).
# Scatter plot matrix of the two prices against the pitch and width
# differences between the classes.
library(car)
scatterplotMatrix(
  airline.df[c(
    "PriceEconomy", "PitchDifference", "WidthDifference", "PricePremium"
  )],
  main = "SCATTERPLOT MATRIX"
)
# Correlation matrix of every numeric column, rounded to two decimals.
round(
  cor(airline.df[vapply(airline.df, is.numeric, logical(1))]),
  digits = 2
)
## FlightDuration SeatsEconomy SeatsPremium PitchEconomy
## FlightDuration 1.00 0.20 0.16 0.29
## SeatsEconomy 0.20 1.00 0.63 0.14
## SeatsPremium 0.16 0.63 1.00 -0.03
## PitchEconomy 0.29 0.14 -0.03 1.00
## PitchPremium 0.10 0.12 0.00 -0.55
## WidthEconomy 0.46 0.37 0.46 0.29
## WidthPremium 0.10 0.10 0.00 -0.54
## PriceEconomy 0.57 0.13 0.11 0.37
## PricePremium 0.65 0.18 0.22 0.23
## PriceRelative 0.12 0.00 -0.10 -0.42
## SeatsTotal 0.20 0.99 0.72 0.12
## PitchDifference -0.04 0.04 0.02 -0.78
## WidthDifference -0.12 -0.08 -0.22 -0.64
## PercentPremiumSeats 0.06 -0.33 0.49 -0.10
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## FlightDuration 0.10 0.46 0.10 0.57
## SeatsEconomy 0.12 0.37 0.10 0.13
## SeatsPremium 0.00 0.46 0.00 0.11
## PitchEconomy -0.55 0.29 -0.54 0.37
## PitchPremium 1.00 -0.02 0.75 0.05
## WidthEconomy -0.02 1.00 0.08 0.07
## WidthPremium 0.75 0.08 1.00 -0.06
## PriceEconomy 0.05 0.07 -0.06 1.00
## PricePremium 0.09 0.15 0.06 0.90
## PriceRelative 0.42 -0.04 0.50 -0.29
## SeatsTotal 0.11 0.41 0.09 0.13
## PitchDifference 0.95 -0.13 0.76 -0.10
## WidthDifference 0.70 -0.39 0.88 -0.08
## PercentPremiumSeats -0.18 0.23 -0.18 0.07
## PricePremium PriceRelative SeatsTotal PitchDifference
## FlightDuration 0.65 0.12 0.20 -0.04
## SeatsEconomy 0.18 0.00 0.99 0.04
## SeatsPremium 0.22 -0.10 0.72 0.02
## PitchEconomy 0.23 -0.42 0.12 -0.78
## PitchPremium 0.09 0.42 0.11 0.95
## WidthEconomy 0.15 -0.04 0.41 -0.13
## WidthPremium 0.06 0.50 0.09 0.76
## PriceEconomy 0.90 -0.29 0.13 -0.10
## PricePremium 1.00 0.03 0.19 -0.02
## PriceRelative 0.03 1.00 -0.01 0.47
## SeatsTotal 0.19 -0.01 1.00 0.03
## PitchDifference -0.02 0.47 0.03 1.00
## WidthDifference -0.01 0.49 -0.11 0.76
## PercentPremiumSeats 0.12 -0.16 -0.22 -0.09
## WidthDifference PercentPremiumSeats
## FlightDuration -0.12 0.06
## SeatsEconomy -0.08 -0.33
## SeatsPremium -0.22 0.49
## PitchEconomy -0.64 -0.10
## PitchPremium 0.70 -0.18
## WidthEconomy -0.39 0.23
## WidthPremium 0.88 -0.18
## PriceEconomy -0.08 0.07
## PricePremium -0.01 0.12
## PriceRelative 0.49 -0.16
## SeatsTotal -0.11 -0.22
## PitchDifference 0.76 -0.09
## WidthDifference 1.00 -0.28
## PercentPremiumSeats -0.28 1.00
# Return to a single-panel layout, then draw a corrgram of the data set with
# pie glyphs in the upper triangle.
# The title names the airline data (the old "store variables" wording did not
# match this data set).
par(mfrow = c(1, 1))
library(corrgram)
corrgram(
  airline.df,
  upper.panel = panel.pie,
  main = "Corrgram of airline variables"
)
Null hypothesis: There is no difference between the mean price of an economy class ticket and the mean price of a premium economy class ticket.
# Two-sample t-test comparing mean economy and premium-economy prices,
# treating the samples as independent with equal variances.
# NOTE(review): both prices appear to come from the same rows of airline.df,
# so a paired test may be more appropriate - confirm before relying on this.
t.test(
  x = airline.df$PriceEconomy,
  y = airline.df$PricePremium,
  paired = FALSE,
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: airline.df$PriceEconomy and airline.df$PricePremium
## t = -6.8304, df = 914, p-value = 1.544e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -667.0699 -369.2926
## sample estimates:
## mean of x mean of y
## 1327.076 1845.258
The null hypothesis is rejected because the t-test gives a very low p-value (p < 0.05): there is a significant difference between the mean prices of economy class and premium economy class tickets.
1.Check the correlation Between PriceRelative and PitchDifference
# Pearson correlation test between relative price and pitch difference.
cor.test(
  x = airline.df$PriceRelative,
  y = airline.df$PitchDifference
)
##
## Pearson's product-moment correlation
##
## data: airline.df$PriceRelative and airline.df$PitchDifference
## t = 11.331, df = 456, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3940262 0.5372817
## sample estimates:
## cor
## 0.4687302
As the p-value is below 0.05, we can conclude that there is a significant correlation between the relative price of the two classes and the pitch difference.
2.Check the Correlation between FlightDuration And Relative price of two classes
# Pearson correlation test between relative price and flight duration.
cor.test(
  x = airline.df$PriceRelative,
  y = airline.df$FlightDuration,
  method = "pearson"
)
##
## Pearson's product-moment correlation
##
## data: airline.df$PriceRelative and airline.df$FlightDuration
## t = 2.6046, df = 456, p-value = 0.009498
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02977856 0.21036806
## sample estimates:
## cor
## 0.121075
As the p-value is below 0.05, we can conclude that there is a statistically significant, though weak (r = 0.12), correlation between FlightDuration and the relative price of the two classes.
3. Check the correlation between the relative price and SeatsPremium
# Pearson correlation test between relative price and premium seat count.
cor.test(
  x = airline.df$PriceRelative,
  y = airline.df$SeatsPremium
)
##
## Pearson's product-moment correlation
##
## data: airline.df$PriceRelative and airline.df$SeatsPremium
## t = -2.0854, df = 456, p-value = 0.03759
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.18715605 -0.00561924
## sample estimates:
## cor
## -0.09719601
The p-value is below 0.05; hence there is a statistically significant, though weak and negative (r = -0.10), correlation between the relative price of the two classes and SeatsPremium.
4.Check The Correlation between RelativePrice and WidthDifference.
# Pearson correlation test between relative price and width difference.
cor.test(
  x = airline.df$PriceRelative,
  y = airline.df$WidthDifference
)
##
## Pearson's product-moment correlation
##
## data: airline.df$PriceRelative and airline.df$WidthDifference
## t = 11.869, df = 456, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4125388 0.5528218
## sample estimates:
## cor
## 0.4858024
The p-value is below 0.05; there is a significant correlation between the relative price of the two classes and the width difference.
# Welch two-sample t-test (unequal variances, the t.test default) of the
# relative price for domestic vs international flights.
t.test(
  PriceRelative ~ IsInternational,
  data = airline.df,
  var.equal = FALSE
)
##
## Welch Two Sample t-test
##
## data: PriceRelative by IsInternational
## t = -19.451, df = 446.12, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.4855215 -0.3964139
## sample estimates:
## mean in group Domestic mean in group International
## 0.0847500 0.5257177
The p-value is below 0.05; hence the mean relative price differs significantly between international flights (0.526) and domestic flights (0.085).
Consider the following Regression Equation:
y = B0 + B1(X1) + B2(X2) + B3(X3) + B4(X4), where y is the dependent variable, X1, X2, X3 and X4 are the independent variables, and B0, B1, B2, B3 and B4 are the beta coefficients. Here: PriceRelative = B0 + B1(FlightDuration) + B2(IsInternational) + B3(PitchDifference) + B4(WidthDifference)
# Linear regression of relative price on flight duration, the international
# flag, and the pitch and width differences; then print the model summary.
fit <- lm(
  formula = PriceRelative ~ FlightDuration + IsInternational +
    PitchDifference + WidthDifference,
  data = airline.df
)
summary(fit)
##
## Call:
## lm(formula = PriceRelative ~ FlightDuration + IsInternational +
## PitchDifference + WidthDifference, data = airline.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.82785 -0.24889 -0.06653 0.13341 1.30701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.415845 0.092414 -4.500 8.66e-06 ***
## FlightDuration 0.040569 0.006253 6.488 2.29e-10 ***
## IsInternationalInternational -0.627755 0.125854 -4.988 8.71e-07 ***
## PitchDifference 0.152867 0.024902 6.139 1.82e-09 ***
## WidthDifference 0.089529 0.024166 3.705 0.000238 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3719 on 453 degrees of freedom
## Multiple R-squared: 0.3246, Adjusted R-squared: 0.3187
## F-statistic: 54.44 on 4 and 453 DF, p-value: < 2.2e-16
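Hence, all four predictors have p-values below 0.05, so we reject the null hypothesis of a zero coefficient for each of them. The regression as a whole is significant (F-test p-value < 2.2e-16), although it explains only about 32% of the variance in the relative price (R-squared = 0.3246).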
# Print the estimated coefficients of the first model.
coef(fit)
## (Intercept) FlightDuration
## -0.41584519 0.04056873
## IsInternationalInternational PitchDifference
## -0.62775507 0.15286694
## WidthDifference
## 0.08952863
# Recode the two seat-difference columns and the international flag as
# factors, so the model fitted next estimates one coefficient per level
# instead of a single slope.
airline.df[["PitchDifference"]] <- factor(airline.df[["PitchDifference"]])
airline.df[["WidthDifference"]] <- factor(airline.df[["WidthDifference"]])
airline.df[["IsInternational"]] <- factor(airline.df[["IsInternational"]])
# Row counts per level of each converted factor (recorded output follows
# each call). The PitchDifference table is printed a second time at the end.
print(table(airline.df[["PitchDifference"]]))
##
## 2 3 6 7 10
## 24 16 121 243 54
print(table(airline.df[["WidthDifference"]]))
##
## 0 1 2 3 4
## 40 264 32 68 54
print(table(airline.df[["IsInternational"]]))
##
## Domestic International
## 40 418
print(table(airline.df[["PitchDifference"]]))
##
## 2 3 6 7 10
## 24 16 121 243 54
# Refit the same regression formula now that PitchDifference, WidthDifference
# and IsInternational are factors, then print the model summary.
newmodel <- lm(
  formula = PriceRelative ~ FlightDuration + IsInternational +
    PitchDifference + WidthDifference,
  data = airline.df
)
summary(newmodel)
##
## Call:
## lm(formula = PriceRelative ~ FlightDuration + IsInternational +
## PitchDifference + WidthDifference, data = airline.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.81588 -0.24663 -0.05383 0.11960 1.48499
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.038819 0.077923 -0.498 0.618604
## FlightDuration 0.038630 0.006286 6.146 1.76e-09 ***
## IsInternationalInternational 0.875034 0.090368 9.683 < 2e-16 ***
## PitchDifference3 -0.009391 0.118873 -0.079 0.937071
## PitchDifference6 -0.593447 0.095382 -6.222 1.13e-09 ***
## PitchDifference7 -0.464184 0.076211 -6.091 2.41e-09 ***
## PitchDifference10 NA NA NA NA
## WidthDifference1 -0.237927 0.053327 -4.462 1.03e-05 ***
## WidthDifference2 -0.354692 0.093457 -3.795 0.000168 ***
## WidthDifference3 NA NA NA NA
## WidthDifference4 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3683 on 450 degrees of freedom
## Multiple R-squared: 0.3421, Adjusted R-squared: 0.3319
## F-statistic: 33.43 on 7 and 450 DF, p-value: < 2.2e-16
# Print the estimated coefficients of the refitted model (NA entries included).
coef(newmodel)
## (Intercept) FlightDuration
## -0.038819094 0.038630251
## IsInternationalInternational PitchDifference3
## 0.875033562 -0.009390536
## PitchDifference6 PitchDifference7
## -0.593447129 -0.464184176
## PitchDifference10 WidthDifference1
## NA -0.237926955
## WidthDifference2 WidthDifference3
## -0.354691979 NA
## WidthDifference4
## NA
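-> The coefficient of the IsInternational variable changes markedly (from -0.63 to 0.88) after the difference variables are converted into factors. -> The adjusted R-squared rises slightly (from 0.3187 to 0.3319), so this is the better-fitting model of the two. Note, however, that three coefficients are NA because of singularities, so the individual coefficients should be interpreted with care.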