This is a mini project based on the study of Premium Economy Vs Economy Ticket Pricing by Airlines. The main research question that we are concerned with is “What factors explain the difference in price between an economy ticket and a premium-economy airline ticket?” Some other questions to be answered are: How Premium and Economy class ticket prices vary in domestic and International Flights respectively? Does the Relative Price increase or decrease with percentage of premium and economy seats? What are the other contributing factors?
setwd("D:/desktop/Data Analytics internship-sameer mathur/work/datasets")
air.df<-read.csv(paste("SixAirlinesDataV2.csv", sep=""))
View(air.df)
library(psych)
describe(air.df)
## vars n mean sd median trimmed mad min
## Airline* 1 458 3.01 1.65 2.00 2.89 1.48 1.00
## Aircraft* 2 458 1.67 0.47 2.00 1.71 0.00 1.00
## FlightDuration 3 458 7.58 3.54 7.79 7.57 4.81 1.25
## TravelMonth* 4 458 2.56 1.17 3.00 2.58 1.48 1.00
## IsInternational* 5 458 1.91 0.28 2.00 2.00 0.00 1.00
## SeatsEconomy 6 458 202.31 76.37 185.00 194.64 85.99 78.00
## SeatsPremium 7 458 33.65 13.26 36.00 33.35 11.86 8.00
## PitchEconomy 8 458 31.22 0.66 31.00 31.26 0.00 30.00
## PitchPremium 9 458 37.91 1.31 38.00 38.05 0.00 34.00
## WidthEconomy 10 458 17.84 0.56 18.00 17.81 0.00 17.00
## WidthPremium 11 458 19.47 1.10 19.00 19.53 0.00 17.00
## PriceEconomy 12 458 1327.08 988.27 1242.00 1244.40 1159.39 65.00
## PricePremium 13 458 1845.26 1288.14 1737.00 1799.05 1845.84 86.00
## PriceRelative 14 458 0.49 0.45 0.36 0.42 0.41 0.02
## SeatsTotal 15 458 235.96 85.29 227.00 228.73 90.44 98.00
## PitchDifference 16 458 6.69 1.76 7.00 6.76 0.00 2.00
## WidthDifference 17 458 1.63 1.19 1.00 1.53 0.00 0.00
## PercentPremiumSeats 18 458 14.65 4.84 13.21 14.31 2.68 4.71
## max range skew kurtosis se
## Airline* 6.00 5.00 0.61 -0.95 0.08
## Aircraft* 2.00 1.00 -0.72 -1.48 0.02
## FlightDuration 14.66 13.41 -0.07 -1.12 0.17
## TravelMonth* 4.00 3.00 -0.14 -1.46 0.05
## IsInternational* 2.00 1.00 -2.91 6.50 0.01
## SeatsEconomy 389.00 311.00 0.72 -0.36 3.57
## SeatsPremium 66.00 58.00 0.23 -0.46 0.62
## PitchEconomy 33.00 3.00 -0.03 -0.35 0.03
## PitchPremium 40.00 6.00 -1.51 3.52 0.06
## WidthEconomy 19.00 2.00 -0.04 -0.08 0.03
## WidthPremium 21.00 4.00 -0.08 -0.31 0.05
## PriceEconomy 3593.00 3528.00 0.51 -0.88 46.18
## PricePremium 7414.00 7328.00 0.50 0.43 60.19
## PriceRelative 1.89 1.87 1.17 0.72 0.02
## SeatsTotal 441.00 343.00 0.70 -0.53 3.99
## PitchDifference 10.00 8.00 -0.54 1.78 0.08
## WidthDifference 4.00 4.00 0.84 -0.53 0.06
## PercentPremiumSeats 24.69 19.98 0.71 0.28 0.23
#MEAN#
apply(air.df[,6:18], FUN=mean, MARGIN=2)
## SeatsEconomy SeatsPremium PitchEconomy
## 202.3122271 33.6484716 31.2183406
## PitchPremium WidthEconomy WidthPremium
## 37.9061135 17.8384279 19.4716157
## PriceEconomy PricePremium PriceRelative
## 1327.0764192 1845.2576419 0.4872052
## SeatsTotal PitchDifference WidthDifference
## 235.9606987 6.6877729 1.6331878
## PercentPremiumSeats
## 14.6454148
#STANDARD DEVIATION#
apply(air.df[,6:18], FUN=sd, MARGIN=2)
## SeatsEconomy SeatsPremium PitchEconomy
## 76.3735257 13.2614183 0.6551695
## PitchPremium WidthEconomy WidthPremium
## 1.3139238 0.5575102 1.0971726
## PriceEconomy PricePremium PriceRelative
## 988.2732729 1288.1355206 0.4505873
## SeatsTotal PitchDifference WidthDifference
## 85.2931525 1.7617077 1.1892807
## PercentPremiumSeats
## 4.8424513
par(mfrow=c(1,1))
pie(table(air.df$Airline),col=c("violet","blue","green","yellow","red","turquoise"),main="Airline split up")
par(mfrow=c(1,1))
pie(table(air.df$Aircraft),col=c("red","yellow"),main="Aircraft manufacturer split up")
par(mfrow=c(1,3))
hist(air.df$FlightDuration,xlab="Flight Duration(hrs)",ylab="Frequency",main="flight duration histogram", col=c("red","blue","green","yellow"), breaks=15, xlim=c(0,16), ylim=c(0,60))
boxplot(air.df$FlightDuration,main="Flight Duration Boxplot", xlab="Flight Duration(hrs)",ylab="Frequency")
barplot(air.df$FlightDuration,main = "Flight Duration Barplot", ylim=c(0,16), xlab="Flight Duration(hrs)",ylab="Frequency")
pie(table(air.df$TravelMonth),main="Analysing peak months",col=c("linen","magenta","moccasin","pink"))
par(mfrow=c(1,3))
hist(air.df$SeatsEconomy,xlab="Number of Economy Seats ",ylab="Frequency",main="Seats Economy", col=c("blue","green"), xlim=c(0,450),ylim=c(0,195))
boxplot(air.df$SeatsEconomy,main="Number of Economy Seats Boxplot", xlab="Number of Economy Seats ",ylab="Frequency")
barplot(air.df$SeatsEconomy,main = "Number of Economy Seats Barplot", ylim=c(0,400), xlab="Number of Economy Seats ",ylab="Frequency")
par(mfrow=c(1,3))
hist(air.df$SeatsPremium,xlab="Number of Premium Economy Seats ",ylab="Frequency",main="Seats PremiumEconomy", col=c("pink","magenta"),xlim=c(0,70),ylim=c(0,200))
boxplot(air.df$SeatsPremium,main="Number of PremiumEconomy Seats Boxplot")
barplot(air.df$SeatsPremium,main = "Number of PremiumEconomy Seats Barplot",ylim=c(0,70))
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
par(mfrow=c(1,1))
scatterplot(air.df$SeatsEconomy,air.df$PriceEconomy, main="scatterplot of pricing of no. of seats with price of economy class")
library(lattice)
scatterplot(air.df$SeatsPremium,air.df$PricePremium, main="scatterplot of pricing of no. of seats with price of premium economy class")
par(mfrow=c(2,1))
histogram(~PriceEconomy | IsInternational,data=air.df, col="maroon",main="Price of economy class tickets in international and domestic flights")
boxplot(air.df$PriceEconomy~air.df$IsInternational,ylab="Price Economy",xlab="Terminal",col=c("yellow2","green3"), main="Price of economy class tickets in international and domestic flights")
par(mfrow=c(2,1))
histogram(~PricePremium | IsInternational,data=air.df,col="yellow2", main="Price of premium economy class tickets in international and domestic flights")
boxplot(air.df$PricePremium~air.df$IsInternational,ylab="Price PremiumEconomy",xlab="Terminal",col=c("turquoise","blue"), main="Price of premium economy class tickets in international and domestic flights")
par(mfrow=c(1,2))
histogram(~PriceRelative | IsInternational, data=air.df,
type="count",
layout=c(2,1),
col=c("burlywood", "darkolivegreen"),main="relative pricing in international and domestic flights")
bwplot(air.df$PriceRelative~ air.df$IsInternational |air.df$Airline,ylab="PriceRelative",main="relative pricing in international and domestic flights in each arilines")
par(mfrow=c(2,1))
histogram(~PercentPremiumSeats | Airline + IsInternational, data=air.df,
layout=c(6,2),
col="orangered", main="percent premiumseats in international and domestic flights in each airlines")
barchart(~PercentPremiumSeats | Airline + IsInternational,ylab="PercentPremiumSeats",data=air.df,col="maroon",
layout=c(6,2), main="percent premiumseats in international and domestic flights in each airlines")
par(mfrow=c(2,1))
scatterplot(air.df$PricePremium,air.df$FlightDuration,main="premium class pricing with flight duration")
scatterplot(air.df$PriceEconomy,air.df$FlightDuration,main="economy class pricing with flight duration")
par(mfrow=c(1,2))
boxplot(air.df$PitchDifference~air.df$IsInternational,main="Pitch difference with terminal", xlab="Terminal", ylab="pitch difference")
barchart(PitchDifference ~ Airline, data=air.df,
groups=IsInternational, auto.key=TRUE,
par.settings = simpleTheme(col=c("gray95", "gray50")),main="Pitch difference with terminal in each airlines", xlab="airlines")
par(mfrow=c(1,2))
boxplot(air.df$WidthDifference~air.df$IsInternational,main="Width difference with terminal", xlab="Terminal", ylab="pitch difference")
barchart(WidthDifference ~ Airline, data=air.df,
groups=IsInternational, auto.key=TRUE,
par.settings = simpleTheme(col=c("gray95", "gray50")), main="Width difference with terminal in each airlines", xlab="airlines")
library(corrgram)
par(mfrow=c(1,1))
corrgram(air.df, order=TRUE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Corrgram of dataset")
From the corrgram, the pricing of tickets is highly correlated to the flight duration. And also, the price of premium economy tickets are highly correlated to picth and width and also on the number of seats. Same is true with economy seats also.
# Variance-Covariance Matrice
var(1:18,1:18)
## [1] 28.5
cov(1:18,1:18)
## [1] 28.5
Hypothesis: there is no difference between an economy class ticket and a premium economy class ticket.
t.test(air.df$PriceEconomy,air.df$PricePremium,var.equal = TRUE,paired = FALSE)
##
## Two Sample t-test
##
## data: air.df$PriceEconomy and air.df$PricePremium
## t = -6.8304, df = 914, p-value = 1.544e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -667.0699 -369.2926
## sample estimates:
## mean of x mean of y
## 1327.076 1845.258
The null hypothesis is rejected because the t-Test gives a very low p-value and there is a difference between economy class and premium economy class tickets.
cor.test((air.df$PricePremium-air.df$PriceEconomy),air.df$WidthDifference)
##
## Pearson's product-moment correlation
##
## data: (air.df$PricePremium - air.df$PriceEconomy) and air.df$WidthDifference
## t = 2.5291, df = 456, p-value = 0.01177
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02627012 0.20700978
## sample estimates:
## cor
## 0.1176138
cor.test((air.df$PricePremium-air.df$PriceEconomy),air.df$PitchDifference)
##
## Pearson's product-moment correlation
##
## data: (air.df$PricePremium - air.df$PriceEconomy) and air.df$PitchDifference
## t = 2.7688, df = 456, p-value = 0.005855
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03739893 0.21764764
## sample estimates:
## cor
## 0.1285851
cor.test((air.df$PricePremium-air.df$PriceEconomy),air.df$FlightDuration)
##
## Pearson's product-moment correlation
##
## data: (air.df$PricePremium - air.df$PriceEconomy) and air.df$FlightDuration
## t = 11.435, df = 456, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3976578 0.5403379
## sample estimates:
## cor
## 0.4720837
The 3 correlations tests yield or suggest that the difference in pricing of the 2 class of tickets depends strongly on the flightduration and also on the pitch and width difference.(p-value<0.05)
fit=lm((air.df$PricePremium-air.df$PriceEconomy) ~ air.df$PitchDifference+air.df$WidthDifference+air.df$FlightDuration)
summary(fit)
##
## Call:
## lm(formula = (air.df$PricePremium - air.df$PriceEconomy) ~ air.df$PitchDifference +
## air.df$WidthDifference + air.df$FlightDuration)
##
## Residuals:
## Min 1Q Median 3Q Max
## -859.4 -324.7 -62.7 150.1 3331.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -286.933 117.833 -2.435 0.0153 *
## air.df$PitchDifference 10.387 20.779 0.500 0.6174
## air.df$WidthDifference 74.641 30.977 2.410 0.0164 *
## air.df$FlightDuration 80.992 6.754 11.992 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 506.1 on 454 degrees of freedom
## Multiple R-squared: 0.2538, Adjusted R-squared: 0.2489
## F-statistic: 51.48 on 3 and 454 DF, p-value: < 2.2e-16
fit$coefficients
## (Intercept) air.df$PitchDifference air.df$WidthDifference
## -286.93258 10.38682 74.64098
## air.df$FlightDuration
## 80.99227
Hence, the p-values and the coefficients suggest that the model is a good fit and the regression is good and that we have to reject the null hypothesis in case of the 3 variables.