Read the data into R
# Load the airline fare data set from the working directory.
# paste() on a single string with sep = "" was a no-op, so the file name is
# passed directly. stringsAsFactors = TRUE keeps the text columns as factors
# on R >= 4.0 (where the read.csv default became FALSE), so summary() still
# reports per-level counts for them.
a.df <- read.csv("SixAirlinesDataV2.csv", stringsAsFactors = TRUE)
Summarize the data to understand the range, quartiles, median and mean of each numeric variable (and the level counts of each categorical variable)
# Print a per-column summary of the data frame loaded above.
summary(a.df)
## Airline Aircraft FlightDuration TravelMonth
## AirFrance: 74 AirBus:151 Min. : 1.250 Aug:127
## British :175 Boeing:307 1st Qu.: 4.260 Jul: 75
## Delta : 46 Median : 7.790 Oct:127
## Jet : 61 Mean : 7.578 Sep:129
## Singapore: 40 3rd Qu.:10.620
## Virgin : 62 Max. :14.660
## IsInternational SeatsEconomy SeatsPremium PitchEconomy
## Domestic : 40 Min. : 78.0 Min. : 8.00 Min. :30.00
## International:418 1st Qu.:133.0 1st Qu.:21.00 1st Qu.:31.00
## Median :185.0 Median :36.00 Median :31.00
## Mean :202.3 Mean :33.65 Mean :31.22
## 3rd Qu.:243.0 3rd Qu.:40.00 3rd Qu.:32.00
## Max. :389.0 Max. :66.00 Max. :33.00
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## Min. :34.00 Min. :17.00 Min. :17.00 Min. : 65
## 1st Qu.:38.00 1st Qu.:18.00 1st Qu.:19.00 1st Qu.: 413
## Median :38.00 Median :18.00 Median :19.00 Median :1242
## Mean :37.91 Mean :17.84 Mean :19.47 Mean :1327
## 3rd Qu.:38.00 3rd Qu.:18.00 3rd Qu.:21.00 3rd Qu.:1909
## Max. :40.00 Max. :19.00 Max. :21.00 Max. :3593
## PricePremium PriceRelative SeatsTotal PitchDifference
## Min. : 86.0 Min. :0.0200 Min. : 98 Min. : 2.000
## 1st Qu.: 528.8 1st Qu.:0.1000 1st Qu.:166 1st Qu.: 6.000
## Median :1737.0 Median :0.3650 Median :227 Median : 7.000
## Mean :1845.3 Mean :0.4872 Mean :236 Mean : 6.688
## 3rd Qu.:2989.0 3rd Qu.:0.7400 3rd Qu.:279 3rd Qu.: 7.000
## Max. :7414.0 Max. :1.8900 Max. :441 Max. :10.000
## WidthDifference PercentPremiumSeats
## Min. :0.000 Min. : 4.71
## 1st Qu.:1.000 1st Qu.:12.28
## Median :1.000 Median :13.21
## Mean :1.633 Mean :14.65
## 3rd Qu.:3.000 3rd Qu.:15.36
## Max. :4.000 Max. :24.69
Comparing Premium Economy Ticket Prices and Economy Ticket Prices.
# Scatter plot of Premium Economy fare (y) against Economy fare (x).
# Using data = with bare column names, instead of a.df$... inside a one-sided
# formula, keeps the same x/y assignment while giving readable axis labels.
plot(
  PricePremium ~ PriceEconomy,
  data = a.df,
  main = "Premium Economy Price vs. Economy Price",
  xlab = "Economy Price",
  ylab = "Premium Economy Price"
)
# Reference line y = x: points above it have a premium fare that exceeds
# the economy fare.
abline(a = 0, b = 1)
Analysing the difference between the pitch of Premium Economy seats and the pitch of Economy seats.
# lattice supplies histogram().
library(lattice)
# Distribution of PitchDifference over all rows of a.df.
histogram(
  ~ PitchDifference,
  data = a.df,
  main = "Distribution of Pitch Difference",
  xlab = "Difference in Pitch"
)
Analysing effect of Pitch Difference on the relative price of Economy and Premium Economy.
# Mean economy price, premium price and relative price for each distinct
# value of PitchDifference.
rel_pr <- aggregate(
  cbind(PriceEconomy, PricePremium, PriceRelative) ~ PitchDifference,
  data = a.df,
  FUN = mean
)
# car supplies scatterplot().
library(car)
# Mean relative price plotted against pitch difference.
with(
  rel_pr,
  scatterplot(
    PitchDifference,
    PriceRelative,
    main = "Relative Price Difference & Pitch",
    xlab = "Pitch Difference",
    ylab = "Relative Price b/w Economy and Premium Economy"
  )
)
## Warning in smoother(.x, .y, col = col[2], log.x = logged("x"), log.y =
## logged("y"), : could not fit positive part of the spread
Analysing effect of Pitch Difference on the price of Economy and Premium Economy.
# Box plot of relative price, one box per value of PitchDifference.
# In the formula PriceRelative ~ PitchDifference the response (relative price)
# is drawn on the y-axis and the grouping variable (pitch difference) on the
# x-axis, so the axis labels are assigned accordingly (they were swapped).
boxplot(
  PriceRelative ~ PitchDifference,
  data = a.df,
  main = "Relative Price Difference vs.Pitch",
  xlab = "Pitch Difference",
  ylab = "Relative Price b/w Economy and Premium Economy"
)
Comparing distribution of the difference in the width of Premium Economy seats and the width of Economy seats.
# lattice supplies histogram().
library(lattice)
# Distribution of WidthDifference over all rows of a.df.
histogram(
  ~ WidthDifference,
  data = a.df,
  main = "Distribution of Difference in Seat Width",
  xlab = "Difference in Seat Width"
)
Analysing effect of plane capacity
# Relative price against total seat count; type "p" draws the points and
# "g" adds a reference grid.
xyplot(
  PriceRelative ~ SeatsTotal,
  data = a.df,
  type = c("p", "g"),
  xlab = "Total Seats (Economy + Premium Economy Seats)",
  ylab = "Rel. Price Difference"
)
Analysing percentage of Premium Economy Seats
# Single box plot of the PercentPremiumSeats column.
boxplot(
  a.df[["PercentPremiumSeats"]],
  main = "Percentage of Premium Economy Seats",
  ylab = "Percentage of Premium Economy Seats in Plane"
)
Scatter plots to understand how the variables are correlated pairwise
# car supplies scatterplotMatrix().
library(car)
# Pairwise scatter plots of the two fares and the seat-related variables.
scatterplotMatrix(
  ~ PricePremium + PriceEconomy + SeatsTotal +
    PercentPremiumSeats + PitchDifference + WidthDifference,
  data = a.df,
  main = "Premium Economy vs. Economy Airfares"
)
Create a correlation matrix (with the p-value of each pairwise correlation)
library(Hmisc)
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Columns to correlate: both fares plus the pitch and width differences.
colairlines <- c(
  "PricePremium", "PriceEconomy",
  "PitchDifference", "WidthDifference"
)
# rcorr() needs a matrix; the result is printed as the correlation
# coefficients, the sample size and the matching p-values.
corMatrix <- rcorr(as.matrix(a.df[colairlines]))
corMatrix
## PricePremium PriceEconomy PitchDifference WidthDifference
## PricePremium 1.00 0.90 -0.02 -0.01
## PriceEconomy 0.90 1.00 -0.10 -0.08
## PitchDifference -0.02 -0.10 1.00 0.76
## WidthDifference -0.01 -0.08 0.76 1.00
##
## n= 458
##
##
## P
## PricePremium PriceEconomy PitchDifference WidthDifference
## PricePremium 0.0000 0.6998 0.8059
## PriceEconomy 0.0000 0.0332 0.0708
## PitchDifference 0.6998 0.0332 0.0000
## WidthDifference 0.8059 0.0708 0.0000
# Columns to correlate: both fares plus the capacity-related variables.
colairlines2 <- c(
  "PricePremium", "PriceEconomy",
  "SeatsTotal", "PercentPremiumSeats"
)
# rcorr() needs a matrix; the result is printed as the correlation
# coefficients, the sample size and the matching p-values.
corMatrix2 <- rcorr(as.matrix(a.df[colairlines2]))
corMatrix2
## PricePremium PriceEconomy SeatsTotal
## PricePremium 1.00 0.90 0.19
## PriceEconomy 0.90 1.00 0.13
## SeatsTotal 0.19 0.13 1.00
## PercentPremiumSeats 0.12 0.07 -0.22
## PercentPremiumSeats
## PricePremium 0.12
## PriceEconomy 0.07
## SeatsTotal -0.22
## PercentPremiumSeats 1.00
##
## n= 458
##
##
## P
## PricePremium PriceEconomy SeatsTotal
## PricePremium 0.0000 0.0000
## PriceEconomy 0.0000 0.0045
## SeatsTotal 0.0000 0.0045
## PercentPremiumSeats 0.0127 0.1628 0.0000
## PercentPremiumSeats
## PricePremium 0.0127
## PriceEconomy 0.1628
## SeatsTotal 0.0000
## PercentPremiumSeats
Draw a Corrgram
library(Hmisc)
library(car)
# corrgram supplies corrgram() and the panel.* functions used below.
library(corrgram)
# Note: this reassigns colairlines to the full set of six variables.
colairlines <- c(
  "PricePremium", "PriceEconomy",
  "PitchDifference", "WidthDifference",
  "SeatsTotal", "PercentPremiumSeats"
)
# Corrgram of the selected columns: scatter points in the lower triangle,
# pie glyphs in the upper triangle, min/max values on the diagonal.
corrgram(
  a.df[colairlines],
  order = TRUE,
  main = "Premium Economy vs. Economy Airfares",
  lower.panel = panel.pts,
  upper.panel = panel.pie,
  diag.panel = panel.minmax,
  text.panel = panel.txt
)
Run a t-test of the following null hypothesis: Premium Economy and Economy airfares are the same.
# Two-sample t-test comparing the mean of PricePremium with the mean of
# PriceEconomy, using t.test()'s default (unpaired) arguments.
# NOTE(review): both prices come from the same row of a.df, so a paired test
# (paired = TRUE) may be more appropriate; confirm before changing, since it
# would alter the reported statistics.
t.test(a.df$PricePremium, a.df$PriceEconomy)
##
## Welch Two Sample t-test
##
## data: a.df$PricePremium and a.df$PriceEconomy
## t = 6.8304, df = 856.56, p-value = 1.605e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 369.2793 667.0831
## sample estimates:
## mean of x mean of y
## 1845.258 1327.076
Since the p-value is less than 0.05, we reject the null hypothesis that they are equal: on average, Premium Economy airfares are significantly higher than Economy airfares.
In this model we try regressing Premium Economy price (PricePremium) on a selected set of the remaining columns.
# Model formula: premium fare explained by the economy fare, the seat
# comfort differences, the capacity variables and the flight attributes.
m <- PricePremium ~
  PriceEconomy +
  PitchDifference +
  WidthDifference +
  PercentPremiumSeats +
  SeatsTotal +
  IsInternational +
  TravelMonth +
  FlightDuration +
  Aircraft
# Ordinary least-squares fit on the full data set.
fit <- lm(m, data = a.df)
# Coefficient table, residual summary and goodness-of-fit statistics.
summary(fit)
##
## Call:
## lm(formula = m, data = a.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -977.2 -246.3 -47.9 135.2 3419.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.211e+03 1.755e+02 -6.898 1.82e-11 ***
## PriceEconomy 1.064e+00 3.114e-02 34.175 < 2e-16 ***
## PitchDifference 8.510e+01 3.913e+01 2.175 0.030163 *
## WidthDifference 1.240e+02 3.438e+01 3.607 0.000345 ***
## PercentPremiumSeats 3.177e+01 5.250e+00 6.052 3.04e-09 ***
## SeatsTotal 1.925e+00 3.360e-01 5.729 1.87e-08 ***
## IsInternationalInternational -7.537e+02 2.135e+02 -3.530 0.000458 ***
## TravelMonthJul -3.441e+01 7.074e+01 -0.486 0.626904
## TravelMonthOct 2.692e+01 6.036e+01 0.446 0.655795
## TravelMonthSep -2.097e+00 6.015e+01 -0.035 0.972203
## FlightDuration 8.455e+01 8.809e+00 9.598 < 2e-16 ***
## AircraftBoeing -2.082e+00 5.651e+01 -0.037 0.970625
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 480.7 on 446 degrees of freedom
## Multiple R-squared: 0.8641, Adjusted R-squared: 0.8607
## F-statistic: 257.7 on 11 and 446 DF, p-value: < 2.2e-16