setwd("C:/Users/Jaya/Downloads")
airlines.df <- read.csv(paste("SixAirlinesDataV2.csv", sep=""))
library(psych)
View(airlines.df)
summary(airlines.df)
## Airline Aircraft FlightDuration TravelMonth
## AirFrance: 74 AirBus:151 Min. : 1.250 Aug:127
## British :175 Boeing:307 1st Qu.: 4.260 Jul: 75
## Delta : 46 Median : 7.790 Oct:127
## Jet : 61 Mean : 7.578 Sep:129
## Singapore: 40 3rd Qu.:10.620
## Virgin : 62 Max. :14.660
## IsInternational SeatsEconomy SeatsPremium PitchEconomy
## Domestic : 40 Min. : 78.0 Min. : 8.00 Min. :30.00
## International:418 1st Qu.:133.0 1st Qu.:21.00 1st Qu.:31.00
## Median :185.0 Median :36.00 Median :31.00
## Mean :202.3 Mean :33.65 Mean :31.22
## 3rd Qu.:243.0 3rd Qu.:40.00 3rd Qu.:32.00
## Max. :389.0 Max. :66.00 Max. :33.00
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## Min. :34.00 Min. :17.00 Min. :17.00 Min. : 65
## 1st Qu.:38.00 1st Qu.:18.00 1st Qu.:19.00 1st Qu.: 413
## Median :38.00 Median :18.00 Median :19.00 Median :1242
## Mean :37.91 Mean :17.84 Mean :19.47 Mean :1327
## 3rd Qu.:38.00 3rd Qu.:18.00 3rd Qu.:21.00 3rd Qu.:1909
## Max. :40.00 Max. :19.00 Max. :21.00 Max. :3593
## PricePremium PriceRelative SeatsTotal PitchDifference
## Min. : 86.0 Min. :0.0200 Min. : 98 Min. : 2.000
## 1st Qu.: 528.8 1st Qu.:0.1000 1st Qu.:166 1st Qu.: 6.000
## Median :1737.0 Median :0.3650 Median :227 Median : 7.000
## Mean :1845.3 Mean :0.4872 Mean :236 Mean : 6.688
## 3rd Qu.:2989.0 3rd Qu.:0.7400 3rd Qu.:279 3rd Qu.: 7.000
## Max. :7414.0 Max. :1.8900 Max. :441 Max. :10.000
## WidthDifference PercentPremiumSeats
## Min. :0.000 Min. : 4.71
## 1st Qu.:1.000 1st Qu.:12.28
## Median :1.000 Median :13.21
## Mean :1.633 Mean :14.65
## 3rd Qu.:3.000 3rd Qu.:15.36
## Max. :4.000 Max. :24.69
str(airlines.df)
## 'data.frame': 458 obs. of 18 variables:
## $ Airline : Factor w/ 6 levels "AirFrance","British",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Aircraft : Factor w/ 2 levels "AirBus","Boeing": 2 2 2 2 2 2 2 2 2 2 ...
## $ FlightDuration : num 12.25 12.25 12.25 12.25 8.16 ...
## $ TravelMonth : Factor w/ 4 levels "Aug","Jul","Oct",..: 2 1 4 3 1 4 3 1 4 4 ...
## $ IsInternational : Factor w/ 2 levels "Domestic","International": 2 2 2 2 2 2 2 2 2 2 ...
## $ SeatsEconomy : int 122 122 122 122 122 122 122 122 122 122 ...
## $ SeatsPremium : int 40 40 40 40 40 40 40 40 40 40 ...
## $ PitchEconomy : int 31 31 31 31 31 31 31 31 31 31 ...
## $ PitchPremium : int 38 38 38 38 38 38 38 38 38 38 ...
## $ WidthEconomy : int 18 18 18 18 18 18 18 18 18 18 ...
## $ WidthPremium : int 19 19 19 19 19 19 19 19 19 19 ...
## $ PriceEconomy : int 2707 2707 2707 2707 1793 1793 1793 1476 1476 1705 ...
## $ PricePremium : int 3725 3725 3725 3725 2999 2999 2999 2997 2997 2989 ...
## $ PriceRelative : num 0.38 0.38 0.38 0.38 0.67 0.67 0.67 1.03 1.03 0.75 ...
## $ SeatsTotal : int 162 162 162 162 162 162 162 162 162 162 ...
## $ PitchDifference : int 7 7 7 7 7 7 7 7 7 7 ...
## $ WidthDifference : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PercentPremiumSeats: num 24.7 24.7 24.7 24.7 24.7 ...
library(psych)
describe(airlines.df)
## vars n mean sd median trimmed mad min
## Airline* 1 458 3.01 1.65 2.00 2.89 1.48 1.00
## Aircraft* 2 458 1.67 0.47 2.00 1.71 0.00 1.00
## FlightDuration 3 458 7.58 3.54 7.79 7.57 4.81 1.25
## TravelMonth* 4 458 2.56 1.17 3.00 2.58 1.48 1.00
## IsInternational* 5 458 1.91 0.28 2.00 2.00 0.00 1.00
## SeatsEconomy 6 458 202.31 76.37 185.00 194.64 85.99 78.00
## SeatsPremium 7 458 33.65 13.26 36.00 33.35 11.86 8.00
## PitchEconomy 8 458 31.22 0.66 31.00 31.26 0.00 30.00
## PitchPremium 9 458 37.91 1.31 38.00 38.05 0.00 34.00
## WidthEconomy 10 458 17.84 0.56 18.00 17.81 0.00 17.00
## WidthPremium 11 458 19.47 1.10 19.00 19.53 0.00 17.00
## PriceEconomy 12 458 1327.08 988.27 1242.00 1244.40 1159.39 65.00
## PricePremium 13 458 1845.26 1288.14 1737.00 1799.05 1845.84 86.00
## PriceRelative 14 458 0.49 0.45 0.36 0.42 0.41 0.02
## SeatsTotal 15 458 235.96 85.29 227.00 228.73 90.44 98.00
## PitchDifference 16 458 6.69 1.76 7.00 6.76 0.00 2.00
## WidthDifference 17 458 1.63 1.19 1.00 1.53 0.00 0.00
## PercentPremiumSeats 18 458 14.65 4.84 13.21 14.31 2.68 4.71
## max range skew kurtosis se
## Airline* 6.00 5.00 0.61 -0.95 0.08
## Aircraft* 2.00 1.00 -0.72 -1.48 0.02
## FlightDuration 14.66 13.41 -0.07 -1.12 0.17
## TravelMonth* 4.00 3.00 -0.14 -1.46 0.05
## IsInternational* 2.00 1.00 -2.91 6.50 0.01
## SeatsEconomy 389.00 311.00 0.72 -0.36 3.57
## SeatsPremium 66.00 58.00 0.23 -0.46 0.62
## PitchEconomy 33.00 3.00 -0.03 -0.35 0.03
## PitchPremium 40.00 6.00 -1.51 3.52 0.06
## WidthEconomy 19.00 2.00 -0.04 -0.08 0.03
## WidthPremium 21.00 4.00 -0.08 -0.31 0.05
## PriceEconomy 3593.00 3528.00 0.51 -0.88 46.18
## PricePremium 7414.00 7328.00 0.50 0.43 60.19
## PriceRelative 1.89 1.87 1.17 0.72 0.02
## SeatsTotal 441.00 343.00 0.70 -0.53 3.99
## PitchDifference 10.00 8.00 -0.54 1.78 0.08
## WidthDifference 4.00 4.00 0.84 -0.53 0.06
## PercentPremiumSeats 24.69 19.98 0.71 0.28 0.23
=> Airline
library(lattice)
boxplot(airlines.df$PriceRelative ~airlines.df$Airline ,
xlab = "airline")
correlation between Airline and Relative Price.
=> Aircraft
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
scatterplot(airlines.df$PriceRelative~airlines.df$Aircraft,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.Aircraft ",
xlab="Aircraft",
ylab="Price Relative")
## [1] "212" "308" "156" "157" "158" "159" "379" "380" "381" "382"
So we can say that the model of aircraft And relative price have a weak correlation.
=> Flight Duration
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$FlightDuration,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.Flight Duration ",
xlab="Flight Duration",
ylab="Price Relative")
correlation between Relative Price and FlightDuration
=> Travel Month
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$TravelMonth,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.PitchPremium ",
xlab="Month of Travel",
ylab="Price Relative")
## [1] "157" "160" "379" "440" "159" "162" "381" "158" "161" "380" "406"
Month of travel does not have any correlation with Relative Price.
=> IsInternational
library(lattice)
boxplot(airlines.df$PriceRelative ~airlines.df$IsInternational ,
ylab="Relative Price")
correlation between IsInternational and Relative Price.
=> Pitch
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$PitchEconomy,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.PitchEconomy ",
xlab="PitchEconomy",
ylab="Price Relative")
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$PitchPremium,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.PitchPremium ",
xlab="PitchPremium",
ylab="Price Relative")
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$PitchDifference,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.PitchDifference ",
xlab="PitchDifference",
ylab="Price Relative")
Therefore Pitch Economy,Pitch Premium and pitch Diff have a correlation with Relative price.
=> Width
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$WidthEconomy,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.WidthEconomy ",
xlab="WidthEconomy",
ylab="Price Relative")
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$WidthPremium,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.WidthPremium ",
xlab="WidthPremium",
ylab="Price Relative")
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$WidthDifference,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.WidthDifference ",
xlab="WidthDifference",
ylab="Price Relative")
Therefore WidthEconomy does not have a statistically significant correlation Whereas,WidthPremium and WidthDifference has a statistically significant correlation.
=> Number of Seats
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$SeatsEconomy,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.SeatsEconomy ",
xlab="SeatsEconomy",
ylab="Price Relative")
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$SeatsPremium,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.SeatsPremium ",
xlab="SeatsPremium",
ylab="Price Relative")
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$SeatsTotal,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Price Relative vs.SeatsTotal ",
xlab="SeatsTotal",
ylab="Price Relative")
Therefore Seats_Economy and Seats Total are not the factors but Seats_Premium has a negative correlation with Price_Relative
=> Percent of Premium seats
library(car)
scatterplot(airlines.df$PriceRelative~airlines.df$PercentPremiumSeats,spread=FALSE, smoother.args=list(lty=2), pch=19,
main="Scatterplot of Relative price vs Percent of Premium seats ",
xlab="Percent of Premium seats",
ylab="Relative price")
negative correlation with Relative price.
So from the above data we conclude that, we get the factors which affect the Relative Pricing. Airline, Flightduration , IsInternational, Pitch Economy, Pitch Premium, Pitch Difference, Width Premium , Width Difference, Seats Premium
library(car)
scatterplotMatrix(airlines.df[,c("PriceRelative","Airline","FlightDuration","IsInternational","PitchEconomy","PitchPremium","PitchDifference","WidthPremium","WidthDifference","SeatsPremium")],
spread=FALSE, smoother.args=list(lty=2),
main="Scatter Plot Matrix")
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
## Warning in smoother(x, y, col = col[2], log.x = FALSE, log.y = FALSE,
## spread = spread, : could not fit smooth
library(corrgram)
corrgram(airlines.df, order=FALSE, lower.panel=panel.shade,
upper.panel=panel.pie, text.panel=panel.txt,
main="Corrgram of Variables")
PriceRelative, PitchEconomy, WidthEconomy
Null Hypothesis-“The relative price does not depends on any of the following variables- -Airline -Flight duration -IsInternational -PitchEconomy -PitchPremium -Pitch Difference -WidthEconomy -WidthPremium -Width Difference”
t.test(airlines.df$PriceRelative ~ airlines.df$IsInternational )
##
## Welch Two Sample t-test
##
## data: airlines.df$PriceRelative by airlines.df$IsInternational
## t = -19.451, df = 446.12, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.4855215 -0.3964139
## sample estimates:
## mean in group Domestic mean in group International
## 0.0847500 0.5257177
Since the p value is less that 0.05 We reject the null hypothesis and conclude that There is a strong correlation between Relative price and IsInternational.
Converting the variable Airline and IsInternational into a factor variable
str(airlines.df)
## 'data.frame': 458 obs. of 18 variables:
## $ Airline : Factor w/ 6 levels "AirFrance","British",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Aircraft : Factor w/ 2 levels "AirBus","Boeing": 2 2 2 2 2 2 2 2 2 2 ...
## $ FlightDuration : num 12.25 12.25 12.25 12.25 8.16 ...
## $ TravelMonth : Factor w/ 4 levels "Aug","Jul","Oct",..: 2 1 4 3 1 4 3 1 4 4 ...
## $ IsInternational : Factor w/ 2 levels "Domestic","International": 2 2 2 2 2 2 2 2 2 2 ...
## $ SeatsEconomy : int 122 122 122 122 122 122 122 122 122 122 ...
## $ SeatsPremium : int 40 40 40 40 40 40 40 40 40 40 ...
## $ PitchEconomy : int 31 31 31 31 31 31 31 31 31 31 ...
## $ PitchPremium : int 38 38 38 38 38 38 38 38 38 38 ...
## $ WidthEconomy : int 18 18 18 18 18 18 18 18 18 18 ...
## $ WidthPremium : int 19 19 19 19 19 19 19 19 19 19 ...
## $ PriceEconomy : int 2707 2707 2707 2707 1793 1793 1793 1476 1476 1705 ...
## $ PricePremium : int 3725 3725 3725 3725 2999 2999 2999 2997 2997 2989 ...
## $ PriceRelative : num 0.38 0.38 0.38 0.38 0.67 0.67 0.67 1.03 1.03 0.75 ...
## $ SeatsTotal : int 162 162 162 162 162 162 162 162 162 162 ...
## $ PitchDifference : int 7 7 7 7 7 7 7 7 7 7 ...
## $ WidthDifference : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PercentPremiumSeats: num 24.7 24.7 24.7 24.7 24.7 ...
model <- lm(PriceRelative ~Airline
+FlightDuration
+IsInternational
+PitchEconomy
+PitchPremium
+WidthEconomy
+WidthPremium
, data=airlines.df)
summary(model)
##
## Call:
## lm(formula = PriceRelative ~ Airline + FlightDuration + IsInternational +
## PitchEconomy + PitchPremium + WidthEconomy + WidthPremium,
## data = airlines.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8159 -0.1933 -0.0549 0.1009 1.4689
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.904967 4.976180 -0.182 0.855775
## AirlineBritish 0.250299 0.159931 1.565 0.118283
## AirlineDelta 0.061917 0.354717 0.175 0.861510
## AirlineJet 0.534838 0.140825 3.798 0.000166 ***
## AirlineSingapore 0.322987 0.204182 1.582 0.114390
## AirlineVirgin 0.417100 0.326186 1.279 0.201661
## FlightDuration 0.036153 0.006173 5.856 9.2e-09 ***
## IsInternationalInternational -0.406100 0.399055 -1.018 0.309394
## PitchEconomy -0.055148 0.133621 -0.413 0.680012
## PitchPremium 0.085155 0.151506 0.562 0.574361
## WidthEconomy -0.073197 0.078098 -0.937 0.349140
## WidthPremium 0.052925 0.222769 0.238 0.812319
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.36 on 446 degrees of freedom
## Multiple R-squared: 0.3772, Adjusted R-squared: 0.3618
## F-statistic: 24.55 on 11 and 446 DF, p-value: < 2.2e-16
model$coefficients
## (Intercept) AirlineBritish
## -0.90496697 0.25029934
## AirlineDelta AirlineJet
## 0.06191693 0.53483769
## AirlineSingapore AirlineVirgin
## 0.32298704 0.41709999
## FlightDuration IsInternationalInternational
## 0.03615307 -0.40609991
## PitchEconomy PitchPremium
## -0.05514750 0.08515457
## WidthEconomy WidthPremium
## -0.07319684 0.05292480
predictedProfit = data.frame(fitted(model))
actualProfit = data.frame(airlines.df$PriceRelative)
Comparison = cbind(actualProfit, predictedProfit)
View(Comparison)
some(Comparison)
## airlines.df.PriceRelative fitted.model.
## 41 0.24 0.2920278
## 125 0.17 0.2197217
## 139 0.19 0.2406904
## 172 0.84 0.8025652
## 241 1.13 0.5299150
## 300 0.07 0.1102492
## 342 0.08 0.1924596
## 362 0.03 0.3461101
## 372 0.14 0.6354819
## 420 0.61 0.6085275
So we can say that the Model predicts the Relative price values near to dataset values.
Airline Company : Lowest relative price difference in Airlines are AirFrance and Delta , Highest being Jet Airlines.
Type of Flight(International/Domestic) : t.Domestic Flights have low Difference in prices of tickets than International flights.
Increase in Flight duration, Pitch and width Results in Increasing Relative Price.