Read data:
# Load the airline data set into air.df. The file name is passed directly:
# wrapping a single string in paste(..., sep = "") was a no-op.
# NOTE(review): the path is relative, so it is presumably resolved against the
# current working directory -- confirm where the CSV lives.
air.df <- read.csv("SixAirlinesDataV2.csv")

# View() opens a data viewer window, which is only useful (and only reliably
# available) in an interactive session, so skip it in batch/knit runs.
if (interactive()) {
  View(air.df)
}
Getting Summary stats:
# Print a per-column summary of air.df: quartiles and mean for the numeric
# columns, level counts for the categorical ones (see the output below).
summary(air.df)
## Airline Aircraft FlightDuration TravelMonth
## AirFrance: 74 AirBus:151 Min. : 1.250 Aug:127
## British :175 Boeing:307 1st Qu.: 4.260 Jul: 75
## Delta : 46 Median : 7.790 Oct:127
## Jet : 61 Mean : 7.578 Sep:129
## Singapore: 40 3rd Qu.:10.620
## Virgin : 62 Max. :14.660
## IsInternational SeatsEconomy SeatsPremium PitchEconomy
## Domestic : 40 Min. : 78.0 Min. : 8.00 Min. :30.00
## International:418 1st Qu.:133.0 1st Qu.:21.00 1st Qu.:31.00
## Median :185.0 Median :36.00 Median :31.00
## Mean :202.3 Mean :33.65 Mean :31.22
## 3rd Qu.:243.0 3rd Qu.:40.00 3rd Qu.:32.00
## Max. :389.0 Max. :66.00 Max. :33.00
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## Min. :34.00 Min. :17.00 Min. :17.00 Min. : 65
## 1st Qu.:38.00 1st Qu.:18.00 1st Qu.:19.00 1st Qu.: 413
## Median :38.00 Median :18.00 Median :19.00 Median :1242
## Mean :37.91 Mean :17.84 Mean :19.47 Mean :1327
## 3rd Qu.:38.00 3rd Qu.:18.00 3rd Qu.:21.00 3rd Qu.:1909
## Max. :40.00 Max. :19.00 Max. :21.00 Max. :3593
## PricePremium PriceRelative SeatsTotal PitchDifference
## Min. : 86.0 Min. :0.0200 Min. : 98 Min. : 2.000
## 1st Qu.: 528.8 1st Qu.:0.1000 1st Qu.:166 1st Qu.: 6.000
## Median :1737.0 Median :0.3650 Median :227 Median : 7.000
## Mean :1845.3 Mean :0.4872 Mean :236 Mean : 6.688
## 3rd Qu.:2989.0 3rd Qu.:0.7400 3rd Qu.:279 3rd Qu.: 7.000
## Max. :7414.0 Max. :1.8900 Max. :441 Max. :10.000
## WidthDifference PercentPremiumSeats
## Min. :0.000 Min. : 4.71
## 1st Qu.:1.000 1st Qu.:12.28
## Median :1.000 Median :13.21
## Mean :1.633 Mean :14.65
## 3rd Qu.:3.000 3rd Qu.:15.36
## Max. :4.000 Max. :24.69
Converting into factor variables:
# Re-encode the four categorical columns of air.df as factors; every other
# column is left untouched and the column order is preserved.
air.df <- transform(
  air.df,
  Airline = factor(Airline),
  Aircraft = factor(Aircraft),
  TravelMonth = factor(TravelMonth),
  IsInternational = factor(IsInternational)
)
Plotting each variable individually:
# Draw one plot per categorical column (Airline, Aircraft, TravelMonth,
# IsInternational). The Airline plot was previously issued twice in a row,
# producing a redundant duplicate figure; it is now drawn once.
plot(air.df$Airline)
plot(air.df$Aircraft)
plot(air.df$TravelMonth)
plot(air.df$IsInternational)
flight duration
# Histogram of the FlightDuration column.
hist(
  x = air.df$FlightDuration,
  main = " Flight Duration",
  xlab = "Flight Duration",
  ylab = "Frequency"
)
number of seats
# Histogram of the number of economy-class seats.
hist(
  x = air.df$SeatsEconomy,
  main = "number of seats of economy class",
  xlab = "Number of Economy Seat",
  ylab = "Frequency"
)

# Histogram of the number of premium-class seats.
hist(
  x = air.df$SeatsPremium,
  main = " number of seats of premium class",
  xlab = "Seat in Premium class",
  ylab = "Frequency"
)
pitch
# Histogram of seat pitch in economy class.
hist(
  x = air.df$PitchEconomy,
  main = " pitch of economy class seats",
  xlab = "Pitch of economy class seats",
  ylab = "Frequency"
)

# Histogram of seat pitch in premium class.
hist(
  x = air.df$PitchPremium,
  main = "pitch of premium class seats",
  xlab = "Pitch of premium class seats",
  ylab = "Frequency"
)
width of seats
# Histogram of seat width in economy class.
hist(
  air.df$WidthEconomy,
  main = "Distribution of width of economic class seats",
  xlab = "Width of Economy class seats",
  ylab = "Frequency"
)

# Histogram of seat width in premium class. The x-axis label previously read
# "Price of economy class seats" (a copy-paste slip); it now names the
# variable that is actually plotted.
hist(
  air.df$WidthPremium,
  main = "Distribution of width of premium class seats",
  xlab = "Width of premium class seats",
  ylab = "Frequency"
)
price of seats
# Histogram of premium-class prices.
hist(
  x = air.df$PricePremium,
  main = "Distribution of price in premium class seats",
  xlab = "Price of premium class seats",
  ylab = "Frequency"
)

# Histogram of economy-class prices.
hist(
  x = air.df$PriceEconomy,
  main = "Distribution of price in economy class seats",
  xlab = "Price of economy class seats",
  ylab = "Frequency"
)

# Histogram of the PriceRelative column (formula as given in the axis label).
hist(
  x = air.df$PriceRelative,
  main = "Distribution of Relative price",
  xlab = "Relative(PricePremium - PriceEconomy) / PriceEconomy",
  ylab = "Frequency"
)
total seats
# Histogram of the SeatsTotal column.
hist(
  x = air.df$SeatsTotal,
  main = "Distribution of total seats(Economy + premium)",
  xlab = "Total seats",
  ylab = "Frequency"
)
pitch difference
# Histogram of the PitchDifference column.
hist(
  x = air.df$PitchDifference,
  main = "Distribution of Pitch difference",
  xlab = "Pitch difference",
  ylab = "Frequency"
)
width difference and percentage of premium seats
# Histogram of the WidthDifference column.
hist(
  x = air.df$WidthDifference,
  main = "Distribution of Width difference",
  xlab = "Width difference",
  ylab = "Frequency"
)

# Histogram of the PercentPremiumSeats column.
hist(
  x = air.df$PercentPremiumSeats,
  main = "Distribution of Percentage of premium seats",
  xlab = "Percentage of premium seats",
  ylab = "Frequency"
)
Plotting a corrgram to visualise the correlations between the variables:
library(corrgram)
# Correlogram of air.df: shaded cells in the lower triangle, pie glyphs in the
# upper triangle, and variable names on the diagonal. The title previously
# said "store variables", left over from a different data set; it now
# describes the airline data being plotted.
corrgram(
  air.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of airline variables"
)
Generating correlation matrix:
# Correlation matrix of the numeric columns of air.df. Columns are selected by
# type rather than by the hard-coded positions c(3, 6:18), so the call stays
# correct if columns are added to or reordered in the CSV. For the current
# data the selection is the same 14 columns, in the same order.
cor(air.df[vapply(air.df, is.numeric, logical(1))])
## FlightDuration SeatsEconomy SeatsPremium PitchEconomy
## FlightDuration 1.00000000 0.195621187 0.161236400 0.29377174
## SeatsEconomy 0.19562119 1.000000000 0.625056587 0.14412692
## SeatsPremium 0.16123640 0.625056587 1.000000000 -0.03421296
## PitchEconomy 0.29377174 0.144126924 -0.034212963 1.00000000
## PitchPremium 0.09621471 0.119221250 0.004883123 -0.55060624
## WidthEconomy 0.45647720 0.373670252 0.455782883 0.29448586
## WidthPremium 0.10343747 0.102431959 -0.002717527 -0.53929285
## PriceEconomy 0.56664039 0.128167220 0.113642176 0.36866123
## PricePremium 0.64873981 0.177000928 0.217612376 0.22614179
## PriceRelative 0.12107501 0.003956939 -0.097196009 -0.42302204
## SeatsTotal 0.20023299 0.992607966 0.715171053 0.12373524
## PitchDifference -0.03749288 0.035318044 0.016365566 -0.78254993
## WidthDifference -0.11856070 -0.080670148 -0.216168666 -0.63557430
## PercentPremiumSeats 0.06051625 -0.330935223 0.485029771 -0.10280880
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## FlightDuration 0.096214708 0.45647720 0.103437469 0.56664039
## SeatsEconomy 0.119221250 0.37367025 0.102431959 0.12816722
## SeatsPremium 0.004883123 0.45578288 -0.002717527 0.11364218
## PitchEconomy -0.550606241 0.29448586 -0.539292852 0.36866123
## PitchPremium 1.000000000 -0.02374087 0.750259029 0.05038455
## WidthEconomy -0.023740873 1.00000000 0.081918728 0.06799061
## WidthPremium 0.750259029 0.08191873 1.000000000 -0.05704522
## PriceEconomy 0.050384550 0.06799061 -0.057045224 1.00000000
## PricePremium 0.088539147 0.15054837 0.064020043 0.90138870
## PriceRelative 0.417539056 -0.04396116 0.504247591 -0.28856711
## SeatsTotal 0.107512784 0.40545860 0.091297500 0.13243313
## PitchDifference 0.950591466 -0.12722421 0.760121272 -0.09952511
## WidthDifference 0.703281797 -0.39320512 0.884149655 -0.08449975
## PercentPremiumSeats -0.175487414 0.22714172 -0.183312058 0.06532232
## PricePremium PriceRelative SeatsTotal PitchDifference
## FlightDuration 0.64873981 0.121075014 0.20023299 -0.03749288
## SeatsEconomy 0.17700093 0.003956939 0.99260797 0.03531804
## SeatsPremium 0.21761238 -0.097196009 0.71517105 0.01636557
## PitchEconomy 0.22614179 -0.423022038 0.12373524 -0.78254993
## PitchPremium 0.08853915 0.417539056 0.10751278 0.95059147
## WidthEconomy 0.15054837 -0.043961160 0.40545860 -0.12722421
## WidthPremium 0.06402004 0.504247591 0.09129750 0.76012127
## PriceEconomy 0.90138870 -0.288567110 0.13243313 -0.09952511
## PricePremium 1.00000000 0.031846537 0.19232533 -0.01806629
## PriceRelative 0.03184654 1.000000000 -0.01156894 0.46873025
## SeatsTotal 0.19232533 -0.011568942 1.00000000 0.03416915
## PitchDifference -0.01806629 0.468730249 0.03416915 1.00000000
## WidthDifference -0.01151218 0.485802437 -0.10584398 0.76089108
## PercentPremiumSeats 0.11639097 -0.161565556 -0.22091465 -0.09264869
## WidthDifference PercentPremiumSeats
## FlightDuration -0.11856070 0.06051625
## SeatsEconomy -0.08067015 -0.33093522
## SeatsPremium -0.21616867 0.48502977
## PitchEconomy -0.63557430 -0.10280880
## PitchPremium 0.70328180 -0.17548741
## WidthEconomy -0.39320512 0.22714172
## WidthPremium 0.88414965 -0.18331206
## PriceEconomy -0.08449975 0.06532232
## PricePremium -0.01151218 0.11639097
## PriceRelative 0.48580244 -0.16156556
## SeatsTotal -0.10584398 -0.22091465
## PitchDifference 0.76089108 -0.09264869
## WidthDifference 1.00000000 -0.27559416
## PercentPremiumSeats -0.27559416 1.00000000
Implementing a regression for the price of economy class:
# Linear model of the economy price on flight duration, seat counts, the
# pitch/width differences and the premium price. Terms keep the `air.df$`
# prefix so the coefficient names match the printed output below.
fit1 <- lm(
  air.df$PriceEconomy ~
    air.df$FlightDuration +
    air.df$SeatsEconomy +
    air.df$SeatsPremium +
    air.df$PitchDifference +
    air.df$WidthDifference +
    air.df$PricePremium
)
Getting summary stats of regression analysis:
# Print the regression summary for fit1: residual quantiles, the coefficient
# table with p-values, R-squared and the overall F-statistic.
summary(fit1)
##
## Call:
## lm(formula = air.df$PriceEconomy ~ air.df$FlightDuration + air.df$SeatsEconomy +
## air.df$SeatsPremium + air.df$PitchDifference + air.df$WidthDifference +
## air.df$PricePremium)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2365.36 -217.56 14.31 160.67 912.01
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 469.97042 101.91927 4.611 5.22e-06 ***
## air.df$FlightDuration -14.65160 7.29161 -2.009 0.04509 *
## air.df$SeatsEconomy 0.60660 0.32494 1.867 0.06258 .
## air.df$SeatsPremium -10.16913 1.98929 -5.112 4.72e-07 ***
## air.df$PitchDifference -4.56145 17.67483 -0.258 0.79647
## air.df$WidthDifference -82.47864 27.17490 -3.035 0.00254 **
## air.df$PricePremium 0.73312 0.02011 36.461 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 408.6 on 451 degrees of freedom
## Multiple R-squared: 0.8313, Adjusted R-squared: 0.829
## F-statistic: 370.3 on 6 and 451 DF, p-value: < 2.2e-16
Getting the variables whose p-value is significant (p < 0.05):
# Positions (named by term) of the fit1 coefficients with p-value below 0.05.
# The p-values are the "Pr(>|t|)" column of the coefficient table.
which(coef(summary(fit1))[, "Pr(>|t|)"] < 0.05)
## (Intercept) air.df$FlightDuration air.df$SeatsPremium
## 1 2 4
## air.df$WidthDifference air.df$PricePremium
## 6 7
Getting the variables whose p-value is not significant at the 0.05 level:
# Positions (named by term) of the fit1 coefficients that are NOT significant
# at the 0.05 level. Uses >= so this is the exact complement of the `< 0.05`
# test: with a strict `>` a p-value of exactly 0.05 would be reported in
# neither list. The p-value column is addressed by name rather than position.
which(coef(summary(fit1))[, "Pr(>|t|)"] >= 0.05)
## air.df$SeatsEconomy air.df$PitchDifference
## 3 5
Getting confidence interval:
# 95% confidence intervals (the default level, spelled out here) for every
# coefficient of fit1.
confint(fit1, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) 269.67479442 670.2660411
## air.df$FlightDuration -28.98135553 -0.3218542
## air.df$SeatsEconomy -0.03198731 1.2451917
## air.df$SeatsPremium -14.07856714 -6.2597021
## air.df$PitchDifference -39.29670094 30.1737960
## air.df$WidthDifference -135.88377697 -29.0735063
## air.df$PricePremium 0.69360554 0.7726346
Getting coefficients:
# Named vector of the estimated coefficients of fit1.
coef(fit1)
## (Intercept) air.df$FlightDuration air.df$SeatsEconomy
## 469.9704178 -14.6516049 0.6066022
## air.df$SeatsPremium air.df$PitchDifference air.df$WidthDifference
## -10.1691346 -4.5614525 -82.4786416
## air.df$PricePremium
## 0.7331200
Implementing a regression for the price of premium class:
# Linear model of the premium price on flight duration, seat counts, the
# pitch/width differences and the economy price. Terms keep the `air.df$`
# prefix so the coefficient names match the printed output below.
fit2 <- lm(
  air.df$PricePremium ~
    air.df$FlightDuration +
    air.df$SeatsEconomy +
    air.df$SeatsPremium +
    air.df$PitchDifference +
    air.df$WidthDifference +
    air.df$PriceEconomy
)
Getting summary stats of regression analysis:
# Print the regression summary for fit2: residual quantiles, the coefficient
# table with p-values, R-squared and the overall F-statistic.
summary(fit2)
##
## Call:
## lm(formula = air.df$PricePremium ~ air.df$FlightDuration + air.df$SeatsEconomy +
## air.df$SeatsPremium + air.df$PitchDifference + air.df$WidthDifference +
## air.df$PriceEconomy)
##
## Residuals:
## Min 1Q Median 3Q Max
## -890.7 -256.1 -24.4 154.8 3545.4
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -473.51759 120.89029 -3.917 0.000104 ***
## air.df$FlightDuration 74.60808 7.88567 9.461 < 2e-16 ***
## air.df$SeatsEconomy -0.82139 0.38253 -2.147 0.032305 *
## air.df$SeatsPremium 15.21609 2.30282 6.608 1.10e-10 ***
## air.df$PitchDifference -28.15622 20.79226 -1.354 0.176361
## air.df$WidthDifference 149.55172 31.58030 4.736 2.93e-06 ***
## air.df$PriceEconomy 1.01851 0.02793 36.461 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 481.6 on 451 degrees of freedom
## Multiple R-squared: 0.862, Adjusted R-squared: 0.8602
## F-statistic: 469.6 on 6 and 451 DF, p-value: < 2.2e-16
Getting the variables whose p-value is significant (p < 0.05):
# Positions (named by term) of the fit2 coefficients with p-value below 0.05.
# The p-values are the "Pr(>|t|)" column of the coefficient table.
which(coef(summary(fit2))[, "Pr(>|t|)"] < 0.05)
## (Intercept) air.df$FlightDuration air.df$SeatsEconomy
## 1 2 3
## air.df$SeatsPremium air.df$WidthDifference air.df$PriceEconomy
## 4 6 7
Getting the variables whose p-value is not significant at the 0.05 level:
# Positions (named by term) of the fit2 coefficients that are NOT significant
# at the 0.05 level. Uses >= so this is the exact complement of the `< 0.05`
# test: with a strict `>` a p-value of exactly 0.05 would be reported in
# neither list. The p-value column is addressed by name rather than position.
which(coef(summary(fit2))[, "Pr(>|t|)"] >= 0.05)
## air.df$PitchDifference
## 5
Getting confidence interval:
# 95% confidence intervals (the default level, spelled out here) for every
# coefficient of fit2.
confint(fit2, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -711.0957670 -235.93941844
## air.df$FlightDuration 59.1108533 90.10530535
## air.df$SeatsEconomy -1.5731505 -0.06962792
## air.df$SeatsPremium 10.6905016 19.74167400
## air.df$PitchDifference -69.0179491 12.70551341
## air.df$WidthDifference 87.4889300 211.61451794
## air.df$PriceEconomy 0.9636146 1.07340835
Getting coefficients:
# Named vector of the estimated coefficients of fit2.
coef(fit2)
## (Intercept) air.df$FlightDuration air.df$SeatsEconomy
## -473.5175927 74.6080793 -0.8213892
## air.df$SeatsPremium air.df$PitchDifference air.df$WidthDifference
## 15.2160878 -28.1562179 149.5517240
## air.df$PriceEconomy
## 1.0185115
Flight duration has a positive coefficient in the premium-price model (74.6080793).
The number of economy seats has a negative coefficient (-0.8213892).
The number of premium seats has a positive coefficient (15.2160878).
A longer flight duration means a longer distance, and therefore a higher cost.
If more seats are available in economy, more people choose economy, so the price of premium class needs to be lowered.
Wider seats mean fewer seats fit in the same cabin area, so the price per seat is higher.