# Load the six-airlines fare data.
# The CSV is addressed by its full path instead of calling setwd(), so the
# session's working directory is left untouched.
data_dir <- "C:/Users/Prabha Shankar/Desktop/Winter Internship/R file"
# stringsAsFactors = TRUE keeps text columns as factors on R >= 4.0 as well;
# the per-level counts in the recorded summary below rely on that.
var1.df <- read.csv(
  file.path(data_dir, "SixAirlinesDataV2.csv"),
  stringsAsFactors = TRUE
)
# Column-by-column overview of the loaded data frame.
summary(var1.df)
## Airline Aircraft FlightDuration TravelMonth
## AirFrance: 74 AirBus:151 Min. : 1.250 Aug:127
## British :175 Boeing:307 1st Qu.: 4.260 Jul: 75
## Delta : 46 Median : 7.790 Oct:127
## Jet : 61 Mean : 7.578 Sep:129
## Singapore: 40 3rd Qu.:10.620
## Virgin : 62 Max. :14.660
## IsInternational SeatsEconomy SeatsPremium PitchEconomy
## Domestic : 40 Min. : 78.0 Min. : 8.00 Min. :30.00
## International:418 1st Qu.:133.0 1st Qu.:21.00 1st Qu.:31.00
## Median :185.0 Median :36.00 Median :31.00
## Mean :202.3 Mean :33.65 Mean :31.22
## 3rd Qu.:243.0 3rd Qu.:40.00 3rd Qu.:32.00
## Max. :389.0 Max. :66.00 Max. :33.00
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## Min. :34.00 Min. :17.00 Min. :17.00 Min. : 65
## 1st Qu.:38.00 1st Qu.:18.00 1st Qu.:19.00 1st Qu.: 413
## Median :38.00 Median :18.00 Median :19.00 Median :1242
## Mean :37.91 Mean :17.84 Mean :19.47 Mean :1327
## 3rd Qu.:38.00 3rd Qu.:18.00 3rd Qu.:21.00 3rd Qu.:1909
## Max. :40.00 Max. :19.00 Max. :21.00 Max. :3593
## PricePremium PriceRelative SeatsTotal PitchDifference
## Min. : 86.0 Min. :0.0200 Min. : 98 Min. : 2.000
## 1st Qu.: 528.8 1st Qu.:0.1000 1st Qu.:166 1st Qu.: 6.000
## Median :1737.0 Median :0.3650 Median :227 Median : 7.000
## Mean :1845.3 Mean :0.4872 Mean :236 Mean : 6.688
## 3rd Qu.:2989.0 3rd Qu.:0.7400 3rd Qu.:279 3rd Qu.: 7.000
## Max. :7414.0 Max. :1.8900 Max. :441 Max. :10.000
## WidthDifference PercentPremiumSeats
## Min. :0.000 Min. : 4.71
## 1st Qu.:1.000 1st Qu.:12.28
## Median :1.000 Median :13.21
## Mean :1.633 Mean :14.65
## 3rd Qu.:3.000 3rd Qu.:15.36
## Max. :4.000 Max. :24.69
library(psych)
## Warning: package 'psych' was built under R version 3.3.3
# Flight duration: descriptive statistics from psych::describe().
describe(var1.df[["FlightDuration"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 458 7.58 3.54 7.79 7.57 4.81 1.25 14.66 13.41 -0.07 -1.12
## se
## X1 0.17
# Number of flights recorded in each travel month, as counts and as a bar chart.
table(var1.df[["TravelMonth"]])
##
## Aug Jul Oct Sep
## 127 75 127 129
barplot(
  height = table(var1.df[["TravelMonth"]]),
  xlab = "Month",
  ylab = "No. of Flights",
  col = "grey"
)
# Spread of flight duration, drawn as a horizontal box plot.
boxplot(
  x = var1.df[["FlightDuration"]],
  horizontal = TRUE,
  xlab = "Duration(hrs)"
)
# Domestic versus international flights, as counts and as a bar chart.
table(var1.df[["IsInternational"]])
##
## Domestic International
## 40 418
barplot(
  height = table(var1.df[["IsInternational"]]),
  ylab = "No. of Flights",
  col = "grey"
)
# Economy seat count: descriptive statistics.
describe(var1.df[["SeatsEconomy"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 458 202.31 76.37 185 194.64 85.99 78 389 311 0.72 -0.36
## se
## X1 3.57
# Economy seat width: descriptive statistics, then flights per distinct width.
describe(var1.df[["WidthEconomy"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 458 17.84 0.56 18 17.81 0 17 19 2 -0.04 -0.08
## se
## X1 0.03
barplot(
  height = table(var1.df[["WidthEconomy"]]),
  xlab = "Width(Inches)",
  ylab = "No. of Flights",
  col = "grey"
)
# Economy ticket price: descriptive statistics and a horizontal box plot.
describe(var1.df$PriceEconomy)
## vars n mean sd median trimmed mad min max range skew
## X1 1 458 1327.08 988.27 1242 1244.4 1159.39 65 3593 3528 0.51
## kurtosis se
## X1 -0.88 46.18
boxplot(var1.df$PriceEconomy, horizontal = TRUE, xlab = "Ticket Price(USD)")
# Relative price: descriptive statistics and a horizontal box plot.
describe(var1.df$PriceRelative)
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 458 0.49 0.45 0.36 0.42 0.41 0.02 1.89 1.87 1.17 0.72
## se
## X1 0.02
# The axis caption is kept on one source line: a string literal that is broken
# across lines embeds a newline, which put a blank first line in the label.
boxplot(
  var1.df$PriceRelative,
  horizontal = TRUE,
  xlab = "(PricePremium - PriceEconomy) / PriceEconomy"
)
# Total seats per aircraft: descriptive statistics and a horizontal box plot.
describe(var1.df[["SeatsTotal"]])
## vars n mean sd median trimmed mad min max range skew kurtosis
## X1 1 458 235.96 85.29 227 228.73 90.44 98 441 343 0.7 -0.53
## se
## X1 3.99
boxplot(
  x = var1.df[["SeatsTotal"]],
  horizontal = TRUE,
  xlab = "No. of Seats"
)
# Pitch difference: descriptive statistics, then flights per distinct value.
describe(var1.df[["PitchDifference"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 458 6.69 1.76 7 6.76 0 2 10 8 -0.54 1.78 0.08
barplot(
  height = table(var1.df[["PitchDifference"]]),
  xlab = "PitchDifference(Inches)",
  ylab = "No. of Flights",
  col = "grey"
)
# Width difference: descriptive statistics, then flights per distinct value.
describe(var1.df[["WidthDifference"]])
## vars n mean sd median trimmed mad min max range skew kurtosis se
## X1 1 458 1.63 1.19 1 1.53 0 0 4 4 0.84 -0.53 0.06
barplot(
  height = table(var1.df[["WidthDifference"]]),
  xlab = "WidthDifference(Inches)",
  ylab = "No. of Flights",
  col = "grey"
)
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.3.3
# Correlogram of the data frame in its original column order, using the
# shade panel below the diagonal and the pie panel above it.
corrgram(
  x = var1.df,
  order = FALSE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of analyze relations between variable of dataframe"
)
A. Test of the correlation between the price difference and PitchDifference.
# Correlation test between the price gap (PricePremium - PriceEconomy) and
# PitchDifference. The argument expressions are left exactly as written so the
# "data:" line of the printed result is unchanged.
cor.test(
  x = (var1.df$PricePremium - var1.df$PriceEconomy),
  y = var1.df$PitchDifference
)
##
## Pearson's product-moment correlation
##
## data: (var1.df$PricePremium - var1.df$PriceEconomy) and var1.df$PitchDifference
## t = 2.7688, df = 456, p-value = 0.005855
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.03739893 0.21764764
## sample estimates:
## cor
## 0.1285851
library(car)
## Warning: package 'car' was built under R version 3.3.3
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Scatterplot of the same two quantities.
scatterplot(
  x = (var1.df$PricePremium - var1.df$PriceEconomy),
  y = var1.df$PitchDifference
)
B. Test of the correlation between the price difference and WidthDifference.
# Correlation test between the price gap (PricePremium - PriceEconomy) and
# WidthDifference. The argument expressions are left exactly as written so the
# "data:" line of the printed result is unchanged.
cor.test(
  x = (var1.df$PricePremium - var1.df$PriceEconomy),
  y = var1.df$WidthDifference
)
##
## Pearson's product-moment correlation
##
## data: (var1.df$PricePremium - var1.df$PriceEconomy) and var1.df$WidthDifference
## t = 2.5291, df = 456, p-value = 0.01177
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.02627012 0.20700978
## sample estimates:
## cor
## 0.1176138
library(car)
# Scatterplot of the same two quantities.
scatterplot(
  x = (var1.df$PricePremium - var1.df$PriceEconomy),
  y = var1.df$WidthDifference
)
C. Test of the correlation between the price difference and FlightDuration.
# Correlation test between the price gap (PricePremium - PriceEconomy) and
# FlightDuration. The argument expressions are left exactly as written so the
# "data:" line of the printed result is unchanged.
cor.test(
  x = (var1.df$PricePremium - var1.df$PriceEconomy),
  y = var1.df$FlightDuration
)
##
## Pearson's product-moment correlation
##
## data: (var1.df$PricePremium - var1.df$PriceEconomy) and var1.df$FlightDuration
## t = 11.435, df = 456, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3976578 0.5403379
## sample estimates:
## cor
## 0.4720837
library(car)
# Scatterplot of the same two quantities.
scatterplot(
  x = (var1.df$PricePremium - var1.df$PriceEconomy),
  y = var1.df$FlightDuration
)
The above correlation tests suggest that the difference in price between the two ticket classes depends strongly on flight duration, since the p-value is very small (p < 2.2e-16), and also on the pitch and width differences (p-value < 0.05).
Null hypothesis: there is no difference in price between an economy class ticket and a premium economy class ticket.
# Unpaired two-sample t-test of economy versus premium-economy prices,
# assuming equal variances in the two groups.
t.test(
  x = var1.df$PriceEconomy,
  y = var1.df$PricePremium,
  paired = FALSE,
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: var1.df$PriceEconomy and var1.df$PricePremium
## t = -6.8304, df = 914, p-value = 1.544e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -667.0699 -369.2926
## sample estimates:
## mean of x mean of y
## 1327.076 1845.258
The null hypothesis is rejected because the t-test gives a very low p-value (1.544e-11); there is a difference in price between economy class and premium economy class tickets.
# Linear model of the price gap (PricePremium - PriceEconomy) on
# PitchDifference, WidthDifference and FlightDuration.
# The formula refers to the columns through var1.df$ directly, so it is
# self-contained and the coefficient names carry the var1.df$ prefix.
var2 <- (var1.df$PricePremium - var1.df$PriceEconomy) ~
  var1.df$PitchDifference +
  var1.df$WidthDifference +
  var1.df$FlightDuration
var3 <- lm(formula = var2)
summary(var3)
##
## Call:
## lm(formula = var2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -859.4 -324.7 -62.7 150.1 3331.5
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -286.933 117.833 -2.435 0.0153 *
## var1.df$PitchDifference 10.387 20.779 0.500 0.6174
## var1.df$WidthDifference 74.641 30.977 2.410 0.0164 *
## var1.df$FlightDuration 80.992 6.754 11.992 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 506.1 on 454 degrees of freedom
## Multiple R-squared: 0.2538, Adjusted R-squared: 0.2489
## F-statistic: 51.48 on 3 and 454 DF, p-value: < 2.2e-16
A. Beta coefficients of the model.
# Estimated coefficients of the fitted model, read through the coef() accessor.
coef(var3)
## (Intercept) var1.df$PitchDifference var1.df$WidthDifference
## -286.93258 10.38682 74.64098
## var1.df$FlightDuration
## 80.99227
B. Confidence intervals on the beta coefficients.
# 95% confidence intervals for every coefficient (level stated explicitly).
confint(var3, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -518.49881 -55.36635
## var1.df$PitchDifference -30.44766 51.22130
## var1.df$WidthDifference 13.76513 135.51683
## var1.df$FlightDuration 67.72008 94.26446
C. Plot of the model.
library(car)
# Scatterplots of the response against the predictors in the model formula.
plot(var2)
# A formula carries no fitted coefficients, so abline(var2) could not draw a
# regression line, and one straight line cannot represent a fit with three
# predictors anyway. Added-variable plots from car draw the fitted partial
# relationship for each predictor of the fitted model var3.
avPlots(var3)
1. The data set is normally distributed; therefore we can perform the regression analysis. 2. As the regression analysis shows, the difference in price between an economy ticket and a premium-economy ticket (PricePremium - PriceEconomy) depends significantly on FlightDuration (p < 2e-16) and WidthDifference (p = 0.016), but not significantly on PitchDifference (p = 0.617).