# Read data into R
# NOTE(review): setwd() ties the script to a ~/R directory on the machine
# running it; it is kept because the relative CSV path below relies on it.
setwd("~/R")
# stringsAsFactors = TRUE keeps the text columns as factors on R >= 4.0 too
# (where the default became FALSE), matching the str() output recorded below.
airlines.df <- read.csv("SixAirlines.csv", stringsAsFactors = TRUE)
# NOTE(review): attach() is retained only because later statements refer to
# the columns by bare name (e.g. PriceRelative); prefer airlines.df$col.
attach(airlines.df)
# Data summary: per-column descriptive statistics from psych::describe()
# (n, mean, sd, median, trimmed mean, mad, min, max, range, skew, kurtosis, se).
library(psych)
describe(x = airlines.df)
## vars n mean sd median trimmed mad min
## Airline* 1 458 3.01 1.65 2.00 2.89 1.48 1.00
## Aircraft* 2 458 1.67 0.47 2.00 1.71 0.00 1.00
## FlightDuration 3 458 7.58 3.54 7.79 7.57 4.81 1.25
## TravelMonth* 4 458 2.56 1.17 3.00 2.58 1.48 1.00
## IsInternational* 5 458 1.91 0.28 2.00 2.00 0.00 1.00
## SeatsEconomy 6 458 202.31 76.37 185.00 194.64 85.99 78.00
## SeatsPremium 7 458 33.65 13.26 36.00 33.35 11.86 8.00
## PitchEconomy 8 458 31.22 0.66 31.00 31.26 0.00 30.00
## PitchPremium 9 458 37.91 1.31 38.00 38.05 0.00 34.00
## WidthEconomy 10 458 17.84 0.56 18.00 17.81 0.00 17.00
## WidthPremium 11 458 19.47 1.10 19.00 19.53 0.00 17.00
## PriceEconomy 12 458 1327.08 988.27 1242.00 1244.40 1159.39 65.00
## PricePremium 13 458 1845.26 1288.14 1737.00 1799.05 1845.84 86.00
## PriceRelative 14 458 0.49 0.45 0.36 0.42 0.41 0.02
## SeatsTotal 15 458 235.96 85.29 227.00 228.73 90.44 98.00
## PitchDifference 16 458 6.69 1.76 7.00 6.76 0.00 2.00
## WidthDifference 17 458 1.63 1.19 1.00 1.53 0.00 0.00
## PercentPremiumSeats 18 458 14.65 4.84 13.21 14.31 2.68 4.71
## max range skew kurtosis se
## Airline* 6.00 5.00 0.61 -0.95 0.08
## Aircraft* 2.00 1.00 -0.72 -1.48 0.02
## FlightDuration 14.66 13.41 -0.07 -1.12 0.17
## TravelMonth* 4.00 3.00 -0.14 -1.46 0.05
## IsInternational* 2.00 1.00 -2.91 6.50 0.01
## SeatsEconomy 389.00 311.00 0.72 -0.36 3.57
## SeatsPremium 66.00 58.00 0.23 -0.46 0.62
## PitchEconomy 33.00 3.00 -0.03 -0.35 0.03
## PitchPremium 40.00 6.00 -1.51 3.52 0.06
## WidthEconomy 19.00 2.00 -0.04 -0.08 0.03
## WidthPremium 21.00 4.00 -0.08 -0.31 0.05
## PriceEconomy 3593.00 3528.00 0.51 -0.88 46.18
## PricePremium 7414.00 7328.00 0.50 0.43 60.19
## PriceRelative 1.89 1.87 1.17 0.72 0.02
## SeatsTotal 441.00 343.00 0.70 -0.53 3.99
## PitchDifference 10.00 8.00 -0.54 1.78 0.08
## WidthDifference 4.00 4.00 0.84 -0.53 0.06
## PercentPremiumSeats 24.69 19.98 0.71 0.28 0.23
# Data types: compact listing of every column's class and first values.
str(object = airlines.df)
## 'data.frame': 458 obs. of 18 variables:
## $ Airline : Factor w/ 6 levels "AirFrance","British",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Aircraft : Factor w/ 2 levels "AirBus","Boeing": 2 2 2 2 2 2 2 2 2 2 ...
## $ FlightDuration : num 12.25 12.25 12.25 12.25 8.16 ...
## $ TravelMonth : Factor w/ 4 levels "Aug","Jul","Oct",..: 2 1 4 3 1 4 3 1 4 4 ...
## $ IsInternational : Factor w/ 2 levels "Domestic","International": 2 2 2 2 2 2 2 2 2 2 ...
## $ SeatsEconomy : int 122 122 122 122 122 122 122 122 122 122 ...
## $ SeatsPremium : int 40 40 40 40 40 40 40 40 40 40 ...
## $ PitchEconomy : int 31 31 31 31 31 31 31 31 31 31 ...
## $ PitchPremium : int 38 38 38 38 38 38 38 38 38 38 ...
## $ WidthEconomy : int 18 18 18 18 18 18 18 18 18 18 ...
## $ WidthPremium : int 19 19 19 19 19 19 19 19 19 19 ...
## $ PriceEconomy : int 2707 2707 2707 2707 1793 1793 1793 1476 1476 1705 ...
## $ PricePremium : int 3725 3725 3725 3725 2999 2999 2999 2997 2997 2989 ...
## $ PriceRelative : num 0.38 0.38 0.38 0.38 0.67 0.67 0.67 1.03 1.03 0.75 ...
## $ SeatsTotal : int 162 162 162 162 162 162 162 162 162 162 ...
## $ PitchDifference : int 7 7 7 7 7 7 7 7 7 7 ...
## $ WidthDifference : int 1 1 1 1 1 1 1 1 1 1 ...
## $ PercentPremiumSeats: num 24.7 24.7 24.7 24.7 24.7 ...
# Visualizing single-variable distributions.
# Every numeric column gets a boxplot and a histogram side by side.
# hist() is used for the second panel because barplot() on a raw vector draws
# one bar per observation instead of showing how the values are distributed.
# local() keeps the helper variables out of the global environment.
local({
  # Column name -> human-readable label used in the plot titles.
  plot_labels <- c(
    FlightDuration = "Flight Duration",
    SeatsEconomy = "No. of Economy Seats",
    SeatsPremium = "No. of Premium Economy Seats",
    PitchEconomy = "Economy Pitch",
    PitchPremium = "Premium Economy Pitch",
    WidthEconomy = "Economy Width",
    WidthPremium = "Premium Width",
    PriceEconomy = "Economy Seat Price",
    PricePremium = "Premium Seat Price",
    PriceRelative = "Relative Price",
    SeatsTotal = "Total Seats",
    PitchDifference = "Pitch Difference",
    WidthDifference = "Width Difference",
    PercentPremiumSeats = "Premium Seat Percentage"
  )

  # par() returns the previous settings, so they can be restored afterwards
  # instead of assuming the device started out as a 1 x 1 layout.
  old_par <- par(mfrow = c(1, 2))
  for (column in names(plot_labels)) {
    values <- airlines.df[[column]]
    label <- plot_labels[[column]]
    boxplot(values, main = paste(label, "Boxplot"))
    hist(values, main = paste(label, "Histogram"), xlab = label)
  }
  par(old_par)
})
# Scatter plots to analyse dependency between PriceRelative and WidthDifference, PitchDifference, FlightDuration, No. of Premium Seats, Airline.
library(car)
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
# Relative price against each candidate predictor. `data = airlines.df` makes
# the formulas resolve against the data frame instead of relying on attach().
# NOTE(review): `spread` and `smoother.args` look like the pre-3.0 car API;
# newer car releases appear to take these via `smooth = list(...)` -- confirm
# against the installed version.
scatterplot(
  PriceRelative ~ WidthDifference,
  data = airlines.df,
  spread = FALSE, smoother.args = list(lty = 2), pch = 19,
  main = "Scatter plot of Relative Price vs Width Difference",
  xlab = "Width Difference", ylab = "Relative Price"
)
scatterplot(
  PriceRelative ~ PitchDifference,
  data = airlines.df,
  spread = FALSE, smoother.args = list(lty = 2), pch = 19,
  main = "Scatter plot of Relative Price vs Pitch Difference",
  xlab = "Pitch Difference", ylab = "Relative Price"
)
scatterplot(
  PriceRelative ~ FlightDuration,
  data = airlines.df,
  spread = FALSE, smoother.args = list(lty = 2), pch = 19,
  main = "Scatter plot of Relative Price vs FLight Duration",
  xlab = "Flight Duration", ylab = "Relative Price"
)
scatterplot(
  PriceRelative ~ SeatsPremium,
  data = airlines.df,
  spread = FALSE, smoother.args = list(lty = 2), pch = 19,
  main = "Scatter plot of Relative Price vs No. of Premium Seats",
  xlab = "Premium Seats", ylab = "Relative Price"
)
# Title and x label previously repeated the "Premium Seats" text from the plot
# above; this plot is Relative Price by Airline.
scatterplot(
  PriceRelative ~ Airline,
  data = airlines.df,
  spread = FALSE, smoother.args = list(lty = 2), pch = 19,
  main = "Scatter plot of Relative Price vs Airline",
  xlab = "Airline", ylab = "Relative Price"
)
## [1] "406" "407" "212" "408" "213" "426" "427" "214" "409" "339" "367"
## [12] "368" "369" "110" "111" "240" "241" "260" "271" "272" "185" "186"
## [23] "187" "188" "189" "190"
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Correlogram of the data set: shaded cells below the diagonal, pie glyphs
# above it, and variable names on the diagonal.
corrgram(
  x = airlines.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of airline dataset"
)
* From the corrgram, the relevant conclusions are: 1. Ticket prices are highly correlated with the pitch and width of the seats, with pitch being correlated in both the Premium and Economy classes. 2. Ticket prices are highly correlated with the flight duration.
# Variance-Covariance Matrices.
# The matrix has to be computed from the data itself: var(1:18, 1:18) only
# returned the variance of the integers 1..18. Factor columns are dropped
# because covariance is defined for numeric columns only.
# For a data frame var() and cov() return the same matrix, so one call suffices.
cov(Filter(is.numeric, airlines.df))
Running a t-test,
# Two-sample (unpaired, pooled-variance) t-test comparing the mean economy
# price with the mean premium-economy price.
# NOTE(review): both prices come from the same rows of the data set, so a
# paired test may be the more appropriate design -- confirm with the author.
t.test(
  x = PriceEconomy,
  y = PricePremium,
  paired = FALSE,
  var.equal = TRUE
)
##
## Two Sample t-test
##
## data: PriceEconomy and PricePremium
## t = -6.8304, df = 914, p-value = 1.544e-11
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -667.0699 -369.2926
## sample estimates:
## mean of x mean of y
## 1327.076 1845.258
The t-test yields a very low p-value (< 0.01), which indicates a statistically significant difference between the mean economy and premium-economy prices.
# Linear regression of the relative price on the seat-comfort differences,
# the flight duration and the share of premium seats; the fitted model is
# kept in `m` and its coefficient table is printed.
m <- lm(
  PriceRelative ~ PitchDifference + WidthDifference +
    FlightDuration + PercentPremiumSeats
)
summary(m)
##
## Call:
## lm(formula = PriceRelative ~ PitchDifference + WidthDifference +
## FlightDuration + PercentPremiumSeats)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.79439 -0.29424 -0.03427 0.16197 1.13688
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.179033 0.101492 -1.764 0.07840 .
## PitchDifference 0.059311 0.015921 3.725 0.00022 ***
## WidthDifference 0.118140 0.024555 4.811 2.05e-06 ***
## FlightDuration 0.021707 0.005085 4.269 2.39e-05 ***
## PercentPremiumSeats -0.005999 0.003898 -1.539 0.12454
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.381 on 453 degrees of freedom
## Multiple R-squared: 0.2913, Adjusted R-squared: 0.285
## F-statistic: 46.54 on 4 and 453 DF, p-value: < 2.2e-16
Inference: