Reading local data in R
# Read the airline pricing CSV into the `airline.df` data frame.
# The file name is a single literal, so no paste() wrapper is needed.
airline.df <- read.csv("AirlinePricingData - AirlinePricingData.csv")
# Number of rows and columns.
dim(airline.df)
## [1] 305 25
Q1a. Write R code to generate the correlation matrix for the given continuous variables {“Price”, “AdvancedBookingDays”, “FlyingMinutes”, “Capacity”, “SeatPitch”, “SeatWidth”}
# Keep only the six continuous variables named in the question. Selecting by
# column name (instead of by position 17, 18, 8, 11, 12, 13) keeps the subset
# correct if the column order of the CSV ever changes, and documents which
# variables are being correlated.
airlines <- airline.df[, c("Price", "AdvancedBookingDays", "FlyingMinutes",
                           "Capacity", "SeatPitch", "SeatWidth")]
## Pearson Correlation
round(cor(airlines), 2)
## Price AdvancedBookingDays FlyingMinutes Capacity
## Price 1.00 -0.01 -0.02 -0.03
## AdvancedBookingDays -0.01 1.00 0.01 -0.01
## FlyingMinutes -0.02 0.01 1.00 -0.32
## Capacity -0.03 -0.01 -0.32 1.00
## SeatPitch 0.07 -0.01 -0.03 0.51
## SeatWidth -0.06 0.05 -0.18 0.45
## SeatPitch SeatWidth
## Price 0.07 -0.06
## AdvancedBookingDays -0.01 0.05
## FlyingMinutes -0.03 -0.18
## Capacity 0.51 0.45
## SeatPitch 1.00 0.32
## SeatWidth 0.32 1.00
## Spearman correlation of the same six variables, rounded to two decimals
round(cor(airlines, method = "spearman"), digits = 2)
## Price AdvancedBookingDays FlyingMinutes Capacity
## Price 1.00 -0.09 0.02 -0.07
## AdvancedBookingDays -0.09 1.00 0.02 -0.01
## FlyingMinutes 0.02 0.02 1.00 -0.34
## Capacity -0.07 -0.01 -0.34 1.00
## SeatPitch 0.15 0.00 0.02 0.22
## SeatWidth -0.15 0.10 -0.18 0.54
## SeatPitch SeatWidth
## Price 0.15 -0.15
## AdvancedBookingDays 0.00 0.10
## FlyingMinutes 0.02 -0.18
## Capacity 0.22 0.54
## SeatPitch 1.00 0.32
## SeatWidth 0.32 1.00
Q1b. Write R code to generate the correlation matrix with their significance values for the given continuous variables {“Price”, “AdvancedBookingDays”, “FlyingMinutes”, “Capacity”, “SeatPitch”, “SeatWidth”}
library(psych)
# Correlation matrix, sample size and p-values for every pair of variables in
# `airlines`; use = "complete" is passed through to corr.test().
corr.test(x = airlines, use = "complete")
## Call:corr.test(x = airlines, use = "complete")
## Correlation matrix
## Price AdvancedBookingDays FlyingMinutes Capacity
## Price 1.00 -0.01 -0.02 -0.03
## AdvancedBookingDays -0.01 1.00 0.01 -0.01
## FlyingMinutes -0.02 0.01 1.00 -0.32
## Capacity -0.03 -0.01 -0.32 1.00
## SeatPitch 0.07 -0.01 -0.03 0.51
## SeatWidth -0.06 0.05 -0.18 0.45
## SeatPitch SeatWidth
## Price 0.07 -0.06
## AdvancedBookingDays -0.01 0.05
## FlyingMinutes -0.03 -0.18
## Capacity 0.51 0.45
## SeatPitch 1.00 0.32
## SeatWidth 0.32 1.00
## Sample Size
## [1] 305
## Probability values (Entries above the diagonal are adjusted for multiple tests.)
## Price AdvancedBookingDays FlyingMinutes Capacity
## Price 0.00 1.00 1.00 1
## AdvancedBookingDays 0.87 0.00 1.00 1
## FlyingMinutes 0.75 0.93 0.00 0
## Capacity 0.65 0.88 0.00 0
## SeatPitch 0.19 0.81 0.55 0
## SeatWidth 0.30 0.34 0.00 0
## SeatPitch SeatWidth
## Price 1 1.00
## AdvancedBookingDays 1 1.00
## FlyingMinutes 1 0.01
## Capacity 0 0.00
## SeatPitch 0 0.00
## SeatWidth 0 0.00
##
## To see confidence intervals of the correlations, print with the short=FALSE option
Q1d. Write R code to visualize Q1b.
library(corrgram)
# Four-colour ramp, handed to corrgram() below as its col.regions palette.
cols <- colorRampPalette(
  c("darkgoldenrod4", "burlywood1", "darkkhaki", "darkgreen")
)
# Corrgram of the `airlines` variables: panel.shade in the lower triangle,
# panel.conf in the upper triangle, panel.txt for the text panel.
corrgram(
  airlines,
  order = TRUE,
  col.regions = cols,
  lower.panel = panel.shade,
  upper.panel = panel.conf,
  text.panel = panel.txt,
  main = "A Corrgram for airlines correlations"
)

Q1e. Write R code to generate the following visual
library("PerformanceAnalytics")
# Correlation chart of the `airlines` variables with histograms switched on
# and solid points (pch = 19).
chart.Correlation(airlines, histogram = TRUE, pch = 19)

Q2a. Test whether the mean ticket price of Mumbai-to-Delhi flights is more than INR 5000
# Prices of flights that depart from BOM and arrive at DEL.
# which() keeps only rows where the condition is TRUE (rows with NA codes are
# dropped rather than turned into NA prices).
airline_price <- airline.df[
  which(airline.df$DepartureCityCode == "BOM" &
          airline.df$ArrivalCityCode == "DEL"),
  "Price"
]
# One-sample t-test of H0: mean price <= 5000 against H1: mean price > 5000.
# The question is directional ("more than INR 5000"), so the alternative must
# be "greater"; the default two-sided alternative tests "not equal to 5000".
# NOTE(review): the "##" transcript that follows was recorded from the
# two-sided call and has to be regenerated after this change.
t.test(airline_price, mu = 5000, alternative = "greater")
##
## One Sample t-test
##
## data: airline_price
## t = 6.0784, df = 129, p-value = 1.277e-08
## alternative hypothesis: true mean is not equal to 5000
## 95 percent confidence interval:
## 5844.506 6659.601
## sample estimates:
## mean of x
## 6252.054
Q2b. Test whether the ticket prices of morning flights are higher than those of afternoon flights
# Prices split by the Departure flag: "AM" rows are treated as morning flights,
# "PM" rows as afternoon flights.
morning_flights <- airline.df[which(airline.df$Departure == "AM"), "Price"]
afternoon_flights <- airline.df[which(airline.df$Departure == "PM"), "Price"]
# Unpaired two-sample t-test of H1: mean(morning) > mean(afternoon).
# "Costlier than" is a directional claim, so alternative = "greater" is needed;
# the default two-sided test only asks whether the means differ.
# NOTE(review): the "##" transcript that follows was recorded from the
# two-sided call and has to be regenerated after this change.
t.test(morning_flights, afternoon_flights,
       alternative = "greater", paired = FALSE)
##
## Welch Two Sample t-test
##
## data: morning_flights and afternoon_flights
## t = 1.736, df = 296.58, p-value = 0.08359
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -61.22997 977.79637
## sample estimates:
## mean of x mean of y
## 5598.893 5140.610
Q2c. Test whether the ticket price at Diwali is higher compared to non-Diwali.
# Diwali group: rows flagged as Diwali, the day before, or the day after.
diwali_flights <- airline.df[
  which(airline.df$IsDiwali == 1 |
          airline.df$DayBeforeDiwali == 1 |
          airline.df$DayAfterDiwali == 1),
  "Price"
]
# Non-Diwali group: rows where all three flags are 0.
non_diwali_flights <- airline.df[
  which(airline.df$IsDiwali == 0 &
          airline.df$DayBeforeDiwali == 0 &
          airline.df$DayAfterDiwali == 0),
  "Price"
]
# Unpaired two-sample t-test of H1: mean(Diwali) > mean(non-Diwali).
# The question asks whether Diwali prices are higher, so the alternative must
# be "greater" rather than the default two-sided one.
# NOTE(review): the "##" transcript that follows was recorded from the
# two-sided call and has to be regenerated after this change.
t.test(diwali_flights, non_diwali_flights,
       alternative = "greater", paired = FALSE)
##
## Welch Two Sample t-test
##
## data: diwali_flights and non_diwali_flights
## t = 2.9799, df = 244.52, p-value = 0.003174
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 282.6141 1384.7250
## sample estimates:
## mean of x mean of y
## 5897.479 5063.810
Q2d. Test whether the ticket price for Air India flights is higher than for IndiGo flights.
# Prices for the two carriers being compared, selected on the Airline column.
air_india_flights <- airline.df[which(airline.df$Airline == "Air India"),
                                "Price"]
indigo_flights <- airline.df[which(airline.df$Airline == "IndiGo"), "Price"]
# Unpaired two-sample t-test of H1: mean(Air India) > mean(IndiGo).
# "Costlier than" is directional, so alternative = "greater" replaces the
# default two-sided alternative.
# NOTE(review): the "##" transcript that follows was recorded from the
# two-sided call and has to be regenerated after this change.
t.test(air_india_flights, indigo_flights,
       alternative = "greater", paired = FALSE)
##
## Welch Two Sample t-test
##
## data: air_india_flights and indigo_flights
## t = 2.7205, df = 87.71, p-value = 0.007859
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 392.2247 2518.7253
## sample estimates:
## mean of x mean of y
## 6335.000 4879.525
Q3a. Regress Price on Advanced Booking Days. Write R code to construct a simple linear regression model and output the summary of the model.
# Per-column descriptive statistics for the full data frame.
describe(x = airline.df)
## vars n mean sd median trimmed mad
## FlightNumber* 1 305 31.86 18.35 32.00 31.82 23.72
## Airline* 2 305 2.60 0.88 3.00 2.62 1.48
## DepartureCityCode* 3 305 1.57 0.50 2.00 1.59 0.00
## ArrivalCityCode* 4 305 1.43 0.50 1.00 1.41 0.00
## DepartureTime 5 305 1249.54 579.86 1035.00 1237.63 593.04
## ArrivalTime 6 305 1329.31 613.52 1215.00 1347.73 585.63
## Departure* 7 305 1.45 0.50 1.00 1.43 0.00
## FlyingMinutes 8 305 136.03 4.71 135.00 135.80 7.41
## Aircraft* 9 305 1.54 0.50 2.00 1.55 0.00
## PlaneModel* 10 305 3.82 2.71 3.00 3.63 2.97
## Capacity 11 305 176.36 32.39 180.00 172.19 14.83
## SeatPitch 12 305 30.26 0.93 30.00 30.17 0.00
## SeatWidth 13 305 17.41 0.49 17.00 17.38 0.00
## DataCollectionDate* 14 305 4.36 1.98 5.00 4.45 1.48
## DateDeparture* 15 305 8.14 6.69 7.00 7.64 8.90
## IsWeekend* 16 305 1.13 0.34 1.00 1.04 0.00
## Price 17 305 5394.54 2388.29 4681.00 4984.80 1052.65
## AdvancedBookingDays 18 305 28.90 22.30 30.00 28.50 34.10
## IsDiwali 19 305 0.40 0.49 0.00 0.37 0.00
## DayBeforeDiwali 20 305 0.19 0.40 0.00 0.12 0.00
## DayAfterDiwali 21 305 0.20 0.40 0.00 0.13 0.00
## MetroDeparture 22 305 1.00 0.00 1.00 1.00 0.00
## MetroArrival 23 305 1.00 0.00 1.00 1.00 0.00
## MarketShare 24 305 21.18 11.04 15.40 19.90 3.11
## LoadFactor 25 305 85.13 4.32 83.32 84.82 5.75
## min max range skew kurtosis se
## FlightNumber* 1.00 63.00 62.00 0.01 -1.22 1.05
## Airline* 1.00 4.00 3.00 -0.32 -0.61 0.05
## DepartureCityCode* 1.00 2.00 1.00 -0.30 -1.92 0.03
## ArrivalCityCode* 1.00 2.00 1.00 0.30 -1.92 0.03
## DepartureTime 225.00 2320.00 2095.00 0.22 -1.29 33.20
## ArrivalTime 20.00 2345.00 2325.00 -0.07 -0.76 35.13
## Departure* 1.00 2.00 1.00 0.22 -1.96 0.03
## FlyingMinutes 125.00 145.00 20.00 0.28 -0.33 0.27
## Aircraft* 1.00 2.00 1.00 -0.16 -1.98 0.03
## PlaneModel* 1.00 9.00 8.00 0.23 -1.60 0.16
## Capacity 138.00 303.00 165.00 2.11 5.91 1.85
## SeatPitch 29.00 33.00 4.00 1.03 0.61 0.05
## SeatWidth 17.00 18.00 1.00 0.37 -1.86 0.03
## DataCollectionDate* 1.00 7.00 6.00 -0.38 -1.15 0.11
## DateDeparture* 1.00 20.00 19.00 0.40 -1.38 0.38
## IsWeekend* 1.00 2.00 1.00 2.13 2.56 0.02
## Price 2607.00 18015.00 15408.00 2.26 6.41 136.75
## AdvancedBookingDays 2.00 61.00 59.00 0.03 -1.68 1.28
## IsDiwali 0.00 1.00 1.00 0.42 -1.83 0.03
## DayBeforeDiwali 0.00 1.00 1.00 1.54 0.39 0.02
## DayAfterDiwali 0.00 1.00 1.00 1.47 0.15 0.02
## MetroDeparture 1.00 1.00 0.00 NaN NaN 0.00
## MetroArrival 1.00 1.00 0.00 NaN NaN 0.00
## MarketShare 13.20 39.60 26.40 1.05 -0.86 0.63
## LoadFactor 78.73 94.06 15.33 0.71 0.06 0.25
# Put the columns of airline.df on the search path so the plot call below can
# refer to them by bare name.
attach(airline.df)
# Scatter plot with Price on the x-axis and AdvancedBookingDays on the y-axis.
plot(x = Price, y = AdvancedBookingDays,
     xlab = "Price", ylab = "AdvancedBookingDays")

# Simple linear regression of Price on AdvancedBookingDays.
fit <- lm(formula = Price ~ AdvancedBookingDays, data = airline.df)
summary(fit)
##
## Call:
## lm(formula = Price ~ AdvancedBookingDays, data = airline.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2786.5 -1320.8 -688.9 351.2 12594.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5422.959 224.497 24.16 <2e-16 ***
## AdvancedBookingDays -0.983 6.154 -0.16 0.873
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2392 on 303 degrees of freedom
## Multiple R-squared: 8.422e-05, Adjusted R-squared: -0.003216
## F-statistic: 0.02552 on 1 and 303 DF, p-value: 0.8732
Q3b. Regress Price on Advanced Booking Days and IsDiwali. Write R code to construct a multiple linear regression model and output the summary of the model
# airline.df is not attached again here: attaching the same data frame a second
# time only stacks a duplicate copy on the search path and emits
# "objects are masked" messages. The columns are taken from airline.df
# explicitly instead, so this step does not depend on the search path at all.
plot(airline.df$Price, airline.df$IsDiwali, xlab = "Price", ylab = "IsDiwali")

# Multiple linear regression of Price on AdvancedBookingDays and IsDiwali.
fit <- lm(Price ~ AdvancedBookingDays + IsDiwali, data = airline.df)
summary(fit)
##
## Call:
## lm(formula = Price ~ AdvancedBookingDays + IsDiwali, data = airline.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3222.7 -1210.4 -498.1 738.3 11972.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6225.95 229.19 27.165 < 2e-16 ***
## AdvancedBookingDays -91.77 12.88 -7.126 7.64e-12 ***
## IsDiwali 4590.92 585.99 7.834 8.08e-14 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2184 on 302 degrees of freedom
## Multiple R-squared: 0.169, Adjusted R-squared: 0.1635
## F-statistic: 30.7 on 2 and 302 DF, p-value: 7.264e-13