# Load the airline pricing data set from a CSV file given by a relative path
# (resolved against the current working directory).
data.df <- read.csv("AirlinePricingData.csv")
# Open the spreadsheet-style viewer only in interactive sessions: View() needs
# a GUI and can fail when the script is run non-interactively.
if (interactive()) {
  View(data.df)
}
# List the column names available for the analysis below.
colnames(data.df)
## [1] "FlightNumber" "Airline" "DepartureCityCode"
## [4] "ArrivalCityCode" "DepartureTime" "ArrivalTime"
## [7] "Departure" "FlyingMinutes" "Aircraft"
## [10] "PlaneModel" "Capacity" "SeatPitch"
## [13] "SeatWidth" "DataCollectionDate" "DateDeparture"
## [16] "IsWeekend" "Price" "AdvancedBookingDays"
## [19] "IsDiwali" "DayBeforeDiwali" "DayAfterDiwali"
## [22] "MetroDeparture" "MetroArrival" "MarketShare"
## [25] "LoadFactor"
# Keep only the six columns whose pairwise relationships are examined.
Var <- data.df[
  c(
    "Price", "AdvancedBookingDays", "FlyingMinutes",
    "Capacity", "SeatPitch", "SeatWidth"
  )
]
# Pairwise correlation matrix; "pearson" is cor()'s default method, spelled out.
cor(Var, method = "pearson")
## Price AdvancedBookingDays FlyingMinutes
## Price 1.000000000 -0.009177029 -0.018219539
## AdvancedBookingDays -0.009177029 1.000000000 0.005109801
## FlyingMinutes -0.018219539 0.005109801 1.000000000
## Capacity -0.025983460 -0.008817351 -0.320284501
## SeatPitch 0.074540632 -0.014181686 -0.034175640
## SeatWidth -0.059567815 0.054692981 -0.182982214
## Capacity SeatPitch SeatWidth
## Price -0.025983460 0.07454063 -0.05956782
## AdvancedBookingDays -0.008817351 -0.01418169 0.05469298
## FlyingMinutes -0.320284501 -0.03417564 -0.18298221
## Capacity 1.000000000 0.50652704 0.45303789
## SeatPitch 0.506527044 1.00000000 0.31946224
## SeatWidth 0.453037895 0.31946224 1.00000000
# Select the same six columns again as a data frame for the Hmisc analysis.
Var <- data.df[
  ,
  c(
    "Price", "AdvancedBookingDays", "FlyingMinutes",
    "Capacity", "SeatPitch", "SeatWidth"
  ),
  drop = FALSE
]
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Correlation matrix together with the sample size and p-values. The data
# frame is converted to a matrix before being passed to rcorr(); "pearson" is
# the default correlation type, written out explicitly.
rcorr(as.matrix(Var), type = "pearson")
## Price AdvancedBookingDays FlyingMinutes Capacity
## Price 1.00 -0.01 -0.02 -0.03
## AdvancedBookingDays -0.01 1.00 0.01 -0.01
## FlyingMinutes -0.02 0.01 1.00 -0.32
## Capacity -0.03 -0.01 -0.32 1.00
## SeatPitch 0.07 -0.01 -0.03 0.51
## SeatWidth -0.06 0.05 -0.18 0.45
## SeatPitch SeatWidth
## Price 0.07 -0.06
## AdvancedBookingDays -0.01 0.05
## FlyingMinutes -0.03 -0.18
## Capacity 0.51 0.45
## SeatPitch 1.00 0.32
## SeatWidth 0.32 1.00
##
## n= 305
##
##
## P
## Price AdvancedBookingDays FlyingMinutes Capacity
## Price 0.8732 0.7513 0.6513
## AdvancedBookingDays 0.8732 0.9292 0.8781
## FlyingMinutes 0.7513 0.9292 0.0000
## Capacity 0.6513 0.8781 0.0000
## SeatPitch 0.1942 0.8052 0.5521 0.0000
## SeatWidth 0.2998 0.3411 0.0013 0.0000
## SeatPitch SeatWidth
## Price 0.1942 0.2998
## AdvancedBookingDays 0.8052 0.3411
## FlyingMinutes 0.5521 0.0013
## Capacity 0.0000 0.0000
## SeatPitch 0.0000
## SeatWidth 0.0000
library(corrgram)
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
##
## panel.fill
# Correlogram of the selected variables: panel.conf below the diagonal,
# panel.pie above it, and panel.txt for the variable labels on the diagonal.
# order = TRUE lets corrgram reorder the variables before plotting.
corrgram(
  Var,
  order = TRUE,
  lower.panel = panel.conf,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Variable intercorrelations"
)
library("PerformanceAnalytics")
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
##
## legend
# Correlation chart of the selected variables with histograms enabled;
# pch = 19 draws the scatterplot points as solid circles.
PerformanceAnalytics::chart.Correlation(
  Var,
  histogram = TRUE,
  pch = 19
)
# Keep only the flights departing from BOM and arriving at DEL.
price1 <- data.df[
  data.df$DepartureCityCode == "BOM" & data.df$ArrivalCityCode == "DEL",
]
library(psych)
# Hmisc (attached earlier in this script) also exports describe(); call the
# psych version explicitly so the result does not depend on attach order.
psych::describe(price1$Price)
# Mean ticket price over all flights. Column names are case-sensitive: the
# column is `Price`; `data.df$price` does not exist and yields NULL, which made
# mean() return NA with an "argument is not numeric or logical" warning.
m <- mean(data.df$Price)
# One-sample, one-sided t-test of H0: mean price = 5000 against
# H1: mean price > 5000.
t.test(
  x = data.df$Price,
  alternative = "greater",
  mu = 5000
)
##
## One Sample t-test
##
## data: data.df$Price
## t = 2.8851, df = 304, p-value = 0.002096
## alternative hypothesis: true mean is greater than 5000
## 95 percent confidence interval:
## 5168.918 Inf
## sample estimates:
## mean of x
## 5394.544
The p-value (0.002) is less than 0.05, so we reject the null hypothesis.
Thus, the mean price of flights is significantly higher than INR 5000.
# Split ticket prices by departure period; which() drops rows where the
# comparison is NA.
morning_flights <- data.df$Price[which(data.df$Departure == "AM")]
afternoon_flights <- data.df$Price[which(data.df$Departure == "PM")]
# Unpaired two-sample t-test for a difference in mean price (two-sided).
t.test(
  x = morning_flights,
  y = afternoon_flights,
  paired = FALSE
)
##
## Welch Two Sample t-test
##
## data: morning_flights and afternoon_flights
## t = 1.736, df = 296.58, p-value = 0.08359
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -61.22997 977.79637
## sample estimates:
## mean of x mean of y
## 5598.893 5140.610
The p-value (0.084) is greater than 0.05, so we fail to reject the null hypothesis.
Thus, there is no statistically significant difference in mean price between morning (AM) and afternoon (PM) flights.
# Prices of flights flagged as Diwali (IsDiwali == 1) versus the rest
# (IsDiwali == 0); which() drops rows where the flag is NA.
DiwaliY <- data.df$Price[which(data.df$IsDiwali == 1)]
DiwaliN <- data.df$Price[which(data.df$IsDiwali == 0)]
# Unpaired two-sample t-test for a difference in mean price (two-sided).
t.test(
  x = DiwaliY,
  y = DiwaliN,
  paired = FALSE
)
##
## Welch Two Sample t-test
##
## data: DiwaliY and DiwaliN
## t = 2.9799, df = 244.52, p-value = 0.003174
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 282.6141 1384.7250
## sample estimates:
## mean of x mean of y
## 5897.479 5063.810
The p-value (0.003) is less than 0.05, so we reject the null hypothesis.
Thus, mean prices differ between Diwali and non-Diwali periods.
# Prices of flights flagged as Diwali (IsDiwali == 1) versus the rest
# (IsDiwali == 0); which() drops rows where the flag is NA.
DiwaliY <- data.df$Price[which(data.df$IsDiwali == 1)]
DiwaliN <- data.df$Price[which(data.df$IsDiwali == 0)]
# One-sided unpaired t-test: is the mean Diwali price greater than the
# mean non-Diwali price (difference in means > 0)?
t.test(
  x = DiwaliY,
  y = DiwaliN,
  alternative = "greater",
  mu = 0,
  paired = FALSE
)
##
## Welch Two Sample t-test
##
## data: DiwaliY and DiwaliN
## t = 2.9799, df = 244.52, p-value = 0.001587
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 371.7482 Inf
## sample estimates:
## mean of x mean of y
## 5897.479 5063.810
The p-value (0.002) is less than 0.05, so we reject the null hypothesis.
Thus, the mean price during Diwali is significantly higher than outside Diwali.
# Ticket prices for the two airlines being compared; which() drops rows
# where the airline comparison is NA.
air_india_flights <- data.df$Price[which(data.df$Airline == "Air India")]
indigo_flights <- data.df$Price[which(data.df$Airline == "IndiGo")]
# Unpaired two-sample t-test for a difference in mean price (two-sided).
t.test(
  x = air_india_flights,
  y = indigo_flights,
  paired = FALSE
)
##
## Welch Two Sample t-test
##
## data: air_india_flights and indigo_flights
## t = 2.7205, df = 87.71, p-value = 0.007859
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## 392.2247 2518.7253
## sample estimates:
## mean of x mean of y
## 6335.000 4879.525
The p-value (0.008) is less than 0.05, so we reject the null hypothesis.
Thus, the mean prices of Air India and IndiGo flights differ.
# Ticket prices for the two airlines being compared; which() drops rows
# where the airline comparison is NA.
air_india_flights <- data.df$Price[which(data.df$Airline == "Air India")]
indigo_flights <- data.df$Price[which(data.df$Airline == "IndiGo")]
# One-sided unpaired t-test: is the mean Air India price greater than the
# mean IndiGo price (difference in means > 0)?
t.test(
  x = air_india_flights,
  y = indigo_flights,
  alternative = "greater",
  mu = 0,
  paired = FALSE
)
##
## Welch Two Sample t-test
##
## data: air_india_flights and indigo_flights
## t = 2.7205, df = 87.71, p-value = 0.00393
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 566.0833 Inf
## sample estimates:
## mean of x mean of y
## 6335.000 4879.525
The p-value (0.004) is less than 0.05, so we reject the null hypothesis.
Thus, the mean price of Air India flights is significantly higher than that of IndiGo flights.
# Reload the data set from the same CSV file used at the top of the script.
data.df <- read.csv("AirlinePricingData.csv")
# Simple linear regression of ticket price on the number of days the ticket
# was booked in advance.
fit <- lm(Price ~ AdvancedBookingDays, data = data.df)
# Print coefficients, residual summary, R-squared and the overall F-test.
summary(fit)
##
## Call:
## lm(formula = Price ~ AdvancedBookingDays, data = data.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2786.5 -1320.8 -688.9 351.2 12594.0
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5422.959 224.497 24.16 <2e-16 ***
## AdvancedBookingDays -0.983 6.154 -0.16 0.873
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2392 on 303 degrees of freedom
## Multiple R-squared: 8.422e-05, Adjusted R-squared: -0.003216
## F-statistic: 0.02552 on 1 and 303 DF, p-value: 0.8732