# Variable Explanation
# Following are the variables involved in dataset concerning Airline Ticket Prices:
# • FlightNumber (Factor Variable): These are unique numbers assigned for varied flight routes & airline
# • Airline (Factor Variable): Dataset consists of four major airlines, with a collective market share of over 80% in the domestic Indian market. These airlines are Air India, IndiGo, Jet Airways & Spicejet
# • DepartureCity (Factor Variable): Dataset consists of 79 different departure cities
# • ArrivalCity (Factor Variable): Dataset consists of 76 different arrival cities
# • DepartureTime (Continuous Variable): Departure time is an important variable, since it may have influence on ticket prices. Prices may follow a pattern based on departure time on weekdays and weekends
# • ArrivalTime (Continuous Variable): Arrival time is an important variable, since it may have influence on ticket prices. Prices may follow a pattern based on Arrival time on weekdays and weekends
# • Departure (Factor Variable): Consists of only two unique entries – AM & PM. Departure, clubbed with DepartureTime may have a strong pattern of ticket price
# • Fly (Factor Variable): This is an important variable with four levels. These are combinations of departure & arrival city being Metro & Non-Metro. A metro-to-metro flight is expected to be costliest while non metro-to-non metro cheapest
# • FlyingTime (Continuous Variable): Flying time may be one of the most important factors with cost & ticket price implication. Higher flying time indicates higher distance & hence, fuel consumption
# • Aircraft (Factor Variable): These are type of airplanes with some implication on seating capacity, arrangement, size etc.
# • PlaneModel (Factor Variable): Within aircraft, PlaneModel indicates different models
# • Capacity (Continuous Variable): Number of seats in airplane: Higher number of seats may tend to lower down price per ticket (economies of scale)
# • SeatPitch (Continuous Variable): It is distance between two consecutive seats (back and front). Higher the SeatPitch, higher will be the legroom. It may have a small impact on airline ticket price
# • SeatWidth (Continuous Variable): It is distance between one armrest to the other
# • DataCollectionDate (Factor Variable): All entries in dataset were collected in a period of 7 days, 18th Oct 2018 – 24th Oct 2018
# • DateDeparture (Factor Variable): Date on which the flight departs; the dataset contains 29 distinct departure dates
# • DayDeparture (Factor Variable): This variable has seven days in a week as its entries.
# • Weekend (Factor Variable): This variable has only two entries, yes or no. On weekends, since traffic is higher and demand is greater, prices are expected to be higher
# • Price (Continuous Variable): Dependent variable
# • AdvBookDays (Continuous Variable): Difference between departure date and date of booking
# • Diwali (Factor Variable): Diwali festival in India stimulates high air travel, hence higher demand. Therefore, ticket price on Diwali could be much higher than any regular day
# • DayBeforeDiwali (Factor Variable): Diwali festival in India stimulates high air travel, hence higher demand. Therefore, ticket price one day before Diwali could be much higher than any regular day
# • DayAfterDiwali (Factor Variable): Diwali festival in India stimulates high air travel, hence higher demand. Therefore, ticket price one day after Diwali could be much higher than any regular day
# Echo the source of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

# Load the airline ticket data. stringsAsFactors = TRUE is spelled out because
# the read.csv() default became FALSE in R 4.0.0, while the str() output
# recorded below shows the text columns as factors.
df <- read.csv("FourIndianAirlinesData.csv", stringsAsFactors = TRUE)
colnames(df)
## [1] "FlightNumber" "Airline" "DepartureCity"
## [4] "ArrivalCity" "DepartureTime" "ArrivalTime"
## [7] "Departure" "Fly" "FlyingTime"
## [10] "Aircraft" "PlaneModel" "Capacity"
## [13] "SeatPitch" "SeatWidth" "DataCollectionDate"
## [16] "DateDeparture" "DayDeparture" "Weekend"
## [19] "Price" "AdvBookDays" "Diwali"
## [22] "DayBeforeDiwali" "DayAfterDiwali"
# NOTE(review): attach() is retained so that any statement referring to a
# column by bare name (e.g. Price) keeps resolving; prefer df$col or
# with(df, ...) in new code.
attach(df)
str(df)
## 'data.frame': 8187 obs. of 23 variables:
## $ FlightNumber : Factor w/ 1765 levels "6E 101","6E 102",..: 203 301 404 889 934 1146 1255 1358 1514 790 ...
## $ Airline : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 3 3 3 1 1 1 2 ...
## $ DepartureCity : Factor w/ 79 levels "Agartala","Agatti",..: 54 11 54 4 51 51 51 63 68 29 ...
## $ ArrivalCity : Factor w/ 76 levels "Agartala","Ahmedabad",..: 13 66 61 48 8 51 24 51 51 2 ...
## $ DepartureTime : int 2250 820 1810 2110 305 900 200 730 1225 500 ...
## $ ArrivalTime : int 100 945 2000 2255 445 1125 320 910 205 705 ...
## $ Departure : Factor w/ 2 levels "AM","PM": 2 1 2 2 1 1 1 1 2 1 ...
## $ Fly : Factor w/ 4 levels "MM","MN","NM",..: 2 4 2 3 2 1 2 3 3 4 ...
## $ FlyingTime : int 130 85 110 90 100 145 80 100 100 125 ...
## $ Aircraft : Factor w/ 4 levels "Aerospatiale",..: 2 2 2 3 3 3 2 2 2 2 ...
## $ PlaneModel : Factor w/ 122 levels "-"," Boeing 737-800 (738)",..: 68 68 68 109 111 111 70 60 70 61 ...
## $ Capacity : int 180 180 180 168 168 168 182 144 182 180 ...
## $ SeatPitch : num 30 30 30 30 30 30 31.5 29.5 31.5 30 ...
## $ SeatWidth : num 18 18 18 17 17 17 17.5 17.8 17.5 18 ...
## $ DataCollectionDate: Factor w/ 7 levels "Oct 18 2018",..: 6 6 6 6 6 6 6 6 6 3 ...
## $ DateDeparture : Factor w/ 29 levels "Nov 03 2018",..: 22 22 22 22 22 22 22 22 22 20 ...
## $ DayDeparture : Factor w/ 7 levels "Friday","Monday",..: 6 6 6 6 6 6 6 6 6 2 ...
## $ Weekend : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ Price : int 6623 4051 6623 15297 9405 12188 1892 11134 10454 5625 ...
## $ AdvBookDays : int 1 1 1 1 1 1 1 1 1 2 ...
## $ Diwali : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ DayBeforeDiwali : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
## $ DayAfterDiwali : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
# Histograms of the three numeric variables.
hist(
  df$Price,
  col = "gray",
  xlab = "Price",
  main = "Histogram of variable Price"
)
hist(
  df$FlyingTime,
  col = "Blue",
  xlab = "FlyingTime",
  main = "Histogram of variable FlyingTime"
)
hist(
  df$AdvBookDays,
  col = "gray",
  xlab = "AdvBookDays",
  main = "Histogram of variable AdvBookDays"
)

# Horizontal box plots of the same three variables.
boxplot(
  df$Price,
  width = 0.5,
  horizontal = TRUE,
  col = "lightblue",
  xlab = "Price",
  main = "boxplot for variable Price"
)
boxplot(
  df$FlyingTime,
  width = 0.5,
  horizontal = TRUE,
  col = "gray",
  xlab = "FlyingTime",
  main = "boxplot for variable FlyingTime"
)
boxplot(
  df$AdvBookDays,
  width = 0.5,
  horizontal = TRUE,
  col = "lightblue",
  xlab = "AdvBookDays",
  main = "boxplot for variable AdvBookDays"
)
# Pie charts for Airline and Departure
# Airline: share of records per airline, labelled with rounded percentages.
slices <- c(1543, 3811, 1905, 928)
pct <- round(slices / sum(slices) * 100)
# Build "<airline> <pct>%" labels in a single step.
lbls <- paste0(
  c("Air India", "IndiGo", "Jet Airways", "Spice Jet"),
  " ", pct, "%"
)
pie(
  slices,
  labels = lbls,
  col = rainbow(length(lbls)),
  main = "Pie Chart of Airlines"
)
# Departure: share of records departing AM vs PM, labelled with rounded
# percentages.
# The counts are the per-group n reported by describeBy(Price, Departure)
# (4802 AM, 3385 PM; 8187 rows in total). The previous values c(874, 8187)
# used the total row count as the PM slice and so misstated both shares.
slices <- c(4802, 3385)
lbls <- c("AM", "PM")
pct <- round(slices / sum(slices) * 100)
lbls <- paste0(lbls, " ", pct, "%") # "<label> <pct>%"
pie(
  slices,
  labels = lbls,
  col = rainbow(length(lbls)),
  main = "Pie Chart of Departure"
)
library(psych)
# Price summary per airline; the column subset keeps group, n, mean, sd,
# median, min, max and range. Columns are looked up in df explicitly instead
# of through the attach()ed search path.
with(df, describeBy(Price, Airline, mat = TRUE))[, c(2, 4:7, 10:12)]
## group1 n mean sd median min max range
## X11 Air India 1543 6712.027 4455.835 5490 1025 32003 30978
## X12 IndiGo 3811 4979.133 2595.443 4355 637 43454 42817
## X13 Jet Airways 1905 6134.127 3126.986 5569 1145 38560 37415
## X14 Spice Jet 928 4815.122 2435.603 4207 1299 21639 20340
# Price summary for AM vs PM departures (group, n, mean, sd, median, min,
# max, range), reading the columns from df rather than the attach()ed copy.
with(df, describeBy(Price, Departure, mat = TRUE))[, c(2, 4:7, 10:12)]
## group1 n mean sd median min max range
## X11 AM 4802 5686.845 3495.559 4780 1025 43454 42429
## X12 PM 3385 5370.117 2772.478 4775 637 32003 31366
# Price summary for weekend vs non-weekend departures (group, n, mean, sd,
# median, min, max, range), reading the columns from df explicitly.
with(df, describeBy(Price, Weekend, mat = TRUE))[, c(2, 4:7, 10:12)]
## group1 n mean sd median min max range
## X11 No 6774 5500.027 3267.063 4671 637 43454 42817
## X12 Yes 1413 5823.706 2970.390 5363 999 32003 31004
# Price summary per departure weekday (group, n, mean, sd, median, min, max,
# range), reading the columns from df explicitly.
with(df, describeBy(Price, DayDeparture, mat = TRUE))[, c(2, 4:7, 10:12)]
## group1 n mean sd median min max range
## X11 Friday 118 5815.271 2884.999 5434.0 2134 25025 22891
## X12 Monday 1298 6189.549 3564.124 5484.5 999 38560 37561
## X13 Saturday 624 5566.455 2289.683 5240.5 999 20422 19423
## X14 Sunday 789 6027.160 3401.674 5417.0 1529 32003 30474
## X15 Thursday 1786 5243.237 3065.742 4361.0 637 22238 21601
## X16 Tuesday 2761 5353.585 3335.518 4511.0 974 43454 42480
## X17 Wednesday 811 5414.641 2843.221 4750.0 1025 21992 20967
# Five-number summary plus mean of Price for each Weekend level, evaluated
# against df rather than the attach()ed columns.
with(df, tapply(Price, Weekend, summary))
## $No
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 637 3380 4671 5500 6890 43454
##
## $Yes
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 999 4051 5363 5824 6954 32003
# Five-number summary plus mean of Price for AM and PM departures, evaluated
# against df rather than the attach()ed columns.
with(df, tapply(Price, Departure, summary))
## $AM
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1025 3491 4780 5687 6989 43454
##
## $PM
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 637 3518 4775 5370 6753 32003
# Five-number summary plus mean of Price for each Fly level (MM, MN, NM, NN),
# evaluated against df rather than the attach()ed columns.
with(df, tapply(Price, Fly, summary))
## $MM
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2703 4440 6435 6633 7791 43454
##
## $MN
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 637 3782 4900 5729 6932 31931
##
## $NM
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1336 3579 4750 5594 6756 32003
##
## $NN
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 974 2817 3994 4666 5776 38560
# Five-number summary plus mean of Price for each departure weekday,
# evaluated against df rather than the attach()ed columns.
with(df, tapply(Price, DayDeparture, summary))
## $Friday
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2134 4065 5434 5815 6958 25025
##
## $Monday
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 999 4037 5484 6190 7410 38560
##
## $Saturday
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 999 4067 5240 5566 6885 20422
##
## $Sunday
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1529 4043 5417 6027 7046 32003
##
## $Thursday
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 637 3203 4361 5243 6418 22238
##
## $Tuesday
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 974 3209 4511 5354 6623 43454
##
## $Wednesday
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1025 3585 4750 5415 6934 21992
# Five-number summary plus mean of Price for each airline, evaluated against
# df rather than the attach()ed columns.
with(df, tapply(Price, Airline, summary))
## $`Air India`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1025 4099 5490 6712 7695 32003
##
## $IndiGo
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 637 3208 4355 4979 6361 43454
##
## $`Jet Airways`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1145 4052 5569 6134 7488 38560
##
## $`Spice Jet`
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1299 3156 4207 4815 5923 21639
# Boxplots
library(ggpubr)
## Loading required package: ggplot2
##
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
##
## %+%, alpha
## Loading required package: magrittr
# Box plots of Price against each categorical grouping variable.
ggplot(data = df, mapping = aes(x = Weekend, y = Price)) +
  geom_boxplot()
ggplot(data = df, mapping = aes(x = Airline, y = Price)) +
  geom_boxplot()
ggplot(data = df, mapping = aes(x = Fly, y = Price)) +
  geom_boxplot()
ggplot(data = df, mapping = aes(x = Departure, y = Price)) +
  geom_boxplot()
ggplot(data = df, mapping = aes(x = DayDeparture, y = Price)) +
  geom_boxplot()
library(gplots)
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Mean Price per group for each categorical variable. The formula variables
# are taken from df via the data argument instead of relying on attach().
plotmeans(Price ~ Weekend, data = df, cex = 0.8)
plotmeans(Price ~ Airline, data = df, cex = 0.8)
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
plotmeans(Price ~ Fly, data = df, cex = 0.8)
plotmeans(Price ~ Departure, data = df, cex = 0.8)
plotmeans(Price ~ DayDeparture, data = df, cex = 0.8)
# Pearson correlation between Price and AdvBookDays, then its significance
# test. with(df, ...) reads the columns from df instead of the attach()ed
# copies while keeping the bare names shown in the test's "data:" line.
with(df, cor(Price, AdvBookDays, method = "pearson"))
## [1] -0.2401463
with(df, cor.test(Price, AdvBookDays, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: Price and AdvBookDays
## t = -22.381, df = 8185, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2604534 -0.2196269
## sample estimates:
## cor
## -0.2401463
# Pearson correlation between Price and FlyingTime, then its significance
# test. with(df, ...) reads the columns from df instead of the attach()ed
# copies while keeping the bare names shown in the test's "data:" line.
with(df, cor(Price, FlyingTime, method = "pearson"))
## [1] 0.3143604
with(df, cor.test(Price, FlyingTime, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: Price and FlyingTime
## t = 29.959, df = 8185, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2947053 0.3337496
## sample estimates:
## cor
## 0.3143604
# NOTE(review): this repeats the Price / FlyingTime correlation and test that
# were already run; possibly a different pair (e.g. FlyingTime vs
# AdvBookDays) was intended here -- confirm before changing.
# with(df, ...) reads the columns from df instead of the attach()ed copies.
with(df, cor(Price, FlyingTime, method = "pearson"))
## [1] 0.3143604
with(df, cor.test(Price, FlyingTime, method = "pearson"))
##
## Pearson's product-moment correlation
##
## data: Price and FlyingTime
## t = 29.959, df = 8185, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2947053 0.3337496
## sample estimates:
## cor
## 0.3143604
library(ggplot2)
library(gdata)
## gdata: Unable to locate valid perl interpreter
## gdata:
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata:
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
##
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
##
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(reshape2)
# Scatter plots of Price against each numeric predictor; each plot object is
# stored and then printed.
g1 <- ggplot(data = df, mapping = aes(x = AdvBookDays, y = Price)) +
  geom_point()
g1
g2 <- ggplot(data = df, mapping = aes(x = FlyingTime, y = Price)) +
  geom_point()
g2
# Mean Price for every Airline x DayDeparture combination, with the airline
# names copied from the row names into a leading "Airline" column.
t1 <- tapply(df$Price, list(df$Airline, df$DayDeparture), mean)
t1 <- as.data.frame(t1)
t1 <- cbind(Airline = row.names(t1), t1)
t1
## Airline Friday Monday Saturday Sunday Thursday
## Air India Air India 8452.333 6940.439 6550.514 8271.277 6301.280
## IndiGo IndiGo 5482.904 5770.741 5261.912 5254.350 4597.084
## Jet Airways Jet Airways 5240.409 6476.875 5953.511 6138.136 5951.884
## Spice Jet Spice Jet 4708.826 5941.024 4596.714 5355.231 4414.737
## Tuesday Wednesday
## Air India 6710.780 5763.346
## IndiGo 4742.719 4769.169
## Jet Airways 6058.924 6395.697
## Spice Jet 4487.682 4731.229
# Mean Price for every Airline x Weekend combination, with the airline names
# copied from the row names into a leading "Airline" column.
t2 <- tapply(df$Price, list(df$Airline, df$Weekend), mean)
t2 <- as.data.frame(t2)
t2 <- cbind(Airline = row.names(t2), t2)
t2
## Airline No Yes
## Air India Air India 6536.871 7576.354
## IndiGo IndiGo 4916.839 5257.935
## Jet Airways Jet Airways 6148.546 6050.439
## Spice Jet Spice Jet 4744.386 5115.249
# Mean Price for every Airline x Fly combination, with the airline names
# copied from the row names into a leading "Airline" column.
t3 <- tapply(df$Price, list(df$Airline, df$Fly), mean)
t3 <- as.data.frame(t3)
t3 <- cbind(Airline = row.names(t3), t3)
t3
## Airline MM MN NM NN
## Air India Air India 7924.236 6726.250 6524.852 5376.669
## IndiGo IndiGo 5838.798 5007.977 5106.394 4572.920
## Jet Airways Jet Airways 7104.712 6335.525 5834.895 5079.274
## Spice Jet Spice Jet 5543.304 4989.598 4768.590 4410.424
# Mean Price for every Airline x Departure (AM/PM) combination, with the
# airline names copied from the row names into a leading "Airline" column.
t4 <- tapply(df$Price, list(df$Airline, df$Departure), mean)
t4 <- as.data.frame(t4)
t4 <- cbind(Airline = row.names(t4), t4)
t4
## Airline AM PM
## Air India Air India 6701.908 6812.645
## IndiGo IndiGo 5144.172 4854.327
## Jet Airways Jet Airways 6018.686 6234.288
## Spice Jet Spice Jet 4741.458 6007.389
# Reshape each mean-price table to long form (Airline, category, Price) and
# draw dodged bars: one bar per airline within each category.

# By departure weekday.
data_t1 <- melt(t1, id.vars = "Airline")
names(data_t1)[2:3] <- c("DayDeparture", "Price")
g3 <- ggplot(data_t1, aes(x = DayDeparture, y = Price, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge")
g3

# By weekend flag.
data_t2 <- melt(t2, id.vars = "Airline")
names(data_t2)[2:3] <- c("Weekend", "Price")
g4 <- ggplot(data_t2, aes(x = Weekend, y = Price, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge")
g4

# By metro / non-metro route type.
data_t3 <- melt(t3, id.vars = "Airline")
names(data_t3)[2:3] <- c("Fly", "Price")
g5 <- ggplot(data_t3, aes(x = Fly, y = Price, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge")
g5

# By AM / PM departure.
data_t4 <- melt(t4, id.vars = "Airline")
names(data_t4)[2:3] <- c("Departure", "Price")
g6 <- ggplot(data_t4, aes(x = Departure, y = Price, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge")
g6
library(corrgram)
## Registered S3 method overwritten by 'seriation':
## method from
## reorder.hclust gclus
# Correlation matrix of the three numeric variables, computed on complete
# rows only and printed to three decimals.
airlineSubset <- df[, c("FlyingTime", "Price", "AdvBookDays")]
corMat <- cor(airlineSubset, use = "complete.obs")
round(corMat, digits = 3)
## FlyingTime Price AdvBookDays
## FlyingTime 1.000 0.314 -0.004
## Price 0.314 1.000 -0.240
## AdvBookDays -0.004 -0.240 1.000
library(corrplot)
## corrplot 0.84 loaded
# Circle-style plot of the correlation matrix. corMat is reused so the plot
# shows the same complete-rows correlations that were printed, instead of
# recomputing them with different missing-value handling.
corrplot(corMat, method = "circle")

# Corrgram of the same variables: shaded cells below the diagonal, confidence
# intervals above, variable names on the diagonal. The stray trailing comma
# (an empty argument) after main has been removed.
corrgram(
  df[, c("Price", "FlyingTime", "AdvBookDays")],
  lower.panel = panel.shade,
  upper.panel = panel.conf,
  text.panel = panel.txt,
  main = "corrgram"
)
library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
##
## Attaching package: 'xts'
## The following objects are masked from 'package:gdata':
##
## first, last
##
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
##
## textplot
## The following object is masked from 'package:graphics':
##
## legend
# Correlation chart for Price, FlyingTime and AdvBookDays.
corr_columns <- c("Price", "FlyingTime", "AdvBookDays")
chart.Correlation(df[, corr_columns])