Variable Explanation

# Following are the variables involved in dataset concerning Airline Ticket Prices:
# • FlightNumber (Factor Variable): These are unique numbers assigned to the various flight routes & airlines
# • Airline (Factor Variable): Dataset consists of four major airlines, with a collective market share of over 80% in the domestic Indian market. These airlines are Air India, IndiGo, Jet Airways & Spicejet
# • DepartureCity (Factor Variable): Dataset consists of 79 different departure cities
# • ArrivalCity (Factor Variable): Dataset consists of 76 different arrival cities
# • DepartureTime (Continuous Variable): Departure time is an important variable, since it may have influence on ticket prices. Prices may follow a pattern based on departure time on weekdays and weekends
# • ArrivalTime (Continuous Variable): Arrival time is an important variable, since it may have influence on ticket prices. Prices may follow a pattern based on Arrival time on weekdays and weekends
# • Departure (Factor Variable): Consists of only two unique entries – AM & PM. Departure, clubbed with DepartureTime may have a strong pattern of ticket price
# • Fly (Factor Variable): This is an important variable with four levels. These are combinations of departure & arrival city being Metro & Non-Metro. A metro-to-metro flight is expected to be costliest while non metro-to-non metro cheapest
# • FlyingTime (Continuous Variable): Flying time may be one of the most important factors with cost & ticket price implication. Higher flying time indicates higher distance & hence, fuel consumption
# • Aircraft (Factor Variable): These are type of airplanes with some implication on seating capacity, arrangement, size etc.
# • PlaneModel (Factor Variable): Within aircraft, PlaneModel indicates different models
# • Capacity (Continuous Variable): Number of seats in airplane: Higher number of seats may tend to lower down price per ticket (economies of scale)
# • SeatPitch (Continuous Variable): It is distance between two consecutive seats (back and front). Higher the SeatPitch, higher will be the legroom. It may have a small impact on airline ticket price
# • SeatWidth (Continuous Variable): It is distance between one armrest to the other
# • DataCollectionDate (Factor Variable): All entries in dataset were collected in a period of 7 days, 18th Oct 2018 – 24th Oct 2018
# • DateDeparture (Factor Variable): Date on which the flight departs; the dataset contains 29 distinct departure dates
# • DayDeparture (Factor Variable): This variable has seven days in a week as its entries. 
# • Weekend (Factor Variable): This variable has only two entries, yes or no. On weekends, since the traffic is higher or demand is more, prices are expected to be higher
# • Price (Continuous Variable): Dependent variable
# • AdvBookDays (Continuous Variable): Difference between departure date and date of booking
# • Diwali (Factor Variable): Diwali festival in India stimulates high air travel, hence higher demand. Therefore, ticket price on Diwali could be much higher than any regular day
# • DayBeforeDiwali (Factor Variable): Diwali festival in India stimulates high air travel, hence higher demand. Therefore, ticket price one day before Diwali could be much higher than any regular day
# • DayAfterDiwali (Factor Variable): Diwali festival in India stimulates high air travel, hence higher demand. Therefore, ticket price one day after Diwali could be much higher than any regular day
# Show the source of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)

# Load the airline ticket data. stringsAsFactors is set explicitly because the
# default changed to FALSE in R 4.0.0; the analysis below (and the recorded
# str() output) expects the text columns to be factors.
df <- read.csv("FourIndianAirlinesData.csv", stringsAsFactors = TRUE)

# List the column names of the loaded data.
colnames(df)
##  [1] "FlightNumber"       "Airline"            "DepartureCity"     
##  [4] "ArrivalCity"        "DepartureTime"      "ArrivalTime"       
##  [7] "Departure"          "Fly"                "FlyingTime"        
## [10] "Aircraft"           "PlaneModel"         "Capacity"          
## [13] "SeatPitch"          "SeatWidth"          "DataCollectionDate"
## [16] "DateDeparture"      "DayDeparture"       "Weekend"           
## [19] "Price"              "AdvBookDays"        "Diwali"            
## [22] "DayBeforeDiwali"    "DayAfterDiwali"
# Put the columns of df on the search path. Later calls in this script refer
# to columns by bare name (e.g. Price, Airline) and rely on this attach().
# NOTE(review): attach() is generally discouraged; df$col or with(df, ...)
# would avoid the dependency, but every bare-name use must change together.
attach(df)
# Print the structure of df: one line per column with its type and first values.
str(df)
## 'data.frame':    8187 obs. of  23 variables:
##  $ FlightNumber      : Factor w/ 1765 levels "6E 101","6E 102",..: 203 301 404 889 934 1146 1255 1358 1514 790 ...
##  $ Airline           : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 3 3 3 1 1 1 2 ...
##  $ DepartureCity     : Factor w/ 79 levels "Agartala","Agatti",..: 54 11 54 4 51 51 51 63 68 29 ...
##  $ ArrivalCity       : Factor w/ 76 levels "Agartala","Ahmedabad",..: 13 66 61 48 8 51 24 51 51 2 ...
##  $ DepartureTime     : int  2250 820 1810 2110 305 900 200 730 1225 500 ...
##  $ ArrivalTime       : int  100 945 2000 2255 445 1125 320 910 205 705 ...
##  $ Departure         : Factor w/ 2 levels "AM","PM": 2 1 2 2 1 1 1 1 2 1 ...
##  $ Fly               : Factor w/ 4 levels "MM","MN","NM",..: 2 4 2 3 2 1 2 3 3 4 ...
##  $ FlyingTime        : int  130 85 110 90 100 145 80 100 100 125 ...
##  $ Aircraft          : Factor w/ 4 levels "Aerospatiale",..: 2 2 2 3 3 3 2 2 2 2 ...
##  $ PlaneModel        : Factor w/ 122 levels "-","   Boeing 737-800 (738)",..: 68 68 68 109 111 111 70 60 70 61 ...
##  $ Capacity          : int  180 180 180 168 168 168 182 144 182 180 ...
##  $ SeatPitch         : num  30 30 30 30 30 30 31.5 29.5 31.5 30 ...
##  $ SeatWidth         : num  18 18 18 17 17 17 17.5 17.8 17.5 18 ...
##  $ DataCollectionDate: Factor w/ 7 levels "Oct 18 2018",..: 6 6 6 6 6 6 6 6 6 3 ...
##  $ DateDeparture     : Factor w/ 29 levels "Nov 03 2018",..: 22 22 22 22 22 22 22 22 22 20 ...
##  $ DayDeparture      : Factor w/ 7 levels "Friday","Monday",..: 6 6 6 6 6 6 6 6 6 2 ...
##  $ Weekend           : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Price             : int  6623 4051 6623 15297 9405 12188 1892 11134 10454 5625 ...
##  $ AdvBookDays       : int  1 1 1 1 1 1 1 1 1 2 ...
##  $ Diwali            : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DayBeforeDiwali   : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
##  $ DayAfterDiwali    : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...

Histogram

# plotting histogram
# One histogram per numeric variable; the value of each entry is the fill
# colour used for that variable's histogram.
hist_fill <- c(Price = "gray", FlyingTime = "Blue", AdvBookDays = "gray")
for (hist_var in names(hist_fill)) {
  hist(
    df[[hist_var]],
    main = paste("Histogram of variable", hist_var),
    xlab = hist_var,
    col = hist_fill[[hist_var]]
  )
}

Boxplot for Price, FlyingTime and AdvBookDays

# One horizontal boxplot per numeric variable; the value of each entry is the
# fill colour used for that variable's box.
box_fill <- c(Price = "lightblue", FlyingTime = "gray", AdvBookDays = "lightblue")
for (box_var in names(box_fill)) {
  boxplot(
    df[[box_var]],
    width = 0.5,
    horizontal = TRUE,
    main = paste("boxplot for variable", box_var),
    xlab = box_var,
    col = box_fill[[box_var]]
  )
}

#Pie Chart for Airline and Departure

# Pie chart with percentages (Airline)
# Count the rows per airline directly from the data instead of hard-coding
# the counts, so the chart cannot drift out of sync with the dataset.
slices <- table(df$Airline)
lbls <- names(slices)
slices <- as.vector(slices)
pct <- round(slices / sum(slices) * 100)
lbls <- paste0(lbls, " ", pct, "%") # label is "<airline> <percent>%"
pie(slices, labels = lbls, col = rainbow(length(lbls)),
    main = "Pie Chart of Airlines")

# Pie chart with percentages (Departure: AM vs PM)
# The counts are taken from the data. The previous hard-coded values
# c(874, 8187) did not match the dataset (8187 is the total number of rows),
# so the chart showed the wrong AM/PM split.
slices <- table(df$Departure)
lbls <- names(slices)
slices <- as.vector(slices)
pct <- round(slices / sum(slices) * 100)
lbls <- paste0(lbls, " ", pct, "%") # label is "<AM|PM> <percent>%"
pie(slices, labels = lbls, col = rainbow(length(lbls)),
    main = "Pie Chart of Departure")

DescribeBy

library(psych)
# Price statistics per airline, keeping only the group label, count, mean,
# standard deviation, median, minimum, maximum and range columns.
with(df, describeBy(Price, Airline, mat = TRUE))[
  , c("group1", "n", "mean", "sd", "median", "min", "max", "range")
]
##          group1    n     mean       sd median  min   max range
## X11   Air India 1543 6712.027 4455.835   5490 1025 32003 30978
## X12      IndiGo 3811 4979.133 2595.443   4355  637 43454 42817
## X13 Jet Airways 1905 6134.127 3126.986   5569 1145 38560 37415
## X14   Spice Jet  928 4815.122 2435.603   4207 1299 21639 20340
# Price statistics for AM versus PM departures (same column subset as above).
with(df, describeBy(Price, Departure, mat = TRUE))[
  , c("group1", "n", "mean", "sd", "median", "min", "max", "range")
]
##     group1    n     mean       sd median  min   max range
## X11     AM 4802 5686.845 3495.559   4780 1025 43454 42429
## X12     PM 3385 5370.117 2772.478   4775  637 32003 31366
# Price statistics for weekend versus non-weekend departures.
with(df, describeBy(Price, Weekend, mat = TRUE))[
  , c("group1", "n", "mean", "sd", "median", "min", "max", "range")
]
##     group1    n     mean       sd median min   max range
## X11     No 6774 5500.027 3267.063   4671 637 43454 42817
## X12    Yes 1413 5823.706 2970.390   5363 999 32003 31004
# Price statistics for each day of the week of departure.
with(df, describeBy(Price, DayDeparture, mat = TRUE))[
  , c("group1", "n", "mean", "sd", "median", "min", "max", "range")
]
##        group1    n     mean       sd median  min   max range
## X11    Friday  118 5815.271 2884.999 5434.0 2134 25025 22891
## X12    Monday 1298 6189.549 3564.124 5484.5  999 38560 37561
## X13  Saturday  624 5566.455 2289.683 5240.5  999 20422 19423
## X14    Sunday  789 6027.160 3401.674 5417.0 1529 32003 30474
## X15  Thursday 1786 5243.237 3065.742 4361.0  637 22238 21601
## X16   Tuesday 2761 5353.585 3335.518 4511.0  974 43454 42480
## X17 Wednesday  811 5414.641 2843.221 4750.0 1025 21992 20967

Grouped Summary

# Six-number summary of Price for each level of Weekend.
tapply(X = df$Price, INDEX = df$Weekend, FUN = summary)
## $No
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     637    3380    4671    5500    6890   43454 
## 
## $Yes
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     999    4051    5363    5824    6954   32003
# Six-number summary of Price for AM and PM departures.
tapply(X = df$Price, INDEX = df$Departure, FUN = summary)
## $AM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1025    3491    4780    5687    6989   43454 
## 
## $PM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     637    3518    4775    5370    6753   32003
# Six-number summary of Price for each level of Fly (MM, MN, NM, NN).
tapply(X = df$Price, INDEX = df$Fly, FUN = summary)
## $MM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2703    4440    6435    6633    7791   43454 
## 
## $MN
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     637    3782    4900    5729    6932   31931 
## 
## $NM
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1336    3579    4750    5594    6756   32003 
## 
## $NN
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     974    2817    3994    4666    5776   38560
# Six-number summary of Price for each departure day of the week.
tapply(X = df$Price, INDEX = df$DayDeparture, FUN = summary)
## $Friday
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    2134    4065    5434    5815    6958   25025 
## 
## $Monday
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     999    4037    5484    6190    7410   38560 
## 
## $Saturday
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     999    4067    5240    5566    6885   20422 
## 
## $Sunday
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1529    4043    5417    6027    7046   32003 
## 
## $Thursday
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     637    3203    4361    5243    6418   22238 
## 
## $Tuesday
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     974    3209    4511    5354    6623   43454 
## 
## $Wednesday
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1025    3585    4750    5415    6934   21992
# Six-number summary of Price for each airline.
tapply(X = df$Price, INDEX = df$Airline, FUN = summary)
## $`Air India`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1025    4099    5490    6712    7695   32003 
## 
## $IndiGo
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     637    3208    4355    4979    6361   43454 
## 
## $`Jet Airways`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1145    4052    5569    6134    7488   38560 
## 
## $`Spice Jet`
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1299    3156    4207    4815    5923   21639

Boxplots

library(ggpubr)
## Loading required package: ggplot2
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Loading required package: magrittr
# Boxplots of Price against each categorical variable in turn.

# Price by weekend flag
ggplot(data = df, mapping = aes(x = Weekend, y = Price)) +
  geom_boxplot()

# Price by airline
ggplot(data = df, mapping = aes(x = Airline, y = Price)) +
  geom_boxplot()

# Price by Fly level
ggplot(data = df, mapping = aes(x = Fly, y = Price)) +
  geom_boxplot()

# Price by AM/PM departure
ggplot(data = df, mapping = aes(x = Departure, y = Price)) +
  geom_boxplot()

# Price by departure day of the week
ggplot(data = df, mapping = aes(x = DayDeparture, y = Price)) +
  geom_boxplot()

Mean Plots

library(gplots)
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# Group means of Price by Weekend; columns are looked up in df explicitly.
plotmeans(Price ~ Weekend, data = df, cex = 0.8)

# Group means of Price by Airline.
plotmeans(Price ~ Airline, data = df, cex = 0.8)
## Warning in arrows(x, li, x, pmax(y - gap, li), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped
## Warning in arrows(x, ui, x, pmin(y + gap, ui), col = barcol, lwd = lwd, :
## zero-length arrow is of indeterminate angle and so skipped

# Group means of Price by Fly level; columns are looked up in df explicitly.
plotmeans(Price ~ Fly, data = df, cex = 0.8)

# Group means of Price by AM/PM departure.
plotmeans(Price ~ Departure, data = df, cex = 0.8)

# Group means of Price by departure day of the week.
plotmeans(Price ~ DayDeparture, data = df, cex = 0.8)

Correlation - Bivariate

# Pearson correlation between Price and AdvBookDays.
with(df, cor(Price, AdvBookDays, method = "pearson"))
## [1] -0.2401463
# Test of that correlation; evaluated inside df with bare column names so the
# printed "data:" line still reads "Price and AdvBookDays".
with(df, cor.test(Price, AdvBookDays, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  Price and AdvBookDays
## t = -22.381, df = 8185, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.2604534 -0.2196269
## sample estimates:
##        cor 
## -0.2401463
# Pearson correlation between Price and FlyingTime.
with(df, cor(Price, FlyingTime, method = "pearson"))
## [1] 0.3143604
# Test of that correlation; evaluated inside df with bare column names so the
# printed "data:" line still reads "Price and FlyingTime".
with(df, cor.test(Price, FlyingTime, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  Price and FlyingTime
## t = 29.959, df = 8185, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2947053 0.3337496
## sample estimates:
##       cor 
## 0.3143604
# NOTE(review): this pair of calls repeats the Price / FlyingTime correlation
# computed immediately before it; possibly a different pair of variables was
# intended here - confirm with the author.
with(df, cor(Price, FlyingTime, method = "pearson"))
## [1] 0.3143604
with(df, cor.test(Price, FlyingTime, method = "pearson"))
## 
##  Pearson's product-moment correlation
## 
## data:  Price and FlyingTime
## t = 29.959, df = 8185, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2947053 0.3337496
## sample estimates:
##       cor 
## 0.3143604

Scatter plots

library(ggplot2)
library(gdata)
## gdata: Unable to locate valid perl interpreter
## gdata: 
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata: 
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
## 
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
## 
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
## 
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
library(reshape2)

# Scatter plot of Price against AdvBookDays.
g1 <- ggplot(data = df, mapping = aes(x = AdvBookDays, y = Price)) +
  geom_point()
g1

# Scatter plot of Price against FlyingTime.
g2 <- ggplot(data = df, mapping = aes(x = FlyingTime, y = Price)) +
  geom_point()
g2

Two sided tables

# Mean Price for every Airline x DayDeparture combination (airlines in rows),
# with the airline name repeated as a leading "Airline" column.
t1 <- as.data.frame(
  tapply(df$Price, list(df$Airline, df$DayDeparture), mean)
)
t1 <- data.frame(Airline = row.names(t1), t1, check.names = FALSE)
t1
##                 Airline   Friday   Monday Saturday   Sunday Thursday
## Air India     Air India 8452.333 6940.439 6550.514 8271.277 6301.280
## IndiGo           IndiGo 5482.904 5770.741 5261.912 5254.350 4597.084
## Jet Airways Jet Airways 5240.409 6476.875 5953.511 6138.136 5951.884
## Spice Jet     Spice Jet 4708.826 5941.024 4596.714 5355.231 4414.737
##              Tuesday Wednesday
## Air India   6710.780  5763.346
## IndiGo      4742.719  4769.169
## Jet Airways 6058.924  6395.697
## Spice Jet   4487.682  4731.229
# Mean Price for every Airline x Weekend combination (airlines in rows),
# with the airline name repeated as a leading "Airline" column.
t2 <- as.data.frame(
  tapply(df$Price, list(df$Airline, df$Weekend), mean)
)
t2 <- data.frame(Airline = row.names(t2), t2, check.names = FALSE)
t2
##                 Airline       No      Yes
## Air India     Air India 6536.871 7576.354
## IndiGo           IndiGo 4916.839 5257.935
## Jet Airways Jet Airways 6148.546 6050.439
## Spice Jet     Spice Jet 4744.386 5115.249
# Mean Price for every Airline x Fly combination (airlines in rows),
# with the airline name repeated as a leading "Airline" column.
t3 <- as.data.frame(
  tapply(df$Price, list(df$Airline, df$Fly), mean)
)
t3 <- data.frame(Airline = row.names(t3), t3, check.names = FALSE)
t3
##                 Airline       MM       MN       NM       NN
## Air India     Air India 7924.236 6726.250 6524.852 5376.669
## IndiGo           IndiGo 5838.798 5007.977 5106.394 4572.920
## Jet Airways Jet Airways 7104.712 6335.525 5834.895 5079.274
## Spice Jet     Spice Jet 5543.304 4989.598 4768.590 4410.424
# Mean Price for every Airline x Departure combination (airlines in rows),
# with the airline name repeated as a leading "Airline" column.
t4 <- as.data.frame(
  tapply(df$Price, list(df$Airline, df$Departure), mean)
)
t4 <- data.frame(Airline = row.names(t4), t4, check.names = FALSE)
t4
##                 Airline       AM       PM
## Air India     Air India 6701.908 6812.645
## IndiGo           IndiGo 5144.172 4854.327
## Jet Airways Jet Airways 6018.686 6234.288
## Spice Jet     Spice Jet 4741.458 6007.389

Bar plots

# Each two-way table of mean prices is reshaped to long format (one row per
# airline / category pair) and drawn as a dodged bar chart, one bar per airline.

# Mean price by airline and departure day
data_t1 <- melt(t1, id.vars = "Airline",
                variable.name = "DayDeparture", value.name = "Price")
g3 <- ggplot(data_t1, aes(x = DayDeparture, y = Price, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge")
g3

# Mean price by airline and weekend flag
data_t2 <- melt(t2, id.vars = "Airline",
                variable.name = "Weekend", value.name = "Price")
g4 <- ggplot(data_t2, aes(x = Weekend, y = Price, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge")
g4

# Mean price by airline and Fly level
data_t3 <- melt(t3, id.vars = "Airline",
                variable.name = "Fly", value.name = "Price")
g5 <- ggplot(data_t3, aes(x = Fly, y = Price, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge")
g5

# Mean price by airline and AM/PM departure
data_t4 <- melt(t4, id.vars = "Airline",
                variable.name = "Departure", value.name = "Price")
g6 <- ggplot(data_t4, aes(x = Departure, y = Price, fill = Airline)) +
  geom_bar(stat = "identity", position = "dodge")
g6

Correlogram

library(corrgram)
## Registered S3 method overwritten by 'seriation':
##   method         from 
##   reorder.hclust gclus
# The three numeric columns whose pairwise correlations are examined.
airlineSubset <- df[, c("FlyingTime", "Price", "AdvBookDays")]

# Correlation matrix computed from complete rows only ("complete.obs" is the
# option the abbreviated value "complete" resolves to).
corMat <- cor(airlineSubset, use = "complete.obs")

# Show the matrix rounded to three decimals.
round(corMat, digits = 3)
##             FlyingTime  Price AdvBookDays
## FlyingTime       1.000  0.314      -0.004
## Price            0.314  1.000      -0.240
## AdvBookDays     -0.004 -0.240       1.000
library(corrplot)
## corrplot 0.84 loaded
# Circle-style plot of the correlation matrix of airlineSubset.
corrplot(cor(airlineSubset), method = "circle")

# Correlogram of the same three columns: panel.shade below the diagonal,
# panel.conf above it, panel.txt on the diagonal.
# The stray trailing comma after `main` was removed; it passed an empty
# argument into corrgram()'s `...`.
corrgram(df[, c("Price", "FlyingTime", "AdvBookDays")],
         lower.panel = panel.shade,
         upper.panel = panel.conf,
         text.panel = panel.txt,
         main = "corrgram")

library(PerformanceAnalytics)
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:gdata':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:gplots':
## 
##     textplot
## The following object is masked from 'package:graphics':
## 
##     legend
# Correlation chart (PerformanceAnalytics) for Price, FlyingTime and AdvBookDays.
chart.Correlation(df[, c("Price", "FlyingTime", "AdvBookDays")])