Airline Ticket Pricing

Sameer Mathur

Basic Data Summary and Analysis

Reading data

Number of Rows and Columns

# reading data into R
airline.df <- read.csv(paste("AirlinePricingData.csv"))
# rows and columns of the data frame
dim(airline.df)
[1] 305  23

Column Names

# data column names
colnames(airline.df)
 [1] "FlightNumber"        "Airline"             "DepartureCityCode"  
 [4] "ArrivalCityCode"     "DepartureTime"       "ArrivalTime"        
 [7] "FlyingMinutes"       "Aircraft"            "PlaneModel"         
[10] "Capacity"            "SeatPitch"           "SeatWidth"          
[13] "DataCollectionDate"  "DateDeparture"       "Price"              
[16] "AdvancedBookingDays" "IsDiwali"            "DayBeforeDiwali"    
[19] "DayAfterDiwali"      "MetroDeparture"      "MetroArrival"       
[22] "MarketShare"         "PassengerLoadFactor"

Data Types

# data types of the data columns
str(airline.df)
'data.frame':   305 obs. of  23 variables:
 $ FlightNumber       : Factor w/ 63 levels "6E 129","6E 155",..: 1 1 1 1 1 2 2 2 2 2 ...
 $ Airline            : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ DepartureCityCode  : Factor w/ 2 levels "BOM","DEL": 2 2 2 2 2 2 2 2 2 2 ...
 $ ArrivalCityCode    : Factor w/ 2 levels "BOM","DEL": 1 1 1 1 1 1 1 1 1 1 ...
 $ DepartureTime      : int  1730 1730 1730 1730 1730 2225 2225 2225 2225 2225 ...
 $ ArrivalTime        : int  1945 1945 1945 1945 1945 40 40 40 40 40 ...
 $ FlyingMinutes      : int  135 135 135 135 135 135 135 135 135 135 ...
 $ Aircraft           : Factor w/ 2 levels "Airbus","Boeing": 1 1 1 1 1 1 1 1 1 1 ...
 $ PlaneModel         : Factor w/ 9 levels "738","739","77W",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ Capacity           : int  180 180 180 180 180 180 180 180 180 180 ...
 $ SeatPitch          : int  30 30 30 30 30 30 30 30 30 30 ...
 $ SeatWidth          : num  18 18 18 18 18 18 18 18 18 18 ...
 $ DataCollectionDate : Factor w/ 7 levels "Sep 10 2018",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ DateDeparture      : Factor w/ 20 levels "Nov 6 2018","Nov 8 2018",..: 3 14 2 11 1 2 11 14 1 3 ...
 $ Price              : int  3031 3990 4234 4597 4234 4234 5153 3194 2607 2712 ...
 $ AdvancedBookingDays: int  30 7 59 2 57 59 2 7 57 30 ...
 $ IsDiwali           : int  0 0 1 0 1 1 0 0 1 0 ...
 $ DayBeforeDiwali    : int  0 0 0 0 1 0 0 0 1 0 ...
 $ DayAfterDiwali     : int  0 0 1 0 0 1 0 0 0 0 ...
 $ MetroDeparture     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ MetroArrival       : int  1 1 1 1 1 1 1 1 1 1 ...
 $ MarketShare        : num  39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 ...
 $ PassengerLoadFactor: num  83.3 87.2 87.2 87.2 87.2 ...

Conversion of some data types of data columns into factor

# convert 'IsDiwali' into  factor
airline.df$IsDiwali <- as.factor(airline.df$IsDiwali)
# convert 'DayBeforeDiwali' into  factor
airline.df$DayBeforeDiwali <- as.factor(airline.df$DayBeforeDiwali)
# convert 'DayAfterDiwali' into  factor
airline.df$DayAfterDiwali <- as.factor(airline.df$DayAfterDiwali)
# convert 'MetroDeparture' into  factor
airline.df$MetroDeparture <- as.factor(airline.df$MetroDeparture)
# convert 'MetroArrival' into  factor
airline.df$MetroArrival <- as.factor(airline.df$MetroArrival)

# verify
str(airline.df)
'data.frame':   305 obs. of  23 variables:
 $ FlightNumber       : Factor w/ 63 levels "6E 129","6E 155",..: 1 1 1 1 1 2 2 2 2 2 ...
 $ Airline            : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ DepartureCityCode  : Factor w/ 2 levels "BOM","DEL": 2 2 2 2 2 2 2 2 2 2 ...
 $ ArrivalCityCode    : Factor w/ 2 levels "BOM","DEL": 1 1 1 1 1 1 1 1 1 1 ...
 $ DepartureTime      : int  1730 1730 1730 1730 1730 2225 2225 2225 2225 2225 ...
 $ ArrivalTime        : int  1945 1945 1945 1945 1945 40 40 40 40 40 ...
 $ FlyingMinutes      : int  135 135 135 135 135 135 135 135 135 135 ...
 $ Aircraft           : Factor w/ 2 levels "Airbus","Boeing": 1 1 1 1 1 1 1 1 1 1 ...
 $ PlaneModel         : Factor w/ 9 levels "738","739","77W",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ Capacity           : int  180 180 180 180 180 180 180 180 180 180 ...
 $ SeatPitch          : int  30 30 30 30 30 30 30 30 30 30 ...
 $ SeatWidth          : num  18 18 18 18 18 18 18 18 18 18 ...
 $ DataCollectionDate : Factor w/ 7 levels "Sep 10 2018",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ DateDeparture      : Factor w/ 20 levels "Nov 6 2018","Nov 8 2018",..: 3 14 2 11 1 2 11 14 1 3 ...
 $ Price              : int  3031 3990 4234 4597 4234 4234 5153 3194 2607 2712 ...
 $ AdvancedBookingDays: int  30 7 59 2 57 59 2 7 57 30 ...
 $ IsDiwali           : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 2 1 ...
 $ DayBeforeDiwali    : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 2 1 ...
 $ DayAfterDiwali     : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 1 1 1 ...
 $ MetroDeparture     : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
 $ MetroArrival       : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
 $ MarketShare        : num  39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 ...
 $ PassengerLoadFactor: num  83.3 87.2 87.2 87.2 87.2 ...

Descriptive statistics of the data frame

# attaching data columns
attach(airline.df)
# descriptive statistics
library(psych)
describe(airline.df)[c(7, 10:12, 15:17, 22, 23), c(2:5, 8:9)]
                      n    mean      sd  median     min      max
FlyingMinutes       305  140.75   13.74  135.00  125.00   180.00
Capacity            305  176.36   32.39  180.00  138.00   303.00
SeatPitch           305   30.26    0.93   30.00   29.00    33.00
SeatWidth           305   17.41    0.49   17.00   17.00    18.00
Price               305 5394.54 2388.29 4681.00 2607.00 18015.00
AdvancedBookingDays 305   28.90   22.30   30.00    2.00    61.00
IsDiwali*           305    1.40    0.49    1.00    1.00     2.00
MarketShare         305   21.18   11.04   15.40   13.20    39.60
PassengerLoadFactor 305   85.12    4.32   83.32   78.73    94.06

Number of flights by airlines in Diwali

# flights by airline in Diwali
table(Airline, IsDiwali)
           IsDiwali
Airline      0  1
  Air India 25 16
  IndiGo    48 32
  Jet       87 57
  Spice Jet 24 16

Average ticket price and advanced booking days by airline

# average ticket price by airline
agg0 <- aggregate(list(Price, AdvancedBookingDays), by = list(Airline), mean)
colnames(agg0) <- c("Airline", "AverageTicketPrice", "AdvancedBookingDays")
agg0
    Airline AverageTicketPrice AdvancedBookingDays
1 Air India           6335.000            26.78049
2    IndiGo           4879.525            31.40000
3       Jet           5496.146            28.54167
4 Spice Jet           5094.850            27.40000

Mean and SD of airline ticket price by airline

# average ticket price by airline
agg1 <- aggregate(Price, by = list(Airline), mean)
colnames(agg1) <- c("Airline", "AverageTicketPrice")
#agg1

# sd ticket price by airline
agg2 <- aggregate(Price, by = list(Airline), sd)
colnames(agg2) <- c("Airline", "SD")
#agg2

# average ticket price and standard deviation
agg <- merge(agg1, agg2, by = c("Airline"))
agg
    Airline AverageTicketPrice       SD
1 Air India           6335.000 2693.637
2    IndiGo           4879.525 2956.455
3       Jet           5496.146 1989.659
4 Spice Jet           5094.850 1787.422

Simple Linear Regression

# OLS model
fitLinOLSModel <- lm(Price ~ IsDiwali, data = airline.df)
# summary of the OLS model
summary(fitLinOLSModel)

Call:
lm(formula = Price ~ IsDiwali, data = airline.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3290.5 -1510.8  -534.5   561.2 12951.2 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   5063.8      173.8  29.143  < 2e-16 ***
IsDiwali1      833.7      275.9   3.022  0.00273 ** 
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2357 on 303 degrees of freedom
Multiple R-squared:  0.02926,   Adjusted R-squared:  0.02605 
F-statistic: 9.132 on 1 and 303 DF,  p-value: 0.002726

Multiple Linear Regression

# OLS model
fitMulOLSModel <- lm(Price ~ IsDiwali + AdvancedBookingDays, data = airline.df)
# summary of the OLS model
summary(fitMulOLSModel)

Call:
lm(formula = Price ~ IsDiwali + AdvancedBookingDays, data = airline.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-3222.7 -1210.4  -498.1   738.3 11972.6 

Coefficients:
                    Estimate Std. Error t value Pr(>|t|)    
(Intercept)          6225.95     229.19  27.165  < 2e-16 ***
IsDiwali1            4590.92     585.99   7.834 8.08e-14 ***
AdvancedBookingDays   -91.77      12.88  -7.126 7.64e-12 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2184 on 302 degrees of freedom
Multiple R-squared:  0.169, Adjusted R-squared:  0.1635 
F-statistic:  30.7 on 2 and 302 DF,  p-value: 7.264e-13

Interaction Model

# OLS interaction model
fitOLSModelInt <- lm(Price ~ IsDiwali * AdvancedBookingDays, data = airline.df)
# summary of the OLS interaction model
summary(fitOLSModelInt)

Call:
lm(formula = Price ~ IsDiwali * AdvancedBookingDays, data = airline.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-4368.6 -1240.0  -506.4   817.2 12125.2 

Coefficients:
                              Estimate Std. Error t value Pr(>|t|)    
(Intercept)                    6044.78     226.68  26.666  < 2e-16 ***
IsDiwali1                     15733.65    2640.94   5.958 7.14e-09 ***
AdvancedBookingDays             -77.47      12.95  -5.983 6.22e-09 ***
IsDiwali1:AdvancedBookingDays  -218.80      50.64  -4.321 2.11e-05 ***
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2123 on 301 degrees of freedom
Multiple R-squared:  0.2175,    Adjusted R-squared:  0.2097 
F-statistic: 27.89 on 3 and 301 DF,  p-value: 6.082e-16