Airline Ticket Pricing

Sameer Mathur

Basic Data Summary and Analysis

Reading data

# reading data into R
airline.df <- read.csv(paste("AirlinePricingData.csv"))
# rows and columns of the data frame
dim(airline.df)
[1] 305  21
# data column names
colnames(airline.df)
 [1] "FlightNumber"        "Airline"             "DepartureCityCode"  
 [4] "ArrivalCityCode"     "DepartureTime"       "ArrivalTime"        
 [7] "FlyingMinutes"       "Aircraft"            "PlaneModel"         
[10] "Capacity"            "SeatPitch"           "SeatWidth"          
[13] "DataCollectionDate"  "DateDeparture"       "Price"              
[16] "AdvancedBookingDays" "IsDiwali"            "DayBeforeDiwali"    
[19] "DayAfterDiwali"      "MetroDeparture"      "MetroArrival"       

Data structure of the data frame

# data types of the data columns
str(airline.df)
'data.frame':   305 obs. of  21 variables:
 $ FlightNumber       : Factor w/ 63 levels "6E 129","6E 155",..: 1 1 1 1 1 2 2 2 2 2 ...
 $ Airline            : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ DepartureCityCode  : Factor w/ 2 levels "BOM","DEL": 2 2 2 2 2 2 2 2 2 2 ...
 $ ArrivalCityCode    : Factor w/ 2 levels "BOM","DEL": 1 1 1 1 1 1 1 1 1 1 ...
 $ DepartureTime      : int  1730 1730 1730 1730 1730 2225 2225 2225 2225 2225 ...
 $ ArrivalTime        : int  1945 1945 1945 1945 1945 40 40 40 40 40 ...
 $ FlyingMinutes      : int  135 135 135 135 135 135 135 135 135 135 ...
 $ Aircraft           : Factor w/ 2 levels "Airbus","Boeing": 1 1 1 1 1 1 1 1 1 1 ...
 $ PlaneModel         : Factor w/ 9 levels "738","739","77W",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ Capacity           : int  180 180 180 180 180 180 180 180 180 180 ...
 $ SeatPitch          : int  30 30 30 30 30 30 30 30 30 30 ...
 $ SeatWidth          : num  18 18 18 18 18 18 18 18 18 18 ...
 $ DataCollectionDate : Factor w/ 7 levels "Sep 10 2018",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ DateDeparture      : Factor w/ 20 levels "Nov 6 2018","Nov 8 2018",..: 3 14 2 11 1 2 11 14 1 3 ...
 $ Price              : int  3031 3990 4234 4597 4234 4234 5153 3194 2607 2712 ...
 $ AdvancedBookingDays: int  30 7 59 2 57 59 2 7 57 30 ...
 $ IsDiwali           : int  0 0 1 0 1 1 0 0 1 0 ...
 $ DayBeforeDiwali    : int  0 0 0 0 1 0 0 0 1 0 ...
 $ DayAfterDiwali     : int  0 0 1 0 0 1 0 0 0 0 ...
 $ MetroDeparture     : int  1 1 1 1 1 1 1 1 1 1 ...
 $ MetroArrival       : int  1 1 1 1 1 1 1 1 1 1 ...

Conversion of some data types of data columns into factor

# convert 'IsDiwali' into  factor
airline.df$IsDiwali <- as.factor(airline.df$IsDiwali)
# convert 'DayBeforeDiwali' into  factor
airline.df$DayBeforeDiwali <- as.factor(airline.df$DayBeforeDiwali)
# convert 'DayAfterDiwali' into  factor
airline.df$DayAfterDiwali <- as.factor(airline.df$DayAfterDiwali)
# convert 'MetroDeparture' into  factor
airline.df$MetroDeparture <- as.factor(airline.df$MetroDeparture)
# convert 'MetroArrival' into  factor
airline.df$MetroArrival <- as.factor(airline.df$MetroArrival)

# verify
str(airline.df)
'data.frame':   305 obs. of  21 variables:
 $ FlightNumber       : Factor w/ 63 levels "6E 129","6E 155",..: 1 1 1 1 1 2 2 2 2 2 ...
 $ Airline            : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ DepartureCityCode  : Factor w/ 2 levels "BOM","DEL": 2 2 2 2 2 2 2 2 2 2 ...
 $ ArrivalCityCode    : Factor w/ 2 levels "BOM","DEL": 1 1 1 1 1 1 1 1 1 1 ...
 $ DepartureTime      : int  1730 1730 1730 1730 1730 2225 2225 2225 2225 2225 ...
 $ ArrivalTime        : int  1945 1945 1945 1945 1945 40 40 40 40 40 ...
 $ FlyingMinutes      : int  135 135 135 135 135 135 135 135 135 135 ...
 $ Aircraft           : Factor w/ 2 levels "Airbus","Boeing": 1 1 1 1 1 1 1 1 1 1 ...
 $ PlaneModel         : Factor w/ 9 levels "738","739","77W",..: 6 6 6 6 6 6 6 6 6 6 ...
 $ Capacity           : int  180 180 180 180 180 180 180 180 180 180 ...
 $ SeatPitch          : int  30 30 30 30 30 30 30 30 30 30 ...
 $ SeatWidth          : num  18 18 18 18 18 18 18 18 18 18 ...
 $ DataCollectionDate : Factor w/ 7 levels "Sep 10 2018",..: 1 1 1 1 1 1 1 1 1 1 ...
 $ DateDeparture      : Factor w/ 20 levels "Nov 6 2018","Nov 8 2018",..: 3 14 2 11 1 2 11 14 1 3 ...
 $ Price              : int  3031 3990 4234 4597 4234 4234 5153 3194 2607 2712 ...
 $ AdvancedBookingDays: int  30 7 59 2 57 59 2 7 57 30 ...
 $ IsDiwali           : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 2 1 ...
 $ DayBeforeDiwali    : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 2 1 ...
 $ DayAfterDiwali     : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 1 1 1 ...
 $ MetroDeparture     : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
 $ MetroArrival       : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...

Descriptive statistics of the data frame

# attaching data columns
attach(airline.df)
# descriptive statistics
library(psych)
describe(airline.df)[, c(1:5, 8:10)]
                    vars   n    mean      sd median  min   max range
FlightNumber*          1 305   31.86   18.35     32    1    63    62
Airline*               2 305    2.60    0.88      3    1     4     3
DepartureCityCode*     3 305    1.57    0.50      2    1     2     1
ArrivalCityCode*       4 305    1.43    0.50      1    1     2     1
DepartureTime          5 305 1249.54  579.86   1035  225  2320  2095
ArrivalTime            6 305 1329.31  613.52   1215   20  2345  2325
FlyingMinutes          7 305  140.75   13.74    135  125   180    55
Aircraft*              8 305    1.54    0.50      2    1     2     1
PlaneModel*            9 305    3.82    2.71      3    1     9     8
Capacity              10 305  176.36   32.39    180  138   303   165
SeatPitch             11 305   30.26    0.93     30   29    33     4
SeatWidth             12 305   17.41    0.49     17   17    18     1
DataCollectionDate*   13 305    4.36    1.98      5    1     7     6
DateDeparture*        14 305    8.14    6.69      7    1    20    19
Price                 15 305 5394.54 2388.29   4681 2607 18015 15408
AdvancedBookingDays   16 305   28.90   22.30     30    2    61    59
IsDiwali*             17 305    1.40    0.49      1    1     2     1
DayBeforeDiwali*      18 305    1.19    0.40      1    1     2     1
DayAfterDiwali*       19 305    1.20    0.40      1    1     2     1
MetroDeparture*       20 305    1.00    0.00      1    1     1     0
MetroArrival*         21 305    1.00    0.00      1    1     1     0

Number of flights by airlines in Diwali

# flights by airline in Diwali
table(Airline, IsDiwali)
           IsDiwali
Airline      0  1
  Air India 25 16
  IndiGo    48 32
  Jet       87 57
  Spice Jet 24 16

Average ticket price and advanced booking days by airline

# average ticket price by airline
agg0 <- aggregate(list(Price, AdvancedBookingDays), by = list(Airline), mean)
colnames(agg0) <- c("Airline", "AverageTicketPrice", "AdvancedBookingDays")
agg0
    Airline AverageTicketPrice AdvancedBookingDays
1 Air India           6335.000            26.78049
2    IndiGo           4879.525            31.40000
3       Jet           5496.146            28.54167
4 Spice Jet           5094.850            27.40000

Mean and SD of airline ticket price by airline

# average ticket price by airline
agg1 <- aggregate(Price, by = list(Airline), mean)
colnames(agg1) <- c("Airline", "AverageTicketPrice")
#agg1

# sd ticket price by airline
agg2 <- aggregate(Price, by = list(Airline), sd)
colnames(agg2) <- c("Airline", "SD")
#agg2

# average ticket price and standard deviation
agg <- merge(agg1, agg2, by = c("Airline"))
agg
    Airline AverageTicketPrice       SD
1 Air India           6335.000 2693.637
2    IndiGo           4879.525 2956.455
3       Jet           5496.146 1989.659
4 Spice Jet           5094.850 1787.422

Simple Linear Regression

# OLS model
fitOLSModel <- lm(Price ~ 
                 IsDiwali
               + AdvancedBookingDays
               + Airline
               + Capacity
               + SeatPitch
               + SeatWidth
               + FlyingMinutes
               , data = airline.df)
# summary of the OLS model
summary(fitOLSModel)

Call:
lm(formula = Price ~ IsDiwali + AdvancedBookingDays + Airline + 
    Capacity + SeatPitch + SeatWidth + FlyingMinutes, data = airline.df)

Residuals:
    Min      1Q  Median      3Q     Max 
-2947.4 -1289.5  -468.4   528.8 12270.5 

Coefficients:
                     Estimate Std. Error t value Pr(>|t|)    
(Intercept)         -2198.063  10041.966  -0.219   0.8269    
IsDiwali1            4465.051    583.811   7.648  2.9e-13 ***
AdvancedBookingDays   -88.691     12.866  -6.894  3.3e-11 ***
AirlineIndiGo       -1794.835    750.202  -2.392   0.0174 *  
AirlineJet          -1163.088    456.954  -2.545   0.0114 *  
AirlineSpice Jet     -955.081    732.845  -1.303   0.1935    
Capacity              -12.550      7.268  -1.727   0.0853 .  
SeatPitch              90.619    289.315   0.313   0.7543    
SeatWidth             629.593    575.502   1.094   0.2749    
FlyingMinutes         -13.908      9.960  -1.396   0.1637    
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Residual standard error: 2155 on 295 degrees of freedom
Multiple R-squared:  0.2102,    Adjusted R-squared:  0.1861 
F-statistic: 8.724 on 9 and 295 DF,  p-value: 1.258e-11