Sameer Mathur
Basic Data Summary and Analysis
Number of Rows and Columns
# reading data into R
airline.df <- read.csv(paste("AirlinePricingData.csv"))
# rows and columns of the data frame
dim(airline.df)
[1] 305 23
Column Names
# data column names
colnames(airline.df)
[1] "FlightNumber" "Airline" "DepartureCityCode"
[4] "ArrivalCityCode" "DepartureTime" "ArrivalTime"
[7] "FlyingMinutes" "Aircraft" "PlaneModel"
[10] "Capacity" "SeatPitch" "SeatWidth"
[13] "DataCollectionDate" "DateDeparture" "Price"
[16] "AdvancedBookingDays" "IsDiwali" "DayBeforeDiwali"
[19] "DayAfterDiwali" "MetroDeparture" "MetroArrival"
[22] "MarketShare" "PassengerLoadFactor"
# data types of the data columns
str(airline.df)
'data.frame': 305 obs. of 23 variables:
$ FlightNumber : Factor w/ 63 levels "6E 129","6E 155",..: 1 1 1 1 1 2 2 2 2 2 ...
$ Airline : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 2 2 2 2 2 2 2 ...
$ DepartureCityCode : Factor w/ 2 levels "BOM","DEL": 2 2 2 2 2 2 2 2 2 2 ...
$ ArrivalCityCode : Factor w/ 2 levels "BOM","DEL": 1 1 1 1 1 1 1 1 1 1 ...
$ DepartureTime : int 1730 1730 1730 1730 1730 2225 2225 2225 2225 2225 ...
$ ArrivalTime : int 1945 1945 1945 1945 1945 40 40 40 40 40 ...
$ FlyingMinutes : int 135 135 135 135 135 135 135 135 135 135 ...
$ Aircraft : Factor w/ 2 levels "Airbus","Boeing": 1 1 1 1 1 1 1 1 1 1 ...
$ PlaneModel : Factor w/ 9 levels "738","739","77W",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Capacity : int 180 180 180 180 180 180 180 180 180 180 ...
$ SeatPitch : int 30 30 30 30 30 30 30 30 30 30 ...
$ SeatWidth : num 18 18 18 18 18 18 18 18 18 18 ...
$ DataCollectionDate : Factor w/ 7 levels "Sep 10 2018",..: 1 1 1 1 1 1 1 1 1 1 ...
$ DateDeparture : Factor w/ 20 levels "Nov 6 2018","Nov 8 2018",..: 3 14 2 11 1 2 11 14 1 3 ...
$ Price : int 3031 3990 4234 4597 4234 4234 5153 3194 2607 2712 ...
$ AdvancedBookingDays: int 30 7 59 2 57 59 2 7 57 30 ...
$ IsDiwali : int 0 0 1 0 1 1 0 0 1 0 ...
$ DayBeforeDiwali : int 0 0 0 0 1 0 0 0 1 0 ...
$ DayAfterDiwali : int 0 0 1 0 0 1 0 0 0 0 ...
$ MetroDeparture : int 1 1 1 1 1 1 1 1 1 1 ...
$ MetroArrival : int 1 1 1 1 1 1 1 1 1 1 ...
$ MarketShare : num 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 ...
$ PassengerLoadFactor: num 83.3 87.2 87.2 87.2 87.2 ...
# convert 'IsDiwali' into factor
airline.df$IsDiwali <- as.factor(airline.df$IsDiwali)
# convert 'DayBeforeDiwali' into factor
airline.df$DayBeforeDiwali <- as.factor(airline.df$DayBeforeDiwali)
# convert 'DayAfterDiwali' into factor
airline.df$DayAfterDiwali <- as.factor(airline.df$DayAfterDiwali)
# convert 'MetroDeparture' into factor
airline.df$MetroDeparture <- as.factor(airline.df$MetroDeparture)
# convert 'MetroArrival' into factor
airline.df$MetroArrival <- as.factor(airline.df$MetroArrival)
# verify
str(airline.df)
'data.frame': 305 obs. of 23 variables:
$ FlightNumber : Factor w/ 63 levels "6E 129","6E 155",..: 1 1 1 1 1 2 2 2 2 2 ...
$ Airline : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 2 2 2 2 2 2 2 ...
$ DepartureCityCode : Factor w/ 2 levels "BOM","DEL": 2 2 2 2 2 2 2 2 2 2 ...
$ ArrivalCityCode : Factor w/ 2 levels "BOM","DEL": 1 1 1 1 1 1 1 1 1 1 ...
$ DepartureTime : int 1730 1730 1730 1730 1730 2225 2225 2225 2225 2225 ...
$ ArrivalTime : int 1945 1945 1945 1945 1945 40 40 40 40 40 ...
$ FlyingMinutes : int 135 135 135 135 135 135 135 135 135 135 ...
$ Aircraft : Factor w/ 2 levels "Airbus","Boeing": 1 1 1 1 1 1 1 1 1 1 ...
$ PlaneModel : Factor w/ 9 levels "738","739","77W",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Capacity : int 180 180 180 180 180 180 180 180 180 180 ...
$ SeatPitch : int 30 30 30 30 30 30 30 30 30 30 ...
$ SeatWidth : num 18 18 18 18 18 18 18 18 18 18 ...
$ DataCollectionDate : Factor w/ 7 levels "Sep 10 2018",..: 1 1 1 1 1 1 1 1 1 1 ...
$ DateDeparture : Factor w/ 20 levels "Nov 6 2018","Nov 8 2018",..: 3 14 2 11 1 2 11 14 1 3 ...
$ Price : int 3031 3990 4234 4597 4234 4234 5153 3194 2607 2712 ...
$ AdvancedBookingDays: int 30 7 59 2 57 59 2 7 57 30 ...
$ IsDiwali : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 2 1 ...
$ DayBeforeDiwali : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 2 1 ...
$ DayAfterDiwali : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 1 1 1 ...
$ MetroDeparture : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
$ MetroArrival : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
$ MarketShare : num 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 39.6 ...
$ PassengerLoadFactor: num 83.3 87.2 87.2 87.2 87.2 ...
# attaching data columns
attach(airline.df)
# descriptive statistics
library(psych)
describe(airline.df)[c(7, 10:12, 15:17, 22, 23), c(2:5, 8:9)]
n mean sd median min max
FlyingMinutes 305 140.75 13.74 135.00 125.00 180.00
Capacity 305 176.36 32.39 180.00 138.00 303.00
SeatPitch 305 30.26 0.93 30.00 29.00 33.00
SeatWidth 305 17.41 0.49 17.00 17.00 18.00
Price 305 5394.54 2388.29 4681.00 2607.00 18015.00
AdvancedBookingDays 305 28.90 22.30 30.00 2.00 61.00
IsDiwali* 305 1.40 0.49 1.00 1.00 2.00
MarketShare 305 21.18 11.04 15.40 13.20 39.60
PassengerLoadFactor 305 85.12 4.32 83.32 78.73 94.06
# flights by airline in Diwali
table(Airline, IsDiwali)
IsDiwali
Airline 0 1
Air India 25 16
IndiGo 48 32
Jet 87 57
Spice Jet 24 16
# average ticket price by airline
agg0 <- aggregate(list(Price, AdvancedBookingDays), by = list(Airline), mean)
colnames(agg0) <- c("Airline", "AverageTicketPrice", "AdvancedBookingDays")
agg0
Airline AverageTicketPrice AdvancedBookingDays
1 Air India 6335.000 26.78049
2 IndiGo 4879.525 31.40000
3 Jet 5496.146 28.54167
4 Spice Jet 5094.850 27.40000
# average ticket price by airline
agg1 <- aggregate(Price, by = list(Airline), mean)
colnames(agg1) <- c("Airline", "AverageTicketPrice")
#agg1
# sd ticket price by airline
agg2 <- aggregate(Price, by = list(Airline), sd)
colnames(agg2) <- c("Airline", "SD")
#agg2
# average ticket price and standard deviation
agg <- merge(agg1, agg2, by = c("Airline"))
agg
Airline AverageTicketPrice SD
1 Air India 6335.000 2693.637
2 IndiGo 4879.525 2956.455
3 Jet 5496.146 1989.659
4 Spice Jet 5094.850 1787.422
# OLS model
fitLinOLSModel <- lm(Price ~ IsDiwali, data = airline.df)
# summary of the OLS model
summary(fitLinOLSModel)
Call:
lm(formula = Price ~ IsDiwali, data = airline.df)
Residuals:
Min 1Q Median 3Q Max
-3290.5 -1510.8 -534.5 561.2 12951.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 5063.8 173.8 29.143 < 2e-16 ***
IsDiwali1 833.7 275.9 3.022 0.00273 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2357 on 303 degrees of freedom
Multiple R-squared: 0.02926, Adjusted R-squared: 0.02605
F-statistic: 9.132 on 1 and 303 DF, p-value: 0.002726
# OLS model
fitMulOLSModel <- lm(Price ~ IsDiwali + AdvancedBookingDays, data = airline.df)
# summary of the OLS model
summary(fitMulOLSModel)
Call:
lm(formula = Price ~ IsDiwali + AdvancedBookingDays, data = airline.df)
Residuals:
Min 1Q Median 3Q Max
-3222.7 -1210.4 -498.1 738.3 11972.6
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6225.95 229.19 27.165 < 2e-16 ***
IsDiwali1 4590.92 585.99 7.834 8.08e-14 ***
AdvancedBookingDays -91.77 12.88 -7.126 7.64e-12 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2184 on 302 degrees of freedom
Multiple R-squared: 0.169, Adjusted R-squared: 0.1635
F-statistic: 30.7 on 2 and 302 DF, p-value: 7.264e-13
# OLS interaction model
fitOLSModelInt <- lm(Price ~ IsDiwali * AdvancedBookingDays, data = airline.df)
# summary of the OLS interaction model
summary(fitOLSModelInt)
Call:
lm(formula = Price ~ IsDiwali * AdvancedBookingDays, data = airline.df)
Residuals:
Min 1Q Median 3Q Max
-4368.6 -1240.0 -506.4 817.2 12125.2
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 6044.78 226.68 26.666 < 2e-16 ***
IsDiwali1 15733.65 2640.94 5.958 7.14e-09 ***
AdvancedBookingDays -77.47 12.95 -5.983 6.22e-09 ***
IsDiwali1:AdvancedBookingDays -218.80 50.64 -4.321 2.11e-05 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2123 on 301 degrees of freedom
Multiple R-squared: 0.2175, Adjusted R-squared: 0.2097
F-statistic: 27.89 on 3 and 301 DF, p-value: 6.082e-16