Sameer Mathur
Basic Data Summary and Analysis
# reading data into R
airline.df <- read.csv(paste("AirlinePricingData.csv"))
# rows and columns of the data frame
dim(airline.df)
[1] 305 21
# data column names
colnames(airline.df)
[1] "FlightNumber" "Airline" "DepartureCityCode"
[4] "ArrivalCityCode" "DepartureTime" "ArrivalTime"
[7] "FlyingMinutes" "Aircraft" "PlaneModel"
[10] "Capacity" "SeatPitch" "SeatWidth"
[13] "DataCollectionDate" "DateDeparture" "Price"
[16] "AdvancedBookingDays" "IsDiwali" "DayBeforeDiwali"
[19] "DayAfterDiwali" "MetroDeparture" "MetroArrival"
# data types of the data columns
str(airline.df)
'data.frame': 305 obs. of 21 variables:
$ FlightNumber : Factor w/ 63 levels "6E 129","6E 155",..: 1 1 1 1 1 2 2 2 2 2 ...
$ Airline : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 2 2 2 2 2 2 2 ...
$ DepartureCityCode : Factor w/ 2 levels "BOM","DEL": 2 2 2 2 2 2 2 2 2 2 ...
$ ArrivalCityCode : Factor w/ 2 levels "BOM","DEL": 1 1 1 1 1 1 1 1 1 1 ...
$ DepartureTime : int 1730 1730 1730 1730 1730 2225 2225 2225 2225 2225 ...
$ ArrivalTime : int 1945 1945 1945 1945 1945 40 40 40 40 40 ...
$ FlyingMinutes : int 135 135 135 135 135 135 135 135 135 135 ...
$ Aircraft : Factor w/ 2 levels "Airbus","Boeing": 1 1 1 1 1 1 1 1 1 1 ...
$ PlaneModel : Factor w/ 9 levels "738","739","77W",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Capacity : int 180 180 180 180 180 180 180 180 180 180 ...
$ SeatPitch : int 30 30 30 30 30 30 30 30 30 30 ...
$ SeatWidth : num 18 18 18 18 18 18 18 18 18 18 ...
$ DataCollectionDate : Factor w/ 7 levels "Sep 10 2018",..: 1 1 1 1 1 1 1 1 1 1 ...
$ DateDeparture : Factor w/ 20 levels "Nov 6 2018","Nov 8 2018",..: 3 14 2 11 1 2 11 14 1 3 ...
$ Price : int 3031 3990 4234 4597 4234 4234 5153 3194 2607 2712 ...
$ AdvancedBookingDays: int 30 7 59 2 57 59 2 7 57 30 ...
$ IsDiwali : int 0 0 1 0 1 1 0 0 1 0 ...
$ DayBeforeDiwali : int 0 0 0 0 1 0 0 0 1 0 ...
$ DayAfterDiwali : int 0 0 1 0 0 1 0 0 0 0 ...
$ MetroDeparture : int 1 1 1 1 1 1 1 1 1 1 ...
$ MetroArrival : int 1 1 1 1 1 1 1 1 1 1 ...
# convert 'IsDiwali' into factor
airline.df$IsDiwali <- as.factor(airline.df$IsDiwali)
# convert 'DayBeforeDiwali' into factor
airline.df$DayBeforeDiwali <- as.factor(airline.df$DayBeforeDiwali)
# convert 'DayAfterDiwali' into factor
airline.df$DayAfterDiwali <- as.factor(airline.df$DayAfterDiwali)
# convert 'MetroDeparture' into factor
airline.df$MetroDeparture <- as.factor(airline.df$MetroDeparture)
# convert 'MetroArrival' into factor
airline.df$MetroArrival <- as.factor(airline.df$MetroArrival)
# verify
str(airline.df)
'data.frame': 305 obs. of 21 variables:
$ FlightNumber : Factor w/ 63 levels "6E 129","6E 155",..: 1 1 1 1 1 2 2 2 2 2 ...
$ Airline : Factor w/ 4 levels "Air India","IndiGo",..: 2 2 2 2 2 2 2 2 2 2 ...
$ DepartureCityCode : Factor w/ 2 levels "BOM","DEL": 2 2 2 2 2 2 2 2 2 2 ...
$ ArrivalCityCode : Factor w/ 2 levels "BOM","DEL": 1 1 1 1 1 1 1 1 1 1 ...
$ DepartureTime : int 1730 1730 1730 1730 1730 2225 2225 2225 2225 2225 ...
$ ArrivalTime : int 1945 1945 1945 1945 1945 40 40 40 40 40 ...
$ FlyingMinutes : int 135 135 135 135 135 135 135 135 135 135 ...
$ Aircraft : Factor w/ 2 levels "Airbus","Boeing": 1 1 1 1 1 1 1 1 1 1 ...
$ PlaneModel : Factor w/ 9 levels "738","739","77W",..: 6 6 6 6 6 6 6 6 6 6 ...
$ Capacity : int 180 180 180 180 180 180 180 180 180 180 ...
$ SeatPitch : int 30 30 30 30 30 30 30 30 30 30 ...
$ SeatWidth : num 18 18 18 18 18 18 18 18 18 18 ...
$ DataCollectionDate : Factor w/ 7 levels "Sep 10 2018",..: 1 1 1 1 1 1 1 1 1 1 ...
$ DateDeparture : Factor w/ 20 levels "Nov 6 2018","Nov 8 2018",..: 3 14 2 11 1 2 11 14 1 3 ...
$ Price : int 3031 3990 4234 4597 4234 4234 5153 3194 2607 2712 ...
$ AdvancedBookingDays: int 30 7 59 2 57 59 2 7 57 30 ...
$ IsDiwali : Factor w/ 2 levels "0","1": 1 1 2 1 2 2 1 1 2 1 ...
$ DayBeforeDiwali : Factor w/ 2 levels "0","1": 1 1 1 1 2 1 1 1 2 1 ...
$ DayAfterDiwali : Factor w/ 2 levels "0","1": 1 1 2 1 1 2 1 1 1 1 ...
$ MetroDeparture : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
$ MetroArrival : Factor w/ 1 level "1": 1 1 1 1 1 1 1 1 1 1 ...
# attaching data columns
attach(airline.df)
# descriptive statistics
library(psych)
describe(airline.df)[, c(1:5, 8:10)]
vars n mean sd median min max range
FlightNumber* 1 305 31.86 18.35 32 1 63 62
Airline* 2 305 2.60 0.88 3 1 4 3
DepartureCityCode* 3 305 1.57 0.50 2 1 2 1
ArrivalCityCode* 4 305 1.43 0.50 1 1 2 1
DepartureTime 5 305 1249.54 579.86 1035 225 2320 2095
ArrivalTime 6 305 1329.31 613.52 1215 20 2345 2325
FlyingMinutes 7 305 140.75 13.74 135 125 180 55
Aircraft* 8 305 1.54 0.50 2 1 2 1
PlaneModel* 9 305 3.82 2.71 3 1 9 8
Capacity 10 305 176.36 32.39 180 138 303 165
SeatPitch 11 305 30.26 0.93 30 29 33 4
SeatWidth 12 305 17.41 0.49 17 17 18 1
DataCollectionDate* 13 305 4.36 1.98 5 1 7 6
DateDeparture* 14 305 8.14 6.69 7 1 20 19
Price 15 305 5394.54 2388.29 4681 2607 18015 15408
AdvancedBookingDays 16 305 28.90 22.30 30 2 61 59
IsDiwali* 17 305 1.40 0.49 1 1 2 1
DayBeforeDiwali* 18 305 1.19 0.40 1 1 2 1
DayAfterDiwali* 19 305 1.20 0.40 1 1 2 1
MetroDeparture* 20 305 1.00 0.00 1 1 1 0
MetroArrival* 21 305 1.00 0.00 1 1 1 0
# flights by airline in Diwali
table(Airline, IsDiwali)
IsDiwali
Airline 0 1
Air India 25 16
IndiGo 48 32
Jet 87 57
Spice Jet 24 16
# average ticket price by airline
agg0 <- aggregate(list(Price, AdvancedBookingDays), by = list(Airline), mean)
colnames(agg0) <- c("Airline", "AverageTicketPrice", "AdvancedBookingDays")
agg0
Airline AverageTicketPrice AdvancedBookingDays
1 Air India 6335.000 26.78049
2 IndiGo 4879.525 31.40000
3 Jet 5496.146 28.54167
4 Spice Jet 5094.850 27.40000
# average ticket price by airline
agg1 <- aggregate(Price, by = list(Airline), mean)
colnames(agg1) <- c("Airline", "AverageTicketPrice")
#agg1
# sd ticket price by airline
agg2 <- aggregate(Price, by = list(Airline), sd)
colnames(agg2) <- c("Airline", "SD")
#agg2
# average ticket price and standard deviation
agg <- merge(agg1, agg2, by = c("Airline"))
agg
Airline AverageTicketPrice SD
1 Air India 6335.000 2693.637
2 IndiGo 4879.525 2956.455
3 Jet 5496.146 1989.659
4 Spice Jet 5094.850 1787.422
# OLS model
fitOLSModel <- lm(Price ~
IsDiwali
+ AdvancedBookingDays
+ Airline
+ Capacity
+ SeatPitch
+ SeatWidth
+ FlyingMinutes
, data = airline.df)
# summary of the OLS model
summary(fitOLSModel)
Call:
lm(formula = Price ~ IsDiwali + AdvancedBookingDays + Airline +
Capacity + SeatPitch + SeatWidth + FlyingMinutes, data = airline.df)
Residuals:
Min 1Q Median 3Q Max
-2947.4 -1289.5 -468.4 528.8 12270.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2198.063 10041.966 -0.219 0.8269
IsDiwali1 4465.051 583.811 7.648 2.9e-13 ***
AdvancedBookingDays -88.691 12.866 -6.894 3.3e-11 ***
AirlineIndiGo -1794.835 750.202 -2.392 0.0174 *
AirlineJet -1163.088 456.954 -2.545 0.0114 *
AirlineSpice Jet -955.081 732.845 -1.303 0.1935
Capacity -12.550 7.268 -1.727 0.0853 .
SeatPitch 90.619 289.315 0.313 0.7543
SeatWidth 629.593 575.502 1.094 0.2749
FlyingMinutes -13.908 9.960 -1.396 0.1637
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Residual standard error: 2155 on 295 degrees of freedom
Multiple R-squared: 0.2102, Adjusted R-squared: 0.1861
F-statistic: 8.724 on 9 and 295 DF, p-value: 1.258e-11