Reading local data in R

# Read the airline pricing CSV from the working directory into the
# `airline.df` data frame.
airline.df <- read.csv("AirlinePricingData - AirlinePricingData.csv")
# Number of rows and columns.
dim(airline.df)
## [1] 305  25

Q1a. Write R code to generate the correlation matrix for the given continuous variables {“Price”, “AdvancedBookingDays”, “FlyingMinutes”, “Capacity”, “SeatPitch”, “SeatWidth”}

# Continuous variables used in the correlation analysis. Selecting them by
# name rather than by column position keeps the subset correct (and readable)
# even if the column order of the CSV changes.
continuous_vars <- c("Price", "AdvancedBookingDays", "FlyingMinutes",
                     "Capacity", "SeatPitch", "SeatWidth")
airlines <- airline.df[, continuous_vars]

## Pearson Correlation
round(cor(airlines), 2)
##                     Price AdvancedBookingDays FlyingMinutes Capacity
## Price                1.00               -0.01         -0.02    -0.03
## AdvancedBookingDays -0.01                1.00          0.01    -0.01
## FlyingMinutes       -0.02                0.01          1.00    -0.32
## Capacity            -0.03               -0.01         -0.32     1.00
## SeatPitch            0.07               -0.01         -0.03     0.51
## SeatWidth           -0.06                0.05         -0.18     0.45
##                     SeatPitch SeatWidth
## Price                    0.07     -0.06
## AdvancedBookingDays     -0.01      0.05
## FlyingMinutes           -0.03     -0.18
## Capacity                 0.51      0.45
## SeatPitch                1.00      0.32
## SeatWidth                0.32      1.00
## Spearman Correlation
# Rank-based correlation matrix of the same variables, shown to 2 decimals.
spearman_cor <- cor(airlines, method = "spearman")
round(spearman_cor, 2)
##                     Price AdvancedBookingDays FlyingMinutes Capacity
## Price                1.00               -0.09          0.02    -0.07
## AdvancedBookingDays -0.09                1.00          0.02    -0.01
## FlyingMinutes        0.02                0.02          1.00    -0.34
## Capacity            -0.07               -0.01         -0.34     1.00
## SeatPitch            0.15                0.00          0.02     0.22
## SeatWidth           -0.15                0.10         -0.18     0.54
##                     SeatPitch SeatWidth
## Price                    0.15     -0.15
## AdvancedBookingDays      0.00      0.10
## FlyingMinutes            0.02     -0.18
## Capacity                 0.22      0.54
## SeatPitch                1.00      0.32
## SeatWidth                0.32      1.00

Q1b. Write R code to generate the correlation matrix with their significance values for the given continuous variables {“Price”, “AdvancedBookingDays”, “FlyingMinutes”, “Capacity”, “SeatPitch”, “SeatWidth”}

# psych supplies corr.test(), which prints the correlation matrix together
# with the sample size and the p-value of every pairwise correlation.
library(psych)
corr_with_p <- corr.test(x = airlines, use = "complete")
print(corr_with_p)
## Call:corr.test(x = airlines, use = "complete")
## Correlation matrix 
##                     Price AdvancedBookingDays FlyingMinutes Capacity
## Price                1.00               -0.01         -0.02    -0.03
## AdvancedBookingDays -0.01                1.00          0.01    -0.01
## FlyingMinutes       -0.02                0.01          1.00    -0.32
## Capacity            -0.03               -0.01         -0.32     1.00
## SeatPitch            0.07               -0.01         -0.03     0.51
## SeatWidth           -0.06                0.05         -0.18     0.45
##                     SeatPitch SeatWidth
## Price                    0.07     -0.06
## AdvancedBookingDays     -0.01      0.05
## FlyingMinutes           -0.03     -0.18
## Capacity                 0.51      0.45
## SeatPitch                1.00      0.32
## SeatWidth                0.32      1.00
## Sample Size 
## [1] 305
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##                     Price AdvancedBookingDays FlyingMinutes Capacity
## Price                0.00                1.00          1.00        1
## AdvancedBookingDays  0.87                0.00          1.00        1
## FlyingMinutes        0.75                0.93          0.00        0
## Capacity             0.65                0.88          0.00        0
## SeatPitch            0.19                0.81          0.55        0
## SeatWidth            0.30                0.34          0.00        0
##                     SeatPitch SeatWidth
## Price                       1      1.00
## AdvancedBookingDays         1      1.00
## FlyingMinutes               1      0.01
## Capacity                    0      0.00
## SeatPitch                   0      0.00
## SeatWidth                   0      0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

Q1d. Write R code to visualize Q1b.

library(corrgram)
# Palette generator (darkgoldenrod4 through darkgreen) handed to corrgram
# through col.regions.
cols <- colorRampPalette(
  c("darkgoldenrod4", "burlywood1", "darkkhaki", "darkgreen")
)
# Shaded cells are drawn below the diagonal, the panel.conf panels above it,
# and the variable names on the diagonal; order = TRUE lets corrgram reorder
# the variables.
corrgram(
  airlines,
  order = TRUE,
  col.regions = cols,
  lower.panel = panel.shade,
  upper.panel = panel.conf,
  text.panel = panel.txt,
  main = "A Corrgram for airlines correlations"
)

Q1e. Write R code to generate the following visual

library(PerformanceAnalytics)
# Correlation chart of the selected variables with histograms enabled;
# pch = 19 requests solid circles as the plotting symbol.
chart.Correlation(
  airlines,
  histogram = TRUE,
  pch = 19
)

Q2a. Test whether the mean ticket price of Mumbai to Delhi flights is more than INR 5000

# Prices of flights departing from BOM and arriving at DEL.
is_bom_to_del <- airline.df$DepartureCityCode == "BOM" &
  airline.df$ArrivalCityCode == "DEL"
airline_price <- airline.df[which(is_bom_to_del), "Price"]

# The question is directional ("more than INR 5000"), so the alternative
# hypothesis must be one-sided: H0: mean <= 5000 vs. H1: mean > 5000.
# t.test() defaults to a two-sided alternative, which answers a different
# question.
# NOTE: the recorded output that follows this chunk was produced by the
# two-sided call; re-run the chunk to refresh it.
t.test(airline_price, mu = 5000, alternative = "greater")
## 
##  One Sample t-test
## 
## data:  airline_price
## t = 6.0784, df = 129, p-value = 1.277e-08
## alternative hypothesis: true mean is not equal to 5000
## 95 percent confidence interval:
##  5844.506 6659.601
## sample estimates:
## mean of x 
##  6252.054

Q2b. Test whether the ticket prices of morning flights are higher than those of afternoon flights

# Prices split by the Departure flag: "AM" rows vs. "PM" rows.
morning_flights <- airline.df[which(airline.df$Departure == "AM"), "Price"]

afternoon_flights <- airline.df[which(airline.df$Departure == "PM"), "Price"]

# Directional question ("morning costlier than afternoon"), so test
# H1: mean(morning) > mean(afternoon) with a one-sided alternative instead of
# the two-sided default. The samples are independent (paired = FALSE).
# NOTE: the recorded output that follows this chunk was produced by the
# two-sided call; re-run the chunk to refresh it.
t.test(morning_flights, afternoon_flights,
       alternative = "greater", paired = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  morning_flights and afternoon_flights
## t = 1.736, df = 296.58, p-value = 0.08359
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -61.22997 977.79637
## sample estimates:
## mean of x mean of y 
##  5598.893  5140.610

Q2c. Test whether the ticket price at Diwali is higher compared to non-Diwali.

# A row counts as a Diwali-period flight when any of the three indicator
# columns is 1 (Diwali itself, the day before, or the day after).
is_diwali_period <- airline.df$IsDiwali == 1 |
  airline.df$DayBeforeDiwali == 1 |
  airline.df$DayAfterDiwali == 1
diwali_flights <- airline.df[which(is_diwali_period), "Price"]

# Non-Diwali rows are those where all three indicators are 0.
is_non_diwali <- airline.df$IsDiwali == 0 &
  airline.df$DayBeforeDiwali == 0 &
  airline.df$DayAfterDiwali == 0
non_diwali_flights <- airline.df[which(is_non_diwali), "Price"]

# Directional question ("price at Diwali is more"), so test
# H1: mean(Diwali) > mean(non-Diwali) with a one-sided alternative instead of
# the two-sided default.
# NOTE: the recorded output that follows this chunk was produced by the
# two-sided call; re-run the chunk to refresh it.
t.test(diwali_flights, non_diwali_flights,
       alternative = "greater", paired = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  diwali_flights and non_diwali_flights
## t = 2.9799, df = 244.52, p-value = 0.003174
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   282.6141 1384.7250
## sample estimates:
## mean of x mean of y 
##  5897.479  5063.810

Q2d. Test whether the ticket price for Air India flights is higher than for IndiGo flights.

# Prices for the two carriers being compared.
air_india_flights <- airline.df[which(airline.df$Airline == "Air India"),
                                "Price"]

indigo_flights <- airline.df[which(airline.df$Airline == "IndiGo"), "Price"]

# Directional question ("Air India costlier than IndiGo"), so test
# H1: mean(Air India) > mean(IndiGo) with a one-sided alternative instead of
# the two-sided default.
# NOTE: the recorded output that follows this chunk was produced by the
# two-sided call; re-run the chunk to refresh it.
t.test(air_india_flights, indigo_flights,
       alternative = "greater", paired = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  air_india_flights and indigo_flights
## t = 2.7205, df = 87.71, p-value = 0.007859
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   392.2247 2518.7253
## sample estimates:
## mean of x mean of y 
##  6335.000  4879.525

Q3a. Regress Price on Advanced Booking Days. Write R code to construct a simple linear regression model and output the summary of the model.

# Per-column descriptive statistics for the full data set.
psych::describe(x = airline.df)
##                     vars   n    mean      sd  median trimmed     mad
## FlightNumber*          1 305   31.86   18.35   32.00   31.82   23.72
## Airline*               2 305    2.60    0.88    3.00    2.62    1.48
## DepartureCityCode*     3 305    1.57    0.50    2.00    1.59    0.00
## ArrivalCityCode*       4 305    1.43    0.50    1.00    1.41    0.00
## DepartureTime          5 305 1249.54  579.86 1035.00 1237.63  593.04
## ArrivalTime            6 305 1329.31  613.52 1215.00 1347.73  585.63
## Departure*             7 305    1.45    0.50    1.00    1.43    0.00
## FlyingMinutes          8 305  136.03    4.71  135.00  135.80    7.41
## Aircraft*              9 305    1.54    0.50    2.00    1.55    0.00
## PlaneModel*           10 305    3.82    2.71    3.00    3.63    2.97
## Capacity              11 305  176.36   32.39  180.00  172.19   14.83
## SeatPitch             12 305   30.26    0.93   30.00   30.17    0.00
## SeatWidth             13 305   17.41    0.49   17.00   17.38    0.00
## DataCollectionDate*   14 305    4.36    1.98    5.00    4.45    1.48
## DateDeparture*        15 305    8.14    6.69    7.00    7.64    8.90
## IsWeekend*            16 305    1.13    0.34    1.00    1.04    0.00
## Price                 17 305 5394.54 2388.29 4681.00 4984.80 1052.65
## AdvancedBookingDays   18 305   28.90   22.30   30.00   28.50   34.10
## IsDiwali              19 305    0.40    0.49    0.00    0.37    0.00
## DayBeforeDiwali       20 305    0.19    0.40    0.00    0.12    0.00
## DayAfterDiwali        21 305    0.20    0.40    0.00    0.13    0.00
## MetroDeparture        22 305    1.00    0.00    1.00    1.00    0.00
## MetroArrival          23 305    1.00    0.00    1.00    1.00    0.00
## MarketShare           24 305   21.18   11.04   15.40   19.90    3.11
## LoadFactor            25 305   85.13    4.32   83.32   84.82    5.75
##                         min      max    range  skew kurtosis     se
## FlightNumber*          1.00    63.00    62.00  0.01    -1.22   1.05
## Airline*               1.00     4.00     3.00 -0.32    -0.61   0.05
## DepartureCityCode*     1.00     2.00     1.00 -0.30    -1.92   0.03
## ArrivalCityCode*       1.00     2.00     1.00  0.30    -1.92   0.03
## DepartureTime        225.00  2320.00  2095.00  0.22    -1.29  33.20
## ArrivalTime           20.00  2345.00  2325.00 -0.07    -0.76  35.13
## Departure*             1.00     2.00     1.00  0.22    -1.96   0.03
## FlyingMinutes        125.00   145.00    20.00  0.28    -0.33   0.27
## Aircraft*              1.00     2.00     1.00 -0.16    -1.98   0.03
## PlaneModel*            1.00     9.00     8.00  0.23    -1.60   0.16
## Capacity             138.00   303.00   165.00  2.11     5.91   1.85
## SeatPitch             29.00    33.00     4.00  1.03     0.61   0.05
## SeatWidth             17.00    18.00     1.00  0.37    -1.86   0.03
## DataCollectionDate*    1.00     7.00     6.00 -0.38    -1.15   0.11
## DateDeparture*         1.00    20.00    19.00  0.40    -1.38   0.38
## IsWeekend*             1.00     2.00     1.00  2.13     2.56   0.02
## Price               2607.00 18015.00 15408.00  2.26     6.41 136.75
## AdvancedBookingDays    2.00    61.00    59.00  0.03    -1.68   1.28
## IsDiwali               0.00     1.00     1.00  0.42    -1.83   0.03
## DayBeforeDiwali        0.00     1.00     1.00  1.54     0.39   0.02
## DayAfterDiwali         0.00     1.00     1.00  1.47     0.15   0.02
## MetroDeparture         1.00     1.00     0.00   NaN      NaN   0.00
## MetroArrival           1.00     1.00     0.00   NaN      NaN   0.00
## MarketShare           13.20    39.60    26.40  1.05    -0.86   0.63
## LoadFactor            78.73    94.06    15.33  0.71     0.06   0.25
# Scatter plot of the response (Price, y-axis) against the predictor
# (AdvancedBookingDays, x-axis), matching the direction of the regression.
# The formula/data interface is used instead of attach(), which copies the
# columns onto the search path and can silently mask other objects.
plot(Price ~ AdvancedBookingDays, data = airline.df,
     xlab = "AdvancedBookingDays", ylab = "Price")

# Simple linear regression of Price on AdvancedBookingDays.
fit <- lm(Price ~ AdvancedBookingDays, data = airline.df)
summary(fit)
## 
## Call:
## lm(formula = Price ~ AdvancedBookingDays, data = airline.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2786.5 -1320.8  -688.9   351.2 12594.0 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5422.959    224.497   24.16   <2e-16 ***
## AdvancedBookingDays   -0.983      6.154   -0.16    0.873    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2392 on 303 degrees of freedom
## Multiple R-squared:  8.422e-05,  Adjusted R-squared:  -0.003216 
## F-statistic: 0.02552 on 1 and 303 DF,  p-value: 0.8732

Q3b. Regress Price on Advanced Booking Days and IsDiwali. Write R code to construct a multiple linear regression model and output the summary of the model

# Plot the response (Price, y-axis) against the IsDiwali indicator (x-axis).
# The formula/data interface is used instead of attach(): attaching the same
# data frame again only adds a duplicate entry to the search path and
# triggers masking messages.
plot(Price ~ IsDiwali, data = airline.df,
     xlab = "IsDiwali", ylab = "Price")

# Multiple linear regression of Price on AdvancedBookingDays and IsDiwali.
fit <- lm(Price ~ AdvancedBookingDays + IsDiwali, data = airline.df)
summary(fit)
## 
## Call:
## lm(formula = Price ~ AdvancedBookingDays + IsDiwali, data = airline.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3222.7 -1210.4  -498.1   738.3 11972.6 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          6225.95     229.19  27.165  < 2e-16 ***
## AdvancedBookingDays   -91.77      12.88  -7.126 7.64e-12 ***
## IsDiwali             4590.92     585.99   7.834 8.08e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2184 on 302 degrees of freedom
## Multiple R-squared:  0.169,  Adjusted R-squared:  0.1635 
## F-statistic:  30.7 on 2 and 302 DF,  p-value: 7.264e-13