Reading the data

# Load the airline pricing dataset from the current working directory.
data.df <- read.csv("AirlinePricingData.csv")
# View() opens a spreadsheet-style data viewer, which may be unavailable when
# the script is run non-interactively (e.g. Rscript or knitting), so only
# open it when a user session is attached.
if (interactive()) {
  View(data.df)
}

Number of columns and their headings

# Print the column headings of the data frame (the printed index also shows
# how many columns there are).
names(data.df)
##  [1] "FlightNumber"        "Airline"             "DepartureCityCode"  
##  [4] "ArrivalCityCode"     "DepartureTime"       "ArrivalTime"        
##  [7] "Departure"           "FlyingMinutes"       "Aircraft"           
## [10] "PlaneModel"          "Capacity"            "SeatPitch"          
## [13] "SeatWidth"           "DataCollectionDate"  "DateDeparture"      
## [16] "IsWeekend"           "Price"               "AdvancedBookingDays"
## [19] "IsDiwali"            "DayBeforeDiwali"     "DayAfterDiwali"     
## [22] "MetroDeparture"      "MetroArrival"        "MarketShare"        
## [25] "LoadFactor"

Q1a. Write R code to generate the correlation matrix for the given continuous variables {“Price”, “AdvancedBookingDays”, “FlyingMinutes”, “Capacity”, “SeatPitch”, “SeatWidth”}

# Keep only the six continuous variables named in Q1a, then print their
# pairwise Pearson correlation matrix.
Var <- data.df[, c("Price", "AdvancedBookingDays", "FlyingMinutes",
                   "Capacity", "SeatPitch", "SeatWidth")]
cor(Var, method = "pearson")
##                            Price AdvancedBookingDays FlyingMinutes
## Price                1.000000000        -0.009177029  -0.018219539
## AdvancedBookingDays -0.009177029         1.000000000   0.005109801
## FlyingMinutes       -0.018219539         0.005109801   1.000000000
## Capacity            -0.025983460        -0.008817351  -0.320284501
## SeatPitch            0.074540632        -0.014181686  -0.034175640
## SeatWidth           -0.059567815         0.054692981  -0.182982214
##                         Capacity   SeatPitch   SeatWidth
## Price               -0.025983460  0.07454063 -0.05956782
## AdvancedBookingDays -0.008817351 -0.01418169  0.05469298
## FlyingMinutes       -0.320284501 -0.03417564 -0.18298221
## Capacity             1.000000000  0.50652704  0.45303789
## SeatPitch            0.506527044  1.00000000  0.31946224
## SeatWidth            0.453037895  0.31946224  1.00000000

Q1b. Write R code to generate the correlation matrix, along with their significance values, for the given continuous variables {“Price”, “AdvancedBookingDays”, “FlyingMinutes”, “Capacity”, “SeatPitch”, “SeatWidth”}

# Rebuild the subset of continuous variables used for the significance tests.
Var <- subset(
  data.df,
  select = c("Price", "AdvancedBookingDays", "FlyingMinutes",
             "Capacity", "SeatPitch", "SeatWidth")
)
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Correlation matrix together with the number of observations and the
# p-value of each pairwise correlation; the data frame is passed as a matrix.
rcorr(as.matrix(Var), type = "pearson")
##                     Price AdvancedBookingDays FlyingMinutes Capacity
## Price                1.00               -0.01         -0.02    -0.03
## AdvancedBookingDays -0.01                1.00          0.01    -0.01
## FlyingMinutes       -0.02                0.01          1.00    -0.32
## Capacity            -0.03               -0.01         -0.32     1.00
## SeatPitch            0.07               -0.01         -0.03     0.51
## SeatWidth           -0.06                0.05         -0.18     0.45
##                     SeatPitch SeatWidth
## Price                    0.07     -0.06
## AdvancedBookingDays     -0.01      0.05
## FlyingMinutes           -0.03     -0.18
## Capacity                 0.51      0.45
## SeatPitch                1.00      0.32
## SeatWidth                0.32      1.00
## 
## n= 305 
## 
## 
## P
##                     Price  AdvancedBookingDays FlyingMinutes Capacity
## Price                      0.8732              0.7513        0.6513  
## AdvancedBookingDays 0.8732                     0.9292        0.8781  
## FlyingMinutes       0.7513 0.9292                            0.0000  
## Capacity            0.6513 0.8781              0.0000                
## SeatPitch           0.1942 0.8052              0.5521        0.0000  
## SeatWidth           0.2998 0.3411              0.0013        0.0000  
##                     SeatPitch SeatWidth
## Price               0.1942    0.2998   
## AdvancedBookingDays 0.8052    0.3411   
## FlyingMinutes       0.5521    0.0013   
## Capacity            0.0000    0.0000   
## SeatPitch                     0.0000   
## SeatWidth           0.0000

Q1c. Write R code to visualize the correlation matrix in Q1b.

library(corrgram)
## 
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
## 
##     panel.fill
# Draw a corrgram of the selected variables: the lower triangle uses
# panel.conf, the upper triangle uses panel.pie, and the diagonal shows the
# variable names via panel.txt. order = TRUE lets corrgram re-order the
# variables.
corrgram(
  Var,
  order = TRUE,
  lower.panel = panel.conf,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of Variable intercorrelations"
)

Q1e. Write R code to generate the following corrgram. (Hint: This is a repeat of the previous question, where you had flexibility to create the corrgram of YOUR choice. Here, you will need to use package PerformanceAnalytics.)

library("PerformanceAnalytics")
## Loading required package: xts
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
# Correlation chart from PerformanceAnalytics for the selected variables,
# with histograms requested and solid-circle plotting symbols (pch = 19).
chart.Correlation(
  Var,
  histogram = TRUE,
  pch = 19
)

Q2a. Test whether the ticket prices of Mumbai to Delhi flights are more than INR 5000.

# Subset to Mumbai (BOM) -> Delhi (DEL) flights and summarise their prices.
price1 <- data.df[data.df$DepartureCityCode == "BOM" &
                    data.df$ArrivalCityCode == "DEL", ]
library(psych)
describe(price1$Price)

# Sample mean of Price over all rows. Column names are case-sensitive, so the
# column must be spelled "Price"; a lower-case "price" returns NULL and the
# mean becomes NA with a warning.
m <- mean(data.df$Price)

# One-sample, one-sided t-test of H0: mean price = 5000 against
# H1: mean price > 5000.
# NOTE(review): this tests every row of data.df, not only the BOM -> DEL
# subset held in price1 -- confirm which population Q2a is meant to cover.
t.test(data.df$Price, mu = 5000, alternative = "greater")
## 
##  One Sample t-test
## 
## data:  data.df$Price
## t = 2.8851, df = 304, p-value = 0.002096
## alternative hypothesis: true mean is greater than 5000
## 95 percent confidence interval:
##  5168.918      Inf
## sample estimates:
## mean of x 
##  5394.544

The p-value (0.002) is less than 0.05, so we reject the null hypothesis.

Thus, Price of flights is higher than INR 5000

Q2b. Test whether the ticket prices of morning flights are greater than the afternoon flights

# Ticket prices split by departure period: "AM" rows versus "PM" rows.
morning_flights <- data.df$Price[which(data.df$Departure == "AM")]
afternoon_flights <- data.df$Price[which(data.df$Departure == "PM")]

# Unpaired two-sample t-test comparing the two groups of prices.
# NOTE(review): Q2b asks a directional question ("greater than"), but this
# call uses the default two-sided alternative -- confirm whether
# alternative = "greater" was intended.
t.test(x = morning_flights, y = afternoon_flights, paired = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  morning_flights and afternoon_flights
## t = 1.736, df = 296.58, p-value = 0.08359
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -61.22997 977.79637
## sample estimates:
## mean of x mean of y 
##  5598.893  5140.610

The p-value (0.084) is greater than 0.05, so we fail to reject the null hypothesis.

Thus, at the 5% level this two-sided test finds no significant difference in ticket prices between morning and afternoon flights.

Q2c. Test whether the ticket prices around Diwali is more compared to non-Diwali ticket prices.

# Ticket prices for rows flagged as Diwali (IsDiwali == 1) and for the
# remaining rows (IsDiwali == 0).
DiwaliY <- with(data.df, Price[which(IsDiwali == 1)])
DiwaliN <- with(data.df, Price[which(IsDiwali == 0)])

# Unpaired two-sample t-test with the default two-sided alternative:
# do the mean prices differ at all?
t.test(x = DiwaliY, y = DiwaliN, paired = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  DiwaliY and DiwaliN
## t = 2.9799, df = 244.52, p-value = 0.003174
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   282.6141 1384.7250
## sample estimates:
## mean of x mean of y 
##  5897.479  5063.810

The p-value is less than 0.05, so we reject the null hypothesis.

Thus, ticket prices differ between Diwali and non-Diwali periods.

# Rebuild the Diwali / non-Diwali price vectors.
DiwaliY <- subset(data.df, IsDiwali == 1)$Price
DiwaliN <- subset(data.df, IsDiwali == 0)$Price

# One-sided unpaired t-test: is the mean Diwali price greater than the mean
# non-Diwali price (difference in means > 0)?
t.test(
  x = DiwaliY,
  y = DiwaliN,
  alternative = "greater",
  mu = 0,
  paired = FALSE
)
## 
##  Welch Two Sample t-test
## 
## data:  DiwaliY and DiwaliN
## t = 2.9799, df = 244.52, p-value = 0.001587
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  371.7482      Inf
## sample estimates:
## mean of x mean of y 
##  5897.479  5063.810

The p-value is less than 0.05, so we reject the null hypothesis.

Thus, Price during Diwali is higher.

Q2d. Test whether the ticket prices on Air India flights are greater than IndiGo flights

# Ticket prices for the two airlines being compared.
air_india_flights <- data.df$Price[which(data.df$Airline == "Air India")]
indigo_flights <- data.df$Price[which(data.df$Airline == "IndiGo")]

# Unpaired two-sample t-test with the default two-sided alternative:
# do the mean prices of the two airlines differ?
t.test(x = air_india_flights, y = indigo_flights, paired = FALSE)
## 
##  Welch Two Sample t-test
## 
## data:  air_india_flights and indigo_flights
## t = 2.7205, df = 87.71, p-value = 0.007859
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##   392.2247 2518.7253
## sample estimates:
## mean of x mean of y 
##  6335.000  4879.525

The p-value is less than 0.05, so we reject the null hypothesis.

Thus, prices of IndiGo and Air India vary.

# Rebuild the per-airline price vectors.
air_india_flights <- with(data.df, Price[which(Airline == "Air India")])
indigo_flights <- with(data.df, Price[which(Airline == "IndiGo")])

# One-sided unpaired t-test: is the mean Air India price greater than the
# mean IndiGo price (difference in means > 0)?
t.test(
  x = air_india_flights,
  y = indigo_flights,
  alternative = "greater",
  mu = 0,
  paired = FALSE
)
## 
##  Welch Two Sample t-test
## 
## data:  air_india_flights and indigo_flights
## t = 2.7205, df = 87.71, p-value = 0.00393
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  566.0833      Inf
## sample estimates:
## mean of x mean of y 
##  6335.000  4879.525

The p-value is less than 0.05, so we reject the null hypothesis.

Thus, Price of Air India is higher than IndiGo.

Q3a. Run a simple linear regression of airline ticket Price on the Advanced Booking Days. Write R code to output the summary of the model.

# Reload the dataset, regress Price on AdvancedBookingDays with ordinary
# least squares, and print the model summary.
data.df <- read.csv("AirlinePricingData.csv")
fit <- lm(formula = Price ~ AdvancedBookingDays, data = data.df)
summary(fit)
## 
## Call:
## lm(formula = Price ~ AdvancedBookingDays, data = data.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2786.5 -1320.8  -688.9   351.2 12594.0 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5422.959    224.497   24.16   <2e-16 ***
## AdvancedBookingDays   -0.983      6.154   -0.16    0.873    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2392 on 303 degrees of freedom
## Multiple R-squared:  8.422e-05,  Adjusted R-squared:  -0.003216 
## F-statistic: 0.02552 on 1 and 303 DF,  p-value: 0.8732