Creating the dataset

# Print the working directory; the relative CSV path below is resolved against it.
getwd()
## [1] "C:/Users/Risheel/Desktop/2018 DAM"
# Read the airline pricing data into a data frame.
airlines <- read.csv(file = "AirlinePricingData.csv")
library("dplyr")
## Warning: package 'dplyr' was built under R version 3.5.1
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Keep only the six continuous variables used in the correlation questions.
airlines1 <- select(
  airlines,
  Price, AdvancedBookingDays, FlyingMinutes,
  Capacity, SeatPitch, SeatWidth
)

Q1a. Write R code to generate the correlation matrix for the given continuous variables {“Price”, “AdvancedBookingDays”, “FlyingMinutes”, “Capacity”, “SeatPitch”, “SeatWidth”}

# Pairwise correlation matrix of the selected columns, printed to 2 decimals.
Matrix1 <- cor(x = airlines1)
round(Matrix1, digits = 2)
##                     Price AdvancedBookingDays FlyingMinutes Capacity
## Price                1.00               -0.01         -0.02    -0.03
## AdvancedBookingDays -0.01                1.00          0.01    -0.01
## FlyingMinutes       -0.02                0.01          1.00    -0.32
## Capacity            -0.03               -0.01         -0.32     1.00
## SeatPitch            0.07               -0.01         -0.03     0.51
## SeatWidth           -0.06                0.05         -0.18     0.45
##                     SeatPitch SeatWidth
## Price                    0.07     -0.06
## AdvancedBookingDays     -0.01      0.05
## FlyingMinutes           -0.03     -0.18
## Capacity                 0.51      0.45
## SeatPitch                1.00      0.32
## SeatWidth                0.32      1.00

Q1b. Write R code to generate the correlation matrix, along with their significance values, for the given continuous variables {“Price”, “AdvancedBookingDays”, “FlyingMinutes”, “Capacity”, “SeatPitch”, “SeatWidth”}

library("psych")
## Warning: package 'psych' was built under R version 3.5.1
# Correlations together with their p-values; in the printed probability table
# the entries above the diagonal are adjusted for multiple tests.
corr.test(x = airlines1, use = "complete")
## Call:corr.test(x = airlines1, use = "complete")
## Correlation matrix 
##                     Price AdvancedBookingDays FlyingMinutes Capacity
## Price                1.00               -0.01         -0.02    -0.03
## AdvancedBookingDays -0.01                1.00          0.01    -0.01
## FlyingMinutes       -0.02                0.01          1.00    -0.32
## Capacity            -0.03               -0.01         -0.32     1.00
## SeatPitch            0.07               -0.01         -0.03     0.51
## SeatWidth           -0.06                0.05         -0.18     0.45
##                     SeatPitch SeatWidth
## Price                    0.07     -0.06
## AdvancedBookingDays     -0.01      0.05
## FlyingMinutes           -0.03     -0.18
## Capacity                 0.51      0.45
## SeatPitch                1.00      0.32
## SeatWidth                0.32      1.00
## Sample Size 
## [1] 305
## Probability values (Entries above the diagonal are adjusted for multiple tests.) 
##                     Price AdvancedBookingDays FlyingMinutes Capacity
## Price                0.00                1.00          1.00        1
## AdvancedBookingDays  0.87                0.00          1.00        1
## FlyingMinutes        0.75                0.93          0.00        0
## Capacity             0.65                0.88          0.00        0
## SeatPitch            0.19                0.81          0.55        0
## SeatWidth            0.30                0.34          0.00        0
##                     SeatPitch SeatWidth
## Price                       1      1.00
## AdvancedBookingDays         1      1.00
## FlyingMinutes               1      0.01
## Capacity                    0      0.00
## SeatPitch                   0      0.00
## SeatWidth                   0      0.00
## 
##  To see confidence intervals of the correlations, print with the short=FALSE option

Q1c. Write R code to visualize the correlation matrix in Q1b

library("corrgram")
## Warning: package 'corrgram' was built under R version 3.5.1
# Corrgram of the six variables: one panel function per region of the grid
# (lower triangle, upper triangle, diagonal and the variable-name text).
corrgram(
  airlines1,
  order = TRUE,
  lower.panel = panel.conf,
  upper.panel = panel.pts,
  text.panel = panel.txt,
  diag.panel = panel.minmax,
  main = "Corrgram of airlines data"
)

Q1e. Write R code to generate the following corrgram.

library("PerformanceAnalytics")
## Warning: package 'PerformanceAnalytics' was built under R version 3.5.1
## Loading required package: xts
## Warning: package 'xts' was built under R version 3.5.1
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 3.5.1
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Attaching package: 'xts'
## The following objects are masked from 'package:dplyr':
## 
##     first, last
## 
## Attaching package: 'PerformanceAnalytics'
## The following object is masked from 'package:graphics':
## 
##     legend
# Correlation chart of the six variables, with histograms switched on.
chart.Correlation(R = airlines1, histogram = TRUE)

Q2a. Test whether the ticket prices of Mumbai to Delhi flights are more than INR 5000.

# Keep only the Price column of flights departing BOM and arriving DEL.
MumDel <- subset(
  airlines,
  subset = airlines$DepartureCityCode == "BOM" &
    airlines$ArrivalCityCode == "DEL",
  select = Price
)
boxplot(MumDel$Price, xlab = "MumDel", ylab = "Price")

# One-sample, one-sided t-test: the alternative is that the true mean price
# is greater than 5000.
ttest1 <- t.test(MumDel$Price, alternative = "greater", mu = 5000)
ttest1
## 
##  One Sample t-test
## 
## data:  MumDel$Price
## t = 6.0784, df = 129, p-value = 6.385e-09
## alternative hypothesis: true mean is greater than 5000
## 95 percent confidence interval:
##  5910.787      Inf
## sample estimates:
## mean of x 
##  6252.054

The p-value of the one-sided t-test (6.385e-09) is far below the 0.05 significance level, so we reject the null hypothesis that the mean price is at most INR 5000 and conclude that the mean ticket price of flights from Mumbai to Delhi is greater than INR 5000.

Q2b. Test whether the ticket prices of morning flights are greater than the afternoon flights

# Recode Departure as a two-level factor: "AM" -> 1 (first level), "PM" -> 0.
airlines$morning <- factor(
  airlines$Departure,
  levels = c("AM", "PM"),
  labels = c(1, 0)
)
# Mean price per group, followed by a boxplot of the two groups.
aggregate(Price ~ morning, data = airlines, FUN = mean)
##   morning    Price
## 1       1 5598.893
## 2       0 5140.610
boxplot(
  Price ~ morning,
  data = airlines,
  main = "Price of morning and evening flight",
  xlab = "Flights (1= Morning, 0 =Evening)",
  ylab = "Price"
)

# Welch two-sample t-test; "greater" tests mean(group 1) - mean(group 0) > 0.
ttest2 <- t.test(Price ~ morning, data = airlines, alternative = "greater")
ttest2
## 
##  Welch Two Sample t-test
## 
## data:  Price by morning
## t = 1.736, df = 296.58, p-value = 0.0418
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  22.71262      Inf
## sample estimates:
## mean in group 1 mean in group 0 
##        5598.893        5140.610

At the 5% level of significance (p-value = 0.0418 < 0.05) we can reject the null hypothesis, so the ticket prices of morning (AM) flights are higher on average than those of afternoon (PM) flights.

Q2c. Test whether the ticket prices around Diwali is more compared to non-Diwali ticket prices.

# Mean price for the IsDiwali = 0 and IsDiwali = 1 groups, then a boxplot.
aggregate(Price ~ IsDiwali, data = airlines, FUN = mean)
##   IsDiwali    Price
## 1        0 5063.810
## 2        1 5897.479
boxplot(
  Price ~ IsDiwali,
  data = airlines,
  main = "Price of flight around Diwali and not",
  xlab = "Flights (1= Diwali, 0 =No Diwali)",
  ylab = "Price"
)

# Welch two-sample t-test. The groups are ordered 0 then 1, so "less" tests
# mean(group 0) - mean(group 1) < 0, i.e. prices being higher around Diwali.
ttest4 <- t.test(Price ~ IsDiwali, data = airlines, alternative = "less")
ttest4
## 
##  Welch Two Sample t-test
## 
## data:  Price by IsDiwali
## t = -2.9799, df = 244.52, p-value = 0.001587
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
##       -Inf -371.7482
## sample estimates:
## mean in group 0 mean in group 1 
##        5063.810        5897.479

At the 5% level of significance (p-value = 0.001587 < 0.05) we reject the null hypothesis: ticket prices are higher on average around Diwali than at other times.

Q2d. Test whether the ticket prices on Air India flights are greater than IndiGo flights

# Flag the two airlines being compared: "Air India" -> 1 (first level),
# "IndiGo" -> 0. Any other Airline value is not a listed level and becomes NA.
airlines$AirInd <- factor(
  airlines$Airline,
  levels = c("Air India", "IndiGo"),
  labels = c(1, 0)
)
# Mean price per airline group, followed by a boxplot of the two groups.
aggregate(Price ~ AirInd, data = airlines, FUN = mean)
##   AirInd    Price
## 1      1 6335.000
## 2      0 4879.525
boxplot(
  Price ~ AirInd,
  data = airlines,
  main = "Price of Air India and Indigo flight",
  xlab = "Flights (1= Air India, 0 =Indigo)",
  ylab = "Price"
)

# Welch two-sample t-test; "greater" tests mean(group 1) - mean(group 0) > 0.
ttest3 <- t.test(Price ~ AirInd, data = airlines, alternative = "greater")
ttest3
## 
##  Welch Two Sample t-test
## 
## data:  Price by AirInd
## t = 2.7205, df = 87.71, p-value = 0.00393
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
##  566.0833      Inf
## sample estimates:
## mean in group 1 mean in group 0 
##        6335.000        4879.525

At the 5% level of significance (p-value = 0.00393 < 0.05) we reject the null hypothesis: Air India ticket prices are higher on average than IndiGo ticket prices.

Q3a. Run a simple linear regression of airline ticket Price on the Advanced Booking Days. Write R code to output the summary of the model.

# Simple linear regression of ticket Price on AdvancedBookingDays.
# The formula uses bare column names together with `data = airlines` instead of
# `airlines$...` terms: the coefficient is then labelled by the column name and
# the fitted model can be used with predict(m, newdata = ...).
m <- lm(Price ~ AdvancedBookingDays, data = airlines)
summary(m)
## 
## Call:
## lm(formula = Price ~ AdvancedBookingDays, data = airlines)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2786.5 -1320.8  -688.9   351.2 12594.0 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5422.959    224.497   24.16   <2e-16 ***
## AdvancedBookingDays   -0.983      6.154   -0.16    0.873    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2392 on 303 degrees of freedom
## Multiple R-squared:  8.422e-05,  Adjusted R-squared:  -0.003216 
## F-statistic: 0.02552 on 1 and 303 DF,  p-value: 0.8732