Reading and viewing the data set

# Load the six-airlines data set and open it in the interactive data viewer.
# NOTE(review): setwd() ties the script to one machine's directory layout; it
# is kept here only because later steps may rely on the working directory.
setwd("~/SIP/SIP Phase 2/R Programming/Udemy Class Material/Week 3")
# stringsAsFactors = TRUE keeps the text columns (Airline, Aircraft,
# TravelMonth, IsInternational) as factors on R >= 4.0 as well, which
# summary() needs in order to report per-level counts.
airlines.df <- read.csv("SixAirlinesDataV2.csv", stringsAsFactors = TRUE)
View(airlines.df)

Summary Statistics

# Per-column overview of the data: quartiles and mean for numeric columns,
# level counts for factor columns.
summary(airlines.df)
##       Airline      Aircraft   FlightDuration   TravelMonth
##  AirFrance: 74   AirBus:151   Min.   : 1.250   Aug:127    
##  British  :175   Boeing:307   1st Qu.: 4.260   Jul: 75    
##  Delta    : 46                Median : 7.790   Oct:127    
##  Jet      : 61                Mean   : 7.578   Sep:129    
##  Singapore: 40                3rd Qu.:10.620              
##  Virgin   : 62                Max.   :14.660              
##       IsInternational  SeatsEconomy    SeatsPremium    PitchEconomy  
##  Domestic     : 40    Min.   : 78.0   Min.   : 8.00   Min.   :30.00  
##  International:418    1st Qu.:133.0   1st Qu.:21.00   1st Qu.:31.00  
##                       Median :185.0   Median :36.00   Median :31.00  
##                       Mean   :202.3   Mean   :33.65   Mean   :31.22  
##                       3rd Qu.:243.0   3rd Qu.:40.00   3rd Qu.:32.00  
##                       Max.   :389.0   Max.   :66.00   Max.   :33.00  
##   PitchPremium    WidthEconomy    WidthPremium    PriceEconomy 
##  Min.   :34.00   Min.   :17.00   Min.   :17.00   Min.   :  65  
##  1st Qu.:38.00   1st Qu.:18.00   1st Qu.:19.00   1st Qu.: 413  
##  Median :38.00   Median :18.00   Median :19.00   Median :1242  
##  Mean   :37.91   Mean   :17.84   Mean   :19.47   Mean   :1327  
##  3rd Qu.:38.00   3rd Qu.:18.00   3rd Qu.:21.00   3rd Qu.:1909  
##  Max.   :40.00   Max.   :19.00   Max.   :21.00   Max.   :3593  
##   PricePremium    PriceRelative      SeatsTotal  PitchDifference 
##  Min.   :  86.0   Min.   :0.0200   Min.   : 98   Min.   : 2.000  
##  1st Qu.: 528.8   1st Qu.:0.1000   1st Qu.:166   1st Qu.: 6.000  
##  Median :1737.0   Median :0.3650   Median :227   Median : 7.000  
##  Mean   :1845.3   Mean   :0.4872   Mean   :236   Mean   : 6.688  
##  3rd Qu.:2989.0   3rd Qu.:0.7400   3rd Qu.:279   3rd Qu.: 7.000  
##  Max.   :7414.0   Max.   :1.8900   Max.   :441   Max.   :10.000  
##  WidthDifference PercentPremiumSeats
##  Min.   :0.000   Min.   : 4.71      
##  1st Qu.:1.000   1st Qu.:12.28      
##  Median :1.000   Median :13.21      
##  Mean   :1.633   Mean   :14.65      
##  3rd Qu.:3.000   3rd Qu.:15.36      
##  Max.   :4.000   Max.   :24.69

Creating subsets of data with respect to the Airlines

# One data frame per carrier, each holding only that carrier's rows.
# which() discards rows where the comparison is NA, and drop = FALSE keeps
# the result a data frame, so each result matches what subset() returns.
British <- airlines.df[
  which(airlines.df$Airline == "British"), , drop = FALSE
]
Virgin <- airlines.df[
  which(airlines.df$Airline == "Virgin"), , drop = FALSE
]
Delta <- airlines.df[
  which(airlines.df$Airline == "Delta"), , drop = FALSE
]
AirFrance <- airlines.df[
  which(airlines.df$Airline == "AirFrance"), , drop = FALSE
]
Jet <- airlines.df[
  which(airlines.df$Airline == "Jet"), , drop = FALSE
]
Singapore <- airlines.df[
  which(airlines.df$Airline == "Singapore"), , drop = FALSE
]

Boxplots Airline Vs. Premium Economy Price and IsInternational Vs. Premium Economy Price

library(lattice)
## Warning: package 'lattice' was built under R version 3.4.3
# Premium Economy price split by route type. IsInternational has only two
# levels, so two fill colours are sufficient. The figure is drawn once; a
# second identical call with a vaguer title added nothing.
boxplot(
  PricePremium ~ IsInternational,
  data = airlines.df,
  xlab = "Price ($)",
  horizontal = TRUE,
  main = "Price Distribution of Premium Economy class across Domestic and International",
  col = c("violet", "orange")
)

# Premium Economy price split by airline (one colour per airline). The
# response is PricePremium, not PriceEconomy, because this section and the
# observations that follow it are about Premium Economy prices.
boxplot(
  PricePremium ~ Airline,
  data = airlines.df,
  xlab = "Price ($)",
  ylab = "Airline",
  horizontal = TRUE,
  main = "Price Distribution of Premium Economy class across Airlines",
  col = c("violet", "orange", "blue", "green", "yellow", "red")
)

# AirFrance and Virgin have high Premium Economy prices.
# British and Singapore have moderate Premium Economy prices.
# Jet and Delta have low Premium Economy prices.
# International flights' Premium Economy prices are higher than domestic flights'.

Scatterplot Matrix between PriceEconomy, PricePremium, PitchDifference, WidthDifference, PercentPremiumSeats, Flight Duration

library(car)
## Warning: package 'car' was built under R version 3.4.3
# Pairwise scatterplots of both fares, the seat-comfort differences, the
# share of premium seats and the flight duration.
scatterplotMatrix(
  ~ PriceEconomy + PricePremium + PitchDifference + WidthDifference +
    PercentPremiumSeats + FlightDuration,
  data = airlines.df
)

Correlation Matrix and Corrgram

library(corrgram)
## Warning: package 'corrgram' was built under R version 3.4.3
# Pairwise Pearson correlations among the key numeric variables. Columns are
# selected by name rather than by position so the result does not silently
# change if the column order of the CSV changes.
cor_vars <- c(
  "FlightDuration", "PriceEconomy", "PricePremium",
  "PitchDifference", "WidthDifference", "PercentPremiumSeats"
)
cor(airlines.df[, cor_vars])
##                     FlightDuration PriceEconomy PricePremium
## FlightDuration          1.00000000   0.56664039   0.64873981
## PriceEconomy            0.56664039   1.00000000   0.90138870
## PricePremium            0.64873981   0.90138870   1.00000000
## PitchDifference        -0.03749288  -0.09952511  -0.01806629
## WidthDifference        -0.11856070  -0.08449975  -0.01151218
## PercentPremiumSeats     0.06051625   0.06532232   0.11639097
##                     PitchDifference WidthDifference PercentPremiumSeats
## FlightDuration          -0.03749288     -0.11856070          0.06051625
## PriceEconomy            -0.09952511     -0.08449975          0.06532232
## PricePremium            -0.01806629     -0.01151218          0.11639097
## PitchDifference          1.00000000      0.76089108         -0.09264869
## WidthDifference          0.76089108      1.00000000         -0.27559416
## PercentPremiumSeats     -0.09264869     -0.27559416          1.00000000
# Correlogram of the data set: shaded cells below the diagonal, pie glyphs
# above it, variable names on the diagonal; order = TRUE reorders the
# variables so that related ones sit next to each other.
corrgram(
  airlines.df,
  order = TRUE,
  lower.panel = panel.shade,
  upper.panel = panel.pie,
  text.panel = panel.txt,
  main = "Corrgram of airlines intercorrelations"
)

#From the correlogram it is evident that there is a positive correlation between the price of a premium class seat and FlightDuration, SeatsPremium, PriceEconomy, WidthEconomy, PitchEconomy, SeatsTotal and SeatsEconomy.

Performing correlation tests to check the correlation

# Pearson correlation test: Premium Economy price vs. flight duration.
cor.test(
  x = airlines.df$PricePremium,
  y = airlines.df$FlightDuration
)
## 
##  Pearson's product-moment correlation
## 
## data:  airlines.df$PricePremium and airlines.df$FlightDuration
## t = 18.204, df = 456, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.5923218 0.6988270
## sample estimates:
##       cor 
## 0.6487398
# Pearson correlation test: Premium Economy price vs. total number of seats.
cor.test(
  x = airlines.df$PricePremium,
  y = airlines.df$SeatsTotal
)
## 
##  Pearson's product-moment correlation
## 
## data:  airlines.df$PricePremium and airlines.df$SeatsTotal
## t = 4.1851, df = 456, p-value = 3.421e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1025049 0.2790349
## sample estimates:
##       cor 
## 0.1923253
# Pearson correlation test: Premium Economy price vs. number of Economy seats.
cor.test(
  x = airlines.df$PricePremium,
  y = airlines.df$SeatsEconomy
)
## 
##  Pearson's product-moment correlation
## 
## data:  airlines.df$PricePremium and airlines.df$SeatsEconomy
## t = 3.8403, df = 456, p-value = 0.0001402
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.08678154 0.26434066
## sample estimates:
##       cor 
## 0.1770009
# Pearson correlation test: Premium Economy price vs. number of Premium seats.
cor.test(
  x = airlines.df$PricePremium,
  y = airlines.df$SeatsPremium
)
## 
##  Pearson's product-moment correlation
## 
## data:  airlines.df$PricePremium and airlines.df$SeatsPremium
## t = 4.761, df = 456, p-value = 2.591e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1285487 0.3031938
## sample estimates:
##       cor 
## 0.2176124
# Pearson correlation test: Premium Economy price vs. Economy seat pitch.
cor.test(
  x = airlines.df$PricePremium,
  y = airlines.df$PitchEconomy
)
## 
##  Pearson's product-moment correlation
## 
## data:  airlines.df$PricePremium and airlines.df$PitchEconomy
## t = 4.9575, df = 456, p-value = 1.009e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1373612 0.3113179
## sample estimates:
##       cor 
## 0.2261418
# Pearson correlation test: Premium Economy price vs. Economy seat width.
cor.test(
  x = airlines.df$PricePremium,
  y = airlines.df$WidthEconomy
)
## 
##  Pearson's product-moment correlation
## 
## data:  airlines.df$PricePremium and airlines.df$WidthEconomy
## t = 3.2519, df = 456, p-value = 0.001231
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0597457 0.2388800
## sample estimates:
##       cor 
## 0.1505484
# Pearson correlation test: Premium Economy price vs. Economy price.
cor.test(
  x = airlines.df$PricePremium,
  y = airlines.df$PriceEconomy
)
## 
##  Pearson's product-moment correlation
## 
## data:  airlines.df$PricePremium and airlines.df$PriceEconomy
## t = 44.452, df = 456, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.8826622 0.9172579
## sample estimates:
##       cor 
## 0.9013887
# Pearson correlation test: Premium Economy price vs. share of Premium seats.
cor.test(
  x = airlines.df$PricePremium,
  y = airlines.df$PercentPremiumSeats
)
## 
##  Pearson's product-moment correlation
## 
## data:  airlines.df$PricePremium and airlines.df$PercentPremiumSeats
## t = 2.5024, df = 456, p-value = 0.01268
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.0250311 0.2058228
## sample estimates:
##      cor 
## 0.116391

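The results show that every one of the above variables is positively and statistically significantly correlated with PricePremium (p < 0.05 in all tests). The strength varies widely, however: the correlation is strong for PriceEconomy (r = 0.90) and FlightDuration (r = 0.65), but weak (r between 0.12 and 0.23) for the remaining variables.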

Making a Linear Regression Model with the above variables on Price Premium

# Linear model of Premium Economy price on all eight variables examined in
# the correlation tests. SeatsPremium receives no estimate: lm() reports it as
# not defined because of singularities, i.e. it is a linear combination of the
# other regressors.
fit <- lm(
  PricePremium ~ FlightDuration + SeatsTotal + SeatsEconomy + SeatsPremium +
    PitchEconomy + WidthEconomy + PriceEconomy + PercentPremiumSeats,
  data = airlines.df
)
summary(fit)
## 
## Call:
## lm(formula = PricePremium ~ FlightDuration + SeatsTotal + SeatsEconomy + 
##     SeatsPremium + PitchEconomy + WidthEconomy + PriceEconomy + 
##     PercentPremiumSeats, data = airlines.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -715.2 -268.7  -79.5  126.9 3193.6 
## 
## Coefficients: (1 not defined because of singularities)
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         7510.66412 1274.70557   5.892 7.48e-09 ***
## FlightDuration        75.48733    8.96891   8.417 5.21e-16 ***
## SeatsTotal            19.99110    6.68139   2.992  0.00292 ** 
## SeatsEconomy         -22.01275    7.75443  -2.839  0.00473 ** 
## SeatsPremium                NA         NA      NA       NA    
## PitchEconomy        -258.40498   40.24849  -6.420 3.46e-10 ***
## WidthEconomy          30.51273   57.95981   0.526  0.59884    
## PriceEconomy           1.08216    0.03105  34.851  < 2e-16 ***
## PercentPremiumSeats  -28.30520   15.56054  -1.819  0.06957 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 474.3 on 450 degrees of freedom
## Multiple R-squared:  0.8665, Adjusted R-squared:  0.8644 
## F-statistic: 417.2 on 7 and 450 DF,  p-value: < 2.2e-16

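The summary statistics show that the R-squared value of the model is 0.8665, which is very high. However, the variables WidthEconomy and PercentPremiumSeats are not statistically significant (p > 0.05), and SeatsPremium has no estimate at all: the output reports it as "not defined because of singularities", meaning it is a linear combination of the other regressors (presumably SeatsTotal = SeatsEconomy + SeatsPremium).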

Making another Linear Regression Model without the variables SeatsPremium, WidthEconomy and PercentPremiumSeats

# Reduced model: the same response, keeping only the regressors that were
# statistically significant in the first model.
fit2 <- lm(
  PricePremium ~ FlightDuration + SeatsTotal + PitchEconomy + SeatsEconomy +
    PriceEconomy,
  data = airlines.df
)
summary(fit2)
## 
## Call:
## lm(formula = PricePremium ~ FlightDuration + SeatsTotal + PitchEconomy + 
##     SeatsEconomy + PriceEconomy, data = airlines.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -811.6 -255.6  -70.1  121.9 3215.6 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    7848.1045  1158.8056   6.773 3.95e-11 ***
## FlightDuration   75.7793     7.7476   9.781  < 2e-16 ***
## SeatsTotal        8.6861     2.1909   3.965 8.54e-05 ***
## PitchEconomy   -265.0437    37.5671  -7.055 6.51e-12 ***
## SeatsEconomy     -8.7836     2.4490  -3.587 0.000372 ***
## PriceEconomy      1.0735     0.0283  37.934  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 475 on 452 degrees of freedom
## Multiple R-squared:  0.8655, Adjusted R-squared:  0.864 
## F-statistic: 581.8 on 5 and 452 DF,  p-value: < 2.2e-16

The R-squared value of the model is 0.8655, which is very high suggesting that the model is a good predictor of the price of premium economy class. All the explanatory variables/ regressors are statistically significant(p<0.05).

SUMMARY: The scatterplot matrix and correlogram show all the variables that are correlated with the price of premium economy class, but in the linear regression models a few of these variables do not have enough statistical significance. Flight duration, total number of seats, pitch in economy class, number of seats in economy class and price of economy class influence the price of premium economy, whereas the linear regression model shows that the number of premium seats, width of economy class and percentage of seats in premium class are not statistically significant regressors. Thus, excluding these variables, the variables included in the new regression model can be considered the factors that drive the high price of Premium Economy class seats.

A new regression model with fewer regressors. From the definition of premium economy class, the variables that should most strongly drive the price difference are FlightDuration, PercentPremiumSeats, WidthDifference, PriceEconomy and PitchDifference, because these relate to the reasons for which the airline industry promotes the Premium Economy class.

# Alternative model built from the variables that describe what Premium
# Economy offers over Economy: flight duration, share of premium seats, the
# pitch and width differences, and the Economy fare.
fit1 <- lm(
  PricePremium ~ FlightDuration + PercentPremiumSeats + PitchDifference +
    WidthDifference + PriceEconomy,
  data = airlines.df
)
summary(fit1)
## 
## Call:
## lm(formula = PricePremium ~ FlightDuration + PercentPremiumSeats + 
##     PitchDifference + WidthDifference + PriceEconomy, data = airlines.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -803.2 -287.5  -46.8  151.6 3434.6 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         -572.66050  132.93317  -4.308 2.02e-05 ***
## FlightDuration        76.97237    8.07003   9.538  < 2e-16 ***
## PercentPremiumSeats   21.58784    5.09201   4.240 2.72e-05 ***
## PitchDifference       -4.03335   20.94083  -0.193 0.847353    
## WidthDifference      115.29104   32.16186   3.585 0.000374 ***
## PriceEconomy           1.02266    0.02881  35.497  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 496.8 on 452 degrees of freedom
## Multiple R-squared:  0.8529, Adjusted R-squared:  0.8512 
## F-statistic:   524 on 5 and 452 DF,  p-value: < 2.2e-16

The results show that these variables together explain the price of Premium Economy class with an R-squared value of 0.8529. Flight duration, the percentage of premium seats, the width difference between economy and premium economy class, and the price of economy class are all statistically significant (p < 0.001) and can be considered driving factors for the increased price of the Premium Economy class. However, the pitch difference between the economy and premium economy class is not statistically significant (p = 0.847), suggesting that it plays little role in driving the price of the premium economy class once the other variables are accounted for.