->Reading and Viewing the dataset into R

# Load the six-airline fare data set from the working directory.
# stringsAsFactors = TRUE keeps the text columns (Airline, Aircraft,
# TravelMonth, IsInternational) as factors, matching the str() output recorded
# further down; since R 4.0.0 read.csv() would otherwise leave them as
# character vectors.
airline.df <- read.csv("SixAirlinesDataV2.csv", stringsAsFactors = TRUE)
# Open the data in the spreadsheet-style viewer for a quick visual check.
View(airline.df)

-> Summarizing the data

 # Per-column descriptive statistics (n, mean, sd, median, trimmed mean, mad,
 # min, max, range, skew, kurtosis, se) for every variable in the data set.
 library(psych)
 psych::describe(airline.df)
##                     vars   n    mean      sd  median trimmed     mad   min
## Airline*               1 458    3.01    1.65    2.00    2.89    1.48  1.00
## Aircraft*              2 458    1.67    0.47    2.00    1.71    0.00  1.00
## FlightDuration         3 458    7.58    3.54    7.79    7.57    4.81  1.25
## TravelMonth*           4 458    2.56    1.17    3.00    2.58    1.48  1.00
## IsInternational*       5 458    1.91    0.28    2.00    2.00    0.00  1.00
## SeatsEconomy           6 458  202.31   76.37  185.00  194.64   85.99 78.00
## SeatsPremium           7 458   33.65   13.26   36.00   33.35   11.86  8.00
## PitchEconomy           8 458   31.22    0.66   31.00   31.26    0.00 30.00
## PitchPremium           9 458   37.91    1.31   38.00   38.05    0.00 34.00
## WidthEconomy          10 458   17.84    0.56   18.00   17.81    0.00 17.00
## WidthPremium          11 458   19.47    1.10   19.00   19.53    0.00 17.00
## PriceEconomy          12 458 1327.08  988.27 1242.00 1244.40 1159.39 65.00
## PricePremium          13 458 1845.26 1288.14 1737.00 1799.05 1845.84 86.00
## PriceRelative         14 458    0.49    0.45    0.36    0.42    0.41  0.02
## SeatsTotal            15 458  235.96   85.29  227.00  228.73   90.44 98.00
## PitchDifference       16 458    6.69    1.76    7.00    6.76    0.00  2.00
## WidthDifference       17 458    1.63    1.19    1.00    1.53    0.00  0.00
## PercentPremiumSeats   18 458   14.65    4.84   13.21   14.31    2.68  4.71
##                         max   range  skew kurtosis    se
## Airline*               6.00    5.00  0.61    -0.95  0.08
## Aircraft*              2.00    1.00 -0.72    -1.48  0.02
## FlightDuration        14.66   13.41 -0.07    -1.12  0.17
## TravelMonth*           4.00    3.00 -0.14    -1.46  0.05
## IsInternational*       2.00    1.00 -2.91     6.50  0.01
## SeatsEconomy         389.00  311.00  0.72    -0.36  3.57
## SeatsPremium          66.00   58.00  0.23    -0.46  0.62
## PitchEconomy          33.00    3.00 -0.03    -0.35  0.03
## PitchPremium          40.00    6.00 -1.51     3.52  0.06
## WidthEconomy          19.00    2.00 -0.04    -0.08  0.03
## WidthPremium          21.00    4.00 -0.08    -0.31  0.05
## PriceEconomy        3593.00 3528.00  0.51    -0.88 46.18
## PricePremium        7414.00 7328.00  0.50     0.43 60.19
## PriceRelative          1.89    1.87  1.17     0.72  0.02
## SeatsTotal           441.00  343.00  0.70    -0.53  3.99
## PitchDifference       10.00    8.00 -0.54     1.78  0.08
## WidthDifference        4.00    4.00  0.84    -0.53  0.06
## PercentPremiumSeats   24.69   19.98  0.71     0.28  0.23

->Structure of the dataset (variable names and types)

# Compact overview of the data frame: dimensions, column types, first values.
utils::str(airline.df)
## 'data.frame':    458 obs. of  18 variables:
##  $ Airline            : Factor w/ 6 levels "AirFrance","British",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Aircraft           : Factor w/ 2 levels "AirBus","Boeing": 2 2 2 2 2 2 2 2 2 2 ...
##  $ FlightDuration     : num  12.25 12.25 12.25 12.25 8.16 ...
##  $ TravelMonth        : Factor w/ 4 levels "Aug","Jul","Oct",..: 2 1 4 3 1 4 3 1 4 4 ...
##  $ IsInternational    : Factor w/ 2 levels "Domestic","International": 2 2 2 2 2 2 2 2 2 2 ...
##  $ SeatsEconomy       : int  122 122 122 122 122 122 122 122 122 122 ...
##  $ SeatsPremium       : int  40 40 40 40 40 40 40 40 40 40 ...
##  $ PitchEconomy       : int  31 31 31 31 31 31 31 31 31 31 ...
##  $ PitchPremium       : int  38 38 38 38 38 38 38 38 38 38 ...
##  $ WidthEconomy       : int  18 18 18 18 18 18 18 18 18 18 ...
##  $ WidthPremium       : int  19 19 19 19 19 19 19 19 19 19 ...
##  $ PriceEconomy       : int  2707 2707 2707 2707 1793 1793 1793 1476 1476 1705 ...
##  $ PricePremium       : int  3725 3725 3725 3725 2999 2999 2999 2997 2997 2989 ...
##  $ PriceRelative      : num  0.38 0.38 0.38 0.38 0.67 0.67 0.67 1.03 1.03 0.75 ...
##  $ SeatsTotal         : int  162 162 162 162 162 162 162 162 162 162 ...
##  $ PitchDifference    : int  7 7 7 7 7 7 7 7 7 7 ...
##  $ WidthDifference    : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ PercentPremiumSeats: num  24.7 24.7 24.7 24.7 24.7 ...

-> About the data

 # Number of observations per airline.
 table(airline.df$Airline)
## 
## AirFrance   British     Delta       Jet Singapore    Virgin 
##        74       175        46        61        40        62
 # Number of observations per aircraft manufacturer.
 table(airline.df$Aircraft)
## 
## AirBus Boeing 
##    151    307
 # Number of observations per travel month (levels sort alphabetically).
 table(airline.df$TravelMonth)
## 
## Aug Jul Oct Sep 
## 127  75 127 129
 # Split between domestic and international flights.
 table(airline.df$IsInternational)
## 
##      Domestic International 
##            40           418
 # Average of the PriceRelative column.
 mean(airline.df$PriceRelative)
## [1] 0.4872052
 # Largest and smallest total seat count across all observations.
 max(airline.df$SeatsTotal)
## [1] 441
 min(airline.df$SeatsTotal)
## [1] 98

->Creating a new column in the dataset

# Share of economy seats per observation, as a percentage of all seats.
percent.economy <- with(airline.df, (SeatsEconomy / SeatsTotal) * 100)
# Store the derived share as a new column of the data frame.
airline.df[["PercentEconomySeats"]] <- percent.economy
 View(airline.df)

-> Visualization of variables independently

# Horizontal boxplots of six variables, laid out on a 3 x 2 grid.
#
# Changes from the earlier version:
#   * `cex = 0.5` is now an argument of boxplot(). It used to sit outside the
#     boxplot() call, so it was handed to with(), which ignores extra
#     arguments, and never reached the plot.
#   * The with() wrapper is dropped: every column was already referenced as
#     airline.df$..., so it had no effect.
#   * "economoy" in the first two titles is corrected to "economy".
par(mfrow = c(3, 2))
boxplot(airline.df$PriceEconomy, horizontal = TRUE, col = "yellow", cex = 0.5,
        main = "boxplot of prices in economy airlines",
        xlab = "PriceEconomy")
boxplot(airline.df$PricePremium, horizontal = TRUE, col = "yellow", cex = 0.5,
        main = "boxplot of prices in premium economy airlines",
        xlab = "PricePremium")
boxplot(airline.df$PitchDifference, horizontal = TRUE, col = "yellow",
        cex = 0.5,
        main = "boxplot of pitch difference in  airlines",
        xlab = "Pitchdifference")
boxplot(airline.df$WidthDifference, horizontal = TRUE, col = "yellow",
        cex = 0.5,
        main = "boxplot of width difference in airlines",
        xlab = "Widthdifference")
boxplot(airline.df$PercentEconomySeats, horizontal = TRUE, col = "yellow",
        cex = 0.5,
        main = "boxplot of percentage of economy seats ",
        xlab = "Percent Economy seats")
boxplot(airline.df$PercentPremiumSeats, horizontal = TRUE, col = "yellow",
        cex = 0.5,
        main = "boxplot of percentage of premium economy seats",
        xlab = "Percent premium seats")

->Visualization of variables correlated pair-wise

1)Boxplots

# Distribution of the relative price for each level of pitch difference (top)
# and width difference (bottom), drawn as horizontal boxplots in two rows.
#
# The second title used to repeat "vs. Pitch" although that plot groups by
# WidthDifference; it now reads "vs. Width". The with() wrapper is dropped
# because the formula interface already receives data = airline.df.
par(mfrow = c(2, 1))
boxplot(PriceRelative ~ PitchDifference, data = airline.df,
        horizontal = TRUE,
        main = "Relative Price Difference vs. Pitch",
        ylab = "Pitch Difference",
        xlab = "Relative Price ")
boxplot(PriceRelative ~ WidthDifference, data = airline.df,
        horizontal = TRUE,
        main = "Relative Price Difference vs. Width",
        ylab = "Width Difference",
        xlab = "Relative Price ")

2)Scatterplots

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
 # Scatterplots (car::scatterplot) of economy vs. premium-economy seat counts
 # and prices.
 #
 # The titles were previously written as string literals broken across two
 # source lines, which embedded the source indentation in the title and pushed
 # its second line off-centre. They now use an explicit "\n" line break.
 #
 # NOTE(review): `spread` and `smoother.args` look like the argument names of
 # older car releases -- confirm they are still accepted by the installed
 # version.
 scatterplot(SeatsEconomy ~ SeatsPremium, data = airline.df, spread = FALSE,
             smoother.args = list(lty = 2), pch = 19,
             main = paste0("Scatterplot of number of seats in economy and\n",
                           "premium economy airline"))

 scatterplot(PriceEconomy ~ PricePremium, data = airline.df, spread = FALSE,
             smoother.args = list(lty = 2), pch = 19,
             main = paste0("Scatterplot of price of economy and\n",
                           "premium economy airline"))

 # Plain base-graphics scatterplots of relative price against the seat pitch
 # and seat width differences.
 plot(PriceRelative ~ PitchDifference, data = airline.df)

 plot(PriceRelative ~ WidthDifference, data = airline.df)

3)Histograms

 # Lattice histograms of both fare classes, one panel per level of the
 # conditioning factor (first by airline, then by aircraft manufacturer).
 library(lattice)
 histogram(x = ~ PricePremium | Airline, data = airline.df)

 histogram(x = ~ PriceEconomy | Airline, data = airline.df)

 histogram(x = ~ PricePremium | Aircraft, data = airline.df)

 histogram(x = ~ PriceEconomy | Aircraft, data = airline.df)

->Corrgram

# Corrgram of the data set: shaded cells below the diagonal, pie glyphs above
# it, and each variable's min/max plus its name on the diagonal. Variables
# keep their original column order (order = FALSE).
library(corrgram)
 corrgram(
   x = airline.df,
   main = "Corrgram of all the  intercorrelations",
   order = FALSE,
   upper.panel = panel.pie,
   lower.panel = panel.shade,
   text.panel = panel.txt,
   diag.panel = panel.minmax
 )

->Correlation Visualization

 library(corrplot)    
## corrplot 0.84 loaded
 # Ellipse-style correlation plot of columns 6 to 19 of the data frame, using
 # only rows that are complete across those columns.
 corrplot(
   corr = cor(x = airline.df[, 6:19], use = "complete.obs"),
   method = "ellipse"
 )

->Regression model

  1. Model-1
# Model 1: ordinary least-squares fit of the premium-economy price on every
# candidate predictor -- the numeric seat/price variables plus the categorical
# Airline, Aircraft, IsInternational and TravelMonth.
# NOTE(review): PercentEconomySeats is computed as SeatsEconomy / SeatsTotal *
# 100, and PercentPremiumSeats is presumably its complement to 100, so the two
# would be almost perfectly collinear. The very large coefficients and
# intercept in the recorded summary are consistent with that -- consider
# keeping only one of the two terms and re-running the model.
fit1<-lm(PricePremium~FlightDuration+PriceEconomy+SeatsTotal+PitchDifference+
           WidthDifference+PercentEconomySeats+PercentPremiumSeats
          + Airline+Aircraft+IsInternational+TravelMonth,data=airline.df)
 # Coefficient table, residual summary and goodness-of-fit statistics.
 summary(fit1)
## 
## Call:
## lm(formula = PricePremium ~ FlightDuration + PriceEconomy + SeatsTotal + 
##     PitchDifference + WidthDifference + PercentEconomySeats + 
##     PercentPremiumSeats + Airline + Aircraft + IsInternational + 
##     TravelMonth, data = airline.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -932.39 -222.10  -55.12  134.47 2916.42 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -4.685e+06  1.839e+06  -2.548  0.01119 *  
## FlightDuration                5.746e+01  9.240e+00   6.218 1.17e-09 ***
## PriceEconomy                  1.171e+00  3.989e-02  29.350  < 2e-16 ***
## SeatsTotal                   -2.433e-01  4.149e-01  -0.586  0.55792    
## PitchDifference              -4.552e+01  8.352e+01  -0.545  0.58603    
## WidthDifference               1.047e+02  1.063e+02   0.985  0.32510    
## PercentEconomySeats           4.685e+04  1.839e+04   2.547  0.01120 *  
## PercentPremiumSeats           4.683e+04  1.839e+04   2.546  0.01122 *  
## AirlineBritish                9.823e+02  1.848e+02   5.315 1.70e-07 ***
## AirlineDelta                  5.972e+02  2.303e+02   2.593  0.00983 ** 
## AirlineJet                    4.074e+02  1.935e+02   2.105  0.03586 *  
## AirlineSingapore              2.380e+02  1.335e+02   1.783  0.07532 .  
## AirlineVirgin                 1.001e+03  1.521e+02   6.578 1.36e-10 ***
## AircraftBoeing               -5.078e+01  6.338e+01  -0.801  0.42346    
## IsInternationalInternational  1.215e+02  3.456e+02   0.351  0.72545    
## TravelMonthJul               -3.210e+01  6.494e+01  -0.494  0.62135    
## TravelMonthOct                2.430e+01  5.505e+01   0.441  0.65918    
## TravelMonthSep               -5.074e+00  5.485e+01  -0.093  0.92633    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 438.3 on 440 degrees of freedom
## Multiple R-squared:  0.8886, Adjusted R-squared:  0.8842 
## F-statistic: 206.4 on 17 and 440 DF,  p-value: < 2.2e-16
  2. Model-2
# Model 2: the same response (PricePremium) regressed on the numeric
# predictors only; the categorical Airline, Aircraft, IsInternational and
# TravelMonth terms are left out.
# NOTE(review): PercentEconomySeats and PercentPremiumSeats are both still
# included; if they sum to (about) 100 they are near-collinear, which would
# explain the very large coefficients and intercept in the recorded
# summary -- confirm and consider dropping one of them.
fit<-lm(PricePremium~FlightDuration+PriceEconomy+SeatsTotal+PitchDifference+
           WidthDifference+PercentEconomySeats+PercentPremiumSeats,data=airline.df)
 # Coefficient table, residual summary and goodness-of-fit statistics.
 summary(fit)
## 
## Call:
## lm(formula = PricePremium ~ FlightDuration + PriceEconomy + SeatsTotal + 
##     PitchDifference + WidthDifference + PercentEconomySeats + 
##     PercentPremiumSeats, data = airline.df)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -865.4 -258.1  -23.9  171.0 3491.3 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          3.463e+06  1.114e+06   3.107  0.00201 ** 
## FlightDuration       7.423e+01  7.898e+00   9.398  < 2e-16 ***
## PriceEconomy         9.980e-01  2.844e-02  35.095  < 2e-16 ***
## SeatsTotal           1.193e+00  2.946e-01   4.051 6.01e-05 ***
## PitchDifference     -3.681e+01  2.109e+01  -1.746  0.08156 .  
## WidthDifference      1.814e+02  3.328e+01   5.450 8.32e-08 ***
## PercentEconomySeats -3.463e+04  1.114e+04  -3.108  0.00200 ** 
## PercentPremiumSeats -3.461e+04  1.114e+04  -3.106  0.00202 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 481.1 on 450 degrees of freedom
## Multiple R-squared:  0.8626, Adjusted R-squared:  0.8605 
## F-statistic: 403.7 on 7 and 450 DF,  p-value: < 2.2e-16

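-Model-2 is the best-fit model, since most of the independent variables in this model are statistically significant (p-value < 0.05), and in a more significant way than in Model-1. Note, however, that Model-1 has a slightly higher adjusted R-squared (0.8842 vs. 0.8605).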