R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see <http://rmarkdown.rstudio.com>.

# Regression Template

# Load the six-airline fare data; the path is relative to the working directory
dataset <- read.csv(file = 'SixAirlinesDataV2.csv')
# Open the raw table in the data viewer for a quick visual check
View(dataset)
# Per-column summary: quartiles for numeric columns, counts for categorical ones
summary(dataset)
##       Airline      Aircraft   FlightDuration   TravelMonth
##  AirFrance: 74   AirBus:151   Min.   : 1.250   Aug:127    
##  British  :175   Boeing:307   1st Qu.: 4.260   Jul: 75    
##  Delta    : 46                Median : 7.790   Oct:127    
##  Jet      : 61                Mean   : 7.578   Sep:129    
##  Singapore: 40                3rd Qu.:10.620              
##  Virgin   : 62                Max.   :14.660              
##       IsInternational  SeatsEconomy    SeatsPremium    PitchEconomy  
##  Domestic     : 40    Min.   : 78.0   Min.   : 8.00   Min.   :30.00  
##  International:418    1st Qu.:133.0   1st Qu.:21.00   1st Qu.:31.00  
##                       Median :185.0   Median :36.00   Median :31.00  
##                       Mean   :202.3   Mean   :33.65   Mean   :31.22  
##                       3rd Qu.:243.0   3rd Qu.:40.00   3rd Qu.:32.00  
##                       Max.   :389.0   Max.   :66.00   Max.   :33.00  
##   PitchPremium    WidthEconomy    WidthPremium    PriceEconomy 
##  Min.   :34.00   Min.   :17.00   Min.   :17.00   Min.   :  65  
##  1st Qu.:38.00   1st Qu.:18.00   1st Qu.:19.00   1st Qu.: 413  
##  Median :38.00   Median :18.00   Median :19.00   Median :1242  
##  Mean   :37.91   Mean   :17.84   Mean   :19.47   Mean   :1327  
##  3rd Qu.:38.00   3rd Qu.:18.00   3rd Qu.:21.00   3rd Qu.:1909  
##  Max.   :40.00   Max.   :19.00   Max.   :21.00   Max.   :3593  
##   PricePremium    PriceRelative      SeatsTotal  PitchDifference 
##  Min.   :  86.0   Min.   :0.0200   Min.   : 98   Min.   : 2.000  
##  1st Qu.: 528.8   1st Qu.:0.1000   1st Qu.:166   1st Qu.: 6.000  
##  Median :1737.0   Median :0.3650   Median :227   Median : 7.000  
##  Mean   :1845.3   Mean   :0.4872   Mean   :236   Mean   : 6.688  
##  3rd Qu.:2989.0   3rd Qu.:0.7400   3rd Qu.:279   3rd Qu.: 7.000  
##  Max.   :7414.0   Max.   :1.8900   Max.   :441   Max.   :10.000  
##  WidthDifference PercentPremiumSeats
##  Min.   :0.000   Min.   : 4.71      
##  1st Qu.:1.000   1st Qu.:12.28      
##  Median :1.000   Median :13.21      
##  Mean   :1.633   Mean   :14.65      
##  3rd Qu.:3.000   3rd Qu.:15.36      
##  Max.   :4.000   Max.   :24.69
# Encoding categorical data
# All six carriers present in the data are listed as levels. factor() turns
# any value that is missing from `levels` into NA, and lm() then drops those
# rows, so the previously omitted 'Jet' flights were silently excluded from
# every model fitted on this column.
# No level has to be removed by hand to avoid the dummy variable trap: with
# R's default treatment contrasts lm() uses the first level ('British',
# label 1) as the reference category automatically.
# 'Jet' is appended last so the existing labels 1-5 keep their meaning.
# NOTE: the model summaries recorded further down were produced with the old
# five-level encoding; re-run the script to refresh them.
dataset$Airline <- factor(dataset$Airline,
                          levels = c('British',
                                     'Delta',
                                     'AirFrance',
                                     'Singapore',
                                     'Virgin',
                                     'Jet'),
                          labels = c(1, 2, 3, 4, 5, 6))

# Viewing the correlation matrix as a corrgram:
# shaded cells below the diagonal, pie glyphs above it, variable names on it.
# The title now names the airline data (it previously read "Store Variables",
# left over from another analysis).
library(corrgram)
corrgram(dataset, order = TRUE, lower.panel = panel.shade,
         upper.panel = panel.pie, text.panel = panel.txt,
         main = "Corrgram of Airline Variables")

# Splitting the dataset into the Training set and Test set
# install.packages('caTools')
library(caTools)
# Fixed seed so the random split is reproducible between runs
set.seed(123)
# Logical mask over the rows: TRUE marks a training row (ratio 2/3)
split <- sample.split(dataset$PriceRelative, SplitRatio = 2/3)
training_set <- subset(dataset, subset = split)
test_set <- subset(dataset, subset = !split)
# Inspect both partitions in the data viewer
View(training_set)
View(test_set)
#Feature Scaling
# training_set = scale(training_set)
# test_set = scale(test_set)

# Fitting the Regression Model to the dataset
# Baseline linear model: PriceRelative regressed on every other column of the
# training set. Some columns are linear combinations of others, which is why
# the summary reports coefficients "not defined because of singularities".
regressor <- lm(formula = PriceRelative ~ .,
                data = training_set)
summary(regressor)
## 
## Call:
## lm(formula = PriceRelative ~ ., data = training_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.65802 -0.07231 -0.00272  0.06854  0.67044 
## 
## Coefficients: (3 not defined because of singularities)
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  -6.421e-01  3.096e+00  -0.207 0.835878    
## Airline2                      1.507e-01  2.854e-01   0.528 0.598019    
## Airline3                      3.951e-01  1.125e-01   3.512 0.000529 ***
## Airline4                      3.427e-02  2.144e-01   0.160 0.873176    
## Airline5                      1.819e-01  2.712e-01   0.671 0.503026    
## AircraftBoeing                6.061e-02  2.699e-02   2.246 0.025620 *  
## FlightDuration                2.687e-02  4.586e-03   5.859 1.49e-08 ***
## TravelMonthJul                1.762e-02  3.163e-02   0.557 0.578122    
## TravelMonthOct                9.758e-03  2.663e-02   0.366 0.714374    
## TravelMonthSep                2.658e-02  2.656e-02   1.001 0.317961    
## IsInternationalInternational  3.658e-01  4.824e-01   0.758 0.448976    
## SeatsEconomy                  9.195e-04  4.983e-04   1.845 0.066197 .  
## SeatsPremium                 -6.126e-03  3.329e-03  -1.840 0.066953 .  
## PitchEconomy                  2.004e-02  1.014e-01   0.198 0.843538    
## PitchPremium                  1.374e-02  1.465e-01   0.094 0.925338    
## WidthEconomy                 -3.523e-02  5.235e-02  -0.673 0.501563    
## WidthPremium                 -9.823e-03  1.337e-01  -0.073 0.941480    
## PriceEconomy                 -8.926e-04  2.949e-05 -30.270  < 2e-16 ***
## PricePremium                  5.576e-04  2.096e-05  26.601  < 2e-16 ***
## SeatsTotal                           NA         NA      NA       NA    
## PitchDifference                      NA         NA      NA       NA    
## WidthDifference                      NA         NA      NA       NA    
## PercentPremiumSeats           1.465e-02  7.147e-03   2.049 0.041490 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1585 on 245 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.857,  Adjusted R-squared:  0.8459 
## F-statistic:  77.3 on 19 and 245 DF,  p-value: < 2.2e-16
# Reduced linear model, restricted to the predictors picked from the p-values
# in the summary of the full model
regressor2 <- lm(
  formula = PriceRelative ~ Airline + FlightDuration + SeatsEconomy +
    SeatsPremium + PriceEconomy + PricePremium,
  data = training_set
)
summary(regressor2)
## 
## Call:
## lm(formula = PriceRelative ~ Airline + FlightDuration + SeatsEconomy + 
##     SeatsPremium + PriceEconomy + PricePremium, data = training_set)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.70132 -0.08556 -0.01068  0.08934  0.70728 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     2.331e-01  6.712e-02   3.472 0.000606 ***
## Airline2       -1.244e-01  4.704e-02  -2.646 0.008664 ** 
## Airline3        4.003e-01  4.682e-02   8.550 1.15e-15 ***
## Airline4        8.580e-03  4.467e-02   0.192 0.847842    
## Airline5        1.170e-01  3.058e-02   3.825 0.000165 ***
## FlightDuration  3.215e-02  4.483e-03   7.173 7.94e-12 ***
## SeatsEconomy   -1.115e-04  1.879e-04  -0.593 0.553489    
## SeatsPremium    6.351e-04  1.643e-03   0.387 0.699351    
## PriceEconomy   -8.754e-04  2.992e-05 -29.258  < 2e-16 ***
## PricePremium    5.579e-04  2.134e-05  26.139  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1623 on 255 degrees of freedom
##   (50 observations deleted due to missingness)
## Multiple R-squared:  0.8438, Adjusted R-squared:  0.8383 
## F-statistic: 153.1 on 9 and 255 DF,  p-value: < 2.2e-16
# Trying out a non-linear regressor (SVM, radial kernel) to check whether
# further deductions can be made.
# The model is fitted on training_set, like the two linear models, so that
# test_set stays unseen; fitting on the full dataset leaks the test rows into
# the model.
# NOTE: the summary recorded below was produced from a fit on the full
# dataset; re-run the script to refresh it.
# install.packages('e1071')
library(e1071)
regressor3 <- svm(formula = PriceRelative ~ .,
                  data = training_set,
                  type = 'eps-regression',
                  kernel = 'radial')
summary(regressor3)
## 
## Call:
## svm(formula = PriceRelative ~ ., data = dataset, type = "eps-regression", 
##     kernel = "radial")
## 
## 
## Parameters:
##    SVM-Type:  eps-regression 
##  SVM-Kernel:  radial 
##        cost:  1 
##       gamma:  0.04347826 
##     epsilon:  0.1 
## 
## 
## Number of Support Vectors:  233
# Predictions of the reduced linear model (regressor2) for the test rows
y_pred2 <- predict(object = regressor2, newdata = test_set)
# Plots of PriceRelative against two predictors kept in the reduced model.
# Columns are resolved with with() instead of attach(), so nothing is left on
# the search path and no global name can be masked by a column name.
# The y-axis label spelling is corrected to "PriceRelative".

# PriceRelative by Airline
with(dataset, plot(Airline, PriceRelative, main = "PriceRelative VS Airline",
                   ylab = "PriceRelative", xlab = "Airline ", pch = 19))

# PriceRelative against FlightDuration
with(dataset, plot(FlightDuration, PriceRelative,
                   main = "PriceRelative VS FlightDuration",
                   ylab = "PriceRelative", xlab = "FlightDuration", pch = 19))

# Scatterplot matrix of PriceRelative and the predictors of the reduced model
library(car)
scatterplotMatrix(
  ~ PriceRelative + Airline + FlightDuration + SeatsEconomy + SeatsPremium +
    PriceEconomy + PricePremium,
  data = dataset,
  main = "ScatterPlot Matrix"
)

# Predicting on the test set with the reduced model
# NOTE(review): this repeats the predict() call that produced y_pred2, and the
# viewer call below displays y_pred2 rather than y_pred.
y_pred <- predict(object = regressor2, newdata = test_set)
View(y_pred2)