This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
# Regression Template
# Importing the dataset
# stringsAsFactors = TRUE keeps the text columns as factors on every R version
# (the read.csv default became FALSE in R 4.0.0), so summary() reports the
# per-level counts for the categorical columns instead of Length/Class/Mode.
dataset <- read.csv('SixAirlinesDataV2.csv', stringsAsFactors = TRUE)
View(dataset)
# Summary statistics
summary(dataset)
## Airline Aircraft FlightDuration TravelMonth
## AirFrance: 74 AirBus:151 Min. : 1.250 Aug:127
## British :175 Boeing:307 1st Qu.: 4.260 Jul: 75
## Delta : 46 Median : 7.790 Oct:127
## Jet : 61 Mean : 7.578 Sep:129
## Singapore: 40 3rd Qu.:10.620
## Virgin : 62 Max. :14.660
## IsInternational SeatsEconomy SeatsPremium PitchEconomy
## Domestic : 40 Min. : 78.0 Min. : 8.00 Min. :30.00
## International:418 1st Qu.:133.0 1st Qu.:21.00 1st Qu.:31.00
## Median :185.0 Median :36.00 Median :31.00
## Mean :202.3 Mean :33.65 Mean :31.22
## 3rd Qu.:243.0 3rd Qu.:40.00 3rd Qu.:32.00
## Max. :389.0 Max. :66.00 Max. :33.00
## PitchPremium WidthEconomy WidthPremium PriceEconomy
## Min. :34.00 Min. :17.00 Min. :17.00 Min. : 65
## 1st Qu.:38.00 1st Qu.:18.00 1st Qu.:19.00 1st Qu.: 413
## Median :38.00 Median :18.00 Median :19.00 Median :1242
## Mean :37.91 Mean :17.84 Mean :19.47 Mean :1327
## 3rd Qu.:38.00 3rd Qu.:18.00 3rd Qu.:21.00 3rd Qu.:1909
## Max. :40.00 Max. :19.00 Max. :21.00 Max. :3593
## PricePremium PriceRelative SeatsTotal PitchDifference
## Min. : 86.0 Min. :0.0200 Min. : 98 Min. : 2.000
## 1st Qu.: 528.8 1st Qu.:0.1000 1st Qu.:166 1st Qu.: 6.000
## Median :1737.0 Median :0.3650 Median :227 Median : 7.000
## Mean :1845.3 Mean :0.4872 Mean :236 Mean : 6.688
## 3rd Qu.:2989.0 3rd Qu.:0.7400 3rd Qu.:279 3rd Qu.: 7.000
## Max. :7414.0 Max. :1.8900 Max. :441 Max. :10.000
## WidthDifference PercentPremiumSeats
## Min. :0.000 Min. : 4.71
## 1st Qu.:1.000 1st Qu.:12.28
## Median :1.000 Median :13.21
## Mean :1.633 Mean :14.65
## 3rd Qu.:3.000 3rd Qu.:15.36
## Max. :4.000 Max. :24.69
# Encoding categorical data
# Every airline present in the data is listed as a level: factor() turns any
# value that is missing from `levels` into NA, and the model fits then drop
# those rows as incomplete observations.
# The dummy variable trap needs no manual handling: lm() builds treatment
# contrasts and omits the first level ('British') as the baseline.
dataset$Airline <- factor(dataset$Airline,
                          levels = c('British',
                                     'Delta',
                                     'AirFrance',
                                     'Singapore',
                                     'Virgin',
                                     'Jet'),
                          labels = c(1, 2, 3, 4, 5, 6))
# Viewing the correlation matrix as a corrgram:
# shaded cells below the diagonal, pie glyphs above it, variable names on it.
library(corrgram)
corrgram(
  dataset,
  main = "Corgram of Store Variables",
  order = TRUE,
  text.panel = panel.txt,
  upper.panel = panel.pie,
  lower.panel = panel.shade
)

# Split the dataset into a training set (2/3 of the rows) and a test set
# install.packages('caTools')
library(caTools)
set.seed(123)  # fixed seed so the split is reproducible
split <- sample.split(Y = dataset$PriceRelative, SplitRatio = 2/3)
training_set <- dataset[split, , drop = FALSE]
test_set <- dataset[!split, , drop = FALSE]
View(training_set)
View(test_set)
# Feature scaling (left disabled)
# training_set = scale(training_set)
# test_set = scale(test_set)
# Baseline linear model: PriceRelative regressed on every other column
# of the training set.
regressor <- lm(formula = PriceRelative ~ ., data = training_set)
summary(regressor)
##
## Call:
## lm(formula = PriceRelative ~ ., data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.65802 -0.07231 -0.00272 0.06854 0.67044
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -6.421e-01 3.096e+00 -0.207 0.835878
## Airline2 1.507e-01 2.854e-01 0.528 0.598019
## Airline3 3.951e-01 1.125e-01 3.512 0.000529 ***
## Airline4 3.427e-02 2.144e-01 0.160 0.873176
## Airline5 1.819e-01 2.712e-01 0.671 0.503026
## AircraftBoeing 6.061e-02 2.699e-02 2.246 0.025620 *
## FlightDuration 2.687e-02 4.586e-03 5.859 1.49e-08 ***
## TravelMonthJul 1.762e-02 3.163e-02 0.557 0.578122
## TravelMonthOct 9.758e-03 2.663e-02 0.366 0.714374
## TravelMonthSep 2.658e-02 2.656e-02 1.001 0.317961
## IsInternationalInternational 3.658e-01 4.824e-01 0.758 0.448976
## SeatsEconomy 9.195e-04 4.983e-04 1.845 0.066197 .
## SeatsPremium -6.126e-03 3.329e-03 -1.840 0.066953 .
## PitchEconomy 2.004e-02 1.014e-01 0.198 0.843538
## PitchPremium 1.374e-02 1.465e-01 0.094 0.925338
## WidthEconomy -3.523e-02 5.235e-02 -0.673 0.501563
## WidthPremium -9.823e-03 1.337e-01 -0.073 0.941480
## PriceEconomy -8.926e-04 2.949e-05 -30.270 < 2e-16 ***
## PricePremium 5.576e-04 2.096e-05 26.601 < 2e-16 ***
## SeatsTotal NA NA NA NA
## PitchDifference NA NA NA NA
## WidthDifference NA NA NA NA
## PercentPremiumSeats 1.465e-02 7.147e-03 2.049 0.041490 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1585 on 245 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.857, Adjusted R-squared: 0.8459
## F-statistic: 77.3 on 19 and 245 DF, p-value: < 2.2e-16
# Reduced linear model: PriceRelative regressed on a hand-picked subset of
# predictors, fitted on the training set.
regressor2 <- lm(
  formula = PriceRelative ~ Airline + FlightDuration + SeatsEconomy +
    SeatsPremium + PriceEconomy + PricePremium,
  data = training_set
)
summary(regressor2)
##
## Call:
## lm(formula = PriceRelative ~ Airline + FlightDuration + SeatsEconomy +
## SeatsPremium + PriceEconomy + PricePremium, data = training_set)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.70132 -0.08556 -0.01068 0.08934 0.70728
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.331e-01 6.712e-02 3.472 0.000606 ***
## Airline2 -1.244e-01 4.704e-02 -2.646 0.008664 **
## Airline3 4.003e-01 4.682e-02 8.550 1.15e-15 ***
## Airline4 8.580e-03 4.467e-02 0.192 0.847842
## Airline5 1.170e-01 3.058e-02 3.825 0.000165 ***
## FlightDuration 3.215e-02 4.483e-03 7.173 7.94e-12 ***
## SeatsEconomy -1.115e-04 1.879e-04 -0.593 0.553489
## SeatsPremium 6.351e-04 1.643e-03 0.387 0.699351
## PriceEconomy -8.754e-04 2.992e-05 -29.258 < 2e-16 ***
## PricePremium 5.579e-04 2.134e-05 26.139 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1623 on 255 degrees of freedom
## (50 observations deleted due to missingness)
## Multiple R-squared: 0.8438, Adjusted R-squared: 0.8383
## F-statistic: 153.1 on 9 and 255 DF, p-value: < 2.2e-16
# Trying out a non-linear regressor (SVM with a radial kernel) to check
# whether further deductions can be made.
# install.packages('e1071')
library(e1071)
# Fitted on training_set, like the linear models, so that test_set stays
# unseen and can still serve as a hold-out sample for this model.
regressor3 <- svm(formula = PriceRelative ~ .,
                  data = training_set,
                  type = 'eps-regression',
                  kernel = 'radial')
summary(regressor3)
##
## Call:
## svm(formula = PriceRelative ~ ., data = dataset, type = "eps-regression",
## kernel = "radial")
##
##
## Parameters:
## SVM-Type: eps-regression
## SVM-Kernel: radial
## cost: 1
## gamma: 0.04347826
## epsilon: 0.1
##
##
## Number of Support Vectors: 233
# Predictions of the reduced linear model for the held-out test rows
y_pred2 <- predict(object = regressor2, newdata = test_set)
# Scatterplots of the most dependent factors
# Columns are referenced through `dataset$` rather than attach(dataset), so the
# search path is left untouched and the plots always read the current data frame.
plot(dataset$Airline, dataset$PriceRelative,
     main = "PriceRelative VS Airline",
     ylab = "PriceRelative", xlab = "Airline ", pch = 19)

plot(dataset$FlightDuration, dataset$PriceRelative,
     main = "PriceRelative VS FlightDuration",
     ylab = "PriceRelative", xlab = "FlightDuration", pch = 19)

# Scatterplot matrix of the response and the most dependent factors
library(car)
scatterplotMatrix(
  ~ PriceRelative + Airline + FlightDuration + SeatsEconomy + SeatsPremium +
    PriceEconomy + PricePremium,
  main = "ScatterPlot Matrix",
  data = dataset
)

# Predicting a new result: apply the reduced linear model to the test set
y_pred <- predict(regressor2, newdata = test_set)
# Inspect the predictions that were just computed
View(y_pred)