Objective: Determine airfare based on existing data

Data Source: U.S. Department of Transportation

Data Description: Air fares and passengers for U.S. Domestic Routes for 4th Quarter of 2002.

Setting up environment

library(dplyr)
library(lubridate)
library(ggvis)
library(Amelia)


# Initializing working directory
dir_Data        = "/Users/richleung/Dropbox/Projects/Airfare and Demand 2002Q4/1_Input"
dir_Main      = "/Users/richleung/Dropbox/Projects/Airfare and Demand 2002Q4/2_Code"
dir_Output    = "/Users/richleung/Dropbox/Projects/Airfare and Demand 2002Q4/3_Output"



## Import raw data file
setwd(dir_Data)
Data_orig <- read.table("Airfare_Demand_4Q2002.csv", sep=",", header=TRUE)
# convert to local data frame
Data_orig <- tbl_df(Data_orig)

Quick data exploration

summary(Data_orig)
##      City1         City2     Avg_Fare_Cities     Distance     
##  ATL    : 64   TPA    : 46   Min.   : 50.52   Min.   : 108.0  
##  ORD    : 63   MCO    : 43   1st Qu.:125.97   1st Qu.: 553.5  
##  DFW    : 52   LGA    : 42   Median :161.34   Median : 919.0  
##  BWI    : 47   SEA    : 42   Mean   :163.38   Mean   :1057.0  
##  BOS    : 38   IAD    : 40   3rd Qu.:197.39   3rd Qu.:1452.5  
##  IAH    : 38   PHX    : 36   Max.   :401.23   Max.   :2724.0  
##  (Other):698   (Other):751                                    
##  Avg_Weekly_PAX   Mkt_Leading_Airline Market_Share_LeadingAirline
##  Min.   : 181.4   WN     :257         Min.   : 17.68             
##  1st Qu.: 257.2   DL     :162         1st Qu.: 46.51             
##  Median : 404.8   AA     :135         Median : 59.24             
##  Mean   : 672.3   UA     : 91         Mean   : 60.13             
##  3rd Qu.: 769.9   NW     : 86         3rd Qu.: 73.51             
##  Max.   :8950.8   CO     : 69         Max.   :100.00             
##                   (Other):200                                    
##  Avg_Fare_LeadingAirline Low_Price_Airline Market_Share_LowPriceAirline
##  Min.   : 50.52          WN     :230       Min.   :  1.06              
##  1st Qu.:124.08          DL     :151       1st Qu.: 13.20              
##  Median :161.40          AA     :129       Median : 26.14              
##  Mean   :166.65          US     :101       Mean   : 34.55              
##  3rd Qu.:202.98          FL     : 60       3rd Qu.: 52.97              
##  Max.   :490.03          HP     : 58       Max.   :100.00              
##                          (Other):271                                   
##  Price_LowPriceAirline
##  Min.   : 49.61       
##  1st Qu.:113.77       
##  Median :137.82       
##  Mean   :143.19       
##  3rd Qu.:168.53       
##  Max.   :387.94       
## 
# Search for NULLS in data set
# summary(is.na(Data_orig))                   # alternative approach
sapply(Data_orig, function(x) sum(is.na(x)))
##                        City1                        City2 
##                            0                            0 
##              Avg_Fare_Cities                     Distance 
##                            0                            0 
##               Avg_Weekly_PAX          Mkt_Leading_Airline 
##                            0                            0 
##  Market_Share_LeadingAirline      Avg_Fare_LeadingAirline 
##                            0                            0 
##            Low_Price_Airline Market_Share_LowPriceAirline 
##                            0                            0 
##        Price_LowPriceAirline 
##                            0
# Check for unique values per variable
sapply(Data_orig, function(x) length(unique(x)))
##                        City1                        City2 
##                           90                           85 
##              Avg_Fare_Cities                     Distance 
##                          980                          771 
##               Avg_Weekly_PAX          Mkt_Leading_Airline 
##                          921                           16 
##  Market_Share_LeadingAirline      Avg_Fare_LeadingAirline 
##                          928                          972 
##            Low_Price_Airline Market_Share_LowPriceAirline 
##                           19                          918 
##        Price_LowPriceAirline 
##                          970
# A much nicer and conlidated approach
missmap(Data_orig, main = "Missing Values vs. Observed Values")

Filters for flights whose destination is IAD.

Data_master <- Data_orig %>% 
  filter(City2 == "IAD") %>% 
  select(Avg_Fare_Cities, Distance, Avg_Weekly_PAX)

1.) Initial regression

model1 <- lm(Avg_Fare_Cities ~ Distance + Avg_Weekly_PAX, data = Data_master)


Data Screening (part 1)

library(QuantPsyc)
library(ppcor)

2.) Outliers (Mahalanobis)

Cut-off Score = chi-square distribution with 0.999 of probabilities on N observations.

mahal = mahalanobis(Data_master, colMeans(Data_master), cov(Data_master))
cutoff_mahal = qchisq(1-0.001, ncol(Data_master))
bad_mahal = as.numeric(mahal > cutoff_mahal)                  # people over the cutoff scores
table(bad_mahal)
## bad_mahal
##  0  1 
## 39  1

3.) Leverage

Defn: Influence of that person on slope of the regression line

Cut-off score: (2k+2) / N

k = 2                                                         # number of IVs
leverage = hatvalues(model1)
cutoff_leverage = (2*k+2) / nrow(Data_master)
bad_leverage = as.numeric(leverage > cutoff_leverage)         # people over cutoff scores (undue influence on slope of regression line)
table(bad_leverage)
## bad_leverage
##  0  1 
## 39  1

4.) Influence (Cook’s)

Defn: Effects that a single case possess on the whole model. Often described as Leverage + Discrepancy.

Cut-off score: 4 / (N-k-1)

cooks = cooks.distance(model1)
cutoff_cooks = 4 / (nrow(Data_master)-k-1)
bad_cooks = as.numeric(cooks > cutoff_cooks)                  # people over cutoff scores
table(bad_cooks)
## bad_cooks
##  0  1 
## 35  5

5.) Overall Outliers (Mahalanobis + Leverage + Influence)

Outliers_total = bad_mahal + bad_leverage + bad_cooks
table(Outliers_total)
## Outliers_total
##  0  1  3 
## 35  4  1

Remove observation with 2+ outlier criterias. Set as new dataset.

Data_master_NoOutliers = subset(Data_master, Outliers_total < 2)

6.) Regression post removal of outliers

model2 = lm(Avg_Fare_Cities ~ Distance + Avg_Weekly_PAX, data = Data_master_NoOutliers)
summary(model2)
## 
## Call:
## lm(formula = Avg_Fare_Cities ~ Distance + Avg_Weekly_PAX, data = Data_master_NoOutliers)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -115.609  -23.912    4.853   30.992  114.236 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.493e+02  1.863e+01   8.011 1.63e-09 ***
## Distance        5.649e-02  1.064e-02   5.309 5.82e-06 ***
## Avg_Weekly_PAX -2.313e-04  1.207e-02  -0.019    0.985    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.65 on 36 degrees of freedom
## Multiple R-squared:  0.4439, Adjusted R-squared:  0.413 
## F-statistic: 14.37 on 2 and 36 DF,  p-value: 2.585e-05

RESULT:

variable Coefficient (b) t-Value p-Value pr^2 Notes
[Distance] 0.0564940 5.309 0 0.4391469565 IV is significant
[Avg_Weekly_PAX] -0.0002313 -0.019 0.985 0.0000102036 IV is NOT significant

F(2, 36) = 14.37, p < 0.001, R^2 = 0.413 –> Model is significant



Data Screening (part 2)

7.) Multicollinerity

Objective: Want X and Y to be correlated | Do NOT want Xs to be highly correlated (waste power of Degree-of-Freedoms == Supression)

summary(model2, correlation = TRUE)
## 
## Call:
## lm(formula = Avg_Fare_Cities ~ Distance + Avg_Weekly_PAX, data = Data_master_NoOutliers)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -115.609  -23.912    4.853   30.992  114.236 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.493e+02  1.863e+01   8.011 1.63e-09 ***
## Distance        5.649e-02  1.064e-02   5.309 5.82e-06 ***
## Avg_Weekly_PAX -2.313e-04  1.207e-02  -0.019    0.985    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47.65 on 36 degrees of freedom
## Multiple R-squared:  0.4439, Adjusted R-squared:  0.413 
## F-statistic: 14.37 on 2 and 36 DF,  p-value: 2.585e-05
## 
## Correlation of Coefficients:
##                (Intercept) Distance
## Distance       -0.77               
## Avg_Weekly_PAX -0.60        0.13

8.) Linearity

standardized = rstudent(model2)                               # Standarized Residuals
fitted = scale(model2$fitted.values)                          # Fitted values

qqnorm(standardized)
abline(0,1)

* RESULT: Plot reflects shape of a linear function.

9.) Normality

hist(standardized)

10.) Homogeneity & Homoscedasticity

plot(fitted, standardized)
abline(0,0)
abline(v = 0)

* RESULT: Homogeneity == Pretty Good as observations are NOT beyond the +/- 2 areas. * RESULT: Homoscedasticity == Pretty Good as observations are NOT clumped together.



11.) Review Model IVs for signifiance using Beta == Z-Score == Standardize Regression Coefficient

lm.beta(model2)
##       Distance Avg_Weekly_PAX 
##    0.665948095   -0.002404025

12.) Review Model IVs for signifiance using Partial Correlation (pr^2) of dataset (sans outliers).

options(scipen = 999)                                         # turn off scientific notation
partials = pcor(Data_master_NoOutliers)
partials$estimate^2
##                 Avg_Fare_Cities    Distance Avg_Weekly_PAX
## Avg_Fare_Cities    1.0000000000 0.439146957   0.0000102036
## Distance           0.4391469565 1.000000000   0.0097959815
## Avg_Weekly_PAX     0.0000102036 0.009795981   1.0000000000


13.) Final Model

options(scipen = 0)                                           # turn on scientific notation
model3 = lm(Avg_Fare_Cities ~ Distance, data = Data_master_NoOutliers)
summary(model3)
## 
## Call:
## lm(formula = Avg_Fare_Cities ~ Distance, data = Data_master_NoOutliers)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -115.594  -24.050    4.897   31.007  114.160 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 149.04417   14.76509  10.094 3.55e-12 ***
## Distance      0.05652    0.01040   5.435 3.66e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 47 on 37 degrees of freedom
## Multiple R-squared:  0.4439, Adjusted R-squared:  0.4289 
## F-statistic: 29.54 on 1 and 37 DF,  p-value: 3.658e-06

Given that [Distance] has a relatively stronger correlation than [Avg_Weekly_PAX], and also that Beta for [Distance] > [Avg_Weekly_PAX]. I drop [Avg_Weekly_PAX] from Linear Regression.

As a result, I removed all IV except for [Distance].

variable Coefficient (b) t-Value p-Value
[Distance] 0.0564940 5.435 0

RESULT:

Avg_Fare_Cities = 149.2568206 + (0.0564940 * Distance)

Finally, I predict price of ticket to IAD by inserting distances from BOS, CVG, LAX, SFO, SEA with predict().

IV.model3 <- data.frame(Distance = c(413, 380.65, 2311, 2442, 2329))
Avg_Fare_Cities.predict <- predict(model3, IV.model3)
Avg_Fare_Cities.predict
##        1        2        3        4        5 
## 172.3876 170.5591 279.6655 287.0698 280.6829
Destination Origin Distance actual Air fare predict Air fare
IAD BOS 413 $ 163.37 $ 172.3876
IAD CVG 411 $ 194.70 $ 170.5591
IAD LAX 2311 $ 361.57 $ 279.6655
IAD SFO 2442 $ 401.23 $ 287.0698
IAD SEA 2329 $ 288.14 $ 280.6829