Objective: Predict airfare for U.S. domestic routes from existing route-level data
Data Source: U.S. Department of Transportation
Data Description: Air fares and passengers for U.S. Domestic Routes for 4th Quarter of 2002.
library(dplyr)      # data manipulation (filter, select, pipes)
library(lubridate)  # date handling
library(ggvis)      # visualization
library(Amelia)     # missmap() for visualizing missing data
# Initialize working directories
dir_Data = "/Users/richleung/Dropbox/Projects/Airfare and Demand 2002Q4/1_Input"
dir_Main = "/Users/richleung/Dropbox/Projects/Airfare and Demand 2002Q4/2_Code"
dir_Output = "/Users/richleung/Dropbox/Projects/Airfare and Demand 2002Q4/3_Output"
## Import raw data file
setwd(dir_Data)
Data_orig <- read.table("Airfare_Demand_4Q2002.csv", sep=",", header=TRUE)
# convert to local data frame
Data_orig <- tbl_df(Data_orig)
summary(Data_orig)
## City1 City2 Avg_Fare_Cities Distance
## ATL : 64 TPA : 46 Min. : 50.52 Min. : 108.0
## ORD : 63 MCO : 43 1st Qu.:125.97 1st Qu.: 553.5
## DFW : 52 LGA : 42 Median :161.34 Median : 919.0
## BWI : 47 SEA : 42 Mean :163.38 Mean :1057.0
## BOS : 38 IAD : 40 3rd Qu.:197.39 3rd Qu.:1452.5
## IAH : 38 PHX : 36 Max. :401.23 Max. :2724.0
## (Other):698 (Other):751
## Avg_Weekly_PAX Mkt_Leading_Airline Market_Share_LeadingAirline
## Min. : 181.4 WN :257 Min. : 17.68
## 1st Qu.: 257.2 DL :162 1st Qu.: 46.51
## Median : 404.8 AA :135 Median : 59.24
## Mean : 672.3 UA : 91 Mean : 60.13
## 3rd Qu.: 769.9 NW : 86 3rd Qu.: 73.51
## Max. :8950.8 CO : 69 Max. :100.00
## (Other):200
## Avg_Fare_LeadingAirline Low_Price_Airline Market_Share_LowPriceAirline
## Min. : 50.52 WN :230 Min. : 1.06
## 1st Qu.:124.08 DL :151 1st Qu.: 13.20
## Median :161.40 AA :129 Median : 26.14
## Mean :166.65 US :101 Mean : 34.55
## 3rd Qu.:202.98 FL : 60 3rd Qu.: 52.97
## Max. :490.03 HP : 58 Max. :100.00
## (Other):271
## Price_LowPriceAirline
## Min. : 49.61
## 1st Qu.:113.77
## Median :137.82
## Mean :143.19
## 3rd Qu.:168.53
## Max. :387.94
##
# Search for missing values (NA) in the data set
# summary(is.na(Data_orig)) # alternative approach
sapply(Data_orig, function(x) sum(is.na(x)))
## City1 City2
## 0 0
## Avg_Fare_Cities Distance
## 0 0
## Avg_Weekly_PAX Mkt_Leading_Airline
## 0 0
## Market_Share_LeadingAirline Avg_Fare_LeadingAirline
## 0 0
## Low_Price_Airline Market_Share_LowPriceAirline
## 0 0
## Price_LowPriceAirline
## 0
# Check for unique values per variable
sapply(Data_orig, function(x) length(unique(x)))
## City1 City2
## 90 85
## Avg_Fare_Cities Distance
## 980 771
## Avg_Weekly_PAX Mkt_Leading_Airline
## 921 16
## Market_Share_LeadingAirline Avg_Fare_LeadingAirline
## 928 972
## Low_Price_Airline Market_Share_LowPriceAirline
## 19 918
## Price_LowPriceAirline
## 970
# A much nicer and consolidated approach: visualize missingness with missmap()
missmap(Data_orig, main = "Missing Values vs. Observed Values")
Filter for routes whose destination (City2) is IAD and keep only the variables needed for the regression.
Data_master <- Data_orig %>%
filter(City2 == "IAD") %>%
select(Avg_Fare_Cities, Distance, Avg_Weekly_PAX)
model1 <- lm(Avg_Fare_Cities ~ Distance + Avg_Weekly_PAX, data = Data_master)
Data Screening (part 1)
library(QuantPsyc)
library(ppcor)
Mahalanobis Distance. Defn: the multivariate distance of each observation from the centroid of the variables.
Cut-off score: the chi-square quantile at probability 0.999, with degrees of freedom equal to the number of variables.
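With the three variables in Data_master, this works out to qchisq(1 - 0.001, 3), approximately 16.27.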
mahal = mahalanobis(Data_master, colMeans(Data_master), cov(Data_master))
cutoff_mahal = qchisq(1-0.001, ncol(Data_master))
bad_mahal = as.numeric(mahal > cutoff_mahal) # observations over the cutoff score
table(bad_mahal)
## bad_mahal
## 0 1
## 39 1
Leverage. Defn: the influence of a single observation on the slope of the regression line.
Cut-off score: (2k + 2) / N
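With k = 2 IVs and N = 40 observations, this works out to (2*2 + 2) / 40 = 0.15.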
k = 2 # number of IVs
leverage = hatvalues(model1)
cutoff_leverage = (2*k+2) / nrow(Data_master)
bad_leverage = as.numeric(leverage > cutoff_leverage) # observations over the cutoff score (undue influence on the slope of the regression line)
table(bad_leverage)
## bad_leverage
## 0 1
## 39 1
Cook's Distance. Defn: the overall influence a single case has on the whole model; often described as Leverage + Discrepancy.
Cut-off score: 4 / (N - k - 1)
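With N = 40 and k = 2, this works out to 4 / (40 - 2 - 1) = 4/37, approximately 0.108.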
cooks = cooks.distance(model1)
cutoff_cooks = 4 / (nrow(Data_master)-k-1)
bad_cooks = as.numeric(cooks > cutoff_cooks) # observations over the cutoff score
table(bad_cooks)
## bad_cooks
## 0 1
## 35 5
Outliers_total = bad_mahal + bad_leverage + bad_cooks
table(Outliers_total)
## Outliers_total
## 0 1 3
## 35 4 1
Remove observations flagged by two or more outlier criteria and keep the remaining rows as the new dataset.
Data_master_NoOutliers = subset(Data_master, Outliers_total < 2)
model2 = lm(Avg_Fare_Cities ~ Distance + Avg_Weekly_PAX, data = Data_master_NoOutliers)
summary(model2)
##
## Call:
## lm(formula = Avg_Fare_Cities ~ Distance + Avg_Weekly_PAX, data = Data_master_NoOutliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -115.609 -23.912 4.853 30.992 114.236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.493e+02 1.863e+01 8.011 1.63e-09 ***
## Distance 5.649e-02 1.064e-02 5.309 5.82e-06 ***
## Avg_Weekly_PAX -2.313e-04 1.207e-02 -0.019 0.985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.65 on 36 degrees of freedom
## Multiple R-squared: 0.4439, Adjusted R-squared: 0.413
## F-statistic: 14.37 on 2 and 36 DF, p-value: 2.585e-05
RESULT:
| Variable | Coefficient (b) | t-Value | p-Value | pr^2 (partial r^2) | Notes |
|---|---|---|---|---|---|
| [Distance] | 0.0564940 | 5.309 | < 0.001 | 0.4391469565 | IV is significant |
| [Avg_Weekly_PAX] | -0.0002313 | -0.019 | 0.985 | 0.0000102036 | IV is NOT significant |
F(2, 36) = 14.37, p < 0.001, adjusted R^2 = 0.413 -> the overall model is significant.
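In equation form, model2 reads (taken directly from the coefficient estimates above): Avg_Fare_Cities = 149.3 + (0.05649 * Distance) - (0.0002313 * Avg_Weekly_PAX).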
Data Screening (part 2)
Objective: We want each X to be correlated with Y, but we do NOT want the Xs to be highly correlated with each other (multicollinearity wastes degrees of freedom and can produce suppression). See also the optional VIF check after the correlation output below.
summary(model2, correlation = TRUE)
##
## Call:
## lm(formula = Avg_Fare_Cities ~ Distance + Avg_Weekly_PAX, data = Data_master_NoOutliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -115.609 -23.912 4.853 30.992 114.236
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.493e+02 1.863e+01 8.011 1.63e-09 ***
## Distance 5.649e-02 1.064e-02 5.309 5.82e-06 ***
## Avg_Weekly_PAX -2.313e-04 1.207e-02 -0.019 0.985
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47.65 on 36 degrees of freedom
## Multiple R-squared: 0.4439, Adjusted R-squared: 0.413
## F-statistic: 14.37 on 2 and 36 DF, p-value: 2.585e-05
##
## Correlation of Coefficients:
## (Intercept) Distance
## Distance -0.77
## Avg_Weekly_PAX -0.60 0.13
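As an additional multicollinearity check (not part of the original write-up), variance inflation factors can be computed; a minimal sketch, assuming the car package is installed:
library(car) # assumption: car is available and provides vif()
vif(model2)  # VIF values near 1 indicate the IVs are not strongly collinear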
standardized = rstudent(model2) # studentized residuals
fitted = scale(model2$fitted.values) # standardized fitted values
qqnorm(standardized)
abline(0,1)
* RESULT: The Q-Q plot is approximately linear, so the residuals are roughly normally distributed.
hist(standardized)
plot(fitted, standardized)
abline(0,0)
abline(v = 0)
* RESULT: Homogeneity == pretty good; the standardized residuals stay within roughly +/- 2 of zero.
* RESULT: Homoscedasticity == pretty good; the residuals are spread evenly rather than clumped together.
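Base R's built-in diagnostics give a similar picture and are a quick alternative to the manual plots above (a sketch, not part of the original code):
par(mfrow = c(2, 2)) # show all four diagnostic panels at once
plot(model2)         # residuals vs fitted, normal Q-Q, scale-location, residuals vs leverage
par(mfrow = c(1, 1)) # reset the plotting layout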
lm.beta(model2)
## Distance Avg_Weekly_PAX
## 0.665948095 -0.002404025
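For reference, the standardized beta for [Distance] can be reproduced by hand as b * sd(x) / sd(y), which is the scaling lm.beta() applies; a sketch using base R only:
coef(model2)["Distance"] * sd(Data_master_NoOutliers$Distance) /
  sd(Data_master_NoOutliers$Avg_Fare_Cities)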
options(scipen = 999) # turn off scientific notation
partials = pcor(Data_master_NoOutliers)
partials$estimate^2
## Avg_Fare_Cities Distance Avg_Weekly_PAX
## Avg_Fare_Cities 1.0000000000 0.439146957 0.0000102036
## Distance 0.4391469565 1.000000000 0.0097959815
## Avg_Weekly_PAX 0.0000102036 0.009795981 1.0000000000
options(scipen = 0) # restore default scientific notation behavior
model3 = lm(Avg_Fare_Cities ~ Distance, data = Data_master_NoOutliers)
summary(model3)
##
## Call:
## lm(formula = Avg_Fare_Cities ~ Distance, data = Data_master_NoOutliers)
##
## Residuals:
## Min 1Q Median 3Q Max
## -115.594 -24.050 4.897 31.007 114.160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 149.04417 14.76509 10.094 3.55e-12 ***
## Distance 0.05652 0.01040 5.435 3.66e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 47 on 37 degrees of freedom
## Multiple R-squared: 0.4439, Adjusted R-squared: 0.4289
## F-statistic: 29.54 on 1 and 37 DF, p-value: 3.658e-06
Given that [Distance] has a much stronger partial correlation with the DV than [Avg_Weekly_PAX], and that the standardized beta for [Distance] is far larger, I drop [Avg_Weekly_PAX] from the regression. As a result, [Distance] is the only IV retained.
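A formal check on this decision (not in the original analysis) is a nested-model F test comparing the reduced model against the full one; a minimal sketch:
anova(model3, model2) # a non-significant F confirms that Avg_Weekly_PAX adds no explanatory power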
| Variable | Coefficient (b) | t-Value | p-Value |
|---|---|---|---|
| [Distance] | 0.05652 | 5.435 | < 0.001 |
RESULT:
Avg_Fare_Cities = 149.04417 + (0.05652 * Distance)
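For example, for the BOS route at 413 miles: 149.04417 + (0.05652 * 413) is approximately 172.39, matching the predict() output below.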
Finally, I predict the ticket price to IAD by feeding the distances from BOS, CVG, LAX, SFO, and SEA into predict().
IV.model3 <- data.frame(Distance = c(413, 380.65, 2311, 2442, 2329))
Avg_Fare_Cities.predict <- predict(model3, IV.model3)
Avg_Fare_Cities.predict
## 1 2 3 4 5
## 172.3876 170.5591 279.6655 287.0698 280.6829
| Destination | Origin | Distance (miles) | Actual Air Fare | Predicted Air Fare |
|---|---|---|---|---|
| IAD | BOS | 413 | $ 163.37 | $ 172.3876 |
| IAD | CVG | 411 | $ 194.70 | $ 170.5591 |
| IAD | LAX | 2311 | $ 361.57 | $ 279.6655 |
| IAD | SFO | 2442 | $ 401.23 | $ 287.0698 |
| IAD | SEA | 2329 | $ 288.14 | $ 280.6829 |