Load the Airfares.csv and install/load any required packages.
airfares <- read.csv("D:/MSBA/3-Winter 2020/560/data/Airfares.csv")
head(airfares)
## S_CODE S_CITY E_CODE E_CITY COUPON NEW
## 1 * Dallas/Fort Worth TX * Amarillo TX 1.00 3
## 2 * Atlanta GA * Baltimore/Wash Intl MD 1.06 3
## 3 * Boston MA * Baltimore/Wash Intl MD 1.06 3
## 4 ORD Chicago IL * Baltimore/Wash Intl MD 1.06 3
## 5 MDW Chicago IL * Baltimore/Wash Intl MD 1.06 3
## 6 * Cleveland OH * Baltimore/Wash Intl MD 1.01 3
## VACATION SW HI S_INCOME E_INCOME S_POP E_POP SLOT GATE
## 1 No Yes 5291.99 28637 21112 3036732 205711 Free Free
## 2 No No 5419.16 26993 29838 3532657 7145897 Free Free
## 3 No No 9185.28 30124 29838 5787293 7145897 Free Free
## 4 No Yes 2657.35 29260 29838 7830332 7145897 Controlled Free
## 5 No Yes 2657.35 29260 29838 7830332 7145897 Free Free
## 6 No Yes 3408.11 26046 29838 2230955 7145897 Free Free
## DISTANCE PAX FARE
## 1 312 7864 64.11
## 2 576 8820 174.47
## 3 364 6452 207.76
## 4 612 25144 85.47
## 5 612 25144 85.47
## 6 309 13386 56.76
Many of our numeric fields contain special characters so we will need to clean them before we can run any analysis.
airfares$FARE <- as.numeric(substring(as.character(airfares$FARE),2))
toberemovedChars <- c("$",",")
for(c in toberemovedChars)
{
airfares$S_INCOME <- sub(x = as.character(airfares$S_INCOME),
pattern = c,replacement = "",fixed = TRUE)
airfares$E_INCOME <- sub(x = as.character(airfares$E_INCOME),
pattern = c,replacement = "",fixed = TRUE)
}
airfares$S_INCOME <- as.numeric(airfares$S_INCOME)
airfares$E_INCOME <- as.numeric(airfares$E_INCOME)
airfares$S_POP <- as.numeric(airfares$S_POP)
airfares$E_POP <- as.numeric(airfares$E_POP)
airfares$NEW <- as.numeric(airfares$NEW)
airfares$DISTANCE <- as.numeric(airfares$DISTANCE)
airfares$PAX <- as.numeric(airfares$PAX)
str(airfares)
## 'data.frame': 638 obs. of 18 variables:
## $ S_CODE : Factor w/ 8 levels "*","DCA","EWR",..: 1 1 1 8 7 1 1 1 1 1 ...
## $ S_CITY : Factor w/ 51 levels "Albuquerque NM",..: 14 3 7 9 9 11 14 18 23 25 ...
## $ E_CODE : Factor w/ 8 levels "*","DCA","EWR",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ E_CITY : Factor w/ 68 levels "Amarillo TX",..: 1 2 2 2 2 2 2 2 2 2 ...
## $ COUPON : num 1 1.06 1.06 1.06 1.06 1.01 1.28 1.15 1.33 1.6 ...
## $ NEW : num 3 3 3 3 3 3 3 3 3 2 ...
## $ VACATION: Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 2 1 1 ...
## $ SW : Factor w/ 2 levels "No","Yes": 2 1 1 2 2 2 1 2 2 2 ...
## $ HI : num 5292 5419 9185 2657 2657 ...
## $ S_INCOME: num 28637 26993 30124 29260 29260 ...
## $ E_INCOME: num 21112 29838 29838 29838 29838 ...
## $ S_POP : num 3036732 3532657 5787293 7830332 7830332 ...
## $ E_POP : num 205711 7145897 7145897 7145897 7145897 ...
## $ SLOT : Factor w/ 2 levels "Controlled","Free": 2 2 2 1 2 2 2 2 2 2 ...
## $ GATE : Factor w/ 2 levels "Constrained",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ DISTANCE: num 312 576 364 612 612 ...
## $ PAX : num 7864 8820 6452 25144 25144 ...
## $ FARE : num 4.11 74.47 7.76 5.47 5.47 ...
Explore the numerical predictors and response (FARE) by creating a correlation table and examining some scatterplots between FARE and those predictors. What seems to be the best single predictor of FARE?
numair=airfares[,c(5,6,9:13,16:18)] #separates out numerical columns for correlation
par(mfrow=c(1,2))
cmair=cor(numair,use="complete.obs")
corrplot(cmair,type="lower",main="Airfair Correlation Matrix",mar=c(0,0,1,0),tl.cex=0.8,tl.col="black", tl.srt=45,
col=brewer.pal(n=8, name="PuOr"))
plot(numair$FARE,numair$COUPON,xlab="Coupon",ylab="Fare",pch=20,col="mediumpurple4")
plot(numair$FARE,numair$NEW,xlab="New",ylab="Fare",pch=20,col="mediumpurple4")
plot(numair$FARE,numair$HI,xlab="HI",ylab="Fare",pch=20,col="mediumpurple4")
plot(numair$FARE,numair$S_INCOME,xlab="S Income",ylab="Fare",pch=20,col="mediumpurple4")
plot(numair$FARE,numair$E_INCOME,xlab="E Income",ylab="Fare",pch=20,col="mediumpurple4")
plot(numair$FARE,numair$S_POP,xlab="S Pop",ylab="Fare",pch=20,col="mediumpurple4")
plot(numair$FARE,numair$E_POP,xlab="S Pop",ylab="Fare",pch=20,col="mediumpurple4")
plot(numair$FARE,numair$DISTANCE,xlab="Distance",ylab="Fare",pch=20,col="mediumpurple4")
plot(numair$FARE,numair$PAX,xlab="Pax",ylab="Fare",pch=20,col="mediumpurple4")
round(cmair,2)
## COUPON NEW HI S_INCOME E_INCOME S_POP E_POP DISTANCE PAX FARE
## COUPON 1.00 0.02 -0.35 -0.09 0.05 -0.11 0.09 0.75 -0.34 0.35
## NEW 0.02 1.00 0.05 0.03 0.11 -0.02 0.06 0.08 0.01 0.03
## HI -0.35 0.05 1.00 -0.03 0.08 -0.17 -0.06 -0.31 -0.17 -0.05
## S_INCOME -0.09 0.03 -0.03 1.00 -0.14 0.52 -0.27 0.03 0.14 0.18
## E_INCOME 0.05 0.11 0.08 -0.14 1.00 -0.14 0.46 0.18 0.26 0.17
## S_POP -0.11 -0.02 -0.17 0.52 -0.14 1.00 -0.28 0.02 0.28 0.11
## E_POP 0.09 0.06 -0.06 -0.27 0.46 -0.28 1.00 0.12 0.31 0.17
## DISTANCE 0.75 0.08 -0.31 0.03 0.18 0.02 0.12 1.00 -0.10 0.41
## PAX -0.34 0.01 -0.17 0.14 0.26 0.28 0.31 -0.10 1.00 -0.07
## FARE 0.35 0.03 -0.05 0.18 0.17 0.11 0.17 0.41 -0.07 1.00
Based on the scatterplots, COUPON and DISTANCE exhibit linear relationships with FARE. When we we look at the values in our correlation table we can see that DISTANCE has the strongest correlation and is therefore the best single predictor of FARE.
Explore the categorical predictors (excluding the first four) by computing the percentage of flights in each category. Create a pivot table with the average fare in each category. Which categorical predictor seems best for predicting FARE?
catair=airfares[,c(7,8,14,15,18)] #separates out categorical columns for pivot table
#VACATION
v<-count(catair,'VACATION')
v=mutate(v,
pct = freq / sum(freq))
v
## VACATION freq pct
## 1 No 468 0.7335423
## 2 Yes 170 0.2664577
v2=aggregate(catair$FARE, list(catair$VACATION),mean)
v2
## Group.1 x
## 1 No 36.11660
## 2 Yes 27.45147
#SW
sw<-count(catair,'SW')
sw=mutate(sw,
pct = freq / sum(freq))
sw
## SW freq pct
## 1 No 444 0.6959248
## 2 Yes 194 0.3040752
sw2=aggregate(catair$FARE, list(catair$SW),mean)
sw2
## Group.1 x
## 1 No 41.13324
## 2 Yes 17.04206
#SLOT
s<-count(catair,'SLOT')
s=mutate(s,
pct = freq / sum(freq))
s
## SLOT freq pct
## 1 Controlled 182 0.2852665
## 2 Free 456 0.7147335
s2=aggregate(catair$FARE, list(catair$SLOT),mean)
s2
## Group.1 x
## 1 Controlled 42.21324
## 2 Free 30.45287
#GATE
g<-count(catair,'GATE')
g=mutate(g,
pct = freq / sum(freq))
g
## GATE freq pct
## 1 Constrained 124 0.1943574
## 2 Free 514 0.8056426
g2=aggregate(catair$FARE, list(catair$VACATION),mean)
g2
## Group.1 x
## 1 No 36.11660
## 2 Yes 27.45147
Based on the average FARE as calculated by categorical variables is the highest for Controlled SLOT and the lowest for SW yes. From this we can guess that SouthWest serviced flights will be the cheapest and airports that are known for controlling their landing slot congestion will be the most expensive
Convert categorical variables (e.g, SW) into dummy variables. Then, partition the data into training and validation sets. The model will be fit to the training data and evaluated on the validation set.
dumair<-airfares[c(5:18)]
dumair<-dummy.data.frame(dumair,names = c("VACATION","SW","SLOT","GATE"),sep=".")
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
## Warning in model.matrix.default(~x - 1, model.frame(~x - 1), contrasts = FALSE):
## non-list contrasts argument ignored
names(dumair)
## [1] "COUPON" "NEW" "VACATION.No" "VACATION.Yes"
## [5] "SW.No" "SW.Yes" "HI" "S_INCOME"
## [9] "E_INCOME" "S_POP" "E_POP" "SLOT.Controlled"
## [13] "SLOT.Free" "GATE.Constrained" "GATE.Free" "DISTANCE"
## [17] "PAX" "FARE"
smp_size <- floor(0.75 * nrow(dumair))
## set the seed to make your partition reproductible
set.seed(123)
train_ind <- sample(seq_len(nrow(dumair)), size = smp_size)
train <- dumair[train_ind, ]
test <- dumair[-train_ind, ]
Use stepwise regression to reduce the number of predictors. You can ignore the first four predictors (S_CODE, S_CITY, E_CODE, E_CITY). Report the estimated model selected.
#create a full model first
trainlm<- train
trainlm<- lm(FARE~.,data = trainlm)
smr <- summary(trainlm)
paste(paste("Multiple R-squared: ",round(smr$r.squared,digits=6)),
paste("Adjusted R-squared: ",round(smr$adj.r.squared,digits=6)))
## [1] "Multiple R-squared: 0.314639 Adjusted R-squared: 0.295437"
#run stepwise regression
trainlmstep <- lm(FARE~.,data=train)
step(trainlmstep, direction = "both")
## Start: AIC=3062.42
## FARE ~ COUPON + NEW + VACATION.No + VACATION.Yes + SW.No + SW.Yes +
## HI + S_INCOME + E_INCOME + S_POP + E_POP + SLOT.Controlled +
## SLOT.Free + GATE.Constrained + GATE.Free + DISTANCE + PAX
##
##
## Step: AIC=3062.42
## FARE ~ COUPON + NEW + VACATION.No + VACATION.Yes + SW.No + SW.Yes +
## HI + S_INCOME + E_INCOME + S_POP + E_POP + SLOT.Controlled +
## SLOT.Free + GATE.Constrained + DISTANCE + PAX
##
##
## Step: AIC=3062.42
## FARE ~ COUPON + NEW + VACATION.No + VACATION.Yes + SW.No + SW.Yes +
## HI + S_INCOME + E_INCOME + S_POP + E_POP + SLOT.Controlled +
## GATE.Constrained + DISTANCE + PAX
##
##
## Step: AIC=3062.42
## FARE ~ COUPON + NEW + VACATION.No + VACATION.Yes + SW.No + HI +
## S_INCOME + E_INCOME + S_POP + E_POP + SLOT.Controlled + GATE.Constrained +
## DISTANCE + PAX
##
##
## Step: AIC=3062.42
## FARE ~ COUPON + NEW + VACATION.No + SW.No + HI + S_INCOME + E_INCOME +
## S_POP + E_POP + SLOT.Controlled + GATE.Constrained + DISTANCE +
## PAX
##
## Df Sum of Sq RSS AIC
## - E_INCOME 1 121.6 273269 3060.6
## - COUPON 1 474.3 273622 3061.2
## - NEW 1 583.0 273731 3061.4
## - HI 1 698.1 273846 3061.6
## - SLOT.Controlled 1 933.1 274081 3062.1
## <none> 273148 3062.4
## - S_INCOME 1 1728.8 274877 3063.4
## - VACATION.No 1 2045.9 275194 3064.0
## - S_POP 1 2191.8 275340 3064.2
## - GATE.Constrained 1 2887.8 276036 3065.4
## - E_POP 1 2917.0 276065 3065.5
## - PAX 1 3869.2 277017 3067.1
## - SW.No 1 10323.7 283472 3078.2
## - DISTANCE 1 14417.2 287565 3085.0
##
## Step: AIC=3060.63
## FARE ~ COUPON + NEW + VACATION.No + SW.No + HI + S_INCOME + S_POP +
## E_POP + SLOT.Controlled + GATE.Constrained + DISTANCE + PAX
##
## Df Sum of Sq RSS AIC
## - COUPON 1 449.8 273719 3059.4
## - NEW 1 526.6 273796 3059.6
## - HI 1 818.9 274088 3060.1
## - SLOT.Controlled 1 1047.0 274316 3060.5
## <none> 273269 3060.6
## - S_INCOME 1 1627.7 274897 3061.5
## - S_POP 1 2101.9 275371 3062.3
## - VACATION.No 1 2163.4 275433 3062.4
## + E_INCOME 1 121.6 273148 3062.4
## - GATE.Constrained 1 2862.6 276132 3063.6
## - E_POP 1 3239.2 276509 3064.3
## - PAX 1 3754.9 277024 3065.2
## - SW.No 1 11094.6 284364 3077.7
## - DISTANCE 1 15219.3 288489 3084.5
##
## Step: AIC=3059.42
## FARE ~ NEW + VACATION.No + SW.No + HI + S_INCOME + S_POP + E_POP +
## SLOT.Controlled + GATE.Constrained + DISTANCE + PAX
##
## Df Sum of Sq RSS AIC
## - HI 1 526 274245 3058.3
## - NEW 1 584 274303 3058.4
## - SLOT.Controlled 1 1139 274858 3059.4
## <none> 273719 3059.4
## - S_INCOME 1 1489 275208 3060.0
## + COUPON 1 450 273269 3060.6
## - S_POP 1 1945 275664 3060.8
## + E_INCOME 1 97 273622 3061.2
## - VACATION.No 1 2329 276049 3061.5
## - GATE.Constrained 1 2896 276615 3062.4
## - E_POP 1 3386 277105 3063.3
## - PAX 1 5865 279584 3067.6
## - SW.No 1 11605 285324 3077.3
## - DISTANCE 1 35552 309271 3115.8
##
## Step: AIC=3058.34
## FARE ~ NEW + VACATION.No + SW.No + S_INCOME + S_POP + E_POP +
## SLOT.Controlled + GATE.Constrained + DISTANCE + PAX
##
## Df Sum of Sq RSS AIC
## - NEW 1 537 274782 3057.3
## - SLOT.Controlled 1 958 275203 3058.0
## <none> 274245 3058.3
## - S_INCOME 1 1488 275733 3058.9
## - S_POP 1 1676 275921 3059.2
## + HI 1 526 273719 3059.4
## + E_INCOME 1 197 274048 3060.0
## + COUPON 1 157 274088 3060.1
## - VACATION.No 1 2848 277093 3061.3
## - GATE.Constrained 1 2958 277203 3061.5
## - E_POP 1 3164 277409 3061.8
## - PAX 1 6140 280385 3066.9
## - SW.No 1 12859 287104 3078.2
## - DISTANCE 1 36901 311146 3116.7
##
## Step: AIC=3057.27
## FARE ~ VACATION.No + SW.No + S_INCOME + S_POP + E_POP + SLOT.Controlled +
## GATE.Constrained + DISTANCE + PAX
##
## Df Sum of Sq RSS AIC
## - SLOT.Controlled 1 873 275655 3056.8
## <none> 274782 3057.3
## - S_INCOME 1 1451 276233 3057.8
## - S_POP 1 1760 276542 3058.3
## + NEW 1 537 274245 3058.3
## + HI 1 479 274303 3058.4
## + COUPON 1 197 274585 3058.9
## + E_INCOME 1 117 274665 3059.1
## - VACATION.No 1 2716 277498 3060.0
## - GATE.Constrained 1 2874 277656 3060.2
## - E_POP 1 3185 277967 3060.8
## - PAX 1 6183 280965 3065.9
## - SW.No 1 12776 287558 3077.0
## - DISTANCE 1 36371 311152 3114.7
##
## Step: AIC=3056.79
## FARE ~ VACATION.No + SW.No + S_INCOME + S_POP + E_POP + GATE.Constrained +
## DISTANCE + PAX
##
## Df Sum of Sq RSS AIC
## <none> 275655 3056.8
## + SLOT.Controlled 1 873 274782 3057.3
## + SLOT.Free 1 873 274782 3057.3
## - S_INCOME 1 1799 277454 3057.9
## + NEW 1 453 275203 3058.0
## + HI 1 317 275338 3058.2
## + COUPON 1 285 275370 3058.3
## + E_INCOME 1 200 275455 3058.4
## - GATE.Constrained 1 2130 277785 3058.5
## - VACATION.No 1 2718 278373 3059.5
## - S_POP 1 2860 278516 3059.7
## - E_POP 1 5043 280698 3063.4
## - PAX 1 6471 282126 3065.9
## - SW.No 1 14861 290517 3079.9
## - DISTANCE 1 35508 311163 3112.7
##
## Call:
## lm(formula = FARE ~ VACATION.No + SW.No + S_INCOME + S_POP +
## E_POP + GATE.Constrained + DISTANCE + PAX, data = train)
##
## Coefficients:
## (Intercept) VACATION.No SW.No S_INCOME
## -2.001e+01 6.060e+00 1.385e+01 7.068e-04
## S_POP E_POP GATE.Constrained DISTANCE
## 1.093e-06 1.612e-06 5.741e+00 1.424e-02
## PAX
## -3.360e-04
#select model
trainmodel <- lm(formula = FARE ~ VACATION.No + SW.No + S_INCOME + S_POP +
E_POP + GATE.Constrained + DISTANCE + PAX, data = train)
smr2 <- summary(trainmodel)
paste(paste("Multiple R-squared: ",round(smr2$r.squared,digits=6)),
paste("Adjusted R-squared: ",round(smr2$adj.r.squared,digits=6)))
## [1] "Multiple R-squared: 0.308348 Adjusted R-squared: 0.29655"
Based on the stepwise regression we create a training model with the variables: VACATION.No, SW.No, S_INCOME, S_POP, E_POP, GATE.Constrained, DISTANCE, PAX.
Repeat (ii) using exhaustive search instead of stepwise regression. Compare the resulting best model to the one you obtained in (ii) in terms of the predictors that are in the model.
trainsearch <- regsubsets(FARE~.,data = train, nbest=1,nvmax=dim(train)[2],
method = "exhaustive")
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 4 linear dependencies found
## Reordering variables and trying again:
sum<-summary(trainsearch)
#show models
sum$which
## (Intercept) COUPON NEW VACATION.No VACATION.Yes SW.No SW.Yes HI
## 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## 3 TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## 4 TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## 5 TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## 6 TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## 7 TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## 8 TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## 9 TRUE FALSE FALSE FALSE TRUE TRUE FALSE FALSE
## 10 TRUE FALSE TRUE TRUE FALSE TRUE FALSE FALSE
## 11 TRUE FALSE TRUE FALSE TRUE TRUE FALSE TRUE
## 12 TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
## 13 TRUE TRUE TRUE TRUE FALSE TRUE FALSE TRUE
## S_INCOME E_INCOME S_POP E_POP SLOT.Controlled SLOT.Free GATE.Constrained
## 1 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 3 FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 4 FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## 5 FALSE FALSE FALSE FALSE FALSE TRUE FALSE
## 6 FALSE FALSE FALSE FALSE FALSE TRUE TRUE
## 7 FALSE FALSE TRUE TRUE FALSE FALSE TRUE
## 8 TRUE FALSE TRUE TRUE FALSE FALSE TRUE
## 9 TRUE FALSE TRUE TRUE FALSE TRUE TRUE
## 10 TRUE FALSE TRUE TRUE TRUE FALSE TRUE
## 11 TRUE FALSE TRUE TRUE TRUE FALSE TRUE
## 12 TRUE FALSE TRUE TRUE TRUE FALSE TRUE
## 13 TRUE TRUE TRUE TRUE TRUE FALSE TRUE
## GATE.Free DISTANCE PAX
## 1 FALSE TRUE FALSE
## 2 FALSE TRUE FALSE
## 3 FALSE TRUE FALSE
## 4 FALSE TRUE FALSE
## 5 TRUE TRUE FALSE
## 6 FALSE TRUE TRUE
## 7 FALSE TRUE TRUE
## 8 FALSE TRUE TRUE
## 9 FALSE TRUE TRUE
## 10 FALSE TRUE TRUE
## 11 FALSE TRUE TRUE
## 12 FALSE TRUE TRUE
## 13 FALSE TRUE TRUE
#show metrics
sum$rsq
## [1] 0.1583163 0.2452912 0.2760658 0.2847776 0.2916732 0.2986068 0.3038347
## [8] 0.3083478 0.3105391 0.3118862 0.3132056 0.3143343 0.3146393
sum$adjr2
## [1] 0.1565481 0.2421135 0.2714839 0.2787292 0.2841697 0.2896718 0.2934662
## [8] 0.2965499 0.2972802 0.2971514 0.2969938 0.2966397 0.2954373
sum$cp
## [1] 90.920723 34.545000 15.889744 12.042563 9.414349 6.760675 5.251829
## [8] 4.222668 4.751969 5.847800 6.962217 8.204709 10.000000
#select model
trainmodel2 <- lm(formula=FARE ~ VACATION.Yes + SW.No + S_INCOME + S_POP + E_POP + SLOT.Free + GATE.Constrained + DISTANCE + PAX, data = train)
smr3 <- summary(trainmodel2)
paste(paste("Multiple R-squared: ",round(smr3$r.squared,digits=6)),
paste("Adjusted R-squared: ",round(smr3$adj.r.squared,digits=6)))
## [1] "Multiple R-squared: 0.310539 Adjusted R-squared: 0.29728"
Using the exhaustive search function we ended up with a longer and slightly better performing formula that includes VACATION.Yes, SW.No, S_INCOME, S_POP, E_POP, SLOT.Free, GATE.Constrained, DISTANCE, and PAX.
Compare the predictive accuracy of both models (ii) and (iii) using measures such as RMSE and average error and lift charts.
tmp1 <- predict(trainmodel,test)
accuracy(tmp1,test$FARE)
## ME RMSE MAE MPE MAPE
## Test set -2.909937 24.52203 19.61812 -501.7952 526.5029
tmp2 <- predict(trainmodel2,test)
accuracy(tmp2,test$FARE)
## ME RMSE MAE MPE MAPE
## Test set -2.903888 24.26736 19.34281 -489.9047 513.8652
anova(trainmodel,trainmodel2)
## Analysis of Variance Table
##
## Model 1: FARE ~ VACATION.No + SW.No + S_INCOME + S_POP + E_POP + GATE.Constrained +
## DISTANCE + PAX
## Model 2: FARE ~ VACATION.Yes + SW.No + S_INCOME + S_POP + E_POP + SLOT.Free +
## GATE.Constrained + DISTANCE + PAX
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 469 275655
## 2 468 274782 1 873.3 1.4874 0.2232
par(mfrow=c(1,2))
plotLift(tmp1,labels = tmp1,n.buckets=5,main="ii Model")
plotLift(tmp2,labels = tmp2,n.buckets=5,main="iii Model")
Using model (iii), predict the average fare on a route with the following characteristics: COUPON = 1.202, NEW = 3, VACATION = No, SW = No, HI = 4442.141, S_INCOME = $28,760, E_INCOME = $27,664, S_POP = 4,557,004, E_POP = 3,195,503, SLOT = Free, GATE = Free, PAX = 12,782, DISTANCE = 1976 miles.
newdata1 <- data.frame(COUPON = 1.202, NEW = 3, VACATION.Yes = 0, SW.No = 1, HI = 4442.141, S_INCOME = 28760, E_INCOME = 27664, S_POP = 4557004, E_POP = 3195503, SLOT.Free = 1, GATE.Constrained = 0, PAX = 12782, DISTANCE = 1976)
predict(trainmodel2,newdata = newdata1)
## 1
## 52.91415
The average fare will be $52.91
Predict the reduction in average fare on the route in (v) if Southwest decides to cover this route [using model (iii)].
newdata2 <- data.frame(COUPON = 1.202, NEW = 3, VACATION.Yes = 0, SW.No = 0, HI = 4442.141, S_INCOME = 28760, E_INCOME = 27664, S_POP = 4557004, E_POP = 3195503, SLOT.Free = 1, GATE.Constrained = 0, PAX = 12782, DISTANCE = 1976)
predict(trainmodel2,newdata = newdata2)
## 1
## 39.78239
If Southwest decides to cover this route the average fare will drop to $39.78.
In reality, which of the factors will not be available for predicting the average fare from a new airport (i.e., before flights start operating on those routes)? Which ones can be estimated? How?
Since HI, PAX, and NEW are calculated based on the route, those would not be available prior to starting a new route. COUPON, SW, Incomes, and Populations can all be predicted with a high level of confidence since those factors will remain constant regardless of the proposed flight. VACATION, SLOT, and GATE can be predicted based on the proposed route and knowlege of the airport’s policies.
Select a model that includes only factors that are available before flights begin to operate on the new route. Use an exhaustive search to find such a model.
trainsearch2 <- regsubsets(FARE~ COUPON + VACATION.Yes + VACATION.No + SW.Yes + SW.No +
S_INCOME + E_INCOME + S_POP + E_POP + SLOT.Free +
SLOT.Controlled + GATE.Free + GATE.Constrained + DISTANCE,
data = train, nbest=1,nvmax=dim(train)[2],
method = "exhaustive")
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax, force.in =
## force.in, : 4 linear dependencies found
## Reordering variables and trying again:
sum2<-summary(trainsearch2)
#show models
sum2$which
## (Intercept) COUPON VACATION.Yes VACATION.No SW.Yes SW.No S_INCOME E_INCOME
## 1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 2 TRUE FALSE FALSE FALSE TRUE FALSE FALSE FALSE
## 3 TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE
## 4 TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE
## 5 TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
## 6 TRUE TRUE TRUE FALSE FALSE TRUE FALSE FALSE
## 7 TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE
## 8 TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE
## 9 TRUE TRUE TRUE FALSE TRUE FALSE TRUE FALSE
## 10 TRUE TRUE TRUE FALSE TRUE FALSE TRUE TRUE
## S_POP E_POP SLOT.Free SLOT.Controlled GATE.Free GATE.Constrained DISTANCE
## 1 FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## 2 FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## 3 FALSE FALSE FALSE FALSE FALSE FALSE TRUE
## 4 FALSE FALSE FALSE FALSE TRUE FALSE TRUE
## 5 FALSE FALSE FALSE TRUE TRUE FALSE TRUE
## 6 FALSE FALSE TRUE FALSE FALSE TRUE TRUE
## 7 FALSE FALSE FALSE TRUE FALSE TRUE TRUE
## 8 FALSE TRUE FALSE TRUE FALSE TRUE TRUE
## 9 TRUE TRUE FALSE TRUE FALSE TRUE TRUE
## 10 TRUE TRUE TRUE FALSE TRUE FALSE TRUE
#show metrics
sum2$rsq
## [1] 0.1583163 0.2452912 0.2760658 0.2847776 0.2916732 0.2941584 0.2973583
## [8] 0.2985959 0.2995023 0.2995026
sum2$adjr2
## [1] 0.1565481 0.2421135 0.2714839 0.2787292 0.2841697 0.2851668 0.2868934
## [8] 0.2866317 0.2860312 0.2845026
sum2$cp
## [1] 82.318297 26.831500 8.490775 4.732633 2.174896 2.532251 2.417275
## [8] 3.599272 5.000151 7.000000
#select model
trainmodel3 <- lm(formula=FARE ~ VACATION.Yes + SW.Yes + S_INCOME + E_POP + SLOT.Controlled + GATE.Constrained + DISTANCE, data = train)
smr4 <- summary(trainmodel3)
paste(paste("Multiple R-squared: ",round(smr4$r.squared,digits=6)),
paste("Adjusted R-squared: ",round(smr4$adj.r.squared,digits=6)))
## [1] "Multiple R-squared: 0.294716 Adjusted R-squared: 0.284212"
Use the model in (viii) to predict the average fare on a route with characteristics COUPON = 1.202, NEW = 3, VACATION = No, SW = No, HI = 4442.141, S_INCOME = $28,760, E_INCOME = $27,664, S_ POP = 4,557,004, E_POP = 3,195,503, SLOT = Free, GATE = Free, PAX = 12782, DISTANCE = 1976 miles.
newdata3 <- data.frame(COUPON = 1.202, NEW = 3, VACATION.Yes = 0, SW.Yes = 0, HI = 4442.141, S_INCOME = 28760, E_INCOME = 27664, S_POP = 4557004, E_POP = 3195503, SLOT.Controlled = 0, GATE.Constrained = 0, PAX = 12782, DISTANCE = 1976)
predict(trainmodel3,newdata = newdata3)
## 1
## 54.33273
The average fare will be $54.33
Compare the predictive accuracy of this model with model (iii). Is this model good enough, or is it worthwhile reevaluating the model once flights begin on the new route?
tmp2 <- predict(trainmodel2,test)
accuracy(tmp2,test$FARE)
## ME RMSE MAE MPE MAPE
## Test set -2.903888 24.26736 19.34281 -489.9047 513.8652
tmp3 <- predict(trainmodel3,test)
accuracy(tmp3,test$FARE)
## ME RMSE MAE MPE MAPE
## Test set -2.683554 24.4353 19.71071 -501.8301 526.8958
anova(trainmodel2,trainmodel3)
## Analysis of Variance Table
##
## Model 1: FARE ~ VACATION.Yes + SW.No + S_INCOME + S_POP + E_POP + SLOT.Free +
## GATE.Constrained + DISTANCE + PAX
## Model 2: FARE ~ VACATION.Yes + SW.Yes + S_INCOME + E_POP + SLOT.Controlled +
## GATE.Constrained + DISTANCE
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 468 274782
## 2 470 281088 -2 -6306.2 5.3703 0.004944 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
This model appears to be good enough and possibly slighly more accurate than the all inclusive model.