# code chunk specifies whether the R code, warnings, and output 
# will be included in the output files.
library(knitr)
knitr::opts_chunk$set(echo = TRUE,      # include code chunk in the output file
                      warnings = FALSE, # sometimes, you code may produce warning messages,
                                        # you can choose to include the warning messages in
                                        # the output file. 
                      results = TRUE    # you can also decide whether to include the output
                                        # in the output file.
                      )  
airfare = read.table("https://users.stat.ufl.edu/~winner/data/airq402.dat")

##renaming the variables to include the dataset's context:
colnames(airfare)[1] <- "City1"
colnames(airfare)[2] <- "City2"
colnames(airfare)[3] <- "averageFare"
colnames(airfare)[4] <- "distance"
colnames(airfare)[5] <- "averageWeeklyPassengers"
colnames(airfare)[6] <- "marketLeadingAirline"
colnames(airfare)[7] <- "marketShare"
colnames(airfare)[8] <- "averageFare2"
colnames(airfare)[9] <- "lowPriceAirline"
colnames(airfare)[10] <- "marketShare2"
colnames(airfare)[11] <- "price"



Introduction & Description of the Data

This data set is called airq402.dat. It describes the airfares and passengers for U.S. Domestic Routes for 4th Quarter of 2002. That is, in includes information concerning airfare pricing, passenger volume, market share, and distances flown. The data was sourced by the United States Department of Transportation, and presumably comes from a random sample of size n=1,000 of the total number of flights that occured in the fourth quarter of 2002, from the first of October to the 31st of December. The variables and respective descriptions are as follows:

As for practical and analytical questions, I am curious to know which cities had the highest volume of people travelling to and from them, especially considering these data were collected during the fourth fiscal quarter of the year, during the holiday season. I would also like to know why some variables are repeated with different observation values.

The data set does have enough information for me to answer my main statistical inquiry of the relationship between distance and average fare. In the dataset there are elevel variables, and 1,000 total observations. It is stipulated in the guidelines that there are to be at least 15 observations for each variable.


Practical Question

Our objective and primary question is to determine the relationship between airplane ticket price and relevant predictor variables in the dataset.



Exploratory Data Analysis

As shown in last week’s assignment, a pairwise scatterplot of the data show that multiple variables seem highly correlated, and may in fact be repetitive given the nature of the names they were given. Examples include marketShare and marketShare2, which in the original data, we both simply named “market share”.

airfareNumVars <- c(3, 4, 5, 7, 8, 10, 11)
airfareNumeric <- airfare[airfareNumVars]

pairs(airfareNumeric, main ="Pair-wise Association: Scatter Plot")

Because of the extreme correlation between variables like marketShare and marketShare2, I will remove the duplicate variables from the dataset, and create a subset called airfaireMult with only the variables that I am interested in.

airfareMult <- airfare[c(3, 4, 5, 7, 11)]
kable(head(airfareMult))
averageFare distance averageWeeklyPassengers marketShare price
114.47 528 424.56 70.19 111.03
122.47 860 276.84 75.10 118.94
214.42 852 215.76 78.89 167.12
69.40 288 606.84 96.97 68.86
158.13 723 313.04 39.79 145.42
135.17 1204 199.02 40.68 127.69

Multiple Linear Regression Model & Diagnostics

full.model = lm(price ~ ., data = airfareMult)
kable(summary(full.model)$coef, caption ="Statistics of Regression Coefficients")
Statistics of Regression Coefficients
Estimate Std. Error t value Pr(>|t|)
(Intercept) 41.4524702 4.1537034 9.979641 0.0000000
averageFare 0.6873902 0.0163178 42.125196 0.0000000
distance 0.0042585 0.0016127 2.640632 0.0084048
averageWeeklyPassengers -0.0025610 0.0009608 -2.665499 0.0078119
marketShare -0.2218813 0.0448108 -4.951510 0.0000009

Next I will conduct residual diagnostic tests:

par(mfrow=c(2,2))
plot(full.model)

We can see from the residual plots that there are some violations:

We first perform Box-Cox transformation to correct the non-constant variance and correct the non-normality of the QQ plot.

library(MASS)
par(pty = "s", mfrow = c(2, 2), oma=c(.1,.1,.1,.1), mar=c(4, 0, 2, 0))
##
boxcox(price ~ averageWeeklyPassengers + averageFare + distance +  log(marketShare)  
       , data = airfareMult, lambda = seq(0, 1, length = 10), 
       xlab=expression(paste(lambda, ": log dist2MRT")))
##
boxcox(price ~ averageWeeklyPassengers + averageFare + distance +  marketShare
       , data = airfareMult, lambda = seq(-0.5, 1, length = 10), 
       xlab=expression(paste(lambda, ": dist2MRT")))
##
boxcox(price ~ log(1+averageWeeklyPassengers) + averageFare + distance +  marketShare 
      , data = airfareMult, lambda = seq(-0.5, 1, length = 10), 
       xlab=expression(paste(lambda, ": log-age")))
##
boxcox(price ~ log(1+averageWeeklyPassengers) + averageFare + distance +  log(marketShare)
       , data = airfareMult, lambda = seq(-0.5, 1, length = 10), 
      xlab=expression(paste(lambda, ": log-age, log.dist2MRT")))

sqrt.price.log.share = lm((price)^0.5 ~ averageWeeklyPassengers + averageFare + distance +  log(marketShare)  , data = airfareMult)
kable(summary(sqrt.price.log.share)$coef, caption = "log-transformed model")
log-transformed model
Estimate Std. Error t value Pr(>|t|)
(Intercept) 9.0478876 0.4356211 20.770087 0.0000000
averageWeeklyPassengers -0.0001440 0.0000381 -3.777641 0.0001677
averageFare 0.0281128 0.0006526 43.078822 0.0000000
distance 0.0002236 0.0000655 3.413054 0.0006683
log(marketShare) -0.4881688 0.0993120 -4.915508 0.0000010
par(mfrow = c(2,2))
plot(sqrt.price.log.share)

log.price = lm(log(price) ~ averageWeeklyPassengers + averageFare + distance +  marketShare, data = airfareMult)
kable(summary(log.price)$coef, caption = "log-transformed model")
log-transformed model
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.2582865 0.0281403 151.323388 0.00e+00
averageWeeklyPassengers -0.0000323 0.0000065 -4.968216 8.00e-07
averageFare 0.0047022 0.0001105 42.534880 0.00e+00
distance 0.0000362 0.0000109 3.316360 9.45e-04
marketShare -0.0022034 0.0003036 -7.258122 0.00e+00
par(mfrow = c(2,2))
plot(log.price)

#define plotting area
par(pty = "s", mfrow = c(1, 3))
#Q-Q plot for original model
qqnorm(full.model$residuals, main = "Full-Model")
qqline(full.model$residuals)
#Q-Q plot for Box-Cox transformed model
qqnorm(log.price$residuals, main = "Log-Price")
qqline(log.price$residuals)
#display both Q-Q plots
qqnorm(sqrt.price.log.share$residuals, main = "sqrt price log dist")
qqline(sqrt.price.log.share$residuals)

Goodness of Fit Measures

select=function(m){ # m is an object: model
 e = m$resid                           # residuals
 n0 = length(e)                        # sample size
 SSE=(m$df)*(summary(m)$sigma)^2       # sum of squared error
 R.sq=summary(m)$r.squared             # Coefficient of determination: R square!
 R.adj=summary(m)$adj.r                # Adjusted R square
 MSE=(summary(m)$sigma)^2              # square error
 Cp=(SSE/MSE)-(n0-2*(n0-m$df))         # Mellow's p
 AIC=n0*log(SSE)-n0*log(n0)+2*(n0-m$df)          # Akaike information criterion
 SBC=n0*log(SSE)-n0*log(n0)+(log(n0))*(n0-m$df)  # Schwarz Bayesian Information criterion
 X=model.matrix(m)                     # design matrix of the model
 H=X%*%solve(t(X)%*%X)%*%t(X)          # hat matrix
 d=e/(1-diag(H))                       
 PRESS=t(d)%*%d   # predicted residual error sum of squares (PRESS)- a cross-validation measure
 tbl = as.data.frame(cbind(SSE=SSE, R.sq=R.sq, R.adj = R.adj, Cp = Cp, AIC = AIC, SBC = SBC, PRD = PRESS))
 names(tbl)=c("SSE", "R.sq", "R.adj", "Cp", "AIC", "SBC", "PRESS")
 tbl
 }

output.sum = rbind(select(full.model), select(sqrt.price.log.share), select(log.price))
row.names(output.sum) = c("full.model", "sqrt.price.log.dist", "log.price")
kable(output.sum, caption = "Goodness-of-fit Measures of Candidate Models")
Goodness-of-fit Measures of Candidate Models
SSE R.sq R.adj Cp AIC SBC PRESS
full.model 521669.29147 0.7658960 0.7649548 5 6267.034 6291.5726 529107.91138
sqrt.price.log.dist 822.47761 0.7805446 0.7796624 5 -185.434 -160.8952 833.20936
log.price 23.94319 0.7815141 0.7806357 5 -3722.071 -3697.5327 24.25261

We see that the Goodness of Fit measures of the third model are superior, thus we choose to use the third model as our final model.

kable(summary(log.price)$coef, caption = "Inferential Statistics of Final Model")
Inferential Statistics of Final Model
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.2582865 0.0281403 151.323388 0.00e+00
averageWeeklyPassengers -0.0000323 0.0000065 -4.968216 8.00e-07
averageFare 0.0047022 0.0001105 42.534880 0.00e+00
distance 0.0000362 0.0000109 3.316360 9.45e-04
marketShare -0.0022034 0.0003036 -7.258122 0.00e+00
cmtrx <- summary(log.price)$coef
kable(cmtrx, caption = "Inferential Statistics of Final Model")
Inferential Statistics of Final Model
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.2582865 0.0281403 151.323388 0.00e+00
averageWeeklyPassengers -0.0000323 0.0000065 -4.968216 8.00e-07
averageFare 0.0047022 0.0001105 42.534880 0.00e+00
distance 0.0000362 0.0000109 3.316360 9.45e-04
marketShare -0.0022034 0.0003036 -7.258122 0.00e+00

Summary of the Model

\[ \log(price) =4.2583 - 0.0000323\times averageWeeklyPassengers +0.0047022\times averageFare + 0.0000362\times distance - 0.0022034\times marketShare \]

averageWeeklyPassengers and marketShare are both negatively associated with price, while distance and averageFare are positively associated with price.

Discussion

We used regression techniques such as box-cox transformation for the response, and other transformations of the independent variables to identify a suitable final model for this dataset. Due to the nature of some of the variables from the initial data set, I performed variable selection to remove what appeared to be repetitive variables from the data set.

I used common global goodness-of-fit tests for model selection.

I had difficulty addressing and interpreting the true meaning of the final regression coefficients because of the nature of the transformations used.

The violation of the assumption of normality persists.

Bootstrapping Cases

Here, we use bootstrapping to discern the confidence intervals of the regression coefficients from the final model.

##
B = 1000       # choose the number of bootstrap replicates.
## 
num.p = dim(model.frame(log.price))[2]  # returns number of parameters in the model
smpl.n = dim(model.frame(log.price))[1] # sample size
## zero matrix to store bootstrap coefficients 
coef.mtrx = matrix(rep(0, B*num.p), ncol = num.p)       
## 
for (i in 1:B){
  bootc.id = sample(1:smpl.n, smpl.n, replace = TRUE) # fit final model to the bootstrap sample
  log.price.btc = lm(log(price) ~ averageWeeklyPassengers + averageFare + distance +  marketShare, data = airfareMult[bootc.id,])     
  coef.mtrx[i,] = coef(log.price.btc)    # extract coefs from bootstrap regression model    
}

Here we create histograms of the bootstrap estimates of regression coefficients.

boot.hist = function(cmtrx, bt.coef.mtrx, var.id, var.nm){
  ## bt.coef.mtrx = matrix for storing bootstrap estimates of coefficients
  ## var.id = variable ID (1, 2, ..., k+1)
  ## var.nm = variable name on the hist title, must be the string in the double quotes
  ## coefficient matrix of the final model
  ## Bootstrap sampling distribution of the estimated coefficients
  x1.1 <- seq(min(bt.coef.mtrx[,var.id]), max(bt.coef.mtrx[,var.id]), length=300 )
  y1.1 <- dnorm(x1.1, mean(bt.coef.mtrx[,var.id]), sd(bt.coef.mtrx[,var.id]))
  # height of the histogram - use it to make a nice-looking histogram.
  highestbar = max(hist(bt.coef.mtrx[,var.id], plot = FALSE)$density) 
  ylimit <- max(c(y1.1,highestbar))
  hist(bt.coef.mtrx[,var.id], probability = TRUE, main = var.nm, xlab="", 
       col = "azure1",ylim=c(0,ylimit), border="lightseagreen")
  lines(x = x1.1, y = y1.1, col = "red3")
  lines(density(bt.coef.mtrx[,var.id], adjust=2), col="blue") 
  #legend("topright", c(""))
}


par(mfrow=c(2,3))  # histograms of bootstrap coefs
boot.hist(bt.coef.mtrx=coef.mtrx, var.id=1, var.nm ="Intercept" )
boot.hist(bt.coef.mtrx=coef.mtrx, var.id=2, var.nm ="Average Weekly Passengers" )
boot.hist(bt.coef.mtrx=coef.mtrx, var.id=3, var.nm ="Average Fare" )
boot.hist(bt.coef.mtrx=coef.mtrx, var.id=4, var.nm ="Distance" )
boot.hist(bt.coef.mtrx=coef.mtrx, var.id=5, var.nm ="Market Share" )

The red density curve uses the estimated regression coefficients and their corresponding standard error in the output of the regression procedure. The p-values reported in the output are based on the red curve.

The blue curve is a non-parametric data-driven estimate of the density of bootstrap sampling distribution. The bootstrap confidence intervals of the regressions are based on these non-parametric bootstrap sampling distributions.

Evidently, the curves in each histogram are quite close to each other, which hints at a consistency between the significance test results and the corresponding bootstrap intervals.

Next, I’ll find 95% bootstrap confidence intervals of each regression coefficient.

num.p = dim(coef.mtrx)[2]  # number of parameters
btc.ci = NULL
btc.wd = NULL
for (i in 1:num.p){
  lci.025 = round(quantile(coef.mtrx[, i], 0.025, type = 2),8)
  uci.975 = round(quantile(coef.mtrx[, i],0.975, type = 2 ),8)
  btc.wd[i] =  uci.975 - lci.025
  btc.ci[i] = paste("[", round(lci.025,4),", ", round(uci.975,4),"]")
 }
#as.data.frame(btc.ci)
kable(as.data.frame(cbind(formatC(cmtrx,4,format="f"), btc.ci.95=btc.ci)), 
      caption = "Regression Coefficient Matrix")
Regression Coefficient Matrix
Estimate Std. Error t value Pr(>|t|) btc.ci.95
(Intercept) 4.2583 0.0281 151.3234 0.0000 [ 4.1891 , 4.3255 ]
averageWeeklyPassengers -0.0000 0.0000 -4.9682 0.0000 [ 0 , 0 ]
averageFare 0.0047 0.0001 42.5349 0.0000 [ 0.0044 , 0.005 ]
distance 0.0000 0.0000 3.3164 0.0009 [ 0 , 1e-04 ]
marketShare -0.0022 0.0003 -7.2581 0.0000 [ -0.0028 , -0.0016 ]

The results are decently consistent.

Bootstrap Residuals

hist(sort(log.price$residuals),n=40,
     xlab="Residuals",
     col = "lightblue",
     border="navy",
     main = "Histogram of Residuals")

This histogram of the residuals shows one outlier and a leftward skew.

I will now generate bootstrap confidence intervals of regression coefficients, and plot them in histograms.

## Final model
#log.price <- lm(log(PriceUnitArea) ~ HouseAge + NumConvenStores + sale.year +  
#                 Dist2MRT.kilo  + geo.group, data = final.data)
model.resid = log.price$residuals
##
B=1000
num.p = dim(model.matrix(log.price))[2]   # number of parameters
samp.n = dim(model.matrix(log.price))[1]  # sample size
btr.mtrx = matrix(rep(0,6*B), ncol=num.p) # zero matrix to store boot coefs
for (i in 1:B){
  ## Bootstrap response values
  bt.lg.price = log.price$fitted.values + 
        sample(log.price$residuals, samp.n, replace = TRUE)  # bootstrap residuals
  # replace PriceUnitArea with bootstrap log price
  airfareMult$bt.lg.price =  bt.lg.price   #  send the boot response to the data
  btr.model = lm(bt.lg.price ~ averageWeeklyPassengers + averageFare + distance +  
                 marketShare, data = airfareMult)   # b
  btr.mtrx[i,]=btr.model$coefficients
}

boot.hist = function(bt.coef.mtrx, var.id, var.nm){
  ## bt.coef.mtrx = matrix for storing bootstrap estimates of coefficients
  ## var.id = variable ID (1, 2, ..., k+1)
  ## var.nm = variable name on the hist title, must be the string in the double quotes
  ## Bootstrap sampling distribution of the estimated coefficients
  x1.1 <- seq(min(bt.coef.mtrx[,var.id]), max(bt.coef.mtrx[,var.id]), length=300 )
  y1.1 <- dnorm(x1.1, mean(bt.coef.mtrx[,var.id]), sd(bt.coef.mtrx[,var.id]))
  # height of the histogram - use it to make a nice-looking histogram.
  highestbar = max(hist(bt.coef.mtrx[,var.id], plot = FALSE)$density) 
  ylimit <- max(c(y1.1,highestbar))
  hist(bt.coef.mtrx[,var.id], probability = TRUE, main = var.nm, xlab="", 
       col = "azure1",ylim=c(0,ylimit), border="lightseagreen")
  lines(x = x1.1, y = y1.1, col = "red3")       # normal density curve         
  lines(density(bt.coef.mtrx[,var.id], adjust=2), col="blue")    # loess curve
} 
par(mfrow=c(2,3))  # histograms of bootstrap coefs
boot.hist(bt.coef.mtrx=btr.mtrx, var.id=1, var.nm ="Intercept" )
boot.hist(bt.coef.mtrx=btr.mtrx, var.id=2, var.nm ="Average Weekly Passengers" )
boot.hist(bt.coef.mtrx=btr.mtrx, var.id=3, var.nm ="Average Fare" )
boot.hist(bt.coef.mtrx=btr.mtrx, var.id=4, var.nm ="Distance" )
boot.hist(bt.coef.mtrx=btr.mtrx, var.id=5, var.nm ="Market Share" )

The fact that the red and blue curves are not close to each other here suggests that the inference of the significance of variables based on p-values and residual bootstrap will not yield the same results.

#
num.p = dim(coef.mtrx)[2]  # number of parameters
btr.ci = NULL
btr.wd = NULL
for (i in 1:num.p){
  lci.025 = round(quantile(btr.mtrx[, i], 0.025, type = 2),8)
  uci.975 = round(quantile(btr.mtrx[, i],0.975, type = 2 ),8)
  btr.wd[i] = uci.975 - lci.025
  btr.ci[i] = paste("[", round(lci.025,4),", ", round(uci.975,4),"]")
}
#as.data.frame(btc.ci)
kable(as.data.frame(cbind(formatC(cmtrx,4,format="f"), btr.ci.95=btr.ci)), 
      caption = "Regression Coefficient Matrix with 95% Residual Bootstrap CI")
Regression Coefficient Matrix with 95% Residual Bootstrap CI
Estimate Std. Error t value Pr(>|t|) btr.ci.95
(Intercept) 4.2583 0.0281 151.3234 0.0000 [ 0 , 4.3076 ]
averageWeeklyPassengers -0.0000 0.0000 -4.9682 0.0000 [ 0 , 0 ]
averageFare 0.0047 0.0001 42.5349 0.0000 [ 0 , 0.0049 ]
distance 0.0000 0.0000 3.3164 0.0009 [ 0 , 1e-04 ]
marketShare -0.0022 0.0003 -7.2581 0.0000 [ -0.0027 , 0 ]
kable(as.data.frame(cbind(formatC(cmtrx[,-3],4,format="f"), btc.ci.95=btc.ci,btr.ci.95=btr.ci)), 
      caption="Final Combined Inferential Statistics: p-values and Bootstrap CIs")
Final Combined Inferential Statistics: p-values and Bootstrap CIs
Estimate Std. Error Pr(>|t|) btc.ci.95 btr.ci.95
(Intercept) 4.2583 0.0281 0.0000 [ 4.1891 , 4.3255 ] [ 0 , 4.3076 ]
averageWeeklyPassengers -0.0000 0.0000 0.0000 [ 0 , 0 ] [ 0 , 0 ]
averageFare 0.0047 0.0001 0.0000 [ 0.0044 , 0.005 ] [ 0 , 0.0049 ]
distance 0.0000 0.0000 0.0009 [ 0 , 1e-04 ] [ 0 , 1e-04 ]
marketShare -0.0022 0.0003 0.0000 [ -0.0028 , -0.0016 ] [ -0.0027 , 0 ]

As suspected, the results of the bootstrap method and the normal method are not consistent with each other. This suggests that there may still be something wrong with the data in the vein of a serious violation of assumptions and conditions.

kable(cbind(btc.wd, btr.wd), caption="width of the two bootstrap confidence intervals")
width of the two bootstrap confidence intervals
btc.wd btr.wd
0.1364182 4.3076127
0.0000250 0.0000454
0.0006639 0.0049181
0.0000437 0.0000552
0.0012179 0.0027216