To demonstrate the choice methods, we begin with the Sydney Transportation study. Commuters can choose to go into city either by car or by train. The response in binary so we can choose the logistic regression model. In this study we know the time and cost of travel by car and by train. These are the explanatory varibales in this case. We use the linear combinations of four explanatory varibales(car time, car cost, Train time, train cost) to predict the consumer choice.
# Predicting Commuter Transportation Choices (R)
library(lattice) # multivariate data visualization
names(sydney) <- c("Car_Time", "Car_Cost", "Train_Time", "Train_Cost", "Choice")
plotting_data_frame <- sydney[, 1:4]
Scatter plot matrix with simple linear regression. Models and lowess smooth fits for variable pairs
pairs(plotting_data_frame,
panel = function(x, y) {
points(x, y, cex = 0.5)
abline(lm(y ~ x), lty = "solid", col = "red")
lines(lowess(x, y))
}
)
sydney_cormat <-
cor(sydney[, c("Car_Time", "Car_Cost", "Train_Time", "Train_Cost")])
correlation_heat_map(sydney_cormat)
# specify and fit logistic regression model
sydney_model <- {Choice ~ Car_Time + Car_Cost + Train_Time + Train_Cost}
sydney_fit <- glm(sydney_model, family=binomial, data=sydney)
print(summary(sydney_fit))
##
## Call:
## glm(formula = sydney_model, family = binomial, data = sydney)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1291 -0.6151 -0.1621 0.6166 2.8337
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.444027 0.584971 -2.469 0.0136 *
## Car_Time 0.056507 0.010911 5.179 2.23e-07 ***
## Car_Cost 0.029842 0.006968 4.283 1.84e-05 ***
## Train_Time 0.014918 0.009482 1.573 0.1156
## Train_Cost -0.111293 0.016521 -6.737 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 458.36 on 332 degrees of freedom
## Residual deviance: 272.63 on 328 degrees of freedom
## AIC: 282.63
##
## Number of Fisher Scoring iterations: 5
print(anova(sydney_fit, test="Chisq"))
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Choice
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 332 458.36
## Car_Time 1 90.402 331 367.96 < 2.2e-16 ***
## Car_Cost 1 22.822 330 345.14 1.778e-06 ***
## Train_Time 1 7.563 329 337.57 0.005956 **
## Train_Cost 1 64.938 328 272.63 7.727e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# compute predicted probability of taking the train
sydney$Predict_Prob_TRAIN <- predict.glm(sydney_fit, type = "response")
#pdf(file = "fig_predicting_choice_density_evaluation.pdf",
# width = 8.5, height = 8.5)
plotting_object <- densityplot( ~ Predict_Prob_TRAIN | Choice,
data = sydney,
layout = c(1,2), aspect=1, col = "darkblue",
plot.points = "rug",
strip=function(...) strip.default(..., style=1),
xlab="Predicted Probability of Taking Train")
print(plotting_object)
predicted car-or-train choice using 0.5 cut-off
sydney$Predict_Choice <- ifelse((sydney$Predict_Prob_TRAIN > 0.6), 2, 1)
sydney$Predict_Choice <- factor(sydney$Predict_Choice,
levels = c(1, 2), labels = c("CAR", "TRAIN"))
Creating confusion matrix
confusion_matrix <- table(sydney$Predict_Choice, sydney$Choice)
cat("\nConfusion Matrix (rows = Predicted Choice, columns = Actual Choice\n")
##
## Confusion Matrix (rows = Predicted Choice, columns = Actual Choice
print(confusion_matrix)
##
## CAR TRAIN
## CAR 163 37
## TRAIN 20 113
predictive_accuracy <- (confusion_matrix[1,1] + confusion_matrix[2,2])/
sum(confusion_matrix)
cat("\nPercent Accuracy: ", round(predictive_accuracy * 100, digits = 1))
##
## Percent Accuracy: 82.9
How much lower would train ticket prices have to be to increase public transportation usage (TRAIN) by 10 percent?
train_cost_vector <-
seq(min(sydney$Train_Cost), max(sydney$Train_Cost), length=1000)
beta.vector <- sydney_fit$coefficients
train_probability_vector <- numeric(1000)
for (i in 1:1000) {
X.vector <- c(1, mean(sydney$Car_Time), mean(sydney$Train_Time),
mean(sydney$Car_Cost), train_cost_vector[i])
train_probability_vector[i] <-
exp(X.vector %*% beta.vector)/
(1 + exp(X.vector %*% beta.vector))
}
Currently 150 out of 333 commuters (45 percent) use the train.
Determine price required for 55 percent of commuters to take the train this is the desired quota set by public administrators
index <- 1 # beginning index for search
while (train_probability_vector[index] > 0.60) index <- index + 1
Solution_Price <- train_cost_vector[index]
cat("\nSolution Price: ", Solution_Price)
##
## Solution Price: 32.04505
Current_Mean_Price <- mean(sydney$Train_Cost)
How much do administrators need to lower prices? use greatest integer function to ensure quota is exceeded
Cents_Lower <- ceiling(Current_Mean_Price - Solution_Price)
cat("\nLower prices by ", Cents_Lower, "cents\n")
##
## Lower prices by 7 cents
plot(train_cost_vector, train_probability_vector,
type="l",ylim=c(0,1.0), las = 1,
xlab="Cost of Taking the Train (in cents)",
ylab="Estimated Probability of Taking the Train")