# Predicting Commuter Transportation Choices (R)
library(lattice) # multivariate data visualization
load("/Users/jyothi/Downloads/MDS_Chapter_2/correlation_heat_map.RData") # from R utility programs
# read data from comma-delimited text file... create data frame object
sydney <- read.csv("/Users/jyothi/Downloads/MDS_Chapter_2/sydney.csv")
names(sydney) <-
c("Car_Time", "Car_Cost", "Train_Time", "Train_Cost", "Choice")
plotting_data_frame <- sydney[, 1:4]
# scatter plot matrix with simple linear regression
# models and lowess smooth fits for variable pairs
pdf(file = "fig_predicting_choice_scatter_plot_matrix.pdf",
width = 8.5, height = 8.5)
pairs(plotting_data_frame,
panel = function(x, y) {
points(x, y, cex = 0.5)
abline(lm(y ~ x), lty = "solid", col = "red")
lines(lowess(x, y))
}
)
dev.off()
## quartz_off_screen
## 2
# correlation heat map for the explanatory variables
pdf(file = "fig_predicting_choice_correlation_heat_map.pdf",
width = 8.5, height = 8.5)
sydney_cormat <-
cor(sydney[, c("Car_Time", "Car_Cost", "Train_Time", "Train_Cost")])
correlation_heat_map(sydney_cormat)
dev.off()
## quartz_off_screen
## 2
# specify and fit logistic regression model
sydney_model <- {Choice ~ Car_Time + Car_Cost + Train_Time + Train_Cost}
sydney_fit <- glm(sydney_model, family=binomial, data=sydney)
print(summary(sydney_fit))
##
## Call:
## glm(formula = sydney_model, family = binomial, data = sydney)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1291 -0.6151 -0.1621 0.6166 2.8337
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.444027 0.584971 -2.469 0.0136 *
## Car_Time 0.056507 0.010911 5.179 2.23e-07 ***
## Car_Cost 0.029842 0.006968 4.283 1.84e-05 ***
## Train_Time 0.014918 0.009482 1.573 0.1156
## Train_Cost -0.111293 0.016521 -6.737 1.62e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 458.36 on 332 degrees of freedom
## Residual deviance: 272.63 on 328 degrees of freedom
## AIC: 282.63
##
## Number of Fisher Scoring iterations: 5
print(anova(sydney_fit, test="Chisq"))
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Choice
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 332 458.36
## Car_Time 1 90.402 331 367.96 < 2.2e-16 ***
## Car_Cost 1 22.822 330 345.14 1.778e-06 ***
## Train_Time 1 7.563 329 337.57 0.005956 **
## Train_Cost 1 64.938 328 272.63 7.727e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# compute predicted probability of taking the train
sydney$Predict_Prob_TRAIN <- predict.glm(sydney_fit, type = "response")
pdf(file = "fig_predicting_choice_density_evaluation.pdf",
width = 8.5, height = 8.5)
plotting_object <- densityplot( ~ Predict_Prob_TRAIN | Choice,
data = sydney,
layout = c(1,2), aspect=1, col = "darkblue",
plot.points = "rug",
strip=function(...) strip.default(..., style=1),
xlab="Predicted Probability of Taking Train")
print(plotting_object)
dev.off()
## quartz_off_screen
## 2
# predicted car-or-train choice using 0.5 cut-off
sydney$Predict_Choice <- ifelse((sydney$Predict_Prob_TRAIN > 0.5), 2, 1)
sydney$Predict_Choice <- factor(sydney$Predict_Choice,
levels = c(1, 2), labels = c("CAR", "TRAIN"))
confusion_matrix <- table(sydney$Predict_Choice, sydney$Choice)
cat("\nConfusion Matrix (rows = Predicted Choice, columns = Actual Choice\n")
##
## Confusion Matrix (rows = Predicted Choice, columns = Actual Choice
print(confusion_matrix)
##
## CAR TRAIN
## CAR 155 30
## TRAIN 28 120
predictive_accuracy <- (confusion_matrix[1,1] + confusion_matrix[2,2])/
sum(confusion_matrix)
cat("\nPercent Accuracy: ", round(predictive_accuracy * 100, digits = 1))
##
## Percent Accuracy: 82.6
# How much lower would train ticket prices have to be to increase
# public transportation usage (TRAIN) by 10 percent?
train_cost_vector <-
seq(min(sydney$Train_Cost), max(sydney$Train_Cost), length=1000)
beta.vector <- sydney_fit$coefficients
train_probability_vector <- numeric(1000)
for (i in 1:1000) {
X.vector <- c(1, mean(sydney$Car_Time), mean(sydney$Train_Time),
mean(sydney$Car_Cost), train_cost_vector[i])
train_probability_vector[i] <-
exp(X.vector %*% beta.vector)/
(1 + exp(X.vector %*% beta.vector))
}
# currently 150 out of 333 commuters (45 percent) use the train
# determine price required for 55 percent of commuters to take the train
# this is the desired quota set by public administrators
index <- 1 # beginning index for search
while (train_probability_vector[index] > 0.55) index <- index + 1
Solution_Price <- train_cost_vector[index]
cat("\nSolution Price: ", Solution_Price)
##
## Solution Price: 33.84685
Current_Mean_Price <- mean(sydney$Train_Cost)
# how much do administrators need to lower prices?
# use greatest integer function to ensure quota is exceeded
Cents_Lower <- ceiling(Current_Mean_Price - Solution_Price)
cat("\nLower prices by ", Cents_Lower, "cents\n")
##
## Lower prices by 5 cents
pdf(file = "fig_predicting_choice_ticket_price_solution.pdf",
width = 8.5, height = 8.5)
plot(train_cost_vector, train_probability_vector,
type="l",ylim=c(0,1.0), las = 1,
xlab="Cost of Taking the Train (in cents)",
ylab="Estimated Probability of Taking the Train")
# plot current average train ticket price as vertical line
abline(v = Current_Mean_Price, col = "red", lty = "solid", lwd = 2)
abline(v = Solution_Price, col = "blue", lty = "dashed", lwd = 2)
legend("topright", legend = c("Current Mean Train Ticket Price",
paste("Solution Price (", Cents_Lower, " Cents Lower)", sep = "")),
col = c("red", "blue"), pch = c(NA, NA), lwd = c(2, 2),
border = "black", lty = c("solid", "dashed"), cex = 1.25)
dev.off()
## quartz_off_screen
## 2
# of a tax would public administrators have to impose in order to have
# a comparable effect to train ticket prices?
# Evaluate the logistic regression model in terms of its out-of-sample
Car_cost_vector <- seq(min(sydney$Car_Cost), max(sydney$Car_Cost), length=1000)
car_probability_vector <- numeric(1000)
for (i in 1:1000) {
Y.vector <- c(1, mean(sydney$Car_Time), mean(sydney$Train_Time),
Car_cost_vector[i], mean(sydney$Train_Cost))
car_probability_vector[i] <-
exp(Y.vector %*% beta.vector)/
(1 + exp(Y.vector %*% beta.vector))
}
#car_probability_vector
index <- 1 # beginning index for search
while (car_probability_vector[index] < 0.5) index <- index + 1
Solution_Price <- Car_cost_vector[index]
cat("\nSolution Price: ", Solution_Price)
##
## Solution Price: 77.11111
Current_Mean_Price <- mean(sydney$Car_Cost)
Current_Mean_Price
## [1] 57.75375
Cents_Lower <- ceiling(Solution_Price- Current_Mean_Price)
Cents_Lower
## [1] 20
cat("\nLower prices by ", Cents_Lower, "cents\n")
##
## Lower prices by 20 cents