# Predicting Commuter Transportation Choices (R)

library(lattice)  # multivariate data visualization

load("/Users/jyothi/Downloads/MDS_Chapter_2/correlation_heat_map.RData")  # from R utility programs

# read data from comma-delimited text file... create data frame object
sydney <- read.csv("/Users/jyothi/Downloads/MDS_Chapter_2/sydney.csv")
names(sydney) <- 
    c("Car_Time", "Car_Cost", "Train_Time", "Train_Cost", "Choice")
plotting_data_frame <- sydney[, 1:4]
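
# quick structural check of the renamed data frame (added as a sanity check,
# not part of the original listing)
str(sydney)          # variable types and a few example values
print(head(sydney))  # first few commuters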

# scatter plot matrix with simple linear regression
# models and lowess smooth fits for variable pairs
pdf(file = "fig_predicting_choice_scatter_plot_matrix.pdf", 
    width = 8.5, height = 8.5)
pairs(plotting_data_frame,
    panel = function(x, y) {
        points(x, y, cex = 0.5)
        abline(lm(y ~ x), lty = "solid", col = "red")
        lines(lowess(x, y))
        }
    )
dev.off()  
## quartz_off_screen 
##                 2
# correlation heat map for the explanatory variables
pdf(file = "fig_predicting_choice_correlation_heat_map.pdf",
    width = 8.5, height = 8.5)
sydney_cormat <- 
    cor(sydney[, c("Car_Time", "Car_Cost", "Train_Time", "Train_Cost")])
correlation_heat_map(sydney_cormat)
dev.off()
## quartz_off_screen 
##                 2
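# note: correlation_heat_map() is supplied by the utility file loaded at the
# top of this script; if that file is unavailable, a rough substitute based on
# lattice::levelplot (a sketch only, not the original utility) might look like
correlation_heat_map_sketch <- function(cormat) {
    print(levelplot(cormat, at = seq(-1, 1, length = 21),
        xlab = "", ylab = "",
        scales = list(x = list(rot = 45)),
        col.regions = colorRampPalette(c("blue", "white", "red"))(20)))
    }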
# specify and fit logistic regression model
sydney_model <- {Choice ~ Car_Time + Car_Cost + Train_Time + Train_Cost}
sydney_fit <- glm(sydney_model, family=binomial, data=sydney)
print(summary(sydney_fit))
## 
## Call:
## glm(formula = sydney_model, family = binomial, data = sydney)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1291  -0.6151  -0.1621   0.6166   2.8337  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.444027   0.584971  -2.469   0.0136 *  
## Car_Time     0.056507   0.010911   5.179 2.23e-07 ***
## Car_Cost     0.029842   0.006968   4.283 1.84e-05 ***
## Train_Time   0.014918   0.009482   1.573   0.1156    
## Train_Cost  -0.111293   0.016521  -6.737 1.62e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 458.36  on 332  degrees of freedom
## Residual deviance: 272.63  on 328  degrees of freedom
## AIC: 282.63
## 
## Number of Fisher Scoring iterations: 5
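# added interpretive check (not part of the original listing): express the
# coefficients as odds ratios and verify one fitted probability by hand
print(exp(coef(sydney_fit)))  # multiplicative change in odds of TRAIN per unit increase
x1 <- c(1, unlist(sydney[1, c("Car_Time", "Car_Cost", "Train_Time", "Train_Cost")]))
eta1 <- sum(x1 * coef(sydney_fit))          # linear predictor for the first commuter
print(exp(eta1) / (1 + exp(eta1)))          # logistic transform computed by hand
print(predict(sydney_fit, type = "response")[1])  # should agree with the line above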
print(anova(sydney_fit, test="Chisq"))
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Choice
## 
## Terms added sequentially (first to last)
## 
## 
##            Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                         332     458.36              
## Car_Time    1   90.402       331     367.96 < 2.2e-16 ***
## Car_Cost    1   22.822       330     345.14 1.778e-06 ***
## Train_Time  1    7.563       329     337.57  0.005956 ** 
## Train_Cost  1   64.938       328     272.63 7.727e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
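# the deviances reported above also yield McFadden's pseudo-R-squared, a rough
# single-number summary of fit (added check, not in the original listing);
# with the deviances shown it works out to about 0.41
pseudo_R2 <- 1 - (sydney_fit$deviance / sydney_fit$null.deviance)
cat("\nMcFadden pseudo-R-squared: ", round(pseudo_R2, digits = 3), "\n")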
# compute predicted probability of taking the train 
sydney$Predict_Prob_TRAIN <- predict.glm(sydney_fit, type = "response") 

pdf(file = "fig_predicting_choice_density_evaluation.pdf", 
    width = 8.5, height = 8.5)
plotting_object <- densityplot( ~ Predict_Prob_TRAIN | Choice, 
               data = sydney, 
               layout = c(1,2), aspect=1, col = "darkblue", 
               plot.points = "rug",
               strip=function(...) strip.default(..., style=1),
               xlab="Predicted Probability of Taking Train") 
print(plotting_object) 
dev.off()
## quartz_off_screen 
##                 2
# predicted car-or-train choice using 0.5 cut-off
sydney$Predict_Choice <- ifelse((sydney$Predict_Prob_TRAIN > 0.5), 2, 1)
sydney$Predict_Choice <- factor(sydney$Predict_Choice,
    levels = c(1, 2), labels = c("CAR", "TRAIN"))
    
confusion_matrix <- table(sydney$Predict_Choice, sydney$Choice)
cat("\nConfusion Matrix (rows = Predicted Choice, columns = Actual Choice\n")
## 
## Confusion Matrix (rows = Predicted Choice, columns = Actual Choice
print(confusion_matrix)
##        
##         CAR TRAIN
##   CAR   155    30
##   TRAIN  28   120
predictive_accuracy <- (confusion_matrix[1,1] + confusion_matrix[2,2])/
                        sum(confusion_matrix)                                              
cat("\nPercent Accuracy: ", round(predictive_accuracy * 100, digits = 1))
## 
## Percent Accuracy:  82.6
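# the 82.6 percent accuracy is best judged against the no-model baseline of
# always predicting the more common choice, about 55 percent here since 183 of
# the 333 commuters drive (added comparison, not part of the original listing)
baseline_accuracy <- max(table(sydney$Choice)) / nrow(sydney)
cat("\nBaseline (majority-class) accuracy: ",
    round(baseline_accuracy * 100, digits = 1), "\n")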
# How much lower would train ticket prices have to be to increase
# public transportation usage (TRAIN) by 10 percent?
train_cost_vector <- 
     seq(min(sydney$Train_Cost), max(sydney$Train_Cost), length=1000)
beta.vector <- sydney_fit$coefficients
train_probability_vector <- numeric(1000)
for (i in 1:1000) {
       # order must match coef(sydney_fit): intercept, Car_Time, Car_Cost,
       # Train_Time, Train_Cost
       X.vector <- c(1, mean(sydney$Car_Time), mean(sydney$Car_Cost),
           mean(sydney$Train_Time), train_cost_vector[i])
       train_probability_vector[i] <- 
           exp(X.vector %*% beta.vector)/
               (1 + exp(X.vector %*% beta.vector))
       } 
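
# the same curve can be traced without the explicit loop by handing predict()
# a data frame of hypothetical commuters who differ only in train cost
# (an equivalent alternative, added here for reference)
price_grid <- data.frame(Car_Time = mean(sydney$Car_Time),
    Car_Cost = mean(sydney$Car_Cost),
    Train_Time = mean(sydney$Train_Time),
    Train_Cost = train_cost_vector)
train_probability_check <- predict(sydney_fit, newdata = price_grid,
    type = "response")  # should match train_probability_vector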

# currently 150 out of 333 commuters (45 percent) use the train       
# determine price required for 55 percent of commuters to take the train 
# this is the desired quota set by public administrators
index <- 1  # beginning index for search
while (train_probability_vector[index] > 0.55) index <- index + 1
Solution_Price <- train_cost_vector[index]
cat("\nSolution Price: ", Solution_Price)
## 
## Solution Price:  33.84685
Current_Mean_Price <- mean(sydney$Train_Cost)

# how much do administrators need to lower prices?
# round up to the next whole cent (ceiling) to ensure the quota is exceeded
Cents_Lower <- ceiling(Current_Mean_Price - Solution_Price)
cat("\nLower prices by ", Cents_Lower, "cents\n")
## 
## Lower prices by  5 cents
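# added verification (not part of the original listing): push the solution
# price back through the fitted model to confirm that the estimated train
# share is at (or just under) the 55 percent quota
check_case <- data.frame(Car_Time = mean(sydney$Car_Time),
    Car_Cost = mean(sydney$Car_Cost),
    Train_Time = mean(sydney$Train_Time),
    Train_Cost = Solution_Price)
cat("\nEstimated train probability at the solution price: ",
    round(predict(sydney_fit, newdata = check_case, type = "response"),
        digits = 3), "\n")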
pdf(file = "fig_predicting_choice_ticket_price_solution.pdf", 
    width = 8.5, height = 8.5) 
plot(train_cost_vector, train_probability_vector,
     type="l",ylim=c(0,1.0), las = 1, 
     xlab="Cost of Taking the Train (in cents)",
     ylab="Estimated Probability of Taking the Train")

# plot current average train ticket price as vertical line     
abline(v = Current_Mean_Price, col = "red", lty = "solid", lwd = 2)    
abline(v = Solution_Price, col = "blue", lty = "dashed", lwd = 2)

legend("topright", legend = c("Current Mean Train Ticket Price", 
        paste("Solution Price (", Cents_Lower, " Cents Lower)", sep = "")), 
    col = c("red", "blue"), pch = c(NA, NA), lwd = c(2, 2),
    border = "black", lty = c("solid", "dashed"), cex = 1.25)
dev.off()       
## quartz_off_screen 
##                 2
# How much of a tax on car travel would public administrators have to impose
# in order to have an effect comparable to lowering train ticket prices?
car_cost_vector <- 
    seq(min(sydney$Car_Cost), max(sydney$Car_Cost), length=1000)
car_probability_vector <- numeric(1000)
for (i in 1:1000) {
       # order must match coef(sydney_fit): intercept, Car_Time, Car_Cost,
       # Train_Time, Train_Cost
       Y.vector <- c(1, mean(sydney$Car_Time), car_cost_vector[i],
           mean(sydney$Train_Time), mean(sydney$Train_Cost))
       car_probability_vector[i] <- 
           exp(Y.vector %*% beta.vector)/
               (1 + exp(Y.vector %*% beta.vector))
       }
#car_probability_vector
index <- 1  # beginning index for search
while (car_probability_vector[index] < 0.5) index <- index + 1
Solution_Price <- car_cost_vector[index]
cat("\nSolution Price: ", Solution_Price)
## 
## Solution Price:  77.11111
Current_Mean_Price <- mean(sydney$Car_Cost)
Current_Mean_Price
## [1] 57.75375
Cents_Higher <- ceiling(Solution_Price - Current_Mean_Price)
Cents_Higher
## [1] 20
cat("\nRaise car costs by ", Cents_Higher, "cents\n")
## 
## Raise car costs by  20 cents
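# a parallel check for the car-cost question (added verification, not part of
# the original listing): estimated train share when the average car cost is
# raised to the solution level
tax_case <- data.frame(Car_Time = mean(sydney$Car_Time),
    Car_Cost = Solution_Price,
    Train_Time = mean(sydney$Train_Time),
    Train_Cost = mean(sydney$Train_Cost))
cat("\nEstimated train probability at the taxed car cost: ",
    round(predict(sydney_fit, newdata = tax_case, type = "response"),
        digits = 3), "\n")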