library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.3.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.2
library(AmesHousing)
## Warning: package 'AmesHousing' was built under R version 4.3.3
library(boot)
library(broom)
library(lindia)
## Warning: package 'lindia' was built under R version 4.3.3
library(dplyr)
project_data <- read.csv("online_shoppers_intention.csv")
First, I’ll apply all of the functions and changes I made to the dataset last week that got my data into its best form.
# Create a Pre_Christmas indicator: 1 for Oct, Nov, and Dec; 0 otherwise
project_data <- project_data |>
mutate(Pre_Christmas = if_else(Month %in% c("Oct", "Nov", "Dec"), 1, 0))
# Remove specified columns
project_data <- project_data |>
dplyr::select(-TrafficType, -OperatingSystems, -SpecialDay, -Region)
# Convert logical columns to numeric for correlation calculation
project_data$Revenue <- as.numeric(project_data$Revenue)
project_data$Weekend <- as.numeric(project_data$Weekend)
project_data$VisitorType <- as.numeric(factor(project_data$VisitorType, levels = unique(project_data$VisitorType)))
# Create total_duration and total_pages variables
project_data <- project_data |>
mutate(
total_duration = Administrative_Duration + Informational_Duration + ProductRelated_Duration,
total_pages = Administrative + Informational + ProductRelated
) |>
# Remove the original columns to reduce multicollinearity
dplyr::select(-Administrative_Duration, -Informational_Duration, -ProductRelated_Duration,
-Administrative, -Informational, -ProductRelated)
# Remove the browser dummy variables
project_data_cleaned <- project_data |>
dplyr::select(-starts_with("Browser_"), -Browser) # Adjust based on the dummy variable names
# Create a new dataset excluding BounceRates and Weekend
project_data_more_clean <- project_data_cleaned |>
dplyr::select(-BounceRates, -Weekend, -total_duration)
Now, I’ll refit the final weighted model we arrived at by the end of last week.
# Ensure that 'Revenue' is treated as a binary factor
project_data_more_clean$Revenue <- as.factor(project_data_more_clean$Revenue)
# First, Calculate Weights for Each Class
# Calculate the frequency of each class
class_counts <- table(project_data$Revenue)
total_count <- nrow(project_data)
# Calculate weights for each class
weight_0 <- total_count / (2 * class_counts[1]) # Weight for class 0 (non-revenue)
weight_1 <- total_count / (2 * class_counts[2]) # Weight for class 1 (revenue)
# Assign weights based on the target variable
project_data_more_clean$weights <- ifelse(project_data$Revenue == 0, weight_0, weight_1)
# Run the Weighted Logistic Regression Model
weighted_model <- glm(
formula = Revenue ~ ExitRates + PageValues + VisitorType + Pre_Christmas + total_pages,
family = binomial,
data = project_data_more_clean,
weights = weights
)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# View Model Summary
summary(weighted_model)
##
## Call:
## glm(formula = Revenue ~ ExitRates + PageValues + VisitorType +
## Pre_Christmas + total_pages, family = binomial, data = project_data_more_clean,
## weights = weights)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.493e+00 1.010e-01 -14.782 < 2e-16 ***
## ExitRates -1.407e+01 1.042e+00 -13.495 < 2e-16 ***
## PageValues 1.192e-01 3.033e-03 39.298 < 2e-16 ***
## VisitorType 2.321e-01 6.409e-02 3.622 0.000293 ***
## Pre_Christmas 7.909e-01 4.892e-02 16.168 < 2e-16 ***
## total_pages 6.958e-03 5.139e-04 13.539 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17093 on 12329 degrees of freedom
## Residual deviance: 10834 on 12324 degrees of freedom
## AIC: 14355
##
## Number of Fisher Scoring iterations: 8
# Evaluate Model
# Predict probabilities
predicted_prob <- predict(weighted_model, type = "response")
# Classify based on a 0.5 threshold
predicted_class <- ifelse(predicted_prob > 0.5, 1, 0)
# Generate Confusion Matrix and Evaluation Metrics
# Use caret for evaluation
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
# Generate confusion matrix
confusion_matrix <- confusionMatrix(
factor(predicted_class),
factor(project_data$Revenue),
positive = "1"
)
# Print confusion matrix and evaluation metrics
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9364 487
## 1 1058 1421
##
## Accuracy : 0.8747
## 95% CI : (0.8687, 0.8805)
## No Information Rate : 0.8453
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5732
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7448
## Specificity : 0.8985
## Pos Pred Value : 0.5732
## Neg Pred Value : 0.9506
## Prevalence : 0.1547
## Detection Rate : 0.1152
## Detection Prevalence : 0.2011
## Balanced Accuracy : 0.8216
##
## 'Positive' Class : 1
##
# Extract the components from the confusion matrix
confusion_data <- confusionMatrix(
factor(predicted_class),
factor(project_data$Revenue),
positive = "1"
)
# Extracting precision, recall, and F1 score
precision <- confusion_data$byClass["Pos Pred Value"] # Precision
recall <- confusion_data$byClass["Sensitivity"] # Recall
f1_score <- 2 * (precision * recall) / (precision + recall) # F1 Score
# Print out the results
cat("Precision: ", precision, "\n")
## Precision: 0.573215
cat("Recall: ", recall, "\n")
## Recall: 0.7447589
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.6478231
So, we’ll start by using a Box-Cox transformation to find the optimal lambda for each of the three continuous predictors: ExitRates, PageValues, and total_pages.
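For reference, the family of transformations that boxcox() searches over is the standard power transform. This tiny helper is just a generic restatement of that formula (added for illustration; it is not used in the analysis below), and it matches the by-hand transformation applied further down.
# Box-Cox power transform: (y^lambda - 1) / lambda, with log(y) as the lambda = 0 limit
box_cox <- function(y, lambda) {
  if (abs(lambda) < 1e-8) log(y) else (y^lambda - 1) / lambda
}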
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Add a small constant to ExitRates to ensure all values are positive
project_data_more_clean$ExitRates_shifted <- project_data_more_clean$ExitRates + 0.001
project_data_more_clean$pagevalues_shifted <- project_data_more_clean$PageValues + 0.001
project_data_more_clean$total_pages_shifted <- project_data_more_clean$total_pages + 0.001
# Apply the Box-Cox transformation to determine the optimal lambda for the shifted variable
boxcox_result <- boxcox(lm(ExitRates_shifted ~ 1, data = project_data_more_clean),
lambda = seq(-2, 2, by = 0.1))
# Apply Box-Cox for PageValues
boxcox_pagevalues <- boxcox(lm(pagevalues_shifted ~ 1, data = project_data_more_clean),
lambda = seq(-2, 2, by = 0.1))
# Apply Box-Cox for total_pages
boxcox_total_pages <- boxcox(lm(total_pages_shifted ~ 1, data = project_data_more_clean),
lambda = seq(-2, 2, by = 0.1))
# Find the optimal lambda
optimal_lambda_exitrates <- boxcox_result$x[which.max(boxcox_result$y)]
optimal_lambda_exitrates
## [1] 0.02020202
# Identify optimal lambda for PageValues
optimal_lambda_pagevalues <- boxcox_pagevalues$x[which.max(boxcox_pagevalues$y)]
cat("Optimal lambda for PageValues:", optimal_lambda_pagevalues, "\n")
## Optimal lambda for PageValues: -0.4242424
# Identify optimal lambda for total_pages
optimal_lambda_total_pages <- boxcox_total_pages$x[which.max(boxcox_total_pages$y)]
cat("Optimal lambda for Total Pages:", optimal_lambda_total_pages, "\n")
## Optimal lambda for Total Pages: 0.1414141
Now I can apply the optimal lambda values to carry out the transformations.
# Apply Box-Cox transformation for ExitRates with its optimal lambda
project_data_more_clean$ExitRates_transformed <- (project_data_more_clean$ExitRates_shifted^optimal_lambda_exitrates - 1) / optimal_lambda_exitrates
# Apply Box-Cox transformation for PageValues with its optimal lambda
project_data_more_clean$PageValues_transformed <- (project_data_more_clean$pagevalues_shifted^optimal_lambda_pagevalues - 1) / optimal_lambda_pagevalues
# Apply Box-Cox transformation for total_pages with its optimal lambda
project_data_more_clean$total_pages_transformed <- (project_data_more_clean$total_pages_shifted^optimal_lambda_total_pages - 1) / optimal_lambda_total_pages
Let’s look at how this changed our three variables.
library(gridExtra) # To arrange multiple plots together
## Warning: package 'gridExtra' was built under R version 4.3.3
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Original and transformed histograms
p1 <- ggplot(project_data_more_clean, aes(x = ExitRates)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
ggtitle("Original ExitRates")
p2 <- ggplot(project_data_more_clean, aes(x = ExitRates_transformed)) +
geom_histogram(bins = 30, fill = "lightgreen", color = "black") +
ggtitle("Transformed ExitRates")
p3 <- ggplot(project_data_more_clean, aes(x = PageValues)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
ggtitle("Original PageValues")
p4 <- ggplot(project_data_more_clean, aes(x = PageValues_transformed)) +
geom_histogram(bins = 30, fill = "lightgreen", color = "black") +
ggtitle("Transformed PageValues")
p5 <- ggplot(project_data_more_clean, aes(x = total_pages)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black") +
ggtitle("Original Total Pages")
p6 <- ggplot(project_data_more_clean, aes(x = total_pages_transformed)) +
geom_histogram(bins = 30, fill = "lightgreen", color = "black") +
ggtitle("Transformed Total Pages")
# Arrange plots in a grid
grid.arrange(p1, p2, p3, p4, p5, p6, ncol = 2)
This worked really well for total_pages and ExitRates, but I’m not sure what happened with PageValues. I’ll check the correlation on that one (after the quick diagnostic below) to determine which version works best, I guess (if this means there’s another route to best transform the variable, please let me know, TA).
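One plausible explanation (an assumption worth verifying, not something established above) is that most sessions have PageValues equal to exactly zero; after the 0.001 shift those all land on a single transformed value, which would show up as one tall spike in the histogram. A one-line check:
# Diagnostic only: share of sessions with PageValues exactly 0
mean(project_data_more_clean$PageValues == 0)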
project_data_more_clean$Revenue <- as.numeric(as.character(project_data_more_clean$Revenue))
# Calculate the correlations
cor_original <- cor(project_data_more_clean$PageValues, project_data_more_clean$Revenue)
cor_transformed <- cor(project_data_more_clean$PageValues_transformed, project_data_more_clean$Revenue)
# Print the results
cat("Correlation between Revenue and original PageValues:", cor_original, "\n")
## Correlation between Revenue and original PageValues: 0.4925693
cat("Correlation between Revenue and transformed PageValues:", cor_transformed, "\n")
## Correlation between Revenue and transformed PageValues: 0.6047496
It did improve the correlation, so we’ll go with the transformed version despite the odd-looking distribution.
Now we’ll see how using these new transformed variables impacts our model.
# Make sure 'Revenue' is treated as a binary factor
project_data_more_clean$Revenue <- as.factor(project_data_more_clean$Revenue)
# Define weights for the imbalanced classes
class_counts <- table(project_data_more_clean$Revenue)
total_count <- nrow(project_data_more_clean)
weight_0 <- total_count / (2 * class_counts[1]) # Weight for class 0 (non-revenue)
weight_1 <- total_count / (2 * class_counts[2]) # Weight for class 1 (revenue)
project_data_more_clean$weights <- ifelse(project_data_more_clean$Revenue == 0, weight_0, weight_1)
# Fit the weighted logistic regression model using the transformed variables
weighted_model_transformed <- glm(
formula = Revenue ~ ExitRates_transformed + PageValues_transformed + total_pages_transformed,
family = binomial,
data = project_data_more_clean,
weights = weights
)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
# View the summary of the model
summary(weighted_model_transformed)
##
## Call:
## glm(formula = Revenue ~ ExitRates_transformed + PageValues_transformed +
## total_pages_transformed, family = binomial, data = project_data_more_clean,
## weights = weights)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.005907 0.133512 0.044 0.9647
## ExitRates_transformed -0.405195 0.034516 -11.739 <2e-16 ***
## PageValues_transformed 0.074443 0.001239 60.078 <2e-16 ***
## total_pages_transformed 0.048630 0.016268 2.989 0.0028 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17093 on 12329 degrees of freedom
## Residual deviance: 10291 on 12326 degrees of freedom
## AIC: 13386
##
## Number of Fisher Scoring iterations: 4
Next, we’ll compute the same confusion matrix and evaluation metrics at the 0.5 threshold, like we did for the first model.
# Predict probabilities using the new weighted logistic regression model
predicted_prob <- predict(weighted_model_transformed, type = "response")
# Classify based on a 0.5 threshold
predicted_class <- ifelse(predicted_prob > 0.5, 1, 0)
# Generate Confusion Matrix and Evaluation Metrics
confusion_matrix <- confusionMatrix(
factor(predicted_class),
factor(project_data_more_clean$Revenue),
positive = "1"
)
# Print confusion matrix and main evaluation metrics
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9230 370
## 1 1192 1538
##
## Accuracy : 0.8733
## 95% CI : (0.8673, 0.8791)
## No Information Rate : 0.8453
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5882
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.8061
## Specificity : 0.8856
## Pos Pred Value : 0.5634
## Neg Pred Value : 0.9615
## Prevalence : 0.1547
## Detection Rate : 0.1247
## Detection Prevalence : 0.2214
## Balanced Accuracy : 0.8459
##
## 'Positive' Class : 1
##
# Extract components from confusion matrix for advanced metrics
precision <- confusion_matrix$byClass["Pos Pred Value"] # Precision
recall <- confusion_matrix$byClass["Sensitivity"] # Recall
f1_score <- 2 * (precision * recall) / (precision + recall) # F1 Score
specificity <- confusion_matrix$byClass["Specificity"] # Specificity
accuracy <- confusion_matrix$overall["Accuracy"] # Accuracy
balanced_accuracy <- confusion_matrix$byClass["Balanced Accuracy"] # Balanced Accuracy
# Print out all metrics
cat("Evaluation Metrics:\n")
## Evaluation Metrics:
cat("Precision: ", precision, "\n")
## Precision: 0.56337
cat("Recall: ", recall, "\n")
## Recall: 0.8060797
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.6632169
cat("Specificity: ", specificity, "\n")
## Specificity: 0.8856266
cat("Accuracy: ", accuracy, "\n")
## Accuracy: 0.8733171
cat("Balanced Accuracy: ", balanced_accuracy, "\n")
## Balanced Accuracy: 0.8458531
Interestingly, the accuracy went down, but the new model is more likely to correctly identify positive cases (without adjusting the threshold), which is ultimately more important. Since the goal is to determine which shoppers are more likely to make a purchase, this is good. Down the line I will probably tweak the threshold anyway, since I imagine the purpose of this model would be to push more advertisement to potential buyers: wrongly sending advertisements to non-purchasers wouldn’t be much of a problem, whereas failing to advertise to people who would have made a purchase would be. A quick sketch of what a lower cutoff does follows.
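As a rough illustration of that trade-off, here is what a lower cutoff on the transformed model’s predicted probabilities would look like (the 0.3 value is arbitrary and only shows the mechanics; a real cutoff would need to be chosen against business costs or a validation set):
# Reclassify at a lower, illustrative cutoff to favor recall over precision
predicted_class_low <- ifelse(predicted_prob > 0.3, 1, 0)
confusionMatrix(
  factor(predicted_class_low, levels = c(0, 1)),
  factor(project_data_more_clean$Revenue),
  positive = "1"
)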
Although accuracy went down, it’s important to realize that accuracy in the first iteration really isn’t an indicator of a good model on its own. As a result of the transformations, the new model shows a number of improvements in stability and interpretability, which makes it the better version in my mind. These are the key changes that signal this to me:
Improvement in AIC: The AIC value dropped from 14355 in the original model to 13386 in the transformed model, indicating a better fit. Lower AIC values suggest the transformed model explains the variation in the data more effectively while using fewer parameters (a quick side-by-side comparison sketch follows this list).
Coefficient Significance and Scale: In the original model, each predictor had large z-values and very small p-values, indicating significance; in the transformed model all predictors remain significant, and the coefficients sit on a more comparable scale. This suggests that the transformations have reduced potential issues related to scaling or distribution, making the model more robust.
Interpretation of Coefficients: The coefficients for the transformed variables are generally smaller in magnitude compared to the original ones. This indicates that the model with transformed variables may have a more direct and less exaggerated relationship between the predictors and the response variable, which is often desired in predictive models to prevent overfitting.
Model Iterations and Convergence: The transformed model converged in only 4 Fisher Scoring iterations, while the original required 8 iterations. This is a positive indicator that the transformed model is more computationally stable, likely due to the smoother distributions of the transformed predictors.
Adjusted Predictive Power: The transformed model's residual deviance (10291) is lower than the original model’s residual deviance (10834), reinforcing that the transformed model has improved explanatory power with regard to the target variable.
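For a quick side-by-side view of these fit statistics, here is a small convenience sketch using broom (loaded at the top); it only re-tabulates the numbers already reported in the two model summaries above.
# Compare AIC, residual deviance, and residual degrees of freedom for both weighted models
bind_rows(
  glance(weighted_model) |> mutate(model = "original"),
  glance(weighted_model_transformed) |> mutate(model = "transformed")
) |>
  dplyr::select(model, AIC, deviance, df.residual)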
Although this has made the model more stable, there are still quite a few issues, I think:
Overall, recall and precision still aren’t great. Since we want to accurately predict successful (revenue) cases, we’d really like both of these to be higher.
With an imbalanced dataset (where non-revenue cases significantly outnumber revenue cases), logistic regression, even with weighting, might not be the best choice. Alternative approaches like gradient-boosted trees, random forests, or specialized techniques for imbalanced data (such as SMOTE for synthetic oversampling of the minority class) could potentially yield better results; a rough random-forest sketch follows this list.
The model improvements seen here might partly reflect overfitting to this specific dataset. Transformations can sometimes amplify the fit to existing data patterns, leading to a model that doesn’t generalize well; a cross-validation sketch to check this is given below.
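Picking up the alternative-models point above, here is a rough sketch of one such option, a random forest with stratified downsampling of the majority class. This assumes the randomForest package is installed; it is not part of the analysis above, and the settings are illustrative rather than tuned.
library(randomForest)
set.seed(123)
# Size of the minority (revenue) class, used to draw balanced samples per tree
n_pos <- sum(project_data_more_clean$Revenue == "1")
rf_model <- randomForest(
  Revenue ~ ExitRates_transformed + PageValues_transformed + total_pages_transformed,
  data = project_data_more_clean,
  ntree = 500,
  strata = project_data_more_clean$Revenue,
  sampsize = c(n_pos, n_pos)  # equal draws from each class within every tree
)
# Out-of-bag predictions give an honest confusion matrix without a separate test set
confusionMatrix(predict(rf_model), project_data_more_clean$Revenue, positive = "1")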
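And to check the concern about generalization, a sketch of 10-fold cross-validation with boot::cv.glm (boot is already loaded); the cost function here is the misclassification rate at the same 0.5 cutoff used above.
# y is the observed 0/1 outcome, prob the predicted probability
cv_cost <- function(y, prob) mean(abs(y - prob) > 0.5)
set.seed(123)
cv_result <- cv.glm(project_data_more_clean, weighted_model_transformed, cost = cv_cost, K = 10)
cv_result$delta[1]  # cross-validated error rate, to compare against the in-sample results above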