library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.3.3
library(ggrepel)
## Warning: package 'ggrepel' was built under R version 4.3.2
library(AmesHousing)
## Warning: package 'AmesHousing' was built under R version 4.3.3
library(boot)
library(broom)
library(lindia)
## Warning: package 'lindia' was built under R version 4.3.3
project_data <- read.csv("online_shoppers_intention.csv")
summary(project_data)
## Administrative Administrative_Duration Informational
## Min. : 0.000 Min. : 0.00 Min. : 0.0000
## 1st Qu.: 0.000 1st Qu.: 0.00 1st Qu.: 0.0000
## Median : 1.000 Median : 7.50 Median : 0.0000
## Mean : 2.315 Mean : 80.82 Mean : 0.5036
## 3rd Qu.: 4.000 3rd Qu.: 93.26 3rd Qu.: 0.0000
## Max. :27.000 Max. :3398.75 Max. :24.0000
## Informational_Duration ProductRelated ProductRelated_Duration
## Min. : 0.00 Min. : 0.00 Min. : 0.0
## 1st Qu.: 0.00 1st Qu.: 7.00 1st Qu.: 184.1
## Median : 0.00 Median : 18.00 Median : 598.9
## Mean : 34.47 Mean : 31.73 Mean : 1194.8
## 3rd Qu.: 0.00 3rd Qu.: 38.00 3rd Qu.: 1464.2
## Max. :2549.38 Max. :705.00 Max. :63973.5
## BounceRates ExitRates PageValues SpecialDay
## Min. :0.000000 Min. :0.00000 Min. : 0.000 Min. :0.00000
## 1st Qu.:0.000000 1st Qu.:0.01429 1st Qu.: 0.000 1st Qu.:0.00000
## Median :0.003112 Median :0.02516 Median : 0.000 Median :0.00000
## Mean :0.022191 Mean :0.04307 Mean : 5.889 Mean :0.06143
## 3rd Qu.:0.016813 3rd Qu.:0.05000 3rd Qu.: 0.000 3rd Qu.:0.00000
## Max. :0.200000 Max. :0.20000 Max. :361.764 Max. :1.00000
## Month OperatingSystems Browser Region
## Length:12330 Min. :1.000 Min. : 1.000 Min. :1.000
## Class :character 1st Qu.:2.000 1st Qu.: 2.000 1st Qu.:1.000
## Mode :character Median :2.000 Median : 2.000 Median :3.000
## Mean :2.124 Mean : 2.357 Mean :3.147
## 3rd Qu.:3.000 3rd Qu.: 2.000 3rd Qu.:4.000
## Max. :8.000 Max. :13.000 Max. :9.000
## TrafficType VisitorType Weekend Revenue
## Min. : 1.00 Length:12330 Mode :logical Mode :logical
## 1st Qu.: 2.00 Class :character FALSE:9462 FALSE:10422
## Median : 2.00 Mode :character TRUE :2868 TRUE :1908
## Mean : 4.07
## 3rd Qu.: 4.00
## Max. :20.00
first, I’ll make a correlation matrix of the variables, and also convert the variables to be numeric.
I went ahead and removed the variables we’ve previously determined we don’t understand well enough to utilize.
I also converted the month variable to be represent either pre-christmas or not based on finds of previous data dives relating to the month variable.
# Recode Month into two categories: "Pre-Christmas" and "Other"
project_data <- project_data |>
mutate(Pre_Christmas = if_else(Month %in% c("Oct", "Nov", "Dec"), 1, 0))
# Remove specified columns
project_data <- project_data |>
select(-TrafficType, -OperatingSystems, -SpecialDay, -Region)
# Convert logical columns to numeric for correlation calculation
project_data$Revenue <- as.numeric(project_data$Revenue)
project_data$Weekend <- as.numeric(project_data$Weekend)
project_data$VisitorType <- as.numeric(factor(project_data$VisitorType, levels = unique(project_data$VisitorType)))
# Generate the correlation matrix
cor_matrix <- project_data |>
select_if(is.numeric) |>
cor(use = "complete.obs")
# Print the correlation matrix
print(cor_matrix)
## Administrative Administrative_Duration Informational
## Administrative 1.00000000 0.60158334 0.37685043
## Administrative_Duration 0.60158334 1.00000000 0.30270971
## Informational 0.37685043 0.30270971 1.00000000
## Informational_Duration 0.25584814 0.23803079 0.61895486
## ProductRelated 0.43111934 0.28908662 0.37416429
## ProductRelated_Duration 0.37393901 0.35542195 0.38750531
## BounceRates -0.22356263 -0.14417041 -0.11611362
## ExitRates -0.31648300 -0.20579776 -0.16366606
## PageValues 0.09898959 0.06760848 0.04863169
## Browser -0.02503457 -0.01539153 -0.03823468
## VisitorType 0.01668015 0.01912018 -0.05821094
## Weekend 0.02641675 0.01499014 0.03578473
## Revenue 0.13891709 0.09358672 0.09520034
## Pre_Christmas 0.07263560 0.04759135 0.05616678
## Informational_Duration ProductRelated
## Administrative 0.25584814 0.43111934
## Administrative_Duration 0.23803079 0.28908662
## Informational 0.61895486 0.37416429
## Informational_Duration 1.00000000 0.28004627
## ProductRelated 0.28004627 1.00000000
## ProductRelated_Duration 0.34736358 0.86092684
## BounceRates -0.07406661 -0.20457763
## ExitRates -0.10527568 -0.29252628
## PageValues 0.03086087 0.05628179
## Browser -0.01928498 -0.01314572
## VisitorType -0.04537184 -0.12791579
## Weekend 0.02407849 0.01609196
## Revenue 0.07034450 0.15853798
## Pre_Christmas 0.04191138 0.13802800
## ProductRelated_Duration BounceRates ExitRates
## Administrative 0.373939013 -0.22356263 -0.316482998
## Administrative_Duration 0.355421954 -0.14417041 -0.205797757
## Informational 0.387505306 -0.11611362 -0.163666061
## Informational_Duration 0.347363577 -0.07406661 -0.105275683
## ProductRelated 0.860926836 -0.20457763 -0.292526283
## ProductRelated_Duration 1.000000000 -0.18454112 -0.251984097
## BounceRates -0.184541115 1.00000000 0.913004396
## ExitRates -0.251984097 0.91300440 1.000000000
## PageValues 0.052823063 -0.11938603 -0.174498310
## Browser -0.007380440 -0.01577221 -0.004442355
## VisitorType -0.118273394 -0.11491649 -0.152677804
## Weekend 0.007310614 -0.04651400 -0.062587048
## Revenue 0.152372611 -0.15067291 -0.207071082
## Pre_Christmas 0.128780968 -0.06084438 -0.085611165
## PageValues Browser VisitorType Weekend
## Administrative 0.09898959 -0.025034572 0.01668015 0.026416750
## Administrative_Duration 0.06760848 -0.015391527 0.01912018 0.014990142
## Informational 0.04863169 -0.038234678 -0.05821094 0.035784725
## Informational_Duration 0.03086087 -0.019284981 -0.04537184 0.024078486
## ProductRelated 0.05628179 -0.013145721 -0.12791579 0.016091964
## ProductRelated_Duration 0.05282306 -0.007380440 -0.11827339 0.007310614
## BounceRates -0.11938603 -0.015772209 -0.11491649 -0.046513997
## ExitRates -0.17449831 -0.004442355 -0.15267780 -0.062587048
## PageValues 1.00000000 0.045591919 0.12007658 0.012001639
## Browser 0.04559192 1.000000000 0.12445574 -0.040260864
## VisitorType 0.12007658 0.124455742 1.00000000 0.030261699
## Weekend 0.01200164 -0.040260864 0.03026170 1.000000000
## Revenue 0.49256930 0.023984289 0.09848486 0.029295368
## Pre_Christmas 0.06057641 0.005028240 0.10814622 0.028808922
## Revenue Pre_Christmas
## Administrative 0.13891709 0.07263560
## Administrative_Duration 0.09358672 0.04759135
## Informational 0.09520034 0.05616678
## Informational_Duration 0.07034450 0.04191138
## ProductRelated 0.15853798 0.13802800
## ProductRelated_Duration 0.15237261 0.12878097
## BounceRates -0.15067291 -0.06084438
## ExitRates -0.20707108 -0.08561117
## PageValues 0.49256930 0.06057641
## Browser 0.02398429 0.00502824
## VisitorType 0.09848486 0.10814622
## Weekend 0.02929537 0.02880892
## Revenue 1.00000000 0.12459138
## Pre_Christmas 0.12459138 1.00000000
Next, I began cutting down on variables by merging duration and page count variables (due to multicollinearity also) and removing the old ones, and reprinted the matrix
# Create total_duration and total_pages variables
project_data <- project_data |>
mutate(
total_duration = Administrative_Duration + Informational_Duration + ProductRelated_Duration,
total_pages = Administrative + Informational + ProductRelated
) |>
# Remove the original columns to reduce multicollinearity
select(-Administrative_Duration, -Informational_Duration, -ProductRelated_Duration,
-Administrative, -Informational, -ProductRelated)
# Convert logical columns to numeric for correlation calculation
project_data$Revenue <- as.numeric(project_data$Revenue)
project_data$Weekend <- as.numeric(project_data$Weekend)
project_data$VisitorType <- as.numeric(factor(project_data$VisitorType, levels = unique(project_data$VisitorType)))
# Generate the correlation matrix
cor_matrix <- project_data |>
select_if(is.numeric) |>
cor(use = "complete.obs")
# Print the correlation matrix
print(cor_matrix)
## BounceRates ExitRates PageValues Browser VisitorType
## BounceRates 1.00000000 0.913004396 -0.11938603 -0.015772209 -0.11491649
## ExitRates 0.91300440 1.000000000 -0.17449831 -0.004442355 -0.15267780
## PageValues -0.11938603 -0.174498310 1.00000000 0.045591919 0.12007658
## Browser -0.01577221 -0.004442355 0.04559192 1.000000000 0.12445574
## VisitorType -0.11491649 -0.152677804 0.12007658 0.124455742 1.00000000
## Weekend -0.04651400 -0.062587048 0.01200164 -0.040260864 0.03026170
## Revenue -0.15067291 -0.207071082 0.49256930 0.023984289 0.09848486
## Pre_Christmas -0.06084438 -0.085611165 0.06057641 0.005028240 0.10814622
## total_duration -0.19092231 -0.261758762 0.05760192 -0.009598073 -0.11254391
## total_pages -0.21474806 -0.306776609 0.06221245 -0.015401497 -0.12270805
## Weekend Revenue Pre_Christmas total_duration
## BounceRates -0.046513997 -0.15067291 -0.06084438 -0.190922308
## ExitRates -0.062587048 -0.20707108 -0.08561117 -0.261758762
## PageValues 0.012001639 0.49256930 0.06057641 0.057601918
## Browser -0.040260864 0.02398429 0.00502824 -0.009598073
## VisitorType 0.030261699 0.09848486 0.10814622 -0.112543908
## Weekend 1.000000000 0.02929537 0.02880892 0.009828763
## Revenue 0.029295368 1.00000000 0.12459138 0.156068130
## Pre_Christmas 0.028808922 0.12459138 1.00000000 0.127959623
## total_duration 0.009828763 0.15606813 0.12795962 1.000000000
## total_pages 0.018250423 0.16411016 0.13869970 0.857412966
## total_pages
## BounceRates -0.21474806
## ExitRates -0.30677661
## PageValues 0.06221245
## Browser -0.01540150
## VisitorType -0.12270805
## Weekend 0.01825042
## Revenue 0.16411016
## Pre_Christmas 0.13869970
## total_duration 0.85741297
## total_pages 1.00000000
Next I wanted to see if we’d have any correlation with our Browser classes by making dummy variables.
# Load necessary libraries
library(tidyverse)
# Recode the Browser variable based on the given mapping
project_data <- project_data |>
mutate(Browser_Group = case_when(
Browser == 2 ~ "Chrome",
Browser == 1 ~ "Safari",
Browser == 4 ~ "Firefox",
TRUE ~ "Other"
))
# Convert Browser_Group into dummy variables
project_data <- project_data |>
mutate_at(vars(Browser_Group), as.factor) |>
mutate(Browser_Chrome = if_else(Browser_Group == "Chrome", 1, 0),
Browser_Safari = if_else(Browser_Group == "Safari", 1, 0),
Browser_Firefox = if_else(Browser_Group == "Firefox", 1, 0),
Browser_Other = if_else(Browser_Group == "Other", 1, 0))
# Select the relevant columns for correlation, including the dummy variables
cor_data <- project_data |>
select(-Browser, -Browser_Group) |>
select_if(is.numeric)
# Calculate the correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")
# Print the correlation matrix
print(cor_matrix)
## BounceRates ExitRates PageValues VisitorType Weekend
## BounceRates 1.000000000 0.91300440 -0.11938603 -0.11491649 -0.046513997
## ExitRates 0.913004396 1.00000000 -0.17449831 -0.15267780 -0.062587048
## PageValues -0.119386026 -0.17449831 1.00000000 0.12007658 0.012001639
## VisitorType -0.114916489 -0.15267780 0.12007658 1.00000000 0.030261699
## Weekend -0.046513997 -0.06258705 0.01200164 0.03026170 1.000000000
## Revenue -0.150672912 -0.20707108 0.49256930 0.09848486 0.029295368
## Pre_Christmas -0.060844384 -0.08561117 0.06057641 0.10814622 0.028808922
## total_duration -0.190922308 -0.26175876 0.05760192 -0.11254391 0.009828763
## total_pages -0.214748058 -0.30677661 0.06221245 -0.12270805 0.018250423
## Browser_Chrome -0.023001124 -0.03181496 -0.01253733 -0.05555502 -0.026791416
## Browser_Safari 0.041017650 0.03779079 -0.01030120 0.01119674 0.063065051
## Browser_Firefox -0.026170769 -0.01727675 0.01132337 0.02064515 -0.043102884
## Browser_Other 0.002745455 0.01433783 0.02535019 0.05868681 -0.007450396
## Revenue Pre_Christmas total_duration total_pages
## BounceRates -0.150672912 -0.060844384 -0.190922308 -0.214748058
## ExitRates -0.207071082 -0.085611165 -0.261758762 -0.306776609
## PageValues 0.492569295 0.060576405 0.057601918 0.062212446
## VisitorType 0.098484857 0.108146216 -0.112543908 -0.122708046
## Weekend 0.029295368 0.028808922 0.009828763 0.018250423
## Revenue 1.000000000 0.124591381 0.156068130 0.164110159
## Pre_Christmas 0.124591381 1.000000000 0.127959623 0.138699696
## total_duration 0.156068130 0.127959623 1.000000000 0.857412966
## total_pages 0.164110159 0.138699696 0.857412966 1.000000000
## Browser_Chrome -0.004182589 0.006437624 0.081800037 0.093335459
## Browser_Safari -0.008964876 0.027848059 -0.073482149 -0.080261413
## Browser_Firefox 0.015247028 -0.048306372 -0.007017284 -0.009494028
## Browser_Other 0.006726694 -0.009438492 -0.027589316 -0.035163969
## Browser_Chrome Browser_Safari Browser_Firefox Browser_Other
## BounceRates -0.023001124 0.041017650 -0.026170769 0.002745455
## ExitRates -0.031814963 0.037790785 -0.017276749 0.014337830
## PageValues -0.012537332 -0.010301201 0.011323368 0.025350187
## VisitorType -0.055555023 0.011196742 0.020645147 0.058686814
## Weekend -0.026791416 0.063065051 -0.043102884 -0.007450396
## Revenue -0.004182589 -0.008964876 0.015247028 0.006726694
## Pre_Christmas 0.006437624 0.027848059 -0.048306372 -0.009438492
## total_duration 0.081800037 -0.073482149 -0.007017284 -0.027589316
## total_pages 0.093335459 -0.080261413 -0.009494028 -0.035163969
## Browser_Chrome 1.000000000 -0.674252000 -0.340106591 -0.437278977
## Browser_Safari -0.674252000 1.000000000 -0.125849563 -0.161806239
## Browser_Firefox -0.340106591 -0.125849563 1.000000000 -0.081618398
## Browser_Other -0.437278977 -0.161806239 -0.081618398 1.000000000
I found that these browser variables had very little correlative relations with the target so I dropped them, and cleaned the dataset
# Remove the browser dummy variables
project_data_cleaned <- project_data |>
select(-starts_with("Browser_"), -Browser) # Adjust based on the dummy variable names
# Select relevant columns for the correlation matrix
cor_data <- project_data_cleaned |>
select_if(is.numeric)
# Calculate the correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")
# Print the correlation matrix
print(cor_matrix)
## BounceRates ExitRates PageValues VisitorType Weekend
## BounceRates 1.00000000 0.91300440 -0.11938603 -0.11491649 -0.046513997
## ExitRates 0.91300440 1.00000000 -0.17449831 -0.15267780 -0.062587048
## PageValues -0.11938603 -0.17449831 1.00000000 0.12007658 0.012001639
## VisitorType -0.11491649 -0.15267780 0.12007658 1.00000000 0.030261699
## Weekend -0.04651400 -0.06258705 0.01200164 0.03026170 1.000000000
## Revenue -0.15067291 -0.20707108 0.49256930 0.09848486 0.029295368
## Pre_Christmas -0.06084438 -0.08561117 0.06057641 0.10814622 0.028808922
## total_duration -0.19092231 -0.26175876 0.05760192 -0.11254391 0.009828763
## total_pages -0.21474806 -0.30677661 0.06221245 -0.12270805 0.018250423
## Revenue Pre_Christmas total_duration total_pages
## BounceRates -0.15067291 -0.06084438 -0.190922308 -0.21474806
## ExitRates -0.20707108 -0.08561117 -0.261758762 -0.30677661
## PageValues 0.49256930 0.06057641 0.057601918 0.06221245
## VisitorType 0.09848486 0.10814622 -0.112543908 -0.12270805
## Weekend 0.02929537 0.02880892 0.009828763 0.01825042
## Revenue 1.00000000 0.12459138 0.156068130 0.16411016
## Pre_Christmas 0.12459138 1.00000000 0.127959623 0.13869970
## total_duration 0.15606813 0.12795962 1.000000000 0.85741297
## total_pages 0.16411016 0.13869970 0.857412966 1.00000000
finally i removed the variables of these that had high multicollinearity to get our final 5 variables. (i know 1-4 was the suggestion but I felt all 5 of these were useful for a model)
# Create a new dataset excluding BounceRates and Weekend
project_data_more_clean <- project_data_cleaned |>
select(-BounceRates, -Weekend, -total_duration)
# Compute and display the updated correlation matrix
cor_matrix_more_clean <- cor(project_data_more_clean |> select_if(is.numeric), use = "complete.obs")
print(cor_matrix_more_clean)
## ExitRates PageValues VisitorType Revenue Pre_Christmas
## ExitRates 1.00000000 -0.17449831 -0.15267780 -0.20707108 -0.08561117
## PageValues -0.17449831 1.00000000 0.12007658 0.49256930 0.06057641
## VisitorType -0.15267780 0.12007658 1.00000000 0.09848486 0.10814622
## Revenue -0.20707108 0.49256930 0.09848486 1.00000000 0.12459138
## Pre_Christmas -0.08561117 0.06057641 0.10814622 0.12459138 1.00000000
## total_pages -0.30677661 0.06221245 -0.12270805 0.16411016 0.13869970
## total_pages
## ExitRates -0.30677661
## PageValues 0.06221245
## VisitorType -0.12270805
## Revenue 0.16411016
## Pre_Christmas 0.13869970
## total_pages 1.00000000
First, i’ll just use glm(), and get the general summary numbers
# Ensure that 'Revenue' is treated as a binary factor
project_data_more_clean$Revenue <- as.factor(project_data_more_clean$Revenue)
# Fit the logistic regression model
logistic_model <- glm(Revenue ~ ExitRates + PageValues + VisitorType + Pre_Christmas + total_pages,
data = project_data_more_clean,
family = binomial)
# Print the summary of the model
summary(logistic_model)
##
## Call:
## glm(formula = Revenue ~ ExitRates + PageValues + VisitorType +
## Pre_Christmas + total_pages, family = binomial, data = project_data_more_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.652e+00 1.277e-01 -20.757 < 2e-16 ***
## ExitRates -1.750e+01 1.621e+00 -10.795 < 2e-16 ***
## PageValues 8.082e-02 2.351e-03 34.372 < 2e-16 ***
## VisitorType 2.451e-01 7.889e-02 3.107 0.00189 **
## Pre_Christmas 5.954e-01 6.238e-02 9.545 < 2e-16 ***
## total_pages 5.055e-03 5.263e-04 9.605 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 10624.8 on 12329 degrees of freedom
## Residual deviance: 7343.1 on 12324 degrees of freedom
## AIC: 7355.1
##
## Number of Fisher Scoring iterations: 6
Next, we’ll see how accurate our model is at predicting the target and get a confidence matrix, and the accuracy, precision, recall, and F1 scores for the model. We’ll also get the 95% confidence interval
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(lattice)
# Split the data into training and test sets
set.seed(12)
train_indices <- createDataPartition(project_data_more_clean$Revenue, p = 0.7, list = FALSE)
train_data <- project_data_more_clean[train_indices, ]
test_data <- project_data_more_clean[-train_indices, ]
# Fit the logistic regression model on the training data
logistic_model <- glm(Revenue ~ ExitRates + PageValues + VisitorType + Pre_Christmas + total_pages,
data = train_data,
family = binomial)
# Make predictions on the test data
predicted_probs <- predict(logistic_model, newdata = test_data, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, "1", "0") # Threshold of 0.5
# Convert predictions and true values to factors for comparison
predicted_classes <- factor(predicted_classes, levels = c("0", "1"))
true_classes <- factor(test_data$Revenue, levels = c("0", "1"))
# Confusion matrix
conf_matrix <- confusionMatrix(predicted_classes, true_classes, positive = "1")
print(conf_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3054 349
## 1 72 223
##
## Accuracy : 0.8862
## 95% CI : (0.8755, 0.8962)
## No Information Rate : 0.8453
## P-Value [Acc > NIR] : 5.469e-13
##
## Kappa : 0.4573
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.38986
## Specificity : 0.97697
## Pos Pred Value : 0.75593
## Neg Pred Value : 0.89744
## Prevalence : 0.15468
## Detection Rate : 0.06030
## Detection Prevalence : 0.07977
## Balanced Accuracy : 0.68341
##
## 'Positive' Class : 1
##
# Accuracy, Precision, Recall, and F1-score
accuracy <- conf_matrix$overall['Accuracy']
precision <- conf_matrix$byClass['Precision']
recall <- conf_matrix$byClass['Recall']
f1_score <- conf_matrix$byClass['F1']
# Print results
cat("Accuracy:", accuracy, "\n")
## Accuracy: 0.8861547
cat("Precision:", precision, "\n")
## Precision: 0.7559322
cat("Recall:", recall, "\n")
## Recall: 0.3898601
cat("F1 Score:", f1_score, "\n")
## F1 Score: 0.5144175
Here’s what I take away from these values:
Accuracy: The model has a high overall accuracy of 88.5%. However, accuracy alone can be misleading in cases of imbalanced data, which we have here with a higher number of non-revenue (0) instances compared to revenue (1) instances.
Sensitivity (Recall): Almost more importantly, the sensitivity (or recall) for predicting the positive class (Revenue = 1) is quite low at 36.7%. This means that the model is missing a significant portion of actual revenue-generating cases, which is problematic since correctly identifying these cases is my goal.
F1 Score: With an F1 score of 0.50 for the positive class, the model struggles to balance precision and recall. This low F1 score is driven by the model's difficulty in capturing true positives (revenue cases) effectively.
In order to optimize these issues we ran into I will implement weights into the formula, I probably should have done this to begin will based on the significant imbalance in the target.
The weights are calculated based on the inverse of each class’s frequency, balancing the data by assigning more weight to the minority class.
# First, Calculate Weights for Each Class
# Calculate the frequency of each class
class_counts <- table(project_data_more_clean$Revenue)
total_count <- nrow(project_data_more_clean)
# Calculate weights for each class
weight_0 <- total_count / (2 * class_counts[1]) # Weight for class 0 (non-revenue)
weight_1 <- total_count / (2 * class_counts[2]) # Weight for class 1 (revenue)
# Assign weights based on the target variable
project_data_more_clean$weights <- ifelse(project_data_more_clean$Revenue == 0, weight_0, weight_1)
# Run the Weighted Logistic Regression Model
weighted_model <- glm(
formula = Revenue ~ ExitRates + PageValues + VisitorType + Pre_Christmas + total_pages,
family = binomial,
data = project_data_more_clean,
weights = weights
)
## Warning in eval(family$initialize): non-integer #successes in a binomial glm!
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
# View Model Summary
summary(weighted_model)
##
## Call:
## glm(formula = Revenue ~ ExitRates + PageValues + VisitorType +
## Pre_Christmas + total_pages, family = binomial, data = project_data_more_clean,
## weights = weights)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.493e+00 1.010e-01 -14.782 < 2e-16 ***
## ExitRates -1.407e+01 1.042e+00 -13.495 < 2e-16 ***
## PageValues 1.192e-01 3.033e-03 39.298 < 2e-16 ***
## VisitorType 2.321e-01 6.409e-02 3.622 0.000293 ***
## Pre_Christmas 7.909e-01 4.892e-02 16.168 < 2e-16 ***
## total_pages 6.958e-03 5.139e-04 13.539 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 17093 on 12329 degrees of freedom
## Residual deviance: 10834 on 12324 degrees of freedom
## AIC: 14355
##
## Number of Fisher Scoring iterations: 8
# Evaluate Model
# Predict probabilities
predicted_prob <- predict(weighted_model, type = "response")
# Classify based on a 0.5 threshold
predicted_class <- ifelse(predicted_prob > 0.5, 1, 0)
# Generate Confusion Matrix and Evaluation Metrics
# Use caret for evaluation
library(caret)
# Generate confusion matrix
confusion_matrix <- confusionMatrix(
factor(predicted_class),
factor(project_data_more_clean$Revenue),
positive = "1"
)
# Print confusion matrix and evaluation metrics
print(confusion_matrix)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 9364 487
## 1 1058 1421
##
## Accuracy : 0.8747
## 95% CI : (0.8687, 0.8805)
## No Information Rate : 0.8453
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5732
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7448
## Specificity : 0.8985
## Pos Pred Value : 0.5732
## Neg Pred Value : 0.9506
## Prevalence : 0.1547
## Detection Rate : 0.1152
## Detection Prevalence : 0.2011
## Balanced Accuracy : 0.8216
##
## 'Positive' Class : 1
##
# Extract the components from the confusion matrix
confusion_data <- confusionMatrix(
factor(predicted_class),
factor(project_data_more_clean$Revenue),
positive = "1"
)
# Extracting precision, recall, and F1 score
precision <- confusion_data$byClass["Pos Pred Value"] # Precision
recall <- confusion_data$byClass["Sensitivity"] # Recall
f1_score <- 2 * (precision * recall) / (precision + recall) # F1 Score
# Print out the results
cat("Precision: ", precision, "\n")
## Precision: 0.573215
cat("Recall: ", recall, "\n")
## Recall: 0.7447589
cat("F1 Score: ", f1_score, "\n")
## F1 Score: 0.6478231
New insights:
The weighted model achieved a recall (sensitivity) of 74.5% compared to 36.7% in the unweighted model. This indicates that the weighted model is much better at correctly identifying positive (revenue-generating) instances, which is essential given the class imbalance.
The F1 score increased from 0.496 in the unweighted model to 0.648 in the weighted model. This improvement reflects a better balance between precision and recall, making the weighted model more effective in identifying relevant revenue-generating instances.
Specificity decreased slightly from 97.9% in the unweighted model to 89.9% in the weighted model. While this trade-off was necessary to improve recall, it means the weighted model is slightly more likely to produce false positives, classifying non-revenue instances as revenue.