library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.3.3

library(ggrepel)

## Warning: package 'ggrepel' was built under R version 4.3.2

library(AmesHousing)

## Warning: package 'AmesHousing' was built under R version 4.3.3

library(boot)
library(broom)
library(lindia)

## Warning: package 'lindia' was built under R version 4.3.3

project_data <- read.csv("online_shoppers_intention.csv")

Logistic Regression

summary(project_data)

##  Administrative   Administrative_Duration Informational    
##  Min.   : 0.000   Min.   :   0.00         Min.   : 0.0000  
##  1st Qu.: 0.000   1st Qu.:   0.00         1st Qu.: 0.0000  
##  Median : 1.000   Median :   7.50         Median : 0.0000  
##  Mean   : 2.315   Mean   :  80.82         Mean   : 0.5036  
##  3rd Qu.: 4.000   3rd Qu.:  93.26         3rd Qu.: 0.0000  
##  Max.   :27.000   Max.   :3398.75         Max.   :24.0000  
##  Informational_Duration ProductRelated   ProductRelated_Duration
##  Min.   :   0.00        Min.   :  0.00   Min.   :    0.0        
##  1st Qu.:   0.00        1st Qu.:  7.00   1st Qu.:  184.1        
##  Median :   0.00        Median : 18.00   Median :  598.9        
##  Mean   :  34.47        Mean   : 31.73   Mean   : 1194.8        
##  3rd Qu.:   0.00        3rd Qu.: 38.00   3rd Qu.: 1464.2        
##  Max.   :2549.38        Max.   :705.00   Max.   :63973.5        
##   BounceRates         ExitRates         PageValues        SpecialDay     
##  Min.   :0.000000   Min.   :0.00000   Min.   :  0.000   Min.   :0.00000  
##  1st Qu.:0.000000   1st Qu.:0.01429   1st Qu.:  0.000   1st Qu.:0.00000  
##  Median :0.003112   Median :0.02516   Median :  0.000   Median :0.00000  
##  Mean   :0.022191   Mean   :0.04307   Mean   :  5.889   Mean   :0.06143  
##  3rd Qu.:0.016813   3rd Qu.:0.05000   3rd Qu.:  0.000   3rd Qu.:0.00000  
##  Max.   :0.200000   Max.   :0.20000   Max.   :361.764   Max.   :1.00000  
##     Month           OperatingSystems    Browser           Region     
##  Length:12330       Min.   :1.000    Min.   : 1.000   Min.   :1.000  
##  Class :character   1st Qu.:2.000    1st Qu.: 2.000   1st Qu.:1.000  
##  Mode  :character   Median :2.000    Median : 2.000   Median :3.000  
##                     Mean   :2.124    Mean   : 2.357   Mean   :3.147  
##                     3rd Qu.:3.000    3rd Qu.: 2.000   3rd Qu.:4.000  
##                     Max.   :8.000    Max.   :13.000   Max.   :9.000  
##   TrafficType    VisitorType         Weekend         Revenue       
##  Min.   : 1.00   Length:12330       Mode :logical   Mode :logical  
##  1st Qu.: 2.00   Class :character   FALSE:9462      FALSE:10422    
##  Median : 2.00   Mode  :character   TRUE :2868      TRUE :1908     
##  Mean   : 4.07                                                     
##  3rd Qu.: 4.00                                                     
##  Max.   :20.00

VARIABLE SELECTION

first, I’ll make a correlation matrix of the variables, and also convert the variables to be numeric.

I went ahead and removed the variables we’ve previously determined we don’t understand well enough to utilize.

I also converted the month variable to be represent either pre-christmas or not based on finds of previous data dives relating to the month variable.

This was when we found the significant difference in purchasing patterns leading up to christmas

# Recode Month into two categories: "Pre-Christmas" and "Other"
project_data <- project_data |>
  mutate(Pre_Christmas = if_else(Month %in% c("Oct", "Nov", "Dec"), 1, 0))

# Remove specified columns
project_data <- project_data |>
  select(-TrafficType, -OperatingSystems, -SpecialDay, -Region)

# Convert logical columns to numeric for correlation calculation
project_data$Revenue <- as.numeric(project_data$Revenue)
project_data$Weekend <- as.numeric(project_data$Weekend)
project_data$VisitorType <- as.numeric(factor(project_data$VisitorType, levels = unique(project_data$VisitorType)))

# Generate the correlation matrix
cor_matrix <- project_data |>
  select_if(is.numeric) |>
  cor(use = "complete.obs")

# Print the correlation matrix
print(cor_matrix)

##                         Administrative Administrative_Duration Informational
## Administrative              1.00000000              0.60158334    0.37685043
## Administrative_Duration     0.60158334              1.00000000    0.30270971
## Informational               0.37685043              0.30270971    1.00000000
## Informational_Duration      0.25584814              0.23803079    0.61895486
## ProductRelated              0.43111934              0.28908662    0.37416429
## ProductRelated_Duration     0.37393901              0.35542195    0.38750531
## BounceRates                -0.22356263             -0.14417041   -0.11611362
## ExitRates                  -0.31648300             -0.20579776   -0.16366606
## PageValues                  0.09898959              0.06760848    0.04863169
## Browser                    -0.02503457             -0.01539153   -0.03823468
## VisitorType                 0.01668015              0.01912018   -0.05821094
## Weekend                     0.02641675              0.01499014    0.03578473
## Revenue                     0.13891709              0.09358672    0.09520034
## Pre_Christmas               0.07263560              0.04759135    0.05616678
##                         Informational_Duration ProductRelated
## Administrative                      0.25584814     0.43111934
## Administrative_Duration             0.23803079     0.28908662
## Informational                       0.61895486     0.37416429
## Informational_Duration              1.00000000     0.28004627
## ProductRelated                      0.28004627     1.00000000
## ProductRelated_Duration             0.34736358     0.86092684
## BounceRates                        -0.07406661    -0.20457763
## ExitRates                          -0.10527568    -0.29252628
## PageValues                          0.03086087     0.05628179
## Browser                            -0.01928498    -0.01314572
## VisitorType                        -0.04537184    -0.12791579
## Weekend                             0.02407849     0.01609196
## Revenue                             0.07034450     0.15853798
## Pre_Christmas                       0.04191138     0.13802800
##                         ProductRelated_Duration BounceRates    ExitRates
## Administrative                      0.373939013 -0.22356263 -0.316482998
## Administrative_Duration             0.355421954 -0.14417041 -0.205797757
## Informational                       0.387505306 -0.11611362 -0.163666061
## Informational_Duration              0.347363577 -0.07406661 -0.105275683
## ProductRelated                      0.860926836 -0.20457763 -0.292526283
## ProductRelated_Duration             1.000000000 -0.18454112 -0.251984097
## BounceRates                        -0.184541115  1.00000000  0.913004396
## ExitRates                          -0.251984097  0.91300440  1.000000000
## PageValues                          0.052823063 -0.11938603 -0.174498310
## Browser                            -0.007380440 -0.01577221 -0.004442355
## VisitorType                        -0.118273394 -0.11491649 -0.152677804
## Weekend                             0.007310614 -0.04651400 -0.062587048
## Revenue                             0.152372611 -0.15067291 -0.207071082
## Pre_Christmas                       0.128780968 -0.06084438 -0.085611165
##                          PageValues      Browser VisitorType      Weekend
## Administrative           0.09898959 -0.025034572  0.01668015  0.026416750
## Administrative_Duration  0.06760848 -0.015391527  0.01912018  0.014990142
## Informational            0.04863169 -0.038234678 -0.05821094  0.035784725
## Informational_Duration   0.03086087 -0.019284981 -0.04537184  0.024078486
## ProductRelated           0.05628179 -0.013145721 -0.12791579  0.016091964
## ProductRelated_Duration  0.05282306 -0.007380440 -0.11827339  0.007310614
## BounceRates             -0.11938603 -0.015772209 -0.11491649 -0.046513997
## ExitRates               -0.17449831 -0.004442355 -0.15267780 -0.062587048
## PageValues               1.00000000  0.045591919  0.12007658  0.012001639
## Browser                  0.04559192  1.000000000  0.12445574 -0.040260864
## VisitorType              0.12007658  0.124455742  1.00000000  0.030261699
## Weekend                  0.01200164 -0.040260864  0.03026170  1.000000000
## Revenue                  0.49256930  0.023984289  0.09848486  0.029295368
## Pre_Christmas            0.06057641  0.005028240  0.10814622  0.028808922
##                             Revenue Pre_Christmas
## Administrative           0.13891709    0.07263560
## Administrative_Duration  0.09358672    0.04759135
## Informational            0.09520034    0.05616678
## Informational_Duration   0.07034450    0.04191138
## ProductRelated           0.15853798    0.13802800
## ProductRelated_Duration  0.15237261    0.12878097
## BounceRates             -0.15067291   -0.06084438
## ExitRates               -0.20707108   -0.08561117
## PageValues               0.49256930    0.06057641
## Browser                  0.02398429    0.00502824
## VisitorType              0.09848486    0.10814622
## Weekend                  0.02929537    0.02880892
## Revenue                  1.00000000    0.12459138
## Pre_Christmas            0.12459138    1.00000000

Next, I began cutting down on variables by merging duration and page count variables (due to multicollinearity also) and removing the old ones, and reprinted the matrix

# Create total_duration and total_pages variables
project_data <- project_data |>
  mutate(
    total_duration = Administrative_Duration + Informational_Duration + ProductRelated_Duration,
    total_pages = Administrative + Informational + ProductRelated
  ) |>
  # Remove the original columns to reduce multicollinearity
  select(-Administrative_Duration, -Informational_Duration, -ProductRelated_Duration, 
         -Administrative, -Informational, -ProductRelated)

# Convert logical columns to numeric for correlation calculation
project_data$Revenue <- as.numeric(project_data$Revenue)
project_data$Weekend <- as.numeric(project_data$Weekend)
project_data$VisitorType <- as.numeric(factor(project_data$VisitorType, levels = unique(project_data$VisitorType)))

# Generate the correlation matrix
cor_matrix <- project_data |>
  select_if(is.numeric) |>
  cor(use = "complete.obs")

# Print the correlation matrix
print(cor_matrix)

##                BounceRates    ExitRates  PageValues      Browser VisitorType
## BounceRates     1.00000000  0.913004396 -0.11938603 -0.015772209 -0.11491649
## ExitRates       0.91300440  1.000000000 -0.17449831 -0.004442355 -0.15267780
## PageValues     -0.11938603 -0.174498310  1.00000000  0.045591919  0.12007658
## Browser        -0.01577221 -0.004442355  0.04559192  1.000000000  0.12445574
## VisitorType    -0.11491649 -0.152677804  0.12007658  0.124455742  1.00000000
## Weekend        -0.04651400 -0.062587048  0.01200164 -0.040260864  0.03026170
## Revenue        -0.15067291 -0.207071082  0.49256930  0.023984289  0.09848486
## Pre_Christmas  -0.06084438 -0.085611165  0.06057641  0.005028240  0.10814622
## total_duration -0.19092231 -0.261758762  0.05760192 -0.009598073 -0.11254391
## total_pages    -0.21474806 -0.306776609  0.06221245 -0.015401497 -0.12270805
##                     Weekend     Revenue Pre_Christmas total_duration
## BounceRates    -0.046513997 -0.15067291   -0.06084438   -0.190922308
## ExitRates      -0.062587048 -0.20707108   -0.08561117   -0.261758762
## PageValues      0.012001639  0.49256930    0.06057641    0.057601918
## Browser        -0.040260864  0.02398429    0.00502824   -0.009598073
## VisitorType     0.030261699  0.09848486    0.10814622   -0.112543908
## Weekend         1.000000000  0.02929537    0.02880892    0.009828763
## Revenue         0.029295368  1.00000000    0.12459138    0.156068130
## Pre_Christmas   0.028808922  0.12459138    1.00000000    0.127959623
## total_duration  0.009828763  0.15606813    0.12795962    1.000000000
## total_pages     0.018250423  0.16411016    0.13869970    0.857412966
##                total_pages
## BounceRates    -0.21474806
## ExitRates      -0.30677661
## PageValues      0.06221245
## Browser        -0.01540150
## VisitorType    -0.12270805
## Weekend         0.01825042
## Revenue         0.16411016
## Pre_Christmas   0.13869970
## total_duration  0.85741297
## total_pages     1.00000000

Next I wanted to see if we’d have any correlation with our Browser classes by making dummy variables.

# Load necessary libraries
library(tidyverse)

# Recode the Browser variable based on the given mapping
project_data <- project_data |>
  mutate(Browser_Group = case_when(
    Browser == 2 ~ "Chrome",
    Browser == 1 ~ "Safari",
    Browser == 4 ~ "Firefox",
    TRUE ~ "Other"
  ))

# Convert Browser_Group into dummy variables
project_data <- project_data |>
  mutate_at(vars(Browser_Group), as.factor) |>
  mutate(Browser_Chrome = if_else(Browser_Group == "Chrome", 1, 0),
         Browser_Safari = if_else(Browser_Group == "Safari", 1, 0),
         Browser_Firefox = if_else(Browser_Group == "Firefox", 1, 0),
         Browser_Other = if_else(Browser_Group == "Other", 1, 0))

# Select the relevant columns for correlation, including the dummy variables
cor_data <- project_data |>
  select(-Browser, -Browser_Group) |>
  select_if(is.numeric)

# Calculate the correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")

# Print the correlation matrix
print(cor_matrix)

##                  BounceRates   ExitRates  PageValues VisitorType      Weekend
## BounceRates      1.000000000  0.91300440 -0.11938603 -0.11491649 -0.046513997
## ExitRates        0.913004396  1.00000000 -0.17449831 -0.15267780 -0.062587048
## PageValues      -0.119386026 -0.17449831  1.00000000  0.12007658  0.012001639
## VisitorType     -0.114916489 -0.15267780  0.12007658  1.00000000  0.030261699
## Weekend         -0.046513997 -0.06258705  0.01200164  0.03026170  1.000000000
## Revenue         -0.150672912 -0.20707108  0.49256930  0.09848486  0.029295368
## Pre_Christmas   -0.060844384 -0.08561117  0.06057641  0.10814622  0.028808922
## total_duration  -0.190922308 -0.26175876  0.05760192 -0.11254391  0.009828763
## total_pages     -0.214748058 -0.30677661  0.06221245 -0.12270805  0.018250423
## Browser_Chrome  -0.023001124 -0.03181496 -0.01253733 -0.05555502 -0.026791416
## Browser_Safari   0.041017650  0.03779079 -0.01030120  0.01119674  0.063065051
## Browser_Firefox -0.026170769 -0.01727675  0.01132337  0.02064515 -0.043102884
## Browser_Other    0.002745455  0.01433783  0.02535019  0.05868681 -0.007450396
##                      Revenue Pre_Christmas total_duration  total_pages
## BounceRates     -0.150672912  -0.060844384   -0.190922308 -0.214748058
## ExitRates       -0.207071082  -0.085611165   -0.261758762 -0.306776609
## PageValues       0.492569295   0.060576405    0.057601918  0.062212446
## VisitorType      0.098484857   0.108146216   -0.112543908 -0.122708046
## Weekend          0.029295368   0.028808922    0.009828763  0.018250423
## Revenue          1.000000000   0.124591381    0.156068130  0.164110159
## Pre_Christmas    0.124591381   1.000000000    0.127959623  0.138699696
## total_duration   0.156068130   0.127959623    1.000000000  0.857412966
## total_pages      0.164110159   0.138699696    0.857412966  1.000000000
## Browser_Chrome  -0.004182589   0.006437624    0.081800037  0.093335459
## Browser_Safari  -0.008964876   0.027848059   -0.073482149 -0.080261413
## Browser_Firefox  0.015247028  -0.048306372   -0.007017284 -0.009494028
## Browser_Other    0.006726694  -0.009438492   -0.027589316 -0.035163969
##                 Browser_Chrome Browser_Safari Browser_Firefox Browser_Other
## BounceRates       -0.023001124    0.041017650    -0.026170769   0.002745455
## ExitRates         -0.031814963    0.037790785    -0.017276749   0.014337830
## PageValues        -0.012537332   -0.010301201     0.011323368   0.025350187
## VisitorType       -0.055555023    0.011196742     0.020645147   0.058686814
## Weekend           -0.026791416    0.063065051    -0.043102884  -0.007450396
## Revenue           -0.004182589   -0.008964876     0.015247028   0.006726694
## Pre_Christmas      0.006437624    0.027848059    -0.048306372  -0.009438492
## total_duration     0.081800037   -0.073482149    -0.007017284  -0.027589316
## total_pages        0.093335459   -0.080261413    -0.009494028  -0.035163969
## Browser_Chrome     1.000000000   -0.674252000    -0.340106591  -0.437278977
## Browser_Safari    -0.674252000    1.000000000    -0.125849563  -0.161806239
## Browser_Firefox   -0.340106591   -0.125849563     1.000000000  -0.081618398
## Browser_Other     -0.437278977   -0.161806239    -0.081618398   1.000000000

I found that these browser variables had very little correlative relations with the target so I dropped them, and cleaned the dataset

# Remove the browser dummy variables
project_data_cleaned <- project_data |>
  select(-starts_with("Browser_"), -Browser)  # Adjust based on the dummy variable names

# Select relevant columns for the correlation matrix
cor_data <- project_data_cleaned |>
  select_if(is.numeric)

# Calculate the correlation matrix
cor_matrix <- cor(cor_data, use = "complete.obs")

# Print the correlation matrix
print(cor_matrix)

##                BounceRates   ExitRates  PageValues VisitorType      Weekend
## BounceRates     1.00000000  0.91300440 -0.11938603 -0.11491649 -0.046513997
## ExitRates       0.91300440  1.00000000 -0.17449831 -0.15267780 -0.062587048
## PageValues     -0.11938603 -0.17449831  1.00000000  0.12007658  0.012001639
## VisitorType    -0.11491649 -0.15267780  0.12007658  1.00000000  0.030261699
## Weekend        -0.04651400 -0.06258705  0.01200164  0.03026170  1.000000000
## Revenue        -0.15067291 -0.20707108  0.49256930  0.09848486  0.029295368
## Pre_Christmas  -0.06084438 -0.08561117  0.06057641  0.10814622  0.028808922
## total_duration -0.19092231 -0.26175876  0.05760192 -0.11254391  0.009828763
## total_pages    -0.21474806 -0.30677661  0.06221245 -0.12270805  0.018250423
##                    Revenue Pre_Christmas total_duration total_pages
## BounceRates    -0.15067291   -0.06084438   -0.190922308 -0.21474806
## ExitRates      -0.20707108   -0.08561117   -0.261758762 -0.30677661
## PageValues      0.49256930    0.06057641    0.057601918  0.06221245
## VisitorType     0.09848486    0.10814622   -0.112543908 -0.12270805
## Weekend         0.02929537    0.02880892    0.009828763  0.01825042
## Revenue         1.00000000    0.12459138    0.156068130  0.16411016
## Pre_Christmas   0.12459138    1.00000000    0.127959623  0.13869970
## total_duration  0.15606813    0.12795962    1.000000000  0.85741297
## total_pages     0.16411016    0.13869970    0.857412966  1.00000000

finally i removed the variables of these that had high multicollinearity to get our final 5 variables. (i know 1-4 was the suggestion but I felt all 5 of these were useful for a model)

# Create a new dataset excluding BounceRates and Weekend
project_data_more_clean <- project_data_cleaned |>
  select(-BounceRates, -Weekend, -total_duration)

# Compute and display the updated correlation matrix
cor_matrix_more_clean <- cor(project_data_more_clean |> select_if(is.numeric), use = "complete.obs")
print(cor_matrix_more_clean)

##                 ExitRates  PageValues VisitorType     Revenue Pre_Christmas
## ExitRates      1.00000000 -0.17449831 -0.15267780 -0.20707108   -0.08561117
## PageValues    -0.17449831  1.00000000  0.12007658  0.49256930    0.06057641
## VisitorType   -0.15267780  0.12007658  1.00000000  0.09848486    0.10814622
## Revenue       -0.20707108  0.49256930  0.09848486  1.00000000    0.12459138
## Pre_Christmas -0.08561117  0.06057641  0.10814622  0.12459138    1.00000000
## total_pages   -0.30677661  0.06221245 -0.12270805  0.16411016    0.13869970
##               total_pages
## ExitRates     -0.30677661
## PageValues     0.06221245
## VisitorType   -0.12270805
## Revenue        0.16411016
## Pre_Christmas  0.13869970
## total_pages    1.00000000

MODEL BUILDING AND OPTIMIZATION

First, i’ll just use glm(), and get the general summary numbers

# Ensure that 'Revenue' is treated as a binary factor
project_data_more_clean$Revenue <- as.factor(project_data_more_clean$Revenue)

# Fit the logistic regression model
logistic_model <- glm(Revenue ~ ExitRates + PageValues + VisitorType + Pre_Christmas + total_pages, 
                      data = project_data_more_clean, 
                      family = binomial)

# Print the summary of the model
summary(logistic_model)

## 
## Call:
## glm(formula = Revenue ~ ExitRates + PageValues + VisitorType + 
##     Pre_Christmas + total_pages, family = binomial, data = project_data_more_clean)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -2.652e+00  1.277e-01 -20.757  < 2e-16 ***
## ExitRates     -1.750e+01  1.621e+00 -10.795  < 2e-16 ***
## PageValues     8.082e-02  2.351e-03  34.372  < 2e-16 ***
## VisitorType    2.451e-01  7.889e-02   3.107  0.00189 ** 
## Pre_Christmas  5.954e-01  6.238e-02   9.545  < 2e-16 ***
## total_pages    5.055e-03  5.263e-04   9.605  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 10624.8  on 12329  degrees of freedom
## Residual deviance:  7343.1  on 12324  degrees of freedom
## AIC: 7355.1
## 
## Number of Fisher Scoring iterations: 6

Based on the output of our logistic regression model summary, all variables in the model are statistically significant predictors (p < 0.05). The high coefficients for some variables, like ExitRates and total_pages, suggest they have a strong impact on predicting revenue. However, the negative coefficient for ExitRates implies a lower exit rate is associated with revenue generation, which is intuitive.

Next, we’ll see how accurate our model is at predicting the target and get a confidence matrix, and the accuracy, precision, recall, and F1 scores for the model. We’ll also get the 95% confidence interval

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'lattice'

## The following object is masked from 'package:boot':
## 
##     melanoma

## 
## Attaching package: 'caret'

## The following object is masked from 'package:purrr':
## 
##     lift

library(lattice)

# Split the data into training and test sets
set.seed(12)
train_indices <- createDataPartition(project_data_more_clean$Revenue, p = 0.7, list = FALSE)
train_data <- project_data_more_clean[train_indices, ]
test_data <- project_data_more_clean[-train_indices, ]

# Fit the logistic regression model on the training data
logistic_model <- glm(Revenue ~ ExitRates + PageValues + VisitorType + Pre_Christmas + total_pages, 
                      data = train_data, 
                      family = binomial)

# Make predictions on the test data
predicted_probs <- predict(logistic_model, newdata = test_data, type = "response")
predicted_classes <- ifelse(predicted_probs > 0.5, "1", "0")  # Threshold of 0.5

# Convert predictions and true values to factors for comparison
predicted_classes <- factor(predicted_classes, levels = c("0", "1"))
true_classes <- factor(test_data$Revenue, levels = c("0", "1"))

# Confusion matrix
conf_matrix <- confusionMatrix(predicted_classes, true_classes, positive = "1")
print(conf_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3054  349
##          1   72  223
##                                           
##                Accuracy : 0.8862          
##                  95% CI : (0.8755, 0.8962)
##     No Information Rate : 0.8453          
##     P-Value [Acc > NIR] : 5.469e-13       
##                                           
##                   Kappa : 0.4573          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.38986         
##             Specificity : 0.97697         
##          Pos Pred Value : 0.75593         
##          Neg Pred Value : 0.89744         
##              Prevalence : 0.15468         
##          Detection Rate : 0.06030         
##    Detection Prevalence : 0.07977         
##       Balanced Accuracy : 0.68341         
##                                           
##        'Positive' Class : 1               
##

# Accuracy, Precision, Recall, and F1-score
accuracy <- conf_matrix$overall['Accuracy']
precision <- conf_matrix$byClass['Precision']
recall <- conf_matrix$byClass['Recall']
f1_score <- conf_matrix$byClass['F1']

# Print results
cat("Accuracy:", accuracy, "\n")

## Accuracy: 0.8861547

cat("Precision:", precision, "\n")

## Precision: 0.7559322

cat("Recall:", recall, "\n")

## Recall: 0.3898601

cat("F1 Score:", f1_score, "\n")

## F1 Score: 0.5144175

Here’s what I take away from these values:

Accuracy: The model has a high overall accuracy of 88.5%. However, accuracy alone can be misleading in cases of imbalanced data, which we have here with a higher number of non-revenue (0) instances compared to revenue (1) instances.

Sensitivity (Recall): Almost more importantly, the sensitivity (or recall) for predicting the positive class (Revenue = 1) is quite low at 36.7%. This means that the model is missing a significant portion of actual revenue-generating cases, which is problematic since correctly identifying these cases is my goal.

F1 Score: With an F1 score of 0.50 for the positive class, the model struggles to balance precision and recall. This low F1 score is driven by the model's difficulty in capturing true positives (revenue cases) effectively.

In order to optimize these issues we ran into I will implement weights into the formula, I probably should have done this to begin will based on the significant imbalance in the target.

The weights are calculated based on the inverse of each class’s frequency, balancing the data by assigning more weight to the minority class.

# First, Calculate Weights for Each Class
# Calculate the frequency of each class
class_counts <- table(project_data_more_clean$Revenue)
total_count <- nrow(project_data_more_clean)

# Calculate weights for each class
weight_0 <- total_count / (2 * class_counts[1])  # Weight for class 0 (non-revenue)
weight_1 <- total_count / (2 * class_counts[2])  # Weight for class 1 (revenue)

# Assign weights based on the target variable
project_data_more_clean$weights <- ifelse(project_data_more_clean$Revenue == 0, weight_0, weight_1)

# Run the Weighted Logistic Regression Model
weighted_model <- glm(
  formula = Revenue ~ ExitRates + PageValues + VisitorType + Pre_Christmas + total_pages,
  family = binomial,
  data = project_data_more_clean,
  weights = weights
)

## Warning in eval(family$initialize): non-integer #successes in a binomial glm!

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# View Model Summary
summary(weighted_model)

## 
## Call:
## glm(formula = Revenue ~ ExitRates + PageValues + VisitorType + 
##     Pre_Christmas + total_pages, family = binomial, data = project_data_more_clean, 
##     weights = weights)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -1.493e+00  1.010e-01 -14.782  < 2e-16 ***
## ExitRates     -1.407e+01  1.042e+00 -13.495  < 2e-16 ***
## PageValues     1.192e-01  3.033e-03  39.298  < 2e-16 ***
## VisitorType    2.321e-01  6.409e-02   3.622 0.000293 ***
## Pre_Christmas  7.909e-01  4.892e-02  16.168  < 2e-16 ***
## total_pages    6.958e-03  5.139e-04  13.539  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 17093  on 12329  degrees of freedom
## Residual deviance: 10834  on 12324  degrees of freedom
## AIC: 14355
## 
## Number of Fisher Scoring iterations: 8

# Evaluate Model
# Predict probabilities
predicted_prob <- predict(weighted_model, type = "response")

# Classify based on a 0.5 threshold
predicted_class <- ifelse(predicted_prob > 0.5, 1, 0)

# Generate Confusion Matrix and Evaluation Metrics
# Use caret for evaluation
library(caret)

# Generate confusion matrix
confusion_matrix <- confusionMatrix(
  factor(predicted_class),
  factor(project_data_more_clean$Revenue),
  positive = "1"
)

# Print confusion matrix and evaluation metrics
print(confusion_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 9364  487
##          1 1058 1421
##                                           
##                Accuracy : 0.8747          
##                  95% CI : (0.8687, 0.8805)
##     No Information Rate : 0.8453          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5732          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.7448          
##             Specificity : 0.8985          
##          Pos Pred Value : 0.5732          
##          Neg Pred Value : 0.9506          
##              Prevalence : 0.1547          
##          Detection Rate : 0.1152          
##    Detection Prevalence : 0.2011          
##       Balanced Accuracy : 0.8216          
##                                           
##        'Positive' Class : 1               
##

# Extract the components from the confusion matrix
confusion_data <- confusionMatrix(
  factor(predicted_class),
  factor(project_data_more_clean$Revenue),
  positive = "1"
)

# Extracting precision, recall, and F1 score
precision <- confusion_data$byClass["Pos Pred Value"]  # Precision
recall <- confusion_data$byClass["Sensitivity"]        # Recall
f1_score <- 2 * (precision * recall) / (precision + recall)  # F1 Score

# Print out the results
cat("Precision: ", precision, "\n")

## Precision:  0.573215

cat("Recall: ", recall, "\n")

## Recall:  0.7447589

cat("F1 Score: ", f1_score, "\n")

## F1 Score:  0.6478231

New insights:

The weighted model achieved a recall (sensitivity) of 74.5% compared to 36.7% in the unweighted model. This indicates that the weighted model is much better at correctly identifying positive (revenue-generating) instances, which is essential given the class imbalance.
The F1 score increased from 0.496 in the unweighted model to 0.648 in the weighted model. This improvement reflects a better balance between precision and recall, making the weighted model more effective in identifying relevant revenue-generating instances.
Specificity decreased slightly from 97.9% in the unweighted model to 89.9% in the weighted model. While this trade-off was necessary to improve recall, it means the weighted model is slightly more likely to produce false positives, classifying non-revenue instances as revenue.

DataDive10_mohler

Carson

2024-11-05

Logistic Regression

VARIABLE SELECTION

MODEL BUILDING AND OPTIMIZATION