library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Load the online shoppers dataset from the working directory into a base
# data.frame (read.csv, not readr::read_csv, so the result is not a tibble).
project_data <- read.csv("online_shoppers_intention.csv")

Documentation

Head:

# Preview the first six rows (head()'s default) across all columns.
head(project_data)
##   Administrative Administrative_Duration Informational Informational_Duration
## 1              0                       0             0                      0
## 2              0                       0             0                      0
## 3              0                       0             0                      0
## 4              0                       0             0                      0
## 5              0                       0             0                      0
## 6              0                       0             0                      0
##   ProductRelated ProductRelated_Duration BounceRates ExitRates PageValues
## 1              1                0.000000  0.20000000 0.2000000          0
## 2              2               64.000000  0.00000000 0.1000000          0
## 3              1                0.000000  0.20000000 0.2000000          0
## 4              2                2.666667  0.05000000 0.1400000          0
## 5             10              627.500000  0.02000000 0.0500000          0
## 6             19              154.216667  0.01578947 0.0245614          0
##   SpecialDay Month OperatingSystems Browser Region TrafficType
## 1          0   Feb                1       1      1           1
## 2          0   Feb                2       2      1           2
## 3          0   Feb                4       1      9           3
## 4          0   Feb                3       2      2           4
## 5          0   Feb                3       3      1           4
## 6          0   Feb                2       2      1           3
##         VisitorType Weekend Revenue
## 1 Returning_Visitor   FALSE   FALSE
## 2 Returning_Visitor   FALSE   FALSE
## 3 Returning_Visitor   FALSE   FALSE
## 4 Returning_Visitor   FALSE   FALSE
## 5 Returning_Visitor    TRUE   FALSE
## 6 Returning_Visitor   FALSE   FALSE

Variable List:

# List every column name in the dataset.
names(project_data)
##  [1] "Administrative"          "Administrative_Duration"
##  [3] "Informational"           "Informational_Duration" 
##  [5] "ProductRelated"          "ProductRelated_Duration"
##  [7] "BounceRates"             "ExitRates"              
##  [9] "PageValues"              "SpecialDay"             
## [11] "Month"                   "OperatingSystems"       
## [13] "Browser"                 "Region"                 
## [15] "TrafficType"             "VisitorType"            
## [17] "Weekend"                 "Revenue"

So I did some further investigation and found info for these variables that I had confusion on:

Visualization Analysis

I will use graphs and charts to try to learn more about these four variables (Browser, OperatingSystems, TrafficType, and Region):

  • First, I will make a pie chart of the distribution of the Browser variable and compare it with a chart of real-world browser usage to see whether the two match:
# Frequency of each anonymised Browser code
browser_counts <- table(project_data$Browser)

# Share of each code as a percentage, rounded to one decimal place
browser_pct <- round(100 * prop.table(browser_counts), 1)

# Start with blank labels and fill in percentages only for codes 1, 2, 4
# and 5, so the small slices stay unlabelled
labels <- rep("", length(browser_counts))
shown <- names(browser_counts) %in% c("1", "2", "4", "5")
labels[shown] <- paste(browser_pct[shown], "%")

# Draw the pie chart, one rainbow colour per Browser code
pie(
  browser_counts,
  labels = labels,
  main = "Distribution of Browser Variable",
  col = rainbow(length(browser_counts))
)

  • Now compare it to this chart I found of the real-world distribution of browsers (source: stockton.edu).

  • I feel these are close enough for us to infer the identity of the 4 largest classes: Chrome (2), Safari (1), Firefox (4), and Edge (5)

Next, I’ll look at OperatingSystems:

# Count rows per operating-system code. The column is named `OperatingSystems`;
# spell it out in full rather than relying on `$` partial matching, which only
# works on base data.frames and returns NULL on a tibble.
table(project_data$OperatingSystems)
## 
##    1    2    3    4    5    6    7    8 
## 2585 6601 2555  478    6   19    7   79
# Frequency of each anonymised operating-system code. Use the exact column
# name `OperatingSystems`: `$OperatingSystem` only resolves through partial
# matching on a base data.frame and would yield NULL on a tibble.
os_counts <- table(project_data$OperatingSystems)

# Create labels for the pie chart, only showing percentages for 1, 2, 3, and 4
# (the remaining codes are too small to label legibly)
os_labels <- ifelse(names(os_counts) %in% c("1", "2", "3", "4"),
                    paste(round(prop.table(os_counts) * 100, 1), "%"), "")

# Create a pie chart with the specified labels, one rainbow colour per code
pie(os_counts,
    labels = os_labels,  # Only shows % for 1, 2, 3, and 4
    main = "Distribution of Operating System Variable",
    col = rainbow(length(os_counts)))

  • Compared to real data:

  • It doesn’t match any of the data I’m finding online, so it will be difficult to assume anything other than that the largest group (2) is Windows.

Next we’ll try to determine some of the traffic types:

  • First I’ll look at the distribution
# Count rows per TrafficType code.
table(project_data$TrafficType)
## 
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2451 3913 2052 1069  260  444   40  343   42  450  247    1  738   13   38    3 
##   17   18   19   20 
##    1   10   17  198
  • If I combine all the page-visit variables into one variable giving the total page count, and then average that total within each traffic type, maybe I’ll find a class that is always 1 page (or very low), which could lead me to conclude that those users found our site through some kind of direct purchase link that brought them right to the product they wanted. Conversely, if some classes are significantly higher than the rest, we could conclude that they represent more organic leads.
# Total pages per row: administrative + informational + product-related pages
project_data$TotalPagesVisited <- with(
  project_data,
  Administrative + Informational + ProductRelated
)

# Mean total page count within each TrafficType code
avg_pages_by_traffic <- aggregate(
  TotalPagesVisited ~ TrafficType,
  data = project_data,
  FUN = mean
)

# Display the per-traffic-type averages
print(avg_pages_by_traffic)
##    TrafficType TotalPagesVisited
## 1            1          34.20441
## 2            2          41.75415
## 3            3          27.90789
## 4            4          31.37325
## 5            5          21.84231
## 6            6          32.09459
## 7            7          32.60000
## 8            8          29.49854
## 9            9          17.02381
## 10          10          35.75111
## 11          11          27.34413
## 12          12           3.00000
## 13          13          35.10298
## 14          14          85.07692
## 15          15          17.92105
## 16          16          18.33333
## 17          17           4.00000
## 18          18          16.30000
## 19          19          41.35294
## 20          20          22.47980
  • Based on this, it will be tough to make any assumptions about traffic type. Unless there are strong correlations with our target variable, or with another variable I’m not considering, this likely makes the variable useless.

Now I’ll see if I can find out anything about the Region variable:

  • first, the distribution:
# Count rows per Region code.
table(project_data$Region)
## 
##    1    2    3    4    5    6    7    8    9 
## 4780 1136 2403 1182  318  805  761  434  511
  • Next, let’s see how often a purchase is made in each region and graph the result with a stacked bar chart:
# Contingency table of Revenue (rows: "FALSE"/"TRUE") by Region (columns)
region_revenue_counts <- table(project_data$Revenue, project_data$Region)

# Percentage of rows in each region with Revenue == FALSE. The row is selected
# by name rather than by position, so the labels cannot silently describe the
# wrong outcome if the row order ever differs.
false_percents <- prop.table(region_revenue_counts, margin = 2)["FALSE", ] * 100

# Stacked bar plot of Region by Revenue; barplot() returns the x midpoints of
# the bars, which are reused below to position the text labels.
bar_positions <- barplot(region_revenue_counts,
                         main = "Distribution of Region by Revenue",
                         xlab = "Region",
                         ylab = "Count",
                         # Colours follow row order: first row, then second
                         col = c("lightcoral", "lightblue"),
                         # Full argument name; `legend =` only worked through
                         # partial argument matching
                         legend.text = rownames(region_revenue_counts),
                         beside = FALSE)  # Stacked bars

# Write the FALSE percentage at the vertical midpoint of the FALSE segment,
# which is the bottom segment of each stacked bar
text(bar_positions, region_revenue_counts["FALSE", ] / 2,
     labels = paste(round(false_percents, 1), "%"),
     col = "black", cex = 0.8)

  • Our most frequent region has a slightly higher success rate than the other regions, but I still don’t think this provides enough insight to be useful.

Conclusion