library(tidyverse)
library(ggthemes)
library(arules)
library(igraph)
library(arulesViz)
library(skimr)
library(gt)The glimpse function provides some insight into the dataset. The missing values are to be expecting due to the nature of receipt data.
## Rows: 9,834
## Columns: 32
## $ `citrus fruit` <chr> "tropical fruit", "whole milk", "pip fruit", ...
## $ `semi-finished bread` <chr> "yogurt", NA, "yogurt", "whole milk", "butter...
## $ margarine <chr> "coffee", NA, "cream cheese", "condensed milk...
## $ `ready soups` <chr> NA, NA, "meat spreads", "long life bakery pro...
## $ X5 <chr> NA, NA, NA, NA, "abrasive cleaner", NA, "liqu...
## $ X6 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "yogu...
## $ X7 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "flou...
## $ X8 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "bott...
## $ X9 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, "dish...
## $ X10 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X11 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X12 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X13 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X14 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X15 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X16 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X17 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X18 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X19 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X20 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X21 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X22 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X23 <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X24 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X25 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X26 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X27 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X28 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X29 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X30 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X31 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
## $ X32 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, N...
The top 10 items are set forth below. Milk is the most popular item, other vegtables and rolls/buns take the silver and broze metals.
transactions <- read.transactions("GroceryDataSet.csv", sep=",")
itemFrequencyPlot(transactions, topN=10, type="absolute", main="Top 10 Items",col="#35b0ab") The top 10 rules and there related metrics are set forth in the heat map table and graph below.
rules<- apriori(df, parameter=list(supp=0.001, conf=0.5) , control=list(verbose=FALSE))
rules %>%
DATAFRAME() %>%
arrange(desc(lift)) %>%
top_n(10) %>%
gt() %>%
data_color(
columns = vars(lift,coverage,confidence,support),
colors = scales::col_numeric(
# custom defined values - notice that order matters!
palette = c("#ffffff", "#f2fbd2", "#c9ecb4", "#93d3ab", "#35b0ab"),
domain = NULL
)
) %>%
fmt_number(
columns = vars(support, confidence, coverage, lift),
decimals = 4
) %>%
cols_align(
align = "left",
columns = vars(LHS, RHS)
) %>%
tab_style(
style = list(
cell_borders(
sides = "left",
color = "black",
weight = px(3)
)
),
locations = list(
cells_body(
columns = vars(support)
)
)
) %>%
tab_header(
title = md("**Top 10 Associative Rules**"),
subtitle = "By Lift, Support, Confidence and Coverage"
)| Top 10 Associative Rules | ||||||
|---|---|---|---|---|---|---|
| By Lift, Support, Confidence and Coverage | ||||||
| LHS | RHS | support | confidence | coverage | lift | count |
| {X6=whole milk} | {X5=other vegetables} | 0.0070 | 0.7931 | 0.0088 | 64.9948 | 69 |
| {X5=other vegetables} | {X6=whole milk} | 0.0070 | 0.5750 | 0.0122 | 64.9948 | 69 |
| {X5=whole milk} | {ready soups=other vegetables} | 0.0106 | 0.6980 | 0.0152 | 27.0236 | 104 |
| {semi-finished bread=root vegetables,ready soups=whole milk} | {margarine=other vegetables} | 0.0052 | 0.8361 | 0.0062 | 19.8117 | 51 |
| {semi-finished bread=sausage} | {citrus fruit=frankfurter} | 0.0101 | 1.0000 | 0.0101 | 16.9552 | 99 |
| {margarine=pip fruit} | {semi-finished bread=tropical fruit} | 0.0079 | 0.5306 | 0.0149 | 14.6987 | 78 |
| {ready soups=butter} | {margarine=whole milk} | 0.0061 | 0.6667 | 0.0092 | 12.9565 | 60 |
| {ready soups=whole milk} | {margarine=other vegetables} | 0.0173 | 0.5397 | 0.0320 | 12.7885 | 170 |
| {ready soups=curd} | {margarine=whole milk} | 0.0052 | 0.5862 | 0.0088 | 11.3928 | 51 |
| {margarine=curd} | {semi-finished bread=whole milk} | 0.0052 | 0.5100 | 0.0102 | 7.6687 | 51 |
The cluster analysis depicted by the Dendrogram for items below is consistent with the Top 10 Associative Rules above - Whole Milk and other vegtables form the major cluster.
df2 <- transactions <- read.transactions("GroceryDataSet.csv", sep=",")
df2 <- df2[ , itemFrequency(df2) > 0.04]
d_jaccard <- dissimilarity(df2, which = "items")
# plot dendrogram
plot(hclust(d_jaccard, method = "ward.D2"),
main = "Dendrogram for items", sub = "", xlab = "")