library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(arules)
## Warning: package 'arules' was built under R version 4.4.3
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Warning: package 'arulesViz' was built under R version 4.4.3
library(cluster)
library(factoextra)
## Warning: package 'factoextra' was built under R version 4.4.3
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.4.2
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(tidyverse)
## Warning: package 'readr' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.0.2 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ Matrix::expand() masks tidyr::expand()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ Matrix::pack() masks tidyr::pack()
## ✖ arules::recode() masks dplyr::recode()
## ✖ Matrix::unpack() masks tidyr::unpack()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(tidytext)
df <- read.csv("https://raw.githubusercontent.com/Jennyjjxxzz/Data-624_HW10/refs/heads/main/GroceryDataSet.csv", stringsAsFactors = FALSE)
# Convert each row to a transaction (remove NAs and blanks)
transactions_list <- apply(df, 1, function(x) na.omit(x[x != ""]))
transactions <- as(transactions_list, "transactions")
summary(transactions)
## transactions as itemMatrix in sparse format with
## 9834 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.0260917
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34051
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1004 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 3.00 4.41 6.00 32.00
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
# Mine rules using the Apriori algorithm
rules <- apriori(transactions, parameter = list(supp = 0.001, conf = 0.5))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9834 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [5668 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
# Sort
top5_rules <- sort(rules, by = "lift", decreasing = TRUE)[1:5]
# Show top 5 rules
inspect(top5_rules)
## lhs rhs support
## [1] {Instant food products, soda} => {hamburger meat} 0.001220256
## [2] {popcorn, soda} => {salty snack} 0.001220256
## [3] {baking powder, flour} => {sugar} 0.001016880
## [4] {ham, processed cheese} => {white bread} 0.001932072
## [5] {Instant food products, whole milk} => {hamburger meat} 0.001525320
## confidence coverage lift count
## [1] 0.6315789 0.001932072 18.99372 12
## [2] 0.6315789 0.001932072 16.69610 12
## [3] 0.5555556 0.001830384 16.40641 10
## [4] 0.6333333 0.003050641 15.04396 19
## [5] 0.5000000 0.003050641 15.03670 15
itemFrequencyPlot(transactions, topN = 20, type = "absolute", horiz = TRUE)
plot(rules[1:20], method = "graph", engine = "htmlwidget")
# Convert transactions to binary item matrix
item_matrix <- as(transactions, "matrix")
# Scale the martix
scaled <- scale(item_matrix)
# Compute total within-cluster sum of squares for k = 1 to 10
wss <- map_dbl(1:10, ~kmeans(scaled, centers = ., nstart = 10)$tot.withinss)
# Plot Elbow Plot
tibble(k = 1:10, wss = wss) %>%
ggplot(aes(k, wss)) +
geom_line() +
geom_point() +
geom_vline(xintercept = 3, linetype = "dashed", color = "red") +
labs(title = "Elbow Plot for Optimal Clusters",
x = "Number of Clusters",
y = "Within-Cluster Sum of Squares") +
theme_minimal()
#### K-Means Clustering on Market Baskets
set.seed(123)
km_res <- kmeans(scaled, centers = 3, nstart = 25)
fviz_cluster(km_res, data = scaled, geom = "point", ellipse.type = "convex") +
labs(title = "K-Means Clustering on Market Baskets")
# Assign cluster labels
cluster_assignments <- km_res$cluster
# Convert transactions to a list and combine with cluster labels
transaction_list <- as(transactions, "list")
transaction_df <- tibble(
transaction_id = seq_along(transaction_list),
items = transaction_list,
cluster = cluster_assignments
)
# Unnest the items into long format
exploded_items <- transaction_df %>%
unnest(items)
# Count top 10 most frequent items in each cluster
top_items_by_cluster <- exploded_items %>%
group_by(cluster, items) %>%
summarise(count = n(), .groups = "drop") %>%
arrange(cluster, desc(count)) %>%
group_by(cluster) %>%
slice_head(n = 10)
# Reorder items
top_items_by_cluster <- top_items_by_cluster %>%
mutate(items = reorder_within(items, count, cluster))
# plot
ggplot(top_items_by_cluster, aes(x = items, y = count, fill = as.factor(cluster))) +
geom_col(show.legend = FALSE) +
scale_x_reordered() +
facet_wrap(~ cluster, scales = "free_y") +
coord_flip() +
labs(
title = "Top 10 Most Sold Products per Cluster",
x = "Product",
y = "Count"
) +
theme_minimal()