# Load the grocery data in basket format (comma-separated items per transaction).
# NOTE(review): absolute, machine-specific path; adjust when running elsewhere.
tr <- read.transactions(
  file = "C:/Users/SeungminSong/Downloads/624R/GroceryDataSet.csv",
  format = "basket",
  sep = ","
)
# Preview the first ten transactions as a one-column data frame of item sets.
head(as(tr, "data.frame"), n = 10)
## items
## 1 {citrus fruit,margarine,ready soups,semi-finished bread}
## 2 {coffee,tropical fruit,yogurt}
## 3 {whole milk}
## 4 {cream cheese,meat spreads,pip fruit,yogurt}
## 5 {condensed milk,long life bakery product,other vegetables,whole milk}
## 6 {abrasive cleaner,butter,rice,whole milk,yogurt}
## 7 {rolls/buns}
## 8 {bottled beer,liquor (appetizer),other vegetables,rolls/buns,UHT-milk}
## 9 {pot plants}
## 10 {cereals,whole milk}
Dataset: Contains 9835 transactions (item sets) and 169 items (columns), and the data density is approximately 0.026.
Most common items: The most frequently appearing items are ‘whole milk’, ‘other vegetables’, ‘rolls/buns’, and ‘soda’, appearing 2513, 1903, 1809, and 1715 times, respectively. ‘yogurt’ appeared 1372 times.
# Overview of the transactions object: dimensions, density, most frequent
# items, and the distribution of transaction sizes.
summary(tr)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
# Print the first five transactions, one item per line.
inspect(tr[seq_len(5)])
## items
## [1] {citrus fruit,
## margarine,
## ready soups,
## semi-finished bread}
## [2] {coffee,
## tropical fruit,
## yogurt}
## [3] {whole milk}
## [4] {cream cheese,
## meat spreads,
## pip fruit,
## yogurt}
## [5] {condensed milk,
## long life bakery product,
## other vegetables,
## whole milk}
# Relative frequency (share of transactions) of every item.
itemFrequency(tr, type = "relative")
## abrasive cleaner artif. sweetener baby cosmetics
## 0.0035587189 0.0032536858 0.0006100661
## baby food bags baking powder
## 0.0001016777 0.0004067107 0.0176919166
## bathroom cleaner beef berries
## 0.0027452974 0.0524656838 0.0332486019
## beverages bottled beer bottled water
## 0.0260294865 0.0805287239 0.1105236401
## brandy brown bread butter
## 0.0041687850 0.0648703610 0.0554143366
## butter milk cake bar candles
## 0.0279613625 0.0132180986 0.0089476360
## candy canned beer canned fish
## 0.0298932384 0.0776817489 0.0150482969
## canned fruit canned vegetables cat food
## 0.0032536858 0.0107778343 0.0232841891
## cereals chewing gum chicken
## 0.0056939502 0.0210472801 0.0429079817
## chocolate chocolate marshmallow citrus fruit
## 0.0496187087 0.0090493137 0.0827656329
## cleaner cling film/bags cocoa drinks
## 0.0050838841 0.0113879004 0.0022369090
## coffee condensed milk cooking chocolate
## 0.0580579563 0.0102694459 0.0025419420
## cookware cream cream cheese
## 0.0027452974 0.0013218099 0.0396542959
## curd curd cheese decalcifier
## 0.0532791052 0.0050838841 0.0015251652
## dental care dessert detergent
## 0.0057956279 0.0371123538 0.0192170819
## dish cleaner dishes dog food
## 0.0104728012 0.0175902389 0.0085409253
## domestic eggs female sanitary products finished products
## 0.0634468734 0.0061006609 0.0065073716
## fish flour flower (seeds)
## 0.0029486528 0.0173868836 0.0103711235
## flower soil/fertilizer frankfurter frozen chicken
## 0.0019318760 0.0589730554 0.0006100661
## frozen dessert frozen fish frozen fruits
## 0.0107778343 0.0116929334 0.0012201322
## frozen meals frozen potato products frozen vegetables
## 0.0283680732 0.0084392476 0.0480935435
## fruit/vegetable juice grapes hair spray
## 0.0722928317 0.0223690900 0.0011184545
## ham hamburger meat hard cheese
## 0.0260294865 0.0332486019 0.0245043213
## herbs honey house keeping products
## 0.0162684291 0.0015251652 0.0083375699
## hygiene articles ice cream instant coffee
## 0.0329435689 0.0250127097 0.0074224708
## Instant food products jam ketchup
## 0.0080325369 0.0053889171 0.0042704626
## kitchen towels kitchen utensil light bulbs
## 0.0059989832 0.0004067107 0.0041687850
## liqueur liquor liquor (appetizer)
## 0.0009150991 0.0110828673 0.0079308592
## liver loaf long life bakery product make up remover
## 0.0050838841 0.0374173869 0.0008134215
## male cosmetics margarine mayonnaise
## 0.0045754957 0.0585663447 0.0091509914
## meat meat spreads misc. beverages
## 0.0258261312 0.0042704626 0.0283680732
## mustard napkins newspapers
## 0.0119979664 0.0523640061 0.0798169802
## nut snack nuts/prunes oil
## 0.0031520081 0.0033553635 0.0280630402
## onions organic products organic sausage
## 0.0310116929 0.0016268429 0.0022369090
## other vegetables packaged fruit/vegetables pasta
## 0.1934926284 0.0130147433 0.0150482969
## pastry pet care photo/film
## 0.0889679715 0.0094560244 0.0092526690
## pickled vegetables pip fruit popcorn
## 0.0178952720 0.0756481952 0.0072191154
## pork pot plants potato products
## 0.0576512456 0.0172852059 0.0028469751
## preservation products processed cheese prosecco
## 0.0002033554 0.0165734621 0.0020335536
## pudding powder ready soups red/blush wine
## 0.0023385867 0.0018301983 0.0192170819
## rice roll products rolls/buns
## 0.0076258261 0.0102694459 0.1839349263
## root vegetables rubbing alcohol rum
## 0.1089984748 0.0010167768 0.0044738180
## salad dressing salt salty snack
## 0.0008134215 0.0107778343 0.0378240976
## sauces sausage seasonal products
## 0.0054905948 0.0939501779 0.0142348754
## semi-finished bread shopping bags skin care
## 0.0176919166 0.0985256736 0.0035587189
## sliced cheese snack products soap
## 0.0245043213 0.0030503305 0.0026436197
## soda soft cheese softener
## 0.1743772242 0.0170818505 0.0054905948
## sound storage medium soups sparkling wine
## 0.0001016777 0.0068124047 0.0055922725
## specialty bar specialty cheese specialty chocolate
## 0.0273512964 0.0085409253 0.0304016268
## specialty fat specialty vegetables spices
## 0.0036603965 0.0017285206 0.0051855618
## spread cheese sugar sweet spreads
## 0.0111845450 0.0338586680 0.0090493137
## syrup tea tidbits
## 0.0032536858 0.0038637519 0.0023385867
## toilet cleaner tropical fruit turkey
## 0.0007117438 0.1049313676 0.0081342145
## UHT-milk vinegar waffles
## 0.0334519573 0.0065073716 0.0384341637
## whipped/sour cream whisky white bread
## 0.0716827656 0.0008134215 0.0420945602
## white wine whole milk yogurt
## 0.0190137265 0.2555160142 0.1395017794
## zwieback
## 0.0069140824
# Bar chart of the ten most frequent items, cycling through colours 1 to 6.
itemFrequencyPlot(
  tr,
  topN = 10,
  col = 1:6,
  xlab = "product",
  ylab = "ratio"
)
Since the number of related rules is large, let’s only look at the results where the number of products in the rule is 2 to 4.
# Mine association rules with at least 2% support and 40% confidence,
# restricted to rules that contain between 2 and 4 items.
rules <- apriori(
  tr,
  parameter = list(
    support = 0.02,
    confidence = 0.4,
    minlen = 2,
    maxlen = 4
  )
)
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.02 2
## maxlen target ext
## 4 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 196
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [59 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
There are a total of 15 association rules, and for each rule one can observe its support, confidence, coverage, lift, and count (the number of transactions that contain all items in the rule).
# List every mined rule with its quality measures
# (support, confidence, coverage, lift, count).
inspect(rules)
## lhs rhs support
## [1] {frozen vegetables} => {whole milk} 0.02043721
## [2] {beef} => {whole milk} 0.02125064
## [3] {curd} => {whole milk} 0.02613116
## [4] {margarine} => {whole milk} 0.02419929
## [5] {butter} => {whole milk} 0.02755465
## [6] {domestic eggs} => {whole milk} 0.02999492
## [7] {whipped/sour cream} => {other vegetables} 0.02887646
## [8] {whipped/sour cream} => {whole milk} 0.03223183
## [9] {tropical fruit} => {whole milk} 0.04229792
## [10] {root vegetables} => {other vegetables} 0.04738180
## [11] {root vegetables} => {whole milk} 0.04890696
## [12] {yogurt} => {whole milk} 0.05602440
## [13] {other vegetables, root vegetables} => {whole milk} 0.02318251
## [14] {root vegetables, whole milk} => {other vegetables} 0.02318251
## [15] {other vegetables, yogurt} => {whole milk} 0.02226741
## confidence coverage lift count
## [1] 0.4249471 0.04809354 1.663094 201
## [2] 0.4050388 0.05246568 1.585180 209
## [3] 0.4904580 0.05327911 1.919481 257
## [4] 0.4131944 0.05856634 1.617098 238
## [5] 0.4972477 0.05541434 1.946053 271
## [6] 0.4727564 0.06344687 1.850203 295
## [7] 0.4028369 0.07168277 2.081924 284
## [8] 0.4496454 0.07168277 1.759754 317
## [9] 0.4031008 0.10493137 1.577595 416
## [10] 0.4347015 0.10899847 2.246605 466
## [11] 0.4486940 0.10899847 1.756031 481
## [12] 0.4016035 0.13950178 1.571735 551
## [13] 0.4892704 0.04738180 1.914833 228
## [14] 0.4740125 0.04890696 2.449770 228
## [15] 0.5128806 0.04341637 2.007235 219
Lift is used to evaluate the interestingness of a rule: the higher the lift, the more likely it is that the rule reflects a meaningful association. However, rules with a low lift can also be useful and important to your marketing strategy if their support and confidence are still high enough.
In this graph, ‘whole milk’ is the central node, expressing its relationships with several other products. For example, the association rule between ‘root vegetables’ and ‘whole milk’ has a lift of about 1.76, which is well above 1. This indicates that customers who purchase ‘root vegetables’ are more likely than usual to also purchase ‘whole milk’.
Other nodes also provide important information. For example, ‘yogurt’ and ‘other vegetables’ are also strongly associated with ‘whole milk’ (the rule {other vegetables, yogurt} => {whole milk} has a lift of about 2.01), indicating that these products are also likely to be purchased together.
# Rank the rules from highest to lowest lift.
rules_sorted_by_lift <- sort(rules, by = "lift", decreasing = TRUE)
# head() returns at most ten rules, so this still works when fewer than ten
# rules were mined (indexing with [1:10] would reach past the end).
top_10_rules_by_lift <- head(rules_sorted_by_lift, n = 10)
inspect(top_10_rules_by_lift)
## lhs rhs support
## [1] {root vegetables, whole milk} => {other vegetables} 0.02318251
## [2] {root vegetables} => {other vegetables} 0.04738180
## [3] {whipped/sour cream} => {other vegetables} 0.02887646
## [4] {other vegetables, yogurt} => {whole milk} 0.02226741
## [5] {butter} => {whole milk} 0.02755465
## [6] {curd} => {whole milk} 0.02613116
## [7] {other vegetables, root vegetables} => {whole milk} 0.02318251
## [8] {domestic eggs} => {whole milk} 0.02999492
## [9] {whipped/sour cream} => {whole milk} 0.03223183
## [10] {root vegetables} => {whole milk} 0.04890696
## confidence coverage lift count
## [1] 0.4740125 0.04890696 2.449770 228
## [2] 0.4347015 0.10899847 2.246605 466
## [3] 0.4028369 0.07168277 2.081924 284
## [4] 0.5128806 0.04341637 2.007235 219
## [5] 0.4972477 0.05541434 1.946053 271
## [6] 0.4904580 0.05327911 1.919481 257
## [7] 0.4892704 0.04738180 1.914833 228
## [8] 0.4727564 0.06344687 1.850203 295
## [9] 0.4496454 0.07168277 1.759754 317
## [10] 0.4486940 0.10899847 1.756031 481
What is important about this analysis is that, among these ten rules, the combination of ‘whole milk’ and ‘root vegetables’ has the highest support (approximately 4.89%, rule [10]), indicating that it is one of the most frequently occurring combinations across all transactions.
# Draw the top rules as an item/rule network graph.
# The recorded output for this call is a dump of the graph engine's available
# control parameters (engine = ggplot2), and neither `type` nor `main` is in
# that list, so both were being ignored. They are dropped here and the title
# is attached to the returned ggplot object instead.
plot(top_10_rules_by_lift, method = "graph") +
  ggplot2::ggtitle("Top 10 Association Rules by Lift")
## Available control parameters (with default values):
## layout = stress
## circular = FALSE
## ggraphdots = NULL
## edges = <environment>
## nodes = <environment>
## nodetext = <environment>
## colors = c("#EE0000FF", "#EEEEEEFF")
## engine = ggplot2
## max = 100
## verbose = FALSE
# Cluster items by their relative purchase frequency (one value per item).
item_matrix <- as.matrix(itemFrequency(tr))

# fviz_nbclust() is handed kmeans, which starts from random centers; seed each
# diagnostic so the elbow (wss) and silhouette plots are reproducible.
set.seed(123)
fviz_nbclust(item_matrix, kmeans, method = "wss")
set.seed(123)
fviz_nbclust(item_matrix, kmeans, method = "silhouette")

# Re-seed right before the final fit so it does not depend on the plots above.
set.seed(123)
kmeans_result <- kmeans(item_matrix, centers = 3)
# head() on the kmeans result list prints its first six components.
head(kmeans_result)
## $cluster
## abrasive cleaner artif. sweetener baby cosmetics
## 3 3 3
## baby food bags baking powder
## 3 3 3
## bathroom cleaner beef berries
## 3 1 3
## beverages bottled beer bottled water
## 3 1 1
## brandy brown bread butter
## 3 1 1
## butter milk cake bar candles
## 3 3 3
## candy canned beer canned fish
## 3 1 3
## canned fruit canned vegetables cat food
## 3 3 3
## cereals chewing gum chicken
## 3 3 1
## chocolate chocolate marshmallow citrus fruit
## 1 3 1
## cleaner cling film/bags cocoa drinks
## 3 3 3
## coffee condensed milk cooking chocolate
## 1 3 3
## cookware cream cream cheese
## 3 3 3
## curd curd cheese decalcifier
## 1 3 3
## dental care dessert detergent
## 3 3 3
## dish cleaner dishes dog food
## 3 3 3
## domestic eggs female sanitary products finished products
## 1 3 3
## fish flour flower (seeds)
## 3 3 3
## flower soil/fertilizer frankfurter frozen chicken
## 3 1 3
## frozen dessert frozen fish frozen fruits
## 3 3 3
## frozen meals frozen potato products frozen vegetables
## 3 3 1
## fruit/vegetable juice grapes hair spray
## 1 3 3
## ham hamburger meat hard cheese
## 3 3 3
## herbs honey house keeping products
## 3 3 3
## hygiene articles ice cream instant coffee
## 3 3 3
## Instant food products jam ketchup
## 3 3 3
## kitchen towels kitchen utensil light bulbs
## 3 3 3
## liqueur liquor liquor (appetizer)
## 3 3 3
## liver loaf long life bakery product make up remover
## 3 3 3
## male cosmetics margarine mayonnaise
## 3 1 3
## meat meat spreads misc. beverages
## 3 3 3
## mustard napkins newspapers
## 3 1 1
## nut snack nuts/prunes oil
## 3 3 3
## onions organic products organic sausage
## 3 3 3
## other vegetables packaged fruit/vegetables pasta
## 2 3 3
## pastry pet care photo/film
## 1 3 3
## pickled vegetables pip fruit popcorn
## 3 1 3
## pork pot plants potato products
## 1 3 3
## preservation products processed cheese prosecco
## 3 3 3
## pudding powder ready soups red/blush wine
## 3 3 3
## rice roll products rolls/buns
## 3 3 2
## root vegetables rubbing alcohol rum
## 1 3 3
## salad dressing salt salty snack
## 3 3 3
## sauces sausage seasonal products
## 3 1 3
## semi-finished bread shopping bags skin care
## 3 1 3
## sliced cheese snack products soap
## 3 3 3
## soda soft cheese softener
## 2 3 3
## sound storage medium soups sparkling wine
## 3 3 3
## specialty bar specialty cheese specialty chocolate
## 3 3 3
## specialty fat specialty vegetables spices
## 3 3 3
## spread cheese sugar sweet spreads
## 3 3 3
## syrup tea tidbits
## 3 3 3
## toilet cleaner tropical fruit turkey
## 3 1 3
## UHT-milk vinegar waffles
## 3 3 3
## whipped/sour cream whisky white bread
## 1 3 1
## white wine whole milk yogurt
## 3 2 2
## zwieback
## 3
##
## $centers
## [,1]
## 1 0.07052289
## 2 0.18936451
## 3 0.01137603
##
## $totss
## [1] 0.2494903
##
## $withinss
## [1] 0.010727637 0.007133454 0.015370090
##
## $tot.withinss
## [1] 0.03323118
##
## $betweenss
## [1] 0.2162591
# Cluster transactions by basket size (number of items per transaction).
# The ngCMatrix form of `tr` is items x transactions, so per-transaction totals
# are column sums; row sums would give per-item counts instead.
sparse_matrix <- as(tr, "ngCMatrix")
transaction_sums <- unname(colSums(sparse_matrix))

# Seed immediately before kmeans() so its random starting centers are
# reproducible.
set.seed(123)
km_result <- kmeans(matrix(transaction_sums, ncol = 1), centers = 3)

# One row per transaction: its position in `tr` and its assigned cluster.
cluster_df <- data.frame(
  TransactionID = seq_along(transaction_sums),
  Cluster = km_result$cluster
)
# NOTE: output recorded from the earlier rowSums() version shows item names as
# row labels; re-run this chunk to refresh it.
head(cluster_df)
## TransactionID Cluster
## abrasive cleaner 1 3
## artif. sweetener 2 3
## baby cosmetics 3 3
## baby food 4 3
## bags 5 3
## baking powder 6 3
The graph represents a two-dimensional visualization of k-means clustering on transaction data. Dimensionality reduction was performed with a truncated singular value decomposition (a PCA-like projection computed with irlba), which represents each transaction by its first two components (labeled PC1 and PC2). As a result, the transaction data was projected into a new two-dimensional space.
# Project the transactions onto two latent dimensions with a truncated SVD.
# sparse_matrix is items x transactions, so the right singular vectors ($v)
# hold one row per transaction.
# irlba() starts from a random vector, so seed it for a reproducible embedding.
set.seed(123)
svd_res <- irlba(sparse_matrix, nv = 2)
transaction_coords <- svd_res$v

# Re-seed so kmeans() starts from the same RNG state it had before this change.
set.seed(123)
km_res <- kmeans(transaction_coords, centers = 3)

cluster_df <- data.frame(
  PC1 = transaction_coords[, 1],
  PC2 = transaction_coords[, 2],
  Cluster = km_res$cluster
)

# Scatter plot of the two components, coloured by k-means cluster.
ggplot(cluster_df, aes(x = PC1, y = PC2, color = as.factor(Cluster))) +
  geom_point(alpha = 0.5) +
  scale_color_manual(values = c("red", "blue", "green")) +
  theme_minimal() +
  labs(color = "Cluster") +
  ggtitle("2D Visualization of K-Means Clustering on Transaction Data")