Homework 10

# Load the grocery baskets: one transaction per line, items separated by commas.
tr <- read.transactions(
  "C:/Users/SeungminSong/Downloads/624R/GroceryDataSet.csv",
  format = "basket",
  sep = ","
)
# Preview the first 10 baskets as a data frame.
head(as(tr, "data.frame"), 10)
##                                                                     items
## 1                {citrus fruit,margarine,ready soups,semi-finished bread}
## 2                                          {coffee,tropical fruit,yogurt}
## 3                                                            {whole milk}
## 4                            {cream cheese,meat spreads,pip fruit,yogurt}
## 5   {condensed milk,long life bakery product,other vegetables,whole milk}
## 6                        {abrasive cleaner,butter,rice,whole milk,yogurt}
## 7                                                            {rolls/buns}
## 8  {bottled beer,liquor (appetizer),other vegetables,rolls/buns,UHT-milk}
## 9                                                            {pot plants}
## 10                                                   {cereals,whole milk}

Dataset: Contains 9835 transactions (item sets) and 169 items (columns), and the data density is approximately 0.026.

Most common items: The most frequently appearing items are ‘whole milk’, ‘other vegetables’, ‘rolls/buns’, and ‘soda’, appearing 2513, 1903, 1809, and 1715 times, respectively. ‘yogurt’ appeared 1372 times.

# Summarise the transaction set: dimensions, density, most frequent items,
# and the distribution of basket sizes.
summary(tr)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics
# Print the items of the first five transactions.
inspect(tr[1:5])
##     items                      
## [1] {citrus fruit,             
##      margarine,                
##      ready soups,              
##      semi-finished bread}      
## [2] {coffee,                   
##      tropical fruit,           
##      yogurt}                   
## [3] {whole milk}               
## [4] {cream cheese,             
##      meat spreads,             
##      pip fruit,                
##      yogurt}                   
## [5] {condensed milk,           
##      long life bakery product, 
##      other vegetables,         
##      whole milk}
# Relative frequency of every item (share of transactions that contain it).
itemFrequency(tr)
##          abrasive cleaner          artif. sweetener            baby cosmetics 
##              0.0035587189              0.0032536858              0.0006100661 
##                 baby food                      bags             baking powder 
##              0.0001016777              0.0004067107              0.0176919166 
##          bathroom cleaner                      beef                   berries 
##              0.0027452974              0.0524656838              0.0332486019 
##                 beverages              bottled beer             bottled water 
##              0.0260294865              0.0805287239              0.1105236401 
##                    brandy               brown bread                    butter 
##              0.0041687850              0.0648703610              0.0554143366 
##               butter milk                  cake bar                   candles 
##              0.0279613625              0.0132180986              0.0089476360 
##                     candy               canned beer               canned fish 
##              0.0298932384              0.0776817489              0.0150482969 
##              canned fruit         canned vegetables                  cat food 
##              0.0032536858              0.0107778343              0.0232841891 
##                   cereals               chewing gum                   chicken 
##              0.0056939502              0.0210472801              0.0429079817 
##                 chocolate     chocolate marshmallow              citrus fruit 
##              0.0496187087              0.0090493137              0.0827656329 
##                   cleaner           cling film/bags              cocoa drinks 
##              0.0050838841              0.0113879004              0.0022369090 
##                    coffee            condensed milk         cooking chocolate 
##              0.0580579563              0.0102694459              0.0025419420 
##                  cookware                     cream              cream cheese 
##              0.0027452974              0.0013218099              0.0396542959 
##                      curd               curd cheese               decalcifier 
##              0.0532791052              0.0050838841              0.0015251652 
##               dental care                   dessert                 detergent 
##              0.0057956279              0.0371123538              0.0192170819 
##              dish cleaner                    dishes                  dog food 
##              0.0104728012              0.0175902389              0.0085409253 
##             domestic eggs  female sanitary products         finished products 
##              0.0634468734              0.0061006609              0.0065073716 
##                      fish                     flour            flower (seeds) 
##              0.0029486528              0.0173868836              0.0103711235 
##    flower soil/fertilizer               frankfurter            frozen chicken 
##              0.0019318760              0.0589730554              0.0006100661 
##            frozen dessert               frozen fish             frozen fruits 
##              0.0107778343              0.0116929334              0.0012201322 
##              frozen meals    frozen potato products         frozen vegetables 
##              0.0283680732              0.0084392476              0.0480935435 
##     fruit/vegetable juice                    grapes                hair spray 
##              0.0722928317              0.0223690900              0.0011184545 
##                       ham            hamburger meat               hard cheese 
##              0.0260294865              0.0332486019              0.0245043213 
##                     herbs                     honey    house keeping products 
##              0.0162684291              0.0015251652              0.0083375699 
##          hygiene articles                 ice cream            instant coffee 
##              0.0329435689              0.0250127097              0.0074224708 
##     Instant food products                       jam                   ketchup 
##              0.0080325369              0.0053889171              0.0042704626 
##            kitchen towels           kitchen utensil               light bulbs 
##              0.0059989832              0.0004067107              0.0041687850 
##                   liqueur                    liquor        liquor (appetizer) 
##              0.0009150991              0.0110828673              0.0079308592 
##                liver loaf  long life bakery product           make up remover 
##              0.0050838841              0.0374173869              0.0008134215 
##            male cosmetics                 margarine                mayonnaise 
##              0.0045754957              0.0585663447              0.0091509914 
##                      meat              meat spreads           misc. beverages 
##              0.0258261312              0.0042704626              0.0283680732 
##                   mustard                   napkins                newspapers 
##              0.0119979664              0.0523640061              0.0798169802 
##                 nut snack               nuts/prunes                       oil 
##              0.0031520081              0.0033553635              0.0280630402 
##                    onions          organic products           organic sausage 
##              0.0310116929              0.0016268429              0.0022369090 
##          other vegetables packaged fruit/vegetables                     pasta 
##              0.1934926284              0.0130147433              0.0150482969 
##                    pastry                  pet care                photo/film 
##              0.0889679715              0.0094560244              0.0092526690 
##        pickled vegetables                 pip fruit                   popcorn 
##              0.0178952720              0.0756481952              0.0072191154 
##                      pork                pot plants           potato products 
##              0.0576512456              0.0172852059              0.0028469751 
##     preservation products          processed cheese                  prosecco 
##              0.0002033554              0.0165734621              0.0020335536 
##            pudding powder               ready soups            red/blush wine 
##              0.0023385867              0.0018301983              0.0192170819 
##                      rice             roll products                rolls/buns 
##              0.0076258261              0.0102694459              0.1839349263 
##           root vegetables           rubbing alcohol                       rum 
##              0.1089984748              0.0010167768              0.0044738180 
##            salad dressing                      salt               salty snack 
##              0.0008134215              0.0107778343              0.0378240976 
##                    sauces                   sausage         seasonal products 
##              0.0054905948              0.0939501779              0.0142348754 
##       semi-finished bread             shopping bags                 skin care 
##              0.0176919166              0.0985256736              0.0035587189 
##             sliced cheese            snack products                      soap 
##              0.0245043213              0.0030503305              0.0026436197 
##                      soda               soft cheese                  softener 
##              0.1743772242              0.0170818505              0.0054905948 
##      sound storage medium                     soups            sparkling wine 
##              0.0001016777              0.0068124047              0.0055922725 
##             specialty bar          specialty cheese       specialty chocolate 
##              0.0273512964              0.0085409253              0.0304016268 
##             specialty fat      specialty vegetables                    spices 
##              0.0036603965              0.0017285206              0.0051855618 
##             spread cheese                     sugar             sweet spreads 
##              0.0111845450              0.0338586680              0.0090493137 
##                     syrup                       tea                   tidbits 
##              0.0032536858              0.0038637519              0.0023385867 
##            toilet cleaner            tropical fruit                    turkey 
##              0.0007117438              0.1049313676              0.0081342145 
##                  UHT-milk                   vinegar                   waffles 
##              0.0334519573              0.0065073716              0.0384341637 
##        whipped/sour cream                    whisky               white bread 
##              0.0716827656              0.0008134215              0.0420945602 
##                white wine                whole milk                    yogurt 
##              0.0190137265              0.2555160142              0.1395017794 
##                  zwieback 
##              0.0069140824
# Bar chart of the ten most frequent items.
itemFrequencyPlot(
  tr,
  topN = 10,
  col = 1:6,
  xlab = "product",
  ylab = "ratio"
)

Since the number of possible rules is large, let’s only look at rules that contain 2 to 4 products (minlen = 2, maxlen = 4) and that have a support of at least 0.02 and a confidence of at least 0.4.

# Mine association rules with 2 to 4 items, support >= 0.02 and confidence >= 0.4.
rule_params <- list(
  support = 0.02,
  confidence = 0.4,
  minlen = 2,
  maxlen = 4
)
rules <- apriori(tr, parameter = rule_params)
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.02      2
##  maxlen target  ext
##       4  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 196 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [59 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

A total of 15 association rules were generated. For each rule we can inspect its support, confidence, coverage, lift, and count (the number of transactions that contain all items of the rule).

# List every mined rule together with its quality measures.
inspect(rules)
##      lhs                                    rhs                support   
## [1]  {frozen vegetables}                 => {whole milk}       0.02043721
## [2]  {beef}                              => {whole milk}       0.02125064
## [3]  {curd}                              => {whole milk}       0.02613116
## [4]  {margarine}                         => {whole milk}       0.02419929
## [5]  {butter}                            => {whole milk}       0.02755465
## [6]  {domestic eggs}                     => {whole milk}       0.02999492
## [7]  {whipped/sour cream}                => {other vegetables} 0.02887646
## [8]  {whipped/sour cream}                => {whole milk}       0.03223183
## [9]  {tropical fruit}                    => {whole milk}       0.04229792
## [10] {root vegetables}                   => {other vegetables} 0.04738180
## [11] {root vegetables}                   => {whole milk}       0.04890696
## [12] {yogurt}                            => {whole milk}       0.05602440
## [13] {other vegetables, root vegetables} => {whole milk}       0.02318251
## [14] {root vegetables, whole milk}       => {other vegetables} 0.02318251
## [15] {other vegetables, yogurt}          => {whole milk}       0.02226741
##      confidence coverage   lift     count
## [1]  0.4249471  0.04809354 1.663094 201  
## [2]  0.4050388  0.05246568 1.585180 209  
## [3]  0.4904580  0.05327911 1.919481 257  
## [4]  0.4131944  0.05856634 1.617098 238  
## [5]  0.4972477  0.05541434 1.946053 271  
## [6]  0.4727564  0.06344687 1.850203 295  
## [7]  0.4028369  0.07168277 2.081924 284  
## [8]  0.4496454  0.07168277 1.759754 317  
## [9]  0.4031008  0.10493137 1.577595 416  
## [10] 0.4347015  0.10899847 2.246605 466  
## [11] 0.4486940  0.10899847 1.756031 481  
## [12] 0.4016035  0.13950178 1.571735 551  
## [13] 0.4892704  0.04738180 1.914833 228  
## [14] 0.4740125  0.04890696 2.449770 228  
## [15] 0.5128806  0.04341637 2.007235 219

Lift is used to evaluate the interestingness of a rule: a lift above 1 means the items are bought together more often than would be expected if they were independent, and a higher lift indicates a stronger association. However, rules with a lower lift can also be useful and important to a marketing strategy if their support and confidence are still high enough.

In the graph of the top 10 rules by lift (plotted below), ‘whole milk’ is the central node, expressing its relationships with several other products. For example, the association rule between ‘root vegetables’ and ‘whole milk’ has a lift of about 1.76, which is greater than 1. This indicates that customers who purchase ‘root vegetables’ are more likely than the average customer to also purchase ‘whole milk’.

Other nodes also provide important information. For example, ‘yogurt’ and ‘other vegetables’ are also strongly associated with ‘whole milk’ (the rule {other vegetables, yogurt} => {whole milk} has the highest confidence of all rules, about 0.51), indicating that these products are also likely to be purchased together.

# Rank the rules from highest to lowest lift.
rules_sorted_by_lift <- sort(rules, by = "lift", decreasing = TRUE)

# Keep at most the ten strongest rules. head() is used instead of `[1:10]`
# so the script still works when apriori() returns fewer than ten rules
# (indexing past the end of the rule set is an error).
top_10_rules_by_lift <- head(rules_sorted_by_lift, n = 10)

inspect(top_10_rules_by_lift)
##      lhs                                    rhs                support   
## [1]  {root vegetables, whole milk}       => {other vegetables} 0.02318251
## [2]  {root vegetables}                   => {other vegetables} 0.04738180
## [3]  {whipped/sour cream}                => {other vegetables} 0.02887646
## [4]  {other vegetables, yogurt}          => {whole milk}       0.02226741
## [5]  {butter}                            => {whole milk}       0.02755465
## [6]  {curd}                              => {whole milk}       0.02613116
## [7]  {other vegetables, root vegetables} => {whole milk}       0.02318251
## [8]  {domestic eggs}                     => {whole milk}       0.02999492
## [9]  {whipped/sour cream}                => {whole milk}       0.03223183
## [10] {root vegetables}                   => {whole milk}       0.04890696
##      confidence coverage   lift     count
## [1]  0.4740125  0.04890696 2.449770 228  
## [2]  0.4347015  0.10899847 2.246605 466  
## [3]  0.4028369  0.07168277 2.081924 284  
## [4]  0.5128806  0.04341637 2.007235 219  
## [5]  0.4972477  0.05541434 1.946053 271  
## [6]  0.4904580  0.05327911 1.919481 257  
## [7]  0.4892704  0.04738180 1.914833 228  
## [8]  0.4727564  0.06344687 1.850203 295  
## [9]  0.4496454  0.07168277 1.759754 317  
## [10] 0.4486940  0.10899847 1.756031 481

An important result of this analysis is that, among the top 10 rules by lift, the rule {root vegetables} => {whole milk} has the highest support (approximately 4.89%, rule [10]), indicating that it is one of the most frequently occurring rules across all transactions.

# Draw the top rules as a graph of items and rules. The default ggplot2 engine
# does not accept `type` or `main` as control parameters (passing them only
# triggers the "Available control parameters" listing and the title is not
# applied), so the title is added to the returned plot object instead.
plot(top_10_rules_by_lift, method = "graph") +
  ggplot2::ggtitle("Top 10 Association Rules by Lift")

Extra Credit

K-means clustering

# One-column matrix of relative item frequencies (one row per item).
item_matrix <- as.matrix(itemFrequency(tr))

# fviz_nbclust() fits kmeans() for a range of k from random starting centers,
# so seed the RNG first to make the elbow and silhouette plots reproducible.
set.seed(123)
fviz_nbclust(item_matrix, kmeans, method = "wss")

fviz_nbclust(item_matrix, kmeans, method = "silhouette")

# Re-seed immediately before the final fit so its result does not depend on
# how many random numbers the diagnostics above consumed.
set.seed(123)
kmeans_result <- kmeans(item_matrix, centers = 3)

# Show the first six components of the fit (cluster labels, centers and
# sums of squares).
head(kmeans_result)
## $cluster
##          abrasive cleaner          artif. sweetener            baby cosmetics 
##                         3                         3                         3 
##                 baby food                      bags             baking powder 
##                         3                         3                         3 
##          bathroom cleaner                      beef                   berries 
##                         3                         1                         3 
##                 beverages              bottled beer             bottled water 
##                         3                         1                         1 
##                    brandy               brown bread                    butter 
##                         3                         1                         1 
##               butter milk                  cake bar                   candles 
##                         3                         3                         3 
##                     candy               canned beer               canned fish 
##                         3                         1                         3 
##              canned fruit         canned vegetables                  cat food 
##                         3                         3                         3 
##                   cereals               chewing gum                   chicken 
##                         3                         3                         1 
##                 chocolate     chocolate marshmallow              citrus fruit 
##                         1                         3                         1 
##                   cleaner           cling film/bags              cocoa drinks 
##                         3                         3                         3 
##                    coffee            condensed milk         cooking chocolate 
##                         1                         3                         3 
##                  cookware                     cream              cream cheese 
##                         3                         3                         3 
##                      curd               curd cheese               decalcifier 
##                         1                         3                         3 
##               dental care                   dessert                 detergent 
##                         3                         3                         3 
##              dish cleaner                    dishes                  dog food 
##                         3                         3                         3 
##             domestic eggs  female sanitary products         finished products 
##                         1                         3                         3 
##                      fish                     flour            flower (seeds) 
##                         3                         3                         3 
##    flower soil/fertilizer               frankfurter            frozen chicken 
##                         3                         1                         3 
##            frozen dessert               frozen fish             frozen fruits 
##                         3                         3                         3 
##              frozen meals    frozen potato products         frozen vegetables 
##                         3                         3                         1 
##     fruit/vegetable juice                    grapes                hair spray 
##                         1                         3                         3 
##                       ham            hamburger meat               hard cheese 
##                         3                         3                         3 
##                     herbs                     honey    house keeping products 
##                         3                         3                         3 
##          hygiene articles                 ice cream            instant coffee 
##                         3                         3                         3 
##     Instant food products                       jam                   ketchup 
##                         3                         3                         3 
##            kitchen towels           kitchen utensil               light bulbs 
##                         3                         3                         3 
##                   liqueur                    liquor        liquor (appetizer) 
##                         3                         3                         3 
##                liver loaf  long life bakery product           make up remover 
##                         3                         3                         3 
##            male cosmetics                 margarine                mayonnaise 
##                         3                         1                         3 
##                      meat              meat spreads           misc. beverages 
##                         3                         3                         3 
##                   mustard                   napkins                newspapers 
##                         3                         1                         1 
##                 nut snack               nuts/prunes                       oil 
##                         3                         3                         3 
##                    onions          organic products           organic sausage 
##                         3                         3                         3 
##          other vegetables packaged fruit/vegetables                     pasta 
##                         2                         3                         3 
##                    pastry                  pet care                photo/film 
##                         1                         3                         3 
##        pickled vegetables                 pip fruit                   popcorn 
##                         3                         1                         3 
##                      pork                pot plants           potato products 
##                         1                         3                         3 
##     preservation products          processed cheese                  prosecco 
##                         3                         3                         3 
##            pudding powder               ready soups            red/blush wine 
##                         3                         3                         3 
##                      rice             roll products                rolls/buns 
##                         3                         3                         2 
##           root vegetables           rubbing alcohol                       rum 
##                         1                         3                         3 
##            salad dressing                      salt               salty snack 
##                         3                         3                         3 
##                    sauces                   sausage         seasonal products 
##                         3                         1                         3 
##       semi-finished bread             shopping bags                 skin care 
##                         3                         1                         3 
##             sliced cheese            snack products                      soap 
##                         3                         3                         3 
##                      soda               soft cheese                  softener 
##                         2                         3                         3 
##      sound storage medium                     soups            sparkling wine 
##                         3                         3                         3 
##             specialty bar          specialty cheese       specialty chocolate 
##                         3                         3                         3 
##             specialty fat      specialty vegetables                    spices 
##                         3                         3                         3 
##             spread cheese                     sugar             sweet spreads 
##                         3                         3                         3 
##                     syrup                       tea                   tidbits 
##                         3                         3                         3 
##            toilet cleaner            tropical fruit                    turkey 
##                         3                         1                         3 
##                  UHT-milk                   vinegar                   waffles 
##                         3                         3                         3 
##        whipped/sour cream                    whisky               white bread 
##                         1                         3                         1 
##                white wine                whole milk                    yogurt 
##                         3                         2                         2 
##                  zwieback 
##                         3 
## 
## $centers
##         [,1]
## 1 0.07052289
## 2 0.18936451
## 3 0.01137603
## 
## $totss
## [1] 0.2494903
## 
## $withinss
## [1] 0.010727637 0.007133454 0.015370090
## 
## $tot.withinss
## [1] 0.03323118
## 
## $betweenss
## [1] 0.2162591
# Sparse incidence matrix of the transactions; it is stored with items as rows
# and transactions as columns.
sparse_matrix <- as(tr, "ngCMatrix")

# Basket size of each transaction. Transactions are the columns, so this must
# be colSums(); rowSums() would instead count how often each item was bought
# and the clustering would be over items, not transactions.
transaction_sums <- colSums(sparse_matrix)

# Seed once, directly before the randomly initialised k-means fit.
set.seed(123)
km_result <- kmeans(as.matrix(transaction_sums), centers = 3)

# One row per transaction with its assigned cluster.
cluster_df <- data.frame(
  TransactionID = seq_along(transaction_sums),
  Cluster = km_result$cluster
)

head(cluster_df)

The graph below is a two-dimensional visualization of k-means clustering on the transaction data. Dimensionality reduction was performed with a truncated singular value decomposition (SVD), a PCA-like projection in which the data are not centered first, so that each transaction is represented by two components (labelled PC1 and PC2). As a result, the transaction data was projected into a new two-dimensional space, and k-means was run on these two components.

# irlba() starts its iteration from a random vector, so seed the RNG before
# the decomposition to make the projection itself reproducible.
set.seed(123)
# Rank-2 truncated SVD of the item x transaction matrix; the rows of `v`
# correspond to transactions. No centering is requested, so PC1/PC2 below are
# SVD components rather than strict principal components.
svd_res <- irlba(sparse_matrix, nv = 2)

# Re-seed so the k-means initialisation does not depend on irlba's RNG use.
set.seed(123)
km_res <- kmeans(svd_res$v, centers = 3)

# Two-dimensional coordinates of every transaction plus its cluster label.
cluster_df <- data.frame(
  PC1 = svd_res$v[, 1],
  PC2 = svd_res$v[, 2],
  Cluster = km_res$cluster
)

ggplot(cluster_df, aes(x = PC1, y = PC2, color = as.factor(Cluster))) +
  geom_point(alpha = 0.5) +
  scale_color_manual(values = c("red", "blue", "green")) +
  theme_minimal() +
  labs(color = "Cluster") +
  ggtitle("2D Visualization of K-Means Clustering on Transaction Data")