#The goal of this analysis is to identify natural groupings of grocery items/transactions using clustering methods and interpret purchasing patterns in the dataset.

#Loading the data set and preparation

library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(cluster)

# Load transaction data
# Location of the grocery basket CSV used throughout this analysis.
groceries_url <- "https://raw.githubusercontent.com/zahid607/Data624/refs/heads/main/GroceryDataSet.csv"

# Parse the comma-separated file in "basket" format into an arules
# transactions object.
groceries <- read.transactions(
  file = groceries_url,
  format = "basket",
  sep = ","
)

# Overall summary: dimensions, density, most frequent items and the
# distribution of basket sizes.
summary(groceries)
## transactions as itemMatrix in sparse format with
##  9835 rows (elements/itemsets/transactions) and
##  169 columns (items) and a density of 0.02609146 
## 
## most frequent items:
##       whole milk other vegetables       rolls/buns             soda 
##             2513             1903             1809             1715 
##           yogurt          (Other) 
##             1372            34055 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15   16 
## 2159 1643 1299 1005  855  645  545  438  350  246  182  117   78   77   55   46 
##   17   18   19   20   21   22   23   24   26   27   28   29   32 
##   29   14   14    9   11    4    6    1    1    1    1    3    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   3.000   4.409   6.000  32.000 
## 
## includes extended item information - examples:
##             labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3   baby cosmetics

#Comment: The dataset contains 9,835 grocery transactions with 169 items. Each transaction represents a basket of purchased goods.

#Exploratory Analysis

#Top 25 items

# Reuse the transactions already loaded into `groceries` instead of
# downloading and parsing the same CSV a second time. `groceries_top25` is
# kept as an alias so any code that refers to it keeps working; it holds the
# full data set, and the restriction to 25 items is applied by `topN` below.
groceries_top25 <- groceries

# Bar chart of the 25 most frequently purchased items (absolute counts).
itemFrequencyPlot(
  groceries_top25,
  topN = 25,
  type = "absolute",
  main = "Top 25 Items"
)

#Convert data for clustering

# Dense transaction-by-item indicator matrix, then the same data as a
# data frame (one row per transaction, one column per item).
groceries_mat <- as(groceries, "matrix")
groceries_df <- as.data.frame(groceries_mat)

# Keep only items whose indicator actually varies across transactions.
# A binary column is constant when the item is in no basket OR in every
# basket; in both cases its standard deviation is 0, so scale() would turn
# the column into NaN and kmeans() would fail on it.
# `drop = FALSE` keeps a data frame even if only one item survives.
item_counts <- colSums(groceries_mat)
has_variance <- item_counts > 0 & item_counts < nrow(groceries_mat)
groceries_df <- groceries_df[, has_variance, drop = FALSE]

# Standardise every item column to mean 0 and standard deviation 1.
# NOTE(review): standardising binary indicators gives rarely purchased items
# the largest scaled values, so they can dominate the Euclidean distances
# used by the clustering below -- confirm this weighting is intended.
groceries_scaled <- scale(groceries_df)

#Choosing Optimal Number of Clusters (Elbow Method)

# Fix the RNG seed so the random k-means starts are reproducible.
set.seed(1)

# Candidate numbers of clusters; shared by the fits and the plot so the two
# cannot drift apart.
k_values <- 1:10

# Total within-cluster sum of squares for each candidate k.
# vapply() (rather than sapply()) guarantees a plain numeric vector with one
# value per k and fails loudly if a fit ever returns anything else.
wss <- vapply(
  k_values,
  function(k) {
    kmeans(groceries_scaled, centers = k, nstart = 20)$tot.withinss
  },
  numeric(1)
)

# Elbow plot: look for the k after which the curve flattens out.
plot(k_values, wss, type = "b",
     xlab = "Number of Clusters (k)",
     ylab = "Within-cluster Sum of Squares",
     main = "Elbow Method")

#Comment: The plot shows a noticeable bend around k = 4 to 5, so k = 5 was selected for final clustering.

#K-Means Clustering

# Reset the seed so the final fit is reproducible on its own.
set.seed(1)

# Final k-means model with 5 clusters: 50 random starts, and up to 100
# iterations per start.
k_cluster <- kmeans(
  x = groceries_scaled,
  centers = 5,
  iter.max = 100,
  nstart = 50
)

# Number of transactions assigned to each cluster.
table(k_cluster$cluster)
## 
##    1    2    3    4    5 
##    9 2215   57   41 7513

#Cluster Visualization (PCA)

# Principal components of the scaled item matrix.
pca <- prcomp(x = groceries_scaled)

# Scatter plot of the first two principal-component scores, with each
# transaction coloured by its k-means cluster.
plot(
  x = pca$x[, "PC1"],
  y = pca$x[, "PC2"],
  main = "Cluster Visualization using PCA",
  xlab = "PC1",
  ylab = "PC2",
  pch = 19,
  col = k_cluster$cluster
)

#Comment: Clusters show some overlap, which suggests that purchasing behavior is not strongly segmented into clearly separated groups.

#Cluster Quality (Silhouette Score)

# Silhouette widths of the k-means assignment, computed from the pairwise
# distances between the scaled transactions (dist() defaults to Euclidean).
# dist() materialises all pairwise distances, which is memory-hungry when
# there are many rows.
sil <- silhouette(
  x = k_cluster$cluster,
  dist = dist(groceries_scaled)
)
plot(x = sil, main = "Silhouette Plot for K-Means Clusters")

#Comment: Interpreting silhouette widths: close to 1 = good clustering, around 0 = overlapping clusters, negative = poor clustering.

#Cluster Interpretation

# Per-cluster mean of every item indicator, i.e. the share of transactions
# in each cluster that contain the item.
cluster_profile <- aggregate(
  x = groceries_df,
  by = list(cluster = k_cluster$cluster),
  FUN = mean
)

cluster_profile
##   cluster abrasive cleaner artif. sweetener baby cosmetics    baby food
## 1       1      0.111111111      0.111111111      0.0000000 0.0000000000
## 2       2      0.010835214      0.009480813      0.0000000 0.0004514673
## 3       3      0.017543860      0.017543860      0.1052632 0.0000000000
## 4       4      0.000000000      0.000000000      0.0000000 0.0000000000
## 5       5      0.001197924      0.001197924      0.0000000 0.0000000000
##           bags baking powder bathroom cleaner       beef    berries  beverages
## 1 0.0000000000   0.000000000        0.0000000 0.00000000 0.00000000 0.00000000
## 2 0.0009029345   0.056433409        0.0000000 0.12776524 0.08352144 0.03340858
## 3 0.0000000000   0.052631579        0.4736842 0.08771930 0.14035088 0.00000000
## 4 0.0000000000   0.024390244        0.0000000 0.02439024 0.02439024 0.04878049
## 5 0.0002662052   0.005989618        0.0000000 0.03021430 0.01770265 0.02395847
##   bottled beer bottled water brandy brown bread     butter butter milk
## 1   0.11111111    0.11111111      0  0.00000000 0.22222222  0.00000000
## 2   0.09119639    0.17787810      0  0.13363431 0.15981941  0.06365688
## 3   0.07017544    0.17543860      0  0.05263158 0.10526316  0.08771930
## 4   0.21951220    0.12195122      1  0.07317073 0.02439024  0.00000000
## 5   0.07666711    0.09011048      0  0.04472248 0.02422468  0.01717024
##      cake bar     candles      candy canned beer canned fish canned fruit
## 1 0.000000000 0.000000000 0.33333333  0.11111111 0.000000000  0.000000000
## 2 0.027539503 0.017155756 0.05417607  0.03476298 0.035665914  0.009932280
## 3 0.017543860 0.000000000 0.03508772  0.10526316 0.035087719  0.000000000
## 4 0.000000000 0.000000000 0.02439024  0.14634146 0.024390244  0.000000000
## 5 0.009050978 0.006655131 0.02236124  0.08971117 0.008784773  0.001331026
##   canned vegetables   cat food     cereals chewing gum    chicken  chocolate
## 1       0.000000000 0.11111111 0.000000000  0.00000000 0.22222222 0.00000000
## 2       0.032957111 0.06139955 0.018510158  0.02076749 0.10564334 0.10022573
## 3       0.017543860 0.03508772 0.000000000  0.01754386 0.08771930 0.07017544
## 4       0.000000000 0.00000000 0.000000000  0.02439024 0.02439024 0.04878049
## 5       0.004259284 0.01197924 0.001996539  0.02116332 0.02395847 0.03460668
##   chocolate marshmallow citrus fruit     cleaner cling film/bags cocoa drinks
## 1           0.000000000   0.33333333 0.111111111     0.000000000 0.0000000000
## 2           0.017155756   0.20857788 0.011286682     0.028893905 0.0067720090
## 3           0.035087719   0.15789474 0.035087719     0.000000000 0.0000000000
## 4           0.000000000   0.04878049 0.000000000     0.000000000 0.0000000000
## 5           0.006522028   0.04498869 0.002928258     0.006388926 0.0009317184
##       coffee condensed milk cooking chocolate    cookware        cream
## 1 0.11111111    0.000000000       0.000000000 0.000000000 0.0000000000
## 2 0.09887133    0.013544018       0.005869074 0.004514673 0.0027088036
## 3 0.07017544    0.035087719       0.000000000 0.017543860 0.0175438596
## 4 0.09756098    0.073170732       0.024390244 0.024390244 0.0000000000
## 5 0.04565420    0.008784773       0.001464129 0.001996539 0.0007986157
##   cream cheese       curd curd cheese  decalcifier dental care    dessert
## 1   0.00000000 0.22222222 0.000000000 0.0000000000 0.000000000 0.00000000
## 2   0.10925508 0.14943567 0.013995485 0.0040632054 0.017607223 0.09435666
## 3   0.10526316 0.05263158 0.017543860 0.0175438596 0.017543860 0.08771930
## 4   0.02439024 0.04878049 0.000000000 0.0000000000 0.000000000 0.00000000
## 5   0.01876747 0.02475709 0.002395847 0.0006655131 0.002262745 0.02009850
##    detergent dish cleaner     dishes   dog food domestic eggs
## 1 0.00000000  0.000000000 0.00000000 0.00000000    0.11111111
## 2 0.04966140  0.026636569 0.03566591 0.01986456    0.18419865
## 3 0.00000000  0.052631579 0.01754386 0.00000000    0.03508772
## 4 0.04878049  0.048780488 0.02439024 0.04878049    0.02439024
## 5 0.01024890  0.005191002 0.01224544 0.00505790    0.02821776
##   female sanitary products finished products        fish       flour
## 1              0.000000000       0.000000000 0.000000000 0.222222222
## 2              0.013544018       0.010383747 0.002708804 0.057787810
## 3              0.000000000       0.000000000 0.017543860 0.017543860
## 4              0.024390244       0.000000000 0.000000000 0.024390244
## 5              0.003859976       0.005457208 0.002928258 0.005191002
##   flower (seeds) flower soil/fertilizer frankfurter frozen chicken
## 1    0.000000000            0.000000000  0.33333333      0.0000000
## 2    0.023927765            0.001354402  0.13092551      0.0000000
## 3    0.000000000            0.000000000  0.08771930      0.1052632
## 4    0.000000000            0.000000000  0.04878049      0.0000000
## 5    0.006522028            0.002129642  0.03726873      0.0000000
##   frozen dessert frozen fish frozen fruits frozen meals frozen potato products
## 1    0.111111111 0.111111111  0.0000000000   0.00000000            0.333333333
## 2    0.027088036 0.032957111  0.0036117381   0.05733634            0.021670429
## 3    0.017543860 0.017543860  0.0000000000   0.08771930            0.017543860
## 4    0.024390244 0.024390244  0.0000000000   0.00000000            0.000000000
## 5    0.005723413 0.005191002  0.0005324105   0.01956609            0.004126181
##   frozen vegetables fruit/vegetable juice     grapes   hair spray        ham
## 1        0.11111111            0.00000000 0.00000000 0.0000000000 0.11111111
## 2        0.13318284            0.17246050 0.05823928 0.0013544018 0.07539503
## 3        0.08771930            0.12280702 0.07017544 0.0175438596 0.03508772
## 4        0.00000000            0.04878049 0.04878049 0.0000000000 0.02439024
## 5        0.02289365            0.04259284 0.01131372 0.0009317184 0.01131372
##   hamburger meat hard cheese       herbs        honey house keeping products
## 1     0.11111111  0.11111111 0.000000000 0.0000000000            0.000000000
## 2     0.09390519  0.06952596 0.053724605 0.0040632054            0.023476298
## 3     0.08771930  0.01754386 0.000000000 0.0000000000            0.000000000
## 4     0.04878049  0.02439024 0.024390244 0.0000000000            0.000000000
## 5     0.01477439  0.01118062 0.005324105 0.0007986157            0.003993079
##   hygiene articles  ice cream instant coffee Instant food products         jam
## 1       0.00000000 0.00000000    0.000000000           0.000000000 0.000000000
## 2       0.08397291 0.03205418    0.016252822           0.023024831 0.018058691
## 3       0.05263158 0.01754386    0.017543860           0.000000000 0.017543860
## 4       0.04878049 0.02439024    0.000000000           0.000000000 0.000000000
## 5       0.01770265 0.02302675    0.004791694           0.003726873 0.001597231
##       ketchup kitchen towels kitchen utensil light bulbs      liqueur
## 1 0.000000000    0.000000000    0.0000000000 0.000000000 0.0000000000
## 2 0.012189616    0.018961625    0.0013544018 0.005869074 0.0004514673
## 3 0.035087719    0.052631579    0.0000000000 0.017543860 0.0000000000
## 4 0.000000000    0.000000000    0.0000000000 0.000000000 0.0000000000
## 5 0.001730334    0.001863437    0.0001331026 0.003593771 0.0010648210
##        liquor liquor (appetizer)  liver loaf long life bakery product
## 1 0.000000000        0.111111111 0.111111111               0.00000000
## 2 0.004514673        0.005417607 0.008577878               0.08893905
## 3 0.000000000        0.035087719 0.035087719               0.07017544
## 4 0.024390244        0.073170732 0.000000000               0.02439024
## 5 0.013044057        0.007986157 0.003726873               0.02209504
##   make up remover male cosmetics margarine  mayonnaise       meat meat spreads
## 1    0.0000000000    0.000000000 0.0000000 0.000000000 0.11111111  0.000000000
## 2    0.0013544018    0.003611738 0.1516930 0.030248307 0.06230248  0.011286682
## 3    0.0000000000    0.000000000 0.1754386 0.017543860 0.00000000  0.000000000
## 4    0.0000000000    0.024390244 0.0000000 0.000000000 0.02439024  0.000000000
## 5    0.0006655131    0.004791694 0.0306136 0.002928258 0.01517370  0.002262745
##   misc. beverages     mustard    napkins newspapers   nut snack nuts/prunes
## 1      0.00000000 0.000000000 0.00000000 0.22222222 0.000000000 0.000000000
## 2      0.04424379 0.033408578 0.13498871 0.13092551 0.005417607 0.004514673
## 3      0.03508772 0.070175439 0.08771930 0.19298246 0.000000000 0.000000000
## 4      0.07317073 0.000000000 0.07317073 0.00000000 0.000000000 0.048780488
## 5      0.02342606 0.005324105 0.02768535 0.06415546 0.002528950 0.002795155
##          oil     onions organic products organic sausage other vegetables
## 1 0.11111111 0.33333333     0.0000000000     0.000000000        0.5555556
## 2 0.07765237 0.08487585     0.0054176072     0.005417607        0.4654628
## 3 0.08771930 0.07017544     0.0000000000     0.017543860        0.2982456
## 4 0.07317073 0.04878049     0.0000000000     0.000000000        0.1463415
## 5 0.01264475 0.01437508     0.0005324105     0.001197924        0.1123386
##   packaged fruit/vegetables       pasta     pastry    pet care  photo/film
## 1               0.000000000 0.000000000 0.33333333 0.000000000 0.000000000
## 2               0.027088036 0.042437923 0.15981941 0.013092551 0.009029345
## 3               0.017543860 0.017543860 0.15789474 0.000000000 0.000000000
## 4               0.000000000 0.024390244 0.02439024 0.000000000 0.000000000
## 5               0.008917876 0.006921336 0.06761613 0.008518568 0.009450286
##   pickled vegetables  pip fruit     popcorn       pork pot plants
## 1        0.000000000 0.33333333 0.000000000 0.33333333 0.00000000
## 2        0.054176072 0.17878104 0.016704289 0.13544018 0.03386005
## 3        0.000000000 0.08771930 0.017543860 0.14035088 0.01754386
## 4        0.024390244 0.07317073 0.000000000 0.00000000 0.02439024
## 5        0.007320644 0.04485558 0.004392387 0.03407427 0.01237854
##   potato products preservation products processed cheese    prosecco
## 1     0.000000000          0.0000000000      0.111111111 0.000000000
## 2     0.009029345          0.0009029345      0.049209932 0.001354402
## 3     0.000000000          0.0000000000      0.017543860 0.000000000
## 4     0.000000000          0.0000000000      0.000000000 0.000000000
## 5     0.001064821          0.0000000000      0.006921336 0.002262745
##   pudding powder ready soups red/blush wine        rice roll products
## 1   0.0000000000   0.0000000     0.00000000 0.111111111   0.000000000
## 2   0.0094808126   0.0000000     0.02212190 0.029345372   0.031602709
## 3   0.0000000000   0.3157895     0.01754386 0.017543860   0.017543860
## 4   0.0000000000   0.0000000     0.02439024 0.000000000   0.000000000
## 5   0.0002662052   0.0000000     0.01836816 0.001064821   0.003993079
##   rolls/buns root vegetables rubbing alcohol         rum salad dressing
## 1  0.1111111      0.11111111    0.0000000000 0.000000000      0.8888889
## 2  0.2749436      0.30519187    0.0027088036 0.009029345      0.0000000
## 3  0.3157895      0.22807018    0.0000000000 0.000000000      0.0000000
## 4  0.2682927      0.17073171    0.0000000000 0.000000000      0.0000000
## 5  0.1557301      0.04991348    0.0005324105 0.003194463      0.0000000
##          salt salty snack      sauces    sausage seasonal products
## 1 0.111111111  0.11111111 0.111111111 0.11111111       0.000000000
## 2 0.026636569  0.07539503 0.014446953 0.18329571       0.030248307
## 3 0.000000000  0.05263158 0.000000000 0.14035088       0.052631579
## 4 0.000000000  0.07317073 0.000000000 0.12195122       0.000000000
## 5 0.006122721  0.02635432 0.002795155 0.06708372       0.009317184
##   semi-finished bread shopping bags   skin care sliced cheese snack products
## 1          0.11111111    0.11111111 0.000000000   0.000000000    0.000000000
## 2          0.04063205    0.15846501 0.008577878   0.073137698    0.003611738
## 3          0.07017544    0.14035088 0.017543860   0.087719298    0.000000000
## 4          0.02439024    0.31707317 0.048780488   0.024390244    0.000000000
## 5          0.01038200    0.07932916 0.001730334   0.009716491    0.002928258
##          soap      soda soft cheese    softener sound storage medium
## 1 0.000000000 0.3333333 0.111111111 0.000000000            0.1111111
## 2 0.003611738 0.2320542 0.048758465 0.013995485            0.0000000
## 3 0.017543860 0.3333333 0.017543860 0.000000000            0.0000000
## 4 0.000000000 0.1707317 0.000000000 0.024390244            0.0000000
## 5 0.002262745 0.1559963 0.007719952 0.002928258            0.0000000
##         soups sparkling wine specialty bar specialty cheese specialty chocolate
## 1 0.000000000    0.000000000    0.00000000      0.000000000          0.11111111
## 2 0.023476298    0.007223476    0.03882619      0.023927765          0.04650113
## 3 0.017543860    0.000000000    0.05263158      0.000000000          0.00000000
## 4 0.000000000    0.024390244    0.04878049      0.048780488          0.07317073
## 5 0.001863437    0.005057900    0.02369227      0.003859976          0.02555570
##   specialty fat specialty vegetables      spices spread cheese      sugar
## 1   0.000000000         0.0000000000 0.000000000    0.11111111 0.00000000
## 2   0.011286682         0.0067720090 0.012641084    0.02167043 0.09209932
## 3   0.000000000         0.0000000000 0.017543860    0.00000000 0.05263158
## 4   0.000000000         0.0000000000 0.000000000    0.00000000 0.00000000
## 5   0.001464129         0.0002662052 0.002928258    0.00811926 0.01677093
##   sweet spreads       syrup         tea     tidbits toilet cleaner
## 1   0.000000000 0.000000000 0.000000000 0.000000000    0.000000000
## 2   0.023927765 0.007674944 0.011738149 0.004966140    0.003160271
## 3   0.035087719 0.017543860 0.017543860 0.000000000    0.000000000
## 4   0.024390244 0.000000000 0.000000000 0.000000000    0.000000000
## 5   0.004392387 0.001863437 0.001464129 0.001597231    0.000000000
##   tropical fruit      turkey   UHT-milk     vinegar    waffles
## 1     0.00000000 0.111111111 0.11111111 0.000000000 0.00000000
## 2     0.26049661 0.027990971 0.06862302 0.017155756 0.08623025
## 3     0.19298246 0.035087719 0.03508772 0.000000000 0.08771930
## 4     0.02439024 0.000000000 0.09756098 0.000000000 0.02439024
## 5     0.05896446 0.001996539 0.02262745 0.003460668 0.02409157
##   whipped/sour cream       whisky white bread white wine whole milk     yogurt
## 1         0.22222222 0.0000000000  0.22222222 0.00000000  0.1111111 0.00000000
## 2         0.20451467 0.0013544018  0.10564334 0.01038375  0.5327314 0.33950339
## 3         0.10526316 0.0000000000  0.05263158 0.00000000  0.3333333 0.24561404
## 4         0.00000000 0.0487804878  0.07317073 0.04878049  0.1219512 0.21951220
## 5         0.03247704 0.0003993079  0.02289365 0.02156262  0.1740982 0.07946227
##      zwieback
## 1 0.000000000
## 2 0.012189616
## 3 0.000000000
## 4 0.024390244
## 5 0.005324105

#Comment:

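#Cluster 2 (second-largest group, 2,215 transactions): high values for whole milk, other vegetables, yogurt and rolls/buns → “daily essentials / breakfast cluster”
#Cluster 5 (largest group, 7,513 transactions): mixed low-frequency items → “general mixed grocery basket”
#Cluster 3 & 4: smaller specialized groups of 57 and 41 transactions (e.g., beverages, snacks, frozen goods), driven by rare items such as bathroom cleaner and ready soups (cluster 3) and brandy (cluster 4)
#Cluster 1: only 9 transactions, almost all of which contain salad dressing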

#Conclusion: The elbow method suggested 5 clusters. Clusters show moderate structure with overlap. Some meaningful shopping patterns were identified. However, clustering quality is limited due to data sparsity and method assumptions.