#The goal of this analysis is to identify natural groupings of grocery items/transactions using clustering methods and interpret purchasing patterns in the dataset.
#Loading the data set and preparation
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(cluster)
# Load transaction data
# The CSV is read in "basket" format: one transaction per line, with the
# purchased item labels separated by commas.
groceries <- read.transactions(
  file = "https://raw.githubusercontent.com/zahid607/Data624/refs/heads/main/GroceryDataSet.csv",
  sep = ",",
  format = "basket"
)
# Overview of the transactions: dimensions, density, most frequent items and
# the basket-size distribution.
summary(groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55 46
## 17 18 19 20 21 22 23 24 26 27 28 29 32
## 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
#Comment: The dataset contains 9,835 grocery transactions with 169 items. Each transaction represents a basket of purchased goods.
#Exploratory Analysis
# Top 25 items
# Reuse the transactions already loaded into `groceries` rather than
# downloading and parsing the same CSV a second time. `groceries_top25` is
# kept as an alias so any code that refers to it continues to work.
groceries_top25 <- groceries
# Bar chart of the 25 most frequently purchased items (absolute counts).
itemFrequencyPlot(
  groceries_top25,
  topN = 25,
  type = "absolute",
  main = "Top 25 Items"
)
# Convert data for clustering
# Expand the sparse transactions into a dense item-incidence matrix
# (one row per transaction, one column per item) and then a data frame.
groceries_mat <- as(groceries, "matrix")
groceries_df <- as.data.frame(groceries_mat)
# Remove columns with no variance: for a binary indicator that means an item
# bought in no transaction OR in every transaction. Either case has a standard
# deviation of zero, which scale() would turn into a column of NaN and break
# kmeans(). `drop = FALSE` keeps a data frame even if one column survives.
item_counts <- colSums(groceries_df)
has_variance <- item_counts > 0 & item_counts < nrow(groceries_df)
groceries_df <- groceries_df[, has_variance, drop = FALSE]
# Scale data: centre each item column and divide by its standard deviation.
# NOTE(review): standardising sparse binary indicators gives rarely bought
# items very large scaled values, so they can dominate Euclidean distances.
groceries_scaled <- scale(groceries_df)
# Choosing Optimal Number of Clusters (Elbow Method)
# Fit k-means for each candidate k and record the total within-cluster sum of
# squares; the "elbow" is where adding clusters stops reducing it sharply.
set.seed(1)
k_values <- 1:10
# vapply() guarantees exactly one numeric value per k (sapply() would silently
# change its return type on unexpected results). iter.max matches the final
# model so the curve is built from converged fits instead of runs cut off at
# the default of 10 iterations.
wss <- vapply(
  k_values,
  function(k) {
    fit <- kmeans(groceries_scaled, centers = k, nstart = 20, iter.max = 100)
    fit$tot.withinss
  },
  numeric(1)
)
plot(k_values, wss, type = "b",
     xlab = "Number of Clusters (k)",
     ylab = "Within-cluster Sum of Squares",
     main = "Elbow Method")
#Comment: The plot shows a noticeable bend around k = 4 to 5, so k = 5 was selected for final clustering.
# K-Means Clustering
# Final model: 5 clusters on the scaled item matrix, keeping the best of 50
# random starts and allowing up to 100 iterations per start. The seed is
# fixed so the cluster assignment is reproducible.
set.seed(1)
k_cluster <- kmeans(
  x = groceries_scaled,
  centers = 5,
  iter.max = 100,
  nstart = 50
)
# Number of transactions assigned to each cluster.
table(k_cluster[["cluster"]])
##
## 1 2 3 4 5
## 9 2215 57 41 7513
# Cluster Visualization (PCA)
# Project the scaled data onto its principal components and draw the first
# two, colouring each transaction by its k-means cluster.
pca <- prcomp(groceries_scaled)
plot(
  x = pca$x[, "PC1"],
  y = pca$x[, "PC2"],
  main = "Cluster Visualization using PCA",
  xlab = "PC1",
  ylab = "PC2",
  pch = 19,
  col = k_cluster[["cluster"]]
)
# Comment: Clusters show some overlap, which suggests that purchasing behavior is not strongly segmented into clearly separated groups.
# Cluster Quality (Silhouette Score)
# Compute a silhouette width for every transaction from its k-means cluster
# label and the pairwise distances between rows of the scaled matrix
# (dist() uses Euclidean distance by default).
# NOTE(review): with the 9,835 transactions reported by summary(groceries),
# dist() has to hold roughly 48 million pairwise distances, so expect this
# step to be slow and memory-hungry.
sil <- silhouette(k_cluster$cluster, dist(groceries_scaled))
plot(sil, main = "Silhouette Plot for K-Means Clusters")
# Comment: How to read the plot. Widths close to 1 = good clustering,
# around 0 = overlapping clusters, negative = poor clustering.
# Cluster Interpretation
# Average every item indicator within each cluster. Because the columns are
# purchase indicators, each value is the share of that cluster's transactions
# containing the item.
cluster_profile <- aggregate(
  x = groceries_df,
  by = list(cluster = k_cluster[["cluster"]]),
  FUN = mean
)
cluster_profile
## cluster abrasive cleaner artif. sweetener baby cosmetics baby food
## 1 1 0.111111111 0.111111111 0.0000000 0.0000000000
## 2 2 0.010835214 0.009480813 0.0000000 0.0004514673
## 3 3 0.017543860 0.017543860 0.1052632 0.0000000000
## 4 4 0.000000000 0.000000000 0.0000000 0.0000000000
## 5 5 0.001197924 0.001197924 0.0000000 0.0000000000
## bags baking powder bathroom cleaner beef berries beverages
## 1 0.0000000000 0.000000000 0.0000000 0.00000000 0.00000000 0.00000000
## 2 0.0009029345 0.056433409 0.0000000 0.12776524 0.08352144 0.03340858
## 3 0.0000000000 0.052631579 0.4736842 0.08771930 0.14035088 0.00000000
## 4 0.0000000000 0.024390244 0.0000000 0.02439024 0.02439024 0.04878049
## 5 0.0002662052 0.005989618 0.0000000 0.03021430 0.01770265 0.02395847
## bottled beer bottled water brandy brown bread butter butter milk
## 1 0.11111111 0.11111111 0 0.00000000 0.22222222 0.00000000
## 2 0.09119639 0.17787810 0 0.13363431 0.15981941 0.06365688
## 3 0.07017544 0.17543860 0 0.05263158 0.10526316 0.08771930
## 4 0.21951220 0.12195122 1 0.07317073 0.02439024 0.00000000
## 5 0.07666711 0.09011048 0 0.04472248 0.02422468 0.01717024
## cake bar candles candy canned beer canned fish canned fruit
## 1 0.000000000 0.000000000 0.33333333 0.11111111 0.000000000 0.000000000
## 2 0.027539503 0.017155756 0.05417607 0.03476298 0.035665914 0.009932280
## 3 0.017543860 0.000000000 0.03508772 0.10526316 0.035087719 0.000000000
## 4 0.000000000 0.000000000 0.02439024 0.14634146 0.024390244 0.000000000
## 5 0.009050978 0.006655131 0.02236124 0.08971117 0.008784773 0.001331026
## canned vegetables cat food cereals chewing gum chicken chocolate
## 1 0.000000000 0.11111111 0.000000000 0.00000000 0.22222222 0.00000000
## 2 0.032957111 0.06139955 0.018510158 0.02076749 0.10564334 0.10022573
## 3 0.017543860 0.03508772 0.000000000 0.01754386 0.08771930 0.07017544
## 4 0.000000000 0.00000000 0.000000000 0.02439024 0.02439024 0.04878049
## 5 0.004259284 0.01197924 0.001996539 0.02116332 0.02395847 0.03460668
## chocolate marshmallow citrus fruit cleaner cling film/bags cocoa drinks
## 1 0.000000000 0.33333333 0.111111111 0.000000000 0.0000000000
## 2 0.017155756 0.20857788 0.011286682 0.028893905 0.0067720090
## 3 0.035087719 0.15789474 0.035087719 0.000000000 0.0000000000
## 4 0.000000000 0.04878049 0.000000000 0.000000000 0.0000000000
## 5 0.006522028 0.04498869 0.002928258 0.006388926 0.0009317184
## coffee condensed milk cooking chocolate cookware cream
## 1 0.11111111 0.000000000 0.000000000 0.000000000 0.0000000000
## 2 0.09887133 0.013544018 0.005869074 0.004514673 0.0027088036
## 3 0.07017544 0.035087719 0.000000000 0.017543860 0.0175438596
## 4 0.09756098 0.073170732 0.024390244 0.024390244 0.0000000000
## 5 0.04565420 0.008784773 0.001464129 0.001996539 0.0007986157
## cream cheese curd curd cheese decalcifier dental care dessert
## 1 0.00000000 0.22222222 0.000000000 0.0000000000 0.000000000 0.00000000
## 2 0.10925508 0.14943567 0.013995485 0.0040632054 0.017607223 0.09435666
## 3 0.10526316 0.05263158 0.017543860 0.0175438596 0.017543860 0.08771930
## 4 0.02439024 0.04878049 0.000000000 0.0000000000 0.000000000 0.00000000
## 5 0.01876747 0.02475709 0.002395847 0.0006655131 0.002262745 0.02009850
## detergent dish cleaner dishes dog food domestic eggs
## 1 0.00000000 0.000000000 0.00000000 0.00000000 0.11111111
## 2 0.04966140 0.026636569 0.03566591 0.01986456 0.18419865
## 3 0.00000000 0.052631579 0.01754386 0.00000000 0.03508772
## 4 0.04878049 0.048780488 0.02439024 0.04878049 0.02439024
## 5 0.01024890 0.005191002 0.01224544 0.00505790 0.02821776
## female sanitary products finished products fish flour
## 1 0.000000000 0.000000000 0.000000000 0.222222222
## 2 0.013544018 0.010383747 0.002708804 0.057787810
## 3 0.000000000 0.000000000 0.017543860 0.017543860
## 4 0.024390244 0.000000000 0.000000000 0.024390244
## 5 0.003859976 0.005457208 0.002928258 0.005191002
## flower (seeds) flower soil/fertilizer frankfurter frozen chicken
## 1 0.000000000 0.000000000 0.33333333 0.0000000
## 2 0.023927765 0.001354402 0.13092551 0.0000000
## 3 0.000000000 0.000000000 0.08771930 0.1052632
## 4 0.000000000 0.000000000 0.04878049 0.0000000
## 5 0.006522028 0.002129642 0.03726873 0.0000000
## frozen dessert frozen fish frozen fruits frozen meals frozen potato products
## 1 0.111111111 0.111111111 0.0000000000 0.00000000 0.333333333
## 2 0.027088036 0.032957111 0.0036117381 0.05733634 0.021670429
## 3 0.017543860 0.017543860 0.0000000000 0.08771930 0.017543860
## 4 0.024390244 0.024390244 0.0000000000 0.00000000 0.000000000
## 5 0.005723413 0.005191002 0.0005324105 0.01956609 0.004126181
## frozen vegetables fruit/vegetable juice grapes hair spray ham
## 1 0.11111111 0.00000000 0.00000000 0.0000000000 0.11111111
## 2 0.13318284 0.17246050 0.05823928 0.0013544018 0.07539503
## 3 0.08771930 0.12280702 0.07017544 0.0175438596 0.03508772
## 4 0.00000000 0.04878049 0.04878049 0.0000000000 0.02439024
## 5 0.02289365 0.04259284 0.01131372 0.0009317184 0.01131372
## hamburger meat hard cheese herbs honey house keeping products
## 1 0.11111111 0.11111111 0.000000000 0.0000000000 0.000000000
## 2 0.09390519 0.06952596 0.053724605 0.0040632054 0.023476298
## 3 0.08771930 0.01754386 0.000000000 0.0000000000 0.000000000
## 4 0.04878049 0.02439024 0.024390244 0.0000000000 0.000000000
## 5 0.01477439 0.01118062 0.005324105 0.0007986157 0.003993079
## hygiene articles ice cream instant coffee Instant food products jam
## 1 0.00000000 0.00000000 0.000000000 0.000000000 0.000000000
## 2 0.08397291 0.03205418 0.016252822 0.023024831 0.018058691
## 3 0.05263158 0.01754386 0.017543860 0.000000000 0.017543860
## 4 0.04878049 0.02439024 0.000000000 0.000000000 0.000000000
## 5 0.01770265 0.02302675 0.004791694 0.003726873 0.001597231
## ketchup kitchen towels kitchen utensil light bulbs liqueur
## 1 0.000000000 0.000000000 0.0000000000 0.000000000 0.0000000000
## 2 0.012189616 0.018961625 0.0013544018 0.005869074 0.0004514673
## 3 0.035087719 0.052631579 0.0000000000 0.017543860 0.0000000000
## 4 0.000000000 0.000000000 0.0000000000 0.000000000 0.0000000000
## 5 0.001730334 0.001863437 0.0001331026 0.003593771 0.0010648210
## liquor liquor (appetizer) liver loaf long life bakery product
## 1 0.000000000 0.111111111 0.111111111 0.00000000
## 2 0.004514673 0.005417607 0.008577878 0.08893905
## 3 0.000000000 0.035087719 0.035087719 0.07017544
## 4 0.024390244 0.073170732 0.000000000 0.02439024
## 5 0.013044057 0.007986157 0.003726873 0.02209504
## make up remover male cosmetics margarine mayonnaise meat meat spreads
## 1 0.0000000000 0.000000000 0.0000000 0.000000000 0.11111111 0.000000000
## 2 0.0013544018 0.003611738 0.1516930 0.030248307 0.06230248 0.011286682
## 3 0.0000000000 0.000000000 0.1754386 0.017543860 0.00000000 0.000000000
## 4 0.0000000000 0.024390244 0.0000000 0.000000000 0.02439024 0.000000000
## 5 0.0006655131 0.004791694 0.0306136 0.002928258 0.01517370 0.002262745
## misc. beverages mustard napkins newspapers nut snack nuts/prunes
## 1 0.00000000 0.000000000 0.00000000 0.22222222 0.000000000 0.000000000
## 2 0.04424379 0.033408578 0.13498871 0.13092551 0.005417607 0.004514673
## 3 0.03508772 0.070175439 0.08771930 0.19298246 0.000000000 0.000000000
## 4 0.07317073 0.000000000 0.07317073 0.00000000 0.000000000 0.048780488
## 5 0.02342606 0.005324105 0.02768535 0.06415546 0.002528950 0.002795155
## oil onions organic products organic sausage other vegetables
## 1 0.11111111 0.33333333 0.0000000000 0.000000000 0.5555556
## 2 0.07765237 0.08487585 0.0054176072 0.005417607 0.4654628
## 3 0.08771930 0.07017544 0.0000000000 0.017543860 0.2982456
## 4 0.07317073 0.04878049 0.0000000000 0.000000000 0.1463415
## 5 0.01264475 0.01437508 0.0005324105 0.001197924 0.1123386
## packaged fruit/vegetables pasta pastry pet care photo/film
## 1 0.000000000 0.000000000 0.33333333 0.000000000 0.000000000
## 2 0.027088036 0.042437923 0.15981941 0.013092551 0.009029345
## 3 0.017543860 0.017543860 0.15789474 0.000000000 0.000000000
## 4 0.000000000 0.024390244 0.02439024 0.000000000 0.000000000
## 5 0.008917876 0.006921336 0.06761613 0.008518568 0.009450286
## pickled vegetables pip fruit popcorn pork pot plants
## 1 0.000000000 0.33333333 0.000000000 0.33333333 0.00000000
## 2 0.054176072 0.17878104 0.016704289 0.13544018 0.03386005
## 3 0.000000000 0.08771930 0.017543860 0.14035088 0.01754386
## 4 0.024390244 0.07317073 0.000000000 0.00000000 0.02439024
## 5 0.007320644 0.04485558 0.004392387 0.03407427 0.01237854
## potato products preservation products processed cheese prosecco
## 1 0.000000000 0.0000000000 0.111111111 0.000000000
## 2 0.009029345 0.0009029345 0.049209932 0.001354402
## 3 0.000000000 0.0000000000 0.017543860 0.000000000
## 4 0.000000000 0.0000000000 0.000000000 0.000000000
## 5 0.001064821 0.0000000000 0.006921336 0.002262745
## pudding powder ready soups red/blush wine rice roll products
## 1 0.0000000000 0.0000000 0.00000000 0.111111111 0.000000000
## 2 0.0094808126 0.0000000 0.02212190 0.029345372 0.031602709
## 3 0.0000000000 0.3157895 0.01754386 0.017543860 0.017543860
## 4 0.0000000000 0.0000000 0.02439024 0.000000000 0.000000000
## 5 0.0002662052 0.0000000 0.01836816 0.001064821 0.003993079
## rolls/buns root vegetables rubbing alcohol rum salad dressing
## 1 0.1111111 0.11111111 0.0000000000 0.000000000 0.8888889
## 2 0.2749436 0.30519187 0.0027088036 0.009029345 0.0000000
## 3 0.3157895 0.22807018 0.0000000000 0.000000000 0.0000000
## 4 0.2682927 0.17073171 0.0000000000 0.000000000 0.0000000
## 5 0.1557301 0.04991348 0.0005324105 0.003194463 0.0000000
## salt salty snack sauces sausage seasonal products
## 1 0.111111111 0.11111111 0.111111111 0.11111111 0.000000000
## 2 0.026636569 0.07539503 0.014446953 0.18329571 0.030248307
## 3 0.000000000 0.05263158 0.000000000 0.14035088 0.052631579
## 4 0.000000000 0.07317073 0.000000000 0.12195122 0.000000000
## 5 0.006122721 0.02635432 0.002795155 0.06708372 0.009317184
## semi-finished bread shopping bags skin care sliced cheese snack products
## 1 0.11111111 0.11111111 0.000000000 0.000000000 0.000000000
## 2 0.04063205 0.15846501 0.008577878 0.073137698 0.003611738
## 3 0.07017544 0.14035088 0.017543860 0.087719298 0.000000000
## 4 0.02439024 0.31707317 0.048780488 0.024390244 0.000000000
## 5 0.01038200 0.07932916 0.001730334 0.009716491 0.002928258
## soap soda soft cheese softener sound storage medium
## 1 0.000000000 0.3333333 0.111111111 0.000000000 0.1111111
## 2 0.003611738 0.2320542 0.048758465 0.013995485 0.0000000
## 3 0.017543860 0.3333333 0.017543860 0.000000000 0.0000000
## 4 0.000000000 0.1707317 0.000000000 0.024390244 0.0000000
## 5 0.002262745 0.1559963 0.007719952 0.002928258 0.0000000
## soups sparkling wine specialty bar specialty cheese specialty chocolate
## 1 0.000000000 0.000000000 0.00000000 0.000000000 0.11111111
## 2 0.023476298 0.007223476 0.03882619 0.023927765 0.04650113
## 3 0.017543860 0.000000000 0.05263158 0.000000000 0.00000000
## 4 0.000000000 0.024390244 0.04878049 0.048780488 0.07317073
## 5 0.001863437 0.005057900 0.02369227 0.003859976 0.02555570
## specialty fat specialty vegetables spices spread cheese sugar
## 1 0.000000000 0.0000000000 0.000000000 0.11111111 0.00000000
## 2 0.011286682 0.0067720090 0.012641084 0.02167043 0.09209932
## 3 0.000000000 0.0000000000 0.017543860 0.00000000 0.05263158
## 4 0.000000000 0.0000000000 0.000000000 0.00000000 0.00000000
## 5 0.001464129 0.0002662052 0.002928258 0.00811926 0.01677093
## sweet spreads syrup tea tidbits toilet cleaner
## 1 0.000000000 0.000000000 0.000000000 0.000000000 0.000000000
## 2 0.023927765 0.007674944 0.011738149 0.004966140 0.003160271
## 3 0.035087719 0.017543860 0.017543860 0.000000000 0.000000000
## 4 0.024390244 0.000000000 0.000000000 0.000000000 0.000000000
## 5 0.004392387 0.001863437 0.001464129 0.001597231 0.000000000
## tropical fruit turkey UHT-milk vinegar waffles
## 1 0.00000000 0.111111111 0.11111111 0.000000000 0.00000000
## 2 0.26049661 0.027990971 0.06862302 0.017155756 0.08623025
## 3 0.19298246 0.035087719 0.03508772 0.000000000 0.08771930
## 4 0.02439024 0.000000000 0.09756098 0.000000000 0.02439024
## 5 0.05896446 0.001996539 0.02262745 0.003460668 0.02409157
## whipped/sour cream whisky white bread white wine whole milk yogurt
## 1 0.22222222 0.0000000000 0.22222222 0.00000000 0.1111111 0.00000000
## 2 0.20451467 0.0013544018 0.10564334 0.01038375 0.5327314 0.33950339
## 3 0.10526316 0.0000000000 0.05263158 0.00000000 0.3333333 0.24561404
## 4 0.00000000 0.0487804878 0.07317073 0.04878049 0.1219512 0.21951220
## 5 0.03247704 0.0003993079 0.02289365 0.02156262 0.1740982 0.07946227
## zwieback
## 1 0.000000000
## 2 0.012189616
## 3 0.000000000
## 4 0.024390244
## 5 0.005324105
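# Cluster 5 (largest group, 7,513 transactions): low purchase rates across all items → “general mixed grocery basket”
# Cluster 2 (2,215 transactions): high values for milk, yogurt, vegetables, bread → “daily essentials / breakfast cluster”
# Cluster 3 & 4: smaller specialized groups defined by rarely bought items (e.g., beverages: every cluster 4 basket contains brandy; household and frozen goods: bathroom cleaner, ready soups and frozen chicken stand out in cluster 3)
# Cluster 1: only 9 transactions, almost all of which contain salad dressing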
# Conclusion: The elbow method suggested 5 clusters. Clusters show moderate structure with overlap. Some meaningful shopping patterns were identified. However, clustering quality is limited due to data sparsity and method assumptions.