That is exactly what the Groceries Data Set contains: a collection of receipts with each line representing 1 receipt and the items purchased. Each line is called a transaction and each column in a row represents an item. The data set is attached.
Your assignment is to use R to mine the data for association rules. You should report support, confidence and lift and your top 10 rules by lift.
Extra credit: do a simple cluster analysis on the data as well. Use whichever packages you like. Due May 5 before midnight.
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
## Loading required package: grid
library(knitr)
#library(assertr)
## upload data
Groceries <- read.transactions('GroceryDataSet.csv', rm.duplicates= TRUE, format = 'basket', sep=',')
Groceries
## transactions in sparse format with
## 9835 transactions (rows) and
## 169 items (columns)
summary(Groceries)
## transactions as itemMatrix in sparse format with
## 9835 rows (elements/itemsets/transactions) and
## 169 columns (items) and a density of 0.02609146
##
## most frequent items:
## whole milk other vegetables rolls/buns soda
## 2513 1903 1809 1715
## yogurt (Other)
## 1372 34055
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2159 1643 1299 1005 855 645 545 438 350 246 182 117 78 77 55
## 16 17 18 19 20 21 22 23 24 26 27 28 29 32
## 46 29 14 14 9 11 4 6 1 1 1 1 3 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 3.000 4.409 6.000 32.000
##
## includes extended item information - examples:
## labels
## 1 abrasive cleaner
## 2 artif. sweetener
## 3 baby cosmetics
itemFrequencyPlot(Groceries, topN = 15, type = 'absolute')
grocery_rules <- apriori(Groceries, parameter = list(support = 0.001, confidence = 0.8))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.8 0.1 1 none FALSE TRUE 5 0.001 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 9
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[169 item(s), 9835 transaction(s)] done [0.00s].
## sorting and recoding items ... [157 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 done [0.01s].
## writing ... [410 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(grocery_rules)
## set of 410 rules
##
## rule length distribution (lhs + rhs):sizes
## 3 4 5 6
## 29 229 140 12
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 3.000 4.000 4.000 4.329 5.000 6.000
##
## summary of quality measures:
## support confidence lift count
## Min. :0.001017 Min. :0.8000 Min. : 3.131 Min. :10.00
## 1st Qu.:0.001017 1st Qu.:0.8333 1st Qu.: 3.312 1st Qu.:10.00
## Median :0.001220 Median :0.8462 Median : 3.588 Median :12.00
## Mean :0.001247 Mean :0.8663 Mean : 3.951 Mean :12.27
## 3rd Qu.:0.001322 3rd Qu.:0.9091 3rd Qu.: 4.341 3rd Qu.:13.00
## Max. :0.003152 Max. :1.0000 Max. :11.235 Max. :31.00
##
## mining info:
## data ntransactions support confidence
## Groceries 9835 0.001 0.8
inspect(head(sort(grocery_rules, by = 'lift'), 10))
## lhs rhs support confidence lift count
## [1] {liquor,
## red/blush wine} => {bottled beer} 0.001931876 0.9047619 11.235269 19
## [2] {citrus fruit,
## fruit/vegetable juice,
## other vegetables,
## soda} => {root vegetables} 0.001016777 0.9090909 8.340400 10
## [3] {oil,
## other vegetables,
## tropical fruit,
## whole milk,
## yogurt} => {root vegetables} 0.001016777 0.9090909 8.340400 10
## [4] {citrus fruit,
## fruit/vegetable juice,
## grapes} => {tropical fruit} 0.001118454 0.8461538 8.063879 11
## [5] {other vegetables,
## rice,
## whole milk,
## yogurt} => {root vegetables} 0.001321810 0.8666667 7.951182 13
## [6] {oil,
## other vegetables,
## tropical fruit,
## whole milk} => {root vegetables} 0.001321810 0.8666667 7.951182 13
## [7] {ham,
## other vegetables,
## pip fruit,
## yogurt} => {tropical fruit} 0.001016777 0.8333333 7.941699 10
## [8] {beef,
## citrus fruit,
## other vegetables,
## tropical fruit} => {root vegetables} 0.001016777 0.8333333 7.645367 10
## [9] {butter,
## cream cheese,
## root vegetables} => {yogurt} 0.001016777 0.9090909 6.516698 10
## [10] {butter,
## sliced cheese,
## tropical fruit,
## whole milk} => {yogurt} 0.001016777 0.9090909 6.516698 10
subsets <- which(colSums(is.subset(grocery_rules, grocery_rules)) > 1)
grocery_rules <- grocery_rules[-subsets]
kable(inspect(head(sort(grocery_rules, by = "lift"), 10)))
## lhs rhs support confidence lift count
## [1] {liquor,
## red/blush wine} => {bottled beer} 0.001931876 0.9047619 11.235269 19
## [2] {citrus fruit,
## fruit/vegetable juice,
## other vegetables,
## soda} => {root vegetables} 0.001016777 0.9090909 8.340400 10
## [3] {citrus fruit,
## fruit/vegetable juice,
## grapes} => {tropical fruit} 0.001118454 0.8461538 8.063879 11
## [4] {butter,
## cream cheese,
## root vegetables} => {yogurt} 0.001016777 0.9090909 6.516698 10
## [5] {cream cheese,
## curd,
## other vegetables,
## whipped/sour cream} => {yogurt} 0.001016777 0.9090909 6.516698 10
## [6] {pip fruit,
## sausage,
## sliced cheese} => {yogurt} 0.001220132 0.8571429 6.144315 12
## [7] {butter,
## tropical fruit,
## white bread} => {yogurt} 0.001118454 0.8461538 6.065542 11
## [8] {butter,
## margarine,
## tropical fruit} => {yogurt} 0.001118454 0.8461538 6.065542 11
## [9] {cream cheese,
## margarine,
## whipped/sour cream} => {yogurt} 0.001016777 0.8333333 5.973639 10
## [10] {beef,
## butter,
## tropical fruit} => {yogurt} 0.001016777 0.8333333 5.973639 10
## Warning in kable_markdown(x, padding = padding, ...): The table should have
## a header (column names)
|| || || ||
plot(grocery_rules, method = "scatter", engine = "htmlwidget")
## To reduce overplotting, jitter is added! Use jitter = 0 to prevent jitter.
plot(head(sort(grocery_rules, by="lift"), 10), method = "graph")
plot(grocery_rules, method="grouped")
## convert transactions to transaction ID lists
ct <- crossTable(Groceries, sort=TRUE)
is(ct)
## [1] "matrix" "array" "mMatrix" "listI" "output" "input"
## [7] "mpinput" "structure" "vector"
df <- as.data.frame(ct)
withss <- sapply(1:10,
function(k) {
kmeans(df, k, nstart = 50, iter.max = 15)$tot.withinss})
plot( 1:10, withss, type = "b", pch = 19, frame = FALSE, xlab = "Number of clusters",
ylab = "Within Sum of squares" )
axis(1, at = 1:10, labels = seq(1, 10, 1))
clust_output <- kmeans(df, centers = 3)
clust_output$size
## [1] 27 137 5
library(FactoMineR)
library(factoextra)
## Loading required package: ggplot2
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
df.pca <- PCA(df, scale.unit = TRUE, graph = FALSE)
df.pca
## **Results for the Principal Component Analysis (PCA)**
## The analysis was performed on 169 individuals, described by 169 variables
## *The results are available in the following objects:
##
## name description
## 1 "$eig" "eigenvalues"
## 2 "$var" "results for the variables"
## 3 "$var$coord" "coord. for the variables"
## 4 "$var$cor" "correlations variables - dimensions"
## 5 "$var$cos2" "cos2 for the variables"
## 6 "$var$contrib" "contributions of the variables"
## 7 "$ind" "results for the individuals"
## 8 "$ind$coord" "coord. for the individuals"
## 9 "$ind$cos2" "cos2 for the individuals"
## 10 "$ind$contrib" "contributions of the individuals"
## 11 "$call" "summary statistics"
## 12 "$call$centre" "mean of the variables"
## 13 "$call$ecart.type" "standard error of the variables"
## 14 "$call$row.w" "weights for the individuals"
## 15 "$call$col.w" "weights for the variables"
fviz_eig(df.pca, addlabels = TRUE, ylim = c(0, 50))
var <- get_pca_var(df.pca)
var
## Principal Component Analysis Results for variables
## ===================================================
## Name Description
## 1 "$coord" "Coordinates for the variables"
## 2 "$cor" "Correlations between variables and dimensions"
## 3 "$cos2" "Cos2 for the variables"
## 4 "$contrib" "contributions of the variables"
head(var$coord, 3)
## Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
## whole milk 0.8525215 -0.1464986 -0.04181391 -0.12797198 0.1349026
## other vegetables 0.8393466 -0.1493608 0.08702113 0.05633156 -0.1613636
## rolls/buns 0.6968992 0.1973108 0.02947426 -0.10663548 0.1673875
fviz_pca_var(df.pca, col.var = "black")
fviz_pca_var(df.pca, col.var = "cos2",
gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
repel = TRUE # Avoid text overlapping
)
https://towardsdatascience.com/association-rule-mining-in-r-ddf2d044ae50
https://rpubs.com/shreyaghelani/267081
Michael Hahsler, Kurt Hornik, and Thomas Reutterer (2006) Implications of probabilistic data modeling for mining association rules. In M. Spiliopoulou, R. Kruse, C. Borgelt, A. Nuernberger, and W. Gaul, editors, From Data and Information Analysis to Knowledge Engineering, Studies in Classification, Data Analysis, and Knowledge Organization, pages 598–605. Springer-Verlag.
http://www.salemmarafi.com/code/market-basket-analysis-with-r/
https://stepupanalytics.com/steps-of-k-means-clustering-in-r/?utm_source=metype