library(mlbench)
library(readr)
library(data.table)
library(stats)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(arulesViz)
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:arules':
##
## size
library(Matrix)
data <- paste("paper,pens,pencils", "paper,ruler","pencils,cards,ruler", "paper,cards", "pens,pencils,cards", "paper,ruler,calculator", "paper, pens,cards, folder", "pencils, ruler, folder", "paper, pencils,cards,ruler", "pens,cards", sep = "\n")
# There are 10 transactions in the data set, over 7 distinct items.
# Step 1: Set the support threshold: minsup = 25%.
# Step 2: Generate all candidate 1-itemsets.
# Below are all candidate 1-itemsets with their support counts and support. The low-frequency items, folder and calculator, are pruned because they fall below the 25% minimum support threshold.
# Item        Support_Count  Support
# Paper       6              0.60
# Pens        4              0.40
# Pencils     5              0.50
# Cards       6              0.60
# Ruler       5              0.50
# Folder      2              0.20
# Calculator  1              0.10
# Step 3: Calculate support for every candidate and prune.
# After pruning, the frequent 1-itemsets are everything in the table above except folder and calculator. (A quick code cross-check of these counts follows below.)
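# Cross-check sketch (not part of the hand calculation): recompute the supports
# directly from the `data` string defined above. `basket_list` and `supp()` are
# illustrative helper names introduced here; they are reused in later sketches.
basket_list <- lapply(strsplit(strsplit(data, "\n")[[1]], ","), trimws)
supp <- function(items) mean(sapply(basket_list, function(tx) all(items %in% tx)))
sort(table(unlist(basket_list)), decreasing = TRUE) / length(basket_list)  # 1-item supports over the 10 baskets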
# Steps 4 and 5: Generate all candidate 2-itemsets from the frequent 1-itemsets, calculate the support count and support for each, and prune against the 25% minimum support threshold. The counts are taken from the original data.
# Itemset             Support_Count  Support
# {Paper, Pens}       2              0.2
# {Paper, Pencils}    2              0.2
# {Paper, Cards}      3              0.3
# {Paper, Ruler}      3              0.3
# {Pens, Pencils}     2              0.2
# {Pens, Cards}       3              0.3
# {Pens, Ruler}       0              0.0
# {Pencils, Cards}    3              0.3
# {Pencils, Ruler}    3              0.3
# {Cards, Ruler}      2              0.2
# Of the candidate 2-itemsets, {Paper, Cards}, {Paper, Ruler}, {Pens, Cards}, {Pencils, Cards}, and {Pencils, Ruler} are the five that meet the 25% threshold. All five items (paper, pens, pencils, cards, ruler) therefore survive the pruning in at least one frequent 2-itemset. (A few pair supports are spot-checked in the sketch below.)
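# Sketch reusing the supp() helper from above: spot-check a few candidate pairs.
supp(c("paper", "ruler"))    # 0.3 - frequent
supp(c("pens", "ruler"))     # 0.0 - pruned
supp(c("pencils", "ruler"))  # 0.3 - frequent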
# Step 6: Generate higher-order itemsets (3-itemsets, 4-itemsets, and so on), each time calculating support and pruning itemsets that fall below the threshold. The algorithm stops when no new frequent itemsets can be generated or when the itemsets reach the size of the largest transaction.
# Itemset                   Support_Count  Support
# {Paper, Cards, Pens}      1              0.1
# {Paper, Cards, Pencils}   1              0.1
# {Paper, Cards, Ruler}     1              0.1
# {Paper, Pens, Pencils}    1              0.1
# {Paper, Pens, Ruler}      0              0.0
# {Cards, Pens, Pencils}    1              0.1
# {Cards, Pencils, Ruler}   2              0.2
# {Pens, Pencils, Ruler}    0              0.0
# None of the 3-itemsets meet the 25% support threshold, so the frequent itemsets are the five 2-itemsets found above and the algorithm stops here.
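# Sketch reusing supp(): the largest 3-itemset support is 0.2, below the 0.25 threshold.
supp(c("cards", "pencils", "ruler"))  # 0.2
supp(c("paper", "cards", "ruler"))    # 0.1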
# Step 7: Create association rules from the frequent 2-itemsets and evaluate them with confidence and lift. The rule support (0.3 in every case) is taken from the 2-itemset table above.
# Confidence(A => B) = Support(A, B) / Support(A);  Lift(A => B) = Confidence(A => B) / Support(B)
# Rule                  Support  Confidence   Lift
# {Paper} => {Cards}    0.3      3/6 = 0.50   0.50/0.60 = 0.83
# {Cards} => {Paper}    0.3      3/6 = 0.50   0.50/0.60 = 0.83
# {Paper} => {Ruler}    0.3      3/6 = 0.50   0.50/0.50 = 1.00
# {Ruler} => {Paper}    0.3      3/5 = 0.60   0.60/0.60 = 1.00
# {Pens} => {Cards}     0.3      3/4 = 0.75   0.75/0.60 = 1.25
# {Cards} => {Pens}     0.3      3/6 = 0.50   0.50/0.40 = 1.25
# {Pencils} => {Cards}  0.3      3/5 = 0.60   0.60/0.60 = 1.00
# {Cards} => {Pencils}  0.3      3/6 = 0.50   0.50/0.50 = 1.00
# {Pencils} => {Ruler}  0.3      3/5 = 0.60   0.60/0.50 = 1.20
# {Ruler} => {Pencils}  0.3      3/5 = 0.60   0.60/0.50 = 1.20
# Based on confidence and lift, the pens-cards and pencils-ruler rules are the interesting ones: a customer who buys pens is 1.25 times as likely as average to also buy cards, and a customer who buys pencils is 1.2 times as likely to also buy a ruler. The paper-cards rules have lift below 1 (a mild negative association), and the remaining rules have lift of 1.0, indicating independence, so they are not interesting.
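# Sketch reusing supp(): confidence and lift helpers matching the formulas above
# (illustrative function names, not part of arules).
conf <- function(a, b) supp(c(a, b)) / supp(a)
lift_manual <- function(a, b) conf(a, b) / supp(b)
c(confidence = conf("pencils", "ruler"), lift = lift_manual("pencils", "ruler"))  # 0.60, 1.20
c(confidence = conf("pens", "cards"),    lift = lift_manual("pens", "cards"))     # 0.75, 1.25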
cat(data)
## paper,pens,pencils
## paper,ruler
## pencils,cards,ruler
## paper,cards
## pens,pencils,cards
## paper,ruler,calculator
## paper, pens,cards, folder
## pencils, ruler, folder
## paper, pencils,cards,ruler
## pens,cards
write(data, file = "hw_basket.txt")
# Note: skip = 1 drops the first line of the file (the basket "paper,pens,pencils"), so the transactions mined below cover 9 of the 10 baskets.
tr <- read.transactions("hw_basket.txt", format = "basket", sep = ",", skip = 1)
inspect(tr)
## items
## [1] {paper, ruler}
## [2] {cards, pencils, ruler}
## [3] {cards, paper}
## [4] {cards, pencils, pens}
## [5] {calculator, paper, ruler}
## [6] {cards, folder, paper, pens}
## [7] {folder, pencils, ruler}
## [8] {cards, paper, pencils, ruler}
## [9] {cards, pens}
itemLabels(tr)
## [1] "calculator" "cards" "folder" "paper" "pencils"
## [6] "pens" "ruler"
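# Sketch (illustrative object name tr_all): reading the file without skip keeps
# all 10 baskets, which is what the hand calculation above was based on.
tr_all <- read.transactions("hw_basket.txt", format = "basket", sep = ",")
length(tr_all)  # 10 transactions, versus 9 in `tr`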
dd <- as(tr,"transactions")
class(dd)
## [1] "transactions"
## attr(,"package")
## [1] "arules"
str(dd)
## Formal class 'transactions' [package "arules"] with 3 slots
## ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
## .. .. ..@ i : int [1:26] 3 6 1 4 6 1 3 1 4 5 ...
## .. .. ..@ p : int [1:10] 0 2 5 7 10 13 17 20 24 26
## .. .. ..@ Dim : int [1:2] 7 9
## .. .. ..@ Dimnames:List of 2
## .. .. .. ..$ : NULL
## .. .. .. ..$ : NULL
## .. .. ..@ factors : list()
## ..@ itemInfo :'data.frame': 7 obs. of 1 variable:
## .. ..$ labels: chr [1:7] "calculator" "cards" "folder" "paper" ...
## ..@ itemsetInfo:'data.frame': 0 obs. of 0 variables
summary(dd)
## transactions as itemMatrix in sparse format with
## 9 rows (elements/itemsets/transactions) and
## 7 columns (items) and a density of 0.4126984
##
## most frequent items:
## cards paper ruler pencils pens (Other)
## 6 5 5 4 3 3
##
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4
## 3 4 2
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 2.000 3.000 2.889 3.000 4.000
##
## includes extended item information - examples:
## labels
## 1 calculator
## 2 cards
## 3 folder
itemFrequencyPlot(dd, topN = 7, support = 0.25, cex.names = 0.8)
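# For reference, the same plot with raw counts; type = "absolute" is a documented
# option of itemFrequencyPlot().
itemFrequencyPlot(dd, topN = 7, type = "absolute", cex.names = 0.8)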

## Mining Associations with the "Apriori" Algorithm
ars <- apriori(dd, parameter=list(support=0.25, confidence=0.0075))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.0075 0.1 1 none FALSE TRUE 5 0.25 1
## maxlen target ext
## 10 rules TRUE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 2
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7 item(s), 9 transaction(s)] done [0.00s].
## sorting and recoding items ... [5 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
ars
## set of 15 rules
str(ars)
## Formal class 'rules' [package "arules"] with 4 slots
## ..@ lhs :Formal class 'itemMatrix' [package "arules"] with 3 slots
## .. .. ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
## .. .. .. .. ..@ i : int [1:10] 5 1 4 6 4 1 6 3 3 1
## .. .. .. .. ..@ p : int [1:16] 0 0 0 0 0 0 1 2 3 4 ...
## .. .. .. .. ..@ Dim : int [1:2] 7 15
## .. .. .. .. ..@ Dimnames:List of 2
## .. .. .. .. .. ..$ : NULL
## .. .. .. .. .. ..$ : NULL
## .. .. .. .. ..@ factors : list()
## .. .. ..@ itemInfo :'data.frame': 7 obs. of 1 variable:
## .. .. .. ..$ labels: chr [1:7] "calculator" "cards" "folder" "paper" ...
## .. .. ..@ itemsetInfo:'data.frame': 0 obs. of 0 variables
## ..@ rhs :Formal class 'itemMatrix' [package "arules"] with 3 slots
## .. .. ..@ data :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
## .. .. .. .. ..@ i : int [1:15] 5 4 6 3 1 1 5 6 4 1 ...
## .. .. .. .. ..@ p : int [1:16] 0 1 2 3 4 5 6 7 8 9 ...
## .. .. .. .. ..@ Dim : int [1:2] 7 15
## .. .. .. .. ..@ Dimnames:List of 2
## .. .. .. .. .. ..$ : NULL
## .. .. .. .. .. ..$ : NULL
## .. .. .. .. ..@ factors : list()
## .. .. ..@ itemInfo :'data.frame': 7 obs. of 1 variable:
## .. .. .. ..$ labels: chr [1:7] "calculator" "cards" "folder" "paper" ...
## .. .. ..@ itemsetInfo:'data.frame': 0 obs. of 0 variables
## ..@ quality:'data.frame': 15 obs. of 5 variables:
## .. ..$ support : num [1:15] 0.333 0.444 0.556 0.556 0.667 ...
## .. ..$ confidence: num [1:15] 0.333 0.444 0.556 0.556 0.667 ...
## .. ..$ coverage : num [1:15] 1 1 1 1 1 ...
## .. ..$ lift : num [1:15] 1 1 1 1 1 ...
## .. ..$ count : int [1:15] 3 4 5 5 6 3 3 3 3 3 ...
## ..@ info :List of 5
## .. ..$ data : symbol dd
## .. ..$ ntransactions: int 9
## .. ..$ support : num 0.25
## .. ..$ confidence : num 0.0075
## .. ..$ call : chr "apriori(data = dd, parameter = list(support = 0.25, confidence = 0.0075))"
inspect(sort(ars, by = "lift"))
## lhs rhs support confidence coverage lift count
## [1] {pens} => {cards} 0.3333333 1.0000000 0.3333333 1.500 3
## [2] {cards} => {pens} 0.3333333 0.5000000 0.6666667 1.500 3
## [3] {pencils} => {ruler} 0.3333333 0.7500000 0.4444444 1.350 3
## [4] {ruler} => {pencils} 0.3333333 0.6000000 0.5555556 1.350 3
## [5] {pencils} => {cards} 0.3333333 0.7500000 0.4444444 1.125 3
## [6] {cards} => {pencils} 0.3333333 0.5000000 0.6666667 1.125 3
## [7] {ruler} => {paper} 0.3333333 0.6000000 0.5555556 1.080 3
## [8] {paper} => {ruler} 0.3333333 0.6000000 0.5555556 1.080 3
## [9] {} => {pens} 0.3333333 0.3333333 1.0000000 1.000 3
## [10] {} => {pencils} 0.4444444 0.4444444 1.0000000 1.000 4
## [11] {} => {ruler} 0.5555556 0.5555556 1.0000000 1.000 5
## [12] {} => {paper} 0.5555556 0.5555556 1.0000000 1.000 5
## [13] {} => {cards} 0.6666667 0.6666667 1.0000000 1.000 6
## [14] {cards} => {paper} 0.3333333 0.5000000 0.6666667 0.900 3
## [15] {paper} => {cards} 0.3333333 0.6000000 0.5555556 0.900 3
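# Sketch: the five rules with an empty lhs ({} => item) just restate the single-item
# supports. Setting minlen = 2 (a documented apriori parameter, visible in the
# parameter listing above) keeps only rules with a non-empty antecedent; `ars2` is
# an illustrative name.
ars2 <- apriori(dd, parameter = list(support = 0.25, confidence = 0.0075, minlen = 2))
inspect(sort(ars2, by = "lift"))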
inspect(head(sort(ars, by = "lift")))
## lhs rhs support confidence coverage lift count
## [1] {pens} => {cards} 0.3333333 1.00 0.3333333 1.500 3
## [2] {cards} => {pens} 0.3333333 0.50 0.6666667 1.500 3
## [3] {pencils} => {ruler} 0.3333333 0.75 0.4444444 1.350 3
## [4] {ruler} => {pencils} 0.3333333 0.60 0.5555556 1.350 3
## [5] {pencils} => {cards} 0.3333333 0.75 0.4444444 1.125 3
## [6] {cards} => {pencils} 0.3333333 0.50 0.6666667 1.125 3
inspect(head(sort(ars, by = "confidence")))
## lhs rhs support confidence coverage lift count
## [1] {pens} => {cards} 0.3333333 1.0000000 0.3333333 1.500 3
## [2] {pencils} => {ruler} 0.3333333 0.7500000 0.4444444 1.350 3
## [3] {pencils} => {cards} 0.3333333 0.7500000 0.4444444 1.125 3
## [4] {} => {cards} 0.6666667 0.6666667 1.0000000 1.000 6
## [5] {ruler} => {pencils} 0.3333333 0.6000000 0.5555556 1.350 3
## [6] {ruler} => {paper} 0.3333333 0.6000000 0.5555556 1.080 3
# Rules with cards on the right-hand side (cards is the most frequent item)
inspect(subset(ars, subset=rhs %in% "cards"))
## lhs rhs support confidence coverage lift count
## [1] {} => {cards} 0.6666667 0.6666667 1.0000000 1.000 6
## [2] {pens} => {cards} 0.3333333 1.0000000 0.3333333 1.500 3
## [3] {pencils} => {cards} 0.3333333 0.7500000 0.4444444 1.125 3
## [4] {paper} => {cards} 0.3333333 0.6000000 0.5555556 0.900 3
inspect(head(subset(ars, subset = rhs %in% "pens"), by = "confidence"))
## lhs rhs support confidence coverage lift count
## [1] {cards} => {pens} 0.3333333 0.5000000 0.6666667 1.5 3
## [2] {} => {pens} 0.3333333 0.3333333 1.0000000 1.0 3
inspect(subset(ars, subset=rhs %in% "pencils"))
## lhs rhs support confidence coverage lift count
## [1] {} => {pencils} 0.4444444 0.4444444 1.0000000 1.000 4
## [2] {ruler} => {pencils} 0.3333333 0.6000000 0.5555556 1.350 3
## [3] {cards} => {pencils} 0.3333333 0.5000000 0.6666667 1.125 3
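# Optional sketch: since arulesViz is loaded, the rule set can also be filtered on
# lift and viewed graphically (standard arules/arulesViz calls; the 1.1 cutoff is
# an arbitrary illustration).
inspect(subset(ars, subset = lift > 1.1))
plot(ars, method = "scatterplot")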