library(mlbench)
library(readr)
library(data.table)
library(stats)
library(arules)
## Loading required package: Matrix
## 
## Attaching package: 'arules'
## The following objects are masked from 'package:base':
## 
##     abbreviate, write
library(arulesViz)
library(kernlab)
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:arules':
## 
##     size
library(Matrix)
data=paste("paper,pens,pencils", "paper,ruler","pencils,cards,ruler", "paper,cards", "pens,pencils,cards", "paper,ruler,calculator", "paper, pens,cards, folder", "pencils, ruler, folder", "paper, pencils,cards,ruler", "pens,cards", sep="\n")
#There are 10 items in the set
# Step 1: Set the support threshold. minsup(X) = 25%. Step 2: Generate all candidate 1-itemsets
# Step 2: Generate all candidates 1-items
#Below are all the 1 item sets and the Support, low frequency items, folder and calculator, are pruned as they do not reach the 25% minimum support threshold
# Item   Support_Count   Support
# Paper     5             .50
# Pens      5             .50
# Pencils   5             .50
#Cards      6             .60
#Ruler      5             .50
#Folder     2             .20
#Calculator 1             .10

#Step 3: Calculate Support and Pruning for all.
# The table in step 3 is everything from the table above except the folder and calculator

#Step 4 and Step 5: Generate all candidate 2 item sets and calculate the support count, the Support, and the 25% minimum threshold support pruning. This 2 item set is taken from original data.
#Items            Support_Count     Support
#Paper, Pens}         2               .2
#{Paper, Pencils}     2               .2
#{Paper, Cards}       3               .3
#{Paper, Ruler}       2               .2
#{Pens, Pencils}      2               .2
#{Pens, Cards}        3               .3
#{Pens, Ruler}        1               .1
#{Pencils, Cards}     3               .3
#{Pencils, Ruler}     3               .3
#{Cards, Ruler}       2               .2
# Out of the 2 item sets, {Paper,Cards}, {Pens, Cards},{Pencils, Cards}, and {Pencils, Ruler} are the 4, 2-item transaction sets that meet the threshold standard. Therefore, Paper, Cards, Pens, and Ruler remain after pruning.
# Step 6:Generate higher order Itemsets (3-itemsets, 4-itemsets, and so on, each time calculating support and pruning items that are below the threshold. The algorithm stops when no new frequent itemsets can be generated or when the item sets reach the size of the largest transaction.
#Items                  Support_Count         Support
#{Paper,Cards,Pens}         1                   .1
#{Paper, Cards, Pencils}    1                   .1
#{Paper,Cards,Ruler}        1                   .1
#{Paper, Pens, Pencils}     1                   .1
#{Paper, Pens, Ruler}       1                   .1
#{Cards, Pens, Pencils}     1                   .1
#{Cards, Pencils, Ruler}    1                   .1
#{Cards, Pencils, Ruler}    2                   .2    (This is a subset from the 4 itme set)
#{Pens, Pencils, Ruler}     0                    0
# None of the 3 item sets meet the the 25% support threshold
#Step 7: Create Association Rules. and then evaluate these rules for their usefulness based on metrics like confidence and lift. 30 percent of the paper cards is taken from the 2 item set for support.
#Rule           Support       Confidence      Lift
#{Paper,Cards}    .3            3/5=.6        0.6/0.6.=1  #Lift=Confidence/Support of the Consequent(Cards)
#{Cards,Paper}    .3            3/6=.5        0.5/0.5= 1  #Confidence=Support(Card,Paper)/Support(Cards)
#{Pen,Cards}      .3            3/5=.6        0.6/0.6= 1
#{Cards, Pen}     .3            3/6=.5        0.5/0.5= 1
#{Pencil,Cards}   .3            3/5= .6        0.6/0.6= 1
#{Cards, Pencil}  .3            3/6= .5       0.5/0.5= 1
#{Pencils, Ruler} .3            3/5=.6        0.6/0.5= 1.2
#{Ruler, Pencils} .3            3/5= .6       0.6/0.5=1.2
#Based on the Confidence and Lift of the dataset, If a person purchases Pencils they have a greater chance(1.2 times as likely) to purchase a Ruler. All the other have a Lift of 1.0 which shows independence and the rules are not interesting.
cat(data)
## paper,pens,pencils
## paper,ruler
## pencils,cards,ruler
## paper,cards
## pens,pencils,cards
## paper,ruler,calculator
## paper, pens,cards, folder
## pencils, ruler, folder
## paper, pencils,cards,ruler
## pens,cards
write(data, file = "hw_basket.txt")
tr <- read.transactions("hw_basket.txt", format = "basket", sep = ",", skip = 1)
inspect(tr)
##     items                         
## [1] {paper, ruler}                
## [2] {cards, pencils, ruler}       
## [3] {cards, paper}                
## [4] {cards, pencils, pens}        
## [5] {calculator, paper, ruler}    
## [6] {cards, folder, paper, pens}  
## [7] {folder, pencils, ruler}      
## [8] {cards, paper, pencils, ruler}
## [9] {cards, pens}
itemLabels(tr)
## [1] "calculator" "cards"      "folder"     "paper"      "pencils"   
## [6] "pens"       "ruler"
dd <- as(tr,"transactions")
class(dd)
## [1] "transactions"
## attr(,"package")
## [1] "arules"
str(dd)
## Formal class 'transactions' [package "arules"] with 3 slots
##   ..@ data       :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
##   .. .. ..@ i       : int [1:26] 3 6 1 4 6 1 3 1 4 5 ...
##   .. .. ..@ p       : int [1:10] 0 2 5 7 10 13 17 20 24 26
##   .. .. ..@ Dim     : int [1:2] 7 9
##   .. .. ..@ Dimnames:List of 2
##   .. .. .. ..$ : NULL
##   .. .. .. ..$ : NULL
##   .. .. ..@ factors : list()
##   ..@ itemInfo   :'data.frame':  7 obs. of  1 variable:
##   .. ..$ labels: chr [1:7] "calculator" "cards" "folder" "paper" ...
##   ..@ itemsetInfo:'data.frame':  0 obs. of  0 variables
summary(dd)
## transactions as itemMatrix in sparse format with
##  9 rows (elements/itemsets/transactions) and
##  7 columns (items) and a density of 0.4126984 
## 
## most frequent items:
##   cards   paper   ruler pencils    pens (Other) 
##       6       5       5       4       3       3 
## 
## element (itemset/transaction) length distribution:
## sizes
## 2 3 4 
## 3 4 2 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   3.000   2.889   3.000   4.000 
## 
## includes extended item information - examples:
##       labels
## 1 calculator
## 2      cards
## 3     folder
itemFrequencyPlot(dd, topN = 7, support=0.25,cex.names=0.8)

## Mining Associations with the "Apriori" Algorithm
ars <- apriori(dd, parameter=list(support=0.25, confidence=0.0075))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##      0.0075    0.1    1 none FALSE            TRUE       5    0.25      1
##  maxlen target  ext
##      10  rules TRUE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 2 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[7 item(s), 9 transaction(s)] done [0.00s].
## sorting and recoding items ... [5 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [15 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
ars
## set of 15 rules
str(ars)
## Formal class 'rules' [package "arules"] with 4 slots
##   ..@ lhs    :Formal class 'itemMatrix' [package "arules"] with 3 slots
##   .. .. ..@ data       :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
##   .. .. .. .. ..@ i       : int [1:10] 5 1 4 6 4 1 6 3 3 1
##   .. .. .. .. ..@ p       : int [1:16] 0 0 0 0 0 0 1 2 3 4 ...
##   .. .. .. .. ..@ Dim     : int [1:2] 7 15
##   .. .. .. .. ..@ Dimnames:List of 2
##   .. .. .. .. .. ..$ : NULL
##   .. .. .. .. .. ..$ : NULL
##   .. .. .. .. ..@ factors : list()
##   .. .. ..@ itemInfo   :'data.frame':    7 obs. of  1 variable:
##   .. .. .. ..$ labels: chr [1:7] "calculator" "cards" "folder" "paper" ...
##   .. .. ..@ itemsetInfo:'data.frame':    0 obs. of  0 variables
##   ..@ rhs    :Formal class 'itemMatrix' [package "arules"] with 3 slots
##   .. .. ..@ data       :Formal class 'ngCMatrix' [package "Matrix"] with 5 slots
##   .. .. .. .. ..@ i       : int [1:15] 5 4 6 3 1 1 5 6 4 1 ...
##   .. .. .. .. ..@ p       : int [1:16] 0 1 2 3 4 5 6 7 8 9 ...
##   .. .. .. .. ..@ Dim     : int [1:2] 7 15
##   .. .. .. .. ..@ Dimnames:List of 2
##   .. .. .. .. .. ..$ : NULL
##   .. .. .. .. .. ..$ : NULL
##   .. .. .. .. ..@ factors : list()
##   .. .. ..@ itemInfo   :'data.frame':    7 obs. of  1 variable:
##   .. .. .. ..$ labels: chr [1:7] "calculator" "cards" "folder" "paper" ...
##   .. .. ..@ itemsetInfo:'data.frame':    0 obs. of  0 variables
##   ..@ quality:'data.frame':  15 obs. of  5 variables:
##   .. ..$ support   : num [1:15] 0.333 0.444 0.556 0.556 0.667 ...
##   .. ..$ confidence: num [1:15] 0.333 0.444 0.556 0.556 0.667 ...
##   .. ..$ coverage  : num [1:15] 1 1 1 1 1 ...
##   .. ..$ lift      : num [1:15] 1 1 1 1 1 ...
##   .. ..$ count     : int [1:15] 3 4 5 5 6 3 3 3 3 3 ...
##   ..@ info   :List of 5
##   .. ..$ data         : symbol dd
##   .. ..$ ntransactions: int 9
##   .. ..$ support      : num 0.25
##   .. ..$ confidence   : num 0.0075
##   .. ..$ call         : chr "apriori(data = dd, parameter = list(support = 0.25, confidence = 0.0075))"
inspect(  sort (ars, by = "lift"))
##      lhs          rhs       support   confidence coverage  lift  count
## [1]  {pens}    => {cards}   0.3333333 1.0000000  0.3333333 1.500 3    
## [2]  {cards}   => {pens}    0.3333333 0.5000000  0.6666667 1.500 3    
## [3]  {pencils} => {ruler}   0.3333333 0.7500000  0.4444444 1.350 3    
## [4]  {ruler}   => {pencils} 0.3333333 0.6000000  0.5555556 1.350 3    
## [5]  {pencils} => {cards}   0.3333333 0.7500000  0.4444444 1.125 3    
## [6]  {cards}   => {pencils} 0.3333333 0.5000000  0.6666667 1.125 3    
## [7]  {ruler}   => {paper}   0.3333333 0.6000000  0.5555556 1.080 3    
## [8]  {paper}   => {ruler}   0.3333333 0.6000000  0.5555556 1.080 3    
## [9]  {}        => {pens}    0.3333333 0.3333333  1.0000000 1.000 3    
## [10] {}        => {pencils} 0.4444444 0.4444444  1.0000000 1.000 4    
## [11] {}        => {ruler}   0.5555556 0.5555556  1.0000000 1.000 5    
## [12] {}        => {paper}   0.5555556 0.5555556  1.0000000 1.000 5    
## [13] {}        => {cards}   0.6666667 0.6666667  1.0000000 1.000 6    
## [14] {cards}   => {paper}   0.3333333 0.5000000  0.6666667 0.900 3    
## [15] {paper}   => {cards}   0.3333333 0.6000000  0.5555556 0.900 3
inspect( head( sort (ars, by = "lift")))
##     lhs          rhs       support   confidence coverage  lift  count
## [1] {pens}    => {cards}   0.3333333 1.00       0.3333333 1.500 3    
## [2] {cards}   => {pens}    0.3333333 0.50       0.6666667 1.500 3    
## [3] {pencils} => {ruler}   0.3333333 0.75       0.4444444 1.350 3    
## [4] {ruler}   => {pencils} 0.3333333 0.60       0.5555556 1.350 3    
## [5] {pencils} => {cards}   0.3333333 0.75       0.4444444 1.125 3    
## [6] {cards}   => {pencils} 0.3333333 0.50       0.6666667 1.125 3
inspect( head( sort (ars, by = "confidence")))
##     lhs          rhs       support   confidence coverage  lift  count
## [1] {pens}    => {cards}   0.3333333 1.0000000  0.3333333 1.500 3    
## [2] {pencils} => {ruler}   0.3333333 0.7500000  0.4444444 1.350 3    
## [3] {pencils} => {cards}   0.3333333 0.7500000  0.4444444 1.125 3    
## [4] {}        => {cards}   0.6666667 0.6666667  1.0000000 1.000 6    
## [5] {ruler}   => {pencils} 0.3333333 0.6000000  0.5555556 1.350 3    
## [6] {ruler}   => {paper}   0.3333333 0.6000000  0.5555556 1.080 3
# Rules involving high median price income
# X --> HousesWith_HighMedianValue 
#inspect(head(subset(ars, subset=rhs %in% "experience_level=FT"),5,by="confidence"))
inspect(subset(ars, subset=rhs %in% "cards"))
##     lhs          rhs     support   confidence coverage  lift  count
## [1] {}        => {cards} 0.6666667 0.6666667  1.0000000 1.000 6    
## [2] {pens}    => {cards} 0.3333333 1.0000000  0.3333333 1.500 3    
## [3] {pencils} => {cards} 0.3333333 0.7500000  0.4444444 1.125 3    
## [4] {paper}   => {cards} 0.3333333 0.6000000  0.5555556 0.900 3
inspect(head(  subset(ars, subset=rhs %in% "pens"), by = "confidence"))
##     lhs        rhs    support   confidence coverage  lift count
## [1] {cards} => {pens} 0.3333333 0.5000000  0.6666667 1.5  3    
## [2] {}      => {pens} 0.3333333 0.3333333  1.0000000 1.0  3
inspect(subset(ars, subset=rhs %in% "pencils"))
##     lhs        rhs       support   confidence coverage  lift  count
## [1] {}      => {pencils} 0.4444444 0.4444444  1.0000000 1.000 4    
## [2] {ruler} => {pencils} 0.3333333 0.6000000  0.5555556 1.350 3    
## [3] {cards} => {pencils} 0.3333333 0.5000000  0.6666667 1.125 3