# Install arules only when it is missing. Re-installing a package that is
# already loaded is the usual trigger for the "Updating loaded packages"
# prompt/error in an interactive session.
if (!requireNamespace("arules", quietly = TRUE)) {
  install.packages("arules")
}
Error in install.packages : Updating loaded packages
library(arules)
library(arulesViz)
# Directory holding the canonical transaction files; kept in one place so the
# location only has to be changed once.
data_dir <- "/Users/jayavarshini/Desktop/ms/sem5/dm/assing2"

# Read one comma-separated transaction file, identified by its size label
# (e.g. "1k" -> "tr-1k-canonical.csv"), into an arules transactions object.
read_canonical <- function(size_label) {
  read.transactions(
    file.path(data_dir, paste0("tr-", size_label, "-canonical.csv")),
    sep = ","
  )
}

tr1 <- read_canonical("1k")
tr5 <- read_canonical("5k")
tr20 <- read_canonical("20k")
# tr75 is used by later itemFrequency()/apriori() calls but was never loaded.
# NOTE(review): file name assumed from the 1k/5k/20k naming pattern - confirm.
tr75 <- read_canonical("75k")
Restarting R session...
# Distribution of relative item frequencies (share of transactions that
# contain each item) in the 1k data set.
summary(itemFrequency(tr1, type = "relative"))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.02600 0.05675 0.07550 0.07076 0.08400 0.10800
# Distribution of relative item frequencies (share of transactions that
# contain each item) in the 5k data set.
summary(itemFrequency(tr5, type = "relative"))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.03820 0.06250 0.07530 0.07094 0.08325 0.11080
# Distribution of relative item frequencies (share of transactions that
# contain each item) in the 20k data set.
summary(itemFrequency(tr20, type = "relative"))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.04055 0.06294 0.07365 0.07113 0.08420 0.10985
# Distribution of relative item frequencies (share of transactions that
# contain each item) in tr75.
summary(itemFrequency(tr75, type = "relative"))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.04193 0.06211 0.07511 0.07099 0.08344 0.10924
# Mine frequent itemsets (not rules) from the 1k data at 4.5% minimum support.
f_is <- apriori(
  tr1,
  parameter = list(support = 0.045, target = "frequent itemsets")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 45
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 1000 transaction(s)] done [0.00s].
sorting and recoding items ... [42 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
writing ... [47 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# List the itemsets from most to least frequent. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
inspect(sort(f_is, decreasing = TRUE, by = "count"))
# Drop the result so the name can be reused for the next data set.
rm(f_is)
# Bar chart of per-item frequency in the 1k data.
itemFrequencyPlot(tr1)
# Repeat the frequent-itemset mining on the 5k data at 5% minimum support.
f_is <- apriori(
  tr5,
  parameter = list(support = 0.05, target = "frequent itemsets")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 250
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 5000 transaction(s)] done [0.00s].
sorting and recoding items ... [38 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 done [0.00s].
writing ... [39 set(s)] done [0.00s].
creating S4 object ... done [0.00s].
# List the itemsets from most to least frequent. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
inspect(sort(f_is, decreasing = TRUE, by = "count"))
# Drop the result so the name can be reused for the next data set.
rm(f_is)
# Bar chart of per-item frequency in the 5k data.
itemFrequencyPlot(tr5)
# Repeat the frequent-itemset mining on the 20k data at 6% minimum support.
f_is <- apriori(
  tr20,
  parameter = list(support = 0.06, target = "frequent itemsets")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 1200
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 20000 transaction(s)] done [0.01s].
sorting and recoding items ... [38 item(s)] done [0.00s].
creating transaction tree ... done [0.02s].
checking subsets of size 1 2 done [0.00s].
writing ... [38 set(s)] done [0.00s].
creating S4 object ... done [0.01s].
# List the itemsets from most to least frequent. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
inspect(sort(f_is, decreasing = TRUE, by = "count"))
# Drop the result so the name can be reused for the next data set.
rm(f_is)
# Bar chart of per-item frequency in the 20k data.
itemFrequencyPlot(tr20)
# Repeat the frequent-itemset mining on tr75 at 4.5% minimum support.
f_is <- apriori(
  tr75,
  parameter = list(support = 0.045, target = "frequent itemsets")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 3375
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 75000 transaction(s)] done [0.05s].
sorting and recoding items ... [38 item(s)] done [0.01s].
creating transaction tree ... done [0.08s].
checking subsets of size 1 2 done [0.00s].
writing ... [40 set(s)] done [0.00s].
creating S4 object ... done [0.03s].
# List the itemsets from most to least frequent. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
inspect(sort(f_is, decreasing = TRUE, by = "count"))
# Drop the result once it has been inspected.
rm(f_is)
# Bar chart of per-item frequency in tr75.
itemFrequencyPlot(tr75)
Using a support value of around 0.05 (0.045–0.06 in the runs above) gives us both 2-item sets and individual (1-item) sets with a good number of records. The most frequent item set, Gongolias Cookie, occurs 108 times in the 1k data set (support 0.108), which is a good support value.
# Mine association rules from the 1k data at 0.9% minimum support. The
# confidence threshold and target are apriori's defaults, written out here.
r <- apriori(
  tr1,
  parameter = list(support = 0.009, confidence = 0.8, target = "rules")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 9
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 1000 transaction(s)] done [0.00s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.00s].
writing ... [84 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Overview of the rule set mined from tr1 at support 0.009: rule count,
# rule-length distribution and quality-measure summaries.
summary(r)
set of 84 rules
rule length distribution (lhs + rhs):sizes
3 4 5
57 23 4
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.000 3.000 3.000 3.369 4.000 5.000
summary of quality measures:
support confidence lift count
Min. :0.01800 Min. :0.8000 Min. : 8.511 Min. :18.00
1st Qu.:0.01900 1st Qu.:0.9044 1st Qu.:11.218 1st Qu.:19.00
Median :0.02400 Median :0.9554 Median :12.545 Median :24.00
Mean :0.02568 Mean :0.9410 Mean :12.644 Mean :25.68
3rd Qu.:0.02950 3rd Qu.:1.0000 3rd Qu.:14.124 3rd Qu.:29.50
Max. :0.04000 Max. :1.0000 Max. :19.608 Max. :40.00
mining info:
# Show the six rules with the highest confidence.
inspect(head(r, n = 6L, by = "confidence"))
# Re-mine the 1k data with the support threshold lowered to 0.5%.
r1 <- apriori(
  tr1,
  parameter = list(support = 0.005, confidence = 0.8, target = "rules")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 5
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 1000 transaction(s)] done [0.00s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.00s].
writing ... [88 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Overview of the rule set mined from tr1 at support 0.005: rule count,
# rule-length distribution and quality-measure summaries.
summary(r1)
set of 88 rules
rule length distribution (lhs + rhs):sizes
3 4 5
61 23 4
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.000 3.000 3.000 3.352 4.000 5.000
summary of quality measures:
support confidence lift count
Min. :0.00500 Min. :0.8000 Min. : 8.403 Min. : 5.00
1st Qu.:0.01900 1st Qu.:0.9032 1st Qu.:11.029 1st Qu.:19.00
Median :0.02400 Median :0.9500 Median :12.516 Median :24.00
Mean :0.02476 Mean :0.9367 Mean :12.490 Mean :24.76
3rd Qu.:0.02900 3rd Qu.:1.0000 3rd Qu.:13.946 3rd Qu.:29.00
Max. :0.04000 Max. :1.0000 Max. :19.608 Max. :40.00
mining info:
# Install arulesViz only when it is not already available, so re-running the
# script does not download the package again.
if (!requireNamespace("arulesViz", quietly = TRUE)) {
  install.packages("arulesViz")
}
trying URL 'https://cran.rstudio.com/bin/macosx/el-capitan/contrib/3.5/arulesViz_1.3-2.tgz'
Content type 'application/x-gzip' length 762680 bytes (744 KB)
==================================================
downloaded 744 KB
The downloaded binary packages are in
/var/folders/6f/tkdd1c4j4pbbkbbzp07y2rgw0000gn/T//RtmpviW8Jj/downloaded_packages
library(arulesViz)
Loading required package: grid
# Show the six rules from the 1k data with the highest confidence.
inspect(head(r1, n = 6L, by = "confidence"))
# Interactive (htmlwidget) plot of the rules, with jitter switched off.
plot(r1, engine = "htmlwidget", jitter = 0)
# Mine association rules from the 20k data at 0.5% minimum support.
r20 <- apriori(
  tr20,
  parameter = list(support = 0.005, confidence = 0.8, target = "rules")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 100
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 20000 transaction(s)] done [0.01s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.01s].
checking subsets of size 1 2 3 4 5 done [0.01s].
writing ... [81 rule(s)] done [0.00s].
creating S4 object ... done [0.01s].
# Overview of the rule set mined from tr20: rule count, rule-length
# distribution and quality-measure summaries.
summary(r20)
set of 81 rules
rule length distribution (lhs + rhs):sizes
3 4 5
52 24 5
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.00 3.00 3.00 3.42 4.00 5.00
summary of quality measures:
support confidence lift count
Min. :0.02040 Min. :0.8016 Min. : 7.942 Min. :408.0
1st Qu.:0.02055 1st Qu.:0.9128 1st Qu.:12.533 1st Qu.:411.0
Median :0.02545 Median :0.9225 Median :13.407 Median :509.0
Mean :0.02471 Mean :0.9380 Mean :13.015 Mean :494.1
3rd Qu.:0.02810 3rd Qu.:0.9927 3rd Qu.:14.294 3rd Qu.:562.0
Max. :0.04100 Max. :1.0000 Max. :15.026 Max. :820.0
mining info:
# Show the six rules from the 20k data with the highest confidence.
inspect(head(r20, n = 6L, by = "confidence"))
lhs rhs support confidence lift count
[1] {Green Tea,
Lemon Cookie,
Lemon Lemonade,
Raspberry Cookie} => {Raspberry Lemonade} 0.02040 1.0000000 14.60920 408
[2] {Green Tea,
Lemon Cookie,
Raspberry Cookie,
Raspberry Lemonade} => {Lemon Lemonade} 0.02040 1.0000000 15.02630 408
[3] {Almond Twist,
Coffee Eclair,
Hot Coffee} => {Apple Pie} 0.02810 0.9982238 13.46222 562
[4] {Green Tea,
Raspberry Cookie,
Raspberry Lemonade} => {Lemon Lemonade} 0.02045 0.9975610 14.98965 409
[5] {Green Tea,
Lemon Cookie,
Lemon Lemonade,
Raspberry Lemonade} => {Raspberry Cookie} 0.02040 0.9975550 14.36364 408
[6] {Green Tea,
Lemon Lemonade,
Raspberry Cookie,
Raspberry Lemonade} => {Lemon Cookie} 0.02040 0.9975550 14.61619 408
# Interactive (htmlwidget) plot of the 20k rules, with jitter switched off.
plot(r20, engine = "htmlwidget", jitter = 0)
# Mine association rules from the 5k data at 0.5% minimum support.
r5 <- apriori(
  tr5,
  parameter = list(support = 0.005, confidence = 0.8, target = "rules")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 25
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 5000 transaction(s)] done [0.00s].
sorting and recoding items ... [50 item(s)] done [0.00s].
creating transaction tree ... done [0.00s].
checking subsets of size 1 2 3 4 5 done [0.00s].
writing ... [85 rule(s)] done [0.00s].
creating S4 object ... done [0.00s].
# Overview of the rule set mined from tr5: rule count, rule-length
# distribution and quality-measure summaries.
summary(r5)
set of 85 rules
rule length distribution (lhs + rhs):sizes
3 4 5
53 27 5
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.000 3.000 3.000 3.435 4.000 5.000
summary of quality measures:
support confidence lift count
Min. :0.02120 Min. :0.8030 Min. : 7.859 Min. :106.0
1st Qu.:0.02120 1st Qu.:0.9120 1st Qu.:12.059 1st Qu.:106.0
Median :0.02620 Median :0.9296 Median :13.478 Median :131.0
Mean :0.02573 Mean :0.9341 Mean :13.149 Mean :128.7
3rd Qu.:0.02980 3rd Qu.:0.9913 3rd Qu.:14.621 3rd Qu.:149.0
Max. :0.04080 Max. :1.0000 Max. :15.625 Max. :204.0
mining info:
# Show the six rules from the 5k data with the highest confidence.
inspect(head(r5, n = 6L, by = "confidence"))
# Interactive (htmlwidget) plot of the 5k rules, with jitter switched off.
plot(r5, engine = "htmlwidget", jitter = 0)
These rules have good confidence and high lift, showing that they are interesting findings.
# Mine association rules from tr75 at 0.5% minimum support.
r75 <- apriori(
  tr75,
  parameter = list(support = 0.005, confidence = 0.8, target = "rules")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 375
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 75000 transaction(s)] done [0.06s].
sorting and recoding items ... [50 item(s)] done [0.01s].
creating transaction tree ... done [0.07s].
checking subsets of size 1 2 3 4 5 done [0.01s].
writing ... [85 rule(s)] done [0.00s].
creating S4 object ... done [0.04s].
# Overview of the rule set mined from tr75: rule count, rule-length
# distribution and quality-measure summaries.
summary(r75)
set of 85 rules
rule length distribution (lhs + rhs):sizes
3 4 5
52 28 5
Min. 1st Qu. Median Mean 3rd Qu. Max.
3.000 3.000 3.000 3.447 4.000 5.000
summary of quality measures:
support confidence lift count
Min. :0.02059 Min. :0.8049 Min. : 7.924 Min. :1544
1st Qu.:0.02073 1st Qu.:0.9070 1st Qu.:12.873 1st Qu.:1555
Median :0.02093 Median :0.9229 Median :13.403 Median :1570
Mean :0.02454 Mean :0.9319 Mean :13.035 Mean :1840
3rd Qu.:0.02676 3rd Qu.:0.9922 3rd Qu.:14.514 3rd Qu.:2007
Max. :0.04111 Max. :1.0000 Max. :14.784 Max. :3083
mining info:
# Show the six rules from tr75 with the highest confidence.
inspect(head(r75, n = 6L, by = "confidence"))
lhs rhs support confidence lift count
[1] {Green Tea,
Lemon Cookie,
Raspberry Cookie,
Raspberry Lemonade} => {Lemon Lemonade} 0.02073333 1.0000000 14.65416 1555
[2] {Green Tea,
Lemon Cookie,
Lemon Lemonade,
Raspberry Cookie} => {Raspberry Lemonade} 0.02073333 1.0000000 14.76087 1555
[3] {Green Tea,
Lemon Cookie,
Lemon Lemonade,
Raspberry Lemonade} => {Raspberry Cookie} 0.02073333 1.0000000 14.78415 1555
[4] {Green Tea,
Lemon Lemonade,
Raspberry Cookie,
Raspberry Lemonade} => {Lemon Cookie} 0.02073333 0.9993573 14.69355 1555
[5] {Green Tea,
Lemon Cookie,
Raspberry Cookie} => {Raspberry Lemonade} 0.02073333 0.9980745 14.73245 1555
[6] {Green Tea,
Lemon Cookie,
Raspberry Cookie} => {Lemon Lemonade} 0.02073333 0.9980745 14.62594 1555
# Interactive (htmlwidget) plot of the tr75 rules, with jitter switched off.
plot(r75, engine = "htmlwidget", jitter = 0)
The rules have fairly high lift and confidence. Increasing the number of transactions seems to have produced different results with much more clarity. For example, the rule with Raspberry Lemonade as the consequent appears with a lower support value (together with its antecedent) in the 1k dataset, while in the 75k dataset it is the best rule found. The accuracy of the rules increases as the number of transactions in the dataset increases.
# Mine frequent itemsets (not rules) from tr75 at 5% minimum support.
f_is <- apriori(
  tr75,
  parameter = list(support = 0.05, target = "frequent itemsets")
)
Apriori
Parameter specification:
Algorithmic control:
Absolute minimum support count: 3750
set item appearances ...[0 item(s)] done [0.00s].
set transactions ...[50 item(s), 75000 transaction(s)] done [0.06s].
sorting and recoding items ... [38 item(s)] done [0.01s].
creating transaction tree ... done [0.08s].
checking subsets of size 1 2 done [0.00s].
writing ... [40 set(s)] done [0.00s].
creating S4 object ... done [0.04s].
# List the itemsets from most to least frequent. TRUE is spelled out because
# T is an ordinary variable that can be reassigned.
inspect(sort(f_is, decreasing = TRUE, by = "count"))
# Drop the result once it has been inspected.
rm(f_is)
# Bar chart of per-item frequency in tr75.
itemFrequencyPlot(tr75)