Association Rules and Frequent Pattern Mining

Data Mining II

Luigi Ruberto

Introduction

# Load the motifs table; the path is relative to the working directory.
dtab <- read.csv(file = "tabela4K40W40.csv")
# Per-column summary (min, quartiles, mean, max) of the first four columns.
summary(dtab[, seq_len(4)])
##       P1R4            P2R4            P3R4            P4R4      
##  Min.   : 0.00   Min.   : 0.00   Min.   : 0.00   Min.   : 0.00  
##  1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00   1st Qu.: 0.00  
##  Median : 1.00   Median : 1.00   Median : 1.00   Median : 1.00  
##  Mean   : 2.05   Mean   : 1.69   Mean   : 1.69   Mean   : 1.65  
##  3rd Qu.: 3.00   3rd Qu.: 2.00   3rd Qu.: 3.00   3rd Qu.: 2.00  
##  Max.   :13.00   Max.   :12.00   Max.   :12.00   Max.   :10.00

Sample

Sample of motifs table

# Print ten randomly drawn rows, restricted to the first eight columns and
# column 41 (shown as `Classe` in the printed output). No seed is set in the
# visible code, so the drawn rows may differ between runs.
# seq_len() is used instead of 1:nrow() so an empty table cannot produce the
# bogus index vector c(1, 0).
print(dtab[sample(seq_len(nrow(dtab)), 10), c(1:8, 41)])
##     P1R4 P2R4 P3R4 P4R4 P5R4 P6R4 P7R4 P8R4 Classe
## 77     1    0    0    0    2    1    1    0      N
## 256    1    0    2    3    0    1    1    1      M
## 271    5    6    4    6    3    2    0    1      E
## 50     0    0    1    0    1    0    1    0      N
## 199    3    2    0    1    1    2    4    1      N
## 71     2    2    2    2    0    0    0    0      N
## 114    9    6    9   10    6    3    8    9      N
## 286    0    6    3    5    4    1    1    0      E
## 237    3    1    2    4    4    0    2    0      M
## 97     1    0    2    1    1    1    1    1      N

Frequency of motifs

Barplot of the frequencies of motifs for each sound, divided by classes.

plot of chunk unnamed-chunk-4

First attempt to find rules

Application of the caren function with low values of min.sup and min.conf

# Low support/confidence thresholds for a first exploratory run.
min.sup <- 0.4
min.conf <- 0.7
# Pass the variables instead of repeating the literals, so each threshold is
# defined in exactly one place.
rls <- caren(dtab, min.sup = min.sup, min.conf = min.conf)
# Number of rules returned.
nrow(rls)
## [1] 22176

Rules with highest level of support

# Print (via ar.pp) the ten rules with the largest `Sup`, highest first.
# head() replaces `[1:10, ]` so a result with fewer than ten rules is not
# padded with NA rows; TRUE replaces the reassignable shorthand T.
ar.pp(head(rls[order(rls$Sup, decreasing = TRUE), ], 10))
## ( 0.8301 , 0.8301 ) P33R4=0 <- 
## ( 0.8237 , 0.8237 ) P40R4=0 <- 
## ( 0.8141 , 0.8141 ) P39R4=0 <- 
## ( 0.7853 , 0.7853 ) P36R4=0 <- 
## ( 0.7853 , 0.7853 ) P35R4=0 <- 
## ( 0.7724 , 0.7724 ) P38R4=0 <- 
## ( 0.7596 , 0.7596 ) P37R4=0 <- 
## ( 0.75 , 0.75 ) P34R4=0 <- 
## ( 0.7276 , 0.7276 ) P31R4=0 <- 
## ( 0.7244 , 0.7244 ) P29R4=0 <-

Rules with highest level of confidence

# Print (via ar.pp) the ten rules with the largest `Conf`, highest first.
# head() replaces `[1:10, ]` so a result with fewer than ten rules is not
# padded with NA rows; TRUE replaces the reassignable shorthand T.
ar.pp(head(rls[order(rls$Conf, decreasing = TRUE), ], 10))
## ( 0.4199 , 1 ) P14R4=0 <- P10R4=0 & P24R4=0 & P15R4=0
## ( 0.4071 , 1 ) P14R4=0 <- P15R4=0 & P24R4=0 & P25R4=0
## ( 0.4006 , 1 ) P14R4=0 <- P15R4=0 & P24R4=0 & P21R4=0 & P27R4=0
## ( 0.4006 , 1 ) P10R4=0 <- P18R4=0 & P27R4=0 & P37R4=0 & P21R4=0 & P14R4=0 & P25R4=0
## ( 0.4327 , 0.9926 ) P14R4=0 <- P15R4=0 & P24R4=0 & P27R4=0
## ( 0.4263 , 0.9925 ) P14R4=0 <- P15R4=0 & P24R4=0 & P21R4=0
## ( 0.4199 , 0.9924 ) P14R4=0 <- P10R4=0 & P24R4=0 & P28R4=0
## ( 0.4167 , 0.9924 ) P14R4=0 <- P15R4=0 & P24R4=0 & P18R4=0
## ( 0.4135 , 0.9923 ) P10R4=0 <- P18R4=0 & P21R4=0 & P37R4=0 & P14R4=0 & P25R4=0
## ( 0.4103 , 0.9922 ) P14R4=0 <- P10R4=0 & P18R4=0 & P15R4=0 & P37R4=0

Variation of number of rules with min.sup

plot of chunk unnamed-chunk-11

Variation of number of rules with min.conf

plot of chunk unnamed-chunk-12

Improvement

# Same thresholds as the first run, with `imp` additionally set to 0.01.
# NOTE(review): `imp` is presumably caren's minimum-improvement filter, as the
# section title suggests; confirm against the caren documentation.
rls_i <- caren(
  dtab,
  min.sup = 0.4,
  min.conf = 0.7,
  imp = 0.01
)

plot of chunk unnamed-chunk-14

Chi-squared

Application of the Chi-squared test to discard rules where the two parts are independent

# Same settings as the improvement run, with `chi = TRUE` added. The
# accompanying text describes this as a Chi-squared test used to discard
# rules whose two sides are independent.
rls_cs <- caren(
  dtab,
  min.sup = 0.4,
  min.conf = 0.7,
  imp = 0.01,
  chi = TRUE
)

plot of chunk unnamed-chunk-16

Lift

Variation of number of rules with lift

plot of chunk unnamed-chunk-17

Conviction

Variation of number of rules with conviction

plot of chunk unnamed-chunk-18

Conclusion

# Final run with all filters used in the previous sections applied together
# (support, confidence, improvement, chi-squared, conviction and lift).
# `chi = TRUE` is spelled out: T is an ordinary variable that can be
# reassigned, and the earlier chi-squared call already uses TRUE.
rls_f <- caren(
  dtab,
  min.sup = 0.45,
  min.conf = 0.9,
  imp = 0.04,
  chi = TRUE,
  conv = 5,
  lift = 1.3
)

Rules with highest level of support

# Print (via ar.pp) the ten final rules with the largest `Sup`, highest first.
# head() replaces `[1:10, ]` so a result with fewer than ten rules is not
# padded with NA rows; TRUE replaces the reassignable shorthand T.
ar.pp(head(rls_f[order(rls_f$Sup, decreasing = TRUE), ], 10))
## ( 0.5801 , 0.9679 ) P33R4=0 <- P13R4=0 & P40R4=0
## ( 0.5481 , 0.9716 ) P33R4=0 <- P13R4=0 & P16R4=0
## ( 0.5128 , 0.9816 ) P33R4=0 <- P13R4=0 & P16R4=0 & P40R4=0
## ( 0.5032 , 0.9573 ) P21R4=0 <- P10R4=0 & P27R4=0
## ( 0.5 , 0.9512 ) P25R4=0 <- P10R4=0 & P27R4=0
## ( 0.5 , 0.9512 ) P14R4=0 <- P15R4=0 & P27R4=0
## ( 0.4968 , 0.9748 ) P33R4=0 <- P13R4=0 & P38R4=0 & P40R4=0
## ( 0.4968 , 0.9451 ) P14R4=0 <- P10R4=0 & P27R4=0
## ( 0.4872 , 0.9744 ) P33R4=0 <- P13R4=0 & P16R4=0 & P35R4=0
## ( 0.484 , 0.9379 ) P21R4=0 <- P10R4=0 & P28R4=0

Rules with highest level of confidence

# Print (via ar.pp) the ten final rules with the largest `Conf`, highest first.
# head() replaces `[1:10, ]` so a result with fewer than ten rules is not
# padded with NA rows; TRUE replaces the reassignable shorthand T.
ar.pp(head(rls_f[order(rls_f$Conf, decreasing = TRUE), ], 10))
## ( 0.5128 , 0.9816 ) P33R4=0 <- P13R4=0 & P16R4=0 & P40R4=0
## ( 0.4968 , 0.9748 ) P33R4=0 <- P13R4=0 & P38R4=0 & P40R4=0
## ( 0.4872 , 0.9744 ) P33R4=0 <- P13R4=0 & P16R4=0 & P35R4=0
## ( 0.4647 , 0.9732 ) P33R4=0 <- P13R4=0 & P38R4=0 & P16R4=0
## ( 0.4647 , 0.9732 ) P33R4=0 <- P13R4=0 & P39R4=0 & P40R4=0
## ( 0.4615 , 0.973 ) P14R4=0 <- P10R4=0 & P24R4=0
## ( 0.4551 , 0.9726 ) P33R4=0 <- P13R4=0 & P36R4=0 & P40R4=0
## ( 0.5481 , 0.9716 ) P33R4=0 <- P13R4=0 & P16R4=0
## ( 0.5801 , 0.9679 ) P33R4=0 <- P13R4=0 & P40R4=0
## ( 0.4583 , 0.9662 ) P21R4=0 <- P10R4=0 & P27R4=0 & P14R4=0 & P25R4=0

Class as consequent

# Rule search with `H = "Classe"`; this section presents it as keeping
# `Classe` on the consequent side of every rule.
rls2 <- caren(
  dtab,
  min.sup = 0.2,
  min.conf = 0.7,
  H = "Classe"
)
# Number of rules returned.
nrow(rls2)
## [1] 52189

Variation of number of rules with min.sup

plot of chunk unnamed-chunk-24

Variation of number of rules with min.conf

plot of chunk unnamed-chunk-25

Improvement and first rules

# Re-run the `H = "Classe"` search with slightly higher thresholds and
# `imp = 0.01`; this overwrites the earlier `rls2`.
rls2 <- caren(
  dtab,
  min.sup = 0.25,
  min.conf = 0.72,
  H = "Classe",
  imp = 0.01
)

Rules with highest level of support

# Print (via ar.pp) the ten `rls2` rules with the largest `Sup`, highest first.
# head() replaces `[1:10, ]` so a result with fewer than ten rules is not
# padded with NA rows; TRUE replaces the reassignable shorthand T.
ar.pp(head(rls2[order(rls2$Sup, decreasing = TRUE), ], 10))
## ( 0.4006 , 0.731 ) Classe=N <- P18R4=0 & P25R4=0
## ( 0.4006 , 0.7225 ) Classe=N <- P18R4=0 & P14R4=0
## ( 0.3782 , 0.7239 ) Classe=N <- P23R4=0 & P25R4=0
## ( 0.3622 , 0.7338 ) Classe=N <- P18R4=0 & P14R4=0 & P37R4=0
## ( 0.3429 , 0.7279 ) Classe=N <- P14R4=0 & P23R4=0 & P27R4=0
## ( 0.3397 , 0.7211 ) Classe=N <- P22R4=0 & P14R4=0 & P27R4=0
## ( 0.3365 , 0.7292 ) Classe=N <- P14R4=0 & P23R4=0 & P31R4=0
## ( 0.3333 , 0.7429 ) Classe=N <- P23R4=0 & P25R4=0 & P31R4=0
## ( 0.3301 , 0.7203 ) Classe=N <- P22R4=0 & P27R4=0 & P23R4=0
## ( 0.3301 , 0.7203 ) Classe=N <- P10R4=0 & P31R4=0 & P25R4=0

Rules with highest level of confidence

# Print (via ar.pp) the ten `rls2` rules with the largest `Conf`, highest first.
# head() replaces `[1:10, ]` so a result with fewer than ten rules is not
# padded with NA rows; TRUE replaces the reassignable shorthand T.
ar.pp(head(rls2[order(rls2$Conf, decreasing = TRUE), ], 10))
## ( 0.2532 , 0.767 ) Classe=N <- P18R4=0 & P23R4=0 & P33R4=0 & P25R4=0
## ( 0.2628 , 0.7593 ) Classe=N <- P23R4=0 & P25R4=0 & P33R4=0 & P31R4=0
## ( 0.2724 , 0.7522 ) Classe=N <- P18R4=0 & P33R4=0 & P14R4=0 & P37R4=0
## ( 0.3077 , 0.75 ) Classe=N <- P18R4=0 & P33R4=0 & P25R4=0
## ( 0.2596 , 0.75 ) Classe=N <- P14R4=0 & P23R4=0 & P40R4=0 & P32R4=0 & P27R4=0
## ( 0.25 , 0.75 ) Classe=N <- P18R4=0 & P23R4=0 & P33R4=0 & P37R4=0
## ( 0.2724 , 0.7456 ) Classe=N <- P14R4=0 & P33R4=0 & P23R4=0 & P27R4=0
## ( 0.2532 , 0.7453 ) Classe=N <- P18R4=0 & P40R4=0 & P25R4=0 & P39R4=0 & P37R4=0
## ( 0.3333 , 0.7429 ) Classe=N <- P23R4=0 & P25R4=0 & P31R4=0
## ( 0.25 , 0.7429 ) Classe=N <- P18R4=0 & P40R4=0 & P25R4=0 & P31R4=0 & P37R4=0

Class as a cause

# Rule search with `A = "Classe"`; this section presents it as using
# `Classe` on the antecedent ("cause") side of the rules.
rls4 <- caren(
  dtab,
  min.sup = 0.3,
  min.conf = 0.6,
  A = "Classe",
  imp = 0.01
)

plot of chunk unnamed-chunk-30

First rules and improvement

# Stricter variant of the `A = "Classe"` search: higher support and
# confidence thresholds, same `imp` value.
rls5 <- caren(
  dtab,
  min.sup = 0.35,
  min.conf = 0.9,
  A = "Classe",
  imp = 0.01
)

Rules with highest level of support

# Print (via ar.pp) the ten `rls5` rules with the largest `Sup`, highest first.
# head() replaces `[1:10, ]` so a result with fewer than ten rules is not
# padded with NA rows; TRUE replaces the reassignable shorthand T.
ar.pp(head(rls5[order(rls5$Sup, decreasing = TRUE), ], 10))
## ( 0.4327 , 0.9 ) P39R4=0 <- Classe=N & P31R4=0
## ( 0.4135 , 0.9281 ) P33R4=0 <- Classe=N & P16R4=0
## ( 0.391 , 0.9173 ) P25R4=0 <- P10R4=0 & Classe=N
## ( 0.3846 , 0.9023 ) P14R4=0 <- P10R4=0 & Classe=N
## ( 0.3846 , 0.9302 ) P39R4=0 <- Classe=N & P31R4=0 & P36R4=0
## ( 0.3814 , 0.9015 ) P14R4=0 <- P15R4=0 & Classe=N
## ( 0.3814 , 0.9015 ) P39R4=0 <- Classe=N & P37R4=0 & P36R4=0
## ( 0.375 , 0.9286 ) P14R4=0 <- Classe=N & P27R4=0 & P39R4=0
## ( 0.375 , 0.9141 ) P39R4=0 <- Classe=N & P31R4=0 & P37R4=0
## ( 0.375 , 0.9 ) P27R4=0 <- Classe=N & P39R4=0 & P14R4=0

Rules with highest level of confidence

# Print (via ar.pp) the ten `rls5` rules with the largest `Conf`, highest first.
# head() replaces `[1:10, ]` so a result with fewer than ten rules is not
# padded with NA rows; TRUE replaces the reassignable shorthand T.
ar.pp(head(rls5[order(rls5$Conf, decreasing = TRUE), ], 10))
## ( 0.3526 , 0.9649 ) P14R4=0 <- P15R4=0 & Classe=N & P27R4=0
## ( 0.3526 , 0.9649 ) P25R4=0 <- P10R4=0 & Classe=N & P27R4=0
## ( 0.359 , 0.9492 ) P25R4=0 <- P10R4=0 & Classe=N & P21R4=0
## ( 0.3686 , 0.9426 ) P27R4=0 <- Classe=N & P21R4=0 & P14R4=0
## ( 0.3622 , 0.9417 ) P25R4=0 <- P10R4=0 & Classe=N & P14R4=0
## ( 0.359 , 0.9412 ) P14R4=0 <- P10R4=0 & Classe=N & P37R4=0
## ( 0.359 , 0.9412 ) P39R4=0 <- Classe=N & P32R4=0 & P31R4=0
## ( 0.359 , 0.9412 ) P33R4=0 <- Classe=N & P16R4=0 & P40R4=0
## ( 0.3686 , 0.935 ) P14R4=0 <- Classe=N & P21R4=0 & P27R4=0
## ( 0.3622 , 0.9339 ) P39R4=0 <- Classe=N & P14R4=0 & P31R4=0