数据预处理和探索

1:数据读取

# 读取数据
mydata <- read_csv("dataset_group.csv",col_names = FALSE)
# 大约有38种物品的大类,1139个购买记录
head(mydata)
## # A tibble: 6 x 3
##   X1            X2 X3           
##   <date>     <int> <chr>        
## 1 2000-01-01     1 yogurt       
## 2 2000-01-01     1 pork         
## 3 2000-01-01     1 sandwich bags
## 4 2000-01-01     1 lunch meat   
## 5 2000-01-01     1 all- purpose 
## 6 2000-01-01     1 flour
length(unique(mydata$X3))
## [1] 38
length(unique(mydata$X2))
## [1] 1139

2:数据可视化

2.1:可视化每个顾客购买了几件物品

## 查看客户购买了几样物品
Id_buy <- mydata %>%
  group_by(X2)%>%
  summarise(num = n())

ggplot(Id_buy,aes(x = X2,y = num)) +
  theme_bw(base_family = "STKaiti",base_size = 10) +
  geom_bar(stat = "identity",fill = "lightblue") +
  labs(x = "客户ID",y = "商品数量")

2.1:可视化每个商品出现的次数

## 查看客户购买了几样物品
items_unm <- mydata %>%
  group_by(X3)%>%
  summarise(num = n())

ggplot(items_unm,aes(x = reorder(X3,num),y = num)) +
  theme_bw(base_family = "STKaiti",base_size = 10) +
  geom_bar(stat = "identity",fill = "lightblue") +
  labs(x = "商品",y = "商品出现次数") + 
  coord_flip() +
  geom_text(aes(x = reorder(X3,num),y = num + 50,label = num),size = 3)

2 关联规则准备

2.1:数据准备

# 数据表转化为list
buy_data<- split(x=mydata$X3,f=mydata$X2)
# 查看一共有多少个实例
sum(sapply(buy_data,length))
## [1] 22343
# 22343
# 过滤掉相同的实例
buy_data <- lapply(buy_data,unique)
sum(sapply(buy_data,length))
## [1] 16753
# 16753
## 转换数据形势,用于关联规则分析
buy_data <- as(buy_data, "transactions")

2.2:可视化频繁项集

## 1:可视化频繁项集
## 出现的频率大于等于0.25的项目
par(cex = 0.7)
itemFrequencyPlot(buy_data,support = 0.25,col = "lightblue",
                  xlab = "Iterm Frequency",ylab = "Frequency",
                  main = "Frequency > 0.25")

2.3:客户和商品的稀疏矩阵图像

## 查看店铺和商品的稀疏矩阵图像
par(family = "STKaiti",cex = 0.75,mai = c(0,1,0,1)) 
arules::image(buy_data,xlab = "item number",ylab = "ID number",main = "ID--items")

3 关联规则分析

2.1:找到规则

## 找到规则
myrule <- apriori(data = buy_data,
                  parameter = list(support = 0.25,
                                   confidence = 0.4,
                                   minlen = 1))
## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.4    0.1    1 none FALSE            TRUE       5    0.25      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 284 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[38 item(s), 1139 transaction(s)] done [0.00s].
## sorting and recoding items ... [38 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [57 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].
## 找到了38个规则
summary(myrule)
## set of 57 rules
## 
## rule length distribution (lhs + rhs):sizes
##  1  2 
##  2 55 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   2.000   2.000   1.965   2.000   2.000 
## 
## summary of quality measures:
##     support         confidence          lift      
##  Min.   :0.2704   Min.   :0.4002   Min.   :1.000  
##  1st Qu.:0.2915   1st Qu.:0.4216   1st Qu.:1.053  
##  Median :0.3003   Median :0.7728   Median :1.065  
##  Mean   :0.3107   Mean   :0.6631   Mean   :1.066  
##  3rd Qu.:0.3108   3rd Qu.:0.7875   3rd Qu.:1.078  
##  Max.   :0.7392   Max.   :0.8378   Max.   :1.133  
## 
## mining info:
##      data ntransactions support confidence
##  buy_data          1139    0.25        0.4
## 将规则按照提升度排序
sortlift <- arules::sort(myrule,decreasing = TRUE,by = "lift")
inspect(sortlift[1:15])
##      lhs                    rhs                 support   confidence
## [1]  {eggs}              => {vegetables}        0.3266023 0.8378378 
## [2]  {vegetables}        => {eggs}              0.3266023 0.4418052 
## [3]  {vegetables}        => {yogurt}            0.3195786 0.4323040 
## [4]  {yogurt}            => {vegetables}        0.3195786 0.8310502 
## [5]  {sugar}             => {vegetables}        0.2976295 0.8248175 
## [6]  {vegetables}        => {sugar}             0.2976295 0.4026128 
## [7]  {laundry detergent} => {vegetables}        0.3090430 0.8167053 
## [8]  {vegetables}        => {laundry detergent} 0.3090430 0.4180523 
## [9]  {sandwich loaves}   => {vegetables}        0.2827041 0.8090452 
## [10] {aluminum foil}     => {vegetables}        0.3107989 0.8082192 
## [11] {vegetables}        => {aluminum foil}     0.3107989 0.4204276 
## [12] {waffles}           => {vegetables}        0.3151888 0.7995546 
## [13] {vegetables}        => {waffles}           0.3151888 0.4263658 
## [14] {paper towels}      => {vegetables}        0.2897278 0.7990315 
## [15] {sandwich bags}     => {vegetables}        0.2932397 0.7971360 
##      lift    
## [1]  1.133370
## [2]  1.133370
## [3]  1.124188
## [4]  1.124188
## [5]  1.115757
## [6]  1.115757
## [7]  1.104783
## [8]  1.104783
## [9]  1.094421
## [10] 1.093304
## [11] 1.093304
## [12] 1.081583
## [13] 1.081583
## [14] 1.080875
## [15] 1.078311
## 将结果保存为数据表的形势
ruledf <- as(myrule,"data.frame")

2.2:规则可视化

1:规则可视化为网络图

## 可视化提升度最高的前20个规则
plot(sortlift[1:20], method="graph")

## 可视化所有的规则
plot(sortlift, method="graph")

### 2:规则可视化为grouped matrix

## 可视化欺诈的产生过程
par(cex = 0.7)
plot(sortlift[1:20], method = "grouped matrix")  

3:规则可视化为可交互图

plotly_arules(sortlift, method="matrix")