# 读取数据
mydata <- read_csv("dataset_group.csv",col_names = FALSE)
# 大约有38种物品的大类,1139个购买记录
head(mydata)## # A tibble: 6 x 3
## X1 X2 X3
## <date> <int> <chr>
## 1 2000-01-01 1 yogurt
## 2 2000-01-01 1 pork
## 3 2000-01-01 1 sandwich bags
## 4 2000-01-01 1 lunch meat
## 5 2000-01-01 1 all- purpose
## 6 2000-01-01 1 flour
length(unique(mydata$X3))## [1] 38
length(unique(mydata$X2))## [1] 1139
## 查看客户购买了几样物品
Id_buy <- mydata %>%
group_by(X2)%>%
summarise(num = n())
ggplot(Id_buy,aes(x = X2,y = num)) +
theme_bw(base_family = "STKaiti",base_size = 10) +
geom_bar(stat = "identity",fill = "lightblue") +
labs(x = "客户ID",y = "商品数量")## 查看客户购买了几样物品
items_unm <- mydata %>%
group_by(X3)%>%
summarise(num = n())
ggplot(items_unm,aes(x = reorder(X3,num),y = num)) +
theme_bw(base_family = "STKaiti",base_size = 10) +
geom_bar(stat = "identity",fill = "lightblue") +
labs(x = "商品",y = "商品出现次数") +
coord_flip() +
geom_text(aes(x = reorder(X3,num),y = num + 50,label = num),size = 3)# 数据表转化为list
buy_data<- split(x=mydata$X3,f=mydata$X2)
# 查看一共有多少个实例
sum(sapply(buy_data,length))## [1] 22343
# 22343
# 过滤掉相同的实例
buy_data <- lapply(buy_data,unique)
sum(sapply(buy_data,length))## [1] 16753
# 16753
## 转换数据形势,用于关联规则分析
buy_data <- as(buy_data, "transactions")## 1:可视化频繁项集
## 出现的频率大于等于0.25的项目
par(cex = 0.7)
itemFrequencyPlot(buy_data,support = 0.25,col = "lightblue",
xlab = "Iterm Frequency",ylab = "Frequency",
main = "Frequency > 0.25")## 查看店铺和商品的稀疏矩阵图像
par(family = "STKaiti",cex = 0.75,mai = c(0,1,0,1))
arules::image(buy_data,xlab = "item number",ylab = "ID number",main = "ID--items")## 找到规则
myrule <- apriori(data = buy_data,
parameter = list(support = 0.25,
confidence = 0.4,
minlen = 1))## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.4 0.1 1 none FALSE TRUE 5 0.25 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 284
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[38 item(s), 1139 transaction(s)] done [0.00s].
## sorting and recoding items ... [38 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 done [0.00s].
## writing ... [57 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
## 找到了38个规则
summary(myrule)## set of 57 rules
##
## rule length distribution (lhs + rhs):sizes
## 1 2
## 2 55
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 2.000 2.000 1.965 2.000 2.000
##
## summary of quality measures:
## support confidence lift
## Min. :0.2704 Min. :0.4002 Min. :1.000
## 1st Qu.:0.2915 1st Qu.:0.4216 1st Qu.:1.053
## Median :0.3003 Median :0.7728 Median :1.065
## Mean :0.3107 Mean :0.6631 Mean :1.066
## 3rd Qu.:0.3108 3rd Qu.:0.7875 3rd Qu.:1.078
## Max. :0.7392 Max. :0.8378 Max. :1.133
##
## mining info:
## data ntransactions support confidence
## buy_data 1139 0.25 0.4
## 将规则按照提升度排序
sortlift <- arules::sort(myrule,decreasing = TRUE,by = "lift")
inspect(sortlift[1:15])## lhs rhs support confidence
## [1] {eggs} => {vegetables} 0.3266023 0.8378378
## [2] {vegetables} => {eggs} 0.3266023 0.4418052
## [3] {vegetables} => {yogurt} 0.3195786 0.4323040
## [4] {yogurt} => {vegetables} 0.3195786 0.8310502
## [5] {sugar} => {vegetables} 0.2976295 0.8248175
## [6] {vegetables} => {sugar} 0.2976295 0.4026128
## [7] {laundry detergent} => {vegetables} 0.3090430 0.8167053
## [8] {vegetables} => {laundry detergent} 0.3090430 0.4180523
## [9] {sandwich loaves} => {vegetables} 0.2827041 0.8090452
## [10] {aluminum foil} => {vegetables} 0.3107989 0.8082192
## [11] {vegetables} => {aluminum foil} 0.3107989 0.4204276
## [12] {waffles} => {vegetables} 0.3151888 0.7995546
## [13] {vegetables} => {waffles} 0.3151888 0.4263658
## [14] {paper towels} => {vegetables} 0.2897278 0.7990315
## [15] {sandwich bags} => {vegetables} 0.2932397 0.7971360
## lift
## [1] 1.133370
## [2] 1.133370
## [3] 1.124188
## [4] 1.124188
## [5] 1.115757
## [6] 1.115757
## [7] 1.104783
## [8] 1.104783
## [9] 1.094421
## [10] 1.093304
## [11] 1.093304
## [12] 1.081583
## [13] 1.081583
## [14] 1.080875
## [15] 1.078311
## 将结果保存为数据表的形势
ruledf <- as(myrule,"data.frame")## 可视化提升度最高的前20个规则
plot(sortlift[1:20], method="graph")## 可视化所有的规则
plot(sortlift, method="graph") ### 2:规则可视化为grouped matrix
## 可视化欺诈的产生过程
par(cex = 0.7)
plot(sortlift[1:20], method = "grouped matrix") plotly_arules(sortlift, method="matrix")