## 2016年厦门大数据大赛
## 第二题:基于大数据的商品销售预测及关联销售挖掘
## author:Daitu
## 2016-7-12
## 从数据集中针对店铺,进行关联销售挖掘
## 店铺的相似性
## 更改工作文件夹------------------------------------
setwd("/Users/daitu/数据分析/2016ABD")
getwd()
## [1] "/Users/daitu/数据分析/2016ABD"
rm(list = ls());gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 368128 19.7 592000 31.7 460000 24.6
## Vcells 568002 4.4 1023718 7.9 786371 6.0
## 加载所需要的包-----------------------------------
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following object is masked from 'package:dplyr':
##
## recode
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(stringr)
# sessionInfo()
## 第一步:读取数据####-------------------------------------------
# item_id 每一个商品链接的独有的ID ----字符串-------
# item_number 商品的款号 ----字符串-------
# shop_id 销售这件商品的店铺ID ----字符串-------
# shop_type 销售这件商品的店铺类型,分为TB_TMALL与TB_JISHI两种 ----字符串-------
# brand_name 商品的品牌名 ----字符串-------
# item_name 商品名(商品标题) ----字符串-------
# price 该商品的销售价格(此处对各sku取均值)
# tag_price 商品的标签价
# monthly_sales_num 商品月销量
# assessment_num 商品评价数
# monthly_sales 月销售额 -----价格乘以销量-----
## 商品销售
load("第二题数据/item_fact.RData")
head(item_fact)
## # A tibble: 6 × 11
## item_id item_number shop_id shop_type brand_name
## <chr> <chr> <chr> <chr> <chr>
## 1 10003087358 UB 64971284 TB_JISHI umbro
## 2 10005342950 2011030511534230 61051459 TB_JISHI 美津浓
## 3 10006292251 467046-401 64767888 TB_JISHI nike
## 4 10010133546 YKL40 65507365 TB_TMALL 骐煌
## 5 10013604183 AWDK288 62687027 TB_TMALL 李宁
## 6 10014801651 7X50 35714901 TB_JISHI yukon
## # ... with 6 more variables: item_name <chr>, price <dbl>,
## # tag_price <dbl>, monthly_sales_num <dbl>, assessment_num <dbl>,
## # monthly_sales <dbl>
summary(item_fact)
## item_id item_number shop_id
## Length:947276 Length:947276 Length:947276
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## shop_type brand_name item_name
## Length:947276 Length:947276 Length:947276
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## price tag_price monthly_sales_num
## Min. :0.00e+00 Min. :0.00e+00 Min. :0.00e+00
## 1st Qu.:1.31e+02 1st Qu.:1.68e+02 1st Qu.:0.00e+00
## Median :2.14e+02 Median :2.88e+02 Median :0.00e+00
## Mean :5.16e+02 Mean :5.83e+02 Mean :8.29e+00
## 3rd Qu.:4.15e+02 3rd Qu.:4.99e+02 3rd Qu.:1.00e+00
## Max. :1.00e+08 Max. :1.00e+08 Max. :2.48e+05
## assessment_num monthly_sales
## Min. : -1.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0
## Median : 0.0 Median : 0
## Mean : 34.7 Mean : 1652
## 3rd Qu.: 3.0 3rd Qu.: 225
## Max. :891523.0 Max. :4678400
## 查看多少商品有销量
sum(item_fact$monthly_sales_num > 0) #约有305218商品有销售出去的纪录
## [1] 305218
## 只对有销售纪录的商品进行关联分析
item_fact <- item_fact[which(item_fact$monthly_sales_num > 0),]
## 商品的数目
iten_number <- as.data.frame(table(item_fact$item_number))
dim(iten_number) ## 127137 种商品编号
## [1] 127137 2
# ## 只分析出现频次大于1的商品
# iten_number <- iten_number[iten_number$Freq>1,]
# dim(iten_number) ## 39932 种商品编号
bb <- item_fact[item_fact$item_number == "00001",]
## 商品编号无法唯一定义商品,所以要剔除商品编号异常的数据,它们的存在没有帮组
## 提出商品编号带有:“-”,“*”,“#”,“。”等,
## 即只保留开始和结尾都是数字或者为字母的商品
iten_number <- iten_number[grepl("^[[:alnum:]]+[A-Za-z0-9\\+-]+[[:alnum:]]$",iten_number$Var1),]
## 将商品编号只有0、1的数据删除,因为不能唯一定位
iten_number <- iten_number[!grepl("^[0-1]+[0]+[0-1]$",iten_number$Var1),]
item_fact <- item_fact[item_fact$item_number %in% iten_number$Var1,]
dim(item_fact) # 还有283597 条纪录
## [1] 283597 11
n_distinct(item_fact$shop_id) # 16858个店铺
## [1] 16858
n_distinct(item_fact$brand_name) # 3939 种品牌
## [1] 3939
## 查看每个商铺大约有多少件商品出售####-------------------
shop <- item_fact %>%
dplyr::group_by(shop_id) %>% ## 将数据按照店铺的ID进行分组
dplyr::summarise(n_item = n_distinct(item_number)) ## 该店铺共销售了多少种商品
summary(shop)
## shop_id n_item
## Length:16858 Min. : 1.00
## Class :character 1st Qu.: 2.00
## Mode :character Median : 5.00
## Mean : 16.54
## 3rd Qu.: 13.00
## Max. :5501.00
# shop <- as.data.frame(table(item_fact$shop_id))
# summary(shop$Freq)
## 销售商品数目的分布
p1 <- ggplot(shop,aes(n_item)) +
theme_bw(base_family = "STKaiti") +
geom_histogram(binwidth = 20,color = "firebrick",fill = "red",alpha = 0.5)+
labs(x = "销售商品数目(件)",y = "店铺的数目(家)",title = "店铺销售商品数目分布")
p2 <- ggplot(shop[shop$n_item<=500,],aes(n_item)) +
theme_bw(base_family = "STKaiti") +
geom_histogram(binwidth = 5,color = "firebrick",fill = "red",alpha = 0.5)+
labs(x = "销售商品数目(<=500件)",y = "店铺的数目(家)",title = "店铺销售商品数目分布")
gridExtra::grid.arrange(p1,p2,nrow = 2)

## 大多数上架的销售商品数目小于100件
## 将数据转化为适合关联规则的形式####----------------------------------
## 店铺-商品数据
arule_data <- item_fact[c("shop_id","item_number")]
str(arule_data)
## Classes 'tbl_df', 'tbl' and 'data.frame': 283597 obs. of 2 variables:
## $ shop_id : chr "65507365" "62204349" "62687027" "36179219" ...
## $ item_number: chr "YKL40" "DL00150" "AKLK631" "M21D013" ...
## 将数据保存为便于csv文件
write.csv(arule_data,file = "第二题数据/店铺_商品.txt",row.names = FALSE)
## 店铺-品牌数据
arule_brand <- item_fact[c("shop_id","brand_name")]
str(arule_brand)
## Classes 'tbl_df', 'tbl' and 'data.frame': 283597 obs. of 2 variables:
## $ shop_id : chr "65507365" "62204349" "62687027" "36179219" ...
## $ brand_name: chr "骐煌" "sherpa" "李宁" "匹克" ...
aa <- as.data.frame(table(arule_brand$brand_name))
bb <- item_fact[item_fact$brand_name == "0",]
## 虽然有些品牌的名称是不正确的,但是我们能最后的到在规则中不考虑这些错误的品牌
## 将数据保存为便于csv文件
write.csv(arule_brand,file = "第二题数据/店铺_品牌.txt",row.names = FALSE)
## 对店铺和销售的商品数据进行关联规则分析####-----------------------------
## 读取关联分析数据
arudata <- read.transactions(file = "第二题数据/店铺_商品.txt",format = "single",
sep = ",",cols = c(1,2),rm.duplicates = TRUE)
## distribution of transactions with duplicates:
## items
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 1088 339 151 72 44 26 28 15 10 9 6 7 9 5 8
## 16 17 18 19 20 22 23 24 25 27 28 31 33 36 37
## 3 3 7 1 1 2 1 3 2 2 2 1 1 1 1
## 41 45 46 47 51
## 1 1 1 1 1
## 查看数据
summary(arudata)
## transactions as itemMatrix in sparse format with
## 16859 rows (elements/itemsets/transactions) and
## 111656 columns (items) and a density of 0.0001481251
##
## most frequent items:
## M20324 130690-002 C77124 511882-094 574 (Other)
## 229 203 203 167 144 277886
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 3408 2093 1493 1178 908 775 614 547 500 391 371 283 246 230 221
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 223 176 153 143 150 149 133 124 105 84 83 100 88 92 98
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 85 69 69 57 64 60 48 38 51 37 60 49 36 40 38
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 38 36 32 23 21 17 22 21 19 20 16 18 26 18 15
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 8 11 14 16 12 8 6 10 5 6 2 5 3 3 8
## 76 77 78 79 80 81 82 85 86 87 88 89 90 91 92
## 4 1 1 4 3 6 3 2 2 3 2 2 1 3 2
## 93 94 95 96 97 98 99 101 102 104 105 106 107 108 109
## 2 4 4 2 2 4 3 2 2 3 2 7 1 2 2
## 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124
## 2 4 4 1 2 3 2 1 2 3 1 3 3 2 1
## 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
## 1 4 2 2 6 2 2 2 5 2 2 2 2 1 2
## 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154
## 2 2 1 4 2 1 2 1 2 3 2 4 3 2 1
## 156 157 158 159 160 161 162 164 166 167 169 170 171 172 173
## 1 1 4 2 6 2 7 3 4 1 4 1 1 1 2
## 174 175 176 177 179 181 182 187 189 191 192 193 195 196 197
## 2 2 1 1 1 2 1 1 1 1 2 1 2 2 1
## 198 201 203 204 205 206 207 208 210 211 212 213 216 217 218
## 1 2 2 2 1 1 1 1 2 2 1 2 2 1 3
## 219 220 221 224 225 226 227 229 230 232 233 234 235 237 238
## 1 1 2 1 1 1 1 2 1 1 1 2 2 3 1
## 240 242 243 245 246 247 248 249 251 253 254 255 258 259 260
## 1 1 1 2 3 1 1 1 4 1 1 2 2 1 1
## 263 264 265 266 267 268 274 278 280 282 284 286 287 288 291
## 1 1 1 3 1 1 1 1 1 2 1 1 1 1 2
## 294 295 298 301 304 310 311 313 317 320 321 324 326 327 328
## 1 2 1 1 3 1 2 1 2 1 3 1 2 1 1
## 331 333 337 338 341 343 345 350 352 353 361 364 367 371 376
## 3 1 2 1 1 1 1 1 1 2 1 1 1 1 2
## 377 380 400 407 408 411 416 419 421 422 442 443 446 454 478
## 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1
## 496 506 508 512 517 529 541 544 575 578 591 599 607 615 626
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 628 633 637 667 668 672 676 715 728 749 757 791 802 807 815
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 838 867 930 957 1028 1043 1064 1066 1132 1147 1149 1278 1323 1494 1585
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 2287 2302 2528 5501
## 1 1 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 2.00 5.00 16.54 13.00 5501.00
##
## includes extended item information - examples:
## labels
## 1 0-011
## 2 0-23
## 3 0-24
##
## includes extended transaction information - examples:
## transactionID
## 1 100002563
## 2 100005363
## 3 100005546
inspect(arudata[1:2])
## items transactionID
## [1] {10372601,
## 1272298-100,
## 1273365-001,
## 1278852-001,
## 705248-010,
## 714965-010,
## 714965-021,
## 729SPM0057082,
## 742504-010,
## 747445-100,
## 749324-100,
## 808924-672,
## 808951-451,
## 819217-307,
## 819712-307,
## 831975-870,
## 831977-870,
## 835611-010,
## AA0881,
## AF4083,
## AF4693,
## AQ5000,
## AQ5152,
## P1GA165844,
## S76650,
## S90230,
## S90391,
## SP0284-100,
## SX4854-701} 100002563
## [2] {51003,
## 639669,
## 639790,
## 725717,
## 776114,
## 799108,
## 819006,
## AE5427,
## TA6BB632} 100005363
## 可视化商品的频率图
## itemFrequency(item_arules)
par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(arudata,type = "absolute",topN = 50,main = "商品频率图top50",
xlab = "商品编号",ylab = "频数",col = "LightSkyBlue")

par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(arudata,type = "relative",topN = 50,main = "商品频率图top50",
xlab = "商品编号",ylab = "频率",col = "LightSkyBlue")

## 可以看出出现次数最多的商品大约出现了200多次,并且top50的商品出现次数均大于50次,
## 但是商品出现的频率均偏低。
## 查看店铺和商品的稀疏矩阵图像
par(family = "STKaiti",cex = 0.75,mai = c(0,1,0,1))
arules::image(arudata,xlab = "item number",ylab = "shop number",main = "shops--items")

# ## 计算各个项集的频率
# fsets <- eclat(item_arules,parameter = list(support = 0.005,maxlen = 10))
#
# inspect(sort(fsets,by = "support"))
## 训练模型
myrule <- apriori(data = arudata,
parameter = list(support = 0.001,
confidence = 0.9,
minlen = 2))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.9 0.1 1 none FALSE TRUE 5 0.001 2
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 16
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[111656 item(s), 16859 transaction(s)] done [0.08s].
## sorting and recoding items ... [1925 item(s)] done [0.01s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.04s].
## writing ... [2698 rule(s)] done [0.01s].
## creating S4 object ... done [0.02s].
summary(myrule)
## set of 2698 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4 5 6 7
## 12 633 1418 542 86 7
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 4.000 4.000 4.029 4.000 7.000
##
## summary of quality measures:
## support confidence lift
## Min. :0.001008 Min. :0.9000 Min. :126.4
## 1st Qu.:0.001008 1st Qu.:0.9091 1st Qu.:198.3
## Median :0.001068 Median :0.9444 Median :239.0
## Mean :0.001068 Mean :0.9462 Mean :260.8
## 3rd Qu.:0.001068 3rd Qu.:0.9524 3rd Qu.:306.2
## Max. :0.001661 Max. :1.0000 Max. :802.8
##
## mining info:
## data ntransactions support confidence
## arudata 16859 0.001 0.9
inspect(myrule[1:5])
## lhs rhs support confidence lift
## [1] {ahsk028} => {AKSK081} 0.001008363 0.9444444 215.1674
## [2] {11116220124} => {51116120028} 0.001126994 0.9500000 205.3340
## [3] {16628204} => {15625311} 0.001067679 1.0000000 301.0536
## [4] {96627781} => {96627143} 0.001008363 1.0000000 802.8095
## [5] {96627781} => {91625505} 0.001008363 1.0000000 295.7719
## 将规则按照支持度排序
sortsupport <- arules::sort(myrule,decreasing = TRUE,by = "support")
inspect(sortsupport[1:10])
## lhs rhs support confidence
## [1] {11621304,12627775} => {11625553} 0.001660834 0.9655172
## [2] {11621304,11627775} => {11625553} 0.001601518 0.9000000
## [3] {15627740,16627785} => {11621304} 0.001542203 0.9285714
## [4] {12627775,16627785} => {11625553} 0.001542203 0.9285714
## [5] {15627783,16627165} => {15627141} 0.001482887 0.9615385
## [6] {11621304,11625501,12627775} => {11625553} 0.001482887 0.9615385
## [7] {11625501,11625553,12627775} => {11621304} 0.001482887 0.9259259
## [8] {ABPJ039,ARBK019} => {ABPK021} 0.001423572 0.9600000
## [9] {15621166,15621787} => {15621158} 0.001423572 0.9230769
## [10] {15621787,15627141} => {15621158} 0.001423572 0.9230769
## lift
## [1] 191.5018
## [2] 178.5071
## [3] 182.0324
## [4] 184.1739
## [5] 219.0619
## [6] 190.7127
## [7] 181.5138
## [8] 134.8720
## [9] 232.2710
## [10] 232.2710
## 将规则按照置信度排序
sortconf <- arules::sort(myrule,decreasing = TRUE,by = "confidence")
inspect(sortconf[1:10])
## lhs rhs support confidence
## [1] {16628204} => {15625311} 0.001067679 1
## [2] {96627781} => {96627143} 0.001008363 1
## [3] {96627781} => {91625505} 0.001008363 1
## [4] {16627109} => {11625553} 0.001008363 1
## [5] {985218119762,985219119337} => {985118119979} 0.001067679 1
## [6] {984118119372,985419119880} => {985219119337} 0.001008363 1
## [7] {984119119578,987319119623} => {985219119337} 0.001008363 1
## [8] {12115101024,51116120028} => {11115101024} 0.001008363 1
## [9] {985219119775,987319119623} => {985118119979} 0.001008363 1
## [10] {985219119775,987319119623} => {985219119337} 0.001008363 1
## lift
## [1] 301.0536
## [2] 802.8095
## [3] 295.7719
## [4] 198.3412
## [5] 280.9833
## [6] 230.9452
## [7] 230.9452
## [8] 306.5273
## [9] 280.9833
## [10] 230.9452
## 将规则按照提升度排序
sortlift <- arules::sort(myrule,decreasing = TRUE,by = "lift")
inspect(sortlift[1:10])
## lhs rhs support
## [1] {96627781} => {96627143} 0.001008363
## [2] {91625505,96627781} => {96627143} 0.001008363
## [3] {11618022,15621166,91625505} => {19627351} 0.001067679
## [4] {11618022,15621166,16627785,91625505} => {19627351} 0.001008363
## [5] {11611102,15621166,91625505} => {12617775} 0.001067679
## [6] {16625145,91625505} => {11622204} 0.001008363
## [7] {644441,SH1007} => {SH1001} 0.001008363
## [8] {15627784,91625505} => {11631307} 0.001008363
## [9] {19627301,91625505} => {11631307} 0.001067679
## [10] {11625553,91545546} => {91528015} 0.001008363
## confidence lift
## [1] 1.0000000 802.8095
## [2] 1.0000000 802.8095
## [3] 0.9473684 591.5439
## [4] 0.9444444 589.7181
## [5] 0.9473684 550.7477
## [6] 0.9444444 549.0479
## [7] 0.9444444 549.0479
## [8] 1.0000000 543.8387
## [9] 0.9473684 515.2156
## [10] 1.0000000 510.8788
## 将关联规则数据转化为数据表格的形式
item_shop_ruledf <- as(myrule,"data.frame")
## 我门可以发现,我门发现的2000多条的规则中,置信度非常的接近1
## 并且提升度也很大,这时因为很多店铺销售的东西都很相似
utils::write.csv(item_shop_ruledf,file = "第二题数据/店铺_商品_关联结果.txt",
row.names = FALSE,quote = FALSE,fileEncoding = "UTF-8")
## 对店铺销售的品牌进行关联规则的挖掘####--------------------
## 读取关联分析数据
arudata <- read.transactions(file = "第二题数据/店铺_品牌.txt",format = "single",
sep = ",",cols = c(1,2),rm.duplicates = TRUE)
## distribution of transactions with duplicates:
## items
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 2160 1421 1027 809 675 566 446 437 346 263 280 225 217 219 155
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 158 131 150 99 124 107 121 84 88 83 84 87 87 65 67
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 66 67 52 52 51 44 59 38 54 43 32 34 37 33 35
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 30 19 23 24 24 19 22 14 16 17 18 16 1 14 15
## 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
## 10 17 19 11 5 5 8 7 4 3 5 2 2 5 4
## 77 78 79 80 81 83 84 85 86 87 88 89 90 91 92
## 4 3 2 4 2 1 1 2 2 2 3 2 3 3 1
## 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107
## 4 4 2 1 1 3 1 3 4 1 2 4 4 2 2
## 108 109 110 111 112 113 114 116 117 118 119 120 121 123 124
## 3 1 2 5 1 1 3 2 4 1 2 5 4 1 1
## 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139
## 4 4 2 5 1 2 5 3 3 1 2 3 1 2 3
## 140 141 142 143 144 145 146 147 148 149 151 152 155 156 157
## 1 1 1 1 3 4 1 2 3 5 4 2 4 1 3
## 158 159 160 161 162 163 164 165 168 169 172 173 174 175 176
## 2 6 3 2 2 2 2 3 3 2 1 2 4 3 1
## 177 180 185 188 189 191 192 193 194 195 197 200 201 203 204
## 1 1 1 1 1 3 1 2 2 2 1 1 1 2 1
## 205 206 207 208 209 211 212 214 215 217 220 221 223 226 227
## 1 1 2 1 2 1 2 1 3 3 2 1 2 1 4
## 228 229 231 232 233 234 236 238 241 242 243 244 245 247 249
## 1 1 1 1 2 1 2 2 1 3 1 2 1 1 1
## 250 251 252 253 254 257 263 264 265 266 267 274 275 277 278
## 1 1 2 3 2 3 1 3 1 1 1 1 1 1 3
## 281 285 287 288 290 291 292 294 297 302 303 304 307 308 309
## 1 1 1 1 1 1 1 1 1 1 2 1 1 1 1
## 312 313 314 316 320 321 322 326 327 330 331 332 334 336 337
## 1 1 1 2 3 1 1 1 2 1 1 1 1 1 2
## 340 349 351 352 360 363 364 365 375 379 381 398 406 408 409
## 2 1 1 2 1 1 1 1 2 1 1 1 1 2 1
## 410 416 418 420 421 441 442 450 475 492 494 505 506 518 527
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 540 541 572 575 581 586 587 614 625 634 656 667 668 677 709
## 1 1 1 1 1 1 1 1 2 2 1 1 1 1 1
## 729 745 763 788 797 800 835 861 866 929 945 1027 1039 1063 1067
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## 1142 1153 1156 1277 1322 1485 1596 2298 2311 2568 5500
## 1 1 1 1 1 1 1 1 1 1 1
## 查看数据
summary(arudata)
## transactions as itemMatrix in sparse format with
## 16859 rows (elements/itemsets/transactions) and
## 3940 columns (items) and a density of 0.0005758722
##
## most frequent items:
## adidas nike 李宁 other 特步 (Other)
## 4098 4057 1936 1511 1228 25422
##
## element (itemset/transaction) length distribution:
## sizes
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 8186 3756 2092 1119 657 390 242 131 66 56 48 35 15 18 11
## 16 17 18 19 20 21 22 24 26 30 39
## 10 11 3 2 1 1 2 1 4 1 1
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 2.269 3.000 39.000
##
## includes extended item information - examples:
## labels
## 1 -
## 2 .
## 3 ..
##
## includes extended transaction information - examples:
## transactionID
## 1 100002563
## 2 100005363
## 3 100005546
inspect(arudata[1:5])
## items transactionID
## [1] {美津浓,adidas,lacoste,nike,puma,under armour} 100002563
## [2] {adidas,footjoy,nike,titleist} 100005363
## [3] {美津浓,nike} 100005546
## [4] {李宁,adidas,air jordan,converse,nike} 100010926
## [5] {美津浓,butterfly} 100013851
## 可视化商品的频率图
## itemFrequency(item_arules)
par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(arudata,type = "absolute",topN = 30,main = "品牌频率图top30",
xlab = "品牌名",ylab = "频数",col = "LightSkyBlue")

par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(arudata,type = "relative",topN = 30,main = "品牌频率图top30",
xlab = "品牌名",ylab = "频率",col = "LightSkyBlue")

## 可以看出出现次数最多的商品大约出现了4000多次,说明有很多店铺,销售这个品牌的产品,
## 并且top7的品牌出现次数均大于1000次,
## 品牌出现的频率偏高。
## 查看店铺和商品的稀疏矩阵图像
par(family = "STKaiti",cex = 0.75)
arules::image(arudata,xlab = "brand number",ylab = "shop number",main = "shops--brands")

## 训练模型
myrule <- apriori(data = arudata,
parameter = list(support = 0.005,
confidence = 0.5,
minlen = 1))
## Apriori
##
## Parameter specification:
## confidence minval smax arem aval originalSupport maxtime support minlen
## 0.5 0.1 1 none FALSE TRUE 5 0.005 1
## maxlen target ext
## 10 rules FALSE
##
## Algorithmic control:
## filter tree heap memopt load sort verbose
## 0.1 TRUE TRUE FALSE TRUE 2 TRUE
##
## Absolute minimum support count: 84
##
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[3940 item(s), 16859 transaction(s)] done [0.00s].
## sorting and recoding items ... [33 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [75 rule(s)] done [0.00s].
## creating S4 object ... done [0.00s].
summary(myrule)
## set of 75 rules
##
## rule length distribution (lhs + rhs):sizes
## 2 3 4
## 12 52 11
##
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.000 3.000 3.000 2.987 3.000 4.000
##
## summary of quality measures:
## support confidence lift
## Min. :0.005042 Min. :0.5016 Min. : 2.084
## 1st Qu.:0.006228 1st Qu.:0.6100 1st Qu.: 3.034
## Median :0.007533 Median :0.7306 Median : 3.553
## Mean :0.015848 Mean :0.7299 Mean : 5.003
## 3rd Qu.:0.011626 3rd Qu.:0.8567 3rd Qu.: 7.441
## Max. :0.156415 Max. :0.9892 Max. :12.780
##
## mining info:
## data ntransactions support confidence
## arudata 16859 0.005 0.5
inspect(myrule[1:10])
## lhs rhs support confidence lift
## [1] {vans} => {nike} 0.008482116 0.5035211 2.092399
## [2] {vans} => {adidas} 0.009193902 0.5457746 2.245294
## [3] {reebok} => {adidas} 0.015777923 0.5683761 2.338275
## [4] {亚瑟士} => {adidas} 0.013642565 0.5066079 2.084164
## [5] {converse} => {adidas} 0.026929237 0.5176739 2.129689
## [6] {new balance} => {nike} 0.023548253 0.5109395 2.123226
## [7] {new balance} => {adidas} 0.025564980 0.5546976 2.282002
## [8] {puma} => {adidas} 0.029064595 0.5349345 2.200698
## [9] {air jordan} => {nike} 0.061866066 0.8549180 3.552641
## [10] {air jordan} => {adidas} 0.048816656 0.6745902 2.775236
## 将规则按照支持度排序
sortsupport <- arules::sort(myrule,decreasing = TRUE,by = "support")
inspect(sortsupport[1:15])
## lhs rhs support confidence lift
## [1] {nike} => {adidas} 0.15641497 0.6499877 2.674022
## [2] {adidas} => {nike} 0.15641497 0.6434846 2.674022
## [3] {air jordan} => {nike} 0.06186607 0.8549180 3.552641
## [4] {air jordan} => {adidas} 0.04881666 0.6745902 2.775236
## [5] {air jordan,nike} => {adidas} 0.04519841 0.7305849 3.005595
## [6] {adidas,air jordan} => {nike} 0.04519841 0.9258809 3.847529
## [7] {puma} => {adidas} 0.02906459 0.5349345 2.200698
## [8] {converse} => {adidas} 0.02692924 0.5176739 2.129689
## [9] {new balance} => {adidas} 0.02556498 0.5546976 2.282002
## [10] {new balance} => {nike} 0.02354825 0.5109395 2.123226
## [11] {nike,puma} => {adidas} 0.02253989 0.8351648 3.435833
## [12] {adidas,puma} => {nike} 0.02253989 0.7755102 3.222659
## [13] {converse,nike} => {adidas} 0.02230263 0.8623853 3.547817
## [14] {adidas,converse} => {nike} 0.02230263 0.8281938 3.441587
## [15] {new balance,nike} => {adidas} 0.01993001 0.8463476 3.481839
## 将规则按照置信度排序
sortconf <- arules::sort(myrule,decreasing = TRUE,by = "confidence")
inspect(sortconf[1:15])
## lhs rhs support confidence
## [1] {adidas,air jordan,reebok} => {nike} 0.005457026 0.9892473
## [2] {adidas,air jordan,under armour} => {nike} 0.006465389 0.9819820
## [3] {air jordan,reebok} => {nike} 0.006050181 0.9532710
## [4] {adidas,air jordan,new balance} => {nike} 0.005753603 0.9509804
## [5] {air jordan,under armour} => {nike} 0.008126223 0.9448276
## [6] {air jordan,new balance} => {nike} 0.006702651 0.9416667
## [7] {new balance,nike,puma} => {adidas} 0.005990865 0.9351852
## [8] {adidas,air jordan} => {nike} 0.045198410 0.9258809
## [9] {converse,nike,puma} => {adidas} 0.006465389 0.9083333
## [10] {converse,new balance} => {adidas} 0.005634972 0.9047619
## [11] {air jordan,nike,reebok} => {adidas} 0.005457026 0.9019608
## [12] {nike,vans} => {adidas} 0.007592384 0.8951049
## [13] {nike,reebok} => {adidas} 0.011625838 0.8789238
## [14] {adidas,new balance,puma} => {nike} 0.005990865 0.8706897
## [15] {air jordan,reebok} => {adidas} 0.005516341 0.8691589
## lift
## [1] 4.110850
## [2] 4.080659
## [3] 3.961350
## [4] 3.951831
## [5] 3.926263
## [6] 3.913128
## [7] 3.847313
## [8] 3.847529
## [9] 3.736845
## [10] 3.722153
## [11] 3.710629
## [12] 3.682424
## [13] 3.615855
## [14] 3.618180
## [15] 3.575683
## 将规则按照提升度排序
sortlift <- arules::sort(myrule,decreasing = TRUE,by = "lift")
inspect(sortlift[1:15])
## lhs rhs support confidence
## [1] {匹克,特步} => {乔丹} 0.006228127 0.6481481
## [2] {匹克,特步} => {鸿星尔克} 0.005338395 0.5555556
## [3] {鸿星尔克,匹克} => {乔丹} 0.005041817 0.6439394
## [4] {361,乔丹} => {鸿星尔克} 0.006109496 0.5024390
## [5] {乔丹,特步} => {鸿星尔克} 0.009371849 0.5015873
## [6] {鸿星尔克,特步} => {乔丹} 0.009371849 0.5374150
## [7] {李宁,特步} => {361} 0.006999229 0.5221239
## [8] {鸿星尔克,乔丹} => {特步} 0.009371849 0.7417840
## [9] {361,鸿星尔克} => {乔丹} 0.006109496 0.5150000
## [10] {361,特步} => {乔丹} 0.008363485 0.5127273
## [11] {361,乔丹} => {特步} 0.008363485 0.6878049
## [12] {鸿星尔克,匹克} => {特步} 0.005338395 0.6818182
## [13] {adidas,nike,under armour} => {air jordan} 0.006465389 0.6770186
## [14] {鸿星尔克,李宁} => {特步} 0.005575657 0.6438356
## [15] {361,鸿星尔克} => {特步} 0.007533068 0.6350000
## lift
## [1] 12.780269
## [2] 12.725694
## [3] 12.697280
## [4] 11.508994
## [5] 11.489484
## [6] 10.596817
## [7] 10.491641
## [8] 10.183825
## [9] 10.154836
## [10] 10.110022
## [11] 9.442754
## [12] 9.360564
## [13] 9.355621
## [14] 8.839108
## [15] 8.717805
## 将关联规则数据转化为数据表格的形式
brand_shop_ruledf <- as(myrule,"data.frame")
utils::write.csv(brand_shop_ruledf,file = "第二题数据/店铺_品牌_关联结果.txt",
row.names = FALSE,quote = FALSE,fileEncoding = "UTF-8")