guanlian_lin

## 2016年厦门大数据大赛
## 第二题：基于大数据的商品销售预测及关联销售挖掘
## author:Daitu
## 2016-7-12
## 从数据集中针对店铺，进行关联销售挖掘
## 店铺的相似性


## 更改工作文件夹------------------------------------
setwd("/Users/daitu/数据分析/2016ABD")
getwd()

## [1] "/Users/daitu/数据分析/2016ABD"

rm(list = ls());gc()

##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 368128 19.7     592000 31.7   460000 24.6
## Vcells 568002  4.4    1023718  7.9   786371  6.0

## 加载所需要的包-----------------------------------
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)

## Warning: package 'ggplot2' was built under R version 3.3.2

library(arules)

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(stringr)


# sessionInfo()

## 第一步：读取数据####-------------------------------------------
# item_id 每一个商品链接的独有的ID   ----字符串-------
# item_number 商品的款号          ----字符串-------  
# shop_id 销售这件商品的店铺ID        ----字符串-------
# shop_type 销售这件商品的店铺类型，分为TB_TMALL与TB_JISHI两种   ----字符串-------
# brand_name 商品的品牌名                    ----字符串-------
# item_name 商品名(商品标题)               ----字符串-------
# price 该商品的销售价格(此处对各sku取均值)
# tag_price 商品的标签价
# monthly_sales_num 商品月销量
# assessment_num 商品评价数
# monthly_sales 月销售额  -----价格乘以销量-----

## 商品销售
load("第二题数据/item_fact.RData")
head(item_fact)

## # A tibble: 6 × 11
##       item_id      item_number  shop_id shop_type brand_name
##         <chr>            <chr>    <chr>     <chr>      <chr>
## 1 10003087358               UB 64971284  TB_JISHI      umbro
## 2 10005342950 2011030511534230 61051459  TB_JISHI     美津浓
## 3 10006292251       467046-401 64767888  TB_JISHI       nike
## 4 10010133546            YKL40 65507365  TB_TMALL       骐煌
## 5 10013604183          AWDK288 62687027  TB_TMALL       李宁
## 6 10014801651             7X50 35714901  TB_JISHI      yukon
## # ... with 6 more variables: item_name <chr>, price <dbl>,
## #   tag_price <dbl>, monthly_sales_num <dbl>, assessment_num <dbl>,
## #   monthly_sales <dbl>

summary(item_fact)

##    item_id          item_number          shop_id         
##  Length:947276      Length:947276      Length:947276     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   shop_type          brand_name         item_name        
##  Length:947276      Length:947276      Length:947276     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      price            tag_price        monthly_sales_num 
##  Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.00e+00  
##  1st Qu.:1.31e+02   1st Qu.:1.68e+02   1st Qu.:0.00e+00  
##  Median :2.14e+02   Median :2.88e+02   Median :0.00e+00  
##  Mean   :5.16e+02   Mean   :5.83e+02   Mean   :8.29e+00  
##  3rd Qu.:4.15e+02   3rd Qu.:4.99e+02   3rd Qu.:1.00e+00  
##  Max.   :1.00e+08   Max.   :1.00e+08   Max.   :2.48e+05  
##  assessment_num     monthly_sales    
##  Min.   :    -1.0   Min.   :      0  
##  1st Qu.:     0.0   1st Qu.:      0  
##  Median :     0.0   Median :      0  
##  Mean   :    34.7   Mean   :   1652  
##  3rd Qu.:     3.0   3rd Qu.:    225  
##  Max.   :891523.0   Max.   :4678400

## 查看多少商品有销量
sum(item_fact$monthly_sales_num > 0)  #约有305218商品有销售出去的纪录

## [1] 305218

## 只对有销售纪录的商品进行关联分析
item_fact <- item_fact[which(item_fact$monthly_sales_num > 0),]

## 商品的数目
iten_number <- as.data.frame(table(item_fact$item_number))  
dim(iten_number)  ## 127137 种商品编号

## [1] 127137      2

# ## 只分析出现频次大于1的商品
# iten_number <- iten_number[iten_number$Freq>1,]
# dim(iten_number)   ## 39932 种商品编号

bb <- item_fact[item_fact$item_number == "00001",]
## 商品编号无法唯一定义商品，所以要剔除商品编号异常的数据，它们的存在没有帮组
## 提出商品编号带有：“－”，“＊”，“＃”，“。”等，
## 即只保留开始和结尾都是数字或者为字母的商品
iten_number <- iten_number[grepl("^[[:alnum:]]+[A-Za-z0-9\\+-]+[[:alnum:]]$",iten_number$Var1),]
## 将商品编号只有0、1的数据删除，因为不能唯一定位
iten_number <- iten_number[!grepl("^[0-1]+[0]+[0-1]$",iten_number$Var1),]

item_fact <- item_fact[item_fact$item_number %in% iten_number$Var1,]
dim(item_fact)   # 还有283597 条纪录

## [1] 283597     11

n_distinct(item_fact$shop_id) #  16858个店铺

## [1] 16858

n_distinct(item_fact$brand_name) #  3939 种品牌

## [1] 3939

## 查看每个商铺大约有多少件商品出售####-------------------
shop <- item_fact %>%
  dplyr::group_by(shop_id) %>%   ## 将数据按照店铺的ID进行分组
  dplyr::summarise(n_item = n_distinct(item_number)) ## 该店铺共销售了多少种商品
summary(shop)

##    shop_id              n_item       
##  Length:16858       Min.   :   1.00  
##  Class :character   1st Qu.:   2.00  
##  Mode  :character   Median :   5.00  
##                     Mean   :  16.54  
##                     3rd Qu.:  13.00  
##                     Max.   :5501.00

# shop <- as.data.frame(table(item_fact$shop_id))
# summary(shop$Freq)
## 销售商品数目的分布
p1 <- ggplot(shop,aes(n_item)) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(binwidth = 20,color = "firebrick",fill = "red",alpha = 0.5)+
  labs(x = "销售商品数目(件)",y = "店铺的数目(家)",title = "店铺销售商品数目分布")


p2 <- ggplot(shop[shop$n_item<=500,],aes(n_item)) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(binwidth = 5,color = "firebrick",fill = "red",alpha = 0.5)+
  labs(x = "销售商品数目(<=500件)",y = "店铺的数目(家)",title = "店铺销售商品数目分布")

gridExtra::grid.arrange(p1,p2,nrow = 2)

## 大多数上架的销售商品数目小于100件


## 将数据转化为适合关联规则的形式####----------------------------------


## 店铺－商品数据
arule_data <- item_fact[c("shop_id","item_number")]
str(arule_data)

## Classes 'tbl_df', 'tbl' and 'data.frame':    283597 obs. of  2 variables:
##  $ shop_id    : chr  "65507365" "62204349" "62687027" "36179219" ...
##  $ item_number: chr  "YKL40" "DL00150" "AKLK631" "M21D013" ...

## 将数据保存为便于csv文件
write.csv(arule_data,file = "第二题数据/店铺_商品.txt",row.names = FALSE)

## 店铺－品牌数据
arule_brand <- item_fact[c("shop_id","brand_name")]
str(arule_brand)

## Classes 'tbl_df', 'tbl' and 'data.frame':    283597 obs. of  2 variables:
##  $ shop_id   : chr  "65507365" "62204349" "62687027" "36179219" ...
##  $ brand_name: chr  "骐煌" "sherpa" "李宁" "匹克" ...

aa <- as.data.frame(table(arule_brand$brand_name))
bb <- item_fact[item_fact$brand_name == "0",]
## 虽然有些品牌的名称是不正确的，但是我们能最后的到在规则中不考虑这些错误的品牌
## 将数据保存为便于csv文件
write.csv(arule_brand,file = "第二题数据/店铺_品牌.txt",row.names = FALSE)



## 对店铺和销售的商品数据进行关联规则分析####-----------------------------
## 读取关联分析数据
arudata <- read.transactions(file = "第二题数据/店铺_商品.txt",format = "single",
                             sep = ",",cols = c(1,2),rm.duplicates = TRUE)

## distribution of transactions with duplicates:
## items
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 1088  339  151   72   44   26   28   15   10    9    6    7    9    5    8 
##   16   17   18   19   20   22   23   24   25   27   28   31   33   36   37 
##    3    3    7    1    1    2    1    3    2    2    2    1    1    1    1 
##   41   45   46   47   51 
##    1    1    1    1    1

## 查看数据
summary(arudata)

## transactions as itemMatrix in sparse format with
##  16859 rows (elements/itemsets/transactions) and
##  111656 columns (items) and a density of 0.0001481251 
## 
## most frequent items:
##     M20324 130690-002     C77124 511882-094        574    (Other) 
##        229        203        203        167        144     277886 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 3408 2093 1493 1178  908  775  614  547  500  391  371  283  246  230  221 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
##  223  176  153  143  150  149  133  124  105   84   83  100   88   92   98 
##   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45 
##   85   69   69   57   64   60   48   38   51   37   60   49   36   40   38 
##   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60 
##   38   36   32   23   21   17   22   21   19   20   16   18   26   18   15 
##   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75 
##    8   11   14   16   12    8    6   10    5    6    2    5    3    3    8 
##   76   77   78   79   80   81   82   85   86   87   88   89   90   91   92 
##    4    1    1    4    3    6    3    2    2    3    2    2    1    3    2 
##   93   94   95   96   97   98   99  101  102  104  105  106  107  108  109 
##    2    4    4    2    2    4    3    2    2    3    2    7    1    2    2 
##  110  111  112  113  114  115  116  117  118  119  120  121  122  123  124 
##    2    4    4    1    2    3    2    1    2    3    1    3    3    2    1 
##  125  126  127  128  129  130  131  132  133  134  135  136  137  138  139 
##    1    4    2    2    6    2    2    2    5    2    2    2    2    1    2 
##  140  141  142  143  144  145  146  147  148  149  150  151  152  153  154 
##    2    2    1    4    2    1    2    1    2    3    2    4    3    2    1 
##  156  157  158  159  160  161  162  164  166  167  169  170  171  172  173 
##    1    1    4    2    6    2    7    3    4    1    4    1    1    1    2 
##  174  175  176  177  179  181  182  187  189  191  192  193  195  196  197 
##    2    2    1    1    1    2    1    1    1    1    2    1    2    2    1 
##  198  201  203  204  205  206  207  208  210  211  212  213  216  217  218 
##    1    2    2    2    1    1    1    1    2    2    1    2    2    1    3 
##  219  220  221  224  225  226  227  229  230  232  233  234  235  237  238 
##    1    1    2    1    1    1    1    2    1    1    1    2    2    3    1 
##  240  242  243  245  246  247  248  249  251  253  254  255  258  259  260 
##    1    1    1    2    3    1    1    1    4    1    1    2    2    1    1 
##  263  264  265  266  267  268  274  278  280  282  284  286  287  288  291 
##    1    1    1    3    1    1    1    1    1    2    1    1    1    1    2 
##  294  295  298  301  304  310  311  313  317  320  321  324  326  327  328 
##    1    2    1    1    3    1    2    1    2    1    3    1    2    1    1 
##  331  333  337  338  341  343  345  350  352  353  361  364  367  371  376 
##    3    1    2    1    1    1    1    1    1    2    1    1    1    1    2 
##  377  380  400  407  408  411  416  419  421  422  442  443  446  454  478 
##    1    1    1    1    1    1    1    2    1    1    1    1    1    1    1 
##  496  506  508  512  517  529  541  544  575  578  591  599  607  615  626 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  628  633  637  667  668  672  676  715  728  749  757  791  802  807  815 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  838  867  930  957 1028 1043 1064 1066 1132 1147 1149 1278 1323 1494 1585 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 2287 2302 2528 5501 
##    1    1    1    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    2.00    5.00   16.54   13.00 5501.00 
## 
## includes extended item information - examples:
##   labels
## 1  0-011
## 2   0-23
## 3   0-24
## 
## includes extended transaction information - examples:
##   transactionID
## 1     100002563
## 2     100005363
## 3     100005546

inspect(arudata[1:2])

##     items           transactionID
## [1] {10372601,                   
##      1272298-100,                
##      1273365-001,                
##      1278852-001,                
##      705248-010,                 
##      714965-010,                 
##      714965-021,                 
##      729SPM0057082,              
##      742504-010,                 
##      747445-100,                 
##      749324-100,                 
##      808924-672,                 
##      808951-451,                 
##      819217-307,                 
##      819712-307,                 
##      831975-870,                 
##      831977-870,                 
##      835611-010,                 
##      AA0881,                     
##      AF4083,                     
##      AF4693,                     
##      AQ5000,                     
##      AQ5152,                     
##      P1GA165844,                 
##      S76650,                     
##      S90230,                     
##      S90391,                     
##      SP0284-100,                 
##      SX4854-701}        100002563
## [2] {51003,                      
##      639669,                     
##      639790,                     
##      725717,                     
##      776114,                     
##      799108,                     
##      819006,                     
##      AE5427,                     
##      TA6BB632}          100005363

## 可视化商品的频率图
## itemFrequency(item_arules)
par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(arudata,type = "absolute",topN = 50,main = "商品频率图top50",
                  xlab = "商品编号",ylab = "频数",col = "LightSkyBlue")

par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(arudata,type = "relative",topN = 50,main = "商品频率图top50",
                  xlab = "商品编号",ylab = "频率",col = "LightSkyBlue")

## 可以看出出现次数最多的商品大约出现了200多次，并且top50的商品出现次数均大于50次，
## 但是商品出现的频率均偏低。

## 查看店铺和商品的稀疏矩阵图像
par(family = "STKaiti",cex = 0.75,mai = c(0,1,0,1)) 
arules::image(arudata,xlab = "item number",ylab = "shop number",main = "shops--items")

# ## 计算各个项集的频率
# fsets <- eclat(item_arules,parameter = list(support = 0.005,maxlen = 10))
# 
# inspect(sort(fsets,by = "support"))

## 训练模型
myrule <- apriori(data = arudata,
                 parameter = list(support = 0.001,
                                  confidence = 0.9,
                                   minlen = 2))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.9    0.1    1 none FALSE            TRUE       5   0.001      2
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 16 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[111656 item(s), 16859 transaction(s)] done [0.08s].
## sorting and recoding items ... [1925 item(s)] done [0.01s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 5 6 7 done [0.04s].
## writing ... [2698 rule(s)] done [0.01s].
## creating S4 object  ... done [0.02s].

summary(myrule)

## set of 2698 rules
## 
## rule length distribution (lhs + rhs):sizes
##    2    3    4    5    6    7 
##   12  633 1418  542   86    7 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   4.000   4.000   4.029   4.000   7.000 
## 
## summary of quality measures:
##     support           confidence          lift      
##  Min.   :0.001008   Min.   :0.9000   Min.   :126.4  
##  1st Qu.:0.001008   1st Qu.:0.9091   1st Qu.:198.3  
##  Median :0.001068   Median :0.9444   Median :239.0  
##  Mean   :0.001068   Mean   :0.9462   Mean   :260.8  
##  3rd Qu.:0.001068   3rd Qu.:0.9524   3rd Qu.:306.2  
##  Max.   :0.001661   Max.   :1.0000   Max.   :802.8  
## 
## mining info:
##     data ntransactions support confidence
##  arudata         16859   0.001        0.9

inspect(myrule[1:5])

##     lhs              rhs           support     confidence lift    
## [1] {ahsk028}     => {AKSK081}     0.001008363 0.9444444  215.1674
## [2] {11116220124} => {51116120028} 0.001126994 0.9500000  205.3340
## [3] {16628204}    => {15625311}    0.001067679 1.0000000  301.0536
## [4] {96627781}    => {96627143}    0.001008363 1.0000000  802.8095
## [5] {96627781}    => {91625505}    0.001008363 1.0000000  295.7719

## 将规则按照支持度排序
sortsupport <- arules::sort(myrule,decreasing = TRUE,by = "support")
inspect(sortsupport[1:10])

##      lhs                             rhs        support     confidence
## [1]  {11621304,12627775}          => {11625553} 0.001660834 0.9655172 
## [2]  {11621304,11627775}          => {11625553} 0.001601518 0.9000000 
## [3]  {15627740,16627785}          => {11621304} 0.001542203 0.9285714 
## [4]  {12627775,16627785}          => {11625553} 0.001542203 0.9285714 
## [5]  {15627783,16627165}          => {15627141} 0.001482887 0.9615385 
## [6]  {11621304,11625501,12627775} => {11625553} 0.001482887 0.9615385 
## [7]  {11625501,11625553,12627775} => {11621304} 0.001482887 0.9259259 
## [8]  {ABPJ039,ARBK019}            => {ABPK021}  0.001423572 0.9600000 
## [9]  {15621166,15621787}          => {15621158} 0.001423572 0.9230769 
## [10] {15621787,15627141}          => {15621158} 0.001423572 0.9230769 
##      lift    
## [1]  191.5018
## [2]  178.5071
## [3]  182.0324
## [4]  184.1739
## [5]  219.0619
## [6]  190.7127
## [7]  181.5138
## [8]  134.8720
## [9]  232.2710
## [10] 232.2710

## 将规则按照置信度排序
sortconf <- arules::sort(myrule,decreasing = TRUE,by = "confidence")
inspect(sortconf[1:10])

##      lhs                            rhs            support     confidence
## [1]  {16628204}                  => {15625311}     0.001067679 1         
## [2]  {96627781}                  => {96627143}     0.001008363 1         
## [3]  {96627781}                  => {91625505}     0.001008363 1         
## [4]  {16627109}                  => {11625553}     0.001008363 1         
## [5]  {985218119762,985219119337} => {985118119979} 0.001067679 1         
## [6]  {984118119372,985419119880} => {985219119337} 0.001008363 1         
## [7]  {984119119578,987319119623} => {985219119337} 0.001008363 1         
## [8]  {12115101024,51116120028}   => {11115101024}  0.001008363 1         
## [9]  {985219119775,987319119623} => {985118119979} 0.001008363 1         
## [10] {985219119775,987319119623} => {985219119337} 0.001008363 1         
##      lift    
## [1]  301.0536
## [2]  802.8095
## [3]  295.7719
## [4]  198.3412
## [5]  280.9833
## [6]  230.9452
## [7]  230.9452
## [8]  306.5273
## [9]  280.9833
## [10] 230.9452

## 将规则按照提升度排序
sortlift <- arules::sort(myrule,decreasing = TRUE,by = "lift")
inspect(sortlift[1:10])

##      lhs                                      rhs        support    
## [1]  {96627781}                            => {96627143} 0.001008363
## [2]  {91625505,96627781}                   => {96627143} 0.001008363
## [3]  {11618022,15621166,91625505}          => {19627351} 0.001067679
## [4]  {11618022,15621166,16627785,91625505} => {19627351} 0.001008363
## [5]  {11611102,15621166,91625505}          => {12617775} 0.001067679
## [6]  {16625145,91625505}                   => {11622204} 0.001008363
## [7]  {644441,SH1007}                       => {SH1001}   0.001008363
## [8]  {15627784,91625505}                   => {11631307} 0.001008363
## [9]  {19627301,91625505}                   => {11631307} 0.001067679
## [10] {11625553,91545546}                   => {91528015} 0.001008363
##      confidence lift    
## [1]  1.0000000  802.8095
## [2]  1.0000000  802.8095
## [3]  0.9473684  591.5439
## [4]  0.9444444  589.7181
## [5]  0.9473684  550.7477
## [6]  0.9444444  549.0479
## [7]  0.9444444  549.0479
## [8]  1.0000000  543.8387
## [9]  0.9473684  515.2156
## [10] 1.0000000  510.8788

## 将关联规则数据转化为数据表格的形式
item_shop_ruledf <- as(myrule,"data.frame")

## 我门可以发现，我门发现的2000多条的规则中，置信度非常的接近1
## 并且提升度也很大，这时因为很多店铺销售的东西都很相似

utils::write.csv(item_shop_ruledf,file = "第二题数据/店铺_商品_关联结果.txt",
                 row.names = FALSE,quote = FALSE,fileEncoding = "UTF-8")



## 对店铺销售的品牌进行关联规则的挖掘####--------------------
## 读取关联分析数据
arudata <- read.transactions(file = "第二题数据/店铺_品牌.txt",format = "single",
                             sep = ",",cols = c(1,2),rm.duplicates = TRUE)

## distribution of transactions with duplicates:
## items
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 2160 1421 1027  809  675  566  446  437  346  263  280  225  217  219  155 
##   16   17   18   19   20   21   22   23   24   25   26   27   28   29   30 
##  158  131  150   99  124  107  121   84   88   83   84   87   87   65   67 
##   31   32   33   34   35   36   37   38   39   40   41   42   43   44   45 
##   66   67   52   52   51   44   59   38   54   43   32   34   37   33   35 
##   46   47   48   49   50   51   52   53   54   55   56   57   58   59   60 
##   30   19   23   24   24   19   22   14   16   17   18   16    1   14   15 
##   61   62   63   64   65   66   67   68   69   70   71   72   73   74   75 
##   10   17   19   11    5    5    8    7    4    3    5    2    2    5    4 
##   77   78   79   80   81   83   84   85   86   87   88   89   90   91   92 
##    4    3    2    4    2    1    1    2    2    2    3    2    3    3    1 
##   93   94   95   96   97   98   99  100  101  102  103  104  105  106  107 
##    4    4    2    1    1    3    1    3    4    1    2    4    4    2    2 
##  108  109  110  111  112  113  114  116  117  118  119  120  121  123  124 
##    3    1    2    5    1    1    3    2    4    1    2    5    4    1    1 
##  125  126  127  128  129  130  131  132  133  134  135  136  137  138  139 
##    4    4    2    5    1    2    5    3    3    1    2    3    1    2    3 
##  140  141  142  143  144  145  146  147  148  149  151  152  155  156  157 
##    1    1    1    1    3    4    1    2    3    5    4    2    4    1    3 
##  158  159  160  161  162  163  164  165  168  169  172  173  174  175  176 
##    2    6    3    2    2    2    2    3    3    2    1    2    4    3    1 
##  177  180  185  188  189  191  192  193  194  195  197  200  201  203  204 
##    1    1    1    1    1    3    1    2    2    2    1    1    1    2    1 
##  205  206  207  208  209  211  212  214  215  217  220  221  223  226  227 
##    1    1    2    1    2    1    2    1    3    3    2    1    2    1    4 
##  228  229  231  232  233  234  236  238  241  242  243  244  245  247  249 
##    1    1    1    1    2    1    2    2    1    3    1    2    1    1    1 
##  250  251  252  253  254  257  263  264  265  266  267  274  275  277  278 
##    1    1    2    3    2    3    1    3    1    1    1    1    1    1    3 
##  281  285  287  288  290  291  292  294  297  302  303  304  307  308  309 
##    1    1    1    1    1    1    1    1    1    1    2    1    1    1    1 
##  312  313  314  316  320  321  322  326  327  330  331  332  334  336  337 
##    1    1    1    2    3    1    1    1    2    1    1    1    1    1    2 
##  340  349  351  352  360  363  364  365  375  379  381  398  406  408  409 
##    2    1    1    2    1    1    1    1    2    1    1    1    1    2    1 
##  410  416  418  420  421  441  442  450  475  492  494  505  506  518  527 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
##  540  541  572  575  581  586  587  614  625  634  656  667  668  677  709 
##    1    1    1    1    1    1    1    1    2    2    1    1    1    1    1 
##  729  745  763  788  797  800  835  861  866  929  945 1027 1039 1063 1067 
##    1    1    1    1    1    1    1    1    1    1    1    1    1    1    1 
## 1142 1153 1156 1277 1322 1485 1596 2298 2311 2568 5500 
##    1    1    1    1    1    1    1    1    1    1    1

## 查看数据
summary(arudata)

## transactions as itemMatrix in sparse format with
##  16859 rows (elements/itemsets/transactions) and
##  3940 columns (items) and a density of 0.0005758722 
## 
## most frequent items:
##  adidas    nike    李宁   other    特步 (Other) 
##    4098    4057    1936    1511    1228   25422 
## 
## element (itemset/transaction) length distribution:
## sizes
##    1    2    3    4    5    6    7    8    9   10   11   12   13   14   15 
## 8186 3756 2092 1119  657  390  242  131   66   56   48   35   15   18   11 
##   16   17   18   19   20   21   22   24   26   30   39 
##   10   11    3    2    1    1    2    1    4    1    1 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   2.269   3.000  39.000 
## 
## includes extended item information - examples:
##   labels
## 1      -
## 2      .
## 3     ..
## 
## includes extended transaction information - examples:
##   transactionID
## 1     100002563
## 2     100005363
## 3     100005546

inspect(arudata[1:5])

##     items                                          transactionID
## [1] {美津浓,adidas,lacoste,nike,puma,under armour} 100002563    
## [2] {adidas,footjoy,nike,titleist}                 100005363    
## [3] {美津浓,nike}                                  100005546    
## [4] {李宁,adidas,air jordan,converse,nike}         100010926    
## [5] {美津浓,butterfly}                             100013851

## 可视化商品的频率图
## itemFrequency(item_arules)
par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(arudata,type = "absolute",topN = 30,main = "品牌频率图top30",
                  xlab = "品牌名",ylab = "频数",col = "LightSkyBlue")

par(family = "STKaiti",cex = 0.75)
itemFrequencyPlot(arudata,type = "relative",topN = 30,main = "品牌频率图top30",
                  xlab = "品牌名",ylab = "频率",col = "LightSkyBlue")

## 可以看出出现次数最多的商品大约出现了4000多次，说明有很多店铺，销售这个品牌的产品，
## 并且top7的品牌出现次数均大于1000次，
## 品牌出现的频率偏高。

## 查看店铺和商品的稀疏矩阵图像
par(family = "STKaiti",cex = 0.75) 
arules::image(arudata,xlab = "brand number",ylab = "shop number",main = "shops--brands")

## 训练模型
myrule <- apriori(data = arudata,
                  parameter = list(support = 0.005,
                                   confidence = 0.5,
                                   minlen = 1))

## Apriori
## 
## Parameter specification:
##  confidence minval smax arem  aval originalSupport maxtime support minlen
##         0.5    0.1    1 none FALSE            TRUE       5   0.005      1
##  maxlen target   ext
##      10  rules FALSE
## 
## Algorithmic control:
##  filter tree heap memopt load sort verbose
##     0.1 TRUE TRUE  FALSE TRUE    2    TRUE
## 
## Absolute minimum support count: 84 
## 
## set item appearances ...[0 item(s)] done [0.00s].
## set transactions ...[3940 item(s), 16859 transaction(s)] done [0.00s].
## sorting and recoding items ... [33 item(s)] done [0.00s].
## creating transaction tree ... done [0.00s].
## checking subsets of size 1 2 3 4 done [0.00s].
## writing ... [75 rule(s)] done [0.00s].
## creating S4 object  ... done [0.00s].

summary(myrule)

## set of 75 rules
## 
## rule length distribution (lhs + rhs):sizes
##  2  3  4 
## 12 52 11 
## 
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   3.000   3.000   2.987   3.000   4.000 
## 
## summary of quality measures:
##     support           confidence          lift       
##  Min.   :0.005042   Min.   :0.5016   Min.   : 2.084  
##  1st Qu.:0.006228   1st Qu.:0.6100   1st Qu.: 3.034  
##  Median :0.007533   Median :0.7306   Median : 3.553  
##  Mean   :0.015848   Mean   :0.7299   Mean   : 5.003  
##  3rd Qu.:0.011626   3rd Qu.:0.8567   3rd Qu.: 7.441  
##  Max.   :0.156415   Max.   :0.9892   Max.   :12.780  
## 
## mining info:
##     data ntransactions support confidence
##  arudata         16859   0.005        0.5

inspect(myrule[1:10])

##      lhs              rhs      support     confidence lift    
## [1]  {vans}        => {nike}   0.008482116 0.5035211  2.092399
## [2]  {vans}        => {adidas} 0.009193902 0.5457746  2.245294
## [3]  {reebok}      => {adidas} 0.015777923 0.5683761  2.338275
## [4]  {亚瑟士}      => {adidas} 0.013642565 0.5066079  2.084164
## [5]  {converse}    => {adidas} 0.026929237 0.5176739  2.129689
## [6]  {new balance} => {nike}   0.023548253 0.5109395  2.123226
## [7]  {new balance} => {adidas} 0.025564980 0.5546976  2.282002
## [8]  {puma}        => {adidas} 0.029064595 0.5349345  2.200698
## [9]  {air jordan}  => {nike}   0.061866066 0.8549180  3.552641
## [10] {air jordan}  => {adidas} 0.048816656 0.6745902  2.775236

## 将规则按照支持度排序
sortsupport <- arules::sort(myrule,decreasing = TRUE,by = "support")
inspect(sortsupport[1:15])

##      lhs                    rhs      support    confidence lift    
## [1]  {nike}              => {adidas} 0.15641497 0.6499877  2.674022
## [2]  {adidas}            => {nike}   0.15641497 0.6434846  2.674022
## [3]  {air jordan}        => {nike}   0.06186607 0.8549180  3.552641
## [4]  {air jordan}        => {adidas} 0.04881666 0.6745902  2.775236
## [5]  {air jordan,nike}   => {adidas} 0.04519841 0.7305849  3.005595
## [6]  {adidas,air jordan} => {nike}   0.04519841 0.9258809  3.847529
## [7]  {puma}              => {adidas} 0.02906459 0.5349345  2.200698
## [8]  {converse}          => {adidas} 0.02692924 0.5176739  2.129689
## [9]  {new balance}       => {adidas} 0.02556498 0.5546976  2.282002
## [10] {new balance}       => {nike}   0.02354825 0.5109395  2.123226
## [11] {nike,puma}         => {adidas} 0.02253989 0.8351648  3.435833
## [12] {adidas,puma}       => {nike}   0.02253989 0.7755102  3.222659
## [13] {converse,nike}     => {adidas} 0.02230263 0.8623853  3.547817
## [14] {adidas,converse}   => {nike}   0.02230263 0.8281938  3.441587
## [15] {new balance,nike}  => {adidas} 0.01993001 0.8463476  3.481839

## 将规则按照置信度排序
sortconf <- arules::sort(myrule,decreasing = TRUE,by = "confidence")
inspect(sortconf[1:15])

##      lhs                                 rhs      support     confidence
## [1]  {adidas,air jordan,reebok}       => {nike}   0.005457026 0.9892473 
## [2]  {adidas,air jordan,under armour} => {nike}   0.006465389 0.9819820 
## [3]  {air jordan,reebok}              => {nike}   0.006050181 0.9532710 
## [4]  {adidas,air jordan,new balance}  => {nike}   0.005753603 0.9509804 
## [5]  {air jordan,under armour}        => {nike}   0.008126223 0.9448276 
## [6]  {air jordan,new balance}         => {nike}   0.006702651 0.9416667 
## [7]  {new balance,nike,puma}          => {adidas} 0.005990865 0.9351852 
## [8]  {adidas,air jordan}              => {nike}   0.045198410 0.9258809 
## [9]  {converse,nike,puma}             => {adidas} 0.006465389 0.9083333 
## [10] {converse,new balance}           => {adidas} 0.005634972 0.9047619 
## [11] {air jordan,nike,reebok}         => {adidas} 0.005457026 0.9019608 
## [12] {nike,vans}                      => {adidas} 0.007592384 0.8951049 
## [13] {nike,reebok}                    => {adidas} 0.011625838 0.8789238 
## [14] {adidas,new balance,puma}        => {nike}   0.005990865 0.8706897 
## [15] {air jordan,reebok}              => {adidas} 0.005516341 0.8691589 
##      lift    
## [1]  4.110850
## [2]  4.080659
## [3]  3.961350
## [4]  3.951831
## [5]  3.926263
## [6]  3.913128
## [7]  3.847313
## [8]  3.847529
## [9]  3.736845
## [10] 3.722153
## [11] 3.710629
## [12] 3.682424
## [13] 3.615855
## [14] 3.618180
## [15] 3.575683

## 将规则按照提升度排序
sortlift <- arules::sort(myrule,decreasing = TRUE,by = "lift")
inspect(sortlift[1:15])

##      lhs                           rhs          support     confidence
## [1]  {匹克,特步}                => {乔丹}       0.006228127 0.6481481 
## [2]  {匹克,特步}                => {鸿星尔克}   0.005338395 0.5555556 
## [3]  {鸿星尔克,匹克}            => {乔丹}       0.005041817 0.6439394 
## [4]  {361,乔丹}                 => {鸿星尔克}   0.006109496 0.5024390 
## [5]  {乔丹,特步}                => {鸿星尔克}   0.009371849 0.5015873 
## [6]  {鸿星尔克,特步}            => {乔丹}       0.009371849 0.5374150 
## [7]  {李宁,特步}                => {361}        0.006999229 0.5221239 
## [8]  {鸿星尔克,乔丹}            => {特步}       0.009371849 0.7417840 
## [9]  {361,鸿星尔克}             => {乔丹}       0.006109496 0.5150000 
## [10] {361,特步}                 => {乔丹}       0.008363485 0.5127273 
## [11] {361,乔丹}                 => {特步}       0.008363485 0.6878049 
## [12] {鸿星尔克,匹克}            => {特步}       0.005338395 0.6818182 
## [13] {adidas,nike,under armour} => {air jordan} 0.006465389 0.6770186 
## [14] {鸿星尔克,李宁}            => {特步}       0.005575657 0.6438356 
## [15] {361,鸿星尔克}             => {特步}       0.007533068 0.6350000 
##      lift     
## [1]  12.780269
## [2]  12.725694
## [3]  12.697280
## [4]  11.508994
## [5]  11.489484
## [6]  10.596817
## [7]  10.491641
## [8]  10.183825
## [9]  10.154836
## [10] 10.110022
## [11]  9.442754
## [12]  9.360564
## [13]  9.355621
## [14]  8.839108
## [15]  8.717805

## 将关联规则数据转化为数据表格的形式
brand_shop_ruledf <- as(myrule,"data.frame")

utils::write.csv(brand_shop_ruledf,file = "第二题数据/店铺_品牌_关联结果.txt",
          row.names = FALSE,quote = FALSE,fileEncoding = "UTF-8")

guanlian_lin_1.R

daitu

Sun Dec 4 23:32:36 2016