## 2016年厦门大数据大赛
## 第二题:基于大数据的商品销售预测及关联销售挖掘
## author:Daitu
## 2016-7-12
## 工作:读取预处理后的数据进行探索分析;数据可视化

## 更改工作文件夹------------------------------------
## Point the session at the project folder; the relative data paths used
## further down are resolved against it.
setwd(dir = "/Users/daitu/数据分析/2016ABD")
getwd()
## [1] "/Users/daitu/数据分析/2016ABD"
## Drop every object from the global environment, then run the collector.
rm(list = ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 367078 19.7     592000 31.7   460000 24.6
## Vcells 562803  4.3    1023718  7.9   786371  6.0
## 加载所需要的包-----------------------------------
library(stringr)
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
## 
##     between, last
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(GGally)
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
## 
##     nasa
library(psych)
## 
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(corrplot)


## 第一步:读取数据####-------------------------------------------
# item_id 每一个商品链接的独有的ID   ----字符串-------
# item_number 商品的款号          ----字符串-------  
# shop_id 销售这件商品的店铺ID        ----字符串-------
# shop_type 销售这件商品的店铺类型,分为TB_TMALL与TB_JISHI两种   ----字符串-------
# brand_name 商品的品牌名                    ----字符串-------
# item_name 商品名(商品标题)               ----字符串-------
# price 该商品的销售价格(此处对各sku取均值)
# tag_price 商品的标签价
# monthly_sales_num 商品月销量
# assessment_num 商品评价数
# monthly_sales 月销售额  -----价格乘以销量-----

## 商品销售
## Restore the pre-processed item-level table from disk
## (presumably this is what creates `item_fact`, which is used right below).
load(file = "第二题数据/item_fact.RData")
## Preview the first six rows.
head(item_fact, n = 6L)
## # A tibble: 6 × 11
##       item_id      item_number  shop_id shop_type brand_name
##         <chr>            <chr>    <chr>     <chr>      <chr>
## 1 10003087358               UB 64971284  TB_JISHI      umbro
## 2 10005342950 2011030511534230 61051459  TB_JISHI     美津浓
## 3 10006292251       467046-401 64767888  TB_JISHI       nike
## 4 10010133546            YKL40 65507365  TB_TMALL       骐煌
## 5 10013604183          AWDK288 62687027  TB_TMALL       李宁
## 6 10014801651             7X50 35714901  TB_JISHI      yukon
## # ... with 6 more variables: item_name <chr>, price <dbl>,
## #   tag_price <dbl>, monthly_sales_num <dbl>, assessment_num <dbl>,
## #   monthly_sales <dbl>
## Per-column summary statistics of the item table.
summary(object = item_fact)
##    item_id          item_number          shop_id         
##  Length:947276      Length:947276      Length:947276     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##   shop_type          brand_name         item_name        
##  Length:947276      Length:947276      Length:947276     
##  Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character  
##                                                          
##                                                          
##                                                          
##      price            tag_price        monthly_sales_num 
##  Min.   :0.00e+00   Min.   :0.00e+00   Min.   :0.00e+00  
##  1st Qu.:1.31e+02   1st Qu.:1.68e+02   1st Qu.:0.00e+00  
##  Median :2.14e+02   Median :2.88e+02   Median :0.00e+00  
##  Mean   :5.16e+02   Mean   :5.83e+02   Mean   :8.29e+00  
##  3rd Qu.:4.15e+02   3rd Qu.:4.99e+02   3rd Qu.:1.00e+00  
##  Max.   :1.00e+08   Max.   :1.00e+08   Max.   :2.48e+05  
##  assessment_num     monthly_sales    
##  Min.   :    -1.0   Min.   :      0  
##  1st Qu.:     0.0   1st Qu.:      0  
##  Median :     0.0   Median :      0  
##  Mean   :    34.7   Mean   :   1652  
##  3rd Qu.:     3.0   3rd Qu.:    225  
##  Max.   :891523.0   Max.   :4678400
# as.data.frame(item_fact[item_fact$price > 30000 ,])

# ## 销售前500数据
# load("第二题数据/top500df.RData")
# head(top500df)
## 读取店铺的信息数据####
## Path of the tab-separated shop rating file.
## NOTE(review): the name ends in a bare "." - confirm the extension was not lost.
filename4 <- "第二题数据/shop_dsr."
## Read it with a header row; the first two columns are kept as character,
## the remaining three are parsed as numeric.
shop_dsr <- fread(
  filename4,
  sep = "\t",
  header = TRUE,
  colClasses = c("character", "character", "numeric", "numeric", "numeric")
)

head(shop_dsr)
##      shop_id            shop_name  mas  sas  cas
## 1: 126042966       李宁跑酷专卖店 4.81 4.76 4.73
## 2: 148002147 lining李宁新兴专卖店 4.88 4.83 4.82
## 3: 128369255   李宁羽毛球拍专业店 4.86 4.88 4.85
## 4: 145412890   李宁天泽华盛专卖店 4.90 4.86 4.86
## 5: 145270296 lining李宁图漫专卖店 4.89 4.87 4.82
## 6: 108228574   李宁力方力合专卖店 4.90 4.85 4.83
dim(shop_dsr)   # only 19190 shops are listed
## [1] 19190     5
## Convert the data.table into a tibble for the dplyr steps that follow.
shop_dsr <- tbl_df(shop_dsr)
## 将数据按照店铺分组---------------------------------------
## Collapse the item-level table into one row per shop.
## The column order (shop_id, n_item, brand_number, monthly_shop_sum,
## monthly_sales_sum, assessment_sum, tag_price_mean, shop_type, price_mean)
## matters: later plots select these columns by position.
shop <- item_fact %>%
  dplyr::group_by(shop_id) %>%
  dplyr::summarise(
    n_item = dplyr::n_distinct(item_id),          # distinct items listed by the shop
    brand_number = dplyr::n_distinct(brand_name), # distinct brands sold by the shop
    monthly_shop_sum = sum(monthly_sales_num),    # units sold per month
    monthly_sales_sum = sum(monthly_sales),       # monthly revenue
    ## NOTE(review): summary(item_fact) reports a minimum of -1 for
    ## assessment_num; if that is a "missing" sentinel it is being added
    ## into this total - confirm.
    assessment_sum = sum(assessment_num),         # total number of reviews
    tag_price_mean = mean(tag_price),             # mean tag (list) price
    ## first() always yields exactly one value per shop; unique() would
    ## return several values (and break the one-row-per-shop shape) if a
    ## shop id ever appeared with more than one shop_type.
    shop_type = dplyr::first(shop_type)
  ) %>%
  ## Average selling price = revenue / units; shops with no sales give 0/0 = NaN.
  dplyr::mutate(price_mean = monthly_sales_sum / monthly_shop_sum)
dim(shop)    # 23531 shops
## [1] 23531     9
summary(shop)
##    shop_id              n_item         brand_number    monthly_shop_sum  
##  Length:23531       Min.   :   1.00   Min.   : 1.000   Min.   :     0.0  
##  Class :character   1st Qu.:   4.00   1st Qu.: 1.000   1st Qu.:     0.0  
##  Mode  :character   Median :  16.00   Median : 2.000   Median :    11.0  
##                     Mean   :  40.26   Mean   : 3.632   Mean   :   333.8  
##                     3rd Qu.:  51.00   3rd Qu.: 5.000   3rd Qu.:    68.0  
##                     Max.   :5672.00   Max.   :57.000   Max.   :402232.0  
##                                                                          
##  monthly_sales_sum   assessment_sum    tag_price_mean     
##  Min.   :        0   Min.   :      0   Min.   :      0.0  
##  1st Qu.:        0   1st Qu.:      1   1st Qu.:    168.0  
##  Median :     2034   Median :     20   Median :    285.0  
##  Mean   :    66519   Mean   :   1396   Mean   :    599.2  
##  3rd Qu.:    14398   3rd Qu.:    125   3rd Qu.:    532.5  
##  Max.   :111205902   Max.   :1384268   Max.   :2500303.0  
##                                                           
##   shop_type           price_mean     
##  Length:23531       Min.   :    0.0  
##  Class :character   1st Qu.:  105.2  
##  Mode  :character   Median :  184.7  
##                     Mean   :  325.2  
##                     3rd Qu.:  375.1  
##                     Max.   :14014.3  
##                     NA's   :5989
## 可以发现平均销售价格会出现 0/0(缺失值)的情况,针对这种情况,将平均销售价格定为0元。
## Shops without any sales have an undefined average price; store 0 instead.
shop$price_mean <- replace(shop$price_mean, is.na(shop$price_mean), 0)
summary(object = shop)
##    shop_id              n_item         brand_number    monthly_shop_sum  
##  Length:23531       Min.   :   1.00   Min.   : 1.000   Min.   :     0.0  
##  Class :character   1st Qu.:   4.00   1st Qu.: 1.000   1st Qu.:     0.0  
##  Mode  :character   Median :  16.00   Median : 2.000   Median :    11.0  
##                     Mean   :  40.26   Mean   : 3.632   Mean   :   333.8  
##                     3rd Qu.:  51.00   3rd Qu.: 5.000   3rd Qu.:    68.0  
##                     Max.   :5672.00   Max.   :57.000   Max.   :402232.0  
##  monthly_sales_sum   assessment_sum    tag_price_mean     
##  Min.   :        0   Min.   :      0   Min.   :      0.0  
##  1st Qu.:        0   1st Qu.:      1   1st Qu.:    168.0  
##  Median :     2034   Median :     20   Median :    285.0  
##  Mean   :    66519   Mean   :   1396   Mean   :    599.2  
##  3rd Qu.:    14398   3rd Qu.:    125   3rd Qu.:    532.5  
##  Max.   :111205902   Max.   :1384268   Max.   :2500303.0  
##   shop_type           price_mean     
##  Length:23531       Min.   :    0.0  
##  Class :character   1st Qu.:    0.0  
##  Mode  :character   Median :  128.2  
##                     Mean   :  242.5  
##                     3rd Qu.:  288.0  
##                     Max.   :14014.3
## 将 shop_dsr和shop数据连接
## Count the shops in `shop` whose id also occurs in `shop_dsr`.
sum(is.element(shop$shop_id, shop_dsr$shop_id))
## [1] 14977
## 我们可以发现这两个数据集中,只有14977个店铺是相同的,其他则是不同的

# aa <- left_join(shop_dsr,shop,by = "shop_id")


## 对上面的商铺数据进行可视化探索分析
## 查看两种销售方式的对比 -------------------------
## Number of shops of each shop type.
table(shop[["shop_type"]])
## 
## TB_JISHI TB_TMALL 
##    22833      698
## The same counts expressed as a share of all shops.
table(shop[["shop_type"]]) / nrow(shop)
## 
## TB_JISHI TB_TMALL 
## 0.970337 0.029663
## 可以发现销售方式为TB_JISHI占据百分比大于97%
## 销售方式为TB_TMALL 占据百分比不到3%


## 商铺销售商品数目的可视化  ---------------------------------
## Distribution of the number of items listed per shop.
summary(shop[["n_item"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    4.00   16.00   40.26   51.00 5672.00
## Number of shops listing more than 1000 items.
sum(shop[["n_item"]] > 1000, na.rm = TRUE)
## [1] 64
## 可以发现有一个店铺销售的商品数目为5672件,有64个店铺销售的商品数目多于1000
## 查看这些数据
# data.frame(shop[which(shop$n_item > 1000),])

## Histogram of items per shop for all shops, one panel per shop type.
p1 <- ggplot(data = shop, mapping = aes(x = n_item)) +
  geom_histogram(binwidth = 5, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "销售商品数目(件)", y = "店铺的数目(家)", title = "店铺销售商品数目分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))

## Same histogram restricted to shops with at most 250 items.
p2 <- ggplot(data = dplyr::filter(shop, n_item <= 250), mapping = aes(x = n_item)) +
  geom_histogram(binwidth = 5, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "销售商品数目(<=250件)", y = "店铺的数目(家)", title = "店铺销售商品数目分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))

## Stack the full-range plot above the zoomed-in one.
grid.arrange(p1, p2, nrow = 2)

## 可以看出在两种销售方式店铺中,主要销售方式为-TB_JISHI,店铺数目占据大部分,
## 但是这种销售规模较小,大多数销售商品数目小于250
## 而对于TB_TMALL,店铺数量不多,但是销售商品数目多于250的均为这种销售方式的店铺

## 商铺销售商品的品牌数目可视化------------------------------------
## Distribution of the number of brands sold per shop.
summary(shop[["brand_number"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   3.632   5.000  57.000
## Number of shops selling more than 20 brands.
sum(shop[["brand_number"]] > 20, na.rm = TRUE)
## [1] 116
## 可以发现有店铺销售的商品品牌数目为57个品牌,约有116个店铺销售的商品品牌数目多于20

## Histogram of brands per shop for all shops, one panel per shop type.
p1 <- ggplot(shop) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(brand_number),binwidth = 1,color="firebrick",fill = "red",
                 alpha = 0.6) + 
  facet_wrap(~shop_type,scales = "free") +
  labs(x = "销售商品品牌数(个)",y = "店铺的数目(家)",title = "店铺销售商品品牌数分布")+
  theme(plot.title = element_text(hjust = 0.5))

## Zoomed-in view: shops selling at most 20 brands.
## The subset is taken on brand_number (the plotted variable, and the one the
## axis label refers to) rather than on n_item.
p2 <- ggplot(shop[which(shop$brand_number <= 20),]) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(brand_number),binwidth = 1,color="firebrick",fill = "red",
                 alpha = 0.6) + 
  facet_wrap(~shop_type,scales = "free") +
  labs(x = "销售商品品牌数(<=20个)",y = "店铺的数目(家)",title = "店铺销售商品品牌数分布")+
  theme(plot.title = element_text(hjust = 0.5))

## Stack the full-range plot above the zoomed-in one.
grid.arrange(p1,p2,nrow = 2)

## 在销售商品占据的品牌数目上,两种方式的店铺数量的分布大致是相同的

## 店铺的月销售额可视化分析---------------------------------
## Distribution of monthly revenue per shop.
summary(shop[["monthly_sales_sum"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0      2034     66520     14400 111200000
# boxplot(shop$monthly_sales_sum)

## Histogram over all shops; the x axis is labelled in millions ("m").
ms1 <- ggplot(data = shop, mapping = aes(x = monthly_sales_sum)) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e6, "m")) +
  labs(x = "月销售额(元)", y = "店铺的数目(家)", title = "店铺月销售额分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))

## Zoomed-in view: shops with monthly revenue of at most 10000.
ms2 <- ggplot(data = shop[shop[["monthly_sales_sum"]] <= 10000, ],
              mapping = aes(x = monthly_sales_sum)) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "月销售额(<=1e4元)", y = "店铺的数目(家)", title = "店铺月销售额分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))

## Stack the full-range plot above the zoomed-in one.
grid.arrange(ms1, ms2, nrow = 2)

## 可以看出 销售方式为TB_TMALL的店铺的月销售额更大,月销售额有超过90,000,000元的
## 只有很少一部分店铺的月销售额少于10000元
##  销售方式为TB_JISHI 的店铺,月销售额大多数不超过1000元

## 店铺的商品月销量数据可视化---------------------------------
## Distribution of monthly units sold per shop.
summary(shop[["monthly_shop_sum"]])
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##      0.0      0.0     11.0    333.8     68.0 402200.0
## Share of shops selling at most 1000 units per month.
nrow(shop[shop$monthly_shop_sum <= 1000, ]) / nrow(shop)
## [1] 0.9674897
## 约96.7%的商铺,月销售商品数目不超过1000件
## Summary restricted to shops selling at most 1000 units per month.
summary(shop$monthly_shop_sum[shop$monthly_shop_sum <= 1000])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   10.00   62.43   57.00 1000.00
## Histogram over all shops; the x axis is labelled in units of 10,000.
mss1 <- ggplot(data = shop, mapping = aes(x = monthly_shop_sum)) +
  geom_histogram(bins = 200, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e4, "万")) +
  labs(x = "商品月销量(个)", y = "店铺的数目(家)", title = "店铺商品月销量分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))
# mss1

## Zoomed-in view: shops selling at most 1000 units per month.
mss2 <- ggplot(data = shop[shop[["monthly_shop_sum"]] <= 1000, ],
               mapping = aes(x = monthly_shop_sum)) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "商品月销量(<=1e3个)", y = "店铺的数目(家)", title = "店铺商品月销量分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))
# mss2
## Stack the full-range plot above the zoomed-in one.
grid.arrange(mss1, mss2, nrow = 2)

## 可以发现整体的趋势是相同的

## 对店铺的评价总数数据可视化---------------------------------
## Distribution of the total number of reviews per shop.
summary(shop[["assessment_sum"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       1      20    1396     125 1384000
## Share of shops with at most 2000 reviews.
nrow(shop[shop$assessment_sum <= 2000, ]) / nrow(shop)
## [1] 0.9547406
## 约95.5%的商铺,评价总数目不超过2000个
## Summary restricted to shops with at most 2000 reviews.
summary(shop$assessment_sum[shop$assessment_sum <= 2000])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     1.0    16.0   123.8    97.0  2000.0
## Histogram over all shops; the x axis is labelled in units of 10,000.
as1 <- ggplot(data = shop, mapping = aes(x = assessment_sum)) +
  geom_histogram(bins = 200, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e4, "万")) +
  labs(x = "评价总数目(条)", y = "店铺的数目(家)", title = "店铺评价总数目分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))
# as1

## Zoomed-in view: shops with at most 2000 reviews.
as2 <- ggplot(data = shop[shop[["assessment_sum"]] <= 2000, ],
              mapping = aes(x = assessment_sum)) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "评价总数目(<=2e3条)", y = "店铺的数目(家)", title = "店铺评价总数目分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))
# as2
## Stack the full-range plot above the zoomed-in one.
grid.arrange(as1, as2, nrow = 2)

## 可以发现整体的趋势是相同的


## 店铺商品的平均售价数据可视化---------------------------------
## Distribution of the average selling price per shop.
summary(shop[["price_mean"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0   128.2   242.5   288.0 14010.0
## Number of shops whose average selling price exceeds 3000.
sum(shop[["price_mean"]] > 3000, na.rm = TRUE)
## [1] 64
## 64 shops have an average selling price above 3000.
## Attach the rating columns from shop_dsr to those shops and print them all.
mean_price_if <- shop %>%
  dplyr::filter(price_mean > 3000) %>%
  dplyr::left_join(shop_dsr, by = "shop_id")
as.data.frame(mean_price_if)
##      shop_id n_item brand_number monthly_shop_sum monthly_sales_sum
## 1  105150986     26           10                2          6874.000
## 2  106491637      4            2               57        285000.000
## 3  106709126      6            2                2          6998.000
## 4  106994378     30            5                1          5000.000
## 5  108378081     26            8               11         41400.000
## 6  108706114     56            8                2         14331.333
## 7  109124350     65            5               29         96333.158
## 8  110449499     48            1              883       3185100.000
## 9  110763136     37            3               38        316454.246
## 10 112247087     17           11               17         72933.857
## 11 112289638     12            3                5         28548.867
## 12 113345819      3            1               30        111770.000
## 13 113728768      2            2               15         58485.000
## 14 113735217     17            8               60        542080.000
## 15 115441901     64            8               35        114404.893
## 16 115638306     44            3              131        507480.158
## 17 117441510     29           13               15         54390.000
## 18 118604216     28            8                1          3200.000
## 19 118747986     27            3                4         13055.636
## 20 119122395      7            2               15         70176.263
## 21 120030201     11            6                1          4399.600
## 22 120864435     60            6                2         18213.286
## 23 121506705      1            1                3         13497.000
## 24 121701682     44            6                4         18228.042
## 25 122361611      1            1                1          4999.000
## 26 122996061     40            6                1          3599.000
## 27 125235924     53            5               11         48085.500
## 28 128499466     12            9               25         91301.308
## 29 128800353      8            6                8         53548.000
## 30 135688535     18           13               13         55136.000
## 31 135748733     65            3               44        144321.667
## 32 144458316      2            2                1          4263.000
## 33 145460561      1            1               13         55900.000
## 34 145938867      4            3                7         24857.000
## 35 148072553     54            9               55        509461.000
## 36 148679224      1            1               21         83979.000
## 37 149529472     63           13               55        172098.000
## 38 155007283      4            1               70        283930.000
## 39 156741993      9            5                2          6088.500
## 40 162646920     12            1              123        372114.000
## 41  33205199     49            4                1          3999.000
## 42  33724331      5            4               27        131136.000
## 43  35951771     16            4                1          5449.500
## 44  36325984      8            2              414       1318640.000
## 45  36405665      5            1               39        182938.000
## 46  58288287      3            1               34        102566.000
## 47  59240636     67            5                6         25146.933
## 48  59404220     26           10                1         13800.000
## 49  59643312      1            1                1          4300.000
## 50  60414631     59            7               15         88218.000
## 51  60654322      2            1                1          4955.000
## 52  61402211     11            2             1006       4174423.500
## 53  61470019     26            6                5         23418.333
## 54  62040425      7            3                6         27655.000
## 55  62292177     52           19                7         98100.000
## 56  63346840     22            6               63        293124.000
## 57  64604098      3            2                1          3613.846
## 58  65625754     46            4                5         49995.000
## 59  66195737     43            9                1          4230.000
## 60  69320330     66            6                3         19799.000
## 61  70339413     67            5                2          6380.000
## 62  71891736     19            5                5         17895.000
## 63  72633459      1            1                2         13800.000
## 64  72917227      4            3                3         16734.000
##    assessment_sum tag_price_mean shop_type price_mean
## 1               0       982.8077  TB_JISHI   3437.000
## 2             286      4418.9975  TB_JISHI   5000.000
## 3               6      2909.0000  TB_JISHI   3499.000
## 4               1      6849.8333  TB_JISHI   5000.000
## 5              61      3427.4423  TB_JISHI   3763.636
## 6              21      1667.0536  TB_JISHI   7165.667
## 7              88      3075.4231  TB_JISHI   3321.833
## 8            3285      4521.2500  TB_TMALL   3607.135
## 9             249      4107.9459  TB_JISHI   8327.743
## 10             33      3328.4412  TB_JISHI   4290.227
## 11              5      4641.8292  TB_JISHI   5709.773
## 12             80      5665.6667  TB_TMALL   3725.667
## 13              2      2014.0000  TB_JISHI   3899.000
## 14            234      5000.5882  TB_JISHI   9034.667
## 15             90      4740.1562  TB_JISHI   3268.711
## 16            596      3381.4432  TB_JISHI   3873.894
## 17             27      4994.9655  TB_JISHI   3626.000
## 18              1      1595.7857  TB_JISHI   3200.000
## 19             19      2576.8037  TB_JISHI   3263.909
## 20             72      3665.8571  TB_JISHI   4678.418
## 21              4      4102.3636  TB_JISHI   4399.600
## 22             11      1718.0667  TB_JISHI   9106.643
## 23              3      8999.0000  TB_JISHI   4499.000
## 24             31      2450.7045  TB_JISHI   4557.010
## 25              4      4999.0000  TB_JISHI   4999.000
## 26              5      1736.7625  TB_JISHI   3599.000
## 27              7      6052.4151  TB_JISHI   4371.409
## 28             33      1357.1667  TB_JISHI   3652.052
## 29             32      7555.1875  TB_JISHI   6693.500
## 30             42      1198.1667  TB_JISHI   4241.231
## 31            125      3110.9846  TB_JISHI   3280.038
## 32              2      2161.0000  TB_JISHI   4263.000
## 33              1      4300.0000  TB_JISHI   4300.000
## 34             44      1754.2500  TB_JISHI   3551.000
## 35            134      3807.7500  TB_JISHI   9262.927
## 36             15      3999.0000  TB_JISHI   3999.000
## 37             80      2005.0952  TB_JISHI   3129.055
## 38             69      4149.0000  TB_JISHI   4056.143
## 39              0       746.9222  TB_JISHI   3044.250
## 40             45      2769.2500  TB_JISHI   3025.317
## 41              3      3139.8571  TB_JISHI   3999.000
## 42            100      5021.1000  TB_JISHI   4856.889
## 43              5      1238.1875  TB_JISHI   5449.500
## 44            657      1868.2500  TB_JISHI   3185.121
## 45            422      4603.0000  TB_JISHI   4690.718
## 46            141      3532.3333  TB_JISHI   3016.647
## 47             11      3457.7612  TB_JISHI   4191.156
## 48              0      6242.6923  TB_JISHI  13800.000
## 49              2      4300.0000  TB_JISHI   4300.000
## 50             94      2934.1864  TB_JISHI   5881.200
## 51              6      2817.5000  TB_JISHI   4955.000
## 52           4328      2580.0000  TB_JISHI   4149.526
## 53             39      3603.6154  TB_JISHI   4683.667
## 54            364      3207.4286  TB_JISHI   4609.167
## 55              5      6027.3462  TB_JISHI  14014.286
## 56            298      3419.0909  TB_JISHI   4652.762
## 57              4      1733.8333  TB_JISHI   3613.846
## 58             11      8383.9783  TB_JISHI   9999.000
## 59              1      2144.7442  TB_JISHI   4230.000
## 60             19      2489.3030  TB_JISHI   6599.667
## 61              3      1722.0597  TB_JISHI   3190.000
## 62             27      2462.1579  TB_JISHI   3579.000
## 63              2      6900.0000  TB_JISHI   6900.000
## 64              9      2893.0000  TB_JISHI   5578.000
##                                          shop_name  mas  sas  cas
## 1                                             <NA>   NA   NA   NA
## 2                                     捷时达电商部 4.99 4.99 4.98
## 3                                             <NA>   NA   NA   NA
## 4                                     儒迪的托拉斯 4.80 4.80 4.80
## 5                                       天驰高尔夫 4.77 4.80 4.73
## 6                                   阿言的生活小馆 4.96 4.96 5.00
## 7                                             <NA>   NA   NA   NA
## 8                             suunto颂拓官方旗舰店 4.91 4.87 4.85
## 9                                     珂珂球鞋鞋柜 4.97 4.97 4.97
## 10                                            <NA>   NA   NA   NA
## 11                                            <NA>   NA   NA   NA
## 12                              青岛荣信电器专营店 4.94 4.90 4.86
## 13                                            <NA>   NA   NA   NA
## 14                                      风度高尔夫 4.99 4.99 4.99
## 15                                          鞋符号 4.93 4.93 4.95
## 16                                 一步天堂sneaker 4.98 4.97 4.97
## 17                                      信天翁商城 4.99 4.93 4.94
## 18                                            <NA>   NA   NA   NA
## 19                                     JYZN sports 5.00 5.00 4.97
## 20                                            <NA>   NA   NA   NA
## 21                                          艾佛象 5.00 5.00 5.00
## 22                                   LIP球鞋工作室 4.95 4.95 4.93
## 23                                            <NA>   NA   NA   NA
## 24                                        吉吉潮店 5.00 5.00 5.00
## 25                                  瑞哥南北杂货铺 5.00 5.00 5.00
## 26                                            <NA>   NA   NA   NA
## 27                                        暴暴体育 5.00 5.00 5.00
## 28                              晴天高尔夫用品商城 4.88 4.89 4.88
## 29                                            <NA>   NA   NA   NA
## 30                          十八洞高尔夫用品特卖店 4.89 4.89 4.88
## 31                                          彧鞋屋 4.96 4.98 4.95
## 32                                            <NA>   NA   NA   NA
## 33                                            <NA>   NA   NA   NA
## 34                                            <NA>   NA   NA   NA
## 35                                            <NA>   NA   NA   NA
## 36                                    海尔商城店铺 4.95 4.96 4.97
## 37                                正品高尔夫球用品 4.95 4.97 4.97
## 38                                            <NA>   NA   NA   NA
## 39                            美国耐克篮球正品专柜 5.00 5.00 4.88
## 40                                            <NA>   NA   NA   NA
## 41                                            <NA>   NA   NA   NA
## 42                                            <NA>   NA   NA   NA
## 43                                            <NA>   NA   NA   NA
## 44                                       TCL舰旗店 4.94 4.95 4.95
## 45                            海尔智能医疗器械商城 4.96 4.96 4.94
## 46                                海尔扫地机品牌店 4.93 4.97 4.96
## 47                                            <NA>   NA   NA   NA
## 48                              TP二手高尔夫球具店 4.82 4.82 4.85
## 49 原宿流---個性あふれる!化妆品百货及日本代购代拍 5.00 5.00 5.00
## 50                                        YUKON568 4.90 4.92 4.96
## 51                                辉煌高尔夫旗航店 4.86 4.86 4.80
## 52                                    跨时代全球购 4.91 4.91 4.85
## 53                                  诚昊运动白菜店 4.93 4.95 4.93
## 54                             Haier海尔生活电器城 4.90 4.94 4.94
## 55                       Atomicgolf 阿淘美客高尔夫 4.78 4.78 4.68
## 56                                            <NA>   NA   NA   NA
## 57                                            <NA>   NA   NA   NA
## 58                                      朧月夜sole 5.00 5.00 5.00
## 59                                万里百分百诚信店 4.93 4.93 4.93
## 60                                  nbshop武汉潮铺 5.00 5.00 5.00
## 61                                            <NA>   NA   NA   NA
## 62                                            <NA>   NA   NA   NA
## 63                                  嘉乐高尔夫批发 4.96 4.98 4.98
## 64                                            <NA>   NA   NA   NA
## Share of shops whose average selling price is at most 3000.
nrow(shop[shop$price_mean <= 3000, ]) / nrow(shop)
## [1] 0.9972802
## 约99.7%的商铺,店铺商品的平均销售价格不超过3000
## Summary restricted to shops with an average selling price of at most 3000.
summary(shop$price_mean[shop$price_mean <= 3000])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0   128.0   229.6   285.5  2999.0
## Histogram over all shops; the x axis is shared between panels and
## labelled in thousands ("k").
pm1 <- ggplot(data = shop, mapping = aes(x = price_mean)) +
  geom_histogram(bins = 200, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free_y") +
  scale_x_continuous(labels = function(x) paste0(x / 1e3, "k")) +
  labs(x = "商品平均售价(元)", y = "店铺的数目(家)", title = "店铺商品平均售价分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))
# pm1

## Zoomed-in view: shops with an average selling price of at most 3000.
pm2 <- ggplot(data = shop[shop[["price_mean"]] <= 3000, ],
              mapping = aes(x = price_mean)) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free_y") +
  labs(x = "商品平均售价(<=3e3元)", y = "店铺的数目(家)", title = "店铺商品平均售价分布") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5))
# pm2
## Stack the full-range plot above the zoomed-in one.
grid.arrange(pm1, pm2, nrow = 2)

## 在两种类型商铺的商品平均售价上,可以发现两类商铺的分布几乎是一样的,
## 并不能说明哪种类型的店铺出售的商品更高档

## 对商店商品的平均标价 进行可视化分析---------------------------
## Distribution of the average tag (list) price per shop.
summary(shop[["tag_price_mean"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##       0.0     168.0     285.0     599.2     532.5 2500000.0
## Number of shops whose average tag price exceeds 3000.
sum(shop[["tag_price_mean"]] > 3000, na.rm = TRUE)
## [1] 236
## Number of shops whose average tag price exceeds 100000.
sum(shop[["tag_price_mean"]] > 100000, na.rm = TRUE)
## [1] 3
## 平均标价大于3千的店铺有236个 ,平均标价大于10万的有3个
## Attach the rating columns from shop_dsr to the shops whose average tag
## price exceeds 100000, then print them in full.
mean_price_if <- shop %>%
  dplyr::filter(tag_price_mean > 100000) %>%
  dplyr::left_join(shop_dsr, by = "shop_id")
as.data.frame(mean_price_if)
##     shop_id n_item brand_number monthly_shop_sum monthly_sales_sum
## 1 118193439      2            2                0             0.000
## 2 151623813      5            4                0             0.000
## 3  35545828     40           10                6          1069.667
##   assessment_sum tag_price_mean shop_type price_mean            shop_name
## 1              0       100084.0  TB_JISHI     0.0000         渣霄虎子的店
## 2              0       191333.2  TB_JISHI     0.0000           独自的华丽
## 3             13      2500303.0  TB_JISHI   178.2778 北京斯托克司户外装备
##    mas  sas  cas
## 1 5.00 5.00 5.00
## 2 0.00 0.00 0.00
## 3 4.74 4.95 4.95
## 这些店铺均是TB_JISHI类型,并且月销售额要么没有要么很低
## 说明这些商品的标价很高,但是并没有人去购买,说明这些商品是博人眼球的物品


## Share of shops whose average tag price is at most 3000.
nrow(shop[shop$tag_price_mean <= 3000, ]) / nrow(shop)
## [1] 0.9899707
## 约99.0%的商铺,商铺商品的平均标价不超过3000
## Summary restricted to shops with an average tag price of at most 3000.
summary(shop[shop$tag_price_mean <= 3000,]$tag_price_mean)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.01  166.80  282.70  422.40  520.50 2999.00
## High-price band: shops with an average tag price strictly between 3000
## and 100000; the x axis is labelled in thousands ("k").
## The title is centred like in every other histogram of this script.
pm1 <- ggplot(shop[shop$tag_price_mean > 3000 & shop$tag_price_mean < 100000,]) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(tag_price_mean),bins = 200,color="firebrick",fill = "red",
                 alpha = 0.6) + 
  facet_wrap(~shop_type,scales = "free") +
  scale_x_continuous(labels = function(x) paste(x/1e3,"k",sep = "")) +
  labs(x = "商品平均标价(>3e3&<1e5元)",y = "店铺的数目(家)",title = "店铺商品平均标价分布") +
  theme(plot.title = element_text(hjust = 0.5))
# pm1

## Low-price band: shops with an average tag price of at most 3000.
pm2 <- ggplot(shop[shop$tag_price_mean <= 3000,]) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(tag_price_mean),bins = 100,color="firebrick",fill = "red",
                 alpha = 0.6) + 
  facet_wrap(~shop_type,scales = "free_y") +
  #scale_x_continuous(labels = function(x) paste(x/10e3,"k",sep = "")) +
  labs(x = "商品平均标价(<=3e3元)",y = "店铺的数目(家)",title = "店铺商品平均标价分布") +
  theme(plot.title = element_text(hjust = 0.5))
# pm2
## Stack the high-price plot above the low-price one.
grid.arrange(pm1,pm2,nrow = 2)

## 可以看出两类商店的平均标价的分布存在很明显的差异
## 在高价区,TB_JISHI的商品平均标价较高
## 在低价区,TB_TMALL的商铺比较集中于商品平均标价高的区域

## 对商铺数据进行平行坐标图可视化####--------------------------------
## Axis labels for the seven numeric shop columns (positions 2:7 and 9).
lab_x <- c(
  "销售商品数", "销售品牌数", "月销量", "月销售额",
  "评论数", "平均标价", "平均售价"
)

## Parallel-coordinate plots grouped by column 8 (shop_type), drawn three
## times with different per-variable scalings.

## scale = "std": subtract the mean, divide by the standard deviation.
ggparcoord(data = shop, columns = c(2:7, 9), groupColumn = 8, scale = "std") +
  scale_x_discrete(labels = lab_x) +
  xlab("") +
  ylab("标准化后数值") +
  ggtitle("商铺平行坐标图") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top")

## scale = "robust": subtract the median, divide by the median absolute
## deviation (MAD).
ggparcoord(data = shop, columns = c(2:7, 9), groupColumn = 8, scale = "robust") +
  scale_x_discrete(labels = lab_x) +
  xlab("") +
  ylab("Robust后数值") +
  ggtitle("商铺平行坐标图") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top")

## scale = "uniminmax": rescale every variable to the unit interval [0, 1].
ggparcoord(data = shop, columns = c(2:7, 9), groupColumn = 8, scale = "uniminmax") +
  scale_x_discrete(labels = lab_x) +
  xlab("") +
  ylab("单位区间数值") +
  ggtitle("商铺平行坐标图") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top")

## 从平行坐标图上我们可以看出两类店铺的差异
## 1: TB_TMALL 的销售商品的数目更多,月销量更高,月销售额更高,评论数目更多
## 2:TB_JISHI  的销售品牌数更多,平均标价更高,平均售价更高


## 分析商铺数据中的数据的相关系数####---------------------------
## 查看散点图
## Scatter-plot matrix of the seven numeric shop columns, coloured by
## shop_type, with Pearson correlations in the upper panels.
shop %>%
  as.data.frame() %>%
  ggscatmat(columns = c(2:7, 9), color = "shop_type", corMethod = "pearson") +
  ggtitle("散点图矩阵") +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "top")

## 从散点图和相关系数矩阵中可以看出,不同的类别销售方式相关性有差异

## 将数据标准化,然后查看相关系数和散点矩阵图--------------------------------
## z-score the seven numeric shop columns (scale() centres and scales each
## column), then re-attach the two identifier columns.
shop_std <- tbl_df(as.data.frame(scale(shop[, c(2:7, 9)])))
shop_std[["shop_type"]] <- shop[["shop_type"]]
shop_std[["shop_id"]] <- shop[["shop_id"]]   # standardized shop table
head(shop_std)
## # A tibble: 6 × 9
##        n_item brand_number monthly_shop_sum monthly_sales_sum
##         <dbl>        <dbl>            <dbl>             <dbl>
## 1  0.00494926    0.9106128      -0.04400471       0.002296298
## 2  0.10480155    1.1809999      -0.05381395      -0.025089271
## 3 -0.21472577   -0.4413223      -0.05785305      -0.052513470
## 4  0.14474246    0.3698388      -0.03304144      -0.022616612
## 5 -0.23469622   -0.4413223      -0.05381395      -0.056765884
## 6 -0.19475531   -0.1709352      -0.04592809      -0.049164979
## # ... with 5 more variables: assessment_sum <dbl>, tag_price_mean <dbl>,
## #   price_mean <dbl>, shop_type <chr>, shop_id <chr>
summary(object = shop_std)
##      n_item          brand_number     monthly_shop_sum  
##  Min.   :-0.26132   Min.   :-0.7117   Min.   :-0.06420  
##  1st Qu.:-0.24135   1st Qu.:-0.7117   1st Qu.:-0.06420  
##  Median :-0.16147   Median :-0.4413   Median :-0.06208  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.07152   3rd Qu.: 0.3698   3rd Qu.:-0.05112  
##  Max.   :37.48950   Max.   :14.4300   Max.   :77.30033  
##  monthly_sales_sum  assessment_sum     tag_price_mean     
##  Min.   :-0.05827   Min.   :-0.06944   Min.   : -0.03657  
##  1st Qu.:-0.05827   1st Qu.:-0.06939   1st Qu.: -0.02632  
##  Median :-0.05649   Median :-0.06845   Median : -0.01918  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   :  0.00000  
##  3rd Qu.:-0.04566   3rd Qu.:-0.06322   3rd Qu.: -0.00407  
##  Max.   :97.36520   Max.   :68.78840   Max.   :152.59263  
##    price_mean       shop_type           shop_id         
##  Min.   :-0.5673   Length:23531       Length:23531      
##  1st Qu.:-0.5673   Class :character   Class :character  
##  Median :-0.2674   Mode  :character   Mode  :character  
##  Mean   : 0.0000                                        
##  3rd Qu.: 0.1065                                        
##  Max.   :32.2195
## 查看散点图
## Same scatter-plot matrix as before, now on the standardized columns
## (the first seven columns of shop_std are the numeric ones).
shop_std %>%
  as.data.frame() %>%
  ggscatmat(columns = 1:7, color = "shop_type", corMethod = "pearson") +
  ggtitle("散点图矩阵") +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "top")

## 可以看出,数据标准化并不会改变相关系数的大小(Pearson相关系数对线性变换保持不变)


## Pearson correlation matrix of the seven numeric shop columns.
shop_cor <- cor(x = shop[, c(2:7, 9)], method = "pearson")
print(shop_cor)
##                         n_item brand_number monthly_shop_sum
## n_item             1.000000000  0.099900400     0.3290160004
## brand_number       0.099900400  1.000000000    -0.0109006465
## monthly_shop_sum   0.329016000 -0.010900647     1.0000000000
## monthly_sales_sum  0.354521976 -0.008370258     0.8335239180
## assessment_sum     0.316417453 -0.016163316     0.7299167843
## tag_price_mean    -0.000263462  0.013772150    -0.0008675259
## price_mean         0.023641528  0.084639091    -0.0064856184
##                   monthly_sales_sum assessment_sum tag_price_mean
## n_item                 3.545220e-01    0.316417453  -2.634620e-04
## brand_number          -8.370258e-03   -0.016163316   1.377215e-02
## monthly_shop_sum       8.335239e-01    0.729916784  -8.675259e-04
## monthly_sales_sum      1.000000e+00    0.607148620  -7.270806e-05
## assessment_sum         6.071486e-01    1.000000000  -1.024222e-03
## tag_price_mean        -7.270806e-05   -0.001024222   1.000000e+00
## price_mean             2.267963e-02   -0.007825621   2.086091e-02
##                     price_mean
## n_item             0.023641528
## brand_number       0.084639091
## monthly_shop_sum  -0.006485618
## monthly_sales_sum  0.022679633
## assessment_sum    -0.007825621
## tag_price_mean     0.020860913
## price_mean         1.000000000
## Significance tests for the correlations, left disabled:
# corr.test(shop[,c(2:7,9)])
# corr.p(shop_cor,dim(shop)[1])

## Replace the column-name dimnames with the Chinese display labels so the
## correlation plot is readable.
lab_x <- c(
  "销售商品数", "销售品牌数", "月销量", "月销售额",
  "评论数", "平均标价", "平均售价"
)
dimnames(shop_cor) <- list(lab_x, lab_x)
print(shop_cor)
##              销售商品数   销售品牌数        月销量      月销售额
## 销售商品数  1.000000000  0.099900400  0.3290160004  3.545220e-01
## 销售品牌数  0.099900400  1.000000000 -0.0109006465 -8.370258e-03
## 月销量      0.329016000 -0.010900647  1.0000000000  8.335239e-01
## 月销售额    0.354521976 -0.008370258  0.8335239180  1.000000e+00
## 评论数      0.316417453 -0.016163316  0.7299167843  6.071486e-01
## 平均标价   -0.000263462  0.013772150 -0.0008675259 -7.270806e-05
## 平均售价    0.023641528  0.084639091 -0.0064856184  2.267963e-02
##                  评论数      平均标价     平均售价
## 销售商品数  0.316417453 -2.634620e-04  0.023641528
## 销售品牌数 -0.016163316  1.377215e-02  0.084639091
## 月销量      0.729916784 -8.675259e-04 -0.006485618
## 月销售额    0.607148620 -7.270806e-05  0.022679633
## 评论数      1.000000000 -1.024222e-03 -0.007825621
## 平均标价   -0.001024222  1.000000e+00  0.020860913
## 平均售价   -0.007825621  2.086091e-02  1.000000000
## Single-panel device with a CJK-capable font, then draw the correlation
## matrix as pie glyphs (full matrix, extra left/top margin for the labels).
par(mfrow = c(1, 1), family = "STKaiti")
corrplot(
  corr = shop_cor,
  method = "pie",
  type = "full",
  title = "相关系数图",
  mar = c(0, 6, 6, 0)
)

## 可以发现月销量、月销售额、评论数之间的相关系数较大
## 销售商品数与月销量、月销售额、评论数的相关系数也超过了0.3



## 分析店铺的信息数据####--------------------------------------------
## First rows of the shop rating table (shop_id, shop_name, mas, sas, cas).
head(shop_dsr, n = 6L)
## # A tibble: 6 × 5
##     shop_id            shop_name   mas   sas   cas
##       <chr>                <chr> <dbl> <dbl> <dbl>
## 1 126042966       李宁跑酷专卖店  4.81  4.76  4.73
## 2 148002147 lining李宁新兴专卖店  4.88  4.83  4.82
## 3 128369255   李宁羽毛球拍专业店  4.86  4.88  4.85
## 4 145412890   李宁天泽华盛专卖店  4.90  4.86  4.86
## 5 145270296 lining李宁图漫专卖店  4.89  4.87  4.82
## 6 108228574   李宁力方力合专卖店  4.90  4.85  4.83
## Column-wise summary of the same table.
summary(object = shop_dsr)
##    shop_id           shop_name              mas             sas       
##  Length:19190       Length:19190       Min.   :0.000   Min.   :0.000  
##  Class :character   Class :character   1st Qu.:4.650   1st Qu.:4.690  
##  Mode  :character   Mode  :character   Median :4.860   Median :4.870  
##                                        Mean   :4.012   Mean   :4.027  
##                                        3rd Qu.:4.960   3rd Qu.:4.970  
##                                        Max.   :5.000   Max.   :5.000  
##       cas       
##  Min.   :0.000  
##  1st Qu.:4.660  
##  Median :4.850  
##  Mean   :4.013  
##  3rd Qu.:4.950  
##  Max.   :5.000
## Attach shop_type (columns 1 and 8 of shop are shop_id and shop_type) to
## the rating table, then drop rating rows without a matching shop, since
## their shop type is unknown.
shop_dsrj <- shop_dsr %>%
  dplyr::left_join(shop[, c(1, 8)], by = "shop_id") %>%
  dplyr::filter(!is.na(shop_type))

## 对数据进行可视化--------------------------------------

## One histogram per score column (mas, sas, cas) of shop_dsrj, 0.1-wide
## bins. position = "fill" stacks the two shop types to a constant height,
## so each bar shows the relative share of the shop types in that bin.

mas <- ggplot(shop_dsrj, aes(x = mas, fill = shop_type)) +
  geom_histogram(
    aes(y = ..density..),
    binwidth = 0.1, position = "fill",
    colour = "firebrick", alpha = 0.6
  ) +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "right")

sas <- ggplot(shop_dsrj, aes(x = sas, fill = shop_type)) +
  geom_histogram(
    aes(y = ..density..),
    binwidth = 0.1, position = "fill",
    colour = "firebrick", alpha = 0.6
  ) +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "right")

cas <- ggplot(shop_dsrj, aes(x = cas, fill = shop_type)) +
  geom_histogram(
    aes(y = ..density..),
    binwidth = 0.1, position = "fill",
    colour = "firebrick", alpha = 0.6
  ) +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "right")

## Three panels stacked vertically.
## NOTE(review): top = 0 / bottom = 0 are passed through unchanged; confirm
## what gridExtra does with numeric values here.
grid.arrange(mas, sas, cas, nrow = 3, top = 0, bottom = 0)
## Warning: Removed 54 rows containing missing values (geom_bar).

## Warning: Removed 54 rows containing missing values (geom_bar).
## Warning: Removed 52 rows containing missing values (geom_bar).

## 可以发现,TB_TMALL 没有低分
## 将两个商店的数据结合,只分析14977多家店铺的数据####----------------
## From here on `shop` is overwritten: rating columns are attached, shops
## without a rating record (no shop_name) are dropped, and rows are ordered
## by monthly sales revenue, largest first.
shop <- shop %>%
  dplyr::left_join(shop_dsr, by = "shop_id") %>%
  dplyr::filter(!is.na(shop_name)) %>%
  dplyr::arrange(dplyr::desc(monthly_sales_sum))
dim(shop)
## [1] 14977    13
shop %>%
  head() %>%
  as.data.frame()
##    shop_id n_item brand_number monthly_shop_sum monthly_sales_sum
## 1 71955116   1071            1           258393         111205902
## 2 62147762   5672            1           338607          87688349
## 3 57299736   1501            2           402232          60401379
## 4 60790435    903            1           263702          42626982
## 5 62847409    452            1            79457          34905426
## 6 60036677   1395            1           139804          19488651
##   assessment_sum tag_price_mean shop_type price_mean         shop_name
## 1         520424       592.1662  TB_TMALL   430.3751    NIKE官方旗舰店
## 2         405415       367.0465  TB_TMALL   258.9679  adidas官方旗舰店
## 3        1384268       171.3135  TB_TMALL   150.1655      李宁官方网店
## 4         680782       283.4186  TB_TMALL   161.6483      安踏官方网店
## 5         507890       748.9314  TB_TMALL   439.2996 New Balance旗舰店
## 6        1000357       269.2136  TB_TMALL   139.3998    特步官方旗舰店
##    mas  sas  cas
## 1 4.86 4.84 4.87
## 2 4.83 4.77 4.83
## 3 4.83 4.80 4.79
## 4 4.82 4.78 4.76
## 5 4.80 4.79 4.78
## 6 4.79 4.74 4.76
## 查看两种销售方式的对比 -------------------------
## Number of shops per shop type.
table(shop[["shop_type"]])
## 
## TB_JISHI TB_TMALL 
##    14448      529
## Same counts as a share of all shops.
table(shop[["shop_type"]]) / nrow(shop)
## 
##   TB_JISHI   TB_TMALL 
## 0.96467917 0.03532083
## TB_JISHI accounts for about 96.5% of the shops,
## TB_TMALL for about 3.5%.


## 商铺销售商品数目的可视化  ---------------------------------
## Distribution of the number of items sold per shop.
summary(shop[["n_item"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    4.00   18.00   44.58   53.00 5672.00
## How many shops sell more than 1000 items.
sum(shop$n_item > 1000, na.rm = TRUE)
## [1] 57
## The largest shop sells 5672 items; 57 shops sell more than 1000 items.
## Uncomment to inspect those shops:
# data.frame(shop[which(shop$n_item > 1000),])

## Histograms (bin width 5) of the number of items per shop, by shop_type.
## p1: all shops.
p1 <- ggplot(shop, aes(x = n_item)) +
  geom_histogram(binwidth = 5, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "销售商品数目(件)",
    y = "店铺的数目(家)",
    title = "店铺销售商品数目分布"
  ) +
  theme_bw(base_family = "STKaiti")

## p2: zoom on shops with at most 250 items.
p2 <- shop %>%
  dplyr::filter(n_item <= 250) %>%
  ggplot(aes(x = n_item)) +
  geom_histogram(binwidth = 5, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "销售商品数目(<=250件)",
    y = "店铺的数目(家)",
    title = "店铺销售商品数目分布"
  ) +
  theme_bw(base_family = "STKaiti")

grid.arrange(p1, p2, nrow = 2)

## 可以看出在两种销售方式店铺中,主要销售方式为-TB_JISHI,店铺数目占据大部分,
## 但是这种销售规模较小,大多数销售商品数目小于250
## 而对于TB_TMALL,店铺数量不多,但是销售商品数目多于250的均为这种销售方式的店铺

## 商铺销售商品的品牌数目可视化------------------------------------
## Distribution of the number of brands sold per shop.
summary(shop[["brand_number"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.000   1.000   2.000   3.752   5.000  57.000
## How many shops carry more than 20 brands.
sum(shop$brand_number > 20, na.rm = TRUE)
## [1] 99
## The widest assortment covers 57 brands; 99 shops carry more than 20 brands.

## Histograms (bin width 1) of the number of brands per shop, by shop_type.
## p1: all shops.
p1 <- ggplot(shop) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(brand_number), binwidth = 1, color = "firebrick",
                 fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "销售商品品牌数(个)", y = "店铺的数目(家)",
       title = "店铺销售商品品牌数分布")

## p2: zoom on shops with at most 20 brands.
## The subset must be taken on brand_number (the plotted variable, as the
## axis label states); it used to filter on n_item by mistake.
p2 <- ggplot(shop[which(shop$brand_number <= 20), ]) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(brand_number), binwidth = 1, color = "firebrick",
                 fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "销售商品品牌数(<=20个)", y = "店铺的数目(家)",
       title = "店铺销售商品品牌数分布")

grid.arrange(p1, p2, nrow = 2)

## 在销售商品占据的品牌数目上,两种方式的店铺数量的分布大致是相同的

## 店铺的月销售额可视化分析---------------------------------
## Distribution of monthly sales revenue per shop.
summary(shop[["monthly_sales_sum"]])
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##         0         0      1813     83660     14160 111200000
# boxplot(shop$monthly_sales_sum)

## ms1: all shops, x axis labelled in millions.
ms1 <- ggplot(shop, aes(x = monthly_sales_sum)) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e6, "m")) +
  labs(
    x = "月销售额(元)",
    y = "店铺的数目(家)",
    title = "店铺月销售额分布"
  ) +
  theme_bw(base_family = "STKaiti")

## ms2: zoom on shops with monthly revenue of at most 10000.
ms2 <- ggplot(
  shop[shop$monthly_sales_sum <= 10000, ],
  aes(x = monthly_sales_sum)
) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "月销售额(<=1e4元)",
    y = "店铺的数目(家)",
    title = "店铺月销售额分布"
  ) +
  theme_bw(base_family = "STKaiti")

grid.arrange(ms1, ms2, nrow = 2)

## 可以看出 销售方式为TB_TMALL的店铺的月销售额更大,月销售额有超过90,000,000元的
## 只有很少一部分店铺的月销售额少于10000元
##  销售方式为TB_JISHI 的店铺,月销售额大多数不超过1000元

## 店铺的商品月销量数据可视化---------------------------------
## Distribution of monthly sales volume (units sold) per shop.
summary(shop[["monthly_shop_sum"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       0      11     423      75  402200
## Share of shops selling at most 1000 units per month.
nrow(shop[shop$monthly_shop_sum <= 1000, ]) / nrow(shop)
## [1] 0.9624758
## i.e. about 96% of the shops.
summary(shop$monthly_shop_sum[shop$monthly_shop_sum <= 1000])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00    9.00   65.24   60.00 1000.00

## mss1: all shops, x axis labelled in units of 10,000.
mss1 <- ggplot(shop, aes(x = monthly_shop_sum)) +
  geom_histogram(bins = 200, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e4, "万")) +
  labs(
    x = "商品月销量(个)",
    y = "店铺的数目(家)",
    title = "店铺商品月销量分布"
  ) +
  theme_bw(base_family = "STKaiti")

## mss2: zoom on shops with at most 1000 units per month.
mss2 <- ggplot(
  shop[shop$monthly_shop_sum <= 1000, ],
  aes(x = monthly_shop_sum)
) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "商品月销量(<=1e3个)",
    y = "店铺的数目(家)",
    title = "店铺商品月销量分布"
  ) +
  theme_bw(base_family = "STKaiti")

grid.arrange(mss1, mss2, nrow = 2)

## 可以发现整体的趋势是相同的

## 对店铺的评价总数数据可视化---------------------------------
## Distribution of the total number of reviews per shop.
summary(shop[["assessment_sum"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0       1      18    1825     134 1384000
## Share of shops with at most 2000 reviews.
nrow(shop[shop$assessment_sum <= 2000, ]) / nrow(shop)
## [1] 0.9474528
## i.e. about 94.7% of the shops.
summary(shop$assessment_sum[shop$assessment_sum <= 2000])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     1.0    15.0   127.8    98.0  2000.0

## as1: all shops, x axis labelled in units of 10,000.
as1 <- ggplot(shop, aes(x = assessment_sum)) +
  geom_histogram(bins = 200, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e4, "万")) +
  labs(
    x = "评价总数目(条)",
    y = "店铺的数目(家)",
    title = "店铺评价总数目分布"
  ) +
  theme_bw(base_family = "STKaiti")

## as2: zoom on shops with at most 2000 reviews.
as2 <- ggplot(
  shop[shop$assessment_sum <= 2000, ],
  aes(x = assessment_sum)
) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "评价总数目(<=2e3条)",
    y = "店铺的数目(家)",
    title = "店铺评价总数目分布"
  ) +
  theme_bw(base_family = "STKaiti")

grid.arrange(as1, as2, nrow = 2)

## 可以发现整体的趋势是相同的


## 店铺商品的平均售价数据可视化---------------------------------
## Distribution of the mean selling price per shop.
summary(shop$price_mean)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0   116.4   219.9   258.9 14010.0
## How many shops have a mean selling price above 3000.
length(which(shop$price_mean > 3000))
## [1] 38
## 38 shops have a mean selling price above 3000.
## `shop` already carries the shop_dsr columns (shop_name, mas, sas, cas)
## at this point, so a plain subset is enough; joining shop_dsr again would
## only duplicate those columns with .x/.y suffixes.
mean_price_if <- shop[which(shop$price_mean > 3000), ]
# as.data.frame(mean_price_if)

## Share of shops whose mean selling price is at most 3000.
nrow(shop[shop$price_mean <= 3000, ]) / nrow(shop)
## [1] 0.9974628
## i.e. about 99.7% of the shops.
summary(shop$price_mean[shop$price_mean <= 3000])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0     0.0   115.9   206.9   258.0  2944.0

## pm1: all shops, x axis labelled in thousands, shared x axis across facets.
pm1 <- ggplot(shop, aes(x = price_mean)) +
  geom_histogram(bins = 200, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free_y") +
  scale_x_continuous(labels = function(x) paste0(x / 1e3, "k")) +
  labs(
    x = "商品平均售价(元)",
    y = "店铺的数目(家)",
    title = "店铺商品平均售价分布"
  ) +
  theme_bw(base_family = "STKaiti")

## pm2: zoom on shops with a mean selling price of at most 3000.
pm2 <- ggplot(
  shop[shop$price_mean <= 3000, ],
  aes(x = price_mean)
) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free_y") +
  labs(
    x = "商品平均售价(<=3e3元)",
    y = "店铺的数目(家)",
    title = "店铺商品平均售价分布"
  ) +
  theme_bw(base_family = "STKaiti")

grid.arrange(pm1, pm2, nrow = 2)

## 在两种类型商铺的商品平均售价上,可以发现两类商铺的分布几乎是一样的,
## 并不能说明哪种类型的店铺出售的商品更高档

## 对商店商品的平均标价 进行可视化分析---------------------------
## Distribution of the mean tag (list) price per shop.
summary(shop$tag_price_mean)
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
##       0.0     163.0     279.1     657.6     517.8 2500000.0
length(which(shop$tag_price_mean > 3000))
## [1] 148
length(which(shop$tag_price_mean > 100000))
## [1] 3
## 148 shops have a mean tag price above 3000; 3 of them exceed 100000.
## `shop` already carries the shop_dsr columns at this point, so a plain
## subset is enough; joining shop_dsr again would only duplicate shop_name,
## mas, sas and cas with .x/.y suffixes.
mean_price_if <- shop[which(shop$tag_price_mean > 100000), ]
as.data.frame(mean_price_if)
## Expected output (same rows as before, without the duplicated columns):
##     shop_id n_item brand_number monthly_shop_sum monthly_sales_sum
## 1  35545828     40           10                6          1069.667
## 2 118193439      2            2                0             0.000
## 3 151623813      5            4                0             0.000
##   assessment_sum tag_price_mean shop_type price_mean            shop_name
## 1             13      2500303.0  TB_JISHI   178.2778 北京斯托克司户外装备
## 2              0       100084.0  TB_JISHI     0.0000         渣霄虎子的店
## 3              0       191333.2  TB_JISHI     0.0000           独自的华丽
##    mas  sas  cas
## 1 4.74 4.95 4.95
## 2 5.00 5.00 5.00
## 3 0.00 0.00 0.00
## 这些店铺的类型均是TB_JISHI,并且月销售额要么没有要么很低
## 说明这些商品的标价很高,但是并没有人去购买,说明这些商品是博人眼球的物品


## Share of shops whose mean tag price is at most 3000.
nrow(shop[shop$tag_price_mean <= 3000, ]) / nrow(shop)
## [1] 0.9901182
## i.e. about 99.0% of the shops.
summary(shop$tag_price_mean[shop$tag_price_mean <= 3000])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.01  162.00  276.40  410.00  503.00 2999.00

## pm1: shops strictly between 3000 and 100000, x axis labelled in thousands.
pm1 <- ggplot(
  shop[shop$tag_price_mean > 3000 & shop$tag_price_mean < 100000, ],
  aes(x = tag_price_mean)
) +
  geom_histogram(bins = 200, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e3, "k")) +
  labs(
    x = "商品平均标价(>3e3&<1e5元)",
    y = "店铺的数目(家)",
    title = "店铺商品平均标价分布"
  ) +
  theme_bw(base_family = "STKaiti")

## pm2: shops with a mean tag price of at most 3000 (shared x axis).
pm2 <- ggplot(
  shop[shop$tag_price_mean <= 3000, ],
  aes(x = tag_price_mean)
) +
  geom_histogram(bins = 100, colour = "firebrick", fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free_y") +
  labs(
    x = "商品平均标价(<=3e3元)",
    y = "店铺的数目(家)",
    title = "店铺商品平均标价分布"
  ) +
  theme_bw(base_family = "STKaiti")

grid.arrange(pm1, pm2, nrow = 2)

## 可以看出两类商店的平均标价的分布存在很明显的差异
## 在高价区,TB_JISHI的商品平均标价较高
## 在低价区,TB_TMALL的商铺比较集中于商品平均标价高的区域


## 对商铺数据进行平行坐标图可视化####--------------------------------
## Axis labels for the ten numeric columns now in `shop`
## (positions 2:7, 9 and the three score columns 11:13).
lab_x <- c(
  "销售商品数", "销售品牌数", "月销量", "月销售额", "评论数",
  "平均标价", "平均售价", "mas", "sas", "cas"
)

## Parallel-coordinate plots grouped by column 8 (shop_type), drawn three
## times with different per-variable scalings.

## scale = "std": subtract the mean, divide by the standard deviation.
ggparcoord(
  data = shop, columns = c(2:7, 9, 11:13), groupColumn = 8, scale = "std"
) +
  scale_x_discrete(labels = lab_x) +
  xlab("") +
  ylab("标准化后数值") +
  ggtitle("商铺平行坐标图") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top")

## scale = "robust": subtract the median, divide by the median absolute
## deviation (MAD).
ggparcoord(
  data = shop, columns = c(2:7, 9, 11:13), groupColumn = 8, scale = "robust"
) +
  scale_x_discrete(labels = lab_x) +
  xlab("") +
  ylab("Robust后数值") +
  ggtitle("商铺平行坐标图") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top")

## scale = "uniminmax": rescale every variable to the unit interval [0, 1].
ggparcoord(
  data = shop, columns = c(2:7, 9, 11:13), groupColumn = 8, scale = "uniminmax"
) +
  scale_x_discrete(labels = lab_x) +
  xlab("") +
  ylab("单位区间数值") +
  ggtitle("商铺平行坐标图") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top")

## 从平行坐标图上我们可以看出两类店铺的差异
## 1: TB_TMALL 的销售商品的数目更多,月销量更高,月销售额更高,评论数目更多
## 2:TB_JISHI  的销售品牌数更多,平均标价更高,平均售价更高


## 分析商铺数据中的数据的相关系数####---------------------------
## 查看散点图
## Scatter-plot matrix of the ten numeric shop columns, coloured by
## shop_type, with Pearson correlations in the upper panels.
shop %>%
  as.data.frame() %>%
  ggscatmat(
    columns = c(2:7, 9, 11:13),
    color = "shop_type",
    corMethod = "pearson"
  ) +
  ggtitle("散点图矩阵") +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "top")

## 从散点图和相关系数矩阵中可以看出,不同的类别销售方式相关性有差异

## 将数据标准化,然后查看相关系数和散点矩阵图--------------------------------
## z-score the ten numeric shop columns (scale() centres and scales each
## column), then re-attach the two identifier columns.
shop_std <- tbl_df(as.data.frame(scale(shop[, c(2:7, 9, 11:13)])))
shop_std[["shop_type"]] <- shop[["shop_type"]]
shop_std[["shop_id"]] <- shop[["shop_id"]]   # standardized shop table
head(shop_std)
## # A tibble: 6 × 12
##      n_item brand_number monthly_shop_sum monthly_sales_sum assessment_sum
##       <dbl>        <dbl>            <dbl>             <dbl>          <dbl>
## 1  5.853689   -0.6916237         40.54370          78.28417       21.18327
## 2 32.093174   -0.6916237         53.15048          61.71636       16.48549
## 3  8.305978   -0.4402661         63.15006          42.49305       56.46880
## 4  4.895586   -0.6916237         41.37808          29.97122       27.73343
## 5  2.323535   -0.6916237         12.42134          24.53148       20.67130
## 6  7.701460   -0.6916237         21.90573          13.67056       40.78715
## # ... with 7 more variables: tag_price_mean <dbl>, price_mean <dbl>,
## #   mas <dbl>, sas <dbl>, cas <dbl>, shop_type <chr>, shop_id <chr>
summary(object = shop_std)
##      n_item          brand_number     monthly_shop_sum  
##  Min.   :-0.24852   Min.   :-0.6916   Min.   :-0.06647  
##  1st Qu.:-0.23141   1st Qu.:-0.6916   1st Qu.:-0.06647  
##  Median :-0.15157   Median :-0.4403   Median :-0.06474  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.00000  
##  3rd Qu.: 0.04804   3rd Qu.: 0.3138   3rd Qu.:-0.05469  
##  Max.   :32.09317   Max.   :13.3844   Max.   :63.15006  
##  monthly_sales_sum  assessment_sum     tag_price_mean     
##  Min.   :-0.05894   Min.   :-0.07454   Min.   : -0.03204  
##  1st Qu.:-0.05894   1st Qu.:-0.07450   1st Qu.: -0.02409  
##  Median :-0.05766   Median :-0.07381   Median : -0.01844  
##  Mean   : 0.00000   Mean   : 0.00000   Mean   :  0.00000  
##  3rd Qu.:-0.04896   3rd Qu.:-0.06907   3rd Qu.: -0.00681  
##  Max.   :78.28417   Max.   :56.46880   Max.   :121.78834  
##    price_mean            mas               sas               cas         
##  Min.   :-0.52265   Min.   :-2.6215   Min.   :-2.6234   Min.   :-2.6212  
##  1st Qu.:-0.52265   1st Qu.: 0.2954   1st Qu.: 0.3039   1st Qu.: 0.2955  
##  Median :-0.24602   Median : 0.3883   Median : 0.3904   Median : 0.3884  
##  Mean   : 0.00000   Mean   : 0.0000   Mean   : 0.0000   Mean   : 0.0000  
##  3rd Qu.: 0.09251   3rd Qu.: 0.4440   3rd Qu.: 0.4398   3rd Qu.: 0.4441  
##  Max.   :32.77893   Max.   : 0.4750   Max.   : 0.4645   Max.   : 0.4750  
##   shop_type           shop_id         
##  Length:14977       Length:14977      
##  Class :character   Class :character  
##  Mode  :character   Mode  :character  
##                                       
##                                       
## 
## 查看散点图
## Scatter-plot matrix of the standardized data, coloured by shop_type.
## shop_std now holds ten numeric columns (mas, sas and cas were added), so
## all of columns 1:10 are plotted, matching the matrix drawn on the raw
## data; 1:7 was left over from the seven-column version and silently
## dropped the three score columns.
ggscatmat(data = as.data.frame(shop_std), columns = 1:10,
          color = "shop_type", corMethod = "pearson") +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "top") +
  ggtitle("散点图矩阵")

## 查看店铺的累积月销售分布####--------------------------------
## Total monthly sales revenue over all shops.
month_all <- sum(shop$monthly_sales_sum)
print(month_all)  ## about 1.25 billion yuan
## [1] 1253026233
## Cumulative revenue share; `shop` is sorted by monthly_sales_sum in
## descending order, so position i is the share of the top-i shops.
month_pro <- cumsum(shop$monthly_sales_sum / month_all)

## Boundaries marked on the curve.
## The zero-sales boundary is counted directly from the sales column:
## comparing the floating-point cumulative sum against exactly 1 is not
## reliable, because rounding can leave the final value slightly off 1.
n_with_sales <- sum(shop$monthly_sales_sum > 0)
## Number of top shops that together stay within 90% of total revenue.
n_top90 <- sum(month_pro <= 0.9)

ggplot() +
  theme_grey(base_family = "STKaiti") +
  geom_line(aes(x = seq_along(month_pro), y = month_pro)) +
  geom_vline(xintercept = n_with_sales, color = "red") +
  geom_vline(xintercept = n_top90, color = "red") +
  scale_x_continuous(breaks = seq(1, length(month_pro), by = 1000)) +
  labs(x = "店铺数目", y = "累积百分比", title = "店铺月销售额") +
  ## Fixed text labels; the x offsets keep them clear of the red lines.
  annotate("text", x = n_with_sales - 1000, y = 0.5,
           label = "月销售=0", family = "STKaiti") +
  annotate("text", x = n_top90 - 600, y = 0.5,
           label = "月销售\n占所有\n90%", family = "STKaiti") +
  annotate("text", x = n_top90 + 3000, y = 0.5,
           label = "月销售占所有10%", family = "STKaiti")

## 绘制直方图查看销量售额的分布
## Cumulative revenue share per shop together with its shop type.
data1 <- data.frame(month_pro = month_pro, shop_type = shop$shop_type)
## Keep only shops with non-zero monthly sales. The filter uses the sales
## column itself, because testing the floating-point cumulative share
## against exactly 1 is unreliable.
data1 <- data1[shop$monthly_sales_sum > 0, ]
## One bar per remaining shop (x is the shop's rank), outlined by shop type.
## seq_along() is used so an empty data1 does not produce the sequence 1, 0.
ggplot(data1) + theme_bw(base_family = "STKaiti") +
  geom_bar(aes(x = seq_along(month_pro), y = month_pro,
               color = shop_type), stat = "identity", width = 1) +
  theme(legend.position = "top") +
  labs(x = "店铺数目", y = "累积百分比", title = "店铺月销售额")

## 可以发现虽然TB_TMALL类型的店铺数量少,但是对总体月销售额的贡献却占据主体部分