## 2016年厦门大数据大赛
## 第二题:基于大数据的商品销售预测及关联销售挖掘
## author:Daitu
## 2016-7-12
## 工作:读取预处理后的数据进行探索分析;数据可视化
## Set the working directory ------------------------------------
# NOTE(review): hard-coded, machine-specific absolute path; the relative data
# paths passed to load()/fread() further down are resolved against it.
setwd("/Users/daitu/数据分析/2016ABD")
getwd()
## [1] "/Users/daitu/数据分析/2016ABD"
# Remove every object from the global environment, then run garbage collection.
rm(list = ls());gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 367078 19.7 592000 31.7 460000 24.6
## Vcells 562803 4.3 1023718 7.9 786371 6.0
## 加载所需要的包-----------------------------------
library(stringr)
library(data.table)
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(VIM)
## Loading required package: colorspace
## Loading required package: grid
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(GGally)
##
## Attaching package: 'GGally'
## The following object is masked from 'package:dplyr':
##
## nasa
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(corrplot)
## 第一步:读取数据####-------------------------------------------
# item_id 每一个商品链接的独有的ID ----字符串-------
# item_number 商品的款号 ----字符串-------
# shop_id 销售这件商品的店铺ID ----字符串-------
# shop_type 销售这件商品的店铺类型,分为TB_TMALL与TB_JISHI两种 ----字符串-------
# brand_name 商品的品牌名 ----字符串-------
# item_name 商品名(商品标题) ----字符串-------
# price 该商品的销售价格(此处对各sku取均值)
# tag_price 商品的标签价
# monthly_sales_num 商品月销量
# assessment_num 商品评价数
# monthly_sales 月销售额 -----价格乘以销量-----
## Item-level sales data
# Restores the objects saved in the .RData file; the code below uses `item_fact`.
load("第二题数据/item_fact.RData")
head(item_fact)
## # A tibble: 6 × 11
## item_id item_number shop_id shop_type brand_name
## <chr> <chr> <chr> <chr> <chr>
## 1 10003087358 UB 64971284 TB_JISHI umbro
## 2 10005342950 2011030511534230 61051459 TB_JISHI 美津浓
## 3 10006292251 467046-401 64767888 TB_JISHI nike
## 4 10010133546 YKL40 65507365 TB_TMALL 骐煌
## 5 10013604183 AWDK288 62687027 TB_TMALL 李宁
## 6 10014801651 7X50 35714901 TB_JISHI yukon
## # ... with 6 more variables: item_name <chr>, price <dbl>,
## # tag_price <dbl>, monthly_sales_num <dbl>, assessment_num <dbl>,
## # monthly_sales <dbl>
summary(item_fact)
## item_id item_number shop_id
## Length:947276 Length:947276 Length:947276
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## shop_type brand_name item_name
## Length:947276 Length:947276 Length:947276
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## price tag_price monthly_sales_num
## Min. :0.00e+00 Min. :0.00e+00 Min. :0.00e+00
## 1st Qu.:1.31e+02 1st Qu.:1.68e+02 1st Qu.:0.00e+00
## Median :2.14e+02 Median :2.88e+02 Median :0.00e+00
## Mean :5.16e+02 Mean :5.83e+02 Mean :8.29e+00
## 3rd Qu.:4.15e+02 3rd Qu.:4.99e+02 3rd Qu.:1.00e+00
## Max. :1.00e+08 Max. :1.00e+08 Max. :2.48e+05
## assessment_num monthly_sales
## Min. : -1.0 Min. : 0
## 1st Qu.: 0.0 1st Qu.: 0
## Median : 0.0 Median : 0
## Mean : 34.7 Mean : 1652
## 3rd Qu.: 3.0 3rd Qu.: 225
## Max. :891523.0 Max. :4678400
# as.data.frame(item_fact[item_fact$price > 30000 ,])
# ## 销售前500数据
# load("第二题数据/top500df.RData")
# head(top500df)
## Read the shop information (DSR score) table ####
# Tab-separated file with a header row. The first two columns are read as
# character and the remaining three as numeric.
filename4 <- "第二题数据/shop_dsr."
shop_dsr <- fread(
  filename4,
  sep = "\t",
  header = TRUE,
  colClasses = c(rep("character", 2), rep("numeric", 3))
)
head(shop_dsr)
## shop_id shop_name mas sas cas
## 1: 126042966 李宁跑酷专卖店 4.81 4.76 4.73
## 2: 148002147 lining李宁新兴专卖店 4.88 4.83 4.82
## 3: 128369255 李宁羽毛球拍专业店 4.86 4.88 4.85
## 4: 145412890 李宁天泽华盛专卖店 4.90 4.86 4.86
## 5: 145270296 lining李宁图漫专卖店 4.89 4.87 4.82
## 6: 108228574 李宁力方力合专卖店 4.90 4.85 4.83
dim(shop_dsr) # only 19190 shops are listed
## [1] 19190 5
# Convert the data.table returned by fread() into a tibble.
shop_dsr <- tbl_df(shop_dsr)
## Aggregate the item-level data to one row per shop -------------------------
# NOTE: the column order produced here matters, because later code selects
# columns by position (e.g. c(2:7, 9) and column 8 for shop_type).
shop <- item_fact %>%
  # one group per shop ID
  dplyr::group_by(shop_id) %>%
  dplyr::summarise(
    # number of distinct items listed by the shop
    n_item = n_distinct(item_id),
    # number of distinct brands sold by the shop
    brand_number = n_distinct(brand_name),
    # total monthly sales volume of the shop
    monthly_shop_sum = sum(monthly_sales_num),
    # total monthly sales revenue of the shop
    monthly_sales_sum = sum(monthly_sales),
    # total number of reviews of the shop
    assessment_sum = sum(assessment_num),
    # mean tag price over the shop's items
    tag_price_mean = mean(tag_price),
    # shop type (TB_TMALL / TB_JISHI)
    shop_type = unique(shop_type)
  ) %>%
  # average selling price = revenue / volume (0/0 when nothing was sold)
  dplyr::mutate(price_mean = monthly_sales_sum / monthly_shop_sum)
dim(shop) # 23531 shops
## [1] 23531 9
summary(shop)
## shop_id n_item brand_number monthly_shop_sum
## Length:23531 Min. : 1.00 Min. : 1.000 Min. : 0.0
## Class :character 1st Qu.: 4.00 1st Qu.: 1.000 1st Qu.: 0.0
## Mode :character Median : 16.00 Median : 2.000 Median : 11.0
## Mean : 40.26 Mean : 3.632 Mean : 333.8
## 3rd Qu.: 51.00 3rd Qu.: 5.000 3rd Qu.: 68.0
## Max. :5672.00 Max. :57.000 Max. :402232.0
##
## monthly_sales_sum assessment_sum tag_price_mean
## Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 0 1st Qu.: 1 1st Qu.: 168.0
## Median : 2034 Median : 20 Median : 285.0
## Mean : 66519 Mean : 1396 Mean : 599.2
## 3rd Qu.: 14398 3rd Qu.: 125 3rd Qu.: 532.5
## Max. :111205902 Max. :1384268 Max. :2500303.0
##
## shop_type price_mean
## Length:23531 Min. : 0.0
## Class :character 1st Qu.: 105.2
## Mode :character Median : 184.7
## Mean : 325.2
## 3rd Qu.: 375.1
## Max. :14014.3
## NA's :5989
## Shops with zero monthly sales volume yield 0/0 for the average selling
## price, which shows up as missing; set the average selling price to 0 there.
shop$price_mean <- replace(shop$price_mean, is.na(shop$price_mean), 0)
summary(shop)
## shop_id n_item brand_number monthly_shop_sum
## Length:23531 Min. : 1.00 Min. : 1.000 Min. : 0.0
## Class :character 1st Qu.: 4.00 1st Qu.: 1.000 1st Qu.: 0.0
## Mode :character Median : 16.00 Median : 2.000 Median : 11.0
## Mean : 40.26 Mean : 3.632 Mean : 333.8
## 3rd Qu.: 51.00 3rd Qu.: 5.000 3rd Qu.: 68.0
## Max. :5672.00 Max. :57.000 Max. :402232.0
## monthly_sales_sum assessment_sum tag_price_mean
## Min. : 0 Min. : 0 Min. : 0.0
## 1st Qu.: 0 1st Qu.: 1 1st Qu.: 168.0
## Median : 2034 Median : 20 Median : 285.0
## Mean : 66519 Mean : 1396 Mean : 599.2
## 3rd Qu.: 14398 3rd Qu.: 125 3rd Qu.: 532.5
## Max. :111205902 Max. :1384268 Max. :2500303.0
## shop_type price_mean
## Length:23531 Min. : 0.0
## Class :character 1st Qu.: 0.0
## Mode :character Median : 128.2
## Mean : 242.5
## 3rd Qu.: 288.0
## Max. :14014.3
## Before joining shop_dsr and shop: count the shops of `shop` found in `shop_dsr`
length(which(shop$shop_id %in% shop_dsr$shop_id))
## [1] 14977
## 我们可以发现这两个数据集中,只有14977个店铺是相同的,其他则是不同的
# aa <- left_join(shop_dsr,shop,by = "shop_id")
## Visual exploration of the shop-level data
## Compare the two shop types -------------------------
# absolute counts per shop type
table(shop$shop_type)
##
## TB_JISHI TB_TMALL
## 22833 698
# share of each shop type among all shops
table(shop$shop_type) / nrow(shop)
##
## TB_JISHI TB_TMALL
## 0.970337 0.029663
## 可以发现销售方式为TB_JISHI占据百分比大于97%
## 销售方式为TB_TMALL 占据百分比不到3%
## Number of items sold per shop ---------------------------------
summary(shop$n_item)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 4.00 16.00 40.26 51.00 5672.00
sum(shop$n_item > 1000, na.rm = TRUE)
## [1] 64
## One shop sells 5672 items, and 64 shops sell more than 1000 items.
## Inspect those shops:
# data.frame(shop[which(shop$n_item > 1000),])
# p1: all shops, one facet per shop type.
p1 <- ggplot(shop, aes(x = n_item)) +
  geom_histogram(
    binwidth = 5, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺销售商品数目分布",
    x = "销售商品数目(件)",
    y = "店铺的数目(家)"
  )
# p2: zoom in on shops with at most 250 items.
p2 <- ggplot(dplyr::filter(shop, n_item <= 250), aes(x = n_item)) +
  geom_histogram(
    binwidth = 5, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺销售商品数目分布",
    x = "销售商品数目(<=250件)",
    y = "店铺的数目(家)"
  )
grid.arrange(p1, p2, nrow = 2)

## 可以看出在两种销售方式店铺中,主要销售方式为-TB_JISHI,店铺数目占据大部分,
## 但是这种销售规模较小,大多数销售商品数目小于250
## 而对于TB_TMALL,店铺数量不多,但是销售商品数目多于250的均为这种销售方式的店铺
## Number of brands sold per shop ------------------------------------
summary(shop$brand_number)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 3.632 5.000 57.000
length(which(shop$brand_number > 20))
## [1] 116
## The largest brand count of any shop is 57; 116 shops sell more than 20 brands.
# p1: all shops, one facet per shop type.
p1 <- ggplot(shop) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(brand_number), binwidth = 1, color = "firebrick",
                 fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "销售商品品牌数(个)", y = "店铺的数目(家)",
       title = "店铺销售商品品牌数分布") +
  theme(plot.title = element_text(hjust = 0.5))
# p2: zoom in on shops with at most 20 brands.
# Fix: the subset used to be taken on n_item (<= 20 items), which did not
# match the plotted variable or the axis label; it is now taken on
# brand_number.
p2 <- ggplot(shop[which(shop$brand_number <= 20), ]) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(brand_number), binwidth = 1, color = "firebrick",
                 fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "销售商品品牌数(<=20个)", y = "店铺的数目(家)",
       title = "店铺销售商品品牌数分布") +
  theme(plot.title = element_text(hjust = 0.5))
grid.arrange(p1, p2, nrow = 2)

## 在销售商品占据的品牌数目上,两种方式的店铺数量的分布大致是相同的
## Monthly sales revenue per shop ---------------------------------
summary(shop$monthly_sales_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 2034 66520 14400 111200000
# boxplot(shop$monthly_sales_sum)
# ms1: all shops; x axis labelled in millions ("m").
ms1 <- ggplot(shop, aes(x = monthly_sales_sum)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e6, "m")) +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺月销售额分布",
    x = "月销售额(元)",
    y = "店铺的数目(家)"
  )
# ms2: zoom in on shops with monthly revenue of at most 10000.
ms2 <- ggplot(shop[shop$monthly_sales_sum <= 10000, ],
              aes(x = monthly_sales_sum)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  #scale_x_continuous(labels = function(x) paste(x/10e3,"k",sep = "")) +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺月销售额分布",
    x = "月销售额(<=1e4元)",
    y = "店铺的数目(家)"
  )
grid.arrange(ms1, ms2, nrow = 2)

## 可以看出 销售方式为TB_TMALL的店铺的月销售额更大,月销售额有超过90,000,000元的
## 只有很少一部分店铺的月销售额少于10000元
## 销售方式为TB_JISHI 的店铺,月销售额大多数不超过1000元
## Monthly sales volume per shop ---------------------------------
summary(shop$monthly_shop_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 11.0 333.8 68.0 402200.0
nrow(shop[shop$monthly_shop_sum <= 1000, ]) / nrow(shop)
## [1] 0.9674897
## About 96.7% of the shops sell at most 1000 items per month
summary(shop[shop$monthly_shop_sum <= 1000, ]$monthly_shop_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 10.00 62.43 57.00 1000.00
# mss1: all shops; x axis labelled in units of 1e4 ("万").
mss1 <- ggplot(shop, aes(x = monthly_shop_sum)) +
  geom_histogram(
    bins = 200, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e4, "万")) +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺商品月销量分布",
    x = "商品月销量(个)",
    y = "店铺的数目(家)"
  )
# mss1
# mss2: zoom in on shops with a monthly volume of at most 1000.
mss2 <- ggplot(shop[shop$monthly_shop_sum <= 1000, ],
               aes(x = monthly_shop_sum)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  #scale_x_continuous(labels = function(x) paste(x/10e3,"k",sep = "")) +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺商品月销量分布",
    x = "商品月销量(<=1e3个)",
    y = "店铺的数目(家)"
  )
# mss2
grid.arrange(mss1, mss2, nrow = 2)

## 可以发现整体的趋势是相同的
## Total number of reviews per shop ---------------------------------
summary(shop$assessment_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1 20 1396 125 1384000
nrow(shop[shop$assessment_sum <= 2000, ]) / nrow(shop)
## [1] 0.9547406
## About 95.5% of the shops have at most 2000 reviews in total
summary(shop[shop$assessment_sum <= 2000, ]$assessment_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 1.0 16.0 123.8 97.0 2000.0
# as1: all shops; x axis labelled in units of 1e4 ("万").
as1 <- ggplot(shop, aes(x = assessment_sum)) +
  geom_histogram(
    bins = 200, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e4, "万")) +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺评价总数目分布",
    x = "评价总数目(条)",
    y = "店铺的数目(家)"
  )
# as1
# as2: zoom in on shops with at most 2000 reviews.
as2 <- ggplot(shop[shop$assessment_sum <= 2000, ],
              aes(x = assessment_sum)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  #scale_x_continuous(labels = function(x) paste(x/10e3,"k",sep = "")) +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺评价总数目分布",
    x = "评价总数目(<=2e3条)",
    y = "店铺的数目(家)"
  )
# as2
grid.arrange(as1, as2, nrow = 2)

## 可以发现整体的趋势是相同的
## Average selling price per shop ---------------------------------
summary(shop$price_mean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 128.2 242.5 288.0 14010.0
sum(shop$price_mean > 3000, na.rm = TRUE)
## [1] 64
## 64 shops have an average selling price above 3000
# Look up the DSR information of those shops (NA where shop_dsr has no match).
mean_price_if <- left_join(
  dplyr::filter(shop, price_mean > 3000),
  shop_dsr,
  by = "shop_id"
)
as.data.frame(mean_price_if)
## shop_id n_item brand_number monthly_shop_sum monthly_sales_sum
## 1 105150986 26 10 2 6874.000
## 2 106491637 4 2 57 285000.000
## 3 106709126 6 2 2 6998.000
## 4 106994378 30 5 1 5000.000
## 5 108378081 26 8 11 41400.000
## 6 108706114 56 8 2 14331.333
## 7 109124350 65 5 29 96333.158
## 8 110449499 48 1 883 3185100.000
## 9 110763136 37 3 38 316454.246
## 10 112247087 17 11 17 72933.857
## 11 112289638 12 3 5 28548.867
## 12 113345819 3 1 30 111770.000
## 13 113728768 2 2 15 58485.000
## 14 113735217 17 8 60 542080.000
## 15 115441901 64 8 35 114404.893
## 16 115638306 44 3 131 507480.158
## 17 117441510 29 13 15 54390.000
## 18 118604216 28 8 1 3200.000
## 19 118747986 27 3 4 13055.636
## 20 119122395 7 2 15 70176.263
## 21 120030201 11 6 1 4399.600
## 22 120864435 60 6 2 18213.286
## 23 121506705 1 1 3 13497.000
## 24 121701682 44 6 4 18228.042
## 25 122361611 1 1 1 4999.000
## 26 122996061 40 6 1 3599.000
## 27 125235924 53 5 11 48085.500
## 28 128499466 12 9 25 91301.308
## 29 128800353 8 6 8 53548.000
## 30 135688535 18 13 13 55136.000
## 31 135748733 65 3 44 144321.667
## 32 144458316 2 2 1 4263.000
## 33 145460561 1 1 13 55900.000
## 34 145938867 4 3 7 24857.000
## 35 148072553 54 9 55 509461.000
## 36 148679224 1 1 21 83979.000
## 37 149529472 63 13 55 172098.000
## 38 155007283 4 1 70 283930.000
## 39 156741993 9 5 2 6088.500
## 40 162646920 12 1 123 372114.000
## 41 33205199 49 4 1 3999.000
## 42 33724331 5 4 27 131136.000
## 43 35951771 16 4 1 5449.500
## 44 36325984 8 2 414 1318640.000
## 45 36405665 5 1 39 182938.000
## 46 58288287 3 1 34 102566.000
## 47 59240636 67 5 6 25146.933
## 48 59404220 26 10 1 13800.000
## 49 59643312 1 1 1 4300.000
## 50 60414631 59 7 15 88218.000
## 51 60654322 2 1 1 4955.000
## 52 61402211 11 2 1006 4174423.500
## 53 61470019 26 6 5 23418.333
## 54 62040425 7 3 6 27655.000
## 55 62292177 52 19 7 98100.000
## 56 63346840 22 6 63 293124.000
## 57 64604098 3 2 1 3613.846
## 58 65625754 46 4 5 49995.000
## 59 66195737 43 9 1 4230.000
## 60 69320330 66 6 3 19799.000
## 61 70339413 67 5 2 6380.000
## 62 71891736 19 5 5 17895.000
## 63 72633459 1 1 2 13800.000
## 64 72917227 4 3 3 16734.000
## assessment_sum tag_price_mean shop_type price_mean
## 1 0 982.8077 TB_JISHI 3437.000
## 2 286 4418.9975 TB_JISHI 5000.000
## 3 6 2909.0000 TB_JISHI 3499.000
## 4 1 6849.8333 TB_JISHI 5000.000
## 5 61 3427.4423 TB_JISHI 3763.636
## 6 21 1667.0536 TB_JISHI 7165.667
## 7 88 3075.4231 TB_JISHI 3321.833
## 8 3285 4521.2500 TB_TMALL 3607.135
## 9 249 4107.9459 TB_JISHI 8327.743
## 10 33 3328.4412 TB_JISHI 4290.227
## 11 5 4641.8292 TB_JISHI 5709.773
## 12 80 5665.6667 TB_TMALL 3725.667
## 13 2 2014.0000 TB_JISHI 3899.000
## 14 234 5000.5882 TB_JISHI 9034.667
## 15 90 4740.1562 TB_JISHI 3268.711
## 16 596 3381.4432 TB_JISHI 3873.894
## 17 27 4994.9655 TB_JISHI 3626.000
## 18 1 1595.7857 TB_JISHI 3200.000
## 19 19 2576.8037 TB_JISHI 3263.909
## 20 72 3665.8571 TB_JISHI 4678.418
## 21 4 4102.3636 TB_JISHI 4399.600
## 22 11 1718.0667 TB_JISHI 9106.643
## 23 3 8999.0000 TB_JISHI 4499.000
## 24 31 2450.7045 TB_JISHI 4557.010
## 25 4 4999.0000 TB_JISHI 4999.000
## 26 5 1736.7625 TB_JISHI 3599.000
## 27 7 6052.4151 TB_JISHI 4371.409
## 28 33 1357.1667 TB_JISHI 3652.052
## 29 32 7555.1875 TB_JISHI 6693.500
## 30 42 1198.1667 TB_JISHI 4241.231
## 31 125 3110.9846 TB_JISHI 3280.038
## 32 2 2161.0000 TB_JISHI 4263.000
## 33 1 4300.0000 TB_JISHI 4300.000
## 34 44 1754.2500 TB_JISHI 3551.000
## 35 134 3807.7500 TB_JISHI 9262.927
## 36 15 3999.0000 TB_JISHI 3999.000
## 37 80 2005.0952 TB_JISHI 3129.055
## 38 69 4149.0000 TB_JISHI 4056.143
## 39 0 746.9222 TB_JISHI 3044.250
## 40 45 2769.2500 TB_JISHI 3025.317
## 41 3 3139.8571 TB_JISHI 3999.000
## 42 100 5021.1000 TB_JISHI 4856.889
## 43 5 1238.1875 TB_JISHI 5449.500
## 44 657 1868.2500 TB_JISHI 3185.121
## 45 422 4603.0000 TB_JISHI 4690.718
## 46 141 3532.3333 TB_JISHI 3016.647
## 47 11 3457.7612 TB_JISHI 4191.156
## 48 0 6242.6923 TB_JISHI 13800.000
## 49 2 4300.0000 TB_JISHI 4300.000
## 50 94 2934.1864 TB_JISHI 5881.200
## 51 6 2817.5000 TB_JISHI 4955.000
## 52 4328 2580.0000 TB_JISHI 4149.526
## 53 39 3603.6154 TB_JISHI 4683.667
## 54 364 3207.4286 TB_JISHI 4609.167
## 55 5 6027.3462 TB_JISHI 14014.286
## 56 298 3419.0909 TB_JISHI 4652.762
## 57 4 1733.8333 TB_JISHI 3613.846
## 58 11 8383.9783 TB_JISHI 9999.000
## 59 1 2144.7442 TB_JISHI 4230.000
## 60 19 2489.3030 TB_JISHI 6599.667
## 61 3 1722.0597 TB_JISHI 3190.000
## 62 27 2462.1579 TB_JISHI 3579.000
## 63 2 6900.0000 TB_JISHI 6900.000
## 64 9 2893.0000 TB_JISHI 5578.000
## shop_name mas sas cas
## 1 <NA> NA NA NA
## 2 捷时达电商部 4.99 4.99 4.98
## 3 <NA> NA NA NA
## 4 儒迪的托拉斯 4.80 4.80 4.80
## 5 天驰高尔夫 4.77 4.80 4.73
## 6 阿言的生活小馆 4.96 4.96 5.00
## 7 <NA> NA NA NA
## 8 suunto颂拓官方旗舰店 4.91 4.87 4.85
## 9 珂珂球鞋鞋柜 4.97 4.97 4.97
## 10 <NA> NA NA NA
## 11 <NA> NA NA NA
## 12 青岛荣信电器专营店 4.94 4.90 4.86
## 13 <NA> NA NA NA
## 14 风度高尔夫 4.99 4.99 4.99
## 15 鞋符号 4.93 4.93 4.95
## 16 一步天堂sneaker 4.98 4.97 4.97
## 17 信天翁商城 4.99 4.93 4.94
## 18 <NA> NA NA NA
## 19 JYZN sports 5.00 5.00 4.97
## 20 <NA> NA NA NA
## 21 艾佛象 5.00 5.00 5.00
## 22 LIP球鞋工作室 4.95 4.95 4.93
## 23 <NA> NA NA NA
## 24 吉吉潮店 5.00 5.00 5.00
## 25 瑞哥南北杂货铺 5.00 5.00 5.00
## 26 <NA> NA NA NA
## 27 暴暴体育 5.00 5.00 5.00
## 28 晴天高尔夫用品商城 4.88 4.89 4.88
## 29 <NA> NA NA NA
## 30 十八洞高尔夫用品特卖店 4.89 4.89 4.88
## 31 彧鞋屋 4.96 4.98 4.95
## 32 <NA> NA NA NA
## 33 <NA> NA NA NA
## 34 <NA> NA NA NA
## 35 <NA> NA NA NA
## 36 海尔商城店铺 4.95 4.96 4.97
## 37 正品高尔夫球用品 4.95 4.97 4.97
## 38 <NA> NA NA NA
## 39 美国耐克篮球正品专柜 5.00 5.00 4.88
## 40 <NA> NA NA NA
## 41 <NA> NA NA NA
## 42 <NA> NA NA NA
## 43 <NA> NA NA NA
## 44 TCL舰旗店 4.94 4.95 4.95
## 45 海尔智能医疗器械商城 4.96 4.96 4.94
## 46 海尔扫地机品牌店 4.93 4.97 4.96
## 47 <NA> NA NA NA
## 48 TP二手高尔夫球具店 4.82 4.82 4.85
## 49 原宿流---個性あふれる!化妆品百货及日本代购代拍 5.00 5.00 5.00
## 50 YUKON568 4.90 4.92 4.96
## 51 辉煌高尔夫旗航店 4.86 4.86 4.80
## 52 跨时代全球购 4.91 4.91 4.85
## 53 诚昊运动白菜店 4.93 4.95 4.93
## 54 Haier海尔生活电器城 4.90 4.94 4.94
## 55 Atomicgolf 阿淘美客高尔夫 4.78 4.78 4.68
## 56 <NA> NA NA NA
## 57 <NA> NA NA NA
## 58 朧月夜sole 5.00 5.00 5.00
## 59 万里百分百诚信店 4.93 4.93 4.93
## 60 nbshop武汉潮铺 5.00 5.00 5.00
## 61 <NA> NA NA NA
## 62 <NA> NA NA NA
## 63 嘉乐高尔夫批发 4.96 4.98 4.98
## 64 <NA> NA NA NA
nrow(shop[shop$price_mean <= 3000, ]) / nrow(shop)
## [1] 0.9972802
## About 99.7% of the shops have an average selling price of at most 3000
summary(shop[shop$price_mean <= 3000, ]$price_mean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 128.0 229.6 285.5 2999.0
# pm1: all shops; x axis labelled in thousands ("k"), shared across facets.
pm1 <- ggplot(shop, aes(x = price_mean)) +
  geom_histogram(
    bins = 200, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free_y") +
  scale_x_continuous(labels = function(x) paste0(x / 1e3, "k")) +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺商品平均售价分布",
    x = "商品平均售价(元)",
    y = "店铺的数目(家)"
  )
# pm1
# pm2: zoom in on shops with an average selling price of at most 3000.
pm2 <- ggplot(shop[shop$price_mean <= 3000, ], aes(x = price_mean)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free_y") +
  #scale_x_continuous(labels = function(x) paste(x/10e3,"k",sep = "")) +
  theme_bw(base_family = "STKaiti") +
  theme(plot.title = element_text(hjust = 0.5)) +
  labs(
    title = "店铺商品平均售价分布",
    x = "商品平均售价(<=3e3元)",
    y = "店铺的数目(家)"
  )
# pm2
grid.arrange(pm1, pm2, nrow = 2)

## 在两种类型店铺的商品平均售价上,可以发现两类店铺的分布几乎是一样的,
## 并不能说明哪种类型的店铺出售的商品更高档
## Average tag price per shop ---------------------------
summary(shop$tag_price_mean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 168.0 285.0 599.2 532.5 2500000.0
sum(shop$tag_price_mean > 3000, na.rm = TRUE)
## [1] 236
sum(shop$tag_price_mean > 100000, na.rm = TRUE)
## [1] 3
## 236 shops have an average tag price above 3000; 3 of them exceed 100000
# Look up the DSR information of the three extreme shops.
mean_price_if <- left_join(
  dplyr::filter(shop, tag_price_mean > 100000),
  shop_dsr,
  by = "shop_id"
)
as.data.frame(mean_price_if)
## shop_id n_item brand_number monthly_shop_sum monthly_sales_sum
## 1 118193439 2 2 0 0.000
## 2 151623813 5 4 0 0.000
## 3 35545828 40 10 6 1069.667
## assessment_sum tag_price_mean shop_type price_mean shop_name
## 1 0 100084.0 TB_JISHI 0.0000 渣霄虎子的店
## 2 0 191333.2 TB_JISHI 0.0000 独自的华丽
## 3 13 2500303.0 TB_JISHI 178.2778 北京斯托克司户外装备
## mas sas cas
## 1 5.00 5.00 5.00
## 2 0.00 0.00 0.00
## 3 4.74 4.95 4.95
## 这些店铺均是TB_JISHI,并且月销售额要么没有要么很低
## 说明这些商品的标价很高,但是并没有人去购买,说明这些商品是博人眼球的物品
nrow(shop[shop$tag_price_mean <= 3000, ]) / nrow(shop)
## [1] 0.9899707
## About 99.0% of the shops have an average tag price of at most 3000
summary(shop[shop$tag_price_mean <= 3000, ]$tag_price_mean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01 166.80 282.70 422.40 520.50 2999.00
# pm1: high-price range only (3000 < average tag price < 100000);
# x axis labelled in thousands ("k").
pm1 <- ggplot(
  shop[shop$tag_price_mean > 3000 & shop$tag_price_mean < 100000, ],
  aes(x = tag_price_mean)
) +
  geom_histogram(
    bins = 200, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e3, "k")) +
  theme_bw(base_family = "STKaiti") +
  labs(
    title = "店铺商品平均标价分布",
    x = "商品平均标价(>3e3&<1e5元)",
    y = "店铺的数目(家)"
  )
# pm1
# pm2: low-price range (average tag price of at most 3000).
pm2 <- ggplot(shop[shop$tag_price_mean <= 3000, ],
              aes(x = tag_price_mean)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free_y") +
  #scale_x_continuous(labels = function(x) paste(x/10e3,"k",sep = "")) +
  theme_bw(base_family = "STKaiti") +
  labs(
    title = "店铺商品平均标价分布",
    x = "商品平均标价(<=3e3元)",
    y = "店铺的数目(家)"
  )
# pm2
grid.arrange(pm1, pm2, nrow = 2)

## 可以看出两类商店的平均标价的分布存在很明显的差异
## 在高价区,TB_JISHI的商品平均标价较高
## 在低价区,TB_TMALL的商铺比较集中于商品平均标价高的区域
## Parallel-coordinate plots of the shop data ####--------------------------------
# Axis labels, given in the same order as the plotted columns c(2:7, 9).
lab_x <- c("销售商品数","销售品牌数","月销量","月销售额","评论数","平均标价","平均售价")
# Plot 1: scale = "std" (standardised values); lines are grouped by column 8.
ggparcoord(shop,columns = c(2:7,9),groupColumn = 8,scale = "std") +
theme_gray(base_family = "STKaiti") +
theme(legend.position = "top") +
scale_x_discrete(labels = lab_x) +
labs(x= "",y = "标准化后数值",title = "商铺平行坐标图")

# Plot 2: scale = "robust". Per the GGally documentation this subtracts the
# median and divides by the median absolute deviation of each variable.
ggparcoord(shop,columns = c(2:7,9),groupColumn = 8,scale = "robust") +
theme_gray(base_family = "STKaiti") +
theme(legend.position = "top") +
scale_x_discrete(labels = lab_x) +
labs(x= "",y = "Robust后数值",title = "商铺平行坐标图")

## Plot 3: scale = "uniminmax"; per the GGally documentation each variable is
## rescaled so that its minimum is 0 and its maximum is 1.
ggparcoord(shop,columns = c(2:7,9),groupColumn = 8 ,scale = "uniminmax") +
theme_gray(base_family = "STKaiti") +
theme(legend.position = "top") +
scale_x_discrete(labels = lab_x) +
labs(x= "",y = "单位区间数值",title = "商铺平行坐标图")

## 从平行坐标图上我们可以看出两类店铺的差异
## 1: TB_TMALL 的销售商品的数目更多,月销量更高,月销售额更高,评论数目更多
## 2:TB_JISHI 的销售品牌数更多,平均标价更高,平均售价更高
## Correlations among the shop-level variables ####---------------------------
## Scatter-plot matrix of the seven numeric columns, coloured by shop type
ggscatmat(
  data = as.data.frame(shop),
  columns = c(2:7, 9),
  color = "shop_type",
  corMethod = "pearson"
) +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "top") +
  labs(title = "散点图矩阵")

## 从散点图和相关系数矩阵中可以看出,不同的类别销售方式相关性有差异
## Standardise the numeric columns, then redo the correlations / scatter matrix ----
# Each of the seven numeric columns is centred and scaled independently.
shop_std <- tbl_df(as.data.frame(
  lapply(shop[, c(2:7, 9)], function(v) as.vector(scale(v)))
))
# Carry the identifying columns over (standardised shop data).
shop_std$shop_type <- shop$shop_type
shop_std$shop_id <- shop$shop_id
head(shop_std)
## # A tibble: 6 × 9
## n_item brand_number monthly_shop_sum monthly_sales_sum
## <dbl> <dbl> <dbl> <dbl>
## 1 0.00494926 0.9106128 -0.04400471 0.002296298
## 2 0.10480155 1.1809999 -0.05381395 -0.025089271
## 3 -0.21472577 -0.4413223 -0.05785305 -0.052513470
## 4 0.14474246 0.3698388 -0.03304144 -0.022616612
## 5 -0.23469622 -0.4413223 -0.05381395 -0.056765884
## 6 -0.19475531 -0.1709352 -0.04592809 -0.049164979
## # ... with 5 more variables: assessment_sum <dbl>, tag_price_mean <dbl>,
## # price_mean <dbl>, shop_type <chr>, shop_id <chr>
summary(shop_std)
## n_item brand_number monthly_shop_sum
## Min. :-0.26132 Min. :-0.7117 Min. :-0.06420
## 1st Qu.:-0.24135 1st Qu.:-0.7117 1st Qu.:-0.06420
## Median :-0.16147 Median :-0.4413 Median :-0.06208
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.07152 3rd Qu.: 0.3698 3rd Qu.:-0.05112
## Max. :37.48950 Max. :14.4300 Max. :77.30033
## monthly_sales_sum assessment_sum tag_price_mean
## Min. :-0.05827 Min. :-0.06944 Min. : -0.03657
## 1st Qu.:-0.05827 1st Qu.:-0.06939 1st Qu.: -0.02632
## Median :-0.05649 Median :-0.06845 Median : -0.01918
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.04566 3rd Qu.:-0.06322 3rd Qu.: -0.00407
## Max. :97.36520 Max. :68.78840 Max. :152.59263
## price_mean shop_type shop_id
## Min. :-0.5673 Length:23531 Length:23531
## 1st Qu.:-0.5673 Class :character Class :character
## Median :-0.2674 Mode :character Mode :character
## Mean : 0.0000
## 3rd Qu.: 0.1065
## Max. :32.2195
## Scatter-plot matrix of the standardised columns, coloured by shop type
ggscatmat(
  data = as.data.frame(shop_std),
  columns = 1:7,
  color = "shop_type",
  corMethod = "pearson"
) +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "top") +
  labs(title = "散点图矩阵")

## 数据标准化并不能改变相关系数的大小(Pearson相关系数对各变量的线性变换保持不变)
# Correlation matrix of the seven numeric shop columns (cor() with its default method).
shop_cor <- cor(shop[,c(2:7,9)])
shop_cor
## n_item brand_number monthly_shop_sum
## n_item 1.000000000 0.099900400 0.3290160004
## brand_number 0.099900400 1.000000000 -0.0109006465
## monthly_shop_sum 0.329016000 -0.010900647 1.0000000000
## monthly_sales_sum 0.354521976 -0.008370258 0.8335239180
## assessment_sum 0.316417453 -0.016163316 0.7299167843
## tag_price_mean -0.000263462 0.013772150 -0.0008675259
## price_mean 0.023641528 0.084639091 -0.0064856184
## monthly_sales_sum assessment_sum tag_price_mean
## n_item 3.545220e-01 0.316417453 -2.634620e-04
## brand_number -8.370258e-03 -0.016163316 1.377215e-02
## monthly_shop_sum 8.335239e-01 0.729916784 -8.675259e-04
## monthly_sales_sum 1.000000e+00 0.607148620 -7.270806e-05
## assessment_sum 6.071486e-01 1.000000000 -1.024222e-03
## tag_price_mean -7.270806e-05 -0.001024222 1.000000e+00
## price_mean 2.267963e-02 -0.007825621 2.086091e-02
## price_mean
## n_item 0.023641528
## brand_number 0.084639091
## monthly_shop_sum -0.006485618
## monthly_sales_sum 0.022679633
## assessment_sum -0.007825621
## tag_price_mean 0.020860913
## price_mean 1.000000000
# corr.test(shop[,c(2:7,9)])
# corr.p(shop_cor,dim(shop)[1])
## Relabel rows and columns of the correlation matrix with the display names
lab_x <- c(
  "销售商品数", "销售品牌数", "月销量", "月销售额",
  "评论数", "平均标价", "平均售价"
)
dimnames(shop_cor) <- list(lab_x, lab_x)
shop_cor
## 销售商品数 销售品牌数 月销量 月销售额
## 销售商品数 1.000000000 0.099900400 0.3290160004 3.545220e-01
## 销售品牌数 0.099900400 1.000000000 -0.0109006465 -8.370258e-03
## 月销量 0.329016000 -0.010900647 1.0000000000 8.335239e-01
## 月销售额 0.354521976 -0.008370258 0.8335239180 1.000000e+00
## 评论数 0.316417453 -0.016163316 0.7299167843 6.071486e-01
## 平均标价 -0.000263462 0.013772150 -0.0008675259 -7.270806e-05
## 平均售价 0.023641528 0.084639091 -0.0064856184 2.267963e-02
## 评论数 平均标价 平均售价
## 销售商品数 0.316417453 -2.634620e-04 0.023641528
## 销售品牌数 -0.016163316 1.377215e-02 0.084639091
## 月销量 0.729916784 -8.675259e-04 -0.006485618
## 月销售额 0.607148620 -7.270806e-05 0.022679633
## 评论数 1.000000000 -1.024222e-03 -0.007825621
## 平均标价 -0.001024222 1.000000e+00 0.020860913
## 平均售价 -0.007825621 2.086091e-02 1.000000000
# Set the font family and a single-panel layout for base graphics.
# NOTE(review): par() changes global graphics state and is not restored afterwards.
par(family = "STKaiti",mfrow = c(1,1))
# Draw the full correlation matrix using the "pie" glyph method.
corrplot(shop_cor,method = "pie",type = "full",
mar = c(0, 6, 6, 0),title = "相关系数图")

## 可以发现月销量、月销售额、评论数之间的相关系数较大(约0.61~0.83)
## 销售商品数与月销量、月销售额、评论数的相关系数约为0.32~0.35,并未超过0.5
## 分析店铺的信息数据####--------------------------------------------
head(shop_dsr)
## # A tibble: 6 × 5
## shop_id shop_name mas sas cas
## <chr> <chr> <dbl> <dbl> <dbl>
## 1 126042966 李宁跑酷专卖店 4.81 4.76 4.73
## 2 148002147 lining李宁新兴专卖店 4.88 4.83 4.82
## 3 128369255 李宁羽毛球拍专业店 4.86 4.88 4.85
## 4 145412890 李宁天泽华盛专卖店 4.90 4.86 4.86
## 5 145270296 lining李宁图漫专卖店 4.89 4.87 4.82
## 6 108228574 李宁力方力合专卖店 4.90 4.85 4.83
summary(shop_dsr)
## shop_id shop_name mas sas
## Length:19190 Length:19190 Min. :0.000 Min. :0.000
## Class :character Class :character 1st Qu.:4.650 1st Qu.:4.690
## Mode :character Mode :character Median :4.860 Median :4.870
## Mean :4.012 Mean :4.027
## 3rd Qu.:4.960 3rd Qu.:4.970
## Max. :5.000 Max. :5.000
## cas
## Min. :0.000
## 1st Qu.:4.660
## Median :4.850
## Mean :4.013
## 3rd Qu.:4.950
## Max. :5.000
# Attach each shop's type to its DSR scores, matching on shop_id.
shop_dsrj <- left_join(
  shop_dsr,
  shop[, c("shop_id", "shop_type")],
  by = "shop_id"
)
## Some shops get no shop type from the join; drop those rows
shop_dsrj <- dplyr::filter(shop_dsrj, !is.na(shop_type))
## Plots --------------------------------------
# One histogram per score column (mas, sas, cas), binned at 0.1 and drawn with
# position = "fill", so each bin shows the relative share of the shop types.
mas <- ggplot(shop_dsrj, aes(x = mas, fill = shop_type)) +
  geom_histogram(
    aes(y = ..density..),
    binwidth = 0.1, alpha = 0.6, position = "fill", colour = "firebrick"
  ) +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "right")
# mas
sas <- ggplot(shop_dsrj, aes(x = sas, fill = shop_type)) +
  geom_histogram(
    aes(y = ..density..),
    binwidth = 0.1, alpha = 0.6, position = "fill", colour = "firebrick"
  ) +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "right")
# sas
cas <- ggplot(shop_dsrj, aes(x = cas, fill = shop_type)) +
  geom_histogram(
    aes(y = ..density..),
    binwidth = 0.1, alpha = 0.6, position = "fill", colour = "firebrick"
  ) +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "right")
# cas
grid.arrange(mas, sas, cas, nrow = 3, top = 0, bottom = 0)
## Warning: Removed 54 rows containing missing values (geom_bar).
## Warning: Removed 54 rows containing missing values (geom_bar).
## Warning: Removed 52 rows containing missing values (geom_bar).

## 可以发现,TB_TMALL 没有低分
## Combine the two shop tables; analyse only the 14977 shops found in both ####----
shop <- dplyr::left_join(shop, shop_dsr, by = "shop_id")
# Keep only rows that have a shop_name after the join.
shop <- dplyr::filter(shop, !is.na(shop_name))
# Largest monthly revenue first.
shop <- dplyr::arrange(shop, desc(monthly_sales_sum))
dim(shop)
## [1] 14977 13
as.data.frame(head(shop))
## shop_id n_item brand_number monthly_shop_sum monthly_sales_sum
## 1 71955116 1071 1 258393 111205902
## 2 62147762 5672 1 338607 87688349
## 3 57299736 1501 2 402232 60401379
## 4 60790435 903 1 263702 42626982
## 5 62847409 452 1 79457 34905426
## 6 60036677 1395 1 139804 19488651
## assessment_sum tag_price_mean shop_type price_mean shop_name
## 1 520424 592.1662 TB_TMALL 430.3751 NIKE官方旗舰店
## 2 405415 367.0465 TB_TMALL 258.9679 adidas官方旗舰店
## 3 1384268 171.3135 TB_TMALL 150.1655 李宁官方网店
## 4 680782 283.4186 TB_TMALL 161.6483 安踏官方网店
## 5 507890 748.9314 TB_TMALL 439.2996 New Balance旗舰店
## 6 1000357 269.2136 TB_TMALL 139.3998 特步官方旗舰店
## mas sas cas
## 1 4.86 4.84 4.87
## 2 4.83 4.77 4.83
## 3 4.83 4.80 4.79
## 4 4.82 4.78 4.76
## 5 4.80 4.79 4.78
## 6 4.79 4.74 4.76
## Compare the two shop types (joined data) -------------------------
# absolute counts per shop type
table(shop$shop_type)
##
## TB_JISHI TB_TMALL
## 14448 529
# share of each shop type among all shops
table(shop$shop_type) / nrow(shop)
##
## TB_JISHI TB_TMALL
## 0.96467917 0.03532083
## 可以发现销售方式为TB_JISHI占据百分比约为96.5%
## 销售方式为TB_TMALL 占据百分比约为3.5%
## Number of items sold per shop (joined data) ---------------------------------
summary(shop$n_item)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 4.00 18.00 44.58 53.00 5672.00
sum(shop$n_item > 1000, na.rm = TRUE)
## [1] 57
## One shop sells 5672 items, and 57 shops sell more than 1000 items.
## Inspect those shops:
# data.frame(shop[which(shop$n_item > 1000),])
# p1: all shops, one facet per shop type.
p1 <- ggplot(shop, aes(x = n_item)) +
  geom_histogram(
    binwidth = 5, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  theme_bw(base_family = "STKaiti") +
  labs(
    title = "店铺销售商品数目分布",
    x = "销售商品数目(件)",
    y = "店铺的数目(家)"
  )
# p2: zoom in on shops with at most 250 items.
p2 <- ggplot(dplyr::filter(shop, n_item <= 250), aes(x = n_item)) +
  geom_histogram(
    binwidth = 5, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  theme_bw(base_family = "STKaiti") +
  labs(
    title = "店铺销售商品数目分布",
    x = "销售商品数目(<=250件)",
    y = "店铺的数目(家)"
  )
grid.arrange(p1, p2, nrow = 2)

## 可以看出在两种销售方式店铺中,主要销售方式为-TB_JISHI,店铺数目占据大部分,
## 但是这种销售规模较小,大多数销售商品数目小于250
## 而对于TB_TMALL,店铺数量不多,但是销售商品数目多于250的均为这种销售方式的店铺
## Number of brands sold per shop (joined data) ------------------------------------
summary(shop$brand_number)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000 1.000 2.000 3.752 5.000 57.000
length(which(shop$brand_number > 20))
## [1] 99
## The largest brand count of any shop is 57; 99 shops sell more than 20 brands.
# p1: all shops, one facet per shop type.
p1 <- ggplot(shop) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(brand_number), binwidth = 1, color = "firebrick",
                 fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "销售商品品牌数(个)", y = "店铺的数目(家)",
       title = "店铺销售商品品牌数分布")
# p2: zoom in on shops with at most 20 brands.
# Fix: the subset used to be taken on n_item (<= 20 items), which did not
# match the plotted variable or the axis label; it is now taken on
# brand_number.
p2 <- ggplot(shop[which(shop$brand_number <= 20), ]) +
  theme_bw(base_family = "STKaiti") +
  geom_histogram(aes(brand_number), binwidth = 1, color = "firebrick",
                 fill = "red", alpha = 0.6) +
  facet_wrap(~shop_type, scales = "free") +
  labs(x = "销售商品品牌数(<=20个)", y = "店铺的数目(家)",
       title = "店铺销售商品品牌数分布")
grid.arrange(p1, p2, nrow = 2)

## 在销售商品占据的品牌数目上,两种方式的店铺数量的分布大致是相同的
## Monthly sales revenue per shop (joined data) ---------------------------------
summary(shop$monthly_sales_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 1813 83660 14160 111200000
# boxplot(shop$monthly_sales_sum)
# ms1: all shops; x axis labelled in millions ("m").
ms1 <- ggplot(shop, aes(x = monthly_sales_sum)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  scale_x_continuous(labels = function(x) paste0(x / 1e6, "m")) +
  theme_bw(base_family = "STKaiti") +
  labs(
    title = "店铺月销售额分布",
    x = "月销售额(元)",
    y = "店铺的数目(家)"
  )
# ms2: zoom in on shops with monthly revenue of at most 10000.
ms2 <- ggplot(shop[shop$monthly_sales_sum <= 10000, ],
              aes(x = monthly_sales_sum)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  #scale_x_continuous(labels = function(x) paste(x/10e3,"k",sep = "")) +
  theme_bw(base_family = "STKaiti") +
  labs(
    title = "店铺月销售额分布",
    x = "月销售额(<=1e4元)",
    y = "店铺的数目(家)"
  )
grid.arrange(ms1, ms2, nrow = 2)

## 可以看出 销售方式为TB_TMALL的店铺的月销售额更大,月销售额有超过90,000,000元的
## 只有很少一部分店铺的月销售额少于10000元
## 销售方式为TB_JISHI 的店铺,月销售额大多数不超过1000元
## 店铺的商品月销量数据可视化---------------------------------
# Distribution summary of the number of items each shop sells per month.
summary(shop$monthly_shop_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 0 11 423 75 402200
# Share of shops selling at most 1000 items per month.
nrow(shop[shop$monthly_shop_sum <= 1000, ]) / nrow(shop)
## [1] 0.9624758
# About 96% of shops sell at most 1000 items per month (recorded run above).
summary(shop$monthly_shop_sum[shop$monthly_shop_sum <= 1000])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 9.00 65.24 60.00 1000.00
# Full-range histogram; x-axis ticks are shown in units of 10,000 ("万").
mss1 <- ggplot(shop, aes(x = monthly_shop_sum)) +
  geom_histogram(
    bins = 200, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  scale_x_continuous(labels = function(v) paste0(v / 1e4, "万")) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "商品月销量(个)",
    y = "店铺的数目(家)",
    title = "店铺商品月销量分布"
  ) +
  theme_bw(base_family = "STKaiti")
# Zoomed-in histogram restricted to shops selling at most 1000 items.
mss2 <- ggplot(
  shop[shop$monthly_shop_sum <= 1000, ],
  aes(x = monthly_shop_sum)
) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "商品月销量(<=1e3个)",
    y = "店铺的数目(家)",
    title = "店铺商品月销量分布"
  ) +
  theme_bw(base_family = "STKaiti")
grid.arrange(mss1, mss2, nrow = 2)

## 可以发现整体的趋势是相同的
## 对店铺的评价总数数据可视化---------------------------------
# Distribution summary of the total number of reviews per shop.
summary(shop$assessment_sum)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0 1 18 1825 134 1384000
# Share of shops with at most 2000 reviews.
nrow(shop[shop$assessment_sum <= 2000, ]) / nrow(shop)
## [1] 0.9474528
# About 94.7% of shops have at most 2000 reviews (recorded run above).
summary(shop$assessment_sum[shop$assessment_sum <= 2000])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 1.0 15.0 127.8 98.0 2000.0
# Full-range histogram; x-axis ticks are shown in units of 10,000 ("万").
as1 <- ggplot(shop, aes(x = assessment_sum)) +
  geom_histogram(
    bins = 200, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  scale_x_continuous(labels = function(v) paste0(v / 1e4, "万")) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "评价总数目(条)",
    y = "店铺的数目(家)",
    title = "店铺评价总数目分布"
  ) +
  theme_bw(base_family = "STKaiti")
# Zoomed-in histogram restricted to shops with at most 2000 reviews.
as2 <- ggplot(
  shop[shop$assessment_sum <= 2000, ],
  aes(x = assessment_sum)
) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "评价总数目(<=2e3条)",
    y = "店铺的数目(家)",
    title = "店铺评价总数目分布"
  ) +
  theme_bw(base_family = "STKaiti")
grid.arrange(as1, as2, nrow = 2)

## 可以发现整体的趋势是相同的
## 店铺商品的平均售价数据可视化---------------------------------
# Distribution summary of each shop's mean selling price.
summary(shop$price_mean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 116.4 219.9 258.9 14010.0
# Number of shops whose mean selling price exceeds 3000.
sum(shop$price_mean > 3000, na.rm = TRUE)
## [1] 38
# 38 shops in the recorded run above. Join those shops to their shop_dsr rows
# by shop_id so they can be inspected (uncomment the print below to view).
mean_price_if <- shop[which(shop$price_mean > 3000), ] %>%
  left_join(shop_dsr, by = "shop_id")
# as.data.frame(mean_price_if)
# Share of shops whose mean selling price is at most 3000.
nrow(shop[shop$price_mean <= 3000, ]) / nrow(shop)
## [1] 0.9974628
# About 99.7% of shops have a mean selling price of at most 3000.
summary(shop$price_mean[shop$price_mean <= 3000])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 0.0 115.9 206.9 258.0 2944.0
# Full-range histogram; both panels share the x axis (only y is free) and the
# x-axis ticks are shown in thousands with a "k" suffix.
pm1 <- ggplot(shop, aes(x = price_mean)) +
  geom_histogram(
    bins = 200, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  scale_x_continuous(labels = function(v) paste0(v / 1e3, "k")) +
  facet_wrap(~shop_type, scales = "free_y") +
  labs(
    x = "商品平均售价(元)",
    y = "店铺的数目(家)",
    title = "店铺商品平均售价分布"
  ) +
  theme_bw(base_family = "STKaiti")
# Zoomed-in histogram restricted to mean selling prices of at most 3000.
pm2 <- ggplot(shop[shop$price_mean <= 3000, ], aes(x = price_mean)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free_y") +
  labs(
    x = "商品平均售价(<=3e3元)",
    y = "店铺的数目(家)",
    title = "店铺商品平均售价分布"
  ) +
  theme_bw(base_family = "STKaiti")
grid.arrange(pm1, pm2, nrow = 2)

## 在两种类型店铺的商品平均售价上,可以发现两类店铺的分布几乎是一样的,
## 并不能说明哪种类型的店铺出售的商品更高档
## 对商店商品的平均标价 进行可视化分析---------------------------
# Distribution summary of each shop's mean list (tag) price.
summary(shop$tag_price_mean)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 163.0 279.1 657.6 517.8 2500000.0
# Number of shops whose mean tag price exceeds 3000.
sum(shop$tag_price_mean > 3000, na.rm = TRUE)
## [1] 148
# Number of shops whose mean tag price exceeds 100000.
sum(shop$tag_price_mean > 100000, na.rm = TRUE)
## [1] 3
# In the recorded run above, 148 shops have a mean tag price above 3000 and
# 3 shops are above 100000. Print the latter joined to shop_dsr by shop_id.
# NOTE(review): the printed result below shows shop_name/mas/sas/cas twice
# (.x and .y suffixes), which suggests shop already holds the columns that
# shop_dsr contributes -- confirm whether this join is still needed.
mean_price_if <- shop[which(shop$tag_price_mean > 100000), ] %>%
  left_join(shop_dsr, by = "shop_id")
as.data.frame(mean_price_if)
## shop_id n_item brand_number monthly_shop_sum monthly_sales_sum
## 1 35545828 40 10 6 1069.667
## 2 118193439 2 2 0 0.000
## 3 151623813 5 4 0 0.000
## assessment_sum tag_price_mean shop_type price_mean shop_name.x
## 1 13 2500303.0 TB_JISHI 178.2778 北京斯托克司户外装备
## 2 0 100084.0 TB_JISHI 0.0000 渣霄虎子的店
## 3 0 191333.2 TB_JISHI 0.0000 独自的华丽
## mas.x sas.x cas.x shop_name.y mas.y sas.y cas.y
## 1 4.74 4.95 4.95 北京斯托克司户外装备 4.74 4.95 4.95
## 2 5.00 5.00 5.00 渣霄虎子的店 5.00 5.00 5.00
## 3 0.00 0.00 0.00 独自的华丽 0.00 0.00 0.00
## 这些店铺均是TB_JISHI类型,并且月销售额要么没有要么很低
## 说明这些商品的标价很高,但是并没有人去购买,说明这些商品是博人眼球的物品
# Share of shops whose mean tag price is at most 3000.
nrow(shop[shop$tag_price_mean <= 3000, ]) / nrow(shop)
## [1] 0.9901182
# About 99.0% of shops have a mean tag price of at most 3000.
summary(shop$tag_price_mean[shop$tag_price_mean <= 3000])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.01 162.00 276.40 410.00 503.00 2999.00
# High-price band: mean tag price strictly between 3000 and 100000, so the
# extreme shops above 100000 do not stretch the axis. Ticks are in thousands.
pm1 <- ggplot(
  shop[shop$tag_price_mean > 3000 & shop$tag_price_mean < 100000, ],
  aes(x = tag_price_mean)
) +
  geom_histogram(
    bins = 200, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  scale_x_continuous(labels = function(v) paste0(v / 1e3, "k")) +
  facet_wrap(~shop_type, scales = "free") +
  labs(
    x = "商品平均标价(>3e3&<1e5元)",
    y = "店铺的数目(家)",
    title = "店铺商品平均标价分布"
  ) +
  theme_bw(base_family = "STKaiti")
# Low-price band: mean tag price of at most 3000; panels share the x axis.
pm2 <- ggplot(shop[shop$tag_price_mean <= 3000, ], aes(x = tag_price_mean)) +
  geom_histogram(
    bins = 100, colour = "firebrick", fill = "red", alpha = 0.6
  ) +
  facet_wrap(~shop_type, scales = "free_y") +
  labs(
    x = "商品平均标价(<=3e3元)",
    y = "店铺的数目(家)",
    title = "店铺商品平均标价分布"
  ) +
  theme_bw(base_family = "STKaiti")
grid.arrange(pm1, pm2, nrow = 2)

## 可以看出两类商店的平均标价的分布存在很明显的差异
## 在高价区,TB_JISHI的商品平均标价较高
## 在低价区,TB_TMALL的商铺比较集中于商品平均标价高的区域
## 对商铺数据进行平行坐标图可视化####--------------------------------
# Axis labels for the ten plotted columns, in the same order as the column
# positions c(2:7, 9, 11:13) passed to ggparcoord below.
lab_x <- c(
  "销售商品数", "销售品牌数", "月销量", "月销售额", "评论数",
  "平均标价", "平均售价", "mas", "sas", "cas"
)
# Parallel-coordinate plots of the shop metrics, coloured by column 8, under
# three different per-variable scalings.

# scale = "std": subtract the mean and divide by the standard deviation.
ggparcoord(
  data = shop, columns = c(2:7, 9, 11:13), groupColumn = 8, scale = "std"
) +
  scale_x_discrete(labels = lab_x) +
  labs(x = "", y = "标准化后数值", title = "商铺平行坐标图") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top")

# scale = "robust": subtract the median and divide by the median absolute
# deviation.
ggparcoord(
  data = shop, columns = c(2:7, 9, 11:13), groupColumn = 8, scale = "robust"
) +
  scale_x_discrete(labels = lab_x) +
  labs(x = "", y = "Robust后数值", title = "商铺平行坐标图") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top")

# scale = "uniminmax": rescale each variable to the unit interval [0, 1].
ggparcoord(
  data = shop, columns = c(2:7, 9, 11:13), groupColumn = 8,
  scale = "uniminmax"
) +
  scale_x_discrete(labels = lab_x) +
  labs(x = "", y = "单位区间数值", title = "商铺平行坐标图") +
  theme_gray(base_family = "STKaiti") +
  theme(legend.position = "top")

## 从平行坐标图上我们可以看出两类店铺的差异
## 1: TB_TMALL 的销售商品的数目更多,月销量更高,月销售额更高,评论数目更多
## 2:TB_JISHI 的销售品牌数更多,平均标价更高,平均售价更高
## 分析商铺数据中的数据的相关系数####---------------------------
## 查看散点图
# Scatter-plot matrix of the raw shop metrics with Pearson correlations,
# coloured by shop_type. ggscatmat is handed a plain data.frame copy of shop.
ggscatmat(
  as.data.frame(shop),
  columns = c(2:7, 9, 11:13),
  color = "shop_type",
  corMethod = "pearson"
) +
  ggtitle("散点图矩阵") +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "top")

## 从散点图和相关系数矩阵中可以看出,不同的类别销售方式相关性有差异
## 将数据标准化,然后查看相关系数和散点矩阵图--------------------------------
# Standardized copy of the shop metrics: each of the ten numeric columns is
# centred on its mean and divided by its standard deviation (column-wise
# scale()), then shop_type and shop_id are carried over unchanged.
shop_std <- tbl_df(as.data.frame(scale(shop[, c(2:7, 9, 11:13)])))
shop_std[["shop_type"]] <- shop[["shop_type"]]
shop_std[["shop_id"]] <- shop[["shop_id"]]
head(shop_std)
## # A tibble: 6 × 12
## n_item brand_number monthly_shop_sum monthly_sales_sum assessment_sum
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 5.853689 -0.6916237 40.54370 78.28417 21.18327
## 2 32.093174 -0.6916237 53.15048 61.71636 16.48549
## 3 8.305978 -0.4402661 63.15006 42.49305 56.46880
## 4 4.895586 -0.6916237 41.37808 29.97122 27.73343
## 5 2.323535 -0.6916237 12.42134 24.53148 20.67130
## 6 7.701460 -0.6916237 21.90573 13.67056 40.78715
## # ... with 7 more variables: tag_price_mean <dbl>, price_mean <dbl>,
## # mas <dbl>, sas <dbl>, cas <dbl>, shop_type <chr>, shop_id <chr>
# Per-column summary of the standardized data.
summary(shop_std)
## n_item brand_number monthly_shop_sum
## Min. :-0.24852 Min. :-0.6916 Min. :-0.06647
## 1st Qu.:-0.23141 1st Qu.:-0.6916 1st Qu.:-0.06647
## Median :-0.15157 Median :-0.4403 Median :-0.06474
## Mean : 0.00000 Mean : 0.0000 Mean : 0.00000
## 3rd Qu.: 0.04804 3rd Qu.: 0.3138 3rd Qu.:-0.05469
## Max. :32.09317 Max. :13.3844 Max. :63.15006
## monthly_sales_sum assessment_sum tag_price_mean
## Min. :-0.05894 Min. :-0.07454 Min. : -0.03204
## 1st Qu.:-0.05894 1st Qu.:-0.07450 1st Qu.: -0.02409
## Median :-0.05766 Median :-0.07381 Median : -0.01844
## Mean : 0.00000 Mean : 0.00000 Mean : 0.00000
## 3rd Qu.:-0.04896 3rd Qu.:-0.06907 3rd Qu.: -0.00681
## Max. :78.28417 Max. :56.46880 Max. :121.78834
## price_mean mas sas cas
## Min. :-0.52265 Min. :-2.6215 Min. :-2.6234 Min. :-2.6212
## 1st Qu.:-0.52265 1st Qu.: 0.2954 1st Qu.: 0.3039 1st Qu.: 0.2955
## Median :-0.24602 Median : 0.3883 Median : 0.3904 Median : 0.3884
## Mean : 0.00000 Mean : 0.0000 Mean : 0.0000 Mean : 0.0000
## 3rd Qu.: 0.09251 3rd Qu.: 0.4440 3rd Qu.: 0.4398 3rd Qu.: 0.4441
## Max. :32.77893 Max. : 0.4750 Max. : 0.4645 Max. : 0.4750
## shop_type shop_id
## Length:14977 Length:14977
## Class :character Class :character
## Mode :character Mode :character
##
##
##
## 查看散点图
# Scatter-plot matrix of the first seven standardized metrics with Pearson
# correlations, coloured by shop_type.
ggscatmat(
  as.data.frame(shop_std),
  columns = 1:7,
  color = "shop_type",
  corMethod = "pearson"
) +
  ggtitle("散点图矩阵") +
  theme_bw(base_family = "STKaiti") +
  theme(legend.position = "top")

## 查看店铺的累积月销售分布####--------------------------------
# Cumulative share of total monthly revenue across shops.
# Total monthly revenue of all shops.
month_all <- sum(shop$monthly_sales_sum)
print(month_all) # roughly 1.25 billion yuan in the recorded run
## [1] 1253026233
# Running share of the total, accumulated in the row order of `shop`.
# NOTE(review): the two markers below are only meaningful if `shop` is ordered
# by monthly_sales_sum from largest to smallest -- confirm upstream.
month_pro <- cumsum(shop$monthly_sales_sum / month_all)
# Number of shops seen before the running share reaches 1. A cumulative sum of
# fractions need not land exactly on 1, so compare with a tolerance rather
# than with `!= 1`; an exact test can push this marker to the very last shop.
idx_full_share <- sum(month_pro < 1 - sqrt(.Machine$double.eps))
# Number of shops whose running share is still within the first 90%.
idx_share_90 <- sum(month_pro <= 0.9)
ggplot() +
  theme_grey(base_family = "STKaiti") +
  geom_line(aes(x = seq_along(month_pro), y = month_pro)) +
  geom_vline(xintercept = idx_full_share, color = "red") +
  geom_vline(xintercept = idx_share_90, color = "red") +
  # Fixed-position labels are annotations, not data mappings.
  annotate(
    "text", x = idx_full_share - 1000, y = 0.5,
    label = "月销售=0", family = "STKaiti"
  ) +
  annotate(
    "text", x = idx_share_90 - 600, y = 0.5,
    label = "月销售\n占所有\n90%", family = "STKaiti"
  ) +
  annotate(
    "text", x = idx_share_90 + 3000, y = 0.5,
    label = "月销售占所有10%", family = "STKaiti"
  ) +
  scale_x_continuous(breaks = seq(1, length(month_pro), by = 1000)) +
  labs(x = "店铺数目", y = "累积百分比", title = "店铺月销售额")

## 绘制直方图查看销量售额的分布
# Bar chart of the running revenue share, outlined by shop type.
data1 <- data.frame(month_pro = month_pro, shop_type = shop$shop_type)
# Keep only the shops seen before the running share reaches 1. The comparison
# uses a tolerance: if the cumulative sum ends a hair below 1, a plain `< 1`
# would keep every trailing shop as well.
data1 <- data1[data1$month_pro < 1 - sqrt(.Machine$double.eps), ]
ggplot(data1) +
  theme_bw(base_family = "STKaiti") +
  geom_bar(
    # month_pro here is the data1 column; seq_along() stays valid (empty)
    # when no rows survive the filter, unlike seq(1, length(...)).
    aes(x = seq_along(month_pro), y = month_pro, color = shop_type),
    stat = "identity", width = 1
  ) +
  theme(legend.position = "top") +
  labs(x = "店铺数目", y = "累积百分比", title = "店铺月销售额")

## 可以发现虽然店铺为TB_TMALL类型的店铺数量少,但是对总体月销售额的贡献却占据主体部分