## 2016年暑期课程设计
## 问题:Grupo Bimbo Inventory Demand
## 宾堡集团的库存需求
## 最大限度地提高销售和最大限度地减少烘焙食品的退回
## Daitu
## start:2016.06.21
## 参考借鉴kaggle上的公开程序
setwd("/Users/Daitu/数据分析/kaggle/Grupo Bimbo")
getwd()
## [1] "/Users/daitu/数据分析/kaggle/Grupo Bimbo"
## 加载包
library(data.table)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.2
library(dplyr)
## -------------------------------------------------------------------------
## data.table + dplyr code now lives in dtplyr.
## Please library(dtplyr)!
## -------------------------------------------------------------------------
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:data.table':
##
## between, last
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(treemap)
## 读取数据####
## 1:读取训练集
system.time({
traindata <- fread("train.csv",sep=",",header = TRUE)
})
##
Read 0.0% of 74180464 rows
Read 3.9% of 74180464 rows
Read 7.7% of 74180464 rows
Read 11.6% of 74180464 rows
Read 15.3% of 74180464 rows
Read 19.0% of 74180464 rows
Read 22.8% of 74180464 rows
Read 26.7% of 74180464 rows
Read 30.5% of 74180464 rows
Read 34.4% of 74180464 rows
Read 38.2% of 74180464 rows
Read 42.1% of 74180464 rows
Read 45.9% of 74180464 rows
Read 49.6% of 74180464 rows
Read 53.3% of 74180464 rows
Read 57.0% of 74180464 rows
Read 60.7% of 74180464 rows
Read 64.4% of 74180464 rows
Read 68.2% of 74180464 rows
Read 71.9% of 74180464 rows
Read 75.8% of 74180464 rows
Read 79.7% of 74180464 rows
Read 83.6% of 74180464 rows
Read 87.5% of 74180464 rows
Read 91.4% of 74180464 rows
Read 95.3% of 74180464 rows
Read 99.1% of 74180464 rows
Read 74180464 rows and 11 (of 11) columns from 2.980 GB file in 00:00:33
## user system elapsed
## 29.848 2.609 35.237
head(traindata)
## Semana Agencia_ID Canal_ID Ruta_SAK Cliente_ID Producto_ID
## 1: 3 1110 7 3301 15766 1212
## 2: 3 1110 7 3301 15766 1216
## 3: 3 1110 7 3301 15766 1238
## 4: 3 1110 7 3301 15766 1240
## 5: 3 1110 7 3301 15766 1242
## 6: 3 1110 7 3301 15766 1250
## Venta_uni_hoy Venta_hoy Dev_uni_proxima Dev_proxima Demanda_uni_equil
## 1: 3 25.14 0 0 3
## 2: 4 33.52 0 0 4
## 3: 4 39.32 0 0 4
## 4: 4 33.52 0 0 4
## 5: 3 22.92 0 0 3
## 6: 5 38.20 0 0 5
## 2:读取客户名单数据
cliente_tabla <- fread("cliente_tabla.csv",sep=",",header = TRUE)
head(cliente_tabla)
## Cliente_ID NombreCliente
## 1: 0 SIN NOMBRE
## 2: 1 OXXO XINANTECATL
## 3: 2 SIN NOMBRE
## 4: 3 EL MORENO
## 5: 4 SDN SER DE ALIM CUERPO SA CIA DE INT
## 6: 4 SDN SER DE ALIM CUERPO SA CIA DE INT
## 3:读取产品名单数据
producto_tabla <- fread("producto_tabla.csv",sep=",",header = TRUE)
head(producto_tabla)
## Producto_ID NombreProducto
## 1: 0 NO IDENTIFICADO 0
## 2: 9 Capuccino Moka 750g NES 9
## 3: 41 Bimbollos Ext sAjonjoli 6p 480g BIM 41
## 4: 53 Burritos Sincro 170g CU LON 53
## 5: 72 Div Tira Mini Doradita 4p 45g TR 72
## 6: 73 Pan Multigrano Linaza 540g BIM 73
## 4:读取城镇和国家(州)数据
town_state <- fread("town_state.csv",sep=",",header = TRUE)
head(town_state)
## Agencia_ID Town State
## 1: 1110 2008 AG. LAGO FILT MÉXICO, D.F.
## 2: 1111 2002 AG. AZCAPOTZALCO MÉXICO, D.F.
## 3: 1112 2004 AG. CUAUTITLAN ESTADO DE MÉXICO
## 4: 1113 2008 AG. LAGO FILT MÉXICO, D.F.
## 5: 1114 2029 AG.IZTAPALAPA 2 MÉXICO, D.F.
## 6: 1116 2011 AG. SAN ANTONIO MÉXICO, D.F.
## 数据的描述统计####
## 1:分析数据的周数:Semana
Semana <- data.frame(table(traindata$Semana))
colnames(Semana) <- c("Semana","Freq")
# 3 4 5 6 7 8 9
# 11165207 11009593 10615397 10191837 10382849 10406868 10408713
## 条形图
ggplot(data = Semana,aes(Semana,Freq)) +
geom_bar(stat = "identity", width = 0.6,fill = "lightblue",colour = "black") +
theme_grey(base_family = "STKaiti") +
scale_y_continuous() +
labs(x="周数",y="数据条数",title = "每周数据的记录数")

Semana2 <- traindata %>%
group_by(Semana) %>%
summarise(Units = sum(Venta_uni_hoy),
Return_Units = sum(Dev_uni_proxima),
NetU = sum(Demanda_uni_equil)) %>%
mutate(Retern_Rate = Return_Units / (Return_Units + Units))
Semana2
## # A tibble: 7 × 5
## Semana Units Return_Units NetU Retern_Rate
## <int> <int> <int> <int> <dbl>
## 1 3 78519996 1286055 77664309 0.01611476
## 2 4 80509571 1322852 79618866 0.01616538
## 3 5 78450353 1220967 77610637 0.01532505
## 4 6 74753832 1321510 73851129 0.01737107
## 5 7 77548038 1417450 76597014 0.01795025
## 6 8 76497828 1452369 75525105 0.01863201
## 7 9 75991700 1641371 75054450 0.02114268
dim(Semana2) # 几个星期的数据
## [1] 7 5
## 可视化每周的情况
ggplot(data = Semana2,aes(Semana,NetU,fill = Retern_Rate)) +
geom_bar(stat = "identity",color = "black",width = 0.7) +
theme_bw(base_family = "STKaiti") +
scale_y_continuous(labels=function(x)paste(x/100000, "m"))+
scale_x_continuous(labels = function(x)paste("第",x,"星期")) +
scale_fill_gradient(name="退回率", low="blue", high="red")+
labs(x = "星期",y = "销量",title = "每星期销量")

## 销售站的数据分析#####
## 2 :销售站Agencia 和 州State
agencias <- traindata %>%
group_by(Agencia_ID) %>% # 数据按照销售战进行分组统计
summarise(Units = sum(Venta_uni_hoy), # 总结多个值为一个值,units:本销售站的销量和
Pesos = sum(Venta_hoy), # 本周的销售量(比索)之和
Return_Units = sum(Dev_uni_proxima), # 下星期的返回量之和
Return_Pesos = sum(Dev_proxima), # 下星期的返回量(比索)之和
Net = sum(Demanda_uni_equil)) %>% # 调整后的需求和
mutate(Net_Pesos = Pesos - Return_Pesos, # mutate:添加新的变量
Return_Rate = Return_Units / (Units+Return_Units)) %>% # 添加变量退货比率
arrange(desc(Units)) %>% # 将数据按照变量Units的降序排列
inner_join(town_state, by="Agencia_ID") # 按照变量Agencia_ID,连接两个表,return all rows from x
head(agencias)
## # A tibble: 6 × 10
## Agencia_ID Units Pesos Return_Units Return_Pesos Net Net_Pesos
## <int> <int> <dbl> <int> <dbl> <int> <dbl>
## 1 1911 5355617 40226634 87481 761811.1 5298372 39464823
## 2 1351 4103035 30911017 44234 360760.9 4075131 30550256
## 3 1912 4038799 29837553 60433 517715.0 3996598 29319838
## 4 1312 3901226 31410683 48758 451032.4 3867069 30959651
## 5 1347 3830254 26415182 66063 477153.5 3787763 25938028
## 6 1129 3748791 66163786 19644 354812.7 3731729 65808973
## # ... with 3 more variables: Return_Rate <dbl>, Town <chr>, State <chr>
dim(agencias) # 多少个销售站
## [1] 552 10
## 可视化x:每天销量,y:销售站的数量
ggplot(agencias, aes(x=Units/7))+
geom_histogram(fill="red", color="gray", binwidth=10000)+ #条形图的宽度为10000
theme_bw(base_family = "STKaiti") +
scale_x_continuous(labels=function(x)paste(x/1000, "k"))+
labs(x = "每天的平均销量",y = "销售站数量",title = "销售站的销量")

## 前100的销售站的销量树图
treemap(agencias[1:100, ],
index=c("Agencia_ID"), vSize="Units", vColor="Return_Rate",
type="value", title.legend="Units return rate", title="Top 100 agencias")

## 所有销售站的销量树图
treemap(agencias,
index=c("Agencia_ID"), vSize="Units", vColor="Return_Rate",
type="value", title.legend="Units return rate", title="All agencias")

## 销售站的历史数据分析
agencias_history <- traindata %>%
group_by(Agencia_ID, Semana) %>% # 数据按照销售站、星期数,进行分组统计
summarise(Units = sum(Venta_uni_hoy), # 总结多个值为一个值,units:销量和
Pesos = sum(Venta_hoy),
Return_Units = sum(Dev_uni_proxima),
Return_Pesos = sum(Dev_proxima),
Net = sum(Demanda_uni_equil)) %>%
mutate(Net_Pesos = Pesos - Return_Pesos,
Avg_Pesos = Pesos / Units,
Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(Agencia_ID, Semana) %>% # 将数据按照变量 销售站、星期数的降序排列
inner_join(town_state, by="Agencia_ID")
head(agencias_history)
## Source: local data frame [6 x 12]
## Groups: Agencia_ID [1]
##
## Agencia_ID Semana Units Pesos Return_Units Return_Pesos Net
## <int> <int> <int> <dbl> <int> <dbl> <int>
## 1 1110 3 120285 1296739 3577 29712.03 119951
## 2 1110 4 135788 1385112 5260 29932.82 135327
## 3 1110 5 127420 1345680 3342 26249.25 127077
## 4 1110 6 115255 1239051 5721 26595.87 114865
## 5 1110 7 122955 1297072 6950 34845.26 122513
## 6 1110 8 127277 1345696 8030 35157.98 126735
## # ... with 5 more variables: Net_Pesos <dbl>, Avg_Pesos <dbl>,
## # Return_Rate <dbl>, Town <chr>, State <chr>
dim(agencias_history)
## [1] 3863 12
## 取出销售量前30的销售站ID
top30agencias <- agencias$Agencia_ID[1:30]
## 销量前30的销售站每周的销量和退货率图
ggplot(agencias_history %>% filter(Agencia_ID %in% top30agencias))+
geom_bar(aes(x=Semana, y=Units, fill=Return_Rate), stat="identity", color="black")+
theme_bw(base_family = "STKaiti") +
facet_wrap(~Agencia_ID)+ # 按照销售站划分成子图
scale_y_continuous(labels=function(x)paste(x/1000, "k"))+
scale_fill_gradient(name="退回率", low="white", high="red")+
ggtitle("销量前30的销售站") + ylab("销量") +xlab("星期")

## 每个州的销售数据的分析
states <- agencias_history %>%
group_by(State, Semana) %>% #数据按照州和星期分组
summarise(Units = sum(Units),
Pesos = sum(Pesos),
Return_Units = sum(Return_Units),
Return_Pesos = sum(Return_Pesos),
Net = sum(Net)) %>%
mutate(Avg_Pesos = Pesos / Units,
Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units)) # 数据按照销量排列
head(states)
## Source: local data frame [6 x 9]
## Groups: State [1]
##
## State Semana Units Pesos Return_Units Return_Pesos
## <chr> <int> <int> <dbl> <int> <dbl>
## 1 ESTADO DE MÉXICO 3 10653345 103437148 154026 1510612
## 2 ESTADO DE MÉXICO 4 10584696 100758414 148409 1484139
## 3 ESTADO DE MÉXICO 5 10523973 99693511 144172 1449122
## 4 ESTADO DE MÉXICO 8 10422406 98640885 168741 1600855
## 5 ESTADO DE MÉXICO 9 10252045 98114659 160559 1523816
## 6 ESTADO DE MÉXICO 7 10156108 96457302 175260 1698870
## # ... with 3 more variables: Net <int>, Avg_Pesos <dbl>, Return_Rate <dbl>
dim(states)[1] /7 # 多少个州
## [1] 33
## 地点--星期 --退回百分比 图像可视化
ggplot(states)+
geom_bar(aes(x=Semana, y=Units, fill=Return_Rate), stat="identity", color="black")+
theme_bw(base_family = "STKaiti") +
facet_wrap(~State)+
scale_y_continuous(labels=function(x)paste(x/1e6, "m"))+
scale_fill_gradient(name="退回率", low="white", high="red")+
ggtitle("州的销售量")+ ylab("销量") +xlab("星期")

## 州的销量树图
states2 <- states %>%
group_by(State) %>%
summarise(Units = sum(Units),
Pesos = sum(Pesos),
Return_Units = sum(Return_Units),
Return_Pesos = sum(Return_Pesos),
Net = sum(Net)) %>%
mutate(Avg_Pesos = Pesos / Units,
Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units)) # 数据按照销量排列
head(states2)
## # A tibble: 6 × 8
## State Units Pesos Return_Units Return_Pesos Net
## <chr> <int> <dbl> <int> <dbl> <int>
## 1 ESTADO DE MÉXICO 72421491 692290237 1108790 10840260 71685681
## 2 MÉXICO, D.F. 63859463 641420811 1155641 9808764 63216897
## 3 JALISCO 49266415 429703973 837289 7840476 48701496
## 4 NUEVO LEÓN 30105293 286344224 466455 5252757 29756845
## 5 GUANAJUATO 28215533 241668605 481674 4447663 27884613
## 6 PUEBLA 26631917 226746531 502023 4189637 26293428
## # ... with 2 more variables: Avg_Pesos <dbl>, Return_Rate <dbl>
dim(states2) # 多少个州
## [1] 33 8
treemap(states2,index = c("State"),vSize = "Units",vColor = "Return_Rate",
type = "value",palette=c("#FFFFFF","#FFFFFF","#FF0000"),
title.legend="Units return rate", title="All State")

## 销售渠道的分析####
canals2 <- traindata %>%
group_by(Canal_ID) %>% #根据销售渠道和星期进行分组
summarise(Units = sum(Venta_uni_hoy),
Pesos = sum(Venta_hoy),
Return_Units = sum(Dev_uni_proxima),
Return_Pesos = sum(Dev_proxima),
Net = sum(Demanda_uni_equil)) %>%
mutate(Net_Pesos = Pesos - Return_Pesos,
Avg_Pesos = Pesos / Units,
Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units))
canals2
## # A tibble: 9 × 9
## Canal_ID Units Pesos Return_Units Return_Pesos Net
## <int> <int> <dbl> <int> <dbl> <int>
## 1 1 390757290 3143719571 6177725 54588903.9 386646625
## 2 2 52402129 940084794 1226250 19762509.8 51384359
## 3 4 46663250 471003920 272376 2554061.4 46468113
## 4 5 20774423 172359549 1344109 9170591.8 19891458
## 5 11 14661539 165875147 252079 2379442.9 14589584
## 6 7 10282022 119925189 132817 1588764.4 10249097
## 7 6 3569312 28573289 54311 456051.2 3549277
## 8 8 3127856 41589248 202907 1724391.6 3109500
## 9 9 33497 1533784 0 0.0 33497
## # ... with 3 more variables: Net_Pesos <dbl>, Avg_Pesos <dbl>,
## # Return_Rate <dbl>
dim(canals2)
## [1] 9 9
# 销售渠道1占据主要的销量
treemap(canals2, index=c("Canal_ID"), vSize="Units", vColor = "Return_Rate", type="value",
#palette=c("#FFFFFF","#FFFFFF","#FF0000"),
title.legend="Units return rate",
title="Canals repartition")

##
canals <- traindata %>%
group_by(Canal_ID, Semana) %>% #根据销售渠道和星期进行分组
summarise(Units = sum(Venta_uni_hoy),
Pesos = sum(Venta_hoy),
Return_Units = sum(Dev_uni_proxima),
Return_Pesos = sum(Dev_proxima),
Net = sum(Demanda_uni_equil)) %>%
mutate(Net_Pesos = Pesos - Return_Pesos,
Avg_Pesos = Pesos / Units,
Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units))
head(canals)
## Source: local data frame [6 x 10]
## Groups: Canal_ID [1]
##
## Canal_ID Semana Units Pesos Return_Units Return_Pesos Net
## <int> <int> <int> <dbl> <int> <dbl> <int>
## 1 1 4 57970962 469960421 839777 7536925 57415516
## 2 1 3 56697977 472277894 835051 7612406 56154862
## 3 1 5 56565253 450590664 798399 7198662 56026267
## 4 1 7 56188416 444780203 942701 8242810 55557236
## 5 1 8 55482753 441614833 957245 8294457 54845666
## 6 1 9 54142315 437679187 932279 7972306 53522411
## # ... with 3 more variables: Net_Pesos <dbl>, Avg_Pesos <dbl>,
## # Return_Rate <dbl>
## 销售渠道的销量和星期和退货率
ggplot(canals)+
geom_bar(aes(x=Semana, y=Units, fill=Return_Rate), stat="identity", color="black")+
theme_bw(base_family = "STKaiti") +
facet_wrap(~Canal_ID, scale="free")+
scale_y_continuous(labels=function(x)paste(x/1e6, "m"))+
scale_fill_gradient(name="退回率", low="white", high="red")+
ggtitle("销售渠道")+ ylab("销量") +xlab("星期")

## 销售渠道和销售站分析####
agencias_canals <- traindata %>%
group_by(Agencia_ID) %>%
summarise(n_canals = n_distinct(Canal_ID)) #添加该销售渠道有多少销售站
head(agencias)
## # A tibble: 6 × 10
## Agencia_ID Units Pesos Return_Units Return_Pesos Net Net_Pesos
## <int> <int> <dbl> <int> <dbl> <int> <dbl>
## 1 1911 5355617 40226634 87481 761811.1 5298372 39464823
## 2 1351 4103035 30911017 44234 360760.9 4075131 30550256
## 3 1912 4038799 29837553 60433 517715.0 3996598 29319838
## 4 1312 3901226 31410683 48758 451032.4 3867069 30959651
## 5 1347 3830254 26415182 66063 477153.5 3787763 25938028
## 6 1129 3748791 66163786 19644 354812.7 3731729 65808973
## # ... with 3 more variables: Return_Rate <dbl>, Town <chr>, State <chr>
## 销售渠道有多少销售站可视化
ggplot(agencias_canals)+
geom_histogram(aes(x=n_canals), fill="red", color="black", alpha="0.5", binwidth=0.5)+
theme_bw(base_family = "STKaiti") +
scale_x_continuous(breaks=1:5)+
scale_y_continuous()+
theme(axis.text.x=element_text(hjust=1)) +
labs(x = "销售渠道数量",y = "销售站数量",title = "销售站的销售渠道量")

# 大部分销售站有1条销售渠道,只有很少的销售站有超过三条的销售渠道
## 销售路线的分析####
routes <- traindata %>% group_by(Ruta_SAK) %>%
summarise(n_Agencias = n_distinct(Agencia_ID), #销售路线有多少销售站
n_Clients = n_distinct(Cliente_ID), #销售路线有多少客户
Units=sum(Venta_uni_hoy), #销售路线的销售量
Return_Units = sum(Dev_uni_proxima)) %>% #销售路线的销售量退货量
mutate(Return_Rate = Return_Units / (Units+Return_Units)) %>% # 添加退货率变量
arrange(desc(Units)) # 按照销量排序
head(routes)
## # A tibble: 6 × 6
## Ruta_SAK n_Agencias n_Clients Units Return_Units Return_Rate
## <int> <int> <int> <int> <int> <dbl>
## 1 1101 82 5404 2645921 30981 0.011573453
## 2 6601 80 1104 2577239 10601 0.004096467
## 3 1102 82 5670 2571009 36771 0.014100499
## 4 1103 79 5434 2568287 37902 0.014543074
## 5 3001 49 20 2481974 402419 0.139516009
## 6 3002 49 25 2405887 125932 0.049739733
ggplot(routes, aes(x=Units/7))+
geom_histogram(fill="red", color="gray", binwidth=5000)+
theme_bw(base_family = "STKaiti") +
scale_x_continuous(labels=function(x)paste(x/1000, "k"))+
scale_y_continuous()+
labs(x = "平均每天销量",y = "销售路线数量")

## 大部分的销售路线的销售量并不多,超过2/3的销售路线每天的销售量不超过10千
## 销售路线和销售站#####
routes_agencias <- traindata %>% group_by(Ruta_SAK, Agencia_ID) %>%
summarise(count=n(), #当前分组的观测数
n_Clients = n_distinct(Cliente_ID), # 客户数量
Units=sum(Venta_uni_hoy), #销售量求和
Return_Units = sum(Dev_uni_proxima)) %>%
mutate(Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units))
head(routes_agencias)
## Source: local data frame [6 x 7]
## Groups: Ruta_SAK [4]
##
## Ruta_SAK Agencia_ID count n_Clients Units Return_Units Return_Rate
## <int> <int> <int> <int> <int> <int> <dbl>
## 1 900 22362 6953 9 201679 0 0.000000e+00
## 2 3 1142 773 1 163195 0 0.000000e+00
## 3 900 22560 2744 5 148973 0 0.000000e+00
## 4 1 1168 748 1 145380 8 5.502517e-05
## 5 8 1114 736 1 125192 174 1.387936e-03
## 6 900 22090 5043 6 111042 0 0.000000e+00
top100routes <- routes$Ruta_SAK[1:100] # 销量前100的路线
top100agencias <- agencias$Agencia_ID[1:100] # 销量前100的销售站
## 可视化
ggplot(routes_agencias %>%
filter(Ruta_SAK %in% top100routes, Agencia_ID %in% top100agencias))+
geom_point(aes(x=as.character(Ruta_SAK),
y=as.character(Agencia_ID),
size=Units, color=Return_Rate))+
theme_bw(base_family = "STKaiti") +
scale_color_gradient(name="退回率", low="blue", high="red")+
scale_size_continuous(name = "销量",range = c(1,4)) +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank()) +
labs(x = "销售路线",y = "销售站",title = "销量前100的销售站&销售路线")

top50routes <- routes$Ruta_SAK[1:50] # 销量前50的路线
top50agencias <- agencias$Agencia_ID[1:50] # 销量前50的销售站
## 可视化
ggplot(routes_agencias %>%
filter(Ruta_SAK %in% top50routes, Agencia_ID %in% top50agencias))+
geom_point(aes(x=as.character(Ruta_SAK),
y=as.character(Agencia_ID),
size=Units, color=Return_Rate))+
theme_bw(base_family = "STKaiti") +
scale_color_gradient(name="退回率", low="blue", high="red")+
scale_size_continuous(name = "销量",range = c(1,6)) +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank()) +
labs(x = "销售路线",y = "销售站",title = "销量前50的销售站&销售路线")

## 对客户数据进行分析#####
sales <- traindata %>% #客户数据
group_by(Cliente_ID) %>% # 按照客户id分组
summarise(Units = sum(Venta_uni_hoy),
Pesos = sum(Venta_hoy), # 本周销售金额
Return_Units = sum(Dev_uni_proxima),
Return_Pesos = sum(Dev_proxima), # 下星期的退回金额
Net = sum(Demanda_uni_equil)) %>%
mutate(Return_Rate = Return_Units / (Units+Return_Units),
Avg_Pesos = Pesos / Units) %>% # 单价
mutate(Net_Pesos = Pesos - Return_Pesos) %>% # 实际销售金额
inner_join(cliente_tabla, by="Cliente_ID") %>%
arrange(desc(Pesos)) # 本周销售金额排序
head(sales)
## # A tibble: 6 × 10
## Cliente_ID Units Pesos Return_Units Return_Pesos Net
## <int> <int> <dbl> <int> <dbl> <int>
## 1 653378 18650001 154662268 1131794 7367474.15 17866224
## 2 653039 909671 7697623 16642 143066.78 893756
## 3 827594 69264 4814696 0 0.00 69264
## 4 652850 490617 4018867 59664 495570.18 440039
## 5 1216931 232517 3325557 49 395.64 232472
## 6 5903732 162633 2931618 0 0.00 162633
## # ... with 4 more variables: Return_Rate <dbl>, Avg_Pesos <dbl>,
## # Net_Pesos <dbl>, NombreCliente <chr>
dim(sales)
## [1] 885416 10
# 花费量前100个客户的树形图
# 可见有一个大客户:Puebla Remision
treemap(sales[1:100, ],
index=c("NombreCliente"), vSize="Units", vColor="Return_Rate",
palette=c("#FFFFFF","#FFFFFF","#FF0000"),
type="value", title.legend="Units return rate", title="Top 100 clients")

## 客户的累积消耗量
sales$Cum_Units <- cumsum(sales$Units) / sum(sales$Units) # 累积百分比
s <- seq(1, 800000, 100) # 约有80万个客户
ggplot()+geom_line(aes(x=s, y=sales$Cum_Units[s]))+
theme_bw(base_family = "STKaiti") +
scale_x_continuous(labels=function(x) paste(x/1000, "k"))+
ggtitle("客户分配")+ xlab("客户数量")+ylab("累积比率")

## 前20万客户约贡献了75%的销售量
## 客户和销售站分析####
agencias_by_client <- traindata %>%
group_by(Cliente_ID) %>% #按照客户id分组
summarise(n_agencias = n_distinct(Agencia_ID)) %>% #多少个销售站
inner_join(cliente_tabla, by="Cliente_ID")
head(agencias_by_client)
## # A tibble: 6 × 3
## Cliente_ID n_agencias NombreCliente
## <int> <int> <chr>
## 1 26 2 BODEGA COMERCIAL MEXICANA TOLUCA
## 2 60 2 SAMS CLUB TOLUCA
## 3 65 2 WAL MART METEPEC
## 4 101 1 WAL MART TOLUCA
## 5 105 1 SUPER KOMPRAS SAN BUENAVENTURA
## 6 106 1 ISSSTE 21
dim(agencias_by_client)
## [1] 885416 3
# 单个客户使用销售站的数量
# 大部分的客户只从一个销售站购买,只有几个客户购买狗的销售站 >= 5
table(agencias_by_client$n_agencias)
##
## 1 2 3 4 5 9 62
## 844113 37510 3771 19 1 1 1
## 使用销售站多的客户信息
agencias_by_client %>% filter(n_agencias %in% c(5, 9, 62)) #返回符合条件的行
## # A tibble: 3 × 3
## Cliente_ID n_agencias NombreCliente
## <int> <int> <chr>
## 1 188391 9 DESAYUNOS ESCOLARES
## 2 653378 62 PUEBLA REMISION
## 3 1274327 5 COMERCIALIZADORA LA PUERTA DEL SOL
# Cliente_ID n_agencias NombreCliente
# (int) (int) (chr)
# 1 188391 9 DESAYUNOS ESCOLARES
# 2 653378 62 PUEBLA REMISION
# 3 1274327 5 COMERCIALIZADORA LA PUERTA DEL SOL
## 客户和购买渠道分析#####
clients_canals <- traindata %>%
group_by(Cliente_ID) %>%
summarise(n_canals = n_distinct(Canal_ID))
## 大多数客户只有一个购买渠道。不同的销售渠道可以为一个客户提供服务。
table(clients_canals$n_canals)
##
## 1 2 3 4
## 874022 6516 65 1
# 1 2 3 4
# 874022 6516 65 1
# 很少有销售站有同一个客户通过多个渠道。
clients_agencies_canals <- traindata %>%
group_by(Cliente_ID, Agencia_ID) %>%
summarise(n_canals = n_distinct(Canal_ID))
table(clients_agencies_canals$n_canals)
##
## 1 2 3
## 922108 3271 3
## 客户和路线分析#####
clients_routes <- traindata %>%
group_by(Cliente_ID) %>%
summarise(n_routes = n_distinct(Ruta_SAK))
head(clients_routes)
## # A tibble: 6 × 2
## Cliente_ID n_routes
## <int> <int>
## 1 26 6
## 2 60 21
## 3 65 3
## 4 101 1
## 5 105 1
## 6 106 1
dim(clients_routes)
## [1] 880604 2
## 大多数客户只有不到5个仓库的交货,但超过240个客户的工作与10个仓库或更多。
sum(clients_routes$n_routes >= 10)
## [1] 242
ggplot(clients_routes)+
geom_histogram(aes(x=n_routes), fill="red", color="black", alpha="0.5", binwidth=1)+
theme_bw(base_family = "STKaiti") +
scale_y_continuous(labels=function(x) paste(x/1000, "k"))+
ggtitle("客户和销售路线")+ xlab("路线数量")+ylab("客户数量")

## 对集团销售的产品进行分析#####
products <- traindata %>% group_by(Producto_ID) %>% #根据生产的产品进行分组
summarise(Units = sum(Venta_uni_hoy), # 销量
Pesos = sum(Venta_hoy), # 卖出的总钱数
Return_Units = sum(Dev_uni_proxima), # 被退回的总量
Return_Pesos = sum(Dev_proxima), #备退回的总钱数
Net = sum(Demanda_uni_equil)) %>% #调整后的我总需求
mutate(Avg_Pesos = Pesos / Units, # 每种产品的单价
Return_Rate = Return_Units / (Units+Return_Units)) %>% # 退货率
filter(!is.nan(Avg_Pesos)) %>% #剔除没有单价的商品
inner_join(producto_tabla, by="Producto_ID") %>%
arrange(desc(Units))
head(products)
## # A tibble: 6 × 9
## Producto_ID Units Pesos Return_Units Return_Pesos Net
## <int> <int> <dbl> <int> <dbl> <int>
## 1 2425 23860309 107365673 170005 765022.5 23728674
## 2 1278 19660667 88448180 178937 805123.9 19536596
## 3 1284 19625219 59297775 122273 387284.7 19539579
## 4 43285 15259454 80239869 158415 834036.1 15157951
## 5 36610 12836368 9884190 297745 229263.6 12680243
## 6 1240 12313013 104695281 195520 1800498.9 12167245
## # ... with 3 more variables: Avg_Pesos <dbl>, Return_Rate <dbl>,
## # NombreProducto <chr>
dim(products) # 产品数量
## [1] 1719 9
products$NombreProducto <- factor(as.character(products$NombreProducto), levels=products$NombreProducto)
# 销量前100的产品树图
treemap(products[1:100, ],
index=c("NombreProducto"), vSize="Units", vColor="Return_Rate",
palette=c("#FFFFFF","#FFFFFF","#FF0000"),
type="value", title.legend="Units return rate", title="Top 100 products")

## 产品的家的密度分布
ggplot(products, aes(x=Avg_Pesos))+
geom_histogram(aes(y=..density..), fill="gray", color="black", alpha="0.8")+
geom_density(fill="red", alpha="0.3")+
theme_bw(base_family = "STKaiti") +
scale_x_continuous(lim=c(0, 50))+
ggtitle("产品单价的分布")+ xlab("平均单价")+ylab("密度")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 74 rows containing non-finite values (stat_bin).
## Warning: Removed 74 rows containing non-finite values (stat_density).

## 产品和销售站
products_agencies <- traindata %>% group_by(Agencia_ID) %>%
summarise(n_products = n_distinct(Producto_ID))
head(products_agencies)
## # A tibble: 6 × 2
## Agencia_ID n_products
## <int> <int>
## 1 1110 214
## 2 1111 208
## 3 1112 199
## 4 1113 202
## 5 1114 222
## 6 1116 200
dim(products_agencies)
## [1] 552 2
## 大多数销售站会卖100~200种产品
ggplot(products_agencies)+
geom_histogram(aes(x = n_products), fill="red", color="black", alpha="0.5", binwidth=10)+
theme_bw(base_family = "STKaiti") +
ggtitle("销售站出售的产品数量")+ xlab("产品数量(种)")+ylab("销售站数量(个)")

## 产品和销售路线
routes_products <- traindata %>% group_by(Producto_ID) %>%
summarise(n_routes = n_distinct(Ruta_SAK))
table(routes_products$n_routes)
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 155 87 67 40 56 36 26 19 16 15 26 14 16 34 15
## 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30
## 18 21 46 14 23 13 13 12 14 11 6 12 5 13 8
## 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45
## 10 9 6 9 12 4 8 5 7 5 7 11 7 11 10
## 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
## 9 14 4 5 3 5 5 4 6 3 6 8 7 3 5
## 61 63 64 65 66 67 68 69 70 71 72 73 74 75 76
## 7 7 4 5 1 5 2 2 8 4 3 2 4 3 4
## 77 78 79 80 82 83 84 85 86 87 88 89 90 91 92
## 1 2 3 2 2 4 5 7 2 5 2 3 4 5 5
## 93 94 95 96 97 98 99 100 101 102 103 104 105 106 108
## 1 2 4 5 3 2 2 2 1 2 2 3 3 2 1
## 109 111 113 114 115 116 117 118 119 122 123 124 125 126 127
## 2 3 2 3 2 4 1 2 2 1 1 1 5 3 2
## 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142
## 2 1 4 3 3 5 1 3 1 2 1 2 1 1 4
## 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157
## 1 5 1 2 2 2 1 2 3 1 3 2 3 2 1
## 158 160 161 162 164 167 168 169 170 171 173 174 177 178 179
## 4 2 1 1 2 3 1 2 3 2 2 1 1 2 4
## 181 182 183 184 185 186 190 192 193 194 195 196 197 198 200
## 2 2 1 1 1 1 1 2 2 1 1 2 4 4 1
## 201 202 203 204 205 206 208 209 213 214 215 216 217 218 219
## 2 2 1 4 2 1 2 3 2 1 1 1 2 1 3
## 221 223 225 226 227 230 231 235 237 238 239 240 243 245 247
## 1 2 2 1 3 1 1 2 2 3 1 1 2 1 1
## 248 249 251 252 253 254 255 256 257 258 260 261 262 263 264
## 1 1 2 1 2 3 4 1 2 2 2 1 4 2 1
## 266 269 270 273 274 275 276 277 279 283 284 286 287 288 289
## 1 2 1 1 1 1 1 3 1 1 1 1 1 4 2
## 291 292 293 294 297 298 299 300 303 305 306 307 308 312 313
## 2 2 1 3 2 2 1 1 2 1 1 1 1 1 5
## 315 318 320 321 322 325 326 327 328 330 331 332 334 336 337
## 1 2 3 1 1 3 2 3 4 1 1 3 1 1 2
## 340 341 343 345 348 349 351 352 353 354 356 357 358 360 361
## 2 1 2 1 1 1 5 2 1 1 2 1 1 1 1
## 364 365 369 370 372 373 374 375 376 380 381 382 383 385 387
## 1 2 1 1 1 2 1 1 1 2 1 1 2 2 5
## 388 391 392 397 398 400 402 405 408 410 413 415 418 420 421
## 2 2 1 1 1 1 1 1 2 1 1 1 2 2 1
## 423 424 425 428 431 432 433 434 435 437 438 439 442 443 444
## 1 2 2 1 1 2 2 1 1 1 2 1 1 1 1
## 445 446 449 450 451 452 453 454 455 456 459 460 462 463 467
## 2 1 1 1 1 2 2 4 3 1 1 1 1 2 1
## 468 470 471 474 478 479 482 483 485 486 488 490 492 493 494
## 1 1 3 1 2 2 3 2 1 2 1 1 1 2 1
## 495 497 498 499 500 503 504 509 511 515 516 520 521 522 525
## 1 1 1 1 1 1 1 2 1 2 1 2 1 1 1
## 529 535 539 540 541 546 547 549 553 564 565 567 572 574 576
## 1 2 1 1 1 2 2 1 1 2 3 4 2 1 1
## 579 580 582 593 596 598 600 601 602 604 608 615 616 619 621
## 2 1 1 1 1 1 1 2 1 1 2 1 2 1 2
## 622 624 632 635 637 656 658 662 665 669 671 674 680 682 689
## 1 1 1 2 1 1 1 1 2 1 1 1 1 1 1
## 690 691 696 697 702 707 708 716 719 722 724 729 731 733 735
## 1 1 1 1 1 1 1 1 1 1 1 2 2 1 2
## 736 744 748 756 765 771 773 782 789 791 793 798 799 803 804
## 1 1 2 2 1 1 1 1 2 1 1 2 1 1 1
## 807 811 818 830 832 843 861 947 955 956 957 964 979 991 994
## 1 2 1 1 1 1 1 3 1 1 1 1 1 1 1
## 997 1001 1020 1029 1036 1144 1166 1196
## 1 1 1 1 1 1 1 1
## 大部分的产品只有几条销售路线,只有几种产品的销售路线很多
ggplot(routes_products)+
geom_histogram(aes(x=n_routes), fill="red", color="black", alpha="0.5", binwidth=10)+
theme_bw(base_family = "STKaiti") +
ggtitle("产品和销售线路")+ xlab("销售线路(条)")+ylab("产品数量(种)")

## 产品和销售路线 top100
routes.products <- traindata %>% group_by(Ruta_SAK, Producto_ID) %>%
summarise(count=n(),
n_Agencias = n_distinct(Agencia_ID),
n_Clients = n_distinct(Cliente_ID),
Units=sum(Venta_uni_hoy),
Return_Units = sum(Dev_uni_proxima)) %>%
mutate(Return_Rate = Return_Units / (Units+Return_Units)) %>%
arrange(desc(Units))
head(routes.products)
## Source: local data frame [6 x 8]
## Groups: Ruta_SAK [6]
##
## Ruta_SAK Producto_ID count n_Agencias n_Clients Units Return_Units
## <int> <int> <int> <int> <int> <int> <int>
## 1 1154 2425 12062 27 1967 388887 2604
## 2 1151 2425 12759 27 2070 377794 2172
## 3 1155 2425 10914 24 1792 372039 2737
## 4 1152 2425 12196 26 2012 370659 2282
## 5 1156 2425 10844 24 1780 361474 2787
## 6 1153 2425 11219 24 1830 352817 2316
## # ... with 1 more variables: Return_Rate <dbl>
dim(routes.products)
## [1] 250265 8
top100routes <- routes$Ruta_SAK[1:100]
top100products <- products$Producto_ID[1:100]
ggplot(routes.products %>%
filter(Ruta_SAK %in% top100routes, Producto_ID %in% top100products))+
geom_point(aes(x=as.character(Ruta_SAK),
y=as.character(Producto_ID),
size=Units, color=Return_Rate))+
theme_bw(base_family = "STKaiti")+
scale_color_gradient(name="退回率", low="blue", high="red")+
scale_size_continuous(name = "销量",range = c(1,6)) +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank()) +
ggtitle("销量前100产品&线路")+ xlab("销售线路ID")+ylab("产品ID")

## 产品和销售路线 top50
top50routes <- routes$Ruta_SAK[1:50]
top50products <- products$Producto_ID[1:50]
ggplot(routes.products %>%
filter(Ruta_SAK %in% top50routes, Producto_ID %in% top50products))+
geom_point(aes(x=as.character(Ruta_SAK),
y=as.character(Producto_ID),
size=Units, color=Return_Rate))+
theme_bw(base_family = "STKaiti")+
scale_color_gradient(name="退回率", low="blue", high="red")+
scale_size_continuous(name = "销量",range = c(1,8)) +
theme(axis.line=element_blank(),
axis.text.x=element_blank(),
axis.text.y=element_blank()) +
ggtitle("销量前50产品&线路")+ xlab("销售线路ID")+ylab("产品ID")

## 产品和客户
products_by_client <- traindata %>%
group_by(Cliente_ID) %>%
summarise(n_products = n_distinct(Producto_ID)) %>%
inner_join(cliente_tabla, by="Cliente_ID")
head(products_by_client)
## # A tibble: 6 × 3
## Cliente_ID n_products NombreCliente
## <int> <int> <chr>
## 1 26 51 BODEGA COMERCIAL MEXICANA TOLUCA
## 2 60 34 SAMS CLUB TOLUCA
## 3 65 112 WAL MART METEPEC
## 4 101 4 WAL MART TOLUCA
## 5 105 72 SUPER KOMPRAS SAN BUENAVENTURA
## 6 106 21 ISSSTE 21
dim(products_by_client)
## [1] 885416 3
ggplot(products_by_client)+
geom_histogram(aes(x=n_products), fill="red", color="black", alpha="0.3", binwidth=2)+
theme_bw(base_family = "STKaiti")+
scale_y_continuous(labels=function(x)paste(x/1000, "k"))+
ggtitle("产品量所对应的客户量")+ xlab("产品数量(种)")+ylab("客户数量(位)")

## 需求量的数据分布####
Demanda_uni_equil <- traindata %>%
group_by(Producto_ID,Semana,Agencia_ID,Canal_ID,Ruta_SAK) %>%
summarise(Demanda = sum(Demanda_uni_equil),
logDemanda = log(sum(Demanda_uni_equil+1))) #销量取对数,均值
head(Demanda_uni_equil)
## Source: local data frame [6 x 7]
## Groups: Producto_ID, Semana, Agencia_ID, Canal_ID [3]
##
## Producto_ID Semana Agencia_ID Canal_ID Ruta_SAK Demanda logDemanda
## <int> <int> <int> <int> <int> <int> <dbl>
## 1 41 3 2281 7 3303 30 3.433987
## 2 41 3 2281 7 3306 3086 8.035603
## 3 41 3 2281 7 3309 330 5.805135
## 4 41 3 23879 7 3201 215 5.379897
## 5 41 4 2281 7 3303 200 5.308268
## 6 41 4 2281 7 3306 2170 7.683864
dim(Demanda_uni_equil)
## [1] 5363216 7
summary(Demanda_uni_equil$Demanda)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 14.00 40.00 99.93 104.00 15120.00
summary(Demanda_uni_equil$logDemanda)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.890 3.892 3.824 4.828 9.624
ggplot(Demanda_uni_equil,aes(x =Demanda)) +
geom_histogram(aes(y = ..density..),fill="gray", color="black", alpha="0.8",binwidth = 1) +
geom_density(fill="red", alpha="0.3")+
theme_bw(base_family = "STKaiti") +
#scale_x_continuous(lim=c(0, 100))+
ggtitle("产品需求量的分布")+ xlab("实际需求量")+ylab("密度")

ggplot(Demanda_uni_equil,aes(x =Demanda)) +
geom_histogram(aes(y = ..density..),fill="gray", color="black", alpha="0.8",binwidth = 1) +
geom_density(fill="red", alpha="0.3")+
theme_bw(base_family = "STKaiti") +
scale_x_continuous(lim=c(0, 1000))+
ggtitle("产品需求量的分布")+ xlab("实际需求量")+ylab("密度")
## Warning: Removed 47788 rows containing non-finite values (stat_bin).
## Warning: Removed 47788 rows containing non-finite values (stat_density).

ggplot(Demanda_uni_equil,aes(x =logDemanda)) +
geom_histogram(aes(y = ..density..),fill="gray", color="black", alpha="0.8",binwidth = 0.4) +
geom_density(fill="red", alpha="0.2")+
theme_bw(base_family = "STKaiti") +
#scale_x_continuous(lim=c(0, 50))+
ggtitle("产品需求量的分布")+ xlab("取对数后实际需求量")+ylab("密度")
