rm(list=ls(all=T))
load("data/olist.rdata")
load("data/Z.rdata")
pacman::p_load(dplyr, ggplot2, d3heatmap, plotly, FactoMineR, factoextra)
cols = colorRamp(c('seagreen','lightyellow','red'))
從下表可觀察到,2017-11-01之後銷售量明顯提高。
O$order_purchase_timestamp = as.POSIXct(O$order_purchase_timestamp)
Table <- table(cut(O$order_purchase_timestamp, breaks="months"))%>%barplot(las=2)
11月每日整體銷售量中,在11/24(黑色星期五)有銷售高峰
O$order_purchase_timestamp = as.Date(O$order_purchase_timestamp)
a = O[,c(1,2,3,4)]
b = a %>% filter(format(a$order_purchase_timestamp, "%m")=="11")
table(cut(b$order_purchase_timestamp , breaks="days"))%>%barplot(las=2)
load("data/olist.rdata")
將訂單時間以季分時間軸度
bx = paste0(rep(2016:2018, each=4),
rep(c('-02-01','-05-01','-08-01','-11-01'), 3))[3:12] %>% as.POSIXct()
O$season = cut(O$order_purchase_timestamp, bx) %>% as.Date
O %>% filter(season >= as.Date("2017-02-01"), season <= as.Date("2018-05-01")) %>%
{table(.$season)} %>% barplot()
將賣家數量以季分時間軸度
range(Q$won_date)
[1] "2017-12-05 02:00:00 UTC" "2018-11-14 18:04:19 UTC"
bx_S = paste0(rep(2017:2018, each=4),
rep(c('-02-01','-05-01','-08-01','-11-01'), 2))%>% as.POSIXct()
Q$season = cut(Q$won_date, bx_S) %>% as.Date
table(Q$season)%>% barplot()
銷售項目比率折線圖(以季變化)
#把season放到I
I = left_join(P[,1:2], TPC) %>%
select(product_id, category=product_category_name_english) %>%
right_join(I) %>%
left_join(O[,c(1,9)])
Joining, by = "product_category_name"
Joining, by = "product_id"
Joining, by = "order_id"
table(I$category) %>% sort %>% tail(10) %>% names -> cat10
I %>% filter(category %in% cat10) %>%
group_by(season, category) %>% summarise(
ItemSold = n(),
noOrder = n_distinct(order_id),
ProdRev = sum(price),
TotalRev = sum(price, freight_value),
rTotalRev = mean(price)
) %>%
mutate_at(vars(ItemSold:TotalRev), ~(./sum(.))) -> A
A %>% filter(season >= as.Date("2017-02-01"), season <= as.Date("2018-05-01")) %>% ggplot(aes(x=season, y=ItemSold, col=category)) +
geom_line() + geom_point() + ggtitle("%ItemSold per Season")-> p
ggplotly(p)
bed_bath_table(床_衛浴_桌子):具週期性,但購買週期長,所以相較於housewares的銷量不會有太多起伏
computers_accessories(電腦周邊):每隔三機會有一次的購買高峰
furniture_decor(居家裝飾):和居家用品的趨勢相反和園藝工具相同
garden_tools(園藝):和居家裝飾相同春夏為高峰
housewares(家庭用具):冬高夏低
sports_leisure(運動休閒):春秋高峰
health_beauty(保健美容):呈逐步上升趨勢(可能受情人節或父親節影響)
watches_(手錶禮物):呈逐步上升趨勢但比保健美容更具階段性(可能受情人節或父親節影響)
telephone(電話):需求逐步下降
下表資訊:
Y軸:星期一~星期日
X軸:0~24小時
load("data/olist.rdata")
從熱圖中發現星期一是下單高峰,週末力道趨緩
table(format(O$order_purchase_timestamp,"%u"), format(O$order_purchase_timestamp,"%H")) %>%
as.data.frame.matrix %>%
d3heatmap(.,F,F,col=colorRamp(c('seagreen','lightyellow','red')))
平日下單叫假日高出許多,而平日中又以週一、週二為大宗
時間已早上9-下午4點為消費者為主要下單時段
各種類與地區的總銷售量
I = left_join(I,P[,c(1,2)])
Joining, by = "product_id"
I = left_join(I,TPC[,c(1,2)])
Joining, by = "product_category_name"
a = I %>% group_by(product_category_name)%>% summarise(
nCat = n())
a = left_join(a,TPC[,c(1,2)])
Joining, by = "product_category_name"
top15 = a %>% top_n(15, nCat)
top15
# A tibble: 15 x 3
product_category_name nCat product_category_name_english
<chr> <int> <chr>
1 automotivo 4235 auto
2 bebes 3065 baby
3 beleza_saude 9670 health_beauty
4 brinquedos 4117 toys
5 cama_mesa_banho 11115 bed_bath_table
6 cool_stuff 3796 cool_stuff
7 eletronicos 2767 electronics
8 esporte_lazer 8641 sports_leisure
9 ferramentas_jardim 4347 garden_tools
10 informatica_acessorios 7827 computers_accessories
11 moveis_decoracao 8334 furniture_decor
12 perfumaria 3419 perfumery
13 relogios_presentes 5991 watches_gifts
14 telefonia 4545 telephony
15 utilidades_domesticas 6964 housewares
top15 = left_join(top15,TPC[,c(1,2)])
Joining, by = c("product_category_name", "product_category_name_english")
top16 = rbind(top15, top15[15,])
top16$product_category_name_english[16] = 'others'
top16$nCat[16] = sum(a$nCat[!a$product_category_name_english %in% top15$product_category_name_english[1:15]])
top16$product_category_name_english <- factor(top16$product_category_name_english, levels = top16$product_category_name_english[order(-top16$nCat)])
ggplot(top16, aes(x=product_category_name_english, y=nCat)) +
geom_histogram(stat="identity") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))+
xlab("Product Category") +
ylab("Total Item Sales ")
Warning: Ignoring unknown parameters: binwidth, bins, pad
####上表可以看到,銷售量前五名屬於生活用品類(購買頻率高)
每個地區的銷售量比較:長條圖
I = left_join(I,O[,c(1,2)])
Joining, by = "order_id"
I = left_join(I,C[,c(1,5)])
Joining, by = "customer_id"
a = I %>% group_by(customer_state)%>% summarise(
nItems = n()
)
top10 = a %>% top_n(10, nItems)
top10 = rbind(top10, top10[10,])
top10$customer_state[11] = 'others'
top10$nItems[11] = sum(a$nItems[! a$customer_state %in% top10$customer_state[1:10]])
#table(I$customer_state) %>% sort(decreasing = T) %>% prop.table() %>% cumsum()
#top10
top10$customer_state <- factor(top10$customer_state, levels = top10$customer_state[order(-top10$nItems)])
ggplot(top10, aes(x=customer_state, y=nItems)) +
geom_histogram(stat="identity")+
xlab("State") +
ylab("Total Item Sales ")
Warning: Ignoring unknown parameters: binwidth, bins, pad
#怎麼找出其他?(前十個銷售量以外的州,設為「other」)
上表可以看到,位於聖保羅的消費者佔了Olist大部分的銷售量
銷售量與各變量的相關性分析
load("data/olist.rdata")
P = left_join(P,count(I, product_id))
Joining, by = "product_id"
i = !is.na(P$product_photos_qty) #排除na
cor(P$product_photos_qty[i], P$n[i]) #求銷售量跟照片多寡的相關性
[1] 0.0038473
#整體銷售量跟照片多寡相關性低
colnames(TPC)[1] = "product_category_name"
P = left_join(P, TPC)
Joining, by = "product_category_name"
P2 = P[complete.cases(P),] #消除na
P2 %>% group_by(product_category_name_english) %>% summarise(
nProd = n(),
Corr = cor(n, product_photos_qty)
) %>% arrange(Corr)
Warning in cor(n, product_photos_qty): the standard deviation is zero
# A tibble: 71 x 3
product_category_name_english nProd Corr
<chr> <int> <dbl>
1 fashion_childrens_clothes 5 -0.867
2 tablets_printing_image 9 -0.460
3 furniture_mattress_and_upholstery 10 -0.356
4 fashion_underwear_beach 53 -0.355
5 home_comfort_2 5 -0.25
6 la_cuisine 10 -0.201
7 diapers_and_hygiene 12 -0.183
8 fashion_male_clothing 95 -0.170
9 flowers 14 -0.165
10 fashion_sport 19 -0.148
# … with 61 more rows
小結:有些商品類別的銷售量跟照片多寡具有相關性
例如:流行孩童服飾的銷售量與銷售量有高相關性(相關係數的絕對值0.8668451 > 0.7)
P = left_join(P, TPC)
Joining, by = c("product_category_name", "product_category_name_english")
P2 = P[complete.cases(P),] #消除na
P2 %>% group_by(product_category_name_english) %>% summarise(
nProd = n(),
Corr = cor(n, product_name_lenght)
) %>% arrange(Corr)
Warning in cor(n, product_name_lenght): the standard deviation is zero
# A tibble: 71 x 3
product_category_name_english nProd Corr
<chr> <int> <dbl>
1 fashion_childrens_clothes 5 -0.676
2 fashion_sport 19 -0.513
3 home_comfort_2 5 -0.432
4 fashion_underwear_beach 53 -0.243
5 flowers 14 -0.222
6 furniture_bedroom 45 -0.210
7 music 27 -0.200
8 agro_industry_and_commerce 74 -0.193
9 furniture_living_room 156 -0.175
10 industry_commerce_and_business 68 -0.169
# … with 61 more rows
小結:有些商品類別的銷售量跟產品名字長度具有相關性
例如:流行孩童服飾的銷售量與產品名字長度有中度相關性(相關係數的絕對值0.4 < 0.67586633 < 0.7)
P = left_join(P, TPC)
Joining, by = c("product_category_name", "product_category_name_english")
P2 = P[complete.cases(P),] #消除na
P2 %>% group_by(product_category_name_english) %>% summarise(
nProd = n(),
Corr = cor(n, product_description_lenght)
) %>% arrange(Corr)
Warning in cor(n, product_description_lenght): the standard deviation is zero
# A tibble: 71 x 3
product_category_name_english nProd Corr
<chr> <int> <dbl>
1 fashion_childrens_clothes 5 -0.776
2 fashion_underwear_beach 53 -0.273
3 party_supplies 26 -0.212
4 cine_photo 28 -0.201
5 home_comfort_2 5 -0.183
6 furniture_mattress_and_upholstery 10 -0.183
7 office_furniture 309 -0.178
8 tablets_printing_image 9 -0.172
9 signaling_and_security 93 -0.161
10 dvds_blu_ray 48 -0.0983
# … with 61 more rows
小結:有些商品類別的銷售量跟產品描述長度具有相關性
例如:流行孩童服飾的銷售量跟產品描述長度有高度相關性(相關係數的絕對值0.77566003 > 0.7)
P = left_join(P, TPC)
Joining, by = c("product_category_name", "product_category_name_english")
P2 = P[complete.cases(P),] #消除na
P2 %>% group_by(product_category_name_english) %>% summarise(
nProd = n(),
Corr = cor(n, product_height_cm)
) %>% arrange(Corr)
Warning in cor(n, product_height_cm): the standard deviation is zero
# A tibble: 71 x 3
product_category_name_english nProd Corr
<chr> <int> <dbl>
1 la_cuisine 10 -0.452
2 furniture_mattress_and_upholstery 10 -0.408
3 flowers 14 -0.314
4 fashion_underwear_beach 53 -0.281
5 diapers_and_hygiene 12 -0.216
6 kitchen_dining_laundry_garden_furniture 94 -0.197
7 audio 58 -0.195
8 small_appliances 231 -0.192
9 market_place 104 -0.174
10 agro_industry_and_commerce 74 -0.166
# … with 61 more rows
小結:有些商品類別的銷售量跟產品高度具有相關性
例如:廚房用品的銷售量跟產品描述高度有中度相關性(相關係數的絕對值0.4 < 0.4521562 > 0.7)
P = left_join(P, TPC)
Joining, by = c("product_category_name", "product_category_name_english")
P2 = P[complete.cases(P),] #消除na
P2 %>% group_by(product_category_name_english) %>% summarise(
nProd = n(),
Corr = cor(n, product_width_cm)
) %>% arrange(Corr)
Warning in cor(n, product_width_cm): the standard deviation is zero
# A tibble: 71 x 3
product_category_name_english nProd Corr
<chr> <int> <dbl>
1 fashion_childrens_clothes 5 -0.520
2 flowers 14 -0.259
3 agro_industry_and_commerce 74 -0.257
4 food_drink 104 -0.256
5 office_furniture 309 -0.219
6 audio 58 -0.190
7 kitchen_dining_laundry_garden_furniture 94 -0.174
8 party_supplies 26 -0.168
9 fashion_shoes 173 -0.168
10 small_appliances 231 -0.159
# … with 61 more rows
小結:有些商品類別的銷售量跟產品寬度具有相關性
例如:流行孩童服飾的銷售量跟產品寬度有中度相關性(相關係數的絕對值0.4 < 0.51951 < 0.7)
賣家來源與銷售量:賣家
load("data/olist.rdata")
P= left_join(P,TPC[,c(1,2)])
Joining, by = "product_category_name"
Q=left_join(Q,L[,c(1,4)])
Joining, by = "mql_id"
new <- (S=left_join(S,Q[,c(1,2,15)]))
Joining, by = "seller_id"
I=left_join(I,P[,c(1,10)])
Joining, by = "product_id"
new = left_join(new,I[,c(8,4)])
Joining, by = "seller_id"
new2=new %>% group_by(new$seller_id) %>% summarize(
nSellerS=n(),
type=origin[1],
category=product_category_name_english[1]
) %>% arrange(desc(nSellerS)) %>% ungroup()
CategoryVSOrigin = xtabs(nSellerS ~ category + type, new2)
(rowSums(CategoryVSOrigin)/sum(CategoryVSOrigin)) %>% cumsum %>% {which(. < .95)} %>% last
[1] 47
(colSums(CategoryVSOrigin)/sum(CategoryVSOrigin)) %>% cumsum %>% {which(. < .95)} %>% last
[1] 8
CategoryVSOrigin = as(CategoryVSOrigin[1:47, 1:8], "matrix") # keep 8 origins & 55 categories
colnames(CategoryVSOrigin) = substr(colnames(CategoryVSOrigin),1 ,47) # use shorter names
`
d3heatmap(CategoryVSOrigin,F,F,col=cols, xaxis_font_size=10, yaxis_font_size=6)
range(CategoryVSOrigin[CategoryVSOrigin > 0])
[1] 1 551
因為log(0) = -Inf,所以取對數之前我們常需要先加進去一個數值,通常我們會選用矩陣中最小的數字
par(mfrow=c(1,2), cex=0.8)
hist(CategoryVSOrigin)
hist(log(50+CategoryVSOrigin,10)) # 商務應用裡面一般我們會用10底的對數
d3heatmap(log(51+CategoryVSOrigin,10),F,F,col=cols, xaxis_font_size=10, yaxis_font_size=6)
d3heatmap(log(51+CategoryVSOrigin,10),col=cols, xaxis_font_size=10, yaxis_font_size=6)
舉例來說,
販售health_beauty的賣家中,
以paid_search註冊的賣家最多,如果要徵求賣美妝的賣家,就可以多多利用付費廣告來吸引新用戶。
賣家來源與銷售量:銷售量
if(!require(morpheus)) install.packages('morpheus.R')
pacman::p_load(dplyr, ggplot2, d3heatmap, plotly, googleVis)
cols = colorRamp(c('seagreen','lightyellow','red'))
load("data/olist.rdata")
篩選出被審核通過的廠商,申請管道為何
revenue = sum(price)
廠商個別收益 = seller_id的order_id之sum(price)
SComplete = inner_join(S[, c(1)], Q[,c(1, 2, 12)])
Joining, by = "seller_id"
SComplete = inner_join(SComplete, L[, c(1, 4)])
Joining, by = "mql_id"
SComplete = inner_join(SComplete, I[, c(1, 3, 4, 6)])
Joining, by = "seller_id"
O = left_join(O, I[,c(1,4)])
Joining, by = "order_id"
SO = O %>% group_by(seller_id) %>%
summarize(nSellerSalesO=n()) %>%
arrange(desc(nSellerSalesO))
SO
# A tibble: 3,096 x 2
seller_id nSellerSalesO
<chr> <int>
1 6560211a19b47992c3666cc44a7e94c0 2033
2 4a3ca9315b744ce9f8e9374361493884 1987
3 1f50f920176fa81dab994f9023523100 1931
4 cc419e0650a3c5ba77189a1882b7556a 1775
5 da8622b14eb17ae2831f4ac5b9dab84a 1551
6 955fee9216a65b617aa5c0531780ce60 1499
7 1025f0e2d44d7041d6cf58b6550e0bfa 1428
8 7c67e1448b00f6e969d365cea6b010ab 1364
9 ea8482cd71df3c1969d7b9473ff13abc 1203
10 7a67c85e85bb2ce8582c35f2203ad736 1171
# … with 3,086 more rows
SComplete = left_join(SComplete, SO)
Joining, by = "seller_id"
SC_ORS = SComplete %>% group_by(origin) %>%
summarise(nOriginSC=n(), OriginRevenue=sum(price), avgRevenueTotalSales = OriginRevenue/nOriginSC) %>% arrange(desc(avgRevenueTotalSales)) %>% ungroup()
SC_ORS
# A tibble: 10 x 4
origin nOriginSC OriginRevenue avgRevenueTotalSales
<chr> <int> <dbl> <dbl>
1 email 24 8485. 354.
2 referral 77 17887. 232.
3 unknown 1372 213743. 156.
4 organic_search 1346 207023. 154.
5 display 7 923 132.
6 <NA> 11 1243. 113.
7 paid_search 1448 155277. 107.
8 direct_traffic 218 21904. 100.
9 social 444 43478. 97.9
10 other 97 6889. 71.0
SC_ORS$avgRevenueTotalSales = SC_ORS$avgRevenueTotalSales %>% round(0)
SC_ORS$origin <- factor(SC_ORS$origin, levels=SC_ORS$origin[order(-SC_ORS$avgRevenueTotalSales)])
geom_col 可以有XY(非連續)
state_count 只能有X(非連續)
Origin vs AvgRevenueTotalSales
ggplot(SC_ORS, aes(x=reorder(origin, -avgRevenueTotalSales), y=avgRevenueTotalSales)) +
geom_bar(width=0.75, stat="identity") +
geom_text(aes(label=round(avgRevenueTotalSales)), size=3, vjust=-0.5, check_overlap=T)
Origin vs OriginRevenue
ggplot(SC_ORS, aes(x=reorder(origin, -OriginRevenue), y=OriginRevenue)) +
geom_bar(width=0.75, stat="identity") +
geom_text(aes(label=round(OriginRevenue)), size=3, vjust=-0.5, check_overlap=T)
SC_ORS$avgRevenueTotalSales = SC_ORS$avgRevenueTotalSales %>% round(0)
SO = left_join(SO, SComplete[, c(1, 4)])
Joining, by = "seller_id"
Origin vs OriginSales
ggplot(SO, aes(x=origin, y=nSellerSalesO)) +
geom_bar(width=0.75, stat="identity") +
geom_text(aes(label=round(nSellerSalesO)), size=3, check_overlap=T)
SComplete = left_join(SComplete, P[, c(1, 2)])
Joining, by = "product_id"
SComplete = left_join(SComplete, TPC)
Joining, by = "product_category_name"
SC_OCS = SComplete %>% group_by(product_category_name_english, origin) %>%
transmute(OriginTotalSales = sum(nSellerSalesO)) %>%
ungroup()
SC_OCS
# A tibble: 5,044 x 3
product_category_name_english origin OriginTotalSales
<chr> <chr> <int>
1 bed_bath_table paid_search 373
2 bed_bath_table paid_search 373
3 bed_bath_table paid_search 373
4 bed_bath_table paid_search 373
5 garden_tools unknown 593
6 garden_tools organic_search 544
7 sports_leisure paid_search 3221
8 baby paid_search 648
9 toys paid_search 1389
10 housewares paid_search 8197
# … with 5,034 more rows
OriginNumber VS Categories
par(cex=0.8)
table(SC_OCS$product_category_name_english) %>% sort(decreasing=T) %>% barplot(las=2)
CategoryVSOrigin = xtabs(OriginTotalSales ~ product_category_name_english + origin, SC_OCS)
(rowSums(CategoryVSOrigin)/sum(CategoryVSOrigin)) %>% cumsum %>% {which(. < .95)} %>% last
[1] 55
(colSums(CategoryVSOrigin)/sum(CategoryVSOrigin)) %>% cumsum %>% {which(. < .95)} %>% last
[1] 8
CategoryVSOrigin = as(CategoryVSOrigin[1:55, 1:8], "matrix") # keep 8 origins & 55 categories
colnames(CategoryVSOrigin) = substr(colnames(CategoryVSOrigin),1 ,55) # use shorter names
try <- CompleteSa2[,1:3] test2=c(“origin”,“OriginTotalSales”,“product_category_name”)
d3heatmap(CategoryVSOrigin,F,F,col=cols, xaxis_font_size=10, yaxis_font_size=6)
range(CategoryVSOrigin[CategoryVSOrigin > 0])
[1] 1 57844116
因為log(0) = -Inf,所以取對數之前我們常需要先加進去一個數值,通常我們會選用矩陣中最小的數字
par(mfrow=c(1,2), cex=0.8)
hist(CategoryVSOrigin)
hist(log(50+CategoryVSOrigin,10)) # 商務應用裡面一般我們會用10底的對數
d3heatmap(log(51+CategoryVSOrigin,10),F,F,col=cols, xaxis_font_size=10, yaxis_font_size=6)
d3heatmap(log(51+CategoryVSOrigin,10),col=cols, xaxis_font_size=10, yaxis_font_size=6)
可以建議在有紅色方格的分類項目上打廣告,吸引該廠商加入平台。
seller_id from origin
SC_OSid = SComplete %>% group_by(origin) %>%
count(origin) %>% arrange(desc(n)) %>% ungroup()
nOriginSales = SC_OSid$n
ggplot(SC_OSid, aes(x=reorder(origin, -nOriginSales), y=nOriginSales)) +
geom_bar(width=0.75, stat="identity") +
geom_text(aes(label=round(nOriginSales)), size=3, vjust=-0.5, check_overlap=T)
RS = inner_join(S[, c(1)], SComplete[, c(1, 5, 6, 8)])
Joining, by = "seller_id"
RS = inner_join(RS, R[, c(1, 2, 3)])
Joining, by = "order_id"
RS = RS %>% group_by(seller_id) %>%
summarise(
n=n(),AvgReviewScore=mean(review_score), SellerSales=sum(n))
RS$SellerSales
[1] 3 39 44 9 4 2 1 8 9 6 1 19 5 12 1 3 10 3
[19] 2 5 3 6 4 4 6 58 1 2 16 3 6 5 34 4 1 1
[37] 30 1 1 12 2 1 2 1 25 1 3 3 18 6 14 1 29 1
[55] 3 28 2 2 6 1 3 8 1 10 1 11 7 4 20 6 1 16
[73] 10 5 9 8 2 1 49 5 1 38 10 3 24 2 1 7 1 10
[91] 21 22 5 3 1 3 1 1 6 3 23 3 4 40 67 5 8 2
[109] 2 2 1 2 11 84 4 3 1 2 10 19 4 6 22 10 5 45
[127] 38 14 2 10 3 17 17 4 10 5 1 8 2 1 4 4 35 110
[145] 5 1 1 1 5 3 40 27 1 2 6 22 4 22 43 48 5 9
[163] 8 1 3 2 11 7 98 13 15 12 5 4 2 3 5 15 26 1
[181] 11 3 28 8 4 4 578 5 7 1 9 14 16 3 24 12 4 34
[199] 2 7 15 1 1 1 15 2 2 11 106 3 1 2 1 1 8 4
[217] 1 7 4 1 1 1 12 1 18 14 12 2 7 5 3 2 7 8
[235] 4 3 14 1 11 2 2 1 8 1 6 4 7 6 1 3 4 2
[253] 15 2 7 10 12 7 21 11 1 22 5 7 23 8 2 86 38 6
[271] 2 1 2 1 15 7 12 2 29 1 2 45 1 5 3 3 13 13
[289] 8 11 1 6 338 4 15 1 9 1 15 8 14 33 1 2 11 10
[307] 63 1 7 1 24 24 35 1 9 12 73 7 1 1 16 1 1 4
[325] 59 1 13 18 8 9 48 16 4 4 8 1 1 2 14 86 2 2
[343] 3 1 12 20 1 12 7 2 11 9 2 3 4 4 8 1 8 17
[361] 3 4 2 11 4 1 8 15 91 3 9 1 10 7 27 1 6 18
[379] 15 31
廠商銷售量和廠商評價不顯著
<br>
I = left_join(I, R[, c(2, 3)])
Joining, by = "order_id"
RS = I %>% group_by(seller_id) %>% summarize(
nSellerSalesI=n(),
avgReviewScore=mean(review_score)
) %>% arrange(desc(nSellerSalesI)) %>%
unique() %>% ungroup()
RS
# A tibble: 3,095 x 3
seller_id nSellerSalesI avgReviewScore
<chr> <int> <dbl>
1 6560211a19b47992c3666cc44a7e94c0 2039 3.89
2 4a3ca9315b744ce9f8e9374361493884 2009 3.78
3 1f50f920176fa81dab994f9023523100 1940 3.98
4 cc419e0650a3c5ba77189a1882b7556a 1819 4.06
5 da8622b14eb17ae2831f4ac5b9dab84a 1574 4.07
6 955fee9216a65b617aa5c0531780ce60 1501 4.04
7 1025f0e2d44d7041d6cf58b6550e0bfa 1443 3.83
8 7c67e1448b00f6e969d365cea6b010ab 1375 3.35
9 ea8482cd71df3c1969d7b9473ff13abc 1204 3.94
10 7a67c85e85bb2ce8582c35f2203ad736 1175 4.22
# … with 3,085 more rows
SOP = left_join(S[, c(1)], SComplete[, c(1, 5, 6)])
Joining, by = "seller_id"
P = P %>% group_by(product_id) %>%
mutate(product_size_cm3=product_length_cm*product_height_cm*product_width_cm) %>%
ungroup()
P
# A tibble: 32,951 x 10
product_id product_categor… product_name_le… product_descrip…
<chr> <chr> <int> <int>
1 1e9e8ef04… perfumaria 40 287
2 3aa071139… artes 44 276
3 96bd76ec8… esporte_lazer 46 250
4 cef67bcfe… bebes 27 261
5 9dc1a7de2… utilidades_dome… 37 402
6 41d3672d4… instrumentos_mu… 60 745
7 732bd381a… cool_stuff 56 1272
8 2548af3e6… moveis_decoracao 56 184
9 37cc742be… eletrodomesticos 57 163
10 8c9210988… brinquedos 36 1156
# … with 32,941 more rows, and 6 more variables: product_photos_qty <int>,
# product_weight_g <int>, product_length_cm <int>, product_height_cm <int>,
# product_width_cm <int>, product_size_cm3 <int>
PS = left_join(P[, c(1,10)], SComplete[, c(1,6,10)])
Joining, by = "product_id"
PS = left_join(PS, RS[, c(1,2)])
Joining, by = "seller_id"
PS = left_join(PS, SO[, c(1,2)])
Joining, by = "seller_id"
cor.test(PS$product_size_cm3, PS$nSellerSalesI)
Pearson's product-moment correlation
data: PS$product_size_cm3 and PS$nSellerSalesI
t = -285, df = 594172, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.34939 -0.34492
sample estimates:
cor
-0.34716
舉例來說,
販售health_beauty的賣家中,
以paid_search和organic_search註冊的銷售量最多,
代表如果要篩選表現較佳的health_beauty賣家的話,
可以透過paid_search和organic_search的註冊來源來做區隔。
也就是說,可以在搜尋引擎上打health_beauty相關的廣告來吸引更多相關賣家。
集群分析-產品類別
load("data/olist.rdata")
category的分群
I = left_join(I,P[,c(1,2)])
Joining, by = "product_id"
I = left_join(I,TPC[,c(1,2)])
Joining, by = "product_category_name"
I = left_join(I,R[,c(2,3)])
Joining, by = "order_id"
A = I %>% group_by(product_category_name_english) %>% summarise(
rev = sum(price)) %>% ungroup()
A %>% mutate(
totalrev = sum(A[,2]),
rate_rev = rev/totalrev) %>% arrange(desc(rate_rev))
# A tibble: 72 x 4
product_category_name_english rev totalrev rate_rev
<chr> <dbl> <dbl> <dbl>
1 health_beauty 1263238. 13653185. 0.0925
2 watches_gifts 1206075. 13653185. 0.0883
3 bed_bath_table 1051086. 13653185. 0.0770
4 sports_leisure 993685. 13653185. 0.0728
5 computers_accessories 919721. 13653185. 0.0674
6 furniture_decor 736682. 13653185. 0.0540
7 cool_stuff 637259. 13653185. 0.0467
8 housewares 634543. 13653185. 0.0465
9 auto 594363. 13653185. 0.0435
10 garden_tools 486432. 13653185. 0.0356
# … with 62 more rows
A
# A tibble: 72 x 2
product_category_name_english rev
<chr> <dbl>
1 agro_industry_and_commerce 72530.
2 air_conditioning 55025.
3 art 24203.
4 arts_and_craftmanship 1814.
5 audio 50738.
6 auto 594363.
7 baby 412117.
8 bed_bath_table 1051086.
9 books_general_interest 46857.
10 books_imported 4640.
# … with 62 more rows
table(I$product_category_name_english) %>% sort(decreasing = T) %>% prop.table() %>% cumsum()
bed_bath_table health_beauty
0.10093 0.18803
sports_leisure furniture_decor
0.26593 0.34129
computers_accessories housewares
0.41198 0.47455
watches_gifts telephony
0.52828 0.56902
garden_tools auto
0.60807 0.64618
toys cool_stuff
0.68311 0.71719
perfumery baby
0.74795 0.77546
electronics stationery
0.80025 0.82286
fashion_bags_accessories pet_shop
0.84122 0.85867
office_furniture consoles_games
0.87390 0.88409
luggage_accessories construction_tools_construction
0.89386 0.90220
home_appliances small_appliances
0.90944 0.91556
musical_instruments home_construction
0.92166 0.92707
books_general_interest food
0.93202 0.93659
furniture_living_room home_confort
0.94113 0.94504
drinks audio
0.94847 0.95174
market_place construction_tools_lights
0.95452 0.95726
air_conditioning kitchen_dining_laundry_garden_furniture
0.95991 0.96244
food_drink books_technical
0.96495 0.96735
industry_commerce_and_business fashion_shoes
0.96976 0.97214
fixed_telephony home_appliances_2
0.97451 0.97667
costruction_tools_garden agro_industry_and_commerce
0.97882 0.98071
art computers
0.98259 0.98440
signaling_and_security construction_tools_safety
0.98618 0.98792
christmas_supplies fashion_male_clothing
0.98929 0.99047
fashion_underwear_beach furniture_bedroom
0.99165 0.99263
costruction_tools_tools tablets_printing_image
0.99355 0.99430
small_appliances_home_oven_and_coffee cine_photo
0.99498 0.99563
dvds_blu_ray books_imported
0.99620 0.99674
fashio_female_clothing party_supplies
0.99719 0.99757
diapers_and_hygiene furniture_mattress_and_upholstery
0.99792 0.99826
music flowers
0.99860 0.99890
fashion_sport home_comfort_2
0.99918 0.99944
arts_and_craftmanship cds_dvds_musicals
0.99966 0.99979
la_cuisine fashion_childrens_clothes
0.99991 0.99998
security_and_services
1.00000
#取前15個category(佔銷量80%)
Z = I %>% group_by(product_category_name_english) %>% summarise(
ItemsSold = n(),#賣多少產品
Rev = sum(price),#賺多少營收
noProd = n_distinct(product_id),#賣多少不一樣的產品
avgPrice = mean(price),# 平均產品售價
avgFreight = mean(freight_value),#平均運費
avgRevProd = Rev/noProd,#平均產品會產生多少營收
avgItemsProd = ItemsSold/noProd,#平均每個產品賣了多少個項目
avgscore = mean(review_score)
)
summary(Z)
product_category_name_english ItemsSold Rev
Length:72 Min. : 2 Min. : 283
Class :character 1st Qu.: 98 1st Qu.: 9356
Mode :character Median : 290 Median : 47151
Mean : 1574 Mean : 189628
3rd Qu.: 1763 3rd Qu.: 197326
Max. :11272 Max. :1263238
noProd avgPrice avgFreight avgRevProd
Min. : 1.0 Min. : 25.3 Min. :11.9 Min. : 79.3
1st Qu.: 37.0 1st Qu.: 80.7 1st Qu.:16.3 1st Qu.: 199.4
Median : 99.5 Median : 113.9 Median :19.2 Median : 345.1
Mean : 457.6 Mean : 144.2 Mean :21.7 Mean : 501.2
3rd Qu.: 543.5 3rd Qu.: 147.0 3rd Qu.:22.7 3rd Qu.: 514.7
Max. :3029.0 Max. :1098.3 Max. :48.5 Max. :7432.1
avgItemsProd avgscore
Min. : 1.00 Min. :2.50
1st Qu.: 2.32 1st Qu.:3.92
Median : 2.80 Median :4.04
Mean : 3.32 Mean :4.01
3rd Qu.: 3.92 3rd Qu.:4.15
Max. :14.00 Max. :4.64
zz = c("bed_bath_table","health_beauty","sports_leisure","furniture_decor",
"computers_accessories","housewares","watches_gifts",
"telephony","garden_tools","auto","toys","cool_stuff",
"perfumery","baby","electronics","stationery","fashion_bags_accessories",
"pet_shop","office_furniture","consoles_games")
Z = Z %>% filter(product_category_name_english %in% zz) %>% data.frame()
rownames(Z) = Z$product_category_name
pca = PCA(Z[,2:9])
pca
**Results for the Principal Component Analysis (PCA)**
The analysis was performed on 20 individuals, described by 8 variables
*The results are available in the following objects:
name description
1 "$eig" "eigenvalues"
2 "$var" "results for the variables"
3 "$var$coord" "coord. for the variables"
4 "$var$cor" "correlations variables - dimensions"
5 "$var$cos2" "cos2 for the variables"
6 "$var$contrib" "contributions of the variables"
7 "$ind" "results for the individuals"
8 "$ind$coord" "coord. for the individuals"
9 "$ind$cos2" "cos2 for the individuals"
10 "$ind$contrib" "contributions of the individuals"
11 "$call" "summary statistics"
12 "$call$centre" "mean of the variables"
13 "$call$ecart.type" "standard error of the variables"
14 "$call$row.w" "weights for the individuals"
15 "$call$col.w" "weights for the variables"
get_eigenvalue(pca)
eigenvalue variance.percent cumulative.variance.percent
Dim.1 3.1203503 39.004379 39.004
Dim.2 2.6749757 33.437196 72.442
Dim.3 1.1793839 14.742298 87.184
Dim.4 0.7904394 9.880492 97.064
Dim.5 0.1972855 2.466068 99.530
Dim.6 0.0190277 0.237847 99.768
Dim.7 0.0122745 0.153431 99.922
Dim.8 0.0062631 0.078289 100.000
pca\(var\)coord`: 各變數在各尺度的座標
pca$var$coord
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
ItemsSold -0.44894 0.86579 -0.144706 0.149039 0.029565616
Rev -0.19068 0.95096 0.220862 0.026485 0.000016669
noProd -0.61772 0.73741 -0.209503 -0.138018 0.052329209
avgPrice 0.61992 0.39552 0.542480 -0.392485 -0.083897157
avgFreight 0.73761 0.14516 -0.460213 -0.368823 0.294231113
avgRevProd 0.84597 0.40236 0.332940 0.070773 -0.005395069
avgItemsProd 0.70034 0.21154 -0.036928 0.671724 0.096360373
avgscore -0.60528 -0.30521 0.668939 0.046675 0.301243908
pca\(var\)coord`: 各變數在各尺度呈現的資訊比率
pca$var$cos2
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
ItemsSold 0.201543 0.749586 0.0209398 0.02221261 0.00087412562087
Rev 0.036358 0.904316 0.0487801 0.00070147 0.00000000027785
noProd 0.381581 0.543778 0.0438914 0.01904898 0.00273834609718
avgPrice 0.384295 0.156436 0.2942844 0.15404484 0.00703873296966
avgFreight 0.544063 0.021070 0.2117959 0.13603043 0.08657194768519
avgRevProd 0.715670 0.161891 0.1108489 0.00500884 0.00002910676947
avgItemsProd 0.490475 0.044749 0.0013637 0.45121365 0.00928532156984
avgscore 0.366365 0.093150 0.4474796 0.00217854 0.09074789199767
將變數投射到主成分空間
fviz_pca_var(pca)
【C】縮減空間中的個體 (Individuals)
pca\(ind\)coord`: 個體在各尺度的座標
pca$ind$coord
Dim.1 Dim.2 Dim.3 Dim.4 Dim.5
auto -0.746752 0.0090012 0.068352 -1.7669832 -0.077742
baby 0.637063 -0.7166511 0.108497 -0.7824179 -0.122098
bed_bath_table -1.705067 2.7134632 -1.321038 0.4040085 -0.377621
computers_accessories 0.287958 1.6308688 -0.140517 1.0258632 -0.287283
consoles_games 0.992674 -1.7848938 0.737757 -0.2747416 -0.832805
cool_stuff 1.978962 0.4033102 1.760035 0.1632432 0.564118
electronics 0.084395 -1.9841621 -0.755362 2.1222250 0.097351
fashion_bags_accessories -1.594076 -2.5061866 -0.110341 -0.2825151 -0.288563
furniture_decor -1.481470 1.2904760 -1.581025 -0.2653104 -0.206925
garden_tools 1.699746 -0.0947168 0.105496 1.4826295 0.724244
health_beauty -1.053489 2.6626509 0.763691 0.2706058 0.527869
housewares -1.557286 0.4977828 -0.863039 -0.4395855 0.368192
office_furniture 5.802838 0.4008194 -2.333619 -0.9895464 0.109087
perfumery -0.164657 -1.0356523 1.011108 0.5064239 -0.010141
pet_shop -0.519804 -2.0417267 0.429733 -0.8214849 0.376770
sports_leisure -1.851038 1.8666533 -0.067120 -0.5135895 0.403934
stationery -0.928725 -2.0023386 0.227009 -0.2449615 0.367443
telephony -0.717147 -1.0705847 -0.942613 1.0150711 -0.672900
toys -0.859137 -0.6668207 0.468714 -0.6109624 0.162937
watches_gifts 1.695010 2.4287077 2.434282 0.0020282 -0.825866
pca\(ind\)coord`: 個體在各尺度呈現的資訊比率
pca$ind$cos2
Dim.1 Dim.2 Dim.3 Dim.4
auto 0.14762530 0.000021449 0.00123682 0.82655790777
baby 0.25617180 0.324176982 0.00743019 0.38640628687
bed_bath_table 0.23539177 0.596150988 0.14129890 0.01321567910
computers_accessories 0.02102763 0.674482835 0.00500712 0.26687738722
consoles_games 0.17785966 0.575027398 0.09824061 0.01362425071
cool_stuff 0.51623600 0.021441334 0.40833457 0.00351272060
electronics 0.00078264 0.432596528 0.06269592 0.49489334232
fashion_bags_accessories 0.27953469 0.690946589 0.00133933 0.00878013273
furniture_decor 0.33788556 0.256379885 0.38482355 0.01083659189
garden_tools 0.51226305 0.001590667 0.00197331 0.38975376256
health_beauty 0.12050193 0.769773087 0.06332412 0.00795075482
housewares 0.64394427 0.065794769 0.19777551 0.05130957942
office_furniture 0.83596606 0.003988460 0.13519713 0.02430972272
perfumery 0.01128729 0.446538713 0.42562398 0.10677251550
pet_shop 0.04938752 0.761962848 0.03375479 0.12334961526
sports_leisure 0.46473180 0.472605722 0.00061106 0.03577700401
stationery 0.16773905 0.779712583 0.01002180 0.01166957766
telephony 0.12750448 0.284152384 0.22028029 0.25544778828
toys 0.40910696 0.246450864 0.12176644 0.20689081171
watches_gifts 0.18645109 0.382799174 0.38455825 0.00000026697
Dim.5
auto 0.001600008
baby 0.009409910
bed_bath_table 0.011545692
computers_accessories 0.020929219
consoles_games 0.125184514
cool_stuff 0.041948124
electronics 0.001041382
fashion_bags_accessories 0.009160061
furniture_decor 0.006591896
garden_tools 0.093002618
health_beauty 0.030254227
housewares 0.035996436
office_furniture 0.000295427
perfumery 0.000042814
pet_shop 0.025947223
sports_leisure 0.022130564
stationery 0.026256678
telephony 0.112256024
toys 0.014714689
watches_gifts 0.044262851
將變數投射到主成分空間
fviz_pca_ind(pca)
【E】將個體分群
Kmeans集群分析
set.seed(99)
kmg = kmeans(scale(Z[,2:8]), 4)$cluster %>% factor
table(kmg)
kmg
1 2 3 4
4 10 1 5
#
將個體和變數投射到主成分空間
fviz_pca_biplot(
pca, pointsize="cos2", repel=T, labelsize=3,
col.var="red", col.ind="#E7B800", alpha.ind=0.3)
主成分空間的選擇
fviz_pca_biplot(
pca, axes=c(1,3),
repel=T, col.var="black", labelsize=3,
col.ind=kmg, alpha.ind=0.6, pointshape=16,
addEllipses = T, ellipse.level = 0.65,
mean.point = FALSE) +
theme(legend.position = "none")
Too few points to calculate an ellipse
#category的群集分析
(紫色賣家群集):
巴西熱門生活用品類
前幾大銷量最高的品項都在裡面(美容保健、家居用品、Deco…)
佔總銷售額整體最高
(橘色定位):
偏高單價、購買頻率普通的商品
平均貢獻銷量比率中上
佔總銷售額比率高
集群分析-產業類別
rm(list=ls(all=T))
load("data/olist.rdata")
load("data/Z.rdata")
segment = group_by(Z1, business_segment) %>% summarise(
noSellers = n(),
avgScore = mean(avgScore),
totalRev = sum(Rev),
avgItemsSold = mean(ItemsSold),
avgPrice = totalRev/avgItemsSold,
avgRev = mean(Rev),
avgReview = mean(noReview)
) %>% arrange( desc(totalRev) )
segment
# A tibble: 29 x 8
business_segment noSellers avgScore totalRev avgItemsSold avgPrice avgRev
<chr> <int> <dbl> <dbl> <dbl> <dbl> <dbl>
1 watches 3 3.03 117279. 198 592. 39093.
2 health_beauty 45 4.51 90836. 18.2 4979. 2019.
3 household_utili… 44 4.47 51569. 12.8 4045. 1172.
4 audio_video_ele… 31 4.03 50265. 8.87 5666. 1621.
5 small_appliances 7 4.56 47675. 9.86 4837. 6811.
6 home_decor 44 4.16 44102. 10.3 4293. 1002.
7 pet 17 4.39 40499. 16.8 2407. 2382.
8 construction_to… 32 4.33 34539. 9.94 3476. 1079.
9 car_accessories 30 4.03 30175. 5.47 5520. 1006.
10 home_appliances 5 4.02 26241. 28.2 931. 5248.
# … with 19 more rows, and 1 more variable: avgReview <dbl>
Merge TPC$product_category_name_english into P as P$category
names(TPC)[1]="product_category_name"
P = left_join(P, TPC) %>% rename(category = product_category_name_english)
Joining, by = "product_category_name"
Merge P$category intoI`
I = left_join(I, P[,c(1,10)])
Joining, by = "product_id"
Summerise by category
category = filter(I, !is.na(category)) %>%
group_by(category) %>% summarise(
itemsSold = n(),
totalRev = sum(price),
avgPrice = mean(price),
noProduct = n_distinct(product_id),
noSeller = n_distinct(seller_id),
dummy = 2018
) %>% arrange(desc(totalRev))
Top-20 categories
top20 = category %>% top_n(20, totalRev)
top20
# A tibble: 20 x 7
category itemsSold totalRev avgPrice noProduct noSeller dummy
<chr> <int> <dbl> <dbl> <int> <int> <dbl>
1 health_beauty 9670 1258681. 130. 2444 492 2018
2 watches_gifts 5991 1205006. 201. 1329 101 2018
3 bed_bath_table 11115 1036989. 93.3 3029 196 2018
4 sports_leisure 8641 988049. 114. 2867 481 2018
5 computers_accessories 7827 911954. 117. 1639 287 2018
6 furniture_decor 8334 729762. 87.6 2657 370 2018
7 cool_stuff 3796 635291. 167. 789 267 2018
8 housewares 6964 632249. 90.8 2335 468 2018
9 auto 4235 592720. 140. 1900 383 2018
10 garden_tools 4347 485256. 112. 753 237 2018
11 toys 4117 483947. 118. 1411 252 2018
12 baby 3065 411765. 134. 919 244 2018
13 perfumery 3419 399125. 117. 868 175 2018
14 telephony 4545 323668. 71.2 1134 149 2018
15 office_furniture 1691 273961. 162. 309 34 2018
16 stationery 2517 230943. 91.8 849 173 2018
17 computers 203 222963. 1098. 30 9 2018
18 pet_shop 1947 214315. 110. 719 137 2018
19 musical_instruments 680 191499. 282. 289 70 2018
20 small_appliances 679 190649. 281. 231 105 2018
segment2=segment %>% filter(business_segment!="watches")%>% data.frame
rownames(segment2) = segment2$business_segment
pca = PCA(segment2[,2:8])
【D】同時投射個體和變數 (Bi-ploy)
將個體和變數投射到主成分空間
#把兩個徒合併
fviz_pca_biplot(
pca, pointsize="cos2", repel=T, labelsize=3,
col.var="red", col.ind="#E7B800", alpha.ind=0.3)
【E】將個體分群
Kmeans集群分析
#尺度分析和及群分析做搭配
#層級樹的及群分析(資料不能太多)
#kmg:標準化建立比較基準相同的權重
#cluster 分群向量
#分群玩 :美群多少人 共同特徵(summrise group mean)
set.seed(444)
kmg = kmeans(scale(segment2[,2:8]),4)$cluster %>% factor
table(kmg)
kmg
1 2 3 4
3 8 5 12
#請勿動此區參數!!!!!!
將個體和變數投射到主成分空間
fviz_pca_biplot(
pca, repel=T, col.var="black", labelsize=3,
col.ind=kmg, alpha.ind=0.6, pointshape=16,
addEllipses = T, ellipse.level = 0.65,
mean.point = FALSE) +
theme(legend.position = "none")
Too few points to calculate an ellipse
以賣家的集群分析來看 可以分為三大集群
第一個(紫)是高單價群集 平均產品收入高
運費高
總體分數低
單價偏高,銷售額佔總體也偏高–>顧客對賣家有黏著度/產品本身信賴感就強
採取策略:
不打價格促銷戰
找知名品牌合作–>增加對賣家的信賴感
第二個(綠)是消費周期較長的品類群集 品項內容主要也是日常用品
但平均的產品消費周期較長 因此在短時間內沒有特別的銷量表現
第三個(橘)是日常生活用品 (賣家很多)
銷售量、收入與評分偏高
且運費低廉
在價格不高的情況下創造出銷售額最高–>需求大、面向消費者喜好
整體表現最佳
採取策略:
擴大通路佔有率
物流狀況分析
rm(list=ls(all=T))
load("data/olist.rdata")
計算賣家販售的商品種類
P=P%>%left_join(TPC[,c(1:2)])
Joining, by = "product_category_name"
I=I%>%left_join(P[,c(1,10)])
Joining, by = "product_id"
I=I%>%left_join(R[,c(2,3)])
Joining, by = "order_id"
I=I%>%left_join(O[,c(1,3)])
Joining, by = "order_id"
new=I[,c(1,4,6,9,10)]
new=new %>% left_join(O[,c(1,2,4:8)])
Joining, by = "order_id"
new=new %>% left_join(S[,c(1,4)])
Joining, by = "seller_id"
table(new$order_status,new$review_score)
1 2 3 4 5
approved 2 0 0 1 0
canceled 416 36 36 14 44
delivered 13044 3802 9350 21246 63406
invoiced 268 33 16 20 27
processing 302 24 12 7 13
shipped 749 89 130 93 135
unavailable 6 0 0 0 1
new2=new %>% group_by(seller_id) %>% summarise(
n = n(),
avScore = mean(review_score, na.rm=T),
rtDelivered = mean(order_status=="delivered"),
delay=mean(
difftime(order_delivered_customer_date,order_estimated_delivery_date, units="days"),
na.rm=T),
rtDelay = mean(
order_delivered_customer_date>order_estimated_delivery_date, na.rm=T),
carry = mean(
difftime(order_delivered_customer_date, order_purchase_timestamp, units="days"),
na.rm=T),
seller_state = seller_state[1]
) %>% filter(n > 100) %>% arrange(rtDelivered)
#
ggplot(new2, aes(x=rtDelay, y=rtDelivered, col=avScore)) +
geom_point(aes(size=n), alpha=0.3)+
geom_text(aes(label=seller_state), size=3)->g
ggplotly(g)
各地區買家的綜合物流狀況
table(new2$seller_state)
BA DF ES MA MG PE PR RJ RS SC SP
1 2 1 1 24 1 17 9 5 12 165
new2 = new2 %>% mutate(
state = ifelse(seller_state %in% c("SP","SC","RS","RJ","PR","MG"),
seller_state, "Others"))
table(new2$state)
MG Others PR RJ RS SC SP
24 6 17 9 5 12 165
new2 %>% filter(state=="SP") %>%
ggplot(aes(x=rtDelay, y=rtDelivered, col=avScore)) +
geom_point(aes(size=n), alpha=0.6)+
ggtitle("Delivery Performance by SP Sellers") ->g
ggplotly(g)
new2 %>% filter(state!="SP") %>%
ggplot(aes(x=rtDelay, y=rtDelivered, col=avScore)) +
geom_point(aes(size=n), alpha=0.6)+
ggtitle("Deleivery Performane by State") +
facet_wrap(~state)
從剛剛的賣家產業類別
來看我們發現蓋改善貨運的狀況可以有效提高顧客的滿意度
因此我們往下分析每個地區的物流狀況
以銷售額最高的聖保羅來看大部分的賣家產品抵達率都在九成以上/但有部分賣家的延遲率高達15%
為了能給予顧客更好的消費體驗
我們建議在物流的表現上產品抵達率比須高於95%/延遲率則必須低於10%
因此可以針對這兩個部分的賣家加強物流的經營管理
物流狀況分析:Delay時間與銷售量
load("data/olist.rdata")
cols = colorRamp(c('seagreen','lightyellow','red'))
#運送時間差和地點的關係
O = O %>%
mutate(delay = (
as.Date(order_delivered_customer_date)-as.Date(order_estimated_delivery_date)
) %>% as.integer
)
I = left_join(I, O)
Joining, by = "order_id"
I = left_join(I, S)
Joining, by = "seller_id"
summary(I)
order_id order_item_id product_id seller_id
Length:112650 Min. : 1.0 Length:112650 Length:112650
Class :character 1st Qu.: 1.0 Class :character Class :character
Mode :character Median : 1.0 Mode :character Mode :character
Mean : 1.2
3rd Qu.: 1.0
Max. :21.0
shipping_limit_date price freight_value
Min. :2016-09-19 00:15:34 Min. : 0.8 Min. : 0.0
1st Qu.:2017-09-20 20:57:27 1st Qu.: 39.9 1st Qu.: 13.1
Median :2018-01-26 13:59:35 Median : 75.0 Median : 16.3
Mean :2018-01-07 15:36:52 Mean : 120.7 Mean : 20.0
3rd Qu.:2018-05-10 14:34:00 3rd Qu.: 134.9 3rd Qu.: 21.1
Max. :2020-04-09 22:35:08 Max. :6735.0 Max. :409.7
customer_id order_status order_purchase_timestamp
Length:112650 Length:112650 Min. :2016-09-04 21:15:19
Class :character Class :character 1st Qu.:2017-09-13 19:17:04
Mode :character Mode :character Median :2018-01-19 23:02:16
Mean :2018-01-01 00:09:48
3rd Qu.:2018-05-04 17:30:36
Max. :2018-09-03 09:06:57
order_approved_at order_delivered_carrier_date
Min. :2016-09-15 12:16:38 Min. :2016-10-08 10:34:01
1st Qu.:2017-09-14 03:06:05 1st Qu.:2017-09-18 20:41:02
Median :2018-01-20 13:42:05 Median :2018-01-24 18:44:52
Mean :2018-01-01 11:46:01 Mean :2018-01-05 02:22:56
3rd Qu.:2018-05-05 03:10:16 3rd Qu.:2018-05-08 13:20:15
Max. :2018-09-03 17:40:06 Max. :2018-09-11 19:48:28
NA's :15 NA's :1194
order_delivered_customer_date order_estimated_delivery_date delay
Min. :2016-10-11 13:46:32 Min. :2016-10-04 00:00:00 Min. :-147
1st Qu.:2017-09-26 20:09:44 1st Qu.:2017-10-05 00:00:00 1st Qu.: -17
Median :2018-02-02 20:57:23 Median :2018-02-16 00:00:00 Median : -13
Mean :2018-01-14 13:25:24 Mean :2018-01-24 20:12:15 Mean : -12
3rd Qu.:2018-05-15 20:09:21 3rd Qu.:2018-05-28 00:00:00 3rd Qu.: -7
Max. :2018-10-17 13:22:46 Max. :2018-10-25 00:00:00 Max. : 188
NA's :2454 NA's :2454
seller_zip_code_prefix seller_city seller_state
Length:112650 Length:112650 Length:112650
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
I %>% filter(!is.na(delay)) %>%
group_by(seller_state) %>% summarise(
nSeller = n_distinct(seller_id),
nOrder = n_distinct(order_id),
rDelay = mean(delay > 0),
avgDelay = mean(delay)) %>%
arrange(desc(rDelay))
# A tibble: 22 x 5
seller_state nSeller nOrder rDelay avgDelay
<chr> <int> <int> <dbl> <dbl>
1 AM 1 3 0.333 9
2 MA 1 389 0.194 -11.3
3 RN 5 51 0.0714 -13.5
4 SP 1769 68637 0.0711 -11.3
5 RJ 163 4230 0.0691 -12.5
6 CE 12 87 0.0667 -13.4
7 DF 30 808 0.0600 -13.2
8 MS 5 49 0.06 -17.4
9 ES 22 310 0.0577 -13.4
10 PR 335 7512 0.0530 -14.2
# … with 12 more rows
df = group_by(I, order_id, seller_id) %>% tally %>% left_join(S[,c(1,4)])
Joining, by = "seller_id"
O2 = left_join(O, df[,c(1,4)])
Joining, by = "order_id"
O2 = left_join(O2,C[,c(1,5)])
Joining, by = "customer_id"
summary(O2)
order_id customer_id order_status
Length:100785 Length:100785 Length:100785
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
order_purchase_timestamp order_approved_at
Min. :2016-09-04 21:15:19 Min. :2016-09-15 12:16:38
1st Qu.:2017-09-12 21:22:46 1st Qu.:2017-09-13 12:50:20
Median :2018-01-19 12:38:28 Median :2018-01-19 19:49:46
Mean :2017-12-31 19:45:51 Mean :2018-01-01 05:40:13
3rd Qu.:2018-05-04 22:23:55 3rd Qu.:2018-05-05 03:35:20
Max. :2018-10-17 17:30:18 Max. :2018-09-03 17:40:06
NA's :160
order_delivered_carrier_date order_delivered_customer_date
Min. :2016-10-08 10:34:01 Min. :2016-10-11 13:46:32
1st Qu.:2017-09-18 17:57:41 1st Qu.:2017-09-26 18:02:59
Median :2018-01-24 18:45:17 Median :2018-02-02 22:51:49
Mean :2018-01-05 08:27:43 Mean :2018-01-14 21:56:38
3rd Qu.:2018-05-08 14:25:00 3rd Qu.:2018-05-16 13:48:48
Max. :2018-09-11 19:48:28 Max. :2018-10-17 13:22:46
NA's :1784 NA's :2968
order_estimated_delivery_date delay seller_state
Min. :2016-09-30 00:00:00 Min. :-147.0 Length:100785
1st Qu.:2017-10-04 00:00:00 1st Qu.: -17.0 Class :character
Median :2018-02-15 00:00:00 Median : -13.0 Mode :character
Mean :2018-01-24 14:47:32 Mean : -11.9
3rd Qu.:2018-05-28 00:00:00 3rd Qu.: -7.0
Max. :2018-11-12 00:00:00 Max. : 188.0
NA's :2968
customer_state
Length:100785
Class :character
Mode :character
mx = tapply(O2$delay, list(O2$customer_state, O2$seller_state), mean)
mx[is.na(mx)] = 0
mx = mx[order(-rowSums(mx)),]
mx = mx[,order(-colSums(mx))]
N = 12
mx[1:N, 1:N] %>% d3heatmap(F,F,col=cols)
O = left_join(O, C[,c(1,5)])
Joining, by = "customer_id"
summary(O)
order_id customer_id order_status
Length:99441 Length:99441 Length:99441
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
order_purchase_timestamp order_approved_at
Min. :2016-09-04 21:15:19 Min. :2016-09-15 12:16:38
1st Qu.:2017-09-12 14:46:19 1st Qu.:2017-09-12 23:24:16
Median :2018-01-18 23:04:36 Median :2018-01-19 11:36:13
Mean :2017-12-31 08:43:12 Mean :2017-12-31 18:35:24
3rd Qu.:2018-05-04 15:42:16 3rd Qu.:2018-05-04 20:35:10
Max. :2018-10-17 17:30:18 Max. :2018-09-03 17:40:06
NA's :160
order_delivered_carrier_date order_delivered_customer_date
Min. :2016-10-08 10:34:01 Min. :2016-10-11 13:46:32
1st Qu.:2017-09-15 22:28:50 1st Qu.:2017-09-25 22:07:22
Median :2018-01-24 16:10:58 Median :2018-02-02 19:28:10
Mean :2018-01-04 21:49:48 Mean :2018-01-14 12:09:19
3rd Qu.:2018-05-08 13:37:45 3rd Qu.:2018-05-15 22:48:52
Max. :2018-09-11 19:48:28 Max. :2018-10-17 13:22:46
NA's :1783 NA's :2965
order_estimated_delivery_date delay customer_state
Min. :2016-09-30 00:00:00 Min. :-147.0 Length:99441
1st Qu.:2017-10-03 00:00:00 1st Qu.: -17.0 Class :character
Median :2018-02-15 00:00:00 Median : -12.0 Mode :character
Mean :2018-01-24 03:08:37 Mean : -11.9
3rd Qu.:2018-05-25 00:00:00 3rd Qu.: -7.0
Max. :2018-11-12 00:00:00 Max. : 188.0
NA's :2965
range(O$delay, na.rm=T)
[1] -147 188
table(O$order_status, O$delay > 0, useNA='ifany')
FALSE TRUE <NA>
approved 0 0 2
canceled 5 1 619
created 0 0 5
delivered 89936 6534 8
invoiced 0 0 314
processing 0 0 301
shipped 0 0 1107
unavailable 0 0 609
table(O$order_status)
approved canceled created delivered invoiced processing
2 625 5 96478 314 301
shipped unavailable
1107 609
O$anticipation_dev
Warning: Unknown or uninitialised column: 'anticipation_dev'.
NULL
str(R$anticipation_dev)
Warning: Unknown or uninitialised column: 'anticipation_dev'.
NULL
summary(O$anticipation_dev)
Warning: Unknown or uninitialised column: 'anticipation_dev'.
Length Class Mode
0 NULL NULL
I=left_join(I,O)
Joining, by = c("order_id", "customer_id", "order_status", "order_purchase_timestamp", "order_approved_at", "order_delivered_carrier_date", "order_delivered_customer_date", "order_estimated_delivery_date", "delay")
S=left_join(S,I)
Joining, by = c("seller_id", "seller_zip_code_prefix", "seller_city", "seller_state")
O2=O2 %>% group_by(seller_state,customer_state) %>% mutate(nProd=n())
mx = tapply(O2$delay, list(O2$customer_state, O2$seller_state), length)
mx[is.na(mx)] = 0
mx = mx[order(-rowSums(mx)),]
mx = mx[,order(-colSums(mx))]
N = 12
mx[1:N, 1:N] %>% d3heatmap(F,F,col=cols)
cust12 = rownames(mx)[1:12]; cust12
[1] "SP" "RJ" "MG" "RS" "PR" "SC" "BA" "DF" "ES" "GO" "PE" "CE"
sell12 = colnames(mx)[1:12]; sell12
[1] "SP" "MG" "PR" "RJ" "SC" "RS" "DF" "BA" "GO" "PE" "MA" "ES"
mx = O2 %>% filter(customer_state %in% cust12, seller_state %in% sell12) %>%
{tapply(.$delay, list(.$customer_state, .$seller_state), mean, na.rm=T)}
d3heatmap(mx, col=cols)
mx = O2 %>% filter(customer_state %in% cust12, seller_state %in% sell12) %>%
{tapply( .$delay > 0, list(.$customer_state, .$seller_state), mean, na.rm=T)}
mx %>% d3heatmap(F,F, col=cols)
#cust12 =c("SC","RS","RJ","BA","PE","SP","MG","DF","GO","PR","CE","ES"); cust12
#sell12 =c("MA","ES","DF","PE","GO","RS","PR","SP","RJ","BA","MG","SC"); sell12
mx = O2 %>% filter(customer_state %in% cust12, seller_state %in% sell12) %>%
{tapply(.$nProd, list(.$customer_state, .$seller_state), mean, na.rm=T)}
E=(rowSums(mx)%o%colSums(mx))/sum(mx)
r=(mx-E)/sqrt(E)
r[r>100]=100
r %>% d3heatmap(F,F, col=cols)
#DELAY的天數和銷售量呈顯著低正相關
#為何有相關
#為何是正相關
O2=O2 %>% mutate(nProd2=scale(nProd))
cor.test(O2$nProd,O2$delay)
Pearson's product-moment correlation
data: O2$nProd and O2$delay
t = 40, df = 97815, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.12080 0.13314
sample estimates:
cor
0.12698
🧙 給Olist商場的建議:
A.提供後臺數據分析、優化平台
B.增加廠商數量,開拓藍海市場
C.制定該地區本土化行銷策略
D,舉辦實體活動,增加黏著度
E.針對高價商品引進大型品牌商
F.設置延遲率與抵達率的獎懲機制
🧙 給Olist賣家的建議:
A.提供多元的產品
B.改善產品的呈現方式