Group6_OLIST

rm(list=ls(all=T))
load("data/olist.rdata")
load("data/Z.rdata")

pacman::p_load(dplyr, ggplot2, d3heatmap, plotly, FactoMineR, factoextra)
cols = colorRamp(c('seagreen','lightyellow','red'))

從下表可觀察到，2017-11-01之後銷售量明顯提高。

O$order_purchase_timestamp = as.POSIXct(O$order_purchase_timestamp)
Table <- table(cut(O$order_purchase_timestamp, breaks="months"))%>%barplot(las=2)

11月每日整體銷售量中，在11/24（黑色星期五）有銷售高峰

O$order_purchase_timestamp = as.Date(O$order_purchase_timestamp)


a = O[,c(1,2,3,4)]

b = a %>% filter(format(a$order_purchase_timestamp, "%m")=="11")  


table(cut(b$order_purchase_timestamp , breaks="days"))%>%barplot(las=2)

load("data/olist.rdata")

將訂單時間以季分時間軸度

bx = paste0(rep(2016:2018, each=4),
       rep(c('-02-01','-05-01','-08-01','-11-01'), 3))[3:12] %>% as.POSIXct()
O$season = cut(O$order_purchase_timestamp, bx) %>% as.Date
O %>% filter(season >= as.Date("2017-02-01"), season <= as.Date("2018-05-01")) %>% 
  {table(.$season)} %>% barplot()

將賣家數量以季分時間軸度

range(Q$won_date)

[1] "2017-12-05 02:00:00 UTC" "2018-11-14 18:04:19 UTC"

bx_S = paste0(rep(2017:2018, each=4),
       rep(c('-02-01','-05-01','-08-01','-11-01'), 2))%>% as.POSIXct()
Q$season = cut(Q$won_date, bx_S) %>% as.Date
table(Q$season)%>% barplot()

銷售項目比率折線圖(以季變化)

#把season放到I
I = left_join(P[,1:2], TPC) %>% 
  select(product_id, category=product_category_name_english) %>% 
  right_join(I) %>% 
  left_join(O[,c(1,9)])

Joining, by = "product_category_name"

Joining, by = "product_id"

Joining, by = "order_id"

table(I$category) %>% sort %>% tail(10) %>% names -> cat10
I %>% filter(category %in% cat10) %>% 
  group_by(season, category) %>% summarise(
    ItemSold = n(),
    noOrder = n_distinct(order_id),
    ProdRev = sum(price),
    TotalRev = sum(price, freight_value),
    rTotalRev = mean(price)
    ) %>% 
  mutate_at(vars(ItemSold:TotalRev), ~(./sum(.))) -> A

A %>% filter(season >= as.Date("2017-02-01"), season <= as.Date("2018-05-01")) %>% ggplot(aes(x=season, y=ItemSold, col=category)) + 
  geom_line() + geom_point() + ggtitle("%ItemSold per Season")-> p
ggplotly(p)

bed_bath_table(床_衛浴_桌子):具週期性,但購買週期長,所以相較於housewares的銷量不會有太多起伏

computers_accessories(電腦周邊):每隔三機會有一次的購買高峰

furniture_decor(居家裝飾):和居家用品的趨勢相反和園藝工具相同

garden_tools(園藝):和居家裝飾相同春夏為高峰

housewares(家庭用具):冬高夏低

sports_leisure(運動休閒):春秋高峰

health_beauty(保健美容):呈逐步上升趨勢（可能受情人節或父親節影響）

watches_(手錶禮物):呈逐步上升趨勢但比保健美容更具階段性（可能受情人節或父親節影響）

telephone(電話):需求逐步下降

下表資訊：

Y軸：星期一～星期日

X軸：0~24小時

load("data/olist.rdata")

從熱圖中發現星期一是下單高峰，週末力道趨緩

table(format(O$order_purchase_timestamp,"%u"), format(O$order_purchase_timestamp,"%H")) %>% 
  as.data.frame.matrix %>% 
  d3heatmap(.,F,F,col=colorRamp(c('seagreen','lightyellow','red')))

平日下單叫假日高出許多，而平日中又以週一、週二為大宗

時間已早上9-下午4點為消費者為主要下單時段

各種類與地區的總銷售量

I = left_join(I,P[,c(1,2)])

Joining, by = "product_id"

I = left_join(I,TPC[,c(1,2)])

Joining, by = "product_category_name"

 a = I %>% group_by(product_category_name)%>% summarise(
  nCat = n())

a = left_join(a,TPC[,c(1,2)])

Joining, by = "product_category_name"

 top15 = a %>% top_n(15, nCat)
top15

# A tibble: 15 x 3
   product_category_name   nCat product_category_name_english
   <chr>                  <int> <chr>                        
 1 automotivo              4235 auto                         
 2 bebes                   3065 baby                         
 3 beleza_saude            9670 health_beauty                
 4 brinquedos              4117 toys                         
 5 cama_mesa_banho        11115 bed_bath_table               
 6 cool_stuff              3796 cool_stuff                   
 7 eletronicos             2767 electronics                  
 8 esporte_lazer           8641 sports_leisure               
 9 ferramentas_jardim      4347 garden_tools                 
10 informatica_acessorios  7827 computers_accessories        
11 moveis_decoracao        8334 furniture_decor              
12 perfumaria              3419 perfumery                    
13 relogios_presentes      5991 watches_gifts                
14 telefonia               4545 telephony                    
15 utilidades_domesticas   6964 housewares

top15 = left_join(top15,TPC[,c(1,2)])

Joining, by = c("product_category_name", "product_category_name_english")

top16 = rbind(top15, top15[15,])
top16$product_category_name_english[16] = 'others'
top16$nCat[16] = sum(a$nCat[!a$product_category_name_english %in% top15$product_category_name_english[1:15]])

top16$product_category_name_english <- factor(top16$product_category_name_english, levels = top16$product_category_name_english[order(-top16$nCat)])

ggplot(top16, aes(x=product_category_name_english, y=nCat)) +
  geom_histogram(stat="identity") + 
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
   xlab("Product Category") + 
    ylab("Total Item Sales ")

Warning: Ignoring unknown parameters: binwidth, bins, pad

####上表可以看到，銷售量前五名屬於生活用品類（購買頻率高）

每個地區的銷售量比較：長條圖

I = left_join(I,O[,c(1,2)])

Joining, by = "order_id"

I = left_join(I,C[,c(1,5)])

Joining, by = "customer_id"

a = I %>% group_by(customer_state)%>% summarise(
  nItems = n()
  )
top10 = a %>% top_n(10, nItems)

top10 = rbind(top10, top10[10,])
top10$customer_state[11] = 'others'
top10$nItems[11] = sum(a$nItems[! a$customer_state %in% top10$customer_state[1:10]])

#table(I$customer_state) %>% sort(decreasing = T) %>% prop.table() %>% cumsum()


#top10

top10$customer_state <- factor(top10$customer_state, levels = top10$customer_state[order(-top10$nItems)])

ggplot(top10, aes(x=customer_state, y=nItems)) +
  geom_histogram(stat="identity")+
   xlab("State") + 
    ylab("Total Item Sales ")

Warning: Ignoring unknown parameters: binwidth, bins, pad

#怎麼找出其他？（前十個銷售量以外的州，設為「other」）

上表可以看到，位於聖保羅的消費者佔了Olist大部分的銷售量

銷售量與各變量的相關性分析

load("data/olist.rdata")

P = left_join(P,count(I, product_id))

Joining, by = "product_id"

i = !is.na(P$product_photos_qty) #排除na
cor(P$product_photos_qty[i], P$n[i]) #求銷售量跟照片多寡的相關性

[1] 0.0038473

#整體銷售量跟照片多寡相關性低
colnames(TPC)[1] = "product_category_name"

P = left_join(P, TPC)

Joining, by = "product_category_name"

P2 = P[complete.cases(P),] #消除na

P2 %>% group_by(product_category_name_english) %>% summarise(
  nProd = n(),
  Corr = cor(n, product_photos_qty)
  ) %>% arrange(Corr)

Warning in cor(n, product_photos_qty): the standard deviation is zero

# A tibble: 71 x 3
   product_category_name_english     nProd   Corr
   <chr>                             <int>  <dbl>
 1 fashion_childrens_clothes             5 -0.867
 2 tablets_printing_image                9 -0.460
 3 furniture_mattress_and_upholstery    10 -0.356
 4 fashion_underwear_beach              53 -0.355
 5 home_comfort_2                        5 -0.25 
 6 la_cuisine                           10 -0.201
 7 diapers_and_hygiene                  12 -0.183
 8 fashion_male_clothing                95 -0.170
 9 flowers                              14 -0.165
10 fashion_sport                        19 -0.148
# … with 61 more rows

小結：有些商品類別的銷售量跟照片多寡具有相關性

例如：流行孩童服飾的銷售量與銷售量有高相關性（相關係數的絕對值0.8668451 > 0.7）

P = left_join(P, TPC)

Joining, by = c("product_category_name", "product_category_name_english")

P2 = P[complete.cases(P),] #消除na

P2 %>% group_by(product_category_name_english) %>% summarise(
  nProd = n(),
  Corr = cor(n, product_name_lenght)
  ) %>% arrange(Corr)

Warning in cor(n, product_name_lenght): the standard deviation is zero

# A tibble: 71 x 3
   product_category_name_english  nProd   Corr
   <chr>                          <int>  <dbl>
 1 fashion_childrens_clothes          5 -0.676
 2 fashion_sport                     19 -0.513
 3 home_comfort_2                     5 -0.432
 4 fashion_underwear_beach           53 -0.243
 5 flowers                           14 -0.222
 6 furniture_bedroom                 45 -0.210
 7 music                             27 -0.200
 8 agro_industry_and_commerce        74 -0.193
 9 furniture_living_room            156 -0.175
10 industry_commerce_and_business    68 -0.169
# … with 61 more rows

小結：有些商品類別的銷售量跟產品名字長度具有相關性

例如：流行孩童服飾的銷售量與產品名字長度有中度相關性（相關係數的絕對值0.4 < 0.67586633 < 0.7）

P = left_join(P, TPC)

Joining, by = c("product_category_name", "product_category_name_english")

P2 = P[complete.cases(P),] #消除na

P2 %>% group_by(product_category_name_english) %>% summarise(
  nProd = n(),
  Corr = cor(n, product_description_lenght)
  ) %>% arrange(Corr)

Warning in cor(n, product_description_lenght): the standard deviation is zero

# A tibble: 71 x 3
   product_category_name_english     nProd    Corr
   <chr>                             <int>   <dbl>
 1 fashion_childrens_clothes             5 -0.776 
 2 fashion_underwear_beach              53 -0.273 
 3 party_supplies                       26 -0.212 
 4 cine_photo                           28 -0.201 
 5 home_comfort_2                        5 -0.183 
 6 furniture_mattress_and_upholstery    10 -0.183 
 7 office_furniture                    309 -0.178 
 8 tablets_printing_image                9 -0.172 
 9 signaling_and_security               93 -0.161 
10 dvds_blu_ray                         48 -0.0983
# … with 61 more rows

小結：有些商品類別的銷售量跟產品描述長度具有相關性

例如：流行孩童服飾的銷售量跟產品描述長度有高度相關性（相關係數的絕對值0.77566003 > 0.7）

P = left_join(P, TPC)

Joining, by = c("product_category_name", "product_category_name_english")

P2 = P[complete.cases(P),] #消除na

P2 %>% group_by(product_category_name_english) %>% summarise(
  nProd = n(),
  Corr = cor(n, product_height_cm)
  ) %>% arrange(Corr)

Warning in cor(n, product_height_cm): the standard deviation is zero

# A tibble: 71 x 3
   product_category_name_english           nProd   Corr
   <chr>                                   <int>  <dbl>
 1 la_cuisine                                 10 -0.452
 2 furniture_mattress_and_upholstery          10 -0.408
 3 flowers                                    14 -0.314
 4 fashion_underwear_beach                    53 -0.281
 5 diapers_and_hygiene                        12 -0.216
 6 kitchen_dining_laundry_garden_furniture    94 -0.197
 7 audio                                      58 -0.195
 8 small_appliances                          231 -0.192
 9 market_place                              104 -0.174
10 agro_industry_and_commerce                 74 -0.166
# … with 61 more rows

小結：有些商品類別的銷售量跟產品高度具有相關性

例如：廚房用品的銷售量跟產品描述高度有中度相關性（相關係數的絕對值0.4 < 0.4521562 > 0.7）

P = left_join(P, TPC)

Joining, by = c("product_category_name", "product_category_name_english")

P2 = P[complete.cases(P),] #消除na

P2 %>% group_by(product_category_name_english) %>% summarise(
  nProd = n(),
  Corr = cor(n, product_width_cm)
  ) %>% arrange(Corr)

Warning in cor(n, product_width_cm): the standard deviation is zero

# A tibble: 71 x 3
   product_category_name_english           nProd   Corr
   <chr>                                   <int>  <dbl>
 1 fashion_childrens_clothes                   5 -0.520
 2 flowers                                    14 -0.259
 3 agro_industry_and_commerce                 74 -0.257
 4 food_drink                                104 -0.256
 5 office_furniture                          309 -0.219
 6 audio                                      58 -0.190
 7 kitchen_dining_laundry_garden_furniture    94 -0.174
 8 party_supplies                             26 -0.168
 9 fashion_shoes                             173 -0.168
10 small_appliances                          231 -0.159
# … with 61 more rows

小結：有些商品類別的銷售量跟產品寬度具有相關性

例如：流行孩童服飾的銷售量跟產品寬度有中度相關性（相關係數的絕對值0.4 < 0.51951 < 0.7）

賣家來源與銷售量：賣家

load("data/olist.rdata")

P= left_join(P,TPC[,c(1,2)])

Joining, by = "product_category_name"

Q=left_join(Q,L[,c(1,4)])

Joining, by = "mql_id"

new <- (S=left_join(S,Q[,c(1,2,15)]))

Joining, by = "seller_id"

I=left_join(I,P[,c(1,10)])

Joining, by = "product_id"

new = left_join(new,I[,c(8,4)])

Joining, by = "seller_id"

new2=new %>% group_by(new$seller_id) %>% summarize(
  nSellerS=n(),
  type=origin[1],
  category=product_category_name_english[1]
  ) %>% arrange(desc(nSellerS)) %>% ungroup()

CategoryVSOrigin = xtabs(nSellerS ~ category + type, new2)

(rowSums(CategoryVSOrigin)/sum(CategoryVSOrigin)) %>% cumsum %>% {which(. < .95)} %>% last

[1] 47

(colSums(CategoryVSOrigin)/sum(CategoryVSOrigin)) %>% cumsum %>% {which(. < .95)} %>% last

[1] 8

CategoryVSOrigin = as(CategoryVSOrigin[1:47, 1:8], "matrix")         # keep 8 origins & 55 categories
colnames(CategoryVSOrigin) = substr(colnames(CategoryVSOrigin),1 ,47) # use shorter names

d3heatmap(CategoryVSOrigin,F,F,col=cols, xaxis_font_size=10, yaxis_font_size=6)

range(CategoryVSOrigin[CategoryVSOrigin > 0])

[1]   1 551

因為log(0) = -Inf，所以取對數之前我們常需要先加進去一個數值，通常我們會選用矩陣中最小的數字

par(mfrow=c(1,2), cex=0.8)
hist(CategoryVSOrigin)
hist(log(50+CategoryVSOrigin,10)) # 商務應用裡面一般我們會用10底的對數

d3heatmap(log(51+CategoryVSOrigin,10),F,F,col=cols, xaxis_font_size=10, yaxis_font_size=6)

d3heatmap(log(51+CategoryVSOrigin,10),col=cols, xaxis_font_size=10, yaxis_font_size=6)

舉例來說，

販售health_beauty的賣家中，

以paid_search註冊的賣家最多，如果要徵求賣美妝的賣家，就可以多多利用付費廣告來吸引新用戶。

賣家來源與銷售量：銷售量

if(!require(morpheus)) install.packages('morpheus.R')
pacman::p_load(dplyr, ggplot2, d3heatmap, plotly, googleVis)
cols = colorRamp(c('seagreen','lightyellow','red'))

load("data/olist.rdata")

篩選出被審核通過的廠商，申請管道為何

revenue = sum(price)

廠商個別收益 = seller_id的order_id之sum(price)

SComplete = inner_join(S[, c(1)], Q[,c(1, 2, 12)])

Joining, by = "seller_id"

SComplete = inner_join(SComplete, L[, c(1, 4)])

Joining, by = "mql_id"

SComplete = inner_join(SComplete, I[, c(1, 3, 4, 6)])

Joining, by = "seller_id"

O = left_join(O, I[,c(1,4)])

Joining, by = "order_id"

SO = O %>% group_by(seller_id) %>% 
  summarize(nSellerSalesO=n()) %>% 
  arrange(desc(nSellerSalesO)) 
SO

# A tibble: 3,096 x 2
   seller_id                        nSellerSalesO
   <chr>                                    <int>
 1 6560211a19b47992c3666cc44a7e94c0          2033
 2 4a3ca9315b744ce9f8e9374361493884          1987
 3 1f50f920176fa81dab994f9023523100          1931
 4 cc419e0650a3c5ba77189a1882b7556a          1775
 5 da8622b14eb17ae2831f4ac5b9dab84a          1551
 6 955fee9216a65b617aa5c0531780ce60          1499
 7 1025f0e2d44d7041d6cf58b6550e0bfa          1428
 8 7c67e1448b00f6e969d365cea6b010ab          1364
 9 ea8482cd71df3c1969d7b9473ff13abc          1203
10 7a67c85e85bb2ce8582c35f2203ad736          1171
# … with 3,086 more rows

SComplete = left_join(SComplete, SO)

Joining, by = "seller_id"

SC_ORS = SComplete %>% group_by(origin) %>% 
  summarise(nOriginSC=n(), OriginRevenue=sum(price), avgRevenueTotalSales = OriginRevenue/nOriginSC) %>% arrange(desc(avgRevenueTotalSales)) %>% ungroup()
SC_ORS

# A tibble: 10 x 4
   origin         nOriginSC OriginRevenue avgRevenueTotalSales
   <chr>              <int>         <dbl>                <dbl>
 1 email                 24         8485.                354. 
 2 referral              77        17887.                232. 
 3 unknown             1372       213743.                156. 
 4 organic_search      1346       207023.                154. 
 5 display                7          923                 132. 
 6 <NA>                  11         1243.                113. 
 7 paid_search         1448       155277.                107. 
 8 direct_traffic       218        21904.                100. 
 9 social               444        43478.                 97.9
10 other                 97         6889.                 71.0

SC_ORS$avgRevenueTotalSales = SC_ORS$avgRevenueTotalSales %>% round(0)

SC_ORS$origin <- factor(SC_ORS$origin, levels=SC_ORS$origin[order(-SC_ORS$avgRevenueTotalSales)])

geom_col 可以有ＸＹ（非連續）

state_count 只能有Ｘ（非連續）

Origin vs AvgRevenueTotalSales

ggplot(SC_ORS, aes(x=reorder(origin, -avgRevenueTotalSales), y=avgRevenueTotalSales)) +
  geom_bar(width=0.75, stat="identity") +
  geom_text(aes(label=round(avgRevenueTotalSales)), size=3, vjust=-0.5, check_overlap=T)

Origin vs OriginRevenue

ggplot(SC_ORS, aes(x=reorder(origin, -OriginRevenue), y=OriginRevenue)) +
  geom_bar(width=0.75, stat="identity") +
  geom_text(aes(label=round(OriginRevenue)), size=3, vjust=-0.5, check_overlap=T)

SC_ORS$avgRevenueTotalSales = SC_ORS$avgRevenueTotalSales %>% round(0)
SO = left_join(SO, SComplete[, c(1, 4)])

Joining, by = "seller_id"

Origin vs OriginSales

ggplot(SO, aes(x=origin, y=nSellerSalesO)) +
  geom_bar(width=0.75, stat="identity") +
  geom_text(aes(label=round(nSellerSalesO)), size=3, check_overlap=T)

SComplete = left_join(SComplete, P[, c(1, 2)])

Joining, by = "product_id"

SComplete = left_join(SComplete, TPC)

Joining, by = "product_category_name"

SC_OCS = SComplete %>% group_by(product_category_name_english, origin) %>% 
  transmute(OriginTotalSales = sum(nSellerSalesO)) %>%
  ungroup()
SC_OCS

# A tibble: 5,044 x 3
   product_category_name_english origin         OriginTotalSales
   <chr>                         <chr>                     <int>
 1 bed_bath_table                paid_search                 373
 2 bed_bath_table                paid_search                 373
 3 bed_bath_table                paid_search                 373
 4 bed_bath_table                paid_search                 373
 5 garden_tools                  unknown                     593
 6 garden_tools                  organic_search              544
 7 sports_leisure                paid_search                3221
 8 baby                          paid_search                 648
 9 toys                          paid_search                1389
10 housewares                    paid_search                8197
# … with 5,034 more rows

OriginNumber VS Categories

par(cex=0.8)
table(SC_OCS$product_category_name_english) %>% sort(decreasing=T) %>% barplot(las=2)

CategoryVSOrigin = xtabs(OriginTotalSales ~ product_category_name_english + origin, SC_OCS)

(rowSums(CategoryVSOrigin)/sum(CategoryVSOrigin)) %>% cumsum %>% {which(. < .95)} %>% last

[1] 55

(colSums(CategoryVSOrigin)/sum(CategoryVSOrigin)) %>% cumsum %>% {which(. < .95)} %>% last

[1] 8

CategoryVSOrigin = as(CategoryVSOrigin[1:55, 1:8], "matrix")         # keep 8 origins & 55 categories
colnames(CategoryVSOrigin) = substr(colnames(CategoryVSOrigin),1 ,55) # use shorter names

try <- CompleteSa2[,1:3] test2=c(“origin”,“OriginTotalSales”,“product_category_name”)

d3heatmap(CategoryVSOrigin,F,F,col=cols, xaxis_font_size=10, yaxis_font_size=6)

range(CategoryVSOrigin[CategoryVSOrigin > 0])

[1]        1 57844116

因為log(0) = -Inf，所以取對數之前我們常需要先加進去一個數值，通常我們會選用矩陣中最小的數字

par(mfrow=c(1,2), cex=0.8)
hist(CategoryVSOrigin)
hist(log(50+CategoryVSOrigin,10)) # 商務應用裡面一般我們會用10底的對數

d3heatmap(log(51+CategoryVSOrigin,10),F,F,col=cols, xaxis_font_size=10, yaxis_font_size=6)

d3heatmap(log(51+CategoryVSOrigin,10),col=cols, xaxis_font_size=10, yaxis_font_size=6)

可以建議在有紅色方格的分類項目上打廣告，吸引該廠商加入平台。

seller_id from origin

SC_OSid = SComplete %>% group_by(origin) %>% 
  count(origin) %>% arrange(desc(n)) %>% ungroup()

nOriginSales = SC_OSid$n

ggplot(SC_OSid, aes(x=reorder(origin, -nOriginSales), y=nOriginSales)) +
  geom_bar(width=0.75, stat="identity") +
  geom_text(aes(label=round(nOriginSales)), size=3, vjust=-0.5, check_overlap=T)

RS = inner_join(S[, c(1)], SComplete[, c(1, 5, 6, 8)])

Joining, by = "seller_id"

RS = inner_join(RS, R[, c(1, 2, 3)])

Joining, by = "order_id"

RS = RS %>% group_by(seller_id) %>% 
  summarise(
    n=n(),AvgReviewScore=mean(review_score), SellerSales=sum(n))
RS$SellerSales

  [1]   3  39  44   9   4   2   1   8   9   6   1  19   5  12   1   3  10   3
 [19]   2   5   3   6   4   4   6  58   1   2  16   3   6   5  34   4   1   1
 [37]  30   1   1  12   2   1   2   1  25   1   3   3  18   6  14   1  29   1
 [55]   3  28   2   2   6   1   3   8   1  10   1  11   7   4  20   6   1  16
 [73]  10   5   9   8   2   1  49   5   1  38  10   3  24   2   1   7   1  10
 [91]  21  22   5   3   1   3   1   1   6   3  23   3   4  40  67   5   8   2
[109]   2   2   1   2  11  84   4   3   1   2  10  19   4   6  22  10   5  45
[127]  38  14   2  10   3  17  17   4  10   5   1   8   2   1   4   4  35 110
[145]   5   1   1   1   5   3  40  27   1   2   6  22   4  22  43  48   5   9
[163]   8   1   3   2  11   7  98  13  15  12   5   4   2   3   5  15  26   1
[181]  11   3  28   8   4   4 578   5   7   1   9  14  16   3  24  12   4  34
[199]   2   7  15   1   1   1  15   2   2  11 106   3   1   2   1   1   8   4
[217]   1   7   4   1   1   1  12   1  18  14  12   2   7   5   3   2   7   8
[235]   4   3  14   1  11   2   2   1   8   1   6   4   7   6   1   3   4   2
[253]  15   2   7  10  12   7  21  11   1  22   5   7  23   8   2  86  38   6
[271]   2   1   2   1  15   7  12   2  29   1   2  45   1   5   3   3  13  13
[289]   8  11   1   6 338   4  15   1   9   1  15   8  14  33   1   2  11  10
[307]  63   1   7   1  24  24  35   1   9  12  73   7   1   1  16   1   1   4
[325]  59   1  13  18   8   9  48  16   4   4   8   1   1   2  14  86   2   2
[343]   3   1  12  20   1  12   7   2  11   9   2   3   4   4   8   1   8  17
[361]   3   4   2  11   4   1   8  15  91   3   9   1  10   7  27   1   6  18
[379]  15  31

廠商銷售量和廠商評價不顯著

<br>

I = left_join(I, R[, c(2, 3)])

Joining, by = "order_id"

RS = I %>% group_by(seller_id) %>% summarize(
  nSellerSalesI=n(),
  avgReviewScore=mean(review_score)
) %>% arrange(desc(nSellerSalesI)) %>% 
  unique() %>% ungroup()
RS

# A tibble: 3,095 x 3
   seller_id                        nSellerSalesI avgReviewScore
   <chr>                                    <int>          <dbl>
 1 6560211a19b47992c3666cc44a7e94c0          2039           3.89
 2 4a3ca9315b744ce9f8e9374361493884          2009           3.78
 3 1f50f920176fa81dab994f9023523100          1940           3.98
 4 cc419e0650a3c5ba77189a1882b7556a          1819           4.06
 5 da8622b14eb17ae2831f4ac5b9dab84a          1574           4.07
 6 955fee9216a65b617aa5c0531780ce60          1501           4.04
 7 1025f0e2d44d7041d6cf58b6550e0bfa          1443           3.83
 8 7c67e1448b00f6e969d365cea6b010ab          1375           3.35
 9 ea8482cd71df3c1969d7b9473ff13abc          1204           3.94
10 7a67c85e85bb2ce8582c35f2203ad736          1175           4.22
# … with 3,085 more rows

SOP = left_join(S[, c(1)], SComplete[, c(1, 5, 6)])

Joining, by = "seller_id"

P = P %>% group_by(product_id) %>% 
  mutate(product_size_cm3=product_length_cm*product_height_cm*product_width_cm) %>% 
  ungroup()
P

# A tibble: 32,951 x 10
   product_id product_categor… product_name_le… product_descrip…
   <chr>      <chr>                       <int>            <int>
 1 1e9e8ef04… perfumaria                     40              287
 2 3aa071139… artes                          44              276
 3 96bd76ec8… esporte_lazer                  46              250
 4 cef67bcfe… bebes                          27              261
 5 9dc1a7de2… utilidades_dome…               37              402
 6 41d3672d4… instrumentos_mu…               60              745
 7 732bd381a… cool_stuff                     56             1272
 8 2548af3e6… moveis_decoracao               56              184
 9 37cc742be… eletrodomesticos               57              163
10 8c9210988… brinquedos                     36             1156
# … with 32,941 more rows, and 6 more variables: product_photos_qty <int>,
#   product_weight_g <int>, product_length_cm <int>, product_height_cm <int>,
#   product_width_cm <int>, product_size_cm3 <int>

PS = left_join(P[, c(1,10)], SComplete[, c(1,6,10)])

Joining, by = "product_id"

PS = left_join(PS, RS[, c(1,2)])

Joining, by = "seller_id"

PS = left_join(PS, SO[, c(1,2)])

Joining, by = "seller_id"

cor.test(PS$product_size_cm3, PS$nSellerSalesI)


    Pearson's product-moment correlation

data:  PS$product_size_cm3 and PS$nSellerSalesI
t = -285, df = 594172, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 -0.34939 -0.34492
sample estimates:
     cor 
-0.34716

舉例來說，

販售health_beauty的賣家中，

以paid_search和organic_search註冊的銷售量最多，

代表如果要篩選表現較佳的health_beauty賣家的話，

可以透過paid_search和organic_search的註冊來源來做區隔。

也就是說，可以在搜尋引擎上打health_beauty相關的廣告來吸引更多相關賣家。

集群分析-產品類別

load("data/olist.rdata")

category的分群

I = left_join(I,P[,c(1,2)])

Joining, by = "product_id"

I = left_join(I,TPC[,c(1,2)])

Joining, by = "product_category_name"

I = left_join(I,R[,c(2,3)])

Joining, by = "order_id"

A = I %>% group_by(product_category_name_english) %>% summarise(
  rev =  sum(price)) %>% ungroup() 

A %>% mutate( 
      totalrev = sum(A[,2]),
      rate_rev = rev/totalrev) %>% arrange(desc(rate_rev))

# A tibble: 72 x 4
   product_category_name_english      rev  totalrev rate_rev
   <chr>                            <dbl>     <dbl>    <dbl>
 1 health_beauty                 1263238. 13653185.   0.0925
 2 watches_gifts                 1206075. 13653185.   0.0883
 3 bed_bath_table                1051086. 13653185.   0.0770
 4 sports_leisure                 993685. 13653185.   0.0728
 5 computers_accessories          919721. 13653185.   0.0674
 6 furniture_decor                736682. 13653185.   0.0540
 7 cool_stuff                     637259. 13653185.   0.0467
 8 housewares                     634543. 13653185.   0.0465
 9 auto                           594363. 13653185.   0.0435
10 garden_tools                   486432. 13653185.   0.0356
# … with 62 more rows

# A tibble: 72 x 2
   product_category_name_english      rev
   <chr>                            <dbl>
 1 agro_industry_and_commerce      72530.
 2 air_conditioning                55025.
 3 art                             24203.
 4 arts_and_craftmanship            1814.
 5 audio                           50738.
 6 auto                           594363.
 7 baby                           412117.
 8 bed_bath_table                1051086.
 9 books_general_interest          46857.
10 books_imported                   4640.
# … with 62 more rows

table(I$product_category_name_english) %>% sort(decreasing = T) %>% prop.table() %>% cumsum()

                         bed_bath_table                           health_beauty 
                                0.10093                                 0.18803 
                         sports_leisure                         furniture_decor 
                                0.26593                                 0.34129 
                  computers_accessories                              housewares 
                                0.41198                                 0.47455 
                          watches_gifts                               telephony 
                                0.52828                                 0.56902 
                           garden_tools                                    auto 
                                0.60807                                 0.64618 
                                   toys                              cool_stuff 
                                0.68311                                 0.71719 
                              perfumery                                    baby 
                                0.74795                                 0.77546 
                            electronics                              stationery 
                                0.80025                                 0.82286 
               fashion_bags_accessories                                pet_shop 
                                0.84122                                 0.85867 
                       office_furniture                          consoles_games 
                                0.87390                                 0.88409 
                    luggage_accessories         construction_tools_construction 
                                0.89386                                 0.90220 
                        home_appliances                        small_appliances 
                                0.90944                                 0.91556 
                    musical_instruments                       home_construction 
                                0.92166                                 0.92707 
                 books_general_interest                                    food 
                                0.93202                                 0.93659 
                  furniture_living_room                            home_confort 
                                0.94113                                 0.94504 
                                 drinks                                   audio 
                                0.94847                                 0.95174 
                           market_place               construction_tools_lights 
                                0.95452                                 0.95726 
                       air_conditioning kitchen_dining_laundry_garden_furniture 
                                0.95991                                 0.96244 
                             food_drink                         books_technical 
                                0.96495                                 0.96735 
         industry_commerce_and_business                           fashion_shoes 
                                0.96976                                 0.97214 
                        fixed_telephony                       home_appliances_2 
                                0.97451                                 0.97667 
               costruction_tools_garden              agro_industry_and_commerce 
                                0.97882                                 0.98071 
                                    art                               computers 
                                0.98259                                 0.98440 
                 signaling_and_security               construction_tools_safety 
                                0.98618                                 0.98792 
                     christmas_supplies                   fashion_male_clothing 
                                0.98929                                 0.99047 
                fashion_underwear_beach                       furniture_bedroom 
                                0.99165                                 0.99263 
                costruction_tools_tools                  tablets_printing_image 
                                0.99355                                 0.99430 
  small_appliances_home_oven_and_coffee                              cine_photo 
                                0.99498                                 0.99563 
                           dvds_blu_ray                          books_imported 
                                0.99620                                 0.99674 
                 fashio_female_clothing                          party_supplies 
                                0.99719                                 0.99757 
                    diapers_and_hygiene       furniture_mattress_and_upholstery 
                                0.99792                                 0.99826 
                                  music                                 flowers 
                                0.99860                                 0.99890 
                          fashion_sport                          home_comfort_2 
                                0.99918                                 0.99944 
                  arts_and_craftmanship                       cds_dvds_musicals 
                                0.99966                                 0.99979 
                             la_cuisine               fashion_childrens_clothes 
                                0.99991                                 0.99998 
                  security_and_services 
                                1.00000

#取前15個category（佔銷量80%）

Z = I %>% group_by(product_category_name_english) %>% summarise(
  ItemsSold = n(),#賣多少產品
  Rev = sum(price),#賺多少營收
  noProd = n_distinct(product_id),#賣多少不一樣的產品
  avgPrice = mean(price),#  平均產品售價
  avgFreight = mean(freight_value),#平均運費
  avgRevProd = Rev/noProd,#平均產品會產生多少營收
  avgItemsProd = ItemsSold/noProd,#平均每個產品賣了多少個項目
  avgscore = mean(review_score)
  )
summary(Z)

 product_category_name_english   ItemsSold          Rev         
 Length:72                     Min.   :    2   Min.   :    283  
 Class :character              1st Qu.:   98   1st Qu.:   9356  
 Mode  :character              Median :  290   Median :  47151  
                               Mean   : 1574   Mean   : 189628  
                               3rd Qu.: 1763   3rd Qu.: 197326  
                               Max.   :11272   Max.   :1263238  
     noProd          avgPrice        avgFreight     avgRevProd    
 Min.   :   1.0   Min.   :  25.3   Min.   :11.9   Min.   :  79.3  
 1st Qu.:  37.0   1st Qu.:  80.7   1st Qu.:16.3   1st Qu.: 199.4  
 Median :  99.5   Median : 113.9   Median :19.2   Median : 345.1  
 Mean   : 457.6   Mean   : 144.2   Mean   :21.7   Mean   : 501.2  
 3rd Qu.: 543.5   3rd Qu.: 147.0   3rd Qu.:22.7   3rd Qu.: 514.7  
 Max.   :3029.0   Max.   :1098.3   Max.   :48.5   Max.   :7432.1  
  avgItemsProd      avgscore   
 Min.   : 1.00   Min.   :2.50  
 1st Qu.: 2.32   1st Qu.:3.92  
 Median : 2.80   Median :4.04  
 Mean   : 3.32   Mean   :4.01  
 3rd Qu.: 3.92   3rd Qu.:4.15  
 Max.   :14.00   Max.   :4.64

zz = c("bed_bath_table","health_beauty","sports_leisure","furniture_decor",
       "computers_accessories","housewares","watches_gifts",
       "telephony","garden_tools","auto","toys","cool_stuff",
       "perfumery","baby","electronics","stationery","fashion_bags_accessories",
       "pet_shop","office_furniture","consoles_games")


Z = Z %>% filter(product_category_name_english %in% zz) %>% data.frame()
rownames(Z) = Z$product_category_name

pca = PCA(Z[,2:9])

pca

**Results for the Principal Component Analysis (PCA)**
The analysis was performed on 20 individuals, described by 8 variables
*The results are available in the following objects:

   name               description                          
1  "$eig"             "eigenvalues"                        
2  "$var"             "results for the variables"          
3  "$var$coord"       "coord. for the variables"           
4  "$var$cor"         "correlations variables - dimensions"
5  "$var$cos2"        "cos2 for the variables"             
6  "$var$contrib"     "contributions of the variables"     
7  "$ind"             "results for the individuals"        
8  "$ind$coord"       "coord. for the individuals"         
9  "$ind$cos2"        "cos2 for the individuals"           
10 "$ind$contrib"     "contributions of the individuals"   
11 "$call"            "summary statistics"                 
12 "$call$centre"     "mean of the variables"              
13 "$call$ecart.type" "standard error of the variables"    
14 "$call$row.w"      "weights for the individuals"        
15 "$call$col.w"      "weights for the variables"

get_eigenvalue(pca)

      eigenvalue variance.percent cumulative.variance.percent
Dim.1  3.1203503        39.004379                      39.004
Dim.2  2.6749757        33.437196                      72.442
Dim.3  1.1793839        14.742298                      87.184
Dim.4  0.7904394         9.880492                      97.064
Dim.5  0.1972855         2.466068                      99.530
Dim.6  0.0190277         0.237847                      99.768
Dim.7  0.0122745         0.153431                      99.922
Dim.8  0.0062631         0.078289                     100.000

pca$var$coord`: 各變數在各尺度的座標

pca$var$coord

                Dim.1    Dim.2     Dim.3     Dim.4        Dim.5
ItemsSold    -0.44894  0.86579 -0.144706  0.149039  0.029565616
Rev          -0.19068  0.95096  0.220862  0.026485  0.000016669
noProd       -0.61772  0.73741 -0.209503 -0.138018  0.052329209
avgPrice      0.61992  0.39552  0.542480 -0.392485 -0.083897157
avgFreight    0.73761  0.14516 -0.460213 -0.368823  0.294231113
avgRevProd    0.84597  0.40236  0.332940  0.070773 -0.005395069
avgItemsProd  0.70034  0.21154 -0.036928  0.671724  0.096360373
avgscore     -0.60528 -0.30521  0.668939  0.046675  0.301243908

pca$var$coord`: 各變數在各尺度呈現的資訊比率

pca$var$cos2

                Dim.1    Dim.2     Dim.3      Dim.4            Dim.5
ItemsSold    0.201543 0.749586 0.0209398 0.02221261 0.00087412562087
Rev          0.036358 0.904316 0.0487801 0.00070147 0.00000000027785
noProd       0.381581 0.543778 0.0438914 0.01904898 0.00273834609718
avgPrice     0.384295 0.156436 0.2942844 0.15404484 0.00703873296966
avgFreight   0.544063 0.021070 0.2117959 0.13603043 0.08657194768519
avgRevProd   0.715670 0.161891 0.1108489 0.00500884 0.00002910676947
avgItemsProd 0.490475 0.044749 0.0013637 0.45121365 0.00928532156984
avgscore     0.366365 0.093150 0.4474796 0.00217854 0.09074789199767

將變數投射到主成分空間

fviz_pca_var(pca)

【C】縮減空間中的個體 (Individuals)

pca$ind$coord`: 個體在各尺度的座標

pca$ind$coord

                             Dim.1      Dim.2     Dim.3      Dim.4     Dim.5
auto                     -0.746752  0.0090012  0.068352 -1.7669832 -0.077742
baby                      0.637063 -0.7166511  0.108497 -0.7824179 -0.122098
bed_bath_table           -1.705067  2.7134632 -1.321038  0.4040085 -0.377621
computers_accessories     0.287958  1.6308688 -0.140517  1.0258632 -0.287283
consoles_games            0.992674 -1.7848938  0.737757 -0.2747416 -0.832805
cool_stuff                1.978962  0.4033102  1.760035  0.1632432  0.564118
electronics               0.084395 -1.9841621 -0.755362  2.1222250  0.097351
fashion_bags_accessories -1.594076 -2.5061866 -0.110341 -0.2825151 -0.288563
furniture_decor          -1.481470  1.2904760 -1.581025 -0.2653104 -0.206925
garden_tools              1.699746 -0.0947168  0.105496  1.4826295  0.724244
health_beauty            -1.053489  2.6626509  0.763691  0.2706058  0.527869
housewares               -1.557286  0.4977828 -0.863039 -0.4395855  0.368192
office_furniture          5.802838  0.4008194 -2.333619 -0.9895464  0.109087
perfumery                -0.164657 -1.0356523  1.011108  0.5064239 -0.010141
pet_shop                 -0.519804 -2.0417267  0.429733 -0.8214849  0.376770
sports_leisure           -1.851038  1.8666533 -0.067120 -0.5135895  0.403934
stationery               -0.928725 -2.0023386  0.227009 -0.2449615  0.367443
telephony                -0.717147 -1.0705847 -0.942613  1.0150711 -0.672900
toys                     -0.859137 -0.6668207  0.468714 -0.6109624  0.162937
watches_gifts             1.695010  2.4287077  2.434282  0.0020282 -0.825866

pca$ind$coord`: 個體在各尺度呈現的資訊比率

pca$ind$cos2

                              Dim.1       Dim.2      Dim.3         Dim.4
auto                     0.14762530 0.000021449 0.00123682 0.82655790777
baby                     0.25617180 0.324176982 0.00743019 0.38640628687
bed_bath_table           0.23539177 0.596150988 0.14129890 0.01321567910
computers_accessories    0.02102763 0.674482835 0.00500712 0.26687738722
consoles_games           0.17785966 0.575027398 0.09824061 0.01362425071
cool_stuff               0.51623600 0.021441334 0.40833457 0.00351272060
electronics              0.00078264 0.432596528 0.06269592 0.49489334232
fashion_bags_accessories 0.27953469 0.690946589 0.00133933 0.00878013273
furniture_decor          0.33788556 0.256379885 0.38482355 0.01083659189
garden_tools             0.51226305 0.001590667 0.00197331 0.38975376256
health_beauty            0.12050193 0.769773087 0.06332412 0.00795075482
housewares               0.64394427 0.065794769 0.19777551 0.05130957942
office_furniture         0.83596606 0.003988460 0.13519713 0.02430972272
perfumery                0.01128729 0.446538713 0.42562398 0.10677251550
pet_shop                 0.04938752 0.761962848 0.03375479 0.12334961526
sports_leisure           0.46473180 0.472605722 0.00061106 0.03577700401
stationery               0.16773905 0.779712583 0.01002180 0.01166957766
telephony                0.12750448 0.284152384 0.22028029 0.25544778828
toys                     0.40910696 0.246450864 0.12176644 0.20689081171
watches_gifts            0.18645109 0.382799174 0.38455825 0.00000026697
                               Dim.5
auto                     0.001600008
baby                     0.009409910
bed_bath_table           0.011545692
computers_accessories    0.020929219
consoles_games           0.125184514
cool_stuff               0.041948124
electronics              0.001041382
fashion_bags_accessories 0.009160061
furniture_decor          0.006591896
garden_tools             0.093002618
health_beauty            0.030254227
housewares               0.035996436
office_furniture         0.000295427
perfumery                0.000042814
pet_shop                 0.025947223
sports_leisure           0.022130564
stationery               0.026256678
telephony                0.112256024
toys                     0.014714689
watches_gifts            0.044262851

將變數投射到主成分空間

fviz_pca_ind(pca)

【E】將個體分群

Kmeans集群分析

set.seed(99)
kmg = kmeans(scale(Z[,2:8]), 4)$cluster %>% factor
table(kmg)

kmg
 1  2  3  4 
 4 10  1  5

將個體和變數投射到主成分空間

fviz_pca_biplot(
  pca, pointsize="cos2", repel=T, labelsize=3,
  col.var="red", col.ind="#E7B800", alpha.ind=0.3)

主成分空間的選擇

fviz_pca_biplot(
  pca, axes=c(1,3),
  repel=T, col.var="black", labelsize=3,
  col.ind=kmg, alpha.ind=0.6, pointshape=16, 
  addEllipses = T, ellipse.level = 0.65, 
  mean.point = FALSE) + 
  theme(legend.position = "none")

Too few points to calculate an ellipse

#category的群集分析

(紫色賣家群集)：

巴西熱門生活用品類

前幾大銷量最高的品項都在裡面（美容保健、家居用品、Deco…）

佔總銷售額整體最高

(橘色定位)：

偏高單價、購買頻率普通的商品

平均貢獻銷量比率中上

佔總銷售額比率高

集群分析-產業類別

rm(list=ls(all=T))
load("data/olist.rdata")
load("data/Z.rdata")

segment = group_by(Z1, business_segment) %>% summarise(
  noSellers = n(),
  avgScore = mean(avgScore),
  totalRev = sum(Rev),
  avgItemsSold = mean(ItemsSold),
  avgPrice = totalRev/avgItemsSold,
  avgRev = mean(Rev),
  avgReview = mean(noReview)
  ) %>% arrange( desc(totalRev) )
segment

# A tibble: 29 x 8
   business_segment noSellers avgScore totalRev avgItemsSold avgPrice avgRev
   <chr>                <int>    <dbl>    <dbl>        <dbl>    <dbl>  <dbl>
 1 watches                  3     3.03  117279.       198        592. 39093.
 2 health_beauty           45     4.51   90836.        18.2     4979.  2019.
 3 household_utili…        44     4.47   51569.        12.8     4045.  1172.
 4 audio_video_ele…        31     4.03   50265.         8.87    5666.  1621.
 5 small_appliances         7     4.56   47675.         9.86    4837.  6811.
 6 home_decor              44     4.16   44102.        10.3     4293.  1002.
 7 pet                     17     4.39   40499.        16.8     2407.  2382.
 8 construction_to…        32     4.33   34539.         9.94    3476.  1079.
 9 car_accessories         30     4.03   30175.         5.47    5520.  1006.
10 home_appliances          5     4.02   26241.        28.2      931.  5248.
# … with 19 more rows, and 1 more variable: avgReview <dbl>

Merge TPC$product_category_name_english into P as P$category

names(TPC)[1]="product_category_name"
P = left_join(P, TPC) %>% rename(category = product_category_name_english)

Joining, by = "product_category_name"

Merge P$category intoI`

I = left_join(I, P[,c(1,10)])

Joining, by = "product_id"

Summerise by category

category = filter(I, !is.na(category)) %>% 
  group_by(category) %>% summarise(
    itemsSold = n(),
    totalRev = sum(price),
    avgPrice = mean(price),
    noProduct = n_distinct(product_id),
    noSeller = n_distinct(seller_id),
    dummy = 2018
  ) %>% arrange(desc(totalRev))

Top-20 categories

top20 = category %>% top_n(20, totalRev)
top20

# A tibble: 20 x 7
   category              itemsSold totalRev avgPrice noProduct noSeller dummy
   <chr>                     <int>    <dbl>    <dbl>     <int>    <int> <dbl>
 1 health_beauty              9670 1258681.    130.       2444      492  2018
 2 watches_gifts              5991 1205006.    201.       1329      101  2018
 3 bed_bath_table            11115 1036989.     93.3      3029      196  2018
 4 sports_leisure             8641  988049.    114.       2867      481  2018
 5 computers_accessories      7827  911954.    117.       1639      287  2018
 6 furniture_decor            8334  729762.     87.6      2657      370  2018
 7 cool_stuff                 3796  635291.    167.        789      267  2018
 8 housewares                 6964  632249.     90.8      2335      468  2018
 9 auto                       4235  592720.    140.       1900      383  2018
10 garden_tools               4347  485256.    112.        753      237  2018
11 toys                       4117  483947.    118.       1411      252  2018
12 baby                       3065  411765.    134.        919      244  2018
13 perfumery                  3419  399125.    117.        868      175  2018
14 telephony                  4545  323668.     71.2      1134      149  2018
15 office_furniture           1691  273961.    162.        309       34  2018
16 stationery                 2517  230943.     91.8       849      173  2018
17 computers                   203  222963.   1098.         30        9  2018
18 pet_shop                   1947  214315.    110.        719      137  2018
19 musical_instruments         680  191499.    282.        289       70  2018
20 small_appliances            679  190649.    281.        231      105  2018

segment2=segment %>% filter(business_segment!="watches")%>% data.frame
rownames(segment2) = segment2$business_segment

pca = PCA(segment2[,2:8])

【D】同時投射個體和變數 (Bi-ploy)

將個體和變數投射到主成分空間

#把兩個徒合併
fviz_pca_biplot(
  pca, pointsize="cos2", repel=T, labelsize=3,
  col.var="red", col.ind="#E7B800", alpha.ind=0.3)

【E】將個體分群

Kmeans集群分析

#尺度分析和及群分析做搭配
#層級樹的及群分析(資料不能太多)
#kmg:標準化建立比較基準相同的權重
#cluster 分群向量
#分群玩 :美群多少人 共同特徵(summrise group mean)

set.seed(444)
kmg = kmeans(scale(segment2[,2:8]),4)$cluster %>% factor
table(kmg)

kmg
 1  2  3  4 
 3  8  5 12

#請勿動此區參數！！！！！！

將個體和變數投射到主成分空間

fviz_pca_biplot(
  pca, repel=T, col.var="black", labelsize=3,
  col.ind=kmg, alpha.ind=0.6, pointshape=16, 
  addEllipses = T, ellipse.level = 0.65, 
  mean.point = FALSE) + 
  theme(legend.position = "none")

Too few points to calculate an ellipse

以賣家的集群分析來看可以分為三大集群

第一個(紫)是高單價群集平均產品收入高

運費高

總體分數低

單價偏高，銷售額佔總體也偏高–>顧客對賣家有黏著度/產品本身信賴感就強

採取策略：

不打價格促銷戰

找知名品牌合作–>增加對賣家的信賴感

第二個(綠)是消費周期較長的品類群集品項內容主要也是日常用品

但平均的產品消費周期較長因此在短時間內沒有特別的銷量表現

第三個(橘)是日常生活用品 (賣家很多)

銷售量、收入與評分偏高

且運費低廉

在價格不高的情況下創造出銷售額最高–>需求大、面向消費者喜好

整體表現最佳

採取策略：

擴大通路佔有率

物流狀況分析

rm(list=ls(all=T))
load("data/olist.rdata")

計算賣家販售的商品種類

P=P%>%left_join(TPC[,c(1:2)])

Joining, by = "product_category_name"

I=I%>%left_join(P[,c(1,10)])

Joining, by = "product_id"

I=I%>%left_join(R[,c(2,3)])

Joining, by = "order_id"

I=I%>%left_join(O[,c(1,3)])

Joining, by = "order_id"

new=I[,c(1,4,6,9,10)]

new=new %>% left_join(O[,c(1,2,4:8)])

Joining, by = "order_id"

new=new %>% left_join(S[,c(1,4)])

Joining, by = "seller_id"

table(new$order_status,new$review_score)

             
                  1     2     3     4     5
  approved        2     0     0     1     0
  canceled      416    36    36    14    44
  delivered   13044  3802  9350 21246 63406
  invoiced      268    33    16    20    27
  processing    302    24    12     7    13
  shipped       749    89   130    93   135
  unavailable     6     0     0     0     1

new2=new %>% group_by(seller_id) %>% summarise(
  n = n(),
  avScore = mean(review_score, na.rm=T),
  rtDelivered = mean(order_status=="delivered"),
  delay=mean(
    difftime(order_delivered_customer_date,order_estimated_delivery_date, units="days"),
    na.rm=T),
  rtDelay = mean(
    order_delivered_customer_date>order_estimated_delivery_date, na.rm=T),
  carry = mean(
    difftime(order_delivered_customer_date, order_purchase_timestamp, units="days"),
    na.rm=T),
  seller_state = seller_state[1] 

  ) %>% filter(n > 100) %>% arrange(rtDelivered)

#
ggplot(new2, aes(x=rtDelay, y=rtDelivered, col=avScore)) +
  geom_point(aes(size=n), alpha=0.3)+
  geom_text(aes(label=seller_state), size=3)->g

ggplotly(g)

各地區買家的綜合物流狀況

table(new2$seller_state)


 BA  DF  ES  MA  MG  PE  PR  RJ  RS  SC  SP 
  1   2   1   1  24   1  17   9   5  12 165

new2 = new2 %>% mutate(
  state = ifelse(seller_state %in% c("SP","SC","RS","RJ","PR","MG"),
                 seller_state, "Others"))
table(new2$state)


    MG Others     PR     RJ     RS     SC     SP 
    24      6     17      9      5     12    165

new2 %>% filter(state=="SP") %>% 
ggplot(aes(x=rtDelay, y=rtDelivered, col=avScore)) +
  geom_point(aes(size=n), alpha=0.6)+
  ggtitle("Delivery Performance by SP Sellers") ->g
ggplotly(g)

new2 %>% filter(state!="SP") %>% 
ggplot(aes(x=rtDelay, y=rtDelivered, col=avScore)) +
  geom_point(aes(size=n), alpha=0.6)+
  ggtitle("Deleivery Performane by State") +
  facet_wrap(~state)

從剛剛的賣家產業類別

來看我們發現蓋改善貨運的狀況可以有效提高顧客的滿意度

因此我們往下分析每個地區的物流狀況

以銷售額最高的聖保羅來看大部分的賣家產品抵達率都在九成以上/但有部分賣家的延遲率高達15%

為了能給予顧客更好的消費體驗

我們建議在物流的表現上產品抵達率比須高於95%/延遲率則必須低於10%

因此可以針對這兩個部分的賣家加強物流的經營管理

物流狀況分析：Delay時間與銷售量

load("data/olist.rdata")
cols = colorRamp(c('seagreen','lightyellow','red'))

#運送時間差和地點的關係
O = O %>% 
  mutate(delay = (
    as.Date(order_delivered_customer_date)-as.Date(order_estimated_delivery_date) 
      ) %>% as.integer
    )

I = left_join(I, O)

Joining, by = "order_id"

I = left_join(I, S)

Joining, by = "seller_id"

summary(I)

   order_id         order_item_id   product_id         seller_id        
 Length:112650      Min.   : 1.0   Length:112650      Length:112650     
 Class :character   1st Qu.: 1.0   Class :character   Class :character  
 Mode  :character   Median : 1.0   Mode  :character   Mode  :character  
                    Mean   : 1.2                                        
                    3rd Qu.: 1.0                                        
                    Max.   :21.0                                        
                                                                        
 shipping_limit_date               price        freight_value  
 Min.   :2016-09-19 00:15:34   Min.   :   0.8   Min.   :  0.0  
 1st Qu.:2017-09-20 20:57:27   1st Qu.:  39.9   1st Qu.: 13.1  
 Median :2018-01-26 13:59:35   Median :  75.0   Median : 16.3  
 Mean   :2018-01-07 15:36:52   Mean   : 120.7   Mean   : 20.0  
 3rd Qu.:2018-05-10 14:34:00   3rd Qu.: 134.9   3rd Qu.: 21.1  
 Max.   :2020-04-09 22:35:08   Max.   :6735.0   Max.   :409.7  
                                                               
 customer_id        order_status       order_purchase_timestamp     
 Length:112650      Length:112650      Min.   :2016-09-04 21:15:19  
 Class :character   Class :character   1st Qu.:2017-09-13 19:17:04  
 Mode  :character   Mode  :character   Median :2018-01-19 23:02:16  
                                       Mean   :2018-01-01 00:09:48  
                                       3rd Qu.:2018-05-04 17:30:36  
                                       Max.   :2018-09-03 09:06:57  
                                                                    
 order_approved_at             order_delivered_carrier_date 
 Min.   :2016-09-15 12:16:38   Min.   :2016-10-08 10:34:01  
 1st Qu.:2017-09-14 03:06:05   1st Qu.:2017-09-18 20:41:02  
 Median :2018-01-20 13:42:05   Median :2018-01-24 18:44:52  
 Mean   :2018-01-01 11:46:01   Mean   :2018-01-05 02:22:56  
 3rd Qu.:2018-05-05 03:10:16   3rd Qu.:2018-05-08 13:20:15  
 Max.   :2018-09-03 17:40:06   Max.   :2018-09-11 19:48:28  
 NA's   :15                    NA's   :1194                 
 order_delivered_customer_date order_estimated_delivery_date     delay     
 Min.   :2016-10-11 13:46:32   Min.   :2016-10-04 00:00:00   Min.   :-147  
 1st Qu.:2017-09-26 20:09:44   1st Qu.:2017-10-05 00:00:00   1st Qu.: -17  
 Median :2018-02-02 20:57:23   Median :2018-02-16 00:00:00   Median : -13  
 Mean   :2018-01-14 13:25:24   Mean   :2018-01-24 20:12:15   Mean   : -12  
 3rd Qu.:2018-05-15 20:09:21   3rd Qu.:2018-05-28 00:00:00   3rd Qu.:  -7  
 Max.   :2018-10-17 13:22:46   Max.   :2018-10-25 00:00:00   Max.   : 188  
 NA's   :2454                                                NA's   :2454  
 seller_zip_code_prefix seller_city        seller_state      
 Length:112650          Length:112650      Length:112650     
 Class :character       Class :character   Class :character  
 Mode  :character       Mode  :character   Mode  :character

I %>% filter(!is.na(delay)) %>% 
  group_by(seller_state) %>% summarise(
    nSeller = n_distinct(seller_id),
    nOrder = n_distinct(order_id),
    rDelay = mean(delay > 0),
    avgDelay = mean(delay)) %>% 
  arrange(desc(rDelay))

# A tibble: 22 x 5
   seller_state nSeller nOrder rDelay avgDelay
   <chr>          <int>  <int>  <dbl>    <dbl>
 1 AM                 1      3 0.333       9  
 2 MA                 1    389 0.194     -11.3
 3 RN                 5     51 0.0714    -13.5
 4 SP              1769  68637 0.0711    -11.3
 5 RJ               163   4230 0.0691    -12.5
 6 CE                12     87 0.0667    -13.4
 7 DF                30    808 0.0600    -13.2
 8 MS                 5     49 0.06      -17.4
 9 ES                22    310 0.0577    -13.4
10 PR               335   7512 0.0530    -14.2
# … with 12 more rows

df = group_by(I, order_id, seller_id) %>% tally %>% left_join(S[,c(1,4)])

Joining, by = "seller_id"

O2 = left_join(O, df[,c(1,4)])

Joining, by = "order_id"

O2 = left_join(O2,C[,c(1,5)])

Joining, by = "customer_id"

summary(O2)

   order_id         customer_id        order_status      
 Length:100785      Length:100785      Length:100785     
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
                                                         
 order_purchase_timestamp      order_approved_at            
 Min.   :2016-09-04 21:15:19   Min.   :2016-09-15 12:16:38  
 1st Qu.:2017-09-12 21:22:46   1st Qu.:2017-09-13 12:50:20  
 Median :2018-01-19 12:38:28   Median :2018-01-19 19:49:46  
 Mean   :2017-12-31 19:45:51   Mean   :2018-01-01 05:40:13  
 3rd Qu.:2018-05-04 22:23:55   3rd Qu.:2018-05-05 03:35:20  
 Max.   :2018-10-17 17:30:18   Max.   :2018-09-03 17:40:06  
                               NA's   :160                  
 order_delivered_carrier_date  order_delivered_customer_date
 Min.   :2016-10-08 10:34:01   Min.   :2016-10-11 13:46:32  
 1st Qu.:2017-09-18 17:57:41   1st Qu.:2017-09-26 18:02:59  
 Median :2018-01-24 18:45:17   Median :2018-02-02 22:51:49  
 Mean   :2018-01-05 08:27:43   Mean   :2018-01-14 21:56:38  
 3rd Qu.:2018-05-08 14:25:00   3rd Qu.:2018-05-16 13:48:48  
 Max.   :2018-09-11 19:48:28   Max.   :2018-10-17 13:22:46  
 NA's   :1784                  NA's   :2968                 
 order_estimated_delivery_date     delay        seller_state      
 Min.   :2016-09-30 00:00:00   Min.   :-147.0   Length:100785     
 1st Qu.:2017-10-04 00:00:00   1st Qu.: -17.0   Class :character  
 Median :2018-02-15 00:00:00   Median : -13.0   Mode  :character  
 Mean   :2018-01-24 14:47:32   Mean   : -11.9                     
 3rd Qu.:2018-05-28 00:00:00   3rd Qu.:  -7.0                     
 Max.   :2018-11-12 00:00:00   Max.   : 188.0                     
                               NA's   :2968                       
 customer_state    
 Length:100785     
 Class :character  
 Mode  :character

mx = tapply(O2$delay, list(O2$customer_state, O2$seller_state), mean)
mx[is.na(mx)] = 0
mx = mx[order(-rowSums(mx)),]
mx = mx[,order(-colSums(mx))]

N = 12
mx[1:N, 1:N] %>%  d3heatmap(F,F,col=cols)

O = left_join(O, C[,c(1,5)])

Joining, by = "customer_id"

summary(O)

   order_id         customer_id        order_status      
 Length:99441       Length:99441       Length:99441      
 Class :character   Class :character   Class :character  
 Mode  :character   Mode  :character   Mode  :character  
                                                         
                                                         
                                                         
                                                         
 order_purchase_timestamp      order_approved_at            
 Min.   :2016-09-04 21:15:19   Min.   :2016-09-15 12:16:38  
 1st Qu.:2017-09-12 14:46:19   1st Qu.:2017-09-12 23:24:16  
 Median :2018-01-18 23:04:36   Median :2018-01-19 11:36:13  
 Mean   :2017-12-31 08:43:12   Mean   :2017-12-31 18:35:24  
 3rd Qu.:2018-05-04 15:42:16   3rd Qu.:2018-05-04 20:35:10  
 Max.   :2018-10-17 17:30:18   Max.   :2018-09-03 17:40:06  
                               NA's   :160                  
 order_delivered_carrier_date  order_delivered_customer_date
 Min.   :2016-10-08 10:34:01   Min.   :2016-10-11 13:46:32  
 1st Qu.:2017-09-15 22:28:50   1st Qu.:2017-09-25 22:07:22  
 Median :2018-01-24 16:10:58   Median :2018-02-02 19:28:10  
 Mean   :2018-01-04 21:49:48   Mean   :2018-01-14 12:09:19  
 3rd Qu.:2018-05-08 13:37:45   3rd Qu.:2018-05-15 22:48:52  
 Max.   :2018-09-11 19:48:28   Max.   :2018-10-17 13:22:46  
 NA's   :1783                  NA's   :2965                 
 order_estimated_delivery_date     delay        customer_state    
 Min.   :2016-09-30 00:00:00   Min.   :-147.0   Length:99441      
 1st Qu.:2017-10-03 00:00:00   1st Qu.: -17.0   Class :character  
 Median :2018-02-15 00:00:00   Median : -12.0   Mode  :character  
 Mean   :2018-01-24 03:08:37   Mean   : -11.9                     
 3rd Qu.:2018-05-25 00:00:00   3rd Qu.:  -7.0                     
 Max.   :2018-11-12 00:00:00   Max.   : 188.0                     
                               NA's   :2965

range(O$delay, na.rm=T)

[1] -147  188

table(O$order_status, O$delay > 0, useNA='ifany')

             
              FALSE  TRUE  <NA>
  approved        0     0     2
  canceled        5     1   619
  created         0     0     5
  delivered   89936  6534     8
  invoiced        0     0   314
  processing      0     0   301
  shipped         0     0  1107
  unavailable     0     0   609

table(O$order_status)


   approved    canceled     created   delivered    invoiced  processing 
          2         625           5       96478         314         301 
    shipped unavailable 
       1107         609

O$anticipation_dev

Warning: Unknown or uninitialised column: 'anticipation_dev'.

NULL

str(R$anticipation_dev)

Warning: Unknown or uninitialised column: 'anticipation_dev'.

 NULL

summary(O$anticipation_dev)

Warning: Unknown or uninitialised column: 'anticipation_dev'.

Length  Class   Mode 
     0   NULL   NULL

I=left_join(I,O)

Joining, by = c("order_id", "customer_id", "order_status", "order_purchase_timestamp", "order_approved_at", "order_delivered_carrier_date", "order_delivered_customer_date", "order_estimated_delivery_date", "delay")

S=left_join(S,I)

Joining, by = c("seller_id", "seller_zip_code_prefix", "seller_city", "seller_state")

O2=O2 %>% group_by(seller_state,customer_state) %>% mutate(nProd=n())
mx = tapply(O2$delay, list(O2$customer_state, O2$seller_state), length)
mx[is.na(mx)] = 0
mx = mx[order(-rowSums(mx)),]
mx = mx[,order(-colSums(mx))]

N = 12
mx[1:N, 1:N] %>%  d3heatmap(F,F,col=cols)

cust12 = rownames(mx)[1:12]; cust12

 [1] "SP" "RJ" "MG" "RS" "PR" "SC" "BA" "DF" "ES" "GO" "PE" "CE"

sell12 = colnames(mx)[1:12]; sell12

 [1] "SP" "MG" "PR" "RJ" "SC" "RS" "DF" "BA" "GO" "PE" "MA" "ES"

mx = O2 %>% filter(customer_state %in% cust12, seller_state %in% sell12) %>% 
  {tapply(.$delay, list(.$customer_state, .$seller_state), mean, na.rm=T)}
d3heatmap(mx, col=cols)

mx = O2 %>% filter(customer_state %in% cust12, seller_state %in% sell12) %>% 
  {tapply( .$delay > 0, list(.$customer_state, .$seller_state), mean, na.rm=T)}
mx %>% d3heatmap(F,F, col=cols)

#cust12 =c("SC","RS","RJ","BA","PE","SP","MG","DF","GO","PR","CE","ES"); cust12
#sell12 =c("MA","ES","DF","PE","GO","RS","PR","SP","RJ","BA","MG","SC"); sell12
mx = O2 %>% filter(customer_state %in% cust12, seller_state %in% sell12) %>% 
  {tapply(.$nProd, list(.$customer_state, .$seller_state), mean, na.rm=T)}
E=(rowSums(mx)%o%colSums(mx))/sum(mx)
r=(mx-E)/sqrt(E)
r[r>100]=100
r %>% d3heatmap(F,F, col=cols)

#DELAY的天數和銷售量呈顯著低正相關
#為何有相關
#為何是正相關
O2=O2 %>% mutate(nProd2=scale(nProd))
cor.test(O2$nProd,O2$delay)


    Pearson's product-moment correlation

data:  O2$nProd and O2$delay
t = 40, df = 97815, p-value <0.0000000000000002
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
 0.12080 0.13314
sample estimates:
    cor 
0.12698

🧙 給Olist商場的建議：
A.提供後臺數據分析、優化平台

B.增加廠商數量，開拓藍海市場

C.制定該地區本土化行銷策略

D,舉辦實體活動，增加黏著度

E.針對高價商品引進大型品牌商

F.設置延遲率與抵達率的獎懲機制

🧙 給Olist賣家的建議：
A.提供多元的產品

B.改善產品的呈現方式

Group6_OLIST_EDA.

龔雪燕郭培羽吳紫瑜郭律磊沈佳蒨

2019/10/21

Group6_OLIST_EDA.

龔雪燕 郭培羽 吳紫瑜 郭律磊 沈佳蒨

2019/10/21

龔雪燕郭培羽吳紫瑜郭律磊沈佳蒨