[第四組] M074020010 陳韻卉 M074020036 張哲瑋 M074020043 巢正修 B046060005陳雅靜 B046060016 戴婉錡 B046060037 陳柏愷

library(knitr); opts_chunk$set(comment = NA)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(d3heatmap)
library(googleVis)

## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'

## 
## Welcome to googleVis version 0.6.3
## 
## Please read Google's Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
## 
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
## 
## See the googleVis package vignettes for more details,
## or visit https://github.com/mages/googleVis.
## 
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))

library(servr)
library(tidyr)
library(htmltools)

options(gvis.plot.tag='chart', digits=4, scipen=60)
#options(width=100, digits=4, scipen=60)
op <- options(gvis.plot.tag='chart')

load("origin.RData") #load 原始資料

資料合併、整理

Order = merge(Order,Pay) # 一筆是一個訂單
Order = merge(Order,Customer)  # 一筆是一個訂單

Order$order_purchase_timestamp = as.POSIXct(Order$order_purchase_timestamp, format = "%Y-%m-%d %H:%M:%S")

min(Order$order_purchase_timestamp)

[1] "2016-09-04 21:15:19 CST"

max(Order$order_purchase_timestamp)

[1] "2018-10-17 17:30:18 CST"

hist(Order$order_purchase_timestamp, "month", las = 2,freq = T,xlab = "")

table(format(Order$order_purchase_timestamp, "%u"), format(Order$order_purchase_timestamp, "%H")) %>%
  as.data.frame.matrix %>%
  d3heatmap(F,F,col=colorRamp(c('seagreen','lightyellow','red')))

names(CatName)[1] = "product_category_name"
Product = merge(Product,CatName)

ProductSell = merge(Order,Item) #一筆 是一個產品的銷售紀錄
ProductSell = merge(ProductSell,Product) #一筆 是一個產品的銷售紀錄
ProductSell = merge(Seller,ProductSell) #一筆是一個賣家的產品銷售紀錄

商品種類的實際需求數量

ProductSell %>% 
  group_by(product_category_name_english) %>% 
  summarise(
    n = n()
  ) %>% arrange(desc(n))

沒有銷售紀錄的商家

sum(is.na(ProductSell$product_id))

[1] 0

#有62
subset(ProductSell,is.na(product_id))

去除沒有銷售紀錄的商家

ProductSell = subset(ProductSell,!is.na(product_id))

length(unique(ProductSell$seller_id))

[1] 3033

#在銷售紀錄中 共有3095個不同的賣家
#但Deal資料集中 只有842位賣家的資料(看起來是透過網頁註冊的賣家才會放在這個資料集)

每個產品類別的特性比較

ggplot(ProductSell,aes(x=product_category_name_english,y=price))+
  geom_boxplot()+
  scale_y_log10()

因為Review是針對一筆訂單，而訂單可能包含多項商品，我們不知道評論究竟是針對訂單中的哪一個產品，因此篩選出只有買一個商品的訂單，可以確保評論的星等是針對此商品。

篩選出只有購買一個商品的訂單

One_product = Item %>%
        group_by(order_id) %>% 
        summarise(product_num = n_distinct(order_item_id)) %>% 
        filter(product_num == 1)

以產品類別做分組計算出平均單價、總收益、商品總賣出數、有多少賣家賣出此種產品類別

X = ProductSell %>%
    filter(order_id %in% One_product$order_id) %>%  #篩選出只有購買一個商品的訂單
    group_by(product_category_name_english) %>% 
    summarise(
      avg_price = mean(price), #平均單價
      total_rev = sum(price), #總收益
      total_sold_num = n(),　#商品總賣出數
      total_seller_num = n_distinct(seller_id) #有多少賣家賣出此種產品類別
    ) %>% mutate()

以產品類別做分組計算出總評論數量、平均評論分數

ProductReview = ProductSell %>%
                filter(order_id %in% One_product$order_id) %>% 
                left_join(Reviews) #保留那些沒有評論的商品

Joining, by = "order_id"

vis = ProductReview %>% 
  group_by(product_category_name_english) %>%
  summarise(
    total_review_num = n(), #評論數量
    avg_review_score = mean(review_score)) %>% 
  merge(X) %>% 
  mutate(dummy = 2018)

google motion chart

op <- options(gvis.plot.tag='chart')
options(op)
vistmp = gvisMotionChart(
  vis, "product_category_name_english", "dummy",
  options=list(width=800, height=600, title="Product category"))

print(vistmp, file="M.html")

includeHTML("./M.html")

MotionChartID50092d40799d

Data: vis • Chart ID: MotionChartID50092d40799d • googleVis-0.6.3
R version 3.5.3 (2019-03-11) • Google Terms of Use • Documentation and Data Policy

每個賣家所在州的特性比較

X = ProductSell %>%
    filter(order_id %in% One_product$order_id) %>%  #篩選出只有購買一個商品的訂單
    group_by(seller_state) %>% 
    summarise(
      avg_price = mean(price),
      total_rev = sum(price),
      total_sold_num = n(),
      total_seller_num = n_distinct(seller_id),
      total_cust_num = n_distinct(customer_unique_id)
    )

vis = ProductReview %>% 
  group_by(seller_state) %>%
  summarise(
    total_review_num = n(), #評論數量
    avg_review_score = mean(review_score)) %>% 
  merge(X) %>% 
  mutate(dummy = 2018)

op <- options(gvis.plot.tag='chart')
options(op)
vistmp = gvisMotionChart(
  vis, "seller_state", "dummy",
  options=list(width=800, height=600, title="Product category"))

print(vistmp, file="M.html")
includeHTML("./M.html")

MotionChartID500926126565

Data: vis • Chart ID: MotionChartID500926126565 • googleVis-0.6.3
R version 3.5.3 (2019-03-11) • Google Terms of Use • Documentation and Data Policy

地理繪圖

將州名縮寫對照到全名

abbrBZ = read.csv("state.csv")

vis = merge(vis,abbrBZ,by.x = "seller_state",by.y = "ABBR")
#vis$State_name = paste(vis$State_name,vis$seller_state,sep = "-")

GeoStates <- gvisGeoChart(vis, 
                          locationvar = "State_name",
                          
                          colorvar = "total_sold_num", 
                          sizevar = "total_cust_num",
                          
                          #hovervar = c("avg_review_score","total_seller_num"),
                          hovervar = "seller_state",
                          options=list(region="BR",
                                       displayMode="regions", 
                                       resolution="provinces",
                          width=800, height=600))

print(GeoStates, file="geo.html")

includeHTML("./geo.html")

GeoChartID50092bcce078

Data: vis • Chart ID: GeoChartID50092bcce078 • googleVis-0.6.3
R version 3.5.3 (2019-03-11) • Google Terms of Use • Documentation and Data Policy

分群：以賣家銷售資料對賣家做分群　推測其產業類別

在marketing funnel中的Deal資料集中有business_segment欄位說明賣家屬於哪個產業類別但只有提供377多個商家有business_segment

X = merge(Deal,ProductSell)
length(unique(X$seller_id))

[1] 377

原本的business_segment有幾種類別

levels(Deal$business_segment)

 [1] ""                                "air_conditioning"               
 [3] "audio_video_electronics"         "baby"                           
 [5] "bags_backpacks"                  "bed_bath_table"                 
 [7] "books"                           "car_accessories"                
 [9] "computers"                       "construction_tools_house_garden"
[11] "fashion_accessories"             "food_drink"                     
[13] "food_supplement"                 "games_consoles"                 
[15] "gifts"                           "handcrafted"                    
[17] "health_beauty"                   "home_appliances"                
[19] "home_decor"                      "home_office_furniture"          
[21] "household_utilities"             "jewerly"                        
[23] "music_instruments"               "other"                          
[25] "party"                           "perfume"                        
[27] "pet"                             "phone_mobile"                   
[29] "religious"                       "small_appliances"               
[31] "sports_leisure"                  "stationery"                     
[33] "toys"                            "watches"

所有我們嘗試對賣家做分群並查看每群特性推測其產業類別

分群的來源資料為 ProductSell:每一筆是一個賣家的商品銷售紀錄
整理出一個dataframe
每一列是一個賣家欄位是每種產品種類的sold number

Segment = table(ProductSell$seller_id,ProductSell$product_category_name_english) %>% as.data.frame 
Segment = Segment %>% spread(Var2,Freq) #從長表格整理成寬表格
names(Segment)[1] = "seller_id"

Segment

只選取有銷售紀錄的賣家

Segment = Segment[rowSums(Segment[,2:72]) != 0,]

分群找出產業類別

根據賣家銷售紀錄來對賣家做分群，我們對賣家銷售紀錄的處理方法有兩種

1. 0/1 沒賣/有賣商品

將產品種類賣出數轉成只要有賣就是true , 沒賣是false 的格式但我們認為這樣子會失去單一賣家的銷售特性

2.賣家銷售類別比例來計算距離作分群

以prop.table(margin=1) 計算每個賣家銷售產品的類別比例

mx = as.matrix(Segment[,2:72]) %>% prop.table(margin=1)

hc = dist(mx) %>% hclust(method="ward.D2")
# 用"binary"的方法來計算距離

plot(hc)

在反覆測試分群數目後，選擇20作為最終分群數

kg = cutree(hc, k=20)

cluster = as.data.frame(table(kg))


ggplot(cluster,aes(x=reorder(kg, -Freq), y = Freq)) + 
    geom_bar(stat="identity")+
    labs(x="第幾群",y="群內賣家數量")

mx = as.data.frame(mx)
mx$group = factor(cutree(hc, k=20))

D = mx %>% 
  group_by(group) %>% 
  summarise_each(funs(mean))

Warning: funs() is soft deprecated as of dplyr 0.8.0
please use list() instead

# Before:
funs(name = f(.)

# After: 
list(name = ~f(.))
This warning is displayed once per session.

以熱圖呈現每群特性：同一群賣家的銷售產品類別比例

D[,2:72] %>% 
  as.matrix %>%
d3heatmap(col=colorRampPalette(c('black','darkblue','blue','green','yellow','orange','red'))(999))

先以第二種方法來看的話

每個產業類別有領導賣家嗎？

得到group id後合併回去銷售資料

X = ProductSell %>%
  group_by(seller_id) %>%
  summarise(
    total_sold_num = n(),
    total_rev = sum(price),
    avg_freight = mean(freight_value),
    prod_num = n_distinct(product_id),
    avg_photo_qty = mean(product_photos_qty)
  )

Leader = cbind(X,mx[,"group"])
names(Leader)[7] = "group"

以x軸為賣家總賣出的商品數量 , y軸為總收益看出有哪幾群出現領導賣家

ggplot(Leader,aes(x=total_sold_num,y=total_rev))+
  geom_point()+
  facet_wrap(.~group)

我們觀察這20群中，簡單歸類出4種市場表現，並可以往回看熱圖觀察這群是什麼產業類別。

第7群：　雖然出現銷售數量比他人多的賣家，但總收益並沒有比有些賣得少的人還多，也就是他的平均收益並沒有領先他人太多
第5群：　出現賣的多、收益也多的領導賣家
第11群：　出現多個表現特殊的賣家，以收益為100000~1500000的兩個賣家來做比較的話，會發現銷售數不一定是影響收益多少的關鍵，賣得少也可以獲得可觀的收益(也就是說，這個賣家能將有限資源做最大的發揮，並反映在商品價值上。) 第11群是watches_gift產業類別，或許可以想成是手錶市場上有兩種賣家，一家具有指標性產品，會吸引特定買家購買（因此平均定價可拉高）。而另外一家則是主打親民，人人都買得起的商品。這種情況下，消費者會根據其偏好，購買能滿足他們需求的產品。
第18群：　是pet shops產業，他是一個完全競爭市場，大家的平均收益的差不多，並沒有一個特殊賣家可以獲得較大的市場比例，消費者並沒有對特定賣家忠誠，或是有特殊的喜好

第四組：Olist電商資料探索