[第四組] M074020010 陳韻卉 M074020036 張哲瑋 M074020043 巢正修 B046060005陳雅靜 B046060016 戴婉錡 B046060037 陳柏愷
library(knitr); opts_chunk$set(comment = NA)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(d3heatmap)
library(googleVis)
## Creating a generic function for 'toJSON' from package 'jsonlite' in package 'googleVis'
##
## Welcome to googleVis version 0.6.3
##
## Please read Google's Terms of Use
## before you start using the package:
## https://developers.google.com/terms/
##
## Note, the plot method of googleVis will by default use
## the standard browser to display its output.
##
## See the googleVis package vignettes for more details,
## or visit https://github.com/mages/googleVis.
##
## To suppress this message use:
## suppressPackageStartupMessages(library(googleVis))
library(servr)
library(tidyr)
library(htmltools)
options(gvis.plot.tag='chart', digits=4, scipen=60)
#options(width=100, digits=4, scipen=60)
op <- options(gvis.plot.tag='chart')
load("origin.RData") #load 原始資料
Order = merge(Order,Pay) # 一筆是一個訂單
Order = merge(Order,Customer) # 一筆是一個訂單
Order$order_purchase_timestamp = as.POSIXct(Order$order_purchase_timestamp, format = "%Y-%m-%d %H:%M:%S")
min(Order$order_purchase_timestamp)
[1] "2016-09-04 21:15:19 CST"
max(Order$order_purchase_timestamp)
[1] "2018-10-17 17:30:18 CST"
hist(Order$order_purchase_timestamp, "month", las = 2,freq = T,xlab = "")
table(format(Order$order_purchase_timestamp, "%u"), format(Order$order_purchase_timestamp, "%H")) %>%
as.data.frame.matrix %>%
d3heatmap(F,F,col=colorRamp(c('seagreen','lightyellow','red')))
names(CatName)[1] = "product_category_name"
Product = merge(Product,CatName)
ProductSell = merge(Order,Item) #一筆 是一個產品的銷售紀錄
ProductSell = merge(ProductSell,Product) #一筆 是一個產品的銷售紀錄
ProductSell = merge(Seller,ProductSell) #一筆是一個賣家的產品銷售紀錄
商品種類的實際需求數量
ProductSell %>%
group_by(product_category_name_english) %>%
summarise(
n = n()
) %>% arrange(desc(n))
沒有銷售紀錄的商家
sum(is.na(ProductSell$product_id))
[1] 0
#有62
subset(ProductSell,is.na(product_id))
去除沒有銷售紀錄的商家
ProductSell = subset(ProductSell,!is.na(product_id))
length(unique(ProductSell$seller_id))
[1] 3033
#在銷售紀錄中 共有3095個不同的賣家
#但Deal資料集中 只有842位賣家的資料(看起來是透過網頁註冊的賣家才會放在這個資料集)
ggplot(ProductSell,aes(x=product_category_name_english,y=price))+
geom_boxplot()+
scale_y_log10()
因為Review是針對一筆訂單,而訂單可能包含多項商品,我們不知道評論究竟是針對訂單中的哪一個產品,因此篩選出只有 買一個商品的訂單,可以確保評論的星等是針對此商品。
篩選出只有購買一個商品的訂單
One_product = Item %>%
group_by(order_id) %>%
summarise(product_num = n_distinct(order_item_id)) %>%
filter(product_num == 1)
以產品類別做分組 計算出平均單價、總收益、商品總賣出數、有多少賣家賣出此種產品類別
X = ProductSell %>%
filter(order_id %in% One_product$order_id) %>% #篩選出只有購買一個商品的訂單
group_by(product_category_name_english) %>%
summarise(
avg_price = mean(price), #平均單價
total_rev = sum(price), #總收益
total_sold_num = n(), #商品總賣出數
total_seller_num = n_distinct(seller_id) #有多少賣家賣出此種產品類別
) %>% mutate()
以產品類別做分組 計算出總評論數量、平均評論分數
ProductReview = ProductSell %>%
filter(order_id %in% One_product$order_id) %>%
left_join(Reviews) #保留那些沒有評論的商品
Joining, by = "order_id"
vis = ProductReview %>%
group_by(product_category_name_english) %>%
summarise(
total_review_num = n(), #評論數量
avg_review_score = mean(review_score)) %>%
merge(X) %>%
mutate(dummy = 2018)
google motion chart
op <- options(gvis.plot.tag='chart')
options(op)
vistmp = gvisMotionChart(
vis, "product_category_name_english", "dummy",
options=list(width=800, height=600, title="Product category"))
print(vistmp, file="M.html")
includeHTML("./M.html")
X = ProductSell %>%
filter(order_id %in% One_product$order_id) %>% #篩選出只有購買一個商品的訂單
group_by(seller_state) %>%
summarise(
avg_price = mean(price),
total_rev = sum(price),
total_sold_num = n(),
total_seller_num = n_distinct(seller_id),
total_cust_num = n_distinct(customer_unique_id)
)
vis = ProductReview %>%
group_by(seller_state) %>%
summarise(
total_review_num = n(), #評論數量
avg_review_score = mean(review_score)) %>%
merge(X) %>%
mutate(dummy = 2018)
op <- options(gvis.plot.tag='chart')
options(op)
vistmp = gvisMotionChart(
vis, "seller_state", "dummy",
options=list(width=800, height=600, title="Product category"))
print(vistmp, file="M.html")
includeHTML("./M.html")
將州名縮寫對照到全名
abbrBZ = read.csv("state.csv")
vis = merge(vis,abbrBZ,by.x = "seller_state",by.y = "ABBR")
#vis$State_name = paste(vis$State_name,vis$seller_state,sep = "-")
GeoStates <- gvisGeoChart(vis,
locationvar = "State_name",
colorvar = "total_sold_num",
sizevar = "total_cust_num",
#hovervar = c("avg_review_score","total_seller_num"),
hovervar = "seller_state",
options=list(region="BR",
displayMode="regions",
resolution="provinces",
width=800, height=600))
print(GeoStates, file="geo.html")
includeHTML("./geo.html")
在marketing funnel中的Deal資料集中 有business_segment欄位說明賣家屬於哪個產業類別 但只有提供377多個商家有business_segment
X = merge(Deal,ProductSell)
length(unique(X$seller_id))
[1] 377
原本的business_segment有幾種類別
levels(Deal$business_segment)
[1] "" "air_conditioning"
[3] "audio_video_electronics" "baby"
[5] "bags_backpacks" "bed_bath_table"
[7] "books" "car_accessories"
[9] "computers" "construction_tools_house_garden"
[11] "fashion_accessories" "food_drink"
[13] "food_supplement" "games_consoles"
[15] "gifts" "handcrafted"
[17] "health_beauty" "home_appliances"
[19] "home_decor" "home_office_furniture"
[21] "household_utilities" "jewerly"
[23] "music_instruments" "other"
[25] "party" "perfume"
[27] "pet" "phone_mobile"
[29] "religious" "small_appliances"
[31] "sports_leisure" "stationery"
[33] "toys" "watches"
所有我們嘗試對賣家做分群 並查看每群特性 推測其產業類別
分群的來源資料為 ProductSell:每一筆是一個賣家的商品銷售紀錄
整理出一個dataframe
每一列是一個賣家 欄位是每種產品種類的sold number
Segment = table(ProductSell$seller_id,ProductSell$product_category_name_english) %>% as.data.frame
Segment = Segment %>% spread(Var2,Freq) #從長表格整理成寬表格
names(Segment)[1] = "seller_id"
Segment
只選取有銷售紀錄的賣家
Segment = Segment[rowSums(Segment[,2:72]) != 0,]
根據賣家銷售紀錄來對賣家做分群,我們對賣家銷售紀錄的處理方法有兩種
將產品種類賣出數 轉成 只要有賣就是true , 沒賣是false 的格式 但我們認為這樣子會失去單一賣家的銷售特性
以prop.table(margin=1) 計算每個賣家銷售產品的類別比例
mx = as.matrix(Segment[,2:72]) %>% prop.table(margin=1)
hc = dist(mx) %>% hclust(method="ward.D2")
# 用"binary"的方法來計算距離
plot(hc)
在反覆測試分群數目後,選擇20作為最終分群數
kg = cutree(hc, k=20)
cluster = as.data.frame(table(kg))
ggplot(cluster,aes(x=reorder(kg, -Freq), y = Freq)) +
geom_bar(stat="identity")+
labs(x="第幾群",y="群內賣家數量")
mx = as.data.frame(mx)
mx$group = factor(cutree(hc, k=20))
D = mx %>%
group_by(group) %>%
summarise_each(funs(mean))
Warning: funs() is soft deprecated as of dplyr 0.8.0
please use list() instead
# Before:
funs(name = f(.)
# After:
list(name = ~f(.))
This warning is displayed once per session.
以熱圖呈現每群特性:同一群賣家的銷售產品類別比例
D[,2:72] %>%
as.matrix %>%
d3heatmap(col=colorRampPalette(c('black','darkblue','blue','green','yellow','orange','red'))(999))
先以第二種方法來看的話
得到group id後 合併回去銷售資料
X = ProductSell %>%
group_by(seller_id) %>%
summarise(
total_sold_num = n(),
total_rev = sum(price),
avg_freight = mean(freight_value),
prod_num = n_distinct(product_id),
avg_photo_qty = mean(product_photos_qty)
)
Leader = cbind(X,mx[,"group"])
names(Leader)[7] = "group"
以x軸為賣家總賣出的商品數量 , y軸為總收益 看出有哪幾群出現領導賣家
ggplot(Leader,aes(x=total_sold_num,y=total_rev))+
geom_point()+
facet_wrap(.~group)
我們觀察這20群中,簡單歸類出4種市場表現,並可以往回看熱圖觀察這群是什麼產業類別。
第7群: 雖然出現銷售數量比他人多的賣家,但總收益並沒有比有些賣得少的人還多,也就是他的平均收益並沒有領先他人太多
第5群: 出現賣的多、收益也多的領導賣家
第11群: 出現多個表現特殊的賣家,以收益為100000~1500000的兩個賣家來做比較的話,會發現銷售數 不一定是影響收益多少的關鍵,賣得少也可以獲得可觀的收益(也就是說,這個賣家能將有限資源做最大的發揮,並反映在商品價值上。) 第11群是watches_gift產業類別,或許可以想成是手錶市場上有兩種賣家,一家具有指標性產品,會吸引特定買家購買(因此平均定價可拉高)。而另外一家則是主打親民,人人都買得起的商品。這種情況下,消費者會根據其偏好,購買能滿足他們需求的產品。
第18群: 是pet shops產業,他是一個完全競爭市場,大家的平均收益的差不多,並沒有一個特殊賣家可以獲得較大的市場比例,消費者並沒有對特定賣家忠誠,或是有特殊的喜好