#主題:關於olist-2015年成立的交易平台

目錄

載入套件

讀入data

load("0410_wenyu.rda")
# order = fread("olist_orders_dataset.csv") # 訂單與運送狀態
# review = fread("olist_order_reviews_dataset11.csv", encoding = "UTF-8") # 商品滿意度調查
# payment = fread("olist_order_payments_dataset.csv") # 付款方式
# item = fread("olist_order_items_dataset.csv") # 訂單價值
# customer = fread("olist_customers_dataset.csv") # 客戶資料
# product = fread("olist_products_dataset.csv") # 產品資訊
# seller = fread("olist_sellers_dataset.csv") # 賣家資訊
# geolocation = fread("olist_geolocation_dataset.csv") # 巴西郵遞區號與地理位置
# name = fread("product_category_name_translation.csv") # 英文名
# 
# # 整理data
# # 依照orderID
# df =merge(order, review, by = "order_id", all = TRUE)
# df =merge(df,payment, by = "order_id", all = TRUE)
# df =merge(df,item, by = "order_id", all = TRUE)
# # 依照customerID
# df =merge(df,customer, by = "customer_id", all = TRUE)
# # 依照sellerID
# df =merge(df,seller, by = "seller_id", all = TRUE)
# # 依照productID
# df =merge(df,product, by = "product_id", all = TRUE)
# # 依照product_category_name
# df =merge(df,name, by = "product_category_name", x.all = TRUE)
# # 剩餘zipcode之後再做比較

# 篩掉同一個訂單
o=df[!duplicated(df$order_id), ] 

一、 電商平台的成長狀態:

總結:數量跟金額上整體有逐漸上升的狀態,於2017年大幅成長,2018平緩化。除了因剛成立,處成長期而有上升的狀態,另外此平台也會因為特別活動而有突出的訂單數(e.g. 2017/11/24(六) 數量飆高,因為11/23是黑色星期五、2018五月初數量上升可能是因母親節節日)

1. 訂單數量概覽:

# table(df$order_purchase_timestamp) %>% sort # 以月分劃分是最洽當的
ts=as.POSIXct(o$order_purchase_timestamp, format="%Y-%m-%d %H:%M:%S")
par(cex=0.8)
hist(ts,"month",las=2,freq=T,xlab="",main ="各月份訂單數",col ="#FF7744")

2. 細覽:

summary(o$order_id) # 總共97256筆
##    Length     Class      Mode 
##     97256 character character
# 細分
o$date= as.Date(o$order_purchase_timestamp,format="%Y-%m-%d") 
lim <- as.Date(c("2017-01-01","2018-09-01"))
qplot(data=o,
      x=date,
      geom = "bar",fill="")+
  # fill = customer_city) + # 依地區分 
  labs(x = "日期",y = "數量") + 
  scale_x_date(date_breaks="months",limits = lim) + 
  ggtitle("細分訂單數")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')+
  theme(axis.text.x = element_text(angle = -60))
## Warning: Removed 311 rows containing non-finite values (stat_count).

3. 消費金額趨勢(不含運費) :

# 篩掉「訂單編號且訂單商品數一樣」的,避免同一商品因付款方式不同重複計算;最後再加總
cash = df %>% select(order_id,order_item_id,price,order_purchase_timestamp) %>% as.data.frame
cash1 =cash %>%
  group_by(order_id,order_item_id) %>%
  filter(row_number() == 1) %>%
  ungroup()
cash2 = aggregate(price ~ order_id, cash1, sum) 
cash2  =merge(cash2 ,cash1, by =c( "order_id"), all.x = TRUE)
cash2$date= as.Date(cash2$order_purchase_timestamp,format="%Y-%m-%d")
setDT(cash2)[, Month_Yr := format(as.Date(order_purchase_timestamp), "%Y-%m") ]
cash3 = aggregate(price.x ~ Month_Yr, cash2, sum)

# 視覺化
ggplot(cash3, aes(x = Month_Yr, y = price.x)) + 
  geom_point(color = '#FF7744') +
  scale_y_continuous(labels = comma)+ 
  #geom_text_repel(aes(label=price.x)) +
  labs(x = "年月份",y = "銷售金額")+
  ggtitle("總銷售金額趨勢")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')+
  theme(axis.text.x = element_text(angle = -60))

二、 訂單主要項目與內容:

總結:每次購買商品數量只買一項,客單價低,但是有許多離群值,因此可以依每個客戶潛力來做行銷,另外從其熱門商品來看,其潛力產品為健康保養(持續上升),夕陽產品則是3c產品[這邊可能要參考冠綾做的寫]。

1. 每張訂單中,產品購買數量:

此資料中最高是一個訂單中購買了21個商品,但大多仍是購買少量商品為主(90%的訂單都只單純購買一樣商品)。

pnumber = df %>% select(order_id,order_item_id) %>% as.data.frame
pnumber1 =pnumber %>%
    group_by(order_id) %>%
    arrange(order_id,desc(order_item_id)) %>%
    filter(row_number() == 1) %>%
    ungroup()
pnumber1 %>%
  ggplot(aes(x=order_item_id,fill=""))+
  geom_bar(aes(y = ..prop.., stat="count"))+
  scale_y_continuous(breaks=seq(0,1,0.1),limits =c(0,1))+
  labs(x = "產品銷售數量",y = "訂單數量百分比")+
  ggtitle("單筆訂單中購買的產品數量")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')
## Warning: Ignoring unknown aesthetics: stat

2. 熱門商品:在平台上偏好購買的商品

以售出數量2000以上計算(前18名)分别為寢具用品、健康保養、運動休閒用品、家具及裝飾品、電腦配件、家居用品、手錶禮物、電話、園藝工具、汽車、玩具、coolstuff、香水、嬰兒用品、電器用品、文具、時尚包包配件、寵物用品;其較熱門的商品,反倒不像其他電商平台(銷售服飾雜貨等)。

df$product_category_name_english %>% table %>%  sort %>% 
  as.data.frame %>% 
  filter(Freq>2000) %>%
  ggplot(aes(x=.,y=Freq,fill=""))+
  geom_bar(stat="identity")+
  labs(x = "產品名稱",y = "售出數量")+
  ggtitle("商品熱門排行")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')+
  theme(axis.text.x = element_text(angle = -70))

3. 單筆消費商品金額(不含運費):

以每筆訂單去做的單次購買商品的金額:其客單價(平均數)為137.85元巴西幣(約1097.5台幣),超過50%的人單筆花費(不含運費)少於100巴西幣(約800台幣),其實不高;但因為是購物平台,有各種不同產品,所以在整體價格上也會有很大的離群值(中位數86.8元巴西幣,約700元台幣),因此若在將來要實施行銷方案,可能必須先以各客戶的消費能力來作不同的營銷。

# 篩掉「訂單編號且訂單商品數一樣」的,避免同一商品因付款方式不同重複計算;最後再加總
price = df %>% select(order_id,order_item_id,price) %>% as.data.frame
price1 =price %>%
  group_by(order_id,order_item_id) %>%
  filter(row_number() == 1) %>%
  ungroup()
price2 = aggregate(price ~ order_id, price1, sum) %>% as.data.frame
# 直方圖
ggplot(price2, aes(x=price,fill="")) +
  geom_histogram(aes(y = (..count..)/sum(..count..)),binwidth=50)+
  scale_x_continuous(breaks=seq(0,2000,100),limits =c(0,2000))+
  scale_y_continuous(labels = percent)+
  labs(x = "商品消費金額(巴西幣)",y = "數量/總數量")+
  ggtitle("單筆消費商品金額(不含運費)")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')+
  theme(axis.text.x = element_text(angle = -70))
## Warning: Removed 142 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

# 盒狀圖
ggplot(price2, aes(x =1, y = price,fill="")) +
  geom_boxplot( aes( y = price),binwidth=50)+
  scale_y_continuous(breaks=seq(0,1000,100),limits =c(0,1000))+
  labs(x = "整個平台的訂單",y = "總消費金額(不含運費)")+
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')
## Warning: Ignoring unknown parameters: binwidth
## Warning: Removed 929 rows containing non-finite values (stat_boxplot).

mean(price2$price) # 平均數 137.85
## [1] 137.8485
median(price2$price) # 中位數 86.8
## [1] 86.8

4. 單筆消費總金額(含運費)

其含運費後平均提升至160.8元巴西幣(約1290台幣),中位數提升至105.28元(約850台幣)

# 會因為買的商品數量,重複計算到總金額;如果只篩訂單,又會把不同付款方式的各個金額篩掉
# 所以我們刪掉「訂單且付款方式一樣」的,並合併不同付款方式的各個金額變成各訂單總金額。
p =df %>%
  group_by(order_id, payment_sequential) %>%
  filter(row_number() == 1) %>%
  ungroup()
pricecount = p %>% select(order_id,payment_value) %>% as.data.frame
pricecount1 = aggregate(payment_value ~ order_id, pricecount, sum) %>% as.data.frame
# 直方圖
ggplot(pricecount1, aes(x=payment_value,fill="")) +
  geom_histogram(aes(y = (..count..)/sum(..count..)),binwidth=50)+
  scale_x_continuous(breaks=seq(0,2000,100),limits =c(0,2000))+
  scale_y_continuous(labels = percent)+
  labs(x = "訂單總金額(巴西幣)",y = "數量/總數量")+
  ggtitle("單筆消費總金額(含運費)")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')+
  theme(axis.text.x = element_text(angle = -70))
## Warning: Removed 202 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).

# 盒狀圖
ggplot(pricecount1, aes(x =1, y = payment_value,fill="")) +
  geom_boxplot( aes( y = payment_value),binwidth=50, outlier.alpha = 0.01)+
  scale_y_continuous(breaks=seq(0,1000,100),limits =c(0,1000))+
  labs(x = "整個平台的訂單",y = "總消費金額(含運費)")+
  ggtitle("單筆消費總金額")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')
## Warning: Ignoring unknown parameters: binwidth
## Warning: Removed 1130 rows containing non-finite values (stat_boxplot).

mean(pricecount1$payment_value) # 平均數 160.801
## [1] 160.801
median(pricecount1$payment_value) # 中位數 105.28
## [1] 105.28

三、 當前客群分析:

總結:此平台吸引的消費族群,多位於聖保羅州等巴西東南側地帶,偏好於平日白天活動時間(10~16點)購物,並習慣使用信用卡支付,是較為信任網路資安,且經濟能力還算穩定者;從顧客忠誠度來看,可以感受到這些消費者仍在嘗鮮、試水溫的階段。

1. 所在地區:依照訂單去看這個平台的消費者主要居住地

前6名為SP,RJ,MG,RS,PR,SC(聚集在巴西東南方);聖保羅州特別多

t= table(o$customer_state) %>% sort %>% as.data.frame

t %>%
  ggplot(aes(x=Var1,y=Freq,fill=""))+
  geom_bar(stat="identity")+
  labs(x = "地區",y = "消費者數量")+
  ggtitle("消費者所在地區")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')

2. 偏好消費時間:

購物時間主要在周一到周五的10至16點,還有周一、二的20至22點,但主要為上班時間或上課時段;假日反而沒人網購(除了周日晚上)

table(format(ts,"%u"), format(ts,"%H")) %>% 
  as.data.frame.matrix %>% 
  d3heatmap(F,F,col=colorRamp(c('seagreen','lightyellow','red')))

3. 忠誠度:在平台上的消費次數

忠誠度低,多數顧客只買一次,估計有可能是此平台才成立不久,正在擴展客源的階段;並且可能是因商場優惠(黑色星期五)湧入的客源。

### 篩掉同一次的購買,以免重覆計算
as.data.frame(table(o$customer_id)) %>%
  ggplot(aes(x=Freq,fill=""))+
  geom_histogram(aes(y = (..count..)/sum(..count..)),binwidth=1)+
  scale_x_continuous(breaks=seq(0,10,1),limits =c(0,5))+
  scale_y_continuous(labels = percent)+
  labs(x = "購買次數",y = "客戶數百分比")+
  ggtitle("每位顧客消費次數")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')
## Warning: Removed 2 rows containing missing values (geom_bar).

4. 主要支付方式

相較於流行於巴西本地的現金支付方法boleto,這些消費者以使用信用卡付款為大宗,佔了約3/4,可以知道他們對於資訊及個資上的安全是較為信任的,且有能力申辦信用卡,代表其經濟能力是穩定且不錯的。

as.data.frame(table(o$payment_type))%>%
  ggplot+
  geom_bar(aes(x=factor(1),
               y=Freq,
               fill=Var1),
           stat = "identity",
           position = 'fill') +
  coord_polar("y", start=0)+
  labs(x = "",y = "")+
  ggtitle("習慣支付方式")+
  scale_fill_discrete(name="支付方式",
                      breaks=c("boleto", "credit_card","debit_card","voucher"),
                      labels=c("boleto本地現金支付方式", "信用卡","金融卡","禮券,代金券"))+ 
  scale_y_continuous(labels = percent_format())

四、 評價分析:從最直接的客戶反應,了解該平台的優劣勢所在

總結:我們也發現除了產品是否符合預期,物流的速度也是此消費族群重視的地方。而物流影響評價,評價影響銷量;若能提升物流,以提高評價,並連動提高銷售量,將是此平台極大的機會。

1. 評價分數:

平均4.09335分,以五分為最多,然此數據非呈現遞減,一分數量卻高於兩分和三分。

### 篩掉同一個reviewid
r=df[!duplicated(df$review_id), ] 

ggplot(r,aes(x=review_score,fill=""))+
  geom_bar()+
  labs(x = "評分(/5分)",y = "人數")+
  ggtitle("評價分數")+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')

mean(r$review_score) 
## [1] 4.09335

2. 評價分數趨勢:

可以發現銷售量最高的日子:黑色星期五後給出的評價卻不到母親節檔期過後的高,另外我們也可以發現每一天五分的數量跟其他分數比起來都是最多的。

###各分數時間趨勢
r$d= as.Date(r$review_answer_timestamp,format="%Y/%m/%d") 
reviewperiod=as.data.frame(table(r$d,r$review_score)) %>% 
  as.data.frame
reviewperiod$tsd=as.Date(reviewperiod$Var1, format="%Y-%m-%d")
reviewperiod %>%
  ggplot(aes(x=tsd,y=Freq,color =Var2))+
  geom_line()+ 
  labs(x = "日期",y = "數量")+
  ggtitle("評價分數趨勢")+ 
  scale_x_date(date_breaks="month",limits = lim) + 
  scale_color_discrete(name="給分")+ 
  theme_economist() +
  theme(axis.text.x = element_text(angle = -60))
## Warning: Removed 565 rows containing missing values (geom_path).

3. 文字評論內容:

跟在期限內收到商品之間可能有關係
produto 產品/ de介係詞 /não不是/ prazo 交貨/ muito 非常/ entrega 交貨/ antes首先/chegou 到達/recebi 收到了/bom 良好 /

reviewword = as.data.frame(select(r,review_id,review_score,review_comment_title,review_comment_message)) 
colnames(reviewword) = c("id","score","title","message")
reviewword <-  reviewword %>%  # 刪除空行
  filter(message!=""|!is.na(message)) 
reviewword1 <- reviewword %>%  # 切割(尚未加入停用字)
  ungroup() %>%
  unnest_tokens(word, message) %>%
  filter(!word %in% stop_words$word)

### 文字雲:

count <- reviewword1 %>%
  anti_join(stop_words) %>%
  count(word) %>%
  arrange(desc(n))
## Joining, by = "word"
count %>% wordcloud2
count
## # A tibble: 17,013 x 2
##    word        n
##    <chr>   <int>
##  1 produto 18008
##  2 de      11267
##  3 nao     10559
##  4 prazo    8265
##  5 muito    7836
##  6 entrega  6441
##  7 antes    5533
##  8 chegou   5474
##  9 recebi   5168
## 10 bom      4520
## # ... with 17,003 more rows
### 計算詞/句頻
n5 =reviewword %>% 
  ungroup() %>%
  unnest_tokens(figram, message, token = "ngrams", n = 5) %>%
  separate(figram, c("word1", "word2", "word3", "word4", "word5" ), sep = " ",na.rm = T) %>%
  count(word1, word2, word3, word4,word5, sort = TRUE)
head(n5,20)
## # A tibble: 20 x 6
##    word1    word2    word3   word4   word5        n
##    <chr>    <chr>    <chr>   <chr>   <chr>    <int>
##  1 <NA>     <NA>     <NA>    <NA>    <NA>     66858
##  2 produto  chegou   antes   do      prazo      298
##  3 chegou   bem      antes   do      prazo      278
##  4 produto  entregue antes   do      prazo      203
##  5 ainda    nao      recebi  o       produto    183
##  6 o        produto  chegou  antes   do         167
##  7 foi      entregue antes   do      prazo      145
##  8 o        produto  nao     foi     entregue   122
##  9 chegou   antes    do      prazo   e          121
## 10 entregue bem      antes   do      prazo      101
## 11 e        chegou   antes   do      prazo       93
## 12 produto  chegou   bem     antes   do          93
## 13 o        produto  antes   do      prazo       87
## 14 ate      o        momento nao     recebi      84
## 15 nao      recebi   o       produto e           84
## 16 recebi   o        produto antes   do          84
## 17 produto  entregue dentro  do      prazo       83
## 18 chegou   antes    do      prazo   previsto    77
## 19 o        produto  chegou  no      prazo       77
## 20 produto  chegou   dentro  do      prazo       75
# 產品在截止日期前到達
# 在截止日期前到達
# 產品在截止日期前交付
# 我沒有收到產品
# 產品到貨之前到了
# 在截止日期前交付
# 產品未送達
# 在截止日期前抵達
# 在截止日期前交付

4. 計算物流跟評分的關係:

此平台有約90%的會準時到貨,平均物流抵達時間為12.5天內抵達居多,最長高達210天,最短一天不到,但光靠圖表,我們無法明顯看出兩者之間的關聯;我們使用相關性和回歸檢定,才發現呈現低度負相關,且回歸檢定上也是顯著的(指物流花越久,分數越低;物流越快,分數越高)。

# 物流準時時間 
tspre=as.POSIXct(o$order_estimated_delivery_date, format="%Y-%m-%d %H:%M:%S")
tsaft=as.POSIXct(o$order_delivered_customer_date, format="%Y-%m-%d %H:%M:%S")

as.data.frame(table(tsaft > tspre))%>%
  ggplot+
  geom_bar(aes(x=factor(1),
               y=Freq,
               fill=Var1),
           stat = "identity",
           position = 'fill') +
  labs(x = "",y = "")+
  ggtitle("物流準時程度")+
  coord_polar("y", start=0)+ 
  scale_fill_discrete(name="到貨期",
                      breaks=c("FALSE", "TRUE"),
                      labels=c("準時期限內", "遲交貨"))+ 
  theme_economist() + 
  scale_color_economist()  + 
  scale_y_continuous(labels = percent_format())

# 關係
tsr1=as.POSIXct(r$order_purchase_timestamp, format="%Y-%m-%d %H:%M:%S")
tsr2=as.POSIXct(r$order_delivered_customer_date, format="%Y-%m-%d %H:%M:%S")
r = mutate(r,spendtime=difftime(tsr2,tsr1, units="day") )  #天數
# 視覺化
r %>%
  ggplot(aes(review_score, spendtime)) +
  geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity",color="#FF7744") +
  labs(x = "評分(/5分)",y = "物流等待(天數)")+
  ggtitle("物流跟評分")+
  scale_y_continuous(labels = comma)+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')
## Warning: Removed 2122 rows containing missing values (geom_point).

summary(as.numeric(r$spendtime)) 
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
##   0.5334   6.7635  10.2109  12.5501  15.7053 209.6286     2122
# 相關
cor.test(r$review_score, as.numeric(r$spendtime))
## 
##  Pearson's product-moment correlation
## 
## data:  r$review_score and as.numeric(r$spendtime)
## t = -109.76, df = 94930, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.3412025 -0.3299126
## sample estimates:
##        cor 
## -0.3355696
# 回歸
model <- lm(formula= review_score ~ spendtime,
            data=r)
summary(model)
## 
## Call:
## lm(formula = review_score ~ spendtime, data = r)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.6771 -0.4126  0.5124  0.7619  8.8013 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  4.7163697  0.0065356   721.6   <2e-16 ***
## spendtime   -0.0454997  0.0004146  -109.8   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.219 on 94930 degrees of freedom
##   (2122 observations deleted due to missingness)
## Multiple R-squared:  0.1126, Adjusted R-squared:  0.1126 
## F-statistic: 1.205e+04 on 1 and 94930 DF,  p-value: < 2.2e-16

5. 銷售量與評分的關係

評分五分的數量與銷售量之間呈現中度正相關,且其線性回歸顯著,代表當五分的評分數上升時,銷售量也會跟著提升。

r$o= as.Date(r$order_purchase_timestamp,format="%Y-%m-%d") 
ro =as.data.frame(table(r$o))
reviewperiod1 = reviewperiod %>% filter(Var2 ==5) # 篩出五分 
ro =merge(ro,reviewperiod1, by = "Var1", x.all = TRUE)
colnames(ro)=c("時間","sales","分數","評5分數","時間1")

ro %>%
  ggplot(aes(sales,評5分數)) +
  geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity",color="#FF7744",size=1) +
  labs(x = "5分的評分數",y = "銷售量")+
  ggtitle("產品銷售量與評分")+
  scale_y_continuous(labels = comma)+ 
  theme_economist() + 
  scale_color_economist()+
  theme(legend.position='none')

cor.test(ro$sales,  ro$評5分數)
## 
##  Pearson's product-moment correlation
## 
## data:  ro$sales and ro$評5分數
## t = 15.724, df = 595, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.4825511 0.5961422
## sample estimates:
##      cor 
## 0.541816

四、 段落總結:

附錄

1. 計算TF-IDF

因為透過單看各分數的文字雲無法看出實際問題,所以使用tf-idf
評分為1:insatisfeito不滿/ tento試圖 /parcela一部分 /decepção騙局 /enganosa 誤導/entrar 聯繫,進入/descaso 忽視
評分為2:tento試圖/entrar 聯繫,進入/flex靈活性/moist 潮濕/Pneu Maggion aro 某產品名稱 /esquerdo左
評分為3:identidade身分/taxas利率/pentes梳子/ tirando服用/apto適合/automotivo汽車/presilhas 緊/
評分為4:ótimo 美好/rapida很快/direitinho恰到好處/benefícios好處/baixar降低/Tirando 除此 評分為5:ótimo 美好/rapida很快/direitinho恰到好處/adorou喜歡/excelentes優點/superou超出預期/parabens恭喜
小結:估計低分是因為覺得被騙被誤導,但是聯絡賣家又被忽視;有一些詞是直接跟產品有關,像是2分出現的某輪胎牌子

### 每區間總共字數
reviewword2 =reviewword1  %>%
  count(score, word, sort = TRUE)
total_words <- reviewword2 %>% 
  group_by(score) %>% 
  summarize(total = sum(n))

t_words <- left_join(reviewword2, total_words)
## Joining, by = "score"
t_words
## # A tibble: 31,725 x 4
##    score word          n  total
##    <int> <chr>     <int>  <int>
##  1     5 produto    7963 142573
##  2     1 nao        6008 125654
##  3     5 prazo      5569 142573
##  4     5 muito      5296 142573
##  5     1 produto    4861 125654
##  6     5 antes      4452 142573
##  7     1 de         3974 125654
##  8     5 de         3729 142573
##  9     5 entrega    3628 142573
## 10     5 recomendo  3227 142573
## # ... with 31,715 more rows
###找出重要的單詞(專有詞)
t_words <-t_words  %>%
  bind_tf_idf(word,score, n)
t_words
## # A tibble: 31,725 x 7
##    score word          n  total     tf   idf tf_idf
##    <int> <chr>     <int>  <int>  <dbl> <dbl>  <dbl>
##  1     5 produto    7963 142573 0.0559     0      0
##  2     1 nao        6008 125654 0.0478     0      0
##  3     5 prazo      5569 142573 0.0391     0      0
##  4     5 muito      5296 142573 0.0371     0      0
##  5     1 produto    4861 125654 0.0387     0      0
##  6     5 antes      4452 142573 0.0312     0      0
##  7     1 de         3974 125654 0.0316     0      0
##  8     5 de         3729 142573 0.0262     0      0
##  9     5 entrega    3628 142573 0.0254     0      0
## 10     5 recomendo  3227 142573 0.0226     0      0
## # ... with 31,715 more rows
### 計算出tf-idf
t_words %>%
  select(-total) %>%
  arrange(desc(tf_idf))
## # A tibble: 31,725 x 6
##    score word           n       tf   idf   tf_idf
##    <int> <chr>      <int>    <dbl> <dbl>    <dbl>
##  1     5 otima        201 0.00141  0.223 0.000315
##  2     5 rapida       195 0.00137  0.223 0.000305
##  3     5 direitinho    73 0.000512 0.511 0.000262
##  4     5 adorou        70 0.000491 0.511 0.000251
##  5     5 excelentes    38 0.000267 0.916 0.000244
##  6     5 superou      143 0.00100  0.223 0.000224
##  7     4 direitinho    22 0.000423 0.511 0.000216
##  8     2 aussie         4 0.000131 1.61  0.000211
##  9     5 parabens     127 0.000891 0.223 0.000199
## 10     5 agradecer     29 0.000203 0.916 0.000186
## # ... with 31,715 more rows
### 結果視覺化
t_words %>%
  arrange(desc(tf_idf)) %>%
  mutate(word = factor(word, levels = rev(unique(word)))) %>% 
  group_by(score) %>% 
  top_n(7) %>% 
  ungroup() %>%
  ggplot(aes(word, tf_idf, fill =score)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~score, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by tf_idf

2. 產品簡介與評分的關係

小結:極低度相關

### 產品照片數與評分的關係
cor(r$review_score,  r$product_photos_qty)
## [1] 0.01435297
cor.test(r$review_score,  r$product_photos_qty)
## 
##  Pearson's product-moment correlation
## 
## data:  r$review_score and r$product_photos_qty
## t = 4.4719, df = 97052, p-value = 7.763e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.008062374 0.020642439
## sample estimates:
##        cor 
## 0.01435297
r %>%
  ggplot(aes(review_score, product_photos_qty,group=review_score)) +
  geom_boxplot( width = .5, outlier.alpha = 0.01)+
  geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity") +
  labs(x = "評分(/5分)",y = "產品照片數量")+
  ggtitle("產品照片數與評分")+
  scale_y_continuous(labels = comma)

### 產品簡介字數與評分的關係
cor(r$review_score,  r$product_description_lenght)
## [1] 0.01006433
cor.test(r$review_score,  r$product_description_lenght)
## 
##  Pearson's product-moment correlation
## 
## data:  r$review_score and r$product_description_lenght
## t = 3.1355, df = 97052, p-value = 0.001716
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.003773238 0.016354620
## sample estimates:
##        cor 
## 0.01006433
r %>%
  ggplot(aes(review_score, product_description_lenght,group=review_score)) +
  geom_boxplot( width = .5, outlier.alpha = 0.01)+
  geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity") +
  labs(x = "評分(/5分)",y = "產品簡介字數")+
  ggtitle("產品簡介字數與評分")+
  scale_y_continuous(labels = comma)

3. 運費與評分的關係

小結:基本上關聯不大

cor(r$review_score,  r$freight_value)
## [1] -0.03853995
cor.test(r$review_score,  r$freight_value)
## 
##  Pearson's product-moment correlation
## 
## data:  r$review_score and r$freight_value
## t = -12.015, df = 97052, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.04482041 -0.03225644
## sample estimates:
##         cor 
## -0.03853995
r %>%
  ggplot(aes(review_score,freight_value)) +
  geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity") +
  labs(x = "評分(/5分)",y = "運費")+
  ggtitle("運費與評分")+
  scale_y_continuous(labels = comma)