研究議題:評分與顧客留言之商業分析

載入套件與資料

pacman::p_load(plotly,ggplot2,stringr,tidyr,dplyr,googleVis)
load("data/olist.rdata")

匯入顧客與廠商的位置

A = left_join(P[,1:2],TPC) %>% 
  select(product_id, category=product_category_name_english) %>% 
  right_join(I) %>% 
  left_join(S[,c(1,4)]) %>% 
  left_join(O) %>% 
  left_join(C[,c(1,5)]) %>% 
  rename(from=seller_state, to=customer_state)

A = group_by(R, order_id) %>% summarise(score = review_score[1]) %>% 
  right_join(A)


  • 資料處理
A = A %>% mutate(
  order_confirm = difftime(order_approved_at, order_purchase_timestamp, units="days"),
  s_to_t = difftime(order_delivered_carrier_date, order_approved_at, units="days"),
  t_to_c = difftime(order_delivered_customer_date, order_delivered_carrier_date, units="days"),
  t.total = order_confirm+s_to_t+t_to_c,
  t.estimate = difftime(order_estimated_delivery_date, order_purchase_timestamp, units="days"),
  t.delay = t.total - t.estimate, 
  t.limit = difftime(shipping_limit_date, order_purchase_timestamp, units="days")
  ) %>% 
  mutate_at(vars(order_confirm:t.limit), as.numeric)
table(A$category) %>% sort() %>% tail(20) %>% names -> top20
A %>% filter(category %in% top20) %>% 
  group_by(category) %>% 
  summarise_at(vars(order_confirm:t_to_c), mean, na.rm=T) %>% 
  gather("stage","days",order_confirm:t_to_c) %>%
  mutate(stage = factor(stage, c('t_to_c','s_to_t','order_confirm'))) %>% 
  ggplot(aes(x=category, y=days, fill=stage )) +
    geom_bar(stat="Identity") +
    coord_flip()

par(cex=0.7, mar=c(5,12,4,2))
boxplot(t.total~category, subset(A, category %in% top20), horizontal=T, las=2,
        main="Delivery Timespan (days) per Category", xlab="days")

a = A %>% filter(category %in% top20, category!="office_furniture") %>% 
  mutate(ym = as.Date(format(order_delivered_customer_date, "%Y-%m-15")) ) %>% 
  filter(ym>=as.Date("2017-03-15"), ym<=as.Date("2018-08-15"))
a %>% group_by(ym) %>% summarise(
  percetage_of_delay = 100*mean(t.delay > 0, na.rm=T),
  std_score = mean(score, na.rm=T)
  ) %>% 
  mutate_at(vars(std_score), ~ 2*(.-mean(.))/sd(.)) %>% 
  gather("metrics","value",-ym) %>% 
  ggplot(aes(x=ym, y=value, col=metrics)) +
  geom_point() + geom_line() +
  ggtitle("評分與物流延遲之關係圖") + xlab("Date") +
  scale_x_date(date_breaks="2 months", date_labels="%Y-%m")-> p
ggplotly(p)

做評分跟留言間的議題研究

#載入套件
library(dplyr)
library(stringr)
library(ggplot2)
#載入data
load("data/R.rdata")
#挑出所需欄位進行進一步的處理
R2 <- data.frame(date = R$review_creation_date, score = R$review_score, message = R$message)

#新增一個欄位來看有沒有在評價裡留言
R2$has_revs <- ifelse(is.na(R2$message),"no reviews","with reviews")

#畫圖來看個分數中有留言的情況
qplot(score, data = R2, fill = has_revs, binwidth = 0.5)

+ 由圖可知,多數評分為5分的消費者並沒有留言

#進一步查看
score5 <- R2 %>% select(has_revs) %>% filter(R2$score == 5) 
aa <- table(score5) %>% data.frame()
aa$Freq <- as.numeric(aa$Freq)
aa$avg <- aa$Freq / nrow(score5)

barplot(aa$avg,names.arg=aa$score5,main="評分為5中,留言的比率", col=c("purple","orange"), boder=TRUE,xlab="score", ylab="frequency", ylim=c(0, 1))

R22<-na.omit(R2)
#table(R22$score)%>%barplot()
table(is.na(R22$has_revs))#篩出有留言的評價

#apple <- R22 %>% group_by(date)  %>% mutate(avg_score = #mean(score)) %>%ungroup()
#apple$date <- format(apple$date,format="%Y-%m-%d")  

#tomato = apple %>% select(date, avg_score)
#tomato=unique(tomato) #清楚的篩出每天的平均分數

利用文字雲了解大家不滿意的原因

library(tidytext)
filter1<- R22 %>% 
filter(score == 1) %>%select(message)

#做斷詞
tidy_filter1 <- filter1 %>%
  unnest_tokens(word, message)

# 過濾特殊字元
well_1 = tidy_filter1 %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
  filter(!grepl("['^0-9']",word)) %>% # 去數字
  filter(nchar(.$word)>1)

#加入停用字
data(stop_words)
well_1 <- well_1 %>%
anti_join(stop_words)
畫出1分留言的常用字
well_1 %>%
  count(word, sort = TRUE) %>%
  filter(n > 500) %>%
  filter(!(word %in% c("product", "store", "receive", "paid", "bought","recommend")))%>%#不要的字
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word)) +
  geom_col() +
  labs(x = "出現次數超過500次", y = NULL)

##### 繪製成文字雲

library(wordcloud2)
well_1 %>%
  count(word, sort = TRUE) %>%
  filter(!(word %in% c("product", "store", "receive", "paid", "bought","recommend", "purchased", "forward")))%>%#不要的字
  filter(n > 50) %>% wordcloud2(size = 1,  color='random-light', backgroundColor="black", shape = "dimond")