pacman::p_load(plotly,ggplot2,stringr,tidyr,dplyr,googleVis)
load("data/olist.rdata")A = left_join(P[,1:2],TPC) %>%
select(product_id, category=product_category_name_english) %>%
right_join(I) %>%
left_join(S[,c(1,4)]) %>%
left_join(O) %>%
left_join(C[,c(1,5)]) %>%
rename(from=seller_state, to=customer_state)
A = group_by(R, order_id) %>% summarise(score = review_score[1]) %>%
right_join(A)A = A %>% mutate(
order_confirm = difftime(order_approved_at, order_purchase_timestamp, units="days"),
s_to_t = difftime(order_delivered_carrier_date, order_approved_at, units="days"),
t_to_c = difftime(order_delivered_customer_date, order_delivered_carrier_date, units="days"),
t.total = order_confirm+s_to_t+t_to_c,
t.estimate = difftime(order_estimated_delivery_date, order_purchase_timestamp, units="days"),
t.delay = t.total - t.estimate,
t.limit = difftime(shipping_limit_date, order_purchase_timestamp, units="days")
) %>%
mutate_at(vars(order_confirm:t.limit), as.numeric)table(A$category) %>% sort() %>% tail(20) %>% names -> top20
A %>% filter(category %in% top20) %>%
group_by(category) %>%
summarise_at(vars(order_confirm:t_to_c), mean, na.rm=T) %>%
gather("stage","days",order_confirm:t_to_c) %>%
mutate(stage = factor(stage, c('t_to_c','s_to_t','order_confirm'))) %>%
ggplot(aes(x=category, y=days, fill=stage )) +
geom_bar(stat="Identity") +
coord_flip()par(cex=0.7, mar=c(5,12,4,2))
boxplot(t.total~category, subset(A, category %in% top20), horizontal=T, las=2,
main="Delivery Timespan (days) per Category", xlab="days")a = A %>% filter(category %in% top20, category!="office_furniture") %>%
mutate(ym = as.Date(format(order_delivered_customer_date, "%Y-%m-15")) ) %>%
filter(ym>=as.Date("2017-03-15"), ym<=as.Date("2018-08-15"))a %>% group_by(ym) %>% summarise(
percetage_of_delay = 100*mean(t.delay > 0, na.rm=T),
std_score = mean(score, na.rm=T)
) %>%
mutate_at(vars(std_score), ~ 2*(.-mean(.))/sd(.)) %>%
gather("metrics","value",-ym) %>%
ggplot(aes(x=ym, y=value, col=metrics)) +
geom_point() + geom_line() +
ggtitle("評分與物流延遲之關係圖") + xlab("Date") +
scale_x_date(date_breaks="2 months", date_labels="%Y-%m")-> p
ggplotly(p)#載入套件
library(dplyr)
library(stringr)
library(ggplot2)
#載入data
load("data/R.rdata")#挑出所需欄位進行進一步的處理
R2 <- data.frame(date = R$review_creation_date, score = R$review_score, message = R$message)
#新增一個欄位來看有沒有在評價裡留言
R2$has_revs <- ifelse(is.na(R2$message),"no reviews","with reviews")
#畫圖來看個分數中有留言的情況
qplot(score, data = R2, fill = has_revs, binwidth = 0.5) + 由圖可知,多數評分為5分的消費者並沒有留言
#進一步查看
score5 <- R2 %>% select(has_revs) %>% filter(R2$score == 5)
aa <- table(score5) %>% data.frame()
aa$Freq <- as.numeric(aa$Freq)
aa$avg <- aa$Freq / nrow(score5)
barplot(aa$avg,names.arg=aa$score5,main="評分為5中,留言的比率", col=c("purple","orange"), boder=TRUE,xlab="score", ylab="frequency", ylim=c(0, 1))R22<-na.omit(R2)
#table(R22$score)%>%barplot()
table(is.na(R22$has_revs))#篩出有留言的評價
#apple <- R22 %>% group_by(date) %>% mutate(avg_score = #mean(score)) %>%ungroup()
#apple$date <- format(apple$date,format="%Y-%m-%d")
#tomato = apple %>% select(date, avg_score)
#tomato=unique(tomato) #清楚的篩出每天的平均分數library(tidytext)
filter1<- R22 %>%
filter(score == 1) %>%select(message)
#做斷詞
tidy_filter1 <- filter1 %>%
unnest_tokens(word, message)
# 過濾特殊字元
well_1 = tidy_filter1 %>%
filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
filter(!grepl("['^0-9']",word)) %>% # 去數字
filter(nchar(.$word)>1)
#加入停用字
data(stop_words)
well_1 <- well_1 %>%
anti_join(stop_words)well_1 %>%
count(word, sort = TRUE) %>%
filter(n > 500) %>%
filter(!(word %in% c("product", "store", "receive", "paid", "bought","recommend")))%>%#不要的字
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word)) +
geom_col() +
labs(x = "出現次數超過500次", y = NULL) ##### 繪製成文字雲
library(wordcloud2)
well_1 %>%
count(word, sort = TRUE) %>%
filter(!(word %in% c("product", "store", "receive", "paid", "bought","recommend", "purchased", "forward")))%>%#不要的字
filter(n > 50) %>% wordcloud2(size = 1, color='random-light', backgroundColor="black", shape = "dimond")