#主題:關於olist-2015年成立的交易平台
load("0410_wenyu.rda")
# order = fread("olist_orders_dataset.csv") # 訂單與運送狀態
# review = fread("olist_order_reviews_dataset11.csv", encoding = "UTF-8") # 商品滿意度調查
# payment = fread("olist_order_payments_dataset.csv") # 付款方式
# item = fread("olist_order_items_dataset.csv") # 訂單價值
# customer = fread("olist_customers_dataset.csv") # 客戶資料
# product = fread("olist_products_dataset.csv") # 產品資訊
# seller = fread("olist_sellers_dataset.csv") # 賣家資訊
# geolocation = fread("olist_geolocation_dataset.csv") # 巴西郵遞區號與地理位置
# name = fread("product_category_name_translation.csv") # 英文名
#
# # 整理data
# # 依照orderID
# df =merge(order, review, by = "order_id", all = TRUE)
# df =merge(df,payment, by = "order_id", all = TRUE)
# df =merge(df,item, by = "order_id", all = TRUE)
# # 依照customerID
# df =merge(df,customer, by = "customer_id", all = TRUE)
# # 依照sellerID
# df =merge(df,seller, by = "seller_id", all = TRUE)
# # 依照productID
# df =merge(df,product, by = "product_id", all = TRUE)
# # 依照product_category_name
# df =merge(df,name, by = "product_category_name", x.all = TRUE)
# # 剩餘zipcode之後再做比較
# 篩掉同一個訂單
o=df[!duplicated(df$order_id), ]
總結:數量跟金額上整體有逐漸上升的狀態,於2017年大幅成長,2018平緩化。除了因剛成立,處成長期而有上升的狀態,另外此平台也會因為特別活動而有突出的訂單數(e.g. 2017/11/24(六) 數量飆高,因為11/23是黑色星期五、2018五月初數量上升可能是因母親節節日)
# table(df$order_purchase_timestamp) %>% sort # 以月分劃分是最洽當的
ts=as.POSIXct(o$order_purchase_timestamp, format="%Y-%m-%d %H:%M:%S")
par(cex=0.8)
hist(ts,"month",las=2,freq=T,xlab="",main ="各月份訂單數",col ="#FF7744")
summary(o$order_id) # 總共97256筆
## Length Class Mode
## 97256 character character
# 細分
o$date= as.Date(o$order_purchase_timestamp,format="%Y-%m-%d")
lim <- as.Date(c("2017-01-01","2018-09-01"))
qplot(data=o,
x=date,
geom = "bar",fill="")+
# fill = customer_city) + # 依地區分
labs(x = "日期",y = "數量") +
scale_x_date(date_breaks="months",limits = lim) +
ggtitle("細分訂單數")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')+
theme(axis.text.x = element_text(angle = -60))
## Warning: Removed 311 rows containing non-finite values (stat_count).
# 篩掉「訂單編號且訂單商品數一樣」的,避免同一商品因付款方式不同重複計算;最後再加總
cash = df %>% select(order_id,order_item_id,price,order_purchase_timestamp) %>% as.data.frame
cash1 =cash %>%
group_by(order_id,order_item_id) %>%
filter(row_number() == 1) %>%
ungroup()
cash2 = aggregate(price ~ order_id, cash1, sum)
cash2 =merge(cash2 ,cash1, by =c( "order_id"), all.x = TRUE)
cash2$date= as.Date(cash2$order_purchase_timestamp,format="%Y-%m-%d")
setDT(cash2)[, Month_Yr := format(as.Date(order_purchase_timestamp), "%Y-%m") ]
cash3 = aggregate(price.x ~ Month_Yr, cash2, sum)
# 視覺化
ggplot(cash3, aes(x = Month_Yr, y = price.x)) +
geom_point(color = '#FF7744') +
scale_y_continuous(labels = comma)+
#geom_text_repel(aes(label=price.x)) +
labs(x = "年月份",y = "銷售金額")+
ggtitle("總銷售金額趨勢")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')+
theme(axis.text.x = element_text(angle = -60))
總結:每次購買商品數量只買一項,客單價低,但是有許多離群值,因此可以依每個客戶潛力來做行銷,另外從其熱門商品來看,其潛力產品為健康保養(持續上升),夕陽產品則是3c產品[這邊可能要參考冠綾做的寫]。
此資料中最高是一個訂單中購買了21個商品,但大多仍是購買少量商品為主(90%的訂單都只單純購買一樣商品)。
pnumber = df %>% select(order_id,order_item_id) %>% as.data.frame
pnumber1 =pnumber %>%
group_by(order_id) %>%
arrange(order_id,desc(order_item_id)) %>%
filter(row_number() == 1) %>%
ungroup()
pnumber1 %>%
ggplot(aes(x=order_item_id,fill=""))+
geom_bar(aes(y = ..prop.., stat="count"))+
scale_y_continuous(breaks=seq(0,1,0.1),limits =c(0,1))+
labs(x = "產品銷售數量",y = "訂單數量百分比")+
ggtitle("單筆訂單中購買的產品數量")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')
## Warning: Ignoring unknown aesthetics: stat
以售出數量2000以上計算(前18名)分别為寢具用品、健康保養、運動休閒用品、家具及裝飾品、電腦配件、家居用品、手錶禮物、電話、園藝工具、汽車、玩具、coolstuff、香水、嬰兒用品、電器用品、文具、時尚包包配件、寵物用品;其較熱門的商品,反倒不像其他電商平台(銷售服飾雜貨等)。
df$product_category_name_english %>% table %>% sort %>%
as.data.frame %>%
filter(Freq>2000) %>%
ggplot(aes(x=.,y=Freq,fill=""))+
geom_bar(stat="identity")+
labs(x = "產品名稱",y = "售出數量")+
ggtitle("商品熱門排行")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')+
theme(axis.text.x = element_text(angle = -70))
以每筆訂單去做的單次購買商品的金額:其客單價(平均數)為137.85元巴西幣(約1097.5台幣),超過50%的人單筆花費(不含運費)少於100巴西幣(約800台幣),其實不高;但因為是購物平台,有各種不同產品,所以在整體價格上也會有很大的離群值(中位數86.8元巴西幣,約700元台幣),因此若在將來要實施行銷方案,可能必須先以各客戶的消費能力來作不同的營銷。
# 篩掉「訂單編號且訂單商品數一樣」的,避免同一商品因付款方式不同重複計算;最後再加總
price = df %>% select(order_id,order_item_id,price) %>% as.data.frame
price1 =price %>%
group_by(order_id,order_item_id) %>%
filter(row_number() == 1) %>%
ungroup()
price2 = aggregate(price ~ order_id, price1, sum) %>% as.data.frame
# 直方圖
ggplot(price2, aes(x=price,fill="")) +
geom_histogram(aes(y = (..count..)/sum(..count..)),binwidth=50)+
scale_x_continuous(breaks=seq(0,2000,100),limits =c(0,2000))+
scale_y_continuous(labels = percent)+
labs(x = "商品消費金額(巴西幣)",y = "數量/總數量")+
ggtitle("單筆消費商品金額(不含運費)")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')+
theme(axis.text.x = element_text(angle = -70))
## Warning: Removed 142 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
# 盒狀圖
ggplot(price2, aes(x =1, y = price,fill="")) +
geom_boxplot( aes( y = price),binwidth=50)+
scale_y_continuous(breaks=seq(0,1000,100),limits =c(0,1000))+
labs(x = "整個平台的訂單",y = "總消費金額(不含運費)")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')
## Warning: Ignoring unknown parameters: binwidth
## Warning: Removed 929 rows containing non-finite values (stat_boxplot).
mean(price2$price) # 平均數 137.85
## [1] 137.8485
median(price2$price) # 中位數 86.8
## [1] 86.8
其含運費後平均提升至160.8元巴西幣(約1290台幣),中位數提升至105.28元(約850台幣)
# 會因為買的商品數量,重複計算到總金額;如果只篩訂單,又會把不同付款方式的各個金額篩掉
# 所以我們刪掉「訂單且付款方式一樣」的,並合併不同付款方式的各個金額變成各訂單總金額。
p =df %>%
group_by(order_id, payment_sequential) %>%
filter(row_number() == 1) %>%
ungroup()
pricecount = p %>% select(order_id,payment_value) %>% as.data.frame
pricecount1 = aggregate(payment_value ~ order_id, pricecount, sum) %>% as.data.frame
# 直方圖
ggplot(pricecount1, aes(x=payment_value,fill="")) +
geom_histogram(aes(y = (..count..)/sum(..count..)),binwidth=50)+
scale_x_continuous(breaks=seq(0,2000,100),limits =c(0,2000))+
scale_y_continuous(labels = percent)+
labs(x = "訂單總金額(巴西幣)",y = "數量/總數量")+
ggtitle("單筆消費總金額(含運費)")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')+
theme(axis.text.x = element_text(angle = -70))
## Warning: Removed 202 rows containing non-finite values (stat_bin).
## Warning: Removed 2 rows containing missing values (geom_bar).
# 盒狀圖
ggplot(pricecount1, aes(x =1, y = payment_value,fill="")) +
geom_boxplot( aes( y = payment_value),binwidth=50, outlier.alpha = 0.01)+
scale_y_continuous(breaks=seq(0,1000,100),limits =c(0,1000))+
labs(x = "整個平台的訂單",y = "總消費金額(含運費)")+
ggtitle("單筆消費總金額")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')
## Warning: Ignoring unknown parameters: binwidth
## Warning: Removed 1130 rows containing non-finite values (stat_boxplot).
mean(pricecount1$payment_value) # 平均數 160.801
## [1] 160.801
median(pricecount1$payment_value) # 中位數 105.28
## [1] 105.28
總結:此平台吸引的消費族群,多位於聖保羅州等巴西東南側地帶,偏好於平日白天活動時間(10~16點)購物,並習慣使用信用卡支付,是較為信任網路資安,且經濟能力還算穩定者;從顧客忠誠度來看,可以感受到這些消費者仍在嘗鮮、試水溫的階段。
前6名為SP,RJ,MG,RS,PR,SC(聚集在巴西東南方);聖保羅州特別多
t= table(o$customer_state) %>% sort %>% as.data.frame
t %>%
ggplot(aes(x=Var1,y=Freq,fill=""))+
geom_bar(stat="identity")+
labs(x = "地區",y = "消費者數量")+
ggtitle("消費者所在地區")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')
購物時間主要在周一到周五的10至16點,還有周一、二的20至22點,但主要為上班時間或上課時段;假日反而沒人網購(除了周日晚上)
table(format(ts,"%u"), format(ts,"%H")) %>%
as.data.frame.matrix %>%
d3heatmap(F,F,col=colorRamp(c('seagreen','lightyellow','red')))
忠誠度低,多數顧客只買一次,估計有可能是此平台才成立不久,正在擴展客源的階段;並且可能是因商場優惠(黑色星期五)湧入的客源。
### 篩掉同一次的購買,以免重覆計算
as.data.frame(table(o$customer_id)) %>%
ggplot(aes(x=Freq,fill=""))+
geom_histogram(aes(y = (..count..)/sum(..count..)),binwidth=1)+
scale_x_continuous(breaks=seq(0,10,1),limits =c(0,5))+
scale_y_continuous(labels = percent)+
labs(x = "購買次數",y = "客戶數百分比")+
ggtitle("每位顧客消費次數")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')
## Warning: Removed 2 rows containing missing values (geom_bar).
相較於流行於巴西本地的現金支付方法boleto,這些消費者以使用信用卡付款為大宗,佔了約3/4,可以知道他們對於資訊及個資上的安全是較為信任的,且有能力申辦信用卡,代表其經濟能力是穩定且不錯的。
as.data.frame(table(o$payment_type))%>%
ggplot+
geom_bar(aes(x=factor(1),
y=Freq,
fill=Var1),
stat = "identity",
position = 'fill') +
coord_polar("y", start=0)+
labs(x = "",y = "")+
ggtitle("習慣支付方式")+
scale_fill_discrete(name="支付方式",
breaks=c("boleto", "credit_card","debit_card","voucher"),
labels=c("boleto本地現金支付方式", "信用卡","金融卡","禮券,代金券"))+
scale_y_continuous(labels = percent_format())
總結:我們也發現除了產品是否符合預期,物流的速度也是此消費族群重視的地方。而物流影響評價,評價影響銷量;若能提升物流,以提高評價,並連動提高銷售量,將是此平台極大的機會。
平均4.09335分,以五分為最多,然此數據非呈現遞減,一分數量卻高於兩分和三分。
### 篩掉同一個reviewid
r=df[!duplicated(df$review_id), ]
ggplot(r,aes(x=review_score,fill=""))+
geom_bar()+
labs(x = "評分(/5分)",y = "人數")+
ggtitle("評價分數")+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')
mean(r$review_score)
## [1] 4.09335
可以發現銷售量最高的日子:黑色星期五後給出的評價卻不到母親節檔期過後的高,另外我們也可以發現每一天五分的數量跟其他分數比起來都是最多的。
###各分數時間趨勢
r$d= as.Date(r$review_answer_timestamp,format="%Y/%m/%d")
reviewperiod=as.data.frame(table(r$d,r$review_score)) %>%
as.data.frame
reviewperiod$tsd=as.Date(reviewperiod$Var1, format="%Y-%m-%d")
reviewperiod %>%
ggplot(aes(x=tsd,y=Freq,color =Var2))+
geom_line()+
labs(x = "日期",y = "數量")+
ggtitle("評價分數趨勢")+
scale_x_date(date_breaks="month",limits = lim) +
scale_color_discrete(name="給分")+
theme_economist() +
theme(axis.text.x = element_text(angle = -60))
## Warning: Removed 565 rows containing missing values (geom_path).
跟在期限內收到商品之間可能有關係
produto 產品/ de介係詞 /não不是/ prazo 交貨/ muito 非常/ entrega 交貨/ antes首先/chegou 到達/recebi 收到了/bom 良好 /
reviewword = as.data.frame(select(r,review_id,review_score,review_comment_title,review_comment_message))
colnames(reviewword) = c("id","score","title","message")
reviewword <- reviewword %>% # 刪除空行
filter(message!=""|!is.na(message))
reviewword1 <- reviewword %>% # 切割(尚未加入停用字)
ungroup() %>%
unnest_tokens(word, message) %>%
filter(!word %in% stop_words$word)
### 文字雲:
count <- reviewword1 %>%
anti_join(stop_words) %>%
count(word) %>%
arrange(desc(n))
## Joining, by = "word"
count %>% wordcloud2
count
## # A tibble: 17,013 x 2
## word n
## <chr> <int>
## 1 produto 18008
## 2 de 11267
## 3 nao 10559
## 4 prazo 8265
## 5 muito 7836
## 6 entrega 6441
## 7 antes 5533
## 8 chegou 5474
## 9 recebi 5168
## 10 bom 4520
## # ... with 17,003 more rows
### 計算詞/句頻
n5 =reviewword %>%
ungroup() %>%
unnest_tokens(figram, message, token = "ngrams", n = 5) %>%
separate(figram, c("word1", "word2", "word3", "word4", "word5" ), sep = " ",na.rm = T) %>%
count(word1, word2, word3, word4,word5, sort = TRUE)
head(n5,20)
## # A tibble: 20 x 6
## word1 word2 word3 word4 word5 n
## <chr> <chr> <chr> <chr> <chr> <int>
## 1 <NA> <NA> <NA> <NA> <NA> 66858
## 2 produto chegou antes do prazo 298
## 3 chegou bem antes do prazo 278
## 4 produto entregue antes do prazo 203
## 5 ainda nao recebi o produto 183
## 6 o produto chegou antes do 167
## 7 foi entregue antes do prazo 145
## 8 o produto nao foi entregue 122
## 9 chegou antes do prazo e 121
## 10 entregue bem antes do prazo 101
## 11 e chegou antes do prazo 93
## 12 produto chegou bem antes do 93
## 13 o produto antes do prazo 87
## 14 ate o momento nao recebi 84
## 15 nao recebi o produto e 84
## 16 recebi o produto antes do 84
## 17 produto entregue dentro do prazo 83
## 18 chegou antes do prazo previsto 77
## 19 o produto chegou no prazo 77
## 20 produto chegou dentro do prazo 75
# 產品在截止日期前到達
# 在截止日期前到達
# 產品在截止日期前交付
# 我沒有收到產品
# 產品到貨之前到了
# 在截止日期前交付
# 產品未送達
# 在截止日期前抵達
# 在截止日期前交付
此平台有約90%的會準時到貨,平均物流抵達時間為12.5天內抵達居多,最長高達210天,最短一天不到,但光靠圖表,我們無法明顯看出兩者之間的關聯;我們使用相關性和回歸檢定,才發現呈現低度負相關,且回歸檢定上也是顯著的(指物流花越久,分數越低;物流越快,分數越高)。
# 物流準時時間
tspre=as.POSIXct(o$order_estimated_delivery_date, format="%Y-%m-%d %H:%M:%S")
tsaft=as.POSIXct(o$order_delivered_customer_date, format="%Y-%m-%d %H:%M:%S")
as.data.frame(table(tsaft > tspre))%>%
ggplot+
geom_bar(aes(x=factor(1),
y=Freq,
fill=Var1),
stat = "identity",
position = 'fill') +
labs(x = "",y = "")+
ggtitle("物流準時程度")+
coord_polar("y", start=0)+
scale_fill_discrete(name="到貨期",
breaks=c("FALSE", "TRUE"),
labels=c("準時期限內", "遲交貨"))+
theme_economist() +
scale_color_economist() +
scale_y_continuous(labels = percent_format())
# 關係
tsr1=as.POSIXct(r$order_purchase_timestamp, format="%Y-%m-%d %H:%M:%S")
tsr2=as.POSIXct(r$order_delivered_customer_date, format="%Y-%m-%d %H:%M:%S")
r = mutate(r,spendtime=difftime(tsr2,tsr1, units="day") ) #天數
# 視覺化
r %>%
ggplot(aes(review_score, spendtime)) +
geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity",color="#FF7744") +
labs(x = "評分(/5分)",y = "物流等待(天數)")+
ggtitle("物流跟評分")+
scale_y_continuous(labels = comma)+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')
## Warning: Removed 2122 rows containing missing values (geom_point).
summary(as.numeric(r$spendtime))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.5334 6.7635 10.2109 12.5501 15.7053 209.6286 2122
# 相關
cor.test(r$review_score, as.numeric(r$spendtime))
##
## Pearson's product-moment correlation
##
## data: r$review_score and as.numeric(r$spendtime)
## t = -109.76, df = 94930, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.3412025 -0.3299126
## sample estimates:
## cor
## -0.3355696
# 回歸
model <- lm(formula= review_score ~ spendtime,
data=r)
summary(model)
##
## Call:
## lm(formula = review_score ~ spendtime, data = r)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.6771 -0.4126 0.5124 0.7619 8.8013
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.7163697 0.0065356 721.6 <2e-16 ***
## spendtime -0.0454997 0.0004146 -109.8 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.219 on 94930 degrees of freedom
## (2122 observations deleted due to missingness)
## Multiple R-squared: 0.1126, Adjusted R-squared: 0.1126
## F-statistic: 1.205e+04 on 1 and 94930 DF, p-value: < 2.2e-16
評分五分的數量與銷售量之間呈現中度正相關,且其線性回歸顯著,代表當五分的評分數上升時,銷售量也會跟著提升。
r$o= as.Date(r$order_purchase_timestamp,format="%Y-%m-%d")
ro =as.data.frame(table(r$o))
reviewperiod1 = reviewperiod %>% filter(Var2 ==5) # 篩出五分
ro =merge(ro,reviewperiod1, by = "Var1", x.all = TRUE)
colnames(ro)=c("時間","sales","分數","評5分數","時間1")
ro %>%
ggplot(aes(sales,評5分數)) +
geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity",color="#FF7744",size=1) +
labs(x = "5分的評分數",y = "銷售量")+
ggtitle("產品銷售量與評分")+
scale_y_continuous(labels = comma)+
theme_economist() +
scale_color_economist()+
theme(legend.position='none')
cor.test(ro$sales, ro$評5分數)
##
## Pearson's product-moment correlation
##
## data: ro$sales and ro$評5分數
## t = 15.724, df = 595, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.4825511 0.5961422
## sample estimates:
## cor
## 0.541816
電商平台因剛成立,處成長期而有上升的狀態,也會因為特別活動而有突出的訂單數,因此若能在特別節日中,舉辦相關活動,並且抓住成長期的大量顧客,則可以脫隱而出。
訂單中購買商品數量只買一項,且客單價低,但是有許多離群值,因此須依每個客戶潛力(如:將之分為高、低消費族群),做適合的行銷;另外也可依商品在不同地區、時間的熱門程度來做營銷。
此平台當前吸引的消費族群,多位於聖保羅州等巴西東南側地帶,偏好於平日白天活動時間(10~16點)購物,並習慣使用信用卡支付,是較為信任網路資安,且經濟能力還算穩定者;從顧客忠誠度來看,可以感受到這些消費者仍在嘗鮮、試水溫的階段。
從評論文字探索,除了產品是否符合預期,物流的速度也是此消費族群重視的地方。而物流影響評價,評價影響銷量;若能提升物流,以提高評價,並連動提高銷售量,將是此平台極大的機會。
因為透過單看各分數的文字雲無法看出實際問題,所以使用tf-idf
評分為1:insatisfeito不滿/ tento試圖 /parcela一部分 /decepção騙局 /enganosa 誤導/entrar 聯繫,進入/descaso 忽視
評分為2:tento試圖/entrar 聯繫,進入/flex靈活性/moist 潮濕/Pneu Maggion aro 某產品名稱 /esquerdo左
評分為3:identidade身分/taxas利率/pentes梳子/ tirando服用/apto適合/automotivo汽車/presilhas 緊/
評分為4:ótimo 美好/rapida很快/direitinho恰到好處/benefícios好處/baixar降低/Tirando 除此 評分為5:ótimo 美好/rapida很快/direitinho恰到好處/adorou喜歡/excelentes優點/superou超出預期/parabens恭喜
小結:估計低分是因為覺得被騙被誤導,但是聯絡賣家又被忽視;有一些詞是直接跟產品有關,像是2分出現的某輪胎牌子
### 每區間總共字數
reviewword2 =reviewword1 %>%
count(score, word, sort = TRUE)
total_words <- reviewword2 %>%
group_by(score) %>%
summarize(total = sum(n))
t_words <- left_join(reviewword2, total_words)
## Joining, by = "score"
t_words
## # A tibble: 31,725 x 4
## score word n total
## <int> <chr> <int> <int>
## 1 5 produto 7963 142573
## 2 1 nao 6008 125654
## 3 5 prazo 5569 142573
## 4 5 muito 5296 142573
## 5 1 produto 4861 125654
## 6 5 antes 4452 142573
## 7 1 de 3974 125654
## 8 5 de 3729 142573
## 9 5 entrega 3628 142573
## 10 5 recomendo 3227 142573
## # ... with 31,715 more rows
###找出重要的單詞(專有詞)
t_words <-t_words %>%
bind_tf_idf(word,score, n)
t_words
## # A tibble: 31,725 x 7
## score word n total tf idf tf_idf
## <int> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 5 produto 7963 142573 0.0559 0 0
## 2 1 nao 6008 125654 0.0478 0 0
## 3 5 prazo 5569 142573 0.0391 0 0
## 4 5 muito 5296 142573 0.0371 0 0
## 5 1 produto 4861 125654 0.0387 0 0
## 6 5 antes 4452 142573 0.0312 0 0
## 7 1 de 3974 125654 0.0316 0 0
## 8 5 de 3729 142573 0.0262 0 0
## 9 5 entrega 3628 142573 0.0254 0 0
## 10 5 recomendo 3227 142573 0.0226 0 0
## # ... with 31,715 more rows
### 計算出tf-idf
t_words %>%
select(-total) %>%
arrange(desc(tf_idf))
## # A tibble: 31,725 x 6
## score word n tf idf tf_idf
## <int> <chr> <int> <dbl> <dbl> <dbl>
## 1 5 otima 201 0.00141 0.223 0.000315
## 2 5 rapida 195 0.00137 0.223 0.000305
## 3 5 direitinho 73 0.000512 0.511 0.000262
## 4 5 adorou 70 0.000491 0.511 0.000251
## 5 5 excelentes 38 0.000267 0.916 0.000244
## 6 5 superou 143 0.00100 0.223 0.000224
## 7 4 direitinho 22 0.000423 0.511 0.000216
## 8 2 aussie 4 0.000131 1.61 0.000211
## 9 5 parabens 127 0.000891 0.223 0.000199
## 10 5 agradecer 29 0.000203 0.916 0.000186
## # ... with 31,715 more rows
### 結果視覺化
t_words %>%
arrange(desc(tf_idf)) %>%
mutate(word = factor(word, levels = rev(unique(word)))) %>%
group_by(score) %>%
top_n(7) %>%
ungroup() %>%
ggplot(aes(word, tf_idf, fill =score)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = "tf-idf") +
facet_wrap(~score, ncol = 2, scales = "free") +
coord_flip()
## Selecting by tf_idf
小結:極低度相關
### 產品照片數與評分的關係
cor(r$review_score, r$product_photos_qty)
## [1] 0.01435297
cor.test(r$review_score, r$product_photos_qty)
##
## Pearson's product-moment correlation
##
## data: r$review_score and r$product_photos_qty
## t = 4.4719, df = 97052, p-value = 7.763e-06
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.008062374 0.020642439
## sample estimates:
## cor
## 0.01435297
r %>%
ggplot(aes(review_score, product_photos_qty,group=review_score)) +
geom_boxplot( width = .5, outlier.alpha = 0.01)+
geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity") +
labs(x = "評分(/5分)",y = "產品照片數量")+
ggtitle("產品照片數與評分")+
scale_y_continuous(labels = comma)
### 產品簡介字數與評分的關係
cor(r$review_score, r$product_description_lenght)
## [1] 0.01006433
cor.test(r$review_score, r$product_description_lenght)
##
## Pearson's product-moment correlation
##
## data: r$review_score and r$product_description_lenght
## t = 3.1355, df = 97052, p-value = 0.001716
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.003773238 0.016354620
## sample estimates:
## cor
## 0.01006433
r %>%
ggplot(aes(review_score, product_description_lenght,group=review_score)) +
geom_boxplot( width = .5, outlier.alpha = 0.01)+
geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity") +
labs(x = "評分(/5分)",y = "產品簡介字數")+
ggtitle("產品簡介字數與評分")+
scale_y_continuous(labels = comma)
小結:基本上關聯不大
cor(r$review_score, r$freight_value)
## [1] -0.03853995
cor.test(r$review_score, r$freight_value)
##
## Pearson's product-moment correlation
##
## data: r$review_score and r$freight_value
## t = -12.015, df = 97052, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.04482041 -0.03225644
## sample estimates:
## cor
## -0.03853995
r %>%
ggplot(aes(review_score,freight_value)) +
geom_point(mapping = NULL, data = NULL, stat = "identity", position = "identity") +
labs(x = "評分(/5分)",y = "運費")+
ggtitle("運費與評分")+
scale_y_continuous(labels = comma)