當我們欲發展自己的彩妝品牌,則了解市場利基以及特性就成為相當重要的課題,因此我們欲以口紅的評論作為資料探勘的標的來協助了解市場發展以及品牌的重要性,所以挑選分析Amazon.com上留言的評論者對於唇膏的整體看法,並以不同的角度剖析評論,像是了解評論者的情緒起伏,或是評論當中所涵蓋的主題內容,評論字詞使用的相似程度,甚至是評論者與品牌、星等、價格之間的關聯性,最後透過模型預測什麼樣的評論有可能會進入銷售前十名。
packages = c(
"dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest","tidytext","wordcloud","topicmodels","doParallel","koRpus","quanteda")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Sys.setlocale("LC_ALL","C")
## [1] "LC_CTYPE=C;LC_NUMERIC=C;LC_TIME=C;LC_COLLATE=C;LC_MONETARY=C;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
#options(digits=5, scipen=10)
library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
library(wordcloud)
library(stringr)
library(tidytext)
library(ggplot2)
library(tidyr)
library(data.table)
library(readr)
library(topicmodels)
library(lexicon)
library(koRpus)
library(quanteda)
library(syuzhet)
library(corrplot)
library(slam)
library(doParallel)
library(Rtsne)
library(igraph)
library(ggraph)
library(ggrepel)
library(wordcloud)
library(RColorBrewer)
lipstick_detail=fread('./data/yuchia.lipstick_info_original.csv',na.strings=c("","NA"),stringsAsFactors = F)
lip_sub=fread('./data/yuchia.lipstick_info_suburl.csv',na.strings=c("","NA"),stringsAsFactors = F)
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
lipstick=fread('./data/yuchia.amazon_lipstick_review.csv',header = T,stringsAsFactors = F)
##因為評論有些欄位有中文資料,需事先做前處理
##刪除空白$換行
lipstick$availability<- gsub("\\n", "", lipstick$availability)%>%
gsub("^\\s+|\\s+$", "", .)%>%
gsub("[.]", "",.)
#找出沒有中文的欄位
lipstick_nochiness<-lipstick %>% filter(grepl('[a-zA-Z]',lipstick$availability))
lipstick_nochiness$star_sub<-substr(lipstick_nochiness$star,1,3)
##處理字串
lipstick_nochiness$availability<- gsub("^Only.*|^In.*|.*ship.*","In Stock",lipstick_nochiness$availability)%>%
gsub("^Available.*","Unavailable",.)
##找出有中文的欄位
lipstick_chiness<-lipstick %>% filter(!grepl('[a-zA-Z]',lipstick$availability))
##處理星等
lipstick_chiness$star_sub<-substr(lipstick_chiness$star,7,10)
##處理字串
lipstick_chiness$availability<- gsub("庫存僅剩\\s[0-9].*", "In Stock", lipstick_chiness$availability) %>%
gsub(".*[有]?現貨。", "In Stock",.)%>%
gsub("來自\\s這些暢銷品\\s有貨。", "Unavailable",.) %>%
gsub("暫時缺貨\\s,", "Unavailable",.)
##合併dataframe
lipstick_review<-rbind(lipstick_nochiness,lipstick_chiness)
colnames(lipstick_review)[which(names(lipstick_review) == "a")] <- "ASIN"
##Date:
Sys.setlocale("LC_TIME", "C")
## [1] "C"
lipstick_review$art_Date1=as.Date(lipstick_review$art_Date,format = "%B %d,%Y")
lipstick_review$Year=year(lipstick_review$art_Date1)
lipstick_review$Month=format(lipstick_review$art_Date1, "%m")
##Brand:
lipstick_review$brand=lipstick_review$brand %>% as.factor
brand=lipstick_review$brand %>% unique
##Star:
lipstick_review$author_star_sub=lipstick_review$author_star %>% substr(1,3)
lipstick_review$star_sub =lipstick_review$star_sub %>% as.numeric()
lipstick_review$author_star_sub =lipstick_review$author_star_sub %>% as.numeric()
#刪除重複紀錄
lipstick_review <-distinct(lipstick_review,art_Content, .keep_all= TRUE)
#去除多餘欄位
lipstick_review <- lipstick_review[,-c(6,10,11)]
## 刪除多餘的欄位
lipstick_detail <- lipstick_detail[,-c(22:45)]
## 與suburl合併
lipstick_detail<-rbind(lipstick_detail,lip_sub,fill=T)
## 刪除重複
lipstick_detail<-distinct(lipstick_detail,URL, .keep_all= TRUE)
## 整理產品星等
lipstick_detail$product_star_sub <- substr(lipstick_detail$Product_Star, 1, 3)
lipstick_detail$product_star_sub <- lipstick_detail$product_star_sub %>% as.numeric()
#唇膏整合排名by ASIN
lipstick_review$ASIN<-lapply(lipstick_review$art_Url,function(x){strsplit(x, "/")[[1]][6]})%>%as.character()
lip_combine<-lipstick_detail %>%
filter(Category1 %in% c('Beauty & Personal Care','Beauty & Peronal Care'))%>%
select(Rank1,Rank2,Category1,ASIN,Customer_Reviews)
lipstick_all<-lipstick_review%>%left_join(lip_combine,by=c("ASIN"="ASIN"))
lipstick_all$Rank1<-lipstick_all$Rank1%>% str_replace_all(",","") %>% as.numeric
lipstick_all$Rank2<-lipstick_all$Rank2%>% str_replace_all(",","") %>% as.numeric
##篩選有Rank1的資料
lipstick_all<-lipstick_all %>%
filter(is.na(Rank2)==FALSE)
##整合其他商品資料欄位
another_info <- select(lipstick_detail, product_star_sub, color, Price, Title, ASIN)
colnames(another_info)[4] <- "product_title"
lipstick_all <- left_join(lipstick_all, another_info, by = "ASIN")
#刪除評論內容有NA值的紀錄
lipstick_all <-na.omit(lipstick_all, cols="artContent")
head(lipstick_all, 1)
## _id
## 206 5c89267c923ce022a2d0a193
## art_Content
## 206 Look up a swatch of the shade you're looking at online before you buy it, the pictures here are not even close to the shade you're getting. I knew about this and ordered the shade I wanted and got that exact shade, I just wanted to make sure anyone reading reviews knows that the pictures on here are literally many, MANY shades away from what you're purchasing. Whipped Caviar is a gorgeous shade, definitely one of my favorite from NYX, I'm just saying the image shown for it is not accurate whatsoever. It is, however, a gorgeous color. Goes on a smooth matte and stays in place. I love it!
## art_Date
## 206 July 23, 2015
## art_Url
## 206 https://www.amazon.com/NYX-Matte-Lipstick-Perfect-Red/product-reviews/B005FYJD7M/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## author_name availability brand
## 206 Octane In Stock NYX PROFESSIONAL MAKEUP
## comment_title title
## 206 LOOK UP SWATCHES BEFORE YOU BUY\n NYX Matte Lipstick, Perfect Red
## star_sub art_Date1 Year Month author_star_sub ASIN Rank1 Rank2
## 206 3.7 2015-07-23 2015 07 5 B005FYJD7M 38102 300
## Category1 Customer_Reviews product_star_sub color
## 206 Beauty & Personal Care 2378 3.7 Perfect red
## Price product_title
## 206 5.85 NYX Matte Lipstick, Perfect Red
summary(lipstick_all$Rank2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4 139 333 1651 1910 56404
# 處理評論文字
lipstick_all$art_Content <- gsub("'", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[[:punct:]]", " ", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[[:cntrl:]]", " ",lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("^[[:space:]]+", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[[:space:]]+$", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[0-9]+", " ", lipstick_all$art_Content)
lipstick_all$art_Content<-iconv(lipstick_all$art_Content,"WINDOWS-1252","UTF-8")
lipstick_all$art_Content <- tolower(lipstick_all$art_Content)
# 處理商品標題
lipstick_all$product_title <- gsub("'", "", lipstick_all$product_title)
lipstick_all$product_title <- gsub("[[:punct:]]", " ", lipstick_all$product_title)
lipstick_all$product_title <- gsub("[[:cntrl:]]", " ",lipstick_all$product_title)
lipstick_all$product_title <- gsub("^[[:space:]]+", "", lipstick_all$product_title)
lipstick_all$product_title <- gsub("[[:space:]]+$", "", lipstick_all$product_title)
lipstick_all$product_title <- gsub("[0-9]+", " ", lipstick_all$product_title)
lipstick_all$product_title<-iconv(lipstick_all$product_title,"WINDOWS-1252","UTF-8")
lipstick_all$product_title <- tolower(lipstick_all$product_title)
#重新編主鍵
lipstick_all<-lipstick_all %>%
mutate(ID =rownames(lipstick_all))
#刪除舊有id
lipstick_all <- lipstick_all[,-1]
#對評論內容做斷詞
conbine_tidy<-lipstick_all%>%
mutate(linenumber =row_number())%>%
unnest_tokens(sentence,art_Content)
#對商品標題做斷詞
title_tidy<-lipstick_all%>%
mutate(linenumber =row_number())%>%
unnest_tokens(sentence,product_title)
NRC_VAD<-read.table("./data/NRC_VAD/NRC_VAD_Lexicon.txt",header = T,sep="\t",stringsAsFactors = F)
NRC<-get_sentiments("nrc")%>%as.data.frame()
NRC_all<-NRC_VAD%>%left_join(NRC,by=c("Word"="word"))
join2<- conbine_tidy %>%
inner_join(NRC_VAD,by=(c("sentence"="Word")))%>%
group_by(linenumber,ID)%>%
summarise(n=n(),
valence_avg=sum(Valence)/n,
arousal_avg=sum(Arousal)/n,
dominance_avg=sum(Dominance)/n)
#合併join2
lipstick_all<-lipstick_all%>%
left_join(join2,by="ID")
# 移除author_name,brand,star_sub,author_star_sub,Rank2,情緒欄位為NA的資料
lipstick_all_no_NA = na.omit(lipstick_all, cols = c("author_name","brand","star_sub","author_star_sub","Rank2", "valence_avg", "arousal_avg", "dominance_avg"))
sentiment_valence = rbind(
# valence_avg 最高的十個品牌
lipstick_all_no_NA %>%
#filter(brand %in% discuss_plot_df$brand) %>%
group_by(brand) %>%
summarise(valence_all_avg = mean(valence_avg)) %>%
arrange(desc(valence_all_avg)) %>%
head(10) %>%
ungroup() %>%
mutate(score = "high"),
# valence_avg 最低的十個品牌
lipstick_all_no_NA %>%
#filter(brand %in% discuss_plot_df$brand) %>%
group_by(brand) %>%
summarise(valence_all_avg = mean(valence_avg)) %>%
arrange(valence_all_avg) %>%
head(10) %>%
ungroup() %>%
mutate(score = "low")
)
#畫圖
sentiment_valence %>%
group_by(score) %>%
arrange(desc(valence_all_avg)) %>%
ungroup() %>%
ggplot(aes(reorder(brand, valence_all_avg), valence_all_avg, fill = score)) +
geom_col() +
xlab("Brand") +
ylab("Valence AVG") +
coord_flip()
sentiment_arousal = rbind(
# arousal_avg 最高的十個品牌
lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(arousal_all_avg = mean(arousal_avg)) %>%
arrange(desc(arousal_all_avg)) %>%
head(10) %>%
ungroup() %>%
mutate(score = "high"),
# arousal_avg 最低的十個品牌
lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(arousal_all_avg = mean(arousal_avg)) %>%
arrange(arousal_all_avg) %>%
head(10) %>%
ungroup() %>%
mutate(score = "low")
)
# 畫圖
sentiment_arousal %>%
group_by(score) %>%
arrange(desc(arousal_all_avg)) %>%
ungroup() %>%
ggplot(aes(reorder(brand, arousal_all_avg), arousal_all_avg, fill = score)) +
geom_col() +
xlab("Brand") +
ylab("Arousal AVG") +
coord_flip()
sentiment_dominance = rbind(
# dominance_avg 最高的十個品牌
lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(dominance_all_avg = mean(dominance_avg)) %>%
arrange(desc(dominance_all_avg)) %>%
head(10) %>%
ungroup() %>%
mutate(score = "high"),
# dominance_avg 最低的十個品牌
lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(dominance_all_avg = mean(dominance_avg)) %>%
arrange(dominance_all_avg) %>%
head(10) %>%
ungroup() %>%
mutate(score = "low")
)
#畫圖
sentiment_dominance %>%
group_by(score) %>%
arrange(desc(dominance_all_avg)) %>%
ungroup() %>%
ggplot(aes(reorder(brand, dominance_all_avg), dominance_all_avg, fill = score)) +
geom_col() +
xlab("Brand") +
ylab("Dominance AVG") +
coord_flip()
# 把valence與arousal維度最高10個與最低10個都有出現的品牌join一起
sentiment_join = sentiment_valence %>%
inner_join(sentiment_arousal, by = c(brand = "brand"))
sentiment_join$score.x = as.factor(sentiment_join$score.x)
sentiment_join$score.y = as.factor(sentiment_join$score.y)
sentiment_join %>%
ggplot(aes(score.x, score.y, label = brand)) +
geom_jitter(position = position_jitter(seed = 87)) +
xlab("Valence") +
ylab("Arousal") +
geom_vline(xintercept = 1.5) + # 在x軸中間加入線條以利區別點點位置
geom_hline(yintercept = 1.5) + # 在y軸中間加入線條以利區別點點位置
scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) + # 將x軸的刻度對調
scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) + # 將y軸的刻度對調
geom_label_repel(position = position_jitter(seed = 87), aes(label = brand), box.padding = 0.7)
# 把每個點點標上brand名稱
這張圖可以看出: 越靠右邊的品牌越帶給消費者正面情緒,反之,越左邊越負面;越靠上方的品牌代表消費者越外放的情緒,反之,越下方則越內縮。
1. 以Oriflame、ONE1X、K牌脣膏為例,消費者感受到正面的情緒,並且在評論中的敘述是比較想把好心得分享給大家
2. NOTE Cosmetics、Nykaa、A牌脣膏則是讓消費者產生負面情緒,而且消費者的評論也較積極表現出不滿的心情
3. Kryolan雖然讓消費者有負面情緒,但消費者的評論卻是用比較婉轉的方式說產品的缺點
lipstick_all_no_NA[lipstick_all_no_NA$brand == "Oriflame" | lipstick_all_no_NA$brand == "ONE1X", 1]
## [1] "its superb my new lip love"
Oriflame、ONE1X的評論表現出很喜歡該品牌,有愛不釋手的感覺
lipstick_all_no_NA[lipstick_all_no_NA$brand == "NOTE Cosmetics", 1]
## [1] "i would never ever buy again so disappointed very bad experience"
## [2] "didn t keep it because it was totally broken off when i opened package wouldn t buy agsin"
NOTE Cosmetics的評論很明顯唾棄這種品牌的脣膏,而且還非常不推薦給其他人
lipstick_all_no_NA[lipstick_all_no_NA$brand == "Kryolan", 1]
## [1] "it has quite a lot of wax smell in it"
Kryolan的評論沒有直接表現出對品牌的喜好,但委婉地說出產品的缺點
star_amount = lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(star = mean(star_sub)) %>%
ungroup()
sentiment_join_star = sentiment_join %>%
inner_join(star_amount, by = c(brand = "brand"))
sentiment_join_star %>%
ggplot(aes(score.x, score.y, label = brand)) +
geom_jitter(position = position_jitter(seed = 87), aes(size = star)) + # 星星數越高,點點越大
xlab("Valence") +
ylab("Arousal") +
geom_vline(xintercept = 1.5) + # 加入線條以利區別點點位置
geom_hline(yintercept = 1.5) + # 加入線條以利區別點點位置
scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) + # 將x軸排序對調
scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) + # 將y軸排序對調
geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7) # 把每個點點標上brand名稱
# 將同品牌lipstick的rank平均
rank_avg = lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(rank = mean(Rank2)) %>%
ungroup()
sentiment_join_rank = sentiment_join %>%
inner_join(rank_avg, by = c(brand = "brand"))
sentiment_join_rank %>%
ggplot(aes(score.x, score.y, label = brand)) +
geom_jitter(position = position_jitter(seed = 87), aes(size = -rank)) + # rank越前面,點點越大
xlab("Valence") +
ylab("Arousal") +
geom_vline(xintercept = 1.5) + # 加入線條以利區別點點位置
geom_hline(yintercept = 1.5) + # 加入線條以利區別點點位置
scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) + # 將x軸排序對調
scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) + # 將y軸排序對調
geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7) # 把每個點點標上brand名稱
# 將正負面分數最高與最低各10個牌子,連結該牌的平均銷售排行與平均獲得星星數
valence_rank_star = sentiment_valence %>%
inner_join(rank_avg, by = c(brand = "brand")) %>%
inner_join(star_amount, by = c(brand = "brand"))
valence_rank_star
## # A tibble: 20 x 5
## brand valence_all_avg score rank star
## <fct> <dbl> <chr> <dbl> <dbl>
## 1 Oriflame 0.860 high 44499 5
## 2 LCTCKP 0.817 high 44649 5
## 3 BORNTREE 0.796 high 28540 4
## 4 Stephanie Imports 0.783 high 6951 5
## 5 Nasty Woman Cosmetics 0.776 high 5208 5
## 6 Make Up For Ever 0.763 high 50366 4.8
## 7 IOPE 0.760 high 10295 4.7
## 8 Clarins 0.754 high 24797. 3.67
## 9 Splashes & Spills 0.753 high 2066 3.4
## 10 Femme Couture 0.753 high 10604 4.6
## 11 Beauty Glazed 0.329 low 9065 1
## 12 A'some 0.438 low 27233 2
## 13 Huda Beauty 0.484 low 4937 2.9
## 14 Kryolan 0.492 low 54303 1
## 15 Smoke & Mirrors 0.506 low 46511 1
## 16 MILEMEI 0.515 low 25234 1.33
## 17 NOTE Cosmetics 0.522 low 39886 1
## 18 Measurable Difference 0.529 low 28815 2
## 19 VDIVOV 0.532 low 15117 2.6
## 20 True Color 0.567 low 10458 2.9
valence_rank_star %>%
ggplot(aes(star, valence_all_avg, label = brand)) +
geom_point(aes(size = -rank, color = -rank)) + # 銷售排行越好,點點越大,顏色越深
#geom_text_repel(box.padding = 0.7, aes(size = 2)) +
geom_label_repel( box.padding = 0.7, size = 3) +
ylab("Sentiment Valence") +
xlab("Star Amount")
#LDA
conbine_document<-conbine_tidy %>%
anti_join(stop_words,by=c("sentence"="word")) %>%
count(linenumber, sentence, sort = TRUE) %>%
ungroup()
#轉成dtm
top10_dtm <- conbine_document %>%
cast_dtm(linenumber, sentence, n)
top10_dtm
## <<DocumentTermMatrix (documents: 59740, terms: 18587)>>
## Non-/sparse entries: 676002/1109711378
## Sparsity : 100%
## Maximal term length: NA
## Weighting : term frequency (tf)
lda_top10 <- LDA(top10_dtm, k = 2, control = list(seed = 1010))
lda_top10
## A LDA_VEM topic model with 2 topics.
#主題跟字的關係
document_topics <- tidy(lda_top10, matrix = "beta")
document_topics
## # A tibble: 37,174 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 red 0.0103
## 2 2 red 0.00564
## 3 1 organic 0.000536
## 4 2 organic 0.000190
## 5 1 lipstick 0.000670
## 6 2 lipstick 0.0620
## 7 1 color 0.00674
## 8 2 color 0.111
## 9 1 product 0.0224
## 10 2 product 0.00831
## # … with 37,164 more rows
top_terms <- document_topics %>%
filter(!term %in% c("lipstick","color","love","lips","product","doe snt","lip","lipsticks"))%>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms
## # A tibble: 40 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 colors 0.0120
## 2 1 day 0.0107
## 3 1 shade 0.0107
## 4 1 red 0.0103
## 5 1 time 0.0102
## 6 1 pretty 0.00951
## 7 1 im 0.00946
## 8 1 doesnt 0.00938
## 9 1 buy 0.00920
## 10 1 beautiful 0.00865
## # … with 30 more rows
#畫圖呈現主題跟字的關係
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
beta_spread <- document_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 219 x 4
## term topic1 topic2 log_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 absolutely 0.00294 0.000617 -2.25
## 2 add 0.00152 0.000174 -3.12
## 3 amazing 0.00344 0.00202 -0.769
## 4 amazon 0.00324 0.00106 -1.61
## 5 application 0.00180 0.00253 0.496
## 6 applicator 0.00112 0.000365 -1.61
## 7 applied 0.000808 0.00349 2.11
## 8 apply 0.00598 0.00601 0.00729
## 9 applying 0.00305 0.000647 -2.24
## 10 arrived 0.00136 0.00103 -0.396
## # … with 209 more rows
beta_spread%>%
arrange(desc(abs(log_ratio))) %>%
head(20)%>%
mutate(term = reorder(term, log_ratio)) %>%
ggplot(aes(term, log_ratio, fill = log_ratio>0)) +
geom_col(show.legend = FALSE) +
xlab("term") +
ylab("log(topic2 / topic1)") +
coord_flip()
#以品牌被討論的次數來找出前十大品牌
brand_discuss_df=lipstick_review %>%
group_by(.,brand,Year) %>%
summarise(
discuss_n=n()
)%>% filter(Year>2009)
brand_discuss_df %>%
group_by(.,brand) %>%
summarise(
discuss_sum=sum(discuss_n)
) ->top10_df
#top10品牌討論度by年分
brand_discuss_df$brand %in%
top10_df [order(top10_df$discuss_sum,decreasing = T),]$brand[1:10] %>%
subset(brand_discuss_df,.)->discuss_plot_df
discuss_plot_df<- subset(discuss_plot_df,Year!="NA")
ggplot(discuss_plot_df, aes(x = Year, y =discuss_n,color=brand)) +
geom_line() + geom_point( fill = "white")+scale_x_continuous(breaks = c(2009:2019))+scale_y_continuous(breaks=c(0,1000,2000,3000,4000,5000,6000,7000,8000,9000))
brand_product_discuss_df=lipstick_review %>%
group_by(.,brand, ASIN,Year) %>%
summarise(
discuss_n=n()
)%>% filter(Year>2009)
brand_product_discuss_df %>%
na.omit()%>%
filter(Year==2018, brand%in% top10_df$brand)%>%
arrange(desc(discuss_n))
## # A tibble: 1,340 x 4
## # Groups: brand, ASIN [1,340]
## brand ASIN Year discuss_n
## <fct> <chr> <int> <int>
## 1 Maybelline New York B06XF16MWM 2018 1226
## 2 Lime Crime B00JH2B182 2018 487
## 3 L'Oreal Paris B004BJ6VNG 2018 427
## 4 Maybelline New York B00467B0KC 2018 424
## 5 Burt's Bees B014VZUH1G 2018 337
## 6 COVERGIRL B00J2AP9OG 2018 335
## 7 LipSense B00HD5NRSC 2018 313
## 8 Revlon B001P2JZBC 2018 290
## 9 Revlon B000V5OQNG 2018 265
## 10 Maybelline New York B07BM1XTKH 2018 253
## # … with 1,330 more rows
#列出所有品牌在2018所出的產品
lipstick_all %>%
filter(Year == 2018) %>%
select(ASIN, Rank2, Price, color) %>%
unique() %>%
arrange(-desc(Rank2)) %>%
head(20)
## ASIN Rank2 Price color
## 1 B001P2JZBC 4 6.99 Bare Maximum
## 2 B00XQ9IRP2 6 19.93 Frog Prince
## 3 B00J2AP9OG 12 3.99 Clear
## 4 B00J2AP9OG 13 3.99 Clear
## 5 B014VZUH1G 14 6.03 Blush Basin
## 6 B00LNQB8AQ 16 9.99 Jennifer's Nude
## 7 B0032RMX3U 23 21.9 Black Honey 06
## 8 B00IOWYHJ6 40 6.85 1
## 9 B00G3DWC54 55 6.4 MATTE
## 10 B00VAY4AJM 57 7.52 Redwood Forest
## 11 B078VNH5MR 60 14.98 A
## 12 B071VCF42M 65 4.59 Rebel Rose
## 13 B018WCT05I 68 3.84 Violet Red
## 14 B01DPA81MK 75 4.98 Clear Vinyl
## 15 B00BG14WOM 78 4.97 Pink Frot
## 16 B07JFNVP61 83 12.99 A
## 17 B000V5OQNG 88 6.93 NUDE
## 18 B019YUECNC 92 3.89 Budapest
## 19 B01GH0421Y 110 3.59 1 Count
## 20 B00BR0DAG8 139 18 Single
#以bi-gram做斷詞
lipstick_bigrams <- lipstick_all%>%
mutate(linenumber =row_number())%>%
unnest_tokens(bigram,art_Content, token = "ngrams", n = 2)
lipstick_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 266,278 x 2
## bigram nn
## <chr> <int>
## 1 the color 14986
## 2 it is 7990
## 3 i love 7897
## 4 my lips 7722
## 5 this is 6569
## 6 i have 6534
## 7 is a 6215
## 8 color is 6045
## 9 this lipstick 5674
## 10 love this 5388
## # … with 266,268 more rows
bigrams_separated <- lipstick_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 71,279 x 3
## word1 word2 nn
## <chr> <chr> <int>
## 1 lip color 2494
## 2 skin tone 1253
## 3 lip gloss 1157
## 4 lip balm 1088
## 5 beautiful color 1026
## 6 love love 913
## 7 staying power 903
## 8 nice color 893
## 9 lime crime 798
## 10 matte lipstick 767
## # … with 71,269 more rows
lipstick_all %>%
unnest_tokens(trigram, art_Content, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
## # A tibble: 44,860 x 4
## word1 word2 word3 nn
## <chr> <chr> <chr> <int>
## 1 <NA> <NA> <NA> 1922
## 2 love love love 414
## 3 natural lip color 271
## 4 matte lip cream 127
## 5 matte lip color 102
## 6 lasting lip color 92
## 7 soft matte lip 88
## 8 lips feel dry 83
## 9 tinted lip balm 83
## 10 love lime crime 74
## # … with 44,850 more rows
#將trigram的結果拆成三個字並另存成一dataframe
lipstick_trigrams <- lipstick_all%>%
mutate(linenumber =row_number())%>%
unnest_tokens(trigram,art_Content, token = "ngrams", n = 3)
trigrams_separated <- lipstick_trigrams %>%
separate(trigram, c("word1", "word2","word3"), sep = " ")
trigrams_filtered <- trigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word3 %in% stop_words$word)
#找出評論中提到的商品顏色
bigram_color <- filter(bigrams_filtered,word2%in% lipstick_detail$color)
trigram_color <- filter(trigrams_filtered,word3%in% lipstick_detail$color)
bigrams_united <- bigram_color %>%
unite(bigram, word1, word2, sep = "_") %>%
filter(!bigram %in% c("NA_NA"))
set.seed(1234)
wordcloud(words = bigrams_united$bigram,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
trigrams_united <- trigram_color %>%
unite(trigram, word1, word2, word3, sep = "_")%>%
filter(!trigram %in% c("NA_NA_NA"))
set.seed(1234)
wordcloud(words = trigrams_united$trigram,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
#先去掉bobbi brown(其為品牌名稱而不是一種顏色)
trigram_color <- trigram_color %>%
filter(!(word2 =="bobbi" & word3 =="brown"))
pure_color <- trigram_color %>%
filter(word2 !="skin") %>%
filter(word2 !="lip")
new_united <- pure_color %>%
unite(trigram, word1, word2, word3, sep = "_")
set.seed(1234)
wordcloud(words = new_united$trigram,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
# 將上述結果以長條圖表示
trigram<-new_united %>%
count(trigram, sort = TRUE)
color_discuss <- trigram %>%
filter(nn>10)%>%
ggplot(.,aes(x=reorder(trigram,nn),y=nn))+
geom_bar(stat="identity")+
coord_flip()+ labs(y='count',x='trigram of color')
color_discuss
color_mentioned <- new_united %>%
filter(brand %in% discuss_plot_df$brand) %>%
count(trigram, sort = TRUE)
color_mentioned
## # A tibble: 1,658 x 2
## trigram nn
## <chr> <int>
## 1 blue_based_red 19
## 2 fire_engine_red 18
## 3 bubble_gum_pink 16
## 4 blue_toned_red 15
## 5 deep_dark_red 11
## 6 perfect_matte_red 11
## 7 nice_deep_red 10
## 8 nice_bright_red 7
## 9 true_blue_red 7
## 10 barbie_doll_pink 6
## # … with 1,648 more rows
color_plot <- color_mentioned %>%
head(10)%>%
ggplot(.,aes(x=reorder(trigram,nn),y=nn))+
geom_bar(stat="identity")+
coord_flip()+ labs(y='count',x='trigram of color')
color_plot
#找出10大品牌討論中所提到的顏色
#只顯示10種
top10_color <- color_mentioned %>%
head(10)
new_united %>%
filter(brand %in% discuss_plot_df$brand) %>%
filter(trigram %in% color_mentioned$trigram) %>%
filter(trigram %in% top10_color$trigram) %>%
select(brand, trigram) %>%
ggplot(aes(trigram, fill=brand)) + geom_bar(position="stack")
new_united %>%
filter(Year == 2018) %>%
filter(brand %in% discuss_plot_df$brand) %>%
filter(trigram %in% color_mentioned$trigram) %>%
filter(trigram %in% top10_color$trigram) %>%
select(brand, trigram) %>%
ggplot(aes(trigram, fill=brand)) + geom_bar(position="stack")
#建立要被篩選的字
mat_word <- c("matt","matte","matted")
gloss_word <- c("gloss","glossy")
#以trigram找出有被提到是霧面還亮面的評論
matte <- filter(trigrams_filtered, word1 %in% mat_word | word2 %in% mat_word | word3 %in% mat_word)
gloss <- filter(trigrams_filtered, word1 %in% gloss_word | word2 %in% gloss_word | word3 %in% gloss_word)
color <- conbine_tidy %>% filter(sentence %in% mat_word |sentence %in% gloss_word | sentence %in% c("dark","red","tone","brown","pink","orange","black","purple","fire","cupcake","nap","cabernet","lay","temptation"))
## 唇膏質地趨勢圖(霧面 vs 亮面)
color %>%
select(art_Date1,sentence)%>%
count(art_Date1,sentence) %>%
arrange(desc(n))%>%
filter(n>10)%>%
filter(sentence %in% mat_word | sentence %in% gloss_word)%>%
ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
geom_line(aes(color = sentence), position="identity")
## 顏色趨勢圖
color %>%
select(art_Date1,sentence)%>%
count(art_Date1,sentence) %>%
arrange(desc(n))%>%
filter(n>10)%>%
filter(!sentence %in% mat_word & !sentence %in% gloss_word)%>%
ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
geom_line(aes(color = sentence), position="identity")
# 挑出提到"skin tone"的詞彙
## 其他含skin的trigram可能也是在說評論者的膚色,但無法直接判斷是原膚色還是擦了口紅後使皮膚顏色暗沉或變得明亮,因此只留下含有"skin tone"的trigram
skin_color <- trigram_color %>%
filter(word2 =="skin") %>%
filter(word3 == "tone")
skin_color %>%
group_by(word1) %>%
count() %>%
filter(nn>2) %>%
ggplot(.,aes(x=reorder(word1,nn),y=nn))+
geom_bar(stat="identity")+
coord_flip()+ labs(y='count',x='skin color')
# 計算個膚色人數
color_count <- trigram_color %>%
count(word1, word2, word3, sort = TRUE)
color_count
## # A tibble: 2,475 x 4
## word1 word2 word3 nn
## <chr> <chr> <chr> <int>
## 1 olive skin tone 52
## 2 medium skin tone 46
## 3 fair skin tone 31
## 4 blue based red 28
## 5 bubble gum pink 26
## 6 fire engine red 23
## 7 darker skin tone 22
## 8 light skin tone 21
## 9 dark skin tone 20
## 10 brown skin tone 18
## # … with 2,465 more rows
#找出有提到自己膚色的評論
temp <- trigram_color %>%
unite(trigram, word1, word2, word3, sep = "_") %>%
group_by(author_name, ASIN) %>%
count(trigram)
# 找出同時提到兩種顏色的評論(膚色+口紅顏色)
names <- temp %>%
group_by(author_name, ASIN) %>%
count(author_name)
names <- names %>%
filter(n==2)
pair <- temp %>%
filter(author_name %in% names$author_name)
#correating pairs
library(widyr)
word_cors <- pair %>%
pairwise_cor(trigram, author_name, sort = TRUE)
word_cors
## # A tibble: 629,642 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 opaque_dark_red gorgeous_opaque_dark 1
## 2 gorgeous_opaque_dark opaque_dark_red 1
## 3 stay_bright_pink sheer_berry_pink 1
## 4 sheer_berry_pink stay_bright_pink 1
## 5 havent_worn_black beautiful_luscious_black 1
## 6 beautiful_luscious_black havent_worn_black 1
## 7 mahogany_purply_red burgandy_brown_red 1
## 8 burgandy_brown_red mahogany_purply_red 1
## 9 nude_pink_tone flashy_bright_pink 1
## 10 flashy_bright_pink nude_pink_tone 1
## # … with 629,632 more rows
skin_united <- skin_color %>%
unite(trigram, word1, word2, word3, sep = "_")
# 篩選描述膚色的詞彙(item1)搭配口紅顏色(item2)
word_cors <- word_cors %>%
filter(item1 %in% skin_united$trigram)
word_cors
## # A tibble: 26,962 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 reddish_skin_tone beige_light_purple 1
## 2 girbilive_skin_tone hot_bubblegum_pink 1
## 3 compliments_skin_tone hot_bubblegum_pink 1
## 4 girbilive_skin_tone natural_pink_brown 1
## 5 compliments_skin_tone natural_pink_brown 1
## 6 girbilive_skin_tone solid_matte_red 1
## 7 compliments_skin_tone solid_matte_red 1
## 8 girbilive_skin_tone bit_darker_pink 1
## 9 compliments_skin_tone bit_darker_pink 1
## 10 girbilive_skin_tone differs_personally_pink 1
## # … with 26,952 more rows
# 找出人數多寡前三名的膚色
skin_rank <- skin_united %>%
group_by(trigram) %>%
count(sort = TRUE) %>%
filter(nn>15)
skin_rank
## # A tibble: 7 x 2
## # Groups: trigram [7]
## trigram nn
## <chr> <int>
## 1 olive_skin_tone 52
## 2 medium_skin_tone 46
## 3 fair_skin_tone 31
## 4 darker_skin_tone 22
## 5 light_skin_tone 21
## 6 dark_skin_tone 20
## 7 brown_skin_tone 18
most <- skin_rank %>%
head(3)
set.seed(1000)
word_cors %>%
filter(item1 %in% skin_rank$trigram) %>%
filter(correlation > .5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
# 選出最多人擁有的三種膚色調
# 挑選出correlation > 0.37 的口紅顏色
most_cor <- word_cors %>%
filter(item1 %in% most$trigram) %>%
filter(correlation > .37)
## 區分色系
red <- pure_color %>%
filter(word1 =="red" | word2 == "red" | word3 == "red") %>%
unite(trigram, word1, word2, word3, sep = "_") %>%
select(trigram)
pink <- pure_color %>%
filter(word1 =="pink" | word2 == "pink" | word3 == "pink") %>%
unite(trigram, word1, word2, word3, sep = "_") %>%
select(trigram)
orange <- pure_color %>%
filter(word1 =="orange" | word2 == "orange" | word3 == "orange") %>%
unite(trigram, word1, word2, word3, sep = "_") %>%
select(trigram)
brown <- pure_color %>%
filter(word1 =="brown" | word2 == "brown" | word3 == "brown") %>%
unite(trigram, word1, word2, word3, sep = "_") %>%
select(trigram)
black <- pure_color %>%
filter(word1 =="black" | word2 == "black" | word3 == "black") %>%
unite(trigram, word1, word2, word3, sep = "_") %>%
select(trigram)
# 區分correlation的色系
R <- most_cor %>%
filter(item2 %in% red$trigram) %>%
mutate(color="red_tone")
P <- most_cor %>%
filter(item2 %in% pink$trigram) %>%
mutate(color="pink_tone")
O <- most_cor %>%
filter(item2 %in% orange$trigram) %>%
mutate(color="orange_tone")
Br <- most_cor %>%
filter(item2 %in% brown$trigram) %>%
mutate(color="brown_tone")
Bl <- most_cor %>%
filter(item2 %in% black$trigram) %>%
mutate(color="black_tone")
most_cor <- rbind(R, P, O, Br, Bl)
most_cor
## # A tibble: 303 x 4
## item1 item2 correlation color
## <chr> <chr> <dbl> <chr>
## 1 fair_skin_tone solid_matte_red 0.405 red_tone
## 2 fair_skin_tone horrible_color_red 0.405 red_tone
## 3 fair_skin_tone rich_girl_red 0.405 red_tone
## 4 fair_skin_tone darker_deeper_red 0.405 red_tone
## 5 fair_skin_tone mouse_bright_red 0.405 red_tone
## 6 fair_skin_tone ugly_red_brown 0.405 red_tone
## 7 fair_skin_tone pink_blue_red 0.405 red_tone
## 8 fair_skin_tone orange_tone_red 0.405 red_tone
## 9 fair_skin_tone snow_white_red 0.405 red_tone
## 10 fair_skin_tone deep_wine_red 0.405 red_tone
## # … with 293 more rows
tone_cor <- most_cor
tone_cor$item2 <- most_cor$color
tone_cor <- tone_cor[,-4]
tone_cor %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "pink", size = 5) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
tmp <- lipstick_all %>%
select(author_name, brand) %>%
#filter(author_name != 'Kindle Customer' & author_name != 'Amazon Customer') %>%
group_by(brand) %>%
mutate(brand_sum = n()) %>%
ungroup() %>%
arrange(desc(brand_sum))
# 只取前10大品牌
top_10 <- unique(tmp$brand)[1:10]
# 因為評論者太多 只取留言數夠多的評論者做呈現
link <- tmp %>%
filter(brand %in% top_10) %>%
group_by(author_name) %>%
mutate(author_sum = n()) %>%
ungroup() %>%
distinct() %>%
filter(author_sum > 30)
author <- select(link, author_name, author_sum)
brand <- select(link, brand, brand_sum)
names(author) <- NULL
names(brand) <- NULL
nodes <- rbind(data.frame(author),data.frame(brand)) %>% distinct()
reviewNetwork <- graph_from_data_frame(d=link, v=nodes,directed = F)
reviewNetwork
## IGRAPH 03e4d19 UN-- 28 165 --
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 03e4d19 (vertex names):
## [1] Amazon Customer--COVERGIRL Lisa --COVERGIRL
## [3] Laura --COVERGIRL Mary --COVERGIRL
## [5] Kindle Customer--COVERGIRL Ashley --COVERGIRL
## [7] Jessica --COVERGIRL Linda --COVERGIRL
## [9] Michelle --COVERGIRL Stephanie --COVERGIRL
## [11] Sarah --COVERGIRL Amy --COVERGIRL
## [13] Melissa --COVERGIRL Anna --COVERGIRL
## [15] Nicole --COVERGIRL Amanda --COVERGIRL
## + ... omitted several edges
V(reviewNetwork)$size <- log(V(reviewNetwork)$X2)*3
# 黃色是品牌的node 藍色是author
plot(reviewNetwork,
vertex.color=c("skyblue", "yellow")[1+(names(V(reviewNetwork)) %in% top_10)])
linkM <- link %>%
filter(brand == "Maybelline New York")
authorM <- select(linkM, author_name, author_sum)
brandM <- select(linkM, brand, brand_sum)
names(authorM) <- NULL
names(brandM) <- NULL
nodesM <- rbind(data.frame(authorM),data.frame(brandM)) %>% distinct()
NetworkM <- graph_from_data_frame(d=linkM, v=nodesM,directed = F)
NetworkM
## IGRAPH 78a4532 UN-- 19 18 --
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 78a4532 (vertex names):
## [1] Amazon Customer--Maybelline New York
## [2] Ashley --Maybelline New York
## [3] Kindle Customer--Maybelline New York
## [4] Amanda --Maybelline New York
## [5] Amy --Maybelline New York
## [6] Sarah --Maybelline New York
## [7] Laura --Maybelline New York
## [8] Linda --Maybelline New York
## + ... omitted several edges
V(NetworkM)$size <- log(V(NetworkM)$X2)*3
plot(NetworkM,
vertex.color=c("red","pink")[1+(names(V(NetworkM)) %in% top_10)])
linkL <- link %>%
filter(brand == "L'Oreal Paris")
authorL <- select(linkL, author_name, author_sum)
brandL <- select(linkL, brand, brand_sum)
names(authorL) <- NULL
names(brandL) <- NULL
nodesL <- rbind(data.frame(authorL),data.frame(brandL)) %>% distinct()
NetworkL <- graph_from_data_frame(d=linkL, v=nodesL,directed = F)
NetworkL
## IGRAPH 35fcddf UN-- 18 17 --
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 35fcddf (vertex names):
## [1] Amazon Customer--L'Oreal Paris Kindle Customer--L'Oreal Paris
## [3] Nicole --L'Oreal Paris Melissa --L'Oreal Paris
## [5] Mary --L'Oreal Paris Jessica --L'Oreal Paris
## [7] Anna --L'Oreal Paris Lauren --L'Oreal Paris
## [9] Michelle --L'Oreal Paris Sarah --L'Oreal Paris
## [11] Amy --L'Oreal Paris Lisa --L'Oreal Paris
## [13] Linda --L'Oreal Paris Amanda --L'Oreal Paris
## [15] Stephanie --L'Oreal Paris Jennifer --L'Oreal Paris
## + ... omitted several edges
V(NetworkL)$size <- log(V(NetworkL)$X2)*3
plot(NetworkL,
vertex.color=c("blue","gray")[1+(names(V(NetworkL)) %in% top_10)])
linkR <- link %>%
filter(brand == "Revlon")
authorR <- select(linkR, author_name, author_sum)
brandR <- select(linkR, brand, brand_sum)
names(authorR) <- NULL
names(brandR) <- NULL
nodesR <- rbind(data.frame(authorR),data.frame(brandR)) %>% distinct()
NetworkR <- graph_from_data_frame(d=linkR, v=nodesR,directed = F)
NetworkR
## IGRAPH 1c3e4c5 UN-- 18 17 --
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 1c3e4c5 (vertex names):
## [1] Amazon Customer--Revlon Kindle Customer--Revlon
## [3] Jessica --Revlon Amy --Revlon
## [5] Ashley --Revlon Lisa --Revlon
## [7] Amanda --Revlon Linda --Revlon
## [9] Melissa --Revlon Mary --Revlon
## [11] Nicole --Revlon Michelle --Revlon
## [13] Sarah --Revlon Laura --Revlon
## [15] Stephanie --Revlon Lauren --Revlon
## + ... omitted several edges
V(NetworkR)$size <- log(V(NetworkR)$X2)*4
plot(NetworkR,
vertex.color=c("orange","yellow")[1+(names(V(NetworkR)) %in% top_10)])
length(unique(lipstick$author_star))
## [1] 5
star_link <- lipstick_all %>%
select(author_name, author_star_sub) %>%
group_by(author_name) %>%
mutate(author_sum = n()) %>%
distinct() %>%
filter(author_sum > 40)
starNetwork <- graph_from_data_frame(d=star_link,directed = F)
starNetwork
## IGRAPH b01d720 UN-- 29 120 --
## + attr: name (v/c), author_sum (e/n)
## + edges from b01d720 (vertex names):
## [1] Amazon Customer--5 Nicole --5 Amazon Customer--4
## [4] Kindle Customer--5 Amanda --5 Jessica --4
## [7] Katie --5 Amazon Customer--1 Amazon Customer--2
## [10] Amazon Customer--3 Kindle Customer--1 Brittany --5
## [13] Nicole --2 Mary --5 Sarah --1
## [16] Lisa --5 Jessica --1 Michelle --2
## [19] Jennifer --1 Lauren --5 Amanda --2
## [22] Brittany --4 Stephanie --5 Amy --4
## + ... omitted several edges
plot(starNetwork)
price_tmp <- lipstick_all %>%
select(author_name, Price)
price_tmp$Price <- as.numeric(price_tmp$Price)
## Warning: NAs introduced by coercion
price_tmp <- na.omit(price_tmp)
summary(price_tmp$Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.92 5.85 7.95 11.98 19.66 79.95
plot(density(price_tmp$Price))
#處理links
price_link <- price_tmp %>%
group_by(author_name) %>%
mutate(author_sum = n()) %>%
ungroup() %>%
mutate(price_index = ifelse(Price>20,'high','low')) %>%
filter(author_sum > 50) %>%
select(author_name, price_index, author_sum)
priceNetwork <- graph_from_data_frame(d=price_link)
priceNetwork
## IGRAPH 930eddd DN-- 15 6122 --
## + attr: name (v/c), author_sum (e/n)
## + edges from 930eddd (vertex names):
## [1] Amazon Customer->low Amazon Customer->low Amazon Customer->low
## [4] Kindle Customer->low Amazon Customer->low Amanda ->low
## [7] Amanda ->low Amazon Customer->low Jessica ->low
## [10] Amazon Customer->low Amazon Customer->low Jessica ->low
## [13] Amazon Customer->low Amazon Customer->low Kindle Customer->low
## [16] Amazon Customer->low Amazon Customer->low Amazon Customer->low
## [19] Amazon Customer->low Amazon Customer->low Amazon Customer->low
## [22] Mary ->low Sarah ->low Lisa ->low
## + ... omitted several edges
plot(priceNetwork, vertex.color=c("skyblue"))
#類別轉factor
new_united$brand <- as.factor(new_united$brand)
new_united$trigram <- as.factor(new_united$trigram)
#新增一欄位:是否會進銷售排行前十名
predict_data <- new_united %>%
mutate(inRanking = ifelse(Rank2 >= 10, "no", "yes"))
predict_data$inRanking <- as.factor(predict_data$inRanking)
#切分訓練資料
set.seed(1)
indx <- sample(1:nrow(predict_data), nrow(predict_data)*0.7)
test <- predict_data[-indx,]
train <- predict_data[indx,]
## 用品牌及 trigram 的結果預測是否會進入銷售排行前十名
library(e1071)
svmfit <- svm(inRanking ~ brand + trigram, data=train , kernel ="linear", cost=10,scale=FALSE)
ypred <- predict (svmfit ,test)
x=table(predict = ypred , truth = test$inRanking )
sum(diag(x))/sum(x)
## [1] 0.9740634
1.唇膏的評論難以使用主題模型分類。
2.從評論當中可以發現消費者對於唇膏的顏色,以及是否搭配自己的膚色較為在意。
3.唇膏的目標客群通常為女性,因此從網路圖也可以發現評論者通常以女性居多
4.一開始使用單字斷詞進行分析時,所呈現出的效果不盡理想,最後嘗試透過bigram及trigram進行切割之後,所得出的字詞比較具有意義及可解釋性,最後我們使用trigram的結果來預測銷售績效也達到了不錯的效果。
5.品牌對於唇膏的銷售是有影響力的。
6.跟隨流行的顏色推出產品很重要。
7.即便品牌沒有以流行色為產品主軸,但仍然可以依其品牌特質提高討論度。