資料來源

Amazon.com

動機與目的

欲了解在Amazon.com上留言的評論者對於唇膏的整體看法,並以不同的角度剖析評論,像是了解評論者的情緒起伏,或是評論當中所涵蓋的主題內容,評論字詞使用的相似程度,甚至是評論者與品牌、星等、價格之間的關聯性。最後透過模型預測什麼樣的評論有可能會進入銷售前十名

基本設定+載入package

setwd("~/Lab")
getwd()
## [1] "/home/m064020020/Lab"
packages = c(
  "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest","tidytext","wordcloud","topicmodels","doParallel")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Sys.setlocale("LC_ALL","C")
## [1] "LC_CTYPE=C;LC_NUMERIC=C;LC_TIME=C;LC_COLLATE=C;LC_MONETARY=C;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
 #options(digits=5, scipen=10)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(SnowballC)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(wordcloud)
## Loading required package: RColorBrewer
## 
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
## 
##     textplot
library(stringr)
library(tidytext)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## The following object is masked from 'package:NLP':
## 
##     annotate
library(tidyr)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(readr)
library(topicmodels)
library(lexicon)
library(koRpus)
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()
## 
## Attaching package: 'koRpus'
## The following object is masked from 'package:readr':
## 
##     tokenize
library(quanteda)
## Package version: 1.4.3
## Parallel computing: 2 of 24 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following objects are masked from 'package:koRpus':
## 
##     tokens, types
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
## 
##     View
library(syuzhet)
library(corrplot)
## corrplot 0.84 loaded
library(slam)
## 
## Attaching package: 'slam'
## The following object is masked from 'package:data.table':
## 
##     rollup
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(Rtsne)
library(lm.beta)
library(igraph)
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:quanteda':
## 
##     as.igraph
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
library(ggrepel)
library("wordcloud")
library("RColorBrewer")

讀資料

lipstick_detail=fread('./data/yuchia.lipstick_info_original.csv',na.strings=c("","NA"),stringsAsFactors = F)
lip_sub=fread('./data/yuchia.lipstick_info_suburl.csv',na.strings=c("","NA"),stringsAsFactors = F)


Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
lipstick=fread('./data/yuchia.amazon_lipstick_review.csv',header = T,stringsAsFactors = F)

review資料前處理

##因為評論有些欄位有中文資料,需事先做前處理
##刪除空白$換行
lipstick$availability<- gsub("\\n", "", lipstick$availability)%>%
  gsub("^\\s+|\\s+$", "", .)%>%
  gsub("[.]", "",.)

#unique(lipstick$availability)
#unique(lipstick$star)


#找出沒有中文的欄位
lipstick_nochiness<-lipstick %>% filter(grepl('[a-zA-Z]',lipstick$availability))
lipstick_nochiness$star_sub<-substr(lipstick_nochiness$star,1,3)

##處理字串
lipstick_nochiness$availability<- gsub("^Only.*|^In.*|.*ship.*","In Stock",lipstick_nochiness$availability)%>%
  gsub("^Available.*","Unavailable",.)

#unique(lipstick_nochiness$availability)

##找出有中文的欄位
lipstick_chiness<-lipstick %>% filter(!grepl('[a-zA-Z]',lipstick$availability))
##處理星等
lipstick_chiness$star_sub<-substr(lipstick_chiness$star,7,10)
##處理字串
lipstick_chiness$availability<- gsub("庫存僅剩\\s[0-9].*", "In Stock", lipstick_chiness$availability) %>%
  gsub(".*[有]?現貨。", "In Stock",.)%>%
  gsub("來自\\s這些暢銷品\\s有貨。", "Unavailable",.) %>%
  gsub("暫時缺貨\\s,", "Unavailable",.)

##合併兩張dataframe
lipstick_review<-rbind(lipstick_nochiness,lipstick_chiness)


##Date:
Sys.setlocale("LC_TIME", "C")
## [1] "C"
lipstick_review$art_Date1=as.Date(lipstick_review$art_Date,format = "%B %d,%Y")
lipstick_review$Year=year(lipstick_review$art_Date1)
lipstick_review$Month=format(lipstick_review$art_Date1, "%m")

##Brand:
lipstick_review$brand=lipstick_review$brand %>% as.factor
brand=lipstick_review$brand %>% unique

##Star:
#lipstick_review$star_sub=lipstick_review$star1 %>% substr(1,3)
lipstick_review$author_star_sub=lipstick_review$author_star %>% substr(1,3)
lipstick_review$star_sub =lipstick_review$star_sub %>% as.numeric()
lipstick_review$author_star_sub =lipstick_review$author_star_sub %>% as.numeric()

lipstick_review<-distinct(lipstick_review,art_Content, .keep_all= TRUE)#刪除重複


## 唇膏商品資料前處理

##刪除多餘的欄位
lipstick_detail <- lipstick_detail[,-c(22:45)]
## 與suburl合併
lipstick_detail<-rbind(lipstick_detail,lip_sub,fill=T)
## 刪除重複
lipstick_detail<-distinct(lipstick_detail,URL, .keep_all= TRUE)#刪除重複

review,information資料合併

#唇膏整合排名by ASIN
lipstick_review$a<-lapply(lipstick_review$art_Url,function(x){strsplit(x, "/")[[1]][6]})%>%as.character()
lip_combine<-lipstick_detail %>%
  filter(Category1 %in% c('Beauty & Personal Care','Beauty & Peronal Care'))%>%
  select(Rank1,Rank2,Category1,ASIN,Customer_Reviews)
lipstick_all<-lipstick_review%>%left_join(lip_combine,by=c("a"="ASIN"))


lipstick_all$Rank1<-lipstick_all$Rank1%>% str_replace_all(",","") %>% as.numeric
lipstick_all$Rank2<-lipstick_all$Rank2%>% str_replace_all(",","") %>% as.numeric

##只篩選有存貨&有Rank1的資料
lipstick_all<-lipstick_all %>%
  filter(availability=="In Stock")%>%
  filter(is.na(Rank2)==FALSE)


head(lipstick_all)
##                        _id
## 1 5c892605923ce022a2d09ff5
## 2 5c892605923ce022a2d09ff7
## 3 5c892605923ce022a2d09ff9
## 4 5c892605923ce022a2d09ffb
## 5 5c892605923ce022a2d09ffd
## 6 5c892605923ce022a2d09fff
##                                                                                                                                                                                                                                                                                                                                                                                                                             art_Content
## 1                                                                                                                                                                                                                                                                    very sticky. The colors were a little off. Caution to others: it drys very fast and very hard to take off. Id recommend to use coconut oil to remove the lipstick.
## 2         I really love the dark red lip color in this set. Yes some of the lighter colors in the set were ""sticky"" and settled in the lines of my lips however I solved that quickly by mosturizing my lips before applying the lip color and applying a setting powder.  I would reccommend based on the colors as it is difficult to find a good set where all the colors compliment your skin tone like this one does to my skin.
## 3                                                                                                                                                                                                                             Product definitely dries quick, but the color pallet is NOT a truly representaed by the photos. The color pallet I received is very brown/nude earth tones-not pink as it displays. Definitely returning.
## 4 The Color is beautiful but it's very strange texture! it makes your lips stick together! i put it on and was like ahh, this is great, and about 60 seconds later i'm saying........ what the heck is going on... I am going to try putting a coat of clear chapstick over it I guess? Too inexpensive to return, so i'm going to try and work with it, but very strange texture once it dries. you can't even rub your lips together.
## 5                                                                                                                                                                                                                                                                                                Really pretty colors for the price! Dried quickly from the swatches and smell like chocolate \U0001f60a I cant wait to try on my lips.
## 6                                                                                                                                                                                                                                                                                                                                          Super Sticky! Starts off ok, nice colors but like the other probably better with clear gloss
##            art_Date
## 1 February 18, 2018
## 2  November 4, 2018
## 3 February 14, 2018
## 4  January 11, 2018
## 5 September 9, 2018
## 6  January 18, 2018
##                                                                                                                                                      art_Url
## 1 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 2 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 3 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 4 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 5 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 6 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
##       author_name        author_star availability  brand
## 1           Joana 1.0 out of 5 stars     In Stock SHERUI
## 2          Ashley 5.0 out of 5 stars     In Stock SHERUI
## 3 frances padilla 2.0 out of 5 stars     In Stock SHERUI
## 4       Katharine 1.0 out of 5 stars     In Stock SHERUI
## 5      Evelina L. 5.0 out of 5 stars     In Stock SHERUI
## 6      Lola Bunny 1.0 out of 5 stars     In Stock SHERUI
##                                                     comment_title error
## 1        Id recommend to use coconut oil to remove the lipstick\n     0
## 2                               Pocket Sized, Variety of colors\n     0
## 3                          Not pink color palett!!! VERY BROWN!\n     0
## 4          The Color is beautiful but it's very strange texture\n     0
## 5                                  Cute mini matte lipstick set\n     0
## 6 STICKY!!! (Almost glue your mouth shut type sticky)\U0001f628\n     0
##                 star
## 1 3.2 out of 5 stars
## 2 3.2 out of 5 stars
## 3 3.2 out of 5 stars
## 4 3.2 out of 5 stars
## 5 3.2 out of 5 stars
## 6 3.2 out of 5 stars
##                                                                           title
## 1 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 2 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 3 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 4 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 5 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 6 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
##   star_sub  art_Date1 Year Month author_star_sub          a Rank1 Rank2
## 1      3.2 2018-02-18 2018    02               1 B0762M2ZWJ  1189     1
## 2      3.2 2018-11-04 2018    11               5 B0762M2ZWJ  1189     1
## 3      3.2 2018-02-14 2018    02               2 B0762M2ZWJ  1189     1
## 4      3.2 2018-01-11 2018    01               1 B0762M2ZWJ  1189     1
## 5      3.2 2018-09-09 2018    09               5 B0762M2ZWJ  1189     1
## 6      3.2 2018-01-18 2018    01               1 B0762M2ZWJ  1189     1
##                Category1 Customer_Reviews
## 1 Beauty & Personal Care              264
## 2 Beauty & Personal Care              264
## 3 Beauty & Personal Care              264
## 4 Beauty & Personal Care              264
## 5 Beauty & Personal Care              264
## 6 Beauty & Personal Care              264

唇膏排名統計資料

summary(lipstick_all$Rank2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1     157     457    1979    2328   56629

品牌討論度

#by年分
#lipstick_review$brand %>% unique


brand_discuss_df=lipstick_review %>% 
  group_by(.,brand,Year) %>%
  summarise(
    discuss_n=n()
  )%>% filter(Year>2010)


brand_discuss_df %>% 
  group_by(.,brand) %>%
  summarise(
    discuss_sum=sum(discuss_n) 
  ) ->top10_df


#top10品牌討論度by年分
brand_discuss_df$brand %in% 
  top10_df [order(top10_df$discuss_sum,decreasing = T),]$brand[1:10] %>%
  
  subset(brand_discuss_df,.)->discuss_plot_df

discuss_plot_df<- subset(discuss_plot_df,Year!="NA")

品牌被討論數隨時間分布圖by年分

#by年分
ggplot(discuss_plot_df, aes(x = Year, y =discuss_n,color=brand)) + 
  geom_line() + geom_point(  fill = "white")+scale_x_continuous(breaks = c(2011:2019))+scale_y_continuous(breaks=c(0,1000,2000,3000,4000,5000,6000,7000,8000,9000)) 

斷字

############文字清理方法二###############
lipstick_all$art_Content <- gsub("'", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[[:punct:]]", " ", lipstick_all$art_Content) 
lipstick_all$art_Content <- gsub("[[:cntrl:]]", " ",lipstick_all$art_Content) 
lipstick_all$art_Content <- gsub("^[[:space:]]+", "", lipstick_all$art_Content) 
lipstick_all$art_Content <- gsub("[[:space:]]+$", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[0-9]+", " ", lipstick_all$art_Content)

lipstick_all$art_Content<-iconv(lipstick_all$art_Content,"WINDOWS-1252","UTF-8")
lipstick_all$art_Content <- tolower(lipstick_all$art_Content)


#重新編主鍵
lipstick_all<-lipstick_all %>%
  mutate(id2 =rownames(lipstick_all))

conbine_tidy<-lipstick_all%>%
  mutate(linenumber =row_number())%>%
  unnest_tokens(sentence,art_Content)

情緒分析(使用NRC字典)

NRC_VAD<-read.table("./data/NRC_VAD/NRC_VAD_Lexicon.txt",header = T,sep="\t",stringsAsFactors = F)
NRC<-get_sentiments("nrc")%>%as.data.frame()
NRC_all<-NRC_VAD%>%left_join(NRC,by=c("Word"="word"))

join2<- conbine_tidy %>%
  inner_join(NRC_VAD,by=(c("sentence"="Word")))%>%
  group_by(linenumber,id2)%>%
  summarise(n=n(),
            valence_avg=sum(Valence)/n,
            arousal_avg=sum(Arousal)/n,
            dominance_avg=sum(Dominance)/n)

#合併join2
lipstick_all<-lipstick_all%>%
  left_join(join2,by=c("id2"))

把情緒值為NA的移除

# 移除author_name,brand,star_sub,author_star_sub,Rank2,情緒欄位為NA的資料
lipstick_all_no_NA = lipstick_all[complete.cases(lipstick_all[ , c("author_name","brand","star_sub","author_star_sub","Rank2")]), ]

正負面情緒,最高與最低各10個brand

sentiment_valence = rbind(
# valence_avg 最高的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(valence_all_avg = mean(valence_avg)) %>%
    arrange(desc(valence_all_avg)) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "high"),
# valence_avg 最低的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(valence_all_avg = mean(valence_avg)) %>%
    arrange(valence_all_avg) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "low")
  )

#畫圖
sentiment_valence %>% 
  group_by(score) %>%
  arrange(desc(valence_all_avg)) %>%
  ungroup() %>%
  ggplot(aes(reorder(brand, valence_all_avg), valence_all_avg, fill = score)) +
  geom_col() +
  xlab("Brand") +
  ylab("Valence AVG") +
  coord_flip() 

內縮/外放情緒,最高與最低各10個brand

sentiment_arousal = rbind(
# arousal_avg 最高的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(arousal_all_avg = mean(arousal_avg)) %>%
    arrange(desc(arousal_all_avg)) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "high"),
# arousal_avg 最低的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(arousal_all_avg = mean(arousal_avg)) %>%
    arrange(arousal_all_avg) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "low")
  )

# 畫圖
sentiment_arousal %>% 
  group_by(score) %>%
  arrange(desc(arousal_all_avg)) %>%
  ungroup() %>%
  ggplot(aes(reorder(brand, arousal_all_avg), arousal_all_avg, fill = score)) +
  geom_col() +
  xlab("Brand") +
  ylab("Arousal AVG") +
  coord_flip()

支配程度情緒,最高與最低各10個brand

sentiment_dominance = rbind(
# dominance_avg 最高的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(dominance_all_avg = mean(dominance_avg)) %>%
    arrange(desc(dominance_all_avg)) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "high"),
# dominance_avg 最低的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(dominance_all_avg = mean(dominance_avg)) %>%
    arrange(dominance_all_avg) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "low")
  )

#畫圖
sentiment_dominance %>% 
  group_by(score) %>%
  arrange(desc(dominance_all_avg)) %>%
  ungroup() %>%
  ggplot(aes(reorder(brand, dominance_all_avg), dominance_all_avg, fill = score)) +
  geom_col() +
  xlab("Brand") +
  ylab("Dominance AVG") +
  coord_flip()

JOIN “Valence, Arousal維度” 的情緒

# 把valence與arousal維度最高10個與最低10個都有出現的品牌join一起
sentiment_join = sentiment_valence %>%
  inner_join(sentiment_arousal, by = c(brand = "brand"))

sentiment_join$score.x = as.factor(sentiment_join$score.x)
sentiment_join$score.y = as.factor(sentiment_join$score.y)

sentiment_join %>%
  ggplot(aes(score.x, score.y, label = brand)) +
  geom_jitter(position = position_jitter(seed = 87)) +
  xlab("Valence") +
  ylab("Arousal") +
  geom_vline(xintercept = 1.5) +    # 在x軸中間加入線條以利區別點點位置
  geom_hline(yintercept = 1.5) +    # 在y軸中間加入線條以利區別點點位置
  scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) +    # 將x軸的刻度對調
  scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) +    # 將y軸的刻度對調
  geom_label_repel(position = position_jitter(seed = 87), aes(label = brand), box.padding = 0.7)  

# 把每個點點標上brand名稱

這張圖可以看出, 越靠右邊的品牌越帶給消費者正面情緒,反之,越左邊越負面; 越靠上方的品牌代表消費者越外放的情緒,反之,越下方則越內縮。
1. 以OriflameONE1XK牌脣膏為例,消費者感受到正面的情緒,並且在評論中的敘述是比較想把好心得分享給大家
2. NOTE CosmeticsNykaaA牌脣膏則是讓消費者產生負面情緒,而且消費者的評論也較積極表現出不滿的心情
3. Kryolan雖然讓消費者有負面情緒,但消費者的評論卻是用比較婉轉的方式說產品的缺點

查看一下上述分析中的評論內容

lipstick_all_no_NA[lipstick_all_no_NA$brand == "Oriflame" | lipstick_all_no_NA$brand == "ONE1X", 2]
## [1] "so cute got them for a bachelorette gift bags the ladies all loved them actually fought over the colors lol"
## [2] "its superb  my new lip love"

OriflameONE1X的評論表現出很喜歡該品牌,有愛不釋手的感覺

lipstick_all_no_NA[lipstick_all_no_NA$brand == "NOTE Cosmetics", 2]
## [1] "i would never ever buy again    so disappointed   very bad experience"                      
## [2] "didn t keep it because it was totally broken off when i opened package   wouldn t buy agsin"

NOTE Cosmetics的評論很明顯唾棄這種品牌的脣膏,而且還非常不推薦給其他人

lipstick_all_no_NA[lipstick_all_no_NA$brand == "Kryolan", 2]
## [1] "it has quite a lot of wax smell in it"

Kryolan的評論沒有直接表現出對品牌的喜好,但委婉地說出產品的缺點

加入星星數量

star_amount = lipstick_all_no_NA %>%
  group_by(brand) %>%
  summarise(star = mean(star_sub)) %>%
  ungroup()

sentiment_join_star = sentiment_join %>%
  inner_join(star_amount, by = c(brand = "brand"))

sentiment_join_star %>%
  ggplot(aes(score.x, score.y, label = brand)) +
  geom_jitter(position = position_jitter(seed = 87), aes(size = star)) +   # 星星數越高,點點越大
  xlab("Valence") +
  ylab("Arousal") +
  geom_vline(xintercept = 1.5) +    # 加入線條以利區別點點位置
  geom_hline(yintercept = 1.5) +    # 加入線條以利區別點點位置
  scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) +    # 將x軸排序對調
  scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) +    # 將y軸排序對調
  geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7)  # 把每個點點標上brand名稱

加入lipstick商品排名(Rank2)

# 將同品牌lipstick的rank平均
rank_avg = lipstick_all_no_NA %>%
  group_by(brand,Rank2) %>%
  summarise(rank = mean(Rank2)) %>%
  group_by(brand) %>%
  summarise(rank = mean(rank)) %>%
  ungroup()

sentiment_join_rank = sentiment_join %>%
  inner_join(rank_avg, by = c(brand = "brand"))

sentiment_join_rank %>%
  ggplot(aes(score.x, score.y, label = brand)) +
  geom_jitter(position = position_jitter(seed = 87), aes(size = -rank)) +   # rank越前面,點點越大
  xlab("Valence") +
  ylab("Arousal") +
  geom_vline(xintercept = 1.5) +    # 加入線條以利區別點點位置
  geom_hline(yintercept = 1.5) +    # 加入線條以利區別點點位置
  scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) +    # 將x軸排序對調
  scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) +    # 將y軸排序對調
  geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7)  # 把每個點點標上brand名稱

討論正負情緒、star與rank的關係

# 將正負面分數最高與最低各10個牌子,連結該牌的平均銷售排行與平均獲得星星數
valence_rank_star = sentiment_valence %>%
  inner_join(rank_avg, by = c(brand = "brand")) %>%
  inner_join(star_amount, by = c(brand = "brand"))

valence_rank_star
## # A tibble: 20 x 5
##    brand                                valence_all_avg score   rank  star
##    <fct>                                          <dbl> <chr>  <dbl> <dbl>
##  1 BeautiControl                                  0.918 high  24605   5   
##  2 MILK MAKEUP                                    0.872 high  32875   5   
##  3 ONE1X                                          0.867 high   2126   5   
##  4 Oriflame                                       0.860 high  44499   5   
##  5 LCTCKP                                         0.817 high  44649   5   
##  6 KAT VON D Everlasting Glimmer Veil …           0.806 high  15197   4   
##  7 BORNTREE                                       0.796 high  28540   4   
##  8 FLOWER                                         0.793 high  21762   3.3 
##  9 Tayongpo                                       0.790 high  10539   4.5 
## 10 Stephanie Imports                              0.783 high   6951   5   
## 11 VOGUE COLOMBIA                                 0.25  low   56629   2   
## 12 Nykaa                                          0.291 low   38799   1   
## 13 A'some                                         0.438 low   27233   2   
## 14 YSL                                            0.440 low    4965   1   
## 15 Kryolan                                        0.492 low   54303   1   
## 16 Smoke & Mirrors                                0.506 low   46511   1   
## 17 SUNSENT                                        0.506 low    7506   2.7 
## 18 Addictive Cosmetics                            0.514 low   16925   2.4 
## 19 MILEMEI                                        0.515 low   23968.  1.33
## 20 NOTE Cosmetics                                 0.522 low   39886   1

畫出正負情緒、star與rank的關係圖

valence_rank_star %>%
  ggplot(aes(star, valence_all_avg, label = brand)) +
  geom_point(aes(size = -rank, color = -rank)) +       # 銷售排行越好,點點越大,顏色越深
  #geom_text_repel(box.padding = 0.7, aes(size = 2)) +
  geom_label_repel( box.padding = 0.7, size = 3) +
  ylab("Sentiment Valence") +
  xlab("Star Amount")

主題模型

conbine_document<-conbine_tidy %>%
  anti_join(stop_words,by=c("sentence"="word")) %>%
  count(linenumber, sentence, sort = TRUE) %>%
  ungroup()

#轉成dtm
top10_dtm <- conbine_document %>%
  cast_dtm(linenumber, sentence, n)

top10_dtm
## <<DocumentTermMatrix (documents: 63345, terms: 19961)>>
## Non-/sparse entries: 686353/1263743192
## Sparsity           : 100%
## Maximal term length: NA
## Weighting          : term frequency (tf)
lda_top10 <- LDA(top10_dtm, k = 2, control = list(seed = 1234))
lda_top10
## A LDA_VEM topic model with 2 topics.
#主題跟字的關係
document_topics <- tidy(lda_top10, matrix = "beta")
document_topics
## # A tibble: 39,922 x 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     1 red      0.00740 
##  2     2 red      0.00759 
##  3     1 organic  0.000110
##  4     2 organic  0.000390
##  5     1 bees     0.000820
##  6     2 bees     0.000329
##  7     1 burts    0.000431
##  8     2 burts    0.000652
##  9     1 lipstick 0.0271  
## 10     2 lipstick 0.0354  
## # ... with 39,912 more rows
top_terms <- document_topics %>%
  filter(!term  %in% c("lipstick","color","love","lips","product","doe snt","lip","lipsticks"))%>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms
## # A tibble: 40 x 3
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 doesnt  0.0124 
##  2     1 perfect 0.0108 
##  3     1 wear    0.0108 
##  4     1 pink    0.0106 
##  5     1 apply   0.0101 
##  6     1 matte   0.0100 
##  7     1 day     0.00910
##  8     1 price   0.00872
##  9     1 red     0.00740
## 10     1 time    0.00698
## # ... with 30 more rows
#畫圖呈現主題跟字的關係

 top_terms %>%
   mutate(term = reorder(term, beta)) %>%
   ggplot(aes(term, beta, fill = factor(topic))) +
   geom_col(show.legend = FALSE) +
   facet_wrap(~ topic, scales = "free") +
   coord_flip()

算出beta的差異值

beta_spread <- document_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread
## # A tibble: 216 x 4
##    term          topic1   topic2 log_ratio
##    <chr>          <dbl>    <dbl>     <dbl>
##  1 absolutely  0.00247  0.00142    -0.802 
##  2 add         0.00121  0.000465   -1.38  
##  3 amazing     0.00103  0.00464     2.17  
##  4 amazon      0.00265  0.00166    -0.672 
##  5 application 0.00322  0.000759   -2.09  
##  6 applicator  0.000225 0.00106     2.24  
##  7 applied     0.00232  0.00204    -0.186 
##  8 apply       0.0101   0.00145    -2.80  
##  9 applying    0.00168  0.00170     0.0209
## 10 arrived     0.000699 0.00205     1.55  
## # ... with 206 more rows
beta_spread%>%
  arrange(desc(abs(log_ratio)))  %>%
  head(20)%>%
  mutate(term = reorder(term, log_ratio)) %>%
  ggplot(aes(term, log_ratio, fill = log_ratio>0)) +
  geom_col(show.legend = FALSE) +
  xlab("term") +
  ylab("log(topic2 / topic1)") +
  coord_flip()

使用tSNE做尺度縮減看字跟字的相似性

library(doParallel)
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
## [1] 24
## 字頻表
dtm = lipstick_all$art_Content %>% 
  iconv(to = "utf-8", sub="") %>% 
  str_trim(.) %>%
  toupper() %>%
  VectorSource %>% Corpus %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removePunctuation) %>% 
  tm_map(stemDocument) %>% 
  DocumentTermMatrix %>% 
  removeSparseTerms(0.995)
dtm  # (documents: 14156, terms: 1030)
## <<DocumentTermMatrix (documents: 63980, terms: 560)>>
## Non-/sparse entries: 1124526/34704274
## Sparsity           : 97%
## Maximal term length: 10
## Weighting          : term frequency (tf)
dtm_tmp=dtm

過濾tfidf太小的字

library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
  log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0758  0.2012  0.2444  0.2652  0.3012  1.0575
dtm=dtm_tmp
dtm = dtm[, tfidf > 0.2012 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)
## [1] 63980   420

尺度縮減

#install.packages('Rtsne')
library(Rtsne)

# n = 300
# tsne = dtm[, 1:n] %>% as.data.frame.matrix %>% 
#   sapply(.,function(v)ifelse(v>1,1,v))%>%
#   scale %>% t %>% 
#   Rtsne(check_dup=F, theta=0.0, max_iter=3200)

階層式分群

# Y = tsne$Y              # tSNE coordinates
# d = dist(Y)             # distance matrix
# hc = hclust(d)          # hi-clustering
# K = 30                # number of clusters 
# g = cutree(hc,K)        # cut into K clusters
# table(g) %>% as.vector %>% sort         # sizes of clusters

畫出文字雲

# # install.packages('randomcoloR')
# library(randomcoloR)
# library(wordcloud)
# 
# wc = col_sums(dtm[,1:n])
# colors = distinctColorPalette(K)
# 
# 
# #png("./Amazon_lipstick.png", width=3200, height=1800)
# textplot(
#   Y[,1], Y[,2], colnames(dtm)[1:n], show=F, 
#   col=colors[g],
#   cex= 0.3 + 1.25 * sqrt(wc/mean(wc)),
#   font=2)
# dev.off()
文字雲

文字雲

畫出評論者對品牌的網路圖 看看會不會區分成不同的集群

nrow(lipstick) # 92524 num of data
## [1] 92524
length(unique(lipstick$author_name)) # 57672 comment author
## [1] 57672
length(unique(lipstick$brand)) #437 different  brand
## [1] 437
## 看各品牌的討論數
top10_df %>% arrange(desc(discuss_sum))
## # A tibble: 433 x 2
##    brand                   discuss_sum
##    <fct>                         <int>
##  1 Maybelline New York           10658
##  2 NYX PROFESSIONAL MAKEUP        6513
##  3 Revlon                         6239
##  4 Lime Crime                     6036
##  5 L'Oreal Paris                  5988
##  6 COVERGIRL                      4347
##  7 LipSense                       3059
##  8 Rimmel                         2050
##  9 Burt's Bees                    1964
## 10 Nabi                           1716
## # ... with 423 more rows
tmp <- lipstick %>% 
  select(author_name, brand) %>% 
  filter(author_name != 'Kindle Customer' & author_name != 'Amazon Customer') %>% 
  group_by(brand) %>% 
  mutate(brand_sum = n()) %>% 
  ungroup() %>% 
  arrange(desc(brand_sum))

# 只取前10大品牌
top_10 <- unique(tmp$brand)[1:10]

# 因為評論者太多 只取留言數夠多的評論者做呈現
link <- tmp %>% 
  filter(brand %in% top_10) %>% 
  group_by(author_name) %>% 
  mutate(author_sum = n()) %>% 
  ungroup() %>% 
  distinct() %>% 
  filter(author_sum > 30)


author <- select(link, author_name, author_sum)
brand <- select(link, brand, brand_sum)
names(author) <- NULL
names(brand) <- NULL

nodes <- rbind(data.frame(author),data.frame(brand)) %>% distinct()


reviewNetwork <- graph_from_data_frame(d=link, v=nodes,directed = F)
reviewNetwork
## IGRAPH 5ffab62 UN-- 35 228 -- 
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 5ffab62 (vertex names):
##  [1] Michelle --Maybelline New York Heather  --Maybelline New York
##  [3] Melissa  --Maybelline New York Mary     --Maybelline New York
##  [5] Emily    --Maybelline New York Laura    --Maybelline New York
##  [7] Jennifer --Maybelline New York Amy      --Maybelline New York
##  [9] Karen    --Maybelline New York Sarah    --Maybelline New York
## [11] Anna     --Maybelline New York Katie    --Maybelline New York
## [13] Elizabeth--Maybelline New York Nicole   --Maybelline New York
## [15] Amanda   --Maybelline New York Samantha --Maybelline New York
## + ... omitted several edges
V(reviewNetwork)$size <- log(V(reviewNetwork)$X2)*3

# 紅色是品牌的node 藍色是author
plot(reviewNetwork,
     vertex.color=c("skyblue", "red")[1+(names(V(reviewNetwork)) %in% top_10)])

看看各個評論者對評分的網路圖 會不會有評論者只評高分或只評低分

length(unique(lipstick$author_star))
## [1] 5
star_link <- lipstick %>% 
  select(author_name, author_star) %>% 
  group_by(author_name) %>% 
  mutate(author_sum = n()) %>% 
  distinct() %>% 
  filter(author_sum > 40)


starNetwork <- graph_from_data_frame(d=star_link,directed = F)
starNetwork
## IGRAPH 0ad1d82 UN-- 56 251 -- 
## + attr: name (v/c), author_sum (e/n)
## + edges from 0ad1d82 (vertex names):
##  [1] Ashley         --5.0 out of 5 stars
##  [2] Amazon Customer--4.0 out of 5 stars
##  [3] Angela         --5.0 out of 5 stars
##  [4] Amazon Customer--1.0 out of 5 stars
##  [5] Sara           --1.0 out of 5 stars
##  [6] Amazon Customer--3.0 out of 5 stars
##  [7] Amazon Customer--5.0 out of 5 stars
##  [8] Sarah          --3.0 out of 5 stars
## + ... omitted several edges
plot(starNetwork)

  • 從圖可以得知評論者都很公平,高分低分都有給,不會刻意只給高分或低分

買家對價格的網路圖 看看有沒有一群人都買高價或都買低價

colnames(lipstick_all)[which(names(lipstick_all) == "a")] <- "ASIN"


price_tmp <- lipstick_all %>% 
  left_join(lipstick_detail, by = 'ASIN') %>% 
  select(author_name, Price)

price_tmp$Price <- as.numeric(price_tmp$Price)
## Warning: NAs introduced by coercion
price_tmp <- na.omit(price_tmp)

summary(price_tmp$Price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.75    5.85    8.83   12.99   19.99  118.99
plot(density(price_tmp$Price))

price_link <- price_tmp %>% 
  filter(author_name != 'Kindle Customer' & author_name != 'Amazon Customer') %>% 
  group_by(author_name) %>% 
  mutate(author_sum = n()) %>% 
  ungroup() %>% 
  mutate(price_index = ifelse(Price>20,'high','low')) %>% 
  filter(author_sum > 50) %>% 
  select(author_name, price_index, author_sum) 
  


priceNetwork <- graph_from_data_frame(d=price_link,directed = F)
priceNetwork
## IGRAPH c5141b7 UN-- 17 970 -- 
## + attr: name (v/c), author_sum (e/n)
## + edges from c5141b7 (vertex names):
##  [1] Ashley   --low Sarah    --low Nicole   --low Nicole   --low
##  [5] Amanda   --low Amanda   --low Jessica  --low Katie    --low
##  [9] Jessica  --low Katie    --low Nicole   --low Mary     --low
## [13] Sarah    --low Lisa     --low Jessica  --low Michelle --low
## [17] Jennifer --low Lauren   --low Amanda   --low Stephanie--low
## [21] Stephanie--low Michelle --low Elizabeth--low Amanda   --low
## [25] Jennifer --low Amanda   --low Sarah    --low Stephanie--low
## [29] Mary     --low Katie    --low Michelle --low Laura    --low
## + ... omitted several edges
plot(priceNetwork)

使用bigram斷詞

#以bi-gram做斷詞
lipstick_bigrams <- lipstick_all%>%
  mutate(linenumber =row_number())%>%  
  unnest_tokens(bigram,art_Content, token = "ngrams", n = 2)

看各bigram的次數

lipstick_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 286,727 x 2
##    bigram           nn
##    <chr>         <int>
##  1 the color     14629
##  2 i love         7932
##  3 it is          7805
##  4 my lips        7557
##  5 i have         6282
##  6 this is        6262
##  7 color is       6039
##  8 is a           5963
##  9 this lipstick  5711
## 10 love this      5217
## # ... with 286,717 more rows

將bigram拆成word1, word2

bigrams_separated <- lipstick_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigram_counts
## # A tibble: 77,589 x 3
##    word1     word2       nn
##    <chr>     <chr>    <int>
##  1 lip       color     2089
##  2 skin      tone      1297
##  3 beautiful color     1122
##  4 lip       gloss     1085
##  5 love      love      1039
##  6 lip       balm      1005
##  7 nice      color      924
##  8 staying   power      828
##  9 matte     lipstick   818
## 10 <NA>      <NA>       812
## # ... with 77,579 more rows

trigram

lipstick_all %>%
  unnest_tokens(trigram, art_Content, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 49,049 x 4
##    word1   word2 word3         nn
##    <chr>   <chr> <chr>      <int>
##  1 <NA>    <NA>  <NA>        2220
##  2 love    love  love         475
##  3 natural lip   color        237
##  4 matte   lip   cream        121
##  5 matte   lip   color         87
##  6 soft    matte lip           84
##  7 lips    feel  dry           80
##  8 love    lime  crime         76
##  9 ðÿ      ðÿ    ðÿ            72
## 10 lime    crime velvetines    72
## # ... with 49,039 more rows
#將trigram的結果拆成三個字並另存成一dataframe
lipstick_trigrams <- lipstick_all%>%
  mutate(linenumber =row_number())%>%  
  unnest_tokens(trigram,art_Content, token = "ngrams", n = 3)


trigrams_separated <- lipstick_trigrams %>%
  separate(trigram, c("word1", "word2","word3"), sep = " ")

trigrams_filtered <- trigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  filter(!word3 %in% stop_words$word)

找出評論中提到自己所使用的口紅顏色

#找出評論中提到的商品顏色
bigram_color <- filter(bigrams_filtered,word2%in% lipstick_detail$color)
trigram_color <- filter(trigrams_filtered,word3%in% lipstick_detail$color)

將bigarm的兩個字連接後的wordcloud

bigrams_united <- bigram_color %>%
  unite(bigram, word1, word2, sep = "_")


set.seed(1234)
wordcloud(words = bigrams_united$bigram,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

trigram三個字連接後的wordcloud

trigrams_united <- trigram_color %>%
  unite(trigram, word1, word2, word3, sep = "_")%>%
  filter(!trigram %in% c("NA_NA_NA"))

# set.seed(1234)
# wordcloud(words = trigrams_united$trigram,
#           max.words=200, random.order=FALSE, rot.per=0.35, 
#           colors=brewer.pal(8, "Dark2"))
trigram

trigram

去掉所有形容膚色的詞彙留下可能為討論口紅的顏色

without_skin_color <- filter(trigram_color, word2 !="skin")

合併詞彙再做一次文字雲

new_united <- without_skin_color %>%
  unite(trigram, word1, word2, word3, sep = "_")


# set.seed(1234)
# wordcloud(words = new_united$trigram,
#           max.words=200, random.order=FALSE, rot.per=0.35, 
#           colors=brewer.pal(8, "Dark2"))
trigram color

trigram color

畫成長條圖 看看什麼顏色最常被提到

trigram<-new_united %>%
  count(trigram, sort = TRUE)

trigram %>%
  filter(nn>5)%>%
  ggplot(.,aes(x=reorder(trigram,nn),y=nn))+
  geom_bar(stat="identity")+
  coord_flip()+ labs(y='count',x='trigram of color')

抓出討論亮面與霧面的紀錄

#建立要被篩選的字
mat_word <- c("matt","matte","matted")
gloss_word <- c("gloss","glossy")

#以trigram找出有被提到是霧面還亮面的評論
matte <- filter(trigrams_filtered, word1 %in% mat_word | word2 %in% mat_word | word3 %in% mat_word)
gloss <- filter(trigrams_filtered, word1 %in% gloss_word | word2 %in% gloss_word | word3 %in% gloss_word)
color <- conbine_tidy %>% filter(sentence %in% mat_word |sentence %in% gloss_word | sentence %in% c("dark","red","tone","brown","pink","orange","black","purple","fire","cupcake","nap","cabernet","lay","temptation"))

## 霧面 vs 亮面唇膏趨勢圖
color %>%
  select(art_Date1,sentence)%>%
  count(art_Date1,sentence) %>%
  arrange(desc(n))%>%
  filter(n>10)%>%
  filter(sentence %in% mat_word | sentence %in% gloss_word)%>%
  ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
 geom_line(aes(color = sentence), position="identity")

## 其他顏色趨勢圖
color %>%
  select(art_Date1,sentence)%>%
  count(art_Date1,sentence) %>%
  arrange(desc(n))%>%
  filter(n>10)%>%
  filter(!sentence %in% mat_word & !sentence %in% gloss_word)%>%
  ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
 geom_line(aes(color = sentence), position="identity")

準備資料

#類別轉factor
new_united$brand <- as.factor(new_united$brand)
new_united$trigram <- as.factor(new_united$trigram)


#新增一欄位:是否會進銷售排行前十名
predict_data <- new_united %>% 
  mutate(inRanking = ifelse(Rank2 >= 10, "no", "yes"))
predict_data$inRanking <- as.factor(predict_data$inRanking)

#切分訓練資料
set.seed(1)
indx <- sample(1:nrow(predict_data), nrow(predict_data)*0.7)
test  <- predict_data[-indx,]
train <- predict_data[indx,]

support vector classifier

## 用品牌及trigram的結果預測是否會進入銷售排行前十名

library(e1071)
svmfit6 <- svm(inRanking ~ brand + trigram, data=train , kernel ="linear", cost=10,scale=FALSE)

ypred <- predict (svmfit6 ,test)
x=table(predict = ypred , truth = test$inRanking )
sum(diag(x))/sum(x) 
## [1] 0.9696376

結論

1.唇膏的評論難以使用主題模型分類,但透過尺度縮減及階層式分群使相似的字歸類在一起,能夠使我們推斷出評論潛在的主題
2.從評論當中可以發現消費者對於唇膏的顏色,以及是否搭配自己的膚色較為在意。除此之外,唇膏的目標客群通常為女性,因此從網路圖也可以發現評論者通常以女性居多
3.一開始使用單字斷詞進行分析時,所呈現出的效果不盡理想,最後嘗試透過bigram及trigram進行切割之後,所得出的字詞比較具有意義及可解釋性,最後我們使用trigram的結果來預測銷售績效也達到了不錯的效果