動機與目的

當我們想發展自己的彩妝品牌時,了解市場利基與市場特性便是相當重要的課題。因此本專案以口紅評論作為資料探勘的標的,協助了解市場發展與品牌的重要性:我們挑選 Amazon.com 上的唇膏評論,分析評論者對唇膏的整體看法,並從不同角度剖析評論,例如評論者的情緒起伏、評論所涵蓋的主題內容、評論用字的相似程度,以及評論與品牌、星等、價格之間的關聯性,最後透過模型預測什麼樣的評論所對應的商品有可能進入銷售前十名。

基本設定+載入package

packages = c(
  "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest",
  "tidytext","wordcloud","topicmodels","doParallel","koRpus","quanteda","stringr","tidyr",
  "data.table","readr","lexicon","syuzhet","corrplot","slam","Rtsne","igraph","ggraph",
  "ggrepel","RColorBrewer","widyr","e1071")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Sys.setlocale("LC_ALL","C")
## [1] "LC_CTYPE=C;LC_NUMERIC=C;LC_TIME=C;LC_COLLATE=C;LC_MONETARY=C;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
 #options(digits=5, scipen=10)

library(dplyr)
library(tm)
library(SnowballC)
library(ROCR)
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
library(wordcloud)
library(stringr)
library(tidytext)
library(ggplot2)
library(tidyr)
library(data.table)
library(readr)
library(topicmodels)
library(lexicon)
library(koRpus)
library(quanteda)
library(syuzhet)
library(corrplot)
library(slam)
library(doParallel)
library(Rtsne)
library(igraph)
library(ggraph)
library(ggrepel)
library(RColorBrewer)

讀資料

lipstick_detail=fread('./data/yuchia.lipstick_info_original.csv',na.strings=c("","NA"),stringsAsFactors = F)
lip_sub=fread('./data/yuchia.lipstick_info_suburl.csv',na.strings=c("","NA"),stringsAsFactors = F)


Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
lipstick=fread('./data/yuchia.amazon_lipstick_review.csv',header = T,stringsAsFactors = F)

評論資料前處理

##因為評論有些欄位含有中文資料,需事先做前處理
##刪除換行、前後空白與句點
lipstick$availability<- gsub("\\n", "", lipstick$availability)%>%
  gsub("^\\s+|\\s+$", "", .)%>%
  gsub("[.]", "",.)


#找出 availability 不是中文的資料列
lipstick_nochiness<-lipstick %>% filter(grepl('[a-zA-Z]',lipstick$availability))
lipstick_nochiness$star_sub<-substr(lipstick_nochiness$star,1,3)

##處理字串
lipstick_nochiness$availability<- gsub("^Only.*|^In.*|.*ship.*","In Stock",lipstick_nochiness$availability)%>%
  gsub("^Available.*","Unavailable",.)


##找出 availability 為中文的資料列
lipstick_chiness<-lipstick %>% filter(!grepl('[a-zA-Z]',lipstick$availability))
##處理星等
lipstick_chiness$star_sub<-substr(lipstick_chiness$star,7,10)
##處理字串
lipstick_chiness$availability<- gsub("庫存僅剩\\s[0-9].*", "In Stock", lipstick_chiness$availability) %>%
  gsub(".*[有]?現貨。", "In Stock",.)%>%
  gsub("來自\\s這些暢銷品\\s有貨。", "Unavailable",.) %>%
  gsub("暫時缺貨\\s,", "Unavailable",.)

##合併dataframe
lipstick_review<-rbind(lipstick_nochiness,lipstick_chiness)
colnames(lipstick_review)[which(names(lipstick_review) == "a")] <- "ASIN"

##Date:
Sys.setlocale("LC_TIME", "C")
## [1] "C"
lipstick_review$art_Date1=as.Date(lipstick_review$art_Date,format = "%B %d,%Y")
lipstick_review$Year=year(lipstick_review$art_Date1)
lipstick_review$Month=format(lipstick_review$art_Date1, "%m")

##Brand:
lipstick_review$brand=lipstick_review$brand %>% as.factor
brand=lipstick_review$brand %>% unique

##Star:
lipstick_review$author_star_sub=lipstick_review$author_star %>% substr(1,3)
lipstick_review$star_sub =lipstick_review$star_sub %>% as.numeric()
lipstick_review$author_star_sub =lipstick_review$author_star_sub %>% as.numeric()

#刪除重複紀錄
lipstick_review <-distinct(lipstick_review,art_Content, .keep_all= TRUE)

#去除多餘欄位
lipstick_review <- lipstick_review[,-c(6,10,11)]

唇膏商品資料前處理

## 刪除多餘的欄位
lipstick_detail <- lipstick_detail[,-c(22:45)]
## 與suburl合併
lipstick_detail<-rbind(lipstick_detail,lip_sub,fill=T)
## 刪除重複
lipstick_detail<-distinct(lipstick_detail,URL, .keep_all= TRUE)
## 整理產品星等
lipstick_detail$product_star_sub <- substr(lipstick_detail$Product_Star, 1, 3)
lipstick_detail$product_star_sub <- lipstick_detail$product_star_sub %>% as.numeric()

合併評論與商品資料

#唇膏整合排名by ASIN
lipstick_review$ASIN<-lapply(lipstick_review$art_Url,function(x){strsplit(x, "/")[[1]][6]})%>%as.character()
lip_combine<-lipstick_detail %>%
  filter(Category1 %in% c('Beauty & Personal Care','Beauty & Peronal Care'))%>%
  select(Rank1,Rank2,Category1,ASIN,Customer_Reviews)
lipstick_all<-lipstick_review%>%left_join(lip_combine,by=c("ASIN"="ASIN"))


lipstick_all$Rank1<-lipstick_all$Rank1%>% str_replace_all(",","") %>% as.numeric
lipstick_all$Rank2<-lipstick_all$Rank2%>% str_replace_all(",","") %>% as.numeric

##篩選有Rank2的資料
lipstick_all<-lipstick_all %>%
  filter(!is.na(Rank2))


##整合其他商品資料欄位
another_info <- select(lipstick_detail, product_star_sub, color, Price, Title, ASIN)
colnames(another_info)[4] <- "product_title"
lipstick_all <- left_join(lipstick_all, another_info, by = "ASIN")

#刪除評論內容有NA值的紀錄
lipstick_all <-na.omit(lipstick_all, cols="art_Content")

head(lipstick_all, 1)
##                          _id
## 206 5c89267c923ce022a2d0a193
##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                           art_Content
## 206 Look up a swatch of the shade you're looking at online before you buy it, the pictures here are not even close to the shade you're getting. I knew about this and ordered the shade I wanted and got that exact shade, I just wanted to make sure anyone reading reviews knows that the pictures on here are literally many, MANY shades away from what you're purchasing. Whipped Caviar is a gorgeous shade, definitely one of my favorite from NYX, I'm just saying the image shown for it is not accurate whatsoever. It is, however, a gorgeous color. Goes on a smooth matte and stays in place. I love it!
##          art_Date
## 206 July 23, 2015
##                                                                                                                                           art_Url
## 206 https://www.amazon.com/NYX-Matte-Lipstick-Perfect-Red/product-reviews/B005FYJD7M/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
##     author_name availability                   brand
## 206      Octane     In Stock NYX PROFESSIONAL MAKEUP
##                         comment_title                           title
## 206 LOOK UP SWATCHES BEFORE YOU BUY\n NYX Matte Lipstick, Perfect Red
##     star_sub  art_Date1 Year Month author_star_sub       ASIN Rank1 Rank2
## 206      3.7 2015-07-23 2015    07               5 B005FYJD7M 38102   300
##                  Category1 Customer_Reviews product_star_sub       color
## 206 Beauty & Personal Care             2378              3.7 Perfect red
##     Price                   product_title
## 206  5.85 NYX Matte Lipstick, Perfect Red
  • 資料前處理時,將唇膏商品資訊中的銷售績效欄位併入評論資料表
  • 最後統整出的資料集欄位包含:評論者名稱、評論內容、評論日期、url、評論者給產品的星等(1-5星)、品牌、存貨狀態、評論標題、產品標題、產品星等(1-5星)、銷售排名、留言數(可用下方示意快速核對欄位)
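下方是一個快速核對整併結果的小示意(假設前述合併步驟皆已執行;glimpse 來自已載入的 dplyr):

# 示意:檢視整併後資料集的欄位名稱與型態
glimpse(lipstick_all)

# 順便確認各品牌的評論數,檢查合併結果是否合理
lipstick_all %>% count(brand, sort = TRUE) %>% head(10)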

唇膏排名統計資料

summary(lipstick_all$Rank2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       4     139     333    1651    1910   56404
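從 summary 可見 Rank2 的分布右偏相當嚴重(平均 1651 遠大於中位數 333)。以下是一個觀察分布的小示意(假設 lipstick_all 已如上整理完成),將排名取 log10 後畫直方圖較容易看出形狀:

# 示意:Rank2 右偏嚴重,取 log10 後較容易觀察分布
lipstick_all %>%
  ggplot(aes(x = log10(Rank2))) +
  geom_histogram(bins = 30) +
  labs(x = "log10(Rank2)", y = "count")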

文字內容前處理

# 處理評論文字
lipstick_all$art_Content <- gsub("'", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[[:punct:]]", " ", lipstick_all$art_Content) 
lipstick_all$art_Content <- gsub("[[:cntrl:]]", " ",lipstick_all$art_Content) 
lipstick_all$art_Content <- gsub("^[[:space:]]+", "", lipstick_all$art_Content) 
lipstick_all$art_Content <- gsub("[[:space:]]+$", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[0-9]+", " ", lipstick_all$art_Content)

lipstick_all$art_Content<-iconv(lipstick_all$art_Content,"WINDOWS-1252","UTF-8")
lipstick_all$art_Content <- tolower(lipstick_all$art_Content)


# 處理商品標題
lipstick_all$product_title <- gsub("'", "", lipstick_all$product_title)
lipstick_all$product_title <- gsub("[[:punct:]]", " ", lipstick_all$product_title) 
lipstick_all$product_title <- gsub("[[:cntrl:]]", " ",lipstick_all$product_title) 
lipstick_all$product_title <- gsub("^[[:space:]]+", "", lipstick_all$product_title) 
lipstick_all$product_title <- gsub("[[:space:]]+$", "", lipstick_all$product_title)
lipstick_all$product_title <- gsub("[0-9]+", " ", lipstick_all$product_title)

lipstick_all$product_title<-iconv(lipstick_all$product_title,"WINDOWS-1252","UTF-8")
lipstick_all$product_title <- tolower(lipstick_all$product_title)

#重新編主鍵
lipstick_all<-lipstick_all %>%
  mutate(ID =rownames(lipstick_all))

#刪除舊有id
lipstick_all <- lipstick_all[,-1]

#對評論內容做斷詞
conbine_tidy<-lipstick_all%>%
  mutate(linenumber =row_number())%>%
  unnest_tokens(sentence,art_Content)

#對商品標題做斷詞
title_tidy<-lipstick_all%>%
  mutate(linenumber =row_number())%>%
  unnest_tokens(sentence,product_title)

評論分析

情緒分析(使用NRC字典)

NRC_VAD<-read.table("./data/NRC_VAD/NRC_VAD_Lexicon.txt",header = T,sep="\t",stringsAsFactors = F)
NRC<-get_sentiments("nrc")%>%as.data.frame()
NRC_all<-NRC_VAD%>%left_join(NRC,by=c("Word"="word"))

join2<- conbine_tidy %>%
  inner_join(NRC_VAD,by=(c("sentence"="Word")))%>%
  group_by(linenumber,ID)%>%
  summarise(n=n(),
            valence_avg=sum(Valence)/n,
            arousal_avg=sum(Arousal)/n,
            dominance_avg=sum(Dominance)/n)

#合併join2
lipstick_all<-lipstick_all%>%
  left_join(join2,by="ID")
  • 字典使用 [NRC VAD Lexicon](http://saifmohammad.com/WebPages/nrc-vad.html)
  • NRC VAD Lexicon 包含 20,007 個字詞,並給出效價 (Valence)、激發 (Arousal) 和支配 (Dominance) 三種維度的分數:Valence 代表「正面/負面」或「愉悅/不滿」的維度;Arousal 代表「興奮/平靜」或「主動/被動」的維度;Dominance 則是「強者/弱者」或「完全掌控/無法控制」的維度。分數範圍從 0 至 1(每則評論的情緒分數計算方式可見下方示意)
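為說明每則評論的 valence_avg、arousal_avg、dominance_avg 如何計算,以下是一個小示意(沿用上面已讀入的 NRC_VAD 與斷詞結果 conbine_tidy):將單一評論的斷詞結果與字典 inner_join 後,取三個維度的平均,即等同於上面 join2 的計算。

# 示意:以資料集中第一則評論為例,逐字比對 NRC_VAD 後取三個維度的平均
first_id <- lipstick_all$ID[1]

conbine_tidy %>%
  filter(ID == first_id) %>%                             # 只看這一則評論
  inner_join(NRC_VAD, by = c("sentence" = "Word")) %>%   # sentence 欄存放的是斷詞後的單字
  summarise(
    n             = n(),
    valence_avg   = mean(Valence),
    arousal_avg   = mean(Arousal),
    dominance_avg = mean(Dominance)
  )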

把情緒值為NA的移除

# 移除author_name,brand,star_sub,author_star_sub,Rank2,情緒欄位為NA的資料
lipstick_all_no_NA = na.omit(lipstick_all, cols = c("author_name","brand","star_sub","author_star_sub","Rank2", "valence_avg", "arousal_avg", "dominance_avg"))

正負面情緒,最高與最低各10個brand

sentiment_valence = rbind(
# valence_avg 最高的十個品牌
  lipstick_all_no_NA %>%
    #filter(brand %in% discuss_plot_df$brand) %>%
    group_by(brand) %>%
    summarise(valence_all_avg = mean(valence_avg)) %>%
    arrange(desc(valence_all_avg)) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "high"),
# valence_avg 最低的十個品牌
  lipstick_all_no_NA %>%
    #filter(brand %in% discuss_plot_df$brand) %>%
    group_by(brand) %>%
    summarise(valence_all_avg = mean(valence_avg)) %>%
    arrange(valence_all_avg) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "low")
  )

#畫圖
sentiment_valence %>% 
  group_by(score) %>%
  arrange(desc(valence_all_avg)) %>%
  ungroup() %>%
  ggplot(aes(reorder(brand, valence_all_avg), valence_all_avg, fill = score)) +
  geom_col() +
  xlab("Brand") +
  ylab("Valence AVG") +
  coord_flip() 

內縮/外放情緒,最高與最低各10個brand

sentiment_arousal = rbind(
# arousal_avg 最高的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(arousal_all_avg = mean(arousal_avg)) %>%
    arrange(desc(arousal_all_avg)) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "high"),
# arousal_avg 最低的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(arousal_all_avg = mean(arousal_avg)) %>%
    arrange(arousal_all_avg) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "low")
  )

# 畫圖
sentiment_arousal %>% 
  group_by(score) %>%
  arrange(desc(arousal_all_avg)) %>%
  ungroup() %>%
  ggplot(aes(reorder(brand, arousal_all_avg), arousal_all_avg, fill = score)) +
  geom_col() +
  xlab("Brand") +
  ylab("Arousal AVG") +
  coord_flip()

支配程度情緒,最高與最低各10個brand

sentiment_dominance = rbind(
# dominance_avg 最高的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(dominance_all_avg = mean(dominance_avg)) %>%
    arrange(desc(dominance_all_avg)) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "high"),
# dominance_avg 最低的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(dominance_all_avg = mean(dominance_avg)) %>%
    arrange(dominance_all_avg) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "low")
  )

#畫圖
sentiment_dominance %>% 
  group_by(score) %>%
  arrange(desc(dominance_all_avg)) %>%
  ungroup() %>%
  ggplot(aes(reorder(brand, dominance_all_avg), dominance_all_avg, fill = score)) +
  geom_col() +
  xlab("Brand") +
  ylab("Dominance AVG") +
  coord_flip()

JOIN Valence 與 Arousal 兩維度的情緒

# 把valence與arousal維度最高10個與最低10個都有出現的品牌join一起
sentiment_join = sentiment_valence %>%
  inner_join(sentiment_arousal, by = c(brand = "brand"))

sentiment_join$score.x = as.factor(sentiment_join$score.x)
sentiment_join$score.y = as.factor(sentiment_join$score.y)

sentiment_join %>%
  ggplot(aes(score.x, score.y, label = brand)) +
  geom_jitter(position = position_jitter(seed = 87)) +
  xlab("Valence") +
  ylab("Arousal") +
  geom_vline(xintercept = 1.5) +    # 在x軸中間加入線條以利區別點點位置
  geom_hline(yintercept = 1.5) +    # 在y軸中間加入線條以利區別點點位置
  scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) +    # 將x軸的刻度對調
  scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) +    # 將y軸的刻度對調
  geom_label_repel(position = position_jitter(seed = 87), aes(label = brand), box.padding = 0.7)  # 把每個點點標上brand名稱

這張圖可以看出:越靠右邊的品牌帶給消費者越正面的情緒,反之越靠左邊越負面;越靠上方的品牌,消費者的情緒表達越外放,反之越靠下方則越內縮。
1. 以 Oriflame、ONE1X 等品牌的脣膏為例,消費者感受到正面的情緒,且評論的敘述偏向想把好心得分享給大家
2. NOTE Cosmetics、Nykaa 等品牌的脣膏則讓消費者產生負面情緒,而且評論也較積極地表現出不滿的心情
3. Kryolan 雖然讓消費者有負面情緒,但消費者的評論是用比較婉轉的方式點出產品的缺點

查看一下上述分析中的評論內容

lipstick_all_no_NA[lipstick_all_no_NA$brand == "Oriflame" | lipstick_all_no_NA$brand == "ONE1X", 1]
## [1] "its superb  my new lip love"

Oriflame、ONE1X 的評論表現出很喜歡該品牌,有愛不釋手的感覺

lipstick_all_no_NA[lipstick_all_no_NA$brand == "NOTE Cosmetics", 1]
## [1] "i would never ever buy again    so disappointed   very bad experience"                      
## [2] "didn t keep it because it was totally broken off when i opened package   wouldn t buy agsin"

NOTE Cosmetics 的評論很明顯唾棄該品牌的脣膏,而且非常不推薦給其他人

lipstick_all_no_NA[lipstick_all_no_NA$brand == "Kryolan", 1]
## [1] "it has quite a lot of wax smell in it"

Kryolan的評論沒有直接表現出對品牌的喜好,但委婉地說出產品的缺點

加入星星數量

star_amount = lipstick_all_no_NA %>%
  group_by(brand) %>%
  summarise(star = mean(star_sub)) %>%
  ungroup()

sentiment_join_star = sentiment_join %>%
  inner_join(star_amount, by = c(brand = "brand"))

sentiment_join_star %>%
  ggplot(aes(score.x, score.y, label = brand)) +
  geom_jitter(position = position_jitter(seed = 87), aes(size = star)) +   # 星星數越高,點點越大
  xlab("Valence") +
  ylab("Arousal") +
  geom_vline(xintercept = 1.5) +    # 加入線條以利區別點點位置
  geom_hline(yintercept = 1.5) +    # 加入線條以利區別點點位置
  scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) +    # 將x軸排序對調
  scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) +    # 將y軸排序對調
  geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7)  # 把每個點點標上brand名稱

  • 該品牌平均所得的星星越多,點點的size會越大
    這幾個品牌的脣膏所拿到的星星數(size大小),與其帶給消費者的情緒的關係符合常理,消費者感受越好,給的星星自然會比較多(size較大)。

加入lipstick商品排名(Rank2)

# 將同品牌lipstick的rank平均
rank_avg = lipstick_all_no_NA %>%
  group_by(brand) %>%
  summarise(rank = mean(Rank2)) %>%
  ungroup()

sentiment_join_rank = sentiment_join %>%
  inner_join(rank_avg, by = c(brand = "brand"))

sentiment_join_rank %>%
  ggplot(aes(score.x, score.y, label = brand)) +
  geom_jitter(position = position_jitter(seed = 87), aes(size = -rank)) +   # rank越前面,點點越大
  xlab("Valence") +
  ylab("Arousal") +
  geom_vline(xintercept = 1.5) +    # 加入線條以利區別點點位置
  geom_hline(yintercept = 1.5) +    # 加入線條以利區別點點位置
  scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) +    # 將x軸排序對調
  scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) +    # 將y軸排序對調
  geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7)  # 把每個點點標上brand名稱

  • 該品牌脣膏的銷售排名越好,點點的size會越大
    • Oriflame牌的銷售排名並不好,可能是價錢較貴導致比較少人購買
    • Addictive Cosmetics雖讓消費者印象比較不好,但是可能因為價格便宜,消費者比較買得下手
  • 脣膏的銷售量排行與消費者的正負面情緒沒有太直接關係(把篩選的個數增加也一樣)

討論正負情緒、star與rank的關係

# 將正負面分數最高與最低各10個牌子,連結該牌的平均銷售排行與平均獲得星星數
valence_rank_star = sentiment_valence %>%
  inner_join(rank_avg, by = c(brand = "brand")) %>%
  inner_join(star_amount, by = c(brand = "brand"))

valence_rank_star
## # A tibble: 20 x 5
##    brand                 valence_all_avg score   rank  star
##    <fct>                           <dbl> <chr>  <dbl> <dbl>
##  1 Oriflame                        0.860 high  44499   5   
##  2 LCTCKP                          0.817 high  44649   5   
##  3 BORNTREE                        0.796 high  28540   4   
##  4 Stephanie Imports               0.783 high   6951   5   
##  5 Nasty Woman Cosmetics           0.776 high   5208   5   
##  6 Make Up For Ever                0.763 high  50366   4.8 
##  7 IOPE                            0.760 high  10295   4.7 
##  8 Clarins                         0.754 high  24797.  3.67
##  9 Splashes & Spills               0.753 high   2066   3.4 
## 10 Femme Couture                   0.753 high  10604   4.6 
## 11 Beauty Glazed                   0.329 low    9065   1   
## 12 A'some                          0.438 low   27233   2   
## 13 Huda Beauty                     0.484 low    4937   2.9 
## 14 Kryolan                         0.492 low   54303   1   
## 15 Smoke & Mirrors                 0.506 low   46511   1   
## 16 MILEMEI                         0.515 low   25234   1.33
## 17 NOTE Cosmetics                  0.522 low   39886   1   
## 18 Measurable Difference           0.529 low   28815   2   
## 19 VDIVOV                          0.532 low   15117   2.6 
## 20 True Color                      0.567 low   10458   2.9
  • 正面情緒的品牌星星數偏高;負面情緒的品牌星星數偏低

畫出正負情緒、star與rank的關係圖

valence_rank_star %>%
  ggplot(aes(star, valence_all_avg, label = brand)) +
  geom_point(aes(size = -rank, color = -rank)) +       # 銷售排行越好,點點越大,顏色越深
  #geom_text_repel(box.padding = 0.7, aes(size = 2)) +
  geom_label_repel( box.padding = 0.7, size = 3) +
  ylab("Sentiment Valence") +
  xlab("Star Amount")

  • X軸:獲得星星數(越往右星星越多);Y軸:消費者的情緒分數(越往上情緒越正向)
    此圖呈現的關係與前面的分析相符:脣膏的銷售排名與消費者情緒沒有太直接的關係,所以各處的點有大有小。
    但品牌所獲得的星星數量能明確反映消費者的使用體驗,故圖中的點較集中於右上方(體驗好,星星多)和左下方(體驗差,星星少)。下方再以相關係數做一個簡單的檢查。
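以下用相關係數對上面的觀察做一個簡單檢查(示意,沿用上面已建立的 valence_rank_star):

# 示意:情緒分數與星等、銷售排名(名次數字,越小代表賣得越好)的相關係數
cor(valence_rank_star$valence_all_avg, valence_rank_star$star)
cor(valence_rank_star$valence_all_avg, valence_rank_star$rank)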

主題分析

#LDA
conbine_document<-conbine_tidy %>%
  anti_join(stop_words,by=c("sentence"="word")) %>%
  count(linenumber, sentence, sort = TRUE) %>%
  ungroup()

#轉成dtm
top10_dtm <- conbine_document %>%
  cast_dtm(linenumber, sentence, n)

top10_dtm
## <<DocumentTermMatrix (documents: 59740, terms: 18587)>>
## Non-/sparse entries: 676002/1109711378
## Sparsity           : 100%
## Maximal term length: NA
## Weighting          : term frequency (tf)
lda_top10 <- LDA(top10_dtm, k = 2, control = list(seed = 1010))
lda_top10
## A LDA_VEM topic model with 2 topics.
#主題跟字的關係
document_topics <- tidy(lda_top10, matrix = "beta")
document_topics
## # A tibble: 37,174 x 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     1 red      0.0103  
##  2     2 red      0.00564 
##  3     1 organic  0.000536
##  4     2 organic  0.000190
##  5     1 lipstick 0.000670
##  6     2 lipstick 0.0620  
##  7     1 color    0.00674 
##  8     2 color    0.111   
##  9     1 product  0.0224  
## 10     2 product  0.00831 
## # … with 37,164 more rows
top_terms <- document_topics %>%
  filter(!term  %in% c("lipstick","color","love","lips","product","doe snt","lip","lipsticks"))%>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms
## # A tibble: 40 x 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     1 colors    0.0120 
##  2     1 day       0.0107 
##  3     1 shade     0.0107 
##  4     1 red       0.0103 
##  5     1 time      0.0102 
##  6     1 pretty    0.00951
##  7     1 im        0.00946
##  8     1 doesnt    0.00938
##  9     1 buy       0.00920
## 10     1 beautiful 0.00865
## # … with 30 more rows
#畫圖呈現主題跟字的關係

 top_terms %>%
   mutate(term = reorder(term, beta)) %>%
   ggplot(aes(term, beta, fill = factor(topic))) +
   geom_col(show.legend = FALSE) +
   facet_wrap(~ topic, scales = "free") +
   coord_flip()

算出beta的差異值

beta_spread <- document_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread
## # A tibble: 219 x 4
##    term          topic1   topic2 log_ratio
##    <chr>          <dbl>    <dbl>     <dbl>
##  1 absolutely  0.00294  0.000617  -2.25   
##  2 add         0.00152  0.000174  -3.12   
##  3 amazing     0.00344  0.00202   -0.769  
##  4 amazon      0.00324  0.00106   -1.61   
##  5 application 0.00180  0.00253    0.496  
##  6 applicator  0.00112  0.000365  -1.61   
##  7 applied     0.000808 0.00349    2.11   
##  8 apply       0.00598  0.00601    0.00729
##  9 applying    0.00305  0.000647  -2.24   
## 10 arrived     0.00136  0.00103   -0.396  
## # … with 209 more rows
beta_spread%>%
  arrange(desc(abs(log_ratio)))  %>%
  head(20)%>%
  mutate(term = reorder(term, log_ratio)) %>%
  ggplot(aes(term, log_ratio, fill = log_ratio>0)) +
  geom_col(show.legend = FALSE) +
  xlab("term") +
  ylab("log(topic2 / topic1)") +
  coord_flip()

  • 將 beta 取 log ratio 後,挑出差異最大的字詞,可以很明顯看出 topic1 偏向顏色;topic2 偏向質地與成份(texture 質地、shimmer 珠光、moisture 水分)。下方再以 gamma 矩陣檢查每則評論的主題歸屬。
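上面是從字詞角度(beta)觀察兩個主題;若想確認每則評論被分到哪個主題、以及兩個主題能否清楚切分單一評論,可以再看 document-topic 的 gamma 矩陣,以下為一個小示意(沿用上面訓練好的 lda_top10):

# 示意:取出每則評論對各主題的 gamma(主題機率)
document_gamma <- tidy(lda_top10, matrix = "gamma")

# 若多數評論的最大 gamma 都接近 0.5,表示兩個主題對單一評論的區分度不高
document_gamma %>%
  group_by(document) %>%
  summarise(max_gamma = max(gamma)) %>%
  ggplot(aes(max_gamma)) +
  geom_histogram(bins = 30)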

以品牌的角度分析商品評論

近十年品牌討論度

#以品牌被討論的次數來找出前十大品牌
brand_discuss_df=lipstick_review %>% 
  group_by(.,brand,Year) %>%
  summarise(
    discuss_n=n()
  )%>% filter(Year>2009)

brand_discuss_df %>% 
  group_by(.,brand) %>%
  summarise(
    discuss_sum=sum(discuss_n) 
  ) ->top10_df


#top10品牌討論度by年分
discuss_plot_df <- subset(
  brand_discuss_df,
  brand %in% top10_df[order(top10_df$discuss_sum, decreasing = TRUE), ]$brand[1:10]
)

discuss_plot_df <- subset(discuss_plot_df, !is.na(Year))

品牌討論數之時間分布圖(按年分)

ggplot(discuss_plot_df, aes(x = Year, y =discuss_n,color=brand)) + 
  geom_line() + geom_point(  fill = "white")+scale_x_continuous(breaks = c(2009:2019))+scale_y_continuous(breaks=c(0,1000,2000,3000,4000,5000,6000,7000,8000,9000)) 

  • Maybelline 的討論度在2018年異軍突起。
brand_product_discuss_df=lipstick_review %>% 
  group_by(.,brand, ASIN,Year) %>%
  summarise(
    discuss_n=n()
  )%>% filter(Year>2009)

brand_product_discuss_df %>% 
  na.omit()%>%
  filter(Year==2018, brand%in% top10_df$brand)%>%
  arrange(desc(discuss_n))
## # A tibble: 1,340 x 4
## # Groups:   brand, ASIN [1,340]
##    brand               ASIN        Year discuss_n
##    <fct>               <chr>      <int>     <int>
##  1 Maybelline New York B06XF16MWM  2018      1226
##  2 Lime Crime          B00JH2B182  2018       487
##  3 L'Oreal Paris       B004BJ6VNG  2018       427
##  4 Maybelline New York B00467B0KC  2018       424
##  5 Burt's Bees         B014VZUH1G  2018       337
##  6 COVERGIRL           B00J2AP9OG  2018       335
##  7 LipSense            B00HD5NRSC  2018       313
##  8 Revlon              B001P2JZBC  2018       290
##  9 Revlon              B000V5OQNG  2018       265
## 10 Maybelline New York B07BM1XTKH  2018       253
## # … with 1,330 more rows
  • 評論數量在近幾年逐漸有上升的趨勢,尤其以 Maybelline 在2018年評論數量暴增,其最熱門商品的討論數約為第二名商品的 2.5 倍
#依銷售排名(Rank2)列出2018年有被評論的產品(前20筆)
lipstick_all %>% 
  filter(Year == 2018) %>% 
  select(ASIN, Rank2, Price, color) %>% 
  unique() %>% 
  arrange(Rank2) %>% 
  head(20)
##          ASIN Rank2 Price           color
## 1  B001P2JZBC     4  6.99    Bare Maximum
## 2  B00XQ9IRP2     6 19.93     Frog Prince
## 3  B00J2AP9OG    12  3.99           Clear
## 4  B00J2AP9OG    13  3.99           Clear
## 5  B014VZUH1G    14  6.03     Blush Basin
## 6  B00LNQB8AQ    16  9.99 Jennifer's Nude
## 7  B0032RMX3U    23  21.9  Black Honey 06
## 8  B00IOWYHJ6    40  6.85               1
## 9  B00G3DWC54    55   6.4           MATTE
## 10 B00VAY4AJM    57  7.52  Redwood Forest
## 11 B078VNH5MR    60 14.98               A
## 12 B071VCF42M    65  4.59      Rebel Rose
## 13 B018WCT05I    68  3.84      Violet Red
## 14 B01DPA81MK    75  4.98     Clear Vinyl
## 15 B00BG14WOM    78  4.97       Pink Frot
## 16 B07JFNVP61    83 12.99               A
## 17 B000V5OQNG    88  6.93            NUDE
## 18 B019YUECNC    92  3.89        Budapest
## 19 B01GH0421Y   110  3.59         1 Count
## 20 B00BR0DAG8   139    18          Single

Bigram & Trigram

使用bigram斷詞

#以bi-gram做斷詞
lipstick_bigrams <- lipstick_all%>%
  mutate(linenumber =row_number())%>%  
  unnest_tokens(bigram,art_Content, token = "ngrams", n = 2)

看各bigram的次數

lipstick_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 266,278 x 2
##    bigram           nn
##    <chr>         <int>
##  1 the color     14986
##  2 it is          7990
##  3 i love         7897
##  4 my lips        7722
##  5 this is        6569
##  6 i have         6534
##  7 is a           6215
##  8 color is       6045
##  9 this lipstick  5674
## 10 love this      5388
## # … with 266,268 more rows

將bigram拆成word1, word2

bigrams_separated <- lipstick_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigram_counts
## # A tibble: 71,279 x 3
##    word1     word2       nn
##    <chr>     <chr>    <int>
##  1 lip       color     2494
##  2 skin      tone      1253
##  3 lip       gloss     1157
##  4 lip       balm      1088
##  5 beautiful color     1026
##  6 love      love       913
##  7 staying   power      903
##  8 nice      color      893
##  9 lime      crime      798
## 10 matte     lipstick   767
## # … with 71,269 more rows

使用trigram做斷詞

lipstick_all %>%
  unnest_tokens(trigram, art_Content, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)
## # A tibble: 44,860 x 4
##    word1   word2 word3    nn
##    <chr>   <chr> <chr> <int>
##  1 <NA>    <NA>  <NA>   1922
##  2 love    love  love    414
##  3 natural lip   color   271
##  4 matte   lip   cream   127
##  5 matte   lip   color   102
##  6 lasting lip   color    92
##  7 soft    matte lip      88
##  8 lips    feel  dry      83
##  9 tinted  lip   balm     83
## 10 love    lime  crime    74
## # … with 44,850 more rows
#將trigram的結果拆成三個字並另存成一dataframe
lipstick_trigrams <- lipstick_all%>%
  mutate(linenumber =row_number())%>%  
  unnest_tokens(trigram,art_Content, token = "ngrams", n = 3)


trigrams_separated <- lipstick_trigrams %>%
  separate(trigram, c("word1", "word2","word3"), sep = " ")

trigrams_filtered <- trigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  filter(!word3 %in% stop_words$word)

找出評論中提到自己所使用的口紅顏色

#找出評論中提到的商品顏色
bigram_color <- filter(bigrams_filtered,word2%in% lipstick_detail$color)
trigram_color <- filter(trigrams_filtered,word3%in% lipstick_detail$color)

將bigram的兩個字連接後的wordcloud

bigrams_united <- bigram_color %>%
  unite(bigram, word1, word2, sep = "_") %>% 
  filter(!bigram %in% c("NA_NA"))


set.seed(1234)
wordcloud(words = bigrams_united$bigram,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

  • 最常被提到的是膚色

trigram三個字連接後的wordcloud

trigrams_united <- trigram_color %>%
  unite(trigram, word1, word2, word3, sep = "_")%>%
  filter(!trigram %in% c("NA_NA_NA"))

set.seed(1234)
wordcloud(words = trigrams_united$trigram,
         max.words=200, random.order=FALSE, rot.per=0.35, 
         colors=brewer.pal(8, "Dark2"))

  • olive膚色調出現最多,其次為白皙的皮膚色調。

去掉所有形容膚色與嘴唇的詞彙,留下可能在討論口紅顏色的詞彙

#先去掉bobbi brown(其為品牌名稱而不是一種顏色)
trigram_color <- trigram_color %>% 
  filter(!(word2 =="bobbi" & word3 =="brown"))

pure_color <- trigram_color %>% 
  filter(word2 !="skin") %>% 
  filter(word2 !="lip")

合併詞彙再做一次文字雲

new_united <- pure_color %>%
  unite(trigram, word1, word2, word3, sep = "_")


set.seed(1234)
wordcloud(words = new_united$trigram,
          max.words=200, random.order=FALSE, rot.per=0.35, 
         colors=brewer.pal(8, "Dark2"))

  • 帶藍調(偏冷色系)的紅色最常被提到
# 將上述結果以長條圖表示
trigram<-new_united %>%
  count(trigram, sort = TRUE)

color_discuss <- trigram %>%
  filter(nn>10)%>%
  ggplot(.,aes(x=reorder(trigram,nn),y=nn))+
  geom_bar(stat="identity")+
  coord_flip()+ labs(y='count',x='trigram of color')
  
color_discuss

  • 第二常出現的顏色是泡泡糖粉,第三則為消防車紅

十大品牌中被討論的顏色

color_mentioned <- new_united %>% 
  filter(brand %in% discuss_plot_df$brand) %>% 
  count(trigram, sort = TRUE) 

color_mentioned
## # A tibble: 1,658 x 2
##    trigram              nn
##    <chr>             <int>
##  1 blue_based_red       19
##  2 fire_engine_red      18
##  3 bubble_gum_pink      16
##  4 blue_toned_red       15
##  5 deep_dark_red        11
##  6 perfect_matte_red    11
##  7 nice_deep_red        10
##  8 nice_bright_red       7
##  9 true_blue_red         7
## 10 barbie_doll_pink      6
## # … with 1,648 more rows
color_plot <- color_mentioned %>%
  head(10)%>%
  ggplot(.,aes(x=reorder(trigram,nn),y=nn))+
  geom_bar(stat="identity")+
  coord_flip()+ labs(y='count',x='trigram of color')

color_plot

  • 前十大品牌和所有品牌的結果相似。
  • 表示十大品牌有跟隨顏色的趨勢來推出產品。
  • 或可以說市面上產品的確跟隨著流行色系。
#找出10大品牌討論中所提到的顏色
#只顯示10種
top10_color <- color_mentioned %>% 
  head(10) 
  
new_united %>% 
  filter(brand %in% discuss_plot_df$brand) %>% 
  filter(trigram %in% color_mentioned$trigram) %>% 
  filter(trigram %in% top10_color$trigram) %>% 
  select(brand, trigram) %>% 
  ggplot(aes(trigram, fill=brand)) + geom_bar(position="stack")

  • 由圖可得知,Maybelline 近十年每年的討論度都排在前十名的前段,可能是因為其產品顏色多樣,且推出了許多討論度高的顏色。
  • L'Oreal 同系列的口紅會推出相當多種顏色,但其顏色卻不是熱門的討論色系,可見 Maybelline 與 L'Oreal 這兩大以唇膏著名的品牌有著不同的產品策略。

針對2018年看10大品牌討論的顏色

new_united %>%
  filter(Year == 2018) %>% 
  filter(brand %in% discuss_plot_df$brand) %>% 
  filter(trigram %in% color_mentioned$trigram) %>% 
  filter(trigram %in% top10_color$trigram) %>% 
  select(brand, trigram) %>% 
  ggplot(aes(trigram, fill=brand)) + geom_bar(position="stack")

  • 2018年十大品牌所討論的顏色只有8種,但同樣可見,常被討論的顏色中 Maybelline 的口紅就涵蓋了五種,因此同樣可知 Maybelline 最大的優勢是顏色。
  • 但值得注意的是,當年度討論度最高的 bubble gum pink 卻沒有出現在 Maybelline 的產品中。
  • 而這一年,雖然 L'Oreal 和 Revlon 的產品評論皆沒提到顏色,但其討論度相較前一年也稍有增長,推測此兩品牌的產品可能有其他特點受消費者喜愛

口紅質地討論(亮面/霧面)

#建立要被篩選的字
mat_word <- c("matt","matte","matted")
gloss_word <- c("gloss","glossy")

#以trigram找出有被提到是霧面還亮面的評論
matte <- filter(trigrams_filtered, word1 %in% mat_word | word2 %in% mat_word | word3 %in% mat_word)
gloss <- filter(trigrams_filtered, word1 %in% gloss_word | word2 %in% gloss_word | word3 %in% gloss_word)
color <- conbine_tidy %>% filter(sentence %in% mat_word |sentence %in% gloss_word | sentence %in% c("dark","red","tone","brown","pink","orange","black","purple","fire","cupcake","nap","cabernet","lay","temptation"))

## 唇膏質地趨勢圖(霧面 vs 亮面)
color %>%
  select(art_Date1,sentence)%>%
  count(art_Date1,sentence) %>%
  arrange(desc(n))%>%
  filter(n>10)%>%
  filter(sentence %in% mat_word | sentence %in% gloss_word)%>%
  ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
 geom_line(aes(color = sentence), position="identity")

## 顏色趨勢圖
color %>%
  select(art_Date1,sentence)%>%
  count(art_Date1,sentence) %>%
  arrange(desc(n))%>%
  filter(n>10)%>%
  filter(!sentence %in% mat_word & !sentence %in% gloss_word)%>%
  ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
 geom_line(aes(color = sentence), position="identity")

分析各膚色人數多寡

# 挑出提到"skin tone"的詞彙
## 其他含skin的trigram可能也是在說評論者的膚色,但無法直接判斷是原膚色還是擦了口紅後使皮膚顏色暗沉或變得明亮,因此只留下含有"skin tone"的trigram
skin_color <- trigram_color %>% 
  filter(word2 =="skin") %>% 
  filter(word3 == "tone")


skin_color %>% 
  group_by(word1) %>%
  count() %>%
  filter(nn>2) %>% 
  ggplot(.,aes(x=reorder(word1,nn),y=nn))+
  geom_bar(stat="identity")+
  coord_flip()+ labs(y='count',x='skin color')

# 計算各膚色人數
color_count <- trigram_color %>% 
  count(word1, word2, word3, sort = TRUE)

color_count
## # A tibble: 2,475 x 4
##    word1  word2  word3    nn
##    <chr>  <chr>  <chr> <int>
##  1 olive  skin   tone     52
##  2 medium skin   tone     46
##  3 fair   skin   tone     31
##  4 blue   based  red      28
##  5 bubble gum    pink     26
##  6 fire   engine red      23
##  7 darker skin   tone     22
##  8 light  skin   tone     21
##  9 dark   skin   tone     20
## 10 brown  skin   tone     18
## # … with 2,465 more rows
#找出有提到自己膚色的評論
temp <- trigram_color %>% 
  unite(trigram, word1, word2, word3, sep = "_") %>% 
  group_by(author_name, ASIN) %>% 
  count(trigram) 
# 找出同時提到兩種顏色的評論(膚色+口紅顏色)
names <- temp %>%
  group_by(author_name, ASIN) %>% 
  count(author_name)

names <- names %>% 
  filter(n==2)

pair <- temp %>% 
  filter(author_name %in% names$author_name)

計算詞彙共同出現的關聯性

# correlating pairs
library(widyr)

word_cors <- pair %>%
  pairwise_cor(trigram, author_name, sort = TRUE)

word_cors
## # A tibble: 629,642 x 3
##    item1                    item2                    correlation
##    <chr>                    <chr>                          <dbl>
##  1 opaque_dark_red          gorgeous_opaque_dark               1
##  2 gorgeous_opaque_dark     opaque_dark_red                    1
##  3 stay_bright_pink         sheer_berry_pink                   1
##  4 sheer_berry_pink         stay_bright_pink                   1
##  5 havent_worn_black        beautiful_luscious_black           1
##  6 beautiful_luscious_black havent_worn_black                  1
##  7 mahogany_purply_red      burgandy_brown_red                 1
##  8 burgandy_brown_red       mahogany_purply_red                1
##  9 nude_pink_tone           flashy_bright_pink                 1
## 10 flashy_bright_pink       nude_pink_tone                     1
## # … with 629,632 more rows
skin_united <- skin_color %>% 
  unite(trigram, word1, word2, word3, sep = "_") 

# 篩選描述膚色的詞彙(item1)搭配口紅顏色(item2)
word_cors <- word_cors %>% 
  filter(item1 %in% skin_united$trigram)

word_cors
## # A tibble: 26,962 x 3
##    item1                 item2                   correlation
##    <chr>                 <chr>                         <dbl>
##  1 reddish_skin_tone     beige_light_purple                1
##  2 girbilive_skin_tone   hot_bubblegum_pink                1
##  3 compliments_skin_tone hot_bubblegum_pink                1
##  4 girbilive_skin_tone   natural_pink_brown                1
##  5 compliments_skin_tone natural_pink_brown                1
##  6 girbilive_skin_tone   solid_matte_red                   1
##  7 compliments_skin_tone solid_matte_red                   1
##  8 girbilive_skin_tone   bit_darker_pink                   1
##  9 compliments_skin_tone bit_darker_pink                   1
## 10 girbilive_skin_tone   differs_personally_pink           1
## # … with 26,952 more rows
# 找出出現次數較多(>15)的膚色
skin_rank <- skin_united %>% 
  group_by(trigram) %>% 
  count(sort = TRUE) %>% 
  filter(nn>15)

skin_rank
## # A tibble: 7 x 2
## # Groups:   trigram [7]
##   trigram             nn
##   <chr>            <int>
## 1 olive_skin_tone     52
## 2 medium_skin_tone    46
## 3 fair_skin_tone      31
## 4 darker_skin_tone    22
## 5 light_skin_tone     21
## 6 dark_skin_tone      20
## 7 brown_skin_tone     18
most <- skin_rank %>% 
  head(3)

對膚色調與口紅顏色繪製網路圖

set.seed(1000)

word_cors %>%
  filter(item1 %in% skin_rank$trigram) %>%
  filter(correlation > .5) %>% 
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

# 選出最多人擁有的三種膚色調
# 挑選出correlation > 0.37 的口紅顏色
most_cor <- word_cors %>%
  filter(item1 %in% most$trigram) %>%
  filter(correlation > .37) 


## 區分色系
red <- pure_color %>% 
  filter(word1 =="red" | word2 == "red" | word3 == "red") %>% 
  unite(trigram, word1, word2, word3, sep = "_") %>% 
  select(trigram)

pink <- pure_color %>% 
  filter(word1 =="pink" | word2 == "pink" | word3 == "pink") %>% 
  unite(trigram, word1, word2, word3, sep = "_") %>% 
  select(trigram)

orange <- pure_color %>% 
  filter(word1 =="orange" | word2 == "orange" | word3 == "orange") %>% 
  unite(trigram, word1, word2, word3, sep = "_") %>% 
  select(trigram)

brown <- pure_color %>% 
  filter(word1 =="brown" | word2 == "brown" | word3 == "brown") %>% 
  unite(trigram, word1, word2, word3, sep = "_") %>% 
  select(trigram)

black <- pure_color %>% 
  filter(word1 =="black" | word2 == "black" | word3 == "black") %>% 
  unite(trigram, word1, word2, word3, sep = "_") %>% 
  select(trigram)
# 區分correlation的色系
R <- most_cor %>% 
  filter(item2 %in% red$trigram) %>% 
  mutate(color="red_tone")

P <- most_cor %>% 
  filter(item2 %in% pink$trigram) %>% 
  mutate(color="pink_tone")

O <- most_cor %>% 
  filter(item2 %in% orange$trigram) %>% 
  mutate(color="orange_tone")

Br <- most_cor %>% 
  filter(item2 %in% brown$trigram) %>% 
  mutate(color="brown_tone")

Bl <- most_cor %>% 
  filter(item2 %in% black$trigram) %>% 
  mutate(color="black_tone")

most_cor <- rbind(R, P, O, Br, Bl)

most_cor
## # A tibble: 303 x 4
##    item1          item2              correlation color   
##    <chr>          <chr>                    <dbl> <chr>   
##  1 fair_skin_tone solid_matte_red          0.405 red_tone
##  2 fair_skin_tone horrible_color_red       0.405 red_tone
##  3 fair_skin_tone rich_girl_red            0.405 red_tone
##  4 fair_skin_tone darker_deeper_red        0.405 red_tone
##  5 fair_skin_tone mouse_bright_red         0.405 red_tone
##  6 fair_skin_tone ugly_red_brown           0.405 red_tone
##  7 fair_skin_tone pink_blue_red            0.405 red_tone
##  8 fair_skin_tone orange_tone_red          0.405 red_tone
##  9 fair_skin_tone snow_white_red           0.405 red_tone
## 10 fair_skin_tone deep_wine_red            0.405 red_tone
## # … with 293 more rows
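順帶一提,上面逐一建立 red / pink / orange / brown / black 再分段篩選、rbind 的流程,也可以用迴圈改寫。以下僅為示意(假設 pure_color 與「尚未加上 color 欄位前」的 most_cor 都已如上建立),結果應與上表相同:

# 示意:以迴圈一次處理五種色系,取代逐色系複製貼上的程式碼
tones <- c("red", "pink", "orange", "brown", "black")

most_cor_alt <- lapply(tones, function(tone) {
  tone_trigram <- pure_color %>%
    filter(word1 == tone | word2 == tone | word3 == tone) %>%
    unite(trigram, word1, word2, word3, sep = "_") %>%
    pull(trigram)

  most_cor %>%
    filter(item2 %in% tone_trigram) %>%
    mutate(color = paste0(tone, "_tone"))
}) %>%
  bind_rows()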

繪製前三大膚色與口紅顏色的關聯網路圖

tone_cor <- most_cor
tone_cor$item2 <- most_cor$color
tone_cor <- tone_cor[,-4]

tone_cor %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "pink", size = 5) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

  • 中間色調的膚色(medium skin tone)和其他膚色及口紅顏色相距最遠,correlation 最小
  • 由圖可見白皙膚調(fair skin tone)與各色系口紅都有關聯,因此位在圖的最中間,呼應了網路上許多「依膚色挑選口紅色號」的教學
  • 而最常與人數最多的膚色(olive skin tone)一起出現的口紅色調為粉色調、紅色調以及橘色調
  • 可以推測擁有 olive skin tone 的消費者可能較不喜歡黑色或棕色等暗色系的口紅,擦起來可能會顯得皮膚髒或氣色差

網絡分析

畫出評論者對品牌的網路圖,觀察會不會區分成不同的集群

tmp <- lipstick_all %>% 
  select(author_name, brand) %>% 
  #filter(author_name != 'Kindle Customer' & author_name != 'Amazon Customer') %>% 
  group_by(brand) %>% 
  mutate(brand_sum = n()) %>% 
  ungroup() %>% 
  arrange(desc(brand_sum))

# 只取前10大品牌
top_10 <- unique(tmp$brand)[1:10]

# 因為評論者太多 只取留言數夠多的評論者做呈現
link <- tmp %>% 
  filter(brand %in% top_10) %>% 
  group_by(author_name) %>% 
  mutate(author_sum = n()) %>% 
  ungroup() %>% 
  distinct() %>% 
  filter(author_sum > 30)


author <- select(link, author_name, author_sum)
brand <- select(link, brand, brand_sum)
names(author) <- NULL
names(brand) <- NULL

nodes <- rbind(data.frame(author),data.frame(brand)) %>% distinct()


reviewNetwork <- graph_from_data_frame(d=link, v=nodes,directed = F)
reviewNetwork
## IGRAPH 03e4d19 UN-- 28 165 -- 
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 03e4d19 (vertex names):
##  [1] Amazon Customer--COVERGIRL Lisa           --COVERGIRL
##  [3] Laura          --COVERGIRL Mary           --COVERGIRL
##  [5] Kindle Customer--COVERGIRL Ashley         --COVERGIRL
##  [7] Jessica        --COVERGIRL Linda          --COVERGIRL
##  [9] Michelle       --COVERGIRL Stephanie      --COVERGIRL
## [11] Sarah          --COVERGIRL Amy            --COVERGIRL
## [13] Melissa        --COVERGIRL Anna           --COVERGIRL
## [15] Nicole         --COVERGIRL Amanda         --COVERGIRL
## + ... omitted several edges
V(reviewNetwork)$size <- log(V(reviewNetwork)$X2)*3

# 黃色是品牌的node 藍色是author
plot(reviewNetwork,
     vertex.color=c("skyblue", "yellow")[1+(names(V(reviewNetwork)) %in% top_10)])

  • 從圖中可以看到前10大品牌都很受歡迎,評論者與品牌之間並沒有分出明顯的群集

以近十年平均品牌聲量最高的三個品牌分別繪製網路圖

1: Maybelline New York
linkM <- link %>% 
  filter(brand == "Maybelline New York")

authorM <- select(linkM, author_name, author_sum)
brandM <- select(linkM, brand, brand_sum)
names(authorM) <- NULL
names(brandM) <- NULL

nodesM <- rbind(data.frame(authorM),data.frame(brandM)) %>% distinct()


NetworkM <- graph_from_data_frame(d=linkM, v=nodesM,directed = F)
NetworkM
## IGRAPH 78a4532 UN-- 19 18 -- 
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 78a4532 (vertex names):
##  [1] Amazon Customer--Maybelline New York
##  [2] Ashley         --Maybelline New York
##  [3] Kindle Customer--Maybelline New York
##  [4] Amanda         --Maybelline New York
##  [5] Amy            --Maybelline New York
##  [6] Sarah          --Maybelline New York
##  [7] Laura          --Maybelline New York
##  [8] Linda          --Maybelline New York
## + ... omitted several edges
V(NetworkM)$size <- log(V(NetworkM)$X2)*3

plot(NetworkM,
     vertex.color=c("red","pink")[1+(names(V(NetworkM)) %in% top_10)])

2: L’Oreal Paris
linkL <- link %>% 
  filter(brand == "L'Oreal Paris")

authorL <- select(linkL, author_name, author_sum)
brandL <- select(linkL, brand, brand_sum)
names(authorL) <- NULL
names(brandL) <- NULL

nodesL <- rbind(data.frame(authorL),data.frame(brandL)) %>% distinct()


NetworkL <- graph_from_data_frame(d=linkL, v=nodesL,directed = F)
NetworkL
## IGRAPH 35fcddf UN-- 18 17 -- 
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 35fcddf (vertex names):
##  [1] Amazon Customer--L'Oreal Paris Kindle Customer--L'Oreal Paris
##  [3] Nicole         --L'Oreal Paris Melissa        --L'Oreal Paris
##  [5] Mary           --L'Oreal Paris Jessica        --L'Oreal Paris
##  [7] Anna           --L'Oreal Paris Lauren         --L'Oreal Paris
##  [9] Michelle       --L'Oreal Paris Sarah          --L'Oreal Paris
## [11] Amy            --L'Oreal Paris Lisa           --L'Oreal Paris
## [13] Linda          --L'Oreal Paris Amanda         --L'Oreal Paris
## [15] Stephanie      --L'Oreal Paris Jennifer       --L'Oreal Paris
## + ... omitted several edges
V(NetworkL)$size <- log(V(NetworkL)$X2)*3

plot(NetworkL,
     vertex.color=c("blue","gray")[1+(names(V(NetworkL)) %in% top_10)])

3: Revlon
linkR <- link %>% 
  filter(brand == "Revlon")

authorR <- select(linkR, author_name, author_sum)
brandR <- select(linkR, brand, brand_sum)
names(authorR) <- NULL
names(brandR) <- NULL

nodesR <- rbind(data.frame(authorR),data.frame(brandR)) %>% distinct()


NetworkR <- graph_from_data_frame(d=linkR, v=nodesR,directed = F)
NetworkR
## IGRAPH 1c3e4c5 UN-- 18 17 -- 
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 1c3e4c5 (vertex names):
##  [1] Amazon Customer--Revlon Kindle Customer--Revlon
##  [3] Jessica        --Revlon Amy            --Revlon
##  [5] Ashley         --Revlon Lisa           --Revlon
##  [7] Amanda         --Revlon Linda          --Revlon
##  [9] Melissa        --Revlon Mary           --Revlon
## [11] Nicole         --Revlon Michelle       --Revlon
## [13] Sarah          --Revlon Laura          --Revlon
## [15] Stephanie      --Revlon Lauren         --Revlon
## + ... omitted several edges
V(NetworkR)$size <- log(V(NetworkR)$X2)*4

plot(NetworkR,
     vertex.color=c("orange","yellow")[1+(names(V(NetworkR)) %in% top_10)])

  • 三張圖可以看到匿名評論者(author_name = Amazon Customer)在各品牌的網絡中佔了較大的影響力。三張圖的程式碼幾乎相同,下方附上一個包成函式的改寫示意。
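上面三個品牌的網路圖程式碼幾乎相同,只差在品牌名稱與節點顏色。以下是把流程包成函式的小示意(假設 link 已如上建立;plot_brand_network 為此處自行命名的函式):

# 示意:將單一品牌的網路圖流程包成函式,避免重複貼上相同程式碼
plot_brand_network <- function(link, brand_name, node_colors = c("red", "pink")) {
  link_b <- link %>% filter(brand == brand_name)

  author_b <- select(link_b, author_name, author_sum)
  brand_b  <- select(link_b, brand, brand_sum)
  names(author_b) <- NULL
  names(brand_b)  <- NULL

  nodes_b <- rbind(data.frame(author_b), data.frame(brand_b)) %>% distinct()

  net <- graph_from_data_frame(d = link_b, v = nodes_b, directed = FALSE)
  V(net)$size <- log(V(net)$X2) * 3

  # 品牌節點用第二個顏色,評論者節點用第一個顏色
  plot(net, vertex.color = node_colors[1 + (names(V(net)) == brand_name)])
  invisible(net)
}

# 用法示意
plot_brand_network(link, "Maybelline New York")
plot_brand_network(link, "L'Oreal Paris", node_colors = c("blue", "gray"))
plot_brand_network(link, "Revlon", node_colors = c("orange", "yellow"))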

了解各評論者對評分的網路圖,觀察會不會有評論者只評高分或只評低分

length(unique(lipstick$author_star))
## [1] 5
star_link <- lipstick_all %>% 
  select(author_name, author_star_sub) %>% 
  group_by(author_name) %>% 
  mutate(author_sum = n()) %>% 
  distinct() %>% 
  filter(author_sum > 40)


starNetwork <- graph_from_data_frame(d=star_link,directed = F)
starNetwork
## IGRAPH b01d720 UN-- 29 120 -- 
## + attr: name (v/c), author_sum (e/n)
## + edges from b01d720 (vertex names):
##  [1] Amazon Customer--5 Nicole         --5 Amazon Customer--4
##  [4] Kindle Customer--5 Amanda         --5 Jessica        --4
##  [7] Katie          --5 Amazon Customer--1 Amazon Customer--2
## [10] Amazon Customer--3 Kindle Customer--1 Brittany       --5
## [13] Nicole         --2 Mary           --5 Sarah          --1
## [16] Lisa           --5 Jessica        --1 Michelle       --2
## [19] Jennifer       --1 Lauren         --5 Amanda         --2
## [22] Brittany       --4 Stephanie      --5 Amy            --4
## + ... omitted several edges
plot(starNetwork)

  • 評分分布很平均,評論者不會特別偏向只給低分或只給高分。

買家對價格的網路圖,看看有沒有一群人都買高價或都買低價

price_tmp <- lipstick_all %>% 
  select(author_name, Price)

price_tmp$Price <- as.numeric(price_tmp$Price)
## Warning: NAs introduced by coercion
price_tmp <- na.omit(price_tmp)

summary(price_tmp$Price)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.92    5.85    7.95   11.98   19.66   79.95
plot(density(price_tmp$Price))

#處理links
price_link <- price_tmp %>% 
  group_by(author_name) %>% 
  mutate(author_sum = n()) %>% 
  ungroup() %>% 
  mutate(price_index = ifelse(Price>20,'high','low')) %>% 
  filter(author_sum > 50) %>% 
  select(author_name, price_index, author_sum) 

 

priceNetwork <- graph_from_data_frame(d=price_link)
priceNetwork
## IGRAPH 930eddd DN-- 15 6122 -- 
## + attr: name (v/c), author_sum (e/n)
## + edges from 930eddd (vertex names):
##  [1] Amazon Customer->low Amazon Customer->low Amazon Customer->low
##  [4] Kindle Customer->low Amazon Customer->low Amanda         ->low
##  [7] Amanda         ->low Amazon Customer->low Jessica        ->low
## [10] Amazon Customer->low Amazon Customer->low Jessica        ->low
## [13] Amazon Customer->low Amazon Customer->low Kindle Customer->low
## [16] Amazon Customer->low Amazon Customer->low Amazon Customer->low
## [19] Amazon Customer->low Amazon Customer->low Amazon Customer->low
## [22] Mary           ->low Sarah          ->low Lisa           ->low
## + ... omitted several edges
plot(priceNetwork, vertex.color=c("skyblue"))

  • 低價的商品購買數量比較多,但也看不出明顯分群,高低價錢大家都買單

使用 SVM 做分類

準備資料

#類別轉factor
new_united$brand <- as.factor(new_united$brand)
new_united$trigram <- as.factor(new_united$trigram)


#新增一欄位:是否會進銷售排行前十名
predict_data <- new_united %>% 
  mutate(inRanking = ifelse(Rank2 >= 10, "no", "yes"))
predict_data$inRanking <- as.factor(predict_data$inRanking)

#切分訓練資料
set.seed(1)
indx <- sample(1:nrow(predict_data), nrow(predict_data)*0.7)
test  <- predict_data[-indx,]
train <- predict_data[indx,]

分類預測

## 用品牌及 trigram 的結果預測是否會進入銷售排行前十名
library(e1071)
svmfit <- svm(inRanking ~ brand + trigram, data=train , kernel ="linear", cost=10,scale=FALSE)

ypred <- predict(svmfit, test)
x = table(predict = ypred, truth = test$inRanking)
sum(diag(x))/sum(x) 
## [1] 0.9740634
  • 以品牌及被討論的顏色(trigram)來做分類,準確率可達97%(可用下方的小示意檢查類別分布與混淆矩陣)
  • 評論當中提到的商品顏色,是判斷該商品能否進入銷售前十名的重要參考
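97% 的準確率有可能部分來自類別不平衡(大多數商品本來就不在前十名)。以下是一個簡單的檢查示意(沿用上面已建立的 predict_data、test、ypred 與混淆矩陣 x),可以和「全部猜 no」的基準準確率比較:

# 示意:檢查類別比例與混淆矩陣,確認準確率不是只靠多數類別撐起來
prop.table(table(predict_data$inRanking))   # yes / no 的比例;多數類別的比例即為基準準確率
x                                           # 混淆矩陣(predict x truth)

# 真正進入前十名(yes)的樣本中,被正確預測為 yes 的比例
sum(ypred == "yes" & test$inRanking == "yes") / sum(test$inRanking == "yes")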

結論

1.唇膏的評論難以使用主題模型分類。
2.從評論當中可以發現消費者對於唇膏的顏色,以及是否搭配自己的膚色較為在意。
3.唇膏的目標客群通常為女性,這也反映在網路圖上:評論者名稱多為女性姓名。
4.一開始使用單字斷詞進行分析時,所呈現出的效果不盡理想,最後嘗試透過bigram及trigram進行切割之後,所得出的字詞比較具有意義及可解釋性,最後我們使用trigram的結果來預測銷售績效也達到了不錯的效果。
5.品牌對於唇膏的銷售是有影響力的。
6.跟隨流行的顏色推出產品很重要。
7.即便品牌沒有以流行色為產品主軸,但仍然可以依其品牌特質提高討論度。