Final Project-Lipstick Review on Amazon

資料來源

動機與目的

欲了解在Amazon.com上留言的評論者對於唇膏的整體看法，並以不同的角度剖析評論，像是了解評論者的情緒起伏，或是評論當中所涵蓋的主題內容，評論字詞使用的相似程度，甚至是評論者與品牌、星等、價格之間的關聯性。最後透過模型預測什麼樣的評論有可能會進入銷售前十名

基本設定+載入package

setwd("~/Lab")
getwd()

## [1] "/home/m064020020/Lab"

packages = c(
  "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest","tidytext","wordcloud","topicmodels","doParallel")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Sys.setlocale("LC_ALL","C")

## [1] "LC_CTYPE=C;LC_NUMERIC=C;LC_TIME=C;LC_COLLATE=C;LC_MONETARY=C;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"

 #options(digits=5, scipen=10)

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(tm)

## Loading required package: NLP

library(SnowballC)
library(ROCR)

## Loading required package: gplots

## 
## Attaching package: 'gplots'

## The following object is masked from 'package:stats':
## 
##     lowess

library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(wordcloud)

## Loading required package: RColorBrewer

## 
## Attaching package: 'wordcloud'

## The following object is masked from 'package:gplots':
## 
##     textplot

library(stringr)
library(tidytext)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:randomForest':
## 
##     margin

## The following object is masked from 'package:NLP':
## 
##     annotate

library(tidyr)
library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

library(readr)
library(topicmodels)
library(lexicon)
library(koRpus)

## Loading required package: sylly

## For information on available language packages for 'koRpus', run
## 
##   available.koRpus.lang()
## 
## and see ?install.koRpus.lang()

## 
## Attaching package: 'koRpus'

## The following object is masked from 'package:readr':
## 
##     tokenize

library(quanteda)

## Package version: 1.4.3

## Parallel computing: 2 of 24 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following objects are masked from 'package:koRpus':
## 
##     tokens, types

## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords

## The following object is masked from 'package:utils':
## 
##     View

library(syuzhet)
library(corrplot)

## corrplot 0.84 loaded

library(slam)

## 
## Attaching package: 'slam'

## The following object is masked from 'package:data.table':
## 
##     rollup

library(doParallel)

## Loading required package: foreach

## Loading required package: iterators

## Loading required package: parallel

library(Rtsne)
library(lm.beta)
library(igraph)

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:quanteda':
## 
##     as.igraph

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

library(ggrepel)
library("wordcloud")
library("RColorBrewer")

讀資料

lipstick_detail=fread('./data/yuchia.lipstick_info_original.csv',na.strings=c("","NA"),stringsAsFactors = F)
lip_sub=fread('./data/yuchia.lipstick_info_suburl.csv',na.strings=c("","NA"),stringsAsFactors = F)


Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼

## [1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"

lipstick=fread('./data/yuchia.amazon_lipstick_review.csv',header = T,stringsAsFactors = F)

review資料前處理

##因為評論有些欄位有中文資料，需事先做前處理
##刪除空白$換行
lipstick$availability<- gsub("\\n", "", lipstick$availability)%>%
  gsub("^\\s+|\\s+$", "", .)%>%
  gsub("[.]", "",.)

#unique(lipstick$availability)
#unique(lipstick$star)


#找出沒有中文的欄位
lipstick_nochiness<-lipstick %>% filter(grepl('[a-zA-Z]',lipstick$availability))
lipstick_nochiness$star_sub<-substr(lipstick_nochiness$star,1,3)

##處理字串
lipstick_nochiness$availability<- gsub("^Only.*|^In.*|.*ship.*","In Stock",lipstick_nochiness$availability)%>%
  gsub("^Available.*","Unavailable",.)

#unique(lipstick_nochiness$availability)

##找出有中文的欄位
lipstick_chiness<-lipstick %>% filter(!grepl('[a-zA-Z]',lipstick$availability))
##處理星等
lipstick_chiness$star_sub<-substr(lipstick_chiness$star,7,10)
##處理字串
lipstick_chiness$availability<- gsub("庫存僅剩\\s[0-9].*", "In Stock", lipstick_chiness$availability) %>%
  gsub(".*[有]?現貨。", "In Stock",.)%>%
  gsub("來自\\s這些暢銷品\\s有貨。", "Unavailable",.) %>%
  gsub("暫時缺貨\\s,", "Unavailable",.)

##合併兩張dataframe
lipstick_review<-rbind(lipstick_nochiness,lipstick_chiness)


##Date:
Sys.setlocale("LC_TIME", "C")

## [1] "C"

lipstick_review$art_Date1=as.Date(lipstick_review$art_Date,format = "%B %d,%Y")
lipstick_review$Year=year(lipstick_review$art_Date1)
lipstick_review$Month=format(lipstick_review$art_Date1, "%m")

##Brand:
lipstick_review$brand=lipstick_review$brand %>% as.factor
brand=lipstick_review$brand %>% unique

##Star:
#lipstick_review$star_sub=lipstick_review$star1 %>% substr(1,3)
lipstick_review$author_star_sub=lipstick_review$author_star %>% substr(1,3)
lipstick_review$star_sub =lipstick_review$star_sub %>% as.numeric()
lipstick_review$author_star_sub =lipstick_review$author_star_sub %>% as.numeric()

lipstick_review<-distinct(lipstick_review,art_Content, .keep_all= TRUE)#刪除重複


## 唇膏商品資料前處理

##刪除多餘的欄位
lipstick_detail <- lipstick_detail[,-c(22:45)]
## 與suburl合併
lipstick_detail<-rbind(lipstick_detail,lip_sub,fill=T)
## 刪除重複
lipstick_detail<-distinct(lipstick_detail,URL, .keep_all= TRUE)#刪除重複

review,information資料合併

#唇膏整合排名by ASIN
lipstick_review$a<-lapply(lipstick_review$art_Url,function(x){strsplit(x, "/")[[1]][6]})%>%as.character()
lip_combine<-lipstick_detail %>%
  filter(Category1 %in% c('Beauty & Personal Care','Beauty & Peronal Care'))%>%
  select(Rank1,Rank2,Category1,ASIN,Customer_Reviews)
lipstick_all<-lipstick_review%>%left_join(lip_combine,by=c("a"="ASIN"))


lipstick_all$Rank1<-lipstick_all$Rank1%>% str_replace_all(",","") %>% as.numeric
lipstick_all$Rank2<-lipstick_all$Rank2%>% str_replace_all(",","") %>% as.numeric

##只篩選有存貨&有Rank1的資料
lipstick_all<-lipstick_all %>%
  filter(availability=="In Stock")%>%
  filter(is.na(Rank2)==FALSE)


head(lipstick_all)

##                        _id
## 1 5c892605923ce022a2d09ff5
## 2 5c892605923ce022a2d09ff7
## 3 5c892605923ce022a2d09ff9
## 4 5c892605923ce022a2d09ffb
## 5 5c892605923ce022a2d09ffd
## 6 5c892605923ce022a2d09fff
##                                                                                                                                                                                                                                                                                                                                                                                                                             art_Content
## 1                                                                                                                                                                                                                                                                    very sticky. The colors were a little off. Caution to others: it drys very fast and very hard to take off. Id recommend to use coconut oil to remove the lipstick.
## 2         I really love the dark red lip color in this set. Yes some of the lighter colors in the set were ""sticky"" and settled in the lines of my lips however I solved that quickly by mosturizing my lips before applying the lip color and applying a setting powder.  I would reccommend based on the colors as it is difficult to find a good set where all the colors compliment your skin tone like this one does to my skin.
## 3                                                                                                                                                                                                                             Product definitely dries quick, but the color pallet is NOT a truly representaed by the photos. The color pallet I received is very brown/nude earth tones-not pink as it displays. Definitely returning.
## 4 The Color is beautiful but it's very strange texture! it makes your lips stick together! i put it on and was like ahh, this is great, and about 60 seconds later i'm saying........ what the heck is going on... I am going to try putting a coat of clear chapstick over it I guess? Too inexpensive to return, so i'm going to try and work with it, but very strange texture once it dries. you can't even rub your lips together.
## 5                                                                                                                                                                                                                                                                                                Really pretty colors for the price! Dried quickly from the swatches and smell like chocolate \U0001f60a I cant wait to try on my lips.
## 6                                                                                                                                                                                                                                                                                                                                          Super Sticky! Starts off ok, nice colors but like the other probably better with clear gloss
##            art_Date
## 1 February 18, 2018
## 2  November 4, 2018
## 3 February 14, 2018
## 4  January 11, 2018
## 5 September 9, 2018
## 6  January 18, 2018
##                                                                                                                                                      art_Url
## 1 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 2 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 3 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 4 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 5 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 6 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
##       author_name        author_star availability  brand
## 1           Joana 1.0 out of 5 stars     In Stock SHERUI
## 2          Ashley 5.0 out of 5 stars     In Stock SHERUI
## 3 frances padilla 2.0 out of 5 stars     In Stock SHERUI
## 4       Katharine 1.0 out of 5 stars     In Stock SHERUI
## 5      Evelina L. 5.0 out of 5 stars     In Stock SHERUI
## 6      Lola Bunny 1.0 out of 5 stars     In Stock SHERUI
##                                                     comment_title error
## 1        Id recommend to use coconut oil to remove the lipstick\n     0
## 2                               Pocket Sized, Variety of colors\n     0
## 3                          Not pink color palett!!! VERY BROWN!\n     0
## 4          The Color is beautiful but it's very strange texture\n     0
## 5                                  Cute mini matte lipstick set\n     0
## 6 STICKY!!! (Almost glue your mouth shut type sticky)\U0001f628\n     0
##                 star
## 1 3.2 out of 5 stars
## 2 3.2 out of 5 stars
## 3 3.2 out of 5 stars
## 4 3.2 out of 5 stars
## 5 3.2 out of 5 stars
## 6 3.2 out of 5 stars
##                                                                           title
## 1 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 2 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 3 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 4 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 5 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 6 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
##   star_sub  art_Date1 Year Month author_star_sub          a Rank1 Rank2
## 1      3.2 2018-02-18 2018    02               1 B0762M2ZWJ  1189     1
## 2      3.2 2018-11-04 2018    11               5 B0762M2ZWJ  1189     1
## 3      3.2 2018-02-14 2018    02               2 B0762M2ZWJ  1189     1
## 4      3.2 2018-01-11 2018    01               1 B0762M2ZWJ  1189     1
## 5      3.2 2018-09-09 2018    09               5 B0762M2ZWJ  1189     1
## 6      3.2 2018-01-18 2018    01               1 B0762M2ZWJ  1189     1
##                Category1 Customer_Reviews
## 1 Beauty & Personal Care              264
## 2 Beauty & Personal Care              264
## 3 Beauty & Personal Care              264
## 4 Beauty & Personal Care              264
## 5 Beauty & Personal Care              264
## 6 Beauty & Personal Care              264

資料前處理將唇膏商品資訊的銷售績效欄位併在評論資料表
最後統整出的資料集描述：評論者名稱、評論內容、評論日期、url、評論者給產品的星等(1-5星)、品牌、存貨狀態、評論標題、產品標題、產品星等(1-5星)、銷售排名、留言數

唇膏排名統計資料

summary(lipstick_all$Rank2)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       1     157     457    1979    2328   56629

品牌討論度

#by年分
#lipstick_review$brand %>% unique


brand_discuss_df=lipstick_review %>% 
  group_by(.,brand,Year) %>%
  summarise(
    discuss_n=n()
  )%>% filter(Year>2010)


brand_discuss_df %>% 
  group_by(.,brand) %>%
  summarise(
    discuss_sum=sum(discuss_n) 
  ) ->top10_df


#top10品牌討論度by年分
brand_discuss_df$brand %in% 
  top10_df [order(top10_df$discuss_sum,decreasing = T),]$brand[1:10] %>%
  
  subset(brand_discuss_df,.)->discuss_plot_df

discuss_plot_df<- subset(discuss_plot_df,Year!="NA")

品牌被討論數隨時間分布圖by年分

#by年分
ggplot(discuss_plot_df, aes(x = Year, y =discuss_n,color=brand)) + 
  geom_line() + geom_point(  fill = "white")+scale_x_continuous(breaks = c(2011:2019))+scale_y_continuous(breaks=c(0,1000,2000,3000,4000,5000,6000,7000,8000,9000))

評論數量在近幾年逐漸有上升的趨勢，尤其以Maybelline在2018年評論數量暴增

斷字

############文字清理方法二###############
lipstick_all$art_Content <- gsub("'", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[[:punct:]]", " ", lipstick_all$art_Content) 
lipstick_all$art_Content <- gsub("[[:cntrl:]]", " ",lipstick_all$art_Content) 
lipstick_all$art_Content <- gsub("^[[:space:]]+", "", lipstick_all$art_Content) 
lipstick_all$art_Content <- gsub("[[:space:]]+$", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[0-9]+", " ", lipstick_all$art_Content)

lipstick_all$art_Content<-iconv(lipstick_all$art_Content,"WINDOWS-1252","UTF-8")
lipstick_all$art_Content <- tolower(lipstick_all$art_Content)


#重新編主鍵
lipstick_all<-lipstick_all %>%
  mutate(id2 =rownames(lipstick_all))

conbine_tidy<-lipstick_all%>%
  mutate(linenumber =row_number())%>%
  unnest_tokens(sentence,art_Content)

情緒分析(使用NRC字典)

NRC_VAD<-read.table("./data/NRC_VAD/NRC_VAD_Lexicon.txt",header = T,sep="\t",stringsAsFactors = F)
NRC<-get_sentiments("nrc")%>%as.data.frame()
NRC_all<-NRC_VAD%>%left_join(NRC,by=c("Word"="word"))

join2<- conbine_tidy %>%
  inner_join(NRC_VAD,by=(c("sentence"="Word")))%>%
  group_by(linenumber,id2)%>%
  summarise(n=n(),
            valence_avg=sum(Valence)/n,
            arousal_avg=sum(Arousal)/n,
            dominance_avg=sum(Dominance)/n)

#合併join2
lipstick_all<-lipstick_all%>%
  left_join(join2,by=c("id2"))

字典使用 [NRC VAD Lexicon] (http://saifmohammad.com/WebPages/nrc-vad.html)
NRC VAD Lexicon包含了20,007個字詞，並計算其效價 (Valence) 、激發 (Arousal) 和支配 (Dominance) 三種維度之分數。Valence 代表「正面」—「負面」或是「愉悅」—「不滿」的維度；Arousal代表「興奮」—「平靜」或「主動」—「被動」的維度；Dominance是「強者」—「弱者」或「完全掌控」—「無法控制」的維度。分數範圍從0至1

把情緒值為NA的移除

# 移除author_name,brand,star_sub,author_star_sub,Rank2,情緒欄位為NA的資料
lipstick_all_no_NA = lipstick_all[complete.cases(lipstick_all[ , c("author_name","brand","star_sub","author_star_sub","Rank2")]), ]

正負面情緒，最高與最低各10個brand

sentiment_valence = rbind(
# valence_avg 最高的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(valence_all_avg = mean(valence_avg)) %>%
    arrange(desc(valence_all_avg)) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "high"),
# valence_avg 最低的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(valence_all_avg = mean(valence_avg)) %>%
    arrange(valence_all_avg) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "low")
  )

#畫圖
sentiment_valence %>% 
  group_by(score) %>%
  arrange(desc(valence_all_avg)) %>%
  ungroup() %>%
  ggplot(aes(reorder(brand, valence_all_avg), valence_all_avg, fill = score)) +
  geom_col() +
  xlab("Brand") +
  ylab("Valence AVG") +
  coord_flip()

內縮/外放情緒，最高與最低各10個brand

sentiment_arousal = rbind(
# arousal_avg 最高的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(arousal_all_avg = mean(arousal_avg)) %>%
    arrange(desc(arousal_all_avg)) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "high"),
# arousal_avg 最低的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(arousal_all_avg = mean(arousal_avg)) %>%
    arrange(arousal_all_avg) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "low")
  )

# 畫圖
sentiment_arousal %>% 
  group_by(score) %>%
  arrange(desc(arousal_all_avg)) %>%
  ungroup() %>%
  ggplot(aes(reorder(brand, arousal_all_avg), arousal_all_avg, fill = score)) +
  geom_col() +
  xlab("Brand") +
  ylab("Arousal AVG") +
  coord_flip()

支配程度情緒，最高與最低各10個brand

sentiment_dominance = rbind(
# dominance_avg 最高的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(dominance_all_avg = mean(dominance_avg)) %>%
    arrange(desc(dominance_all_avg)) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "high"),
# dominance_avg 最低的十個品牌
  lipstick_all_no_NA %>%
    group_by(brand) %>%
    summarise(dominance_all_avg = mean(dominance_avg)) %>%
    arrange(dominance_all_avg) %>%
    head(10) %>%
    ungroup() %>%
    mutate(score = "low")
  )

#畫圖
sentiment_dominance %>% 
  group_by(score) %>%
  arrange(desc(dominance_all_avg)) %>%
  ungroup() %>%
  ggplot(aes(reorder(brand, dominance_all_avg), dominance_all_avg, fill = score)) +
  geom_col() +
  xlab("Brand") +
  ylab("Dominance AVG") +
  coord_flip()

JOIN “Valence, Arousal維度” 的情緒

# 把valence與arousal維度最高10個與最低10個都有出現的品牌join一起
sentiment_join = sentiment_valence %>%
  inner_join(sentiment_arousal, by = c(brand = "brand"))

sentiment_join$score.x = as.factor(sentiment_join$score.x)
sentiment_join$score.y = as.factor(sentiment_join$score.y)

sentiment_join %>%
  ggplot(aes(score.x, score.y, label = brand)) +
  geom_jitter(position = position_jitter(seed = 87)) +
  xlab("Valence") +
  ylab("Arousal") +
  geom_vline(xintercept = 1.5) +    # 在x軸中間加入線條以利區別點點位置
  geom_hline(yintercept = 1.5) +    # 在y軸中間加入線條以利區別點點位置
  scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) +    # 將x軸的刻度對調
  scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) +    # 將y軸的刻度對調
  geom_label_repel(position = position_jitter(seed = 87), aes(label = brand), box.padding = 0.7)

# 把每個點點標上brand名稱

這張圖可以看出，越靠右邊的品牌越帶給消費者正面情緒，反之，越左邊越負面；越靠上方的品牌代表消費者越外放的情緒，反之，越下方則越內縮。
1. 以Oriflame、ONE1X、K牌脣膏為例，消費者感受到正面的情緒，並且在評論中的敘述是比較想把好心得分享給大家
2. NOTE Cosmetics、Nykaa、A牌脣膏則是讓消費者產生負面情緒，而且消費者的評論也較積極表現出不滿的心情
3. Kryolan雖然讓消費者有負面情緒，但消費者的評論卻是用比較婉轉的方式說產品的缺點

查看一下上述分析中的評論內容

lipstick_all_no_NA[lipstick_all_no_NA$brand == "Oriflame" | lipstick_all_no_NA$brand == "ONE1X", 2]

## [1] "so cute got them for a bachelorette gift bags the ladies all loved them actually fought over the colors lol"
## [2] "its superb  my new lip love"

Oriflame、ONE1X的評論表現出很喜歡該品牌，有愛不釋手的感覺

lipstick_all_no_NA[lipstick_all_no_NA$brand == "NOTE Cosmetics", 2]

## [1] "i would never ever buy again    so disappointed   very bad experience"                      
## [2] "didn t keep it because it was totally broken off when i opened package   wouldn t buy agsin"

NOTE Cosmetics的評論很明顯唾棄這種品牌的脣膏，而且還非常不推薦給其他人

lipstick_all_no_NA[lipstick_all_no_NA$brand == "Kryolan", 2]

## [1] "it has quite a lot of wax smell in it"

Kryolan的評論沒有直接表現出對品牌的喜好，但委婉地說出產品的缺點

加入星星數量

star_amount = lipstick_all_no_NA %>%
  group_by(brand) %>%
  summarise(star = mean(star_sub)) %>%
  ungroup()

sentiment_join_star = sentiment_join %>%
  inner_join(star_amount, by = c(brand = "brand"))

sentiment_join_star %>%
  ggplot(aes(score.x, score.y, label = brand)) +
  geom_jitter(position = position_jitter(seed = 87), aes(size = star)) +   # 星星數越高，點點越大
  xlab("Valence") +
  ylab("Arousal") +
  geom_vline(xintercept = 1.5) +    # 加入線條以利區別點點位置
  geom_hline(yintercept = 1.5) +    # 加入線條以利區別點點位置
  scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) +    # 將x軸排序對調
  scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) +    # 將y軸排序對調
  geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7)  # 把每個點點標上brand名稱

該品牌平均所得的星星越多，點點的size會越大
這幾個品牌的脣膏所拿到的星星數(size大小)，與其帶給消費者的情緒的關係符合常理，消費者感受越好，給的星星自然會比較多(size較大)。

加入lipstick商品排名(Rank2)

# 將同品牌lipstick的rank平均
rank_avg = lipstick_all_no_NA %>%
  group_by(brand,Rank2) %>%
  summarise(rank = mean(Rank2)) %>%
  group_by(brand) %>%
  summarise(rank = mean(rank)) %>%
  ungroup()

sentiment_join_rank = sentiment_join %>%
  inner_join(rank_avg, by = c(brand = "brand"))

sentiment_join_rank %>%
  ggplot(aes(score.x, score.y, label = brand)) +
  geom_jitter(position = position_jitter(seed = 87), aes(size = -rank)) +   # rank越前面，點點越大
  xlab("Valence") +
  ylab("Arousal") +
  geom_vline(xintercept = 1.5) +    # 加入線條以利區別點點位置
  geom_hline(yintercept = 1.5) +    # 加入線條以利區別點點位置
  scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) +    # 將x軸排序對調
  scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) +    # 將y軸排序對調
  geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7)  # 把每個點點標上brand名稱

該品牌脣膏的銷售排名越好，點點的size會越大
Oriflame牌的銷售排名並不好，可能是價錢較貴導致比較少人購買
Addictive Cosmetics雖讓消費者印象比較不好，但是可能因為價格便宜，消費者比較買得下手
脣膏的銷售量排行與消費者的正負面情緒沒有太直接關係(把篩選的個數增加也一樣>_<)

討論正負情緒、star與rank的關係

# 將正負面分數最高與最低各10個牌子，連結該牌的平均銷售排行與平均獲得星星數
valence_rank_star = sentiment_valence %>%
  inner_join(rank_avg, by = c(brand = "brand")) %>%
  inner_join(star_amount, by = c(brand = "brand"))

valence_rank_star

## # A tibble: 20 x 5
##    brand                                valence_all_avg score   rank  star
##    <fct>                                          <dbl> <chr>  <dbl> <dbl>
##  1 BeautiControl                                  0.918 high  24605   5   
##  2 MILK MAKEUP                                    0.872 high  32875   5   
##  3 ONE1X                                          0.867 high   2126   5   
##  4 Oriflame                                       0.860 high  44499   5   
##  5 LCTCKP                                         0.817 high  44649   5   
##  6 KAT VON D Everlasting Glimmer Veil …           0.806 high  15197   4   
##  7 BORNTREE                                       0.796 high  28540   4   
##  8 FLOWER                                         0.793 high  21762   3.3 
##  9 Tayongpo                                       0.790 high  10539   4.5 
## 10 Stephanie Imports                              0.783 high   6951   5   
## 11 VOGUE COLOMBIA                                 0.25  low   56629   2   
## 12 Nykaa                                          0.291 low   38799   1   
## 13 A'some                                         0.438 low   27233   2   
## 14 YSL                                            0.440 low    4965   1   
## 15 Kryolan                                        0.492 low   54303   1   
## 16 Smoke & Mirrors                                0.506 low   46511   1   
## 17 SUNSENT                                        0.506 low    7506   2.7 
## 18 Addictive Cosmetics                            0.514 low   16925   2.4 
## 19 MILEMEI                                        0.515 low   23968.  1.33
## 20 NOTE Cosmetics                                 0.522 low   39886   1

畫出正負情緒、star與rank的關係圖

valence_rank_star %>%
  ggplot(aes(star, valence_all_avg, label = brand)) +
  geom_point(aes(size = -rank, color = -rank)) +       # 銷售排行越好，點點越大，顏色越深
  #geom_text_repel(box.padding = 0.7, aes(size = 2)) +
  geom_label_repel( box.padding = 0.7, size = 3) +
  ylab("Sentiment Valence") +
  xlab("Star Amount")

X軸:獲得星星數(越往右星星越多)，Y軸:消費者的情緒分數(越往上消費者情緒越正向)
此圖可看出的關係與前面所做的蠻符合，脣膏的銷售量與消費者的情緒沒有太直接的關係，所以各處的點點有大有小。
但是該品牌所獲得的星星數量，可以明確表示消費者用過的體驗，故圖中的點點比較集中於右上方(體驗好，星星多)和左下方(體驗差，星星少)

主題模型

conbine_document<-conbine_tidy %>%
  anti_join(stop_words,by=c("sentence"="word")) %>%
  count(linenumber, sentence, sort = TRUE) %>%
  ungroup()

#轉成dtm
top10_dtm <- conbine_document %>%
  cast_dtm(linenumber, sentence, n)

top10_dtm

## <<DocumentTermMatrix (documents: 63345, terms: 19961)>>
## Non-/sparse entries: 686353/1263743192
## Sparsity           : 100%
## Maximal term length: NA
## Weighting          : term frequency (tf)

lda_top10 <- LDA(top10_dtm, k = 2, control = list(seed = 1234))
lda_top10

## A LDA_VEM topic model with 2 topics.

#主題跟字的關係
document_topics <- tidy(lda_top10, matrix = "beta")
document_topics

## # A tibble: 39,922 x 3
##    topic term         beta
##    <int> <chr>       <dbl>
##  1     1 red      0.00740 
##  2     2 red      0.00759 
##  3     1 organic  0.000110
##  4     2 organic  0.000390
##  5     1 bees     0.000820
##  6     2 bees     0.000329
##  7     1 burts    0.000431
##  8     2 burts    0.000652
##  9     1 lipstick 0.0271  
## 10     2 lipstick 0.0354  
## # ... with 39,912 more rows

top_terms <- document_topics %>%
  filter(!term  %in% c("lipstick","color","love","lips","product","doe snt","lip","lipsticks"))%>%
  group_by(topic) %>%
  top_n(20, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms

## # A tibble: 40 x 3
##    topic term       beta
##    <int> <chr>     <dbl>
##  1     1 doesnt  0.0124 
##  2     1 perfect 0.0108 
##  3     1 wear    0.0108 
##  4     1 pink    0.0106 
##  5     1 apply   0.0101 
##  6     1 matte   0.0100 
##  7     1 day     0.00910
##  8     1 price   0.00872
##  9     1 red     0.00740
## 10     1 time    0.00698
## # ... with 30 more rows

#畫圖呈現主題跟字的關係

 top_terms %>%
   mutate(term = reorder(term, beta)) %>%
   ggplot(aes(term, beta, fill = factor(topic))) +
   geom_col(show.legend = FALSE) +
   facet_wrap(~ topic, scales = "free") +
   coord_flip()

使用主題模型分出來的結果不慎理想

算出beta的差異值

beta_spread <- document_topics %>%
  mutate(topic = paste0("topic", topic)) %>%
  spread(topic, beta) %>%
  filter(topic1 > .001 | topic2 > .001) %>%
  mutate(log_ratio = log2(topic2 / topic1))

beta_spread

## # A tibble: 216 x 4
##    term          topic1   topic2 log_ratio
##    <chr>          <dbl>    <dbl>     <dbl>
##  1 absolutely  0.00247  0.00142    -0.802 
##  2 add         0.00121  0.000465   -1.38  
##  3 amazing     0.00103  0.00464     2.17  
##  4 amazon      0.00265  0.00166    -0.672 
##  5 application 0.00322  0.000759   -2.09  
##  6 applicator  0.000225 0.00106     2.24  
##  7 applied     0.00232  0.00204    -0.186 
##  8 apply       0.0101   0.00145    -2.80  
##  9 applying    0.00168  0.00170     0.0209
## 10 arrived     0.000699 0.00205     1.55  
## # ... with 206 more rows

beta_spread%>%
  arrange(desc(abs(log_ratio)))  %>%
  head(20)%>%
  mutate(term = reorder(term, log_ratio)) %>%
  ggplot(aes(term, log_ratio, fill = log_ratio>0)) +
  geom_col(show.legend = FALSE) +
  xlab("term") +
  ylab("log(topic2 / topic1)") +
  coord_flip()

找出beta的最大差異值取log之後可以很明顯看出topic 1偏向顏色；topic2偏向成份(texture質地,shimmer珠光,moisture水分)

使用tSNE做尺度縮減看字跟字的相似性

library(doParallel)
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()

## [1] 24

## 字頻表
dtm = lipstick_all$art_Content %>% 
  iconv(to = "utf-8", sub="") %>% 
  str_trim(.) %>%
  toupper() %>%
  VectorSource %>% Corpus %>% 
  tm_map(content_transformer(tolower)) %>% 
  tm_map(removePunctuation) %>% 
  tm_map(stemDocument) %>% 
  DocumentTermMatrix %>% 
  removeSparseTerms(0.995)
dtm  # (documents: 14156, terms: 1030)

## <<DocumentTermMatrix (documents: 63980, terms: 560)>>
## Non-/sparse entries: 1124526/34704274
## Sparsity           : 97%
## Maximal term length: 10
## Weighting          : term frequency (tf)

dtm_tmp=dtm

過濾tfidf太小的字

library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
  log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0758  0.2012  0.2444  0.2652  0.3012  1.0575

dtm=dtm_tmp
dtm = dtm[, tfidf > 0.2012 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)

## [1] 63980   420

尺度縮減

#install.packages('Rtsne')
library(Rtsne)

# n = 300
# tsne = dtm[, 1:n] %>% as.data.frame.matrix %>% 
#   sapply(.,function(v)ifelse(v>1,1,v))%>%
#   scale %>% t %>% 
#   Rtsne(check_dup=F, theta=0.0, max_iter=3200)

階層式分群

# Y = tsne$Y              # tSNE coordinates
# d = dist(Y)             # distance matrix
# hc = hclust(d)          # hi-clustering
# K = 30                # number of clusters 
# g = cutree(hc,K)        # cut into K clusters
# table(g) %>% as.vector %>% sort         # sizes of clusters

畫出文字雲

# # install.packages('randomcoloR')
# library(randomcoloR)
# library(wordcloud)
# 
# wc = col_sums(dtm[,1:n])
# colors = distinctColorPalette(K)
# 
# 
# #png("./Amazon_lipstick.png", width=3200, height=1800)
# textplot(
#   Y[,1], Y[,2], colnames(dtm)[1:n], show=F, 
#   col=colors[g],
#   cex= 0.3 + 1.25 * sqrt(wc/mean(wc)),
#   font=2)
# dev.off()

文字雲

分群的結果可以發現一些相似的字詞被區分在一群，例如:顏色,成份,服務

畫出評論者對品牌的網路圖看看會不會區分成不同的集群

nrow(lipstick) # 92524 num of data

## [1] 92524

length(unique(lipstick$author_name)) # 57672 comment author

## [1] 57672

length(unique(lipstick$brand)) #437 different  brand

## [1] 437

## 看各品牌的討論數
top10_df %>% arrange(desc(discuss_sum))

## # A tibble: 433 x 2
##    brand                   discuss_sum
##    <fct>                         <int>
##  1 Maybelline New York           10658
##  2 NYX PROFESSIONAL MAKEUP        6513
##  3 Revlon                         6239
##  4 Lime Crime                     6036
##  5 L'Oreal Paris                  5988
##  6 COVERGIRL                      4347
##  7 LipSense                       3059
##  8 Rimmel                         2050
##  9 Burt's Bees                    1964
## 10 Nabi                           1716
## # ... with 423 more rows

tmp <- lipstick %>% 
  select(author_name, brand) %>% 
  filter(author_name != 'Kindle Customer' & author_name != 'Amazon Customer') %>% 
  group_by(brand) %>% 
  mutate(brand_sum = n()) %>% 
  ungroup() %>% 
  arrange(desc(brand_sum))

# 只取前10大品牌
top_10 <- unique(tmp$brand)[1:10]

# 因為評論者太多 只取留言數夠多的評論者做呈現
link <- tmp %>% 
  filter(brand %in% top_10) %>% 
  group_by(author_name) %>% 
  mutate(author_sum = n()) %>% 
  ungroup() %>% 
  distinct() %>% 
  filter(author_sum > 30)


author <- select(link, author_name, author_sum)
brand <- select(link, brand, brand_sum)
names(author) <- NULL
names(brand) <- NULL

nodes <- rbind(data.frame(author),data.frame(brand)) %>% distinct()


reviewNetwork <- graph_from_data_frame(d=link, v=nodes,directed = F)
reviewNetwork

## IGRAPH 5ffab62 UN-- 35 228 -- 
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 5ffab62 (vertex names):
##  [1] Michelle --Maybelline New York Heather  --Maybelline New York
##  [3] Melissa  --Maybelline New York Mary     --Maybelline New York
##  [5] Emily    --Maybelline New York Laura    --Maybelline New York
##  [7] Jennifer --Maybelline New York Amy      --Maybelline New York
##  [9] Karen    --Maybelline New York Sarah    --Maybelline New York
## [11] Anna     --Maybelline New York Katie    --Maybelline New York
## [13] Elizabeth--Maybelline New York Nicole   --Maybelline New York
## [15] Amanda   --Maybelline New York Samantha --Maybelline New York
## + ... omitted several edges

V(reviewNetwork)$size <- log(V(reviewNetwork)$X2)*3

# 紅色是品牌的node 藍色是author
plot(reviewNetwork,
     vertex.color=c("skyblue", "red")[1+(names(V(reviewNetwork)) %in% top_10)])

看圖可以看到前10大品牌都是很受大家歡迎的，並沒有分出明顯的群集
從留言者姓名也可以發現評論者幾乎都是以女性居多

看看各個評論者對評分的網路圖會不會有評論者只評高分或只評低分

length(unique(lipstick$author_star))

## [1] 5

star_link <- lipstick %>% 
  select(author_name, author_star) %>% 
  group_by(author_name) %>% 
  mutate(author_sum = n()) %>% 
  distinct() %>% 
  filter(author_sum > 40)


starNetwork <- graph_from_data_frame(d=star_link,directed = F)
starNetwork

## IGRAPH 0ad1d82 UN-- 56 251 -- 
## + attr: name (v/c), author_sum (e/n)
## + edges from 0ad1d82 (vertex names):
##  [1] Ashley         --5.0 out of 5 stars
##  [2] Amazon Customer--4.0 out of 5 stars
##  [3] Angela         --5.0 out of 5 stars
##  [4] Amazon Customer--1.0 out of 5 stars
##  [5] Sara           --1.0 out of 5 stars
##  [6] Amazon Customer--3.0 out of 5 stars
##  [7] Amazon Customer--5.0 out of 5 stars
##  [8] Sarah          --3.0 out of 5 stars
## + ... omitted several edges

plot(starNetwork)

從圖可以得知評論者都很公平，高分低分都有給，不會刻意只給高分或低分

買家對價格的網路圖看看有沒有一群人都買高價或都買低價

colnames(lipstick_all)[which(names(lipstick_all) == "a")] <- "ASIN"


price_tmp <- lipstick_all %>% 
  left_join(lipstick_detail, by = 'ASIN') %>% 
  select(author_name, Price)

price_tmp$Price <- as.numeric(price_tmp$Price)

## Warning: NAs introduced by coercion

price_tmp <- na.omit(price_tmp)

summary(price_tmp$Price)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.75    5.85    8.83   12.99   19.99  118.99

plot(density(price_tmp$Price))

price_link <- price_tmp %>% 
  filter(author_name != 'Kindle Customer' & author_name != 'Amazon Customer') %>% 
  group_by(author_name) %>% 
  mutate(author_sum = n()) %>% 
  ungroup() %>% 
  mutate(price_index = ifelse(Price>20,'high','low')) %>% 
  filter(author_sum > 50) %>% 
  select(author_name, price_index, author_sum) 
  


priceNetwork <- graph_from_data_frame(d=price_link,directed = F)
priceNetwork

## IGRAPH c5141b7 UN-- 17 970 -- 
## + attr: name (v/c), author_sum (e/n)
## + edges from c5141b7 (vertex names):
##  [1] Ashley   --low Sarah    --low Nicole   --low Nicole   --low
##  [5] Amanda   --low Amanda   --low Jessica  --low Katie    --low
##  [9] Jessica  --low Katie    --low Nicole   --low Mary     --low
## [13] Sarah    --low Lisa     --low Jessica  --low Michelle --low
## [17] Jennifer --low Lauren   --low Amanda   --low Stephanie--low
## [21] Stephanie--low Michelle --low Elizabeth--low Amanda   --low
## [25] Jennifer --low Amanda   --low Sarah    --low Stephanie--low
## [29] Mary     --low Katie    --low Michelle --low Laura    --low
## + ... omitted several edges

plot(priceNetwork)

低價的商品購買數量比較多，但也看不出明顯分群，高低價錢大家都買單

使用bigram斷詞

#以bi-gram做斷詞
lipstick_bigrams <- lipstick_all%>%
  mutate(linenumber =row_number())%>%  
  unnest_tokens(bigram,art_Content, token = "ngrams", n = 2)

看各bigram的次數

lipstick_bigrams %>%
  count(bigram, sort = TRUE)

## # A tibble: 286,727 x 2
##    bigram           nn
##    <chr>         <int>
##  1 the color     14629
##  2 i love         7932
##  3 it is          7805
##  4 my lips        7557
##  5 i have         6282
##  6 this is        6262
##  7 color is       6039
##  8 is a           5963
##  9 this lipstick  5711
## 10 love this      5217
## # ... with 286,717 more rows

將bigram拆成word1, word2

bigrams_separated <- lipstick_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>% 
  count(word1, word2, sort = TRUE)

bigram_counts

## # A tibble: 77,589 x 3
##    word1     word2       nn
##    <chr>     <chr>    <int>
##  1 lip       color     2089
##  2 skin      tone      1297
##  3 beautiful color     1122
##  4 lip       gloss     1085
##  5 love      love      1039
##  6 lip       balm      1005
##  7 nice      color      924
##  8 staying   power      828
##  9 matte     lipstick   818
## 10 <NA>      <NA>       812
## # ... with 77,579 more rows

trigram

lipstick_all %>%
  unnest_tokens(trigram, art_Content, token = "ngrams", n = 3) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!word1 %in% stop_words$word,
         !word2 %in% stop_words$word,
         !word3 %in% stop_words$word) %>%
  count(word1, word2, word3, sort = TRUE)

## # A tibble: 49,049 x 4
##    word1   word2 word3         nn
##    <chr>   <chr> <chr>      <int>
##  1 <NA>    <NA>  <NA>        2220
##  2 love    love  love         475
##  3 natural lip   color        237
##  4 matte   lip   cream        121
##  5 matte   lip   color         87
##  6 soft    matte lip           84
##  7 lips    feel  dry           80
##  8 love    lime  crime         76
##  9 ðÿ      ðÿ    ðÿ            72
## 10 lime    crime velvetines    72
## # ... with 49,039 more rows

#將trigram的結果拆成三個字並另存成一dataframe
lipstick_trigrams <- lipstick_all%>%
  mutate(linenumber =row_number())%>%  
  unnest_tokens(trigram,art_Content, token = "ngrams", n = 3)


trigrams_separated <- lipstick_trigrams %>%
  separate(trigram, c("word1", "word2","word3"), sep = " ")

trigrams_filtered <- trigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word) %>% 
  filter(!word3 %in% stop_words$word)

找出評論中提到自己所使用的口紅顏色

#找出評論中提到的商品顏色
bigram_color <- filter(bigrams_filtered,word2%in% lipstick_detail$color)
trigram_color <- filter(trigrams_filtered,word3%in% lipstick_detail$color)

將bigarm的兩個字連接後的wordcloud

bigrams_united <- bigram_color %>%
  unite(bigram, word1, word2, sep = "_")


set.seed(1234)
wordcloud(words = bigrams_united$bigram,
          max.words=200, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(8, "Dark2"))

最常被提到的是膚色

trigram三個字連接後的wordcloud

trigrams_united <- trigram_color %>%
  unite(trigram, word1, word2, word3, sep = "_")%>%
  filter(!trigram %in% c("NA_NA_NA"))

# set.seed(1234)
# wordcloud(words = trigrams_united$trigram,
#           max.words=200, random.order=FALSE, rot.per=0.35, 
#           colors=brewer.pal(8, "Dark2"))

trigram

最常出現的膚色是olive skin tone這種膚色調而口紅顏色當中冷色調（blue-based）的紅色最常被討論

去掉所有形容膚色的詞彙留下可能為討論口紅的顏色

without_skin_color <- filter(trigram_color, word2 !="skin")

合併詞彙再做一次文字雲

new_united <- without_skin_color %>%
  unite(trigram, word1, word2, word3, sep = "_")


# set.seed(1234)
# wordcloud(words = new_united$trigram,
#           max.words=200, random.order=FALSE, rot.per=0.35, 
#           colors=brewer.pal(8, "Dark2"))

trigram color

可以看到近年流行深紅，而深紅色被提到的次數為第二
由perfect mattered可以知道指的是完美的霧面，霧面口紅也的確是近年的流行

畫成長條圖看看什麼顏色最常被提到

trigram<-new_united %>%
  count(trigram, sort = TRUE)

trigram %>%
  filter(nn>5)%>%
  ggplot(.,aes(x=reorder(trigram,nn),y=nn))+
  geom_bar(stat="identity")+
  coord_flip()+ labs(y='count',x='trigram of color')

抓出討論亮面與霧面的紀錄

#建立要被篩選的字
mat_word <- c("matt","matte","matted")
gloss_word <- c("gloss","glossy")

#以trigram找出有被提到是霧面還亮面的評論
matte <- filter(trigrams_filtered, word1 %in% mat_word | word2 %in% mat_word | word3 %in% mat_word)
gloss <- filter(trigrams_filtered, word1 %in% gloss_word | word2 %in% gloss_word | word3 %in% gloss_word)

color <- conbine_tidy %>% filter(sentence %in% mat_word |sentence %in% gloss_word | sentence %in% c("dark","red","tone","brown","pink","orange","black","purple","fire","cupcake","nap","cabernet","lay","temptation"))

## 霧面 vs 亮面唇膏趨勢圖
color %>%
  select(art_Date1,sentence)%>%
  count(art_Date1,sentence) %>%
  arrange(desc(n))%>%
  filter(n>10)%>%
  filter(sentence %in% mat_word | sentence %in% gloss_word)%>%
  ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
 geom_line(aes(color = sentence), position="identity")

## 其他顏色趨勢圖
color %>%
  select(art_Date1,sentence)%>%
  count(art_Date1,sentence) %>%
  arrange(desc(n))%>%
  filter(n>10)%>%
  filter(!sentence %in% mat_word & !sentence %in% gloss_word)%>%
  ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
 geom_line(aes(color = sentence), position="identity")

準備資料

#類別轉factor
new_united$brand <- as.factor(new_united$brand)
new_united$trigram <- as.factor(new_united$trigram)


#新增一欄位：是否會進銷售排行前十名
predict_data <- new_united %>% 
  mutate(inRanking = ifelse(Rank2 >= 10, "no", "yes"))
predict_data$inRanking <- as.factor(predict_data$inRanking)

#切分訓練資料
set.seed(1)
indx <- sample(1:nrow(predict_data), nrow(predict_data)*0.7)
test  <- predict_data[-indx,]
train <- predict_data[indx,]

support vector classifier

## 用品牌及trigram的結果預測是否會進入銷售排行前十名

library(e1071)
svmfit6 <- svm(inRanking ~ brand + trigram, data=train , kernel ="linear", cost=10,scale=FALSE)

ypred <- predict (svmfit6 ,test)
x=table(predict = ypred , truth = test$inRanking )
sum(diag(x))/sum(x)

## [1] 0.9696376

結論

1.唇膏的評論難以使用主題模型分類，但透過尺度縮減及階層式分群使相似的字歸類在一起，能夠使我們推斷出評論潛在的主題
2.從評論當中可以發現消費者對於唇膏的顏色，以及是否搭配自己的膚色較為在意。除此之外，唇膏的目標客群通常為女性，因此從網路圖也可以發現評論者通常以女性居多
3.一開始使用單字斷詞進行分析時，所呈現出的效果不盡理想，最後嘗試透過bigram及trigram進行切割之後，所得出的字詞比較具有意義及可解釋性，最後我們使用trigram的結果來預測銷售績效也達到了不錯的效果

Final Project-Lipstick Review on Amazon

資料來源

動機與目的

基本設定+載入package

讀資料

review資料前處理

review,information資料合併

唇膏排名統計資料

品牌討論度

品牌被討論數隨時間分布圖by年分

斷字

情緒分析(使用NRC字典)

把情緒值為NA的移除

正負面情緒，最高與最低各10個brand

內縮/外放情緒，最高與最低各10個brand

支配程度情緒，最高與最低各10個brand

JOIN “Valence, Arousal維度” 的情緒

查看一下上述分析中的評論內容

加入星星數量

加入lipstick商品排名(Rank2)

討論正負情緒、star與rank的關係

畫出正負情緒、star與rank的關係圖

主題模型

算出beta的差異值

使用tSNE做尺度縮減看字跟字的相似性

過濾tfidf太小的字

尺度縮減

階層式分群

畫出文字雲

畫出評論者對品牌的網路圖 看看會不會區分成不同的集群

看看各個評論者對評分的網路圖 會不會有評論者只評高分或只評低分

買家對價格的網路圖 看看有沒有一群人都買高價或都買低價

使用bigram斷詞

看各bigram的次數

將bigram拆成word1, word2

trigram

找出評論中提到自己所使用的口紅顏色

將bigarm的兩個字連接後的wordcloud

trigram三個字連接後的wordcloud

去掉所有形容膚色的詞彙留下可能為討論口紅的顏色

合併詞彙再做一次文字雲

畫成長條圖 看看什麼顏色最常被提到

抓出討論亮面與霧面的紀錄

準備資料

support vector classifier

結論

畫出評論者對品牌的網路圖看看會不會區分成不同的集群

看看各個評論者對評分的網路圖會不會有評論者只評高分或只評低分

買家對價格的網路圖看看有沒有一群人都買高價或都買低價

畫成長條圖看看什麼顏色最常被提到