欲了解在Amazon.com上留言的評論者對於唇膏的整體看法,並以不同的角度剖析評論,像是了解評論者的情緒起伏,或是評論當中所涵蓋的主題內容,評論字詞使用的相似程度,甚至是評論者與品牌、星等、價格之間的關聯性。最後透過模型預測什麼樣的評論有可能會進入銷售前十名
packages = c(
"dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest","tidytext","wordcloud","topicmodels","doParallel")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
Sys.setlocale("LC_ALL","C")
## [1] "LC_CTYPE=C;LC_NUMERIC=C;LC_TIME=C;LC_COLLATE=C;LC_MONETARY=C;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
#options(digits=5, scipen=10)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(SnowballC)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(wordcloud)
## Loading required package: RColorBrewer
##
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
##
## textplot
library(stringr)
library(tidytext)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## The following object is masked from 'package:NLP':
##
## annotate
library(tidyr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(readr)
library(topicmodels)
library(lexicon)
library(koRpus)
## Loading required package: sylly
## For information on available language packages for 'koRpus', run
##
## available.koRpus.lang()
##
## and see ?install.koRpus.lang()
##
## Attaching package: 'koRpus'
## The following object is masked from 'package:readr':
##
## tokenize
library(quanteda)
## Package version: 1.4.3
## Parallel computing: 2 of 24 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:koRpus':
##
## tokens, types
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
library(syuzhet)
library(corrplot)
## corrplot 0.84 loaded
library(slam)
##
## Attaching package: 'slam'
## The following object is masked from 'package:data.table':
##
## rollup
library(doParallel)
## Loading required package: foreach
## Loading required package: iterators
## Loading required package: parallel
library(Rtsne)
library(igraph)
##
## Attaching package: 'igraph'
## The following object is masked from 'package:quanteda':
##
## as.igraph
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
library(ggrepel)
library("wordcloud")
library("RColorBrewer")
lipstick_detail=fread('./data/yuchia.lipstick_info_original.csv',na.strings=c("","NA"),stringsAsFactors = F)
lip_sub=fread('./data/yuchia.lipstick_info_suburl.csv',na.strings=c("","NA"),stringsAsFactors = F)
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
## [1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
lipstick=fread('./data/yuchia.amazon_lipstick_review.csv',header = T,stringsAsFactors = F)
##因為評論有些欄位有中文資料,需事先做前處理
##刪除空白$換行
lipstick$availability<- gsub("\\n", "", lipstick$availability)%>%
gsub("^\\s+|\\s+$", "", .)%>%
gsub("[.]", "",.)
#unique(lipstick$availability)
#unique(lipstick$star)
#找出沒有中文的欄位
lipstick_nochiness<-lipstick %>% filter(grepl('[a-zA-Z]',lipstick$availability))
lipstick_nochiness$star_sub<-substr(lipstick_nochiness$star,1,3)
##處理字串
lipstick_nochiness$availability<- gsub("^Only.*|^In.*|.*ship.*","In Stock",lipstick_nochiness$availability)%>%
gsub("^Available.*","Unavailable",.)
#unique(lipstick_nochiness$availability)
##找出有中文的欄位
lipstick_chiness<-lipstick %>% filter(!grepl('[a-zA-Z]',lipstick$availability))
##處理星等
lipstick_chiness$star_sub<-substr(lipstick_chiness$star,7,10)
##處理字串
lipstick_chiness$availability<- gsub("庫存僅剩\\s[0-9].*", "In Stock", lipstick_chiness$availability) %>%
gsub(".*[有]?現貨。", "In Stock",.)%>%
gsub("來自\\s這些暢銷品\\s有貨。", "Unavailable",.) %>%
gsub("暫時缺貨\\s,", "Unavailable",.)
##合併兩張dataframe
lipstick_review<-rbind(lipstick_nochiness,lipstick_chiness)
colnames(lipstick_review)[which(names(lipstick_review) == "a")] <- "ASIN"
##Date:
Sys.setlocale("LC_TIME", "C")
## [1] "C"
lipstick_review$art_Date1=as.Date(lipstick_review$art_Date,format = "%B %d,%Y")
lipstick_review$Year=year(lipstick_review$art_Date1)
lipstick_review$Month=format(lipstick_review$art_Date1, "%m")
##Brand:
lipstick_review$brand=lipstick_review$brand %>% as.factor
brand=lipstick_review$brand %>% unique
##Star:
#lipstick_review$star_sub=lipstick_review$star1 %>% substr(1,3)
lipstick_review$author_star_sub=lipstick_review$author_star %>% substr(1,3)
lipstick_review$star_sub =lipstick_review$star_sub %>% as.numeric()
lipstick_review$author_star_sub =lipstick_review$author_star_sub %>% as.numeric()
lipstick_review<-distinct(lipstick_review,art_Content, .keep_all= TRUE)#刪除重複
## 唇膏商品資料前處理
##刪除多餘的欄位
lipstick_detail <- lipstick_detail[,-c(22:45)]
## 與suburl合併
lipstick_detail<-rbind(lipstick_detail,lip_sub,fill=T)
## 刪除重複
lipstick_detail<-distinct(lipstick_detail,URL, .keep_all= TRUE)#刪除重複
#唇膏整合排名by ASIN
lipstick_review$ASIN<-lapply(lipstick_review$art_Url,function(x){strsplit(x, "/")[[1]][6]})%>%as.character()
lip_combine<-lipstick_detail %>%
filter(Category1 %in% c('Beauty & Personal Care','Beauty & Peronal Care'))%>%
select(Rank1,Rank2,Category1,ASIN,Customer_Reviews)
lipstick_all<-lipstick_review%>%left_join(lip_combine,by=c("ASIN"="ASIN"))
lipstick_all$Rank1<-lipstick_all$Rank1%>% str_replace_all(",","") %>% as.numeric
lipstick_all$Rank2<-lipstick_all$Rank2%>% str_replace_all(",","") %>% as.numeric
##只篩選有存貨&有Rank1的資料
lipstick_all<-lipstick_all %>%
filter(availability=="In Stock")%>%
filter(is.na(Rank2)==FALSE)
head(lipstick_all)
## _id
## 1 5c892605923ce022a2d09ff5
## 2 5c892605923ce022a2d09ff7
## 3 5c892605923ce022a2d09ff9
## 4 5c892605923ce022a2d09ffb
## 5 5c892605923ce022a2d09ffd
## 6 5c892605923ce022a2d09fff
## art_Content
## 1 very sticky. The colors were a little off. Caution to others: it drys very fast and very hard to take off. Id recommend to use coconut oil to remove the lipstick.
## 2 I really love the dark red lip color in this set. Yes some of the lighter colors in the set were ""sticky"" and settled in the lines of my lips however I solved that quickly by mosturizing my lips before applying the lip color and applying a setting powder. I would reccommend based on the colors as it is difficult to find a good set where all the colors compliment your skin tone like this one does to my skin.
## 3 Product definitely dries quick, but the color pallet is NOT a truly representaed by the photos. The color pallet I received is very brown/nude earth tones-not pink as it displays. Definitely returning.
## 4 The Color is beautiful but it's very strange texture! it makes your lips stick together! i put it on and was like ahh, this is great, and about 60 seconds later i'm saying........ what the heck is going on... I am going to try putting a coat of clear chapstick over it I guess? Too inexpensive to return, so i'm going to try and work with it, but very strange texture once it dries. you can't even rub your lips together.
## 5 Really pretty colors for the price! Dried quickly from the swatches and smell like chocolate \U0001f60a I cant wait to try on my lips.
## 6 Super Sticky! Starts off ok, nice colors but like the other probably better with clear gloss
## art_Date
## 1 February 18, 2018
## 2 November 4, 2018
## 3 February 14, 2018
## 4 January 11, 2018
## 5 September 9, 2018
## 6 January 18, 2018
## art_Url
## 1 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 2 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 3 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 4 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 5 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 6 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## author_name author_star availability brand
## 1 Joana 1.0 out of 5 stars In Stock SHERUI
## 2 Ashley 5.0 out of 5 stars In Stock SHERUI
## 3 frances padilla 2.0 out of 5 stars In Stock SHERUI
## 4 Katharine 1.0 out of 5 stars In Stock SHERUI
## 5 Evelina L. 5.0 out of 5 stars In Stock SHERUI
## 6 Lola Bunny 1.0 out of 5 stars In Stock SHERUI
## comment_title error
## 1 Id recommend to use coconut oil to remove the lipstick\n 0
## 2 Pocket Sized, Variety of colors\n 0
## 3 Not pink color palett!!! VERY BROWN!\n 0
## 4 The Color is beautiful but it's very strange texture\n 0
## 5 Cute mini matte lipstick set\n 0
## 6 STICKY!!! (Almost glue your mouth shut type sticky)\U0001f628\n 0
## star
## 1 3.2 out of 5 stars
## 2 3.2 out of 5 stars
## 3 3.2 out of 5 stars
## 4 3.2 out of 5 stars
## 5 3.2 out of 5 stars
## 6 3.2 out of 5 stars
## title
## 1 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 2 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 3 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 4 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 5 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 6 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## star_sub art_Date1 Year Month author_star_sub ASIN Rank1 Rank2
## 1 3.2 2018-02-18 2018 02 1 B0762M2ZWJ 1189 1
## 2 3.2 2018-11-04 2018 11 5 B0762M2ZWJ 1189 1
## 3 3.2 2018-02-14 2018 02 2 B0762M2ZWJ 1189 1
## 4 3.2 2018-01-11 2018 01 1 B0762M2ZWJ 1189 1
## 5 3.2 2018-09-09 2018 09 5 B0762M2ZWJ 1189 1
## 6 3.2 2018-01-18 2018 01 1 B0762M2ZWJ 1189 1
## Category1 Customer_Reviews
## 1 Beauty & Personal Care 264
## 2 Beauty & Personal Care 264
## 3 Beauty & Personal Care 264
## 4 Beauty & Personal Care 264
## 5 Beauty & Personal Care 264
## 6 Beauty & Personal Care 264
summary(lipstick_all$Rank2)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1 157 457 1979 2328 56629
#by年分
#lipstick_review$brand %>% unique
brand_discuss_df=lipstick_review %>%
group_by(.,brand,Year) %>%
summarise(
discuss_n=n()
)%>% filter(Year>2010)
brand_discuss_df %>%
group_by(.,brand) %>%
summarise(
discuss_sum=sum(discuss_n)
) ->top10_df
#top10品牌討論度by年分
brand_discuss_df$brand %in%
top10_df [order(top10_df$discuss_sum,decreasing = T),]$brand[1:10] %>%
subset(brand_discuss_df,.)->discuss_plot_df
discuss_plot_df<- subset(discuss_plot_df,Year!="NA")
#by年分
ggplot(discuss_plot_df, aes(x = Year, y =discuss_n,color=brand)) +
geom_line() + geom_point( fill = "white")+scale_x_continuous(breaks = c(2011:2019))+scale_y_continuous(breaks=c(0,1000,2000,3000,4000,5000,6000,7000,8000,9000))
brand_product_discuss_df=lipstick_review %>%
group_by(.,brand, ASIN,Year) %>%
summarise(
discuss_n=n()
)%>% filter(Year>2010)
top10_df
## # A tibble: 433 x 2
## brand discuss_sum
## <fct> <int>
## 1 100% PURE 316
## 2 16BRAND 1
## 3 3 Concept Eyes 4
## 4 3ce 17
## 5 3CE STUDIO 14
## 6 A:CONCEPT 1
## 7 A'some 1
## 8 Acrylichomedesign 13
## 9 Addictive Cosmetics 2
## 10 AL'IVER 43
## # ... with 423 more rows
brand_product_discuss_df %>%
na.omit()%>%
filter(Year==2018, brand%in% top10_df$brand)%>%
arrange(desc(discuss_n))
## # A tibble: 1,340 x 4
## # Groups: brand, ASIN [1,340]
## brand ASIN Year discuss_n
## <fct> <chr> <int> <int>
## 1 Maybelline New York B06XF16MWM 2018 1226
## 2 Lime Crime B00JH2B182 2018 487
## 3 L'Oreal Paris B004BJ6VNG 2018 427
## 4 Maybelline New York B00467B0KC 2018 424
## 5 Burt's Bees B014VZUH1G 2018 337
## 6 COVERGIRL B00J2AP9OG 2018 335
## 7 LipSense B00HD5NRSC 2018 313
## 8 Revlon B001P2JZBC 2018 290
## 9 Revlon B000V5OQNG 2018 265
## 10 Maybelline New York B07BM1XTKH 2018 253
## # ... with 1,330 more rows
#所有品牌在2018所出的產品
lipstick_all %>%
filter(Year == 2018) %>%
left_join(lipstick_detail, by = 'ASIN') %>%
select(ASIN, Rank2.x, Price) %>%
unique() %>%
arrange(-desc(Rank2.x))%>%
head(10)
## ASIN Rank2.x Price
## 1 B0762M2ZWJ 1 10.88
## 2 B001P2JZBC 4 6.99
## 3 B00XQ9IRP2 6 19.93
## 4 B00YFQ0M3U 11 1.09
## 5 B07FD5KXT1 12 8.88
## 6 B00J2AP9OG 12 3.99
## 7 B00J2AP9OG 13 3.99
## 8 B014VZUH1G 14 6.03
## 9 B07JVP1MVR 32 15.89
## 10 B01M1OY39E 36 20.99
############文字清理方法二###############
lipstick_all$art_Content <- gsub("'", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[[:punct:]]", " ", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[[:cntrl:]]", " ",lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("^[[:space:]]+", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[[:space:]]+$", "", lipstick_all$art_Content)
lipstick_all$art_Content <- gsub("[0-9]+", " ", lipstick_all$art_Content)
lipstick_all$art_Content<-iconv(lipstick_all$art_Content,"WINDOWS-1252","UTF-8")
lipstick_all$art_Content <- tolower(lipstick_all$art_Content)
#重新編主鍵
lipstick_all<-lipstick_all %>%
mutate(id2 =rownames(lipstick_all))
conbine_tidy<-lipstick_all%>%
mutate(linenumber =row_number())%>%
unnest_tokens(sentence,art_Content)
NRC_VAD<-read.table("./data/NRC_VAD/NRC_VAD_Lexicon.txt",header = T,sep="\t",stringsAsFactors = F)
NRC<-get_sentiments("nrc")%>%as.data.frame()
NRC_all<-NRC_VAD%>%left_join(NRC,by=c("Word"="word"))
join2<- conbine_tidy %>%
inner_join(NRC_VAD,by=(c("sentence"="Word")))%>%
group_by(linenumber,id2)%>%
summarise(n=n(),
valence_avg=sum(Valence)/n,
arousal_avg=sum(Arousal)/n,
dominance_avg=sum(Dominance)/n)
#合併join2
lipstick_all<-lipstick_all%>%
left_join(join2,by=c("id2"))
lipstick_all %>% head(10)
## _id
## 1 5c892605923ce022a2d09ff5
## 2 5c892605923ce022a2d09ff7
## 3 5c892605923ce022a2d09ff9
## 4 5c892605923ce022a2d09ffb
## 5 5c892605923ce022a2d09ffd
## 6 5c892605923ce022a2d09fff
## 7 5c892605923ce022a2d0a001
## 8 5c892605923ce022a2d0a003
## 9 5c892605923ce022a2d0a005
## 10 5c892605923ce022a2d0a007
## art_Content
## 1 very sticky the colors were a little off caution to others it drys very fast and very hard to take off id recommend to use coconut oil to remove the lipstick
## 2 i really love the dark red lip color in this set yes some of the lighter colors in the set were sticky and settled in the lines of my lips however i solved that quickly by mosturizing my lips before applying the lip color and applying a setting powder i would reccommend based on the colors as it is difficult to find a good set where all the colors compliment your skin tone like this one does to my skin
## 3 product definitely dries quick but the color pallet is not a truly representaed by the photos the color pallet i received is very brown nude earth tones not pink as it displays definitely returning
## 4 the color is beautiful but its very strange texture it makes your lips stick together i put it on and was like ahh this is great and about seconds later im saying what the heck is going on i am going to try putting a coat of clear chapstick over it i guess too inexpensive to return so im going to try and work with it but very strange texture once it dries you cant even rub your lips together
## 5 really pretty colors for the price dried quickly from the swatches and smell like chocolate ðÿ˜š i cant wait to try on my lips
## 6 super sticky starts off ok nice colors but like the other probably better with clear gloss
## 7 does not feel good when applied the color seems to run in the lines and dries your lips immediately not a fan
## 8 i really love these colors the color tends to crumble off my lips around lunch time but reapplying is not a big deal to me it lasts a few hours which is fine the colors dry slightly darker but are great shades the texture is a bit sticky for a while so i pat a translucent finishing powder on top of the color
## 9 smells great wears good with lip toppers or is too sticky on its own and needs setting powder
## 10 i bought this item and loved all the colors i just purchased a mac matte lipstick but as i was applying it the color got everywhere all over my teeth too it was very wet when i found these i was hesitant but then felt for the price what have i got to loose i was not disappointed the color stays on forever and i will defiantly be purchasing this product again
## art_Date
## 1 February 18, 2018
## 2 November 4, 2018
## 3 February 14, 2018
## 4 January 11, 2018
## 5 September 9, 2018
## 6 January 18, 2018
## 7 May 17, 2018
## 8 March 8, 2018
## 9 May 7, 2018
## 10 January 8, 2019
## art_Url
## 1 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 2 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 3 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 4 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 5 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 6 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 7 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 8 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 9 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## 10 https://www.amazon.com/Velvety-Liquid-Lipstick-Lipgloss-Waterproof/product-reviews/B0762M2ZWJ/ref=cm_cr_dp_d_show_all_btm?ie=UTF8&reviewerType=all_reviews
## author_name author_star availability brand
## 1 Joana 1.0 out of 5 stars In Stock SHERUI
## 2 Ashley 5.0 out of 5 stars In Stock SHERUI
## 3 frances padilla 2.0 out of 5 stars In Stock SHERUI
## 4 Katharine 1.0 out of 5 stars In Stock SHERUI
## 5 Evelina L. 5.0 out of 5 stars In Stock SHERUI
## 6 Lola Bunny 1.0 out of 5 stars In Stock SHERUI
## 7 BLONDEE70 2.0 out of 5 stars In Stock SHERUI
## 8 Amazon Customer 4.0 out of 5 stars In Stock SHERUI
## 9 Kryska 4.0 out of 5 stars In Stock SHERUI
## 10 Amazon Customer 4.0 out of 5 stars In Stock SHERUI
## comment_title
## 1 Id recommend to use coconut oil to remove the lipstick\n
## 2 Pocket Sized, Variety of colors\n
## 3 Not pink color palett!!! VERY BROWN!\n
## 4 The Color is beautiful but it's very strange texture\n
## 5 Cute mini matte lipstick set\n
## 6 STICKY!!! (Almost glue your mouth shut type sticky)\U0001f628\n
## 7 Does not feel good when applied the color seems to run in the ...\n
## 8 Very nice\n
## 9 Love\n
## 10 Compared to MAC\n
## error star
## 1 0 3.2 out of 5 stars
## 2 0 3.2 out of 5 stars
## 3 0 3.2 out of 5 stars
## 4 0 3.2 out of 5 stars
## 5 0 3.2 out of 5 stars
## 6 0 3.2 out of 5 stars
## 7 0 3.2 out of 5 stars
## 8 0 3.2 out of 5 stars
## 9 0 3.2 out of 5 stars
## 10 0 3.2 out of 5 stars
## title
## 1 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 2 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 3 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 4 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 5 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 6 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 7 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 8 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 9 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## 10 6pcs Matte Velvety Liquid Lipstick Matte Liquid Lipgloss Waterproof Lip Gloss
## star_sub art_Date1 Year Month author_star_sub ASIN Rank1 Rank2
## 1 3.2 2018-02-18 2018 02 1 B0762M2ZWJ 1189 1
## 2 3.2 2018-11-04 2018 11 5 B0762M2ZWJ 1189 1
## 3 3.2 2018-02-14 2018 02 2 B0762M2ZWJ 1189 1
## 4 3.2 2018-01-11 2018 01 1 B0762M2ZWJ 1189 1
## 5 3.2 2018-09-09 2018 09 5 B0762M2ZWJ 1189 1
## 6 3.2 2018-01-18 2018 01 1 B0762M2ZWJ 1189 1
## 7 3.2 2018-05-17 2018 05 2 B0762M2ZWJ 1189 1
## 8 3.2 2018-03-08 2018 03 4 B0762M2ZWJ 1189 1
## 9 3.2 2018-05-07 2018 05 4 B0762M2ZWJ 1189 1
## 10 3.2 2019-01-08 2019 01 4 B0762M2ZWJ 1189 1
## Category1 Customer_Reviews id2 linenumber n valence_avg
## 1 Beauty & Personal Care 264 1 1 13 0.5472308
## 2 Beauty & Personal Care 264 2 2 28 0.6225000
## 3 Beauty & Personal Care 264 3 3 11 0.6561818
## 4 Beauty & Personal Care 264 4 4 22 0.6371818
## 5 Beauty & Personal Care 264 5 5 10 0.6382000
## 6 Beauty & Personal Care 264 6 6 8 0.7298750
## 7 Beauty & Personal Care 264 7 7 7 0.7061429
## 8 Beauty & Personal Care 264 8 8 22 0.6586364
## 9 Beauty & Personal Care 264 9 9 5 0.6686000
## 10 Beauty & Personal Care 264 10 10 20 0.6264500
## arousal_avg dominance_avg
## 1 0.4374615 0.3953077
## 2 0.4204286 0.4542143
## 3 0.3873636 0.4580909
## 4 0.4580455 0.5069091
## 5 0.4521000 0.4471000
## 6 0.4248750 0.4956250
## 7 0.5090000 0.4955714
## 8 0.3823182 0.4497727
## 9 0.4400000 0.4546000
## 10 0.3999500 0.4691000
# 移除author_name,brand,star_sub,author_star_sub,Rank2,情緒欄位為NA的資料
lipstick_all_no_NA = lipstick_all[complete.cases(lipstick_all[ , c("author_name","brand","star_sub","author_star_sub","Rank2", "valence_avg", "arousal_avg", "dominance_avg")]), ]
sentiment_valence = rbind(
# valence_avg 最高的十個品牌
lipstick_all_no_NA %>%
#filter(brand %in% discuss_plot_df$brand) %>%
group_by(brand) %>%
summarise(valence_all_avg = mean(valence_avg)) %>%
arrange(desc(valence_all_avg)) %>%
head(10) %>%
ungroup() %>%
mutate(score = "high"),
# valence_avg 最低的十個品牌
lipstick_all_no_NA %>%
#filter(brand %in% discuss_plot_df$brand) %>%
group_by(brand) %>%
summarise(valence_all_avg = mean(valence_avg)) %>%
arrange(valence_all_avg) %>%
head(10) %>%
ungroup() %>%
mutate(score = "low")
)
#畫圖
sentiment_valence %>%
group_by(score) %>%
arrange(desc(valence_all_avg)) %>%
ungroup() %>%
ggplot(aes(reorder(brand, valence_all_avg), valence_all_avg, fill = score)) +
geom_col() +
xlab("Brand") +
ylab("Valence AVG") +
coord_flip()
sentiment_arousal = rbind(
# arousal_avg 最高的十個品牌
lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(arousal_all_avg = mean(arousal_avg)) %>%
arrange(desc(arousal_all_avg)) %>%
head(10) %>%
ungroup() %>%
mutate(score = "high"),
# arousal_avg 最低的十個品牌
lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(arousal_all_avg = mean(arousal_avg)) %>%
arrange(arousal_all_avg) %>%
head(10) %>%
ungroup() %>%
mutate(score = "low")
)
# 畫圖
sentiment_arousal %>%
group_by(score) %>%
arrange(desc(arousal_all_avg)) %>%
ungroup() %>%
ggplot(aes(reorder(brand, arousal_all_avg), arousal_all_avg, fill = score)) +
geom_col() +
xlab("Brand") +
ylab("Arousal AVG") +
coord_flip()
sentiment_dominance = rbind(
# dominance_avg 最高的十個品牌
lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(dominance_all_avg = mean(dominance_avg)) %>%
arrange(desc(dominance_all_avg)) %>%
head(10) %>%
ungroup() %>%
mutate(score = "high"),
# dominance_avg 最低的十個品牌
lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(dominance_all_avg = mean(dominance_avg)) %>%
arrange(dominance_all_avg) %>%
head(10) %>%
ungroup() %>%
mutate(score = "low")
)
#畫圖
sentiment_dominance %>%
group_by(score) %>%
arrange(desc(dominance_all_avg)) %>%
ungroup() %>%
ggplot(aes(reorder(brand, dominance_all_avg), dominance_all_avg, fill = score)) +
geom_col() +
xlab("Brand") +
ylab("Dominance AVG") +
coord_flip()
# 把valence與arousal維度最高10個與最低10個都有出現的品牌join一起
sentiment_join = sentiment_valence %>%
inner_join(sentiment_arousal, by = c(brand = "brand"))
sentiment_join$score.x = as.factor(sentiment_join$score.x)
sentiment_join$score.y = as.factor(sentiment_join$score.y)
sentiment_join %>%
ggplot(aes(score.x, score.y, label = brand)) +
geom_jitter(position = position_jitter(seed = 87)) +
xlab("Valence") +
ylab("Arousal") +
geom_vline(xintercept = 1.5) + # 在x軸中間加入線條以利區別點點位置
geom_hline(yintercept = 1.5) + # 在y軸中間加入線條以利區別點點位置
scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) + # 將x軸的刻度對調
scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) + # 將y軸的刻度對調
geom_label_repel(position = position_jitter(seed = 87), aes(label = brand), box.padding = 0.7)
# 把每個點點標上brand名稱
這張圖可以看出, 越靠右邊的品牌越帶給消費者正面情緒,反之,越左邊越負面; 越靠上方的品牌代表消費者越外放的情緒,反之,越下方則越內縮。
1. 以Oriflame、ONE1X、K牌脣膏為例,消費者感受到正面的情緒,並且在評論中的敘述是比較想把好心得分享給大家
2. NOTE Cosmetics、Nykaa、A牌脣膏則是讓消費者產生負面情緒,而且消費者的評論也較積極表現出不滿的心情
3. Kryolan雖然讓消費者有負面情緒,但消費者的評論卻是用比較婉轉的方式說產品的缺點
lipstick_all_no_NA[lipstick_all_no_NA$brand == "Oriflame" | lipstick_all_no_NA$brand == "ONE1X", 2]
## [1] "so cute got them for a bachelorette gift bags the ladies all loved them actually fought over the colors lol"
## [2] "its superb my new lip love"
Oriflame、ONE1X的評論表現出很喜歡該品牌,有愛不釋手的感覺
lipstick_all_no_NA[lipstick_all_no_NA$brand == "NOTE Cosmetics", 2]
## [1] "i would never ever buy again so disappointed very bad experience"
## [2] "didn t keep it because it was totally broken off when i opened package wouldn t buy agsin"
NOTE Cosmetics的評論很明顯唾棄這種品牌的脣膏,而且還非常不推薦給其他人
lipstick_all_no_NA[lipstick_all_no_NA$brand == "Kryolan", 2]
## [1] "it has quite a lot of wax smell in it"
Kryolan的評論沒有直接表現出對品牌的喜好,但委婉地說出產品的缺點
star_amount = lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(star = mean(star_sub)) %>%
ungroup()
sentiment_join_star = sentiment_join %>%
inner_join(star_amount, by = c(brand = "brand"))
sentiment_join_star %>%
ggplot(aes(score.x, score.y, label = brand)) +
geom_jitter(position = position_jitter(seed = 87), aes(size = star)) + # 星星數越高,點點越大
xlab("Valence") +
ylab("Arousal") +
geom_vline(xintercept = 1.5) + # 加入線條以利區別點點位置
geom_hline(yintercept = 1.5) + # 加入線條以利區別點點位置
scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) + # 將x軸排序對調
scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) + # 將y軸排序對調
geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7) # 把每個點點標上brand名稱
# 將同品牌lipstick的rank平均
rank_avg = lipstick_all_no_NA %>%
group_by(brand) %>%
summarise(rank = mean(Rank2)) %>%
ungroup()
sentiment_join_rank = sentiment_join %>%
inner_join(rank_avg, by = c(brand = "brand"))
sentiment_join_rank %>%
ggplot(aes(score.x, score.y, label = brand)) +
geom_jitter(position = position_jitter(seed = 87), aes(size = -rank)) + # rank越前面,點點越大
xlab("Valence") +
ylab("Arousal") +
geom_vline(xintercept = 1.5) + # 加入線條以利區別點點位置
geom_hline(yintercept = 1.5) + # 加入線條以利區別點點位置
scale_x_discrete(limits = rev(levels(sentiment_join$score.x))) + # 將x軸排序對調
scale_y_discrete(limits = rev(levels(sentiment_join$score.y))) + # 將y軸排序對調
geom_label_repel(position = position_jitter(seed = 87), size = 3, box.padding = 0.7) # 把每個點點標上brand名稱
# 將正負面分數最高與最低各10個牌子,連結該牌的平均銷售排行與平均獲得星星數
valence_rank_star = sentiment_valence %>%
inner_join(rank_avg, by = c(brand = "brand")) %>%
inner_join(star_amount, by = c(brand = "brand"))
valence_rank_star
## # A tibble: 20 x 5
## brand valence_all_avg score rank star
## <fct> <dbl> <chr> <dbl> <dbl>
## 1 BeautiControl 0.918 high 24605 5
## 2 MILK MAKEUP 0.872 high 32875 5
## 3 ONE1X 0.867 high 2126 5
## 4 Oriflame 0.860 high 44499 5
## 5 LCTCKP 0.817 high 44649 5
## 6 KAT VON D Everlasting Glimmer Veil L… 0.806 high 15197 4
## 7 BORNTREE 0.796 high 28540 4
## 8 FLOWER 0.793 high 21762 3.3
## 9 Tayongpo 0.790 high 10539 4.5
## 10 Stephanie Imports 0.783 high 6951 5
## 11 VOGUE COLOMBIA 0.25 low 56629 2
## 12 Nykaa 0.291 low 38799 1
## 13 A'some 0.438 low 27233 2
## 14 YSL 0.440 low 4965 1
## 15 Kryolan 0.492 low 54303 1
## 16 Smoke & Mirrors 0.506 low 46511 1
## 17 SUNSENT 0.506 low 7506 2.7
## 18 Addictive Cosmetics 0.514 low 16925 2.4
## 19 MILEMEI 0.515 low 25234 1.33
## 20 NOTE Cosmetics 0.522 low 39886 1
valence_rank_star %>%
ggplot(aes(star, valence_all_avg, label = brand)) +
geom_point(aes(size = -rank, color = -rank)) + # 銷售排行越好,點點越大,顏色越深
#geom_text_repel(box.padding = 0.7, aes(size = 2)) +
geom_label_repel( box.padding = 0.7, size = 3) +
ylab("Sentiment Valence") +
xlab("Star Amount")
conbine_document<-conbine_tidy %>%
anti_join(stop_words,by=c("sentence"="word")) %>%
count(linenumber, sentence, sort = TRUE) %>%
ungroup()
#轉成dtm
top10_dtm <- conbine_document %>%
cast_dtm(linenumber, sentence, n)
top10_dtm
## <<DocumentTermMatrix (documents: 63345, terms: 19961)>>
## Non-/sparse entries: 686353/1263743192
## Sparsity : 100%
## Maximal term length: NA
## Weighting : term frequency (tf)
lda_top10 <- LDA(top10_dtm, k = 2, control = list(seed = 1234))
lda_top10
## A LDA_VEM topic model with 2 topics.
#主題跟字的關係
document_topics <- tidy(lda_top10, matrix = "beta")
document_topics
## # A tibble: 39,922 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 red 0.00740
## 2 2 red 0.00759
## 3 1 organic 0.000110
## 4 2 organic 0.000390
## 5 1 bees 0.000820
## 6 2 bees 0.000329
## 7 1 burts 0.000431
## 8 2 burts 0.000652
## 9 1 lipstick 0.0271
## 10 2 lipstick 0.0354
## # ... with 39,912 more rows
top_terms <- document_topics %>%
filter(!term %in% c("lipstick","color","love","lips","product","doe snt","lip","lipsticks"))%>%
group_by(topic) %>%
top_n(20, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms
## # A tibble: 40 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 doesnt 0.0124
## 2 1 perfect 0.0108
## 3 1 wear 0.0108
## 4 1 pink 0.0106
## 5 1 apply 0.0101
## 6 1 matte 0.0100
## 7 1 day 0.00910
## 8 1 price 0.00872
## 9 1 red 0.00740
## 10 1 time 0.00698
## # ... with 30 more rows
#畫圖呈現主題跟字的關係
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
beta_spread <- document_topics %>%
mutate(topic = paste0("topic", topic)) %>%
spread(topic, beta) %>%
filter(topic1 > .001 | topic2 > .001) %>%
mutate(log_ratio = log2(topic2 / topic1))
beta_spread
## # A tibble: 216 x 4
## term topic1 topic2 log_ratio
## <chr> <dbl> <dbl> <dbl>
## 1 absolutely 0.00247 0.00142 -0.802
## 2 add 0.00121 0.000465 -1.38
## 3 amazing 0.00103 0.00464 2.17
## 4 amazon 0.00265 0.00166 -0.672
## 5 application 0.00322 0.000759 -2.09
## 6 applicator 0.000225 0.00106 2.24
## 7 applied 0.00232 0.00204 -0.186
## 8 apply 0.0101 0.00145 -2.80
## 9 applying 0.00168 0.00170 0.0209
## 10 arrived 0.000699 0.00205 1.55
## # ... with 206 more rows
beta_spread%>%
arrange(desc(abs(log_ratio))) %>%
head(20)%>%
mutate(term = reorder(term, log_ratio)) %>%
ggplot(aes(term, log_ratio, fill = log_ratio>0)) +
geom_col(show.legend = FALSE) +
xlab("term") +
ylab("log(topic2 / topic1)") +
coord_flip()
library(doParallel)
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
## [1] 24
## 字頻表
dtm = lipstick_all$art_Content %>%
iconv(to = "utf-8", sub="") %>%
str_trim(.) %>%
toupper() %>%
VectorSource %>% Corpus %>%
tm_map(content_transformer(tolower)) %>%
tm_map(removePunctuation) %>%
tm_map(stemDocument) %>%
DocumentTermMatrix %>%
removeSparseTerms(0.995)
dtm # (documents: 14156, terms: 1030)
## <<DocumentTermMatrix (documents: 63980, terms: 560)>>
## Non-/sparse entries: 1124526/34704274
## Sparsity : 97%
## Maximal term length: 10
## Weighting : term frequency (tf)
dtm_tmp=dtm
library(slam)
tfidf = tapply(dtm$v/row_sums(dtm)[dtm$i], dtm$j, mean) *
log2(nrow(dtm)/col_sums(dtm > 0))
summary(tfidf)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0758 0.2012 0.2444 0.2652 0.3012 1.0575
dtm=dtm_tmp
dtm = dtm[, tfidf > 0.2012 ]
dtm = dtm[,order(-col_sums(dtm))]
dim(dtm)
## [1] 63980 420
#install.packages('Rtsne')
library(Rtsne)
# n = 300
# tsne = dtm[, 1:n] %>% as.data.frame.matrix %>%
# sapply(.,function(v)ifelse(v>1,1,v))%>%
# scale %>% t %>%
# Rtsne(check_dup=F, theta=0.0, max_iter=3200)
# Y = tsne$Y # tSNE coordinates
# d = dist(Y) # distance matrix
# hc = hclust(d) # hi-clustering
# K = 30 # number of clusters
# g = cutree(hc,K) # cut into K clusters
# table(g) %>% as.vector %>% sort # sizes of clusters
# # install.packages('randomcoloR')
# library(randomcoloR)
# library(wordcloud)
#
# wc = col_sums(dtm[,1:n])
# colors = distinctColorPalette(K)
#
#
# #png("./Amazon_lipstick.png", width=3200, height=1800)
# textplot(
# Y[,1], Y[,2], colnames(dtm)[1:n], show=F,
# col=colors[g],
# cex= 0.3 + 1.25 * sqrt(wc/mean(wc)),
# font=2)
# dev.off()
文字雲
nrow(lipstick) # 92524 num of data
## [1] 92524
length(unique(lipstick$author_name)) # 57672 comment author
## [1] 57672
length(unique(lipstick$brand)) #437 different brand
## [1] 437
tmp <- lipstick %>%
select(author_name, brand) %>%
filter(author_name != 'Kindle Customer' & author_name != 'Amazon Customer') %>%
group_by(brand) %>%
mutate(brand_sum = n()) %>%
ungroup() %>%
arrange(desc(brand_sum))
# 只取前10大品牌
top_10 <- unique(tmp$brand)[1:10]
# 因為評論者太多 只取留言數夠多的評論者做呈現
link <- tmp %>%
filter(brand %in% top_10) %>%
group_by(author_name) %>%
mutate(author_sum = n()) %>%
ungroup() %>%
distinct() %>%
filter(author_sum > 30)
author <- select(link, author_name, author_sum)
brand <- select(link, brand, brand_sum)
names(author) <- NULL
names(brand) <- NULL
nodes <- rbind(data.frame(author),data.frame(brand)) %>% distinct()
reviewNetwork <- graph_from_data_frame(d=link, v=nodes,directed = F)
reviewNetwork
## IGRAPH 7bd120e UN-- 35 228 --
## + attr: name (v/c), X2 (v/n), brand_sum (e/n), author_sum (e/n)
## + edges from 7bd120e (vertex names):
## [1] Michelle --Maybelline New York Heather --Maybelline New York
## [3] Melissa --Maybelline New York Mary --Maybelline New York
## [5] Emily --Maybelline New York Laura --Maybelline New York
## [7] Jennifer --Maybelline New York Amy --Maybelline New York
## [9] Karen --Maybelline New York Sarah --Maybelline New York
## [11] Anna --Maybelline New York Katie --Maybelline New York
## [13] Elizabeth--Maybelline New York Nicole --Maybelline New York
## [15] Amanda --Maybelline New York Samantha --Maybelline New York
## + ... omitted several edges
V(reviewNetwork)$size <- log(V(reviewNetwork)$X2)*3
# 紅色是品牌的node 藍色是author
plot(reviewNetwork,
vertex.color=c("skyblue", "red")[1+(names(V(reviewNetwork)) %in% top_10)])
length(unique(lipstick$author_star))
## [1] 5
star_link <- lipstick %>%
select(author_name, author_star) %>%
group_by(author_name) %>%
mutate(author_sum = n()) %>%
distinct() %>%
filter(author_sum > 40)
starNetwork <- graph_from_data_frame(d=star_link,directed = F)
starNetwork
## IGRAPH fe944c1 UN-- 56 251 --
## + attr: name (v/c), author_sum (e/n)
## + edges from fe944c1 (vertex names):
## [1] Ashley --5.0 out of 5 stars
## [2] Amazon Customer--4.0 out of 5 stars
## [3] Angela --5.0 out of 5 stars
## [4] Amazon Customer--1.0 out of 5 stars
## [5] Sara --1.0 out of 5 stars
## [6] Amazon Customer--3.0 out of 5 stars
## [7] Amazon Customer--5.0 out of 5 stars
## [8] Sarah --3.0 out of 5 stars
## + ... omitted several edges
plot(starNetwork)
colnames(lipstick_all)[which(names(lipstick_all) == "a")] <- "ASIN"
price_tmp <- lipstick_all %>%
left_join(lipstick_detail, by = 'ASIN') %>%
select(author_name, Price)
price_tmp$Price <- as.numeric(price_tmp$Price)
## Warning: NAs introduced by coercion
price_tmp <- na.omit(price_tmp)
summary(price_tmp$Price)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.75 5.85 8.83 12.99 19.99 118.99
plot(density(price_tmp$Price))
price_link <- price_tmp %>%
filter(author_name != 'Kindle Customer' & author_name != 'Amazon Customer') %>%
group_by(author_name) %>%
mutate(author_sum = n()) %>%
ungroup() %>%
mutate(price_index = ifelse(Price>20,'high','low')) %>%
filter(author_sum > 50) %>%
select(author_name, price_index, author_sum)
priceNetwork <- graph_from_data_frame(d=price_link,directed = F)
priceNetwork
## IGRAPH 83f0ea6 UN-- 17 970 --
## + attr: name (v/c), author_sum (e/n)
## + edges from 83f0ea6 (vertex names):
## [1] Ashley --low Sarah --low Nicole --low Nicole --low
## [5] Amanda --low Amanda --low Jessica --low Katie --low
## [9] Jessica --low Katie --low Nicole --low Mary --low
## [13] Sarah --low Lisa --low Jessica --low Michelle --low
## [17] Jennifer --low Lauren --low Amanda --low Stephanie--low
## [21] Stephanie--low Michelle --low Elizabeth--low Amanda --low
## [25] Jennifer --low Amanda --low Sarah --low Stephanie--low
## [29] Mary --low Katie --low Michelle --low Laura --low
## + ... omitted several edges
plot(priceNetwork)
#以bi-gram做斷詞
lipstick_bigrams <- lipstick_all%>%
mutate(linenumber =row_number())%>%
unnest_tokens(bigram,art_Content, token = "ngrams", n = 2)
lipstick_bigrams %>%
count(bigram, sort = TRUE)
## # A tibble: 286,727 x 2
## bigram nn
## <chr> <int>
## 1 the color 14629
## 2 i love 7932
## 3 it is 7805
## 4 my lips 7557
## 5 i have 6282
## 6 this is 6262
## 7 color is 6039
## 8 is a 5963
## 9 this lipstick 5711
## 10 love this 5217
## # ... with 286,717 more rows
bigrams_separated <- lipstick_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
## # A tibble: 77,589 x 3
## word1 word2 nn
## <chr> <chr> <int>
## 1 lip color 2089
## 2 skin tone 1297
## 3 beautiful color 1122
## 4 lip gloss 1085
## 5 love love 1039
## 6 lip balm 1005
## 7 nice color 924
## 8 staying power 828
## 9 matte lipstick 818
## 10 <NA> <NA> 812
## # ... with 77,579 more rows
lipstick_all %>%
unnest_tokens(trigram, art_Content, token = "ngrams", n = 3) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!word1 %in% stop_words$word,
!word2 %in% stop_words$word,
!word3 %in% stop_words$word) %>%
count(word1, word2, word3, sort = TRUE)
## # A tibble: 49,049 x 4
## word1 word2 word3 nn
## <chr> <chr> <chr> <int>
## 1 <NA> <NA> <NA> 2220
## 2 love love love 475
## 3 natural lip color 237
## 4 matte lip cream 121
## 5 matte lip color 87
## 6 soft matte lip 84
## 7 lips feel dry 80
## 8 love lime crime 76
## 9 ðÿ ðÿ ðÿ 72
## 10 lime crime velvetines 72
## # ... with 49,039 more rows
#將trigram的結果拆成三個字並另存成一dataframe
lipstick_trigrams <- lipstick_all%>%
mutate(linenumber =row_number())%>%
unnest_tokens(trigram,art_Content, token = "ngrams", n = 3)
trigrams_separated <- lipstick_trigrams %>%
separate(trigram, c("word1", "word2","word3"), sep = " ")
trigrams_filtered <- trigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word) %>%
filter(!word3 %in% stop_words$word)
#找出評論中提到的商品顏色
bigram_color <- filter(bigrams_filtered,word2%in% lipstick_detail$color)
trigram_color <- filter(trigrams_filtered,word3%in% lipstick_detail$color)
bigrams_united <- bigram_color %>%
unite(bigram, word1, word2, sep = "_")
set.seed(1234)
wordcloud(words = bigrams_united$bigram,
max.words=200, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(8, "Dark2"))
trigrams_united <- trigram_color %>%
unite(trigram, word1, word2, word3, sep = "_")%>%
filter(!trigram %in% c("NA_NA_NA"))
# set.seed(1234)
# wordcloud(words = trigrams_united$trigram,
# max.words=200, random.order=FALSE, rot.per=0.35,
# colors=brewer.pal(8, "Dark2"))
trigram
without_skin_color <- filter(trigram_color, word2 !="skin")
new_united <- without_skin_color %>%
unite(trigram, word1, word2, word3, sep = "_")
# set.seed(1234)
# wordcloud(words = new_united$trigram,
# max.words=200, random.order=FALSE, rot.per=0.35,
# colors=brewer.pal(8, "Dark2"))
trigram color
trigram<-new_united %>%
count(trigram, sort = TRUE)
color_discuss <- trigram %>%
filter(nn>5)%>%
ggplot(.,aes(x=reorder(trigram,nn),y=nn))+
geom_bar(stat="identity")+
coord_flip()+ labs(y='count',x='trigram of color')
color_discuss
color_mentioned <- new_united %>%
filter(brand %in% discuss_plot_df$brand) %>%
count(trigram, sort = TRUE)
color_mentioned
## # A tibble: 1,575 x 2
## trigram nn
## <chr> <int>
## 1 blue_based_red 16
## 2 blue_toned_red 14
## 3 fire_engine_red 13
## 4 bubble_gum_pink 12
## 5 deep_dark_red 11
## 6 perfect_matte_red 11
## 7 bright_hot_pink 7
## 8 nice_bright_red 7
## 9 beautiful_true_red 6
## 10 nice_soft_pink 6
## # ... with 1,565 more rows
color_plot <- color_mentioned %>%
head(10)%>%
ggplot(.,aes(x=reorder(trigram,nn),y=nn))+
geom_bar(stat="identity")+
coord_flip()+ labs(y='count',x='trigram of color')
color_plot
top10_color <- color_mentioned %>%
head(10)
temp <- new_united %>%
filter(brand %in% discuss_plot_df$brand) %>%
filter(trigram %in% color_mentioned$trigram) %>%
filter(trigram %in% top10_color$trigram) %>%
select(brand, trigram)
temp %>%
ggplot(aes(trigram, fill=brand)) + geom_bar(position="stack")
#類別轉factor
new_united$brand <- as.factor(new_united$brand)
new_united$trigram <- as.factor(new_united$trigram)
#新增一欄位:是否會進銷售排行前十名
predict_data <- new_united %>%
mutate(inRanking = ifelse(Rank2 >= 10, "no", "yes"))
predict_data$inRanking <- as.factor(predict_data$inRanking)
#切分訓練資料
set.seed(1)
indx <- sample(1:nrow(predict_data), nrow(predict_data)*0.7)
test <- predict_data[-indx,]
train <- predict_data[indx,]
## 用品牌及 trigram 的結果預測是否會進入銷售排行前十名
library(e1071)
svmfit <- svm(inRanking ~ brand + trigram, data=train , kernel ="linear", cost=10,scale=FALSE)
ypred <- predict (svmfit ,test)
x=table(predict = ypred , truth = test$inRanking )
sum(diag(x))/sum(x)
## [1] 0.9696376
#建立要被篩選的字
mat_word <- c("matt","matte","matted")
gloss_word <- c("gloss","glossy")
#以trigram找出有被提到是霧面還亮面的評論
matte <- filter(trigrams_filtered, word1 %in% mat_word | word2 %in% mat_word | word3 %in% mat_word)
gloss <- filter(trigrams_filtered, word1 %in% gloss_word | word2 %in% gloss_word | word3 %in% gloss_word)
color <- conbine_tidy %>% filter(sentence %in% mat_word |sentence %in% gloss_word | sentence %in% c("dark","red","tone","brown","pink","orange","black","purple","fire","cupcake","nap","cabernet","lay","temptation"))
## 霧面 vs 亮面唇膏趨勢圖
color %>%
select(art_Date1,sentence)%>%
count(art_Date1,sentence) %>%
arrange(desc(n))%>%
filter(n>10)%>%
filter(sentence %in% mat_word | sentence %in% gloss_word)%>%
ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
geom_line(aes(color = sentence), position="identity")
## 其他顏色趨勢圖
color %>%
select(art_Date1,sentence)%>%
count(art_Date1,sentence) %>%
arrange(desc(n))%>%
filter(n>10)%>%
filter(!sentence %in% mat_word & !sentence %in% gloss_word)%>%
ggplot(.,aes(art_Date1,n, fill = sentence,label=sentence)) +
geom_line(aes(color = sentence), position="identity")
new_united %>%
distinct(brand) %>%
filter(brand %in% discuss_plot_df)
## [1] brand
## <0 rows> (or 0-length row.names)
1.唇膏的評論難以使用主題模型分類,但透過尺度縮減及階層式分群使相似的字歸類在一起,能夠使我們推斷出評論潛在的主題
2.從評論當中可以發現消費者對於唇膏的顏色,以及是否搭配自己的膚色較為在意。除此之外,唇膏的目標客群通常為女性,因此從網路圖也可以發現評論者通常以女性居多
3.一開始使用單字斷詞進行分析時,所呈現出的效果不盡理想,最後嘗試透過bigram及trigram進行切割之後,所得出的字詞比較具有意義及可解釋性,最後我們使用trigram的結果來預測銷售績效也達到了不錯的效果