1.基本設定+載入package
# Set the working directory so that the relative "./data" and
# "./laptop_dictionary" paths used later in this script resolve.
# NOTE(review): this is an absolute, machine-specific Windows path; the script
# will fail on any machine where it does not exist.
setwd('D:\\Lab\\essay1')
# Echo the working directory to confirm the change.
getwd()
## [1] "D:/Lab/essay1"
#packages = c(
# "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest")
#existing = as.character(installed.packages()[,1])
#for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Sys.setlocale("LC_ALL","C")
#options(digits=5, scipen=10)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(SnowballC)
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
library(wordcloud)
## Loading required package: RColorBrewer
##
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
##
## textplot
library(stringr)
library(tidytext)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
##
## margin
## The following object is masked from 'package:NLP':
##
## annotate
library(tidyr)
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
library(readr)
讀資料
# Read the Amazon laptop review CSV; the first row supplies the column names.
# `TRUE` is spelled out because `T` is an ordinary variable that can be
# reassigned.
laptop_review <- fread("./data/yuchia.amazon_laptop_review.csv", header = TRUE)
# lipstick_review <- fread("./data/yuchia.amazon_lipstick_review.csv", header = TRUE)
資料清理
# Drop the surplus columns (positions 12 through 23), keep only rows whose
# `error` column equals 0, and continue with a plain data.frame.
laptop_review <- laptop_review[, -c(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)]
laptop_review <- as.data.frame(subset(laptop_review, error == 0))

# Date: switch the time locale to "C" before parsing the "%d-%b-%y" strings,
# then derive the year and the two-digit month from the parsed date.
Sys.setlocale("LC_TIME", "C")
## [1] "C"
review_dates <- as.Date(laptop_review$art_Date, format = "%d-%b-%y")
laptop_review$art_Date1 <- review_dates
laptop_review$Year <- year(review_dates)
laptop_review$Month <- format(review_dates, "%m")

# Brand: store as a factor and keep the distinct brand values.
laptop_review$brand <- as.factor(laptop_review$brand)
brand <- unique(laptop_review$brand)

# Star: take the first three characters of each rating string and convert
# them to numeric.
laptop_review$star_sub <- as.numeric(substr(laptop_review$star, 1, 3))
laptop_review$author_star_sub <- as.numeric(substr(laptop_review$author_star, 1, 3))

# Remove duplicates: keep the first row for each distinct review text.
laptop_review <- distinct(laptop_review, art_Content, .keep_all = TRUE)
2.品牌討論度
# By year
# Print the distinct brand values present in the cleaned reviews.
laptop_review$brand %>% unique
## [1] Acer Asus Lenovo Huawei
## [5] Samsung HP Microsoft MSI
## [9] Dell Apple Razer Google
## [13] LG Gigabyte XIDU Alienware
## [17] Wacom Fusion5 Oemgenuine RCA
## [21] EVOO ALLDOCUBE iRULU Pocket C.H.I.P
## [25] Hewlett Packard Toshiba Prostar Sager
## [29] Jumper IVIEW Yuntab Azpen
## [33] CHUWI LHMZNIY WinBook PROSCAN
## [37] CTL Fenniu NUVISION Nextbook Flexx
## [41] HYUNDAI AWOW Inspiron CyberpowerPC
## [45] Packard Bell Intel BIT Bit
## [49] Aorus VIZIO Ematic
## 51 Levels: Acer Alienware ALLDOCUBE Aorus Apple Asus AWOW Azpen ... Yuntab
# Number of reviews per brand and year, restricted to years after 2010.
brand_discuss_df <- laptop_review %>%
  group_by(brand, Year) %>%
  summarise(discuss_n = n()) %>%
  filter(Year > 2010)

# Total number of reviews per brand over that period.
top10_df <- brand_discuss_df %>%
  group_by(brand) %>%
  summarise(discuss_sum = sum(discuss_n))

# The (up to) ten most-reviewed brands, computed once and reused for both the
# yearly and the monthly subset. head() is used instead of [1:10] so that no
# NA entries appear when fewer than ten brands exist.
top10_brands <- head(
  top10_df$brand[order(top10_df$discuss_sum, decreasing = TRUE)],
  10
)

# Top-10 brand review counts by year. Missing years are excluded with
# is.na() rather than by comparing against the string "NA".
discuss_plot_df <- subset(
  brand_discuss_df,
  brand %in% top10_brands & !is.na(Year)
)

# By month: number of reviews per brand, year, and month (no year filter).
brand_discuss_df_month <- laptop_review %>%
  group_by(brand, Year, Month) %>%
  summarise(discuss_n = n())

# Top-10 brand review counts by month.
discuss_plot_df_month <- subset(
  brand_discuss_df_month,
  brand %in% top10_brands
)
品牌被討論數隨時間分布圖by年分
# By year: one line with point markers per brand, x ticks at every year from
# 2011 to 2019 and y ticks every 1000 reviews up to 9000.
ggplot(discuss_plot_df, aes(x = Year, y = discuss_n, color = brand)) +
  geom_line() +
  geom_point(fill = "white") +
  scale_x_continuous(breaks = 2011:2019) +
  scale_y_continuous(breaks = seq(0, 9000, by = 1000))

品牌被討論數隨時間分布圖by月分
# Monthly review counts for 2015-2018, one line per brand, one facet row per
# year. The former group_by()/arrange() calls were removed: they did not
# affect the plot, since geom_line() connects points in x order.
p <- subset(discuss_plot_df_month, Year >= 2015 & Year < 2019) %>%
  ggplot(aes(x = Month, y = discuss_n, color = brand, group = brand)) +
  geom_line() +
  geom_point()
p <- p + facet_grid(Year ~ .) + theme_bw()
# Label each tick with its own month number ("01" -> "1"). The previous
# `labels = labels` passed base::labels(), which numbers ticks by position and
# would mislabel the axis whenever a month is absent from the data.
p +
  scale_y_continuous() +
  scale_x_discrete(labels = function(m) as.character(as.integer(m))) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8))

商品討論次數
# Per product title: number of reviews, plus the first URL and the first
# brand seen for that title.
top10_tool_df <- summarise(
  group_by(laptop_review, title),
  discuss_n = n(),
  url = first(art_Url),
  brand = first(brand)
)
# top10_tool_df[order(top10_tool_df$discuss_n, decreasing = T), ] %>% View
3.評論字頻表(dtm)
## Build the corpus
# Trim surrounding whitespace and upper-case every review first (the original
# author noted that the later lower-casing step ran into problems otherwise).
laptop_review$art_Content <- toupper(str_trim(laptop_review$art_Content))
# ---- Text cleaning, alternative method 2 (disabled) ----
# laptop_review$art_Content <- gsub("'", "", laptop_review$art_Content)
# laptop_review$art_Content <- gsub("[[:punct:]]", " ", laptop_review$art_Content)
# laptop_review$art_Content <- gsub("[[:cntrl:]]", " ", laptop_review$art_Content)
# laptop_review$art_Content <- gsub("^[[:space:]]+", "", laptop_review$art_Content)
# laptop_review$art_Content <- gsub("[[:space:]]+$", "", laptop_review$art_Content)
# laptop_review$art_Content <- gsub("[[0-9]]", " ", laptop_review$art_Content)
corpus <- Corpus(VectorSource(laptop_review$art_Content))
# corpus[[1]]$content[1]
## Convert to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
# corpus[[2]]$content
## Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
# corpus[[1]]$content
## Remove filler words: the English stop words plus "laptop". (The original
## note argued that a word present in every review carries no information; it
## named "apple", but the term actually removed here is "laptop".)
corpus <- tm_map(corpus, removeWords, c("laptop", stopwords("english")))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, c("laptop",
## stopwords("english"))): transformation drops documents
# corpus[[1]]$content
## Stemming
corpus <- tm_map(corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
# corpus[[1]]$content
## Handle base-form words
### Document-term matrix (term frequency table, DTM)
## Build the document-term matrix
frequencies <- DocumentTermMatrix(corpus)
# frequencies$i
# Look at matrix
# findFreqTerms(frequencies, lowfreq=10)  # list terms that occur at least `lowfreq` times
##### Remove terms that are too rare
# With 0.98 a term is dropped when it is missing from more than 98% of the
# documents, i.e. it appears in fewer than about 20 of every 1000 reviews
# (0.995 would correspond to 5 per 1000).
sparse <- removeSparseTerms(frequencies, 0.98)
sparse
## <<DocumentTermMatrix (documents: 128180, terms: 374)>>
## Non-/sparse entries: 2414644/45524676
## Sparsity : 95%
## Maximal term length: 11
## Weighting : term frequency (tf)
# documents: number of documents; terms: number of distinct terms kept;
# sparsity: share of zero cells. (The original note cited 11625 terms, which
# presumably came from an earlier run; the printout above shows 374.)
##### Convert to a data frame
# Convert the sparse matrix to a dense matrix and then to a data frame.
tweetsSparse <- as.data.frame(as.matrix(sparse))
# Write to csv
# write.csv(tweetsSparse, file = "dtm.csv")
# Make all variable names R-friendly: once the matrix is a data frame, the
# term column names are rewritten as syntactically valid names.
names(tweetsSparse) <- make.names(names(tweetsSparse))
文字雲(有問題)
# text=tweetsSparse %>% colSums %>% sort %>% as.data.frame()
# text$word=row.names(text)
# #text=text %>% filter(.>100)
#
# set.seed(1233)
# wordcloud(words = text$word, freq = text$., min.freq = 200,
# max.words=5000, random.order=FALSE, rot.per=0.35,
# colors=brewer.pal(8, "Dark2"))
建立字典
##### Import the dictionaries: each line of a file becomes one dictionary term.
brand_txt <- readLines("./laptop_dictionary/brand.txt")
appearance_txt <- readLines("./laptop_dictionary/appearance.txt")
## Warning in readLines("./laptop_dictionary/appearance.txt"): 於 './
## laptop_dictionary/appearance.txt' 找到不完整的最後一列
product_txt <- readLines("./laptop_dictionary/product.txt")
## Warning in readLines("./laptop_dictionary/product.txt"): 於 './
## laptop_dictionary/product.txt' 找到不完整的最後一列
service_txt <- readLines("./laptop_dictionary/service.txt")
## Warning in readLines("./laptop_dictionary/service.txt"): 於 './
## laptop_dictionary/service.txt' 找到不完整的最後一列
specification_txt <- readLines("./laptop_dictionary/specification.txt")
## Warning in readLines("./laptop_dictionary/specification.txt"): 於 './
## laptop_dictionary/specification.txt' 找到不完整的最後一列
spend_txt <- readLines("./laptop_dictionary/spend.txt")
use_txt <- readLines("./laptop_dictionary/use.txt")
## Warning in readLines("./laptop_dictionary/use.txt"): 於 './
## laptop_dictionary/use.txt' 找到不完整的最後一列
use2_txt <- readLines("./laptop_dictionary/use2.txt")
## Warning in readLines("./laptop_dictionary/use2.txt"): 於 './
## laptop_dictionary/use2.txt' 找到不完整的最後一列
##### Check which dictionary terms occur in the corpus
# Build a document-term matrix restricted to `terms` and return it as a data
# frame with one row per document and one column per term.
dictionary_counts <- function(terms) {
  dtm <- DocumentTermMatrix(corpus, list(dictionary = terms))
  as.data.frame(as.matrix(dtm))
}
# Note: `brand` replaces the vector of distinct brand values assigned earlier.
brand <- dictionary_counts(brand_txt)
appearance <- dictionary_counts(appearance_txt)
product <- dictionary_counts(product_txt)
service <- dictionary_counts(service_txt)
specification <- dictionary_counts(specification_txt)
spend <- dictionary_counts(spend_txt)
use <- dictionary_counts(use_txt)
use2 <- dictionary_counts(use2_txt)
計算含重複
# For every dictionary, total occurrences of each term across all documents
# (repeats within a document are counted), sorted from most to least frequent.
x <- list(brand, appearance, product, service, specification, spend, use, use2)
lapply(x, function(counts) sort(colSums(counts), decreasing = TRUE))
## [[1]]
## dell asus googl acer microsoft lenovo samsung
## 12401 11009 10585 7268 6508 5316 4396
## appl brand
## 3813 3712
##
## [[2]]
## look light pretti small display size
## 20105 14869 9123 8085 7951 7738
## model design weight color bright lightweight
## 6903 5939 4618 4610 4356 4259
## heavi portabl black beauti thin
## 4086 4036 3192 3169 2783
##
## [[3]]
## comput chromebook product set devic tablet
## 52239 23146 18576 11882 11237 10844
## laptop pad macbook desktop mac phone
## 8121 6482 5942 5122 3803 3557
##
## [[4]]
## return recommend fix warranti servic deal custom
## 11458 11153 7079 5470 5323 5054 5030
## store repair ship chang complaint arriv sent
## 4509 4407 4103 3913 3369 3307 3021
##
## [[5]]
## screen window keyboard batteri drive machin
## 40240 27692 25199 23743 16673 15053
## hard instal ssd system ram internet
## 11728 10439 8702 8167 7992 7956
## program card port usb softwar mous
## 7496 7120 7063 6999 6967 6752
## wifi processor speaker graphic memori storag
## 6293 5746 5734 5561 5541 5305
## driver plug touchpad touchscreen version trackpad
## 5189 4712 4562 4545 4432 3904
## hardwar process resolut
## 3863 3845 3554
##
## [[6]]
## price buy purchas charg money order worth cheap pay
## 21873 20288 17915 8693 7670 5317 4616 4561 4150
## cost free valu spend
## 3747 3364 3067 2961
##
## [[7]]
## use run fast power life hour touch slow
## 73512 19555 17387 13805 13094 12279 11406 11178
## qualiti perform sound quick function speed featur watch
## 10909 10425 6766 6732 5914 5788 5607 5564
## download load read brows experi click respons access
## 5480 5469 5402 4926 4808 4537 4218 3457
## edit surf faster carri crash smooth annoy
## 3406 3277 3165 3136 3117 3114 3061
##
## [[8]]
## work game play school offic home travel colleg
## 52544 20581 10653 6940 5850 4501 3995 3489
## note student movi research task job
## 3473 3352 3028 2916 2903 2687
計算不含重複
# For every dictionary, the number of documents that contain each term at
# least once (repeats within a document are not counted), sorted from most to
# least frequent.
y <- list(brand, appearance, product, service, specification, spend, use, use2)
# `counts > 0` gives a logical document-by-term matrix and colSums() counts
# the TRUE cells per term. For the non-negative counts held here this equals
# the previous sapply()/ifelse() capping at 1, but it does not change shape
# for one-row or zero-column input and avoids a per-column ifelse() pass.
z <- lapply(y, function(counts) sort(colSums(counts > 0), decreasing = TRUE))
改變格式畫出字典統計圖
# Turn each named count vector in `z` into a data frame with the count in
# `times` and the term in `word` (the terms also remain as row names).
as_times_df <- function(counts) {
  out <- as.data.frame(counts)
  out$word <- row.names(out)
  colnames(out) <- c("times", "word")
  out
}
z_frames <- lapply(z, as_times_df)
# Expose one data frame per dictionary under the names z1..z8 used by the
# plotting code. This replaces the 1:length(z) / assign(paste0()) loop and the
# eight copy-pasted rename pairs.
z1 <- z_frames[[1]]
z2 <- z_frames[[2]]
z3 <- z_frames[[3]]
z4 <- z_frames[[4]]
z5 <- z_frames[[5]]
z6 <- z_frames[[6]]
z7 <- z_frames[[7]]
z8 <- z_frames[[8]]
字典統計圖
# brand
# Sort ascending by count and fix the factor levels in that order so the bars
# keep the sorted order.
z1 <- z1[order(z1$times), ]
z1$word <- factor(z1$word, levels = z1$word)
p1 <- ggplot(z1, aes(x = word, y = times)) +
  geom_col() +
  labs(x = "Brand", y = "times", title = "Brand - Times of Appearence") +
  geom_text(
    aes(label = times),
    position = position_dodge(0.9),
    hjust = 0
  )
# Flip the axes so the terms run down the y axis.
p1 + coord_flip()

# appearance
# Sort ascending by count and fix the factor levels in that order so the bars
# keep the sorted order.
z2 <- z2[order(z2$times), ]
z2$word <- factor(z2$word, levels = z2$word)
p2 <- ggplot(z2, aes(x = word, y = times)) +
  geom_col() +
  labs(x = "appearance", y = "times", title = "appearance - Times of Appearence") +
  geom_text(aes(label = times))
# Flip the axes so the terms run down the y axis.
p2 + coord_flip()

# Compare all categories: sum the per-term counts of each dictionary.
category_totals <- c(
  brand = sum(z1$times),
  appearance = sum(z2$times),
  product = sum(z3$times),
  service = sum(z4$times),
  specification = sum(z5$times),
  spend = sum(z6$times),
  use = sum(z7$times),
  use2 = sum(z8$times)
)
all <- as.data.frame(sort(category_totals))
all$word <- row.names(all)
colnames(all) <- c("times", "word")
# Order ascending by total and fix the factor levels in that order so the bars
# keep the sorted order.
all <- all[order(all$times), ]
all$word <- factor(all$word, levels = all$word)
ggplot(all, aes(x = word, y = times)) +
  geom_col() +
  labs(x = "word", y = "times", title = "筆電的顧客購買因素") +
  geom_text(aes(label = times), vjust = 0)

google Vis(不會用到)
# library(googleVis)
#
# Bar <- gvisBarChart(z1,"word","times",options=list(titleTextStyle="{color:'red',fontName:'Courier',fontSize:10}",height=1300))
# plot(Bar)
4.情緒分析
#### Custom NRC lexicon
nrc_custom <- get_sentiments("nrc")
# Remove these words from the sentiment lexicon entirely.
excluded_words <- c("battery", "ram", "money", "weight")
nrc_custom <- subset(nrc_custom, !(word %in% excluded_words))
# Also remove the single ("quiet", "sadness") entry; any other sentiment rows
# for "quiet" are kept.
delete <- subset(nrc_custom, word %in% "quiet" & sentiment %in% "sadness")
nrc_custom <- nrc_custom %>% anti_join(delete)
## Joining, by = c("word", "sentiment")
top10 sentiment analysis
library(syuzhet)
# load("laptop_10brand_nrc.RData")
top10_brand <- c(
  "Acer", "Apple", "Asus", "Dell", "HP",
  "Lenovo", "Microsoft", "MSI", "Samsung", "Toshiba"
)
# Keep every review whose brand is one of the ten brands. The previous
# `brand == top10_brand` recycled the 10-element vector along the rows, so a
# row was kept only when its brand matched the element at its own position
# (roughly one matching review in ten).
# NOTE(review): numbers echoed further down in this document were produced
# with the old filter and will change after re-running.
top10_review <- laptop_review %>%
  filter(brand %in% top10_brand) %>%
  as.data.frame()
# Score every review with syuzhet's NRC lexicon (this takes a while).
top10_review_sentiment <- get_nrc_sentiment(top10_review$art_Content)
# save(top10_review, top10_review_sentiment, file = "laptop_10brand_nrc.RData")

# Total score per sentiment column, reshaped to a two-column data frame
# (sentiment, Score) with default row names.
Sentimentscores_amazon <- data.frame(colSums(top10_review_sentiment[, ]))
names(Sentimentscores_amazon) <- "Score"
Sentimentscores_amazon <- cbind(
  "sentiment" = rownames(Sentimentscores_amazon),
  Sentimentscores_amazon
)
rownames(Sentimentscores_amazon) <- NULL
ggplot(data = Sentimentscores_amazon, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiments") +
  ylab("scores") +
  ggtitle("Sentiments of Top10 brand reviews")

使用nrc來看sentiment
# Keep only the review text, the parsed date, and the brand.
brand_sentiment <- top10_review[, c("art_Content", "art_Date1", "brand")]
# Read the stop-word list with `word` as character rather than factor, so the
# join below no longer has to coerce a factor (this removes the
# "joining character vector and factor" warning seen on older R versions).
# NOTE(review): read.table() treats "'" as a quote character by default, so
# stop words containing apostrophes (e.g. "it's") may be misread; confirm
# against the file and consider an explicit `quote` argument.
stop_words <- read.table(
  "./data/stop_words.txt",
  header = TRUE,
  stringsAsFactors = FALSE
)
# Take these words off the stop-word list so that they stay in the analysis.
stop_words_custom <- subset(
  stop_words,
  !word %in% c("small", "problem", "believe", "changes", "use")
)
# One row per word token, with the remaining stop words removed.
tidy_brand <- brand_sentiment %>%
  unnest_tokens(word, art_Content) %>%
  anti_join(stop_words_custom)
## Joining, by = "word"
去除stop word後統計字數
# Frequency of each remaining word, most frequent first.
count(tidy_brand, word, sort = TRUE)
## # A tibble: 25,499 x 2
## word n
## <chr> <int>
## 1 laptop 8452
## 2 computer 4537
## 3 use 3898
## 4 screen 3887
## 5 it's 3591
## 6 windows 2716
## 7 keyboard 2526
## 8 battery 2229
## 9 time 2134
## 10 price 2048
## # ... with 25,489 more rows
各品牌字數
# Number of word tokens per brand, reported in a column named `brand_total`.
rename(count(tidy_brand, brand), brand_total = n)
## # A tibble: 10 x 2
## brand brand_total
## <fct> <int>
## 1 Acer 68335
## 2 Apple 9994
## 3 Asus 104183
## 4 Dell 64870
## 5 HP 53909
## 6 Lenovo 38246
## 7 Microsoft 13919
## 8 MSI 6794
## 9 Samsung 40172
## 10 Toshiba 8453
算出nrc score (從這邊開始用客製化字典)
# Attach each brand's total token count to every token row, then keep only
# the tokens found in the custom NRC lexicon (one row per word/sentiment pair).
tokens_with_brand_total <- ungroup(
  mutate(group_by(tidy_brand, brand), brand_total = n())
)
total_sentimnent <- inner_join(tokens_with_brand_total, nrc_custom)
## Joining, by = "word"
# unique(get_sentiments("nrc")$sentiment)  # the NRC lexicon has ten sentiments
#unique(get_sentiments("nrc")$sentiment) #nrc裡面共有十個情緒
哪個品牌使用最多的負面字
# Share of each brand's tokens that are tagged "negative", highest first.
total_sentimnent %>%
  # Keep only the negative entries
  filter(sentiment == "negative") %>%
  # One row per brand, carrying the brand's total token count
  count(brand, sentiment, brand_total) %>%
  # Negative tokens as a fraction of all tokens for the brand
  mutate(percent = n / brand_total) %>%
  arrange(desc(percent))
## # A tibble: 10 x 5
## brand sentiment brand_total n percent
## <fct> <chr> <int> <int> <dbl>
## 1 MSI negative 6794 329 0.0484
## 2 HP negative 53909 2580 0.0479
## 3 Lenovo negative 38246 1826 0.0477
## 4 Dell negative 64870 3069 0.0473
## 5 Toshiba negative 8453 391 0.0463
## 6 Asus negative 104183 4802 0.0461
## 7 Apple negative 9994 453 0.0453
## 8 Microsoft negative 13919 623 0.0448
## 9 Acer negative 68335 2830 0.0414
## 10 Samsung negative 40172 1658 0.0413
哪個品牌使用最多的正面字
# Share of each brand's tokens that are tagged "positive", highest first.
total_sentimnent %>%
  # Keep only the positive entries
  filter(sentiment == "positive") %>%
  # One row per brand, carrying the brand's total token count
  count(brand, sentiment, brand_total) %>%
  # Positive tokens as a fraction of all tokens for the brand
  mutate(percent = n / brand_total) %>%
  arrange(desc(percent))
## # A tibble: 10 x 5
## brand sentiment brand_total n percent
## <fct> <chr> <int> <int> <dbl>
## 1 Apple positive 9994 956 0.0957
## 2 Samsung positive 40172 3478 0.0866
## 3 HP positive 53909 4580 0.0850
## 4 MSI positive 6794 561 0.0826
## 5 Acer positive 68335 5602 0.0820
## 6 Dell positive 64870 5248 0.0809
## 7 Lenovo positive 38246 3075 0.0804
## 8 Asus positive 104183 8341 0.0801
## 9 Microsoft positive 13919 1099 0.0790
## 10 Toshiba positive 8453 659 0.0780
哪些字在nrc的十大情緒擁有最高的情緒分數
# Ten most frequent words for each NRC sentiment.
top_words_by_sentiment <- total_sentimnent %>%
  count(word, sentiment) %>%
  group_by(sentiment) %>%
  # top_n() ranks by the last column (n) when no column is named
  top_n(10) %>%
  ungroup() %>%
  # Order the words by count so the bars are sorted
  mutate(word = reorder(word, n))
# One horizontal-bar panel per sentiment, each with its own axis ranges.
ggplot(top_words_by_sentiment, aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip()
## Selecting by n

十大品牌前十名討論最多“否定”的字
# Ten most frequent negative words for each brand.
negative_words_by_brand <- total_sentimnent %>%
  filter(sentiment == "negative") %>%
  count(word, brand) %>%
  group_by(brand) %>%
  # top_n() ranks by the last column (n) when no column is named
  top_n(10) %>%
  ungroup() %>%
  # Append the brand to the word so the same word can be ordered separately
  # within each brand's panel
  mutate(word = reorder(paste(word, brand, sep = "__"), n))
# One horizontal-bar panel per brand; the "__brand" suffix is stripped from
# the axis labels.
ggplot(negative_words_by_brand, aes(x = word, y = n, fill = brand)) +
  geom_col(show.legend = FALSE) +
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  facet_wrap(~ brand, nrow = 2, scales = "free") +
  coord_flip()
## Selecting by n

十大品牌前十名討論最多“正面”的字
# Ten most frequent positive words for each brand.
positive_words_by_brand <- total_sentimnent %>%
  filter(sentiment == "positive") %>%
  count(word, brand) %>%
  group_by(brand) %>%
  # top_n() ranks by the last column (n) when no column is named
  top_n(10) %>%
  ungroup() %>%
  # Append the brand to the word so the same word can be ordered separately
  # within each brand's panel
  mutate(word = reorder(paste(word, brand, sep = "__"), n))
# One horizontal-bar panel per brand; the "__brand" suffix is stripped from
# the axis labels.
ggplot(positive_words_by_brand, aes(x = word, y = n, fill = brand)) +
  geom_col(show.legend = FALSE) +
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  facet_wrap(~ brand, nrow = 2, scales = "free") +
  coord_flip()
## Selecting by n

觀察正面及負面字的時間變化,虛線為線性迴歸(lm)趨勢線
# Load the lubridate package
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
##
## hour, isoweek, mday, minute, month, quarter, second, wday,
## week, yday, year
## The following object is masked from 'package:base':
##
## date
# Floor each review date to the start of its 6-month period and attach the
# number of tokens in that period to every token row.
tokens_by_half_year <- tidy_brand %>%
  mutate(date = floor_date(art_Date1, unit = "6 months")) %>%
  group_by(date) %>%
  mutate(total_words = n()) %>%
  ungroup()
# Keep only the tokens found in the custom NRC lexicon.
sentiment_by_time <- inner_join(tokens_by_half_year, nrc_custom)
## Joining, by = "word"
# Positive and negative tokens as a share of all tokens in each 6-month
# period.
polarity_share <- sentiment_by_time %>%
  filter(sentiment %in% c("positive", "negative")) %>%
  count(date, sentiment, total_words) %>%
  ungroup() %>%
  mutate(percent = n / total_words)
# Solid line: share per period. Dashed line: linear (lm) fit per sentiment,
# drawn without a confidence band. The y axis is extended to include 0.
ggplot(polarity_share, aes(date, percent, col = sentiment)) +
  geom_line(size = 1.5) +
  geom_smooth(method = "lm", se = FALSE, lty = 2) +
  expand_limits(y = 0)
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_path).

用各字典裡出現最多的字來看時間變化(待修正)
# Monthly frequency of a fixed set of words.
tracked_words <- c(
  "look", "dell", "product",
  "computer", "return", "screen", "price", "work", "use"
)
monthly_word_counts <- tidy_brand %>%
  # Floor each date to the first day of its month
  mutate(date = floor_date(art_Date1, unit = "1 months")) %>%
  filter(word %in% tracked_words) %>%
  count(date, word) %>%
  ungroup()
# One panel per word (four panels per row); the y axis is extended to
# include 0.
ggplot(monthly_word_counts, aes(date, n, col = word)) +
  facet_wrap(~word, ncol = 4) +
  geom_line(size = 1.5, show.legend = FALSE) +
  expand_limits(y = 0)
## Warning: Removed 3 rows containing missing values (geom_path).

5.計算文本情緒分數
#### Order the reviews by date, number them within each brand, then tokenize.
# `linenumber` is the position of a review within its brand after sorting by
# date; reviews without a parsed date are dropped first.
# unnest_tokens() is called with its default tokenizer, so each row is one
# WORD (not a sentence), which is what the lexicon join on `word` further
# down requires. The token column is therefore named `word` directly; the
# earlier version named it `sentence` and then renamed all four columns by
# position, which would silently mislabel columns if their order changed.
# The result stays grouped by brand and has columns
# art_Date1, brand, linenumber, word.
tidy_sentence <- brand_sentiment %>%
  filter(!is.na(art_Date1)) %>%
  arrange(art_Date1) %>%
  group_by(brand) %>%
  mutate(linenumber = row_number()) %>%
  unnest_tokens(word, art_Content)
按照品牌計算所有文本的情緒分數
#### Using the bing lexicon
# Keep only the tokens found in the bing lexicon.
bing_tokens <- inner_join(tidy_sentence, get_sentiments("bing"))
# Per brand and review: count the tokens of each bing sentiment, spread the
# counts into one column per sentiment (0 when absent), and take
# positive - negative as the review's score.
all_tidy_sentence_sentiment <- bing_tokens %>%
  count(brand, linenumber, sentiment) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
### Plot: the x axis follows the date order of the reviews within each brand.
ggplot(
  all_tidy_sentence_sentiment,
  aes(x = linenumber, y = sentiment, fill = brand)
) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~brand, ncol = 2, scales = "free_x") +
  labs(x = "index", y = "sentiment score")

### Trend view: plotting every review is too dense, so reviews are bucketed by
### the integer quotient of their line number and 30, and the scores in each
### bucket are summed. (The original note said 80; the code divides by 30.)
# NOTE(review): the grouping below is by `index` only, so reviews from
# different brands that share an index are summed together; confirm whether
# per-brand buckets were intended.
bucketed_sentiment <- mutate(
  all_tidy_sentence_sentiment,
  index = linenumber %/% 30
)
tidy_sentence_sentiment <- bucketed_sentiment %>%
  group_by(index) %>%
  mutate(score = sum(sentiment))
6.用bind tf-idf試試分類字典
# Restore the objects saved in "lipstick_tidy.RData" into the workspace.
# NOTE(review): the disabled code below builds `tidy_tfidf` from laptop_review
# and saves it under this same "lipstick" file name; confirm that the file
# really holds the laptop tf-idf table.
load("lipstick_tidy.RData")
# Disabled: how the saved table was built (word counts per brand, the total
# per brand, then tf-idf with each brand treated as one document).
# tidy_tfidf<-laptop_review[,c("art_Content","art_Date1","brand")] %>%
# unnest_tokens(word, art_Content) %>%
# count(brand, word, sort = TRUE)
#
# total_words <- tidy_tfidf%>%
# group_by(brand) %>%
# summarize(total = sum(n))
#
# tidy_tfidf <- left_join(tidy_tfidf, total_words)
#
# tidy_tfidf <- tidy_tfidf %>%
# bind_tf_idf(word, brand, n)
#save(tidy_tfidf, file = "lipstick_tidy.RData")