1.基本設定+載入package

# Point the session at the project folder; the relative paths used later
# (./data/..., ./laptop_dictionary/...) are resolved against it.
# NOTE(review): absolute, machine-specific Windows path - adjust when the
# script is run on another machine.
setwd('D:\\Lab\\essay1')
getwd()
## [1] "D:/Lab/essay1"
#packages = c(
#  "dplyr","ggplot2","caTools","tm","SnowballC","ROCR","rpart","rpart.plot","randomForest")
#existing = as.character(installed.packages()[,1])
#for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

# Sys.setlocale("LC_ALL","C")
#options(digits=5, scipen=10)

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tm)
## Loading required package: NLP
library(SnowballC)
library(ROCR)
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
library(caTools)
library(rpart)
library(rpart.plot)
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(wordcloud)
## Loading required package: RColorBrewer
## 
## Attaching package: 'wordcloud'
## The following object is masked from 'package:gplots':
## 
##     textplot
library(stringr)
library(tidytext)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:randomForest':
## 
##     margin
## The following object is masked from 'package:NLP':
## 
##     annotate
library(tidyr)
library(data.table)
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
library(readr)

讀資料

# Load the scraped Amazon laptop reviews; the first row of the CSV holds the
# column names.
laptop_review <- fread("./data/yuchia.amazon_laptop_review.csv", header = TRUE)
#lipstick_review=fread('./data/yuchia.amazon_lipstick_review.csv',header = T)

資料清理

## Drop the surplus columns (positions 12 to 23)
laptop_review <- laptop_review[, -c(12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23)]
## Keep only the rows whose `error` field is 0, as a plain data frame
laptop_review <- as.data.frame(subset(laptop_review, error == 0))

## Date: art_Date uses abbreviated month names ("%b"), so the time locale is
## set to "C" before parsing
Sys.setlocale("LC_TIME", "C")
## [1] "C"
laptop_review$art_Date1 <- as.Date(laptop_review$art_Date, format = "%d-%b-%y")
laptop_review$Year <- year(laptop_review$art_Date1)
laptop_review$Month <- format(laptop_review$art_Date1, "%m")

## Brand: store as a factor and keep the distinct brand values
laptop_review$brand <- as.factor(laptop_review$brand)
brand <- unique(laptop_review$brand)

## Star: the first three characters of each rating string, as a number
laptop_review$star_sub <- as.numeric(substr(laptop_review$star, 1, 3))
laptop_review$author_star_sub <- as.numeric(substr(laptop_review$author_star, 1, 3))

# Remove duplicated reviews (identical art_Content), keeping all columns
laptop_review <- distinct(laptop_review, art_Content, .keep_all = TRUE)

2.品牌討論度

# By year
# Print the distinct brands present in the cleaned data
unique(laptop_review$brand)
##  [1] Acer            Asus            Lenovo          Huawei         
##  [5] Samsung         HP              Microsoft       MSI            
##  [9] Dell            Apple           Razer           Google         
## [13] LG              Gigabyte        XIDU            Alienware      
## [17] Wacom           Fusion5         Oemgenuine      RCA            
## [21] EVOO            ALLDOCUBE       iRULU           Pocket C.H.I.P 
## [25] Hewlett Packard Toshiba         Prostar         Sager          
## [29] Jumper          IVIEW           Yuntab          Azpen          
## [33] CHUWI           LHMZNIY         WinBook         PROSCAN        
## [37] CTL             Fenniu          NUVISION        Nextbook Flexx 
## [41] HYUNDAI         AWOW            Inspiron        CyberpowerPC   
## [45] Packard Bell    Intel           BIT             Bit            
## [49] Aorus           VIZIO           Ematic         
## 51 Levels: Acer Alienware ALLDOCUBE Aorus Apple Asus AWOW Azpen ... Yuntab
# Number of reviews per brand and year, restricted to years after 2010
brand_discuss_df <- laptop_review %>%
  group_by(brand, Year) %>%
  summarise(discuss_n = n()) %>%
  filter(Year > 2010)


# Total number of reviews per brand over those years
top10_df <- brand_discuss_df %>%
  group_by(brand) %>%
  summarise(discuss_sum = sum(discuss_n))


# Yearly review counts for the ten most-reviewed brands.
# Brands are ranked by their total review count (discuss_sum, descending) and
# the first ten names are used to select rows of brand_discuss_df.
discuss_plot_df <- subset(
  brand_discuss_df,
  brand %in% top10_df$brand[order(top10_df$discuss_sum, decreasing = TRUE)][1:10]
)

# Drop rows without a year. is.na() is the reliable test for missing values;
# comparing against the string "NA" only worked because subset() discards rows
# whose condition evaluates to NA.
discuss_plot_df <- subset(discuss_plot_df, !is.na(Year))

# By month: number of reviews per brand, year and month
brand_discuss_df_month <- laptop_review %>%
  group_by(brand, Year, Month) %>%
  summarise(discuss_n = n())

# Monthly review counts for the ten most-reviewed brands (same ranking by
# total review count as in the yearly table)
discuss_plot_df_month <- subset(
  brand_discuss_df_month,
  brand %in% top10_df$brand[order(top10_df$discuss_sum, decreasing = TRUE)][1:10]
)

品牌被討論數隨時間分布圖by年分

# By year: one line (with points) per brand, review count on the y axis
ggplot(discuss_plot_df, aes(x = Year, y = discuss_n, color = brand)) +
  geom_line() +
  geom_point(fill = "white") +
  scale_x_continuous(breaks = 2011:2019) +
  scale_y_continuous(breaks = seq(0, 9000, by = 1000))

品牌被討論數隨時間分布圖by月分

# Monthly review counts for 2015-2018: one panel per year, one line per brand.
p <- subset(discuss_plot_df_month, Year >= 2015 & Year < 2019) %>%
  group_by(brand, Year) %>%
  arrange(desc(Month)) %>%
  ggplot(aes(x = Month, y = discuss_n, color = brand, group = brand)) +
  geom_line() +
  geom_point()
p <- p + facet_grid(facets = Year ~ ., margins = FALSE) + theme_bw()
# Month is a zero-padded string ("01".."12"); show it as a plain month number.
# The previous `labels = labels` passed base::labels(), which numbers the axis
# breaks by position and would mislabel the axis whenever a month is absent
# from the data.
p +
  scale_y_continuous() +
  scale_x_discrete(labels = function(month) as.character(as.integer(month))) +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, size = 8))

商品討論次數

# For each product title: the number of reviews, plus the URL and brand taken
# from the first review of that product
top10_tool_df <- laptop_review %>%
  group_by(title) %>%
  summarise(
    discuss_n = n(),
    url = first(art_Url),
    brand = first(brand)
  )

#top10_tool_df[order(top10_tool_df$discuss_n,decreasing = T),] %>% View

3.評論字頻表(dtm)

## Build the corpus
# Trim surrounding whitespace and upper-case every review first (the original
# note says that lower-casing directly caused problems)
laptop_review$art_Content <- toupper(str_trim(laptop_review$art_Content))
############ Text cleaning, alternative 2 (disabled) ###############
# laptop_review$art_Content <- gsub("'", "", laptop_review$art_Content)
# laptop_review$art_Content <- gsub("[[:punct:]]", " ", laptop_review$art_Content) 
# laptop_review$art_Content <- gsub("[[:cntrl:]]", " ", laptop_review$art_Content) 
# laptop_review$art_Content <- gsub("^[[:space:]]+", "", laptop_review$art_Content) 
# laptop_review$art_Content <- gsub("[[:space:]]+$", "", laptop_review$art_Content)
# laptop_review$art_Content <- gsub("[[0-9]]", " ", laptop_review$art_Content)


# One corpus document per review
corpus <- Corpus(VectorSource(laptop_review$art_Content))
#corpus[[1]]$content[1]

## Convert to lower case
corpus <- tm_map(corpus, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(corpus, content_transformer(tolower)):
## transformation drops documents
#corpus[[2]]$content

## Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
## Warning in tm_map.SimpleCorpus(corpus, removePunctuation): transformation
## drops documents
#corpus[[1]]$content

## Remove filler words: the English stop words plus "laptop"
## (the original note argues that a word present in every review carries no
## meaning; it names "apple", but the word removed here is "laptop")
corpus <- tm_map(corpus, removeWords, c("laptop", stopwords("english")))
## Warning in tm_map.SimpleCorpus(corpus, removeWords, c("laptop",
## stopwords("english"))): transformation drops documents
#corpus[[1]]$content

## Stemming
corpus <- tm_map(corpus, stemDocument)
## Warning in tm_map.SimpleCorpus(corpus, stemDocument): transformation drops
## documents
#corpus[[1]]$content

##處理原形單字


### Document-term matrix (term-frequency table, DTM)
## Build the document-term matrix from the cleaned corpus
frequencies <- DocumentTermMatrix(corpus)
#frequencies$i

# Look at matrix
#findFreqTerms(frequencies, lowfreq=10) # list the terms at or above the lowfreq count

##### Remove terms that occur too rarely
# Original note: with 0.995 a term seen fewer than 5 times per thousand
# documents is removed; with 0.98 the cut-off is 20 per thousand
sparse <- removeSparseTerms(frequencies, 0.98)
sparse
## <<DocumentTermMatrix (documents: 128180, terms: 374)>>
## Non-/sparse entries: 2414644/45524676
## Sparsity           : 95%
## Maximal term length: 11
## Weighting          : term frequency (tf)
#documents: number of documents; terms: number of distinct terms kept (374 in the output above); sparsity: share of zero entries

##### Convert to a data frame
# Turn the sparse matrix into a dense matrix and then into a data frame
tweetsSparse <- as.data.frame(as.matrix(sparse))
# Write to csv
#write.csv(tweetsSparse, file = "dtm.csv")
# Make all variable names R-friendly: after the conversion the term column
# names are rewritten as syntactically valid names with make.names()
colnames(tweetsSparse) <- make.names(colnames(tweetsSparse))

文字雲(有問題)

# text=tweetsSparse %>% colSums %>% sort %>% as.data.frame()
# text$word=row.names(text)
# #text=text %>% filter(.>100)  
# 
# set.seed(1233)
# wordcloud(words = text$word, freq = text$., min.freq = 200,
#           max.words=5000, random.order=FALSE, rot.per=0.35, 
#           colors=brewer.pal(8, "Dark2"))

建立字典

##### Import the dictionaries (one text file per category, read line by line)
brand_txt <- readLines("./laptop_dictionary/brand.txt")
appearance_txt <- readLines("./laptop_dictionary/appearance.txt")
## Warning in readLines("./laptop_dictionary/appearance.txt"): 於 './
## laptop_dictionary/appearance.txt' 找到不完整的最後一列
product_txt <- readLines("./laptop_dictionary/product.txt")
## Warning in readLines("./laptop_dictionary/product.txt"): 於 './
## laptop_dictionary/product.txt' 找到不完整的最後一列
service_txt <- readLines("./laptop_dictionary/service.txt")
## Warning in readLines("./laptop_dictionary/service.txt"): 於 './
## laptop_dictionary/service.txt' 找到不完整的最後一列
specification_txt <- readLines("./laptop_dictionary/specification.txt")
## Warning in readLines("./laptop_dictionary/specification.txt"): 於 './
## laptop_dictionary/specification.txt' 找到不完整的最後一列
spend_txt <- readLines("./laptop_dictionary/spend.txt")
use_txt <- readLines("./laptop_dictionary/use.txt")
## Warning in readLines("./laptop_dictionary/use.txt"): 於 './
## laptop_dictionary/use.txt' 找到不完整的最後一列
use2_txt <- readLines("./laptop_dictionary/use2.txt")
## Warning in readLines("./laptop_dictionary/use2.txt"): 於 './
## laptop_dictionary/use2.txt' 找到不完整的最後一列
##### Check which dictionary words occur in the corpus: for every category,
##### build a document-term matrix restricted to that dictionary and store it
##### as a data frame. Note that `brand` replaces the brand vector assigned
##### earlier in the script.
brand <- as.data.frame(as.matrix(
  DocumentTermMatrix(corpus, list(dictionary = brand_txt))
))
appearance <- as.data.frame(as.matrix(
  DocumentTermMatrix(corpus, list(dictionary = appearance_txt))
))
product <- as.data.frame(as.matrix(
  DocumentTermMatrix(corpus, list(dictionary = product_txt))
))
service <- as.data.frame(as.matrix(
  DocumentTermMatrix(corpus, list(dictionary = service_txt))
))
specification <- as.data.frame(as.matrix(
  DocumentTermMatrix(corpus, list(dictionary = specification_txt))
))
spend <- as.data.frame(as.matrix(
  DocumentTermMatrix(corpus, list(dictionary = spend_txt))
))
use <- as.data.frame(as.matrix(
  DocumentTermMatrix(corpus, list(dictionary = use_txt))
))
use2 <- as.data.frame(as.matrix(
  DocumentTermMatrix(corpus, list(dictionary = use2_txt))
))

計算含重複

# Total occurrences of every dictionary word (repeats inside a review are
# counted), most frequent first, for each of the eight dictionaries
x <- list(brand, appearance, product, service, specification, spend, use, use2)
lapply(x, function(counts) sort(colSums(counts), decreasing = TRUE))
## [[1]]
##      dell      asus     googl      acer microsoft    lenovo   samsung 
##     12401     11009     10585      7268      6508      5316      4396 
##      appl     brand 
##      3813      3712 
## 
## [[2]]
##        look       light      pretti       small     display        size 
##       20105       14869        9123        8085        7951        7738 
##       model      design      weight       color      bright lightweight 
##        6903        5939        4618        4610        4356        4259 
##       heavi     portabl       black      beauti        thin 
##        4086        4036        3192        3169        2783 
## 
## [[3]]
##     comput chromebook    product        set      devic     tablet 
##      52239      23146      18576      11882      11237      10844 
##     laptop        pad    macbook    desktop        mac      phone 
##       8121       6482       5942       5122       3803       3557 
## 
## [[4]]
##    return recommend       fix  warranti    servic      deal    custom 
##     11458     11153      7079      5470      5323      5054      5030 
##     store    repair      ship     chang complaint     arriv      sent 
##      4509      4407      4103      3913      3369      3307      3021 
## 
## [[5]]
##      screen      window    keyboard     batteri       drive      machin 
##       40240       27692       25199       23743       16673       15053 
##        hard      instal         ssd      system         ram    internet 
##       11728       10439        8702        8167        7992        7956 
##     program        card        port         usb     softwar        mous 
##        7496        7120        7063        6999        6967        6752 
##        wifi   processor     speaker     graphic      memori      storag 
##        6293        5746        5734        5561        5541        5305 
##      driver        plug    touchpad touchscreen     version    trackpad 
##        5189        4712        4562        4545        4432        3904 
##     hardwar     process     resolut 
##        3863        3845        3554 
## 
## [[6]]
##   price     buy purchas   charg   money   order   worth   cheap     pay 
##   21873   20288   17915    8693    7670    5317    4616    4561    4150 
##    cost    free    valu   spend 
##    3747    3364    3067    2961 
## 
## [[7]]
##      use      run     fast    power     life     hour    touch     slow 
##    73512    19555    17387    13805    13094    12279    11406    11178 
##  qualiti  perform    sound    quick function    speed   featur    watch 
##    10909    10425     6766     6732     5914     5788     5607     5564 
## download     load     read    brows   experi    click  respons   access 
##     5480     5469     5402     4926     4808     4537     4218     3457 
##     edit     surf   faster    carri    crash   smooth    annoy 
##     3406     3277     3165     3136     3117     3114     3061 
## 
## [[8]]
##     work     game     play   school    offic     home   travel   colleg 
##    52544    20581    10653     6940     5850     4501     3995     3489 
##     note  student     movi research     task      job 
##     3473     3352     3028     2916     2903     2687

計算不含重複

# For every dictionary: the number of reviews mentioning each word, where
# several mentions inside one review count only once; most frequent first.
y <- list(brand, appearance, product, service, specification, spend, use, use2)
z <- lapply(y, function(b) {
  # Cap every count at 1 in a single vectorised step, then sum per column.
  # This replaces the per-column sapply()/ifelse() pass, which is slow on a
  # large table and collapses to a plain vector when the input has one row.
  sort(colSums(pmin(as.matrix(b), 1)), decreasing = TRUE)
})

改變格式畫出字典統計圖

# Turn every element of the list z into its own data frame z1, z2, ...
# with two columns: `times` (the count) and `word` (the term, taken from the
# names of the counted vector). seq_along() is used so that an empty list is
# simply skipped, and the rename is done once in the loop instead of being
# repeated for each of the eight data frames.
for (i in seq_along(z)) {
  assign(paste0("z", i), local({
    counts_df <- as.data.frame(z[[i]])
    counts_df$word <- row.names(counts_df)
    colnames(counts_df) <- c("times", "word")
    counts_df
  }))
}

字典統計圖

# brand
# Sort ascending by count and freeze that order in the factor levels
z1 <- z1[order(z1$times), ]
z1$word <- factor(z1$word, levels = z1$word)
p1 <- ggplot(z1, aes(x = factor(word), y = times)) +
  geom_bar(stat = "identity") +
  xlab("Brand") +
  ylab("times") +
  labs(title = "Brand - Times of Appearence") +
  geom_text(
    aes(label = times),
    position = position_dodge(0.9),
    hjust = 0
  )
# Horizontal bars
p1 + coord_flip()

# appearance
# Sort ascending by count and freeze that order in the factor levels
z2 <- z2[order(z2$times), ]
z2$word <- factor(z2$word, levels = z2$word)
p2 <- ggplot(z2, aes(x = factor(word), y = times)) +
  geom_bar(stat = "identity") +
  xlab("appearance") +
  ylab("times") +
  labs(title = "appearance - Times of Appearence") +
  geom_text(aes(label = times))
# Horizontal bars
p2 + coord_flip()

# Comparison of all categories: sum of `times` per dictionary, sorted ascending
all <- as.data.frame(sort(c(
  brand = sum(z1$times),
  appearance = sum(z2$times),
  product = sum(z3$times),
  service = sum(z4$times),
  specification = sum(z5$times),
  spend = sum(z6$times),
  use = sum(z7$times),
  use2 = sum(z8$times)
)))

# Category name as a column, ordered by total for plotting
all$word <- row.names(all)
colnames(all) <- c("times", "word")
all <- all[order(all$times), ]
all$word <- factor(all$word, levels = all$word)

ggplot(all, aes(x = factor(word), y = times)) +
  geom_bar(stat = "identity") +
  xlab("word") +
  ylab("times") +
  labs(title = "筆電的顧客購買因素") +
  geom_text(aes(label = times), vjust = 0)

google Vis(不會用到)

# library(googleVis)
# 
# Bar <- gvisBarChart(z1,"word","times",options=list(titleTextStyle="{color:'red',fontName:'Courier',fontSize:10}",height=1300))
# plot(Bar)

4.情緒分析

#### Custom NRC lexicon
nrc_custom <- get_sentiments("nrc")
# Remove a few words from the sentiment lexicon entirely
nrc_custom <- subset(
  nrc_custom,
  !word %in% c("battery", "ram", "money", "weight")
)
# Additionally remove the entry that links "quiet" to sadness
delete <- subset(nrc_custom, word == "quiet" & sentiment == "sadness")
nrc_custom <- anti_join(nrc_custom, delete)
## Joining, by = c("word", "sentiment")

top10 sentiment analysis

library(syuzhet)
#load("laptop_10brand_nrc.RData")
top10_brand <- c(
  "Acer", "Apple", "Asus", "Dell", "HP",
  "Lenovo", "Microsoft", "MSI", "Samsung", "Toshiba"
)
# Keep every review of the ten selected brands. %in% tests set membership;
# the previous `brand == top10_brand` recycled the ten names along the rows
# and therefore kept a row only when its brand happened to line up with the
# recycled position, silently dropping most reviews.
# NOTE(review): outputs recorded further down in this document were produced
# with the old filter and will change when the script is re-run.
top10_review <- laptop_review %>%
  filter(brand %in% top10_brand) %>%
  as.data.frame()

# NRC scores for every selected review (takes a while to run)
top10_review_sentiment <- get_nrc_sentiment(top10_review$art_Content)
#save(top10_review,top10_review_sentiment, file = "laptop_10brand_nrc.RData")

# Column totals of the NRC result, reshaped into a two-column data frame
# (sentiment, Score) for plotting
Sentimentscores_amazon <- data.frame(colSums(top10_review_sentiment[, ]))

names(Sentimentscores_amazon) <- "Score"
Sentimentscores_amazon <- cbind(
  "sentiment" = rownames(Sentimentscores_amazon),
  Sentimentscores_amazon
)
rownames(Sentimentscores_amazon) <- NULL

ggplot(data = Sentimentscores_amazon, aes(x = sentiment, y = Score)) +
  geom_bar(aes(fill = sentiment), stat = "identity") +
  theme(legend.position = "none") +
  xlab("Sentiments") +
  ylab("scores") +
  ggtitle("Sentiments of Top10 brand reviews")

使用nrc來看sentiment

# Review text, parsed date and brand of the selected reviews
brand_sentiment <- top10_review[, c("art_Content", "art_Date1", "brand")]
stop_words <- read.table("./data/stop_words.txt", header = TRUE)
# Take these words out of the stop-word list so that they stay in the text
stop_words_custom <- subset(
  stop_words,
  !word %in% c("small", "problem", "believe", "changes", "use")
)

# One row per token of each review, minus the customised stop words
tidy_brand <- brand_sentiment %>%
  unnest_tokens(word, art_Content) %>%
  anti_join(stop_words_custom)
## Joining, by = "word"
## Warning: Column `word` joining character vector and factor, coercing into
## character vector
去除stop word後統計字數
  # Word frequencies after stop-word removal, most frequent first
  count(tidy_brand, word, sort = TRUE)
## # A tibble: 25,499 x 2
##    word         n
##    <chr>    <int>
##  1 laptop    8452
##  2 computer  4537
##  3 use       3898
##  4 screen    3887
##  5 it's      3591
##  6 windows   2716
##  7 keyboard  2526
##  8 battery   2229
##  9 time      2134
## 10 price     2048
## # ... with 25,489 more rows
各品牌字數
# Number of tokens per brand, with the count column named brand_total
rename(count(tidy_brand, brand), brand_total = n)
## # A tibble: 10 x 2
##    brand     brand_total
##    <fct>           <int>
##  1 Acer            68335
##  2 Apple            9994
##  3 Asus           104183
##  4 Dell            64870
##  5 HP              53909
##  6 Lenovo          38246
##  7 Microsoft       13919
##  8 MSI              6794
##  9 Samsung         40172
## 10 Toshiba          8453
算出nrc score (從這邊開始用客製化字典)
# NRC scoring (the customised lexicon is used from here on): attach each
# brand's total token count to its rows, then keep only the tokens that
# appear in nrc_custom
total_sentimnent <- tidy_brand %>%
  group_by(brand) %>%
  mutate(brand_total = n()) %>%
  ungroup() %>%
  inner_join(nrc_custom)
## Joining, by = "word"
#unique(get_sentiments("nrc")$sentiment) #nrc裡面共有十個情緒
哪個品牌使用最多的負面字
# Share of negative words per brand, highest share first
total_sentimnent %>%
  # Occurrences per brand and sentiment (brand_total is carried along)
  count(brand, sentiment, brand_total) %>%
  # Only the negative category
  filter(sentiment == "negative") %>%
  # Negative words as a fraction of all of the brand's words
  mutate(percent = n / brand_total) %>%
  arrange(desc(percent))
## # A tibble: 10 x 5
##    brand     sentiment brand_total     n percent
##    <fct>     <chr>           <int> <int>   <dbl>
##  1 MSI       negative         6794   329  0.0484
##  2 HP        negative        53909  2580  0.0479
##  3 Lenovo    negative        38246  1826  0.0477
##  4 Dell      negative        64870  3069  0.0473
##  5 Toshiba   negative         8453   391  0.0463
##  6 Asus      negative       104183  4802  0.0461
##  7 Apple     negative         9994   453  0.0453
##  8 Microsoft negative        13919   623  0.0448
##  9 Acer      negative        68335  2830  0.0414
## 10 Samsung   negative        40172  1658  0.0413
哪個品牌使用最多的正面字
# Share of positive words per brand, highest share first
total_sentimnent %>%
  # Occurrences per brand and sentiment (brand_total is carried along)
  count(brand, sentiment, brand_total) %>%
  # Only the positive category
  filter(sentiment == "positive") %>%
  # Positive words as a fraction of all of the brand's words
  mutate(percent = n / brand_total) %>%
  arrange(desc(percent))
## # A tibble: 10 x 5
##    brand     sentiment brand_total     n percent
##    <fct>     <chr>           <int> <int>   <dbl>
##  1 Apple     positive         9994   956  0.0957
##  2 Samsung   positive        40172  3478  0.0866
##  3 HP        positive        53909  4580  0.0850
##  4 MSI       positive         6794   561  0.0826
##  5 Acer      positive        68335  5602  0.0820
##  6 Dell      positive        64870  5248  0.0809
##  7 Lenovo    positive        38246  3075  0.0804
##  8 Asus      positive       104183  8341  0.0801
##  9 Microsoft positive        13919  1099  0.0790
## 10 Toshiba   positive         8453   659  0.0780
哪些字在nrc的十大情緒擁有最高的情緒分數
# Most frequent words inside each NRC sentiment category
total_sentimnent %>%
  # Occurrences of every word / sentiment pair
  count(word, sentiment) %>%
  # Within each sentiment keep the top 10 rows; top_n() ranks by the last
  # column, n (see the "Selecting by n" message)
  group_by(sentiment) %>%
  top_n(10) %>%
  ungroup() %>%
  # Order the words by their count for plotting
  mutate(word = reorder(word, n)) %>%
  # One horizontal bar chart per sentiment, each with its own scales
  ggplot(aes(word, n, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ sentiment, scales = "free") +
  coord_flip()
## Selecting by n

十大品牌前十名討論最多“否定”的字
# Most frequent negative words of each brand
total_sentimnent %>%
  # Only the negative category
  filter(sentiment == "negative") %>%
  # Occurrences of every word / brand pair
  count(word, brand) %>%
  # Within each brand keep the top 10 rows (ranked by n)
  group_by(brand) %>%
  top_n(10) %>%
  ungroup() %>%
  # Make every word unique per brand ("word__brand") so that each panel gets
  # its own ordering by count
  mutate(word = reorder(paste(word, brand, sep = "__"), n)) %>%
  ggplot(aes(x = word, y = n, fill = brand)) +
  geom_col(show.legend = FALSE) +
  # Strip the "__brand" suffix again for the axis labels
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  facet_wrap(~ brand, nrow = 2, scales = "free") +
  coord_flip()
## Selecting by n

十大品牌前十名討論最多“正面”的字
# Most frequent positive words of each brand
total_sentimnent %>%
  # Only the positive category
  filter(sentiment == "positive") %>%
  # Occurrences of every word / brand pair
  count(word, brand) %>%
  # Within each brand keep the top 10 rows (ranked by n)
  group_by(brand) %>%
  top_n(10) %>%
  ungroup() %>%
  # Make every word unique per brand ("word__brand") so that each panel gets
  # its own ordering by count
  mutate(word = reorder(paste(word, brand, sep = "__"), n)) %>%
  ggplot(aes(x = word, y = n, fill = brand)) +
  geom_col(show.legend = FALSE) +
  # Strip the "__brand" suffix again for the axis labels
  scale_x_discrete(labels = function(x) gsub("__.+$", "", x)) +
  facet_wrap(~ brand, nrow = 2, scales = "free") +
  coord_flip()
## Selecting by n

觀察正面及負面字的時間變化,虛線為線性迴歸(lm)的趨勢線
# Load the lubridate package
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:data.table':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday,
##     week, yday, year
## The following object is masked from 'package:base':
## 
##     date
sentiment_by_time <- tidy_brand %>%
  # Floor every review date to the start of its six-month period
  mutate(date = floor_date(art_Date1, unit = "6 months")) %>%
  # Total number of tokens in each period
  group_by(date) %>%
  mutate(total_words = n()) %>%
  ungroup() %>%
  # Keep only the tokens found in the customised NRC lexicon
  inner_join(nrc_custom)
## Joining, by = "word"
sentiment_by_time %>%
  # Only the positive and negative categories
  filter(sentiment %in% c("positive", "negative")) %>%
  # Occurrences per period and sentiment (total_words is carried along)
  count(date, sentiment, total_words) %>%
  ungroup() %>%
  # Sentiment words as a fraction of all words in the period
  mutate(percent = n / total_words) %>%
  # Solid line: share per period; dashed line: linear (lm) fit without a
  # confidence band
  ggplot(aes(date, percent, col = sentiment)) +
  geom_line(size = 1.5) +
  geom_smooth(method = "lm", se = FALSE, lty = 2) +
  expand_limits(y = 0)
## Warning: Removed 2 rows containing non-finite values (stat_smooth).
## Warning: Removed 2 rows containing missing values (geom_path).

用各字典裡出現最多的字來看時間變化(待修正)
tidy_brand %>%
  # Only the selected words
  filter(word %in% c("look", "dell", "product",
                     "computer", "return", "screen","price","work","use")) %>%
  # Floor every review date to the first day of its month
  mutate(date = floor_date(art_Date1, unit = "1 months")) %>%
  # Occurrences per month and word
  count(date, word) %>%
  ungroup() %>%
  # One panel per word, four panels per row
  ggplot(aes(date, n, col = word)) +
  facet_wrap(~word, ncol = 4) +
  geom_line(size = 1.5, show.legend = FALSE) +
  expand_limits(y = 0)
## Warning: Removed 3 rows containing missing values (geom_path).

5.計算文本情緒分數

#### Order the reviews by date, number them within each brand, then tokenise.
#### NOTE(review): the original note speaks of splitting by sentence, but
#### unnest_tokens() is called without a `token` argument and the resulting
#### column is renamed to "word" below - confirm which unit is intended.
tidy_sentence <- brand_sentiment %>%
  filter(!is.na(art_Date1)) %>%
  arrange(art_Date1) %>%
  group_by(brand) %>%
  # Running review index inside each brand, in date order
  mutate(linenumber = row_number()) %>%
  unnest_tokens(sentence, art_Content)

# Rename the token column so that it can be joined with a lexicon on "word"
colnames(tidy_sentence) <- c("art_Date1", "brand", "linenumber", "word")
按照品牌計算所有文本的情緒分數
#### Using the bing lexicon: per brand and review index, count the positive
#### and negative tokens and take their difference as the sentiment score
all_tidy_sentence_sentiment <- tidy_sentence %>%
  inner_join(get_sentiments("bing")) %>%
  count(brand, linenumber, sentiment) %>%
  # One column per sentiment; missing combinations become 0
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative)
## Joining, by = "word"
### Plot: the x axis follows the reviews in date order (linenumber); one panel
### per brand, two panels per row, each with its own x range
ggplot(all_tidy_sentence_sentiment, aes(linenumber, sentiment, fill = brand)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~brand, ncol = 2, scales = "free_x") +
  xlab("index") +
  ylab("sentiment score")

### Trend view: plotting every review is too much, so consecutive line numbers
### are bucketed by integer division and the sentiment scores are summed per
### bucket.
### NOTE(review): the original note mentions units of 80, but the code divides
### by 30; the grouping is by index only, so a bucket pools all brands - confirm
### both.
tidy_sentence_sentiment <- all_tidy_sentence_sentiment %>%
  mutate(index = linenumber %/% 30) %>%
  group_by(index) %>%
  mutate(score = sum(sentiment))

6.用bind tf-idf試試分類字典

# Restore previously saved objects from the working directory.
# NOTE(review): the file is named lipstick_tidy.RData although this script
# analyses laptop reviews; the commented-out code below saves tidy_tfidf under
# the same file name - confirm that this is the intended file.
load("lipstick_tidy.RData")
# tidy_tfidf<-laptop_review[,c("art_Content","art_Date1","brand")]  %>%
#   unnest_tokens(word, art_Content) %>%
#   count(brand, word, sort = TRUE)
# 
# total_words <- tidy_tfidf%>% 
#   group_by(brand) %>% 
#   summarize(total = sum(n))
# 
# tidy_tfidf <- left_join(tidy_tfidf, total_words)
# 
# tidy_tfidf <- tidy_tfidf %>%
#   bind_tf_idf(word, brand, n)

#save(tidy_tfidf, file = "lipstick_tidy.RData")