修改助教檔案
情緒字典:
安裝需要的packages
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales',"data.table")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(dplyr)
library(stringr)
require(tidytext)
library(wordcloud2)
require(data.table)
require(ggplot2)
require(reshape2)
require(wordcloud)
require(tidyr)
require(readr)
require(scales)
require(data.table)
資料來源:文字平台收集ptt八卦版,關鍵字搜尋:美心、還願
準備LIWC字典
#將匯入的字整理成辭典
p <- read_file("../dict/liwc/positive.txt")
n <- read_file("../dict/liwc/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
#以逗號做分割
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
LIWC_ch
字典1:LIWC_ch
以LIWC字典判斷文集中的word屬於正面字還是負面字
首先計算所有字在文集中出現的總數
word_count <- data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
filter(count>3)
word_count
與LIWC情緒字典join
文集中的字出現在LIWC字典中是屬於positive還是negative
word_count %>% inner_join(LIWC_ch)
# word_count屬於LIWC_ch辭典的字留下
data %>%
select(word) %>%
inner_join(LIWC_ch)
以LIWC情緒字典分析
統計每天的文章正面字的次數與負面字的次數
# 找出所有在時間區段中的日期
all_dates <-expand.grid(seq(as.Date(min(data$artDate)), as.Date(max(data$artDate)), by="day"), c("positive", "negative"))
str(all_dates)
names(all_dates) <- c("artDate", "sentiment")
#expand.grid用法
try <- expand.grid(h=c(60,80), w=c(100, 300), sex=c("Male", "Female"))
names(try) <- c("heigh","wide","sex")
try
#filter(word!="遊戲")刪除遊戲
plot_table<-data %>%
select(artDate,word,count) %>%
inner_join(LIWC_ch) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
head(plot_table)
#沒有資料的日期將count設為0
plot_table <- all_dates %>%
merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
head(plot_table)
plot_table %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
#replace_na用法
x <- c(1,2,7,8,NA,10,22,NA,15)
replace_na(x,0)
以LIWC字典統計的情緒字數關察兩條線的差異,結果不明顯
同樣的方法以自訂情緒字典來分析並觀察有無差異
準備字典:遊戲商字典
以遊戲商的角度判斷情緒字屬於positive還是negative
將LIWC的情緒字以遊戲商角度重新label
Ex:
驚悚:positive 不恐怖:negative
#寫出csv並觀察
#word_count %>% inner_join(LIWC_ch) %>% write_csv("tokens_sentiment.csv" )
p <- read_file("../dict/game_company/positive.txt")
n <- read_file("../dict/game_company/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
game_company_ch <- rbind(positive, negative)
game_company_ch %>% head()
plot_table<-data %>%
select(artDate,word,count) %>%
inner_join(game_company_ch) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
#沒有資料的日期將count設為0
plot_table <- all_dates %>%
merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
plot_table %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
接下來準備自訂字典xi_ch
以習近平的觀點來分析情緒字
準備字典:xi_ch
p <- read_file("../dict/xi/positive.txt")
n <- read_file("../dict/xi/negative.txt")
positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
xi_ch <- rbind(positive, negative)
xi_ch %>% head()
plot_table<-data %>%
select(artDate,word,count) %>%
inner_join(xi_ch) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
#沒有資料的日期將count設為0
plot_table <- all_dates %>%
merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
all.x=T,all.y=T) %>%
mutate(count = replace_na(count, 0))
plot_table %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"))
在02/15後負面情緒非常高
由於符咒風波引發中國玩家強烈的抵制
上面幾張圖發現02/15後有一段高峰
接下來都以02/15後的資料進行分析
比較兩個字典分析差異
senti_by_date_game_company<-data %>%
inner_join(game_company_ch) %>%
group_by(artDate, sentiment) %>%
summarise(n=sum(count)) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(method='game_company')
senti_by_date_game_company %>% head()
LIWC字典
計算每天的文章正面及負面字數
sentiment為正面字數減去負面字數
senti_by_date_LIWC<-data %>%
inner_join(LIWC_ch) %>%
group_by(artDate, sentiment) %>%
summarise(n=sum(count)) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(method='LIWC')
senti_by_date_LIWC %>% head()
比較兩個字典分析差異
bind_rows(senti_by_date_game_company,
senti_by_date_LIWC) %>%
filter(artDate>='2019-02-15') %>%
ggplot(aes(x= artDate,y=sentiment,fill=method)) +
geom_col(show.legend = FALSE) +
scale_x_date(labels = date_format("%m/%d")) +
facet_wrap(~method, ncol = 1, scales = "fixed")+
geom_text(aes(label=sentiment))
XI字典
計算每天的文章正面及負面字數
sentiment為正面字數減去負面字數
senti_by_date_XI<-data %>%
inner_join(xi_ch) %>%
group_by(artDate, sentiment) %>%
summarise(n=sum(count)) %>%
spread(sentiment, n, fill = 0) %>%
mutate(sentiment = positive - negative) %>%
mutate(method='XI')
比較遊戲廠商與習近平的分析角度差異
bind_rows(senti_by_date_game_company,
senti_by_date_XI) %>%
filter(artDate>='2019-02-15') %>%
ggplot(aes(x= artDate,y=sentiment,fill=method)) +
geom_col(show.legend = FALSE) +
scale_x_date(labels = date_format("%m/%d")) +
facet_wrap(~method, ncol = 1, scales = "fixed")+
geom_text(aes(label=sentiment))
從上面幾張圖看出在02/15後情緒起伏較明顯
以不同字典分析最常見的情緒字
LIWC 字典分析最常見正面及負面情緒字
word_count %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
清除’遊戲’辭彙
word_count<-word_count %>% filter(word!='遊戲')
word_count %>%
inner_join(LIWC_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
接下來以遊戲商及習近平的角度分析情緒
以自訂字典:遊戲廠商 分析最常見正面及負面情緒字
word_count %>%
inner_join(game_company_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
以自訂字典:習近平 分析最常見正面及負面情緒字
word_count %>%
inner_join(xi_ch) %>%
group_by(sentiment) %>%
top_n(10,wt = count) %>%
ungroup() %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(word, count, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
02/15後情緒數量大增
用文字雲分析02/15後討論的情緒字
LIWC字典
data %>%
filter(artDate>='2019-02-15') %>%
inner_join(LIWC_ch) %>%
group_by(word,sentiment) %>%
summarise(count=sum(count)) %>%
acast(word ~ sentiment, value.var = "count", fill = 0) %>%
comparison.cloud(colors = c("gray80", "gray20"),
max.words = 100)
自訂字典:遊戲廠商
data %>%
filter(artDate>='2019-02-15') %>%
inner_join(game_company_ch) %>%
group_by(word,sentiment) %>%
summarise(count=sum(count)) %>%
acast(word ~ sentiment, value.var = "count", fill = 0) %>%
comparison.cloud(colors = c("gray80", "gray20"),
max.words = 100)
自訂字典:習近平
data %>%
filter(artDate>='2019-02-15') %>%
inner_join(xi_ch) %>%
group_by(word,sentiment) %>%
summarise(count=sum(count)) %>%
acast(word ~ sentiment, value.var = "count", fill = 0) %>%
comparison.cloud(colors = c("gray80", "gray20"),
max.words = 100)
課堂練習
文字雲分析杜豐于及慈姑觀音兩腳色的評價
搜尋word中有杜豐于或慈姑觀音的文章
---
title: "中文資料使用不同情緒字典的情緒分析"
output: html_notebook
---
###修改助教檔案

#情緒字典:

+ LIWC
+ 使用者自訂字典
  + 遊戲商
  + 習近平

## 安裝需要的packages
```{r}
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales',"data.table")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
```

```{r}
library(dplyr)
library(stringr)
require(tidytext)
library(wordcloud2)
require(data.table)
require(ggplot2)
require(reshape2)
require(wordcloud)
require(tidyr)
require(readr)
require(scales)
require(data.table)
```




#資料來源:文字平台收集ptt八卦版,關鍵字搜尋:美心、還願
#資料集:social_media_0305_artWordFreq.csv

```{r}
data=fread('../data/social_media_0305_artWordFreq.csv',encoding = 'UTF-8')
```

```{r}
data %>% 
  filter(grepl('_',word)) 
```

```{r}
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
#轉換日期格式

head(data)

data<-data %>% 
  filter(!grepl('_',word)) 
#grepl:word出現正規表示式'_'
  
data
```







#準備LIWC字典



```{r}
#將匯入的字整理成辭典
p <- read_file("../dict/liwc/positive.txt")
n <- read_file("../dict/liwc/negative.txt")

positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
#以逗號做分割

positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
LIWC_ch <- rbind(positive, negative)
LIWC_ch
```

####字典1:LIWC_ch



#以LIWC字典判斷文集中的word屬於正面字還是負面字

####EX:
+ 讚美:positive
+ 卑劣:negative

####首先計算所有字在文集中出現的總數


```{r}
word_count <- data %>%
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count))  %>%
  filter(count>3)
 
word_count

```

####與LIWC情緒字典join
####文集中的字出現在LIWC字典中是屬於positive還是negative


```{r}
word_count %>% inner_join(LIWC_ch)
# word_count屬於LIWC_ch辭典的字留下

data %>% 
  select(word) %>%
  inner_join(LIWC_ch)

```





#以LIWC情緒字典分析
####統計每天的文章正面字的次數與負面字的次數


```{r}
# 找出所有在時間區段中的日期
all_dates <-expand.grid(seq(as.Date(min(data$artDate)), as.Date(max(data$artDate)), by="day"), c("positive", "negative"))
str(all_dates)

names(all_dates) <- c("artDate", "sentiment")
```
```{r}
#expand.grid用法
try <- expand.grid(h=c(60,80), w=c(100, 300), sex=c("Male", "Female"))
names(try) <- c("heigh","wide","sex")
try
```

```{r}
#filter(word!="遊戲")刪除遊戲

plot_table<-data %>%
  select(artDate,word,count) %>%
  inner_join(LIWC_ch) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count))  
head(plot_table)


#沒有資料的日期將count設為0
plot_table <- all_dates %>% 
  merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))
head(plot_table)


plot_table %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d")) 

```

```{r}
#replace_na用法
x <- c(1,2,7,8,NA,10,22,NA,15)
replace_na(x,0)
```

####以LIWC字典統計的情緒字數關察兩條線的差異，結果不明顯
####同樣的方法以自訂情緒字典來分析並觀察有無差異


#準備字典:遊戲商字典
####以遊戲商的角度判斷情緒字屬於positive還是negative
####將LIWC的情緒字以遊戲商角度重新label
Ex:

驚悚:positive
不恐怖:negative


```{r}
#寫出csv並觀察
#word_count %>%  inner_join(LIWC_ch) %>% write_csv("tokens_sentiment.csv" )
```





```{r}
p <- read_file("../dict/game_company/positive.txt")
n <- read_file("../dict/game_company/negative.txt")

positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
game_company_ch <- rbind(positive, negative)
game_company_ch %>% head()
```


```{r}
plot_table<-data %>%
  select(artDate,word,count) %>%
  inner_join(game_company_ch) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count))  

#沒有資料的日期將count設為0
plot_table <- all_dates %>% 
  merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))

plot_table %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d")) 
```




####接下來準備自訂字典xi_ch
####以習近平的觀點來分析情緒字

####準備字典:xi_ch

```{r}
p <- read_file("../dict/xi/positive.txt")
n <- read_file("../dict/xi/negative.txt")

positive <- strsplit(p, "[,]")[[1]]
negative <- strsplit(n, "[,]")[[1]]
positive <- data.frame(word = positive, sentiments = "positive")
negative <- data.frame(word = negative, sentiemtns = "negative")
colnames(negative) = c("word","sentiment")
colnames(positive) = c("word","sentiment")
xi_ch <- rbind(positive, negative)
xi_ch %>% head()
```


```{r}
plot_table<-data %>%
  select(artDate,word,count) %>%
  inner_join(xi_ch) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=sum(count))  

#沒有資料的日期將count設為0
plot_table <- all_dates %>% 
  merge(plot_table,by.x=c('artDate', "sentiment"),by.y=c('artDate', "sentiment"),
        all.x=T,all.y=T) %>% 
  mutate(count = replace_na(count, 0))

plot_table %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d")) 
```

####在02/15後負面情緒非常高
####由於符咒風波引發中國玩家強烈的抵制




####上面幾張圖發現02/15後有一段高峰
####接下來都以02/15後的資料進行分析





#比較兩個字典分析差異

```{r}
senti_by_date_game_company<-data %>% 
  inner_join(game_company_ch) %>%
  
  group_by(artDate, sentiment) %>%
  summarise(n=sum(count)) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(method='game_company') 


senti_by_date_game_company %>% head()
```



####LIWC字典
####計算每天的文章正面及負面字數
####sentiment為正面字數減去負面字數

```{r}
senti_by_date_LIWC<-data %>% 
  inner_join(LIWC_ch) %>%
  group_by(artDate, sentiment) %>%
  summarise(n=sum(count)) %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(method='LIWC') 

senti_by_date_LIWC %>% head()

```



#比較兩個字典分析差異
```{r}
bind_rows(senti_by_date_game_company,
          senti_by_date_LIWC) %>%
  
  filter(artDate>='2019-02-15') %>%
  ggplot(aes(x= artDate,y=sentiment,fill=method)) +
  
  geom_col(show.legend = FALSE) +
  scale_x_date(labels = date_format("%m/%d")) +
  facet_wrap(~method, ncol = 1, scales = "fixed")+
  geom_text(aes(label=sentiment))
  
```


####XI字典
####計算每天的文章正面及負面字數
####sentiment為正面字數減去負面字數


```{r}
senti_by_date_XI<-data %>% 
  inner_join(xi_ch) %>%
  group_by(artDate, sentiment) %>%
  summarise(n=sum(count))  %>%
  spread(sentiment, n, fill = 0) %>%
  mutate(sentiment = positive - negative) %>%
  mutate(method='XI') 

```



#比較遊戲廠商與習近平的分析角度差異

```{r}
bind_rows(senti_by_date_game_company,
          senti_by_date_XI) %>%
  filter(artDate>='2019-02-15') %>%
  ggplot(aes(x= artDate,y=sentiment,fill=method)) +
  geom_col(show.legend = FALSE) +
  scale_x_date(labels = date_format("%m/%d")) +
  facet_wrap(~method, ncol = 1, scales = "fixed")+
  geom_text(aes(label=sentiment))
```


從上面幾張圖看出在02/15後情緒起伏較明顯



#以不同字典分析最常見的情緒字


# LIWC 字典分析最常見正面及負面情緒字

```{r}
word_count %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
```

####清除'遊戲'辭彙

```{r}
word_count<-word_count %>% filter(word!='遊戲')
```



```{r}
word_count %>% 
  inner_join(LIWC_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
```




接下來以遊戲商及習近平的角度分析情緒


#### 以自訂字典:遊戲廠商 分析最常見正面及負面情緒字 

```{r}
word_count %>%
  inner_join(game_company_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = count) %>% 
  ungroup() %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
```

#### 以自訂字典:習近平 分析最常見正面及負面情緒字

```{r}
word_count %>%
  inner_join(xi_ch) %>%
  group_by(sentiment) %>%
  top_n(10,wt = count) %>% 
  ungroup() %>%
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
```








#02/15後情緒數量大增

#用文字雲分析02/15後討論的情緒字



####LIWC字典

```{r}
data %>% 
  filter(artDate>='2019-02-15') %>%
  inner_join(LIWC_ch) %>%
  group_by(word,sentiment) %>%
  summarise(count=sum(count)) %>%

  acast(word ~ sentiment, value.var = "count", fill = 0) %>%
  comparison.cloud(colors = c("gray80", "gray20"),
                   max.words = 100)
  
```


####自訂字典:遊戲廠商

```{r}
data %>% 
  filter(artDate>='2019-02-15') %>%
  inner_join(game_company_ch) %>%
  group_by(word,sentiment) %>%
  summarise(count=sum(count)) %>%

  acast(word ~ sentiment, value.var = "count", fill = 0) %>%
  comparison.cloud(colors = c("gray80", "gray20"),
                   max.words = 100)
  
```


####自訂字典:習近平

```{r}
data %>% 
  filter(artDate>='2019-02-15') %>%
  inner_join(xi_ch) %>%
  group_by(word,sentiment) %>%
  summarise(count=sum(count)) %>%
  acast(word ~ sentiment, value.var = "count", fill = 0) %>%
  comparison.cloud(colors = c("gray80", "gray20"),
                   max.words = 100)
```





####課堂練習


####文字雲分析杜豐于及慈姑觀音兩腳色的評價


####搜尋word中有杜豐于或慈姑觀音的文章




資料集:social_media_0305_artWordFreq.csv