Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output (on Windows, comment this line out)
## [1] ""
packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr", "data.table", "wordcloud2", "wordcloud", "DiagrammeR", "magrittr", "rtweet",
"xml2", "httr", "jsonlite", "data.tree")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(readr)
require(dplyr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggplot2)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
require(data.table)
require(wordcloud2)
require(wordcloud)
library(DiagrammeR)
library(magrittr)
library(rtweet)
library(xml2)
library(httr)
library(jsonlite)
library(data.tree)
# Read in the articles and comments
MetaData = fread('test_mid_articleMetaData.csv',encoding = 'UTF-8')
Reviews = fread('test_mid_articleReviews.csv',encoding = 'UTF-8')
# Match each article with its comments
Total_Data = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
#View(Total_Data)
data_day <- Total_Data %>%
dplyr::select(artDate, artUrl) %>%
distinct() %>%
group_by(artDate) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>% top_n(10)
data_day
## # A tibble: 11 x 2
## artDate count
## <chr> <int>
## 1 2021/02/01 30
## 2 2021/03/29 23
## 3 2021/03/01 22
## 4 2021/03/05 21
## 5 2021/03/04 16
## 6 2021/02/02 13
## 7 2021/03/28 12
## 8 2021/03/15 11
## 9 2021/02/06 9
## 10 2021/03/08 8
## 11 2021/03/27 8
Discussion peaked on February 1, the day the Myanmar military staged its coup.
data_day$artDate= data_day$artDate %>% as.Date("%Y/%m/%d")
data_day %>%
ggplot()+
geom_line(aes(x=artDate,y=count))+
xlab("日期") +
ylab("數量") +
geom_vline(xintercept = as.numeric(as.Date("2021-02-01")), col='red', size = 1) +
scale_x_date(labels = date_format("%m/%d"))
The Myanmar military staged an apparent coup on February 1: several ruling-party leaders, including President 溫敏 (Win Myint) and de facto leader 翁山蘇姬 (Aung San Suu Kyi), were arrested, and the military's TV station declared a state of emergency. On March 27, Myanmar's Armed Forces Day, the military held a parade while people across the country continued to protest in the streets. The military kept suppressing them by force; at least 114 civilians, including teenagers and children, were reportedly shot dead that day, the bloodiest since the February 1 coup.
# Load the custom dictionary and stop words
jieba_tokenizer <- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
# Define the tokenizer function
customized_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
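As a quick sanity check (assuming the two dictionary files above are in place), the tokenizer can be tried on a sample sentence:
# Hypothetical example: segment one sentence and inspect the tokens
customized_tokenizer("緬甸軍方發動政變")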
# Combine the tokenization results for articles and comments
MetaData = fread('test_mid_articleMetaData.csv',encoding = 'UTF-8')
Reviews = fread('test_mid_articleReviews.csv',encoding = 'UTF-8')
MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)
# Bind the two token tables together
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")])
#View(data)
# Convert the date column to Date format
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
# Filter out special characters
data_select = data %>%
filter(!grepl('[[:punct:]]',word)) %>% # drop punctuation
filter(!grepl("['^0-9a-z']",word)) %>% # drop English letters and digits
filter(nchar(word)>1) # drop single-character tokens
# word_count:artDate,word,count
word_count <- data_select %>%
select(artDate,word) %>%
group_by(artDate,word) %>%
summarise(count=n()) %>% # count each word per day with summarise
filter(count>3) %>% # drop words that appear too rarely
arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
word_count
## # A tibble: 3,049 x 3
## # Groups: artDate [61]
## artDate word count
## <date> <chr> <int>
## 1 2021-03-29 緬甸 321
## 2 2021-03-01 緬甸 271
## 3 2021-02-01 緬甸 268
## 4 2021-03-04 緬甸 238
## 5 2021-03-15 緬甸 165
## 6 2021-03-03 緬甸 146
## 7 2021-03-07 緬甸 140
## 8 2021-03-05 緬甸 133
## 9 2021-03-27 緬甸 131
## 10 2021-03-28 美國 109
## # ... with 3,039 more rows
Words such as 緬甸, 中國, and 台灣 show the highest counts.
word_count_s=data_select %>%
group_by(word) %>%
summarise(count=n()) %>%
arrange(desc(count)) %>%
head(10)
word_count_s %>% ggplot(aes(x=word, y=(count),fill=word))+
geom_col(show.legend = FALSE) +
labs(x = "詞", y = "詞頻") +
ggtitle("文章詞頻前10名")
tokens_count <- word_count %>%
group_by(word) %>%
summarise(sum = n()) %>%
arrange(desc(sum))
tokens_count %>% wordcloud2()
緬甸, 中國, and 台灣 indeed stand out.
jieba_tokenizer = worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
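A minimal check of the bigram tokenizer, on the same hypothetical sample sentence:
# Hypothetical example: the sentence should come back as space-joined word pairs
jieba_bigram("緬甸軍方發動政變")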
data_select02 = Total_Data %>%
filter(!grepl('[[:punct:]]',sentence)) %>% # drop punctuation
filter(!grepl("['^0-9a-z']",sentence)) # drop English letters and digits
coup_bigram <- data_select02 %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
coup_bigram
## artTitle artDate artTime
## 1: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
## 2: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
## 3: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
## 4: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
## 5: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
## ---
## 4430: Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
## 4431: Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
## 4432: Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
## 4433: Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
## 4434: Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
## artUrl artPoster
## 1: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html yoyoflag
## 2: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html yoyoflag
## 3: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html yoyoflag
## 4: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html yoyoflag
## 5: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html yoyoflag
## ---
## 4430: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
## 4431: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
## 4432: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
## 4433: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
## 4434: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
## artCat commentNum push boo cmtContent
## 1: Gossiping 30 7 2 :建議用還蠻血腥的一天
## 2: Gossiping 30 7 2 :建議用還蠻血腥的一天
## 3: Gossiping 30 7 2 :建議用還蠻血腥的一天
## 4: Gossiping 30 7 2 :建議用還蠻血腥的一天
## 5: Gossiping 30 7 2 :建議用還蠻血腥的一天
## ---
## 4430: Gossiping 3 1 0 :緬甸就有一塊全由中共統治的地方,一國兩制。
## 4431: Gossiping 3 1 0 :緬甸就有一塊全由中共統治的地方,一國兩制。
## 4432: Gossiping 3 1 0 :緬甸就有一塊全由中共統治的地方,一國兩制。
## 4433: Gossiping 3 1 0 :緬甸就有一塊全由中共統治的地方,一國兩制。
## 4434: Gossiping 3 1 0 :緬甸就有一塊全由中共統治的地方,一國兩制。
## bigram
## 1: 抗爭 緬甸
## 2: 緬甸 軍方
## 3: 軍方 一次
## 4: 一次 鎮壓
## 5: 鎮壓 記者
## ---
## 4430: 可憐 有點
## 4431: 有點 緬甸
## 4432: 緬甸 人
## 4433: 人 排擠
## 4434: 排擠 樣子
Prominent bigrams include 血腥 一天, 一次 鎮壓, and 抗爭 緬甸.
coup_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
count(bigram, sort = TRUE)
## bigram n
## 1: 血腥 一天 60
## 2: 緬甸 軍方 57
## 3: 一天 紀錄 30
## 4: 一天 軍方 30
## 5: 一次 鎮壓 30
## ---
## 621: 總覺 標題 1
## 622: 賺 錢 1
## 623: 鎮壓 抗議 1
## 624: 護照 想回 1
## 625: 護照 賺 1
P <- read_file("liwc/positive.txt") # positive-word dictionary (txt)
N <- read_file("liwc/negative.txt") # negative-word dictionary (txt)
# Each dictionary file reads in as a single string
typeof(P)
## [1] "character"
# Split the string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words
# Combine the two dictionaries
LIWC = rbind(P, N)
# Inspect the dictionary
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
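A quick membership check against the combined lexicon (a hedged illustration; 和平 is simply an example word):
# Hypothetical lookup: which sentiment class does this word fall under?
LIWC %>% filter(word == "和平")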
MetaData$artDate= MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot()+
geom_line(aes(x=artDate,y=count))+
scale_x_date(labels = date_format("%m/%d"))
February 1 has the highest posting volume.
sentiment_count = data_select %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n()) %>%
arrange(desc(count))
sentiment_count
## # A tibble: 124 x 3
## # Groups: artDate [63]
## artDate sentiment count
## <date> <chr> <int>
## 1 2021-03-29 negative 341
## 2 2021-03-04 negative 306
## 3 2021-03-01 negative 280
## 4 2021-03-07 negative 267
## 5 2021-02-01 negative 245
## 6 2021-03-04 positive 236
## 7 2021-03-29 positive 232
## 8 2021-02-01 positive 213
## 9 2021-03-07 positive 208
## 10 2021-03-01 positive 168
## # ... with 114 more rows
Overall, negative sentiment clearly outweighs positive.
# Check the date range of the data
range(sentiment_count$artDate) #"2021-01-26" "2021-04-26"
## [1] "2021-01-26" "2021-04-26"
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-01-26','2021-04-26'))
)+
# Add vertical lines marking the key dates
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-29'))
[1]])),colour = "red") +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-01'))
[1]])),colour = "red")
sentiment_count %>%
# normalization: daily ratio of each sentiment
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
# plotting
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-02-01','2021-04-26'))
)
Negative views remain the majority.
sentiment_sum <-
word_count %>%
inner_join(LIWC, by = "word") %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame()
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
Among the negative words are 抗議 (protest), 譴責 (condemn), and 鎮壓 (crackdown); among the positive words, 關心 (concern), 支持 (support), and 和平 (peace).
sentiment_sum %>%
acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
comparison.cloud(
colors = c("salmon", "#72bcd4"), # negative, positive (columns are in alphabetical order)
max.words = 50)
# Classify articles by the balance of positive vs. negative words
article_type =
data_select %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
summarise(count=n()) %>%
spread(sentiment,count,fill = 0) %>% # spread positive/negative into columns, filling missing values with 0
mutate(type = case_when(positive > negative ~ "positive",
TRUE ~ "negative")) %>%
data.frame()
article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")
# Count how many articles fall into each class
article_type %>%
group_by(type) %>%
summarise(count = n())
## # A tibble: 2 x 2
## type count
## <chr> <int>
## 1 negative 207
## 2 positive 89
article_type_date %>%
group_by(artDate,type) %>%
summarise(count = n()) %>%
ggplot(aes(x = artDate, y = count, fill = type)) +
geom_bar(stat = "identity", position = "dodge")+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-02-01','2021-04-26'))
)
The sentiment around the event skews negative.
# negative_article:artUrl,word
negative_article <-
article_type %>%
filter(type=="negative")%>%
select(artUrl) %>%
left_join(data_select[,c("artUrl", "word")], by = "artUrl")
# positive_article:artUrl,word
positive_article <-
article_type %>%
filter(type=="positive")%>%
select(artUrl) %>%
left_join(data_select[,c("artUrl", "word")], by = "artUrl")
# Contribution plot for negative-classified articles
negative_article %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = n()
)%>%
arrange(desc(sum)) %>%
data.frame() %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to negative sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
The negative-classified articles feature words such as 抗議, 譴責, and 反抗.
# Contribution plot for positive-classified articles
positive_article %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = n()
)%>%
arrange(desc(sum)) %>%
data.frame() %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to positive sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
The positive-classified articles feature 關心, 和平, and 幫忙.
##################################################### ### 1-2 Connecting to the CoreNLP API
# Build the CoreNLP API URL: turn a local host address into a URL the CoreNLP service accepts
generate_API_url <- function(host, port="9001",
tokenize.whitespace="false", annotators=""){ # tokenization is not whitespace-based
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
}
# Point to where the CoreNLP service is running
host = "127.0.0.1"
generate_API_url(host)
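The helper only interpolates the host, port, and annotator settings into CoreNLP's properties query string; printing the result is a quick way to verify the encoding (a hedged check that needs no running server; the annotator list is just an example):
# Inspect the percent-encoded URL for a hypothetical annotator list
cat(generate_API_url(host, annotators = "tokenize,ssplit"))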
# Call the CoreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="zh",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# Assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# build the API URL
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
# Run each document through the CoreNLP service
coreNLP <- function(data,host){
# Send each document to core-nlp in turn; each returns JSON
# and the parsed results are stored as R objects
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# parse the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# parse the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# parse the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# Visualize the parse result as a tree
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}
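A hedged usage sketch of parse2tree, with a made-up Penn-style parse string of the kind CoreNLP's parse annotator returns:
# Hypothetical parse string; plot() renders the data.tree via DiagrammeR
tree <- parse2tree("(ROOT (IP (NP 軍方) (VP 鎮壓)))")
plot(tree)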
#gc() # free unused memory
#t0 = Sys.time()
#MetaData = fread('test_mid_articleMetaData.csv',encoding = 'UTF-8')
#MetaData %>%
# filter(!grepl("['^0-9a-z']",sentence))
#f =c("我来到北京清华大学","乒乓球拍卖完了","中国科学技术大学")
#f=c("可以看到文章中有中國、美國、台灣、仰光等詞")
#df = data.frame(text=f)
#clean = function(txt) {
#txt = iconv(txt, "latin1", "ASCII", sub="") # change the text encoding
# txt = gsub("(@|#)\\w+", "", txt) # drop @/# followed by word characters (mentions and hashtags)
## txt = gsub("(http|https)://.*", "", txt) # drop URLs (.: any character, *: zero or more)
# txt = gsub("[ \t]{2,}", "", txt) # drop runs of two or more spaces or tabs
# txt = gsub("\\n"," ",txt) # drop line breaks
# txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+: one or more)
# txt = gsub("^\\s+|\\s+$","",txt) # trim leading/trailing whitespace
# txt = gsub("&.*;","",txt) # drop HTML character entities
# txt }
#MetaData$sentence = clean(MetaData$sentence)
#MetaData$text=MetaData$sentence
#df = data.frame(MetaData, header = F, encoding = "UTF-8")
#df$text = clean(df$text)
#obj = df %>% filter(text != "") %>% coreNLP(host) # run against the local server
# The object passed to coreNLP must be a data.frame with a text column
#Sys.time() - t0 # elapsed time
#save.image("coreNLP_all.RData")
#tokens = coreNLP_tokens_parser(obj)
#head(tokens)
#unique(tokens$ner)
#tokens %>%
# filter(ner == "COUNTRY")
#head(tokens)
#tokens %>%
# filter(ner == "COUNTRY") %>% # keep tokens tagged as COUNTRY
# group_by(lower_word) %>% # group by word
# summarize(count = n()) %>% # count each group
# top_n(n = 13, count) %>%
## ungroup() %>%
# mutate(word = reorder(lower_word, count)) %>%
# ggplot(aes(word, count)) +
# geom_col()+
# ggtitle("Word Frequency (NER is COUNTRY)") +
# theme(text=element_text(size=14))+
# coord_flip()
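For reference, a minimal sketch of the input coreNLP() expects, kept commented out like the block above because it needs a running CoreNLP server on the configured host and port:
# df <- data.frame(text = c("緬甸軍方發動政變")) # one document per row, in a text column
# obj <- coreNLP(df, host)
# tokens <- coreNLP_tokens_parser(obj)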
############################################################ ### sentimentr (for English text)
library(sentimentr)
## Warning: package 'sentimentr' was built under R version 4.0.5
set.seed(10)
MetaData = fread('test_mid_articleMetaData.csv',encoding = 'UTF-8')
mytext <- get_sentences(MetaData$sentence) # split the text into a list of sentences
x <- sample(MetaData$sentence, 1000, replace = TRUE) # randomly sample 1,000 posts, with replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # tally how often each appears
sentiment_counts[polarity > 0,] # positive words
## words polarity n
## 1: care 1.00 12
## 2: please 1.00 8
## 3: pro 1.00 6
## 4: nobel 1.00 5
## 5: truth 1.00 4
## 6: understand 1.00 3
## 7: cares 1.00 2
## 8: assistance 0.80 19
## 9: helps 0.80 4
## 10: effective 0.80 4
## 11: civil 0.80 3
## 12: safe 0.75 4
## 13: hero 0.75 4
## 14: good 0.75 4
## 15: protect 0.75 3
## 16: humbly 0.60 6
## 17: fans 0.60 4
## 18: assets 0.50 64
## 19: win 0.50 29
## 20: support 0.50 12
## 21: save 0.50 12
## 22: liberation 0.50 9
## 23: like 0.50 8
## 24: free 0.50 7
## 25: safely 0.50 4
## 26: share 0.50 4
## 27: supports 0.50 4
## 28: appreciate 0.50 4
## 29: solidarity 0.50 4
## 30: democracy 0.40 8
## 31: young 0.40 6
## 32: organization 0.40 3
## 33: global 0.40 2
## 34: food 0.40 2
## 35: leader 0.25 16
## 36: league 0.25 8
## 37: center 0.25 5
## 38: soundly 0.25 4
## 39: feeling 0.25 4
## 40: guts 0.25 2
## 41: pray 0.10 62
## words polarity n
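Beyond word lists, sentimentr can aggregate polarity per post; a hedged sketch reusing the sample x from above:
# Average sentiment score per sampled post
head(sentiment_by(x))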
# Match each article with its comments
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
#Reviews
Total_Data <- MetaData %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除|Gossiping|TVBS新聞網|娛樂中心|看板|問卦", "", sentence))
# Initialize a tokenizer engine
# using the custom dictionary and stop words
jieba_tokenizer = worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
chi_tokenizer01 <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# drop single-character tokens
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# Tokenize, then count each word's occurrences per article
coup_words <- Total_Data %>%
unnest_tokens(word, sentence, token=chi_tokenizer01) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
coup_words
## artUrl word n
## 1: https://www.ptt.cc/bbs/Gossiping/M.1613244038.A.301.html 緬甸 43
## 2: https://www.ptt.cc/bbs/Gossiping/M.1613193695.A.798.html 緬甸 42
## 3: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 軍方 35
## 4: https://www.ptt.cc/bbs/Gossiping/M.1615564184.A.B56.html 緬甸 35
## 5: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 緬甸 33
## ---
## 21837: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 繁體中文 1
## 21838: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 嚴重 1
## 21839: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 嚴重者 1
## 21840: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 嚴格 1
## 21841: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 聽到 1
total_words <- coup_words %>%
group_by(artUrl) %>%
summarize(total = sum(n))
total_words
## # A tibble: 305 x 2
## artUrl total
## <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1611626398.A.E8E.html 149
## 2 https://www.ptt.cc/bbs/Gossiping/M.1611628921.A.CD2.html 120
## 3 https://www.ptt.cc/bbs/Gossiping/M.1611897015.A.61A.html 219
## 4 https://www.ptt.cc/bbs/Gossiping/M.1612136552.A.31F.html 124
## 5 https://www.ptt.cc/bbs/Gossiping/M.1612147806.A.5E8.html 116
## 6 https://www.ptt.cc/bbs/Gossiping/M.1612148853.A.BA3.html 182
## 7 https://www.ptt.cc/bbs/Gossiping/M.1612149990.A.A39.html 272
## 8 https://www.ptt.cc/bbs/Gossiping/M.1612150864.A.8D4.html 42
## 9 https://www.ptt.cc/bbs/Gossiping/M.1612151826.A.0F1.html 85
## 10 https://www.ptt.cc/bbs/Gossiping/M.1612152167.A.B41.html 51
## # ... with 295 more rows
coup_words <- left_join(coup_words, total_words,by = "artUrl")
coup_words_tf_idf <- coup_words %>%
bind_tf_idf(word, artUrl, n)
coup_words_tf_idf
## artUrl word n
## 1: https://www.ptt.cc/bbs/Gossiping/M.1613244038.A.301.html 緬甸 43
## 2: https://www.ptt.cc/bbs/Gossiping/M.1613193695.A.798.html 緬甸 42
## 3: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 軍方 35
## 4: https://www.ptt.cc/bbs/Gossiping/M.1615564184.A.B56.html 緬甸 35
## 5: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 緬甸 33
## ---
## 21837: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 繁體中文 1
## 21838: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 嚴重 1
## 21839: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 嚴重者 1
## 21840: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 嚴格 1
## 21841: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 聽到 1
## total tf idf tf_idf
## 1: 1480 0.02905405 0.02321829 0.0006745855
## 2: 753 0.05577689 0.02321829 0.0012950441
## 3: 582 0.06013746 1.33828514 0.0804810652
## 4: 359 0.09749304 0.02321829 0.0022636216
## 5: 582 0.05670103 0.02321829 0.0013165010
## ---
## 21837: 22 0.04545455 4.11087386 0.1868579029
## 21838: 22 0.04545455 2.77587280 0.1261760362
## 21839: 22 0.04545455 4.33401742 0.1970007916
## 21840: 22 0.04545455 4.33401742 0.1970007916
## 21841: 22 0.04545455 3.92855231 0.1785705594
Note that 緬甸 tops the raw counts yet gets a tiny tf-idf, because it appears in nearly every article.
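As a sanity check on the idf column: bind_tf_idf uses idf = ln(number of articles / number of articles containing the term), and total_words above shows 305 articles, so:
# 緬甸 has idf = 0.0232, which implies it appears in roughly
305 / exp(0.02321829) # ~298 of the 305 articles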
coup_words_tf_idf %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=5) %>%
filter(n > 15) %>%
arrange(desc(artUrl))
## # A tibble: 19 x 7
## # Groups: artUrl [15]
## artUrl word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國 27 301 0.0897 1.21 0.108
## 2 https://www.ptt.cc/bbs/Gossiping/M.16~ 人類 31 2140 0.0145 4.11 0.0596
## 3 https://www.ptt.cc/bbs/Gossiping/M.16~ 不可 19 2140 0.00888 4.62 0.0410
## 4 https://www.ptt.cc/bbs/Gossiping/M.16~ 聖經 16 2140 0.00748 5.03 0.0376
## 5 https://www.ptt.cc/bbs/Gossiping/M.16~ 告訴 21 2140 0.00981 3.42 0.0335
## 6 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍政~ 17 243 0.0700 0.868 0.0607
## 7 https://www.ptt.cc/bbs/Gossiping/M.16~ 美國 25 292 0.0856 1.68 0.144
## 8 https://www.ptt.cc/bbs/Gossiping/M.16~ 台灣 16 292 0.0548 1.04 0.0569
## 9 https://www.ptt.cc/bbs/Gossiping/M.16~ 美國 25 225 0.111 1.68 0.186
## 10 https://www.ptt.cc/bbs/Gossiping/M.16~ 存在 21 651 0.0323 3.24 0.104
## 11 https://www.ptt.cc/bbs/Gossiping/M.16~ 仰光 17 314 0.0541 1.94 0.105
## 12 https://www.ptt.cc/bbs/Gossiping/M.16~ 中共 20 359 0.0557 2.78 0.155
## 13 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國 23 268 0.0858 1.21 0.104
## 14 https://www.ptt.cc/bbs/Gossiping/M.16~ 群眾 16 649 0.0247 3.08 0.0760
## 15 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍方 35 582 0.0601 1.34 0.0805
## 16 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國 29 753 0.0385 1.21 0.0466
## 17 https://www.ptt.cc/bbs/Gossiping/M.16~ 台灣 17 139 0.122 1.04 0.127
## 18 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國 24 263 0.0913 1.21 0.110
## 19 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍方 19 281 0.0676 1.34 0.0905
The per-article keywords include 中國, 美國, 台灣, and 仰光.
coup_words_tf_idf %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=10) %>%
ungroup() %>%
count(word, sort=TRUE)
## # A tibble: 3,261 x 2
## word n
## <chr> <int>
## 1 軍方 17
## 2 中國 16
## 3 親中 10
## 4 美國 9
## 5 軍政府 9
## 6 香港 9
## 7 民主 8
## 8 工廠 7
## 9 台灣 7
## 10 民族 7
## # ... with 3,251 more rows
軍方, 軍政府, and 中國 are the focal points of the discussion.
# Filter out the keywords 緬甸 and 軍政
word_pairs <- coup_words %>%
pairwise_count(word, artUrl, sort = TRUE) %>%
filter(!item1 %in% c("緬甸", "軍政") & !item2 %in% c("緬甸", "軍政"))
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_pairs
## # A tibble: 4,354,160 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 政變 軍政府 60
## 2 軍政府 政變 60
## 3 政府 軍政府 58
## 4 軍政府 政府 58
## 5 軍政府 中國 50
## 6 中國 軍政府 50
## 7 政變 軍方 49
## 8 軍方 政變 49
## 9 軍政府 軍方 47
## 10 軍方 軍政府 47
## # ... with 4,354,150 more rows
Pairs such as (政變, 軍政府), (軍政府, 中國), and (抗議, 軍政府) frequently appear together.
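Each pair appears twice because pairwise_count emits both orderings by default; widyr's upper argument keeps just one direction (a hedged variant of the call above):
# Keep each unordered pair once
coup_words %>%
pairwise_count(word, artUrl, sort = TRUE, upper = FALSE) %>%
filter(!item1 %in% c("緬甸", "軍政") & !item2 %in% c("緬甸", "軍政"))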
word_cors <- coup_words %>%
group_by(word) %>%
filter(n() >= 10) %>%
pairwise_cor(word, artUrl, sort = TRUE)
word_cors
## # A tibble: 117,992 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 綜合 外電報導 0.694
## 2 外電報導 綜合 0.694
## 3 舉行 指出 0.621
## 4 指出 舉行 0.621
## 5 綜合 報導 0.599
## 6 報導 綜合 0.599
## 7 民主聯盟 全國 0.569
## 8 全國 民主聯盟 0.569
## 9 已有 記者 0.569
## 10 記者 已有 0.569
## # ... with 117,982 more rows
word_cors %>%
filter(item1 %in% c("翁山蘇姬", "仰光")) %>%
group_by(item1) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light")) # set a Chinese-capable font so CJK labels render correctly
仰光 (Yangon) and 曼德勒 (Mandalay) are the major cities where the post-coup clashes took place; 國務資政 (State Counsellor) and 領導人 (leader) are both ways of referring to 翁山蘇姬 (Aung San Suu Kyi).
set.seed(2020)
word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # Chinese-capable font so CJK labels render correctly
theme_void()
# Pick a few words as seed words
seed_words <- c("有沒有", "超過", "指出", "已有")
# Set the threshold to 0.5
threshold <- 0.5
# Words correlated with any seed word above the threshold are added to the removal list
remove_words <- word_cors %>%
filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
.$item1 %>%
unique()
remove_words
## [1] "舉行" "指出" "已有" "記者" "報導" "超過" "援助" "活動"
# Remove pairs containing any of those words
word_cors_new <- word_cors %>%
filter(!(item1 %in% remove_words|item2 %in% remove_words))
word_cors_new %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
Commander-in-Chief 敏昂萊 (Min Aung Hlaing) presided over the Armed Forces Day parade in the capital 奈比多 (Naypyidaw).