Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # avoid garbled Chinese output (on Windows you can comment this line out)
## [1] ""
packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr","data.table","wordcloud2","wordcloud","DiagrammeR","magrittr","rtweet",
             "xml2","httr","jsonlite","data.tree")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(readr)
require(dplyr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggplot2)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
require(data.table)
require(wordcloud2)
require(wordcloud)

library(DiagrammeR)
library(magrittr)

library(rtweet)
library(xml2)
library(httr)
library(jsonlite)
library(data.tree)

1. Data Overview

# read in the articles and comments
MetaData = fread('test_mid_articleMetaData.csv',encoding = 'UTF-8')
Reviews  = fread('test_mid_articleReviews.csv',encoding = 'UTF-8')

# join each article with its comments
Total_Data = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
#View(Total_Data)

Count the number of posts per day

data_day <- Total_Data %>% 
  dplyr::select(artDate, artUrl) %>% 
  distinct() %>%   
  group_by(artDate) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count)) %>% top_n(10)

data_day 
## # A tibble: 11 x 2
##    artDate    count
##    <chr>      <int>
##  1 2021/02/01    30
##  2 2021/03/29    23
##  3 2021/03/01    22
##  4 2021/03/05    21
##  5 2021/03/04    16
##  6 2021/02/02    13
##  7 2021/03/28    12
##  8 2021/03/15    11
##  9 2021/02/06     9
## 10 2021/03/08     8
## 11 2021/03/27     8

The day with the highest discussion volume is February 1, the day the Myanmar military launched its coup.

Line chart

data_day$artDate= data_day$artDate %>% as.Date("%Y/%m/%d")
data_day %>%
  ggplot()+
    geom_line(aes(x=artDate,y=count))+
    xlab("日期") + 
    ylab("數量") + 
    geom_vline(xintercept = as.numeric(as.Date("2021-02-01")), col='red', size = 1) + 
    scale_x_date(labels = date_format("%m/%d"))

On February 1 the Myanmar military staged a coup: President Win Myint, de facto leader Aung San Suu Kyi, and several other senior members of the ruling party were arrested, and the military-run TV station declared a state of emergency. On March 27, Armed Forces Day, the military held a parade while people across Myanmar again took to the streets; the military kept suppressing the protests by force, and at least 114 civilians, including teenagers and children, were reportedly shot dead that day, the bloodiest day since the February 1 coup.

2. Data Preprocessing and Analysis

(1) Tokenizing the articles

Set up the tokenizer

# load the user-defined dictionary and stop-word list
jieba_tokenizer <- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")

# define the tokenizer function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
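As a quick sanity check, the tokenizer can be applied to a short sample sentence (the sentence below is made up for illustration; the exact tokens depend on user_dict.txt and stop_words.txt):

# minimal sketch: tokenize one example sentence
customized_tokenizer("緬甸軍方發動政變,多名執政黨高層遭到逮捕")
# returns a list with one character vector of tokens, e.g. "緬甸" "軍方" "政變" ...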

Merge the data

# combine the tokenized articles and comments

MetaData = fread('test_mid_articleMetaData.csv',encoding = 'UTF-8')
Reviews  = fread('test_mid_articleReviews.csv',encoding = 'UTF-8')

MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)

# stack the article tokens and comment tokens together
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")]) 
#View(data)

(2) Basic cleaning

Format the date column

Remove special characters and words with very low frequency

Count each word's frequency per day

# format the date column
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")

# filter out special characters
data_select = data %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
  filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
  filter(nchar(.$word)>1) 
  

# word_count: artDate, word, count
word_count <- data_select %>%
  select(artDate,word) %>%
  group_by(artDate,word) %>%
  summarise(count=n()) %>%  # count each word's occurrences per day
  filter(count>3) %>%  # drop words that appear too rarely
  arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
word_count
## # A tibble: 3,049 x 3
## # Groups:   artDate [61]
##    artDate    word  count
##    <date>     <chr> <int>
##  1 2021-03-29 緬甸    321
##  2 2021-03-01 緬甸    271
##  3 2021-02-01 緬甸    268
##  4 2021-03-04 緬甸    238
##  5 2021-03-15 緬甸    165
##  6 2021-03-03 緬甸    146
##  7 2021-03-07 緬甸    140
##  8 2021-03-05 緬甸    133
##  9 2021-03-27 緬甸    131
## 10 2021-03-28 美國    109
## # ... with 3,039 more rows

Words such as 緬甸 (Myanmar), 中國 (China), and 台灣 (Taiwan) have the highest counts.

word_count_s=data_select %>% 
  group_by(word) %>% 
  summarise(count=n()) %>% 
  arrange(desc(count)) %>% 
  head(10)

word_count_s %>% ggplot(aes(x=word, y=(count),fill=word))+
  geom_col(show.legend = FALSE) +
  labs(x = "詞", y = "詞頻") +
  ggtitle("文章詞頻前10名")

Word cloud

tokens_count <- word_count %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  arrange(desc(sum))
tokens_count %>% wordcloud2()

緬甸, 中國, and 台灣 indeed stand out.

Bigram

jieba_tokenizer = worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")

jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
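# Quick check of the bigram tokenizer on a sample sentence (illustrative only;
# the exact tokens depend on user_dict.txt and stop_words.txt):
# jieba_bigram("軍方武力鎮壓示威民眾")
# e.g. returns bigrams such as "軍方 武力" "武力 鎮壓" "鎮壓 示威" "示威 民眾"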
data_select02 = Total_Data %>% 
  filter(!grepl('[[:punct:]]',sentence)) %>% # remove punctuation
  filter(!grepl("['^0-9a-z']",sentence))  # remove English and digits

coup_bigram <- data_select02 %>%
  unnest_tokens(bigram, sentence, token = jieba_bigram)

coup_bigram
##                                         artTitle    artDate  artTime
##    1: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
##    2: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
##    3: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
##    4: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
##    5: [問卦]寫緬甸新聞的記者一直用""最血腥一天"" 2021/03/27 22:35:04
##   ---                                                               
## 4430:             Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
## 4431:             Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
## 4432:             Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
## 4433:             Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
## 4434:             Re:[問卦]緬甸之亂覺青們在那裡? 2021/02/23 11:29:39
##                                                         artUrl  artPoster
##    1: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html   yoyoflag
##    2: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html   yoyoflag
##    3: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html   yoyoflag
##    4: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html   yoyoflag
##    5: https://www.ptt.cc/bbs/Gossiping/M.1616884506.A.770.html   yoyoflag
##   ---                                                                    
## 4430: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
## 4431: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
## 4432: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
## 4433: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
## 4434: https://www.ptt.cc/bbs/Gossiping/M.1614079781.A.DB4.html joshua0606
##          artCat commentNum push boo                                  cmtContent
##    1: Gossiping         30    7   2                       :建議用還蠻血腥的一天
##    2: Gossiping         30    7   2                       :建議用還蠻血腥的一天
##    3: Gossiping         30    7   2                       :建議用還蠻血腥的一天
##    4: Gossiping         30    7   2                       :建議用還蠻血腥的一天
##    5: Gossiping         30    7   2                       :建議用還蠻血腥的一天
##   ---                                                                          
## 4430: Gossiping          3    1   0 :緬甸就有一塊全由中共統治的地方,一國兩制。
## 4431: Gossiping          3    1   0 :緬甸就有一塊全由中共統治的地方,一國兩制。
## 4432: Gossiping          3    1   0 :緬甸就有一塊全由中共統治的地方,一國兩制。
## 4433: Gossiping          3    1   0 :緬甸就有一塊全由中共統治的地方,一國兩制。
## 4434: Gossiping          3    1   0 :緬甸就有一塊全由中共統治的地方,一國兩制。
##          bigram
##    1: 抗爭 緬甸
##    2: 緬甸 軍方
##    3: 軍方 一次
##    4: 一次 鎮壓
##    5: 鎮壓 記者
##   ---          
## 4430: 可憐 有點
## 4431: 有點 緬甸
## 4432:   緬甸 人
## 4433:   人 排擠
## 4434: 排擠 樣子

Frequent bigrams include 血腥 一天 ("bloodiest day"), 一次 鎮壓, and 抗爭 緬甸.

Count how often each bigram appears

coup_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  count(bigram, sort = TRUE)
##         bigram  n
##   1: 血腥 一天 60
##   2: 緬甸 軍方 57
##   3: 一天 紀錄 30
##   4: 一天 軍方 30
##   5: 一次 鎮壓 30
##  ---             
## 621: 總覺 標題  1
## 622:     賺 錢  1
## 623: 鎮壓 抗議  1
## 624: 護照 想回  1
## 625:   護照 賺  1

3. Sentiment Analysis

Load the sentiment dictionaries

P <- read_file("liwc/positive.txt") # positive-word dictionary (txt)
N <- read_file("liwc/negative.txt") # negative-word dictionary (txt)

# each dictionary is read in as one long string
typeof(P)
## [1] "character"

Split the strings into individual words and combine the two dictionaries

# split each string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# build data frames with two columns, word and sentiment
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words

# combine the two dictionaries
LIWC = rbind(P, N)

# inspect the dictionary
head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive
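A quick size check on the combined dictionary; the totals should match the counts noted in the comments above (664 positive and 1047 negative entries):

table(LIWC$sentiment)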

Daily post counts (line chart)

MetaData$artDate= MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot()+
    geom_line(aes(x=artDate,y=count))+
    scale_x_date(labels = date_format("%m/%d"))

February 1 has the most posts.

Daily sentiment totals (sentiment_count)

sentiment_count = data_select %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=n())  %>% 
  arrange(desc(count))
sentiment_count
## # A tibble: 124 x 3
## # Groups:   artDate [63]
##    artDate    sentiment count
##    <date>     <chr>     <int>
##  1 2021-03-29 negative    341
##  2 2021-03-04 negative    306
##  3 2021-03-01 negative    280
##  4 2021-03-07 negative    267
##  5 2021-02-01 negative    245
##  6 2021-03-04 positive    236
##  7 2021-03-29 positive    232
##  8 2021-02-01 positive    213
##  9 2021-03-07 positive    208
## 10 2021-03-01 positive    168
## # ... with 114 more rows

Overall, negative sentiment is more common.

# check the date range of the data
range(sentiment_count$artDate) #"2021-01-26" "2021-04-26"
## [1] "2021-01-26" "2021-04-26"

Plot the line chart

sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-01-26','2021-04-26'))
               )+
  # add vertical lines marking the key dates
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-29'))[1]])), colour = "red") +
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-01'))[1]])), colour = "red") 

Line chart of the positive/negative sentiment ratio

sentiment_count %>% 
  # normalize counts within each day
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # plot
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-01','2021-04-26'))
               )

Negative sentiment makes up the larger share.

Representative positive and negative sentiment words

sentiment_sum <- 
  word_count %>%
    inner_join(LIWC, by = "word") %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() 
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

Count the sentiment words and plot them

sentiment_sum %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

The negative words include 抗議 (protest), 譴責 (condemn), and 鎮壓 (crackdown); the positive words include 關心 (concern), 支持 (support), and 和平 (peace).

sentiment_sum %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # negative, positive (acast orders the columns alphabetically)
    max.words = 50)

Number of positive and negative articles

# classify each article by whether its positive or negative words dominate
article_type = 
  data_select %>%
  inner_join(LIWC) %>% 
  group_by(artUrl,sentiment) %>%
  summarise(count=n()) %>%
  spread(sentiment,count,fill = 0) %>% # spread sentiments into columns; fill missing values with 0
  mutate(type = case_when(positive > negative ~ "positive", 
                             TRUE ~ "negative")) %>%
  data.frame() 
article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")

# count the number of articles of each type
article_type %>%
  group_by(type) %>%
  summarise(count = n())
## # A tibble: 2 x 2
##   type     count
##   <chr>    <int>
## 1 negative   207
## 2 positive    89

Bar chart of daily positive/negative article counts

article_type_date %>%
  group_by(artDate,type) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artDate, y = count, fill = type)) + 
  geom_bar(stat = "identity", position = "dodge")+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-01','2021-04-26'))
               )

The sentiment around the event skews negative.

Pick out the positive and negative articles and join them back to the tokenized data.

# negative_article: artUrl, word
negative_article <-
article_type %>%
  filter(type=="negative")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

# positive_article:artUrl,word
positive_article <-
article_type %>%
  filter(type=="positive")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

Sentiment keywords: negative articles

# contribution plot for keywords in negative articles
negative_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(
    sum = n()
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to negative sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

Negative keywords include 抗議 (protest), 譴責 (condemn), and 反抗 (resist).

Sentiment keywords: positive articles

# contribution plot for keywords in positive articles
positive_article %>%
  inner_join(LIWC) %>%
  group_by(word, sentiment) %>%
  summarise(
    sum = n()
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to positive sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

Positive keywords include 關心 (concern), 和平 (peace), and 幫忙 (help).

### 1-2 Connecting to the CoreNLP API (the Chinese results looked odd, so the API calls further below are left commented out)

# build the CoreNLP API url: turn the local server address into a url the CoreNLP service accepts
generate_API_url <- function(host, port="9001",
                    tokenize.whitespace="false", annotators=""){ # tokenization is not based on whitespace
    url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
    url <- URLencode(url)
}
# location of the coreNLP service
host = "127.0.0.1"

generate_API_url(host)
# call the coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="zh",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001);
  # build the api url
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}
# run each document through the coreNLP service
coreNLP <- function(data,host){
  # send each document to core-nlp in turn; each document comes back as JSON
  # the results are collected into a list of R objects
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}
coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence of the document
    sentences <- doc$sentences
   
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}
coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence of the document
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}
coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence of the document
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    sentiment <- original_data %>%
      t() %>% 
      data.frame() %>% 
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
  
    sentiment
  }))
  return(result)
}
# display the parse result as a tree
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}
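# Hypothetical usage of parse2tree(), assuming `obj` is the result of coreNLP() below
# and the "parse" annotator was enabled (kept commented out, like the rest of this section):
# parse_str <- obj[[1]]$doc$sentences[[1]]$parse
# tree <- parse2tree(parse_str)
# print(tree)  # data.tree prints the constituency tree node by node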
#gc() # free memory that is no longer used

#t0 = Sys.time()

#MetaData = fread('test_mid_articleMetaData.csv',encoding = 'UTF-8')
#MetaData %>% 
#  filter(!grepl("['^0-9a-z']",sentence))

#f =c("我来到北京清华大学","乒乓球拍卖完了","中国科学技术大学")
#f=c("可以看到文章中有中國、美國、台灣、仰光等詞")
#df = data.frame(text=f)

#clean = function(txt) {
#  txt = iconv(txt, "latin1", "ASCII", sub="") # change the text encoding
#  txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags
#  txt = gsub("(http|https)://.*", "", txt) # remove urls
#  txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces/tabs
#  txt = gsub("\\n"," ",txt) # remove line breaks
#  txt = gsub("\\s+"," ",txt) # collapse repeated whitespace
#  txt = gsub("^\\s+|\\s+$","",txt) # trim leading/trailing whitespace
#  txt = gsub("&.*;","",txt) # remove html character entities
#  txt }

#MetaData$sentence  = clean(MetaData$sentence)
#MetaData$text=MetaData$sentence
#df = data.frame(MetaData, header = F, encoding = "UTF-8")
#df$text  = clean(df$text)

#obj =  df  %>% filter(text != "") %>% coreNLP(host) # run against the local server
# objects passed to coreNLP() must be a data.frame with a text column

#Sys.time() - t0 # elapsed time
#save.image("coreNLP_all.RData")
#tokens =  coreNLP_tokens_parser(obj)
#head(tokens)
#unique(tokens$ner)
#tokens %>%
 # filter(ner == "COUNTRY")
#head(tokens)
#tokens %>%
#  filter(ner == "COUNTRY") %>%  # keep tokens whose NER tag is COUNTRY
#  group_by(lower_word) %>%      # group by word
#  summarize(count = n()) %>%    # count each group
#  top_n(n = 13, count) %>%
#  ungroup() %>% 
#  mutate(word = reorder(lower_word, count)) %>%
#  ggplot(aes(word, count)) + 
#  geom_col()+
#  ggtitle("Word Frequency (NER is COUNTRY)") +
#  theme(text=element_text(size=14))+
#  coord_flip()

### sentimentr (works on English text); the analysis resumes from here

library(sentimentr)
## Warning: package 'sentimentr' was built under R version 4.0.5
set.seed(10)
MetaData = fread('test_mid_articleMetaData.csv',encoding = 'UTF-8')
mytext <- get_sentences(MetaData$sentence) # split the text into sentences
x <- sample(MetaData$sentence, 1000, replace = TRUE) # randomly sample 1000 sentences (with replacement)
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing terms
sentiment_counts <- attributes(sentiment_words)$counts # count how often each term appears
sentiment_counts[polarity > 0,]   # positive terms
##            words polarity  n
##  1:         care     1.00 12
##  2:       please     1.00  8
##  3:          pro     1.00  6
##  4:        nobel     1.00  5
##  5:        truth     1.00  4
##  6:   understand     1.00  3
##  7:        cares     1.00  2
##  8:   assistance     0.80 19
##  9:        helps     0.80  4
## 10:    effective     0.80  4
## 11:        civil     0.80  3
## 12:         safe     0.75  4
## 13:         hero     0.75  4
## 14:         good     0.75  4
## 15:      protect     0.75  3
## 16:       humbly     0.60  6
## 17:         fans     0.60  4
## 18:       assets     0.50 64
## 19:          win     0.50 29
## 20:      support     0.50 12
## 21:         save     0.50 12
## 22:   liberation     0.50  9
## 23:         like     0.50  8
## 24:         free     0.50  7
## 25:       safely     0.50  4
## 26:        share     0.50  4
## 27:     supports     0.50  4
## 28:   appreciate     0.50  4
## 29:   solidarity     0.50  4
## 30:    democracy     0.40  8
## 31:        young     0.40  6
## 32: organization     0.40  3
## 33:       global     0.40  2
## 34:         food     0.40  2
## 35:       leader     0.25 16
## 36:       league     0.25  8
## 37:       center     0.25  5
## 38:      soundly     0.25  4
## 39:      feeling     0.25  4
## 40:         guts     0.25  2
## 41:         pray     0.10 62
##            words polarity  n
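The negative terms can be inspected the same way; a minimal sketch (output omitted here):

sentiment_counts[polarity < 0,]   # negative terms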

4. Word Correlations

Remove unwanted recurring boilerplate words

# join each article with its comments
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
#Reviews
Total_Data <- MetaData %>% 
  mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除|Gossiping|TVBS新聞網|娛樂中心|看板|問卦", "", sentence))
# initialize the tokenizer with the user dictionary and stop-word list

jieba_tokenizer = worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
chi_tokenizer01 <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # drop single-character tokens
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
# tokenize and count each word's occurrences in each article

coup_words <- Total_Data %>%
  unnest_tokens(word, sentence, token=chi_tokenizer01) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
coup_words
##                                                          artUrl     word  n
##     1: https://www.ptt.cc/bbs/Gossiping/M.1613244038.A.301.html     緬甸 43
##     2: https://www.ptt.cc/bbs/Gossiping/M.1613193695.A.798.html     緬甸 42
##     3: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html     軍方 35
##     4: https://www.ptt.cc/bbs/Gossiping/M.1615564184.A.B56.html     緬甸 35
##     5: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html     緬甸 33
##    ---                                                                     
## 21837: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 繁體中文  1
## 21838: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html     嚴重  1
## 21839: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html   嚴重者  1
## 21840: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html     嚴格  1
## 21841: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html     聽到  1

Count the total number of words in each article

total_words <- coup_words %>% 
  group_by(artUrl) %>% 
  summarize(total = sum(n))
total_words
## # A tibble: 305 x 2
##    artUrl                                                   total
##    <chr>                                                    <int>
##  1 https://www.ptt.cc/bbs/Gossiping/M.1611626398.A.E8E.html   149
##  2 https://www.ptt.cc/bbs/Gossiping/M.1611628921.A.CD2.html   120
##  3 https://www.ptt.cc/bbs/Gossiping/M.1611897015.A.61A.html   219
##  4 https://www.ptt.cc/bbs/Gossiping/M.1612136552.A.31F.html   124
##  5 https://www.ptt.cc/bbs/Gossiping/M.1612147806.A.5E8.html   116
##  6 https://www.ptt.cc/bbs/Gossiping/M.1612148853.A.BA3.html   182
##  7 https://www.ptt.cc/bbs/Gossiping/M.1612149990.A.A39.html   272
##  8 https://www.ptt.cc/bbs/Gossiping/M.1612150864.A.8D4.html    42
##  9 https://www.ptt.cc/bbs/Gossiping/M.1612151826.A.0F1.html    85
## 10 https://www.ptt.cc/bbs/Gossiping/M.1612152167.A.B41.html    51
## # ... with 295 more rows

Join the per-article word totals back onto the word counts

coup_words <- left_join(coup_words, total_words,by = "artUrl")

Compute the tf-idf values

coup_words_tf_idf <- coup_words %>%
  bind_tf_idf(word, artUrl, n)
coup_words_tf_idf
##                                                          artUrl     word  n
##     1: https://www.ptt.cc/bbs/Gossiping/M.1613244038.A.301.html     緬甸 43
##     2: https://www.ptt.cc/bbs/Gossiping/M.1613193695.A.798.html     緬甸 42
##     3: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html     軍方 35
##     4: https://www.ptt.cc/bbs/Gossiping/M.1615564184.A.B56.html     緬甸 35
##     5: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html     緬甸 33
##    ---                                                                     
## 21837: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html 繁體中文  1
## 21838: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html     嚴重  1
## 21839: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html   嚴重者  1
## 21840: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html     嚴格  1
## 21841: https://www.ptt.cc/bbs/Gossiping/M.1619407530.A.9B1.html     聽到  1
##        total         tf        idf       tf_idf
##     1:  1480 0.02905405 0.02321829 0.0006745855
##     2:   753 0.05577689 0.02321829 0.0012950441
##     3:   582 0.06013746 1.33828514 0.0804810652
##     4:   359 0.09749304 0.02321829 0.0022636216
##     5:   582 0.05670103 0.02321829 0.0013165010
##    ---                                         
## 21837:    22 0.04545455 4.11087386 0.1868579029
## 21838:    22 0.04545455 2.77587280 0.1261760362
## 21839:    22 0.04545455 4.33401742 0.1970007916
## 21840:    22 0.04545455 4.33401742 0.1970007916
## 21841:    22 0.04545455 3.92855231 0.1785705594

Frequent words such as 緬甸 appear in almost every article, so their idf, and hence tf-idf, is close to zero.
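bind_tf_idf() uses tf = n / total (the word's share of its article) and idf = ln(number of articles / number of articles containing the word). A minimal sketch that recomputes the values by hand, for comparison with the columns above:

# recompute tf-idf manually to confirm what bind_tf_idf() does
n_docs <- n_distinct(coup_words$artUrl)
coup_words %>%
  group_by(word) %>%
  mutate(idf = log(n_docs / n_distinct(artUrl))) %>%
  ungroup() %>%
  mutate(tf = n / total,
         tf_idf = tf * idf)   # should match the tf, idf, tf_idf columns above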

For each article, pick the five words with the highest tf-idf, keeping only words that appear more than 15 times

coup_words_tf_idf %>% 
  group_by(artUrl) %>%
  slice_max(tf_idf, n=5) %>%
  filter(n > 15) %>% 
  arrange(desc(artUrl))
## # A tibble: 19 x 7
## # Groups:   artUrl [15]
##    artUrl                                 word      n total      tf   idf tf_idf
##    <chr>                                  <chr> <int> <int>   <dbl> <dbl>  <dbl>
##  1 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國     27   301 0.0897  1.21  0.108 
##  2 https://www.ptt.cc/bbs/Gossiping/M.16~ 人類     31  2140 0.0145  4.11  0.0596
##  3 https://www.ptt.cc/bbs/Gossiping/M.16~ 不可     19  2140 0.00888 4.62  0.0410
##  4 https://www.ptt.cc/bbs/Gossiping/M.16~ 聖經     16  2140 0.00748 5.03  0.0376
##  5 https://www.ptt.cc/bbs/Gossiping/M.16~ 告訴     21  2140 0.00981 3.42  0.0335
##  6 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍政~    17   243 0.0700  0.868 0.0607
##  7 https://www.ptt.cc/bbs/Gossiping/M.16~ 美國     25   292 0.0856  1.68  0.144 
##  8 https://www.ptt.cc/bbs/Gossiping/M.16~ 台灣     16   292 0.0548  1.04  0.0569
##  9 https://www.ptt.cc/bbs/Gossiping/M.16~ 美國     25   225 0.111   1.68  0.186 
## 10 https://www.ptt.cc/bbs/Gossiping/M.16~ 存在     21   651 0.0323  3.24  0.104 
## 11 https://www.ptt.cc/bbs/Gossiping/M.16~ 仰光     17   314 0.0541  1.94  0.105 
## 12 https://www.ptt.cc/bbs/Gossiping/M.16~ 中共     20   359 0.0557  2.78  0.155 
## 13 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國     23   268 0.0858  1.21  0.104 
## 14 https://www.ptt.cc/bbs/Gossiping/M.16~ 群眾     16   649 0.0247  3.08  0.0760
## 15 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍方     35   582 0.0601  1.34  0.0805
## 16 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國     29   753 0.0385  1.21  0.0466
## 17 https://www.ptt.cc/bbs/Gossiping/M.16~ 台灣     17   139 0.122   1.04  0.127 
## 18 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國     24   263 0.0913  1.21  0.110 
## 19 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍方     19   281 0.0676  1.34  0.0905

Words such as 中國, 美國, 台灣, and 仰光 (Yangon) stand out in these articles.

Pick the ten words with the highest tf-idf from each article, and count how often each word is selected

coup_words_tf_idf %>% 
  group_by(artUrl) %>%
  slice_max(tf_idf, n=10) %>%
  ungroup() %>%
  count(word, sort=TRUE)
## # A tibble: 3,261 x 2
##    word       n
##    <chr>  <int>
##  1 軍方      17
##  2 中國      16
##  3 親中      10
##  4 美國       9
##  5 軍政府     9
##  6 香港       9
##  7 民主       8
##  8 工廠       7
##  9 台灣       7
## 10 民族       7
## # ... with 3,251 more rows

軍方 (the military), 軍政府 (the junta), and 中國 (China) are the main focus of discussion.

jiebaR and ngrams

#jieba_tokenizer = worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")

#chi_tokenizer <- function(t) {
#  lapply(t, function(x) {
#    if(nchar(x)>1){
#      tokens <- segment(x, jieba_tokenizer)
#      # drop single-character tokens
#      tokens <- tokens[nchar(tokens)>1]
#      return(tokens)
#    }
#  })
#}

Count how many times pairs of words appear in the same article

# filter out the keywords 緬甸 and 軍政
word_pairs <- coup_words %>%
  pairwise_count(word, artUrl, sort = TRUE) %>% 
  filter(!item1 %in% c("緬甸", "軍政") & !item2 %in% c("緬甸", "軍政"))
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_pairs
## # A tibble: 4,354,160 x 3
##    item1  item2      n
##    <chr>  <chr>  <dbl>
##  1 政變   軍政府    60
##  2 軍政府 政變      60
##  3 政府   軍政府    58
##  4 軍政府 政府      58
##  5 軍政府 中國      50
##  6 中國   軍政府    50
##  7 政變   軍方      49
##  8 軍方   政變      49
##  9 軍政府 軍方      47
## 10 軍方   軍政府    47
## # ... with 4,354,150 more rows

Pairs such as (政變, 軍政府), (軍政府, 中國), and (政變, 軍方) frequently appear together.

Compute the correlation between pairs of words

word_cors <- coup_words %>%
  group_by(word) %>%
  filter(n() >= 10) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

word_cors
## # A tibble: 117,992 x 3
##    item1    item2    correlation
##    <chr>    <chr>          <dbl>
##  1 綜合     外電報導       0.694
##  2 外電報導 綜合           0.694
##  3 舉行     指出           0.621
##  4 指出     舉行           0.621
##  5 綜合     報導           0.599
##  6 報導     綜合           0.599
##  7 民主聯盟 全國           0.569
##  8 全國     民主聯盟       0.569
##  9 已有     記者           0.569
## 10 記者     已有           0.569
## # ... with 117,982 more rows
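pairwise_cor() computes the phi coefficient for each pair of words, i.e. the Pearson correlation between their presence/absence indicators across articles. A minimal sketch that reproduces one value by hand (the word pair is taken from the first row above):

# build 0/1 presence indicators for 綜合 and 外電報導 across all articles
presence <- coup_words %>%
  distinct(artUrl) %>%
  mutate(w1 = artUrl %in% coup_words$artUrl[coup_words$word == "綜合"],
         w2 = artUrl %in% coup_words$artUrl[coup_words$word == "外電報導"])
cor(presence$w1, presence$w2)  # should approximately reproduce the 0.694 reported above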


Find the 15 words most highly correlated with 翁山蘇姬 (Aung San Suu Kyi) and 仰光 (Yangon), two terms central to the coup

word_cors %>%
  filter(item1 %in% c("翁山蘇姬", "仰光")) %>%
  group_by(item1) %>%
  top_n(15) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()+ 
  theme(text = element_text(family = "Heiti TC Light")) # set a Chinese font so the labels display correctly

仰光 (Yangon) and 曼德勒 (Mandalay) are the major cities where clashes broke out; 國務資政 (State Counsellor) and 領導人 (leader) are both titles used for Aung San Suu Kyi.

Draw a word network of the pairs with correlation above 0.5

set.seed(2020)

word_cors %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so the labels display correctly
  theme_void()

Remove irrelevant filler words (using seed words such as 有沒有, 超過, 指出, 已有)

# pick a few words as seed words
seed_words <- c("有沒有", "超過", "指出", "已有")
# set the threshold to 0.5
threshold <- 0.5
# words whose correlation with any seed word exceeds the threshold are added to the removal list
remove_words <- word_cors %>%
                filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
                .$item1 %>%
                unique()
remove_words
## [1] "舉行" "指出" "已有" "記者" "報導" "超過" "援助" "活動"

Redraw the word network (correlation > 0.5) after removing those words

# drop the pairs that involve any of the removed words
word_cors_new <- word_cors %>%
                filter(!(item1 %in% remove_words|item2 %in% remove_words))

word_cors_new %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()

Commander-in-chief Min Aung Hlaing (敏昂萊) presided over the Armed Forces Day parade in the capital, Naypyidaw.