避免亂碼

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼(Windows系統可將這行註解)

## [1] ""

下載需要的套件

packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr","data.table","wordcloud2","wordcloud","DiagrammeR","magrittr","rtweet",
             "xml2","httr","jsonlite","data.tree","sentimentr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

require(readr)
require(dplyr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggplot2)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
require(data.table)
require(wordcloud2)
require(wordcloud)
library(DiagrammeR)
library(magrittr)
library(rtweet)
library(xml2)
library(httr)
library(jsonlite)
library(data.tree)
library(sentimentr)

動機與目的

香港反送中、泰國抗爭造成了國際眾大關注，也間接影響到各國對於民主與專制的討論。這次的緬甸抗爭引起了國際的注意，為了解國際情勢，了解東西方國家對於這次的關心程度，為此我們決定研究這次緬甸抗爭在Twitter和PPT上的討論程度。

一、 Twitter

資料收集：tweets

資料來源:Twitter，4/25~5/02，5000筆，English

(1)Twitter API設定

透過rtweet抓取tweets

app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app,consumer_key, consumer_secret,
                    access_token, access_secret,set_renv = FALSE)
#Consumer Keys:知道你的身分
#Authentication Tokens:認證給你的授權

load("coreNLP.RData")

(2)設定關鍵字抓tweets

關鍵字:“Myanmar”、“coup”

# 查詢關鍵字
key = c("#Myanmar")
context = "coup"
q = paste(c(key,context),collapse=" AND ")   
# 查詢字詞 "#Myanmar AND coup"
# 為了避免只下#Myanmar 會找到非在coup中的tweets，加入Suez要同時出現的條件

#抓5000筆 不抓轉推
tweets = search_tweets(q,lang="en",n=5000,include_rts = FALSE,token = twitter_token)

(3)tweets內容清理

## 用於資料清理
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") #改變字的encoding
  txt = gsub("(@|#)\\w+", "", txt) #去除@或#後有數字,字母,底線 (標記人名或hashtag)
  txt = gsub("(http|https)://.*", "", txt) #去除網址(.:任意字元，*:0次以上)
  txt = gsub("[ \t]{2,}", "", txt) #去除兩個以上空格或tab
  txt = gsub("\\n"," ",txt) #去除換行
  txt = gsub("\\s+"," ",txt) #去除一個或多個空格(+:一次以上)
  txt = gsub("^\\s+|\\s+$","",txt) #去除開頭/結尾有一個或多個空格
  txt = gsub("&.*;","",txt) #去除html特殊字元編碼
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt) #除了字母,數字空白?!.的都去掉(表情符號去掉)
  txt }


tweets$text = clean(tweets$text)  #text套用資料清理

df = data.frame()
  
df = rbind(df,tweets)  # transfer to data frame

df = df[!duplicated(df[,"status_id"]),]  #去除重複的tweets

df共有90個欄位，但我們在這裡僅會使用幾個欄位:

user_id: 用戶id
status_id : 推文id
created_at : 發文時間
text : 推文內容
source : 發文來源

(4)了解資料的資料筆數以及時間分布

created_at已經是一個date類型的欄位，因此可以直接用min,max來看最遠或最近的日期
註:rtweet最多只能抓到距今10天的資料

nrow(df)

## [1] 3675

min(df$created_at)

## [1] "2021-04-25"

max(df$created_at)

## [1] "2021-05-02"

日期折線圖

目的是計算出每一天文章的發表數量，可以看出特定主題討論的熱度。

df$created_at <- df$created_at %>% as.Date("%Y/%m/%d")
data <- df %>% 
  dplyr::select(created_at, urls_url)
  # distinct()

select(): 我們只需要文章以及日期兩個欄位即可，其他欄位不需要。
distinct(): 一篇文章有很多個詞彙，所以會有很多列，但我們只需要一篇文章保留一個列即可。

article_count_by_date <- data %>% 
  group_by(created_at) %>% 
  summarise(count = n())
article_count_by_date

## # A tibble: 8 x 2
##   created_at count
##   <date>     <int>
## 1 2021-04-25   229
## 2 2021-04-26   194
## 3 2021-04-27   351
## 4 2021-04-28  2209
## 5 2021-04-29   155
## 6 2021-04-30   181
## 7 2021-05-01   157
## 8 2021-05-02   199

plot_date <- 
  # data
  article_count_by_date %>% 
  # aesthetics
  ggplot(aes(x = created_at, y = count)) +
  # geometrics
  geom_line(color = "#00AFBB", size = 1) + 
  # coordinates
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("Twitter 討論文章數") + 
  xlab("日期") + 
  ylab("數量") + 
  # theme
  theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定，避免中文字顯示錯誤。

plot_date

由此可看出，4/28討論度最高。根據新聞可知4/27軍方與武裝少數民族 - 克倫民族佔領軍方基地，並表示是2/1以來最嚴重的一次衝突。並且4/28當天緬甸反政變人士所組成的親民族團結政府表示在可以舉行任何有建設性的對話之前軍方需先無條件釋放政治犯，包括民族團結政府總統溫敏及國務資政翁山蘇姬。此外聯合國指出因為COVID-19及政變可能會呈嚴重飢荒以及大批難民。

(5)計算詞頻

計算每篇關鍵字的出現次數

df_word <- df %>%
  unnest_tokens(word, text) %>% 
  anti_join(stop_words) %>% 
  # filter(!(word %in% c("dont","myanmar","coup"))) %>% 
  count(user_id, created_at, word, sort = TRUE)

## Joining, by = "word"

df_word

## # A tibble: 40,831 x 4
##    user_id             created_at word              n
##    <chr>               <date>     <chr>         <int>
##  1 1360227293617037313 2021-05-02 coupeliminate    42
##  2 1360227293617037313 2021-05-02 protest          42
##  3 1360227293617037313 2021-05-02 youth            41
##  4 178312978           2021-04-26 care             21
##  5 178312978           2021-04-26 china            21
##  6 178312978           2021-04-26 chinarussia      21
##  7 178312978           2021-04-26 coup             21
##  8 178312978           2021-04-26 dominate         21
##  9 178312978           2021-04-26 dont             21
## 10 178312978           2021-04-26 mastermind       21
## # ... with 40,821 more rows

計算詞彙的出現次數，如果詞彙只有一個字則不列入計算

words_count <- df_word %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>10) %>%
  arrange(desc(sum))
words_count

## # A tibble: 276 x 2
##    word        sum
##    <chr>     <int>
##  1 coup       2536
##  2 military   1587
##  3 internet   1247
##  4 strike     1233
##  5 situation  1211
##  6 anti       1202
##  7 current    1200
##  8 inagainst  1199
##  9 reopen     1198
## 10 schools    1198
## # ... with 266 more rows

將文章詞頻前10名畫成長條圖

words_count %>% 
  mutate(word = reorder(word, sum)) %>%
  head(10) %>% 
  # 在 ggplot 定義 x, y 軸，分別是 word 與 n
  ggplot(aes(sum, word,fill=word)) +
  geom_col(show.legend = FALSE) +
  labs(x = "詞頻", y = "詞") +
  ggtitle("文章詞頻前10名")

將關鍵詞透過文字雲呈現

words_count %>% wordcloud2()

(6)情緒分析

library(tidyr)

sentiment <- df_word %>%
  inner_join(get_sentiments("bing")) %>%
  count(user_id, sentiment,created_at) %>%
  # spread(sentiment, n, fill = 0) %>% 與下一列等同 
  # 在不同的列中得到正面和負面的情感
  pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
  # 計算情感差距 (正面-負面)
  mutate(sentiment = positive - negative)

透過「bing」情緒字典計算正面與負面詞數

get_sentiments("bing") %>% 
  count(sentiment)

## # A tibble: 2 x 2
##   sentiment     n
##   <chr>     <int>
## 1 negative   4781
## 2 positive   2005

計算各正負面詞出現次數

bing_word_counts <- df_word %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()

## Joining, by = "word"

bing_word_counts

## # A tibble: 471 x 3
##    word     sentiment     n
##    <chr>    <chr>     <int>
##  1 strike   negative   1233
##  2 lost     negative    928
##  3 win      positive    211
##  4 protest  negative     76
##  5 protests negative     65
##  6 killed   negative     55
##  7 brutal   negative     45
##  8 crisis   negative     45
##  9 limited  negative     37
## 10 poverty  negative     37
## # ... with 461 more rows

將正負面詞繪製成長條圖

bing_word_counts %>%
  group_by(sentiment) %>%
  slice_max(n, n = 10) %>% 
  ungroup() %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(n, word, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(x = "Contribution to sentiment",
       y = NULL)

圖中顯示罷工為負面字詞最多的原因是：緬甸人民打算罷工至緬甸所有民主得以伸張後才會停止。

(7)串接CoreNLP API

API呼叫的設定

server端 :

需先在terminal開啟corenlp server
在corenlp的路徑下開啟terminal輸入 java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

# 產生coreNLP的api url，將本地端的網址轉成符合coreNLP服務的url
generate_API_url <- function(host, port="9000",
                    tokenize.whitespace="false", annotators=""){ #斷詞依據不是空格
    url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
    url <- URLencode(url)
}
#指定服務的位置
host = "127.0.0.1"

generate_API_url(host)

# 呼叫coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # 假設有兩個core-nlp server、一個負責英文（使用9000 port）、另一個則負責中文（使用9001 port）
  port <- ifelse(language=="eng", 9000, 9001);
  # 產生api網址
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}

#文件使用coreNLP服務
coreNLP <- function(data,host){
  # 依序將每個文件丟進core-nlp進行處理，每份文件的回傳結果為json格式
  # 在R中使用objects來儲存處理結果
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}

資料整理function

從回傳的object中整理斷詞出結果，輸出為 tidydata 格式

coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for a sentences
    sentences <- doc$sentences
   
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}

從回傳的core-nlp object中整理出詞彙依存關係，輸出為 tidydata 格式

coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for a sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}

從回傳的core-nlp object中整理出語句情緒，輸出為 tidydata 格式

coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for a sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    sentiment <- original_data %>%
      t() %>% 
      data.frame() %>% 
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
  
    sentiment
  }))
  return(result)
}

圖形化 Dependency tree

程式參考來源：https://stackoverflow.com/questions/35496560/how-to-convert-corenlp-generated-parse-tree-into-data-tree-r-package

# 圖形化顯示dependency結果
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

將句子丟入服務

取得coreNLP回傳的物件
先不要跑這段，會花大概半小時（如果你記憶體只有4G可能會當掉）

gc() #釋放不使用的記憶體

t0 = Sys.time()
obj = df[,c(2,5)]  %>% filter(text != "") %>% coreNLP(host) #丟入本地執行
#丟入coreNLP的物件 必須符合: 是一個data.frame 有一個text欄位

Sys.time() - t0 #執行時間
#Time difference of 28 mins

save.image("coreNLP.RData")

#先將會用到的東西存下來，要用可直接載RData
tokens =  coreNLP_tokens_parser(obj)
dependencies = coreNLP_dependency_parser(obj)
sentiment = coreNLP_sentiment_parser(obj)
save.image("coreNLP.RData")

提取結果

斷詞、詞彙還原、詞性標註、NER

tokens =  coreNLP_tokens_parser(obj)

head(tokens,20)

##              status_id         word        lemma pos    ner
## 1  1388922737561722881          The          the  DT      O
## 2  1388922737561722881      turmoil      turmoil  NN      O
## 3  1388922737561722881    following       follow VBG      O
## 4  1388922737561722881          the          the  DT      O
## 5  1388922737561722881     military     military  JJ      O
## 6  1388922737561722881         coup         coup  NN      O
## 7  1388922737561722881    incoupled     incouple VBN      O
## 8  1388922737561722881         with         with  IN      O
## 9  1388922737561722881          the          the  DT      O
## 10 1388922737561722881       impact       impact  NN      O
## 11 1388922737561722881      ofcould      ofcould  NN      O
## 12 1388922737561722881       result       result  NN      O
## 13 1388922737561722881           in           in  IN      O
## 14 1388922737561722881           up           up  RB      O
## 15 1388922737561722881           to           to  IN      O
## 16 1388922737561722881           25           25  CD NUMBER
## 17 1388922737561722881      million      million  CD NUMBER
## 18 1388922737561722881 peoplenearly peoplenearly  JJ      O
## 19 1388922737561722881         half         half  NN      O
## 20 1388922737561722881           of           of  IN      O

coreNLP_tokens_parser欄位:

status_id : 對應原本df裡的status_id，為一則tweets的唯一id
word: 原始斷詞
lemma : 對斷詞做詞形還原
pos : part-of-speech,詞性
ner: 命名實體
命名實體標註(NER) 從NER查看特定類型的實體，辨識出哪幾種類型

unique(tokens$ner)

##  [1] "O"                 "NUMBER"            "DATE"             
##  [4] "ORGANIZATION"      "PERSON"            "TITLE"            
##  [7] "CAUSE_OF_DEATH"    "DURATION"          "NATIONALITY"      
## [10] "MISC"              "CITY"              "CRIMINAL_CHARGE"  
## [13] "ORDINAL"           "COUNTRY"           "LOCATION"         
## [16] "TIME"              "IDEOLOGY"          "MONEY"            
## [19] "STATE_OR_PROVINCE" "SET"               "PERCENT"

#除去entity為Other，有多少種word有被標註entity
length(unique(tokens$word[tokens$ner != "O"]))

## [1] 745

轉小寫：
因為大小寫也會影響corenlp對NER的判斷，因此我們一開始給的推文內容是沒有處理大小寫的，但在跑完anotator後，為了正確計算詞頻，創建新欄位lower_word與lower_lemma，存放轉換小寫的word與lemma。轉成小寫的目的是要將不同大小寫的同一字詞（如Evergiven與evergiven）都換成小寫，再來計算詞頻

tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)

(8)探索分析 - NER

涉及到的國家(COUNTRY)

我們可以透過coreNLP中的NER解析出在Twitter上面談論緬甸政變，所涉及到的國家(COUNTRY)，以初步了解這個議題的主要國家。

tokens %>%
  filter(ner == "COUNTRY") %>%  #篩選NER為COUNTRY
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 13, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

## `summarise()` ungrouping output (override with `.groups` argument)

緬甸所涉及的國家第一名是中國原因：中國和俄羅斯都跟緬甸武裝部隊關係密切，他們分別是緬甸軍火的第一大和第二大供應者。

涉及到的組織(ORGANIZATION)

我們可以透過coreNLP中的NER解析出在Twitter上面談論緬甸政變，所涉及到的組織(ORGANIZATION)，以初步了解這個議題的主要公司/單位。

tokens %>%
  filter(ner == "ORGANIZATION") %>%  #篩選NER為ORGANIZATION
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

## `summarise()` ungrouping output (override with `.groups` argument)

圖中所提及的組織原因：

kayah state：克耶邦，為緬甸東南部的一個邦。
karenni：客倫尼族，為緬甸的少數民族。
demoso- dimawhoso：為緬甸東部的一個城市。

涉及到的人物(PERSON)

我們可以透過coreNLP中的NER解析出在Twitter上面談論緬甸政變，所涉及到的人物(PERSON)，以初步了解這個議題的主要人物。

tokens %>%
  filter(ner == "PERSON") %>%  #篩選NER為PERSON
  filter(!lower_word %in% c("taung" ,"mingala"))%>%
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

## `summarise()` ungrouping output (override with `.groups` argument)

圖中所提及之人物原因：

nyuntthese：緬甸執政黨全國民主聯盟發言人苗紐(Myo Nyunt)。
aung：民族團結政府國務資政翁山蘇姬(Aung San Suu Kyi)。
karen：武裝對抗著緬甸中央政府的政治組織，緬甸克倫族（Karen）。
tun：緬甸軍政府發言人紹敏通(Zaw Min Tun)。

涉及到的地點(LOCATION)

我們可以透過coreNLP中的NER解析出在Twitter上面談論緬甸政變，所涉及到的地點(LOCATION)，以初步了解這個議題的主要人物。

tokens %>%
  filter(ner == "LOCATION") %>%  #篩選NER為Location
  filter(!lower_word %in% c("yangon."))%>%
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is LOCATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

## `summarise()` ungrouping output (override with `.groups` argument)

圖中所提及之地點原因：

yangon：緬甸最大城市仰光省首府，為本次衝突地點。
salween：薩爾溫江的英語名稱，為本次抗爭避難地點。

(9)探索分析 - Dependency

語句依存關係結果

dependencies = coreNLP_dependency_parser(obj)

head(dependencies,20)

##              status_id      dep governor governorGloss dependent dependentGloss
## 1  1388922737561722881     ROOT        0          ROOT         2        turmoil
## 2  1388922737561722881      det        2       turmoil         1            The
## 3  1388922737561722881     case        6          coup         3      following
## 4  1388922737561722881      det        6          coup         4            the
## 5  1388922737561722881     amod        6          coup         5       military
## 6  1388922737561722881     nmod        2       turmoil         6           coup
## 7  1388922737561722881      acl        6          coup         7      incoupled
## 8  1388922737561722881     case       12        result         8           with
## 9  1388922737561722881      det       12        result         9            the
## 10 1388922737561722881 compound       12        result        10         impact
## 11 1388922737561722881 compound       12        result        11        ofcould
## 12 1388922737561722881      obl        7     incoupled        12         result
## 13 1388922737561722881     mark       24        living        13             in
## 14 1388922737561722881   advmod       24        living        14             up
## 15 1388922737561722881     case       19          half        15             to
## 16 1388922737561722881 compound       17       million        16             25
## 17 1388922737561722881   nummod       19          half        17        million
## 18 1388922737561722881     amod       19          half        18   peoplenearly
## 19 1388922737561722881      obl       14            up        19           half
## 20 1388922737561722881     case       23    population        20             of

視覺化的Dependency tree

parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

(10)探索分析 - Sentiment

語句情緒值

情緒分數從最低分0~最高分4
+ 0,1 : very negative,negative
+ 2 : neutral
+ 3,4 : very positive,postive

sentiment = coreNLP_sentiment_parser(obj)

head(sentiment,20)

## # A tibble: 20 x 5
##    user_id             created_at negative positive sentiment
##    <chr>               <date>        <int>    <int>     <int>
##  1 1000091387088404480 2021-04-28        2        0        -2
##  2 1000283029363769345 2021-04-28        1        0        -1
##  3 1000657974870790145 2021-04-28        1        0        -1
##  4 1002627878909952005 2021-04-28        1        0        -1
##  5 1006051354488365057 2021-04-26        1        1         0
##  6 1006051354488365057 2021-04-29        1        1         0
##  7 1006051354488365057 2021-04-28        0        1         1
##  8 1006083289545785344 2021-04-28        1        0        -1
##  9 1008979981          2021-05-02        0        2         2
## 10 1009546724          2021-04-26        1        1         0
## 11 1009989511747473408 2021-05-02        1        0        -1
## 12 1010955190885732353 2021-04-28        2        0        -2
## 13 1014542681547026433 2021-04-28        1        0        -1
## 14 1014813793619083264 2021-04-28        1        0        -1
## 15 1014837749755494401 2021-04-29        1        0        -1
## 16 1014838262161014784 2021-04-27        2        1        -1
## 17 1014877087130423297 2021-04-26        0        1         1
## 18 1014877087130423297 2021-04-27        0        1         1
## 19 1014882776057696256 2021-04-28        1        0        -1
## 20 1014891217916604416 2021-04-28        2        0        -2

資料集中的情緒種類

#unique(sentiment$sentiment)
sentiment$sentimentValue = sentiment$sentimentValue  %>% as.numeric
View(sentiment)

#了解情緒文章的分佈
sentiment$sentiment %>% table()

## .
##  -16   -8   -6   -5   -4   -3   -2   -1    0    1    2    3    4 
##    1    1    4   13   27   79  590 1556   99  284   29   12    1

平均情緒分數時間趨勢

df$date = as.Date(df$created_at)

sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment)) + 
  geom_line()+
    scale_x_date(labels = date_format("%m/%d"))

圖中顯示：

4/25情緒高昂原因為4/24有招開東協會議，主要稱他們在會中達成「緬甸將立即停止暴力活動」的共識，使討論之情緒較為高昂
4/27情緒低迷原因與上述所說緬甸少數的克倫族攻擊鄰近泰國邊界的1處緬甸軍事哨站相關。

了解情緒分佈，以及在正面情緒及負面情緒下，所使用的文章詞彙

#了解正面文章的詞彙使用
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the',"!")) %>% 
  filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
  group_by(lower_lemma) %>% #根據lemma分組
  summarize(count = n()) %>% 
  filter(count >5 & count<400)%>%
  wordcloud2()

#了解負面文章的詞彙使用
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
   filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
   group_by(lower_lemma) %>% 
  summarize(count = n()) %>% 
  filter(count >10 &count<400)%>%
  wordcloud2()

(11)Sentimentr 英文情緒分析

計算tweet中屬於正面的字

set.seed(10)
mytext <- get_sentences(tweets$text) #將text轉成list of characters型態
x <- sample(tweets$text, 1000, replace = FALSE) #隨機取1000筆，取後不放回
sentiment_words <- extract_sentiment_terms(x) #抓取其中帶有情緒的字
sentiment_counts <- attributes(sentiment_words)$counts #計算出現次數
sentiment_counts[polarity > 0,]   #正面的字

##             words polarity n
##   1:         care      1.0 9
##   2:     triumphs      1.0 8
##   3:      justice      1.0 8
##   4:       please      1.0 3
##   5: humanitarian      1.0 3
##  ---                        
## 210:         pray      0.1 2
## 211:  cooperation      0.1 2
## 212:      camping      0.1 1
## 213:      praying      0.1 1
## 214:     momentum      0.1 1

計算tweet中屬於負面的字

sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) #出現次數最多的負面字

## Selecting by n

##          words polarity   n
##  1:     strike    -0.75 341
##  2:       anti    -1.00 336
##  3:       lost    -0.75 262
##  4: threatened    -0.50 254
##  5: kidnapping    -1.00 254
##  6:   abducted    -1.00  44
##  7:    protest    -0.50  34
##  8:  criminals    -1.00  31
##  9:      junta    -0.25  29
## 10: protesters    -0.60  27

用日期來了解情緒波動

code 參考 https://github.com/trinker/sentimentr

tweets$date = format(tweets$created_at,'%Y%m%d')

(out  = tweets  %>%  with(
    sentiment_by( #document level
        get_sentences(text), 
        list( date)
    )
))
plot(out)

圖中呈現，可發現整體事件4/25因為東協會議表示即將終止，因此呈現正面；但因為後面依舊有發生許多抗爭跡象，因此後續情緒普遍為負面呈現。

(12)tf-idf

total_words <- df_word %>% 
  group_by(user_id) %>% 
  summarize(total = sum(n))

## `summarise()` ungrouping output (override with `.groups` argument)

total_words

## # A tibble: 2,574 x 2
##    user_id             total
##    <chr>               <int>
##  1 1000091387088404480    22
##  2 1000283029363769345    11
##  3 1000657974870790145    11
##  4 1002627878909952005    11
##  5 1006051354488365057    86
##  6 1006083289545785344    11
##  7 1008979981             14
##  8 1009546724             19
##  9 100986964               6
## 10 1009989511747473408    20
## # ... with 2,564 more rows

Myanmar_words <- left_join(df_word, total_words)

## Joining, by = "user_id"

Myanmar_words

## # A tibble: 40,831 x 5
##    user_id             created_at word              n total
##    <chr>               <date>     <chr>         <int> <int>
##  1 1360227293617037313 2021-05-02 coupeliminate    42   144
##  2 1360227293617037313 2021-05-02 protest          42   144
##  3 1360227293617037313 2021-05-02 youth            41   144
##  4 178312978           2021-04-26 care             21   579
##  5 178312978           2021-04-26 china            21   579
##  6 178312978           2021-04-26 chinarussia      21   579
##  7 178312978           2021-04-26 coup             21   579
##  8 178312978           2021-04-26 dominate         21   579
##  9 178312978           2021-04-26 dont             21   579
## 10 178312978           2021-04-26 mastermind       21   579
## # ... with 40,821 more rows

# 以每篇文章爲單位，計算每個詞彙的 tf-idf 值
Myanmar_words_tf_idf <- Myanmar_words %>%
  bind_tf_idf(word,user_id, n)
Myanmar_words_tf_idf

## # A tibble: 40,831 x 8
##    user_id            created_at word              n total     tf    idf  tf_idf
##    <chr>              <date>     <chr>         <int> <int>  <dbl>  <dbl>   <dbl>
##  1 13602272936170373~ 2021-05-02 coupeliminate    42   144 0.292  7.85   2.29e+0
##  2 13602272936170373~ 2021-05-02 protest          42   144 0.292  3.52   1.03e+0
##  3 13602272936170373~ 2021-05-02 youth            41   144 0.285  6.47   1.84e+0
##  4 178312978          2021-04-26 care             21   579 0.0363 5.46   1.98e-1
##  5 178312978          2021-04-26 china            21   579 0.0363 5.08   1.84e-1
##  6 178312978          2021-04-26 chinarussia      21   579 0.0363 5.91   2.14e-1
##  7 178312978          2021-04-26 coup             21   579 0.0363 0.0149 5.39e-4
##  8 178312978          2021-04-26 dominate         21   579 0.0363 5.91   2.14e-1
##  9 178312978          2021-04-26 dont             21   579 0.0363 5.21   1.89e-1
## 10 178312978          2021-04-26 mastermind       21   579 0.0363 5.91   2.14e-1
## # ... with 40,821 more rows

# 選出每篇文章，tf-idf值最大的五個詞
Myanmar_words_tf_idf %>% 
  group_by(user_id) %>%
  slice_max(tf_idf, n=5) %>%
  arrange(desc(user_id))

## # A tibble: 13,874 x 8
## # Groups:   user_id [2,574]
##    user_id            created_at word            n total     tf   idf tf_idf
##    <chr>              <date>     <chr>       <int> <int>  <dbl> <dbl>  <dbl>
##  1 999907196          2021-04-26 howcripples     1     3 0.333   7.85 2.62  
##  2 999907196          2021-04-26 itssystem       1     3 0.333   7.85 2.62  
##  3 999907196          2021-04-26 return          1     3 0.333   4.91 1.64  
##  4 999323392158265344 2021-04-26 chasing         1    15 0.0667  7.85 0.524 
##  5 999323392158265344 2021-04-26 clouds          1    15 0.0667  7.85 0.524 
##  6 999323392158265344 2021-04-26 enjoy           1    15 0.0667  7.85 0.524 
##  7 999323392158265344 2021-04-26 enjoyed         1    15 0.0667  7.85 0.524 
##  8 999323392158265344 2021-04-26 uncertain       1    15 0.0667  7.85 0.524 
##  9 997324537581584384 2021-04-28 threatened      1    11 0.0909  1.03 0.0936
## 10 997324537581584384 2021-04-28 civillians      1    11 0.0909  1.03 0.0934
## # ... with 13,864 more rows

計算整個文集中較常 tf-idf 值高的字

# 從每篇文章挑選出tf-idf最大的十個詞，
# 並計算每個詞被選中的次數
Myanmar_words_tf_idf %>% 
  group_by(user_id) %>%
  slice_max(tf_idf, n=10) %>%
  ungroup() %>%
  count(word, sort=TRUE)

## # A tibble: 3,571 x 2
##    word                  n
##    <chr>             <int>
##  1 kidnapping          903
##  2 lives               903
##  3 lost                903
##  4 safety              903
##  5 terroristsecurity   903
##  6 threatened          903
##  7 civillians          902
##  8 nights              902
##  9 days                901
## 10 forces              893
## # ... with 3,561 more rows

(13)N-gram、共線相關圖

jieba_tokenizer = worker()

# unnest_tokens 使用的bigram分詞函數
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}

water_bigram <- df %>%
  unnest_tokens(bigram, text, token = jieba_bigram)
water_bigram

## # A tibble: 75,984 x 91
##    user_id status_id created_at screen_name source display_text_wi~
##    <chr>   <chr>     <date>     <chr>       <chr>             <dbl>
##  1 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
##  2 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
##  3 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
##  4 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
##  5 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
##  6 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
##  7 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
##  8 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
##  9 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
## 10 164263~ 13889227~ 2021-05-02 UNGeneva    Tweet~              261
## # ... with 75,974 more rows, and 85 more variables: reply_to_status_id <chr>,
## #   reply_to_user_id <chr>, reply_to_screen_name <chr>, is_quote <lgl>,
## #   is_retweet <lgl>, favorite_count <int>, retweet_count <int>,
## #   quote_count <int>, reply_count <int>, hashtags <list>, symbols <list>,
## #   urls_url <list>, urls_t.co <list>, urls_expanded_url <list>,
## #   media_url <list>, media_t.co <list>, media_expanded_url <list>,
## #   media_type <list>, ext_media_url <list>, ext_media_t.co <list>,
## #   ext_media_expanded_url <list>, ext_media_type <chr>,
## #   mentions_user_id <list>, mentions_screen_name <list>, lang <chr>,
## #   quoted_status_id <chr>, quoted_text <chr>, quoted_created_at <dttm>,
## #   quoted_source <chr>, quoted_favorite_count <int>,
## #   quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## #   quoted_name <chr>, quoted_followers_count <int>,
## #   quoted_friends_count <int>, quoted_statuses_count <int>,
## #   quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## #   retweet_source <chr>, retweet_favorite_count <int>,
## #   retweet_retweet_count <int>, retweet_user_id <chr>,
## #   retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## #   country_code <chr>, geo_coords <list>, coords_coords <list>,
## #   bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## #   description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>, date <date>, bigram <chr>

統計最常出現的bigram組合

# 計算每個組合出現的次數
water_bigram %>%
  count(bigram, sort = TRUE)

## # A tibble: 13,246 x 2
##    bigram                 n
##    <chr>              <int>
##  1 trying to           1210
##  2 anti coup           1206
##  3 back internet       1205
##  4 coup strike         1204
##  5 the current         1204
##  6 inagainst military  1203
##  7 reopen schools      1203
##  8 to reopen           1203
##  9 current situation   1201
## 10 is trying           1201
## # ... with 13,236 more rows

移除bigram中的停用字

water_bigram %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite_("bigram", c("word1","word2"), sep=" ")

## # A tibble: 13,246 x 2
##    bigram                 n
##    <chr>              <int>
##  1 trying to           1210
##  2 anti coup           1206
##  3 back internet       1205
##  4 coup strike         1204
##  5 the current         1204
##  6 inagainst military  1203
##  7 reopen schools      1203
##  8 to reopen           1203
##  9 current situation   1201
## 10 is trying           1201
## # ... with 13,236 more rows

word_pairs <- df_word %>%
  pairwise_count(word,user_id, sort = TRUE) %>% 
  filter(!item1 %in% c("myanmar", "coup","4","apr","data","ramadan") & !item2 %in% c("myanmar", "coup","4","apr","data","ramadan"))

word_pairs

## # A tibble: 342,018 x 3
##    item1     item2         n
##    <chr>     <chr>     <dbl>
##  1 internet  military   1218
##  2 military  internet   1218
##  3 strike    military   1214
##  4 military  strike     1214
##  5 situation military   1204
##  6 military  situation  1204
##  7 anti      military   1198
##  8 inagainst military   1198
##  9 reopen    military   1198
## 10 schools   military   1198
## # ... with 342,008 more rows

word_cors <- df_word %>%
  group_by(word) %>%
  filter(n() >= 10) %>%
  pairwise_cor(word, user_id, sort = TRUE)

word_cors

## # A tibble: 97,656 x 3
##    item1           item2           correlation
##    <chr>           <chr>                 <dbl>
##  1 guys            detat                    1.
##  2 revolutionaries detat                    1.
##  3 detat           guys                     1.
##  4 revolutionaries guys                     1.
##  5 detat           revolutionaries          1.
##  6 guys            revolutionaries          1.
##  7 suppression     challenging              1.
##  8 thethree        challenging              1.
##  9 challenging     suppression              1.
## 10 thethree        suppression              1.
## # ... with 97,646 more rows

繪製“coup”及“myanmar”相關詞

word_cors %>%
  filter(item1 %in% c("myanmar", "coup")) %>%
  group_by(item1) %>%
  top_n(15) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()+ 
  theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定，避免中文字顯示錯誤。

使用詞彙關係圖畫出相關性大於0.95的組合

set.seed(2020)

#word_cors %>%
#  filter(correlation > 0.95) %>%
#  graph_from_data_frame() %>%
#  ggraph(layout = "fr") +
#  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
#  geom_node_point(color = "lightblue", size = 3) +
#  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
#  theme_void()

軍方全面關閉寬頻網路、Wi-Fi、漫遊，禁止大數民眾對外聯繫。
三指禮，象徵反抗極權統治的精神，聲援被捕的翁山蘇姬。
少數民族卡倫尼族(Karenni)民眾於12日在克耶州首府示威遭鎮壓。

二、PTT

資料基本介紹

資料來源: 文字平台收集PTT Gossip版2021-01-01 ~ 2021-04-28 所有文章
資料集： project_articleMetaData.csv、project_articleReviews.csv
關鍵字：緬甸、軍政
文章數量:321筆，留言數量:11528筆

# 把文章和留言讀進來
MetaData = fread('project_articleMetaData.csv',encoding = 'UTF-8')
Reviews  = fread('project_articleReviews.csv',encoding = 'UTF-8')
# 挑選文章對應的留言
Total_Data = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")

(1)計算每日發文次數

data_day <- Total_Data %>% 
  dplyr::select(artDate, artUrl) %>% 
  distinct() %>%   
  group_by(artDate) %>% 
  summarise(count = n()) %>% 
  arrange(desc(count))
data_day

## # A tibble: 67 x 2
##    artDate    count
##    <chr>      <int>
##  1 2021/02/01    32
##  2 2021/03/01    22
##  3 2021/03/29    22
##  4 2021/03/05    20
##  5 2021/03/04    15
##  6 2021/02/02    14
##  7 2021/03/28    12
##  8 2021/03/15    11
##  9 2021/02/06    10
## 10 2021/03/08     8
## # ... with 57 more rows

可以看出事件討論度最高是2月1日，這天是緬甸軍政府採取發動政變的日期。

折線圖呈現

data_day$artDate= data_day$artDate %>% as.Date("%Y/%m/%d")
data_day %>%
  ggplot()+
    geom_line(aes(x=artDate,y=count))+
    xlab("日期") + 
    ylab("數量") + 
    geom_vline(xintercept = as.numeric(as.Date("2021-02-01")), col='red', size = 0.5) + 
    geom_vline(xintercept = as.numeric(as.Date("2021-03-01")), col='red', size = 0.5)+ 
    geom_vline(xintercept = as.numeric(as.Date("2021-03-29")), col='red', size = 0.5)+
    scale_x_date(labels = date_format("%m/%d"))

圖中以2/1、3/1、3/29三日為主：

緬甸軍方2/1疑發動政變，包括總統溫敏、實質領導人翁山蘇姬在內多名執政黨高層已遭到軍方逮捕。軍方電視台宣布國家進入緊急狀態。
2/28緬甸軍方在仰光土瓦開槍鎮壓示威者，造成至少18人死亡。
3/29清晨在克欽邦與克耶邦交界的帕敢鎮（Hpakant），同步襲擊了4座警察局基地，除了殺死20名警察，克欽軍更劫取大批制式軍火，重創緬甸軍政府在北方的控制威信。

(2)資料前處理與分析

文章斷詞

設定斷詞引擎

# 加入自定義的字典
jieba_tokenizer <- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")

# 設定斷詞function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

資料基本清理

日期格式化
去除特殊字元、詞頻太低的字
算每天不同字的詞頻

MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)

data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")]) 
# 格式化日期欄位
data$artDate = data$artDate %>% as.Date("%Y/%m/%d")

# 過濾特殊字元
data_select = data %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
  filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
  filter(nchar(.$word)>1)

# word_count:artDate,word,count
word_count <- data_select %>%
  group_by(word) %>%
  filter(word !="緬甸") %>% 
  summarise(count=n()) %>%  # 算字詞單篇總數用summarise
  filter(count>3) %>%  # 過濾出現太少次的字
  arrange(desc(count))

## `summarise()` ungrouping output (override with `.groups` argument)

word_count

## # A tibble: 3,294 x 2
##    word   count
##    <chr>  <int>
##  1 中國    1019
##  2 台灣     906
##  3 軍政府   771
##  4 軍方     562
##  5 美國     545
##  6 國家     489
##  7 政變     410
##  8 香港     404
##  9 政府     385
## 10 關心     345
## # ... with 3,284 more rows

將詞頻前10名繪製成圖

word_count %>% 
  head(10) %>% 
  mutate(word = reorder(word, count)) %>% 
  ggplot(aes(x=count, y=word,fill=word))+
  geom_col(show.legend = FALSE) +
  labs(x = "詞頻", y = "詞") +
  ggtitle("文章詞頻前10名")

圖中顯示，中國為第一名、台灣為第二名的原因：

中國為提供緬甸政府軍火的國家。
台灣立法院通過決議，呼籲緬甸軍方勿以武力攻擊和平示威民眾，應以和平、理性對話方式化解對立情勢。以及盡早恢復緬甸的民主政治，並要求政府支援在緬甸的台籍僑胞。

文字雲

word_count %>% wordcloud2()

(3)情緒分析

載入情緒字典

P <- read_file("liwc/positive.txt") # 正向字典txt檔
N <- read_file("liwc/negative.txt") # 負向字典txt檔

#字典txt檔讀進來是一整個字串
typeof(P)

## [1] "character"

分割字詞，並將兩個情緒字典併在一起

# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# 建立dataframe 有兩個欄位word,sentiments，word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047

# 把兩個字典拼在一起
LIWC = rbind(P, N)

# 檢視字典
head(LIWC)

##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     不錯  positive
## 6     主動  positive

每天正負面情緒總和(sentiment_count)

MetaData = fread('project_articleMetaData.csv',encoding = 'UTF-8')
Reviews  = fread('project_articleReviews.csv',encoding = 'UTF-8')
# 挑選文章對應的留言
Total_Data = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")

# 把文章和留言的斷詞結果併在一起
MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Total_Data %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)

# 把資料併在一起
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")]) 

# 格式化日期欄位
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")

# 過濾特殊字元
data_select = data %>% 
  filter(!grepl('[[:punct:]]',word)) %>% # 去標點符號
  filter(!grepl("['^0-9a-z']",word)) %>% # 去英文、數字
  filter(nchar(.$word)>1) 
  


sentiment_count = data_select %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=n())  %>% 
  arrange(desc(count))
sentiment_count

## # A tibble: 129 x 3
## # Groups:   artDate [66]
##    artDate    sentiment count
##    <date>     <chr>     <int>
##  1 2021-03-29 negative    339
##  2 2021-03-04 negative    303
##  3 2021-03-01 negative    280
##  4 2021-03-07 negative    267
##  5 2021-02-01 negative    248
##  6 2021-03-04 positive    235
##  7 2021-03-29 positive    230
##  8 2021-02-01 positive    219
##  9 2021-03-07 positive    208
## 10 2021-03-01 positive    168
## # ... with 119 more rows

可以看到整體而言是負面的情緒較多

# 檢視資料的日期區間
range(sentiment_count$artDate) #"2021-01-07" "2021-05-01"

## [1] "2021-01-07" "2021-05-01"

繪製折線圖

sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-01-07','2021-05-01'))
               )+
  # 加上標示日期的線
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-29'))
[1]])),colour = "red") +
  geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-01'))
[1]])),colour = "red")

正負情緒比例折線圖

sentiment_count %>% 
  # 標準化的部分
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # 畫圖的部分
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-01-07','2021-05-01'))
               )

正負情緒代表字

sentiment_sum <- 
  word_count %>%
    inner_join(LIWC, by = "word") %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame()

## `summarise()` regrouping output by 'word' (override with `.groups` argument)

#sentiment_sum

計算情緒字並畫圖

sentiment_sum %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

(4)字詞相關性

將不要的重複字去除

# 挑選文章對應的留言
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
#Reviews
Total_Data <- MetaData %>% 
  mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除|Gossiping|TVBS新聞網|娛樂中心|看板|問卦", "", sentence))

# 使用默認參數初始化一個斷詞引擎
# 先不使用任何的字典和停用詞

jieba_tokenizer = worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
chi_tokenizer01 <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      # 去掉字串長度爲1的詞彙
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

# 進行斷詞，並計算各詞彙在各文章中出現的次數

coup_words <- Total_Data %>%
  unnest_tokens(word, sentence, token=chi_tokenizer01) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
coup_words

##                                                          artUrl word  n
##     1: https://www.ptt.cc/bbs/Gossiping/M.1613244038.A.301.html 緬甸 43
##     2: https://www.ptt.cc/bbs/Gossiping/M.1613193695.A.798.html 緬甸 42
##     3: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 軍方 35
##     4: https://www.ptt.cc/bbs/Gossiping/M.1615564184.A.B56.html 緬甸 35
##     5: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 緬甸 33
##    ---                                                                 
## 22467: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 確實  1
## 22468: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 隨便  1
## 22469: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 還不  1
## 22470: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 願意  1
## 22471: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 聽從  1

計算每篇文章的詞數

total_words <- coup_words %>% 
  group_by(artUrl) %>% 
  summarize(total = sum(n))

## `summarise()` ungrouping output (override with `.groups` argument)

total_words

## # A tibble: 321 x 2
##    artUrl                                                   total
##    <chr>                                                    <int>
##  1 https://www.ptt.cc/bbs/Gossiping/M.1609990690.A.13C.html    15
##  2 https://www.ptt.cc/bbs/Gossiping/M.1610008084.A.4C9.html    35
##  3 https://www.ptt.cc/bbs/Gossiping/M.1610024281.A.452.html    11
##  4 https://www.ptt.cc/bbs/Gossiping/M.1611897015.A.61A.html   219
##  5 https://www.ptt.cc/bbs/Gossiping/M.1612068324.A.114.html   231
##  6 https://www.ptt.cc/bbs/Gossiping/M.1612136552.A.31F.html   124
##  7 https://www.ptt.cc/bbs/Gossiping/M.1612147806.A.5E8.html   116
##  8 https://www.ptt.cc/bbs/Gossiping/M.1612148853.A.BA3.html   182
##  9 https://www.ptt.cc/bbs/Gossiping/M.1612149990.A.A39.html   272
## 10 https://www.ptt.cc/bbs/Gossiping/M.1612150864.A.8D4.html    42
## # ... with 311 more rows

合併資料欄位

coup_words <- left_join(coup_words, total_words,by = "artUrl")

計算 tf-idf 值

coup_words_tf_idf <- coup_words %>%
  bind_tf_idf(word, artUrl, n)
coup_words_tf_idf

##                                                          artUrl word  n total
##     1: https://www.ptt.cc/bbs/Gossiping/M.1613244038.A.301.html 緬甸 43  1480
##     2: https://www.ptt.cc/bbs/Gossiping/M.1613193695.A.798.html 緬甸 42   753
##     3: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 軍方 35   582
##     4: https://www.ptt.cc/bbs/Gossiping/M.1615564184.A.B56.html 緬甸 35   359
##     5: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 緬甸 33   582
##    ---                                                                       
## 22467: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 確實  1    52
## 22468: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 隨便  1    52
## 22469: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 還不  1    52
## 22470: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 願意  1    52
## 22471: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 聽從  1    52
##                tf        idf      tf_idf
##     1: 0.02905405 0.04785602 0.001390411
##     2: 0.05577689 0.04785602 0.002669260
##     3: 0.06013746 1.35260052 0.081341955
##     4: 0.09749304 0.04785602 0.004665629
##     5: 0.05670103 0.04785602 0.002713486
##    ---                                  
## 22467: 0.01923077 4.38514676 0.084329745
## 22468: 0.01923077 3.97968165 0.076532339
## 22469: 0.01923077 4.16200321 0.080038523
## 22470: 0.01923077 2.93822778 0.056504380
## 22471: 0.01923077 5.07829394 0.097659499

可以看到有緬甸、中國等。

選出每篇文章，tf-idf值最大的五個詞，且出現次數超過15次*

coup_words_tf_idf %>% 
  group_by(artUrl) %>%
  slice_max(tf_idf, n=5) %>%
  filter(n > 15) %>% 
  arrange(desc(artUrl))

## # A tibble: 19 x 7
## # Groups:   artUrl [15]
##    artUrl                                 word      n total      tf   idf tf_idf
##    <chr>                                  <chr> <int> <int>   <dbl> <dbl>  <dbl>
##  1 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國     27   301 0.0897  1.24  0.111 
##  2 https://www.ptt.cc/bbs/Gossiping/M.16~ 人類     31  2140 0.0145  4.16  0.0603
##  3 https://www.ptt.cc/bbs/Gossiping/M.16~ 不可     19  2140 0.00888 4.39  0.0389
##  4 https://www.ptt.cc/bbs/Gossiping/M.16~ 聖經     16  2140 0.00748 5.08  0.0380
##  5 https://www.ptt.cc/bbs/Gossiping/M.16~ 告訴     21  2140 0.00981 3.47  0.0340
##  6 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍政府~    17   243 0.0700  0.951 0.0665
##  7 https://www.ptt.cc/bbs/Gossiping/M.16~ 美國     25   292 0.0856  1.69  0.145 
##  8 https://www.ptt.cc/bbs/Gossiping/M.16~ 台灣     16   292 0.0548  1.06  0.0582
##  9 https://www.ptt.cc/bbs/Gossiping/M.16~ 美國     25   225 0.111   1.69  0.188 
## 10 https://www.ptt.cc/bbs/Gossiping/M.16~ 存在     21   651 0.0323  3.21  0.103 
## 11 https://www.ptt.cc/bbs/Gossiping/M.16~ 仰光     17   314 0.0541  1.99  0.108 
## 12 https://www.ptt.cc/bbs/Gossiping/M.16~ 中共     20   359 0.0557  2.78  0.155 
## 13 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國     23   268 0.0858  1.24  0.106 
## 14 https://www.ptt.cc/bbs/Gossiping/M.16~ 群眾     16   649 0.0247  3.13  0.0772
## 15 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍方     35   582 0.0601  1.35  0.0813
## 16 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國     29   753 0.0385  1.24  0.0477
## 17 https://www.ptt.cc/bbs/Gossiping/M.16~ 台灣     17   139 0.122   1.06  0.130 
## 18 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國     24   263 0.0913  1.24  0.113 
## 19 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍方     19   281 0.0676  1.35  0.0915

可以看到文章中有中國、美國、台灣、仰光等字詞。

從每篇文章挑選出tf-idf最大的十個詞，並計算每個詞被選中的次數

coup_words_tf_idf %>% 
  group_by(artUrl) %>%
  slice_max(tf_idf, n=10) %>%
  ungroup() %>%
  count(word, sort=TRUE)

## # A tibble: 3,333 x 2
##    word       n
##    <chr>  <int>
##  1 軍方      19
##  2 中國      16
##  3 香港      10
##  4 親中      10
##  5 美國       9
##  6 軍人       9
##  7 軍政府     9
##  8 緬甸       8
##  9 工廠       7
## 10 台商       7
## # ... with 3,323 more rows

軍方、軍政府、中國等是大家討論的重點。

(5)jiebar and ngrams

計算兩個詞彙同時出現的總次數

# 過濾掉三個關鍵字"緬甸", "軍政"
word_pairs <- coup_words %>%
  pairwise_count(word, artUrl, sort = TRUE) %>% 
  filter(!item1 %in% c("緬甸", "軍政") & !item2 %in% c("緬甸", "軍政"))

word_pairs

## # A tibble: 4,388,182 x 3
##    item1  item2      n
##    <chr>  <chr>  <dbl>
##  1 政變   軍政府    59
##  2 軍政府 政變      59
##  3 政府   軍政府    58
##  4 軍政府 政府      58
##  5 政變   軍方      51
##  6 軍政府 中國      51
##  7 中國   軍政府    51
##  8 軍方   政變      51
##  9 抗議   軍政府    47
## 10 政變   政府      47
## # ... with 4,388,172 more rows

可以看到(政變、軍政府)、(軍政府、中國)、(抗議、軍整府)等詞常一起出現。

計算兩個詞彙間的相關性

word_cors <- coup_words %>%
  group_by(word) %>%
  filter(n() >= 10) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

word_cors

## # A tibble: 131,406 x 3
##    item1    item2    correlation
##    <chr>    <chr>          <dbl>
##  1 綜合     外電報導       0.694
##  2 外電報導 綜合           0.694
##  3 舞弊     大選           0.665
##  4 大選     舞弊           0.665
##  5 執政黨   舞弊           0.656
##  6 舞弊     執政黨         0.656
##  7 執政黨   發言人         0.640
##  8 發言人   執政黨         0.640
##  9 綜合     報導           0.601
## 10 報導     綜合           0.601
## # ... with 131,396 more rows

找出與 “緬甸”、 “政變” 這兩個緬甸政變相關性最高的15個詞彙

word_cors %>%
  filter(item1 %in% c("緬甸", "政變")) %>%
  group_by(item1) %>%
  top_n(15) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()+ 
  theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定，避免中文字顯示錯誤。

去除不相關的字(有沒有、超過、指出、已有)

# 設定幾個詞做爲seed words
seed_words <- c("有沒有", "超過", "指出", "已有")
# 設定threshold爲0.5
threshold <- 0.5
# 跟seed words相關性高於threshold的詞彙會被加入移除列表中
remove_words <- word_cors %>%
                filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
                .$item1 %>%
                unique()
remove_words

## [1] "舉行" "指出" "已有" "記者" "援助" "報導" "超過" "活動"

使用詞彙關係圖畫出相關性大於0.5的組合

# 清除存在這些詞彙的組合
word_cors_new <- word_cors %>%
                filter(!(item1 %in% remove_words|item2 %in% remove_words))

word_cors_new %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()