社群媒體分析-CoreNLP與sentimentr

基本介紹

目的:使用coreNLP與sentimentr分析twitter上關於緬甸政變文字資料
概述:2月1日凌晨，正當緬甸人民沉浸在夢鄉之際，緬甸國防軍（Tatmadaw）迅速發動政變，逮捕了國務資政、緬甸實質領導人翁山蘇姬（Aung San Suu Kyi）、總統溫敏（Win Myint）等多名領袖。希望利用推特的文章分析民眾對“緬甸政變”的情緒。
資料來源:Twitter，4/4~4/12，4403筆

1. coreNLP

安裝package

# Packages this report loads with library() further down. Every package that
# is attached later (including scales, magrittr, stringr and lubridate) is
# listed here so that a fresh machine can run the script end to end.
packages <- unique(c(
  "dplyr", "ggplot2", "rtweet", "xml2", "httr", "jsonlite", "data.tree",
  "NLP", "igraph", "sentimentr", "tidytext", "wordcloud2", "DiagrammeR",
  "scales", "magrittr", "stringr", "lubridate"
))
existing <- as.character(installed.packages()[, 1])
# Install only what is missing; setdiff() yields an empty vector (and the loop
# does nothing) when everything is already present.
for (pkg in setdiff(packages, existing)) {
  install.packages(pkg)
}

library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)

#load("coreNLP_all.RData")

1.1 資料收集：tweets

(1). Twitter API設定透過rtweet抓取tweets

# Twitter API credentials.
# The consumer key/secret identify the application; the access token/secret
# carry the authorisation granted to it.
# They are read from environment variables (e.g. set in ~/.Renviron) instead of
# being written into the source file. Credentials that were ever committed in
# plain text should be treated as leaked and regenerated.
read_credential <- function(name) {
  value <- Sys.getenv(name, unset = "")
  if (!nzchar(value)) {
    stop("Environment variable ", name, " is not set", call. = FALSE)
  }
  value
}

app <- "2021_sma"
consumer_key <- read_credential("TWITTER_CONSUMER_KEY")
consumer_secret <- read_credential("TWITTER_CONSUMER_SECRET")
access_token <- read_credential("TWITTER_ACCESS_TOKEN")
access_secret <- read_credential("TWITTER_ACCESS_SECRET")
twitter_token <- create_token(app, consumer_key, consumer_secret,
                              access_token, access_secret, set_renv = FALSE)

(2). 設定關鍵字抓tweets

關鍵字為Myanmar和coup，總筆數共3543筆

# Search terms: the hashtag and the context word must both appear in a tweet
key <- c("#Myanmar")
context <- "coup"
q <- paste(key, context, sep = " AND ")
# Ask for up to 5000 English tweets and leave retweets out
tweets <- search_tweets(
  q,
  lang = "en",
  n = 5000,
  include_rts = FALSE,
  token = twitter_token
)
View(tweets)

(3). tweets內容清理

# Normalise raw tweet text before it is sent to CoreNLP.
#
# txt: character vector of tweet texts.
# Returns a character vector of the same length containing only ASCII letters,
# digits, spaces and the characters ? ! . ' with single spaces between words.
clean <- function(txt) {
  # Re-encode to ASCII, dropping every byte that has no ASCII equivalent
  txt <- iconv(txt, "latin1", "ASCII", sub = "")
  # Remove @mentions and #hashtags (marker followed by letters/digits/underscore)
  txt <- gsub("(@|#)\\w+", "", txt)
  # Remove the URL only (up to the next whitespace), not the rest of the tweet
  txt <- gsub("https?://\\S*", "", txt)
  # Remove individual HTML entities such as &amp; or &#39; (non-greedy by
  # construction, so text between two entities is kept)
  txt <- gsub("&#?[A-Za-z0-9]+;", "", txt)
  # Turn newlines, tabs and runs of whitespace into a single space so that
  # neighbouring words are never glued together
  txt <- gsub("\\s+", " ", txt)
  # Keep letters, digits, spaces and ? ! . ' ; everything else is dropped
  txt <- gsub("[^a-zA-Z0-9?!. ']", "", txt)
  # The removals above can leave double spaces behind; collapse and trim them
  txt <- gsub(" {2,}", " ", txt)
  gsub("^ +| +$", "", txt)
}

# Clean the tweet text in place, copy the tweets into `df`, then keep only the
# first row for every status_id so that each tweet appears once.
tweets$text <- clean(tweets$text)
df <- rbind(data.frame(), tweets)
df <- df[!duplicated(df[, "status_id"]), ]

1-2串接CoreNLP API

(1). API呼叫的設定

server端 : + 需先在terminal開啟corenlp server + 在corenlp的路徑下開啟terminal輸入 java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

# Build the request URL for a CoreNLP server: the annotation settings are sent
# as a JSON object in the `properties` query parameter, URL-encoded.
#
# host, port:            location of the CoreNLP server.
# tokenize.whitespace:   "true"/"false"; "false" means tokens are not simply
#                        split on whitespace.
# annotators:            comma-separated annotator names.
# ssplit.eolonly:        optional "true"/"false"; when NULL (the default) the
#                        property is left out and the URL is identical to the
#                        one produced before this argument existed.
# Returns the encoded URL as a visible character value.
generate_API_url <- function(host, port = "9000",
                             tokenize.whitespace = "false", annotators = "",
                             ssplit.eolonly = NULL) {
  properties <- sprintf(
    '"tokenize.whitespace":"%s","annotators":"%s"',
    tokenize.whitespace, annotators
  )
  if (!is.null(ssplit.eolonly)) {
    properties <- sprintf(
      '%s,"ssplit.eolonly":"%s"', properties, ssplit.eolonly
    )
  }
  URLencode(sprintf("http://%s:%s/?properties={%s}", host, port, properties))
}

# Address of the CoreNLP server
host <- "127.0.0.1"

generate_API_url(host)

# Send one text to the CoreNLP server and return the parsed JSON response.
#
# server_host:          host name / IP of the CoreNLP server.
# text:                 the text to annotate (sent as the request body).
# host:                 not used by this function; kept so existing calls that
#                       pass it keep working.
# language:             "eng" selects port 9000, anything else port 9001 (the
#                       setup assumes one server per language).
# tokenize.whitespace,
# ssplit.eolonly:       forwarded to the server as properties. With
#                       ssplit.eolonly = "true" a text without newlines comes
#                       back as a single sentence, which is what the parser
#                       functions reading `sentences[[1]]` rely on.
# annotators:           annotators to run, joined with commas.
# Raises an error when the server answers with an HTTP error status.
call_coreNLP <- function(server_host, text, host = "localhost",
                         language = "eng", tokenize.whitespace = "true",
                         ssplit.eolonly = "true",
                         annotators = c("tokenize", "ssplit", "pos", "lemma",
                                        "ner", "parse", "sentiment")) {
  port <- if (language == "eng") 9000 else 9001
  url <- generate_API_url(
    server_host,
    port = port,
    tokenize.whitespace = tokenize.whitespace,
    annotators = paste0(annotators, collapse = ","),
    ssplit.eolonly = ssplit.eolonly
  )

  result <- httr::POST(url, body = text, encode = "json")
  # Fail with the HTTP status instead of a JSON parse error further down
  httr::stop_for_status(result, task = "annotate text with CoreNLP")
  httr::content(result, "parsed", "application/json", encoding = "UTF-8")
}

# Run every document in `data` through the CoreNLP service.
#
# data: data.frame with a `text` column, one document per row.
# host: CoreNLP server host passed on to call_coreNLP().
# Returns a list with one element per row; each element is
# list(doc = <parsed CoreNLP response>, data = <the row as passed by apply()>).
coreNLP <- function(data, host) {
  annotate_row <- function(row) {
    list(doc = call_coreNLP(host, row["text"]), data = row)
  }
  apply(data, 1, annotate_row)
}

(2). 資料整理function

從回傳的object中整理斷詞出結果，輸出為 tidydata 格式

# Flatten the token annotations of CoreNLP results into one tidy data.frame.
#
# coreNLP_objects: list whose elements are list(doc = <CoreNLP response>,
#                  data = <named vector holding the document's columns>).
# Returns a data.frame with one row per token: the document columns (all but
# `text`) followed by word, lemma, pos and ner. Tokens of every sentence of a
# document are included; documents without tokens contribute no rows. Returns
# NULL when there is nothing to bind.
coreNLP_tokens_parser <- function(coreNLP_objects) {
  parse_document <- function(obj) {
    # Gather the token entries of all sentences, not only the first one
    token_items <- unlist(
      lapply(obj$doc$sentences, function(sen) sen$tokens),
      recursive = FALSE
    )
    if (length(token_items) == 0) {
      return(NULL)
    }

    tokens <- do.call(rbind, lapply(token_items, function(x) {
      data.frame(word = x$word, lemma = x$lemma, pos = x$pos, ner = x$ner)
    }))

    # One-row data.frame of the document columns, repeated once per token
    metadata <- data.frame(t(obj$data))
    keep <- setdiff(names(metadata), "text")
    metadata <- metadata[rep(1L, nrow(tokens)), keep, drop = FALSE]

    result <- cbind(metadata, tokens)
    rownames(result) <- NULL
    result
  }

  do.call(rbind, lapply(coreNLP_objects, parse_document))
}

從回傳的core-nlp object中整理出詞彙依存關係，輸出為 tidydata 格式

# Flatten the basic dependency annotations of CoreNLP results into one tidy
# data.frame.
#
# coreNLP_objects: list whose elements are list(doc = <CoreNLP response>,
#                  data = <named vector holding the document's columns>).
# Returns a data.frame with one row per dependency edge: the document columns
# (all but `text`) followed by dep, governor, governorGloss, dependent and
# dependentGloss. Edges of every sentence are included; the governor/dependent
# values are copied as delivered, so they are positions within their own
# sentence. Documents without dependencies contribute no rows. Returns NULL
# when there is nothing to bind.
coreNLP_dependency_parser <- function(coreNLP_objects) {
  parse_document <- function(obj) {
    # Gather the dependency entries of all sentences, not only the first one
    edge_items <- unlist(
      lapply(obj$doc$sentences, function(sen) sen$basicDependencies),
      recursive = FALSE
    )
    if (length(edge_items) == 0) {
      return(NULL)
    }

    dependencies <- do.call(rbind, lapply(edge_items, function(x) {
      data.frame(
        dep = x$dep,
        governor = x$governor,
        governorGloss = x$governorGloss,
        dependent = x$dependent,
        dependentGloss = x$dependentGloss
      )
    }))

    # One-row data.frame of the document columns, repeated once per edge
    metadata <- data.frame(t(obj$data))
    keep <- setdiff(names(metadata), "text")
    metadata <- metadata[rep(1L, nrow(dependencies)), keep, drop = FALSE]

    result <- cbind(metadata, dependencies)
    rownames(result) <- NULL
    result
  }

  do.call(rbind, lapply(coreNLP_objects, parse_document))
}

從回傳的core-nlp object中整理出語句情緒，輸出為 tidydata 格式

# Extract the sentence-level sentiment of CoreNLP results into one tidy
# data.frame.
#
# coreNLP_objects: list whose elements are list(doc = <CoreNLP response>,
#                  data = <named vector holding the document's columns>).
# Returns a data.frame with one row per document: all document columns
# (including `text`) followed by sentiment and sentimentValue, both taken from
# the first sentence of the response. Documents whose response contains no
# sentence are skipped instead of raising a subscript error. Returns NULL when
# there is nothing to bind.
coreNLP_sentiment_parser <- function(coreNLP_objects) {
  parse_document <- function(obj) {
    sentences <- obj$doc$sentences
    if (length(sentences) == 0) {
      return(NULL)
    }
    first <- sentences[[1]]

    result <- cbind(
      data.frame(t(obj$data)),
      data.frame(
        sentiment = first$sentiment,
        sentimentValue = first$sentimentValue
      )
    )
    rownames(result) <- NULL
    result
  }

  do.call(rbind, lapply(coreNLP_objects, parse_document))
}

圖形化 Dependency tree

程式參考來源：https://stackoverflow.com/questions/35496560/how-to-convert-corenlp-generated-parse-tree-into-data-tree-r-package

# Convert a CoreNLP bracketed parse string into a tree object that can be
# plotted.
#
# ptext: the `parse` string of one sentence from a CoreNLP response.
# Returns the object built by FromDataFrameNetwork() from the parent/child
# edge list of the parse tree.
# Stops with an error if the NLP or igraph package cannot be loaded (no igraph
# function is called directly in this block).
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree:
  # line breaks are removed and the ROOT label is renamed to TOP
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ## Every run of characters that is not a bracket or a space (labels as well
  ## as words) gets its running index appended, so repeated names stay distinct
  ms <- gregexpr("[^() ]+", ptext)                                      # match everything except spaces and brackets
  words <- regmatches(ptext, ms)[[1]]                                   # the matched labels/words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to FromDataFrameNetwork()
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  ## The immediately-invoked wrapper gives `g` a private row counter `i`;
  ## its value (the function `g`) is what ends up in `edgemaker`
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            # a child is either a subtree (use its label) or a leaf value
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)  # one parent -> child row
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

將句子丟入服務

取得coreNLP回傳的物件

gc() # run garbage collection to release memory that is no longer in use

##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 2028121 108.4    4067827 217.3  4067827 217.3
## Vcells 5337964  40.8   10146329  77.5 10146306  77.5

t0 <- Sys.time()
# coreNLP() needs a data.frame that has a `text` column. The two columns are
# picked by name (status_id and text) rather than by position, so a different
# column order in `df` cannot silently feed the wrong columns to the server.
# Tweets whose cleaned text is empty are skipped.
obj <- df[, c("status_id", "text")] %>%
  filter(text != "") %>%
  coreNLP(host) # annotated by the CoreNLP server at `host`

Sys.time() - t0 # elapsed time

## Time difference of 17.44677 mins

#Time difference of 28 mins

# Save the whole workspace so the (slow) CoreNLP results can be reloaded
# without querying the server again.
# NOTE(review): the path is user-specific; adjust it before running elsewhere.
save.image("~/M094020053/text_analysis/hw06/train03/coreNLP_all.RData")

1-3 提取結果

(1). 斷詞、詞彙還原、詞性標註、NER

# Token-level table (word, lemma, POS tag, NER tag) for all annotated tweets
tokens <- coreNLP_tokens_parser(obj)
head(tokens, n = 20)

##              status_id                   word                  lemma pos
## 1  1381991774281867267                     At                     at  IN
## 2  1381991774281867267                  least                  least JJS
## 3  1381991774281867267                     18                     18  CD
## 4  1381991774281867267                 killed                   kill VBD
## 5  1381991774281867267                   inon                   inon  NN
## 6  1381991774281867267              bloodiest              bloodiest JJS
## 7  1381991774281867267                    day                    day  NN
## 8  1381991774281867267                     of                     of  IN
## 9  1381991774281867267               protests                protest NNS
## 10 1381991774281867267                against                against  IN
## 11 1381991774281867267                   coup                   coup  NN
## 12 1381991774281867267                    ...                    ...   :
## 13 1381991774281867267                 Follow                 follow  VB
## 14 1381988204530794499                   coup                   coup  NN
## 15 1381988204530794499            latestquiet            latestquiet  NN
## 16 1381988204530794499                     as                     as  IN
## 17 1381988204530794499                 people                 people NNS
## 18 1381988204530794499                boycott                boycott VBP
## 19 1381988204530794499                  water                  water  NN
## 20 1381988204530794499 festivalAdministrative festivaladministrative  JJ
##         ner
## 1         O
## 2         O
## 3    NUMBER
## 4         O
## 5         O
## 6         O
## 7  DURATION
## 8         O
## 9         O
## 10        O
## 11        O
## 12        O
## 13        O
## 14        O
## 15        O
## 16        O
## 17        O
## 18        O
## 19        O
## 20        O

(2). 命名實體標註(NER)

從NER查看特定類型的實體，辨識出哪幾種類型

# List the distinct NER categories that occur in the tokens
unique(tokens[["ner"]])

##  [1] "O"                 "NUMBER"            "DURATION"         
##  [4] "ORGANIZATION"      "LOCATION"          "COUNTRY"          
##  [7] "DATE"              "CAUSE_OF_DEATH"    "TITLE"            
## [10] "SET"               "PERSON"            "ORDINAL"          
## [13] "CITY"              "TIME"              "NATIONALITY"      
## [16] "IDEOLOGY"          "MISC"              "CRIMINAL_CHARGE"  
## [19] "STATE_OR_PROVINCE" "RELIGION"          "MONEY"            
## [22] "URL"

# Number of distinct words carrying an entity tag other than "O" (Other)
with(tokens, length(unique(word[ner != "O"])))

## [1] 1205

(3). 轉小寫

因為大小寫也會影響corenlp對NER的判斷，因此我們一開始給的推文內容是沒有處理大小寫的，但在跑完annotator後，為了正確計算詞頻，創建新欄位lower_word與lower_lemma，存放轉換小寫的word與lemma。轉成小寫的目的是要將不同大小寫的同一字詞（如Evergiven與evergiven）都換成小寫，再來計算詞頻。

# Lower-cased copies of word and lemma, used for frequency counts
tokens[["lower_word"]] <- tolower(tokens[["word"]])
tokens[["lower_lemma"]] <- tolower(tokens[["lemma"]])

1.4 探索分析 - NER

涉及到的國家(COUNTRY)

我們可以透過coreNLP中的NER解析出在Twitter上面談論緬甸政變，所涉及到的國家(COUNTRY)，以初步了解這個議題的主要國家。

# Bar chart of the most frequent words tagged COUNTRY
tokens %>%
  filter(ner == "COUNTRY") %>%
  count(lower_word, name = "count") %>% # occurrences per lower-cased word
  top_n(n = 13, count) %>%
  mutate(word = reorder(lower_word, count)) %>% # order bars by frequency
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text = element_text(size = 14)) +
  coord_flip()

關於緬甸所涉及的國家前三名分別是中國、英國、俄羅斯。
中國和俄羅斯都跟緬甸武裝部隊關係密切，他們分別是緬甸軍火的第一大和第二大供應者。
英國(uk)與歐盟向聯合國人權委員會提出草案，要求譴責緬甸軍事政變。
緬甸(burma) vs 緬甸(Myanmar):Burma具有種族主義色彩，只代表緬甸境內佔多數的緬族(Burman)，故改成Myanmar就不會有疑慮。

涉及到的組織(ORGANIZATION)

我們可以透過coreNLP中的NER解析出在Twitter上面談論緬甸政變，所涉及到的組織(ORGANIZATION)，以初步了解這個議題的主要公司/單位。

# Bar chart of the most frequent words tagged ORGANIZATION
tokens %>%
  filter(ner == "ORGANIZATION") %>%
  count(lower_word, name = "count") %>% # occurrences per lower-cased word
  top_n(n = 10, count) %>%
  mutate(word = reorder(lower_word, count)) %>% # order bars by frequency
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text = element_text(size = 14)) +
  coord_flip()

- 關於緬甸所涉及的組織可以看到有 un、military。
- 國際間對於緬甸安全部隊殺害手無寸鐵民主示威者的憤怒日益增長。聯合國人權理事會通過決議案，對緬甸政變發出警訊，並推動在緬甸設立聯合國的人權辦公室，但仍無法制止流血鎮壓。
- Tatmadaw:緬甸國防軍。
- ASEAN:東南亞國家協會。
- 美國有線電視新聞網CNN，是緬甸政變後，第一家進入仰光採訪的外媒。
- Reuters:路透社，是位列世界前三的多媒體新聞通訊社。

涉及到的人物(PERSON)

我們可以透過coreNLP中的NER解析出在Twitter上面談論緬甸政變，所涉及到的人物(PERSON)，以初步了解這個議題的主要人物。

# Bar chart of the most frequent words tagged PERSON
tokens %>%
  filter(ner == "PERSON") %>%
  count(lower_word, name = "count") %>% # occurrences per lower-cased word
  top_n(n = 10, count) %>%
  mutate(word = reorder(lower_word, count)) %>% # order bars by frequency
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text = element_text(size = 14)) +
  coord_flip()

- Aung San Suu Kyi:翁山蘇姬。
- Paing Takhon:模特兒白德宏[最帥的和尚]。
- Min Aung Hlaing:敏昂萊[部隊總司令]。
- Kyaw Zwar Minn:緬甸駐英國大使。
- sagaing : 實皆[緬甸的都市]。
- 4月8日，軍方發出逮捕令，涉及100多位反政變且具有影響力的人士。

涉及到的地點(LOCATION)

我們可以透過coreNLP中的NER解析出在Twitter上面談論緬甸政變，所涉及到的地點(LOCATION)，以初步了解這個議題的主要地點。

# Bar chart of the most frequent words tagged LOCATION
tokens %>%
  filter(ner == "LOCATION") %>%
  count(lower_word, name = "count") %>% # occurrences per lower-cased word
  top_n(n = 5, count) %>%
  mutate(word = reorder(lower_word, count)) %>% # order bars by frequency
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is LOCATION)") +
  theme(text = element_text(size = 14)) +
  coord_flip()

- yangon :仰光 [發生衝突的地點]。
- Salween River:薩爾溫江[越過河流邊境逃到泰國尋求庇護]。

1.5 探索分析 - Dependency

語句依存關係結果

# Dependency edges (relation, governor, dependent) for all annotated tweets
dependencies <- coreNLP_dependency_parser(obj)
head(dependencies, n = 20)

##              status_id       dep governor governorGloss dependent
## 1  1381991774281867267      ROOT        0          ROOT         4
## 2  1381991774281867267    advmod        4        killed         1
## 3  1381991774281867267     fixed        1            At         2
## 4  1381991774281867267     nsubj        4        killed         3
## 5  1381991774281867267      iobj        4        killed         5
## 6  1381991774281867267      amod        7           day         6
## 7  1381991774281867267       dep        5          inon         7
## 8  1381991774281867267      case        9      protests         8
## 9  1381991774281867267      nmod        7           day         9
## 10 1381991774281867267      case       11          coup        10
## 11 1381991774281867267       obl        4        killed        11
## 12 1381991774281867267     punct        4        killed        12
## 13 1381991774281867267 parataxis        4        killed        13
## 14 1381988204530794499      ROOT        0          ROOT         5
## 15 1381988204530794499  compound        2   latestquiet         1
## 16 1381988204530794499     nsubj        5       boycott         2
## 17 1381988204530794499      case        4        people         3
## 18 1381988204530794499      nmod        2   latestquiet         4
## 19 1381988204530794499  compound        8       offices         6
## 20 1381988204530794499      amod        8       offices         7
##            dependentGloss
## 1                  killed
## 2                      At
## 3                   least
## 4                      18
## 5                    inon
## 6               bloodiest
## 7                     day
## 8                      of
## 9                protests
## 10                against
## 11                   coup
## 12                    ...
## 13                 Follow
## 14                boycott
## 15                   coup
## 16            latestquiet
## 17                     as
## 18                 people
## 19                  water
## 20 festivalAdministrative

視覺化 Dependency tree

# Take the bracketed parse string of one example document (the 113th result).
# NOTE(review): doc[[1]][[1]] presumably addresses the first sentence of the
# response (the parser functions read it as doc$sentences[[1]]) - confirm.
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
# Convert it to a tree, style the nodes as filled rounded boxes and draw it
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

詞性相依樹

1.6 探索分析 - Sentiment

語句情緒值

情緒分數從最低分0~最高分4
+ 0,1 : very negative,negative
+ 2 : neutral
+ 3,4 : positive,very positive

# One sentiment label and score per annotated tweet
sentiment <- coreNLP_sentiment_parser(obj)
head(sentiment, n = 20)

##              status_id
## 1  1381991774281867267
## 2  1381988204530794499
## 3  1381987697913339904
## 4  1381986777821368323
## 5  1380451307794800645
## 6  1381155905064996866
## 7  1380815492558938113
## 8  1380755492205514754
## 9  1381986003066441731
## 10 1381985938482417674
## 11 1381985624459202560
## 12 1381985045276094475
## 13 1381984431410311169
## 14 1381590953509613571
## 15 1381983864562720773
## 16 1381983636958760963
## 17 1381983614015922176
## 18 1381983295387262986
## 19 1381982799092011008
## 20 1381982107279458311
##                                                                                                                                                                                                                                 text
## 1                                                                                                                                                          At least 18 killed inon bloodiest day of protests against coup ... Follow
## 2                                                                                                         coup latestquiet as people boycott water festivalAdministrative offices set ablazeenvoy 'remains ready' to meet junta. via
## 3                                                                                                         coup latestquiet as people boycott water festivalAdministrative offices set ablazeenvoy 'remains ready' to meet junta. via
## 4                                                                                                                       Filmed this clip inon the first day oflast yearit was COVID19Ramadan we called but this yearCoupRamadan in .
## 5  Fifteen ambassadors to  including Australia Canada European Union  members of EU New Zealand Korea U.S. and UK released a statement today Apr 9 calling the coup regime to end violence release political prisoners and restore .
## 6                                                                                                                  General Strike inin defiance of the military coup. Strikes continued every day acrossundeterred by dailymassacre.
## 7                                                                   As a symbol ofanticoup protest movement a Karen boy waving The Karen Flag over the Salween River with threefingers salute sign. Powerful image by . ELECTED CRPH
## 8                    Veteran actor and director Aung Lwin85 one of founders of thein 1988 was detained by the military on Friday his family member posted on social media as junta's target on anticoup celebrities is underway in .
## 9                                                                                                         coup latestquiet as people boycott water festivalAdministrative offices set ablazeenvoy 'remains ready' to meet junta. via
## 10                                                                                                                 Our people need to get information real information because the military spread out fake news on their own media.
## 11                                                                                                        coup latestquiet as people boycott water festivalAdministrative offices set ablazeenvoy 'remains ready' to meet junta. via
## 12                        Silent protest against the military coup acrosson first day of Thingyan festival. At least 710 including 50 children were killed during security forces' violence on protests since Feb rights groups say.
## 13                                                                                                                                  This parent leave their 3 children by force. Beause they were shot dead by coup in this morning.
## 14                                                                        Death troll instarted from Feb 9 and continued until today. Now the coup said 'strengthen multiparty democracy'. The world can see how shameless the coup.
## 15                                                                                                        coup latestquiet as people boycott water festivalAdministrative offices set ablazeenvoy 'remains ready' to meet junta. via
## 16                                                                                                        coup latestquiet as people boycott water festivalAdministrative offices set ablazeenvoy 'remains ready' to meet junta. via
## 17                                                                                                        coup latestquiet as people boycott water festivalAdministrative offices set ablazeenvoy 'remains ready' to meet junta. via
## 18                                                                                                        coup latestquiet as people boycott water festivalAdministrative offices set ablazeenvoy 'remains ready' to meet junta. via
## 19                                                                                                        coup latestquiet as people boycott water festivalAdministrative offices set ablazeenvoy 'remains ready' to meet junta. via
## 20                 Did the country director of an international charity really call the newspontaneoussystem a 'menagerie'? So what youve got is just this kind of menagerie of private clinics or newly croppedup health facilities
##    sentiment sentimentValue
## 1    Neutral              2
## 2    Neutral              2
## 3    Neutral              2
## 4    Neutral              2
## 5   Positive              3
## 6    Neutral              2
## 7    Neutral              2
## 8   Positive              3
## 9    Neutral              2
## 10  Negative              1
## 11   Neutral              2
## 12  Negative              1
## 13  Negative              1
## 14  Negative              1
## 15   Neutral              2
## 16   Neutral              2
## 17   Neutral              2
## 18   Neutral              2
## 19   Neutral              2
## 20   Neutral              2

資料集中的情緒種類

# Distinct sentiment labels present in the data
unique(sentiment[["sentiment"]])

## [1] "Neutral"      "Positive"     "Negative"     "Verynegative"

# The score arrives as text; convert it to a number for averaging
sentiment$sentimentValue <- as.numeric(sentiment$sentimentValue)
# Distribution of tweets across the sentiment labels
sentiment$sentiment %>% table()

## .
##     Negative      Neutral     Positive Verynegative 
##         1307         2913          420            1

我們可以看到中立與負面的情緒是最多的。

# Daily trend of the average CoreNLP sentiment score

df$date <- as.Date(df$created_at)

sentiment %>%
  # attach each tweet's date; the join key is spelled out instead of relying
  # on whichever column names the two tables happen to share
  merge(df[, c("status_id", "source", "date")], by = "status_id") %>%
  group_by(date) %>%
  summarise(avg_sentiment = mean(sentimentValue, na.rm = TRUE)) %>%
  ggplot(aes(x = date, y = avg_sentiment)) +
  # red reference line at 2, the score of the Neutral label
  geom_hline(yintercept = 2, col = "red", size = 1) +
  scale_x_date(labels = date_format("%m/%d")) +
  geom_line()

- 從圖中可以發現，平均情緒分數低於中立值2(紅線)，表示負面情緒多於正面情緒(依上方各情緒類別的筆數計算，整體平均約為1.8)。
- 4/10 印緬邊境的實皆省塔木鎮在10日就有至少18名士兵被殺。

在正面情緒下，所使用的文章詞彙為何?

# Word cloud of the lemmas used in tweets labelled positive
sentiment %>%
  merge(tokens, by = "status_id") %>%
  # match stop words against the lower-cased word so capitalised forms
  # ("The", "And", ...) are removed as well
  anti_join(stop_words, by = c("lower_word" = "word")) %>%
  filter(!lower_word %in% c("i", "the")) %>%
  filter(sentiment %in% c("Verypositive", "Positive")) %>%
  group_by(lower_lemma) %>% # one row per lemma
  summarize(count = n()) %>%
  # drop rare lemmas and the few extremely frequent ones
  filter(count > 5 & count < 400) %>%
  wordcloud2()

1988:1988年緬甸一場爭取民主的大規模民眾運動。
500:緬甸軍政府血腥鎮壓下已有逾500平民遇害。
actor、celebrity: 推測與緬甸軍政府通緝會「煽動民眾示威」的名人有關。

在負面情緒下，所使用的文章詞彙為何?

# Word cloud of the lemmas used in tweets labelled negative
sentiment %>%
  merge(tokens, by = "status_id") %>%
  # match stop words against the lower-cased word so capitalised forms
  # ("The", "And", ...) are removed as well
  anti_join(stop_words, by = c("lower_word" = "word")) %>%
  filter(!lower_word %in% c("i", "the")) %>%
  filter(sentiment %in% c("Verynegative", "Negative")) %>%
  group_by(lower_lemma) %>% # one row per lemma
  summarize(count = n()) %>%
  # drop rare lemmas and the few extremely frequent ones
  filter(count > 10 & count < 400) %>%
  wordcloud2()

- crackdown:鎮壓、blood:血、coup:政變、fight:戰鬥
- egg:復活節彩蛋成為抗暴象徵。
- 396:去年11月大選是軍方將統治轉移至民選政府後的第2次大選，翁山蘇姬的全國民主聯盟贏得壓倒性勝利，在476席的國會中拿下396席。
- internet: 緬甸軍方不僅封鎖臉書、推特、Instagram，甚至關閉網際網路服務。

2. Sentimentr 英文情緒分析

2.1 使用twitter資料實踐在sentimentr

計算tweet中屬於正面的字

library(sentimentr)

set.seed(10)
# NOTE(review): `mytext` is not used again in the visible part of this report
mytext <- get_sentences(tweets$text) # split every tweet into sentences
x <- sample(tweets$text, 1000, replace = TRUE) # draw 1000 tweets at random WITH replacement (replace = TRUE)
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # per-word counts stored in the "counts" attribute
sentiment_counts[polarity > 0,]   # words with positive polarity

##            words polarity  n
##   1:      please      1.0 23
##   2:       truth      1.0 19
##   3:     justice      1.0 12
##   4:       honor      1.0  9
##   5:      accept      1.0  4
##  ---                        
## 240:  committing      0.1  1
## 241:     markets      0.1  1
## 242:    building      0.1  1
## 243: fundamental      0.1  1
## 244:     reading      0.1  1

justice、truth 等正面的詞

計算tweet中屬於負面的字

# The ten most frequent words with negative polarity. The ranking column `n`
# is named explicitly instead of letting top_n() fall back to the last column.
sentiment_counts[polarity < 0, ] %>%
  arrange(desc(n)) %>%
  top_n(10, n)

## Selecting by n

##          words polarity   n
##  1:     strike    -0.75 369
##  2:   defiance    -0.50 348
##  3:   massacre    -0.50 345
##  4:      junta    -0.25 148
##  5:      death    -0.75 117
##  6: protesters    -0.60 111
##  7:     killed    -0.50  55
##  8:    boycott    -0.75  53
##  9:   arrested    -0.50  43
## 10:    protest    -0.50  43

strike、defiance(蔑視)、massacre(屠殺) 等負面的詞

highlight每個句子，判斷屬於正/負面

set.seed(12)
# Score 30 randomly chosen tweets sentence by sentence, aggregate per tweet,
# and open the highlighted HTML report
df %>%
  filter(status_id %in% sample(unique(status_id), 30)) %>%
  mutate(review = get_sentences(text)) %>%
  with(sentiment_by(review, status_id)) %>%
  highlight()

## Saved in C:\Users\User\AppData\Local\Temp\Rtmpc5S8u3/polarity.html

## Opening C:\Users\User\AppData\Local\Temp\Rtmpc5S8u3/polarity.html ...

2.2 依照日期來了解情緒波動的變化

code 參考 https://github.com/trinker/sentimentr

library(lubridate)
# Inspect the type and first values of the tweet timestamps
str(tweets$created_at)

##  POSIXct[1:4649], format: "2021-04-13 15:24:49" "2021-04-13 15:10:38" "2021-04-13 15:08:37" ...

# Calendar day of every tweet as a "YYYY-MM-DD" string
tweets$date <- format(tweets$created_at, "%Y-%m-%d")

# Average sentimentr score per day, computed over the sentences of all tweets
# posted on that day
out <- with(tweets, sentiment_by(get_sentences(text), list(date)))
out

##          date word_count        sd ave_sentiment
## 1: 2021-04-06       9869 0.2497680  -0.096801182
## 2: 2021-04-07       6529 0.2937578  -0.135508088
## 3: 2021-04-08      15349 0.2797126  -0.004509342
## 4: 2021-04-09       9653 0.3994905  -0.057547799
## 5: 2021-04-10      12844 0.3398882  -0.201701810
## 6: 2021-04-11      34413 0.1332748  -0.196153431
## 7: 2021-04-12       9021 0.3330784  -0.205323133
## 8: 2021-04-13       8788 0.2236949  -0.136493965

# Convert the day strings to Date values, then plot the daily result
out$date <- as.Date(out$date)

plot(out)

總結

緬甸軍政在2月1日夜裡發動政變，將緬甸的重要人物(翁山蘇姬等)進行軟禁，引起大家的恐慌，緬甸民眾大舉走上街頭示威抗議，軍方以鐵腕血腥鎮壓回應。根據緬甸「政治犯援助協會」統計，2月1日迄今，死於軍方血腥鎮壓的民眾已超過600人，各城鎮亦已實施戒嚴。儘管國際社會譴責聲浪不斷，但緬甸軍方有中國與俄羅斯全力支持，東南亞國協（ASEAN）也不願採取實際行動，緬甸民眾只能繼續以血肉之軀拚搏。從上圖可以看到在4/8可以看到情緒是相較偏負面推測可能原因是仰光一間中資服裝工廠在4月7日被示威者放火，同時也有遊行人士舉起杯葛中國品牌「小米」、「華為」的標語。軍方同日亦攻擊示威者佔據的「塔漢示威營」，造成 11 人死亡，故負面的分數較高。

整體分析的結果可以發現在twitter上負面的情緒比較多，也可以在文字雲看到負面的詞(反軍政、鎮壓、血等)，可以推測大家討論的觀點都是偏負面的。