新疆棉在推特上的情緒分析

從推特上面抓有近期有#XinjiangCotton （新疆棉事件）的推特和推文下來進行分析

基礎環境安裝

packages = c("dplyr","ggplot2","rtweet" ,"xml2", "httr", "jsonlite", "data.tree", "NLP", "igraph","sentimentr","tidytext","wordcloud2","DiagrammeR","dplyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)
library(webshot)
library(htmlwidgets)

#load("coreNLP_all.RData")

導入twitterAPI

app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app,consumer_key, consumer_secret,
                    access_token, access_secret,set_renv = FALSE)
#Consumer Keys:知道你的身分
#Authentication Tokens:認證給你的授權

抓關鍵字：

# 查詢關鍵字
key = c("#XinjiangCotton")
#context = ""
#q = paste(c(key,context),collapse=" AND ")   
#抓5000筆 不抓轉推
tweets = search_tweets(key,lang="en",n=5000,include_rts = FALSE,token = twitter_token)

清理資料：

## 用於資料清理
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") #改變字的encoding
  txt = gsub("(@|#)\\w+", "", txt) #去除@或#後有數字,字母,底線 (標記人名或hashtag)
  txt = gsub("(http|https)://.*", "", txt) #去除網址(.:任意字元，*:0次以上)
  txt = gsub("[ \t]{2,}", "", txt) #去除兩個以上空格或tab
  txt = gsub("\\n"," ",txt) #去除換行
  txt = gsub("\\s+"," ",txt) #去除一個或多個空格(+:一次以上)
  txt = gsub("^\\s+|\\s+$","",txt) #去除開頭/結尾有一個或多個空格
  txt = gsub("&.*;","",txt) #去除html特殊字元編碼
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt) #除了字母,數字空白?!.的都去掉(表情符號去掉)
  txt }


tweets$text = clean(tweets$text)  #text套用資料清理

df = data.frame()
  
df = rbind(df,tweets)  # transfer to data frame

df = df[!duplicated(df[,"status_id"]),]  #去除重複的tweets

nrow(df)

## [1] 313

串coreNLP的API

# 產生coreNLP的api url，將本地端的網址轉成符合coreNLP服務的url
generate_API_url <- function(host, port="9000",
                    tokenize.whitespace="false", annotators=""){ #斷詞依據不是空格
    url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
    url <- URLencode(url)
}
#指定服務的位置
host = "127.0.0.1"

generate_API_url(host)

# 呼叫coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # 假設有兩個core-nlp server、一個負責英文（使用9000 port）、另一個則負責中文（使用9001 port）
  port <- ifelse(language=="eng", 9000, 9001);
  # 產生api網址
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}

#文件使用coreNLP服務
coreNLP <- function(data,host){
  # 依序將每個文件丟進core-nlp進行處理，每份文件的回傳結果為json格式
  # 在R中使用objects來儲存處理結果
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}

整理資料

coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for a sentences
    sentences <- doc$sentences
   
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}

coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for a sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}

coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for a sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    sentiment <- original_data %>%
      t() %>% 
      data.frame() %>% 
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
  
    sentiment
  }))
  return(result)
}

圖形化相依關係樹的公式

# 圖形化顯示dependency結果
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

執行

gc() #釋放不使用的記憶體

##           used  (Mb) gc trigger  (Mb) max used  (Mb)
## Ncells 1939463 103.6    3788595 202.4  2660078 142.1
## Vcells 3407514  26.0    8388608  64.0  5403089  41.3

t0 = Sys.time()
obj = df[,c(2,5)]  %>% filter(text != "") %>% coreNLP(host) #丟入本地執行
#丟入coreNLP的物件 必須符合: 是一個data.frame 有一個text欄位

Sys.time() - t0 #執行時間

## Time difference of 1.031911 mins

#Time difference of 28 mins

save.image("coreNLP_all.RData")

結果

呈現結果

tokens =  coreNLP_tokens_parser(obj)
head(tokens,20)

##              status_id       word      lemma pos               ner
## 1  1381607228751425543 Especially especially  RB                 O
## 2  1381607228751425543      those      those  DT                 O
## 3  1381607228751425543         in         in  IN                 O
## 4  1381526745447243782   Xinjiang   Xinjiang NNP STATE_OR_PROVINCE
## 5  1381526745447243782        has       have VBZ                 O
## 6  1381526745447243782       been         be VBN                 O
## 7  1381526745447243782         in         in  IN                 O
## 8  1381526745447243782        the        the  DT                 O
## 9  1381526745447243782       news       news  NN                 O
## 10 1381526745447243782          a          a  DT                 O
## 11 1381526745447243782        lot        lot  NN                 O
## 12 1381526745447243782    lately.    lately.  NN                 O
## 13 1381526745447243782         Is         be VBZ                 O
## 14 1381526745447243782         it         it PRP                 O
## 15 1381526745447243782       safe       safe  JJ                 O
## 16 1381526745447243782         to         to  TO                 O
## 17 1381526745447243782     travel     travel  VB                 O
## 18 1381526745447243782         in         in  IN                 O
## 19 1381526745447243782       this       this  DT                 O
## 20 1381526745447243782   troubled   troubled  JJ                 O

結果資料處理：去掉entity為O的（other）

unique(tokens$ner)

##  [1] "O"                 "STATE_OR_PROVINCE" "CITY"             
##  [4] "ORGANIZATION"      "NATIONALITY"       "COUNTRY"          
##  [7] "RELIGION"          "CRIMINAL_CHARGE"   "PERSON"           
## [10] "DATE"              "LOCATION"          "MISC"             
## [13] "NUMBER"            "TITLE"             "IDEOLOGY"         
## [16] "CAUSE_OF_DEATH"    "ORDINAL"           "SET"              
## [19] "TIME"              "DURATION"

length(unique(tokens$word[tokens$ner != "O"]))

## [1] 244

結果資料處理：轉成小寫方便計算

tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)

圖形化

列出此議題相關的國家

可以發現最多的前三分別是中國、美國和日本。和上次抓PTT的資料比起來有點不一樣。爆發衝突的另外一方『歐洲各國』完全沒有在前列

tokens %>%
  filter(ner == "COUNTRY") %>%  #篩選NER為COUNTRY
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 13, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

## 涉及此議題的組織：就中國共產黨和相關的企業、新聞媒體等

tokens %>%
  filter(ner == "ORGANIZATION") %>%  #篩選NER為ORGANIZATION
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

## 涉及此議題的人基本上還是品牌

tokens %>%
  filter(ner == "PERSON") %>%  #篩選NER為PERSON
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

## 相依性分析：處理相依性分析結果

dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)

##              status_id      dep governor governorGloss dependent dependentGloss
## 1  1381607228751425543     ROOT        0          ROOT         1     Especially
## 2  1381607228751425543      obl        1    Especially         2          those
## 3  1381607228751425543     case        2         those         3             in
## 4  1381526745447243782     ROOT        0          ROOT        12           safe
## 5  1381526745447243782    nsubj        6          news         1       Xinjiang
## 6  1381526745447243782      aux        6          news         2            has
## 7  1381526745447243782      cop        6          news         3           been
## 8  1381526745447243782     case        6          news         4             in
## 9  1381526745447243782      det        6          news         5            the
## 10 1381526745447243782      dep       12          safe         6           news
## 11 1381526745447243782      det        9       lately.         7              a
## 12 1381526745447243782 compound        9       lately.         8            lot
## 13 1381526745447243782      dep        6          news         9        lately.
## 14 1381526745447243782      cop       12          safe        10             Is
## 15 1381526745447243782    nsubj       12          safe        11             it
## 16 1381526745447243782     mark       14        travel        13             to
## 17 1381526745447243782      dep       12          safe        14         travel
## 18 1381526745447243782     case       18       region?        15             in
## 19 1381526745447243782      det       18       region?        16           this
## 20 1381526745447243782     amod       18       region?        17       troubled

length(obj)

## [1] 308

視覺化

parse_tree <- obj[[39]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:httr':
## 
##     content

## The following object is masked from 'package:ggplot2':
## 
##     annotate

## Loading required package: igraph

## Warning: package 'igraph' was built under R version 4.0.5

## 
## Attaching package: 'igraph'

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

情緒分析

計算情緒分數

sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)

##              status_id
## 1  1381607228751425543
## 2  1381526745447243782
## 3  1381521708163665921
## 4  1379743605087207432
## 5  1379744323051339776
## 6  1380766530976182273
## 7  1380039296166465540
## 8  1380439643548573698
## 9  1381499610552696835
## 10 1381478518798774279
## 11 1381476084420538369
## 12 1379717018321051654
## 13 1381466974878892036
## 14 1380082901883228160
## 15 1379481263053111302
## 16 1381459984324567041
## 17 1381437882854871042
## 18 1379774376304775169
## 19 1381433179689545731
## 20 1381402886018580484
##                                                                                                                                                                                                                                                     text
## 1                                                                                                                                                                                                                                    Especially those in
## 2                                                                                                                                               Xinjiang has been in the news a lot lately. Is it safe to travel in this troubled region? Find out here!
## 3                                                                                                                                                                                              Hong Kong is a very good example of OneChina consensus.  
## 4                                                                           The CCP is trying transfer its internal contradictions to Taiwan. The CCP is trying to seize Chinese people's properties now. It's like the same as culture revolution era. 
## 5                                                                           The CCP is trying transfer its internal contradictions to Taiwan. The CCP is trying to seize Chinese people's properties now. It's like the same as culture revolution era. 
## 6                                                                                                                                                                                      Instead of rule by law China is rule by a bunch of mafia thugs.  
## 7                                                                                                                                                                                                  We have to fight together against . Let's fight for .
## 8                                                                                                                                                                         If you will trust  it means that you will be finished soon.is not trustworthy.
## 9                                                                                                                                                                                                                                        is destroying .
## 10                                                                                                                                                                                                                                    The truth aboutand
## 11                  Don't insult our intelligence . BCI's complete collapse came solely from the lies they fabricated againstwhich they have yet to produce a shred of evidence. What makes u think they need to please anybody but their funders USAID?
## 12                                                                                                                                                                                                Petty  genocidal. Theis just the worst on every level.
## 13                                                                                                                                                                                                                    The dead eyed gelding should hush.
## 14                                                                                                                                                    Here it is A protester is a rioter. A Muslim is a terrorist. Truth is lies. Genocide is education.
## 15                                                                                                                                                                                 If thearrests doctors for telling the truth what chance do theb have?
## 16                                                                                                                          How disinformation of educational camps got started? How BBC commissioned Adrian Zenz to spin it? Check the research work by
## 17                                                     How can you engage in constructive dialogue with the government in a place where there is no civil society no freedom of association and no trade unions? An exILO official on Xinjiang conundrum
## 18                                                                                                                                                                                                                        Spring in Kashgar episode one.
## 19 excerpt from Kate Kui on Xinjiangmigrant workers in Xinjiang picking cotton in harvest season is like Amazon hiring temporary workers for peak Christmas orders. This is based on demand  supply  machine is used more as a much economic substitute.
## 20                                                                                                                                                                                      Support Global to execute the sanctions and Military action of .
##    sentiment sentimentValue
## 1    Neutral              2
## 2   Positive              3
## 3   Positive              3
## 4    Neutral              2
## 5    Neutral              2
## 6    Neutral              2
## 7   Negative              1
## 8    Neutral              2
## 9    Neutral              2
## 10   Neutral              2
## 11  Negative              1
## 12   Neutral              2
## 13  Negative              1
## 14  Positive              3
## 15   Neutral              2
## 16  Positive              3
## 17  Negative              1
## 18   Neutral              2
## 19   Neutral              2
## 20   Neutral              2

unique(sentiment$sentiment)

## [1] "Neutral"      "Positive"     "Negative"     "Verypositive"

sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric

sentiment$sentiment %>% table()

## .
##     Negative      Neutral     Positive Verypositive 
##           77          190           39            2

日期分析

其實近期已經過了議題的巔峰，可以發現情緒起起服服。

df$date = as.Date(df$created_at)

sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment)) + 
  geom_line()

sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% 
  group_by(date,source) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment,color=source)) + 
  geom_line()

## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.

#了解正面文章的詞彙使用
sentiment %>% 
  merge(tokens) %>% 
  #anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
  group_by(lower_lemma) %>% #根據lemma分組
  summarize(count = n()) %>% 
  filter(count >5 & count<400)%>%
  wordcloud2()

#了解負面文章的詞彙使用
a <- sentiment %>% 
  merge(tokens) %>% 
  #anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
  group_by(lower_lemma) %>% 
  summarize(count = n()) %>% 
  filter(count >10 &count<400)%>%
  wordcloud2()

my_graph <- a
saveWidget(my_graph, "temp.html", selfcontained = F)
webshot("temp.html", "wc1.png", delay = 5, vwidth = 500, vheight = 500)