社群媒體分析-CoreNLP中文試做

packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "data.tree", "ggplot2",'readr','data.table','reshape2','wordcloud','tidyr','scales')
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(request)
library(httr)
library(network)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(data.tree)
require(jiebaR)

load("coreNLP.RData")

1.1 載入中文資料

來源:大數據平台，PTT八卦版，w8教材中有附
關鍵字:水庫

#中文資料前處理，過濾特殊字元
water <- read_csv("./ptt_gos_water_articleMetaData.csv") %>% 
  mutate(sentence=gsub("[\n]{2,}", "。", sentence)) %>% 
  mutate(sentence=gsub("\n", "", sentence)) %>% 
  mutate(sentence=gsub("[ \t]{2,}", "", sentence)) %>% 
  mutate(sentence=gsub("^\\s+|\\s+$","", sentence)) %>% 
  mutate(sentence=gsub("http(s)?[-:\\/A-Za-z0-9\\.]+", " ", sentence)) %>% 
  mutate(sentence=gsub("[0-9|~|%|=|\\|A-Za-z|／|\"]", "", sentence)) %>% 
  mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除", "", sentence))

## Parsed with column specification:
## cols(
##   artTitle = col_character(),
##   artDate = col_date(format = ""),
##   artTime = col_time(format = ""),
##   artUrl = col_character(),
##   artPoster = col_character(),
##   artCat = col_character(),
##   commentNum = col_double(),
##   push = col_double(),
##   boo = col_double(),
##   sentence = col_character()
## )

head(water,20)

## # A tibble: 20 x 10
##    artTitle artDate    artTime  artUrl artPoster artCat commentNum  push   boo
##    <chr>    <date>     <time>   <chr>  <chr>     <chr>       <dbl> <dbl> <dbl>
##  1 [問卦]台灣水… 2021-03-01 13:45:31 https… wan5389   Gossi…         27    15     2
##  2 [新聞]水情告… 2021-03-02 03:12:58 https… A80211ab  Gossi…         72    33     2
##  3 [新聞]水情再… 2021-03-04 02:53:32 https… shanggua… Gossi…        102    56     6
##  4 Re:[新聞]… 2021-03-04 03:08:26 https… sunyeah   Gossi…          3     1     0
##  5 Re:[新聞]… 2021-03-04 03:53:27 https… ROCisChi… Gossi…         10     4     1
##  6 [問卦]水庫見… 2021-03-04 09:23:18 https… XXXXXXXX… Gossi…         41    14     2
##  7 [問卦]台灣根… 2021-03-04 09:32:40 https… KennethC  Gossi…         26    17     3
##  8 Re:[新聞]… 2021-03-04 09:34:00 https… ray19877… Gossi…         14     6     0
##  9 Re:[問卦]… 2021-03-04 09:49:07 https… alwang    Gossi…          4     2     0
## 10 [問卦]寶山水… 2021-03-04 11:03:49 https… terumika… Gossi…         15     6     1
## 11 Re:[問卦]… 2021-03-04 15:26:29 https… leo0873   Gossi…          2     0     1
## 12 [新聞]南化水… 2021-03-04 15:26:47 https… jwph      Gossi…         27    11    11
## 13 [問卦]台積電… 2021-03-05 04:48:10 https… ich2      Gossi…         25     8     4
## 14 [新聞]水情吃… 2021-03-05 06:21:50 https… Emerson1… Gossi…       1160   635   211
## 15 Re:[新聞]… 2021-03-05 06:42:13 https… alwang    Gossi…          9     4     1
## 16 [問卦]為什麼… 2021-03-05 07:07:40 https… KennethC  Gossi…         31     8     5
## 17 Re:[新聞]… 2021-03-05 07:32:57 https… hugh509   Gossi…        456   282    39
## 18 [問卦]台灣水… 2021-03-05 08:19:46 https… cwh0105   Gossi…         18     7     1
## 19 Re:[新聞]… 2021-03-05 09:55:32 https… kid725    Gossi…         26    11     2
## 20 Re:[新聞]… 2021-03-05 10:06:24 https… tml7415   Gossi…         13     8     1
## # … with 1 more variable: sentence <chr>

data = water %>% #取需要的資料欄位
  select(sentence,artUrl)
data = data[1:500,] # 選取較少的資料筆數，減少運算負荷
colnames(data)[1] = "text"
data = data.frame(data)
head(data)

##                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                          text
## 1                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                    魯蛇家鄉最近停水好幾天引起民怨！就查了最近的水情資料！ 發現最近新竹以南水情不是很樂觀！卻沒有中央官員出來呼籲要怎麼解決缺水問題！反觀鳳梨被中國禁止進口， 中央從總統到行政院長各級長官，都站出來呼籲解決鳳梨問題！。所以現在臺灣鳳梨比臺灣缺水誰重要？。沒有喝水就喝鳳梨汁是不錯的解決方案吧？是否有專板本板並非萬能問板兩則本看板嚴格禁止政治問卦未滿繁體中文字水桶個月，嚴重者以鬧板論
## 2 新頭殼。顏得智。水情告急！新竹以南水庫蓄水率陷保衛戰 春季雨量展望暫不樂觀。水情告急！嘉義、台南地區於月日，水情燈號調整為減量供水的橙燈，截止至月日早上時，新竹以南已有座主要水庫，蓄水率逼近，新竹科學園區多間科技大廠也啟用水車載水確保產能，然而，根據中央氣象局上周公布的春季氣候展望，降雨的部分照目前預測來看為偏少到正常，水情仍不樂觀。。截止至月日早上時，新竹寶二水庫蓄水量.、苗栗永和山水庫.、苗栗明德水庫.、鯉魚潭水庫.、台中德基水庫.。新竹、苗栗、台中五座提供民生用水、農溉用水、工業用水的水庫，蓄水率皆逐漸逼近，此外，霧社水庫蓄水率、曾文水庫也僅有.。。除蓄水率陷入保衛戰的座水庫外，包含日月潭水庫、湖山水庫、仁義潭水庫、南化水庫、烏山頭水庫等雲、嘉、南、彰、投地區主要水庫，蓄水率大多也來到甚至以下。。按照中央氣象局月日所發布的春季氣候展望，預估未來一季的氣溫接近正常，針對雨量部分，春雨預估為偏少到正常，缺水狀況短期難以立即改善，民眾仍須節約用水。 中南部肥宅動起來節約用水阿!!還看路邊一堆人在洗車。#首先重複用不要水洗
## 3                                                                                        。.:聯合新聞網.:記者王昭月高雄即時報導.:水情再趨嚴峻 高雄今天凌晨時起暫停支援台南用水.:南部地區水情再趨嚴峻，高雄繼月日執行第一階段夜間減壓供水後，今天凌晨零時起，再停止送水台南，台灣自來水公司第七區管理處表示，目前供應高雄民生用水仍無問題，不過已無多餘的能力可支援台南用水，要視高屏溪川流量有無回升，再做浮動調整。。據南部水資源局高屏溪攔河堰水情監測，今早點川流量為.( 每秒立方公尺)，相較於上周點多，又再下探。。台水公司第七區管理處今早證實，由於高屏溪川流量漸少，在取得經濟部水利署同意下，今天凌晨起暫停支援台南用水。。大高雄地區進入乾旱期，高屏溪攔河堰日來平均流量約，創營運年來最低量，先前高屏溪原水充足，高雄每天可北送支援台南用水，但今年水情嚴峻，支援水遞減到每日萬噸，今天凌晨起則暫停北送。。七區處副處長徐志宏表示，高雄暫時還未討論到二階限水，但水情的確不佳，須未雨綢繆，只能期待早日降雨，挹注高屏溪川流量，再做水情調控。. (): :
## 4                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                        幹拎老師台北宜蘭一直下雨結果出了北區結界到桃園直接停水每到周末就一直下雨連預報不會下雨也會下雨。科學園區買的水車應該都是來翡翠水庫載水的吧濕透了這裡
## 5                                                                                                                                                                                                                                                                                                                                                                                                                                                                   這時候就要嘴一些北漂仔一兩個月前台北連日下雨時就該該叫不然就一入冬就嘴台北濕冷啦 發霉啦不習慣台北天氣想回南部啦 麻煩看一下。全台能打的水庫沒幾座翡翠水已經送去桃園 桃園水又拿去支援竹科了目前估計北台灣是供給到五月份都沒問題 剛好銜接梅雨季。不下雨北漂仔是要喝洨度日?台灣就三種雨 梅雨 颱風 東北季風梅雨和颱風集中春夏 但近年颱風銳減 去年幾乎沒放颱風假所以梅雨後 意思你就要一路撐到年尾等冬雨沒下冬雨的話 幾乎要等到隔年梅雨北漂仔是想靠梅雨稱一整年是吧?。謝謝你喜歡南部沒下雨的冬天
## 6                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                       水庫見底了 一堆人不去檢討為什麼用水。用的這麼厲害 反而一堆鄉民。一副很會 跳出來說 要清淤泥。你以為淤泥你說清就清喔 。
##                                                     artUrl
## 1 https://www.ptt.cc/bbs/Gossiping/M.1614606334.A.2DD.html
## 2 https://www.ptt.cc/bbs/Gossiping/M.1614654780.A.7C6.html
## 3 https://www.ptt.cc/bbs/Gossiping/M.1614826416.A.4D7.html
## 4 https://www.ptt.cc/bbs/Gossiping/M.1614827308.A.391.html
## 5 https://www.ptt.cc/bbs/Gossiping/M.1614830010.A.444.html
## 6 https://www.ptt.cc/bbs/Gossiping/M.1614849800.A.415.html

1.2 CoreNLP服務設定

(1). API呼叫的設定

server端 : + 需先在terminal開啟corenlp server + 在corenlp的路徑下開啟terminal輸入 java -Xmx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -serverProperties StanfordCoreNLP-chinese.properties -port 9001 -timeout 15000

# 產生coreNLP的api url，將本地端的網址轉成符合coreNLP服務的url
generate_API_url <- function(host, port="9000",
                    tokenize.whitespace="false", annotators=""){ #斷詞依據不是空格
  
    url <- sprintf('http://%s:%s/?properties={"pipelineLanguage":"zh", "annotators":"tokenize, ssplit, pos,ner,parse"}', host, port, tokenize.whitespace, annotators)
    url <- URLencode(url)
}
#指定服務的位置
host = "127.0.0.1"

generate_API_url(host)

# 呼叫coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="zh",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","ner","parse")){
  # 假設有兩個core-nlp server、一個負責英文（使用9000 port）、另一個則負責中文（使用9001 port）
  port <- ifelse(language=="eng", 9000, 9001);
  # 產生api網址
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  result <-POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}

#文件使用coreNLP服務
coreNLP <- function(data,host){
  # 依序將每個文件丟進core-nlp進行處理，每份文件的回傳結果為json格式
  # 在R中使用objects來儲存處理結果
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}

(2). 資料整理function

從回傳的object中整理斷詞出結果，輸出為 tidydata 格式

coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for a sentences
    sentences <- doc$sentences
   
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}

從回傳的core-nlp object中整理出詞彙依存關係，輸出為 tidydata 格式

coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for a sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}

圖形化 Dependency tree

以樹狀圖方式呈現，較能看出關係程式參考來源：https://stackoverflow.com/questions/35496560/how-to-convert-corenlp-generated-parse-tree-into-data-tree-r-package

# 圖形化顯示dependency結果
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

將句子丟入服務

取得coreNLP回傳的物件

gc() #釋放不使用的記憶體

t0 = Sys.time()
obj = data[1]  %>%  coreNLP(host) #丟入本地執行
#丟入coreNLP的物件 必須符合: 是一個data.frame 有一個text欄位

Sys.time() - t0 #執行時間

save.image("coreNLP.RData")

1.3 提取結果

(1). 斷詞、詞彙還原、詞性標註、NER

tokens = coreNLP_tokens_parser(obj)

head(tokens,40)

##        word pos  ner
## 1  魯蛇家鄉  VV    O
## 2      最近  NT DATE
## 3      停水  NN    O
## 4        好  AD    O
## 5      幾天  AD    O
## 6      引起  VV    O
## 7      民怨  NN    O
## 8        ！  PU    O
## 9        新  JJ    O
## 10     頭殼  NN    O
## 11       。  PU    O
## 12       。  PU    O
## 13     幹拎  NN    O
## 14     老師  NN    O
## 15     台北  NR  GPE
## 16     宜蘭  NR    O
## 17     一直  AD    O
## 18     下雨  VV    O
## 19     結果  VV    O
## 20       出  VV    O
## 21       了  AS    O
## 22     北區  NR    O
## 23     結界  NN    O
## 24       到   P    O
## 25     桃園  NR    O
## 26     直接  AD    O
## 27     停水  VV    O
## 28       每  DT    O
## 29       到   P    O
## 30     周末  NT DATE
## 31       就  AD    O
## 32     一直  AD    O
## 33     下雨  VV    O
## 34   連預報  NN    O
## 35       不  AD    O
## 36       會  VV    O
## 37     下雨  VV    O
## 38       也  AD    O
## 39       會  VV    O
## 40     下雨  VV    O

coreNLP_tokens_parser欄位:
- word: 原始斷詞
- pos : part-of-speech,詞性
- ner: 命名實體

(2). 命名實體標註(NER)

從NER查看特定類型的實體，辨識出哪幾種類型

unique(tokens$ner)

##  [1] O                 DATE              GPE               NUMBER           
##  [5] MISC              PERSON            ORGANIZATION      LOCATION         
##  [9] CITY              DEMONYM           FACILITY          STATE_OR_PROVINCE
## [13] TIME              ORDINAL           TITLE             CAUSE_OF_DEATH   
## [17] IDEOLOGY         
## 17 Levels: O DATE GPE NUMBER MISC PERSON ORGANIZATION LOCATION CITY ... IDEOLOGY

#除去entity為Other，有多少種word有被標註entity
length(unique(tokens$word[tokens$ner != "O"]))

## [1] 273

1.4 探索分析 - NER

涉及到的地理位置(GPE)

tokens %>%
  filter(ner == "GPE") %>%  #篩選NER為GPE
  group_by(word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

涉及到的組織(ORGANIZATION)

我們可以透過coreNLP中的NER解析出在台灣缺水議題中，所涉及到的組織(ORGANIZATION)，以初步了解這個議題的主要公司/單位。

tokens %>%
  filter(ner == "ORGANIZATION") %>%  #篩選NER為ORGANIZATION
  group_by(word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 5, count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

涉及到的人物(PERSON)

我們可以透過coreNLP中的NER解析出台灣缺水議題中，所涉及到的人物(PERSON)，以初步了解這個議題的主要人物。

tokens %>%
  filter(ner == "PERSON") %>%  #篩選NER為PERSON
  group_by(word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 5, count) %>%
  ungroup() %>% 
  mutate(word = reorder(word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

1.5 探索分析 - Dependency

語句依存關係結果

dependencies = coreNLP_dependency_parser(obj)

head(dependencies,40)

##             dep governor governorGloss dependent dependentGloss
## 1          ROOT        0          ROOT         1       魯蛇家鄉
## 2     nmod:tmod        6          引起         2           最近
## 3         nsubj        6          引起         3           停水
## 4        advmod        6          引起         4             好
## 5        advmod        6          引起         5           幾天
## 6         ccomp        1      魯蛇家鄉         6           引起
## 7          dobj        6          引起         7           民怨
## 8         punct        1      魯蛇家鄉         8             ！
## 9          ROOT        0          ROOT         2           頭殼
## 10         amod        2          頭殼         1             新
## 11        punct        2          頭殼         3             。
## 12         ROOT        0          ROOT         1             。
## 13         ROOT        0          ROOT         6           下雨
## 14  compound:nn        2          老師         1           幹拎
## 15          dep        4          宜蘭         2           老師
## 16         name        4          宜蘭         3           台北
## 17        nsubj        6          下雨         4           宜蘭
## 18       advmod        6          下雨         5           一直
## 19          dep       15          停水         7           結果
## 20 advmod:rcomp        7          結果         8             出
## 21      aux:asp       15          停水         9             了
## 22  nmod:assmod       11          結界        10           北區
## 23        nsubj       15          停水        11           結界
## 24         case       13          桃園        12             到
## 25    nmod:prep       15          停水        13           桃園
## 26       advmod       15          停水        14           直接
## 27        ccomp        6          下雨        15           停水
## 28          dep       21          下雨        16             每
## 29         case       18          周末        17             到
## 30    nmod:prep       21          下雨        18           周末
## 31       advmod       21          下雨        19             就
## 32       advmod       21          下雨        20           一直
## 33        ccomp       15          停水        21           下雨
## 34         dobj       21          下雨        22         連預報
## 35          neg       24            會        23             不
## 36         conj       21          下雨        24             會
## 37  compound:vc       24            會        25           下雨
## 38       advmod       28          下雨        26             也
## 39  compound:vc       28          下雨        27             會
## 40         conj       21          下雨        28           下雨

視覺化 Dependency tree

parse_tree <- obj[[123]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:httr':
## 
##     content

## The following object is masked from 'package:ggplot2':
## 
##     annotate

## Loading required package: igraph

## 
## Attaching package: 'igraph'

## The following object is masked from 'package:tidyr':
## 
##     crossing

## The following objects are masked from 'package:network':
## 
##     %c%, %s%, add.edges, add.vertices, delete.edges, delete.vertices,
##     get.edge.attribute, get.edges, get.vertex.attribute, is.bipartite,
##     is.directed, list.edge.attributes, list.vertex.attributes,
##     set.edge.attribute, set.vertex.attribute

## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union

## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum

## The following object is masked from 'package:base':
## 
##     union

SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)