Abstract
Using coreNLP and sentimentr to analyze Twitter text about the US-China trade war.
packages = c("dplyr","ggplot2","rtweet" ,"xml2", "httr", "jsonlite", "gutenbergr", "data.tree", "NLP", "igraph","sentimentr","tidytext","wordcloud2")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
Load the results computed earlier.
load("coreNLP2.RData")
app = 'social_media_31lab'
consumer_key = 'GODaKYgABKACWnCC8oaQyrDz8'
consumer_secret = 'dIlhRbYkgjiCJzVrBqD4lKyQuekc6caKJmFKZXC3de2LaqQWMZ'
access_token = '1108983131132956672-DvMTaH6VsiQgOCN4BMRp7UJOe4mZsQ'
access_secret = 'gEICepruCbQHrMk8OWBAYRRHxEepIH1JsFkrzdrQVRPSH'
create_token(app,consumer_key, consumer_secret,
access_token, access_secret)
<Token>
<oauth_endpoint>
request: https://api.twitter.com/oauth/request_token
authorize: https://api.twitter.com/oauth/authenticate
access: https://api.twitter.com/oauth/access_token
<oauth_app> social_media_31lab
key: GODaKYgABKACWnCC8oaQyrDz8
secret: <hidden>
<credentials> oauth_token, oauth_token_secret
---
# keywords
Keys = c("#tradewars","trade war")
# besides the keyword, a tweet must also contain the context term to be collected
context = "China"
df = data.frame()
clean = function(txt,key,context) {
txt = iconv(txt, "latin1", "ASCII", sub="") # convert encoding, dropping non-ASCII characters
txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (@ or # followed by letters, digits or underscores)
txt = gsub("(http|https)://.*", "", txt) # remove URLs
txt = gsub("[\t]{2,}", "", txt) # remove runs of two or more tabs
txt = gsub("\\n"," ",txt) # replace line breaks with spaces
txt = gsub("&.*;","",txt) # remove HTML character entities
txt = gsub("xi jinping","Xi",txt,ignore.case = T) # first replace every "xi jinping" (any casing) with "Xi"
txt = gsub(" Xi "," Xi Jinping ",txt,ignore.case = T)
# then replace " Xi " with " Xi Jinping " (proper casing is more likely to be tagged as PERSON by NER)
# the surrounding spaces in " Xi " matter: without them a word like "Brexit" would become "BreXi Jinping"
# the two-step replacement also catches tweets that only write "Xi"
# finally tidy up the whitespace
txt = gsub("\\s+"," ",txt) # collapse runs of whitespace into a single space
txt = gsub("^\\s+|\\s+$","",txt) # trim leading and trailing whitespace
# keep only the characters we care about
txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # drop everything except letters, digits, ?!.' and spaces
txt }
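A quick check of what clean() does to a single raw tweet (the example string below is made up for illustration):
raw = "@someone #tradewars heating up!\nChina &amp; the US https://t.co/abc123"
clean(raw, Keys[1], context)
# the mention, hashtag, URL and HTML entity are stripped;
# only letters, digits, ?!.' and spaces remain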
for(key in Keys) {
q = paste(c(key,context),collapse=" AND ")
# query strings: "#tradewars AND China", "trade war AND China"
# requiring China to co-occur avoids tweets that use #tradewars but are unrelated to the US-China trade war
tweets = search_tweets(q,lang="en",n=3000,include_rts = FALSE,retryonratelimit = T)
# fetch up to 3000 tweets, excluding retweets
tweets$text = clean(tweets$text,key,context)
df = rbind(df,tweets) # append to the combined data frame
}
Searching for tweets...
This may take a few seconds...
Finished collecting tweets!
Searching for tweets...
This may take a few seconds...
Finished collecting tweets!
df = df[!duplicated(df[,"status_id"]),] # remove duplicate tweets
Tweets are free-form text with no fixed structure, so preprocessing is relatively messy and takes several rounds of trial and error.
Inspect the contents and columns of df to see what rtweet returned.
head(df)
df has 88 columns, but we will only use a few of them:
search_tweets() can only reach back roughly ten days. So although each of our two queries asked for 3,000 tweets (nominally 6,000 in total), far fewer matching tweets exist within that window.
nrow(df)
[1] 3737
created_at is already a date-time column, so min and max give the earliest and latest timestamps directly.
min(df$created_at)
[1] "2019-03-15 11:28:54 UTC"
max(df$created_at)
[1] "2019-03-25 05:02:41 UTC"
Server side: start a coreNLP server in a terminal first. From the coreNLP installation directory, run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
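Before sending any tweets it is worth confirming the server is actually listening; a minimal check, assuming the server runs locally on port 9000:
# should return 200 once the coreNLP server is up
httr::status_code(httr::GET("http://127.0.0.1:9000/"))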
# build the core-nlp API url; lets us configure the tokenization behaviour and which annotators to run
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ # default: do not tokenize on whitespace only
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}',
host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
}
generate_API_url("127.0.0.1")
Specify the location of the service.
host = "127.0.0.1"
# call the core-nlp api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# build the api url
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
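A quick way to see what the server returns (a sketch; it assumes the local server started above is running) is to annotate a single sentence and inspect the nested list, which holds one element per sentence with its tokens, parse, dependencies and sentiment:
doc = call_coreNLP(host, "Trade wars are easy to win.")
doc$sentences[[1]]$tokens[[1]]$word   # first token of the first sentence
doc$sentences[[1]]$sentiment          # sentence-level sentiment label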
coreNLP <- function(data,host){
# send each document to core-nlp in turn; each response comes back as json
# keep each parsed response together with the original row
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}
Extract the token-level results from the returned objects into tidy-data format.
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# use only the first sentence of each tweet
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
Extract the word dependency relations from the returned core-nlp objects into tidy-data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# use only the first sentence of each tweet
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
Extract the sentence-level sentiment from the returned core-nlp objects into tidy-data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# use only the first sentence of each tweet
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# display the parse result graphically as a tree
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}
Get the objects returned by coreNLP.
Do not run this chunk right away: it takes about 18 minutes (and may crash R if you only have 4 GB of RAM...).
# gc() # release unused memory
#
# t0 = Sys.time()
# obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host)
# # first drop tweets with empty text
# # the object passed to coreNLP() must be a data.frame with a text column
#
# Sys.time() - t0 # elapsed time
# #Time difference of 17.89611 mins
#
# save.image("coreNLP_0325.RData")
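Before committing to the full run, the whole pipeline can be sanity-checked on a handful of tweets (a sketch; it assumes the coreNLP server from above is still running):
# annotate only the first five non-empty tweets and parse the tokens
sample_obj = df[,c(2,5)] %>% filter(text != "") %>% head(5) %>% coreNLP(host)
coreNLP_tokens_parser(sample_obj)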
tokens = coreNLP_tokens_parser(obj)
tokens
Columns returned by coreNLP_tokens_parser:
Which types of entities were recognized?
levels(tokens$ner)
[1] "O" "COUNTRY" "NATIONALITY" "DATE"
[5] "MISC" "NUMBER" "PERSON" "TITLE"
[9] "ORDINAL" "CITY" "CAUSE_OF_DEATH" "LOCATION"
[13] "ORGANIZATION" "RELIGION" "PERCENT" "DURATION"
[17] "TIME" "IDEOLOGY" "MONEY" "SET"
[21] "STATE_OR_PROVINCE" "CRIMINAL_CHARGE" "URL"
length(unique(tokens$word[tokens$ner != "O"])) # excluding the "O" (other) tag, how many distinct words were tagged as an entity
[1] 1752
Capitalization affects coreNLP's NER decisions, so the tweet text was annotated with its original casing. After running the annotators, to count term frequencies correctly we add two new columns, lower_word and lower_lemma, holding the lower-cased word and lemma.
Lower-casing folds different casings of the same term (e.g. Trump and trump) together before counting frequencies.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
Someone unfamiliar with the US-China trade war may not know who the key figures are, so it is hard to build a complete dictionary of names up front. By filtering the coreNLP NER output for PERSON entities and counting their frequencies, we can see how prominent each figure is in the discussion.
tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 15, count) %>%
ungroup() %>%
mutate(lower_word = reorder(lower_word, count)) %>%
ggplot(aes(lower_word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(lower_word = reorder(lower_word, count)) %>%
ggplot(aes(lower_word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
filter(ner == "IDEOLOGY") %>% #篩選NER為IDEOLOGY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(lower_word = reorder(lower_word, count)) %>%
ggplot(aes(lower_word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is IDEOLOGY)") +
theme(text=element_text(size=14))+
coord_flip()
Reference solution
data("stop_words") # load the stop_words data set bundled with tidytext
names(stop_words)[1] = "lower_word" # rename the first column of stop_words to lower_word
tokens %>%
anti_join(stop_words) %>% # anti_join tokens with stop_words on the shared lower_word column
filter(!( lower_word %in% c("trade","war","china","us","u.s.","united","states","america"))) %>%
group_by(lower_word) %>%
summarise(count = n()) %>%
arrange(desc(count))
Joining, by = "lower_word"
tokens %>%
anti_join(stop_words) %>%
filter(!( lower_word %in% c("trade","war","china","us","u.s.","united","states","america"))) %>%
group_by(lower_lemma) %>%
summarise(count = n()) %>%
arrange(desc(count))
Joining, by = "lower_word"
table(df$source) %>% sort(decreasing = T) %>% head
Twitter Web Client Twitter for iPhone Twitter for Android IFTTT
976 581 450 246
dlvr.it Twitter for iPad
149 127
tokens %>%
anti_join(stop_words) %>%
merge(df[,c(2,6)]) %>%
filter(!(lower_word %in% c("trade","war","china","u.s.","united","states","america")),
source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(source,lower_lemma) %>%
summarize(count = n()) %>% # count each group
top_n(15,count) %>%
ungroup() %>%
mutate(lower_lemma = reorder(lower_lemma, count)) %>%
#arrange(desc(count),.by_group = TRUE) %>%
ggplot(aes(lower_lemma, count)) +
geom_col(show.legend = FALSE) +
facet_wrap(~source, scales = "free_y") +
labs(y = "tweets from difference source",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
Joining, by = "lower_word"
dependencies = coreNLP_dependency_parser(obj)
dependencies
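The tidy dependency table can also be queried directly. A sketch, using the columns produced by coreNLP_dependency_parser above, that lists the words most often attached to "China" as adjectival modifiers (the amod relation in the Universal Dependencies scheme coreNLP uses):
dependencies %>%
filter(dep == "amod", tolower(governorGloss) == "china") %>% # adjectival modifiers of "China"
count(dependentGloss, sort = TRUE) %>%
head(10)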
Look at the original tweet.
df$text[df$status_id == "1109868276069343233"]
[1] "Paper Dragon. Laws of economics can only be manipulated for so long. Trade Wars are easy. China goes bust Chinese companies had record amount of corporate bond defaults in 2018"
parse_tree <- obj[[1]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
Loading required package: NLP
package 'NLP' was built under R version 3.5.2
Attaching package: 'NLP'
The following object is masked from 'package:httr':
content
The following object is masked from 'package:ggplot2':
annotate
Loading required package: igraph
package 'igraph' was built under R version 3.5.3
Attaching package: 'igraph'
The following objects are masked from 'package:dplyr':
as_data_frame, groups, union
The following objects are masked from 'package:stats':
decompose, spectrum
The following object is masked from 'package:base':
union
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)
sentiment = coreNLP_sentiment_parser(obj)
sentiment
Which sentiment labels appear in this data set?
levels(sentiment$sentiment)
[1] "Negative" "Neutral" "Positive" "Verynegative"
**Note**
First check the type of sentimentValue: it is a factor.
class(sentiment$sentimentValue)
[1] "factor"
We have to convert it to numeric before doing any arithmetic on it, such as computing an average sentiment.
Be careful: converting a factor directly to numeric does not return the original values but the positions of the factor levels. For example:
A = c(1,4,3,0,4,3) %>% as.factor
levels(A)
[1] "0" "1" "3" "4"
The level order is "0", "1", "3", "4".
Converting A directly to numeric does not give back the original (1,4,3,0,4,3); each element is replaced by the position of its level:
(1,4,3,0,4,3) => (level positions: 0 is 1, 1 is 2, 3 is 3, 4 is 4) => (2,4,3,1,4,3)
A %>% as.numeric
[1] 2 4 3 1 4 3
So we must not convert sentimentValue straight from factor to numeric;
instead, convert the factor to character first, and then to numeric.
sentiment$sentimentValue = sentiment$sentimentValue %>% as.character %>% as.numeric
Use table to check how the sentiment labels map onto sentimentValue.
table(sentiment$sentiment,sentiment$sentimentValue)
0 1 2 3
Negative 0 2680 0 0
Neutral 0 0 813 0
Positive 0 0 0 181
Verynegative 26 0 0 0
sentimentValue is now on the original 0-4 scale:
0, 1 : Verynegative, Negative
2 : Neutral
3, 4 : Positive, Verypositive (4 does not occur in this data set)
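If all five labels are wanted as explicit, ordered categories (including the absent Verypositive), the numeric scale can be recoded; a sketch that adds a hypothetical sentiment_label column:
sentiment$sentiment_label = factor(sentiment$sentimentValue,
levels = 0:4,
labels = c("Verynegative","Negative","Neutral","Positive","Verypositive"),
ordered = TRUE)
table(sentiment$sentiment_label) # Verypositive stays as an empty level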
Average sentiment score over time
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()
Sentiment over time by client
sentiment$sentimentValue = as.numeric(sentiment$sentimentValue)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(date,source) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=source)) +
geom_line()
Words that frequently appear in Verynegative and Negative tweets
X = sentiment %>%
merge(tokens[,c("status_id","lower_word","lower_lemma","ner")]) %>%
anti_join(stop_words)
Joining, by = "lower_word"
X %>% filter(
!(lower_word %in% c("trade","war","china","china.","u.s.","united","states","trump","america")),
sentiment %in% c("Verynegative","negative") ,
ner != "NUMBER") %>%
group_by(lower_word) %>% # group by word
summarize(count = n()) %>%
wordcloud2()
X %>% filter(
!(lower_word %in% c("trade","war","china","china.","u.s.","united","states","trump","america")),
sentiment %in% c("Verynegative","negative") ,
ner != "NUMBER") %>%
group_by(lower_word) %>% # group by word
summarize(count = n()) %>%
arrange(desc(count))
Words that frequently appear in Positive tweets
sentiment %>%
merge(tokens[,c("status_id","lower_word","lower_lemma","ner")]) %>%
anti_join(stop_words) %>%
filter(!(lower_word %in% c("trade","war","china","china.","u.s.","united","states","trump","america")),
sentiment == "positive",
ner != "NUMBER") %>%
group_by(lower_lemma) %>% # group by lemma
summarize(count = n()) %>%
top_n(40,count) %>%
wordcloud2()
Joining, by = "lower_word"
library(sentimentr)
mytext <- c(
'do you like it? But I hate really bad dogs',
'I am the best friend.',
'Do you really like it? I\'m not a fan'
)
mytext <- get_sentences(mytext) # split each text into sentences, giving a list of character vectors
mytext[[1]][1] # the first sentence of the first text
[1] "do you like it?"
sentiment_by(mytext) # sentiment_by(): average sentiment score per text
sentiment(mytext) # sentiment(): scores at the sentence level
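The same functions apply directly to the cleaned tweets. A sketch (the tweet_sentiment and sentimentr_score names are ours) that attaches one average polarity score per tweet; unlike coreNLP's 0-4 scale, sentimentr scores are continuous and centred around 0:
tweet_sentiment = sentiment_by(get_sentences(df$text)) # one row per tweet, in the row order of df$text
df$sentimentr_score = tweet_sentiment$ave_sentiment
summary(df$sentimentr_score)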
Convert emoji codes into descriptive text
replace_emoji("\U0001f4aa")
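replace_emoji() comes from the textclean package and is assumed to be available here, as in the call above. If emoji should contribute to the analysis, a sketch is to apply it before clean(), since the iconv() step inside clean() silently drops emoji; the example tweet is made up:
txt = "Tariffs again \U0001f4aa trade war with China"
txt = replace_emoji(txt) # emoji become descriptive words
clean(txt, Keys[1], context)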
packages = c("cleanNLP","dplyr", "magrittr","twitterR","stringi", "udpipe", "reticulate", "rJava",
"RCurl", "knitr", "rmarkdown", "testthat", "covr",
"roxygen2")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# library(rJava)
# library(cleanNLP)
# library(udpipe)
# library(reticulate)
# gc()
#
# use_python("C:\\Users\\konir\\Anaconda3\\python") # point this at your own python path (version 3.6+)
# cnlp_init_udpipe()
# cnlp_init_corenlp("en",anno_level=2,lib_location = "C:\\Users\\konir\\Documents\\R\\win-library\\3.5\\cleanNLP\\extdata") # set lib_location to the directory holding the coreNLP models
#
#
# # if java.lang.OutOfMemoryError: GC overhead limit exceeded appears, memory is still insufficient even after gc(); try restarting R
# t0 = Sys.time()
# obj <- cnlp_annotate(df$text, as_strings = TRUE,backend = "coreNLP")
# Sys.time() - t0
# cnlp_get_document(obj)
# cnlp_get_dependency(obj)
# cnlp_get_token(obj)
# cnlp_get_entity(obj)
# cnlp_get_sentence(obj) #a score from 0 (most negative) to 4 (most positive)