從推特上面抓有近期有#XinjiangCotton (新疆棉事件)的推特和推文下來進行分析
packages = c("dplyr","ggplot2","rtweet" ,"xml2", "httr", "jsonlite", "data.tree", "NLP", "igraph","sentimentr","tidytext","wordcloud2","DiagrammeR","dplyr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)
library(webshot)
library(htmlwidgets)
#load("coreNLP_all.RData")
app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app,consumer_key, consumer_secret,
access_token, access_secret,set_renv = FALSE)
#Consumer Keys:知道你的身分
#Authentication Tokens:認證給你的授權
# 查詢關鍵字
key = c("#XinjiangCotton")
#context = ""
#q = paste(c(key,context),collapse=" AND ")
#抓5000筆 不抓轉推
tweets = search_tweets(key,lang="en",n=5000,include_rts = FALSE,token = twitter_token)
## 用於資料清理
clean = function(txt) {
txt = iconv(txt, "latin1", "ASCII", sub="") #改變字的encoding
txt = gsub("(@|#)\\w+", "", txt) #去除@或#後有數字,字母,底線 (標記人名或hashtag)
txt = gsub("(http|https)://.*", "", txt) #去除網址(.:任意字元,*:0次以上)
txt = gsub("[ \t]{2,}", "", txt) #去除兩個以上空格或tab
txt = gsub("\\n"," ",txt) #去除換行
txt = gsub("\\s+"," ",txt) #去除一個或多個空格(+:一次以上)
txt = gsub("^\\s+|\\s+$","",txt) #去除開頭/結尾有一個或多個空格
txt = gsub("&.*;","",txt) #去除html特殊字元編碼
txt = gsub("[^a-zA-Z0-9?!. ']","",txt) #除了字母,數字空白?!.的都去掉(表情符號去掉)
txt }
tweets$text = clean(tweets$text) #text套用資料清理
df = data.frame()
df = rbind(df,tweets) # transfer to data frame
df = df[!duplicated(df[,"status_id"]),] #去除重複的tweets
nrow(df)
## [1] 313
# 產生coreNLP的api url,將本地端的網址轉成符合coreNLP服務的url
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ #斷詞依據不是空格
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
}
#指定服務的位置
host = "127.0.0.1"
generate_API_url(host)
# 呼叫coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# 假設有兩個core-nlp server、一個負責英文(使用9000 port)、另一個則負責中文(使用9001 port)
port <- ifelse(language=="eng", 9000, 9001);
# 產生api網址
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
#文件使用coreNLP服務
coreNLP <- function(data,host){
# 依序將每個文件丟進core-nlp進行處理,每份文件的回傳結果為json格式
# 在R中使用objects來儲存處理結果
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# for a sentences
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# for a sentences
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# for a sentences
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# 圖形化顯示dependency結果
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}
gc() #釋放不使用的記憶體
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1939463 103.6 3788595 202.4 2660078 142.1
## Vcells 3407514 26.0 8388608 64.0 5403089 41.3
t0 = Sys.time()
obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) #丟入本地執行
#丟入coreNLP的物件 必須符合: 是一個data.frame 有一個text欄位
Sys.time() - t0 #執行時間
## Time difference of 1.031911 mins
#Time difference of 28 mins
save.image("coreNLP_all.RData")
tokens = coreNLP_tokens_parser(obj)
head(tokens,20)
## status_id word lemma pos ner
## 1 1381607228751425543 Especially especially RB O
## 2 1381607228751425543 those those DT O
## 3 1381607228751425543 in in IN O
## 4 1381526745447243782 Xinjiang Xinjiang NNP STATE_OR_PROVINCE
## 5 1381526745447243782 has have VBZ O
## 6 1381526745447243782 been be VBN O
## 7 1381526745447243782 in in IN O
## 8 1381526745447243782 the the DT O
## 9 1381526745447243782 news news NN O
## 10 1381526745447243782 a a DT O
## 11 1381526745447243782 lot lot NN O
## 12 1381526745447243782 lately. lately. NN O
## 13 1381526745447243782 Is be VBZ O
## 14 1381526745447243782 it it PRP O
## 15 1381526745447243782 safe safe JJ O
## 16 1381526745447243782 to to TO O
## 17 1381526745447243782 travel travel VB O
## 18 1381526745447243782 in in IN O
## 19 1381526745447243782 this this DT O
## 20 1381526745447243782 troubled troubled JJ O
unique(tokens$ner)
## [1] "O" "STATE_OR_PROVINCE" "CITY"
## [4] "ORGANIZATION" "NATIONALITY" "COUNTRY"
## [7] "RELIGION" "CRIMINAL_CHARGE" "PERSON"
## [10] "DATE" "LOCATION" "MISC"
## [13] "NUMBER" "TITLE" "IDEOLOGY"
## [16] "CAUSE_OF_DEATH" "ORDINAL" "SET"
## [19] "TIME" "DURATION"
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 244
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
可以發現最多的前三分別是中國、美國和日本。和上次抓PTT的資料比起來有點不一樣。爆發衝突的另外一方『歐洲各國』完全沒有在前列
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()
## 涉及此議題的組織: 就中國共產黨和相關的企業、新聞媒體等
tokens %>%
filter(ner == "ORGANIZATION") %>% #篩選NER為ORGANIZATION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()
## 涉及此議題的人 基本上還是品牌
tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSON
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()
## 相依性分析: 處理相依性分析結果
dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
## status_id dep governor governorGloss dependent dependentGloss
## 1 1381607228751425543 ROOT 0 ROOT 1 Especially
## 2 1381607228751425543 obl 1 Especially 2 those
## 3 1381607228751425543 case 2 those 3 in
## 4 1381526745447243782 ROOT 0 ROOT 12 safe
## 5 1381526745447243782 nsubj 6 news 1 Xinjiang
## 6 1381526745447243782 aux 6 news 2 has
## 7 1381526745447243782 cop 6 news 3 been
## 8 1381526745447243782 case 6 news 4 in
## 9 1381526745447243782 det 6 news 5 the
## 10 1381526745447243782 dep 12 safe 6 news
## 11 1381526745447243782 det 9 lately. 7 a
## 12 1381526745447243782 compound 9 lately. 8 lot
## 13 1381526745447243782 dep 6 news 9 lately.
## 14 1381526745447243782 cop 12 safe 10 Is
## 15 1381526745447243782 nsubj 12 safe 11 it
## 16 1381526745447243782 mark 14 travel 13 to
## 17 1381526745447243782 dep 12 safe 14 travel
## 18 1381526745447243782 case 18 region? 15 in
## 19 1381526745447243782 det 18 region? 16 this
## 20 1381526745447243782 amod 18 region? 17 troubled
length(obj)
## [1] 308
視覺化
parse_tree <- obj[[39]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
##
## content
## The following object is masked from 'package:ggplot2':
##
## annotate
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 4.0.5
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)
計算情緒分數
sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
## status_id
## 1 1381607228751425543
## 2 1381526745447243782
## 3 1381521708163665921
## 4 1379743605087207432
## 5 1379744323051339776
## 6 1380766530976182273
## 7 1380039296166465540
## 8 1380439643548573698
## 9 1381499610552696835
## 10 1381478518798774279
## 11 1381476084420538369
## 12 1379717018321051654
## 13 1381466974878892036
## 14 1380082901883228160
## 15 1379481263053111302
## 16 1381459984324567041
## 17 1381437882854871042
## 18 1379774376304775169
## 19 1381433179689545731
## 20 1381402886018580484
## text
## 1 Especially those in
## 2 Xinjiang has been in the news a lot lately. Is it safe to travel in this troubled region? Find out here!
## 3 Hong Kong is a very good example of OneChina consensus.
## 4 The CCP is trying transfer its internal contradictions to Taiwan. The CCP is trying to seize Chinese people's properties now. It's like the same as culture revolution era.
## 5 The CCP is trying transfer its internal contradictions to Taiwan. The CCP is trying to seize Chinese people's properties now. It's like the same as culture revolution era.
## 6 Instead of rule by law China is rule by a bunch of mafia thugs.
## 7 We have to fight together against . Let's fight for .
## 8 If you will trust it means that you will be finished soon.is not trustworthy.
## 9 is destroying .
## 10 The truth aboutand
## 11 Don't insult our intelligence . BCI's complete collapse came solely from the lies they fabricated againstwhich they have yet to produce a shred of evidence. What makes u think they need to please anybody but their funders USAID?
## 12 Petty genocidal. Theis just the worst on every level.
## 13 The dead eyed gelding should hush.
## 14 Here it is A protester is a rioter. A Muslim is a terrorist. Truth is lies. Genocide is education.
## 15 If thearrests doctors for telling the truth what chance do theb have?
## 16 How disinformation of educational camps got started? How BBC commissioned Adrian Zenz to spin it? Check the research work by
## 17 How can you engage in constructive dialogue with the government in a place where there is no civil society no freedom of association and no trade unions? An exILO official on Xinjiang conundrum
## 18 Spring in Kashgar episode one.
## 19 excerpt from Kate Kui on Xinjiangmigrant workers in Xinjiang picking cotton in harvest season is like Amazon hiring temporary workers for peak Christmas orders. This is based on demand supply machine is used more as a much economic substitute.
## 20 Support Global to execute the sanctions and Military action of .
## sentiment sentimentValue
## 1 Neutral 2
## 2 Positive 3
## 3 Positive 3
## 4 Neutral 2
## 5 Neutral 2
## 6 Neutral 2
## 7 Negative 1
## 8 Neutral 2
## 9 Neutral 2
## 10 Neutral 2
## 11 Negative 1
## 12 Neutral 2
## 13 Negative 1
## 14 Positive 3
## 15 Neutral 2
## 16 Positive 3
## 17 Negative 1
## 18 Neutral 2
## 19 Neutral 2
## 20 Neutral 2
unique(sentiment$sentiment)
## [1] "Neutral" "Positive" "Negative" "Verypositive"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verypositive
## 77 190 39 2
其實近期已經過了議題的巔峰,可以發現情緒起起服服。
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(date,source) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=source)) +
geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
#了解正面文章的詞彙使用
sentiment %>%
merge(tokens) %>%
#anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% #根據lemma分組
summarize(count = n()) %>%
filter(count >5 & count<400)%>%
wordcloud2()
#了解負面文章的詞彙使用
a <- sentiment %>%
merge(tokens) %>%
#anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lower_lemma) %>%
summarize(count = n()) %>%
filter(count >10 &count<400)%>%
wordcloud2()
my_graph <- a
saveWidget(my_graph, "temp.html", selfcontained = F)
webshot("temp.html", "wc1.png", delay = 5, vwidth = 500, vheight = 500)