Avoid garbled text
## [1] ""
Install the required packages
packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr", "data.table", "wordcloud2", "wordcloud", "DiagrammeR", "magrittr", "rtweet",
"xml2", "httr", "jsonlite", "data.tree", "sentimentr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(readr)
require(dplyr)
require(stringr)
require(jiebaR)
require(tidytext)
require(NLP)
require(tidyr)
require(ggplot2)
require(ggraph)
require(igraph)
require(scales)
require(reshape2)
require(widyr)
require(data.table)
require(wordcloud2)
require(wordcloud)
library(DiagrammeR)
library(magrittr)
library(rtweet)
library(xml2)
library(httr)
library(jsonlite)
library(data.tree)
library(sentimentr)
The Hong Kong anti-extradition movement and the protests in Thailand drew major international attention and indirectly shaped the discussion of democracy versus authoritarianism across countries. The current protests in Myanmar have likewise caught the world's attention. To understand the international situation, and how concerned Western and Eastern countries are about it, we decided to study how actively the Myanmar protests are discussed on Twitter and PTT.
Fetching tweets with rtweet
app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app,consumer_key, consumer_secret,
access_token, access_secret,set_renv = FALSE)
# Consumer keys: identify who you are
# Authentication tokens: the authorization granted to you
Search keywords: "Myanmar", "coup"
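The tweets object used below is produced by rtweet's search_tweets(); the exact call is not shown in the original, so the query string and n here are assumptions (a minimal sketch):
tweets <- search_tweets(q = "Myanmar coup", n = 18000, # query and n are assumed
                        include_rts = FALSE, token = twitter_token)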
## For data cleaning
clean = function(txt) {
txt = iconv(txt, "latin1", "ASCII", sub="") # convert the text encoding
txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (@ or # followed by letters, digits, or underscores)
txt = gsub("(http|https)://.*", "", txt) # remove URLs (.: any character, *: zero or more times)
txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces/tabs
txt = gsub("\\n"," ",txt) # replace line breaks with spaces
txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+: one or more) into a single space
txt = gsub("^\\s+|\\s+$","",txt) # trim leading and trailing whitespace
txt = gsub("&.*;","",txt) # remove HTML character entities
txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces, and ?!.' (drops emoji, etc.)
txt }
tweets$text = clean(tweets$text) # apply the cleaning function to the text column
df = data.frame()
df = rbind(df,tweets) # convert to a data frame
df = df[!duplicated(df[,"status_id"]),] # drop duplicate tweets
df has 90 columns, but we will only use a few of them here.
created_at is already a date-type column, so min and max give the earliest and latest dates directly.
Note: rtweet can only fetch tweets from roughly the last 10 days.
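The three outputs below appear without their generating code; presumably they are the tweet count and the earliest/latest dates, e.g.:
nrow(df) # number of tweets after de-duplication
min(as.Date(df$created_at)) # earliest date
max(as.Date(df$created_at)) # latest date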
## [1] 3675
## [1] "2021-04-25"
## [1] "2021-05-02"
The goal is to count the number of posts published each day, which indicates how hot the discussion of a given topic is.
df$created_at <- df$created_at %>% as.Date("%Y/%m/%d")
data <- df %>%
dplyr::select(created_at, urls_url)
# distinct()
article_count_by_date <- data %>%
group_by(created_at) %>%
summarise(count = n())
article_count_by_date
## # A tibble: 8 x 2
## created_at count
## <date> <int>
## 1 2021-04-25 229
## 2 2021-04-26 194
## 3 2021-04-27 351
## 4 2021-04-28 2209
## 5 2021-04-29 155
## 6 2021-04-30 181
## 7 2021-05-01 157
## 8 2021-05-02 199
plot_date <-
# data
article_count_by_date %>%
# aesthetics
ggplot(aes(x = created_at, y = count)) +
# geometrics
geom_line(color = "#00AFBB", size = 1) +
# coordinates
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("Twitter 討論文章數") +
xlab("日期") +
ylab("數量") +
# theme
theme(text = element_text(family = "Heiti TC Light")) # set a Chinese font so Chinese labels render correctly
plot_date
The plot shows that discussion peaked on 4/28. According to news reports, on 4/27 the Karen National Union, an armed ethnic-minority group, seized a military base in what was described as the most serious clash with the military since 2/1. On 4/28, the National Unity Government formed by Myanmar's anti-coup figures declared that before any constructive dialogue could take place, the military must first unconditionally release political prisoners, including President Win Myint and State Counsellor Aung San Suu Kyi. In addition, the United Nations warned that COVID-19 combined with the coup could lead to severe famine and large numbers of refugees.
df_word <- df %>%
unnest_tokens(word, text) %>%
anti_join(stop_words) %>%
# filter(!(word %in% c("dont","myanmar","coup"))) %>%
count(user_id, created_at, word, sort = TRUE)
## Joining, by = "word"
## # A tibble: 40,831 x 4
## user_id created_at word n
## <chr> <date> <chr> <int>
## 1 1360227293617037313 2021-05-02 coupeliminate 42
## 2 1360227293617037313 2021-05-02 protest 42
## 3 1360227293617037313 2021-05-02 youth 41
## 4 178312978 2021-04-26 care 21
## 5 178312978 2021-04-26 china 21
## 6 178312978 2021-04-26 chinarussia 21
## 7 178312978 2021-04-26 coup 21
## 8 178312978 2021-04-26 dominate 21
## 9 178312978 2021-04-26 dont 21
## 10 178312978 2021-04-26 mastermind 21
## # ... with 40,821 more rows
words_count <- df_word %>%
filter(nchar(.$word)>1) %>%
group_by(word) %>%
summarise(sum = n()) %>%
filter(sum>10) %>%
arrange(desc(sum))
words_count
## # A tibble: 276 x 2
## word sum
## <chr> <int>
## 1 coup 2536
## 2 military 1587
## 3 internet 1247
## 4 strike 1233
## 5 situation 1211
## 6 anti 1202
## 7 current 1200
## 8 inagainst 1199
## 9 reopen 1198
## 10 schools 1198
## # ... with 266 more rows
library(tidyr)
sentiment <- df_word %>%
inner_join(get_sentiments("bing")) %>%
count(user_id, sentiment,created_at) %>%
# spread(sentiment, n, fill = 0) %>% is equivalent to the next line
# put positive and negative sentiment counts in separate columns
pivot_wider(names_from = sentiment, values_from = n, values_fill = 0) %>%
# compute the sentiment gap (positive - negative)
mutate(sentiment = positive - negative)
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 4781
## 2 positive 2005
bing_word_counts <- df_word %>%
inner_join(get_sentiments("bing")) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
## # A tibble: 471 x 3
## word sentiment n
## <chr> <chr> <int>
## 1 strike negative 1233
## 2 lost negative 928
## 3 win positive 211
## 4 protest negative 76
## 5 protests negative 65
## 6 killed negative 55
## 7 brutal negative 45
## 8 crisis negative 45
## 9 limited negative 37
## 10 poverty negative 37
## # ... with 461 more rows
bing_word_counts %>%
group_by(sentiment) %>%
slice_max(n, n = 10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(n, word, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(x = "Contribution to sentiment",
y = NULL)
The chart shows that "strike" is the top negative word because the people of Myanmar intend to keep striking until democracy is fully restored in Myanmar.
Server side:
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# Generate the CoreNLP API URL: turn the local host address into a URL that the CoreNLP service accepts
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ # tokenization is not whitespace-based
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
URLencode(url)
}
# specify the service location
host = "127.0.0.1"
generate_API_url(host)
# call the CoreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# assume two CoreNLP servers: one for English (port 9000) and one for Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# generate the api url
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
Extract the token results from the returned object and output them as tidy data.
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# display the dependency parse as a tree graph
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}
Collect the objects returned by CoreNLP.
Don't run this chunk lightly: it takes about half an hour (and may crash if you only have 4 GB of RAM).
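The chunk that collects the CoreNLP objects is not included. Judging from the parsers above (each object needs $data carrying the original fields and $doc carrying the parsed result), it was presumably something like this sketch:
coreNLP_objects <- lapply(seq_len(nrow(df)), function(i) {
  doc <- call_coreNLP(host, df$text[i]) # one API call per tweet
  list(data = c(status_id = df$status_id[i], text = df$text[i]), # original fields kept for joining
       doc = doc)
})
tokens       <- coreNLP_tokens_parser(coreNLP_objects)
dependencies <- coreNLP_dependency_parser(coreNLP_objects)
sentiment    <- coreNLP_sentiment_parser(coreNLP_objects)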
## status_id word lemma pos ner
## 1 1388922737561722881 The the DT O
## 2 1388922737561722881 turmoil turmoil NN O
## 3 1388922737561722881 following follow VBG O
## 4 1388922737561722881 the the DT O
## 5 1388922737561722881 military military JJ O
## 6 1388922737561722881 coup coup NN O
## 7 1388922737561722881 incoupled incouple VBN O
## 8 1388922737561722881 with with IN O
## 9 1388922737561722881 the the DT O
## 10 1388922737561722881 impact impact NN O
## 11 1388922737561722881 ofcould ofcould NN O
## 12 1388922737561722881 result result NN O
## 13 1388922737561722881 in in IN O
## 14 1388922737561722881 up up RB O
## 15 1388922737561722881 to to IN O
## 16 1388922737561722881 25 25 CD NUMBER
## 17 1388922737561722881 million million CD NUMBER
## 18 1388922737561722881 peoplenearly peoplenearly JJ O
## 19 1388922737561722881 half half NN O
## 20 1388922737561722881 of of IN O
Columns returned by coreNLP_tokens_parser:
status_id: maps to status_id in the original df, the unique id of a tweet
word: the raw token
lemma: the lemmatized form of the token
pos: part-of-speech tag
ner: the named entity tag
Named entity recognition (NER): inspect the NER column to see which entity types were recognized.
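The type listing below presumably comes from unique(tokens$ner). Note also that later chunks filter on lower_word and lower_lemma, which the code shown never creates; both lines here are assumptions:
tokens <- tokens %>% mutate(lower_word = tolower(word), lower_lemma = tolower(lemma)) # assumed helper columns
unique(tokens$ner)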
## [1] "O" "NUMBER" "DATE"
## [4] "ORGANIZATION" "PERSON" "TITLE"
## [7] "CAUSE_OF_DEATH" "DURATION" "NATIONALITY"
## [10] "MISC" "CITY" "CRIMINAL_CHARGE"
## [13] "ORDINAL" "COUNTRY" "LOCATION"
## [16] "TIME" "IDEOLOGY" "MONEY"
## [19] "STATE_OR_PROVINCE" "SET" "PERCENT"
## [1] 745
Using CoreNLP's NER, we can extract the countries (COUNTRY) mentioned when the Myanmar coup is discussed on Twitter, to get an initial sense of the main countries involved in this topic.
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)
China tops the list of countries associated with Myanmar because China and Russia both maintain close ties with Myanmar's armed forces: they are, respectively, the largest and second-largest suppliers of arms to Myanmar.
Using CoreNLP's NER, we can likewise extract the organizations (ORGANIZATION) mentioned, to get an initial sense of the main companies and organizations involved in this topic.
tokens %>%
filter(ner == "ORGANIZATION") %>% #篩選NER為ORGANIZATION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)
Why the organizations in the chart are mentioned:
Using CoreNLP's NER, we can extract the people (PERSON) mentioned, to get an initial sense of the main figures in this topic.
tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSON
filter(!lower_word %in% c("taung" ,"mingala"))%>%
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)
Why the people in the chart are mentioned:
Using CoreNLP's NER, we can extract the places (LOCATION) mentioned, to get an initial sense of the main locations in this topic.
tokens %>%
filter(ner == "LOCATION") %>% #篩選NER為Location
filter(!lower_word %in% c("yangon."))%>%
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is LOCATION)") +
theme(text=element_text(size=14))+
coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)
Why the locations in the chart are mentioned:
## status_id dep governor governorGloss dependent dependentGloss
## 1 1388922737561722881 ROOT 0 ROOT 2 turmoil
## 2 1388922737561722881 det 2 turmoil 1 The
## 3 1388922737561722881 case 6 coup 3 following
## 4 1388922737561722881 det 6 coup 4 the
## 5 1388922737561722881 amod 6 coup 5 military
## 6 1388922737561722881 nmod 2 turmoil 6 coup
## 7 1388922737561722881 acl 6 coup 7 incoupled
## 8 1388922737561722881 case 12 result 8 with
## 9 1388922737561722881 det 12 result 9 the
## 10 1388922737561722881 compound 12 result 10 impact
## 11 1388922737561722881 compound 12 result 11 ofcould
## 12 1388922737561722881 obl 7 incoupled 12 result
## 13 1388922737561722881 mark 24 living 13 in
## 14 1388922737561722881 advmod 24 living 14 up
## 15 1388922737561722881 case 19 half 15 to
## 16 1388922737561722881 compound 17 million 16 25
## 17 1388922737561722881 nummod 19 half 17 million
## 18 1388922737561722881 amod 19 half 18 peoplenearly
## 19 1388922737561722881 obl 14 up 19 half
## 20 1388922737561722881 case 23 population 20 of
Sentiment scores range from 0 (lowest) to 4 (highest):
+ 0, 1: very negative, negative
+ 2: neutral
+ 3, 4: positive, very positive
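For example, the numeric score can be folded into three labels; a minimal sketch (the label column is hypothetical, not part of the original analysis):
sentiment$label <- cut(as.numeric(sentiment$sentimentValue),
                       breaks = c(-Inf, 1, 2, Inf), # 0-1 / 2 / 3-4
                       labels = c("negative", "neutral", "positive"))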
## # A tibble: 20 x 5
## user_id created_at negative positive sentiment
## <chr> <date> <int> <int> <int>
## 1 1000091387088404480 2021-04-28 2 0 -2
## 2 1000283029363769345 2021-04-28 1 0 -1
## 3 1000657974870790145 2021-04-28 1 0 -1
## 4 1002627878909952005 2021-04-28 1 0 -1
## 5 1006051354488365057 2021-04-26 1 1 0
## 6 1006051354488365057 2021-04-29 1 1 0
## 7 1006051354488365057 2021-04-28 0 1 1
## 8 1006083289545785344 2021-04-28 1 0 -1
## 9 1008979981 2021-05-02 0 2 2
## 10 1009546724 2021-04-26 1 1 0
## 11 1009989511747473408 2021-05-02 1 0 -1
## 12 1010955190885732353 2021-04-28 2 0 -2
## 13 1014542681547026433 2021-04-28 1 0 -1
## 14 1014813793619083264 2021-04-28 1 0 -1
## 15 1014837749755494401 2021-04-29 1 0 -1
## 16 1014838262161014784 2021-04-27 2 1 -1
## 17 1014877087130423297 2021-04-26 0 1 1
## 18 1014877087130423297 2021-04-27 0 1 1
## 19 1014882776057696256 2021-04-28 1 0 -1
## 20 1014891217916604416 2021-04-28 2 0 -2
#unique(sentiment$sentiment)
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
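The frequency table printed after View(sentiment) appears to be the output of a table() call; the generating code is missing, and it is not obvious which sentiment object is tabulated, so this is only a guess:
table(sentiment$sentiment) # assumed call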
View(sentiment)
## .
## -16 -8 -6 -5 -4 -3 -2 -1 0 1 2 3 4
## 1 1 4 13 27 79 590 1556 99 284 29 12 1
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()+
scale_x_date(labels = date_format("%m/%d"))圖中顯示:
# examine the vocabulary of positive posts
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the',"!")) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% #根據lemma分組
summarize(count = n()) %>%
filter(count >5 & count<400)%>%
wordcloud2()
# examine the vocabulary of negative posts
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lower_lemma) %>%
summarize(count = n()) %>%
filter(count >10 &count<400)%>%
wordcloud2()
set.seed(10)
mytext <- get_sentences(tweets$text) # convert text to a list-of-characters structure
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count occurrences
sentiment_counts[polarity > 0,] # positive words
## words polarity n
## 1: care 1.0 9
## 2: triumphs 1.0 8
## 3: justice 1.0 8
## 4: please 1.0 3
## 5: humanitarian 1.0 3
## ---
## 210: pray 0.1 2
## 211: cooperation 0.1 2
## 212: camping 0.1 1
## 213: praying 0.1 1
## 214: momentum 0.1 1
## Selecting by n
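The negative-word table below lacks its generating code; the `## Selecting by n` message suggests a top_n() call on the polarity < 0 rows, e.g.:
sentiment_counts[polarity < 0,] %>% top_n(10) # negative words (assumed call)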
## words polarity n
## 1: strike -0.75 341
## 2: anti -1.00 336
## 3: lost -0.75 262
## 4: threatened -0.50 254
## 5: kidnapping -1.00 254
## 6: abducted -1.00 44
## 7: protest -0.50 34
## 8: criminals -1.00 31
## 9: junta -0.25 29
## 10: protesters -0.60 27
Code reference: https://github.com/trinker/sentimentr
tweets$date = format(tweets$created_at,'%Y%m%d')
(out = tweets %>% with(
sentiment_by( #document level
get_sentences(text),
list( date)
)
))
plot(out)
The plot shows that sentiment on 4/25 was positive because the ASEAN meeting suggested the crisis would soon be brought to an end; but since many protests continued afterwards, later sentiment is generally negative.
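The outputs below come from code that is not shown: per-user word totals joined back onto df_word to build Myanmar_words, consistent with the printed tibbles (a sketch):
total_words <- df_word %>%
  group_by(user_id) %>%
  summarise(total = sum(n)) # total words per user
Myanmar_words <- df_word %>%
  left_join(total_words) # joins by "user_id"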
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2,574 x 2
## user_id total
## <chr> <int>
## 1 1000091387088404480 22
## 2 1000283029363769345 11
## 3 1000657974870790145 11
## 4 1002627878909952005 11
## 5 1006051354488365057 86
## 6 1006083289545785344 11
## 7 1008979981 14
## 8 1009546724 19
## 9 100986964 6
## 10 1009989511747473408 20
## # ... with 2,564 more rows
## Joining, by = "user_id"
## # A tibble: 40,831 x 5
## user_id created_at word n total
## <chr> <date> <chr> <int> <int>
## 1 1360227293617037313 2021-05-02 coupeliminate 42 144
## 2 1360227293617037313 2021-05-02 protest 42 144
## 3 1360227293617037313 2021-05-02 youth 41 144
## 4 178312978 2021-04-26 care 21 579
## 5 178312978 2021-04-26 china 21 579
## 6 178312978 2021-04-26 chinarussia 21 579
## 7 178312978 2021-04-26 coup 21 579
## 8 178312978 2021-04-26 dominate 21 579
## 9 178312978 2021-04-26 dont 21 579
## 10 178312978 2021-04-26 mastermind 21 579
## # ... with 40,821 more rows
# compute each word's tf-idf value, treating each user's tweets as one document
Myanmar_words_tf_idf <- Myanmar_words %>%
bind_tf_idf(word,user_id, n)
Myanmar_words_tf_idf
## # A tibble: 40,831 x 8
## user_id created_at word n total tf idf tf_idf
## <chr> <date> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 13602272936170373~ 2021-05-02 coupeliminate 42 144 0.292 7.85 2.29e+0
## 2 13602272936170373~ 2021-05-02 protest 42 144 0.292 3.52 1.03e+0
## 3 13602272936170373~ 2021-05-02 youth 41 144 0.285 6.47 1.84e+0
## 4 178312978 2021-04-26 care 21 579 0.0363 5.46 1.98e-1
## 5 178312978 2021-04-26 china 21 579 0.0363 5.08 1.84e-1
## 6 178312978 2021-04-26 chinarussia 21 579 0.0363 5.91 2.14e-1
## 7 178312978 2021-04-26 coup 21 579 0.0363 0.0149 5.39e-4
## 8 178312978 2021-04-26 dominate 21 579 0.0363 5.91 2.14e-1
## 9 178312978 2021-04-26 dont 21 579 0.0363 5.21 1.89e-1
## 10 178312978 2021-04-26 mastermind 21 579 0.0363 5.91 2.14e-1
## # ... with 40,821 more rows
# pick the five words with the highest tf-idf value in each document
Myanmar_words_tf_idf %>%
group_by(user_id) %>%
slice_max(tf_idf, n=5) %>%
arrange(desc(user_id))
## # A tibble: 13,874 x 8
## # Groups: user_id [2,574]
## user_id created_at word n total tf idf tf_idf
## <chr> <date> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 999907196 2021-04-26 howcripples 1 3 0.333 7.85 2.62
## 2 999907196 2021-04-26 itssystem 1 3 0.333 7.85 2.62
## 3 999907196 2021-04-26 return 1 3 0.333 4.91 1.64
## 4 999323392158265344 2021-04-26 chasing 1 15 0.0667 7.85 0.524
## 5 999323392158265344 2021-04-26 clouds 1 15 0.0667 7.85 0.524
## 6 999323392158265344 2021-04-26 enjoy 1 15 0.0667 7.85 0.524
## 7 999323392158265344 2021-04-26 enjoyed 1 15 0.0667 7.85 0.524
## 8 999323392158265344 2021-04-26 uncertain 1 15 0.0667 7.85 0.524
## 9 997324537581584384 2021-04-28 threatened 1 11 0.0909 1.03 0.0936
## 10 997324537581584384 2021-04-28 civillians 1 11 0.0909 1.03 0.0934
## # ... with 13,864 more rows
# pick the ten words with the highest tf-idf value from each document,
# then count how many times each word was selected
Myanmar_words_tf_idf %>%
group_by(user_id) %>%
slice_max(tf_idf, n=10) %>%
ungroup() %>%
count(word, sort=TRUE)
## # A tibble: 3,571 x 2
## word n
## <chr> <int>
## 1 kidnapping 903
## 2 lives 903
## 3 lost 903
## 4 safety 903
## 5 terroristsecurity 903
## 6 threatened 903
## 7 civillians 902
## 8 nights 902
## 9 days 901
## 10 forces 893
## # ... with 3,561 more rows
jieba_tokenizer = worker()
# bigram tokenizer for use with unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
## # A tibble: 75,984 x 91
## user_id status_id created_at screen_name source display_text_wi~
## <chr> <chr> <date> <chr> <chr> <dbl>
## 1 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## 2 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## 3 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## 4 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## 5 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## 6 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## 7 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## 8 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## 9 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## 10 164263~ 13889227~ 2021-05-02 UNGeneva Tweet~ 261
## # ... with 75,974 more rows, and 85 more variables: reply_to_status_id <chr>,
## # reply_to_user_id <chr>, reply_to_screen_name <chr>, is_quote <lgl>,
## # is_retweet <lgl>, favorite_count <int>, retweet_count <int>,
## # quote_count <int>, reply_count <int>, hashtags <list>, symbols <list>,
## # urls_url <list>, urls_t.co <list>, urls_expanded_url <list>,
## # media_url <list>, media_t.co <list>, media_expanded_url <list>,
## # media_type <list>, ext_media_url <list>, ext_media_t.co <list>,
## # ext_media_expanded_url <list>, ext_media_type <chr>,
## # mentions_user_id <list>, mentions_screen_name <list>, lang <chr>,
## # quoted_status_id <chr>, quoted_text <chr>, quoted_created_at <dttm>,
## # quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>, date <date>, bigram <chr>
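The 91-column tibble above and the bigram counts below come from code that is not shown; presumably jieba_bigram was applied via unnest_tokens and the result counted (the object name water_bigram is taken from the next chunk):
water_bigram <- tweets %>%
  unnest_tokens(bigram, text, token = jieba_bigram) # adds a bigram column
water_bigram %>%
  count(bigram, sort = TRUE)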
## # A tibble: 13,246 x 2
## bigram n
## <chr> <int>
## 1 trying to 1210
## 2 anti coup 1206
## 3 back internet 1205
## 4 coup strike 1204
## 5 the current 1204
## 6 inagainst military 1203
## 7 reopen schools 1203
## 8 to reopen 1203
## 9 current situation 1201
## 10 is trying 1201
## # ... with 13,236 more rows
water_bigram %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words$word), !(word2 %in% stop_words$word)) %>%
count(word1, word2, sort = TRUE) %>%
unite_("bigram", c("word1","word2"), sep=" ")## # A tibble: 13,246 x 2
## bigram n
## <chr> <int>
## 1 trying to 1210
## 2 anti coup 1206
## 3 back internet 1205
## 4 coup strike 1204
## 5 the current 1204
## 6 inagainst military 1203
## 7 reopen schools 1203
## 8 to reopen 1203
## 9 current situation 1201
## 10 is trying 1201
## # ... with 13,236 more rows
word_pairs <- df_word %>%
pairwise_count(word,user_id, sort = TRUE) %>%
filter(!item1 %in% c("myanmar", "coup","4","apr","data","ramadan") & !item2 %in% c("myanmar", "coup","4","apr","data","ramadan"))
word_pairs
## # A tibble: 342,018 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 internet military 1218
## 2 military internet 1218
## 3 strike military 1214
## 4 military strike 1214
## 5 situation military 1204
## 6 military situation 1204
## 7 anti military 1198
## 8 inagainst military 1198
## 9 reopen military 1198
## 10 schools military 1198
## # ... with 342,008 more rows
word_cors <- df_word %>%
group_by(word) %>%
filter(n() >= 10) %>%
pairwise_cor(word, user_id, sort = TRUE)
word_cors
## # A tibble: 97,656 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 guys detat 1.
## 2 revolutionaries detat 1.
## 3 detat guys 1.
## 4 revolutionaries guys 1.
## 5 detat revolutionaries 1.
## 6 guys revolutionaries 1.
## 7 suppression challenging 1.
## 8 thethree challenging 1.
## 9 challenging suppression 1.
## 10 thethree suppression 1.
## # ... with 97,646 more rows
word_cors %>%
filter(item1 %in% c("myanmar", "coup")) %>%
group_by(item1) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light")) # set a Chinese font so Chinese labels render correctly
set.seed(2020)
#word_cors %>%
# filter(correlation > 0.95) %>%
# graph_from_data_frame() %>%
# ggraph(layout = "fr") +
# geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
# geom_node_point(color = "lightblue", size = 3) +
# geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
# theme_void()
data_day <- Total_Data %>%
dplyr::select(artDate, artUrl) %>%
distinct() %>%
group_by(artDate) %>%
summarise(count = n()) %>%
arrange(desc(count))
data_day
## # A tibble: 67 x 2
## artDate count
## <chr> <int>
## 1 2021/02/01 32
## 2 2021/03/01 22
## 3 2021/03/29 22
## 4 2021/03/05 20
## 5 2021/03/04 15
## 6 2021/02/02 14
## 7 2021/03/28 12
## 8 2021/03/15 11
## 9 2021/02/06 10
## 10 2021/03/08 8
## # ... with 57 more rows
Discussion peaked on February 1, the day the Myanmar military staged its coup.
data_day$artDate= data_day$artDate %>% as.Date("%Y/%m/%d")
data_day %>%
ggplot()+
geom_line(aes(x=artDate,y=count))+
xlab("日期") +
ylab("數量") +
geom_vline(xintercept = as.numeric(as.Date("2021-02-01")), col='red', size = 0.5) +
geom_vline(xintercept = as.numeric(as.Date("2021-03-01")), col='red', size = 0.5)+
geom_vline(xintercept = as.numeric(as.Date("2021-03-29")), col='red', size = 0.5)+
scale_x_date(labels = date_format("%m/%d"))圖中以2/1、3/1、3/29三日為主:
MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")])
# format the date column
data$artDate = data$artDate %>% as.Date("%Y/%m/%d")
# filter out special characters
data_select = data %>%
filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
filter(!grepl("['^0-9a-z']",word)) %>% # remove English letters and digits
filter(nchar(.$word)>1)
# word_count: word, count
word_count <- data_select %>%
group_by(word) %>%
filter(word !="緬甸") %>%
summarise(count=n()) %>% # use summarise to count per-word totals
filter(count>3) %>% # drop words that appear too rarely
arrange(desc(count))
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 3,294 x 2
## word count
## <chr> <int>
## 1 中國 1019
## 2 台灣 906
## 3 軍政府 771
## 4 軍方 562
## 5 美國 545
## 6 國家 489
## 7 政變 410
## 8 香港 404
## 9 政府 385
## 10 關心 345
## # ... with 3,284 more rows
word_count %>%
head(10) %>%
mutate(word = reorder(word, count)) %>%
ggplot(aes(x=count, y=word,fill=word))+
geom_col(show.legend = FALSE) +
labs(x = "詞頻", y = "詞") +
ggtitle("文章詞頻前10名")圖中顯示,中國為第一名、台灣為第二名的原因:
P <- read_file("liwc/positive.txt") # 正向字典txt檔
N <- read_file("liwc/negative.txt") # 負向字典txt檔
#字典txt檔讀進來是一整個字串
typeof(P)## [1] "character"
# 將字串依,分割
# strsplit回傳list , 我們取出list中的第一個元素
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# 建立dataframe 有兩個欄位word,sentiments,word欄位內容是字典向量
P = data.frame(word = P, sentiment = "positive") #664
N = data.frame(word = N, sentiment = "negative") #1047
# 把兩個字典拼在一起
LIWC = rbind(P, N)
# 檢視字典
head(LIWC)## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 不錯 positive
## 6 主動 positive
MetaData = fread('project_articleMetaData.csv',encoding = 'UTF-8')
Reviews = fread('project_articleReviews.csv',encoding = 'UTF-8')
# attach the matching comments to each article
Total_Data = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
# tokenize the articles and the comments
MToken <- MetaData %>% unnest_tokens(word, sentence, token=customized_tokenizer)
RToken <- Total_Data %>% unnest_tokens(word, cmtContent, token=customized_tokenizer)
# bind the two together
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")])
# format the date column
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
# filter out special characters
data_select = data %>%
filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
filter(!grepl("['^0-9a-z']",word)) %>% # remove English letters and digits
filter(nchar(.$word)>1)
sentiment_count = data_select %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n()) %>%
arrange(desc(count))
sentiment_count
## # A tibble: 129 x 3
## # Groups: artDate [66]
## artDate sentiment count
## <date> <chr> <int>
## 1 2021-03-29 negative 339
## 2 2021-03-04 negative 303
## 3 2021-03-01 negative 280
## 4 2021-03-07 negative 267
## 5 2021-02-01 negative 248
## 6 2021-03-04 positive 235
## 7 2021-03-29 positive 230
## 8 2021-02-01 positive 219
## 9 2021-03-07 positive 208
## 10 2021-03-01 positive 168
## # ... with 119 more rows
Overall, negative sentiment clearly outweighs positive sentiment.
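The date range below is presumably the output of range() on the sentiment dates (the call itself is not shown):
range(sentiment_count$artDate) # assumed call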
## [1] "2021-01-07" "2021-05-01"
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-01-07','2021-05-01'))
)+
# add vertical lines marking key dates
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-29'))
[1]])),colour = "red") +
geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-01'))
[1]])),colour = "red") sentiment_count %>%
# normalization: compute each sentiment's share per day
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
# plotting
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-01-07','2021-05-01'))
)
sentiment_sum <-
word_count %>%
inner_join(LIWC, by = "word") %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame()
## `summarise()` regrouping output by 'word' (override with `.groups` argument)
sentiment_sum %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
# attach the matching comments to each article
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
#Reviews
Total_Data <- MetaData %>%
mutate(sentence=gsub("媒體來源|記者署名|完整新聞標題|完整新聞內文|完整新聞連結|(或短網址)|備註|備註請放最後面|違者新聞文章刪除|Gossiping|TVBS新聞網|娛樂中心|看板|問卦", "", sentence))# 使用默認參數初始化一個斷詞引擎
# 先不使用任何的字典和停用詞
jieba_tokenizer = worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
chi_tokenizer01 <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
# drop tokens whose length is 1
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
# tokenize, then count how often each word appears in each article
coup_words <- Total_Data %>%
unnest_tokens(word, sentence, token=chi_tokenizer01) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
coup_words
## artUrl word n
## 1: https://www.ptt.cc/bbs/Gossiping/M.1613244038.A.301.html 緬甸 43
## 2: https://www.ptt.cc/bbs/Gossiping/M.1613193695.A.798.html 緬甸 42
## 3: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 軍方 35
## 4: https://www.ptt.cc/bbs/Gossiping/M.1615564184.A.B56.html 緬甸 35
## 5: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 緬甸 33
## ---
## 22467: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 確實 1
## 22468: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 隨便 1
## 22469: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 還不 1
## 22470: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 願意 1
## 22471: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 聽從 1
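The next three output blocks (per-article totals, the joined table, and the tf/idf columns) come from code that is not shown; a sketch consistent with them:
total_words <- coup_words %>%
  group_by(artUrl) %>%
  summarise(total = sum(n)) # total tokens per article
coup_words <- coup_words %>%
  left_join(total_words) # joins by "artUrl"
coup_words_tf_idf <- coup_words %>%
  bind_tf_idf(word, artUrl, n) # used by the chunks that follow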
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 321 x 2
## artUrl total
## <chr> <int>
## 1 https://www.ptt.cc/bbs/Gossiping/M.1609990690.A.13C.html 15
## 2 https://www.ptt.cc/bbs/Gossiping/M.1610008084.A.4C9.html 35
## 3 https://www.ptt.cc/bbs/Gossiping/M.1610024281.A.452.html 11
## 4 https://www.ptt.cc/bbs/Gossiping/M.1611897015.A.61A.html 219
## 5 https://www.ptt.cc/bbs/Gossiping/M.1612068324.A.114.html 231
## 6 https://www.ptt.cc/bbs/Gossiping/M.1612136552.A.31F.html 124
## 7 https://www.ptt.cc/bbs/Gossiping/M.1612147806.A.5E8.html 116
## 8 https://www.ptt.cc/bbs/Gossiping/M.1612148853.A.BA3.html 182
## 9 https://www.ptt.cc/bbs/Gossiping/M.1612149990.A.A39.html 272
## 10 https://www.ptt.cc/bbs/Gossiping/M.1612150864.A.8D4.html 42
## # ... with 311 more rows
## artUrl word n total
## 1: https://www.ptt.cc/bbs/Gossiping/M.1613244038.A.301.html 緬甸 43 1480
## 2: https://www.ptt.cc/bbs/Gossiping/M.1613193695.A.798.html 緬甸 42 753
## 3: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 軍方 35 582
## 4: https://www.ptt.cc/bbs/Gossiping/M.1615564184.A.B56.html 緬甸 35 359
## 5: https://www.ptt.cc/bbs/Gossiping/M.1613385729.A.973.html 緬甸 33 582
## ---
## 22467: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 確實 1 52
## 22468: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 隨便 1 52
## 22469: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 還不 1 52
## 22470: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 願意 1 52
## 22471: https://www.ptt.cc/bbs/Gossiping/M.1619882577.A.A9C.html 聽從 1 52
## tf idf tf_idf
## 1: 0.02905405 0.04785602 0.001390411
## 2: 0.05577689 0.04785602 0.002669260
## 3: 0.06013746 1.35260052 0.081341955
## 4: 0.09749304 0.04785602 0.004665629
## 5: 0.05670103 0.04785602 0.002713486
## ---
## 22467: 0.01923077 4.38514676 0.084329745
## 22468: 0.01923077 3.97968165 0.076532339
## 22469: 0.01923077 4.16200321 0.080038523
## 22470: 0.01923077 2.93822778 0.056504380
## 22471: 0.01923077 5.07829394 0.097659499
We can see words such as 緬甸 (Myanmar) and 中國 (China).
coup_words_tf_idf %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=5) %>%
filter(n > 15) %>%
arrange(desc(artUrl))
## # A tibble: 19 x 7
## # Groups: artUrl [15]
## artUrl word n total tf idf tf_idf
## <chr> <chr> <int> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國 27 301 0.0897 1.24 0.111
## 2 https://www.ptt.cc/bbs/Gossiping/M.16~ 人類 31 2140 0.0145 4.16 0.0603
## 3 https://www.ptt.cc/bbs/Gossiping/M.16~ 不可 19 2140 0.00888 4.39 0.0389
## 4 https://www.ptt.cc/bbs/Gossiping/M.16~ 聖經 16 2140 0.00748 5.08 0.0380
## 5 https://www.ptt.cc/bbs/Gossiping/M.16~ 告訴 21 2140 0.00981 3.47 0.0340
## 6 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍政府~ 17 243 0.0700 0.951 0.0665
## 7 https://www.ptt.cc/bbs/Gossiping/M.16~ 美國 25 292 0.0856 1.69 0.145
## 8 https://www.ptt.cc/bbs/Gossiping/M.16~ 台灣 16 292 0.0548 1.06 0.0582
## 9 https://www.ptt.cc/bbs/Gossiping/M.16~ 美國 25 225 0.111 1.69 0.188
## 10 https://www.ptt.cc/bbs/Gossiping/M.16~ 存在 21 651 0.0323 3.21 0.103
## 11 https://www.ptt.cc/bbs/Gossiping/M.16~ 仰光 17 314 0.0541 1.99 0.108
## 12 https://www.ptt.cc/bbs/Gossiping/M.16~ 中共 20 359 0.0557 2.78 0.155
## 13 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國 23 268 0.0858 1.24 0.106
## 14 https://www.ptt.cc/bbs/Gossiping/M.16~ 群眾 16 649 0.0247 3.13 0.0772
## 15 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍方 35 582 0.0601 1.35 0.0813
## 16 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國 29 753 0.0385 1.24 0.0477
## 17 https://www.ptt.cc/bbs/Gossiping/M.16~ 台灣 17 139 0.122 1.06 0.130
## 18 https://www.ptt.cc/bbs/Gossiping/M.16~ 中國 24 263 0.0913 1.24 0.113
## 19 https://www.ptt.cc/bbs/Gossiping/M.16~ 軍方 19 281 0.0676 1.35 0.0915
The articles contain words such as 中國 (China), 美國 (the US), 台灣 (Taiwan), and 仰光 (Yangon).
coup_words_tf_idf %>%
group_by(artUrl) %>%
slice_max(tf_idf, n=10) %>%
ungroup() %>%
count(word, sort=TRUE)
## # A tibble: 3,333 x 2
## word n
## <chr> <int>
## 1 軍方 19
## 2 中國 16
## 3 香港 10
## 4 親中 10
## 5 美國 9
## 6 軍人 9
## 7 軍政府 9
## 8 緬甸 8
## 9 工廠 7
## 10 台商 7
## # ... with 3,323 more rows
軍方 (the military), 軍政府 (the junta), and 中國 (China) are the focus of discussion.
# filter out the keywords "緬甸" and "軍政"
word_pairs <- coup_words %>%
pairwise_count(word, artUrl, sort = TRUE) %>%
filter(!item1 %in% c("緬甸", "軍政") & !item2 %in% c("緬甸", "軍政"))
word_pairs
## # A tibble: 4,388,182 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 政變 軍政府 59
## 2 軍政府 政變 59
## 3 政府 軍政府 58
## 4 軍政府 政府 58
## 5 政變 軍方 51
## 6 軍政府 中國 51
## 7 中國 軍政府 51
## 8 軍方 政變 51
## 9 抗議 軍政府 47
## 10 政變 政府 47
## # ... with 4,388,172 more rows
Pairs such as (政變, 軍政府), (軍政府, 中國), and (抗議, 軍政府) often occur together.
word_cors <- coup_words %>%
group_by(word) %>%
filter(n() >= 10) %>%
pairwise_cor(word, artUrl, sort = TRUE)
word_cors
## # A tibble: 131,406 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 綜合 外電報導 0.694
## 2 外電報導 綜合 0.694
## 3 舞弊 大選 0.665
## 4 大選 舞弊 0.665
## 5 執政黨 舞弊 0.656
## 6 舞弊 執政黨 0.656
## 7 執政黨 發言人 0.640
## 8 發言人 執政黨 0.640
## 9 綜合 報導 0.601
## 10 報導 綜合 0.601
## # ... with 131,396 more rows
word_cors %>%
filter(item1 %in% c("緬甸", "政變")) %>%
group_by(item1) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light")) # set a Chinese font so Chinese labels render correctly
# choose a few words as seed words
seed_words <- c("有沒有", "超過", "指出", "已有")
# set the threshold to 0.5
threshold <- 0.5
# words whose correlation with a seed word exceeds the threshold go on the removal list
remove_words <- word_cors %>%
filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
.$item1 %>%
unique()
remove_words
## [1] "舉行" "指出" "已有" "記者" "援助" "報導" "超過" "活動"
# remove the pairs that contain any of these words
word_cors_new <- word_cors %>%
filter(!(item1 %in% remove_words|item2 %in% remove_words))
word_cors_new %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
The graph shows:
Comments:
Posts:
Comments:
Posts:
tweets %>%
group_by(location) %>%
filter(location!="") %>%
count(location, sort = TRUE) %>%
mutate(location = reorder(location,n)) %>%
head(10)
## # A tibble: 10 x 2
## # Groups: location [10]
## location n
## <fct> <int>
## 1 United States 477
## 2 Myanmar 416
## 3 Singapore 145
## 4 United Kingdom 127
## 5 Yangon 30
## 6 Japan 28
## 7 Washington, DC 26
## 8 London, UK 19
## 9 New York, USA 18
## 10 Tokyo Japan Los Angeles United States 18
trend_article = fread('trend_articleMetaData.csv',encoding = 'UTF-8')
jieba_tokenizer <- worker(user="dict/user_dict.txt", stop_word = "dict/stop_words.txt")
# define the tokenizer function
customized_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
trend_article <- trend_article %>% unnest_tokens(word, sentence, token=customized_tokenizer)
# filter out special characters
data_select = trend_article %>%
filter(!grepl('[[:punct:]]',word)) %>% # remove punctuation
filter(!grepl("['^0-9a-z']",word)) %>% # remove English letters and digits
filter(nchar(.$word)>1)
v = c("緬甸","疫情")
# word_count: word, count
word_count <- data_select %>%
group_by(word) %>%
filter(word %in% v)%>%
summarise(count=n()) # use summarise to count per-word totals
## `summarise()` ungrouping output (override with `.groups` argument)
## # A tibble: 2 x 2
## word count
## <chr> <int>
## 1 疫情 211
## 2 緬甸 17