安裝package
# Packages this analysis depends on. Each name is listed once, and the list
# includes scales, stringr and magrittr, which the script attaches with
# library() but which were previously never installed.
packages <- c(
  "dplyr", "ggplot2", "rtweet", "xml2", "httr", "jsonlite", "data.tree",
  "NLP", "igraph", "sentimentr", "tidytext", "wordcloud2", "DiagrammeR",
  "scales", "stringr", "magrittr"
)
# Install only the packages that are not already in the library.
existing <- as.character(installed.packages()[, 1])
for (pkg in setdiff(packages, existing)) {
  install.packages(pkg)
}
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)load("coreNLP_all.RData")(1). Twitter API設定 透過rtweet抓取tweets
# Twitter API credentials are read from environment variables (for example
# set in ~/.Renviron) so that secrets never live in the source file.
# NOTE(review): the keys that used to be hard-coded here were published with
# this file and should be revoked and regenerated.
read_required_env <- function(name) {
  value <- Sys.getenv(name, unset = "")
  if (!nzchar(value)) {
    stop("Environment variable ", name, " is not set", call. = FALSE)
  }
  value
}
app <- "2021_sma"
consumer_key <- read_required_env("TWITTER_CONSUMER_KEY")
consumer_secret <- read_required_env("TWITTER_CONSUMER_SECRET")
access_token <- read_required_env("TWITTER_ACCESS_TOKEN")
access_secret <- read_required_env("TWITTER_ACCESS_SECRET")
twitter_token <- create_token(
  app, consumer_key, consumer_secret,
  access_token, access_secret,
  set_renv = FALSE
)
# (2). 設定關鍵字抓tweets
# Require "democracy" alongside the hashtag so the search targets the Milk Tea
# Alliance movement rather than tweets about actual milk tea.
key <- c("#milkteaalliance")
context <- "democracy"
q <- paste(c(key, context), collapse = " AND ")
# Request 10000 English tweets and leave retweets out.
tweets <- search_tweets(
  q,
  n = 10000,
  include_rts = FALSE,
  lang = "en",
  token = twitter_token
)
# (3). tweets內容清理
# Normalise raw tweet text before it is sent to the NLP tools.
#
# txt: character vector of tweet texts.
# Returns a character vector of the same length that contains only ASCII
# letters, digits, spaces and the characters ? ! . ' with URLs, HTML
# entities, @mentions and #hashtags removed and whitespace collapsed and
# trimmed.
clean <- function(txt) {
  # Drop every byte that is not plain ASCII (emoji, accented letters, ...).
  txt <- iconv(txt, "latin1", "ASCII", sub = "")
  # Remove each URL only up to the next whitespace so the text that follows a
  # link is kept. URLs go first so a "#" or "@" inside a link is not treated
  # as a hashtag or mention.
  txt <- gsub("https?://\\S+", " ", txt)
  # Remove one HTML entity at a time (e.g. &amp;); a greedy "&.*;" would also
  # delete all text between the first "&" and the last ";".
  txt <- gsub("&[A-Za-z0-9#]+;", " ", txt)
  # Remove @mentions and #hashtags. They are replaced by a space, not by
  # nothing, so the neighbouring words do not get glued together.
  txt <- gsub("[@#]\\w+", " ", txt)
  # Turn newlines and tabs into spaces before the character filter, which
  # would otherwise delete them and join the surrounding words.
  txt <- gsub("\\s+", " ", txt)
  txt <- gsub("[^a-zA-Z0-9?!. ']", "", txt)
  # Collapse and trim last, because the removals above can leave new gaps.
  txt <- gsub(" {2,}", " ", txt)
  gsub("^ +| +$", "", txt)
}
# Replace the raw tweet text with its cleaned form.
tweets$text <- clean(tweets$text)
# Copy the tweets into df and keep the first row of every status_id.
df <- rbind(data.frame(), tweets)
df <- df[!duplicated(df[,"status_id"]), ]
# df共有90個欄位,其中包括:
head(df)## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 13562227~ 1381307252~ 2021-04-11 18:04:47 HtunPaPaKya~ at Taungoo. My~ Twitte~
## 2 13562227~ 1381305899~ 2021-04-11 17:59:24 HtunPaPaKya~ Monywa youths ~ Twitte~
## 3 13562227~ 1380561517~ 2021-04-09 16:41:29 HtunPaPaKya~ Antidictatorsh~ Twitte~
## 4 13562227~ 1380843797~ 2021-04-10 11:23:10 HtunPaPaKya~ Youths in Mawl~ Twitte~
## 5 13758551~ 1381195082~ 2021-04-11 10:39:03 LuChaw1619 Thank you Aust~ Twitte~
## 6 13758551~ 1381307249~ 2021-04-11 18:04:46 LuChaw1619 Loudest voices~ Twitte~
## # ... with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
使用min,max來看資料最久、最近的日期
nrow(df)## [1] 8498
min(df$created_at)## [1] "2021-04-05 08:01:55 UTC"
max(df$created_at)## [1] "2021-04-11 18:04:47 UTC"
查看每日討論情形
# Daily tweet volume.
# as.Date() on a POSIXct value takes a time zone as its second argument, not
# a format string, so the zone is named explicitly here.
tweets$created_at <- as.Date(tweets$created_at, tz = "UTC")
tweets %>%
  group_by(created_at) %>%
  summarise(count = n()) %>%
  ggplot() +
  geom_line(aes(x = created_at, y = count)) +
  scale_x_date(labels = date_format("%m/%d"))
# 從每日發文數量我們觀察到4/7、4/8左右開始關於MilkTeaAlliance的討論度上升。
進一步探討之後發現,Twitter公司於2021/4/8使用官方帳戶Twitter Public Policy宣布為奶茶聯盟設計了專用emoji,當推文中包含#MilkTeaAlliance、#nnevvy等奶茶聯盟相關標籤時將會被自動添加該emoji,以紀念奶茶聯盟成立一週年。
因此,推測可能因推出新emoji而使討論度提升。
(1). API呼叫的設定
server端 :
+ 需先在terminal開啟corenlp server
+ 在corenlp的路徑下開啟terminal輸入 java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# Build the request URL for a CoreNLP server.
#
# host: server host name or IP address.
# port: server port.
# tokenize.whitespace: "true"/"false"; sent as the tokenize.whitespace
#   property.
# annotators: annotator names as one comma-separated string.
# ssplit.eolonly: optional "true"/"false"; when not NULL it is sent as the
#   ssplit.eolonly property. NULL (the default) leaves the property out, which
#   keeps the URL identical to what earlier callers received.
# Returns the URL-encoded request URL as a character string.
generate_API_url <- function(host, port = "9000",
                             tokenize.whitespace = "false", annotators = "",
                             ssplit.eolonly = NULL) {
  properties <- sprintf(
    '"tokenize.whitespace":"%s","annotators":"%s"',
    tokenize.whitespace, annotators
  )
  if (!is.null(ssplit.eolonly)) {
    properties <- sprintf(
      '%s,"ssplit.eolonly":"%s"', properties, ssplit.eolonly
    )
  }
  url <- sprintf("http://%s:%s/?properties={%s}", host, port, properties)
  # Returned visibly so a bare call at the prompt prints the URL.
  URLencode(url)
}

# Address of the CoreNLP server.
host <- "127.0.0.1"
generate_API_url(host)

# Send one text to a CoreNLP server and return the parsed JSON response.
#
# server_host: host name or IP address of the server.
# text: the text to annotate; sent as the request body.
# host: not used by this function; kept so existing calls keep working.
# language: "eng" selects port 9000, any other value selects port 9001 (the
#   script assumes an English server on 9000 and a Chinese one on 9001).
# tokenize.whitespace, ssplit.eolonly: forwarded to the server as properties.
#   ssplit.eolonly used to be accepted here but was never sent.
# annotators: character vector of annotator names.
# Returns the response body parsed from JSON; stops with an error when the
# server answers with an HTTP error status.
call_coreNLP <- function(server_host, text, host = "localhost",
                         language = "eng", tokenize.whitespace = "true",
                         ssplit.eolonly = "true",
                         annotators = c("tokenize", "ssplit", "pos", "lemma",
                                        "ner", "parse", "sentiment")) {
  port <- if (language == "eng") 9000 else 9001
  url <- generate_API_url(
    server_host,
    port = port,
    tokenize.whitespace = tokenize.whitespace,
    annotators = paste0(annotators, collapse = ","),
    ssplit.eolonly = ssplit.eolonly
  )
  result <- POST(url, body = text, encode = "json")
  # Fail with the HTTP status instead of a JSON parse error on a bad reply.
  httr::stop_for_status(result)
  httr::content(result, "parsed", "application/json", encoding = "UTF-8")
}
# Run the documents through the coreNLP service
# Annotate every row of `data` with the CoreNLP server at `host`.
#
# data: data frame with a `text` column; rows are handed over by apply().
# host: server address passed on to call_coreNLP().
# Returns a list with one element per row, each of the form
# list(doc = <value returned by call_coreNLP()>, data = <the row>).
coreNLP <- function(data, host) {
  annotate_row <- function(row) {
    list(doc = call_coreNLP(host, row["text"]), data = row)
  }
  apply(data, 1, annotate_row)
}
# (2). 資料整理function
從回傳的object中整理斷詞出結果,輸出為 tidydata 格式
# Turn the objects returned by coreNLP() into one tidy row per token.
#
# coreNLP_objects: list whose elements have `doc` (parsed server response)
#   and `data` (the source row, which must include a `text` field).
# Returns a data frame holding the source-row fields other than `text`,
# followed by word, lemma, pos and ner. Tokens of every sentence in a
# document are included (previously only the first sentence was read), and
# documents without any token are skipped instead of raising an error.
# Returns NULL when no document has tokens.
coreNLP_tokens_parser <- function(coreNLP_objects) {
  per_doc <- lapply(coreNLP_objects, function(obj) {
    # Flatten the per-sentence token lists into one list of tokens.
    token_list <- unlist(
      lapply(obj$doc$sentences, function(sen) sen$tokens),
      recursive = FALSE
    )
    if (length(token_list) == 0) {
      return(NULL)
    }
    tokens <- do.call(rbind, lapply(token_list, function(x) {
      data.frame(word = x$word, lemma = x$lemma, pos = x$pos, ner = x$ner)
    }))
    # Repeat the document's own fields once per token, then attach the tokens.
    obj$data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(seq_len(n()), each = nrow(tokens))) %>%
      bind_cols(tokens)
  })
  do.call(rbind, Filter(Negate(is.null), per_doc))
}
# 從回傳的core-nlp object中整理出詞彙依存關係,輸出為 tidydata 格式
# Turn the objects returned by coreNLP() into one tidy row per dependency.
#
# coreNLP_objects: list whose elements have `doc` (parsed server response)
#   and `data` (the source row, which must include a `text` field).
# Returns a data frame holding the source-row fields other than `text`,
# followed by dep, governor, governorGloss, dependent and dependentGloss as
# found in each sentence's basicDependencies. Every sentence of a document is
# included (previously only the first one was read), and documents without
# dependencies are skipped instead of raising an error. Returns NULL when no
# document has dependencies.
# NOTE(review): governor/dependent are presumably token positions within
# their own sentence - confirm before comparing them across sentences.
coreNLP_dependency_parser <- function(coreNLP_objects) {
  per_doc <- lapply(coreNLP_objects, function(obj) {
    # Flatten the per-sentence dependency lists into one list.
    dep_list <- unlist(
      lapply(obj$doc$sentences, function(sen) sen$basicDependencies),
      recursive = FALSE
    )
    if (length(dep_list) == 0) {
      return(NULL)
    }
    dependencies <- do.call(rbind, lapply(dep_list, function(x) {
      data.frame(
        dep = x$dep,
        governor = x$governor,
        governorGloss = x$governorGloss,
        dependent = x$dependent,
        dependentGloss = x$dependentGloss
      )
    }))
    # Repeat the document's own fields once per dependency, then attach them.
    obj$data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(seq_len(n()), each = nrow(dependencies))) %>%
      bind_cols(dependencies)
  })
  do.call(rbind, Filter(Negate(is.null), per_doc))
}
# 從回傳的core-nlp object中整理出語句情緒,輸出為 tidydata 格式
# Turn the objects returned by coreNLP() into one tidy row per document.
#
# coreNLP_objects: list whose elements have `doc` (parsed server response)
#   and `data` (the source row).
# Returns a data frame holding the source-row fields followed by the
# `sentiment` and `sentimentValue` of the document's first sentence.
# Documents whose response contains no sentence are skipped instead of
# raising an error. Returns NULL when no document has a sentence.
coreNLP_sentiment_parser <- function(coreNLP_objects) {
  per_doc <- lapply(coreNLP_objects, function(obj) {
    sentences <- obj$doc$sentences
    if (length(sentences) == 0) {
      return(NULL)
    }
    # Only the first sentence is read, so the result stays one row per
    # document.
    first <- sentences[[1]]
    obj$data %>%
      t() %>%
      data.frame() %>%
      bind_cols(data.frame(
        sentiment = first$sentiment,
        sentimentValue = first$sentimentValue
      ))
  })
  do.call(rbind, Filter(Negate(is.null), per_doc))
}
# Draw the dependency result as a graph
# Convert a CoreNLP bracketed parse string into a data.tree tree.
#
# ptext: the parse string of one sentence, e.g. "(ROOT (S (NP ...)))".
# Returns the object built by data.tree::FromDataFrameNetwork() from the
# parent/child edges of the parse tree.
# Stops with an error when the NLP or data.tree package is not installed.
# igraph is no longer demanded because nothing here uses it.
parse2tree <- function(ptext) {
  for (pkg in c("NLP", "data.tree")) {
    if (!requireNamespace(pkg, quietly = TRUE)) {
      stop("Package '", pkg, "' is required by parse2tree()", call. = FALSE)
    }
  }
  # Make the CoreNLP parse string look like an openNLP one.
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)
  # Give every label and word a unique name by appending its running index,
  # so repeated labels become distinct nodes.
  ms <- gregexpr("[^() ]+", ptext)
  words <- regmatches(ptext, ms)[[1]]
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))
  # One row per edge: number of nodes minus one, minus one more because the
  # edges leaving the TOP node are skipped.
  edgelist <- matrix("", nrow = length(words) - 2, ncol = 2)
  # Recursive walker that fills `edgelist` in place; the wrapper function
  # only exists to keep the row counter `i` private.
  edgemaker <- (function() {
    i <- 0
    g <- function(node) {
      if (inherits(node, "Tree")) {
        # "TOP1" is the TOP label after the index was appended above.
        if ((val <- node$value) != "TOP1") {
          for (child in node$children) {
            childval <- if (inherits(child, "Tree")) child$value else child
            i <<- i + 1
            edgelist[i, 1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  edgemaker(NLP::Tree_parse(ptext))
  data.tree::FromDataFrameNetwork(as.data.frame(edgelist))
}
# 取得coreNLP回傳的物件
(前面已經載入過coreNLP_all.RData,可先不跑這段)
gc() # free memory that is no longer in use
t0 <- Sys.time()
# coreNLP() needs a data frame that has a `text` column. The two columns are
# picked by name rather than by position, so a change in column order cannot
# silently send the wrong fields to the server.
obj <- df[, c("status_id", "text")] %>%
  filter(text != "") %>%
  coreNLP(host)
Sys.time() - t0 # elapsed time
# Parse the annotations and save the workspace, so later runs can load the
# .RData file instead of calling the server again.
tokens <- coreNLP_tokens_parser(obj)
dependencies <- coreNLP_dependency_parser(obj)
sentiment <- coreNLP_sentiment_parser(obj)
save.image("coreNLP_all.RData")
# (1). 斷詞、詞彙還原、詞性標註、NER
tokens = coreNLP_tokens_parser(obj)head(tokens,20)## status_id word lemma pos ner
## 1 1381307252754182146 at at IN O
## 2 1381307252754182146 Taungoo. Taungoo. NNP O
## 3 1381307252754182146 Myanmar Myanmar NNP NATIONALITY
## 4 1381307252754182146 will will MD O
## 5 1381307252754182146 also also RB O
## 6 1381307252754182146 fully fully RB O
## 7 1381307252754182146 revive revive VB O
## 8 1381307252754182146 to to IN O
## 9 1381307252754182146 true true JJ O
## 10 1381307252754182146 Federal federal JJ IDEOLOGY
## 11 1381307252754182146 Democracy democracy NN IDEOLOGY
## 12 1381307252754182146 as as IN O
## 13 1381307252754182146 old old JJ O
## 14 1381307252754182146 leaves leaf NNS O
## 15 1381307252754182146 gone go VBN O
## 16 1381307252754182146 and and CC O
## 17 1381307252754182146 new new JJ O
## 18 1381307252754182146 leaves leaf NNS O
## 19 1381307252754182146 born. born. NN O
## 20 1381305899575222273 Monywa Monywa NNP CITY
(2). 命名實體標註(NER)
unique(tokens$ner)## [1] "O" "NATIONALITY" "IDEOLOGY"
## [4] "CITY" "LOCATION" "ORGANIZATION"
## [7] "NUMBER" "DATE" "TITLE"
## [10] "DURATION" "MISC" "ORDINAL"
## [13] "TIME" "STATE_OR_PROVINCE" "PERSON"
## [16] "CAUSE_OF_DEATH" "RELIGION" "COUNTRY"
## [19] "CRIMINAL_CHARGE" "SET"
#除去entity為Other,有多少種word有被標註entity
length(unique(tokens$word[tokens$ner != "O"])) ## [1] 712
(3). 轉小寫
一開始給的推文內容是沒有處理大小寫的,但在跑完annotator後,為了正確計算詞頻,創建新欄位lower_word與lower_lemma,存放轉換成小寫的word與lemma,目的是要將大小寫的同一字詞統一換成小寫,再來計算詞頻
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)我們可以透過coreNLP中的NER解析出在Twitter上面談論MilkTeaAlliance,所涉及到的國家(COUNTRY),以初步了解這個議題的主要國家。
# Horizontal bar chart of the top-10 (by count) lower-cased tokens whose NER
# tag is COUNTRY.
tokens %>%
  filter(ner == "COUNTRY") %>%
  count(lower_word, name = "count") %>%
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text = element_text(size = 14)) +
  coord_flip()
從圖表中可以發現
+ 中國被討論的最多。
+ 奶茶聯盟最初是由台灣、泰國、香港的網友組成,因此台灣、泰國、中國出現次數較多。
+ 2021年2月,緬甸爆發反對軍事政變的抗議活動。泰國的社運分子聲援緬甸的示威者,皇家緬甸茶包的圖片在網上被分享數千次。泰國藝術家新浪·維塔亞維羅伊畫了一幅插畫,圖中貼有泰國、台灣、香港、印度和緬甸旗幟的奶茶杯出現在「奶茶聯盟」的大字下,該圖在網上瘋傳。因此,緬甸被提及次數也較多。
+ 另外,奶茶聯盟也擴大觸及到澳洲、印度等亞太地區國家。
# Horizontal bar chart of the top-10 (by count) lower-cased tokens whose NER
# tag is NATIONALITY.
tokens %>%
  filter(ner == "NATIONALITY") %>%
  count(lower_word, name = "count") %>%
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is NATIONALITY)") +
  theme(text = element_text(size = 14)) +
  coord_flip()
# 我們可以透過coreNLP中的NER解析出在Twitter上面談論MilkTeaAlliance,所涉及到的人物(PERSON),以初步了解這個議題的主要人物。
# Horizontal bar chart of the top-10 (by count) lower-cased tokens whose NER
# tag is PERSON.
tokens %>%
  filter(ner == "PERSON") %>%
  count(lower_word, name = "count") %>%
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text = element_text(size = 14)) +
  coord_flip()
# Dependency table built from the annotated documents.
dependencies <- coreNLP_dependency_parser(obj)
head(dependencies, 20)
## status_id dep governor governorGloss dependent dependentGloss
## 1 1381307252754182146 ROOT 0 ROOT 7 revive
## 2 1381307252754182146 advmod 7 revive 1 at
## 3 1381307252754182146 compound 3 Myanmar 2 Taungoo.
## 4 1381307252754182146 nsubj 7 revive 3 Myanmar
## 5 1381307252754182146 aux 7 revive 4 will
## 6 1381307252754182146 advmod 7 revive 5 also
## 7 1381307252754182146 advmod 7 revive 6 fully
## 8 1381307252754182146 case 11 Democracy 8 to
## 9 1381307252754182146 amod 11 Democracy 9 true
## 10 1381307252754182146 amod 11 Democracy 10 Federal
## 11 1381307252754182146 obl 7 revive 11 Democracy
## 12 1381307252754182146 case 14 leaves 12 as
## 13 1381307252754182146 amod 14 leaves 13 old
## 14 1381307252754182146 nmod 11 Democracy 14 leaves
## 15 1381307252754182146 acl 14 leaves 15 gone
## 16 1381307252754182146 cc 18 leaves 16 and
## 17 1381307252754182146 amod 18 leaves 17 new
## 18 1381307252754182146 conj 14 leaves 18 leaves
## 19 1381307252754182146 dep 18 leaves 19 born.
## 20 1381305899575222273 ROOT 0 ROOT 3 staged
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)情緒分數從最低分0~最高分4
+ 0,1 : very negative,negative
+ 2 : neutral
+ 3,4 : positive,very positive
sentiment = coreNLP_sentiment_parser(obj)head(sentiment,20)## status_id
## 1 1381307252754182146
## 2 1381305899575222273
## 3 1380561517062582275
## 4 1380843797022642178
## 5 1381195082410336257
## 6 1381307249100947459
## 7 1381193923142414336
## 8 1379294596014907394
## 9 1380931768522579981
## 10 1379294442692042756
## 11 1379118581686923265
## 12 1381143722025349122
## 13 1381096781954473991
## 14 1380475714944655363
## 15 1381307237944107011
## 16 1381307226934046724
## 17 1381230687248908288
## 18 1381228523633926145
## 19 1381226679461371905
## 20 1381307220571348994
## text
## 1 at Taungoo. Myanmar will also fully revive to true Federal Democracy as old leaves gone and new leaves born.
## 2 Monywa youths staged AntiCoup Sand Sculpture Strike on the sandbank of the Chindwin River. Their wishes such as We Want Democracy and We Need R2P in Myanmar were demostrated.
## 3 Antidictatorship strike at Demoso city Kayah State. The stronger we are the closer we get to our federal democracy road.
## 4 Youths in Mawlamyine did a prodemocracy movement by writing We Want Democracy and Free Our Leaders on the palms as a protest against military coup. ELECTED CRPH
## 5 Thank you Australiafor your shouts of encouragement towards Bago people to stay strong to prevail democracy to stop genocidal crimes against humanity. According to due to Juntas there have been 82deaths in Bago
## 6 Loudest voices for Democracy within SanChaung Yangon.
## 7 Monywa youths staged AntiCoup Sand Sculpture Strike on the sandbank of the Chindwin River. Their wishes such as We Want Democracy and We Need R2P in Myanmar were demostrated.
## 8 Dawn strike at DaikU township Bago Region. Citizens havent lost the will to fight for democracy.
## 9 Youths in Mawlamyine did a prodemocracy movement by writing We Want Democracy and Free Our Leaders on the palms as a protest against military coup. ELECTED CRPH
## 10 Rain falling doesn't stop protesters of Namti township Kachin state going strike against terrorist SAC. Their courage for Democracy is enomous.
## 11 Singu Protest against military regime. They had to drive 40 miles to upload these photos taken on April 4 on internet due toin those areas. Their courage and craving for Democracy is enormous.
## 12 Monywa youths staged AntiCoup Sand Sculpture Strike on the sandbank of the Chindwin River. Their wishes such as We Want Democracy and We Need R2P in Myanmar were demostrated.
## 13 campaign also represented for our refugees who has to shelter in forests. Myanmar will also fully revive to true Federal Democracy as old leaves gone and new leaves born. photo credit
## 14 Antidictatorship strike at Demoso city Kayah State. The stronger we are the closer we get to our federal democracy road.
## 15 Loudest voices for Democracy within SanChaung Yangon.
## 16 Loudest voices for Democracy within SanChaung Yangon.
## 17 Monywa youths staged AntiCoup Sand Sculpture Strike on the sandbank of the Chindwin River. Their wishes such as We Want Democracy and We Need R2P in Myanmar were demostrated.
## 18 at Taungoo. Myanmar will also fully revive to true Federal Democracy as old leaves gone and new leaves born.
## 19 campaign also represented for our refugees who has to shelter in forests. Myanmar will also fully revive to true Federal Democracy as old leaves gone and new leaves born. photo credit
## 20 Loudest voices for Democracy within SanChaung Yangon.
## sentiment sentimentValue
## 1 Neutral 2
## 2 Neutral 2
## 3 Neutral 2
## 4 Neutral 2
## 5 Negative 1
## 6 Neutral 2
## 7 Neutral 2
## 8 Negative 1
## 9 Neutral 2
## 10 Negative 1
## 11 Positive 3
## 12 Neutral 2
## 13 Neutral 2
## 14 Neutral 2
## 15 Neutral 2
## 16 Neutral 2
## 17 Neutral 2
## 18 Neutral 2
## 19 Neutral 2
## 20 Neutral 2
unique(sentiment$sentiment)## [1] "Neutral" "Negative" "Positive"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric#了解情緒文章的分佈
sentiment$sentiment %>% table()## .
## Negative Neutral Positive
## 2997 4795 706
# Line chart of the mean sentimentValue per calendar day.
df$date <- as.Date(df$created_at)
sentiment %>%
  # The join key is named explicitly instead of relying on whichever column
  # names the two tables happen to share.
  merge(df[, c("status_id", "source", "date")], by = "status_id") %>%
  group_by(date) %>%
  # TRUE is spelled out because T is an ordinary variable that can be
  # reassigned.
  summarise(avg_sentiment = mean(sentimentValue, na.rm = TRUE)) %>%
  ggplot(aes(x = date, y = avg_sentiment)) +
  geom_line()
從圖中可以發現4/7情緒分數顯著地上升,又twitter於4/8宣布推出新emoji,雖然整體平均情緒分數並沒有由負面轉正面(大多小於2),但是可以發現的是在其之後幾天的分數相較於4/7以前的分數還是有著上升趨勢。
# Word cloud of the lemmas used in tweets rated Positive or Verypositive.
sentiment %>%
  merge(tokens) %>%
  # Match stop words against the lower-cased token. Joining on the
  # original-case `word` left capitalised stop words such as "The" in the
  # data. Naming the key also silences the "Joining, by" message.
  anti_join(stop_words, by = c("lower_word" = "word")) %>%
  filter(!lower_word %in% c("i", "the")) %>%
  filter(sentiment == "Verypositive" | sentiment == "Positive") %>%
  group_by(lower_lemma) %>%
  summarize(count = n()) %>%
  # Keep lemmas that occur more than 10 and fewer than 400 times.
  filter(count > 10 & count < 400) %>%
  wordcloud2()
可以觀察到正面文章中包括courage、freedom、democracy等詞彙
# Word cloud of the lemmas used in tweets rated Negative or Verynegative.
sentiment %>%
  merge(tokens) %>%
  # Match stop words against the lower-cased token. Joining on the
  # original-case `word` left capitalised stop words such as "The" in the
  # data. Naming the key also silences the "Joining, by" message.
  anti_join(stop_words, by = c("lower_word" = "word")) %>%
  filter(!lower_word %in% c("i", "the")) %>%
  filter(sentiment == "Verynegative" | sentiment == "Negative") %>%
  group_by(lower_lemma) %>%
  summarize(count = n()) %>%
  # Keep lemmas that occur more than 20 and fewer than 400 times.
  filter(count > 20 & count < 400) %>%
  wordcloud2()
# 而負面文章中則較常談論到death、severe、victim等詞彙,另外,雖然正負面文章都有dictatorship(獨裁政權),但是在負面文章中出現次數較正面文章多,推測其中之緣由為milkteaalliance理念是反極權統治,因此在負面情緒文章中才會較多地提及獨裁政權等詞彙。
library(sentimentr)## Warning: package 'sentimentr' was built under R version 4.0.5
set.seed(10)
mytext <- get_sentences(tweets$text) #將text轉成list of characters型態
x <- sample(tweets$text, 1000, replace = FALSE) #隨機取1000筆,取後不放回
sentiment_words <- extract_sentiment_terms(x) #抓取其中帶有情緒的字
sentiment_counts <- attributes(sentiment_words)$counts #計算出現次數
sentiment_counts[polarity > 0,] #正面的字## words polarity n
## 1: justice 1.00 22
## 2: please 1.00 17
## 3: bravely 1.00 3
## 4: top 1.00 3
## 5: honor 1.00 2
## ---
## 140: big 0.25 1
## 141: wireless 0.25 1
## 142: praying 0.10 2
## 143: collectively 0.10 1
## 144: moral 0.10 1
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) #出現次數最多的負面字## Selecting by n
## words polarity n
## 1: protest -0.50 241
## 2: strike -0.75 215
## 3: penal -0.60 118
## 4: stop -0.40 113
## 5: treason -1.00 113
## 6: dictatorship -1.00 99
## 7: fight -0.50 94
## 8: protesters -0.60 63
## 9: lost -0.75 47
## 10: terrorist -1.00 45
set.seed(12)
df%>%
filter(status_id %in% sample(unique(status_id), 30)) %>% #隨機30筆貼文
mutate(review = get_sentences(text)) %$%
sentiment_by(review, status_id) %>%
highlight()## Saved in C:\Users\user\AppData\Local\Temp\Rtmp67MODA/polarity.html
## Opening C:\Users\user\AppData\Local\Temp\Rtmp67MODA/polarity.html ...
tweets$date = format(tweets$created_at,'%Y%m%d')
(out = tweets %>% with(
sentiment_by( #document level
get_sentences(text),
list( date)
)
))
plot(out) 從圖表中可以看到4/8twitter宣布新emoji之後,對於此議題的討論度大大提升,且情緒分數也趨向正向。
coreNLP
sentimentr