Install packages
= c("dplyr","ggplot2","rtweet" ,"xml2", "httr", "jsonlite", "data.tree", "NLP", "igraph","sentimentr","tidytext","wordcloud2","DiagrammeR","dplyr")
packages = as.character(installed.packages()[,1])
existing for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)
load("coreNLP_all.RData")
(1). Twitter API setup: fetching tweets via rtweet
app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'           # Consumer keys: identify who you are
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'   # Authentication tokens: the authorization granted to you
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app, consumer_key, consumer_secret,
                              access_token, access_secret,
                              set_renv = FALSE)
(2). Set search keywords and fetch tweets
# search keywords
key = c("#Covid")
context = "vaccine"
q = paste(c(key, context), collapse=" AND ")
q # query string: "#Covid AND vaccine"
# To avoid matching tweets that merely mention #Covid but are not about vaccines, we require "vaccine" to appear as well
# Fetch 5000 tweets, excluding retweets
tweets = search_tweets(q, lang="en", n=5000, include_rts = FALSE, token = twitter_token)
(3). Cleaning the tweet text
## used for data cleaning
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # change the text encoding
  txt = gsub("(@|#)\\w+", "", txt) # remove word characters after @ or # (mentions and hashtags)
  txt = gsub("(http|https)://.*", "", txt) # remove URLs (. = any character, * = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces or tabs
  txt = gsub("\\n"," ",txt) # remove line breaks
  txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters into a single space (+ = one or more)
  txt = gsub("^\\s+|\\s+$","",txt) # trim leading/trailing whitespace
  txt = gsub("&.*;","",txt) # remove HTML entity codes
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces, ?, !, ., ' (drops emoji etc.)
  txt
}
tweets$text = clean(tweets$text) # apply the cleaning function to the text column
df = data.frame()
df = rbind(df, tweets) # convert to a data frame
df = df[!duplicated(df[,"status_id"]),] # remove duplicate tweets
head(df)
## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 1111747~ 1381181784~ 2021-04-11 09:46:13 clusterInd~ Our Modi Goverm~ Twitter~
## 2 1111747~ 1380716910~ 2021-04-10 02:58:58 clusterInd~ Our Modi Goverm~ Twitter~
## 3 1111747~ 1380548319~ 2021-04-09 15:49:03 clusterInd~ Our Modi Goverm~ Twitter~
## 4 1111747~ 1380542072~ 2021-04-09 15:24:14 clusterInd~ Our Modi Goverm~ Twitter~
## 5 1111747~ 1381061171~ 2021-04-11 01:46:56 clusterInd~ Our Modi Goverm~ Twitter~
## 6 1111747~ 1380541631~ 2021-04-09 15:22:28 clusterInd~ Our Modi Goverm~ Twitter~
## # ... with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
df has 90 columns, but we will only use a few of them here.
created_at is already a date-type column, so we can call min and max on it directly to see the earliest and latest dates.
Note: rtweet can only fetch tweets from roughly the last 10 days.
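For reference, a minimal sketch (assuming dplyr, loaded above) that keeps just the handful of columns used in the rest of this post; df_small is only an illustrative name and is not reused below.
df_small = df %>%
  select(user_id, status_id, created_at, screen_name, text, source) # columns referenced later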
nrow(df)
## [1] 4776
min(df$created_at)
## [1] "2021-04-06 12:39:29 UTC"
max(df$created_at)
## [1] "2021-04-11 09:46:13 UTC"
(1). API call setup
Server side:
+ Start the CoreNLP server in a terminal first.
+ Open a terminal in the CoreNLP directory and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
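Before sending any text, it can help to confirm the server is reachable. A minimal sketch using httr (loaded above); it assumes the server was started on port 9000 as shown and that your CoreNLP build exposes the /live health endpoint (a plain GET on http://127.0.0.1:9000 also works).
resp <- httr::GET("http://127.0.0.1:9000/live") # liveness check of the CoreNLP server
httr::status_code(resp)                         # 200 means the server is up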
# Generate the coreNLP API url: turn the local host address into a url for the coreNLP service
generate_API_url <- function(host, port="9000",
                             tokenize.whitespace="false", annotators=""){ # tokenization is not based on whitespace
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
  url <- URLencode(url)
  url
}
# specify the location of the service
host = "127.0.0.1"
generate_API_url(host)
# Call the coreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                         tokenize.whitespace="true", ssplit.eolonly="true",
                         annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # Assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001)
  # Generate the API url
  url <- generate_API_url(server_host, port=port,
                          tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed", "application/json", encoding = "UTF-8")
  return(doc)
}
# Run documents through the coreNLP service
coreNLP <- function(data, host){
  # Feed each document to core-nlp in turn; each document's result comes back as JSON
  # and is stored as an R object
  result <- apply(data, 1, function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  return(result)
}
(2). Data wrangling functions
Extract the tokenization results from the returned objects and output them in tidy data format.
coreNLP_tokens_parser <- function(coreNLP_objects){

  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence of the document
    sentences <- doc$sentences
    sen <- sentences[[1]]

    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))

    tokens <- original_data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(1:n(), each = nrow(tokens))) %>%
      bind_cols(tokens)
    tokens
  }))
  return(result)
}
Extract the word dependency relations from the returned core-nlp objects and output them in tidy data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence of the document
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))

    dependencies <- original_data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(1:n(), each = nrow(dependencies))) %>%
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}
Extract the sentence-level sentiment from the returned core-nlp objects and output it in tidy data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence of the document
    sentences <- doc$sentences
    sen <- sentences[[1]]

    sentiment <- original_data %>%
      t() %>%
      data.frame() %>%
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
    sentiment
  }))
  return(result)
}
# Graphically display the dependency (parse) result
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  # this step modifies the coreNLP parse tree to mimic the openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)

  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]] # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words

  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)

  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0 # row counter
    g <- function(node) { # the recursive function
      if (inherits(node, "Tree")) { # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i, 1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()

  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return(tree)
}
Obtain the objects returned by coreNLP
Do not run this chunk yet; it takes about half an hour (and may crash if you only have 4GB of RAM). A small-sample sanity check is sketched right after this chunk.
#gc() # release unused memory
#t0 = Sys.time()
#obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) # send to the local server for processing
# the object passed to coreNLP must be a data.frame with a text column
#Sys.time() - t0 # execution time
#Time difference of 28 mins
#save.image("coreNLP.RData")
# save the objects we will need first, so the RData can be loaded directly later
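A minimal sanity check before committing to the full run, assuming the CoreNLP server from the previous section is up; obj_test and tokens_test are illustrative names and are not reused below.
obj_test = df[1:5, c("status_id","text")] %>% filter(text != "") %>% coreNLP(host) # annotate only 5 tweets
tokens_test = coreNLP_tokens_parser(obj_test)
head(tokens_test)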
tokens = coreNLP_tokens_parser(obj)
dependencies = coreNLP_dependency_parser(obj)
sentiment = coreNLP_sentiment_parser(obj)
#save.image("coreNLP_all.RData")
(1). Tokenization, lemmatization, POS tagging, and NER
tokens = coreNLP_tokens_parser(obj)
head(tokens,20)
## status_id word lemma pos ner
## 1 1381181784663060481 Our we PRP$ O
## 2 1381181784663060481 Modi Modi NNP O
## 3 1381181784663060481 Goverment Goverment NNP O
## 4 1381181784663060481 is be VBZ O
## 5 1381181784663060481 responsible responsible JJ O
## 6 1381181784663060481 forspread forspread NN O
## 7 1381181784663060481 in in IN O
## 8 1381181784663060481 India India NNP COUNTRY
## 9 1381181784663060481 they they PRP O
## 10 1381181784663060481 are be VBP O
## 11 1381181784663060481 not not RB O
## 12 1381181784663060481 allowing allow VBG O
## 13 1381181784663060481 Vaccination vaccination NN O
## 14 1381181784663060481 to to IN O
## 15 1381181784663060481 the the DT O
## 16 1381181784663060481 most most RBS O
## 17 1381181784663060481 exposed exposed JJ O
## 18 1381181784663060481 citizen citizen NN O
## 19 1381181784663060481 like like IN O
## 20 1381181784663060481 me. me. NN O
(2). Named entity recognition (NER)
unique(tokens$ner)
## [1] "O" "COUNTRY" "ORDINAL"
## [4] "TIME" "MISC" "NUMBER"
## [7] "CAUSE_OF_DEATH" "ORGANIZATION" "DATE"
## [10] "PERSON" "LOCATION" "CITY"
## [13] "STATE_OR_PROVINCE" "DURATION" "NATIONALITY"
## [16] "SET" "TITLE" "PERCENT"
## [19] "IDEOLOGY" "CRIMINAL_CHARGE" "MONEY"
## [22] "URL" "RELIGION"
# Excluding the "O" (other) tag, how many distinct words were labeled with an entity type
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 2893
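Before drilling into specific entity types, a quick sketch (assuming dplyr) of which entity types appear most often in the corpus:
tokens %>%
  filter(ner != "O") %>%   # drop tokens without an entity tag
  count(ner, sort = TRUE)  # frequency of each entity type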
(3). Convert to lowercase
Capitalization also affects CoreNLP's NER decisions, so the tweet text we fed in was not case-normalized. After running the annotators, however, we create two new columns, lower_word and lower_lemma, holding the lowercased word and lemma so that word frequencies are counted correctly. Lowercasing maps different capitalizations of the same token (e.g. Evergiven and evergiven) onto a single form before counting.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
Using CoreNLP's NER tags, we can pull out the countries (COUNTRY) mentioned in the Twitter discussion of COVID-19 vaccines, to get a first look at the main countries involved in this topic.
tokens %>%
  filter(ner == "COUNTRY") %>% # keep tokens whose NER tag is COUNTRY
  group_by(lower_word) %>% # group by word
  summarize(count = n()) %>% # count each group
  top_n(n = 13, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14)) +
  coord_flip()
Using CoreNLP's NER tags, we can also pull out the organizations (ORGANIZATION) mentioned in the discussion, to get a first look at the main companies and institutions involved.
tokens %>%
  filter(ner == "ORGANIZATION") %>% # keep tokens whose NER tag is ORGANIZATION
  group_by(lower_word) %>% # group by word
  summarize(count = n()) %>% # count each group
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14)) +
  coord_flip()
Finally, the NER tags let us pull out the people (PERSON) mentioned in the discussion, to get a first look at the main figures involved in this topic.
tokens %>%
  filter(ner == "PERSON") %>% # keep tokens whose NER tag is PERSON
  group_by(lower_word) %>% # group by word
  summarize(count = n()) %>% # count each group
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14)) +
  coord_flip()
dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
## status_id dep governor governorGloss dependent
## 1 1381181784663060481 ROOT 0 ROOT 24
## 2 1381181784663060481 nmod:poss 3 Goverment 1
## 3 1381181784663060481 compound 3 Goverment 2
## 4 1381181784663060481 nsubj 6 forspread 3
## 5 1381181784663060481 cop 6 forspread 4
## 6 1381181784663060481 amod 6 forspread 5
## 7 1381181784663060481 parataxis 24 country 6
## 8 1381181784663060481 case 8 India 7
## 9 1381181784663060481 nmod 6 forspread 8
## 10 1381181784663060481 nsubj 12 allowing 9
## 11 1381181784663060481 aux 12 allowing 10
## 12 1381181784663060481 advmod 12 allowing 11
## 13 1381181784663060481 parataxis 24 country 12
## 14 1381181784663060481 obj 12 allowing 13
## 15 1381181784663060481 case 18 citizen 14
## 16 1381181784663060481 det 18 citizen 15
## 17 1381181784663060481 advmod 17 exposed 16
## 18 1381181784663060481 amod 18 citizen 17
## 19 1381181784663060481 obl 12 allowing 18
## 20 1381181784663060481 case 20 me. 19
## dependentGloss
## 1 country
## 2 Our
## 3 Modi
## 4 Goverment
## 5 is
## 6 responsible
## 7 forspread
## 8 in
## 9 India
## 10 they
## 11 are
## 12 not
## 13 allowing
## 14 Vaccination
## 15 to
## 16 the
## 17 most
## 18 exposed
## 19 citizen
## 20 like
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)
Sentiment scores range from 0 (lowest) to 4 (highest):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
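If a coarse three-way view is enough, a hypothetical helper (not part of coreNLP) can collapse the 0-4 value into the buckets above:
# Sketch: map a 0-4 CoreNLP sentiment value to negative / neutral / positive
score_to_label <- function(v) {
  cut(as.numeric(v), breaks = c(-Inf, 1, 2, Inf),
      labels = c("negative", "neutral", "positive"))
}
score_to_label(c(0, 1, 2, 3, 4)) # negative negative neutral positive positive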
sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
## status_id
## 1 1381181784663060481
## 2 1380716910761807879
## 3 1380548319177371648
## 4 1380542072856342529
## 5 1381061171734749186
## 6 1380541631506472965
## 7 1380852244422135812
## 8 1381177037801414656
## 9 1380541042672365569
## 10 1380852787714527237
## 11 1380560168363040769
## 12 1380542621169246210
## 13 1381181245850218499
## 14 1380183407188131846
## 15 1381181030791536640
## 16 1381180063631040514
## 17 1380221838501019658
## 18 1381179597539147776
## 19 1380032556981121026
## 20 1381177704335212546
## text
## 1 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 2 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 3 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 4 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 5 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 6 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 7 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 8 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 9 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 10 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 11 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 12 Our Modi Goverment is responsible forspread in India they are not allowing Vaccination to the most exposed citizen like me. India is the country of Youngster with big populationbut no Vaccine for us.
## 13 Got my 2nd vaccine yesterday. 2 AM and I'm feeling all the symptoms
## 14 is an experimental medical procedure derived from a . No one has the right to tell others to accept it to have access to society. If the vaccine worksunvaccinated people won't impact those vaccinated. This is a personal choice. Sue .
## 15 The people who decided to bar students fromstate school based on a required experimentalbioweapon should not only be suedbut arrested and checked forties.
## 16 This country is dying.. No vaccine available No medicine available Queue for remisdisevir
## 17 Govt ofwcould learn from that. Travel pass that includes negativetest or certificate ofor immunity! BORDERS MUST OPEN FOR ALGERIANS their families ITS BEEN 13m!
## 18 Algerians living or stranded abroad AREto return to . Even with negativetest vaccine or immunity cert gov't bans them from their own country separating families partners for over 1y now.
## 19 People may experience side effects all over the body like fever headache muscleache tiredness after the vaccine. These effects are usually mild than developing coronavirus.
## 20 When you get your 19 vaccine yourprocesses information on the spike protein generates an immune response to it. Hence these vaccines require two doses because the second dose helps to better reinforce this immune response.
## sentiment sentimentValue
## 1 Negative 1
## 2 Negative 1
## 3 Negative 1
## 4 Negative 1
## 5 Negative 1
## 6 Negative 1
## 7 Negative 1
## 8 Negative 1
## 9 Negative 1
## 10 Negative 1
## 11 Negative 1
## 12 Negative 1
## 13 Neutral 2
## 14 Neutral 2
## 15 Negative 1
## 16 Negative 1
## 17 Neutral 2
## 18 Negative 1
## 19 Negative 1
## 20 Positive 3
unique(sentiment$sentiment)
## [1] "Negative" "Neutral" "Positive" "Verypositive"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric()
# look at the distribution of sentiment across tweets
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verypositive
## 1419 2503 830 2
df$date = as.Date(df$created_at)
sentiment %>%
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>%
  summarise(avg_sentiment = mean(sentimentValue, na.rm=T)) %>%
  ggplot(aes(x=date, y=avg_sentiment)) +
  geom_line()
+ The plot shows that the overall sentiment of the discussion leans negative.
# look at word usage in positive tweets
sentiment %>%
  merge(tokens) %>%
  anti_join(stop_words) %>%
  filter(!lower_word %in% c('i','the')) %>%
  filter(sentiment == "Verypositive" | sentiment == 'Positive') %>%
  group_by(lower_lemma) %>% # group by lemma
  summarize(count = n()) %>%
  filter(count > 5 & count < 400) %>%
  wordcloud2()
## Joining, by = "word"
# look at word usage in negative tweets
sentiment %>%
  merge(tokens) %>%
  anti_join(stop_words) %>%
  filter(!lower_word %in% c('i','the')) %>%
  filter(sentiment == "Verynegative" | sentiment == 'Negative') %>%
  group_by(lower_lemma) %>% # group by lemma
  summarize(count = n()) %>%
  filter(count > 10 & count < 400) %>%
  wordcloud2()
library(sentimentr)
## Warning: package 'sentimentr' was built under R version 4.0.5
mytext <- c(
  'do you like it? But I hate really bad dogs',
  'I am the best friend.',
  'Do you really like it? I\'m not a fan'
)
mytext <- get_sentences(mytext) # converts the character vector into a list of character vectors, already split into sentences
Sentiment scores fall between -1 and 1: scores below 0 are negative, scores above 0 are positive, and 0 is neutral.
sentiment_by(mytext) #document level
## element_id word_count sd ave_sentiment
## 1: 1 10 1.497465 -0.8088680
## 2: 2 5 NA 0.5813777
## 3: 3 9 0.284605 0.2196345
sentiment(mytext) #sentence level
## element_id sentence_id word_count sentiment
## 1: 1 1 4 0.2500000
## 2: 1 2 6 -1.8677359
## 3: 2 1 5 0.5813777
## 4: 3 1 5 0.4024922
## 5: 3 2 4 0.0000000
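To make the thresholds above concrete, a small sketch that labels each sentence of the toy example by the sign of its score (the label column is illustrative, not part of sentimentr's output):
sentence_scores <- sentiment(mytext) # sentence-level scores, as above
sentence_scores$label <- ifelse(sentence_scores$sentiment > 0, "positive",
                                ifelse(sentence_scores$sentiment < 0, "negative", "neutral"))
sentence_scores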
set.seed(10)
mytext <- get_sentences(tweets$text) # convert text into a list of character vectors (split into sentences)
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count how often each word appears
sentiment_counts[polarity > 0,] # positive words
## words polarity n
## 1: please 1.0 24
## 2: efficacy 1.0 9
## 3: benefits 1.0 7
## 4: understand 1.0 7
## 5: care 1.0 6
## ---
## 437: cooperation 0.1 1
## 438: authenticity 0.1 1
## 439: overcome 0.1 1
## 440: coalition 0.1 1
## 441: technology 0.1 1
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # the most frequent negative words
## Selecting by n
## words polarity n
## 1: risk -0.75 44
## 2: shot -0.40 38
## 3: virus -0.50 19
## 4: jab -0.60 19
## 5: death -0.75 19
## 6: shortage -0.75 19
## 7: pandemic -1.00 19
## 8: die -0.75 16
## 9: stop -0.40 12
## 10: government -0.50 11
set.seed(12)
df %>%
  filter(status_id %in% sample(unique(status_id), 30)) %>% # randomly select 30 tweets
  mutate(review = get_sentences(text)) %$%
  sentiment_by(review, status_id) %>%
  highlight()
## Saved in C:\Users\sammy\AppData\Local\Temp\RtmpWmsWM2/polarity.html
## Opening C:\Users\sammy\AppData\Local\Temp\RtmpWmsWM2/polarity.html ...
tweets$date = format(tweets$created_at, '%Y%m%d')
out = tweets %>% with(
  sentiment_by( # document level
    get_sentences(text),
    list(date)
  )
)
plot(out)