Install packages
packages = c("dplyr","ggplot2","scales","rtweet","xml2","httr","jsonlite","magrittr","data.tree","NLP","igraph","sentimentr","tidytext","stringr","wordcloud2","DiagrammeR")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
load("coreNLP_all_1.RData")

(1). Twitter API setup: fetching tweets with rtweet
app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app,consumer_key, consumer_secret,
access_token, access_secret,set_renv = FALSE)
# Consumer keys: identify who you are
# Authentication tokens: the authorization granted to your app
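Before searching, it can help to confirm the credentials actually authenticate; a minimal sketch using rtweet's rate_limit(), which should return the remaining quota for the search endpoint if the token works (this check is not part of the original analysis):
# sanity check: a valid token returns the rate-limit status of the search endpoint
rate_limit(token = twitter_token, query = "search/tweets")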
(2). Setting keywords to fetch tweets
# Search keywords
key = c("#covid-19")
context = "vaccine"
q = paste(c(key,context),collapse=" AND ")
# The query string is "#covid-19 AND vaccine"
# Requiring "vaccine" to appear as well avoids matching tweets that mention #covid-19 but are not about vaccines
# Fetch 5,000 tweets, excluding retweets
tweets = search_tweets(q, lang = "en", n = 5000, include_rts = FALSE, token = twitter_token)
## Warning: Rate limit exceeded - 88
## Warning: Rate limit exceeded
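The rate-limit warnings above mean the search stopped early, so fewer than 5,000 tweets came back. If completeness matters more than speed, search_tweets() can wait out the limit and resume; a small sketch of the same call with the extra retryonratelimit flag (not what was run here):
# wait and resume automatically when the rate limit is hit (slower, but closer to n tweets)
tweets = search_tweets(q, lang = "en", n = 5000, include_rts = FALSE,
                       retryonratelimit = TRUE, token = twitter_token)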
(3). Cleaning tweet text
## Helper function for text cleaning
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="")  # convert the character encoding
  txt = gsub("(@|#)\\w+", "", txt)             # remove @mentions and #hashtags (@ or # followed by letters, digits, underscores)
  txt = gsub("(http|https)://.*", "", txt)     # remove URLs (. = any character, * = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt)             # remove runs of two or more spaces or tabs
  txt = gsub("\\n"," ",txt)                    # replace line breaks with spaces
  txt = gsub("\\s+"," ",txt)                   # collapse one or more whitespace characters (+ = one or more) into a single space
  txt = gsub("^\\s+|\\s+$","",txt)             # trim leading and trailing whitespace
  txt = gsub("&.*;","",txt)                    # remove HTML entity codes
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt)       # keep only letters, digits, spaces and ? ! . ' (drops emoji and other symbols)
  txt
}
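A quick sanity check of what clean() does, using a made-up tweet rather than one from the dataset:
# hypothetical example: mention, URL, hashtag and stray punctuation are stripped
clean("@WHO vaccine rollout update: 2nd doses available now!! https://t.co/abc123 #covid19")
## [1] "vaccine rollout update 2nd doses available now!!"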
tweets$text = clean(tweets$text)  # apply the cleaning function to the tweet text
df = data.frame()
df = rbind(df,tweets) # transfer to data frame
df = df[!duplicated(df[,"status_id"]),]  # remove duplicate tweets

head(df)
## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 15571186… 1381534678… 2021-04-12 09:08:29 RuddFuneral… Why could Morr… Twitte…
## 2 12808449… 1381534671… 2021-04-12 09:08:27 Bob36783646 93 Israeli doc… Twitte…
## 3 82274462… 1381534651… 2021-04-12 09:08:23 Nilesh_TNIE Vaccine shorta… Twitte…
## 4 316371232 1381534606… 2021-04-12 09:08:12 FahimQasim5… Over 175.1 mil… Twitte…
## 5 466701933 1381534603… 2021-04-12 09:08:11 c21st_org With the vacci… Twitte…
## 6 31073711… 1381534561… 2021-04-12 09:08:01 DiaquipLtd Covid19 G7 nat… Buffer
## # … with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
created_at is already a date-type column, so min and max give the earliest and latest timestamps directly.
Note: rtweet can only retrieve tweets from roughly the last 10 days.
nrow(df)
## [1] 3194

min(df$created_at)
## [1] "2021-04-12 01:58:12 UTC"

max(df$created_at)
## [1] "2021-04-12 09:08:29 UTC"
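To see how those tweets spread across the collection window, a small sketch that counts tweets per hour (only columns already in df are used):
# number of tweets collected in each hour of the window
df %>%
  mutate(hour = format(created_at, "%Y-%m-%d %H:00")) %>%
  count(hour)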
(1). Setting up the API call
Server side:
+ Start the CoreNLP server in a terminal first
+ In the CoreNLP directory, open a terminal and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
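Before sending annotation requests it is worth confirming the server is reachable; a minimal sketch assuming the local server on port 9000 as above:
# should print 200 when the CoreNLP server is up
httr::status_code(httr::GET("http://127.0.0.1:9000/"))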
# Build the CoreNLP API URL: convert the local host address into a URL the CoreNLP service accepts
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){  # tokenization is not based on whitespace
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
}
# Specify where the service is running
host = "127.0.0.1"
generate_API_url(host)

# Call the CoreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# Assume two CoreNLP servers: one for English (port 9000) and one for Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# Build the API URL
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}

# Run each document through the CoreNLP service
coreNLP <- function(data,host){
# Send each document to CoreNLP in turn; each document comes back as JSON
# Store the processed results as R objects
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}

coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}

(2). Data-wrangling functions
coreNLP_tokens_parser extracts the tokenization results from the returned objects and outputs them in tidy-data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}

coreNLP_dependency_parser extracts the word dependency relations from the returned CoreNLP objects and outputs them in tidy-data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}

# Visualize the dependency parse result as a tree
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}

(1). Tokenization, lemmatization, POS tagging, and NER
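The obj used below holds the CoreNLP results loaded from coreNLP_all_1.RData at the top of this document. To regenerate it from the cleaned tweets instead, a rough sketch using the functions defined above (one request per tweet, so this is slow for thousands of rows):
# regenerate the CoreNLP objects from the cleaned tweets; slow for a large df
obj = coreNLP(df[, c("status_id", "text")], host)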
tokens = coreNLP_tokens_parser(obj)

head(tokens,20)
## status_id word lemma pos ner
## 1 1381162673338003457 John John NNP PERSON
## 2 1381162673338003457 Tory Tory NNP PERSON
## 3 1381162673338003457 Got get VBD O
## 4 1381162673338003457 A a DT O
## 5 1381162673338003457 COVID19 covid19 NN O
## 6 1381162673338003457 Vaccine vaccine NN O
## 7 1381162673338003457 Dose dose NN O
## 8 1381162673338003457 He he PRP O
## 9 1381162673338003457 'Literally 'literally RB O
## 10 1381162673338003457 Did do VBD O
## 11 1381162673338003457 Not not RB O
## 12 1381162673338003457 Even even RB O
## 13 1381162673338003457 Feel feel VB O
## 14 1381162673338003457 The the DT O
## 15 1381162673338003457 Needle' needle' NN O
## 16 1381133716878196736 Doug Doug NNP PERSON
## 17 1381133716878196736 Ford Ford NNP PERSON
## 18 1381133716878196736 gets get VBZ O
## 19 1381133716878196736 first first JJ ORDINAL
## 20 1381133716878196736 dose dose NN O
(2). Named entity recognition (NER)
unique(tokens$ner)
## [1] "PERSON" "O" "ORDINAL"
## [4] "ORGANIZATION" "CITY" "MISC"
## [7] "COUNTRY" "NATIONALITY" "TIME"
## [10] "IDEOLOGY" "CAUSE_OF_DEATH" "TITLE"
## [13] "DATE" "NUMBER" "DURATION"
## [16] "STATE_OR_PROVINCE" "LOCATION" "PERCENT"
## [19] "RELIGION" "SET" "CRIMINAL_CHARGE"
## [22] "MONEY" "URL"
# Excluding tokens tagged "O" (other), how many distinct words were tagged as a named entity?
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 3176
(3). Lowercasing
Letter case affects CoreNLP's NER decisions, so the tweet text we sent to the annotators kept its original casing. After running the annotators, however, word frequencies should be counted case-insensitively, so we create two new columns, lower_word and lower_lemma, holding the lowercased word and lemma. Lowercasing maps different-cased forms of the same term (e.g., Evergiven and evergiven) to a single form before counting frequencies.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)

Using CoreNLP's NER we can extract the countries (COUNTRY) mentioned in tweets about covid-19, to get a first look at which countries dominate the discussion.
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()

+ "India" reported more than 140,000 new confirmed cases on two consecutive days, putting its government on high alert, and its vaccine stock covers only about 3 more days of doses
+ The "US" declared victory over the virus too early, so a fourth wave is approaching; it has administered more than 150 million COVID-19 vaccine doses, and roughly one fifth of the population is fully vaccinated
tokens %>%
filter(ner == "ORGANIZATION") %>% #篩選NER為ORGANIZATION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()

tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSON
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()

+ Johnson & Johnson vaccine
+ Moody's Analytics
dependencies = coreNLP_dependency_parser(obj)

head(dependencies,20)
## status_id dep governor governorGloss dependent
## 1 1381162673338003457 ROOT 0 ROOT 13
## 2 1381162673338003457 compound 2 Tory 1
## 3 1381162673338003457 nsubj 3 Got 2
## 4 1381162673338003457 parataxis 13 Feel 3
## 5 1381162673338003457 det 7 Dose 4
## 6 1381162673338003457 compound 7 Dose 5
## 7 1381162673338003457 compound 7 Dose 6
## 8 1381162673338003457 obj 3 Got 7
## 9 1381162673338003457 nsubj 13 Feel 8
## 10 1381162673338003457 advmod 13 Feel 9
## 11 1381162673338003457 aux 13 Feel 10
## 12 1381162673338003457 advmod 13 Feel 11
## 13 1381162673338003457 advmod 13 Feel 12
## 14 1381162673338003457 det 15 Needle' 14
## 15 1381162673338003457 obj 13 Feel 15
## 16 1381133716878196736 ROOT 0 ROOT 3
## 17 1381133716878196736 compound 2 Ford 1
## 18 1381133716878196736 nsubj 3 gets 2
## 19 1381133716878196736 amod 5 dose 4
## 20 1381133716878196736 obj 3 gets 5
## dependentGloss
## 1 Feel
## 2 John
## 3 Tory
## 4 Got
## 5 A
## 6 COVID19
## 7 Vaccine
## 8 Dose
## 9 He
## 10 'Literally
## 11 Did
## 12 Not
## 13 Even
## 14 The
## 15 Needle'
## 16 gets
## 17 Doug
## 18 Ford
## 19 first
## 20 dose
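The tidy dependency table can also be turned into a graph directly; a minimal sketch with igraph, using only the columns already in dependencies and the first status_id shown above:
# dependency graph for a single tweet (drop the artificial ROOT edge)
edges = dependencies %>%
  filter(status_id == "1381162673338003457", dep != "ROOT") %>%
  select(from = governorGloss, to = dependentGloss)
plot(igraph::graph_from_data_frame(edges))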
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
##
## content
## The following object is masked from 'package:ggplot2':
##
## annotate
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

Sentiment scores range from 0 (lowest) to 4 (highest):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
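For later reference, once sentimentValue is numeric (see below) the five scores can be collapsed into these three coarse buckets; a minimal sketch with dplyr::case_when (the helper name is made up):
# hypothetical helper: collapse the 0-4 score into negative / neutral / positive
to_bucket = function(v) {
  dplyr::case_when(
    v <= 1 ~ "negative",
    v == 2 ~ "neutral",
    TRUE   ~ "positive"
  )
}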
sentiment = coreNLP_sentiment_parser(obj)

head(sentiment,20)
## status_id
## 1 1381162673338003457
## 2 1381133716878196736
## 3 1381123156300140544
## 4 1381131207602274304
## 5 1381147814055583746
## 6 1381014676356673537
## 7 1381004122283991052
## 8 1381142027165970432
## 9 1381141019392049153
## 10 1381144054189162498
## 11 1381143280822460416
## 12 1381129434229579778
## 13 1381162654702694401
## 14 1381140108028612610
## 15 1381162652047708164
## 16 1381162648973246466
## 17 1381162643998789632
## 18 1381162616052191235
## 19 1381072014216208389
## 20 1381132409194885121
## text
## 1 John Tory Got A COVID19 Vaccine Dose He 'Literally Did Not Even Feel The Needle'
## 2 Doug Ford gets first dose of AstraZeneca COVID19 vaccine
## 3 Toronto looks to create standby lists at mass COVID19 vaccination clinics
## 4 Here's why Canadians have reason to be more optimistic about the COVID19 vaccine rollout
## 5 Pfizer BioNTech seek U.S. emergency nod for COVID19 vaccine in adolescents
## 6 COVID19 vaccine shortages to hit worlds poorest countries as COVAX halts deliveries
## 7 Pfizers COVID19 shot less effective against South African variant study
## 8 Second doses erroneously administered at Toronto COVID19 vaccination clinics
## 9 Employees at Toronto operated vaccine clinics test positive for COVID19
## 10 JJ COVID19 vaccine under EU review over blood clots
## 11 Expired COVID19 vaccine given out at Metro Vancouver pharmacy
## 12 Ford to receive first dose of COVID19 vaccine this morning
## 13 How Trump's Initiative and Free Market Innovation Created the Coronavirus Vaccinevia
## 14 The answer to your COVID19 vaccine question. To learn more about the COVID19 vaccines visit
## 15 The COVID19 vaccine is safe effective. The British Islamic Medical Association has confirmed you can have the vaccine during Ramadan. If you prefer make an appointment before or after Ramadan. To book your vaccine visit
## 16 While global equity concerns are acknowledged can it trump a countrys own domestic vaccine equity concerns? The latter is very difficult to sustain politically and socially too.
## 17 The COVID19 vaccine was only made available to the public after meeting strict safety and effectiveness criteria. Vaccines work by teaching your immune system how to defend itself against attack from the virus. Read more
## 18 Indians with comorbidities face a high risk in terms of severeand mortality. It's imperative to vaccinate vulnerable groups asap and then expand to other ageson India's public health challenge from the
## 19 As severalvaccine centres report low supplies amid a surge of cases we askand health sector analystabout Indias vaccine manufacturing capacity and the implications of vaccine shortage for public health
## 20 Dr Reddys Laboratories could get emergencyuse authorisation to make thevaccine. India needs many more doses to expandbeyond the elderly those with comorbidities health sector analyst
## sentiment sentimentValue
## 1 Neutral 2
## 2 Neutral 2
## 3 Neutral 2
## 4 Neutral 2
## 5 Neutral 2
## 6 Neutral 2
## 7 Neutral 2
## 8 Neutral 2
## 9 Positive 3
## 10 Neutral 2
## 11 Neutral 2
## 12 Neutral 2
## 13 Neutral 2
## 14 Neutral 2
## 15 Positive 3
## 16 Negative 1
## 17 Negative 1
## 18 Negative 1
## 19 Negative 1
## 20 Negative 1
unique(sentiment$sentiment)
## [1] "Neutral" "Positive" "Negative" "Verynegative" "Verypositive"

sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric

# Look at the distribution of tweet sentiment
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 1152 3330 449 4 4
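The same counts can be plotted directly; a small sketch with ggplot2, using only the sentiment column already in the data frame:
# bar chart of the sentiment label distribution
sentiment %>%
  count(sentiment) %>%
  ggplot(aes(x = reorder(sentiment, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "sentiment", y = "count", title = "Tweet-level sentiment distribution")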
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()

sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(date,source) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=source)) +
geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
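The message above is only informational; the grouping left by summarise() can be dropped explicitly to silence it, as in this variant of the same aggregation (not what was run here):
# same aggregation, with the grouping dropped explicitly to avoid the message
sentiment %>%
  merge(df[,c("status_id","source","date")]) %>%
  filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
  group_by(date, source) %>%
  summarise(avg_sentiment = mean(sentimentValue, na.rm = TRUE), .groups = "drop")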
##### What does the sentiment distribution look like, and which words are used in positive versus negative tweets?
# Word usage in positive tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the','covid19','vaccine',19,'covid19.','covid')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% #根據lemma分組
summarize(count = n()) %>%
filter(count >5 & count<400)%>%
wordcloud2()
## Joining, by = "word"

# Word usage in negative tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lower_lemma) %>%
summarize(count = n()) %>%
filter(count >10 &count<400)%>%
wordcloud2()
## Joining, by = "word"
library(sentimentr)
set.seed(10)
mytext <- get_sentences(tweets$text)                     # split the text into sentences
x <- sample(tweets$text, 1000, replace = FALSE)          # randomly sample 1,000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x)            # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts   # count how often each word appears
sentiment_counts[polarity > 0,]  # positive words
## words polarity n
## 1: efficacy 1.0 48
## 2: top 1.0 35
## 3: approved 1.0 8
## 4: approval 1.0 5
## 5: fast 1.0 5
## ---
## 360: masters 0.1 1
## 361: pray 0.1 1
## 362: shares 0.1 1
## 363: depends 0.1 1
## 364: church 0.1 1
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10)  # the most frequent negative words
## Selecting by n
## words polarity n
## 1: virus -0.50 31
## 2: government -0.50 27
## 3: disease -1.00 20
## 4: risk -0.75 19
## 5: prevent -0.25 18
## 6: fall -0.25 12
## 7: strain -0.60 12
## 8: drug -0.10 11
## 9: shot -0.40 11
## 10: cancer -0.75 11
set.seed(12)
df%>%
filter(status_id %in% sample(unique(status_id), 30)) %>%  # randomly pick 30 tweets
mutate(review = get_sentences(text)) %$%
sentiment_by(review, status_id) %>%
highlight()## Saved in /var/folders/p2/cqvmy7c17px138qxl3wdzfzm0000gn/T//RtmpdM7882/polarity.html
## Opening /var/folders/p2/cqvmy7c17px138qxl3wdzfzm0000gn/T//RtmpdM7882/polarity.html ...
tweets$date = format(tweets$created_at,'%Y%m%d')
(out = tweets %>% with(
sentiment_by( #document level
get_sentences(text),
list( date)
)
))
plot(out)

(out = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% with(
sentiment_by(
get_sentences(text),
list(source, date)
)
))
plot(out)