Install packages
packages = c("dplyr", "ggplot2", "rtweet", "xml2", "httr", "jsonlite", "data.tree", "NLP", "igraph", "sentimentr", "tidytext", "wordcloud2", "DiagrammeR")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)
load("coreNLP.RData")
(1). Twitter API setup: fetching tweets with rtweet
app = 'r lan'
consumer_key = 'dJefV7i1VsTTJ9rg4NGKsaVk8'
consumer_secret = 'S2m6I1bHt6sBVd4y0vv2v2Mmq4XYb5qRhNCO1KbOmDzp9nWiXh'
access_token = '3610603634-lAhazEWR7VpLgyMt5Ink3jojkg8iSBWIqbqqjaF'
access_secret = 'pvKFRmnJmhft9SOp1aMcwBBNEX9hqsc5jtHpMONUgpWkm'
twitter_token <- create_token(app, consumer_key, consumer_secret,
                              access_token, access_secret, set_renv = FALSE)
(2). Setting keywords to fetch tweets
# Search keywords
key = c("#COVID19")
context = "#vaccine"
q = paste(c(key, context), collapse = " AND ")
# Query string: "#COVID19 AND #vaccine"
# Requiring #vaccine to co-occur avoids picking up #COVID19 tweets that are unrelated to vaccination
# Fetch 5000 tweets, excluding retweets
tweets = search_tweets(q, lang = "en", n = 5000, include_rts = FALSE, token = twitter_token)
(3). Cleaning tweet content
## Helper for data cleaning
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="")   # normalize the encoding (drops non-ASCII characters such as emoji)
  txt = gsub("(@|#)\\w+", "", txt)              # remove @mentions and #hashtags (word characters after @ or #)
  txt = gsub("(http|https)://.*", "", txt)      # remove URLs (.: any character, *: zero or more times)
  txt = gsub("[ \t]{2,}", "", txt)              # remove runs of two or more spaces/tabs
  txt = gsub("\\n", " ", txt)                   # replace line breaks with spaces
  txt = gsub("\\s+", " ", txt)                  # collapse consecutive whitespace into a single space
  txt = gsub("^\\s+|\\s+$", "", txt)            # trim leading/trailing whitespace
  txt = gsub("&.*;", "", txt)                   # remove HTML character entities
  txt = gsub("[^a-zA-Z0-9?!. ']", "", txt)      # keep only letters, digits, spaces, ?, !, ., ' (drops anything else)
  txt
}
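As a quick sanity check, the cleaner can be run on a made-up tweet; the string below is purely hypothetical and is not part of the collected data:
# Hypothetical input, just to illustrate what the regexes remove
clean("Thanks @WHO for the #vaccine update &amp; stay safe!! 💉 https://example.com")
# mentions, hashtags, the HTML entity, the emoji and the URL are stripped;
# note that runs of 2+ spaces are deleted outright, so words next to removed tokens may be joined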
tweets$text = clean(tweets$text)   # apply the cleaner to the tweet text
df = data.frame()
df = rbind(df, tweets)             # convert to a data frame
df = df[!duplicated(df[,"status_id"]),]   # drop duplicated tweets
head(df)
## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 135187648 1380922284… 2021-04-10 16:35:03 WorldBankAf… NEWreport find… Hootsu…
## 2 471615885 1380922251… 2021-04-10 16:34:55 MeadowGood What about any… Twitte…
## 3 41626609 1380922195… 2021-04-10 16:34:42 greg_folkers . Quotation of… Twitte…
## 4 10430292… 1380921774… 2021-04-10 16:33:01 tweet_aneri inTelangana le… Twitte…
## 5 11349623… 1380921111… 2021-04-10 16:30:23 AccessCNY Every program … Hootsu…
## 6 11349623… 1379803826… 2021-04-07 14:30:42 AccessCNY Thank you to F… Hootsu…
## # … with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
df has 90 columns in total, but we will only use a few of them here.
created_at is already a date/time column, so min and max can be used directly to find the earliest and latest dates.
Note: rtweet can only retrieve tweets from roughly the last 10 days.
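If desired, the frame can be narrowed to those fields up front; a minimal sketch (df_small is a hypothetical name, and the column list simply mirrors the fields referenced below):
# Keep only the columns this analysis actually touches
df_small = df %>%
  select(status_id, created_at, source, text,
         quoted_name, quoted_location, quoted_followers_count)
dim(df_small)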
nrow(df)
## [1] 4948
min(df$created_at)
## [1] "2021-04-06 22:05:04 UTC"
max(df$created_at)
## [1] "2021-04-10 16:35:03 UTC"
df %>%
  filter(quoted_name != "") %>%             # keep rows where quoted_name is not empty
  group_by(quoted_name) %>%                 # group by quoted_name
  summarize(count = n()) %>%                # count each group
  top_n(n = 15, count) %>%
  ungroup() %>%
  mutate(user = reorder(quoted_name, count)) %>%
  ggplot(aes(user, count)) +
  geom_col() +
  ggtitle("Word Frequency (quoted_name)") +
  theme(text = element_text(size = 14)) +
  coord_flip()
We can see that the accounts whose posts get quoted by other users are mostly organizations, news agencies, and magazines, such as Reuters, Financial Times, EU Medicines Agency, The Wall Street Journal, and The New York Times. Posts from these accounts are less likely to be misinformation, which is why users repost them.
df %>%
  filter(quoted_location != "") %>%         # keep rows where quoted_location is not empty
  group_by(quoted_location) %>%             # group by quoted_location
  summarize(count = n()) %>%                # count each group
  top_n(n = 15, count) %>%
  ungroup() %>%
  mutate(location = reorder(quoted_location, count)) %>%
  ggplot(aes(location, count)) +
  geom_col() +
  ggtitle("Word Frequency (quoted_location)") +
  theme(text = element_text(size = 14)) +
  coord_flip()
df %>%
  filter(quoted_followers_count != "") %>%  # keep rows where quoted_followers_count is not empty
  group_by(quoted_followers_count) %>%      # group by follower count
  summarize(count = n()) %>%                # count each group
  top_n(n = 15, count) %>%
  ungroup() %>%
  mutate(number = reorder(quoted_followers_count, count)) %>%
  ggplot(aes(number, count)) +
  geom_col() +
  ggtitle("Word Frequency (quoted_followers_count)") +
  theme(text = element_text(size = 14)) +
  coord_flip()
Accounts whose posts get quoted tend to have relatively large follower counts.
df %>%
  filter(quoted_name != "") %>%
  filter(quoted_location != "") %>%
  filter(quoted_followers_count != "") %>%
  select(quoted_name, quoted_location, quoted_followers_count)
## # A tibble: 376 x 3
## quoted_name quoted_location quoted_followers_cou…
## <chr> <chr> <int>
## 1 Gearoid Reidy Tokyo, Japan 9155
## 2 Julia Mio Inuma (井沼ジュリア) Tokyo, Japan 691
## 3 Rochelle Kopp Tokyo and Silicon Valley 10278
## 4 The Times Of India New Delhi 13471119
## 5 MOHS Sierra Leone 1478
## 6 Amanda Jetté Knox Ottawa, Canada 65185
## 7 MDVaxAlerts College Park, MD 18481
## 8 MDVaxAlerts College Park, MD 18481
## 9 Meg Tirrell New York 88455
## 10 Governor Phil Murphy New Jersey, USA 384616
## # … with 366 more rows
(1). API call setup
Server side:
+ First start the CoreNLP server from a terminal.
+ In the CoreNLP installation directory, run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
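Before sending any annotation requests it is worth confirming the server is actually reachable from R; a minimal sketch (it assumes the server was started locally on port 9000 as above):
library(httr)
# Ping the local CoreNLP server; a 200 response means it is up
resp <- tryCatch(GET("http://127.0.0.1:9000/"), error = function(e) NULL)
if (is.null(resp) || status_code(resp) != 200) {
  stop("CoreNLP server does not appear to be running on 127.0.0.1:9000")
}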
# Build the coreNLP API URL: turn the local host address into a URL understood by the coreNLP service
generate_API_url <- function(host, port = "9000",
                             tokenize.whitespace = "false", annotators = ""){  # by default, do not tokenize on whitespace only
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
  url <- URLencode(url)
}
# Location of the service
host = "127.0.0.1"
generate_API_url(host)
# Call the coreNLP API
call_coreNLP <- function(server_host, text, host = "localhost", language = "eng",
                         tokenize.whitespace = "true", ssplit.eolonly = "true",
                         annotators = c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # Assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language == "eng", 9000, 9001)
  # Build the API URL
  url <- generate_API_url(server_host, port = port,
                          tokenize.whitespace = tokenize.whitespace,
                          annotators = paste0(annotators, collapse = ','))
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed", "application/json", encoding = "UTF-8")
  return(doc)
}
# Run each document through the coreNLP service
coreNLP <- function(data, host){
  # Send each document to core-nlp in turn; each document comes back as JSON,
  # and the parsed results are stored as R objects
  result <- apply(data, 1, function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc = object, data = x)
  })
  return(result)
}
(2). Data-wrangling functions
Extract the tokenization results from the returned objects and output them in tidy-data format.
coreNLP_tokens_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence
    sentences <- doc$sentences
    sen <- sentences[[1]]
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      data.frame(word = x$word, lemma = x$lemma, pos = x$pos, ner = x$ner)
    }))
    tokens <- original_data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(1:n(), each = nrow(tokens))) %>%
      bind_cols(tokens)
    tokens
  }))
  return(result)
}
Extract the word-to-word dependency relations from the returned core-nlp objects and output them in tidy-data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      data.frame(dep = x$dep, governor = x$governor, governorGloss = x$governorGloss,
                 dependent = x$dependent, dependentGloss = x$dependentGloss)
    }))
    dependencies <- original_data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(1:n(), each = nrow(dependencies))) %>%
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}
Extract the sentence-level sentiment from the returned core-nlp objects and output it in tidy-data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence
    sentences <- doc$sentences
    sen <- sentences[[1]]
    sentiment <- original_data %>%
      t() %>%
      data.frame() %>%
      bind_cols(data.frame(sentiment = sen$sentiment, sentimentValue = sen$sentimentValue))
    sentiment
  }))
  return(result)
}
# Display the parse/dependency result as a tree
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  # this step modifies the coreNLP parse tree to mimic an openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)
  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # ignore spaces and brackets
  words <- regmatches(ptext, ms)[[1]]                                   # just the words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # append an id to each word
  ## Construct an edgelist and pass it to igraph
  ## allocate here since we know the size (number of nodes - 1), and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow = length(words) - 2, ncol = 2)
  ## Function that fills in the edgelist in place
  edgemaker <- (function() {
    i <- 0                                    # row counter
    g <- function(node) {                     # the recursive function
      if (inherits(node, "Tree")) {           # only recurse into subtrees
        if ((val <- node$value) != 'TOP1') {  # skip the 'TOP' node (we appended '1' above)
          for (child in node$children) {
            childval <- if (inherits(child, "Tree")) child$value else child
            i <<- i + 1
            edgelist[i, 1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return(tree)
}
Retrieve the objects returned by coreNLP.
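Since the full annotation run below took about 28 minutes, it can help to smoke-test the pipeline on a few tweets first; a minimal sketch (obj_test is a hypothetical name, and it assumes the local server from (1) is running):
# Hypothetical smoke test: annotate only the first few tweets before the full run
obj_test = df[1:3, c("status_id", "text")] %>%
  filter(text != "") %>%
  coreNLP(host)
coreNLP_tokens_parser(obj_test) %>% head()   # token/lemma/POS/NER rows for the sample
coreNLP_sentiment_parser(obj_test)           # one sentiment row per sampled tweet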
# gc()   # free unused memory
#
# t0 = Sys.time()
# obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host)   # run against the local server
# # the object passed to coreNLP must be a data.frame with a text column
#
# Sys.time() - t0   # elapsed time
# # Time difference of 28 mins
#
# save.image("coreNLP.RData")   # save the results so the RData file can be loaded directly later
# tokens = coreNLP_tokens_parser(obj)
# dependencies = coreNLP_dependency_parser(obj)
# sentiment = coreNLP_sentiment_parser(obj)
# save.image("coreNLP.RData")
(1). Tokenization, lemmatization, POS tagging, and NER
# tokens = coreNLP_tokens_parser(obj)
head(tokens, 20)
## status_id word lemma pos ner
## 1 1380922284517511175 NEWreport newreport NN O
## 2 1380922284517511175 finds find VBZ O
## 3 1380922284517511175 that that IN O
## 4 1380922284517511175 given give VBN O
## 5 1380922284517511175 the the DT O
## 6 1380922284517511175 presenthesitancy presenthesitancy NN O
## 7 1380922284517511175 levels level NNS O
## 8 1380922284517511175 strategies strategy NNS O
## 9 1380922284517511175 are be VBP O
## 10 1380922284517511175 needed need VBN O
## 11 1380922284517511175 to to TO O
## 12 1380922284517511175 generate generate VB O
## 13 1380922284517511175 Confidence confidence NN O
## 14 1380922284517511175 Acceptance Acceptance NNP O
## 15 1380922284517511175 Demand demand NN O
## 16 1380922284517511175 for for IN O
## 17 1380922284517511175 thevaccine thevaccine NN O
## 18 1380922284517511175 Learn learn VB O
## 19 1380922284517511175 more more JJR O
## 20 1380922251428659204 What what WP O
(2). Named entity recognition (NER)
unique(tokens$ner)
## [1] "O" "LOCATION" "NUMBER"
## [4] "PERSON" "DATE" "DURATION"
## [7] "MISC" "ORGANIZATION" "COUNTRY"
## [10] "CITY" "NATIONALITY" "ORDINAL"
## [13] "RELIGION" "TITLE" "STATE_OR_PROVINCE"
## [16] "CAUSE_OF_DEATH" "CRIMINAL_CHARGE" "IDEOLOGY"
## [19] "SET" "TIME" "URL"
## [22] "PERCENT" "MONEY"
# Excluding the "O" (other) tag, how many distinct words were assigned an entity type?
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 2982
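To see how these tagged tokens spread across the entity categories, a short sketch (the counts will of course depend on the crawled sample):
# Number of tagged tokens in each NER category, excluding "O"
tokens %>%
  filter(ner != "O") %>%
  count(ner, sort = TRUE)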
(3). Converting to lowercase
Capitalization affects CoreNLP's NER decisions, so the tweet text we passed in was not lowercased. After the annotators have run, however, we create two new columns, lower_word and lower_lemma, holding lowercased versions of word and lemma, so that word frequencies are computed correctly: different capitalizations of the same word (e.g. Vaccine and vaccine) are mapped to a single lowercase form before counting.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()tokens %>%
filter(ner == "ORGANIZATION") %>% #篩選NER為ORGANIZATION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSON
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()#dependencies = coreNLP_dependency_parser(obj)head(dependencies,20)## status_id dep governor governorGloss dependent
## 1 1380922284517511175 ROOT 0 ROOT 2
## 2 1380922284517511175 nsubj 2 finds 1
## 3 1380922284517511175 mark 10 needed 3
## 4 1380922284517511175 case 7 levels 4
## 5 1380922284517511175 det 7 levels 5
## 6 1380922284517511175 compound 7 levels 6
## 7 1380922284517511175 obl 10 needed 7
## 8 1380922284517511175 nsubj:pass 10 needed 8
## 9 1380922284517511175 aux:pass 10 needed 9
## 10 1380922284517511175 ccomp 2 finds 10
## 11 1380922284517511175 mark 12 generate 11
## 12 1380922284517511175 xcomp 10 needed 12
## 13 1380922284517511175 compound 15 Demand 13
## 14 1380922284517511175 compound 15 Demand 14
## 15 1380922284517511175 nsubj 18 Learn 15
## 16 1380922284517511175 case 17 thevaccine 16
## 17 1380922284517511175 nmod 15 Demand 17
## 18 1380922284517511175 ccomp 12 generate 18
## 19 1380922284517511175 obj 18 Learn 19
## 20 1380922251428659204 ROOT 0 ROOT 1
## dependentGloss
## 1 finds
## 2 NEWreport
## 3 that
## 4 given
## 5 the
## 6 presenthesitancy
## 7 levels
## 8 strategies
## 9 are
## 10 needed
## 11 to
## 12 generate
## 13 Confidence
## 14 Acceptance
## 15 Demand
## 16 for
## 17 thevaccine
## 18 Learn
## 19 more
## 20 What
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)
Sentiment scores range from 0 (lowest) to 4 (highest):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
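For plotting it can be convenient to recode the numeric score into an ordered label; a small sketch (sentiment_label is a hypothetical helper column that follows the 0-4 scale above):
# Map sentimentValue (0-4) onto ordered labels
sentiment_levels <- c("very negative", "negative", "neutral", "positive", "very positive")
sentiment$sentiment_label <- factor(sentiment_levels[as.numeric(sentiment$sentimentValue) + 1],
                                    levels = sentiment_levels, ordered = TRUE)
table(sentiment$sentiment_label)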
# sentiment = coreNLP_sentiment_parser(obj)
head(sentiment, 20)
## status_id
## 1 1380922284517511175
## 2 1380922251428659204
## 3 1380922195552178176
## 4 1380921774162944003
## 5 1380921111311958019
## 6 1379803826132283394
## 7 1380921088062976007
## 8 1380220172091453440
## 9 1380920174581141504
## 10 1379691746163036163
## 11 1380919458470969350
## 12 1380570929818898432
## 13 1379683088045760520
## 14 1380919205499871233
## 15 1380434176168501254
## 16 1380569194597523456
## 17 1379681865129357312
## 18 1380919092677287940
## 19 1380565549848043523
## 20 1380918924934406151
## text
## 1 NEWreport finds that given the presenthesitancy levels strategies are needed to generate Confidence Acceptance Demand for thevaccine Learn more
## 2 What about anyone who got the vaccine wear a pin to show you are vaccinated? This will encourage others we will know who is in the club so we know who we can let our masks down around?
## 3 . Quotation of the Day Relying on one dose of Moderna or Pfizer to stop variants like B.1.351 is like using a BB gun to stop a charging rhino.JOHN P. MOORE a virologist at
## 4 in Telangana left with only 5.6 lakh doses ofwhich would last for 3 days. Govt of Telangana writes to Centre to provide at least 30 lakh doses of Vaccine to Telangana for next 15 days.
## 5 Every program at AccessCNY serves highrisk individuals. And every person who gets vaccinated helps to protect our participants and the community at large!
## 6 Thank you to Faye and every other AccessCNY employee who has spent the last year caring for and advocating for the people we serve. The impact of your dedicated actions is immeasurable.
## 7 ages 50 or older it is time for you to take your shot! Contact your countydepartment or
## 8 ages 50 or older it is time for you to take your shot! Contact your countydepartment or
## 9 Its vaccine day for our lil family. Will I post a picture or share an update or two later? Im not sure.Here is to two more humans beginning thejourney
## 10 AZN Booted from Baltimore Fair Play by the US? Coincidence politics or a little of both AstraZeneca's road to FDA approval has been a rocky one even if the US has purchased 500M doses of their EBS NVAX ALT ALNY DVAX
## 11 All Eyes Are on Upcoming Novavax's Phase 3 Study Readout .'sis arriving later to theimmunization frenzy thanwanted but still capable of being a major player in the global supply. NVAX AZN CVAC ALT
## 12 JNJVaccine Under EU Review with Same Safety Issue as AstraZeneca Vaccine Europes drug regulator said it is reviewing reports of blood clots in 4 people who received J has expanded itsprobe. NVAX OCGN MRNA
## 13 Thevaccine is still for no good reason at all not approved in . This is moving from indifference bureaucratic rigidity and perfect as enemy of the good to a morally bankrupt and abject failure of leadership.
## 14 had 59 new cases per 100K in the last 7 days and climbing. Los Angeles County had 35. Osakas positive test rate 8 LA 1.4. Osaka has vaccinated LA 30 with 1 dose.Osaka is heading into crisis by any standardwith little response.
## 15 Anticipatedsupply for 65 inseems to increase rapidly starting in May but Ive seen no discussion of plannedanticipatedratescapacity. Completing 2 doses to 30 of 36 mill 65 between May 1 and say July 15 requires 400450K vaxday.
## 16 Would that be the same Ministry of Health that appears to be moving the approval of thevaccine at a glacial pace?
## 17 Vox clamantis in deserto. A voice crying in the Japanese wilderness. Sadly once bureaucratic inertia required but slow consensus and inflexibility set a reactive and ineffective course adjustment is unlikely.
## 18 45 ...about getting theafter being sick withbecause they are under the impression that they will be much sicker after the . While this shows they can get more of a systemic reaction it is safe and they won't be sick enough to be hospitalized.
## 19 While we are making progress we still more work to do. This is our shot! Get your COVID19 vaccine when you are able help end this pandemic...
## 20 It Begins Only COVID Vaccinated People Can Be Evacuated from Volcano Stricken Caribbean Island of St. Vincent
## sentiment sentimentValue
## 1 Neutral 2
## 2 Positive 3
## 3 Neutral 2
## 4 Negative 1
## 5 Neutral 2
## 6 Neutral 2
## 7 Negative 1
## 8 Negative 1
## 9 Negative 1
## 10 Negative 1
## 11 Neutral 2
## 12 Negative 1
## 13 Negative 1
## 14 Positive 3
## 15 Negative 1
## 16 Neutral 2
## 17 Negative 1
## 18 Negative 1
## 19 Positive 3
## 20 Neutral 2
unique(sentiment$sentiment)
## [1] "Neutral" "Positive" "Negative" "Verynegative" "Verypositive"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric()
# Look at the distribution of sentiment across tweets
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 1288 2858 773 2 3
df$date = as.Date(df$created_at)
sentiment %>%
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>%
  summarise(avg_sentiment = mean(sentimentValue, na.rm = TRUE)) %>%
  ggplot(aes(x = date, y = avg_sentiment)) +
  geom_line()
# Look at the vocabulary used in positive tweets
sentiment %>%
  merge(tokens) %>%
  anti_join(stop_words) %>%
  filter(!lower_word %in% c('i','the')) %>%
  filter(sentiment == "Verypositive" | sentiment == 'Positive') %>%
  group_by(lower_lemma) %>%          # group by lowercased lemma
  summarize(count = n()) %>%
  filter(count > 5 & count < 400) %>%
  wordcloud2()
## Joining, by = "word"
# Look at the vocabulary used in negative tweets
sentiment %>%
  merge(tokens) %>%
  anti_join(stop_words) %>%
  filter(!lower_word %in% c('i','the')) %>%
  filter(sentiment == "Verynegative" | sentiment == 'Negative') %>%
  group_by(lower_lemma) %>%
  summarize(count = n()) %>%
  filter(count > 10 & count < 400) %>%
  wordcloud2()
library(sentimentr)
set.seed(10)
mytext <- get_sentences(tweets$text)               # split the text into sentences
x <- sample(tweets$text, 1000, replace = FALSE)    # randomly sample 1000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x)      # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts   # count how often each appears
sentiment_counts[polarity > 0,]                    # positive words
## words polarity n
## 1: please 1.0 99
## 2: care 1.0 12
## 3: approved 1.0 10
## 4: efficacy 1.0 8
## 5: approval 1.0 7
## ---
## 508: coalition 0.1 1
## 509: relating 0.1 1
## 510: momentum 0.1 1
## 511: prefer 0.1 1
## 512: build 0.1 1
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10)   # most frequent negative words
## Selecting by n
## words polarity n
## 1: pharmaceutical -0.40 80
## 2: pandemic -1.00 69
## 3: shot -0.40 41
## 4: jab -0.60 31
## 5: virus -0.50 30
## 6: government -0.50 22
## 7: risk -0.75 22
## 8: fight -0.50 13
## 9: shortage -0.75 13
## 10: flu -0.75 13
# set.seed(12)
# df %>%
#   filter(status_id %in% sample(unique(status_id), 30)) %>%   # 30 random tweets
#   mutate(review = get_sentences(text)) %$%
#   sentiment_by(review, status_id) %>%
#   highlight()
tweets$date = format(tweets$created_at, '%Y%m%d')
(out = tweets %>% with(
  sentiment_by(          # document-level sentiment
    get_sentences(text),
    list(date)
  )
))
plot(out)
(out = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% with(
  sentiment_by(
    get_sentences(text),
    list(source, date)
  )
))
plot(out)