Install packages

packages = c("dplyr", "ggplot2", "rtweet", "xml2", "httr", "jsonlite", "data.tree", "NLP", "igraph", "sentimentr", "tidytext", "wordcloud2", "DiagrammeR")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
load("coreNLP_all.RData")

(1). Twitter API setup: collecting tweets with rtweet
app = 'Emotions COVID-19 Vaccine'
consumer_key = 'sldS3M1c37owWAxx88lRg8anU'
consumer_secret = 'lCsUtxqA6DWC9nW7xH2a5KAITLXEX8oj10tcWE7zRVTxgHARfC'
access_token = '1283052584312410112-LocNkHahyAJ50KR0sADTmHryO0k3Kq'
access_secret = 'gLssR17xxOZUDLeiF6sB5LiwSYAAVBE0mLjXQolINF4k3'
twitter_token <- create_token(app,consumer_key, consumer_secret,
access_token, access_secret,set_renv = FALSE)
# Consumer keys: identify who you are
# Authentication tokens: certify what you have been authorized to do

(2). Setting search keywords to collect tweets
# Search keywords
key = c("#COVID-19")
context = "Vaccine"
q = paste(c(key,context),collapse=" AND ")
# Query string: "#COVID-19 AND Vaccine"
# Using #COVID-19 alone would also match tweets unrelated to vaccines, so we require "Vaccine" to appear as well
# Fetch up to 8,000 tweets, excluding retweets
tweets = search_tweets(q,lang="en",n=8000,include_rts = FALSE,token = twitter_token)

(3). Cleaning the tweet text
## Helper for text cleaning
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # convert encoding, dropping non-ASCII characters
  txt = gsub("(@|#)\\w+", "", txt)            # remove @mentions and #hashtags (letters, digits, underscores after @ or #)
  txt = gsub("(http|https)://.*", "", txt)    # remove URLs ("." = any character, "*" = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt)            # remove runs of two or more spaces or tabs
  txt = gsub("\\n"," ",txt)                   # replace line breaks with spaces
  txt = gsub("\\s+"," ",txt)                  # collapse one or more whitespace characters into a single space ("+" = one or more)
  txt = gsub("^\\s+|\\s+$","",txt)            # trim leading and trailing whitespace
  txt = gsub("&.*;","",txt)                   # remove HTML character entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt)      # keep only letters, digits, spaces and ? ! . ' (drops emoji and other symbols)
  txt }
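As a quick sanity check, here is what clean() does to a made-up example tweet (the string below is hypothetical, not taken from the collected data):

clean("Check this out! https://t.co/abc123 #COVID19 @someone")
# should return roughly "Check this out!"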
tweets$text = clean(tweets$text) # apply the cleaning function to the text column
df = data.frame()
df = rbind(df,tweets) # convert to a data frame
df = df[!duplicated(df[,"status_id"]),] # drop duplicate tweets

head(df)
## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 49196776~ 1381131832~ 2021-04-11 06:27:43 Illuminatek~ Can you still ~ Twitte~
## 2 19050000 1381131829~ 2021-04-11 06:27:43 FinancialRe~ Australians co~ Echobox
## 3 12241562~ 1381131683~ 2021-04-11 06:27:08 KratikaTriv~ What is the po~ Twitte~
## 4 27987687 1381131650~ 2021-04-11 06:27:00 PinkNews New HIV vaccin~ TweetD~
## 5 27987687 1380908932~ 2021-04-10 15:42:00 PinkNews New HIV vaccin~ TweetD~
## 6 34713362 1381131563~ 2021-04-11 06:26:39 business South Korea wi~ Social~
## # ... with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
df has 90 columns in total, but we will only use a handful of them here.
created_at is already a date-time column, so we can use min and max directly to find the earliest and latest timestamps.
Note: rtweet's search can only retrieve tweets from roughly the last 10 days.
nrow(df)
## [1] 7940
min(df$created_at)
## [1] "2021-04-10 15:32:41 UTC"
max(df$created_at)
## [1] "2021-04-11 06:27:43 UTC"
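As a minimal sketch (just a preview; the analysis below keeps working on the full df), the handful of columns we actually rely on can be inspected like this:

df %>% select(status_id, text, created_at, source) %>% head()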
(1). Setting up the API calls

Server side:
+ Start a CoreNLP server in a terminal first
+ Open a terminal in the CoreNLP directory and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
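Before sending any documents, a small optional check (a sketch only, assuming the server was started on port 9000 as above; httr is already loaded) confirms the server is reachable: an HTTP 200 response from the server root means it is ready to accept annotation requests.

server_ok = tryCatch(httr::status_code(httr::GET("http://127.0.0.1:9000/")) == 200,
                     error = function(e) FALSE)
server_ok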
# Build the CoreNLP API URL: convert the local host address into a URL that matches the CoreNLP service
generate_API_url <- function(host, port="9000",
                             tokenize.whitespace="false", annotators=""){ # tokenization is not based on whitespace only
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
  url <- URLencode(url)
}
# Location of the service
host = "127.0.0.1"
generate_API_url(host)

# Call the CoreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# Assume two CoreNLP servers: one handling English (port 9000), the other Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# build the API URL
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}

# Run each document through the CoreNLP service
coreNLP <- function(data,host){
# Send each document to CoreNLP in turn; each document comes back in JSON format
# Store the parsed results as R objects
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}

(2). Data-wrangling functions
Extract the tokenization results from the returned objects and output them in tidy data format.
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}

Extract the word dependency relations from the returned CoreNLP objects and output them in tidy data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}

Extract the sentence-level sentiment from the returned CoreNLP objects and output it in tidy data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}

# Visualize the dependency parse as a tree
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}

Getting the objects returned by CoreNLP
Do not run this part yet: it takes roughly half an hour (and may crash if you only have 4 GB of RAM). A batched alternative is sketched below.
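If you do decide to run it, one way to reduce the memory pressure is to process the tweets in smaller batches and save each batch as it finishes. The sketch below is only an illustration (the batch size and file names are made up, not part of the original script) and is left commented out like the chunk that follows.

#input = df[,c("status_id","text")] %>% filter(text != "")
#batches = split(input, ceiling(seq_len(nrow(input)) / 500))  # batches of 500 tweets
#for (i in seq_along(batches)) {
#  obj_i = coreNLP(batches[[i]], host)
#  saveRDS(obj_i, sprintf("coreNLP_batch_%03d.rds", i))        # save each batch separately
#}
#obj = unlist(lapply(list.files(pattern="^coreNLP_batch_.*\\.rds$"), readRDS), recursive = FALSE)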
#gc() # free unused memory
#t0 = Sys.time()
#obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host)
# Run locally; the object passed to coreNLP must be a data.frame with a text column
#Sys.time() - t0 # elapsed time
#Time difference of 30 mins
#save.image("coreNLP_covid19.RData") # save everything we will need, so we can simply load the RData later
#tokens = coreNLP_tokens_parser(obj)
#dependencies = coreNLP_dependency_parser(obj)
#sentiment = coreNLP_sentiment_parser(obj)
#save.image("coreNLP_all.RData")

(1). Tokenization, lemmatization, POS tagging, and NER
tokens = coreNLP_tokens_parser(obj)
head(tokens,20)
## status_id word lemma pos ner
## 1 1381131832515817472 Can can MD O
## 2 1381131832515817472 you you PRP O
## 3 1381131832515817472 still still RB O
## 4 1381131832515817472 catch catch VB O
## 5 1381131832515817472 Covid19 covid19 NN O
## 6 1381131832515817472 after after IN O
## 7 1381131832515817472 getting get VBG O
## 8 1381131832515817472 the the DT O
## 9 1381131832515817472 vaccine? vaccine? NN O
## 10 1381131832515817472 YES! yes! NN O
## 11 1381131832515817472 It it PRP O
## 12 1381131832515817472 is be VBZ O
## 13 1381131832515817472 possible possible JJ O
## 14 1381131832515817472 to to TO O
## 15 1381131832515817472 still still RB O
## 16 1381131832515817472 catch catch VB O
## 17 1381131832515817472 Covid Covid NNP O
## 18 1381131832515817472 after after IN O
## 19 1381131832515817472 vaccination. vaccination. NNP O
## 20 1381131832515817472 Supposed suppose VBN O
(2). Named entity recognition (NER)
unique(tokens$ner)
## [1] "O" "MISC" "ORDINAL"
## [4] "DATE" "CAUSE_OF_DEATH" "COUNTRY"
## [7] "NUMBER" "DURATION" "ORGANIZATION"
## [10] "TITLE" "CITY" "NATIONALITY"
## [13] "STATE_OR_PROVINCE" "PERSON" "TIME"
## [16] "LOCATION" "SET" "PERCENT"
## [19] "RELIGION" "IDEOLOGY" "CRIMINAL_CHARGE"
## [22] "MONEY" "URL"
# Excluding tokens tagged "O" (other), how many distinct words were assigned an entity tag?
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 4440

(3). Converting to lowercase
Because capitalization also affects CoreNLP's NER decisions, we deliberately did not lowercase the tweet text before annotation. After running the annotators, however, we want to count word frequencies correctly, so we create two new columns, lower_word and lower_lemma, holding the lowercased word and lemma. Lowercasing maps different capitalizations of the same word (e.g. Evergiven and evergiven) to a single form before counting frequencies.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)

Using CoreNLP's NER tags, we can find out which countries (COUNTRY) are mentioned when people discuss the COVID-19 vaccine on Twitter, giving a first look at the main countries involved in this topic.
tokens %>%
  filter(ner == "COUNTRY") %>%      # keep tokens tagged as COUNTRY
  group_by(lower_word) %>%          # group by word
  summarize(count = n()) %>%        # count each group
  top_n(n = 13, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

Similarly, the ORGANIZATION tags show which organizations are mentioned when people discuss the COVID-19 vaccine on Twitter, giving a first look at the main companies and institutions involved in this topic.
tokens %>%
  filter(ner == "ORGANIZATION") %>% # keep tokens tagged as ORGANIZATION
  group_by(lower_word) %>%          # group by word
  summarize(count = n()) %>%        # count each group
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

Likewise, the PERSON tags show which people are mentioned when people discuss the COVID-19 vaccine on Twitter, giving a first look at the main figures in this topic.
tokens %>%
  filter(ner == "PERSON") %>%       # keep tokens tagged as PERSON
  group_by(lower_word) %>%          # group by word
  summarize(count = n()) %>%        # count each group
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
## status_id dep governor governorGloss dependent
## 1 1381131832515817472 ROOT 0 ROOT 4
## 2 1381131832515817472 aux 4 catch 1
## 3 1381131832515817472 nsubj 4 catch 2
## 4 1381131832515817472 advmod 4 catch 3
## 5 1381131832515817472 obj 4 catch 5
## 6 1381131832515817472 mark 7 getting 6
## 7 1381131832515817472 advcl 4 catch 7
## 8 1381131832515817472 det 10 YES! 8
## 9 1381131832515817472 compound 10 YES! 9
## 10 1381131832515817472 obj 7 getting 10
## 11 1381131832515817472 nsubj 13 possible 11
## 12 1381131832515817472 cop 13 possible 12
## 13 1381131832515817472 acl:relcl 10 YES! 13
## 14 1381131832515817472 mark 16 catch 14
## 15 1381131832515817472 advmod 16 catch 15
## 16 1381131832515817472 xcomp 13 possible 16
## 17 1381131832515817472 obj 16 catch 17
## 18 1381131832515817472 case 19 vaccination. 18
## 19 1381131832515817472 obl 16 catch 19
## 20 1381131832515817472 acl 19 vaccination. 20
## dependentGloss
## 1 catch
## 2 Can
## 3 you
## 4 still
## 5 Covid19
## 6 after
## 7 getting
## 8 the
## 9 vaccine?
## 10 YES!
## 11 It
## 12 is
## 13 possible
## 14 to
## 15 still
## 16 catch
## 17 Covid
## 18 after
## 19 vaccination.
## 20 Supposed
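Before looking at a single parse tree, here is a minimal sketch of querying the tidy dependency table, for example counting the most common relation types across all tweets (an illustration only, not part of the original analysis):

dependencies %>% count(dep, sort = TRUE) %>% head(10)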
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

Sentiment scores range from 0 (lowest) to 4 (highest):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
## status_id
## 1 1381131832515817472
## 2 1381131829978271749
## 3 1381131683269775360
## 4 1381131650017411074
## 5 1380908932525330433
## 6 1381131563396653059
## 7 1381131492282224643
## 8 1381131377819713537
## 9 1381131361226997760
## 10 1381130703497093122
## 11 1380941752756887555
## 12 1381130898272231424
## 13 1381131348954415104
## 14 1381130611386015744
## 15 1380964456415555594
## 16 1381131038743801856
## 17 1381096556607180804
## 18 1381131288116072454
## 19 1380922421084045313
## 20 1380921654734434306
## text
## 1 Can you still catch Covid19 after getting the vaccine? YES! It is possible to still catch Covid after vaccination. Supposed to be unlikely but I know someone who has. So as the county reopens maintain distancing stay safe. If we are carefulno more lockdowns.
## 2 Australians could receive at least their first COVID19 vaccine injection by Christmas with the federal government working towards the new goal under the reworked national rollout.
## 3 What is the point in getting vaccinated when they are completely ineffective against this new wave of corona virus?
## 4 New HIV vaccine 'based on Moderna's COVID jab' shows huge promise after first human trials
## 5 New HIV vaccine 'based on Moderna's COVID jab' shows huge promise after first human trials
## 6 South Korea will resume AstraZeneca's Covid19 vaccine inoculations for those between the ages of 30 and 60
## 7 10042021 Covid19 vaccine jabbed
## 8 Data through April 1 not an April Fool's Day Joke! They've already reported 2342 deaths in the United States from the COVID19 vaccine experimental drug plus 941 patients with permanent disability from the shot and 1484 life threatening injuries.
## 9 Vaccination appointments are available at CVS.DRACUT
## 10 Mu mus can anyone tell why should one take a vaccine after having a covid19 infection recovered? Vaccine is a simulation why don't you open a driving school for those who have permanent license.
## 11 Congratulations your vaccine immunity lasts lower or equal to Covid19 infection.
## 12 Mu mus can anyone tell why should one take a vaccine after having a covid19 infection recovered? Vaccine is a simulation why don't you open a driving school for those who have permanent license.
## 13 Mu mus can anyone tell why should one take a vaccine after having a covid19 infection recovered? Vaccine is a simulation why don't you open a driving school for those who have permanent license.
## 14 Mu mus can anyone tell why should one take a vaccine after having a covid19 infection recovered? Vaccine is a simulation why don't you open a driving school for those who have permanent license.
## 15 Vaccine appointments available at Walgreens Saint Louis from Apr 14 to Apr 15. Sign up here zip code 63113
## 16 Vaccine appointments available at Walgreens Jennings from Apr 12 to Apr 15. Sign up here zip code 63136
## 17 Vaccine appointments available at CVS Saint Louis. Sign up here
## 18 Vaccine appointments available at Walgreens Saint Louis from Apr 13 to Apr 15. Sign up here zip code 63121
## 19 Vaccine appointments available at Walgreens Saint Louis from Apr 11 to Apr 15. Sign up here zip code 63136
## 20 Vaccine appointments available at Walgreens Saint Louis on Apr 15. Sign up here zip code 63108
## sentiment sentimentValue
## 1 Neutral 2
## 2 Neutral 2
## 3 Neutral 2
## 4 Positive 3
## 5 Positive 3
## 6 Neutral 2
## 7 Neutral 2
## 8 Positive 3
## 9 Neutral 2
## 10 Neutral 2
## 11 Neutral 2
## 12 Neutral 2
## 13 Neutral 2
## 14 Neutral 2
## 15 Neutral 2
## 16 Neutral 2
## 17 Neutral 2
## 18 Neutral 2
## 19 Neutral 2
## 20 Neutral 2
unique(sentiment$sentiment)
## [1] "Neutral" "Positive" "Negative" "Verynegative" "Verypositive"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric

# Look at the distribution of sentiment across tweets
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 2014 4887 958 5 6
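As a minimal sketch following the 0-4 scale described above (this collapsing step is an illustration, not part of the original analysis), the five labels can be folded into three coarse buckets:

sentiment %>%
  mutate(polarity = case_when(sentimentValue <= 1 ~ "negative",       # 0, 1
                              sentimentValue == 2 ~ "neutral",        # 2
                              TRUE                ~ "positive")) %>%  # 3, 4
  count(polarity)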
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()

sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(date,source) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=source)) +
geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
# Look at the vocabulary used in positive tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the','covid19')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% #根據lemma分組
summarize(count = n()) %>%
filter(count >5 & count<400)%>%
wordcloud2()
## Joining, by = "word"
# Look at the vocabulary used in negative tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lower_lemma) %>%
summarize(count = n()) %>%
filter(count >10 &count<400)%>%
wordcloud2()
library(sentimentr)
## Warning: package 'sentimentr' was built under R version 4.0.5
mytext <- c(
"I heard you on the phone, I had you in mind",
"I've been on the run since you walked through the hallway",
"Heard it all before, I don't even mind",
"I'll do anything, pull me in and I'm far away"
)
mytext <- get_sentences(mytext) # converts the character vector into a list of character vectors, one element per sentence

Sentiment scores range from -1 to 1: scores below 0 are negative, scores above 0 are positive, and 0 is neutral.
sentiment_by(mytext) # document level
## element_id word_count sd ave_sentiment
## 1: 1 11 NA 0
## 2: 2 11 NA 0
## 3: 3 8 NA 0
## 4: 4 10 NA 0
sentiment(mytext) # sentence level
## element_id sentence_id word_count sentiment
## 1: 1 1 11 0
## 2: 2 1 11 0
## 3: 3 1 8 0
## 4: 4 1 10 0
set.seed(10)
mytext <- get_sentences(tweets$text) # convert the text column into a list of sentences
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1,000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count how often each word appears
sentiment_counts[polarity > 0,] # positive words
## words polarity n
## 1: please 1.0 21
## 2: care 1.0 14
## 3: approved 1.0 12
## 4: efficacy 1.0 6
## 5: understand 1.0 5
## ---
## 351: pray 0.1 1
## 352: veteran 0.1 1
## 353: building 0.1 1
## 354: develop 0.1 1
## 355: build 0.1 1
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
## words polarity n
## 1: shot -0.40 32
## 2: declined -0.60 22
## 3: fake -0.75 22
## 4: crack -0.50 15
## 5: virus -0.50 15
## 6: risk -0.75 15
## 7: shortage -0.75 13
## 8: emergency -0.75 13
## 9: warning -0.50 12
## 10: stop -0.40 10
## 11: rocky -1.00 10
set.seed(12)
df %>%
  filter(status_id %in% sample(unique(status_id), 30)) %>% # randomly pick 30 tweets
mutate(review = get_sentences(text)) %$%
sentiment_by(review, status_id) %>%
highlight()## Saved in C:\Users\ASUS-NB\AppData\Local\Temp\RtmpOWVqEp/polarity.html
## Opening C:\Users\ASUS-NB\AppData\Local\Temp\RtmpOWVqEp/polarity.html ...
tweets$date = format(tweets$created_at,'%Y%m%d')
(out = tweets %>% with(
sentiment_by( #document level
get_sentences(text),
list( date)
)
))
plot(out)

(out = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% with(
sentiment_by(
get_sentences(text),
list(source, date)
)
))
plot(out)