packages = c("dplyr", "ggplot2", "rtweet", "xml2", "httr", "jsonlite", "data.tree", "NLP", "igraph", "sentimentr", "tidytext", "wordcloud2", "DiagrammeR", "scales", "stringr", "magrittr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(sentimentr)
load("coreNLP_all.RData")
(1). Twitter API setup: fetching tweets with rtweet
app = 'Emotions COVID-19 Vaccine'
consumer_key = 'sldS3M1c37owWAxx88lRg8anU'
consumer_secret = 'lCsUtxqA6DWC9nW7xH2a5KAITLXEX8oj10tcWE7zRVTxgHARfC'
access_token = '1283052584312410112-LocNkHahyAJ50KR0sADTmHryO0k3Kq'
access_secret = 'gLssR17xxOZUDLeiF6sB5LiwSYAAVBE0mLjXQolINF4k3'
twitter_token <- create_token(app,consumer_key, consumer_secret,
access_token, access_secret,set_renv = FALSE)
# Consumer keys: identify who you are
# Authentication tokens: the authorization granted to you
(2). Setting search keywords to fetch tweets
# Query keyword(s)
key = c("#EarthDay")
context = ""
q = paste(c(key,context),collapse=" AND ")
# Example query string: "#COVID-19 AND Vaccine"
# Searching only #COVID-19 would also return tweets unrelated to vaccines,
# so adding "Vaccine" as a co-occurrence condition narrows the results.
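As a quick illustration of how the query string is assembled (a hypothetical key/context pair, not the one used in this run):
paste(c("#COVID-19", "Vaccine"), collapse = " AND ")  # -> "#COVID-19 AND Vaccine"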
# Fetch up to 8,000 tweets, excluding retweets
tweets = search_tweets(q, lang = "en", n = 8000, include_rts = FALSE, token = twitter_token)
(3). Cleaning the tweet text
## Text-cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="")  # normalize encoding; drop non-ASCII characters
  txt = gsub("(@|#)\\w+", "", txt)             # remove @mentions and #hashtags (\w = letters, digits, underscore)
  txt = gsub("(http|https)://.*", "", txt)     # remove URLs (everything from the URL to the end; . = any character, * = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt)             # remove runs of two or more spaces/tabs (deleted outright, not replaced by a space)
  txt = gsub("\\n", " ", txt)                  # replace newlines with a space
  txt = gsub("\\s+", " ", txt)                 # collapse one or more whitespace characters into a single space
  txt = gsub("^\\s+|\\s+$", "", txt)           # trim leading/trailing whitespace
  txt = gsub("&.*;", "", txt)                  # remove HTML entity codes
  txt = gsub("[^a-zA-Z0-9?!. ']", "", txt)     # keep only letters, digits, spaces, ?, !, . and ' (drops emoji etc.)
  txt
}
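A quick sanity check on a made-up tweet (hypothetical input). Note that the rule removing runs of two or more spaces deletes them outright, so the gap left behind by a removed hashtag or mention can glue the neighbouring words together, the same artifact visible in the sample output below (e.g. "week'ssummit"):
clean("Celebrating #EarthDay with friends, 10,000 trees planted!\nhttps://example.com")
## [1] "Celebratingwith friends 10000 trees planted!"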
tweets$text = clean(tweets$text)  # apply the cleaning function to the tweet text
df = data.frame()
df = rbind(df, tweets)  # convert to a data frame
df = df[!duplicated(df[,"status_id"]),]  # drop duplicate tweets (same status_id)
head(df)
## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 312028258 1387335026~ 2021-04-28 09:17:00 EUROCITIES Live now How c~ Twitte~
## 2 11663231~ 1387334596~ 2021-04-28 09:15:17 DaceHermione At last week's~ Twitte~
## 3 11094207~ 1387334483~ 2021-04-28 09:14:50 GreenhamBun~ We had a busy ~ Twitte~
## 4 466339000 1387333279~ 2021-04-28 09:10:03 Alleyns_Sch~ Lower School p~ Hootsu~
## 5 54006056 1387333119~ 2021-04-28 09:09:25 panintellig~ Last week duri~ Twitte~
## 6 8352642 1387332486~ 2021-04-28 09:06:54 pacobriseno . after ~ Twitte~
## # ... with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
df has 90 columns, but we will only use a few of them here.
created_at is already a date-time column, so we can apply min and max directly to find the earliest and latest dates.
Note: rtweet can only retrieve tweets from roughly the last 10 days.
nrow(df)
## [1] 7741
min(df$created_at)
## [1] "2021-04-23 19:16:00 UTC"
max(df$created_at)
## [1] "2021-04-28 09:17:00 UTC"
(1). Setting up the API call
Server side:
+ Start the CoreNLP server from a terminal first
+ Open a terminal in the CoreNLP directory and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
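Before sending any documents it can help to confirm the server is reachable; a quick sanity check (assumes the server above is listening on 127.0.0.1:9000):
httr::status_code(httr::GET("http://127.0.0.1:9000/"))  # expect 200 when the server is up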
# Build the coreNLP API URL: wrap the local host/port into a URL that carries the coreNLP properties
generate_API_url <- function(host, port="9000",
                             tokenize.whitespace="false", annotators=""){  # tokenize.whitespace=false: do not split tokens on whitespace only
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
  URLencode(url)
}
# Location of the coreNLP service
host = "127.0.0.1"
generate_API_url(host)
# Call the coreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
    tokenize.whitespace="true", ssplit.eolonly="true",
    annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # Assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001)
  # Build the API URL
  url <- generate_API_url(server_host, port=port,
      tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed", "application/json", encoding = "UTF-8")
  return(doc)
}
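A minimal usage sketch for a single string (the sentence is made up; assumes the English server from step (1) is listening on port 9000):
doc <- call_coreNLP("127.0.0.1", "Earth Day reminds us to protect the planet.")
doc$sentences[[1]]$sentiment  # e.g. "Positive"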
# Run the coreNLP service over every document in a data frame
coreNLP <- function(data, host){
  # Feed each document to core-nlp in turn; each result comes back as parsed JSON
  # Keep the parsed object together with the original row
  result <- apply(data, 1, function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  return(result)
}
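A minimal sketch of how this wrapper is meant to be called, using a tiny made-up data frame (hypothetical rows; assumes the local server from step (1) is still running). The real run over all tweets appears further below and takes much longer:
mini <- data.frame(status_id = c("1", "2"),
                   text = c("Happy Earth Day!", "Plastic waste is a growing problem."),
                   stringsAsFactors = FALSE)
mini_obj <- coreNLP(mini, host)
length(mini_obj)  # one returned object per row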
(2). Data-wrangling functions
Extract the tokenization results from the returned objects and output them in tidy-data format.
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
Extract the word-dependency relations from the returned core-nlp objects and output them in tidy-data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
Extract the sentence-level sentiment from the returned core-nlp objects and output it in tidy-data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# Display the parse result graphically as a tree
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}
Getting the objects returned by coreNLP
Do not run this block right away: it takes roughly half an hour (and may crash your session if you only have 4 GB of RAM).
#gc()  # free unused memory
#t0 = Sys.time()
#obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host)
#Run locally; the object passed to coreNLP must be a data.frame with a text column
#Sys.time() - t0  # elapsed time
#Time difference of 30 mins
#save.image("coreNLP.RData")  # save everything we will need, so the RData file can be loaded directly later
#tokens = coreNLP_tokens_parser(obj)
#dependencies = coreNLP_dependency_parser(obj)
#sentiment = coreNLP_sentiment_parser(obj)
#save.image("coreNLP_all.RData")(1). 斷詞、詞彙還原、詞性標註、NER
tokens = coreNLP_tokens_parser(obj)
head(tokens, 20)
## status_id word lemma pos ner
## 1 1387335026942091265 Live live RB O
## 2 1387335026942091265 now now RB DATE
## 3 1387335026942091265 How how WRB O
## 4 1387335026942091265 can can MD O
## 5 1387335026942091265 we we PRP O
## 6 1387335026942091265 ensure ensure VB O
## 7 1387335026942091265 recovery recovery NN O
## 8 1387335026942091265 strategies strategy NNS O
## 9 1387335026942091265 drive drive VBP O
## 10 1387335026942091265 the the DT O
## 11 1387335026942091265 green green JJ O
## 12 1387335026942091265 transition transition NN O
## 13 1387335026942091265 at at IN O
## 14 1387335026942091265 a a DT O
## 15 1387335026942091265 local local JJ O
## 16 1387335026942091265 level? level? NN O
## 17 1387335026942091265 Join join VB O
## 18 1387335026942091265 our we PRP$ O
## 19 1387335026942091265 panel panel NN O
## 20 1387335026942091265 on on IN O
(2). Named-entity recognition (NER)
unique(tokens$ner)
## [1] "O" "DATE" "NUMBER"
## [4] "ORGANIZATION" "DURATION" "TIME"
## [7] "MISC" "LOCATION" "COUNTRY"
## [10] "TITLE" "PERSON" "SET"
## [13] "CITY" "NATIONALITY" "MONEY"
## [16] "ORDINAL" "STATE_OR_PROVINCE" "CAUSE_OF_DEATH"
## [19] "IDEOLOGY" "URL" "CRIMINAL_CHARGE"
## [22] "RELIGION" "PERCENT"
# Excluding the "O" (other) tag, how many distinct words were tagged with an entity?
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 4373
(3). Converting to lower case
Because capitalization also affects coreNLP's NER decisions, we deliberately left the tweet text un-lowercased at the start. After running the annotators, however, we create two new columns, lower_word and lower_lemma, holding the lowercase word and lemma. Lowercasing maps different capitalizations of the same word (e.g. Evergiven and evergiven) onto one form so that word frequencies are counted correctly.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
Using coreNLP's NER tags we can extract the countries (COUNTRY) mentioned in the Earth Day tweets, to get a first look at which countries dominate the discussion.
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()我們可以透過coreNLP中的NER解析出在Twitter上面談論世界地球日的事情,所涉及到的組織(ORGANIZATION),以初步了解這個議題的主要公司/單位。
tokens %>%
filter(ner == "ORGANIZATION") %>% #篩選NER為ORGANIZATION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()我們可以透過coreNLP中的NER解析出在Twitter上面世界地球日的事情,所涉及到的人物(PERSON),以初步了解這個議題的主要人物。
tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSON
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()dependencies = coreNLP_dependency_parser(obj)head(dependencies,20)## status_id dep governor governorGloss dependent
## 1 1387335026942091265 ROOT 0 ROOT 6
## 2 1387335026942091265 advmod 6 ensure 1
## 3 1387335026942091265 advmod 6 ensure 2
## 4 1387335026942091265 advmod 6 ensure 3
## 5 1387335026942091265 aux 6 ensure 4
## 6 1387335026942091265 nsubj 6 ensure 5
## 7 1387335026942091265 compound 8 strategies 7
## 8 1387335026942091265 nsubj 9 drive 8
## 9 1387335026942091265 ccomp 6 ensure 9
## 10 1387335026942091265 det 12 transition 10
## 11 1387335026942091265 amod 12 transition 11
## 12 1387335026942091265 nsubj 17 Join 12
## 13 1387335026942091265 case 16 level? 13
## 14 1387335026942091265 det 16 level? 14
## 15 1387335026942091265 amod 16 level? 15
## 16 1387335026942091265 nmod 12 transition 16
## 17 1387335026942091265 ccomp 9 drive 17
## 18 1387335026942091265 nmod:poss 19 panel 18
## 19 1387335026942091265 obj 17 Join 19
## 20 1387335026942091265 obl 17 Join 20
## dependentGloss
## 1 ensure
## 2 Live
## 3 now
## 4 How
## 5 can
## 6 we
## 7 recovery
## 8 strategies
## 9 drive
## 10 the
## 11 green
## 12 transition
## 13 at
## 14 a
## 15 local
## 16 level?
## 17 Join
## 18 our
## 19 panel
## 20 on
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)
Sentiment scores range from 0 (most negative) to 4 (most positive):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
sentiment = coreNLP_sentiment_parser(obj)
head(sentiment, 20)
## status_id
## 1 1387335026942091265
## 2 1387334596195463169
## 3 1387334483532206086
## 4 1387333279238459393
## 5 1387333119477420032
## 6 1387332486611492864
## 7 1387331504792035331
## 8 1386970979381960706
## 9 1386608239983403009
## 10 1387331006219309056
## 11 1387330901869219841
## 12 1386847604084969473
## 13 1387330769517957120
## 14 1387330749158752260
## 15 1387330114870988806
## 16 1387329499629559809
## 17 1387329211203866626
## 18 1385771674084134913
## 19 1385772954470936577
## 20 1387328132936577025
## text
## 1 Live now How can we ensure recovery strategies drive the green transition at a local level? Join our panel on
## 2 At last week'ssummit world leaders announced welcome and ambitious climate targets. The question is can we deliver the speed and scale of innovation needed to meet them?explores this in his new paper
## 3 We had a busy Earth Day last week with our 'Acts of Green' competition and received a total of 74 Acts of Green submissions across our Service Centres! Congratulations to all 3 winning teams and well done all for your hard efforts!
## 4 Lower School pupils have been thinking aboutin their actions this week. This morning they enjoyed registration and games outside in their year group bubbles. Getting out and enjoying nature does us all good. Remember to keep looking up!
## 5 Last week duringwe highlighted thatis on the rise but did you know that 6.7M tonnes of food is wasted a year? By usingyour business can make a change! Book a demo to find out how
## 6 . after will you adopt ambitious climate commitments and get on track to meettargets before ?
## 7 Demonstrate your brands sustainability claims such as ethical sourcing or environmental impact with the new Everledger Platform. Request a demo here
## 8 Plastic packaging has become an economic environmental and social burden. Only 14 of plastic packaging is collected for recycling globally. Watch now how our platform can play a key role in the circular packaging economy
## 9 We're working with our partners to meet our own carbon neutrality goals.is on track to have 100 of their cloud running on renewable energy by 2025. Read on
## 10 Post! Learn how financing and investing activities are facing big impacts as climate initiatives continue to move forward
## 11 My friend is officially an egg
## 12 Happening tomorrow Building SustainableSolutions on Wednesday April 28 at 6.15 p.m. SGT 2.15 p.m. GST How can environmental commitment towards a green supply chain create value? Find out in our webinar
## 13 Join us live today! Do not miss Building Sustainable Supply Chain Solutions at 6.157 p.m. SGT 2.153 p.m. GSTRegister here to access the webinar
## 14 Last week our little environmentalists at the ELV celebrated Earth Day with lots of fun activities. Students got the chance to water and plant new seeds and learnt about the Earth and climate changes.
## 15 We are delighted to announce will be hosting our Young SVP connects meeting again next Tues 4thMay at 7pm. We look forward to welcoming you along to share your thoughts views discuss what is going on for you! If you took part in the Aprilchallenge we would love
## 16 Canada is a world leader in cleantech but women are still significantly underrepresented. This MaRS andare launching the RBC Women in Cleantech Accelerator a 12month program that will connect and support women entrepreneurs. Learn more and apply today.
## 17 UV Sun Protection Cooling Arm Sleeves Original Price 11.99 Price after discount 8.99Free shipping Discount codeMKTC2METP342
## 18 2021 ins style Straw beach bag price9.9Free shipping
## 19 2021 ins style Straw beach bag Environmentally friendly products price9.9Free shipping
## 20 The Earth is our common wealth. Let's preserve it. Environmental protection guides our choices. Every day by your side at your service!
## sentiment sentimentValue
## 1 Neutral 2
## 2 Positive 3
## 3 Neutral 2
## 4 Positive 3
## 5 Negative 1
## 6 Neutral 2
## 7 Neutral 2
## 8 Neutral 2
## 9 Neutral 2
## 10 Neutral 2
## 11 Neutral 2
## 12 Neutral 2
## 13 Negative 1
## 14 Positive 3
## 15 Positive 3
## 16 Positive 3
## 17 Neutral 2
## 18 Neutral 2
## 19 Neutral 2
## 20 Neutral 2
unique(sentiment$sentiment)
## [1] "Neutral" "Positive" "Negative" "Verypositive" "Verynegative"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric()
# Look at the distribution of sentiment across tweets
sentiment$sentiment %>% table()## .
## Negative Neutral Positive Verynegative Verypositive
## 1318 3965 2375 3 55
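To double-check that these labels line up with the 0-4 scale described earlier, we can cross-tabulate the label against the numeric score (a quick check, output omitted):
table(sentiment$sentiment, sentiment$sentimentValue)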
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(date,source) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=source)) +
geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
# Look at the words used in positive tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('the')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% #根據lemma分組
summarize(count = n()) %>%
filter(count >20 & count<400)%>%
wordcloud2()## Joining, by = "word"
# Look at the words used in negative tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lower_lemma) %>%
summarize(count = n()) %>%
filter(count >20 &count<400)%>%
wordcloud2()“wordcloud”
set.seed(10)
mytext <- get_sentences(tweets$text) #將text轉成list of characters型態
x <- sample(tweets$text, 1000, replace = FALSE) #隨機取1000筆,取後不放回
sentiment_words <- extract_sentiment_terms(x) #抓取其中帶有情緒的字
sentiment_counts <- attributes(sentiment_words)$counts #計算出現次數
sentiment_counts[polarity > 0,] #正面的字## words polarity n
## 1: sustainable 1.0 34
## 2: honor 1.0 21
## 3: care 1.0 19
## 4: sustainability 1.0 19
## 5: please 1.0 13
## ---
## 611: gospel 0.1 1
## 612: prime 0.1 1
## 613: relating 0.1 1
## 614: mural 0.1 1
## 615: lion 0.1 1
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) #出現次數最多的負面字## Selecting by n
## words polarity n
## 1: trash -0.50 20
## 2: late -0.25 16
## 3: waste -0.75 13
## 4: pandemic -1.00 9
## 5: miss -1.00 8
## 6: homeless -1.00 8
## 7: challenge -0.25 6
## 8: leave -0.25 6
## 9: wait -0.25 5
## 10: targets -0.40 5
## 11: litter -0.60 5
## 12: poverty -0.75 5
## 13: crisis -0.75 5
set.seed(12)
df%>%
filter(status_id %in% sample(unique(status_id), 30)) %>%  # randomly pick 30 tweets
mutate(review = get_sentences(text)) %$%
sentiment_by(review, status_id) %>%
highlight()## Saved in C:\Users\ASUS-NB\AppData\Local\Temp\RtmpwjxYgZ/polarity.html
## Opening C:\Users\ASUS-NB\AppData\Local\Temp\RtmpwjxYgZ/polarity.html ...
tweets$date = format(tweets$created_at,'%Y%m%d')
(out = tweets %>% with(
sentiment_by( #document level
get_sentences(text),
list( date)
)
))
plot(out)
(out = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% with(
sentiment_by(
get_sentences(text),
list(source, date)
)
))
plot(out)