Install packages
packages = c("dplyr","ggplot2","rtweet","xml2","httr","jsonlite","data.tree","NLP","igraph","sentimentr","tidytext","wordcloud2","DiagrammeR")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(sentimentr)
load("coreNLP_all.RData")
(1). Twitter API setup: fetching tweets with rtweet
app = 'Emotions COVID-19 Vaccine'
consumer_key = 'sldS3M1c37owWAxx88lRg8anU'
consumer_secret = 'lCsUtxqA6DWC9nW7xH2a5KAITLXEX8oj10tcWE7zRVTxgHARfC'
access_token = '1283052584312410112-LocNkHahyAJ50KR0sADTmHryO0k3Kq'
access_secret = 'gLssR17xxOZUDLeiF6sB5LiwSYAAVBE0mLjXQolINF4k3'
twitter_token <- create_token(app,consumer_key, consumer_secret,
access_token, access_secret,set_renv = FALSE)
#Consumer Keys: identify who you are
#Authentication Tokens: the authorization granted to you
(2). Setting keywords and fetching tweets
# Search keyword(s)
key = c("#Prayfornanggala402")
context = " "
q = paste(c(key,context),collapse=" AND ")
# Example query string: "#COVID-19 AND Vaccine"
# Adding a term that must co-occur (Vaccine) avoids matching #COVID-19 tweets that are unrelated to vaccines
# Fetch up to 8,000 tweets; exclude retweets
tweets = search_tweets(q,lang="en",n=8000,include_rts = FALSE,token = twitter_token)
(3). Cleaning the tweet text
## Helper function for cleaning the text
clean = function(txt) {
txt = iconv(txt, "latin1", "ASCII", sub="") # re-encode to ASCII, dropping characters that cannot be converted
txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (letters, digits, underscore after @ or #)
txt = gsub("(http|https)://.*", "", txt) # remove URLs (. = any character, * = zero or more times)
txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces or tabs
txt = gsub("\\n"," ",txt) # replace newlines with spaces
txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters into a single space (+ = one or more)
txt = gsub("^\\s+|\\s+$","",txt) # trim leading and trailing whitespace
txt = gsub("&.*;","",txt) # remove HTML character entities
txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces, ?, !, ., and '; drop everything else (including emoji)
txt }
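A quick sanity check on a made-up example string (not taken from the data) shows what the function strips out:
# illustrative only: a fabricated tweet-like string
clean("so sad... \n Check it @user #Prayfornanggala402 https://t.co/abc")
## [1] "so sad... Check it"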
tweets$text = clean(tweets$text) # apply the cleaning function to the tweet text
df = data.frame()
df = rbind(df,tweets) # convert to data frame
df = df[!duplicated(df[,"status_id"]),] # remove duplicate tweets
head(df)
## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 13108524~ 1387299755~ 2021-04-28 06:56:51 GreyGeordie out of curious~ Twitte~
## 2 13108524~ 1387077236~ 2021-04-27 16:12:38 GreyGeordie idk the meanin~ Twitte~
## 3 70889196~ 1387284663~ 2021-04-28 05:56:52 AmitTiw8519~ To my sisters ~ Twitte~
## 4 12373699~ 1387264694~ 2021-04-28 04:37:31 Jih_Yooo ON ETERNAL PAT~ Twitte~
## 5 392759577 1387260528~ 2021-04-28 04:20:58 yenjan TKS Tae tae Twitte~
## 6 12988991~ 1387255561~ 2021-04-28 04:01:14 BTS_twt_KTHV Heres a song f~ Twitte~
## # ... with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
df has 90 columns in total, but we will only use a few of them here.
created_at is already a date/time column, so we can use min and max directly to find the earliest and latest dates.
Note: rtweet can only retrieve tweets from roughly the past 10 days.
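For reference, a minimal sketch (column names taken from the rtweet output above) that keeps only the fields used in the rest of this analysis:
# keep only the columns this analysis actually uses
df %>% select(status_id, created_at, source, text) %>% head(3)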
nrow(df)
## [1] 7992
min(df$created_at)
## [1] "2021-04-23 23:00:47 UTC"
max(df$created_at)
## [1] "2021-04-28 06:56:51 UTC"
(1). Setting up the API call
Server side:
+ Start the coreNLP server from a terminal first.
+ In the coreNLP directory, open a terminal and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
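Before sending any text, it can help to confirm the server is actually listening; a minimal sketch, assuming the default local server on port 9000 started as above:
# quick reachability check for the local coreNLP server (assumes 127.0.0.1:9000)
server_up = tryCatch(httr::status_code(httr::GET("http://127.0.0.1:9000/")) == 200,
                     error = function(e) FALSE)
server_up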
# Build the coreNLP API URL: turn the local host address into a URL the coreNLP service understands
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ # tokenization is not whitespace-based
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
}
# location of the coreNLP service
host = "127.0.0.1"
generate_API_url(host)
# Call the coreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# Assume two core-nlp servers: one handling English (port 9000), the other Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# build the API URL
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
# Run a set of documents through the coreNLP service
coreNLP <- function(data,host){
# Send each document to core-nlp in turn; each document's result comes back as JSON
# Store the parsed results as R objects
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}
(2). Functions for tidying the results
Extract the tokenization results from the returned objects, output in tidy-data format
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
Extract the word dependency relations from the returned core-nlp objects, output in tidy-data format
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
Extract the sentence-level sentiment from the returned core-nlp objects, output in tidy-data format
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# Display the dependency/parse result as a tree diagram
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph) && require(data.tree))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}
Getting the objects returned by coreNLP
Don't run this block yet: it takes about half an hour (and may crash if you only have 4 GB of RAM).
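To verify the pipeline before committing to the full run, a quick dry run on a handful of tweets is enough; a sketch (not part of the original run, kept commented out like the block that follows) assuming the local server is up:
#obj_test = df[,c("status_id","text")] %>% filter(text != "") %>% head(5) %>% coreNLP(host) # dry run on 5 tweets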
#gc() # free unused memory
#t0 = Sys.time()
#obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host)
# Run locally. The object passed to coreNLP must be a data.frame with a text column.
#Sys.time() - t0 # elapsed time
#Time difference of 30 mins
#save.image("coreNLP.RData") # save everything we will need, so it can be reloaded directly from the RData file later
#tokens = coreNLP_tokens_parser(obj)
#dependencies = coreNLP_dependency_parser(obj)
#sentiment = coreNLP_sentiment_parser(obj)
#save.image("coreNLP_all.RData")(1). 斷詞、詞彙還原、詞性標註、NER
tokens = coreNLP_tokens_parser(obj)
head(tokens,20)
## status_id word lemma pos ner
## 1 1387299755584233472 out out IN O
## 2 1387299755584233472 of of IN O
## 3 1387299755584233472 curiousity curiousity NN O
## 4 1387299755584233472 here's here' NNS O
## 5 1387299755584233472 my my PRP$ O
## 6 1387299755584233472 take take NN O
## 7 1387299755584233472 on on IN O
## 8 1387299755584233472 the the DT O
## 9 1387299755584233472 poignant poignant JJ O
## 10 1387299755584233472 farewell farewell NN O
## 11 1387299755584233472 sung sing VBN O
## 12 1387299755584233472 by by IN O
## 13 1387299755584233472 the the DT O
## 14 1387299755584233472 Nanggala402 nanggala402 NN O
## 15 1387299755584233472 sailors sailor NNS O
## 16 1387299755584233472 before before IN O
## 17 1387299755584233472 its its PRP$ O
## 18 1387299755584233472 eternal eternal JJ O
## 19 1387299755584233472 departure. departure. NN O
## 20 1387299755584233472 RIP. rip. NN O
(2). Named Entity Recognition (NER)
unique(tokens$ner)
## [1] "O" "LOCATION" "DATE"
## [4] "CAUSE_OF_DEATH" "PERSON" "ORGANIZATION"
## [7] "NUMBER" "SET" "TIME"
## [10] "COUNTRY" "DURATION" "MISC"
## [13] "NATIONALITY" "TITLE" "CITY"
## [16] "STATE_OR_PROVINCE" "RELIGION"
# Excluding entities tagged O (Other), how many distinct words were assigned an entity type?
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 436
(3). Converting to lowercase
Letter case affects coreNLP's NER decisions, so the tweets were fed in with their original casing. After running the annotators, however, we create two new columns, lower_word and lower_lemma, holding lowercased versions of word and lemma so that term frequencies are counted correctly: lowercasing maps different capitalizations of the same word (e.g. Evergiven and evergiven) onto one form before counting.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
Using coreNLP's NER tags, we can see which countries (COUNTRY) come up when people discuss Prayfornanggala402 on Twitter, giving a first look at the main countries involved in this topic.
tokens %>%
filter(ner == "COUNTRY") %>% #篩選NER為COUNTRY
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()
Similarly, the ORGANIZATION tags show which organizations come up in the Prayfornanggala402 discussion, giving a first look at the main companies and organizations involved.
tokens %>%
filter(ner == "ORGANIZATION") %>% #篩選NER為ORGANIZATION
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()
Finally, the PERSON tags show which people come up in the Prayfornanggala402 discussion, giving a first look at the key figures involved.
tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSON
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()
dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
## status_id dep governor governorGloss dependent
## 1 1387299755584233472 ROOT 0 ROOT 10
## 2 1387299755584233472 dep 6 take 1
## 3 1387299755584233472 case 4 here's 2
## 4 1387299755584233472 compound 4 here's 3
## 5 1387299755584233472 obl 1 out 4
## 6 1387299755584233472 nmod:poss 6 take 5
## 7 1387299755584233472 dep 10 farewell 6
## 8 1387299755584233472 case 10 farewell 7
## 9 1387299755584233472 det 10 farewell 8
## 10 1387299755584233472 amod 10 farewell 9
## 11 1387299755584233472 acl 10 farewell 11
## 12 1387299755584233472 case 15 sailors 12
## 13 1387299755584233472 det 15 sailors 13
## 14 1387299755584233472 compound 15 sailors 14
## 15 1387299755584233472 obl 11 sung 15
## 16 1387299755584233472 case 20 RIP. 16
## 17 1387299755584233472 nmod:poss 20 RIP. 17
## 18 1387299755584233472 amod 20 RIP. 18
## 19 1387299755584233472 compound 20 RIP. 19
## 20 1387299755584233472 obl 11 sung 20
## dependentGloss
## 1 farewell
## 2 out
## 3 of
## 4 curiousity
## 5 here's
## 6 my
## 7 take
## 8 on
## 9 the
## 10 poignant
## 11 sung
## 12 by
## 13 the
## 14 Nanggala402
## 15 sailors
## 16 before
## 17 its
## 18 eternal
## 19 departure.
## 20 RIP.
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)
Sentiment scores range from 0 (lowest) to 4 (highest):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
## status_id
## 1 1387299755584233472
## 2 1387077236252024832
## 3 1387284663580823557
## 4 1387264694205902849
## 5 1387260528775491586
## 6 1387255561956126725
## 7 1387153770258128897
## 8 1387076208836308992
## 9 1387068237234376705
## 10 1386980509079654405
## 11 1386958160322121730
## 12 1386946588551966720
## 13 1385856804110835715
## 14 1386931017341186050
## 15 1386890600947982339
## 16 1386856290983415808
## 17 1386838780422750213
## 18 1386823676473925637
## 19 1386766961514881025
## 20 1386712656287916034
## text
## 1 out of curiousity here's my take on the poignant farewell sung by the Nanggala402 sailors before its eternal departure. RIP.
## 2 idk the meaning of the lyrics. but to learn that this vid made before they departed... is just heartbreaking... RIP.
## 3 To my sisters and brothers in Assam who are now dealing with the double blow of an earthquake and the rampaging second wave of COVID I send you my love and prayers.
## 4 ON ETERNAL PATROL !!! ASCENDING TO THE HIGHER PLACE THAN THE SURFACE SAILORS HEAVEN IS THE LAST DOCKING...
## 5 TKS Tae tae
## 6 Heres a song for you Oceans Where Feet May Fail by Hillsong UNITED
## 7 Thks For your Simpathy..
## 8 Becuase its a best love time so i wanna stop time.
## 9 Breaking News
## 10 So... sad
## 11 sending my prayers to all the crews of KRI Nanggala 402 and their families. hopefully all the crews can be found and return home safely
## 12 Align Right Hope you all liked this Leave your thoughts in the comment section
## 13 Make it simple but significant Hope you all liked this Leave your thoughts in the comment section
## 14 Every Morning You Have Two Choices Continue To Sleep With Your Dreams Or Wake Up And Chase Them. BEAUTIFUL MORNING TO ALL MY SUBSCRIBERS
## 15 Poignant video shows crew of sunken Indonesia submarine singing farewell song
## 16 'protect our family and motherland from harm under tides and ocean. rest in peace sailor'
## 17 Even though I'm not ready to be missing you. I'm not ready to live without you. I wish all the best for you.
## 18 Farewell soldiers and keep your duty JALESVEVA JAYAMAHE
## 19 On Eternal Patrol Berpatroli untuk Selamanya. Sending love and prayers for all passengers crews and families. Via sobatNET
## 20 Our deepest condolence to the crews of KRI Nanggala 402. May them Rest in Peace and God give the strength to their families. You'll never be forgotten.
## sentiment sentimentValue
## 1 Neutral 2
## 2 Neutral 2
## 3 Negative 1
## 4 Neutral 2
## 5 Neutral 2
## 6 Neutral 2
## 7 Neutral 2
## 8 Positive 3
## 9 Neutral 2
## 10 Negative 1
## 11 Positive 3
## 12 Negative 1
## 13 Neutral 2
## 14 Negative 1
## 15 Negative 1
## 16 Positive 3
## 17 Negative 1
## 18 Neutral 2
## 19 Neutral 2
## 20 Negative 1
unique(sentiment$sentiment)
## [1] "Neutral" "Negative" "Positive" "Verypositive"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# Distribution of sentiment labels across tweets
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verypositive
## 605 2462 4913 10
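If the fine-grained coreNLP labels are more detail than needed, one option (not part of the original analysis) is to collapse the 0-4 sentimentValue scale into three coarse buckets before summarizing; a minimal sketch:
# collapse sentimentValue into negative / neutral / positive buckets
sentiment %>%
  mutate(bucket = case_when(sentimentValue <= 1 ~ "negative",
                            sentimentValue == 2 ~ "neutral",
                            TRUE ~ "positive")) %>%
  count(bucket)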
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(date,source) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=source)) +
geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
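The message above is informational only; to silence it (dplyr >= 1.0), the grouping can be dropped explicitly in summarise():
# same summary, dropping the grouping explicitly so no message is printed
sentiment %>%
  merge(df[,c("status_id","source","date")]) %>%
  filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
  group_by(date,source) %>%
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T), .groups = "drop")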
# Vocabulary used in positive tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('the')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% # group by lemma
summarize(count = n()) %>%
filter(count >5 & count<400)%>%
wordcloud2()
## Joining, by = "word"
# Vocabulary used in negative tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lower_lemma) %>%
summarize(count = n()) %>%
filter(count >10 &count<400)%>%
wordcloud2()
set.seed(10)
mytext <- get_sentences(tweets$text) # split the tweet text into sentences
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1,000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # occurrence counts for each term
sentiment_counts[polarity > 0,] # positive words
## words polarity n
## 1: please 1.00 27
## 2: care 1.00 3
## 3: almighty 1.00 2
## 4: bless 1.00 2
## 5: quickly 1.00 2
## 6: sincerity 1.00 1
## 7: blessed 1.00 1
## 8: buddy 1.00 1
## 9: safety 0.80 14
## 10: heroes 0.80 8
## 11: miracles 0.80 7
## 12: available 0.80 5
## 13: heaven 0.80 3
## 14: wish 0.80 2
## 15: wishes 0.80 2
## 16: well 0.80 2
## 17: wishing 0.80 1
## 18: protecting 0.80 1
## 19: provide 0.80 1
## 20: saved 0.80 1
## 21: birthday 0.80 1
## 22: peace 0.75 69
## 23: miracle 0.75 32
## 24: fair 0.75 27
## 25: good 0.75 21
## 26: safe 0.75 10
## 27: beautiful 0.75 6
## 28: protect 0.75 5
## 29: sincere 0.75 4
## 30: love 0.75 3
## 31: healthy 0.75 2
## 32: hero 0.75 2
## 33: heavenly 0.75 1
## 34: guarantee 0.75 1
## 35: enjoy 0.75 1
## 36: courage 0.75 1
## 37: hopeful 0.75 1
## 38: ease 0.75 1
## 39: hug 0.75 1
## 40: happy 0.75 1
## 41: found 0.60 669
## 42: hopefully 0.60 662
## 43: hoping 0.60 7
## 44: guide 0.60 3
## 45: safely 0.50 642
## 46: hope 0.50 28
## 47: greatest 0.50 16
## 48: best 0.50 13
## 49: dear 0.50 7
## 50: thank 0.50 7
## 51: save 0.50 6
## 52: loved 0.50 5
## 53: strong 0.50 4
## 54: brave 0.50 4
## 55: strength 0.50 3
## 56: beloved 0.50 3
## 57: great 0.50 2
## 58: nice 0.50 1
## 59: productive 0.50 1
## 60: happily 0.50 1
## 61: alive 0.50 1
## 62: shine 0.50 1
## 63: wealth 0.50 1
## 64: granted 0.50 1
## 65: dawn 0.50 1
## 66: patience 0.50 1
## 67: abundant 0.50 1
## 68: fortitude 0.50 1
## 69: kind 0.50 1
## 70: healing 0.50 1
## 71: grant 0.50 1
## 72: patient 0.50 1
## 73: contact 0.40 4
## 74: fellow 0.40 1
## 75: management 0.40 1
## 76: utmost 0.40 1
## 77: brother 0.40 1
## 78: magic 0.25 3
## 79: holy 0.25 2
## 80: salute 0.25 2
## 81: work 0.25 1
## 82: rescue 0.25 1
## 83: quiet 0.25 1
## 84: invite 0.25 1
## 85: big 0.25 1
## 86: pray 0.10 18
## 87: praying 0.10 2
## 88: prays 0.10 1
## words polarity n
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
## words polarity n
## 1: missing -0.50 11
## 2: departed -0.25 6
## 3: condolence -0.40 4
## 4: impossible -0.50 4
## 5: lost -0.75 4
## 6: sunk -0.50 3
## 7: cold -0.50 3
## 8: hard -0.25 2
## 9: guard -0.25 2
## 10: sad -0.50 2
## 11: heartbreaking -0.75 2
## 12: loss -0.75 2
## 13: hurt -0.75 2
## 14: hurts -0.75 2
## 15: shallow -1.00 2
set.seed(12)
df%>%
filter(status_id %in% sample(unique(status_id), 30)) %>% # randomly pick 30 tweets
mutate(review = get_sentences(text)) %$%
sentiment_by(review, status_id) %>%
highlight()## Saved in C:\Users\ASUS-NB\AppData\Local\Temp\RtmpYFxPQO/polarity.html
## Opening C:\Users\ASUS-NB\AppData\Local\Temp\RtmpYFxPQO/polarity.html ...
tweets$date = format(tweets$created_at,'%Y%m%d')
(out = tweets %>% with(
sentiment_by( #document level
get_sentences(text),
list( date)
)
))
plot(out)
(out = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% with(
sentiment_by(
get_sentences(text),
list(source, date)
)
))
plot(out)