Install packages
packages = c("dplyr", "ggplot2", "rtweet", "xml2", "httr", "jsonlite", "data.tree", "NLP", "igraph", "sentimentr", "tidytext", "wordcloud2", "DiagrammeR", "scales", "stringr", "magrittr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)
load("coreNLP_all.RData")
(1). Twitter API setup: fetching tweets with rtweet
app = '2021_sma_team3'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app,consumer_key, consumer_secret,
access_token, access_secret,set_renv = FALSE)
# Consumer Keys: identify who you are
# Authentication Tokens: the access authorization granted to you
(2). Set the search keywords and fetch tweets
# Query keywords
key = c("#COVID19")
context = "#vaccine"
q = paste(c(key,context),collapse=" AND ")
# Query string: "#COVID19 AND #vaccine"
# Requiring both hashtags to appear avoids retrieving #COVID19 tweets that are not about vaccines
# Fetch 5000 tweets, excluding retweets
since <- "2021-03-24"
until <- "2021-04-04"
#tweets = search_tweets(q,lang="en",n=5000,since = since, until = until,include_rts = FALSE,token = twitter_token)
tweets = search_tweets(q,lang="en",n=5000,include_rts = FALSE,token = twitter_token)
(3). Cleaning tweet text
## Used for text cleaning
clean = function(txt) {
txt = iconv(txt, "latin1", "ASCII", sub="") # convert encoding; non-ASCII characters are dropped
txt = gsub("(@|#)\\w+", "", txt) # remove mentions and hashtags (@ or # followed by letters, digits, or underscores)
txt = gsub("(http|https)://.*", "", txt) # remove URLs (.: any character, *: zero or more times)
txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces or tabs
txt = gsub("\\n"," ",txt) # remove line breaks
txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters into a single space (+: one or more)
txt = gsub("^\\s+|\\s+$","",txt) # remove leading and trailing whitespace
txt = gsub("&.*;","",txt) # remove HTML character entities
txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces, ?, !, ., and apostrophes (emoji and other symbols are dropped)
txt }
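As a quick sanity check, the helper can be applied to a made-up example tweet (the text below is hypothetical, used only for illustration):
# Hypothetical example: the mention, hashtag, URL, and HTML entity should all be stripped
clean("Great news! Get vaccinated today &amp; stay safe https://t.co/abc123 #vaccine @WHO")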
tweets$text = clean(tweets$text) # apply the cleaning function to the text column
df = data.frame()
df = rbind(df,tweets) # convert to a data frame
df = df[!duplicated(df[,"status_id"]),] # remove duplicate tweets
head(df)
## # A tibble: 6 x 90
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 82884108~ 1380803685~ 2021-04-10 08:43:47 latestly Punjab CM Amarin~ Tweet~
## 2 82884108~ 1378410446~ 2021-04-03 18:13:54 latestly COVID19 Vaccinat~ Tweet~
## 3 82884108~ 1379759032~ 2021-04-07 11:32:42 latestly Karnataka Admini~ Tweet~
## 4 82884108~ 1379792594~ 2021-04-07 13:46:04 latestly Maharashtra Faci~ Tweet~
## 5 82884108~ 1378310279~ 2021-04-03 11:35:53 latestly Uttar Pradesh Wo~ Tweet~
## 6 82884108~ 1379089604~ 2021-04-05 15:12:38 latestly Delhi GovtRun Ho~ Tweet~
## # ... with 84 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>
df has 90 columns in total, but we will only use a few of them here.
created_at is already a date-time column, so min and max can be applied directly to find the earliest and latest dates.
Note: rtweet can only retrieve tweets from roughly the last 10 days.
nrow(df)
## [1] 4446
min(df$created_at)
## [1] "2021-04-02 06:12:50 UTC"
max(df$created_at)
## [1] "2021-04-10 08:43:47 UTC"
df %>%
filter(location != "") %>%
group_by(location) %>%
summarise(n = n(), .groups = "drop_last") %>%
top_n(n = 15, n) %>%
ggplot(aes(x = reorder(location, n), y = n, fill=n)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_label(aes(label=n), size=4, fill="white") +
labs(title = "Countries Location for Tweets") +
theme(axis.text.x = element_blank(),
axis.title = element_blank())

df %>%
filter(country_code != "") %>%
group_by(country_code) %>%
summarise(n = n(), .groups = "drop_last") %>%
top_n(n = 5, n) %>%
ggplot(aes(x = reorder(country_code, n), y = n, fill=n)) +
geom_bar(stat = "identity") +
coord_flip() +
geom_label(aes(label=n), size=4, fill="white") +
labs(title = "Countries for Tweets") +
theme(axis.text.x = element_blank(),
axis.title = element_blank())
(1). Setting up the API call
Server side:
+ First start the CoreNLP server from a terminal.
+ Open a terminal in the CoreNLP directory and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
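Before sending any documents, it is worth confirming that the server is reachable; a minimal check, assuming the server was started locally on port 9000 as above:
# Ping the local CoreNLP server; a 200 status code means it is up and accepting requests
resp <- httr::GET("http://localhost:9000/")
httr::status_code(resp)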
# Build the coreNLP API URL: convert the local host address into a URL the coreNLP service accepts
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ # tokenization is not based on whitespace
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
}
# Specify where the service is running
host = "127.0.0.1"
generate_API_url(host)
# Call the coreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# Suppose there are two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# Build the API URL
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
# Run each document through the coreNLP service
coreNLP <- function(data,host){
# Send each document to core-nlp in turn; each document's result comes back as JSON
# The processed results are stored as R objects
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}
(2). Data-wrangling functions
Extract the tokenization results from the returned objects; the output is in tidy-data format.
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
Extract the word dependency relations from the returned core-nlp objects; the output is in tidy-data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
Extract sentence-level sentiment from the returned core-nlp objects; the output is in tidy-data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# take the first sentence of the document
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# Display the dependency parse result as a tree
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}
Get the objects returned by coreNLP
Do not run this chunk right away; it takes about half an hour (and may crash if you only have 4 GB of RAM).
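Before committing to the full run, it can help to send just a handful of tweets first to confirm the pipeline works end to end; a minimal smoke test under the same assumptions as the full call below (columns 2 and 5 of df are status_id and text):
# Hypothetical smoke test: process only the first five tweets on the local server
obj_test = df[1:5, c(2,5)] %>% filter(text != "") %>% coreNLP(host)
length(obj_test) # one element per non-empty tweet sent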
gc() # free memory that is no longer in use
t0 = Sys.time()
obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) # run against the local server
# The object passed to coreNLP must be a data.frame with a text column
Sys.time() - t0 # elapsed time
#Time difference of 28 mins
save.image("coreNLP.RData") # save everything we will need; it can be reloaded later directly from the RData file
# tokens = coreNLP_tokens_parser(obj)
# dependencies = coreNLP_dependency_parser(obj)
# sentiment = coreNLP_sentiment_parser(obj)
# save.image("coreNLP_all.RData")
(1). Tokenization, lemmatization, POS tagging, and NER
tokens = coreNLP_tokens_parser(obj)
head(tokens,20)
## status_id word lemma pos ner
## 1 1380803685941346304 Punjab punjab NN LOCATION
## 2 1380803685941346304 CM cm NN O
## 3 1380803685941346304 Amarinder Amarinder NNP O
## 4 1380803685941346304 Singh Singh NNP O
## 5 1380803685941346304 Says say VBZ O
## 6 1380803685941346304 the the DT O
## 7 1380803685941346304 State state NN O
## 8 1380803685941346304 Is be VBZ O
## 9 1380803685941346304 Left leave VBN O
## 10 1380803685941346304 With with IN O
## 11 1380803685941346304 Only only JJ O
## 12 1380803685941346304 Five five CD DURATION
## 13 1380803685941346304 Days day NNS DURATION
## 14 1380803685941346304 of of IN O
## 15 1380803685941346304 COVID19 covid19 NN O
## 16 1380803685941346304 Vaccine vaccine NN O
## 17 1380803685941346304 Urges urge VBZ O
## 18 1380803685941346304 Centre Centre NNP O
## 19 1380803685941346304 To to TO O
## 20 1380803685941346304 Share share VB O
(2). Named entity recognition (NER)
unique(tokens$ner)
## [1] "LOCATION" "O" "DURATION"
## [4] "COUNTRY" "ORGANIZATION" "NUMBER"
## [7] "SET" "MISC" "DATE"
## [10] "NATIONALITY" "PERSON" "ORDINAL"
## [13] "TIME" "CRIMINAL_CHARGE" "TITLE"
## [16] "CITY" "MONEY" "CAUSE_OF_DEATH"
## [19] "STATE_OR_PROVINCE" "PERCENT" "RELIGION"
## [22] "URL" "IDEOLOGY"
# Excluding the "O" (other) entity tag, how many distinct words were tagged with an entity?
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 2850
(3). Converting to lowercase
Because capitalization also affects corenlp's NER decisions, we did not lowercase the tweet text before annotation. After running the annotators, however, we create new columns lower_word and lower_lemma that hold the lowercased word and lemma so that term frequencies can be computed correctly. Lowercasing merges different capitalizations of the same term (e.g. Vaccine and vaccine) before counting frequencies.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
Using coreNLP's NER, we can extract the countries (COUNTRY) mentioned in tweets discussing the COVID-19 vaccine, to get a first look at the main countries involved in this topic.
tokens %>%
filter(ner == "COUNTRY") %>% # keep tokens tagged COUNTRY
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 20, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()
The organizations involved:
tokens %>%
filter(ner == "ORGANIZATION") %>% # keep tokens tagged ORGANIZATION
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 30, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()
Using coreNLP's NER, we can extract the people (PERSON) mentioned in tweets discussing the COVID-19 vaccine, to get a first look at the main figures in this topic.
tokens %>%
filter(ner == "PERSON") %>% # keep tokens tagged PERSON
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()
Using coreNLP's NER, we can also extract the ideologies (IDEOLOGY) mentioned in connection with the COVID-19 vaccine, to get a rough sense of the ideologies tied to this topic; the counts are small, however, so they do not convey any clear pattern.
tokens %>%
filter(ner == "IDEOLOGY") %>% # keep tokens tagged IDEOLOGY
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is IDEOLOGY)") +
theme(text=element_text(size=14))+
coord_flip()

dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
## status_id dep governor governorGloss dependent
## 1 1380803685941346304 ROOT 0 ROOT 17
## 2 1380803685941346304 compound 2 CM 1
## 3 1380803685941346304 obl:tmod 17 Urges 2
## 4 1380803685941346304 compound 4 Singh 3
## 5 1380803685941346304 nsubj 5 Says 4
## 6 1380803685941346304 acl:relcl 2 CM 5
## 7 1380803685941346304 det 7 State 6
## 8 1380803685941346304 nsubj:pass 9 Left 7
## 9 1380803685941346304 aux:pass 9 Left 8
## 10 1380803685941346304 ccomp 5 Says 9
## 11 1380803685941346304 case 12 Five 10
## 12 1380803685941346304 amod 12 Five 11
## 13 1380803685941346304 obl 9 Left 12
## 14 1380803685941346304 nsubj 17 Urges 13
## 15 1380803685941346304 case 16 Vaccine 14
## 16 1380803685941346304 compound 16 Vaccine 15
## 17 1380803685941346304 nmod 13 Days 16
## 18 1380803685941346304 obj 17 Urges 18
## 19 1380803685941346304 mark 20 Share 19
## 20 1380803685941346304 xcomp 17 Urges 20
## dependentGloss
## 1 Urges
## 2 Punjab
## 3 CM
## 4 Amarinder
## 5 Singh
## 6 Says
## 7 the
## 8 State
## 9 Is
## 10 Left
## 11 With
## 12 Only
## 13 Five
## 14 Days
## 15 of
## 16 COVID19
## 17 Vaccine
## 18 Centre
## 19 To
## 20 Share
# parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
parse_tree <- obj[[100]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)
Sentiment scores range from a minimum of 0 to a maximum of 4:
+ 0, 1: very negative, negative
+ 2: neutral
+ 3, 4: positive, very positive
sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
## status_id
## 1 1380803685941346304
## 2 1378410446164688897
## 3 1379759032144044034
## 4 1379792594750689288
## 5 1378310279470051335
## 6 1379089604574281729
## 7 1379760842695188480
## 8 1380046808978628608
## 9 1377937358453018624
## 10 1379795359149350916
## 11 1379756439179198466
## 12 1379113199174213635
## 13 1380802797285871617
## 14 1380802023377960965
## 15 1380069718120349698
## 16 1379448739383902211
## 17 1380800216270565377
## 18 1379802064335818757
## 19 1379455392837730304
## 20 1380799689784713222
## text
## 1 Punjab CM Amarinder Singh Says the State Is Left With Only Five Days of COVID19 Vaccine Urges Centre To Share Schedule for Supply
## 2 COVID19 Vaccination in India Centre Asks States UTs Not To Allow Fresh Registration of Healthcare Workers for Inoculation
## 3 Karnataka Administered Over 50 Lakh People With COVID19 Vaccine Says Health Department
## 4 Maharashtra Facing COVID19 Vaccine Shortage State Govt Demands 40 Lakh Doses Every Week From Centre
## 5 Uttar Pradesh Woman Given Two Doses of COVID19 Vaccine DM Orders Inquiry into Negligence
## 6 Delhi GovtRun Hospitals COVID19 Vaccination Sites To Open for 24 Hours From April 6
## 7 Nepal Starts COVID19 Vaccination Drive With Chinese Vaccine Vero Cell Despite Safety Concerns
## 8 Uddhav Thackeray Takes His Second Dose of COVID19 Vaccine
## 9 Maharashtra Plea Filed in Bombay High Court for DoortoDoor COVID19 Vaccination for People Above 75 Years
## 10 No COVID19 Vaccine Shortage in Maharashtra Central Govt Replenishes More Vaccine Than What Is Needed Says Prakash Javadekar
## 11 Mexico Grants Authorisation for Emergency Use of Indias COVID19 Vaccine Covaxin
## 12 Sri Lanka Begins COVID19 Vaccination of Chinese Expats With Sinopharm Vaccine
## 13 The Imaan Pharmacycentre is available for more bookings this 11th 13th and 14th April. Book via
## 14 all these books saying i survived tis i survived that...screw that ima write one ab how i survived covid 19
## 15 Are you aandorin ? Have you taken the 19 ? Well NOW is your chance! CAHN is hosting aPopUpon Saturday 10th April 9AM5PM West Indian Centre M14 4SW.
## 16 will be hosting a 19on Saturday 10th April 9AM5PM West Indian Centre . We would like to encourage people within theandto visit our to take the Book
## 17 Have you booked to take your 19yet? Well there is still a chance to Book NOW here
## 18 Are you aandorin ? Have you taken the 19 ? Well NOW is your chance! CAHN is hosting aPopUpon Saturday 10th April 9AM5PM West Indian Centre M14 4SW.
## 19 has written an opento encourageto take the signed by some of the most highprofile names in the Click here to watch full video
## 20 I have gotCovid 19 vaccine the process went easy and didnt took time thanks to the MOH for the facilities and I urge people to do so in order to have a community free of COVID and to save more lives.
## sentiment sentimentValue
## 1 Negative 1
## 2 Neutral 2
## 3 Neutral 2
## 4 Neutral 2
## 5 Neutral 2
## 6 Neutral 2
## 7 Neutral 2
## 8 Neutral 2
## 9 Neutral 2
## 10 Negative 1
## 11 Neutral 2
## 12 Neutral 2
## 13 Positive 3
## 14 Neutral 2
## 15 Neutral 2
## 16 Neutral 2
## 17 Neutral 2
## 18 Neutral 2
## 19 Neutral 2
## 20 Neutral 2
unique(sentiment$sentiment)
## [1] "Negative" "Neutral" "Positive" "Verypositive" "Verynegative"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# Look at the distribution of sentiment across tweets
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 926 2786 709 1 2
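Since these five labels map onto the 0-4 scale described earlier, it can be convenient to collapse them into three coarse buckets before plotting; a minimal sketch (the bucket names below are our own, not part of coreNLP's output):
# Collapse the 0-4 sentimentValue scale into negative / neutral / positive buckets
sentiment %>%
  mutate(bucket = case_when(sentimentValue <= 1 ~ "negative",
                            sentimentValue == 2 ~ "neutral",
                            TRUE ~ "positive")) %>%
  count(bucket) %>%
  ggplot(aes(x = bucket, y = n, fill = bucket)) +
  geom_col() +
  labs(title = "Tweets by collapsed sentiment bucket")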
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()

sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%
group_by(date,source) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=source)) +
geom_line()## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
# Look at word usage in positive tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% #根據lemma分組
summarize(count = n()) %>%
filter(count >15 & count<400)%>%
wordcloud2()
## Joining, by = "word"
(wordcloud of terms in positive tweets)
# Look at word usage in negative tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
group_by(lower_lemma) %>%
summarize(count = n()) %>%
filter(count >10 &count<400)%>%
wordcloud2()
(wordcloud of terms in negative tweets)
library(sentimentr)
## Warning: package 'sentimentr' was built under R version 4.0.5
mytext <- c(
'do you like it? But I hate really bad dogs',
'I am the best friend.',
'Do you really like it? I\'m not a fan'
)
mytext <- get_sentences(mytext) # converts the character vector into a list of character vectors, already split into sentences
Sentiment scores generally fall between -1 and 1: below 0 is negative, above 0 is positive, and 0 is neutral.
sentiment_by(mytext) # document level
## element_id word_count sd ave_sentiment
## 1: 1 10 1.497465 -0.8088680
## 2: 2 5 NA 0.5813777
## 3: 3 9 0.284605 0.2196345
sentiment(mytext) # sentence level
## element_id sentence_id word_count sentiment
## 1: 1 1 4 0.2500000
## 2: 1 2 6 -1.8677359
## 3: 2 1 5 0.5813777
## 4: 3 1 5 0.4024922
## 5: 3 2 4 0.0000000
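The sign convention above can also be turned into explicit per-sentence labels; a minimal sketch reusing the same mytext object:
# Label each sentence as positive / negative / neutral based on the sign of its sentimentr score
sentiment(mytext) %>%
  mutate(label = case_when(sentiment > 0 ~ "positive",
                           sentiment < 0 ~ "negative",
                           TRUE ~ "neutral"))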
set.seed(10)
mytext <- get_sentences(tweets$text) # convert text into a list of character vectors (sentences)
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing terms
sentiment_counts <- attributes(sentiment_words)$counts # count how often each term appears
sentiment_counts[polarity > 0,] # positive words
## words polarity n
## 1: please 1.0 10
## 2: top 1.0 7
## 3: care 1.0 5
## 4: understand 1.0 4
## 5: truth 1.0 4
## ---
## 399: overcome 0.1 1
## 400: compliance 0.1 1
## 401: pray 0.1 1
## 402: moral 0.1 1
## 403: church 0.1 1
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
## words polarity n
## 1: shot -0.40 26
## 2: government -0.50 20
## 3: pandemic -1.00 20
## 4: virus -0.50 17
## 5: disease -1.00 17
## 6: limit -0.25 14
## 7: risk -0.75 13
## 8: stop -0.40 12
## 9: issues -1.00 10
## 10: drug -0.10 9
## 11: jab -0.60 9
set.seed(12)
df%>%
filter(status_id %in% sample(unique(status_id), 30)) %>% # randomly pick 30 tweets
mutate(review = get_sentences(text)) %$%
sentiment_by(review, status_id) %>%
highlight()## Saved in C:\Users\user\AppData\Local\Temp\RtmpOm6qnY/polarity.html
## Opening C:\Users\user\AppData\Local\Temp\RtmpOm6qnY/polarity.html ...
tweets$date = format(tweets$created_at,'%Y%m%d')
(out = tweets %>% with(
sentiment_by( #document level
get_sentences(text),
list( date)
)
))
plot(out)
(out = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% with(
sentiment_by(
get_sentences(text),
list(source, date)
)
))
plot(out)
replace_emoji("\U0001f4aa")
## [1] " flexed biceps "