library(wordcloud2)
library(tidyr)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)
library(sentimentr)
library(forcats)
library(janeaustenr)
library(ggraph)
library(igraph)
library(widyr)
library(syuzhet)
load("new.RData")
# Fetch tweets via rtweet
app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app, consumer_key, consumer_secret,
                              access_token, access_secret, set_renv = FALSE)
# Search keywords
# key = c("#Dogecoin")
# context = "coin"
# q = paste(c(key,context),collapse=" AND ")
# tweets = search_tweets(q,lang="en",n=5000,include_rts = FALSE,token = twitter_token)
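For reference, the commented-out key/context pair above is simply collapsed into one boolean query string; a quick sketch of what search_tweets would receive, using the values shown above:
# paste(c("#Dogecoin", "coin"), collapse = " AND ")
# ## [1] "#Dogecoin AND coin"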
## Data cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="")   # normalize the encoding
  txt = gsub("(@|#)\\w+", "", txt)              # drop @mentions and #hashtags
  txt = gsub("(http|https)://.*", "", txt)      # drop URLs (.: any character, *: zero or more)
  txt = gsub("[ \t]{2,}", "", txt)              # drop runs of two or more spaces/tabs
  txt = gsub("\\n"," ",txt)                     # drop line breaks
  txt = gsub("\\s+"," ",txt)                    # collapse one or more whitespace characters (+: one or more)
  txt = gsub("^\\s+|\\s+$","",txt)              # trim leading/trailing whitespace
  txt = gsub("&.*;","",txt)                     # drop HTML character entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt)        # keep only letters, digits, spaces and ?!.' (removes emoji)
  txt
}
tweets$text = clean(tweets$text)          # apply the cleaning to the text column
df = data.frame()
df = rbind(df, tweets)                    # convert to a data frame
df = df[!duplicated(df[,"status_id"]),]
df = df[!duplicated(df[,"text"]),]        # drop duplicated tweets
nrow(df)
## [1] 2565
min(df$created_at)
## [1] "2021-04-23 20:32:12 UTC"
max(df$created_at)
## [1] "2021-05-01 16:17:04 UTC"
# Build the coreNLP API url: turn the local address into a url the coreNLP service accepts
generate_API_url <- function(host, port="9000",
                             tokenize.whitespace="false", annotators=""){  # do not tokenize on whitespace alone
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}',
                 host, port, tokenize.whitespace, annotators)
  url <- URLencode(url)
  url
}
# Location of the coreNLP service
host = "127.0.0.1"
generate_API_url(host)
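The call above only builds the URL; it assumes a Stanford CoreNLP server is already listening on 127.0.0.1:9000. A minimal sketch of launching one from R, assuming the CoreNLP distribution has been downloaded and unzipped locally (corenlp_dir and the memory setting are placeholders, not part of the original analysis):
corenlp_dir <- "stanford-corenlp-4.2.0"   # hypothetical path to the unzipped distribution
system(
  sprintf('cd %s && java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000',
          corenlp_dir),
  wait = FALSE                            # run in the background so the R session stays usable
)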
# Call the coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                         tokenize.whitespace="true", ssplit.eolonly="true",
                         annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # Assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001)
  # Build the api url
  url <- generate_API_url(server_host, port=port,
                          tokenize.whitespace=tokenize.whitespace,
                          annotators=paste0(annotators, collapse = ','))
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed", "application/json", encoding = "UTF-8")
  return(doc)
}
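A quick sanity check of the wrapper, assuming the English server defined above is running (the sample sentence is arbitrary and only for illustration):
# test_doc <- call_coreNLP(host, "Dogecoin is going to the moon!")
# names(test_doc)                        # should include "sentences"
# test_doc$sentences[[1]]$sentiment      # sentence-level sentiment label returned by the server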
# Run the documents through the coreNLP service
coreNLP <- function(data, host){
  # Send each document to core-nlp in turn; each document's result comes back as json
  # and is kept as an R object
  result <- apply(data, 1, function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  return(result)
}
+ Parse the tokenization results out of the returned objects, output in tidy data format
coreNLP_tokens_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for the sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
    }))
    tokens <- original_data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(1:n(), each = nrow(tokens))) %>%
      bind_cols(tokens)
    tokens
  }))
  return(result)
}
+ Parse the word dependency relations out of the returned core-nlp objects, output in tidy data format
coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for the sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss,
                 dependent=x$dependent, dependentGloss=x$dependentGloss)
    }))
    dependencies <- original_data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(1:n(), each = nrow(dependencies))) %>%
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}
+ Parse the sentence-level sentiment out of the returned core-nlp objects, output in tidy data format
coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for the sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    sentiment <- original_data %>%
      t() %>%
      data.frame() %>%
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
    sentiment
  }))
  return(result)
}
# gc() # free unused memory
#
# t0 = Sys.time()
# obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) # run against the local coreNLP service
# # the object passed to coreNLP must be a data.frame with a text column
#
# Sys.time() - t0 # execution time
# Time difference of 28 mins
# save.image("coreNLP.RData")
# Save everything we will need so it can be loaded straight from the RData file later
# tokens = coreNLP_tokens_parser(obj)
# dependencies = coreNLP_dependency_parser(obj)
# sentiment = coreNLP_sentiment_parser(obj)
# save.image("coreNLP_all.RData")
+ Tokenization, lemmatization, POS tagging, and NER
tokens = coreNLP_tokens_parser(obj)
+ Convert to lowercase
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
tokens %>%
  filter(ner == "COUNTRY") %>%        # keep tokens whose NER tag is COUNTRY
  filter(!lower_word == 'us') %>%
  group_by(lower_word) %>%            # group by word
  summarize(count = n()) %>%          # count each group
  top_n(n = 15, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14)) +
  coord_flip()
Relatively few countries are mentioned in the discussion.
Japan shows up because the meme dog behind Dogecoin (a Shiba Inu) comes from Japan, so it gets mentioned in the discussion.
People mentioned (PERSON)
tokens %>%
  filter(ner == "PERSON") %>%         # keep tokens whose NER tag is PERSON
  filter(!lower_word %in% c('doge','bitcoin','dogecoin','defi','coin','tesla','matic','cardano')) %>%
  group_by(lower_word) %>%            # group by word
  summarize(count = n()) %>%          # count each group
  top_n(n = 13, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14)) +
  coord_flip()
tokens_df <- tokens %>% right_join(df, by = 'status_id')

tokens_df %>%
  group_by(date) %>%
  summarise(count = n()) %>%
  ggplot(aes(date, count)) +
  geom_line(color = "#00AFBB", size = 1) +
  geom_vline(xintercept = as.numeric(as.Date("2021-04-28")), col='red', size = 1) +
  labs(x = "date", y = "count")
plot_afinn_df <- tokens_df %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(date) %>%
  mutate(count = n()) %>%
  mutate(sum = sum(value)) %>%
  mutate(ratio = sum/count) %>%
  ggplot() +
  geom_line(aes(x=date, y=ratio))

## Joining, by = "word"

plot_afinn_df
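As a reminder of what the join brings in: get_sentiments("afinn") returns a two-column tibble (word, value) with integer scores from -5 to +5, so the ratio plotted above is the average AFINN score per matched token on each day. A quick look at the lexicon:
get_sentiments("afinn") %>% head()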
sentiment = coreNLP_sentiment_parser(obj)
# Distribution of sentiment across tweets
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 521 1532 504 2 5
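The same counts expressed as proportions, using base R (the rounding to three digits is arbitrary):
sentiment$sentiment %>% table() %>% prop.table() %>% round(3)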
df$date = as.Date(df$created_at)

sentiment %>%
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>%
  summarise(avg_sentiment = mean(sentimentValue, na.rm=T)) %>%
  ggplot(aes(x=date, y=avg_sentiment)) +
  geom_line()
dogefather <- tokens_df %>%
  #filter(reply_to_status_id == "1387290679794089986") %>%
  filter(reply_to_screen_name == "elonmusk")

plot_dogefather_afinn <- dogefather %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(date) %>%
  summarise(
    sum = sum(value)
  ) %>%
  ggplot(aes(date, sum)) +
  geom_col(show.legend = FALSE) +
  labs(x = "date", y = "score")
## Joining, by = "word"
plot_dogefather_afinn
+ Presumably, after Elon Musk posted his "dogefather" tweet on 4/28, Dogecoin's price surged the next day, and positive sentiment toward Elon Musk rose at the same time.
elon <- get_timeline("@elonmusk", n = 1000)
## Data cleaning: reuse the clean() function defined above
elon$text = clean(elon$text)              # apply the cleaning to the text column
ef = data.frame()
ef = rbind(ef, elon)                      # convert to a data frame
ef = ef[!duplicated(ef[,"status_id"]),]
ef = ef[!duplicated(ef[,"text"]),]        # drop duplicated tweets
### Filter to the Dogecoin discussion period (by date)
keywords = c('2021-04-23','2021-04-24','2021-04-25','2021-04-26','2021-04-27','2021-04-28','2021-04-29','2021-04-30','2021-05-01')
toMatch = paste(keywords, collapse="|")
ef = with(ef, ef[grepl(toMatch, created_at),])
+ Number of records and time range
nrow(ef)
## [1] 857
min(ef$created_at)
## [1] "2020-12-28 19:46:18 UTC"
max(ef$created_at)
## [1] "2021-05-03 17:27:11 UTC"
gc()  # free unused memory

t0 = Sys.time()
obj_elon = ef[,c(2,5)] %>% filter(text != "") %>% coreNLP(host)  # run against the local coreNLP service
# the object passed to coreNLP must be a data.frame with a text column
Sys.time() - t0  # execution time
tokens_elon = coreNLP_tokens_parser(obj_elon)
tokens_elon$lower_word = tolower(tokens_elon$word)
tokens_elon$lower_lemma = tolower(tokens_elon$lemma)

tokens_ef <- tokens_elon %>% right_join(ef, by = 'status_id')
tokens_ef$created_at <- tokens_ef$created_at %>% as.Date("%Y/%m/%d")

## Warning in as.POSIXlt.POSIXct(x, tz = tz): unknown timezone '%Y/%m/%d'
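The warning comes from as.Date() treating the second positional argument of its POSIXct method as a timezone rather than a format string; if the intent is simply to truncate the timestamps to calendar dates, a minimal alternative would be:
# tokens_ef$created_at <- as.Date(tokens_ef$created_at)   # no format string needed for POSIXct input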
plot_afinn_ef <- tokens_ef %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(created_at) %>%
  mutate(count = n()) %>%
  mutate(sum = sum(value)) %>%
  mutate(ratio = sum/count) %>%
  ggplot() +
  geom_line(aes(x=created_at, y=ratio)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-04-23','2021-05-01')))
## Joining, by = "word"
plot_afinn_ef
## Warning: Removed 475 row(s) containing missing values (geom_path).
tokens$word = tolower(tokens$word)
data(stop_words)

tokens <- tokens %>%
  anti_join(stop_words)
## Joining, by = "word"
doge_bigrams <- df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

doge_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 26,578 x 2
## bigram n
## <chr> <int>
## 1 doge coin 451
## 2 the coin 175
## 3 to the 168
## 4 coin is 160
## 5 is the 144
## 6 coin to 140
## 7 this coin 130
## 8 will be 127
## 9 in the 114
## 10 love coin 106
## # ... with 26,568 more rows
bigrams_separated <- doge_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# filter out stopwords
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigram_tf_idf <- bigrams_united %>%
  count(status_id, bigram) %>%
  bind_tf_idf(bigram, status_id, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
## # A tibble: 9,869 x 6
## status_id bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 1385693008134381580 anotherthis time 1 1 7.76 7.76
## 2 1385696043753299971 listed withand 1 1 7.76 7.76
## 3 1385702649501294603 super cheap 1 1 7.76 7.76
## 4 1385707172059811840 thatwill allowto 1 1 7.76 7.76
## 5 1385714634485682177 positive doge 1 1 7.76 7.76
## 6 1385721219660632067 spacexdoge coin 1 1 7.76 7.76
## 7 1385748927530610692 join ator 1 1 7.76 7.76
## 8 1385764995837964290 average cost 1 1 7.76 7.76
## 9 1385796167859011589 footballmessi coin 1 1 7.76 7.76
## 10 1385796607166189568 yeah baby 1 1 7.76 7.76
## # ... with 9,859 more rows
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
set.seed(2020)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
+ The clusters centered on "coin" in the graph are all related to Dogecoin:
  + people's coin: on 2/4 Elon Musk tweeted "Dogecoin is the people's crypto", and part of the discussion has used this phrase ever since
  + meme coin, shit coin, joke coin: all used to mock Dogecoin
  + love coin, discord: wording used in invitations to particular communities
doge_words <- df %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)

word_pairs <- doge_words %>%
  pairwise_count(word, status_id, sort = TRUE)
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_cors <- doge_words %>%
  group_by(word) %>%
  filter(n() >= 5) %>%
  pairwise_cor(word, status_id, sort = TRUE)

word_cors
## # A tibble: 575,322 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 snap nkla 1.
## 2 nkla snap 1.
## 3 series book 1.
## 4 book series 1.
## 5 mint royal 1.
## 6 royal mint 1.
## 7 theta fil 1.
## 8 xlr fil 1.
## 9 mkr fil 1.
## 10 cmp fil 1.
## # ... with 575,312 more rows
word_cors %>%
  filter(item1 %in% c("musk", "cuban")) %>%
  group_by(item1) %>%
  top_n(15) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()
## Selecting by correlation
dojacoin: a meme related to Doja Cat
snl 8th: Elon Musk was scheduled to appear on Saturday Night Live (SNL) on 5/8
Draw a word relationship graph of the pairs with correlation greater than 0.8.
set.seed(2020)

word_cors %>%
  filter(correlation > 0.8) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 2) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
### Remove advertisers
### Normal users do not post the same comments over and over
df_ads <- df %>%
  group_by(user_id) %>%
  summarize(count = n()) %>%
  filter(count > 7)

df_nonead <- df %>%
  filter(!user_id %in% c('1250317271869734912','1260750691083190274','1266632672685821953','1267197215635726336','1356471565110497283','1383414533754998784','1385386163389833219','1386394136094380033','4914843313','1386833874177646594','950847085288419328','1386805408916918272','1316705427493728257','54829446','1386185160920608777','104074409','1328466225492582401','1361907938537971712','1231658238933757952','1231656759233667072','265619996','930808810586832897','120810588','1372656595323207686','1384118260384235525','1384301836094480394','51110502','1388372828135755778','1216632885291057153','1183446394285150209','1279484272663879685','1292020460591357952','277451523'))
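Hard-coding the user_id list works but has to be maintained by hand; an alternative sketch that drops every account flagged by the count > 7 rule in df_ads directly (results may differ slightly from the curated list above):
# df_nonead <- df %>%
#   anti_join(df_ads, by = "user_id")   # keep only users not flagged as likely advertisers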
<- df_nonead %>%
nonead_doge_words unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word)
<- nonead_doge_words %>%
nonead_word_cors group_by(word) %>%
filter(n() >= 5) %>%
pairwise_cor(word, status_id, sort = TRUE)
set.seed(2020)

# Choose a few seed words
seed_words <- c("signals","kindmore","1800","developed","announcement","btc")
# Set the threshold to 0.5
threshold <- 0.5
# Words whose correlation with a seed word exceeds the threshold are added to the removal list
nonead_remove_words <- nonead_word_cors %>%
  filter((item1 %in% seed_words | item2 %in% seed_words), correlation > threshold) %>%
  .$item1 %>%
  unique()

nonead_remove_words
## [1] "signals" "checkout" "bio" "kindmore" "purpose"
## [6] "created" "1800" "trades" "power" "xlm"
## [11] "guys" "link" "ada" "xrp" "professional"
## [16] "developed" "profit" "trade" "recommend" "love"
## [21] "mid" "announcement" "3000" "update" "btc"
## [26] "crypto"
nonead_word_cors %>%
  filter(correlation > 0.5) %>%
  filter(!(item1 %in% nonead_remove_words | item2 %in% nonead_remove_words)) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 2) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
plot_nonead_afinn <- nonead_doge_words %>%
  filter(!word %in% nonead_remove_words) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(date) %>%
  mutate(count = n()) %>%
  mutate(sum = sum(value)) %>%
  mutate(ratio = sum/count) %>%
  ggplot() +
  geom_line(aes(x=date, y=ratio))

## Joining, by = "word"

plot_nonead_afinn