library(wordcloud2)
library(tidyr)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(sentimentr)
library(forcats)
library(janeaustenr)
library(ggraph)
library(igraph)
library(widyr)
library(syuzhet)

load("new.RData")

# Fetch tweets via rtweet
app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app,consumer_key, consumer_secret,
                              access_token, access_secret, set_renv = FALSE)

# Query keywords
# key = c("#Dogecoin")
# context = "coin"
# q = paste(c(key,context),collapse=" AND ")
# tweets = search_tweets(q,lang="en",n=5000,include_rts = FALSE,token = twitter_token)

## Data-cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="")   # convert encoding; drop non-ASCII characters (e.g. emoji)
  txt = gsub("(@|#)\\w+", "", txt)              # remove @mentions and #hashtags (word characters after @ or #)
  txt = gsub("(http|https)://.*", "", txt)      # remove URLs (. = any character, * = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt)              # remove runs of two or more spaces/tabs
  txt = gsub("\\n"," ",txt)                     # replace line breaks with spaces
  txt = gsub("\\s+"," ",txt)                    # collapse one or more whitespace characters (+ = one or more) into a single space
  txt = gsub("^\\s+|\\s+$","",txt)              # trim leading/trailing whitespace
  txt = gsub("&.*;","",txt)                     # remove HTML character entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt)        # keep only letters, digits, spaces and ? ! . ' (drops emoji and other symbols)
  txt
}
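A quick sanity check of clean() on a made-up tweet (the example string and the expected result are illustrative only):

# hypothetical example: mentions, hashtags, and the URL are stripped
clean("@user1 #Dogecoin to the moon!!! Check https://t.co/abc")
# expected result, roughly: "to the moon!!! Check"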
tweets$text = clean(tweets$text) # apply cleaning to the tweet text
df = data.frame()
df = rbind(df,tweets) # convert to data frame
df = df[!duplicated(df[,"status_id"]),]
df = df[!duplicated(df[,"text"]),] #去除重複的tweetsnrow(df)## [1] 2565
min(df$created_at)## [1] "2021-04-23 20:32:12 UTC"
max(df$created_at)## [1] "2021-05-01 16:17:04 UTC"
# Build the coreNLP API URL: convert the local host address into a URL for the coreNLP service
generate_API_url <- function(host, port="9000",
                             tokenize.whitespace="false", annotators=""){ # by default, tokenization is not whitespace-based
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
  url <- URLencode(url)
  url
}
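For reference, the URL built with the defaults looks roughly like this before URLencode percent-escapes the braces and quotes:

# generate_API_url("127.0.0.1") builds, roughly:
# http://127.0.0.1:9000/?properties={"tokenize.whitespace":"false","annotators":""}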
# location of the coreNLP service
host = "127.0.0.1"
generate_API_url(host)

# Call the coreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                         tokenize.whitespace="true", ssplit.eolonly="true",
                         annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001)
  # build the API URL
  url <- generate_API_url(server_host, port=port,
                          tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed", "application/json", encoding = "UTF-8")
  return(doc)
}
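A minimal smoke test of call_coreNLP, assuming a CoreNLP server is already listening on port 9000 of the host defined above (kept commented out, like the other server-dependent calls):

# demo_doc <- call_coreNLP(host, "Dogecoin is going to the moon!")
# demo_doc$sentences[[1]]$sentiment  # sentence-level sentiment label, e.g. "Positive"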
# Run each document through the coreNLP service
coreNLP <- function(data,host){
  # send each document to core-nlp in turn; each document is returned as JSON
  # and stored in R as a list object
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}

Parse the tokenization results out of the returned objects into tidy-data format
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
    # use the first sentence (each cleaned tweet is treated as a single sentence)
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}

Parse the word dependency relations out of the returned core-nlp objects into tidy-data format
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
    # use the first sentence
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}

Parse the sentence-level sentiment out of the returned core-nlp objects into tidy-data format
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
    # use the first sentence
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}

# gc() # free memory that is no longer used
#
# t0 = Sys.time()
# obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) # run against the local server
# # the object passed to coreNLP must be a data.frame with a text column
#
# Sys.time() - t0 # elapsed time
#Time difference of 28 mins
#save.image("coreNLP.RData")#先將會用到的東西存下來,要用可直接載RData
#tokens = coreNLP_tokens_parser(obj)
#dependencies = coreNLP_dependency_parser(obj)
#sentiment = coreNLP_sentiment_parser(obj)
#save.image("coreNLP_all.RData")+斷詞、詞彙還原、詞性標註、NER
tokens = coreNLP_tokens_parser(obj)

Convert to lowercase
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)

tokens %>%
  filter(ner == "COUNTRY") %>%       # keep tokens whose NER tag is COUNTRY
  filter(!lower_word == 'us') %>%
  group_by(lower_word) %>%           # group by word
  summarize(count = n()) %>%         # count each group
top_n(n = 15, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
  coord_flip()

Relatively few countries come up in the discussion.
japan is mentioned because the meme dog behind Dogecoin (a Shiba Inu) comes from Japan.
People mentioned (PERSON)
tokens %>%
filter(ner == "PERSON") %>% #篩選NER為PERSON
filter(!lower_word %in% c('doge','bitcoin','dogecoin','defi','coin','tesla','matic','cardano')) %>%
group_by(lower_word) %>% #根據word分組
summarize(count = n()) %>% #計算每組
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
  coord_flip()

tokens_df <- tokens %>% right_join(df, by = 'status_id')
tokens_df %>%
group_by(date) %>%
summarise(count = n()) %>%
ggplot(aes(date, count)) +
geom_line(color = "#00AFBB", size = 1) +
geom_vline(xintercept = as.numeric(as.Date("2021-04-28")), col='red', size = 1) +
labs(x = "date", y = "count")tokens_df <- tokens %>% right_join(df, by = 'status_id')
plot_afinn_df <- tokens_df %>%
inner_join(get_sentiments("afinn")) %>%
group_by(date) %>%
mutate(count = n()) %>%
mutate(sum = sum(value)) %>%
mutate(ratio = sum/count) %>%
ggplot()+
  geom_line(aes(x=date,y=ratio))
## Joining, by = "word"
plot_afinn_df

sentiment = coreNLP_sentiment_parser(obj)

# Distribution of sentence-level sentiment
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 521 1532 504 2 5
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
  geom_line()

dogefather <- tokens_df %>%
#filter(reply_to_status_id == "1387290679794089986") %>%
  filter(reply_to_screen_name == "elonmusk")

plot_dogefather_afinn <- dogefather %>%
inner_join(get_sentiments("afinn")) %>%
group_by(date) %>%
summarise(
sum = sum(value)
) %>%
ggplot(aes(date, sum)) +
geom_col(show.legend = FALSE)+
labs(x = "date", y = "score")## Joining, by = "word"
plot_dogefather_afinn+推測為 4/28 Elon Musk 發完 dogefather 貼文,隔日狗狗幣幣值暴漲,同時民眾對於 Elon Musk 的正面情緒高漲
elon <- get_timeline("@elonmusk", n= 1000)## 用於資料清理
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="")   # convert encoding; drop non-ASCII characters (e.g. emoji)
  txt = gsub("(@|#)\\w+", "", txt)              # remove @mentions and #hashtags (word characters after @ or #)
  txt = gsub("(http|https)://.*", "", txt)      # remove URLs (. = any character, * = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt)              # remove runs of two or more spaces/tabs
  txt = gsub("\\n"," ",txt)                     # replace line breaks with spaces
  txt = gsub("\\s+"," ",txt)                    # collapse one or more whitespace characters (+ = one or more) into a single space
  txt = gsub("^\\s+|\\s+$","",txt)              # trim leading/trailing whitespace
  txt = gsub("&.*;","",txt)                     # remove HTML character entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt)        # keep only letters, digits, spaces and ? ! . ' (drops emoji and other symbols)
  txt
}
elon$text = clean(elon$text) # apply cleaning to the tweet text
ef = data.frame()
ef = rbind(ef,elon) # convert to data frame
ef = ef[!duplicated(ef[,"status_id"]),]
ef = ef[!duplicated(ef[,"text"]),] # drop duplicate tweets
### Filter to the Dogecoin analysis date range
keywords = c('2021-04-23','2021-04-24','2021-04-25','2021-04-26','2021-04-27','2021-04-28','2021-04-29','2021-04-30','2021-05-01')
toMatch = paste(keywords,collapse="|")
ef = with(ef, ef[grepl(toMatch,created_at),])

Number of tweets and their time span
nrow(ef)
## [1] 857
min(ef$created_at)
## [1] "2020-12-28 19:46:18 UTC"
max(ef$created_at)
## [1] "2021-05-03 17:27:11 UTC"
gc() # free memory that is no longer used
t0 = Sys.time()
obj_elon = ef[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) # run against the local server
# the object passed to coreNLP must be a data.frame with a text column
Sys.time() - t0 # elapsed time

tokens_elon = coreNLP_tokens_parser(obj_elon)

tokens_elon$lower_word = tolower(tokens_elon$word)
tokens_elon$lower_lemma = tolower(tokens_elon$lemma)

tokens_ef <- tokens_elon %>% right_join(ef, by = 'status_id')
tokens_ef$created_at <- as.Date(tokens_ef$created_at)
plot_afinn_ef <- tokens_ef %>%
inner_join(get_sentiments("afinn")) %>%
group_by(created_at) %>%
mutate(count = n()) %>%
mutate(sum = sum(value)) %>%
mutate(ratio = sum/count) %>%
ggplot()+
geom_line(aes(x=created_at,y=ratio))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-04-23','2021-05-01'))
  )
## Joining, by = "word"
plot_afinn_ef
## Warning: Removed 475 row(s) containing missing values (geom_path).

tokens$word = tolower(tokens$word)

data(stop_words)
tokens <- tokens %>%
  anti_join(stop_words)
## Joining, by = "word"
doge_bigrams <- df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

doge_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 26,578 x 2
## bigram n
## <chr> <int>
## 1 doge coin 451
## 2 the coin 175
## 3 to the 168
## 4 coin is 160
## 5 is the 144
## 6 coin to 140
## 7 this coin 130
## 8 will be 127
## 9 in the 114
## 10 love coin 106
## # ... with 26,568 more rows
bigrams_separated <- doge_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
# filter out stopwords
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words$word) %>%
filter(!word2 %in% stop_words$word)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigram_tf_idf <- bigrams_united %>%
count(status_id, bigram) %>%
bind_tf_idf(bigram, status_id, n) %>%
arrange(desc(tf_idf))
bigram_tf_idf
## # A tibble: 9,869 x 6
## status_id bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 1385693008134381580 anotherthis time 1 1 7.76 7.76
## 2 1385696043753299971 listed withand 1 1 7.76 7.76
## 3 1385702649501294603 super cheap 1 1 7.76 7.76
## 4 1385707172059811840 thatwill allowto 1 1 7.76 7.76
## 5 1385714634485682177 positive doge 1 1 7.76 7.76
## 6 1385721219660632067 spacexdoge coin 1 1 7.76 7.76
## 7 1385748927530610692 join ator 1 1 7.76 7.76
## 8 1385764995837964290 average cost 1 1 7.76 7.76
## 9 1385796167859011589 footballmessi coin 1 1 7.76 7.76
## 10 1385796607166189568 yeah baby 1 1 7.76 7.76
## # ... with 9,859 more rows
bigram_graph <- bigram_counts %>%
filter(n > 20) %>%
  graph_from_data_frame()
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
set.seed(2020)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
  geom_node_text(aes(label = name), vjust = 1, hjust = 1)

The clusters centered on coin are all related to dogecoin:
+ people's coin: Elon Musk tweeted "Dogecoin is the people's crypto" on 2/4, and part of the discussion has used this phrase ever since
+ meme coin, shit coin, joke coin: all terms used to mock dogecoin
+ love coin, discord: wording used in invitations to specific communities
doge_words <- df %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word)
word_pairs <- doge_words %>%
  pairwise_count(word, status_id, sort = TRUE)
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_cors <- doge_words %>%
group_by(word) %>%
filter(n() >= 5) %>%
pairwise_cor(word, status_id, sort = TRUE)
word_cors
## # A tibble: 575,322 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 snap nkla 1.
## 2 nkla snap 1.
## 3 series book 1.
## 4 book series 1.
## 5 mint royal 1.
## 6 royal mint 1.
## 7 theta fil 1.
## 8 xlr fil 1.
## 9 mkr fil 1.
## 10 cmp fil 1.
## # ... with 575,312 more rows
word_cors %>%
filter(item1 %in% c("musk", "cuban")) %>%
group_by(item1) %>%
top_n(15) %>%
ungroup() %>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
  coord_flip()
## Selecting by correlation

dojacoin: a meme related to Doja Cat
snl 8th: Elon Musk will appear on Saturday Night Live (SNL) on 5/8
Draw a word relationship graph of the pairs with correlation greater than 0.8
set.seed(2020)
word_cors %>%
filter(correlation > 0.8) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 2) +
geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
### Remove advertisers
### Normal users do not post the same thing repeatedly
df_ads <- df %>%
group_by(user_id) %>%
summarize(count = n()) %>%
filter(count > 7)
df_nonead <- df %>%
filter(!user_id %in% c('1250317271869734912','1260750691083190274','1266632672685821953','1267197215635726336','1356471565110497283','1383414533754998784','1385386163389833219','1386394136094380033','4914843313','1386833874177646594','950847085288419328','1386805408916918272','1316705427493728257','54829446','1386185160920608777','104074409','1328466225492582401','1361907938537971712','1231658238933757952','1231656759233667072','265619996','930808810586832897','120810588','1372656595323207686','1384118260384235525','1384301836094480394','51110502','1388372828135755778','1216632885291057153','1183446394285150209','1279484272663879685','1292020460591357952','277451523'))
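The ID list above was compiled by hand from df_ads; a minimal sketch of the same filter done programmatically (df_nonead_alt is an illustrative name, and it assumes every account flagged in df_ads should be dropped):

# sketch: drop every user flagged as an advertiser in df_ads, instead of hard-coding IDs
df_nonead_alt <- df %>%
  filter(!user_id %in% df_ads$user_id)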
nonead_doge_words <- df_nonead %>%
unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word)
nonead_word_cors <- nonead_doge_words %>%
group_by(word) %>%
filter(n() >= 5) %>%
  pairwise_cor(word, status_id, sort = TRUE)

set.seed(2020)
# pick a few words as seed words
seed_words <- c("signals","kindmore","1800","developed","announcement","btc")
# set the threshold to 0.5
threshold <- 0.5
# words whose correlation with a seed word exceeds the threshold are added to the removal list
nonead_remove_words <- nonead_word_cors %>%
filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
.$item1 %>%
unique()
nonead_remove_words
## [1] "signals" "checkout" "bio" "kindmore" "purpose"
## [6] "created" "1800" "trades" "power" "xlm"
## [11] "guys" "link" "ada" "xrp" "professional"
## [16] "developed" "profit" "trade" "recommend" "love"
## [21] "mid" "announcement" "3000" "update" "btc"
## [26] "crypto"
nonead_word_cors %>%
filter(correlation > 0.5) %>%
filter(!(item1 %in% nonead_remove_words|item2 %in% nonead_remove_words)) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 2) +
geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()

plot_nonead_afinn <- nonead_doge_words %>%
  filter(!word %in% nonead_remove_words) %>%
inner_join(get_sentiments("afinn")) %>%
group_by(date) %>%
mutate(count = n()) %>%
mutate(sum = sum(value)) %>%
mutate(ratio = sum/count) %>%
ggplot()+
  geom_line(aes(x=date,y=ratio))
## Joining, by = "word"
plot_nonead_afinn