library(wordcloud2)
library(tidyr)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(magrittr)
library(sentimentr)
library(forcats)
library(janeaustenr)
library(ggraph)
library(igraph)
library(widyr)
library(syuzhet)
load("new.RData")
# Fetch tweets via rtweet
app = '2021_sma'
consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
twitter_token <- create_token(app, consumer_key, consumer_secret,
                              access_token, access_secret, set_renv = FALSE)
# Search keywords
# key = c("#Dogecoin")
# context = "coin"
# q = paste(c(key,context),collapse=" AND ")
# tweets = search_tweets(q,lang="en",n=5000,include_rts = FALSE,token = twitter_token)
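For reference, the commented-out key/context pair above is simply collapsed into one boolean query string; a quick sketch of what search_tweets would receive, using the values shown above:
# paste(c("#Dogecoin", "coin"), collapse = " AND ")
# ## [1] "#Dogecoin AND coin"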
## Data cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="")   # normalize the encoding
  txt = gsub("(@|#)\\w+", "", txt)              # drop @mentions and #hashtags
  txt = gsub("(http|https)://.*", "", txt)      # drop URLs (.: any character, *: zero or more)
  txt = gsub("[ \t]{2,}", "", txt)              # drop runs of two or more spaces/tabs
  txt = gsub("\\n"," ",txt)                     # drop line breaks
  txt = gsub("\\s+"," ",txt)                    # collapse one or more whitespace characters (+: one or more)
  txt = gsub("^\\s+|\\s+$","",txt)              # trim leading/trailing whitespace
  txt = gsub("&.*;","",txt)                     # drop HTML character entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt)        # keep only letters, digits, spaces and ?!.' (removes emoji)
  txt
}
tweets$text = clean(tweets$text)          # apply the cleaning to the text column
df = data.frame()
df = rbind(df, tweets)                    # convert to a data frame
df = df[!duplicated(df[,"status_id"]),]
df = df[!duplicated(df[,"text"]),]        # drop duplicated tweets
nrow(df)
## [1] 2565
min(df$created_at)
## [1] "2021-04-23 20:32:12 UTC"
max(df$created_at)
## [1] "2021-05-01 16:17:04 UTC"
# Build the coreNLP API url: turn the local address into a url the coreNLP service accepts
generate_API_url <- function(host, port="9000",
                             tokenize.whitespace="false", annotators=""){  # do not tokenize on whitespace alone
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}',
                 host, port, tokenize.whitespace, annotators)
  url <- URLencode(url)
  url
}
# Location of the coreNLP service
host = "127.0.0.1"
generate_API_url(host)
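The call above only builds the URL; it assumes a Stanford CoreNLP server is already listening on 127.0.0.1:9000. A minimal sketch of launching one from R, assuming the CoreNLP distribution has been downloaded and unzipped locally (corenlp_dir and the memory setting are placeholders, not part of the original analysis):
corenlp_dir <- "stanford-corenlp-4.2.0"   # hypothetical path to the unzipped distribution
system(
  sprintf('cd %s && java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000',
          corenlp_dir),
  wait = FALSE                            # run in the background so the R session stays usable
)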
# Call the coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                         tokenize.whitespace="true", ssplit.eolonly="true",
                         annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # Assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001)
  # Build the api url
  url <- generate_API_url(server_host, port=port,
                          tokenize.whitespace=tokenize.whitespace,
                          annotators=paste0(annotators, collapse = ','))
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed", "application/json", encoding = "UTF-8")
  return(doc)
}
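A quick sanity check of the wrapper, assuming the English server defined above is running (the sample sentence is arbitrary and only for illustration):
# test_doc <- call_coreNLP(host, "Dogecoin is going to the moon!")
# names(test_doc)                        # should include "sentences"
# test_doc$sentences[[1]]$sentiment      # sentence-level sentiment label returned by the server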
# Run the documents through the coreNLP service
coreNLP <- function(data, host){
  # Send each document to core-nlp in turn; each document's result comes back as json
  # and is kept as an R object
  result <- apply(data, 1, function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  return(result)
}
+ Parse the tokenization results out of the returned objects, output in tidy data format
coreNLP_tokens_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for the sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
    }))
    tokens <- original_data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(1:n(), each = nrow(tokens))) %>%
      bind_cols(tokens)
    tokens
  }))
  return(result)
}
+ Parse the word dependency relations out of the returned core-nlp objects, output in tidy data format
coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for the sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss,
                 dependent=x$dependent, dependentGloss=x$dependentGloss)
    }))
    dependencies <- original_data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(1:n(), each = nrow(dependencies))) %>%
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}
+ Parse the sentence-level sentiment out of the returned core-nlp objects, output in tidy data format
coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for the sentences
    sentences <- doc$sentences
    sen <- sentences[[1]]
    sentiment <- original_data %>%
      t() %>%
      data.frame() %>%
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
    sentiment
  }))
  return(result)
}
# gc() # free unused memory
#
# t0 = Sys.time()
# obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) # run against the local coreNLP service
# # the object passed to coreNLP must be a data.frame with a text column
#
# Sys.time() - t0 # execution time
# Time difference of 28 mins
# save.image("coreNLP.RData")
# Save everything we will need so it can be loaded straight from the RData file later
# tokens = coreNLP_tokens_parser(obj)
# dependencies = coreNLP_dependency_parser(obj)
# sentiment = coreNLP_sentiment_parser(obj)
# save.image("coreNLP_all.RData")
+ Tokenization, lemmatization, POS tagging, and NER
tokens = coreNLP_tokens_parser(obj)
+ Convert to lowercase
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
tokens %>%
  filter(ner == "COUNTRY") %>%        # keep tokens whose NER tag is COUNTRY
  filter(!lower_word == 'us') %>%
  group_by(lower_word) %>%            # group by word
  summarize(count = n()) %>%          # count each group
  top_n(n = 15, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14)) +
  coord_flip()
Relatively few countries are mentioned in the discussion.
Japan shows up because the meme dog behind Dogecoin (a Shiba Inu) comes from Japan, so it gets mentioned in the discussion.
People mentioned (PERSON)
tokens %>%
  filter(ner == "PERSON") %>%         # keep tokens whose NER tag is PERSON
  filter(!lower_word %in% c('doge','bitcoin','dogecoin','defi','coin','tesla','matic','cardano')) %>%
  group_by(lower_word) %>%            # group by word
  summarize(count = n()) %>%          # count each group
  top_n(n = 13, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14)) +
  coord_flip()
tokens_df <- tokens %>% right_join(df, by = 'status_id')

tokens_df %>%
  group_by(date) %>%
  summarise(count = n()) %>%
  ggplot(aes(date, count)) +
  geom_line(color = "#00AFBB", size = 1) +
  geom_vline(xintercept = as.numeric(as.Date("2021-04-28")), col='red', size = 1) +
  labs(x = "date", y = "count")
plot_afinn_df <- tokens_df %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(date) %>%
  mutate(count = n()) %>%
  mutate(sum = sum(value)) %>%
  mutate(ratio = sum/count) %>%
  ggplot() +
  geom_line(aes(x=date, y=ratio))

## Joining, by = "word"

plot_afinn_df
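As a reminder of what the join brings in: get_sentiments("afinn") returns a two-column tibble (word, value) with integer scores from -5 to +5, so the ratio plotted above is the average AFINN score per matched token on each day. A quick look at the lexicon:
get_sentiments("afinn") %>% head()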
sentiment = coreNLP_sentiment_parser(obj)
# Distribution of sentiment across tweets
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 521 1532 504 2 5
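The same counts expressed as proportions, using base R (the rounding to three digits is arbitrary):
sentiment$sentiment %>% table() %>% prop.table() %>% round(3)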
df$date = as.Date(df$created_at)

sentiment %>%
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>%
  summarise(avg_sentiment = mean(sentimentValue, na.rm=T)) %>%
  ggplot(aes(x=date, y=avg_sentiment)) +
  geom_line()
dogefather <- tokens_df %>%
  #filter(reply_to_status_id == "1387290679794089986") %>%
  filter(reply_to_screen_name == "elonmusk")

plot_dogefather_afinn <- dogefather %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(date) %>%
  summarise(
    sum = sum(value)
  ) %>%
  ggplot(aes(date, sum)) +
  geom_col(show.legend = FALSE) +
  labs(x = "date", y = "score")
## Joining, by = "word"
plot_dogefather_afinn
+ Presumably, after Elon Musk posted his "dogefather" tweet on 4/28, Dogecoin's price surged the next day, and positive sentiment toward Elon Musk rose at the same time.
elon <- get_timeline("@elonmusk", n = 1000)
## Data cleaning: reuse the clean() function defined above
elon$text = clean(elon$text)              # apply the cleaning to the text column
ef = data.frame()
ef = rbind(ef, elon)                      # convert to a data frame
ef = ef[!duplicated(ef[,"status_id"]),]
ef = ef[!duplicated(ef[,"text"]),]        # drop duplicated tweets
### Filter to the Dogecoin discussion period (by date)
keywords = c('2021-04-23','2021-04-24','2021-04-25','2021-04-26','2021-04-27','2021-04-28','2021-04-29','2021-04-30','2021-05-01')
toMatch = paste(keywords, collapse="|")
ef = with(ef, ef[grepl(toMatch, created_at),])
+ Number of records and time range
nrow(ef)
## [1] 857
min(ef$created_at)
## [1] "2020-12-28 19:46:18 UTC"
max(ef$created_at)
## [1] "2021-05-03 17:27:11 UTC"
gc()  # free unused memory

t0 = Sys.time()
obj_elon = ef[,c(2,5)] %>% filter(text != "") %>% coreNLP(host)  # run against the local coreNLP service
# the object passed to coreNLP must be a data.frame with a text column
Sys.time() - t0  # execution time
tokens_elon = coreNLP_tokens_parser(obj_elon)
tokens_elon$lower_word = tolower(tokens_elon$word)
tokens_elon$lower_lemma = tolower(tokens_elon$lemma)

tokens_ef <- tokens_elon %>% right_join(ef, by = 'status_id')
tokens_ef$created_at <- tokens_ef$created_at %>% as.Date("%Y/%m/%d")

## Warning in as.POSIXlt.POSIXct(x, tz = tz): unknown timezone '%Y/%m/%d'
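The warning comes from as.Date() treating the second positional argument of its POSIXct method as a timezone rather than a format string; if the intent is simply to truncate the timestamps to calendar dates, a minimal alternative would be:
# tokens_ef$created_at <- as.Date(tokens_ef$created_at)   # no format string needed for POSIXct input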
plot_afinn_ef <- tokens_ef %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(created_at) %>%
  mutate(count = n()) %>%
  mutate(sum = sum(value)) %>%
  mutate(ratio = sum/count) %>%
  ggplot() +
  geom_line(aes(x=created_at, y=ratio)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-04-23','2021-05-01')))
## Joining, by = "word"
plot_afinn_ef
## Warning: Removed 475 row(s) containing missing values (geom_path).
tokens$word = tolower(tokens$word)
data(stop_words)

tokens <- tokens %>%
  anti_join(stop_words)
## Joining, by = "word"
doge_bigrams <- df %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

doge_bigrams %>%
  count(bigram, sort = TRUE)
## # A tibble: 26,578 x 2
## bigram n
## <chr> <int>
## 1 doge coin 451
## 2 the coin 175
## 3 to the 168
## 4 coin is 160
## 5 is the 144
## 6 coin to 140
## 7 this coin 130
## 8 will be 127
## 9 in the 114
## 10 love coin 106
## # ... with 26,568 more rows
bigrams_separated <- doge_bigrams %>%
  separate(bigram, c("word1", "word2"), sep = " ")

# filter out stopwords
bigrams_filtered <- bigrams_separated %>%
  filter(!word1 %in% stop_words$word) %>%
  filter(!word2 %in% stop_words$word)

# new bigram counts:
bigram_counts <- bigrams_filtered %>%
  count(word1, word2, sort = TRUE)

bigrams_united <- bigrams_filtered %>%
  unite(bigram, word1, word2, sep = " ")

bigram_tf_idf <- bigrams_united %>%
  count(status_id, bigram) %>%
  bind_tf_idf(bigram, status_id, n) %>%
  arrange(desc(tf_idf))

bigram_tf_idf
## # A tibble: 9,869 x 6
## status_id bigram n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 1385693008134381580 anotherthis time 1 1 7.76 7.76
## 2 1385696043753299971 listed withand 1 1 7.76 7.76
## 3 1385702649501294603 super cheap 1 1 7.76 7.76
## 4 1385707172059811840 thatwill allowto 1 1 7.76 7.76
## 5 1385714634485682177 positive doge 1 1 7.76 7.76
## 6 1385721219660632067 spacexdoge coin 1 1 7.76 7.76
## 7 1385748927530610692 join ator 1 1 7.76 7.76
## 8 1385764995837964290 average cost 1 1 7.76 7.76
## 9 1385796167859011589 footballmessi coin 1 1 7.76 7.76
## 10 1385796607166189568 yeah baby 1 1 7.76 7.76
## # ... with 9,859 more rows
bigram_graph <- bigram_counts %>%
  filter(n > 20) %>%
  graph_from_data_frame()
## Warning in graph_from_data_frame(.): In `d' `NA' elements were replaced with
## string "NA"
set.seed(2020)
ggraph(bigram_graph, layout = "fr") +
geom_edge_link() +
geom_node_point() +
geom_node_text(aes(label = name), vjust = 1, hjust = 1)
+ The clusters centered on "coin" in the graph are all related to Dogecoin:
  + people's coin: on 2/4 Elon Musk tweeted "Dogecoin is the people's crypto", and part of the discussion has used this phrase ever since
  + meme coin, shit coin, joke coin: all used to mock Dogecoin
  + love coin, discord: wording used in invitations to particular communities
doge_words <- df %>%
  unnest_tokens(word, text) %>%
  filter(!word %in% stop_words$word)

word_pairs <- doge_words %>%
  pairwise_count(word, status_id, sort = TRUE)
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_cors <- doge_words %>%
  group_by(word) %>%
  filter(n() >= 5) %>%
  pairwise_cor(word, status_id, sort = TRUE)

word_cors
## # A tibble: 575,322 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 snap nkla 1.
## 2 nkla snap 1.
## 3 series book 1.
## 4 book series 1.
## 5 mint royal 1.
## 6 royal mint 1.
## 7 theta fil 1.
## 8 xlr fil 1.
## 9 mkr fil 1.
## 10 cmp fil 1.
## # ... with 575,312 more rows
word_cors %>%
  filter(item1 %in% c("musk", "cuban")) %>%
  group_by(item1) %>%
  top_n(15) %>%
  ungroup() %>%
  mutate(item2 = reorder(item2, correlation)) %>%
  ggplot(aes(item2, correlation)) +
  geom_bar(stat = "identity") +
  facet_wrap(~ item1, scales = "free") +
  coord_flip()
## Selecting by correlation
dojacoin: a meme related to Doja Cat
snl 8th: Elon Musk was scheduled to appear on Saturday Night Live (SNL) on 5/8
Draw a word relationship graph of the pairs with correlation greater than 0.8.
set.seed(2020)

word_cors %>%
  filter(correlation > 0.8) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 2) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
## Warning: ggrepel: 4 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
### Remove advertisers
### Normal users do not post the same comments over and over
df_ads <- df %>%
  group_by(user_id) %>%
  summarize(count = n()) %>%
  filter(count > 7)

df_nonead <- df %>%
  filter(!user_id %in% c('1250317271869734912','1260750691083190274','1266632672685821953','1267197215635726336','1356471565110497283','1383414533754998784','1385386163389833219','1386394136094380033','4914843313','1386833874177646594','950847085288419328','1386805408916918272','1316705427493728257','54829446','1386185160920608777','104074409','1328466225492582401','1361907938537971712','1231658238933757952','1231656759233667072','265619996','930808810586832897','120810588','1372656595323207686','1384118260384235525','1384301836094480394','51110502','1388372828135755778','1216632885291057153','1183446394285150209','1279484272663879685','1292020460591357952','277451523'))
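Hard-coding the user_id list works but has to be maintained by hand; an alternative sketch that drops every account flagged by the count > 7 rule in df_ads directly (results may differ slightly from the curated list above):
# df_nonead <- df %>%
#   anti_join(df_ads, by = "user_id")   # keep only users not flagged as likely advertisers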
<- df_nonead %>%
nonead_doge_words unnest_tokens(word, text) %>%
filter(!word %in% stop_words$word)
<- nonead_doge_words %>%
nonead_word_cors group_by(word) %>%
filter(n() >= 5) %>%
pairwise_cor(word, status_id, sort = TRUE)
set.seed(2020)

# Choose a few seed words
seed_words <- c("signals","kindmore","1800","developed","announcement","btc")
# Set the threshold to 0.5
threshold <- 0.5
# Words whose correlation with a seed word exceeds the threshold are added to the removal list
nonead_remove_words <- nonead_word_cors %>%
  filter((item1 %in% seed_words | item2 %in% seed_words), correlation > threshold) %>%
  .$item1 %>%
  unique()

nonead_remove_words
## [1] "signals" "checkout" "bio" "kindmore" "purpose"
## [6] "created" "1800" "trades" "power" "xlm"
## [11] "guys" "link" "ada" "xrp" "professional"
## [16] "developed" "profit" "trade" "recommend" "love"
## [21] "mid" "announcement" "3000" "update" "btc"
## [26] "crypto"
nonead_word_cors %>%
  filter(correlation > 0.5) %>%
  filter(!(item1 %in% nonead_remove_words | item2 %in% nonead_remove_words)) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 2) +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()
plot_nonead_afinn <- nonead_doge_words %>%
  filter(!word %in% nonead_remove_words) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(date) %>%
  mutate(count = n()) %>%
  mutate(sum = sum(value)) %>%
  mutate(ratio = sum/count) %>%
  ggplot() +
  geom_line(aes(x=date, y=ratio))

## Joining, by = "word"

plot_nonead_afinn