Introduction

  • Purpose: use CoreNLP and sentimentr to analyze Twitter text data about COVID-19 and vaccines
  • Overview: collect tweets with rtweet, annotate them with Stanford CoreNLP (tokenization, lemmatization, POS tagging, NER, dependency parsing, sentence sentiment), then run a second sentiment analysis with sentimentr
  • Data source: Twitter, 04/06 ~ 04/10, 5,000 tweets, English

1. coreNLP

Install packages

packages = c("dplyr","ggplot2","rtweet","xml2","httr","jsonlite","data.tree","NLP","igraph","sentimentr","tidytext","wordcloud2","DiagrammeR","scales","stringr","magrittr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
load("coreNLP.RData")

1.1 Data collection: tweets

(1). Twitter API setup: fetch tweets through rtweet

app = 'r lan'
consumer_key = 'dJefV7i1VsTTJ9rg4NGKsaVk8'
consumer_secret = 'S2m6I1bHt6sBVd4y0vv2v2Mmq4XYb5qRhNCO1KbOmDzp9nWiXh'
access_token = '3610603634-lAhazEWR7VpLgyMt5Ink3jojkg8iSBWIqbqqjaF'
access_secret = 'pvKFRmnJmhft9SOp1aMcwBBNEX9hqsc5jtHpMONUgpWkm'
twitter_token <- create_token(app,consumer_key, consumer_secret,
                    access_token, access_secret,set_renv = FALSE)

(2). Set the search keywords and fetch tweets

# search keywords
key = c("#COVID19")
context = "#vaccine"
q = paste(c(key,context),collapse=" AND ")   
# final query string: "#COVID19 AND #vaccine"
# requiring both hashtags avoids tweets that mention COVID-19 but have nothing to do with vaccines

# fetch 5,000 tweets, excluding retweets
tweets = search_tweets(q,lang="en",n=5000,include_rts = FALSE,token = twitter_token)

(3). tweets內容清理

## text-cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # normalize encoding and drop non-ASCII characters
  txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (word characters after @ or #)
  txt = gsub("(http|https)://.*", "", txt) # remove URLs (.: any character, *: zero or more times)
  txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces/tabs
  txt = gsub("\\n"," ",txt) # replace line breaks with spaces
  txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+: one or more) into a single space
  txt = gsub("^\\s+|\\s+$","",txt) # trim leading and trailing whitespace
  txt = gsub("&.*;","",txt) # remove HTML character entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces and ? ! . ' (drops emoji, etc.)
  txt }


tweets$text = clean(tweets$text)  # apply the cleaning function to the tweet text

df = data.frame()

df = rbind(df,tweets)  # convert the tweets to a data frame

df = df[!duplicated(df[,"status_id"]),]  # drop duplicate tweets (same status_id)
head(df)
## # A tibble: 6 x 90
##   user_id   status_id   created_at          screen_name  text            source 
##   <chr>     <chr>       <dttm>              <chr>        <chr>           <chr>  
## 1 135187648 1380922284… 2021-04-10 16:35:03 WorldBankAf… NEWreport find… Hootsu…
## 2 471615885 1380922251… 2021-04-10 16:34:55 MeadowGood   What about any… Twitte…
## 3 41626609  1380922195… 2021-04-10 16:34:42 greg_folkers . Quotation of… Twitte…
## 4 10430292… 1380921774… 2021-04-10 16:33:01 tweet_aneri  inTelangana le… Twitte…
## 5 11349623… 1380921111… 2021-04-10 16:30:23 AccessCNY    Every program … Hootsu…
## 6 11349623… 1379803826… 2021-04-07 14:30:42 AccessCNY    Thank you to F… Hootsu…
## # … with 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>,
## #   ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## #   lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## #   quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## #   quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## #   quoted_name <chr>, quoted_followers_count <int>,
## #   quoted_friends_count <int>, quoted_statuses_count <int>,
## #   quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## #   retweet_source <chr>, retweet_favorite_count <int>,
## #   retweet_retweet_count <int>, retweet_user_id <chr>,
## #   retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## #   country_code <chr>, geo_coords <list>, coords_coords <list>,
## #   bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## #   description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

df has 90 columns, but only a few of them are used here (a short selection example follows the list):

  • user_id: user ID
  • status_id: tweet ID
  • created_at: time the tweet was posted
  • text: tweet content
  • source: client the tweet was posted from
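For reference, the snippet below (purely illustrative) keeps only these columns; the rest of the analysis continues to use the full df.

# illustrative: subset df to the columns used in this analysis
df_small = df %>% select(user_id, status_id, created_at, text, source)
head(df_small)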

Check how many tweets we have and how they are distributed over time.

created_at is already a datetime column, so min and max give the earliest and latest dates directly.
Note: rtweet can only retrieve tweets from roughly the last 10 days.

nrow(df)
## [1] 4948
min(df$created_at)
## [1] "2021-04-06 22:05:04 UTC"
max(df$created_at)
## [1] "2021-04-10 16:35:03 UTC"

Which accounts' tweets get quoted

df %>%
  filter(quoted_name != "") %>%  # keep rows where quoted_name is not empty
  group_by(quoted_name) %>%      # group by quoted account name
  summarize(count = n()) %>%     # count tweets per quoted account
  top_n(n = 15, count) %>%
  ungroup() %>% 
  mutate(user = reorder(quoted_name, count)) %>%
  ggplot(aes(user, count)) + 
  geom_col()+
  ggtitle("Word Frequency (quoted_name)") +
  theme(text=element_text(size=14))+
  coord_flip()

The accounts whose tweets are shared by other users are mostly organizations, news agencies, and magazines, such as
Reuters, the Financial Times, the EU Medicines Agency (European Medicines Agency), The Wall Street Journal, and The New York Times.
Tweets from these accounts are less likely to be misinformation, which is presumably why users repost them.

Where are the quoted accounts located?

df %>%
  filter(quoted_location != "") %>%  # keep rows where quoted_location is not empty
  group_by(quoted_location) %>%      # group by quoted account location
  summarize(count = n()) %>%         # count tweets per location
  top_n(n = 15, count) %>%
  ungroup() %>% 
  mutate(location = reorder(quoted_location, count)) %>%
  ggplot(aes(location, count)) + 
  geom_col()+
  ggtitle("Word Frequency (quoted_location)") +
  theme(text=element_text(size=14))+
  coord_flip()

Follower counts of the quoted accounts

df %>%
  filter(quoted_followers_count != "") %>%  # keep rows with a follower count
  group_by(quoted_followers_count) %>%      # group by follower count
  summarize(count = n()) %>%                # count tweets per value
  top_n(n = 15, count) %>%
  ungroup() %>% 
  mutate(number = reorder(quoted_followers_count, count)) %>%
  ggplot(aes(number, count)) + 
  geom_col()+
  ggtitle("Word Frequency (quoted_followers_count)") +
  theme(text=element_text(size=14))+
  coord_flip()

Accounts whose tweets get quoted tend to have relatively large follower counts.

df %>%
  filter(quoted_name != "") %>%
  filter(quoted_location != "") %>%
  filter(quoted_followers_count != "") %>%
  select(quoted_name,quoted_location,quoted_followers_count) 
## # A tibble: 376 x 3
##    quoted_name                    quoted_location          quoted_followers_cou…
##    <chr>                          <chr>                                    <int>
##  1 Gearoid Reidy                  Tokyo, Japan                              9155
##  2 Julia Mio Inuma (井沼ジュリア) Tokyo, Japan                               691
##  3 Rochelle Kopp                  Tokyo and Silicon Valley                 10278
##  4 The Times Of India             New Delhi                             13471119
##  5 MOHS                           Sierra Leone                              1478
##  6 Amanda Jetté Knox              Ottawa, Canada                           65185
##  7 MDVaxAlerts                    College Park, MD                         18481
##  8 MDVaxAlerts                    College Park, MD                         18481
##  9 Meg Tirrell                    New York                                 88455
## 10 Governor Phil Murphy           New Jersey, USA                         384616
## # … with 366 more rows

1.2 Connecting to the CoreNLP API

(1). API call setup

Server side: the CoreNLP server must be started in a terminal first. Open a terminal in the CoreNLP installation directory and run:
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
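Before sending any documents, a quick sanity check from R can confirm the server is reachable (a minimal sketch, assuming the default host and port used above):

# should return 200 when the CoreNLP server is running on localhost:9000
httr::status_code(httr::GET("http://127.0.0.1:9000/"))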

# build the coreNLP API URL: combine host/port and the requested properties into a URL the service accepts
generate_API_url <- function(host, port="9000",
                    tokenize.whitespace="false", annotators=""){ # tokenization is not purely whitespace-based
    url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
    url <- URLencode(url)
}
# location of the CoreNLP service
host = "127.0.0.1"

generate_API_url(host)
# call the coreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001);
  # build the API URL
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}
# run documents through the coreNLP service
coreNLP <- function(data,host){
  # send each document to core-nlp in turn; each document's result is returned as JSON
  # store the processed results as R objects
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}

(2). Parsing helper functions

Extract the tokenization results from the returned objects and output them in tidy-data format.

coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # sentences of the document; only the first sentence is used here
    sentences <- doc$sentences
   
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}

Extract the word dependency relations from the returned core-nlp objects and output them in tidy-data format.

coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # sentences of the document; only the first sentence is used here
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}

Extract the sentence-level sentiment from the returned core-nlp objects and output it in tidy-data format.

coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # sentences of the document; only the first sentence is used here
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    sentiment <- original_data %>%
      t() %>% 
      data.frame() %>% 
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
  
    sentiment
  }))
  return(result)
}

Visualizing the parse tree

Code adapted from: https://stackoverflow.com/questions/35496560/how-to-convert-corenlp-generated-parse-tree-into-data-tree-r-package

# plot the CoreNLP parse tree as a tree diagram
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

Sending the sentences to the service

Getting the objects returned by CoreNLP
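Each element of obj pairs the parsed CoreNLP JSON ($doc) with the original row ($data), as defined in coreNLP() above. A quick way to peek at the annotation fields of the first annotated sentence (illustrative):

# illustrative: top-level structure of the first annotated sentence
str(obj[[1]]$doc$sentences[[1]], max.level = 1)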

# gc() # release unused memory
# 
# t0 = Sys.time()
# obj = df[,c(2,5)]  %>% filter(text != "") %>% coreNLP(host) # run against the local server
# # the object passed to coreNLP must be a data.frame with a text column
# 
# Sys.time() - t0 # execution time
# #Time difference of 28 mins
# 
# save.image("coreNLP.RData")
# the results below were saved ahead of time, so they can be reloaded directly from the RData file
# tokens =  coreNLP_tokens_parser(obj)
# dependencies = coreNLP_dependency_parser(obj)
# sentiment = coreNLP_sentiment_parser(obj)
# save.image("coreNLP.RData")

1.3 Extracting the results

(1). Tokenization, lemmatization, POS tagging, and NER

#tokens =  coreNLP_tokens_parser(obj)
head(tokens,20)
##              status_id             word            lemma pos ner
## 1  1380922284517511175        NEWreport        newreport  NN   O
## 2  1380922284517511175            finds             find VBZ   O
## 3  1380922284517511175             that             that  IN   O
## 4  1380922284517511175            given             give VBN   O
## 5  1380922284517511175              the              the  DT   O
## 6  1380922284517511175 presenthesitancy presenthesitancy  NN   O
## 7  1380922284517511175           levels            level NNS   O
## 8  1380922284517511175       strategies         strategy NNS   O
## 9  1380922284517511175              are               be VBP   O
## 10 1380922284517511175           needed             need VBN   O
## 11 1380922284517511175               to               to  TO   O
## 12 1380922284517511175         generate         generate  VB   O
## 13 1380922284517511175       Confidence       confidence  NN   O
## 14 1380922284517511175       Acceptance       Acceptance NNP   O
## 15 1380922284517511175           Demand           demand  NN   O
## 16 1380922284517511175              for              for  IN   O
## 17 1380922284517511175       thevaccine       thevaccine  NN   O
## 18 1380922284517511175            Learn            learn  VB   O
## 19 1380922284517511175             more             more JJR   O
## 20 1380922251428659204             What             what  WP   O
  • coreNLP_tokens_parser columns (a short usage example follows this list):
    • status_id: matches status_id in the original df; the unique ID of a tweet
    • word: the original token
    • lemma: the lemmatized form of the token
    • pos: part-of-speech tag
    • ner: named-entity label
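These columns can be combined for quick checks; for example, counting the most frequent verb lemmas (an illustrative sketch using dplyr on the tokens table above):

# illustrative: most frequent verb lemmas (POS tags starting with "VB")
tokens %>%
  filter(grepl("^VB", pos)) %>%
  count(lemma, sort = TRUE) %>%
  head(10)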

(2). Named-entity recognition (NER)

  • Check the NER column to see which entity types were recognized
unique(tokens$ner)
##  [1] "O"                 "LOCATION"          "NUMBER"           
##  [4] "PERSON"            "DATE"              "DURATION"         
##  [7] "MISC"              "ORGANIZATION"      "COUNTRY"          
## [10] "CITY"              "NATIONALITY"       "ORDINAL"          
## [13] "RELIGION"          "TITLE"             "STATE_OR_PROVINCE"
## [16] "CAUSE_OF_DEATH"    "CRIMINAL_CHARGE"   "IDEOLOGY"         
## [19] "SET"               "TIME"              "URL"              
## [22] "PERCENT"           "MONEY"
# excluding the "O" (other) label, how many distinct words were tagged as an entity
length(unique(tokens$word[tokens$ner != "O"])) 
## [1] 2982

(3). Lowercasing

Capitalization affects CoreNLP's NER decisions, so the tweet text was left in its original case when it was sent to the annotators. After annotation, however, we create the new columns lower_word and lower_lemma to hold lowercased copies of word and lemma so that word frequencies can be counted correctly: lowercasing merges different capitalizations of the same term (e.g. "Vaccine" and "vaccine") before counting.

tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
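A quick check of the effect (illustrative):

# after lowercasing, different capitalizations of the same word collapse into one row
tokens %>% count(lower_word, sort = TRUE) %>% head(10)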

1.4 Exploratory analysis - NER

Countries mentioned (COUNTRY)
tokens %>%
  filter(ner == "COUNTRY") %>%  # keep tokens tagged as COUNTRY
  group_by(lower_word) %>%      # group by word
  summarize(count = n()) %>%    # count each group
  top_n(n = 13, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

  • The most frequently mentioned country is the UK; recent vaccine-related news about the UK includes:
    • More than half of UK adults had received at least one vaccine dose, making the UK the first major world economy to reach that level of coverage.
    • AstraZeneca paused its clinical trial of the vaccine in UK children.
    • Because the AZ vaccine is thought to carry a very small risk of blood clots in recipients, the UK offered people under 30 a different vaccine.
  • India is the second most mentioned country; recent vaccine-related news about India includes:
    • The Serum Institute of India is the world's largest vaccine manufacturer.
    • Confirmed cases in India surged on April 9 and 10, and the Indian government announced a halt to vaccine exports.
Organizations mentioned (ORGANIZATION)
tokens %>%
  filter(ner == "ORGANIZATION") %>%  # keep tokens tagged as ORGANIZATION
  group_by(lower_word) %>%           # group by word
  summarize(count = n()) %>%         # count each group
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

  • publix is a US grocery/pharmacy chain that offers COVID-19 vaccinations in its stores
  • st. is likely an abbreviation (e.g. part of a place name such as St. Lucie)
  • astrazeneca is a pharmaceutical company that produces a COVID-19 vaccine
  • martin refers to Martin's Supermarket, which also offers vaccinations in its stores
  • lucie comes from St. Lucie County in Florida, USA
  • volusia, pinellas, and osceola are all counties in Florida, USA
People mentioned (PERSON)
tokens %>%
  filter(ner == "PERSON") %>%  # keep tokens tagged as PERSON
  group_by(lower_word) %>%     # group by word
  summarize(count = n()) %>%   # count each group
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

  • johnson refers to UK Prime Minister Boris Johnson
  • walton, hernando, duval, lee, miamidade, and broward are county names in Florida
  • biden is US President Joe Biden
  • modi is the Prime Minister of India

1.5 Exploratory analysis - Dependency

Sentence dependency parsing results
#dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
##              status_id        dep governor governorGloss dependent
## 1  1380922284517511175       ROOT        0          ROOT         2
## 2  1380922284517511175      nsubj        2         finds         1
## 3  1380922284517511175       mark       10        needed         3
## 4  1380922284517511175       case        7        levels         4
## 5  1380922284517511175        det        7        levels         5
## 6  1380922284517511175   compound        7        levels         6
## 7  1380922284517511175        obl       10        needed         7
## 8  1380922284517511175 nsubj:pass       10        needed         8
## 9  1380922284517511175   aux:pass       10        needed         9
## 10 1380922284517511175      ccomp        2         finds        10
## 11 1380922284517511175       mark       12      generate        11
## 12 1380922284517511175      xcomp       10        needed        12
## 13 1380922284517511175   compound       15        Demand        13
## 14 1380922284517511175   compound       15        Demand        14
## 15 1380922284517511175      nsubj       18         Learn        15
## 16 1380922284517511175       case       17    thevaccine        16
## 17 1380922284517511175       nmod       15        Demand        17
## 18 1380922284517511175      ccomp       12      generate        18
## 19 1380922284517511175        obj       18         Learn        19
## 20 1380922251428659204       ROOT        0          ROOT         1
##      dependentGloss
## 1             finds
## 2         NEWreport
## 3              that
## 4             given
## 5               the
## 6  presenthesitancy
## 7            levels
## 8        strategies
## 9               are
## 10           needed
## 11               to
## 12         generate
## 13       Confidence
## 14       Acceptance
## 15           Demand
## 16              for
## 17       thevaccine
## 18            Learn
## 19             more
## 20             What
Visualize the parse tree
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

1.6 Exploratory analysis - Sentiment

Sentence sentiment scores

Sentiment scores range from 0 (most negative) to 4 (most positive); the sketch after this list collapses them into three broader buckets:
  • 0, 1: very negative, negative
  • 2: neutral
  • 3, 4: positive, very positive
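A minimal sketch for a quick overview (sentimentValue is converted to numeric here, mirroring the conversion done later in this section):

# illustrative: collapse the 0-4 scale into negative / neutral / positive buckets
sentiment %>%
  mutate(score = as.numeric(as.character(sentimentValue)),
         bucket = case_when(score <= 1 ~ "negative",
                            score == 2 ~ "neutral",
                            TRUE       ~ "positive")) %>%
  count(bucket)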

#sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
##              status_id
## 1  1380922284517511175
## 2  1380922251428659204
## 3  1380922195552178176
## 4  1380921774162944003
## 5  1380921111311958019
## 6  1379803826132283394
## 7  1380921088062976007
## 8  1380220172091453440
## 9  1380920174581141504
## 10 1379691746163036163
## 11 1380919458470969350
## 12 1380570929818898432
## 13 1379683088045760520
## 14 1380919205499871233
## 15 1380434176168501254
## 16 1380569194597523456
## 17 1379681865129357312
## 18 1380919092677287940
## 19 1380565549848043523
## 20 1380918924934406151
##                                                                                                                                                                                                                                                   text
## 1                                                                                                      NEWreport finds that given the presenthesitancy levels strategies are needed to generate Confidence Acceptance Demand for thevaccine Learn more
## 2                                                            What about anyone who got the vaccine wear a pin to show you are vaccinated? This will encourage others  we will know who is in the club so we know who we can let our masks down around?
## 3                                                                          . Quotation of the Day Relying on one dose of Moderna or Pfizer to stop variants like B.1.351 is like using a BB gun to stop a charging rhino.JOHN P. MOORE a virologist at
## 4                                                            in  Telangana left with only 5.6 lakh doses ofwhich would last for 3 days. Govt of Telangana writes to Centre to provide at least 30 lakh doses of Vaccine to Telangana for next 15 days.
## 5                                                                                           Every program at AccessCNY serves highrisk individuals. And every person who gets vaccinated helps to protect our participants and the community at large!
## 6                                                            Thank you to Faye and every other AccessCNY employee who has spent the last year caring for and advocating for the people we serve. The impact of your dedicated actions is immeasurable.
## 7                                                                                                                                                             ages 50 or older  it is time for you to take your shot! Contact your countydepartment or
## 8                                                                                                                                                             ages 50 or older  it is time for you to take your shot! Contact your countydepartment or
## 9                                                                                               Its vaccine day for our lil family. Will I post a picture or share an update or two later? Im not sure.Here is to two more humans beginning thejourney
## 10                            AZN Booted from Baltimore  Fair Play by the US? Coincidence politics or a little of both AstraZeneca's road to FDA approval has been a rocky one even if the US has purchased 500M doses of their EBS NVAX ALT ALNY DVAX
## 11                                                All Eyes Are on Upcoming Novavax's Phase 3 Study Readout .'sis arriving later to theimmunization frenzy thanwanted but still capable of being a major player in the global supply. NVAX AZN CVAC ALT
## 12                                        JNJVaccine Under EU Review with Same Safety Issue as AstraZeneca Vaccine Europes drug regulator said it is reviewing reports of blood clots in 4 people who received J has expanded itsprobe. NVAX OCGN MRNA
## 13                                   Thevaccine is still  for no good reason at all  not approved in . This is moving from indifference bureaucratic rigidity and perfect as enemy of the good to a morally bankrupt and abject failure of leadership.
## 14                had 59 new cases per 100K in the last 7 days  and climbing. Los Angeles County had 35. Osakas positive test rate 8 LA 1.4. Osaka has vaccinated  LA 30 with 1 dose.Osaka is heading into crisis by any standardwith little response.
## 15                   Anticipatedsupply for 65 inseems to increase rapidly starting in May but Ive seen no discussion of plannedanticipatedratescapacity. Completing 2 doses to 30 of 36 mill 65 between May 1 and say July 15 requires 400450K vaxday.
## 16                                                                                                                                   Would that be the same Ministry of Health that appears to be moving the approval of thevaccine at a glacial pace?
## 17                                   Vox clamantis in deserto. A voice crying in the Japanese  wilderness. Sadly once bureaucratic inertia required but slow consensus and inflexibility set a reactive and ineffective course adjustment is unlikely.
## 18 45  ...about getting theafter being sick withbecause they are under the impression that they will be much sicker after the . While this shows they can get more of a systemic reaction it is safe and they won't be sick enough to be hospitalized.
## 19                                                                                                      While we are making progress we still more work to do. This is our shot! Get your COVID19 vaccine when you are able  help end this pandemic...
## 20                                                                                                                                       It Begins Only COVID Vaccinated People Can Be Evacuated from Volcano Stricken Caribbean Island of St. Vincent
##    sentiment sentimentValue
## 1    Neutral              2
## 2   Positive              3
## 3    Neutral              2
## 4   Negative              1
## 5    Neutral              2
## 6    Neutral              2
## 7   Negative              1
## 8   Negative              1
## 9   Negative              1
## 10  Negative              1
## 11   Neutral              2
## 12  Negative              1
## 13  Negative              1
## 14  Positive              3
## 15  Negative              1
## 16   Neutral              2
## 17  Negative              1
## 18  Negative              1
## 19  Positive              3
## 20   Neutral              2
Sentiment categories present in the data set
unique(sentiment$sentiment)
## [1] "Neutral"      "Positive"     "Negative"     "Verynegative" "Verypositive"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# distribution of tweets across sentiment categories
sentiment$sentiment %>% table()
## .
##     Negative      Neutral     Positive Verynegative Verypositive 
##         1288         2858          773            2            3
Average sentiment score over time
df$date = as.Date(df$created_at)

sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment)) + 
  geom_line()

What vocabulary do tweets with positive versus negative sentiment use?
# vocabulary used in positive tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
  group_by(lower_lemma) %>% # group by lemma
  summarize(count = n()) %>% 
  filter(count >5 & count<400)%>%
  wordcloud2()
## Joining, by = "word"
# vocabulary used in negative tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
  group_by(lower_lemma) %>% 
  summarize(count = n()) %>% 
  filter(count >10 &count<400)%>%
  wordcloud2()

2. sentimentr: English sentiment analysis

2.2 Applying sentimentr to the Twitter data
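sentimentr scores text at the sentence level, returning a polarity score per sentence. A minimal toy example of the basic call (illustrative sentences, not from the data set):

library(sentimentr)
# illustrative: sentence-level polarity for two toy sentences
sentiment(get_sentences("The vaccine rollout is going well. The shortage is terrible."))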

Positive words in the tweets
library(sentimentr)

set.seed(10)
mytext <- get_sentences(tweets$text) # split the tweet text into sentences
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1,000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the words that carry sentiment
sentiment_counts <- attributes(sentiment_words)$counts # count how often each word appears
sentiment_counts[polarity > 0,]   # positive words
##          words polarity  n
##   1:    please      1.0 99
##   2:      care      1.0 12
##   3:  approved      1.0 10
##   4:  efficacy      1.0  8
##   5:  approval      1.0  7
##  ---                      
## 508: coalition      0.1  1
## 509:  relating      0.1  1
## 510:  momentum      0.1  1
## 511:    prefer      0.1  1
## 512:     build      0.1  1
Negative words in the tweets
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
##              words polarity  n
##  1: pharmaceutical    -0.40 80
##  2:       pandemic    -1.00 69
##  3:           shot    -0.40 41
##  4:            jab    -0.60 31
##  5:          virus    -0.50 30
##  6:     government    -0.50 22
##  7:           risk    -0.75 22
##  8:          fight    -0.50 13
##  9:       shortage    -0.75 13
## 10:            flu    -0.75 13
Highlight each sentence and label it as positive/negative
# set.seed(12)
# df%>%
#     filter(status_id %in% sample(unique(status_id), 30)) %>% # 30 random tweets
#     mutate(review = get_sentences(text)) %$% 
#     sentiment_by(review, status_id) %>%
#     highlight()

2.3 Sentiment over time by date

Code adapted from https://github.com/trinker/sentimentr

tweets$date = format(tweets$created_at,'%Y%m%d')

(out  = tweets  %>%  with(
    sentiment_by( #document level
        get_sentences(text), 
        list( date)
    )
))
plot(out)

2.4 Sentiment over time by client (source)

(out  = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%  with(
    sentiment_by(
        get_sentences(text), 
        list(source, date)
    )
))
plot(out)