Introduction

  • Purpose: use coreNLP and sentimentr to analyze Twitter text about COVID-19 and vaccines.
  • Overview: use tweets to analyze public sentiment toward "COVID-19 and vaccines".
  • Data source: Twitter, 4/10~4/11, 5,000 tweets, English

1. coreNLP

Install packages

packages = c("dplyr","ggplot2","rtweet","xml2","httr","jsonlite","data.tree","NLP","igraph","sentimentr","tidytext","wordcloud2","DiagrammeR","scales","stringr","magrittr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
load("coreNLP_all_1.RData")

1.1 Data collection: tweets

(1). Twitter API setup: fetch tweets via rtweet

app = '2021_sma'
consumer_key = '<YOUR_CONSUMER_KEY>'        # replace with your own Twitter API credentials
consumer_secret = '<YOUR_CONSUMER_SECRET>'
access_token = '<YOUR_ACCESS_TOKEN>'
access_secret = '<YOUR_ACCESS_SECRET>'
twitter_token <- create_token(app,consumer_key, consumer_secret,
                    access_token, access_secret,set_renv = FALSE)
#Consumer Keys: identify who you (your app) are
#Authentication Tokens: the authorization granted to your account

(2). Set search keywords and fetch tweets

# search keywords
key = c("#covid-19")
context = "vaccine"
q = paste(c(key,context),collapse=" AND ")
# query string: "#covid-19 AND vaccine"
# searching only #covid-19 would also return tweets unrelated to vaccines, so require "vaccine" to appear as well

# fetch 5,000 tweets, excluding retweets
tweets = search_tweets(q,lang="en",n=5000,include_rts = FALSE,token = twitter_token)
## Warning: Rate limit exceeded - 88
## Warning: Rate limit exceeded
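The warnings above mean the query hit Twitter's rate limit, which is why fewer than 5,000 tweets come back (see nrow(df) below). If a complete batch were needed, search_tweets() could be asked to wait and resume automatically; a sketch only, not part of the original run:

# wait out the rate limit and keep collecting until n tweets are reached
tweets = search_tweets(q, lang = "en", n = 5000, include_rts = FALSE,
                       token = twitter_token, retryonratelimit = TRUE)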

(3). Cleaning the tweet text

## text-cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # re-encode, dropping non-ASCII characters (e.g. emoji)
  txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (word characters after @ or #)
  txt = gsub("(http|https)://.*", "", txt) # remove URLs (. = any character, * = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces or tabs
  txt = gsub("\\n"," ",txt) # replace newlines with spaces
  txt = gsub("\\s+"," ",txt) # collapse repeated whitespace (+ = one or more) into a single space
  txt = gsub("^\\s+|\\s+$","",txt) # trim leading/trailing whitespace
  txt = gsub("&.*;","",txt) # remove HTML entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces, ?, !, . and ' (drops emoji and other symbols)
  txt }
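# Quick illustration of the cleaner on a made-up tweet (hypothetical input, not from the dataset):
# clean("Check this out https://t.co/abc #covid19 @user")   # -> "Check this out"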


tweets$text = clean(tweets$text)  # apply the cleaner to the tweet text

df = data.frame()
  
df = rbind(df,tweets)  # combine into a data frame

df = df[!duplicated(df[,"status_id"]),]  # drop duplicate tweets (same status_id)
head(df)
## # A tibble: 6 x 90
##   user_id   status_id   created_at          screen_name  text            source 
##   <chr>     <chr>       <dttm>              <chr>        <chr>           <chr>  
## 1 15571186… 1381534678… 2021-04-12 09:08:29 RuddFuneral… Why could Morr… Twitte…
## 2 12808449… 1381534671… 2021-04-12 09:08:27 Bob36783646  93 Israeli doc… Twitte…
## 3 82274462… 1381534651… 2021-04-12 09:08:23 Nilesh_TNIE  Vaccine shorta… Twitte…
## 4 316371232 1381534606… 2021-04-12 09:08:12 FahimQasim5… Over 175.1 mil… Twitte…
## 5 466701933 1381534603… 2021-04-12 09:08:11 c21st_org    With the vacci… Twitte…
## 6 31073711… 1381534561… 2021-04-12 09:08:01 DiaquipLtd   Covid19 G7 nat… Buffer 
## # … with 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>,
## #   ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## #   lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## #   quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## #   quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## #   quoted_name <chr>, quoted_followers_count <int>,
## #   quoted_friends_count <int>, quoted_statuses_count <int>,
## #   quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## #   retweet_source <chr>, retweet_favorite_count <int>,
## #   retweet_retweet_count <int>, retweet_user_id <chr>,
## #   retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## #   country_code <chr>, geo_coords <list>, coords_coords <list>,
## #   bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## #   description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

Check the number of tweets and their time span

created_at is already a date/time column, so min and max give the earliest and latest timestamps directly.
Note: rtweet's standard search can only retrieve tweets from at most about the last 10 days.

nrow(df)
## [1] 3194
min(df$created_at)
## [1] "2021-04-12 01:58:12 UTC"
max(df$created_at)
## [1] "2021-04-12 09:08:29 UTC"

1.2 Calling the CoreNLP API

(1). API call setup

Server side:

  • Start the CoreNLP server in a terminal before calling the API.
  • In the CoreNLP installation directory, run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

# build the coreNLP API URL: turn the local host/port into a request URL with the desired annotation properties
generate_API_url <- function(host, port="9000",
                    tokenize.whitespace="false", annotators=""){ # tokenization is not whitespace-based by default
    url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
    url <- URLencode(url)
    return(url)
}
# location of the coreNLP service
host = "127.0.0.1"

generate_API_url(host)  # preview the generated URL
# call the coreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001);
  # build the API URL
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}
# run every document through the coreNLP service
coreNLP <- function(data,host){
  # send each document to core-nlp in turn; each comes back as JSON
  # keep the parsed result together with the original row as an R object
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}
coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # sentences of the document; only the first sentence is used below
    sentences <- doc$sentences
   
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}

(2). Data-wrangling functions

coreNLP_tokens_parser (defined above) extracts the token-level results from the returned objects and outputs them in tidy-data format.

coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # sentences of the document; only the first sentence is used below
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}

coreNLP_dependency_parser extracts the word dependency relations from the returned core-nlp objects and outputs them in tidy-data format; coreNLP_sentiment_parser below extracts each sentence's sentiment label and score.

coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # sentences of the document; only the first sentence is used below
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    sentiment <- original_data %>%
      t() %>% 
      data.frame() %>% 
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
  
    sentiment
  }))
  return(result)
}

Visualizing the dependency tree

# display the parse/dependency result as a tree graph
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

1.3 Extracting results

(1). Tokenization, lemmatization, POS tagging, and NER
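The obj used below holds the per-document coreNLP results. In this notebook it comes from coreNLP_all_1.RData loaded at the top; it is presumably produced by running the helper from 1.2 over the cleaned tweets, roughly as follows (a sketch only, and slow for thousands of tweets):

# send every cleaned tweet through the local coreNLP server
obj = coreNLP(df, host)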

tokens =  coreNLP_tokens_parser(obj)
head(tokens,20)
##              status_id       word      lemma pos     ner
## 1  1381162673338003457       John       John NNP  PERSON
## 2  1381162673338003457       Tory       Tory NNP  PERSON
## 3  1381162673338003457        Got        get VBD       O
## 4  1381162673338003457          A          a  DT       O
## 5  1381162673338003457    COVID19    covid19  NN       O
## 6  1381162673338003457    Vaccine    vaccine  NN       O
## 7  1381162673338003457       Dose       dose  NN       O
## 8  1381162673338003457         He         he PRP       O
## 9  1381162673338003457 'Literally 'literally  RB       O
## 10 1381162673338003457        Did         do VBD       O
## 11 1381162673338003457        Not        not  RB       O
## 12 1381162673338003457       Even       even  RB       O
## 13 1381162673338003457       Feel       feel  VB       O
## 14 1381162673338003457        The        the  DT       O
## 15 1381162673338003457    Needle'    needle'  NN       O
## 16 1381133716878196736       Doug       Doug NNP  PERSON
## 17 1381133716878196736       Ford       Ford NNP  PERSON
## 18 1381133716878196736       gets        get VBZ       O
## 19 1381133716878196736      first      first  JJ ORDINAL
## 20 1381133716878196736       dose       dose  NN       O
  • coreNLP_tokens_parser columns:
    • status_id : the unique id of a tweet, matching status_id in the original df
    • word : the original token
    • lemma : the lemmatized form of the token
    • pos : part-of-speech tag
    • ner : named-entity type

(2). Named-entity recognition (NER)

  • Check which entity types NER identified in the data
unique(tokens$ner)
##  [1] "PERSON"            "O"                 "ORDINAL"          
##  [4] "ORGANIZATION"      "CITY"              "MISC"             
##  [7] "COUNTRY"           "NATIONALITY"       "TIME"             
## [10] "IDEOLOGY"          "CAUSE_OF_DEATH"    "TITLE"            
## [13] "DATE"              "NUMBER"            "DURATION"         
## [16] "STATE_OR_PROVINCE" "LOCATION"          "PERCENT"          
## [19] "RELIGION"          "SET"               "CRIMINAL_CHARGE"  
## [22] "MONEY"             "URL"
# excluding the "O" (other) tag, how many distinct words were tagged as an entity
length(unique(tokens$word[tokens$ner != "O"])) 
## [1] 3176
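To see how often each entity type occurs before drilling into specific types (an optional sketch, not in the original notebook):

# token counts per NER tag, most frequent first
tokens %>% count(ner, sort = TRUE)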

(3). Lowercasing

Capitalization affects coreNLP's NER decisions, so the tweet text was passed to the annotators with its original casing. After the annotators have run, we create two new columns, lower_word and lower_lemma, holding the lowercased word and lemma so that word frequencies can be counted correctly: lowercasing maps different casings of the same term (e.g., Evergiven and evergiven) to a single form before counting.

tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)

1.4 Exploratory analysis - NER

Countries mentioned (COUNTRY)

Using the NER output from coreNLP, we can extract the countries (COUNTRY) mentioned in tweets discussing covid-19, to get a first look at which countries dominate the discussion.

tokens %>%
  filter(ner == "COUNTRY") %>%  #篩選NER為COUNTRY
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 13, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

+ India saw more than 140,000 new confirmed cases on two consecutive days, putting the government on emergency alert, and its vaccine stock covers only about three more days of injections.
+ The United States declared victory over the virus too early, and a fourth wave is on its way; more than 150 million COVID-19 vaccine doses have been administered, and the fully vaccinated make up roughly one fifth of the population.

tokens %>%
  filter(ner == "ORGANIZATION") %>%  #篩選NER為ORGANIZATION
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

tokens %>%
  filter(ner == "PERSON") %>%  #篩選NER為PERSON
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

+ Johnson & Johnson vaccine
+ Moody's Analytics

dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
##              status_id       dep governor governorGloss dependent
## 1  1381162673338003457      ROOT        0          ROOT        13
## 2  1381162673338003457  compound        2          Tory         1
## 3  1381162673338003457     nsubj        3           Got         2
## 4  1381162673338003457 parataxis       13          Feel         3
## 5  1381162673338003457       det        7          Dose         4
## 6  1381162673338003457  compound        7          Dose         5
## 7  1381162673338003457  compound        7          Dose         6
## 8  1381162673338003457       obj        3           Got         7
## 9  1381162673338003457     nsubj       13          Feel         8
## 10 1381162673338003457    advmod       13          Feel         9
## 11 1381162673338003457       aux       13          Feel        10
## 12 1381162673338003457    advmod       13          Feel        11
## 13 1381162673338003457    advmod       13          Feel        12
## 14 1381162673338003457       det       15       Needle'        14
## 15 1381162673338003457       obj       13          Feel        15
## 16 1381133716878196736      ROOT        0          ROOT         3
## 17 1381133716878196736  compound        2          Ford         1
## 18 1381133716878196736     nsubj        3          gets         2
## 19 1381133716878196736      amod        5          dose         4
## 20 1381133716878196736       obj        3          gets         5
##    dependentGloss
## 1            Feel
## 2            John
## 3            Tory
## 4             Got
## 5               A
## 6         COVID19
## 7         Vaccine
## 8            Dose
## 9              He
## 10     'Literally
## 11            Did
## 12            Not
## 13           Even
## 14            The
## 15        Needle'
## 16           gets
## 17           Doug
## 18           Ford
## 19          first
## 20           dose

1.5 Exploratory analysis - Dependency

Dependency-parse result for a single sentence
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:httr':
## 
##     content
## The following object is masked from 'package:ggplot2':
## 
##     annotate
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

1.6 Exploratory analysis - Sentiment

Sentence sentiment scores

Sentiment scores range from 0 (most negative) to 4 (most positive):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive

sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
##              status_id
## 1  1381162673338003457
## 2  1381133716878196736
## 3  1381123156300140544
## 4  1381131207602274304
## 5  1381147814055583746
## 6  1381014676356673537
## 7  1381004122283991052
## 8  1381142027165970432
## 9  1381141019392049153
## 10 1381144054189162498
## 11 1381143280822460416
## 12 1381129434229579778
## 13 1381162654702694401
## 14 1381140108028612610
## 15 1381162652047708164
## 16 1381162648973246466
## 17 1381162643998789632
## 18 1381162616052191235
## 19 1381072014216208389
## 20 1381132409194885121
##                                                                                                                                                                                                                             text
## 1                                                                                                                                              John Tory Got A COVID19 Vaccine Dose  He 'Literally Did Not Even Feel The Needle'
## 2                                                                                                                                                                       Doug Ford gets first dose of AstraZeneca COVID19 vaccine
## 3                                                                                                                                                      Toronto looks to create standby lists at mass COVID19 vaccination clinics
## 4                                                                                                                                       Here's why Canadians have reason to be more optimistic about the COVID19 vaccine rollout
## 5                                                                                                                                                     Pfizer BioNTech seek U.S. emergency nod for COVID19 vaccine in adolescents
## 6                                                                                                                                            COVID19 vaccine shortages to hit worlds poorest countries as COVAX halts deliveries
## 7                                                                                                                                                        Pfizers COVID19 shot less effective against South African variant study
## 8                                                                                                                                                   Second doses erroneously administered at Toronto COVID19 vaccination clinics
## 9                                                                                                                                                        Employees at Toronto operated vaccine clinics test positive for COVID19
## 10                                                                                                                                                                           JJ COVID19 vaccine under EU review over blood clots
## 11                                                                                                                                                                 Expired COVID19 vaccine given out at Metro Vancouver pharmacy
## 12                                                                                                                                                                    Ford to receive first dose of COVID19 vaccine this morning
## 13                                                                                                                                          How Trump's Initiative and Free Market Innovation Created the Coronavirus Vaccinevia
## 14                                                                                                                                   The answer to your COVID19 vaccine question. To learn more about the COVID19 vaccines visit
## 15 The COVID19 vaccine is safe  effective. The British Islamic Medical Association  has confirmed you can have the vaccine during Ramadan. If you prefer make an appointment before or after Ramadan. To book your vaccine visit
## 16                                             While global equity concerns are acknowledged can it trump a countrys own domestic vaccine equity concerns? The latter is very difficult to sustain politically and socially too.
## 17  The COVID19 vaccine was only made available to the public after meeting strict safety and effectiveness criteria. Vaccines work by teaching your immune system how to defend itself against attack from the virus. Read more
## 18                     Indians with comorbidities face a high risk in terms of severeand mortality. It's imperative to vaccinate vulnerable groups asap and then expand to other ageson India's public health challenge from the
## 19                     As severalvaccine centres report low supplies amid a surge of cases we askand health sector analystabout Indias vaccine manufacturing capacity and the implications of vaccine shortage for public health
## 20                                       Dr Reddys Laboratories could get emergencyuse authorisation to make thevaccine. India needs many more doses to expandbeyond the elderly  those with comorbidities health sector analyst
##    sentiment sentimentValue
## 1    Neutral              2
## 2    Neutral              2
## 3    Neutral              2
## 4    Neutral              2
## 5    Neutral              2
## 6    Neutral              2
## 7    Neutral              2
## 8    Neutral              2
## 9   Positive              3
## 10   Neutral              2
## 11   Neutral              2
## 12   Neutral              2
## 13   Neutral              2
## 14   Neutral              2
## 15  Positive              3
## 16  Negative              1
## 17  Negative              1
## 18  Negative              1
## 19  Negative              1
## 20  Negative              1
Sentiment categories present in the data
unique(sentiment$sentiment)
## [1] "Neutral"      "Positive"     "Negative"     "Verynegative" "Verypositive"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# distribution of posts across sentiment categories
sentiment$sentiment %>% table()
## .
##     Negative      Neutral     Positive Verynegative Verypositive 
##         1152         3330          449            4            4
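The same distribution as proportions (an optional sketch, not in the original notebook): most tweets are scored Neutral, and roughly a quarter are Negative.

# share of each sentiment label
sentiment$sentiment %>% table() %>% prop.table() %>% round(3)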
Average sentiment score over time
df$date = as.Date(df$created_at)

sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment)) + 
  geom_line()

Sentiment over time by client (source)
sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% 
  group_by(date,source) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment,color=source)) + 
  geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.

What is the sentiment distribution, and which words appear in positive and in negative posts?

# word usage in positive posts
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the','covid19','vaccine',19,'covid19.','covid')) %>% 
  filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
  group_by(lower_lemma) %>% #根據lemma分組
  summarize(count = n()) %>% 
  filter(count >5 & count<400)%>%
  wordcloud2()
## Joining, by = "word"
  • dose
# word usage in negative posts
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
  group_by(lower_lemma) %>% 
  summarize(count = n()) %>% 
  filter(count >10 &count<400)%>%
  wordcloud2()
## Joining, by = "word"

(wordcloud of terms in negative posts)

2.2 Applying sentimentr to the Twitter data

Positive words in the tweets
library(sentimentr)
set.seed(10)
mytext <- get_sentences(tweets$text) # split the text into sentences (sentimentr's sentence objects)
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1,000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # word counts with polarity scores
sentiment_counts[polarity > 0,]   # positive words
##         words polarity  n
##   1: efficacy      1.0 48
##   2:      top      1.0 35
##   3: approved      1.0  8
##   4: approval      1.0  5
##   5:     fast      1.0  5
##  ---                     
## 360:  masters      0.1  1
## 361:     pray      0.1  1
## 362:   shares      0.1  1
## 363:  depends      0.1  1
## 364:   church      0.1  1
Negative words in the tweets
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
##          words polarity  n
##  1:      virus    -0.50 31
##  2: government    -0.50 27
##  3:    disease    -1.00 20
##  4:       risk    -0.75 19
##  5:    prevent    -0.25 18
##  6:       fall    -0.25 12
##  7:     strain    -0.60 12
##  8:       drug    -0.10 11
##  9:       shot    -0.40 11
## 10:     cancer    -0.75 11
Highlight each sentence as positive or negative
set.seed(12)
df %>%
    filter(status_id %in% sample(unique(status_id), 30)) %>% # randomly pick 30 tweets
    mutate(review = get_sentences(text)) %$% 
    sentiment_by(review, status_id) %>%
    highlight()
## Saved in /var/folders/p2/cqvmy7c17px138qxl3wdzfzm0000gn/T//RtmpdM7882/polarity.html
## Opening /var/folders/p2/cqvmy7c17px138qxl3wdzfzm0000gn/T//RtmpdM7882/polarity.html ...

2.3 Sentiment over time by date

Code adapted from https://github.com/trinker/sentimentr

tweets$date = format(tweets$created_at,'%Y%m%d')

(out  = tweets  %>%  with(
    sentiment_by( #document level
        get_sentences(text), 
        list( date)
    )
))
plot(out)

2.4 Sentiment over time by date and client

(out  = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%  with(
    sentiment_by(
        get_sentences(text), 
        list(source, date)
    )
))
plot(out)