Introduction

  • Purpose: analyze Twitter text about the COVID-19 vaccine with coreNLP and sentimentr
  • Data source: Twitter, 4/4~4/12, 821 tweets, in English

1. coreNLP

Install and load packages

packages = c("dplyr","ggplot2","rtweet","xml2","httr","jsonlite","data.tree","NLP","igraph","sentimentr","tidytext","wordcloud2","DiagrammeR","scales","magrittr","stringr","data.table")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(data.table)
load("coreNLP_all.RData")

1.1 Data collection: tweets

(1) Twitter API setup: fetch tweets via rtweet

app = '2021_sma'
consumer_key = 'YOUR_CONSUMER_KEY'
consumer_secret = 'YOUR_CONSUMER_SECRET'
access_token = 'YOUR_ACCESS_TOKEN'
access_secret = 'YOUR_ACCESS_SECRET'
twitter_token <- create_token(app, consumer_key, consumer_secret,
                    access_token, access_secret, set_renv = FALSE)
# Consumer keys: identify who you are
# Authentication tokens: the authorization granted to your app

(2) Set keywords and fetch tweets

# search keywords
key = c("vaccine","az vaccine","covid-19")
q = paste(key,collapse=" AND ")   
# query string: "vaccine AND az vaccine AND covid-19"

# fetch up to 1000 tweets, excluding retweets
tweets = search_tweets(q,lang="en",n=1000,include_rts = FALSE,token = twitter_token)
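
Joining the keywords with AND requires every term to appear in a tweet. For a broader match, the keywords could instead be joined with OR; a minimal sketch (q_or is a hypothetical variant, not used below):

# hypothetical broader query: match tweets containing ANY of the keywords
q_or = paste(key, collapse = " OR ")   # "vaccine OR az vaccine OR covid-19"
# tweets_any = search_tweets(q_or, lang="en", n=1000, include_rts=FALSE, token=twitter_token)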

(3) Cleaning the tweet text

## text-cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # normalize the encoding, dropping non-ASCII characters
  txt = gsub("(@|#)\\w+", "", txt) # remove word characters after @ or # (mentions and hashtags)
  txt = gsub("(http|https)://.*", "", txt) # remove URLs (.: any character, *: zero or more times)
  txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces or tabs
  txt = gsub("\\n"," ",txt) # replace line breaks with spaces
  txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+: one or more) into a single space
  txt = gsub("^\\s+|\\s+$","",txt) # trim leading and trailing whitespace
  txt = gsub("&.*;","",txt) # remove HTML entity codes
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces, and ?!.' (drops emoji etc.)
  txt }
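
A quick sanity check of clean() on a made-up tweet (the sample string is hypothetical):

# hypothetical example tweet: the emoticon, URL, and hashtag are all stripped
sample_tweet = "So relieved to get the jab! :) https://example.com/info #vaccine"
clean(sample_tweet)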


tweets$text = clean(tweets$text)  # apply the cleaning to the text column

df = data.frame() # create an empty data frame
  
df = rbind(df,tweets)  # convert to a plain data frame

df = df[!duplicated(df[,"status_id"]),]  # drop duplicated tweets
head(df)
## # A tibble: 6 x 91
##   user_id status_id created_at          screen_name text  source
##   <chr>   <chr>     <dttm>              <chr>       <chr> <chr> 
## 1 490309~ 13814306~ 2021-04-12 02:14:57 coalspeaker Up t~ Tumblr
## 2 907742~ 13814303~ 2021-04-12 02:14:03 HauschelMa~ This~ Twitt~
## 3 162253~ 13814202~ 2021-04-12 01:33:42 SeanRapley  So t~ Twitt~
## 4 215748~ 13813881~ 2021-04-11 23:26:10 NancyLover~ Can ~ Twitt~
## 5 394087~ 13813876~ 2021-04-11 23:24:23 angie_rasm~ As f~ Twitt~
## 6 121756~ 13813872~ 2021-04-11 23:22:51 Mark_G_Dav~ Bloo~ Twitt~
## # ... with 85 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>,
## #   ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## #   lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## #   quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## #   quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## #   quoted_name <chr>, quoted_followers_count <int>,
## #   quoted_friends_count <int>, quoted_statuses_count <int>,
## #   quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## #   retweet_source <chr>, retweet_favorite_count <int>,
## #   retweet_retweet_count <int>, retweet_user_id <chr>,
## #   retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## #   country_code <chr>, geo_coords <list>, coords_coords <list>,
## #   bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## #   description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>, date <date>

df has 91 columns, but we will only use a few of them here:

  • user_id: user id
  • status_id: tweet id
  • created_at: time the tweet was posted
  • text: tweet content
  • source: client the tweet was posted from

Check the number of tweets and their time span

created_at is already a datetime column, so min and max give the earliest and latest timestamps directly.
Note: rtweet's standard search can only reach back roughly 7-10 days from today.

nrow(df)
## [1] 822
min(df$created_at)
## [1] "2021-04-04 07:02:28 UTC"
max(df$created_at)
## [1] "2021-04-12 02:14:57 UTC"

1.2 Calling the CoreNLP API

(1) Setting up the API calls

Server side: start the CoreNLP server first. Open a terminal in the CoreNLP directory and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
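
Before sending any requests, it helps to confirm the server is actually listening; a minimal check, assuming the default local address and port used below:

# quick sanity check: a running CoreNLP server answers HTTP 200 on its root URL
httr::status_code(httr::GET("http://127.0.0.1:9000/"))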

# build the coreNLP api url: turn the local server address into a coreNLP request url
generate_API_url <- function(host, port="9000",
                    tokenize.whitespace="false", annotators=""){ # tokenize.whitespace="false": don't tokenize on whitespace alone
    url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
    URLencode(url)
}
# address of the service
host = "127.0.0.1"

generate_API_url(host)
# call the coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001);
  # build the api url
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}
# run the documents through the coreNLP service
coreNLP <- function(data,host){
  # send each document to core-nlp in turn; each returns a json result
  # store the parsed results as R objects
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}
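
As a quick smoke test (assuming the server above is running), a single made-up sentence can be sent directly:

# hypothetical one-sentence smoke test against the local server
doc <- call_coreNLP(host, "The AZ vaccine rollout continues in the UK.")
names(doc)                       # contains "sentences"
doc$sentences[[1]]$sentiment     # sentiment label of the first (only) sentence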

(2) Data-wrangling functions

Extract the tokenization results from the returned objects, output in tidy data format

coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # with ssplit.eolonly="true" each tweet is a single sentence, so take the first
    sentences <- doc$sentences
   
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}

Extract the word dependency relations from the returned core-nlp objects, output in tidy data format

coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # with ssplit.eolonly="true" each tweet is a single sentence, so take the first
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}

Extract the sentence-level sentiment from the returned core-nlp objects, output in tidy data format

coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # with ssplit.eolonly="true" each tweet is a single sentence, so take the first
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    sentiment <- original_data %>%
      t() %>% 
      data.frame() %>% 
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
  
    sentiment
  }))
  return(result)
}

Visualizing the dependency tree

Code adapted from: https://stackoverflow.com/questions/35496560/how-to-convert-corenlp-generated-parse-tree-into-data-tree-r-package

# display the parse result as a tree
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

Sending the sentences to the service

Get the objects returned by coreNLP.
Don't run this chunk right away: it takes roughly half an hour (and may crash the session if you only have 4 GB of RAM).

gc() # free unused memory

t0 = Sys.time()
obj = df[,c(2,5)]  %>% filter(text != "") %>% coreNLP(host) # run against the local server
# the object passed to coreNLP must be a data.frame with a text column

Sys.time() - t0 # elapsed time

save.image("coreNLP.RData")
# save everything we need, so the RData can be loaded directly later
 # tokens =  coreNLP_tokens_parser(obj)
 # dependencies = coreNLP_dependency_parser(obj)
 # sentiment = coreNLP_sentiment_parser(obj)
 #save.image("coreNLP_all.RData")

1.3 Extracting the results

(1) Tokenization, lemmatization, POS tagging, NER

tokens =  coreNLP_tokens_parser(obj)
head(tokens,20)
##              status_id      word     lemma pos               ner lower_word
## 1  1381430608052813830        Up        up  IN                 O         up
## 2  1381430608052813830        to        to  IN                 O         to
## 3  1381430608052813830        80        80  CD           PERCENT         80
## 4  1381430608052813830   percent   percent  NN           PERCENT    percent
## 5  1381430608052813830        in        in  IN                 O         in
## 6  1381430608052813830    Sicily    Sicily NNP          LOCATION     sicily
## 7  1381430608052813830    refuse    refuse VBP                 O     refuse
## 8  1381430608052813830        AZ        AZ NNP STATE_OR_PROVINCE         az
## 9  1381430608052813830   vaccine   vaccine  NN                 O    vaccine
## 10 1381430608052813830 president president  NN             TITLE  president
## 11 1381430608052813830    France    France NNP           COUNTRY     france
## 12 1381430608052813830        24        24  CD            NUMBER         24
## 13 1381430608052813830        Up        up  IN                 O         up
## 14 1381430608052813830        to        to  IN                 O         to
## 15 1381430608052813830        80        80  CD           PERCENT         80
## 16 1381430608052813830   percent   percent  NN           PERCENT    percent
## 17 1381430608052813830        of        of  IN                 O         of
## 18 1381430608052813830    people    people NNS                 O     people
## 19 1381430608052813830   offered     offer VBD                 O    offered
## 20 1381430608052813830       the       the  DT                 O        the
##    lower_lemma
## 1           up
## 2           to
## 3           80
## 4      percent
## 5           in
## 6       sicily
## 7       refuse
## 8           az
## 9      vaccine
## 10   president
## 11      france
## 12          24
## 13          up
## 14          to
## 15          80
## 16     percent
## 17          of
## 18      people
## 19       offer
## 20         the
  • coreNLP_tokens_parser columns:
    • status_id: matches status_id in df, the unique id of a tweet
    • word: the raw token
    • lemma: the lemmatized form of the token
    • pos: part-of-speech tag
    • ner: named entity type

(2) Named entity recognition (NER)

  • Check which entity types the NER annotator recognized
unique(tokens$ner)
##  [1] O                 PERCENT           LOCATION          STATE_OR_PROVINCE
##  [5] TITLE             COUNTRY           NUMBER            NATIONALITY      
##  [9] PERSON            ORGANIZATION      DATE              CAUSE_OF_DEATH   
## [13] ORDINAL           DURATION          CITY              MISC             
## [17] SET               TIME              MONEY             IDEOLOGY         
## 20 Levels: O PERCENT LOCATION STATE_OR_PROVINCE TITLE COUNTRY ... IDEOLOGY
# excluding the "O" (Other) tag, how many distinct words were tagged as an entity
length(unique(tokens$word[tokens$ner != "O"])) 
## [1] 817

(3) Lowercasing

Capitalization affects CoreNLP's NER decisions, so the tweets were submitted with their original casing. After the annotators have run, however, we want word frequencies to be case-insensitive, so we create two new columns, lower_word and lower_lemma, holding the lowercased word and lemma. Lowercasing maps different casings of the same term (e.g. covid-19 and COVID-19) to a single form before counting frequencies, as the small check below illustrates.

tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
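
A tiny illustration of why lowercasing matters for counting (the vector here is made up):

# hypothetical example: different casings are counted separately until lowercased
x = c("COVID-19", "Covid-19", "covid-19")
table(x)          # three distinct entries, each with count 1
table(tolower(x)) # a single entry with count 3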

1.4 Exploratory analysis: NER

Countries mentioned (COUNTRY)

Using CoreNLP's NER, we can extract the countries (COUNTRY) mentioned in the COVID-19 vaccine tweets, for a first look at which countries dominate the discussion.

tokens %>%
  filter(ner == "COUNTRY") %>%  #篩選NER為COUNTRY
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 13, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)

  • The UK is mentioned most often, as it was the first country to approve a vaccine for large-scale use
  • Per news reports: at a press conference on 4/8, the Australian government announced that the AZ vaccine would no longer be given to people under 50, recommending Pfizer's COVID-19 vaccine for that group instead
Organizations mentioned (ORGANIZATION)

Using CoreNLP's NER, we can extract the organizations (ORGANIZATION) mentioned in the COVID-19 vaccine tweets, for a first look at the main companies and institutions involved.

tokens %>%
  filter(ner == "ORGANIZATION") %>%  #篩選NER為ORGANIZATION
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)

  • Pfizer is mentioned most often, as its vaccine was the first to be used at scale in the UK
  • The AZ vaccine is also currently being administered nationwide
People mentioned (PERSON)

Using CoreNLP's NER, we can extract the people (PERSON) mentioned in the COVID-19 vaccine tweets, for a first look at the key figures in the discussion.

tokens %>%
  filter(ner == "PERSON") %>%  #篩選NER為PERSON
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()
## `summarise()` ungrouping output (override with `.groups` argument)

  • Vaccine makers: Johnson & Johnson/Janssen, Moderna, Pfizer
  • US president: biden; Australian prime minister: Scott Morrison
  • FDA (U.S. Food and Drug Administration) commissioner: rolando

1.5 Exploratory analysis: Dependency

Dependency parsing results
dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
##              status_id      dep governor governorGloss dependent dependentGloss
## 1  1381430608052813830     ROOT        0          ROOT        40   president...
## 2  1381430608052813830      dep        3            80         1             Up
## 3  1381430608052813830    fixed        1            Up         2             to
## 4  1381430608052813830   nummod        4       percent         3             80
## 5  1381430608052813830    nsubj        7        refuse         4        percent
## 6  1381430608052813830     case        6        Sicily         5             in
## 7  1381430608052813830     nmod        4       percent         6         Sicily
## 8  1381430608052813830      dep       40  president...         7         refuse
## 9  1381430608052813830 compound       11        France         8             AZ
## 10 1381430608052813830 compound       10     president         9        vaccine
## 11 1381430608052813830 compound       11        France        10      president
## 12 1381430608052813830    nsubj       19       offered        11         France
## 13 1381430608052813830 compound       15            80        12             24
## 14 1381430608052813830      dep       15            80        13             Up
## 15 1381430608052813830    fixed       13            Up        14             to
## 16 1381430608052813830   nummod       16       percent        15             80
## 17 1381430608052813830     nmod       11        France        16        percent
## 18 1381430608052813830     case       16       percent        17             of
## 19 1381430608052813830      dep       16       percent        18         people
## 20 1381430608052813830    ccomp        7        refuse        19        offered
Visualizing the dependency tree
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

1.6 Exploratory analysis: Sentiment

Sentence sentiment scores

Sentiment scores run from 0 (lowest) to 4 (highest):
  • 0, 1: very negative, negative
  • 2: neutral
  • 3, 4: positive, very positive
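
Under this coding, a numeric score can be mapped back onto a coarse label; a minimal sketch (sentiment_label is a hypothetical helper, assuming the 0~4 scale above):

# hypothetical helper: map a 0-4 sentimentValue onto a coarse label
sentiment_label = function(v) {
  cut(as.numeric(v), breaks = c(-1, 1.5, 2.5, 5),
      labels = c("negative", "neutral", "positive"))
}
sentiment_label(c(0, 2, 4))  # negative, neutral, positive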

sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
##              status_id
## 1  1381430608052813830
## 2  1381430380323078144
## 3  1381420228790214656
## 4  1381388132856201218
## 5  1381387682723495937
## 6  1381387298969804802
## 7  1381381856071929865
## 8  1381376584796409861
## 9  1381375007746617353
## 10 1381373581276315649
## 11 1381370225623408640
## 12 1381368584690012165
## 13 1381365440576897032
## 14 1381359468655517701
## 15 1381348348406415361
## 16 1379030194565054465
## 17 1381348328500162565
## 18 1381347913981431811
## 19 1381346218404675586
## 20 1381342911938265088
##                                                                                                                                                                                                                                                                                 text
## 1                                       Up to 80 percent in Sicily refuse AZ vaccine president  France 24 Up to 80 percent of people offered the AstraZeneca Covid19 vaccine In Sicily refuse it out of fears over its safety according to the southern Italian regions president...
## 2                                                                                                                        This is a very good summary re blood clots associated with the AZ COVID19 vaccine and recent study findings. Thread byon Thread Reader AppThread Reader App
## 3                                                                                                                                                                                                  So the Morrison Govt secured 50 million doses of the AZ vaccine on August 19 2020
## 4                                                                       Can 55 choose the PfizerModerna option at the Whistler Conf. Ctr. or are they limited to AZ at the pharmacies? All adult residents living and working in Whistler now eligible for COVID19 vaccine  CBC News
## 5  As far as I am aware the blood clot risk is only associated with AstraZeneca the only thing I've heard about JJ is they are investigating if there's a link. With AZ the risk is still VERY low. Your risk of a blood clot from COVID19 is much much higher than for any vaccine.
## 6                                   Blood Clotting Venous Thromboembolism with the AstraZeneca COVID Vaccine in the UK  79 clots 51 Female 28 Male  19 died 13 Female 6 Male In Australia the youngest Female to die of COVID was in her 50's  hence the under 50 restrictions on AZ
## 7                                                                                                                                                                                        TWiV 741In COVID19 clinical update  Daniel Griffin covers children  the effect of vaccines.
## 8                                                                                                                                                                Not sure if either of you are taking meds but heres some info about a recent study although looked at Pfizer not AZ
## 9                                                                                                                   Will the the AZ Vaccine be effective enough to enable protection from COVID19 variants such as the South African 501Y.V2 variant if you are travelling overseas?
## 10                                                                                                                          The risks associated with catching COVID19 are of much greater concern to me than the risks associated with the AZ vaccine. I'm getting jabbed tomorrow!
## 11                                   I didnt say the virus was created before the vaccine. You obviously dont know much about mRNA vaccines. Theyve been tried on other viruses prior to COVID19. Thats one of the reasons they were largely ready to go in terms of the RD segment.
## 12                                                             Great graphic showing relative risk of blood clots with AZ vaccine  lower than OCP smoking and much lower than COVID19 infection. Clot type morbiditymortality differs so individual risk important but very low risk
## 13                                                                                                                                                  Through KJZZ's QAZ reporting project several listeners have asked how homebound seniors can get the coronavirus vaccine.reports.
## 14                                                                                                                              AstraZeneca COVID19 vaccine FAQ Why do the age recommendations keep changing? Does it cause VIPIT blood clots? Is it effective against variants? via
## 15                                                                                                     Nope. Still waiting for my second dose of the AZ vaccine. Might be a little safer in pub gardens when the weathers better with more UV rays around to kill COVID19 particles.
## 16 The bigger picture is the AZ vaccine is that its cheap and easily stored making it vital for countries with poor healthcare infrastructures. The clotting issue is a concern but COVID19 remains the bigger risk  new variants emerge In unvaccinated areas that escape vaccines.
## 17                                                                                                                                                                                                             AZ is still on hold for anyone under 55. No current update from NACI.
## 18                                                                                                                                                    Do not blow the trumpet louder! Covid19 may hear this and try to form a new Varient  while Vaccine Agitators vs AZ battle out.
## 19    In Norway The ChAdOx1 nCoV19 vaccine AstraZeneca has been administered to health care professionals younger than 65 years of age who do not have close contact with patients with Covid19. Pfizer prioritised for healthcare staff in close contact to Covid patients. AZ like
## 20                                                                                                                                                           Also can thebe slowed down with the rise of concerns of blood clotting for some people due to covishield AZ vaccine? 22
##    sentiment sentimentValue
## 1   Negative              1
## 2   Positive              2
## 3    Neutral              3
## 4   Negative              1
## 5   Negative              1
## 6   Negative              1
## 7    Neutral              3
## 8    Neutral              3
## 9    Neutral              3
## 10   Neutral              3
## 11   Neutral              3
## 12  Positive              2
## 13   Neutral              3
## 14  Negative              1
## 15  Negative              1
## 16  Negative              1
## 17  Negative              1
## 18  Negative              1
## 19  Negative              1
## 20   Neutral              3
Sentiment categories present in the dataset
unique(sentiment$sentiment)
## [1] Negative     Positive     Neutral      Verynegative
## Levels: Negative Positive Neutral Verynegative
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# distribution of tweets across sentiment categories
sentiment$sentiment %>% table()
## .
##     Negative     Positive      Neutral Verynegative 
##          372          178          261            2
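
The same table as proportions makes the negative skew easier to read:

# share of each sentiment category
sentiment$sentiment %>% table() %>% prop.table() %>% round(3)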
Average sentiment score over time
df$date = as.Date(df$created_at)

sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment)) + 
  geom_line()
## `summarise()` ungrouping output (override with `.groups` argument)

  • Average sentiment over these days leans negative, staying between 1 and 2

Sentiment over time by client
sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% 
  group_by(date,source) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment,color=source)) + 
  geom_line()
## `summarise()` regrouping output by 'date' (override with `.groups` argument)

  • Broken down by client, Android users fluctuate the most: before 4/7 iPhone users were more positive, after 4/7 Android users were
What vocabulary do positive and negative tweets use?
# vocabulary used in positive tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
  group_by(lower_lemma) %>% # group by lemma
  summarize(count = n()) %>% 
  filter(count > 5 & count < 400) %>%
  filter(!lower_lemma %in% c("az","vaccine","vaccine.","blood","covid")) %>% # drop the topic words themselves
  filter(!lower_lemma %in% lower_lemma[grep("[0-9]",lower_lemma)]) %>% # drop numbers
  wordcloud2()
## Joining, by = "word"
## `summarise()` ungrouping output (override with `.groups` argument)
# vocabulary used in negative tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
  group_by(lower_lemma) %>% 
  summarize(count = n()) %>% 
  filter(count > 10 & count < 400) %>%
  arrange(desc(count)) %>% 
  filter(!lower_lemma %in% c("az","vaccine","covid19","covid")) %>% # drop the topic words themselves
  filter(!lower_lemma %in% lower_lemma[grep("[0-9]",lower_lemma)]) %>% # drop numbers
  wordcloud2()


2. sentimentr: English sentiment analysis

2.1 Applying sentimentr to the Twitter data

Counting the positive words in tweets
library(sentimentr)
set.seed(10)
mytext <- get_sentences(tweets$text) # split the text into sentences
x <- sample(tweets$text, nrow(tweets), replace = FALSE) # shuffle the tweets (sampling without replacement)
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing terms
sentiment_counts <- attributes(sentiment_words)$counts # count how often each term occurs
sentiment_counts[polarity > 0,]   # positive terms
##             words polarity  n
##   1:     benefits      1.0 37
##   2:     efficacy      1.0 17
##   3:     low risk      1.0 11
##   4:     approval      1.0 10
##   5:       please      1.0 10
##  ---                         
## 349:       shares      0.1  1
## 350: collectively      0.1  1
## 351:        peaks      0.1  1
## 352:      prepare      0.1  1
## 353: considerable      0.1  1
Counting the negative words in tweets
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative terms
## Selecting by n
##        words polarity   n
##  1:     risk    -0.75 180
##  2: clotting    -0.40  91
##  3:     clot    -0.40  62
##  4:    risks    -0.25  58
##  5:      jab    -0.60  46
##  6:     died    -0.50  31
##  7:    dying    -0.50  24
##  8:  adverse    -0.50  24
##  9:    death    -0.75  24
## 10:      die    -0.75  19
Highlighting each sentence as positive or negative
set.seed(12)
df%>%
    filter(status_id %in% sample(unique(status_id), 30)) %>% # 30 random tweets
    mutate(review = get_sentences(text)) %$% 
    sentiment_by(review, status_id) %>%
    highlight()
## Saved in C:\Users\emma\AppData\Local\Temp\RtmpQDi7nI/polarity.html
## Opening C:\Users\emma\AppData\Local\Temp\RtmpQDi7nI/polarity.html ...

2.2 Sentiment fluctuation by date

Code adapted from https://github.com/trinker/sentimentr

tweets$date = format(tweets$created_at,'%Y%m%d')

(out  = tweets  %>%  with(
    sentiment_by( # document-level sentiment
        get_sentences(text), 
        list(date) # equivalent to group_by(date)
    )
))
plot(out)

2.3 Sentiment fluctuation by date and client

(out  = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%  with(
    sentiment_by(
        get_sentences(text), 
        list(source, date) # equivalent to group_by(source, date)
    )
))
plot(out)

  • Viewed by client with sentimentr, iPhone users show the highest positive sentiment on 4/12
Converting emoji codes into descriptive text
replace_emoji("\U0001f4aa") # replace_emoji() comes from the textclean package
## [1] " flexed biceps "

Summary

coreNLP

  1. Identified the key people, organizations, and countries of the topic
  2. Used syntactic analysis to extract dependency relations within sentences
  3. Found the most common words in positive and negative tweets, respectively

sentimentr

  1. Extracted the positive and negative terms in the tweets, and identified which sentences in each document are positive or negative
  2. Tracked sentiment fluctuation by date, and by client