Introduction

  • Goal: use coreNLP and sentimentr to analyze Twitter text about COVID-19 vaccines
  • Overview: collect tweets with rtweet, annotate them with Stanford CoreNLP (tokens, lemmas, POS, NER, dependencies, sentence-level sentiment), then run an English sentiment analysis with sentimentr
  • Data source: Twitter, 2021/4/2-4/10, 5,000 tweets, English

1. coreNLP

Install packages

packages = c("dplyr","ggplot2","rtweet","xml2","httr","jsonlite","data.tree","NLP","igraph","sentimentr","tidytext","wordcloud2","DiagrammeR","scales","magrittr","stringr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
load("coreNLP_all.RData")

1.1 Data collection: tweets

(1). Twitter API setup: fetch tweets through rtweet

app = '2021_sma_team3'
# replace the placeholders below with your own Twitter API credentials
consumer_key = 'YOUR_CONSUMER_KEY'
consumer_secret = 'YOUR_CONSUMER_SECRET'
access_token = 'YOUR_ACCESS_TOKEN'
access_secret = 'YOUR_ACCESS_SECRET'
twitter_token <- create_token(app,consumer_key, consumer_secret,
                    access_token, access_secret,set_renv = FALSE)
#Consumer Keys: identify who you are
#Authentication Tokens: the authorization granted to you

(2). Set keywords and fetch tweets

# search keywords
key = c("#COVID19")
context = "#vaccine"
q = paste(c(key,context),collapse=" AND ")   
# the query string is "#COVID19 AND #vaccine"
# requiring both hashtags avoids tweets that mention #COVID19 but are unrelated to vaccines

# fetch 5,000 tweets, excluding retweets
since <- "2021-03-24"
until <- "2021-04-04"

#tweets = search_tweets(q,lang="en",n=5000,since = since, until = until,include_rts = FALSE,token = twitter_token)
tweets = search_tweets(q,lang="en",n=5000,include_rts = FALSE,token = twitter_token)

(3). Cleaning the tweet text

## text-cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # convert the encoding; non-ASCII characters (including emoji) are dropped
  txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (word characters after @ or #)
  txt = gsub("(http|https)://.*", "", txt) # remove URLs (. = any character, * = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces or tabs
  txt = gsub("\\n"," ",txt) # replace line breaks with spaces
  txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+ = one or more) into a single space
  txt = gsub("^\\s+|\\s+$","",txt) # trim leading and trailing whitespace
  txt = gsub("&.*;","",txt) # remove HTML character entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces and ?!.' (drops emoji and other symbols)
  txt }
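A quick sanity check of clean() on a made-up example tweet (the input string here is hypothetical). Note that the greedy .* in the URL rule removes everything from the first URL to the end of the text.

# hypothetical example, just to see what clean() keeps
clean("Thanks @who! Got my #COVID19 vaccine today, details at https://example.com and more text")
# mentions, hashtags and everything from the URL onward are removed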


tweets$text = clean(tweets$text)  # apply the cleaning to the tweet text

df = data.frame()
  
df = rbind(df,tweets)  # convert the tweets to a data frame

df = df[!duplicated(df[,"status_id"]),]  # drop duplicate tweets (by status_id)
head(df)
## # A tibble: 6 x 90
##   user_id   status_id   created_at          screen_name text              source
##   <chr>     <chr>       <dttm>              <chr>       <chr>             <chr> 
## 1 82884108~ 1380803685~ 2021-04-10 08:43:47 latestly    Punjab CM Amarin~ Tweet~
## 2 82884108~ 1378410446~ 2021-04-03 18:13:54 latestly    COVID19 Vaccinat~ Tweet~
## 3 82884108~ 1379759032~ 2021-04-07 11:32:42 latestly    Karnataka Admini~ Tweet~
## 4 82884108~ 1379792594~ 2021-04-07 13:46:04 latestly    Maharashtra Faci~ Tweet~
## 5 82884108~ 1378310279~ 2021-04-03 11:35:53 latestly    Uttar Pradesh Wo~ Tweet~
## 6 82884108~ 1379089604~ 2021-04-05 15:12:38 latestly    Delhi GovtRun Ho~ Tweet~
## # ... with 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>,
## #   ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## #   lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## #   quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## #   quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## #   quoted_name <chr>, quoted_followers_count <int>,
## #   quoted_friends_count <int>, quoted_statuses_count <int>,
## #   quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## #   retweet_source <chr>, retweet_favorite_count <int>,
## #   retweet_retweet_count <int>, retweet_user_id <chr>,
## #   retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## #   country_code <chr>, geo_coords <list>, coords_coords <list>,
## #   bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## #   description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

df has 90 columns, but we will only use a few of them (a short selection sketch follows the list):

  • user_id: user id
  • status_id : tweet id
  • created_at : time the tweet was posted
  • text : tweet content
  • source : client the tweet was posted from
  • location : user-reported location
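A minimal sketch that keeps just these columns (the df_small name is only for illustration):

# keep only the columns used in the rest of the analysis
df_small = df %>% 
  select(user_id, status_id, created_at, text, source, location)
head(df_small)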

Check the number of records and the time distribution

created_at is already a date-time column, so min and max give the earliest and latest tweet dates directly.
Note: rtweet's search can only retrieve tweets from roughly the last 10 days.

nrow(df)
## [1] 4446
min(df$created_at)
## [1] "2021-04-02 06:12:50 UTC"
max(df$created_at)
## [1] "2021-04-10 08:43:47 UTC"

Check the location distribution of the tweets

  • the location field sometimes contains a city name and sometimes a country name
df %>% 
  filter(location != "") %>% 
    group_by(location) %>%
    summarise(n = n(), .groups = "drop_last") %>% 
    top_n(n = 15, n) %>% 
    ggplot(aes(x = reorder(location, n), y = n, fill=n)) +
    geom_bar(stat = "identity") +
    coord_flip() + 
    geom_label(aes(label=n), size=4, fill="white") +
    labs(title = "Countries Location for Tweets") +
    theme(axis.text.x = element_blank(),
                     axis.title = element_blank())

  • country_code: country code of the tweet
  • in order: US (United States), GB (United Kingdom), IN (India), CA (Canada), JM (Jamaica)
df %>% 
  filter(country_code != "") %>% 
    group_by(country_code) %>%
    summarise(n = n(), .groups = "drop_last") %>% 
    top_n(n = 5, n) %>% 
    ggplot(aes(x = reorder(country_code, n), y = n, fill=n)) +
    geom_bar(stat = "identity") +
    coord_flip() + 
    geom_label(aes(label=n), size=4, fill="white") +
    labs(title = "Countries for Tweets") +
    theme(axis.text.x = element_blank(),
                     axis.title = element_blank())

1.2 Connecting to the CoreNLP API

(1). API call setup

Server side: first start the CoreNLP server from a terminal. In the CoreNLP directory, run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000

# build the coreNLP api url: turn the local address into a url for the coreNLP service
generate_API_url <- function(host, port="9000",
                    tokenize.whitespace="false", annotators=""){ # tokenization is not whitespace-based
    url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
    url <- URLencode(url)
}
# location of the service
host = "127.0.0.1"

generate_API_url(host)
# call the coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001);
  # build the api url
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}
# run each document through the coreNLP service
coreNLP <- function(data,host){
  # send each document to core-nlp in turn; each result comes back as json
  # store the results as R objects
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}
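The object passed to coreNLP() must be a data.frame with a text column (this is noted again below where it is run). A tiny hypothetical example of the expected input, left commented out because it needs the CoreNLP server running:

# hypothetical two-row input: any data.frame with a `text` column works
demo = data.frame(status_id = c("1", "2"),
                  text = c("Vaccines help.", "I am worried."),
                  stringsAsFactors = FALSE)
# demo_obj = coreNLP(demo, host)   # requires the CoreNLP server on port 9000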

(2). Parsing functions

Extract the tokenization results from the returned objects and output them in tidy-data format

coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # coreNLP returns a list of sentences; only the first sentence of each tweet is used
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}

Extract the word dependency relations from the returned core-nlp objects, in tidy-data format

coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # only the first sentence of each tweet is used
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}

Extract the sentence-level sentiment from the returned core-nlp objects, in tidy-data format

coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # only the first sentence of each tweet is used
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    sentiment <- original_data %>%
      t() %>% 
      data.frame() %>% 
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
  
    sentiment
  }))
  return(result)
}

Visualizing the dependency tree

Code adapted from: https://stackoverflow.com/questions/35496560/how-to-convert-corenlp-generated-parse-tree-into-data-tree-r-package

# visualize the parse result as a tree
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

Sending the sentences to the service

Get the objects returned by coreNLP.
Do not run this chunk right away: it takes about half an hour (and may crash if you only have 4 GB of RAM).

gc() # free unused memory

t0 = Sys.time()
obj = df[,c(2,5)]  %>% filter(text != "") %>% coreNLP(host) # run against the local server
# the object passed to coreNLP must be a data.frame with a text column

Sys.time() - t0 # elapsed time
#Time difference of 28 mins

save.image("coreNLP.RData")
# save everything needed so the RData can be loaded directly later
# tokens =  coreNLP_tokens_parser(obj)
# dependencies = coreNLP_dependency_parser(obj)
# sentiment = coreNLP_sentiment_parser(obj)
# save.image("coreNLP_all.RData")

1.3 Extracting the results

(1). Tokenization, lemmatization, POS tagging, NER

tokens =  coreNLP_tokens_parser(obj)
head(tokens,20)
##              status_id      word     lemma pos      ner
## 1  1380803685941346304    Punjab    punjab  NN LOCATION
## 2  1380803685941346304        CM        cm  NN        O
## 3  1380803685941346304 Amarinder Amarinder NNP        O
## 4  1380803685941346304     Singh     Singh NNP        O
## 5  1380803685941346304      Says       say VBZ        O
## 6  1380803685941346304       the       the  DT        O
## 7  1380803685941346304     State     state  NN        O
## 8  1380803685941346304        Is        be VBZ        O
## 9  1380803685941346304      Left     leave VBN        O
## 10 1380803685941346304      With      with  IN        O
## 11 1380803685941346304      Only      only  JJ        O
## 12 1380803685941346304      Five      five  CD DURATION
## 13 1380803685941346304      Days       day NNS DURATION
## 14 1380803685941346304        of        of  IN        O
## 15 1380803685941346304   COVID19   covid19  NN        O
## 16 1380803685941346304   Vaccine   vaccine  NN        O
## 17 1380803685941346304     Urges      urge VBZ        O
## 18 1380803685941346304    Centre    Centre NNP        O
## 19 1380803685941346304        To        to  TO        O
## 20 1380803685941346304     Share     share  VB        O
  • coreNLP_tokens_parser columns (a short sketch follows this list):
    • status_id : the tweet's unique id, matching status_id in df
    • word: the original token
    • lemma : the lemmatized form of the token
    • pos : part-of-speech tag
    • ner: named-entity tag
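As a quick look at the annotation, a minimal sketch counting the most frequent part-of-speech tags in the same tokens data frame:

# most common part-of-speech tags in the tokens table
tokens %>% 
  count(pos, sort = TRUE) %>% 
  head(10)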

(2). Named-entity recognition (NER)

  • check which entity types were recognized
unique(tokens$ner)
##  [1] "LOCATION"          "O"                 "DURATION"         
##  [4] "COUNTRY"           "ORGANIZATION"      "NUMBER"           
##  [7] "SET"               "MISC"              "DATE"             
## [10] "NATIONALITY"       "PERSON"            "ORDINAL"          
## [13] "TIME"              "CRIMINAL_CHARGE"   "TITLE"            
## [16] "CITY"              "MONEY"             "CAUSE_OF_DEATH"   
## [19] "STATE_OR_PROVINCE" "PERCENT"           "RELIGION"         
## [22] "URL"               "IDEOLOGY"
# excluding the Other (O) tag, how many distinct words were tagged as a named entity
length(unique(tokens$word[tokens$ner != "O"])) 
## [1] 2850

(3). Lowercasing

Capitalization affects CoreNLP's NER decisions, so the tweets were sent in with their original casing. After the annotators have run, however, we want consistent word-frequency counts, so we create two new columns, lower_word and lower_lemma, holding the lowercased word and lemma. This maps different capitalizations of the same word (e.g. Vaccine and vaccine) to a single form before counting frequencies.

tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
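A minimal sketch of the frequency count this lowercasing enables (same tokens data frame; the stop-word filtering used later is omitted here):

# top lowercased lemmas by frequency
tokens %>% 
  count(lower_lemma, sort = TRUE) %>% 
  head(10)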

1.4 Exploratory analysis - NER

Countries involved (COUNTRY)

Using CoreNLP's NER we can extract the countries (COUNTRY) mentioned when Twitter users discuss COVID-19 vaccines, to get a first look at which countries dominate the discussion.

tokens %>%
  filter(ner == "COUNTRY") %>%  #篩選NER為COUNTRY
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 20, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

Organizations involved (ORGANIZATION)

Likewise, the ORGANIZATION tag shows which organizations come up in the vaccine discussion.

tokens %>%
  filter(ner == "ORGANIZATION") %>%  #篩選NER為ORGANIZATION
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 30, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

People involved (PERSON)

Using CoreNLP's NER we can extract the people (PERSON) mentioned when Twitter users discuss COVID-19 vaccines, to get a first look at the key figures in this discussion.

tokens %>%
  filter(ner == "PERSON") %>%  #篩選NER為PERSON
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

Ideologies involved (IDEOLOGY)

CoreNLP's NER can also pull out the ideologies (IDEOLOGY) mentioned in the vaccine tweets, but the counts are so small that no clear picture emerges.

tokens %>%
  filter(ner == "IDEOLOGY") %>%  #篩選NER為IDEOLOGY
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is IDEOLOGY)") +
  theme(text=element_text(size=14))+
  coord_flip()
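The four plots above differ only in the NER tag and the top-n cutoff, so a small helper can avoid the repetition (a sketch; plot_ner is a name introduced here, not part of the original code):

# helper: bar chart of the most frequent words for a given NER tag
plot_ner <- function(tokens, tag, top = 20) {
  tokens %>%
    filter(ner == tag) %>%
    count(lower_word, name = "count", sort = TRUE) %>%
    top_n(n = top, count) %>%
    ggplot(aes(x = reorder(lower_word, count), y = count)) +
    geom_col() +
    ggtitle(paste0("Word Frequency (NER is ", tag, ")")) +
    theme(text = element_text(size = 14)) +
    coord_flip()
}
# plot_ner(tokens, "COUNTRY"); plot_ner(tokens, "PERSON", top = 10)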

1.5 Exploratory analysis - Dependency

Sentence dependency results
dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
##              status_id        dep governor governorGloss dependent
## 1  1380803685941346304       ROOT        0          ROOT        17
## 2  1380803685941346304   compound        2            CM         1
## 3  1380803685941346304   obl:tmod       17         Urges         2
## 4  1380803685941346304   compound        4         Singh         3
## 5  1380803685941346304      nsubj        5          Says         4
## 6  1380803685941346304  acl:relcl        2            CM         5
## 7  1380803685941346304        det        7         State         6
## 8  1380803685941346304 nsubj:pass        9          Left         7
## 9  1380803685941346304   aux:pass        9          Left         8
## 10 1380803685941346304      ccomp        5          Says         9
## 11 1380803685941346304       case       12          Five        10
## 12 1380803685941346304       amod       12          Five        11
## 13 1380803685941346304        obl        9          Left        12
## 14 1380803685941346304      nsubj       17         Urges        13
## 15 1380803685941346304       case       16       Vaccine        14
## 16 1380803685941346304   compound       16       Vaccine        15
## 17 1380803685941346304       nmod       13          Days        16
## 18 1380803685941346304        obj       17         Urges        18
## 19 1380803685941346304       mark       20         Share        19
## 20 1380803685941346304      xcomp       17         Urges        20
##    dependentGloss
## 1           Urges
## 2          Punjab
## 3              CM
## 4       Amarinder
## 5           Singh
## 6            Says
## 7             the
## 8           State
## 9              Is
## 10           Left
## 11           With
## 12           Only
## 13           Five
## 14           Days
## 15             of
## 16        COVID19
## 17        Vaccine
## 18         Centre
## 19             To
## 20          Share
Visualizing the dependency tree
# parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
parse_tree <- obj[[100]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

1.6 Exploratory analysis - Sentiment

Sentence sentiment scores

Sentiment scores range from 0 (most negative) to 4 (most positive):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive

sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
##              status_id
## 1  1380803685941346304
## 2  1378410446164688897
## 3  1379759032144044034
## 4  1379792594750689288
## 5  1378310279470051335
## 6  1379089604574281729
## 7  1379760842695188480
## 8  1380046808978628608
## 9  1377937358453018624
## 10 1379795359149350916
## 11 1379756439179198466
## 12 1379113199174213635
## 13 1380802797285871617
## 14 1380802023377960965
## 15 1380069718120349698
## 16 1379448739383902211
## 17 1380800216270565377
## 18 1379802064335818757
## 19 1379455392837730304
## 20 1380799689784713222
##                                                                                                                                                                                                       text
## 1                                                                        Punjab CM Amarinder Singh Says the State Is Left With Only Five Days of COVID19 Vaccine Urges Centre To Share Schedule for Supply
## 2                                                                                COVID19 Vaccination in India Centre Asks States UTs Not To Allow Fresh Registration of Healthcare Workers for Inoculation
## 3                                                                                                                   Karnataka Administered Over 50 Lakh People With COVID19 Vaccine Says Health Department
## 4                                                                                                      Maharashtra Facing COVID19 Vaccine Shortage State Govt Demands 40 Lakh Doses Every Week From Centre
## 5                                                                                                                 Uttar Pradesh Woman Given Two Doses of COVID19 Vaccine DM Orders Inquiry into Negligence
## 6                                                                                                                      Delhi GovtRun Hospitals COVID19 Vaccination Sites To Open for 24 Hours From April 6
## 7                                                                                                            Nepal Starts COVID19 Vaccination Drive With Chinese Vaccine Vero Cell Despite Safety Concerns
## 8                                                                                                                                                Uddhav Thackeray Takes His Second Dose of COVID19 Vaccine
## 9                                                                                                 Maharashtra Plea Filed in Bombay High Court for DoortoDoor COVID19 Vaccination for People Above 75 Years
## 10                                                                             No COVID19 Vaccine Shortage in Maharashtra Central Govt Replenishes More Vaccine Than What Is Needed Says Prakash Javadekar
## 11                                                                                                                         Mexico Grants Authorisation for Emergency Use of Indias COVID19 Vaccine Covaxin
## 12                                                                                                                           Sri Lanka Begins COVID19 Vaccination of Chinese Expats With Sinopharm Vaccine
## 13                                                                                                         The Imaan Pharmacycentre is available for more bookings this 11th 13th and 14th April. Book via
## 14                                                                                             all these books saying i survived tis i survived that...screw that ima write one ab how i survived covid 19
## 15                                                   Are you aandorin ? Have you taken the 19 ? Well NOW is your chance! CAHN is hosting aPopUpon Saturday 10th April  9AM5PM  West Indian Centre M14 4SW.
## 16                                                  will be hosting a 19on Saturday 10th April  9AM5PM  West Indian Centre . We would like to encourage people within theandto visit our to take the Book 
## 17                                                                                                                     Have you booked to take your 19yet? Well there is still a chance to  Book NOW here 
## 18                                                   Are you aandorin ? Have you taken the 19 ? Well NOW is your chance! CAHN is hosting aPopUpon Saturday 10th April  9AM5PM  West Indian Centre M14 4SW.
## 19                                                                           has written an opento encourageto take the signed by some of the most highprofile names in the Click here to watch full video
## 20 I have gotCovid 19 vaccine the process went easy and didnt took time thanks to the MOH for the facilities and I urge people to do so in order to have a community free of COVID and to save more lives.
##    sentiment sentimentValue
## 1   Negative              1
## 2    Neutral              2
## 3    Neutral              2
## 4    Neutral              2
## 5    Neutral              2
## 6    Neutral              2
## 7    Neutral              2
## 8    Neutral              2
## 9    Neutral              2
## 10  Negative              1
## 11   Neutral              2
## 12   Neutral              2
## 13  Positive              3
## 14   Neutral              2
## 15   Neutral              2
## 16   Neutral              2
## 17   Neutral              2
## 18   Neutral              2
## 19   Neutral              2
## 20   Neutral              2
Sentiment classes present in the data set
unique(sentiment$sentiment)
## [1] "Negative"     "Neutral"      "Positive"     "Verypositive" "Verynegative"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# distribution of tweets over the sentiment classes
sentiment$sentiment %>% table()
## .
##     Negative      Neutral     Positive Verynegative Verypositive 
##          926         2786          709            1            2
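To collapse the five CoreNLP classes into a coarser negative/neutral/positive split, a minimal sketch using the same sentiment data frame (sentimentValue was converted to numeric above):

# collapse the 0-4 scale into three groups
sentiment %>% 
  mutate(label = case_when(sentimentValue <= 1 ~ "negative",
                           sentimentValue == 2 ~ "neutral",
                           TRUE                ~ "positive")) %>% 
  count(label)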
Average sentiment score over time
df$date = as.Date(df$created_at)

sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment)) + 
  geom_line()

Sentiment over time by client
sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% 
  group_by(date,source) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment,color=source)) + 
  geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.

What does the sentiment distribution look like, and which words do positive and negative tweets use?
# word usage in positive tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
  group_by(lower_lemma) %>% #根據lemma分組
  summarize(count = n()) %>% 
  filter(count >15 & count<400)%>%
  wordcloud2()
## Joining, by = "word"

(word cloud of positive tweets)

# word usage in negative tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
  group_by(lower_lemma) %>% 
  summarize(count = n()) %>% 
  filter(count >10 &count<400)%>%
  wordcloud2()

(word cloud of negative tweets)

2. Sentimentr: English sentiment analysis

2.1 Introducing sentimentr

library(sentimentr)
## Warning: package 'sentimentr' was built under R version 4.0.5
mytext <- c(
    'do you like it?  But I hate really bad dogs',
    'I am the best friend.',
    'Do you really like it?  I\'m not a fan'
)

mytext <- get_sentences(mytext) # convert the character vector into a list of character vectors, one per text, already split into sentences
Sentiment score per document

Sentiment scores center on 0 and usually fall roughly between -1 and 1 (they are not strictly bounded, as the output below shows): scores < 0 are negative, > 0 positive, and 0 neutral.

sentiment_by(mytext) #document level
##    element_id word_count       sd ave_sentiment
## 1:          1         10 1.497465    -0.8088680
## 2:          2          5       NA     0.5813777
## 3:          3          9 0.284605     0.2196345
Sentiment score per sentence
sentiment(mytext) #sentence level
##    element_id sentence_id word_count  sentiment
## 1:          1           1          4  0.2500000
## 2:          1           2          6 -1.8677359
## 3:          2           1          5  0.5813777
## 4:          3           1          5  0.4024922
## 5:          3           2          4  0.0000000
  • returns a data frame with 4 columns (see the aggregation note after this list):
    • element_id – which document
    • sentence_id – which sentence within that document
    • word_count – number of words in the sentence
    • sentiment – sentiment score of the sentence
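Note that by default sentiment_by aggregates the sentence scores with a down-weighted-zero average rather than a plain mean, which is why the document-level ave_sentiment above is not always the simple mean of the sentence scores. A sketch of requesting a plain mean instead (averaging.function and average_mean are taken from sentimentr's documentation and assumed here):

# aggregate sentence scores with a plain mean instead of the default average_downweighted_zero
sentiment_by(mytext, averaging.function = average_mean)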

2.2 Applying sentimentr to the Twitter data

Positive words in the tweets
set.seed(10)
mytext <- get_sentences(tweets$text) # split the tweet text into sentences (a list of character vectors)
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1,000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count how often each word appears
sentiment_counts[polarity > 0,]   # positive words
##           words polarity  n
##   1:     please      1.0 10
##   2:        top      1.0  7
##   3:       care      1.0  5
##   4: understand      1.0  4
##   5:      truth      1.0  4
##  ---                       
## 399:   overcome      0.1  1
## 400: compliance      0.1  1
## 401:       pray      0.1  1
## 402:      moral      0.1  1
## 403:     church      0.1  1
Negative words in the tweets
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
##          words polarity  n
##  1:       shot    -0.40 26
##  2: government    -0.50 20
##  3:   pandemic    -1.00 20
##  4:      virus    -0.50 17
##  5:    disease    -1.00 17
##  6:      limit    -0.25 14
##  7:       risk    -0.75 13
##  8:       stop    -0.40 12
##  9:     issues    -1.00 10
## 10:       drug    -0.10  9
## 11:        jab    -0.60  9
Highlight each sentence as positive or negative
set.seed(12)
df%>%
    filter(status_id %in% sample(unique(status_id), 30)) %>% # randomly pick 30 tweets
    mutate(review = get_sentences(text)) %$% 
    sentiment_by(review, status_id) %>%
    highlight()
## Saved in C:\Users\user\AppData\Local\Temp\RtmpOm6qnY/polarity.html
## Opening C:\Users\user\AppData\Local\Temp\RtmpOm6qnY/polarity.html ...

2.3 Sentiment fluctuation by date

Code adapted from https://github.com/trinker/sentimentr

tweets$date = format(tweets$created_at,'%Y%m%d')

(out  = tweets  %>%  with(
    sentiment_by( #document level
        get_sentences(text), 
        list( date)
    )
))
plot(out)


2.4 Sentiment fluctuation by date and client

(out  = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%  with(
    sentiment_by(
        get_sentences(text), 
        list(source, date)
    )
))
plot(out)

Converting emoji codes to descriptive text
replace_emoji("\U0001f4aa")
## [1] " flexed biceps "

Summary