A. Motivation and Purpose of the Analysis

B. Description of the Dataset

B-1 Installing Packages

Install any missing packages, then load the libraries used throughout the analysis.

packages = c("dplyr","ggplot2","rtweet","xml2","httr","jsonlite","data.tree","NLP","igraph","sentimentr","tidytext","wordcloud2","DiagrammeR","scales","stringr","magrittr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(sentimentr)
load("coreNLP_all.RData")  # load the pre-computed coreNLP results from section C-1 (avoids re-running the ~30-minute annotation)

B-2 Data Collection: Tweets

(1). Twitter API setup: fetching tweets with rtweet

app = 'Emotions COVID-19 Vaccine'
consumer_key = 'YOUR_CONSUMER_KEY'        # credentials redacted; never publish API keys
consumer_secret = 'YOUR_CONSUMER_SECRET'
access_token = 'YOUR_ACCESS_TOKEN'
access_secret = 'YOUR_ACCESS_SECRET'
twitter_token <- create_token(app, consumer_key, consumer_secret,
                    access_token, access_secret, set_renv = FALSE)
# Consumer keys: identify the application (who you are)
# Authentication tokens: the authorization granted to that application

(2). Searching tweets by keyword

# Search keyword
key = c("#Prayfornanggala402")
context = NULL   # optional term(s) that must also appear in every tweet; none are needed for this query
q = paste(c(key, context), collapse = " AND ")
# Resulting query: "#Prayfornanggala402"
# If a context term were supplied (e.g. key "#COVID-19" with context "Vaccine"), the query would become
# "#COVID-19 AND Vaccine", keeping only tweets that mention both terms rather than the hashtag alone.

# Fetch up to 8,000 tweets, excluding retweets
tweets = search_tweets(q, lang = "en", n = 8000, include_rts = FALSE, token = twitter_token)

(3). Cleaning the tweet text

## text-cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # re-encode, dropping non-ASCII characters such as emoji
  txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (word characters after @ or #)
  txt = gsub("(http|https)://.*", "", txt) # remove URLs and everything after them (. = any character, * = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces/tabs
  txt = gsub("\\n"," ",txt) # replace newlines with spaces
  txt = gsub("\\s+"," ",txt) # collapse repeated whitespace into a single space (+ = one or more)
  txt = gsub("^\\s+|\\s+$","",txt) # trim leading and trailing whitespace
  txt = gsub("&.*;","",txt) # remove HTML character entities
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces and ? ! . ' (drops remaining symbols/emoji)
  txt }
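
To make the regular expressions concrete, here is a quick illustration on a made-up tweet (not from the dataset). Note that because the URL pattern matches to the end of the string, any text after a link is dropped as well; cleaning should yield:

clean("Check this out @user #Prayfornanggala402 https://t.co/abc123 so sad")
## [1] "Check this out"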


tweets$text = clean(tweets$text)  # apply the cleaning function to the tweet text

df = data.frame()
  
df = rbind(df,tweets)  # convert to a data frame

df = df[!duplicated(df[,"status_id"]),]  # drop duplicate tweets (same status_id)
head(df)
## # A tibble: 6 x 90
##   user_id   status_id   created_at          screen_name  text            source 
##   <chr>     <chr>       <dttm>              <chr>        <chr>           <chr>  
## 1 13108524~ 1387299755~ 2021-04-28 06:56:51 GreyGeordie  out of curious~ Twitte~
## 2 13108524~ 1387077236~ 2021-04-27 16:12:38 GreyGeordie  idk the meanin~ Twitte~
## 3 70889196~ 1387284663~ 2021-04-28 05:56:52 AmitTiw8519~ To my sisters ~ Twitte~
## 4 12373699~ 1387264694~ 2021-04-28 04:37:31 Jih_Yooo     ON ETERNAL PAT~ Twitte~
## 5 392759577 1387260528~ 2021-04-28 04:20:58 yenjan       TKS Tae tae     Twitte~
## 6 12988991~ 1387255561~ 2021-04-28 04:01:14 BTS_twt_KTHV Heres a song f~ Twitte~
## # ... with 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>,
## #   ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## #   lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## #   quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## #   quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## #   quoted_name <chr>, quoted_followers_count <int>,
## #   quoted_friends_count <int>, quoted_statuses_count <int>,
## #   quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## #   retweet_source <chr>, retweet_favorite_count <int>,
## #   retweet_retweet_count <int>, retweet_user_id <chr>,
## #   retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## #   country_code <chr>, geo_coords <list>, coords_coords <list>,
## #   bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## #   description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

df has 90 columns, but only a few of them are used here (a short column-selection sketch follows the list):

  • user_id: user id
  • status_id: tweet id
  • created_at: time the tweet was posted
  • text: tweet content
  • source: client from which the tweet was posted
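
A minimal sketch keeping only these columns (the df_small name is introduced here purely for illustration):

df_small = df %>% select(user_id, status_id, created_at, text, source)
head(df_small)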

Check how many records there are and how they are spread over time.

created_at is already a date-time column, so min and max give the earliest and latest timestamps directly.
Note: rtweet can only retrieve tweets from roughly the last 10 days.

nrow(df)
## [1] 7992
min(df$created_at)
## [1] "2021-04-23 23:00:47 UTC"
max(df$created_at)
## [1] "2021-04-28 06:56:51 UTC"

B-3 Description of the Dataset

C. Analysis Process

C-1 coreNLP

(1). Setting up the API calls

Server side: start a CoreNLP server from a terminal first. In the CoreNLP installation directory, run:
java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
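
Before sending any documents, it can be worth confirming from R that the server is reachable. A minimal sketch (the CoreNLP server serves its interactive status/demo page at the root URL, so a plain GET should return 200 once startup has finished):

# should print 200 when the server is up
httr::status_code(httr::GET("http://127.0.0.1:9000/"))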

# Build the coreNLP API url: turn the local server address into a url carrying the requested properties
generate_API_url <- function(host, port="9000",
                    tokenize.whitespace="false", annotators=""){ # tokenize.whitespace = "false": do not tokenize on whitespace alone
    url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
    url <- URLencode(url)
}
# location of the coreNLP service
host = "127.0.0.1"

generate_API_url(host)  # example call (the url is returned invisibly)
# Call the coreNLP api
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # assume two core-nlp servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001);
  # build the api url
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}
# Run a whole set of documents through the coreNLP service
coreNLP <- function(data,host){
  # Send each document to core-nlp in turn; each one comes back as parsed json,
  # stored in R as a list holding the coreNLP object together with the original row
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}
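
As noted again further below, the object passed to coreNLP() must be a data.frame with a text column. A minimal sketch on a single made-up row (demo_doc and demo_obj are illustrative names only), assuming the local server above is running:

demo_doc <- data.frame(status_id = "demo", text = "Rest in peace, sailors.", stringsAsFactors = FALSE)
demo_obj <- coreNLP(demo_doc, host)
# demo_obj[[1]]$doc holds the parsed json; demo_obj[[1]]$data holds the original row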

(2). Helper functions for tidying the results

Parse the tokenization results out of the returned objects into tidy-data format. Note that only the first sentence of each document (sentences[[1]]) is used.

coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # sentences of this document; only the first one is used
    sentences <- doc$sentences
   
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}

Parse the word dependency relations out of the returned core-nlp objects into tidy-data format.

coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # sentences of this document; only the first one is used
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}

Parse the sentence-level sentiment out of the returned core-nlp objects into tidy-data format.

coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # sentences of this document; only the first one is used
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    sentiment <- original_data %>%
      t() %>% 
      data.frame() %>% 
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
  
    sentiment
  }))
  return(result)
}

Plotting the parse tree

Code adapted from: https://stackoverflow.com/questions/35496560/how-to-convert-corenlp-generated-parse-tree-into-data-tree-r-package

# Plot a coreNLP parse tree as a data.tree/igraph structure
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

Sending the documents to the service

Obtain the objects returned by coreNLP.
Do not run this chunk right away: it takes about half an hour (and may crash a machine with only 4 GB of RAM).

#gc() # free memory that is no longer used

#t0 = Sys.time()
#obj = df[,c(2,5)]  %>% filter(text != "") %>% coreNLP(host) 

# Runs against the local server; the object passed to coreNLP must be a data.frame with a text column

#Sys.time() - t0 # elapsed time


#Time difference of 30 mins
#save.image("coreNLP.RData")
# Save everything that will be needed later so the RData file can simply be reloaded

#tokens =  coreNLP_tokens_parser(obj)
#dependencies = coreNLP_dependency_parser(obj)
#sentiment = coreNLP_sentiment_parser(obj)
#save.image("coreNLP_all.RData")

D. Visualized Results and Interpretation

(1). Tokenization, lemmatization, POS tagging, and NER

tokens =  coreNLP_tokens_parser(obj)
head(tokens,20)
##              status_id        word       lemma  pos ner
## 1  1387299755584233472         out         out   IN   O
## 2  1387299755584233472          of          of   IN   O
## 3  1387299755584233472  curiousity  curiousity   NN   O
## 4  1387299755584233472      here's       here'  NNS   O
## 5  1387299755584233472          my          my PRP$   O
## 6  1387299755584233472        take        take   NN   O
## 7  1387299755584233472          on          on   IN   O
## 8  1387299755584233472         the         the   DT   O
## 9  1387299755584233472    poignant    poignant   JJ   O
## 10 1387299755584233472    farewell    farewell   NN   O
## 11 1387299755584233472        sung        sing  VBN   O
## 12 1387299755584233472          by          by   IN   O
## 13 1387299755584233472         the         the   DT   O
## 14 1387299755584233472 Nanggala402 nanggala402   NN   O
## 15 1387299755584233472     sailors      sailor  NNS   O
## 16 1387299755584233472      before      before   IN   O
## 17 1387299755584233472         its         its PRP$   O
## 18 1387299755584233472     eternal     eternal   JJ   O
## 19 1387299755584233472  departure.  departure.   NN   O
## 20 1387299755584233472        RIP.        rip.   NN   O
  • Columns returned by coreNLP_tokens_parser (a small frequency sketch follows this list):
    • status_id: matches status_id in df, the unique id of a tweet
    • word: the token as it appears in the text
    • lemma: the lemmatized form of the token
    • pos: part-of-speech tag
    • ner: named-entity tag
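
For example, a minimal sketch counting the most frequent lemmas over the whole corpus (reusing the tokens table above):

tokens %>%
  count(lemma, sort = TRUE) %>%  # frequency of each lemma, highest first
  head(10)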

(2). Named-entity recognition (NER)

  • Use the NER tags to see which entity types were recognized in this dataset
unique(tokens$ner)
##  [1] "O"                 "LOCATION"          "DATE"             
##  [4] "CAUSE_OF_DEATH"    "PERSON"            "ORGANIZATION"     
##  [7] "NUMBER"            "SET"               "TIME"             
## [10] "COUNTRY"           "DURATION"          "MISC"             
## [13] "NATIONALITY"       "TITLE"             "CITY"             
## [16] "STATE_OR_PROVINCE" "RELIGION"
# excluding tokens tagged O (no entity), how many distinct words received an entity tag?
length(unique(tokens$word[tokens$ner != "O"])) 
## [1] 436

(3). Lowercasing

Capitalization affects coreNLP's NER decisions, so the tweets were passed in with their original casing. After running the annotators, however, different casings of the same word (e.g. Nanggala402 and nanggala402) should not be counted separately, so two new columns, lower_word and lower_lemma, store the lowercased word and lemma for computing word frequencies.

tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)

D-1 Exploratory Analysis - NER

Countries involved (COUNTRY)

The coreNLP NER tags let us extract the countries (COUNTRY) mentioned in tweets discussing Prayfornanggala402, giving a first look at which countries the topic mainly involves.

tokens %>%
  filter(ner == "COUNTRY") %>%  # keep tokens tagged COUNTRY
  group_by(lower_word) %>% # group by the lowercased word
  summarize(count = n()) %>% # count each group
  top_n(n = 13, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

  • The chart shows which country is mentioned most often (a one-line check is sketched below).
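
Since the chart itself is not reproduced in this text, a minimal sketch to print the single most frequent COUNTRY token (reusing the tokens table above):

tokens %>%
  filter(ner == "COUNTRY") %>%
  count(lower_word, sort = TRUE) %>%  # most frequently mentioned countries first
  slice(1)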

Organizations involved (ORGANIZATION)

Similarly, the NER tags let us extract the organizations (ORGANIZATION) mentioned in tweets discussing Prayfornanggala402, giving a first look at the main companies and institutions involved.

tokens %>%
  filter(ner == "ORGANIZATION") %>%  # keep tokens tagged ORGANIZATION
  group_by(lower_word) %>% # group by the lowercased word
  summarize(count = n()) %>% # count each group
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

People involved (PERSON)

Finally, the NER tags let us extract the people (PERSON) mentioned in tweets discussing Prayfornanggala402, giving a first look at the main figures in the discussion.

tokens %>%
  filter(ner == "PERSON") %>%  # keep tokens tagged PERSON
  group_by(lower_word) %>% # group by the lowercased word
  summarize(count = n()) %>% # count each group
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

D-2 Exploratory Analysis - Dependencies

Dependency parsing results
dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
##              status_id       dep governor governorGloss dependent
## 1  1387299755584233472      ROOT        0          ROOT        10
## 2  1387299755584233472       dep        6          take         1
## 3  1387299755584233472      case        4        here's         2
## 4  1387299755584233472  compound        4        here's         3
## 5  1387299755584233472       obl        1           out         4
## 6  1387299755584233472 nmod:poss        6          take         5
## 7  1387299755584233472       dep       10      farewell         6
## 8  1387299755584233472      case       10      farewell         7
## 9  1387299755584233472       det       10      farewell         8
## 10 1387299755584233472      amod       10      farewell         9
## 11 1387299755584233472       acl       10      farewell        11
## 12 1387299755584233472      case       15       sailors        12
## 13 1387299755584233472       det       15       sailors        13
## 14 1387299755584233472  compound       15       sailors        14
## 15 1387299755584233472       obl       11          sung        15
## 16 1387299755584233472      case       20          RIP.        16
## 17 1387299755584233472 nmod:poss       20          RIP.        17
## 18 1387299755584233472      amod       20          RIP.        18
## 19 1387299755584233472  compound       20          RIP.        19
## 20 1387299755584233472       obl       11          sung        20
##    dependentGloss
## 1        farewell
## 2             out
## 3              of
## 4      curiousity
## 5          here's
## 6              my
## 7            take
## 8              on
## 9             the
## 10       poignant
## 11           sung
## 12             by
## 13            the
## 14    Nanggala402
## 15        sailors
## 16         before
## 17            its
## 18        eternal
## 19     departure.
## 20           RIP.
Visualizing the parse tree
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse  # constituency parse of the first sentence of one example document (the 113th)
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

D-3 Sentiment Analysis - coreNLP Sentiment

Sentence sentiment scores

The coreNLP sentiment score ranges from 0 (most negative) to 4 (most positive):
  • 0, 1: very negative, negative
  • 2: neutral
  • 3, 4: positive, very positive

sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
##              status_id
## 1  1387299755584233472
## 2  1387077236252024832
## 3  1387284663580823557
## 4  1387264694205902849
## 5  1387260528775491586
## 6  1387255561956126725
## 7  1387153770258128897
## 8  1387076208836308992
## 9  1387068237234376705
## 10 1386980509079654405
## 11 1386958160322121730
## 12 1386946588551966720
## 13 1385856804110835715
## 14 1386931017341186050
## 15 1386890600947982339
## 16 1386856290983415808
## 17 1386838780422750213
## 18 1386823676473925637
## 19 1386766961514881025
## 20 1386712656287916034
##                                                                                                                                                                    text
## 1                                          out of curiousity here's my take on the poignant farewell sung by the Nanggala402 sailors before its eternal departure. RIP.
## 2                                                  idk the meaning of the lyrics. but to learn that this vid made before they departed... is just heartbreaking... RIP.
## 3  To my sisters and brothers in Assam who are now dealing with the double blow of an earthquake and the rampaging second wave of COVID I send you my love and prayers.
## 4                                                            ON ETERNAL PATROL !!! ASCENDING TO THE HIGHER PLACE THAN THE SURFACE SAILORS HEAVEN IS THE LAST DOCKING...
## 5                                                                                                                                                           TKS Tae tae
## 6                                                                                                    Heres a song for you Oceans Where Feet May Fail by Hillsong UNITED
## 7                                                                                                                                              Thks For your Simpathy..
## 8                                                                                                                   Becuase its a best love time so i wanna stop time. 
## 9                                                                                                                                                         Breaking News
## 10                                                                                                                                                            So... sad
## 11                              sending my prayers to all the crews of KRI Nanggala 402 and their families. hopefully all the crews can be found and return home safely
## 12                                                                                       Align Right Hope you all liked this Leave your thoughts in the comment section
## 13                                                                    Make it simple but significant Hope you all liked this Leave your thoughts in the comment section
## 14                             Every Morning You Have Two Choices Continue To Sleep With Your Dreams Or Wake Up And Chase Them. BEAUTIFUL MORNING TO ALL MY SUBSCRIBERS
## 15                                                                                        Poignant video shows crew of sunken Indonesia submarine singing farewell song
## 16                                                                            'protect our family and motherland from harm under tides and ocean. rest in peace sailor'
## 17                                                         Even though I'm not ready to be missing you. I'm not ready to live without you. I wish all the best for you.
## 18                                                                                                              Farewell soldiers and keep your duty JALESVEVA JAYAMAHE
## 19                                          On Eternal Patrol  Berpatroli untuk Selamanya. Sending love and prayers for all passengers crews and families. Via sobatNET
## 20              Our deepest condolence to the crews of KRI Nanggala 402. May them Rest in Peace and God give the strength to their families. You'll never be forgotten.
##    sentiment sentimentValue
## 1    Neutral              2
## 2    Neutral              2
## 3   Negative              1
## 4    Neutral              2
## 5    Neutral              2
## 6    Neutral              2
## 7    Neutral              2
## 8   Positive              3
## 9    Neutral              2
## 10  Negative              1
## 11  Positive              3
## 12  Negative              1
## 13   Neutral              2
## 14  Negative              1
## 15  Negative              1
## 16  Positive              3
## 17  Negative              1
## 18   Neutral              2
## 19   Neutral              2
## 20  Negative              1
Sentiment categories present in the dataset
unique(sentiment$sentiment)
## [1] "Neutral"      "Negative"     "Positive"     "Verypositive"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# distribution of tweets across sentiment categories
sentiment$sentiment %>% table()
## .
##     Negative      Neutral     Positive Verypositive 
##          605         2462         4913           10
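
A small sketch to visualize this distribution, collapsing the handful of Verypositive tweets into the Positive group (assuming the sentiment table built above):

sentiment %>%
  mutate(group = ifelse(sentiment %in% c("Positive", "Verypositive"), "Positive", sentiment)) %>%
  count(group) %>%
  ggplot(aes(group, n)) +
  geom_col() +
  ggtitle("Tweets per sentiment category")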
Average sentiment score over time
df$date = as.Date(df$created_at)

sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment)) + 
  geom_line()

Sentiment over time for different clients (source)
sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% 
  group_by(date,source) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment,color=source)) + 
  geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.

What is the sentiment distribution, and which words are used in positive versus negative tweets?
# vocabulary used in positive tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('the')) %>% 
  filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
  group_by(lower_lemma) %>% # group by the lowercased lemma
  summarize(count = n()) %>% 
  filter(count >5 & count<400)%>%
  wordcloud2()
## Joining, by = "word"
# vocabulary used in negative tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
  group_by(lower_lemma) %>% 
  summarize(count = n()) %>% 
  filter(count >10 &count<400)%>%
  wordcloud2()


D-4 Sentiment Analysis - Applying sentimentr to the Twitter Data

Positive words in the tweets
set.seed(10)
mytext <- get_sentences(tweets$text) # split the tweet text into sentences (a sentimentr sentences object)
x <- sample(tweets$text, 1000, replace = FALSE) # random sample of 1,000 tweets, drawn without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count how often each one occurs
sentiment_counts[polarity > 0,]   # positive words
##          words polarity   n
##  1:     please     1.00  27
##  2:       care     1.00   3
##  3:   almighty     1.00   2
##  4:      bless     1.00   2
##  5:    quickly     1.00   2
##  6:  sincerity     1.00   1
##  7:    blessed     1.00   1
##  8:      buddy     1.00   1
##  9:     safety     0.80  14
## 10:     heroes     0.80   8
## 11:   miracles     0.80   7
## 12:  available     0.80   5
## 13:     heaven     0.80   3
## 14:       wish     0.80   2
## 15:     wishes     0.80   2
## 16:       well     0.80   2
## 17:    wishing     0.80   1
## 18: protecting     0.80   1
## 19:    provide     0.80   1
## 20:      saved     0.80   1
## 21:   birthday     0.80   1
## 22:      peace     0.75  69
## 23:    miracle     0.75  32
## 24:       fair     0.75  27
## 25:       good     0.75  21
## 26:       safe     0.75  10
## 27:  beautiful     0.75   6
## 28:    protect     0.75   5
## 29:    sincere     0.75   4
## 30:       love     0.75   3
## 31:    healthy     0.75   2
## 32:       hero     0.75   2
## 33:   heavenly     0.75   1
## 34:  guarantee     0.75   1
## 35:      enjoy     0.75   1
## 36:    courage     0.75   1
## 37:    hopeful     0.75   1
## 38:       ease     0.75   1
## 39:        hug     0.75   1
## 40:      happy     0.75   1
## 41:      found     0.60 669
## 42:  hopefully     0.60 662
## 43:     hoping     0.60   7
## 44:      guide     0.60   3
## 45:     safely     0.50 642
## 46:       hope     0.50  28
## 47:   greatest     0.50  16
## 48:       best     0.50  13
## 49:       dear     0.50   7
## 50:      thank     0.50   7
## 51:       save     0.50   6
## 52:      loved     0.50   5
## 53:     strong     0.50   4
## 54:      brave     0.50   4
## 55:   strength     0.50   3
## 56:    beloved     0.50   3
## 57:      great     0.50   2
## 58:       nice     0.50   1
## 59: productive     0.50   1
## 60:    happily     0.50   1
## 61:      alive     0.50   1
## 62:      shine     0.50   1
## 63:     wealth     0.50   1
## 64:    granted     0.50   1
## 65:       dawn     0.50   1
## 66:   patience     0.50   1
## 67:   abundant     0.50   1
## 68:  fortitude     0.50   1
## 69:       kind     0.50   1
## 70:    healing     0.50   1
## 71:      grant     0.50   1
## 72:    patient     0.50   1
## 73:    contact     0.40   4
## 74:     fellow     0.40   1
## 75: management     0.40   1
## 76:     utmost     0.40   1
## 77:    brother     0.40   1
## 78:      magic     0.25   3
## 79:       holy     0.25   2
## 80:     salute     0.25   2
## 81:       work     0.25   1
## 82:     rescue     0.25   1
## 83:      quiet     0.25   1
## 84:     invite     0.25   1
## 85:        big     0.25   1
## 86:       pray     0.10  18
## 87:    praying     0.10   2
## 88:      prays     0.10   1
##          words polarity   n
Negative words in the tweets
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # the most frequent negative words
## Selecting by n
##             words polarity  n
##  1:       missing    -0.50 11
##  2:      departed    -0.25  6
##  3:    condolence    -0.40  4
##  4:    impossible    -0.50  4
##  5:          lost    -0.75  4
##  6:          sunk    -0.50  3
##  7:          cold    -0.50  3
##  8:          hard    -0.25  2
##  9:         guard    -0.25  2
## 10:           sad    -0.50  2
## 11: heartbreaking    -0.75  2
## 12:          loss    -0.75  2
## 13:          hurt    -0.75  2
## 14:         hurts    -0.75  2
## 15:       shallow    -1.00  2
Highlight each sentence according to whether it is positive or negative
set.seed(12)
df%>%
    filter(status_id %in% sample(unique(status_id), 30)) %>% # a random sample of 30 tweets
    mutate(review = get_sentences(text)) %$% 
    sentiment_by(review, status_id) %>%
    highlight()
## Saved in C:\Users\ASUS-NB\AppData\Local\Temp\RtmpYFxPQO/polarity.html
## Opening C:\Users\ASUS-NB\AppData\Local\Temp\RtmpYFxPQO/polarity.html ...

Understanding sentiment fluctuation over time (by date)

tweets$date = format(tweets$created_at,'%Y%m%d')

(out  = tweets  %>%  with(
    sentiment_by( #document level
        get_sentences(text), 
        list( date)
    )
))
plot(out)

Understanding sentiment fluctuation over time for different clients

(out  = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%  with(
    sentiment_by(
        get_sentences(text), 
        list(source, date)
    )
))
plot(out)

E. Conclusion