A. Motivation and Analysis Objectives

B. Dataset Description

B-1 Installing packages

packages = c("dplyr","ggplot2","rtweet","xml2","httr","jsonlite","data.tree","NLP","igraph","sentimentr","tidytext","wordcloud2","DiagrammeR","scales","stringr","magrittr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(sentimentr)
load("coreNLP_all.RData")

B-2 Data collection: tweets

(1). Twitter API setup: fetching tweets via rtweet

app = 'Emotions COVID-19 Vaccine'
consumer_key = 'YOUR_CONSUMER_KEY'       # replace with your own Twitter API credentials
consumer_secret = 'YOUR_CONSUMER_SECRET'
access_token = 'YOUR_ACCESS_TOKEN'
access_secret = 'YOUR_ACCESS_SECRET'
twitter_token <- create_token(app,consumer_key, consumer_secret,
                    access_token, access_secret,set_renv = FALSE)
#Consumer Keys: identify who you are
#Authentication Tokens: the authorization granted to you

(2). Setting search keywords to fetch tweets

# Search keywords
key = c("#EarthDay")
context = ""
terms = c(key, context)
q = paste(terms[terms != ""], collapse = " AND ")   # drop empty terms so no stray " AND " is appended to the query
# Joining several keywords with AND (e.g. "#COVID-19 AND Vaccine") requires all of them to
# appear in the same tweet, which avoids matching tweets that mention only one of the terms.

# Fetch 8,000 tweets, excluding retweets
tweets = search_tweets(q,lang="en",n=8000,include_rts = FALSE,token = twitter_token)

(3). Cleaning the tweet text

## Text-cleaning helper
clean = function(txt) {
  txt = iconv(txt, "latin1", "ASCII", sub="") # convert the encoding, dropping non-ASCII characters
  txt = gsub("(@|#)\\w+", "", txt) # remove @mentions and #hashtags (word characters after @ or #)
  txt = gsub("(http|https)://.*", "", txt) # remove URLs (. = any character, * = zero or more times)
  txt = gsub("[ \t]{2,}", "", txt) # remove runs of two or more spaces/tabs
  txt = gsub("\\n"," ",txt) # replace line breaks with spaces
  txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+ = one or more) into a single space
  txt = gsub("^\\s+|\\s+$","",txt) # trim leading/trailing whitespace
  txt = gsub("&.*;","",txt) # remove HTML entity codes
  txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces and ? ! . ' (drops emoji etc.)
  txt }


tweets$text = clean(tweets$text)  # apply the cleaning function to the tweet text

df = data.frame()

df = rbind(df,tweets)  # convert to a data frame

df = df[!duplicated(df[,"status_id"]),]  # drop duplicate tweets
head(df)
## # A tibble: 6 x 90
##   user_id   status_id   created_at          screen_name  text            source 
##   <chr>     <chr>       <dttm>              <chr>        <chr>           <chr>  
## 1 312028258 1387335026~ 2021-04-28 09:17:00 EUROCITIES   Live now How c~ Twitte~
## 2 11663231~ 1387334596~ 2021-04-28 09:15:17 DaceHermione At last week's~ Twitte~
## 3 11094207~ 1387334483~ 2021-04-28 09:14:50 GreenhamBun~ We had a busy ~ Twitte~
## 4 466339000 1387333279~ 2021-04-28 09:10:03 Alleyns_Sch~ Lower School p~ Hootsu~
## 5 54006056  1387333119~ 2021-04-28 09:09:25 panintellig~ Last week duri~ Twitte~
## 6 8352642   1387332486~ 2021-04-28 09:06:54 pacobriseno  .       after ~ Twitte~
## # ... with 84 more variables: display_text_width <dbl>,
## #   reply_to_status_id <chr>, reply_to_user_id <chr>,
## #   reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## #   favorite_count <int>, retweet_count <int>, quote_count <int>,
## #   reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## #   urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## #   media_t.co <list>, media_expanded_url <list>, media_type <list>,
## #   ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## #   ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## #   lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## #   quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## #   quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## #   quoted_name <chr>, quoted_followers_count <int>,
## #   quoted_friends_count <int>, quoted_statuses_count <int>,
## #   quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## #   retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## #   retweet_source <chr>, retweet_favorite_count <int>,
## #   retweet_retweet_count <int>, retweet_user_id <chr>,
## #   retweet_screen_name <chr>, retweet_name <chr>,
## #   retweet_followers_count <int>, retweet_friends_count <int>,
## #   retweet_statuses_count <int>, retweet_location <chr>,
## #   retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## #   place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## #   country_code <chr>, geo_coords <list>, coords_coords <list>,
## #   bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## #   description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## #   friends_count <int>, listed_count <int>, statuses_count <int>,
## #   favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## #   profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## #   profile_banner_url <chr>, profile_background_url <chr>,
## #   profile_image_url <chr>

df has 90 columns, but only a few of them are used here (a small selection sketch follows the list):

  • user_id: user id
  • status_id : tweet id
  • created_at : time the tweet was posted
  • text : tweet content
  • source : client the tweet was posted from
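
A minimal selection sketch keeping just these columns (column names follow the rtweet output shown in head(df) above; df_small is a hypothetical helper name, and the rest of the analysis keeps working on the full df):

df_small <- df %>%
  select(user_id, status_id, created_at, text, source)  # keep only the columns listed above
head(df_small)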

Checking the number of records and their distribution over time

created_at is already a date-time column, so min and max give the earliest and latest tweet dates directly.
Note: rtweet can only fetch tweets from roughly the last 10 days.

nrow(df)
## [1] 7741
min(df$created_at)
## [1] "2021-04-23 19:16:00 UTC"
max(df$created_at)
## [1] "2021-04-28 09:17:00 UTC"

B-3 Description of the dataset

C Data Analysis Process

C-1 coreNLP

(1). Setting up the API calls

Server side:
  • Start a CoreNLP server in a terminal first.
  • In the CoreNLP directory, open a terminal and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
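
Once the server is running, a quick reachability check can be done from R (a sketch; the CoreNLP server answers plain HTTP requests on its port):

srv <- try(httr::GET("http://127.0.0.1:9000/"), silent = TRUE)  # a 200 status code means the server on port 9000 is up
if (!inherits(srv, "try-error")) httr::status_code(srv)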

# Build the CoreNLP API URL: embed the CoreNLP properties into the local server address
generate_API_url <- function(host, port="9000",
                    tokenize.whitespace="false", annotators=""){ # tokenization is not based on whitespace
    url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
    url <- URLencode(url)
    return(url)
}
# Location of the service
host = "127.0.0.1"

generate_API_url(host)
# Call the CoreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                    tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # Assume two CoreNLP servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001);
  # Build the API URL
  url <- generate_API_url(server_host, port=port,
                    tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
  
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
  return (doc)
}
# Run each document through the CoreNLP service
coreNLP <- function(data,host){
  # Send the documents to CoreNLP one by one; each document is returned as JSON
  # and stored as an R object together with the original row
  result <- apply(data, 1 , function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  
  return(result)
}

(2). Data-wrangling functions

Extract the tokenization results from the returned objects, output in tidy-data format

coreNLP_tokens_parser <- function(coreNLP_objects){
  
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence of the document
    sentences <- doc$sentences
   
    sen <- sentences[[1]]
    
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    
    tokens <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(tokens))) %>% 
      bind_cols(tokens)
    
    tokens
  }))
  return(result)
}

Extract word dependency relations from the returned CoreNLP objects, output in tidy-data format

coreNLP_dependency_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence of the document
    sentences <- doc$sentences
    sen <- sentences[[1]]
    dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
      result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
      result
    }))
  
    dependencies <- original_data %>%
      t() %>% 
      data.frame() %>% 
      select(-text) %>% 
      slice(rep(1:n(), each = nrow(dependencies))) %>% 
      bind_cols(dependencies)
    dependencies
  }))
  return(result)
}

Extract sentence-level sentiment from the returned CoreNLP objects, output in tidy-data format

coreNLP_sentiment_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # take the first sentence of the document
    sentences <- doc$sentences
    sen <- sentences[[1]]
    
    sentiment <- original_data %>%
      t() %>% 
      data.frame() %>% 
      bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
  
    sentiment
  }))
  return(result)
}

Graphing the dependency tree

Code adapted from: https://stackoverflow.com/questions/35496560/how-to-convert-corenlp-generated-parse-tree-into-data-tree-r-package

# Display the dependency/parse result as a tree
parse2tree <- function(ptext) {
  stopifnot(require(NLP) && require(igraph))
  
  # this step modifies coreNLP parse tree to mimic openNLP parse tree
  ptext <- gsub("[\r\n]", "", ptext)
  ptext <- gsub("ROOT", "TOP", ptext)


  ## Replace words with unique versions
  ms <- gregexpr("[^() ]+", ptext)                                      # just ignoring spaces and brackets?
  words <- regmatches(ptext, ms)[[1]]                                   # just words
  regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words))))  # add id to words
  
  ## Going to construct an edgelist and pass that to igraph
  ## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
  edgelist <- matrix('', nrow=length(words)-2, ncol=2)
  
  ## Function to fill in edgelist in place
  edgemaker <- (function() {
    i <- 0                                       # row counter
    g <- function(node) {                        # the recursive function
      if (inherits(node, "Tree")) {            # only recurse subtrees
        if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
          for (child in node$children) {
            childval <- if(inherits(child, "Tree")) child$value else child
            i <<- i+1
            edgelist[i,1:2] <<- c(val, childval)
          }
        }
        invisible(lapply(node$children, g))
      }
    }
  })()
  
  ## Create the edgelist from the parse tree
  edgemaker(Tree_parse(ptext))
  tree <- FromDataFrameNetwork(as.data.frame(edgelist))
  return (tree)
}

Sending the sentences to the service

Get the objects returned by CoreNLP.
Do not run this chunk right away: it takes roughly half an hour (and may crash if you only have 4 GB of RAM).

#gc() # free unused memory

#t0 = Sys.time()
#obj = df[,c(2,5)]  %>% filter(text != "") %>% coreNLP(host) 

# Run locally; the object passed to coreNLP() must be a data.frame containing a text column

#Sys.time() - t0 # elapsed time

#Time difference of 30 mins
#save.image("coreNLP.RData")
# Save everything needed so it can later be loaded directly from the RData file


#tokens =  coreNLP_tokens_parser(obj)
#dependencies = coreNLP_dependency_parser(obj)
#sentiment = coreNLP_sentiment_parser(obj)
#save.image("coreNLP_all.RData")

D Visualization and Interpretation of the Results

(1). Tokenization, lemmatization, POS tagging, and NER

tokens =  coreNLP_tokens_parser(obj)
head(tokens,20)
##              status_id       word      lemma  pos  ner
## 1  1387335026942091265       Live       live   RB    O
## 2  1387335026942091265        now        now   RB DATE
## 3  1387335026942091265        How        how  WRB    O
## 4  1387335026942091265        can        can   MD    O
## 5  1387335026942091265         we         we  PRP    O
## 6  1387335026942091265     ensure     ensure   VB    O
## 7  1387335026942091265   recovery   recovery   NN    O
## 8  1387335026942091265 strategies   strategy  NNS    O
## 9  1387335026942091265      drive      drive  VBP    O
## 10 1387335026942091265        the        the   DT    O
## 11 1387335026942091265      green      green   JJ    O
## 12 1387335026942091265 transition transition   NN    O
## 13 1387335026942091265         at         at   IN    O
## 14 1387335026942091265          a          a   DT    O
## 15 1387335026942091265      local      local   JJ    O
## 16 1387335026942091265     level?     level?   NN    O
## 17 1387335026942091265       Join       join   VB    O
## 18 1387335026942091265        our         we PRP$    O
## 19 1387335026942091265      panel      panel   NN    O
## 20 1387335026942091265         on         on   IN    O
  • Columns of the coreNLP_tokens_parser output (a quick frequency sketch follows this list):
    • status_id : maps to status_id in the original df; the unique id of a tweet
    • word : the original token
    • lemma : the lemmatized form of the token
    • pos : part-of-speech tag
    • ner : named-entity tag
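
As a quick look at the token table described above, we can count the most frequent POS tags (a sketch using dplyr::count):

tokens %>%
  count(pos, sort = TRUE) %>%  # frequency of each part-of-speech tag
  head(10)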

(2). Named-entity recognition (NER)

  • Check which entity types were recognized in the NER column
unique(tokens$ner)
##  [1] "O"                 "DATE"              "NUMBER"           
##  [4] "ORGANIZATION"      "DURATION"          "TIME"             
##  [7] "MISC"              "LOCATION"          "COUNTRY"          
## [10] "TITLE"             "PERSON"            "SET"              
## [13] "CITY"              "NATIONALITY"       "MONEY"            
## [16] "ORDINAL"           "STATE_OR_PROVINCE" "CAUSE_OF_DEATH"   
## [19] "IDEOLOGY"          "URL"               "CRIMINAL_CHARGE"  
## [22] "RELIGION"          "PERCENT"
# Excluding tokens tagged O (other), how many distinct words carry an entity tag?
length(unique(tokens$word[tokens$ner != "O"])) 
## [1] 4373

(3). Converting to lowercase

Capitalization affects how CoreNLP assigns NER tags, so the tweet text was passed to the annotators without case normalization. After the annotators have run, however, we create two new columns, lower_word and lower_lemma, holding the lowercased word and lemma. Lowercasing maps different capitalizations of the same term (e.g. Evergiven and evergiven) onto one form, so that word frequencies are counted correctly.

tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)

D-1 Exploratory analysis - NER

Countries mentioned (COUNTRY)

Using CoreNLP's NER tags, we can identify the countries (COUNTRY) mentioned in the Earth Day tweets, to get an initial sense of the main countries involved in this topic.

tokens %>%
  filter(ner == "COUNTRY") %>%  #篩選NER為COUNTRY
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 13, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is COUNTRY)") +
  theme(text=element_text(size=14))+
  coord_flip()

  • The most frequently mentioned countries

Organizations mentioned (ORGANIZATION)

Using CoreNLP's NER tags, we can identify the organizations (ORGANIZATION) mentioned in the Earth Day tweets, to get an initial sense of the main companies and institutions involved in this topic.

tokens %>%
  filter(ner == "ORGANIZATION") %>%  #篩選NER為ORGANIZATION
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is ORGANIZATION)") +
  theme(text=element_text(size=14))+
  coord_flip()

People mentioned (PERSON)

Using CoreNLP's NER tags, we can identify the people (PERSON) mentioned in the Earth Day tweets, to get an initial sense of the key figures in this topic.

tokens %>%
  filter(ner == "PERSON") %>%  #篩選NER為PERSON
  group_by(lower_word) %>% #根據word分組
  summarize(count = n()) %>% #計算每組
  top_n(n = 10, count) %>%
  ungroup() %>% 
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) + 
  geom_col()+
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text=element_text(size=14))+
  coord_flip()

D-2 Exploratory analysis - Dependency

Sentence dependency parsing results
dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
##              status_id       dep governor governorGloss dependent
## 1  1387335026942091265      ROOT        0          ROOT         6
## 2  1387335026942091265    advmod        6        ensure         1
## 3  1387335026942091265    advmod        6        ensure         2
## 4  1387335026942091265    advmod        6        ensure         3
## 5  1387335026942091265       aux        6        ensure         4
## 6  1387335026942091265     nsubj        6        ensure         5
## 7  1387335026942091265  compound        8    strategies         7
## 8  1387335026942091265     nsubj        9         drive         8
## 9  1387335026942091265     ccomp        6        ensure         9
## 10 1387335026942091265       det       12    transition        10
## 11 1387335026942091265      amod       12    transition        11
## 12 1387335026942091265     nsubj       17          Join        12
## 13 1387335026942091265      case       16        level?        13
## 14 1387335026942091265       det       16        level?        14
## 15 1387335026942091265      amod       16        level?        15
## 16 1387335026942091265      nmod       12    transition        16
## 17 1387335026942091265     ccomp        9         drive        17
## 18 1387335026942091265 nmod:poss       19         panel        18
## 19 1387335026942091265       obj       17          Join        19
## 20 1387335026942091265       obl       17          Join        20
##    dependentGloss
## 1          ensure
## 2            Live
## 3             now
## 4             How
## 5             can
## 6              we
## 7        recovery
## 8      strategies
## 9           drive
## 10            the
## 11          green
## 12     transition
## 13             at
## 14              a
## 15          local
## 16         level?
## 17           Join
## 18            our
## 19          panel
## 20             on
Visualizing the dependency tree
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)

D-3 Sentiment analysis - Sentiment

Sentence sentiment values

The sentiment score runs from 0 (lowest) to 4 (highest); a small lookup sketch follows this list:
+ 0 : very negative, 1 : negative
+ 2 : neutral
+ 3 : positive, 4 : very positive
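
A minimal sketch of this 0-4 mapping as a named lookup vector (purely illustrative; sentiment_scale is a hypothetical helper, and the parser below already returns both the label and the numeric value):

sentiment_scale <- c("0" = "Very negative", "1" = "Negative", "2" = "Neutral",
                     "3" = "Positive", "4" = "Very positive")
sentiment_scale["3"]  # look up the label for a score of 3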

sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
##              status_id
## 1  1387335026942091265
## 2  1387334596195463169
## 3  1387334483532206086
## 4  1387333279238459393
## 5  1387333119477420032
## 6  1387332486611492864
## 7  1387331504792035331
## 8  1386970979381960706
## 9  1386608239983403009
## 10 1387331006219309056
## 11 1387330901869219841
## 12 1386847604084969473
## 13 1387330769517957120
## 14 1387330749158752260
## 15 1387330114870988806
## 16 1387329499629559809
## 17 1387329211203866626
## 18 1385771674084134913
## 19 1385772954470936577
## 20 1387328132936577025
##                                                                                                                                                                                                                                                                      text
## 1                                                                                                                                                           Live now How can we ensure recovery strategies drive the green transition at a local level? Join our panel on
## 2                                                                 At last week'ssummit world leaders announced welcome and ambitious climate targets. The question is can we deliver the speed and scale of innovation needed to meet them?explores this in his new paper
## 3                                 We had a busy Earth Day last week with our 'Acts of Green' competition and received a total of 74 Acts of Green submissions across our Service Centres! Congratulations to all 3 winning teams and well done all for your hard efforts!
## 4                            Lower School pupils have been thinking aboutin their actions this week. This morning they enjoyed registration and games outside in their year group bubbles. Getting out and enjoying nature does us all good. Remember to keep looking up!
## 5                                                                                      Last week duringwe highlighted thatis on the rise but did you know that 6.7M tonnes of food is wasted a year? By usingyour business can make a change! Book a demo to find out how
## 6                                                                                                                                                                    .       after  will you adopt ambitious climate commitments and get on track to meettargets before ?
## 7                                                                                                                    Demonstrate your brands sustainability claims such as ethical sourcing or environmental impact with the new Everledger Platform. Request a demo here
## 8                                            Plastic packaging has become an economic environmental and social burden. Only 14 of plastic packaging is collected for recycling globally. Watch now how our platform can play a key role in the circular packaging economy
## 9                                                                                                             We're working with our partners to meet our own carbon neutrality goals.is on track to have 100 of their cloud running on renewable energy by 2025. Read on
## 10                                                                                                                                             Post! Learn how financing and investing activities are facing big impacts as climate initiatives continue to move forward 
## 11                                                                                                                                                                                                                                         My friend is officially an egg
## 12                                                             Happening tomorrow Building SustainableSolutions on Wednesday April 28 at 6.15 p.m. SGT  2.15 p.m. GST How can environmental commitment towards a green supply chain create value? Find out in our webinar
## 13                                                                                                                      Join us live today! Do not miss  Building Sustainable Supply Chain Solutions at 6.157 p.m. SGT  2.153 p.m. GSTRegister here to access the webinar
## 14                                                               Last week our little environmentalists at the ELV celebrated Earth Day with lots of fun activities. Students got the chance to water and plant new seeds and learnt about the Earth and climate changes.
## 15 We are delighted to announce will be hosting our Young SVP connects meeting again next Tues 4thMay at 7pm. We look forward to welcoming you along to share your thoughts views  discuss what is going on for you! If you took part in the Aprilchallenge we would love
## 16          Canada is a world leader in cleantech but women are still significantly underrepresented. This  MaRS andare launching the RBC Women in Cleantech Accelerator a 12month program that will connect and support women entrepreneurs. Learn more and apply today.
## 17                                                                                                                                            UV Sun Protection Cooling Arm Sleeves Original Price 11.99 Price after discount 8.99Free shipping Discount codeMKTC2METP342
## 18                                                                                                                                                                                                                   2021 ins style Straw beach bag price9.9Free shipping
## 19                                                                                                                                                                                 2021 ins style Straw beach bag Environmentally friendly products price9.9Free shipping
## 20                                                                                                                                The Earth is our common wealth. Let's preserve it. Environmental protection guides our choices. Every day by your side at your service!
##    sentiment sentimentValue
## 1    Neutral              2
## 2   Positive              3
## 3    Neutral              2
## 4   Positive              3
## 5   Negative              1
## 6    Neutral              2
## 7    Neutral              2
## 8    Neutral              2
## 9    Neutral              2
## 10   Neutral              2
## 11   Neutral              2
## 12   Neutral              2
## 13  Negative              1
## 14  Positive              3
## 15  Positive              3
## 16  Positive              3
## 17   Neutral              2
## 18   Neutral              2
## 19   Neutral              2
## 20   Neutral              2
Sentiment categories in the dataset
unique(sentiment$sentiment)
## [1] "Neutral"      "Positive"     "Negative"     "Verypositive" "Verynegative"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# Distribution of tweets across sentiment categories
sentiment$sentiment %>% table()
## .
##     Negative      Neutral     Positive Verynegative Verypositive 
##         1318         3965         2375            3           55
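
The same distribution as a bar chart (a sketch; it reuses the sentiment table computed above):

sentiment %>%
  count(sentiment) %>%  # number of tweets per sentiment label
  ggplot(aes(x = reorder(sentiment, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "sentiment", y = "number of tweets")
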
Trend of the average sentiment score over time
df$date = as.Date(df$created_at)

sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  group_by(date) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment)) + 
  geom_line()

Sentiment trend over time by client
sentiment %>% 
  merge(df[,c("status_id","source","date")]) %>%
  filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% 
  group_by(date,source) %>% 
  summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>% 
  ggplot(aes(x=date,y=avg_sentiment,color=source)) + 
  geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.

Which words are used in positive tweets and which in negative tweets?
# Word usage in positive tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('the')) %>% 
  filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
  group_by(lower_lemma) %>% # group by lemma
  summarize(count = n()) %>% 
  filter(count >20 & count<400)%>%
  wordcloud2()
## Joining, by = "word"
# Word usage in negative tweets
sentiment %>% 
  merge(tokens) %>% 
  anti_join(stop_words) %>% 
  filter(!lower_word %in% c('i','the')) %>% 
  filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
  group_by(lower_lemma) %>% 
  summarize(count = n()) %>% 
  filter(count >20 &count<400)%>%
  wordcloud2()


D-4 Sentiment analysis - applying sentimentr to the Twitter data

Counting the positive words in the tweets
set.seed(10)
mytext <- get_sentences(tweets$text) # split the text into a list of sentences
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1,000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count their occurrences
sentiment_counts[polarity > 0,]   # positive words
##               words polarity  n
##   1:    sustainable      1.0 34
##   2:          honor      1.0 21
##   3:           care      1.0 19
##   4: sustainability      1.0 19
##   5:         please      1.0 13
##  ---                           
## 611:         gospel      0.1  1
## 612:          prime      0.1  1
## 613:       relating      0.1  1
## 614:          mural      0.1  1
## 615:           lion      0.1  1
Counting the negative words in the tweets
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
##         words polarity  n
##  1:     trash    -0.50 20
##  2:      late    -0.25 16
##  3:     waste    -0.75 13
##  4:  pandemic    -1.00  9
##  5:      miss    -1.00  8
##  6:  homeless    -1.00  8
##  7: challenge    -0.25  6
##  8:     leave    -0.25  6
##  9:      wait    -0.25  5
## 10:   targets    -0.40  5
## 11:    litter    -0.60  5
## 12:   poverty    -0.75  5
## 13:    crisis    -0.75  5
Highlight each sentence, marking it as positive or negative
set.seed(12)
df %>%
    filter(status_id %in% sample(unique(status_id), 30)) %>% # 30 random tweets
    mutate(review = get_sentences(text)) %$% 
    sentiment_by(review, status_id) %>%
    highlight()
## Saved in C:\Users\ASUS-NB\AppData\Local\Temp\RtmpwjxYgZ/polarity.html
## Opening C:\Users\ASUS-NB\AppData\Local\Temp\RtmpwjxYgZ/polarity.html ...

Sentiment fluctuation by date

tweets$date = format(tweets$created_at,'%Y%m%d')

(out  = tweets  %>%  with(
    sentiment_by( #document level
        get_sentences(text), 
        list( date)
    )
))
plot(out)

Sentiment fluctuation by date for different clients

(out  = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>%  with(
    sentiment_by(
        get_sentences(text), 
        list(source, date)
    )
))
plot(out)

E Conclusion