Installing packages
packages = c("dplyr", "ggplot2", "scales", "rtweet", "xml2", "httr", "jsonlite", "magrittr", "data.tree", "NLP", "igraph", "sentimentr", "tidytext", "stringr", "wordcloud2", "DiagrammeR")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
load("coreNLP_zhao.RData")
head(df)
## # A tibble: 6 x 92
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 12685834… 1387355969… 2021-04-28 10:40:13 ademorganle… " Ridiculous. … Twitte…
## 2 12685834… 1386650105… 2021-04-26 11:55:22 ademorganle… " I so wanted … Twitte…
## 3 12685834… 1387353730… 2021-04-28 10:31:19 ademorganle… "This is ridic… Twitte…
## 4 69575601 1387355831… 2021-04-28 10:39:40 veen_th "A less than g… Twitte…
## 5 12783468… 1387355721… 2021-04-28 10:39:14 SAIJAYA2544… "Great respect… Twitte…
## 6 69296462 1387355088… 2021-04-28 10:36:43 Karmathebea… "Thank you. Fo… Twitte…
## # … with 86 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>, time <chr>, date <date>
nrow(df)
## [1] 5000
min(df$created_at)
## [1] "2021-04-26 11:11:11 UTC"
max(df$created_at)
## [1] "2021-04-28 10:40:13 UTC"
Extract the tweets that mention Chloé Zhao
zhao <- df[grep("Zhao",df$text),]
(1). Setting up the API call
Server side:
+ Start a CoreNLP server from a terminal first.
+ In the CoreNLP installation directory, open a terminal and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# Build the CoreNLP API URL: turn the local server address into a URL that the CoreNLP service understands
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ # tokenization is not based on whitespace
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
return(url)
}
# Where the CoreNLP service is running
host = "127.0.0.1"
generate_API_url(host)
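With the defaults above, the generated request URL (before URL-encoding) has the form:
http://127.0.0.1:9000/?properties={"tokenize.whitespace":"false","annotators":""}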
# Call the CoreNLP API
call_coreNLP <- function(server_host, text, language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# Assume two CoreNLP servers: one for English (port 9000) and one for Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# Build the API URL
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
# Run the documents through the CoreNLP service
coreNLP <- function(data,host){
# Send each document to CoreNLP in turn; each document comes back as a JSON object,
# which we keep in an R list together with the original row
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}
(2). Data-wrangling functions
Extract the tokenization results from the returned objects and output them in tidy-data format.
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# only the first sentence of each document is used
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
Extract the word dependency relations from the returned CoreNLP objects and output them in tidy-data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# only the first sentence of each document is used
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
Extract the sentence-level sentiment from the returned CoreNLP objects and output it in tidy-data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# only the first sentence of each document is used
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# Display the parse tree graphically
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}
Getting the objects returned by CoreNLP
Don't run this chunk right away: it takes about half an hour (and may crash if your machine only has 4 GB of RAM).
gc() # free unused memory
t0 = Sys.time()
obj = zhao[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) # run against the local server; columns 2 and 5 are status_id and text
# the object passed to coreNLP() must be a data.frame with a text column
Sys.time() - t0 # elapsed time
#Time difference of 28 mins
save.image("coreNLP_zhao.RData")
# save everything we will need so it can be reloaded directly from the RData file later
#tokens = coreNLP_tokens_parser(obj)
#dependencies = coreNLP_dependency_parser(obj)
#sentiment = coreNLP_sentiment_parser(obj)
#save.image("coreNLP_all.RData")
(1). Tokenization, lemmatization, POS tagging, and NER
tokens = coreNLP_tokens_parser(obj)
head(tokens,20)
## status_id word lemma pos ner lower_word
## 1 1387355969882050560 Ridiculous. Ridiculous. NNP O ridiculous.
## 2 1387355969882050560 Gavean Gavean NNP PERSON gavean
## 3 1387355969882050560 Oscar Oscar NNP PERSON oscar
## 4 1387355969882050560 but but CC O but
## 5 1387355969882050560 didn't didn't VB O didn't
## 6 1387355969882050560 care care NN O care
## 7 1387355969882050560 what what WP O what
## 8 1387355969882050560 his he PRP$ O his
## 9 1387355969882050560 speech speech NN O speech
## 10 1387355969882050560 was? was? IN O was?
## 11 1387355969882050560 Why why WRB O why
## 12 1387355969882050560 award award NN O award
## 13 1387355969882050560 him he PRP O him
## 14 1387355969882050560 then? then? VBP O then?
## 15 1387355969882050560 Give give VB O give
## 16 1387355969882050560 it it PRP O it
## 17 1387355969882050560 to to IN O to
## 18 1387355969882050560 . . . O .
## 19 1386650105382645761 I I PRP O i
## 20 1386650105382645761 so so RB O so
## lower_lemma
## 1 ridiculous.
## 2 gavean
## 3 oscar
## 4 but
## 5 didn't
## 6 care
## 7 what
## 8 he
## 9 speech
## 10 was?
## 11 why
## 12 award
## 13 he
## 14 then?
## 15 give
## 16 it
## 17 to
## 18 .
## 19 i
## 20 so
(2). Named entity recognition (NER)
unique(tokens$ner)
## [1] "O" "PERSON" "DATE"
## [4] "MISC" "DURATION" "NUMBER"
## [7] "ORDINAL" "CITY" "NATIONALITY"
## [10] "TITLE" "COUNTRY" "ORGANIZATION"
## [13] "TIME" "PERCENT" "LOCATION"
## [16] "SET" "CAUSE_OF_DEATH" "STATE_OR_PROVINCE"
## [19] "IDEOLOGY" "RELIGION" "CRIMINAL_CHARGE"
## [22] "MONEY" "URL"
# Excluding the "O" (other) tag, how many distinct words were tagged as an entity?
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 2461
(3). Converting to lowercase
Capitalization also affects CoreNLP's NER decisions, so the tweet text we fed in was left in its original case. After the annotators have run, however, we create two new columns, lower_word and lower_lemma, holding the lower-cased word and lemma so that term frequencies are counted correctly. Lower-casing collapses different capitalizations of the same term (e.g., Evergiven and evergiven) into a single form before counting.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
tokens %>%
filter(ner == "COUNTRY") %>% # keep tokens whose NER tag is COUNTRY
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
filter(ner == "ORGANIZATION") %>% # keep tokens whose NER tag is ORGANIZATION
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
filter(ner == "PERSON") %>% # keep tokens whose NER tag is PERSON
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()
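The three plotting blocks above differ only in the NER tag and the number of bars, so they can be collapsed into a small helper. A minimal sketch (plot_ner_freq is a hypothetical name of ours, not part of the original code):
plot_ner_freq <- function(tokens, ner_type, top = 10) {
  tokens %>%
    filter(ner == ner_type) %>%                 # keep tokens with the requested NER tag
    count(lower_word, name = "count") %>%       # word frequency
    top_n(top, count) %>%
    mutate(word = reorder(lower_word, count)) %>%
    ggplot(aes(word, count)) +
    geom_col() +
    ggtitle(paste0("Word Frequency (NER is ", ner_type, ")")) +
    theme(text = element_text(size = 14)) +
    coord_flip()
}
# e.g. plot_ner_freq(tokens, "PERSON")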
dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
## status_id dep governor governorGloss dependent
## 1 1387355969882050560 ROOT 0 ROOT 5
## 2 1387355969882050560 compound 3 Oscar 1
## 3 1387355969882050560 compound 3 Oscar 2
## 4 1387355969882050560 nsubj 5 didn't 3
## 5 1387355969882050560 advmod 5 didn't 4
## 6 1387355969882050560 obj 5 didn't 6
## 7 1387355969882050560 obl 15 Give 7
## 8 1387355969882050560 nmod:poss 9 speech 8
## 9 1387355969882050560 nsubj 15 Give 9
## 10 1387355969882050560 mark 14 then? 10
## 11 1387355969882050560 advmod 14 then? 11
## 12 1387355969882050560 dep 11 Why 12
## 13 1387355969882050560 nsubj 14 then? 13
## 14 1387355969882050560 acl 9 speech 14
## 15 1387355969882050560 dep 5 didn't 15
## 16 1387355969882050560 obj 15 Give 16
## 17 1387355969882050560 case 7 what 17
## 18 1387355969882050560 punct 5 didn't 18
## 19 1386650105382645761 ROOT 0 ROOT 3
## 20 1386650105382645761 nsubj 3 wanted 1
## dependentGloss
## 1 didn't
## 2 Ridiculous.
## 3 Gavean
## 4 Oscar
## 5 but
## 6 care
## 7 what
## 8 his
## 9 speech
## 10 was?
## 11 Why
## 12 award
## 13 him
## 14 then?
## 15 Give
## 16 it
## 17 to
## 18 .
## 19 wanted
## 20 I
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse # the parse string of the first sentence of the 113th document
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)
Sentiment scores range from 0 (lowest) to 4 (highest):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
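For later tables or plots it can help to map the numeric value back to a readable label. A minimal sketch, assuming the 0-4 coding listed above (sentiment_label is a hypothetical helper, not part of the original analysis):
sentiment_label <- function(value) {
  # cut the 0-4 score into the five labels described above
  cut(as.numeric(value),
      breaks = c(-Inf, 0, 1, 2, 3, Inf),
      labels = c("Very negative", "Negative", "Neutral", "Positive", "Very positive"))
}
# e.g. sentiment_label(c(0, 2, 4))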
sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
## status_id
## 1 1387355969882050560
## 2 1386650105382645761
## 3 1387353730463436805
## 4 1387355831918764038
## 5 1387355721210089473
## 6 1387355088709111811
## 7 1387354965065093122
## 8 1387353297644933123
## 9 1387351117919514631
## 10 1387348931324035073
## 11 1387348180908580865
## 12 1387347688090324993
## 13 1387345850209165313
## 14 1387345035927171074
## 15 1387344621823463427
## 16 1387344410891927556
## 17 1386984504368193540
## 18 1386981889471352836
## 19 1387343656852627458
## 20 1386998097629171713
## text
## 1 Ridiculous. Gavean Oscar but didn't care what his speech was? Why award him then? Give it to . She's certainly overdue. Oscars are worse every year. Someone givea hairbrush. 4 no hairbrush.robbed.
## 2 I so wanted to wake up this morning to see a pic of you holding thatfor . At least your film was recognized for Best Costume Makeup.I'm so sorry. I know you would have been gracious.
## 3 This is ridiculous. They gave Hopkins an Oscar but didn't care what his speech would be? Why award him then? Give it to . She's certainly overdue. The Oscars are getting worse every year. Someone givea hairbrush. 4 no hairbrush.
## 4 A less than glittering Oscars ceremony
## 5 Great respect to the legend aka CHADWICK BOSEMAN by not giving academy awardWow!!!applause
## 6 Thank you. For your work for being responsible in a pandemic and for honoring Chadwich Boseman.
## 7 Wishing you a very happy birthdayJi. fromfor yourperformance from the
## 8 Congratulations to Nicholas Becker for hiswin for!
## 9 It's been two days since theand wouldn't it be a waste if those best dressed lists stay on that red carpet? We show you and these Alisters where else you can wear these one of a kindpieces because in our book the awards go to the best outfit repeaters!
## 10 Pandemicconstrained Oscars feature more oratory less music comedy
## 11 And the winner is
## 12 Nice
## 13 WATCH SA documentary My Octopus Teacher' bags an Oscar!
## 14 would have been the first one to congratulate . I am pretty sure that Chadwick Boseman is ashamed because of his dumb fans right now.
## 15 like Hollywood matters
## 16 Howreally choose their
## 17 You Need to See to Believe
## 18 weigh in on bestlooks of all time via
## 19 Chineseborn American filmmakermakes history as the firstwoman the second woman in Oscars 93 years to winAward for Best Director for . She also took the award for Best Picture.
## 20 The thirdOscar winner from 's oeuvre . The execution is flawless which makes this award rest with their team at ! Here's sendingto the team
## sentiment sentimentValue
## 1 Positive 3
## 2 Neutral 2
## 3 Neutral 2
## 4 Neutral 2
## 5 Positive 3
## 6 Neutral 2
## 7 Positive 3
## 8 Neutral 2
## 9 Negative 1
## 10 Negative 1
## 11 Positive 3
## 12 Neutral 2
## 13 Neutral 2
## 14 Neutral 2
## 15 Neutral 2
## 16 Neutral 2
## 17 Neutral 2
## 18 Neutral 2
## 19 Neutral 2
## 20 Positive 3
unique(sentiment$sentiment)
## [1] "Positive" "Neutral" "Negative" "Verypositive" "Verynegative"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# Look at the distribution of tweets across sentiment classes
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 947 2280 1650 9 85
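The same counts can be shown as a bar chart; a quick sketch (not part of the original output):
sentiment %>%
  count(sentiment) %>%                          # one row per sentiment class
  ggplot(aes(x = reorder(sentiment, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "sentiment", y = "number of tweets")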
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()
sentiment %>%
merge(zhao[,c("status_id","quoted_location","date")]) %>%
group_by(date,quoted_location) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=quoted_location)) +
geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
# Look at the vocabulary used in positive tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% #根據lemma分組
summarize(count = n()) %>%
filter(count >5 & count<400)%>%
wordcloud2()
## Joining, by = "word"
# Vocabulary used in negative tweets (commented out; same approach as above)
#sentiment %>%
#merge(tokens) %>%
#anti_join(stop_words) %>%
#filter(!lower_word %in% c('i','the')) %>%
#filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
#group_by(lower_lemma) %>%
#summarize(count = n()) %>%
#filter(count >10 &count<400)%>%
#wordcloud2()
library(sentimentr)
sentimentr scores range from -1 to 1: values below 0 are negative, values above 0 are positive, and 0 is neutral.
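To get a feel for this scale before applying it to the tweets, sentiment() can be run on a couple of toy sentences (illustrative only; these sentences are not part of the original analysis):
# each sentence gets a signed sentiment score; negative wording pushes it below 0
sentiment(get_sentences(c("I absolutely loved the ceremony.",
                          "This was a terrible, boring show.")))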
set.seed(10)
mytext <- get_sentences(tweets$text) # split the tweet text into sentences
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count how often each appears
sentiment_counts[polarity > 0,] # positive words
## words polarity n
## 1: congrats 1.0 29
## 2: please 1.0 14
## 3: messiah 1.0 8
## 4: brilliant 1.0 8
## 5: care 1.0 6
## ---
## 473: technology 0.1 1
## 474: church 0.1 1
## 475: sermon 0.1 1
## 476: collectively 0.1 1
## 477: sex 0.1 1
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
## words polarity n
## 1: black -0.25 44
## 2: stop -0.40 11
## 3: boring -1.00 10
## 4: dead -1.00 10
## 5: hate -0.75 9
## 6: late -0.25 8
## 7: worst -0.50 8
## 8: wait -0.25 7
## 9: enough -0.25 6
## 10: missed -0.50 6
## 11: lost -0.75 6
## 12: bad -0.75 6
## 13: racism -1.00 6
## 14: should have -1.05 6
tweets$date = format(tweets$created_at,'%Y%m%d')
(out = tweets %>% with(
sentiment_by( #document level
get_sentences(text),
list( date)
)
))
plot(out)
(out = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% with(
sentiment_by(
get_sentences(text),
list(source, date)
)
))
plot(out)
Chloé Zhao
zhao <- df[grep("Zhao",df$text),]
Grouping the location field
zhao$location[zhao$location %in% c("Hong Kong SAR, China","གངས་རིན་པོ་ཆེ")] <- "China"
zhao$location[zhao$location == "Rice University, Houston, TX"] <- "USA"
zhao$location[zhao$location == "Los Angeles, New York"] <- "USA"
zhao$location[zhao$location == "Virginia, USA"] <- "USA"
zhao$location[zhao$location == "Birmingham, England/ New York"] <- "USA"
zhao$location[zhao$location == "Seattle, WA"] <- "USA"
zhao$location[zhao$location == "Detroit, MI"] <- "USA"
zhao$location[zhao$location == "New York, NY"] <- "USA"
zhao$location[zhao$location == "Boston, MA"] <- "USA"
zhao$location[zhao$location == "Washington, DC"] <- "USA"
zhao$location[zhao$location == "Minnesota, USA"] <- "USA"
zhao$location[zhao$location == "Sacramento, CA"] <- "USA"
zhao$location[zhao$location == "Ann Arbor, MI"] <- "USA"
zhao$location[zhao$location == "New York, USA"] <- "USA"
zhao$location[zhao$location == "Columbus, OH" ] <- "USA"
zhao$location[zhao$location == "Ann Arbor, MI"] <- "USA"
zhao$location[zhao$location == "Mountain House CA / Michigan"] <- "USA"
zhao$location[zhao$location == "Galveston, TX"] <- "USA"
zhao$location[zhao$location == "Atlanta, GA"] <- "USA"
zhao$location[zhao$location == "Pittsburgh, PA, USA"] <- "USA"
zhao$location[zhao$location == "Massachusetts, USA"] <- "USA"
zhao$location[zhao$location == "Los Angeles, CA"] <- "USA"
zhao$location[zhao$location == "New York, NY"] <- "USA"
zhao$location[zhao$location == "New York & Miami "] <- "USA"
zhao$location[zhao$location %in% c("United States","Across the U.S.","Fort Lauderdale, FL", "San Francisco Bay Area","St. Louis" ,"North Carolina","minnesota","New York & Miami","HQ New York City - Global","Boston","Sioux Falls SD")] <- "USA"
unique(zhao$location)
## [1] "india"
## [2] ""
## [3] "China"
## [4] "USA"
## [5] "England, United Kingdom"
## [6] "India"
## [7] "Jaipur, Rajasthan"
## [8] "Republic of the Philippines"
## [9] "Australia"
## [10] "Worldwide "
## [11] "Berne, Switzerland"
## [12] "Mumbai"
## [13] "France"
## [14] "England "
## [15] "Mumbai, India"
## [16] "PA/NJ by way of The DMV"
## [17] "Chennai, India"
## [18] "New Delhi, India"
## [19] "Elgiva Theatre, Chesham"
## [20] "Wherever you get your Podcasts"
## [21] "3rd Rock from the Sun"
## [22] "Singur, West Bengal"
## [23] "Global"
## [24] "United Arab Emirates"
## [25] "Ahmedabad Gujarat"
## [26] "Everywhere we learn & thrive!"
## [27] "Adelaide, Australia"
## [28] "Aylesford, England"
## [29] "Delhi"
## [30] "Liverpool, UK"
## [31] "San Jose, CA "
## [32] "San Antonio/Houston, TX"
## [33] "Egypt"
## [34] "London"
## [35] "Vancouver 溫哥華, Canada 加拿大 \U0001f1e8\U0001f1e6"
## [36] "Singapore"
## [37] "Chester, England"
## [38] "Saudi Arabia"
## [39] "London, England"
## [40] "Toronto, Canada"
## [41] "Rawalpindi/Islamabad"
## [42] "Canada"
## [43] "Mumbai India"
## [44] "Leeds, England"
## [45] "Pune"
## [46] "Kerala"
## [47] "#RemoteWork #Boston "
## [48] "Ujjain"
## [49] "London UK"
## [50] "Dhaka, Bangladesh"
zhao$location[zhao$location == "New Delhi, India"] <- "India"
zhao$location[zhao$location == "Mumbai, India"] <- "India"
zhao$location[zhao$location == "San Jose, CA "] <- "USA"
zhao$location[zhao$location == "San Antonio/Houston, TX"] <- "USA"
zhao$location[zhao$location %in% c("india","Mumbai","Chennai, India" ,"Singur, West Bengal","Ahmedabad Gujarat" ,"Delhi" ,"Mumbai India","Pune" ,"Kerala","Ujjain","Dhaka, Bangladesh")] <- "India"
zhao$location[zhao$location %in% c("England, United Kingdom","Elgiva Theatre, Chesham","England ","Aylesford, England","Liverpool, UK" ,"London","Chester, England","London, England","Leeds, England","#RemoteWork #Boston ","London UK")] <- "UK"
zhao$location[zhao$location %in% c("Jaipur, Rajasthan")] <- "Bengal"
zhao$location[zhao$location %in% c("Republic of the Philippines")] <- "Philippines"
zhao$location[zhao$location %in% c("Vancouver 溫哥華, Canada 加拿大 \U0001f1e8\U0001f1e6" ,"Toronto, Canada")] <- "Canada"
zhao$location[zhao$location %in% c("Adelaide, Australia")] <- "Australia"
zhao$location[zhao$location %in% c("Berne, Switzerland" )] <- "Switzerland"
zhao$location[zhao$location %in% c("" ,"Worldwide " ,"PA/NJ by way of The DMV" , "Wherever you get your Podcasts","3rd Rock from the Sun","Global" ,"Everywhere we learn & thrive!")] <- "others"
zhao$date = as.Date(zhao$created_at)
sentiment %>%
merge(zhao[,c("status_id","location","date")]) %>%
group_by(date,location) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=location)) +
geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
zhao_merge <- tokens%>%
merge(zhao[,c("status_id","location","created_at")])
Reformatting the date
zhao_merge$num <- as.numeric(zhao_merge$created_at) # seconds since epoch
zhao_merge$day <- substring(zhao_merge$created_at, 9, 10) # day of month from "YYYY-MM-DD HH:MM:SS"
zhao_merge$hour <- substring(zhao_merge$created_at, 12, 13) # hour
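Since created_at is a POSIXct column, the same fields can also be pulled out with format(), which does not depend on the fixed width of the printed datetime. An equivalent sketch:
zhao_merge$day  <- format(zhao_merge$created_at, "%d")  # day of month
zhao_merge$hour <- format(zhao_merge$created_at, "%H")  # hour (24-hour clock)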
The AFINN sentiment lexicon
sent_zhao <- zhao_merge %>%
inner_join(get_sentiments("afinn"))%>%
group_by(location)%>%
summarise(score = sum(value))%>%
mutate(location = reorder(location, score))%>%
ggplot(aes(location, score)) +
geom_col(show.legend = FALSE)+
coord_flip()
## Joining, by = "word"
sent_zhao
Use the Bing lexicon to find positive and negative words, then plot them
bing_zhao <-
zhao_merge %>%
inner_join(get_sentiments("bing")) %>%
group_by(word,sentiment) %>%
count(word, sort = TRUE)%>%
summarise(sum = sum(n)) %>%
arrange(desc(sum)) %>%
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
bing_zhao
## word sentiment sum
## 1 win positive 44
## 2 best positive 34
## 3 award positive 14
## 4 won positive 12
## 5 winners positive 9
## 6 congratulations positive 7
## 7 winning positive 7
## 8 wins positive 7
## 9 winner positive 6
## 10 goodness positive 4
## 11 incredible positive 4
## 12 victory positive 4
## 13 work positive 4
## 14 prize positive 3
## 15 amazing positive 2
## 16 awards positive 2
## 17 celebrated positive 2
## 18 courage positive 2
## 19 faith positive 2
## 20 glorious positive 2
## 21 great positive 2
## 22 lies negative 2
## 23 missed negative 2
## 24 success positive 2
## 25 well positive 2
## 26 accolades positive 1
## 27 achievement positive 1
## 28 approval positive 1
## 29 awarded positive 1
## 30 beautiful positive 1
## 31 breaking negative 1
## 32 breakthroughs positive 1
## 33 bright positive 1
## 34 brilliant positive 1
## 35 complaining negative 1
## 36 creative positive 1
## 37 difficult negative 1
## 38 disappointing negative 1
## 39 dynamic positive 1
## 40 evade negative 1
## 41 excited positive 1
## 42 fraught negative 1
## 43 fucking negative 1
## 44 glad positive 1
## 45 good positive 1
## 46 happy positive 1
## 47 immature negative 1
## 48 inspiring positive 1
## 49 insulting negative 1
## 50 overdue negative 1
## 51 proud positive 1
## 52 right positive 1
## 53 shit negative 1
## 54 silent positive 1
## 55 snagged negative 1
## 56 stellar positive 1
## 57 strict negative 1
## 58 struck negative 1
## 59 talent positive 1
## 60 thank positive 1
## 61 top positive 1
## 62 toxic negative 1
## 63 trophy positive 1
## 64 visionary positive 1
## 65 wrong negative 1
bing_zhao %>%
top_n(20,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
library(sentimentr)
set.seed(10)
mytext <- get_sentences(zhao$text) # split the tweet text into sentences
sentiment_words <- extract_sentiment_terms(mytext) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count how often each appears
sentiment_counts[polarity > 0,] # positive words
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
## words polarity n
## 1: black -0.25 44
## 2: stop -0.40 11
## 3: boring -1.00 10
## 4: dead -1.00 10
## 5: hate -0.75 9
## 6: late -0.25 8
## 7: worst -0.50 8
## 8: wait -0.25 7
## 9: enough -0.25 6
## 10: missed -0.50 6
## 11: lost -0.75 6
## 12: bad -0.75 6
## 13: racism -1.00 6
## 14: should have -1.05 6