Installing packages
packages = c("dplyr", "ggplot2", "scales", "rtweet", "xml2", "httr", "jsonlite", "magrittr", "data.tree", "NLP", "igraph", "sentimentr", "tidytext", "stringr", "wordcloud2", "DiagrammeR")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
load("coreNLP_zhao.RData")
head(df)
## # A tibble: 6 x 92
## user_id status_id created_at screen_name text source
## <chr> <chr> <dttm> <chr> <chr> <chr>
## 1 12685834… 1387355969… 2021-04-28 10:40:13 ademorganle… " Ridiculous. … Twitte…
## 2 12685834… 1386650105… 2021-04-26 11:55:22 ademorganle… " I so wanted … Twitte…
## 3 12685834… 1387353730… 2021-04-28 10:31:19 ademorganle… "This is ridic… Twitte…
## 4 69575601 1387355831… 2021-04-28 10:39:40 veen_th "A less than g… Twitte…
## 5 12783468… 1387355721… 2021-04-28 10:39:14 SAIJAYA2544… "Great respect… Twitte…
## 6 69296462 1387355088… 2021-04-28 10:36:43 Karmathebea… "Thank you. Fo… Twitte…
## # … with 86 more variables: display_text_width <dbl>,
## # reply_to_status_id <chr>, reply_to_user_id <chr>,
## # reply_to_screen_name <chr>, is_quote <lgl>, is_retweet <lgl>,
## # favorite_count <int>, retweet_count <int>, quote_count <int>,
## # reply_count <int>, hashtags <list>, symbols <list>, urls_url <list>,
## # urls_t.co <list>, urls_expanded_url <list>, media_url <list>,
## # media_t.co <list>, media_expanded_url <list>, media_type <list>,
## # ext_media_url <list>, ext_media_t.co <list>, ext_media_expanded_url <list>,
## # ext_media_type <chr>, mentions_user_id <list>, mentions_screen_name <list>,
## # lang <chr>, quoted_status_id <chr>, quoted_text <chr>,
## # quoted_created_at <dttm>, quoted_source <chr>, quoted_favorite_count <int>,
## # quoted_retweet_count <int>, quoted_user_id <chr>, quoted_screen_name <chr>,
## # quoted_name <chr>, quoted_followers_count <int>,
## # quoted_friends_count <int>, quoted_statuses_count <int>,
## # quoted_location <chr>, quoted_description <chr>, quoted_verified <lgl>,
## # retweet_status_id <chr>, retweet_text <chr>, retweet_created_at <dttm>,
## # retweet_source <chr>, retweet_favorite_count <int>,
## # retweet_retweet_count <int>, retweet_user_id <chr>,
## # retweet_screen_name <chr>, retweet_name <chr>,
## # retweet_followers_count <int>, retweet_friends_count <int>,
## # retweet_statuses_count <int>, retweet_location <chr>,
## # retweet_description <chr>, retweet_verified <lgl>, place_url <chr>,
## # place_name <chr>, place_full_name <chr>, place_type <chr>, country <chr>,
## # country_code <chr>, geo_coords <list>, coords_coords <list>,
## # bbox_coords <list>, status_url <chr>, name <chr>, location <chr>,
## # description <chr>, url <chr>, protected <lgl>, followers_count <int>,
## # friends_count <int>, listed_count <int>, statuses_count <int>,
## # favourites_count <int>, account_created_at <dttm>, verified <lgl>,
## # profile_url <chr>, profile_expanded_url <chr>, account_lang <lgl>,
## # profile_banner_url <chr>, profile_background_url <chr>,
## # profile_image_url <chr>, time <chr>, date <date>
nrow(df)
## [1] 5000
min(df$created_at)
## [1] "2021-04-26 11:11:11 UTC"
max(df$created_at)
## [1] "2021-04-28 10:40:13 UTC"
Extract the tweets that mention Chloé Zhao
zhao <- df[grep("Zhao",df$text),]
(1). Setting up the API call
Server side:
+ Start a CoreNLP server from a terminal first.
+ In the CoreNLP installation directory, open a terminal and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# Build the CoreNLP API URL: turn the local server address into a URL that the CoreNLP service understands
generate_API_url <- function(host, port="9000",
tokenize.whitespace="false", annotators=""){ # tokenization is not based on whitespace
url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}', host, port, tokenize.whitespace, annotators)
url <- URLencode(url)
return(url)
}
# Where the CoreNLP service is running
host = "127.0.0.1"
generate_API_url(host)
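With the defaults above, the generated request URL (before URL-encoding) has the form:
http://127.0.0.1:9000/?properties={"tokenize.whitespace":"false","annotators":""}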
# Call the CoreNLP API
call_coreNLP <- function(server_host, text, language="eng",
tokenize.whitespace="true", ssplit.eolonly="true", annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
# Assume two CoreNLP servers: one for English (port 9000) and one for Chinese (port 9001)
port <- ifelse(language=="eng", 9000, 9001);
# Build the API URL
url <- generate_API_url(server_host, port=port,
tokenize.whitespace=tokenize.whitespace, annotators=paste0(annotators, collapse = ','))
result <- POST(url, body = text, encode = "json")
doc <- httr::content(result, "parsed","application/json",encoding = "UTF-8")
return (doc)
}
# Run the documents through the CoreNLP service
coreNLP <- function(data,host){
# Send each document to CoreNLP in turn; each document comes back as a JSON object,
# which we keep in an R list together with the original row
result <- apply(data, 1 , function(x){
object <- call_coreNLP(host, x['text'])
list(doc=object, data=x)
})
return(result)
}
(2). Data-wrangling functions
Extract the tokenization results from the returned objects and output them in tidy-data format.
coreNLP_tokens_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# only the first sentence of each document is used
sentences <- doc$sentences
sen <- sentences[[1]]
tokens <- do.call(rbind, lapply(sen$tokens, function(x){
result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
result
}))
tokens <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(tokens))) %>%
bind_cols(tokens)
tokens
}))
return(result)
}
Extract the word dependency relations from the returned CoreNLP objects and output them in tidy-data format.
coreNLP_dependency_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# only the first sentence of each document is used
sentences <- doc$sentences
sen <- sentences[[1]]
dependencies <- do.call(rbind, lapply(sen$basicDependencies, function(x){
result <- data.frame(dep=x$dep, governor=x$governor, governorGloss=x$governorGloss, dependent=x$dependent, dependentGloss=x$dependentGloss)
result
}))
dependencies <- original_data %>%
t() %>%
data.frame() %>%
select(-text) %>%
slice(rep(1:n(), each = nrow(dependencies))) %>%
bind_cols(dependencies)
dependencies
}))
return(result)
}
Extract the sentence-level sentiment from the returned CoreNLP objects and output it in tidy-data format.
coreNLP_sentiment_parser <- function(coreNLP_objects){
result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
original_data <- obj$data
doc <- obj$doc
# only the first sentence of each document is used
sentences <- doc$sentences
sen <- sentences[[1]]
sentiment <- original_data %>%
t() %>%
data.frame() %>%
bind_cols(data.frame(sentiment=sen$sentiment, sentimentValue=sen$sentimentValue))
sentiment
}))
return(result)
}
# Display the parse tree graphically
parse2tree <- function(ptext) {
stopifnot(require(NLP) && require(igraph))
# this step modifies coreNLP parse tree to mimic openNLP parse tree
ptext <- gsub("[\r\n]", "", ptext)
ptext <- gsub("ROOT", "TOP", ptext)
## Replace words with unique versions
ms <- gregexpr("[^() ]+", ptext) # just ignoring spaces and brackets?
words <- regmatches(ptext, ms)[[1]] # just words
regmatches(ptext, ms) <- list(paste0(words, seq.int(length(words)))) # add id to words
## Going to construct an edgelist and pass that to igraph
## allocate here since we know the size (number of nodes - 1) and -1 more to exclude 'TOP'
edgelist <- matrix('', nrow=length(words)-2, ncol=2)
## Function to fill in edgelist in place
edgemaker <- (function() {
i <- 0 # row counter
g <- function(node) { # the recursive function
if (inherits(node, "Tree")) { # only recurse subtrees
if ((val <- node$value) != 'TOP1') { # skip 'TOP' node (added '1' above)
for (child in node$children) {
childval <- if(inherits(child, "Tree")) child$value else child
i <<- i+1
edgelist[i,1:2] <<- c(val, childval)
}
}
invisible(lapply(node$children, g))
}
}
})()
## Create the edgelist from the parse tree
edgemaker(Tree_parse(ptext))
tree <- FromDataFrameNetwork(as.data.frame(edgelist))
return (tree)
}
Getting the objects returned by CoreNLP
Don't run this chunk right away: it takes about half an hour (and may crash if your machine only has 4 GB of RAM).
gc() # free unused memory
t0 = Sys.time()
obj = zhao[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) # run against the local server; columns 2 and 5 are status_id and text
# the object passed to coreNLP() must be a data.frame with a text column
Sys.time() - t0 # elapsed time
#Time difference of 28 mins
save.image("coreNLP_zhao.RData")
# save everything we will need so it can be reloaded directly from the RData file later
#tokens = coreNLP_tokens_parser(obj)
#dependencies = coreNLP_dependency_parser(obj)
#sentiment = coreNLP_sentiment_parser(obj)
#save.image("coreNLP_all.RData")
(1). Tokenization, lemmatization, POS tagging, and NER
tokens = coreNLP_tokens_parser(obj)
head(tokens,20)
## status_id word lemma pos ner lower_word
## 1 1387355969882050560 Ridiculous. Ridiculous. NNP O ridiculous.
## 2 1387355969882050560 Gavean Gavean NNP PERSON gavean
## 3 1387355969882050560 Oscar Oscar NNP PERSON oscar
## 4 1387355969882050560 but but CC O but
## 5 1387355969882050560 didn't didn't VB O didn't
## 6 1387355969882050560 care care NN O care
## 7 1387355969882050560 what what WP O what
## 8 1387355969882050560 his he PRP$ O his
## 9 1387355969882050560 speech speech NN O speech
## 10 1387355969882050560 was? was? IN O was?
## 11 1387355969882050560 Why why WRB O why
## 12 1387355969882050560 award award NN O award
## 13 1387355969882050560 him he PRP O him
## 14 1387355969882050560 then? then? VBP O then?
## 15 1387355969882050560 Give give VB O give
## 16 1387355969882050560 it it PRP O it
## 17 1387355969882050560 to to IN O to
## 18 1387355969882050560 . . . O .
## 19 1386650105382645761 I I PRP O i
## 20 1386650105382645761 so so RB O so
## lower_lemma
## 1 ridiculous.
## 2 gavean
## 3 oscar
## 4 but
## 5 didn't
## 6 care
## 7 what
## 8 he
## 9 speech
## 10 was?
## 11 why
## 12 award
## 13 he
## 14 then?
## 15 give
## 16 it
## 17 to
## 18 .
## 19 i
## 20 so
(2). Named entity recognition (NER)
unique(tokens$ner)
## [1] "O" "PERSON" "DATE"
## [4] "MISC" "DURATION" "NUMBER"
## [7] "ORDINAL" "CITY" "NATIONALITY"
## [10] "TITLE" "COUNTRY" "ORGANIZATION"
## [13] "TIME" "PERCENT" "LOCATION"
## [16] "SET" "CAUSE_OF_DEATH" "STATE_OR_PROVINCE"
## [19] "IDEOLOGY" "RELIGION" "CRIMINAL_CHARGE"
## [22] "MONEY" "URL"
# Excluding the "O" (other) tag, how many distinct words were tagged as an entity?
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 2461
(3). Converting to lowercase
Capitalization also affects CoreNLP's NER decisions, so the tweet text we fed in was left in its original case. After the annotators have run, however, we create two new columns, lower_word and lower_lemma, holding the lower-cased word and lemma so that term frequencies are counted correctly. Lower-casing collapses different capitalizations of the same term (e.g., Evergiven and evergiven) into a single form before counting.
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
tokens %>%
filter(ner == "COUNTRY") %>% # keep tokens whose NER tag is COUNTRY
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 13, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is COUNTRY)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
filter(ner == "ORGANIZATION") %>% # keep tokens whose NER tag is ORGANIZATION
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is ORGANIZATION)") +
theme(text=element_text(size=14))+
coord_flip()
tokens %>%
filter(ner == "PERSON") %>% # keep tokens whose NER tag is PERSON
group_by(lower_word) %>% # group by word
summarize(count = n()) %>% # count each group
top_n(n = 10, count) %>%
ungroup() %>%
mutate(word = reorder(lower_word, count)) %>%
ggplot(aes(word, count)) +
geom_col()+
ggtitle("Word Frequency (NER is PERSON)") +
theme(text=element_text(size=14))+
coord_flip()
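The three plotting blocks above differ only in the NER tag and the number of bars, so they can be collapsed into a small helper. A minimal sketch (plot_ner_freq is a hypothetical name of ours, not part of the original code):
plot_ner_freq <- function(tokens, ner_type, top = 10) {
  tokens %>%
    filter(ner == ner_type) %>%                 # keep tokens with the requested NER tag
    count(lower_word, name = "count") %>%       # word frequency
    top_n(top, count) %>%
    mutate(word = reorder(lower_word, count)) %>%
    ggplot(aes(word, count)) +
    geom_col() +
    ggtitle(paste0("Word Frequency (NER is ", ner_type, ")")) +
    theme(text = element_text(size = 14)) +
    coord_flip()
}
# e.g. plot_ner_freq(tokens, "PERSON")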
dependencies = coreNLP_dependency_parser(obj)
head(dependencies,20)
## status_id dep governor governorGloss dependent
## 1 1387355969882050560 ROOT 0 ROOT 5
## 2 1387355969882050560 compound 3 Oscar 1
## 3 1387355969882050560 compound 3 Oscar 2
## 4 1387355969882050560 nsubj 5 didn't 3
## 5 1387355969882050560 advmod 5 didn't 4
## 6 1387355969882050560 obj 5 didn't 6
## 7 1387355969882050560 obl 15 Give 7
## 8 1387355969882050560 nmod:poss 9 speech 8
## 9 1387355969882050560 nsubj 15 Give 9
## 10 1387355969882050560 mark 14 then? 10
## 11 1387355969882050560 advmod 14 then? 11
## 12 1387355969882050560 dep 11 Why 12
## 13 1387355969882050560 nsubj 14 then? 13
## 14 1387355969882050560 acl 9 speech 14
## 15 1387355969882050560 dep 5 didn't 15
## 16 1387355969882050560 obj 15 Give 16
## 17 1387355969882050560 case 7 what 17
## 18 1387355969882050560 punct 5 didn't 18
## 19 1386650105382645761 ROOT 0 ROOT 3
## 20 1386650105382645761 nsubj 3 wanted 1
## dependentGloss
## 1 didn't
## 2 Ridiculous.
## 3 Gavean
## 4 Oscar
## 5 but
## 6 care
## 7 what
## 8 his
## 9 speech
## 10 was?
## 11 Why
## 12 award
## 13 him
## 14 then?
## 15 Give
## 16 it
## 17 to
## 18 .
## 19 wanted
## 20 I
parse_tree <- obj[[113]]$doc[[1]][[1]]$parse # the parse string of the first sentence of the 113th document
tree <- parse2tree(parse_tree)
SetNodeStyle(tree, style = "filled,rounded", shape = "box")
plot(tree)
Sentiment scores range from 0 (lowest) to 4 (highest):
+ 0, 1 : very negative, negative
+ 2 : neutral
+ 3, 4 : positive, very positive
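For later tables or plots it can help to map the numeric value back to a readable label. A minimal sketch, assuming the 0-4 coding listed above (sentiment_label is a hypothetical helper, not part of the original analysis):
sentiment_label <- function(value) {
  # cut the 0-4 score into the five labels described above
  cut(as.numeric(value),
      breaks = c(-Inf, 0, 1, 2, 3, Inf),
      labels = c("Very negative", "Negative", "Neutral", "Positive", "Very positive"))
}
# e.g. sentiment_label(c(0, 2, 4))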
sentiment = coreNLP_sentiment_parser(obj)
head(sentiment,20)
## status_id
## 1 1387355969882050560
## 2 1386650105382645761
## 3 1387353730463436805
## 4 1387355831918764038
## 5 1387355721210089473
## 6 1387355088709111811
## 7 1387354965065093122
## 8 1387353297644933123
## 9 1387351117919514631
## 10 1387348931324035073
## 11 1387348180908580865
## 12 1387347688090324993
## 13 1387345850209165313
## 14 1387345035927171074
## 15 1387344621823463427
## 16 1387344410891927556
## 17 1386984504368193540
## 18 1386981889471352836
## 19 1387343656852627458
## 20 1386998097629171713
## text
## 1 Ridiculous. Gavean Oscar but didn't care what his speech was? Why award him then? Give it to . She's certainly overdue. Oscars are worse every year. Someone givea hairbrush. 4 no hairbrush.robbed.
## 2 I so wanted to wake up this morning to see a pic of you holding thatfor . At least your film was recognized for Best Costume Makeup.I'm so sorry. I know you would have been gracious.
## 3 This is ridiculous. They gave Hopkins an Oscar but didn't care what his speech would be? Why award him then? Give it to . She's certainly overdue. The Oscars are getting worse every year. Someone givea hairbrush. 4 no hairbrush.
## 4 A less than glittering Oscars ceremony
## 5 Great respect to the legend aka CHADWICK BOSEMAN by not giving academy awardWow!!!applause
## 6 Thank you. For your work for being responsible in a pandemic and for honoring Chadwich Boseman.
## 7 Wishing you a very happy birthdayJi. fromfor yourperformance from the
## 8 Congratulations to Nicholas Becker for hiswin for!
## 9 It's been two days since theand wouldn't it be a waste if those best dressed lists stay on that red carpet? We show you and these Alisters where else you can wear these one of a kindpieces because in our book the awards go to the best outfit repeaters!
## 10 Pandemicconstrained Oscars feature more oratory less music comedy
## 11 And the winner is
## 12 Nice
## 13 WATCH SA documentary My Octopus Teacher' bags an Oscar!
## 14 would have been the first one to congratulate . I am pretty sure that Chadwick Boseman is ashamed because of his dumb fans right now.
## 15 like Hollywood matters
## 16 Howreally choose their
## 17 You Need to See to Believe
## 18 weigh in on bestlooks of all time via
## 19 Chineseborn American filmmakermakes history as the firstwoman the second woman in Oscars 93 years to winAward for Best Director for . She also took the award for Best Picture.
## 20 The thirdOscar winner from 's oeuvre . The execution is flawless which makes this award rest with their team at ! Here's sendingto the team
## sentiment sentimentValue
## 1 Positive 3
## 2 Neutral 2
## 3 Neutral 2
## 4 Neutral 2
## 5 Positive 3
## 6 Neutral 2
## 7 Positive 3
## 8 Neutral 2
## 9 Negative 1
## 10 Negative 1
## 11 Positive 3
## 12 Neutral 2
## 13 Neutral 2
## 14 Neutral 2
## 15 Neutral 2
## 16 Neutral 2
## 17 Neutral 2
## 18 Neutral 2
## 19 Neutral 2
## 20 Positive 3
unique(sentiment$sentiment)
## [1] "Positive" "Neutral" "Negative" "Verypositive" "Verynegative"
sentiment$sentimentValue = sentiment$sentimentValue %>% as.numeric
# Look at the distribution of tweets across sentiment classes
sentiment$sentiment %>% table()
## .
## Negative Neutral Positive Verynegative Verypositive
## 947 2280 1650 9 85
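The same counts can be shown as a bar chart; a quick sketch (not part of the original output):
sentiment %>%
  count(sentiment) %>%                          # one row per sentiment class
  ggplot(aes(x = reorder(sentiment, n), y = n)) +
  geom_col() +
  coord_flip() +
  labs(x = "sentiment", y = "number of tweets")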
df$date = as.Date(df$created_at)
sentiment %>%
merge(df[,c("status_id","source","date")]) %>%
group_by(date) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment)) +
geom_line()
sentiment %>%
merge(zhao[,c("status_id","quoted_location","date")]) %>%
group_by(date,quoted_location) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=quoted_location)) +
geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
# Look at the vocabulary used in positive tweets
sentiment %>%
merge(tokens) %>%
anti_join(stop_words) %>%
filter(!lower_word %in% c('i','the')) %>%
filter(sentiment == "Verypositive" | sentiment =='Positive') %>%
group_by(lower_lemma) %>% #根據lemma分組
summarize(count = n()) %>%
filter(count >5 & count<400)%>%
wordcloud2()
## Joining, by = "word"
# Vocabulary used in negative tweets (commented out; same approach as above)
#sentiment %>%
#merge(tokens) %>%
#anti_join(stop_words) %>%
#filter(!lower_word %in% c('i','the')) %>%
#filter(sentiment == "Verynegative" | sentiment =='Negative') %>%
#group_by(lower_lemma) %>%
#summarize(count = n()) %>%
#filter(count >10 &count<400)%>%
#wordcloud2()
library(sentimentr)
sentimentr scores range from -1 to 1: values below 0 are negative, values above 0 are positive, and 0 is neutral.
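To get a feel for this scale before applying it to the tweets, sentiment() can be run on a couple of toy sentences (illustrative only; these sentences are not part of the original analysis):
# each sentence gets a signed sentiment score; negative wording pushes it below 0
sentiment(get_sentences(c("I absolutely loved the ceremony.",
                          "This was a terrible, boring show.")))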
set.seed(10)
mytext <- get_sentences(tweets$text) # split the tweet text into sentences
x <- sample(tweets$text, 1000, replace = FALSE) # randomly sample 1000 tweets without replacement
sentiment_words <- extract_sentiment_terms(x) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count how often each appears
sentiment_counts[polarity > 0,] # positive words
## words polarity n
## 1: congrats 1.0 29
## 2: please 1.0 14
## 3: messiah 1.0 8
## 4: brilliant 1.0 8
## 5: care 1.0 6
## ---
## 473: technology 0.1 1
## 474: church 0.1 1
## 475: sermon 0.1 1
## 476: collectively 0.1 1
## 477: sex 0.1 1
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
## words polarity n
## 1: black -0.25 44
## 2: stop -0.40 11
## 3: boring -1.00 10
## 4: dead -1.00 10
## 5: hate -0.75 9
## 6: late -0.25 8
## 7: worst -0.50 8
## 8: wait -0.25 7
## 9: enough -0.25 6
## 10: missed -0.50 6
## 11: lost -0.75 6
## 12: bad -0.75 6
## 13: racism -1.00 6
## 14: should have -1.05 6
tweets$date = format(tweets$created_at,'%Y%m%d')
(out = tweets %>% with(
sentiment_by( #document level
get_sentences(text),
list( date)
)
))
plot(out)
(out = tweets %>% filter(source %in% c("Twitter Web Client","Twitter for iPhone","Twitter for Android")) %>% with(
sentiment_by(
get_sentences(text),
list(source, date)
)
))
plot(out)
Chloé Zhao
zhao <- df[grep("Zhao",df$text),]
Grouping the location field
zhao$location[zhao$location %in% c("Hong Kong SAR, China","གངས་རིན་པོ་ཆེ")] <- "China"
zhao$location[zhao$location == "Rice University, Houston, TX"] <- "USA"
zhao$location[zhao$location == "Los Angeles, New York"] <- "USA"
zhao$location[zhao$location == "Virginia, USA"] <- "USA"
zhao$location[zhao$location == "Birmingham, England/ New York"] <- "USA"
zhao$location[zhao$location == "Seattle, WA"] <- "USA"
zhao$location[zhao$location == "Detroit, MI"] <- "USA"
zhao$location[zhao$location == "New York, NY"] <- "USA"
zhao$location[zhao$location == "Boston, MA"] <- "USA"
zhao$location[zhao$location == "Washington, DC"] <- "USA"
zhao$location[zhao$location == "Minnesota, USA"] <- "USA"
zhao$location[zhao$location == "Sacramento, CA"] <- "USA"
zhao$location[zhao$location == "Ann Arbor, MI"] <- "USA"
zhao$location[zhao$location == "New York, USA"] <- "USA"
zhao$location[zhao$location == "Columbus, OH" ] <- "USA"
zhao$location[zhao$location == "Ann Arbor, MI"] <- "USA"
zhao$location[zhao$location == "Mountain House CA / Michigan"] <- "USA"
zhao$location[zhao$location == "Galveston, TX"] <- "USA"
zhao$location[zhao$location == "Atlanta, GA"] <- "USA"
zhao$location[zhao$location == "Pittsburgh, PA, USA"] <- "USA"
zhao$location[zhao$location == "Massachusetts, USA"] <- "USA"
zhao$location[zhao$location == "Los Angeles, CA"] <- "USA"
zhao$location[zhao$location == "New York, NY"] <- "USA"
zhao$location[zhao$location == "New York & Miami "] <- "USA"
zhao$location[zhao$location %in% c("United States","Across the U.S.","Fort Lauderdale, FL", "San Francisco Bay Area","St. Louis" ,"North Carolina","minnesota","New York & Miami","HQ New York City - Global","Boston","Sioux Falls SD")] <- "USA"
unique(zhao$location)
## [1] "india"
## [2] ""
## [3] "China"
## [4] "USA"
## [5] "England, United Kingdom"
## [6] "India"
## [7] "Jaipur, Rajasthan"
## [8] "Republic of the Philippines"
## [9] "Australia"
## [10] "Worldwide "
## [11] "Berne, Switzerland"
## [12] "Mumbai"
## [13] "France"
## [14] "England "
## [15] "Mumbai, India"
## [16] "PA/NJ by way of The DMV"
## [17] "Chennai, India"
## [18] "New Delhi, India"
## [19] "Elgiva Theatre, Chesham"
## [20] "Wherever you get your Podcasts"
## [21] "3rd Rock from the Sun"
## [22] "Singur, West Bengal"
## [23] "Global"
## [24] "United Arab Emirates"
## [25] "Ahmedabad Gujarat"
## [26] "Everywhere we learn & thrive!"
## [27] "Adelaide, Australia"
## [28] "Aylesford, England"
## [29] "Delhi"
## [30] "Liverpool, UK"
## [31] "San Jose, CA "
## [32] "San Antonio/Houston, TX"
## [33] "Egypt"
## [34] "London"
## [35] "Vancouver 溫哥華, Canada 加拿大 \U0001f1e8\U0001f1e6"
## [36] "Singapore"
## [37] "Chester, England"
## [38] "Saudi Arabia"
## [39] "London, England"
## [40] "Toronto, Canada"
## [41] "Rawalpindi/Islamabad"
## [42] "Canada"
## [43] "Mumbai India"
## [44] "Leeds, England"
## [45] "Pune"
## [46] "Kerala"
## [47] "#RemoteWork #Boston "
## [48] "Ujjain"
## [49] "London UK"
## [50] "Dhaka, Bangladesh"
zhao$location[zhao$location == "New Delhi, India"] <- "India"
zhao$location[zhao$location == "Mumbai, India"] <- "India"
zhao$location[zhao$location == "San Jose, CA "] <- "USA"
zhao$location[zhao$location == "San Antonio/Houston, TX"] <- "USA"
zhao$location[zhao$location %in% c("india","Mumbai","Chennai, India" ,"Singur, West Bengal","Ahmedabad Gujarat" ,"Delhi" ,"Mumbai India","Pune" ,"Kerala","Ujjain","Dhaka, Bangladesh")] <- "India"
zhao$location[zhao$location %in% c("England, United Kingdom","Elgiva Theatre, Chesham","England ","Aylesford, England","Liverpool, UK" ,"London","Chester, England","London, England","Leeds, England","#RemoteWork #Boston ","London UK")] <- "UK"
zhao$location[zhao$location %in% c("Jaipur, Rajasthan")] <- "Bengal"
zhao$location[zhao$location %in% c("Republic of the Philippines")] <- "Philippines"
zhao$location[zhao$location %in% c("Vancouver 溫哥華, Canada 加拿大 \U0001f1e8\U0001f1e6" ,"Toronto, Canada")] <- "Canada"
zhao$location[zhao$location %in% c("Adelaide, Australia")] <- "Australia"
zhao$location[zhao$location %in% c("Berne, Switzerland" )] <- "Switzerland"
zhao$location[zhao$location %in% c("" ,"Worldwide " ,"PA/NJ by way of The DMV" , "Wherever you get your Podcasts","3rd Rock from the Sun","Global" ,"Everywhere we learn & thrive!")] <- "others"
zhao$date = as.Date(zhao$created_at)
sentiment %>%
merge(zhao[,c("status_id","location","date")]) %>%
group_by(date,location) %>%
summarise(avg_sentiment = mean(sentimentValue,na.rm=T)) %>%
ggplot(aes(x=date,y=avg_sentiment,color=location)) +
geom_line()
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
zhao_merge <- tokens%>%
merge(zhao[,c("status_id","location","created_at")])
Reformatting the date
zhao_merge$num <- as.numeric(zhao_merge$created_at) # seconds since epoch
zhao_merge$day <- substring(zhao_merge$created_at, 9, 10) # day of month from "YYYY-MM-DD HH:MM:SS"
zhao_merge$hour <- substring(zhao_merge$created_at, 12, 13) # hour
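Since created_at is a POSIXct column, the same fields can also be pulled out with format(), which does not depend on the fixed width of the printed datetime. An equivalent sketch:
zhao_merge$day  <- format(zhao_merge$created_at, "%d")  # day of month
zhao_merge$hour <- format(zhao_merge$created_at, "%H")  # hour (24-hour clock)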
The AFINN sentiment lexicon
sent_zhao <- zhao_merge %>%
inner_join(get_sentiments("afinn"))%>%
group_by(location)%>%
summarise(score = sum(value))%>%
mutate(location = reorder(location, score))%>%
ggplot(aes(location, score)) +
geom_col(show.legend = FALSE)+
coord_flip()
## Joining, by = "word"
sent_zhao
Use the Bing lexicon to find positive and negative words, then plot them
bing_zhao <-
zhao_merge %>%
inner_join(get_sentiments("bing")) %>%
group_by(word,sentiment) %>%
count(word, sort = TRUE)%>%
summarise(sum = sum(n)) %>%
arrange(desc(sum)) %>%
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
bing_zhao
## word sentiment sum
## 1 win positive 44
## 2 best positive 34
## 3 award positive 14
## 4 won positive 12
## 5 winners positive 9
## 6 congratulations positive 7
## 7 winning positive 7
## 8 wins positive 7
## 9 winner positive 6
## 10 goodness positive 4
## 11 incredible positive 4
## 12 victory positive 4
## 13 work positive 4
## 14 prize positive 3
## 15 amazing positive 2
## 16 awards positive 2
## 17 celebrated positive 2
## 18 courage positive 2
## 19 faith positive 2
## 20 glorious positive 2
## 21 great positive 2
## 22 lies negative 2
## 23 missed negative 2
## 24 success positive 2
## 25 well positive 2
## 26 accolades positive 1
## 27 achievement positive 1
## 28 approval positive 1
## 29 awarded positive 1
## 30 beautiful positive 1
## 31 breaking negative 1
## 32 breakthroughs positive 1
## 33 bright positive 1
## 34 brilliant positive 1
## 35 complaining negative 1
## 36 creative positive 1
## 37 difficult negative 1
## 38 disappointing negative 1
## 39 dynamic positive 1
## 40 evade negative 1
## 41 excited positive 1
## 42 fraught negative 1
## 43 fucking negative 1
## 44 glad positive 1
## 45 good positive 1
## 46 happy positive 1
## 47 immature negative 1
## 48 inspiring positive 1
## 49 insulting negative 1
## 50 overdue negative 1
## 51 proud positive 1
## 52 right positive 1
## 53 shit negative 1
## 54 silent positive 1
## 55 snagged negative 1
## 56 stellar positive 1
## 57 strict negative 1
## 58 struck negative 1
## 59 talent positive 1
## 60 thank positive 1
## 61 top positive 1
## 62 toxic negative 1
## 63 trophy positive 1
## 64 visionary positive 1
## 65 wrong negative 1
bing_zhao %>%
top_n(20,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
library(sentimentr)
set.seed(10)
mytext <- get_sentences(zhao$text) # split the tweet text into sentences
sentiment_words <- extract_sentiment_terms(mytext) # extract the sentiment-bearing words
sentiment_counts <- attributes(sentiment_words)$counts # count how often each appears
sentiment_counts[polarity > 0,] # positive words
sentiment_counts[polarity < 0,] %>% arrange(desc(n)) %>% top_n(10) # most frequent negative words
## Selecting by n
## words polarity n
## 1: black -0.25 44
## 2: stop -0.40 11
## 3: boring -1.00 10
## 4: dead -1.00 10
## 5: hate -0.75 9
## 6: late -0.25 8
## 7: worst -0.50 8
## 8: wait -0.25 7
## 9: enough -0.25 6
## 10: missed -0.50 6
## 11: lost -0.75 6
## 12: bad -0.75 6
## 13: racism -1.00 6
## 14: should have -1.05 6