The Oscars ceremony held on 4/25 in Los Angeles has just wrapped up. Ceremonies like this usually draw two kinds of reactions: some winners are seen as thoroughly deserving, while others become targets of criticism. We therefore want to measure the positive and negative sentiment toward the winners and dig deeper into how people talk about them.
To that end we analyze the content as follows: (1) NER (Country, Person); (2) for the people mentioned, sentiment analysis, word clouds, frequencies of positive and negative words, and so on.
library(wordcloud2)
library(ggplot2)
library(scales)
library(rtweet)
library(dplyr)
library(xml2)
library(httr)
library(jsonlite)
library(magrittr)
library(data.tree)
library(tidytext)
library(stringr)
library(DiagrammeR)
library(data.table)
load("C:/Users/emma/Desktop/course/109_2/social_media/midterm/oscars2011_clean.RData")
(1). Twitter API setup: fetch tweets with rtweet
#app = '2021_sma'
#consumer_key = '71QW6sEHM2cRfYQVXPueSnXt7'
#consumer_secret = 'XLCbvKGF9WbDWAfcIAshql9LBwlyRaG6ZNx2zh8TaFzNaBqNob'
#access_token = '1363396212112547841-VA58XSsunKG0DLnE4qVbw2ncwGDmTW'
#access_secret = 'X4EhjmzZ24IvpU56ZfyzHFwLpLeUQ8ZShbR6OwTjHfHFU'
#twitter_token <- create_token(app,consumer_key, consumer_secret,
#access_token, access_secret,set_renv = FALSE)
#Consumer Keys: identify who you are
#Authentication Tokens: the authorization granted to you
(2). Fetch tweets by keyword + hashtag: #Oscars2021
# search keyword #Oscars2021
#key = c("#Oscars2021")
# fetch 5,000 tweets, excluding retweets
#tweets = search_tweets(key,lang="en",n=5000,include_rts = FALSE,token = twitter_token)
(3). Clean the tweet text
## data cleaning function
#clean = function(txt) {
#txt = iconv(txt, "latin1", "ASCII", sub="") # normalize the text encoding
#txt = gsub("(@|#)\\w+", "", txt) # remove @ or # followed by word characters (mentions and hashtags)
#txt = gsub("(http|https)://.*", "", txt) # remove URLs (.: any character, *: 0+ times)
#txt = gsub("[ \t]{2,}", "", txt) # remove runs of 2+ spaces or tabs
#txt = gsub("\\n"," ",txt) # remove line breaks
#txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+: 1+ times)
#txt = gsub("^\\s+|\\s+$","",txt) # trim leading/trailing whitespace
#txt = gsub("&.*;","",txt) # remove HTML entity codes
#txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces and ?!.' (drops emoji etc.)
#txt }
#tweets$text = clean(tweets$text) # apply the cleaning to the text column
#df = data.frame()
#df = rbind(df,tweets) # convert to data frame
#df = df[!duplicated(df[,"status_id"]),] # remove duplicate tweets
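To illustrate what these rules do, here is a minimal check on a made-up tweet (assumes the clean() function above has been defined, i.e. the chunk was run uncommented):
clean("Congrats @chloezhao! #Oscars2021 Best Director https://t.co/abc 🎉")
## roughly "Congrats ! Best Director" — the mention, the hashtag, the URL and the emoji are stripped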
(4). Number of tweets per day
as.Date(df$created_at) %>% table() %>% as.data.frame() %>%
  ggplot(aes(x=., y=Freq, fill=.)) +
  geom_col() + labs(x="Date", y="Count", title="Daily tweet count")
Data range: 2021/04/26 ~ 2021/04/28, 4,781 tweets in total.
4/26 has the largest number of tweets.
load("C:/Users/emma/Desktop/course/109_2/social_media/midterm/oscars2011_cleanNER.RData")
Server side: start the CoreNLP server first. Open a terminal in the CoreNLP directory and run: java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 15000
# build the CoreNLP API URL: convert the local address into a URL the CoreNLP service accepts
generate_API_url <- function(host, port="9000",
                             tokenize.whitespace="false", annotators=""){ # tokenization is not based on whitespace alone
  url <- sprintf('http://%s:%s/?properties={"tokenize.whitespace":"%s","annotators":"%s"}',
                 host, port, tokenize.whitespace, annotators)
  url <- URLencode(url)
  url
}
# specify where the service lives
host = "127.0.0.1"
generate_API_url(host)
# call the CoreNLP API
call_coreNLP <- function(server_host, text, host="localhost", language="eng",
                         tokenize.whitespace="true", ssplit.eolonly="true",
                         annotators=c("tokenize","ssplit","pos","lemma","ner","parse","sentiment")){
  # assume two CoreNLP servers: one for English (port 9000) and one for Chinese (port 9001)
  port <- ifelse(language=="eng", 9000, 9001)
  # build the API URL
  url <- generate_API_url(server_host, port=port,
                          tokenize.whitespace=tokenize.whitespace,
                          annotators=paste0(annotators, collapse = ','))
  result <- POST(url, body = text, encode = "json")
  doc <- httr::content(result, "parsed", "application/json", encoding = "UTF-8")
  return(doc)
}
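As a quick smoke test (our own example sentence; assumes the server described above is running on port 9000), the returned object can be inspected directly:
doc <- call_coreNLP(host, "Chloe Zhao won Best Director at the Oscars.")
sapply(doc$sentences[[1]]$tokens, function(x) x$ner) # one NER tag per token; "Zhao" should come back as PERSON (model-dependent)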
# run the documents through the CoreNLP service
coreNLP <- function(data, host){
  # send each document to CoreNLP in turn; each response comes back as JSON
  # keep the processed results as R objects
  result <- apply(data, 1, function(x){
    object <- call_coreNLP(host, x['text'])
    list(doc=object, data=x)
  })
  return(result)
}
Parse the token results out of the returned objects and output them in tidy data format.
coreNLP_tokens_parser <- function(coreNLP_objects){
  result <- do.call(rbind, lapply(coreNLP_objects, function(obj){
    original_data <- obj$data
    doc <- obj$doc
    # for each sentence
    sentences <- doc$sentences
    sen <- sentences[[1]]
    tokens <- do.call(rbind, lapply(sen$tokens, function(x){
      result <- data.frame(word=x$word, lemma=x$lemma, pos=x$pos, ner=x$ner)
      result
    }))
    tokens <- original_data %>%
      t() %>%
      data.frame() %>%
      select(-text) %>%
      slice(rep(1:n(), each = nrow(tokens))) %>%
      bind_cols(tokens)
    tokens
  }))
  return(result)
}
gc() # free unused memory
t0 = Sys.time()
obj = df[,c(2,5)] %>% filter(text != "") %>% coreNLP(host) # run against the local server
# the object passed to coreNLP must be a data.frame with a text column
Sys.time() - t0 # elapsed time
#Time difference of 28 mins
# save the results so they can be reloaded from RData later
#tokens = coreNLP_tokens_parser(obj)
#save.image("coreNLP_all.RData")
unique(tokens$ner)
## [1] O NUMBER ORGANIZATION ORDINAL
## [5] MISC TITLE PERSON NATIONALITY
## [9] LOCATION SET TIME DATE
## [13] DURATION PERCENT CITY IDEOLOGY
## [17] COUNTRY CAUSE_OF_DEATH STATE_OR_PROVINCE MONEY
## [21] URL RELIGION CRIMINAL_CHARGE
## 23 Levels: O NUMBER ORGANIZATION ORDINAL MISC TITLE PERSON ... CRIMINAL_CHARGE
length(unique(tokens$word[tokens$ner != "O"]))
## [1] 2399
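To see how those entity words spread across NER types, a quick tally could be added (a sketch, not part of the original output):
tokens %>% filter(ner != "O") %>% count(ner, sort = TRUE) # number of tagged tokens per entity type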
tokens %>%
  filter(ner == "COUNTRY") %>% # keep tokens tagged as COUNTRY
  group_by(lower_word) %>% # group by (lower-cased) word
  summarize(count = n()) %>% # count each group
  top_n(n = 13, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is COUNTRY)") +
  geom_text(aes(label = count), colour = "red") +
  theme(text = element_text(size = 14)) +
  coord_flip()
america is mentioned most often, the United States being the host of the Oscars; china comes second, because Chinese-born Chloé Zhao became the second Chinese filmmaker (after Ang Lee) to win Best Director and the first Asian woman ever to win it.
india: we speculate that India was mentioned so often because five Indian film figures were honoured at this year's Oscars, one of whom was the first Indian ever to win an Oscar; she passed away last year, so many people on Twitter expressed regret that she could not be recognised in person. Two late Indian stars also appeared in the In Memoriam reel, and users posted tributes to them as well.
tokens %>%
  filter(ner == "PERSON") %>% # keep tokens tagged as PERSON
  group_by(lower_word) %>% # group by (lower-cased) word
  summarize(count = n()) %>% # count each group
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is PERSON)") +
  geom_text(aes(label = count), colour = "red") +
  theme(text = element_text(size = 14)) +
  coord_flip()
anthony, hopkins are mentioned most often; he is this year's Best Actor winner.
chadwick, boseman come second; he is widely seen as the great Best Actor omission, having risen to fame with Black Panther before passing away young last year.
zhao is third, the Chinese director discussed above.
youn: Youn Yuh-jung, for Minari (a US production built around a Korean-American team), became the first Asian Best Supporting Actress in more than 60 years since Miyoshi Umeki in Sayonara, and the first Korean ever to win an Oscar acting award. > Next we explore the people surfaced by NER in more detail (Anthony Hopkins, Chadwick Boseman, Youn Yuh-jung, Chloé Zhao).
First, we use the co-occurrence network from the workflow platform to see what the main topics around Anthony Hopkins are.
「anthony」「hopkins」co-occur with「amazing」「performance」in the film「father」, so he「win」best「actor」, and many felt he「deserved」the award.
On the other hand, because「chadwick」「boseman」died so young, some viewers had hoped the award would go to his outstanding performance instead. One amusing side note: due to a misstep by the Oscars producers, the Best Actor winner could not give his acceptance speech on time (「late」); unexpectedly, in his acceptance message he paid「tribute」to chadwick boseman.
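The co-occurrence graphs referenced here were produced on the workflow platform; a rough R equivalent could be sketched with the widyr and ggraph packages (our assumption, not the original pipeline; the frequency and correlation thresholds are arbitrary):
library(widyr)
library(igraph)
library(ggraph)

# words per tweet, minus English stop words
tweet_words <- df %>% unnest_tokens(word, text) %>% anti_join(stop_words)

word_cors <- tweet_words %>%
  group_by(word) %>% filter(n() >= 30) %>% ungroup() %>%   # keep reasonably frequent words
  pairwise_cor(word, status_id, sort = TRUE)               # phi correlation of words appearing in the same tweet

word_cors %>%
  filter(correlation > .3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation)) +
  geom_node_point() +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()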
Sentiment analysis with the AFINN lexicon: each word is scored on a scale from -5 to +5.
# Best Actor: Anthony Hopkins
Anthony_df = df[str_detect(df$text, regex("(.Anthony|^Anthony|.Hopkins|Hopkins$|.father)")),]
Anthony_df0427 = subset(Anthony_df, as.Date(created_at)=="2021-04-27")
Anthony_unnest = Anthony_df0427 %>% unnest_tokens(word, text) %>% anti_join(stop_words)
## Joining, by = "word"
A = Anthony_unnest %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(time) %>% summarise(sentiment = sum(value)) %>% mutate(actor="Anthony")
## Joining, by = "word"
Chadwick_df = df[str_detect(df$text, regex("(.Chadwick|^Chadwick|.Boseman|Boseman$|.Ma Rainey's Black Bottom|^Ma Rainey's Black Bottom)")),]
Chadwick_df0427 = subset(Chadwick_df, as.Date(created_at)=="2021-04-27")
Chadwick_unnest = Chadwick_df0427 %>% unnest_tokens(word, text) %>% anti_join(stop_words)
## Joining, by = "word"
C = Chadwick_unnest %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(time) %>% summarise(sentiment = sum(value)) %>% mutate(actor="Chadwick")
## Joining, by = "word"
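For reference, AFINN assigns each word an integer score between -5 and +5, and the hourly curves plotted below are simply the sums of those per-word values. A quick look at a few example words (a sketch; rows appear only for words actually present in the lexicon):
get_sentiments("afinn") %>% filter(word %in% c("amazing", "win", "sad", "awful"))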
bind_rows(A,C)%>% ggplot(aes(x=time,y=sentiment,group=actor,col=actor))+geom_line()
At 04:00 on 4/27 the tweets mentioning anthony are strongly positive, but at 05:00 the sentiment of tweets mentioning chadwick drops sharply and even turns negative. We suspect this split reflects how torn people were about this year's Best Actor: both the veteran anthony and the late chadwick had their own supporters, so whoever won, the sentiment curves were bound to look this conflicted.
Anthony_unnest = Anthony_df %>% unnest_tokens(word, text) %>% anti_join(stop_words)
## Joining, by = "word"
Chadwick_unnest = Chadwick_df %>% unnest_tokens(word, text) %>% anti_join(stop_words)
## Joining, by = "word"
A_bing = Anthony_unnest %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>% mutate(actor="Anthony")
## Joining, by = "word"
C_bing = Chadwick_unnest %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>% mutate(actor="Chadwick")
## Joining, by = "word"
rbind(A_bing,C_bing) %>%
group_by(sentiment,actor) %>%
top_n(5,wt=n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_grid(sentiment~actor,scales = "free_y",space = "free") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
Among the negative words, tweets about Anthony additionally feature unfortunate, sad, blame and awkward, while tweets about Chadwick (Black Panther) add disrespect, upset and snub. Among the positive words, Anthony's side adds congratulations, while Chadwick's adds incredible.
Chloé Zhao (趙婷)
The first Asian woman to win the Oscar for Best Director.
zhao_df = df[str_detect(df$text, regex("(.Chloé|^Chloé|.Chloe|^Chloe|.Zhao|Zhao$|.Nomadland|^Nomadland|.director|^director)")),]
# tokenize, then filter out stop words
df5 = zhao_df %>% unnest_tokens(word, text) %>% anti_join(stop_words)
## Joining, by = "word"
df5$date = df5$created_at %>% as.Date() # add a date column
word_count <- df5 %>%
  select(date, word) %>%
  group_by(date, word) %>%
  summarise(count = n()) %>% # use summarise to count each word per day
  filter(!word=="chloé" & !word=="chloe" & !word=="zhao" & !word=="oscar" & !word=="oscars") %>%
  filter(count > 10) %>% # drop words that appear too rarely
  arrange(desc(count))
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
word_count[,-1] %>% wordcloud2()
「director」「woman」「history」「frances,mcdormand」「chinese」「yuhjung」are the most frequent words in tweets mentioning Zhao. Since a word cloud only shows frequencies and not how words relate to each other, we turn to the co-occurrence network to explain the links.
“Co-occurrence network for Chloé Zhao”
The terms most correlated with「zhao」「chloe」are「director」「second」「women」「nomadland」「history」; put together, Zhao is the second Chinese filmmaker to win Best Director.
「history」is also linked to「youn」「yuhjung」, because both Youn Yuh-jung and Zhao made history that night. Finally,「nomadland」is linked to「frances」「mcdormand」, since Frances McDormand is the film's lead and won this year's Best Actress.
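Using the same pairwise_cor sketch from the Anthony/Chadwick section, the terms most correlated with "zhao"/"chloe" could also be listed directly (again our sketch, not the platform output):
word_cors %>%
  filter(item1 %in% c("zhao", "chloe")) %>%
  group_by(item1) %>%
  slice_max(correlation, n = 10)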
zhao <- df[grep("Zhao", df$text),]
zhao$location[zhao$location %in% c("Hong Kong SAR, China","གངས་རིན་པོ་ཆེ")] <- "China"
zhao$location[zhao$location == "Rice University, Houston, TX"] <- "USA"
zhao$location[zhao$location == "Los Angeles, New York"] <- "USA"
zhao$location[zhao$location == "Virginia, USA"] <- "USA"
zhao$location[zhao$location == "Birmingham, England/ New York"] <- "USA"
zhao$location[zhao$location == "Seattle, WA"] <- "USA"
zhao$location[zhao$location == "Detroit, MI"] <- "USA"
zhao$location[zhao$location == "New York, NY"] <- "USA"
zhao$location[zhao$location == "Boston, MA"] <- "USA"
zhao$location[zhao$location == "Washington, DC"] <- "USA"
zhao$location[zhao$location == "Minnesota, USA"] <- "USA"
zhao$location[zhao$location == "Sacramento, CA"] <- "USA"
zhao$location[zhao$location == "Ann Arbor, MI"] <- "USA"
zhao$location[zhao$location == "New York, USA"] <- "USA"
zhao$location[zhao$location == "Columbus, OH"] <- "USA"
zhao$location[zhao$location == "Mountain House CA / Michigan"] <- "USA"
zhao$location[zhao$location == "Galveston, TX"] <- "USA"
zhao$location[zhao$location == "Atlanta, GA"] <- "USA"
zhao$location[zhao$location == "Pittsburgh, PA, USA"] <- "USA"
zhao$location[zhao$location == "Massachusetts, USA"] <- "USA"
zhao$location[zhao$location == "Los Angeles, CA"] <- "USA"
zhao$location[zhao$location == "New York & Miami "] <- "USA"
zhao$location[zhao$location %in% c("United States","Across the U.S.","Fort Lauderdale, FL","San Francisco Bay Area","St. Louis","North Carolina","minnesota","New York & Miami","HQ New York City - Global","Boston","Sioux Falls SD")] <- "USA"
unique(zhao$location)
## [1] "London, England" ""
## [3] "Chester, England" "USA"
## [5] "Adelaide, Australia" "Global"
## [7] "Ahmedabad Gujarat" "England "
## [9] "Vancouver 溫哥華, Canada 加拿大 <U+0001F1E8><U+0001F1E6>" "China"
## [11] "Mumbai, India" "San Antonio/Houston, TX"
## [13] "Pune" "Australia"
## [15] "Saudi Arabia" "india"
## [17] "India" "3rd Rock from the Sun"
## [19] "Kerala" "Berne, Switzerland"
## [21] "Aylesford, England" "PA/NJ by way of The DMV"
## [23] "Worldwide " "Singur, West Bengal"
## [25] "Chennai, India" "Toronto, Canada"
## [27] "New Delhi, India" "Singapore"
## [29] "Canada" "France"
## [31] "Elgiva Theatre, Chesham" "Mumbai India"
## [33] "Liverpool, UK" "Delhi"
## [35] "Wherever you get your Podcasts" "England, United Kingdom"
## [37] "Mumbai" "Jaipur, Rajasthan"
## [39] "Egypt" "United Arab Emirates"
## [41] "London" "Leeds, England"
## [43] "Everywhere we learn & thrive!" "<U+0F42><U+0F44><U+0F66><U+0F0B><U+0F62><U+0F72><U+0F53><U+0F0B><U+0F54><U+0F7C><U+0F0B><U+0F46><U+0F7A>"
## [45] "Rawalpindi/Islamabad" "Republic of the Philippines"
## [47] "San Jose, CA "
zhao$location[zhao$location == "New Delhi, India"] <- "India"
zhao$location[zhao$location == "Mumbai, India"] <- "India"
zhao$location[zhao$location == "San Jose, CA "] <- "USA"
zhao$location[zhao$location == "San Antonio/Houston, TX"] <- "USA"
zhao$location[zhao$location %in% c("india","Mumbai","Chennai, India","Singur, West Bengal","Ahmedabad Gujarat","Delhi","Mumbai India","Pune","Kerala","Ujjain","Dhaka, Bangladesh")] <- "India"
zhao$location[zhao$location %in% c("England, United Kingdom","Elgiva Theatre, Chesham","England ","Aylesford, England","Liverpool, UK","London","Chester, England","London, England","Leeds, England","#RemoteWork #Boston ","London UK")] <- "UK"
zhao$location[zhao$location %in% c("Jaipur, Rajasthan")] <- "Bengal"
zhao$location[zhao$location %in% c("Republic of the Philippines")] <- "Philippines"
zhao$location[zhao$location %in% c("Vancouver 溫哥華, Canada 加拿大 \U0001f1e8\U0001f1e6","Toronto, Canada")] <- "Canada"
zhao$location[zhao$location %in% c("Adelaide, Australia")] <- "Australia"
zhao$location[zhao$location %in% c("Berne, Switzerland")] <- "Switzerland"
zhao$location[zhao$location %in% c("","Worldwide ","PA/NJ by way of The DMV","Wherever you get your Podcasts","3rd Rock from the Sun","Global","Everywhere we learn & thrive!")] <- "others"
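As a side note, the long run of one-off recodes above could be collapsed into a single pattern-based mapping; a sketch of that idea (the patterns are illustrative only and do not reproduce every rule above):
zhao <- zhao %>%
  mutate(location = case_when(
    str_detect(location, ", [A-Z]{2}$|USA|United States|Boston|St. Louis") ~ "USA",
    str_detect(location, "India|Mumbai|Delhi|Pune|Kerala|Bengal|Gujarat")  ~ "India",
    str_detect(location, "England|London|Liverpool|Leeds|UK")              ~ "UK",
    str_detect(location, "Canada|Vancouver|Toronto")                       ~ "Canada",
    location == "" | str_detect(location, "Global|Worldwide|Everywhere")   ~ "others",
    TRUE                                                                   ~ location
  ))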
sent_zhao <- zhao %>%
  unnest_tokens(word, text) %>%
  anti_join(stop_words) %>%
  inner_join(get_sentiments("afinn")) %>%
  group_by(location) %>%
  summarise(score = round(mean(value), 2)) %>%
  mutate(location = reorder(location, score)) %>%
  ggplot(aes(location, score, fill = location)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  geom_label(aes(label=score), size=4, fill="white")
## Joining, by = "word"
## Joining, by = "word"
sent_zhao
This chart shows the distribution of sentiment scores by country for tweets mentioning Zhao. Most countries show strongly positive sentiment when talking about her, but China, instead of celebrating a win by one of its own, shows a comparatively low aggregate score. Searching Taiwanese news, we found that Zhao had become the target of a boycott in China, so we pulled additional news and forum posts about her from the text-analysis platform.
We collected Taiwanese data from the platform (the movie boards on PTT and Dcard, plus the cross-strait sections of several news outlets), but only 27 articles were retrieved.
# read in the articles and comments
MetaData = fread('../midterm/0501.csv', encoding = 'UTF-8')
Reviews = fread('../midterm/0501_articleReviews.csv', encoding = 'UTF-8')
clean_modi = function(txt) {
  txt = gsub("(http|https)://.*", "", txt) # remove URLs (.: any character, *: 0+ times)
  txt = gsub("[ \t]{2,}", "", txt) # remove runs of 2+ spaces or tabs
  txt = gsub("\\n"," ",txt) # remove line breaks
  txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+: 1+ times)
  txt = gsub("^\\s+|\\s+$","",txt) # trim leading/trailing whitespace
  txt = gsub("&.*;","",txt) # remove HTML entity codes
  txt = gsub("[a-zA-Z0-9?!. ']","",txt) # strip English letters, digits, spaces and ?!.' (keep only the Chinese text)
  txt
}
Initialize the word-segmentation engine
library(jiebaR)
## Loading required package: jiebaRD
jieba_tokenizer <- worker(user="user_dict.txt")
Define a custom tokenizer function
chinese_tokenizer <- function(text) {
  lapply(text, function(x) {
    tokens <- segment(x, jieba_tokenizer) # segment the text first
    return(tokens)
  })
}
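A quick sanity check on a sample sentence of our own (the exact segmentation depends on user_dict.txt):
chinese_tokenizer("趙婷獲得奧斯卡最佳導演獎")
## e.g. [[1]] "趙婷" "獲得" "奧斯卡" "最佳" "導演" "獎"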
MetaData$artContent = clean_modi(MetaData$artContent)
Reviews$cmtContent = clean_modi(Reviews$cmtContent)
# combine the tokenized articles and comments
MToken = MetaData %>% unnest_tokens(word, artContent, chinese_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, chinese_tokenizer)

# bind the data together
data <- rbind(MToken[,c("artDate","artUrl", "word")], RToken[,c("artDate","artUrl", "word")])

# format the date column
data$artDate = data$artDate %>% as.Date("%Y/%m/%d")
# drop single-character tokens
data_select = data %>%
  filter(nchar(.$word) > 1)

# query terms that are frequent but uninformative here
nomeaing = c("中國","趙婷","奧斯卡","中國人","美國")
stop_words = fread('../midterm/stop_words.txt', encoding = 'UTF-8') %>% rename(., "word"="__")

data_select %>% count(word, sort=T) %>% anti_join(stop_words) %>%
  filter(!(word %in% nomeaing), n >= 10) %>%
  wordcloud2()
## Joining, by = "word"
“Word cloud for Chloé Zhao”
Negative terms such as「辱華」(insulting China),「封殺」(blacklisting),「政治」(politics) and「批評」(criticism) appear in the word cloud. Yet mainstream media in the US, the UK, Singapore, South Korea and elsewhere congratulated Zhao, praising her as "a maker of Hollywood history" who "rewrote the American film tradition". On the same day, however, the name Chloé Zhao was almost invisible on the mainland Chinese internet, thin as vapor. Zhao is treated as "sensitive" apparently because of the earlier "insulting China" controversy: netizens dug up her 2013 interview with the American film magazine Filmmaker, in which she said China was a place "where there are lies everywhere", and labeled her as insulting China.
Anyone curious enough to look up the original Filmmaker article will find, from the surrounding context, that Zhao was simply recounting to the reporter why she chose to study political science in the West; the so-called criticism of China was only the lead-in: it went back to her teenage years in China, a place where, as she put it, there were lies everywhere and she felt she would never get out; much of the information she received was not true, which made her rebellious toward her family and her background. She later went to England and relearned her own country's history, and studying political science became her way of figuring out what is real: arm yourself with information, and then challenge the information itself.
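The presence of those controversy-related terms can be checked directly against the token counts (a small sketch of our own; the term list is just the four words named above):
data_select %>%
  filter(word %in% c("辱華", "封殺", "政治", "批評")) %>%
  count(word, sort = TRUE)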
zhao_counts = df5 %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE) %>%
  ungroup()
## Joining, by = "word"
zhao_counts %>%
  group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
The positive and negative sentiment words confirm it again: tweets mentioning Zhao on Twitter are dominated by positive words, and negative words barely appear.
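To put a number on that claim, the positive and negative totals can be tallied from zhao_counts (a small sketch of our own):
zhao_counts %>% group_by(sentiment) %>% summarise(total = sum(n))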
Youn Yuh-jung (尹汝貞)
The first South Korean actress in history to win the Oscar for Best Supporting Actress.
Youn_df = df[str_detect(df$text, regex("(.Youn|^Youn|.Yuh-jung|^Yuh-jung|.yuhjung|yuhjung$|.Minari|^Minari|.minari|^minari)")),]
# tokenize, then filter out stop words
df6 = Youn_df %>% unnest_tokens(word, text) %>% anti_join(stop_words)
## Joining, by = "word"
df6$date = df6$created_at %>% as.Date() # add a date column
word_count <- df6 %>%
  select(date, word) %>%
  group_by(date, word) %>%
  summarise(count = n()) %>% # use summarise to count each word per day
  filter(!word=="youn" & !word=="yuhjung" & !word=="oscar" & !word=="oscars") %>%
  filter(count > 5) %>% # drop words that appear too rarely
  arrange(desc(count))
## `summarise()` has grouped output by 'date'. You can override using the `.groups` argument.
word_count[,-1] %>% wordcloud2()
“Word cloud for Youn Yuh-jung”
She received her award from Best Supporting Actress presenter Brad Pitt; nomadland and its director Chloé Zhao (zhao chloe) also show up among the related terms.
In her word cloud we can see「minari」(the film she starred in),「netflix」(co-produced with Korea), and「brad」「pitt」「smell」, which refer to a small episode at the ceremony: Brad Pitt presented the Best Supporting Actress award and posed for photos with Youn afterwards, and foreign media kept asking her what Brad Pitt smelled like (a rather baffling question from the press).
bing_Youn = df6 %>%
  inner_join(get_sentiments("bing")) %>%
  count(word, sentiment, sort = TRUE)
## Joining, by = "word"
bing_Youn %>%
  group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
coord_flip()
## Selecting by n
load("C:/Users/emma/Desktop/course/109_2/social_media/midterm/coreNLPRedcarpet.RData")
# key = c("#Oscars")
# context = "red carpet"
# q = paste(c(key,context),collapse=" AND ")
# tweets = search_tweets(q,lang="en",n=4000,include_rts = FALSE,token = twitter_token)
# ## data cleaning function
# clean = function(txt) {
# txt = iconv(txt, "latin1", "ASCII", sub="") # normalize the text encoding
# txt = gsub("(@|#)\\w+", "", txt) # remove @ or # followed by word characters (mentions and hashtags)
# txt = gsub("(http|https)://.*", "", txt) # remove URLs (.: any character, *: 0+ times)
# txt = gsub("[ \t]{2,}", "", txt) # remove runs of 2+ spaces or tabs
# txt = gsub("\\n"," ",txt) # remove line breaks
# txt = gsub("\\s+"," ",txt) # collapse one or more whitespace characters (+: 1+ times)
# txt = gsub("^\\s+|\\s+$","",txt) # trim leading/trailing whitespace
# txt = gsub("&.*;","",txt) # remove HTML entity codes
# txt = gsub("[^a-zA-Z0-9?!. ']","",txt) # keep only letters, digits, spaces and ?!.' (drops emoji etc.)
# txt }
#
# tweets$text = clean(tweets$text) # apply the cleaning to the text column
#
# df = data.frame()
#
# df = rbind(df,tweets) # convert to data frame
#
# df = df[!duplicated(df[,"status_id"]),] # remove duplicate tweets
#tokens = coreNLP_tokens_parser(obj)
#save.image("redcarpet_tokens.RData")
tokens$lower_word = tolower(tokens$word)
tokens$lower_lemma = tolower(tokens$lemma)
In the red-carpet tweets, the people mentioned include: Carey Mulligan, Riz Ahmed, Alan S. Kim and Viola Davis.
tokens %>%
  filter(ner == "PERSON") %>% # keep tokens tagged as PERSON
  group_by(lower_word) %>% # group by (lower-cased) word
  summarize(count = n()) %>% # count each group
  top_n(n = 10, count) %>%
  ungroup() %>%
  mutate(word = reorder(lower_word, count)) %>%
  ggplot(aes(word, count)) +
  geom_col() +
  ggtitle("Word Frequency (NER is PERSON)") +
  theme(text = element_text(size = 14)) +
  coord_flip()
“Carey Mulligan”
“Riz Ahmed (left)”
“Alan S. Kim (left)”
“Viola Davis”
Overall, tweets about the ceremony are dominated by positive sentiment words, but the negative words show that part of the audience was not entirely satisfied with the Oscars. By contrasting positive and negative terms and digging back into the data, text analysis helps us grasp the fuller picture of an event and reduces the risk of misjudging it.