Social Media Midterm Report - PTT Gossiping Board: An Analysis of the Discussion Volume around 「愛莉莎莎」 on PTT. Group 23. Members: 鍾兆宇, 郭耿耀, 陶冠霖, 林品仲. 2021/04/29
In this era of influencer culture, children and adults alike aspire to become streamers and YouTubers, and in recent years many young YouTubers have risen to prominence, such as 蔡阿嘎, 阿滴, and 這群人. Those creators enjoy a large online following with little negative sentiment attached to them. Our group instead chose 「愛莉莎莎」, a YouTuber whose large share of recent online discussion comes with a striking amount of negative feedback.
Since 2017, 「愛莉莎莎」 has been involved in a series of controversies, yet they seem not to have slowed the growth of her popularity or the volume of discussion about her. We selected two of these controversies, 『歧視原住民』 (discrimination against Indigenous people) and 『肝膽排石』 (the liver and gallbladder stone flush), and applied text mining techniques to examine how netizens discussed 「愛莉莎莎」 before and after each event, interpreting the results of the analysis.
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "widyr", "readr", "reshape2", "NLP", "ggraph", "igraph", "tm", "data.table", "quanteda", "Matrix", "slam", "wordcloud", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# load all of the required packages
invisible(lapply(packages, require, character.only = TRUE))
Data source: PTT Gossiping board. Obtained via: the National Sun Yat-sen University text analysis platform, searching for the keyword 「愛莉莎莎」. Period: 2019/08/01 ~ 2021/04/01.
# read the scraped comment data and keep only the fields used in the analysis
csv <- fread("./data/Alisas_articleReviews.CSV", encoding = "UTF-8") %>%
  select(artTitle, artDate, cmtContent, artUrl)
csv$artDate = csv$artDate %>% as.Date("%Y/%m/%d")
# split every comment into sentence fragments on common punctuation marks
mask_sentences <- strsplit(csv$cmtContent, "[。!;?!?;,:]")
mask_sentences <- data.frame(
  artTitle = rep(csv$artTitle, sapply(mask_sentences, length)),
  artDate = rep(csv$artDate, sapply(mask_sentences, length)),
  artUrl = rep(csv$artUrl, sapply(mask_sentences, length)),
  cmtContent = unlist(mask_sentences)
) %>%
  filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))  # drop empty fragments
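A quick sanity check on the splitting step (a sketch; the actual counts depend on the data file): the sentence table should contain at least as many rows as the original comment table, and each row should be one short fragment.
nrow(csv)                            # number of raw comment rows
nrow(mask_sentences)                 # number of sentence fragments after splitting
head(mask_sentences$cmtContent, 3)   # a few example fragments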
mask_lexicon <- scan(file = "./dict/mask_lexicon.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
#stop_words
jieba_tokenizer = worker(write = "NOFILE")  # initialise the jieba segmenter; the custom lexicon is registered below
new_user_word(jieba_tokenizer, c(mask_lexicon))
## [1] TRUE
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[!tokens %in% stop_words]
# drop tokens that are only one character long
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
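To illustrate what chi_tokenizer produces, a minimal sketch on a made-up comment (the sample sentence is illustrative only; the exact segmentation depends on jieba's dictionary, the custom lexicon, and the stop-word list):
# illustrative only: a made-up PTT-style comment
chi_tokenizer("愛莉莎莎的影片真的很好看")
# expected shape: a list with one character vector of multi-character tokens,
# e.g. roughly c("愛莉莎莎", "影片", "好看"), with stop words already removed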
mask_sentences$cmtContent <- as.character(mask_sentences$cmtContent)
mask_words <- mask_sentences %>%
unnest_tokens(word, cmtContent, token=chi_tokenizer)
mask_words_Alisasa <- mask_words%>%
select(artDate,word)
article_count_by_date <- mask_sentences %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  filter(count > 2) %>% # drop dates with only a handful of comment sentences
  arrange(desc(count))
head(article_count_by_date, 20)
## # A tibble: 20 x 2
## artDate count
## <date> <int>
## 1 2021-02-13 6232
## 2 2021-02-14 4227
## 3 2021-02-15 1649
## 4 2021-03-01 1517
## 5 2020-05-25 1282
## 6 2020-01-10 1055
## 7 2021-02-17 1031
## 8 2020-07-02 986
## 9 2021-01-13 852
## 10 2021-02-16 850
## 11 2020-06-28 768
## 12 2020-07-08 716
## 13 2020-07-19 634
## 14 2020-05-28 632
## 15 2020-07-13 615
## 16 2020-06-26 611
## 17 2020-10-18 594
## 18 2021-02-12 587
## 19 2020-04-03 578
## 20 2020-07-03 546
plot_date <-
  article_count_by_date %>%
  ggplot(aes(x = artDate, y = count)) +
  geom_line(color = "#00AFBB", size = 2) +
  # red vertical lines mark reference dates for the events examined below
  geom_vline(xintercept = as.numeric(as.Date("2020-04-01")), col = 'red') +
  geom_vline(xintercept = as.numeric(as.Date("2020-11-27")), col = 'red') +
  geom_vline(xintercept = as.numeric(as.Date("2021-02-12")), col = 'red') +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("愛莉莎莎 討論推文") +
  xlab("日期") +
  ylab("數量")
plot_date
# daily counts of terms tied to the Indigenous-discrimination controversy
t3 <- mask_words %>%
  filter(word %in% c('原住民', '歧視', '漢化')) %>%
  count(artDate, sort = TRUE)
head(t3)
## artDate n
## 1 2020-04-03 184
## 2 2020-04-02 52
## 3 2020-04-04 17
## 4 2020-04-10 15
## 5 2020-04-21 7
## 6 2020-04-13 6
t3 %>%
ggplot()+
geom_line(aes(x=artDate,y=n))+
scale_x_date(breaks=date_breaks("30 days"),labels = date_format("%m/%d"))+
labs(x = '日期', y = '出現次數')
# daily counts of terms tied to the liver/gallbladder stone-flush controversy
t4 <- mask_words %>%
  filter(word %in% c('肝膽', '膽結石', '排石', '醫生', '醫師')) %>%
  count(artDate, sort = TRUE)
head(t4)
## artDate n
## 1 2021-02-13 373
## 2 2021-03-01 120
## 3 2021-02-14 118
## 4 2021-02-15 31
## 5 2021-02-12 30
## 6 2021-02-03 19
t4 %>%
ggplot()+
geom_line(aes(x=artDate,y=n))+
scale_x_date(breaks=date_breaks("30 days"),labels = date_format("%m/%d"))+
labs(x = '日期', y = '出現次數')
data = mask_words %>% group_by(word,artDate) %>% summarise( count = n() )
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
word_count <- data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
  filter(count > 300) %>% # keep only words that appear more than 300 times
arrange(desc(count))
word_count
## # A tibble: 40 x 2
## word count
## <chr> <int>
## 1 愛莉莎莎 1227
## 2 台灣 996
## 3 莎莎 959
## 4 iu 898
## 5 屁事 880
## 6 可憐 830
## 7 知道 754
## 8 公關 742
## 9 女神 697
## 10 嘔嘔 677
## # ... with 30 more rows
P <- read_file("./dict/liwc/positive.txt")
N <- read_file("./dict/liwc/negative.txt")
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
word_count %>% inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 7 x 3
## word count sentiment
## <chr> <int> <chr>
## 1 可憐 830 negative
## 2 八卦 648 negative
## 3 噁心 553 negative
## 4 可愛 547 positive
## 5 垃圾 408 negative
## 6 喜歡 407 positive
## 7 寶貝 344 positive
data %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 8,435 x 2
## # Groups: word [792]
## word sentiment
## <chr> <chr>
## 1 一流 positive
## 2 一流 positive
## 3 一流 positive
## 4 了不起 positive
## 5 了不起 positive
## 6 了不起 positive
## 7 了不起 positive
## 8 了不起 positive
## 9 了不起 positive
## 10 了不起 positive
## # ... with 8,425 more rows
sentiment_count = data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))
sentiment_count %>%
filter(artDate<'2021/02/10') %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))
sentiment_count %>%
filter(artDate>='2021/02/10') %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))
We find that the negative sentiment words netizens direct at 愛莉莎莎 are much the same across both periods: 「可憐」 (pathetic), 「噁心」 (disgusting), 「八卦」 (gossip), and so on.
LIWC_word_counts <- data %>%
filter(artDate < '2021/02/10') %>%
inner_join(LIWC) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
LIWC_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Selecting by n
LIWC_word_counts <- data %>%
filter(artDate >= '2021/02/10') %>%
inner_join(LIWC) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
LIWC_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Selecting by n
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
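A minimal sketch of what jieba_bigram returns for a single made-up sentence (illustrative only; the exact pairs depend on jieba's segmentation):
# illustrative only: each element is a pair of adjacent tokens joined by a space
jieba_bigram("肝膽排石法真的有效嗎")
# expected shape: a list with one character vector of "word1 word2" strings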
# bigrams from comments posted before the liver/gallbladder stone-flush controversy
Alisasa_bigrams <- csv %>%
  unnest_tokens(bigram, cmtContent, token = jieba_bigram) %>%
  filter(artDate < '2021/02/10')
bigrams_separated <- Alisasa_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words) %>%
filter(!word2 %in% stop_words)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
## word1 word2 n
## 1: 公關 公司 309
## 2: 笑 死 253
## 3: 台灣 iu 249
## 4: 噁 噁 243
## 5: 神奇 寶貝 199
## ---
## 49479: 讚 囉 1
## 49480: 讚 歡迎 1
## 49481: 讚 蘿 1
## 49482: 讚美 垃圾 1
## 49483: 讚美 阿北文 1
bigram_graph <- bigram_counts %>%
filter(n > 20) %>%
graph_from_data_frame()
bigram_graph
## IGRAPH 533fdc7 DN-- 157 118 --
## + attr: name (v/c), n (e/n)
## + edges from 533fdc7 (vertex names):
## [1] 公關 ->公司 笑 ->死 台灣 ->iu
## [4] 噁 ->噁 神奇 ->寶貝 莎 ->莎
## [7] 煩 ->不煩 g ->軟 10 ->坪
## [10] 殺 ->殺 八卦 ->版 八卦 ->女神
## [13] 奪魂 ->鋸 鄉民 ->女神 好 ->正
## [16] 愛 ->z 鯊 ->鯊 關我 ->屁事
## [19] 日本 ->人 洗 ->幾篇 想 ->舔
## [22] 觀光 ->廣告 好 ->想 cp ->值
## + ... omitted several edges
set.seed(2021)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
# bigrams from comments posted on or after the liver/gallbladder stone-flush controversy
Alisasa_bigrams_2 <- csv %>%
  unnest_tokens(bigram, cmtContent, token = jieba_bigram) %>%
  filter(artDate >= '2021/02/10')
bigrams_separated <- Alisasa_bigrams_2 %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words) %>%
filter(!word2 %in% stop_words)
# new bigram counts:
bigram_counts_2 <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts_2
## word1 word2 n
## 1: 笑 死 327
## 2: 台灣 iu 103
## 3: 台灣 人 92
## 4: 大腦 排石法 85
## 5: 猀 砂 83
## ---
## 25890: 讚 遲來 1
## 25891: 讚 錯 1
## 25892: 讚 應該 1
## 25893: 讚 鵝 1
## 25894: 讚美 哪是 1
bigram_graph <- bigram_counts_2 %>%
filter(n > 30) %>%
graph_from_data_frame()
set.seed(2020)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
mask_sentences_artUrl = csv %>%
select(artTitle,artDate,cmtContent,artUrl)
mask_words_artUrl <- mask_words %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
total_words <- mask_words_artUrl %>%
group_by(artUrl) %>%
summarize(total = sum(n))
mask_words_artUrl %>%
left_join(total_words)%>%
top_n(10) %>%
arrange(desc(n))%>%
head(10)
## artUrl word n total
## 1 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 醫生 138 4466
## 2 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 態度 93 4466
## 3 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 千千 58 4466
## 4 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 醫師 58 4466
## 5 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 理科 50 4466
## 6 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 影片 48 4466
## 7 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 主播 33 4466
## 8 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 一堆 32 4466
## 9 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 太太 31 4466
## 10 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 愛美 31 4466
mask_words_tf_idf <- mask_words_artUrl %>%
bind_tf_idf(word, artUrl, n)
mask_words_tf_idf %>%
group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(n)) %>%
ungroup()%>%
head(10)
## Selecting by tf_idf
## # A tibble: 10 x 6
## artUrl word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.161329~ 記者 239 0.119 3.68 0.440
## 2 https://www.ptt.cc/bbs/Gossiping/M.161323~ 醫生 138 0.0309 3.15 0.0974
## 3 https://www.ptt.cc/bbs/Gossiping/M.161329~ 反串 125 0.0624 2.55 0.159
## 4 https://www.ptt.cc/bbs/Gossiping/M.161456~ 公務員 104 0.0364 7.34 0.267
## 5 https://www.ptt.cc/bbs/Gossiping/M.161323~ 態度 93 0.0208 4.81 0.100
## 6 https://www.ptt.cc/bbs/Gossiping/M.161456~ 莎粉 72 0.0252 2.54 0.0639
## 7 https://www.ptt.cc/bbs/Gossiping/M.159107~ 愛愛莉莎莎~ 71 0.366 5.20 1.90
## 8 https://www.ptt.cc/bbs/Gossiping/M.157871~ 投票 67 0.0398 5.03 0.200
## 9 https://www.ptt.cc/bbs/Gossiping/M.159040~ 生日快樂 66 0.382 6.64 2.53
## 10 https://www.ptt.cc/bbs/Gossiping/M.161321~ 性騷擾 66 0.0542 4.63 0.251
mask_words_tf_idf %>%
group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(n)) %>%
ungroup() %>%
count(word, sort=TRUE)
## Selecting by tf_idf
## # A tibble: 20,691 x 2
## word n
## <chr> <int>
## 1 屁事 64
## 2 可憐 56
## 3 可愛 46
## 4 台灣 42
## 5 愛莉莎莎 40
## 6 嘔嘔 40
## 7 一次 39
## 8 莎莎 36
## 9 公關 33
## 10 女神 32
## # ... with 20,681 more rows
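As a cross-check on bind_tf_idf, the same scores can be recomputed by hand under the usual definitions (tf = word count / total words in the article, idf = ln(total articles / articles containing the word)); a sketch:
n_docs <- n_distinct(mask_words_artUrl$artUrl)    # total number of articles
manual_tf_idf <- mask_words_artUrl %>%
  group_by(artUrl) %>%
  mutate(tf = n / sum(n)) %>%                     # term frequency within each article
  group_by(word) %>%
  mutate(idf = log(n_docs / n_distinct(artUrl)),  # inverse document frequency
         tf_idf = tf * idf) %>%
  ungroup() %>%
  arrange(desc(tf_idf))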
jieba_tokenizer = worker()
new_user_word(jieba_tokenizer, c(mask_lexicon))
## [1] TRUE
jieba_trigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
ngram<- ngrams(unlist(tokens), 3)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
}
})
}
# note: the output column is named "bigram" here, but it actually holds trigrams
Alisasa_trigram <- csv %>%
  unnest_tokens(bigram, cmtContent, token = jieba_trigram)
Alisasa_trigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
count(bigram, sort = TRUE)
## bigram n
## 1: 嘔 嘔 嘔 3164
## 2: 干 我 屁事 651
## 3: 嘔嘔 嘔 嘔 533
## 4: 噁 噁 噁 202
## 5: 的 大腦 排石法 116
## ---
## 182431: 讚美 哪是 台詞 1
## 182432: 讚揚 也 是 1
## 182433: 讚揚 才 對 1
## 182434: 鑽石 什麼 噁 1
## 182435: 鑽石 就是 珍珠 1
word_cors <- Alisasa_trigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("item1", "item2", "item3"), sep = " ") %>%
filter(!(item1 %in% stop_words), !(item2 %in% stop_words)) %>%
count(item1, item2,item3, sort = TRUE) #%>%
#unite_("bigram", c("item1","item2"), sep=" ")
word_cors
## item1 item2 item3 n
## 1: 噁 噁 噁 202
## 2: 大腦 排石法 真的 97
## 3: 猀 砂 砂 73
## 4: 愛 麗莎 莎 56
## 5: 大蒜 鼻痧 猀 50
## ---
## 61310: 讚 讚 柯 1
## 61311: 讚 讚 幫 1
## 61312: 讚美 垃圾 老頭 1
## 61313: 讚美 阿北文 跟 1
## 61314: 讚美 哪是 台詞 1
word_cors <- mask_words_Alisasa %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artDate, sort = TRUE)
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
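For reference, pairwise_cor() computes the phi coefficient of co-occurrence between every pair of words, here treating each artDate as one document. A toy sketch with made-up data to show the shape of the result:
# toy example (made-up data): words that always co-occur get correlation 1
toy <- data.frame(word = c("a", "b", "a", "b", "c", "c"),
                  doc  = c(1, 1, 2, 2, 3, 4))
toy %>% pairwise_cor(word, doc, sort = TRUE)
# "a" and "b" occur in exactly the same documents -> correlation 1;
# "c" never co-occurs with them -> correlation -1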
word_cors %>%
filter(item1 %in% c("愛莉莎莎", "肝膽","膽結石","原住民","歧視")) %>%
group_by(item1) %>%
top_n(10) %>%
ungroup() %>%
arrange(desc(correlation))%>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
## Selecting by correlation
set.seed(2021)
word_cors %>%
filter(item1 %in% c("愛莉莎莎", "肝膽","膽結石","原住民","歧視")) %>%
group_by(item1) %>%
top_n(10) %>%
ungroup() %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so the Chinese labels render correctly
theme_void()
## Selecting by correlation
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
tf_idf_c <- mask_words_Alisasa %>%
count(artDate,word,sort = TRUE)
mask_words_tf_idf_2 <- tf_idf_c %>%
bind_tf_idf(word, artDate, n)
mask_words_tf_idf_2 %>%
group_by(artDate) %>%
top_n(20) %>%
arrange(desc(artDate)) %>%
ungroup() %>%
count(word, sort=TRUE)
## Selecting by tf_idf
## # A tibble: 12,123 x 2
## word n
## <chr> <int>
## 1 任務 17
## 2 幫噓 17
## 3 泱泱 16
## 4 日本 15
## 5 泉水 15
## 6 小馮 14
## 7 iu 12
## 8 新聞 12
## 9 不煩 11
## 10 道歉 11
## # ... with 12,113 more rows
The analysis shows that discussion heat and overall volume spike when a controversy breaks, and Taiwanese netizens respond to an influencer's missteps with a surge of negative sentiment in the moment. Once the storm passes and a proper apology is made, however, the public proves remarkably forgiving: as long as the videos remain entertaining, a steady level of popularity and support holds.