Social Media Midterm Report - PTT Gossiping Board: An Analysis of the Discussion Volume around 「愛莉莎莎」 on PTT. Group 23. Members: 鍾兆宇, 郭耿耀, 陶冠霖, 林品仲. 2021/04/29
In this era of influencer culture, children and adults alike aspire to become streamers and YouTubers, and in recent years many young YouTubers have risen to prominence, such as 蔡阿嘎, 阿滴, and 這群人. Those creators enjoy a large online following with little negative sentiment attached to them. Our group instead chose 「愛莉莎莎」, a YouTuber whose large share of recent online discussion comes with a striking amount of negative feedback.
Since 2017, 「愛莉莎莎」 has been involved in a series of controversies, yet they seem not to have slowed the growth of her popularity or the volume of discussion about her. We selected two of these controversies, 『歧視原住民』 (discrimination against Indigenous people) and 『肝膽排石』 (the liver and gallbladder stone flush), and applied text mining techniques to examine how netizens discussed 「愛莉莎莎」 before and after each event, interpreting the results of the analysis.
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "widyr", "readr", "reshape2", "NLP", "ggraph", "igraph", "tm", "data.table", "quanteda", "Matrix", "slam", "wordcloud", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# load all of the required packages
invisible(lapply(packages, require, character.only = TRUE))
Data source: PTT Gossiping board. Obtained via: the National Sun Yat-sen University text analysis platform, searching for the keyword 「愛莉莎莎」. Period: 2019/08/01 ~ 2021/04/01.
# read the scraped comment data and keep only the fields used in the analysis
csv <- fread("./data/Alisas_articleReviews.CSV", encoding = "UTF-8") %>%
  select(artTitle, artDate, cmtContent, artUrl)
csv$artDate = csv$artDate %>% as.Date("%Y/%m/%d")
# split every comment into sentence fragments on common punctuation marks
mask_sentences <- strsplit(csv$cmtContent, "[。!;?!?;,:]")
mask_sentences <- data.frame(
  artTitle = rep(csv$artTitle, sapply(mask_sentences, length)),
  artDate = rep(csv$artDate, sapply(mask_sentences, length)),
  artUrl = rep(csv$artUrl, sapply(mask_sentences, length)),
  cmtContent = unlist(mask_sentences)
) %>%
  filter(!str_detect(cmtContent, regex("^(\t|\n| )*$")))  # drop empty fragments
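A quick sanity check on the splitting step (a sketch; the actual counts depend on the data file): the sentence table should contain at least as many rows as the original comment table, and each row should be one short fragment.
nrow(csv)                            # number of raw comment rows
nrow(mask_sentences)                 # number of sentence fragments after splitting
head(mask_sentences$cmtContent, 3)   # a few example fragments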
mask_lexicon <- scan(file = "./dict/mask_lexicon.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8')
#stop_words
jieba_tokenizer = worker(write = "NOFILE")  # initialise the jieba segmenter; the custom lexicon is registered below
new_user_word(jieba_tokenizer, c(mask_lexicon))
## [1] TRUE
chi_tokenizer <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[!tokens %in% stop_words]
# drop tokens that are only one character long
tokens <- tokens[nchar(tokens)>1]
return(tokens)
}
})
}
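To illustrate what chi_tokenizer produces, a minimal sketch on a made-up comment (the sample sentence is illustrative only; the exact segmentation depends on jieba's dictionary, the custom lexicon, and the stop-word list):
# illustrative only: a made-up PTT-style comment
chi_tokenizer("愛莉莎莎的影片真的很好看")
# expected shape: a list with one character vector of multi-character tokens,
# e.g. roughly c("愛莉莎莎", "影片", "好看"), with stop words already removed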
mask_sentences$cmtContent <- as.character(mask_sentences$cmtContent)
mask_words <- mask_sentences %>%
unnest_tokens(word, cmtContent, token=chi_tokenizer)
mask_words_Alisasa <- mask_words%>%
select(artDate,word)
article_count_by_date <- mask_sentences %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  filter(count > 2) %>% # drop dates with only a handful of comment sentences
  arrange(desc(count))
head(article_count_by_date, 20)
## # A tibble: 20 x 2
## artDate count
## <date> <int>
## 1 2021-02-13 6232
## 2 2021-02-14 4227
## 3 2021-02-15 1649
## 4 2021-03-01 1517
## 5 2020-05-25 1282
## 6 2020-01-10 1055
## 7 2021-02-17 1031
## 8 2020-07-02 986
## 9 2021-01-13 852
## 10 2021-02-16 850
## 11 2020-06-28 768
## 12 2020-07-08 716
## 13 2020-07-19 634
## 14 2020-05-28 632
## 15 2020-07-13 615
## 16 2020-06-26 611
## 17 2020-10-18 594
## 18 2021-02-12 587
## 19 2020-04-03 578
## 20 2020-07-03 546
plot_date <-
  article_count_by_date %>%
  ggplot(aes(x = artDate, y = count)) +
  geom_line(color = "#00AFBB", size = 2) +
  # red vertical lines mark reference dates for the events examined below
  geom_vline(xintercept = as.numeric(as.Date("2020-04-01")), col = 'red') +
  geom_vline(xintercept = as.numeric(as.Date("2020-11-27")), col = 'red') +
  geom_vline(xintercept = as.numeric(as.Date("2021-02-12")), col = 'red') +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("愛莉莎莎 討論推文") +
  xlab("日期") +
  ylab("數量")
plot_date
# daily counts of terms tied to the Indigenous-discrimination controversy
t3 <- mask_words %>%
  filter(word %in% c('原住民', '歧視', '漢化')) %>%
  count(artDate, sort = TRUE)
head(t3)
## artDate n
## 1 2020-04-03 184
## 2 2020-04-02 52
## 3 2020-04-04 17
## 4 2020-04-10 15
## 5 2020-04-21 7
## 6 2020-04-13 6
t3 %>%
ggplot()+
geom_line(aes(x=artDate,y=n))+
scale_x_date(breaks=date_breaks("30 days"),labels = date_format("%m/%d"))+
labs(x = '日期', y = '出現次數')
# daily counts of terms tied to the liver/gallbladder stone-flush controversy
t4 <- mask_words %>%
  filter(word %in% c('肝膽', '膽結石', '排石', '醫生', '醫師')) %>%
  count(artDate, sort = TRUE)
head(t4)
## artDate n
## 1 2021-02-13 373
## 2 2021-03-01 120
## 3 2021-02-14 118
## 4 2021-02-15 31
## 5 2021-02-12 30
## 6 2021-02-03 19
t4 %>%
ggplot()+
geom_line(aes(x=artDate,y=n))+
scale_x_date(breaks=date_breaks("30 days"),labels = date_format("%m/%d"))+
labs(x = '日期', y = '出現次數')
data = mask_words %>% group_by(word,artDate) %>% summarise( count = n() )
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
word_count <- data %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
  filter(count > 300) %>% # keep only words that appear more than 300 times
arrange(desc(count))
word_count
## # A tibble: 40 x 2
## word count
## <chr> <int>
## 1 愛莉莎莎 1227
## 2 台灣 996
## 3 莎莎 959
## 4 iu 898
## 5 屁事 880
## 6 可憐 830
## 7 知道 754
## 8 公關 742
## 9 女神 697
## 10 嘔嘔 677
## # ... with 30 more rows
P <- read_file("./dict/liwc/positive.txt")
N <- read_file("./dict/liwc/negative.txt")
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
word_count %>% inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 7 x 3
## word count sentiment
## <chr> <int> <chr>
## 1 可憐 830 negative
## 2 八卦 648 negative
## 3 噁心 553 negative
## 4 可愛 547 positive
## 5 垃圾 408 negative
## 6 喜歡 407 positive
## 7 寶貝 344 positive
data %>%
select(word) %>%
inner_join(LIWC)
## Joining, by = "word"
## # A tibble: 8,435 x 2
## # Groups: word [792]
## word sentiment
## <chr> <chr>
## 1 一流 positive
## 2 一流 positive
## 3 一流 positive
## 4 了不起 positive
## 5 了不起 positive
## 6 了不起 positive
## 7 了不起 positive
## 8 了不起 positive
## 9 了不起 positive
## 10 了不起 positive
## # ... with 8,425 more rows
sentiment_count = data %>%
select(artDate,word,count) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=sum(count))
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))
sentiment_count %>%
filter(artDate<'2021/02/10') %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))
sentiment_count %>%
filter(artDate>='2021/02/10') %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))
We find that the negative sentiment words netizens direct at 愛莉莎莎 are much the same across both periods: 「可憐」 (pathetic), 「噁心」 (disgusting), 「八卦」 (gossip), and so on.
LIWC_word_counts <- data %>%
filter(artDate < '2021/02/10') %>%
inner_join(LIWC) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
LIWC_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Selecting by n
LIWC_word_counts <- data %>%
filter(artDate >= '2021/02/10') %>%
inner_join(LIWC) %>%
count(word, sentiment, sort = TRUE) %>%
ungroup()
## Joining, by = "word"
LIWC_word_counts %>%
group_by(sentiment) %>%
top_n(10) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Selecting by n
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
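A minimal sketch of what jieba_bigram returns for a single made-up sentence (illustrative only; the exact pairs depend on jieba's segmentation):
# illustrative only: each element is a pair of adjacent tokens joined by a space
jieba_bigram("肝膽排石法真的有效嗎")
# expected shape: a list with one character vector of "word1 word2" strings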
# bigrams from comments posted before the liver/gallbladder stone-flush controversy
Alisasa_bigrams <- csv %>%
  unnest_tokens(bigram, cmtContent, token = jieba_bigram) %>%
  filter(artDate < '2021/02/10')
bigrams_separated <- Alisasa_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words) %>%
filter(!word2 %in% stop_words)
# new bigram counts:
bigram_counts <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts
## word1 word2 n
## 1: 公關 公司 309
## 2: 笑 死 253
## 3: 台灣 iu 249
## 4: 噁 噁 243
## 5: 神奇 寶貝 199
## ---
## 49479: 讚 囉 1
## 49480: 讚 歡迎 1
## 49481: 讚 蘿 1
## 49482: 讚美 垃圾 1
## 49483: 讚美 阿北文 1
bigram_graph <- bigram_counts %>%
filter(n > 20) %>%
graph_from_data_frame()
bigram_graph
## IGRAPH 533fdc7 DN-- 157 118 --
## + attr: name (v/c), n (e/n)
## + edges from 533fdc7 (vertex names):
## [1] 公關 ->公司 笑 ->死 台灣 ->iu
## [4] 噁 ->噁 神奇 ->寶貝 莎 ->莎
## [7] 煩 ->不煩 g ->軟 10 ->坪
## [10] 殺 ->殺 八卦 ->版 八卦 ->女神
## [13] 奪魂 ->鋸 鄉民 ->女神 好 ->正
## [16] 愛 ->z 鯊 ->鯊 關我 ->屁事
## [19] 日本 ->人 洗 ->幾篇 想 ->舔
## [22] 觀光 ->廣告 好 ->想 cp ->值
## + ... omitted several edges
set.seed(2021)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
# bigrams from comments posted on or after the liver/gallbladder stone-flush controversy
Alisasa_bigrams_2 <- csv %>%
  unnest_tokens(bigram, cmtContent, token = jieba_bigram) %>%
  filter(artDate >= '2021/02/10')
bigrams_separated <- Alisasa_bigrams_2 %>%
separate(bigram, c("word1", "word2"), sep = " ")
bigrams_filtered <- bigrams_separated %>%
filter(!word1 %in% stop_words) %>%
filter(!word2 %in% stop_words)
# new bigram counts:
bigram_counts_2 <- bigrams_filtered %>%
count(word1, word2, sort = TRUE)
bigram_counts_2
## word1 word2 n
## 1: 笑 死 327
## 2: 台灣 iu 103
## 3: 台灣 人 92
## 4: 大腦 排石法 85
## 5: 猀 砂 83
## ---
## 25890: 讚 遲來 1
## 25891: 讚 錯 1
## 25892: 讚 應該 1
## 25893: 讚 鵝 1
## 25894: 讚美 哪是 1
bigram_graph <- bigram_counts_2 %>%
filter(n > 30) %>%
graph_from_data_frame()
set.seed(2020)
a <- grid::arrow(type = "closed", length = unit(.15, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = a, end_cap = circle(.07, 'inches')) +
geom_node_point(color = "lightblue", size = 5) +
geom_node_text(aes(label = name), vjust = 1, hjust = 1) +
theme_void()
mask_sentences_artUrl = csv %>%
select(artTitle,artDate,cmtContent,artUrl)
mask_words_artUrl <- mask_words %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
total_words <- mask_words_artUrl %>%
group_by(artUrl) %>%
summarize(total = sum(n))
mask_words_artUrl %>%
left_join(total_words)%>%
top_n(10) %>%
arrange(desc(n))%>%
head(10)
## artUrl word n total
## 1 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 醫生 138 4466
## 2 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 態度 93 4466
## 3 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 千千 58 4466
## 4 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 醫師 58 4466
## 5 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 理科 50 4466
## 6 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 影片 48 4466
## 7 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 主播 33 4466
## 8 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 一堆 32 4466
## 9 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 太太 31 4466
## 10 https://www.ptt.cc/bbs/Gossiping/M.1613238576.A.D59.html 愛美 31 4466
mask_words_tf_idf <- mask_words_artUrl %>%
bind_tf_idf(word, artUrl, n)
mask_words_tf_idf %>%
group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(n)) %>%
ungroup()%>%
head(10)
## Selecting by tf_idf
## # A tibble: 10 x 6
## artUrl word n tf idf tf_idf
## <chr> <chr> <int> <dbl> <dbl> <dbl>
## 1 https://www.ptt.cc/bbs/Gossiping/M.161329~ 記者 239 0.119 3.68 0.440
## 2 https://www.ptt.cc/bbs/Gossiping/M.161323~ 醫生 138 0.0309 3.15 0.0974
## 3 https://www.ptt.cc/bbs/Gossiping/M.161329~ 反串 125 0.0624 2.55 0.159
## 4 https://www.ptt.cc/bbs/Gossiping/M.161456~ 公務員 104 0.0364 7.34 0.267
## 5 https://www.ptt.cc/bbs/Gossiping/M.161323~ 態度 93 0.0208 4.81 0.100
## 6 https://www.ptt.cc/bbs/Gossiping/M.161456~ 莎粉 72 0.0252 2.54 0.0639
## 7 https://www.ptt.cc/bbs/Gossiping/M.159107~ 愛愛莉莎莎~ 71 0.366 5.20 1.90
## 8 https://www.ptt.cc/bbs/Gossiping/M.157871~ 投票 67 0.0398 5.03 0.200
## 9 https://www.ptt.cc/bbs/Gossiping/M.159040~ 生日快樂 66 0.382 6.64 2.53
## 10 https://www.ptt.cc/bbs/Gossiping/M.161321~ 性騷擾 66 0.0542 4.63 0.251
mask_words_tf_idf %>%
group_by(artUrl) %>%
top_n(10) %>%
arrange(desc(n)) %>%
ungroup() %>%
count(word, sort=TRUE)
## Selecting by tf_idf
## # A tibble: 20,691 x 2
## word n
## <chr> <int>
## 1 屁事 64
## 2 可憐 56
## 3 可愛 46
## 4 台灣 42
## 5 愛莉莎莎 40
## 6 嘔嘔 40
## 7 一次 39
## 8 莎莎 36
## 9 公關 33
## 10 女神 32
## # ... with 20,681 more rows
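As a cross-check on bind_tf_idf, the same scores can be recomputed by hand under the usual definitions (tf = word count / total words in the article, idf = ln(total articles / articles containing the word)); a sketch:
n_docs <- n_distinct(mask_words_artUrl$artUrl)    # total number of articles
manual_tf_idf <- mask_words_artUrl %>%
  group_by(artUrl) %>%
  mutate(tf = n / sum(n)) %>%                     # term frequency within each article
  group_by(word) %>%
  mutate(idf = log(n_docs / n_distinct(artUrl)),  # inverse document frequency
         tf_idf = tf * idf) %>%
  ungroup() %>%
  arrange(desc(tf_idf))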
jieba_tokenizer = worker()
new_user_word(jieba_tokenizer, c(mask_lexicon))
## [1] TRUE
jieba_trigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
ngram<- ngrams(unlist(tokens), 3)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
}
})
}
# note: the output column is named "bigram" here, but it actually holds trigrams
Alisasa_trigram <- csv %>%
  unnest_tokens(bigram, cmtContent, token = jieba_trigram)
Alisasa_trigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
count(bigram, sort = TRUE)
## bigram n
## 1: 嘔 嘔 嘔 3164
## 2: 干 我 屁事 651
## 3: 嘔嘔 嘔 嘔 533
## 4: 噁 噁 噁 202
## 5: 的 大腦 排石法 116
## ---
## 182431: 讚美 哪是 台詞 1
## 182432: 讚揚 也 是 1
## 182433: 讚揚 才 對 1
## 182434: 鑽石 什麼 噁 1
## 182435: 鑽石 就是 珍珠 1
word_cors <- Alisasa_trigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("item1", "item2", "item3"), sep = " ") %>%
filter(!(item1 %in% stop_words), !(item2 %in% stop_words)) %>%
count(item1, item2,item3, sort = TRUE) #%>%
#unite_("bigram", c("item1","item2"), sep=" ")
word_cors
## item1 item2 item3 n
## 1: 噁 噁 噁 202
## 2: 大腦 排石法 真的 97
## 3: 猀 砂 砂 73
## 4: 愛 麗莎 莎 56
## 5: 大蒜 鼻痧 猀 50
## ---
## 61310: 讚 讚 柯 1
## 61311: 讚 讚 幫 1
## 61312: 讚美 垃圾 老頭 1
## 61313: 讚美 阿北文 跟 1
## 61314: 讚美 哪是 台詞 1
word_cors <- mask_words_Alisasa %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artDate, sort = TRUE)
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
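For reference, pairwise_cor() computes the phi coefficient of co-occurrence between every pair of words, here treating each artDate as one document. A toy sketch with made-up data to show the shape of the result:
# toy example (made-up data): words that always co-occur get correlation 1
toy <- data.frame(word = c("a", "b", "a", "b", "c", "c"),
                  doc  = c(1, 1, 2, 2, 3, 4))
toy %>% pairwise_cor(word, doc, sort = TRUE)
# "a" and "b" occur in exactly the same documents -> correlation 1;
# "c" never co-occurs with them -> correlation -1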
word_cors %>%
filter(item1 %in% c("愛莉莎莎", "肝膽","膽結石","原住民","歧視")) %>%
group_by(item1) %>%
top_n(10) %>%
ungroup() %>%
arrange(desc(correlation))%>%
mutate(item2 = reorder(item2, correlation)) %>%
ggplot(aes(item2, correlation)) +
geom_bar(stat = "identity") +
facet_wrap(~ item1, scales = "free") +
coord_flip()
## Selecting by correlation
set.seed(2021)
word_cors %>%
filter(item1 %in% c("愛莉莎莎", "肝膽","膽結石","原住民","歧視")) %>%
group_by(item1) %>%
top_n(10) %>%
ungroup() %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so the Chinese labels render correctly
theme_void()
## Selecting by correlation
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
tf_idf_c <- mask_words_Alisasa %>%
count(artDate,word,sort = TRUE)
mask_words_tf_idf_2 <- tf_idf_c %>%
bind_tf_idf(word, artDate, n)
mask_words_tf_idf_2 %>%
group_by(artDate) %>%
top_n(20) %>%
arrange(desc(artDate)) %>%
ungroup() %>%
count(word, sort=TRUE)
## Selecting by tf_idf
## # A tibble: 12,123 x 2
## word n
## <chr> <int>
## 1 任務 17
## 2 幫噓 17
## 3 泱泱 16
## 4 日本 15
## 5 泉水 15
## 6 小馮 14
## 7 iu 12
## 8 新聞 12
## 9 不煩 11
## 10 道歉 11
## # ... with 12,113 more rows
The analysis shows that discussion heat and overall volume spike when a controversy breaks, and Taiwanese netizens respond to an influencer's missteps with a surge of negative sentiment in the moment. Once the storm passes and a proper apology is made, however, the public proves remarkably forgiving: as long as the videos remain entertaining, a steady level of popularity and support holds.