社群媒體期中報告 - PTT八卦版:雞排妹遭性騷擾的討論分析 第__組 組員:鍾兆宇、郭耿耀、陶冠霖、林品仲 2021/03/30 # 動機與分析目的 大家一直傳聞演藝圈都有所謂的「演藝圈潛規則」,想以雞排妹遭翁立友性騷擾,雙方各執一詞,以當時發生時間點去探討鄉民認為性騷擾的可能性如何

避免中文亂碼

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") 
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""

安裝需要的套件

packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "widyr", "readr", "reshape2", "NLP", "ggraph", "igraph", "tm", "data.table", "quanteda", "Matrix", "slam", "wordcloud", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr","tidytext","readr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 4.0.4
require(wordcloud2)
## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 4.0.4
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4
require(widyr)
## Loading required package: widyr
## Warning: package 'widyr' was built under R version 4.0.5
require(readr)
## Loading required package: readr
## Warning: package 'readr' was built under R version 4.0.4
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
require(reshape2)
## Loading required package: reshape2
## Warning: package 'reshape2' was built under R version 4.0.4
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
require(NLP)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
require(ggraph)
## Loading required package: ggraph
## Warning: package 'ggraph' was built under R version 4.0.5
require(igraph)
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 4.0.4
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
## 
##     crossing
## The following objects are masked from 'package:dplyr':
## 
##     as_data_frame, groups, union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
require(tm)
## Loading required package: tm
## Warning: package 'tm' was built under R version 4.0.5
require(data.table)
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 4.0.4
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
require(quanteda)
## Loading required package: quanteda
## Warning: package 'quanteda' was built under R version 4.0.5
## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.
## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.
## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Package version: 3.0.0
## Unicode version: 10.0
## ICU version: 61.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
## 
##     stopwords
## The following objects are masked from 'package:NLP':
## 
##     meta, meta<-
require(Matrix)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
require(slam)
## Loading required package: slam
## 
## Attaching package: 'slam'
## The following object is masked from 'package:data.table':
## 
##     rollup
require(wordcloud)
## Loading required package: wordcloud
## Warning: package 'wordcloud' was built under R version 4.0.4
## Loading required package: RColorBrewer
require(topicmodels)
## Loading required package: topicmodels
## Warning: package 'topicmodels' was built under R version 4.0.5
require(LDAvis)
## Loading required package: LDAvis
## Warning: package 'LDAvis' was built under R version 4.0.5
require(webshot)
## Loading required package: webshot
## Warning: package 'webshot' was built under R version 4.0.5
require(htmlwidgets)
## Loading required package: htmlwidgets
## Warning: package 'htmlwidgets' was built under R version 4.0.4
require(servr)
## Loading required package: servr
## Warning: package 'servr' was built under R version 4.0.5
require(tidytext)
library(readr)

資料載入

資料集的描述:載入的資料是由中山大學管理學院文字分析平台取得,在平台資料選擇下載原始資料所取得之csv檔案。抓取範圍為 2021/02/01 ~ 2021/03/31透過文字分析平台搜尋「雞排妹」關鍵字,共搜尋到1110篇文章。

g_csv <- fread("./data/gp_articleMetaData2.CSV", encoding = "UTF-8")
g_csv$artDate <- g_csv$artDate %>% as.Date("%Y/%m/%d")
str(g_csv)
## Classes 'data.table' and 'data.frame':   1089 obs. of  11 variables:
##  $ artTitle  : chr  "[問卦]雞排妹接班人網友:愛莉莎莎" "[問卦]酸民們看到雞排妹一樣把持不住吧?" "[問卦]要選笨笨的泱泱還是聰明的雞排妹??" "[問卦]雞排妹會被業主封殺?" ...
##  $ artDate   : Date, format: "2021-02-01" "2021-02-01" ...
##  $ artTime   : chr  "04:00:11" "04:16:41" "13:21:19" "13:28:42" ...
##  $ artUrl    : chr  "https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html" "https://www.ptt.cc/bbs/Gossiping/M.1612153004.A.446.html" "https://www.ptt.cc/bbs/Gossiping/M.1612185682.A.4A9.html" "https://www.ptt.cc/bbs/Gossiping/M.1612186124.A.F14.html" ...
##  $ artPoster : chr  "akumakei" "kingtama" "Metallicat" "hiball" ...
##  $ artCat    : chr  "Gossiping" "Gossiping" "Gossiping" "Gossiping" ...
##  $ commentNum: int  28 54 33 47 36 78 137 21 910 13 ...
##  $ push      : int  4 28 11 22 19 39 17 7 454 1 ...
##  $ boo       : int  19 13 9 6 1 12 87 7 193 1 ...
##  $ sentence  : chr  "https://i.imgur.com/2cf7NuN.jpg\n雞排妹曾經是鄉民女神\n活耀於2014~2019年\nhttps://i.imgur.com/6ZjuCjC.jpg\n但愛"| __truncated__ "https://i.imgur.com/K4Ip0No.jpg\nhttps://i.imgur.com/9PK1gw3.jpg\nhttps://i.imgur.com/lLttlGw.jpg\n就是你\n\n嘴"| __truncated__ "這兩個其實都還不錯 真要選的話要選哪個\n\n泱泱呆呆的 雖然很可愛 但是傻傻的 娶回家可能很辛苦\n\n雞排妹就是很有腦"| __truncated__ "尾牙騷擾事件:\n雞排妹是受害者\n\n整形代言事件:\n把業主指名道姓講出來,\n提供消費者真實建議。\n\n雞排妹是很老"| __truncated__ ...
##  $ V11       : logi  NA NA NA NA NA NA ...
##  - attr(*, ".internal.selfref")=<externalptr>

#資料處理_保留文章以及日期欄位及去重

data <- g_csv %>% 
  dplyr::select(artDate, sentence) %>% 
  distinct()

#資料處理_日期分群,計算每天共有幾篇討論文章

article_count_by_date <- data %>% 
  group_by(artDate) %>% 
  summarise(count = n())
head(article_count_by_date, 20)
## # A tibble: 19 x 2
##    artDate    count
##    <date>     <int>
##  1 2021-02-01     7
##  2 2021-02-02    82
##  3 2021-02-03   191
##  4 2021-02-04   183
##  5 2021-02-05   286
##  6 2021-02-06   158
##  7 2021-02-07    70
##  8 2021-02-08    20
##  9 2021-02-13     5
## 10 2021-02-14     9
## 11 2021-02-15    15
## 12 2021-02-16    15
## 13 2021-02-17    14
## 14 2021-02-18     2
## 15 2021-02-19    10
## 16 2021-02-20     5
## 17 2021-02-22     2
## 18 2021-02-23     4
## 19 2021-02-24     1

#日期折線圖

plot_date <- 
  article_count_by_date %>% 
  ggplot(aes(x = artDate, y = count)) +
  geom_line(color = "#00AFBB", size = 1) +
  geom_vline(xintercept = as.numeric(as.Date("2021-02-03")), col='red', size = 1) +
  geom_vline(xintercept = as.numeric(as.Date("2021-02-05")), col='red', size = 1) +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("ptt八卦版 雞排妹討論文章數") + 
  xlab("日期") + 
  ylab("數量") + 
  theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。

plot_date
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database

## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

#資料的分析:關於雞排妹的話題討論度從2021/02/03開始提高,2021/02/05討論度為最高點,當天為翁立友開記者會日期

#斷詞、停用字

jieba_tokenizer <- worker(user="g_dict.txt", stop_word ="stop_words.txt")
g_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}

#詞頻

g_tokens <- g_csv %>% 
  unnest_tokens(word, sentence, token=g_tokenizer) %>% 
  select(-artTime, -artUrl,-artPoster, - artCat, - commentNum,-push,-boo)
g_tokens_count <- g_tokens %>% 
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  arrange(desc(sum))
head(g_tokens_count)
## # A tibble: 6 x 2
##   word     sum
##   <chr>  <int>
## 1 雞排妹  2920
## 2 性騷擾  1012
## 3 翁立友   656
## 4 https    531
## 5 炒新聞   482
## 6 新聞     395

#文字雲觀察_從詞頻將不相關的詞語 雞排妹、、翁立友、https、com 排除

wordc_plot <- g_tokens_count %>% 
  filter(word != "雞排妹" & word != "翁立友" & word != "https" & word != "com") %>% 
  filter(sum > 100) %>% 
  wordcloud2()
wordc_plot
t1<-g_tokens%>%
  filter(word=='性騷擾')%>%
  count(artDate)%>%
  mutate(s='性騷擾')

t2<-g_tokens%>%
  filter(word=='證據')%>%
  count(artDate)%>%
  mutate(s='證據')

t3<-g_tokens%>%
  filter(word=='炒新聞')%>%
  count(artDate)%>%
  mutate(s='炒新聞')

t4<-g_tokens%>%
  filter(word=='雞排妹')%>%
  count(artDate)%>%
  mutate(s='雞排妹')
rbind(t1,t2,t3,t4)%>%
  ggplot()+
  geom_line(aes(x=artDate,y=n,colour=s))+
  scale_x_date(breaks=date_breaks("7 days"),labels = date_format("%m/%d"))+
  xlab("日期") + 
  ylab("出現次數")

  theme(text = element_text(family = "Heiti TC Light"))  # 指定繁體中文黑體
## List of 1
##  $ text:List of 11
##   ..$ family       : chr "Heiti TC Light"
##   ..$ face         : NULL
##   ..$ colour       : NULL
##   ..$ size         : NULL
##   ..$ hjust        : NULL
##   ..$ vjust        : NULL
##   ..$ angle        : NULL
##   ..$ lineheight   : NULL
##   ..$ margin       : NULL
##   ..$ debug        : NULL
##   ..$ inherit.blank: logi FALSE
##   ..- attr(*, "class")= chr [1:2] "element_text" "element"
##  - attr(*, "class")= chr [1:2] "theme" "gg"
##  - attr(*, "complete")= logi FALSE
##  - attr(*, "validate")= logi TRUE

#關於雞排妹的話題,以長條圖分析大家在討論什麼

g_tokens_by_date <- g_tokens %>% 
  count(artDate, word, sort = TRUE)
  
plot_merge <- g_tokens_by_date %>%
  filter(word != "雞排妹" & word != "翁立友" & word != "https" & word != "com") %>%
  filter(artDate == as.Date("2021-02-01") | 
         artDate == as.Date("2021-02-03") |
         artDate == as.Date("2021-02-05") | 
         artDate == as.Date("2021-02-06") |
         artDate == as.Date("2021-02-07") )  %>% 
  group_by(artDate) %>% 
  top_n(5, n) %>% 
  ungroup() %>% 
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(x=word, y=n, fill = artDate)) +
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = NULL) +
  facet_wrap(~artDate, scales="free", ncol = 2) + 
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light"))
plot_merge
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

2021年1月30日雞排妹在個人臉書發文痛訴主持的尾牙活動時,遭到演出男歌手性騷擾,此時關於雞排妹議題,性騷擾一詞並未被熱絡討論 2021年2月3日雞排妹在接受平面媒體訪問時,訴說起遭台語歌王翁立友性騷擾過程,此時關於雞排妹的文章,提及性騷擾的字眼提高

#以文章區格,建立tf-idf

g_tokens_by_art <- g_tokens %>% 
  filter(!str_detect(word, regex("[0-9]"))) %>%
  count(artTitle, word, sort = TRUE)
g_total_words_by_art <- g_tokens_by_art %>% 
  group_by(artTitle) %>% 
  summarize(total = sum(n)) %>% 
  arrange(desc(total))
g_tokens_by_art <- left_join(g_tokens_by_art, g_total_words_by_art)
## Joining, by = "artTitle"

#過濾掉文章長度少於20個詞

g_words_tf_idf <- g_tokens_by_art %>%
  bind_tf_idf(word, artTitle, n) 
g_words_tf_idf %>% 
  filter(total > 20) %>% 
  arrange(desc(tf_idf))
##                                             artTitle     word   n total
##     1:                         [問卦]雞排妹炒新聞吧?   炒新聞 160   162
##     2:              [問卦]有沒有人現在厭惡雞排妹的?   炒新聞  18    28
##     3:               [問卦]雞排妹是台灣的愛莉莎莎嗎?     有時   6    31
##     4:  [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣   炒新聞 204   514
##     5:               [問卦]如果義大利人遇到雞排妹會? 義大利人   3    22
##    ---                                                                 
## 54208:  [新聞]愛莉莎莎與雞排妹事件,被忽略的結構盲點   雞排妹   3   496
## 54209:   [新聞]雞排妹5點聲明指性騷者稱被害人是二度傷   雞排妹   1   205
## 54210: Re:[新聞]爛事變好事初衷變調誰敢再學雞排妹指證   雞排妹   1   235
## 54211:  [新聞]網紅曝「翁立友」騷擾「雞排妹」噁心對話   雞排妹   3   711
## 54212:   Re:[問卦]雞排妹如果待的是日本484風向是穩的?   雞排妹   1   241
##                 tf         idf       tf_idf
##     1: 0.987654321 2.409194828 2.379452e+00
##     2: 0.642857143 2.409194828 1.548768e+00
##     3: 0.193548387 5.076423035 9.825335e-01
##     4: 0.396887160 2.409194828 9.561785e-01
##     5: 0.136363636 6.685860947 9.117083e-01
##    ---                                     
## 54208: 0.006048387 0.001249219 7.555763e-06
## 54209: 0.004878049 0.001249219 6.093753e-06
## 54210: 0.004255319 0.001249219 5.315827e-06
## 54211: 0.004219409 0.001249219 5.270968e-06
## 54212: 0.004149378 0.001249219 5.183483e-06

#文章總長度大於100個詞

g_words_tf_idf %>% 
  filter(total > 100) %>% 
  arrange(desc(tf_idf))
##                                             artTitle   word   n total
##     1:                         [問卦]雞排妹炒新聞吧? 炒新聞 160   162
##     2:  [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣 炒新聞 204   514
##     3: Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「   最佳  29   409
##     4:       Re:[問卦]雞排妹的失智列車為啥開不起來?   出書   7   125
##     5:        [問卦]喜來登飯店為什麼會放雞排妹進去?   飯店   6   101
##    ---                                                               
## 39908:  [新聞]愛莉莎莎與雞排妹事件,被忽略的結構盲點 雞排妹   3   496
## 39909:   [新聞]雞排妹5點聲明指性騷者稱被害人是二度傷 雞排妹   1   205
## 39910: Re:[新聞]爛事變好事初衷變調誰敢再學雞排妹指證 雞排妹   1   235
## 39911:  [新聞]網紅曝「翁立友」騷擾「雞排妹」噁心對話 雞排妹   3   711
## 39912:   Re:[問卦]雞排妹如果待的是日本484風向是穩的? 雞排妹   1   241
##                 tf         idf       tf_idf
##     1: 0.987654321 2.409194828 2.379452e+00
##     2: 0.396887160 2.409194828 9.561785e-01
##     3: 0.070904645 4.739950798 3.360845e-01
##     4: 0.056000000 5.992713767 3.355920e-01
##     5: 0.059405941 5.587248658 3.319158e-01
##    ---                                     
## 39908: 0.006048387 0.001249219 7.555763e-06
## 39909: 0.004878049 0.001249219 6.093753e-06
## 39910: 0.004255319 0.001249219 5.315827e-06
## 39911: 0.004219409 0.001249219 5.270968e-06
## 39912: 0.004149378 0.001249219 5.183483e-06

這段期間雞排妹不斷哭訴性騷擾經過,也鼓勵提告,但無確切證據,因此都無結果

#查看關於雞排妹附近字彙

ngram_11 <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    ngram <- ngrams(tokens, 11)
    ngram <- lapply(ngram, paste, collapse = " ")
    unlist(ngram)
  })
}

g_ngram_11 <- g_csv %>%
  select(artUrl, sentence) %>%
  unnest_tokens(ngram, sentence, token = ngram_11) %>%
  filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
  separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
g_ngrams_11_separated
##                                                          artUrl word1 word2
##     1: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html  一出    道
##     2: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html    道  瞬間
##     3: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html  瞬間  搶走
##     4: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html  搶走    美
##     5: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html    美  光燈
##    ---                                                                     
## 60848: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html  減少  太快
## 60849: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html  太快  了點
## 60850: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html  了點  當初
## 60851: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html  當初  國外
## 60852: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html  國外    咪
##        word3 word4  word5  word6  word7  word8  word9 word10 word11
##     1:  瞬間  搶走     美   光燈   焦點   成為 新一代   鄉民   女神
##     2:  搶走    美   光燈   焦點   成為 新一代   鄉民   女神   網友
##     3:    美  光燈   焦點   成為 新一代   鄉民   女神   網友     說
##     4:  光燈  焦點   成為 新一代   鄉民   女神   網友     說 雞排妹
##     5:  焦點  成為 新一代   鄉民   女神   網友     說 雞排妹   退休
##    ---                                                             
## 60848:  了點  當初   國外     咪     兔   消失   無聲     量 雞排妹
## 60849:  當初  國外     咪     兔   消失   無聲     量 雞排妹     咪
## 60850:  國外    咪     兔   消失   無聲     量 雞排妹     咪     兔
## 60851:    咪    兔   消失   無聲     量 雞排妹     咪     兔   突然
## 60852:    兔  消失   無聲     量 雞排妹     咪     兔   突然   新聞

#查看關於雞排妹前後五個字彙

g_check_words <- g_ngrams_11_separated %>%
  filter((word6 == "雞排妹"))
g_check_words
##                                                         artUrl  word1  word2
##    1: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html 新一代   鄉民
##    2: https://www.ptt.cc/bbs/Gossiping/M.1612185682.A.4A9.html   可愛 傻傻的
##    3: https://www.ptt.cc/bbs/Gossiping/M.1612185682.A.4A9.html   回家   笨笨
##    4: https://www.ptt.cc/bbs/Gossiping/M.1612186124.A.F14.html     講   提供
##    5: https://www.ptt.cc/bbs/Gossiping/M.1612191187.A.54A.html   網紅   我問
##   ---                                                                       
## 1698: https://www.ptt.cc/bbs/Gossiping/M.1614055671.A.FAF.html 男上位   相比
## 1699: https://www.ptt.cc/bbs/Gossiping/M.1614055671.A.FAF.html     處   比較
## 1700: https://www.ptt.cc/bbs/Gossiping/M.1614057790.A.149.html   私密     處
## 1701: https://www.ptt.cc/bbs/Gossiping/M.1614057790.A.149.html     緊   此時
## 1702: https://www.ptt.cc/bbs/Gossiping/M.1614124717.A.299.html   想想   之前
##          word3  word4 word5  word6 word7  word8    word9 word10   word11
##    1:     女神   網友    說 雞排妹  退休   年輕 愛莉莎莎   接班   有沒有
##    2:       娶   回家  辛苦 雞排妹  有腦   代表     精明   生意     頭腦
##    3:     聽話   就要    養 雞排妹  聰明   商業     頭腦     幫     丈夫
##    4:   消費者   真實  建議 雞排妹  老實   說出     沒錯   業主     眼裡
##    5:     同事 有沒有  聽過 雞排妹    陳     沂       說   知道     又問
##   ---                                                                   
## 1698: 非常容易   相當  此時 雞排妹  自爆   大腿     實在   太短       難
## 1699:       緊   此時  一旁 雞排妹  默默   桌子     底下   嘗試     張大
## 1700:     比較     緊  此時 雞排妹  大膽   嘗試   雞排妹   試完       後
## 1701:   雞排妹   大膽  嘗試 雞排妹  試完     後     張大   眼睛 不可思議
## 1702:     會先 問你要  凝視 雞排妹  正當 化的說     我要   一份   大雞排

#查看關於雞排妹前後五個字彙_長條圖

g_check_words_count <- g_check_words %>%
  melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
  rename(word=value) %>%
  filter(variable!="word6") %>%
  filter(!(word %in% stop_words), nchar(word)>1) %>%
  count(word, sort = TRUE)
g_check_words_count %>%
  arrange(desc(abs(n))) %>%
  head(20) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = n > 0)) +
  geom_col(show.legend = FALSE) +
  xlab("出現在「雞排妹」附近的字") +
  ylab("出現次數") +
  coord_flip()+ 
  theme(text = element_text(family = "Heiti TC Light"))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

#Word Correlation

g_words_by_art <- g_csv %>%
  unnest_tokens(word, sentence, token=g_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9]"))) %>%
  count(artUrl, word, sort = TRUE)
g_word_pairs <- g_words_by_art %>%
  pairwise_count(word, artUrl, sort = TRUE)
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
g_word_pairs
## # A tibble: 4,720,720 x 3
##    item1  item2      n
##    <chr>  <chr>  <dbl>
##  1 性騷擾 雞排妹   444
##  2 雞排妹 性騷擾   444
##  3 https  雞排妹   276
##  4 雞排妹 https    276
##  5 現在   雞排妹   260
##  6 雞排妹 現在     260
##  7 翁立友 雞排妹   256
##  8 雞排妹 翁立友   256
##  9 com    雞排妹   210
## 10 新聞   雞排妹   210
## # ... with 4,720,710 more rows
g_word_cors <- g_words_by_art %>%
  group_by(word) %>%
  filter(n() >= 20) %>%
  pairwise_cor(word, artUrl, sort = TRUE)
g_word_cors %>%
  filter(item1 == "雞排妹")
## # A tibble: 488 x 3
##    item1  item2  correlation
##    <chr>  <chr>        <dbl>
##  1 雞排妹 性騷擾      0.0255
##  2 雞排妹 https       0.0179
##  3 雞排妹 現在        0.0172
##  4 雞排妹 翁立友      0.0170
##  5 雞排妹 com         0.0150
##  6 雞排妹 新聞        0.0150
##  7 雞排妹 記者會      0.0148
##  8 雞排妹 有沒有      0.0144
##  9 雞排妹 事件        0.0141
## 10 雞排妹 知道        0.0141
## # ... with 478 more rows

#詞彙之間相關性

seed_words <- c("性騷擾")
threshold <- 0.60
remove_words <- g_word_cors %>%
                filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
                .$item1 %>%
                unique()
set.seed(10)
g_word_cors_new <- g_word_cors %>%
                filter(!(item1 %in% remove_words|item2 %in% remove_words))
g_word_cors_new %>%
  filter(correlation > .4) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#分群 透過詞彙平均tf-idf,去除部分不重要的字

term_avg_tfidf <- g_words_tf_idf %>% 
  group_by(word) %>% 
  summarise(tfidf_avg = mean(tf_idf))
term_avg_tfidf$tfidf_avg %>% summary
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000737 0.0197223 0.0371437 0.0741922 0.0875511 1.1143102
term_remove=term_avg_tfidf %>%  
  filter(tfidf_avg<0.0197223) %>% 
  .$word
term_remove %>% head
## [1] "<U+6CA1>有"  "ahmjyq"      "and"         "asks"        "biqdjhuxcpu"
## [6] "bit"
g_dtm = g_words_tf_idf %>%
  filter(!word %in% term_remove) %>%
  cast_dtm(document=artTitle,term=word,value= n)
g_dtm
## <<DocumentTermMatrix (documents: 801, terms: 11290)>>
## Non-/sparse entries: 48867/8994423
## Sparsity           : 99%
## Maximal term length: 13
## Weighting          : term frequency (tf)
g_dtm_matrix = g_dtm %>% as.data.frame.matrix 
g_dtm_matrix[1:10,1:20]
##                                                           炒新聞 最佳 性騷擾
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣                 204    0      4
## [問卦]雞排妹炒新聞吧?                                        160    0      0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「                  0   29      5
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發                0    0     29
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉!                   2    0      7
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事      0    0      0
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤                1    0     25
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸                     0    0      1
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友                  1    0     21
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」                  0    0     20
##                                                           道歉 館長 曾國城
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣                 0    0      0
## [問卦]雞排妹炒新聞吧?                                        0    0      0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「                0    0      0
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發              1    2      0
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉!                28    0      1
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事    1   26      0
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤              3    0      0
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸                   1    0     21
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友                0    0      0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」                0    0      2
##                                                           翁立友 電影 結構 博弈
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣                   0    4    0    0
## [問卦]雞排妹炒新聞吧?                                          0    0    0    0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「                  1   19    0    0
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發                0    0    0    0
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉!                   2    0    0    0
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事      2    0    0    0
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤                2    0    0    0
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸                     7    0    0    0
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友                  4    0    0    0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」                  3    0    0    0
##                                                           影片 問題 老闆 https
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣                 0    0    0     0
## [問卦]雞排妹炒新聞吧?                                        0    0    0     0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「                0    0    0     2
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發              1    0    1     9
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉!                 0    0    1     2
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事    0    0    0     4
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤              2    1    3     9
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸                   0    1    2     1
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友                2    3    0     0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」                1    7    2     0
##                                                           家純 豆花 記者會 com
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣                 0    0      0   0
## [問卦]雞排妹炒新聞吧?                                        0    0      0   0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「                0    0      0   2
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發              0    0      2   5
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉!                 0    0      0   0
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事    0    0      0   3
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤              0    0     11   8
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸                   0    0      2   1
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友                0    0      5   0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」                0    0      1   0
##                                                           imgur jpg
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣                  0   0
## [問卦]雞排妹炒新聞吧?                                         0   0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「                 0   0
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發               4   3
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉!                  0   0
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事     0   0
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤               2   1
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸                    0   0
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友                 0   0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」                 0   0

#建立LDA模型 統計每篇文章詞頻

g_artid <- g_tokens %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>% 
  count(artTitle, word) %>% 
  rename(count=n) %>% 
  mutate(artId = group_indices(., artTitle))
## Warning: The `...` argument of `group_keys()` is deprecated as of dplyr 1.0.0.
## Please `group_by()` first
g_artid
##                              artTitle   word count artId
##     1: [問卦]=====雞排妹文停止線=====   友愛     1     1
##     2: [問卦]=====雞排妹文停止線=====   太多     1     1
##     3: [問卦]=====雞排妹文停止線=====   文洗     1     1
##     4: [問卦]=====雞排妹文停止線=====   包容     1     1
##     5: [問卦]=====雞排妹文停止線=====   本肥     1     1
##    ---                                                  
## 53310:    Re:[爆卦]雞排妹鄭家純記者會 雞排妹    11   801
## 53311:    Re:[爆卦]雞排妹鄭家純記者會   雞糞     1   801
## 53312:    Re:[爆卦]雞排妹鄭家純記者會 騎士團     1   801
## 53313:    Re:[爆卦]雞排妹鄭家純記者會   議價     1   801
## 53314:    Re:[爆卦]雞排妹鄭家純記者會   騷擾     1   801
reserved_word <- g_artid %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > 5) %>% 
  unlist()
g_artid <- g_artid %>% 
  filter(word %in% reserved_word)
g_com_dtm <- g_artid %>% cast_dtm(artId, word, count)
g_com_dtm
## <<DocumentTermMatrix (documents: 801, terms: 1724)>>
## Non-/sparse entries: 33632/1347292
## Sparsity           : 98%
## Maximal term length: 5
## Weighting          : term frequency (tf)

#轉為分成兩群的LDA

g_lda <- LDA(g_com_dtm, k = 2, control = list(seed = 1234))
g_topics <- tidy(g_lda, matrix = "beta")
g_topics
## # A tibble: 3,448 x 3
##    topic term      beta
##    <int> <chr>    <dbl>
##  1     1 太多  1.59e- 5
##  2     2 太多  6.58e- 4
##  3     1 和平  1.07e- 3
##  4     2 和平  2.21e- 8
##  5     1 面子  2.96e-13
##  6     2 面子  4.23e- 4
##  7     1 現在  1.50e- 3
##  8     2 現在  1.09e- 2
##  9     1 尊重  1.91e- 3
## 10     2 尊重  7.32e- 4
## # ... with 3,438 more rows
g_top_terms <- g_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
remove_words <- c("雞排妹")
g_top_terms <- g_topics %>%
  filter(! term %in% remove_words) %>% 
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
g_top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  theme(text = element_text(family = "Heiti TC Light"))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database

可看出在只分為兩群的情況,發現討論到雞排妹還是以性騷擾一詞熱度最高

#查看雞排妹議題持續程度

plot_date <- 
  article_count_by_date %>% 
  ggplot(aes(x = artDate, y = count)) +
  geom_line(color = "#00AFBB", size = 1) +
  geom_vline(xintercept = as.numeric(as.Date("2021-02-05")), col='red', size = 1) +
  scale_x_date(labels = date_format("%Y/%m/%d")) +
  ggtitle("ptt八卦版 雞排妹討論文章數") + 
  xlab("日期") + 
  ylab("數量") + 
  theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。

plot_date
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database

g_oberve <-g_csv %>% 
  select(artTitle, commentNum, push, boo) %>% 
  filter(commentNum >= 100) %>% 
  mutate(p_ratio = push/commentNum, b_ratio = boo/commentNum) %>% 
  arrange(-p_ratio)

推文最高的文章為:有人會看了雞排妹今天的操作更討厭她嗎? 文章時間為2021/02/05,雞排妹現身記者會現場,但又拿不出證據! 因此從這天開始,雞排妹、性騷擾討論度開始下降

#情緒分析_資料前處理

MetaData = fread("./data/gp_articleMetaData2.CSV",encoding = 'UTF-8')
Reviews  = fread("./data/gp_articleReviews2.csv",encoding = 'UTF-8')
keywords = c('雞排妹','性騷擾')
toMatch = paste(keywords,collapse="|")
MetaData = with(MetaData, MetaData[grepl(toMatch,sentence)|grepl(toMatch,artTitle),])
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
MToken <- MetaData %>% unnest_tokens(word, sentence, token=g_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=g_tokenizer)
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")])
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
data_select = data %>% 
  filter(!grepl('[[:punct:]]',word)) %>% 
  filter(!grepl("['^0-9a-z']",word)) %>% 
  filter(nchar(.$word)>1) 

word_count <- data_select %>%
  select(artDate,word) %>%
  group_by(artDate,word) %>%
  summarise(count=n()) %>%  
  filter(count>3) %>%  
  arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

#準備LIWC字典

word_count
## # A tibble: 14,304 x 3
## # Groups:   artDate [19]
##    artDate    word   count
##    <date>     <chr>  <int>
##  1 2021-02-05 雞排妹  1439
##  2 2021-02-03 雞排妹  1040
##  3 2021-02-05 性騷擾   974
##  4 2021-02-03 飛機     937
##  5 2021-02-03 性騷擾   897
##  6 2021-02-06 雞排妹   874
##  7 2021-02-04 雞排妹   854
##  8 2021-02-05 道歉     698
##  9 2021-02-04 性騷擾   672
## 10 2021-02-04 飛機     613
## # ... with 14,294 more rows
P <- read_file("./dict/liwc_new/positive.txt") 
N <- read_file("./dict/liwc_new/negative.txt") 
typeof(P)
## [1] "character"
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative") 

LIWC = rbind(P, N)

head(LIWC)
##       word sentiment
## 1     一流  positive
## 2 下定決心  positive
## 3 不拘小節  positive
## 4   不費力  positive
## 5     主動  positive
## 6     乾杯  positive

#將文章和與LIWC情緒字典做join #正負情緒發文折線圖

MetaData$artDate= MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot()+
    geom_line(aes(x=artDate,y=count))+
    scale_x_date(labels = date_format("%m/%d"))

#算出每天情緒總和

sentiment_count = data_select %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=n()) 
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

#正負情緒分數折線圖

range(sentiment_count$artDate)
## [1] "2021-02-01" "2021-02-24"
sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-01','2021-02-20'))
               )+geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-05'))
[1]])),colour = "red") 
## Warning: Removed 6 row(s) containing missing values (geom_path).

#正負情緒比例折線圖

sentiment_count %>% 
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-01','2021-02-20'))
               )
## Warning: Removed 6 row(s) containing missing values (geom_path).

剛爆發性騷擾疑雲時,負面情緒較高,但時間經過了兩個禮拜,雞排妹依舊沒有提出證據,因此對於性騷擾的正面情緒轉為較高,大家認為只是炒新聞,並非真相。

sentiment_count %>%
  select(count,artDate) %>%
  group_by(artDate) %>%
  summarise(sum = sum(count)) %>%
  arrange(desc(sum))
## # A tibble: 19 x 2
##    artDate      sum
##    <date>     <int>
##  1 2021-02-05  8629
##  2 2021-02-03  5886
##  3 2021-02-04  4677
##  4 2021-02-06  3802
##  5 2021-02-02  2574
##  6 2021-02-07  2523
##  7 2021-02-08  1043
##  8 2021-02-15   648
##  9 2021-02-17   593
## 10 2021-02-16   555
## 11 2021-02-19   254
## 12 2021-02-01   168
## 13 2021-02-23   114
## 14 2021-02-14    67
## 15 2021-02-20    44
## 16 2021-02-13    32
## 17 2021-02-18    27
## 18 2021-02-22    13
## 19 2021-02-24    11

#2021-02-05 文字雲

  sentiment_plot_0205 <-word_count %>% 
  filter(!(word %in% c("雞排妹" ))) %>%
  filter(artDate == as.Date('2021-02-05')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  filter(count>100) %>%   
  wordcloud2()
## Adding missing grouping variables: `artDate`
  sentiment_plot_0205

#2021-02-06 文字雲

 sentiment_plot_0206 <- word_count %>% 
  filter(!(word %in% c("雞排妹" ))) %>%
  filter(artDate == as.Date('2021-02-06')) %>% 
  select(word,count) %>% 
  group_by(word) %>% 
  summarise(count = sum(count)) %>%
  arrange(desc(count)) %>%
  wordcloud2()
## Adding missing grouping variables: `artDate`
  sentiment_plot_0206

從2021-02-05 文字雲發現鄉民認為性騷擾的可能性很高 但2021-02-20 文字雲發現鄉民認為是假新聞了,因為雞排妹遲遲提不出證據

#正負情緒代表字

sentiment_sum <- 
  word_count %>%
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
  top_n(30,wt = sum) %>%
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

#正負情緒文字雲

sentiment_sum %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"),
                   max.words = 50)

#2021-02-05 正負情緒代表字

sentiment_sum_select <- 
word_count %>%
  filter(artDate == as.Date('2021-02-05')) %>% 
    inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = sum(count)
  ) %>% 
  arrange(desc(sum)) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum_select   %>%
  top_n(30,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to sentiment 0205",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()

sentiment_sum_select %>%
  acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
  comparison.cloud(
    colors = c("salmon", "#72bcd4"), # positive negative
                   max.words = 50)

#歸類正負面文章

article_type = 
  data_select %>%
  inner_join(LIWC) %>% 
  group_by(artUrl,sentiment) %>%
  summarise(count=n()) %>%
  spread(sentiment,count,fill = 0) %>% 
  mutate(type = case_when(positive > negative ~ "positive", 
                             TRUE ~ "negative")) %>%
  data.frame() 
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
article_type %>%
  group_by(type) %>%
  summarise(count = n())
## # A tibble: 2 x 2
##   type     count
##   <chr>    <int>
## 1 negative   660
## 2 positive   393
article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")


article_type_date %>%
  group_by(artDate,type) %>%
  summarise(count = n()) %>%
  ggplot(aes(x = artDate, y = count, fill = type)) + 
  geom_bar(stat = "identity", position = "dodge")+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2021-02-05','2021-02-20'))
               )
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## Warning: Removed 13 rows containing missing values (geom_bar).

negative_article <-
article_type %>%
  filter(type=="negative")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

positive_article <-
article_type %>%
  filter(type=="positive")%>%
  select(artUrl) %>%
  left_join(data_select[,c("artUrl", "word")], by = "artUrl")

#情緒關鍵字:負面情緒文章

negative_article %>%
inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = n()
    )%>% 
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to negative sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

#情緒關鍵字:正面情緒文章

positive_article %>%
inner_join(LIWC) %>%
    group_by(word,sentiment) %>%
  summarise(
    sum = n()
    )%>% 
  arrange(desc(sum)) %>%
  data.frame() %>%
  top_n(30,wt = sum) %>%
  ungroup() %>% 
  mutate(word = reorder(word, sum)) %>%
  ggplot(aes(word, sum, fill = sentiment)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~sentiment, scales = "free_y") +
  labs(y = "Contribution to positive sentiment",
       x = NULL) +
  theme(text=element_text(size=14))+
  coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.

結論 從雞排妹訴說著被性騷擾,開始帶起演藝圈性騷擾的風向,但風向會一直存在著要提出證據,加以提告,否則只是炒新聞、增加熱度,故當遲遲提出不了證據時,討論度就會急速下降,在兩個禮拜後,關於雞排妹的討論議題已寥寥無幾,鄉民也不認為雞排妹說的為事實了。