社群媒體期中報告 - PTT八卦版:雞排妹遭性騷擾的討論分析 第__組 組員:鍾兆宇、郭耿耀、陶冠霖、林品仲 2021/03/30 # 動機與分析目的 大家一直傳聞演藝圈都有所謂的「演藝圈潛規則」,想以雞排妹遭翁立友性騷擾,雙方各執一詞,以當時發生時間點去探討鄉民認為性騷擾的可能性如何
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## Warning in Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8"): 作業系統
## 回報無法實現設定語區為 "zh_TW.UTF-8" 的要求
## [1] ""
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "widyr", "readr", "reshape2", "NLP", "ggraph", "igraph", "tm", "data.table", "quanteda", "Matrix", "slam", "wordcloud", "topicmodels", "LDAvis", "webshot", "htmlwidgets","servr","tidytext","readr")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
## Loading required package: dplyr
## Warning: package 'dplyr' was built under R version 4.0.4
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
## Warning: package 'tidytext' was built under R version 4.0.4
require(jiebaR)
## Loading required package: jiebaR
## Warning: package 'jiebaR' was built under R version 4.0.4
## Loading required package: jiebaRD
## Warning: package 'jiebaRD' was built under R version 4.0.4
require(gutenbergr)
## Loading required package: gutenbergr
## Warning: package 'gutenbergr' was built under R version 4.0.4
require(stringr)
## Loading required package: stringr
## Warning: package 'stringr' was built under R version 4.0.4
require(wordcloud2)
## Loading required package: wordcloud2
## Warning: package 'wordcloud2' was built under R version 4.0.4
require(ggplot2)
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 4.0.4
require(tidyr)
## Loading required package: tidyr
## Warning: package 'tidyr' was built under R version 4.0.4
require(scales)
## Loading required package: scales
## Warning: package 'scales' was built under R version 4.0.4
require(widyr)
## Loading required package: widyr
## Warning: package 'widyr' was built under R version 4.0.5
require(readr)
## Loading required package: readr
## Warning: package 'readr' was built under R version 4.0.4
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
require(reshape2)
## Loading required package: reshape2
## Warning: package 'reshape2' was built under R version 4.0.4
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
require(NLP)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
require(ggraph)
## Loading required package: ggraph
## Warning: package 'ggraph' was built under R version 4.0.5
require(igraph)
## Loading required package: igraph
## Warning: package 'igraph' was built under R version 4.0.4
##
## Attaching package: 'igraph'
## The following object is masked from 'package:tidyr':
##
## crossing
## The following objects are masked from 'package:dplyr':
##
## as_data_frame, groups, union
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
require(tm)
## Loading required package: tm
## Warning: package 'tm' was built under R version 4.0.5
require(data.table)
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 4.0.4
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(quanteda)
## Loading required package: quanteda
## Warning: package 'quanteda' was built under R version 4.0.5
## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.
## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Warning in stringi::stri_info(): Your native charset does not map to Unicode
## well. This may cause serious problems. Consider switching to UTF-8.
## Warning in stringi::stri_info(): Your current locale is not in the list
## of available locales. Some functions may not work properly. Refer to
## stri_locale_list() for more details on known locale specifiers.
## Package version: 3.0.0
## Unicode version: 10.0
## ICU version: 61.1
## Parallel computing: 8 of 8 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:tm':
##
## stopwords
## The following objects are masked from 'package:NLP':
##
## meta, meta<-
require(Matrix)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
require(slam)
## Loading required package: slam
##
## Attaching package: 'slam'
## The following object is masked from 'package:data.table':
##
## rollup
require(wordcloud)
## Loading required package: wordcloud
## Warning: package 'wordcloud' was built under R version 4.0.4
## Loading required package: RColorBrewer
require(topicmodels)
## Loading required package: topicmodels
## Warning: package 'topicmodels' was built under R version 4.0.5
require(LDAvis)
## Loading required package: LDAvis
## Warning: package 'LDAvis' was built under R version 4.0.5
require(webshot)
## Loading required package: webshot
## Warning: package 'webshot' was built under R version 4.0.5
require(htmlwidgets)
## Loading required package: htmlwidgets
## Warning: package 'htmlwidgets' was built under R version 4.0.4
require(servr)
## Loading required package: servr
## Warning: package 'servr' was built under R version 4.0.5
require(tidytext)
library(readr)
g_csv <- fread("./data/gp_articleMetaData2.CSV", encoding = "UTF-8")
g_csv$artDate <- g_csv$artDate %>% as.Date("%Y/%m/%d")
str(g_csv)
## Classes 'data.table' and 'data.frame': 1089 obs. of 11 variables:
## $ artTitle : chr "[問卦]雞排妹接班人網友:愛莉莎莎" "[問卦]酸民們看到雞排妹一樣把持不住吧?" "[問卦]要選笨笨的泱泱還是聰明的雞排妹??" "[問卦]雞排妹會被業主封殺?" ...
## $ artDate : Date, format: "2021-02-01" "2021-02-01" ...
## $ artTime : chr "04:00:11" "04:16:41" "13:21:19" "13:28:42" ...
## $ artUrl : chr "https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html" "https://www.ptt.cc/bbs/Gossiping/M.1612153004.A.446.html" "https://www.ptt.cc/bbs/Gossiping/M.1612185682.A.4A9.html" "https://www.ptt.cc/bbs/Gossiping/M.1612186124.A.F14.html" ...
## $ artPoster : chr "akumakei" "kingtama" "Metallicat" "hiball" ...
## $ artCat : chr "Gossiping" "Gossiping" "Gossiping" "Gossiping" ...
## $ commentNum: int 28 54 33 47 36 78 137 21 910 13 ...
## $ push : int 4 28 11 22 19 39 17 7 454 1 ...
## $ boo : int 19 13 9 6 1 12 87 7 193 1 ...
## $ sentence : chr "https://i.imgur.com/2cf7NuN.jpg\n雞排妹曾經是鄉民女神\n活耀於2014~2019年\nhttps://i.imgur.com/6ZjuCjC.jpg\n但愛"| __truncated__ "https://i.imgur.com/K4Ip0No.jpg\nhttps://i.imgur.com/9PK1gw3.jpg\nhttps://i.imgur.com/lLttlGw.jpg\n就是你\n\n嘴"| __truncated__ "這兩個其實都還不錯 真要選的話要選哪個\n\n泱泱呆呆的 雖然很可愛 但是傻傻的 娶回家可能很辛苦\n\n雞排妹就是很有腦"| __truncated__ "尾牙騷擾事件:\n雞排妹是受害者\n\n整形代言事件:\n把業主指名道姓講出來,\n提供消費者真實建議。\n\n雞排妹是很老"| __truncated__ ...
## $ V11 : logi NA NA NA NA NA NA ...
## - attr(*, ".internal.selfref")=<externalptr>
#資料處理_保留文章以及日期欄位及去重
data <- g_csv %>%
dplyr::select(artDate, sentence) %>%
distinct()
#資料處理_日期分群,計算每天共有幾篇討論文章
article_count_by_date <- data %>%
group_by(artDate) %>%
summarise(count = n())
head(article_count_by_date, 20)
## # A tibble: 19 x 2
## artDate count
## <date> <int>
## 1 2021-02-01 7
## 2 2021-02-02 82
## 3 2021-02-03 191
## 4 2021-02-04 183
## 5 2021-02-05 286
## 6 2021-02-06 158
## 7 2021-02-07 70
## 8 2021-02-08 20
## 9 2021-02-13 5
## 10 2021-02-14 9
## 11 2021-02-15 15
## 12 2021-02-16 15
## 13 2021-02-17 14
## 14 2021-02-18 2
## 15 2021-02-19 10
## 16 2021-02-20 5
## 17 2021-02-22 2
## 18 2021-02-23 4
## 19 2021-02-24 1
#日期折線圖
plot_date <-
article_count_by_date %>%
ggplot(aes(x = artDate, y = count)) +
geom_line(color = "#00AFBB", size = 1) +
geom_vline(xintercept = as.numeric(as.Date("2021-02-03")), col='red', size = 1) +
geom_vline(xintercept = as.numeric(as.Date("2021-02-05")), col='red', size = 1) +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("ptt八卦版 雞排妹討論文章數") +
xlab("日期") +
ylab("數量") +
theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。
plot_date
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_stringMetric, as.graphicsAnnot(x$label)): font family not
## found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
#資料的分析:關於雞排妹的話題討論度從2021/02/03開始提高,2021/02/05討論度為最高點,當天為翁立友開記者會日期
#斷詞、停用字
jieba_tokenizer <- worker(user="g_dict.txt", stop_word ="stop_words.txt")
g_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
#詞頻
g_tokens <- g_csv %>%
unnest_tokens(word, sentence, token=g_tokenizer) %>%
select(-artTime, -artUrl,-artPoster, - artCat, - commentNum,-push,-boo)
g_tokens_count <- g_tokens %>%
group_by(word) %>%
summarise(sum = n()) %>%
arrange(desc(sum))
head(g_tokens_count)
## # A tibble: 6 x 2
## word sum
## <chr> <int>
## 1 雞排妹 2920
## 2 性騷擾 1012
## 3 翁立友 656
## 4 https 531
## 5 炒新聞 482
## 6 新聞 395
#文字雲觀察_從詞頻將不相關的詞語 雞排妹、、翁立友、https、com 排除
wordc_plot <- g_tokens_count %>%
filter(word != "雞排妹" & word != "翁立友" & word != "https" & word != "com") %>%
filter(sum > 100) %>%
wordcloud2()
wordc_plot
t1<-g_tokens%>%
filter(word=='性騷擾')%>%
count(artDate)%>%
mutate(s='性騷擾')
t2<-g_tokens%>%
filter(word=='證據')%>%
count(artDate)%>%
mutate(s='證據')
t3<-g_tokens%>%
filter(word=='炒新聞')%>%
count(artDate)%>%
mutate(s='炒新聞')
t4<-g_tokens%>%
filter(word=='雞排妹')%>%
count(artDate)%>%
mutate(s='雞排妹')
rbind(t1,t2,t3,t4)%>%
ggplot()+
geom_line(aes(x=artDate,y=n,colour=s))+
scale_x_date(breaks=date_breaks("7 days"),labels = date_format("%m/%d"))+
xlab("日期") +
ylab("出現次數")
theme(text = element_text(family = "Heiti TC Light")) # 指定繁體中文黑體
## List of 1
## $ text:List of 11
## ..$ family : chr "Heiti TC Light"
## ..$ face : NULL
## ..$ colour : NULL
## ..$ size : NULL
## ..$ hjust : NULL
## ..$ vjust : NULL
## ..$ angle : NULL
## ..$ lineheight : NULL
## ..$ margin : NULL
## ..$ debug : NULL
## ..$ inherit.blank: logi FALSE
## ..- attr(*, "class")= chr [1:2] "element_text" "element"
## - attr(*, "class")= chr [1:2] "theme" "gg"
## - attr(*, "complete")= logi FALSE
## - attr(*, "validate")= logi TRUE
#關於雞排妹的話題,以長條圖分析大家在討論什麼
g_tokens_by_date <- g_tokens %>%
count(artDate, word, sort = TRUE)
plot_merge <- g_tokens_by_date %>%
filter(word != "雞排妹" & word != "翁立友" & word != "https" & word != "com") %>%
filter(artDate == as.Date("2021-02-01") |
artDate == as.Date("2021-02-03") |
artDate == as.Date("2021-02-05") |
artDate == as.Date("2021-02-06") |
artDate == as.Date("2021-02-07") ) %>%
group_by(artDate) %>%
top_n(5, n) %>%
ungroup() %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(x=word, y=n, fill = artDate)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y = NULL) +
facet_wrap(~artDate, scales="free", ncol = 2) +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
plot_merge
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
2021年1月30日雞排妹在個人臉書發文痛訴主持的尾牙活動時,遭到演出男歌手性騷擾,此時關於雞排妹議題,性騷擾一詞並未被熱絡討論 2021年2月3日雞排妹在接受平面媒體訪問時,訴說起遭台語歌王翁立友性騷擾過程,此時關於雞排妹的文章,提及性騷擾的字眼提高
#以文章區格,建立tf-idf
g_tokens_by_art <- g_tokens %>%
filter(!str_detect(word, regex("[0-9]"))) %>%
count(artTitle, word, sort = TRUE)
g_total_words_by_art <- g_tokens_by_art %>%
group_by(artTitle) %>%
summarize(total = sum(n)) %>%
arrange(desc(total))
g_tokens_by_art <- left_join(g_tokens_by_art, g_total_words_by_art)
## Joining, by = "artTitle"
#過濾掉文章長度少於20個詞
g_words_tf_idf <- g_tokens_by_art %>%
bind_tf_idf(word, artTitle, n)
g_words_tf_idf %>%
filter(total > 20) %>%
arrange(desc(tf_idf))
## artTitle word n total
## 1: [問卦]雞排妹炒新聞吧? 炒新聞 160 162
## 2: [問卦]有沒有人現在厭惡雞排妹的? 炒新聞 18 28
## 3: [問卦]雞排妹是台灣的愛莉莎莎嗎? 有時 6 31
## 4: [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣 炒新聞 204 514
## 5: [問卦]如果義大利人遇到雞排妹會? 義大利人 3 22
## ---
## 54208: [新聞]愛莉莎莎與雞排妹事件,被忽略的結構盲點 雞排妹 3 496
## 54209: [新聞]雞排妹5點聲明指性騷者稱被害人是二度傷 雞排妹 1 205
## 54210: Re:[新聞]爛事變好事初衷變調誰敢再學雞排妹指證 雞排妹 1 235
## 54211: [新聞]網紅曝「翁立友」騷擾「雞排妹」噁心對話 雞排妹 3 711
## 54212: Re:[問卦]雞排妹如果待的是日本484風向是穩的? 雞排妹 1 241
## tf idf tf_idf
## 1: 0.987654321 2.409194828 2.379452e+00
## 2: 0.642857143 2.409194828 1.548768e+00
## 3: 0.193548387 5.076423035 9.825335e-01
## 4: 0.396887160 2.409194828 9.561785e-01
## 5: 0.136363636 6.685860947 9.117083e-01
## ---
## 54208: 0.006048387 0.001249219 7.555763e-06
## 54209: 0.004878049 0.001249219 6.093753e-06
## 54210: 0.004255319 0.001249219 5.315827e-06
## 54211: 0.004219409 0.001249219 5.270968e-06
## 54212: 0.004149378 0.001249219 5.183483e-06
#文章總長度大於100個詞
g_words_tf_idf %>%
filter(total > 100) %>%
arrange(desc(tf_idf))
## artTitle word n total
## 1: [問卦]雞排妹炒新聞吧? 炒新聞 160 162
## 2: [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣 炒新聞 204 514
## 3: Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「 最佳 29 409
## 4: Re:[問卦]雞排妹的失智列車為啥開不起來? 出書 7 125
## 5: [問卦]喜來登飯店為什麼會放雞排妹進去? 飯店 6 101
## ---
## 39908: [新聞]愛莉莎莎與雞排妹事件,被忽略的結構盲點 雞排妹 3 496
## 39909: [新聞]雞排妹5點聲明指性騷者稱被害人是二度傷 雞排妹 1 205
## 39910: Re:[新聞]爛事變好事初衷變調誰敢再學雞排妹指證 雞排妹 1 235
## 39911: [新聞]網紅曝「翁立友」騷擾「雞排妹」噁心對話 雞排妹 3 711
## 39912: Re:[問卦]雞排妹如果待的是日本484風向是穩的? 雞排妹 1 241
## tf idf tf_idf
## 1: 0.987654321 2.409194828 2.379452e+00
## 2: 0.396887160 2.409194828 9.561785e-01
## 3: 0.070904645 4.739950798 3.360845e-01
## 4: 0.056000000 5.992713767 3.355920e-01
## 5: 0.059405941 5.587248658 3.319158e-01
## ---
## 39908: 0.006048387 0.001249219 7.555763e-06
## 39909: 0.004878049 0.001249219 6.093753e-06
## 39910: 0.004255319 0.001249219 5.315827e-06
## 39911: 0.004219409 0.001249219 5.270968e-06
## 39912: 0.004149378 0.001249219 5.183483e-06
這段期間雞排妹不斷哭訴性騷擾經過,也鼓勵提告,但無確切證據,因此都無結果
#查看關於雞排妹附近字彙
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
g_ngram_11 <- g_csv %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngrams_11_separated <- g_ngram_11 %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
g_ngrams_11_separated
## artUrl word1 word2
## 1: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html 一出 道
## 2: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html 道 瞬間
## 3: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html 瞬間 搶走
## 4: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html 搶走 美
## 5: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html 美 光燈
## ---
## 60848: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html 減少 太快
## 60849: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html 太快 了點
## 60850: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html 了點 當初
## 60851: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html 當初 國外
## 60852: https://www.ptt.cc/bbs/Gossiping/M.1614170374.A.FBE.html 國外 咪
## word3 word4 word5 word6 word7 word8 word9 word10 word11
## 1: 瞬間 搶走 美 光燈 焦點 成為 新一代 鄉民 女神
## 2: 搶走 美 光燈 焦點 成為 新一代 鄉民 女神 網友
## 3: 美 光燈 焦點 成為 新一代 鄉民 女神 網友 說
## 4: 光燈 焦點 成為 新一代 鄉民 女神 網友 說 雞排妹
## 5: 焦點 成為 新一代 鄉民 女神 網友 說 雞排妹 退休
## ---
## 60848: 了點 當初 國外 咪 兔 消失 無聲 量 雞排妹
## 60849: 當初 國外 咪 兔 消失 無聲 量 雞排妹 咪
## 60850: 國外 咪 兔 消失 無聲 量 雞排妹 咪 兔
## 60851: 咪 兔 消失 無聲 量 雞排妹 咪 兔 突然
## 60852: 兔 消失 無聲 量 雞排妹 咪 兔 突然 新聞
#查看關於雞排妹前後五個字彙
g_check_words <- g_ngrams_11_separated %>%
filter((word6 == "雞排妹"))
g_check_words
## artUrl word1 word2
## 1: https://www.ptt.cc/bbs/Gossiping/M.1612152014.A.260.html 新一代 鄉民
## 2: https://www.ptt.cc/bbs/Gossiping/M.1612185682.A.4A9.html 可愛 傻傻的
## 3: https://www.ptt.cc/bbs/Gossiping/M.1612185682.A.4A9.html 回家 笨笨
## 4: https://www.ptt.cc/bbs/Gossiping/M.1612186124.A.F14.html 講 提供
## 5: https://www.ptt.cc/bbs/Gossiping/M.1612191187.A.54A.html 網紅 我問
## ---
## 1698: https://www.ptt.cc/bbs/Gossiping/M.1614055671.A.FAF.html 男上位 相比
## 1699: https://www.ptt.cc/bbs/Gossiping/M.1614055671.A.FAF.html 處 比較
## 1700: https://www.ptt.cc/bbs/Gossiping/M.1614057790.A.149.html 私密 處
## 1701: https://www.ptt.cc/bbs/Gossiping/M.1614057790.A.149.html 緊 此時
## 1702: https://www.ptt.cc/bbs/Gossiping/M.1614124717.A.299.html 想想 之前
## word3 word4 word5 word6 word7 word8 word9 word10 word11
## 1: 女神 網友 說 雞排妹 退休 年輕 愛莉莎莎 接班 有沒有
## 2: 娶 回家 辛苦 雞排妹 有腦 代表 精明 生意 頭腦
## 3: 聽話 就要 養 雞排妹 聰明 商業 頭腦 幫 丈夫
## 4: 消費者 真實 建議 雞排妹 老實 說出 沒錯 業主 眼裡
## 5: 同事 有沒有 聽過 雞排妹 陳 沂 說 知道 又問
## ---
## 1698: 非常容易 相當 此時 雞排妹 自爆 大腿 實在 太短 難
## 1699: 緊 此時 一旁 雞排妹 默默 桌子 底下 嘗試 張大
## 1700: 比較 緊 此時 雞排妹 大膽 嘗試 雞排妹 試完 後
## 1701: 雞排妹 大膽 嘗試 雞排妹 試完 後 張大 眼睛 不可思議
## 1702: 會先 問你要 凝視 雞排妹 正當 化的說 我要 一份 大雞排
#查看關於雞排妹前後五個字彙_長條圖
g_check_words_count <- g_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
g_check_words_count %>%
arrange(desc(abs(n))) %>%
head(20) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("出現在「雞排妹」附近的字") +
ylab("出現次數") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
#Word Correlation
g_words_by_art <- g_csv %>%
unnest_tokens(word, sentence, token=g_tokenizer) %>%
filter(!str_detect(word, regex("[0-9]"))) %>%
count(artUrl, word, sort = TRUE)
g_word_pairs <- g_words_by_art %>%
pairwise_count(word, artUrl, sort = TRUE)
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
g_word_pairs
## # A tibble: 4,720,720 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 性騷擾 雞排妹 444
## 2 雞排妹 性騷擾 444
## 3 https 雞排妹 276
## 4 雞排妹 https 276
## 5 現在 雞排妹 260
## 6 雞排妹 現在 260
## 7 翁立友 雞排妹 256
## 8 雞排妹 翁立友 256
## 9 com 雞排妹 210
## 10 新聞 雞排妹 210
## # ... with 4,720,710 more rows
g_word_cors <- g_words_by_art %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
g_word_cors %>%
filter(item1 == "雞排妹")
## # A tibble: 488 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 雞排妹 性騷擾 0.0255
## 2 雞排妹 https 0.0179
## 3 雞排妹 現在 0.0172
## 4 雞排妹 翁立友 0.0170
## 5 雞排妹 com 0.0150
## 6 雞排妹 新聞 0.0150
## 7 雞排妹 記者會 0.0148
## 8 雞排妹 有沒有 0.0144
## 9 雞排妹 事件 0.0141
## 10 雞排妹 知道 0.0141
## # ... with 478 more rows
#詞彙之間相關性
seed_words <- c("性騷擾")
threshold <- 0.60
remove_words <- g_word_cors %>%
filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
.$item1 %>%
unique()
set.seed(10)
g_word_cors_new <- g_word_cors %>%
filter(!(item1 %in% remove_words|item2 %in% remove_words))
g_word_cors_new %>%
filter(correlation > .4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning: ggrepel: 10 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#分群 透過詞彙平均tf-idf,去除部分不重要的字
term_avg_tfidf <- g_words_tf_idf %>%
group_by(word) %>%
summarise(tfidf_avg = mean(tf_idf))
term_avg_tfidf$tfidf_avg %>% summary
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000737 0.0197223 0.0371437 0.0741922 0.0875511 1.1143102
term_remove=term_avg_tfidf %>%
filter(tfidf_avg<0.0197223) %>%
.$word
term_remove %>% head
## [1] "<U+6CA1>有" "ahmjyq" "and" "asks" "biqdjhuxcpu"
## [6] "bit"
g_dtm = g_words_tf_idf %>%
filter(!word %in% term_remove) %>%
cast_dtm(document=artTitle,term=word,value= n)
g_dtm
## <<DocumentTermMatrix (documents: 801, terms: 11290)>>
## Non-/sparse entries: 48867/8994423
## Sparsity : 99%
## Maximal term length: 13
## Weighting : term frequency (tf)
g_dtm_matrix = g_dtm %>% as.data.frame.matrix
g_dtm_matrix[1:10,1:20]
## 炒新聞 最佳 性騷擾
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣 204 0 4
## [問卦]雞排妹炒新聞吧? 160 0 0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「 0 29 5
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發 0 0 29
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉! 2 0 7
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事 0 0 0
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤 1 0 25
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸 0 0 1
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友 1 0 21
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」 0 0 20
## 道歉 館長 曾國城
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣 0 0 0
## [問卦]雞排妹炒新聞吧? 0 0 0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「 0 0 0
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發 1 2 0
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉! 28 0 1
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事 1 26 0
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤 3 0 0
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸 1 0 21
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友 0 0 0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」 0 0 2
## 翁立友 電影 結構 博弈
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣 0 4 0 0
## [問卦]雞排妹炒新聞吧? 0 0 0 0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「 1 19 0 0
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發 0 0 0 0
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉! 2 0 0 0
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事 2 0 0 0
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤 2 0 0 0
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸 7 0 0 0
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友 4 0 0 0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」 3 0 0 0
## 影片 問題 老闆 https
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣 0 0 0 0
## [問卦]雞排妹炒新聞吧? 0 0 0 0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「 0 0 0 2
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發 1 0 1 9
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉! 0 0 1 2
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事 0 0 0 4
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤 2 1 3 9
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸 0 1 2 1
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友 2 3 0 0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」 1 7 2 0
## 家純 豆花 記者會 com
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣 0 0 0 0
## [問卦]雞排妹炒新聞吧? 0 0 0 0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「 0 0 0 2
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發 0 0 2 5
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉! 0 0 0 0
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事 0 0 0 3
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤 0 0 11 8
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸 0 0 2 1
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友 0 0 5 0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」 0 0 1 0
## imgur jpg
## [問卦]如果雞排妹跟大家說我就是在炒新聞會怎樣 0 0
## [問卦]雞排妹炒新聞吧? 0 0
## Re:[新聞]遭帥醫師性騷拍屁股讚美 雞排妹直呼「 0 0
## Re:[新聞]雞排妹「募集性騷案件」辦展覽! 堅定發 4 3
## Re:[問卦]雞排妹聲明會不提告,要的是一個道歉! 0 0
## Re:[新聞]眾批雞排妹「靠露炒新聞還喊性騷」館長:這是兩回事 0 0
## Re:[新聞]雞排妹遇性騷擾先開記者會?婦團憾:錯誤 2 1
## [新聞]雞排妹控他撞胸曾國城「抱歉肚子比她胸 0 0
## Re:[新聞]雞排妹哭了!控翁立友性騷擾「反遭網友 0 0
## Re:[新聞]呱吉挺雞排妹「相信她真有受騷擾感覺」 0 0
#建立LDA模型 統計每篇文章詞頻
g_artid <- g_tokens %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artTitle, word) %>%
rename(count=n) %>%
mutate(artId = group_indices(., artTitle))
## Warning: The `...` argument of `group_keys()` is deprecated as of dplyr 1.0.0.
## Please `group_by()` first
g_artid
## artTitle word count artId
## 1: [問卦]=====雞排妹文停止線===== 友愛 1 1
## 2: [問卦]=====雞排妹文停止線===== 太多 1 1
## 3: [問卦]=====雞排妹文停止線===== 文洗 1 1
## 4: [問卦]=====雞排妹文停止線===== 包容 1 1
## 5: [問卦]=====雞排妹文停止線===== 本肥 1 1
## ---
## 53310: Re:[爆卦]雞排妹鄭家純記者會 雞排妹 11 801
## 53311: Re:[爆卦]雞排妹鄭家純記者會 雞糞 1 801
## 53312: Re:[爆卦]雞排妹鄭家純記者會 騎士團 1 801
## 53313: Re:[爆卦]雞排妹鄭家純記者會 議價 1 801
## 53314: Re:[爆卦]雞排妹鄭家純記者會 騷擾 1 801
reserved_word <- g_artid %>%
group_by(word) %>%
count() %>%
filter(n > 5) %>%
unlist()
g_artid <- g_artid %>%
filter(word %in% reserved_word)
g_com_dtm <- g_artid %>% cast_dtm(artId, word, count)
g_com_dtm
## <<DocumentTermMatrix (documents: 801, terms: 1724)>>
## Non-/sparse entries: 33632/1347292
## Sparsity : 98%
## Maximal term length: 5
## Weighting : term frequency (tf)
#轉為分成兩群的LDA
g_lda <- LDA(g_com_dtm, k = 2, control = list(seed = 1234))
g_topics <- tidy(g_lda, matrix = "beta")
g_topics
## # A tibble: 3,448 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 太多 1.59e- 5
## 2 2 太多 6.58e- 4
## 3 1 和平 1.07e- 3
## 4 2 和平 2.21e- 8
## 5 1 面子 2.96e-13
## 6 2 面子 4.23e- 4
## 7 1 現在 1.50e- 3
## 8 2 現在 1.09e- 2
## 9 1 尊重 1.91e- 3
## 10 2 尊重 7.32e- 4
## # ... with 3,438 more rows
g_top_terms <- g_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
remove_words <- c("雞排妹")
g_top_terms <- g_topics %>%
filter(! term %in% remove_words) %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
g_top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
theme(text = element_text(family = "Heiti TC Light"))
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
可看出在只分為兩群的情況,發現討論到雞排妹還是以性騷擾一詞熱度最高
#查看雞排妹議題持續程度
plot_date <-
article_count_by_date %>%
ggplot(aes(x = artDate, y = count)) +
geom_line(color = "#00AFBB", size = 1) +
geom_vline(xintercept = as.numeric(as.Date("2021-02-05")), col='red', size = 1) +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("ptt八卦版 雞排妹討論文章數") +
xlab("日期") +
ylab("數量") +
theme(text = element_text(family = "Heiti TC Light")) #加入中文字型設定,避免中文字顯示錯誤。
plot_date
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
## Warning in grid.Call.graphics(C_text, as.graphicsAnnot(x$label), x$x, x$y, :
## font family not found in Windows font database
## Warning in grid.Call(C_textBounds, as.graphicsAnnot(x$label), x$x, x$y, : font
## family not found in Windows font database
g_oberve <-g_csv %>%
select(artTitle, commentNum, push, boo) %>%
filter(commentNum >= 100) %>%
mutate(p_ratio = push/commentNum, b_ratio = boo/commentNum) %>%
arrange(-p_ratio)
推文最高的文章為:有人會看了雞排妹今天的操作更討厭她嗎? 文章時間為2021/02/05,雞排妹現身記者會現場,但又拿不出證據! 因此從這天開始,雞排妹、性騷擾討論度開始下降
#情緒分析_資料前處理
MetaData = fread("./data/gp_articleMetaData2.CSV",encoding = 'UTF-8')
Reviews = fread("./data/gp_articleReviews2.csv",encoding = 'UTF-8')
keywords = c('雞排妹','性騷擾')
toMatch = paste(keywords,collapse="|")
MetaData = with(MetaData, MetaData[grepl(toMatch,sentence)|grepl(toMatch,artTitle),])
Reviews = left_join(MetaData, Reviews[,c("artUrl", "cmtContent")], by = "artUrl")
MToken <- MetaData %>% unnest_tokens(word, sentence, token=g_tokenizer)
RToken <- Reviews %>% unnest_tokens(word, cmtContent, token=g_tokenizer)
data <- rbind(MToken[,c("artDate","artUrl", "word")],RToken[,c("artDate","artUrl", "word")])
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
data_select = data %>%
filter(!grepl('[[:punct:]]',word)) %>%
filter(!grepl("['^0-9a-z']",word)) %>%
filter(nchar(.$word)>1)
word_count <- data_select %>%
select(artDate,word) %>%
group_by(artDate,word) %>%
summarise(count=n()) %>%
filter(count>3) %>%
arrange(desc(count))
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
#準備LIWC字典
word_count
## # A tibble: 14,304 x 3
## # Groups: artDate [19]
## artDate word count
## <date> <chr> <int>
## 1 2021-02-05 雞排妹 1439
## 2 2021-02-03 雞排妹 1040
## 3 2021-02-05 性騷擾 974
## 4 2021-02-03 飛機 937
## 5 2021-02-03 性騷擾 897
## 6 2021-02-06 雞排妹 874
## 7 2021-02-04 雞排妹 854
## 8 2021-02-05 道歉 698
## 9 2021-02-04 性騷擾 672
## 10 2021-02-04 飛機 613
## # ... with 14,294 more rows
P <- read_file("./dict/liwc_new/positive.txt")
N <- read_file("./dict/liwc_new/negative.txt")
typeof(P)
## [1] "character"
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
P = data.frame(word = P, sentiment = "positive")
N = data.frame(word = N, sentiment = "negative")
LIWC = rbind(P, N)
head(LIWC)
## word sentiment
## 1 一流 positive
## 2 下定決心 positive
## 3 不拘小節 positive
## 4 不費力 positive
## 5 主動 positive
## 6 乾杯 positive
#將文章和與LIWC情緒字典做join #正負情緒發文折線圖
MetaData$artDate= MetaData$artDate %>% as.Date("%Y/%m/%d")
MetaData %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot()+
geom_line(aes(x=artDate,y=count))+
scale_x_date(labels = date_format("%m/%d"))
#算出每天情緒總和
sentiment_count = data_select %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
#正負情緒分數折線圖
range(sentiment_count$artDate)
## [1] "2021-02-01" "2021-02-24"
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-02-01','2021-02-20'))
)+geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-02-05'))
[1]])),colour = "red")
## Warning: Removed 6 row(s) containing missing values (geom_path).
#正負情緒比例折線圖
sentiment_count %>%
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-02-01','2021-02-20'))
)
## Warning: Removed 6 row(s) containing missing values (geom_path).
剛爆發性騷擾疑雲時,負面情緒較高,但時間經過了兩個禮拜,雞排妹依舊沒有提出證據,因此對於性騷擾的正面情緒轉為較高,大家認為只是炒新聞,並非真相。
sentiment_count %>%
select(count,artDate) %>%
group_by(artDate) %>%
summarise(sum = sum(count)) %>%
arrange(desc(sum))
## # A tibble: 19 x 2
## artDate sum
## <date> <int>
## 1 2021-02-05 8629
## 2 2021-02-03 5886
## 3 2021-02-04 4677
## 4 2021-02-06 3802
## 5 2021-02-02 2574
## 6 2021-02-07 2523
## 7 2021-02-08 1043
## 8 2021-02-15 648
## 9 2021-02-17 593
## 10 2021-02-16 555
## 11 2021-02-19 254
## 12 2021-02-01 168
## 13 2021-02-23 114
## 14 2021-02-14 67
## 15 2021-02-20 44
## 16 2021-02-13 32
## 17 2021-02-18 27
## 18 2021-02-22 13
## 19 2021-02-24 11
#2021-02-05 文字雲
sentiment_plot_0205 <-word_count %>%
filter(!(word %in% c("雞排妹" ))) %>%
filter(artDate == as.Date('2021-02-05')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
arrange(desc(count)) %>%
filter(count>100) %>%
wordcloud2()
## Adding missing grouping variables: `artDate`
sentiment_plot_0205
#2021-02-06 文字雲
sentiment_plot_0206 <- word_count %>%
filter(!(word %in% c("雞排妹" ))) %>%
filter(artDate == as.Date('2021-02-06')) %>%
select(word,count) %>%
group_by(word) %>%
summarise(count = sum(count)) %>%
arrange(desc(count)) %>%
wordcloud2()
## Adding missing grouping variables: `artDate`
sentiment_plot_0206
從2021-02-05 文字雲發現鄉民認為性騷擾的可能性很高 但2021-02-20 文字雲發現鄉民認為是假新聞了,因為雞排妹遲遲提不出證據
#正負情緒代表字
sentiment_sum <-
word_count %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum %>%
top_n(30,wt = sum) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
#正負情緒文字雲
sentiment_sum %>%
acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
comparison.cloud(
colors = c("salmon", "#72bcd4"),
max.words = 50)
#2021-02-05 正負情緒代表字
sentiment_sum_select <-
word_count %>%
filter(artDate == as.Date('2021-02-05')) %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = sum(count)
) %>%
arrange(desc(sum)) %>%
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
sentiment_sum_select %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to sentiment 0205",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
sentiment_sum_select %>%
acast(word ~ sentiment, value.var = "sum", fill = 0) %>%
comparison.cloud(
colors = c("salmon", "#72bcd4"), # positive negative
max.words = 50)
#歸類正負面文章
article_type =
data_select %>%
inner_join(LIWC) %>%
group_by(artUrl,sentiment) %>%
summarise(count=n()) %>%
spread(sentiment,count,fill = 0) %>%
mutate(type = case_when(positive > negative ~ "positive",
TRUE ~ "negative")) %>%
data.frame()
## Joining, by = "word"
## `summarise()` has grouped output by 'artUrl'. You can override using the `.groups` argument.
article_type %>%
group_by(type) %>%
summarise(count = n())
## # A tibble: 2 x 2
## type count
## <chr> <int>
## 1 negative 660
## 2 positive 393
article_type_date = left_join(article_type[,c("artUrl", "type")], MetaData[,c("artUrl", "artDate")], by = "artUrl")
article_type_date %>%
group_by(artDate,type) %>%
summarise(count = n()) %>%
ggplot(aes(x = artDate, y = count, fill = type)) +
geom_bar(stat = "identity", position = "dodge")+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2021-02-05','2021-02-20'))
)
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
## Warning: Removed 13 rows containing missing values (geom_bar).
negative_article <-
article_type %>%
filter(type=="negative")%>%
select(artUrl) %>%
left_join(data_select[,c("artUrl", "word")], by = "artUrl")
positive_article <-
article_type %>%
filter(type=="positive")%>%
select(artUrl) %>%
left_join(data_select[,c("artUrl", "word")], by = "artUrl")
#情緒關鍵字:負面情緒文章
negative_article %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = n()
)%>%
arrange(desc(sum)) %>%
data.frame() %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to negative sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
#情緒關鍵字:正面情緒文章
positive_article %>%
inner_join(LIWC) %>%
group_by(word,sentiment) %>%
summarise(
sum = n()
)%>%
arrange(desc(sum)) %>%
data.frame() %>%
top_n(30,wt = sum) %>%
ungroup() %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum, fill = sentiment)) +
geom_col(show.legend = FALSE) +
facet_wrap(~sentiment, scales = "free_y") +
labs(y = "Contribution to positive sentiment",
x = NULL) +
theme(text=element_text(size=14))+
coord_flip()
## Joining, by = "word"
## `summarise()` has grouped output by 'word'. You can override using the `.groups` argument.
結論 從雞排妹訴說著被性騷擾,開始帶起演藝圈性騷擾的風向,但風向會一直存在著要提出證據,加以提告,否則只是炒新聞、增加熱度,故當遲遲提出不了證據時,討論度就會急速下降,在兩個禮拜後,關於雞排妹的討論議題已寥寥無幾,鄉民也不認為雞排妹說的為事實了。