載入的資料是由中山大學管理學院文字分析平台取得,在平台資料選擇下載原始資料所取得之csv檔案。
本資料為2019/01/01 ~ 2019/04/12 PTT八卦版之資料,透過文字分析平台檢索「館長」、「陳之漢」兩個關鍵字,共搜尋到646篇文章。
Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8") # 避免中文亂碼
[1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
packages = c("dplyr", "tidytext", "jiebaR", "gutenbergr", "stringr", "wordcloud2", "ggplot2", "tidyr", "scales", "widyr", "readr", "reshape2", "NLP", "ggraph", "igraph", "tm", "data.table", "quanteda", "Matrix", "slam", "Rtsne", "randomcoloR", "wordcloud")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
require(dplyr)
require(tidytext)
require(jiebaR)
require(gutenbergr)
require(stringr)
require(wordcloud2)
require(ggplot2)
require(tidyr)
require(scales)
require(widyr)
require(readr)
require(reshape2)
require(NLP)
require(ggraph)
require(igraph)
require(tm)
require(data.table)
require(quanteda)
require(Matrix)
require(slam)
require(Rtsne)
require(randomcoloR)
require(wordcloud)
g_csv <- fread("guan_jang_data.csv", encoding = "UTF-8", header = TRUE)
g_csv <- g_csv %>%
filter(artUrl != "https://www.ptt.cc/bbs/Gossiping/M.1547888391.A.836.html")
g_csv$artDate = g_csv$artDate %>% as.Date("%Y/%m/%d")
str(g_csv)
'data.frame': 646 obs. of 10 variables:
$ artTitle : chr "[問卦]館長:服飾Q4營收破6000萬,利潤只有300" "Re:[問卦]館長:服飾Q4營收破6000萬,利潤只有300" "Re:[問卦]館長:服飾Q4營收破6000萬,利潤只有300" "Re:[問卦]館長:服飾Q4營收破6000萬,利潤只有300" ...
$ artDate : Date, format: "2018-12-31" "2018-12-31" "2019-01-01" "2019-01-01" ...
$ artTime : chr "23:09:54" "23:23:59" "00:47:39" "01:00:14" ...
$ artUrl : chr "https://www.ptt.cc/bbs/Gossiping/M.1546326956.A.E8A.html" "https://www.ptt.cc/bbs/Gossiping/M.1546327801.A.3DC.html" "https://www.ptt.cc/bbs/Gossiping/M.1546332821.A.7DD.html" "https://www.ptt.cc/bbs/Gossiping/M.1546333577.A.187.html" ...
$ artPoster : chr "jack8587" "foreverthink" "MrLuna" "Dannybigma" ...
$ artCat : chr "Gossiping" "Gossiping" "Gossiping" "Gossiping" ...
$ commentNum: int 133 22 24 7 4 9 2 4 691 7 ...
$ push : int 65 9 7 1 1 3 2 2 471 4 ...
$ boo : int 13 4 8 3 0 2 0 1 60 0 ...
$ sentence : chr "館長昨天在直播表示,感謝大家支持,衣服事業短短3個月已進帳6000多萬,但會計精算過\n後,扣除人事成本只賺300萬,換"| __truncated__ "算?\n成\n\n三成是行規吧...\n\n第一次或頭幾次跟工廠做生意都是訂金要三成\n\n怎麼會是壓低成本呢\n\n這叫人家怕被詐"| __truncated__ "奇怪 大量訂製衣服 這流程是不是怪怪的\n\n有人跟成衣廠訂貨 還要自己出材料費的嗎??\n\n般流程 指定布料 設計圖 "| __truncated__ "算?\n成\n萬?\n界?\n奇怪 大量訂製衣服 這流程是不是怪怪的\n\n有人跟成衣廠訂貨 還要自己出材料費的嗎??\n\n般流程 "| __truncated__ ...
- attr(*, ".internal.selfref")=<externalptr>
head(g_csv)
這個章節的目的是計算出每一天文章的發表數量,可以看出特定主題討論的熱度。
g_date <- g_csv %>%
select(artDate, artUrl) %>%
distinct()
由於這份資料的每一列是特定文章的每一個詞彙,我們只需要文章以及日期兩個欄位即可,其他重複欄位可以去除。(一篇文章有很多個詞彙,所以會有很多列,但我們只需要保留一個URL即可)。
article_count_by_date <- g_date %>%
group_by(artDate) %>%
summarise(count = n())
article_count_by_date %>%
arrange(desc(count))%>%
top_n(10)
Selecting by count
按照日期分群,計算每天共有幾篇討論文章。
article_count_by_date %>%
ggplot(aes(x = artDate, y = count)) +
geom_line(color = "purple", size = 1.5) +
geom_vline(xintercept = c(as.numeric(as.Date("2019-01-05")),
as.numeric(as.Date("2019-01-16")),
as.numeric(as.Date("2019-03-04")),
as.numeric(as.Date("2019-03-10")),
as.numeric(as.Date("2019-04-09"))), col='red', size = 1) +
scale_x_date(labels = date_format("%Y/%m/%d")) +
ggtitle("「館長」討論文章數") +
xlab("日期") +
ylab("數量") +
theme(text = element_text(family = "Heiti TC Light"))
從上圖中可以看到關於「館長」的討論在1/05, 1/16, 3/04, 3/10, 4/09 出現高點。 1/05: [問卦]館長嘴斷食,究竟哪邊對? [爆卦]館長:肏你媽有種現在就打過來啦 1/16: [爆卦]館長正在跟柯文哲直播談虐童案 [新聞]館長促阿北2020選總統 柯P:我要再想想 3/04: [新聞]「叫小賀!」孫安佐單挑館長影片曝光眼 [問卦]館長打得贏甄子丹嗎 3/10: [新聞]統促黨嗆館長打一場 「簽生死狀,條件你 [問卦]館長譙統促黨髒話本來就理虧,不是嗎? 4/09: [新聞]館長對決美國智庫?蔡賴今晚熱身賽 [新聞]與館長談統獨賴清德:統一就像斯斯有兩
接下來我們來大略觀察討論的內容為何,使用的方式為文字雲。
# 初始化斷詞引擎
jieba_tokenizer <- worker(user="k_dict.txt", stop_word = "stop_words.txt")
# 自定義斷詞函式
g_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
tokens <- tokens[nchar(tokens)>1]
return(tokens)
})
}
g_tokens <- g_csv %>%
unnest_tokens(word, sentence, token=g_tokenizer) %>%
select(-artTime, -artUrl)
g_tokens_count <- g_tokens %>%
group_by(word) %>%
summarise(sum = n()) %>%
arrange(desc(sum))
先將資料集中所有文章按照文字進行分群,計算每一個字的總詞頻。
head(g_tokens_count)
結果為總詞頻最多的字。
g_tokens_count %>%
filter(word != "館長") %>%
filter(sum > 20) %>%
wordcloud2()
將整理好的資料直接送入wordcloud2
文字雲可以直覺看出較常提到的字,但如果想得到精確的「最常出現詞彙」,我們則可以透過長條圖來查看。
g_tokens_count %>%
top_n(10) %>%
mutate(word = reorder(word, sum)) %>%
ggplot(aes(word, sum)) +
geom_col(show.legend = FALSE) +
labs(x = NULL, y="詞頻") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
Selecting by sum
g_tokens_by_art <- g_tokens %>%
filter(!str_detect(word, regex("[0-9]"))) %>%
count(artTitle, word, sort = TRUE)
g_total_words_by_art <- g_tokens_by_art %>%
group_by(artTitle) %>%
summarize(total = sum(n)) %>%
arrange(desc(total))
g_tokens_by_art <- left_join(g_tokens_by_art, g_total_words_by_art)
Joining, by = "artTitle"
g_words_tf_idf <- g_tokens_by_art %>%
bind_tf_idf(word, artTitle, n)
g_words_tf_idf %>%
filter(total > 20) %>%
arrange(desc(tf_idf))
文章本文: 背心尊者有強大的背心能力 平常館長也很常穿背心
不過我觀察館長穿的是寬鬆類背心
背心尊者穿的是緊身類背心
雖然是不同背心
但同樣都是背心愛好者
手下的教練 也是有背心軍團的資格
館長根本是真人版的背心尊者吧 大家覺得呢
g_words_tf_idf %>%
filter(total > 100) %>%
arrange(desc(tf_idf))
g_tokens_by_date <- g_tokens %>%
filter(!str_detect(word, regex("[0-9]"))) %>%
count(artDate, word, sort = TRUE)
g_total_words_by_date <- g_tokens_by_date %>%
group_by(artDate) %>%
summarize(total = sum(n)) %>%
arrange(desc(total))
g_tokens_by_date <- left_join(g_tokens_by_date, g_total_words_by_date)
Joining, by = "artDate"
g_words_tf_idf_date <- g_tokens_by_date %>%
bind_tf_idf(word, artDate, n)
g_words_tf_idf_date %>%
filter(total > 20) %>%
group_by(artDate) %>%
top_n(1) %>%
arrange(artDate)
Selecting by tf_idf
g_words_tf_idf_date %>%
filter(total > 20) %>%
filter(artDate == as.Date("2019-01-05") |
artDate == as.Date("2019-01-16") |
artDate == as.Date("2019-03-04") |
artDate == as.Date("2019-03-10") |
artDate == as.Date("2019-04-09")) %>%
group_by(artDate) %>%
top_n(1) %>%
arrange(artDate)
Selecting by tf_idf
前面通過篇數找出的文章篇數高點 1/05, 1/16, 3/04, 3/10, 4/09 。 1/05: [問卦]館長嘴斷食,究竟哪邊對? [爆卦]館長:肏你媽有種現在就打過來啦 1/16: [爆卦]館長正在跟柯文哲直播談虐童案 [新聞]館長促阿北2020選總統 柯P:我要再想想 3/04: [新聞]「叫小賀!」孫安佐單挑館長影片曝光眼 [問卦]館長打得贏甄子丹嗎 3/10: [新聞]統促黨嗆館長打一場 「簽生死狀,條件你 [問卦]館長譙統促黨髒話本來就理虧,不是嗎? 4/09: [新聞]館長對決美國智庫?蔡賴今晚熱身賽 [新聞]與館長談統獨賴清德:統一就像斯斯有兩
ngram_11 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
ngram <- ngrams(tokens, 11)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
})
}
g_ngram_11 <- g_csv %>%
select(artUrl, sentence) %>%
unnest_tokens(ngram, sentence, token = ngram_11) %>%
filter(!str_detect(ngram, regex("[0-9a-zA-Z]")))
g_ngram_11
g_ngrams_11_separated <- g_ngram_11 %>%
separate(ngram, paste0("word", c(1:11),sep=""), sep = " ")
g_ngrams_11_separated
g_check_words <- g_ngrams_11_separated %>%
filter((word6=="統促黨"))
g_check_words
g_check_words_count <- g_check_words %>%
melt(id.vars = "artUrl", measure.vars = paste0("word", c(1:11),sep="")) %>%
rename(word=value) %>%
filter(variable!="word6") %>%
filter(!(word %in% stop_words), nchar(word)>1) %>%
count(word, sort = TRUE)
g_check_words_count
g_check_words_count %>%
arrange(desc(abs(n))) %>%
head(15) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n, fill = n > 0)) +
geom_col(show.legend = FALSE) +
xlab("Words near by \"館長\"") +
ylab("Word count") +
coord_flip()+
theme(text = element_text(family = "Heiti TC Light"))
g_words_by_art <- g_csv %>%
unnest_tokens(word, sentence, token=g_tokenizer) %>%
filter(!str_detect(word, regex("[0-9]"))) %>%
count(artUrl, word, sort = TRUE)
g_words_by_art
g_word_pairs <- g_words_by_art %>%
pairwise_count(word, artUrl, sort = TRUE)
g_word_pairs
g_word_cors <- g_words_by_art %>%
group_by(word) %>%
filter(n() >= 20) %>%
pairwise_cor(word, artUrl, sort = TRUE)
g_word_cors
g_word_cors %>%
filter(item1 == "館長")
set.seed(2019)
g_word_cors %>%
filter(correlation > .4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
# 設定幾個詞做爲seed words
seed_words <- c("新聞", "綜合", "appledaily")
# 設定threshold爲0.65
threshold <- 0.65
# 跟seed words相關性高於threshold的詞彙會被加入移除列表中
remove_words <- g_word_cors %>%
filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
.$item1 %>%
unique()
remove_words
[1] "appledaily" "realtime" "即時新聞" "綜合" "新聞標題" "新聞" "完整" "內文"
[9] "網址" "報導" "連結" "來源" "備註" "媒體"
set.seed(2017)
g_word_cors_new <- g_word_cors %>%
filter(!(item1 %in% remove_words|item2 %in% remove_words))
g_word_cors_new %>%
filter(correlation > .4) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
g_words_tf_idf
term_avg_tfidf = g_words_tf_idf %>%
group_by(word) %>%
summarise(tfidf_avg = mean(tf_idf))
term_avg_tfidf %>% arrange(desc(tfidf_avg))
term_avg_tfidf$tfidf_avg %>% summary
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.0001514 0.0325254 0.0521481 0.0988045 0.1138864 2.1033061
term_remove=term_avg_tfidf %>%
filter(tfidf_avg<0.0325254) %>%
.$word
term_remove %>% head
[1] "阿嘎" "阿共嚇" "啊災" "挨告" "愛民" "愛鄉"
g_dtm = g_words_tf_idf %>%
filter(!word %in% term_remove) %>%
cast_dtm(document=artTitle,term=word,value= n)
g_dtm
<<DocumentTermMatrix (documents: 550, terms: 8705)>>
Non-/sparse entries: 22406/4765344
Sparsity : 100%
Maximal term length: 13
Weighting : term frequency (tf)
g_dtm_matrix = g_dtm %>% as.data.frame.matrix
g_dtm_matrix[1:10,1:20]
library(doParallel)
clust = makeCluster(detectCores())
registerDoParallel(clust); getDoParWorkers()
[1] 4
t0 = Sys.time()
d = g_dtm_matrix %>%
dist(method="euclidean") #歐式距離,算文章與文章之間的距離
Sys.time() - t0
Time difference of 7.377416 secs
hc = hclust(d, method='ward.D')
plot(hc, labels = FALSE)
rect.hclust(hc, k = 3, border="red")
kg = cutree(hc, k = 3)
L = split(g_dtm_matrix, kg)
L$`1`[1:10,1:10]
sapply(L, function(x) x%>% colMeans %>% sort %>% tail %>% names)
1 2 3
[1,] "賴清德" "韓國瑜" "影片"
[2,] "統促黨" "體育" "健身"
[3,] "完整" "比賽" "看到"
[4,] "總統" "高雄市" "艾瑪"
[5,] "中國" "成吉思汗" "台灣"
[6,] "台灣" "艾瑪" "八卦"
# t0 = Sys.time()
# n = 2000 #n個字
# tsne = g_dtm[, 1:n] %>% as.data.frame.matrix %>%
# scale %>% t %>% Rtsne(
# check_duplicates = FALSE, theta=0.0, max_iter=3200)
# Sys.time()-t0
# Y = tsne$Y # tSNE coordinates
# d_Y = dist(Y) # distance matrix
# hc_Y = hclust(d_Y ) # hi-clustering
# plot(hc_Y,label=F)
# rect.hclust(hc_Y, k=10, border="red")
# K = 10 # number of clusters
# g = cutree(hc_Y,K) # cut into K clusters
# table(g) %>% as.vector %>% sort # sizes of clusters
# wc = col_sums(g_dtm[,1:n]) #n個字
# colors = distinctColorPalette(K)
# png("./g.png", width=3200, height=1800)#輸出圖片到路徑下
# textplot(
# Y[,1], Y[,2], colnames(g_dtm)[1:n], show=F,
# col=colors[g],
# cex= 0.3 + 1.25 * sqrt(wc/mean(wc)),
# font=2, family = "Heiti TC Light")
# dev.off()