Data time range: 2020/4/1–2021/4/1
Keywords: 生小孩 or 生孩子 ("having children")
Source site: PTT
Scope of the crawl: Gossiping board articles and comments, Girls board comments, and Marriage board comments.
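The crawler itself is not included in this excerpt. Purely as an illustration of the scope above, a keyword-and-date filter might look like the following sketch; the data frame raw and its artTitle/artDate columns are hypothetical and not part of the original code.

library(dplyr)
library(stringr)
# Hypothetical sketch only: keep articles whose title mentions either keyword
# and whose date falls inside the crawl window (raw and its columns are assumed)
scoped <- raw %>%
  filter(str_detect(artTitle, "生小孩|生孩子"),
         as.Date(artDate) >= as.Date("2020-04-01"),
         as.Date(artDate) <= as.Date("2021-04-01"))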
# Install required packages
packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr", "data.table", "topicmodels")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)
# Load libraries
library(readr)
library(dplyr)
library(stringr)
library(jiebaR)
library(tidytext)
library(NLP)
library(tidyr)
library(ggplot2)
library(ggraph)
library(igraph)
library(scales)
library(reshape2)
library(widyr)
library(data.table)
library(topicmodels)
# rm(list=ls())
# Data description
## 1. Girls board data (ptt_Catch_GirlsBoard_articleReviews), 3,685 rows
csv <- fread("./data/ptt_Catch_GirlsBoard_articleReviews.csv", encoding = "UTF-8")
count(csv)
## 2. Marriage board data (ptt_Catch_MarryBoard__articleReviews), 1,553 rows
csv1 <- fread("./data/ptt_Catch_MarryBoard__articleReviews.csv", encoding = "UTF-8")
count(csv1)
## 3. Gossiping board comments (ptt_Gossiping_articleReviews), 12,548 rows
csv2 <- fread("./data/ptt_Gossiping_articleReviews.csv", encoding = "UTF-8")
count(csv2)
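The tokenization step further below operates on csvT, which is never defined in this excerpt. A minimal sketch, assuming csvT is simply the three boards' comment tables stacked together:

# Assumption: csvT combines the three boards' comment data into one table
csvT <- bind_rows(csv, csv1, csv2)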
# Load the custom user dictionary and the stop-word list
jieba_tokenizer <- worker(user="./dict/user_dict.txt", stop_word = "./dict/stop_words.txt")
# Define the tokenizer function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    # Segment the text; the worker already applies the stop-word list
    tokens <- segment(x, jieba_tokenizer)
    # Drop tokens that are only one character long
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}

tokens <- csvT %>%
  mutate(cmtContent = gsub("[[:punct:]]", "", cmtContent)) %>%   # remove punctuation
  mutate(cmtContent = gsub("[0-9a-zA-Z]", "", cmtContent)) %>%   # remove digits and Latin letters
  unnest_tokens(word, cmtContent, token=customized_tokenizer) %>%
  count(artUrl, word) %>% # term frequency per article
  rename(count=n)
save(tokens, file = "token.Rda")
load(file = "token.Rda")
freq = 3
# Keep only words whose total frequency exceeds freq
reserved_word <- tokens %>%
  group_by(word) %>%
  count() %>%
  filter(n > freq) %>%
  pull(word)
word_removed <- tokens %>%
  filter(word %in% reserved_word)
# Convert the data into a Document-Term Matrix (DTM)
dtm <- word_removed %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 449, terms: 2432)>>
## Non-/sparse entries: 31539/1060429
## Sparsity : 97%
## Maximal term length: 5
## Weighting : term frequency (tf)
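As an optional sanity check (not part of the original analysis), a small corner of the DTM can be inspected with the tm package, which cast_dtm already relies on:

# Optional check: peek at the first few documents and terms of the DTM
library(tm)
inspect(dtm[1:5, 1:5])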
We fit models with 2, 4, 6, 10, and 16 topics, store the results for further analysis,
and save the fitted models to ldas_result.rdata so they can be loaded directly later.
ldas = c()
topics = c(2,4,6,10,16)
for(topic in topics){
  start_time <- Sys.time()
  lda <- LDA(dtm, k = topic, control = list(seed = 2021))
  ldas = c(ldas, lda)
  print(paste(topic, paste("topic(s) and use time is ", Sys.time() - start_time)))
  save(ldas, file = "ldas_result.rdata") # save the fitted models to a file
}
## [1] "2 topic(s) and use time is 1.79074501991272"
## [1] "4 topic(s) and use time is 8.3955819606781"
## [1] "6 topic(s) and use time is 15.1177880764008"
## [1] "10 topic(s) and use time is 26.3967609405518"
## [1] "16 topic(s) and use time is 48.0908648967743"
#install.packages("tidyverse")
#install.packages("purrr")
library(tidyverse)
library(purrr)
# Load the LDA result for each topic count
load("ldas_result.rdata")
topics = c(2,4,6,10,16)
tibble(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
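For reference, the raw perplexity values behind the plot can also be printed directly (a quick check, not part of the original code):

# Perplexity of each fitted model, named by its number of topics
setNames(map_dbl(ldas, topicmodels::perplexity), topics)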
Comparing the 4-topic and 6-topic models below, the 6-topic model gives the clearest grouping.
# Extract the representative terms of each topic
# LDA with 4 topics
lda <- LDA(dtm, k = 4, control = list(seed = 1))
# Remove common words shared across topics
removed_word = c("小孩","問題","不想","生小孩","孩子","生孩子","一定","一堆","有錢","沒錢","真的","五樓")
# Inspect the most frequent terms of each topic
tidy(lda, matrix = "beta") %>% # extract the topic-term beta values
filter(! term %in% removed_word) %>%
group_by(topic) %>%
top_n(12, beta) %>% # top 12 terms by beta
ungroup() %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()
# Extract the representative terms of each topic
# LDA with 6 topics
lda <- LDA(dtm, k = 6, control = list(seed = 1))
# Remove common words shared across topics
removed_word = c("小孩","問題","不想","生小孩","孩子","生孩子","一定","一堆","有錢","沒錢","真的","五樓")
# Inspect the most frequent terms of each topic
tidy(lda, matrix = "beta") %>% # extract the topic-term beta values
filter(! term %in% removed_word) %>%
group_by(topic) %>%
top_n(12, beta) %>% # top 12 terms by beta
ungroup() %>%
mutate(topic = as.factor(topic),
term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = topic)) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered()

Topic 1: raising children for old-age support (養兒防老)
Topic 2: genetics (基因)
Topic 3: discussion about not having children (不生小孩)
Topic 4: the age to have children (年紀)
Topic 5: the problems of having children (生小孩的問題)
Topic 6: the relationship between housing prices, salary, and having children (房價與薪水)
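For convenience in the later plots, these interpretations can be collected into a named vector (a small helper sketch; topic_labels does not appear in the original code):

# Hypothetical helper: map each topic id to the label interpreted above
topic_labels <- c("1" = "養兒防老", "2" = "基因", "3" = "不生小孩",
                  "4" = "年紀", "5" = "生小孩的問題", "6" = "房價與薪水")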
# Use matrix="gamma" in the tidy function to get the document-topic (theta) matrix
topics <- tidy(lda, matrix="gamma") %>% # document topic gamma
group_by(document) %>%
top_n(1, wt=gamma)
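The next plot uses posts_topic, which is not defined in this excerpt. A minimal sketch, assuming it attaches each article's dominant topic to the article's posting date:

# Assumption: posts_topic = dominant topic per article joined with its posting date
posts_topic <- topics %>%
  left_join(csvT %>% distinct(artUrl, artDate), by = c("document" = "artUrl"))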
topics

posts_topic %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(artDate,topic) %>%
summarise(sum = sum(topic)) %>%
ggplot()+
geom_line(aes(artDate, sum, color=as.factor(topic))) +
facet_wrap(~ topic, scales = "free") +
coord_flip()
The discussion about having children clearly focuses most on salary and housing prices.
require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)
posts_topic %>%
mutate(artDate = as.Date(artDate)) %>%
group_by(topic,artDate = format(artDate,'%Y%m')) %>%
summarise(n=sum(topic)) %>%
group_by(artDate) %>%
mutate(total_value =sum(n))%>%
ggplot(aes(x=artDate, y=n/total_value, fill=as.factor(topic))) +
geom_bar(stat = "identity") + ylab("proportion") +
scale_fill_manual(values=mycolors[c(1,5,8,12,14,16)],name="文章主題", labels=c("養兒防老","基因","不生小孩","年紀","生小孩的問題","房價與薪水")) +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# Prepare the articles and comments
reviews <- csvT %>%
select(artUrl, cmtPoster, cmtStatus, cmtContent)
posts <- csvT %>%
group_by(artUrl) %>%
mutate(commentNum=n()) %>%
select(artUrl, artPoster,commentNum,artDate) %>%
unique()
posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")
# Attach each article's dominant topic
posts_Reviews <- merge(x = posts_Reviews, y = topics, by.x = "artUrl", by.y="document")
head(posts_Reviews,5)
## IGRAPH 9285c61 DN-- 6941 17782 --
## + attr: name (v/c), artUrl (e/c)
## + edges from 9285c61 (vertex names):
## [1] cheetahspeed->syearth james732 ->syearth globekiller ->syearth
## [4] pacitic ->syearth sai1268 ->syearth sai1268 ->syearth
## [7] owo0204 ->syearth susaku ->syearth nonightcat ->syearth
## [10] nonightcat ->syearth toyota2211 ->syearth toyota2211 ->syearth
## [13] geesegeese ->syearth geesegeese ->syearth geesegeese ->syearth
## [16] cuteama ->syearth lovealgebra ->syearth james732 ->syearth
## [19] yayaqaz ->syearth jorden ->syearth chun821543 ->ORK
## [22] algebraic ->ORK rootpresent ->ORK rootpresent ->ORK
## + ... omitted several edges
Look at the distribution of comment counts per article (to help with the filtering below),
and find the maximum comment count: max = 693
## [1] 693
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
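The code that produced the histogram and the maximum is missing from the excerpt; a minimal sketch of what it likely looked like, which also defines max_reviews as used in the filter further below (an assumption on our part):

# Assumption: distribution of comments per article, plus the maximum comment count (693)
posts_Reviews %>%
  distinct(artUrl, commentNum) %>%
  ggplot(aes(commentNum)) +
  geom_histogram()
max_reviews <- max(posts_Reviews$commentNum)
max_reviews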
allPoster <- c(posts_Reviews$artPoster, posts_Reviews$cmtPoster)
# Total number of participants: 6941
length(unique(allPoster))
## [1] 6941
# Tag every user who appears
poster: has posted (may also have commented)
replyer: has only commented
userList <- data.frame(user=unique(allPoster)) %>%
mutate(type=ifelse(user%in%posts$artPoster, "poster", "replyer"))
head(userList,3)

link <- posts_Reviews %>%
group_by(cmtPoster, artUrl) %>%
filter(n()>2) %>%
filter(artDate > as.Date(posts_Reviews[posts_Reviews$commentNum == max_reviews,]$artDate[[1]])) %>%
select(cmtPoster, artPoster, artUrl) %>%
unique()
link

filtered_user <- userList %>%
filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
arrange(desc(type))
head(filtered_user,3)

set.seed(487)
# Build the network from link, with filtered_user as the vertex list
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3, vertex.label=NA)

Color the nodes by user role:
poster: gold (has posted)
replyer: lightblue (has only commented)
set.seed(487)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3, vertex.label=NA)

filter_degree = 7 # minimum degree for showing a user's label
# Filter comments by how the commenter up/down-votes the poster
link <- posts_Reviews %>%
filter(commentNum > 20) %>%
filter(cmtStatus!="→") %>%
group_by(cmtPoster, artUrl) %>%
filter( n() > 1) %>%
ungroup() %>%
select(cmtPoster, artPoster, artUrl, cmtStatus) %>%
unique()
# Next we draw the network graph; the steps are the same as before, so we do not repeat the details
# Keep only the users that appear in link
filtered_user <- userList %>%
  filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
  arrange(desc(type))
# Build the network
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
# Color the nodes by user role
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
# Color the edges by comment status (推 = upvote in light green, 噓 = downvote in pale violet red)
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")
# Draw the social network graph
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA), vertex.label.font=2)
# Add legends
legend("bottomright", c("發文者","回文者"), pch=21,
col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"),
col=c("lightgreen","palevioletred"), lty=1, cex=1)

# networkD3
#install.packages("networkD3")
library(networkD3)
links = link
nodes = filtered_user
nodes$id = 0:(length(nodes$user) - 1)
# Reorganize the data into the format networkD3 expects
nodes_complete <- data.frame(nodeID = unique(c(links$cmtPoster, links$artPoster)))
nodes_complete$group <- nodes$type[match(nodes_complete$nodeID, nodes$user)]
links$source <- match(links$cmtPoster, nodes_complete$nodeID) - 1
links$target <- match(links$artPoster, nodes_complete$nodeID) - 1
# Draw the interactive graph
library(networkD3)
forceNetwork(Links = links, Nodes = nodes_complete, Source = "source",
Target = "target", NodeID = "nodeID", Group = "group",
opacity = 0.8, fontSize = 10, zoom = TRUE,legend = TRUE, opacityNoHover = TRUE,
colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
linkColour = ifelse(links$cmtStatus == "推", "red","green") # edge color by up/down vote
)
## Links is a tbl_df. Converting to a plain data frame.
We initially assumed that the main reasons for not having children were housing prices, salaries, and the expectation of raising children for old-age support. After digging deeper, we found these are far from the only reasons: social attitudes, government policy, genetics, the physical toll of child-raising, the child's sex, and whether to stay at a postpartum care center are all major factors in the decision not to have children.
Therefore, if the government wants to raise the birth rate, beyond economic measures such as urging companies to raise wages, increasing the minimum wage, or helping industries upgrade to lift overall pay, it must also work on social welfare and social safety, and build a more complete childcare system.