資料的時間區間:2020/4/1-2021/4/1
資料的關鍵字:生小孩或生孩子
網站來源:PPT
抓取資料主題或範圍:八卦版文章與網友留言、女孩版網友留言與婚姻版網友留言。

1 環境設置與前處理

1.1 避免中文亂碼之處理

Sys.setlocale(category = "LC_ALL", locale = "zh_TW.UTF-8")
## [1] ""

1.2 安裝需要的packages與載入

#安裝
packages = c("readr", "dplyr", "stringr", "jiebaR", "tidytext", "NLP", "readr", "tidyr", "ggplot2", "ggraph", "igraph", "scales", "reshape2", "widyr","data.table","topicmodels")
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

#載入
library(readr)
library(dplyr)
library(stringr)
library(jiebaR)
library(tidytext)
library(NLP)
library(tidyr)
library(ggplot2)
library(ggraph)
library(igraph)
library(scales)
library(reshape2)
library(widyr)
library(data.table)
library(topicmodels)

1.3 讀入資料並初步合併處理

#rm(list=ls())

#資料說明
##1.女孩版資料(ptt_Catch_GirlsBoard_articleReviews),共3,685筆資料
csv <- fread("./data/ptt_Catch_GirlsBoard_articleReviews.csv", encoding = "UTF-8")
count(csv)
##2.婚姻版資料(ptt_Catch_MarryBoard__articleReviews),共1,553筆資料
csv1 <- fread("./data/ptt_Catch_MarryBoard__articleReviews.csv", encoding = "UTF-8")
count(csv1)
##2.八卦版討論之留言(ptt_Gossiping_articleReviews),共12,548筆資料
csv2 <- fread("./data/ptt_Gossiping_articleReviews.csv", encoding = "UTF-8")
count(csv2)
##合併留言
csvT <- rbind(csv,csv1,csv2)
count(csvT)

1.4 jieba初始化

# 加入停用的字典
jieba_tokenizer <- worker(user="./dict/user_dict.txt", stop_word = "./dict/stop_words.txt")

# 設定斷詞function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    tokens <- tokens[!tokens %in% stop_words]
    # 去掉字串長度爲1的詞彙
    tokens <- tokens[nchar(tokens)>1]
    return(tokens)
  })
}

1.5 使用jieba斷字

  tokens <- csvT %>%
   mutate(cmtContent = gsub("[[:punct:]]", "",cmtContent)) %>%
   mutate(cmtContent = gsub("[0-9a-zA-Z]", "",cmtContent)) %>%
   unnest_tokens(word, cmtContent, token=customized_tokenizer) %>%
   count(artUrl, word) %>% # 計算每篇文章出現的字頻
   rename(count=n)
save(tokens, file = "token.Rda")

load(file = "token.Rda")
freq = 3
#依據字頻挑字
reserved_word <- tokens %>% 
  group_by(word) %>% 
  count() %>% 
  filter(n > freq) %>% 
  unlist()

word_removed <- tokens %>% 
  filter(word %in% reserved_word)

#將資料轉換為Document Term Matrix (DTM)
dtm <- word_removed %>% cast_dtm(artUrl, word, count)
dtm
## <<DocumentTermMatrix (documents: 449, terms: 2432)>>
## Non-/sparse entries: 31539/1060429
## Sparsity           : 97%
## Maximal term length: 5
## Weighting          : term frequency (tf)

2 LDA主題模型

2.1 尋找最佳主題數

嘗試2、4、6、10、16個主題數,將結果存起來,
再做進一步分析,將跑完的檔案存成ldas_result.rdata,可以直接載入

 ldas = c()
 topics = c(2,4,6,10,16)
 for(topic in topics){
   start_time <- Sys.time()
   lda <- LDA(dtm, k = topic, control = list(seed = 2021))
   ldas =c(ldas,lda)
   print(paste(topic ,paste("topic(s) and use time is ",  Sys.time() -start_time)))
   save(ldas,file = "ldas_result.rdata") # 將模型輸出成檔案
 }
## [1] "2 topic(s) and use time is  1.79074501991272"
## [1] "4 topic(s) and use time is  8.3955819606781"
## [1] "6 topic(s) and use time is  15.1177880764008"
## [1] "10 topic(s) and use time is  26.3967609405518"
## [1] "16 topic(s) and use time is  48.0908648967743"

2.2 透過perplexity找到最佳主題數

#install.packages("tidyverse")
#install.packages("purrr")
library(tidyverse)
library(purrr)

#載入每個主題的LDA結果
load("ldas_result.rdata")

topics = c(2,4,6,10,16)
data_frame(k = topics, perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")

Caption for the picture.

2.3 挑選最佳主題

試著比較4個與6個主題差異,可以得知topic=6其分類效果最好。

2.3.1 topic=4

#取出代表字詞(term)
# LDA分成4個主題
lda <- LDA(dtm, k =4 , control = list(seed = 1))

#移除常見共同字彙
removed_word = c("小孩","問題","不想","生小孩","孩子","生孩子","一定","一堆","有錢","沒錢","真的","五樓")  

# 看各群的常用詞彙
tidy(lda, matrix = "beta") %>% # 取出topic term beta值
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(12, beta) %>% # beta值前12的字
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

Caption for the picture.

2.3.2 topic=6

#取出代表字詞(term)
# LDA分成6個主題
lda <- LDA(dtm, k =6 , control = list(seed = 1))

#移除常見共同字彙
removed_word = c("小孩","問題","不想","生小孩","孩子","生孩子","一定","一堆","有錢","沒錢","真的","五樓")  

# 看各群的常用詞彙
tidy(lda, matrix = "beta") %>% # 取出topic term beta值
  filter(! term %in% removed_word) %>% 
  group_by(topic) %>%
  top_n(12, beta) %>% # beta值前12的字
  ungroup() %>%
  mutate(topic = as.factor(topic),
         term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = topic)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()

主題一:養兒防老
主題二:基因
主題三:不生小孩的討論
主題四:生小孩的年紀
主題五:生小孩的問題
主題六:房價與薪水跟生小孩之間的關係

Caption for the picture.

2.4 主題分類

# 在tidy function中使用參數"gamma"來取得 theta矩陣
topics <- tidy(lda, matrix="gamma") %>% # document topic gamma
                  group_by(document) %>%
                  top_n(1, wt=gamma)
topics
posts_topic <- merge(x = csvT, y = topics, by.x = "artUrl", by.y="document")
set.seed(1)

2.5 日期主題分布

posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>% 
  group_by(artDate,topic) %>%
  summarise(sum =sum(topic)) %>%
  ggplot()+
  geom_line(aes(artDate, sum,color=as.factor(topic))) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()

Caption for the picture.

2.6 以比例了解討論時間變化

可以看出生小孩大家比較多著重於討論薪水跟房價

require(RColorBrewer)
mycolors <- colorRampPalette(brewer.pal(8, "Set3"))(20)



posts_topic %>%
  mutate(artDate = as.Date(artDate)) %>%
  group_by(topic,artDate = format(artDate,'%Y%m')) %>%
  summarise(n=sum(topic)) %>%
  group_by(artDate) %>%
  mutate(total_value =sum(n))%>%
  ggplot(aes(x=artDate, y=n/total_value, fill=as.factor(topic))) +
  geom_bar(stat = "identity") + ylab("proportion") +
  scale_fill_manual(values=mycolors[c(1,5,8,12,14,16)],name="文章主題", labels=c("養兒防老","基因","不生小孩","年紀","生小孩的問題","房價與薪水")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Caption for the picture.

3 社群網路圖

# 文章和留言處理
reviews <- csvT %>%
      select(artUrl, cmtPoster, cmtStatus, cmtContent)

posts <- csvT %>%
      group_by(artUrl) %>%
      mutate(commentNum=n()) %>%
      select(artUrl, artPoster,commentNum,artDate) %>%
      unique()

posts_Reviews <- merge(x = posts, y = reviews, by = "artUrl")

# 把文章和topic
posts_Reviews <- merge(x = posts_Reviews, y = topics, by.x = "artUrl", by.y="document")
head(posts_Reviews,5)

3.1 取出 cmtPoster(回覆者)、artPoster(發文者)、artUrl(文章連結) 三個欄位

link <- posts_Reviews %>% select(cmtPoster, artPoster, artUrl)
head(link,3)

3.2 建立網路關係(沒有經過篩選)

reviewNetwork <- graph_from_data_frame(d=link, directed=T)
reviewNetwork
## IGRAPH 9285c61 DN-- 6941 17782 -- 
## + attr: name (v/c), artUrl (e/c)
## + edges from 9285c61 (vertex names):
##  [1] cheetahspeed->syearth james732    ->syearth globekiller ->syearth
##  [4] pacitic     ->syearth sai1268     ->syearth sai1268     ->syearth
##  [7] owo0204     ->syearth susaku      ->syearth nonightcat  ->syearth
## [10] nonightcat  ->syearth toyota2211  ->syearth toyota2211  ->syearth
## [13] geesegeese  ->syearth geesegeese  ->syearth geesegeese  ->syearth
## [16] cuteama     ->syearth lovealgebra ->syearth james732    ->syearth
## [19] yayaqaz     ->syearth jorden      ->syearth chun821543  ->ORK    
## [22] algebraic   ->ORK     rootpresent ->ORK     rootpresent ->ORK    
## + ... omitted several edges
plot(reviewNetwork)

plot(reviewNetwork, vertex.size=2, edge.arrow.size=.5,vertex.label=NA)

3.3 初階資料篩選

看一下留言數分布圖(方便後面篩選),
並找出最大留言數:max=639

max_reviews=max(posts$commentNum)
max_reviews
## [1] 693
posts %>%
ggplot(aes(x=commentNum)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

3.4 檢視參與人數

length(unique(posts_Reviews$artPoster)) # 發文者數量 353
## [1] 353

3.5 檢視回覆者數量

length(unique(posts_Reviews$cmtPoster)) # 回覆者數量 6735
## [1] 6735

3.6 檢視總參與人數

allPoster <- c(posts_Reviews$artPoster, posts_Reviews$cmtPoster)
# 總參與人數 6941
length(unique(allPoster))
## [1] 6941

#標記所有出現過得使用者

poster:只發過文、發過文+留過言
replyer:只留過言

userList <- data.frame(user=unique(allPoster)) %>%
              mutate(type=ifelse(user%in%posts$artPoster, "poster", "replyer"))
head(userList,3)

3.7 以最高回復量那天來分析(2020-12-12)來分析網路圖

link <- posts_Reviews  %>%
      group_by(cmtPoster, artUrl) %>%
      filter(n()>2) %>%
      filter(artDate > as.Date(posts_Reviews[posts_Reviews$commentNum == max_reviews,]$artDate[[1]])) %>%
      select(cmtPoster, artPoster, artUrl) %>%
      unique()
link
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))
head(filtered_user,3)
set.seed(487)
# v=filtered_user

reviewNetwork = degree(reviewNetwork) > 2
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)

3.8 加上nodes的顯示資訊

用使用者的身份來區分點的顏色
poster:gold(有發文)
replyer:lightblue(只有回覆文章)

set.seed(487)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")
plot(reviewNetwork, vertex.size=3, edge.arrow.size=0.3,vertex.label=NA)

filter_degree = 7 # 使用者degree

# 過濾留言者對發文者的推噓程度
link <- posts_Reviews %>%
      filter(commentNum > 20) %>%
      filter(cmtStatus!="→") %>%
      group_by(cmtPoster, artUrl) %>%
      filter( n() > 1) %>%
      ungroup() %>% 
      select(cmtPoster, artPoster, artUrl, cmtStatus) %>% 
      unique()

# 接下來把網路圖畫出來,跟前面做的事都一樣,因此不再細述

# 篩選link中有出現的使用者
filtered_user <- userList %>%
          filter(user%in%link$cmtPoster | user%in%link$artPoster) %>%
          arrange(desc(type))

# 建立網路關係
reviewNetwork <- graph_from_data_frame(d=link, v=filtered_user, directed=F)

# 依據使用者身份對點進行上色
labels <- degree(reviewNetwork)
V(reviewNetwork)$label <- names(labels)
V(reviewNetwork)$color <- ifelse(V(reviewNetwork)$type=="poster", "gold", "lightblue")


# 依據回覆發生的文章所對應的主題,對他們的關聯線進行上色
E(reviewNetwork)$color <- ifelse(E(reviewNetwork)$cmtStatus == "推", "lightgreen", "palevioletred")

# 畫出社群網路圖
set.seed(5432)
plot(reviewNetwork, vertex.size=2, edge.width=3, vertex.label.dist=1,
     vertex.label=ifelse(degree(reviewNetwork) > filter_degree, V(reviewNetwork)$label, NA),vertex.label.font=2)

# 加入標示
legend("bottomright", c("發文者","回文者"), pch=21,
  col="#777777", pt.bg=c("gold","lightblue"), pt.cex=1, cex=1)
legend("topleft", c("推","噓"), 
       col=c("lightgreen","palevioletred"), lty=1, cex=1)

# networkD3

#install.packages("networkD3")
library(networkD3)
links = link
nodes = filtered_user
nodes$id = 0:(length(nodes$user) - 1)

# 整理資料格式
nodes_complete <- data.frame(nodeID = unique(c(links$cmtPoster, links$artPoster)))
nodes_complete$group <- nodes$type[match(nodes_complete$nodeID, nodes$user)]
links$source <- match(links$cmtPoster, nodes_complete$nodeID) - 1
links$target <- match(links$artPoster, nodes_complete$nodeID) - 1

# 畫圖
library(networkD3)
forceNetwork(Links = links, Nodes = nodes_complete, Source = "source", 
             Target = "target", NodeID = "nodeID", Group = "group", 
             opacity = 0.8, fontSize = 10, zoom = TRUE,legend = TRUE, opacityNoHover = TRUE,
             
             colourScale = "d3.scaleOrdinal(d3.schemeCategory10);",
             linkColour = ifelse(links$cmtStatus == "推", "red","green")  # 設定推噓顏色
             )
## Links is a tbl_df. Converting to a plain data frame.

4 結論

其中我們覺得不生小孩的原因是房價、薪水及養兒防老,然後申論下去後,才發現不單單只有這些原因,包含社會的風氣、政府的作為、基因的遺傳、體力的消耗、生男生女及住不住月中都是不生小孩的幾個大原因。

因此政府若想要提高生育率的話,除了經濟面考量外,像是呼籲企業加薪、提高基本工資,或者透過輔導產業轉型,提高薪酬水準;其他部份也必須從社會福利、社會安全層面著手,以及建立更完善的托育機制。