Topic: Why are people afraid to get married?

Motivation and purpose: The US Central Intelligence Agency (CIA) published its 2021 global fertility projections, in which the bottom five countries are all in Asia: Hong Kong, Macau, Singapore, South Korea, and Taiwan, with Taiwan ranked last among all 227 countries surveyed. This striking result is a reminder that the problem of low birth rates remains unsolved. When young people are reluctant to marry and have children, national competitiveness declines. We therefore use posts by PTT users, who tend to have broad life experience and sufficient economic means, to understand how young people currently think about having children, analyze that information, and identify causes that could help the government improve the social environment.

Research questions: By analyzing posts from these communities, we hope to answer three questions: 1. Why is the current fertility rate so low? 2. What are the reasons for not getting married? 3. What kind of support would raise the fertility rate?

Basic installation

# echo = T,results = 'hide'
packages = c("dplyr", "tidytext", "stringr", "wordcloud2", "ggplot2", "readr", "data.table", "reshape2", "wordcloud", "tidyr", "scales", "jiebaR", "NLP", "ggraph", "igraph", "widyr") # covers every package loaded below
existing = as.character(installed.packages()[,1])
for(pkg in packages[!(packages %in% existing)]) install.packages(pkg)

Loading packages

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(jiebaR)
library(NLP)
library(ggraph)
library(igraph)
library(widyr)

Data description

  • Data source: the PTT GetMarry (結婚), marriage (婚姻), and WomenTalk (女版) boards
  • Data set: articles posted from 2020/01/01 to 2020/12/31 (the date-range check further below shows that some collected articles actually extend into 2021)
# Read the data for each board
getMarry <- fread('./data/GetmarryArticleMetaData.csv',encoding = 'UTF-8')
women <- fread('./data/WomenArticleMetaData.csv',encoding = 'UTF-8')
marriage <- fread('./data/MarriageArticleMetaData.csv',encoding = 'UTF-8')

print(paste('The getMarry board has', nrow(getMarry), 'articles'))
## [1] "The getMarry board has 206 articles"
print(paste('The women board has', nrow(women), 'articles'))
## [1] "The women board has 387 articles"
print(paste('The marriage board has', nrow(marriage), 'articles'))
## [1] "The marriage board has 481 articles"
# Filter out GetMarry articles whose artTitle contains 廣宣 (ads), 交易 (trades),
# 請益 (questions), 贈送 (giveaways), 推薦 (recommendations), or 分享 (sharing)
keywords <- c('廣宣','交易','請益','贈送','推薦','分享')
toMatch <- paste(keywords, collapse = "|")
getMarry <- getMarry[!grepl(toMatch, artTitle)]

# Add a category column
getMarry$artCat <- "GetMarry"

# Print the filtered count
print(paste('After filtering out unrelated titles, the getMarry board has', nrow(getMarry), 'articles'))
## [1] "After filtering out unrelated titles, the getMarry board has 32 articles"

Data preprocessing: first-pass tokenization and stop-word removal

  • To analyze post content, we first tokenize with a dedicated dictionary and remove stop words
# Create the tokenizer with a stop-word list
jieba_tokenizer <- worker(stop_word = "./dict/stop_words.txt")

# Define the tokenization function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
  • Unnest the tokens into a new word column, drop tokens made up only of English letters, and drop single-character tokens
# unnest_tokens: output = word, input = sentence, tokenizer = customized_tokenizer
# Drop all-English tokens and single-character tokens

getMarryToken <- getMarry %>% 
  unnest_tokens(word, sentence, token=customized_tokenizer) %>%
  filter(str_detect(word,regex("[^a-zA-Z]"))) %>%
  filter(nchar(.$word)>1)

womenToken <- women %>% 
  unnest_tokens(word, sentence, token=customized_tokenizer) %>%
  filter(str_detect(word,regex("[^a-zA-Z]"))) %>%
  filter(nchar(.$word)>1)

marriageToken <- marriage %>% 
  unnest_tokens(word, sentence, token=customized_tokenizer) %>%
  filter(str_detect(word,regex("[^a-zA-Z]"))) %>%
  filter(nchar(.$word)>1)

# Combine the three boards into one table
data <- rbind(getMarryToken[, c("artDate", "artUrl", "artCat", "word")],
              womenToken[, c("artDate", "artUrl", "artCat", "word")],
              marriageToken[, c("artDate", "artUrl", "artCat", "word")])
  • Convert the date format
# Format the date column
data$artDate <- data$artDate %>% as.Date("%Y/%m/%d")
head(data)
##       artDate                                                  artUrl   artCat
## 1: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 2: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 3: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 4: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 5: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 6: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
##        word
## 1:     明天
## 2:     結婚
## 3: 忐忑不安
## 4:     好怕
## 5:     環節
## 6:     會出

Word cloud

  • Count the most frequent tokens and draw a word cloud from them
# nchar: count the number of characters in each token
# sum: total number of occurrences of each word
tokens_count <- data %>% 
  filter(nchar(.$word)>1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum>50) %>%
  arrange(desc(sum))
tokens_count %>% wordcloud2()
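
wordcloud2 renders an interactive HTML widget. As an alternative when a static image is needed, the wordcloud package loaded above can draw the same counts; a minimal sketch (not part of the original analysis, assuming the "Heiti TC Light" font used elsewhere in this report is available):

# Hedged sketch: static word cloud from the same counts
par(family = "Heiti TC Light")  # Chinese-capable font, same as in the network graphs below
wordcloud(words = tokens_count$word, freq = tokens_count$sum,
          min.freq = 50, random.order = FALSE)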

TF-IDF

# Count how many times each word appears in each article
data_words <- data %>%
  count(artUrl, word, sort = TRUE)
data_words
##                                                          artUrl word  n
##     1:  https://www.ptt.cc/bbs/marriage/M.1587963172.A.332.html 媽媽 32
##     2:  https://www.ptt.cc/bbs/marriage/M.1588194512.A.962.html 老公 31
##     3:  https://www.ptt.cc/bbs/marriage/M.1592541632.A.8F9.html 小孩 28
##     4:  https://www.ptt.cc/bbs/marriage/M.1579685584.A.C8F.html 男友 27
##     5:  https://www.ptt.cc/bbs/marriage/M.1593592497.A.942.html 孩子 26
##    ---                                                                 
## 80053: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 以後  1
## 80054: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 孕育  1
## 80055: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 找個  1
## 80056: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 知道  1
## 80057: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 周遭  1
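
The counts above are raw term frequencies. Since this section is about TF-IDF, the weighting step can be added with tidytext's bind_tf_idf, treating each artUrl as a document; a minimal sketch (not part of the original code):

# Hedged sketch: compute tf-idf from the per-article word counts
data_tfidf <- data_words %>%
  bind_tf_idf(word, artUrl, n) %>%
  arrange(desc(tf_idf))
head(data_tfidf)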

Monthly changes in positive and negative sentiment [reusing code from Week 5: sentiment analysis of the PTT and Dcard salmon-renaming craze (鮭魚之亂)]

Reading in the LIWC dictionaries

P <- read_file('dict/liwc/positive.txt') # positive-word dictionary (txt)
N <- read_file('dict/liwc/negative.txt') # negative-word dictionary (txt)

Split the word strings and combine the two sentiment dictionaries

# Split the strings on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# Build data frames with two columns, word and sentiment; word holds the dictionary terms
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words

# Combine the two dictionaries
LIWC = rbind(P, N)


# Line chart of daily token counts (overall posting volume)
data$artDate = data$artDate %>% as.Date("%Y/%m/%d")
 data %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot()+
    geom_line(aes(x=artDate,y=count))+
    scale_x_date(labels = date_format("%m/%d"))

Computing the daily sentiment totals (sentiment_count)

# sentiment_count:artDate,sentiment,count
sentiment_count = data %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=n())  
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

Line chart of daily positive and negative sentiment counts

# Check the date range of the data (it extends into 2021, beyond the stated 2020 collection window)
range(sentiment_count$artDate)
## [1] "2020-01-01" "2021-05-03"
sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2020-01-01','2020-12-29'))
               )
## Warning: Removed 144 row(s) containing missing values (geom_path).

  # Optionally mark a specific date with a vertical line, e.g.:
  # geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-18'))[1]])), colour = "red")

Line chart of the positive/negative sentiment ratio (the lines are crowded together)

sentiment_count %>% 
  # normalize: compute each sentiment's share of the day's total
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # plot
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2020-01-01','2020-12-29'))
               )
## Warning: Removed 144 row(s) containing missing values (geom_path).

  # Optionally mark a specific date with a vertical line, e.g.:
  # geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-18'))[1]])), colour = "red")
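
The section heading refers to monthly sentiment changes, while the charts above are daily. A minimal sketch of a monthly aggregation that reuses sentiment_count (not part of the original code):

# Hedged sketch: aggregate the daily sentiment counts to calendar months
sentiment_count %>%
  mutate(month = format(artDate, "%Y-%m")) %>%
  group_by(month, sentiment) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  ggplot(aes(x = month, y = count, fill = sentiment)) +
  geom_col(position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))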

Using n-grams to help build a custom dictionary (reusing the code from the word-relationship analysis of reservoir-related articles on the PTT Gossiping board)

jieba_tokenizer = worker()

# bigram tokenizer used by unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
getMarryBigram <- getMarry %>% 
  unnest_tokens(bigram, sentence, token = jieba_bigram) 

womenBigram <- women %>% 
  unnest_tokens(bigram, sentence, token = jieba_bigram) 

marriageBigram <- marriage %>% 
  unnest_tokens(bigram, sentence, token = jieba_bigram) 


# Combine the three boards into one table
data_bigram <- rbind(getMarryBigram[, c("artDate", "artUrl", "artCat", "bigram")],
                     womenBigram[, c("artDate", "artUrl", "artCat", "bigram")],
                     marriageBigram[, c("artDate", "artUrl", "artCat", "bigram")])

Counting the most frequent bigram combinations

# Remove bigrams containing English letters or digits
# Count how often each combination occurs
data_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  count(bigram, sort = TRUE)
##          bigram   n
##      1:   也 是 410
##      2:   都 是 368
##      3:   的 人 360
##      4:   我 的 323
##      5: 自己 的 322
##     ---            
## 134542: ꀊ 然後   1
## 134543: ꀊ 相處   1
## 134544: ꀊ 也許   1
## 134545: ꀊ 以前   1
## 134546:   ꀊ 原   1

Trigram

jieba_trigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      ngram<- ngrams(unlist(tokens), 3)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}
getMarryTrigram <- getMarry %>% 
  unnest_tokens(trigram, sentence, token = jieba_trigram) 

womenTrigram <- women %>% 
  unnest_tokens(trigram, sentence, token = jieba_trigram) 

marriageTrigram <- marriage %>% 
  unnest_tokens(trigram, sentence, token = jieba_trigram) 


# Combine the three boards into one table
data_trigram <- rbind(getMarryTrigram[, c("artDate", "artUrl", "artCat", "trigram")],
                      womenTrigram[, c("artDate", "artUrl", "artCat", "trigram")],
                      marriageTrigram[, c("artDate", "artUrl", "artCat", "trigram")])
data_trigram %>%
  filter(!str_detect(trigram, regex("[0-9a-zA-Z]"))) %>%
  count(trigram, sort = TRUE)
##                trigram  n
##      1:       跟 我 說 49
##      2:       是 為 了 40
##      3: 同意 記者 抄文 38
##      4: 是否 同意 記者 37
##      5:     為 什麼 要 37
##     ---                  
## 204199:       ꀊ 我 的  1
## 204200:     ꀊ 我 婆婆  1
## 204201:   ꀊ 相處 長年  1
## 204202:     ꀊ 也許 是  1
## 204203:     ꀊ 以前 我  1

Both the bigrams and trigrams contain stop words, so we build a stop-word dictionary

# load stop words
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')

Removing stop words from bigrams

data_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite_("bigram", c("word1","word2"), sep=" ")
##         bigram  n
##     1: 兩個 人 91
##     2: 這件 事 78
##     3: 妳 老公 62
##     4: 結婚 後 52
##     5: 比較 好 49
##    ---           
## 53826: ꀊ 看到  1
## 53827: ꀊ 相處  1
## 53828: ꀊ 也許  1
## 53829: ꀊ 以前  1
## 53830:   ꀊ 原  1

Removing stop words from trigrams

data_trigram %>%
  filter(!str_detect(trigram, regex("[0-9a-zA-Z]"))) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite_("trigram", c("word1", "word2", "word3"), sep=" ")
##               trigram  n
##     1: 同意 記者 抄文 38
##     2: 是否 同意 記者 37
##     3:     控制 碼 請 10
##     4:   顏色 控制 碼 10
##     5:   當時 父母 談  9
##    ---                  
## 32883:   ㄧ 時 逃不了  1
## 32884:     ㄧ 體 那為  1
## 32885:     ㄧ 直 以來  1
## 32886:       ㄨ 服 ㄒ  1
## 32887:   ꀊ 相處 長年  1

Using a custom dictionary

# load marry_lexicon
marry_lexicon <- scan(file = "./dict/marry_lexicon.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8',quiet = T)
# marriage-related dictionary
marry_lexicon
##  [1] "情緒勒索"     "做家事"       "原生家庭"     "月子中心"     "是否同意"    
##  [6] "記者抄文"     "雙方父母"     "雙方家長"     "一起生活"     "經濟壓力"    
## [11] "倒垃圾"       "登記結婚"     "婚姻諮商"     "請保姆"       "恕刪"        
## [16] "掃地機器人"   "喜歡小孩"     "玩手機"       "生活重心"     "肺炎疫情"    
## [21] "辦婚禮"       "公婆家"       "黑人問號"     "解決方法"     "辰亦儒"      
## [26] "放汽座"       "男神蘇志燮"   "武漢肺炎疫情" "收禮金"       "吃人夠夠"

Building the tokenizer with the new dictionary

jieba_tokenizer = worker()

# Add the custom dictionary so we can re-tokenize with it
new_user_word(jieba_tokenizer, c(marry_lexicon))
## [1] TRUE
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # drop single-character tokens
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}
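
As a quick sanity check (an illustrative sentence of our own, not drawn from the data), custom entries such as 月子中心 and 經濟壓力 should now come out of the tokenizer as single tokens:

# Hedged check: user-dictionary words should stay whole after new_user_word
segment("婚後的經濟壓力和月子中心費用", jieba_tokenizer)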

The earlier tokenization did not use the newly added dictionary, so we re-tokenize and then recount how often each word appears in each article.

# Re-tokenize the word column of data
marry_new_words <- data%>%
  unnest_tokens(word, word, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
marry_new_words
##                                                          artUrl word  n
##     1:  https://www.ptt.cc/bbs/marriage/M.1587963172.A.332.html 媽媽 32
##     2:  https://www.ptt.cc/bbs/marriage/M.1588194512.A.962.html 老公 31
##     3:  https://www.ptt.cc/bbs/marriage/M.1592541632.A.8F9.html 小孩 28
##     4:  https://www.ptt.cc/bbs/marriage/M.1579685584.A.C8F.html 男友 27
##     5:  https://www.ptt.cc/bbs/marriage/M.1593592497.A.942.html 孩子 26
##    ---                                                                 
## 77738: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 以後  1
## 77739: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 孕育  1
## 77740: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 找個  1
## 77741: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 知道  1
## 77742: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 周遭  1

Counting how often two words appear in the same article

word_pairs <- marry_new_words %>%
  pairwise_count(word, artUrl, sort = TRUE) 
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_pairs
## # A tibble: 8,135,048 x 3
##    item1 item2     n
##    <chr> <chr> <dbl>
##  1 覺得  結婚    293
##  2 結婚  覺得    293
##  3 真的  結婚    252
##  4 結婚  真的    252
##  5 真的  覺得    231
##  6 覺得  真的    231
##  7 知道  結婚    220
##  8 結婚  知道    220
##  9 現在  結婚    219
## 10 結婚  現在    219
## # … with 8,135,038 more rows

Computing the correlation between pairs of words

word_cors <- marry_new_words %>%
  group_by(word) %>%
  filter(n() >= 10) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

word_cors
## # A tibble: 1,981,056 x 3
##    item1  item2  correlation
##    <chr>  <chr>        <dbl>
##  1 抄文   記者         0.885
##  2 記者   抄文         0.885
##  3 刪掉   顏色         0.846
##  4 顏色   刪掉         0.846
##  5 來家裡 一套         0.819
##  6 一套   來家裡       0.819
##  7 看板   作者         0.798
##  8 作者   看板         0.798
##  9 來家裡 並說         0.798
## 10 隨意   並說         0.798
## # … with 1,981,046 more rows
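
To tie the correlations back to the research questions, we can look at which words are most strongly correlated with a query term. A minimal sketch (not part of the original code), using 結婚 as the query:

# Hedged sketch: the ten words most correlated with 結婚
word_cors %>%
  filter(item1 == "結婚") %>%
  arrange(desc(correlation)) %>%
  head(10)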

Word-network graph of pairs with correlation above 0.5

set.seed(2020)
word_cors %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so labels render correctly
  theme_void()

Word-network graph of pairs with correlation above 0.6

set.seed(2020)

word_cors %>%
  filter(correlation > 0.6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()

Removing unwanted words

# Choose a few seed words
seed_words <- c("記者", "抄文")
# Set the threshold to 0.6
threshold <- 0.6
# Words whose correlation with a seed word exceeds the threshold are added to the removal list
remove_words <- word_cors %>%
                filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
                .$item1 %>%
                unique()
remove_words
## [1] "抄文" "記者" "同意"

Word-network graph of pairs with correlation above 0.6, after removing these words

# Remove pairs that contain any of these words
word_cors_new <- word_cors %>%
                filter(!(item1 %in% remove_words|item2 %in% remove_words))

word_cors_new %>%
  filter(correlation > 0.6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()

# Build word correlations by month
# (assumes data_m is the raw article data split into a list of 12 monthly subsets)
mnw <- list()
for (x in 1:12) {
  mnw[[x]] <- data_m[[x]] %>%
    unnest_tokens(word, sentence, token = chi_tokenizer) %>%
    filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
    count(artUrl, word, sort = TRUE) %>%
    group_by(word) %>%
    filter(n() >= 10) %>%
    pairwise_cor(word, artUrl, sort = TRUE)
}

# Build the plotting function: month number x, correlation threshold y
corg <- function(x, y = 0.1) {
  mnw[[x]] %>%
    filter(correlation > y) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
    geom_node_point(color = "lightblue", size = 3) +
    geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
    theme_void()
}

# Input the month number (and threshold) to plot, e.g. June at threshold 0.5
corg(6, 0.5)