Topic: Why are people afraid to get married?

Motivation and purpose: In 2021 the U.S. Central Intelligence Agency (CIA) published its global fertility projections, and the five lowest-ranked places were all in Asia: Hong Kong, Macau, Singapore, South Korea, and Taiwan. Taiwan ranked last among the 227 countries and territories covered, a striking reminder that the falling birth rate remains an unsolved problem. When young people are reluctant to marry and have children, national competitiveness declines. We therefore use posts by PTT users, who have broad life experience and sufficient financial means, to understand how young people currently feel about having children, analyze that information, and identify the underlying causes so the government can improve the social environment.

Load packages

library(dplyr)
library(stringr)
library(tidytext)
library(wordcloud2)
library(data.table)
library(ggplot2)
library(reshape2)
library(wordcloud)
library(tidyr)
library(readr)
library(scales)
library(jiebaR)
library(NLP)
library(ggraph)
library(igraph)
library(widyr)

Data overview

  • Data source: PTT's GetMarry (結婚), marriage (婚姻), and WomenTalk (女版) boards
  • Data range: 2020/01/01 to 2020/12/31
# Read each board's data
getMarry <- fread('./data/2020GetmarryArticleMetaData.csv',encoding = 'UTF-8')
women <- fread('./data/2020WomenArticleMetaData.csv',encoding = 'UTF-8')
marriage <- fread('./data/2020MarriageArticleMetaData.csv',encoding = 'UTF-8')

print(paste('The getMarry board has', nrow(getMarry), 'posts'))
## [1] "The getMarry board has 162 posts"
print(paste('The women board has', nrow(women), 'posts'))
## [1] "The women board has 308 posts"
print(paste('The marriage board has', nrow(marriage), 'posts'))
## [1] "The marriage board has 316 posts"
# Drop GetMarry posts whose artTitle contains [廣宣], [交易], [請益], [贈送], [推薦], or [分享]
keywords <- c('廣宣','交易','請益','贈送','推薦','分享')
toMatch <- paste(keywords, collapse = "|")
getMarry <- getMarry[!grepl(toMatch, artTitle)]

# Add a category column
getMarry$artCat <- "GetMarry"

# Print the result
print(paste('After filtering out irrelevant titles, the getMarry board has', nrow(getMarry), 'posts'))
## [1] "After filtering out irrelevant titles, the getMarry board has 31 posts"

Data preprocessing: first-pass tokenization and stop-word removal

  • To analyze the post content, we first tokenize it and remove stop words using a stop-word list (a quick check of the tokenizer follows the code below)
# Build a jieba tokenizer that applies the stop-word list
jieba_tokenizer <- worker(stop_word = "./dict/stop_words.txt")

# Define the tokenization function
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
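
A quick sanity check of the tokenizer, sketched here purely as an illustration (the exact segmentation depends on jieba's built-in dictionary and the stop-word list): it takes a character vector and returns a list of character vectors, which is the interface unnest_tokens expects.

# Illustrative call on a short made-up sentence; output will vary with the dictionary
customized_tokenizer("明天要去登記結婚")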
  • Put the tokens into a new word column and drop purely English tokens
# unnest_tokens: output = word, input = article text, tokenizer = customized_tokenizer
# Drop English-only tokens and single-character tokens

getMarryToken <- getMarry %>% 
  unnest_tokens(word, artContent, token = customized_tokenizer) %>%
  filter(str_detect(word, regex("[^a-zA-Z]"))) %>%
  filter(nchar(word) > 1)

womenToken <- women %>% 
  unnest_tokens(word, sentence, token = customized_tokenizer) %>%
  filter(str_detect(word, regex("[^a-zA-Z]"))) %>%
  filter(nchar(word) > 1)

marriageToken <- marriage %>% 
  unnest_tokens(word, sentence, token = customized_tokenizer) %>%
  filter(str_detect(word, regex("[^a-zA-Z]"))) %>%
  filter(nchar(word) > 1)

# Combine the three boards
data <- rbind(getMarryToken[, c("artDate","artUrl","artCat","word")],
              womenToken[, c("artDate","artUrl","artCat","word")],
              marriageToken[, c("artDate","artUrl","artCat","word")])
  • Convert the date format
# Format the dates
data$artDate <- data$artDate %>% as.Date("%Y/%m/%d")
head(data)
##       artDate                                                  artUrl   artCat
## 1: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 2: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 3: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 4: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 5: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 6: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
##        word
## 1:     明天
## 2:     結婚
## 3: 忐忑不安
## 4:     好怕
## 5:     環節
## 6:     會出

Word cloud

  • Count the most frequent tokens and draw a word cloud
# nchar: count the number of characters in each token
# sum: total occurrences of each word
tokens_count <- data %>% 
  filter(nchar(word) > 1) %>%
  group_by(word) %>% 
  summarise(sum = n()) %>% 
  filter(sum > 50) %>%
  arrange(desc(sum))
tokens_count %>% wordcloud2()

TF-IDF

# Count how often each word appears in each article
data_words <- data %>%
  count(artUrl, word, sort = TRUE)
data_words
##                                                          artUrl word  n
##     1:  https://www.ptt.cc/bbs/marriage/M.1587963172.A.332.html 媽媽 32
##     2:  https://www.ptt.cc/bbs/marriage/M.1588194512.A.962.html 老公 31
##     3:  https://www.ptt.cc/bbs/marriage/M.1592541632.A.8F9.html 小孩 28
##     4:  https://www.ptt.cc/bbs/marriage/M.1579685584.A.C8F.html 男友 27
##     5:  https://www.ptt.cc/bbs/marriage/M.1593592497.A.942.html 孩子 26
##    ---                                                                 
## 57346: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 原因  1
## 57347: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 直接  1
## 57348: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 職稱  1
## 57349: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 抓到  1
## 57350: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 最近  1

Monthly positive and negative sentiment

Load the LIWC dictionaries

P <- read_file('dict/liwc/positive.txt') # positive-word dictionary (txt)
N <- read_file('dict/liwc/negative.txt') # negative-word dictionary (txt)

Split the word lists and combine the two sentiment dictionaries

# Split each string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]

# Build data frames with two columns, word and sentiment; word holds the dictionary terms
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words

# Combine the two dictionaries
LIWC = rbind(P, N)


# Line chart of daily token counts (overall discussion volume)
data %>%
  group_by(artDate) %>%
  summarise(count = n()) %>%
  ggplot() +
    geom_line(aes(x = artDate, y = count)) +
    scale_x_date(labels = date_format("%m/%d"))

Compute daily sentiment totals (sentiment_count)

# sentiment_count:artDate,sentiment,count
sentiment_count = data %>%
  select(artDate,word) %>%
  inner_join(LIWC) %>% 
  group_by(artDate,sentiment) %>%
  summarise(count=n())  
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.

Line chart of daily positive/negative sentiment counts

# Check the date range of the data
range(sentiment_count$artDate) #"2020-01-01" "2020-12-29"
## [1] "2020-01-01" "2020-12-29"
sentiment_count %>%
  ggplot()+
  geom_line(aes(x=artDate,y=count,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2020-01-01','2020-12-29'))
               )


Line chart of positive/negative sentiment ratios

sentiment_count %>% 
  # Normalize: each sentiment's share of that day's total
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # Plot
  ggplot()+
  geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2020-01-01','2020-12-29'))
               )

  # 加上標示日期的線
 #geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-18'))
#[1]])),colour = "red")
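Since this section is about month-by-month changes, the daily counts can also be rolled up to months. The sketch below is an illustration only; it derives the month with base R's format() so no extra packages are assumed.

# A sketch: aggregate the daily sentiment counts to months and plot the
# monthly positive/negative ratio (assumes sentiment_count from above)
sentiment_count %>%
  mutate(month = format(artDate, "%Y-%m")) %>%
  group_by(month, sentiment) %>%
  summarise(count = sum(count), .groups = "drop") %>%
  group_by(month) %>%
  mutate(ratio = count / sum(count)) %>%
  ggplot() +
  geom_col(aes(x = month, y = ratio, fill = sentiment), position = "dodge") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))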

Using n-grams to help build a custom dictionary

jieba_tokenizer = worker()

# Bigram tokenizer used with unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      bigram<- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
getMarryBigram <- getMarry %>% 
  unnest_tokens(bigram, artContent, token = jieba_bigram) 

womenBigram <- women %>% 
  unnest_tokens(bigram, sentence, token = jieba_bigram) 

marriageBigram <- marriage %>% 
  unnest_tokens(bigram, sentence, token = jieba_bigram) 


# Combine the three boards
data_bigram <- rbind(getMarryBigram[, c("artDate","artUrl","artCat","bigram")],
                     womenBigram[, c("artDate","artUrl","artCat","bigram")],
                     marriageBigram[, c("artDate","artUrl","artCat","bigram")])

Most frequent bigram combinations

# Drop bigrams that contain English letters or digits
# Count how often each bigram appears
data_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  count(bigram, sort = TRUE)
##          bigram   n
##      1:   也 是 306
##      2:   都 是 275
##      3:   的 人 265
##      4: 自己 的 252
##      5:   我 的 243
##     ---            
## 101661: ꀊ 然後   1
## 101662: ꀊ 相處   1
## 101663: ꀊ 也許   1
## 101664: ꀊ 以前   1
## 101665:   ꀊ 原   1

Trigram

jieba_trigram <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      ngram<- ngrams(unlist(tokens), 3)
      ngram <- lapply(ngram, paste, collapse = " ")
      unlist(ngram)
    }
  })
}
getMarryTrigram <- getMarry %>% 
  unnest_tokens(trigram, artContent, token = jieba_trigram) 

womenTrigram <- women %>% 
  unnest_tokens(trigram, sentence, token = jieba_trigram) 

marriageTrigram <- marriage %>% 
  unnest_tokens(trigram, sentence, token = jieba_trigram) 


# Combine the three boards
data_trigram <- rbind(getMarryTrigram[, c("artDate","artUrl","artCat","trigram")],
                      womenTrigram[, c("artDate","artUrl","artCat","trigram")],
                      marriageTrigram[, c("artDate","artUrl","artCat","trigram")])
data_trigram %>%
  filter(!str_detect(trigram, regex("[0-9a-zA-Z]"))) %>%
  count(trigram, sort = TRUE)
##                trigram  n
##      1:       跟 我 說 44
##      2: 同意 記者 抄文 32
##      3: 是否 同意 記者 31
##      4:       是 為 了 30
##      5:     為 什麼 要 28
##     ---                  
## 150206:       ꀊ 我 的  1
## 150207:     ꀊ 我 婆婆  1
## 150208:   ꀊ 相處 長年  1
## 150209:     ꀊ 也許 是  1
## 150210:     ꀊ 以前 我  1

Both the bigram and trigram results contain stop words, so we build a stop-word dictionary

# load stop words
stop_words <- scan(file = "./dict/stop_words.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8')

Removing stop words from bigrams

data_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite(bigram, word1, word2, sep = " ")
##         bigram  n
##     1: 兩個 人 65
##     2: 這件 事 53
##     3: 妳 老公 47
##     4: 做 家事 41
##     5: 結婚 後 38
##    ---           
## 39354: ꀊ 看到  1
## 39355: ꀊ 相處  1
## 39356: ꀊ 也許  1
## 39357: ꀊ 以前  1
## 39358:   ꀊ 原  1

Removing stop words from trigrams

data_trigram %>%
  filter(!str_detect(trigram, regex("[0-9a-zA-Z]"))) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>% 
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite(trigram, word1, word2, word3, sep = " ")
##               trigram  n
##     1: 同意 記者 抄文 32
##     2: 是否 同意 記者 31
##     3:     控制 碼 請  6
##     4:   顏色 控制 碼  6
##     5:     原文 恕 刪  6
##    ---                  
## 23454:   ㄧ 時 逃不了  1
## 23455:     ㄧ 體 那為  1
## 23456:     ㄧ 直 以來  1
## 23457:       ㄨ 服 ㄒ  1
## 23458:   ꀊ 相處 長年  1
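
The trigram output above still contains stray symbols (e.g. ꀊ) and lone zhuyin letters. One possible cleanup, sketched here as an extra step rather than part of the original pipeline, is to keep only n-grams made up entirely of Han characters and spaces:

# A sketch: keep only trigrams consisting solely of Han characters and spaces,
# which drops stray control characters and zhuyin letters
data_trigram %>%
  filter(str_detect(trigram, regex("^[\\p{Han} ]+$"))) %>%
  count(trigram, sort = TRUE)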

Using a custom dictionary

# load marry_lexicon
marry_lexicon <- scan(file = "./dict/marry_lexicon.txt", what=character(),sep='\n', 
                   encoding='utf-8',fileEncoding='utf-8',quiet = T)
# Marriage-related lexicon
marry_lexicon
##  [1] "情緒勒索"     "做家事"       "原生家庭"     "月子中心"     "是否同意"    
##  [6] "記者抄文"     "雙方父母"     "雙方家長"     "一起生活"     "經濟壓力"    
## [11] "倒垃圾"       "登記結婚"     "婚姻諮商"     "請保姆"       "恕刪"        
## [16] "掃地機器人"   "喜歡小孩"     "玩手機"       "生活重心"     "肺炎疫情"    
## [21] "辦婚禮"       "公婆家"       "黑人問號"     "解決方法"     "辰亦儒"      
## [26] "放汽座"       "男神蘇志燮"   "武漢肺炎疫情" "收禮金"       "吃人夠夠"    
## [31] "是否同意記者" "顏色控制碼"

Building a tokenizer with the new dictionary

jieba_tokenizer = worker()

# Add the custom lexicon so it is used when re-tokenizing
new_user_word(jieba_tokenizer, c(marry_lexicon))
## [1] TRUE
chi_tokenizer <- function(t) {
  lapply(t, function(x) {
    if(nchar(x)>1){
      tokens <- segment(x, jieba_tokenizer)
      tokens <- tokens[!tokens %in% stop_words]
      # Drop single-character tokens
      tokens <- tokens[nchar(tokens)>1]
      return(tokens)
    }
  })
}

The earlier tokenization did not use the new lexicon, so we re-tokenize and then recount how often each word appears in each article.

# Re-tokenize the word column of data with the new tokenizer
marry_new_words <- data%>%
  unnest_tokens(word, word, token=chi_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(artUrl, word, sort = TRUE)
marry_new_words
##                                                          artUrl word  n
##     1:  https://www.ptt.cc/bbs/marriage/M.1587963172.A.332.html 媽媽 32
##     2:  https://www.ptt.cc/bbs/marriage/M.1588194512.A.962.html 老公 31
##     3:  https://www.ptt.cc/bbs/marriage/M.1592541632.A.8F9.html 小孩 28
##     4:  https://www.ptt.cc/bbs/marriage/M.1579685584.A.C8F.html 男友 27
##     5:  https://www.ptt.cc/bbs/marriage/M.1593592497.A.942.html 孩子 26
##    ---                                                                 
## 55895: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 原因  1
## 55896: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 直接  1
## 55897: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 職稱  1
## 55898: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 抓到  1
## 55899: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 最近  1

Counting how often two words appear in the same article

word_pairs <- marry_new_words %>%
  pairwise_count(word, artUrl, sort = TRUE) 
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
word_pairs
## # A tibble: 5,918,178 x 3
##    item1 item2     n
##    <chr> <chr> <dbl>
##  1 覺得  結婚    229
##  2 結婚  覺得    229
##  3 真的  結婚    198
##  4 結婚  真的    198
##  5 真的  覺得    174
##  6 覺得  真的    174
##  7 現在  結婚    171
##  8 結婚  現在    171
##  9 知道  結婚    163
## 10 結婚  知道    163
## # … with 5,918,168 more rows

Computing the correlation between word pairs

word_cors <- marry_new_words %>%
  group_by(word) %>%
  filter(n() >= 10) %>%
  pairwise_cor(word, artUrl, sort = TRUE)

word_cors
## # A tibble: 1,009,020 x 3
##    item1 item2 correlation
##    <chr> <chr>       <dbl>
##  1 抄文  記者        0.901
##  2 記者  抄文        0.901
##  3 抄文  同意        0.701
##  4 同意  抄文        0.701
##  5 產後  憂鬱        0.691
##  6 憂鬱  產後        0.691
##  7 記者  同意        0.668
##  8 同意  記者        0.668
##  9 祖先  屬虎        0.633
## 10 屬虎  祖先        0.633
## # … with 1,009,010 more rows

Word-network graph of pairs with correlation above 0.5

set.seed(2020)
word_cors %>%
  filter(correlation > 0.5) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so the labels render correctly
  theme_void()

Word-network graph of pairs with correlation above 0.6

set.seed(2020)

word_cors %>%
  filter(correlation > 0.6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()

Removing unwanted words

# Choose a few seed words
seed_words <- c("記者", "抄文")
# Set the threshold to 0.6
threshold <- 0.6
# Words whose correlation with a seed word exceeds the threshold are added to the removal list
remove_words <- word_cors %>%
                filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
                .$item1 %>%
                unique()
remove_words
## [1] "抄文" "記者" "同意" "是否"

Word-network graph of pairs with correlation above 0.6; because each month's discussion volume is limited, the monthly graphs below use thresholds ranging from 0.1 to 0.6

# Drop pairs that contain any of the removed words
word_cors_new <- word_cors %>%
                filter(!(item1 %in% remove_words|item2 %in% remove_words))

word_cors_new %>%
  filter(correlation > 0.6) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) + 
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()
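
The monthly graphs below repeat the steps above for each month of 2020. The code for those runs is not shown here; the sketch below illustrates how a single month (January, as an example) could be produced, with the minimum-frequency and correlation thresholds chosen as illustrative assumptions.

# A sketch: rebuild the correlation network for one month (e.g. 2020-01),
# re-joining the tokenized data to recover each article's date
jan_words <- marry_new_words %>%
  inner_join(distinct(data, artUrl, artDate), by = "artUrl") %>%
  filter(format(artDate, "%Y-%m") == "2020-01")

jan_words %>%
  group_by(word) %>%
  filter(n() >= 5) %>%
  pairwise_cor(word, artUrl, sort = TRUE) %>%
  filter(!(item1 %in% remove_words | item2 %in% remove_words),
         correlation > 0.3) %>%
  graph_from_data_frame() %>%
  ggraph(layout = "fr") +
  geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
  geom_node_point(color = "lightblue", size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
  theme_void()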

Monthly co-occurrence graphs for January through December 2020 (one figure per month).