Topic: Why are people afraid to get married?
Motivation and purpose of the analysis: The 2021 fertility-rate projections published by the US Central Intelligence Agency (CIA) place the bottom five countries and regions all in Asia: Hong Kong, Macau, Singapore, South Korea, and Taiwan, with Taiwan ranked last among the 227 countries covered. Such a startling result is a reminder that the problem of the declining birth rate remains unsolved. When young people are reluctant to marry and have children, national competitiveness suffers. We therefore want to use posts from PTT users, a community with broad life experience and sufficient financial means, to understand how young people currently think about having children, analyze that information, and identify the underlying causes so as to help the government improve the social environment.
Questions: By analyzing these community posts, we hope to answer three questions: 1. Why is the current fertility rate so low? 2. What are the reasons for not getting married? 3. What kind of support would be needed to raise the fertility rate?
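The chunks below assume a number of packages have already been loaded in a setup chunk that is not shown in this excerpt. A minimal sketch of the likely setup (package choices inferred from the functions used):
# Packages assumed to be loaded earlier; the original setup chunk is not shown
library(data.table)  # fread
library(dplyr)
library(tidyr)       # separate, unite
library(stringr)     # str_detect
library(jiebaR)      # worker, segment
library(tidytext)    # unnest_tokens
library(NLP)         # ngrams
library(ggplot2)
library(scales)      # date_format
library(widyr)       # pairwise_count, pairwise_cor
library(igraph)      # graph_from_data_frame
library(ggraph)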
# Read the article metadata for each board
getMarry <- fread('./data/GetmarryArticleMetaData.csv',encoding = 'UTF-8')
women <- fread('./data/WomenArticleMetaData.csv',encoding = 'UTF-8')
marriage <- fread('./data/MarriageArticleMetaData.csv',encoding = 'UTF-8')
print(paste('getMarry版有',count(getMarry),'筆資料'))
print(paste('women版有',count(women),'筆資料'))
print(paste('marriage版有',count(marriage),'筆資料'))
## [1] "getMarry版有 206 筆資料"
## [1] "women版有 387 筆資料"
## [1] "marriage版有 481 筆資料"
# Filter out GetMarry articles whose artTitle contains [廣宣], [交易], [請益], [贈送], [推薦], or [分享]
keywords <- c('廣宣','交易','請益','贈送','推薦','分享')
toMatch <- paste(keywords,collapse="|")
getMarry <- with(getMarry, getMarry[!grepl(toMatch,artTitle)])
# Add a category column
getMarry$artCat <- "GetMarry"
# Print the filtered count
print(paste('過濾title含有不相關字眼的getMarry版有',count(getMarry),'筆資料'))
## [1] "過濾title含有不相關字眼的getMarry版有 32 筆資料"
# Set up the jieba worker with a stop-word list
jieba_tokenizer <- worker(stop_word = "./dict/stop_words.txt")
# Define the tokenizer function
customized_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
# unnest_tokens: output = word, input = sentence, tokenizer = customized_tokenizer
# Drop tokens made up only of English letters, and single-character tokens
getMarryToken <- getMarry %>%
unnest_tokens(word, sentence, token=customized_tokenizer) %>%
filter(str_detect(word,regex("[^a-zA-Z]"))) %>%
filter(nchar(.$word)>1)
womenToken <- women %>%
unnest_tokens(word, sentence, token=customized_tokenizer) %>%
filter(str_detect(word,regex("[^a-zA-Z]"))) %>%
filter(nchar(.$word)>1)
marriageToken <- marriage %>%
unnest_tokens(word, sentence, token=customized_tokenizer) %>%
filter(str_detect(word,regex("[^a-zA-Z]"))) %>%
filter(nchar(.$word)>1)
# Combine the tokenized data from the three boards
data <- rbind(getMarryToken[, c("artDate", "artUrl", "artCat", "word")],
              womenToken[, c("artDate", "artUrl", "artCat", "word")],
              marriageToken[, c("artDate", "artUrl", "artCat", "word")])
## artDate artUrl artCat
## 1: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 2: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 3: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 4: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 5: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 6: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## word
## 1: 明天
## 2: 結婚
## 3: 忐忑不安
## 4: 好怕
## 5: 環節
## 6: 會出
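The per-article word counts printed below have no corresponding chunk in this excerpt; they presumably come from counting the combined token table by article and word, roughly:
# Presumed source of the table below: word frequency per article
data %>%
  count(artUrl, word, sort = TRUE)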
## artUrl word n
## 1: https://www.ptt.cc/bbs/marriage/M.1587963172.A.332.html 媽媽 32
## 2: https://www.ptt.cc/bbs/marriage/M.1588194512.A.962.html 老公 31
## 3: https://www.ptt.cc/bbs/marriage/M.1592541632.A.8F9.html 小孩 28
## 4: https://www.ptt.cc/bbs/marriage/M.1579685584.A.C8F.html 男友 27
## 5: https://www.ptt.cc/bbs/marriage/M.1593592497.A.942.html 孩子 26
## ---
## 80053: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 以後 1
## 80054: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 孕育 1
## 80055: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 找個 1
## 80056: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 知道 1
## 80057: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 周遭 1
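The next chunk splits two comma-separated strings, P and N, that are never loaded in this excerpt. They presumably hold the positive and negative word lists of a LIWC-style dictionary; a sketch, with hypothetical file paths under ./dict/:
# Hypothetical paths: each file is assumed to contain one comma-separated line of dictionary words
P <- scan(file = "./dict/liwc/positive.txt", what = character(), sep = "\n",
          encoding = "utf-8", fileEncoding = "utf-8", quiet = TRUE)
N <- scan(file = "./dict/liwc/negative.txt", what = character(), sep = "\n",
          encoding = "utf-8", fileEncoding = "utf-8", quiet = TRUE)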
# Split each dictionary string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive") # 664 positive words
N = data.frame(word = N, sentiment = "negative") # 1047 negative words
# Combine the two dictionaries
LIWC = rbind(P, N)
# Line chart of daily posting activity (token counts per day)
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
data %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot()+
geom_line(aes(x=artDate,y=count))+
scale_x_date(labels = date_format("%m/%d"))
# sentiment_count: one row per artDate and sentiment, with the token count
sentiment_count = data %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
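The date range printed below has no corresponding chunk here; it presumably comes from something like:
# Presumed source of the output below: date range covered by the sentiment counts
range(sentiment_count$artDate)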
## [1] "2020-01-01" "2021-05-03"
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2020-01-01','2020-12-29'))
)
## Warning: Removed 144 row(s) containing missing values (geom_path).
sentiment_count %>%
# Normalize: convert counts to within-day ratios
group_by(artDate) %>%
mutate(ratio = count/sum(count)) %>%
# Plot
ggplot()+
geom_line(aes(x=artDate,y=ratio,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2020-01-01','2020-12-29'))
)
## Warning: Removed 144 row(s) containing missing values (geom_path).
# Optionally add a vertical line marking a specific date, e.g.:
# geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-18'))[1]])),
#            colour = "red")
jieba_tokenizer = worker()
# Bigram tokenizer function for unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
jieba_bigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
bigram<- ngrams(tokens, 2)
bigram <- lapply(bigram, paste, collapse = " ")
unlist(bigram)
}
})
}
getMarryBigram <- getMarry %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
womenBigram <- women %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
marriageBigram <- marriage %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
# Combine the bigram data from the three boards
data_bigram <- rbind(getMarryBigram[, c("artDate", "artUrl", "artCat", "bigram")],
                     womenBigram[, c("artDate", "artUrl", "artCat", "bigram")],
                     marriageBigram[, c("artDate", "artUrl", "artCat", "bigram")])
# Remove bigrams that contain English letters or digits
# Count how often each bigram appears
data_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
count(bigram, sort = TRUE)
## bigram n
## 1: 也 是 410
## 2: 都 是 368
## 3: 的 人 360
## 4: 我 的 323
## 5: 自己 的 322
## ---
## 134542: ꀊ 然後 1
## 134543: ꀊ 相處 1
## 134544: ꀊ 也許 1
## 134545: ꀊ 以前 1
## 134546: ꀊ 原 1
jieba_trigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
ngram<- ngrams(unlist(tokens), 3)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
}
})
}
getMarryTrigram <- getMarry %>%
unnest_tokens(trigram, sentence, token = jieba_trigram)
womenTrigram <- women %>%
unnest_tokens(trigram, sentence, token = jieba_trigram)
marriageTrigram <- marriage %>%
unnest_tokens(trigram, sentence, token = jieba_trigram)
# Combine the trigram data from the three boards
data_trigram <- rbind(getMarryTrigram[, c("artDate", "artUrl", "artCat", "trigram")],
                      womenTrigram[, c("artDate", "artUrl", "artCat", "trigram")],
                      marriageTrigram[, c("artDate", "artUrl", "artCat", "trigram")])
## trigram n
## 1: 跟 我 說 49
## 2: 是 為 了 40
## 3: 同意 記者 抄文 38
## 4: 是否 同意 記者 37
## 5: 為 什麼 要 37
## ---
## 204199: ꀊ 我 的 1
## 204200: ꀊ 我 婆婆 1
## 204201: ꀊ 相處 長年 1
## 204202: ꀊ 也許 是 1
## 204203: ꀊ 以前 我 1
# Both the bigram and trigram results still contain stop words
# Build a stop-word dictionary to filter them out (see the sketch below)
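The stop_words object used in the filters below is not defined in this excerpt; it is presumably a character vector read from the same stop-word file passed to the jieba worker earlier. A sketch:
# Assumed: load the stop-word list (one word per line) into a character vector
stop_words <- scan(file = "./dict/stop_words.txt", what = character(), sep = "\n",
                   encoding = "utf-8", fileEncoding = "utf-8", quiet = TRUE)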
data_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
count(word1, word2, sort = TRUE) %>%
unite_("bigram", c("word1","word2"), sep=" ")
## bigram n
## 1: 兩個 人 91
## 2: 這件 事 78
## 3: 妳 老公 62
## 4: 結婚 後 52
## 5: 比較 好 49
## ---
## 53826: ꀊ 看到 1
## 53827: ꀊ 相處 1
## 53828: ꀊ 也許 1
## 53829: ꀊ 以前 1
## 53830: ꀊ 原 1
data_trigram %>%
filter(!str_detect(trigram, regex("[0-9a-zA-Z]"))) %>%
separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
count(word1, word2, word3, sort = TRUE) %>%
unite_("trigram", c("word1", "word2", "word3"), sep=" ")
## trigram n
## 1: 同意 記者 抄文 38
## 2: 是否 同意 記者 37
## 3: 控制 碼 請 10
## 4: 顏色 控制 碼 10
## 5: 當時 父母 談 9
## ---
## 32883: ㄧ 時 逃不了 1
## 32884: ㄧ 體 那為 1
## 32885: ㄧ 直 以來 1
## 32886: ㄨ 服 ㄒ 1
## 32887: ꀊ 相處 長年 1
# load marry_lexicon
marry_lexicon <- scan(file = "./dict/marry_lexicon.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8',quiet = T)
# Marriage-related custom dictionary
marry_lexicon
## [1] "情緒勒索" "做家事" "原生家庭" "月子中心" "是否同意"
## [6] "記者抄文" "雙方父母" "雙方家長" "一起生活" "經濟壓力"
## [11] "倒垃圾" "登記結婚" "婚姻諮商" "請保姆" "恕刪"
## [16] "掃地機器人" "喜歡小孩" "玩手機" "生活重心" "肺炎疫情"
## [21] "辦婚禮" "公婆家" "黑人問號" "解決方法" "辰亦儒"
## [26] "放汽座" "男神蘇志燮" "武漢肺炎疫情" "收禮金" "吃人夠夠"
## [1] TRUE
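The [1] TRUE above is presumably the return value of jiebaR::new_user_word(), which registers the custom lexicon with a jieba worker, and chi_tokenizer, used in the next chunk, is presumably the same shape as customized_tokenizer but built on that worker. A sketch under those assumptions:
# Assumed setup: register marry_lexicon with a jieba worker (new_user_word() returns TRUE on success)
chi_worker <- worker(stop_word = "./dict/stop_words.txt")
new_user_word(chi_worker, marry_lexicon)
# Tokenizer used below, analogous to customized_tokenizer but aware of the custom words
chi_tokenizer <- function(t) {
  lapply(t, function(x) segment(x, chi_worker))
}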
# Re-tokenize the word column of data with the custom tokenizer
marry_new_words <- data%>%
unnest_tokens(word, word, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
marry_new_words
## artUrl word n
## 1: https://www.ptt.cc/bbs/marriage/M.1587963172.A.332.html 媽媽 32
## 2: https://www.ptt.cc/bbs/marriage/M.1588194512.A.962.html 老公 31
## 3: https://www.ptt.cc/bbs/marriage/M.1592541632.A.8F9.html 小孩 28
## 4: https://www.ptt.cc/bbs/marriage/M.1579685584.A.C8F.html 男友 27
## 5: https://www.ptt.cc/bbs/marriage/M.1593592497.A.942.html 孩子 26
## ---
## 77738: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 以後 1
## 77739: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 孕育 1
## 77740: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 找個 1
## 77741: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 知道 1
## 77742: https://www.ptt.cc/bbs/WomenTalk/M.1620016646.A.8D2.html 周遭 1
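The deprecation warnings and the 8,135,048-row item1/item2/n table below have no corresponding chunk in this excerpt; the table looks like per-article word co-occurrence counts, presumably from widyr::pairwise_count (whose older internals trigger the distinct_()/tbl_df() warnings). A sketch:
# Presumed source of the table below: how often two words appear in the same article
marry_new_words %>%
  pairwise_count(word, artUrl, sort = TRUE)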
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## # A tibble: 8,135,048 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 覺得 結婚 293
## 2 結婚 覺得 293
## 3 真的 結婚 252
## 4 結婚 真的 252
## 5 真的 覺得 231
## 6 覺得 真的 231
## 7 知道 結婚 220
## 8 結婚 知道 220
## 9 現在 結婚 219
## 10 結婚 現在 219
## # … with 8,135,038 more rows
word_cors <- marry_new_words %>%
group_by(word) %>%
filter(n() >= 10) %>%
pairwise_cor(word, artUrl, sort = TRUE)
word_cors
## # A tibble: 1,981,056 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 抄文 記者 0.885
## 2 記者 抄文 0.885
## 3 刪掉 顏色 0.846
## 4 顏色 刪掉 0.846
## 5 來家裡 一套 0.819
## 6 一套 來家裡 0.819
## 7 看板 作者 0.798
## 8 作者 看板 0.798
## 9 來家裡 並說 0.798
## 10 隨意 並說 0.798
## # … with 1,981,046 more rows
set.seed(2020)
word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a Chinese font so Chinese labels render correctly
theme_void()
### Plot the word correlation network for pairs with correlation greater than 0.6
set.seed(2020)
word_cors %>%
filter(correlation > 0.6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
# Pick a few words as seed words
seed_words <- c("記者", "抄文")
# Set the threshold to 0.6
threshold <- 0.6
# Words whose correlation with a seed word exceeds the threshold are added to the removal list
remove_words <- word_cors %>%
filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
.$item1 %>%
unique()
remove_words
## [1] "抄文" "記者" "同意"
# Remove pairs that contain any of these words
word_cors_new <- word_cors %>%
filter(!(item1 %in% remove_words|item2 %in% remove_words))
word_cors_new %>%
filter(correlation > 0.6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
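The monthly loop below iterates over data_m, a list of twelve per-month subsets that is not constructed in this excerpt. One way it might be built from the combined article metadata (a sketch; the articles object and the split by calendar month are assumptions):
# Assumed construction of data_m: split the combined article metadata into 12 monthly subsets
articles <- rbind(getMarry, women, marriage, fill = TRUE)
articles$artDate <- as.Date(articles$artDate)
data_m <- lapply(1:12, function(m) {
  articles %>% filter(as.integer(format(artDate, "%m")) == m)
})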
# Build a word-correlation table per month: tokenize, count words per article,
# keep words appearing at least 10 times, and compute pairwise correlations
mnw <- list()
for (x in 1:12) {
  mnw[[x]] <- data_m[[x]] %>%
    unnest_tokens(word, sentence, token = chi_tokenizer) %>%
    filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
    count(artUrl, word, sort = TRUE) %>%
    group_by(word) %>%
    filter(n() >= 10) %>%
    pairwise_cor(word, artUrl, sort = TRUE)
}
# Plot the correlation network for month x, keeping pairs with correlation above y
corg <- function(x, y = 0.1) {
  mnw[[x]] %>%
    filter(correlation > y) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
    geom_node_point(color = "lightblue", size = 3) +
    geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
    theme_void()
}
# Input the month number (and optional threshold) to plot, e.g. June with threshold 0.5
corg(6, 0.5)