Topic: Why are people afraid to get married?
Motivation and purpose: The US Central Intelligence Agency (CIA) released its 2021 projections of fertility rates worldwide, in which the bottom five countries are all in Asia: Hong Kong, Macau, Singapore, South Korea, and Taiwan. Taiwan ranked last among all 227 countries surveyed, a striking result that reminds us the problem of a declining birth rate remains unsolved. When young people are reluctant to marry and have children, national competitiveness suffers. We therefore turn to PTT users, who tend to have life experience and sufficient financial means, to understand how young people currently think about having children, analyze the resulting text, and identify causes that could help the government improve the social environment.
# Read the article metadata for each board
getMarry <- fread('./data/2020GetmarryArticleMetaData.csv', encoding = 'UTF-8')
women <- fread('./data/2020WomenArticleMetaData.csv', encoding = 'UTF-8')
marriage <- fread('./data/2020MarriageArticleMetaData.csv', encoding = 'UTF-8')
print(paste('getMarry board has', nrow(getMarry), 'articles'))
print(paste('women board has', nrow(women), 'articles'))
print(paste('marriage board has', nrow(marriage), 'articles'))
## [1] "getMarry board has 162 articles"
## [1] "women board has 308 articles"
## [1] "marriage board has 316 articles"
# Filter out GetMarry articles whose artTitle contains [廣宣], [交易], [請益], [贈送], [推薦], or [分享]
keywords <- c('廣宣','交易','請益','贈送','推薦','分享')
toMatch <- paste(keywords, collapse = "|")
getMarry <- getMarry[!grepl(toMatch, artTitle)]
# Add a category column
getMarry$artCat <- "GetMarry"
# Print the result
print(paste('After filtering irrelevant titles, the getMarry board has', nrow(getMarry), 'articles'))
## [1] "After filtering irrelevant titles, the getMarry board has 31 articles"
# Build a jieba worker that filters stop words
jieba_tokenizer <- worker(stop_word = "./dict/stop_words.txt")
# Define the tokenizer function for unnest_tokens
customized_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
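A quick sanity check of the tokenizer (the sample sentence is made up):
# Returns a list with one character vector of segmented words
customized_tokenizer("結婚之後要不要跟公婆一起住")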
# unnest_tokens: output = word, input column = artContent / sentence, tokenizer = customized_tokenizer
# Keep only tokens containing non-English characters, and drop single-character tokens
getMarryToken <- getMarry %>%
  unnest_tokens(word, artContent, token = customized_tokenizer) %>%
  filter(str_detect(word, regex("[^a-zA-Z]"))) %>%
  filter(nchar(word) > 1)
womenToken <- women %>%
  unnest_tokens(word, sentence, token = customized_tokenizer) %>%
  filter(str_detect(word, regex("[^a-zA-Z]"))) %>%
  filter(nchar(word) > 1)
marriageToken <- marriage %>%
  unnest_tokens(word, sentence, token = customized_tokenizer) %>%
  filter(str_detect(word, regex("[^a-zA-Z]"))) %>%
  filter(nchar(word) > 1)
# Combine the three boards into one table
data <- rbind(
  getMarryToken[, c("artDate", "artUrl", "artCat", "word")],
  womenToken[, c("artDate", "artUrl", "artCat", "word")],
  marriageToken[, c("artDate", "artUrl", "artCat", "word")]
)
## artDate artUrl artCat
## 1: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 2: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 3: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 4: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 5: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## 6: 2020-01-11 https://www.ptt.cc/bbs/GetMarry/M.1578737222.A.6A8.html GetMarry
## word
## 1: 明天
## 2: 結婚
## 3: 忐忑不安
## 4: 好怕
## 5: 環節
## 6: 會出
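The per-article word counts below have no generating code in this excerpt; a likely reconstruction (an assumption based on the output's columns):
# Likely reconstruction: count each word's occurrences per article
data %>% count(artUrl, word, sort = TRUE)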
## artUrl word n
## 1: https://www.ptt.cc/bbs/marriage/M.1587963172.A.332.html 媽媽 32
## 2: https://www.ptt.cc/bbs/marriage/M.1588194512.A.962.html 老公 31
## 3: https://www.ptt.cc/bbs/marriage/M.1592541632.A.8F9.html 小孩 28
## 4: https://www.ptt.cc/bbs/marriage/M.1579685584.A.C8F.html 男友 27
## 5: https://www.ptt.cc/bbs/marriage/M.1593592497.A.942.html 孩子 26
## ---
## 57346: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 原因 1
## 57347: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 直接 1
## 57348: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 職稱 1
## 57349: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 抓到 1
## 57350: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 最近 1
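The positive (P) and negative (N) LIWC word lists are loaded outside this excerpt. A plausible load, assuming each dictionary file holds a single comma-separated line of words (the ./dict/liwc/ paths are hypothetical):
# Hypothetical paths; each file assumed to hold one comma-separated line
P <- scan('./dict/liwc/positive.txt', what = character(), sep = '\n',
          encoding = 'utf-8', fileEncoding = 'utf-8', quiet = TRUE)
N <- scan('./dict/liwc/negative.txt', what = character(), sep = '\n',
          encoding = 'utf-8', fileEncoding = 'utf-8', quiet = TRUE)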
# Split each dictionary string on commas
# strsplit returns a list; take its first element
P = strsplit(P, ",")[[1]]
N = strsplit(N, ",")[[1]]
# Build data frames with two columns, word and sentiment; word holds the dictionary vector
P = data.frame(word = P, sentiment = "positive") # 664 words
N = data.frame(word = N, sentiment = "negative") # 1047 words
# Concatenate the two dictionaries
LIWC = rbind(P, N)
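A toy illustration of how the join used below tags tokens with sentiment (whether these particular words are in the dictionary is an assumption; neutral words are simply dropped by inner_join):
# Toy example (assumed dictionary membership): neutral "桌子" is dropped
data.frame(word = c("開心", "討厭", "桌子")) %>%
  inner_join(LIWC, by = "word")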
# Line chart of daily posting volume; positive/negative sentiment lines follow below
data$artDate= data$artDate %>% as.Date("%Y/%m/%d")
data %>%
group_by(artDate) %>%
summarise(count = n()) %>%
ggplot()+
geom_line(aes(x=artDate,y=count))+
scale_x_date(labels = date_format("%m/%d"))
# sentiment_count: artDate, sentiment, count
sentiment_count = data %>%
select(artDate,word) %>%
inner_join(LIWC) %>%
group_by(artDate,sentiment) %>%
summarise(count=n())
## Joining, by = "word"
## `summarise()` has grouped output by 'artDate'. You can override using the `.groups` argument.
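The date range printed below is presumably obtained with something like the following (an assumption):
# Likely reconstruction: check the span of dates covered
range(sentiment_count$artDate)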
## [1] "2020-01-01" "2020-12-29"
sentiment_count %>%
ggplot()+
geom_line(aes(x=artDate,y=count,colour=sentiment))+
scale_x_date(labels = date_format("%m/%d"),
limits = as.Date(c('2020-01-01','2020-12-29'))
)
sentiment_count %>%
  # Normalize: convert daily counts to ratios
  group_by(artDate) %>%
  mutate(ratio = count/sum(count)) %>%
  # Plot
  ggplot() +
  geom_line(aes(x = artDate, y = ratio, colour = sentiment)) +
  scale_x_date(labels = date_format("%m/%d"),
               limits = as.Date(c('2020-01-01', '2020-12-29'))
  )
# To mark a specific date, a vertical line can be added, e.g.:
# geom_vline(aes(xintercept = as.numeric(artDate[which(sentiment_count$artDate == as.Date('2021-03-18'))[1]])), colour = "red")
jieba_tokenizer = worker()
# Bigram tokenizer for unnest_tokens
# Input: a character vector
# Output: a list of character vectors of the same length
# ngrams() is assumed to come from the NLP package
jieba_bigram <- function(t) {
  lapply(t, function(x) {
    if (nchar(x) > 1) {
      tokens <- segment(x, jieba_tokenizer)
      bigram <- ngrams(tokens, 2)
      bigram <- lapply(bigram, paste, collapse = " ")
      unlist(bigram)
    }
  })
}
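A quick check of the bigram tokenizer on a made-up sentence:
# Returns a list with one character vector of space-joined word pairs
jieba_bigram("我們明年打算登記結婚")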
getMarryBigram <- getMarry %>%
unnest_tokens(bigram, artContent, token = jieba_bigram)
womenBigram <- women %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
marriageBigram <- marriage %>%
unnest_tokens(bigram, sentence, token = jieba_bigram)
# Combine the three boards into one table
data_bigram <- rbind(
  getMarryBigram[, c("artDate", "artUrl", "artCat", "bigram")],
  womenBigram[, c("artDate", "artUrl", "artCat", "bigram")],
  marriageBigram[, c("artDate", "artUrl", "artCat", "bigram")]
)
# Drop bigrams that contain English letters or digits,
# then count how often each bigram appears
data_bigram %>%
filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
count(bigram, sort = TRUE)
## bigram n
## 1: 也 是 306
## 2: 都 是 275
## 3: 的 人 265
## 4: 自己 的 252
## 5: 我 的 243
## ---
## 101661: ꀊ 然後 1
## 101662: ꀊ 相處 1
## 101663: ꀊ 也許 1
## 101664: ꀊ 以前 1
## 101665: ꀊ 原 1
# Trigram tokenizer, analogous to jieba_bigram above
jieba_trigram <- function(t) {
lapply(t, function(x) {
if(nchar(x)>1){
tokens <- segment(x, jieba_tokenizer)
ngram<- ngrams(unlist(tokens), 3)
ngram <- lapply(ngram, paste, collapse = " ")
unlist(ngram)
}
})
}
getMarryTrigram <- getMarry %>%
unnest_tokens(trigram, artContent, token = jieba_trigram)
womenTrigram <- women %>%
unnest_tokens(trigram, sentence, token = jieba_trigram)
marriageTrigram <- marriage %>%
unnest_tokens(trigram, sentence, token = jieba_trigram)
# Combine the three boards into one table
data_trigram <- rbind(
  getMarryTrigram[, c("artDate", "artUrl", "artCat", "trigram")],
  womenTrigram[, c("artDate", "artUrl", "artCat", "trigram")],
  marriageTrigram[, c("artDate", "artUrl", "artCat", "trigram")]
)
## trigram n
## 1: 跟 我 說 44
## 2: 同意 記者 抄文 32
## 3: 是否 同意 記者 31
## 4: 是 為 了 30
## 5: 為 什麼 要 28
## ---
## 150206: ꀊ 我 的 1
## 150207: ꀊ 我 婆婆 1
## 150208: ꀊ 相處 長年 1
## 150209: ꀊ 也許 是 1
## 150210: ꀊ 以前 我 1
# Both the bigram and trigram lists still contain stop words, so build a stop-word dictionary to filter them
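The `stop_words` vector used below is not built in this excerpt; a likely reconstruction, reusing the stop-word file already referenced above:
# Likely reconstruction: load the stop-word list as a character vector
stop_words <- scan(file = "./dict/stop_words.txt", what = character(), sep = '\n',
                   encoding = 'utf-8', fileEncoding = 'utf-8', quiet = TRUE)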
data_bigram %>%
  filter(!str_detect(bigram, regex("[0-9a-zA-Z]"))) %>%
  separate(bigram, c("word1", "word2"), sep = " ") %>%
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words)) %>%
  count(word1, word2, sort = TRUE) %>%
  unite("bigram", word1, word2, sep = " ")
## bigram n
## 1: 兩個 人 65
## 2: 這件 事 53
## 3: 妳 老公 47
## 4: 做 家事 41
## 5: 結婚 後 38
## ---
## 39354: ꀊ 看到 1
## 39355: ꀊ 相處 1
## 39356: ꀊ 也許 1
## 39357: ꀊ 以前 1
## 39358: ꀊ 原 1
data_trigram %>%
  filter(!str_detect(trigram, regex("[0-9a-zA-Z]"))) %>%
  separate(trigram, c("word1", "word2", "word3"), sep = " ") %>%
  filter(!(word1 %in% stop_words), !(word2 %in% stop_words), !(word3 %in% stop_words)) %>%
  count(word1, word2, word3, sort = TRUE) %>%
  unite("trigram", word1, word2, word3, sep = " ")
## trigram n
## 1: 同意 記者 抄文 32
## 2: 是否 同意 記者 31
## 3: 控制 碼 請 6
## 4: 顏色 控制 碼 6
## 5: 原文 恕 刪 6
## ---
## 23454: ㄧ 時 逃不了 1
## 23455: ㄧ 體 那為 1
## 23456: ㄧ 直 以來 1
## 23457: ㄨ 服 ㄒ 1
## 23458: ꀊ 相處 長年 1
# load marry_lexicon
marry_lexicon <- scan(file = "./dict/marry_lexicon.txt", what=character(),sep='\n',
encoding='utf-8',fileEncoding='utf-8',quiet = T)
# The marriage-related custom dictionary
marry_lexicon
## [1] "情緒勒索" "做家事" "原生家庭" "月子中心" "是否同意"
## [6] "記者抄文" "雙方父母" "雙方家長" "一起生活" "經濟壓力"
## [11] "倒垃圾" "登記結婚" "婚姻諮商" "請保姆" "恕刪"
## [16] "掃地機器人" "喜歡小孩" "玩手機" "生活重心" "肺炎疫情"
## [21] "辦婚禮" "公婆家" "黑人問號" "解決方法" "辰亦儒"
## [26] "放汽座" "男神蘇志燮" "武漢肺炎疫情" "收禮金" "吃人夠夠"
## [31] "是否同意記者" "顏色控制碼"
## [1] TRUE
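The `TRUE` above is presumably the return value of registering the lexicon with a jieba worker, and `chi_tokenizer` used below is not defined in this excerpt. A minimal sketch, assuming jiebaR's new_user_word() and a segment-based tokenizer like customized_tokenizer above:
# Sketch (assumed): register the custom lexicon; new_user_word() returns TRUE
new_user_word(jieba_tokenizer, marry_lexicon)
# Sketch (assumed): tokenizer that now respects the lexicon entries
chi_tokenizer <- function(t) {
  lapply(t, function(x) segment(x, jieba_tokenizer))
}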
# Re-tokenize the word column of data with the lexicon-aware tokenizer
marry_new_words <- data %>%
unnest_tokens(word, word, token=chi_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(artUrl, word, sort = TRUE)
marry_new_words
## artUrl word n
## 1: https://www.ptt.cc/bbs/marriage/M.1587963172.A.332.html 媽媽 32
## 2: https://www.ptt.cc/bbs/marriage/M.1588194512.A.962.html 老公 31
## 3: https://www.ptt.cc/bbs/marriage/M.1592541632.A.8F9.html 小孩 28
## 4: https://www.ptt.cc/bbs/marriage/M.1579685584.A.C8F.html 男友 27
## 5: https://www.ptt.cc/bbs/marriage/M.1593592497.A.942.html 孩子 26
## ---
## 55895: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 原因 1
## 55896: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 直接 1
## 55897: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 職稱 1
## 55898: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 抓到 1
## 55899: https://www.ptt.cc/bbs/WomenTalk/M.1608971659.A.1A4.html 最近 1
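The deprecation warnings and the item1/item2/n table below have no generating code in this excerpt; a likely reconstruction using widyr::pairwise_count (an assumption based on the output's columns):
# Likely reconstruction: count how often two words appear in the same article
marry_new_words %>%
  pairwise_count(word, artUrl, sort = TRUE)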
## Warning: `distinct_()` was deprecated in dplyr 0.7.0.
## Please use `distinct()` instead.
## See vignette('programming') for more help
## Warning: `tbl_df()` was deprecated in dplyr 1.0.0.
## Please use `tibble::as_tibble()` instead.
## # A tibble: 5,918,178 x 3
## item1 item2 n
## <chr> <chr> <dbl>
## 1 覺得 結婚 229
## 2 結婚 覺得 229
## 3 真的 結婚 198
## 4 結婚 真的 198
## 5 真的 覺得 174
## 6 覺得 真的 174
## 7 現在 結婚 171
## 8 結婚 現在 171
## 9 知道 結婚 163
## 10 結婚 知道 163
## # … with 5,918,168 more rows
word_cors <- marry_new_words %>%
group_by(word) %>%
filter(n() >= 10) %>%
pairwise_cor(word, artUrl, sort = TRUE)
word_cors
## # A tibble: 1,009,020 x 3
## item1 item2 correlation
## <chr> <chr> <dbl>
## 1 抄文 記者 0.901
## 2 記者 抄文 0.901
## 3 抄文 同意 0.701
## 4 同意 抄文 0.701
## 5 產後 憂鬱 0.691
## 6 憂鬱 產後 0.691
## 7 記者 同意 0.668
## 8 同意 記者 0.668
## 9 祖先 屬虎 0.633
## 10 屬虎 祖先 0.633
## # … with 1,009,010 more rows
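pairwise_cor() computes the phi coefficient over word-article co-presence, which equals the Pearson correlation of two binary indicator vectors. A manual check for the top pair above (values may differ slightly because word_cors was computed after the n() >= 10 filter):
# Manual phi check: indicators for whether each article contains the word
occ <- marry_new_words %>% distinct(artUrl, word)
urls <- unique(occ$artUrl)
x <- urls %in% occ$artUrl[occ$word == "記者"]
y <- urls %in% occ$artUrl[occ$word == "抄文"]
cor(as.numeric(x), as.numeric(y)) # should be close to 0.901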
set.seed(2020)
word_cors %>%
filter(correlation > 0.5) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") + # set a CJK-capable font so Chinese labels render correctly
theme_void()
### Word correlation graph for pairs with correlation above 0.6
set.seed(2020)
word_cors %>%
filter(correlation > 0.6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
# Pick a few seed words
seed_words <- c("記者", "抄文")
# Set the threshold to 0.6
threshold <- 0.6
# Any word whose correlation with a seed word exceeds the threshold joins the removal list
remove_words <- word_cors %>%
filter((item1 %in% seed_words|item2 %in% seed_words), correlation>threshold) %>%
.$item1 %>%
unique()
remove_words
## [1] "抄文" "記者" "同意" "是否"
# Drop every pair that contains one of these words
word_cors_new <- word_cors %>%
filter(!(item1 %in% remove_words|item2 %in% remove_words))
word_cors_new %>%
filter(correlation > 0.6) %>%
graph_from_data_frame() %>%
ggraph(layout = "fr") +
geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
geom_node_point(color = "lightblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
theme_void()
######################## 2020 monthly co-occurrence graphs, January through December (figures omitted) ########################
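The monthly graphs themselves are omitted here; a sketch of how they could be produced, reusing the same pairwise_cor pipeline restricted to each month (assumed, not the original code):
# Sketch (assumed): one co-occurrence graph per month of 2020
for (m in sprintf("%02d", 1:12)) {
  month_cors <- data %>%
    filter(format(artDate, "%m") == m) %>%
    count(artUrl, word, sort = TRUE) %>%
    group_by(word) %>%
    filter(n() >= 10) %>% # keep words appearing in at least 10 articles
    pairwise_cor(word, artUrl, sort = TRUE)
  p <- month_cors %>%
    filter(correlation > 0.6) %>%
    graph_from_data_frame() %>%
    ggraph(layout = "fr") +
    geom_edge_link(aes(edge_alpha = correlation), show.legend = FALSE) +
    geom_node_point(color = "lightblue", size = 3) +
    geom_node_text(aes(label = name), repel = TRUE, family = "Heiti TC Light") +
    theme_void()
  print(p)
}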