As technology advances, online streaming platforms let drama fans enjoy a show at their own convenience even when they miss the premiere, and international OTT platforms are adept at using big data to analyze viewing habits, recently applying it not only to targeted marketing and developing potential audiences but also to early-stage production. We therefore crawl drama-related data from PTT to analyze what its users discussed about Taiwanese dramas over the past year. We build a drama classifier, use several models to examine how discussion topics cluster and how accurately each model identifies a given drama, and, in addition, define an article's popularity by its number of comments and compare each model's behavior on plain term-frequency DTM versus TF-IDF inputs.
## [1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
library(caret)
library(caTools)
library(data.table)
library(dplyr)
library(e1071)
library(factoextra)
library(ggplot2)
library(igraph)
library(jiebaR)
library(purrr)
library(randomForest)
library(ranger)
library(rpart)
library(rpart.plot)
library(stringr)
library(tidytext)
library(tm)
library(topicmodels)
library(xgboost)
# PTT TaiwanDrama 2019 posts
PTT_T2019p <- fread("./data/TaiwanDrama2019_articleMetaData.csv", encoding = "UTF-8", data.table = F) %>% select(-artTime, -artCat)
# PTT TaiwanDrama 2019 comments (message)
PTT_T2019m <- fread("./data/TaiwanDrama2019_articleMetaData_message.csv", encoding = "UTF-8", data.table = F) %>% select(-artTime, -artCat, -commentPoster)
# Convert dates from "chr" to "Date"
PTT_T2019p$artDate <- PTT_T2019p$artDate %>% as.Date("%Y/%m/%d")
PTT_T2019m$artDate <- PTT_T2019m$artDate %>% as.Date("%Y/%m/%d")
PTT_T2019m$commentDate <- PTT_T2019m$commentDate %>% substring(1, 10) %>% as.Date()
# Remove English letters and digits from posts and comments
PTT_T2019p$sentence <- gsub("[0-9a-zA-Z]", "", PTT_T2019p$sentence)
PTT_T2019m$commentContent <- gsub("[0-9a-zA-Z]", "", PTT_T2019m$commentContent)
jieba_tokenizer <- worker(user = "./dict/drama2019_dict.txt", stop_word = "./dict/stop_words.txt")
jieba_tokenizer$write = "NOFILE" # run this line only if an error occurs
drama_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
# drop tokens of length 1
tokens <- tokens[nchar(tokens) > 1]
return(tokens)
})
}
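As a quick sanity check, the tokenizer can be applied to a single string; a minimal sketch (the input sentence is hypothetical, and the actual segmentation depends on the dictionary files):

drama_tokenizer("想見你的劇情討論")
# returns a list with one character vector of tokens, with single-character tokens dropped
# (illustrative only; output depends on ./dict/drama2019_dict.txt and ./dict/stop_words.txt)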
# Posts
PTT_T2019p_tokens <- tibble(artDate = PTT_T2019p$artDate,
artUrl = PTT_T2019p$artUrl,
artPoster = PTT_T2019p$artPoster,
sentence = PTT_T2019p$sentence,
commentNum = PTT_T2019p$commentNum) %>%
unnest_tokens(word, sentence, token = drama_tokenizer)
# Comments
PTT_T2019m_tokens <- tibble(artDate = PTT_T2019m$artDate,
artUrl = PTT_T2019m$artUrl,
artPoster = PTT_T2019m$artPoster,
commentStatus = PTT_T2019m$commentStatus,
commentDate = PTT_T2019m$commentDate,
commentContent = PTT_T2019m$commentContent) %>%
  unnest_tokens(word, commentContent, token = drama_tokenizer)
synonym <- function(i)
{
i <- recode(i,
# Director, writer, and actor names
# 女兵日記女力報到
宣榕 = "李宣榕",
曜晟 = "林曜晟",
謙文 = "陳謙文",
雅筑 = "楊雅筑",
舒涵 = "梁舒涵",
語昕 = "方語昕",
瀚名 = "梁瀚名",
建予 = "林建予",
鯰魚哥 = "林建予",
鯰魚 = "林建予",
彥凱 = "尹彥凱",
小白 = "游小白",
樂妍 = "王樂妍",
# 圈套
鈞浩 = "徐鈞浩",
承洋 = "吳承洋",
慶華 = "卞慶華",
廷軒 = "陳廷軒",
意箴 = "林意箴",
春誠 = "丁春誠",
賢治 = "梅賢治",
家逵 = "陳家逵",
# 噬罪者
凱勛 = "莊凱勛",
晏豪 = "曹晏豪",
于喬 = "夏于喬",
淑臻 = "蔡淑臻",
子熙 = "林子熙",
# 罪夢者
孝全 = "張孝全",
曉萱 = "范曉萱",
柏傑 = "王柏傑",
# 想見你
佳嬿 = "柯佳嬿",
光漢 = "許光漢",
柏宇 = "施柏宇",
毓麟 = "顏毓麟",
藝文 = "嚴藝文",
# 雙城故事
怡蓉 = "陳怡蓉",
珮瑜 = "曾珮瑜",
柏鈞 = "黃柏鈞",
# 種菜女神
以豪 = "劉以豪",
庭妮 = "陳庭妮",
千娜 = "李千娜",
# 魂囚西門
敬騰 = "蕭敬騰",
碧婷 = "郭碧婷",
宜農 = "鄭宜農",
盈萱 = "謝盈萱",
葦華 = "藍葦華",
# 忠孝節義
麗花 = "楊麗花",
亞蘭 = "陳亞蘭",
# 通靈少女2
書瑤 = "郭書瑤",
貞菱 = "温貞菱",
少勳 = "范少勳",
# 我是顧家男
健瑋 = "黃健瑋",
謹華 = "楊謹華",
佳見 = "謝佳見",
敬宣 = "陳敬宣",
景嵐 = "張景嵐",
宇辰 = "邱宇辰",
# 一千個晚安
棟樑 = "張棟樑",
俞涵 = "連俞涵",
宗霖 = "李宗霖",
#愛寗 = "姚愛寗",
# 如果愛重來
書豪 = "張書豪",
# 生死接線員
少宗 = "曾少宗",
倩妏 = "劉倩妏",
承矩 = "單承矩",
震瀚 = "邱震瀚",
# 月村歡迎你
坤達 = "謝坤達",
思宇 = "林思宇",
念軒 = "吳念軒",
佳豫 = "李佳豫",
# 用九柑仔店
軒睿 = "張軒睿",
允雯 = "莫允雯",
# 天堂的微笑
杰楷 = "修杰楷",
予晞 = "林予晞",
振剛 = "唐振剛",
志友 = "方志友",
尚禾 = "黃尚禾",
# 你有念大學嗎
心亞 = "安心亞",
浩辰 = "禾浩辰",
宇辰 = "邱宇辰",
品潔 = "吳品潔",
# 靈異街11號
國毅 = "李國毅",
嫚書 = "簡嫚書",
# 我們與惡的距離
君陽 = "林君陽",
蒔媛 = "呂蒔媛",
十元 = "呂蒔媛",
靜雯 = "賈靜雯",
昇豪 = "溫昇豪",
慷仁 = "吳慷仁",
采詩 = "周采詩",
都拉斯 = "洪都拉斯",
可元 = "王可元",
沛慈 = "曾沛慈",
哲熹 = "林哲熹",
名帥 = "施名帥",
予晞 = "林予晞",
瓊煖 = "謝瓊煖",
卉喬 = "于卉喬",
#喬喬 = "于卉喬",
# 我們不能是朋友
以豪 = "劉以豪",
雪芙 = "郭雪芙",
其君 = "孫其君",
若妍 = "夏若妍",
艾菲 = "袁艾菲",
# 你那邊怎樣我這邊OK
正龍 = "藍正龍",
心湄 = "藍心湄",
西平 = "曹西平",
之喬 = "曾之喬",
冠廷 = "劉冠廷",
岳擎 = "吳岳擎",
泂江 = "陳泂江",
俊雄 = "黃俊雄",
# 劇名 & 角色名
# 越界
子軒 = "邱子軒",
宇豪 = "夏宇豪",
振文 = "王振文",
振武 = "王振武",
# 女兵日記女力報到
女力報到 = "女兵日記女力報到",
女力 = "女兵日記女力報到",
志強 = "宋志強",
朵拉 = "潘朵拉",
學文 = "周學文",
松齡 = "郭松齡",
曉嫻 = "高曉嫻",
淑靜 = "李淑靜",
素娥 = "葉素娥",
嫚莉 = "黃嫚莉",
雅芝 = "趙雅芝",
國樑 = "林國樑",
佑廷 = "邱佑廷",
寬亮 = "何寬亮",
光興 = "鄭光興",
大房 = "馬大房",
佳佳 = "張佳佳",
喬青 = "方喬青",
# 圈套
少飛 = "孟少飛",
飛飛 = "孟少飛",
立安 = "趙立安",
趙子 = "趙立安",
紅葉 = "左紅葉",
道一 = "古道一",
至德 = "李至德",
阿德 = "李至德",
文浩 = "陳文浩",
國棟 = "唐國棟",
# 那一天
豪廷 = "項豪廷",
希顧 = "于希顧",
志剛 = "盧志剛",
博翔 = "孫博翔",
# 想見你
雨萱 = "黃雨萱",
韻如 = "陳韻如",
子維 = "李子維",
詮勝 = "王詮勝",
俊傑 = "莫俊傑",
芝齊 = "謝芝齊",
宗儒 = "謝宗儒",
文磊 = "吳文磊",
思源 = "陳思源",
# 雙城故事
雙城 = "雙城故事",
念念 = "李念念",
天明 = "鄧天明",
# 魂囚西門
松言 = "魏松言",
# 最佳利益
方律 = "方箏",
博昀 = "陳博昀",
立廷 = "趙立廷",
# 美味滿閣
子堯 = "王子堯",
以茜 = "方以茜",
子文 = "王子文",
芳蓉 = "何芳蓉",
# 愛情白皮書
掛居 = "歐陽掛居",
成美 = "袁成美",
守治 = "瞿守治",
星華 = "季星華",
松岡 = "趙松岡",
康澤 = "沈康澤",
# 我是顧家男
顧家 = "我是顧家男",
家南 = "顧家南",
雙雙 = "徐雙雙",
博仁 = "顏博仁",
小敏 = "哈小敏",
心怡 = "傅心怡",
# 一千個晚安
柏森 = "吳柏森",
天雨 = "戴天雨",
# 如果愛重來
如果愛 = "如果愛重來",
大樂 = "汪大樂",
詠琪 = "何詠琪",
琪琪 = "何詠琪",
士元 = "王士元",
于茜 = "洪于茜",
# 月村歡迎你
月村 = "月村歡迎你",
國建 = "畢國建",
阿建 = "畢國建",
月禮 = "南月禮",
小禮 = "南月禮",
威海 = "馬威海",
# 俗女養成記
俗女 = "俗女養成記",
嘉玲 = "陳嘉玲",
# 用九柑仔店
用九 = "用九柑仔店",
俊龍 = "楊俊龍",
昭君 = "陳昭君",
# 天堂的微笑
又翔 = "孫又翔",
承舒 = "趙承舒",
宇則 = "何宇則",
令悠 = "楊令悠",
向東 = "程向東",
# 你有念大學嗎
大學 = "你有念大學嗎",
可艾 = "常可艾",
子浩 = "陽子浩",
小剛 = "蔡小剛",
子傑 = "梁子傑",
貞怡 = "簡貞怡",
# 靈異街11號
靈異街 = "靈異街11號",
志海 = "高志海",
阿海 = "高志海",
# 我們與惡的距離
與惡 = "我們與惡的距離",
喬安 = "宋喬安",
昭國 = "劉昭國",
天彥 = "劉天彥",
#天晴 = "劉天晴",
美媚 = "丁美媚",
曉明 = "李曉明",
曉文 = "李大芝", # "李曉文" 改名 "李大芝"
大芝 = "李大芝",
思悅 = "應思悅",
思聰 = "應思聰",
一駿 = "林一駿",
喬平 = "宋喬平",
紐世 = "廖紐世",
news哥 = "廖紐世",
# 我們不能是朋友
朋友 = "我們不能是朋友",
克桓 = "褚克桓",
惟惟 = "周惟惟",
皓一 = "黎皓一",
子媛 = "高子媛",
可菲 = "韓可菲",
百洋 = "顏百洋",
# 烏陰天的好日子
陰天 = "烏陰天的好日子",
碧秋 = "王碧秋"
)
}
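dplyr::recode leaves unmatched values untouched, so synonym() is safe to apply to the full token vector; a quick illustration using mappings from the table above:

synonym(c("光漢", "佳嬿", "劇情"))
# [1] "許光漢" "柯佳嬿" "劇情"   ("劇情" has no mapping and passes through unchanged)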
# Posts
PTT_T2019p_tokens$word <- synonym(PTT_T2019p_tokens$word)
# Comments
PTT_T2019m_tokens$word <- synonym(PTT_T2019m_tokens$word)
saveRDS(object = PTT_T2019p_tokens, file = "./object/PTT_T2019p_tokens.rds")
saveRDS(object = PTT_T2019m_tokens, file = "./object/PTT_T2019m_tokens.rds")
PTT_T2019p_dtm <- PTT_T2019p_tokens %>%
count(artUrl, word, sort = T) %>%
rename(count = n) %>%
cast_dtm(artUrl, word, count)
PTT_T2019p_dtm## <<DocumentTermMatrix (documents: 2529, terms: 59528)>>
## Non-/sparse entries: 383702/150162610
## Sparsity : 100%
## Maximal term length: 11
## Weighting : term frequency (tf)
ldas <- c()
topics <- c(2, 3, 8, 13, 20, 36)
for (topic in topics) {
  start_time <- Sys.time()
  lda <- LDA(PTT_T2019p_dtm, k = topic, control = list(seed = 2020))
  ldas <- c(ldas, lda)
  print(paste(topic, "topic(s); elapsed:", Sys.time() - start_time))
  save(ldas, file = "./object/ldas_result") # checkpoint after each model
}
topics <- c(2, 3, 8, 13, 20, 36)
tibble(k = topics,
       perplex = map_dbl(ldas, perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
PTT_T2019p_lda <- ldas[[4]] # use the model with 13 topics
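Besides the per-topic word distributions (beta) tidied below, the per-document topic mixtures can be pulled out the same way; a minimal sketch, not run in the original:

PTT_T2019p_gamma <- tidy(PTT_T2019p_lda, matrix = "gamma") # document-topic proportions
PTT_T2019p_gamma %>% group_by(document) %>% top_n(1, gamma) # dominant topic per article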
PTT_T2019p_topics <- tidy(PTT_T2019p_lda, matrix = "beta")
PTT_T2019p_top_terms <- PTT_T2019p_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
PTT_T2019p_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered() +
theme(text = element_text(family = "Heiti TC Light"))jieba_tokenizer2 <- worker(user = "./dict/drama2019_dict2.txt", stop_word = "./dict/stop_words2.txt" )
jieba_tokenizer2$write = "NOFILE" # 若有出現錯誤再跑這行
drama_tokenizer2 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer2)
# drop tokens of length 1
tokens <- tokens[nchar(tokens) > 1]
return(tokens)
})
}
# Posts
PTT_T2019p_tokens2 <- tibble(artDate = PTT_T2019p$artDate,
artUrl = PTT_T2019p$artUrl,
artPoster = PTT_T2019p$artPoster,
sentence = PTT_T2019p$sentence,
commentNum = PTT_T2019p$commentNum) %>%
unnest_tokens(word, sentence, token = drama_tokenizer2)
# Comments
PTT_T2019m_tokens2 <- tibble(artDate = PTT_T2019m$artDate,
artUrl = PTT_T2019m$artUrl,
artPoster = PTT_T2019m$artPoster,
commentStatus = PTT_T2019m$commentStatus,
commentDate = PTT_T2019m$commentDate,
commentContent = PTT_T2019m$commentContent) %>%
unnest_tokens(word, commentContent, token = drama_tokenizer2)
saveRDS(object = PTT_T2019p_tokens2, file = "./object/PTT_T2019p_tokens2.rds")
saveRDS(object = PTT_T2019m_tokens2, file = "./object/PTT_T2019m_tokens2.rds")PTT_T2019p_tokens2 <- readRDS(file = "./object/PTT_T2019p_tokens2.rds")
PTT_T2019m_tokens2 <- readRDS(file = "./object/PTT_T2019m_tokens2.rds")PTT_T2019p_dtm2 <- PTT_T2019p_tokens2 %>%
count(artUrl, word, sort = T) %>%
rename(count = n) %>%
cast_dtm(artUrl, word, count)
PTT_T2019p_dtm2## <<DocumentTermMatrix (documents: 2529, terms: 61829)>>
## Non-/sparse entries: 377547/155987994
## Sparsity : 100%
## Maximal term length: 9
## Weighting : term frequency (tf)
PTT_T2019p_lda2 <- readRDS(file = "./object/PTT_T2019p_lda2.rds") # LDA refit on the name-free tokens in a non-echoed chunk
PTT_T2019p_topics2 <- tidy(PTT_T2019p_lda2, matrix = "beta")
PTT_T2019p_top_terms2 <- PTT_T2019p_topics2 %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
PTT_T2019p_top_terms2 %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered() +
theme(text = element_text(family = "Heiti TC Light"))k_dtm <- PTT_T2019p_tokens %>%
count(artUrl,word) %>%
cast_dtm(artUrl, word, n)
kmeans.cluster <- kmeans(k_dtm, centers = 13)
saveRDS(object = kmeans.cluster, file = "./object/kmeans.cluster.rds")
kmeans.cluster <- readRDS(file = "./object/kmeans.cluster.rds")
cluster_result <- fviz_cluster(kmeans.cluster, # 分群結果
data = k_dtm, # 資料
geom = c("point","text"), # 點和標籤 (point & label)
frame.type = "norm")
saveRDS(object = cluster_result, file = "./object/cluster_result.rds")
cluster_result <- readRDS(file = "./object/cluster_result.rds")
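Before reading the plot, the raw cluster sizes give a quick sense of how unbalanced the 13 groups are; a minimal sketch (not in the original):

kmeans.cluster$size # number of documents in each cluster
head(sort(kmeans.cluster$centers[1, ], decreasing = TRUE), 5) # heaviest terms in cluster 1's centroid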
cluster_result
# Get article URLs
# Posts
CLp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*越界.*"))) %>%
  .[, "artUrl"]
# Comments
CLm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*越界.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
CLp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% CLp_artUrl)
CLp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% CLp_artUrl)
# Tokenized comments
CLm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% CLm_artUrl)
CLm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% CLm_artUrl)
# Get article URLs
# Posts
GPp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*女力.*"))) %>%
  .[, "artUrl"]
# Comments
GPm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*女力.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
GPp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% GPp_artUrl)
GPp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% GPp_artUrl)
# Tokenized comments
GPm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% GPm_artUrl)
GPm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% GPm_artUrl)
# Get article URLs
# Posts
TDp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*圈套.*"))) %>%
  .[, "artUrl"]
# Comments
TDm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*圈套.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
TDp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% TDp_artUrl)
TDp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% TDp_artUrl)
# Tokenized comments
TDm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% TDm_artUrl)
TDm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% TDm_artUrl)
# Get article URLs
# Posts
MODCp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*那一天.*"))) %>%
  .[, "artUrl"]
# Comments
MODCm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*那一天.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
MODCp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% MODCp_artUrl)
MODCp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% MODCp_artUrl)
# Tokenized comments
MODCm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% MODCm_artUrl)
MODCm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% MODCm_artUrl)
# Get article URLs
# Posts
SODp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*想見你.*"))) %>%
  .[, "artUrl"]
# Comments
SODm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*想見你.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
SODp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% SODp_artUrl)
SODp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% SODp_artUrl)
# Tokenized comments
SODm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% SODm_artUrl)
SODm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% SODm_artUrl)
# Get article URLs
# Posts
GDp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*魂囚西門.*"))) %>%
  .[, "artUrl"]
# Comments
GDm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*魂囚西門.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
GDp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% GDp_artUrl)
GDp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% GDp_artUrl)
# Tokenized comments
GDm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% GDm_artUrl)
GDm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% GDm_artUrl)
# Get article URLs
# Posts
BIp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*最佳利益.*"))) %>%
  .[, "artUrl"]
# Comments
BIm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*最佳利益.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
BIp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% BIp_artUrl)
BIp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% BIp_artUrl)
# Tokenized comments
BIm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% BIm_artUrl)
BIm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% BIm_artUrl)
# Get article URLs
# Posts
TTVp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*忠孝.*"))) %>%
  .[, "artUrl"]
# Comments
TTVm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*忠孝.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
TTVp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% TTVp_artUrl)
TTVp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% TTVp_artUrl)
# Tokenized comments
TTVm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% TTVm_artUrl)
TTVm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% TTVm_artUrl)
# Get article URLs
# Posts
TMOWp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*俗女.*"))) %>%
  .[, "artUrl"]
# Comments
TMOWm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*俗女.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
TMOWp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% TMOWp_artUrl)
TMOWp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% TMOWp_artUrl)
# Tokenized comments
TMOWm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% TMOWm_artUrl)
TMOWm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% TMOWm_artUrl)
# Get article URLs
# Posts
ELp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*天堂.*"))) %>%
  .[, "artUrl"]
# Comments
ELm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*天堂.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
ELp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% ELp_artUrl)
ELp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% ELp_artUrl)
# Tokenized comments
ELm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% ELm_artUrl)
ELm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% ELm_artUrl)
# Get article URLs
# Posts
HAp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*大學.*"))) %>%
  .[, "artUrl"]
# Comments
HAm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*大學.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
HAp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% HAp_artUrl)
HAp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% HAp_artUrl)
# Tokenized comments
HAm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% HAm_artUrl)
HAm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% HAm_artUrl)
# Get article URLs
# Posts
TWBUp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*與惡.*"))) %>%
  .[, "artUrl"]
# Comments
TWBUm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*與惡.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
TWBUp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% TWBUp_artUrl)
TWBUp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% TWBUp_artUrl)
# Tokenized comments
TWBUm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% TWBUm_artUrl)
TWBUm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% TWBUm_artUrl)
# Get article URLs
# Posts
BWGMp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*朋友.*"))) %>%
  .[, "artUrl"]
# Comments
BWGMm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*朋友.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
BWGMp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% BWGMp_artUrl)
BWGMp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% BWGMp_artUrl)
# Tokenized comments
BWGMm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% BWGMm_artUrl)
BWGMm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% BWGMm_artUrl)
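The thirteen blocks above are identical apart from the regex and the variable prefix; a hypothetical helper (the names get_drama_tokens, p, p2, m, m2 are mine, not in the original) could replace them:

get_drama_tokens <- function(pattern) {
  p_url <- PTT_T2019p %>% filter(str_detect(artTitle, pattern)) %>% .[, "artUrl"]
  m_url <- PTT_T2019m %>% filter(str_detect(artTitle, pattern)) %>% .[, "artUrl"]
  list(p  = PTT_T2019p_tokens  %>% filter(artUrl %in% p_url),
       p2 = PTT_T2019p_tokens2 %>% filter(artUrl %in% p_url),
       m  = PTT_T2019m_tokens  %>% filter(artUrl %in% m_url),
       m2 = PTT_T2019m_tokens2 %>% filter(artUrl %in% m_url))
}
# e.g. CL <- get_drama_tokens("越界"); CL$p is equivalent to CLp_tokens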
# Posts
addLabel <- function(tokens, label){
tokens %>%
select(-c(artDate, commentNum)) %>%
mutate(title = label)
}
## including actor, drama, and character names
drama_T2019p_tokens <- rbind(
addLabel(CLp_tokens, "越界"),
addLabel(GPp_tokens, "女兵日記女力報到"),
addLabel(TDp_tokens, "圈套"),
addLabel(MODCp_tokens, "那一天"),
addLabel(SODp_tokens, "想見你"),
addLabel(GDp_tokens, "魂囚西門"),
addLabel(BIp_tokens, "最佳利益"),
addLabel(TTVp_tokens, "忠孝節義"),
addLabel(TMOWp_tokens, "俗女養成記"),
addLabel(ELp_tokens, "天堂的微笑"),
addLabel(HAp_tokens, "你有念大學嗎"),
addLabel(TWBUp_tokens, "我們與惡的距離"),
addLabel(BWGMp_tokens, "我們不能是朋友")
)
## token counts per drama
table(drama_T2019p_tokens$title)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 8746 13431 5637 26168
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 79976 17723 4776 7438
## 我們與惡的距離 想見你 越界 忠孝節義
## 77926 15650 12751 8645
## 最佳利益
## 9733
## with actor, drama, and character names removed
drama_T2019p_tokens2 <- rbind(
addLabel(CLp_tokens2, "越界"),
addLabel(GPp_tokens2, "女兵日記女力報到"),
addLabel(TDp_tokens2, "圈套"),
addLabel(MODCp_tokens2, "那一天"),
addLabel(SODp_tokens2, "想見你"),
addLabel(GDp_tokens2, "魂囚西門"),
addLabel(BIp_tokens2, "最佳利益"),
addLabel(TTVp_tokens2, "忠孝節義"),
addLabel(TMOWp_tokens2, "俗女養成記"),
addLabel(ELp_tokens2, "天堂的微笑"),
addLabel(HAp_tokens2, "你有念大學嗎"),
addLabel(TWBUp_tokens2, "我們與惡的距離"),
addLabel(BWGMp_tokens2, "我們不能是朋友")
)
## token counts per drama
table(drama_T2019p_tokens2$title)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 8296 12657 5153 23125
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 72251 16865 4548 6965
## 我們與惡的距離 想見你 越界 忠孝節義
## 74655 14096 11598 8246
## 最佳利益
## 9482
# Comments
addLabel_m <- function(tokens, label){
tokens %>%
select(-c(artDate, commentStatus, commentDate)) %>%
mutate(title = label)
}
## including actor, drama, and character names
drama_T2019m_tokens <- rbind(
addLabel_m(CLm_tokens, "越界"),
addLabel_m(GPm_tokens, "女兵日記女力報到"),
addLabel_m(TDm_tokens, "圈套"),
addLabel_m(MODCm_tokens, "那一天"),
addLabel_m(SODm_tokens, "想見你"),
addLabel_m(GDm_tokens, "魂囚西門"),
addLabel_m(BIm_tokens, "最佳利益"),
addLabel_m(TTVm_tokens, "忠孝節義"),
addLabel_m(TMOWm_tokens, "俗女養成記"),
addLabel_m(ELm_tokens, "天堂的微笑"),
addLabel_m(HAm_tokens, "你有念大學嗎"),
addLabel_m(TWBUm_tokens, "我們與惡的距離"),
addLabel_m(BWGMm_tokens, "我們不能是朋友")
)
## token counts per drama
table(drama_T2019m_tokens$title)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 11792 31397 21883 80451
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 136996 29143 4387 17167
## 我們與惡的距離 想見你 越界 忠孝節義
## 115493 12968 4917 107191
## 最佳利益
## 30416
drama_T2019p_tokens_wct <- drama_T2019p_tokens %>%
count(word, title, sort = T) %>%
  rename(count = n)
drama_T2019p_tokens_dtm <- drama_T2019p_tokens_wct %>%
cast_dtm(title, word, count)
drama_T2019p_tokens_dtm## <<DocumentTermMatrix (documents: 13, terms: 38738)>>
## Non-/sparse entries: 78453/425141
## Sparsity : 84%
## Maximal term length: 9
## Weighting : term frequency (tf)
drama_T2019p_tokens_dtm_mat <- as.matrix(drama_T2019p_tokens_dtm)
drama_T2019p_tokens_dtm_dist <- proxy::dist(drama_T2019p_tokens_dtm_mat,
                                            method = "cosine")
threshold <- 0.80
drama_T2019p_tokens_dtm_dist_cut <- as.matrix(drama_T2019p_tokens_dtm_dist)
drama_T2019p_tokens_dtm_dist_cut[1, ]
## 圈套 我們與惡的距離 女兵日記女力報到 想見你
## 0.0000000 0.7494374 0.8802272 0.8656336
## 俗女養成記 越界 你有念大學嗎 那一天
## 0.8230855 0.8102735 0.9332545 0.7510238
## 我們不能是朋友 魂囚西門 忠孝節義 最佳利益
## 0.8181402 0.8309096 0.9020382 0.8253760
## 天堂的微笑
## 0.8710326
for (row_num in 1:dim(drama_T2019p_tokens_dtm_dist_cut)[1]) {
  for (col_num in 1:dim(drama_T2019p_tokens_dtm_dist_cut)[2]) {
    ## Convert distance to an edge weight: distances >= threshold get weight 0;
    ## closer (smaller) distances get larger weights (2 * threshold - dist).
    dist <- drama_T2019p_tokens_dtm_dist_cut[row_num, col_num]
    weight <- ifelse(dist >= threshold, 0, dist)
    weight <- abs(weight - threshold)
    weight <- ifelse(weight < threshold, weight + threshold, 0)
    drama_T2019p_tokens_dtm_dist_cut[row_num, col_num] <- weight
  }
}
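For reference, the same distance-to-weight mapping can be written as one vectorized expression (a sketch, equivalent to the loop above for this threshold; w is my name):

w <- as.matrix(drama_T2019p_tokens_dtm_dist)
w <- ifelse(w > 0 & w < threshold, 2 * threshold - w, 0) # dist in (0, threshold) -> 2*threshold - dist, else 0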
# Build the graph from the weight matrix
g <- graph.adjacency(drama_T2019p_tokens_dtm_dist_cut, weighted = T, mode = "undirected")
# Remove self-loops
g <- simplify(g)
# Set each vertex's label and degree
V(g)$label <- row.names(drama_T2019p_tokens_dtm_dist_cut)
V(g)$degree <- degree(g)
# Draw the drama relation graph
set.seed(2020)
par(family = "STKaiti", cex = 1)
layout1 <- layout.kamada.kawai(g)
# Define how vertices and edges are rendered
V(g)$label.cex <- 2.0 * V(g)$degree / max(V(g)$degree) + 0.5 # max is 2.5; scale titles by degree
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight) + .4) / max(log(E(g)$weight) + .4) # edge size
E(g)$color <- rgb(.5, .9, 0, egam)
E(g)$width <- egam * 4
# Plot the network
plot(g, layout = layout1, main = "Drama relation graph -- Post 0.8")

Dramas with similar plots become linked: the male lead of "魂囚西門" is a psychiatric patient and the show involves counseling, which connects it to "我們與惡的距離", where 應思聰 suffers from schizophrenia and 宋喬平 works as a counselor.
Distances are converted to weights: the larger the original distance, the smaller the weight, and any distance above 0.72 becomes weight 0.
threshold <- 0.72
drama_T2019p_tokens_dtm_dist_cut <- as.matrix(drama_T2019p_tokens_dtm_dist)
drama_T2019p_tokens_dtm_dist_cut[1, ]
## 圈套 我們與惡的距離 女兵日記女力報到 想見你
## 0.0000000 0.7494374 0.8802272 0.8656336
## 俗女養成記 越界 你有念大學嗎 那一天
## 0.8230855 0.8102735 0.9332545 0.7510238
## 我們不能是朋友 魂囚西門 忠孝節義 最佳利益
## 0.8181402 0.8309096 0.9020382 0.8253760
## 天堂的微笑
## 0.8710326
for (row_num in 1:dim(drama_T2019p_tokens_dtm_dist_cut)[1]) {
  for (col_num in 1:dim(drama_T2019p_tokens_dtm_dist_cut)[2]) {
    ## Convert distance to an edge weight: distances >= threshold get weight 0;
    ## closer (smaller) distances get larger weights (2 * threshold - dist).
    dist <- drama_T2019p_tokens_dtm_dist_cut[row_num, col_num]
    weight <- ifelse(dist >= threshold, 0, dist)
    weight <- abs(weight - threshold)
    weight <- ifelse(weight < threshold, weight + threshold, 0)
    drama_T2019p_tokens_dtm_dist_cut[row_num, col_num] <- weight
  }
}
## Build the graph from the weight matrix
g <- graph.adjacency(drama_T2019p_tokens_dtm_dist_cut, weighted = T, mode = "undirected")
## Remove self-loops
g <- simplify(g)
## Set each vertex's label and degree
V(g)$label <- row.names(drama_T2019p_tokens_dtm_dist_cut)
V(g)$degree <- degree(g)
## Draw the drama relation graph
set.seed(2020)
par(family = "STKaiti", cex = 1)
layout1 <- layout.kamada.kawai(g)
## Define how vertices and edges are rendered
V(g)$label.cex <- 2.0 * V(g)$degree / max(V(g)$degree) + 0.5 # max is 2.5; scale titles by degree
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight) + .4) / max(log(E(g)$weight) + .4) # edge size
E(g)$color <- rgb(.5, .9, 0, egam)
E(g)$width <- egam * 4
## Plot the network
plot(g, layout = layout1, main = "Drama relation graph -- Post 0.72")
drama_T2019m_tokens_wct <- drama_T2019m_tokens %>%
count(word, title, sort = T) %>%
  rename(count = n)
drama_T2019m_tokens_dtm <- drama_T2019m_tokens_wct %>%
cast_dtm(title, word, count)
drama_T2019m_tokens_dtm## <<DocumentTermMatrix (documents: 13, terms: 69293)>>
## Non-/sparse entries: 136120/764689
## Sparsity : 85%
## Maximal term length: 9
## Weighting : term frequency (tf)
drama_T2019m_tokens_dtm_mat <- as.matrix(drama_T2019m_tokens_dtm)
drama_T2019m_tokens_dtm_dist <- proxy::dist(drama_T2019m_tokens_dtm_mat,
                                            method = "cosine")
threshold <- 0.72
drama_T2019m_tokens_dtm_dist_cut <- as.matrix(drama_T2019m_tokens_dtm_dist)
drama_T2019m_tokens_dtm_dist_cut[1, ]
## 圈套 女兵日記女力報到 我們與惡的距離 忠孝節義
## 0.0000000 0.8095248 0.7137306 0.7696144
## 最佳利益 俗女養成記 想見你 你有念大學嗎
## 0.8057282 0.7672023 0.8882045 0.7832867
## 我們不能是朋友 那一天 魂囚西門 天堂的微笑
## 0.7412544 0.5807103 0.7960706 0.8358653
## 越界
## 0.4926713
for (row_num in 1:dim(drama_T2019m_tokens_dtm_dist_cut)[1]) {
  for (col_num in 1:dim(drama_T2019m_tokens_dtm_dist_cut)[2]) {
    ## Convert distance to an edge weight: distances >= threshold get weight 0;
    ## closer (smaller) distances get larger weights (2 * threshold - dist).
    dist <- drama_T2019m_tokens_dtm_dist_cut[row_num, col_num]
    weight <- ifelse(dist >= threshold, 0, dist)
    weight <- abs(weight - threshold)
    weight <- ifelse(weight < threshold, weight + threshold, 0)
    drama_T2019m_tokens_dtm_dist_cut[row_num, col_num] <- weight
  }
}
drama_T2019m_tokens_dtm_dist_cut[1, ]
## 圈套 女兵日記女力報到 我們與惡的距離 忠孝節義
## 0.0000000 0.0000000 0.7262694 0.0000000
## 最佳利益 俗女養成記 想見你 你有念大學嗎
## 0.0000000 0.0000000 0.0000000 0.0000000
## 我們不能是朋友 那一天 魂囚西門 天堂的微笑
## 0.0000000 0.8592897 0.0000000 0.0000000
## 越界
## 0.9473287
## Build the graph from the weight matrix
g <- graph.adjacency(drama_T2019m_tokens_dtm_dist_cut, weighted = T, mode = "undirected")
## Remove self-loops
g <- simplify(g)
## Set each vertex's label and degree
V(g)$label <- row.names(drama_T2019m_tokens_dtm_dist_cut)
V(g)$degree <- degree(g)
## Draw the drama relation graph
set.seed(2020)
par(family = "STKaiti", cex = 1)
layout1 <- layout.kamada.kawai(g)
## Define how vertices and edges are rendered
V(g)$label.cex <- 2.0 * V(g)$degree / max(V(g)$degree) + 0.5 # max is 2.5; scale titles by degree
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight) + .4) / max(log(E(g)$weight) + .4) # edge size
E(g)$color <- rgb(.5, .9, 0, egam)
E(g)$width <- egam * 4
## Plot the network
plot(g, layout = layout1, main = "Drama relation graph -- Message 0.72")
drama_T2019p_tokens_10 <- drama_T2019p_tokens %>%
  group_by(title, word) %>%
  filter(n() > 10) %>% # keep only words appearing more than 10 times
  ungroup()
drama_T2019p_tokens_10 %>%
count(title, word, sort = TRUE) %>%
group_by(title) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(x = reorder(word, n), y = n, fill = title)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
facet_wrap(~ title, scales = "free") +
scale_y_continuous(expand = c(0, 0)) +
labs(
x = NULL, y = "Word count",
title = "Most frequent words after removing stop words"
) +
theme(text = element_text(family = "Heiti TC Light"))較少為戲劇內容,但核心主題仍可顯示,如 “魂囚西門” 的「心理」、“天堂的微笑” 的「孩子」、“我們不能是朋友” 的「感情」、“我們與惡的距離” 的「媒體」與「社會」、“想見你” 的「穿越」等。
Convert the data into a DocumentTermMatrix (DTM) with one row per artUrl.
artUrl_dtm <- drama_T2019p_tokens_10 %>%
count(artUrl, word) %>%
cast_dtm(artUrl, word, n)
dim(artUrl_dtm)
## [1] 1296 2538
artUrl_dtm <- as.data.frame(as.matrix(artUrl_dtm))
artUrl_dtm$title[rownames(artUrl_dtm) %in% CLp_artUrl] <- "越界"
artUrl_dtm$title[rownames(artUrl_dtm) %in% GPp_artUrl] <- "女兵日記女力報到"
artUrl_dtm$title[rownames(artUrl_dtm) %in% TDp_artUrl] <- "圈套"
artUrl_dtm$title[rownames(artUrl_dtm) %in% MODCp_artUrl] <- "那一天"
artUrl_dtm$title[rownames(artUrl_dtm) %in% SODp_artUrl] <- "想見你"
artUrl_dtm$title[rownames(artUrl_dtm) %in% GDp_artUrl] <- "魂囚西門"
artUrl_dtm$title[rownames(artUrl_dtm) %in% BIp_artUrl] <- "最佳利益"
artUrl_dtm$title[rownames(artUrl_dtm) %in% TTVp_artUrl] <- "忠孝節義"
artUrl_dtm$title[rownames(artUrl_dtm) %in% TMOWp_artUrl] <- "俗女養成記"
artUrl_dtm$title[rownames(artUrl_dtm) %in% ELp_artUrl] <- "天堂的微笑"
artUrl_dtm$title[rownames(artUrl_dtm) %in% HAp_artUrl] <- "你有念大學嗎"
artUrl_dtm$title[rownames(artUrl_dtm) %in% TWBUp_artUrl] <- "我們與惡的距離"
artUrl_dtm$title[rownames(artUrl_dtm) %in% BWGMp_artUrl] <- "我們不能是朋友"
artUrl_dtm$title <- as.factor(artUrl_dtm$title)
table(artUrl_dtm$title) # class counts (the call was not echoed in the original)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 28 55 53 302
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 165 61 36 44
## 我們與惡的距離 想見你 越界 忠孝節義
## 346 65 16 80
## 最佳利益
## 45
set.seed(2020)
spl <- sample.split(artUrl_dtm$title, 0.7) # stratified 70/30 train/test split, preserving title proportions
artUrl_dtm_train <- subset(artUrl_dtm, spl == TRUE)
artUrl_dtm_test <- subset(artUrl_dtm, spl == FALSE)
artUrl_dtm_svm <- svm(title ~ ., artUrl_dtm_train, kernel = "linear", cost = 10, scale = F)
artUrl_dtm_svm.p <- predict(artUrl_dtm_svm, artUrl_dtm_test)
# classification matrix
table(artUrl_dtm_test$title, artUrl_dtm_svm.p)
## artUrl_dtm_svm.p
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 2 0 0 1 0
## 那一天 1 13 0 0 1
## 你有念大學嗎 0 0 15 0 0
## 女兵日記女力報到 1 0 0 89 0
## 圈套 3 0 0 4 36
## 俗女養成記 0 0 0 1 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 1 0
## 我們與惡的距離 1 1 0 1 2
## 想見你 0 0 0 0 1
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 1 1
## artUrl_dtm_svm.p
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 1 0 1 2 0
## 那一天 0 0 1 1 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 0 0 1 0
## 圈套 0 0 2 5 0
## 俗女養成記 17 0 0 0 0
## 天堂的微笑 0 9 0 0 0
## 我們不能是朋友 0 0 11 1 0
## 我們與惡的距離 0 0 2 95 0
## 想見你 0 0 0 0 18
## 越界 0 0 0 0 0
## 忠孝節義 1 0 0 0 0
## 最佳利益 1 0 1 3 0
## artUrl_dtm_svm.p
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 1
## 那一天 0 0 0
## 你有念大學嗎 0 0 1
## 女兵日記女力報到 0 0 0
## 圈套 0 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 2
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 2
## 想見你 0 0 0
## 越界 5 0 0
## 忠孝節義 0 23 0
## 最佳利益 0 0 7
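The misclassification rate printed below comes from a chunk that was not echoed; consistent with the summary table at the end of this section, it equals:

mean(artUrl_dtm_test$title != artUrl_dtm_svm.p) # share of test posts assigned to the wrong drama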
## [1] 0.1282051
artUrl_dtm_rf <- ranger(title ~ ., data = artUrl_dtm_train, num.trees = 100)
artUrl_dtm_rf.p <- predict(artUrl_dtm_rf, artUrl_dtm_test)$predictions
# classification matrix
table(artUrl_dtm_test$title, artUrl_dtm_rf.p)
## artUrl_dtm_rf.p
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 4 0 0 0 0
## 那一天 0 15 0 0 0
## 你有念大學嗎 0 0 16 0 0
## 女兵日記女力報到 0 0 0 91 0
## 圈套 0 0 0 0 44
## 俗女養成記 0 0 0 0 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 0
## 我們與惡的距離 0 0 0 0 0
## 想見你 0 0 0 0 0
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 0
## artUrl_dtm_rf.p
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 0 0 0 4 0
## 那一天 0 0 0 2 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 0 0 0 0
## 圈套 0 0 1 4 0
## 俗女養成記 17 0 0 1 0
## 天堂的微笑 0 9 0 2 0
## 我們不能是朋友 0 0 11 2 0
## 我們與惡的距離 0 0 0 103 0
## 想見你 0 0 0 0 19
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 7 0
## artUrl_dtm_rf.p
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 0
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 0 0 0
## 圈套 1 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 1
## 想見你 0 0 0
## 越界 5 0 0
## 忠孝節義 0 24 0
## 最佳利益 0 0 7
## [1] 0.06410256
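The outputs that follow also come from non-echoed chunks and drill into individual random-forest errors; they are plausibly along the lines of (the variable name err_idx is mine):

err_idx <- which(artUrl_dtm_test$title != artUrl_dtm_rf.p) # positions of misclassified test posts
err_idx
artUrl_dtm_rf.p[err_idx[1]]               # predicted label of the first error
which(artUrl_dtm_test[err_idx[1], ] != 0) # that document's nonzero term columns
artUrl_dtm_test$title[err_idx[1]]         # its true label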
## [1] 19 25 55 120 122 149 183 196 226 230 234 239 240 244 245 254 256 258 259
## [20] 293 312 317 332 347 374
## [1] 圈套
## 13 Levels: 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套 ... 最佳利益
## 一起 劇情 編劇 這對 導演 期待 戲劇 演員 主角 發展 事件
## 14 41 60 77 89 112 124 134 142 190 244
## 火花 劇組 團隊 第一集 偶像劇 看看 想像 以前 還沒 戲中 自然
## 358 419 465 574 695 770 815 827 1031 1108 1167
## 後面 場景 橋段 畫面 不錯 就讓 臉紅 配角 三集 是非 手銬
## 1252 1293 1360 1442 1725 1726 1727 1728 1729 1730 1731
## 戲都 心跳 預期 title
## 1732 1733 1734 2539
## [1] 我們不能是朋友
## 13 Levels: 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套 ... 最佳利益
# Convert label factor to an integer class starting at 0, it's a requirement for XGBoost
title <- artUrl_dtm$title
label <- as.integer(artUrl_dtm$title) - 1
# Split the data for training and testing
artUrl_dtm_train.data <- as.matrix(artUrl_dtm_train[, -which(colnames(artUrl_dtm_train) == "title")])
artUrl_dtm_test.data <- as.matrix(artUrl_dtm_test[, -which(colnames(artUrl_dtm_test) == "title")])
# Transform the two datasets into xgb.DMatrix objects
xgb.artUrl_dtm_train <- xgb.DMatrix(data = artUrl_dtm_train.data, label = label[spl])
xgb.artUrl_dtm_test <- xgb.DMatrix(data = artUrl_dtm_test.data, label = label[!spl])
# Define the parameters for multinomial classification
num_class <- length(levels(title))
params <- list(
booster = "gbtree",
eta = 0.001,
max_depth = 5,
gamma = 3,
subsample = 0.75,
colsample_bytree = 1,
objective = "multi:softprob",
eval_metric = "mlogloss",
num_class = num_class
)
# Train the XGBoost classifier
artUrl_dtm_xgb <- xgb.train(
params = params,
data = xgb.artUrl_dtm_train,
nrounds = 30,
nthreads = 1,
early_stopping_rounds = 10,
watchlist = list(val1 = xgb.artUrl_dtm_train, val2 = xgb.artUrl_dtm_test),
verbose = 0
)
saveRDS(object = artUrl_dtm_xgb, file = "./object/artUrl_dtm_xgb.rds")
artUrl_dtm_xgb <- readRDS(file = "./object/artUrl_dtm_xgb.rds")
# Predict outcomes with the test data
artUrl_dtm_xgb.p <- predict(artUrl_dtm_xgb, artUrl_dtm_test.data, reshape = T) %>%
as.data.frame()
colnames(artUrl_dtm_xgb.p) <- levels(title)
# Use the predicted label with the highest probability
artUrl_dtm_xgb.p$prediction <- apply(artUrl_dtm_xgb.p, 1, function(x) colnames(artUrl_dtm_xgb.p)[which.max(x)])
# classification matrix
table(artUrl_dtm_test$title, artUrl_dtm_xgb.p$prediction)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 4 0 0 0 0
## 那一天 0 15 0 0 1
## 你有念大學嗎 0 0 15 0 0
## 女兵日記女力報到 0 0 0 90 0
## 圈套 0 0 0 1 44
## 俗女養成記 0 0 0 0 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 0
## 我們與惡的距離 0 0 0 0 0
## 想見你 0 0 0 0 0
## 越界 0 0 0 0 1
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 0
##
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 1 0 0 3 0
## 那一天 0 0 0 1 0
## 你有念大學嗎 0 0 0 1 0
## 女兵日記女力報到 0 0 0 1 0
## 圈套 0 0 0 4 0
## 俗女養成記 16 0 0 2 0
## 天堂的微笑 0 10 0 0 1
## 我們不能是朋友 0 0 9 4 0
## 我們與惡的距離 0 0 0 104 0
## 想見你 0 0 0 0 19
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 6 0
##
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 0
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 0 0 0
## 圈套 1 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 0
## 想見你 0 0 0
## 越界 4 0 0
## 忠孝節義 0 24 0
## 最佳利益 0 0 8
## [1] 0.07179487
xgb.importance(artUrl_dtm_xgb, feature_names = colnames(xgb.artUrl_dtm_train)) %>%
top_n(20) %>%
ggplot(., aes(reorder(Feature, Gain), y = Gain, fill = (Feature))) +
geom_bar(stat = "identity", show.legend = FALSE) +
theme(text = element_text(size = 20)) +
coord_flip() +
xlab(' ') +
ylab('特徵重要性') +
theme(text = element_text(family = "Heiti TC Light"))## Selecting by Frequency
data.frame(model = c("SVM", "Random Forest", "Xgboost"),
misclassification_rate = c(mean(artUrl_dtm_test$title != artUrl_dtm_svm.p),
mean(artUrl_dtm_test$title != artUrl_dtm_rf.p),
                                       mean(artUrl_dtm_test$title != artUrl_dtm_xgb.p$prediction)))
## model misclassification_rate
## 1 SVM 0.12820513
## 2 Random Forest 0.06410256
## 3 Xgboost 0.07179487
drama_T2019p_tokens2 <- drama_T2019p_tokens2 %>%
group_by(title, word) %>%
  filter(n() > 10) %>% # keep only words appearing more than 10 times
  ungroup()
drama_T2019p_tokens2 %>%
count(title, word, sort = TRUE) %>%
group_by(title) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(x = reorder(word, n), y = n, fill = title)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
facet_wrap(~ title, scales = "free") +
scale_y_continuous(expand = c(0, 0)) +
labs(
x = NULL, y = "Word count",
title = "Most frequent words after removing stop words"
) +
theme(text = element_text(family = "Heiti TC Light"))移除演員名、劇名、角色名後,出現較多與戲劇內容本身相關的詞彙,如 “俗女養成記” 的「媽媽」與「女性」、“想見你” 的「喜歡」與「班長」、“越界” 的「排球」等。
Convert the data into a DocumentTermMatrix (DTM) with one row per artUrl.
artUrl_dtm2 <- drama_T2019p_tokens2 %>%
count(artUrl, word) %>%
cast_dtm(artUrl, word, n)
dim(artUrl_dtm2)
## [1] 1293 2412
artUrl_dtm2 <- as.data.frame(as.matrix(artUrl_dtm2))
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% CLp_artUrl] <- "越界"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% GPp_artUrl] <- "女兵日記女力報到"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% TDp_artUrl] <- "圈套"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% MODCp_artUrl] <- "那一天"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% SODp_artUrl] <- "想見你"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% GDp_artUrl] <- "魂囚西門"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% BIp_artUrl] <- "最佳利益"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% TTVp_artUrl] <- "忠孝節義"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% TMOWp_artUrl] <- "俗女養成記"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% ELp_artUrl] <- "天堂的微笑"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% HAp_artUrl] <- "你有念大學嗎"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% TWBUp_artUrl] <- "我們與惡的距離"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% BWGMp_artUrl] <- "我們不能是朋友"
artUrl_dtm2$title <- as.factor(artUrl_dtm2$title)
table(artUrl_dtm2$title) # class counts (the call was not echoed in the original)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 28 55 53 302
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 165 60 35 43
## 我們與惡的距離 想見你 越界 忠孝節義
## 346 65 16 80
## 最佳利益
## 45
set.seed(2020)
spl <- sample.split(artUrl_dtm2$title, 0.7) # stratified 70/30 train/test split, preserving title proportions
artUrl_dtm2_train <- subset(artUrl_dtm2, spl == TRUE)
artUrl_dtm2_test <- subset(artUrl_dtm2, spl == FALSE)
artUrl_dtm2_svm <- svm(title ~ ., artUrl_dtm2_train, kernel = "linear", cost = 10, scale = F)
artUrl_dtm2_svm.p <- predict(artUrl_dtm2_svm, artUrl_dtm2_test)
# classification matrix
table(artUrl_dtm2_test$title, artUrl_dtm2_svm.p)
## artUrl_dtm2_svm.p
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 5 0 0 0 1
## 那一天 0 16 1 0 0
## 你有念大學嗎 0 0 16 0 0
## 女兵日記女力報到 0 0 0 89 0
## 圈套 0 1 0 2 28
## 俗女養成記 0 0 0 1 1
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 2
## 我們與惡的距離 0 0 0 4 5
## 想見你 0 0 0 0 1
## 越界 0 0 0 1 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 1 1 0
## artUrl_dtm2_svm.p
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 0 0 1 0 0
## 那一天 0 0 0 0 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 0 0 1 0
## 圈套 0 0 0 12 5
## 俗女養成記 15 0 0 0 0
## 天堂的微笑 0 10 0 0 1
## 我們不能是朋友 0 0 10 1 0
## 我們與惡的距離 0 1 2 88 2
## 想見你 0 0 0 1 17
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 1
## artUrl_dtm2_svm.p
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 1
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 1 0 0
## 圈套 1 0 1
## 俗女養成記 0 0 1
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 2
## 想見你 0 0 0
## 越界 4 0 0
## 忠孝節義 0 22 2
## 最佳利益 0 0 11
## [1] 0.1512821
artUrl_dtm2_rf <- ranger(title ~ ., data = artUrl_dtm2_train, num.trees = 100)
artUrl_dtm2_rf.p <- predict(artUrl_dtm2_rf, artUrl_dtm2_test)$predictions
# classification matrix
table(artUrl_dtm2_test$title, artUrl_dtm2_rf.p)
## artUrl_dtm2_rf.p
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 5 0 0 0 0
## 那一天 0 15 0 0 1
## 你有念大學嗎 0 0 16 0 0
## 女兵日記女力報到 0 0 0 90 0
## 圈套 0 0 0 0 40
## 俗女養成記 0 0 0 0 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 0
## 我們與惡的距離 0 0 0 0 0
## 想見你 0 0 0 0 0
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 0
## artUrl_dtm2_rf.p
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 1 0 0 2 0
## 那一天 0 0 0 1 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 0 0 1 0
## 圈套 0 0 1 9 0
## 俗女養成記 16 0 1 1 0
## 天堂的微笑 0 10 0 1 0
## 我們不能是朋友 0 0 10 3 0
## 我們與惡的距離 0 0 0 104 0
## 想見你 0 0 0 2 17
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 1 0
## 最佳利益 0 0 0 2 0
## artUrl_dtm2_rf.p
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 0
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 0 0 0
## 圈套 0 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 0
## 想見你 0 0 0
## 越界 5 0 0
## 忠孝節義 0 23 0
## 最佳利益 0 0 12
## [1] 0.06923077
## [1] 1 24 41 120 165 183 197 208 211 221 223 224 238 241 244 259 266 267 274
## [20] 277 289 312 332 337 348 365 389
## [1] 圈套
## 13 Levels: 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套 ... 最佳利益
## 劇情 今天 無法 演出 好好 再次 種種 不斷 情緒 電話 挑戰 演技 提到
## 33 53 72 119 174 201 202 229 262 274 308 616 647
## 不算 處理 那種 尤其 有點 爆發 放在 複雜 經歷 思考 心得 這裡 強大
## 670 678 723 773 774 806 829 833 856 887 985 1044 1077
## 注意 合理 瞬間 崩潰 後面 算是 生氣 死亡 不錯 眼睛 段落 緊張 幾次
## 1096 1118 1134 1160 1182 1262 1348 1492 1631 1704 1743 1887 1937
## 養父 街頭 外面 星期 中槍 阿志 兩次 開車 早餐 自首 傷心 title
## 2036 2039 2075 2095 2097 2117 2138 2154 2159 2162 2170 2413
## [1] 我們與惡的距離
## 13 Levels: 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套 ... 最佳利益
# Convert label factor to an integer class starting at 0, it's a requirement for XGBoost
title <- artUrl_dtm2$title
label <- as.integer(artUrl_dtm2$title) - 1
# Split the data for training and testing
artUrl_dtm2_train.data <- as.matrix(artUrl_dtm2_train[, -which(colnames(artUrl_dtm2_train) == "title")])
artUrl_dtm2_test.data <- as.matrix(artUrl_dtm2_test[, -which(colnames(artUrl_dtm2_test) == "title")])
# Transform the two datasets into xgb.DMatrix objects
xgb.artUrl_dtm2_train <- xgb.DMatrix(data = artUrl_dtm2_train.data, label = label[spl])
xgb.artUrl_dtm2_test <- xgb.DMatrix(data = artUrl_dtm2_test.data, label = label[!spl])
# Define the parameters for multinomial classification
num_class <- length(levels(title))
params <- list(
booster = "gbtree",
eta = 0.001,
max_depth = 5,
gamma = 3,
subsample = 0.75,
colsample_bytree = 1,
objective = "multi:softprob",
eval_metric = "mlogloss",
num_class = num_class
)
# Train the XGBoost classifier
artUrl_dtm2_xgb <- xgb.train(
params = params,
data = xgb.artUrl_dtm2_train,
nrounds = 30,
nthreads = 1,
early_stopping_rounds = 10,
watchlist = list(val1 = xgb.artUrl_dtm2_train, val2 = xgb.artUrl_dtm2_test),
verbose = 0
)
saveRDS(object = artUrl_dtm2_xgb, file = "./object/artUrl_dtm2_xgb.rds")
artUrl_dtm2_xgb <- readRDS(file = "./object/artUrl_dtm2_xgb.rds")
# Predict outcomes with the test data
artUrl_dtm2_xgb.p <- predict(artUrl_dtm2_xgb, artUrl_dtm2_test.data, reshape = T) %>%
as.data.frame()
colnames(artUrl_dtm2_xgb.p) <- levels(title)
# Use the predicted label with the highest probability
artUrl_dtm2_xgb.p$prediction <- apply(artUrl_dtm2_xgb.p, 1, function(x) colnames(artUrl_dtm2_xgb.p)[which.max(x)])
# classification matrix
table(artUrl_dtm2_test$title, artUrl_dtm2_xgb.p$prediction)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 6 0 0 0 0
## 那一天 0 11 0 0 0
## 你有念大學嗎 0 0 16 0 0
## 女兵日記女力報到 0 0 0 90 0
## 圈套 0 0 0 0 39
## 俗女養成記 0 0 0 0 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 0
## 我們與惡的距離 0 0 0 8 2
## 想見你 0 0 0 0 0
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 0
##
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 1 0 0 1 0
## 那一天 0 0 0 6 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 1 0 0 0
## 圈套 0 0 0 11 0
## 俗女養成記 13 0 1 4 0
## 天堂的微笑 0 8 0 3 0
## 我們不能是朋友 0 0 7 6 0
## 我們與惡的距離 0 1 0 93 0
## 想見你 0 0 0 3 16
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 1 0
## 最佳利益 0 0 0 3 0
##
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 0
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 0 0 0
## 圈套 0 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 0
## 想見你 0 0 0
## 越界 5 0 0
## 忠孝節義 0 23 0
## 最佳利益 0 0 11
## [1] 0.1333333
xgb.importance(artUrl_dtm2_xgb, feature_names = colnames(xgb.artUrl_dtm2_train)) %>%
top_n(20) %>%
ggplot(., aes(reorder(Feature, Gain), y = Gain, fill = (Feature))) +
geom_bar(stat = "identity", show.legend = FALSE) +
theme(text = element_text(size = 20)) +
coord_flip() +
xlab(' ') +
ylab('特徵重要性') +
theme(text = element_text(family = "Heiti TC Light"))## Selecting by Frequency
data.frame(model = c("SVM", "Random Forest", "Xgboost", "SVM 2", "Random Forest 2", "Xgboost 2"),
misclassification_rate = c(mean(artUrl_dtm_test$title != artUrl_dtm_svm.p),
mean(artUrl_dtm_test$title != artUrl_dtm_rf.p),
mean(artUrl_dtm_test$title != artUrl_dtm_xgb.p$prediction),
mean(artUrl_dtm2_test$title != artUrl_dtm2_svm.p),
mean(artUrl_dtm2_test$title != artUrl_dtm2_rf.p),
                                       mean(artUrl_dtm2_test$title != artUrl_dtm2_xgb.p$prediction)))
## model misclassification_rate
## 1 SVM 0.12820513
## 2 Random Forest 0.06410256
## 3 Xgboost 0.07179487
## 4 SVM 2 0.15128205
## 5 Random Forest 2 0.06923077
## 6 Xgboost 2 0.13333333
PTT_T2019p_tokens_30 <- PTT_T2019p_tokens %>%
group_by(word) %>%
  filter(n() > 30) %>% # keep only words appearing more than 30 times
  ungroup()
# the DTM dimensions and the commentNum summary below come from non-echoed chunks
## [1] 2529 3049
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 12.00 35.00 84.23 88.00 1497.00
dtm <- as.data.frame(as.matrix(dtm))
# Add an is_pop column to the post data: articles with more than 200 comments
# (well above the 3rd-quartile commentNum of 88 shown above) count as popular
PTT_T2019p$is_pop <- as.factor(ifelse(PTT_T2019p$commentNum > 200, 1, 0))
# artUrls of the popular posts
pop_artUrl <- PTT_T2019p %>%
filter(is_pop == 1) %>%
.[, "artUrl"]##
## 0 1
## 2260 269
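dtm_train and dtm_test are first used below without an echoed definition; presumably the label was attached and the data split the same way as in the TF-IDF section later, e.g.:

dtm$is_pop <- as.factor(ifelse(rownames(dtm) %in% pop_artUrl, 1, 0)) # mirrors the dtm.tfidf section below
set.seed(2020)
spl <- sample.split(dtm$is_pop, 0.7) # stratified 70/30 split
dtm_train <- subset(dtm, spl == TRUE)
dtm_test <- subset(dtm, spl == FALSE)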
dtm_glm <- glm(is_pop ~ ., dtm_train, family = "binomial") # is_pop as y, every other column as a predictor
saveRDS(object = dtm_glm, file = "./object/dtm_glm.rds")
# Fitting produced warnings: "glm.fit: algorithm did not converge" and
# "glm.fit: fitted probabilities numerically 0 or 1 occurred"
dtm_glm <- readRDS(file = "./object/dtm_glm.rds")
dtm_glm.p <- predict(dtm_glm, dtm_test, type = "response") # probability that is_pop == 1
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 350 42
## 1 328 39
##
## Accuracy : 0.5125
## 95% CI : (0.4763, 0.5486)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 1
##
## Kappa : -9e-04
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.48148
## Specificity : 0.51622
## Pos Pred Value : 0.10627
## Neg Pred Value : 0.89286
## Prevalence : 0.10672
## Detection Rate : 0.05138
## Detection Prevalence : 0.48353
## Balanced Accuracy : 0.49885
##
## 'Positive' Class : 1
##
## [1] 367
## Estimate Std. Error z value Pr(>|z|) term
## (Intercept) -3.967874e+15 9.267254e+07 -42816076.24 0 (Intercept)
## 愛情 -1.752723e+26 3.995613e+20 -438661.74 0 愛情
## 比較 -3.643805e+26 1.003368e+21 -363157.27 0 比較
## 表現 2.098304e+27 2.357696e+21 889980.90 0 表現
## 不想 3.178277e+27 4.132703e+21 769055.42 0 不想
## 不像 5.812396e+26 7.608526e+21 76393.19 0 不像
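The coef object displayed above is built in a non-echoed chunk; a reconstruction consistent with the columns shown would be something like:

coef <- summary(dtm_glm)$coefficients %>%
  as.data.frame() %>%
  tibble::rownames_to_column("term") # hypothetical; the original prints term as the last column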
coef %>%
  group_by(Estimate > 0) %>% # two groups: positive vs. non-positive estimates
  top_n(10, abs(Estimate)) %>% # ten largest by absolute value
ungroup() %>%
ggplot(aes(reorder(term, Estimate), Estimate, fill = Estimate > 0)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
labs(
x = NULL,
title = "Coefficients that increase/decrease odds ratio of is_pop the most") +
theme(text = element_text(family = "Heiti TC Light"))## [1] TRUE
# Posts: one artUrl per row
PTT_T2019p %>%
  filter(artUrl %in% (row.names(dtm_test))) %>%
  mutate(pred = dtm_glm.p) %>% # safe to mutate directly because the document order of dtm_test and pred matches (TRUE above)
  filter(pred < 0.5, is_pop == 1) %>%
  top_n(-10, pred) %>% # top_n(-10, col) takes the rows with the smallest col values
  .[, c("artTitle", "artUrl")] %>% # show only artTitle and artUrl to keep the output short
  head
## artTitle
## 1 [LIVE]2018公視新創電影疑霧公堂公視日十
## 2 [LIVE]女兵日記女力報到EP39
## 3 [LIVE]艾蜜麗的五件事第8集(台視六十)
## 4 [LIVE]必勝大丈夫第13集(三立華八)
## 5 [LIVE]必勝大丈夫第19集(三立華八)
## 6 [LIVE]你有念大學嗎?第五集(台視日十)
## artUrl
## 1 https://www.ptt.cc/bbs/TaiwanDrama/M.1546776725.A.799.html
## 2 https://www.ptt.cc/bbs/TaiwanDrama/M.1546946352.A.978.html
## 3 https://www.ptt.cc/bbs/TaiwanDrama/M.1547300772.A.E36.html
## 4 https://www.ptt.cc/bbs/TaiwanDrama/M.1548330013.A.713.html
## 5 https://www.ptt.cc/bbs/TaiwanDrama/M.1549022013.A.994.html
## 6 https://www.ptt.cc/bbs/TaiwanDrama/M.1549805947.A.A17.html
PTT_T2019p %>%
filter(artUrl %in% (row.names(dtm_test))) %>%
mutate(pred = dtm_glm.p) %>%
  filter(pred > 0.5, is_pop != 1) %>%
top_n(-10, pred) %>%
.[, c("artTitle", "artUrl")] %>% # 因為內容過多,只顯示 artTitle 與 artUrl
head## artTitle
## 1 [LIVE]種菜女神EP15-16
## 2 [新聞]嚴藝文處女執導《俗女養成記》謝盈萱挑
## 3 [新聞]福斯原創影集第二部明晚,空中見(非原標
## 4 [心得]女力報到別再打臉了
## 5 [LIVE]必勝大丈夫第四集(三立華八)
## 6 [雙城]雙城故事旅行的意義第16、17集隨便聊
## artUrl
## 1 https://www.ptt.cc/bbs/TaiwanDrama/M.1546610106.A.630.html
## 2 https://www.ptt.cc/bbs/TaiwanDrama/M.1546612054.A.AB4.html
## 3 https://www.ptt.cc/bbs/TaiwanDrama/M.1547037266.A.EF3.html
## 4 https://www.ptt.cc/bbs/TaiwanDrama/M.1547192235.A.F03.html
## 5 https://www.ptt.cc/bbs/TaiwanDrama/M.1547206299.A.146.html
## 6 https://www.ptt.cc/bbs/TaiwanDrama/M.1547292228.A.68E.html
dtm_svm <- svm(is_pop ~ ., dtm_train, kernel = "linear", cost = 10, scale = F)
saveRDS(object = dtm_svm, file = "./object/dtm_svm.rds")
dtm_svm <- readRDS(file = "./object/dtm_svm.rds")
dtm_svm.p <- predict(dtm_svm, dtm_test)
confusionMatrix(dtm_svm.p,
dtm_test$is_pop,
dnn = c("Prediction", "Reference"),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 630 32
## 1 48 49
##
## Accuracy : 0.8946
## 95% CI : (0.8705, 0.9155)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.48270
##
## Kappa : 0.4914
##
## Mcnemar's Test P-Value : 0.09353
##
## Sensitivity : 0.60494
## Specificity : 0.92920
## Pos Pred Value : 0.50515
## Neg Pred Value : 0.95166
## Prevalence : 0.10672
## Detection Rate : 0.06456
## Detection Prevalence : 0.12780
## Balanced Accuracy : 0.76707
##
## 'Positive' Class : 1
##
dtm_rp <- rpart(is_pop ~ ., dtm_train, method = "class")
saveRDS(object = dtm_rp, file = "./object/dtm_rp.rds")
dtm_rp <- readRDS(file = "./object/dtm_rp.rds")
dtm_rp.p <- predict(dtm_rp, dtm_test)[, 2]
# predict(dtm_rp, dtm_test) returns probabilities for class 0 and class 1; keep only P(is_pop == 1)
confusionMatrix(factor(ifelse(dtm_rp.p >= 0.5, 1, 0)),
factor(dtm_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 663 42
## 1 15 39
##
## Accuracy : 0.9249
## 95% CI : (0.9038, 0.9426)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.0019787
##
## Kappa : 0.5384
##
## Mcnemar's Test P-Value : 0.0005736
##
## Sensitivity : 0.48148
## Specificity : 0.97788
## Pos Pred Value : 0.72222
## Neg Pred Value : 0.94043
## Prevalence : 0.10672
## Detection Rate : 0.05138
## Detection Prevalence : 0.07115
## Balanced Accuracy : 0.72968
##
## 'Positive' Class : 1
##
prp(dtm_rp,               # the fitted tree
    faclen = 0,           # do not abbreviate variable names
    fallen.leaves = TRUE, # put the leaves at the bottom
    shadow.col = "gray",  # shade the leaf nodes
    extra = 2)            # show correct classifications / observations per node
dtm_rf <- randomForest(is_pop ~ ., dtm_train,
                       method = "class", importance = TRUE, proximity = TRUE)
saveRDS(object = dtm_rf, file = "./object/dtm_rf.rds")
##
## Call:
## randomForest(formula = is_pop ~ ., data = dtm_train, method = "class", importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 55
##
## OOB estimate of error rate: 5.93%
## Confusion matrix:
## 0 1 class.error
## 0 1572 10 0.006321113
## 1 95 93 0.505319149
dtm_rf.p <- predict(dtm_rf, dtm_test, type = "prob")[, 2]
confusionMatrix(factor(ifelse(dtm_rf.p >= 0.5, 1, 0)),
factor(dtm_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 674 43
## 1 4 38
##
## Accuracy : 0.9381
## 95% CI : (0.9185, 0.9541)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 1.240e-05
##
## Kappa : 0.5878
##
## Mcnemar's Test P-Value : 2.976e-08
##
## Sensitivity : 0.46914
## Specificity : 0.99410
## Pos Pred Value : 0.90476
## Neg Pred Value : 0.94003
## Prevalence : 0.10672
## Detection Rate : 0.05007
## Detection Prevalence : 0.05534
## Balanced Accuracy : 0.73162
##
## 'Positive' Class : 1
##
colAUC(cbind(glm = dtm_glm.p,
svm = dtm_svm.p,
rpart = dtm_rp.p,
rf = dtm_rf.p),
       dtm_test$is_pop, T)
## glm svm rpart rf
## 0 vs. 1 0.5011472 0.7670709 0.7847336 0.8193762
# Compute each word's term frequency within each document
document_words <- PTT_T2019p_tokens_30 %>%
count(artUrl, word, sort = TRUE)
total_words <- document_words %>%
group_by(artUrl) %>%
summarize(total = sum(n)) %>%
right_join(document_words) %>%
  mutate(is_pop = as.factor(ifelse(artUrl %in% pop_artUrl, 1, 0)))
## Joining, by = "artUrl"
total_words <- total_words %>%
bind_tf_idf(word, artUrl, n)
# pick the ten words with the highest tf-idf in each article
tfidf_top10 <- total_words %>%
filter(nchar(word) > 1) %>%
group_by(artUrl)%>%
top_n(10, tf_idf) %>%
arrange(desc(artUrl))
# Visualization (too slow to run)
# tfidf_top10 %>%
# ggplot(aes(word, tf_idf, fill = artUrl)) +
# geom_col(show.legend = FALSE) +
# labs(x = NULL, y = "tf-idf") +
# facet_wrap(~artUrl, ncol = 2, scales = "free") +
#   coord_flip()
## [1] 2529 2976
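dtm.tfidf (its dimensions are printed above) is created in a non-echoed chunk; a TF-IDF-weighted DTM consistent with it could be built like this (the exact filtering that yields 2976 terms is not shown, so the nchar filter is an assumption):

dtm.tfidf <- PTT_T2019p_tokens_30 %>%
  filter(nchar(word) > 1) %>% # assumed, matching the tfidf_top10 filter above
  count(artUrl, word) %>%
  cast_dtm(artUrl, word, n, weighting = tm::weightTfIdf)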
dtm.tfidf <- as.data.frame(as.matrix(dtm.tfidf))
dtm.tfidf$is_pop <- as.factor(ifelse(rownames(dtm.tfidf) %in% pop_artUrl, 1, 0))
table(dtm.tfidf$is_pop)
##
## 0 1
## 2260 269
dtm.tfidf_glm <- glm(is_pop ~ ., dtm.tfidf_train, family = "binomial")
saveRDS(object = dtm.tfidf_glm, file = "./object/dtm.tfidf_glm.rds")
dtm.tfidf_glm <- readRDS(file = "./object/dtm.tfidf_glm.rds")
dtm.tfidf_glm.p <- predict(dtm.tfidf_glm, dtm.tfidf_test, type = "response") # probability that is_pop == 1
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
confusionMatrix(factor(ifelse(dtm.tfidf_glm.p >= 0.5, 1, 0)),
factor(dtm.tfidf_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 355 45
## 1 323 36
##
## Accuracy : 0.5152
## 95% CI : (0.4789, 0.5512)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0127
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.44444
## Specificity : 0.52360
## Pos Pred Value : 0.10028
## Neg Pred Value : 0.88750
## Prevalence : 0.10672
## Detection Rate : 0.04743
## Detection Prevalence : 0.47299
## Balanced Accuracy : 0.48402
##
## 'Positive' Class : 1
##
dtm.tfidf_svm <- svm(is_pop ~ ., dtm.tfidf_train, kernel = "linear", cost = 10, scale = F)
dtm.tfidf_svm.p <- predict(dtm.tfidf_svm, dtm.tfidf_test)
confusionMatrix(factor(dtm.tfidf_svm.p),
factor(dtm.tfidf_test$is_pop),
dnn = c("Prediction", "Reference"),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 667 47
## 1 11 34
##
## Accuracy : 0.9236
## 95% CI : (0.9023, 0.9415)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.002962
##
## Kappa : 0.5017
##
## Mcnemar's Test P-Value : 4.312e-06
##
## Sensitivity : 0.41975
## Specificity : 0.98378
## Pos Pred Value : 0.75556
## Neg Pred Value : 0.93417
## Prevalence : 0.10672
## Detection Rate : 0.04480
## Detection Prevalence : 0.05929
## Balanced Accuracy : 0.70176
##
## 'Positive' Class : 1
##
dtm.tfidf_rp <- rpart(is_pop ~ ., dtm.tfidf_train, method = "class")
saveRDS(object = dtm.tfidf_rp, file = "./object/dtm.tfidf_rp.rds")
dtm.tfidf_rp <- readRDS(file = "./object/dtm.tfidf_rp.rds")
dtm.tfidf_rp.p <- predict(dtm.tfidf_rp, dtm.tfidf_test)[, 2]
# predict() returns probabilities for class 0 and class 1; keep only P(is_pop == 1)
confusionMatrix(factor(ifelse(dtm.tfidf_rp.p >= 0.5, 1, 0)),
factor(dtm.tfidf_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 666 55
## 1 12 26
##
## Accuracy : 0.9117
## 95% CI : (0.8892, 0.9309)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.05338
##
## Kappa : 0.3958
##
## Mcnemar's Test P-Value : 2.88e-07
##
## Sensitivity : 0.32099
## Specificity : 0.98230
## Pos Pred Value : 0.68421
## Neg Pred Value : 0.92372
## Prevalence : 0.10672
## Detection Rate : 0.03426
## Detection Prevalence : 0.05007
## Balanced Accuracy : 0.65164
##
## 'Positive' Class : 1
##
prp(dtm.tfidf_rp,         # the fitted tree
    faclen = 0,           # do not abbreviate variable names
    fallen.leaves = TRUE, # put the leaves at the bottom
    shadow.col = "gray",  # shade the leaf nodes
    extra = 2)            # show correct classifications / observations per node
dtm.tfidf_rf <- randomForest(is_pop ~ ., dtm.tfidf_train,
                             method = "class", importance = TRUE,
                             proximity = TRUE)
saveRDS(object = dtm.tfidf_rf, file = "./object/dtm.tfidf_rf.rds")
##
## Call:
## randomForest(formula = is_pop ~ ., data = dtm.tfidf_train, method = "class", importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 54
##
## OOB estimate of error rate: 6.27%
## Confusion matrix:
## 0 1 class.error
## 0 1571 11 0.006953224
## 1 100 88 0.531914894
dtm.tfidf_rf.p <- predict(dtm.tfidf_rf, dtm.tfidf_test, type = "prob")[, 2]
confusionMatrix(factor(ifelse(dtm.tfidf_rf.p >= 0.5, 1, 0)),
factor(dtm.tfidf_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 672 47
## 1 6 34
##
## Accuracy : 0.9302
## 95% CI : (0.9097, 0.9473)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.0003287
##
## Kappa : 0.5287
##
## Mcnemar's Test P-Value : 3.92e-08
##
## Sensitivity : 0.4198
## Specificity : 0.9912
## Pos Pred Value : 0.8500
## Neg Pred Value : 0.9346
## Prevalence : 0.1067
## Detection Rate : 0.0448
## Detection Prevalence : 0.0527
## Balanced Accuracy : 0.7055
##
## 'Positive' Class : 1
##
colAUC(cbind(glm.tfidf = dtm.tfidf_glm.p,
svm.tfidf = dtm.tfidf_svm.p,
rpart.tfidf = dtm.tfidf_rp.p,
rf.tfidf = dtm.tfidf_rf.p),
       dtm.tfidf_test$is_pop, T)
## glm.tfidf svm.tfidf rpart.tfidf rf.tfidf
## 0 vs. 1 0.5159784 0.7017644 0.6642995 0.7820387
colAUC(cbind(glm = dtm_glm.p,
svm = dtm_svm.p,
rpart = dtm_rp.p,
rf = dtm_rf.p,
glm.tfidf = dtm.tfidf_glm.p,
svm.tfidf = dtm.tfidf_svm.p,
rpart.tfidf = dtm.tfidf_rp.p,
rf.tfidf = dtm.tfidf_rf.p),
       dtm_test$is_pop, T)

In terms of clustering the text, the LDA results show each topic's discussion content far more clearly, mainly because posts contain many actor, drama, and character names that tie documents together into topics. Although only one topic still showed a clear single-drama signature after those names were removed, LDA remains easier to read than K-means, whose groupings are hard to explain; with K-means, about all one can do is look among the outliers for posts whose discussion differs from the rest.
In the network graphs built from posts and comments, dramas become linked when they share similar plot elements in the discussion, and the effect is stronger for comments, probably because comments contain a larger and more diverse vocabulary and therefore connect to other dramas more easily.
Comparing drama classifiers 1 and 2, removing actor, drama, and character names raised every model's misclassification rate, and those names dominate the top of the feature-importance charts, so they appear to be the key signals for identifying a given drama.
All three classification models (SVM, Random Forest, XGBoost) perform well; most of their errors occur where two dramas share similar plot elements.
For predicting popular posts, the models' sensitivity is poor, possibly because there are too few positive examples for them to learn good rules from.
Neither input representation is uniformly better: Random Forest classified better on TF-IDF in our runs, while SVM and the decision tree did better on plain TF, which comes down to each model's assumptions and inference method.