As technology advances, online streaming platforms let drama fans enjoy a show at their own convenience even when they miss the premiere, and international OTT platforms are adept at using big data to analyze viewing habits, recently applying it not only to targeted marketing and developing potential audiences but also to early-stage production. We therefore crawl drama-related data from PTT to analyze what its users discussed about Taiwanese dramas over the past year. We build a drama classifier, use several models to examine how discussion topics cluster and how accurately each model identifies a given drama, and, in addition, define an article's popularity by its number of comments and compare each model's behavior on plain term-frequency DTM versus TF-IDF inputs.
## [1] "LC_CTYPE=zh_TW.UTF-8;LC_NUMERIC=C;LC_TIME=zh_TW.UTF-8;LC_COLLATE=zh_TW.UTF-8;LC_MONETARY=zh_TW.UTF-8;LC_MESSAGES=en_US.UTF-8;LC_PAPER=en_US.UTF-8;LC_NAME=C;LC_ADDRESS=C;LC_TELEPHONE=C;LC_MEASUREMENT=en_US.UTF-8;LC_IDENTIFICATION=C"
library(caret)
library(caTools)
library(data.table)
library(dplyr)
library(e1071)
library(factoextra)
library(ggplot2)
library(igraph)
library(jiebaR)
library(purrr)
library(randomForest)
library(ranger)
library(rpart)
library(rpart.plot)
library(stringr)
library(tidytext)
library(tm)
library(topicmodels)
library(xgboost)
# PTT TaiwanDrama 2019 posts
PTT_T2019p <- fread("./data/TaiwanDrama2019_articleMetaData.csv", encoding = "UTF-8", data.table = F) %>% select(-artTime, -artCat)
# PTT TaiwanDrama 2019 comments (message)
PTT_T2019m <- fread("./data/TaiwanDrama2019_articleMetaData_message.csv", encoding = "UTF-8", data.table = F) %>% select(-artTime, -artCat, -commentPoster)
# Convert dates from "chr" to "Date"
PTT_T2019p$artDate <- PTT_T2019p$artDate %>% as.Date("%Y/%m/%d")
PTT_T2019m$artDate <- PTT_T2019m$artDate %>% as.Date("%Y/%m/%d")
PTT_T2019m$commentDate <- PTT_T2019m$commentDate %>% substring(1, 10) %>% as.Date()
# Remove English letters and digits from posts and comments
PTT_T2019p$sentence <- gsub("[0-9a-zA-Z]", "", PTT_T2019p$sentence)
PTT_T2019m$commentContent <- gsub("[0-9a-zA-Z]", "", PTT_T2019m$commentContent)
jieba_tokenizer <- worker(user = "./dict/drama2019_dict.txt", stop_word = "./dict/stop_words.txt")
jieba_tokenizer$write = "NOFILE" # run this line only if an error occurs
drama_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
# drop tokens of length 1
tokens <- tokens[nchar(tokens) > 1]
return(tokens)
})
}
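As a quick sanity check, the tokenizer can be applied to a single string; a minimal sketch (the input sentence is hypothetical, and the actual segmentation depends on the dictionary files):

drama_tokenizer("想見你的劇情討論")
# returns a list with one character vector of tokens, with single-character tokens dropped
# (illustrative only; output depends on ./dict/drama2019_dict.txt and ./dict/stop_words.txt)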
# Posts
PTT_T2019p_tokens <- tibble(artDate = PTT_T2019p$artDate,
artUrl = PTT_T2019p$artUrl,
artPoster = PTT_T2019p$artPoster,
sentence = PTT_T2019p$sentence,
commentNum = PTT_T2019p$commentNum) %>%
unnest_tokens(word, sentence, token = drama_tokenizer)
# Comments
PTT_T2019m_tokens <- tibble(artDate = PTT_T2019m$artDate,
artUrl = PTT_T2019m$artUrl,
artPoster = PTT_T2019m$artPoster,
commentStatus = PTT_T2019m$commentStatus,
commentDate = PTT_T2019m$commentDate,
commentContent = PTT_T2019m$commentContent) %>%
  unnest_tokens(word, commentContent, token = drama_tokenizer)
synonym <- function(i)
{
i <- recode(i,
# Director, writer, and actor names
# 女兵日記女力報到
宣榕 = "李宣榕",
曜晟 = "林曜晟",
謙文 = "陳謙文",
雅筑 = "楊雅筑",
舒涵 = "梁舒涵",
語昕 = "方語昕",
瀚名 = "梁瀚名",
建予 = "林建予",
鯰魚哥 = "林建予",
鯰魚 = "林建予",
彥凱 = "尹彥凱",
小白 = "游小白",
樂妍 = "王樂妍",
# 圈套
鈞浩 = "徐鈞浩",
承洋 = "吳承洋",
慶華 = "卞慶華",
廷軒 = "陳廷軒",
意箴 = "林意箴",
春誠 = "丁春誠",
賢治 = "梅賢治",
家逵 = "陳家逵",
# 噬罪者
凱勛 = "莊凱勛",
晏豪 = "曹晏豪",
于喬 = "夏于喬",
淑臻 = "蔡淑臻",
子熙 = "林子熙",
# 罪夢者
孝全 = "張孝全",
曉萱 = "范曉萱",
柏傑 = "王柏傑",
# 想見你
佳嬿 = "柯佳嬿",
光漢 = "許光漢",
柏宇 = "施柏宇",
毓麟 = "顏毓麟",
藝文 = "嚴藝文",
# 雙城故事
怡蓉 = "陳怡蓉",
珮瑜 = "曾珮瑜",
柏鈞 = "黃柏鈞",
# 種菜女神
以豪 = "劉以豪",
庭妮 = "陳庭妮",
千娜 = "李千娜",
# 魂囚西門
敬騰 = "蕭敬騰",
碧婷 = "郭碧婷",
宜農 = "鄭宜農",
盈萱 = "謝盈萱",
葦華 = "藍葦華",
# 忠孝節義
麗花 = "楊麗花",
亞蘭 = "陳亞蘭",
# 通靈少女2
書瑤 = "郭書瑤",
貞菱 = "温貞菱",
少勳 = "范少勳",
# 我是顧家男
健瑋 = "黃健瑋",
謹華 = "楊謹華",
佳見 = "謝佳見",
敬宣 = "陳敬宣",
景嵐 = "張景嵐",
宇辰 = "邱宇辰",
# 一千個晚安
棟樑 = "張棟樑",
俞涵 = "連俞涵",
宗霖 = "李宗霖",
#愛寗 = "姚愛寗",
# 如果愛重來
書豪 = "張書豪",
# 生死接線員
少宗 = "曾少宗",
倩妏 = "劉倩妏",
承矩 = "單承矩",
震瀚 = "邱震瀚",
# 月村歡迎你
坤達 = "謝坤達",
思宇 = "林思宇",
念軒 = "吳念軒",
佳豫 = "李佳豫",
# 用九柑仔店
軒睿 = "張軒睿",
允雯 = "莫允雯",
# 天堂的微笑
杰楷 = "修杰楷",
予晞 = "林予晞",
振剛 = "唐振剛",
志友 = "方志友",
尚禾 = "黃尚禾",
# 你有念大學嗎
心亞 = "安心亞",
浩辰 = "禾浩辰",
宇辰 = "邱宇辰",
品潔 = "吳品潔",
# 靈異街11號
國毅 = "李國毅",
嫚書 = "簡嫚書",
# 我們與惡的距離
君陽 = "林君陽",
蒔媛 = "呂蒔媛",
十元 = "呂蒔媛",
靜雯 = "賈靜雯",
昇豪 = "溫昇豪",
慷仁 = "吳慷仁",
采詩 = "周采詩",
都拉斯 = "洪都拉斯",
可元 = "王可元",
沛慈 = "曾沛慈",
哲熹 = "林哲熹",
名帥 = "施名帥",
予晞 = "林予晞",
瓊煖 = "謝瓊煖",
卉喬 = "于卉喬",
#喬喬 = "于卉喬",
# 我們不能是朋友
以豪 = "劉以豪",
雪芙 = "郭雪芙",
其君 = "孫其君",
若妍 = "夏若妍",
艾菲 = "袁艾菲",
# 你那邊怎樣我這邊OK
正龍 = "藍正龍",
心湄 = "藍心湄",
西平 = "曹西平",
之喬 = "曾之喬",
冠廷 = "劉冠廷",
岳擎 = "吳岳擎",
泂江 = "陳泂江",
俊雄 = "黃俊雄",
# 劇名 & 角色名
# 越界
子軒 = "邱子軒",
宇豪 = "夏宇豪",
振文 = "王振文",
振武 = "王振武",
# 女兵日記女力報到
女力報到 = "女兵日記女力報到",
女力 = "女兵日記女力報到",
志強 = "宋志強",
朵拉 = "潘朵拉",
學文 = "周學文",
松齡 = "郭松齡",
曉嫻 = "高曉嫻",
淑靜 = "李淑靜",
素娥 = "葉素娥",
嫚莉 = "黃嫚莉",
雅芝 = "趙雅芝",
國樑 = "林國樑",
佑廷 = "邱佑廷",
寬亮 = "何寬亮",
光興 = "鄭光興",
大房 = "馬大房",
佳佳 = "張佳佳",
喬青 = "方喬青",
# 圈套
少飛 = "孟少飛",
飛飛 = "孟少飛",
立安 = "趙立安",
趙子 = "趙立安",
紅葉 = "左紅葉",
道一 = "古道一",
至德 = "李至德",
阿德 = "李至德",
文浩 = "陳文浩",
國棟 = "唐國棟",
# 那一天
豪廷 = "項豪廷",
希顧 = "于希顧",
志剛 = "盧志剛",
博翔 = "孫博翔",
# 想見你
雨萱 = "黃雨萱",
韻如 = "陳韻如",
子維 = "李子維",
詮勝 = "王詮勝",
俊傑 = "莫俊傑",
芝齊 = "謝芝齊",
宗儒 = "謝宗儒",
文磊 = "吳文磊",
思源 = "陳思源",
# 雙城故事
雙城 = "雙城故事",
念念 = "李念念",
天明 = "鄧天明",
# 魂囚西門
松言 = "魏松言",
# 最佳利益
方律 = "方箏",
博昀 = "陳博昀",
立廷 = "趙立廷",
# 美味滿閣
子堯 = "王子堯",
以茜 = "方以茜",
子文 = "王子文",
芳蓉 = "何芳蓉",
# 愛情白皮書
掛居 = "歐陽掛居",
成美 = "袁成美",
守治 = "瞿守治",
星華 = "季星華",
松岡 = "趙松岡",
康澤 = "沈康澤",
# 我是顧家男
顧家 = "我是顧家男",
家南 = "顧家南",
雙雙 = "徐雙雙",
博仁 = "顏博仁",
小敏 = "哈小敏",
心怡 = "傅心怡",
# 一千個晚安
柏森 = "吳柏森",
天雨 = "戴天雨",
# 如果愛重來
如果愛 = "如果愛重來",
大樂 = "汪大樂",
詠琪 = "何詠琪",
琪琪 = "何詠琪",
士元 = "王士元",
于茜 = "洪于茜",
# 月村歡迎你
月村 = "月村歡迎你",
國建 = "畢國建",
阿建 = "畢國建",
月禮 = "南月禮",
小禮 = "南月禮",
威海 = "馬威海",
# 俗女養成記
俗女 = "俗女養成記",
嘉玲 = "陳嘉玲",
# 用九柑仔店
用九 = "用九柑仔店",
俊龍 = "楊俊龍",
昭君 = "陳昭君",
# 天堂的微笑
又翔 = "孫又翔",
承舒 = "趙承舒",
宇則 = "何宇則",
令悠 = "楊令悠",
向東 = "程向東",
# 你有念大學嗎
大學 = "你有念大學嗎",
可艾 = "常可艾",
子浩 = "陽子浩",
小剛 = "蔡小剛",
子傑 = "梁子傑",
貞怡 = "簡貞怡",
# 靈異街11號
靈異街 = "靈異街11號",
志海 = "高志海",
阿海 = "高志海",
# 我們與惡的距離
與惡 = "我們與惡的距離",
喬安 = "宋喬安",
昭國 = "劉昭國",
天彥 = "劉天彥",
#天晴 = "劉天晴",
美媚 = "丁美媚",
曉明 = "李曉明",
曉文 = "李大芝", # "李曉文" 改名 "李大芝"
大芝 = "李大芝",
思悅 = "應思悅",
思聰 = "應思聰",
一駿 = "林一駿",
喬平 = "宋喬平",
紐世 = "廖紐世",
news哥 = "廖紐世",
# 我們不能是朋友
朋友 = "我們不能是朋友",
克桓 = "褚克桓",
惟惟 = "周惟惟",
皓一 = "黎皓一",
子媛 = "高子媛",
可菲 = "韓可菲",
百洋 = "顏百洋",
# 烏陰天的好日子
陰天 = "烏陰天的好日子",
碧秋 = "王碧秋"
)
}
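dplyr::recode leaves unmatched values untouched, so synonym() is safe to apply to the full token vector; a quick illustration using mappings from the table above:

synonym(c("光漢", "佳嬿", "劇情"))
# [1] "許光漢" "柯佳嬿" "劇情"   ("劇情" has no mapping and passes through unchanged)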
# Posts
PTT_T2019p_tokens$word <- synonym(PTT_T2019p_tokens$word)
# Comments
PTT_T2019m_tokens$word <- synonym(PTT_T2019m_tokens$word)
saveRDS(object = PTT_T2019p_tokens, file = "./object/PTT_T2019p_tokens.rds")
saveRDS(object = PTT_T2019m_tokens, file = "./object/PTT_T2019m_tokens.rds")
PTT_T2019p_dtm <- PTT_T2019p_tokens %>%
count(artUrl, word, sort = T) %>%
rename(count = n) %>%
cast_dtm(artUrl, word, count)
PTT_T2019p_dtm## <<DocumentTermMatrix (documents: 2529, terms: 59528)>>
## Non-/sparse entries: 383702/150162610
## Sparsity : 100%
## Maximal term length: 11
## Weighting : term frequency (tf)
ldas <- c()
topics <- c(2, 3, 8, 13, 20, 36)
for (topic in topics) {
  start_time <- Sys.time()
  lda <- LDA(PTT_T2019p_dtm, k = topic, control = list(seed = 2020))
  ldas <- c(ldas, lda)
  print(paste(topic, "topic(s); elapsed:", Sys.time() - start_time))
  save(ldas, file = "./object/ldas_result") # checkpoint after each model
}
topics <- c(2, 3, 8, 13, 20, 36)
tibble(k = topics,
       perplex = map_dbl(ldas, perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
PTT_T2019p_lda <- ldas[[4]] # use the model with 13 topics
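Besides the per-topic word distributions (beta) tidied below, the per-document topic mixtures can be pulled out the same way; a minimal sketch, not run in the original:

PTT_T2019p_gamma <- tidy(PTT_T2019p_lda, matrix = "gamma") # document-topic proportions
PTT_T2019p_gamma %>% group_by(document) %>% top_n(1, gamma) # dominant topic per article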
PTT_T2019p_topics <- tidy(PTT_T2019p_lda, matrix = "beta")
PTT_T2019p_top_terms <- PTT_T2019p_topics %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
PTT_T2019p_top_terms %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered() +
theme(text = element_text(family = "Heiti TC Light"))jieba_tokenizer2 <- worker(user = "./dict/drama2019_dict2.txt", stop_word = "./dict/stop_words2.txt" )
jieba_tokenizer2$write = "NOFILE" # 若有出現錯誤再跑這行
drama_tokenizer2 <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer2)
# drop tokens of length 1
tokens <- tokens[nchar(tokens) > 1]
return(tokens)
})
}
# Posts
PTT_T2019p_tokens2 <- tibble(artDate = PTT_T2019p$artDate,
artUrl = PTT_T2019p$artUrl,
artPoster = PTT_T2019p$artPoster,
sentence = PTT_T2019p$sentence,
commentNum = PTT_T2019p$commentNum) %>%
unnest_tokens(word, sentence, token = drama_tokenizer2)
# Comments
PTT_T2019m_tokens2 <- tibble(artDate = PTT_T2019m$artDate,
artUrl = PTT_T2019m$artUrl,
artPoster = PTT_T2019m$artPoster,
commentStatus = PTT_T2019m$commentStatus,
commentDate = PTT_T2019m$commentDate,
commentContent = PTT_T2019m$commentContent) %>%
unnest_tokens(word, commentContent, token = drama_tokenizer2)
saveRDS(object = PTT_T2019p_tokens2, file = "./object/PTT_T2019p_tokens2.rds")
saveRDS(object = PTT_T2019m_tokens2, file = "./object/PTT_T2019m_tokens2.rds")PTT_T2019p_tokens2 <- readRDS(file = "./object/PTT_T2019p_tokens2.rds")
PTT_T2019m_tokens2 <- readRDS(file = "./object/PTT_T2019m_tokens2.rds")PTT_T2019p_dtm2 <- PTT_T2019p_tokens2 %>%
count(artUrl, word, sort = T) %>%
rename(count = n) %>%
cast_dtm(artUrl, word, count)
PTT_T2019p_dtm2## <<DocumentTermMatrix (documents: 2529, terms: 61829)>>
## Non-/sparse entries: 377547/155987994
## Sparsity : 100%
## Maximal term length: 9
## Weighting : term frequency (tf)
PTT_T2019p_lda2 <- readRDS(file = "./object/PTT_T2019p_lda2.rds") # LDA refit on the name-free tokens in a non-echoed chunk
PTT_T2019p_topics2 <- tidy(PTT_T2019p_lda2, matrix = "beta")
PTT_T2019p_top_terms2 <- PTT_T2019p_topics2 %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
PTT_T2019p_top_terms2 %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
coord_flip() +
scale_x_reordered() +
theme(text = element_text(family = "Heiti TC Light"))k_dtm <- PTT_T2019p_tokens %>%
count(artUrl,word) %>%
cast_dtm(artUrl, word, n)
kmeans.cluster <- kmeans(k_dtm, centers = 13)
saveRDS(object = kmeans.cluster, file = "./object/kmeans.cluster.rds")
kmeans.cluster <- readRDS(file = "./object/kmeans.cluster.rds")
cluster_result <- fviz_cluster(kmeans.cluster, # 分群結果
data = k_dtm, # 資料
geom = c("point","text"), # 點和標籤 (point & label)
frame.type = "norm")
saveRDS(object = cluster_result, file = "./object/cluster_result.rds")
cluster_result <- readRDS(file = "./object/cluster_result.rds")
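Before reading the plot, the raw cluster sizes give a quick sense of how unbalanced the 13 groups are; a minimal sketch (not in the original):

kmeans.cluster$size # number of documents in each cluster
head(sort(kmeans.cluster$centers[1, ], decreasing = TRUE), 5) # heaviest terms in cluster 1's centroid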
cluster_result
# Get article URLs
# Posts
CLp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*越界.*"))) %>%
  .[, "artUrl"]
# Comments
CLm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*越界.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
CLp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% CLp_artUrl)
CLp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% CLp_artUrl)
# Tokenized comments
CLm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% CLm_artUrl)
CLm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% CLm_artUrl)
# Get article URLs
# Posts
GPp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*女力.*"))) %>%
  .[, "artUrl"]
# Comments
GPm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*女力.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
GPp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% GPp_artUrl)
GPp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% GPp_artUrl)
# Tokenized comments
GPm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% GPm_artUrl)
GPm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% GPm_artUrl)
# Get article URLs
# Posts
TDp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*圈套.*"))) %>%
  .[, "artUrl"]
# Comments
TDm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*圈套.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
TDp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% TDp_artUrl)
TDp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% TDp_artUrl)
# Tokenized comments
TDm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% TDm_artUrl)
TDm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% TDm_artUrl)
# Get article URLs
# Posts
MODCp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*那一天.*"))) %>%
  .[, "artUrl"]
# Comments
MODCm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*那一天.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
MODCp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% MODCp_artUrl)
MODCp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% MODCp_artUrl)
# Tokenized comments
MODCm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% MODCm_artUrl)
MODCm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% MODCm_artUrl)
# Get article URLs
# Posts
SODp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*想見你.*"))) %>%
  .[, "artUrl"]
# Comments
SODm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*想見你.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
SODp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% SODp_artUrl)
SODp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% SODp_artUrl)
# Tokenized comments
SODm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% SODm_artUrl)
SODm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% SODm_artUrl)
# Get article URLs
# Posts
GDp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*魂囚西門.*"))) %>%
  .[, "artUrl"]
# Comments
GDm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*魂囚西門.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
GDp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% GDp_artUrl)
GDp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% GDp_artUrl)
# Tokenized comments
GDm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% GDm_artUrl)
GDm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% GDm_artUrl)
# Get article URLs
# Posts
BIp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*最佳利益.*"))) %>%
  .[, "artUrl"]
# Comments
BIm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*最佳利益.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
BIp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% BIp_artUrl)
BIp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% BIp_artUrl)
# Tokenized comments
BIm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% BIm_artUrl)
BIm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% BIm_artUrl)
# Get article URLs
# Posts
TTVp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*忠孝.*"))) %>%
  .[, "artUrl"]
# Comments
TTVm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*忠孝.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
TTVp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% TTVp_artUrl)
TTVp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% TTVp_artUrl)
# Tokenized comments
TTVm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% TTVm_artUrl)
TTVm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% TTVm_artUrl)
# Get article URLs
# Posts
TMOWp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*俗女.*"))) %>%
  .[, "artUrl"]
# Comments
TMOWm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*俗女.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
TMOWp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% TMOWp_artUrl)
TMOWp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% TMOWp_artUrl)
# Tokenized comments
TMOWm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% TMOWm_artUrl)
TMOWm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% TMOWm_artUrl)
# Get article URLs
# Posts
ELp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*天堂.*"))) %>%
  .[, "artUrl"]
# Comments
ELm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*天堂.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
ELp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% ELp_artUrl)
ELp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% ELp_artUrl)
# Tokenized comments
ELm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% ELm_artUrl)
ELm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% ELm_artUrl)
# Get article URLs
# Posts
HAp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*大學.*"))) %>%
  .[, "artUrl"]
# Comments
HAm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*大學.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
HAp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% HAp_artUrl)
HAp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% HAp_artUrl)
# Tokenized comments
HAm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% HAm_artUrl)
HAm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% HAm_artUrl)
# Get article URLs
# Posts
TWBUp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*與惡.*"))) %>%
  .[, "artUrl"]
# Comments
TWBUm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*與惡.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
TWBUp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% TWBUp_artUrl)
TWBUp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% TWBUp_artUrl)
# Tokenized comments
TWBUm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% TWBUm_artUrl)
TWBUm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% TWBUm_artUrl)
# Get article URLs
# Posts
BWGMp_artUrl <- PTT_T2019p %>%
  filter(str_detect(PTT_T2019p$artTitle, regex(".*朋友.*"))) %>%
  .[, "artUrl"]
# Comments
BWGMm_artUrl <- PTT_T2019m %>%
  filter(str_detect(PTT_T2019m$artTitle, regex(".*朋友.*"))) %>%
  .[, "artUrl"]
# Tokenized posts
BWGMp_tokens <- PTT_T2019p_tokens %>% filter(artUrl %in% BWGMp_artUrl)
BWGMp_tokens2 <- PTT_T2019p_tokens2 %>% filter(artUrl %in% BWGMp_artUrl)
# Tokenized comments
BWGMm_tokens <- PTT_T2019m_tokens %>% filter(artUrl %in% BWGMm_artUrl)
BWGMm_tokens2 <- PTT_T2019m_tokens2 %>% filter(artUrl %in% BWGMm_artUrl)
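The thirteen blocks above are identical apart from the regex and the variable prefix; a hypothetical helper (the names get_drama_tokens, p, p2, m, m2 are mine, not in the original) could replace them:

get_drama_tokens <- function(pattern) {
  p_url <- PTT_T2019p %>% filter(str_detect(artTitle, pattern)) %>% .[, "artUrl"]
  m_url <- PTT_T2019m %>% filter(str_detect(artTitle, pattern)) %>% .[, "artUrl"]
  list(p  = PTT_T2019p_tokens  %>% filter(artUrl %in% p_url),
       p2 = PTT_T2019p_tokens2 %>% filter(artUrl %in% p_url),
       m  = PTT_T2019m_tokens  %>% filter(artUrl %in% m_url),
       m2 = PTT_T2019m_tokens2 %>% filter(artUrl %in% m_url))
}
# e.g. CL <- get_drama_tokens("越界"); CL$p is equivalent to CLp_tokens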
# Posts
addLabel <- function(tokens, label){
tokens %>%
select(-c(artDate, commentNum)) %>%
mutate(title = label)
}
## including actor, drama, and character names
drama_T2019p_tokens <- rbind(
addLabel(CLp_tokens, "越界"),
addLabel(GPp_tokens, "女兵日記女力報到"),
addLabel(TDp_tokens, "圈套"),
addLabel(MODCp_tokens, "那一天"),
addLabel(SODp_tokens, "想見你"),
addLabel(GDp_tokens, "魂囚西門"),
addLabel(BIp_tokens, "最佳利益"),
addLabel(TTVp_tokens, "忠孝節義"),
addLabel(TMOWp_tokens, "俗女養成記"),
addLabel(ELp_tokens, "天堂的微笑"),
addLabel(HAp_tokens, "你有念大學嗎"),
addLabel(TWBUp_tokens, "我們與惡的距離"),
addLabel(BWGMp_tokens, "我們不能是朋友")
)
## token counts per drama
table(drama_T2019p_tokens$title)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 8746 13431 5637 26168
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 79976 17723 4776 7438
## 我們與惡的距離 想見你 越界 忠孝節義
## 77926 15650 12751 8645
## 最佳利益
## 9733
## with actor, drama, and character names removed
drama_T2019p_tokens2 <- rbind(
addLabel(CLp_tokens2, "越界"),
addLabel(GPp_tokens2, "女兵日記女力報到"),
addLabel(TDp_tokens2, "圈套"),
addLabel(MODCp_tokens2, "那一天"),
addLabel(SODp_tokens2, "想見你"),
addLabel(GDp_tokens2, "魂囚西門"),
addLabel(BIp_tokens2, "最佳利益"),
addLabel(TTVp_tokens2, "忠孝節義"),
addLabel(TMOWp_tokens2, "俗女養成記"),
addLabel(ELp_tokens2, "天堂的微笑"),
addLabel(HAp_tokens2, "你有念大學嗎"),
addLabel(TWBUp_tokens2, "我們與惡的距離"),
addLabel(BWGMp_tokens2, "我們不能是朋友")
)
## token counts per drama
table(drama_T2019p_tokens2$title)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 8296 12657 5153 23125
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 72251 16865 4548 6965
## 我們與惡的距離 想見你 越界 忠孝節義
## 74655 14096 11598 8246
## 最佳利益
## 9482
# Comments
addLabel_m <- function(tokens, label){
tokens %>%
select(-c(artDate, commentStatus, commentDate)) %>%
mutate(title = label)
}
## including actor, drama, and character names
drama_T2019m_tokens <- rbind(
addLabel_m(CLm_tokens, "越界"),
addLabel_m(GPm_tokens, "女兵日記女力報到"),
addLabel_m(TDm_tokens, "圈套"),
addLabel_m(MODCm_tokens, "那一天"),
addLabel_m(SODm_tokens, "想見你"),
addLabel_m(GDm_tokens, "魂囚西門"),
addLabel_m(BIm_tokens, "最佳利益"),
addLabel_m(TTVm_tokens, "忠孝節義"),
addLabel_m(TMOWm_tokens, "俗女養成記"),
addLabel_m(ELm_tokens, "天堂的微笑"),
addLabel_m(HAm_tokens, "你有念大學嗎"),
addLabel_m(TWBUm_tokens, "我們與惡的距離"),
addLabel_m(BWGMm_tokens, "我們不能是朋友")
)
## token counts per drama
table(drama_T2019m_tokens$title)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 11792 31397 21883 80451
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 136996 29143 4387 17167
## 我們與惡的距離 想見你 越界 忠孝節義
## 115493 12968 4917 107191
## 最佳利益
## 30416
drama_T2019p_tokens_wct <- drama_T2019p_tokens %>%
count(word, title, sort = T) %>%
  rename(count = n)
drama_T2019p_tokens_dtm <- drama_T2019p_tokens_wct %>%
cast_dtm(title, word, count)
drama_T2019p_tokens_dtm## <<DocumentTermMatrix (documents: 13, terms: 38738)>>
## Non-/sparse entries: 78453/425141
## Sparsity : 84%
## Maximal term length: 9
## Weighting : term frequency (tf)
drama_T2019p_tokens_dtm_mat <- as.matrix(drama_T2019p_tokens_dtm)
drama_T2019p_tokens_dtm_dist <- proxy::dist(drama_T2019p_tokens_dtm_mat,
                                            method = "cosine")
threshold <- 0.80
drama_T2019p_tokens_dtm_dist_cut <- as.matrix(drama_T2019p_tokens_dtm_dist)
drama_T2019p_tokens_dtm_dist_cut[1, ]
## 圈套 我們與惡的距離 女兵日記女力報到 想見你
## 0.0000000 0.7494374 0.8802272 0.8656336
## 俗女養成記 越界 你有念大學嗎 那一天
## 0.8230855 0.8102735 0.9332545 0.7510238
## 我們不能是朋友 魂囚西門 忠孝節義 最佳利益
## 0.8181402 0.8309096 0.9020382 0.8253760
## 天堂的微笑
## 0.8710326
for (row_num in 1:dim(drama_T2019p_tokens_dtm_dist_cut)[1]) {
  for (col_num in 1:dim(drama_T2019p_tokens_dtm_dist_cut)[2]) {
    ## Convert distance to an edge weight: distances >= threshold get weight 0;
    ## closer (smaller) distances get larger weights (2 * threshold - dist).
    dist <- drama_T2019p_tokens_dtm_dist_cut[row_num, col_num]
    weight <- ifelse(dist >= threshold, 0, dist)
    weight <- abs(weight - threshold)
    weight <- ifelse(weight < threshold, weight + threshold, 0)
    drama_T2019p_tokens_dtm_dist_cut[row_num, col_num] <- weight
  }
}
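For reference, the same distance-to-weight mapping can be written as one vectorized expression (a sketch, equivalent to the loop above for this threshold; w is my name):

w <- as.matrix(drama_T2019p_tokens_dtm_dist)
w <- ifelse(w > 0 & w < threshold, 2 * threshold - w, 0) # dist in (0, threshold) -> 2*threshold - dist, else 0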
# Build the graph from the weight matrix
g <- graph.adjacency(drama_T2019p_tokens_dtm_dist_cut, weighted = T, mode = "undirected")
# Remove self-loops
g <- simplify(g)
# Set each vertex's label and degree
V(g)$label <- row.names(drama_T2019p_tokens_dtm_dist_cut)
V(g)$degree <- degree(g)
# Draw the drama relation graph
set.seed(2020)
par(family = "STKaiti", cex = 1)
layout1 <- layout.kamada.kawai(g)
# Define how vertices and edges are rendered
V(g)$label.cex <- 2.0 * V(g)$degree / max(V(g)$degree) + 0.5 # max is 2.5; scale titles by degree
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight) + .4) / max(log(E(g)$weight) + .4) # edge size
E(g)$color <- rgb(.5, .9, 0, egam)
E(g)$width <- egam * 4
# Plot the network
plot(g, layout = layout1, main = "Drama relation graph -- Post 0.8")

Dramas with similar plots become linked: the male lead of "魂囚西門" is a psychiatric patient and the show involves counseling, which connects it to "我們與惡的距離", where 應思聰 suffers from schizophrenia and 宋喬平 works as a counselor.
Distances are converted to weights: the larger the original distance, the smaller the weight, and any distance above 0.72 becomes weight 0.
threshold <- 0.72
drama_T2019p_tokens_dtm_dist_cut <- as.matrix(drama_T2019p_tokens_dtm_dist)
drama_T2019p_tokens_dtm_dist_cut[1, ]
## 圈套 我們與惡的距離 女兵日記女力報到 想見你
## 0.0000000 0.7494374 0.8802272 0.8656336
## 俗女養成記 越界 你有念大學嗎 那一天
## 0.8230855 0.8102735 0.9332545 0.7510238
## 我們不能是朋友 魂囚西門 忠孝節義 最佳利益
## 0.8181402 0.8309096 0.9020382 0.8253760
## 天堂的微笑
## 0.8710326
for (row_num in 1:dim(drama_T2019p_tokens_dtm_dist_cut)[1]) {
  for (col_num in 1:dim(drama_T2019p_tokens_dtm_dist_cut)[2]) {
    ## Convert distance to an edge weight: distances >= threshold get weight 0;
    ## closer (smaller) distances get larger weights (2 * threshold - dist).
    dist <- drama_T2019p_tokens_dtm_dist_cut[row_num, col_num]
    weight <- ifelse(dist >= threshold, 0, dist)
    weight <- abs(weight - threshold)
    weight <- ifelse(weight < threshold, weight + threshold, 0)
    drama_T2019p_tokens_dtm_dist_cut[row_num, col_num] <- weight
  }
}
## Build the graph from the weight matrix
g <- graph.adjacency(drama_T2019p_tokens_dtm_dist_cut, weighted = T, mode = "undirected")
## Remove self-loops
g <- simplify(g)
## Set each vertex's label and degree
V(g)$label <- row.names(drama_T2019p_tokens_dtm_dist_cut)
V(g)$degree <- degree(g)
## Draw the drama relation graph
set.seed(2020)
par(family = "STKaiti", cex = 1)
layout1 <- layout.kamada.kawai(g)
## Define how vertices and edges are rendered
V(g)$label.cex <- 2.0 * V(g)$degree / max(V(g)$degree) + 0.5 # max is 2.5; scale titles by degree
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight) + .4) / max(log(E(g)$weight) + .4) # edge size
E(g)$color <- rgb(.5, .9, 0, egam)
E(g)$width <- egam * 4
## Plot the network
plot(g, layout = layout1, main = "Drama relation graph -- Post 0.72")
drama_T2019m_tokens_wct <- drama_T2019m_tokens %>%
count(word, title, sort = T) %>%
  rename(count = n)
drama_T2019m_tokens_dtm <- drama_T2019m_tokens_wct %>%
cast_dtm(title, word, count)
drama_T2019m_tokens_dtm## <<DocumentTermMatrix (documents: 13, terms: 69293)>>
## Non-/sparse entries: 136120/764689
## Sparsity : 85%
## Maximal term length: 9
## Weighting : term frequency (tf)
drama_T2019m_tokens_dtm_mat <- as.matrix(drama_T2019m_tokens_dtm)
drama_T2019m_tokens_dtm_dist <- proxy::dist(drama_T2019m_tokens_dtm_mat,
                                            method = "cosine")
threshold <- 0.72
drama_T2019m_tokens_dtm_dist_cut <- as.matrix(drama_T2019m_tokens_dtm_dist)
drama_T2019m_tokens_dtm_dist_cut[1, ]
## 圈套 女兵日記女力報到 我們與惡的距離 忠孝節義
## 0.0000000 0.8095248 0.7137306 0.7696144
## 最佳利益 俗女養成記 想見你 你有念大學嗎
## 0.8057282 0.7672023 0.8882045 0.7832867
## 我們不能是朋友 那一天 魂囚西門 天堂的微笑
## 0.7412544 0.5807103 0.7960706 0.8358653
## 越界
## 0.4926713
for (row_num in 1:dim(drama_T2019m_tokens_dtm_dist_cut)[1]) {
  for (col_num in 1:dim(drama_T2019m_tokens_dtm_dist_cut)[2]) {
    ## Convert distance to an edge weight: distances >= threshold get weight 0;
    ## closer (smaller) distances get larger weights (2 * threshold - dist).
    dist <- drama_T2019m_tokens_dtm_dist_cut[row_num, col_num]
    weight <- ifelse(dist >= threshold, 0, dist)
    weight <- abs(weight - threshold)
    weight <- ifelse(weight < threshold, weight + threshold, 0)
    drama_T2019m_tokens_dtm_dist_cut[row_num, col_num] <- weight
  }
}
drama_T2019m_tokens_dtm_dist_cut[1, ]
## 圈套 女兵日記女力報到 我們與惡的距離 忠孝節義
## 0.0000000 0.0000000 0.7262694 0.0000000
## 最佳利益 俗女養成記 想見你 你有念大學嗎
## 0.0000000 0.0000000 0.0000000 0.0000000
## 我們不能是朋友 那一天 魂囚西門 天堂的微笑
## 0.0000000 0.8592897 0.0000000 0.0000000
## 越界
## 0.9473287
## Build the graph from the weight matrix
g <- graph.adjacency(drama_T2019m_tokens_dtm_dist_cut, weighted = T, mode = "undirected")
## Remove self-loops
g <- simplify(g)
## Set each vertex's label and degree
V(g)$label <- row.names(drama_T2019m_tokens_dtm_dist_cut)
V(g)$degree <- degree(g)
## Draw the drama relation graph
set.seed(2020)
par(family = "STKaiti", cex = 1)
layout1 <- layout.kamada.kawai(g)
## Define how vertices and edges are rendered
V(g)$label.cex <- 2.0 * V(g)$degree / max(V(g)$degree) + 0.5 # max is 2.5; scale titles by degree
V(g)$label.color <- rgb(0, 0, .2, .8)
V(g)$frame.color <- NA
egam <- (log(E(g)$weight) + .4) / max(log(E(g)$weight) + .4) # edge size
E(g)$color <- rgb(.5, .9, 0, egam)
E(g)$width <- egam * 4
## Plot the network
plot(g, layout = layout1, main = "Drama relation graph -- Message 0.72")
drama_T2019p_tokens_10 <- drama_T2019p_tokens %>%
  group_by(title, word) %>%
  filter(n() > 10) %>% # keep only words appearing more than 10 times
  ungroup()
drama_T2019p_tokens_10 %>%
count(title, word, sort = TRUE) %>%
group_by(title) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(x = reorder(word, n), y = n, fill = title)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
facet_wrap(~ title, scales = "free") +
scale_y_continuous(expand = c(0, 0)) +
labs(
x = NULL, y = "Word count",
title = "Most frequent words after removing stop words"
) +
theme(text = element_text(family = "Heiti TC Light"))較少為戲劇內容,但核心主題仍可顯示,如 “魂囚西門” 的「心理」、“天堂的微笑” 的「孩子」、“我們不能是朋友” 的「感情」、“我們與惡的距離” 的「媒體」與「社會」、“想見你” 的「穿越」等。
Convert the data into a DocumentTermMatrix (DTM) with one row per artUrl.
artUrl_dtm <- drama_T2019p_tokens_10 %>%
count(artUrl, word) %>%
cast_dtm(artUrl, word, n)
dim(artUrl_dtm)
## [1] 1296 2538
artUrl_dtm <- as.data.frame(as.matrix(artUrl_dtm))
artUrl_dtm$title[rownames(artUrl_dtm) %in% CLp_artUrl] <- "越界"
artUrl_dtm$title[rownames(artUrl_dtm) %in% GPp_artUrl] <- "女兵日記女力報到"
artUrl_dtm$title[rownames(artUrl_dtm) %in% TDp_artUrl] <- "圈套"
artUrl_dtm$title[rownames(artUrl_dtm) %in% MODCp_artUrl] <- "那一天"
artUrl_dtm$title[rownames(artUrl_dtm) %in% SODp_artUrl] <- "想見你"
artUrl_dtm$title[rownames(artUrl_dtm) %in% GDp_artUrl] <- "魂囚西門"
artUrl_dtm$title[rownames(artUrl_dtm) %in% BIp_artUrl] <- "最佳利益"
artUrl_dtm$title[rownames(artUrl_dtm) %in% TTVp_artUrl] <- "忠孝節義"
artUrl_dtm$title[rownames(artUrl_dtm) %in% TMOWp_artUrl] <- "俗女養成記"
artUrl_dtm$title[rownames(artUrl_dtm) %in% ELp_artUrl] <- "天堂的微笑"
artUrl_dtm$title[rownames(artUrl_dtm) %in% HAp_artUrl] <- "你有念大學嗎"
artUrl_dtm$title[rownames(artUrl_dtm) %in% TWBUp_artUrl] <- "我們與惡的距離"
artUrl_dtm$title[rownames(artUrl_dtm) %in% BWGMp_artUrl] <- "我們不能是朋友"
artUrl_dtm$title <- as.factor(artUrl_dtm$title)
table(artUrl_dtm$title) # class counts (the call was not echoed in the original)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 28 55 53 302
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 165 61 36 44
## 我們與惡的距離 想見你 越界 忠孝節義
## 346 65 16 80
## 最佳利益
## 45
set.seed(2020)
spl <- sample.split(artUrl_dtm$title, 0.7) # stratified 70/30 train/test split, preserving title proportions
artUrl_dtm_train <- subset(artUrl_dtm, spl == TRUE)
artUrl_dtm_test <- subset(artUrl_dtm, spl == FALSE)
artUrl_dtm_svm <- svm(title ~ ., artUrl_dtm_train, kernel = "linear", cost = 10, scale = F)
artUrl_dtm_svm.p <- predict(artUrl_dtm_svm, artUrl_dtm_test)
# classification matrix
table(artUrl_dtm_test$title, artUrl_dtm_svm.p)
## artUrl_dtm_svm.p
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 2 0 0 1 0
## 那一天 1 13 0 0 1
## 你有念大學嗎 0 0 15 0 0
## 女兵日記女力報到 1 0 0 89 0
## 圈套 3 0 0 4 36
## 俗女養成記 0 0 0 1 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 1 0
## 我們與惡的距離 1 1 0 1 2
## 想見你 0 0 0 0 1
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 1 1
## artUrl_dtm_svm.p
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 1 0 1 2 0
## 那一天 0 0 1 1 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 0 0 1 0
## 圈套 0 0 2 5 0
## 俗女養成記 17 0 0 0 0
## 天堂的微笑 0 9 0 0 0
## 我們不能是朋友 0 0 11 1 0
## 我們與惡的距離 0 0 2 95 0
## 想見你 0 0 0 0 18
## 越界 0 0 0 0 0
## 忠孝節義 1 0 0 0 0
## 最佳利益 1 0 1 3 0
## artUrl_dtm_svm.p
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 1
## 那一天 0 0 0
## 你有念大學嗎 0 0 1
## 女兵日記女力報到 0 0 0
## 圈套 0 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 2
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 2
## 想見你 0 0 0
## 越界 5 0 0
## 忠孝節義 0 23 0
## 最佳利益 0 0 7
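The misclassification rate printed below comes from a chunk that was not echoed; consistent with the summary table at the end of this section, it equals:

mean(artUrl_dtm_test$title != artUrl_dtm_svm.p) # share of test posts assigned to the wrong drama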
## [1] 0.1282051
artUrl_dtm_rf <- ranger(title ~ ., data = artUrl_dtm_train, num.trees = 100)
artUrl_dtm_rf.p <- predict(artUrl_dtm_rf, artUrl_dtm_test)$predictions
# classification matrix
table(artUrl_dtm_test$title, artUrl_dtm_rf.p)
## artUrl_dtm_rf.p
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 4 0 0 0 0
## 那一天 0 15 0 0 0
## 你有念大學嗎 0 0 16 0 0
## 女兵日記女力報到 0 0 0 91 0
## 圈套 0 0 0 0 44
## 俗女養成記 0 0 0 0 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 0
## 我們與惡的距離 0 0 0 0 0
## 想見你 0 0 0 0 0
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 0
## artUrl_dtm_rf.p
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 0 0 0 4 0
## 那一天 0 0 0 2 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 0 0 0 0
## 圈套 0 0 1 4 0
## 俗女養成記 17 0 0 1 0
## 天堂的微笑 0 9 0 2 0
## 我們不能是朋友 0 0 11 2 0
## 我們與惡的距離 0 0 0 103 0
## 想見你 0 0 0 0 19
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 7 0
## artUrl_dtm_rf.p
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 0
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 0 0 0
## 圈套 1 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 1
## 想見你 0 0 0
## 越界 5 0 0
## 忠孝節義 0 24 0
## 最佳利益 0 0 7
## [1] 0.06410256
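The outputs that follow also come from non-echoed chunks and drill into individual random-forest errors; they are plausibly along the lines of (the variable name err_idx is mine):

err_idx <- which(artUrl_dtm_test$title != artUrl_dtm_rf.p) # positions of misclassified test posts
err_idx
artUrl_dtm_rf.p[err_idx[1]]               # predicted label of the first error
which(artUrl_dtm_test[err_idx[1], ] != 0) # that document's nonzero term columns
artUrl_dtm_test$title[err_idx[1]]         # its true label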
## [1] 19 25 55 120 122 149 183 196 226 230 234 239 240 244 245 254 256 258 259
## [20] 293 312 317 332 347 374
## [1] 圈套
## 13 Levels: 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套 ... 最佳利益
## 一起 劇情 編劇 這對 導演 期待 戲劇 演員 主角 發展 事件
## 14 41 60 77 89 112 124 134 142 190 244
## 火花 劇組 團隊 第一集 偶像劇 看看 想像 以前 還沒 戲中 自然
## 358 419 465 574 695 770 815 827 1031 1108 1167
## 後面 場景 橋段 畫面 不錯 就讓 臉紅 配角 三集 是非 手銬
## 1252 1293 1360 1442 1725 1726 1727 1728 1729 1730 1731
## 戲都 心跳 預期 title
## 1732 1733 1734 2539
## [1] 我們不能是朋友
## 13 Levels: 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套 ... 最佳利益
# Convert label factor to an integer class starting at 0, it's a requirement for XGBoost
title <- artUrl_dtm$title
label <- as.integer(artUrl_dtm$title) - 1
# Split the data for training and testing
artUrl_dtm_train.data <- as.matrix(artUrl_dtm_train[, -which(colnames(artUrl_dtm_train) == "title")])
artUrl_dtm_test.data <- as.matrix(artUrl_dtm_test[, -which(colnames(artUrl_dtm_test) == "title")])
# Transform the two datasets into xgb.DMatrix objects
xgb.artUrl_dtm_train <- xgb.DMatrix(data = artUrl_dtm_train.data, label = label[spl])
xgb.artUrl_dtm_test <- xgb.DMatrix(data = artUrl_dtm_test.data, label = label[!spl])
# Define the parameters for multinomial classification
num_class <- length(levels(title))
params <- list(
booster = "gbtree",
eta = 0.001,
max_depth = 5,
gamma = 3,
subsample = 0.75,
colsample_bytree = 1,
objective = "multi:softprob",
eval_metric = "mlogloss",
num_class = num_class
)
# Train the XGBoost classifier
artUrl_dtm_xgb <- xgb.train(
params = params,
data = xgb.artUrl_dtm_train,
nrounds = 30,
nthreads = 1,
early_stopping_rounds = 10,
watchlist = list(val1 = xgb.artUrl_dtm_train, val2 = xgb.artUrl_dtm_test),
verbose = 0
)
saveRDS(object = artUrl_dtm_xgb, file = "./object/artUrl_dtm_xgb.rds")
artUrl_dtm_xgb <- readRDS(file = "./object/artUrl_dtm_xgb.rds")
# Predict outcomes with the test data
artUrl_dtm_xgb.p <- predict(artUrl_dtm_xgb, artUrl_dtm_test.data, reshape = T) %>%
as.data.frame()
colnames(artUrl_dtm_xgb.p) <- levels(title)
# Use the predicted label with the highest probability
artUrl_dtm_xgb.p$prediction <- apply(artUrl_dtm_xgb.p, 1, function(x) colnames(artUrl_dtm_xgb.p)[which.max(x)])
# classification matrix
table(artUrl_dtm_test$title, artUrl_dtm_xgb.p$prediction)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 4 0 0 0 0
## 那一天 0 15 0 0 1
## 你有念大學嗎 0 0 15 0 0
## 女兵日記女力報到 0 0 0 90 0
## 圈套 0 0 0 1 44
## 俗女養成記 0 0 0 0 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 0
## 我們與惡的距離 0 0 0 0 0
## 想見你 0 0 0 0 0
## 越界 0 0 0 0 1
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 0
##
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 1 0 0 3 0
## 那一天 0 0 0 1 0
## 你有念大學嗎 0 0 0 1 0
## 女兵日記女力報到 0 0 0 1 0
## 圈套 0 0 0 4 0
## 俗女養成記 16 0 0 2 0
## 天堂的微笑 0 10 0 0 1
## 我們不能是朋友 0 0 9 4 0
## 我們與惡的距離 0 0 0 104 0
## 想見你 0 0 0 0 19
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 6 0
##
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 0
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 0 0 0
## 圈套 1 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 0
## 想見你 0 0 0
## 越界 4 0 0
## 忠孝節義 0 24 0
## 最佳利益 0 0 8
## [1] 0.07179487
xgb.importance(artUrl_dtm_xgb, feature_names = colnames(xgb.artUrl_dtm_train)) %>%
top_n(20) %>%
ggplot(., aes(reorder(Feature, Gain), y = Gain, fill = (Feature))) +
geom_bar(stat = "identity", show.legend = FALSE) +
theme(text = element_text(size = 20)) +
coord_flip() +
xlab(' ') +
ylab('特徵重要性') +
theme(text = element_text(family = "Heiti TC Light"))## Selecting by Frequency
data.frame(model = c("SVM", "Random Forest", "Xgboost"),
misclassification_rate = c(mean(artUrl_dtm_test$title != artUrl_dtm_svm.p),
mean(artUrl_dtm_test$title != artUrl_dtm_rf.p),
                                       mean(artUrl_dtm_test$title != artUrl_dtm_xgb.p$prediction)))
## model misclassification_rate
## 1 SVM 0.12820513
## 2 Random Forest 0.06410256
## 3 Xgboost 0.07179487
drama_T2019p_tokens2 <- drama_T2019p_tokens2 %>%
group_by(title, word) %>%
  filter(n() > 10) %>% # keep only words appearing more than 10 times
  ungroup()
drama_T2019p_tokens2 %>%
count(title, word, sort = TRUE) %>%
group_by(title) %>%
top_n(10) %>%
ungroup() %>%
ggplot(aes(x = reorder(word, n), y = n, fill = title)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
facet_wrap(~ title, scales = "free") +
scale_y_continuous(expand = c(0, 0)) +
labs(
x = NULL, y = "Word count",
title = "Most frequent words after removing stop words"
) +
theme(text = element_text(family = "Heiti TC Light"))移除演員名、劇名、角色名後,出現較多與戲劇內容本身相關的詞彙,如 “俗女養成記” 的「媽媽」與「女性」、“想見你” 的「喜歡」與「班長」、“越界” 的「排球」等。
Convert the data into a DocumentTermMatrix (DTM) with one row per artUrl.
artUrl_dtm2 <- drama_T2019p_tokens2 %>%
count(artUrl, word) %>%
cast_dtm(artUrl, word, n)
dim(artUrl_dtm2)
## [1] 1293 2412
artUrl_dtm2 <- as.data.frame(as.matrix(artUrl_dtm2))
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% CLp_artUrl] <- "越界"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% GPp_artUrl] <- "女兵日記女力報到"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% TDp_artUrl] <- "圈套"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% MODCp_artUrl] <- "那一天"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% SODp_artUrl] <- "想見你"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% GDp_artUrl] <- "魂囚西門"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% BIp_artUrl] <- "最佳利益"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% TTVp_artUrl] <- "忠孝節義"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% TMOWp_artUrl] <- "俗女養成記"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% ELp_artUrl] <- "天堂的微笑"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% HAp_artUrl] <- "你有念大學嗎"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% TWBUp_artUrl] <- "我們與惡的距離"
artUrl_dtm2$title[rownames(artUrl_dtm2) %in% BWGMp_artUrl] <- "我們不能是朋友"
artUrl_dtm2$title <- as.factor(artUrl_dtm2$title)
table(artUrl_dtm2$title) # class counts (the call was not echoed in the original)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到
## 28 55 53 302
## 圈套 俗女養成記 天堂的微笑 我們不能是朋友
## 165 60 35 43
## 我們與惡的距離 想見你 越界 忠孝節義
## 346 65 16 80
## 最佳利益
## 45
set.seed(2020)
spl <- sample.split(artUrl_dtm2$title, 0.7) # stratified 70/30 train/test split, preserving title proportions
artUrl_dtm2_train <- subset(artUrl_dtm2, spl == TRUE)
artUrl_dtm2_test <- subset(artUrl_dtm2, spl == FALSE)
artUrl_dtm2_svm <- svm(title ~ ., artUrl_dtm2_train, kernel = "linear", cost = 10, scale = F)
artUrl_dtm2_svm.p <- predict(artUrl_dtm2_svm, artUrl_dtm2_test)
# classification matrix
table(artUrl_dtm2_test$title, artUrl_dtm2_svm.p)
## artUrl_dtm2_svm.p
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 5 0 0 0 1
## 那一天 0 16 1 0 0
## 你有念大學嗎 0 0 16 0 0
## 女兵日記女力報到 0 0 0 89 0
## 圈套 0 1 0 2 28
## 俗女養成記 0 0 0 1 1
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 2
## 我們與惡的距離 0 0 0 4 5
## 想見你 0 0 0 0 1
## 越界 0 0 0 1 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 1 1 0
## artUrl_dtm2_svm.p
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 0 0 1 0 0
## 那一天 0 0 0 0 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 0 0 1 0
## 圈套 0 0 0 12 5
## 俗女養成記 15 0 0 0 0
## 天堂的微笑 0 10 0 0 1
## 我們不能是朋友 0 0 10 1 0
## 我們與惡的距離 0 1 2 88 2
## 想見你 0 0 0 1 17
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 1
## artUrl_dtm2_svm.p
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 1
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 1 0 0
## 圈套 1 0 1
## 俗女養成記 0 0 1
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 2
## 想見你 0 0 0
## 越界 4 0 0
## 忠孝節義 0 22 2
## 最佳利益 0 0 11
## [1] 0.1512821
artUrl_dtm2_rf <- ranger(title ~ ., data = artUrl_dtm2_train, num.trees = 100)
artUrl_dtm2_rf.p <- predict(artUrl_dtm2_rf, artUrl_dtm2_test)$predictions
# classification matrix
table(artUrl_dtm2_test$title, artUrl_dtm2_rf.p)
## artUrl_dtm2_rf.p
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 5 0 0 0 0
## 那一天 0 15 0 0 1
## 你有念大學嗎 0 0 16 0 0
## 女兵日記女力報到 0 0 0 90 0
## 圈套 0 0 0 0 40
## 俗女養成記 0 0 0 0 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 0
## 我們與惡的距離 0 0 0 0 0
## 想見你 0 0 0 0 0
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 0
## artUrl_dtm2_rf.p
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 1 0 0 2 0
## 那一天 0 0 0 1 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 0 0 1 0
## 圈套 0 0 1 9 0
## 俗女養成記 16 0 1 1 0
## 天堂的微笑 0 10 0 1 0
## 我們不能是朋友 0 0 10 3 0
## 我們與惡的距離 0 0 0 104 0
## 想見你 0 0 0 2 17
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 1 0
## 最佳利益 0 0 0 2 0
## artUrl_dtm2_rf.p
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 0
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 0 0 0
## 圈套 0 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 0
## 想見你 0 0 0
## 越界 5 0 0
## 忠孝節義 0 23 0
## 最佳利益 0 0 12
## [1] 0.06923077
## [1] 1 24 41 120 165 183 197 208 211 221 223 224 238 241 244 259 266 267 274
## [20] 277 289 312 332 337 348 365 389
## [1] 圈套
## 13 Levels: 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套 ... 最佳利益
## 劇情 今天 無法 演出 好好 再次 種種 不斷 情緒 電話 挑戰 演技 提到
## 33 53 72 119 174 201 202 229 262 274 308 616 647
## 不算 處理 那種 尤其 有點 爆發 放在 複雜 經歷 思考 心得 這裡 強大
## 670 678 723 773 774 806 829 833 856 887 985 1044 1077
## 注意 合理 瞬間 崩潰 後面 算是 生氣 死亡 不錯 眼睛 段落 緊張 幾次
## 1096 1118 1134 1160 1182 1262 1348 1492 1631 1704 1743 1887 1937
## 養父 街頭 外面 星期 中槍 阿志 兩次 開車 早餐 自首 傷心 title
## 2036 2039 2075 2095 2097 2117 2138 2154 2159 2162 2170 2413
## [1] 我們與惡的距離
## 13 Levels: 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套 ... 最佳利益
# Convert label factor to an integer class starting at 0, it's a requirement for XGBoost
title <- artUrl_dtm2$title
label <- as.integer(artUrl_dtm2$title) - 1
# Split the data for training and testing
artUrl_dtm2_train.data <- as.matrix(artUrl_dtm2_train[, -which(colnames(artUrl_dtm2_train) == "title")])
artUrl_dtm2_test.data <- as.matrix(artUrl_dtm2_test[, -which(colnames(artUrl_dtm2_test) == "title")])
# Transform the two datasets into xgb.DMatrix objects
xgb.artUrl_dtm2_train <- xgb.DMatrix(data = artUrl_dtm2_train.data, label = label[spl])
xgb.artUrl_dtm2_test <- xgb.DMatrix(data = artUrl_dtm2_test.data, label = label[!spl])
# Define the parameters for multinomial classification
num_class <- length(levels(title))
params <- list(
booster = "gbtree",
eta = 0.001,
max_depth = 5,
gamma = 3,
subsample = 0.75,
colsample_bytree = 1,
objective = "multi:softprob",
eval_metric = "mlogloss",
num_class = num_class
)
# Train the XGBoost classifier
artUrl_dtm2_xgb <- xgb.train(
params = params,
data = xgb.artUrl_dtm2_train,
nrounds = 30,
nthreads = 1,
early_stopping_rounds = 10,
watchlist = list(val1 = xgb.artUrl_dtm2_train, val2 = xgb.artUrl_dtm2_test),
verbose = 0
)
saveRDS(object = artUrl_dtm2_xgb, file = "./object/artUrl_dtm2_xgb.rds")
artUrl_dtm2_xgb <- readRDS(file = "./object/artUrl_dtm2_xgb.rds")
# Predict outcomes with the test data
artUrl_dtm2_xgb.p <- predict(artUrl_dtm2_xgb, artUrl_dtm2_test.data, reshape = T) %>%
as.data.frame()
colnames(artUrl_dtm2_xgb.p) <- levels(title)
# Use the predicted label with the highest probability
artUrl_dtm2_xgb.p$prediction <- apply(artUrl_dtm2_xgb.p, 1, function(x) colnames(artUrl_dtm2_xgb.p)[which.max(x)])
# classification matrix
table(artUrl_dtm2_test$title, artUrl_dtm2_xgb.p$prediction)
##
## 魂囚西門 那一天 你有念大學嗎 女兵日記女力報到 圈套
## 魂囚西門 6 0 0 0 0
## 那一天 0 11 0 0 0
## 你有念大學嗎 0 0 16 0 0
## 女兵日記女力報到 0 0 0 90 0
## 圈套 0 0 0 0 39
## 俗女養成記 0 0 0 0 0
## 天堂的微笑 0 0 0 0 0
## 我們不能是朋友 0 0 0 0 0
## 我們與惡的距離 0 0 0 8 2
## 想見你 0 0 0 0 0
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 0 0
## 最佳利益 0 0 0 0 0
##
## 俗女養成記 天堂的微笑 我們不能是朋友 我們與惡的距離 想見你
## 魂囚西門 1 0 0 1 0
## 那一天 0 0 0 6 0
## 你有念大學嗎 0 0 0 0 0
## 女兵日記女力報到 0 1 0 0 0
## 圈套 0 0 0 11 0
## 俗女養成記 13 0 1 4 0
## 天堂的微笑 0 8 0 3 0
## 我們不能是朋友 0 0 7 6 0
## 我們與惡的距離 0 1 0 93 0
## 想見你 0 0 0 3 16
## 越界 0 0 0 0 0
## 忠孝節義 0 0 0 1 0
## 最佳利益 0 0 0 3 0
##
## 越界 忠孝節義 最佳利益
## 魂囚西門 0 0 0
## 那一天 0 0 0
## 你有念大學嗎 0 0 0
## 女兵日記女力報到 0 0 0
## 圈套 0 0 0
## 俗女養成記 0 0 0
## 天堂的微笑 0 0 0
## 我們不能是朋友 0 0 0
## 我們與惡的距離 0 0 0
## 想見你 0 0 0
## 越界 5 0 0
## 忠孝節義 0 23 0
## 最佳利益 0 0 11
## [1] 0.1333333
xgb.importance(artUrl_dtm2_xgb, feature_names = colnames(xgb.artUrl_dtm2_train)) %>%
top_n(20) %>%
ggplot(., aes(reorder(Feature, Gain), y = Gain, fill = (Feature))) +
geom_bar(stat = "identity", show.legend = FALSE) +
theme(text = element_text(size = 20)) +
coord_flip() +
xlab(' ') +
ylab('特徵重要性') +
theme(text = element_text(family = "Heiti TC Light"))## Selecting by Frequency
data.frame(model = c("SVM", "Random Forest", "Xgboost", "SVM 2", "Random Forest 2", "Xgboost 2"),
misclassification_rate = c(mean(artUrl_dtm_test$title != artUrl_dtm_svm.p),
mean(artUrl_dtm_test$title != artUrl_dtm_rf.p),
mean(artUrl_dtm_test$title != artUrl_dtm_xgb.p$prediction),
mean(artUrl_dtm2_test$title != artUrl_dtm2_svm.p),
mean(artUrl_dtm2_test$title != artUrl_dtm2_rf.p),
                                       mean(artUrl_dtm2_test$title != artUrl_dtm2_xgb.p$prediction)))
## model misclassification_rate
## 1 SVM 0.12820513
## 2 Random Forest 0.06410256
## 3 Xgboost 0.07179487
## 4 SVM 2 0.15128205
## 5 Random Forest 2 0.06923077
## 6 Xgboost 2 0.13333333
PTT_T2019p_tokens_30 <- PTT_T2019p_tokens %>%
group_by(word) %>%
  filter(n() > 30) %>% # keep only words appearing more than 30 times
  ungroup()
# the DTM dimensions and the commentNum summary below come from non-echoed chunks
## [1] 2529 3049
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 12.00 35.00 84.23 88.00 1497.00
dtm <- as.data.frame(as.matrix(dtm))
# Add an is_pop column to the post data: articles with more than 200 comments
# (well above the 3rd-quartile commentNum of 88 shown above) count as popular
PTT_T2019p$is_pop <- as.factor(ifelse(PTT_T2019p$commentNum > 200, 1, 0))
# artUrls of the popular posts
pop_artUrl <- PTT_T2019p %>%
filter(is_pop == 1) %>%
.[, "artUrl"]##
## 0 1
## 2260 269
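dtm_train and dtm_test are first used below without an echoed definition; presumably the label was attached and the data split the same way as in the TF-IDF section later, e.g.:

dtm$is_pop <- as.factor(ifelse(rownames(dtm) %in% pop_artUrl, 1, 0)) # mirrors the dtm.tfidf section below
set.seed(2020)
spl <- sample.split(dtm$is_pop, 0.7) # stratified 70/30 split
dtm_train <- subset(dtm, spl == TRUE)
dtm_test <- subset(dtm, spl == FALSE)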
dtm_glm <- glm(is_pop ~ ., dtm_train, family = "binomial") # is_pop as y, every other column as a predictor
saveRDS(object = dtm_glm, file = "./object/dtm_glm.rds")
# Fitting produced warnings: "glm.fit: algorithm did not converge" and
# "glm.fit: fitted probabilities numerically 0 or 1 occurred"
dtm_glm <- readRDS(file = "./object/dtm_glm.rds")
dtm_glm.p <- predict(dtm_glm, dtm_test, type = "response") # probability that is_pop == 1
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 350 42
## 1 328 39
##
## Accuracy : 0.5125
## 95% CI : (0.4763, 0.5486)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 1
##
## Kappa : -9e-04
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.48148
## Specificity : 0.51622
## Pos Pred Value : 0.10627
## Neg Pred Value : 0.89286
## Prevalence : 0.10672
## Detection Rate : 0.05138
## Detection Prevalence : 0.48353
## Balanced Accuracy : 0.49885
##
## 'Positive' Class : 1
##
## [1] 367
## Estimate Std. Error z value Pr(>|z|) term
## (Intercept) -3.967874e+15 9.267254e+07 -42816076.24 0 (Intercept)
## 愛情 -1.752723e+26 3.995613e+20 -438661.74 0 愛情
## 比較 -3.643805e+26 1.003368e+21 -363157.27 0 比較
## 表現 2.098304e+27 2.357696e+21 889980.90 0 表現
## 不想 3.178277e+27 4.132703e+21 769055.42 0 不想
## 不像 5.812396e+26 7.608526e+21 76393.19 0 不像
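The coef object displayed above is built in a non-echoed chunk; a reconstruction consistent with the columns shown would be something like:

coef <- summary(dtm_glm)$coefficients %>%
  as.data.frame() %>%
  tibble::rownames_to_column("term") # hypothetical; the original prints term as the last column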
coef %>%
  group_by(Estimate > 0) %>% # two groups: positive vs. non-positive estimates
  top_n(10, abs(Estimate)) %>% # ten largest by absolute value
ungroup() %>%
ggplot(aes(reorder(term, Estimate), Estimate, fill = Estimate > 0)) +
geom_col(alpha = 0.8, show.legend = FALSE) +
coord_flip() +
labs(
x = NULL,
title = "Coefficients that increase/decrease odds ratio of is_pop the most") +
theme(text = element_text(family = "Heiti TC Light"))## [1] TRUE
# Posts: one artUrl per row
PTT_T2019p %>%
  filter(artUrl %in% (row.names(dtm_test))) %>%
  mutate(pred = dtm_glm.p) %>% # safe to mutate directly because the document order of dtm_test and pred matches (TRUE above)
  filter(pred < 0.5, is_pop == 1) %>%
  top_n(-10, pred) %>% # top_n(-10, col) takes the rows with the smallest col values
  .[, c("artTitle", "artUrl")] %>% # show only artTitle and artUrl to keep the output short
  head
## artTitle
## 1 [LIVE]2018公視新創電影疑霧公堂公視日十
## 2 [LIVE]女兵日記女力報到EP39
## 3 [LIVE]艾蜜麗的五件事第8集(台視六十)
## 4 [LIVE]必勝大丈夫第13集(三立華八)
## 5 [LIVE]必勝大丈夫第19集(三立華八)
## 6 [LIVE]你有念大學嗎?第五集(台視日十)
## artUrl
## 1 https://www.ptt.cc/bbs/TaiwanDrama/M.1546776725.A.799.html
## 2 https://www.ptt.cc/bbs/TaiwanDrama/M.1546946352.A.978.html
## 3 https://www.ptt.cc/bbs/TaiwanDrama/M.1547300772.A.E36.html
## 4 https://www.ptt.cc/bbs/TaiwanDrama/M.1548330013.A.713.html
## 5 https://www.ptt.cc/bbs/TaiwanDrama/M.1549022013.A.994.html
## 6 https://www.ptt.cc/bbs/TaiwanDrama/M.1549805947.A.A17.html
PTT_T2019p %>%
filter(artUrl %in% (row.names(dtm_test))) %>%
mutate(pred = dtm_glm.p) %>%
  filter(pred > 0.5, is_pop != 1) %>%
top_n(-10, pred) %>%
.[, c("artTitle", "artUrl")] %>% # 因為內容過多,只顯示 artTitle 與 artUrl
head## artTitle
## 1 [LIVE]種菜女神EP15-16
## 2 [新聞]嚴藝文處女執導《俗女養成記》謝盈萱挑
## 3 [新聞]福斯原創影集第二部明晚,空中見(非原標
## 4 [心得]女力報到別再打臉了
## 5 [LIVE]必勝大丈夫第四集(三立華八)
## 6 [雙城]雙城故事旅行的意義第16、17集隨便聊
## artUrl
## 1 https://www.ptt.cc/bbs/TaiwanDrama/M.1546610106.A.630.html
## 2 https://www.ptt.cc/bbs/TaiwanDrama/M.1546612054.A.AB4.html
## 3 https://www.ptt.cc/bbs/TaiwanDrama/M.1547037266.A.EF3.html
## 4 https://www.ptt.cc/bbs/TaiwanDrama/M.1547192235.A.F03.html
## 5 https://www.ptt.cc/bbs/TaiwanDrama/M.1547206299.A.146.html
## 6 https://www.ptt.cc/bbs/TaiwanDrama/M.1547292228.A.68E.html
dtm_svm <- svm(is_pop ~ ., dtm_train, kernel = "linear", cost = 10, scale = F)
saveRDS(object = dtm_svm, file = "./object/dtm_svm.rds")
dtm_svm <- readRDS(file = "./object/dtm_svm.rds")
dtm_svm.p <- predict(dtm_svm, dtm_test)
confusionMatrix(dtm_svm.p,
dtm_test$is_pop,
dnn = c("Prediction", "Reference"),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 630 32
## 1 48 49
##
## Accuracy : 0.8946
## 95% CI : (0.8705, 0.9155)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.48270
##
## Kappa : 0.4914
##
## Mcnemar's Test P-Value : 0.09353
##
## Sensitivity : 0.60494
## Specificity : 0.92920
## Pos Pred Value : 0.50515
## Neg Pred Value : 0.95166
## Prevalence : 0.10672
## Detection Rate : 0.06456
## Detection Prevalence : 0.12780
## Balanced Accuracy : 0.76707
##
## 'Positive' Class : 1
##
dtm_rp <- rpart(is_pop ~ ., dtm_train, method = "class")
saveRDS(object = dtm_rp, file = "./object/dtm_rp.rds")
dtm_rp <- readRDS(file = "./object/dtm_rp.rds")
dtm_rp.p <- predict(dtm_rp, dtm_test)[, 2]
# predict(dtm_rp, dtm_test) returns probabilities for class 0 and class 1; keep only P(is_pop == 1)
confusionMatrix(factor(ifelse(dtm_rp.p >= 0.5, 1, 0)),
factor(dtm_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 663 42
## 1 15 39
##
## Accuracy : 0.9249
## 95% CI : (0.9038, 0.9426)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.0019787
##
## Kappa : 0.5384
##
## Mcnemar's Test P-Value : 0.0005736
##
## Sensitivity : 0.48148
## Specificity : 0.97788
## Pos Pred Value : 0.72222
## Neg Pred Value : 0.94043
## Prevalence : 0.10672
## Detection Rate : 0.05138
## Detection Prevalence : 0.07115
## Balanced Accuracy : 0.72968
##
## 'Positive' Class : 1
##
prp(dtm_rp,               # the fitted tree
    faclen = 0,           # do not abbreviate variable names
    fallen.leaves = TRUE, # put the leaves at the bottom
    shadow.col = "gray",  # shade the leaf nodes
    extra = 2)            # show correct classifications / observations per node
dtm_rf <- randomForest(is_pop ~ ., dtm_train,
                       method = "class", importance = TRUE, proximity = TRUE)
saveRDS(object = dtm_rf, file = "./object/dtm_rf.rds")
##
## Call:
## randomForest(formula = is_pop ~ ., data = dtm_train, method = "class", importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 55
##
## OOB estimate of error rate: 5.93%
## Confusion matrix:
## 0 1 class.error
## 0 1572 10 0.006321113
## 1 95 93 0.505319149
dtm_rf.p <- predict(dtm_rf, dtm_test, type = "prob")[, 2]
confusionMatrix(factor(ifelse(dtm_rf.p >= 0.5, 1, 0)),
factor(dtm_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 674 43
## 1 4 38
##
## Accuracy : 0.9381
## 95% CI : (0.9185, 0.9541)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 1.240e-05
##
## Kappa : 0.5878
##
## Mcnemar's Test P-Value : 2.976e-08
##
## Sensitivity : 0.46914
## Specificity : 0.99410
## Pos Pred Value : 0.90476
## Neg Pred Value : 0.94003
## Prevalence : 0.10672
## Detection Rate : 0.05007
## Detection Prevalence : 0.05534
## Balanced Accuracy : 0.73162
##
## 'Positive' Class : 1
##
colAUC(cbind(glm = dtm_glm.p,
svm = dtm_svm.p,
rpart = dtm_rp.p,
rf = dtm_rf.p),
       dtm_test$is_pop, T)
## glm svm rpart rf
## 0 vs. 1 0.5011472 0.7670709 0.7847336 0.8193762
# Compute each word's term frequency within each document
document_words <- PTT_T2019p_tokens_30 %>%
count(artUrl, word, sort = TRUE)
total_words <- document_words %>%
group_by(artUrl) %>%
summarize(total = sum(n)) %>%
right_join(document_words) %>%
  mutate(is_pop = as.factor(ifelse(artUrl %in% pop_artUrl, 1, 0)))
## Joining, by = "artUrl"
total_words <- total_words %>%
bind_tf_idf(word, artUrl, n)
# pick the ten words with the highest tf-idf in each article
tfidf_top10 <- total_words %>%
filter(nchar(word) > 1) %>%
group_by(artUrl)%>%
top_n(10, tf_idf) %>%
arrange(desc(artUrl))
# Visualization (too slow to run)
# tfidf_top10 %>%
# ggplot(aes(word, tf_idf, fill = artUrl)) +
# geom_col(show.legend = FALSE) +
# labs(x = NULL, y = "tf-idf") +
# facet_wrap(~artUrl, ncol = 2, scales = "free") +
#   coord_flip()
## [1] 2529 2976
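dtm.tfidf (its dimensions are printed above) is created in a non-echoed chunk; a TF-IDF-weighted DTM consistent with it could be built like this (the exact filtering that yields 2976 terms is not shown, so the nchar filter is an assumption):

dtm.tfidf <- PTT_T2019p_tokens_30 %>%
  filter(nchar(word) > 1) %>% # assumed, matching the tfidf_top10 filter above
  count(artUrl, word) %>%
  cast_dtm(artUrl, word, n, weighting = tm::weightTfIdf)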
dtm.tfidf <- as.data.frame(as.matrix(dtm.tfidf))
dtm.tfidf$is_pop <- as.factor(ifelse(rownames(dtm.tfidf) %in% pop_artUrl, 1, 0))
table(dtm.tfidf$is_pop)
##
## 0 1
## 2260 269
dtm.tfidf_glm <- glm(is_pop ~ ., dtm.tfidf_train, family = "binomial")
saveRDS(object = dtm.tfidf_glm, file = "./object/dtm.tfidf_glm.rds")
dtm.tfidf_glm <- readRDS(file = "./object/dtm.tfidf_glm.rds")
dtm.tfidf_glm.p <- predict(dtm.tfidf_glm, dtm.tfidf_test, type = "response") # probability that is_pop == 1
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
confusionMatrix(factor(ifelse(dtm.tfidf_glm.p >= 0.5, 1, 0)),
factor(dtm.tfidf_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 355 45
## 1 323 36
##
## Accuracy : 0.5152
## 95% CI : (0.4789, 0.5512)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 1
##
## Kappa : -0.0127
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.44444
## Specificity : 0.52360
## Pos Pred Value : 0.10028
## Neg Pred Value : 0.88750
## Prevalence : 0.10672
## Detection Rate : 0.04743
## Detection Prevalence : 0.47299
## Balanced Accuracy : 0.48402
##
## 'Positive' Class : 1
##
dtm.tfidf_svm <- svm(is_pop ~ ., dtm.tfidf_train, kernel = "linear", cost = 10, scale = F)
dtm.tfidf_svm.p <- predict(dtm.tfidf_svm, dtm.tfidf_test)
confusionMatrix(factor(dtm.tfidf_svm.p),
factor(dtm.tfidf_test$is_pop),
dnn = c("Prediction", "Reference"),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 667 47
## 1 11 34
##
## Accuracy : 0.9236
## 95% CI : (0.9023, 0.9415)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.002962
##
## Kappa : 0.5017
##
## Mcnemar's Test P-Value : 4.312e-06
##
## Sensitivity : 0.41975
## Specificity : 0.98378
## Pos Pred Value : 0.75556
## Neg Pred Value : 0.93417
## Prevalence : 0.10672
## Detection Rate : 0.04480
## Detection Prevalence : 0.05929
## Balanced Accuracy : 0.70176
##
## 'Positive' Class : 1
##
dtm.tfidf_rp <- rpart(is_pop ~ ., dtm.tfidf_train, method = "class")
saveRDS(object = dtm.tfidf_rp, file = "./object/dtm.tfidf_rp.rds")
dtm.tfidf_rp <- readRDS(file = "./object/dtm.tfidf_rp.rds")
dtm.tfidf_rp.p <- predict(dtm.tfidf_rp, dtm.tfidf_test)[, 2]
# predict() returns probabilities for class 0 and class 1; keep only P(is_pop == 1)
confusionMatrix(factor(ifelse(dtm.tfidf_rp.p >= 0.5, 1, 0)),
factor(dtm.tfidf_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 666 55
## 1 12 26
##
## Accuracy : 0.9117
## 95% CI : (0.8892, 0.9309)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.05338
##
## Kappa : 0.3958
##
## Mcnemar's Test P-Value : 2.88e-07
##
## Sensitivity : 0.32099
## Specificity : 0.98230
## Pos Pred Value : 0.68421
## Neg Pred Value : 0.92372
## Prevalence : 0.10672
## Detection Rate : 0.03426
## Detection Prevalence : 0.05007
## Balanced Accuracy : 0.65164
##
## 'Positive' Class : 1
##
prp(dtm.tfidf_rp,         # the fitted tree
    faclen = 0,           # do not abbreviate variable names
    fallen.leaves = TRUE, # put the leaves at the bottom
    shadow.col = "gray",  # shade the leaf nodes
    extra = 2)            # show correct classifications / observations per node
dtm.tfidf_rf <- randomForest(is_pop ~ ., dtm.tfidf_train,
                             method = "class", importance = TRUE,
                             proximity = TRUE)
saveRDS(object = dtm.tfidf_rf, file = "./object/dtm.tfidf_rf.rds")
##
## Call:
## randomForest(formula = is_pop ~ ., data = dtm.tfidf_train, method = "class", importance = TRUE, proximity = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 54
##
## OOB estimate of error rate: 6.27%
## Confusion matrix:
## 0 1 class.error
## 0 1571 11 0.006953224
## 1 100 88 0.531914894
dtm.tfidf_rf.p <- predict(dtm.tfidf_rf, dtm.tfidf_test, type = "prob")[, 2]
confusionMatrix(factor(ifelse(dtm.tfidf_rf.p >= 0.5, 1, 0)),
factor(dtm.tfidf_test$is_pop),
positive = "1")## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 672 47
## 1 6 34
##
## Accuracy : 0.9302
## 95% CI : (0.9097, 0.9473)
## No Information Rate : 0.8933
## P-Value [Acc > NIR] : 0.0003287
##
## Kappa : 0.5287
##
## Mcnemar's Test P-Value : 3.92e-08
##
## Sensitivity : 0.4198
## Specificity : 0.9912
## Pos Pred Value : 0.8500
## Neg Pred Value : 0.9346
## Prevalence : 0.1067
## Detection Rate : 0.0448
## Detection Prevalence : 0.0527
## Balanced Accuracy : 0.7055
##
## 'Positive' Class : 1
##
colAUC(cbind(glm.tfidf = dtm.tfidf_glm.p,
svm.tfidf = dtm.tfidf_svm.p,
rpart.tfidf = dtm.tfidf_rp.p,
rf.tfidf = dtm.tfidf_rf.p),
       dtm.tfidf_test$is_pop, T)
## glm.tfidf svm.tfidf rpart.tfidf rf.tfidf
## 0 vs. 1 0.5159784 0.7017644 0.6642995 0.7820387
colAUC(cbind(glm = dtm_glm.p,
svm = dtm_svm.p,
rpart = dtm_rp.p,
rf = dtm_rf.p,
glm.tfidf = dtm.tfidf_glm.p,
svm.tfidf = dtm.tfidf_svm.p,
rpart.tfidf = dtm.tfidf_rp.p,
rf.tfidf = dtm.tfidf_rf.p),
       dtm_test$is_pop, T)

In terms of clustering the text, the LDA results show each topic's discussion content far more clearly, mainly because posts contain many actor, drama, and character names that tie documents together into topics. Although only one topic still showed a clear single-drama signature after those names were removed, LDA remains easier to read than K-means, whose groupings are hard to explain; with K-means, about all one can do is look among the outliers for posts whose discussion differs from the rest.
In the network graphs built from posts and comments, dramas become linked when they share similar plot elements in the discussion, and the effect is stronger for comments, probably because comments contain a larger and more diverse vocabulary and therefore connect to other dramas more easily.
Comparing drama classifiers 1 and 2, removing actor, drama, and character names raised every model's misclassification rate, and those names dominate the top of the feature-importance charts, so they appear to be the key signals for identifying a given drama.
All three classification models (SVM, Random Forest, XGBoost) perform well; most of their errors occur where two dramas share similar plot elements.
For predicting popular posts, the models' sensitivity is poor, possibly because there are too few positive examples for them to learn good rules from.
Neither input representation is uniformly better: Random Forest classified better on TF-IDF in our runs, while SVM and the decision tree did better on plain TF, which comes down to each model's assumptions and inference method.