[The Simpsons] TV Series Text Analysis: LDA Analysis
Data Overview
- Data Source: The Simpsons subtitle archive
- Season 31, Episodes 1–16 (16 episodes in total)
Data Preprocessing
Loading packages and data
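The setup chunk is not shown in the post; the calls below cover the functions used throughout (an assumption about which packages the author loaded), and the subtitle file name is a hypothetical placeholder.

library(dplyr)        # tibble(), mutate(), count()
library(tidytext)     # unnest_tokens(), stop_words, tidy(), reorder_within()
library(stringr)      # str_detect(), regex()
library(topicmodels)  # LDA()
library(ggplot2)      # bar charts and boxplots
library(udpipe)       # document_term_frequencies(), document_term_matrix()
library(text2vec)     # LDA$new() for the LDAvis section
text = readLines("the_simpsons_s31_subtitles.txt")  # hypothetical file name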
Tokenization
text_tidy = tibble(text) %>%
  unnest_tokens(word, text) %>%
  # episode markers "EP1".."EP16" appear in the raw text; a cumulative count of
  # those markers assigns every subsequent token to its episode
  mutate(episode = cumsum(str_detect(word, regex("^EP([1-9]|1[0-6])$", ignore_case = T))))
data(stop_words)
text_tidy$episode = paste0("ep", text_tidy$episode)
text_tidy = text_tidy %>%
  anti_join(stop_words) %>%  # function words ("the", "to", "of", ...) are removed here
  # drop remaining fillers, interjections, and frequent character words
  filter(!(word %in% c("gonna", "uh", "ooh", "huh", "eh", "ah", "yeah", "hey",
                       "god", "time", "baby", "kid", "kids", "boy", "son",
                       "dad", "daddy", "mom", "wife", "sir", "guy", "guys",
                       "aw", "homer", "marge")))
word_counts <- text_tidy %>%
  count(episode, word, sort = TRUE) %>%
  ungroup()
word_counts %>% head(10)
## # A tibble: 10 x 3
## episode word n
## <chr> <chr> <int>
## 1 ep10 doo 61
## 2 ep13 frink 20
## 3 ep10 christmas 18
## 4 ep5 boat 18
## 5 ep9 todd 17
## 6 ep13 professor 16
## 7 ep7 costa 16
## 8 ep7 rica 16
## 9 ep12 school 15
## 10 ep2 mike 15
LDA Analysis
Topic–Word Analysis
16 Topics
Beta values
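episode_dtm is used below but never constructed in the post; presumably it was cast from word_counts with tidytext's cast_dtm(). A minimal sketch under that assumption:

episode_dtm <- word_counts %>%
  cast_dtm(episode, word, n)  # 16 episode-documents x vocabulary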
episode_lda <- LDA(episode_dtm, k = 16, control = list(seed = 1234))
text_topics <- tidy(episode_lda, matrix = "beta")
text_topics %>% head(10)
## # A tibble: 10 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 doo 6.52e-90
## 2 2 doo 7.65e-89
## 3 3 doo 3.60e-92
## 4 4 doo 5.55e-82
## 5 5 doo 8.02e- 2
## 6 6 doo 4.24e-89
## 7 7 doo 6.83e-90
## 8 8 doo 8.97e-93
## 9 9 doo 3.81e-90
## 10 10 doo 1.99e-84
Most frequent words per topic
top_terms <- text_topics %>%
  group_by(topic) %>%
  top_n(5, beta) %>%  # pick each topic's top-n words by beta
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>% head(10)
## # A tibble: 10 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 school 0.0185
## 2 1 soda 0.0123
## 3 1 stem 0.0123
## 4 1 algorithm 0.0111
## 5 1 future 0.0111
## 6 1 jobs 0.0111
## 7 2 bart 0.0198
## 8 2 flanders 0.0152
## 9 2 school 0.00913
## 10 2 mentor 0.00913
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
4 Topics
- Remove character names, repeated words, and meaningless words
episode_lda <- LDA(episode_dtm, k = 4, control = list(seed = 1234))
text_topics <- tidy(episode_lda, matrix = "beta")
# remove character names / ubiquitous words / uninterpretable words
remove_word = c("doo", "aw", "uh", "ooh", "huh", "eh", "ah", "yeah", "hey", "gonna",
                "blarg", "an8", "vida", "todd", "daa", "rica", "evelyn", "baby",
                "monkey", "boy", "guy", "professor", "people", "son", "kid", "chief",
                "lisa", "body", "kids", "god", "dad", "santa", "daddy", "sir",
                "artie", "dubya", "mommy", "ho", "pura", "mike", "lenny", "love",
                "time", "boys", "wait", "woman", "santa's", "s.b", "jelly", "mom",
                "simpsons", "tony", "homer", "hmm", "costa", "flanders", "nelson",
                "bart", "yay", "simpson", "whoa", "marge", "day")
top_terms <- text_topics %>%
  filter(!term %in% remove_word) %>%
  group_by(topic) %>%
  top_n(5, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  # scale_fill_manual(values = mycolors) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()
5 Topics
- Some topics are hard to identify from the top-frequency plots alone
- Try assigning each episode to a topic and check whether episodes sharing a topic resemble each other, to work out what each topic represents
episode_lda <- LDA(episode_dtm, k = 5, control = list(seed = 1234))  # refit with k = 5; implied by the heading but not shown in the original
text_topics <- tidy(episode_lda, matrix = "beta")
top_terms <- text_topics %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  coord_flip() +
  scale_x_reordered()
Document–Topic Analysis
Gamma values
Using 5 topics (which gave the best results)
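The table below shows per-document topic probabilities; the extraction call is not shown in the post. Presumably it mirrors the beta step, e.g.:

episode_gamma <- tidy(episode_lda, matrix = "gamma")
episode_gamma %>% head(10)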
## # A tibble: 10 x 3
## document topic gamma
## <chr> <int> <dbl>
## 1 ep10 1 0.0000395
## 2 ep13 1 1.00
## 3 ep5 1 0.0000383
## 4 ep9 1 0.0000432
## 5 ep7 1 0.0000345
## 6 ep12 1 1.00
## 7 ep2 1 1.00
## 8 ep4 1 0.0000491
## 9 ep14 1 0.0000373
## 10 ep16 1 0.0000458
# zero-pad single-digit episode labels so plots sort in episode order
episode_gamma$document =
  ifelse(
    str_detect(episode_gamma$document, regex("^ep[1-9]$", ignore_case = T)),
    paste0("ep0", substring(episode_gamma$document, 3)),
    episode_gamma$document
  )
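To make the per-topic episode groupings discussed below explicit, each episode can also be hard-assigned to its highest-gamma topic; a minimal sketch (slice_max() is my addition, requiring dplyr >= 1.0):

episode_classification <- episode_gamma %>%
  group_by(document) %>%
  slice_max(gamma, n = 1) %>%  # keep each episode's dominant topic
  ungroup()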
episode_gamma %>%
  mutate(title = reorder(document, gamma * topic)) %>%
  ggplot(aes(factor(topic), gamma)) +
  geom_boxplot() +
  facet_wrap(~ document)
- Topic 1 is education
  - EP2: Homer supervises the interns, and Mike Wegman wants Homer to be his mentor
  - EP12: Lisa joins a gifted class that teaches her science, coding, math, and neural networks, while the other kids are trained for menial jobs
  - EP13: Frink is a professor who gets rich developing Frinkcoin
- Topic 2 relates to mothers (family)
  - EP6: Lisa stages a play that portrays Marge as a boring mother
  - EP9 "Todd, Todd, Why Hast Thou Forsaken Me?": Todd can no longer remember his mother's face, which makes him doubt the existence of God
- Topic 3 is leisure activities
  - EP1: Homer and Bart become viral-video stars
  - EP7: a trip to Costa Rica
  - EP8 "Thanksgiving of Horror": the Simpsons are turned into turkeys hunted for Thanksgiving dinner, and Homer buys a Marge AI as a Thanksgiving gift
  - EP14: a chain of events during a movie shoot (a crisis in which the film's plot is leaked)
- Topic 4 relates to crime
  - EP3: Dr. Lund is arrested by the police for fraud
  - EP11: someone is robbed during a fundraiser, with mafia and police-chase scenes as well
  - EP15: after being paroled, Artie builds a robot clone of Marge and marries it
- Topic 5 is amusement parks and Christmas
  - EP4: features Christmas letter-shaped decorative lights
  - EP5: the Simpsons visit a water theme park
  - EP10: features a Christmas-themed amusement park
LDAvis
Continuing with the 5-topic model from above: LDAvis Demo
dtf <- document_term_frequencies(text_tidy, document = "episode", term = "word")
dtm <- document_term_matrix(x = dtf)
dtm_clean <- dtm_remove_lowfreq(dtm, minfreq = 15)
dim(dtm_clean)
## [1] 16 81
set.seed(1234)
topic_n = 5
lda_model = text2vec::LDA$new(n_topics = topic_n, doc_topic_prior = 0.1, topic_word_prior = 0.001)
doc_topic_distr = lda_model$fit_transform(dtm_clean, n_iter = 1000, convergence_tol = 1e-5, check_convergence_every_n = 100)
## INFO [21:57:21.008] early stopping at 70 iteration
## INFO [21:57:21.102] early stopping at 50 iteration
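The matrix below lists each topic's top-10 words; the printing call is omitted in the post. It can be reproduced with text2vec's accessor (lambda = 1 ranks purely by within-topic probability):

lda_model$get_top_words(n = 10, lambda = 1)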
## [,1] [,2] [,3] [,4] [,5]
## [1,] "milhouse" "people" "bart" "lisa" "doo"
## [2,] "fat" "wait" "day" "family" "christmas"
## [3,] "phone" "eat" "boat" "kill" "monkey"
## [4,] "body" "cool" "flanders" "free" "santa"
## [5,] "food" "stupid" "house" "money" "day"
## [6,] "woman" "world" "movie" "friend" "whoa"
## [7,] "nice" "frink" "todd" "leave" "bart"
## [8,] "hell" "lot" "mentor" "coming" "stop"
## [9,] "mike" "remember" "bad" "fine" "call"
## [10,] "love" "talk" "hot" "found" "play"
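The separation claims below refer to the interactive LDAvis view; text2vec's LDA models expose it via a plot() method (this call assumes the LDAvis package is installed):

lda_model$plot()  # opens the interactive LDAvis intertopic-distance view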
- All 5 topics are well separated, with no overlap
- Roughly the same patterns as the earlier 5-topic model can be seen
- Topic 1 is education
- Topic 2 is outings and travel
- Topic 5 is Christmas
- Topics 3 and 4 are less clear-cut
Conclusion
- Because series dialogue is extremely colloquial and fragmented, few terms survive once stopwords are removed. As a result, no matter how many topics we fit, the resulting terms are either nearly identical across topics or carry too little information (distinctiveness) to interpret and analyze.
- With "episode" as the document unit (16 documents), there is simply too little data. Next time it would be better to use the "sentence" as the unit and analyze sentence by sentence to get more meaningful results (or switch to a different corpus), as sketched below.
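A sentence-level document unit could be built as follows; this is a sketch assuming the raw text vector from the preprocessing step, with one document per subtitle sentence:

sentences <- tibble(text) %>%
  unnest_tokens(sentence, text, token = "sentences") %>%
  mutate(doc_id = paste0("s", row_number()))  # one document per sentence
sentence_words <- sentences %>%
  unnest_tokens(word, sentence) %>%
  anti_join(stop_words)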