分析題目:KTV熱門排行榜歌曲詞彙與風格分析

資料來源:"mymusic.net KTV熱唱必備►國語K歌排行超精選

https://www.mymusic.net.tw/ux/w/themeInfo/3354

系統設置

系統參數設定

## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"

安裝需要的packages

#install.packages("modeltools")   # a dependency of topicmodels
#install.packages("topicmodels", type="source")  # should compile from source
require(dplyr)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
require(ggplot2)
## Loading required package: ggplot2
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
require(reshape2)
## Loading required package: reshape2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
require(readr)
## Loading required package: readr
## 
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
## 
##     col_factor
require(knitr)
## Loading required package: knitr
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
require(tm)
## Loading required package: tm
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
require(readr)
require(tm)
require(data.table)
## Loading required package: data.table
## 
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
## Loading required package: udpipe
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(wordcloud2)
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
require(purrr)
## Loading required package: purrr
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
## 
##     transpose
## The following object is masked from 'package:scales':
## 
##     discard
require(ramify)
## Loading required package: ramify
## 
## Attaching package: 'ramify'
## The following object is masked from 'package:purrr':
## 
##     flatten
## The following object is masked from 'package:webshot':
## 
##     resize
## The following object is masked from 'package:tidyr':
## 
##     fill
## The following object is masked from 'package:graphics':
## 
##     clip
require(RColorBrewer)
require(MASS)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
require(modeltools)
## Loading required package: modeltools
## Loading required package: stats4

匯入資料

Tokenization

# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()
all_song_rank <- read.csv("Lyric_All.csv", fileEncoding="Big-5", fill=TRUE)
all_song_rank$Lyric <- as.character(all_song_rank$Lyric)
jieba_tokenizer <- worker(user="user_dict_All.txt", stop_word = "stop_words_All.txt")

# 設定斷詞function
song_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}
tokens <- all_song_rank %>%
  unnest_tokens(word, Lyric, token=song_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(Rank, word) %>%
  rename(count=n)

tokens %>% head(20)
## # A tibble: 20 x 3
##     Rank word   count
##    <int> <chr>  <int>
##  1     1 愛情       2
##  2     1 別忘記     2
##  3     1 不痛       2
##  4     1 逞強       2
##  5     1 觸碰       2
##  6     1 發光       3
##  7     1 放棄       2
##  8     1 奮力       3
##  9     1 風         2
## 10     1 害怕       5
## 11     1 記得       1
## 12     1 記著       2
## 13     1 沮喪       2
## 14     1 落寞       2
## 15     1 迷惘       2
## 16     1 人生       2
## 17     1 身旁       6
## 18     1 聲音       2
## 19     1 失望       2
## 20     1 所有的     1

將資料轉換為DTM

song_dtm <- tokens %>% cast_dtm(Rank,word,count)
song_dtm
## <<DocumentTermMatrix (documents: 100, terms: 3467)>>
## Non-/sparse entries: 5697/341003
## Sparsity           : 98%
## Maximal term length: 6
## Weighting          : term frequency (tf)
inspect(song_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 21/79
## Sparsity           : 79%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 愛情 別忘記 不痛 逞強 觸碰 發光 放棄 奮力 風 害怕
##   1     2      2    2    2    2    3    2    3  2    5
##   10    0      0    0    0    0    0    0    0  0    0
##   2     0      0    0    0    0    0    0    0  0    0
##   3     2      2    2    2    2    3    2    3  2    5
##   4     0      0    0    0    0    0    0    0  0    0
##   5     0      0    0    0    0    0    0    0  0    0
##   6     4      0    0    0    0    0    0    0  0    0
##   7     0      0    0    0    0    0    0    0  0    0
##   8     0      0    0    0    0    0    0    0  0    0
##   9     0      0    0    0    0    0    0    0  0    0

查看DTM矩陣,可以發現是個稀疏矩陣

建立LDA模型

lda_4 <- LDA(song_dtm, k = 4, control = list(seed = 2020))

先將LDA模型,主題數數定為 4

\(\phi\) Matrix

查看\(\phi\) matrix (topic * term)

topics4 <- tidy(lda_4, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics4
## # A tibble: 13,868 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 愛情   6.79e-  3
##  2     2 愛情   1.92e-  3
##  3     3 愛情   5.18e-  3
##  4     4 愛情   2.53e-  3
##  5     1 別忘記 7.64e-110
##  6     2 別忘記 3.00e-111
##  7     3 別忘記 5.55e-109
##  8     4 別忘記 1.74e-  3
##  9     1 不痛   1.75e-109
## 10     2 不痛   2.92e-111
## # … with 13,858 more rows

繪製四個主題的常見詞彙

top_terms <- topics4 %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  theme(text = element_text(family='STHeitiTC-Light'))+
  coord_flip()

透過上方的圖,看不太出每個主題的分類依據,嘗試看看分多一點topics

更多主題

  • 嘗試4,6,8,10,12主題數,將結果存起來,再做進一步分析
ldas = c()
topics = c(4,6,8,10,12)
for(topic in topics){
   start_time <- Sys.time()
   lda <- LDA(song_dtm, k = topic, control = list(seed = 2020))
   ldas =c(ldas,lda)
   print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
   save(ldas,file = "ldas_result")
 }
load("ldas_result")
topics = c(4,6,8,10,12)
data_frame(k = topics,
           perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
  ggplot(aes(k, perplex)) +
  geom_point() +
  geom_line() +
  labs(title = "Evaluating LDA topic models",
       subtitle = "Optimal number of topics (smaller is better)",
       x = "Number of topics",
       y = "Perplexity")
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.

從圖中可以看出,4,6個主題為較適合,故以下將以4,6主題做討論

LDAvis

產生create LDAvis所需的json function

  • 此function是將前面使用 “LDA function”所建立的model,轉換為“LDAVis”套件的input格式。
topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
   ###以下function 用來解決,主題數多會出現NA的問題
  ## 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
ls_LDA = function (phi)
{
  jensenShannon <- function(x, y) {
      m <- 0.5 * (x + y)
    lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
    rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
    0.5 * sum(lhs) + 0.5 * sum(rhs)
  }
  dist.mat <- proxy::dist(x = phi, method = jensenShannon)
  pca.fit <- stats::cmdscale(dist.mat, k = 2)
  data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}

    # Find required quantities
    phi <- as.matrix(posterior(fitted)$terms)
    theta <- as.matrix(posterior(fitted)$topics)
    vocab <- colnames(phi)
    term_freq <- slam::col_sums(doc_term)

    # Convert to json
    json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                            vocab = vocab,
                            doc.length = as.vector(table(doc_term$i)),
                            term.frequency = term_freq, mds.method = ls_LDA)

    return(json_lda)
}

產生LDAvis結果

4個主題LDAvis結果

# 設置alpha及delta參數
#devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))

####### 以下用來產生ldavis的檔案,可以之後用來在local端、放在網路上打開 ##########
 for(lda in ldas){
   
   k = lda@k ## lda 主題數
   if(k==2){next}
   json_res <- topicmodels_json_ldavis(lda,song_dtm)
   # serVis(json_res,open.browser = T)
   lda_dir =  paste0(k,"_ldavis")
   if(!dir.exists(lda_dir)){ dir.create("./",lda_dir)}
   
   serVis(json_res, out.dir =lda_dir, open.browser = F)
   
   writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"))
 }

topic_10 = ldas[[1]]
json_res <- topicmodels_json_ldavis(topic_10,song_dtm)

serVis(json_res,open.browser = T)

# 如果無法開啟視窗(windows用戶)可執行這段
#serVis(json_res, out.dir = "vis", open.browser = T)
#writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))

6個主題LDAvis結果

# 設置alpha及delta參數
#devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))

####### 以下用來產生ldavis的檔案,可以之後用來在local端、放在網路上打開 ##########
 for(lda in ldas){
   
   k = lda@k ## lda 主題數
   if(k==2){next}
   json_res <- topicmodels_json_ldavis(lda,song_dtm)
   # serVis(json_res,open.browser = T)
   lda_dir =  paste0(k,"_ldavis")
   if(!dir.exists(lda_dir)){ dir.create("./",lda_dir)}
   
   serVis(json_res, out.dir =lda_dir, open.browser = F)
   
   writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"))
 }

topic_10 = ldas[[2]]
json_res <- topicmodels_json_ldavis(topic_10,song_dtm)

serVis(json_res,open.browser = T)

# 如果無法開啟視窗(windows用戶)可執行這段
#serVis(json_res, out.dir = "vis", open.browser = T)
#writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))

LDA後續分析

  • 根據前面的探索之後,我們對於資料有更加了解,並且看完每個主題數的LDAvis之後,選定主題數4,6的結果來作後續的分析

主題為4的結果

song_lda_4 = ldas[[1]] ## 選定topic 為4 的結果

topics_4 <- tidy(song_lda_4, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics_4
## # A tibble: 13,868 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 愛情   6.79e-  3
##  2     2 愛情   1.92e-  3
##  3     3 愛情   5.18e-  3
##  4     4 愛情   2.53e-  3
##  5     1 別忘記 7.64e-110
##  6     2 別忘記 3.00e-111
##  7     3 別忘記 5.55e-109
##  8     4 別忘記 1.74e-  3
##  9     1 不痛   1.75e-109
## 10     2 不痛   2.92e-111
## # … with 13,858 more rows

尋找Topic的代表字

  • 整理出每一個Topic中生成概率最高的10個詞彙。
mycolors <- colorRampPalette(brewer.pal(3, "YlGnBu"))(5)

top_terms_4 <- topics_4 %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms_4 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values=mycolors)+
  facet_wrap(~ topic, scales = "free") +
  theme(text = element_text(family='STHeitiTC-Light')) +
  coord_flip()

產生四個主題,我們分別將其命名為:夢幻泡泡、受挫傷心、現代無腦、其他

主題命名

topic_name_4 = c('夢幻泡泡','受挫傷心','現代無腦','其他')

Document 4個主題分佈

# for every document we have a probability distribution of its contained topics
tmResult_4 <- posterior(song_lda_4)
doc_pro_4 <- tmResult_4$topics 
dim(doc_pro_4)               # nDocs(DTM) distributions over K topics
## [1] 100   4

共有100首歌,其分佈在四個主題之中。

cbind Document 主題分佈

# get document topic proportions 
document_topics_4 <- doc_pro_4[all_song_rank$Rank,]
document_topics_df_4 =data.frame(document_topics_4)
colnames(document_topics_df_4) = topic_name_4
rownames(document_topics_df_4) = NULL
song_topic_4 = cbind(all_song_rank,document_topics_df_4)
#song_topic %>% head(10)

查看特定主題的文章

  • 透過找到特定歌曲的分佈進行排序之後,可以看到此主題的比重高的歌曲排名大多為何。
song_topic_4 %>%
    arrange(desc(`夢幻泡泡`)) %>% head(10)

在『夢幻泡泡』中,排名大多屬中間名次居多。

將歌曲前100名,繪製其四個主題分佈

song_topic_4[,c(3:6)] =sapply(song_topic_4[,c(3:6)] , as.numeric)
as.numeric(song_topic_4$Rank) 
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100
song_topic_4 %>% 
  group_by(Rank) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "Rank")%>%
  ggplot(aes(x=Rank , y=value, fill=variable)) + 
  geom_bar(stat = "identity") + 
  scale_fill_manual(values=mycolors)+
  theme(text = element_text(family='STHeitiTC-Light'))

這張圖中,無法明確看出四個主題在歌曲中的分佈,因此我們將依排名次序做分組。

將五首歌曲為一組,重新繪製四個主題分佈

song_topic_4 %>% 
  group_by(Rank) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "Rank")%>%
  ggplot(aes(x=cut(Rank, c(0,5,10,15,20,25,30,35,40,45, 50,55,60,65,70,75,80,85,90,95,100),c("1-5","6-10","11-15","16-20","21-25","26-30","31-35","36-40","41-45","46-50","51-55","56-60","61-65","66-70","71-75","76-80","81-85","86-90","91-95","96-100")) , y=value, fill=variable)) + 
  geom_bar(stat = "identity")+xlab("Rank") + 
  scale_fill_manual(values=mycolors)+
  theme(text = element_text(family='STHeitiTC-Light'))+
  theme(axis.text.x = element_text(angle = 90))

從圖中可以發現『夢幻泡泡』的主題,以中間名次出現較多一點,而『現代無腦』的歌曲在前面及後面各有比較集中出現,中間名次反而較少出現。

主題為6的結果

song_lda_6 = ldas[[2]] ## 選定topic 為6 的結果

topics_6 <- tidy(song_lda_6, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics_6
## # A tibble: 20,802 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 愛情   4.23e-  3
##  2     2 愛情   5.60e-  3
##  3     3 愛情   3.92e-  3
##  4     4 愛情   3.78e-  3
##  5     5 愛情   5.94e-  3
##  6     6 愛情   8.01e-163
##  7     1 別忘記 7.54e-179
##  8     2 別忘記 3.32e-178
##  9     3 別忘記 4.45e-176
## 10     4 別忘記 2.52e-  3
## # … with 20,792 more rows

尋找Topic的代表字

  • 整理出每一個Topic中生成概率最高的10個詞彙。
mycolors <- colorRampPalette(brewer.pal(3, "YlGnBu"))(6)

top_terms_6 <- topics_6 %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms_6 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values=mycolors)+
  facet_wrap(~ topic, scales = "free") +
  theme(text = element_text(family='STHeitiTC-Light')) +
  coord_flip()

產生六個主題,我們分別將其命名為:祝福、療傷、詠物抒情、現代無腦、走不出來、回憶

主題命名

topic_name_6 = c('祝福','療傷','詠物抒情','現代無腦','走不出來','回憶')

Document 6個主題分佈

# for every document we have a probability distribution of its contained topics
tmResult_6 <- posterior(song_lda_6)
doc_pro_6 <- tmResult_6$topics 
dim(doc_pro_6)               # nDocs(DTM) distributions over K topics
## [1] 100   6

共有100首歌,其分佈在六個主題之中。

cbind Document 主題分佈

# get document topic proportions 
document_topics_6 <- doc_pro_6[all_song_rank$Rank,]
document_topics_df_6 =data.frame(document_topics_6)
colnames(document_topics_df_6) = topic_name_6
rownames(document_topics_df_6) = NULL
song_topic_6 = cbind(all_song_rank,document_topics_df_6)
#song_topic %>% head(10)

查看特定主題的文章

  • 透過找到特定歌曲的分佈進行排序之後,可以看到此主題的比重高的歌曲排名大多為何。
song_topic_6 %>%
    arrange(desc(`祝福`)) %>% head(10)

在『祝福』中,排名大多屬後面名次居多。

將歌曲前100名,繪製其六個主題分佈

song_topic_6[,c(3:6)] =sapply(song_topic_6[,c(3:6)] , as.numeric)
as.numeric(song_topic_6$Rank) 
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100
song_topic_6 %>% 
  group_by(Rank) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "Rank")%>%
  ggplot(aes(x=Rank , y=value, fill=variable)) + 
  geom_bar(stat = "identity") + 
  scale_fill_manual(values=mycolors)+
  theme(text = element_text(family='STHeitiTC-Light'))

這張圖中,無法明確看出X六個主題在歌曲中的分佈,因此我們將依排名次序做分組。

將五首歌曲為一組,重新繪製六個主題分佈

song_topic_6 %>% 
  group_by(Rank) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "Rank")%>%
  ggplot(aes(x=cut(Rank, c(0,5,10,15,20,25,30,35,40,45, 50,55,60,65,70,75,80,85,90,95,100),c("1-5","6-10","11-15","16-20","21-25","26-30","31-35","36-40","41-45","46-50","51-55","56-60","61-65","66-70","71-75","76-80","81-85","86-90","91-95","96-100")) , y=value, fill=variable)) + 
  geom_bar(stat = "identity")+xlab("Rank") + 
  scale_fill_manual(values=mycolors)+
  theme(text = element_text(family='STHeitiTC-Light'))+
  theme(axis.text.x = element_text(angle = 90))

從圖中可以發現『療傷』和『祝福』的主題,分布的很平均(排行榜的各個階段都有出現),而『詠物抒情』的歌曲在前面比較集中出現。