分析題目：KTV熱門排行榜歌曲詞彙與風格分析

資料來源："mymusic.net KTV熱唱必備►國語K歌排行超精選

https://www.mymusic.net.tw/ux/w/themeInfo/3354

系統設置

系統參數設定

## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"

安裝需要的packages

#install.packages("modeltools")   # a dependency of topicmodels
#install.packages("topicmodels", type="source")  # should compile from source

require(dplyr)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

require(tidytext)

## Loading required package: tidytext

require(jiebaR)

## Loading required package: jiebaR

## Loading required package: jiebaRD

require(gutenbergr)

## Loading required package: gutenbergr

require(stringr)

## Loading required package: stringr

require(wordcloud2)

## Loading required package: wordcloud2

require(ggplot2)

## Loading required package: ggplot2

require(tidyr)

## Loading required package: tidyr

require(scales)

## Loading required package: scales

require(reshape2)

## Loading required package: reshape2

## 
## Attaching package: 'reshape2'

## The following object is masked from 'package:tidyr':
## 
##     smiths

require(readr)

## Loading required package: readr

## 
## Attaching package: 'readr'

## The following object is masked from 'package:scales':
## 
##     col_factor

require(knitr)

## Loading required package: knitr

require(wordcloud)

## Loading required package: wordcloud

## Loading required package: RColorBrewer

require(tm)

## Loading required package: tm

## Loading required package: NLP

## 
## Attaching package: 'NLP'

## The following object is masked from 'package:ggplot2':
## 
##     annotate

require(readr)
require(tm)
require(data.table)

## Loading required package: data.table

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)

## Loading required package: udpipe

require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)

## Loading required package: topicmodels

require(LDAvis)

## Loading required package: LDAvis

require(wordcloud2)
require(webshot)

## Loading required package: webshot

require(htmlwidgets)

## Loading required package: htmlwidgets

require(servr)

## Loading required package: servr

require(purrr)

## Loading required package: purrr

## 
## Attaching package: 'purrr'

## The following object is masked from 'package:data.table':
## 
##     transpose

## The following object is masked from 'package:scales':
## 
##     discard

require(ramify)

## Loading required package: ramify

## 
## Attaching package: 'ramify'

## The following object is masked from 'package:purrr':
## 
##     flatten

## The following object is masked from 'package:webshot':
## 
##     resize

## The following object is masked from 'package:tidyr':
## 
##     fill

## The following object is masked from 'package:graphics':
## 
##     clip

require(RColorBrewer)
require(MASS)

## Loading required package: MASS

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

require(modeltools)

## Loading required package: modeltools

## Loading required package: stats4

匯入資料

Tokenization

# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()
all_song_rank <- read.csv("Lyric_All.csv", fileEncoding="Big-5", fill=TRUE)
all_song_rank$Lyric <- as.character(all_song_rank$Lyric)

jieba_tokenizer <- worker(user="user_dict_All.txt", stop_word = "stop_words_All.txt")

# 設定斷詞function
song_tokenizer <- function(t) {
  lapply(t, function(x) {
    tokens <- segment(x, jieba_tokenizer)
    return(tokens)
  })
}

tokens <- all_song_rank %>%
  unnest_tokens(word, Lyric, token=song_tokenizer) %>%
  filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
  count(Rank, word) %>%
  rename(count=n)

tokens %>% head(20)

## # A tibble: 20 x 3
##     Rank word   count
##    <int> <chr>  <int>
##  1     1 愛情       2
##  2     1 別忘記     2
##  3     1 不痛       2
##  4     1 逞強       2
##  5     1 觸碰       2
##  6     1 發光       3
##  7     1 放棄       2
##  8     1 奮力       3
##  9     1 風         2
## 10     1 害怕       5
## 11     1 記得       1
## 12     1 記著       2
## 13     1 沮喪       2
## 14     1 落寞       2
## 15     1 迷惘       2
## 16     1 人生       2
## 17     1 身旁       6
## 18     1 聲音       2
## 19     1 失望       2
## 20     1 所有的     1

將資料轉換為DTM

song_dtm <- tokens %>% cast_dtm(Rank,word,count)
song_dtm

## <<DocumentTermMatrix (documents: 100, terms: 3467)>>
## Non-/sparse entries: 5697/341003
## Sparsity           : 98%
## Maximal term length: 6
## Weighting          : term frequency (tf)

inspect(song_dtm[1:10,1:10])

## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 21/79
## Sparsity           : 79%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## Sample             :
##     Terms
## Docs 愛情 別忘記 不痛 逞強 觸碰 發光 放棄 奮力 風 害怕
##   1     2      2    2    2    2    3    2    3  2    5
##   10    0      0    0    0    0    0    0    0  0    0
##   2     0      0    0    0    0    0    0    0  0    0
##   3     2      2    2    2    2    3    2    3  2    5
##   4     0      0    0    0    0    0    0    0  0    0
##   5     0      0    0    0    0    0    0    0  0    0
##   6     4      0    0    0    0    0    0    0  0    0
##   7     0      0    0    0    0    0    0    0  0    0
##   8     0      0    0    0    0    0    0    0  0    0
##   9     0      0    0    0    0    0    0    0  0    0

查看DTM矩陣，可以發現是個稀疏矩陣

建立LDA模型

lda_4 <- LDA(song_dtm, k = 4, control = list(seed = 2020))

先將LDA模型，主題數數定為 4

\(\phi\) Matrix

查看\(\phi\) matrix (topic * term)

topics4 <- tidy(lda_4, matrix = "beta") # 注意，在tidy function裡面要使用"beta"來取出Phi矩陣。
topics4

## # A tibble: 13,868 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 愛情   6.79e-  3
##  2     2 愛情   1.92e-  3
##  3     3 愛情   5.18e-  3
##  4     4 愛情   2.53e-  3
##  5     1 別忘記 7.64e-110
##  6     2 別忘記 3.00e-111
##  7     3 別忘記 5.55e-109
##  8     4 別忘記 1.74e-  3
##  9     1 不痛   1.75e-109
## 10     2 不痛   2.92e-111
## # … with 13,858 more rows

繪製四個主題的常見詞彙

top_terms <- topics4 %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)
top_terms %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  theme(text = element_text(family='STHeitiTC-Light'))+
  coord_flip()

透過上方的圖，看不太出每個主題的分類依據，嘗試看看分多一點topics

LDAvis

產生create LDAvis所需的json function

此function是將前面使用 “LDA function”所建立的model，轉換為“LDAVis”套件的input格式。

topicmodels_json_ldavis <- function(fitted, doc_term){
    require(LDAvis)
    require(slam)
  
   ###以下function 用來解決，主題數多會出現NA的問題
  ## 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
ls_LDA = function (phi)
{
  jensenShannon <- function(x, y) {
      m <- 0.5 * (x + y)
    lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
    rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
    0.5 * sum(lhs) + 0.5 * sum(rhs)
  }
  dist.mat <- proxy::dist(x = phi, method = jensenShannon)
  pca.fit <- stats::cmdscale(dist.mat, k = 2)
  data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}

    # Find required quantities
    phi <- as.matrix(posterior(fitted)$terms)
    theta <- as.matrix(posterior(fitted)$topics)
    vocab <- colnames(phi)
    term_freq <- slam::col_sums(doc_term)

    # Convert to json
    json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
                            vocab = vocab,
                            doc.length = as.vector(table(doc_term$i)),
                            term.frequency = term_freq, mds.method = ls_LDA)

    return(json_lda)
}

產生LDAvis結果

4個主題LDAvis結果

# 設置alpha及delta參數
#devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))

####### 以下用來產生ldavis的檔案，可以之後用來在local端、放在網路上打開 ##########
 for(lda in ldas){
   
   k = lda@k ## lda 主題數
   if(k==2){next}
   json_res <- topicmodels_json_ldavis(lda,song_dtm)
   # serVis(json_res,open.browser = T)
   lda_dir =  paste0(k,"_ldavis")
   if(!dir.exists(lda_dir)){ dir.create("./",lda_dir)}
   
   serVis(json_res, out.dir =lda_dir, open.browser = F)
   
   writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"))
 }

topic_10 = ldas[[1]]
json_res <- topicmodels_json_ldavis(topic_10,song_dtm)

serVis(json_res,open.browser = T)

# 如果無法開啟視窗(windows用戶)可執行這段
#serVis(json_res, out.dir = "vis", open.browser = T)
#writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))

6個主題LDAvis結果

# 設置alpha及delta參數
#devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))

####### 以下用來產生ldavis的檔案，可以之後用來在local端、放在網路上打開 ##########
 for(lda in ldas){
   
   k = lda@k ## lda 主題數
   if(k==2){next}
   json_res <- topicmodels_json_ldavis(lda,song_dtm)
   # serVis(json_res,open.browser = T)
   lda_dir =  paste0(k,"_ldavis")
   if(!dir.exists(lda_dir)){ dir.create("./",lda_dir)}
   
   serVis(json_res, out.dir =lda_dir, open.browser = F)
   
   writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"))
 }

topic_10 = ldas[[2]]
json_res <- topicmodels_json_ldavis(topic_10,song_dtm)

serVis(json_res,open.browser = T)

# 如果無法開啟視窗(windows用戶)可執行這段
#serVis(json_res, out.dir = "vis", open.browser = T)
#writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))

LDA後續分析

根據前面的探索之後，我們對於資料有更加了解，並且看完每個主題數的LDAvis之後，選定主題數4,6的結果來作後續的分析

主題為4的結果

song_lda_4 = ldas[[1]] ## 選定topic 為4 的結果

topics_4 <- tidy(song_lda_4, matrix = "beta") # 注意，在tidy function裡面要使用"beta"來取出Phi矩陣。
topics_4

## # A tibble: 13,868 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 愛情   6.79e-  3
##  2     2 愛情   1.92e-  3
##  3     3 愛情   5.18e-  3
##  4     4 愛情   2.53e-  3
##  5     1 別忘記 7.64e-110
##  6     2 別忘記 3.00e-111
##  7     3 別忘記 5.55e-109
##  8     4 別忘記 1.74e-  3
##  9     1 不痛   1.75e-109
## 10     2 不痛   2.92e-111
## # … with 13,858 more rows

尋找Topic的代表字

整理出每一個Topic中生成概率最高的10個詞彙。

mycolors <- colorRampPalette(brewer.pal(3, "YlGnBu"))(5)

top_terms_4 <- topics_4 %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms_4 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values=mycolors)+
  facet_wrap(~ topic, scales = "free") +
  theme(text = element_text(family='STHeitiTC-Light')) +
  coord_flip()

產生四個主題，我們分別將其命名為：夢幻泡泡、受挫傷心、現代無腦、其他

主題命名

topic_name_4 = c('夢幻泡泡','受挫傷心','現代無腦','其他')

Document 4個主題分佈

# for every document we have a probability distribution of its contained topics
tmResult_4 <- posterior(song_lda_4)
doc_pro_4 <- tmResult_4$topics 
dim(doc_pro_4)               # nDocs(DTM) distributions over K topics

## [1] 100   4

共有100首歌，其分佈在四個主題之中。

cbind Document 主題分佈

# get document topic proportions 
document_topics_4 <- doc_pro_4[all_song_rank$Rank,]
document_topics_df_4 =data.frame(document_topics_4)
colnames(document_topics_df_4) = topic_name_4
rownames(document_topics_df_4) = NULL
song_topic_4 = cbind(all_song_rank,document_topics_df_4)
#song_topic %>% head(10)

查看特定主題的文章

透過找到特定歌曲的分佈進行排序之後，可以看到此主題的比重高的歌曲排名大多為何。

song_topic_4 %>%
    arrange(desc(`夢幻泡泡`)) %>% head(10)

在『夢幻泡泡』中，排名大多屬中間名次居多。

將歌曲前100名，繪製其四個主題分佈

song_topic_4[,c(3:6)] =sapply(song_topic_4[,c(3:6)] , as.numeric)
as.numeric(song_topic_4$Rank)

##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100

song_topic_4 %>% 
  group_by(Rank) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "Rank")%>%
  ggplot(aes(x=Rank , y=value, fill=variable)) + 
  geom_bar(stat = "identity") + 
  scale_fill_manual(values=mycolors)+
  theme(text = element_text(family='STHeitiTC-Light'))

這張圖中，無法明確看出四個主題在歌曲中的分佈，因此我們將依排名次序做分組。

將五首歌曲為一組，重新繪製四個主題分佈

song_topic_4 %>% 
  group_by(Rank) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "Rank")%>%
  ggplot(aes(x=cut(Rank, c(0,5,10,15,20,25,30,35,40,45, 50,55,60,65,70,75,80,85,90,95,100),c("1-5","6-10","11-15","16-20","21-25","26-30","31-35","36-40","41-45","46-50","51-55","56-60","61-65","66-70","71-75","76-80","81-85","86-90","91-95","96-100")) , y=value, fill=variable)) + 
  geom_bar(stat = "identity")+xlab("Rank") + 
  scale_fill_manual(values=mycolors)+
  theme(text = element_text(family='STHeitiTC-Light'))+
  theme(axis.text.x = element_text(angle = 90))

從圖中可以發現『夢幻泡泡』的主題，以中間名次出現較多一點，而『現代無腦』的歌曲在前面及後面各有比較集中出現，中間名次反而較少出現。

主題為6的結果

song_lda_6 = ldas[[2]] ## 選定topic 為6 的結果

topics_6 <- tidy(song_lda_6, matrix = "beta") # 注意，在tidy function裡面要使用"beta"來取出Phi矩陣。
topics_6

## # A tibble: 20,802 x 3
##    topic term        beta
##    <int> <chr>      <dbl>
##  1     1 愛情   4.23e-  3
##  2     2 愛情   5.60e-  3
##  3     3 愛情   3.92e-  3
##  4     4 愛情   3.78e-  3
##  5     5 愛情   5.94e-  3
##  6     6 愛情   8.01e-163
##  7     1 別忘記 7.54e-179
##  8     2 別忘記 3.32e-178
##  9     3 別忘記 4.45e-176
## 10     4 別忘記 2.52e-  3
## # … with 20,792 more rows

尋找Topic的代表字

整理出每一個Topic中生成概率最高的10個詞彙。

mycolors <- colorRampPalette(brewer.pal(3, "YlGnBu"))(6)

top_terms_6 <- topics_6 %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  arrange(topic, -beta)


top_terms_6 %>%
  mutate(term = reorder(term, beta)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_fill_manual(values=mycolors)+
  facet_wrap(~ topic, scales = "free") +
  theme(text = element_text(family='STHeitiTC-Light')) +
  coord_flip()

產生六個主題，我們分別將其命名為：祝福、療傷、詠物抒情、現代無腦、走不出來、回憶

主題命名

topic_name_6 = c('祝福','療傷','詠物抒情','現代無腦','走不出來','回憶')

Document 6個主題分佈

# for every document we have a probability distribution of its contained topics
tmResult_6 <- posterior(song_lda_6)
doc_pro_6 <- tmResult_6$topics 
dim(doc_pro_6)               # nDocs(DTM) distributions over K topics

## [1] 100   6

共有100首歌，其分佈在六個主題之中。

cbind Document 主題分佈

# get document topic proportions 
document_topics_6 <- doc_pro_6[all_song_rank$Rank,]
document_topics_df_6 =data.frame(document_topics_6)
colnames(document_topics_df_6) = topic_name_6
rownames(document_topics_df_6) = NULL
song_topic_6 = cbind(all_song_rank,document_topics_df_6)
#song_topic %>% head(10)

查看特定主題的文章

透過找到特定歌曲的分佈進行排序之後，可以看到此主題的比重高的歌曲排名大多為何。

song_topic_6 %>%
    arrange(desc(`祝福`)) %>% head(10)

在『祝福』中，排名大多屬後面名次居多。

將歌曲前100名，繪製其六個主題分佈

song_topic_6[,c(3:6)] =sapply(song_topic_6[,c(3:6)] , as.numeric)
as.numeric(song_topic_6$Rank)

##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34
##  [35]  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51
##  [52]  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68
##  [69]  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84  85
##  [86]  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100

song_topic_6 %>% 
  group_by(Rank) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "Rank")%>%
  ggplot(aes(x=Rank , y=value, fill=variable)) + 
  geom_bar(stat = "identity") + 
  scale_fill_manual(values=mycolors)+
  theme(text = element_text(family='STHeitiTC-Light'))

這張圖中，無法明確看出X六個主題在歌曲中的分佈，因此我們將依排名次序做分組。

將五首歌曲為一組，重新繪製六個主題分佈

song_topic_6 %>% 
  group_by(Rank) %>%
  summarise_if(is.numeric, sum, na.rm = TRUE) %>%
  melt(id.vars = "Rank")%>%
  ggplot(aes(x=cut(Rank, c(0,5,10,15,20,25,30,35,40,45, 50,55,60,65,70,75,80,85,90,95,100),c("1-5","6-10","11-15","16-20","21-25","26-30","31-35","36-40","41-45","46-50","51-55","56-60","61-65","66-70","71-75","76-80","81-85","86-90","91-95","96-100")) , y=value, fill=variable)) + 
  geom_bar(stat = "identity")+xlab("Rank") + 
  scale_fill_manual(values=mycolors)+
  theme(text = element_text(family='STHeitiTC-Light'))+
  theme(axis.text.x = element_text(angle = 90))

從圖中可以發現『療傷』和『祝福』的主題，分布的很平均(排行榜的各個階段都有出現)，而『詠物抒情』的歌曲在前面比較集中出現。

KTV熱門排行榜歌曲分析

讀書會第四組

2020/05/12

系統設置

系統參數設定

安裝需要的packages

匯入資料

Tokenization

將資料轉換為DTM

建立LDA模型

\(\phi\) Matrix

查看\(\phi\) matrix (topic * term)

繪製四個主題的常見詞彙

更多主題

LDAvis

產生create LDAvis所需的json function

產生LDAvis結果

4個主題LDAvis結果

6個主題LDAvis結果

LDA後續分析

主題為4的結果

尋找Topic的代表字

主題命名

Document 4個主題分佈

cbind Document 主題分佈

查看特定主題的文章

將歌曲前100名，繪製其四個主題分佈

將五首歌曲為一組，重新繪製四個主題分佈

主題為6的結果

尋找Topic的代表字

主題命名

Document 6個主題分佈

cbind Document 主題分佈

查看特定主題的文章

將歌曲前100名，繪製其六個主題分佈

將五首歌曲為一組，重新繪製六個主題分佈

KTV熱門排行榜歌曲分析

讀書會 第四組

2020/05/12

系統設置

系統參數設定

安裝需要的packages

匯入資料

Tokenization

將資料轉換為DTM

建立LDA模型

\(\phi\) Matrix

查看\(\phi\) matrix (topic * term)

繪製四個主題的常見詞彙

更多主題

LDAvis

產生create LDAvis所需的json function

產生LDAvis結果

4個主題LDAvis結果

6個主題LDAvis結果

LDA後續分析

主題為4的結果

尋找Topic的代表字

主題命名

Document 4個主題分佈

cbind Document 主題分佈

查看特定主題的文章

將歌曲前100名，繪製其四個主題分佈

將五首歌曲為一組，重新繪製四個主題分佈

主題為6的結果

尋找Topic的代表字

主題命名

Document 6個主題分佈

cbind Document 主題分佈

查看特定主題的文章

將歌曲前100名，繪製其六個主題分佈

將五首歌曲為一組，重新繪製六個主題分佈

讀書會第四組