分析題目:KTV熱門排行榜歌曲詞彙與風格分析
資料來源:"mymusic.net KTV熱唱必備►國語K歌排行超精選
https://www.mymusic.net.tw/ux/w/themeInfo/3354
## [1] "zh_TW.UTF-8/zh_TW.UTF-8/zh_TW.UTF-8/C/zh_TW.UTF-8/zh_TW.UTF-8"
#install.packages("modeltools") # a dependency of topicmodels
#install.packages("topicmodels", type="source") # should compile from source
require(dplyr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
require(tidytext)
## Loading required package: tidytext
require(jiebaR)
## Loading required package: jiebaR
## Loading required package: jiebaRD
require(gutenbergr)
## Loading required package: gutenbergr
require(stringr)
## Loading required package: stringr
require(wordcloud2)
## Loading required package: wordcloud2
require(ggplot2)
## Loading required package: ggplot2
require(tidyr)
## Loading required package: tidyr
require(scales)
## Loading required package: scales
require(reshape2)
## Loading required package: reshape2
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
require(readr)
## Loading required package: readr
##
## Attaching package: 'readr'
## The following object is masked from 'package:scales':
##
## col_factor
require(knitr)
## Loading required package: knitr
require(wordcloud)
## Loading required package: wordcloud
## Loading required package: RColorBrewer
require(tm)
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
require(readr)
require(tm)
require(data.table)
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, first, last
require(dplyr)
require(stringr)
require(jiebaR)
require(udpipe)
## Loading required package: udpipe
require(tidytext)
require(ggplot2)
require(tidyr)
require(topicmodels)
## Loading required package: topicmodels
require(LDAvis)
## Loading required package: LDAvis
require(wordcloud2)
require(webshot)
## Loading required package: webshot
require(htmlwidgets)
## Loading required package: htmlwidgets
require(servr)
## Loading required package: servr
require(purrr)
## Loading required package: purrr
##
## Attaching package: 'purrr'
## The following object is masked from 'package:data.table':
##
## transpose
## The following object is masked from 'package:scales':
##
## discard
require(ramify)
## Loading required package: ramify
##
## Attaching package: 'ramify'
## The following object is masked from 'package:purrr':
##
## flatten
## The following object is masked from 'package:webshot':
##
## resize
## The following object is masked from 'package:tidyr':
##
## fill
## The following object is masked from 'package:graphics':
##
## clip
require(RColorBrewer)
require(MASS)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
require(modeltools)
## Loading required package: modeltools
## Loading required package: stats4
# 使用默認參數初始化一個斷詞引擎
jieba_tokenizer = worker()
all_song_rank <- read.csv("Lyric_All.csv", fileEncoding="Big-5", fill=TRUE)
all_song_rank$Lyric <- as.character(all_song_rank$Lyric)
jieba_tokenizer <- worker(user="user_dict_All.txt", stop_word = "stop_words_All.txt")
# 設定斷詞function
song_tokenizer <- function(t) {
lapply(t, function(x) {
tokens <- segment(x, jieba_tokenizer)
return(tokens)
})
}
tokens <- all_song_rank %>%
unnest_tokens(word, Lyric, token=song_tokenizer) %>%
filter(!str_detect(word, regex("[0-9a-zA-Z]"))) %>%
count(Rank, word) %>%
rename(count=n)
tokens %>% head(20)
## # A tibble: 20 x 3
## Rank word count
## <int> <chr> <int>
## 1 1 愛情 2
## 2 1 別忘記 2
## 3 1 不痛 2
## 4 1 逞強 2
## 5 1 觸碰 2
## 6 1 發光 3
## 7 1 放棄 2
## 8 1 奮力 3
## 9 1 風 2
## 10 1 害怕 5
## 11 1 記得 1
## 12 1 記著 2
## 13 1 沮喪 2
## 14 1 落寞 2
## 15 1 迷惘 2
## 16 1 人生 2
## 17 1 身旁 6
## 18 1 聲音 2
## 19 1 失望 2
## 20 1 所有的 1
song_dtm <- tokens %>% cast_dtm(Rank,word,count)
song_dtm
## <<DocumentTermMatrix (documents: 100, terms: 3467)>>
## Non-/sparse entries: 5697/341003
## Sparsity : 98%
## Maximal term length: 6
## Weighting : term frequency (tf)
inspect(song_dtm[1:10,1:10])
## <<DocumentTermMatrix (documents: 10, terms: 10)>>
## Non-/sparse entries: 21/79
## Sparsity : 79%
## Maximal term length: 3
## Weighting : term frequency (tf)
## Sample :
## Terms
## Docs 愛情 別忘記 不痛 逞強 觸碰 發光 放棄 奮力 風 害怕
## 1 2 2 2 2 2 3 2 3 2 5
## 10 0 0 0 0 0 0 0 0 0 0
## 2 0 0 0 0 0 0 0 0 0 0
## 3 2 2 2 2 2 3 2 3 2 5
## 4 0 0 0 0 0 0 0 0 0 0
## 5 0 0 0 0 0 0 0 0 0 0
## 6 4 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0 0
## 9 0 0 0 0 0 0 0 0 0 0
查看DTM矩陣,可以發現是個稀疏矩陣
lda_4 <- LDA(song_dtm, k = 4, control = list(seed = 2020))
先將LDA模型,主題數數定為 4
topics4 <- tidy(lda_4, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics4
## # A tibble: 13,868 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 愛情 6.79e- 3
## 2 2 愛情 1.92e- 3
## 3 3 愛情 5.18e- 3
## 4 4 愛情 2.53e- 3
## 5 1 別忘記 7.64e-110
## 6 2 別忘記 3.00e-111
## 7 3 別忘記 5.55e-109
## 8 4 別忘記 1.74e- 3
## 9 1 不痛 1.75e-109
## 10 2 不痛 2.92e-111
## # … with 13,858 more rows
top_terms <- topics4 %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
theme(text = element_text(family='STHeitiTC-Light'))+
coord_flip()
透過上方的圖,看不太出每個主題的分類依據,嘗試看看分多一點topics
ldas = c()
topics = c(4,6,8,10,12)
for(topic in topics){
start_time <- Sys.time()
lda <- LDA(song_dtm, k = topic, control = list(seed = 2020))
ldas =c(ldas,lda)
print(paste(topic ,paste("topic(s) and use time is ", Sys.time() -start_time)))
save(ldas,file = "ldas_result")
}
load("ldas_result")
topics = c(4,6,8,10,12)
data_frame(k = topics,
perplex = map_dbl(ldas, topicmodels::perplexity)) %>%
ggplot(aes(k, perplex)) +
geom_point() +
geom_line() +
labs(title = "Evaluating LDA topic models",
subtitle = "Optimal number of topics (smaller is better)",
x = "Number of topics",
y = "Perplexity")
## Warning: `data_frame()` is deprecated, use `tibble()`.
## This warning is displayed once per session.
從圖中可以看出,4,6個主題為較適合,故以下將以4,6主題做討論
topicmodels_json_ldavis <- function(fitted, doc_term){
require(LDAvis)
require(slam)
###以下function 用來解決,主題數多會出現NA的問題
## 參考 https://github.com/cpsievert/LDAvis/commit/c7234d71168b1e946a361bc00593bc5c4bf8e57e
ls_LDA = function (phi)
{
jensenShannon <- function(x, y) {
m <- 0.5 * (x + y)
lhs <- ifelse(x == 0, 0, x * (log(x) - log(m+1e-16)))
rhs <- ifelse(y == 0, 0, y * (log(y) - log(m+1e-16)))
0.5 * sum(lhs) + 0.5 * sum(rhs)
}
dist.mat <- proxy::dist(x = phi, method = jensenShannon)
pca.fit <- stats::cmdscale(dist.mat, k = 2)
data.frame(x = pca.fit[, 1], y = pca.fit[, 2])
}
# Find required quantities
phi <- as.matrix(posterior(fitted)$terms)
theta <- as.matrix(posterior(fitted)$topics)
vocab <- colnames(phi)
term_freq <- slam::col_sums(doc_term)
# Convert to json
json_lda <- LDAvis::createJSON(phi = phi, theta = theta,
vocab = vocab,
doc.length = as.vector(table(doc_term$i)),
term.frequency = term_freq, mds.method = ls_LDA)
return(json_lda)
}
# 設置alpha及delta參數
#devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))
####### 以下用來產生ldavis的檔案,可以之後用來在local端、放在網路上打開 ##########
for(lda in ldas){
k = lda@k ## lda 主題數
if(k==2){next}
json_res <- topicmodels_json_ldavis(lda,song_dtm)
# serVis(json_res,open.browser = T)
lda_dir = paste0(k,"_ldavis")
if(!dir.exists(lda_dir)){ dir.create("./",lda_dir)}
serVis(json_res, out.dir =lda_dir, open.browser = F)
writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"))
}
topic_10 = ldas[[1]]
json_res <- topicmodels_json_ldavis(topic_10,song_dtm)
serVis(json_res,open.browser = T)
# 如果無法開啟視窗(windows用戶)可執行這段
#serVis(json_res, out.dir = "vis", open.browser = T)
#writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))
# 設置alpha及delta參數
#devotion_lda_removed <- LDA(devotion_dtm_removed, k = 4, method = "Gibbs", control = list(seed = 1234, alpha = 2, delta= 0.1))
####### 以下用來產生ldavis的檔案,可以之後用來在local端、放在網路上打開 ##########
for(lda in ldas){
k = lda@k ## lda 主題數
if(k==2){next}
json_res <- topicmodels_json_ldavis(lda,song_dtm)
# serVis(json_res,open.browser = T)
lda_dir = paste0(k,"_ldavis")
if(!dir.exists(lda_dir)){ dir.create("./",lda_dir)}
serVis(json_res, out.dir =lda_dir, open.browser = F)
writeLines(iconv(readLines(paste0(lda_dir,"/lda.json")), to = "UTF8"))
}
topic_10 = ldas[[2]]
json_res <- topicmodels_json_ldavis(topic_10,song_dtm)
serVis(json_res,open.browser = T)
# 如果無法開啟視窗(windows用戶)可執行這段
#serVis(json_res, out.dir = "vis", open.browser = T)
#writeLines(iconv(readLines("./vis/lda.json"), to = "UTF8"))
song_lda_4 = ldas[[1]] ## 選定topic 為4 的結果
topics_4 <- tidy(song_lda_4, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics_4
## # A tibble: 13,868 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 愛情 6.79e- 3
## 2 2 愛情 1.92e- 3
## 3 3 愛情 5.18e- 3
## 4 4 愛情 2.53e- 3
## 5 1 別忘記 7.64e-110
## 6 2 別忘記 3.00e-111
## 7 3 別忘記 5.55e-109
## 8 4 別忘記 1.74e- 3
## 9 1 不痛 1.75e-109
## 10 2 不痛 2.92e-111
## # … with 13,858 more rows
mycolors <- colorRampPalette(brewer.pal(3, "YlGnBu"))(5)
top_terms_4 <- topics_4 %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_4 %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
theme(text = element_text(family='STHeitiTC-Light')) +
coord_flip()
產生四個主題,我們分別將其命名為:夢幻泡泡、受挫傷心、現代無腦、其他
topic_name_4 = c('夢幻泡泡','受挫傷心','現代無腦','其他')
# for every document we have a probability distribution of its contained topics
tmResult_4 <- posterior(song_lda_4)
doc_pro_4 <- tmResult_4$topics
dim(doc_pro_4) # nDocs(DTM) distributions over K topics
## [1] 100 4
共有100首歌,其分佈在四個主題之中。
# get document topic proportions
document_topics_4 <- doc_pro_4[all_song_rank$Rank,]
document_topics_df_4 =data.frame(document_topics_4)
colnames(document_topics_df_4) = topic_name_4
rownames(document_topics_df_4) = NULL
song_topic_4 = cbind(all_song_rank,document_topics_df_4)
#song_topic %>% head(10)
song_topic_4 %>%
arrange(desc(`夢幻泡泡`)) %>% head(10)
在『夢幻泡泡』中,排名大多屬中間名次居多。
song_topic_4[,c(3:6)] =sapply(song_topic_4[,c(3:6)] , as.numeric)
as.numeric(song_topic_4$Rank)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
song_topic_4 %>%
group_by(Rank) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "Rank")%>%
ggplot(aes(x=Rank , y=value, fill=variable)) +
geom_bar(stat = "identity") +
scale_fill_manual(values=mycolors)+
theme(text = element_text(family='STHeitiTC-Light'))
這張圖中,無法明確看出四個主題在歌曲中的分佈,因此我們將依排名次序做分組。
song_topic_4 %>%
group_by(Rank) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "Rank")%>%
ggplot(aes(x=cut(Rank, c(0,5,10,15,20,25,30,35,40,45, 50,55,60,65,70,75,80,85,90,95,100),c("1-5","6-10","11-15","16-20","21-25","26-30","31-35","36-40","41-45","46-50","51-55","56-60","61-65","66-70","71-75","76-80","81-85","86-90","91-95","96-100")) , y=value, fill=variable)) +
geom_bar(stat = "identity")+xlab("Rank") +
scale_fill_manual(values=mycolors)+
theme(text = element_text(family='STHeitiTC-Light'))+
theme(axis.text.x = element_text(angle = 90))
從圖中可以發現『夢幻泡泡』的主題,以中間名次出現較多一點,而『現代無腦』的歌曲在前面及後面各有比較集中出現,中間名次反而較少出現。
song_lda_6 = ldas[[2]] ## 選定topic 為6 的結果
topics_6 <- tidy(song_lda_6, matrix = "beta") # 注意,在tidy function裡面要使用"beta"來取出Phi矩陣。
topics_6
## # A tibble: 20,802 x 3
## topic term beta
## <int> <chr> <dbl>
## 1 1 愛情 4.23e- 3
## 2 2 愛情 5.60e- 3
## 3 3 愛情 3.92e- 3
## 4 4 愛情 3.78e- 3
## 5 5 愛情 5.94e- 3
## 6 6 愛情 8.01e-163
## 7 1 別忘記 7.54e-179
## 8 2 別忘記 3.32e-178
## 9 3 別忘記 4.45e-176
## 10 4 別忘記 2.52e- 3
## # … with 20,792 more rows
mycolors <- colorRampPalette(brewer.pal(3, "YlGnBu"))(6)
top_terms_6 <- topics_6 %>%
group_by(topic) %>%
top_n(10, beta) %>%
ungroup() %>%
arrange(topic, -beta)
top_terms_6 %>%
mutate(term = reorder(term, beta)) %>%
ggplot(aes(term, beta, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
scale_fill_manual(values=mycolors)+
facet_wrap(~ topic, scales = "free") +
theme(text = element_text(family='STHeitiTC-Light')) +
coord_flip()
產生六個主題,我們分別將其命名為:祝福、療傷、詠物抒情、現代無腦、走不出來、回憶
topic_name_6 = c('祝福','療傷','詠物抒情','現代無腦','走不出來','回憶')
# for every document we have a probability distribution of its contained topics
tmResult_6 <- posterior(song_lda_6)
doc_pro_6 <- tmResult_6$topics
dim(doc_pro_6) # nDocs(DTM) distributions over K topics
## [1] 100 6
共有100首歌,其分佈在六個主題之中。
# get document topic proportions
document_topics_6 <- doc_pro_6[all_song_rank$Rank,]
document_topics_df_6 =data.frame(document_topics_6)
colnames(document_topics_df_6) = topic_name_6
rownames(document_topics_df_6) = NULL
song_topic_6 = cbind(all_song_rank,document_topics_df_6)
#song_topic %>% head(10)
song_topic_6 %>%
arrange(desc(`祝福`)) %>% head(10)
在『祝福』中,排名大多屬後面名次居多。
song_topic_6[,c(3:6)] =sapply(song_topic_6[,c(3:6)] , as.numeric)
as.numeric(song_topic_6$Rank)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34
## [35] 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51
## [52] 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68
## [69] 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85
## [86] 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
song_topic_6 %>%
group_by(Rank) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "Rank")%>%
ggplot(aes(x=Rank , y=value, fill=variable)) +
geom_bar(stat = "identity") +
scale_fill_manual(values=mycolors)+
theme(text = element_text(family='STHeitiTC-Light'))
這張圖中,無法明確看出X六個主題在歌曲中的分佈,因此我們將依排名次序做分組。
song_topic_6 %>%
group_by(Rank) %>%
summarise_if(is.numeric, sum, na.rm = TRUE) %>%
melt(id.vars = "Rank")%>%
ggplot(aes(x=cut(Rank, c(0,5,10,15,20,25,30,35,40,45, 50,55,60,65,70,75,80,85,90,95,100),c("1-5","6-10","11-15","16-20","21-25","26-30","31-35","36-40","41-45","46-50","51-55","56-60","61-65","66-70","71-75","76-80","81-85","86-90","91-95","96-100")) , y=value, fill=variable)) +
geom_bar(stat = "identity")+xlab("Rank") +
scale_fill_manual(values=mycolors)+
theme(text = element_text(family='STHeitiTC-Light'))+
theme(axis.text.x = element_text(angle = 90))
從圖中可以發現『療傷』和『祝福』的主題,分布的很平均(排行榜的各個階段都有出現),而『詠物抒情』的歌曲在前面比較集中出現。