This is an attempt to replicate what Julia Silge & David Robinson did in their book “Text Mining with R: A Tidy Approach”, with one small difference: I examined the similarities and differences among the lyrics of three Turkish singers, namely Zeki Müren, Bülent Ortaçgil and Müslüm Gürses. These singers actually belong to different worlds of Turkish popular culture, so I wanted to check whether they convey their messages using different words. I downloaded the available lyrics of the three singers using JosiahParry's “geniusR” package (beware of the capital R at the end) and followed the steps described in its README (https://github.com/JosiahParry/geniusR/blob/master/README.Rmd). It seems that Ortaçgil is closer to Müren than to Gürses. One last thing: I tried fitting a topic model with 2 topics for Ortaçgil. It seems that he tells his stories a) by talking about himself (me) and b) by talking about the other (you). Overall, it is good to see that such analyses also work fine for other languages. Comments are welcome!
library(dplyr)
# Albums by Zeki Muren whose lyrics will be fetched. The artist name is a
# single value that tibble() recycles across the seven album rows.
zm <- tibble(
  artist = "Zeki Muren",
  album = c(
    "Sorma",
    "Sev beni",
    "Dunden bugune 9",
    "Batmayan gunes",
    "Yorgunum hayat opucugu",
    "Gozlerin Doguyor Gecelerime",
    "Eskimeyen dost"
  )
)
library(purrr)
library(geniusR)
# Call genius_album() once per (artist, album) pair and keep each result in
# the `tracks` list-column, then print the nested tibble.
zeki.muren <- mutate(zm, tracks = map2(artist, album, genius_album))
zeki.muren
## # A tibble: 7 x 3
## artist album tracks
## <chr> <chr> <list>
## 1 Zeki Muren Sorma <tibble [11 x 3]>
## 2 Zeki Muren Sev beni <tibble [48 x 3]>
## 3 Zeki Muren Dunden bugune 9 <tibble [4 x 3]>
## 4 Zeki Muren Batmayan gunes <tibble [12 x 3]>
## 5 Zeki Muren Yorgunum hayat opucugu <tibble [40 x 3]>
## 6 Zeki Muren Gozlerin Doguyor Gecelerime <tibble [24 x 3]>
## 7 Zeki Muren Eskimeyen dost <tibble [48 x 3]>
library(tidyr)
# Flatten the nested `tracks` tibbles into one row per lyric line, then sort
# by artist in descending order (only one artist is present in this frame).
zeki.muren.sozler <- arrange(unnest(zeki.muren, tracks), desc(artist))

# Show the first rows of the flattened lyrics.
head(zeki.muren.sozler)
## # A tibble: 6 x 5
## artist album title track_n text
## <chr> <chr> <chr> <int> <chr>
## 1 Zeki Muren Sorma Sorma 1 Gün agarinca boynum bükülür
## 2 Zeki Muren Sorma Sorma 1 Dalarim uzaklara gönlüm sikilir
## 3 Zeki Muren Sorma Sorma 1 Sorma ne haldeyim
## 4 Zeki Muren Sorma Sorma 1 Sorma kederdeyim
## 5 Zeki Muren Sorma Sorma 1 Sorma yanginlardayim zaman zaman
## 6 Zeki Muren Sorma Sorma 1 Sorma utanirim
library(dplyr)
library(tidytext)
# Tokenize the lyric lines: every token of `text` becomes its own row in the
# new `soz` column.
zeki.muren.tidy <- zeki.muren.sozler %>%
  unnest_tokens(soz, text)

# NOTE(review): the Muslum Gurses and Bulent Ortacgil pipelines drop the
# filler words "bir"/"ki" after tokenizing, but this one does not, so "bir"
# stays in these counts -- confirm that is intended before the word
# frequencies of the three artists are compared.

# Word counts, most frequent first. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
zeki.muren.tidy %>%
  count(soz, sort = TRUE)
## # A tibble: 293 x 2
## soz n
## <chr> <int>
## 1 muhtacim 30
## 2 bir 20
## 3 ne 17
## 4 bulamazsin 16
## 5 beni 13
## 6 gitme 11
## 7 seni 11
## 8 benim 9
## 9 eski 8
## 10 git 8
## # ... with 283 more rows
# Albums by Muslum Gurses whose lyrics will be fetched. The artist name is a
# single value that tibble() recycles across the three album rows.
mg <- tibble(
  artist = "Muslum Gurses",
  album = c(
    "Mucize ve bulusma ask tesadufleri sever",
    "Usta ne yazar",
    "Senden vazgecmem"
  )
)
# Call genius_album() once per (artist, album) pair and keep each result in
# the `tracks` list-column, then print the nested tibble.
muslum.gurses <- mutate(mg, tracks = map2(artist, album, genius_album))
muslum.gurses
## # A tibble: 3 x 3
## artist album tracks
## <chr> <chr> <list>
## 1 Muslum Gurses Mucize ve bulusma ask tesadufleri sever <tibble [63 x 3]>
## 2 Muslum Gurses Usta ne yazar <tibble [14 x 3]>
## 3 Muslum Gurses Senden vazgecmem <tibble [85 x 3]>
library(tidyr)
# Flatten the nested `tracks` tibbles into one row per lyric line, then sort
# by artist in descending order (only one artist is present in this frame).
muslum.gurses.sozler <- arrange(unnest(muslum.gurses, tracks), desc(artist))
library(dplyr)
library(tidytext)
# Tokenize the lyric lines into the `soz` column and drop the filler words
# "ki" and "bir". Both conditions live in one filter() call; filter() combines
# them with AND, so the rows kept are the same as with two chained filters.
muslum.gurses.tidy <- muslum.gurses.sozler %>%
  unnest_tokens(soz, text) %>%
  filter(soz != "ki", soz != "bir")

# Word counts, most frequent first. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
muslum.gurses.tidy %>%
  count(soz, sort = TRUE)
## # A tibble: 391 x 2
## soz n
## <chr> <int>
## 1 yarali 21
## 2 agir 13
## 3 bu 13
## 4 bütün 13
## 5 duygularim 9
## 6 ne 9
## 7 usta 8
## 8 vazgeçmem 8
## 9 gibi 6
## 10 sen 6
## # ... with 381 more rows
# Albums by Bulent Ortacgil whose lyrics will be fetched. The artist name is
# a single value that tibble() recycles across the eight album rows.
bo <- tibble(
  artist = "Bulent Ortacgil",
  album = c(
    "Benimle oynar msn",
    "Eski Defterler",
    "Pencere onu cicegi",
    "Bu sarklar adam olmaz",
    "2 perde",
    "Oyuna devam",
    "light",
    "Cekirdek hatras"
  )
)
# Call genius_album() once per (artist, album) pair and keep each result in
# the `tracks` list-column, then print the nested tibble.
bulent.ortacgil <- mutate(bo, tracks = map2(artist, album, genius_album))
bulent.ortacgil
## # A tibble: 8 x 3
## artist album tracks
## <chr> <chr> <list>
## 1 Bulent Ortacgil Benimle oynar msn <tibble [128 x 3]>
## 2 Bulent Ortacgil Eski Defterler <tibble [95 x 3]>
## 3 Bulent Ortacgil Pencere onu cicegi <tibble [10 x 3]>
## 4 Bulent Ortacgil Bu sarklar adam olmaz <tibble [85 x 3]>
## 5 Bulent Ortacgil 2 perde <tibble [74 x 3]>
## 6 Bulent Ortacgil Oyuna devam <tibble [108 x 3]>
## 7 Bulent Ortacgil light <tibble [73 x 3]>
## 8 Bulent Ortacgil Cekirdek hatras <tibble [24 x 3]>
library(tidyr)
# Flatten the nested `tracks` tibbles into one row per lyric line, then sort
# by artist in descending order (only one artist is present in this frame).
bulent.ortacgil.sozler <- arrange(
  unnest(bulent.ortacgil, tracks),
  desc(artist)
)
library(dplyr)
library(tidytext)
# Tokenize the lyric lines into the `soz` column and drop the filler words
# "bir", "ki" and "mi". All conditions live in one filter() call; filter()
# combines them with AND, so the rows kept are the same as with chained
# filters.
bulent.ortacgil.tidy <- bulent.ortacgil.sozler %>%
  unnest_tokens(soz, text) %>%
  filter(soz != "bir", soz != "ki", soz != "mi")

# Word counts, most frequent first. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
bulent.ortacgil.tidy %>%
  count(soz, sort = TRUE)
## # A tibble: 771 x 2
## soz n
## <chr> <int>
## 1 olmaz 38
## 2 hiç 37
## 3 ben 35
## 4 beni 28
## 5 sen 27
## 6 kýpýr 26
## 7 basit 22
## 8 normal 21
## 9 en 19
## 10 sensiz 16
## # ... with 761 more rows
library(tidyr)
library(stringr)
library(dplyr)
library(ggplot2)
# Relative word frequencies per artist, reshaped so that Bulent Ortacgil's
# proportion sits in its own column next to each comparison artist's
# proportion (one row per word and comparison artist).
frequency <- bind_rows(
  mutate(zeki.muren.tidy, artist = "Zeki Muren"),
  mutate(muslum.gurses.tidy, artist = "Muslum Gurses"),
  # Label with `artist`, like the other two frames, so that no stray extra
  # column is introduced into the combined data.
  mutate(bulent.ortacgil.tidy, artist = "Bulent Ortacgil")
) %>%
  count(artist, soz) %>%
  group_by(artist) %>%
  mutate(proportion = n / sum(n)) %>%
  # Grouping is only needed for the per-artist denominator above.
  ungroup() %>%
  select(-n) %>%
  # One proportion column per artist ...
  spread(artist, proportion) %>%
  # ... then stack the two comparison artists back into long form. The
  # columns are named explicitly rather than as a positional range, so the
  # selection does not depend on the column order produced by spread().
  gather(artist, proportion, `Zeki Muren`, `Muslum Gurses`)
library(scales)
# Word-frequency comparison: each comparison artist's proportion on x against
# Bulent Ortacgil's on y, both on log scales, one facet per comparison artist.
# Point/label colour encodes the absolute difference between the proportions.
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `Bulent Ortacgil`,
    colour = abs(`Bulent Ortacgil` - proportion)
  )
) +
  # Dashed reference line through the origin with slope 1 (the defaults).
  geom_abline(colour = "gray40", linetype = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = soz), check_overlap = TRUE, vjust = 1.5) +
  facet_wrap(~artist, ncol = 2) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_colour_gradient(
    low = "darkslategray4",
    high = "gray75",
    limits = c(0, 0.001)
  ) +
  labs(x = NULL, y = "Bulent Ortacgil") +
  theme(legend.position = "none")
# Pearson correlation between Muslum Gurses' and Bulent Ortacgil's word
# proportions, computed on the rows of `frequency` labelled "Muslum Gurses".
cor.test(
  ~ proportion + `Bulent Ortacgil`,
  data = subset(frequency, artist == "Muslum Gurses")
)
##
## Pearson's product-moment correlation
##
## data: proportion and Bulent Ortacgil
## t = 1.8239, df = 68, p-value = 0.07256
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.0200285 0.4291581
## sample estimates:
## cor
## 0.2159619
# Pearson correlation between Zeki Muren's and Bulent Ortacgil's word
# proportions, computed on the rows of `frequency` labelled "Zeki Muren".
cor.test(
  ~ proportion + `Bulent Ortacgil`,
  data = subset(frequency, artist == "Zeki Muren")
)
##
## Pearson's product-moment correlation
##
## data: proportion and Bulent Ortacgil
## t = 3.6212, df = 51, p-value = 0.0006745
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.2072928 0.6438384
## sample estimates:
## cor
## 0.4522472
library(tm)
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
# Build a tm corpus and a document-term matrix from Ortacgil's tokens.
# NOTE(review): VectorSource() receives one token per element, so presumably
# every single word ends up as its own document -- confirm that is intended.
# bo.dtm is not referenced again in the visible part of this script.
bo.m <- bulent.ortacgil.tidy[["soz"]]
bo.kaynak <- bo.m %>% VectorSource()
bo.corpus <- bo.kaynak %>% VCorpus()
bo.dtm <- bo.corpus %>% DocumentTermMatrix()
library(stm)
## stm v1.3.3 (2018-1-26) successfully loaded. See ?stm for help.
## Papers, resources, and other materials at structuraltopicmodel.com
# Sparse album-by-word count matrix: rows are albums, columns are words and
# the cell values are the per-album word counts. Rows containing any NA are
# removed before counting.
bo.sparse <- bulent.ortacgil.tidy %>%
  na.omit() %>%
  count(album, soz, sort = TRUE) %>%
  cast_sparse(row = album, column = soz, value = n)

# Fit a structural topic model with two topics on the album-level counts.
bo.topic.model <- stm(
  bo.sparse,
  K = 2,
  verbose = FALSE,
  init.type = "Spectral"
)
## Warning in stm(bo.sparse, K = 2, verbose = FALSE, init.type = "Spectral"):
## K=2 is equivalent to a unidimensional scaling model which you may prefer.
# Print the fitted two-topic model's summary (top words per topic).
summary(bo.topic.model)
## A topic model with 2 topics, 8 documents and a 771 word dictionary.
## Topic 1 Top Words:
## Highest Prob: olmaz, basit, normal, sensiz, sen, en, yok
## FREX: olmaz, normal, sensiz, dedi, dedim, ya, peki
## Lift: normal, peki, zor, benden, zamanlarda, anlamak, biraz
## Score: olmaz, normal, ab, sensiz, dedim, dedi, ya
## Topic 2 Top Words:
## Highest Prob: hiç, ben, beni, kýpýr, sik, ýçim, sen
## FREX: sik, ýçim, deniz, kýpýrtýsýz, sevdim, latife, ama
## Lift: deniz, kýpýrtýsýz, sevdim, latife, geçmemelisin, yolculuk, hersey
## Score: aldanmadýk, sik, ýçim, latife, deniz, kýpýrtýsýz, sevdim
# Tidy the fitted model into a long table of per-topic word probabilities
# ("beta" is the default matrix returned by tidy() for an stm model).
bo_beta <- tidy(bo.topic.model, matrix = "beta")
library(drlib)
library(ggplot2)
# Bar chart of the ten highest-probability words in each topic, one facet
# per topic, with the bars ordered by beta inside every facet.
bo_beta %>%
  group_by(topic) %>%
  top_n(10, beta) %>%
  ungroup() %>%
  mutate(topic = paste0("Topic ", topic)) %>%
  # reorder_within() orders the terms by beta separately for each topic;
  # scale_x_reordered() below displays them as plain terms again.
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  scale_x_reordered() +
  coord_flip() +
  facet_wrap(~topic, scales = "free_y") +
  labs(
    title = "Highest word probabilities for each topic",
    subtitle = "Different words are associated with different topics",
    x = NULL,
    y = expression(beta)
  )