This is an attempt to replicate what Julia Silge & David Robinson did in their book “Text Mining with R: A Tidy Approach”, with one small difference: I have checked the similarities and differences among the lyrics of three Turkish singers, namely Zeki Müren, Bülent Ortaçgil and Müslüm Gürses. These singers belong to different worlds of Turkish popular culture, so I wanted to check whether they convey their messages using different words. I downloaded the available lyrics of the three singers using the “geniusR” package by JosiahParry (beware of the capital R at the end) and followed the steps described in its README (https://github.com/JosiahParry/geniusR/blob/master/README.Rmd). It seems that Ortaçgil is closer to Müren than to Gürses. One last thing: I have tried to fit a topic model with 2 topics for Ortaçgil. It seems that he tells his stories a) by talking about himself (me) and b) by talking about the other (you). Overall, it is good to see that such analyses also work fine with other languages. Comments are welcome!

Zeki Muren

library(dplyr)
library(purrr)
library(geniusR)

# Albums by Zeki Muren whose lyrics are requested. The length-one artist name
# is recycled by tibble() so that every album row carries it.
zm <- tibble(
  artist = "Zeki Muren",
  album = c(
    "Sorma",
    "Sev beni",
    "Dunden bugune 9",
    "Batmayan gunes",
    "Yorgunum hayat opucugu",
    "Gozlerin Doguyor Gecelerime",
    "Eskimeyen dost"
  )
)

# Call genius_album() once per (artist, album) pair and keep each result in
# the list-column `tracks`.
zeki.muren <- mutate(zm, tracks = map2(artist, album, genius_album))

zeki.muren
## # A tibble: 7 x 3
##   artist     album                       tracks           
##   <chr>      <chr>                       <list>           
## 1 Zeki Muren Sorma                       <tibble [11 x 3]>
## 2 Zeki Muren Sev beni                    <tibble [48 x 3]>
## 3 Zeki Muren Dunden bugune 9             <tibble [4 x 3]> 
## 4 Zeki Muren Batmayan gunes              <tibble [12 x 3]>
## 5 Zeki Muren Yorgunum hayat opucugu      <tibble [40 x 3]>
## 6 Zeki Muren Gozlerin Doguyor Gecelerime <tibble [24 x 3]>
## 7 Zeki Muren Eskimeyen dost              <tibble [48 x 3]>
library(tidyr)

# Expand the nested `tracks` list-column so that every lyric line becomes its
# own row, then order the rows by artist name (descending).
zeki.muren.sozler <- arrange(unnest(zeki.muren, tracks), desc(artist))

head(zeki.muren.sozler)
## # A tibble: 6 x 5
##   artist     album title track_n text                            
##   <chr>      <chr> <chr>   <int> <chr>                           
## 1 Zeki Muren Sorma Sorma       1 Gün agarinca boynum bükülür     
## 2 Zeki Muren Sorma Sorma       1 Dalarim uzaklara gönlüm sikilir 
## 3 Zeki Muren Sorma Sorma       1 Sorma ne haldeyim               
## 4 Zeki Muren Sorma Sorma       1 Sorma kederdeyim                
## 5 Zeki Muren Sorma Sorma       1 Sorma yanginlardayim zaman zaman
## 6 Zeki Muren Sorma Sorma       1 Sorma utanirim
library(dplyr)
library(tidytext)

# Split every lyric line in `text` into single-word tokens, one per row, in
# the new column `soz`.
# NOTE(review): unlike the Muslum Gurses and Bulent Ortacgil pipelines in this
# script, no filler words ("bir", "ki", ...) are removed here -- confirm this
# is intended before comparing word frequencies across the singers.
zeki.muren.tidy <- zeki.muren.sozler %>%
  unnest_tokens(soz, text)

# Word counts, most frequent first. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
zeki.muren.tidy %>%
  count(soz, sort = TRUE)
## # A tibble: 293 x 2
##    soz            n
##    <chr>      <int>
##  1 muhtacim      30
##  2 bir           20
##  3 ne            17
##  4 bulamazsin    16
##  5 beni          13
##  6 gitme         11
##  7 seni          11
##  8 benim          9
##  9 eski           8
## 10 git            8
## # ... with 283 more rows

Muslum Gurses

# Albums by Muslum Gurses whose lyrics are requested. The length-one artist
# name is recycled by tibble() so that every album row carries it.
mg <- tibble(
  artist = "Muslum Gurses",
  album = c(
    "Mucize ve bulusma ask tesadufleri sever",
    "Usta ne yazar",
    "Senden vazgecmem"
  )
)

# Call genius_album() once per (artist, album) pair and keep each result in
# the list-column `tracks`.
muslum.gurses <- mutate(mg, tracks = map2(artist, album, genius_album))

muslum.gurses
## # A tibble: 3 x 3
##   artist        album                                   tracks           
##   <chr>         <chr>                                   <list>           
## 1 Muslum Gurses Mucize ve bulusma ask tesadufleri sever <tibble [63 x 3]>
## 2 Muslum Gurses Usta ne yazar                           <tibble [14 x 3]>
## 3 Muslum Gurses Senden vazgecmem                        <tibble [85 x 3]>
library(tidyr)
library(dplyr)
library(tidytext)

# Expand the nested `tracks` list-column so that every lyric line becomes its
# own row, then order the rows by artist name (descending).
muslum.gurses.sozler <- muslum.gurses %>%
  unnest(tracks) %>%
  arrange(desc(artist))

# Tokenise the lyric lines into one word per row (`soz`) and drop the filler
# words "ki" and "bir". Conditions passed to a single filter() call are
# combined with AND, exactly like the two chained filters they replace.
muslum.gurses.tidy <- muslum.gurses.sozler %>%
  unnest_tokens(soz, text) %>%
  filter(soz != "ki", soz != "bir")

# Word counts, most frequent first. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
muslum.gurses.tidy %>%
  count(soz, sort = TRUE)
## # A tibble: 391 x 2
##    soz            n
##    <chr>      <int>
##  1 yarali        21
##  2 agir          13
##  3 bu            13
##  4 bütün         13
##  5 duygularim     9
##  6 ne             9
##  7 usta           8
##  8 vazgeçmem      8
##  9 gibi           6
## 10 sen            6
## # ... with 381 more rows

Bulent Ortacgil

# Albums by Bulent Ortacgil whose lyrics are requested. The length-one artist
# name is recycled by tibble() so that every album row carries it.
bo <- tibble(
  artist = "Bulent Ortacgil",
  album = c(
    "Benimle oynar msn",
    "Eski Defterler",
    "Pencere onu cicegi",
    "Bu sarklar adam olmaz",
    "2 perde",
    "Oyuna devam",
    "light",
    "Cekirdek hatras"
  )
)

# Call genius_album() once per (artist, album) pair and keep each result in
# the list-column `tracks`.
bulent.ortacgil <- mutate(bo, tracks = map2(artist, album, genius_album))

bulent.ortacgil
## # A tibble: 8 x 3
##   artist          album                 tracks            
##   <chr>           <chr>                 <list>            
## 1 Bulent Ortacgil Benimle oynar msn     <tibble [128 x 3]>
## 2 Bulent Ortacgil Eski Defterler        <tibble [95 x 3]> 
## 3 Bulent Ortacgil Pencere onu cicegi    <tibble [10 x 3]> 
## 4 Bulent Ortacgil Bu sarklar adam olmaz <tibble [85 x 3]> 
## 5 Bulent Ortacgil 2 perde               <tibble [74 x 3]> 
## 6 Bulent Ortacgil Oyuna devam           <tibble [108 x 3]>
## 7 Bulent Ortacgil light                 <tibble [73 x 3]> 
## 8 Bulent Ortacgil Cekirdek hatras       <tibble [24 x 3]>
library(tidyr)
library(dplyr)
library(tidytext)

# Expand the nested `tracks` list-column so that every lyric line becomes its
# own row, then order the rows by artist name (descending).
bulent.ortacgil.sozler <- bulent.ortacgil %>%
  unnest(tracks) %>%
  arrange(desc(artist))

# Tokenise the lyric lines into one word per row (`soz`) and drop the filler
# words "bir", "ki" and "mi". Conditions passed to a single filter() call are
# combined with AND, exactly like the three chained filters they replace.
bulent.ortacgil.tidy <- bulent.ortacgil.sozler %>%
  unnest_tokens(soz, text) %>%
  filter(soz != "bir", soz != "ki", soz != "mi")

# Word counts, most frequent first. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
bulent.ortacgil.tidy %>%
  count(soz, sort = TRUE)
## # A tibble: 771 x 2
##    soz        n
##    <chr>  <int>
##  1 olmaz     38
##  2 hiç       37
##  3 ben       35
##  4 beni      28
##  5 sen       27
##  6 kıpır     26
##  7 basit     22
##  8 normal    21
##  9 en        19
## 10 sensiz    16
## # ... with 761 more rows
library(tidyr)
library(stringr)
library(dplyr)
library(ggplot2)

# Relative word frequencies per singer, reshaped for plotting against
# Bulent Ortacgil:
#   1. stack the three tokenised data sets, labelling each with its artist
#      (the third label previously went into a stray `author` column by
#      mistake; it is now written to `artist` like the other two);
#   2. count each word per artist and turn the counts into within-artist
#      proportions;
#   3. spread to one column per artist, then gather the two comparison
#      singers back into long form so `Bulent Ortacgil` stays as its own
#      column next to `artist` / `proportion`.
frequency <- bind_rows(
  mutate(zeki.muren.tidy, artist = "Zeki Muren"),
  mutate(muslum.gurses.tidy, artist = "Muslum Gurses"),
  mutate(bulent.ortacgil.tidy, artist = "Bulent Ortacgil")
) %>%
  count(artist, soz) %>%
  group_by(artist) %>%
  mutate(proportion = n / sum(n)) %>%
  # The grouping is only needed for the per-artist denominator above.
  ungroup() %>%
  select(-n) %>%
  spread(artist, proportion) %>%
  # Columns are named explicitly (same order as before) instead of relying on
  # a reversed positional range over the alphabetically sorted columns.
  gather(artist, proportion, `Zeki Muren`, `Muslum Gurses`)
library(scales)

# Word proportions of each comparison singer (x axis, one facet per singer)
# against the proportions of the same words for Bulent Ortacgil (y axis), both
# on log10 percent scales. Points are coloured by the absolute difference
# between the two proportions; the dashed line is the geom_abline() default
# (intercept 0, slope 1).
ggplot(
  frequency,
  aes(
    x = proportion,
    y = `Bulent Ortacgil`,
    color = abs(`Bulent Ortacgil` - proportion)
  )
) +
  geom_abline(color = "gray40", lty = 2) +
  geom_jitter(alpha = 0.1, size = 2.5, width = 0.3, height = 0.3) +
  geom_text(aes(label = soz), check_overlap = TRUE, vjust = 1.5) +
  scale_x_log10(labels = percent_format()) +
  scale_y_log10(labels = percent_format()) +
  scale_color_gradient(
    limits = c(0, 0.001),
    low = "darkslategray4",
    high = "gray75"
  ) +
  facet_wrap(~artist, ncol = 2) +
  labs(y = "Bulent Ortacgil", x = NULL) +
  theme(legend.position = "none")

# Correlation test between the word proportions of Muslum Gurses and those of
# Bulent Ortacgil, using only the rows gathered for Muslum Gurses.
cor.test(
  ~ proportion + `Bulent Ortacgil`,
  data = subset(frequency, artist == "Muslum Gurses")
)
## 
##  Pearson's product-moment correlation
## 
## data:  proportion and Bulent Ortacgil
## t = 1.8239, df = 68, p-value = 0.07256
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.0200285  0.4291581
## sample estimates:
##       cor 
## 0.2159619
# Correlation test between the word proportions of Zeki Muren and those of
# Bulent Ortacgil, using only the rows gathered for Zeki Muren.
cor.test(
  ~ proportion + `Bulent Ortacgil`,
  data = subset(frequency, artist == "Zeki Muren")
)
## 
##  Pearson's product-moment correlation
## 
## data:  proportion and Bulent Ortacgil
## t = 3.6212, df = 51, p-value = 0.0006745
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.2072928 0.6438384
## sample estimates:
##       cor 
## 0.4522472

Topic models

library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
# Build a tm corpus and document-term matrix from Bulent Ortacgil's tokens.
# Every element of the token vector is handed to VectorSource() separately.
# NOTE(review): `bo.dtm` is not referenced again in the visible script -- the
# topic model below is fitted from `bo.sparse` instead. Confirm whether this
# chunk is still needed.
bo.m <- bulent.ortacgil.tidy[["soz"]]
bo.kaynak <- VectorSource(bo.m)
bo.corpus <- VCorpus(bo.kaynak)
bo.dtm <- DocumentTermMatrix(bo.corpus)
library(stm)
## stm v1.3.3 (2018-1-26) successfully loaded. See ?stm for help. 
##  Papers, resources, and other materials at structuraltopicmodel.com
# Word counts per album (rows containing NA dropped first), cast into a sparse
# matrix with one row per album and one column per word.
bo.sparse <- cast_sparse(
  count(na.omit(bulent.ortacgil.tidy), album, soz, sort = TRUE),
  album, soz, n
)

# Fit a two-topic model on the album-by-word matrix with spectral
# initialisation.
bo.topic.model <- stm(
  bo.sparse, K = 2, verbose = FALSE, init.type = "Spectral"
)
## Warning in stm(bo.sparse, K = 2, verbose = FALSE, init.type = "Spectral"):
## K=2 is equivalent to a unidimensional scaling model which you may prefer.
# Print the fitted model's summary: model size and the top words of each topic.
summary(bo.topic.model)
## A topic model with 2 topics, 8 documents and a 771 word dictionary.
## Topic 1 Top Words:
##       Highest Prob: olmaz, basit, normal, sensiz, sen, en, yok 
##       FREX: olmaz, normal, sensiz, dedi, dedim, ya, peki 
##       Lift: normal, peki, zor, benden, zamanlarda, anlamak, biraz 
##       Score: olmaz, normal, ab, sensiz, dedim, dedi, ya 
## Topic 2 Top Words:
##       Highest Prob: hiç, ben, beni, kıpır, sik, içim, sen 
##       FREX: sik, içim, deniz, kıpırtısız, sevdim, latife, ama 
##       Lift: deniz, kıpırtısız, sevdim, latife, geçmemelisin, yolculuk, hersey 
##       Score: aldanmadık, sik, içim, latife, deniz, kıpırtısız, sevdim
# Tidy form of the fitted topic model; the plot below uses its `topic`,
# `term` and `beta` columns.
bo_beta <- tidy(bo.topic.model)

library(drlib)
library(ggplot2)

# Bar chart of the highest-beta terms of each topic, one facet per topic.
# reorder_within() / scale_x_reordered() order the terms by beta separately
# inside each facet.
bo_beta %>%
  group_by(topic) %>%
  top_n(n = 10, wt = beta) %>%
  ungroup() %>%
  mutate(
    topic = paste("Topic", topic),
    term = reorder_within(term, beta, topic)
  ) %>%
  ggplot(aes(x = term, y = beta, fill = factor(topic))) +
  geom_col(alpha = 0.8, show.legend = FALSE) +
  facet_wrap(~topic, scales = "free_y") +
  coord_flip() +
  scale_x_reordered() +
  labs(
    x = NULL,
    y = expression(beta),
    title = "Highest word probabilities for each topic",
    subtitle = "Different words are associated with different topics"
  )