Reading & Exploring the data file

# Load the Kanye West lyrics data set (one row per track) directly from GitHub.
df <- read_csv(
  file = "https://raw.githubusercontent.com/bdioli/KanyeWestText/master/data/kanye_lyrics.csv"
)
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   year = col_double(),
##   type = col_character(),
##   project = col_character(),
##   track_num = col_double(),
##   song = col_character(),
##   lyrics = col_character()
## )
# Preview the first six rows to check the column layout.
head(df, n = 6)
## # A tibble: 6 x 6
##    year type   project    track_num song           lyrics                       
##   <dbl> <chr>  <chr>          <dbl> <chr>          <chr>                        
## 1  2003 mixta… Get Well …         1 Intro          "[Intro: DeRay Davis]Kanye, …
## 2  2003 mixta… Get Well …         2 Live From Irv… "Intro: Talib Kweli:\nALRIGH…
## 3  2003 mixta… Get Well …         4 Jesus Walks    "[Intro]\nWe at war\nWe at w…
## 4  2003 mixta… Get Well …         5 Through The W… "[Intro]\nYo, Gee, they can'…
## 5  2003 mixta… Get Well …         6 Two Words      "[Intro: Kanye West]\nWe in …
## 6  2003 mixta… Get Well …         7 Champions      "[Verse 1: Quavo](Quavo!)Lif…
# Kanye West was hospitalized in 2016 and diagnosed with bipolar disorder.
# This project compares his lyrics before and after that hospitalization.
# I hypothesize that, after his hospitalization in 2016, his music incorporates
# more religious and biblical references and shows a more trusting and positive
# attitude.

# Number of tracks per project (with release year and project type),
# largest projects first.
count(df, project, year, type, sort = TRUE)
## # A tibble: 10 x 4
##    project                            year type        n
##    <chr>                             <dbl> <chr>   <int>
##  1 Late Registration                  2005 album      23
##  2 The College Dropout                2004 album      21
##  3 The Life of Pablo                  2016 album      20
##  4 Graduation                         2007 album      15
##  5 My Beautiful Dark Twisted Fantasy  2010 album      14
##  6 808s & Heartbreak                  2008 album      12
##  7 I'm Good                           2003 mixtape    12
##  8 Can't Tell Me Nothing              2007 mixtape    11
##  9 Yeezus                             2013 album      10
## 10 Get Well Soon...                   2003 mixtape     9
# Only one project in the data (The Life of Pablo, 20 songs) dates from after 2015.


# How many times each song title occurs in the data, most repeated first.
count(df, song, sort = TRUE)
## # A tibble: 140 x 2
##    song                            n
##    <chr>                       <int>
##  1 Two Words                       3
##  2 Can't Tell Me Nothing           2
##  3 Intro                           2
##  4 Jesus Walks                     2
##  5 Stronger                        2
##  6 Through the Wire                2
##  7 30 Hours                        1
##  8 A Million and One Freestyle     1
##  9 Addiction                       1
## 10 All Falls Down                  1
## # … with 130 more rows
# Some songs appear more than once because mixtapes repeat album tracks, so only albums are used from here on.

Finding the most used words pre & post 2016

# Keep album tracks only and label each one with its era:
# "Pre 2016" for years before 2016, "Post 2016" for 2016 onwards.
df_kw <- filter(df, type == "album") %>%
  mutate(Era = ifelse(year < 2016, "Pre 2016", "Post 2016"))


# Side-by-side table of the 12 most frequent non-stop-words in each era.
# A 0 in a column means the word is not in that era's top 12.
unnest_tokens(df_kw, output = word, input = lyrics) %>%
  anti_join(stop_words) %>% # drop common stop words
  count(word, Era, sort = TRUE) %>%
  group_by(Era) %>%
  slice_max(order_by = n, n = 12, with_ties = FALSE) %>%
  pivot_wider(
    names_from = Era,
    values_from = n,
    values_fill = list(n = 0)
  ) %>%
  kable()
## Joining, by = "word"
word Post 2016 Pre 2016
kanye 69 283
bam 62 0
feel 61 0
ey 56 0
west 50 258
verse 36 219
god 34 0
love 32 129
hook 31 235
yeezy 29 0
baby 26 90
deep 26 0
la 0 183
shit 0 127
niggas 0 99
life 0 98
2 0 90
nigga 0 90

Creating chart of most used words

# Word frequencies per era, used by the charts below.
# Stop words, tokens starting with a digit, and the tokens "la", "hook" and
# "verse" are removed. The result is left grouped by Era.
words <- unnest_tokens(df_kw, output = word, input = lyrics) %>%
  anti_join(stop_words) %>% # drop common stop words
  count(word, Era, sort = TRUE) %>%
  filter(
    !str_detect(word, "^[0-9]"),
    word != "la",
    word != "hook",
    word != "verse"
  ) %>%
  group_by(Era)
## Joining, by = "word"
# Bar chart of the 12 most used words in each era, one facet per era.
# Grouping is set explicitly so the top-12 cut is per era no matter how
# `words` arrives. reorder_within() + scale_y_reordered() sort the bars inside
# each facet by that era's own counts, and the free y scale limits each facet
# to its own words instead of listing both eras' words in every panel.
words %>%
  group_by(Era) %>%
  slice_max(n, n = 12, with_ties = FALSE) %>%
  ungroup() %>%
  mutate(word = reorder_within(word, n, Era)) %>%
  ggplot(aes(x = n, y = word, fill = Era)) +
  geom_col(show.legend = FALSE) +
  scale_y_reordered() +
  facet_wrap(~Era, scales = "free_y") +
  theme_calc() +
  labs(title="Most used words in Kanye's song pre & post 2016",
       y="",
       x="Word Frequency in Song lyrics")

# Words like "god" and "deep" appear among the most used words post 2016, whereas words like "shit", the n-word and "girl" no longer rank among the most used words post 2016.

Word Cloud for words Post 2016

# Eight-colour "Dark2" palette from RColorBrewer for the cloud.
pal <- brewer.pal(n = 8, name = "Dark2")

# Word cloud of post-2016 words: at most 100 words, each occurring at least
# 20 times, with the most frequent words placed first.
words %>%
  filter(Era == "Post 2016") %>%
  with(
    wordcloud(
      words = word,
      freq = n,
      min.freq = 20,
      max.words = 100,
      random.order = FALSE,
      rot.per = 0.2,
      colors = pal
    )
  )

Word Cloud for words Pre 2016

# Eight-colour "Dark2" palette from RColorBrewer for the cloud.
pal <- brewer.pal(n = 8, name = "Dark2")

# Word cloud of pre-2016 words: at most 100 words, each occurring at least
# 50 times, with the most frequent words placed first.
words %>%
  filter(Era == "Pre 2016") %>%
  with(
    wordcloud(
      words = word,
      freq = n,
      min.freq = 50,
      max.words = 100,
      random.order = FALSE,
      rot.per = 0.2,
      colors = pal
    )
  )

Sentiment column chart for Kanye’s lyrics Pre & post 2016

# Share of each NRC sentiment within each era, as dodged columns.
# `words` holds one row per (word, Era) with its frequency in `n`, so the
# sentiment totals are weighted by `n`; an unweighted count() would only count
# distinct words and ignore how often each one is used.
# Grouping by Era is explicit so percentages are computed within each era
# rather than depending on grouping left over from earlier steps.
words %>%
  ungroup() %>%
  inner_join(get_sentiments("nrc")) %>%
  count(Era, sentiment, wt = n) %>%
  group_by(Era) %>%
  mutate(per = n / sum(n)) %>%
  ungroup() %>%
  mutate(sentiment = fct_reorder(sentiment, per)) %>%
  ggplot(aes(y = per, x = sentiment, fill = Era)) +
  geom_col(position = "dodge") +
  theme_calc() +
  labs(title="Sentiment Analysis of Kanye's lyrics pre & post 2016",
       y="Percent of Total",
       x="")+
  scale_y_continuous(labels = percent_format())
## Joining, by = "word"

# The chart shows the share of trust and joy increasing, and the share of
# negativity and sadness decreasing, post 2016. This supports the hypothesis
# that after his hospitalization in 2016 Kanye became more trusting and
# positive, as reflected in his lyrics.