Motivation and goals: We all love listening to music, and we inevitably sing along. There are always a few lines whose words stay on the tip of the tongue even after the melody is forgotten. We therefore wanted to explore whether the lyrics released by representative artists across the decades hide any patterns: as musical styles change from era to era, do the content and word choice of lyrics change along with them?

Data source: We used the Python script written by johnwmiller on GitHub to fetch lyrics from the Genius API (one file per song), then used Python to read all the lyric files and write them into a single lyric.txt. Lyrics were collected by decade: for each decade we picked 4-5 representative artists (e.g. The Beatles, Rihanna, ...), fetching up to 30 songs per artist.
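
For reference, the merge step above was done in Python; below is a minimal R sketch of the same idea (the ./lyrics/ folder is a hypothetical placeholder for wherever the per-song files live).

# Read every per-song lyric file and concatenate them into a single lyric.txt.
files<-list.files('./lyrics',pattern='\\.txt$',full.names=TRUE)# assumed location of the per-song files
writeLines(unlist(lapply(files,readLines)),'./lyric.txt')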

Data analysis

Data preprocessing

# read the data in
lyric<-fread('./lyric.txt',sep='')
colnames(lyric)<-c('text')
lyric$text<-as.character(lyric$text)
# clean the symbols and junk text out of the columns once more
structural_lyric$song<-gsub("'title'"," ",structural_lyric$song)
structural_lyric=structural_lyric[c(6,1,3,4,2,5)]# reorder columns
structural_lyric$year<-gsub("'","",structural_lyric$year)# strip '
structural_lyric$artist<-gsub("'","",structural_lyric$artist)
structural_lyric$song<-gsub("'","",structural_lyric$song)
structural_lyric$song<-gsub('"',"",structural_lyric$song)# strip the leftover " from song titles
structural_lyric$lyrics<-gsub("['\"]","",structural_lyric$lyrics)# strip ' and " from the lyrics
structural_lyric<-structural_lyric%>%# drop duplicated Remix/Mix versions of songs
  filter(!str_detect(song,regex('(Remix)|(Mix)')))# equivalent to filter(!grepl('Remix', song))
# split the artists into decade groups: one group for the 1960s, one for the 1970s, ...
six<-c("  The Beatles ","  Elvis Presley ","  The Supremes ","  The Rolling Stones ")
seven<-c("  Bee Gees ","  Elton John ","  Paul McCartney ","  Eagles ","  Stevie Wonder ")
eight<-c("  Michael Jackson ","  Madonna ","  Phil Collins ",'  Whitney Houston ','  George Michael ')
nine<-c('  Mariah Carey ','  Janet Jackson ','  Boyz II Men ')
ten<-c('  Usher ','  Rihanna ','  Beyoncé ','  Ludacris ','  Nelly ','  Justin Timberlake ','  50 Cent ')
structural_lyric<-structural_lyric%>%
  mutate(group=case_when(artist%in%six~1960,
                         artist%in%seven~1970,
                         artist%in%eight~1980,
                         artist%in%nine~1990,
                         artist%in%ten~2000,
                         TRUE~0))

structural_lyric=structural_lyric[c(1,2,7,4,3,5,6)]
     # structural_lyric is now a structured table of songs
# tokenize the lyrics into words
lyric_word<-structural_lyric%>%
  unnest_tokens(token,lyrics,to_lower=FALSE)%>%
  filter(!str_detect(token,regex('[0-9]|Intro|Hook|Verse|Chorus')))%>%
  filter(!token%in%stop_words$word)%>%
  filter(!(token=='I'|token=='la'|token=='Im'))
lyric_word=lyric_word[c(1,2,3,7,4,5,6)]
    # lyric_word holds the tokenized lyrics
# split the lyrics into bigrams
lyric_ngram<-structural_lyric%>%
  unnest_tokens(bigram,lyrics,token='ngrams',n=2,to_lower = FALSE)%>% 
  separate(bigram,c('word1','word2'),sep=" ")%>%
  filter(!word1%in%stop_words$word)%>%
  filter(!word2%in%stop_words$word)%>%
  filter(!str_detect(word1,regex('[0-9]|Verse|Hook|Intro')))%>%
  filter(!str_detect(word2,regex("[0-9]|Verse|Hook|Intro")))%>%
  filter(!(word1=='I'|word1=='la'|word1=='Im'|word1==' G'))%>%
  filter(!(word2=='I'|word2=='la'|word2=='Im'|word2==' G'))
 # filter(!word1%in%token_remove)%>%   # token_remove is built in the tf-idf section below
 # filter(!word2%in%token_remove)
lyric_ngram=lyric_ngram[c(1,2,3,7,8,4,5,6)]# reorder columns
    # lyric_ngram holds the lyrics as bigram pairs

# end of preprocessing

tf-idf

# tf-idf: for each artist, compute each token's share of that artist's total word count
# compute count, total, and frequency
lyric_count_j<-lyric_word%>%
  group_by(token,artist)%>%
  summarise(count=n())%>%
  ungroup%>%
  group_by(artist)%>%
  mutate(total=sum(count),frequency=count/total)%>%
  arrange(desc(frequency))
# bind_tf_idf: with decades as documents
lyric_tfidf_j<-lyric_word%>%
  group_by(token,artist,group)%>%
  summarise(count=n())%>%
  ungroup()%>%
  bind_tf_idf(token,group,count)%>%
  arrange(desc(tf_idf))
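
As a sanity check on these scores: tidytext's bind_tf_idf uses tf = count / (document total) and idf = ln(total documents / documents containing the term), so the decade-level tf-idf can be recomputed by hand. A minimal sketch (n_docs and manual_check are names introduced here):

n_docs<-n_distinct(lyric_tfidf_j$group)# the five decade 'documents'
manual_check<-lyric_tfidf_j%>%
  group_by(group)%>%
  mutate(tf_manual=count/sum(count))%>%# a term's share within its decade
  group_by(token)%>%
  mutate(idf_manual=log(n_docs/n_distinct(group)))%>%# rarity across decades
  ungroup()%>%
  mutate(tf_idf_manual=tf_manual*idf_manual)
all.equal(manual_check$tf_idf,manual_check$tf_idf_manual)# should be TRUE
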
#graph
garbage_word_j<-c('lala','Ai','ah','yeh','Isnt','em','ism','De','Ye','bom','Ooo','Hes','B','tat','dah','da','Ba','Fa','di','el','en','mi','II','na','va','la','Ludacris','Cent','Usher','Beyoncé','Timberlake','Nelly','Timberlake','Justin','AT')# junk words that need filtering out
lyric_tfidf_j%>%
 # filter(group==1970)%>% #set the year
  filter(!token%in%garbage_word_j)%>%
  mutate(word = factor(token, levels = rev(unique(token)))) %>% 
  group_by(group) %>% 
  top_n(8) %>% 
  ungroup() %>%
  arrange(desc(tf_idf)) %>%
  ggplot(aes(word, tf_idf, fill = group))+
  theme(text=element_text(size=15))+
  geom_col(show.legend = FALSE) +
  labs(x = NULL, y = "tf-idf") +
  facet_wrap(~group, ncol = 2, scales = "free") +
  coord_flip()
## Selecting by word

Tracing the results back to the data, we found that each decade's top words mostly score high because a single song repeats one term 20-30 times; the tf-idf is inflated by one song rather than reflecting what the whole decade commonly used (e.g. the token 'andrew' appears en masse in The Rolling Stones' 'Andrew's Blues'). The results were therefore not as expected.
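
One way to verify this is to count how many distinct songs each high-scoring token actually appears in; a token carried by a single song confirms the inflation. A minimal sketch (token_spread is a name introduced here):

# For each decade/token pair, count the distinct songs containing the token,
# then attach that to the tf-idf table to spot single-song outliers.
token_spread<-lyric_word%>%
  group_by(group,token)%>%
  summarise(n_songs=n_distinct(song))%>%
  ungroup()
lyric_tfidf_j%>%
  inner_join(token_spread,by=c('token','group'))%>%
  arrange(desc(tf_idf))%>%
  head(20)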

# build a dictionary of non-distinctive words that every artist uses frequently
artist_wordcount<-lyric_word%>%# count tokens per artist
  group_by(artist,token)%>%
  summarise(sum=n())%>%
  arrange(desc(sum))
lyric_tfidf<-artist_wordcount%>%  # tf-idf with artists as documents
  bind_tf_idf(token,artist,sum)%>%
  arrange(desc(tf_idf))
token_avg<-lyric_tfidf%>%  # average tf-idf per token
  group_by(token)%>%
  summarise(avg=mean(tf_idf))
token_avg$avg%>%summary
##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0005012 0.0008465 0.0012700 0.0012731 0.0615269
token_remove<-token_avg%>%
  filter(avg<0.0010178)%>%
  .$token
token_remove%>%head
## [1] "A"     "A.I"   "a.k.a" "A.M"   "Aah"   "AAH"
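
The commented-out filters in the bigram pipeline above show how this dictionary is meant to be plugged in; a minimal sketch of applying it to the token table (lyric_word_trimmed is a name introduced here):

# Drop the non-distinctive tokens so later tf-idf runs are not dominated by filler words.
lyric_word_trimmed<-lyric_word%>%
  filter(!token%in%token_remove)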

Network for bigrams

# visualize a network of bigrams
garbage_word2_j<-c('bump','bang','doo','dum','shoop','zoop','mm','AT','II','G','Tu','va','ti','en','la','Ye')
bigram_count_j<-lyric_ngram%>%
  filter(!word1%in%garbage_word2_j)%>%
  filter(!word2%in%garbage_word2_j)%>%
  group_by(artist,word1,word2)%>%
  summarise(count=n())%>%
  arrange(desc(count))
bigram_graph_j<-bigram_count_j%>%
 # filter(artist%in%nine)%>% #set the year
  filter(count>15)%>%
  as.data.frame()%>%
  select(-artist)%>%
  graph_from_data_frame()
#graph
a<-grid::arrow(type="closed",length=unit(.05,"inches"))
ggraph(bigram_graph_j,layout='fr')+
  geom_edge_link(aes(edge_alpha=count),show.legend=FALSE,arrow=a,end_cap=circle(.07,'inches'))+
  geom_node_point(color="lightblue",size=4)+
  geom_node_text(aes(label=name),vjust=1,hjust=1)+
  theme_void()

Pairs like "eeny meeny" and "miny moe" come from a popular Western nursery rhyme; when "eeny meeny" shows up in lyrics, the song usually sings through the rhyme's first line, so seeing it in the graph is expected. "act a fool" is likewise a common English idiom.

Topic model

# topic model: treat each song as one document
garbage_word3_j<-c('aint','Aint','ain','Fa','da','da','em','gon','don','mi','mm','bang','IS','Tha','Nos','bump','Lou','Ali','ahh','St','aw','DTP','DA','OF','ACT','AT','YA','Ho')
lyric_dtm_j<-lyric_word%>%
  filter(!token%in%garbage_word3_j)%>%
  filter(!token%in%stop_words$word)%>%
  group_by(song,artist,token)%>%
  summarise(count=n())%>%
  cast_dtm(song,token,count)
lyric_lda_j<-LDA(lyric_dtm_j,k=4,control=list(seed=1234))
lyric_topic_j<-tidy(lyric_lda_j,matrix='beta')
# find the top words for each topic
topic_topterm_j<-lyric_topic_j%>%
  group_by(topic)%>%
  top_n(10,beta)%>%
  ungroup()%>%
  arrange(topic)
#graph
topic_topterm_j%>%
  mutate(term=reorder(term,beta))%>%
  ggplot(aes(term,beta,fill=factor(topic)))+
  geom_col(show.legend = FALSE)+
  facet_wrap(~ topic,ncol=2, scales = "free") +
  coord_flip()+
  theme(text = element_text(family = "Heiti TC Light")) # set a Chinese font so CJK text renders correctly

Splitting into four topics gives a decent result. Topic 2 contains nigga, bad, shit, bitch and other words common in hip hop, so it plausibly corresponds to the 2000s; topic 4 contains dance, suggesting it sits closer to the disco-heavy 1970s-1980s.

# re-fit the model with two topics
lyric_lda_j<-LDA(lyric_dtm_j,k=2,control=list(seed=1234))
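The values of k here (4, then 2) were chosen by inspecting the topics. A common alternative is to compare perplexity across a few candidate k values with topicmodels::perplexity; a rough sketch (this was not part of the original model selection, and training-set perplexity only measures fit, not generalization):

# Lower perplexity indicates a better fit to the document-term matrix.
sapply(c(2,4,6,8),function(k)
  perplexity(LDA(lyric_dtm_j,k=k,control=list(seed=1234))))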

Distinguishing the two topics

# per-document topic probabilities
topic_distinct_j<-tidy(lyric_lda_j,matrix='gamma')
# word cloud of the raw data
lyric_word%>%
  group_by(token)%>%
  summarise(count=n())%>%
  filter(!token%in%garbage_word3_j)%>%
  arrange(desc(count))%>%
  wordcloud2()

### love, baby and the other words that are most frequent across all the songs stand out clearly (this matches statistics found online of the most common words in song lyrics over the years, where the ranking runs love, baby, keep, good, ...)

# spread the two topics into columns
topic_spread_j<-topic_distinct_j%>%
  mutate(topic=paste0('topic',topic))%>%
  spread(topic,gamma)
# see which topic each decade belongs to
year_topic_j<-topic_spread_j%>%
  inner_join(structural_lyric,by=c('document'='song'))%>%
  select(artist,group,document,topic1,topic2)%>%
  mutate(topicbelong=ifelse(topic1>topic2,1,2))%>%
  group_by(group,topicbelong)%>%
  summarise(topicount=n())
  #spread(topicbelong,topicount)
#graph
year_topic_j%>%
 # filter(topicbelong=='1')%>%
  ggplot(aes(x=group,y=topicount,group=topicbelong,colour=topicbelong))+
  geom_line()

### Only the 2000s are classified as topic 2; every other decade lands in topic 1. This is a reasonable and expected result: throughout the earlier analyses the 2000s stood out from the other decades, while the remaining decades differ relatively little from one another.

# take the songs with the highest topic-1 probability
topic1_j<-topic_spread_j%>%
  top_n(250,topic1)%>%# top 250 by topic1
  inner_join(structural_lyric,by=c('document'='song'))%>%
  select(document,artist,group,topic1,topic2)
#graph
# build a word cloud
topic1_cloud_j<-lyric_word%>%
  filter(song%in%topic1_j$document)%>%
  filter(!token%in%garbage_word3_j)%>%
  group_by(token)%>%
  summarise(count=n())%>%
  arrange(desc(count))
topic1_cloud_j%>%
  wordcloud2()

## topic 1 is dominated by common, ordinary words like love, baby, and time

# take the songs with the highest topic-2 probability
topic2_j<-topic_spread_j%>%
  top_n(250,topic2)%>%
  inner_join(structural_lyric,by=c('document'='song'))%>%
  select(document,artist,group,topic1,topic2)
topic2_j%>%
  group_by(group)%>%
  summarise(count=n())
## # A tibble: 5 x 2
##   group count
##   <dbl> <int>
## 1  1960    22
## 2  1970    48
## 3  1980    41
## 4  1990    19
## 5  2000   125
#graph
topic2_cloud_j<-lyric_word%>%
  filter(song%in%topic2_j$document)%>%
  filter(!token%in%garbage_word3_j)%>%
  group_by(token)%>%
  summarise(count=n())%>%
  arrange(desc(count))
topic2_cloud_j%>%
  wordcloud2()

## topic 2 contains more hip-hop vocabulary like bad, nigga, girl, money, ... (money, women, and some profanity)

Common words in each decade

filter_word = c("And","The","dont","but","Oh","yeah","A","But","youre","Aint","So","if","da","No","If","Its","ya","You","We","Yeah","My","They","All","Well","Just","When","Here","That","Now","Theres","Dont","Youre","It","Do","Hes","This","gonna","wanna","To","Cause","She","don","gon","aint","What","Ill","Ive","Id","mi")
z_top_token <- lyric_word%>% 
  group_by(group,token)%>%
  summarise(sum=n())%>%
  arrange(group,sum)%>%
  filter(sum>5)%>%
  filter(!token%in%stop_words$word)%>%
  filter(!token%in% filter_word)
z_top_token
## # A tibble: 2,560 x 3
## # Groups:   group [5]
##    group token      sum
##    <dbl> <chr>    <int>
##  1  1960 After        6
##  2  1960 alright      6
##  3  1960 Aquarius     6
##  4  1960 As           6
##  5  1960 babe         6
##  6  1960 belong       6
##  7  1960 blow         6
##  8  1960 Bompa        6
##  9  1960 boy          6
## 10  1960 Close        6
## # … with 2,550 more rows
# One bar chart of the top-10 tokens per decade; a helper avoids repeating the same pipeline five times.
plot_top_tokens <- function(decade){
  z_top_token %>%
    filter(group == decade)%>%
    arrange(desc(sum))%>%
    head(10)%>%
    ggplot(aes(x=reorder(token,sum),y=sum)) +
    geom_col(show.legend = FALSE) +
    theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
    labs(x=paste0(decade,"-token"), y="count") +
    coord_flip()
}

z_top_token_1960 <- plot_top_tokens("1960")
z_top_token_1970 <- plot_top_tokens("1970")
z_top_token_1980 <- plot_top_tokens("1980")
z_top_token_1990 <- plot_top_tokens("1990")
z_top_token_2000 <- plot_top_tokens("2000")

z_top_token_1960

z_top_token_1970

z_top_token_1980

z_top_token_1990

z_top_token_2000

## The most frequent word in every decade is love: no matter the era, love is a common subject, presumably because love, whether familial, romantic, or between friends, is an essential part of life and therefore keeps showing up in lyrics.

Next we consider the genres popular in each decade: the 1960s and 1970s leaned toward rock and R&B, the 1980s and 1990s were more varied (rock, disco, soul, and so on), and by the 2000s hip hop had emerged. The bar charts show that before 2000 the lyrics bear little obvious relation to genre, but in post-2000 music words such as nigga, bitch, and money appear frequently, clearly reflecting the hip-hop style.

Popular words

# count, per decade, the occurrences of words that are popular in every era (love, girl, baby, time) to see the trend
specific_dic_j<-c('love','girl','baby','time')
wordcount_j<-lyric_word%>%
  filter(token%in%specific_dic_j)%>%
  group_by(token,group)%>%
  summarise(count=n())
#graph
wordcount_j%>%
  ggplot(aes(x=group,y=count,colour=token))+
  geom_line(size=2)

## The girl line spikes in the 2000s, likely because hip-hop lyrics frequently mention women and money, driving up the frequency of girl.

Plot positive and negative sentiment counts per decade

 z_token_sentiment <- z_top_token%>%
  inner_join(get_sentiments("bing"),by = c(token = "word")) %>%
  group_by(group,sentiment)%>%
  summarise(count=sum(sum))
  
 z_token_sentiment 
## # A tibble: 10 x 3
## # Groups:   group [5]
##    group sentiment count
##    <dbl> <chr>     <int>
##  1  1960 negative    225
##  2  1960 positive    400
##  3  1970 negative    510
##  4  1970 positive    650
##  5  1980 negative    449
##  6  1980 positive    844
##  7  1990 negative    203
##  8  1990 positive    368
##  9  2000 negative   1980
## 10  2000 positive   1070
z_group_sentiment <- z_token_sentiment%>%
  ggplot(aes(x=group,y=count,colour=sentiment , fill = sentiment)) + # x = decade, y = sentiment count
  geom_col(show.legend = TRUE) +
  theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
  labs(x="decade", y="sentiment count") 
z_group_sentiment

## Before 2000, positive sentiment outweighs negative, because love, the most frequent word, pulls the positive score up. After 2000, hip-hop lyrics are full of negative words like "fuck", "damn", and "shit", which raises the overall negative count.

Token counts per artist

 z_token <- lyric_word%>% 
  group_by(group,artist,token)%>%
  summarise(sum=n())%>%
  arrange(group,artist,sum)%>%
  filter(sum>5)%>%
  filter(!token%in%stop_words$word)%>%
  filter(!token%in% filter_word)

z_token
## # A tibble: 2,412 x 4
## # Groups:   group, artist [24]
##    group artist             token     sum
##    <dbl> <chr>              <chr>   <int>
##  1  1960 "  Elvis Presley " America     6
##  2  1960 "  Elvis Presley " blues       6
##  3  1960 "  Elvis Presley " didnt       6
##  4  1960 "  Elvis Presley " grace       6
##  5  1960 "  Elvis Presley " hold        6
##  6  1960 "  Elvis Presley " hundred     6
##  7  1960 "  Elvis Presley " lips        6
##  8  1960 "  Elvis Presley " live        6
##  9  1960 "  Elvis Presley " Maybe       6
## 10  1960 "  Elvis Presley " pretty      6
## # … with 2,402 more rows
z_token_sentiment_ne <- z_token%>%
  filter(group =="2000")%>%
  inner_join(get_sentiments("bing"),by = c(token = "word"))%>%
  filter(sentiment == "negative")%>%
  group_by(artist)%>%
  summarise(count=sum(sum))%>%
  arrange(desc(artist))

z_token_sentiment_ne
## # A tibble: 7 x 2
##   artist                 count
##   <chr>                  <int>
## 1 "  Usher "               288
## 2 "  Rihanna "             114
## 3 "  Nelly "               161
## 4 "  Ludacris "            260
## 5 "  Justin Timberlake "   153
## 6 "  Beyoncé "              79
## 7 "  50 Cent "             323

The three artists with the most negative words, 50 Cent, Usher, and Ludacris, are all hip-hop acts, which supports the observation that hip-hop lyrics use more negative-sentiment vocabulary.

Re-split the lyrics into bigrams

## rebuild lyric_ngram as raw bigram pairs, this time without the earlier filters
lyric_ngram<-structural_lyric%>%
  unnest_tokens(bigram,lyrics,token='ngrams',n=2,to_lower = FALSE)
lyric_ngram=lyric_ngram[c(1,2,6,7,3,4,5)]# reorder columns
 

## count each artist's bigrams without removing stop words yet (negation words are needed for the bigram sentiment analysis)
 z_bigrams_count = lyric_ngram %>%
   #select(group,artist,bigram)%>%
   count(group,artist,bigram, sort = TRUE)%>%
   separate(bigram,c('word1','word2'),sep=" ")%>%
    filter(!str_detect(word1,regex('[0-9]|Verse|Hook|Intro|Chorus|Justin|Timberlake')))%>%
    filter(!str_detect(word2,regex("[0-9]|Verse|Hook|Intro|Chorus|Justin|Timberlake")))%>%
    filter(!(word1=='I'|word1=='la'|word1=='Im'))%>%
    filter(!(word2=='I'|word2=='la'|word2=='Im'))

## now apply the stop-word filter
z_stop_words = z_bigrams_count %>%
  filter(!word1%in%stop_words$word)%>%
  filter(!word2%in%stop_words$word)

z_stop_words
## # A tibble: 23,368 x 5
##    group artist               word1 word2      n
##    <dbl> <chr>                <chr> <chr>  <int>
##  1  2000 "  Usher "           bump  bump      91
##  2  2000 "  Ludacris "        bang  bang      71
##  3  1970 "  Stevie Wonder "   day   sucker    52
##  4  1970 "  Stevie Wonder "   All   day       44
##  5  2000 "  Rihanna "         bad   bad       43
##  6  1970 "  Elton John "      Amen  Amen      42
##  7  1970 "  Stevie Wonder "   love  All       41
##  8  2000 "  Ludacris "        meeny miny      38
##  9  2000 "  Ludacris "        miny  moe       38
## 10  1980 "  Michael Jackson " da    da        37
## # … with 23,358 more rows
## recombine word1 and word2
 z_bigrams_united <- z_stop_words %>%
   unite(bigram,word1,word2,sep=" ")

Common bigrams in each decade

z_top_lyrics_bigrams <- z_bigrams_united %>%
  group_by(group)%>%
  top_n(10,n)%>%
  ungroup()%>%
  arrange(group,n)
#z_top_lyrics_bigrams
# 
z_top_lyrics_bigrams %>%
  arrange(n)%>%
  ggplot(aes(x=reorder(bigram,n),y=n,fill=group)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ group, scales = "free") +
  coord_flip()

## There is no clear thematic result. A single song can repeat one phrase often enough to dominate the totals; in the 2000s, for example, "BUMP BUMP" appears 104 times in the song "BUMP". With so little data overall, one song's repetition strongly skews the whole picture.
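
To check how concentrated a top bigram is, one can count it per song; a minimal sketch re-deriving song-level bigram counts from structural_lyric (song_bigrams is a name introduced here):

# Count each bigram within each song to see whether a top pair comes from one track.
song_bigrams<-structural_lyric%>%
  unnest_tokens(bigram,lyrics,token='ngrams',n=2,to_lower=FALSE)%>%
  count(song,artist,bigram,sort=TRUE)
song_bigrams%>%
  filter(str_detect(bigram,regex('bump bump',ignore_case=TRUE)))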

Bigrams: analyzing sentiment trends across decades

z_AFINN <- get_sentiments("afinn")

z_bing <-get_sentiments("bing")
z_bing
## # A tibble: 6,788 x 2
##    word        sentiment
##    <chr>       <chr>    
##  1 2-faced     negative 
##  2 2-faces     negative 
##  3 a+          positive 
##  4 abnormal    negative 
##  5 abolish     negative 
##  6 abominable  negative 
##  7 abominably  negative 
##  8 abominate   negative 
##  9 abomination negative 
## 10 abort       negative 
## # … with 6,778 more rows
z_sentiment_bigrams <-z_bigrams_count %>%
  select(group, word1, word2) %>%
   count(group,word1, word2, sort = TRUE)%>% # note: n now counts rows (i.e. artists using the bigram), not raw occurrences
  inner_join(z_AFINN, by = c(word2 = "word")) %>%
  mutate(sentiment_tag=ifelse(score>0, "positive", "negative"))
 
z_sentiment_bigrams
## # A tibble: 6,011 x 6
##    group word1 word2     n score sentiment_tag
##    <dbl> <chr> <chr> <int> <int> <chr>        
##  1  2000 you   want      7     1 positive     
##  2  2000 act   like      6     2 positive     
##  3  2000 me    like      6     2 positive     
##  4  2000 Oh    yeah      6     1 positive     
##  5  2000 yeah  yeah      6     1 positive     
##  6  1970 a     dream     5     1 positive     
##  7  1970 my    love      5     3 positive     
##  8  1980 a     dream     5     1 positive     
##  9  1980 dont  want      5     1 positive     
## 10  1980 in    love      5     3 positive     
## # … with 6,001 more rows
z_sentiment_plot_data <- z_sentiment_bigrams %>%
  group_by(group,sentiment_tag) %>%
  summarise(count=n())  
z_sentiment_plot_data 
## # A tibble: 10 x 3
## # Groups:   group [5]
##    group sentiment_tag count
##    <dbl> <chr>         <int>
##  1  1960 negative        275
##  2  1960 positive        427
##  3  1970 negative        464
##  4  1970 positive        577
##  5  1980 negative        462
##  6  1980 positive        689
##  7  1990 negative        253
##  8  1990 positive        372
##  9  2000 negative       1333
## 10  2000 positive       1159
# finally, draw the plot
z_sentiment_plot_data %>%
  ggplot()+
  geom_line(aes(x=group,y=count,colour=sentiment_tag))+
  theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
  labs(x="decade", y="count") 

## The sentiment trend computed from each bigram's word2 shows the same pattern as the earlier token-based plot: positive counts exceed negative before 2000, and negative sentiment dominates after 2000.

Sentiment analysis with negation words in word1

z_negative <- c("not", "no", "never", "without")

z_negative_words <- z_bigrams_count %>%
  filter(word1 %in% z_negative)%>%
  inner_join(z_AFINN, by = c(word2 = "word")) %>%
  count(group,word1,word2, score, sort = TRUE)
  #mutate(score = ifelse(sentiment=="positive",1, -1))
z_negative_words
## # A tibble: 130 x 5
##    group word1 word2  score     n
##    <dbl> <chr> <chr>  <int> <int>
##  1  2000 no    no        -1     5
##  2  1980 no    no        -1     3
##  3  1990 no    no        -1     3
##  4  2000 no    doubt     -1     3
##  5  1960 no    loving     2     2
##  6  1960 no    no        -1     2
##  7  1970 never die       -3     2
##  8  1980 not   good       3     2
##  9  2000 never hurt      -2     2
## 10  2000 no    wrong     -2     2
## # … with 120 more rows
# inspect all the words that follow a negation word
z_negative_all_words <- z_bigrams_count%>%
  filter(word1 %in% z_negative) %>%
  count(group,word1, word2, sort = TRUE)
z_negative_all_words
## # A tibble: 721 x 4
##    group word1 word2     n
##    <dbl> <chr> <chr> <int>
##  1  2000 no    more      6
##  2  2000 not   a         6
##  3  1980 never gonna     5
##  4  2000 no    no        5
##  5  1970 no    one       4
##  6  1980 no    more      4
##  7  1980 no    one       4
##  8  2000 never be        4
##  9  1960 never let       3
## 10  1960 never take      3
## # … with 711 more rows
# if the word before a sentiment word is a negation, flip its sentiment score
z_negative_bigrams <-z_negative_words %>%
  mutate(sentiment=ifelse(word1 %in% z_negative, -1*score, score)) %>%
  mutate(sentiment_tag=ifelse(score>0, "positive", "negative")) # note: the tag keeps the original orientation; the flipped value is in `sentiment`
z_negative_bigrams
## # A tibble: 130 x 7
##    group word1 word2  score     n sentiment sentiment_tag
##    <dbl> <chr> <chr>  <int> <int>     <dbl> <chr>        
##  1  2000 no    no        -1     5         1 negative     
##  2  1980 no    no        -1     3         1 negative     
##  3  1990 no    no        -1     3         1 negative     
##  4  2000 no    doubt     -1     3         1 negative     
##  5  1960 no    loving     2     2        -2 positive     
##  6  1960 no    no        -1     2         1 negative     
##  7  1970 never die       -3     2         3 negative     
##  8  1980 not   good       3     2        -3 positive     
##  9  2000 never hurt      -2     2         2 negative     
## 10  2000 no    wrong     -2     2         2 negative     
## # … with 120 more rows
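
The counts below only tally rows by their original tag; as an alternative, the flipped AFINN values could be weighted by frequency to give a net negation-adjusted score per decade. A sketch of that aggregation (not part of the original analysis):

# Net sentiment per decade: flipped score * number of occurrences, summed.
z_negative_bigrams%>%
  group_by(group)%>%
  summarise(net_sentiment=sum(sentiment*n))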

Sentiment analysis of negated words (plot)

# count the per-decade sentiment tags in our dataset after negation handling
z_negated_sentiment_plot_data <- z_negative_bigrams %>%
  group_by(group,sentiment_tag) %>%
  summarise(count=n())  
z_negated_sentiment_plot_data 
## # A tibble: 10 x 3
## # Groups:   group [5]
##    group sentiment_tag count
##    <dbl> <chr>         <int>
##  1  1960 negative         13
##  2  1960 positive          5
##  3  1970 negative         14
##  4  1970 positive         14
##  5  1980 negative         13
##  6  1980 positive         11
##  7  1990 negative         11
##  8  1990 positive          6
##  9  2000 negative         28
## 10  2000 positive         15
# finally, draw the plot
z_negated_sentiment_plot_data %>%
  ggplot()+
  geom_line(aes(x=group,y=count,colour=sentiment_tag))+
  theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
  labs(x="decade", y="count") 

Combine the two sets of sentiment values

z_all_sentiments <- bind_rows(
  z_sentiment_plot_data %>% mutate(sentiment_tag=paste(sentiment_tag, "_original", sep = "")),
  z_negated_sentiment_plot_data %>% mutate(sentiment_tag=paste(sentiment_tag, "_negated", sep = "")))
z_all_sentiments
## # A tibble: 20 x 3
## # Groups:   group [5]
##    group sentiment_tag     count
##    <dbl> <chr>             <int>
##  1  1960 negative_original   275
##  2  1960 positive_original   427
##  3  1970 negative_original   464
##  4  1970 positive_original   577
##  5  1980 negative_original   462
##  6  1980 positive_original   689
##  7  1990 negative_original   253
##  8  1990 positive_original   372
##  9  2000 negative_original  1333
## 10  2000 positive_original  1159
## 11  1960 negative_negated     13
## 12  1960 positive_negated      5
## 13  1970 negative_negated     14
## 14  1970 positive_negated     14
## 15  1980 negative_negated     13
## 16  1980 positive_negated     11
## 17  1990 negative_negated     11
## 18  1990 positive_negated      6
## 19  2000 negative_negated     28
## 20  2000 positive_negated     15
z_all_sentiments_after <- bind_rows(z_sentiment_plot_data ,z_negated_sentiment_plot_data )%>%
  group_by(group,sentiment_tag)%>%
  summarise(count=sum(count)) 

  z_sentiment_plot_data
## # A tibble: 10 x 3
## # Groups:   group [5]
##    group sentiment_tag count
##    <dbl> <chr>         <int>
##  1  1960 negative        275
##  2  1960 positive        427
##  3  1970 negative        464
##  4  1970 positive        577
##  5  1980 negative        462
##  6  1980 positive        689
##  7  1990 negative        253
##  8  1990 positive        372
##  9  2000 negative       1333
## 10  2000 positive       1159
  z_negated_sentiment_plot_data
## # A tibble: 10 x 3
## # Groups:   group [5]
##    group sentiment_tag count
##    <dbl> <chr>         <int>
##  1  1960 negative         13
##  2  1960 positive          5
##  3  1970 negative         14
##  4  1970 positive         14
##  5  1980 negative         13
##  6  1980 positive         11
##  7  1990 negative         11
##  8  1990 positive          6
##  9  2000 negative         28
## 10  2000 positive         15
  z_all_sentiments_after
## # A tibble: 10 x 3
## # Groups:   group [5]
##    group sentiment_tag count
##    <dbl> <chr>         <int>
##  1  1960 negative        288
##  2  1960 positive        432
##  3  1970 negative        478
##  4  1970 positive        591
##  5  1980 negative        475
##  6  1980 positive        700
##  7  1990 negative        264
##  8  1990 positive        378
##  9  2000 negative       1361
## 10  2000 positive       1174
z_all_sentiments_after %>%
  ggplot()+
  geom_line(aes(x=group,y=count,colour=sentiment_tag))+
  theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
  labs(x="decade", y="count") 

## After flipping, we found that bigrams with a negating word1 are too few to change the overall result.

Compare positive sentiment first

z_all_sentiments %>% 
  filter(sentiment_tag %in% c("positive_original", "positive_negated")) %>%
  ggplot()+
  geom_line(aes(x=group,y=count,colour=sentiment_tag))+
  theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
  labs(x="decade", y="count") 

Then compare negative sentiment

z_all_sentiments %>% 
  filter(sentiment_tag %in% c("negative_original", "negative_negated")) %>%
  ggplot()+
  geom_line(aes(x=group,y=count,colour=sentiment_tag)) +
  theme(axis.text.x=element_text(size=20),axis.text.y=element_text(size=20))+
  labs(x="decade", y="count") 

Conclusion: Sometimes you pull the data expecting a particular pattern and it simply is not there. Our sample was small and the artists too concentrated, so the vocabulary did not shift much between decades. This project also drove home a point from class: cleaning the data takes more than half the time. We really did spend most of this project turning the raw lyrics into a format we could analyze. Unlike the Project Gutenberg texts from earlier work, where a single gutenberg_download yields clean, well-structured data, here the cleaning had to be done by hand, and that part taught us a lot of solid skills.