歌詞文字分析前處理&文字雲(頻率圖)

# get characteristic words by decade
songs$decade = as.integer(paste0(substring(songs$Year,1,3),"0"))
songsByDecade = aggregate(Lyrics ~ decade, data=songs, paste, sep=" ")

# create corpus
docs <- Corpus(VectorSource(songsByDecade$Lyrics))

# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove english common stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stop word
# specify your stopwords as a character vector
docs <- tm_map(docs, removeWords, c("dont", "cant","can","wanna","just","now","know","got","get","never","one")) 
# Remove punctuations
docs <- tm_map(docs, removePunctuation, mc.cores=1)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
# Text stemming 去字尾
# docs <- tm_map(docs, stemDocument)

# Build a term-document matrix
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)

# Generate the Word cloud
set.seed(1234)
# 多色彩專用 colors=brewer.pal(4, "Dark2")
wordcloud(words = d$word, freq = d$freq, min.freq = 500, scale=c(3,0.5),
          max.words=200, random.order=FALSE, rot.per=0.35)



每十年區間中,最常被使用前20名詞彙

# 每十年最常用單字top20
colnames(m) = c("1960","1970","1980","1990","2000","2010")
m = as.data.frame(m)

top20 = as.data.frame(sapply(m[,1:6], function(x) head(rownames(m[order(-x),]),20)))
top20$Rank = seq(1,20,1)

top20_long =
  top20 %>% gather(Year,Word,c(1:6))

ggplot(top20_long, aes(Year, (-1*Rank))) + geom_point(color="white") + 
    geom_text(aes(label=top20_long$Word, color=top20_long$Year),  fontface='bold', size=4) +
    theme_classic() +
    theme(legend.position="none", plot.title = element_text(size=18), 
          axis.text.x=element_text(size=16)) + 
    xlab("") + ylab("Ranking") +
    scale_y_continuous(limits=c(-20,-1), breaks=c(-20, -10, -.5), labels=c("#20", "#10", "#1"))



熱門top100歌曲用字數量變化 1965-2015

songs$Word_Count = sapply(songs$Lyrics, function(x) length(strsplit(x," ")[[1]]))
songs$Unique_Word = sapply(songs$Lyrics, function(x) length(unique(strsplit(x, " ")[[1]])))
songs$Density = round(songs$Unique_Word/songs$Word_Count,4)*100
songs = songs[songs$Word_Count > 5,]

e = ggplot(aes(x=Year,y=Word_Count),data=songs)+
  geom_point(color="#1B687E", alpha=0.4, size=3)+
  theme_classic()+
  stat_smooth(method="lm",color="black")+
  ylab("Count")+
  labs(title="Words per Songs (Total)")+
  theme(plot.title = element_text(size = 18))

f = ggplot(aes(x=Year,y=Unique_Word),data=songs)+
  geom_point(color="#6ab078", alpha=0.4, size=3)+
  theme_classic()+
  stat_smooth(method="lm",color="black")+
  ylab("Count")+
  labs(title="Words per Songs (Unique)")+
  theme(plot.title = element_text(size = 18))

grid.arrange(e,f,ncol=2)



熱門歌曲用詞數量總計長條圖

ggplot(aes(x=Year, y=Word_Count),data=songs)+
  geom_histogram(stat="identity",fill="#02C874")+
  scale_x_continuous(limits = c(1965,2016),breaks = seq(1965,2015,5))+
  labs(x="Year", y="Word Count")



上榜總次數統計&歌手上榜次數統計

artists = as.data.frame(table(songs$Artist))
artists$Var1 = as.character(artists$Var1)
artists$Artist = sapply(artists$Var1, 
                        function(x)  strsplit(x," featuring")[[1]][1])
artists = artists %>% 
  group_by(Artist) %>% 
  summarise(Freq=sum(Freq)) %>% 
  arrange(-Freq) 

artists_top20 = artists %>% 
  head(20)

artists_top20 = artists_top20[order(artists_top20$Freq),]
artists_top20$Artist = factor(artists_top20$Artist, levels = artists_top20$Artist)

a = ggplot(aes(x=Artist,y=Freq),data=artists_top20) +
  geom_bar(stat="identity",fill="#2DA58A") +
  coord_flip() +
  theme_classic() +
  xlab("") +
  ylab("") +
  labs(title="Number of Songs, Top20 Aritsts") +
  geom_text(aes(label=Freq), hjust=-0.2) +
  scale_y_continuous(limits = c(0,40))

 b = ggplot(aes(x=Freq),data=artists) +
  geom_bar(fill="#2DA58A") +
  theme_classic() +
   xlab("") +
   ylab("") +
   labs(title="Number of Songs per Artist")
  
grid.arrange(b,a, ncol=2,widths=c(1,1))



多人演唱歌曲之上榜趨勢統計

# 多人演唱上榜趨勢統計
multiples = songs[grep("feat|with|duet",songs$Artist),]
mult = data.frame(table(multiples$Year))
colnames(mult) = c("Year","Freq")
mult$Year=as.integer(as.character(mult$Year))

ggplot(aes(x=Year,y=Freq),data=mult) +
  geom_histogram(stat="identity",fill="#e9b286") +
  xlab("") +
  ylab("") +
  scale_x_continuous(limits = c(1965,2016),breaks = seq(1965,2015,5)) +
  theme_classic() +
  labs(title="Songs Featuring by 2+ Artists")



熱門歌曲情緒分析 - 前處理

# 1965-2015
s = songs
# 2015
# s = songs[songs$Year == 2015,]

# gsub用來找string中特定字串符號,並加以取代
sent = get_nrc_sentiment(s[,5])

colnames(sent)<-c("Anger", "Anticipation", "Disgust", "Fear", "Joy", "Sadness", "Surprise", "Trust", "Negative", "Positive")
sent = cbind(s[,1:4],sent)


# Example 2015 top10
temp = sent[sent$Year == 2015 ,c(-1,-3,-4)]
rownames(temp) = NULL
datatable(temp,options = list(dom = 't'))



綜合情緒分析 - 各種情緒詞彙出現之頻率

sums = as.data.frame(colSums(sent[,-(1:4)]))
colnames(sums) = "Frequency"
sums$Sentiment = rownames(sums)

p = ggplot(aes(x=Sentiment,y=Frequency,fill=Sentiment),data=sums)+
  geom_bar(stat="identity")+
  coord_flip()+
  theme(legend.position = "none")+
  theme(axis.title = element_text(size = 14, face="bold"),
        axis.text = element_text(size=12))


q = ggplot(aes(x=Sentiment,y=Frequency,fill=Sentiment),data=sums)+
  geom_bar(stat="identity",width = 1)+
  coord_polar("x",start = 0)+
  theme(legend.position = "none")+
  labs(x="",y="")+
  theme(axis.title = element_text(size = 14, face="bold"),
        axis.text = element_text(size=12))


grid.arrange(p,q,ncol=2)



綜合(正負)型情緒分數統整,1965-2015年的波動圖

allthesongs = get_nrc_sentiment(songs[,5])
colnames(allthesongs)<-c("Anger", "Anticipation", "Disgust", "Fear", "Joy", "Sadness", "Surprise", "Trust", "Negative", "Positive")
allthesongs = cbind(songs,allthesongs)

score = as.data.frame(sentiment_by(songs$Lyrics))

allthesongs$ID = seq.int(nrow(allthesongs))

df = merge(allthesongs,score,by.x = "ID",by.y = "element_id")

df_byYear = df %>% 
  group_by(Year) %>% 
  summarise(sum_sentiment = sum(ave_sentiment),
            sum_wordCount = sum(word_count),
            ave_s = mean(ave_sentiment),
            ave_word = mean(word_count))

# 值得注意最高點跟最低點的年份,該年前後世界上(美國)發生什麼事件導致
# https://zh.wikipedia.org/wiki/世界史年表_(20世纪-现在)
ggplot(aes(x=Year,y=ave_s),data=df_byYear)+
  geom_line()+
  geom_hline(yintercept = mean(df$ave_sentiment),colour="red",size=1.5)+
  labs(x="Year",y="Emotional Valance")+
  scale_x_continuous(limits = c(1965,2015), breaks = seq(1965,2015,5))

最高點該年大事件:

1974年:美國尼克森總統因水門事件下台;國際能源機構設立。

1975年:美國所支持的南越政權正式瓦解;蔣介石逝世。

最低點該年大事件:

1997年:2月,鄧小平逝世,英國將香港主權交回中國;

由泰國開始引發東亞金融風暴;京都協定的簽定。

1998年:法國首奪世界盃足球賽冠軍 。印尼 黑色五月暴動。