# get characteristic words by decade
songs$decade = as.integer(paste0(substring(songs$Year,1,3),"0"))
songsByDecade = aggregate(Lyrics ~ decade, data=songs, paste, sep=" ")
# create corpus
docs <- Corpus(VectorSource(songsByDecade$Lyrics))
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove english common stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stop word
# specify your stopwords as a character vector
docs <- tm_map(docs, removeWords, c("dont", "cant","can","wanna","just","now","know","got","get","never","one"))
# Remove punctuations
docs <- tm_map(docs, removePunctuation, mc.cores=1)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
# Text stemming 去字尾
# docs <- tm_map(docs, stemDocument)
# Build a term-document matrix
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m),decreasing=TRUE)
d <- data.frame(word = names(v),freq=v)
# Generate the Word cloud
set.seed(1234)
# 多色彩專用 colors=brewer.pal(4, "Dark2")
wordcloud(words = d$word, freq = d$freq, min.freq = 500, scale=c(3,0.5),
max.words=200, random.order=FALSE, rot.per=0.35)
# 每十年最常用單字top20
colnames(m) = c("1960","1970","1980","1990","2000","2010")
m = as.data.frame(m)
top20 = as.data.frame(sapply(m[,1:6], function(x) head(rownames(m[order(-x),]),20)))
top20$Rank = seq(1,20,1)
top20_long =
top20 %>% gather(Year,Word,c(1:6))
ggplot(top20_long, aes(Year, (-1*Rank))) + geom_point(color="white") +
geom_text(aes(label=top20_long$Word, color=top20_long$Year), fontface='bold', size=4) +
theme_classic() +
theme(legend.position="none", plot.title = element_text(size=18),
axis.text.x=element_text(size=16)) +
xlab("") + ylab("Ranking") +
scale_y_continuous(limits=c(-20,-1), breaks=c(-20, -10, -.5), labels=c("#20", "#10", "#1"))
songs$Word_Count = sapply(songs$Lyrics, function(x) length(strsplit(x," ")[[1]]))
songs$Unique_Word = sapply(songs$Lyrics, function(x) length(unique(strsplit(x, " ")[[1]])))
songs$Density = round(songs$Unique_Word/songs$Word_Count,4)*100
songs = songs[songs$Word_Count > 5,]
e = ggplot(aes(x=Year,y=Word_Count),data=songs)+
geom_point(color="#1B687E", alpha=0.4, size=3)+
theme_classic()+
stat_smooth(method="lm",color="black")+
ylab("Count")+
labs(title="Words per Songs (Total)")+
theme(plot.title = element_text(size = 18))
f = ggplot(aes(x=Year,y=Unique_Word),data=songs)+
geom_point(color="#6ab078", alpha=0.4, size=3)+
theme_classic()+
stat_smooth(method="lm",color="black")+
ylab("Count")+
labs(title="Words per Songs (Unique)")+
theme(plot.title = element_text(size = 18))
grid.arrange(e,f,ncol=2)
ggplot(aes(x=Year, y=Word_Count),data=songs)+
geom_histogram(stat="identity",fill="#02C874")+
scale_x_continuous(limits = c(1965,2016),breaks = seq(1965,2015,5))+
labs(x="Year", y="Word Count")
artists = as.data.frame(table(songs$Artist))
artists$Var1 = as.character(artists$Var1)
artists$Artist = sapply(artists$Var1,
function(x) strsplit(x," featuring")[[1]][1])
artists = artists %>%
group_by(Artist) %>%
summarise(Freq=sum(Freq)) %>%
arrange(-Freq)
artists_top20 = artists %>%
head(20)
artists_top20 = artists_top20[order(artists_top20$Freq),]
artists_top20$Artist = factor(artists_top20$Artist, levels = artists_top20$Artist)
a = ggplot(aes(x=Artist,y=Freq),data=artists_top20) +
geom_bar(stat="identity",fill="#2DA58A") +
coord_flip() +
theme_classic() +
xlab("") +
ylab("") +
labs(title="Number of Songs, Top20 Aritsts") +
geom_text(aes(label=Freq), hjust=-0.2) +
scale_y_continuous(limits = c(0,40))
b = ggplot(aes(x=Freq),data=artists) +
geom_bar(fill="#2DA58A") +
theme_classic() +
xlab("") +
ylab("") +
labs(title="Number of Songs per Artist")
grid.arrange(b,a, ncol=2,widths=c(1,1))
# 多人演唱上榜趨勢統計
multiples = songs[grep("feat|with|duet",songs$Artist),]
mult = data.frame(table(multiples$Year))
colnames(mult) = c("Year","Freq")
mult$Year=as.integer(as.character(mult$Year))
ggplot(aes(x=Year,y=Freq),data=mult) +
geom_histogram(stat="identity",fill="#e9b286") +
xlab("") +
ylab("") +
scale_x_continuous(limits = c(1965,2016),breaks = seq(1965,2015,5)) +
theme_classic() +
labs(title="Songs Featuring by 2+ Artists")
# 1965-2015
s = songs
# 2015
# s = songs[songs$Year == 2015,]
# gsub用來找string中特定字串符號,並加以取代
sent = get_nrc_sentiment(s[,5])
colnames(sent)<-c("Anger", "Anticipation", "Disgust", "Fear", "Joy", "Sadness", "Surprise", "Trust", "Negative", "Positive")
sent = cbind(s[,1:4],sent)
# Example 2015 top10
temp = sent[sent$Year == 2015 ,c(-1,-3,-4)]
rownames(temp) = NULL
datatable(temp,options = list(dom = 't'))
sums = as.data.frame(colSums(sent[,-(1:4)]))
colnames(sums) = "Frequency"
sums$Sentiment = rownames(sums)
p = ggplot(aes(x=Sentiment,y=Frequency,fill=Sentiment),data=sums)+
geom_bar(stat="identity")+
coord_flip()+
theme(legend.position = "none")+
theme(axis.title = element_text(size = 14, face="bold"),
axis.text = element_text(size=12))
q = ggplot(aes(x=Sentiment,y=Frequency,fill=Sentiment),data=sums)+
geom_bar(stat="identity",width = 1)+
coord_polar("x",start = 0)+
theme(legend.position = "none")+
labs(x="",y="")+
theme(axis.title = element_text(size = 14, face="bold"),
axis.text = element_text(size=12))
grid.arrange(p,q,ncol=2)
allthesongs = get_nrc_sentiment(songs[,5])
colnames(allthesongs)<-c("Anger", "Anticipation", "Disgust", "Fear", "Joy", "Sadness", "Surprise", "Trust", "Negative", "Positive")
allthesongs = cbind(songs,allthesongs)
score = as.data.frame(sentiment_by(songs$Lyrics))
allthesongs$ID = seq.int(nrow(allthesongs))
df = merge(allthesongs,score,by.x = "ID",by.y = "element_id")
df_byYear = df %>%
group_by(Year) %>%
summarise(sum_sentiment = sum(ave_sentiment),
sum_wordCount = sum(word_count),
ave_s = mean(ave_sentiment),
ave_word = mean(word_count))
# 值得注意最高點跟最低點的年份,該年前後世界上(美國)發生什麼事件導致
# https://zh.wikipedia.org/wiki/世界史年表_(20世纪-现在)
ggplot(aes(x=Year,y=ave_s),data=df_byYear)+
geom_line()+
geom_hline(yintercept = mean(df$ave_sentiment),colour="red",size=1.5)+
labs(x="Year",y="Emotional Valance")+
scale_x_continuous(limits = c(1965,2015), breaks = seq(1965,2015,5))
最高點該年大事件:
1974年:美國尼克森總統因水門事件下台;國際能源機構設立。
1975年:美國所支持的南越政權正式瓦解;蔣介石逝世。
最低點該年大事件:
1997年:2月,鄧小平逝世,英國將香港主權交回中國;
由泰國開始引發東亞金融風暴;京都協定的簽定。
1998年:法國首奪世界盃足球賽冠軍 。印尼 黑色五月暴動。