With the following script we analyzed the Beatles' lyrics to check whether characteristic features and recurring patterns can be found in the band's songs.
library(skmeans)      # spherical k-means
library(tm)           # corpus handling and cleaning
library(clue)         # cluster prototypes (cl_prototypes)
library(cluster)      # silhouette analysis
library(fpc)          # cluster validation
library(wordcloud)    # static word clouds and comparison clouds
library(readxl)       # data import from Excel
library(SnowballC)    # stemming support for tm
library(wordcloud2)   # interactive word clouds
library(RColorBrewer) # color palettes
library(syuzhet)      # sentiment and NRC emotion scoring
library(ggplot2)      # plots
library(textstem)     # lemmatization
library(knitr)        # kable tables
library(kableExtra)   # table styling
library(htmlwidgets)  # saving HTML widgets
library(webshot)      # rendering widgets to static images
library(Xplortext)    # lexical correspondence analysis
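The data frame df_definitivo used throughout (one row per song, with the columns Titolo, Testo, Album and Anno used below) is not built in this excerpt; since readxl is loaded, it is presumably imported from a spreadsheet. A minimal sketch, with a hypothetical file name:
df_definitivo <- read_excel("beatles_lyrics.xlsx") # hypothetical file name; one row per song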
Creation of the variable Annoc, which indicates the decade each song belongs to.
df_definitivo$Annoc[df_definitivo$Anno < 1970] <- "Anni 60"
df_definitivo$Annoc[df_definitivo$Anno >= 1970 & df_definitivo$Anno < 1980 ] <- "Anni 70"
df_definitivo$Annoc[df_definitivo$Anno >= 1980 & df_definitivo$Anno < 1990 ] <- "Anni 80"
df_definitivo$Annoc[df_definitivo$Anno >= 1990 & df_definitivo$Anno < 2000 ] <- "Anni 90"
df_definitivo$Annoc<-factor(df_definitivo$Annoc, levels=c("Anni 60", "Anni 70", "Anni 80",
"Anni 90"), ordered=TRUE)
table(df_definitivo$Annoc)
##
## Anni 60 Anni 70 Anni 80 Anni 90
## 145 10 28 54
Cleaning the text and building the corpus.
corpus <- Corpus(VectorSource(df_definitivo$Testo))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "dont", "yeah", "got", "youre"))
corpus_sw <- tm_map(corpus, stripWhitespace)
Corpus_lem <- tm_map(corpus_sw, lemmatize_strings) # lemmatized copy, used further on
Lyrics_dtm <- TermDocumentMatrix(corpus_sw) # matrix of the cleaned, not yet lemmatized, text
dtm_m <- as.matrix(Lyrics_dtm)
Most frequent terms extracted from the cleaned text.
dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_d <- data.frame(word = names(dtm_v), freq = dtm_v)
kable(head(dtm_d, 10), col.names = c("Parola", "Frequenza"), row.names = FALSE, align = "c") %>%
kable_styling(full_width = F)
| Parola | Frequenza |
|---|---|
| love | 530 |
| know | 409 |
| now | 271 |
| baby | 259 |
| well | 231 |
| can | 209 |
| see | 202 |
| like | 188 |
| say | 188 |
| gonna | 181 |
With wordcloud2 we created an interactive word cloud of the cleaned text.
wc <- wordcloud2(dtm_d, fontFamily = "Arial", size = 1.2)
wc
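wordcloud2 returns an HTML widget rather than a base-graphics plot; the htmlwidgets and webshot packages loaded above are presumably used to embed it in the static report. A minimal sketch, with hypothetical file names:
saveWidget(wc, "wc.html", selfcontained = FALSE) # save the widget to an HTML file
webshot("wc.html", "wc.png", vwidth = 800, vheight = 600, delay = 5) # render it to a PNG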
Most frequent terms extracted from the cleaned and lemmatized text.
Lyrics_dtm <- TermDocumentMatrix(Corpus_lem) # rebuild the matrix on the lemmatized corpus
dtm_m <- as.matrix(Lyrics_dtm)
dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_d_l <- data.frame(word = names(dtm_v), freq = dtm_v)
kable(head(dtm_d_l,10), col.names = c("Lemma", "Frequenza"), row.names = FALSE, align = "c") %>%
kable_styling(full_width = F)
| Lemma | Frequenza |
|---|---|
| love | 600 |
| know | 466 |
| good | 386 |
| say | 349 |
| now | 271 |
| come | 264 |
| baby | 259 |
| see | 239 |
| can | 209 |
| like | 189 |
With the wordcloud package we created a static word cloud of the lemmatized frequencies.
wordcloud(dtm_d_l$word,dtm_d_l$freq,min.freq = 1,
max.words=150, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(4, "Dark2"))
Counting the words associated with each NRC emotion.
# collapse the lemmatized corpus into a character vector, one element per song
text_full_lemmatized <- sapply(Corpus_lem, as.character)
d <- get_nrc_sentiment(text_full_lemmatized)
kable(head(d,10)[,1:8],
col.names = c("Rabbia", "Aspettativa", "Disgusto", "Paura", "Gioia", "Tristezza", "Sorpresa", "Fiducia"), align = "c") %>%
kable_styling(full_width = F)
| Rabbia | Aspettativa | Disgusto | Paura | Gioia | Tristezza | Sorpresa | Fiducia |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 6 | 0 | 2 | 5 | 2 | 5 | 7 |
| 0 | 5 | 0 | 0 | 1 | 2 | 1 | 2 |
| 0 | 5 | 0 | 0 | 8 | 0 | 4 | 5 |
| 2 | 5 | 2 | 1 | 9 | 1 | 5 | 4 |
| 4 | 2 | 4 | 6 | 3 | 4 | 2 | 3 |
| 7 | 2 | 3 | 2 | 3 | 5 | 2 | 2 |
| 1 | 1 | 1 | 2 | 1 | 2 | 1 | 1 |
| 0 | 8 | 1 | 0 | 6 | 1 | 1 | 4 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
td <- data.frame(t(d))
td_new <- data.frame(rowSums(td)) # total count per emotion over the whole discography
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
td_new2 <- td_new[1:8,] # keep the eight emotions, dropping the overall negative/positive rows
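The last two rows of td_new, dropped above, hold the overall negative and positive counts (get_nrc_sentiment returns the eight emotions followed by these two polarity columns). As a quick side check they can be compared directly:
barplot(td_new$count[9:10], names.arg = as.character(td_new$sentiment[9:10]),
        col = c("tomato", "seagreen"), ylab = "Count") # overall negative vs. positive counts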
Counts of the terms associated with the eight emotions across the Beatles' entire discography.
sentiments <- td_new2$sentiment
ord <- order(td_new2$count, decreasing = T)
ord_sent <- td_new2[ord,]
rownames(ord_sent) <- NULL
ord_sent$sentiment <- factor(ord_sent$sentiment, levels = sentiments[ord])
quickplot(sentiment, data = ord_sent, weight = count, geom = "bar",
          fill = sentiment, ylab = "Count", xlab = "Sentiment") +
  theme_bw() + theme(legend.position = "none") # theme_bw() first, so it does not reset legend.position
We combine the four decade-level figures into a single panel.
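The avg_score60, ..., avg_score90 vectors plotted below are not computed in this excerpt; presumably each element is the average syuzhet sentiment score of one song. A minimal sketch, assuming the lemmatized texts and the Annoc grouping from above:
# Sketch (assumption): one average token-level sentiment score per song, split by decade
song_score <- sapply(text_full_lemmatized,
                     function(txt) mean(get_sentiment(get_tokens(txt))), USE.NAMES = FALSE)
avg_score60 <- song_score[df_definitivo$Annoc == "Anni 60"]
avg_score70 <- song_score[df_definitivo$Annoc == "Anni 70"]
avg_score80 <- song_score[df_definitivo$Annoc == "Anni 80"]
avg_score90 <- song_score[df_definitivo$Annoc == "Anni 90"]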
par(mfrow=c(2,2))
plot(avg_score60, type = "l", ylab = "Average Sentiment Score", xlab = "",
main = "Canzoni dal 1960-1969", xaxt='n',bty="n")
abline(h=0, col="red")
points(x = c(which.min(avg_score60), which.max(avg_score60)),
y = c(min(avg_score60), max(avg_score60)),
pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score60), which.max(avg_score60)),
y = c(min(avg_score60), max(avg_score60)),
labels = c("She's Leaving Home", "The long one"),pos=4)
plot(avg_score70, type = "l", ylab = "Average Sentiment Score", xlab = "",
main = "Canzoni dal 1970-1979", yaxt='n',xaxt='n',bty="n")
abline(h=0, col="red")
points(x = c(which.min(avg_score70), which.max(avg_score70)),
y = c(min(avg_score70), max(avg_score70)),
pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score70), which.max(avg_score70)),
y = c(min(avg_score70), max(avg_score70)),
labels = c("Maggie Mae", "For You Blue"),pos=4)
plot(avg_score80, type = "l", ylab = "Average Sentiment Score", xlab = "",
main = "Canzoni dal 1980-1989", bty='n',xaxt='n')
abline(h=0, col="red")
points(x = c(which.min(avg_score80), which.max(avg_score80)),
y = c(min(avg_score80), max(avg_score80)),
pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score80), which.max(avg_score80)),
y = c(min(avg_score80), max(avg_score80)),
labels = c("I Call Your Name", "The Ballad Of John And Yoko"),pos=2)
plot(avg_score90, type = "l", ylab = "Average Sentiment Score", xlab = "",
main = "Canzoni dal 1990-1999", yaxt='n',xaxt='n',bty="n")
abline(h=0, col="red")
points(x = c(which.min(avg_score90), which.max(avg_score90)),
y = c(min(avg_score90), max(avg_score90)),
pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score90), which.max(avg_score90)),
y = c(min(avg_score90), max(avg_score90)),
labels = c("Not Guilty", "I Got A Woman"),pos=4)
Here are the word clouds by decade.
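The per-decade objects used below (Anni60_corpus, Anni60_corpus_dtm, dtm60_lm and their 70s/80s/90s counterparts) are not built in this excerpt. A plausible sketch for the 1960s; the other decades follow the same pattern:
# Sketch (assumption): per-decade corpus, term-document matrix and frequency table
Anni60_corpus <- Corpus_lem[df_definitivo$Annoc == "Anni 60"]
Anni60_corpus_dtm <- TermDocumentMatrix(Anni60_corpus)
freq60 <- sort(rowSums(as.matrix(Anni60_corpus_dtm)), decreasing = TRUE)
dtm60_lm <- data.frame(word = names(freq60), freq = freq60) # row names are the words themselves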
wordcloud(dtm60_lm$word, dtm60_lm$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 60")
wordcloud(dtm70_lm$word, dtm70_lm$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 70")
wordcloud(dtm80_lm$word, dtm80_lm$freq, min.freq = 1,
          max.words = 180, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 80")
wordcloud(dtm90_lm$word, dtm90_lm$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 90")
Comparing the decades by the number of lemmas used.
nsongs <- c(length(Anni60_corpus), length(Anni70_corpus), length(Anni80_corpus),
length(Anni90_corpus))
twords <- c(sum(dtm60_lm[,2]), sum(dtm70_lm[,2]), sum(dtm80_lm[,2]), sum(dtm90_lm[,2]))
uwords <- c(nrow(Anni60_corpus_dtm), nrow(Anni70_corpus_dtm), nrow(Anni80_corpus_dtm),
nrow(Anni90_corpus_dtm))
wps <- round(twords / nsongs, 2)
uwps <- round(uwords / nsongs, 2)
names(nsongs) <- c("1960-1969", "1970-1979", "1980-1989", "1990-1999")
kable(cbind(nsongs, twords, uwords, wps, uwps), col.names = c("Numero di canzoni", "Lemmi totali", "Lemmi unici", "Lemmi per canzone", "Lemmi unici per canzone"), align = "c") %>%
kable_styling(full_width = F)
| | Numero di canzoni | Lemmi totali | Lemmi unici | Lemmi per canzone | Lemmi unici per canzone |
|---|---|---|---|---|---|
| 1960-1969 | 145 | 13573 | 1555 | 93.61 | 10.72 |
| 1970-1979 | 10 | 836 | 293 | 83.60 | 29.30 |
| 1980-1989 | 28 | 2602 | 579 | 92.93 | 20.68 |
| 1990-1999 | 54 | 4961 | 799 | 91.87 | 14.80 |
Comparing the decades by word-usage frequency.
dtms <- list(dtm60_lm, dtm70_lm, dtm80_lm, dtm90_lm)
output <- list()
mat <- NULL
for(i in 1:length(dtms)){
  words <- dtms[[i]][1:10, 1]                       # ten most frequent lemmas of the decade
  freq <- dtms[[i]][1:10, 2] / sum(dtms[[i]][, 2])  # relative frequency within the decade
  perc <- paste(round(freq * 100, 2), "%")
  output[[i]] <- cbind(words, perc)
  mat <- cbind(mat, output[[i]])
}
kable(mat, col.names = c("1960s", "", "1970s", "", "1980s", "", "1990s", ""),align = "c") %>%
kable_styling(full_width = F)
| 1960s | | 1970s | | 1980s | | 1990s | |
|---|---|---|---|---|---|---|---|
| love | 2.65 % | back | 3.83 % | know | 4.84 % | love | 3.2 % |
| know | 1.91 % | mine | 2.99 % | love | 2.84 % | good | 2.78 % |
| say | 1.47 % | say | 2.75 % | can | 1.81 % | say | 1.71 % |
| come | 1.44 % | home | 2.63 % | baby | 1.73 % | baby | 1.65 % |
| good | 1.42 % | get | 2.03 % | good | 1.69 % | know | 1.57 % |
| now | 1.31 % | can | 1.91 % | say | 1.61 % | now | 1.39 % |
| see | 1.27 % | dig | 1.79 % | name | 1.31 % | just | 1.25 % |
| youre | 1.03 % | like | 1.56 % | shes | 1.19 % | shout | 1.13 % |
| get | 0.96 % | nothing | 1.56 % | way | 1.11 % | gonna | 1.05 % |
| baby | 0.93 % | way | 1.56 % | hand | 1.04 % | little | 1.01 % |
Evolution of the most frequently used lemmas.
word_evolution <- function(words, dtms, years, plot_title, color_pal = "Set2"){
  # relative frequency (in %) of each lemma within each decade's word total
  percs <- list()
  for(i in 1:length(words)){
    perc <- numeric(0)
    for(j in 1:length(dtms)){
      perc[j] <- dtms[[j]][words[i], 2]*100/sum(dtms[[j]][,2])
    }
    percs[[i]] <- perc
  }
  names(percs) <- words
  # reshape into a long data frame: one row per (decade, word) pair
  df <- NULL
  for(i in 1:length(words)){
    mat <- cbind(years, words[i], percs[[i]]) # use the years argument, not a global variable
    df <- rbind(df, mat)
  }
  df <- as.data.frame(df)
  colnames(df) <- c("decade", "word", "perc")
  df$perc <- as.numeric(df$perc)
  ggplot(data = df, aes(x = decade, y = perc, group = word, col = word)) +
    geom_line(size = 1) + labs(x = "Decade", y = "% of all words",
                               title = plot_title, color = "Lemma") +
    scale_color_brewer(palette = color_pal) + theme_bw()
}
words <- c("love", "home", "know", "good", "back", "baby")
dtms <- list(dtm60_lm, dtm70_lm,dtm80_lm, dtm90_lm)
decades <- c("1960s", "1970s", "1980s", "1990s")
title = "Evolution of the most used lemmas"
word_evolution(words, dtms, decades, title)
Sentiment scores by decade.
boxplot(avg_score60, avg_score70, avg_score80, avg_score90,
col=brewer.pal(4, "Set3"),
names=c("1960s","1970s", "1980s", "1990s"))
abline(h=0, col="red", lwd=2, lty=2)
Construction of the corpus data frame and of the user stoplist for the Xplortext analyses. The cleaning pipeline is identical to the one above, so we reuse the lemmatized corpus Corpus_lem.
corpus_df<-data.frame(text=sapply(Corpus_lem,as.character),stringsAsFactors = FALSE)
corpus_df$year<-df_definitivo$Anno
corpus_df$album<-df_definitivo$Album
corpus_df$titolo<-df_definitivo$Titolo
corpus_df$annoc<-df_definitivo$Annoc
U_stoplist<-c("dont","yeah","got","youre","m","t","don","s","re","oh","ve","ll","im")
decade.TD<-TextData(corpus_df, var.text="text", idiom="en",var.agg = "annoc",Fmin=70, Dmin=60,
stop.word.user=U_stoplist, stop.word.tm=TRUE) # Fmin/Dmin: frequency and document-count thresholds for keeping a word
TableLex.decade<-as.matrix(decade.TD$DocTerm)
res.chi2<-chisq.test(TableLex.decade) # chi-squared test on the decade-by-lemma table
tau<-(res.chi2$observed/res.chi2$expected) # association rate: values above 1 mark lemmas over-represented in a decade
kable(head(round(tau[1:4,1:10],2)),
col.names = c("can", "come", "get","go","good","just","know","like","love","make"), align = "c") %>%
kable_styling(full_width = F) # tau table
| | can | come | get | go | good | just | know | like | love | make |
|---|---|---|---|---|---|---|---|---|---|---|
| Anni 60 | 0.86 | 1.22 | 0.85 | 1.00 | 0.82 | 0.78 | 0.91 | 1.04 | 0.99 | 1.13 |
| Anni 70 | 2.07 | 0.92 | 4.63 | 1.02 | 0.77 | 0.55 | 0.17 | 1.86 | 0.32 | 0.15 |
| Anni 80 | 1.84 | 0.56 | 1.04 | 0.67 | 0.93 | 0.72 | 2.22 | 0.56 | 1.01 | 0.70 |
| Anni 90 | 0.76 | 0.68 | 0.78 | 1.18 | 1.53 | 1.79 | 0.72 | 1.00 | 1.13 | 0.95 |
res.LexCA<-LexCA(decade.TD, graph=FALSE)
Variance and percentage of variance explained by the dimensions obtained from the lexical correspondence analysis (LexCA).
kable(head(round(res.LexCA$eig,2),3)[,1:3],
col.names = c("Varianza", "% Varianza", "% Varianza cumulata"), align = "c") %>%
kable_styling(full_width = F) #eigenvalues
| | Varianza | % Varianza | % Varianza cumulata |
|---|---|---|---|
| dim 1 | 0.03 | 38.18 | 38.18 |
| dim 2 | 0.03 | 37.33 | 75.50 |
| dim 3 | 0.02 | 24.50 | 100.00 |
Graphical representation of the variance explained by the dimensions.
plot(res.LexCA,eigen=TRUE,selDoc=NULL,selWord=NULL, col="blue")
Representation of the classes (decades).
plot(res.LexCA,selWord=NULL,cex=1,col.doc="grey30")
lines(res.LexCA$row$coord[1:4,1],res.LexCA$row$coord[1:4,2],lwd=1.5,col="grey20")
Representation of the relationship between decades and words.
plot(res.LexCA,xlim=c(-0.5,0.6),ylim=c(-0.3,0.83),col.doc="red",col.word="black",cex=1)
lines(res.LexCA$row$coord[1:4,1],res.LexCA$row$coord[1:4,2],lwd=1.5,col="grey20")
Confidence ellipses.
ellipseLexCA(res.LexCA,selWord=NULL,col.doc="grey30")
lines(res.LexCA$row$coord[1:4,1],res.LexCA$row$coord[1:4,2],lwd=1.5,col="grey20")
The same analysis is repeated aggregating the lyrics by album.
album.TD<-TextData(corpus_df, var.text="text",var.agg = "album",idiom="en",Fmin=60, Dmin=60,
stop.word.user=U_stoplist, stop.word.tm=TRUE)
TableLex.album<-as.matrix(album.TD$DocTerm)
res.chi2<-chisq.test(TableLex.album)
res.LexCA_album<-LexCA(album.TD, graph=FALSE)
res.hc_album<-LexHCca(res.LexCA_album, nb.clust=5, graph=TRUE) # hierarchical clustering of the albums
#res.hc$clust.count
#res.hc$description$desc.cluster.doc$words
Finally, the same analysis is run on the individual lyrics, without aggregation.
testo.TD<-TextData(corpus_df, var.text="text",idiom="en",Fmin=60, Dmin=60,
stop.word.user=U_stoplist, stop.word.tm=TRUE)
TableLex.testo<-as.matrix(testo.TD$DocTerm)
res.chi2<-chisq.test(TableLex.testo)
res.LexCA<-LexCA(testo.TD, graph=FALSE)
#summary(res.LexCA,metaWords=FALSE)
plot(res.LexCA,eigen=TRUE,selDoc=NULL,selWord=NULL, col="blue")
Representation of the words.
plot(res.LexCA,selDoc=NULL,col.word="black",cex=1,
title="Word representation")
Dendrogram and clusters of the lyrics.
res.hc<-LexHCca(res.LexCA, nb.clust=5, graph=TRUE) # hierarchical clustering of the lyrics
## Warning: ggrepel: 166 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#res.hc$clust.count
#res.hc$description$desc.cluster.doc$words
Characteristic words of two of the clusters (presumably the output of res.hc$description$desc.cluster.doc$words): lemmas with a positive v.test are over-represented in the cluster, those with a negative v.test under-represented.

| | Intern % | Glob % | Intern freq | Glob freq | p.value | v.test |
|---|---|---|---|---|---|---|
| love | 10.179 | 2.577 | 359 | 600 | 0.000 | 25.479 |
| just | 1.616 | 0.636 | 57 | 148 | 0.000 | 6.868 |
| like | 1.304 | 0.812 | 46 | 189 | 0.001 | 3.238 |
| know | 2.694 | 2.002 | 95 | 466 | 0.003 | 3.010 |
| time | 0.312 | 0.606 | 11 | 141 | 0.013 | -2.479 |
| one | 0.284 | 0.649 | 10 | 151 | 0.002 | -3.069 |
| will | 0.284 | 0.696 | 10 | 162 | 0.001 | -3.391 |
| get | 0.227 | 0.777 | 8 | 181 | 0.000 | -4.474 |
| go | 0.425 | 1.370 | 15 | 319 | 0.000 | -5.859 |
| now | 0.284 | 1.164 | 10 | 271 | 0.000 | -6.031 |
| come | 0.227 | 1.134 | 8 | 264 | 0.000 | -6.395 |
| good | 0.482 | 1.658 | 17 | 386 | 0.000 | -6.704 |
The characteristic words of a second cluster:

| | Intern % | Glob % | Intern freq | Glob freq | p.value | v.test |
|---|---|---|---|---|---|---|
| now | 13.725 | 1.164 | 119 | 271 | 0.000 | 20.878 |
| make | 0.115 | 0.752 | 1 | 175 | 0.020 | -2.328 |
| good | 0.692 | 1.658 | 6 | 386 | 0.020 | -2.335 |
| want | 0.115 | 0.786 | 1 | 183 | 0.015 | -2.426 |
| time | 0.000 | 0.606 | 0 | 141 | 0.009 | -2.600 |
| will | 0.000 | 0.696 | 0 | 162 | 0.004 | -2.864 |
| tell | 0.000 | 0.713 | 0 | 166 | 0.004 | -2.912 |
| come | 0.115 | 1.134 | 1 | 264 | 0.001 | -3.304 |
| see | 0.000 | 1.027 | 0 | 239 | 0.000 | -3.696 |
| know | 0.461 | 2.002 | 4 | 466 | 0.000 | -3.712 |
| love | 0.692 | 2.577 | 6 | 600 | 0.000 | -4.007 |
| say | 0.000 | 1.499 | 0 | 349 | 0.000 | -4.658 |
Composition of the spherical k-means clusters.
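The soft partition soft.part used below is not computed in this excerpt; a minimal sketch, assuming a five-cluster soft (fuzzy) spherical k-means run on the document-term matrix of the lemmatized songs:
# Sketch (assumption): soft spherical k-means of the songs with 5 clusters
Lyrics_docterm <- DocumentTermMatrix(Corpus_lem) # documents in rows, as skmeans expects
set.seed(1) # skmeans starts from a random partition
soft.part <- skmeans(Lyrics_docterm, k = 5, m = 1.2) # m > 1 yields a soft (fuzzy) partition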
Silhouette
plot(silhouette(soft.part))
z<-as.data.frame(silhouette(soft.part))
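The data frame z makes the silhouette widths easy to summarize; for example, the average width per cluster:
aggregate(sil_width ~ cluster, data = z, FUN = mean) # mean silhouette width of each cluster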
Comparison clouds.
soft.clus.proto<-t(cl_prototypes(soft.part)) # cluster prototypes, transposed so terms are in rows
comparison.cloud(soft.clus.proto, max.words=100)