With the following script we analysed the lyrics of the Beatles' songs in order to check whether recurring features and patterns can be found in the group's repertoire.

Loading the libraries

library(skmeans)
library(tm)
library(clue)
library(cluster)
library(fpc)
library(wordcloud)
library(readxl)
library(SnowballC)
library(wordcloud2)
library(RColorBrewer)
library(syuzhet)
library(ggplot2)
library(textstem)
library(knitr)
library(kableExtra)
library(htmlwidgets)
library(webshot)
library(Xplortext)

Loading the dataset
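The code that loads the dataset is not shown here; a minimal sketch, assuming the discography is stored in an Excel file (the file name below is hypothetical) with one row per song and the columns Titolo, Album, Anno and Testo used throughout the script:

df_definitivo <- read_excel("discografia_beatles.xlsx")  # hypothetical file name
str(df_definitivo)                                       # expect the columns Titolo, Album, Anno, Testo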

Analysis of the discography

Text cleaning and first word clouds

Creation of the variable Annoc, which indicates the decade each song belongs to (an equivalent construction with cut() is sketched after the frequency table below).

df_definitivo$Annoc[df_definitivo$Anno < 1970] <- "Anni 60"
df_definitivo$Annoc[df_definitivo$Anno  >= 1970 & df_definitivo$Anno  < 1980  ] <- "Anni 70"
df_definitivo$Annoc[df_definitivo$Anno  >= 1980 & df_definitivo$Anno  < 1990  ] <- "Anni 80"
df_definitivo$Annoc[df_definitivo$Anno  >= 1990 & df_definitivo$Anno  < 2000  ] <- "Anni 90"

df_definitivo$Annoc<-factor(df_definitivo$Annoc, levels=c("Anni 60", "Anni 70", "Anni 80", 
                                                          "Anni 90"), ordered=TRUE)
table(df_definitivo$Annoc)
## 
## Anni 60 Anni 70 Anni 80 Anni 90 
##     145      10      28      54
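The same variable can be built more compactly with cut(); a minimal sketch, equivalent to the assignments above under the assumption that every year falls before 2000:

df_definitivo$Annoc <- cut(df_definitivo$Anno,
                           breaks = c(-Inf, 1970, 1980, 1990, 2000),
                           labels = c("Anni 60", "Anni 70", "Anni 80", "Anni 90"),
                           right = FALSE, ordered_result = TRUE)  # years < 1970 -> "Anni 60", [1970,1980) -> "Anni 70", etc.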

Text cleaning and construction of the corpus

text <- as.character(df_definitivo$Testo)
corpus <- Corpus(VectorSource(df_definitivo$Testo))
corpus <- tm_map(corpus, removePunctuation)                 # strip punctuation
corpus <- tm_map(corpus, removeNumbers)                     # strip digits
corpus <- tm_map(corpus, content_transformer(tolower))      # lowercase everything
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "dont", "yeah", "got", "youre"))
corpus_sw <- tm_map(corpus, stripWhitespace)
Corpus_lem <- tm_map(corpus_sw, lemmatize_strings)          # lemmatize with textstem
Lyrics_dtm <- TermDocumentMatrix(corpus_sw)                 # term-document matrix on the cleaned, non-lemmatized text
dtm_m <- as.matrix(Lyrics_dtm)

Most common terms extracted from the cleaned text.

dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_d <- data.frame(word = names(dtm_v), freq = dtm_v)
kable(head(dtm_d, 10), col.names = c("Parola", "Frequenza"), row.names = FALSE, align = "c") %>%
  kable_styling(full_width = F)
Parola Frequenza
love 530
know 409
now 271
baby 259
well 231
can 209
see 202
like 188
say 188
gonna 181
Lyrics_dtm <- TermDocumentMatrix(Corpus_lem)   # rebuild the term-document matrix on the lemmatized corpus
dtm_m <- as.matrix(Lyrics_dtm)

With wordcloud2 we created a word cloud from the cleaned text.

wc <- wordcloud2(dtm_d, fontFamily = "Arial", size = 1.2)
wc
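wordcloud2 returns an htmlwidget rather than a static plot; the htmlwidgets and webshot packages loaded at the beginning can be used to export it as an image. A minimal sketch (the file names are hypothetical, and webshot requires PhantomJS, installable with webshot::install_phantomjs()):

saveWidget(wc, "wordcloud_beatles.html", selfcontained = FALSE)  # save the widget as an HTML page
webshot("wordcloud_beatles.html", "wordcloud_beatles.png",
        vwidth = 900, vheight = 600)                             # render the HTML page to a PNG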

Most common terms extracted from the cleaned and lemmatized text.

dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_d_l <- data.frame(word = names(dtm_v), freq = dtm_v)
kable(head(dtm_d_l,10), col.names = c("Lemma", "Frequenza"), row.names = FALSE, align = "c") %>%
  kable_styling(full_width = F)
Lemma Frequenza
love 600
know 466
good 386
say 349
now 271
come 264
baby 259
see 239
can 209
like 189

With wordcloud we created a word cloud from the lemmatized text.

wordcloud(dtm_d_l$word,dtm_d_l$freq,min.freq = 1,
          max.words=150, random.order=FALSE, rot.per=0.35, 
          colors=brewer.pal(4, "Dark2"))

Sentiment analysis

Count of the words associated with each emotion

# Collect the lemmatized text of every document into one character vector.
# Note: append() places each new document before the ones already collected, so the
# vector ends up in reverse song order; the per-song sentiment counts are unaffected.
text_full_lemmatized <- c()
for(i in 1:length(Corpus_lem)){
  text_full_lemmatized <- append(Corpus_lem[[i]]$content, text_full_lemmatized)
}
d <- get_nrc_sentiment(text_full_lemmatized)
kable(head(d, 10)[, 1:8],
      col.names = c("Rabbia", "Aspettativa", "Disgusto", "Paura", "Gioia", "Tristezza",
                    "Sorpresa", "Fiducia"), align = "c") %>%
  kable_styling(full_width = F)
Rabbia Aspettativa Disgusto Paura Gioia Tristezza Sorpresa Fiducia
0 0 0 1 0 0 0 0
4 6 0 2 5 2 5 7
0 5 0 0 1 2 1 2
0 5 0 0 8 0 4 5
2 5 2 1 9 1 5 4
4 2 4 6 3 4 2 3
7 2 3 2 3 5 2 2
1 1 1 2 1 2 1 1
0 8 1 0 6 1 1 4
0 0 0 0 0 0 0 0
td<-data.frame(t(d))
td_new <- data.frame(rowSums(td))
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
td_new2<-td_new[1:8,]

Counts of the terms associated with the 8 emotions for the Beatles' entire discography

sentiments <- td_new2$sentiment
ord <- order(td_new2$count, decreasing = T)
ord_sent <- td_new2[ord,]
rownames(ord_sent) <- NULL
ord_sent$sentiment <- factor(ord_sent$sentiment, levels = sentiments[ord])
quickplot(sentiment, data = ord_sent, weight = count, geom = "bar",
          fill = sentiment, ylab = "Count", xlab = "Sentiment") +
  theme_bw() + theme(legend.position = "none")  # theme_bw() first, so it does not override the legend setting

Analysis by decade
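The vectors avg_score60-avg_score90 used in this section are not defined in the code shown here; a minimal sketch of how they could be computed with syuzhet, assuming each element is the mean sentence-level sentiment score of one song of the decade (the helper avg_score_decade is hypothetical):

avg_score_decade <- function(lyrics){
  sapply(lyrics, function(x){
    sentences <- get_sentences(as.character(x))          # split the lyrics of a song into sentences
    mean(get_sentiment(sentences, method = "syuzhet"))   # average the sentence-level scores
  })
}
avg_score60 <- avg_score_decade(df_definitivo$Testo[df_definitivo$Annoc == "Anni 60"])
avg_score70 <- avg_score_decade(df_definitivo$Testo[df_definitivo$Annoc == "Anni 70"])
avg_score80 <- avg_score_decade(df_definitivo$Testo[df_definitivo$Annoc == "Anni 80"])
avg_score90 <- avg_score_decade(df_definitivo$Testo[df_definitivo$Annoc == "Anni 90"])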

We arrange the four decade figures in a single 2x2 panel

par(mfrow=c(2,2))

plot(avg_score60, type = "l", ylab = "Average Sentiment Score", xlab = "",
     main = "Canzoni dal 1960-1969", xaxt='n',bty="n")
abline(h=0, col="red")
points(x = c(which.min(avg_score60), which.max(avg_score60)),
       y = c(min(avg_score60), max(avg_score60)),
       pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score60), which.max(avg_score60)),
     y = c(min(avg_score60), max(avg_score60)),
     labels = c("She's Leaving Home", "The long one"),pos=4)

plot(avg_score70, type = "l", ylab = "Average Sentiment Score", xlab = "",
     main = "Canzoni dal 1970-1979", yaxt='n',xaxt='n',bty="n")
abline(h=0, col="red")
points(x = c(which.min(avg_score70), which.max(avg_score70)),
       y = c(min(avg_score70), max(avg_score70)),
       pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score70), which.max(avg_score70)),
     y = c(min(avg_score70), max(avg_score70)),
     labels = c("Maggie Mae", "For You Blue"),pos=4)

plot(avg_score80, type = "l", ylab = "Average Sentiment Score", xlab = "",
     main = "Canzoni dal 1980-1989", bty='n',xaxt='n')
abline(h=0, col="red")
points(x = c(which.min(avg_score80), which.max(avg_score80)),
       y = c(min(avg_score80), max(avg_score80)),
       pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score80), which.max(avg_score80)),
     y = c(min(avg_score80), max(avg_score80)),
     labels = c("I Call Your Name", "The Ballad Of John And Yoko"),pos=2)

plot(avg_score90, type = "l", ylab = "Average Sentiment Score", xlab = "",
     main = "Canzoni dal 1990-1999", yaxt='n',xaxt='n',bty="n")
abline(h=0, col="red")
points(x = c(which.min(avg_score90), which.max(avg_score90)),
       y = c(min(avg_score90), max(avg_score90)),
       pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score90), which.max(avg_score90)),
     y = c(min(avg_score90), max(avg_score90)),
     labels = c("Not Guilty", "I Got A Woman"),pos=4)

Here are the word clouds for each decade
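The per-decade frequency tables dtm60_lm-dtm90_lm (and the Anni60_corpus-Anni90_corpus objects used further below) are not defined in the code shown here; a minimal sketch of how they could be built, repeating the cleaning and lemmatization pipeline on the songs of each decade (the helper decade_freq is hypothetical):

decade_freq <- function(docs){
  corp <- Corpus(VectorSource(docs))
  corp <- tm_map(corp, removePunctuation)
  corp <- tm_map(corp, removeNumbers)
  corp <- tm_map(corp, content_transformer(tolower))
  corp <- tm_map(corp, removeWords, c(stopwords("en"), "dont", "yeah", "got", "youre"))
  corp <- tm_map(corp, stripWhitespace)
  corp <- tm_map(corp, lemmatize_strings)
  m <- as.matrix(TermDocumentMatrix(corp))
  v <- sort(rowSums(m), decreasing = TRUE)
  data.frame(word = names(v), freq = v)                  # one row per lemma, sorted by frequency
}
dtm60_lm <- decade_freq(df_definitivo$Testo[df_definitivo$Annoc == "Anni 60"])
# and analogously dtm70_lm, dtm80_lm and dtm90_lm for the other decades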

wordcloud(dtm60_lm$word, dtm60_lm$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 60")

wordcloud(dtm70_lm$word, dtm70_lm$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 70")

wordcloud(dtm80_lm$word, dtm80_lm$freq, min.freq = 1,
          max.words = 180, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 80")

wordcloud(dtm90_lm$word, dtm90_lm$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 90")

Comparison of the decades by number of lemmas used

nsongs <- c(length(Anni60_corpus), length(Anni70_corpus), length(Anni80_corpus),
            length(Anni90_corpus))
twords <- c(sum(dtm60_lm[,2]), sum(dtm70_lm[,2]), sum(dtm80_lm[,2]), sum(dtm90_lm[,2]))
uwords <- c(nrow(Anni60_corpus_dtm), nrow(Anni70_corpus_dtm), nrow(Anni80_corpus_dtm),
            nrow(Anni90_corpus_dtm))
wps <- round(twords / nsongs, 2)
uwps <- round(uwords / nsongs, 2)
names(nsongs) <- c("1960-1969", "1970-1979", "1980-1989", "1990-1999")
kable(cbind(nsongs, twords, uwords, wps, uwps), col.names = c("Numero di canzoni", "Lemmi totali", "Lemmi unici", "Lemmi per canzone", "Lemmi unici per canzoni"), align = "c") %>%
  kable_styling(full_width = F)
Numero di canzoni Lemmi totali Lemmi unici Lemmi per canzone Lemmi unici per canzoni
1960-1969 145 13573 1555 93.61 10.72
1970-1979 10 836 293 83.60 29.30
1980-1989 28 2602 579 92.93 20.68
1990-1999 54 4961 799 91.87 14.80

Comparison of the decades by frequency of word use

dtms <- list(dtm60_lm, dtm70_lm, dtm80_lm, dtm90_lm)
output <- list()
mat <- NULL
for(i in 1:length(dtms)){
  words <- dtms[[i]][1:10, 1]
  freq <- dtms[[i]][1:10, 2]/sum(dtms[[i]][,2])
  perc <- paste(round(freq*100, 2), "%")
  output[[i]] <- cbind(words, perc)
  mat <- cbind(mat, output[[i]])
}
kable(mat, col.names = c("1960s", "", "1970s", "", "1980s", "", "1990s", ""),align = "c") %>%
  kable_styling(full_width = F)
1960s (lemma, %) 1970s (lemma, %) 1980s (lemma, %) 1990s (lemma, %)
love 2.65 % back 3.83 % know 4.84 % love 3.2 %
know 1.91 % mine 2.99 % love 2.84 % good 2.78 %
say 1.47 % say 2.75 % can 1.81 % say 1.71 %
come 1.44 % home 2.63 % baby 1.73 % baby 1.65 %
good 1.42 % get 2.03 % good 1.69 % know 1.57 %
now 1.31 % can 1.91 % say 1.61 % now 1.39 %
see 1.27 % dig 1.79 % name 1.31 % just 1.25 %
youre 1.03 % like 1.56 % shes 1.19 % shout 1.13 %
get 0.96 % nothing 1.56 % way 1.11 % gonna 1.05 %
baby 0.93 % way 1.56 % hand 1.04 % little 1.01 %

Evolution of the most used lemmas

word_evolution <- function(words, dtms, years, plot_title, color_pal = "Set2"){
  percs <- list()
  for(i in 1:length(words)){
    perc <- numeric(0)
    for(j in 1:length(dtms)){
      perc[j] <- dtms[[j]][words[i], 2]*100/sum(dtms[[j]][,2])
    }
    percs[[i]] <- perc
  }
  names(percs) <- words
  df <- NULL
  for(i in 1:length(words)){
    mat <- cbind(years, words[i], percs[[i]])   # use the 'years' argument, not a global variable
    df <- rbind(df, mat)
  }
  df <- as.data.frame(df)
  colnames(df) <- c("decade", "word", "perc")
  df$perc <- as.numeric(df$perc)
  ggplot(data = df, aes(x = decade, y = perc, group = word, col = word)) +
    geom_line(size = 1) + labs(x = "Decade", y = "% sull'intero numero di parole",
                               title = plot_title, color = "Lemma") +
    scale_color_brewer(palette=color_pal) + theme_bw()
}
words <- c("love", "home", "know", "good", "back", "baby")
dtms <- list(dtm60_lm, dtm70_lm,dtm80_lm, dtm90_lm)
decades <- c("1960s", "1970s", "1980s", "1990s")
title = "Evolution of the most used lemmas"
word_evolution(words, dtms, decades, title)

Sentiment scores by decade

boxplot(avg_score60, avg_score70, avg_score80, avg_score90, 
        col=brewer.pal(4, "Set3"),
        names=c("1960s","1970s", "1980s", "1990s"))
abline(h=0, col="red", lwd=2, lty=2)

Lexical correspondence analysis

Construction of the corpus and stop list

text <- as.character(df_definitivo$Testo)
corpus <- Corpus(VectorSource(df_definitivo$Testo))   # same cleaning pipeline as above, rebuilt for the lexical CA
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "dont", "yeah", "got", "youre"))
corpus_sw <- tm_map(corpus, stripWhitespace)
Corpus_lem <- tm_map(corpus_sw, lemmatize_strings)

corpus_df<-data.frame(text=sapply(Corpus_lem,as.character),stringsAsFactors = FALSE)
corpus_df$year<-df_definitivo$Anno
corpus_df$album<-df_definitivo$Album
corpus_df$titolo<-df_definitivo$Titolo
corpus_df$annoc<-df_definitivo$Annoc
U_stoplist<-c("dont","yeah","got","youre","m","t","don","s","re","oh","ve","ll","im")

Lexical CA by decade

The association rate tau computed below is the ratio between the observed and the expected frequency from the chi-squared test: values above 1 indicate that a lemma is over-represented in a decade, values below 1 that it is under-represented.

decade.TD<-TextData(corpus_df, var.text="text", idiom="en",var.agg = "annoc",Fmin=70, Dmin=60,
                 stop.word.user=U_stoplist, stop.word.tm=TRUE)
TableLex.decade<-as.matrix(decade.TD$DocTerm)

res.chi2<-chisq.test(TableLex.decade) # chi-squared test
tau<-(res.chi2$observed/res.chi2$expected) # association rate (observed/expected)

kable(head(round(tau[1:4, 1:10], 2)),
      col.names = c("can", "come", "get", "go", "good", "just", "know", "like", "love", "make"),
      align = "c") %>%
  kable_styling(full_width = F)  # tau table
can come get go good just know like love make
Anni 60 0.86 1.22 0.85 1.00 0.82 0.78 0.91 1.04 0.99 1.13
Anni 70 2.07 0.92 4.63 1.02 0.77 0.55 0.17 1.86 0.32 0.15
Anni 80 1.84 0.56 1.04 0.67 0.93 0.72 2.22 0.56 1.01 0.70
Anni 90 0.76 0.68 0.78 1.18 1.53 1.79 0.72 1.00 1.13 0.95
res.LexCA<-LexCA(decade.TD, graph=FALSE)

Variance and percentage of variance explained by the dimensions obtained from the lexical CA

kable(head(round(res.LexCA$eig, 2), 3)[, 1:3],
      col.names = c("Varianza", "% Varianza", "% Varianza cumulata"), align = "c") %>%
  kable_styling(full_width = F) # eigenvalues
Varianza % Varianza % Varianza cumulata
dim 1 0.03 38.18 38.18
dim 2 0.03 37.33 75.50
dim 3 0.02 24.50 100.00

Graphical representation of the variance explained by the dimensions

plot(res.LexCA,eigen=TRUE,selDoc=NULL,selWord=NULL, col="blue")

Representation of the classes (decades)

plot(res.LexCA,selWord=NULL,cex=1,col.doc="grey30")
lines(res.LexCA$row$coord[1:4,1],res.LexCA$row$coord[1:4,2],lwd=1.5,col="grey20")

Representation of the relationship between decades and words

plot(res.LexCA,xlim=c(-0.5,0.6),ylim=c(-0.3,0.83),col.doc="red",col.word="black",cex=1)
lines(res.LexCA$row$coord[1:4,1],res.LexCA$row$coord[1:4,2],lwd=1.5,col="grey20")

Confidence ellipses

ellipseLexCA(res.LexCA,selWord=NULL,col.doc="grey30")
lines(res.LexCA$row$coord[1:4,1],res.LexCA$row$coord[1:4,2],lwd=1.5,col="grey20")

Hierarchical clustering by album

album.TD<-TextData(corpus_df, var.text="text",var.agg = "album",idiom="en",Fmin=60, Dmin=60,
                   stop.word.user=U_stoplist, stop.word.tm=TRUE)
TableLex.album<-as.matrix(album.TD$DocTerm)
res.chi2<-chisq.test(TableLex.album)
res.LexCA_album<-LexCA(album.TD, graph=FALSE)
res.hc_album<-LexHCca(res.LexCA_album, nb.clust=5, graph=TRUE) # clustering of the albums

#res.hc_album$clust.count
#res.hc_album$description$desc.cluster.doc$words

Hierarchical clustering of the lyrics

testo.TD<-TextData(corpus_df, var.text="text",idiom="en",Fmin=60, Dmin=60,
                   stop.word.user=U_stoplist, stop.word.tm=TRUE)
TableLex.testo<-as.matrix(testo.TD$DocTerm)

res.chi2<-chisq.test(TableLex.testo)
res.LexCA<-LexCA(testo.TD, graph=FALSE)
#summary(res.LexCA,metaWords=FALSE)

plot(res.LexCA,eigen=TRUE,selDoc=NULL,selWord=NULL, col="blue")

Representation of the words

plot(res.LexCA,selDoc=NULL,col.word="black",cex=1,
     title="Word representation")

Dendrogram and clusters of the lyrics

res.hc<-LexHCca(res.LexCA, nb.clust=5, graph=TRUE) # clustering of the lyrics

## Warning: ggrepel: 166 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps

#res.hc$clust.count
#res.hc$description$desc.cluster.doc$words
Lemmas characterizing two of the five lyric clusters, as described by LexHCca:

Cluster 1
Intern % glob % Intern freq Glob freq p.value v.test
love 10.179 2.577 359 600 0.000 25.479
just 1.616 0.636 57 148 0.000 6.868
like 1.304 0.812 46 189 0.001 3.238
know 2.694 2.002 95 466 0.003 3.010
time 0.312 0.606 11 141 0.013 -2.479
one 0.284 0.649 10 151 0.002 -3.069
will 0.284 0.696 10 162 0.001 -3.391
get 0.227 0.777 8 181 0.000 -4.474
go 0.425 1.370 15 319 0.000 -5.859
now 0.284 1.164 10 271 0.000 -6.031
come 0.227 1.134 8 264 0.000 -6.395
good 0.482 1.658 17 386 0.000 -6.704
Cluster 5
Intern % glob % Intern freq Glob freq p.value v.test
now 13.725 1.164 119 271 0.000 20.878
make 0.115 0.752 1 175 0.020 -2.328
good 0.692 1.658 6 386 0.020 -2.335
want 0.115 0.786 1 183 0.015 -2.426
time 0.000 0.606 0 141 0.009 -2.600
will 0.000 0.696 0 162 0.004 -2.864
tell 0.000 0.713 0 166 0.004 -2.912
come 0.115 1.134 1 264 0.001 -3.304
see 0.000 1.027 0 239 0.000 -3.696
know 0.461 2.002 4 466 0.000 -3.712
love 0.692 2.577 6 600 0.000 -4.007
say 0.000 1.499 0 349 0.000 -4.658

K-means

Composition of the clusters from the spherical k-means
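The soft partition soft.part used below is not created in the code shown here; a minimal sketch of how it could be obtained with skmeans, clustering the documents of the lemmatized term-document matrix (the number of clusters k = 5 and the fuzziness parameter m = 1.2 are assumptions):

# skmeans expects documents on the rows, so the term-document matrix is transposed;
# m > 1 yields a soft (fuzzy) spherical k-means partition
soft.part <- skmeans(t(dtm_m), k = 5, m = 1.2)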

Silhouette

plot(silhouette(soft.part))

z<-as.data.frame(silhouette(soft.part))

Comparison clouds

soft.clus.proto<-t(cl_prototypes(soft.part))
comparison.cloud(soft.clus.proto, max.words=100)