With the following script we analyzed the Beatles' lyrics to check whether characteristic features and recurring patterns can be found in the band's songs.
library(skmeans)      # spherical k-means
library(tm)           # corpus handling and cleaning
library(clue)         # cluster prototypes (cl_prototypes)
library(cluster)      # silhouette analysis
library(fpc)          # cluster validation
library(wordcloud)    # static word clouds and comparison clouds
library(readxl)       # data import from Excel
library(SnowballC)    # stemming support for tm
library(wordcloud2)   # interactive word clouds
library(RColorBrewer) # color palettes
library(syuzhet)      # sentiment and NRC emotion scoring
library(ggplot2)      # plots
library(textstem)     # lemmatization
library(knitr)        # kable tables
library(kableExtra)   # table styling
library(htmlwidgets)  # saving HTML widgets
library(webshot)      # rendering widgets to static images
library(Xplortext)    # lexical correspondence analysis
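The data frame df_definitivo used throughout (one row per song, with the columns Titolo, Testo, Album and Anno used below) is not built in this excerpt; since readxl is loaded, it is presumably imported from a spreadsheet. A minimal sketch, with a hypothetical file name:
df_definitivo <- read_excel("beatles_lyrics.xlsx") # hypothetical file name; one row per song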
Creation of the variable Annoc, which indicates the decade each song belongs to.
df_definitivo$Annoc[df_definitivo$Anno < 1970] <- "Anni 60"
df_definitivo$Annoc[df_definitivo$Anno >= 1970 & df_definitivo$Anno < 1980 ] <- "Anni 70"
df_definitivo$Annoc[df_definitivo$Anno >= 1980 & df_definitivo$Anno < 1990 ] <- "Anni 80"
df_definitivo$Annoc[df_definitivo$Anno >= 1990 & df_definitivo$Anno < 2000 ] <- "Anni 90"
df_definitivo$Annoc<-factor(df_definitivo$Annoc, levels=c("Anni 60", "Anni 70", "Anni 80",
"Anni 90"), ordered=TRUE)
table(df_definitivo$Annoc)
##
## Anni 60 Anni 70 Anni 80 Anni 90
## 145 10 28 54
Cleaning the text and building the corpus.
corpus <- Corpus(VectorSource(df_definitivo$Testo))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "dont", "yeah", "got", "youre"))
corpus_sw <- tm_map(corpus, stripWhitespace)
Corpus_lem <- tm_map(corpus_sw, lemmatize_strings) # lemmatized copy, used further on
Lyrics_dtm <- TermDocumentMatrix(corpus_sw) # matrix of the cleaned, not yet lemmatized, text
dtm_m <- as.matrix(Lyrics_dtm)
Most frequent terms extracted from the cleaned text.
dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_d <- data.frame(word = names(dtm_v), freq = dtm_v)
kable(head(dtm_d, 10), col.names = c("Parola", "Frequenza"), row.names = FALSE, align = "c") %>%
kable_styling(full_width = F)
| Parola | Frequenza |
|---|---|
| love | 530 |
| know | 409 |
| now | 271 |
| baby | 259 |
| well | 231 |
| can | 209 |
| see | 202 |
| like | 188 |
| say | 188 |
| gonna | 181 |
With wordcloud2 we created an interactive word cloud of the cleaned text.
wc <- wordcloud2(dtm_d, fontFamily = "Arial", size = 1.2)
wc
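wordcloud2 returns an HTML widget rather than a base-graphics plot; the htmlwidgets and webshot packages loaded above are presumably used to embed it in the static report. A minimal sketch, with hypothetical file names:
saveWidget(wc, "wc.html", selfcontained = FALSE) # save the widget to an HTML file
webshot("wc.html", "wc.png", vwidth = 800, vheight = 600, delay = 5) # render it to a PNG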
Most frequent terms extracted from the cleaned and lemmatized text.
Lyrics_dtm <- TermDocumentMatrix(Corpus_lem) # rebuild the matrix on the lemmatized corpus
dtm_m <- as.matrix(Lyrics_dtm)
dtm_v <- sort(rowSums(dtm_m), decreasing = TRUE)
dtm_d_l <- data.frame(word = names(dtm_v), freq = dtm_v)
kable(head(dtm_d_l,10), col.names = c("Lemma", "Frequenza"), row.names = FALSE, align = "c") %>%
kable_styling(full_width = F)
| Lemma | Frequenza |
|---|---|
| love | 600 |
| know | 466 |
| good | 386 |
| say | 349 |
| now | 271 |
| come | 264 |
| baby | 259 |
| see | 239 |
| can | 209 |
| like | 189 |
With the wordcloud package we created a static word cloud of the lemmatized frequencies.
wordcloud(dtm_d_l$word,dtm_d_l$freq,min.freq = 1,
max.words=150, random.order=FALSE, rot.per=0.35,
colors=brewer.pal(4, "Dark2"))
Counting the words associated with each NRC emotion.
# collapse the lemmatized corpus into a character vector, one element per song
text_full_lemmatized <- sapply(Corpus_lem, as.character)
d <- get_nrc_sentiment(text_full_lemmatized)
kable(head(d,10)[,1:8],
col.names = c("Rabbia", "Aspettativa", "Disgusto", "Paura", "Gioia", "Tristezza", "Sorpresa", "Fiducia"), align = "c") %>%
kable_styling(full_width = F)
| Rabbia | Aspettativa | Disgusto | Paura | Gioia | Tristezza | Sorpresa | Fiducia |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 |
| 4 | 6 | 0 | 2 | 5 | 2 | 5 | 7 |
| 0 | 5 | 0 | 0 | 1 | 2 | 1 | 2 |
| 0 | 5 | 0 | 0 | 8 | 0 | 4 | 5 |
| 2 | 5 | 2 | 1 | 9 | 1 | 5 | 4 |
| 4 | 2 | 4 | 6 | 3 | 4 | 2 | 3 |
| 7 | 2 | 3 | 2 | 3 | 5 | 2 | 2 |
| 1 | 1 | 1 | 2 | 1 | 2 | 1 | 1 |
| 0 | 8 | 1 | 0 | 6 | 1 | 1 | 4 |
| 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 |
td <- data.frame(t(d))
td_new <- data.frame(rowSums(td)) # total count per emotion over the whole discography
names(td_new)[1] <- "count"
td_new <- cbind("sentiment" = rownames(td_new), td_new)
rownames(td_new) <- NULL
td_new2 <- td_new[1:8,] # keep the eight emotions, dropping the overall negative/positive rows
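The last two rows of td_new, dropped above, hold the overall negative and positive counts (get_nrc_sentiment returns the eight emotions followed by these two polarity columns). As a quick side check they can be compared directly:
barplot(td_new$count[9:10], names.arg = as.character(td_new$sentiment[9:10]),
        col = c("tomato", "seagreen"), ylab = "Count") # overall negative vs. positive counts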
Counts of the terms associated with the eight emotions across the Beatles' entire discography.
sentiments <- td_new2$sentiment
ord <- order(td_new2$count, decreasing = T)
ord_sent <- td_new2[ord,]
rownames(ord_sent) <- NULL
ord_sent$sentiment <- factor(ord_sent$sentiment, levels = sentiments[ord])
quickplot(sentiment, data = ord_sent, weight = count, geom = "bar",
          fill = sentiment, ylab = "Count", xlab = "Sentiment") +
  theme_bw() + theme(legend.position = "none") # theme_bw() first, so it does not reset legend.position
We combine the four decade-level figures into a single panel.
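The avg_score60, ..., avg_score90 vectors plotted below are not computed in this excerpt; presumably each element is the average syuzhet sentiment score of one song. A minimal sketch, assuming the lemmatized texts and the Annoc grouping from above:
# Sketch (assumption): one average token-level sentiment score per song, split by decade
song_score <- sapply(text_full_lemmatized,
                     function(txt) mean(get_sentiment(get_tokens(txt))), USE.NAMES = FALSE)
avg_score60 <- song_score[df_definitivo$Annoc == "Anni 60"]
avg_score70 <- song_score[df_definitivo$Annoc == "Anni 70"]
avg_score80 <- song_score[df_definitivo$Annoc == "Anni 80"]
avg_score90 <- song_score[df_definitivo$Annoc == "Anni 90"]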
par(mfrow=c(2,2))
plot(avg_score60, type = "l", ylab = "Average Sentiment Score", xlab = "",
main = "Canzoni dal 1960-1969", xaxt='n',bty="n")
abline(h=0, col="red")
points(x = c(which.min(avg_score60), which.max(avg_score60)),
y = c(min(avg_score60), max(avg_score60)),
pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score60), which.max(avg_score60)),
y = c(min(avg_score60), max(avg_score60)),
labels = c("She's Leaving Home", "The long one"),pos=4)
plot(avg_score70, type = "l", ylab = "Average Sentiment Score", xlab = "",
main = "Canzoni dal 1970-1979", yaxt='n',xaxt='n',bty="n")
abline(h=0, col="red")
points(x = c(which.min(avg_score70), which.max(avg_score70)),
y = c(min(avg_score70), max(avg_score70)),
pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score70), which.max(avg_score70)),
y = c(min(avg_score70), max(avg_score70)),
labels = c("Maggie Mae", "For You Blue"),pos=4)
plot(avg_score80, type = "l", ylab = "Average Sentiment Score", xlab = "",
main = "Canzoni dal 1980-1989", bty='n',xaxt='n')
abline(h=0, col="red")
points(x = c(which.min(avg_score80), which.max(avg_score80)),
y = c(min(avg_score80), max(avg_score80)),
pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score80), which.max(avg_score80)),
y = c(min(avg_score80), max(avg_score80)),
labels = c("I Call Your Name", "The Ballad Of John And Yoko"),pos=2)
plot(avg_score90, type = "l", ylab = "Average Sentiment Score", xlab = "",
main = "Canzoni dal 1990-1999", yaxt='n',xaxt='n',bty="n")
abline(h=0, col="red")
points(x = c(which.min(avg_score90), which.max(avg_score90)),
y = c(min(avg_score90), max(avg_score90)),
pch = 21, col = "black", bg = "red")
text(x = c(which.min(avg_score90), which.max(avg_score90)),
y = c(min(avg_score90), max(avg_score90)),
labels = c("Not Guilty", "I Got A Woman"),pos=4)
Here are the word clouds by decade.
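The per-decade objects used below (Anni60_corpus, Anni60_corpus_dtm, dtm60_lm and their 70s/80s/90s counterparts) are not built in this excerpt. A plausible sketch for the 1960s; the other decades follow the same pattern:
# Sketch (assumption): per-decade corpus, term-document matrix and frequency table
Anni60_corpus <- Corpus_lem[df_definitivo$Annoc == "Anni 60"]
Anni60_corpus_dtm <- TermDocumentMatrix(Anni60_corpus)
freq60 <- sort(rowSums(as.matrix(Anni60_corpus_dtm)), decreasing = TRUE)
dtm60_lm <- data.frame(word = names(freq60), freq = freq60) # row names are the words themselves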
wordcloud(dtm60_lm$word, dtm60_lm$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 60")
wordcloud(dtm70_lm$word, dtm70_lm$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 70")
wordcloud(dtm80_lm$word, dtm80_lm$freq, min.freq = 1,
          max.words = 180, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 80")
wordcloud(dtm90_lm$word, dtm90_lm$freq, min.freq = 1,
          max.words = 150, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(4, "Dark2"))
title(main = "Anni 90")
Comparing the decades by the number of lemmas used.
nsongs <- c(length(Anni60_corpus), length(Anni70_corpus), length(Anni80_corpus),
length(Anni90_corpus))
twords <- c(sum(dtm60_lm[,2]), sum(dtm70_lm[,2]), sum(dtm80_lm[,2]), sum(dtm90_lm[,2]))
uwords <- c(nrow(Anni60_corpus_dtm), nrow(Anni70_corpus_dtm), nrow(Anni80_corpus_dtm),
nrow(Anni90_corpus_dtm))
wps <- round(twords / nsongs, 2)
uwps <- round(uwords / nsongs, 2)
names(nsongs) <- c("1960-1969", "1970-1979", "1980-1989", "1990-1999")
kable(cbind(nsongs, twords, uwords, wps, uwps), col.names = c("Numero di canzoni", "Lemmi totali", "Lemmi unici", "Lemmi per canzone", "Lemmi unici per canzone"), align = "c") %>%
kable_styling(full_width = F)
| | Numero di canzoni | Lemmi totali | Lemmi unici | Lemmi per canzone | Lemmi unici per canzone |
|---|---|---|---|---|---|
| 1960-1969 | 145 | 13573 | 1555 | 93.61 | 10.72 |
| 1970-1979 | 10 | 836 | 293 | 83.60 | 29.30 |
| 1980-1989 | 28 | 2602 | 579 | 92.93 | 20.68 |
| 1990-1999 | 54 | 4961 | 799 | 91.87 | 14.80 |
Comparing the decades by word-usage frequency.
dtms <- list(dtm60_lm, dtm70_lm, dtm80_lm, dtm90_lm)
output <- list()
mat <- NULL
for(i in 1:length(dtms)){
  words <- dtms[[i]][1:10, 1]                       # ten most frequent lemmas of the decade
  freq <- dtms[[i]][1:10, 2] / sum(dtms[[i]][, 2])  # relative frequency within the decade
  perc <- paste(round(freq * 100, 2), "%")
  output[[i]] <- cbind(words, perc)
  mat <- cbind(mat, output[[i]])
}
kable(mat, col.names = c("1960s", "", "1970s", "", "1980s", "", "1990s", ""),align = "c") %>%
kable_styling(full_width = F)
| 1960s | | 1970s | | 1980s | | 1990s | |
|---|---|---|---|---|---|---|---|
| love | 2.65 % | back | 3.83 % | know | 4.84 % | love | 3.2 % |
| know | 1.91 % | mine | 2.99 % | love | 2.84 % | good | 2.78 % |
| say | 1.47 % | say | 2.75 % | can | 1.81 % | say | 1.71 % |
| come | 1.44 % | home | 2.63 % | baby | 1.73 % | baby | 1.65 % |
| good | 1.42 % | get | 2.03 % | good | 1.69 % | know | 1.57 % |
| now | 1.31 % | can | 1.91 % | say | 1.61 % | now | 1.39 % |
| see | 1.27 % | dig | 1.79 % | name | 1.31 % | just | 1.25 % |
| youre | 1.03 % | like | 1.56 % | shes | 1.19 % | shout | 1.13 % |
| get | 0.96 % | nothing | 1.56 % | way | 1.11 % | gonna | 1.05 % |
| baby | 0.93 % | way | 1.56 % | hand | 1.04 % | little | 1.01 % |
Evolution of the most frequently used lemmas.
word_evolution <- function(words, dtms, years, plot_title, color_pal = "Set2"){
  # relative frequency (in %) of each lemma within each decade's word total
  percs <- list()
  for(i in 1:length(words)){
    perc <- numeric(0)
    for(j in 1:length(dtms)){
      perc[j] <- dtms[[j]][words[i], 2]*100/sum(dtms[[j]][,2])
    }
    percs[[i]] <- perc
  }
  names(percs) <- words
  # reshape into a long data frame: one row per (decade, word) pair
  df <- NULL
  for(i in 1:length(words)){
    mat <- cbind(years, words[i], percs[[i]]) # use the years argument, not a global variable
    df <- rbind(df, mat)
  }
  df <- as.data.frame(df)
  colnames(df) <- c("decade", "word", "perc")
  df$perc <- as.numeric(df$perc)
  ggplot(data = df, aes(x = decade, y = perc, group = word, col = word)) +
    geom_line(size = 1) + labs(x = "Decade", y = "% of all words",
                               title = plot_title, color = "Lemma") +
    scale_color_brewer(palette = color_pal) + theme_bw()
}
words <- c("love", "home", "know", "good", "back", "baby")
dtms <- list(dtm60_lm, dtm70_lm,dtm80_lm, dtm90_lm)
decades <- c("1960s", "1970s", "1980s", "1990s")
title = "Evolution of the most used lemmas"
word_evolution(words, dtms, decades, title)
Sentiment scores by decade.
boxplot(avg_score60, avg_score70, avg_score80, avg_score90,
col=brewer.pal(4, "Set3"),
names=c("1960s","1970s", "1980s", "1990s"))
abline(h=0, col="red", lwd=2, lty=2)
Construction of the corpus data frame and of the user stoplist for the Xplortext analyses. The cleaning pipeline is identical to the one above, so we reuse the lemmatized corpus Corpus_lem.
corpus_df<-data.frame(text=sapply(Corpus_lem,as.character),stringsAsFactors = FALSE)
corpus_df$year<-df_definitivo$Anno
corpus_df$album<-df_definitivo$Album
corpus_df$titolo<-df_definitivo$Titolo
corpus_df$annoc<-df_definitivo$Annoc
U_stoplist<-c("dont","yeah","got","youre","m","t","don","s","re","oh","ve","ll","im")
decade.TD<-TextData(corpus_df, var.text="text", idiom="en",var.agg = "annoc",Fmin=70, Dmin=60,
stop.word.user=U_stoplist, stop.word.tm=TRUE) # Fmin/Dmin: frequency and document-count thresholds for keeping a word
TableLex.decade<-as.matrix(decade.TD$DocTerm)
res.chi2<-chisq.test(TableLex.decade) # chi-squared test on the decade-by-lemma table
tau<-(res.chi2$observed/res.chi2$expected) # association rate: values above 1 mark lemmas over-represented in a decade
kable(head(round(tau[1:4,1:10],2)),
col.names = c("can", "come", "get","go","good","just","know","like","love","make"), align = "c") %>%
kable_styling(full_width = F) # tau table
| | can | come | get | go | good | just | know | like | love | make |
|---|---|---|---|---|---|---|---|---|---|---|
| Anni 60 | 0.86 | 1.22 | 0.85 | 1.00 | 0.82 | 0.78 | 0.91 | 1.04 | 0.99 | 1.13 |
| Anni 70 | 2.07 | 0.92 | 4.63 | 1.02 | 0.77 | 0.55 | 0.17 | 1.86 | 0.32 | 0.15 |
| Anni 80 | 1.84 | 0.56 | 1.04 | 0.67 | 0.93 | 0.72 | 2.22 | 0.56 | 1.01 | 0.70 |
| Anni 90 | 0.76 | 0.68 | 0.78 | 1.18 | 1.53 | 1.79 | 0.72 | 1.00 | 1.13 | 0.95 |
res.LexCA<-LexCA(decade.TD, graph=FALSE)
Variance and percentage of variance explained by the dimensions obtained from the lexical correspondence analysis (LexCA).
kable(head(round(res.LexCA$eig,2),3)[,1:3],
col.names = c("Varianza", "% Varianza", "% Varianza cumulata"), align = "c") %>%
kable_styling(full_width = F) #eigenvalues
| | Varianza | % Varianza | % Varianza cumulata |
|---|---|---|---|
| dim 1 | 0.03 | 38.18 | 38.18 |
| dim 2 | 0.03 | 37.33 | 75.50 |
| dim 3 | 0.02 | 24.50 | 100.00 |
Graphical representation of the variance explained by the dimensions.
plot(res.LexCA,eigen=TRUE,selDoc=NULL,selWord=NULL, col="blue")
Representation of the classes (decades).
plot(res.LexCA,selWord=NULL,cex=1,col.doc="grey30")
lines(res.LexCA$row$coord[1:4,1],res.LexCA$row$coord[1:4,2],lwd=1.5,col="grey20")
Representation of the relationship between decades and words.
plot(res.LexCA,xlim=c(-0.5,0.6),ylim=c(-0.3,0.83),col.doc="red",col.word="black",cex=1)
lines(res.LexCA$row$coord[1:4,1],res.LexCA$row$coord[1:4,2],lwd=1.5,col="grey20")
Confidence ellipses.
ellipseLexCA(res.LexCA,selWord=NULL,col.doc="grey30")
lines(res.LexCA$row$coord[1:4,1],res.LexCA$row$coord[1:4,2],lwd=1.5,col="grey20")
The same analysis is repeated aggregating the lyrics by album.
album.TD<-TextData(corpus_df, var.text="text",var.agg = "album",idiom="en",Fmin=60, Dmin=60,
stop.word.user=U_stoplist, stop.word.tm=TRUE)
TableLex.album<-as.matrix(album.TD$DocTerm)
res.chi2<-chisq.test(TableLex.album)
res.LexCA_album<-LexCA(album.TD, graph=FALSE)
res.hc_album<-LexHCca(res.LexCA_album, nb.clust=5, graph=TRUE) # hierarchical clustering of the albums
#res.hc$clust.count
#res.hc$description$desc.cluster.doc$words
Finally, the same analysis is run on the individual lyrics, without aggregation.
testo.TD<-TextData(corpus_df, var.text="text",idiom="en",Fmin=60, Dmin=60,
stop.word.user=U_stoplist, stop.word.tm=TRUE)
TableLex.testo<-as.matrix(testo.TD$DocTerm)
res.chi2<-chisq.test(TableLex.testo)
res.LexCA<-LexCA(testo.TD, graph=FALSE)
#summary(res.LexCA,metaWords=FALSE)
plot(res.LexCA,eigen=TRUE,selDoc=NULL,selWord=NULL, col="blue")
Representation of the words.
plot(res.LexCA,selDoc=NULL,col.word="black",cex=1,
title="Word representation")
Dendrogram and clusters of the lyrics.
res.hc<-LexHCca(res.LexCA, nb.clust=5, graph=TRUE) # hierarchical clustering of the lyrics
## Warning: ggrepel: 166 unlabeled data points (too many overlaps). Consider
## increasing max.overlaps
#res.hc$clust.count
#res.hc$description$desc.cluster.doc$words
Characteristic words of two of the clusters (presumably the output of res.hc$description$desc.cluster.doc$words): lemmas with a positive v.test are over-represented in the cluster, those with a negative v.test under-represented.

| | Intern % | Glob % | Intern freq | Glob freq | p.value | v.test |
|---|---|---|---|---|---|---|
| love | 10.179 | 2.577 | 359 | 600 | 0.000 | 25.479 |
| just | 1.616 | 0.636 | 57 | 148 | 0.000 | 6.868 |
| like | 1.304 | 0.812 | 46 | 189 | 0.001 | 3.238 |
| know | 2.694 | 2.002 | 95 | 466 | 0.003 | 3.010 |
| time | 0.312 | 0.606 | 11 | 141 | 0.013 | -2.479 |
| one | 0.284 | 0.649 | 10 | 151 | 0.002 | -3.069 |
| will | 0.284 | 0.696 | 10 | 162 | 0.001 | -3.391 |
| get | 0.227 | 0.777 | 8 | 181 | 0.000 | -4.474 |
| go | 0.425 | 1.370 | 15 | 319 | 0.000 | -5.859 |
| now | 0.284 | 1.164 | 10 | 271 | 0.000 | -6.031 |
| come | 0.227 | 1.134 | 8 | 264 | 0.000 | -6.395 |
| good | 0.482 | 1.658 | 17 | 386 | 0.000 | -6.704 |
The characteristic words of a second cluster:

| | Intern % | Glob % | Intern freq | Glob freq | p.value | v.test |
|---|---|---|---|---|---|---|
| now | 13.725 | 1.164 | 119 | 271 | 0.000 | 20.878 |
| make | 0.115 | 0.752 | 1 | 175 | 0.020 | -2.328 |
| good | 0.692 | 1.658 | 6 | 386 | 0.020 | -2.335 |
| want | 0.115 | 0.786 | 1 | 183 | 0.015 | -2.426 |
| time | 0.000 | 0.606 | 0 | 141 | 0.009 | -2.600 |
| will | 0.000 | 0.696 | 0 | 162 | 0.004 | -2.864 |
| tell | 0.000 | 0.713 | 0 | 166 | 0.004 | -2.912 |
| come | 0.115 | 1.134 | 1 | 264 | 0.001 | -3.304 |
| see | 0.000 | 1.027 | 0 | 239 | 0.000 | -3.696 |
| know | 0.461 | 2.002 | 4 | 466 | 0.000 | -3.712 |
| love | 0.692 | 2.577 | 6 | 600 | 0.000 | -4.007 |
| say | 0.000 | 1.499 | 0 | 349 | 0.000 | -4.658 |
Composition of the spherical k-means clusters.
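The soft partition soft.part used below is not computed in this excerpt; a minimal sketch, assuming a five-cluster soft (fuzzy) spherical k-means run on the document-term matrix of the lemmatized songs:
# Sketch (assumption): soft spherical k-means of the songs with 5 clusters
Lyrics_docterm <- DocumentTermMatrix(Corpus_lem) # documents in rows, as skmeans expects
set.seed(1) # skmeans starts from a random partition
soft.part <- skmeans(Lyrics_docterm, k = 5, m = 1.2) # m > 1 yields a soft (fuzzy) partition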
Silhouette
plot(silhouette(soft.part))
z<-as.data.frame(silhouette(soft.part))
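The data frame z makes the silhouette widths easy to summarize; for example, the average width per cluster:
aggregate(sil_width ~ cluster, data = z, FUN = mean) # mean silhouette width of each cluster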
Comparison clouds.
soft.clus.proto<-t(cl_prototypes(soft.part)) # cluster prototypes, transposed so terms are in rows
comparison.cloud(soft.clus.proto, max.words=100)