Откроем данные
И возьмем из них только те, в которых есть слова “Новый год” и
“Рождество”
english_songs <- read.csv("english_songs.csv")
cat("В датасете:", formatC(nrow(english_songs), big.mark = ".", decimal.mark = ","), "английских песен", "\n")
## В датасете: 2.105.266 английских песен
library(knitr)
na_counts <- sapply(english_songs, function(x) sum(is.na(x)))
na_counts_df <- data.frame(Column = names(na_counts), NA_Count = na_counts)
kable(na_counts_df, caption = "Количество пропущенных значений в датасете")
Количество пропущенных значений в датасете
| X |
X |
0 |
| title |
title |
0 |
| tag |
tag |
0 |
| artist |
artist |
0 |
| year |
year |
0 |
| views |
views |
0 |
| features |
features |
0 |
| lyrics |
lyrics |
0 |
| id |
id |
0 |
| language_cld3 |
language_cld3 |
0 |
| language_ft |
language_ft |
0 |
| language |
language |
0 |
new_year_songs <- english_songs[grepl("new year", english_songs$lyrics, ignore.case = TRUE) & grepl("christmas", english_songs$lyrics, ignore.case = TRUE), ]
cat("В датасете:", formatC(nrow(new_year_songs), big.mark = ".", decimal.mark = ","), "новогодних песен", "\n")
## В датасете: 1.167 новогодних песен
Новогодних песен с годами становится больше или меньше?
library(dplyr)
library(ggplot2)
library(gridExtra)
plot1 <- ggplot(english_songs, aes(x = year)) +
geom_bar(stat = "count", fill = "chocolate3") +
theme_classic()+
ggtitle("Все песни") +
xlab("Год") + ylab("Количество песен")
plot2 <- ggplot(new_year_songs, aes(x = year)) +
geom_bar(stat = "count", fill = "chocolate4") +
theme_classic()+
ggtitle("Новогодние песни") +
xlab("Год") + ylab("Количество песен")
grid.arrange(plot1, plot2, ncol = 2)

library(ggplot2)
christmas_songs_before_1950 <- new_year_songs %>%
filter(year < 1950)
songs_before_1950 <- english_songs %>%
filter(year < 1950)
top_old_authors <- songs_before_1950 %>%
group_by(artist) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
top_n(6)
top_old_christmas_authors <- christmas_songs_before_1950 %>%
group_by(artist) %>%
summarise(count = n()) %>%
arrange(desc(count)) %>%
top_n(6)
ggplot(top_old_authors, aes(x = reorder(artist, count), y = count)) +
geom_bar(stat = "identity", fill = "darkgoldenrod2", color='darkorange3', linewidth=1.3) +
ggtitle("Топ-6 авторов до 1950 года среди всех песен") +
xlab("Автор") +
ylab("Количество песен") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust=1, size=10),
axis.text.y = element_text(size=10))

ggplot(top_old_christmas_authors, aes(x = reorder(artist, count), y = count)) +
geom_bar(stat = "identity", fill = "darkgoldenrod2", color='darkorange3', linewidth=1.3) +
ggtitle("Топ-6 авторов до 1950 года среди новогодних песен") +
xlab("Автор") +
ylab("Количество песен") +
theme_classic() +
theme(axis.text.x = element_text(angle = 45, hjust=1, size=10),
axis.text.y = element_text(size=10))

plot1 <- ggplot(english_songs%>% filter(year > 1950), aes(x = year)) +
geom_bar(stat = "count", fill = "darkgoldenrod2") +
theme_classic()+
ggtitle("Все песни") +
xlab("Год") + ylab("Количество песен")
plot2 <- ggplot(new_year_songs %>% filter(year > 1950), aes(x = year)) +
geom_bar(stat = "count", fill = "darkgoldenrod3") +
theme_classic()+
ggtitle("Новогодние песни") +
xlab("Год") + ylab("Количество песен")
grid.arrange(plot1, plot2, ncol = 2)

plot1 <- ggplot(english_songs%>% filter(year > 2010), aes(x = year)) +
geom_bar(stat = "count", fill = "darkgoldenrod1") +
theme_classic()+
ggtitle("Все песни после 2010") +
xlab("Год") + ylab("Количество песен")
plot2 <- ggplot(new_year_songs %>% filter(year > 2010), aes(x = year)) +
geom_bar(stat = "count", fill = "darkorange") +
theme_classic()+
ggtitle("Новогодние песни после 2010") +
xlab("Год") + ylab("Количество песен")
grid.arrange(plot1, plot2, ncol = 2)

library(kableExtra)
songs_future <- english_songs %>%
filter(year > 2023)
songs_future %>%
select(title, artist, year, tag) %>%
kable(caption = "Песни, вышедшие после 2023 года")%>%
kable_styling(bootstrap_options = c("condensed", "bordered"))
Песни, вышедшие после 2023 года
|
title
|
artist
|
year
|
tag
|
|
Sing in an Irish accent
|
Tobbly Jones
|
2027
|
rap
|
plot1 <- ggplot(english_songs%>% filter(year > 2010 & year <= 2021), aes(x = year)) +
geom_bar(stat = "count", fill = "darkgoldenrod1") +
theme_classic()+
ggtitle("Все песни после 2010") +
xlab("Год") + ylab("Количество песен")
plot2 <- ggplot(new_year_songs %>% filter(year > 2010), aes(x = year)) +
geom_bar(stat = "count", fill = "darkorange") +
theme_classic()+
ggtitle("Новогодние песни после 2010") +
xlab("Год") + ylab("Количество песен")
grid.arrange(plot1, plot2, ncol = 2)

yearly_counts_english <- english_songs %>% filter(year > 1950 & year <= 2021) %>%
group_by(year) %>%
summarise(total_songs = n())
yearly_counts_new_year <- new_year_songs %>%
group_by(year) %>%
summarise(new_year_songs = n())
combined_data <- inner_join(yearly_counts_english, yearly_counts_new_year, by = "year")
combined_data <- combined_data %>% filter(year > 1950 & year <= 2021) %>%
mutate(proportion = new_year_songs / total_songs)
ggplot(combined_data, aes(x = year, y = proportion)) +
geom_line(color='darkgoldenrod1', linewidth=2) +
geom_smooth(method = "loess", color='chocolate',fill = "chocolate1", alpha = 0.3) +
theme_classic() +
ggtitle("Доля новогодних песен по годам") +
xlab("Год") +
ylab("Доля новогодних песен")

english_songs_subset <- english_songs %>%
filter(year >= 1950 & year <= 2019)
new_year_songs_subset <- new_year_songs %>%
filter(year >= 1950 & year <= 2019)
english_songs_subset$source <- 'Все песни'
new_year_songs_subset$source <- 'Новогодние'
all_songs <- rbind(english_songs_subset, new_year_songs_subset)
plot_all_songs <- ggplot(all_songs %>% filter(source == 'Все песни'), aes(x = tag)) +
geom_bar(fill = "orange2", color='orange4', linewidth=1) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_classic() +
ggtitle("Распределение жанров во всех песнях") +
xlab("Жанр") +
ylab("Количество песен")
plot_new_year_songs <- ggplot(all_songs %>% filter(source == 'Новогодние'), aes(x = tag)) +
geom_bar(fill = "orange2", color='orange4', linewidth=1) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
theme_classic() +
ggtitle("Распределение жанров в новогодних песнях") +
xlab("Жанр") +
ylab("Количество песен")
grid.arrange(plot_all_songs, plot_new_year_songs, ncol = 2)

genre_proportions <- all_songs %>%
group_by(tag) %>%
summarise(total_songs = n(),
new_year_songs = sum(source == 'Новогодние')) %>%
mutate(proportion = new_year_songs / total_songs)
genre_proportions_sorted <- genre_proportions %>%
arrange(desc(proportion))
genre_proportions_sorted$tag <- factor(genre_proportions_sorted$tag, levels = genre_proportions_sorted$tag)
ggplot(genre_proportions_sorted, aes(x = tag, y = proportion)) +
geom_bar(stat = "identity", color = "firebrick4", fill = "indianred3", linewidth=1.5) +
theme_classic()+
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
ggtitle("Доля новогодних песен по жанрам") +
xlab("Жанр") +
ylab("Доля новогодних песен")

Обработаем текст
install.packages("udpipe", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/ky/_jw6kcds24q_vrs7qxmgxpz00000gn/T//RtmpASw8Yq/downloaded_packages
library(udpipe)
library(tm)
english_model <- udpipe_download_model(language = "english", model_dir = ".")
model <- udpipe_load_model(english_model$file_model)
preprocess_text <- function(text) {
corpus <- VCorpus(VectorSource(text))
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, stripWhitespace)
lemmatized_text <- udpipe_annotate(model, x = as.character(content(corpus)))
lemmatized_text <- as.data.frame(lemmatized_text)
lemmatized_corpus <- paste(lemmatized_text$lemma, collapse = " ")
return(lemmatized_corpus)
}
original_text <- "I am just teasing you, Matthew. I would rather celebrate this New Year with you than a bunch of strangers in New York."
text_lower <- tolower(original_text)
text_no_punct <- removePunctuation(text_lower)
text_no_stopwords <- removeWords(text_no_punct, stopwords("english"))
text_no_whitespace <- stripWhitespace(text_no_stopwords)
suppressMessages({
ud_model <- udpipe_download_model(language = "english", model_dir = ".")
model <- udpipe_load_model(ud_model$file_model)
})
lemmatized <- udpipe_annotate(model, x = text_no_whitespace)
lemmatized <- as.data.frame(lemmatized)
lemmatized_text <- paste(lemmatized$lemma, collapse = " ")
text_steps <- data.frame(
Step = c("Исходное предложение", "Низкий регистр", "Удалим знаки препинания", "И стоп-слова", "Без лишних пробелов", "Приведем к инфинитиву"),
Text = c(original_text, text_lower, text_no_punct, text_no_stopwords, text_no_whitespace, lemmatized_text)
)
knitr::kable(text_steps, format = "html", caption = "Предобработаем предложение") %>%
kable_styling(bootstrap_options = c("condensed", "bordered"))
Предобработаем предложение
|
Step
|
Text
|
|
Исходное предложение
|
I am just teasing you, Matthew. I would rather celebrate this New Year
with you than a bunch of strangers in New York.
|
|
Низкий регистр
|
i am just teasing you, matthew. i would rather celebrate this new year
with you than a bunch of strangers in new york.
|
|
Удалим знаки препинания
|
i am just teasing you matthew i would rather celebrate this new year
with you than a bunch of strangers in new york
|
|
И стоп-слова
|
just teasing matthew rather celebrate new year bunch strangers new york
|
|
Без лишних пробелов
|
just teasing matthew rather celebrate new year bunch strangers new york
|
|
Приведем к инфинитиву
|
just tease matthew rather celebrate new year bunch strangers new york
|
library(pbapply)
new_year_songs$processed_lyrics <- pbsapply(new_year_songs$lyrics, preprocess_text)
library(dplyr)
grouped_texts <- new_year_songs %>%
group_by(tag) %>%
summarise(grouped_lyrics = paste(processed_lyrics, collapse = " "))
library(wordcloud)
## Loading required package: RColorBrewer
text_to_display <- paste(new_year_songs$processed_lyrics, collapse = " ")
word_freq <- data.frame(text = unlist(strsplit(text_to_display, " "))) %>%
filter(text != "") %>%
group_by(text) %>%
summarise(freq = n()) %>%
ungroup()
wordcloud(words = text_to_display, scale = c(3, 0.5), max.words = 50,
colors = brewer.pal(8, "Dark2"), random.order = FALSE)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

library(wordcloud)
par(mfrow = c(6, 1))
par(mar = c(2, 2, 2, 2))
for (i in 1:nrow(grouped_texts)) {
genre_name <- grouped_texts$tag[i]
genre_text <- grouped_texts$grouped_lyrics[i]
wordcloud(words = genre_text, scale = c(1, 0.25), max.words = 50,
colors = brewer.pal(8, "Dark2"), random.order = FALSE)
title(main = genre_name)}
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
