Откроем данные

И возьмем из них только те, в которых есть слова “Новый год” и “Рождество”

english_songs <- read.csv("english_songs.csv")
cat("В датасете:", formatC(nrow(english_songs), big.mark = ".", decimal.mark = ","), "английских песен", "\n")
## В датасете: 2.105.266 английских песен
library(knitr)

na_counts <- sapply(english_songs, function(x) sum(is.na(x)))
na_counts_df <- data.frame(Column = names(na_counts), NA_Count = na_counts)
kable(na_counts_df, caption = "Количество пропущенных значений в датасете")
Количество пропущенных значений в датасете
Column NA_Count
X X 0
title title 0
tag tag 0
artist artist 0
year year 0
views views 0
features features 0
lyrics lyrics 0
id id 0
language_cld3 language_cld3 0
language_ft language_ft 0
language language 0
new_year_songs <- english_songs[grepl("new year", english_songs$lyrics, ignore.case = TRUE) & grepl("christmas", english_songs$lyrics, ignore.case = TRUE), ]
cat("В датасете:", formatC(nrow(new_year_songs), big.mark = ".", decimal.mark = ","), "новогодних песен", "\n")
## В датасете: 1.167 новогодних песен

Новогодних песен с годами становится больше или меньше?

library(dplyr)
library(ggplot2)
library(gridExtra)


plot1 <- ggplot(english_songs, aes(x = year)) +
  geom_bar(stat = "count", fill = "chocolate3") +
  theme_classic()+
  ggtitle("Все песни") +
  xlab("Год") + ylab("Количество песен")


plot2 <- ggplot(new_year_songs, aes(x = year)) +
  geom_bar(stat = "count", fill = "chocolate4") +
  theme_classic()+
  ggtitle("Новогодние песни") +
  xlab("Год") + ylab("Количество песен")


grid.arrange(plot1, plot2, ncol = 2)

library(ggplot2)

christmas_songs_before_1950 <- new_year_songs %>%
  filter(year < 1950)
songs_before_1950 <- english_songs %>%
  filter(year < 1950)

top_old_authors <- songs_before_1950 %>%
  group_by(artist) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(6)

top_old_christmas_authors <- christmas_songs_before_1950 %>%
  group_by(artist) %>%
  summarise(count = n()) %>%
  arrange(desc(count)) %>%
  top_n(6)

ggplot(top_old_authors, aes(x = reorder(artist, count), y = count)) +
  geom_bar(stat = "identity", fill = "darkgoldenrod2", color='darkorange3', linewidth=1.3) +
  ggtitle("Топ-6 авторов до 1950 года среди всех песен") +
  xlab("Автор") +
  ylab("Количество песен") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust=1, size=10),  
        axis.text.y = element_text(size=10)) 

ggplot(top_old_christmas_authors, aes(x = reorder(artist, count), y = count)) +
  geom_bar(stat = "identity", fill = "darkgoldenrod2", color='darkorange3', linewidth=1.3) +
  ggtitle("Топ-6 авторов до 1950 года среди новогодних песен") +
  xlab("Автор") +
  ylab("Количество песен") +
  theme_classic() +
  theme(axis.text.x = element_text(angle = 45, hjust=1, size=10), 
        axis.text.y = element_text(size=10))

plot1 <- ggplot(english_songs%>% filter(year > 1950), aes(x = year)) +
  geom_bar(stat = "count", fill = "darkgoldenrod2") +
  theme_classic()+
  ggtitle("Все песни") +
  xlab("Год") + ylab("Количество песен")


plot2 <- ggplot(new_year_songs %>% filter(year > 1950), aes(x = year)) +
  geom_bar(stat = "count", fill = "darkgoldenrod3") +
  theme_classic()+
  ggtitle("Новогодние песни") +
  xlab("Год") + ylab("Количество песен")


grid.arrange(plot1, plot2, ncol = 2)

plot1 <- ggplot(english_songs%>% filter(year > 2010), aes(x = year)) +
  geom_bar(stat = "count", fill = "darkgoldenrod1") +
  theme_classic()+
  ggtitle("Все песни после 2010") +
  xlab("Год") + ylab("Количество песен")


plot2 <- ggplot(new_year_songs %>% filter(year > 2010), aes(x = year)) +
  geom_bar(stat = "count", fill = "darkorange") +
  theme_classic()+
  ggtitle("Новогодние песни после 2010") +
  xlab("Год") + ylab("Количество песен")


grid.arrange(plot1, plot2, ncol = 2)

library(kableExtra)
songs_future <- english_songs %>%
  filter(year > 2023)

songs_future %>%
  select(title, artist, year, tag) %>%  
  kable(caption = "Песни, вышедшие после 2023 года")%>%
  kable_styling(bootstrap_options = c("condensed", "bordered"))
Песни, вышедшие после 2023 года
title artist year tag
Sing in an Irish accent Tobbly Jones 2027 rap
plot1 <- ggplot(english_songs%>% filter(year > 2010 & year <= 2021), aes(x = year)) +
  geom_bar(stat = "count", fill = "darkgoldenrod1") +
  theme_classic()+
  ggtitle("Все песни после 2010") +
  xlab("Год") + ylab("Количество песен")


plot2 <- ggplot(new_year_songs %>% filter(year > 2010), aes(x = year)) +
  geom_bar(stat = "count", fill = "darkorange") +
  theme_classic()+
  ggtitle("Новогодние песни после 2010") +
  xlab("Год") + ylab("Количество песен")


grid.arrange(plot1, plot2, ncol = 2)

yearly_counts_english <- english_songs %>% filter(year > 1950 & year <= 2021) %>%
  group_by(year) %>%
  summarise(total_songs = n())

yearly_counts_new_year <- new_year_songs %>%
  group_by(year) %>%
  summarise(new_year_songs = n())

combined_data <- inner_join(yearly_counts_english, yearly_counts_new_year, by = "year")
combined_data <- combined_data %>% filter(year > 1950 & year <= 2021) %>%
  mutate(proportion = new_year_songs / total_songs)
ggplot(combined_data, aes(x = year, y = proportion)) +
  geom_line(color='darkgoldenrod1', linewidth=2) +
  geom_smooth(method = "loess", color='chocolate',fill = "chocolate1", alpha = 0.3) +
  theme_classic() +
  ggtitle("Доля новогодних песен по годам") +
  xlab("Год") +
  ylab("Доля новогодних песен")

english_songs_subset <- english_songs %>%
  filter(year >= 1950 & year <= 2019)

new_year_songs_subset <- new_year_songs %>%
  filter(year >= 1950 & year <= 2019)
english_songs_subset$source <- 'Все песни'
new_year_songs_subset$source <- 'Новогодние'
all_songs <- rbind(english_songs_subset, new_year_songs_subset)

plot_all_songs <- ggplot(all_songs %>% filter(source == 'Все песни'), aes(x = tag)) +
  geom_bar(fill = "orange2", color='orange4', linewidth=1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme_classic() +
  ggtitle("Распределение жанров во всех песнях") +
  xlab("Жанр") +
  ylab("Количество песен")

plot_new_year_songs <- ggplot(all_songs %>% filter(source == 'Новогодние'), aes(x = tag)) +
  geom_bar(fill = "orange2", color='orange4', linewidth=1) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  theme_classic() +
  ggtitle("Распределение жанров в новогодних песнях") +
  xlab("Жанр") +
  ylab("Количество песен")

grid.arrange(plot_all_songs, plot_new_year_songs, ncol = 2)

genre_proportions <- all_songs %>%
  group_by(tag) %>%
  summarise(total_songs = n(),
            new_year_songs = sum(source == 'Новогодние')) %>%
  mutate(proportion = new_year_songs / total_songs)
genre_proportions_sorted <- genre_proportions %>%
  arrange(desc(proportion))
genre_proportions_sorted$tag <- factor(genre_proportions_sorted$tag, levels = genre_proportions_sorted$tag)
ggplot(genre_proportions_sorted, aes(x = tag, y = proportion)) +
  geom_bar(stat = "identity", color = "firebrick4", fill = "indianred3", linewidth=1.5) +
  theme_classic()+
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  ggtitle("Доля новогодних песен по жанрам") +
  xlab("Жанр") +
  ylab("Доля новогодних песен")

Обработаем текст

install.packages("udpipe", repos = "http://cran.us.r-project.org")
## 
## The downloaded binary packages are in
##  /var/folders/ky/_jw6kcds24q_vrs7qxmgxpz00000gn/T//RtmpASw8Yq/downloaded_packages
library(udpipe)
library(tm)

english_model <- udpipe_download_model(language = "english", model_dir = ".")
model <- udpipe_load_model(english_model$file_model)

preprocess_text <- function(text) {
  corpus <- VCorpus(VectorSource(text))
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stripWhitespace)
  lemmatized_text <- udpipe_annotate(model, x = as.character(content(corpus)))
  lemmatized_text <- as.data.frame(lemmatized_text)
  lemmatized_corpus <- paste(lemmatized_text$lemma, collapse = " ")

  return(lemmatized_corpus)
}
original_text <- "I am just teasing you, Matthew. I would rather celebrate this New Year with you than a bunch of strangers in New York."

text_lower <- tolower(original_text)
text_no_punct <- removePunctuation(text_lower)
text_no_stopwords <- removeWords(text_no_punct, stopwords("english"))
text_no_whitespace <- stripWhitespace(text_no_stopwords)

suppressMessages({
  ud_model <- udpipe_download_model(language = "english", model_dir = ".")
  model <- udpipe_load_model(ud_model$file_model)
})

lemmatized <- udpipe_annotate(model, x = text_no_whitespace)
lemmatized <- as.data.frame(lemmatized)
lemmatized_text <- paste(lemmatized$lemma, collapse = " ")

text_steps <- data.frame(
  Step = c("Исходное предложение", "Низкий регистр", "Удалим знаки препинания", "И стоп-слова", "Без лишних пробелов", "Приведем к инфинитиву"),
  Text = c(original_text, text_lower, text_no_punct, text_no_stopwords, text_no_whitespace, lemmatized_text)
)

knitr::kable(text_steps, format = "html", caption = "Предобработаем предложение") %>%
  kable_styling(bootstrap_options = c("condensed", "bordered"))
Предобработаем предложение
Step Text
Исходное предложение I am just teasing you, Matthew. I would rather celebrate this New Year with you than a bunch of strangers in New York.
Низкий регистр i am just teasing you, matthew. i would rather celebrate this new year with you than a bunch of strangers in new york.
Удалим знаки препинания i am just teasing you matthew i would rather celebrate this new year with you than a bunch of strangers in new york
И стоп-слова just teasing matthew rather celebrate new year bunch strangers new york
Без лишних пробелов just teasing matthew rather celebrate new year bunch strangers new york
Приведем к инфинитиву just tease matthew rather celebrate new year bunch strangers new york
library(pbapply)
new_year_songs$processed_lyrics <- pbsapply(new_year_songs$lyrics, preprocess_text)
library(dplyr)
grouped_texts <- new_year_songs %>%
  group_by(tag) %>%
  summarise(grouped_lyrics = paste(processed_lyrics, collapse = " "))
library(wordcloud)
## Loading required package: RColorBrewer
text_to_display <- paste(new_year_songs$processed_lyrics, collapse = " ")
word_freq <- data.frame(text = unlist(strsplit(text_to_display, " "))) %>%
  filter(text != "") %>%
  group_by(text) %>%
  summarise(freq = n()) %>%
  ungroup()
wordcloud(words = text_to_display, scale = c(3, 0.5), max.words = 50,
            colors = brewer.pal(8, "Dark2"), random.order = FALSE)
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents

library(wordcloud)
par(mfrow = c(6, 1))
par(mar = c(2, 2, 2, 2))

for (i in 1:nrow(grouped_texts)) {
  genre_name <- grouped_texts$tag[i]
  genre_text <- grouped_texts$grouped_lyrics[i]

  wordcloud(words = genre_text, scale = c(1, 0.25), max.words = 50,
            colors = brewer.pal(8, "Dark2"), random.order = FALSE)
  title(main = genre_name)}
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation): transformation
## drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents