# Read the raw track table and normalise its column names with
# clean_names() (the unnamed first column `X1` becomes `x1`).
music <- clean_names(
  read_csv("~/EDA-project/raw-data/tcc_ceds_music.csv")
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
## .default = col_double(),
## artist_name = col_character(),
## track_name = col_character(),
## genre = col_character(),
## lyrics = col_character(),
## topic = col_character()
## )
## See spec(...) for full column specifications.
# List the distinct genre labels present in the data.
unique(music[["genre"]])
## [1] "pop" "country" "blues" "jazz" "reggae" "rock" "hip hop"
# Drop the auto-named first column, then add a running row id
# (1, 2, 3, ...) stored as a double.
music <- select(music, -x1)
music <- mutate(music, id = cumsum(rep(1, n())))
# Preview the first rows.
head(music)
## Warning: `...` is not empty.
##
## We detected these problematic arguments:
## * `needs_dots`
##
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 6 x 31
## artist_name track_name release_date genre lyrics len dating violence
## <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 mukesh mohabbat … 1950 pop hold … 95 5.98e-4 0.0637
## 2 frankie la… i believe 1950 pop belie… 51 3.55e-2 0.0968
## 3 johnnie ray cry 1950 pop sweet… 24 2.77e-3 0.00277
## 4 pérez prado patricia 1950 pop kiss … 54 4.82e-2 0.00155
## 5 giorgos pa… apopse ei… 1950 pop till … 48 1.35e-3 0.00135
## 6 perry como round and… 1950 pop convo… 98 1.05e-3 0.421
## # … with 23 more variables: world_life <dbl>, night_time <dbl>,
## # shake_the_audience <dbl>, family_gospel <dbl>, romantic <dbl>,
## # communication <dbl>, obscene <dbl>, music <dbl>, movement_places <dbl>,
## # light_visual_perceptions <dbl>, family_spiritual <dbl>, like_girls <dbl>,
## # sadness <dbl>, feelings <dbl>, danceability <dbl>, loudness <dbl>,
## # acousticness <dbl>, instrumentalness <dbl>, valence <dbl>, energy <dbl>,
## # topic <chr>, age <dbl>, id <dbl>
# Age of each track in years, measured against 2019.
music <- mutate(music, age = 2019 - release_date)
# Preview the first rows.
head(music)
## Warning: `...` is not empty.
##
## We detected these problematic arguments:
## * `needs_dots`
##
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 6 x 31
## artist_name track_name release_date genre lyrics len dating violence
## <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 mukesh mohabbat … 1950 pop hold … 95 5.98e-4 0.0637
## 2 frankie la… i believe 1950 pop belie… 51 3.55e-2 0.0968
## 3 johnnie ray cry 1950 pop sweet… 24 2.77e-3 0.00277
## 4 pérez prado patricia 1950 pop kiss … 54 4.82e-2 0.00155
## 5 giorgos pa… apopse ei… 1950 pop till … 48 1.35e-3 0.00135
## 6 perry como round and… 1950 pop convo… 98 1.05e-3 0.421
## # … with 23 more variables: world_life <dbl>, night_time <dbl>,
## # shake_the_audience <dbl>, family_gospel <dbl>, romantic <dbl>,
## # communication <dbl>, obscene <dbl>, music <dbl>, movement_places <dbl>,
## # light_visual_perceptions <dbl>, family_spiritual <dbl>, like_girls <dbl>,
## # sadness <dbl>, feelings <dbl>, danceability <dbl>, loudness <dbl>,
## # acousticness <dbl>, instrumentalness <dbl>, valence <dbl>, energy <dbl>,
## # topic <chr>, age <dbl>, id <dbl>
# Tally tracks per genre, most frequent genre first.
genre_count <- count(music, genre)
genre_count <- arrange(genre_count, desc(n))

# Bar chart of the genre counts, bars ordered from tallest to shortest.
ggplot(genre_count, aes(x = reorder(genre, -n), y = n)) +
  geom_col(fill = "skyblue", color = "blue") +
  labs(x = "Genre", y = "Count")

# Show the column names currently in the table.
colnames(music)
## [1] "artist_name" "track_name"
## [3] "release_date" "genre"
## [5] "lyrics" "len"
## [7] "dating" "violence"
## [9] "world_life" "night_time"
## [11] "shake_the_audience" "family_gospel"
## [13] "romantic" "communication"
## [15] "obscene" "music"
## [17] "movement_places" "light_visual_perceptions"
## [19] "family_spiritual" "like_girls"
## [21] "sadness" "feelings"
## [23] "danceability" "loudness"
## [25] "acousticness" "instrumentalness"
## [27] "valence" "energy"
## [29] "topic" "age"
## [31] "id"
# Keep the contiguous run of columns from `dating` through `energy`.
new_t <- select(music, dating:energy)
names(new_t)
## [1] "dating" "violence"
## [3] "world_life" "night_time"
## [5] "shake_the_audience" "family_gospel"
## [7] "romantic" "communication"
## [9] "obscene" "music"
## [11] "movement_places" "light_visual_perceptions"
## [13] "family_spiritual" "like_girls"
## [15] "sadness" "feelings"
## [17] "danceability" "loudness"
## [19] "acousticness" "instrumentalness"
## [21] "valence" "energy"
# Coerce valence and energy to numeric before computing correlations.
new_t$valence <- as.numeric(new_t$valence)
new_t$energy <- as.numeric(new_t$energy)
# Drop every row that has an NA in any retained column.
new_t <- na.omit(new_t)
# Correlation-matrix plot of all retained columns (`dating` through
# `energy`); corrplot() draws the pairwise coefficients returned by cor(),
# not a scatterplot matrix.
corrplot(cor(new_t))
# Pearson correlation between acousticness and energy across all tracks.
cor(new_t$acousticness, new_t$energy, use = "everything", method = "pearson")
## [1] -0.7200447
Checking the three strongest correlations (among acousticness, energy and loudness) and how they differ across genres.
# Pairwise Pearson correlations among acousticness, energy and loudness,
# computed over all tracks in `new_t`.
with(
  new_t,
  cor(acousticness, energy, use = "everything", method = "pearson")
)
## [1] -0.7200447
with(
  new_t,
  cor(loudness, energy, use = "everything", method = "pearson")
)
## [1] 0.7731351
with(
  new_t,
  cor(acousticness, loudness, use = "everything", method = "pearson")
)
## [1] -0.5397384
#' Per-genre Pearson correlation between two columns
#'
#' @param vec Character vector of genre labels to evaluate.
#' @param x,y Names (as strings) of the two numeric columns to correlate.
#' @param data Data frame with a `genre` column plus the columns named by
#'   `x` and `y`. Defaults to the global `music` table, so existing
#'   three-argument calls keep working unchanged.
#' @return A data frame with one row per element of `vec` and the columns
#'   `genre`, `corr` (signed coefficient) and `abs_corr` (its absolute
#'   value). An empty `vec` yields a zero-row data frame.
correlation <- function(vec, x, y, data = music) {
  val <- vapply(
    vec,
    function(g) {
      # which() discards NA comparisons, so rows with a missing genre are
      # never selected.
      rows <- which(data[["genre"]] == g)
      # `[[` extracts plain vectors, so cor() returns a scalar instead of
      # a 1 x 1 matrix.
      cor(
        data[[x]][rows], data[[y]][rows],
        use = "everything", method = "pearson"
      )
    },
    numeric(1),
    # Keep the result unnamed; otherwise data.frame() would pick the genre
    # labels up as row names.
    USE.NAMES = FALSE
  )
  data.frame(genre = vec, corr = val, abs_corr = abs(val))
}
# Per-genre correlation between acousticness and energy, listed from the
# largest coefficient to the smallest.
gen_vec <- unique(music[["genre"]])
new_df <- correlation(gen_vec, "acousticness", "energy")
arrange(new_df, desc(corr))
## genre corr abs_corr
## 1 hip hop -0.1607465 0.1607465
## 2 reggae -0.2292713 0.2292713
## 3 country -0.6990483 0.6990483
## 4 pop -0.7052780 0.7052780
## 5 blues -0.7073538 0.7073538
## 6 rock -0.7177903 0.7177903
## 7 jazz -0.7946293 0.7946293
# Bars show the absolute correlation per genre, strongest first.
ggplot(new_df, aes(x = reorder(genre, -abs_corr), y = abs_corr)) +
  geom_col(color = "rosybrown", fill = "red", alpha = 0.7) +
  labs(x = "Genre", y = "Acousticness and Energy correlation")
# Per-genre correlation between loudness and energy, listed from the
# largest coefficient to the smallest.
gen_vec <- unique(music[["genre"]])
new_df <- correlation(gen_vec, "loudness", "energy")
arrange(new_df, desc(corr))
## genre corr abs_corr
## 1 jazz 0.7890322 0.7890322
## 2 rock 0.7706344 0.7706344
## 3 country 0.7596707 0.7596707
## 4 pop 0.7389424 0.7389424
## 5 reggae 0.7347203 0.7347203
## 6 blues 0.7338127 0.7338127
## 7 hip hop 0.6058536 0.6058536
# Bars show the absolute correlation per genre, strongest first.
ggplot(new_df, aes(x = reorder(genre, -abs_corr), y = abs_corr)) +
  geom_col(color = "rosybrown", fill = "red", alpha = 0.7) +
  labs(x = "Genre", y = "Loudness and Energy correlation")
# Per-genre correlation between acousticness and loudness, listed from the
# largest coefficient to the smallest.
gen_vec <- unique(music[["genre"]])
new_df <- correlation(gen_vec, "acousticness", "loudness")
arrange(new_df, desc(corr))
## genre corr abs_corr
## 1 reggae -0.09367978 0.09367978
## 2 hip hop -0.11420242 0.11420242
## 3 blues -0.46248664 0.46248664
## 4 country -0.47786505 0.47786505
## 5 pop -0.51063782 0.51063782
## 6 rock -0.57171206 0.57171206
## 7 jazz -0.60469124 0.60469124
# Bars show the absolute correlation per genre, strongest first.
ggplot(new_df, aes(x = reorder(genre, -abs_corr), y = abs_corr)) +
  geom_col(color = "rosybrown", fill = "red", alpha = 0.7) +
  labs(x = "Genre", y = "Loudness and Acousticness correlation")
Creating three new variables, each combining one pair of the highly correlated features
# Derive three composite features, each the arithmetic mean of one pair
# drawn from acousticness, energy and loudness.
# NOTE(review): acousticness correlates negatively with both energy and
# loudness in this data, so a plain mean partly cancels the two inputs --
# confirm that is intended.
music <- mutate(
  music,
  ac_en = (acousticness + energy) / 2,
  ac_lo = (acousticness + loudness) / 2,
  en_lo = (energy + loudness) / 2
)
# Preview the first rows, now including the composite columns.
head(music)
## Warning: `...` is not empty.
##
## We detected these problematic arguments:
## * `needs_dots`
##
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 6 x 34
## artist_name track_name release_date genre lyrics len dating violence
## <chr> <chr> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 mukesh mohabbat … 1950 pop hold … 95 5.98e-4 0.0637
## 2 frankie la… i believe 1950 pop belie… 51 3.55e-2 0.0968
## 3 johnnie ray cry 1950 pop sweet… 24 2.77e-3 0.00277
## 4 pérez prado patricia 1950 pop kiss … 54 4.82e-2 0.00155
## 5 giorgos pa… apopse ei… 1950 pop till … 48 1.35e-3 0.00135
## 6 perry como round and… 1950 pop convo… 98 1.05e-3 0.421
## # … with 26 more variables: world_life <dbl>, night_time <dbl>,
## # shake_the_audience <dbl>, family_gospel <dbl>, romantic <dbl>,
## # communication <dbl>, obscene <dbl>, music <dbl>, movement_places <dbl>,
## # light_visual_perceptions <dbl>, family_spiritual <dbl>, like_girls <dbl>,
## # sadness <dbl>, feelings <dbl>, danceability <dbl>, loudness <dbl>,
## # acousticness <dbl>, instrumentalness <dbl>, valence <dbl>, energy <dbl>,
## # topic <chr>, age <dbl>, id <dbl>, ac_en <dbl>, ac_lo <dbl>, en_lo <dbl>
#' Overlay the frequency polygons of three columns
#'
#' Draws one 100-bin frequency polygon per column on a shared x axis so the
#' distribution of a combined variable can be compared with its two inputs.
#'
#' @param var1,var2 Unquoted names of the two original columns, drawn in
#'   blue and red respectively.
#' @param var3 Unquoted name of the new combined column, drawn in purple.
#' @param data Data frame holding the three columns. Defaults to the global
#'   `music` table, so existing three-argument calls keep working unchanged.
#' @return A ggplot object.
dist_func <- function(var1, var2, var3, data = music) {
  ggplot(data) +
    geom_freqpoly(aes(x = {{ var1 }}), color = "blue", bins = 100) +
    geom_freqpoly(aes(x = {{ var2 }}), color = "red", bins = 100) +
    geom_freqpoly(aes(x = {{ var3 }}), color = "purple", bins = 100) +
    labs(
      x = "Feature", y = "Frequency",
      title = "How the original variable distributions differ from the new one.",
      # The purple line is var3, i.e. the new combined variable; the two
      # originals are the blue and red lines.
      subtitle = "Original variables are blue and red; the new one is purple"
    ) +
    theme_bw() +
    theme(plot.title = element_text(hjust = 0.5))
}
# Compare each composite feature (var3) with the two columns it was
# averaged from (var1, var2).
dist_func(var1 = acousticness, var2 = energy, var3 = ac_en)
dist_func(var1 = acousticness, var2 = loudness, var3 = ac_lo)
dist_func(var1 = energy, var2 = loudness, var3 = en_lo)