# Load the raw music dataset and standardise its column names with
# clean_names() (the unnamed first column ends up as `x1`).
music <- clean_names(
  read_csv("~/EDA-project/raw-data/tcc_ceds_music.csv")
)
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   .default = col_double(),
##   artist_name = col_character(),
##   track_name = col_character(),
##   genre = col_character(),
##   lyrics = col_character(),
##   topic = col_character()
## )
## See spec(...) for full column specifications.
unique(music$genre)
## [1] "pop"     "country" "blues"   "jazz"    "reggae"  "rock"    "hip hop"
# Drop the unnamed index column (`x1`) and add a running row id (1, 2, ..., n).
# The id is coerced to double so the column keeps the same type that the
# previous cumsum-of-ones construction produced.
music <- music %>%
  select(-x1) %>%
  mutate(id = as.numeric(row_number()))
head(music)
## Warning: `...` is not empty.
## 
## We detected these problematic arguments:
## * `needs_dots`
## 
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 6 x 31
##   artist_name track_name release_date genre lyrics   len  dating violence
##   <chr>       <chr>             <dbl> <chr> <chr>  <dbl>   <dbl>    <dbl>
## 1 mukesh      mohabbat …         1950 pop   hold …    95 5.98e-4  0.0637 
## 2 frankie la… i believe          1950 pop   belie…    51 3.55e-2  0.0968 
## 3 johnnie ray cry                1950 pop   sweet…    24 2.77e-3  0.00277
## 4 pérez prado patricia           1950 pop   kiss …    54 4.82e-2  0.00155
## 5 giorgos pa… apopse ei…         1950 pop   till …    48 1.35e-3  0.00135
## 6 perry como  round and…         1950 pop   convo…    98 1.05e-3  0.421  
## # … with 23 more variables: world_life <dbl>, night_time <dbl>,
## #   shake_the_audience <dbl>, family_gospel <dbl>, romantic <dbl>,
## #   communication <dbl>, obscene <dbl>, music <dbl>, movement_places <dbl>,
## #   light_visual_perceptions <dbl>, family_spiritual <dbl>, like_girls <dbl>,
## #   sadness <dbl>, feelings <dbl>, danceability <dbl>, loudness <dbl>,
## #   acousticness <dbl>, instrumentalness <dbl>, valence <dbl>, energy <dbl>,
## #   topic <chr>, age <dbl>, id <dbl>
# Age of each track in years, measured from 2019.
# NOTE: the data already has an `age` column before this step (it appears in
# the preceding head() output), so this assignment overwrites it.
music <- mutate(music, age = 2019 - release_date)
head(music)
## Warning: `...` is not empty.
## 
## We detected these problematic arguments:
## * `needs_dots`
## 
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 6 x 31
##   artist_name track_name release_date genre lyrics   len  dating violence
##   <chr>       <chr>             <dbl> <chr> <chr>  <dbl>   <dbl>    <dbl>
## 1 mukesh      mohabbat …         1950 pop   hold …    95 5.98e-4  0.0637 
## 2 frankie la… i believe          1950 pop   belie…    51 3.55e-2  0.0968 
## 3 johnnie ray cry                1950 pop   sweet…    24 2.77e-3  0.00277
## 4 pérez prado patricia           1950 pop   kiss …    54 4.82e-2  0.00155
## 5 giorgos pa… apopse ei…         1950 pop   till …    48 1.35e-3  0.00135
## 6 perry como  round and…         1950 pop   convo…    98 1.05e-3  0.421  
## # … with 23 more variables: world_life <dbl>, night_time <dbl>,
## #   shake_the_audience <dbl>, family_gospel <dbl>, romantic <dbl>,
## #   communication <dbl>, obscene <dbl>, music <dbl>, movement_places <dbl>,
## #   light_visual_perceptions <dbl>, family_spiritual <dbl>, like_girls <dbl>,
## #   sadness <dbl>, feelings <dbl>, danceability <dbl>, loudness <dbl>,
## #   acousticness <dbl>, instrumentalness <dbl>, valence <dbl>, energy <dbl>,
## #   topic <chr>, age <dbl>, id <dbl>
# Number of tracks per genre, largest genre first.
genre_count <- count(music, genre, sort = TRUE)

# Bar chart of the genre counts, bars ordered from most to fewest tracks.
ggplot(genre_count, aes(x = reorder(genre, -n), y = n)) +
  geom_col(fill = "skyblue", color = "blue") +
  labs(x = "Genre", y = "Count")

colnames(music)
##  [1] "artist_name"              "track_name"              
##  [3] "release_date"             "genre"                   
##  [5] "lyrics"                   "len"                     
##  [7] "dating"                   "violence"                
##  [9] "world_life"               "night_time"              
## [11] "shake_the_audience"       "family_gospel"           
## [13] "romantic"                 "communication"           
## [15] "obscene"                  "music"                   
## [17] "movement_places"          "light_visual_perceptions"
## [19] "family_spiritual"         "like_girls"              
## [21] "sadness"                  "feelings"                
## [23] "danceability"             "loudness"                
## [25] "acousticness"             "instrumentalness"        
## [27] "valence"                  "energy"                  
## [29] "topic"                    "age"                     
## [31] "id"
# Keep only the feature columns, from `dating` through `energy`.
new_t <- select(music, dating:energy)
colnames(new_t)
##  [1] "dating"                   "violence"                
##  [3] "world_life"               "night_time"              
##  [5] "shake_the_audience"       "family_gospel"           
##  [7] "romantic"                 "communication"           
##  [9] "obscene"                  "music"                   
## [11] "movement_places"          "light_visual_perceptions"
## [13] "family_spiritual"         "like_girls"              
## [15] "sadness"                  "feelings"                
## [17] "danceability"             "loudness"                
## [19] "acousticness"             "instrumentalness"        
## [21] "valence"                  "energy"
# Make sure valence and energy are numeric before computing correlations.
new_t$valence <- as.numeric(new_t$valence)
new_t$energy <- as.numeric(new_t$energy)

# Drop every row that has a missing value in any selected column.
new_t <- na.omit(new_t)

# Correlation plot (not a scatterplot matrix) of the pairwise correlations
# between all selected columns, i.e. everything from `dating` to `energy`.
corrplot(cor(new_t))

# Pearson correlation between acousticness and energy over all tracks.
cor(new_t$acousticness, new_t$energy, use = "everything", method = "pearson")
## [1] -0.7200447

Checking the three strongest correlations and how they differ across genres

cor(new_t$acousticness, new_t$energy, use="everything", method="pearson")
## [1] -0.7200447
cor(new_t$loudness, new_t$energy, use="everything", method="pearson")
## [1] 0.7731351
cor(new_t$acousticness, new_t$loudness, use="everything", method="pearson")
## [1] -0.5397384
# Pearson correlation between two columns, computed separately per genre.
#
# Arguments:
#   vec   Character vector of genre values; one result row per element.
#   x, y  Names (strings) of the two numeric columns to correlate.
#   data  Data frame with a `genre` column plus the columns named by `x`
#         and `y`. Defaults to the global `music` table, so existing
#         three-argument calls keep working.
#
# Returns a data.frame with columns `genre`, `corr` and `abs_corr`
# (absolute value of `corr`), in the same order as `vec`. An empty `vec`
# yields a zero-row data frame.
#
# Missing values are not removed (use = "everything"), so a genre whose
# columns contain an NA gets an NA correlation.
correlation <- function(vec, x, y, data = music) {
  val <- vapply(
    vec,
    function(g) {
      # which() drops NA comparisons, so rows with a missing genre are skipped.
      rows <- which(data[["genre"]] == g)
      # [[ extracts plain vectors, so cor() returns a scalar, not a 1x1 matrix.
      cor(
        data[[x]][rows], data[[y]][rows],
        use = "everything", method = "pearson"
      )
    },
    numeric(1),
    USE.NAMES = FALSE
  )
  data.frame(genre = vec, corr = val, abs_corr = abs(val))
}
# Per-genre correlation between acousticness and energy, highest value first.
gen_vec <- unique(music$genre)
new_df <- correlation(gen_vec, "acousticness", "energy")
arrange(new_df, desc(corr))
##     genre       corr  abs_corr
## 1 hip hop -0.1607465 0.1607465
## 2  reggae -0.2292713 0.2292713
## 3 country -0.6990483 0.6990483
## 4     pop -0.7052780 0.7052780
## 5   blues -0.7073538 0.7073538
## 6    rock -0.7177903 0.7177903
## 7    jazz -0.7946293 0.7946293
# Bars show the absolute correlation per genre, largest magnitude first.
ggplot(new_df, aes(x = reorder(genre, -abs_corr), y = abs_corr)) +
  geom_col(color = "rosybrown", fill = "red", alpha = 0.7) +
  labs(x = "Genre", y = "Acousticness and Energy correlation")

# Per-genre correlation between loudness and energy, highest value first.
gen_vec <- unique(music$genre)
new_df <- correlation(gen_vec, "loudness", "energy")
arrange(new_df, desc(corr))
##     genre      corr  abs_corr
## 1    jazz 0.7890322 0.7890322
## 2    rock 0.7706344 0.7706344
## 3 country 0.7596707 0.7596707
## 4     pop 0.7389424 0.7389424
## 5  reggae 0.7347203 0.7347203
## 6   blues 0.7338127 0.7338127
## 7 hip hop 0.6058536 0.6058536
# Bars show the absolute correlation per genre, largest magnitude first.
ggplot(new_df, aes(x = reorder(genre, -abs_corr), y = abs_corr)) +
  geom_col(color = "rosybrown", fill = "red", alpha = 0.7) +
  labs(x = "Genre", y = "Loudness and Energy correlation")

# Per-genre correlation between acousticness and loudness, highest value first.
gen_vec <- unique(music$genre)
new_df <- correlation(gen_vec, "acousticness", "loudness")
arrange(new_df, desc(corr))
##     genre        corr   abs_corr
## 1  reggae -0.09367978 0.09367978
## 2 hip hop -0.11420242 0.11420242
## 3   blues -0.46248664 0.46248664
## 4 country -0.47786505 0.47786505
## 5     pop -0.51063782 0.51063782
## 6    rock -0.57171206 0.57171206
## 7    jazz -0.60469124 0.60469124
# Bars show the absolute correlation per genre, largest magnitude first.
ggplot(new_df, aes(x = reorder(genre, -abs_corr), y = abs_corr)) +
  geom_col(color = "rosybrown", fill = "red", alpha = 0.7) +
  labs(x = "Genre", y = "Loudness and Acousticness correlation")

Creating three new variables that combine the highly correlated ones

# Add one column per feature pair, each holding the row-wise mean of the pair.
music <- mutate(
  music,
  ac_en = (acousticness + energy) / 2,
  ac_lo = (acousticness + loudness) / 2,
  en_lo = (energy + loudness) / 2
)
head(music)
## Warning: `...` is not empty.
## 
## We detected these problematic arguments:
## * `needs_dots`
## 
## These dots only exist to allow future extensions and should be empty.
## Did you misspecify an argument?
## # A tibble: 6 x 34
##   artist_name track_name release_date genre lyrics   len  dating violence
##   <chr>       <chr>             <dbl> <chr> <chr>  <dbl>   <dbl>    <dbl>
## 1 mukesh      mohabbat …         1950 pop   hold …    95 5.98e-4  0.0637 
## 2 frankie la… i believe          1950 pop   belie…    51 3.55e-2  0.0968 
## 3 johnnie ray cry                1950 pop   sweet…    24 2.77e-3  0.00277
## 4 pérez prado patricia           1950 pop   kiss …    54 4.82e-2  0.00155
## 5 giorgos pa… apopse ei…         1950 pop   till …    48 1.35e-3  0.00135
## 6 perry como  round and…         1950 pop   convo…    98 1.05e-3  0.421  
## # … with 26 more variables: world_life <dbl>, night_time <dbl>,
## #   shake_the_audience <dbl>, family_gospel <dbl>, romantic <dbl>,
## #   communication <dbl>, obscene <dbl>, music <dbl>, movement_places <dbl>,
## #   light_visual_perceptions <dbl>, family_spiritual <dbl>, like_girls <dbl>,
## #   sadness <dbl>, feelings <dbl>, danceability <dbl>, loudness <dbl>,
## #   acousticness <dbl>, instrumentalness <dbl>, valence <dbl>, energy <dbl>,
## #   topic <chr>, age <dbl>, id <dbl>, ac_en <dbl>, ac_lo <dbl>, en_lo <dbl>
# Overlay the frequency polygons of three columns on one plot.
#
# Arguments:
#   var1, var2  Unquoted column names drawn in blue and red. The callers in
#               this file pass the two original features here.
#   var3        Unquoted column name drawn in purple. The callers in this
#               file pass the new combined feature here.
#   data        Data frame holding the columns. Defaults to the global
#               `music` table, so existing three-argument calls keep working.
#
# Returns the ggplot object (printed when called at top level).
dist_func <- function(var1, var2, var3, data = music) {
  ggplot(data) +
    geom_freqpoly(aes(x = {{ var1 }}), color = "blue", bins = 100) +
    geom_freqpoly(aes(x = {{ var2 }}), color = "red", bins = 100) +
    geom_freqpoly(aes(x = {{ var3 }}), color = "purple", bins = 100) +
    labs(
      x = "Feature", y = "Frequency",
      title = "How the original variable distributions differ from the new one.",
      # The purple line is var3, i.e. the new variable, not an original one.
      subtitle = "Original variables are blue and red; new variable is purple"
    ) +
    theme_bw() +
    # Centre the title only.
    theme(plot.title = element_text(hjust = 0.5))
}
# Compare each averaged feature (var3) with the two features it was built from.
dist_func(var1 = acousticness, var2 = energy, var3 = ac_en)

dist_func(var1 = acousticness, var2 = loudness, var3 = ac_lo)

dist_func(var1 = energy, var2 = loudness, var3 = en_lo)