library(dplyr)
## Warning: package 'dplyr' was built under R version 4.3.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.3.3
# Load the Spotify tracks data set and inspect its column types.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file exists.
spotify_csv_path <- "C:/Users/priya/Downloads/spotify_songs.csv"
spotify_songs <- read.csv(file = spotify_csv_path)
str(object = spotify_songs)
## 'data.frame': 32833 obs. of 23 variables:
## $ track_id : chr "6f807x0ima9a1j3VPbc7VN" "0r7CVbZTWZgbTCYdfa2P31" "1z1Hg7Vb0AhHDiEmnDE79l" "75FpbthrwQmzHlBJLuGdC7" ...
## $ track_name : chr "I Don't Care (with Justin Bieber) - Loud Luxury Remix" "Memories - Dillon Francis Remix" "All the Time - Don Diablo Remix" "Call You Mine - Keanu Silva Remix" ...
## $ track_artist : chr "Ed Sheeran" "Maroon 5" "Zara Larsson" "The Chainsmokers" ...
## $ track_popularity : int 66 67 70 60 69 67 62 69 68 67 ...
## $ track_album_id : chr "2oCs0DGTsRO98Gh5ZSl2Cx" "63rPSO264uRjW1X5E6cWv6" "1HoSmj2eLcsrR0vE9gThr4" "1nqYsOef1yKKuGOVchbsk6" ...
## $ track_album_name : chr "I Don't Care (with Justin Bieber) [Loud Luxury Remix]" "Memories (Dillon Francis Remix)" "All the Time (Don Diablo Remix)" "Call You Mine - The Remixes" ...
## $ track_album_release_date: chr "2019-06-14" "2019-12-13" "2019-07-05" "2019-07-19" ...
## $ playlist_name : chr "Pop Remix" "Pop Remix" "Pop Remix" "Pop Remix" ...
## $ playlist_id : chr "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" "37i9dQZF1DXcZDD7cfEKhW" ...
## $ playlist_genre : chr "pop" "pop" "pop" "pop" ...
## $ playlist_subgenre : chr "dance pop" "dance pop" "dance pop" "dance pop" ...
## $ danceability : num 0.748 0.726 0.675 0.718 0.65 0.675 0.449 0.542 0.594 0.642 ...
## $ energy : num 0.916 0.815 0.931 0.93 0.833 0.919 0.856 0.903 0.935 0.818 ...
## $ key : int 6 11 1 7 1 8 5 4 8 2 ...
## $ loudness : num -2.63 -4.97 -3.43 -3.78 -4.67 ...
## $ mode : int 1 1 0 1 1 1 0 0 1 1 ...
## $ speechiness : num 0.0583 0.0373 0.0742 0.102 0.0359 0.127 0.0623 0.0434 0.0565 0.032 ...
## $ acousticness : num 0.102 0.0724 0.0794 0.0287 0.0803 0.0799 0.187 0.0335 0.0249 0.0567 ...
## $ instrumentalness : num 0.00 4.21e-03 2.33e-05 9.43e-06 0.00 0.00 0.00 4.83e-06 3.97e-06 0.00 ...
## $ liveness : num 0.0653 0.357 0.11 0.204 0.0833 0.143 0.176 0.111 0.637 0.0919 ...
## $ valence : num 0.518 0.693 0.613 0.277 0.725 0.585 0.152 0.367 0.366 0.59 ...
## $ tempo : num 122 100 124 122 124 ...
## $ duration_ms : int 194754 162600 176616 169093 189052 163049 187675 207619 193187 253040 ...
# Draw five subsamples, each half the size of the full data set, sampling
# rows with replacement.
n_total <- nrow(spotify_songs)
sample_size <- round(n_total * 0.5)
set.seed(123) # Setting a seed for reproducibility

# Return `size` rows of `data`, drawn uniformly at random with replacement.
# sample.int() is used instead of sample(1:n, ...) because 1:n evaluates to
# c(1, 0) when n is 0; for n >= 1 both forms consume the RNG identically.
draw_subsample <- function(data, size) {
  data[sample.int(nrow(data), size, replace = TRUE), ]
}

# The draws are made in this order so the random stream (and therefore each
# subsample) matches the seed set above.
df_1 <- draw_subsample(spotify_songs, sample_size)
df_2 <- draw_subsample(spotify_songs, sample_size)
df_3 <- draw_subsample(spotify_songs, sample_size)
df_4 <- draw_subsample(spotify_songs, sample_size)
df_5 <- draw_subsample(spotify_songs, sample_size)

# Inspect the structure of the first subsample.
str(df_1)
## 'data.frame': 16416 obs. of 23 variables:
## $ track_id : chr "6Z8R6UsFuGXGtiIxiD8ISb" "6pKjxtBn45NCbMdT01Le86" "4RgDgaohkG08f630ZT9QKc" "1kSfJ6FTfXou4dAw51xbc1" ...
## $ track_name : chr "Safe And Sound" "Watch Me - Jay Anthony Remix" "Body - Dzeko Remix" "Otra Era" ...
## $ track_artist : chr "Capital Cities" "Duchess" "Loud Luxury" "Javiera Mena" ...
## $ track_popularity : int 77 39 67 1 0 35 25 43 29 32 ...
## $ track_album_id : chr "3WrufJir7I61NkvkDwxero" "56nmMKVMMwK9Vu67o8M6oJ" "27BS0fcfFoF6hNMdJZPMRR" "3BEtvyK449N9l2NiXsjSmq" ...
## $ track_album_name : chr "In A Tidal Wave Of Mystery (Deluxe Edition)" "Watch Me (Jay Anthony Remix)" "Body (Dzeko Remix)" "Otra Era" ...
## $ track_album_release_date: chr "2013" "2019-11-21" "2018-08-27" "2014-10-28" ...
## $ playlist_name : chr "ELECTROPOP🐹" "Waves Pop and EDM" "Pop EDM Remixes" "This Is: Javiera Mena" ...
## $ playlist_id : chr "44p8nNLe4fGfUeArS3MaIX" "64k01l4j6QtnZ8jMaI84AA" "4aUEH3uhbofktrFkXOOaKj" "37i9dQZF1DWSOvcBNdfJ87" ...
## $ playlist_genre : chr "pop" "edm" "edm" "pop" ...
## $ playlist_subgenre : chr "electropop" "pop edm" "pop edm" "electropop" ...
## $ danceability : num 0.655 0.57 0.653 0.534 0.932 0.53 0.731 0.447 0.372 0.73 ...
## $ energy : num 0.819 0.945 0.946 0.888 0.499 0.727 0.797 0.7 0.924 0.897 ...
## $ key : int 0 10 5 2 10 1 5 2 6 6 ...
## $ loudness : num -4.85 -7.18 -3.76 -6.33 -5.96 ...
## $ mode : int 1 0 0 1 0 0 1 1 0 1 ...
## $ speechiness : num 0.0316 0.0491 0.0556 0.0274 0.172 0.049 0.0309 0.137 0.0753 0.041 ...
## $ acousticness : num 0.000176 0.199 0.00812 0.0196 0.0476 0.153 0.0771 0.271 0.000716 0.000225 ...
## $ instrumentalness : num 3.74e-03 2.45e-01 2.07e-04 1.83e-01 0.00 4.64e-02 2.48e-02 0.00 4.00e-01 3.20e-06 ...
## $ liveness : num 0.104 0.256 0.126 0.151 0.121 0.0818 0.334 0.0521 0.132 0.0487 ...
## $ valence : num 0.766 0.743 0.653 0.624 0.697 0.039 0.933 0.476 0.432 0.679 ...
## $ tempo : num 118 128 126 102 103 ...
## $ duration_ms : int 192790 213105 165714 244979 239293 307870 348624 235099 434624 294467 ...
# Keep the five subsamples together in one list, in draw order.
samples_list <- list(df_1, df_2, df_3, df_4, df_5)
# Preview the first rows of the first subsample.
head(df_1)
## track_id track_name track_artist
## 2986 6Z8R6UsFuGXGtiIxiD8ISb Safe And Sound Capital Cities
## 29925 6pKjxtBn45NCbMdT01Le86 Watch Me - Jay Anthony Remix Duchess
## 29710 4RgDgaohkG08f630ZT9QKc Body - Dzeko Remix Loud Luxury
## 2757 1kSfJ6FTfXou4dAw51xbc1 Otra Era Javiera Mena
## 9642 5fsjS6L83RSJBqRJKL0BTY Got Your Money Ol' Dirty Bastard
## 31313 1aosXPBgRredZfKkuaIg6V Hold On - Ben Böhmer Remix Lane 8
## track_popularity track_album_id
## 2986 77 3WrufJir7I61NkvkDwxero
## 29925 39 56nmMKVMMwK9Vu67o8M6oJ
## 29710 67 27BS0fcfFoF6hNMdJZPMRR
## 2757 1 3BEtvyK449N9l2NiXsjSmq
## 9642 0 2pPNC9cSF3mhMZcT7t3iw7
## 31313 35 65YNpWXBT8jeHQL6fWAkVP
## track_album_name track_album_release_date
## 2986 In A Tidal Wave Of Mystery (Deluxe Edition) 2013
## 29925 Watch Me (Jay Anthony Remix) 2019-11-21
## 29710 Body (Dzeko Remix) 2018-08-27
## 2757 Otra Era 2014-10-28
## 9642 The Hits 2008
## 31313 Hold On (Ben Böhmer Remix) 2018-09-20
## playlist_name playlist_id
## 2986 ELECTROPOP🐹 44p8nNLe4fGfUeArS3MaIX
## 29925 Waves Pop and EDM 64k01l4j6QtnZ8jMaI84AA
## 29710 Pop EDM Remixes 4aUEH3uhbofktrFkXOOaKj
## 2757 This Is: Javiera Mena 37i9dQZF1DWSOvcBNdfJ87
## 9642 90s Gangsta Rap / Top Hip-hop Classics 0C2zyPdlkbWHrwVg9gqNdU
## 31313 Deep Electronic Music 2020 & Progressive House 5CMvAWTlDPdZnkleiTHyyo
## playlist_genre playlist_subgenre danceability energy key loudness
## 2986 pop electropop 0.655 0.819 0 -4.852
## 29925 edm pop edm 0.570 0.945 10 -7.182
## 29710 edm pop edm 0.653 0.946 5 -3.757
## 2757 pop electropop 0.534 0.888 2 -6.327
## 9642 rap gangster rap 0.932 0.499 10 -5.957
## 31313 edm progressive electro house 0.530 0.727 1 -9.957
## mode speechiness acousticness instrumentalness liveness valence tempo
## 2986 1 0.0316 0.000176 0.003740 0.1040 0.766 117.956
## 29925 0 0.0491 0.199000 0.245000 0.2560 0.743 128.022
## 29710 0 0.0556 0.008120 0.000207 0.1260 0.653 125.966
## 2757 1 0.0274 0.019600 0.183000 0.1510 0.624 101.989
## 9642 0 0.1720 0.047600 0.000000 0.1210 0.697 103.049
## 31313 0 0.0490 0.153000 0.046400 0.0818 0.039 123.969
## duration_ms
## 2986 192790
## 29925 213105
## 29710 165714
## 2757 244979
## 9642 239293
## 31313 307870
# Preview the first rows of the second subsample.
head(df_2)
## track_id track_name track_artist
## 28874 5MPyPukGFvGBieVn34HdiY Hands Up J-ZAID
## 23248 4QIyEq7TcTdAcxjI2AFlAK FEFE 6ix9ine
## 6918 0TBuySbBzGo4Jgc410MvJp Who Want It David Banner
## 23892 2TYlNJNjw45KdmKr5Dh2mh Low Tide XY&O
## 27956 7B8YM25inFu31c5nTiLFgD In Arms - A-Trak Remix Ferreck Dawn
## 17573 6GCNUmk7L7OWtpvSk0fWOg Mirala Miralo Alejandra Guzman
## track_popularity track_album_id track_album_name
## 28874 24 4cbVJ5mScRm9bfmZMua3ix Hands Up
## 23248 15 03KcW1ZhaSnj8pIk1LUNQs DUMMY BOY
## 6918 30 3Rdf6EFR90LuwLXkBWxLvm The God Box
## 23892 41 6TztcF2ft5DVJD4XIZ1al3 Shimmer + Shade
## 27956 43 2BGXNnzk7U6F9IMdKEfjjx In Arms (A-Trak Remix)
## 17573 61 5Zb5uLdHzmnIbOBy4zvVwW Libre
## track_album_release_date playlist_name playlist_id
## 28874 2019-12-20 Bounce United 08QTrfsYYouffgnPjmllAQ
## 23248 2018-11-27 Hip pop 1Sc7bobknESH7SXQcnmoX5
## 6918 2017-05-19 Southern Hip Hop 4lcyWQDOzPfcbZrcBI3FOW
## 23892 2016-04-22 Today's Hits (Clean) 7ENISpOJhocpMJVcGb0qcT
## 27956 2018-11-23 💊ELECTRO-HOUSE-TECH💊 0AFYmoSuoMQiGGjzvBwr6u
## 17573 1993 Latin Pop Classics 37i9dQZF1DX6ThddIjWuGT
## playlist_genre playlist_subgenre danceability energy key loudness mode
## 28874 edm big room 0.638 0.547 1 -16.105 0
## 23248 r&b hip pop 0.931 0.387 1 -9.127 1
## 6918 rap southern hip hop 0.456 0.929 2 -4.576 1
## 23892 r&b hip pop 0.559 0.799 9 -5.807 0
## 27956 edm electro house 0.855 0.782 7 -4.195 1
## 17573 latin latin pop 0.669 0.889 5 -5.472 1
## speechiness acousticness instrumentalness liveness valence tempo
## 28874 0.0433 0.000412 2.85e-01 0.0854 0.0393 127.996
## 23248 0.4120 0.088000 0.00e+00 0.1360 0.3760 125.978
## 6918 0.4530 0.041500 0.00e+00 0.1210 0.3930 168.114
## 23892 0.0455 0.000397 9.58e-03 0.0609 0.3670 116.975
## 27956 0.1400 0.007820 5.97e-01 0.1250 0.4990 122.999
## 17573 0.0318 0.004600 1.49e-05 0.0910 0.5800 122.102
## duration_ms
## 28874 273809
## 23248 179405
## 6918 236373
## 23892 202532
## 27956 318241
## 17573 240933
# Preview the first rows of the third subsample.
head(df_3)
## track_id track_name track_artist
## 11182 5yY9lUy8nbvjM1Uyo1Uqoc Life Is Good (feat. Drake) Future
## 25941 3uSSjnDMmoyERaAK9KvpJR Too Deep dvsn
## 11049 4uTwGlgAfIKQTnJn2l8eHO Uno Del Pato Bimoud
## 12522 1ZhrREyOOeFV6TxDOyiPwu Take The Money And Run Steve Miller Band
## 1724 3e9HZxeyfWwjeyPAMmWSSQ thank u, next Ariana Grande
## 13438 0rkcFGA6XEVd2XV2JI6msk When Love Comes To Town U2
## track_popularity track_album_id track_album_name
## 11182 93 5uCEoLCj3ZZZ1EtzQdQWVl Life Is Good (feat. Drake)
## 25941 61 0jLynoED1FbV2Ky7vU6Pjc SEPT 5TH
## 11049 31 5Gm0KeKKP9uErMA0EELFp9 Uno Del Pato
## 12522 67 0fjJOLqG3v7vXRYhz2wxPC Fly Like An Eagle
## 1724 87 2fYhqwDWXjbpjaIJPEfKFw thank u, next
## 13438 3 5DV76VYtjhsEh0v6KwgvY7 Rattle And Hum
## track_album_release_date playlist_name
## 11182 2020-01-10 Trap Americana
## 25941 2016-03-27 Neo Soul 2019
## 11049 2018-07-07 Trap Argentino - Trap Argentina
## 12522 1976-01-01 Classic Rock Drive
## 1724 2019-02-08 Post pop teen
## 13438 1988-10-10 Blues Rock
## playlist_id playlist_genre playlist_subgenre danceability
## 11182 7tkgK1tm9hYkWp7EFyOcAr rap trap 0.676
## 25941 44d7ppo4cggZJmzH2WOhAc r&b neo soul 0.616
## 11049 6ltss0ThreZ3uIMn5mr4Tm rap trap 0.914
## 12522 37i9dQZF1DXdOEFt9ZX0dh rock classic rock 0.564
## 1724 222nc9tKxKhfZ2GBrOpwH3 pop post-teen pop 0.717
## 13438 56dbowk1V5ycS5jW7DSvi5 rock classic rock 0.650
## energy key loudness mode speechiness acousticness instrumentalness
## 11182 0.609 2 -5.831 0 0.4810 0.0706 0.00e+00
## 25941 0.243 9 -13.975 1 0.1030 0.2840 1.49e-01
## 11049 0.463 0 -6.109 0 0.0492 0.7160 0.00e+00
## 12522 0.645 0 -12.923 1 0.0484 0.0552 2.84e-06
## 1724 0.653 1 -5.634 1 0.0658 0.2290 0.00e+00
## 13438 0.665 9 -11.259 1 0.0372 0.0526 4.62e-06
## liveness valence tempo duration_ms
## 11182 0.152 0.508 142.037 237735
## 25941 0.264 0.560 183.948 200223
## 11049 0.079 0.824 105.020 228571
## 12522 0.151 0.836 99.302 170173
## 1724 0.101 0.412 106.966 207320
## 13438 0.130 0.593 115.510 254267
# Preview the first rows of the fourth subsample.
head(df_4)
## track_id track_name track_artist
## 21573 3jjujdWJ72nww5eGnfs2E7 Adore You Harry Styles
## 21251 3SPDQfj2UfWq6A2NllZnzn What Makes You Beautiful One Direction
## 27949 3DmTvxgaOYdKS9dQ1z1mSd Cuando Mueves Dateless
## 12203 1IkCG9jkzCrl3TadPTh4dU Make It Aerosmith
## 10528 0BDmyy2xonhhvcl2QDg2Xh Cobra Formal One
## 29400 4FxaRW9mYHzOmApbAPZXoH Runaway (U & I) - Quintino Remix Galantis
## track_popularity track_album_id track_album_name
## 21573 88 7xV2TzoaVc0ycW7fwBwAml Fine Line
## 21251 62 53DrBEDi1AvhWOtCdljUiu Up All Night
## 27949 45 5vP7LROhcY3cXMViIG8sps Cuando Mueves EP
## 12203 45 19lEZSnCCbVEkKchoPQWDZ Aerosmith
## 10528 36 1se7OjX41LBkLRQT73mR1t Cobra
## 29400 27 1ExWK9MIpYQiUlPJlYIz0O Runaway (U & I) [Remixes]
## track_album_release_date playlist_name
## 21573 2019-12-13 Most Popular 2020 TOP 50
## 21251 2012-05-25 School Dance 2019 (Squeaky Clean)
## 27949 2017-09-25 💊ELECTRO-HOUSE-TECH💊
## 12203 1973-01-05 Nikki Sixx's Top Pixx
## 10528 2018-12-22 Trap Mojito
## 29400 2015-03-16 Epic Bass Drops
## playlist_id playlist_genre playlist_subgenre danceability
## 21573 1fqkbjEACMlekdddm5aobE r&b urban contemporary 0.676
## 21251 0SqaMfNsngZCpPw0UuelA7 latin latin hip hop 0.729
## 27949 0AFYmoSuoMQiGGjzvBwr6u edm electro house 0.806
## 12203 5d1arTPDEr76KMg9geDinZ rock album rock 0.553
## 10528 37i9dQZF1DX1OIMC8iDi74 rap trap 0.549
## 29400 49EXQVXh5k1t8SOhmUeyU7 edm big room 0.597
## energy key loudness mode speechiness acousticness instrumentalness
## 21573 0.771 8 -3.675 1 0.0483 0.023700 0.000007
## 21251 0.771 4 -2.451 1 0.0725 0.007610 0.000000
## 27949 0.640 8 -8.055 0 0.1070 0.001440 0.851000
## 12203 0.685 0 -10.981 1 0.0275 0.015100 0.149000
## 10528 0.949 2 -4.074 1 0.0595 0.001220 0.004550
## 29400 0.858 10 -5.590 0 0.0742 0.000579 0.021500
## liveness valence tempo duration_ms
## 21573 0.1020 0.569 99.048 207133
## 21251 0.0870 0.873 125.011 198053
## 27949 0.2950 0.184 123.004 255610
## 12203 0.1900 0.570 137.898 218960
## 10528 0.1040 0.169 160.022 255000
## 29400 0.0274 0.383 128.009 285000
# Preview the first rows of the fifth subsample.
head(df_5)
## track_id track_name track_artist
## 3177 5D59kJUcYmJPLDcx0Co2ol Designs for You Phantoms
## 21524 5viw7xR22mmAU8UjtX9duJ Feel Alive - Radio Mix Jenil
## 19982 0ISvcKRAPbvg0sq33swmNU Express Your Feelings Michael Anthony
## 12208 1QEEqeFIZktqIpPI4jSVSF More Than a Feeling Boston
## 29956 78WUUxoUBoLKn2hgfPupWQ DAYLIGHT TELYKast
## 1165 5KeyVNymqfqac1wLDseK8v Fineshrine Purity Ring
## track_popularity track_album_id track_album_name
## 3177 48 0xIhw2XT55ZWZvXT2ZKQ3w Disconnect
## 21524 34 23X0mzF6faAVju0ef2YP2E Feel Alive
## 19982 12 0VxW02u2pQ87APZVWFdmsD Tazmania Freestyle Vol. 2
## 12208 78 2QLp07RO6anZHmtcKTEvSC Boston
## 29956 53 3WU47tzEBef9T0SIyn6S7l DAYLIGHT
## 1165 60 7ppypgQppMf3mkRbZxYIFM Shrines
## track_album_release_date
## 3177 2019-07-19
## 21524 2019-08-30
## 19982 2007-12-21
## 12208 1976
## 29956 2019-12-09
## 1165 2012-07-23
## playlist_name
## 3177 Electropop And Play
## 21524 URBAN NATION
## 19982 Latin Hip Hop/Freestyle
## 12208 The Sound of Album Rock
## 29956 EDM 2020 House & Dance
## 1165 Ultimate Indie Presents... Best Indie Tracks of the 2010s
## playlist_id playlist_genre playlist_subgenre danceability
## 3177 7p30DzTAgW6OhspSXHTI88 pop electropop 0.809
## 21524 4EKXjB5zlv2DpTS84h407H r&b urban contemporary 0.536
## 19982 2MYEUjX0YAI9dxrBDzoCK7 latin latin hip hop 0.631
## 12208 3yj9YnQGTdnFuKbDyXGDi6 rock album rock 0.377
## 29956 25ButZrVb1Zj1MJioMs09D edm pop edm 0.732
## 1165 37i9dQZF1DWTHM4kX49UKs pop dance pop 0.609
## energy key loudness mode speechiness acousticness instrumentalness
## 3177 0.904 4 -6.460 0 0.0747 0.052300 5.47e-01
## 21524 0.876 9 -3.974 0 0.0556 0.251000 1.96e-04
## 19982 0.887 11 -11.434 0 0.0511 0.003570 2.61e-03
## 12208 0.682 7 -8.039 1 0.0299 0.000894 2.17e-03
## 29956 0.647 4 -5.571 0 0.0300 0.007050 9.95e-03
## 1165 0.798 5 -4.448 0 0.0404 0.018000 8.68e-06
## liveness valence tempo duration_ms
## 3177 0.0802 0.201 125.002 229360
## 21524 0.0689 0.329 127.983 232510
## 19982 0.0684 0.776 124.043 262373
## 12208 0.0504 0.288 108.736 285133
## 29956 0.3350 0.425 125.014 154828
## 1165 0.4340 0.401 130.034 209760
# Summarise one subsample: mean danceability (ignoring NA) and number of
# rows for each playlist genre. Returns one row per genre.
summarise_danceability <- function(data) {
  data %>%
    group_by(playlist_genre) %>%
    summarise(
      mean_danceability = mean(danceability, na.rm = TRUE),
      count = n()
    )
}

# Per-subsample genre summaries.
df_1_summary <- summarise_danceability(df_1)
df_2_summary <- summarise_danceability(df_2)
df_3_summary <- summarise_danceability(df_3)
df_4_summary <- summarise_danceability(df_4)
df_5_summary <- summarise_danceability(df_5)

# Stack the summaries into one long table. The `sample` label is added with
# mutate() so it stays the last column.
combined_summary <- bind_rows(
  df_1_summary %>% mutate(sample = "Sample 1"),
  df_2_summary %>% mutate(sample = "Sample 2"),
  df_3_summary %>% mutate(sample = "Sample 3"),
  df_4_summary %>% mutate(sample = "Sample 4"),
  df_5_summary %>% mutate(sample = "Sample 5")
)
print(combined_summary)
## # A tibble: 30 × 4
## playlist_genre mean_danceability count sample
## <chr> <dbl> <int> <chr>
## 1 edm 0.660 3103 Sample 1
## 2 latin 0.711 2577 Sample 1
## 3 pop 0.641 2672 Sample 1
## 4 r&b 0.670 2693 Sample 1
## 5 rap 0.718 2936 Sample 1
## 6 rock 0.517 2435 Sample 1
## 7 edm 0.659 3057 Sample 2
## 8 latin 0.714 2593 Sample 2
## 9 pop 0.635 2743 Sample 2
## 10 r&b 0.671 2714 Sample 2
## # ℹ 20 more rows
# Grouped bar chart: one cluster per playlist genre, one bar per subsample,
# bar height equal to the precomputed mean danceability.
ggplot(
  data = combined_summary,
  mapping = aes(x = playlist_genre, y = mean_danceability, fill = sample)
) +
  geom_col(position = "dodge") +
  labs(
    title = "Mean Danceability by Playlist Genre Across Subsamples",
    x = "Playlist Genre",
    y = "Mean Danceability"
  ) +
  theme_minimal()
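The five subsamples give nearly identical genre-level means (for example, edm is 0.660 in Sample 1 and 0.659 in Sample 2; latin is 0.711 and 0.714), so mean danceability is stable across samples, while the differences between genres (rock at about 0.52 versus rap at about 0.72 in Sample 1) are much larger than the sample-to-sample fluctuation.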
Visualization confirms genre-specific danceability trends, revealing stable or variable genres.
Ensures robust conclusions by assessing sampling variability.
Informs predictive modeling with consistent patterns across samples.
Are other features like energy or tempo similarly genre-dependent?
How do danceability trends change over time?
Does playlist popularity correlate with danceability?
set.seed(123) # For reproducibility
num_simulations <- 1000

# Only the danceability column is needed, so extract it once per subsample
# instead of copying every column of the data frame on each iteration.
danceability_by_sample <- list(
  df_1 = df_1[["danceability"]],
  df_2 = df_2[["danceability"]],
  df_3 = df_3[["danceability"]],
  df_4 = df_4[["danceability"]],
  df_5 = df_5[["danceability"]]
)

# Preallocate one numeric vector of simulated means per subsample; the list
# keeps the names df_1 ... df_5.
monte_carlo_results_list <- lapply(
  danceability_by_sample,
  function(values) numeric(num_simulations)
)

# Monte Carlo resampling: in every iteration, redraw `sample_size` values
# with replacement from each subsample and record the mean danceability.
# The subsamples are visited in the order df_1 ... df_5 inside each
# iteration so the random draws follow the same sequence as resampling row
# indices of each data frame in turn.
for (i in seq_len(num_simulations)) {
  for (sample_name in names(danceability_by_sample)) {
    values <- danceability_by_sample[[sample_name]]
    drawn <- values[sample.int(length(values), sample_size, replace = TRUE)]
    monte_carlo_results_list[[sample_name]][i] <- mean(drawn, na.rm = TRUE)
  }
}
# Stack the simulated means of all five subsamples into one long data frame,
# labelling each block of `num_simulations` values with its sample name.
mc_result_names <- paste0("df_", 1:5)
mc_sample_labels <- paste("Sample", 1:5)
monte_carlo_combined <- data.frame(
  mean_danceability = unlist(
    monte_carlo_results_list[mc_result_names],
    use.names = FALSE
  ),
  sample = factor(rep(mc_sample_labels, each = num_simulations))
)
# One histogram of simulated mean danceability per sample, stacked in a
# single column of facets.
ggplot(data = monte_carlo_combined) +
  geom_histogram(
    mapping = aes(x = mean_danceability),
    bins = 30,
    color = "black",
    fill = "lightblue"
  ) +
  facet_wrap(~ sample, ncol = 1) +
  labs(
    title = "Monte Carlo Simulation: Distribution of Mean Danceability",
    x = "Mean Danceability",
    y = "Frequency"
  ) +
  theme_minimal()
# Mean, median and standard deviation of the simulated means, one row per
# sample.
mc_draws <- monte_carlo_results_list[paste0("df_", 1:5)]
monte_carlo_summary <- data.frame(
  Sample = paste("Sample", 1:5),
  # USE.NAMES = FALSE keeps the default 1..5 row names; named vectors would
  # otherwise become the data frame's row names.
  Mean = vapply(mc_draws, mean, numeric(1), USE.NAMES = FALSE),
  Median = vapply(mc_draws, median, numeric(1), USE.NAMES = FALSE),
  SD = vapply(mc_draws, sd, numeric(1), USE.NAMES = FALSE)
)

# Show the table.
print(monte_carlo_summary)
## Sample Mean Median SD
## 1 Sample 1 0.6556869 0.6556870 0.001171692
## 2 Sample 2 0.6553648 0.6553236 0.001143691
## 3 Sample 3 0.6556018 0.6556415 0.001137348
## 4 Sample 4 0.6535446 0.6535748 0.001122815
## 5 Sample 5 0.6561305 0.6561734 0.001144192
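The Monte Carlo simulation resamples each subsample 1,000 times and records the mean danceability of every resample, showing how much that mean fluctuates from sampling alone. Within each subsample the mean and median of the simulated means nearly coincide, and the standard deviations are small and almost equal (about 0.0011 for all five), which indicates a stable estimate; the centers of the five subsamples differ only slightly (from 0.6535 to 0.6561).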
Demonstrates that danceability is a stable feature across samples, reinforcing its reliability for further analysis or modeling.
Ensures that observed trends aren’t due to random fluctuations, adding confidence in data-driven decisions.
The sub-samples show small differences in means and counts across the categories of playlist_genre. Because they were all drawn from the same population, these differences reflect sampling variation: each sub-sample captured a slightly different slice of the data distribution. Some genres may be overrepresented in some samples and not in others, which contributes to the differences in the mean danceability score.
A playlist_genre might show an unusually high mean danceability within one subsample. If the danceability of a genre such as “pop” is markedly higher in one subsample than in the others, this could simply be due to chance rather than any true trend. This further justifies comparing several samples, since there is always a risk of labeling a value as an anomaly when it is in fact an artifact of random sampling.
Despite these differences, some categories of playlist_genre may consistently show high or low danceability across all subsamples. This would suggest that some characteristics of the data are stable and representative of the population, regardless of which sample is drawn. Finding such consistent patterns informs data miners about the core structure of the data.
The Monte Carlo simulation provided the distribution of mean danceability across 1,000 simulated samples. This is useful for describing sampling variability and helps show that, even though individual samples differ in their means, those means cluster around one value. It thus supports the idea that repeated sampling is useful for identifying reliable trends and reduces the risk of over-interpreting any one subsample.