#install.packages("DataExplorer")
library("DataExplorer")
# Estoy acostumbrado a dejar este tipo de datos en un dataframe, y como soy flojo y además estoy medio menso, siempre lo pongo en df simplemente para que sea más intuitivo.
#data <- read.csv("C:\\Carpeta de R\\DBs\\Most Streamed Spotify Songs 2024.csv")
df <- read.csv("C:\\Carpeta de R\\DBs\\Most Streamed Spotify Songs 2024.csv")
head(df)
## Track Album.Name Artist
## 1 MILLION DOLLAR BABY Million Dollar Baby - Single Tommy Richman
## 2 Not Like Us Not Like Us Kendrick Lamar
## 3 i like the way you kiss me I like the way you kiss me Artemas
## 4 Flowers Flowers - Single Miley Cyrus
## 5 Houdini Houdini Eminem
## 6 Lovin On Me Lovin On Me Jack Harlow
## Release.Date ISRC All.Time.Rank Track.Score Spotify.Streams
## 1 4/26/2024 QM24S2402528 1 725.4 390,470,936
## 2 5/4/2024 USUG12400910 2 545.9 323,703,884
## 3 3/19/2024 QZJ842400387 3 538.4 601,309,283
## 4 1/12/2023 USSM12209777 4 444.9 2,031,280,633
## 5 5/31/2024 USUG12403398 5 423.3 107,034,922
## 6 11/10/2023 USAT22311371 6 410.1 670,665,438
## Spotify.Playlist.Count Spotify.Playlist.Reach Spotify.Popularity
## 1 30,716 196,631,588 92
## 2 28,113 174,597,137 92
## 3 54,331 211,607,669 92
## 4 269,802 136,569,078 85
## 5 7,223 151,469,874 88
## 6 105,892 175,421,034 83
## YouTube.Views YouTube.Likes TikTok.Posts TikTok.Likes TikTok.Views
## 1 84,274,754 1,713,126 5,767,700 651,565,900 5,332,281,936
## 2 116,347,040 3,486,739 674,700 35,223,547 208,339,025
## 3 122,599,116 2,228,730 3,025,400 275,154,237 3,369,120,610
## 4 1,096,100,899 10,629,796 7,189,811 1,078,757,968 14,603,725,994
## 5 77,373,957 3,670,188 16,400
## 6 131,148,091 1,392,593 4,202,367 214,943,489 2,938,686,633
## YouTube.Playlist.Reach Apple.Music.Playlist.Count AirPlay.Spins
## 1 150,597,040 210 40,975
## 2 156,380,351 188 40,778
## 3 373,784,955 190 74,333
## 4 3,351,188,582 394 1,474,799
## 5 112,763,851 182 12,185
## 6 2,867,222,632 138 522,042
## SiriusXM.Spins Deezer.Playlist.Count Deezer.Playlist.Reach
## 1 684 62 17,598,718
## 2 3 67 10,422,430
## 3 536 136 36,321,847
## 4 2,182 264 24,684,248
## 5 1 82 17,660,624
## 6 4,654 86 17,167,254
## Amazon.Playlist.Count Pandora.Streams Pandora.Track.Stations
## 1 114 18,004,655 22,931
## 2 111 7,780,028 28,444
## 3 172 5,022,621 5,639
## 4 210 190,260,277 203,384
## 5 105 4,493,884 7,006
## 6 152 138,529,362 50,982
## Soundcloud.Streams Shazam.Counts TIDAL.Popularity Explicit.Track
## 1 4,818,457 2,669,262 NA 0
## 2 6,623,075 1,118,279 NA 1
## 3 7,208,651 5,285,340 NA 0
## 4 11,822,942 NA 0
## 5 207,179 457,017 NA 1
## 6 9,438,601 4,517,131 NA 1
# Existen valores atípicos que probablemente sucedieron por un cambio en el tipo estándar de codificación de caracteres, del que se haya usado en la DB de Spotify a probablemente UTF-8 o Latin1.
Artist <- df$Artist
ATR <- df$All.Time.Rank
SPC <- df$Spotify.Playlist.Count
TS <- df$Track.Score
ET <- df$Explicit.Track
create_report(df)
##
##
## processing file: report.rmd
## | | | 0% | |. | 2% | |.. | 5% [global_options] | |... | 7% | |.... | 10% [introduce] | |.... | 12% | |..... | 14% [plot_intro] | |...... | 17% | |....... | 19% [data_structure] | |........ | 21% | |......... | 24% [missing_profile] | |.......... | 26% | |........... | 29% [univariate_distribution_header] | |........... | 31% | |............ | 33% [plot_histogram] | |............. | 36% | |.............. | 38% [plot_density] | |............... | 40% | |................ | 43% [plot_frequency_bar] | |................. | 45% | |.................. | 48% [plot_response_bar] | |.................. | 50% | |................... | 52% [plot_with_bar] | |.................... | 55% | |..................... | 57% [plot_normal_qq] | |...................... | 60% | |....................... | 62% [plot_response_qq] | |........................ | 64% | |......................... | 67% [plot_by_qq] | |.......................... | 69% | |.......................... | 71% [correlation_analysis] | |........................... | 74% | |............................ | 76% [principal_component_analysis] | |............................. | 79% | |.............................. | 81% [bivariate_distribution_header] | |............................... | 83% | |................................ | 86% [plot_response_boxplot] | |................................. | 88% | |................................. | 90% [plot_by_boxplot] | |.................................. | 93% | |................................... | 95% [plot_response_scatterplot] | |.................................... | 98% | |.....................................| 100% [plot_by_scatterplot]
## output file: C:/Carpeta de R/Ejercicios Markdowns/report.knit.md
## "C:/Program Files/RStudio/resources/app/bin/quarto/bin/tools/pandoc" +RTS -K512m -RTS "C:\CARPET~1\EJERCI~1\REPORT~1.MD" --to html4 --from markdown+autolink_bare_uris+tex_math_single_backslash --output pandoc20707d712d55.html --lua-filter "C:\Users\esteb\AppData\Local\R\win-library\4.4\rmarkdown\rmarkdown\lua\pagebreak.lua" --lua-filter "C:\Users\esteb\AppData\Local\R\win-library\4.4\rmarkdown\rmarkdown\lua\latex-div.lua" --embed-resources --standalone --variable bs3=TRUE --section-divs --table-of-contents --toc-depth 6 --template "C:\Users\esteb\AppData\Local\R\win-library\4.4\rmarkdown\rmd\h\default.html" --no-highlight --variable highlightjs=1 --variable theme=yeti --mathjax --variable "mathjax-url=https://mathjax.rstudio.com/latest/MathJax.js?config=TeX-AMS-MML_HTMLorMML" --include-in-header "C:\Users\esteb\AppData\Local\Temp\Rtmpc50Ugh\rmarkdown-str20701239144c.html"
##
## Output created: report.html
introduce(df)
## rows columns discrete_columns continuous_columns all_missing_columns
## 1 4600 29 22 6 1
## total_missing_values complete_rows total_observations memory_usage
## 1 7941 0 133400 5679272
plot_intro(df)
plot_boxplot(df, by = "Spotify.Popularity")
## Warning: Removed 2537 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
plot_missing(df)
plot_histogram(df)
plot_bar(df)
## 22 columns ignored with more than 50 categories.
## Track: 4370 categories
## Album.Name: 4005 categories
## Artist: 2000 categories
## Release.Date: 1562 categories
## ISRC: 4598 categories
## All.Time.Rank: 4577 categories
## Spotify.Streams: 4426 categories
## Spotify.Playlist.Count: 4208 categories
## Spotify.Playlist.Reach: 4479 categories
## YouTube.Views: 4291 categories
## YouTube.Likes: 4284 categories
## TikTok.Posts: 3319 categories
## TikTok.Likes: 3616 categories
## TikTok.Views: 3617 categories
## YouTube.Playlist.Reach: 3459 categories
## AirPlay.Spins: 3268 categories
## SiriusXM.Spins: 690 categories
## Deezer.Playlist.Reach: 3559 categories
## Pandora.Streams: 3492 categories
## Pandora.Track.Stations: 2976 categories
## Soundcloud.Streams: 1266 categories
## Shazam.Counts: 4003 categories
plot_correlation(df)
## Warning in dummify(data, maxcat = maxcat): Ignored all discrete features since
## `maxcat` set to 20 categories!
## Warning: Removed 28 rows containing missing values or values outside the scale range
## (`geom_text()`).
La librería DataExplorer nos permite observar los datos de una manera más visual, además de ayudar en el proceso EDA.