#install.packages("dplyr")
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#install.packages("readr")
library(readr)
# Read billboard100.csv (path is relative to the working directory) into a
# tibble; readr infers the column types from the file contents.
billboard100 <- read_csv(file = "billboard100.csv")
## Rows: 330087 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): song, artist
## dbl (4): rank, last-week, peak-rank, weeks-on-board
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Size of the dataset: row count first, then column count.
billboard100 %>% nrow()
## [1] 330087
billboard100 %>% ncol()
## [1] 7
# Column types readr assigned while parsing the file.
billboard100 %>% spec()
## cols(
## date = col_date(format = ""),
## rank = col_double(),
## song = col_character(),
## artist = col_character(),
## `last-week` = col_double(),
## `peak-rank` = col_double(),
## `weeks-on-board` = col_double()
## )
# Preview the first 10 rows using a plain function call.
head(billboard100, n = 10)
## # A tibble: 10 × 7
## date rank song artist `last-week` `peak-rank` `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 1 1 3
## 2 2021-11-06 2 Stay The K… 2 1 16
## 3 2021-11-06 3 Industry Ba… Lil N… 3 1 14
## 4 2021-11-06 4 Fancy Like Walke… 4 3 19
## 5 2021-11-06 5 Bad Habits Ed Sh… 5 2 18
## 6 2021-11-06 6 Way 2 Sexy Drake… 6 1 8
## 7 2021-11-06 7 Shivers Ed Sh… 9 7 7
## 8 2021-11-06 8 Good 4 U Olivi… 7 1 24
## 9 2021-11-06 9 Need To Know Doja … 11 9 20
## 10 2021-11-06 10 Levitating Dua L… 8 2 56
# Same preview with the pipe: the tibble on the left becomes head()'s first
# argument.
billboard100 %>%
  head(n = 10)
## # A tibble: 10 × 7
## date rank song artist `last-week` `peak-rank` `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 1 1 3
## 2 2021-11-06 2 Stay The K… 2 1 16
## 3 2021-11-06 3 Industry Ba… Lil N… 3 1 14
## 4 2021-11-06 4 Fancy Like Walke… 4 3 19
## 5 2021-11-06 5 Bad Habits Ed Sh… 5 2 18
## 6 2021-11-06 6 Way 2 Sexy Drake… 6 1 8
## 7 2021-11-06 7 Shivers Ed Sh… 9 7 7
## 8 2021-11-06 8 Good 4 U Olivi… 7 1 24
## 9 2021-11-06 9 Need To Know Doja … 11 9 20
## 10 2021-11-06 10 Levitating Dua L… 8 2 56
# Same preview again, this time piping the row count: `.` marks where the
# left-hand side (10) goes, so it is not inserted as the first argument.
10 %>%
  head(billboard100, n = .)
## # A tibble: 10 × 7
## date rank song artist `last-week` `peak-rank` `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 1 1 3
## 2 2021-11-06 2 Stay The K… 2 1 16
## 3 2021-11-06 3 Industry Ba… Lil N… 3 1 14
## 4 2021-11-06 4 Fancy Like Walke… 4 3 19
## 5 2021-11-06 5 Bad Habits Ed Sh… 5 2 18
## 6 2021-11-06 6 Way 2 Sexy Drake… 6 1 8
## 7 2021-11-06 7 Shivers Ed Sh… 9 7 7
## 8 2021-11-06 8 Good 4 U Olivi… 7 1 24
## 9 2021-11-06 9 Need To Know Doja … 11 9 20
## 10 2021-11-06 10 Levitating Dua L… 8 2 56
La función select() se utiliza para seleccionar columnas específicas de un marco de datos. Puede ser útil cuando estás trabajando con conjuntos de datos grandes y solo necesitas trabajar con un subconjunto específico de columnas.
# Keep only the date, rank, song and artist columns, named explicitly.
billboard100 %>%
  select(all_of(c("date", "rank", "song", "artist")))
## # A tibble: 330,087 × 4
## date rank song artist
## <date> <dbl> <chr> <chr>
## 1 2021-11-06 1 Easy On Me Adele
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow
## 4 2021-11-06 4 Fancy Like Walker Hayes
## 5 2021-11-06 5 Bad Habits Ed Sheeran
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & Young Thug
## 7 2021-11-06 7 Shivers Ed Sheeran
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo
## 9 2021-11-06 9 Need To Know Doja Cat
## 10 2021-11-06 10 Levitating Dua Lipa
## # ℹ 330,077 more rows
# Keep date, rank, song and artist, and bring `weeks-on-board` along under the
# syntactic name weeks_popular (select() renames with new_name = old_name).
billboard100 %>%
  select(
    date, rank, song, artist,
    weeks_popular = `weeks-on-board`
  )
## # A tibble: 330,087 × 5
## date rank song artist weeks_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & Young … 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
# Drop `last-week` and `peak-rank`; every other column is kept.
billboard100 %>%
  select(-c(`last-week`, `peak-rank`))
## # A tibble: 330,087 × 5
## date rank song artist `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
La función mutate() se utiliza para agregar nuevas columnas o modificar columnas existentes en un marco de datos. Puedes realizar operaciones aritméticas, aplicar funciones a columnas existentes y crear nuevas variables basadas en las existentes.
# Flag collaborations: is_collab is TRUE when the artist credit contains the
# literal word "Featuring" (credits joined only with "&" are not flagged).
# The final select() moves song, artist and the new flag to the front and
# keeps the remaining columns in their existing order.
billboard100 %>%
  select(date:artist, weeks_popular = `weeks-on-board`) %>%
  mutate(is_collab = grepl("Featuring", artist, fixed = TRUE)) %>%
  select(song, artist, is_collab, everything())
## # A tibble: 330,087 × 6
## song artist is_collab date rank weeks_popular
## <chr> <chr> <lgl> <date> <dbl> <dbl>
## 1 Easy On Me Adele FALSE 2021-11-06 1 3
## 2 Stay The Kid LAROI & Justi… FALSE 2021-11-06 2 16
## 3 Industry Baby Lil Nas X & Jack Harl… FALSE 2021-11-06 3 14
## 4 Fancy Like Walker Hayes FALSE 2021-11-06 4 19
## 5 Bad Habits Ed Sheeran FALSE 2021-11-06 5 18
## 6 Way 2 Sexy Drake Featuring Futur… TRUE 2021-11-06 6 8
## 7 Shivers Ed Sheeran FALSE 2021-11-06 7 7
## 8 Good 4 U Olivia Rodrigo FALSE 2021-11-06 8 24
## 9 Need To Know Doja Cat FALSE 2021-11-06 9 20
## 10 Levitating Dua Lipa FALSE 2021-11-06 10 56
## # ℹ 330,077 more rows
La función grepl() se utiliza para buscar patrones en texto, y en este caso, está buscando la presencia de la palabra ‘Featuring’ en la columna “artist”.
La función filter() se utiliza para filtrar filas específicas de un marco de datos basándose en condiciones dadas. Puedes usar operadores lógicos y comparaciones para especificar las condiciones que determinarán qué filas deben ser incluidas en el resultado.
# Rows with at least 20 weeks on the chart whose artist credit is exactly
# "Shakira" or "Taylor Swift". Conditions separated by commas inside filter()
# must all hold.
billboard100 %>%
  select(date:artist, weeks_popular = `weeks-on-board`) %>%
  filter(
    artist %in% c("Shakira", "Taylor Swift"),
    weeks_popular >= 20
  )
## # A tibble: 250 × 5
## date rank song artist weeks_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-05-08 61 Willow Taylor Swift 20
## 2 2020-01-25 50 Lover Taylor Swift 22
## 3 2020-01-18 36 Lover Taylor Swift 21
## 4 2020-01-11 34 Lover Taylor Swift 20
## 5 2019-11-16 46 You Need To Calm Down Taylor Swift 21
## 6 2019-11-09 52 You Need To Calm Down Taylor Swift 20
## 7 2018-11-17 49 Delicate Taylor Swift 35
## 8 2018-11-10 43 Delicate Taylor Swift 34
## 9 2018-11-03 38 Delicate Taylor Swift 33
## 10 2018-10-27 34 Delicate Taylor Swift 32
## # ℹ 240 more rows
La función distinct() se utiliza para obtener las filas únicas de un marco de datos o de un conjunto de columnas específicas dentro de un marco de datos. Puedes utilizar esta función para eliminar duplicados basándote en una o más columnas.
# Every chart row whose artist credit is exactly "Drake"; the equality test
# does not match longer credits such as "Drake Featuring ...".
billboard100 %>%
  filter(artist == "Drake") %>%
  select(date:artist, weeks_popular = "weeks-on-board")
## # A tibble: 787 × 5
## date rank song artist weeks_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 91 No Friends In The Industry Drake 8
## 2 2021-10-30 87 No Friends In The Industry Drake 7
## 3 2021-10-30 90 Champagne Poetry Drake 7
## 4 2021-10-23 74 No Friends In The Industry Drake 6
## 5 2021-10-23 77 Champagne Poetry Drake 6
## 6 2021-10-16 64 No Friends In The Industry Drake 5
## 7 2021-10-16 65 Champagne Poetry Drake 5
## 8 2021-10-16 98 TSU Drake 5
## 9 2021-10-09 54 Champagne Poetry Drake 4
## 10 2021-10-09 60 No Friends In The Industry Drake 4
## # ℹ 777 more rows
# Unique song titles among the rows credited to exactly "Drake".
# NOTE(review): the result is stored in a variable called `distinct`, the same
# name as the dplyr verb. The call distinct(song) below still resolves to the
# function, because R looks for a function when a name is used in call
# position, but a more descriptive variable name would be clearer.
distinct <- billboard100 %>%
  filter(artist == "Drake") %>%
  select(date:artist, weeks_popular = "weeks-on-board") %>%
  distinct(song)
# Show the resulting one-column tibble.
distinct
## # A tibble: 108 × 1
## song
## <chr>
## 1 No Friends In The Industry
## 2 Champagne Poetry
## 3 TSU
## 4 Pipe Down
## 5 Papi's Home
## 6 Race My Mind
## 7 7am On Bridle Path
## 8 Fucking Fans
## 9 The Remorse
## 10 What's Next
## # ℹ 98 more rows
## Group_by & Summarise
La función group_by() se utiliza para agrupar un marco de datos por una o más columnas. Cuando se aplica group_by(), se crea un “grupo” para cada combinación única de los valores en las columnas especificadas. Posteriormente, puedes aplicar funciones de resumen, como summarise(), a cada uno de estos grupos.
La función summarise() se utiliza para realizar resúmenes o agregaciones de datos dentro de cada grupo creado por group_by(). Puedes aplicar diversas funciones de resumen, como mean(), sum(), min(), max(), entre otras.
# Per-song summary for rows credited to exactly "Drake": one output row per
# song holding the mean of weeks_popular across that song's chart rows.
# The column is named mean_weeks_popular because it is an average, not a total.
billboard100 %>%
  select(date:artist, weeks_popular = "weeks-on-board") %>%
  filter(artist == "Drake") %>%
  group_by(song) %>%
  summarise(mean_weeks_popular = mean(weeks_popular))
## # A tibble: 108 × 2
## song mean_weeks_popular
## <chr> <dbl>
## 1 0 To 100 / The Catch Up 10.5
## 2 10 Bands 7
## 3 30 For 30 Freestyle 1.5
## 4 6 God 1
## 5 6 Man 1
## 6 7am On Bridle Path 2
## 7 8 Out Of 10 2
## 8 9 3
## 9 9 AM In Dallas 1
## 10 Back To Back 10.5
## # ℹ 98 more rows
La función arrange() en la librería dplyr de R se utiliza para ordenar las filas de un marco de datos según una o más columnas. Puedes especificar el orden ascendente o descendente para cada columna.
# Ten songs credited to exactly "Drake" with the largest weeks_popular value.
# Within each song the maximum weeks_popular is taken; ties in that maximum
# are ordered alphabetically by song title.
billboard100 %>%
  filter(artist == "Drake") %>%
  select(date:artist, weeks_popular = "weeks-on-board") %>%
  group_by(song) %>%
  summarise(total_weeks_popular = max(weeks_popular)) %>%
  arrange(desc(total_weeks_popular), song) %>%
  head(n = 10)
## # A tibble: 10 × 2
## song total_weeks_popular
## <chr> <dbl>
## 1 God's Plan 36
## 2 Hotline Bling 36
## 3 Controlla 26
## 4 Fake Love 25
## 5 Headlines 25
## 6 Nice For What 25
## 7 Best I Ever Had 24
## 8 In My Feelings 22
## 9 Nonstop 22
## 10 Started From The Bottom 22
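La función count() se utiliza para contar el número de observaciones en cada grupo. Equivale aproximadamente a aplicar group_by() seguido de summarise(n = n()), por lo que basta con pasarle directamente las columnas por las que se quiere agrupar, como en el ejemplo siguiente; también puede combinarse con group_by() para realizar recuentos dentro de grupos ya definidos en un marco de datos.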
# Number of chart rows per artist credit, largest count first. sort = TRUE
# makes count() order its result by descending n.
billboard100 %>%
  select(date:artist, weeks_popular = "weeks-on-board") %>%
  count(artist, sort = TRUE)
## # A tibble: 10,205 × 2
## artist n
## <chr> <int>
## 1 Taylor Swift 1023
## 2 Elton John 889
## 3 Madonna 857
## 4 Drake 787
## 5 Kenny Chesney 769
## 6 Tim McGraw 731
## 7 Keith Urban 673
## 8 Stevie Wonder 659
## 9 Rod Stewart 657
## 10 Mariah Carey 621
## # ℹ 10,195 more rows