Clase04

# install.packages("dplyr")
library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# install.packages("readr")

library(readr)# leer csv
getwd() # Verificar dirección

## [1] "C:/Users/ingri/OneDrive/Documentos/R_BIG DATA/R_Big_data"

setwd("C:/Users/ingri/OneDrive/Documentos/R_BIG DATA/BASE DATOS")# Cambiar la dirección

billboard100 <- read_csv("billboard100.csv")

## Rows: 330087 Columns: 7

## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (2): song, artist
## dbl  (4): rank, last-week, peak-rank, weeks-on-board
## date (1): date
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# PIPING %>%

head(billboard100,10) # Mostrar 10 primeras filas de dataframe

## # A tibble: 10 × 7
##    date        rank song         artist `last-week` `peak-rank` `weeks-on-board`
##    <date>     <dbl> <chr>        <chr>        <dbl>       <dbl>            <dbl>
##  1 2021-11-06     1 Easy On Me   Adele            1           1                3
##  2 2021-11-06     2 Stay         The K…           2           1               16
##  3 2021-11-06     3 Industry Ba… Lil N…           3           1               14
##  4 2021-11-06     4 Fancy Like   Walke…           4           3               19
##  5 2021-11-06     5 Bad Habits   Ed Sh…           5           2               18
##  6 2021-11-06     6 Way 2 Sexy   Drake…           6           1                8
##  7 2021-11-06     7 Shivers      Ed Sh…           9           7                7
##  8 2021-11-06     8 Good 4 U     Olivi…           7           1               24
##  9 2021-11-06     9 Need To Know Doja …          11           9               20
## 10 2021-11-06    10 Levitating   Dua L…           8           2               56

billboard100 %>% head(10) # trae los 10 primeros

## # A tibble: 10 × 7
##    date        rank song         artist `last-week` `peak-rank` `weeks-on-board`
##    <date>     <dbl> <chr>        <chr>        <dbl>       <dbl>            <dbl>
##  1 2021-11-06     1 Easy On Me   Adele            1           1                3
##  2 2021-11-06     2 Stay         The K…           2           1               16
##  3 2021-11-06     3 Industry Ba… Lil N…           3           1               14
##  4 2021-11-06     4 Fancy Like   Walke…           4           3               19
##  5 2021-11-06     5 Bad Habits   Ed Sh…           5           2               18
##  6 2021-11-06     6 Way 2 Sexy   Drake…           6           1                8
##  7 2021-11-06     7 Shivers      Ed Sh…           9           7                7
##  8 2021-11-06     8 Good 4 U     Olivi…           7           1               24
##  9 2021-11-06     9 Need To Know Doja …          11           9               20
## 10 2021-11-06    10 Levitating   Dua L…           8           2               56

# %>% El operador %>% (pipe) facilita la escritura de código más legible y facilita la secuencia de operaciones en una tubería (pipeline). En lugar de anidar funciones o asignar resultados intermedios a variables, el operador %>% permite encadenar las operaciones de una manera más clara y directa.

10 %>% head(billboard100,.)  # tomar el primer dato en este caso 10

## # A tibble: 10 × 7
##    date        rank song         artist `last-week` `peak-rank` `weeks-on-board`
##    <date>     <dbl> <chr>        <chr>        <dbl>       <dbl>            <dbl>
##  1 2021-11-06     1 Easy On Me   Adele            1           1                3
##  2 2021-11-06     2 Stay         The K…           2           1               16
##  3 2021-11-06     3 Industry Ba… Lil N…           3           1               14
##  4 2021-11-06     4 Fancy Like   Walke…           4           3               19
##  5 2021-11-06     5 Bad Habits   Ed Sh…           5           2               18
##  6 2021-11-06     6 Way 2 Sexy   Drake…           6           1                8
##  7 2021-11-06     7 Shivers      Ed Sh…           9           7                7
##  8 2021-11-06     8 Good 4 U     Olivi…           7           1               24
##  9 2021-11-06     9 Need To Know Doja …          11           9               20
## 10 2021-11-06    10 Levitating   Dua L…           8           2               56

# SELECT
## La función select() se utiliza para seleccionar columnas específicas de un marco de datos. Puede ser útil cuando estás trabajando con conjuntos de datos grandes y solo necesitas trabajar con un subconjunto específico de columnas.


billboard100 %>% select(date, rank, song, artist,"weeks-on-board")

## # A tibble: 330,087 × 5
##    date        rank song          artist                        `weeks-on-board`
##    <date>     <dbl> <chr>         <chr>                                    <dbl>
##  1 2021-11-06     1 Easy On Me    Adele                                        3
##  2 2021-11-06     2 Stay          The Kid LAROI & Justin Bieber               16
##  3 2021-11-06     3 Industry Baby Lil Nas X & Jack Harlow                     14
##  4 2021-11-06     4 Fancy Like    Walker Hayes                                19
##  5 2021-11-06     5 Bad Habits    Ed Sheeran                                  18
##  6 2021-11-06     6 Way 2 Sexy    Drake Featuring Future & You…                8
##  7 2021-11-06     7 Shivers       Ed Sheeran                                   7
##  8 2021-11-06     8 Good 4 U      Olivia Rodrigo                              24
##  9 2021-11-06     9 Need To Know  Doja Cat                                    20
## 10 2021-11-06    10 Levitating    Dua Lipa                                    56
## # ℹ 330,077 more rows

# seleccioanr con las columnas con las que se quieren trabajar
# "weeks-on-board" se coloca por la forma que esta escrito

billboard100 %>% select(date:artist,weeks_popular="weeks-on-board")

## # A tibble: 330,087 × 5
##    date        rank song          artist                           weeks_popular
##    <date>     <dbl> <chr>         <chr>                                    <dbl>
##  1 2021-11-06     1 Easy On Me    Adele                                        3
##  2 2021-11-06     2 Stay          The Kid LAROI & Justin Bieber               16
##  3 2021-11-06     3 Industry Baby Lil Nas X & Jack Harlow                     14
##  4 2021-11-06     4 Fancy Like    Walker Hayes                                19
##  5 2021-11-06     5 Bad Habits    Ed Sheeran                                  18
##  6 2021-11-06     6 Way 2 Sexy    Drake Featuring Future & Young …             8
##  7 2021-11-06     7 Shivers       Ed Sheeran                                   7
##  8 2021-11-06     8 Good 4 U      Olivia Rodrigo                              24
##  9 2021-11-06     9 Need To Know  Doja Cat                                    20
## 10 2021-11-06    10 Levitating    Dua Lipa                                    56
## # ℹ 330,077 more rows

# seleciionar todas las columnas desde la columna data hasta artist 
#,weeks_popular="weeks-on-board"------ los que hace es renombrar la solumna weeks_popular

billboard100 %>% select(-"last-week",-"peak-rank")  # Excluir las columnas

## # A tibble: 330,087 × 5
##    date        rank song          artist                        `weeks-on-board`
##    <date>     <dbl> <chr>         <chr>                                    <dbl>
##  1 2021-11-06     1 Easy On Me    Adele                                        3
##  2 2021-11-06     2 Stay          The Kid LAROI & Justin Bieber               16
##  3 2021-11-06     3 Industry Baby Lil Nas X & Jack Harlow                     14
##  4 2021-11-06     4 Fancy Like    Walker Hayes                                19
##  5 2021-11-06     5 Bad Habits    Ed Sheeran                                  18
##  6 2021-11-06     6 Way 2 Sexy    Drake Featuring Future & You…                8
##  7 2021-11-06     7 Shivers       Ed Sheeran                                   7
##  8 2021-11-06     8 Good 4 U      Olivia Rodrigo                              24
##  9 2021-11-06     9 Need To Know  Doja Cat                                    20
## 10 2021-11-06    10 Levitating    Dua Lipa                                    56
## # ℹ 330,077 more rows

# MUTATE: La función mutate() se utiliza para agregar nuevas columnas o modificar columnas existentes en un marco de datos. Puedes realizar operaciones aritméticas, aplicar funciones a columnas existentes y crear nuevas variables basadas en las existentes.

billboard100 %>%
  select(date:artist, weeks_popular="weeks-on-board") %>%
  mutate(is_collab = grepl("Kiss", song) & grepl("Drake", artist)) %>%
  select(song,artist, is_collab, everything())#organizar llas columnas

## # A tibble: 330,087 × 6
##    song          artist                 is_collab date        rank weeks_popular
##    <chr>         <chr>                  <lgl>     <date>     <dbl>         <dbl>
##  1 Easy On Me    Adele                  FALSE     2021-11-06     1             3
##  2 Stay          The Kid LAROI & Justi… FALSE     2021-11-06     2            16
##  3 Industry Baby Lil Nas X & Jack Harl… FALSE     2021-11-06     3            14
##  4 Fancy Like    Walker Hayes           FALSE     2021-11-06     4            19
##  5 Bad Habits    Ed Sheeran             FALSE     2021-11-06     5            18
##  6 Way 2 Sexy    Drake Featuring Futur… FALSE     2021-11-06     6             8
##  7 Shivers       Ed Sheeran             FALSE     2021-11-06     7             7
##  8 Good 4 U      Olivia Rodrigo         FALSE     2021-11-06     8            24
##  9 Need To Know  Doja Cat               FALSE     2021-11-06     9            20
## 10 Levitating    Dua Lipa               FALSE     2021-11-06    10            56
## # ℹ 330,077 more rows

# mutate(is_collab = grepl('Kiss', song) & grepl('Drake', artist))  : verifica si kiss esta presente en la columna son devielve TRUE si esta de lo contrario seria FALSE, asi con la de artis ="drake".Si ambas condiciones cumplen

# FILTER: La función filter() se utiliza para filtrar filas específicas de un marco de datos basándose en condiciones dadas. Puedes usar operadores lógicos y comparaciones para especificar las condiciones que determinarán qué filas deben ser incluidas en el resultado


billboard100 %>% 
select(date,rank,song, artist,
       weeks_popular="weeks-on-board") %>% 
  filter(weeks_popular >=20, artist == "Shakira" | artist == "Taylor Swift")

## # A tibble: 250 × 5
##    date        rank song                  artist       weeks_popular
##    <date>     <dbl> <chr>                 <chr>                <dbl>
##  1 2021-05-08    61 Willow                Taylor Swift            20
##  2 2020-01-25    50 Lover                 Taylor Swift            22
##  3 2020-01-18    36 Lover                 Taylor Swift            21
##  4 2020-01-11    34 Lover                 Taylor Swift            20
##  5 2019-11-16    46 You Need To Calm Down Taylor Swift            21
##  6 2019-11-09    52 You Need To Calm Down Taylor Swift            20
##  7 2018-11-17    49 Delicate              Taylor Swift            35
##  8 2018-11-10    43 Delicate              Taylor Swift            34
##  9 2018-11-03    38 Delicate              Taylor Swift            33
## 10 2018-10-27    34 Delicate              Taylor Swift            32
## # ℹ 240 more rows

# Distinct:La función distinct() se utiliza para obtener las filas únicas de un marco de datos o de un conjunto de columnas específicas dentro de un marco de datos. Puedes utilizar esta función para eliminar duplicados basándote en una o más columnas.


billboard100 %>% select(date:artist, weeks_popular ="weeks-on-board") %>% filter(artist == "Drake")

## # A tibble: 787 × 5
##    date        rank song                       artist weeks_popular
##    <date>     <dbl> <chr>                      <chr>          <dbl>
##  1 2021-11-06    91 No Friends In The Industry Drake              8
##  2 2021-10-30    87 No Friends In The Industry Drake              7
##  3 2021-10-30    90 Champagne Poetry           Drake              7
##  4 2021-10-23    74 No Friends In The Industry Drake              6
##  5 2021-10-23    77 Champagne Poetry           Drake              6
##  6 2021-10-16    64 No Friends In The Industry Drake              5
##  7 2021-10-16    65 Champagne Poetry           Drake              5
##  8 2021-10-16    98 TSU                        Drake              5
##  9 2021-10-09    54 Champagne Poetry           Drake              4
## 10 2021-10-09    60 No Friends In The Industry Drake              4
## # ℹ 777 more rows

distinct <- billboard100 %>%
  select(date:artist, weeks_popular='weeks-on-board') %>%
  filter(artist == 'Drake') %>%
  distinct(song)

# distinct(song) selecciona los valores unicos de la columna song, eliminando cualquier duplicado.

# Group_by & Summarise:La función group_by() se utiliza para agrupar un marco de datos por una o más columnas. Cuando se aplica group_by(), se crea un “grupo” para cada combinación única de los valores en las columnas especificadas. Posteriormente, puedes aplicar funciones de resumen, como summarise(), a cada uno de estos grupos.

billboard100 %>% select(date:artist, weeks_popular= "weeks-on-board")   %>%  filter(artist == "Drake") %>%   group_by(song) %>% summarise(total_weeks_popular = mean(weeks_popular))

## # A tibble: 108 × 2
##    song                    total_weeks_popular
##    <chr>                                 <dbl>
##  1 0 To 100 / The Catch Up                10.5
##  2 10 Bands                                7  
##  3 30 For 30 Freestyle                     1.5
##  4 6 God                                   1  
##  5 6 Man                                   1  
##  6 7am On Bridle Path                      2  
##  7 8 Out Of 10                             2  
##  8 9                                       3  
##  9 9 AM In Dallas                          1  
## 10 Back To Back                           10.5
## # ℹ 98 more rows

# group_by(song) esta agrupando los datos por la columna song, es decir que los datos vana dividirse en grupos

#summarise(total_weeks_popular = mean(weeks_popular)):despues de crear la agrupacion calcaula el primedio de la columna weeks_popular y lo aalmcena en  una columna llamada total_weeks_popular

# ARRANGE : La función arrange() en la librería dplyr de R se utiliza para ordenar las filas de un marco de datos según una o más columnas. Puedes especificar el orden ascendente o descendente para cada columna.


billboard100 %>% select(date:artist, weeks_popular="weeks-on-board") %>% filter(artist == "Drake") %>%
group_by(song) %>% summarise(total_weeks_popular= max(weeks_popular)) %>% arrange(desc(total_weeks_popular),song) %>% head(10)

## # A tibble: 10 × 2
##    song                    total_weeks_popular
##    <chr>                                 <dbl>
##  1 God's Plan                               36
##  2 Hotline Bling                            36
##  3 Controlla                                26
##  4 Fake Love                                25
##  5 Headlines                                25
##  6 Nice For What                            25
##  7 Best I Ever Had                          24
##  8 In My Feelings                           22
##  9 Nonstop                                  22
## 10 Started From The Bottom                  22

#  maximo de semanas  summarise(total_weeks_popular= max(weeks_popular))

#  ordena  de forma descendiente por semanas  maximas arrange(desc(total_weeks_popular),song), es decir va mostrar las canciones con mayor semanas

# Count:

billboard100 %>% select(date:artist, weeks_popular='weeks-on-board')  %>% count(artist) %>%
  arrange(desc(n))

## # A tibble: 10,205 × 2
##    artist            n
##    <chr>         <int>
##  1 Taylor Swift   1023
##  2 Elton John      889
##  3 Madonna         857
##  4 Drake           787
##  5 Kenny Chesney   769
##  6 Tim McGraw      731
##  7 Keith Urban     673
##  8 Stevie Wonder   659
##  9 Rod Stewart     657
## 10 Mariah Carey    621
## # ℹ 10,195 more rows

Clase04

ingrid castaño

2024-07-26