# Use the CRAN cloud mirror so package installation works non-interactively.
options(repos = c(CRAN = "https://cloud.r-project.org"))
# Install dplyr only when it is missing. An unconditional reinstall runs on
# every execution of the script, and the captured log shows it failing with
# "Permission denied" when the existing installation cannot be replaced.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'dplyr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'dplyr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problema al copiar
## C:\Users\Usuario\AppData\Local\R\win-library\4.4\00LOCK\dplyr\libs\x64\dplyr.dll
## a C:\Users\Usuario\AppData\Local\R\win-library\4.4\dplyr\libs\x64\dplyr.dll:
## Permission denied
## Warning: restored 'dplyr'
##
## The downloaded binary packages are in
## C:\Users\Usuario\AppData\Local\Temp\RtmpktV7nz\downloaded_packages
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Show the details of the running R build (platform, version, release date).
R.version
## _
## platform x86_64-w64-mingw32
## arch x86_64
## os mingw32
## crt ucrt
## system x86_64, mingw32
## status
## major 4
## minor 4.1
## year 2024
## month 06
## day 14
## svn rev 86737
## language R
## version.string R version 4.4.1 (2024-06-14 ucrt)
## nickname Race for Your Life
# Put the Rtools bin directory on PATH through ~/.Renviron.
# The existing file is read first and the entry is added only when it is
# missing, so settings already stored there are preserved instead of being
# overwritten, and repeated runs do not duplicate the line.
# NOTE(review): the entry uses RTOOLS43_HOME, while this session is R 4.4.1
# and make resolves under rtools44 -- confirm which variable is intended.
invisible(local({
  renviron_path <- "~/.Renviron"
  rtools_entry <- 'PATH="${RTOOLS43_HOME}\\usr\\bin;${PATH}"'
  current_lines <- if (file.exists(renviron_path)) {
    readLines(renviron_path, warn = FALSE)
  } else {
    character(0)
  }
  if (!rtools_entry %in% current_lines) {
    writeLines(c(current_lines, rtools_entry), con = renviron_path)
  }
}))
# Report which make executable, if any, is found on the current PATH.
Sys.which("make")
## make
## "c:\\rtools44\\usr\\bin\\make.exe"
# dplyr is already attached earlier in this script, so an unconditional
# install only produces a "package is in use" warning. Install it solely when
# it is genuinely missing.
if (!requireNamespace("dplyr", quietly = TRUE)) {
  install.packages("dplyr")
}
## Warning: package 'dplyr' is in use and will not be installed
# Install readr only when it is not available yet; reinstalling on every run
# is slow and, per the captured log, can fail to replace the existing copy.
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'readr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'readr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problema al copiar
## C:\Users\Usuario\AppData\Local\R\win-library\4.4\00LOCK\readr\libs\x64\readr.dll
## a C:\Users\Usuario\AppData\Local\R\win-library\4.4\readr\libs\x64\readr.dll:
## Permission denied
## Warning: restored 'readr'
##
## The downloaded binary packages are in
## C:\Users\Usuario\AppData\Local\Temp\RtmpktV7nz\downloaded_packages
library(readr)
# Read the chart data from the working directory into a tibble; column types
# are guessed by readr and reported in the message that follows.
billboard100 <- readr::read_csv(file = "billboard100.csv")
## Rows: 330087 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): song, artist
## dbl (4): rank, last-week, peak-rank, weeks-on-board
## date (1): date
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Show the working directory that relative paths in this script resolve
# against.
getwd()
## [1] "C:/Users/Usuario/Desktop/DIPLOMADO/Diplomado Big Data"
# Piping with %>% ----
# Baseline without a pipe: the first 10 rows via a plain function call.
head(x = billboard100, n = 10)
## # A tibble: 10 × 7
## date rank song artist `last-week` `peak-rank` `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 1 1 3
## 2 2021-11-06 2 Stay The K… 2 1 16
## 3 2021-11-06 3 Industry Ba… Lil N… 3 1 14
## 4 2021-11-06 4 Fancy Like Walke… 4 3 19
## 5 2021-11-06 5 Bad Habits Ed Sh… 5 2 18
## 6 2021-11-06 6 Way 2 Sexy Drake… 6 1 8
## 7 2021-11-06 7 Shivers Ed Sh… 9 7 7
## 8 2021-11-06 8 Good 4 U Olivi… 7 1 24
## 9 2021-11-06 9 Need To Know Doja … 11 9 20
## 10 2021-11-06 10 Levitating Dua L… 8 2 56
# Install the tidyverse meta-package only when it is missing, so re-running
# the script does not download and reinstall it every time.
if (!requireNamespace("tidyverse", quietly = TRUE)) {
  install.packages("tidyverse")
}
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'tidyverse' successfully unpacked and MD5 sums checked
##
## The downloaded binary packages are in
## C:\Users\Usuario\AppData\Local\Temp\RtmpktV7nz\downloaded_packages
# Install magrittr only when it is missing; the captured log shows the
# unconditional reinstall failing to replace the existing installation.
if (!requireNamespace("magrittr", quietly = TRUE)) {
  install.packages("magrittr")
}
## Installing package into 'C:/Users/Usuario/AppData/Local/R/win-library/4.4'
## (as 'lib' is unspecified)
## package 'magrittr' successfully unpacked and MD5 sums checked
## Warning: cannot remove prior installation of package 'magrittr'
## Warning in file.copy(savedcopy, lib, recursive = TRUE): problema al copiar
## C:\Users\Usuario\AppData\Local\R\win-library\4.4\00LOCK\magrittr\libs\x64\magrittr.dll
## a
## C:\Users\Usuario\AppData\Local\R\win-library\4.4\magrittr\libs\x64\magrittr.dll:
## Permission denied
## Warning: restored 'magrittr'
##
## The downloaded binary packages are in
## C:\Users\Usuario\AppData\Local\Temp\RtmpktV7nz\downloaded_packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(magrittr)
##
## Adjuntando el paquete: 'magrittr'
##
## The following object is masked from 'package:purrr':
##
## set_names
##
## The following object is masked from 'package:tidyr':
##
## extract
# Same result with the pipe: the left-hand side becomes head()'s first
# argument.
billboard100 %>%
  head(n = 10)
## # A tibble: 10 × 7
## date rank song artist `last-week` `peak-rank` `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 1 1 3
## 2 2021-11-06 2 Stay The K… 2 1 16
## 3 2021-11-06 3 Industry Ba… Lil N… 3 1 14
## 4 2021-11-06 4 Fancy Like Walke… 4 3 19
## 5 2021-11-06 5 Bad Habits Ed Sh… 5 2 18
## 6 2021-11-06 6 Way 2 Sexy Drake… 6 1 8
## 7 2021-11-06 7 Shivers Ed Sh… 9 7 7
## 8 2021-11-06 8 Good 4 U Olivi… 7 1 24
## 9 2021-11-06 9 Need To Know Doja … 11 9 20
## 10 2021-11-06 10 Levitating Dua L… 8 2 56
# The `.` placeholder sends the piped value to an argument other than the
# first: here 10 is supplied as the row count.
10 %>%
  head(x = billboard100, n = .)
## # A tibble: 10 × 7
## date rank song artist `last-week` `peak-rank` `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl> <dbl> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 1 1 3
## 2 2021-11-06 2 Stay The K… 2 1 16
## 3 2021-11-06 3 Industry Ba… Lil N… 3 1 14
## 4 2021-11-06 4 Fancy Like Walke… 4 3 19
## 5 2021-11-06 5 Bad Habits Ed Sh… 5 2 18
## 6 2021-11-06 6 Way 2 Sexy Drake… 6 1 8
## 7 2021-11-06 7 Shivers Ed Sh… 9 7 7
## 8 2021-11-06 8 Good 4 U Olivi… 7 1 24
## 9 2021-11-06 9 Need To Know Doja … 11 9 20
## 10 2021-11-06 10 Levitating Dua L… 8 2 56
# select(): choose columns ----
# Keep five columns by name; weeks-on-board is quoted because it is not a
# syntactic R name.
billboard100 %>%
  select(date, rank, song, artist, "weeks-on-board")
## # A tibble: 330,087 × 5
## date rank song artist `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
# Select the contiguous range date..artist and rename weeks-on-board to
# weeks_popular in the same call.
billboard100 %>%
  select(date:artist, weeks_popular = "weeks-on-board")
## # A tibble: 330,087 × 5
## date rank song artist weeks_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & Young … 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
# Negative selection: keep every column except last-week and peak-rank.
billboard100 %>%
  select(-"last-week", -"peak-rank")
## # A tibble: 330,087 × 5
## date rank song artist `weeks-on-board`
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 1 Easy On Me Adele 3
## 2 2021-11-06 2 Stay The Kid LAROI & Justin Bieber 16
## 3 2021-11-06 3 Industry Baby Lil Nas X & Jack Harlow 14
## 4 2021-11-06 4 Fancy Like Walker Hayes 19
## 5 2021-11-06 5 Bad Habits Ed Sheeran 18
## 6 2021-11-06 6 Way 2 Sexy Drake Featuring Future & You… 8
## 7 2021-11-06 7 Shivers Ed Sheeran 7
## 8 2021-11-06 8 Good 4 U Olivia Rodrigo 24
## 9 2021-11-06 9 Need To Know Doja Cat 20
## 10 2021-11-06 10 Levitating Dua Lipa 56
## # ℹ 330,077 more rows
# mutate(): add derived columns ----
# List the available column names before deriving a new one.
names(billboard100)
## [1] "date" "rank" "song" "artist"
## [5] "last-week" "peak-rank" "weeks-on-board"
# Add a logical flag that is TRUE when the song title contains "Kiss" and the
# artist contains "Drake", then move song, artist and the flag to the front.
# NOTE(review): the flag is named is_collab but the condition tests a title
# and an artist substring -- confirm this is the intended definition.
billboard100 %>%
  mutate(
    is_collab = grepl(pattern = "Kiss", x = song) &
      grepl(pattern = "Drake", x = artist)
  ) %>%
  select(song, artist, is_collab, everything())
## # A tibble: 330,087 × 8
## song artist is_collab date rank `last-week` `peak-rank`
## <chr> <chr> <lgl> <date> <dbl> <dbl> <dbl>
## 1 Easy On Me Adele FALSE 2021-11-06 1 1 1
## 2 Stay The Kid LAR… FALSE 2021-11-06 2 2 1
## 3 Industry Baby Lil Nas X &… FALSE 2021-11-06 3 3 1
## 4 Fancy Like Walker Hayes FALSE 2021-11-06 4 4 3
## 5 Bad Habits Ed Sheeran FALSE 2021-11-06 5 5 2
## 6 Way 2 Sexy Drake Featu… FALSE 2021-11-06 6 6 1
## 7 Shivers Ed Sheeran FALSE 2021-11-06 7 9 7
## 8 Good 4 U Olivia Rodr… FALSE 2021-11-06 8 7 1
## 9 Need To Know Doja Cat FALSE 2021-11-06 9 11 9
## 10 Levitating Dua Lipa FALSE 2021-11-06 10 8 2
## # ℹ 330,077 more rows
## # ℹ 1 more variable: `weeks-on-board` <dbl>
# filter(): keep rows ----
# Rows for Shakira or Taylor Swift with at least 20 weeks on the chart.
# Comma-separated conditions inside filter() are combined with AND.
billboard100 %>%
  select(date, rank, song, artist, weeks_popular = "weeks-on-board") %>%
  filter(
    weeks_popular >= 20,
    artist %in% c("Shakira", "Taylor Swift")
  )
## # A tibble: 250 × 5
## date rank song artist weeks_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-05-08 61 Willow Taylor Swift 20
## 2 2020-01-25 50 Lover Taylor Swift 22
## 3 2020-01-18 36 Lover Taylor Swift 21
## 4 2020-01-11 34 Lover Taylor Swift 20
## 5 2019-11-16 46 You Need To Calm Down Taylor Swift 21
## 6 2019-11-09 52 You Need To Calm Down Taylor Swift 20
## 7 2018-11-17 49 Delicate Taylor Swift 35
## 8 2018-11-10 43 Delicate Taylor Swift 34
## 9 2018-11-03 38 Delicate Taylor Swift 33
## 10 2018-10-27 34 Delicate Taylor Swift 32
## # ℹ 240 more rows
# distinct(): unique rows ----
# First look at every chart row whose artist is exactly "Drake".
billboard100 %>%
  select(date:artist, weeks_popular = "weeks-on-board") %>%
  filter(artist == "Drake")
## # A tibble: 787 × 5
## date rank song artist weeks_popular
## <date> <dbl> <chr> <chr> <dbl>
## 1 2021-11-06 91 No Friends In The Industry Drake 8
## 2 2021-10-30 87 No Friends In The Industry Drake 7
## 3 2021-10-30 90 Champagne Poetry Drake 7
## 4 2021-10-23 74 No Friends In The Industry Drake 6
## 5 2021-10-23 77 Champagne Poetry Drake 6
## 6 2021-10-16 64 No Friends In The Industry Drake 5
## 7 2021-10-16 65 Champagne Poetry Drake 5
## 8 2021-10-16 98 TSU Drake 5
## 9 2021-10-09 54 Champagne Poetry Drake 4
## 10 2021-10-09 60 No Friends In The Industry Drake 4
## # ℹ 777 more rows
# Unique song titles among the rows whose artist is exactly "Drake".
# The result is stored rather than printed.
# NOTE(review): the variable shares its name with dplyr::distinct(); consider
# renaming it if nothing else depends on this name.
distinct <- billboard100 %>%
  select(date:artist, weeks_popular = "weeks-on-board") %>%
  filter(artist == "Drake") %>%
  distinct(song)
# group_by() & summarise() ----
# Total weeks each Drake song spent on the chart.
# weeks-on-board is a running count that grows by one per chart week, so the
# total for a song is its largest value; averaging the running count (as this
# step did before) does not give a total. max() also matches the arrange step
# that computes the same total_weeks_popular column.
# NOTE: output captured before this fix was produced with mean() and must be
# regenerated.
billboard100 %>%
  select(date:artist, weeks_popular = "weeks-on-board") %>%
  filter(artist == "Drake") %>%
  group_by(song) %>%
  summarise(total_weeks_popular = max(weeks_popular))
## # A tibble: 108 × 2
## song total_weeks_popular
## <chr> <dbl>
## 1 0 To 100 / The Catch Up 10.5
## 2 10 Bands 7
## 3 30 For 30 Freestyle 1.5
## 4 6 God 1
## 5 6 Man 1
## 6 7am On Bridle Path 2
## 7 8 Out Of 10 2
## 8 9 3
## 9 9 AM In Dallas 1
## 10 Back To Back 10.5
## # ℹ 98 more rows
# arrange(): order rows ----
# Top 10 Drake songs by longest chart run; ties are broken alphabetically by
# song title.
billboard100 %>%
  select(date:artist, weeks_popular = "weeks-on-board") %>%
  filter(artist == "Drake") %>%
  group_by(song) %>%
  summarise(total_weeks_popular = max(weeks_popular)) %>%
  arrange(desc(total_weeks_popular), song) %>%
  head(n = 10)
## # A tibble: 10 × 2
## song total_weeks_popular
## <chr> <dbl>
## 1 God's Plan 36
## 2 Hotline Bling 36
## 3 Controlla 26
## 4 Fake Love 25
## 5 Headlines 25
## 6 Nice For What 25
## 7 Best I Ever Had 24
## 8 In My Feelings 22
## 9 Nonstop 22
## 10 Started From The Bottom 22
# count(): rows per group ----
# Number of chart rows per artist, most frequent first. sort = TRUE orders the
# result by descending n.
billboard100 %>%
  select(date:artist, weeks_popular = "weeks-on-board") %>%
  count(artist, sort = TRUE)
## # A tibble: 10,205 × 2
## artist n
## <chr> <int>
## 1 Taylor Swift 1023
## 2 Elton John 889
## 3 Madonna 857
## 4 Drake 787
## 5 Kenny Chesney 769
## 6 Tim McGraw 731
## 7 Keith Urban 673
## 8 Stevie Wonder 659
## 9 Rod Stewart 657
## 10 Mariah Carey 621
## # ℹ 10,195 more rows