Remove Objects from Environment
# List the (non-hidden) objects currently defined in this environment.
ls()
## character(0)
# Remove every object that ls() reports. This only deletes objects; it does
# not detach loaded packages or reset options.
# NOTE(review): wiping the environment from inside a script is generally
# discouraged; prefer running the script in a fresh R session.
rm(list = ls())
Load packages:
library(rvest)#Easily Harvest (Scrape) Web Pages. html_nodes
library(tidyverse)#Designed to make it easy to install and load multiple 'tidyverse' packages in a single step.
library(magrittr)#A Forward-Pipe Operator for R
library(scales)
library(knitr)
library(lubridate)#Lubridate provides tools that make it easier to parse and manipulate dates.
library(ggrepel)#This package contains extra geoms for ggplot2.
This is the URL from where we are going to scrape the data: https://spotifycharts.com/regional/us/daily/YYYY-MM-DD
We are interested in the “US Top 200 daily hits” playlist, from Jan 1st to Dec 31st, 2018.
As you can see, the base URL is constant; the only part that changes between day 1 and day 365 is the date: YYYY-MM-DD
https://spotifycharts.com/regional/us/daily/2018-01-01 https://spotifycharts.com/regional/us/daily/2018-12-31
# Base address of the daily US chart; a date is appended to it later.
url <- "https://spotifycharts.com/regional/us/daily/"

# One Date value per calendar day of 2018.
timevalues <- seq(
  from = as.Date("2018/01/01"),
  to = as.Date("2018/12/31"),
  by = "day"
)

# Inspect both ends of the sequence.
head(timevalues)
tail(timevalues)
## [1] "2018-01-01" "2018-01-02" "2018-01-03" "2018-01-04" "2018-01-05"
## [6] "2018-01-06"
## [1] "2018-12-26" "2018-12-27" "2018-12-28" "2018-12-29" "2018-12-30"
## [6] "2018-12-31"
# Build the full chart URL for each date.
#
# Args:
#   x:        Dates (or date strings) to append. Vectorised: n values give
#             n URLs. Date objects are converted to "YYYY-MM-DD" text.
#   base_url: Prefix placed directly in front of each element of `x`.
#             Defaults to the global `url`, so existing one-argument calls
#             behave exactly as before.
#
# Returns:
#   A character vector with `base_url` pasted in front of each `x`.
concat.url <- function(x, base_url = url) {
  paste0(base_url, x)
}
# Expand every date in `timevalues` into its full chart URL.
finalurl <- concat.url(x = timevalues)

# Show the first two and the last two URLs as a sanity check.
head(finalurl, n = 2)
tail(finalurl, n = 2)
## [1] "https://spotifycharts.com/regional/us/daily/2018-01-01"
## [2] "https://spotifycharts.com/regional/us/daily/2018-01-02"
## [1] "https://spotifycharts.com/regional/us/daily/2018-12-30"
## [2] "https://spotifycharts.com/regional/us/daily/2018-12-31"
We can use SelectorGadget (a Chrome extension) to get the node names; it is a very handy point-and-click tool. Alternatively, in Google Chrome go to “View > Developer > View Source” and look for the classes.
# Scrape one daily chart page into a tibble.
#
# Args:
#   x: A single chart page location (here, a URL produced by concat.url()),
#      passed as-is to rvest's read_html().
#
# Returns:
#   A tibble with the columns Rank, Track, Artist, Streams and Date, holding
#   the raw text of the matching HTML nodes (no cleaning or type conversion
#   is done here).
SpotifyScrape <- function(x) {
  # Download and parse the page once. Every column below is read from this
  # single parsed document instead of re-fetching the URL per column.
  page <- read_html(x)

  # Text of all nodes matching a CSS selector, as a one-column data frame.
  node_text <- function(css) {
    page %>%
      html_nodes(css) %>%
      html_text() %>%
      as.data.frame()
  }

  # Combine the columns side by side. cbind() on data frames recycles the
  # date column to the length of the other columns.
  chart <- cbind(
    node_text(".chart-table-position"),
    node_text("strong"),
    node_text(".chart-table-track span"),
    node_text("td.chart-table-streams"),
    node_text(".responsive-select~ .responsive-select+ .responsive-select .responsive-select-value")
  )
  names(chart) <- c("Rank", "Track", "Artist", "Streams", "Date")

  # as_tibble() replaces the deprecated as.tibble().
  as_tibble(chart)
}
Uncomment the code below to scrape every day of the year and cache the result to disk:
#spotify <- map_df(finalurl, SpotifyScrape) #purrr: map_df() applies the function to each element of the input and row-binds the results into a single data frame.
#saveRDS(spotify, "spotifyRAW.rds")
The final tibble should have 5 columns and 200 rows * 365 days = 73,000 rows.
# Load the cached scrape result from disk instead of scraping again.
spotify <- readRDS(file = "spotifyRAW.rds")
# Check the size of the data (rows, columns).
dim(spotify)
## [1] 73000 5
# Preview the first ten rows.
head(spotify, 10)
## # A tibble: 10 x 5
## Rank Track Artist Streams Date
## <fct> <chr> <chr> <chr> <chr>
## 1 1 rockstar by Post Malone 1,502,394 01/01/20…
## 2 2 No Limit by G-Eazy 1,027,039 01/01/20…
## 3 3 Gucci Gang by Lil Pump 930,620 01/01/20…
## 4 4 Bartier Cardi (feat. 21 Sava… by Cardi B 877,478 01/01/20…
## 5 5 Havana by Camila Cabel… 860,232 01/01/20…
## 6 6 Ric Flair Drip (& Metro Boom… by Offset 833,470 01/01/20…
## 7 7 Him & I (with Halsey) by G-Eazy 823,508 01/01/20…
## 8 8 I Fall Apart by Post Malone 813,516 01/01/20…
## 9 9 Young Dumb & Broke by Khalid 734,845 01/01/20…
## 10 10 XO TOUR Llif3 by Lil Uzi Vert 683,284 01/01/20…
# Clean the raw scraped text columns and derive calendar features.
spotify <- spotify %>%
  mutate(
    # Strip only the leading "by " prefix. A global gsub("by ", ...) would
    # also delete "by " inside artist names (e.g. "Toby Keith" -> "ToKeith").
    Artist = sub("^\\s*by ", "", Artist),
    # Drop the thousands separators, then convert to a number.
    Streams = as.numeric(gsub(",", "", Streams, fixed = TRUE)),
    # Parse the mm/dd/yyyy text. Refer to the column itself rather than
    # `spotify$Date` so the expression always uses the rows being mutated.
    Date = as.Date(Date, format = "%m/%d/%Y"),
    # Labelled day of week and month (lubridate), derived from the parsed Date.
    WeekDay = wday(Date, label = TRUE),
    Month = month(Date, label = TRUE)
  )
print(spotify)
## # A tibble: 73,000 x 7
## Rank Track Artist Streams Date WeekDay Month
## <fct> <chr> <chr> <dbl> <date> <ord> <ord>
## 1 1 rockstar Post Malo… 1502394 2018-01-01 Mon Jan
## 2 2 No Limit G-Eazy 1027039 2018-01-01 Mon Jan
## 3 3 Gucci Gang Lil Pump 930620 2018-01-01 Mon Jan
## 4 4 Bartier Cardi (feat. … Cardi B 877478 2018-01-01 Mon Jan
## 5 5 Havana Camila Ca… 860232 2018-01-01 Mon Jan
## 6 6 Ric Flair Drip (& Met… Offset 833470 2018-01-01 Mon Jan
## 7 7 Him & I (with Halsey) G-Eazy 823508 2018-01-01 Mon Jan
## 8 8 I Fall Apart Post Malo… 813516 2018-01-01 Mon Jan
## 9 9 Young Dumb & Broke Khalid 734845 2018-01-01 Mon Jan
## 10 10 XO TOUR Llif3 Lil Uzi V… 683284 2018-01-01 Mon Jan
## # … with 72,990 more rows
# Total streams per track title, largest first, keeping the top 20.
# top_n() is called without `wt`, so it ranks by the last column
# (TotalStreams) and reports that choice in a message.
by_streams <- top_n(
  arrange(
    summarise(group_by(spotify, Track), TotalStreams = sum(Streams)),
    desc(TotalStreams)
  ),
  20
)
## Selecting by TotalStreams
# Horizontal bar chart of the top tracks, ordered by total streams.
ggplot(
  data = by_streams,
  mapping = aes(x = reorder(Track, TotalStreams), y = TotalStreams)
) +
  geom_col(fill = "sky blue") +
  # Optional value labels, currently disabled:
  # geom_label_repel(aes(label = total), size = 3) +
  coord_flip() +
  labs(
    title = "US 2018 | Most Streamed Songs | Gods Plan reached 453M",
    x = "Track Name",
    y = "Total Streams"
  )
# Total streams per artist, largest first, keeping the top 20.
# top_n() is called without `wt`, so it ranks by the last column
# (TotalStreams) and reports that choice in a message.
by_artist <- top_n(
  arrange(
    summarise(group_by(spotify, Artist), TotalStreams = sum(Streams)),
    desc(TotalStreams)
  ),
  20
)
## Selecting by TotalStreams
# Horizontal bar chart of the top artists, ordered by total streams.
ggplot(
  data = by_artist,
  mapping = aes(x = reorder(Artist, TotalStreams), y = TotalStreams)
) +
  geom_col(fill = "sky blue") +
  # Optional value labels, currently disabled:
  # geom_label_repel(aes(label = TotalStreams), size = 3) +
  coord_flip() +
  labs(
    title = "US 2018 | Most streamed Artist | Post Malone reached 2.2 billion",
    x = "Artist Name",
    y = "Total Streams"
  )
# Total streams per day of the week, largest first; printed for inspection.
by_WeekDay <- arrange(
  summarise(group_by(spotify, WeekDay), TotalStreams = sum(Streams)),
  desc(TotalStreams)
)
print(by_WeekDay)
## # A tibble: 7 x 2
## WeekDay TotalStreams
## <ord> <dbl>
## 1 Fri 4920574180
## 2 Sat 4438785742
## 3 Tue 4417362704
## 4 Mon 4383396249
## 5 Wed 4357561700
## 6 Thu 4319243008
## 7 Sun 3949123941
# Dashed line with points showing total streams for each day of the week.
# group = 1 puts all days in one group so a single line connects them.
by_WeekDay %>%
  ggplot(aes(x = WeekDay, y = TotalStreams, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  labs(
    # Friday total: 4,920,574,180
    title = "US 2018 | Most streamed Week Day | Friday reached 4.9 Billion",
    x = "Day of the Week",
    y = "Total Streams"
  )
# Total streams per month, largest first; printed for inspection.
by_Month <- arrange(
  summarise(group_by(spotify, Month), TotalStreams = sum(Streams)),
  desc(TotalStreams)
)
print(by_Month)
## # A tibble: 12 x 2
## Month TotalStreams
## <ord> <dbl>
## 1 Dec 2803948705
## 2 Jul 2709741839
## 3 Apr 2700911924
## 4 May 2660144924
## 5 Jun 2643193009
## 6 Oct 2639432245
## 7 Aug 2602465431
## 8 Sep 2534878749
## 9 Mar 2510433792
## 10 Nov 2497397049
## 11 Jan 2282281225
## 12 Feb 2201218632
# Dashed line with points showing total streams for each month.
# group = 1 puts all months in one group so a single line connects them.
by_Month %>%
  ggplot(aes(x = Month, y = TotalStreams, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  labs(
    # December total: 2,803,948,705
    title = "US 2018 | Most streamed Month | December reached 2.8 Billion",
    x = "Month",
    y = "Total Streams"
  )
# Sum the streams of every track title and keep the 100 largest totals.
# top_n() is called without `wt`, so it ranks by the last column
# (TotalStreams) and reports that choice in a message.
by_streams2 <- top_n(
  arrange(
    summarise(group_by(spotify, Track), TotalStreams = sum(Streams)),
    desc(TotalStreams)
  ),
  100
)
## Selecting by TotalStreams
# One row per distinct (Track, Artist) pair seen in the charts.
# distinct() already keeps only the named columns, so no select() is needed.
spotify2 <- spotify %>%
  distinct(Track, Artist)

# Attach an artist to each top track to prepare for fetching lyrics.
# A track title that appears with several artists yields several joined rows;
# keep the first row per Track. De-duplicating on the join key (rather than on
# TotalStreams) avoids dropping a different track that happens to have an
# identical stream total.
top100songs <- by_streams2 %>%
  left_join(spotify2, by = "Track") %>%
  arrange(desc(TotalStreams)) %>%
  select(Artist, Track, TotalStreams) %>%
  distinct(Track, .keep_all = TRUE)
print(top100songs)
## # A tibble: 100 x 3
## Artist Track TotalStreams
## <chr> <chr> <dbl>
## 1 Drake God's Plan 453226629
## 2 XXXTENTACION SAD! 332633597
## 3 Post Malone Psycho (feat. Ty Dolla $ign) 306877012
## 4 Juice WRLD Lucid Dreams 299907223
## 5 BlocBoy JB Look Alive (feat. Drake) 266861797
## 6 Drake Nice For What 263455062
## 7 Post Malone Better Now 254098207
## 8 Drake In My Feelings 245715031
## 9 XXXTENTACION Moonlight 245626527
## 10 Cardi B I Like It 240430007
## # … with 90 more rows
# Persist the top-tracks table to disk for later steps.
saveRDS(object = top100songs, file = "top100songs.rds")