Step 1/3 | Scraping Spotify Data

Remove Objects from Environment

# List the objects currently defined in the environment (the recorded output
# below shows it was empty when this was run).
ls()

## character(0)

# Remove every object returned by ls() so the analysis starts from a clean slate.
# NOTE(review): this wipes the caller's workspace if the script is sourced into
# an existing session; running in a fresh R session is the safer way to get a
# clean environment.
rm(list = ls())

Load packages:

library(rvest)#Easily Harvest (Scrape) Web Pages. html_nodes
library(tidyverse)#Designed to make it easy to install and load multiple 'tidyverse' packages in a single step.
library(magrittr)#A Forward-Pipe Operator for R
library(scales)
library(knitr)
library(lubridate)#Lubridate provides tools that make it easier to parse and manipulate dates.
library(ggrepel)#This package contains extra geoms for ggplot2.

Data Scraping:

This is the URL from where we are going to scrape the data: https://spotifycharts.com/regional/us/daily/YYYY-MM-DD

We are interested in the “US Top 200 daily hits” playlist. From Jan 1st to Dec 31st for 2018.

As you can see, the base URL is constant; the only part that changes from day 1 to day 365 is the date: YYYY-MM-DD

https://spotifycharts.com/regional/us/daily/2018-01-01 https://spotifycharts.com/regional/us/daily/2018-12-31

1. Create a fixed base URL:

# Base URL shared by every daily US chart page; concat.url() appends the date to it.
url <- "https://spotifycharts.com/regional/us/daily/"

2. Define the date range in a sequence:

# One Date per calendar day of 2018, from 1 January through 31 December.
timevalues <- seq(
  from = as.Date("2018/01/01"),
  to = as.Date("2018/12/31"),
  by = "day"
)
# Peek at both ends of the sequence to confirm the range.
head(timevalues)
tail(timevalues)

## [1] "2018-01-01" "2018-01-02" "2018-01-03" "2018-01-04" "2018-01-05"
## [6] "2018-01-06"

## [1] "2018-12-26" "2018-12-27" "2018-12-28" "2018-12-29" "2018-12-30"
## [6] "2018-12-31"

3. Create a function to concatenate the fixed base URL + the sequence of dates:

#' Build full chart URLs by appending each date to a base URL.
#'
#' @param x Vector of dates (Date objects or date strings) to append.
#' @param base_url Prefix placed in front of every element of `x`. Defaults to
#'   the script-level `url`, so existing one-argument calls behave as before.
#' @return Character vector with one URL per element of `x`.
concat.url <- function(x, base_url = url) {
  # paste0() is vectorised and converts Date values to "YYYY-MM-DD" text.
  paste0(base_url, x)
}
# Expand the date sequence into one chart URL per day.
finalurl <- concat.url(timevalues)
# Show the first two and last two URLs as a sanity check.
head(finalurl, n = 2)
tail(finalurl, n = 2)

## [1] "https://spotifycharts.com/regional/us/daily/2018-01-01"
## [2] "https://spotifycharts.com/regional/us/daily/2018-01-02"

## [1] "https://spotifycharts.com/regional/us/daily/2018-12-30"
## [2] "https://spotifycharts.com/regional/us/daily/2018-12-31"

4. Create a function that reads HTML and extracts HTML nodes.

We can use SelectorGadget (a Chrome extension) to get the node names; it is a very handy point-and-click tool. Alternatively, in Google Chrome go to “View>Developer>View Source” and look for the classes.

#' Scrape one daily Spotify chart page into a tibble.
#'
#' The page is downloaded and parsed once; every column is then extracted from
#' that single parsed document instead of re-requesting the page per column.
#'
#' @param x URL of a daily chart page (passed straight to `read_html()`).
#' @return A tibble with columns Rank, Track, Artist, Streams and Date holding
#'   the text scraped from the page.
SpotifyScrape <- function(x){
  doc <- read_html(x) # Fetch + parse the page a single time

  # Text of every node matching a CSS selector, as a one-column data frame.
  node_text <- function(css) {
    doc %>%
      html_nodes(css) %>% # RVEST.PKG: select nodes with a CSS selector
      html_text() %>%     # RVEST.PKG: extract the text of each node
      as.data.frame()
  }

  rank <- node_text('.chart-table-position')
  track <- node_text('strong')
  artist <- node_text('.chart-table-track span')
  streams <- node_text('td.chart-table-streams')
  # NOTE(review): this selector presumably matches a single node (the selected
  # chart date) whose value cbind() recycles across all rows - confirm.
  dates <- node_text('.responsive-select~ .responsive-select+ .responsive-select .responsive-select-value')

  # Combine the columns, name them, and return the result as a tibble.
  chart <- cbind(rank, track, artist, streams, dates)
  names(chart) <- c("Rank", "Track", "Artist", "Streams", "Date")
  as_tibble(chart) # as.tibble() is deprecated in favour of as_tibble()
}

5. Scrape….scrape….scrape..this step will take a few minutes…

Uncomment the code below:

#spotify <- map_df(finalurl, SpotifyScrape) #PURRR.PKG: map_df() applies the function to each element of its input and row-binds the results into a single data frame.
#saveRDS(spotify, "spotifyRAW.rds")

6. Check data frame dimensions:

The final tibble should have 5 columns & (200 rows * 365 days) 73,000 rows

# Load the previously saved scrape result from disk instead of re-scraping.
spotify <- readRDS("spotifyRAW.rds")
# Number of rows and columns in the loaded data.
dim(spotify)

## [1] 73000     5

# Preview the first ten rows.
head(spotify, n = 10)

## # A tibble: 10 x 5
##    Rank  Track                         Artist           Streams   Date     
##    <fct> <chr>                         <chr>            <chr>     <chr>    
##  1 1     rockstar                      by Post Malone   1,502,394 01/01/20…
##  2 2     No Limit                      by G-Eazy        1,027,039 01/01/20…
##  3 3     Gucci Gang                    by Lil Pump      930,620   01/01/20…
##  4 4     Bartier Cardi (feat. 21 Sava… by Cardi B       877,478   01/01/20…
##  5 5     Havana                        by Camila Cabel… 860,232   01/01/20…
##  6 6     Ric Flair Drip (& Metro Boom… by Offset        833,470   01/01/20…
##  7 7     Him & I (with Halsey)         by G-Eazy        823,508   01/01/20…
##  8 8     I Fall Apart                  by Post Malone   813,516   01/01/20…
##  9 9     Young Dumb & Broke            by Khalid        734,845   01/01/20…
## 10 10    XO TOUR Llif3                 by Lil Uzi Vert  683,284   01/01/20…

7. Data cleaning

Remove “by” and “,”
Transform character values into dates
Extract the day of the week and month out of the date variable

# Tidy the raw scraped text in place (%<>% assigns the result back to spotify):
#   Artist  - strip only the leading "by " prefix; a global replace would also
#             mangle names that contain "by " (e.g. "Lil Baby & Gunna")
#   Streams - drop thousands separators, then convert to numeric
#   Date    - parse the mm/dd/yyyy text into a Date
#   WeekDay / Month - labelled day-of-week and month derived from Date
spotify %<>%
  mutate(
    Artist = sub("^by ", "", Artist),
    Streams = as.numeric(gsub(",", "", Streams)),
    # Refer to the column directly rather than reaching back into spotify$Date.
    Date = as.Date(Date, format = "%m/%d/%Y"),
    WeekDay = wday(Date, label = TRUE), # LUBRIDATE.PKG: day of the week
    Month = month(Date, label = TRUE)   # LUBRIDATE.PKG: month of the year
  ) %>%
  print()

## # A tibble: 73,000 x 7
##    Rank  Track                  Artist     Streams Date       WeekDay Month
##    <fct> <chr>                  <chr>        <dbl> <date>     <ord>   <ord>
##  1 1     rockstar               Post Malo… 1502394 2018-01-01 Mon     Jan  
##  2 2     No Limit               G-Eazy     1027039 2018-01-01 Mon     Jan  
##  3 3     Gucci Gang             Lil Pump    930620 2018-01-01 Mon     Jan  
##  4 4     Bartier Cardi (feat. … Cardi B     877478 2018-01-01 Mon     Jan  
##  5 5     Havana                 Camila Ca…  860232 2018-01-01 Mon     Jan  
##  6 6     Ric Flair Drip (& Met… Offset      833470 2018-01-01 Mon     Jan  
##  7 7     Him & I (with Halsey)  G-Eazy      823508 2018-01-01 Mon     Jan  
##  8 8     I Fall Apart           Post Malo…  813516 2018-01-01 Mon     Jan  
##  9 9     Young Dumb & Broke     Khalid      734845 2018-01-01 Mon     Jan  
## 10 10    XO TOUR Llif3          Lil Uzi V…  683284 2018-01-01 Mon     Jan  
## # … with 72,990 more rows

8. Let’s find some insights with descriptive statistics

Most streamed songs

# Total 2018 streams per track title, sorted descending, keeping the 20 largest
# totals (top_n() keeps ties at the cut-off). The ranking column is passed
# explicitly instead of relying on top_n()'s implicit "last column" default,
# which would silently change if another column were added.
by_streams <- spotify %>%
  group_by(Track) %>%
  summarise(TotalStreams = sum(Streams)) %>%
  arrange(desc(TotalStreams)) %>%
  top_n(20, wt = TotalStreams)

# Horizontal bar chart of the most-streamed tracks; reorder() sorts the bars
# by total streams and coord_flip() turns them sideways so titles are readable.
ggplot(by_streams, aes(x = reorder(Track, TotalStreams), y = TotalStreams)) +
  geom_col(fill = "sky blue") +
  coord_flip() +
  labs(
    title = "US 2018 | Most Streamed Songs | Gods Plan reached 453M",
    x = "Track Name",
    y = "Total Streams"
  )

Most streamed artists

# Total 2018 streams per artist, sorted descending, keeping the 20 largest
# totals (top_n() keeps ties at the cut-off). The ranking column is passed
# explicitly instead of relying on top_n()'s implicit "last column" default.
by_artist <- spotify %>%
  group_by(Artist) %>%
  summarise(TotalStreams = sum(Streams)) %>%
  arrange(desc(TotalStreams)) %>%
  top_n(20, wt = TotalStreams)

# Horizontal bar chart of the most-streamed artists, bars ordered by total
# streams.
ggplot(by_artist, aes(x = reorder(Artist, TotalStreams), y = TotalStreams)) +
  geom_col(fill = "sky blue") +
  coord_flip() +
  labs(
    title = "US 2018 | Most streamed Artist | Post Malone reached 2.2 billion",
    x = "Artist Name",
    y = "Total Streams"
  )

Most streamed day of the week

# Total streams for each day of the week, busiest day first.
by_WeekDay <- spotify %>%
  group_by(WeekDay) %>%
  summarise(TotalStreams = sum(Streams)) %>%
  arrange(desc(TotalStreams))
# Display the summary table.
print(by_WeekDay)

## # A tibble: 7 x 2
##   WeekDay TotalStreams
##   <ord>          <dbl>
## 1 Fri       4920574180
## 2 Sat       4438785742
## 3 Tue       4417362704
## 4 Mon       4383396249
## 5 Wed       4357561700
## 6 Thu       4319243008
## 7 Sun       3949123941

# Dashed line with points showing total streams per day of the week.
# group = 1 puts all rows in one group so the points are joined by one line.
by_WeekDay %>%
  ggplot(aes(x = WeekDay, y = TotalStreams, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  labs(
    title = "US 2018 | Most streamed Week Day | Friday reached 4.9 Billion", # 4,920,574,180
    x = "Day of the Week",
    y = "Total Streams"
  )

Most streamed month of the year

# Total streams for each month, busiest month first.
by_Month <- spotify %>%
  group_by(Month) %>%
  summarise(TotalStreams = sum(Streams)) %>%
  arrange(desc(TotalStreams))
# Display the summary table.
print(by_Month)

## # A tibble: 12 x 2
##    Month TotalStreams
##    <ord>        <dbl>
##  1 Dec     2803948705
##  2 Jul     2709741839
##  3 Apr     2700911924
##  4 May     2660144924
##  5 Jun     2643193009
##  6 Oct     2639432245
##  7 Aug     2602465431
##  8 Sep     2534878749
##  9 Mar     2510433792
## 10 Nov     2497397049
## 11 Jan     2282281225
## 12 Feb     2201218632

# Dashed line with points showing total streams per month.
# group = 1 puts all rows in one group so the points are joined by one line.
by_Month %>%
  ggplot(aes(x = Month, y = TotalStreams, group = 1)) +
  geom_line(linetype = "dashed") +
  geom_point() +
  labs(
    title = "US 2018 | Most streamed Month | December reached 2.8 Billion", # 2,803,948,705
    x = "Month",
    y = "Total Streams"
  )

9. Keep & save only the top 100 most streamed Tracks for 2018 in USA:

# Group by track title and sum total streams, keeping the 100 largest totals
# (top_n() keeps ties at the cut-off). The ranking column is passed explicitly
# instead of relying on top_n()'s implicit "last column" default.
by_streams2 <- spotify %>%
  group_by(Track) %>%
  summarise(TotalStreams = sum(Streams)) %>%
  arrange(desc(TotalStreams)) %>%
  top_n(100, wt = TotalStreams)

# Lookup table of the unique (Track, Artist) pairs in the chart data.
# distinct() with named columns already returns only those columns.
spotify2 <- distinct(spotify, Track, Artist)

# Left join to attach an artist to each top track (prep for getting the lyrics).
# A track title can match several artists in spotify2, which fans the join out
# into multiple rows per track; keep the first row for each track.
# De-duplicating on Track (not on TotalStreams) ensures two different tracks
# that happen to tie on total streams are both kept.
top100songs <- by_streams2 %>%
  left_join(spotify2, by = "Track") %>%
  arrange(desc(TotalStreams)) %>%
  select(Artist, Track, TotalStreams) %>%
  distinct(Track, .keep_all = TRUE) %>%
  print()

## # A tibble: 100 x 3
##    Artist       Track                        TotalStreams
##    <chr>        <chr>                               <dbl>
##  1 Drake        God's Plan                      453226629
##  2 XXXTENTACION SAD!                            332633597
##  3 Post Malone  Psycho (feat. Ty Dolla $ign)    306877012
##  4 Juice WRLD   Lucid Dreams                    299907223
##  5 BlocBoy JB   Look Alive (feat. Drake)        266861797
##  6 Drake        Nice For What                   263455062
##  7 Post Malone  Better Now                      254098207
##  8 Drake        In My Feelings                  245715031
##  9 XXXTENTACION Moonlight                       245626527
## 10 Cardi B      I Like It                       240430007
## # … with 90 more rows

# Write the top-tracks table to disk as an RDS file.
saveRDS(top100songs, file = "top100songs.rds")