Day5_supp, Spotify

Create a URL sequence function that will feed our scraper the right directions to find WHERE the data is.
Create a scraper function that will collect WHAT attributes we want and store them in a tibble.
Do some neat analysis with dplyr for visualization!

library(rvest)

## Loading required package: xml2

library(tidyverse)

## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.0.0     v purrr   0.2.5
## v tibble  1.4.2     v dplyr   0.7.6
## v tidyr   0.8.1     v stringr 1.3.1
## v readr   1.1.1     v forcats 0.3.0

## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter()         masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag()            masks stats::lag()
## x purrr::pluck()          masks rvest::pluck()

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:purrr':
## 
##     set_names

## The following object is masked from 'package:tidyr':
## 
##     extract

library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

library(knitr)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following object is masked from 'package:base':
## 
##     date

# Common prefix of every daily Canadian chart page; a date is appended to it
# to address one specific day.
url <- "https://spotifycharts.com/regional/ca/daily/"

# One Date per calendar day from 1 August to 31 August 2018, inclusive.
timevalues <- seq(
  from = as.Date("2018/08/01"),
  to = as.Date("2018/08/31"),
  by = "day"
)
head(timevalues)

## [1] "2018-08-01" "2018-08-02" "2018-08-03" "2018-08-04" "2018-08-05"
## [6] "2018-08-06"

#' Build the chart URL for each requested date.
#'
#' @param x Vector of dates (or any values `paste0()` can turn into text) to
#'   append to the base address, one URL per element.
#' @param base_url Prefix placed in front of every element of `x`. Defaults to
#'   the `url` variable visible from this function's environment, which is what
#'   the function always used before this argument existed.
#' @return A character vector with one URL per element of `x`; `character(0)`
#'   when `x` is empty.
unitedata <- function(x, base_url = url) {
  # paste0() treats a zero-length vector as "", so without this guard an empty
  # `x` would yield the bare base URL instead of no URLs at all.
  if (length(x) == 0L) {
    return(character(0))
  }
  paste0(base_url, x)
}
# Turn every date in the range into its chart URL, then preview the first few.
finalurl <- timevalues %>% unitedata()
head(finalurl)

## [1] "https://spotifycharts.com/regional/ca/daily/2018-08-01"
## [2] "https://spotifycharts.com/regional/ca/daily/2018-08-02"
## [3] "https://spotifycharts.com/regional/ca/daily/2018-08-03"
## [4] "https://spotifycharts.com/regional/ca/daily/2018-08-04"
## [5] "https://spotifycharts.com/regional/ca/daily/2018-08-05"
## [6] "https://spotifycharts.com/regional/ca/daily/2018-08-06"

#' Scrape one Spotify chart page into a tibble.
#'
#' @param x Location of a single chart page, passed straight to `read_html()`.
#' @return A tibble with the columns Rank, Track, Artist, Streams and Date,
#'   holding the text of the matching page elements exactly as it appears in
#'   the HTML (no cleaning or type conversion is done here).
SpotifyScrape <- function(x) {
  # Fetch and parse the page a single time; every column below is extracted
  # from this one document rather than re-downloading the page per column.
  page <- read_html(x)

  # Text of every node matching a CSS selector, in document order.
  node_text <- function(css) {
    page %>%
      html_nodes(css) %>%
      html_text()
  }

  # combine, name, and make it a tibble
  # data.frame() recycles a shorter column (a whole number of times), which is
  # how the date selector text ends up on every chart row.
  chart <- data.frame(
    Rank = node_text(".chart-table-position"),
    Track = node_text("strong"),
    Artist = node_text(".chart-table-track span"),
    Streams = node_text("td.chart-table-streams"),
    Date = node_text(".responsive-select~ .responsive-select+ .responsive-select .responsive-select-value")
  )
  as_tibble(chart)
}

# Scrape every daily page in turn and stack the per-day tables row-wise.
spotify <- finalurl %>%
  map(SpotifyScrape) %>%
  bind_rows()

# Convert the raw scraped text into analysis-ready columns.
spotify1 <- spotify %>%
  mutate(
    # Strip only the leading "by " label. An unanchored gsub() would also
    # delete "by " inside a name (e.g. "Baby Keem" would become "BaKeem").
    Artist = sub("^by ", "", Artist),
    # Remove thousands separators so the count parses as a number.
    Streams = as.numeric(gsub(",", "", Streams)),
    # Use the Date column of the piped data rather than reaching back into
    # the global `spotify`, so this step still lines up if rows are filtered
    # or reordered earlier in the pipeline.
    Date = as.Date(Date, format = "%m/%d/%Y")
  )
head(spotify1)

## # A tibble: 6 x 5
##   Rank  Track                                  Artist    Streams Date      
##   <fct> <chr>                                  <chr>       <dbl> <date>    
## 1 1     In My Feelings                         Drake      370629 2018-08-01
## 2 2     FEFE (feat. Nicki Minaj & Murda Beatz) 6ix9ine    256799 2018-08-01
## 3 3     No Brainer                             DJ Khaled  233672 2018-08-01
## 4 4     Taste (feat. Offset)                   Tyga       176738 2018-08-01
## 5 5     I Like It                              Cardi B    168749 2018-08-01
## 6 6     Girls Like You (feat. Cardi B)         Maroon 5   167706 2018-08-01

 # Horizontal bar chart of the 25 artists with the most total streams over the
 # scraped period, bars ordered by total and the stream axis shown in millions.
 spotify1 %>%
   group_by(Artist) %>%
   summarise(Total = sum(Streams)) %>%
   arrange(desc(Total)) %>%
   top_n(n = 25, wt = Total) %>%
   ggplot(mapping = aes(x = reorder(Artist, Total), y = Total)) +
   geom_col(fill = "forest green") +
   # Flip so the artist names read horizontally along the side.
   coord_flip() +
   scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6))