- Create a URL sequence function that will feed our scraper the right directions to find WHERE the data is.
- Create a scraper function that will extract WHAT attributes we want and collect them into a tibble.
- Do some neat analysis with dplyr for visualization!
library(rvest)
## Loading required package: xml2
library(tidyverse)
## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.0.0 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.6
## v tidyr 0.8.1 v stringr 1.3.1
## v readr 1.1.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x readr::guess_encoding() masks rvest::guess_encoding()
## x dplyr::lag() masks stats::lag()
## x purrr::pluck() masks rvest::pluck()
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:purrr':
##
## set_names
## The following object is masked from 'package:tidyr':
##
## extract
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(knitr)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:base':
##
## date
# Base address of the "ca" regional daily chart pages; a date is appended to
# it for each request.
url <- "https://spotifycharts.com/regional/ca/daily/"

# One Date per day from 1 August 2018 to 31 August 2018, endpoints included.
timevalues <- seq(
  from = as.Date("2018/08/01"),
  to = as.Date("2018/08/31"),
  by = "day"
)
head(timevalues)
## [1] "2018-08-01" "2018-08-02" "2018-08-03" "2018-08-04" "2018-08-05"
## [6] "2018-08-06"
#' Build one chart URL per date
#'
#' @param x Dates to append to the base address: a Date vector, or anything
#'   else `paste0()` can coerce to character.
#' @param base_url Address prefix the dates are appended to. Defaults to the
#'   script-level `url`, so existing calls of the form `unitedata(dates)`
#'   behave exactly as before.
#' @return A character vector of URLs, one per element of `x`.
unitedata <- function(x, base_url = url) {
  paste0(base_url, x)
}
# One chart URL for every date in `timevalues`.
finalurl <- unitedata(x = timevalues)
head(finalurl)
## [1] "https://spotifycharts.com/regional/ca/daily/2018-08-01"
## [2] "https://spotifycharts.com/regional/ca/daily/2018-08-02"
## [3] "https://spotifycharts.com/regional/ca/daily/2018-08-03"
## [4] "https://spotifycharts.com/regional/ca/daily/2018-08-04"
## [5] "https://spotifycharts.com/regional/ca/daily/2018-08-05"
## [6] "https://spotifycharts.com/regional/ca/daily/2018-08-06"
#' Scrape one daily chart page into a tibble
#'
#' @param x URL of a chart page (passed straight to `read_html()`).
#' @return A tibble with the columns Rank, Track, Artist, Streams and Date.
#'   All columns hold the text scraped from the page; parsing and cleaning
#'   are left to the caller.
SpotifyScrape <- function(x) {
  # Download and parse the page once. Every selector below reuses this
  # document instead of re-fetching the same URL for each column.
  page <- read_html(x)

  # Text of all nodes matching a CSS selector, as a character vector.
  node_text <- function(css) {
    page %>%
      html_nodes(css) %>%
      html_text()
  }

  # data.frame() recycles shorter columns to the common row count; the Date
  # selector presumably yields a single value per page (TODO confirm).
  chart <- data.frame(
    Rank = node_text(".chart-table-position"),
    Track = node_text("strong"),
    Artist = node_text(".chart-table-track span"),
    Streams = node_text("td.chart-table-streams"),
    Date = node_text(
      ".responsive-select~ .responsive-select+ .responsive-select .responsive-select-value"
    )
  )

  # as_tibble() replaces the deprecated as.tibble().
  as_tibble(chart)
}
# Scrape every daily page and stack the per-day tibbles into one.
spotify <- map_df(finalurl, SpotifyScrape)

# Clean the scraped text columns.
spotify1 <- spotify %>%
  mutate(
    # Drop only the leading "by " label. An unanchored pattern would also
    # cut "by " out of the middle of a name ("Baby Keem" -> "BaKeem").
    Artist = sub("^by ", "", Artist),
    # Strip the comma separators before converting to numeric.
    Streams = as.numeric(gsub(",", "", Streams, fixed = TRUE)),
    # Use the Date column of the data being mutated rather than reaching
    # back to spotify$Date, so this step keeps working if rows are filtered
    # or reordered earlier in the pipe.
    Date = as.Date(Date, format = "%m/%d/%Y")
  )
head(spotify1)
## # A tibble: 6 x 5
## Rank Track Artist Streams Date
## <fct> <chr> <chr> <dbl> <date>
## 1 1 In My Feelings Drake 370629 2018-08-01
## 2 2 FEFE (feat. Nicki Minaj & Murda Beatz) 6ix9ine 256799 2018-08-01
## 3 3 No Brainer DJ Khaled 233672 2018-08-01
## 4 4 Taste (feat. Offset) Tyga 176738 2018-08-01
## 5 5 I Like It Cardi B 168749 2018-08-01
## 6 6 Girls Like You (feat. Cardi B) Maroon 5 167706 2018-08-01
# Sum streams per artist, keep the 25 largest totals (top_n keeps ties) and
# draw them as flipped bars with stream totals labelled in millions.
spotify1 %>%
  group_by(Artist) %>%
  summarise(Total = sum(Streams)) %>%
  arrange(desc(Total)) %>%
  top_n(n = 25, wt = Total) %>%
  ggplot(mapping = aes(x = reorder(Artist, Total), y = Total)) +
  geom_col(fill = "forest green") +
  coord_flip() +
  scale_y_continuous(labels = unit_format(unit = "M", scale = 1e-6))
