I am going to web-scrape data on the best-selling music artists and then clean, wrangle, analyze, and visualize it.
library(rvest)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Ask robots.txt whether this page may be scraped before fetching it.
robotstxt::paths_allowed(
  paths = "https://en.wikipedia.org/wiki/List_of_best-selling_music_artists"
)
##
en.wikipedia.org
## [1] TRUE
# Download the Wikipedia page and parse it into an HTML document.
chart_html <- read_html(
  "https://en.wikipedia.org/wiki/List_of_best-selling_music_artists"
)

# Pull every element with the "wikitable" class and convert each one to a
# tibble; the result is a list with one tibble per table on the page.
# html_elements() replaces html_nodes(), which rvest has superseded.
charts <- chart_html %>%
  html_elements(".wikitable") %>%
  html_table()
charts
## [[1]]
## # A tibble: 9 × 7
## Artist Country `Period active` Release-year of firs…¹ Genre
## <chr> <chr> <chr> <chr> <chr>
## 1 The Beatles United Kingdom 1960–1970[11] 1962[11] Rock…
## 2 Elvis Presley United States 1953–1977[46] 1956[46] Rock…
## 3 Michael Jackson United States 1964–2009[57] 1971[57] Pop …
## 4 Elton John United Kingdom 1962–present[71] 1970[71] Pop …
## 5 Queen United Kingdom 1971–present[79] 1973[79] Rock…
## 6 Madonna United States 1979–present[85] 1983[85] Pop …
## 7 Led Zeppelin United Kingdom 1968–1980[95] 1969[95] Hard…
## 8 Rihanna Barbados 2003–present[103] 2005[103] R&B …
## 9 Pink Floyd United Kingdom 1965–1996, 2005, … 1967[110] Prog…
## # ℹ abbreviated name: ¹`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## # `Claimed sales` <chr>
##
## [[2]]
## # A tibble: 9 × 7
## Artist Country `Period active` Release-year of firs…¹ Genre
## <chr> <chr> <chr> <chr> <chr>
## 1 Eminem United States 1996–present[1… 1999[118] Hip-…
## 2 Mariah Carey United States 1988–present[1… 1990[120] R&B …
## 3 Taylor Swift United States 2006–present[1… 2006[125] Pop …
## 4 Beyoncé United States 1997–present[1… 2002[129][130] R&B …
## 5 Whitney Houston United States 1977–2012[136] 1984[136] R&B …
## 6 Eagles United States 1971–1980, 199… 1972[141] Rock…
## 7 Celine Dion Canada 1981–present[1… 1981[144] Pop …
## 8 AC/DC Australia 1973–present[1… 1975[151] Hard…
## 9 The Rolling Stones United Kingdom 1962–present[1… 1963[154] Rock…
## # ℹ abbreviated name: ¹`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## # `Claimed sales` <chr>
##
## [[3]]
## # A tibble: 28 × 7
## Artist Country `Period active` Release-year of firs…¹ Genre
## <chr> <chr> <chr> <chr> <chr>
## 1 Drake Canada 2001–present[1… 2009[158] Hip-…
## 2 Garth Brooks United States 1989–present[1… 1989[160] Coun…
## 3 Kanye West United States 1996–present[1… 2003[163] Hip-…
## 4 Billy Joel United States 1964–present[1… 1971[166] Pop …
## 5 Justin Bieber Canada 2008–present[1… 2009[169] Pop …
## 6 Ed Sheeran United Kingdom 2004–present[1… 2011[171] Pop …
## 7 Bruno Mars United States 2004–present[1… 2010[173] Pop …
## 8 Bruce Springsteen United States 1972–present[1… 1973[177] Rock…
## 9 U2 Ireland 1976–present[1… 1980[180] Rock…
## 10 Aerosmith United States 1970–present[1… 1973[185] Hard…
## # ℹ 18 more rows
## # ℹ abbreviated name: ¹`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## # `Claimed sales` <chr>
##
## [[4]]
## # A tibble: 40 × 7
## Artist Country `Period active` Release-year of firs…¹ Genre
## <chr> <chr> <chr> <chr> <chr>
## 1 Nicki Minaj Trinidad and To… 2008–present[2… 2010[249] Hip-…
## 2 Coldplay United Kingdom 1997–present[2… 1999[251] Alte…
## 3 Linkin Park United States 1996–present[2… 2000[254] Alte…
## 4 George Strait United States 1981–present[2… 1984[256] Coun…
## 5 Pink United States 1995–present[2… 2000[258] Pop …
## 6 Britney Spears United States 1998–present[2… 1998[260] Pop …
## 7 B'z Japan 1988–present[2… 1988[263] Rock…
## 8 Shania Twain Canada 1993–present[2… 1993[265] Coun…
## 9 Guns N' Roses United States 1985–present[2… 1987[267] Hard…
## 10 Backstreet Boys United States 1993–present[2… 1995[270] Pop[…
## # ℹ 30 more rows
## # ℹ abbreviated name: ¹`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## # `Claimed sales` <chr>
##
## [[5]]
## # A tibble: 17 × 7
## Artist Country `Period active` Release-year of firs…¹ Genre
## <chr> <chr> <chr> <chr> <chr>
## 1 Shakira Colombia 1988–present[3… 1995[354] Lati…
## 2 Alicia Keys United Stat… 1996–present[3… 2001[356] Hip-…
## 3 Christina Aguilera United Stat… 1993–present[3… 1998[358] R&B …
## 4 Lionel Richie United Stat… 1968–present[3… 1981[360] Pop …
## 5 Johnny Cash United Stat… 1954–2003[363] 1956[363] Coun…
## 6 Justin Timberlake United Stat… 1992–present[3… 2002[365] Pop …
## 7 Ariana Grande United Stat… 2008–present[3… 2013[367] Pop …
## 8 R.E.M. United Stat… 1980–2011[369] 1983[369] Alte…
## 9 Post Malone United Stat… 2013–present[3… 2015[371] Pop …
## 10 Flo Rida United Stat… 2007–present[3… 2007[373] Hip-…
## 11 Usher United Stat… 1991–present[3… 1994[375] R&B …
## 12 Tim McGraw United Stat… 1990–present[3… 1994[377] Coun…
## 13 The Black Eyed Peas United Stat… 1995–present[3… 1998[379] Hip-…
## 14 Van Halen United Stat… 1978–2020[381] 1978[381] Hard…
## 15 Ayumi Hamasaki Japan 1998–present[3… 1998[383] J-po…
## 16 Tom Petty United Stat… 1976–2017[386] 1977[386] Rock…
## 17 Johnny Hallyday France 1957–2017[388] 1960[388] Rock…
## # ℹ abbreviated name: ¹`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## # `Claimed sales` <chr>
##
## [[6]]
## # A tibble: 18 × 7
## Artist Country `Period active` Release-year of firs…¹ Genre
## <chr> <chr> <chr> <chr> <chr>
## 1 The Weeknd Canada 2010–present[391] 2013[391] Pop …
## 2 Imagine Dragons United States 2008–present[393] 2012[393] Pop …
## 3 Luke Bryan United States 2001–present[395] 2007[375] Coun…
## 4 Tupac Shakur United States 1991–1996[397] 1991[397] Hip-…
## 5 Alabama United States 1972–present[400] 1980[400] Coun…
## 6 R. Kelly United States 1989–2019[402] 1991[403] R&B …
## 7 Nirvana United States 1987–1994[405] 1990[405] Grun…
## 8 Robbie Williams United Kingdom 1990–present[409] 1996[409] Pop …
## 9 Bob Seger United States 1961–present[411] 1967[411] Rock…
## 10 Kenny G United States 1982–present[413] 1984[413] Smoo…
## 11 Green Day United States 1987–present[415] 1994[379] Punk…
## 12 Enya Ireland 1982–present[418] 1987[418] New-…
## 13 Bryan Adams Canada 1979–present[422] 1979[422] Rock…
## 14 Bob Marley Jamaica 1962–1981[425] 1975[425] Regg…
## 15 The Police United Kingdom 1977–19862007–20… 1978[429] Pop …
## 16 Barry Manilow United States 1973–present[433] 1973[433] Pop …
## 17 Kiss United States 1972–present[435] 1974[435] Hard…
## 18 Aretha Franklin United States 1956–2018[437] 1961[437] Soul…
## # ℹ abbreviated name: ¹`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## # `Claimed sales` <chr>
# Check what kind of container the scraped tables came back in.
charts %>% class()
## [1] "list"
Change the list of tables into a single data frame.
# Preview the first rows of the first two scraped tables, then list the
# column names of the first table.
charts[[1]] %>% head()
charts[[2]] %>% head()
charts[[1]] %>% names()
## [1] "Artist"
## [2] "Country"
## [3] "Period active"
## [4] "Release-year of first charted record"
## [5] "Genre"
## [6] "Total certified units(from available markets)[b]"
## [7] "Claimed sales"
# Show the scraped tables wrapped as a single list-column named `charts`.
tibble(charts = charts)

# Expand that list-column so the rows of every scraped table are stacked
# into one tibble.
artists_chart <- tibble(charts = charts) %>%
  unnest(cols = charts)
artists_chart
names(artists_chart)
## [1] "Artist"
## [2] "Country"
## [3] "Period active"
## [4] "Release-year of first charted record"
## [5] "Genre"
## [6] "Total certified units(from available markets)[b]"
## [7] "Claimed sales"
# Confirm the combined object is a tibble / data frame.
artists_chart %>% class()
## [1] "tbl_df" "tbl" "data.frame"
# Compact overview of every column: its type and first few values.
artists_chart %>% glimpse()
## Rows: 121
## Columns: 7
## $ Artist <chr> "The Beatles", "Elv…
## $ Country <chr> "United Kingdom", "…
## $ `Period active` <chr> "1960–1970[11]", "1…
## $ `Release-year of first charted record` <chr> "1962[11]", "1956[4…
## $ Genre <chr> "Rock / pop[11]", "…
## $ `Total certified units(from available markets)[b]` <chr> ".mw-parser-output …
## $ `Claimed sales` <chr> "600 million[43][44…
CLEANING THE DATA
# Shorten the two long scraped headers. The columns are matched by the start
# of their name rather than by position, so a reordered or extended source
# table cannot cause the wrong column to be renamed.
# NOTE(review): if a prefix matches no column, rename() leaves the names
# untouched -- check the printed names below.
artists_chart <- artists_chart %>%
  rename(
    `Total certified` = starts_with("Total certified"),
    `Release year` = starts_with("Release-year")
  ) %>%
  # Replace whitespace with underscores, then capitalise only the first
  # letter of each name (e.g. "Period active" -> "Period_active").
  rename_with(~ str_to_title(gsub("\\s", "_", .x)))
names(artists_chart)
## [1] "Artist" "Country" "Period_active" "Release_year"
## [5] "Genre" "Total_certified" "Claimed_sales"
#artists_chart<-artists_chart%>%rename(Total certified units(from available markets)[b]=Total certified units)
CLEAN COLUMNS
# Release year as scraped, before cleaning.
select(artists_chart, Release_year)

# Strip Wikipedia citation markers such as "[11]". A cell can carry more than
# one marker (e.g. "2002[129][130]"), so every match has to be removed;
# str_remove() would only drop the first and leave "2002[130]" behind.
artists_chart <- artists_chart %>%
  mutate(Release_year = str_remove_all(Release_year, "\\[\\d+\\]"))
select(artists_chart, Release_year)
# Strip Wikipedia citation markers such as "[11]" from the genre text.
artists_chart <- artists_chart %>%
  mutate(Genre = str_remove_all(Genre, "\\[\\d+\\]"))
select(artists_chart, Genre)

# Count how often each genre appears. A cell may list several genres
# separated by "/" (e.g. "Rock / pop"), so each one gets its own row first.
# The separator swallows the spaces around "/" and the values are squished,
# otherwise "Rock " and " pop" would be counted apart from "Rock" and "pop".
# NOTE(review): capitalisation is left as scraped, so labels that differ only
# in case are still counted separately -- decide whether to normalise case.
a <- artists_chart %>%
  separate_rows(Genre, sep = "\\s*/\\s*") %>%
  mutate(Genre = str_squish(Genre)) %>%
  count(Genre)
a
# Active periods as scraped, before cleaning.
select(artists_chart, Period_active)

# Strip citation markers such as "[11]". Then repair cells where two periods
# were glued together with no separator (e.g. "1977–19862007–2008") by
# inserting ", " between two directly adjacent four-digit years.
artists_chart <- artists_chart %>%
  mutate(
    Period_active = Period_active %>%
      str_remove_all("\\[\\d+\\]") %>%
      str_replace_all("(\\d{4})(?=\\d{4})", "\\1, ")
  )
artists_chart

# Give each active period its own row. The separator swallows the space after
# the comma and the values are squished, so later periods do not start with a
# blank. After this step an artist with several periods occupies several
# rows, with the other columns repeated on each.
artists_chart <- artists_chart %>%
  separate_rows(Period_active, sep = ",\\s*") %>%
  mutate(Period_active = str_squish(Period_active))
artists_chart
#artists_chart<-artists_chart%>%mutate(Period_active=(str_remove_all(Period_active,"\\[[:digit:]+]")))
#select(artists_chart,Period_active)
#artists_chart<-artists_chart%>%select(Period_active)%>%separate(Period_active,c("FROM","TO","TOO"),"-|"))