I am going to web-scrape data on best-selling music artists and then clean, wrangle, analyze, and visualize it.

library(rvest)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()         masks stats::filter()
## ✖ readr::guess_encoding() masks rvest::guess_encoding()
## ✖ dplyr::lag()            masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Ask the robotstxt package whether this path may be fetched by a scraper.
# The result is only printed; the visible script does not branch on it.
robotstxt::paths_allowed("https://en.wikipedia.org/wiki/List_of_best-selling_music_artists")
## 
 en.wikipedia.org
## [1] TRUE
# Download and parse the page, then convert every element with class
# "wikitable" into a tibble. `charts` ends up as a list with one tibble per
# table found on the page.
# html_elements() is the current name for html_nodes(), which rvest 1.0.0
# superseded; the selection it performs is the same.
chart_html <- read_html("https://en.wikipedia.org/wiki/List_of_best-selling_music_artists")
charts <- chart_html %>%
  html_elements(".wikitable") %>%
  html_table()
charts
## [[1]]
## # A tibble: 9 × 7
##   Artist          Country        `Period active`    Release-year of firs…¹ Genre
##   <chr>           <chr>          <chr>              <chr>                  <chr>
## 1 The Beatles     United Kingdom 1960–1970[11]      1962[11]               Rock…
## 2 Elvis Presley   United States  1953–1977[46]      1956[46]               Rock…
## 3 Michael Jackson United States  1964–2009[57]      1971[57]               Pop …
## 4 Elton John      United Kingdom 1962–present[71]   1970[71]               Pop …
## 5 Queen           United Kingdom 1971–present[79]   1973[79]               Rock…
## 6 Madonna         United States  1979–present[85]   1983[85]               Pop …
## 7 Led Zeppelin    United Kingdom 1968–1980[95]      1969[95]               Hard…
## 8 Rihanna         Barbados       2003–present[103]  2005[103]              R&B …
## 9 Pink Floyd      United Kingdom 1965–1996, 2005, … 1967[110]              Prog…
## # ℹ abbreviated name: ¹​`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## #   `Claimed sales` <chr>
## 
## [[2]]
## # A tibble: 9 × 7
##   Artist             Country        `Period active` Release-year of firs…¹ Genre
##   <chr>              <chr>          <chr>           <chr>                  <chr>
## 1 Eminem             United States  1996–present[1… 1999[118]              Hip-…
## 2 Mariah Carey       United States  1988–present[1… 1990[120]              R&B …
## 3 Taylor Swift       United States  2006–present[1… 2006[125]              Pop …
## 4 Beyoncé            United States  1997–present[1… 2002[129][130]         R&B …
## 5 Whitney Houston    United States  1977–2012[136]  1984[136]              R&B …
## 6 Eagles             United States  1971–1980, 199… 1972[141]              Rock…
## 7 Celine Dion        Canada         1981–present[1… 1981[144]              Pop …
## 8 AC/DC              Australia      1973–present[1… 1975[151]              Hard…
## 9 The Rolling Stones United Kingdom 1962–present[1… 1963[154]              Rock…
## # ℹ abbreviated name: ¹​`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## #   `Claimed sales` <chr>
## 
## [[3]]
## # A tibble: 28 × 7
##    Artist            Country        `Period active` Release-year of firs…¹ Genre
##    <chr>             <chr>          <chr>           <chr>                  <chr>
##  1 Drake             Canada         2001–present[1… 2009[158]              Hip-…
##  2 Garth Brooks      United States  1989–present[1… 1989[160]              Coun…
##  3 Kanye West        United States  1996–present[1… 2003[163]              Hip-…
##  4 Billy Joel        United States  1964–present[1… 1971[166]              Pop …
##  5 Justin Bieber     Canada         2008–present[1… 2009[169]              Pop …
##  6 Ed Sheeran        United Kingdom 2004–present[1… 2011[171]              Pop …
##  7 Bruno Mars        United States  2004–present[1… 2010[173]              Pop …
##  8 Bruce Springsteen United States  1972–present[1… 1973[177]              Rock…
##  9 U2                Ireland        1976–present[1… 1980[180]              Rock…
## 10 Aerosmith         United States  1970–present[1… 1973[185]              Hard…
## # ℹ 18 more rows
## # ℹ abbreviated name: ¹​`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## #   `Claimed sales` <chr>
## 
## [[4]]
## # A tibble: 40 × 7
##    Artist          Country          `Period active` Release-year of firs…¹ Genre
##    <chr>           <chr>            <chr>           <chr>                  <chr>
##  1 Nicki Minaj     Trinidad and To… 2008–present[2… 2010[249]              Hip-…
##  2 Coldplay        United Kingdom   1997–present[2… 1999[251]              Alte…
##  3 Linkin Park     United States    1996–present[2… 2000[254]              Alte…
##  4 George Strait   United States    1981–present[2… 1984[256]              Coun…
##  5 Pink            United States    1995–present[2… 2000[258]              Pop …
##  6 Britney Spears  United States    1998–present[2… 1998[260]              Pop …
##  7 B'z             Japan            1988–present[2… 1988[263]              Rock…
##  8 Shania Twain    Canada           1993–present[2… 1993[265]              Coun…
##  9 Guns N' Roses   United States    1985–present[2… 1987[267]              Hard…
## 10 Backstreet Boys United States    1993–present[2… 1995[270]              Pop[…
## # ℹ 30 more rows
## # ℹ abbreviated name: ¹​`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## #   `Claimed sales` <chr>
## 
## [[5]]
## # A tibble: 17 × 7
##    Artist              Country      `Period active` Release-year of firs…¹ Genre
##    <chr>               <chr>        <chr>           <chr>                  <chr>
##  1 Shakira             Colombia     1988–present[3… 1995[354]              Lati…
##  2 Alicia Keys         United Stat… 1996–present[3… 2001[356]              Hip-…
##  3 Christina Aguilera  United Stat… 1993–present[3… 1998[358]              R&B …
##  4 Lionel Richie       United Stat… 1968–present[3… 1981[360]              Pop …
##  5 Johnny Cash         United Stat… 1954–2003[363]  1956[363]              Coun…
##  6 Justin Timberlake   United Stat… 1992–present[3… 2002[365]              Pop …
##  7 Ariana Grande       United Stat… 2008–present[3… 2013[367]              Pop …
##  8 R.E.M.              United Stat… 1980–2011[369]  1983[369]              Alte…
##  9 Post Malone         United Stat… 2013–present[3… 2015[371]              Pop …
## 10 Flo Rida            United Stat… 2007–present[3… 2007[373]              Hip-…
## 11 Usher               United Stat… 1991–present[3… 1994[375]              R&B …
## 12 Tim McGraw          United Stat… 1990–present[3… 1994[377]              Coun…
## 13 The Black Eyed Peas United Stat… 1995–present[3… 1998[379]              Hip-…
## 14 Van Halen           United Stat… 1978–2020[381]  1978[381]              Hard…
## 15 Ayumi Hamasaki      Japan        1998–present[3… 1998[383]              J-po…
## 16 Tom Petty           United Stat… 1976–2017[386]  1977[386]              Rock…
## 17 Johnny Hallyday     France       1957–2017[388]  1960[388]              Rock…
## # ℹ abbreviated name: ¹​`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## #   `Claimed sales` <chr>
## 
## [[6]]
## # A tibble: 18 × 7
##    Artist          Country        `Period active`   Release-year of firs…¹ Genre
##    <chr>           <chr>          <chr>             <chr>                  <chr>
##  1 The Weeknd      Canada         2010–present[391] 2013[391]              Pop …
##  2 Imagine Dragons United States  2008–present[393] 2012[393]              Pop …
##  3 Luke Bryan      United States  2001–present[395] 2007[375]              Coun…
##  4 Tupac Shakur    United States  1991–1996[397]    1991[397]              Hip-…
##  5 Alabama         United States  1972–present[400] 1980[400]              Coun…
##  6 R. Kelly        United States  1989–2019[402]    1991[403]              R&B …
##  7 Nirvana         United States  1987–1994[405]    1990[405]              Grun…
##  8 Robbie Williams United Kingdom 1990–present[409] 1996[409]              Pop …
##  9 Bob Seger       United States  1961–present[411] 1967[411]              Rock…
## 10 Kenny G         United States  1982–present[413] 1984[413]              Smoo…
## 11 Green Day       United States  1987–present[415] 1994[379]              Punk…
## 12 Enya            Ireland        1982–present[418] 1987[418]              New-…
## 13 Bryan Adams     Canada         1979–present[422] 1979[422]              Rock…
## 14 Bob Marley      Jamaica        1962–1981[425]    1975[425]              Regg…
## 15 The Police      United Kingdom 1977–19862007–20… 1978[429]              Pop …
## 16 Barry Manilow   United States  1973–present[433] 1973[433]              Pop …
## 17 Kiss            United States  1972–present[435] 1974[435]              Hard…
## 18 Aretha Franklin United States  1956–2018[437]    1961[437]              Soul…
## # ℹ abbreviated name: ¹​`Release-year of first charted record`
## # ℹ 2 more variables: `Total certified units(from available markets)[b]` <chr>,
## #   `Claimed sales` <chr>
# Check the container type of the scraped tables before combining them.
class(charts)
## [1] "list"

Combine the list of tables into a single data frame.

# Peek at the first rows of the first two scraped tables, then list the
# column headers of the first table.
charts[[1]] %>% head()
charts[[2]] %>% head()
charts[[1]] %>% names()
## [1] "Artist"                                          
## [2] "Country"                                         
## [3] "Period active"                                   
## [4] "Release-year of first charted record"            
## [5] "Genre"                                           
## [6] "Total certified units(from available markets)[b]"
## [7] "Claimed sales"
# Preview the nested form: one row per scraped table, each cell a tibble.
tibble(charts)
# Stack the per-table tibbles into one table. bind_rows() does this directly,
# without first wrapping the list in a list-column and unnesting it again.
artists_chart <- bind_rows(charts)
artists_chart
names(artists_chart)
## [1] "Artist"                                          
## [2] "Country"                                         
## [3] "Period active"                                   
## [4] "Release-year of first charted record"            
## [5] "Genre"                                           
## [6] "Total certified units(from available markets)[b]"
## [7] "Claimed sales"
# Confirm the combined object is a tibble.
class(artists_chart)
## [1] "tbl_df"     "tbl"        "data.frame"
# Column-by-column overview of the combined table (row count, types, and the
# first few values of each column).
glimpse(artists_chart)
## Rows: 121
## Columns: 7
## $ Artist                                             <chr> "The Beatles", "Elv…
## $ Country                                            <chr> "United Kingdom", "…
## $ `Period active`                                    <chr> "1960–1970[11]", "1…
## $ `Release-year of first charted record`             <chr> "1962[11]", "1956[4…
## $ Genre                                              <chr> "Rock / pop[11]", "…
## $ `Total certified units(from available markets)[b]` <chr> ".mw-parser-output …
## $ `Claimed sales`                                    <chr> "600 million[43][44…

CLEANING THE DATA

# Give the two long headers (columns 4 and 6) short names, then turn every
# whitespace character in the headers into "_" and title-case the result.
names(artists_chart)[c(4, 6)] <- c("Release year", "Total certified")
names(artists_chart) <- names(artists_chart) %>%
  gsub("\\s", "_", x = .) %>%
  str_to_title()
names(artists_chart)
## [1] "Artist"          "Country"         "Period_active"   "Release_year"   
## [5] "Genre"           "Total_certified" "Claimed_sales"
#artists_chart<-artists_chart%>%rename(Total certified units(from available markets)[b]=Total certified units)

CLEAN COLUMNS

# --- Strip citation markers -------------------------------------------------
# Cells scraped from the page carry citation markers such as "[11]".
# str_remove_all() is required here: a cell can hold several markers in a row
# (e.g. "2002[129][130]"), and str_remove() would delete only the first one.
select(artists_chart, Release_year, Genre, Period_active)
artists_chart <- artists_chart %>%
  mutate(across(
    c(Release_year, Genre, Period_active),
    ~ str_remove_all(.x, "\\[[:digit:]+]")
  ))
select(artists_chart, Release_year, Genre, Period_active)

# --- Genre frequencies --------------------------------------------------------
# A Genre cell lists several genres separated by "/" (e.g. "Rock / pop").
# Split on the slash together with the whitespace around it and squish the
# pieces, so padded values such as "Rock " and " pop" are not counted as
# separate genres.
# This is computed before Period_active is split below, so artists with several
# active periods are counted once.
# NOTE(review): capitalisation is left as scraped, so "pop" and "Pop" remain
# distinct entries -- normalise the case if they should be merged.
a <- artists_chart %>%
  separate_rows(Genre, sep = "\\s*/\\s*") %>%
  mutate(Genre = str_squish(Genre)) %>%
  count(Genre)
a

# --- One row per active period ------------------------------------------------
# Some artists have several comma-separated periods ("1965–1996, 2005, ...").
# Split them into separate rows, consuming the whitespace after each comma so
# later periods do not start with a blank.
# From here on an artist can occupy more than one row of artists_chart.
# NOTE(review): at least one cell has two periods fused without a comma
# ("1977–19862007–20..."); this split does not separate those.
# NOTE(review): the ranges use an en dash ("–"), not a hyphen, so a later split
# into start/end years must split on "–".
artists_chart <- artists_chart %>%
  separate_rows(Period_active, sep = ",\\s*") %>%
  mutate(Period_active = str_squish(Period_active))
artists_chart