data <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2024/2024-05-07/rolling_stone.csv')
## Rows: 691 Columns: 21
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): sort_name, clean_name, album, genre, type, spotify_url, artist_gen...
## dbl (13): rank_2003, rank_2012, rank_2020, differential, release_year, weeks...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Select only the relevant columns, including additional rankings
data2 <- data %>%
select(sort_name, artist_member_count, rank_2003:rank_2020, artist_gender)
# Filter data for artist_member_count = 1 and no missing ranks
data_filtered <- data2 %>%
filter(artist_member_count == 1) %>%
filter(if_all(starts_with("rank_"), ~ !is.na(.)))
head(data_filtered)
## # A tibble: 6 × 6
## sort_name artist_member_count rank_2003 rank_2012 rank_2020 artist_gender
## <chr> <dbl> <dbl> <dbl> <dbl> <chr>
## 1 Sinatra, Frank 1 100 101 282 Male
## 2 Diddley, Bo 1 214 216 455 Male
## 3 Presley, Elvis 1 55 56 332 Male
## 4 Little Richard 1 50 50 227 Male
## 5 Davis, Miles 1 12 12 31 Male
## 6 Coltrane, John 1 102 103 232 Male
# Pivoting long to wide
data_long <- data_filtered %>%
pivot_longer(cols = starts_with("rank_"),
names_to = "year",
values_to = "rank")
head(data_long)
## # A tibble: 6 × 5
## sort_name artist_member_count artist_gender year rank
## <chr> <dbl> <chr> <chr> <dbl>
## 1 Sinatra, Frank 1 Male rank_2003 100
## 2 Sinatra, Frank 1 Male rank_2012 101
## 3 Sinatra, Frank 1 Male rank_2020 282
## 4 Diddley, Bo 1 Male rank_2003 214
## 5 Diddley, Bo 1 Male rank_2012 216
## 6 Diddley, Bo 1 Male rank_2020 455
# Pivoting wide to long (back to original if needed)
data_wide <- data_long %>%
pivot_wider(names_from = year,
values_from = rank)
## Warning: Values from `rank` are not uniquely identified; output will contain list-cols.
## • Use `values_fn = list` to suppress this warning.
## • Use `values_fn = {summary_fun}` to summarise duplicates.
## • Use the following dplyr code to identify duplicates.
## {data} |>
## dplyr::summarise(n = dplyr::n(), .by = c(sort_name, artist_member_count,
## artist_gender, year)) |>
## dplyr::filter(n > 1L)
head(data_wide)
## # A tibble: 6 × 6
## sort_name artist_member_count artist_gender rank_2003 rank_2012 rank_2020
## <chr> <dbl> <chr> <list> <list> <list>
## 1 Sinatra, Frank 1 Male <dbl [1]> <dbl [1]> <dbl [1]>
## 2 Diddley, Bo 1 Male <dbl [1]> <dbl [1]> <dbl [1]>
## 3 Presley, Elvis 1 Male <dbl [3]> <dbl [3]> <dbl [3]>
## 4 Little Richard 1 Male <dbl [1]> <dbl [1]> <dbl [1]>
## 5 Davis, Miles 1 Male <dbl [2]> <dbl [2]> <dbl [2]>
## 6 Coltrane, John 1 Male <dbl [2]> <dbl [2]> <dbl [2]>
options(tibble.print_max = 100, tibble.width = Inf)
# Separating sort_name into first and last names
data_sep <- data_filtered %>%
separate(sort_name, into = c("last_name", "first_name"), sep = ", ")
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 24 rows [4, 8, 9, 25, 26,
## 39, 50, 86, 89, 100, 104, 108, 112, 117, 118, 120, 121, 122, 124, 127, ...].
head(data_sep)
## # A tibble: 6 × 7
## last_name first_name artist_member_count rank_2003 rank_2012 rank_2020
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Sinatra Frank 1 100 101 282
## 2 Diddley Bo 1 214 216 455
## 3 Presley Elvis 1 55 56 332
## 4 Little Richard <NA> 1 50 50 227
## 5 Davis Miles 1 12 12 31
## 6 Coltrane John 1 102 103 232
## artist_gender
## <chr>
## 1 Male
## 2 Male
## 3 Male
## 4 Male
## 5 Male
## 6 Male
# Uniting artist_gender and artist_member_count into a single column
data_unite <- data_sep %>%
unite(gender_members, artist_gender, artist_member_count, sep = " - ")
head(data_unite)
## # A tibble: 6 × 6
## last_name first_name gender_members rank_2003 rank_2012 rank_2020
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Sinatra Frank Male - 1 100 101 282
## 2 Diddley Bo Male - 1 214 216 455
## 3 Presley Elvis Male - 1 55 56 332
## 4 Little Richard <NA> Male - 1 50 50 227
## 5 Davis Miles Male - 1 12 12 31
## 6 Coltrane John Male - 1 102 103 232
# Handling missing values in rank columns (rank_2003, rank_2012, rank_2020)
data_no_na <- data_unite %>%
mutate(
rank_2003 = ifelse(is.na(rank_2003), 0, rank_2003),
rank_2012 = ifelse(is.na(rank_2012), 0, rank_2012),
rank_2020 = ifelse(is.na(rank_2020), 0, rank_2020)
)
summary(data_no_na)
## last_name first_name gender_members rank_2003
## Length:135 Length:135 Length:135 Min. : 4.0
## Class :character Class :character Class :character 1st Qu.: 77.0
## Mode :character Mode :character Mode :character Median :163.0
## Mean :188.6
## 3rd Qu.:287.0
## Max. :494.0
## rank_2012 rank_2020
## Min. : 4.0 Min. : 1.0
## 1st Qu.: 76.5 1st Qu.: 71.5
## Median :163.0 Median :178.0
## Mean :185.4 Mean :199.4
## 3rd Qu.:283.0 3rd Qu.:313.0
## Max. :487.0 Max. :493.0