Import your data
# Import the data from the Excel file
data <- read_excel("../00_data/my.Data.xlsx")
# Random sampling: Select specific columns and filter out missing values
set.seed(1254)
data_small <- data %>%
select(genre, release_year, differential) %>% # Update columns for your dataset
filter(!is.na(differential)) %>% # Remove rows with missing differential values
sample_n(10)
data_small
## # A tibble: 10 × 3
## genre release_year differential
## <chr> <dbl> <dbl>
## 1 NA 1967 -325
## 2 Punk/Post-Punk/New Wave/Power Pop 1980 81
## 3 Soul/Gospel/R&B 2009 34
## 4 Country/Folk/Country Rock/Folk Rock 1966 -29
## 5 Blues/Blues Rock 1968 1
## 6 Hip-Hop/Rap 2005 131
## 7 NA 1972 -52
## 8 Hard Rock/Metal 1971 8
## 9 NA 1977 -158
## 10 NA 1971 -147
Pivoting
data_wide <- data_small %>%
pivot_wider(names_from = genre, values_from = differential)
data_wide
## # A tibble: 9 × 8
## release_year `NA` `Punk/Post-Punk/New Wave/Power Pop` `Soul/Gospel/R&B`
## <dbl> <dbl> <dbl> <dbl>
## 1 1967 -325 NA NA
## 2 1980 NA 81 NA
## 3 2009 NA NA 34
## 4 1966 NA NA NA
## 5 1968 NA NA NA
## 6 2005 NA NA NA
## 7 1972 -52 NA NA
## 8 1971 -147 NA NA
## 9 1977 -158 NA NA
## # ℹ 4 more variables: `Country/Folk/Country Rock/Folk Rock` <dbl>,
## # `Blues/Blues Rock` <dbl>, `Hip-Hop/Rap` <dbl>, `Hard Rock/Metal` <dbl>
Separating and Uniting
# Unite two columns (like_count and dislike_count)
data_united <- data %>%
unite(col = "newName", rank_2020:differential, sep = "/", remove = TRUE)
data_united
## # A tibble: 691 × 21
## ...1 sort_name clean_name album rank_2003 rank_2012 newName release_year
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <dbl>
## 1 1 Sinatra, Fra… Frank Sin… "In … 100 101 282/-1… 1955
## 2 2 Diddley, Bo Bo Diddley "Bo … 214 216 455/-2… 1955
## 3 3 Presley, Elv… Elvis Pre… "Elv… 55 56 332/-2… 1956
## 4 4 Sinatra, Fra… Frank Sin… "Son… 306 308 NA/-195 1956
## 5 5 Little Richa… Little Ri… "Her… 50 50 227/-1… 1957
## 6 6 Beyonce Beyonce "Lem… NA NA 32/469 2016
## 7 7 Winehouse, A… Amy Wineh… "Bac… NA 451 33/468 2006
## 8 8 Crickets Buddy Hol… "The… 421 420 NA/-80 1957
## 9 9 Bush, Kate Kate Bush "Hou… NA NA 68/433 1985
## 10 10 Davis, Miles Miles Dav… "Kin… 12 12 31/-19 1959
## # ℹ 681 more rows
## # ℹ 13 more variables: genre <chr>, type <chr>, weeks_on_billboard <chr>,
## # peak_billboard_position <dbl>, spotify_popularity <chr>, spotify_url <chr>,
## # artist_member_count <chr>, artist_gender <chr>,
## # artist_birth_year_sum <chr>, debut_album_release_year <chr>,
## # ave_age_at_top_500 <chr>, years_between <chr>, album_id <chr>
Separate a column
# Separate the united column back into original columns
data_united %>%
separate(col = newName, into = c("rank_2020", "differential"), sep = "/")
## # A tibble: 691 × 22
## ...1 sort_name clean_name album rank_2003 rank_2012 rank_2020 differential
## <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 1 Sinatra, F… Frank Sin… "In … 100 101 282 -182
## 2 2 Diddley, Bo Bo Diddley "Bo … 214 216 455 -241
## 3 3 Presley, E… Elvis Pre… "Elv… 55 56 332 -277
## 4 4 Sinatra, F… Frank Sin… "Son… 306 308 NA -195
## 5 5 Little Ric… Little Ri… "Her… 50 50 227 -177
## 6 6 Beyonce Beyonce "Lem… NA NA 32 469
## 7 7 Winehouse,… Amy Wineh… "Bac… NA 451 33 468
## 8 8 Crickets Buddy Hol… "The… 421 420 NA -80
## 9 9 Bush, Kate Kate Bush "Hou… NA NA 68 433
## 10 10 Davis, Mil… Miles Dav… "Kin… 12 12 31 -19
## # ℹ 681 more rows
## # ℹ 14 more variables: release_year <dbl>, genre <chr>, type <chr>,
## # weeks_on_billboard <chr>, peak_billboard_position <dbl>,
## # spotify_popularity <chr>, spotify_url <chr>, artist_member_count <chr>,
## # artist_gender <chr>, artist_birth_year_sum <chr>,
## # debut_album_release_year <chr>, ave_age_at_top_500 <chr>,
## # years_between <chr>, album_id <chr>
Missing Values
# Summarize missing values
data %>%
summarise(
total_missing = sum(is.na(differential)),
percent_missing = mean(is.na(differential)) * 100
)
## # A tibble: 1 × 2
## total_missing percent_missing
## <int> <dbl>
## 1 0 0