Import your data

# Import the data from the Excel file
data <- read_excel("../00_data/my.Data.xlsx")

# Random sampling: Select specific columns and filter out missing values
set.seed(1254)
data_small <- data %>%
    select(genre, release_year, differential) %>%  # Update columns for your dataset
    filter(!is.na(differential)) %>%  # Remove rows with missing differential values
    sample_n(10)

data_small
## # A tibble: 10 × 3
##    genre                               release_year differential
##    <chr>                                      <dbl>        <dbl>
##  1 NA                                          1967         -325
##  2 Punk/Post-Punk/New Wave/Power Pop           1980           81
##  3 Soul/Gospel/R&B                             2009           34
##  4 Country/Folk/Country Rock/Folk Rock         1966          -29
##  5 Blues/Blues Rock                            1968            1
##  6 Hip-Hop/Rap                                 2005          131
##  7 NA                                          1972          -52
##  8 Hard Rock/Metal                             1971            8
##  9 NA                                          1977         -158
## 10 NA                                          1971         -147

Pivoting

data_wide <- data_small %>%
    pivot_wider(names_from = genre, values_from = differential)
data_wide
## # A tibble: 9 × 8
##   release_year  `NA` `Punk/Post-Punk/New Wave/Power Pop` `Soul/Gospel/R&B`
##          <dbl> <dbl>                               <dbl>             <dbl>
## 1         1967  -325                                  NA                NA
## 2         1980    NA                                  81                NA
## 3         2009    NA                                  NA                34
## 4         1966    NA                                  NA                NA
## 5         1968    NA                                  NA                NA
## 6         2005    NA                                  NA                NA
## 7         1972   -52                                  NA                NA
## 8         1971  -147                                  NA                NA
## 9         1977  -158                                  NA                NA
## # ℹ 4 more variables: `Country/Folk/Country Rock/Folk Rock` <dbl>,
## #   `Blues/Blues Rock` <dbl>, `Hip-Hop/Rap` <dbl>, `Hard Rock/Metal` <dbl>

wide to long form

# Pivot data from wide back to long
data_wide %>%
    pivot_longer(-release_year, names_to = "genre", values_to = "differential", values_drop_na = TRUE)
## # A tibble: 10 × 3
##    release_year genre                               differential
##           <dbl> <chr>                                      <dbl>
##  1         1967 NA                                          -325
##  2         1980 Punk/Post-Punk/New Wave/Power Pop             81
##  3         2009 Soul/Gospel/R&B                               34
##  4         1966 Country/Folk/Country Rock/Folk Rock          -29
##  5         1968 Blues/Blues Rock                               1
##  6         2005 Hip-Hop/Rap                                  131
##  7         1972 NA                                           -52
##  8         1971 NA                                          -147
##  9         1971 Hard Rock/Metal                                8
## 10         1977 NA                                          -158

Separating and Uniting

# Unite two columns (like_count and dislike_count)
data_united <- data %>%
    unite(col = "newName", rank_2020:differential, sep = "/", remove = TRUE)

data_united
## # A tibble: 691 × 21
##     ...1 sort_name     clean_name album rank_2003 rank_2012 newName release_year
##    <dbl> <chr>         <chr>      <chr> <chr>     <chr>     <chr>          <dbl>
##  1     1 Sinatra, Fra… Frank Sin… "In … 100       101       282/-1…         1955
##  2     2 Diddley, Bo   Bo Diddley "Bo … 214       216       455/-2…         1955
##  3     3 Presley, Elv… Elvis Pre… "Elv… 55        56        332/-2…         1956
##  4     4 Sinatra, Fra… Frank Sin… "Son… 306       308       NA/-195         1956
##  5     5 Little Richa… Little Ri… "Her… 50        50        227/-1…         1957
##  6     6 Beyonce       Beyonce    "Lem… NA        NA        32/469          2016
##  7     7 Winehouse, A… Amy Wineh… "Bac… NA        451       33/468          2006
##  8     8 Crickets      Buddy Hol… "The… 421       420       NA/-80          1957
##  9     9 Bush, Kate    Kate Bush  "Hou… NA        NA        68/433          1985
## 10    10 Davis, Miles  Miles Dav… "Kin… 12        12        31/-19          1959
## # ℹ 681 more rows
## # ℹ 13 more variables: genre <chr>, type <chr>, weeks_on_billboard <chr>,
## #   peak_billboard_position <dbl>, spotify_popularity <chr>, spotify_url <chr>,
## #   artist_member_count <chr>, artist_gender <chr>,
## #   artist_birth_year_sum <chr>, debut_album_release_year <chr>,
## #   ave_age_at_top_500 <chr>, years_between <chr>, album_id <chr>

Separate a column

# Separate the united column back into original columns
data_united %>%
    separate(col = newName, into = c("rank_2020", "differential"), sep = "/")
## # A tibble: 691 × 22
##     ...1 sort_name   clean_name album rank_2003 rank_2012 rank_2020 differential
##    <dbl> <chr>       <chr>      <chr> <chr>     <chr>     <chr>     <chr>       
##  1     1 Sinatra, F… Frank Sin… "In … 100       101       282       -182        
##  2     2 Diddley, Bo Bo Diddley "Bo … 214       216       455       -241        
##  3     3 Presley, E… Elvis Pre… "Elv… 55        56        332       -277        
##  4     4 Sinatra, F… Frank Sin… "Son… 306       308       NA        -195        
##  5     5 Little Ric… Little Ri… "Her… 50        50        227       -177        
##  6     6 Beyonce     Beyonce    "Lem… NA        NA        32        469         
##  7     7 Winehouse,… Amy Wineh… "Bac… NA        451       33        468         
##  8     8 Crickets    Buddy Hol… "The… 421       420       NA        -80         
##  9     9 Bush, Kate  Kate Bush  "Hou… NA        NA        68        433         
## 10    10 Davis, Mil… Miles Dav… "Kin… 12        12        31        -19         
## # ℹ 681 more rows
## # ℹ 14 more variables: release_year <dbl>, genre <chr>, type <chr>,
## #   weeks_on_billboard <chr>, peak_billboard_position <dbl>,
## #   spotify_popularity <chr>, spotify_url <chr>, artist_member_count <chr>,
## #   artist_gender <chr>, artist_birth_year_sum <chr>,
## #   debut_album_release_year <chr>, ave_age_at_top_500 <chr>,
## #   years_between <chr>, album_id <chr>

Missing Values

# Summarize missing values
data %>%
    summarise(
        total_missing = sum(is.na(differential)),
        percent_missing = mean(is.na(differential)) * 100
    )
## # A tibble: 1 × 2
##   total_missing percent_missing
##           <int>           <dbl>
## 1             0               0