Packages

Dataset

df <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQx0EAzcq52X0nGCMkBjG7xWj8mvMIFaoVFIJynfeVuH4bYMW4Hua2zQmefpAb1mLT7S5ZP07ILmOx4/pub?gid=1510102788&single=true&output=csv")

## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   miRNA = col_character(),
##   read_count = col_double(),
##   precursor = col_character(),
##   total = col_double(),
##   seq = col_double(),
##   `seq(norm)` = col_double(),
##   outcome = col_character()
## )

Clean and normalize the column names

 df <- df %>% 
  janitor::clean_names() %>%  # clean the names
  janitor::remove_empty(which = c("rows", "cols")) #remove empty rows and columns

Delete any space before or after values names

df <- df %>% 
  mutate(mi_rna    = str_trim(mi_rna,    side = "both"), 
         precursor = str_trim(precursor, side = "both"))

add an ID

df <- df %>% 
  rowid_to_column("id")

Check levels names

dplyr::n_distinct(df$mi_rna) # count the different values of the variable

## [1] 2656

dplyr::n_distinct(df$precursor)

## [1] 1917

Divide and conquer.

The regex solution would be more elegant, though

create a dataset with the names that match

Store all the columns that match and didn’t require a new name

df_true <- df %>%
  # standarize the names, all to lower
  mutate(mi_rna = stringr::str_to_lower(mi_rna)) %>%
  # identify the matches
  mutate(matches = case_when(mi_rna == precursor ~ "TRUE",
                             TRUE ~ "FALSE")) %>%
  # store only the rows that match
  filter(matches == "TRUE") %>% 
    # create a new column 
  mutate(new_name = precursor) %>% 
  select(id, mi_rna, precursor, new_name, read_count, total:seq_norm)

now the ones that differ

df_false <- df %>%
  # standarize the names, all to lower
  mutate(mi_rna = stringr::str_to_lower(mi_rna)) %>%
  # identify the matches
  mutate(matches = case_when(mi_rna == precursor ~ "TRUE",
                             TRUE ~ "FALSE")) %>%
  # filter the FALSE values
  filter(matches == "FALSE") %>%
  # extract the last three chr of mi_rna
  mutate(ending_mi_rna = str_sub(mi_rna, start = -3)) %>%
  # now merge the columns
  unite(new_name,  c("precursor", "ending_mi_rna"), sep = "", remove = FALSE) %>% 
  select(id, mi_rna, precursor, new_name, read_count, total:seq_norm)

df_clean <- dplyr::bind_rows(df_true, df_false) %>% 
  arrange(id)

Delete dataframes

rm(df_false, df_true)

head(df_clean)

## # A tibble: 6 x 8
##      id mi_rna        precursor   new_name     read_count  total    seq seq_norm
##   <int> <chr>         <chr>       <chr>             <dbl>  <dbl>  <dbl>    <dbl>
## 1     1 hsa-let-7a-5p hsa-let-7a… hsa-let-7a-…     323533 323533 323533 18064.  
## 2     2 hsa-let-7a-3p hsa-let-7a… hsa-let-7a-…        365    365    365    20.4 
## 3     3 hsa-let-7a-5p hsa-let-7a… hsa-let-7a-…     323643 323643 323643 18071.  
## 4     4 hsa-let-7a-2… hsa-let-7a… hsa-let-7a-…          7      7      7     0.39
## 5     5 hsa-let-7a-5p hsa-let-7a… hsa-let-7a-…     323583 323583 323583 18067.  
## 6     6 hsa-let-7a-3p hsa-let-7a… hsa-let-7a-…        364    364    364    20.3

dplyr::n_distinct(df_clean$mi_rna)

## [1] 2656

dplyr::n_distinct(df_clean$precursor)

## [1] 1917

dplyr::n_distinct(df_clean$new_name)

## [1] 2884

Katrina Genes Names

Sergio Uribe