df <- read_csv("https://docs.google.com/spreadsheets/d/e/2PACX-1vQx0EAzcq52X0nGCMkBjG7xWj8mvMIFaoVFIJynfeVuH4bYMW4Hua2zQmefpAb1mLT7S5ZP07ILmOx4/pub?gid=1510102788&single=true&output=csv")
##
## ── Column specification ────────────────────────────────────────────────────────
## cols(
## miRNA = col_character(),
## read_count = col_double(),
## precursor = col_character(),
## total = col_double(),
## seq = col_double(),
## `seq(norm)` = col_double(),
## outcome = col_character()
## )
Clean and normalize the column names
df <- df %>%
janitor::clean_names() %>% # clean the names
janitor::remove_empty(which = c("rows", "cols")) #remove empty rows and columns
Delete any space before or after values names
df <- df %>%
mutate(mi_rna = str_trim(mi_rna, side = "both"),
precursor = str_trim(precursor, side = "both"))
add an ID
df <- df %>%
rowid_to_column("id")
dplyr::n_distinct(df$mi_rna) # count the different values of the variable
## [1] 2656
dplyr::n_distinct(df$precursor)
## [1] 1917
The regex solution would be more elegant, though
Store all the columns that match and didn’t require a new name
df_true <- df %>%
# standarize the names, all to lower
mutate(mi_rna = stringr::str_to_lower(mi_rna)) %>%
# identify the matches
mutate(matches = case_when(mi_rna == precursor ~ "TRUE",
TRUE ~ "FALSE")) %>%
# store only the rows that match
filter(matches == "TRUE") %>%
# create a new column
mutate(new_name = precursor) %>%
select(id, mi_rna, precursor, new_name, read_count, total:seq_norm)
df_false <- df %>%
# standarize the names, all to lower
mutate(mi_rna = stringr::str_to_lower(mi_rna)) %>%
# identify the matches
mutate(matches = case_when(mi_rna == precursor ~ "TRUE",
TRUE ~ "FALSE")) %>%
# filter the FALSE values
filter(matches == "FALSE") %>%
# extract the last three chr of mi_rna
mutate(ending_mi_rna = str_sub(mi_rna, start = -3)) %>%
# now merge the columns
unite(new_name, c("precursor", "ending_mi_rna"), sep = "", remove = FALSE) %>%
select(id, mi_rna, precursor, new_name, read_count, total:seq_norm)
df_clean <- dplyr::bind_rows(df_true, df_false) %>%
arrange(id)
Delete dataframes
rm(df_false, df_true)
head(df_clean)
## # A tibble: 6 x 8
## id mi_rna precursor new_name read_count total seq seq_norm
## <int> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1 hsa-let-7a-5p hsa-let-7a… hsa-let-7a-… 323533 323533 323533 18064.
## 2 2 hsa-let-7a-3p hsa-let-7a… hsa-let-7a-… 365 365 365 20.4
## 3 3 hsa-let-7a-5p hsa-let-7a… hsa-let-7a-… 323643 323643 323643 18071.
## 4 4 hsa-let-7a-2… hsa-let-7a… hsa-let-7a-… 7 7 7 0.39
## 5 5 hsa-let-7a-5p hsa-let-7a… hsa-let-7a-… 323583 323583 323583 18067.
## 6 6 hsa-let-7a-3p hsa-let-7a… hsa-let-7a-… 364 364 364 20.3
dplyr::n_distinct(df_clean$mi_rna)
## [1] 2656
dplyr::n_distinct(df_clean$precursor)
## [1] 1917
dplyr::n_distinct(df_clean$new_name)
## [1] 2884