# Read the sample sheet with every cell as text and without a header row;
# the real column names are embedded in the data and recovered further down.
rawDf <- read.csv(
  "https://raw.githubusercontent.com/heathergeiger/Data607_data_to_tidy/master/RNA_vs_DNA_example_sample_sheet.csv",
  stringsAsFactors = FALSE,
  header = FALSE,
  colClasses = "character"
)

# The sheet holds two tables side by side: columns 1-6 and columns 8-13.
# Column 7 is skipped -- presumably an empty spacer; TODO confirm.
rawDf1 <- rawDf[, 1:6]
rawDf2 <- rawDf[, 8:13]

# Give both halves the same placeholder names (a, b, c, ...) so that
# bind_rows(), which matches columns by name, stacks them on top of each
# other. seq_along() is used instead of 1:length() so that a zero-column
# frame cannot produce the sequence c(1, 0).
colnames(rawDf1) <- letters[seq_along(rawDf1)]
colnames(rawDf2) <- letters[seq_along(rawDf2)]

# bind_rows() is from dplyr, which must already be attached.
fullDf <- bind_rows(rawDf1, rawDf2)
# The intermediate frames are no longer needed once they are stacked.
rm(rawDf, rawDf1, rawDf2)

# Pull the group number out of the "Group<digits>" label rows in column `a`,
# dropping leading zeros ("Group068" -> "68"); every other row gets NA.
# The earlier pattern "[^0][:digit:]+" returned "p068" because `[^0]` matches
# ANY character other than "0" -- including the "p" that ends "Group" -- so
# the match started one character too early and had to be patched by
# stripping "p0". Anchoring on the literal "Group" and capturing the digits
# avoids that, and also works for numbers with no or several leading zeros.
fullDf$groupNum <- str_match(fullDf$a, "Group0*([0-9]+)")[, 2]

# Carry each group number down onto the rows that follow its label row, then
# drop the label rows themselves. ("Group*" was a glob-style pattern that
# really meant "Grou" followed by any number of "p".)
fullDf <- fullDf %>%
  fill(groupNum) %>%
  filter(!str_detect(a, "Group"))

# Promote the first remaining row (the embedded header row) to column names.
# The helper column holds a group number in that row, not a label, so it is
# named "GroupNum" explicitly rather than by renaming a hard-coded `68`.
colnames(fullDf) <- replace(
  unlist(fullDf[1, ], use.names = FALSE),
  names(fullDf) == "groupNum",
  "GroupNum"
)

# Remove blank rows and the header rows that are repeated inside the data.
fullDf <- fullDf %>%
  filter(!(Index == "" | Index == "Index"))

# The group number seems worth keeping for now, in case a later analysis
# compares the data sources it refers to.
# Use hyphens instead of underscores in the sample and WGS identifiers.
# across() replaces mutate_at() + funs(); funs() has been deprecated since
# dplyr 0.8.0.
fullDf <- fullDf %>%
  mutate(
    across(
      all_of(c("Sample", "WGS ID")),
      ~ str_replace_all(.x, "_", "-")
    )
  )

# Keep only the first run of digits of each quote ID. Every ID is expected to
# be exactly five digits long; stop here if one is not (or has no digits at
# all, which yields NA), before converting the column to numeric.
fullDf$`WGS Quote ID` <- str_extract(fullDf$`WGS Quote ID`, "[:digit:]+")
stopifnot(all(str_length(fullDf$`WGS Quote ID`) == 5))
fullDf$`WGS Quote ID` <- as.numeric(fullDf$`WGS Quote ID`)

# Tissue types read better with spaces than with underscores.
fullDf$`Tissue Type` <- str_replace_all(fullDf$`Tissue Type`, "_", " ")
# Tally the rows per tissue type; NA gets its own entry if any is present.
table(fullDf[["Tissue Type"]], useNA = "ifany")
##
##           Cerebellum       Cortex Frontal     Cortex Occipital
##                    6                   10                    6
## Motor Cortex Lateral  Motor Cortex Medial Spinal Cord Cervical
##                    7                    6                    4
##   Spinal Cord Lumbar Spinal Cord Thoracic
##                    5                    4
# Which column is the project number: "Sample", "WGS Quote ID" or "GroupNum"?
# Count the rows whose (WGS ID, Gender, Sample) combination occurs more than
# once.
fullDf %>%
  add_count(`WGS ID`, Gender, Sample, name = "groupSize") %>%
  filter(groupSize != 1) %>%
  nrow()
## [1] 0
# Same check with the quote ID in place of the sample: count the rows whose
# (WGS ID, Gender, WGS Quote ID) combination occurs more than once.
fullDf %>%
  add_count(`WGS ID`, Gender, `WGS Quote ID`, name = "groupSize") %>%
  filter(groupSize != 1) %>%
  nrow()
## [1] 47
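“Sample” appears to be the project number: every (WGS ID, Gender, Sample) combination occurs exactly once, whereas 47 rows share their (WGS ID, Gender, WGS Quote ID) combination with at least one other row.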
# Count the rows whose (WGS ID, Tissue Type) combination occurs more than
# once.
fullDf %>%
  add_count(`WGS ID`, `Tissue Type`, name = "groupSize") %>%
  filter(groupSize != 1) %>%
  nrow()
## [1] 0
Each (WGS ID, Tissue Type) combination occurs exactly once, i.e. no WGS ID has more than one row for the same tissue type.