Read in data

# Download the raw sample sheet. Every column is read as character and the
# first line is treated as data rather than as a header row.
rawDf <- read.csv(
  "https://raw.githubusercontent.com/heathergeiger/Data607_data_to_tidy/master/RNA_vs_DNA_example_sample_sheet.csv",
  header = FALSE,
  colClasses = "character",
  stringsAsFactors = FALSE
)

Split dataframe vertically, bind into one data set

# Take columns 1-6 and columns 8-13 as two separate tables (column 7 is
# skipped) and give both the same placeholder names (a, b, c, ...) so that
# bind_rows() stacks them column-for-column.
rawDf1 <- rawDf[, 1:6]
rawDf2 <- rawDf[, 8:13]

# seq_along() instead of 1:length(): same result here, but it cannot produce
# the c(1, 0) sequence if a selection ever ends up with no columns.
colnames(rawDf1) <- letters[seq_along(rawDf1)]
colnames(rawDf2) <- letters[seq_along(rawDf2)]

fullDf <- bind_rows(rawDf1, rawDf2)

# The intermediates are no longer needed once fullDf exists.
rm(rawDf, rawDf1, rawDf2)

Extract group numbers

# Group header cells in column a contain "Group" followed directly by a
# zero-padded number (the old pattern returned "p068" for the first one).
# Extract only the digits that follow "Group", then strip leading zeros
# ("068" -> "68") while keeping a lone "0" intact.
#
# The previous pattern "[^0][:digit:]+" meant "any single character other
# than 0, followed by digits", which is why it captured the "p" of "Group".
# It could also match other cells, e.g. a bare "10", and the follow-up removal
# of "p0" only worked for numbers with exactly one leading zero.
fullDf$groupNum <- str_extract(fullDf$a, "(?<=Group)\\d+")
fullDf$groupNum <- str_replace(fullDf$groupNum, "^0+(?=\\d)", "")

# Carry each group number down to the rows below its header row, then drop
# the header rows themselves. ("Group*" would match "Grou" followed by any
# number of "p", so the literal word is used instead.)
fullDf <- fullDf %>%
  fill(groupNum) %>%
  filter(!str_detect(a, "Group"))

Remove unneeded rows, adjust column names

# Row 1 holds the real column headers, so promote it to the column names.
# The group-number column has no header of its own (its first cell is simply
# the first group's number), so it is named explicitly rather than renamed
# from a hard-coded value such as `68`, which would break if the first group
# in the data ever changed.
colnames(fullDf) <- ifelse(
  colnames(fullDf) == "groupNum",
  "GroupNum",
  unlist(fullDf[1, ], use.names = FALSE)
)

# Drop blank rows and the header rows that are repeated inside the data.
fullDf <- fullDf %>%
  filter(Index != "", Index != "Index")

It seems worth keeping the group number for now, in case we want to compare the data sources it refers to in a later analysis.

Additional tidying to be done:

# Use hyphens instead of underscores in the sample and WGS identifiers.
# The function and its arguments are handed to mutate_at() directly because
# the funs() helper is deprecated.
fullDf <- fullDf %>%
  mutate_at(
    vars("Sample", "WGS ID"),
    str_replace_all,
    pattern = "_",
    replacement = "-"
  )

# Keep only the first run of digits of each quote ID and stop if any of them
# is not exactly 5 characters long (this also stops on a missing ID).
fullDf$`WGS Quote ID` <- str_extract(fullDf$`WGS Quote ID`, "[:digit:]+")
stopifnot(all(str_count(fullDf$`WGS Quote ID`) == 5))

fullDf$`WGS Quote ID` <- as.numeric(fullDf$`WGS Quote ID`)

# Replace underscores with spaces in the tissue names, then tabulate them
# (including any missing values) as a quick sanity check.
fullDf$`Tissue Type` <- str_replace_all(fullDf$`Tissue Type`, "_", " ")
table(fullDf$`Tissue Type`, useNA = "ifany")
## 
##           Cerebellum       Cortex Frontal     Cortex Occipital 
##                    6                   10                    6 
## Motor Cortex Lateral  Motor Cortex Medial Spinal Cord Cervical 
##                    7                    6                    4 
##   Spinal Cord Lumbar Spinal Cord Thoracic 
##                    5                    4
# Is the project number "Sample", "WGS Quote ID", or "GroupNum"?
# Number of rows whose (WGS ID, Gender, Sample) combination occurs more than
# once; 0 means the combination identifies every row uniquely.
fullDf %>%
  group_by(`WGS ID`, Gender, Sample) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  nrow()
## [1] 0
# Same check with WGS Quote ID in place of Sample: number of rows that share
# their (WGS ID, Gender, WGS Quote ID) combination with at least one other row.
fullDf %>%
  group_by(`WGS ID`, Gender, `WGS Quote ID`) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  nrow()
## [1] 47

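The project number seems to be “Sample”: every combination of WGS ID, Gender and Sample occurs only once, whereas 47 rows share their WGS ID, Gender and WGS Quote ID with another row.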

# Number of rows whose (WGS ID, Tissue Type) pair occurs more than once.
fullDf %>%
  group_by(`WGS ID`, `Tissue Type`) %>%
  filter(n() > 1) %>%
  ungroup() %>%
  nrow()
## [1] 0

Each (WGS ID, Tissue Type) pair appears only once, i.e. no WGS ID has two rows for the same tissue type.