suppressPackageStartupMessages({
library(knitr)
library(tibble)
library(reshape2)
library(dplyr)
library(tidyverse)
library(magrittr)
library(rstatix)
})

Original Messy Data

dmap <- read.csv("chromosome_map.csv")
kable(head(dmap))
SNP.names chr pos SNP
14057748|F|0-21:C>T-21:C>T Vu09 (old8) 6006483 21:C>T
14073404|F|0-63:T>G-63:T>G Vu09 (old8) 1077531 63:T>G
14077008|F|0-16:T>A-16:T>A Vu09 (old8) 39008686 16:T>A
14078118|F|0-12:C>A-12:C>A Vu03 (old3) 6136968 12:C>A
100008216|F|0-12:A>T-12:A>T Vu04 (old11) 18708075 12:A>T
100008218|F|0-5:A>G-5:A>G Vu05 (old1) 2222217 5:A>G

Data after Separating SNP column in to 2 new columns & Removing strings and parentheses from column “Chr”

dmap$SNP <- gsub("^[[:digit:]]+", "",dmap$SNP)# Remove all digits
dmap$SNP <- sub(":", "",dmap$SNP)# Remove all double colons
dmap$chr<- gsub("\\(.*", "", dmap$chr) 
dmap$chr <- sub("Vu", "",dmap$chr)
dmap <- tidyr::separate(dmap, col="SNP", into=c("allele1","allele2"),sep=">")
kable(head(dmap))
SNP.names chr pos allele1 allele2
14057748|F|0-21:C>T-21:C>T 09 6006483 C T
14073404|F|0-63:T>G-63:T>G 09 1077531 T G
14077008|F|0-16:T>A-16:T>A 09 39008686 T A
14078118|F|0-12:C>A-12:C>A 03 6136968 C A
100008216|F|0-12:A>T-12:A>T 04 18708075 A T
100008218|F|0-5:A>G-5:A>G 05 2222217 A G

Replace everything after the first dot plus duplicates in column “SNP.names”

dmap[,c(2:3)] <- sapply(dmap[,c(2,3)], as.numeric)
dmap <- dmap %>% convert_as_factor(SNP.names,allele1,allele2)
dmap$SNP.names<- gsub("\\|.*", "", dmap$SNP.names)
dmap$SNP.names <- paste0("SNP",dmap$SNP.names)

dmap <- dmap[!duplicated(dmap$SNP.names),]
rownames(dmap) <- dmap$SNP.names
kable(head(dmap))
SNP.names chr pos allele1 allele2
SNP14057748 SNP14057748 9 6006483 C T
SNP14073404 SNP14073404 9 1077531 T G
SNP14077008 SNP14077008 9 39008686 T A
SNP14078118 SNP14078118 3 6136968 C A
SNP100008216 SNP100008216 4 18708075 A T
SNP100008218 SNP100008218 5 2222217 A G

Raw Marker data

dm <- read.csv("chromosome_markers.csv")
kable(dm[1:5,1:8])
SNP X52 X96 X133 X27 X104 X159 X204
14057748|F|0-21:C>T-21:C>T - 0 0 0 0 - 0
14073404|F|0-63:T>G-63:T>G 0 0 0 0 0 0 0
14077008|F|0-16:T>A-16:T>A 0 0 0 0 0 0 0
14078118|F|0-12:C>A-12:C>A 0 2 0 0 0 0 0
100008216|F|0-12:A>T-12:A>T 0 0 0 - - - -

Transpose the data, Replace occurence of “X” in columns, Replace everything after the first dot plus duplicates

dm <- t(dm)
dm <- data.frame(dm)
rownames(dm) <- sub("X", "",rownames(dm))

names(dm) <- dm[1,]
dm <- dm[-1,]
#dm <- tibble::rownames_to_column(dm,var = "Ind")
colnames(dm) <- paste0("SNP",colnames(dm))
colnames(dm)<- gsub("\\|.*", "", colnames(dm))



dm <- dm[,!duplicated(colnames(dm))]
kable(dm[1:6,1:6])
SNP14057748 SNP14073404 SNP14077008 SNP14078118 SNP100008216 SNP100008218
52 - 0 0 0 0 1
96 0 0 0 2 0 -
133 0 0 0 0 0 1
27 0 0 0 0 - 1
104 0 0 0 0 - 1
159 - 0 0 0 - 1

Remove occurence of “G” from column “genotype”

pheno <- read.csv("pheno_data.csv")
names(pheno)[1] <- "genotype"
pheno$genotype <- sub("G", "", pheno$genotype)
#pheno <- pheno[,c(2,1,3:10)]
kable(head(pheno))
genotype Experiment DTF DTM PL NSP NPP GYP GYR TW
001 S1 47 98 15.50 15.67 30.33 46.00 365.00 411
001 S2 45 94 14.10 14.00 34.67 32.33 448.67 481
001 S3 39 94 17.83 16.67 30.00 80.67 432.00 599
001 S4 37 95 18.00 18.33 27.00 58.32 359.67 418
002 S1 40 93 15.13 15.00 36.00 46.67 274.33 321
002 S2 43 100 13.13 13.33 27.00 35.33 267.67 303

Saving clean data

This can be done by passing the object with cleaned data to write.csv(obj).