Data Cleaning & Wrangling using R

suppressPackageStartupMessages({
library(knitr)
library(tibble)
library(reshape2)
library(dplyr)
library(tidyverse)
library(magrittr)
library(rstatix)
})

Original Messy Data

dmap <- read.csv("chromosome_map.csv")
kable(head(dmap))

SNP.names	chr	pos	SNP
14057748\|F\|0-21:C>T-21:C>T	Vu09 (old8)	6006483	21:C>T
14073404\|F\|0-63:T>G-63:T>G	Vu09 (old8)	1077531	63:T>G
14077008\|F\|0-16:T>A-16:T>A	Vu09 (old8)	39008686	16:T>A
14078118\|F\|0-12:C>A-12:C>A	Vu03 (old3)	6136968	12:C>A
100008216\|F\|0-12:A>T-12:A>T	Vu04 (old11)	18708075	12:A>T
100008218\|F\|0-5:A>G-5:A>G	Vu05 (old1)	2222217	5:A>G

Data after Separating SNP column in to 2 new columns & Removing strings and parentheses from column “Chr”

dmap$SNP <- gsub("^[[:digit:]]+", "",dmap$SNP)# Remove all digits
dmap$SNP <- sub(":", "",dmap$SNP)# Remove all double colons
dmap$chr<- gsub("\\(.*", "", dmap$chr) 
dmap$chr <- sub("Vu", "",dmap$chr)
dmap <- tidyr::separate(dmap, col="SNP", into=c("allele1","allele2"),sep=">")
kable(head(dmap))

SNP.names	chr	pos	allele1	allele2
14057748\|F\|0-21:C>T-21:C>T	09	6006483	C	T
14073404\|F\|0-63:T>G-63:T>G	09	1077531	T	G
14077008\|F\|0-16:T>A-16:T>A	09	39008686	T	A
14078118\|F\|0-12:C>A-12:C>A	03	6136968	C	A
100008216\|F\|0-12:A>T-12:A>T	04	18708075	A	T
100008218\|F\|0-5:A>G-5:A>G	05	2222217	A	G

Replace everything after the first dot plus duplicates in column “SNP.names”

dmap[,c(2:3)] <- sapply(dmap[,c(2,3)], as.numeric)
dmap <- dmap %>% convert_as_factor(SNP.names,allele1,allele2)
dmap$SNP.names<- gsub("\\|.*", "", dmap$SNP.names)
dmap$SNP.names <- paste0("SNP",dmap$SNP.names)

dmap <- dmap[!duplicated(dmap$SNP.names),]
rownames(dmap) <- dmap$SNP.names
kable(head(dmap))

	SNP.names	chr	pos	allele1	allele2
SNP14057748	SNP14057748	9	6006483	C	T
SNP14073404	SNP14073404	9	1077531	T	G
SNP14077008	SNP14077008	9	39008686	T	A
SNP14078118	SNP14078118	3	6136968	C	A
SNP100008216	SNP100008216	4	18708075	A	T
SNP100008218	SNP100008218	5	2222217	A	G

Raw Marker data

dm <- read.csv("chromosome_markers.csv")
kable(dm[1:5,1:8])

SNP	X52	X96	X27	X104	X159	X204
14057748\|F\|0-21:C>T-21:C>T	-	0	0	0	-	0
14073404\|F\|0-63:T>G-63:T>G	0	0	0	0	0	0
14077008\|F\|0-16:T>A-16:T>A	0	0	0	0	0	0
14078118\|F\|0-12:C>A-12:C>A	0	2	0	0	0	0
100008216\|F\|0-12:A>T-12:A>T	0	0	-	-	-	-

Transpose the data, Replace occurence of “X” in columns, Replace everything after the first dot plus duplicates

dm <- t(dm)
dm <- data.frame(dm)
rownames(dm) <- sub("X", "",rownames(dm))

names(dm) <- dm[1,]
dm <- dm[-1,]
#dm <- tibble::rownames_to_column(dm,var = "Ind")
colnames(dm) <- paste0("SNP",colnames(dm))
colnames(dm)<- gsub("\\|.*", "", colnames(dm))



dm <- dm[,!duplicated(colnames(dm))]
kable(dm[1:6,1:6])

	SNP14057748	SNP14078118	SNP100008216	SNP100008218
52	-	0	0	1
96	0	2	0	-
133	0	0	0	1
27	0	0	-	1
104	0	0	-	1
159	-	0	-	1

Remove occurence of “G” from column “genotype”

pheno <- read.csv("pheno_data.csv")
names(pheno)[1] <- "genotype"
pheno$genotype <- sub("G", "", pheno$genotype)
#pheno <- pheno[,c(2,1,3:10)]
kable(head(pheno))

genotype	Experiment	DTF	DTM	PL	NSP	NPP	GYP	GYR	TW
001	S1	47	98	15.50	15.67	30.33	46.00	365.00	411
001	S2	45	94	14.10	14.00	34.67	32.33	448.67	481
001	S3	39	94	17.83	16.67	30.00	80.67	432.00	599
001	S4	37	95	18.00	18.33	27.00	58.32	359.67	418
002	S1	40	93	15.13	15.00	36.00	46.67	274.33	321
002	S2	43	100	13.13	13.33	27.00	35.33	267.67	303

Saving clean data

This can be done by passing the object with cleaned data to write.csv(obj).

Data Cleaning & Wrangling using R

Kiplagat John Noel

2024-01-26

Original Messy Data

Data after Separating SNP column in to 2 new columns & Removing strings and parentheses from column “Chr”

Replace everything after the first dot plus duplicates in column “SNP.names”

Raw Marker data

Transpose the data, Replace occurence of “X” in columns, Replace everything after the first dot plus duplicates

Remove occurence of “G” from column “genotype”

Saving clean data