suppressPackageStartupMessages({
library(knitr)
library(tibble)
library(reshape2)
library(dplyr)
library(tidyverse)
library(magrittr)
library(rstatix)
})
Original Messy Data
dmap <- read.csv("chromosome_map.csv")
kable(head(dmap))
| 14057748|F|0-21:C>T-21:C>T |
Vu09 (old8) |
6006483 |
21:C>T |
| 14073404|F|0-63:T>G-63:T>G |
Vu09 (old8) |
1077531 |
63:T>G |
| 14077008|F|0-16:T>A-16:T>A |
Vu09 (old8) |
39008686 |
16:T>A |
| 14078118|F|0-12:C>A-12:C>A |
Vu03 (old3) |
6136968 |
12:C>A |
| 100008216|F|0-12:A>T-12:A>T |
Vu04 (old11) |
18708075 |
12:A>T |
| 100008218|F|0-5:A>G-5:A>G |
Vu05 (old1) |
2222217 |
5:A>G |
Data after Separating SNP column in to 2 new columns & Removing
strings and parentheses from column “Chr”
dmap$SNP <- gsub("^[[:digit:]]+", "",dmap$SNP)# Remove all digits
dmap$SNP <- sub(":", "",dmap$SNP)# Remove all double colons
dmap$chr<- gsub("\\(.*", "", dmap$chr)
dmap$chr <- sub("Vu", "",dmap$chr)
dmap <- tidyr::separate(dmap, col="SNP", into=c("allele1","allele2"),sep=">")
kable(head(dmap))
| 14057748|F|0-21:C>T-21:C>T |
09 |
6006483 |
C |
T |
| 14073404|F|0-63:T>G-63:T>G |
09 |
1077531 |
T |
G |
| 14077008|F|0-16:T>A-16:T>A |
09 |
39008686 |
T |
A |
| 14078118|F|0-12:C>A-12:C>A |
03 |
6136968 |
C |
A |
| 100008216|F|0-12:A>T-12:A>T |
04 |
18708075 |
A |
T |
| 100008218|F|0-5:A>G-5:A>G |
05 |
2222217 |
A |
G |
Replace everything after the first dot plus duplicates in column
“SNP.names”
dmap[,c(2:3)] <- sapply(dmap[,c(2,3)], as.numeric)
dmap <- dmap %>% convert_as_factor(SNP.names,allele1,allele2)
dmap$SNP.names<- gsub("\\|.*", "", dmap$SNP.names)
dmap$SNP.names <- paste0("SNP",dmap$SNP.names)
dmap <- dmap[!duplicated(dmap$SNP.names),]
rownames(dmap) <- dmap$SNP.names
kable(head(dmap))
| SNP14057748 |
SNP14057748 |
9 |
6006483 |
C |
T |
| SNP14073404 |
SNP14073404 |
9 |
1077531 |
T |
G |
| SNP14077008 |
SNP14077008 |
9 |
39008686 |
T |
A |
| SNP14078118 |
SNP14078118 |
3 |
6136968 |
C |
A |
| SNP100008216 |
SNP100008216 |
4 |
18708075 |
A |
T |
| SNP100008218 |
SNP100008218 |
5 |
2222217 |
A |
G |
Raw Marker data
dm <- read.csv("chromosome_markers.csv")
kable(dm[1:5,1:8])
| 14057748|F|0-21:C>T-21:C>T |
- |
0 |
0 |
0 |
0 |
- |
0 |
| 14073404|F|0-63:T>G-63:T>G |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 14077008|F|0-16:T>A-16:T>A |
0 |
0 |
0 |
0 |
0 |
0 |
0 |
| 14078118|F|0-12:C>A-12:C>A |
0 |
2 |
0 |
0 |
0 |
0 |
0 |
| 100008216|F|0-12:A>T-12:A>T |
0 |
0 |
0 |
- |
- |
- |
- |
Transpose the data, Replace occurence of “X” in columns, Replace
everything after the first dot plus duplicates
dm <- t(dm)
dm <- data.frame(dm)
rownames(dm) <- sub("X", "",rownames(dm))
names(dm) <- dm[1,]
dm <- dm[-1,]
#dm <- tibble::rownames_to_column(dm,var = "Ind")
colnames(dm) <- paste0("SNP",colnames(dm))
colnames(dm)<- gsub("\\|.*", "", colnames(dm))
dm <- dm[,!duplicated(colnames(dm))]
kable(dm[1:6,1:6])
| 52 |
- |
0 |
0 |
0 |
0 |
1 |
| 96 |
0 |
0 |
0 |
2 |
0 |
- |
| 133 |
0 |
0 |
0 |
0 |
0 |
1 |
| 27 |
0 |
0 |
0 |
0 |
- |
1 |
| 104 |
0 |
0 |
0 |
0 |
- |
1 |
| 159 |
- |
0 |
0 |
0 |
- |
1 |
Remove occurence of “G” from column “genotype”
pheno <- read.csv("pheno_data.csv")
names(pheno)[1] <- "genotype"
pheno$genotype <- sub("G", "", pheno$genotype)
#pheno <- pheno[,c(2,1,3:10)]
kable(head(pheno))
| 001 |
S1 |
47 |
98 |
15.50 |
15.67 |
30.33 |
46.00 |
365.00 |
411 |
| 001 |
S2 |
45 |
94 |
14.10 |
14.00 |
34.67 |
32.33 |
448.67 |
481 |
| 001 |
S3 |
39 |
94 |
17.83 |
16.67 |
30.00 |
80.67 |
432.00 |
599 |
| 001 |
S4 |
37 |
95 |
18.00 |
18.33 |
27.00 |
58.32 |
359.67 |
418 |
| 002 |
S1 |
40 |
93 |
15.13 |
15.00 |
36.00 |
46.67 |
274.33 |
321 |
| 002 |
S2 |
43 |
100 |
13.13 |
13.33 |
27.00 |
35.33 |
267.67 |
303 |
Saving clean data
This can be done by passing the object with cleaned data to
write.csv(obj).