Split a text-element based on a character
Hi, Just a simple question. I need to split a column into two columns based on a delimiter (/). I have attached a simple example file. First sheet is the original data and the second sheet is how I want the data to be transformed. Of course I can do this using excel for few columns but not for hundreds! Wondering if you could help me do this in either R or linux. Many thanks buddy..
# importing the file to have a look
df <- read.csv("split_column.csv",
header = FALSE)
head(df)
## V1 V2 V3 V4
## 1 A/T C/G A/C G/T
## 2 A/T C/G A/C G/T
## 3 A/T C/G A/C G/T
## 4 A/T C/G A/C G/T
## 5 A/T C/G A/C G/T
## 6 A/T C/G A/C G/T
# find functions that have to do with "split"
help.search("split")
# the above list is quite long
# "apropos" looks for names of objects that match parts of the search term
apropos("split")
## [1] "split" "split<-" "split<-.data.frame"
## [4] "split.data.frame" "split.Date" "split<-.default"
## [7] "split.default" "split.POSIXct" "split.screen"
## [10] "strsplit" "unsplit"
# "strsplit" fits the bill: "Split the elements of a character vector x into substrings according to the matches to substring split within them."
# Put it to work
# Use strsplit to split a sample into components using a indicator
strsplit(df$V1, split = "/")
## Error: non-character argument
# Well....
# check the data frame
str(df)
## 'data.frame': 20 obs. of 4 variables:
## $ V1: Factor w/ 1 level "A/T": 1 1 1 1 1 1 1 1 1 1 ...
## $ V2: Factor w/ 1 level "C/G": 1 1 1 1 1 1 1 1 1 1 ...
## $ V3: Factor w/ 1 level "A/C": 1 1 1 1 1 1 1 1 1 1 ...
## $ V4: Factor w/ 1 level "G/T": 1 1 1 1 1 1 1 1 1 1 ...
# how to modify the call to make it work?
strsplit(as.character(df$V1), split = "/")
## [[1]]
## [1] "A" "T"
##
## [[2]]
## [1] "A" "T"
##
## [[3]]
## [1] "A" "T"
##
## [[4]]
## [1] "A" "T"
##
## [[5]]
## [1] "A" "T"
##
## [[6]]
## [1] "A" "T"
##
## [[7]]
## [1] "A" "T"
##
## [[8]]
## [1] "A" "T"
##
## [[9]]
## [1] "A" "T"
##
## [[10]]
## [1] "A" "T"
##
## [[11]]
## [1] "A" "T"
##
## [[12]]
## [1] "A" "T"
##
## [[13]]
## [1] "A" "T"
##
## [[14]]
## [1] "A" "T"
##
## [[15]]
## [1] "A" "T"
##
## [[16]]
## [1] "A" "T"
##
## [[17]]
## [1] "A" "T"
##
## [[18]]
## [1] "A" "T"
##
## [[19]]
## [1] "A" "T"
##
## [[20]]
## [1] "A" "T"
# turn off the conversion from strings to factors during import
df <- read.csv("split_column.csv",
header = FALSE,
stringsAsFactors = FALSE)
# testing
strsplit(df$V1, split = "/")
## [[1]]
## [1] "A" "T"
##
## [[2]]
## [1] "A" "T"
##
## [[3]]
## [1] "A" "T"
##
## [[4]]
## [1] "A" "T"
##
## [[5]]
## [1] "A" "T"
##
## [[6]]
## [1] "A" "T"
##
## [[7]]
## [1] "A" "T"
##
## [[8]]
## [1] "A" "T"
##
## [[9]]
## [1] "A" "T"
##
## [[10]]
## [1] "A" "T"
##
## [[11]]
## [1] "A" "T"
##
## [[12]]
## [1] "A" "T"
##
## [[13]]
## [1] "A" "T"
##
## [[14]]
## [1] "A" "T"
##
## [[15]]
## [1] "A" "T"
##
## [[16]]
## [1] "A" "T"
##
## [[17]]
## [1] "A" "T"
##
## [[18]]
## [1] "A" "T"
##
## [[19]]
## [1] "A" "T"
##
## [[20]]
## [1] "A" "T"
# output is a list
my.list <- strsplit(df$V1, split = "/")
# list contains both elements of the original object
# have to take the list apart!
# using the usual bracket notation as usual
# As "[" is just another function, we can use it within a simple "apply" call.
# Get the first element of each entry
lapply(strsplit(df$V1, split="/"), "[", 1)
## [[1]]
## [1] "A"
##
## [[2]]
## [1] "A"
##
## [[3]]
## [1] "A"
##
## [[4]]
## [1] "A"
##
## [[5]]
## [1] "A"
##
## [[6]]
## [1] "A"
##
## [[7]]
## [1] "A"
##
## [[8]]
## [1] "A"
##
## [[9]]
## [1] "A"
##
## [[10]]
## [1] "A"
##
## [[11]]
## [1] "A"
##
## [[12]]
## [1] "A"
##
## [[13]]
## [1] "A"
##
## [[14]]
## [1] "A"
##
## [[15]]
## [1] "A"
##
## [[16]]
## [1] "A"
##
## [[17]]
## [1] "A"
##
## [[18]]
## [1] "A"
##
## [[19]]
## [1] "A"
##
## [[20]]
## [1] "A"
# the same with sapply:
sapply(strsplit(df$V1, split="/"), "[", 1)
## [1] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [18] "A" "A" "A"
# Get the second element of each entry
sapply(strsplit(df$V1, split="/"), "[", 2)
## [1] "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T"
## [18] "T" "T" "T"
# To process all columns
# the result is a list that contains data frames (list of data frames)
out.list <- lapply(df,
function(x) {
# assemble a data frame, one column for each parameter
my.df <- data.frame(one = sapply(strsplit(x, split="/"), "[", 1),
two = sapply(strsplit(x, split="/"), "[", 2))
}
)
# have a look at the resulting data frames (at this point, they are still lists...)
out.list[1]
## $V1
## one two
## 1 A T
## 2 A T
## 3 A T
## 4 A T
## 5 A T
## 6 A T
## 7 A T
## 8 A T
## 9 A T
## 10 A T
## 11 A T
## 12 A T
## 13 A T
## 14 A T
## 15 A T
## 16 A T
## 17 A T
## 18 A T
## 19 A T
## 20 A T
out.list[4]
## $V4
## one two
## 1 G T
## 2 G T
## 3 G T
## 4 G T
## 5 G T
## 6 G T
## 7 G T
## 8 G T
## 9 G T
## 10 G T
## 11 G T
## 12 G T
## 13 G T
## 14 G T
## 15 G T
## 16 G T
## 17 G T
## 18 G T
## 19 G T
## 20 G T
# assemble a new data frame from the list - just expand as needed for all four original columns
my.df <- cbind(as.data.frame(out.list[1]),
as.data.frame(out.list[2]))
# better yet, do them all at once
my.df <- cbind(as.data.frame(out.list))
head(my.df)
## V1.one V1.two V2.one V2.two V3.one V3.two V4.one V4.two
## 1 A T C G A C G T
## 2 A T C G A C G T
## 3 A T C G A C G T
## 4 A T C G A C G T
## 5 A T C G A C G T
## 6 A T C G A C G T
# or if you want it as a matrix
my.matrix <- cbind(out.list)
counting characters
# see ?nchar
x <- c("KoHFv5#p", "PmDFanf")
nchar(x)
## [1] 8 7
extract parts of a text
# see ?substring
# get the elements between position 3 and 5 from our example
substring(x, 3, 5)
## [1] "HFv" "DFa"
check for a match in a text - pattern matching
# see ?grep
# check which element of x contains the letter "H"
# "H" is only part of the first element in x
grep("H", x)
## [1] 1
grep("f", x)
## [1] 2
# when ignoring the case, the letter "f" is found in the first and in the second element of x
grep("f", x, ignore.case = TRUE)
## [1] 1 2