Strsplit

Split a text-element based on a character

Hi, Just a simple question. I need to split a column into two columns based on a delimiter (/). I have attached a simple example file. First sheet is the original data and the second sheet is how I want the data to be transformed. Of course I can do this using excel for few columns but not for hundreds! Wondering if you could help me do this in either R or linux. Many thanks buddy..

# importing the file to have a look
df <- read.csv("split_column.csv", 
               header = FALSE)

head(df)

##    V1  V2  V3  V4
## 1 A/T C/G A/C G/T
## 2 A/T C/G A/C G/T
## 3 A/T C/G A/C G/T
## 4 A/T C/G A/C G/T
## 5 A/T C/G A/C G/T
## 6 A/T C/G A/C G/T


# find functions that have to do with "split"

help.search("split")

# the above list is quite long
# "apropos" looks for names of objects that match parts of the search term
apropos("split")

##  [1] "split"              "split<-"            "split<-.data.frame"
##  [4] "split.data.frame"   "split.Date"         "split<-.default"   
##  [7] "split.default"      "split.POSIXct"      "split.screen"      
## [10] "strsplit"           "unsplit"


# "strsplit" fits the bill: "Split the elements of a character vector x into substrings according to the matches to substring split within them."

# Put it to work
# Use strsplit to split a sample into components using a indicator

strsplit(df$V1, split = "/")

## Error: non-character argument


# Well....
# check the data frame
str(df)

## 'data.frame':    20 obs. of  4 variables:
##  $ V1: Factor w/ 1 level "A/T": 1 1 1 1 1 1 1 1 1 1 ...
##  $ V2: Factor w/ 1 level "C/G": 1 1 1 1 1 1 1 1 1 1 ...
##  $ V3: Factor w/ 1 level "A/C": 1 1 1 1 1 1 1 1 1 1 ...
##  $ V4: Factor w/ 1 level "G/T": 1 1 1 1 1 1 1 1 1 1 ...


# how to modify the call to make it work?
strsplit(as.character(df$V1), split = "/")

## [[1]]
## [1] "A" "T"
## 
## [[2]]
## [1] "A" "T"
## 
## [[3]]
## [1] "A" "T"
## 
## [[4]]
## [1] "A" "T"
## 
## [[5]]
## [1] "A" "T"
## 
## [[6]]
## [1] "A" "T"
## 
## [[7]]
## [1] "A" "T"
## 
## [[8]]
## [1] "A" "T"
## 
## [[9]]
## [1] "A" "T"
## 
## [[10]]
## [1] "A" "T"
## 
## [[11]]
## [1] "A" "T"
## 
## [[12]]
## [1] "A" "T"
## 
## [[13]]
## [1] "A" "T"
## 
## [[14]]
## [1] "A" "T"
## 
## [[15]]
## [1] "A" "T"
## 
## [[16]]
## [1] "A" "T"
## 
## [[17]]
## [1] "A" "T"
## 
## [[18]]
## [1] "A" "T"
## 
## [[19]]
## [1] "A" "T"
## 
## [[20]]
## [1] "A" "T"


# turn off the conversion from strings to factors during import
df <- read.csv("split_column.csv", 
               header = FALSE,
               stringsAsFactors = FALSE)

# testing
strsplit(df$V1, split = "/")

## [[1]]
## [1] "A" "T"
## 
## [[2]]
## [1] "A" "T"
## 
## [[3]]
## [1] "A" "T"
## 
## [[4]]
## [1] "A" "T"
## 
## [[5]]
## [1] "A" "T"
## 
## [[6]]
## [1] "A" "T"
## 
## [[7]]
## [1] "A" "T"
## 
## [[8]]
## [1] "A" "T"
## 
## [[9]]
## [1] "A" "T"
## 
## [[10]]
## [1] "A" "T"
## 
## [[11]]
## [1] "A" "T"
## 
## [[12]]
## [1] "A" "T"
## 
## [[13]]
## [1] "A" "T"
## 
## [[14]]
## [1] "A" "T"
## 
## [[15]]
## [1] "A" "T"
## 
## [[16]]
## [1] "A" "T"
## 
## [[17]]
## [1] "A" "T"
## 
## [[18]]
## [1] "A" "T"
## 
## [[19]]
## [1] "A" "T"
## 
## [[20]]
## [1] "A" "T"


# output is a list
my.list <- strsplit(df$V1, split = "/")

# list contains both elements of the original object
# have to take the list apart!

# using the usual bracket notation as usual

# As "[" is just another function, we can use it within a simple "apply" call.

# Get the first element of each entry
lapply(strsplit(df$V1, split="/"), "[", 1)

## [[1]]
## [1] "A"
## 
## [[2]]
## [1] "A"
## 
## [[3]]
## [1] "A"
## 
## [[4]]
## [1] "A"
## 
## [[5]]
## [1] "A"
## 
## [[6]]
## [1] "A"
## 
## [[7]]
## [1] "A"
## 
## [[8]]
## [1] "A"
## 
## [[9]]
## [1] "A"
## 
## [[10]]
## [1] "A"
## 
## [[11]]
## [1] "A"
## 
## [[12]]
## [1] "A"
## 
## [[13]]
## [1] "A"
## 
## [[14]]
## [1] "A"
## 
## [[15]]
## [1] "A"
## 
## [[16]]
## [1] "A"
## 
## [[17]]
## [1] "A"
## 
## [[18]]
## [1] "A"
## 
## [[19]]
## [1] "A"
## 
## [[20]]
## [1] "A"


# the same with sapply:
sapply(strsplit(df$V1, split="/"), "[", 1)

##  [1] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [18] "A" "A" "A"


# Get the second element of each entry
sapply(strsplit(df$V1, split="/"), "[", 2)

##  [1] "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T" "T"
## [18] "T" "T" "T"


# To process all columns
# the result is a list that contains data frames (list of data frames)
out.list <- lapply(df,
       function(x) {
       # assemble a data frame, one column for each parameter
       my.df <- data.frame(one = sapply(strsplit(x, split="/"), "[", 1),
                           two = sapply(strsplit(x, split="/"), "[", 2))
       }
)
# have a look at the resulting data frames (at this point, they are still lists...)
out.list[1]

## $V1
##    one two
## 1    A   T
## 2    A   T
## 3    A   T
## 4    A   T
## 5    A   T
## 6    A   T
## 7    A   T
## 8    A   T
## 9    A   T
## 10   A   T
## 11   A   T
## 12   A   T
## 13   A   T
## 14   A   T
## 15   A   T
## 16   A   T
## 17   A   T
## 18   A   T
## 19   A   T
## 20   A   T

out.list[4]

## $V4
##    one two
## 1    G   T
## 2    G   T
## 3    G   T
## 4    G   T
## 5    G   T
## 6    G   T
## 7    G   T
## 8    G   T
## 9    G   T
## 10   G   T
## 11   G   T
## 12   G   T
## 13   G   T
## 14   G   T
## 15   G   T
## 16   G   T
## 17   G   T
## 18   G   T
## 19   G   T
## 20   G   T


# assemble a new data frame from the list - just expand as needed for all four original columns
my.df <- cbind(as.data.frame(out.list[1]), 
               as.data.frame(out.list[2]))

# better yet, do them all at once
my.df <- cbind(as.data.frame(out.list))

head(my.df)

##   V1.one V1.two V2.one V2.two V3.one V3.two V4.one V4.two
## 1      A      T      C      G      A      C      G      T
## 2      A      T      C      G      A      C      G      T
## 3      A      T      C      G      A      C      G      T
## 4      A      T      C      G      A      C      G      T
## 5      A      T      C      G      A      C      G      T
## 6      A      T      C      G      A      C      G      T


# or if you want it as a matrix
my.matrix <- cbind(out.list)

some additional text processing tools

counting characters

# see ?nchar
x <- c("KoHFv5#p", "PmDFanf")
nchar(x)

## [1] 8 7

extract parts of a text

# see ?substring
# get the elements between position 3 and 5 from our example
substring(x, 3, 5)

## [1] "HFv" "DFa"

check for a match in a text - pattern matching

# see ?grep
# check which element of x contains the letter "H"
# "H" is only part of the first element in x
grep("H", x)

## [1] 1

grep("f", x)

## [1] 2


# when ignoring the case, the letter "f" is found in the first and in the second element of x
grep("f", x, ignore.case = TRUE)

## [1] 1 2