# Code for Handling and Processing Strings in R
# These are some of the important scripts described in Gaston Sanchez's book.
# Preview the built-in USArrests data set (one row per US state)
head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
# Names of states. USArrests has no `state` column (USArrests$state is NULL);
# the state names are stored as the row names of the data frame.
states <- rownames(USArrests)
# substr() takes part of a string: here the first four characters of each name
substr(x = states, start = 1, stop = 4)
# abbreviate state names (result is a character vector named by the full names)
states2 <- abbreviate(states)
states2
# size (number of characters) of each name; spaces are counted too
state_chars <- nchar(states)
state_chars
# longest name(s)
states[which(state_chars == max(state_chars))]
## [1] "North Carolina" "South Carolina"
# get state names containing a lower-case k: grep() with pattern = "k"
grep(pattern = "k", x = states, value = TRUE)
## [1] "Alaska"       "Arkansas"     "Kentucky"     "Nebraska"
## [5] "New York"     "North Dakota" "Oklahoma"     "South Dakota"
# get state names with w or W
grep(pattern = "[wW]", x = states, value = TRUE)
##  [1] "Delaware"      "Hawaii"        "Iowa"          "New Hampshire"
##  [5] "New Jersey"    "New Mexico"    "New York"      "Washington"
##  [9] "West Virginia" "Wisconsin"     "Wyoming"
# another solution is to convert all the state names to lower case first;
# the same eleven states match, but they are returned lower-cased
grep(pattern = "w", x = tolower(states), value = TRUE)
# alternatively convert to upper case and look for W (returned upper-cased)
grep(pattern = "W", x = toupper(states), value = TRUE)
# or ask grep() to ignore case, which keeps the original spelling
grep(pattern = "w", x = states, value = TRUE, ignore.case = TRUE)
head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7
# Names of states: they are the row names of USArrests (there is no `state`
# column, so USArrests$state would be NULL)
states <- rownames(USArrests)
# understand histograms: a histogram shows how many times each value appears
x <- c(1, 1, 1, 2, 2, 3)
hist(x)
# The same idea gives a histogram of the number of characters in the state
# names, e.g. hist(nchar(states)); it can also be used to show the lengths of
# many protein (amino acid) sequences.
# gregexpr() returns, for every string, the positions of all matches of the
# searched pattern. When there is no match we get -1.
# position of a's
positions_a <- gregexpr(pattern = "a", text = states, ignore.case = TRUE)
# it gives 1, 3, 5, 7 for "Alabama"
# To count the number of a's in each state, count the positive positions
# (a no-match entry is the single value -1, so it contributes 0).
num_a <- vapply(positions_a, function(pos) sum(pos > 0L), integer(1))
# An easier way to count characters is the stringr package
library(stringr)
# str_count() is case sensitive and has no ignore.case argument, so the first
# call only counts lower-case a's ...
str_count(states, "a")
# ... while lower-casing the names first counts both "A" and "a"
str_count(tolower(states), "a")
# The same approach can be used to count the total number of A, G, T and C
# in a DNA sequence.
# Now count all the vowels.
# vector of vowels
vowels <- c("a", "e", "i", "o", "u")
# For each vowel, sum its occurrences over all (lower-cased) state names.
# The result is an integer vector with one total per vowel, named a, e, i, o, u.
num_vowels <- vapply(
  vowels,
  function(v) sum(str_count(tolower(states), v)),
  integer(1)
)
# total number of each vowel
num_vowels
# the same totals sorted in decreasing order
sort(num_vowels, decreasing = TRUE)
# barplot of the totals
barplot(
  num_vowels,
  main = "Number of vowels in USA States names",
  border = NA,
  xlab = "vowels",
  ylim = c(0, 80)
)
# character() creates character vectors
# empty string: nothing between the quotes (" " would be a one-space string)
empty_str <- ""
# class of empty_str
class(empty_str)
## [1] "character"
# another way to get something "empty" is an empty character vector,
# which is not the same thing as an empty string
empty_chr <- character(0)
class(empty_chr)
## [1] "character"
# what is the difference?
# the empty string is a vector of length 1 (holding a string of 0 characters)
length(empty_str)
## [1] 1
# the empty character vector has length 0
length(empty_chr)
## [1] 0
# character(n) only needs the desired length; it creates n empty strings
# character vector with 5 empty strings
char_vector <- character(5)
# display
char_vector
## [1] "" "" "" "" ""
# elements can be added to an empty character vector
# another example
example <- character(0)
example
## character(0)
# check its length
length(example)
## [1] 0
# add first element
example[1] <- "first"
example
## [1] "first"
# check its length again
length(example)
## [1] 1
# elements do not have to be added in order; skipped positions are filled
# with NA
example[4] <- "fourth"
example
## [1] "first"  NA       NA       "fourth"
length(example)
## [1] 4
# sister functions is.character() and as.character():
# the first tests for character type, the second converts to character
# define two objects
a <- "test me"
b <- 8 + 9
# are a and b characters?
is.character(a)
## [1] TRUE
is.character(b)
## [1] FALSE
# the same can be checked with class()
class(a)
## [1] "character"
class(b)
## [1] "numeric"
# convert non-character to character (b becomes "17")
b <- as.character(b)
# Now move on to strings inside other data structures.
# In a vector or matrix, mixing numbers and characters coerces everything to
# character.
# In a data frame, strings were converted into factors by default before
# R 4.0; use stringsAsFactors = FALSE to turn this off on older versions.
# With a list we can combine whatever data we want.
list(1:5, letters[1:5], rnorm(5))
## [[1]]
## [1] 1 2 3 4 5
##
## [[2]]
## [1] "a" "b" "c" "d" "e"
##
## [[3]]
## [1] -1.5384467 0.3912095 1.2583841 1.8089722 -2.1600295
# (the third element is random, so its values differ on every run)
# Reading raw text: use readLines()
# read the ktop100.txt file (requires network access)
top105 <- readLines("http://www.textfiles.com/music/ktop100.txt")
# View() opens a data viewer, which only makes sense in an interactive
# session; skip it when the script is run non-interactively.
if (interactive()) {
  View(top105)
}
# how many lines
length(top105)
## [1] 123
# inspecting last 10 elements
tail(top105, n = 10)
## [1] "101. SMASHING PUMPKINS SIVA"
## [2] "102. ELVIS COSTELLO OTHER SIDE OF ..."
## [3] "103. SEERS PSYCHE OUT"
## [4] "104. THRILL KILL CULT SEX ON WHEELZ"
## [5] "105. MATTHEW SWEET I'VE BEEN WAITING"
## [6] "105.3 LATOUR PEOPLE ARE STILL HAVING SEX"
## [7] ""
## [8] "Ed"
## [9] "ed@wente.llnl.gov"
## [10] ""
# paste() is the most versatile function for building strings.
# Usage: paste(..., sep = " ", collapse = NULL)
#   ...      any number of objects, each converted to character
#   sep      the separator placed between the pasted terms (space, comma, ...)
#   collapse if given, the results are joined into one single string
# paste with the default separator (a space)
PI <- paste("The life of", pi)
# paste with a custom separator
IloveR <- paste("I", "love", "R", sep = "-")
# If paste() is given objects of different lengths, it applies a recycling
# rule: here there are 5 digits but only one "X", so "X" is reused 5 times.
paste("X", 1:5, sep = ".")
## [1] "X.1" "X.2" "X.3" "X.4" "X.5"
# with collapse: the pasted pairs are joined into one string
paste(1:3, c("!", "?", "+"), sep = "", collapse = "")
## [1] "1!2?3+"
# without collapse: one string per pair
paste(1:3, c("!", "?", "+"), sep = "")
## [1] "1!" "2?" "3+"
# paste0() is equivalent to paste(..., sep = ""): its arguments are
# glued together with nothing in between
paste0("lets", "collapse", "all", "these", "words")
## [1] "letscollapseallthesewords"
# A string used by the printing examples below
my_string <- "programming with data is fun"
# print() shows the string with quotes and an index
print(my_string)
## [1] "programming with data is fun"
# drop the quotes with quote = FALSE
print(my_string, quote = FALSE)
## [1] programming with data is fun
# noquote() prints the same way as above
noquote(my_string)
## [1] programming with data is fun
# noquote() actually returns an object of a different class, "noquote"
no_quotes <- noquote(c("some", "quoted", "text", "!%^(&="))
# display
no_quotes
## [1] some quoted text !%^(&=
class(no_quotes)
## [1] "noquote"
# Concatenate and print with cat()
# Usage: cat(..., file = "", sep = " ", fill = FALSE, labels = NULL,
#            append = FALSE)
# cat() prints without quotes and without the numeric line indicator
cat(my_string)
## programming with data is fun
# several strings are combined
cat(my_string, "with R")
## programming with data is fun with R
# specifying the separator
cat(my_string, "with R", sep = " =) ")
## programming with data is fun =) with R
# cat() works on vectors too: the first four months
cat(month.name[1:4], sep = " ")
## January February March April
# the fill argument breaks the output into lines of limited width
# fill = 30
cat(
  "Loooooooooong strings",
  "can be displayed",
  "in a nice format",
  "by using the fill argument",
  fill = 30
)
## Loooooooooong strings
## can be displayed
## in a nice format
## by using the fill argument
# cat() can send its output to a file instead of the console; this writes
# output.txt into the working directory (original note: does not work in
# markdown before knitting)
cat(my_string, "with R", file = "output.txt")
# sprintf() formats output: indentation, scientific notation, number of
# digits and so on. It gives nicely formatted strings and vectors.
# print a number with its sign ("+" for positive values)
# (original note: a similar idea can put a ">" in front of FASTA records,
# with pi replaced by the sequence)
sprintf("%+f", pi)
## [1] "+3.141593"
# toString() converts R objects to a single comma-separated string and can be
# used together with sprintf()
# combining several objects
toString(c("Bonjour", 123, TRUE, NA, log(exp(1))))
## [1] "Bonjour, 123, TRUE, NA, 1"
# the width argument limits the length of the result, which is useful for
# trimming
toString(c("one", "two", "3333333333"), width = 12)
## [1] "one, two...."
# how many characters in each element?
nchar(c("How", "many", "characters?"))
## [1] 3 4 11
# how many characters? White spaces are counted as characters as well
nchar("How many characters?")
## [1] 20
# how many elements? Just 1 here
length("How many characters?")
## [1] 1
# Character translation with chartr()
# Usage: chartr(old, new, x)
# replace a by A; old and new should have the same number of characters
chartr(old = "a", new = "A", x = "This is a boring string")
## [1] "This is A boring string"
# substr() is used both to extract and to replace characters
# Usage: substr(x, start, stop)
# extract "bcd"
substr("abcdef", start = 2, stop = 4)
## [1] "bcd"
# replace the 2nd letter of every word with a hash symbol
x <- c("may", "the", "force", "be", "with", "you")
substr(x, start = 2, stop = 2) <- "#"
# substring() works like substr()
# Usage: substring(text, first, last = 1000000L)
substring("ABCDEF", first = 2, last = 4)
## [1] "BCD"
# extract each letter
substring("ABCDEF", first = 1:6, last = 1:6)
## [1] "A" "B" "C" "D" "E" "F"
# Multiple replacement with recycling is also possible (no example here);
# it could be used to replace several strings in a cycle.
# union(): all distinct elements of two character vectors
set1 <- c("some", "random", "words", "some")
set2 <- c("some", "many", "none", "few")
# union of set1 and set2; duplicates are removed
union(set1, set2)
## [1] "some" "random" "words" "many" "none" "few"
# intersect(): elements common to two character vectors
set3 <- c("some", "random", "few", "words")
set4 <- c("some", "many", "none", "few")
# intersect of set3 and set4
intersect(set3, set4)
## [1] "some" "few"
# setdiff(): elements of the first vector that are not in the second
set5 <- c("some", "random", "few", "words")
set6 <- c("some", "many", "none", "few")
# difference between set5 and set6
setdiff(set5, set6)
## [1] "random" "words"
# setequal() tests whether two character vectors contain the same elements,
# regardless of order
# three character vectors
set7 <- c("some", "random", "strings")
set8 <- c("some", "many", "none", "few")
set9 <- c("strings", "random", "some")
# set7 == set8?
setequal(set7, set8)
## [1] FALSE
# set7 == set9? (same elements in a different order)
setequal(set7, set9)
## [1] TRUE
# identical() tests whether two vectors are exactly equal, order included
identical(set7, set9)
## [1] FALSE
# is.element() tests whether certain elements are present in a vector
set10 <- c("some", "stuff", "to", "play", "with")
elem1 <- "play"
# elem1 in set10?
is.element(elem1, set10)
## [1] TRUE
set11 <- c("today", "produced", "example", "beautiful", "a", "nicely")
# sort (increasing order is the default)
sort(set11)
## [1] "a" "beautiful" "example" "nicely" "produced" "today"
# sort (decreasing order)
sort(set11, decreasing = TRUE)
## [1] "today" "produced" "nicely" "example" "beautiful" "a"
# Repetition with rep(): combining paste() and rep() is one of several ways
# to repeat a string
# repeat "x" 4 times
paste(rep("x", 4), collapse = "")
## [1] "xxxx"
library(stringr)
# Concatenating with str_c(). It is similar to paste(), but the default
# separator is the empty string.
# default usage
str_c("May", "The", "Force", "Be", "With", "You")
## [1] "MayTheForceBeWithYou"
# zero-length arguments (NULL, character(0)) are removed
str_c("May", "The", "Force", NULL, "Be", "With", "You", character(0))
## [1] "MayTheForceBeWithYou"
# changing separator
str_c("May", "The", "Force", "Be", "With", "You", sep = "_")
## [1] "May_The_Force_Be_With_You"
# synonym function str_join (left commented out as in the original;
# NOTE(review): presumably not available in the installed stringr - confirm)
#str_join("May", "The", "Force", "Be", "With", "You", sep = "-")
# Number of characters with str_length()
# some text (NA included)
some_text <- c("one", "two", "three", NA, "five")
# compare str_length() with nchar()
nchar(some_text)
## [1] 3 3 5 NA 4
str_length(some_text)
## [1] 3 3 5 NA 4
# str_length() also accepts factors: it works on the labels converted to
# character
# some factor
some_factor <- factor(c(1, 1, 1, 2, 2, 2), labels = c("good", "bad"))
some_factor
## [1] good good good bad bad bad
## Levels: good bad
# number of characters of each label
str_length(some_factor)
## [1] 4 4 4 3 3 3
# Substrings with str_sub(), stringr's counterpart of substring().
# Usage: str_sub(string, start = 1L, end = -1L)
# some text
lorem <- "Lorem Ipsum"
# characters 1 to 5
str_sub(string = lorem, start = 1, end = 5)
## [1] "Lorem"
# a vector of start positions gives one substring per start
str_sub(string = "adios", start = 1:3)
## [1] "adios" "dios" "ios"
# str_sub() can also work with negative positions (not shown here)
# str_sub() on the left of an assignment replaces text:
# replacing "Lorem" with "Nullam"
lorem <- "Lorem Ipsum"
str_sub(string = lorem, start = 1, end = 5) <- "Nullam"
lorem
## [1] "Nullam Ipsum"
# multiple replacements: each (start, end) pair is applied to the original
# string, so the result has two elements
lorem <- "Lorem Ipsum"
str_sub(string = lorem, start = c(1, 7), end = c(5, 8)) <- c("Nullam", "Enim")
lorem
## [1] "Nullam Ipsum" "Lorem Enimsum"
# Duplication with str_dup()
# default usage
str_dup(string = "hola", times = 3)
## [1] "holaholahola"
# Wrapping with str_wrap(): nice paragraph printing
# quote (by Douglas Adams)
some_quote <- c(
  "I may not have gone",
  "where I intended to go,",
  "but I think I have ended up",
  "where I needed to be")
# str_wrap() wraps each element separately, and every piece above is already
# shorter than 30 characters, so join the pieces into one paragraph first
quote_paragraph <- paste(some_quote, collapse = " ")
# display paragraph with width = 30 (line breaks are inserted by str_wrap)
cat(str_wrap(quote_paragraph, width = 30))
# display paragraph with first line indentation of 2, followed by two
# newlines
cat(str_wrap(quote_paragraph, width = 30, indent = 2), "\n\n")
# str_trim() trims white space
# Usage: str_trim(string, side = "both")
# text with whitespaces
bad_text <- c("This", " example ", "has several ", " whitespaces ")
# remove whitespaces on the right side only
str_trim(bad_text, side = "right")
## [1] "This" " example" "has several" " whitespaces"
# Word extraction with word()
# Usage: word(string, start = 1L, end = start, sep = fixed(" "))
# some sentences
change <- c("Be the change", "you want to be")
# extract first word
word(change, 1)
## [1] "Be" "you"
# extract second word
word(change, 2)
## [1] "the" "want"
# Metacharacters: . $ ^ * { } ( )
# Escape them with \\ when their literal meaning is needed
# remove the $ sign (replace it with the empty string)
# string
money <- "$money"
# the right way in R
sub(pattern = "\\$", replacement = "", x = money)
## [1] "money"
# sub() replaces the first match while gsub() replaces all the matches
# Sequences:
# \\d digits, \\D non-digits, and so on
# replace digit with _
sub("\\d", "_", "the dandelion war 2010")
## [1] "the dandelion war _010"
gsub("\\d", "_", "the dandelion war 2010")
## [1] "the dandelion war ____"
# Spaces (\\s) and non-spaces (\\S)
# replace space with _
sub("\\s", "_", "the dandelion war 2010")
## [1] "the_dandelion war 2010"
gsub("\\s", "_", "the dandelion war 2010")
## [1] "the_dandelion_war_2010"
# Character classes
# some string
transport <- c("car", "bike", "plane", "boat")
# look for e or i
grep(pattern = "[ei]", transport, value = TRUE)
## [1] "bike" "plane"
# (original note: used to create FASTA format)
# la vie (string): \n is a new line and \t is a tab. The backslashes are
# required; without them the letters n and t are printed literally.
la_vie <- "La vie en #FFC0CB (rose);\nCest la vie! \ttres jolie"
# if you print, the escape sequences are shown as written
la_vie
## [1] "La vie en #FFC0CB (rose);\nCest la vie! \ttres jolie"
print(la_vie)
## [1] "La vie en #FFC0CB (rose);\nCest la vie! \ttres jolie"
# if you cat la_vie, the escape sequences are interpreted: the text breaks
# onto a second line and a tab separates "vie!" from "tres"
cat(la_vie)
## La vie en #FFC0CB (rose);
## Cest la vie! 	tres jolie
# Quantifiers: ? * + {n} {n,m}
# people names
people <- c("rori", "emilia", "matteo", "mehmet", "filipe", "anna", "tyler",
            "rasmus", "jacob", "youna", "flora", "adi")
# match m at most once: "m?" also matches zero m's, so every name matches
grep(pattern = "m?", people, value = TRUE)
## [1] "rori" "emilia" "matteo" "mehmet" "filipe" "anna" "tyler"
## [8] "rasmus" "jacob" "youna" "flora" "adi"
# match m exactly once (i.e. names containing at least one m)
grep(pattern = "m{1}", people, value = TRUE, perl = FALSE)
## [1] "emilia" "matteo" "mehmet" "rasmus"
# match t exactly twice in a row: the quantifier is written {2}
grep(pattern = "t{2}", people, value = TRUE)
## [1] "matteo"
# stringr regex functions (overview):
#   str_detect()       Detect the presence or absence of a pattern in a string
#   str_extract()      Extract first piece of a string that matches a pattern
#   str_extract_all()  Extract all pieces of a string that match a pattern
#   str_match()        Extract first matched group from a string
#   str_match_all()    Extract all matched groups from a string
#   str_locate()       Locate the position of the first occurrence of a pattern
#   str_locate_all()   Locate the position of all occurrences of a pattern
#   str_replace()      Replace first occurrence of a matched pattern
#   str_replace_all()  Replace all occurrences of a matched pattern
#   str_split()        Split up a string into a variable number of pieces
#   str_split_fixed()  Split up a string into a fixed number of pieces
# Base R: grep()
# some text
text <- c("one word", "a sentence", "you and me", "three two one")
# pattern
pat <- "one"
# default usage: indices of the elements that match
grep(pattern = pat, x = text)
## [1] 1 4
# with value = TRUE: the matching elements themselves
grep(pattern = pat, x = text, value = TRUE)
## [1] "one word" "three two one"
# with invert = TRUE: indices of the elements that do not match
grep(pattern = pat, x = text, invert = TRUE)
## [1] 2 3
# same, returning the values
grep(pattern = pat, x = text, invert = TRUE, value = TRUE)
## [1] "a sentence" "you and me"
# (sub() and gsub() replace matches)
# Splitting with strsplit(text, pattern)
# a sentence
sentence <- c("R is a collaborative project with many contributors")
# split into words
strsplit(x = sentence, split = " ")
## [[1]]
## [1] "R" "is" "a" "collaborative"
## [5] "project" "with" "many" "contributors"
# telephone numbers
tels <- c("510-548-2238", "707-231-2440", "650-752-1300")
# split each number into its portions
strsplit(x = tels, split = "-")
## [[1]]
## [1] "510" "548" "2238"
##
## [[2]]
## [1] "707" "231" "2440"
##
## [[3]]
## [1] "650" "752" "1300"
# stringr functions
# Detecting patterns with str_detect()
# some objects
some_objs <- c("pen", "pencil", "marker", "spray")
str_detect(some_objs, "pen")
## [1] TRUE TRUE FALSE FALSE
# str_extract()
# tweets about Paris
paris_tweets <- c(
  "#Paris is chock-full of cultural and culinary attractions",
  "Some time in #Paris along Canal St.-Martin famous by #Amelie",
  "While you're in #Paris, stop at cafe: http://goo.gl/yaCbW",
  "Paris, the city of light")
# hashtag pattern: "#" followed by one or more letters
hash <- "#[a-zA-Z]{1,}"
# extract the first hashtag found in each tweet (later hashtags are ignored)
str_extract(paris_tweets, hash)
## [1] "#Paris" "#Paris" "#Paris" NA
# extract all hashtags
str_extract_all(paris_tweets, hash)
## [[1]]
## [1] "#Paris"
##
## [[2]]
## [1] "#Paris" "#Amelie"
##
## [[3]]
## [1] "#Paris"
##
## [[4]]
## character(0)
# locate position of (first) hashtag
str_locate(paris_tweets, hash)
## start end
## [1,] 1 6
## [2,] 14 19
## [3,] 17 22
## [4,] NA NA
# str_locate() gives a table of positions with the start and end of each match
# Replacing with str_replace() and str_replace_all()
# city names
cities <- c("San Francisco", "Barcelona", "Naples", "Paris")
# replace the first matched (lower-case) vowel in each name
str_replace(string = cities, pattern = "[aeiou]", replacement = ";")
## [1] "S;n Francisco" "B;rcelona" "N;ples" "P;ris"
# replace all matched vowels
str_replace_all(string = cities, pattern = "[aeiou]", replacement = ";")
## [1] "S;n Fr;nc;sc;" "B;rc;l;n;" "N;pl;s" "P;r;s"
# replace everything that is not a lower-case vowel (consonants, but also
# capital letters and spaces)
str_replace_all(string = cities, pattern = "[^aeiou]", replacement = ";")
## [1] ";a;;;;a;;i;;o" ";a;;e;o;a" ";a;;e;" ";a;i;"
# Splitting with str_split()
# string
flavors <- c("chocolate", "vanilla", "cinnamon", "mint", "lemon")
# split by vowels; a vowel at the end of a word leaves a trailing ""
str_split(string = flavors, pattern = "[aeiou]")
## [[1]]
## [1] "ch" "c" "l" "t" ""
##
## [[2]]
## [1] "v" "n" "ll" ""
##
## [[3]]
## [1] "c" "nn" "m" "n"
##
## [[4]]
## [1] "m" "nt"
##
## [[5]]
## [1] "l" "m" "n"