Codes for Handling and Processing Strings in R

These are some of the important scripts described in Gaston Sanchez’s book.

# Preview the built-in USArrests data set. Note that the state names are the
# row names of the data frame, not one of its columns.
head(USArrests)
##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7

# names of states
# USArrests has no `state` column, so `USArrests$state` is NULL and every
# result below would be empty; the names have to be taken from the row names.
states <- rownames(USArrests)

# substr() extracts part of each string, here the first four characters
substr(x = states, start = 1, stop = 4)
## starts with "Alab" "Alas" "Ariz" "Arka" "Cali" "Colo"

# abbreviate state names (the result is named by the full state names)
states2 <- abbreviate(states)
states2

# size (number of characters) of each name; spaces are counted too
state_chars <- nchar(states)
state_chars
## starts with 7 6 7 8 10 8

# longest name(s)
states[state_chars == max(state_chars)]
## [1] "North Carolina" "South Carolina"

# get state names containing a lower-case k: grep() with pattern = "k"
grep(pattern = "k", x = states, value = TRUE)
## [1] "Alaska"       "Arkansas"     "Kentucky"     "Nebraska"
## [5] "New York"     "North Dakota" "Oklahoma"     "South Dakota"

# get state names with w or W
grep(pattern = "[wW]", x = states, value = TRUE)
##  [1] "Delaware"      "Hawaii"        "Iowa"          "New Hampshire"
##  [5] "New Jersey"    "New Mexico"    "New York"      "Washington"
##  [9] "West Virginia" "Wisconsin"     "Wyoming"

# another solution is to convert all the state names to lower case first
# (the same 11 names are returned, but in lower case)
grep(pattern = "w", x = tolower(states), value = TRUE)

# alternatively convert to upper case and look for W
# (the same 11 names are returned, but in upper case)
grep(pattern = "W", x = toupper(states), value = TRUE)

# or ask grep() to ignore case; this keeps the original spelling
grep(pattern = "w", x = states, value = TRUE, ignore.case = TRUE)

# understanding histograms:
# a histogram shows how many times each value appears
x <- c(1, 1, 1, 2, 2, 3)
hist(x)

# The same plot can be drawn for the number of characters per state name,
# e.g. hist(state_chars). (Author's note: the same idea can show the length
# distribution of many protein sequences.)

# gregexpr() returns, for every element of a character vector, the positions
# of all matches of a pattern. When there is no match, we get -1.
# positions of a's (either case)
positions_a <- gregexpr(pattern = "a", text = states, ignore.case = TRUE)
# positions_a[[1]] holds 1, 3, 5, 7 for "Alabama"

# to count the number of a's in each state, take the number of positions,
# treating the "no match" marker (-1) as zero occurrences
num_a <- vapply(
  positions_a,
  function(pos) if (pos[1L] > 0L) length(pos) else 0L,
  integer(1)
)


# An easier way to count matches is the stringr package
library(stringr)

# Number of a's in each name. A plain string pattern is case sensitive and
# str_count() has no ignore.case argument, so either lower-case the input
# first or wrap the pattern as regex("a", ignore_case = TRUE).
str_count(states, "a")
str_count(tolower(states), "a")

# Author's note: the same approach can be used to count the total number of
# A, G, T and C in a DNA sequence.

# Now count all the vowels.
vowels <- c("a", "e", "i", "o", "u")

# For each vowel, count its occurrences in every (lower-cased) name and add
# them up; the result is an integer vector with one total per vowel.
num_vowels <- vapply(
  vowels,
  function(v) sum(str_count(tolower(states), v)),
  integer(1),
  USE.NAMES = FALSE
)

# label each total with its vowel
names(num_vowels) <- vowels

# total number of vowels
num_vowels

# sort them in decreasing order
sort(num_vowels, decreasing = TRUE)

# barplot of the totals
barplot(
  num_vowels,
  main = "Number of vowels in USA States names",
  border = NA,
  xlab = "vowels",
  ylim = c(0, 80)
)

# character() creates character vectors

# empty string: two quotes with nothing in between
# (the original assigned " ", a single space, which is not empty)
empty_str <- ""
# class of empty_str
class(empty_str)
## [1] "character"

# an empty character VECTOR is something different from an empty string
empty_chr <- character(0)
class(empty_chr)
## [1] "character"

# what is the difference?
# an empty string is still one element, so its length is 1
length(empty_str)
## [1] 1
# an empty character vector has no elements, so its length is 0
length(empty_chr)
## [1] 0

# character(n) creates a character vector holding n empty strings
char_vector <- character(5)
# display
char_vector
## [1] "" "" "" "" ""

# elements can be added to an empty character vector
# another example
example <- character(0)
example
## character(0)
# check its length
length(example)
## [1] 0
# add first element
example[1] <- "first"
example
## [1] "first"
# check its length again
length(example)
## [1] 1
# further elements do not have to be added in order; the gaps are filled
# with NA
example[4] <- "fourth"
example
## [1] "first"  NA       NA       "fourth"
length(example)
## [1] 4

# sister functions is.character() and as.character():
# the first tests for character type, the second converts to character
# define two objects
a <- "test me"
b <- 8 + 9
# are a and b characters?
is.character(a)
## [1] TRUE
is.character(b)
## [1] FALSE
# the type can also be inspected with class()
class(a)
## [1] "character"
class(b)
## [1] "numeric"
# convert a non-character object to character
b <- as.character(b)

## now move to strings inside other data structures
# in a vector or matrix that mixes numbers and characters, everything is
# coerced to character
# in a data frame, strings were converted to factors by default before
# R 4.0.0; pass stringsAsFactors = FALSE to turn this off (it is the default
# from R 4.0.0 on)
# a list can combine whatever data we want without coercion
list(1:5, letters[1:5], rnorm(5))
## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] "a" "b" "c" "d" "e"
## 
## [[3]]
## [1] -1.5384467  0.3912095  1.2583841  1.8089722 -2.1600295
## (the random values in [[3]] differ on every run)

importing data

# Reading raw text: readLines() returns one character string per line
# read the ktop100.txt file (needs network access)
top105 <- readLines("http://www.textfiles.com/music/ktop100.txt")
# View() opens the data viewer, which is only available in an interactive
# session; skip it when the script is run non-interactively
if (interactive()) {
  View(top105)
}
# how many lines
length(top105)
## [1] 123
# inspecting last 10 elements
tail(top105, n = 10)
##  [1] "101. SMASHING PUMPKINS          SIVA"                       
##  [2] "102. ELVIS COSTELLO             OTHER SIDE OF ..."          
##  [3] "103. SEERS                      PSYCHE OUT"                 
##  [4] "104. THRILL KILL CULT           SEX ON WHEELZ"              
##  [5] "105. MATTHEW SWEET              I'VE BEEN WAITING"          
##  [6] "105.3  LATOUR                   PEOPLE ARE STILL HAVING SEX"
##  [7] ""                                                           
##  [8] "Ed"                                                         
##  [9] "ed@wente.llnl.gov"                                          
## [10] ""

string manipulation

# paste() is the most versatile function for building strings
# usage: paste(..., sep = " ", collapse = NULL)
#   ...      any number of objects; each is converted to character
#   sep      the separator placed between the pasted terms
#   collapse if given, the resulting elements are joined into ONE string,
#            separated by this value

# paste with the default separator (a space)
PI <- paste("The life of", pi)
# paste with a custom separator
IloveR <- paste("I", "love", "R", sep = "-")

# If paste() is given objects of different lengths, the shorter ones are
# recycled. Here there are five numbers but only one "X", so "X" is reused.
paste("X", seq_len(5), sep = ".")
## [1] "X.1" "X.2" "X.3" "X.4" "X.5"
# paste with collapsing: a single string is returned
paste(seq_len(3), c("!", "?", "+"), sep = "", collapse = "")
## [1] "1!2?3+"
# paste without collapsing: one string per element
paste(seq_len(3), c("!", "?", "+"), sep = "")
## [1] "1!" "2?" "3+"
# paste0(...) is equivalent to paste(..., sep = "");
# it does not collapse unless collapse is given
paste0("lets", "collapse", "all", "these", "words")
## [1] "letscollapseallthesewords"

printing character

# The string used throughout the printing examples
my_string <- "programming with data is fun"

# print() shows the string with quotes and an index prefix
print(my_string)
## [1] "programming with data is fun"
# drop the quotes with quote = FALSE
print(my_string, quote = FALSE)
## [1] programming with data is fun
# noquote() prints the same way as above
noquote(my_string)
## [1] programming with data is fun
# noquote() actually returns an object of its own class, "noquote"
no_quotes <- noquote(c("some", "quoted", "text", "!%^(&="))
# display
no_quotes
## [1] some   quoted text   !%^(&=
class(no_quotes)
## [1] "noquote"

# cat() concatenates and prints
# usage: cat(..., file = "", sep = " ", fill = FALSE, labels = NULL,
#            append = FALSE)
# plain cat() prints the text without the [1] index prefix
cat(my_string)
## programming with data is fun
# several strings are joined with sep (a space by default)
cat(my_string, "with R")
## programming with data is fun with R
# specifying sep
cat(my_string, "with R", sep = " =) ")
## programming with data is fun =) with R
# cat() works on vectors too
# first four months
cat(head(month.name, 4), sep = " ")
## January February March April
# fill breaks the output into lines of at most the given width
# fill = 30
cat(
  "Loooooooooong strings",
  "can be displayed",
  "in a nice format",
  "by using the fill argument",
  fill = 30
)
## Loooooooooong strings 
## can be displayed 
## in a nice format 
## by using the fill argument
# cat() can write to a file instead of the console
# (author's note: does not work in markdown before knitting)
cat(my_string, "with R", file = "output.txt")

encoding strings with format () and sprintf()

# format() helps to lay out the output: indentation/justification,
# scientific notation, number of digits in a number, etc.


# sprintf() formats strings and vectors using C-style format specifications
# "%+f" prints a number with an explicit sign (positive here)
# (author's note: to prefix text instead, e.g. the ">" of a FASTA header,
# put the character in the format string, as in sprintf(">%s", name))
sprintf(fmt = "%+f", pi)
## [1] "+3.141593"

# toString() converts an R object to a single string, with the elements
# separated by ", "; it is a handy companion to sprintf()

# combining several objects
toString(x = c("Bonjour", 123, TRUE, NA, log(exp(1))))
## [1] "Bonjour, 123, TRUE, NA, 1"
# the width argument limits the length of the result; longer results are
# truncated and end in "....", which can be used for trimming
toString(x = c("one", "two", "3333333333"), width = 12)
## [1] "one, two...."

some basic string manipulations

# nchar() counts the characters of each element
nchar(c("How", "many", "characters?"))
## [1]  3  4 11
# white spaces are counted as characters too
nchar("How many characters?")
## [1] 20
# length() counts elements, not characters: here there is just 1
length("How many characters?")
## [1] 1

# character translation with chartr()
# usage: chartr(old, new, x)
# replace a by A; old and new should have the same number of characters
chartr(old = "a", new = "A", x = "This is a boring string")
## [1] "This is A boring string"

# substr() is used both to extract and to replace characters
# usage: substr(x, start, stop)
# extract bcd
substr("abcdef", start = 2, stop = 4)
## [1] "bcd"
# replace the 2nd letter of every element with a hash symbol
x <- c("may", "the", "force", "be", "with", "you")
substr(x, start = 2, stop = 2) <- "#"

# substring() works like substr() but is vectorised over first and last
# usage: substring(text, first, last = 1000000L)
# same result as substr
substring("ABCDEF", first = 2, last = 4)
## [1] "BCD"
# extract each letter
substring("ABCDEF", first = seq_len(6), last = seq_len(6))
## [1] "A" "B" "C" "D" "E" "F"
# Multiple replacement with recycling is also possible (author's note: not
# sure where it is useful; it could replace several substrings in a cycle).

set operations

# union(): all distinct elements of two character vectors
set1 <- c("some", "random", "words", "some")
set2 <- c("some", "many", "none", "few")
# union of set1 and set2; duplicates are removed
union(x = set1, y = set2)
## [1] "some"   "random" "words"  "many"   "none"   "few"

# intersect(): elements common to two character vectors
set3 <- c("some", "random", "few", "words")
set4 <- c("some", "many", "none", "few")
# intersect of set3 and set4
intersect(x = set3, y = set4)
## [1] "some" "few"

# setdiff(): elements of the first vector that are not in the second
set5 <- c("some", "random", "few", "words")
set6 <- c("some", "many", "none", "few")
# difference between set5 and set6
setdiff(x = set5, y = set6)
## [1] "random" "words"

# setequal() tests whether two character vectors hold the same elements,
# regardless of order
set7 <- c("some", "random", "strings")
set8 <- c("some", "many", "none", "few")
set9 <- c("strings", "random", "some")
# do set7 and set8 contain the same elements?
setequal(x = set7, y = set8)
## [1] FALSE
# identical() tests whether two objects are exactly equal (order included)


# is.element() tests whether certain elements are present in a vector

set10 <- c("some", "stuff", "to", "play", "with")
elem1 <- "play"
# elem1 in set10?
is.element(el = elem1, table = set10)
## [1] TRUE

set11 <- c("today", "produced", "example", "beautiful", "a", "nicely")
# sort (increasing order is the default)
sort(set11)
## [1] "a"         "beautiful" "example"   "nicely"    "produced"  "today"
# sort (decreasing order)
sort(set11, decreasing = TRUE)
## [1] "today"     "produced"  "nicely"    "example"   "beautiful" "a"

# repetition with rep(); combining paste() and rep() is one of several ways
# to repeat a string
# repeat x 4 times
paste(rep("x", times = 4), collapse = "")
## [1] "xxxx"
## [1] "xxxx"

string manipulation with stringr

library(stringr)
# Concatenating with str_c(); it plays the role of paste() with sep = ""
# default usage
str_c("May", "The", "Force", "Be", "With", "You")
## [1] "MayTheForceBeWithYou"
# zero-length arguments (NULL, character(0)) were dropped in the run that
# produced the output below
# NOTE(review): newer stringr releases may recycle character(0) to a
# zero-length result instead - confirm against the installed version
str_c("May", "The", "Force", NULL, "Be", "With", "You", character(0))
## [1] "MayTheForceBeWithYou"
# changing separator
str_c("May", "The", "Force", "Be", "With", "You", sep = "_")
## [1] "May_The_Force_Be_With_You"
# str_join() was a synonym in old stringr versions (left disabled here)
#str_join("May", "The", "Force", "Be", "With", "You", sep = "-")

# Number of characters with str_length()

# some text (NA included)
some_text <- c("one", "two", "three", NA, "five")
# compare str_length() with nchar()
nchar(some_text)
## [1]  3  3  5 NA  4
## (old R versions returned 2 for the NA element: 3 3 5 2 4)
str_length(some_text)
## [1]  3  3  5 NA  4

# str_length() also accepts factors and measures their labels, whereas
# nchar() requires a character vector
# some factor
some_factor <- factor(c(1, 1, 1, 2, 2, 2), labels = c("good", "bad"))
some_factor
## [1] good good good bad  bad  bad 
## Levels: good bad

# To extract substrings from a character vector stringr provides str_sub(),
# the counterpart of substring(): str_sub(string, start = 1L, end = -1L)
# some text
lorem <- "Lorem Ipsum"
# apply str_sub
str_sub(string = lorem, start = 1, end = 5)
## [1] "Lorem"
# another example: start is vectorised, end defaults to the last character
str_sub(string = "adios", start = seq_len(3))
## [1] "adios" "dios"  "ios"
# str_sub() can also work with negative positions (counted from the end)


# replacing Lorem with Nullam
lorem <- "Lorem Ipsum"
str_sub(string = lorem, start = 1, end = 5) <- "Nullam"
lorem
## [1] "Nullam Ipsum"


# multiple replacements: each start/end pair is applied to its own copy of
# the string, so the result has two elements
lorem <- "Lorem Ipsum"
str_sub(string = lorem, start = c(1, 7), end = c(5, 8)) <- c("Nullam", "Enim")
lorem
## [1] "Nullam Ipsum"  "Lorem Enimsum"

# duplication with str_dup()
# default usage
str_dup("hola", 3)
## [1] "holaholahola"

# str_wrap() re-flows text for nice paragraph printing
# quote (by Douglas Adams)
some_quote <- c(
  "I may not have gone",
  "where I intended to go,",
  "but I think I have ended up",
  "where I needed to be"
)

# str_wrap() wraps every element separately, and each piece above is already
# shorter than the width, so nothing would be wrapped. Collapse the pieces
# into one paragraph first (some_quote itself is left unchanged).
# display paragraph with width = 30: lines of roughly 30 characters or fewer
cat(str_wrap(paste(some_quote, collapse = " "), width = 30))

# display paragraph with a first-line indentation of 2, followed by a
# newline (the original "nn" was a garbled "\n")
cat(str_wrap(paste(some_quote, collapse = " "), width = 30, indent = 2), "\n")

# str_trim() trims white space
# usage: str_trim(string, side = "both")
# text with whitespaces
bad_text <- c("This", " example ", "has several ", " whitespaces ")
# remove whitespaces on the right side only
str_trim(bad_text, side = "right")
## [1] "This"         " example"     "has several"  " whitespaces"



# Word extraction with word()
# usage: word(string, start = 1L, end = start, sep = fixed(" "))
# some sentences
change <- c("Be the change", "you want to be")
# extract first word
word(change, 1)
## [1] "Be"  "you"
# extract second word
word(change, 2)
## [1] "the"  "want"

regular expressions

# metacharacters: . $ ^ * { } ( )
# escape them with \\ when their literal meaning is needed
# example: remove the $ sign
# string
money <- "$money"
# the right way in R: the dollar sign is escaped
sub(pattern = "\\$", replacement = "", x = money)
## [1] "money"
# sub() replaces the first match while gsub() replaces all the matches

# sequences
# \\d matches digits, \\D non-digits, and so on

# replace digit with _
sub(pattern = "\\d", replacement = "_", x = "the dandelion war 2010")
## [1] "the dandelion war _010"
gsub(pattern = "\\d", replacement = "_", x = "the dandelion war 2010")
## [1] "the dandelion war ____"

# spaces (\\s) and non-spaces (\\S)
# replace space with _
sub(pattern = "\\s", replacement = "_", x = "the dandelion war 2010")
## [1] "the_dandelion war 2010"
gsub(pattern = "\\s", replacement = "_", x = "the dandelion war 2010")
## [1] "the_dandelion_war_2010"




# character class: [ei] matches a single character that is either e or i

# some string
transport <- c("car", "bike", "plane", "boat")
# look for e or i
grep(pattern = "[ei]", transport, value = TRUE)
## [1] "bike"  "plane"

# (author's note: used to create FASTA format)
# la vie (string) with an embedded newline (\n) and tab (\t).
# The escape sequences had lost their backslashes and a stray line break had
# crept into the literal; it is restored to match the intended output below.
la_vie <- "La vie en #FFC0CB (rose);\nCest la vie! \ttres jolie"
# printing shows the escape sequences as typed
la_vie
## [1] "La vie en #FFC0CB (rose);\nCest la vie! \ttres jolie"
print(la_vie)
## [1] "La vie en #FFC0CB (rose);\nCest la vie! \ttres jolie"

# cat() interprets them: \n starts a new line and \t inserts a tab
cat(la_vie)
## La vie en #FFC0CB (rose);
## Cest la vie!    tres jolie
# here \n new line and \t tab 



## QUANTIFIERS ? * + {n} {n,m}
# people names
people <- c(
  "rori", "emilia", "matteo", "mehmet", "filipe", "anna", "tyler",
  "rasmus", "jacob", "youna", "flora", "adi"
)

# m? means "zero or one m", which matches every name
grep(pattern = "m?", people, value = TRUE)
##  [1] "rori"   "emilia" "matteo" "mehmet" "filipe" "anna"   "tyler" 
##  [8] "rasmus" "jacob"  "youna"  "flora"  "adi"
# m{1} means "one m"; unanchored, it matches every name containing an m
grep(pattern = "m{1}", people, value = TRUE, perl = FALSE)
## [1] "emilia" "matteo" "mehmet" "rasmus"
# t{2} means "two consecutive t" (the original "tf2g" was a garbled "t{2}")
grep(pattern = "t{2}", people, value = TRUE)
## [1] "matteo"
#stringr regex functions
#str_detect() Detect the presence or absence of a pattern in a string
#str_extract() Extract first piece of a string that matches a pattern
#str_extract_all() Extract all pieces of a string that match a pattern
#str_match() Extract first matched group from a string
#str_match_all() Extract all matched groups from a string
#str_locate() Locate the position of the first occurrence of a pattern in a string
#str_locate_all() Locate the position of all occurrences of a pattern in a string
#str_replace() Replace first occurrence of a matched pattern in a string
#str_replace_all() Replace all occurrences of a matched pattern in a string
#str_split() Split up a string into a variable number of pieces
#str_split_fixed() Split up a string into a fixed number of pieces

# grep(): find the elements of a character vector that match a pattern
# some text
text <- c("one word", "a sentence", "you and me", "three two one")

# pattern
pat <- "one"

# default usage returns the indices of the matching elements
grep(pattern = pat, x = text)
## [1] 1 4
# with value = TRUE the matching elements themselves are returned
grep(pattern = pat, x = text, value = TRUE)
## [1] "one word"      "three two one"
# with invert = TRUE the indices of the NON-matching elements are returned
grep(pattern = pat, x = text, invert = TRUE)
## [1] 2 3
# same with values
grep(pattern = pat, x = text, invert = TRUE, value = TRUE)
## [1] "a sentence" "you and me"

# strsplit(x, split) splits each string at the matches of split and returns
# a list with one character vector per input element
# a sentence
sentence <- "R is a collaborative project with many contributors"
# split into words
strsplit(sentence, split = " ")
## [[1]]
## [1] "R"             "is"            "a"             "collaborative"
## [5] "project"       "with"          "many"          "contributors"
# telephone numbers
tels <- c("510-548-2238", "707-231-2440", "650-752-1300")
# split each number into its portions
strsplit(tels, split = "-")
## [[1]]
## [1] "510"  "548"  "2238"
## 
## [[2]]
## [1] "707"  "231"  "2440"
## 
## [[3]]
## [1] "650"  "752"  "1300"
#### stringr functions
# Detecting patterns with str_detect(): TRUE where the pattern is found
# some objects
some_objs <- c("pen", "pencil", "marker", "spray")
str_detect(some_objs, "pen")
## [1]  TRUE  TRUE FALSE FALSE

# str_extract()
# tweets about Paris
# (the third tweet had a stray line break where the apostrophe belongs)
paris_tweets <- c(
  "#Paris is chock-full of cultural and culinary attractions",
  "Some time in #Paris along Canal St.-Martin famous by #Amelie",
  "While you're in #Paris, stop at cafe: http://goo.gl/yaCbW",
  "Paris, the city of light"
)
# hashtag pattern: a # followed by one or more ASCII letters
hash <- "#[a-zA-Z]{1,}"
# extract the FIRST hashtag of each tweet (NA when there is none); later
# hashtags in the same tweet are not returned
str_extract(paris_tweets, hash)
## [1] "#Paris" "#Paris" "#Paris" NA
# extract ALL hashtags; the result is a list with one vector per tweet
# (the pattern was previously garbled as "#[a-zA-Z]f1,g" and matched nothing)
str_extract_all(paris_tweets, hash)
## [[1]]
## [1] "#Paris"
## 
## [[2]]
## [1] "#Paris"  "#Amelie"
## 
## [[3]]
## [1] "#Paris"
## 
## [[4]]
## character(0)
# locate the position of the (first) hashtag of each tweet
str_locate(paris_tweets, hash)
##      start end
## [1,]     1   6
## [2,]    14  19
## [3,]    17  22
## [4,]    NA  NA
# the result is a matrix with the start and end position of each match
#Gives the table of postion from start to end 

# str_replace(): replace the first match in each string
# city names
cities <- c("San Francisco", "Barcelona", "Naples", "Paris")
# replace first matched (lower-case) vowel
str_replace(cities, pattern = "[aeiou]", replacement = ";")
## [1] "S;n Francisco" "B;rcelona"     "N;ples"        "P;ris"

# str_replace_all(): replace every match
# replace all matched vowels
str_replace_all(cities, pattern = "[aeiou]", replacement = ";")
## [1] "S;n Fr;nc;sc;" "B;rc;l;n;"     "N;pl;s"        "P;r;s"

# replace everything that is NOT a lower-case vowel: consonants, but also
# capital letters and spaces
str_replace_all(cities, pattern = "[^aeiou]", replacement = ";")
## [1] ";a;;;;a;;i;;o" ";a;;e;o;a"     ";a;;e;"        ";a;i;"

# str_split(): split each string at the matches of a pattern

# string
flavors <- c("chocolate", "vanilla", "cinnamon", "mint", "lemon")
# split by vowels; a trailing vowel leaves an empty string at the end
str_split(flavors, pattern = "[aeiou]")
## [[1]]
## [1] "ch" "c"  "l"  "t"  ""  
## 
## [[2]]
## [1] "v"  "n"  "ll" ""  
## 
## [[3]]
## [1] "c"  "nn" "m"  "n" 
## 
## [[4]]
## [1] "m"  "nt"
## 
## [[5]]
## [1] "l" "m" "n"