Codes for Handling and Processing Strings in R

These are some of the important scripts described in Gaston Sanchez’s book.

head(USArrests)

##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7

# names of states
# USArrests has no `state` column (USArrests$state is NULL); the state names
# are stored as the row names of the data frame
states = rownames(USArrests)

# substr function is used to take the part of string
substr(x = states, start = 1, stop = 4)

##  [1] "Alab" "Alas" "Ariz" "Arka" "Cali" "Colo" "Conn" "Dela" "Flor" "Geor"
## [11] "Hawa" "Idah" "Illi" "Indi" "Iowa" "Kans" "Kent" "Loui" "Main" "Mary"
## [21] "Mass" "Mich" "Minn" "Miss" "Miss" "Mont" "Nebr" "Neva" "New " "New "
## [31] "New " "New " "Nort" "Nort" "Ohio" "Okla" "Oreg" "Penn" "Rhod" "Sout"
## [41] "Sout" "Tenn" "Texa" "Utah" "Verm" "Virg" "Wash" "West" "Wisc" "Wyom"

# abbreviate state names
states2 = abbreviate(states)
states2

## named character vector of length 50: the names are the full state names,
## the values their abbreviations (e.g. Alabama -> "Albm", Alaska -> "Alsk")

# size (no of letters) of each name, counts spaces too
state_chars = nchar(states)
state_chars

##  [1]  7  6  7  8 10  8 11  8  7  7  6  5  8  7  4  6  8  9  5  8 13  8  9 11  8
## [26]  7  8  6 13 10 10  8 14 12  4  8  6 12 12 14 12  9  5  4  7  8 10 13  9  7

# longest name (a logical index is enough, which() is not needed)
states[state_chars == max(state_chars)]

## [1] "North Carolina" "South Carolina"

# get states names with k, use grep function and indicate pattern="k"
grep(pattern = "k", x = states, value = TRUE)

## [1] "Alaska"       "Arkansas"     "Kentucky"     "Nebraska"     "New York"
## [6] "North Dakota" "Oklahoma"     "South Dakota"

# get states names with w or W
grep(pattern = "[wW]", x = states, value = TRUE)

##  [1] "Delaware"      "Hawaii"        "Iowa"          "New Hampshire"
##  [5] "New Jersey"    "New Mexico"    "New York"      "Washington"
##  [9] "West Virginia" "Wisconsin"     "Wyoming"

#another solution is to convert all the state names to lower case
# get states names with w (the matches are returned in lower case)
grep(pattern = "w", x = tolower(states), value = TRUE)

##  [1] "delaware"      "hawaii"        "iowa"          "new hampshire"
##  [5] "new jersey"    "new mexico"    "new york"      "washington"
##  [9] "west virginia" "wisconsin"     "wyoming"

# alternatively get states names with  W (the matches are returned in upper case)
grep(pattern = "W", x = toupper(states), value = TRUE)

##  [1] "DELAWARE"      "HAWAII"        "IOWA"          "NEW HAMPSHIRE"
##  [5] "NEW JERSEY"    "NEW MEXICO"    "NEW YORK"      "WASHINGTON"
##  [9] "WEST VIRGINIA" "WISCONSIN"     "WYOMING"

#or we can ask to ignore case (the original spelling is kept)
grep(pattern = "w", x = states, value = TRUE, ignore.case = TRUE)

##  [1] "Delaware"      "Hawaii"        "Iowa"          "New Hampshire"
##  [5] "New Jersey"    "New Mexico"    "New York"      "Washington"
##  [9] "West Virginia" "Wisconsin"     "Wyoming"

head(USArrests)

##            Murder Assault UrbanPop Rape
## Alabama      13.2     236       58 21.2
## Alaska       10.0     263       48 44.5
## Arizona       8.1     294       80 31.0
## Arkansas      8.8     190       50 19.5
## California    9.0     276       91 40.6
## Colorado      7.9     204       78 38.7

# names of states (stored as the row names of USArrests, not in a column;
# USArrests$state would be NULL)
states = rownames(USArrests)
#understand histogram
#it will show how many times each value appear
x=c(1,1,1,2,2,3)
hist(x)

# histogram for no of characters in states
#use to show no of amino acid sequences in many protein sequences
hist(nchar(states), main = "Histogram",
     xlab = "number of characters in US State names")

#We can use gregexpr() to get the positions of every match of a searched
#pattern in each element of a character vector. When there is no match, we get -1
# position of a's
positions_a = gregexpr(pattern = "a", text = states, ignore.case = TRUE)
#it will give 1,3,5,7 for Alabama

#but if we want to count number of a's in each state
# a first position of -1 means "no match", otherwise count the positions;
# vapply() guarantees exactly one integer per state
num_a = vapply(positions_a,
               function(pos) if (pos[1] > 0) length(pos) else 0L,
               integer(1))


#but easy way is to use stringr library
library(stringr)

# total number of a's, remember it is case sensitive: upper-case "A" is not counted
str_count(states, "a")

##  [1] 3 2 1 2 2 1 0 2 1 1 2 1 0 2 1 2 0 2 1 2 2 1 1 0 0 2 2 2 1 0 0 0 2 2 0 2 0 2
## [39] 1 2 2 0 1 1 0 1 1 1 0 0

# lower-casing the names first counts "A" as well
str_count(tolower(states),"a")

##  [1] 4 3 2 3 2 1 0 2 1 1 2 1 0 2 1 2 0 2 1 2 2 1 1 0 0 2 2 2 1 0 0 0 2 2 0 2 0 2
## [39] 1 2 2 0 1 1 0 1 1 1 0 0

#USE COUNT TOTAL NUMBER OF A G T C in DNA Sequence
#lets count all the vowels
# vector of vowels
vowels = c("a", "e", "i", "o", "u")
# lower-case the names once so that upper-case vowels are counted too
states_lower = tolower(states)

# total number of occurrences of each vowel over all state names
num_vowels = vapply(vowels,
                    function(v) sum(str_count(states_lower, v)),
                    integer(1), USE.NAMES = FALSE)

# add vowel names
names(num_vowels) = vowels

# total number of vowels
num_vowels

##  a  e  i  o  u
## 61 28 44 36  8

# sort them in decreasing order
sort(num_vowels, decreasing = TRUE)

##  a  i  o  e  u
## 61 44 36 28  8

# barplot
barplot(num_vowels,main = "Number of vowels in USA States names",
border = NA,xlab ="vowels", ylim = c(0, 80))

#character function character(). this function creates character vectors
# empty string: two quotes with nothing in between (" " would be a string
# that contains one space, not an empty string)
empty_str = ""
# class of empty_str
class(empty_str)

## [1] "character"

#another way to create empty character vector not the string
#empty character vector
empty_chr = character(0)
class(empty_chr)

## [1] "character"

#what is the difference

# length of empty string is 1 (one element that holds zero characters)
length(empty_str)

## [1] 1

# length of empty character vector is 0 (no elements at all)
length(empty_chr)

## [1] 0

#we just have to mention length of characters. It creates empty character vectors
# character vector with 5 empty strings
char_vector = character(5)
# display
char_vector

## [1] "" "" "" "" ""

#we can add components to the empty characters 

# another example
example = character(0)
example

## character(0)

#check its length
length(example)

## [1] 0

# add first element
example[1] = "first"
example

## [1] "first"

# check its length again
length(example)

## [1] 1

#we can add other elements. does not have to be in order.Missing values are filled with NA. 
example[4] = "fourth"
example

## [1] "first"  NA       NA       "fourth"

length(example)

## [1] 4

#sister functions is.character and as.character
#first for testing character and second for converting something to character
# define two objects 
a = "test me"
b = 8 + 9
# are a and b characters 
is.character(a)

## [1] TRUE

is.character(b)

## [1] FALSE

#this can be done by class 
class (a)

## [1] "character"

class (b)

## [1] "numeric"

#convert non character to character 
b = as.character(b)

##now move to strings
#in vector and matrix if there is mixture of number and characters, character will dominate.
#in data frame the strings were converted into factors by default before R 4.0.0
# to turn this off in older versions of R use stringsAsFactors = FALSE (the default since R 4.0.0)
#with list we can combine whatever data we want 
list(1:5, letters[1:5], rnorm(5))

## [[1]]
## [1] 1 2 3 4 5
## 
## [[2]]
## [1] "a" "b" "c" "d" "e"
## 
## [[3]]
## [1] -1.5384467  0.3912095  1.2583841  1.8089722 -2.1600295

importing data

#reading raw text use readlines()
# read ktop100.txt file
top105 = readLines("http://www.textfiles.com/music/ktop100.txt")
View(top105)
# how many lines
length(top105)

## [1] 123

# inspecting last 10 elements
tail(top105, n = 10)

##  [1] "101. SMASHING PUMPKINS          SIVA"                       
##  [2] "102. ELVIS COSTELLO             OTHER SIDE OF ..."          
##  [3] "103. SEERS                      PSYCHE OUT"                 
##  [4] "104. THRILL KILL CULT           SEX ON WHEELZ"              
##  [5] "105. MATTHEW SWEET              I'VE BEEN WAITING"          
##  [6] "105.3  LATOUR                   PEOPLE ARE STILL HAVING SEX"
##  [7] ""                                                           
##  [8] "Ed"                                                         
##  [9] "ed@wente.llnl.gov"                                          
## [10] ""

string manipulation

#most versatile paste() function
# paste(..., sep="", collapse=NULL)
#... means can take any number of strings
#what separator want to use, space comma, collapse means combine strings as one 

# paste
PI = paste("The life of", pi)
# paste
IloveR = paste("I", "love", "R", sep = "-")

#If we give paste() objects of different length, then it will apply a recycling rule.
# paste with objects of different lengths. Here there are 5 digits but only one X. so
paste("X", 1:5, sep = ".")

## [1] "X.1" "X.2" "X.3" "X.4" "X.5"

# paste with collapsing
paste(1:3, c("!", "?", "+"), sep = "", collapse = "")

## [1] "1!2?3+"

# paste without collapsing
paste(1:3, c("!", "?", "+"), sep = "")

## [1] "1!" "2?" "3+"

#paste0() is the equivalent of paste(..., sep = ""); it does not collapse a vector by itself
# joining without a separator with paste0
paste0("lets", "collapse", "all", "these", "words")

## [1] "letscollapseallthesewords"

printing character

my_string = "programming with data is fun"
# print string
print(my_string)

## [1] "programming with data is fun"

#if you dont want quote 
print(my_string, quote = FALSE)

## [1] programming with data is fun

# noquote fucntion. prints same as above 
noquote(my_string)

## [1] programming with data is fun

#actually noquotes creates a different class of object called noquote
no_quotes = noquote(c("some", "quoted", "text", "!%^(&="))
# display
no_quotes

## [1] some   quoted text   !%^(&=

class(no_quotes)

## [1] "noquote"

#concatenate
#syntax cat(..., file = "", sep = " ", fill = FALSE, labels = NULL, append = FALSE)
# simply print with cat(), and it does not print numeric line indicator
cat(my_string)

## programming with data is fun

#combine strings
cat(my_string, "with R")

## programming with data is fun with R

# especifying sep
cat(my_string, "with R", sep = " =) ")

## programming with data is fun =) with R

#can work on vectors to
#first four months
cat(month.name[1:4], sep = " ")

## January February March April

#fill argument can be used to display in nice format 
# fill = 30
cat("Loooooooooong strings", "can be displayed", "in a nice format",
"by using the fill argument", fill = 30)

## Loooooooooong strings 
## can be displayed 
## in a nice format 
## by using the fill argument

# cat with output in a given file. Does not work in markdown before knitting
cat(my_string, "with R", file = "output.txt")

encoding strings with format () and sprintf()

#this function helps to format the output like intendations,
#sceintific format, number of digits in number etc.


#sprintf provides nice formatting of string and vectors 
# print with sign (positive)
#this can be used to add > sign in front of fasta files pi can be replaced by fasta sequence
sprintf("%+f", pi)

## [1] "+3.141593"

#function toString() can be used to help sprintf fucntion
#toString converts R objects to string


# combining several objects
toString(c("Bonjour", 123, TRUE, NA, log(exp(1))))

## [1] "Bonjour, 123, TRUE, NA, 1"

#width argument can be used to have certain lenght. this could be used for trimming purpose
toString(c("one", "two", "3333333333"), width = 12)

## [1] "one, two...."

some basic string manipulations

# how many characters?
nchar(c("How", "many", "characters?"))

## [1]  3  4 11

# how many characters?. White spaces are also counted as characters
nchar("How many characters?")

## [1] 20

# how many elements? here is just 1. 
length("How many characters?")

## [1] 1

#character tranlation by chartr()
#syntax chartr(old, new, x)
# replace a by A. old and new should have same number of characters
chartr("a", "A", "This is a boring string")

## [1] "This is A boring string"

#substr is used both to extract and replace characters
#syntax substr(x, start, stop)
# extract bcd
substr("abcdef", 2, 4)

## [1] "bcd"

# replace 2nd letter with hash symbol
x = c("may", "the", "force", "be", "with", "you")
substr(x, 2, 2) <- "#"

#replace with substring
#syntax substring(text, first, last = 1000000L)
# same as substr
substring("ABCDEF", 2, 4)

## [1] "BCD"

# extract each letter
substring("ABCDEF", 1:6, 1:6)

## [1] "A" "B" "C" "D" "E" "F"

# multiple replacement with recycling. Dont know where is canbe used. Can be used
#t replace more strings in a cycle.

set operations

#union function
# two character vectors
set1 = c("some", "random", "words", "some")
set2 = c("some", "many", "none", "few")
# union of set1 and set2 duplicates are removed
union(set1, set2)

## [1] "some"   "random" "words"  "many"   "none"   "few"

#intersect function
# two character vectors
set3 = c("some", "random", "few", "words")
set4 = c("some", "many", "none", "few")
# intersect of set3 and set4
intersect(set3, set4)

## [1] "some" "few"

#difference of the elementsbetween two character vectors. This can be done with setdiff():
# two character vectors
set5 = c("some", "random", "few", "words")
set6 = c("some", "many", "none", "few")
# difference between set5 and set6
setdiff(set5, set6)

## [1] "random" "words"

#setequal() allows us to test the equality of two character vectors.
# three character vectors
set7 = c("some", "random", "strings")
set8 = c("some", "many", "none", "few")
set9 = c("strings", "random", "some")
# set7 == set8?
setequal(set7, set8)

## [1] FALSE

#identical() tests if they are exactly equal or not


#is.element() tests whether a certain element is present in a vector

set10 = c("some", "stuff", "to", "play", "with")
elem1 = "play"
# elem1 in set10?
is.element(elem1, set10)

## [1] TRUE

set11 = c("today", "produced", "example", "beautiful", "a", "nicely")
# sort (increasing order, the default)
sort(set11)

## [1] "a"         "beautiful" "example"   "nicely"    "produced"  "today"

# sort (decreasing order)
sort(set11, decreasing = TRUE)

## [1] "today"     "produced"  "nicely"    "example"   "beautiful" "a"

#repition with rep(). combination of paste and rep is one of the several methods
# repeat x 4 times
paste(rep("x", 4), collapse = "")

## [1] "xxxx"

string manipulation with stringr

library(stringr)
#Concatenating with str_c(). This is the stringr counterpart of paste(), but its default separator is ""
# default usage
str_c("May", "The", "Force", "Be", "With", "You")

## [1] "MayTheForceBeWithYou"

# removing zero length objects
str_c("May", "The", "Force", NULL, "Be", "With", "You", character(0))

## [1] "MayTheForceBeWithYou"

# changing separator
str_c("May", "The", "Force", "Be", "With", "You", sep = "_")

## [1] "May_The_Force_Be_With_You"

# synonym function str_join
#str_join("May", "The", "Force", "Be", "With", "You", sep = "-")

#Number of characters with str_length()

# some text (NA included)
some_text = c("one", "two", "three", NA, "five")
# compare
#str_length with nchar
nchar(some_text)

## [1]  3  3  5 NA  4

# (older versions of R, before 3.3.0, reported 2 for the missing value,
# i.e. the length of the string "NA": 3 3 5 2 4)
str_length(some_text)

## [1]  3  3  5 NA  4

#str_length() also accepts factors: it counts the characters of the labels,
#whereas nchar() throws an error when it is given a factor
# some factor
some_factor = factor(c(1, 1, 1, 2, 2, 2), labels = c("good", "bad"))
some_factor

## [1] good good good bad  bad  bad 
## Levels: good bad

# number of characters of each label
str_length(some_factor)

## [1] 4 4 4 3 3 3

#To extract substrings from a character vector stringr provides str_sub() which is equivalent
#to substring().  str_sub(string, start = 1L, end = -1L)
# some text
lorem = "Lorem Ipsum"
# apply 
#str_sub
str_sub(lorem, start = 1, end = 5)

## [1] "Lorem"

# another example
str_sub("adios", 1:3)

## [1] "adios" "dios"  "ios"

# str_subcan work with with negative positions


#replacing Loremwith Nullam
lorem = "Lorem Ipsum"
str_sub(lorem, 1, 5) <- "Nullam"
lorem

## [1] "Nullam Ipsum"

## [1] "Nullam Ipsum"


# multiple replacements
lorem = "Lorem Ipsum"
str_sub(lorem, c(1, 7), c(5, 8)) <- c("Nullam", "Enim")
lorem

## [1] "Nullam Ipsum"  "Lorem Enimsum"

## [1] "Nullam Ipsum" "Lorem Enimsum"

#duplication with str_dup
# default usage
str_dup("hola", 3)

## [1] "holaholahola"

#padding nice paragraph printing
# quote (by Douglas Adams)
some_quote = c(
"I may not have gone",
"where I intended to go,",
"but I think I have ended up",
"where I needed to be")

# str_wrap() wraps every element of a vector separately, so the four short
# pieces must first be joined into one single string to be re-flowed
quote_text = paste(some_quote, collapse = " ")

# display paragraph with width=30
cat(str_wrap(quote_text, width = 30))

## (the quote is printed as one paragraph whose lines are broken at
## about 30 characters)

#display paragraph with first line indentation of 2
# "\n" ends the output with a line break
cat(str_wrap(quote_text, width = 30, indent = 2), "\n")

## (same paragraph, with the first line indented by two spaces)

#str_trim () for trimming the white space 
#str_trim(string, side = "both")
# text with whitespaces
bad_text = c("This", " example ", "has several ", " whitespaces ")
# remove whitespaces on the right side
str_trim(bad_text, side = "right")

## [1] "This"         " example"     "has several"  " whitespaces"

## [1] "This" " example" "has several" " whitespaces"



#Word extraction with word()
#word(string, start = 1L, end = start, sep = fixed(" "))
# some sentence
change = c("Be the change", "you want to be")
# extract first word
word(change, 1)

## [1] "Be"  "you"

## [1] "Be" "you"
# extract second word
word(change, 2)

## [1] "the"  "want"

regular expressions

#metacharacters . $ ^ * { } ( )
#use \\ if literal meaning is needed
#remove the $ sign (replace it with the empty string)
# string
money = "$money"
# the right way in R
sub(pattern = "\\$", replacement = "", x = money)

## [1] "money"

#sub replaces first match while gsub replaces all the matches
#sequences
#\\d digits, \\D non digits and so on (the backslash must be doubled inside an R string)

# replace digit with _
sub("\\d", "_", "the dandelion war 2010")

## [1] "the dandelion war _010"

## [1] "the dandelion war _010"
gsub("\\d", "_", "the dandelion war 2010")

## [1] "the dandelion war ____"

## [1] "the dandelion war ____"

#Spaces and non-spaces
# replace space with _
sub("\\s", "_", "the dandelion war 2010")

## [1] "the_dandelion war 2010"

## [1] "the_dandelion war 2010"
gsub("\\s", "_", "the dandelion war 2010")

## [1] "the_dandelion_war_2010"

## [1] "the_dandelion_war_2010"




#character class

# some string
transport = c("car", "bike", "plane", "boat")
# look for e or i
grep(pattern = "[ei]", transport, value = TRUE)

## [1] "bike"  "plane"

#USED TO CREATE FASTA FORMAT
# la vie (string)
# \n is the escape sequence for a new line and \t the one for a tab
la_vie = "La vie en #FFC0CB (rose);\nC'est la vie! \ttres jolie"
# if you print, the escape sequences are shown as they were typed
la_vie

## [1] "La vie en #FFC0CB (rose);\nC'est la vie! \ttres jolie"

print(la_vie)

## [1] "La vie en #FFC0CB (rose);\nC'est la vie! \ttres jolie"

# if you cat la_vie, the escape sequences are interpreted
cat(la_vie)

## La vie en #FFC0CB (rose);
## C'est la vie!    tres jolie
# here \n new line and \t tab



##QUANTIFIERS ? * + {n} {n,m}
# people names
people = c("rori", "emilia", "matteo", "mehmet", "filipe", "anna", "tyler",
"rasmus", "jacob", "youna", "flora", "adi")

# match m at most once ("m?" also matches zero m's, so every name is returned)
grep(pattern = "m?", people, value = TRUE)

##  [1] "rori"   "emilia" "matteo" "mehmet" "filipe" "anna"   "tyler" 
##  [8] "rasmus" "jacob"  "youna"  "flora"  "adi"

# match m exactly once ("m{1}" is the same pattern as "m": it returns every
# name that contains an m, including "mehmet" which has two)
grep(pattern = "m{1}", people, value = TRUE, perl = FALSE)

## [1] "emilia" "matteo" "mehmet" "rasmus"

# match t exactly twice in a row: the quantifier is written {2}
grep(pattern = "t{2}", people, value = TRUE)

## [1] "matteo"

#stringr regex functions
#str_detect() Detect the presence or absence of a pattern in a string
#str_extract() Extract first piece of a string that matches a pattern
#str_extract_all() Extract all pieces of a string that match a pattern
#str_match() Extract first matched group from a string
#str_match_all() Extract all matched groups from a string
#str_locate() Locate the position of the first occurrence of a pattern in a string
#str_locate_all() Locate the position of all occurrences of a pattern in a string
#str_replace() Replace first occurrence of a matched pattern in a string
#str_replace_all() Replace all occurrences of a matched pattern in a string
#str_split() Split up a string into a variable number of pieces
#str_split_fixed() Split up a string into a fixed number of pieces

#grep()
# some text
text = c("one word", "a sentence", "you and me", "three two one")

# pattern
pat = "one"

# default usage
grep(pat, text)

## [1] 1 4

# with value (showing matched text)
grep(pat, text, value = TRUE)

## [1] "one word"      "three two one"

# with invert (showing unmatched parts)
grep(pat, text, invert = TRUE)

## [1] 2 3

## [1] 2 3
# same with values
grep(pat, text, invert = TRUE, value = TRUE)

## [1] "a sentence" "you and me"

#sub and gsub 
#strsplit(text, pattern)
# a sentence
sentence = c("R is a collaborative project with many contributors")
# split into words
strsplit(sentence, " ")

## [[1]]
## [1] "R"             "is"            "a"             "collaborative"
## [5] "project"       "with"          "many"          "contributors"

# telephone numbers
tels = c("510-548-2238", "707-231-2440", "650-752-1300")
# split each number into its portions
strsplit(tels, "-")

## [[1]]
## [1] "510"  "548"  "2238"
## 
## [[2]]
## [1] "707"  "231"  "2440"
## 
## [[3]]
## [1] "650"  "752"  "1300"

####stringr functions3
#Detecting patterns with str detect()
#some objects
some_objs = c("pen", "pencil", "marker", "spray")
str_detect(some_objs, "pen")

## [1]  TRUE  TRUE FALSE FALSE

#str_extract
# tweets about Paris
paris_tweets = c(
"#Paris is chock-full of cultural and culinary attractions",
"Some time in #Paris along Canal St.-Martin famous by #Amelie",
"While you're in #Paris, stop at cafe: http://goo.gl/yaCbW",
"Paris, the city of light")
# hashtag pattern: a # followed by one or more letters
hash = "#[a-zA-Z]{1,}"
# extract (first) hashtag that it finds will not extract other words with hashtags
str_extract(paris_tweets, hash)

## [1] "#Paris" "#Paris" "#Paris" NA

# extract (all) hashtags: one character vector per tweet
str_extract_all(paris_tweets, hash)

## [[1]]
## [1] "#Paris"
## 
## [[2]]
## [1] "#Paris"  "#Amelie"
## 
## [[3]]
## [1] "#Paris"
## 
## [[4]]
## character(0)

# locate position of (first) hashtag
str_locate(paris_tweets, hash)

##      start end
## [1,]     1   6
## [2,]    14  19
## [3,]    17  22
## [4,]    NA  NA
#Gives the table of position from start to end (NA when there is no hashtag)

#str_replace
# city names
cities = c("San Francisco", "Barcelona", "Naples", "Paris")
# replace first matched vowel
str_replace(cities, "[aeiou]", ";")

## [1] "S;n Francisco" "B;rcelona"     "N;ples"        "P;ris"

# city names
cities = c("San Francisco", "Barcelona", "Naples", "Paris")
# replace all matched vowel
str_replace_all(cities, pattern = "[aeiou]", ";")

## [1] "S;n Fr;nc;sc;" "B;rc;l;n;"     "N;pl;s"        "P;r;s"

## [1] "S;n Fr;nc;sc;" "B;rc;l;n;" "N;pl;s" "P;r;s"

# replace all matched consonants
str_replace_all(cities, pattern = "[^aeiou]", ";")

## [1] ";a;;;;a;;i;;o" ";a;;e;o;a"     ";a;;e;"        ";a;i;"

#str_split()

# string
flavors = c("chocolate", "vanilla", "cinnamon", "mint", "lemon")
# split by vowels
str_split(flavors, "[aeiou]")

## [[1]]
## [1] "ch" "c"  "l"  "t"  ""  
## 
## [[2]]
## [1] "v"  "n"  "ll" ""  
## 
## [[3]]
## [1] "c"  "nn" "m"  "n" 
## 
## [[4]]
## [1] "m"  "nt"
## 
## [[5]]
## [1] "l" "m" "n"

R : Learning String Manipulation

Lok Raj Joshi

July 28, 2018

importing data

string manipulation

printing character

encoding strings with format () and sprintf()

some basic string manipulations

set operations

string manipulation with stringr

regular expressions