Import the stringr library -
> library(stringr)
All of stringr's functions start with str_.
Function: str_length()
> str_length("Do the work")
[1] 11
> str_length(c("This","GOOgle","AeioU"))
[1] 4 6 5
Spaces are counted as characters.
Or the base R function: nchar()
> nchar("Do the work")
[1] 11
> nchar(c("This","GOOgle","AeioU"))
[1] 4 6 5
Function: str_to_upper()
> str_to_upper("don't go there")
[1] "DON'T GO THERE"
> str_to_upper(c("This","GOOgle","AeioU"))
[1] "THIS" "GOOGLE" "AEIOU"
> str_to_lower("don't go there")
[1] "don't go there"
> str_to_lower(c("This","GOOgle","AeioU"))
[1] "this" "google" "aeiou"
> str_to_title("the fINAL destiNATION")
[1] "The Final Destination"
> str_to_sentence("this Is Nothing BUT a sentence")
[1] "This is nothing but a sentence"
> str_trim(" This is good! ")
[1] "This is good!"
> str_trim(" This is good! ", side="right")
[1] " This is good!"
> str_trim(" This is good! ", side="left")
[1] "This is good! "
> rbind(
+ str_pad(c("do"), width="10", side="both",pad="$"),
+ str_pad(c("do"), width="10", side="both",pad=" "),
+ str_pad(c("this"), width="10", side="right",pad="-"),
+ str_pad(c("no way"), width="10",side="left",pad="-")
+ )
[,1]
[1,] "$$$$do$$$$"
[2,] "    do    "
[3,] "this------"
[4,] "----no way"
> string1 <- "An introduction to the science of life."
> rbind(
+ str_trunc(string1, width=30, "right"),
+ str_trunc(string1, width=30, "left"),
+ str_trunc(string1, width=30, "center")
+ )
[,1]
[1,] "An introduction to the scie..."
[2,] "...ion to the science of life."
[3,] "An introductio...ence of life."
> sen1 <- "This is OK, but how! Alas! And, there was nothing!"
> str_split(sen1, pattern=" ")
[[1]]
[1] "This" "is" "OK," "but" "how!" "Alas!"
[7] "And," "there" "was" "nothing!"
> str_split(sen1, ", ")
[[1]]
[1] "This is OK" "but how! Alas! And" "there was nothing!"
> str_split(sen1, "! ")
[[1]]
[1] "This is OK, but how" "Alas"
[3] "And, there was nothing!"
The simplify argument determines what is returned. If FALSE (the default), str_split() returns a list of character vectors. If TRUE, it returns a character matrix -
> str_split(sen1, " ", simplify = F)
[[1]]
[1] "This" "is" "OK," "but" "how!" "Alas!"
[7] "And," "there" "was" "nothing!"
> str_split(sen1, " ", simplify = T)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
[1,] "This" "is" "OK," "but" "how!" "Alas!" "And," "there" "was" "nothing!"
> sen1 <- "This is OK, but how! Alas! And, there was nothing!"
> str_split(sen1, pattern=" ", n=8)
[[1]]
[1] "This" "is" "OK,"
[4] "but" "how!" "Alas!"
[7] "And," "there was nothing!"
Here the sentence is split into 8 parts.
Using str_c() -
> str_c("Join", "this!", sep="_")
[1] "Join_this!"
> str_c( c("No","Yes"),
+ c("way", "yesss"),
+ sep = " ")
[1] "No way" "Yes yesss"
It is equivalent to the function paste() from base R -
> paste("Join", "this!", sep="_")
[1] "Join_this!"
> paste( c("No","Yes"),
+ c("way", "yesss"),
+ sep = " ")
[1] "No way" "Yes yesss"
Collapse a vector of strings into a single string -
> str_c(string = c("this","is","just","an","example"), sep="_")
[1] "this" "is" "just" "an" "example"
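NOTICE: The vector above was returned unchanged, because sep only joins separate arguments (use collapse = "_" to join the elements of a single vector). When the words are passed as separate arguments, they are joined -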
> str_c("this","is","just","an","example", sep="_")
[1] "this_is_just_an_example"
Let's look at a vector of strings that should be of numeric type -
> cost <- c("1,579","1,975","3,000","2,500")
> as.numeric(cost)
[1] NA NA NA NA
So, we have to remove the commas from the values before converting into numeric -
> cost <- str_remove(string = cost, pattern = ",")
> cost # see the vector
[1] "1579" "1975" "3000" "2500"
> as.numeric(cost)
[1] 1579 1975 3000 2500
By default, str_replace_na() turns NA into the string “NA” -
> string3 <- c("one word", NA, "another", "word")
> string3
[1] "one word" NA "another" "word"
> str_replace_na(string3)
[1] "one word" "NA" "another" "word"
But using the replacement argument, NA can be replaced with anything -
> str_replace_na(string3, replacement = 99)
[1] "one word" "99" "another" "word"
str_sort() sorts in alphabetical order -
> string4 <- c("b","a","b","B","q","Y")
> str_sort(string4)
[1] "a" "b" "b" "B" "q" "Y"
str_order() returns the order of the strings -
> str_order(string4)
[1] 2 1 3 4 5 6
> string4[str_order(string4)] # equivalent to str_sort()
[1] "a" "b" "b" "B" "q" "Y"
Here base R’s sort() can also be used -
> 'b' < 'B'
[1] TRUE
> sort(string4)
[1] "a" "b" "b" "B" "q" "Y"
Using the function str_glue() -
> fname <- c("Ahsanul","Ataur","Mehedi")
> lname <- c("Islam","Rahman","Hasan")
> str_glue("He is my friend, {fname} {lname}.")
He is my friend, Ahsanul Islam.
He is my friend, Ataur Rahman.
He is my friend, Mehedi Hasan.
Mathematical operations can be done inside the {} -
> f <- 10
> ages <- c(21, 22, 19)
> str_glue("{fname} {lname} is {ages} years old now. \nAfter {f} years he will be {ages+f} years old.\n\n")
Ahsanul Islam is 21 years old now.
After 10 years he will be 31 years old.
Ataur Rahman is 22 years old now.
After 10 years he will be 32 years old.
Mehedi Hasan is 19 years old now.
After 10 years he will be 29 years old.
Another one -
> nums <- c(10, 100, 5, 9)
> str_glue("The square of {nums} is {nums**2}")
The square of 10 is 100
The square of 100 is 10000
The square of 5 is 25
The square of 9 is 81
Using str_extract_all() and using the argument boundary(“word”) -
> str_extract_all("This is an example word. How is it?", pattern = boundary("word"))
[[1]]
[1] "This" "is" "an" "example" "word" "How" "is"
[8] "it"
It returns a list of vectors-
> sen2 <- c("This is an example","Second, Example","wow! Nice.")
> str_extract_all(sen2, boundary("word"))
[[1]]
[1] "This" "is" "an" "example"
[[2]]
[1] "Second" "Example"
[[3]]
[1] "wow" "Nice"
> str_extract_all(sen2, boundary("word"), simplify = T) # Returns a matrix
[,1] [,2] [,3] [,4]
[1,] "This" "is" "an" "example"
[2,] "Second" "Example" "" ""
[3,] "wow" "Nice" "" ""
Note: Boundary takes the values - “character”, “line_break”, “sentence”, “word”.
> str_extract_all("THIS is nothing but a sentence. He is right", pattern=fixed("is", ignore_case = T))
[[1]]
[1] "IS" "is" "is"
> str_extract_all("THIS is nothing but a sentence. He is right",
+ pattern="is|but") # the 'IS' from 'THIS' will NOT be matched (the pattern is case-sensitive)
[[1]]
[1] "is" "but" "is"
Looks for either is or but.
Using str_subset() -
> fruit <- c("apple", "banana", "Pear", "pinapple")
> str_subset(fruit, "b") # Looks for b in any position
[1] "banana"
> str_subset(fruit, "^a") # Looks for a in first position
[1] "apple"
> str_subset(fruit, "a$") # Looks for a in last position
[1] "banana"
> str_subset(fruit, "[aeiou]") # Looks for any of the patterns in any position
[1] "apple" "banana" "Pear" "pinapple"
> str_subset(fruit, "[A-Z]") # Looks for any of the uppercase letters A to Z
[1] "Pear"
The argument negate determines what strings to return, matched or not matched -
> str_subset(fruit, "b")
[1] "banana"
> str_subset(fruit, "b", negate = T)
[1] "apple" "Pear" "pinapple"
Suppose we have two different words that are similar in meaning but have slightly different spellings. In this case we can get a matrix in return, containing the matched words and the part matched by each capture group, using str_match() -
> str_match(c("color","colour","eye"), "colo(|u)r") # "col(o|ou)r" can be used also
[,1] [,2]
[1,] "color" ""
[2,] "colour" "u"
[3,] NA NA
The 3rd row is NA because the pattern didn’t match here.
We can also get only the matched words by using a non-capturing group, (?:...) -
> str_match(c("color","colour","eye"), "col(?:o|ou)r")
[,1]
[1,] "color"
[2,] "colour"
[3,] NA
Using str_detect() -
> str_detect(fruit, "a")
[1] TRUE TRUE TRUE TRUE
> str_detect(fruit, "^a")
[1] TRUE FALSE FALSE FALSE
> str_detect(fruit, "a$")
[1] FALSE TRUE FALSE FALSE
> str_detect(fruit, "[aeiou]")
[1] TRUE TRUE TRUE TRUE
str_which() shows the index of the matching string-
> string5 <- "I can can the can. This is nothing but fun. But how?"
> str_split(string5," ")[[1]]
[1] "I" "can" "can" "the" "can." "This" "is"
[8] "nothing" "but" "fun." "But" "how?"
> str_which(str_split(string5," ")[[1]], pattern = "can")
[1] 2 3 5
The above example takes a vector of strings as input.
> string5
[1] "I can can the can. This is nothing but fun. But how?"
str_locate_all() returns a list of matrices -
> str_locate_all(string5, "can")
[[1]]
start end
[1,] 3 5
[2,] 7 9
[3,] 15 17
> str_locate_all(string5, c("can","fun"))
[[1]]
start end
[1,] 3 5
[2,] 7 9
[3,] 15 17
[[2]]
start end
[1,] 40 42
> word("A small example of string.", start=1, end=4)
[1] "A small example of"
> word("A small example of string.", start=3, end=-1)
[1] "example of string."
Negative values in end count backwards from the last word.
Look at the example string5 -
> string5
[1] "I can can the can. This is nothing but fun. But how?"
Using str_count() -
> str_count(string5, "can")
[1] 3
> str_sub("A small example of string.", start=3)
[1] "small example of string."
> str_sub("A small example of string.", end=3)
[1] "A s"
> str_sub("A small example of string.", start=3, end=10)
[1] "small ex"
To replace all matches -
> string6 <- "Everyone loves cat. Cat is sweet, and cat is smol."
> str_replace_all(string6,
+ pattern = fixed("cat", ignore_case = T),
+ replacement = "dog")
[1] "Everyone loves dog. dog is sweet, and dog is smol."
Applying function as replacement -
> str_replace_all(string6, pattern = fixed("cat", ignore_case = T),
+ replacement = toupper)
[1] "Everyone loves CAT. CAT is sweet, and CAT is smol."
More examples -
> fruits <- c("one apple", "two pears", "three bananas")
> str_replace(fruits, "[aeiou]", "-") # replaces only the first match
[1] "-ne apple" "tw- pears" "thr-e bananas"
> str_replace_all(fruits, "[aeiou]", "-") # replaces all
[1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
> str_replace_all(fruits, "two", NA_character_) # turns the matched string into NA
[1] "one apple" NA "three bananas"
> str_replace(fruits, "([aeiou])", "") # deletes the matched string
[1] "ne apple" "tw pears" "thre bananas"
To learn more about regex and stringr, click here.
Answers to R homework questions by experts at https://www.homeworkhelponline.net/programming/r-programming.