# nchar() returns the number of characters in a string
nchar("India") # 5
## [1] 5
# Blanks are counted too: one leading blank + five letters + one trailing blank
nchar(" India ") # 7
## [1] 7
southasia <- c("Bangladesh", "Bhutan", "India", "Nepal", "Pakistan", "Sri Lanka")
nchar(x = southasia)
## [1] 10 6 5 5 8 9
# We get one character count per element of `southasia`.
# We see that functions in *R* are good at working with vectors.
# paste(): join strings, separated by `sep` (a single space when not given)
paste("Everybody", "loves", "stats.")
## [1] "Everybody loves stats."
paste("Everybody", "loves", "stats.", sep = "-")
## [1] "Everybody-loves-stats."
# An empty separator glues the pieces together directly;
# paste0(...) is the shorthand for paste(..., sep = "")
paste("Everybody", "loves", "stats.", sep = "")
## [1] "Everybodylovesstats."
Note that R expressions can be interpolated in paste().
# Any R expression may be passed to paste(); its value is converted to text
paste("The square root", "of twice pi is approximately", sqrt(2 * pi))
## [1] "The square root of twice pi is approximately 2.506628274631"
# A length-2 argument yields two strings: paste() is vectorised, and the
# shorter (length-1) prefix is reused for each root
paste("The square root of 16 is", c(1, -1) * sqrt(16))
## [1] "The square root of 16 is 4" "The square root of 16 is -4"
foods <- c("fish.", "cake.")
stooges <- c("Moe", "Larry", "Curly")
# Three stooges but only two foods: the third sentence reuses the first food
paste(stooges, "loves", foods, sep = " ")
## [1] "Moe loves fish." "Larry loves cake." "Curly loves fish."
Note that vectors are treated as usual and recycling is applied when necessary.
# sprintf() fills each %s placeholder in turn, element by element;
# the trailing period is part of the format string itself
foods2 <- c("fish", "cake", "fries")
sprintf("%s loves %s.", stooges, foods2)
## [1] "Moe loves fish." "Larry loves cake." "Curly loves fries."
Note that sprintf() allows a more natural way to insert a period in each sentence.
# Two names against four foods: sprintf() produces four sentences
foods3 <- c("fish", "cake", "fries", "steak")
stooges3 <- c("Moe", "Larry")
sprintf("%s loves %s.", stooges3, foods3)
## [1] "Moe loves fish." "Larry loves cake." "Moe loves fries."
## [4] "Larry loves steak."
Note the use of vector recycling here: stooges3 is extended to c("Moe", "Larry", "Moe", "Larry").
# substr(x, start, stop): extract the characters of x from position `start`
# through position `stop`
substr(x = "Statistics", start = 1, stop = 4)
## [1] "Stat"
# extracts characters 1 through 4
substr("Statistics", -2, 4)
## [1] "Stat"
# Same as previous; indexes 0, -1, and -2 are simply ignored
substr("Statistics", 7, 10)
## [1] "tics"
substr("Statistics", 7, 12)
## [1] "tics"
# Same as previous. Indexes 11 and 12, being empty, are ignored
substr(stooges, 1, 3)
## [1] "Moe" "Lar" "Cur"
# Vectors are treated as usual: the first three characters of each element.
cities <- c("New York, NY", "Boston, MA", "Los Angeles, CA")
# `start` and `stop` may be vectors too: here they pick the last two
# characters of each element
substr(cities, nchar(cities) - 1, nchar(cities))
## [1] "NY" "MA" "CA"
# extracts the states.
# strsplit() breaks a string at every occurrence of the delimiter and returns
# a list with one character vector per input string; the leading "/" produces
# an empty first piece
path <- "/home/data/trials.csv"
strsplit(path, "/")
## [[1]]
## [1] "" "home" "data" "trials.csv"
A vector of strings can be used instead of just one string. The delimiter can be a regular expression. Perl-compatible regular expressions can be used as delimiters by adding the argument perl = TRUE.
# Undoing strsplit(): paste() with `collapse` joins the elements of a single
# character vector into one string, separated by the given delimiter
VectorOfText <- c("", "home", "data", "trials.csv")
paste(VectorOfText, collapse = "/")
## [1] "/home/data/trials.csv"
Here’s another way to get the same result:
# Split `path` at "/" and immediately glue the pieces back together
paste(
  strsplit(x = path, split = "/")[[1L]],
  collapse = "/"
)
## [1] "/home/data/trials.csv"
s <- "Curly is the smart one. Curly is funny, too."
# sub() replaces only the first match ...
sub(pattern = "Curly", replacement = "Moe", x = s)
## [1] "Moe is the smart one. Curly is funny, too."
# ... whereas gsub() replaces every match
gsub(pattern = "Curly", replacement = "Moe", x = s)
## [1] "Moe is the smart one. Moe is funny, too."
The thing to be replaced can be a regular expression. Vectors are welcome.
sports <- c("Baseball", "Football", "Basketball")
states <- c("NY", "IL", "CA", "TX")
# outer() applies paste() to every state/sport pair: rows follow `states`,
# columns follow `sports`, and "*" is forwarded to paste() as the separator
m <- outer(X = states, Y = sports, FUN = paste, sep = "*")
m
## [,1] [,2] [,3]
## [1,] "NY*Baseball" "NY*Football" "NY*Basketball"
## [2,] "IL*Baseball" "IL*Football" "IL*Basketball"
## [3,] "CA*Baseball" "CA*Football" "CA*Basketball"
## [4,] "TX*Baseball" "TX*Football" "TX*Basketball"
# Drop the entries strictly below the diagonal; what remains is returned
# column by column
m[!lower.tri(outer(X = states, Y = sports, FUN = paste, sep = "*"))]
## [1] "NY*Baseball" "NY*Football" "IL*Football" "NY*Basketball"
## [5] "IL*Basketball" "CA*Basketball"
# Same selection, reusing the stored matrix instead of rebuilding it
m[!lower.tri(m)]
## [1] "NY*Baseball" "NY*Football" "IL*Football" "NY*Basketball"
## [5] "IL*Basketball" "CA*Basketball"
As the outputs of the above commands are character strings or vectors of character strings, they can be fruitfully used with functions that take character strings (or vectors thereof) as inputs.
The stringr package is often used in conjunction with regular expressions. This package has the functions str_split, str_sub, str_detect, str_trim, str_extract, str_replace, str_replace_all, etc.
# Install stringr only if it is missing, then attach it.
# requireNamespace() checks for this one package directly; installed.packages()
# would scan every installed package just to answer the same question.
if (!requireNamespace("stringr", quietly = TRUE)) {
  install.packages("stringr")
}
library("stringr")
text <- c("arm", "leg", "head", "foot", "hand", "hindleg", "elbow")
# Indices of the elements that contain an "o": the 4th and the 7th
grep(pattern = "o", x = text)
## [1] 4 7
# "o{2}" asks for two consecutive o's; only the 4th element has "oo"
grep(pattern = "o{2}", x = text)
## [1] 4
# With value = TRUE the matching elements themselves are returned: "foot"
grep(pattern = "o{2}", x = text, value = TRUE)
## [1] "foot"
# No element contains "ooo", so the result is an empty character vector
grep(pattern = "o{3}", x = text, value = TRUE)
## character(0)
# regexpr() reports, per element, the position of the first match (-1 if
# there is none) and carries the match length along as an attribute
regexpr(pattern = "o", text = text)
## [1] -1 -1 -1 2 -1 -1 4
## attr(,"match.length")
## [1] -1 -1 -1 1 -1 -1 1
## attr(,"index.type")
## [1] "chars"
## attr(,"useBytes")
## [1] TRUE
grep(pattern = "[[:alnum:]]{4,}", x = text, value = TRUE)
## [1] "head" "foot" "hand" "hindleg" "elbow"
Shows elements of text with 4 or more alphanumeric characters: “arm” and “leg” are dropped.
# {7,} requires a run of at least seven alphanumeric characters
grep(pattern = "[[:alnum:]]{7,}", x = text, value = TRUE)
## [1] "hindleg"
Shows elements of text with 7 or more alphanumeric characters: only “hindleg” survives.
# Count how many times "o" occurs in each element of `text`.
# gregexpr() returns one vector of match positions per element, so its length
# is the number of matches -- except that "no match" is reported as a single
# -1, which also has length 1.
freq <- lengths(gregexpr("o", text))
# 1 where "o" occurs at least once, 0 where regexpr() reports no match (-1)
present <- ifelse(regexpr("o", text) < 0, 0, 1)
# Multiplying zeroes out the spurious count of 1 for elements without a match
freq * present
## [1] 0 0 0 2 0 0 1
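This is hard to follow at first sight, so here is what it does. gregexpr() returns, for each element of text, the positions of all matches of "o"; the length of that result is therefore the number of o's, except that an element with no match is reported as a single -1, which also has length 1. The second line computes an indicator that is 1 where "o" is present and 0 where it is not, and multiplying the two removes those spurious counts, leaving the number of o's in each element.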
# charmatch() identifies the unique element that begins in a specified way,
# if it exists.
charmatch("e", text) # 7; because only element 7 begins with e
## [1] 7
charmatch("h", text) # 0; because more than one element begins with h
## [1] 0
charmatch("he", text) # 3; because only element 3 begins with he
## [1] 3
charmatch("hf", text) # NA; because no element begins with hf
## [1] NA