R语言字符串函数详解

一、以下为stringr包的字符串函数：

1. 字符串的大小写转换

str_to_upper(string, locale = "")
str_to_lower(string, locale = "")
str_to_title(string, locale = "")

string：字符串，亦可为字符串向量；locale：设置语种。

# install.packages('stringr') 安装包
library(stringr)  #加载包
dog <- "The quick brown dog"
dog1 <- c("The", "quick", "brown", "dog")
# 帮助文档的实例中一般都省略参数名，个人习惯补全参数名，容易理解
str_to_upper(string = dog)  #将英文字符串转换成大写

## [1] "THE QUICK BROWN DOG"

str_to_upper(string = dog1)  #string为字符向量时

## [1] "THE"   "QUICK" "BROWN" "DOG"

str_to_lower(string = dog)  #将英文字符串转换成小写

## [1] "the quick brown dog"

str_to_title(string = dog)  #将英文字符串中的单词首字母转换大写

## [1] "The Quick Brown Dog"

# locale可设置不同的语种
str_to_upper(string = "i", locale = "en")  # English

## [1] "I"

str_to_upper(string = "i", locale = "tr")  # Turkish

## [1] "<U+0130>"

2. invert_match 返回非匹配模式的起始结束位置

invert_match(loc)，loc是以函数str_locate_all()获取的位置矩阵作为输入。

numbers <- "1 and 2 and 4 and 456"
num_loc <- str_locate_all(string = numbers, pattern = "[0-9]+")[[1]]  #匹配数字，返回数字的起始和结束位置
num_loc

##      start end
## [1,]     1   1
## [2,]     7   7
## [3,]    13  13
## [4,]    19  21

str_sub(string = numbers, start = num_loc[, "start"], end = num_loc[, "end"])

## [1] "1"   "2"   "4"   "456"

text_loc <- invert_match(loc = num_loc)  #返回不匹配数字的起始和结束位置
text_loc

##      start end
## [1,]     0   0
## [2,]     2   6
## [3,]     8  12
## [4,]    14  18
## [5,]    22  -1

str_sub(string = numbers, start = text_loc[, "start"], end = text_loc[, "end"])

## [1] ""      " and " " and " " and " ""

3. modifiers 指定模式的类别

3.1 fixed(pattern, ignore_case = FALSE)：Compare literal bytes in the string. This is very fast, but not usually what you want for non-ASCII character sets.

3.2 coll(pattern, ignore_case = FALSE, locale = NULL, …)：Compare strings respecting standard collation rules.

3.3 regex(pattern, ignore_case = FALSE, multiline = FALSE, comments = FALSE, dotall = FALSE, …)：默认使用正则表达式

3.4 boundary(type = c("character", "line_break", "sentence", "word"), skip_word_none = TRUE, …)：Match boundaries between things.

pattern： Pattern to modify behaviour.
ignore_case： Should case differences be ignored in the match?
locale： Locale to use for comparisons. See stri_locale_list() for all possible options.
…： Other less frequently used arguments passed onto stri_opts_collator, stri_opts_regex, or stri_opts_brkiter
multiline： If TRUE, $ and ^ match the beginning and end of each line. If FALSE, the default, only match the start and end of the input.
comments： If TRUE, whitespace and comments beginning with # are ignored. Escape literal spaces with "\\ ".
dotall： If TRUE, . will also match line terminators.
type： Boundary type to detect.
skip_word_none： Ignore “words” that don’t contain any characters or numbers - i.e. punctuation.

pattern <- "a.b"
strings <- c("abb", "a.b")
str_detect(string = strings, pattern = pattern)

## [1] TRUE TRUE

str_detect(string = strings, pattern = fixed(pattern))

## [1] FALSE  TRUE

str_detect(string = strings, pattern = coll(pattern))

## [1] FALSE  TRUE

# coll() is useful for locale-aware case-insensitive matching
i <- c("I", "<U+0130>", "i")
i

## [1] "I"        "<U+0130>" "i"

str_detect(string = i, pattern = regex("i", TRUE))

## [1]  TRUE FALSE  TRUE

str_detect(string = i, pattern = fixed("i", TRUE))

## [1]  TRUE FALSE  TRUE

str_detect(string = i, pattern = coll("i", TRUE))

## [1]  TRUE FALSE  TRUE

str_detect(string = i, pattern = coll("i", TRUE, locale = "tr"))

## [1] FALSE FALSE  TRUE

# Word boundaries 单词边界
words <- c("These are some words.")
str_count(string = words, pattern = boundary("word"))  #统计语句中单词的个数

## [1] 4

str_split(string = words, pattern = " ")[[1]]  #将语句分割成单词，最后一个单词带有标点

## [1] "These"  "are"    "some"   "words."

str_split(string = words, pattern = boundary("word"))[[1]]  #最后一个单词不带有标点

## [1] "These" "are"   "some"  "words"

# 使用正则表达式
str_extract_all(string = "The Cat in the Hat", pattern = "[a-z]+")  #区分大小写

## [[1]]
## [1] "he"  "at"  "in"  "the" "at"

str_extract_all(string = "The Cat in the Hat", pattern = regex("[a-z]+", TRUE))  #忽略大小写的差异

## [[1]]
## [1] "The" "Cat" "in"  "the" "Hat"

str_extract_all(string = "a\nb\nc", pattern = "^.")

## [[1]]
## [1] "a"

str_extract_all(string = "a\nb\nc", pattern = regex("^.", multiline = TRUE))

## [[1]]
## [1] "a" "b" "c"

str_extract_all(string = "a\nb\nc", pattern = "a.")

## [[1]]
## character(0)

str_extract_all(string = "a\nb\nc", pattern = regex("a.", dotall = TRUE))

## [[1]]
## [1] "a\n"

4. str_c 连接字符串

4.1 str_c(…, sep = "", collapse = NULL)

4.2 str_join(…, sep = "", collapse = NULL)，同str_c

sep：设置向量间的连接符；collapse：将向量的所有元素连接成一个字符串时，设置元素间的连接符。

str_c("Letter", letters[1:5])  #连接2个字符向量

## [1] "Lettera" "Letterb" "Letterc" "Letterd" "Lettere"

str_c("Letter", letters[1:5], sep = ": ")  #sep可设置向量间的连接符

## [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e"

# 连接3个不等长的字符向量，出现警告信息，短的向量的元素会自动循环至与最长向量等长。
str_c(letters[1:5], " is for", c("...", "***"))

## [1] "a is for..." "b is for***" "c is for..." "d is for***" "e is for..."

str_c(letters)

##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"

str_c(letters, sep = "-")  #当只有1个向量时，sep参数不起作用

##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"

# 将向量的所有元素连接成一个字符串，collapse设置元素间的连接符
str_c(letters, collapse = "")

## [1] "abcdefghijklmnopqrstuvwxyz"

str_c(letters, collapse = ", ")

## [1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"

str_c("Letter", letters[1:5], sep = ":", collapse = ",")  #同时设置sep和collapse时，与以下语句等价：

## [1] "Letter:a,Letter:b,Letter:c,Letter:d,Letter:e"

a <- str_c("Letter", letters[1:5], sep = ":")
a

## [1] "Letter:a" "Letter:b" "Letter:c" "Letter:d" "Letter:e"

str_c(a, collapse = ",")

## [1] "Letter:a,Letter:b,Letter:c,Letter:d,Letter:e"

# 当字符向量中存在缺失值时，返回的也是缺失值：
str_c(c("a", NA, "b"), "-d")

## [1] "a-d" NA    "b-d"

# 使用函数str_replace_na 将缺失值替换成‘NA’:
str_c(str_replace_na(c("a", NA, "b")), "-d")

## [1] "a-d"  "NA-d" "b-d"

5. str_conv 指定字符串的编码

str_conv(string, encoding)

x <- rawToChar(as.raw(177))
x

## [1] "\xb1"

str_conv(string = x, encoding = "ISO-8859-2")  # Polish 'a with ogonek'

## [1] "<U+0105>"

str_conv(string = x, encoding = "ISO-8859-1")  # Plus-minus

## [1] "±"

6. str_count 计算字符串中的匹配模式的数目

str_count(string, pattern = "")

fruit <- c("apple", "banana", "pear", "pineappleapple")
str_count(string = fruit, pattern = "a")  #计算向量fruit的每个元素含有a的数目

## [1] 1 3 1 2

str_count(string = fruit, pattern = "p")

## [1] 2 0 1 5

str_count(string = fruit, pattern = "ap")

## [1] 1 0 0 2

str_count(string = fruit, pattern = "[a-e]")

## [1] 2 4 2 5

# string与pattern按元素一一对应：依次统计'apple'中'a'、'banana'中'b'、'pear'中'p'、'pineappleapple'中'p'的个数：
str_count(string = fruit, pattern = c("a", "b", "p", "p"))

## [1] 1 1 1 5

str_count(string = c("a.", "...", ".a.a"), pattern = ".")  #正则表达式中‘.’是指单个字符，不仅仅是字符‘.’

## [1] 2 3 4

str_count(string = c("a.", "...", ".a.a"), pattern = fixed("."))  #fixed('.')指字符‘.’

## [1] 1 3 2

7. str_detect 检测字符串中是否存在某种模式

str_detect(string, pattern)，结果返回逻辑向量

fruit <- c("apple1", "banana", "pear", "pinapple")
str_detect(string = fruit, pattern = "a")  #fruit的元素是否包含a

## [1] TRUE TRUE TRUE TRUE

str_detect(string = fruit, pattern = "pp")

## [1]  TRUE FALSE FALSE  TRUE

str_detect(string = fruit, pattern = "^a")  #fruit的元素是否以a开头

## [1]  TRUE FALSE FALSE FALSE

str_detect(string = fruit, pattern = "a$")  #fruit的元素是否以a结尾

## [1] FALSE  TRUE FALSE FALSE

str_detect(string = fruit, pattern = "[aeiou]")  #fruit的元素是否包含[aeiou]中的一个字符

## [1] TRUE TRUE TRUE TRUE

# 向量化运算中，短的向量的元素会被循环使用
str_detect(string = fruit, pattern = c("ap", "ba", "pe", "pin"))

## [1] TRUE TRUE TRUE TRUE

# string的长度大于pattern的长度，结果返回与string等长的逻辑向量：
str_detect(string = fruit, pattern = c("ap", "pin"))

## [1]  TRUE FALSE FALSE  TRUE

# string的长度小于pattern的长度，结果返回与pattern等长的逻辑向量：
str_detect(string = fruit, pattern = c("ap", "ba", "pe", "pin", "e1"))

## [1] TRUE TRUE TRUE TRUE TRUE

8. str_dup 重复和连接字符串向量

str_dup(string, times)

fruit <- c("apple", "pear", "banana")
str_dup(string = fruit, times = 2)  # 向量的每个元素重复2次，然后连接起来

## [1] "appleapple"   "pearpear"     "bananabanana"

str_dup(string = fruit, times = 1:3)

## [1] "apple"              "pearpear"           "bananabananabanana"

str_c("ba", str_dup("na", 0:5))

## [1] "ba"           "bana"         "banana"       "bananana"    
## [5] "banananana"   "bananananana"

9. str_extract 从字符串中提取匹配的模式

str_extract(string, pattern) 提取匹配的第一个字符串

str_extract_all(string, pattern, simplify = FALSE) 提取匹配的所有字符串

shopping_list <- c("apples 4x4", "bag of flour", "bag of sugar", "milk x2")

# 提取匹配模式的第一个字符串
str_extract(string = shopping_list, pattern = "\\d")  #提取数字

## [1] "4" NA  NA  "2"

str_extract(string = shopping_list, pattern = "[a-z]+")  #提取字母

## [1] "apples" "bag"    "bag"    "milk"

str_extract(string = shopping_list, pattern = "[a-z]{1,4}")

## [1] "appl" "bag"  "bag"  "milk"

str_extract(string = shopping_list, pattern = "\\b[a-z]{1,4}\\b")

## [1] NA     "bag"  "bag"  "milk"

# 提取所有匹配模式的字符串，结果返回一个列表
str_extract_all(string = shopping_list, pattern = "[a-z]+")

## [[1]]
## [1] "apples" "x"     
## 
## [[2]]
## [1] "bag"   "of"    "flour"
## 
## [[3]]
## [1] "bag"   "of"    "sugar"
## 
## [[4]]
## [1] "milk" "x"

str_extract_all(string = shopping_list, pattern = "\\b[a-z]+\\b")

## [[1]]
## [1] "apples"
## 
## [[2]]
## [1] "bag"   "of"    "flour"
## 
## [[3]]
## [1] "bag"   "of"    "sugar"
## 
## [[4]]
## [1] "milk"

str_extract_all(string = shopping_list, pattern = "\\d")

## [[1]]
## [1] "4" "4"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "2"

# 提取所有匹配模式的字符串，结果返回一个矩阵，通过simplify = TRUE设置
str_extract_all(string = shopping_list, pattern = "\\b[a-z]+\\b", simplify = TRUE)

##      [,1]     [,2] [,3]   
## [1,] "apples" ""   ""     
## [2,] "bag"    "of" "flour"
## [3,] "bag"    "of" "sugar"
## [4,] "milk"   ""   ""

str_extract_all(string = shopping_list, pattern = "\\d", simplify = TRUE)

##      [,1] [,2]
## [1,] "4"  "4" 
## [2,] ""   ""  
## [3,] ""   ""  
## [4,] "2"  ""

10. str_length 字符串的长度

str_length(string)

str_length(letters)

##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1

str_length(NA)

## [1] NA

str_length(factor("abc"))

## [1] 3

str_length(c("i", "like", "programming", NA))

## [1]  1  4 11 NA

# Two ways of representing a u with an umlaut
u1 <- "ü"
u2 <- stringi::stri_trans_nfd(u1)
# They print the same:
u1

## [1] "ü"

u2

## [1] "u<U+0308>"

# But have a different length
str_length(u1)

## [1] 1

str_length(u2)

## [1] 2

# Even though they have the same number of characters
str_count(u1)

## [1] 1

str_count(u2)

## [1] 1

11. str_locate 定位在字符串中匹配模式的开始和结束位置

str_locate(string, pattern)：返回匹配的第一个字符串的位置

str_locate_all(string, pattern)：返回匹配的所有位置

fruit <- c("apple", "banana", "pear", "pineapple")
# 返回匹配的第一个字符串的位置：
str_locate(string = fruit, pattern = "a")

##      start end
## [1,]     1   1
## [2,]     2   2
## [3,]     3   3
## [4,]     5   5

str_locate(string = fruit, pattern = "ap")

##      start end
## [1,]     1   2
## [2,]    NA  NA
## [3,]    NA  NA
## [4,]     5   6

str_locate(string = fruit, pattern = c("a", "b", "p", "p"))

##      start end
## [1,]     1   1
## [2,]     1   1
## [3,]     1   1
## [4,]     1   1

# 返回匹配的所有位置：
str_locate_all(string = fruit, pattern = "a")

## [[1]]
##      start end
## [1,]     1   1
## 
## [[2]]
##      start end
## [1,]     2   2
## [2,]     4   4
## [3,]     6   6
## 
## [[3]]
##      start end
## [1,]     3   3
## 
## [[4]]
##      start end
## [1,]     5   5

str_locate_all(string = fruit, pattern = "e")

## [[1]]
##      start end
## [1,]     5   5
## 
## [[2]]
##      start end
## 
## [[3]]
##      start end
## [1,]     2   2
## 
## [[4]]
##      start end
## [1,]     4   4
## [2,]     9   9

str_locate_all(string = fruit, pattern = c("a", "b", "p", "p"))

## [[1]]
##      start end
## [1,]     1   1
## 
## [[2]]
##      start end
## [1,]     1   1
## 
## [[3]]
##      start end
## [1,]     1   1
## 
## [[4]]
##      start end
## [1,]     1   1
## [2,]     6   6
## [3,]     7   7

# 查找每个字符的位置
str_locate_all(string = fruit, pattern = "")

## [[1]]
##      start end
## [1,]     1   1
## [2,]     2   2
## [3,]     3   3
## [4,]     4   4
## [5,]     5   5
## 
## [[2]]
##      start end
## [1,]     1   1
## [2,]     2   2
## [3,]     3   3
## [4,]     4   4
## [5,]     5   5
## [6,]     6   6
## 
## [[3]]
##      start end
## [1,]     1   1
## [2,]     2   2
## [3,]     3   3
## [4,]     4   4
## 
## [[4]]
##       start end
##  [1,]     1   1
##  [2,]     2   2
##  [3,]     3   3
##  [4,]     4   4
##  [5,]     5   5
##  [6,]     6   6
##  [7,]     7   7
##  [8,]     8   8
##  [9,]     9   9

12. str_match 从字符串中提取匹配组

str_match(string, pattern) 提取匹配的第一个字符串

str_match_all(string, pattern) 提取匹配的所有字符串

strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569", "387 287 6718", 
    "apple", "233.398.9187 ", "482 952 3315", "239 923 8115 and 842 566 4692", 
    "Work: 579-499-7527", "$1000", "Home: 543.355.3679")

phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

str_extract(string = strings, pattern = phone)  #返回每个元素中第一个匹配的字符串

##  [1] "219 733 8965" "329-293-8753" NA             "595 794 7569"
##  [5] "387 287 6718" NA             "233.398.9187" "482 952 3315"
##  [9] "239 923 8115" "579-499-7527" NA             "543.355.3679"

str_match(string = strings, pattern = phone)  #返回第一个匹配的完整字符串，同时返回各捕获组（括号内子模式）匹配的子字符串

##       [,1]           [,2]  [,3]  [,4]  
##  [1,] "219 733 8965" "219" "733" "8965"
##  [2,] "329-293-8753" "329" "293" "8753"
##  [3,] NA             NA    NA    NA    
##  [4,] "595 794 7569" "595" "794" "7569"
##  [5,] "387 287 6718" "387" "287" "6718"
##  [6,] NA             NA    NA    NA    
##  [7,] "233.398.9187" "233" "398" "9187"
##  [8,] "482 952 3315" "482" "952" "3315"
##  [9,] "239 923 8115" "239" "923" "8115"
## [10,] "579-499-7527" "579" "499" "7527"
## [11,] NA             NA    NA    NA    
## [12,] "543.355.3679" "543" "355" "3679"

# Extract/match all
str_extract_all(string = strings, pattern = phone)

## [[1]]
## [1] "219 733 8965"
## 
## [[2]]
## [1] "329-293-8753"
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "595 794 7569"
## 
## [[5]]
## [1] "387 287 6718"
## 
## [[6]]
## character(0)
## 
## [[7]]
## [1] "233.398.9187"
## 
## [[8]]
## [1] "482 952 3315"
## 
## [[9]]
## [1] "239 923 8115" "842 566 4692"
## 
## [[10]]
## [1] "579-499-7527"
## 
## [[11]]
## character(0)
## 
## [[12]]
## [1] "543.355.3679"

str_match_all(string = strings, pattern = phone)

## [[1]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "219 733 8965" "219" "733" "8965"
## 
## [[2]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "329-293-8753" "329" "293" "8753"
## 
## [[3]]
##      [,1] [,2] [,3] [,4]
## 
## [[4]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "595 794 7569" "595" "794" "7569"
## 
## [[5]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "387 287 6718" "387" "287" "6718"
## 
## [[6]]
##      [,1] [,2] [,3] [,4]
## 
## [[7]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "233.398.9187" "233" "398" "9187"
## 
## [[8]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "482 952 3315" "482" "952" "3315"
## 
## [[9]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "239 923 8115" "239" "923" "8115"
## [2,] "842 566 4692" "842" "566" "4692"
## 
## [[10]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "579-499-7527" "579" "499" "7527"
## 
## [[11]]
##      [,1] [,2] [,3] [,4]
## 
## [[12]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "543.355.3679" "543" "355" "3679"

13. str_order 对字符向量进行排序

str_order(x, decreasing = FALSE, na_last = TRUE, locale = "", …)

str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "", …)

str_order(x = letters, locale = "en")

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26

str_sort(x = letters, locale = "en")

##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"

str_order(x = letters, locale = "haw")

##  [1]  1  5  9 15 21  2  3  4  6  7  8 10 11 12 13 14 16 17 18 19 20 22 23
## [24] 24 25 26

str_sort(x = letters, locale = "haw")

##  [1] "a" "e" "i" "o" "u" "b" "c" "d" "f" "g" "h" "j" "k" "l" "m" "n" "p"
## [18] "q" "r" "s" "t" "v" "w" "x" "y" "z"

14. str_pad 在字符串的前后位置填充字符（如空格）

str_pad(string, width, side = c("left", "right", "both"), pad = " ")

width：填充字符后字符串的长度；
side：填充字符串的位置，默认为left；
pad：指定填充的字符串；

rbind(str_pad("hadley", 30, "left"), str_pad("hadley", 30, "right"), str_pad("hadley", 
    30, "both"))

##      [,1]                            
## [1,] "                        hadley"
## [2,] "hadley                        "
## [3,] "            hadley            "

str_pad(string = c("a", "abc", "abcdef"), width = 10)

## [1] "         a" "       abc" "    abcdef"

str_pad(string = "a", width = c(5, 10, 20))

## [1] "    a"                "         a"           "                   a"

str_pad(string = "a", width = 10, pad = c("-", "_", "+"))

## [1] "---------a" "_________a" "+++++++++a"

# 当设置width小于string的长度时，结果返回原string
str_pad(string = "hadley", width = 3, pad = "-")

## [1] "hadley"

str_pad("hadley", width = 8, side = "left", pad = "-")

## [1] "--hadley"

str_pad("hadley", width = 8, side = "right", pad = "-")

## [1] "hadley--"

str_pad("hadley", width = 8, side = "both", pad = "-")

## [1] "-hadley-"

15. str_replace 替换字符串中的匹配模式

str_replace(string, pattern, replacement)

str_replace_all(string, pattern, replacement)

fruits <- c("one apple", "two pears", "three bananas")
str_replace(string = fruits, pattern = "[aeiou]", replacement = "-")  #替换第一个匹配的字符

## [1] "-ne apple"     "tw- pears"     "thr-e bananas"

str_replace_all(string = fruits, pattern = "[aeiou]", replacement = "-")  #替换所有匹配的字符

## [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

str_replace(string = fruits, pattern = "([aeiou])", replacement = "")

## [1] "ne apple"     "tw pears"     "thre bananas"

str_replace(string = fruits, pattern = "([aeiou])", replacement = "\\1\\1")

## [1] "oone apple"     "twoo pears"     "threee bananas"

str_replace(string = fruits, pattern = "[aeiou]", replacement = c("1", "2", 
    "3"))

## [1] "1ne apple"     "tw2 pears"     "thr3e bananas"

str_replace(string = fruits, pattern = c("a", "e", "i"), replacement = "-")

## [1] "one -pple"     "two p-ars"     "three bananas"

fruits <- c("one apple", "two pears", "three bananas")
str_replace(string = fruits, pattern = "[aeiou]", replacement = "-")

## [1] "-ne apple"     "tw- pears"     "thr-e bananas"

str_replace_all(string = fruits, pattern = "[aeiou]", replacement = "-")

## [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"

str_replace_all(string = fruits, pattern = "([aeiou])", replacement = "")

## [1] "n ppl"    "tw prs"   "thr bnns"

str_replace_all(string = fruits, pattern = "([aeiou])", replacement = "\\1\\1")

## [1] "oonee aapplee"      "twoo peeaars"       "threeee baanaanaas"

str_replace_all(string = fruits, pattern = "[aeiou]", replacement = c("1", "2", 
    "3"))

## [1] "1n1 1ppl1"     "tw2 p22rs"     "thr33 b3n3n3s"

str_replace_all(string = fruits, pattern = c("a", "e", "i"), replacement = "-")

## [1] "one -pple"     "two p-ars"     "three bananas"

# 对一个字符串同时应用多个指定规则进行替换：
strings1 <- str_c(fruits, collapse = "---")
strings1

## [1] "one apple---two pears---three bananas"

str_replace_all(string = strings1, pattern = c(one = 1, two = 2, three = 3))

## [1] "1 apple---2 pears---3 bananas"

16. str_replace_na 将缺失值替换成‘NA’

str_replace_na(string, replacement = "NA")

str_replace_na(c(NA, "abc", "def"))

## [1] "NA"  "abc" "def"

17. str_split 根据一个分隔符将字符串进行分割

str_split(string, pattern, n = Inf)#结果返回列表

str_split_fixed(string, pattern, n)#必须设置n，结果返回矩阵

fruits <- c("apples and oranges and pears and bananas", "pineapples and mangos and guavas")
str_split(string = fruits, pattern = " and ")

## [[1]]
## [1] "apples"  "oranges" "pears"   "bananas"
## 
## [[2]]
## [1] "pineapples" "mangos"     "guavas"

str_split_fixed(string = fruits, pattern = " and ", n = 4)

##      [,1]         [,2]      [,3]     [,4]     
## [1,] "apples"     "oranges" "pears"  "bananas"
## [2,] "pineapples" "mangos"  "guavas" ""

# 通过设置n，指定最多分割成n块
str_split(string = fruits, pattern = " and ", n = 3)  #将字符串分割成3部分

## [[1]]
## [1] "apples"            "oranges"           "pears and bananas"
## 
## [[2]]
## [1] "pineapples" "mangos"     "guavas"

str_split(string = fruits, pattern = " and ", n = 2)  #将字符串分割成2部分

## [[1]]
## [1] "apples"                        "oranges and pears and bananas"
## 
## [[2]]
## [1] "pineapples"        "mangos and guavas"

str_split(string = fruits, pattern = " and ", n = 5)  #n大于实际可分割的块数时，按实际块数返回

## [[1]]
## [1] "apples"  "oranges" "pears"   "bananas"
## 
## [[2]]
## [1] "pineapples" "mangos"     "guavas"

str_split_fixed(string = fruits, pattern = " and ", n = 3)

##      [,1]         [,2]      [,3]               
## [1,] "apples"     "oranges" "pears and bananas"
## [2,] "pineapples" "mangos"  "guavas"

str_split_fixed(string = fruits, pattern = " and ", n = 4)

##      [,1]         [,2]      [,3]     [,4]     
## [1,] "apples"     "oranges" "pears"  "bananas"
## [2,] "pineapples" "mangos"  "guavas" ""

str_split_fixed(string = fruits, pattern = " and ", n = 6)

##      [,1]         [,2]      [,3]     [,4]      [,5] [,6]
## [1,] "apples"     "oranges" "pears"  "bananas" ""   ""  
## [2,] "pineapples" "mangos"  "guavas" ""        ""   ""

18. str_sub 按位置从字符向量中提取或替换子字符串

str_sub(string, start = 1L, end = -1L) 提取子字符串

str_sub(string, start = 1L, end = -1L) <- value 替换子字符串

hw <- "Hadley Wickham"
str_sub(string = hw, start = 1, end = 6)

## [1] "Hadley"

str_sub(string = hw, end = 6)

## [1] "Hadley"

str_sub(string = hw, start = 8, end = 14)

## [1] "Wickham"

str_sub(string = hw, start = 8)

## [1] "Wickham"

str_sub(string = hw, start = c(1, 8), end = c(6, 14))

## [1] "Hadley"  "Wickham"

# 使用负值索引
str_sub(string = hw, start = -1)

## [1] "m"

str_sub(string = hw, start = -7)

## [1] "Wickham"

str_sub(string = hw, end = -7)

## [1] "Hadley W"

# 从函数str_locate_all的结果传入位置参数
pos <- str_locate_all(hw, "[aeio]")[[1]]
pos

##      start end
## [1,]     2   2
## [2,]     5   5
## [3,]     9   9
## [4,]    13  13

str_sub(string = hw, pos)

## [1] "a" "e" "i" "a"

str_sub(string = hw, start = pos[, 1], end = pos[, 2])

## [1] "a" "e" "i" "a"

# 向量化
str_sub(hw, start = seq_len(str_length(hw)))

##  [1] "Hadley Wickham" "adley Wickham"  "dley Wickham"   "ley Wickham"   
##  [5] "ey Wickham"     "y Wickham"      " Wickham"       "Wickham"       
##  [9] "ickham"         "ckham"          "kham"           "ham"           
## [13] "am"             "m"

str_sub(hw, end = seq_len(str_length(hw)))

##  [1] "H"              "Ha"             "Had"            "Hadl"          
##  [5] "Hadle"          "Hadley"         "Hadley "        "Hadley W"      
##  [9] "Hadley Wi"      "Hadley Wic"     "Hadley Wick"    "Hadley Wickh"  
## [13] "Hadley Wickha"  "Hadley Wickham"

# 替换
x <- "BBCDEF"
str_sub(x, 1, 1) <- "A"
x

## [1] "ABCDEF"

str_sub(x, -1, -1) <- "K"
x

## [1] "ABCDEK"

str_sub(x, -2, -2) <- "GHIJ"
x

## [1] "ABCDGHIJK"

str_sub(x, 2, -2) <- ""
x

## [1] "AK"

19. str_subset 提取匹配模式的字符串向量元素

str_subset(string, pattern)

fruit <- c("apple", "banana", "pear", "pinapple")
str_subset(string = fruit, pattern = "a")

## [1] "apple"    "banana"   "pear"     "pinapple"

str_subset(string = fruit, pattern = "ap")

## [1] "apple"    "pinapple"

str_subset(string = fruit, pattern = "^a")

## [1] "apple"

str_subset(string = fruit, pattern = "a$")

## [1] "banana"

str_subset(string = fruit, pattern = "[aeiou]")

## [1] "apple"    "banana"   "pear"     "pinapple"

# 有缺失值时
str_subset(string = c("a", NA, "b"), pattern = ".")

## [1] "a" "b"

20. str_trim 删除字符串首尾的空白字符（空格、制表符、换行符等）

str_trim(string, side = c("both", "left", "right"))

str_trim(" String with trailing and leading white space\t")

## [1] "String with trailing and leading white space"

str_trim("\n\nString with trailing and leading white space\n\n")

## [1] "String with trailing and leading white space"

21. str_wrap 将字符串按指定宽度自动换行排版

str_wrap(string, width = 80, indent = 0, exdent = 0)

width：每行的宽度
indent：设置首行缩进
exdent：设置第二行后每行缩进

thanks_path <- file.path(R.home("doc"), "THANKS")
thanks <- str_c(readLines(thanks_path), collapse = "\n")
thanks <- word(thanks, 1, 3, fixed("\n\n"))
cat(str_wrap(thanks), "\n")

## R would not be what it is today without the invaluable help of these people,
## who contributed by donating code, bug fixes and documentation: Valerio Aimale,
## Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben Bolker, David Brahm, Goran
## Brostrom, Patrick Burns, Vince Carey, Saikat DebRoy, Brian D'Urso, Lyndon
## Drake, Dirk Eddelbuettel, Claus Ekstrom, Sebastian Fischmeister, John Fox,
## Paul Gilbert, Yu Gong, Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,
## Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan de Leeuw,
## Jim Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John Maindonald,
## David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley, Richard O'Keefe,
## Hubert Palme, Roger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony Rossini,
## Jonathan Rougier, Petr Savicky, Guenther Sawitzki, Marc Schwartz, Detlef Steuer,
## Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner,
## Bill Venables, Gregory R. Warnes, Andreas Weingessel, Morten Welinder, James
## Wettenhall, Simon Wood and Achim Zeileis. Others have written code that has been
## adopted by R and is acknowledged in the code files, including

cat(str_wrap(string = thanks, width = 70), "\n")

## R would not be what it is today without the invaluable help of these
## people, who contributed by donating code, bug fixes and documentation:
## Valerio Aimale, Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben
## Bolker, David Brahm, Goran Brostrom, Patrick Burns, Vince Carey,
## Saikat DebRoy, Brian D'Urso, Lyndon Drake, Dirk Eddelbuettel, Claus
## Ekstrom, Sebastian Fischmeister, John Fox, Paul Gilbert, Yu Gong,
## Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn, Robert King,
## Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan de Leeuw, Jim
## Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John
## Maindonald, David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve
## Oncley, Richard O'Keefe, Hubert Palme, Roger D. Peng, Jose' C.
## Pinheiro, Tony Plate, Anthony Rossini, Jonathan Rougier, Petr Savicky,
## Guenther Sawitzki, Marc Schwartz, Detlef Steuer, Bill Simpson,
## Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner, Bill
## Venables, Gregory R. Warnes, Andreas Weingessel, Morten Welinder,
## James Wettenhall, Simon Wood and Achim Zeileis. Others have written
## code that has been adopted by R and is acknowledged in the code files,
## including

cat(str_wrap(string = thanks, width = 60, indent = 6), "\n")

##       R would not be what it is today without the invaluable help
## of these people, who contributed by donating code, bug fixes
## and documentation: Valerio Aimale, Thomas Baier, Henrik
## Bengtsson, Roger Bivand, Ben Bolker, David Brahm, Goran
## Brostrom, Patrick Burns, Vince Carey, Saikat DebRoy, Brian
## D'Urso, Lyndon Drake, Dirk Eddelbuettel, Claus Ekstrom,
## Sebastian Fischmeister, John Fox, Paul Gilbert, Yu Gong,
## Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,
## Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe
## Lambert, Jan de Leeuw, Jim Lindsey, Patrick Lindsey,
## Catherine Loader, Gordon Maclean, John Maindonald, David
## Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley,
## Richard O'Keefe, Hubert Palme, Roger D. Peng, Jose' C.
## Pinheiro, Tony Plate, Anthony Rossini, Jonathan Rougier,
## Petr Savicky, Guenther Sawitzki, Marc Schwartz, Detlef
## Steuer, Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry
## Therneau, Rolf Turner, Bill Venables, Gregory R. Warnes,
## Andreas Weingessel, Morten Welinder, James Wettenhall, Simon
## Wood and Achim Zeileis. Others have written code that has
## been adopted by R and is acknowledged in the code files,
## including

cat(str_wrap(string = thanks, width = 80, indent = 6, exdent = 2), "\n")

##       R would not be what it is today without the invaluable help of these people,
##   who contributed by donating code, bug fixes and documentation: Valerio Aimale,
##   Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben Bolker, David Brahm, Goran
##   Brostrom, Patrick Burns, Vince Carey, Saikat DebRoy, Brian D'Urso, Lyndon
##   Drake, Dirk Eddelbuettel, Claus Ekstrom, Sebastian Fischmeister, John Fox,
##   Paul Gilbert, Yu Gong, Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,
##   Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan de Leeuw,
##   Jim Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John Maindonald,
##   David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley, Richard O'Keefe,
##   Hubert Palme, Roger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony Rossini,
##   Jonathan Rougier, Petr Savicky, Guenther Sawitzki, Marc Schwartz, Detlef Steuer,
##   Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner,
##   Bill Venables, Gregory R. Warnes, Andreas Weingessel, Morten Welinder, James
##   Wettenhall, Simon Wood and Achim Zeileis. Others have written code that has been
##   adopted by R and is acknowledged in the code files, including

22. word 从句子中提取单词

word(string, start = 1L, end = start, sep = fixed(" "))

sentences <- c("Jane saw a cat", "Jane sat down")
word(string = sentences, start = 1)  #提取第一个单词

## [1] "Jane" "Jane"

word(string = sentences, start = 2)  #提取第二个单词

## [1] "saw" "sat"

word(string = sentences, start = -1)  #提取句子的最后一个单词

## [1] "cat"  "down"

word(string = sentences, start = 2, end = -1)  #提取第二个单词到最后一个单词

## [1] "saw a cat" "sat down"

# 向量化：
word(string = sentences[1], start = 1:3, end = -1)

## [1] "Jane saw a cat" "saw a cat"      "a cat"

word(string = sentences[1], start = 1, end = 1:4)

## [1] "Jane"           "Jane saw"       "Jane saw a"     "Jane saw a cat"

# 通过参数sep指定分隔符
str <- "abc.def..123.4568.999"
word(string = str, start = 1, sep = fixed(".."))

## [1] "abc.def"

word(string = str, start = 2, sep = fixed(".."))

## [1] "123.4568.999"

二、以下为基础包的字符串处理函数：

23. paste() 字符串连接：

paste(…, sep = " ", collapse = NULL)

paste("A", 1:6, sep = "")

## [1] "A1" "A2" "A3" "A4" "A5" "A6"

paste("A", 1:6, sep = "", collapse = "-")  #设置collapse时，将连成一个字符串

## [1] "A1-A2-A3-A4-A5-A6"

paste(1:6, collapse = "")

## [1] "123456"

paste(1:6, collapse = "-")

## [1] "1-2-3-4-5-6"

paste("Today is", date())

## [1] "Today is Sat Jun 18 16:29:34 2016"

24. strsplit() 字符串分割，结果返回列表：

strsplit(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE)

split：设置分割符
fixed：逻辑值，默认值为FALSE，取TRUE时将split按字面字符串精确匹配，否则按正则表达式处理
perl：逻辑值，默认值为FALSE，取TRUE时使用Perl兼容的正则表达式；取FALSE时仍使用正则表达式（默认的扩展正则语法）
useBytes：逻辑值，默认值为FALSE，取TRUE时按字节而非按字符进行匹配

x <- c(as = "asfef", qu = "qwerty", "yuiop[", "b", "stuff.blah.yech")
strsplit(x = x, split = "e")

## $as
## [1] "asf" "f"  
## 
## $qu
## [1] "qw"  "rty"
## 
## [[3]]
## [1] "yuiop["
## 
## [[4]]
## [1] "b"
## 
## [[5]]
## [1] "stuff.blah.y" "ch"

unlist(strsplit(x = "a.b.c", split = "."))

## [1] "" "" "" "" ""

unlist(strsplit(x = "a.b.c", split = "[.]"))  #使用‘.’为分割符

## [1] "a" "b" "c"

# 或者：
unlist(strsplit(x = "a.b.c", split = ".", fixed = TRUE))

## [1] "a" "b" "c"

x <- "ascd123afrwf34535ddggh454fgf5e4"
strsplit(x = x, split = "[0-9]+", perl = TRUE)  #以数字为分割符

## [[1]]
## [1] "ascd"  "afrwf" "ddggh" "fgf"   "e"

unlist(strsplit(x = x, split = "[a-z]+"))  #以字母为分割符

## [1] ""      "123"   "34535" "454"   "5"     "4"

25. nchar() 计算字符串的字符个数：

nchar(x, type = "chars", allowNA = FALSE)

x <- c("asfef", "qwerty", "yuiop[", "b", "stuff.blah.yech")
nchar(x)  #结果返回向量x的每个元素的字符个数

## [1]  5  6  6  1 15

26. substr 字符串截取及替换：

(1)substr(x, start, stop)

(2)substring(text, first, last = 1000000L)

(3)substr(x, start, stop) <- value

(4)substring(text, first, last = 1000000L) <- value

# 对于单个字符串：
substr(x = "abcdef", start = 2, stop = 4)

## [1] "bcd"

substring(text = "abcdef", first = 2, last = 4)

## [1] "bcd"

substring(text = "abcdef", first = 1:6, last = 1:6)

## [1] "a" "b" "c" "d" "e" "f"

substr(x = rep("abcdef", 4), start = 1:4, stop = 4:5)

## [1] "abcd" "bcde" "cd"   "de"

# 对于字符串向量：
x <- c("asfef", "qwerty", "yuiop[", "b", "stuff.blah.yech")
substr(x = x, start = 2, stop = 5)  #对向量x每个元素截取子字符串

## [1] "sfef" "wert" "uiop" ""     "tuff"

substring(text = x, first = 2, last = 4:6)

## [1] "sfe"   "wert"  "uiop[" ""      "tuff"

substring(text = x, first = 2) <- c("..", "+++")  #以赋值的方式进行替换
x

## [1] "a..ef"           "q+++ty"          "y..op["          "b"              
## [5] "s..ff.blah.yech"

27. 字符串替换及大小写转换:

chartr(old, new, x)
tolower(x)
toupper(x)
casefold(x, upper = FALSE)

x <- "MiXeD cAsE 123"
chartr("iXs", "why", x)  #i:w,X:h,s:y，单个字符对应替换

## [1] "MwheD cAyE 123"

chartr("a-cX", "D-Fw", x)

## [1] "MiweD FAsE 123"

tolower(x)  #转换成小写

## [1] "mixed case 123"

toupper(x)  #转换成大写

## [1] "MIXED CASE 123"

casefold(x, upper = FALSE)

## [1] "mixed case 123"

casefold(x, upper = TRUE)

## [1] "MIXED CASE 123"

28. 字符匹配与替换

grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE, fixed = FALSE, useBytes = FALSE, invert = FALSE)，结果返回匹配的向量x的元素的索引

ignore.case：逻辑值，默认值FALSE，区分大小写；
perl：逻辑值，默认值FALSE，使用POSIX扩展正则表达式；取值为TRUE时使用Perl兼容的正则表达式；
value：逻辑值，设置结果返回匹配元素的值还是索引，默认值为FALSE：返回索引；
fixed：逻辑值，默认值为FALSE，取值为TRUE时使用精确匹配；
useBytes：逻辑值，默认取值FALSE；
invert：逻辑值，默认取值FALSE，设置结果返回匹配还是非匹配的元素；

txt <- c("10arm03", "Foot 12", " 678-lefroo.345", "__.bafoobar90..")
grep(pattern = "foo", x = txt, value = FALSE)  #区分大小写，结果返回匹配的元素索引

## [1] 4

grep(pattern = "foo", x = txt, value = TRUE)  #区分大小写，结果返回匹配的元素值

## [1] "__.bafoobar90.."

grep(pattern = "foo", x = txt, ignore.case = TRUE)  #忽略大小写，结果返回匹配的元素索引

## [1] 2 4

grep(pattern = "foo", x = txt, ignore.case = TRUE, value = TRUE)  #忽略大小写，结果返回匹配的元素值

## [1] "Foot 12"         "__.bafoobar90.."

grep(pattern = "foo", x = txt, ignore.case = TRUE, value = TRUE, invert = TRUE)  #忽略大小写，结果返回不匹配的元素值

## [1] "10arm03"         " 678-lefroo.345"

grep(pattern = "^[0-9]+", x = txt, perl = TRUE)  #返回以数字开头的元素索引

## [1] 1

grep(pattern = "[0-9]+$", x = txt, perl = TRUE, value = TRUE)  #返回以数字结尾的元素

## [1] "10arm03"         "Foot 12"         " 678-lefroo.345"

grep(pattern = "\\d$", x = txt, perl = TRUE, value = TRUE)  #返回以数字结尾的元素

## [1] "10arm03"         "Foot 12"         " 678-lefroo.345"

grepl(pattern, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)，结果返回一个与向量x等长的逻辑向量，匹配的元素返回TRUE，不匹配的返回FALSE。

txt <- c("10arm03", "Foot 12", " 678-lefroo.345", "__.bafoobar90..")
grepl(pattern = "foo", x = txt)

## [1] FALSE FALSE FALSE  TRUE

grepl(pattern = "\\d$", x = txt, perl = TRUE)

## [1]  TRUE  TRUE  TRUE FALSE

sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)，只替换每个元素中第一个匹配的子字符串

txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
sub(pattern = "foo", replacement = "99", x = txt)  #将元素中的第一个foo替换成99

## [1] "10arm03"              "Foot 12 99t"          " 678-lefroo.345"     
## [4] "__.ba99bar90foobar.."

sub(pattern = "\\d+$", replacement = "+++", x = txt, perl = TRUE)  #将结尾的数字替换成+++

## [1] "10arm+++"              "Foot 12 foot"          " 678-lefroo.+++"      
## [4] "__.bafoobar90foobar.."

gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)，替换每个元素中所有匹配的子字符串

txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
gsub(pattern = "foo", replacement = "99", x = txt)  #将所有的foo替换成99

## [1] "10arm03"             "Foot 12 99t"         " 678-lefroo.345"    
## [4] "__.ba99bar9099bar.."

gsub(pattern = "\\d+", replacement = "+++", x = txt, perl = TRUE)  #将所有数字替换成+++

## [1] "+++arm+++"              "Foot +++ foot"         
## [3] " +++-lefroo.+++"        "__.bafoobar+++foobar.."

regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)，结果返回每个元素匹配的第一个位置及字符数目，不匹配的元素返回的位置和长度都是-1。

txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
regexpr(pattern = "foo", text = txt)

## [1] -1  9 -1  6
## attr(,"match.length")
## [1] -1  3 -1  3
## attr(,"useBytes")
## [1] TRUE

regexpr(pattern = "\\d+", text = txt)

## [1]  1  6  2 12
## attr(,"match.length")
## [1] 2 2 3 2
## attr(,"useBytes")
## [1] TRUE

gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)，返回每个元素匹配的所有位置及相应的字符数目

txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
gregexpr(pattern = "foo", text = txt)

## [[1]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1] 9
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE
## 
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
## 
## [[4]]
## [1]  6 14
## attr(,"match.length")
## [1] 3 3
## attr(,"useBytes")
## [1] TRUE

gregexpr(pattern = "\\d+", text = txt)

## [[1]]
## [1] 1 6
## attr(,"match.length")
## [1] 2 2
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1] 6
## attr(,"match.length")
## [1] 2
## attr(,"useBytes")
## [1] TRUE
## 
## [[3]]
## [1]  2 13
## attr(,"match.length")
## [1] 3 3
## attr(,"useBytes")
## [1] TRUE
## 
## [[4]]
## [1] 12
## attr(,"match.length")
## [1] 2
## attr(,"useBytes")
## [1] TRUE

regexec(pattern, text, ignore.case = FALSE, fixed = FALSE, useBytes = FALSE)，结果返回一个列表，每个元素给出第一个匹配的起始位置及字符数目（模式中含有括号子表达式时，还会依次给出各子表达式的匹配位置），不匹配的元素返回-1，取值为NA的元素返回NA。

txt <- c(NA, "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
regexec(pattern = "foo", text = txt)

## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
## 
## [[2]]
## [1] 9
## attr(,"match.length")
## [1] 3
## 
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## 
## [[4]]
## [1] 6
## attr(,"match.length")
## [1] 3

regexec(pattern = "\\d+", text = txt)

## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
## 
## [[2]]
## [1] 6
## attr(,"match.length")
## [1] 2
## 
## [[3]]
## [1] 2
## attr(,"match.length")
## [1] 3
## 
## [[4]]
## [1] 12
## attr(,"match.length")
## [1] 2