一、以下为stringr包的字符串函数:

1. 字符串的大小写转换

  • str_to_upper(string, locale = "")
  • str_to_lower(string, locale = "")
  • str_to_title(string, locale = "")

string:字符串,亦可为字符串向量;locale:设置语种。

# install.packages('stringr') 安装包
library(stringr)  #加载包
dog <- "The quick brown dog"
dog1 <- c("The", "quick", "brown", "dog")
# 帮助文档的实例中一般都省略参数名,个人习惯补全参数名,容易理解
str_to_upper(string = dog)  #将英文字符串转换成大写
## [1] "THE QUICK BROWN DOG"
str_to_upper(string = dog1)  #string为字符向量时
## [1] "THE"   "QUICK" "BROWN" "DOG"
str_to_lower(string = dog)  #将英文字符串转换成小写
## [1] "the quick brown dog"
str_to_title(string = dog)  #将英文字符串中的单词首字母转换大写
## [1] "The Quick Brown Dog"
# locale可设置不同的语种
str_to_upper(string = "i", locale = "en")  # English 
## [1] "I"
str_to_upper(string = "i", locale = "tr")  # Turkish
## [1] "<U+0130>"

2. invert_match 返回非匹配模式的起始结束位置

invert_match(loc),loc是以函数str_locate_all()获取的位置矩阵作为输入。

numbers <- "1 and 2 and 4 and 456"
num_loc <- str_locate_all(string = numbers, pattern = "[0-9]+")[[1]]  #匹配数字,返回数字的起始和结束位置
num_loc
##      start end
## [1,]     1   1
## [2,]     7   7
## [3,]    13  13
## [4,]    19  21
str_sub(string = numbers, start = num_loc[, "start"], end = num_loc[, "end"])
## [1] "1"   "2"   "4"   "456"
text_loc <- invert_match(loc = num_loc)  #返回不匹配数字的起始和结束位置
text_loc
##      start end
## [1,]     0   0
## [2,]     2   6
## [3,]     8  12
## [4,]    14  18
## [5,]    22  -1
str_sub(string = numbers, start = text_loc[, "start"], end = text_loc[, "end"])
## [1] ""      " and " " and " " and " ""

3. modifiers 指定模式的类别

3.1 fixed(pattern, ignore_case = FALSE):Compare literal bytes in the string. This is very fast, but not usually what you want for non-ASCII character sets.

3.2 coll(pattern, ignore_case = FALSE, locale = NULL, ...):Compare strings respecting standard collation rules.

3.3 regex(pattern, ignore_case = FALSE, multiline = FALSE, comments = FALSE, dotall = FALSE, ...):默认使用正则表达式

3.4 boundary(type = c("character", "line_break", "sentence", "word"), skip_word_none = TRUE, ...):Match boundaries between things.

  • pattern: Pattern to modify behaviour.
  • ignore_case: Should case differences be ignored in the match?
  • locale: Locale to use for comparisons. See stri_locale_list() for all possible options.
  • ...: Other less frequently used arguments passed onto stri_opts_collator, stri_opts_regex, or stri_opts_brkiter
  • multiline: If TRUE, $ and ^ match the beginning and end of each line. If FALSE, the default, only match the start and end of the input.
  • comments: If TRUE, whitespace and comments beginning with # are ignored. Escape literal spaces with a backslash (written "\\ " in an R string).
  • dotall: If TRUE, . will also match line terminators.
  • type: Boundary type to detect.
  • skip_word_none: Ignore “words” that don’t contain any characters or numbers - i.e. punctuation.
pattern <- "a.b"
strings <- c("abb", "a.b")
str_detect(string = strings, pattern = pattern)
## [1] TRUE TRUE
str_detect(string = strings, pattern = fixed(pattern))
## [1] FALSE  TRUE
str_detect(string = strings, pattern = coll(pattern))
## [1] FALSE  TRUE
# coll() is useful for locale-aware case-insensitive matching.
# "\u0130" is LATIN CAPITAL LETTER I WITH DOT ABOVE. It is written as a Unicode
# escape so that the source stays ASCII and the example does not depend on the
# encoding of the file or of the console (a literal "<U+0130>" is just an
# eight-character ASCII string and never exercises the locale rules).
# The outputs below are those of a UTF-8 session.
i <- c("I", "\u0130", "i")
i
## [1] "I" "İ" "i"
str_detect(string = i, pattern = regex("i", ignore_case = TRUE))
## [1]  TRUE FALSE  TRUE
str_detect(string = i, pattern = fixed("i", ignore_case = TRUE))
## [1]  TRUE FALSE  TRUE
str_detect(string = i, pattern = coll("i", ignore_case = TRUE))
## [1]  TRUE FALSE  TRUE
# With Turkish collation, dotted "i" pairs with dotted capital "\u0130",
# while plain "I" is the capital of dotless "\u0131" and no longer matches.
str_detect(string = i, pattern = coll("i", ignore_case = TRUE, locale = "tr"))
## [1] FALSE  TRUE  TRUE
# Word boundaries 单词边界
words <- c("These are some words.")
str_count(string = words, pattern = boundary("word"))  #统计语句中单词的个数
## [1] 4
str_split(string = words, pattern = " ")[[1]]  #将语句分割成单词,最后一个单词带有标点
## [1] "These"  "are"    "some"   "words."
str_split(string = words, pattern = boundary("word"))[[1]]  #最后一个单词不带有标点
## [1] "These" "are"   "some"  "words"
# 使用正则表达式
str_extract_all(string = "The Cat in the Hat", pattern = "[a-z]+")  #区分大小写
## [[1]]
## [1] "he"  "at"  "in"  "the" "at"
str_extract_all(string = "The Cat in the Hat", pattern = regex("[a-z]+", TRUE))  #忽略大小写的差异
## [[1]]
## [1] "The" "Cat" "in"  "the" "Hat"
str_extract_all(string = "a\nb\nc", pattern = "^.")
## [[1]]
## [1] "a"
str_extract_all(string = "a\nb\nc", pattern = regex("^.", multiline = TRUE))
## [[1]]
## [1] "a" "b" "c"
str_extract_all(string = "a\nb\nc", pattern = "a.")
## [[1]]
## character(0)
str_extract_all(string = "a\nb\nc", pattern = regex("a.", dotall = TRUE))
## [[1]]
## [1] "a\n"

4. str_c 连接字符串

4.1 str_c(..., sep = "", collapse = NULL)

4.2 str_join(..., sep = "", collapse = NULL),同str_c(str_join 在较新的 stringr 版本中已弃用,建议直接使用 str_c)

sep:设置向量间的连接符;collapse:将向量的所有元素连接成一个字符串时,设置元素间的连接符。

str_c("Letter", letters[1:5])  #连接2个字符向量
## [1] "Lettera" "Letterb" "Letterc" "Letterd" "Lettere"
str_c("Letter", letters[1:5], sep = ": ")  #sep可设置向量间的连接符
## [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e"
# 连接3个不等长的字符向量,出现警告信息,短的向量的元素会自动循环至与最长向量等长。
str_c(letters[1:5], " is for", c("...", "***"))
## [1] "a is for..." "b is for***" "c is for..." "d is for***" "e is for..."
str_c(letters)
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
str_c(letters, sep = "-")  #当只有1个向量时,sep参数不起作用
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
# 将向量的所有元素连接成一个字符串,collapse设置元素间的连接符
str_c(letters, collapse = "")
## [1] "abcdefghijklmnopqrstuvwxyz"
str_c(letters, collapse = ", ")
## [1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"
str_c("Letter", letters[1:5], sep = ":", collapse = ",")  #同时设置sep和collapse时,与以下语句等价:
## [1] "Letter:a,Letter:b,Letter:c,Letter:d,Letter:e"
a <- str_c("Letter", letters[1:5], sep = ":")
a
## [1] "Letter:a" "Letter:b" "Letter:c" "Letter:d" "Letter:e"
str_c(a, collapse = ",")
## [1] "Letter:a,Letter:b,Letter:c,Letter:d,Letter:e"
# 当字符向量中存在缺失值时,返回的也是缺失值:
str_c(c("a", NA, "b"), "-d")
## [1] "a-d" NA    "b-d"
# 使用函数str_replace_na 将缺失值替换成‘NA’:
str_c(str_replace_na(c("a", NA, "b")), "-d")
## [1] "a-d"  "NA-d" "b-d"

5. str_conv 指定字符串的编码

str_conv(string, encoding)

x <- rawToChar(as.raw(177))
x
## [1] "\xb1"
str_conv(string = x, encoding = "ISO-8859-2")  # Polish 'a with ogonek' 
## [1] "<U+0105>"
str_conv(string = x, encoding = "ISO-8859-1")  # Plus-minus
## [1] "±"

6. str_count 计算字符串中的匹配模式的数目

str_count(string, pattern = "")

fruit <- c("apple", "banana", "pear", "pineappleapple")
str_count(string = fruit, pattern = "a")  #计算向量fruit的每个元素含有a的数目
## [1] 1 3 1 2
str_count(string = fruit, pattern = "p")
## [1] 2 0 1 5
str_count(string = fruit, pattern = "ap")
## [1] 1 0 0 2
str_count(string = fruit, pattern = "[a-e]")
## [1] 2 4 2 5
# 统计'apple'中的'a'的个数,~~~,'pineappleapple'中'p'的个数:
str_count(string = fruit, pattern = c("a", "b", "p", "p"))
## [1] 1 1 1 5
str_count(string = c("a.", "...", ".a.a"), pattern = ".")  #正则表达式中‘.’是指单个字符,不仅仅是字符‘.’
## [1] 2 3 4
str_count(string = c("a.", "...", ".a.a"), pattern = fixed("."))  #fixed('.')指字符‘.’
## [1] 1 3 2

7. str_detect 检测字符串中是否存在某种模式

str_detect(string, pattern),结果返回逻辑向量

fruit <- c("apple1", "banana", "pear", "pinapple")
str_detect(string = fruit, pattern = "a")  #fruit的元素是否包含a
## [1] TRUE TRUE TRUE TRUE
str_detect(string = fruit, pattern = "pp")
## [1]  TRUE FALSE FALSE  TRUE
str_detect(string = fruit, pattern = "^a")  #fruit的元素是否以a开头
## [1]  TRUE FALSE FALSE FALSE
str_detect(string = fruit, pattern = "a$")  #fruit的元素是否以a结尾
## [1] FALSE  TRUE FALSE FALSE
str_detect(string = fruit, pattern = "[aeiou]")  #fruit的元素是否包含[aeiou]中的一个字符
## [1] TRUE TRUE TRUE TRUE
# str_detect() is vectorised over both `string` and `pattern`.
# Equal lengths (4 and 4): element k of `fruit` is tested against pattern k.
str_detect(string = fruit, pattern = c("ap", "ba", "pe", "pin"))
## [1] TRUE TRUE TRUE TRUE
# `pattern` shorter than `string` (2 vs 4): the patterns are recycled
# ("ap", "pin", "ap", "pin") and the result has the length of `string`.
str_detect(string = fruit, pattern = c("ap", "pin"))
## [1]  TRUE FALSE FALSE  TRUE
# `pattern` longer than `string` (5 vs 4): here `string` is recycled, so the
# fifth result tests "apple1" against "e1"; the result has the length of
# `pattern`.
# NOTE(review): 4 is not a multiple of 5; newer stringr releases may reject
# such incompatible lengths instead of recycling - confirm for your version.
str_detect(string = fruit, pattern = c("ap", "ba", "pe", "pin", "e1"))
## [1] TRUE TRUE TRUE TRUE TRUE

8. str_dup 重复和连接字符串向量

str_dup(string, times)

fruit <- c("apple", "pear", "banana")
str_dup(string = fruit, times = 2)  # 向量的每个元素重复2次,然后连接起来
## [1] "appleapple"   "pearpear"     "bananabanana"
str_dup(string = fruit, times = 1:3)
## [1] "apple"              "pearpear"           "bananabananabanana"
str_c("ba", str_dup("na", 0:5))
## [1] "ba"           "bana"         "banana"       "bananana"    
## [5] "banananana"   "bananananana"

9. str_extract 从字符串中提取匹配的模式

str_extract(string, pattern) 提取匹配的第一个字符串

str_extract_all(string, pattern, simplify = FALSE) 提取匹配的所有字符串

shopping_list <- c("apples 4x4", "bag of flour", "bag of sugar", "milk x2")

# 提取匹配模式的第一个字符串
str_extract(string = shopping_list, pattern = "\\d")  #提取数字
## [1] "4" NA  NA  "2"
str_extract(string = shopping_list, pattern = "[a-z]+")  #提取字母
## [1] "apples" "bag"    "bag"    "milk"
str_extract(string = shopping_list, pattern = "[a-z]{1,4}")
## [1] "appl" "bag"  "bag"  "milk"
str_extract(string = shopping_list, pattern = "\\b[a-z]{1,4}\\b")
## [1] NA     "bag"  "bag"  "milk"
# 提取所有匹配模式的字符串,结果返回一个列表
str_extract_all(string = shopping_list, pattern = "[a-z]+")
## [[1]]
## [1] "apples" "x"     
## 
## [[2]]
## [1] "bag"   "of"    "flour"
## 
## [[3]]
## [1] "bag"   "of"    "sugar"
## 
## [[4]]
## [1] "milk" "x"
str_extract_all(string = shopping_list, pattern = "\\b[a-z]+\\b")
## [[1]]
## [1] "apples"
## 
## [[2]]
## [1] "bag"   "of"    "flour"
## 
## [[3]]
## [1] "bag"   "of"    "sugar"
## 
## [[4]]
## [1] "milk"
str_extract_all(string = shopping_list, pattern = "\\d")
## [[1]]
## [1] "4" "4"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "2"
# 提取所有匹配模式的字符串,结果返回一个矩阵,通过simplify = TRUE设置
str_extract_all(string = shopping_list, pattern = "\\b[a-z]+\\b", simplify = TRUE)
##      [,1]     [,2] [,3]   
## [1,] "apples" ""   ""     
## [2,] "bag"    "of" "flour"
## [3,] "bag"    "of" "sugar"
## [4,] "milk"   ""   ""
str_extract_all(string = shopping_list, pattern = "\\d", simplify = TRUE)
##      [,1] [,2]
## [1,] "4"  "4" 
## [2,] ""   ""  
## [3,] ""   ""  
## [4,] "2"  ""

10. str_length 字符串的长度

str_length(string)

str_length(letters)
##  [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
str_length(NA)
## [1] NA
str_length(factor("abc"))
## [1] 3
str_length(c("i", "like", "programming", NA))
## [1]  1  4 11 NA
# Two ways of representing a u with an umlaut
u1 <- "ü"
u2 <- stringi::stri_trans_nfd(u1)
# They print the same:
u1
## [1] "ü"
u2
## [1] "u<U+0308>"
# But have a different length
str_length(u1)
## [1] 1
str_length(u2)
## [1] 2
# Even though they have the same number of characters
str_count(u1)
## [1] 1
str_count(u2)
## [1] 1

11. str_locate 定位在字符串中匹配模式的开始和结束位置

str_locate(string, pattern):返回匹配的第一个字符串的位置

str_locate_all(string, pattern):返回匹配的所有位置

fruit <- c("apple", "banana", "pear", "pineapple")
# 返回匹配的第一个字符串的位置:
str_locate(string = fruit, pattern = "a")
##      start end
## [1,]     1   1
## [2,]     2   2
## [3,]     3   3
## [4,]     5   5
str_locate(string = fruit, pattern = "ap")
##      start end
## [1,]     1   2
## [2,]    NA  NA
## [3,]    NA  NA
## [4,]     5   6
str_locate(string = fruit, pattern = c("a", "b", "p", "p"))
##      start end
## [1,]     1   1
## [2,]     1   1
## [3,]     1   1
## [4,]     1   1
# 返回匹配的所有位置:
str_locate_all(string = fruit, pattern = "a")
## [[1]]
##      start end
## [1,]     1   1
## 
## [[2]]
##      start end
## [1,]     2   2
## [2,]     4   4
## [3,]     6   6
## 
## [[3]]
##      start end
## [1,]     3   3
## 
## [[4]]
##      start end
## [1,]     5   5
str_locate_all(string = fruit, pattern = "e")
## [[1]]
##      start end
## [1,]     5   5
## 
## [[2]]
##      start end
## 
## [[3]]
##      start end
## [1,]     2   2
## 
## [[4]]
##      start end
## [1,]     4   4
## [2,]     9   9
str_locate_all(string = fruit, pattern = c("a", "b", "p", "p"))
## [[1]]
##      start end
## [1,]     1   1
## 
## [[2]]
##      start end
## [1,]     1   1
## 
## [[3]]
##      start end
## [1,]     1   1
## 
## [[4]]
##      start end
## [1,]     1   1
## [2,]     6   6
## [3,]     7   7
# 查找每个字符的位置
str_locate_all(string = fruit, pattern = "")
## [[1]]
##      start end
## [1,]     1   1
## [2,]     2   2
## [3,]     3   3
## [4,]     4   4
## [5,]     5   5
## 
## [[2]]
##      start end
## [1,]     1   1
## [2,]     2   2
## [3,]     3   3
## [4,]     4   4
## [5,]     5   5
## [6,]     6   6
## 
## [[3]]
##      start end
## [1,]     1   1
## [2,]     2   2
## [3,]     3   3
## [4,]     4   4
## 
## [[4]]
##       start end
##  [1,]     1   1
##  [2,]     2   2
##  [3,]     3   3
##  [4,]     4   4
##  [5,]     5   5
##  [6,]     6   6
##  [7,]     7   7
##  [8,]     8   8
##  [9,]     9   9

12. str_match 从字符串中提取匹配组

str_match(string, pattern) 提取匹配的第一个字符串

str_match_all(string, pattern) 提取匹配的所有字符串

strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569", "387 287 6718", 
    "apple", "233.398.9187 ", "482 952 3315", "239 923 8115 and 842 566 4692", 
    "Work: 579-499-7527", "$1000", "Home: 543.355.3679")

# Pattern with three capture groups: 3 digits (the first one in 2-9),
# 3 digits and 4 digits, separated by one of "-", " " or ".".
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"

str_extract(string = strings, pattern = phone)  # first match in each element (NA if none); element 9 has two numbers but only the first is returned
##  [1] "219 733 8965" "329-293-8753" NA             "595 794 7569"
##  [5] "387 287 6718" NA             "233.398.9187" "482 952 3315"
##  [9] "239 923 8115" "579-499-7527" NA             "543.355.3679"
str_match(string = strings, pattern = phone)  # first match in column 1, followed by one column per capture group
##       [,1]           [,2]  [,3]  [,4]  
##  [1,] "219 733 8965" "219" "733" "8965"
##  [2,] "329-293-8753" "329" "293" "8753"
##  [3,] NA             NA    NA    NA    
##  [4,] "595 794 7569" "595" "794" "7569"
##  [5,] "387 287 6718" "387" "287" "6718"
##  [6,] NA             NA    NA    NA    
##  [7,] "233.398.9187" "233" "398" "9187"
##  [8,] "482 952 3315" "482" "952" "3315"
##  [9,] "239 923 8115" "239" "923" "8115"
## [10,] "579-499-7527" "579" "499" "7527"
## [11,] NA             NA    NA    NA    
## [12,] "543.355.3679" "543" "355" "3679"
# Extract/match all
str_extract_all(string = strings, pattern = phone)
## [[1]]
## [1] "219 733 8965"
## 
## [[2]]
## [1] "329-293-8753"
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "595 794 7569"
## 
## [[5]]
## [1] "387 287 6718"
## 
## [[6]]
## character(0)
## 
## [[7]]
## [1] "233.398.9187"
## 
## [[8]]
## [1] "482 952 3315"
## 
## [[9]]
## [1] "239 923 8115" "842 566 4692"
## 
## [[10]]
## [1] "579-499-7527"
## 
## [[11]]
## character(0)
## 
## [[12]]
## [1] "543.355.3679"
str_match_all(string = strings, pattern = phone)
## [[1]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "219 733 8965" "219" "733" "8965"
## 
## [[2]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "329-293-8753" "329" "293" "8753"
## 
## [[3]]
##      [,1] [,2] [,3] [,4]
## 
## [[4]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "595 794 7569" "595" "794" "7569"
## 
## [[5]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "387 287 6718" "387" "287" "6718"
## 
## [[6]]
##      [,1] [,2] [,3] [,4]
## 
## [[7]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "233.398.9187" "233" "398" "9187"
## 
## [[8]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "482 952 3315" "482" "952" "3315"
## 
## [[9]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "239 923 8115" "239" "923" "8115"
## [2,] "842 566 4692" "842" "566" "4692"
## 
## [[10]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "579-499-7527" "579" "499" "7527"
## 
## [[11]]
##      [,1] [,2] [,3] [,4]
## 
## [[12]]
##      [,1]           [,2]  [,3]  [,4]  
## [1,] "543.355.3679" "543" "355" "3679"

13. str_order 对字符向量进行排序

str_order(x, decreasing = FALSE, na_last = TRUE, locale = "", ...)

str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "", ...)

str_order(x = letters, locale = "en")
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26
str_sort(x = letters, locale = "en")
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
str_order(x = letters, locale = "haw")
##  [1]  1  5  9 15 21  2  3  4  6  7  8 10 11 12 13 14 16 17 18 19 20 22 23
## [24] 24 25 26
str_sort(x = letters, locale = "haw")
##  [1] "a" "e" "i" "o" "u" "b" "c" "d" "f" "g" "h" "j" "k" "l" "m" "n" "p"
## [18] "q" "r" "s" "t" "v" "w" "x" "y" "z"

14. str_pad 在字符串的前后位置填充字符(如空格)

str_pad(string, width, side = c("left", "right", "both"), pad = " ")

  • width:填充字符后字符串的长度;
  • side:填充字符串的位置,默认为left;
  • pad:指定填充的字符串;
rbind(str_pad("hadley", 30, "left"), str_pad("hadley", 30, "right"), str_pad("hadley", 
    30, "both"))
##      [,1]                            
## [1,] "                        hadley"
## [2,] "hadley                        "
## [3,] "            hadley            "
str_pad(string = c("a", "abc", "abcdef"), width = 10)
## [1] "         a" "       abc" "    abcdef"
str_pad(string = "a", width = c(5, 10, 20))
## [1] "    a"                "         a"           "                   a"
str_pad(string = "a", width = 10, pad = c("-", "_", "+"))
## [1] "---------a" "_________a" "+++++++++a"
# 当设置width小于string的长度时,结果返回原string
str_pad(string = "hadley", width = 3, pad = "-")
## [1] "hadley"
str_pad("hadley", width = 8, side = "left", pad = "-")
## [1] "--hadley"
str_pad("hadley", width = 8, side = "right", pad = "-")
## [1] "hadley--"
str_pad("hadley", width = 8, side = "both", pad = "-")
## [1] "-hadley-"

15. str_replace 替换字符串中的匹配模式

str_replace(string, pattern, replacement)

str_replace_all(string, pattern, replacement)

fruits <- c("one apple", "two pears", "three bananas")
str_replace(string = fruits, pattern = "[aeiou]", replacement = "-")  #替换第一个匹配的字符
## [1] "-ne apple"     "tw- pears"     "thr-e bananas"
str_replace_all(string = fruits, pattern = "[aeiou]", replacement = "-")  #替换所有匹配的字符
## [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"
str_replace(string = fruits, pattern = "([aeiou])", replacement = "")
## [1] "ne apple"     "tw pears"     "thre bananas"
str_replace(string = fruits, pattern = "([aeiou])", replacement = "\\1\\1")
## [1] "oone apple"     "twoo pears"     "threee bananas"
str_replace(string = fruits, pattern = "[aeiou]", replacement = c("1", "2", 
    "3"))
## [1] "1ne apple"     "tw2 pears"     "thr3e bananas"
str_replace(string = fruits, pattern = c("a", "e", "i"), replacement = "-")
## [1] "one -pple"     "two p-ars"     "three bananas"
fruits <- c("one apple", "two pears", "three bananas")
str_replace(string = fruits, pattern = "[aeiou]", replacement = "-")
## [1] "-ne apple"     "tw- pears"     "thr-e bananas"
str_replace_all(string = fruits, pattern = "[aeiou]", replacement = "-")
## [1] "-n- -ppl-"     "tw- p--rs"     "thr-- b-n-n-s"
str_replace_all(string = fruits, pattern = "([aeiou])", replacement = "")
## [1] "n ppl"    "tw prs"   "thr bnns"
str_replace_all(string = fruits, pattern = "([aeiou])", replacement = "\\1\\1")
## [1] "oonee aapplee"      "twoo peeaars"       "threeee baanaanaas"
str_replace_all(string = fruits, pattern = "[aeiou]", replacement = c("1", "2", 
    "3"))
## [1] "1n1 1ppl1"     "tw2 p22rs"     "thr33 b3n3n3s"
str_replace_all(string = fruits, pattern = c("a", "e", "i"), replacement = "-")
## [1] "one -pple"     "two p-ars"     "three bananas"
# Apply several replacement rules to one string in a single call: pass a named
# character vector as `pattern`, where each name is a pattern and the
# corresponding value is its replacement. The values are quoted so that they
# are character already and no numeric-to-character coercion is relied upon.
strings1 <- str_c(fruits, collapse = "---")
strings1
## [1] "one apple---two pears---three bananas"
str_replace_all(
  string = strings1,
  pattern = c(one = "1", two = "2", three = "3")
)
## [1] "1 apple---2 pears---3 bananas"

16. str_replace_na 将缺失值替换成‘NA’

str_replace_na(string, replacement = "NA")

str_replace_na(c(NA, "abc", "def"))
## [1] "NA"  "abc" "def"

17. str_split 根据一个分隔符将字符串进行分割

str_split(string, pattern, n = Inf)#结果返回列表

str_split_fixed(string, pattern, n)#必须设置n,结果返回矩阵

fruits <- c("apples and oranges and pears and bananas", "pineapples and mangos and guavas")
str_split(string = fruits, pattern = " and ")
## [[1]]
## [1] "apples"  "oranges" "pears"   "bananas"
## 
## [[2]]
## [1] "pineapples" "mangos"     "guavas"
str_split_fixed(string = fruits, pattern = " and ", n = 4)
##      [,1]         [,2]      [,3]     [,4]     
## [1,] "apples"     "oranges" "pears"  "bananas"
## [2,] "pineapples" "mangos"  "guavas" ""
# 通过设置n,指定分割成n块
str_split(string = fruits, pattern = " and ", n = 3)  #将字符串分割成3部分
## [[1]]
## [1] "apples"            "oranges"           "pears and bananas"
## 
## [[2]]
## [1] "pineapples" "mangos"     "guavas"
str_split(string = fruits, pattern = " and ", n = 2)  #将字符串分割成2部分
## [[1]]
## [1] "apples"                        "oranges and pears and bananas"
## 
## [[2]]
## [1] "pineapples"        "mangos and guavas"
str_split(string = fruits, pattern = " and ", n = 5)  #将字符串分割成5部分
## [[1]]
## [1] "apples"  "oranges" "pears"   "bananas"
## 
## [[2]]
## [1] "pineapples" "mangos"     "guavas"
str_split_fixed(string = fruits, pattern = " and ", n = 3)
##      [,1]         [,2]      [,3]               
## [1,] "apples"     "oranges" "pears and bananas"
## [2,] "pineapples" "mangos"  "guavas"
str_split_fixed(string = fruits, pattern = " and ", n = 4)
##      [,1]         [,2]      [,3]     [,4]     
## [1,] "apples"     "oranges" "pears"  "bananas"
## [2,] "pineapples" "mangos"  "guavas" ""
str_split_fixed(string = fruits, pattern = " and ", n = 6)
##      [,1]         [,2]      [,3]     [,4]      [,5] [,6]
## [1,] "apples"     "oranges" "pears"  "bananas" ""   ""  
## [2,] "pineapples" "mangos"  "guavas" ""        ""   ""

18. str_sub 按位置从字符向量中提取或替换子字符串

str_sub(string, start = 1L, end = -1L) 提取子字符串

str_sub(string, start = 1L, end = -1L) <- value 替换子字符串

hw <- "Hadley Wickham"
str_sub(string = hw, start = 1, end = 6)
## [1] "Hadley"
str_sub(string = hw, end = 6)
## [1] "Hadley"
str_sub(string = hw, start = 8, end = 14)
## [1] "Wickham"
str_sub(string = hw, start = 8)
## [1] "Wickham"
str_sub(string = hw, start = c(1, 8), end = c(6, 14))
## [1] "Hadley"  "Wickham"
# 使用负值索引
str_sub(string = hw, start = -1)
## [1] "m"
str_sub(string = hw, start = -7)
## [1] "Wickham"
str_sub(string = hw, end = -7)
## [1] "Hadley W"
# 从函数str_locate_all的结果传入位置参数
pos <- str_locate_all(hw, "[aeio]")[[1]]
pos
##      start end
## [1,]     2   2
## [2,]     5   5
## [3,]     9   9
## [4,]    13  13
str_sub(string = hw, pos)
## [1] "a" "e" "i" "a"
str_sub(string = hw, start = pos[, 1], end = pos[, 2])
## [1] "a" "e" "i" "a"
# 向量化
str_sub(hw, start = seq_len(str_length(hw)))
##  [1] "Hadley Wickham" "adley Wickham"  "dley Wickham"   "ley Wickham"   
##  [5] "ey Wickham"     "y Wickham"      " Wickham"       "Wickham"       
##  [9] "ickham"         "ckham"          "kham"           "ham"           
## [13] "am"             "m"
str_sub(hw, end = seq_len(str_length(hw)))
##  [1] "H"              "Ha"             "Had"            "Hadl"          
##  [5] "Hadle"          "Hadley"         "Hadley "        "Hadley W"      
##  [9] "Hadley Wi"      "Hadley Wic"     "Hadley Wick"    "Hadley Wickh"  
## [13] "Hadley Wickha"  "Hadley Wickham"
# 替换
x <- "BBCDEF"
str_sub(x, 1, 1) <- "A"
x
## [1] "ABCDEF"
str_sub(x, -1, -1) <- "K"
x
## [1] "ABCDEK"
str_sub(x, -2, -2) <- "GHIJ"
x
## [1] "ABCDGHIJK"
str_sub(x, 2, -2) <- ""
x
## [1] "AK"

19. str_subset 提取匹配模式的字符串向量元素

str_subset(string, pattern)

fruit <- c("apple", "banana", "pear", "pinapple")
str_subset(string = fruit, pattern = "a")
## [1] "apple"    "banana"   "pear"     "pinapple"
str_subset(string = fruit, pattern = "ap")
## [1] "apple"    "pinapple"
str_subset(string = fruit, pattern = "^a")
## [1] "apple"
str_subset(string = fruit, pattern = "a$")
## [1] "banana"
str_subset(string = fruit, pattern = "[aeiou]")
## [1] "apple"    "banana"   "pear"     "pinapple"
# 有缺失值时
str_subset(string = c("a", NA, "b"), pattern = ".")
## [1] "a" "b"

20. str_trim 删除字符串首尾的空白字符(空格、制表符、换行符等)

str_trim(string, side = c("both", "left", "right"))

str_trim(" String with trailing and leading white space\t")
## [1] "String with trailing and leading white space"
str_trim("\n\nString with trailing and leading white space\n\n")
## [1] "String with trailing and leading white space"

21. str_wrap 按指定宽度将文本自动换行(格式化段落)

str_wrap(string, width = 80, indent = 0, exdent = 0)

  • width:每行的宽度
  • indent:设置首行缩进
  • exdent:设置第二行后每行缩进
thanks_path <- file.path(R.home("doc"), "THANKS")
thanks <- str_c(readLines(thanks_path), collapse = "\n")
thanks <- word(thanks, 1, 3, fixed("\n\n"))
cat(str_wrap(thanks), "\n")
## R would not be what it is today without the invaluable help of these people,
## who contributed by donating code, bug fixes and documentation: Valerio Aimale,
## Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben Bolker, David Brahm, Goran
## Brostrom, Patrick Burns, Vince Carey, Saikat DebRoy, Brian D'Urso, Lyndon
## Drake, Dirk Eddelbuettel, Claus Ekstrom, Sebastian Fischmeister, John Fox,
## Paul Gilbert, Yu Gong, Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,
## Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan de Leeuw,
## Jim Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John Maindonald,
## David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley, Richard O'Keefe,
## Hubert Palme, Roger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony Rossini,
## Jonathan Rougier, Petr Savicky, Guenther Sawitzki, Marc Schwartz, Detlef Steuer,
## Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner,
## Bill Venables, Gregory R. Warnes, Andreas Weingessel, Morten Welinder, James
## Wettenhall, Simon Wood and Achim Zeileis. Others have written code that has been
## adopted by R and is acknowledged in the code files, including
cat(str_wrap(string = thanks, width = 70), "\n")
## R would not be what it is today without the invaluable help of these
## people, who contributed by donating code, bug fixes and documentation:
## Valerio Aimale, Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben
## Bolker, David Brahm, Goran Brostrom, Patrick Burns, Vince Carey,
## Saikat DebRoy, Brian D'Urso, Lyndon Drake, Dirk Eddelbuettel, Claus
## Ekstrom, Sebastian Fischmeister, John Fox, Paul Gilbert, Yu Gong,
## Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn, Robert King,
## Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan de Leeuw, Jim
## Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John
## Maindonald, David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve
## Oncley, Richard O'Keefe, Hubert Palme, Roger D. Peng, Jose' C.
## Pinheiro, Tony Plate, Anthony Rossini, Jonathan Rougier, Petr Savicky,
## Guenther Sawitzki, Marc Schwartz, Detlef Steuer, Bill Simpson,
## Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner, Bill
## Venables, Gregory R. Warnes, Andreas Weingessel, Morten Welinder,
## James Wettenhall, Simon Wood and Achim Zeileis. Others have written
## code that has been adopted by R and is acknowledged in the code files,
## including
cat(str_wrap(string = thanks, width = 60, indent = 6), "\n")
##       R would not be what it is today without the invaluable help
## of these people, who contributed by donating code, bug fixes
## and documentation: Valerio Aimale, Thomas Baier, Henrik
## Bengtsson, Roger Bivand, Ben Bolker, David Brahm, Goran
## Brostrom, Patrick Burns, Vince Carey, Saikat DebRoy, Brian
## D'Urso, Lyndon Drake, Dirk Eddelbuettel, Claus Ekstrom,
## Sebastian Fischmeister, John Fox, Paul Gilbert, Yu Gong,
## Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,
## Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe
## Lambert, Jan de Leeuw, Jim Lindsey, Patrick Lindsey,
## Catherine Loader, Gordon Maclean, John Maindonald, David
## Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley,
## Richard O'Keefe, Hubert Palme, Roger D. Peng, Jose' C.
## Pinheiro, Tony Plate, Anthony Rossini, Jonathan Rougier,
## Petr Savicky, Guenther Sawitzki, Marc Schwartz, Detlef
## Steuer, Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry
## Therneau, Rolf Turner, Bill Venables, Gregory R. Warnes,
## Andreas Weingessel, Morten Welinder, James Wettenhall, Simon
## Wood and Achim Zeileis. Others have written code that has
## been adopted by R and is acknowledged in the code files,
## including
cat(str_wrap(string = thanks, width = 80, indent = 6, exdent = 2), "\n")
##       R would not be what it is today without the invaluable help of these people,
##   who contributed by donating code, bug fixes and documentation: Valerio Aimale,
##   Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben Bolker, David Brahm, Goran
##   Brostrom, Patrick Burns, Vince Carey, Saikat DebRoy, Brian D'Urso, Lyndon
##   Drake, Dirk Eddelbuettel, Claus Ekstrom, Sebastian Fischmeister, John Fox,
##   Paul Gilbert, Yu Gong, Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,
##   Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan de Leeuw,
##   Jim Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John Maindonald,
##   David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley, Richard O'Keefe,
##   Hubert Palme, Roger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony Rossini,
##   Jonathan Rougier, Petr Savicky, Guenther Sawitzki, Marc Schwartz, Detlef Steuer,
##   Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner,
##   Bill Venables, Gregory R. Warnes, Andreas Weingessel, Morten Welinder, James
##   Wettenhall, Simon Wood and Achim Zeileis. Others have written code that has been
##   adopted by R and is acknowledged in the code files, including

22. word 从句子中提取单词

word(string, start = 1L, end = start, sep = fixed(" "))

sentences <- c("Jane saw a cat", "Jane sat down")
word(string = sentences, start = 1)  #提取第一个单词
## [1] "Jane" "Jane"
word(string = sentences, start = 2)  #提取第二个单词
## [1] "saw" "sat"
word(string = sentences, start = -1)  #提取句子的最后一个单词
## [1] "cat"  "down"
word(string = sentences, start = 2, end = -1)  #提取第二个单词到最后一个单词
## [1] "saw a cat" "sat down"
# 向量化:
word(string = sentences[1], start = 1:3, end = -1)
## [1] "Jane saw a cat" "saw a cat"      "a cat"
word(string = sentences[1], start = 1, end = 1:4)
## [1] "Jane"           "Jane saw"       "Jane saw a"     "Jane saw a cat"
# 通过参数sep指定分隔符
str <- "abc.def..123.4568.999"
word(string = str, start = 1, sep = fixed(".."))
## [1] "abc.def"
word(string = str, start = 2, sep = fixed(".."))
## [1] "123.4568.999"

二、以下为基础包的字符串处理函数:

23. paste() 字符串连接:

paste(..., sep = " ", collapse = NULL)

paste("A", 1:6, sep = "")
## [1] "A1" "A2" "A3" "A4" "A5" "A6"
paste("A", 1:6, sep = "", collapse = "-")  #设置collapse时,将连成一个字符串
## [1] "A1-A2-A3-A4-A5-A6"
paste(1:6, collapse = "")
## [1] "123456"
paste(1:6, collapse = "-")
## [1] "1-2-3-4-5-6"
paste("Today is", date())
## [1] "Today is Sat Jun 18 16:29:34 2016"

24. strsplit() 字符串分割,结果返回列表:

strsplit(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE)

  • split:设置分割符,默认按正则表达式解释
  • fixed:逻辑值,默认值为FALSE;取TRUE时,split按普通字符串逐字匹配,不作为正则表达式
  • perl:逻辑值,默认值为FALSE,此时使用扩展正则表达式;取TRUE时,使用Perl兼容的正则表达式
  • useBytes:逻辑值,默认值为FALSE;取TRUE时按字节而非按字符进行匹配
# Input with two named and three unnamed elements: the names are carried over
# to the result list, unnamed elements are shown by position.
x <- c(as = "asfef", qu = "qwerty", "yuiop[", "b", "stuff.blah.yech")
strsplit(x = x, split = "e")
## $as
## [1] "asf" "f"  
## 
## $qu
## [1] "qw"  "rty"
## 
## [[3]]
## [1] "yuiop["
## 
## [[4]]
## [1] "b"
## 
## [[5]]
## [1] "stuff.blah.y" "ch"
# `split` is a regular expression by default, so "." matches every single
# character and only empty strings remain between the matches.
unlist(strsplit(x = "a.b.c", split = "."))
## [1] "" "" "" "" ""
unlist(strsplit(x = "a.b.c", split = "[.]"))  # inside a character class "." is a literal dot
## [1] "a" "b" "c"
# Alternatively, treat `split` as a plain string instead of a regex:
unlist(strsplit(x = "a.b.c", split = ".", fixed = TRUE))
## [1] "a" "b" "c"
x <- "ascd123afrwf34535ddggh454fgf5e4"
strsplit(x = x, split = "[0-9]+", perl = TRUE)  # split on runs of digits
## [[1]]
## [1] "ascd"  "afrwf" "ddggh" "fgf"   "e"
unlist(strsplit(x = x, split = "[a-z]+"))  # split on runs of lower-case letters; the leading "" comes from the string starting with a match
## [1] ""      "123"   "34535" "454"   "5"     "4"

25. nchar() 计算字符串的字符个数:

nchar(x, type = "chars", allowNA = FALSE)

x <- c("asfef", "qwerty", "yuiop[", "b", "stuff.blah.yech")
nchar(x)  #结果返回向量x的每个元素的字符个数
## [1]  5  6  6  1 15

26. substr 字符串截取及替换:

(1)substr(x, start, stop)

(2)substring(text, first, last = 1000000L)

(3)substr(x, start, stop) <- value

(4)substring(text, first, last = 1000000L) <- value

# 对于单个字符串:
substr(x = "abcdef", start = 2, stop = 4)
## [1] "bcd"
substring(text = "abcdef", first = 2, last = 4)
## [1] "bcd"
substring(text = "abcdef", first = 1:6, last = 1:6)
## [1] "a" "b" "c" "d" "e" "f"
substr(x = rep("abcdef", 4), start = 1:4, stop = 4:5)
## [1] "abcd" "bcde" "cd"   "de"
# 对于字符串向量:
x <- c("asfef", "qwerty", "yuiop[", "b", "stuff.blah.yech")
substr(x = x, start = 2, stop = 5)  #对向量x每个元素截取子字符串
## [1] "sfef" "wert" "uiop" ""     "tuff"
substring(text = x, first = 2, last = 4:6)
## [1] "sfe"   "wert"  "uiop[" ""      "tuff"
substring(text = x, first = 2) <- c("..", "+++")  #以赋值的方式进行替换
x
## [1] "a..ef"           "q+++ty"          "y..op["          "b"              
## [5] "s..ff.blah.yech"

27. 字符串替换及大小写转换:

chartr(old, new, x);tolower(x);toupper(x);casefold(x, upper = FALSE)

x <- "MiXeD cAsE 123"
chartr("iXs", "why", x)  #i:w,X:h,s:y,单个字符对应替换
## [1] "MwheD cAyE 123"
chartr("a-cX", "D-Fw", x)
## [1] "MiweD FAsE 123"
tolower(x)  #转换成小写
## [1] "mixed case 123"
toupper(x)  #转换成大写
## [1] "MIXED CASE 123"
casefold(x, upper = FALSE)
## [1] "mixed case 123"
casefold(x, upper = TRUE)
## [1] "MIXED CASE 123"

28. 字符匹配与替换

  1. grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE, fixed = FALSE, useBytes = FALSE, invert = FALSE),结果返回匹配的向量x的元素的索引
  • ignore.case:逻辑值,默认值FALSE,区分大小写;
  • perl:逻辑值,默认值FALSE,此时pattern按扩展正则表达式解释;取TRUE时改用Perl兼容的正则表达式;
  • value:逻辑值,设置结果返回匹配元素的值还是索引,默认值为FALSE:返回索引;
  • fixed:逻辑值,默认值为FALSE,取值为TRUE时pattern按字面字符串匹配,不解释为正则表达式;
  • useBytes:逻辑值,默认取值FALSE;
  • invert:逻辑值,默认取值FALSE,设置结果返回匹配还是非匹配的元素;
txt <- c("10arm03", "Foot 12", " 678-lefroo.345", "__.bafoobar90..")
grep(pattern = "foo", x = txt, value = FALSE)  #区分大小写,结果返回匹配的元素索引
## [1] 4
grep(pattern = "foo", x = txt, value = TRUE)  #区分大小写,结果返回匹配的元素值
## [1] "__.bafoobar90.."
grep(pattern = "foo", x = txt, ignore.case = TRUE)  #忽略大小写,结果返回匹配的元素索引
## [1] 2 4
grep(pattern = "foo", x = txt, ignore.case = TRUE, value = TRUE)  #忽略大小写,结果返回匹配的元素值
## [1] "Foot 12"         "__.bafoobar90.."
grep(pattern = "foo", x = txt, ignore.case = TRUE, value = TRUE, invert = TRUE)  #忽略大小写,结果返回不匹配的元素值
## [1] "10arm03"         " 678-lefroo.345"
grep(pattern = "^[0-9]+", x = txt, perl = TRUE)  #返回以数字开头的元素索引
## [1] 1
grep(pattern = "[0-9]+$", x = txt, perl = TRUE, value = TRUE)  #返回以数字结尾的元素
## [1] "10arm03"         "Foot 12"         " 678-lefroo.345"
grep(pattern = "\\d$", x = txt, perl = TRUE, value = TRUE)  #返回以数字结尾的元素
## [1] "10arm03"         "Foot 12"         " 678-lefroo.345"
  1. grepl(pattern, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE),结果返回一个与向量x等长的逻辑向量,匹配的元素返回TRUE,不匹配的返回FALSE。
txt <- c("10arm03", "Foot 12", " 678-lefroo.345", "__.bafoobar90..")
grepl(pattern = "foo", x = txt)
## [1] FALSE FALSE FALSE  TRUE
grepl(pattern = "\\d$", x = txt, perl = TRUE)
## [1]  TRUE  TRUE  TRUE FALSE
  1. sub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE),只替换每个元素中第一个匹配的子串
txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
sub(pattern = "foo", replacement = "99", x = txt)  #将元素中的第一个foo替换成99
## [1] "10arm03"              "Foot 12 99t"          " 678-lefroo.345"     
## [4] "__.ba99bar90foobar.."
sub(pattern = "\\d+$", replacement = "+++", x = txt, perl = TRUE)  #将结尾的数字替换成+++
## [1] "10arm+++"              "Foot 12 foot"          " 678-lefroo.+++"      
## [4] "__.bafoobar90foobar.."
  1. gsub(pattern, replacement, x, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE),替换每个元素中所有匹配的子串
txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
gsub(pattern = "foo", replacement = "99", x = txt)  #将所有的foo替换成99
## [1] "10arm03"             "Foot 12 99t"         " 678-lefroo.345"    
## [4] "__.ba99bar9099bar.."
gsub(pattern = "\\d+", replacement = "+++", x = txt, perl = TRUE)  #将所有数字替换成+++
## [1] "+++arm+++"              "Foot +++ foot"         
## [3] " +++-lefroo.+++"        "__.bafoobar+++foobar.."
  1. regexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE),结果返回每个元素匹配的第一个位置及字符数目,不匹配的元素返回的位置和长度都是-1。
txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
regexpr(pattern = "foo", text = txt)
## [1] -1  9 -1  6
## attr(,"match.length")
## [1] -1  3 -1  3
## attr(,"useBytes")
## [1] TRUE
regexpr(pattern = "\\d+", text = txt)
## [1]  1  6  2 12
## attr(,"match.length")
## [1] 2 2 3 2
## attr(,"useBytes")
## [1] TRUE
  1. gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE),返回每个元素匹配的所有位置及相应的字符数目
txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
gregexpr(pattern = "foo", text = txt)
## [[1]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1] 9
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE
## 
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
## 
## [[4]]
## [1]  6 14
## attr(,"match.length")
## [1] 3 3
## attr(,"useBytes")
## [1] TRUE
gregexpr(pattern = "\\d+", text = txt)
## [[1]]
## [1] 1 6
## attr(,"match.length")
## [1] 2 2
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1] 6
## attr(,"match.length")
## [1] 2
## attr(,"useBytes")
## [1] TRUE
## 
## [[3]]
## [1]  2 13
## attr(,"match.length")
## [1] 3 3
## attr(,"useBytes")
## [1] TRUE
## 
## [[4]]
## [1] 12
## attr(,"match.length")
## [1] 2
## attr(,"useBytes")
## [1] TRUE
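  1. regexec(pattern, text, ignore.case = FALSE, fixed = FALSE, useBytes = FALSE),结果返回与text等长的列表:每个元素给出第一个匹配的起始位置及长度(属性match.length),若pattern含有括号子表达式,还依次给出各子表达式匹配的位置;不匹配的元素返回-1,NA元素返回NA。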
txt <- c(NA, "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
regexec(pattern = "foo", text = txt)
## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
## 
## [[2]]
## [1] 9
## attr(,"match.length")
## [1] 3
## 
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## 
## [[4]]
## [1] 6
## attr(,"match.length")
## [1] 3
regexec(pattern = "\\d+", text = txt)
## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
## 
## [[2]]
## [1] 6
## attr(,"match.length")
## [1] 2
## 
## [[3]]
## [1] 2
## attr(,"match.length")
## [1] 3
## 
## [[4]]
## [1] 12
## attr(,"match.length")
## [1] 2