在文本或非结构数据处理中往往需要正则表达式的强大功能,需要字符串的处理.
Examples
a1 = state.name
a1; nchar(a1)
## [1] "Alabama" "Alaska" "Arizona" "Arkansas"
## [5] "California" "Colorado" "Connecticut" "Delaware"
## [9] "Florida" "Georgia" "Hawaii" "Idaho"
## [13] "Illinois" "Indiana" "Iowa" "Kansas"
## [17] "Kentucky" "Louisiana" "Maine" "Maryland"
## [21] "Massachusetts" "Michigan" "Minnesota" "Mississippi"
## [25] "Missouri" "Montana" "Nebraska" "Nevada"
## [29] "New Hampshire" "New Jersey" "New Mexico" "New York"
## [33] "North Carolina" "North Dakota" "Ohio" "Oklahoma"
## [37] "Oregon" "Pennsylvania" "Rhode Island" "South Carolina"
## [41] "South Dakota" "Tennessee" "Texas" "Utah"
## [45] "Vermont" "Virginia" "Washington" "West Virginia"
## [49] "Wisconsin" "Wyoming"
## [1] 7 6 7 8 10 8 11 8 7 7 6 5 8 7 4 6 8 9 5 8 13 8 9
## [24] 11 8 7 8 6 13 10 10 8 14 12 4 8 6 12 12 14 12 9 5 4 7 8
## [47] 10 13 9 7
a = rnorm(100)
b = matrix(1:12, ncol = 3)
cc = iris
d = list(a = c('a', 'b', 'c'), b = 1:10, e = mtcars)
length(a); length(b); length(cc); length(d)
## [1] 100
## [1] 12
## [1] 5
## [1] 3
- cat()函数可以显示和连接字符串。该函数可以将字符值合并,并直接打印在屏幕中,该函数成为在函数内部打印消息或警告信息的理想函数
- cat()函数中可以使用’\n’作为换行符,确保该行信息完整输出;’\t’为制表符(tab),并不是固定4个字节的空格.
- cat()函数的参数fill可用于在输出字符串中自动插入换行符,如果fill设置为TRUE,则系统的width值将被用来确定行宽,如果fill参数为一个给定的数值,则输出结果的宽度将使用该值。
- cat()函数中有一个参数为file,该参数允许将输出结果写入到指定的文件中。
- print()函数和paste(), paste0()函数在屏幕中输出带引号的字符串。
- paste(), paste0()函数是连接字符串常用的函数。
Example :
a = 3
cat('The answer is', a)
## The answer is 3
print('The answer is', a)
## [1] "The answer is"
paste('The answer is', a)
## [1] "The answer is 3"
paste0('The answer is ', a, collapse = ' ')
## [1] "The answer is 3"
cat('My name is', 'Wangzhefeng')
## My name is Wangzhefeng
cat('My name is', '\n', 'Wangzhefeng')
## My name is
## Wangzhefeng
cat('My name is', '\t', 'Wangzhefeng')
## My name is Wangzhefeng
cat('当字符串很长时', '通过fill参数', '可以自动设定换行符',
'所以fill参数是非常灵活的', '在使用cat函数中强烈建议使用', fill = TRUE)
## 当字符串很长时 通过fill参数 可以自动设定换行符
## 所以fill参数是非常灵活的 在使用cat函数中强烈建议使用
cat('当字符串很长时', '通过fill参数', '可以自动设定换行符',
'所以fill参数是非常灵活的', '在使用cat函数中强烈建议使用', fill = 20)
## 当字符串很长时
## 通过fill参数
## 可以自动设定换行符
## 所以fill参数是非常灵活的
## 在使用cat函数中强烈建议使用
getwd()
cat('当字符串很长时', '通过fill参数', '可以自动设定换行符',
'所以fill参数是非常灵活的', '在使用cat函数中强烈建议使用之',
file = "cat.file",
fill = 20)
paste('one', 3, 'three', 4, 'five')
## [1] "one 3 three 4 five"
# 只传入一个向量且不指定collapse时,各元素不会被拼接成一个字符串
paste(c('one', 'two', 'three', 'four'))
## [1] "one" "two" "three" "four"
paste('one', 3, 'three', 4, 'five', sep = ' ')
## [1] "one 3 three 4 five"
paste('one', 3, 'three', 4, 'five', collapse = '-') # collapse = '-' 不起作用
## [1] "one 3 three 4 five"
paste(c('one', 'two', 'three', 'four'), collapse = '-')
## [1] "one-two-three-four"
paste(c('one', 'two', 'three', 'four'), sep = '-') # sep = '-' 不起作用
## [1] "one" "two" "three" "four"
paste('X', 1:5, sep = '')
## [1] "X1" "X2" "X3" "X4" "X5"
paste(c('X', 'Y'), 1:5, sep = '')
## [1] "X1" "Y2" "X3" "Y4" "X5"
paste(c('X', 'Y'), 1:5, sep = '', collapse = '|')
## [1] "X1|Y2|X3|Y4|X5"
paste(c('X', 'Y'), 1:5, '^', c('a', 'b'), sep = '_', collapse = '|')
## [1] "X_1_^_a|Y_2_^_b|X_3_^_a|Y_4_^_b|X_5_^_a"
paste(c('X', 'Y'), 1:5, '^', c('a', 'b'), sep = '_')
## [1] "X_1_^_a" "Y_2_^_b" "X_3_^_a" "Y_4_^_b" "X_5_^_a"
first(start)和last(stop)参数可以是一个数值,也可以是一个向量。在应用中强烈建议使用substring()函数,该函数更为稳定.
substr("abcdef", 2, 4)
## [1] "bcd"
substring("abcdef", 2, 4)
## [1] "bcd"
n = nchar("abcdef")
substring("abcdef", 1:n, 1:n)
## [1] "a" "b" "c" "d" "e" "f"
rep("abcdef", 4)
## [1] "abcdef" "abcdef" "abcdef" "abcdef"
substring(rep("abcdef", 4), 1:4, 4:5)
## [1] "abcd" "bcde" "cd" "de"
x = c("asfef", "qwerty", "yuiop[", "b", "stuff.blah.yech")
substring(x, 2, 5)
## [1] "sfef" "wert" "uiop" "" "tuff"
substring(x, 2, 4:6)
## [1] "sfe" "wert" "uiop[" "" "tuff"
为了找到字符串中一个特定字符的位置,首先需要将字符串转换为字符向量(可以向substring函数的first和last参数传递向量来完成),然后通过which函数确定某个字符的位置。
text = 'My name is Wang Zhefeng'
n = nchar(text)
str_value = substring(text, 1:n, 1:n)
str_value
## [1] "M" "y" " " "n" "a" "m" "e" " " "i" "s" " " "W" "a" "n" "g" " " "Z"
## [18] "h" "e" "f" "e" "n" "g"
which(str_value == 'i')
## [1] 9
函数解释 :
strsplit()函数可以使用字符串或正则表达式将字符串划分为更小的段,该函数的第一个参数是要拆分的字符串,第二个参数是用来将字符串分解成多个部分的字符值或正则表达式。该函数将分解后的子段以列表的形式返回。
函数语法
strsplit(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE)
Example 1:
string = 'Split the elements of a character vector x into substrings according to the matches to substring split within them.'
part1 = strsplit(string, split = '') # ''不是空格
part1
## [[1]]
## [1] "S" "p" "l" "i" "t" " " "t" "h" "e" " " "e" "l" "e" "m" "e" "n" "t"
## [18] "s" " " "o" "f" " " "a" " " "c" "h" "a" "r" "a" "c" "t" "e" "r" " "
## [35] "v" "e" "c" "t" "o" "r" " " "x" " " "i" "n" "t" "o" " " "s" "u" "b"
## [52] "s" "t" "r" "i" "n" "g" "s" " " "a" "c" "c" "o" "r" "d" "i" "n" "g"
## [69] " " "t" "o" " " "t" "h" "e" " " "m" "a" "t" "c" "h" "e" "s" " " "t"
## [86] "o" " " "s" "u" "b" "s" "t" "r" "i" "n" "g" " " "s" "p" "l" "i" "t"
## [103] " " "w" "i" "t" "h" "i" "n" " " "t" "h" "e" "m" "."
part2 = strsplit(string, split = ' ') # 空格
part2
## [[1]]
## [1] "Split" "the" "elements" "of" "a"
## [6] "character" "vector" "x" "into" "substrings"
## [11] "according" "to" "the" "matches" "to"
## [16] "substring" "split" "within" "them."
Example 2:
把数据框中的邮箱字段拆分成邮箱名和邮箱地址两个字段
df = data.frame(name = c('a', 'b', 'c', 'd'),
email = c('1234567@qq.com', 'qawsedrf@163.com', 'azsxdc@126.com', 'qwer@gmail.com'),
stringsAsFactors = FALSE)
df
## name email
## 1 a 1234567@qq.com
## 2 b qawsedrf@163.com
## 3 c azsxdc@126.com
## 4 d qwer@gmail.com
e_name = c('1234567', 'qawsedrf', 'azsxdc', 'qwer')
e_address = c('qq.com', '163.com', '126.com', 'gmail.com')
df = cbind(df, e_name, e_address)
df
## name email e_name e_address
## 1 a 1234567@qq.com 1234567 qq.com
## 2 b qawsedrf@163.com qawsedrf 163.com
## 3 c azsxdc@126.com azsxdc 126.com
## 4 d qwer@gmail.com qwer gmail.com
library(plyr)
df_email = as.list(df[, 2])
# Split one e-mail string on "@"; returns the list produced by strsplit(),
# i.e. one character vector holding the part before and the part after "@".
fun <- function(data) {
  strsplit(data, split = '@')
}
parts = llply(.data = df_email, .fun = fun)
parts
## [[1]]
## [[1]][[1]]
## [1] "1234567" "qq.com"
##
##
## [[2]]
## [[2]][[1]]
## [1] "qawsedrf" "163.com"
##
##
## [[3]]
## [[3]][[1]]
## [1] "azsxdc" "126.com"
##
##
## [[4]]
## [[4]][[1]]
## [1] "qwer" "gmail.com"
e_name = t(as.data.frame(parts)[1, ])
row.names(e_name) = NULL
e_name = as.data.frame(e_name)
names(e_name) = 'e_name'
e_name
## e_name
## 1 1234567
## 2 qawsedrf
## 3 azsxdc
## 4 qwer
e_address = t(as.data.frame(parts)[2, ])
row.names(e_address) = NULL
e_address = as.data.frame(e_address)
names(e_address) = 'e_address'
e_address
## e_address
## 1 qq.com
## 2 163.com
## 3 126.com
## 4 gmail.com
Example 3:
# Three consecutive spaces between "name" and "is": splitting on a single
# space therefore yields two empty strings, while the regex ' +' treats the
# whole run of spaces as one separator.
str <- 'My name   is Wangzhenfeng'
strsplit(str, split = ' ')
## [[1]]
## [1] "My" "name" "" ""
## [5] "is" "Wangzhenfeng"
strsplit(str, ' +')
## [[1]]
## [1] "My" "name" "is" "Wangzhenfeng"
str = '我们的统计工具是:R语言,他表现出极大的优势'
strsplit(str, split = '[:,]') # 为正则表达式指定多个分隔符号
## [[1]]
## [1] "我们的统计工具是" "R语言" "他表现出极大的优势"
函数解释 :
grep()函数接受一个正则表达式和一个字符串或字符串向量,并返回由正则表达式匹配的字符串元素的索引。如果参数value=TRUE,则它将返回与正则表达式匹配的实际字符串而不是其索引号。
函数语法
语法如下:其中x必须为字符向量
grep(pattern, x, ignore.case = FALSE, perl = FALSE, value = FALSE,
fixed = FALSE, useBytes = FALSE, invert = FALSE)
Example 1:
该函数的一个重要用途是依据名称从一个数据框中提取一组变量,如在LifeCycleSavings数据框中,存在两个变量,都是以’pop’开头,我们可以使用grep函数找到这两个变量
head(LifeCycleSavings)
## sr pop15 pop75 dpi ddpi
## Australia 11.43 29.35 2.87 2329.68 2.87
## Austria 12.07 23.32 4.41 1507.99 3.93
## Belgium 13.17 23.80 4.43 2108.47 3.82
## Bolivia 5.75 41.89 1.67 189.13 0.22
## Brazil 12.88 42.19 0.83 728.47 4.56
## Canada 8.79 31.72 2.85 2982.88 2.43
grep('^pop', names(LifeCycleSavings))
## [1] 2 3
grep('^pop', names(LifeCycleSavings), value = TRUE)
## [1] "pop15" "pop75"
Example 2:
s = c('Wangzhefeng', 'xuxin', 'wangfen')
grep('n$', s, value = TRUE)
## [1] "xuxin" "wangfen"
Example 3:
要查找的正则表达式不考虑输入的大小写时,可以使用ignore.case=TRUE
inp = c('run dog run', 'work doggedly', 'CAT AND DOG')
grep('dog', inp, ignore.case = TRUE, value = TRUE)
## [1] "run dog run" "work doggedly" "CAT AND DOG"
很显然第二个字符串’work doggedly’就不是我们所期望的结果,为解决该问题,可以使用转义尖括号(\\<dog\\>)限制字符串被空格、标点符号或起始行或结束行包围情况下的匹配。
inp = c('run dog run', 'work doggedly', 'CAT AND DOG')
grep('\\<dog\\>', inp, ignore.case = TRUE, value = TRUE)
## [1] "run dog run" "CAT AND DOG"
Example 4:
如果传递给grep的正则表达式与其任何输入都不匹配,grep将返回一个空的数值型向量,换句话说,该函数可以用来测试一个正则表达式是否存在。
函数解释 :
regexpr()和gregexpr()函数可用于准确指出和提取字符串中与正则表达式相匹配的部分,这两个函数的输出分别为一个向量和一个列表,由所发现的正则表达式匹配的起始点组成;如果没有匹配发生,返回值为-1,此外,match.length属性与起始点向量结合,提供字符匹配的准确信息。regexpr函数只提供其输入字符串中第一个匹配的有关信息,而gregexpr函数返回所有匹配的信息。
函数语法
regexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
gregexpr(pattern, text, ignore.case = FALSE, perl = FALSE,
fixed = FALSE, useBytes = FALSE)
Example 1:
gwh = gregexpr('[a-z][0-9]', text)
gwh
Example 2:
提取出匹配的字符 在这里使用另一个处理输出的函数mapply,该函数的第一个参数为函数,接受多个参数,其余参数是长度相同的向量,其元素将逐一传递到函数中。
wh1 = gregexpr('[a-z][0-9]', text)
# Pull the matched substrings out of `str`.
# `grep` holds the match start positions and carries a 'match.length'
# attribute (as in the gregexpr() results passed via mapply below); each match
# spans start .. start + length - 1.
getexpr <- function(str, grep) {
  match_len <- attr(grep, 'match.length')
  match_end <- grep + match_len - 1
  substring(str, grep, match_end)
}
res = mapply(getexpr, text, wh1)
res
函数解释 :
sub()和gsub()函数是基于正则表达式的文字替换,它们均接受正则表达式的输入参数。 sub()函数只改变第一次出现的正则表达式,而gsub()函数可以替换所有满足正则表达式的字符。 这两个函数的一个重要用途涉及到数值型数据中,这些数据从网页或财务报表中读入,并可能包含逗号或美元符号。
函数语法
sub(pattern, replacement, x,
ignore.case = FALSE, perl = FALSE, fixed = FALSE, useBytes = FALSE)
gsub(pattern, replacement, x,
ignore.case = FALSE, perl = FALSE,fixed = FALSE, useBytes = FALSE)
Example 1:
values = c('$123,456', '$23,111', '¥223,00', '12.7%', '$$11%')
sub(pattern = '[$,¥,%]', replacement = '', x = values)
## [1] "123,456" "23,111" "223,00" "12.7" "$11%"
as.numeric(gsub(pattern = '[$,¥,%]', replacement = '', x = values))
## [1] 123456.0 23111.0 22300.0 12.7 11.0
正则表达式是一种表达字符值模式的方法,可以被用来提取字符串的一部分或以某种方式修改这些字符串。
对正则表达式做一些总结性的工作:
要形成一个字符类,使用方括号[]把需要匹配的字符括起来。如需要创建一个由a,b或3组成的字符类,可用[ab3]表示。破折号可用在字符类内部来表示值域[a-z],[A-Z],[0-9]
如果在R中输入一个正则表达式,是使用双引号的字符串,就需要双反斜杠,如果使用readline输入表达式,只需要一个反斜杠。
R中正则表达式的符号:
表1.常用的元字符
代码 | 说明 |
---|---|
^ |
定位表达式,目标开始 |
$ |
定位表达式,目标结束 |
. |
匹配换行符以外的任何单个字符 |
\w |
匹配字母或数字或下划线或汉字 |
\s |
匹配任意的空白符 |
\d |
匹配数字 |
\b |
匹配单词的开始或结束 |
| |
分割不同的模式 |
() |
将相同模式放在一起 |
表2.常用的限定符
代码/语法 | 说明 |
---|---|
* |
匹配前面的实体出现0次或更多次 |
? |
匹配前面的实体出现0次或1次 |
+ |
匹配前面的实体出现1次或更多次 |
{n} |
匹配前面的实体精确地出现n次 |
{n,} |
匹配前面的实体至少出现n次 |
{n,m} |
匹配出现次数在n和m次之间 |
表3.常用的反义代码
代码/语法 | 说明 |
---|---|
\W |
匹配任意不是字母,数字,下划线,汉字的字符 |
\S |
匹配任意不是空白符的字符 |
\D |
匹配任意非数字的字符 |
\B |
匹配不是单词开头或结束的位置 |
[^x] |
匹配除了x以外的任意字符 |
[^aeiou] |
匹配除了aeiou这几个字母以外的任意字符 |
if(!require(stringr))install.packages("stringr")
## Loading required package: stringr
library(stringr)
Function
str_c() / str_join() : Join multiple strings into a single string.
Usage
str_c(..., sep = "", collapse = NULL)
str_join(..., sep = "", collapse = NULL)
Argument
- ... : One or more character vectors. Zero length arguments are removed.
- sep : String to insert between input vectors. 字符串之间的连接符,功能类似于paste()函数
- collapse : Optional string used to combine input vectors into single string. 如果是向量之间的连接,collapse的作用与sep一样,只不过此时sep无效

Examples
str_c("Letter: ", letters)
## [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e"
## [6] "Letter: f" "Letter: g" "Letter: h" "Letter: i" "Letter: j"
## [11] "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o"
## [16] "Letter: p" "Letter: q" "Letter: r" "Letter: s" "Letter: t"
## [21] "Letter: u" "Letter: v" "Letter: w" "Letter: x" "Letter: y"
## [26] "Letter: z"
str_c("Letter: ", letters, sep = "")
## [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e"
## [6] "Letter: f" "Letter: g" "Letter: h" "Letter: i" "Letter: j"
## [11] "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o"
## [16] "Letter: p" "Letter: q" "Letter: r" "Letter: s" "Letter: t"
## [21] "Letter: u" "Letter: v" "Letter: w" "Letter: x" "Letter: y"
## [26] "Letter: z"
str_c("Letter", letters, sep = ": ")
## [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e"
## [6] "Letter: f" "Letter: g" "Letter: h" "Letter: i" "Letter: j"
## [11] "Letter: k" "Letter: l" "Letter: m" "Letter: n" "Letter: o"
## [16] "Letter: p" "Letter: q" "Letter: r" "Letter: s" "Letter: t"
## [21] "Letter: u" "Letter: v" "Letter: w" "Letter: x" "Letter: y"
## [26] "Letter: z"
str_c(letters, " is for", "...")
## [1] "a is for..." "b is for..." "c is for..." "d is for..." "e is for..."
## [6] "f is for..." "g is for..." "h is for..." "i is for..." "j is for..."
## [11] "k is for..." "l is for..." "m is for..." "n is for..." "o is for..."
## [16] "p is for..." "q is for..." "r is for..." "s is for..." "t is for..."
## [21] "u is for..." "v is for..." "w is for..." "x is for..." "y is for..."
## [26] "z is for..."
str_c(letters[-26], " comes before ", letters[-1])
## [1] "a comes before b" "b comes before c" "c comes before d"
## [4] "d comes before e" "e comes before f" "f comes before g"
## [7] "g comes before h" "h comes before i" "i comes before j"
## [10] "j comes before k" "k comes before l" "l comes before m"
## [13] "m comes before n" "n comes before o" "o comes before p"
## [16] "p comes before q" "q comes before r" "r comes before s"
## [19] "s comes before t" "t comes before u" "u comes before v"
## [22] "v comes before w" "w comes before x" "x comes before y"
## [25] "y comes before z"
str_c(letters, collapse = "")
## [1] "abcdefghijklmnopqrstuvwxyz"
str_c(letters, collapse = ", ")
## [1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"
# Missing inputs give missing outputs
str_c(c("a", NA, "b"), "-d")
## [1] "a-d" NA "b-d"
# Use str_replace_na to display literal NAs:
str_c(str_replace_na(c("a", NA, "b")), "-d")
## [1] "a-d" "NA-d" "b-d"
# collapse参数,对多个字符串无效
str_c("a", "b", collapse = "-")
## [1] "ab"
str_c(c("a", "b"), c("a1", "b1"), collapse = "-")
## [1] "aa1-bb1"
library(magrittr)
c(str_c(c("a", "b"), c("a1", "b1"))) %>% str_c(collapse = "-")
## [1] "aa1-bb1"
对比str_c()函数和paste()函数之间的不同点。
# 多字符串拼接,默认的sep参数行为不一致
str_c('a', 'b') # 默认 sep = ""
## [1] "ab"
paste('a', 'b') # 默认 sep = " "
## [1] "a b"
str_c('a', 'b', sep = " ")
## [1] "a b"
paste('a', 'b', sep = "")
## [1] "ab"
# 向量拼接字符串,collapse参数的行为一致
str_c(head(letters), collapse = "")
## [1] "abcdef"
paste(head(letters), collapse = "")
## [1] "abcdef"
#拼接有NA值的字符串向量,对NA的处理行为不一致
str_c(c("a", NA, "b"), "-d")
## [1] "a-d" NA "b-d"
paste(c("a", NA, "b"), "-d")
## [1] "a -d" "NA -d" "b -d"
Function
str_trim : Trim whitespace from start and end of string.
Usage
str_trim(string, side = c("both", "left", "right"))
Argument
- string : A character vector.
- side : Side on which to remove whitespace (left, right or both).

Examples
# 默认去掉两边的空格
str_trim(" String with trailing and leading white space\t")
## [1] "String with trailing and leading white space"
str_trim("\n\nString with trailing and leading white space\n\n")
## [1] "String with trailing and leading white space"
# both
str_trim("\n\nString with trailing and leading white space\n\n", side = "both")
## [1] "String with trailing and leading white space"
# left
str_trim("\n\nString with trailing and leading white space\n\n", side = "left")
## [1] "String with trailing and leading white space\n\n"
# right
str_trim("\n\nString with trailing and leading white space\n\n", side = "right")
## [1] "\n\nString with trailing and leading white space"
Function
str_pad : Pad a string.
Usage
str_pad(string, width, side = c("left", "right", "both"), pad = " ")
Argument
- string : A character vector.
- width : Minimum width of padded strings.
- side : Side on which padding character is added (left, right or both).
- pad : Single padding character (default is a space).

Examples
# 默认left填充
rbind(
str_pad("hadley", 30, "left"),
str_pad("hadley", 30, "right"),
str_pad("hadley", 30, "both")
)
## [,1]
## [1,] " hadley"
## [2,] "hadley "
## [3,] " hadley "
# All arguments are vectorised except side
str_pad(c("a", "abc", "abcdef"), width = 10)
## [1] " a" " abc" " abcdef"
str_pad("a", width = c(5, 10, 20))
## [1] " a" " a" " a"
str_pad("a", width = 10, pad = c("-", "_", " "))
## [1] "---------a" "_________a" " a"
# Longer strings are returned unchanged
str_pad("hadley", width = 3)
## [1] "hadley"
Function
str_dup : Duplicate and concatenate strings within a character vector.
Usage
str_dup(string, times)
Argument
- string : Input character vector.
- times : Number of times to duplicate each string.

Examples
fruit <- c("apple", "pear", "banana")
str_dup(fruit, 2)
## [1] "appleapple" "pearpear" "bananabanana"
str_dup(fruit, 1:3)
## [1] "apple" "pearpear" "bananabananabanana"
str_c("ba", str_dup("na", 0:5))
## [1] "ba" "bana" "banana" "bananana"
## [5] "banananana" "bananananana"
Function
str_wrap : Wrap strings into nicely formatted paragraphs
Usage
str_wrap(string, width = 80, indent = 0, exdent = 0)
Argument
- string : character vector of strings to reformat.
- width : positive integer giving target line width in characters. A width less than or equal to 1 will put each word on its own line.
- indent : non-negative integer giving indentation of first line in each paragraph(首行缩进)
- exdent : non-negative integer giving indentation of following lines in each paragraph(非首行缩进)

Examples
thanks_path <- file.path(R.home("doc"), "THANKS")
thanks <- str_c(readLines(thanks_path), collapse = "\n")
thanks <- word(thanks, 1, 3, fixed("\n\n"))
thanks
## [1] "R would not be what it is today without the invaluable help of these\npeople outside of the R core team, who contributed by donating code, bug\nfixes and documentation:\n\nValerio Aimale, Thomas Baier, Henrik Bengtsson, Roger Bivand,\nBen Bolker, David Brahm, G\"oran Brostr\"om, Patrick Burns, Vince Carey,\nSaikat DebRoy, Matt Dowle, Brian D'Urso, Lyndon Drake, Dirk Eddelbuettel,\nClaus Ekstrom, Sebastian Fischmeister, John Fox, Paul Gilbert,\nYu Gong, Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,\nRobert King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert,\nJan de Leeuw, Jim Lindsey, Patrick Lindsey, Catherine Loader,\nGordon Maclean, John Maindonald, David Meyer, Ei-ji Nakama,\nJens Oehlschaegel, Steve Oncley, Richard O'Keefe, Hubert Palme,\nRoger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony Rossini,\nJonathan Rougier, Petr Savicky, Guenther Sawitzki, Marc Schwartz,\nArun Srinivasan, Detlef Steuer, Bill Simpson, Gordon Smyth, Adrian Trapletti,\nTerry Therneau, Rolf Turner, Bill Venables, Gregory R. Warnes,\nAndreas Weingessel, Morten Welinder, James Wettenhall, Simon Wood, and\nAchim Zeileis.\n\nOthers have written code that has been adopted by R and is\nacknowledged in the code files, including"
cat(str_wrap(thanks), "\n")
## R would not be what it is today without the invaluable help of these people
## outside of the R core team, who contributed by donating code, bug fixes and
## documentation: Valerio Aimale, Thomas Baier, Henrik Bengtsson, Roger Bivand,
## Ben Bolker, David Brahm, G"oran Brostr"om, Patrick Burns, Vince Carey, Saikat
## DebRoy, Matt Dowle, Brian D'Urso, Lyndon Drake, Dirk Eddelbuettel, Claus
## Ekstrom, Sebastian Fischmeister, John Fox, Paul Gilbert, Yu Gong, Gabor
## Grothendieck, Frank E Harrell Jr, Torsten Hothorn, Robert King, Kjetil Kjernsmo,
## Roger Koenker, Philippe Lambert, Jan de Leeuw, Jim Lindsey, Patrick Lindsey,
## Catherine Loader, Gordon Maclean, John Maindonald, David Meyer, Ei-ji Nakama,
## Jens Oehlschaegel, Steve Oncley, Richard O'Keefe, Hubert Palme, Roger D. Peng,
## Jose' C. Pinheiro, Tony Plate, Anthony Rossini, Jonathan Rougier, Petr Savicky,
## Guenther Sawitzki, Marc Schwartz, Arun Srinivasan, Detlef Steuer, Bill Simpson,
## Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner, Bill Venables,
## Gregory R. Warnes, Andreas Weingessel, Morten Welinder, James Wettenhall, Simon
## Wood, and Achim Zeileis. Others have written code that has been adopted by R and
## is acknowledged in the code files, including
cat(str_wrap(thanks, width = 40), "\n")
## R would not be what it is today
## without the invaluable help of these
## people outside of the R core team,
## who contributed by donating code,
## bug fixes and documentation: Valerio
## Aimale, Thomas Baier, Henrik Bengtsson,
## Roger Bivand, Ben Bolker, David Brahm,
## G"oran Brostr"om, Patrick Burns, Vince
## Carey, Saikat DebRoy, Matt Dowle,
## Brian D'Urso, Lyndon Drake, Dirk
## Eddelbuettel, Claus Ekstrom, Sebastian
## Fischmeister, John Fox, Paul Gilbert,
## Yu Gong, Gabor Grothendieck, Frank E
## Harrell Jr, Torsten Hothorn, Robert
## King, Kjetil Kjernsmo, Roger Koenker,
## Philippe Lambert, Jan de Leeuw, Jim
## Lindsey, Patrick Lindsey, Catherine
## Loader, Gordon Maclean, John Maindonald,
## David Meyer, Ei-ji Nakama, Jens
## Oehlschaegel, Steve Oncley, Richard
## O'Keefe, Hubert Palme, Roger D. Peng,
## Jose' C. Pinheiro, Tony Plate, Anthony
## Rossini, Jonathan Rougier, Petr Savicky,
## Guenther Sawitzki, Marc Schwartz, Arun
## Srinivasan, Detlef Steuer, Bill Simpson,
## Gordon Smyth, Adrian Trapletti, Terry
## Therneau, Rolf Turner, Bill Venables,
## Gregory R. Warnes, Andreas Weingessel,
## Morten Welinder, James Wettenhall, Simon
## Wood, and Achim Zeileis. Others have
## written code that has been adopted by R
## and is acknowledged in the code files,
## including
cat(str_wrap(thanks, width = 60, indent = 2), "\n")
## R would not be what it is today without the invaluable help
## of these people outside of the R core team, who contributed
## by donating code, bug fixes and documentation: Valerio
## Aimale, Thomas Baier, Henrik Bengtsson, Roger Bivand,
## Ben Bolker, David Brahm, G"oran Brostr"om, Patrick Burns,
## Vince Carey, Saikat DebRoy, Matt Dowle, Brian D'Urso,
## Lyndon Drake, Dirk Eddelbuettel, Claus Ekstrom, Sebastian
## Fischmeister, John Fox, Paul Gilbert, Yu Gong, Gabor
## Grothendieck, Frank E Harrell Jr, Torsten Hothorn, Robert
## King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan
## de Leeuw, Jim Lindsey, Patrick Lindsey, Catherine Loader,
## Gordon Maclean, John Maindonald, David Meyer, Ei-ji Nakama,
## Jens Oehlschaegel, Steve Oncley, Richard O'Keefe, Hubert
## Palme, Roger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony
## Rossini, Jonathan Rougier, Petr Savicky, Guenther Sawitzki,
## Marc Schwartz, Arun Srinivasan, Detlef Steuer, Bill Simpson,
## Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner,
## Bill Venables, Gregory R. Warnes, Andreas Weingessel, Morten
## Welinder, James Wettenhall, Simon Wood, and Achim Zeileis.
## Others have written code that has been adopted by R and is
## acknowledged in the code files, including
cat(str_wrap(thanks, width = 60, exdent = 2), "\n")
## R would not be what it is today without the invaluable help
## of these people outside of the R core team, who contributed
## by donating code, bug fixes and documentation: Valerio
## Aimale, Thomas Baier, Henrik Bengtsson, Roger Bivand,
## Ben Bolker, David Brahm, G"oran Brostr"om, Patrick Burns,
## Vince Carey, Saikat DebRoy, Matt Dowle, Brian D'Urso,
## Lyndon Drake, Dirk Eddelbuettel, Claus Ekstrom, Sebastian
## Fischmeister, John Fox, Paul Gilbert, Yu Gong, Gabor
## Grothendieck, Frank E Harrell Jr, Torsten Hothorn, Robert
## King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan
## de Leeuw, Jim Lindsey, Patrick Lindsey, Catherine Loader,
## Gordon Maclean, John Maindonald, David Meyer, Ei-ji Nakama,
## Jens Oehlschaegel, Steve Oncley, Richard O'Keefe, Hubert
## Palme, Roger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony
## Rossini, Jonathan Rougier, Petr Savicky, Guenther Sawitzki,
## Marc Schwartz, Arun Srinivasan, Detlef Steuer, Bill Simpson,
## Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner,
## Bill Venables, Gregory R. Warnes, Andreas Weingessel, Morten
## Welinder, James Wettenhall, Simon Wood, and Achim Zeileis.
## Others have written code that has been adopted by R and is
## acknowledged in the code files, including
cat(str_wrap(thanks, width = 0, exdent = 2), "\n")
## R
## would
## not
## be
## what
## it
## is
## today
## without
## the
## invaluable
## help
## of
## these
## people
## outside
## of
## the
## R
## core
## team,
## who
## contributed
## by
## donating
## code,
## bug
## fixes
## and
## documentation:
## Valerio
## Aimale,
## Thomas
## Baier,
## Henrik
## Bengtsson,
## Roger
## Bivand,
## Ben
## Bolker,
## David
## Brahm,
## G"oran
## Brostr"om,
## Patrick
## Burns,
## Vince
## Carey,
## Saikat
## DebRoy,
## Matt
## Dowle,
## Brian
## D'Urso,
## Lyndon
## Drake,
## Dirk
## Eddelbuettel,
## Claus
## Ekstrom,
## Sebastian
## Fischmeister,
## John
## Fox,
## Paul
## Gilbert,
## Yu
## Gong,
## Gabor
## Grothendieck,
## Frank
## E
## Harrell
## Jr,
## Torsten
## Hothorn,
## Robert
## King,
## Kjetil
## Kjernsmo,
## Roger
## Koenker,
## Philippe
## Lambert,
## Jan
## de
## Leeuw,
## Jim
## Lindsey,
## Patrick
## Lindsey,
## Catherine
## Loader,
## Gordon
## Maclean,
## John
## Maindonald,
## David
## Meyer,
## Ei-
## ji
## Nakama,
## Jens
## Oehlschaegel,
## Steve
## Oncley,
## Richard
## O'Keefe,
## Hubert
## Palme,
## Roger
## D.
## Peng,
## Jose'
## C.
## Pinheiro,
## Tony
## Plate,
## Anthony
## Rossini,
## Jonathan
## Rougier,
## Petr
## Savicky,
## Guenther
## Sawitzki,
## Marc
## Schwartz,
## Arun
## Srinivasan,
## Detlef
## Steuer,
## Bill
## Simpson,
## Gordon
## Smyth,
## Adrian
## Trapletti,
## Terry
## Therneau,
## Rolf
## Turner,
## Bill
## Venables,
## Gregory
## R.
## Warnes,
## Andreas
## Weingessel,
## Morten
## Welinder,
## James
## Wettenhall,
## Simon
## Wood,
## and
## Achim
## Zeileis.
## Others
## have
## written
## code
## that
## has
## been
## adopted
## by
## R
## and
## is
## acknowledged
## in
## the
## code
## files,
## including
# txt <- 'R语言作为统计学一门语言,一直在小众领域闪耀着光芒。直到大数据的爆发,R语言变成了一门炙手可热的数据分析的利器。随着越来越多的工程背景的人的加入,R语言的社区在迅速扩大成长。现在已不仅仅是统计领域,教育,银行,电商,互联网….都在使用R语言。'
# 设置宽度为40个字符
# cat(str_wrap(txt, width = 40), "\n")
# 设置宽度为60字符,首行缩进2字符
# cat(str_wrap(txt, width = 60, indent = 2), "\n")
# 设置宽度为10字符,非首行缩进4字符
# cat(str_wrap(txt, width = 10, exdent = 4), "\n")
Function
str_sub : Extract and replace substrings from a character vector.
Usage
str_sub(string, start = 1L, end = -1L)
str_sub(string, start = 1L, end = -1L) <- value
Argument
- string : input character vector.
- start, end : Two integer vectors. start gives the position of the first character (defaults to first), end gives the position of the last (defaults to last character). Alternatively, pass a two-column matrix to start. Negative values count backwards from the last character.
- value : replacement string

Examples
hw <- "Hadley Wickham"
str_sub(hw, 1, 6)
## [1] "Hadley"
str_sub(hw, end = 6)
## [1] "Hadley"
str_sub(hw, 8, 14)
## [1] "Wickham"
str_sub(hw, 8)
## [1] "Wickham"
str_sub(hw, c(1, 8), c(6, 14))
## [1] "Hadley" "Wickham"
# Negative indices
str_sub(hw, -1)
## [1] "m"
str_sub(hw, -7)
## [1] "Wickham"
str_sub(hw, end = -7)
## [1] "Hadley W"
# Alternatively, you can pass in a two column matrix, as in the
# output from str_locate_all
pos <- str_locate_all(hw, "[aeio]")[[1]]
pos
## start end
## [1,] 2 2
## [2,] 5 5
## [3,] 9 9
## [4,] 13 13
str_sub(hw, pos)
## [1] "a" "e" "i" "a"
str_sub(hw, pos[, 1], pos[, 2])
## [1] "a" "e" "i" "a"
# Vectorisation
str_sub(hw, seq_len(str_length(hw)))
## [1] "Hadley Wickham" "adley Wickham" "dley Wickham" "ley Wickham"
## [5] "ey Wickham" "y Wickham" " Wickham" "Wickham"
## [9] "ickham" "ckham" "kham" "ham"
## [13] "am" "m"
str_sub(hw, end = seq_len(str_length(hw)))
## [1] "H" "Ha" "Had" "Hadl"
## [5] "Hadle" "Hadley" "Hadley " "Hadley W"
## [9] "Hadley Wi" "Hadley Wic" "Hadley Wick" "Hadley Wickh"
## [13] "Hadley Wickha" "Hadley Wickham"
# Replacement form
x <- "BBCDEF"
str_sub(x, 1, 1) <- "A"
x
## [1] "ABCDEF"
str_sub(x, -1, -1) <- "K"
x
## [1] "ABCDEK"
str_sub(x, -2, -2) <- "GHIJ"
x
## [1] "ABCDGHIJK"
str_sub(x, 2, -2) <- ""
x
## [1] "AK"
Function
str_count : Count the number of matches in a string.
Usage
str_count(string, pattern = "")
Argument
- string : Input vector. Either a character vector, or something coercible to one.
- pattern : Pattern to look for. The default interpretation is a regular expression, as described in stringi-search-regex. Control options with regex(). Match a fixed string (i.e. by comparing only bytes), using fixed(x). This is fast, but approximate. Generally, for matching human text, you'll want coll(x) which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with boundary(). An empty pattern, "", is equivalent to boundary("character").

Examples
fruit <- c("apple", "banana", "pear", "pineapple")
str_count(fruit, "a")
## [1] 1 3 1 1
str_count(fruit, "p")
## [1] 2 0 1 3
str_count(fruit, "e")
## [1] 1 0 1 2
str_count(fruit, c("a", "b", "p", "p"))
## [1] 1 1 1 3
str_count(c("a.", "...", ".a.a"), ".")
## [1] 2 3 4
str_count(c("a.", "...", ".a.a"), fixed("."))
## [1] 1 3 2
str_count(c("a.", "...", ".a.a"), "\\.")
## [1] 1 3 2
Function
str_length : The length of a string.
Usage
str_length(string)
Argument
- string : Input vector. Either a character vector, or something coercible to one.

Examples
str_length(letters)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
str_length(NA)
## [1] NA
str_length(factor("abc"))
## [1] 3
str_length(c("i", "like", "programming", NA))
## [1] 1 4 11 NA
# Two ways of representing a u with an umlaut
u1 <- "\u00fc"
u2 <- stringi::stri_trans_nfd(u1)
# They print the same:
u1
## [1] "ü"
u2
## [1] "ü"
# But have a different length
str_length(u1)
## [1] 1
str_length(u2)
## [1] 2
# Even though they have the same number of characters
str_count(u1)
## [1] 1
str_count(u2)
## [1] 1
Function
str_order : Order or sort a character vector.
Usage
str_order(x, decreasing = FALSE, na_last = TRUE, locale = "", ...)
str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "", ...)
Argument
- x : A character vector to sort.
- decreasing : A boolean. If FALSE, the default, sorts from lowest to highest; if TRUE sorts from highest to lowest.
- na_last : Where should NA go? TRUE at the end, FALSE at the beginning, NA dropped.
- locale : In which locale should the sorting occur? Defaults to the current locale.
- ... : Other options used to control sorting order. Passed on to stri_opts_collator.

Examples
# 按ASCII字母排序
str_order(letters, locale = "en")
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26
str_sort(letters, locale = "en")
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
str_order(letters, locale = "haw")
## [1] 1 5 9 15 21 2 3 4 6 7 8 10 11 12 13 14 16 17 18 19 20 22 23
## [24] 24 25 26
str_sort(letters, locale = "haw")
## [1] "a" "e" "i" "o" "u" "b" "c" "d" "f" "g" "h" "j" "k" "l" "m" "n" "p"
## [18] "q" "r" "s" "t" "v" "w" "x" "y" "z"
# 按拼音排序
str_sort(c('你','好','粉','丝','日','志'),locale = "zh")
## [1] "粉" "好" "你" "日" "丝" "志"
Function
str_split : Split up a string into pieces
Usage
str_split(string, pattern, n = Inf)
str_split_fixed(string, pattern, n)
Argument
- string : Input vector. Either a character vector, or something coercible to one.
- pattern : Pattern to look for. The default interpretation is a regular expression, as described in stringi-search-regex. Control options with regex(). Match a fixed string (i.e. by comparing only bytes), using fixed(x). This is fast, but approximate. Generally, for matching human text, you'll want coll(x) which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with boundary(). An empty pattern, "", is equivalent to boundary("character").
- n : number of pieces to return. Default (Inf) uses all possible split positions. For str_split_fixed, if n is greater than the number of pieces, the result will be padded with empty strings.

Examples
fruits <- c(
"apples and oranges and pears and bananas",
"pineapples and mangos and guavas"
)
str_split(fruits, " and ")
## [[1]]
## [1] "apples" "oranges" "pears" "bananas"
##
## [[2]]
## [1] "pineapples" "mangos" "guavas"
# Specify n to restrict the number of possible matches
str_split(fruits, " and ", n = 3)
## [[1]]
## [1] "apples" "oranges" "pears and bananas"
##
## [[2]]
## [1] "pineapples" "mangos" "guavas"
str_split(fruits, " and ", n = 2)
## [[1]]
## [1] "apples" "oranges and pears and bananas"
##
## [[2]]
## [1] "pineapples" "mangos and guavas"
# If n greater than number of pieces, no padding occurs
str_split(fruits, " and ", n = 5)
## [[1]]
## [1] "apples" "oranges" "pears" "bananas"
##
## [[2]]
## [1] "pineapples" "mangos" "guavas"
# Use fixed to return a character matrix
str_split_fixed(fruits, " and ", 3)
## [,1] [,2] [,3]
## [1,] "apples" "oranges" "pears and bananas"
## [2,] "pineapples" "mangos" "guavas"
str_split_fixed(fruits, " and ", 4)
## [,1] [,2] [,3] [,4]
## [1,] "apples" "oranges" "pears" "bananas"
## [2,] "pineapples" "mangos" "guavas" ""
Function
str_subset: Keep strings matching a pattern
Usage
str_subset(string, pattern)
Argument
string
Input vector. Either a character vector, or something coercible to one.
pattern
Pattern to look for. The default interpretation is a regular expression, as described in stringi-search-regex. Control options with regex(). Match a fixed string (i.e. by comparing only bytes), using fixed(x). This is fast, but approximate. Generally, for matching human text, you’ll want coll(x) which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with boundary(). An empty pattern, “”, is equivalent to boundary(“character”).
Examples
fruit <- c("apple", "banana", "pear", "pinapple")
str_subset(fruit, "a")
## [1] "apple" "banana" "pear" "pinapple"
str_subset(fruit, "^a")
## [1] "apple"
str_subset(fruit, "a$")
## [1] "banana"
str_subset(fruit, "b")
## [1] "banana"
str_subset(fruit, "[aeiou]")
## [1] "apple" "banana" "pear" "pinapple"
# Missings are silently dropped
str_subset(c("a", NA, "b"), ".")
## [1] "a" "b"
Function
word:Extract words from a sentence.
Usage
word(string, start = 1L, end = start, sep = fixed(" "))
Argument
string
input character vector.
start
integer vector giving position of first word to extract. Defaults to first word. If negative, counts backwards from the last word.
end
integer vector giving position of last word to extract. Defaults to start, i.e. a single word is extracted. If negative, counts backwards from the last word.
sep
separator between words. Defaults to single space.
Examples
sentences <- c("Jane saw a cat", "Jane sat down")
word(sentences, 1)
## [1] "Jane" "Jane"
word(sentences, 2)
## [1] "saw" "sat"
word(sentences, -1)
## [1] "cat" "down"
word(sentences, 2, -1)
## [1] "saw a cat" "sat down"
# Also vectorised over start and end
word(sentences[1], 1:3, -1)
## [1] "Jane saw a cat" "saw a cat" "a cat"
word(sentences[1], 1, 1:4)
## [1] "Jane" "Jane saw" "Jane saw a" "Jane saw a cat"
# Can define words by other separators
str <- 'abc.def..123.4568.999'
word(str, 1, sep = fixed('..'))
## [1] "abc.def"
word(str, 2, sep = fixed('..'))
## [1] "123.4568.999"
Function
str_detect: Detect the presence or absence of a pattern in a string
Usage
str_detect(string, pattern)
Argument
string
Input vector. Either a character vector, or something coercible to one.
pattern
Pattern to look for. The default interpretation is a regular expression, as described in stringi-search-regex. Control options with regex(). Match a fixed string (i.e. by comparing only bytes), using fixed(x). This is fast, but approximate. Generally, for matching human text, you’ll want coll(x) which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with boundary(). An empty pattern, “”, is equivalent to boundary(“character”).
Examples
fruit <- c("apple", "banana", "pear", "pinapple")
str_detect(fruit, "a")
## [1] TRUE TRUE TRUE TRUE
str_detect(fruit, "^a")
## [1] TRUE FALSE FALSE FALSE
str_detect(fruit, "a$")
## [1] FALSE TRUE FALSE FALSE
str_detect(fruit, "b")
## [1] FALSE TRUE FALSE FALSE
str_detect(fruit, "[aeiou]")
## [1] TRUE TRUE TRUE TRUE
# Also vectorised over pattern
str_detect("aecfg", letters)
## [1] TRUE FALSE TRUE FALSE TRUE TRUE TRUE FALSE FALSE FALSE FALSE
## [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## [23] FALSE FALSE FALSE FALSE
Function
str_match: Extract matched groups from a string
Usage
str_match(string, pattern)
str_match_all(string, pattern)
Argument
string
Input vector. Either a character vector, or something coercible to one.
pattern
Pattern to look for, as defined by an ICU regular expression. See stringi-search-regex for more details.
Examples
strings <- c(" 219 733 8965", "329-293-8753 ", "banana", "595 794 7569",
"387 287 6718", "apple", "233.398.9187 ", "482 952 3315",
"239 923 8115 and 842 566 4692", "Work: 579-499-7527", "$1000",
"Home: 543.355.3679")
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
str_extract(strings, phone)
## [1] "219 733 8965" "329-293-8753" NA "595 794 7569"
## [5] "387 287 6718" NA "233.398.9187" "482 952 3315"
## [9] "239 923 8115" "579-499-7527" NA "543.355.3679"
str_match(strings, phone)
## [,1] [,2] [,3] [,4]
## [1,] "219 733 8965" "219" "733" "8965"
## [2,] "329-293-8753" "329" "293" "8753"
## [3,] NA NA NA NA
## [4,] "595 794 7569" "595" "794" "7569"
## [5,] "387 287 6718" "387" "287" "6718"
## [6,] NA NA NA NA
## [7,] "233.398.9187" "233" "398" "9187"
## [8,] "482 952 3315" "482" "952" "3315"
## [9,] "239 923 8115" "239" "923" "8115"
## [10,] "579-499-7527" "579" "499" "7527"
## [11,] NA NA NA NA
## [12,] "543.355.3679" "543" "355" "3679"
# Extract/match all
str_extract_all(strings, phone)
## [[1]]
## [1] "219 733 8965"
##
## [[2]]
## [1] "329-293-8753"
##
## [[3]]
## character(0)
##
## [[4]]
## [1] "595 794 7569"
##
## [[5]]
## [1] "387 287 6718"
##
## [[6]]
## character(0)
##
## [[7]]
## [1] "233.398.9187"
##
## [[8]]
## [1] "482 952 3315"
##
## [[9]]
## [1] "239 923 8115" "842 566 4692"
##
## [[10]]
## [1] "579-499-7527"
##
## [[11]]
## character(0)
##
## [[12]]
## [1] "543.355.3679"
str_match_all(strings, phone)
## [[1]]
## [,1] [,2] [,3] [,4]
## [1,] "219 733 8965" "219" "733" "8965"
##
## [[2]]
## [,1] [,2] [,3] [,4]
## [1,] "329-293-8753" "329" "293" "8753"
##
## [[3]]
## [,1] [,2] [,3] [,4]
##
## [[4]]
## [,1] [,2] [,3] [,4]
## [1,] "595 794 7569" "595" "794" "7569"
##
## [[5]]
## [,1] [,2] [,3] [,4]
## [1,] "387 287 6718" "387" "287" "6718"
##
## [[6]]
## [,1] [,2] [,3] [,4]
##
## [[7]]
## [,1] [,2] [,3] [,4]
## [1,] "233.398.9187" "233" "398" "9187"
##
## [[8]]
## [,1] [,2] [,3] [,4]
## [1,] "482 952 3315" "482" "952" "3315"
##
## [[9]]
## [,1] [,2] [,3] [,4]
## [1,] "239 923 8115" "239" "923" "8115"
## [2,] "842 566 4692" "842" "566" "4692"
##
## [[10]]
## [,1] [,2] [,3] [,4]
## [1,] "579-499-7527" "579" "499" "7527"
##
## [[11]]
## [,1] [,2] [,3] [,4]
##
## [[12]]
## [,1] [,2] [,3] [,4]
## [1,] "543.355.3679" "543" "355" "3679"
Function
str_extract:Extract matching patterns from a string
Usage
str_extract(string, pattern)
str_extract_all(string, pattern, simplify = FALSE)
Argument
string
Input vector. Either a character vector, or something coercible to one.
pattern
Pattern to look for. The default interpretation is a regular expression, as described in stringi-search-regex. Control options with regex(). Match a fixed string (i.e. by comparing only bytes), using fixed(x). This is fast, but approximate. Generally, for matching human text, you’ll want coll(x) which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with boundary(). An empty pattern, “”, is equivalent to boundary(“character”).
simplify
If FALSE, the default, returns a list of character vectors. If TRUE returns a character matrix.
Examples
shopping_list <- c("apples x4", "bag of flour", "bag of sugar", "milk x2")
str_extract(shopping_list, "\\d")
## [1] "4" NA NA "2"
str_extract(shopping_list, "[a-z]+")
## [1] "apples" "bag" "bag" "milk"
str_extract(shopping_list, "[a-z]{1,4}")
## [1] "appl" "bag" "bag" "milk"
str_extract(shopping_list, "\\b[a-z]{1,4}\\b")
## [1] NA "bag" "bag" "milk"
# Extract all matches
str_extract_all(shopping_list, "[a-z]+")
## [[1]]
## [1] "apples" "x"
##
## [[2]]
## [1] "bag" "of" "flour"
##
## [[3]]
## [1] "bag" "of" "sugar"
##
## [[4]]
## [1] "milk" "x"
str_extract_all(shopping_list, "\\b[a-z]+\\b")
## [[1]]
## [1] "apples"
##
## [[2]]
## [1] "bag" "of" "flour"
##
## [[3]]
## [1] "bag" "of" "sugar"
##
## [[4]]
## [1] "milk"
str_extract_all(shopping_list, "\\d")
## [[1]]
## [1] "4"
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## [1] "2"
# Simplify results into character matrix
str_extract_all(shopping_list, "\\b[a-z]+\\b", simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "apples" "" ""
## [2,] "bag" "of" "flour"
## [3,] "bag" "of" "sugar"
## [4,] "milk" "" ""
str_extract_all(shopping_list, "\\d", simplify = TRUE)
## [,1]
## [1,] "4"
## [2,] ""
## [3,] ""
## [4,] "2"
Function
str_replace: Replace matched patterns in a string
str_replace_na: Turn NA into "NA"
Usage
str_replace(string, pattern, replacement)
str_replace_all(string, pattern, replacement)
str_replace_na(string, replacement = "NA")
Argument
string
Input vector. Either a character vector, or something coercible to one.
pattern, replacement
Supply separate pattern and replacement strings to vectorise over the patterns. References of the form \1, \2, etc. will be replaced with the contents of the respective matched group (created by ()) within the pattern. For str_replace_all only, you can apply multiple patterns and replacements to each string by passing a named character vector to pattern.
Examples
fruits <- c("one apple", "two pears", "three bananas")
str_replace(fruits, "[aeiou]", "-")
## [1] "-ne apple" "tw- pears" "thr-e bananas"
str_replace_all(fruits, "[aeiou]", "-")
## [1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
str_replace(fruits, "([aeiou])", "")
## [1] "ne apple" "tw pears" "thre bananas"
str_replace(fruits, "([aeiou])", "\\1\\1")
## [1] "oone apple" "twoo pears" "threee bananas"
str_replace(fruits, "[aeiou]", c("1", "2", "3"))
## [1] "1ne apple" "tw2 pears" "thr3e bananas"
str_replace(fruits, c("a", "e", "i"), "-")
## [1] "one -pple" "two p-ars" "three bananas"
fruits <- c("one apple", "two pears", "three bananas")
str_replace(fruits, "[aeiou]", "-")
## [1] "-ne apple" "tw- pears" "thr-e bananas"
str_replace_all(fruits, "[aeiou]", "-")
## [1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
str_replace_all(fruits, "([aeiou])", "")
## [1] "n ppl" "tw prs" "thr bnns"
str_replace_all(fruits, "([aeiou])", "\\1\\1")
## [1] "oonee aapplee" "twoo peeaars" "threeee baanaanaas"
str_replace_all(fruits, "[aeiou]", c("1", "2", "3"))
## [1] "1n1 1ppl1" "tw2 p22rs" "thr33 b3n3n3s"
str_replace_all(fruits, c("a", "e", "i"), "-")
## [1] "one -pple" "two p-ars" "three bananas"
# If you want to apply multiple patterns and replacements to the same
# string, pass a named vector to pattern.
fruits %>%
str_c(collapse = "---") %>%
str_replace_all(c("one" = "1", "two" = "2", "three" = "3"))
## [1] "1 apple---2 pears---3 bananas"
str_replace_na(c("NA", "abc", "def"))
## [1] "NA" "abc" "def"
# Use a function for more sophisticated replacement. This example
# replaces colour names with their hex values.
colours <- str_c("\\b", colors(), "\\b", collapse="|")
# Convert colour names (or any colour specification accepted by col2rgb())
# into "#RRGGBB" hex strings.
#
# col: vector of colours to convert.
# Returns a character vector of hex colour codes, one per element of col.
col2hex <- function(col) {
  # col2rgb() returns a matrix with rows "red", "green" and "blue" and one
  # column per input colour; a separate local name avoids shadowing rgb().
  channels <- col2rgb(col)
  # Spell out maxColorValue rather than relying on partial matching of "max".
  rgb(
    channels["red", ], channels["green", ], channels["blue", ],
    maxColorValue = 255
  )
}
x <- c(
"Roses are red, violets are blue",
"My favourite colour is green"
)
str_replace_all(x, colours, col2hex)
## [1] "Roses are #FF0000, violets are #0000FF"
## [2] "My favourite colour is #00FF00"
Function
str_locate: Locate the position of patterns in a string
Usage
str_locate(string, pattern)
str_locate_all(string, pattern)
Argument
string
Input vector. Either a character vector, or something coercible to one.
pattern
Pattern to look for. The default interpretation is a regular expression, as described in stringi-search-regex. Control options with regex(). Match a fixed string (i.e. by comparing only bytes), using fixed(x). This is fast, but approximate. Generally, for matching human text, you’ll want coll(x) which respects character matching rules for the specified locale. Match character, word, line and sentence boundaries with boundary(). An empty pattern, “”, is equivalent to boundary(“character”).
Examples
fruit <- c("apple", "banana", "pear", "pineapple")
str_locate(fruit, "$")
## start end
## [1,] 6 5
## [2,] 7 6
## [3,] 5 4
## [4,] 10 9
str_locate(fruit, "a")
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 5 5
str_locate(fruit, "e")
## start end
## [1,] 5 5
## [2,] NA NA
## [3,] 2 2
## [4,] 4 4
str_locate(fruit, c("a", "b", "p", "p"))
## start end
## [1,] 1 1
## [2,] 1 1
## [3,] 1 1
## [4,] 1 1
str_locate_all(fruit, "a")
## [[1]]
## start end
## [1,] 1 1
##
## [[2]]
## start end
## [1,] 2 2
## [2,] 4 4
## [3,] 6 6
##
## [[3]]
## start end
## [1,] 3 3
##
## [[4]]
## start end
## [1,] 5 5
str_locate_all(fruit, "e")
## [[1]]
## start end
## [1,] 5 5
##
## [[2]]
## start end
##
## [[3]]
## start end
## [1,] 2 2
##
## [[4]]
## start end
## [1,] 4 4
## [2,] 9 9
str_locate_all(fruit, c("a", "b", "p", "p"))
## [[1]]
## start end
## [1,] 1 1
##
## [[2]]
## start end
## [1,] 1 1
##
## [[3]]
## start end
## [1,] 1 1
##
## [[4]]
## start end
## [1,] 1 1
## [2,] 6 6
## [3,] 7 7
# Find location of every character
str_locate_all(fruit, "")
## [[1]]
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
## [5,] 5 5
##
## [[2]]
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
## [5,] 5 5
## [6,] 6 6
##
## [[3]]
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
##
## [[4]]
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
## [5,] 5 5
## [6,] 6 6
## [7,] 7 7
## [8,] 8 8
## [9,] 9 9
Function
str_conv : Specify the encoding of a string
Usage
str_conv(string, encoding)
Argument
string
String to re-encode.
encoding
Name of encoding. See stri_enc_list for a complete list.
Examples
# Example from encoding?stringi::stringi
x <- rawToChar(as.raw(177))
x
## [1] "\xb1"
str_conv(x, "ISO-8859-2") # Polish "a with ogonek"
## [1] "ą"
str_conv(x, "ISO-8859-1") # Plus-minus
## [1] "±"
# 对中文进行转码处理,把中文字符字节化
x <- charToRaw('你好')
x
## [1] e4 bd a0 e5 a5 bd
# 注意:此处x的字节(e4 bd a0 e5 a5 bd)是UTF-8编码;GB2312是GBK的子集,按GBK或GB2312解码会得到乱码,按UTF-8解码才能还原中文
# (若在默认字符集为GBK的Windows系统上运行,charToRaw()得到的是GBK字节,结果则相反)
str_conv(x, "GBK")
## [1] "浣犲ソ"
str_conv(x, "GB2312")
## Warning in stri_conv(string, encoding, "UTF-8"): input data \xffffffa0 in
## current source encoding could not be converted to Unicode
## Warning in stri_conv(string, encoding, "UTF-8"): input data \xffffffbd in
## current source encoding could not be converted to Unicode
## [1] "浣\032濂\032"
str_conv(x, "UTF-8")
## [1] "你好"
# 把unicode转为UTF-8
x1 <- "\u5317\u4eac"
str_conv(x1, "UTF-8")
## [1] "北京"
Function
str_to_upper, str_to_lower, str_to_title: Convert case of a string
Usage
str_to_upper(string, locale = "")
str_to_lower(string, locale = "")
str_to_title(string, locale = "")
Argument
string
String to modify.
locale
Locale to use for translations.
Examples
dog <- "The quick brown dog"
str_to_upper(dog)
## [1] "THE QUICK BROWN DOG"
str_to_lower(dog)
## [1] "the quick brown dog"
str_to_title(dog)
## [1] "The Quick Brown Dog"
# Locale matters!
str_to_upper("i", "en") # English
## [1] "I"
str_to_upper("i", "tr") # Turkish
## [1] "İ"
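modifiers: Control matching behaviour with modifier functions.
Description
fixed() compares literal bytes, coll() compares strings using locale-aware collation rules, regex() (the default) uses ICU regular expressions, and boundary() matches boundaries between characters, lines, sentences or words.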
Usage
fixed(pattern, ignore_case = FALSE)
coll(pattern, ignore_case = FALSE, locale = NULL, ...)
regex(pattern, ignore_case = FALSE, multiline = FALSE, comments = FALSE, dotall = FALSE, ...)
boundary(type = c("character", "line_break", "sentence", "word"), skip_word_none = TRUE, ...)
Arguments
pattern
Pattern to modify behaviour.
ignore_case
Should case differences be ignored in the match?
locale
Locale to use for comparisons. See stri_locale_list() for all possible options.
...
Other less frequently used arguments passed on to stri_opts_collator, stri_opts_regex, or stri_opts_brkiter.
multiline
If TRUE, ^ and $ match the beginning and end of each line. If FALSE, the default, only match the start and end of the input.
comments
If TRUE, white space and comments beginning with # are ignored. Escape literal spaces with "\\ ".
dotall
If TRUE, . will also match line terminators.
type
Boundary type to detect.
skip_word_none
Ignore “words” that don’t contain any characters or numbers - i.e. punctuation.
Examples
pattern <- "a.b"
strings <- c("abb", "a.b")
str_detect(strings, pattern)
## [1] TRUE TRUE
str_detect(strings, fixed(pattern))
## [1] FALSE TRUE
str_detect(strings, coll(pattern))
## [1] FALSE TRUE
# coll() is useful for locale-aware case-insensitive matching
i <- c("I", "\u0130", "i")
i
## [1] "I" "İ" "i"
str_detect(i, fixed("i", TRUE))
## [1] TRUE FALSE TRUE
str_detect(i, coll("i", TRUE))
## [1] TRUE FALSE TRUE
str_detect(i, coll("i", TRUE, locale = "tr"))
## [1] FALSE TRUE TRUE
# Word boundaries
words <- c("These are some words.")
str_count(words, boundary("word"))
## [1] 4
str_split(words, " ")[[1]]
## [1] "These" "are" "" "" "some" "words."
str_split(words, boundary("word"))[[1]]
## [1] "These" "are" "some" "words"
# Regular expression variations
str_extract_all("The Cat in the Hat", "[a-z]+")
## [[1]]
## [1] "he" "at" "in" "the" "at"
str_extract_all("The Cat in the Hat", regex("[a-z]+", TRUE))
## [[1]]
## [1] "The" "Cat" "in" "the" "Hat"
str_extract_all("a\nb\nc", "^.")
## [[1]]
## [1] "a"
str_extract_all("a\nb\nc", regex("^.", multiline = TRUE))
## [[1]]
## [1] "a" "b" "c"
str_extract_all("a\nb\nc", "a.")
## [[1]]
## character(0)
str_extract_all("a\nb\nc", regex("a.", dotall = TRUE))
## [[1]]
## [1] "a\n"