library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.2.1 √ purrr 0.3.3
## √ tibble 2.1.3 √ dplyr 0.8.3
## √ tidyr 1.0.0 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.4.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
library(htmltools)
library(htmlwidgets)
string1 <- "this is a string"
string2 <- "this is a \'apple\'"
string1
## [1] "this is a string"
string2
## [1] "this is a 'apple'"
转义字符
"'"
## [1] "'"
'"'
## [1] "\""
"\\" # 打印形式会显示转义字符
## [1] "\\"
writeLines("\\")
## \
字符向量
c("one","two","three")
## [1] "one" "two" "three"
str_length(c("R for data science","ljj",NA))
## [1] 18 3 NA
str_c("x","y")
## [1] "xy"
str_c("x","y","z")
## [1] "xyz"
使用 sep 参数来控制字符串间的分隔方式
str_c("xy","zw",sep = "-")
## [1] "xy-zw"
str_c("as","df",sep = "+")
## [1] "as+df"
str_c("X",1:100,sep = "",collapse = "+")
## [1] "X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18+X19+X20+X21+X22+X23+X24+X25+X26+X27+X28+X29+X30+X31+X32+X33+X34+X35+X36+X37+X38+X39+X40+X41+X42+X43+X44+X45+X46+X47+X48+X49+X50+X51+X52+X53+X54+X55+X56+X57+X58+X59+X60+X61+X62+X63+X64+X65+X66+X67+X68+X69+X70+X71+X72+X73+X74+X75+X76+X77+X78+X79+X80+X81+X82+X83+X84+X85+X86+X87+X88+X89+X90+X91+X92+X93+X94+X95+X96+X97+X98+X99+X100"
str_c("Y~",str_c("X",1:100,sep = "",collapse = "+"),sep = "")
## [1] "Y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18+X19+X20+X21+X22+X23+X24+X25+X26+X27+X28+X29+X30+X31+X32+X33+X34+X35+X36+X37+X38+X39+X40+X41+X42+X43+X44+X45+X46+X47+X48+X49+X50+X51+X52+X53+X54+X55+X56+X57+X58+X59+X60+X61+X62+X63+X64+X65+X66+X67+X68+X69+X70+X71+X72+X73+X74+X75+X76+X77+X78+X79+X80+X81+X82+X83+X84+X85+X86+X87+X88+X89+X90+X91+X92+X93+X94+X95+X96+X97+X98+X99+X100"
和 R 中的多数其他函数一样,缺失值(NA)是可传染的。如果想要将缺失值输出为 “NA”,可以使用 str_replace_na():
x <- c("abc", NA)
str_c("|-", x, "-|")
## [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")
## [1] "|-abc-|" "|-NA-|"
str_c("abc",c("a","b","c"),"xyz",sep = "-")
## [1] "abc-a-xyz" "abc-b-xyz" "abc-c-xyz"
要想将字符向量合并为字符串,可以使用 str_c() 的 collapse 参数:
str_c("x","y","z",sep = "*") # 将字符串连接
## [1] "x*y*z"
str_c(c("a","b"),c("V","n"),1:10,sep = "+",collapse = "*") # collapse的作用就是将字符向量转化为字符串
## [1] "a+V+1*b+n+2*a+V+3*b+n+4*a+V+5*b+n+6*a+V+7*b+n+8*a+V+9*b+n+10"
str_c("X",1:100,sep = "",collapse = "+")
## [1] "X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18+X19+X20+X21+X22+X23+X24+X25+X26+X27+X28+X29+X30+X31+X32+X33+X34+X35+X36+X37+X38+X39+X40+X41+X42+X43+X44+X45+X46+X47+X48+X49+X50+X51+X52+X53+X54+X55+X56+X57+X58+X59+X60+X61+X62+X63+X64+X65+X66+X67+X68+X69+X70+X71+X72+X73+X74+X75+X76+X77+X78+X79+X80+X81+X82+X83+X84+X85+X86+X87+X88+X89+X90+X91+X92+X93+X94+X95+X96+X97+X98+X99+X100"
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"
# 注意,即使字符串过短, str_sub() 函数也不会出错,它将返回尽可能多的字符:
str_sub("a", 1, 5)
## [1] "a"
str_to_lower(str_sub(x,1,1))
## [1] "a" "b" "p"
str_to_upper(c("i", "ı"))
## [1] "I" "<U+0131>"
str_to_lower("I")
## [1] "i"
字符串排序
x <- c("apple", "eggplant", "banana")
str_sort(x)
## [1] "apple" "banana" "eggplant"
2 用自己的语言描述一下 str_c() 函数的 sep 和 collapse 参数有什么区别?
3 使用 str_length() 和 str_sub() 函数提取出一个字符串最中间的字符。如果字符串中的字符数是偶数,你应该怎么做?
4 str_wrap() 函数的功能是什么?应该在何时使用这个函数?
5 str_trim() 函数的功能是什么?其逆操作是哪个函数?
6 编写一个函数将字符向量转换为字符串,例如,将字符向量 c(“a”, “b”, “c”) 转换为字符串 a、 b 和 c。仔细思考一下,如果给定一个长度为0、1 或 2的向量,那么这个函数应该怎么做?
# Exercise 3: extract the middle character of a string using str_length()
# and str_sub(). The position is computed rather than hard-coded; for an
# even-length string the two central characters are returned.
s <- str_c(c("a", "b", "c"), collapse = "")
n <- str_length(s)
str_sub(s, ceiling(n / 2), floor(n / 2) + 1)
## [1] "b"
x <- c("apple", "banana", "pear")
str_view(x, "an")
# 另一个更复杂一些的模式是使用 .,它可以匹配任意字符(除了换行符):
str_view(x,".a.")
# 如果 . 可以匹配任意字符,那么如何匹配字符 . 呢?
# 要想建立正则表示式,我们需要使用\\
dot <- "\\."
dot %>% writeLines()
## \.
str_view(c("abc", "a.c", "bef"), "a\\.c")
如果 \ 在正则表达式中用作转义字符,那么如何匹配 \ 这个字符呢?我们还是需要去除其特殊意义,建立形式为 \\ 的正则表达式。要想建立这样的正则表达式,我们需要使用一个字符串,其中还需要对 \ 进行转义。这意味着要想匹配字符 \,我们需要输入 "\\\\"——你需要 4 个反斜杠来匹配 1 个反斜杠!
解释一下为什么这些字符串不能匹配一个反斜杠 \: "\"、"\\"、"\\\"。
如何匹配字符序列 "'\ ?
# Exercise answer: regular expression for the literal sequence "'\ .
# The double quote is escaped for the R string, the single quote needs no
# escape, and the backslash needs four backslashes in the string (two in the
# regex). writeLines() shows the regex as the engine will see it.
writeLines("\"'\\\\")
## "'\\
有时我们需要在正则表达式中设置锚点,以便 R 从字符串的开头或末尾进行匹配。我们可以设置两种锚点。
^ 从字符串开头进行匹配。
$ 从字符串末尾进行匹配。
x <- c("apple", "banana", "pear")
str_view(x, "^a")
str_view(x, "a$")
始于权力(^),终于金钱($)
如果想要强制正则表达式匹配一个完整字符串,那么可以同时设置 ^ 和 $ 这两个锚点:
x <- c("apple pie", "apple", "apple cake")
str_view(x,"apple")
str_view(x,"^apple$")
你还可以使用 \b 来匹配单词间的边界。例如,在搜索 sum 时,为了避免匹配到 summarize、summary、rowsum 等,我们会使用 \bsum\b 进行搜索。
x <- c("xyz","$^$fghd","qwe",1:10,"a") # a character vector (c() coerces 1:10 to strings); note how sep and collapse differ below
str_c(x,collapse = "*")
## [1] "xyz*$^$fghd*qwe*1*2*3*4*5*6*7*8*9*10*a"
str_c(x,sep = "-")
## [1] "xyz" "$^$fghd" "qwe" "1" "2" "3" "4"
## [8] "5" "6" "7" "8" "9" "10" "a"
str_c(x,sep = "+",collapse = "")
## [1] "xyz$^$fghdqwe12345678910a"
str_view(x,"\\$\\^\\$",match = TRUE)
str_view(words,"^y",match = TRUE)
str_view(words,"x$",match = TRUE)
words[str_length(words) == 3]
## [1] "act" "add" "age" "ago" "air" "all" "and" "any" "arm" "art" "ask"
## [12] "bad" "bag" "bar" "bed" "bet" "big" "bit" "box" "boy" "bus" "but"
## [23] "buy" "can" "car" "cat" "cup" "cut" "dad" "day" "die" "dog" "dry"
## [34] "due" "eat" "egg" "end" "eye" "far" "few" "fit" "fly" "for" "fun"
## [45] "gas" "get" "god" "guy" "hit" "hot" "how" "job" "key" "kid" "lad"
## [56] "law" "lay" "leg" "let" "lie" "lot" "low" "man" "may" "mrs" "new"
## [67] "non" "not" "now" "odd" "off" "old" "one" "out" "own" "pay" "per"
## [78] "put" "red" "rid" "run" "say" "see" "set" "sex" "she" "sir" "sit"
## [89] "six" "son" "sun" "tax" "tea" "ten" "the" "tie" "too" "top" "try"
## [100] "two" "use" "war" "way" "wee" "who" "why" "win" "yes" "yet" "you"
str_view(words,"^...$",match = TRUE)
因为这个列表非常长,所以你可以设置 str_view() 函数的 match 参数,只显示匹配的单词(match = TRUE)或未匹配的单词(match = FALSE)。
\d 可以匹配任意数字。
\s 可以匹配任意空白字符(如空格、制表符和换行符)。
[abc] 可以匹配 a、 b 或 c。
[^abc] 可以匹配除 a、 b、 c 外的任意字符
请牢记,要想创建包含 \d 或 \s 的正则表达式,你需要在字符串中对 \ 进行转义,因此需要输入 "\\d" 或 "\\s"。
因为 | 的优先级很低,所以 abc|xyz 匹配的是 abc 或 xyz,而不是 abcyz 或 abxyz。
str_view(c("grey", "gray"), "gr(e|a)y")
str_view(words,"^[aeiou]",match = TRUE)
str_view(words,pattern = "^[^aeiou]",match = TRUE) # 始于^,终于$
str_view(words,pattern = "^[^aeiou]*$",match = TRUE)
str_view(words,"[^e]ed$",match = TRUE)
str_view(words,"(ing$)|(ize$)",match = TRUE)
qu <- str_detect(words,"qu")
q <- str_detect(words,"q")
identical(qu,q) # 是
## [1] TRUE
x <- c("123-0375-5698","123-456-789")
str_view(x,pattern = "\\d\\d\\d-\\d\\d\\d\\d-\\d\\d\\d\\d",match = TRUE)
?: 0 次或 1 次。
+: 1 次或多次。
*: 0 次或多次。
x <- "1888 is the longest year in Roman numerals: MDCCCCCLXXXVIII"
str_view(x, "CC?")
str_view(x, "CC+")
str_view(x, 'C[LX]+')
{n}:匹配 n 次。
{n,}:匹配 n 次或更多次。
{,m}:最多匹配 m 次。
{n, m}:匹配 n 到 m 次。
# Bounded quantifiers in their default (greedy) form: exactly 2, 2 or more,
# and between 2 and 3 repetitions of "C". The lazy variant "C{2,3}?" is
# shown separately further down, so it is not repeated here.
str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
默认的匹配方式是“贪婪的”:正则表达式会匹配尽量长的字符串。通过在正则表达式后面添加一个?,你可以将匹配方式更改为“懒惰的”,即匹配尽量短的字符串。虽然这是正则表达式的高级特性,但知道这一点是非常有用的。
str_view(x, 'C{2,3}?')
x <- c("CLXXXXXXX","CXLLLLL","CXXXXXX")
str_view(x, 'C[LX]+?')
str_view(x, 'C[LX]+')
# Quantifier demo on repeated letters. Note: a single "?", "+" or "*" after
# an atom is a greedy quantifier; a quantifier only becomes lazy when an
# extra "?" is appended to it (e.g. "k??", "k+?", ".*?").
x <- c("asdfhkkkddf","asdfgkkkkkklll","xc")
y <- "1888 is the longest year in Roman numerals: MDCCCCCCCLXXXVIII"
str_view(x,"kk?",match = TRUE) # "k" followed by 0 or 1 more "k" (greedy, not lazy)
str_view(x,"kk+")
str_view(x,"kk*") # "k" followed by 0 or more "k" (greedy)
str_view(y,"CC?") # 0 or 1 extra "C" (greedy, not lazy)
str_view(y,"CC+") # 1 or more extra "C" (greedy)
str_view(y,"CC*") # 0 or more extra "C" (greedy)
str_view(x,"^.*$") # "*" repeats the preceding sub-expression any number of times, e.g. zo* matches "z", "zo" and "zoo"; "*" is equivalent to {0,}
str_view(x,"^.*?$")
writeLines("\\{.+\\}")
## \{.+\}
str_view("1111-11-11","\\d{4}-\\d{2}-\\d{2}")
str_view(words,"^[^aeiou][^aeiou][^aeiou]",match = TRUE)
str_view(words,"[aeiou][aeiou][aeiou]+",match = TRUE)
括号可以用于消除复杂表达式中的歧义。括号还可以定义“分组”,你可以通过回溯引用(如 \1、\2 等)来引用这些分组。例如,以下的正则表达式可以找出名称中有重复的一对字母的所有水果:
str_view(fruit, "(..)\\1", match = TRUE)
str_view_all("sssddfff", "(.)\\1\\1", match = TRUE)
str_view(fruit, "(.)(.)\\2\\1", match = TRUE)
str_view(fruit, "(..)\\1", match = TRUE)
str_view(fruit, "(.).\\1.\\1", match = TRUE)
str_view(fruit, "(.)(.)(.).*\\3\\2\\1", match = TRUE)
str_view(words,"^(.)[a-zA-Z0-9]*\\1$",match = TRUE)
包含一对重复字母的单词(例如, church 中包含了重复的 ch)。
包含一个至少重复 3 次的字母的单词(例如, eleven 中的 e 重复了 3 次)。
既然我们已经掌握了正则表达式的基础知识,现在是时候学习如何应用它们来解决实际问题了。我们将在本节中学习多种 stringr 函数,它们可以:
确定与某种模式相匹配的字符串;
找出匹配的位置;
提取出匹配的内容;
使用新值替换匹配内容;
基于匹配拆分字符串。
当遇到一个问题时,有些人会这样想:“我可以用正则表达式来搞定它。”于是,原来的一个问题就变成了两个问题。
别忘了你使用的是一门编程语言,还有很多其他工具随时待命。相对于创建一个复杂的正则表达式,更简单的方式是创建多个简单的正则表达式。如果很难使用一个正则表达式来解决问题,那么你就应该回过头仔细思考一下,是否可以将一个问题分解为多个小问题,然后一个个地依次解决。
x <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1] TRUE FALSE TRUE
# 有多少个以t开头的常用单词?
sum(str_detect(words, "^t"))
## [1] 65
mean(str_detect(words,"^t"))
## [1] 0.06632653
# 以元音字母结尾的常用单词的比例是多少?
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306
# 以下两种方法均可找出不包含元音字母的所有单词:
no_vowels_1 <- !str_detect(words, "[aeiou]")
# 找出仅包含辅音字母(非元音字母)的所有单词
no_vowels_2 <- str_detect(words, "^[^aeiou]*$")
str_view(words,"^[^aeiou]*$",match = TRUE)
identical(no_vowels_1, no_vowels_2)
## [1] TRUE
str_detect() 函数的一种常见用法是选取出匹配某种模式的元素。你可以通过逻辑取子集方式来完成这种操作,也可以使用便捷的 str_subset() 包装器函数:
words[str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
## [1] "box" "sex" "six" "tax"
# Tibble of the common words with their position in the vector.
df <- tibble(
  word = words,
  i = seq_along(word)
)
# Keep rows whose word ends in "x". The pattern is applied to the `word`
# column rather than the global `words` vector, so the filter stays correct
# if `df` is reordered or subset beforehand.
df %>%
  filter(str_detect(word, "x$"))
## # A tibble: 4 x 2
## word i
## <chr> <int>
## 1 box 108
## 2 sex 747
## 3 six 772
## 4 tax 841
str_detect() 函数的一种变体是 str_count(),后者不是简单地返回是或否,而是返回字符串中匹配的数量:
str_count(fruit,"a")
## [1] 1 1 2 3 0 0 1 2 1 0 0 1 2 2 1 0 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 0 1 1
## [36] 2 0 0 1 1 0 0 1 0 0 1 0 2 1 0 1 0 0 1 1 3 1 1 1 0 1 1 0 2 0 1 0 1 2 1
## [71] 1 0 2 2 1 1 2 1 0 1
# 平均来看,每个单词中有多少个元音字母?
mean(str_count(words, "[aeiou]"))
## [1] 1.991837
str_count() 也完全可以同 mutate() 函数一同使用:
df %>%
mutate(vowels = str_count(word,"[aeiou]"),
consonants = str_count(word,"[^aeiou]")) %>% head()
## # A tibble: 6 x 4
## word i vowels consonants
## <chr> <int> <int> <int>
## 1 a 1 1 0
## 2 able 2 2 2
## 3 about 3 3 2
## 4 absolute 4 4 4
## 5 accept 5 2 4
## 6 account 6 3 4
注意,匹配从来不会重叠。
str_count("abababa", "aba")
## [1] 2
str_view_all("abababa", "aba")
试着使用两种方法来解决以下每个问题,一种方法是使用单个正则表达式,另一种方法是使用多个 str_detect() 函数的组合。
test <- c("xdfshd","dsjhjsx")
str_view(c("xdfshd","dsjhjsx"),"(^x)|(x$)",match = TRUE)
test[str_detect(c("xdfshd","dsjhjsx"),"(^x)|(x$)")]
## [1] "xdfshd" "dsjhjsx"
str_view("as","^[aeiou].*[^aeiou]$",match = TRUE) # 对 全
str_view(words,"^[aeiou].+[^aeiou]$",match = TRUE) # 观察区别
str_view(words,"^[aeiou].?[^aeiou]$",match = TRUE)
str_view(words,"^[aeiou][aeiou]*[aeiou]$",match = TRUE)
str_view(words,"^[aeiou][aeiou]*[^aeiou]$",match = TRUE)
words[str_detect(words,"^[aeiou][aeiou]*[^aeiou]$")]
## [1] "air" "as" "at" "eat" "if" "in" "it" "of" "on" "or" "out"
## [12] "up"
# str_view_all(words,"[aeiou]")
# str_count(words,"[aeiou]")
# Rank words by number of vowels. Counts are taken from the `word` column
# (not the global `words` vector) so each count stays aligned with its row.
df %>%
  mutate(yuanyin_count = str_count(word, "[aeiou]")) %>%
  arrange(desc(yuanyin_count)) %>%
  head()
## # A tibble: 6 x 3
## word i yuanyin_count
## <chr> <int> <int>
## 1 appropriate 48 5
## 2 associate 57 5
## 3 available 62 5
## 4 colleague 166 5
## 5 encourage 268 5
## 6 experience 292 5
# Rank words by the proportion of vowels among their letters. All values are
# computed from the `word` column (not the global `words` vector);
# str_length() gives the character count directly.
df %>%
  mutate(
    yuanyin_count = str_count(word, "[aeiou]"),
    zimu_count = str_length(word),
    bili_yuanyin = yuanyin_count / zimu_count
  ) %>%
  arrange(desc(bili_yuanyin))
## # A tibble: 980 x 5
## word i yuanyin_count zimu_count bili_yuanyin
## <chr> <int> <int> <int> <dbl>
## 1 a 1 1 1 1
## 2 area 49 3 4 0.75
## 3 idea 412 3 4 0.75
## 4 age 22 2 3 0.667
## 5 ago 24 2 3 0.667
## 6 air 26 2 3 0.667
## 7 die 228 2 3 0.667
## 8 due 250 2 3 0.667
## 9 eat 256 2 3 0.667
## 10 europe 278 4 6 0.667
## # ... with 970 more rows
要想提取匹配的实际文本,我们可以使用 str_extract() 函数。
length(sentences)
## [1] 720
head(sentences)
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
## [6] "The juice of lemons makes fine punch."
sentences %>% class()
## [1] "character"
假设我们想要找出包含一种颜色的所有句子。
# Colour names padded with a space on each side, so a match requires a space
# both before and after the word. This avoids hits inside longer words, but
# a colour at the very start of a sentence or directly followed by
# punctuation (e.g. "red.") is not matched by these patterns.
colors <- c( " red ", " orange ", " yellow ", " green ", " blue ", " purple " )
# Join the alternatives into a single regular expression separated by "|".
color_match <- str_c(colors, collapse = "|")
color_match
## [1] " red | orange | yellow | green | blue | purple "
str_view_all(sentences, color_match,match = TRUE)
has_color <- str_subset(sentences, color_match)
matches <- str_extract(has_color, color_match)
head(matches)
## [1] " blue " " blue " " blue " " yellow " " green " " red "
注意,str_extract() 只提取第一个匹配。我们可以先选取出具有多于一种匹配的所有句子,然后就可以很容易地看到更多匹配:
more <- sentences[str_count(sentences, color_match) > 1]
more # 目标字符段
## [1] "It is hard to erase blue or red ink."
str_view_all(more, color_match)
str_extract(more, color_match)
## [1] " blue "
str_extract_all(more, color_match)
## [[1]]
## [1] " blue " " red "
如果设置了 simplify = TRUE,那么 str_extract_all() 会返回一个矩阵,其中较短的匹配会扩展到与最长的匹配具有同样的长度:
str_extract_all(more, color_match, simplify = TRUE)
## [,1] [,2]
## [1,] " blue " " red "
x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "a" "" ""
## [2,] "a" "b" ""
## [3,] "a" "b" "c"
# NOTE(review): "." is an unescaped wildcard here, so "ing.$" matches "ing"
# followed by any single final character, not only a literal full stop;
# "ing\\.$" would require a period — confirm which was intended.
str_view(sentences,"ing.$",match = TRUE)
sentences_ing_end <- str_subset(sentences,"ing.$")
# One row per retained sentence; the cell is "" where "spring" does not occur.
str_extract_all(sentences_ing_end,"spring",simplify = TRUE)
## [,1]
## [1,] "spring"
## [2,] ""
## [3,] ""
## [4,] ""
## [5,] ""
## [6,] ""
## [7,] ""
## [8,] ""
## [9,] ""
## [10,] ""
## [11,] ""
## [12,] ""
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
has_noun %>% str_extract(noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "a helps"
str_extract() 函数可以给出完整匹配;str_match() 函数则可以给出每个独立分组。str_match() 返回的不是字符向量,而是一个矩阵,其中一列是完整匹配,后面的列是每个分组的匹配:
has_noun %>% str_match(noun)
## [,1] [,2] [,3]
## [1,] "the smooth" "the" "smooth"
## [2,] "the sheet" "the" "sheet"
## [3,] "the depth" "the" "depth"
## [4,] "a chicken" "a" "chicken"
## [5,] "the parked" "the" "parked"
## [6,] "the sun" "the" "sun"
## [7,] "the huge" "the" "huge"
## [8,] "the ball" "the" "ball"
## [9,] "the woman" "the" "woman"
## [10,] "a helps" "a" "helps"
10.4.6 练习
str_replace() 和 str_replace_all() 函数可以使用新字符串替换匹配内容。最简单的应用 是使用固定字符串替换匹配内容:
x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
## [1] "-pple" "p-ar" "b-nana"
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-" "p--r" "b-n-n-"
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three")) # 多了怎么办?
## [1] "one house" "two cars" "three people"
除了使用固定字符串替换匹配内容,你还可以使用回溯引用来插入匹配中的分组。在下面 的代码中,我们交换了第二个单词和第三个单词的顺序:
str_replace_all(sentences,"^([^ ]+) ([^ ]+) ([^ ]+)","\\1 \\3 \\2") %>% head()
## [1] "The canoe birch slid on the smooth planks."
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."
## [4] "These a days chicken leg is a rare dish."
## [5] "Rice often is served in round bowls."
## [6] "The of juice lemons makes fine punch."
使用反斜杠替换字符串中的所有斜杠。
使用 str_replace_all() 函数实现 str_to_lower() 函数的一个简单版。
str_to_lower("assWWW")
## [1] "asswww"
# str_view_all(sentences,"[A-Z]",match = TRUE)
str_split() 函数可以将字符串拆分为多个片段。例如,我们可以将句子拆分成单词:
sentences %>%
head(5) %>%
str_split(" ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
##
## [[2]]
## [1] "Glue" "the" "sheet" "to" "the"
## [6] "dark" "blue" "background."
##
## [[3]]
## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
##
## [[4]]
## [1] "These" "days" "a" "chicken" "leg" "is" "a"
## [8] "rare" "dish."
##
## [[5]]
## [1] "Rice" "is" "often" "served" "in" "round" "bowls."
"a|b|c|d" %>% str_split("\\|") %>% .[[1]]
## [1] "a" "b" "c" "d"
否则,和返回列表的其他 stringr 函数一样,你可以通过设置 simplify = TRUE 返回一个矩阵:
sentences %>% head(5) %>% str_split(" ", simplify = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue"
## [3,] "It's" "easy" "to" "tell" "the" "depth" "of"
## [4,] "These" "days" "a" "chicken" "leg" "is" "a"
## [5,] "Rice" "is" "often" "served" "in" "round" "bowls."
## [,8] [,9]
## [1,] "planks." ""
## [2,] "background." ""
## [3,] "a" "well."
## [4,] "rare" "dish."
## [5,] "" ""
你还可以设定拆分片段的最大数量:
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>% str_split(": ", n = 2, simplify = TRUE)
## [,1] [,2]
## [1,] "Name" "Hadley"
## [2,] "Country" "NZ"
## [3,] "Age" "35"
除了模式,你还可以通过字母、行、句子和单词边界(boundary() 函数)来拆分字符串:
x <- "This is a sentence. This is another sentence."
str_view_all(x, boundary("word"))
str_split(x, " ")[[1]]
## [1] "This" "is" "a" "sentence." "This" "is"
## [7] "another" "sentence."
str_split(x, boundary("word"))[[1]]
## [1] "This" "is" "a" "sentence" "This" "is"
## [7] "another" "sentence"
x <- c("apples, pears, and bananas")
str_split(x," ",simplify = TRUE)
## [,1] [,2] [,3] [,4]
## [1,] "apples," "pears," "and" "bananas"
str_split(x,boundary("word"),simplify = TRUE)
## [,1] [,2] [,3] [,4]
## [1,] "apples" "pears" "and" "bananas"
str_locate() 和 str_locate_all() 函数可以给出每个匹配的开始位置和结束位置。
当没有其他函数能够精确地满足需求时,这两个函数特别有用。你可以使用 str_locate() 函数找出匹配的模式,然后使用str_sub() 函数来提取或修改匹配的内容
# 正常调用:
str_view(fruit, "nana",match = TRUE)
# 上面形式是以下形式的简写
str_view(fruit, regex("nana"),match = TRUE)
你可以使用 regex() 函数的其他参数来控制具体的匹配方式。
ignore_case = TRUE 既可以匹配大写字母,也可以匹配小写字母,它总是使用当前的区域设置:
bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")
str_view(bananas, regex("banana", ignore_case = TRUE))
multiline = TRUE可以使得 ^ 和 $ 从每行的开头和末尾开始匹配,而不是从完整字符串 的开头和末尾开始匹配:
x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, "^Line")
## [[1]]
## [1] "Line"
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]
## [1] "Line" "Line" "Line"
comments = TRUE 可以让你在复杂的正则表达式中加入注释和空白字符,以便更易理解。匹配时会忽略空格和 # 后面的内容。如果想要匹配一个空格,你需要对其进行转义: "\\ ":
# Verbose (comments = TRUE) pattern for a phone number written as three
# digit groups of 3, 3 and 4 digits, captured as groups 1-3.
# Fixes relative to the earlier version:
#   * the last group captures 4 digits, so the final digit is no longer
#     dropped from the match;
#   * literal spaces are escaped ("\\ "), because unescaped whitespace is
#     ignored when comments = TRUE;
#   * the hyphen inside the first character class is escaped so it cannot
#     be read as a range operator.
phone <- regex("
  \\(?      # 可选的开括号
  (\\d{3})  # 地区编码
  [)\\-]?   # 可选的闭括号或短划线
  \\ ?      # 可选的空格
  (\\d{3})  # 另外3个数字
  [\\ -]?   # 可选的空格或短划线
  (\\d{4})  # 最后4个数字
  ",
  comments = TRUE)
str_match("514-791-8141", phone)
## [,1] [,2] [,3] [,4]
## [1,] "514-791-8141" "514" "791" "8141"
dotall = TRUE 可以使得 . 匹配包括 \n 在内的所有字符。
在介绍 str_split() 函数时,你已经知道可以使用 boundary() 函数来匹配边界。你还可 以在其他函数中使用这个函数:
x <- "This is a sentence."
str_view_all(x, boundary("word"))
str_extract_all(x, boundary("word"))
## [[1]]
## [1] "This" "is" "a" "sentence"
apropos() 函数可以在全局环境空间中搜索所有可用对象。当不能确切想起函数名称时, 这个函数特别有用:
library(forecastHybrid)
## Loading required package: forecast
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.fracdiff fracdiff
## residuals.fracdiff fracdiff
## Loading required package: thief
apropos("replace")
## [1] "%+replace%" "replace" "replace_na"
## [4] "setReplaceMethod" "str_replace" "str_replace_all"
## [7] "str_replace_na" "theme_replace"
apropos("cast")
## [1] "extractForecasts" "forecast" "forecast.ets"
## [4] "geom_forecast" "GeomForecast" "is.forecast"
## [7] "is.mforecast" "is.splineforecast" "KalmanForecast"
## [10] "StatForecast"
你可以使用以下代码返回当前目录中的所有 R Markdown 文件:
dir()
## [1] "2019-11-2-hdm-learning.html"
## [2] "2019-11-2-hdm learning.Rmd"
## [3] "4221-Article Text-7275-1-10-20190705.pdf"
## [4] "Apply-Series-Function-of-R.html"
## [5] "Apply Series Function of R.Rmd"
## [6] "apply.png"
## [7] "bookdown.pdf"
## [8] "data-import.pdf"
## [9] "data-transformation.pdf"
## [10] "data-visualization-2.1.pdf"
## [11] "dplyr.html"
## [12] "dplyr.Rmd"
## [13] "forecast-时间序列分析.pdf"
## [14] "hdm.png"
## [15] "hdm2.png"
## [16] "iris.csv"
## [17] "iris.txt"
## [18] "Learn-Base-R-effectively.html"
## [19] "Learn Base R effectively.Rmd"
## [20] "library_cheets.Rproj"
## [21] "portesFunctions-统计推断.pdf"
## [22] "postdouble.Rmd"
## [23] "purrr.html"
## [24] "purrr.Rmd"
## [25] "rmarkdown.pdf"
## [26] "R数据科学.pdf"
## [27] "R数据科学笔记.Rmd"
## [28] "str.html"
## [29] "str.Rmd"
## [30] "strings.pdf"
## [31] "tvReg.pdf"
## [32] "使用tsibbles整理时间序列数据.html"
## [33] "使用tsibbles整理时间序列数据.nb.html"
## [34] "使用tsibbles整理时间序列数据.Rmd"
dir(pattern = "\\.Rmd$")
## [1] "2019-11-2-hdm learning.Rmd" "Apply Series Function of R.Rmd"
## [3] "dplyr.Rmd" "Learn Base R effectively.Rmd"
## [5] "postdouble.Rmd" "purrr.Rmd"
## [7] "R数据科学笔记.Rmd" "str.Rmd"
## [9] "使用tsibbles整理时间序列数据.Rmd"
# List HTML files in the current directory. The "$" anchor restricts matches
# to names that end in ".html", consistent with the ".Rmd" listing.
dir(pattern = "\\.html$")
## [1] "2019-11-2-hdm-learning.html"
## [2] "Apply-Series-Function-of-R.html"
## [3] "dplyr.html"
## [4] "Learn-Base-R-effectively.html"
## [5] "purrr.html"
## [6] "str.html"
## [7] "使用tsibbles整理时间序列数据.html"
## [8] "使用tsibbles整理时间序列数据.nb.html"
(如果更喜欢使用 *.Rmd 这样的“通配符”,你可以通过 glob2rx() 函数将其转换为正则表达式.)
与 stringr 不同, stringi 的 设计思想是尽量全面,几乎包含了我们可以用到的所有函数: stringi 中有 234 个函数,而 stringr 中只有 42 个。