stringr learning

10.1 简介

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.2.1 --

## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(stringr)
library(htmltools)
library(htmlwidgets)

10.2 字符串基础

string1 <- "this is a string"

string2 <- "this is a \'apple\'"

string1

## [1] "this is a string"

string2

## [1] "this is a 'apple'"

转义字符

"'"

## [1] "'"

'"'

## [1] "\""

"\\"  # 打印形式会显示转义字符

## [1] "\\"

writeLines("\\")

## \

字符向量

c("one","two","three")

## [1] "one"   "two"   "three"

10.2.1 字符串长度

str_length(c("R for data science","ljj",NA))

## [1] 18  3 NA

10.2.2 字符串组合

str_c("x","y")

## [1] "xy"

str_c("x","y","z")

## [1] "xyz"

使用 sep 参数来控制字符串间的分隔方式

str_c("xy","zw",sep = "-")

## [1] "xy-zw"

str_c("as","df",sep = "+")

## [1] "as+df"

str_c("X",1:100,sep = "",collapse = "+")

## [1] "X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18+X19+X20+X21+X22+X23+X24+X25+X26+X27+X28+X29+X30+X31+X32+X33+X34+X35+X36+X37+X38+X39+X40+X41+X42+X43+X44+X45+X46+X47+X48+X49+X50+X51+X52+X53+X54+X55+X56+X57+X58+X59+X60+X61+X62+X63+X64+X65+X66+X67+X68+X69+X70+X71+X72+X73+X74+X75+X76+X77+X78+X79+X80+X81+X82+X83+X84+X85+X86+X87+X88+X89+X90+X91+X92+X93+X94+X95+X96+X97+X98+X99+X100"

str_c("Y~",str_c("X",1:100,sep = "",collapse = "+"),sep = "")

## [1] "Y~X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18+X19+X20+X21+X22+X23+X24+X25+X26+X27+X28+X29+X30+X31+X32+X33+X34+X35+X36+X37+X38+X39+X40+X41+X42+X43+X44+X45+X46+X47+X48+X49+X50+X51+X52+X53+X54+X55+X56+X57+X58+X59+X60+X61+X62+X63+X64+X65+X66+X67+X68+X69+X70+X71+X72+X73+X74+X75+X76+X77+X78+X79+X80+X81+X82+X83+X84+X85+X86+X87+X88+X89+X90+X91+X92+X93+X94+X95+X96+X97+X98+X99+X100"

和多数 R 函数一样，缺失值（NA）是可传染的：与 NA 组合的结果仍然是 NA。如果想要将缺失值输出为字符串 "NA"，可以使用 str_replace_na()：

x <- c("abc", NA)

str_c("|-", x, "-|")

## [1] "|-abc-|" NA

str_c("|-", str_replace_na(x), "-|")

## [1] "|-abc-|" "|-NA-|"

str_c("abc",c("a","b","c"),"xyz",sep = "-")

## [1] "abc-a-xyz" "abc-b-xyz" "abc-c-xyz"

要想将字符向量合并为字符串，可以使用 collapse 参数

str_c("x","y","z",sep = "*")  # 将字符串连接

## [1] "x*y*z"

str_c(c("a","b"),c("V","n"),1:10,sep = "+",collapse = "*")  # collapse的作用就是将字符向量转化为字符串

## [1] "a+V+1*b+n+2*a+V+3*b+n+4*a+V+5*b+n+6*a+V+7*b+n+8*a+V+9*b+n+10"

str_c("X",1:100,sep = "",collapse = "+")

## [1] "X1+X2+X3+X4+X5+X6+X7+X8+X9+X10+X11+X12+X13+X14+X15+X16+X17+X18+X19+X20+X21+X22+X23+X24+X25+X26+X27+X28+X29+X30+X31+X32+X33+X34+X35+X36+X37+X38+X39+X40+X41+X42+X43+X44+X45+X46+X47+X48+X49+X50+X51+X52+X53+X54+X55+X56+X57+X58+X59+X60+X61+X62+X63+X64+X65+X66+X67+X68+X69+X70+X71+X72+X73+X74+X75+X76+X77+X78+X79+X80+X81+X82+X83+X84+X85+X86+X87+X88+X89+X90+X91+X92+X93+X94+X95+X96+X97+X98+X99+X100"

10.2.3 字符串取子集

x <- c("Apple", "Banana", "Pear")

str_sub(x, 1, 3)

## [1] "App" "Ban" "Pea"

str_sub(x, -3, -1)

## [1] "ple" "ana" "ear"

# 注意，即使字符串过短， str_sub() 函数也不会出错，它将返回尽可能多的字符：

str_sub("a", 1, 5)

## [1] "a"

str_to_lower(str_sub(x,1,1))

## [1] "a" "b" "p"

10.2.4 区域设置

str_to_upper(c("i", "ı"))

## [1] "I"        "<U+0131>"

str_to_lower("I")

## [1] "i"

字符串排序

x <- c("apple", "eggplant", "banana")

str_sort(x)

## [1] "apple"    "banana"   "eggplant"

10.2.5 练习

2 用自己的语言描述一下 str_c() 函数的 sep 和 collapse 参数有什么区别？

3 使用 str_length() 和 str_sub() 函数提取出一个字符串最中间的字符。如果字符串中的字符数是偶数，你应该怎么做？

4 str_wrap() 函数的功能是什么？应该在何时使用这个函数？

5 str_trim() 函数的功能是什么？其逆操作是哪个函数？

6 编写一个函数将字符向量转换为字符串，例如，将字符向量 c(“a”, “b”, “c”) 转换为字符串 a、 b 和 c。仔细思考一下，如果给定一个长度为0、1 或 2的向量，那么这个函数应该怎么做？

str_c(c("a","b","c"),collapse = "") %>% str_sub(2,2)

## [1] "b"

10.3　使用正则表达式进行模式匹配

10.3.1 基础匹配

x <- c("apple", "banana", "pear")

str_view(x, "an")

# 另一个更复杂一些的模式是使用 .，它可以匹配任意字符（除了换行符）：
str_view(x,".a.")

# 如果 . 可以匹配任意字符，那么如何匹配字符 . 呢？

# 要想建立正则表达式 \.，我们需要使用字符串 "\\."
dot <- "\\."

dot %>% writeLines()

## \.

str_view(c("abc", "a.c", "bef"), "a\\.c")

如果 \ 在正则表达式中用作转义字符，那么如何匹配 \ 这个字符呢？我们还是需要去除其特殊意义，建立形式为 \\ 的正则表达式。要想建立这样的正则表达式，我们需要使用一个字符串，其中还需要对 \ 进行转义。这意味着要想匹配字符 \，我们需要输入 "\\\\"—— 你需要 4 个反斜杠来匹配 1 个反斜杠！

10.3.2 练习

解释一下为什么这些字符串不能匹配一个反斜杠： "\"、"\\"、"\\\"。
如何匹配字符序列 "'\ ？

writeLines("\'\\\\")

## '\\

正则表达式 \..\..\.. 会匹配哪种模式？如何用字符串来表示这个正则表达式？

10.3.3 锚点

有时我们需要在正则表达式中设置锚点，以便 R 从字符串的开头或末尾进行匹配。我们可以设置两种锚点。

^ 从字符串开头进行匹配。
$ 从字符串末尾进行匹配。

x <- c("apple", "banana", "pear")

str_view(x, "^a")

str_view(x, "a$")

始于权力（^），终于金钱（$）

如果想要强制正则表达式匹配一个完整字符串，那么可以同时设置 ^ 和 $ 这两个锚点：

x <- c("apple pie", "apple", "apple cake")

str_view(x,"apple")

str_view(x,"^apple$")

例如，要搜索 sum 而又避免匹配到 summarize、summary、rowsum 等，我们会使用 \bsum\b 进行搜索（\b 匹配单词边界）。

10.3.4 练习

如何匹配字符串 “$^$” ？

x <- c("xyz","$^$fghd","qwe",1:10,"a")  # a character vector (1:10 is coerced to strings), not a single string; compare the calls below

str_c(x,collapse = "*")

## [1] "xyz*$^$fghd*qwe*1*2*3*4*5*6*7*8*9*10*a"

str_c(x,sep = "-")

##  [1] "xyz"     "$^$fghd" "qwe"     "1"       "2"       "3"       "4"      
##  [8] "5"       "6"       "7"       "8"       "9"       "10"      "a"

str_c(x,sep = "+",collapse = "")

## [1] "xyz$^$fghdqwe12345678910a"

str_view(x,"\\$\\^\\$",match = TRUE)

给定 stringr::words 中的常用单词语料库，创建正则表达式以找出满足下列条件的所有单词。

以 y 开头的单词。

str_view(words,"^y",match = TRUE)

以 x 结尾的单词。

str_view(words,"x$",match = TRUE)

长度正好为 3 个字符的单词。（不要使用 str_length() 函数，这是作弊！）

words[str_length(words) == 3]

##   [1] "act" "add" "age" "ago" "air" "all" "and" "any" "arm" "art" "ask"
##  [12] "bad" "bag" "bar" "bed" "bet" "big" "bit" "box" "boy" "bus" "but"
##  [23] "buy" "can" "car" "cat" "cup" "cut" "dad" "day" "die" "dog" "dry"
##  [34] "due" "eat" "egg" "end" "eye" "far" "few" "fit" "fly" "for" "fun"
##  [45] "gas" "get" "god" "guy" "hit" "hot" "how" "job" "key" "kid" "lad"
##  [56] "law" "lay" "leg" "let" "lie" "lot" "low" "man" "may" "mrs" "new"
##  [67] "non" "not" "now" "odd" "off" "old" "one" "out" "own" "pay" "per"
##  [78] "put" "red" "rid" "run" "say" "see" "set" "sex" "she" "sir" "sit"
##  [89] "six" "son" "sun" "tax" "tea" "ten" "the" "tie" "too" "top" "try"
## [100] "two" "use" "war" "way" "wee" "who" "why" "win" "yes" "yet" "you"

str_view(words,"^...$",match = TRUE)

具有 7 个或更多字符的单词。

因为这个列表非常长，所以你可以设置 str_view() 函数的 match 参数，只显示匹配的单词（match = TRUE）或未匹配的单词（match = FALSE）。

10.3.5 字符类与字符选项

\d 可以匹配任意数字。
\s 可以匹配任意空白字符（如空格、制表符和换行符）。
[abc] 可以匹配 a、 b 或 c。
[^abc] 可以匹配除 a、 b、 c 外的任意字符
* 匹配前面的子表达式任意次。例如，zo* 能匹配“z”，也能匹配“zo”以及“zoo”。* 等价于{0,}。
+ 匹配前面的子表达式一次或多次(大于等于1次）。例如，“zo+”能匹配“zo”以及“zoo”，但不能匹配“z”。+ 等价于{1,}。

请牢记，要想创建包含 \d 或 \s 的正则表达式，你需要在字符串中对 \ 进行转义，因此需要输入 "\\d" 或 "\\s"。

因为 | 的优先级很低，所以 abc|xyz 匹配的是 abc 或 xyz，而不是 abcyz 或 abxyz。

str_view(c("grey", "gray"), "gr(e|a)y")

10.3.6 练习

创建正则表达式来找出符合以下条件的所有单词。

以元音字母开头的单词。

str_view(words,"^[aeiou]",match = TRUE)

只包含辅音字母的单词（提示：考虑一下匹配“非”元音字母）。

str_view(words,pattern = "^[^aeiou]",match = TRUE)  # 始于^，终于$

str_view(words,pattern = "^[^aeiou]*$",match = TRUE)

以 ed 结尾，但不以 eed 结尾的单词。

str_view(words,"[^e]ed$",match = TRUE)

以 ing 或 ize 结尾的单词。

str_view(words,"(ing$)|(ize$)",match = TRUE)

实际验证一下规则： i 总是在 e 前面，除非 i 前面有 c。
q 后面总是跟着一个 u 吗？

qu <- str_detect(words,"qu")

q <- str_detect(words,"q")

identical(qu,q)  # 是

## [1] TRUE

编写一个正则表达式来匹配英式英语单词，排除美式英语单词。
创建一个正则表达式来匹配你所在国家的电话号码。

x <- c("123-0375-5698","123-456-789")

str_view(x,pattern = "\\d\\d\\d-\\d\\d\\d\\d-\\d\\d\\d\\d",match = TRUE)

10.3.7 重复

?： 0 次或 1 次。
+： 1 次或多次。
*： 0 次或多次。

x <- "1888 is the longest year in Roman numerals: MDCCCCCLXXXVIII"

str_view(x, "CC?")

str_view(x, "CC+")

str_view(x, 'C[LX]+')

{n}：匹配 n 次。
{n,}：匹配 n 次或更多次。
{,m}：最多匹配 m 次。
{n, m}：匹配 n 到 m 次。

str_view(x, "C{2}")

str_view(x, "C{2,}")

str_view(x, "C{2,3}?")

默认的匹配方式是“贪婪的”：正则表达式会匹配尽量长的字符串。通过在正则表达式后面添加一个?，你可以将匹配方式更改为“懒惰的”，即匹配尽量短的字符串。虽然这是正则表达式的高级特性，但知道这一点是非常有用的。

str_view(x, 'C{2,3}?')

x <- c("CLXXXXXXX","CXLLLLL","CXXXXXX")

str_view(x, 'C[LX]+?')

str_view(x, 'C[LX]+')

10.3.8 练习

给出与 ?、 + 和 * 等价的 {m, n} 形式的正则表达式。

# Sample inputs for comparing the quantifiers ?, + and * with their
# {m,n} equivalents: ? is {0,1}, + is {1,}, * is {0,}.
x <- c("asdfhkkkddf","asdfgkkkkkklll","xc")

y <- "1888 is the longest year in Roman numerals: MDCCCCCCCLXXXVIII"

str_view(x,"kk?",match = TRUE)   # "k" followed by 0 or 1 more "k" (same as "kk{0,1}"); this is greedy, not lazy

str_view(x,"kk+")  # "k" followed by 1 or more "k" (same as "kk{1,}"); greedy

str_view(x,"kk*")               # "k" followed by 0 or more "k" (same as "kk{0,}"); greedy

str_view(y,"CC?")  # 0 or 1 extra "C"; greedy (a lazy quantifier needs an additional trailing ?, e.g. "CC??")

str_view(y,"CC+")  # 1 or more extra "C"; greedy

str_view(y,"CC*")  # 0 or more extra "C"; greedy

用语言描述以下正则表达式匹配的是何种模式（仔细阅读来确认我们使用的是正则表达式，还是定义正则表达式的字符串）？

^.*$ 正则表达式

str_view(x,"^.*$")  # * 匹配前面的子表达式任意次。例如，zo*能匹配“z”，也能匹配“zo”以及“zoo”。*等价于{0,}。

str_view(x,"^.*?$")

"\\{.+\\}"（定义正则表达式的字符串）

writeLines("\\{.+\\}")

## \{.+\}

\d{4}-\d{2}-\d{2}（正则表达式，例如可匹配 1111-11-11）

str_view("1111-11-11","\\d{4}-\\d{2}-\\d{2}")

"\\\\{4}"（定义正则表达式的字符串）

创建正则表达式来找出满足以下条件的所有单词。

以 3 个辅音字母开头的单词

str_view(words,"^[^aeiou][^aeiou][^aeiou]",match = TRUE)

有连续 3 个或更多元音字母的单词。

str_view(words,"[aeiou][aeiou][aeiou]+",match = TRUE)

有连续 2 个或更多元音—辅音配对的单词。

10.3.9 分组与回溯引用

括号可以用于消除复杂表达式中的歧义。括号还可以定义“分组”，你可以通过回溯引用（如 \1、 \2 等）来引用这些分组。例如，以下的正则表达式可以找出名称中有重复的一对字母的所有水果：

str_view(fruit, "(..)\\1", match = TRUE)

10.3.10 练习

用语言描述以下正则表达式会匹配何种模式？

(.)\1\1

str_view_all("sssddfff", "(.)\\1\\1", match = TRUE)

“(.)(.)\2\1”

str_view(fruit, "(.)(.)\\2\\1", match = TRUE)

(..)\1

str_view(fruit, "(..)\\1", match = TRUE)

“(.).\1.\1”

str_view(fruit, "(.).\\1.\\1", match = TRUE)

"(.)(.)(.).*\3\2\1"

str_view(fruit, "(.)(.)(.).*\\3\\2\\1", match = TRUE)

创建正则表达式来匹配出以下单词。

开头字母和结尾字母相同的单词。

str_view(words,"^(.)[a-zA-Z0-9]*\\1$",match = TRUE)

包含一对重复字母的单词（例如， church 中包含了重复的 ch）。
包含一个至少重复 3 次的字母的单词（例如， eleven 中的 e 重复了 3 次）。

10.4 工具

既然我们已经掌握了正则表达式的基础知识，现在是时候学习如何应用它们来解决实际问题了。我们将在本节中学习多种 stringr 函数，它们可以：

确定与某种模式相匹配的字符串；
找出匹配的位置；
提取出匹配的内容；
使用新值替换匹配内容；
基于匹配拆分字符串。

当遇到一个问题时，有些人会这样想：“我可以用正则表达式来搞定它。”于是，原来的一个问题就变成了两个问题。

别忘了你使用的是一门编程语言，还有很多其他工具随时待命。相对于创建一个复杂的正则表达式，更简单的方式是创建多个简单的正则表达式。如果很难使用一个正则表达式来解决问题，那么你就应该回过头仔细思考一下，是否可以将一个问题分解为多个小问题，然后一个个地依次解决。

10.4.1 匹配检测

x <- c("apple", "banana", "pear")

str_detect(x, "e")

## [1]  TRUE FALSE  TRUE

# 有多少个以t开头的常用单词？
sum(str_detect(words, "^t"))

## [1] 65

mean(str_detect(words,"^t"))

## [1] 0.06632653

# 以元音字母结尾的常用单词的比例是多少？
mean(str_detect(words, "[aeiou]$"))

## [1] 0.2765306

# 以下两种方法均可找出不包含元音字母的所有单词：
no_vowels_1 <- !str_detect(words, "[aeiou]")

# 找出仅包含辅音字母（非元音字母）的所有单词
no_vowels_2 <- str_detect(words, "^[^aeiou]*$")

str_view(words,"^[^aeiou]*$",match = TRUE)

identical(no_vowels_1, no_vowels_2)

## [1] TRUE

str_detect() 函数的一种常见用法是选取出匹配某种模式的元素。你可以通过逻辑取子集方式来完成这种操作，也可以使用便捷的 str_subset() 包装器函数：

words[str_detect(words, "x$")]

## [1] "box" "sex" "six" "tax"

str_subset(words, "x$")

## [1] "box" "sex" "six" "tax"

df <- tibble(
word = words,
i = seq_along(word)
)

# Keep only the rows of `df` whose word ends in "x".
# The pattern is tested against the `word` column of the piped tibble rather
# than the global `words` vector, so the filter stays correct even if `df`
# has been reordered or subset before this step.
df %>%
  filter(str_detect(word, "x$"))

## # A tibble: 4 x 2
##   word      i
##   <chr> <int>
## 1 box     108
## 2 sex     747
## 3 six     772
## 4 tax     841

str_detect() 函数的一种变体是 str_count()，后者不是简单地返回是或否，而是返回字符串中匹配的数量：

str_count(fruit,"a")

##  [1] 1 1 2 3 0 0 1 2 1 0 0 1 2 2 1 0 0 0 0 0 1 0 1 1 1 1 1 1 0 1 0 0 0 1 1
## [36] 2 0 0 1 1 0 0 1 0 0 1 0 2 1 0 1 0 0 1 1 3 1 1 1 0 1 1 0 2 0 1 0 1 2 1
## [71] 1 0 2 2 1 1 2 1 0 1

# 平均来看，每个单词中有多少个元音字母？
mean(str_count(words, "[aeiou]"))

## [1] 1.991837

str_count() 也完全可以同 mutate() 函数一同使用：

df %>% 
  mutate(vowels = str_count(word,"[aeiou]"),
         consonants = str_count(word,"[^aeiou]")) %>% head()

## # A tibble: 6 x 4
##   word         i vowels consonants
##   <chr>    <int>  <int>      <int>
## 1 a            1      1          0
## 2 able         2      2          2
## 3 about        3      3          2
## 4 absolute     4      4          4
## 5 accept       5      2          4
## 6 account      6      3          4

注意，匹配从来不会重叠。

str_count("abababa", "aba")

## [1] 2

str_view_all("abababa", "aba")

10.4.2 练习

试着使用两种方法来解决以下每个问题，一种方法是使用单个正则表达式，另一种方法是使用多个 str_detect() 函数的组合。

找出以 x 开头或结尾的所有单词。

test <- c("xdfshd","dsjhjsx")

str_view(c("xdfshd","dsjhjsx"),"(^x)|(x$)",match = TRUE)

test[str_detect(c("xdfshd","dsjhjsx"),"(^x)|(x$)")]

## [1] "xdfshd"  "dsjhjsx"

找出以元音字母开头并以辅音字母结尾的所有单词。

str_view("as","^[aeiou].*[^aeiou]$",match = TRUE)  # 对 全

str_view(words,"^[aeiou].+[^aeiou]$",match = TRUE) # 观察区别

str_view(words,"^[aeiou].?[^aeiou]$",match = TRUE)

是否存在包含所有元音字母的单词？

# A word "contains all vowels" when each of a, e, i, o, u occurs somewhere in
# it. A single pattern such as "^[aeiou][aeiou]*[aeiou]$" answers a different
# question (words made up of vowels only), so combine five simple
# str_detect() calls instead of writing one complicated regex.
has_all_vowels <- str_detect(words, "a") &
  str_detect(words, "e") &
  str_detect(words, "i") &
  str_detect(words, "o") &
  str_detect(words, "u")

# NOTE(review): stringr::words is expected to contain no such word
# (character(0)) -- confirm by running.
words[has_all_vowels]

str_view(words,"^[aeiou][aeiou]*[^aeiou]$",match = TRUE)

words[str_detect(words,"^[aeiou][aeiou]*[^aeiou]$")]

##  [1] "air" "as"  "at"  "eat" "if"  "in"  "it"  "of"  "on"  "or"  "out"
## [12] "up"

哪个单词包含最多数量的元音字母？哪个单词包含最大比例的元音字母？（提示：分母应该是什么？）

# str_view_all(words,"[aeiou]")

# str_count(words,"[aeiou]")

# Count the vowels in every word and show the words with the most vowels first.
# The count is computed from the `word` column of `df` (not the global `words`
# vector) so each count is guaranteed to belong to the row it is stored in.
df %>%
  mutate(yuanyin_count = str_count(word, "[aeiou]")) %>%
  arrange(desc(yuanyin_count)) %>%
  head()

## # A tibble: 6 x 3
##   word            i yuanyin_count
##   <chr>       <int>         <int>
## 1 appropriate    48             5
## 2 associate      57             5
## 3 available      62             5
## 4 colleague     166             5
## 5 encourage     268             5
## 6 experience    292             5

# Rank words by the proportion of their letters that are vowels.
#   yuanyin_count : number of vowels in the word
#   zimu_count    : total number of characters in the word (the denominator)
#   bili_yuanyin  : yuanyin_count / zimu_count
# All three are derived from the `word` column of `df` rather than the global
# `words` vector, and the length is taken with str_length() instead of
# str_count() with its default empty pattern.
df %>%
  mutate(
    yuanyin_count = str_count(word, "[aeiou]"),
    zimu_count = str_length(word),
    bili_yuanyin = yuanyin_count / zimu_count
  ) %>%
  arrange(desc(bili_yuanyin))

## # A tibble: 980 x 5
##    word       i yuanyin_count zimu_count bili_yuanyin
##    <chr>  <int>         <int>      <int>        <dbl>
##  1 a          1             1          1        1    
##  2 area      49             3          4        0.75 
##  3 idea     412             3          4        0.75 
##  4 age       22             2          3        0.667
##  5 ago       24             2          3        0.667
##  6 air       26             2          3        0.667
##  7 die      228             2          3        0.667
##  8 due      250             2          3        0.667
##  9 eat      256             2          3        0.667
## 10 europe   278             4          6        0.667
## # ... with 970 more rows

10.4.3 提取匹配内容

要想提取匹配的实际文本，我们可以使用 str_extract() 函数。

length(sentences)

## [1] 720

head(sentences)

## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."

sentences %>% class()

## [1] "character"

假设我们想要找出包含一种颜色的所有句子。

colors <- c( " red ", " orange ", " yellow ", " green ", " blue ", " purple " ) 

color_match <- str_c(colors, collapse = "|")

color_match

## [1] " red | orange | yellow | green | blue | purple "

str_view_all(sentences, color_match,match = TRUE)

has_color <- str_subset(sentences, color_match) 

matches <- str_extract(has_color, color_match)

head(matches)

## [1] " blue "   " blue "   " blue "   " yellow " " green "  " red "

注意， str_extract() 只提取第一个匹配。我们可以先选取出具有多于一种匹配的所有句子，然后就可以很容易地看到更多匹配：

more <- sentences[str_count(sentences, color_match) > 1] 

more    # 目标字符段

## [1] "It is hard to erase blue or red ink."

str_view_all(more, color_match)

str_extract(more, color_match)

## [1] " blue "

str_extract_all(more, color_match)

## [[1]]
## [1] " blue " " red "

如果设置了 simplify = TRUE，那么 str_extract_all() 会返回一个矩阵，其中较短的匹配会扩展到与最长的匹配具有同样的长度：

str_extract_all(more, color_match, simplify = TRUE)

##      [,1]     [,2]   
## [1,] " blue " " red "

x <- c("a", "a b", "a b c") 

str_extract_all(x, "[a-z]", simplify = TRUE)

##      [,1] [,2] [,3]
## [1,] "a"  ""   ""  
## [2,] "a"  "b"  ""  
## [3,] "a"  "b"  "c"

10.4.4 练习

在前面的示例中，你或许已经发现正则表达式匹配了flickered，这并不是一种颜色。修改正则表达式来解决这个问题。
从 Harvard sentences 数据集中提取以下内容。

每个句子的第一个单词。
以 ing 结尾的所有单词。

str_view(sentences,"ing.$",match = TRUE)

sentences_ing_end <- str_subset(sentences,"ing.$")

str_extract_all(sentences_ing_end,"spring",simplify = TRUE)

##       [,1]    
##  [1,] "spring"
##  [2,] ""      
##  [3,] ""      
##  [4,] ""      
##  [5,] ""      
##  [6,] ""      
##  [7,] ""      
##  [8,] ""      
##  [9,] ""      
## [10,] ""      
## [11,] ""      
## [12,] ""

所有复数形式的单词。

noun <- "(a|the) ([^ ]+)"

has_noun <- sentences %>% 
  str_subset(noun) %>%
  head(10) 

has_noun %>% str_extract(noun)

##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

str_extract() 函数可以给出完整匹配； str_match() 函数则可以给出每个独立分组。 str_match() 返回的不是字符向量，而是一个矩阵，其中一列是完整匹配，后面的列是每个分组的匹配：

has_noun %>% str_match(noun)

##       [,1]         [,2]  [,3]     
##  [1,] "the smooth" "the" "smooth" 
##  [2,] "the sheet"  "the" "sheet"  
##  [3,] "the depth"  "the" "depth"  
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked" 
##  [6,] "the sun"    "the" "sun"    
##  [7,] "the huge"   "the" "huge"   
##  [8,] "the ball"   "the" "ball"   
##  [9,] "the woman"  "the" "woman"  
## [10,] "a helps"    "a"   "helps"

10.4.6　练习

找出跟在一个数词（one、 two、 three 等）后面的所有单词，提取出数词与后面的单词。
找出所有缩略形式，分别列出撇号前面和后面的部分。

10.4.7 替换匹配内容

str_replace() 和 str_replace_all() 函数可以使用新字符串替换匹配内容。最简单的应用是使用固定字符串替换匹配内容：

x <- c("apple", "pear", "banana") 

str_replace(x, "[aeiou]", "-")

## [1] "-pple"  "p-ar"   "b-nana"

str_replace_all(x, "[aeiou]", "-")

## [1] "-ppl-"  "p--r"   "b-n-n-"

x <- c("1 house", "2 cars", "3 people") 

str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))  # 多了怎么办？

## [1] "one house"    "two cars"     "three people"

除了使用固定字符串替换匹配内容，你还可以使用回溯引用来插入匹配中的分组。在下面的代码中，我们交换了第二个单词和第三个单词的顺序：

str_replace_all(sentences,"^([^ ]+) ([^ ]+) ([^ ]+)","\\1 \\3 \\2") %>% head()

## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."       
## [6] "The of juice lemons makes fine punch."

10.4.8 练习

使用反斜杠替换字符串中的所有斜杠。
使用 replace_all() 函数实现 str_to_lower() 函数的一个简单版。

str_to_lower("assWWW")

## [1] "asswww"

# str_view_all(sentences,"[A-Z]",match = TRUE)

交换 words 中单词的首字母和末尾字母，其中哪些字符串仍然是个单词？

10.4.9 拆分

str_split() 函数可以将字符串拆分为多个片段。例如，我们可以将句子拆分成单词：

sentences %>% 
  head(5) %>% 
  str_split(" ")

## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."

"a|b|c|d" %>% str_split("\\|") %>% .[[1]]

## [1] "a" "b" "c" "d"

否则，和返回列表的其他 stringr 函数一样，你可以通过设置 simplify = TRUE 返回一个矩阵：

sentences %>% head(5) %>% str_split(" ", simplify = TRUE)

##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]    
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth"
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"  
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"    
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"     
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls."
##      [,8]          [,9]   
## [1,] "planks."     ""     
## [2,] "background." ""     
## [3,] "a"           "well."
## [4,] "rare"        "dish."
## [5,] ""            ""

你还可以设定拆分片段的最大数量：

fields <- c("Name: Hadley", "Country: NZ", "Age: 35")

fields %>% str_split(": ", n = 2, simplify = TRUE)

##      [,1]      [,2]    
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"    
## [3,] "Age"     "35"

除了模式，你还可以通过字母、行、句子和单词边界（boundary() 函数）来拆分字符串：

x <- "This is a sentence. This is another sentence." 

str_view_all(x, boundary("word"))

str_split(x, " ")[[1]]

## [1] "This"      "is"        "a"         "sentence." "This"      "is"       
## [7] "another"   "sentence."

str_split(x, boundary("word"))[[1]]

## [1] "This"     "is"       "a"        "sentence" "This"     "is"      
## [7] "another"  "sentence"

10.4.10 练习

拆分字符串 “apples, pears, and bananas”。

x <- c("apples, pears, and bananas")

str_split(x," ",simplify = TRUE)

##      [,1]      [,2]     [,3]  [,4]     
## [1,] "apples," "pears," "and" "bananas"

str_split(x,boundary("word"),simplify = TRUE)

##      [,1]     [,2]    [,3]  [,4]     
## [1,] "apples" "pears" "and" "bananas"

为什么使用 boundary(“word”) 的拆分效果要比 " " 好？
使用空字符串（""）进行拆分会得到什么结果？尝试一下，然后阅读文档

10.4.11 定位匹配内容

str_locate() 和 str_locate_all() 函数可以给出每个匹配的开始位置和结束位置。

当没有其他函数能够精确地满足需求时，这两个函数特别有用。你可以使用 str_locate() 函数找出匹配的模式，然后使用str_sub() 函数来提取或修改匹配的内容

10.5 其他类型的模式

# 正常调用： 
str_view(fruit, "nana",match = TRUE)

# 上面形式是以下形式的简写 
str_view(fruit, regex("nana"),match = TRUE)

你可以使用 regex() 函数的其他参数来控制具体的匹配方式。

ignore_case = TRUE 既可以匹配大写字母，也可以匹配小写字母，它总是使用当前的区域设置：

bananas <- c("banana", "Banana", "BANANA") 

str_view(bananas, "banana")

str_view(bananas, regex("banana", ignore_case = TRUE))

multiline = TRUE可以使得 ^ 和 $ 从每行的开头和末尾开始匹配，而不是从完整字符串的开头和末尾开始匹配：

x <- "Line 1\nLine 2\nLine 3" 
str_extract_all(x, "^Line")

## [[1]]
## [1] "Line"

str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]

## [1] "Line" "Line" "Line"

comments = TRUE 可以让你在复杂的正则表达式中加入注释和空白字符，以便更易理解。匹配时会忽略空格和 # 后面的内容。如果想要匹配一个空格，你需要对其进行转义： "\\ "：

# Free-spacing (comments = TRUE) regex for a 3-3-4 digit phone number with an
# optional opening parenthesis and optional separators. Each digit group is
# captured, so str_match() returns the full match followed by the three groups.
# The last group must be four digits; with (\\d{3}) the final digit of
# "514-791-8141" was silently dropped from the match.
phone <- regex(" \\(? # 可选的开括号
               (\\d{3}) # 地区编码
               [)- ]? # 可选的闭括号、短划线或空格
               (\\d{3}) # 另外3个数字 
               [ -]? # 可选的空格或短划线
               (\\d{4}) # 最后4个数字 ", 
               comments = TRUE)

str_match("514-791-8141", phone)

##      [,1]           [,2]  [,3]  [,4]  
## [1,] "514-791-8141" "514" "791" "8141"

dotall = TRUE 可以使得 . 匹配包括 \n 在内的所有字符。

在介绍 str_split() 函数时，你已经知道可以使用 boundary() 函数来匹配边界。你还可以在其他函数中使用这个函数：

x <- "This is a sentence."

str_view_all(x, boundary("word"))

str_extract_all(x, boundary("word"))

## [[1]]
## [1] "This"     "is"       "a"        "sentence"

10.6 正则表达式的其他应用

apropos() 函数可以在全局环境空间中搜索所有可用对象。当不能确切想起函数名称时，这个函数特别有用：

library(forecastHybrid)

## Loading required package: forecast

## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Registered S3 methods overwritten by 'forecast':
##   method             from    
##   fitted.fracdiff    fracdiff
##   residuals.fracdiff fracdiff

## Loading required package: thief

apropos("replace")

## [1] "%+replace%"       "replace"          "replace_na"      
## [4] "setReplaceMethod" "str_replace"      "str_replace_all" 
## [7] "str_replace_na"   "theme_replace"

apropos("cast")

##  [1] "extractForecasts"  "forecast"          "forecast.ets"     
##  [4] "geom_forecast"     "GeomForecast"      "is.forecast"      
##  [7] "is.mforecast"      "is.splineforecast" "KalmanForecast"   
## [10] "StatForecast"

你可以使用以下代码返回当前目录中的所有 R Markdown 文件

dir()

##  [1] "2019-11-2-hdm-learning.html"             
##  [2] "2019-11-2-hdm learning.Rmd"              
##  [3] "4221-Article Text-7275-1-10-20190705.pdf"
##  [4] "Apply-Series-Function-of-R.html"         
##  [5] "Apply Series Function of R.Rmd"          
##  [6] "apply.png"                               
##  [7] "bookdown.pdf"                            
##  [8] "data-import.pdf"                         
##  [9] "data-transformation.pdf"                 
## [10] "data-visualization-2.1.pdf"              
## [11] "dplyr.html"                              
## [12] "dplyr.Rmd"                               
## [13] "forecast-时间序列分析.pdf"               
## [14] "hdm.png"                                 
## [15] "hdm2.png"                                
## [16] "iris.csv"                                
## [17] "iris.txt"                                
## [18] "Learn-Base-R-effectively.html"           
## [19] "Learn Base R effectively.Rmd"            
## [20] "library_cheets.Rproj"                    
## [21] "portesFunctions-统计推断.pdf"            
## [22] "postdouble.Rmd"                          
## [23] "purrr.html"                              
## [24] "purrr.Rmd"                               
## [25] "rmarkdown.pdf"                           
## [26] "R数据科学.pdf"                           
## [27] "R数据科学笔记.Rmd"                       
## [28] "str.html"                                
## [29] "str.Rmd"                                 
## [30] "strings.pdf"                             
## [31] "tvReg.pdf"                               
## [32] "使用tsibbles整理时间序列数据.html"       
## [33] "使用tsibbles整理时间序列数据.nb.html"    
## [34] "使用tsibbles整理时间序列数据.Rmd"

dir(pattern = "\\.Rmd$")

## [1] "2019-11-2-hdm learning.Rmd"       "Apply Series Function of R.Rmd"  
## [3] "dplyr.Rmd"                        "Learn Base R effectively.Rmd"    
## [5] "postdouble.Rmd"                   "purrr.Rmd"                       
## [7] "R数据科学笔记.Rmd"                "str.Rmd"                         
## [9] "使用tsibbles整理时间序列数据.Rmd"

# List the HTML files in the current directory. The trailing $ anchors the
# extension to the end of the file name (as in the "\\.Rmd$" call above), so
# names that merely contain ".html" somewhere in the middle are not returned.
dir(pattern = "\\.html$")

## [1] "2019-11-2-hdm-learning.html"         
## [2] "Apply-Series-Function-of-R.html"     
## [3] "dplyr.html"                          
## [4] "Learn-Base-R-effectively.html"       
## [5] "purrr.html"                          
## [6] "str.html"                            
## [7] "使用tsibbles整理时间序列数据.html"   
## [8] "使用tsibbles整理时间序列数据.nb.html"

（如果更喜欢使用 *.Rmd 这样的“通配符”，你可以通过 glob2rx() 函数将其转换为正则表达式.）

10.7 stringi

与 stringr 不同， stringi 的设计思想是尽量全面，几乎包含了我们可以用到的所有函数： stringi 中有 234 个函数，而 stringr 中只有 42 个。

stringr learning

LJJ

2019/11/25

10.1 简介

10.2 字符串基础

10.2.1 字符串长度

10.2.2 字符串组合

10.2.3 字符串取子集

10.2.4 区域设置

10.2.5 练习

10.3　使用正则表达式进行模式匹配

10.3.1 基础匹配

10.3.2 练习

10.3.3 锚点

10.3.4 练习

10.3.5 字符类与字符选项

10.3.6 练习

10.3.7 重复

10.3.8 练习

10.3.9 分组与回溯引用

10.3.10 练习

10.4 工具

10.4.1 匹配检测

10.4.2 练习

10.4.3 提取匹配内容

10.4.4 练习

10.4.7 替换匹配内容

10.4.8 练习

10.4.9 拆分

10.4.10 练习

10.4.11 定位匹配内容

10.5 其他类型的模式

10.6 正则表达式的其他应用

10.7 stringi

stringr learning

LJJ

2019/11/25

10.1 简介

10.2 字符串基础

10.2.1 字符串长度

10.2.2 字符串组合

10.2.3 字符串取子集

10.2.4 区域设置

10.2.5 练习

10.3 使用正则表达式进行模式匹配

10.3.1 基础匹配

10.3.2 练习

10.3.3 锚点

10.3.4 练习

10.3.5 字符类与字符选项

10.3.6 练习

10.3.7 重复

10.3.8 练习

10.3.9 分组与回溯引用

10.3.10 练习

10.4 工具

10.4.1 匹配检测

10.4.2 练习

10.4.3 提取匹配内容

10.4.4 练习

10.4.7 替换匹配内容

10.4.8 练习

10.4.9 拆分

10.4.10 练习

10.4.11 定位匹配内容

10.5 其他类型的模式

10.6 正则表达式的其他应用

10.7 stringi

10.3　使用正则表达式进行模式匹配