这篇笔记记录一下在 R 中 如何处理字符串,虽然对我个人来说更加习惯 perl ,但是不想 R、perl 来回切换,所以还是在 R 中学习一下,stringr 这个包很强大

library(tidyverse)
## ─ Attaching packages ─────────────────────── tidyverse 1.2.1 ─
## ✔ ggplot2 3.2.0     ✔ purrr   0.3.2
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   0.8.3     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ─ Conflicts ──────────────────────── tidyverse_conflicts() ─
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(stringr)

string basics

你可以使用单引号或双引号创建一个字符串

string1 <- "this is a string"
string1
## [1] "this is a string"
string2 <- 'a "quote" inside a string'
string2
## [1] "a \"quote\" inside a string"

如果你想在字符串中包含双引号或单引号,可以使用反斜杠 \ 进行转义

double_quote <- "\""  # or '"'
double_quote
## [1] "\""
single_quote <- '\''  # or "'"
single_quote
## [1] "'"

如果你想创建带有反斜杠 \ 的字符串,同样需要转义(写成 "\\")

slash <- "\\"
slash
## [1] "\\"

string length

R 基础包中有许多处理字符串的函数,但它们的命名和参数顺序不统一,很难记住,因此建议避免使用;而 stringr 包中的函数命名一致(均以 str_ 开头),更容易记忆

str_length(c("a", "R for data science", NA))
## [1]  1 18 NA

对字符向量排序

str_order() 返回排序后的索引
str_sort() 返回排序后的实际值

x <- c("ab","ac","ba")
str_order(x)
## [1] 1 2 3
str_sort(x)
## [1] "ab" "ac" "ba"

combine strings

合并多个字符串为一个字符串

str_c("x","y")
## [1] "xy"
str_c("x","y","z")
## [1] "xyz"

指定 sep 参数定义分隔符

str_c("x","y",sep = ", ")
## [1] "x, y"

指定 collapse 把向量参数合并成一个大的字符串

str_c(head(letters),collapse = ",")
## [1] "a,b,c,d,e,f"
# 传入多个向量时,先按元素逐一拼接,再用 collapse 把拼接结果合并成一个字符串
 str_c(c('a','a1'),c('b','b1'),collapse='-')
## [1] "ab-a1b1"

str_c() 是一个向量化函数,可以对长度短的字符向量进行再循环

str_c("prefix-",c('a','b','c'),"-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"

将一个字符向量合并成一个字符串,使用 collapse 参数

str_c(c("x","y","z"),collapse = ",")  
## [1] "x,y,z"

提取字符串子集

x <- c("apple","banana","pear")
str_sub(x,1,3) # 从第一个字符开始提取到第三个字符结束
## [1] "app" "ban" "pea"

负数表示从后往前取子集,最后一个位置为 -1

str_sub(x,-3,-1)
## [1] "ple" "ana" "ear"

去除字符串两端的空白字符(str_trim() 是向量化的,传入字符向量时会对每个元素分别处理)

str_trim(" abc ")
## [1] "abc"
# 也可以指定去除哪端的空格
str_trim(" abc ",side = "left")
## [1] "abc "

用空格(或其他字符)把字符串填充到指定宽度(str_pad() 是向量化的,传入字符向量时会对每个元素分别处理)

str_pad("abc",5,side = "both")  # 5 表示填充后的长度为 5
## [1] " abc "
# pad 参数可以指定用什么字符填充  
str_pad("abc",5,side = "both",pad = "*")
## [1] "*abc*"

也可以结合 purrr 包的函数式编程,逐元素地去除或添加空格(str_trim() 和 str_pad() 本身已是向量化的,直接传入向量即可得到相同结果)

x <- c("x","y","z")
library(purrr)
map_chr(x,str_pad,5,side = "left")
## [1] "    x" "    y" "    z"
str_c(x[seq_len(3) - 1],collapse = ",")
## [1] "x,y"

match patterns with regular expressions

R 中的正则表达式和 perl 中有点不一样,基本语法类似

基础部分

str_view() 和 str_view_all() 这两个函数是用来学习正则表达式的

library(htmlwidgets)
x <- c("apple","banana","pear")  
str_view(x,"an")

" . " 可以表示任意字符,除了换行符

str_view(x,".a.")

如果你要匹配 "." 号本身,你需要进行转义(告诉 R 只想匹配字面意义上的 ".",消除 "." 匹配任意字符的特性)

str_view(c("abc","a.c","bef"),"a\\.c")  # 由于在 R 中 转义字符为 "\\" 
# 如果要匹配 \,需要用 \\\\ 来匹配
str_view(c("a\\c","abc"),"\\\\")

锚位符

^ 匹配字符串开头
$ 匹配字符串结尾

x <- c("apple","banana")
str_view(x,"^a")
str_view(x,"a$")  

若想匹配一个完整的字符串,同时使用 ^ 和 $

x <- c("apple pie", "apple", "apple cake")
str_view(x,"^apple$")

特殊字符集

和其他语言的正则表达式一样,在使用上不同的地方在于需要使用两个反斜杠(例如 \d 要写成 "\\d")

x <- c("1","a1w")
str_view(x,"^\\d$")

重复次数

和其他语言的正则表达式一样(略)

分组和反向引用

和其他语言的正则表达式一样(略),注意反向引用要使用双反斜杠(例如 "\\1")

x <- c("abab","abba")
str_view(x,"(..)\\1")

以上就是 R 中正则表达式的基础部分,后面将学习如何使用它们解决实际问题,stringr 包实现的功能有

  • 确定哪些与模式匹配
  • 找到模式匹配的位置
  • 提取匹配的内容
  • 替换匹配的部分
  • 根据匹配拆分字符串

detect matches

检查一个字符向量是否匹配一个模式,返回的是一个逻辑向量

x <- c("apple","banana","pear")
str_detect(x,"e")
## [1]  TRUE FALSE  TRUE

一个常见的应用就是,用得到的逻辑向量去获取匹配的子元素

x[str_detect(x,"e")]
## [1] "apple" "pear"
# 可以使用 str_subset 替代上式  
str_subset(x,"e")
## [1] "apple" "pear"

实际应用场景更多的是,query 字符向量作为数据框的列向量

df <- tibble(word = x,
             i = 1:3)

df %>% filter(str_detect(word,"e"))
## # A tibble: 2 x 2
##   word      i
##   <chr> <int>
## 1 apple     1
## 2 pear      3

另一个有用的函数是 str_count(),将返回每一个字符串中能匹配的个数

x <- c("apple", "banana", "pear")
str_count(x, "a")
## [1] 1 3 1

将 str_count() 和 mutate() 相结合将会很有用

df %>% mutate(a = str_count(word,"a"))
## # A tibble: 3 x 3
##   word       i     a
##   <chr>  <int> <int>
## 1 apple      1     1
## 2 banana     2     3
## 3 pear       3     1

extract matches

使用一个案例 Harvard sentences,假设我们想提取句子中带有颜色的词汇

colours <- c("red", "orange", "yellow", "green", "blue", "purple")
colour_match <- str_c(colours, collapse = "|")
colour_match
## [1] "red|orange|yellow|green|blue|purple"
has_colour <- str_subset(sentences, colour_match)
matches <- str_extract(has_colour, colour_match)
head(matches)
## [1] "blue" "blue" "red"  "red"  "red"  "blue"

str_extract() 只提取第一个匹配,你能使用 str_extract_all() 提取所有匹配,结果以列表展示

str_extract_all(has_colour,colour_match) %>% head()
## [[1]]
## [1] "blue"
## 
## [[2]]
## [1] "blue"
## 
## [[3]]
## [1] "red"
## 
## [[4]]
## [1] "red"
## 
## [[5]]
## [1] "red"
## 
## [[6]]
## [1] "blue"

simplify = TRUE 则返回一个矩阵

str_extract_all(has_colour,colour_match,simplify = T) %>% head()
##      [,1]   [,2]
## [1,] "blue" ""  
## [2,] "blue" ""  
## [3,] "red"  ""  
## [4,] "red"  ""  
## [5,] "red"  ""  
## [6,] "blue" ""

grouped matches

找出 sentences 中所有以 “a” 或 “the” 为冠词的名词(这里简单地把冠词后面紧跟的一个单词当作名词)

noun <- "(a|the) ([^ ]+)"

has_noun <- sentences %>%
  str_subset(noun) %>%
  head(10)
has_noun %>% 
  str_extract(noun)
##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

str_match() 不同于 str_extract(),主要在于 str_match() 会返回每一个捕获的内容

noun <- "(a|the) ([^ ]+)"

has_noun <- sentences %>%
  str_subset(noun) %>%
  head(10)
has_noun %>% 
  str_match(noun)
##       [,1]         [,2]  [,3]     
##  [1,] "the smooth" "the" "smooth" 
##  [2,] "the sheet"  "the" "sheet"  
##  [3,] "the depth"  "the" "depth"  
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked" 
##  [6,] "the sun"    "the" "sun"    
##  [7,] "the huge"   "the" "huge"   
##  [8,] "the ball"   "the" "ball"   
##  [9,] "the woman"  "the" "woman"  
## [10,] "a helps"    "a"   "helps"

如果你的数据是 tibble, 使用 tidyr::extract()会很方便,这个函数类似于 str_match(),需要你命名捕获,然后形成一个新的变量

tibble(sentence = sentences) %>% 
  tidyr::extract(
    sentence, c("article", "noun"), "(a|the) ([^ ]+)", 
    remove = F
  )
## # A tibble: 720 x 3
##    sentence                                    article noun   
##    <chr>                                       <chr>   <chr>  
##  1 The birch canoe slid on the smooth planks.  the     smooth 
##  2 Glue the sheet to the dark blue background. the     sheet  
##  3 It's easy to tell the depth of a well.      the     depth  
##  4 These days a chicken leg is a rare dish.    a       chicken
##  5 Rice is often served in round bowls.        <NA>    <NA>   
##  6 The juice of lemons makes fine punch.       <NA>    <NA>   
##  7 The box was thrown beside the parked truck. the     parked 
##  8 The hogs were fed chopped corn and garbage. <NA>    <NA>   
##  9 Four hours of steady work faced us.         <NA>    <NA>   
## 10 Large size in stockings is hard to sell.    <NA>    <NA>   
## # … with 710 more rows

replacing matches

str_replace() 和 str_replace_all() 允许你替换匹配形成新的字符串

x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-") 
## [1] "-pple"  "p-ar"   "b-nana"
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-"  "p--r"   "b-n-n-"

使用 str_replace_all() 你也可以提供一个有 name 的 向量进行多处替换

x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house"    "two cars"     "three people"

在替换过程中,也可以结合反向引用

sentences %>% 
  str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>% 
  head(5)
## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."

splitting

使用 str_split() 分解字符串成多个部分,返回一个列表,同样可以指定 simplify = TRUE,返回矩阵

sentences %>%
  head(5) %>% 
  str_split(" ")  # 使用空格分解
## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>% str_split(": ", n = 2, simplify = TRUE)
##      [,1]      [,2]    
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"    
## [3,] "Age"     "35"

可以直接用 str_split_fixed() 返回矩阵格式

fields %>% str_split_fixed(": ",n = 2)
##      [,1]      [,2]    
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"    
## [3,] "Age"     "35"

截取字符串

str_sub("abcdef",1,3)
## [1] "abc"

其他模式

当你用一个字符串作为匹配模式时,将自动执行 regex() ,因此,你可以使用 regex() 去设置更多的匹配细节

str_view(c("apple","banana"),"nana")
# 等价于
str_view(c("apple","banana"),regex("nana"))

通过设置 regex() 其他的参数,可以实现更加强大的正则匹配

忽略大小写
ignore_case = TRUE, 默认为 FALSE

bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")
str_view(bananas,regex("banana",ignore_case = T))

多行匹配
multiline = TRUE 这种情况下, ^ 和 $ 匹配的是每一行的开头和结尾,而不是整个字符串的开头和结尾

x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x,"^Line")[[1]]
## [1] "Line"
str_extract_all(x,regex("^Line",multiline = T))[[1]]
## [1] "Line" "Line" "Line"

允许正则表达式中出现空格
comments = TRUE 允许在正则表达式中加入空格和 # 注释,使代码可读性大大增强,尤其是对于复杂的正则表达式

phone <- regex("
               \\(?       # optional opening parens
               (\\d{3})   # area code
               [) -]?     # optional closing parens, space, or dash
               (\\d{3})   # another three numbers
               [ -]?      # optional space or dash  
               (\\d{3})   # three more numbers
               ",
               comments = T)

str_match("514-791-8141",phone)
##      [,1]          [,2]  [,3]  [,4] 
## [1,] "514-791-814" "514" "791" "814"

允许点号匹配一切字符,包括换行符
dotall = TRUE

x <- "string\nabcd"
str_view(x,regex("g.*b",dotall = T))

字符定位函数

str_locate() 返回第一个匹配的首末位置,str_locate_all() 返回所有匹配的首末位置,一般与 str_sub() 函数搭配使用

fruit <- c("apple", "banana", "pear")
str_locate_all(fruit,'a')
## [[1]]
##      start end
## [1,]     1   1
## 
## [[2]]
##      start end
## [1,]     2   2
## [2,]     4   4
## [3,]     6   6
## 
## [[3]]
##      start end
## [1,]     3   3

字符串变换函数

大小写转换 str_to_upper() / str_to_lower()

str_to_upper("apple")
## [1] "APPLE"

将首字母变换为大写
str_to_title()

str_to_title("apple")
## [1] "Apple"