本文档介绍R中字符串处理相关函数和方法,包括base包中的函数和stringr包中的函数。

1. base包

1.1 字符串统计

x <- c("Hello", "World", "!") 
nchar(x)  # 字符串向量中每个字符串的长度   
## [1] 5 5 1
length(x)  # 字符串向量中元素(字符串)的个数
## [1] 3

1.2 字符串转换

x <- "AAaaBbb" 
tolower(x) 
## [1] "aaaabbb"
toupper(x) 
## [1] "AAAABBB"
chartr("A", "a", x) # "aaaaBbb", 将所有"A"替换成"a"
## [1] "aaaaBbb"

1.3 字符串连接

x <- c("aaa", "bbb", "ccc")
paste(x, 1:3, sep = "-")                 
## [1] "aaa-1" "bbb-2" "ccc-3"
paste(x, 1:3, sep = "-", collapse = "/") 
## [1] "aaa-1/bbb-2/ccc-3"
paste0(x, 1:3)  # 等价于paste(x, 1:3, sep = ""), 效率高
## [1] "aaa1" "bbb2" "ccc3"

1.4 字符串拆分

x <- "Hello World!\nI'm coming!" 
strsplit(x, split = " ", fixed = FALSE, perl = FALSE, useBytes = FALSE) # split默认按正则表达式解释; 若要按字面精确匹配(不使用正则表达式), 请设置fixed = TRUE
## [[1]]
## [1] "Hello"       "World!\nI'm" "coming!"
strsplit(x, split = "\\s") # \s匹配任意的空白符, 包括空格, 制表符(Tab), 换行符\n, 中文全角空格等
## [[1]]
## [1] "Hello"   "World!"  "I'm"     "coming!"
strsplit(x, split = "") # 特殊情况: 参数split = ""时, 将字符串拆分成单个字符, 此时\n也被当做一个字符处理
## [[1]]
##  [1] "H"  "e"  "l"  "l"  "o"  " "  "W"  "o"  "r"  "l"  "d"  "!"  "\n" "I" 
## [15] "'"  "m"  " "  "c"  "o"  "m"  "i"  "n"  "g"  "!"

1.5 字符串查询

files <- list.files("c:\\windows")  # windows文件夹下所有文件及文件夹
grep(x = files, pattern = "\\.exe$")   # 返回匹配项的下标
##  [1]   3  10  27  29  32  33  47  58  59  60  62  78  92  93 102 105 107
grepl(x = files, pattern = "\\.exe$")  # 返回是否匹配的逻辑结果
##   [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE
##  [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [56] FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [78]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE
files[grep("\\.exe$", files)]   # 提取结果
##  [1] "ampa.exe"       "bfsvc.exe"      "explorer.exe"   "fveupdate.exe" 
##  [5] "HelpPane.exe"   "hh.exe"         "notepad.exe"    "py.exe"        
##  [9] "pyw.exe"        "regedit.exe"    "regtlib.exe"    "splwow64.exe"  
## [13] "twunk_16.exe"   "twunk_32.exe"   "winhlp32.exe"   "write.exe"     
## [17] "xinstaller.exe"
files[grepl("\\.exe$", files)]  # 提取结果
##  [1] "ampa.exe"       "bfsvc.exe"      "explorer.exe"   "fveupdate.exe" 
##  [5] "HelpPane.exe"   "hh.exe"         "notepad.exe"    "py.exe"        
##  [9] "pyw.exe"        "regedit.exe"    "regtlib.exe"    "splwow64.exe"  
## [13] "twunk_16.exe"   "twunk_32.exe"   "winhlp32.exe"   "write.exe"     
## [17] "xinstaller.exe"
identical(files[grep("\\.exe$", files)], files[grepl("\\.exe$", files)]) # 两种方法,返回结果相同
## [1] TRUE
x <- c("Hello, World!", "Hi, World!", "How are you? World.")
regexpr(pattern = "World", text = x)   # 返回一个长度和text相同的整数向量, 标识匹配的首字符位置, 若不匹配则为-1
## [1]  8  5 14
## attr(,"match.length")
## [1] 5 5 5
## attr(,"useBytes")
## [1] TRUE
gregexpr(pattern = "World", text = x)  # 返回一个list, 每个元素给出对应字符串中所有匹配项的位置(regexpr只返回第一个匹配项), 属性与regexpr相同
## [[1]]
## [1] 8
## attr(,"match.length")
## [1] 5
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1] 5
## attr(,"match.length")
## [1] 5
## attr(,"useBytes")
## [1] TRUE
## 
## [[3]]
## [1] 14
## attr(,"match.length")
## [1] 5
## attr(,"useBytes")
## [1] TRUE
regexec(pattern = "World", text = x)   # 返回一个list, 每个元素给出第一个匹配项(及括号子表达式)的位置; 本例输出比gregexpr少一个属性useBytes
## [[1]]
## [1] 8
## attr(,"match.length")
## [1] 5
## 
## [[2]]
## [1] 5
## attr(,"match.length")
## [1] 5
## 
## [[3]]
## [1] 14
## attr(,"match.length")
## [1] 5

1.6 字符串替换

x <- c("aabaa", "aab", "cba")
sub(x = x, pattern = "aa", replacement = "oo")   # 替换第一个匹配项
## [1] "oobaa" "oob"   "cba"
gsub(x = x, pattern = "aa", replacement = "oo")  # 替换所有匹配项
## [1] "ooboo" "oob"   "cba"
# 例子: 利用正则表达式后向引用\1替换字符串, .*连在一起表示任意数量的不包含换行的字符
sub(x = x, pattern = ".*(aa).*", replacement = "\\1")
## [1] "aa"  "aa"  "cba"

1.7 字符串提取

x <- "123456789"
substr(x = x, start = c(2, 4), stop = c(4, 5, 8))         # 结果长度与x相同, start和stop只按x的长度取值, 这里只用到各自的第一个值(2和4)
## [1] "234"
substring(text = x,  first = c(2, 4), last = c(4, 5, 8))  # 起始位置和结束位置接受向量参数, 各参数循环补齐到最长参数的长度(第3项为first = 2, last = 8)
## [1] "234"     "45"      "2345678"

2. stringr包

library(stringr)

2.1 字符串统计

x <- c("Hello", "World", "!") 
str_length(x) 
## [1] 5 5 1
identical(str_length(x), nchar(x))  # 与nchar相同
## [1] TRUE

2.2 字符串连接

x <- c("aaa", "bbb", "ccc")
str_c(x, 1:3, sep = "-")                 
## [1] "aaa-1" "bbb-2" "ccc-3"
str_c(x, 1:3, sep = "-", collapse = "/") 
## [1] "aaa-1/bbb-2/ccc-3"
identical(str_c(x, 1:3, sep = "-"), paste(x, 1:3, sep = "-"))  # 与paste相同 
## [1] TRUE

2.3 字符串拆分

x <- "Hello World!\nI'm coming!" 
str_split(x, pattern = "\\s")
## [[1]]
## [1] "Hello"   "World!"  "I'm"     "coming!"
identical(str_split(x, pattern = "\\s"), strsplit(x, split = "\\s"))  # 与strsplit相同
## [1] TRUE

2.4 字符串查询

files <- list.files("c:\\windows") # windows文件夹下所有文件及文件夹
str_detect(files, pattern = "\\.exe$")
##   [1] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE
##  [12] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [23] FALSE FALSE FALSE FALSE  TRUE FALSE  TRUE FALSE FALSE  TRUE  TRUE
##  [34] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [45] FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [56] FALSE FALSE  TRUE  TRUE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE
##  [67] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [78]  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
##  [89] FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
## [100] FALSE FALSE  TRUE FALSE FALSE  TRUE FALSE  TRUE FALSE
identical(str_detect(files, pattern = "\\.exe$"),
          grepl(files, pattern = "\\.exe$"))  #  结果与grepl相同
## [1] TRUE
str_locate(files, pattern = "\\.exe$")  # 返回一个列数为2的矩阵,第1列为匹配首位置, 第二列为匹配末位置, 未匹配返回NA
##        start end
##   [1,]    NA  NA
##   [2,]    NA  NA
##   [3,]     5   8
##   [4,]    NA  NA
##   [5,]    NA  NA
##   [6,]    NA  NA
##   [7,]    NA  NA
##   [8,]    NA  NA
##   [9,]    NA  NA
##  [10,]     6   9
##  [11,]    NA  NA
##  [12,]    NA  NA
##  [13,]    NA  NA
##  [14,]    NA  NA
##  [15,]    NA  NA
##  [16,]    NA  NA
##  [17,]    NA  NA
##  [18,]    NA  NA
##  [19,]    NA  NA
##  [20,]    NA  NA
##  [21,]    NA  NA
##  [22,]    NA  NA
##  [23,]    NA  NA
##  [24,]    NA  NA
##  [25,]    NA  NA
##  [26,]    NA  NA
##  [27,]     9  12
##  [28,]    NA  NA
##  [29,]    10  13
##  [30,]    NA  NA
##  [31,]    NA  NA
##  [32,]     9  12
##  [33,]     3   6
##  [34,]    NA  NA
##  [35,]    NA  NA
##  [36,]    NA  NA
##  [37,]    NA  NA
##  [38,]    NA  NA
##  [39,]    NA  NA
##  [40,]    NA  NA
##  [41,]    NA  NA
##  [42,]    NA  NA
##  [43,]    NA  NA
##  [44,]    NA  NA
##  [45,]    NA  NA
##  [46,]    NA  NA
##  [47,]     8  11
##  [48,]    NA  NA
##  [49,]    NA  NA
##  [50,]    NA  NA
##  [51,]    NA  NA
##  [52,]    NA  NA
##  [53,]    NA  NA
##  [54,]    NA  NA
##  [55,]    NA  NA
##  [56,]    NA  NA
##  [57,]    NA  NA
##  [58,]     3   6
##  [59,]     4   7
##  [60,]     8  11
##  [61,]    NA  NA
##  [62,]     8  11
##  [63,]    NA  NA
##  [64,]    NA  NA
##  [65,]    NA  NA
##  [66,]    NA  NA
##  [67,]    NA  NA
##  [68,]    NA  NA
##  [69,]    NA  NA
##  [70,]    NA  NA
##  [71,]    NA  NA
##  [72,]    NA  NA
##  [73,]    NA  NA
##  [74,]    NA  NA
##  [75,]    NA  NA
##  [76,]    NA  NA
##  [77,]    NA  NA
##  [78,]     9  12
##  [79,]    NA  NA
##  [80,]    NA  NA
##  [81,]    NA  NA
##  [82,]    NA  NA
##  [83,]    NA  NA
##  [84,]    NA  NA
##  [85,]    NA  NA
##  [86,]    NA  NA
##  [87,]    NA  NA
##  [88,]    NA  NA
##  [89,]    NA  NA
##  [90,]    NA  NA
##  [91,]    NA  NA
##  [92,]     9  12
##  [93,]     9  12
##  [94,]    NA  NA
##  [95,]    NA  NA
##  [96,]    NA  NA
##  [97,]    NA  NA
##  [98,]    NA  NA
##  [99,]    NA  NA
## [100,]    NA  NA
## [101,]    NA  NA
## [102,]     9  12
## [103,]    NA  NA
## [104,]    NA  NA
## [105,]     6   9
## [106,]    NA  NA
## [107,]    11  14
## [108,]    NA  NA

2.5 字符串替换

x <- c("aabaa", "aab", "cba")
str_replace(x, pattern = "aa", replacement = "oo")     # 替换第一个匹配项
## [1] "oobaa" "oob"   "cba"
str_replace_all(x, pattern = "aa", replacement = "oo") # 替换所有匹配项
## [1] "ooboo" "oob"   "cba"
identical(str_replace(x, pattern = "aa", replacement = "oo"),
          sub(x, pattern = "aa", replacement = "oo"))  # 结果与sub相同
## [1] TRUE

2.6 字符串提取

x <- "123456789"
#  按位置提取
str_sub(x, start = c(2, 4), end = c(4, 5, 8))
## [1] "234"     "45"      "2345678"
identical(str_sub(x, start = c(2, 4), end = c(4, 5, 8)),
          substring(text = x,  first = c(2, 4), last = c(4, 5, 8)))  # 结果与substring相同
## [1] TRUE
str_sub(x, start = c(2, 4), end = c(4, 5, 8)) <- c("aa","bb")  # 字符串替换的另一种方法 
print(x)
## [1] "1aa56789"  "123bb6789" "1aa9"
# 按匹配结果提取
x <- c("aabaa", "aab", "cba")
str_match(x, "aa")  # 只返回第一个匹配项,矩阵形式
##      [,1]
## [1,] "aa"
## [2,] "aa"
## [3,] NA
str_match_all(x, "aa")  # 返回所有匹配项
## [[1]]
##      [,1]
## [1,] "aa"
## [2,] "aa"
## 
## [[2]]
##      [,1]
## [1,] "aa"
## 
## [[3]]
## character(0)
str_extract(x, "aa")  # 只返回第一个匹配项,向量形式
## [1] "aa" "aa" NA
str_extract_all(x, "aa")  # 返回所有匹配项
## [[1]]
## [1] "aa" "aa"
## 
## [[2]]
## [1] "aa"
## 
## [[3]]
## character(0)

2.7 其他的一些新功能

x <- c("  aa", "  bb  ", "cc")
str_dup(x, times = 2)  # 每项重复2次(即在原字符串后再拼接一份)
## [1] "  aa  aa"     "  bb    bb  " "cccc"
str_trim(x, side = "both")  # 去掉每项两边的空格
## [1] "aa" "bb" "cc"
str_pad(x, width = 10, side = "left")  # 在每项的左边填充空格, 使每项的总宽度达到10(不是增加10个空格)
## [1] "        aa" "      bb  " "        cc"