# 本文档介绍R中字符串处理相关的函数和方法, 包括base包中的函数和stringr包中的函数。
# Build a three-element character vector to measure.
x <- c("Hello", "World", "!")
# nchar() works element-wise: it gives one character count per string.
nchar(x, type = "chars")
## [1] 5 5 1
# length() counts the elements of the vector, not the characters inside them.
length(x)
## [1] 3
x <- "AAaaBbb"
# Convert every letter to lower case.
tolower(x)
## [1] "aaaabbb"
# Convert every letter to upper case.
toupper(x)
## [1] "AAAABBB"
# chartr() translates character by character: every "A" becomes "a", while
# all other characters (including "B") are left untouched.
chartr(old = "A", new = "a", x = x)
## [1] "aaaaBbb"
x <- c("aaa", "bbb", "ccc")
# Element-wise concatenation: x[i] and its index i are joined with sep.
paste(x, seq_along(x), sep = "-")
## [1] "aaa-1" "bbb-2" "ccc-3"
# collapse additionally glues the pasted elements into one single string.
paste(x, seq_along(x), sep = "-", collapse = "/")
## [1] "aaa-1/bbb-2/ccc-3"
# paste0() is shorthand for paste(..., sep = "") and is slightly more efficient.
paste0(x, seq_along(x))
## [1] "aaa1" "bbb2" "ccc3"
x <- "Hello World!\nI'm coming!"
# Split on a single space. split is interpreted as a regular expression by
# default; pass fixed = TRUE to match it literally instead.
strsplit(x, split = " ")
## [[1]]
## [1] "Hello" "World!\nI'm" "coming!"
# "\\s" is the regex class for whitespace (spaces, tabs, newlines, ...), so
# both the space and the "\n" act as separators here.
strsplit(x, split = "\\s")
## [[1]]
## [1] "Hello" "World!" "I'm" "coming!"
# Special case: an empty split breaks the string into single characters, and
# "\n" is treated as one character.
strsplit(x, split = "")
## [[1]]
## [1] "H" "e" "l" "l" "o" " " "W" "o" "r" "l" "d" "!" "\n" "I"
## [15] "'" "m" " " "c" "o" "m" "i" "n" "g" "!"
# List everything (files and folders) inside the Windows directory.
files <- list.files("c:\\windows")
# list.files() returns character(0) when the directory does not exist (i.e. on
# any non-Windows machine), which would make every example below print an
# empty result. Fall back to a small fixed sample of typical entries then.
if (length(files) == 0L) {
  files <- c(
    "addins", "bfsvc.exe", "explorer.exe", "Fonts",
    "notepad.exe", "system.ini", "win.ini", "write.exe"
  )
}
# The outputs shown below are those of the fallback sample; a real directory
# listing differs from machine to machine.
grep(pattern = "\\.exe$", x = files) # indices of the matching elements
## [1] 2 3 5 8
grepl(pattern = "\\.exe$", x = files) # one TRUE/FALSE per element
## [1] FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE
files[grep("\\.exe$", files)] # extract the matches by index
## [1] "bfsvc.exe"    "explorer.exe" "notepad.exe"  "write.exe"
files[grepl("\\.exe$", files)] # extract the matches by logical mask
## [1] "bfsvc.exe"    "explorer.exe" "notepad.exe"  "write.exe"
# Both ways of subsetting return exactly the same result.
identical(files[grep("\\.exe$", files)], files[grepl("\\.exe$", files)])
## [1] TRUE
x <- c("Hello, World!", "Hi, World!", "How are you? World.")
# regexpr() returns an integer vector as long as text holding the position of
# the first character of the first match in each element (-1 if no match);
# the match lengths are attached as the "match.length" attribute.
# NOTE: the printed attributes below were captured on an older R release;
# newer releases may print additional or different attributes.
regexpr("World", x)
## [1] 8 5 14
## attr(,"match.length")
## [1] 5 5 5
## attr(,"useBytes")
## [1] TRUE
# gregexpr() returns a list with one entry per element of text; each entry
# has the same layout as a regexpr() result.
gregexpr("World", x)
## [[1]]
## [1] 8
## attr(,"match.length")
## [1] 5
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] 5
## attr(,"match.length")
## [1] 5
## attr(,"useBytes")
## [1] TRUE
##
## [[3]]
## [1] 14
## attr(,"match.length")
## [1] 5
## attr(,"useBytes")
## [1] TRUE
# regexec() also returns a list; in the output captured here its entries lack
# the "useBytes" attribute that gregexpr() carries.
regexec("World", x)
## [[1]]
## [1] 8
## attr(,"match.length")
## [1] 5
##
## [[2]]
## [1] 5
## attr(,"match.length")
## [1] 5
##
## [[3]]
## [1] 14
## attr(,"match.length")
## [1] 5
x <- c("aabaa", "aab", "cba")
# sub() replaces only the first match in each element.
sub("aa", "oo", x)
## [1] "oobaa" "oob" "cba"
# gsub() replaces every match in each element.
gsub("aa", "oo", x)
## [1] "ooboo" "oob" "cba"
# Back-reference example: the pattern consumes the whole string (".*" matches
# any run of characters) and "\\1" keeps only what the group (aa) captured.
# Elements without a match are returned unchanged.
sub(".*(aa).*", "\\1", x)
## [1] "aa" "aa" "cba"
x <- "123456789"
# substr() returns one result per element of x: start and stop are recycled
# to length(x), so with a single string only their first values (2 and 4)
# are used and the remaining positions are ignored.
substr(x, start = c(2, 4), stop = c(4, 5, 8))
## [1] "234"
# substring() instead expands all arguments to the longest one (here 3), so
# x is reused for every first/last pair and first is cycled to c(2, 4, 2).
substring(x, first = c(2, 4), last = c(4, 5, 8))
## [1] "234" "45" "2345678"
library(stringr)
x <- c("Hello", "World", "!")
# str_length() is stringr's counterpart of nchar(): one count per string.
str_length(x)
## [1] 5 5 1
# Confirm that it returns exactly what nchar() returns.
identical(nchar(x), str_length(x))
## [1] TRUE
x <- c("aaa", "bbb", "ccc")
# str_c() joins its arguments element-wise, placing sep between the pieces.
str_c(x, seq_along(x), sep = "-")
## [1] "aaa-1" "bbb-2" "ccc-3"
# collapse then merges the joined elements into a single string.
str_c(x, seq_along(x), sep = "-", collapse = "/")
## [1] "aaa-1/bbb-2/ccc-3"
# Confirm that the result is the same as paste()'s.
identical(paste(x, seq_along(x), sep = "-"), str_c(x, seq_along(x), sep = "-"))
## [1] TRUE
x <- "Hello World!\nI'm coming!"
# Split on any whitespace character; the result is a list with one character
# vector per input string.
str_split(x, "\\s")
## [[1]]
## [1] "Hello" "World!" "I'm" "coming!"
# Confirm that the result is the same as strsplit()'s.
identical(strsplit(x, split = "\\s"), str_split(x, "\\s"))
## [1] TRUE
# List everything (files and folders) inside the Windows directory.
files <- list.files("c:\\windows")
# list.files() returns character(0) when the directory does not exist (i.e. on
# any non-Windows machine), which would make every example below print an
# empty result. Fall back to a small fixed sample of typical entries then.
if (length(files) == 0L) {
  files <- c(
    "addins", "bfsvc.exe", "explorer.exe", "Fonts",
    "notepad.exe", "system.ini", "win.ini", "write.exe"
  )
}
# The outputs shown below are those of the fallback sample; a real directory
# listing differs from machine to machine.
str_detect(files, pattern = "\\.exe$") # one TRUE/FALSE per element
## [1] FALSE  TRUE  TRUE FALSE  TRUE FALSE FALSE  TRUE
identical(
  str_detect(files, pattern = "\\.exe$"),
  grepl(pattern = "\\.exe$", x = files)
) # same result as grepl()
## [1] TRUE
# str_locate() returns a two-column matrix with one row per element: column 1
# is the start of the first match, column 2 its end, and NA marks no match.
str_locate(files, pattern = "\\.exe$")
##      start end
## [1,]    NA  NA
## [2,]     6   9
## [3,]     9  12
## [4,]    NA  NA
## [5,]     8  11
## [6,]    NA  NA
## [7,]    NA  NA
## [8,]     6   9
x <- c("aabaa", "aab", "cba")
# str_replace() replaces only the first match in each element.
str_replace(x, "aa", "oo")
## [1] "oobaa" "oob" "cba"
# str_replace_all() replaces every match in each element.
str_replace_all(x, "aa", "oo")
## [1] "ooboo" "oob" "cba"
# Confirm that str_replace() gives the same result as sub().
identical(sub("aa", "oo", x), str_replace(x, "aa", "oo"))
## [1] TRUE
x <- "123456789"
# Extract by position. start and end are written with the same length (3):
# stringr 1.5.0 and later follow the tidyverse recycling rules, which only
# recycle length-1 vectors, so a length-2 start combined with a length-3 end
# is rejected there instead of being cycled the way substring() does it.
str_sub(x, start = c(2, 4, 2), end = c(4, 5, 8))
## [1] "234" "45" "2345678"
identical(
  str_sub(x, start = c(2, 4, 2), end = c(4, 5, 8)),
  substring(text = x, first = c(2, 4), last = c(4, 5, 8))
) # same result as substring(), which cycles first to c(2, 4, 2) by itself
## [1] TRUE
# The assignment form is another way to replace text: each start/end range is
# replaced by the matching value, and the single string in x is expanded to
# one result per range.
str_sub(x, start = c(2, 4, 2), end = c(4, 5, 8)) <- c("aa", "bb", "aa")
print(x)
## [1] "1aa56789" "123bb6789" "1aa9"
# Extract by match instead of by position.
x <- c("aabaa", "aab", "cba")
# str_match() returns the first match of each element as a character matrix
# (one row per element, NA when there is no match).
str_match(x, pattern = "aa")
## [,1]
## [1,] "aa"
## [2,] "aa"
## [3,] NA
# str_match_all() returns every match: a list with one matrix per element.
str_match_all(x, pattern = "aa")
## [[1]]
## [,1]
## [1,] "aa"
## [2,] "aa"
##
## [[2]]
## [,1]
## [1,] "aa"
##
## [[3]]
## character(0)
# str_extract() returns the first match of each element as a plain vector.
str_extract(x, pattern = "aa")
## [1] "aa" "aa" NA
# str_extract_all() returns every match: a list with one vector per element.
str_extract_all(x, pattern = "aa")
## [[1]]
## [1] "aa" "aa"
##
## [[2]]
## [1] "aa"
##
## [[3]]
## character(0)
x <- c(" aa", " bb ", "cc")
# str_dup() repeats each string `times` times, so times = 2 yields every
# element twice in a row (surrounding spaces included).
str_dup(x, times = 2)
## [1] " aa aa"   " bb  bb " "cccc"
# str_trim() strips the whitespace from both ends of each element.
str_trim(x, side = "both")
## [1] "aa" "bb" "cc"
# str_pad() pads each element up to a TOTAL width of 10 characters; with
# side = "left" the padding spaces are added in front. It does not add 10
# spaces: the number added is 10 minus the element's current length.
str_pad(x, width = 10, side = "left")
## [1] "        aa" "       bb " "        cc"