string:字符串,亦可为字符串向量;locale:设置语种。
# install.packages('stringr') 安装包
library(stringr) #加载包
# Sample inputs: one sentence, and the same words as a character vector.
dog <- "The quick brown dog"
dog1 <- c("The", "quick", "brown", "dog")

# Help-page examples usually leave argument names out; in this walkthrough the
# arguments after the leading data argument are spelled out for readability.

# Convert an English string to upper case.
str_to_upper(dog)
## [1] "THE QUICK BROWN DOG"
# A character vector is converted element by element.
str_to_upper(dog1)
## [1] "THE" "QUICK" "BROWN" "DOG"
# Convert the string to lower case.
str_to_lower(dog)
## [1] "the quick brown dog"
# Capitalise the first letter of every word.
str_to_title(dog)
## [1] "The Quick Brown Dog"
# `locale` selects language-specific rules.
# English:
str_to_upper("i", locale = "en")
## [1] "I"
# Turkish:
str_to_upper("i", locale = "tr")
## [1] "<U+0130>"
invert_match(loc),loc是以函数str_locate_all()
获取的位置矩阵作为输入。
numbers <- "1 and 2 and 4 and 456"
# Locate every run of digits. There is a single input string, so the first
# (and only) list element holds the start/end matrix.
num_loc <- str_locate_all(numbers, pattern = "[0-9]+")[[1L]]
num_loc
## start end
## [1,] 1 1
## [2,] 7 7
## [3,] 13 13
## [4,] 19 21
# Cut the digit runs back out using the two columns of the matrix.
str_sub(numbers, start = num_loc[, "start"], end = num_loc[, "end"])
## [1] "1" "2" "4" "456"
# invert_match() gives the start/end positions of everything that did NOT match.
text_loc <- invert_match(num_loc)
text_loc
## start end
## [1,] 0 0
## [2,] 2 6
## [3,] 8 12
## [4,] 14 18
## [5,] 22 -1
# Cut out the text between the digit runs.
str_sub(numbers, start = text_loc[, "start"], end = text_loc[, "end"])
## [1] "" " and " " and " " and " ""
3.1 fixed(pattern, ignore_case = FALSE):Compare literal bytes in the string. This is very fast, but not usually what you want for non-ASCII character sets.
3.2 coll(pattern, ignore_case = FALSE, locale = NULL, …):Compare strings respecting standard collation rules.
3.3 regex(pattern, ignore_case = FALSE, multiline = FALSE, comments = FALSE, dotall = FALSE, …):默认使用正则表达式
3.4 boundary(type = c("character", "line_break", "sentence", "word"), skip_word_none = TRUE, …):Match boundaries between things.
# The same pattern text interpreted three ways.
pattern <- "a.b"
strings <- c("abb", "a.b")
# Default: the pattern is a regular expression, so "." matches any character.
str_detect(strings, pattern = pattern)
## [1] TRUE TRUE
# fixed(): the pattern is compared literally.
str_detect(strings, pattern = fixed(pattern))
## [1] FALSE TRUE
# coll(): the pattern is compared using collation rules.
str_detect(strings, pattern = coll(pattern))
## [1] FALSE TRUE
# coll() is useful for locale-aware case-insensitive matching.
# "\u0130" is LATIN CAPITAL LETTER I WITH DOT ABOVE. Writing it as an escape
# keeps the source ASCII-only; the text "<U+0130>" is merely how some consoles
# print the character and, used as a literal, is just eight ASCII characters.
i <- c("I", "\u0130", "i")
i
## [1] "I" "İ" "i"
# Case-insensitive matching without a locale does not pair "i" with the
# dotted capital.
str_detect(string = i, pattern = regex("i", ignore_case = TRUE))
## [1] TRUE FALSE TRUE
str_detect(string = i, pattern = fixed("i", ignore_case = TRUE))
## [1] TRUE FALSE TRUE
str_detect(string = i, pattern = coll("i", ignore_case = TRUE))
## [1] TRUE FALSE TRUE
# Under Turkish rules the upper-case partner of "i" is the dotted capital,
# not "I".
str_detect(string = i, pattern = coll("i", ignore_case = TRUE, locale = "tr"))
## [1] FALSE TRUE TRUE
# Word boundaries
words <- "These are some words."
# Count the words in the sentence.
str_count(words, pattern = boundary("word"))
## [1] 4
# Splitting on a space leaves the punctuation attached to the last word.
str_split(words, pattern = " ")[[1L]]
## [1] "These" "are" "some" "words."
# Splitting on word boundaries returns the last word without the punctuation.
str_split(words, pattern = boundary("word"))[[1L]]
## [1] "These" "are" "some" "words"
# Regular expressions
# Matching is case-sensitive by default, so the capital letters are left out.
str_extract_all("The Cat in the Hat", pattern = "[a-z]+")
## [[1]]
## [1] "he" "at" "in" "the" "at"
# ignore_case = TRUE disregards the difference between upper and lower case.
str_extract_all(
  "The Cat in the Hat",
  pattern = regex("[a-z]+", ignore_case = TRUE)
)
## [[1]]
## [1] "The" "Cat" "in" "the" "Hat"
# By default "^" anchors only at the start of the whole string.
str_extract_all("a\nb\nc", pattern = "^.")
## [[1]]
## [1] "a"
# With multiline = TRUE it also anchors at the start of every line.
str_extract_all("a\nb\nc", pattern = regex("^.", multiline = TRUE))
## [[1]]
## [1] "a" "b" "c"
# By default "." does not match the newline, so "a." finds nothing here.
str_extract_all("a\nb\nc", pattern = "a.")
## [[1]]
## character(0)
# With dotall = TRUE "." matches the newline as well.
str_extract_all("a\nb\nc", pattern = regex("a.", dotall = TRUE))
## [[1]]
## [1] "a\n"
4.1 str_c(…, sep = "", collapse = NULL)
4.2 str_join(…, sep = "", collapse = NULL),同str_c(str_join 是旧名称,已被弃用,建议统一使用 str_c)
sep:设置向量间的连接符;collapse:将向量的所有元素连接成一个字符串时,设置元素间的连接符。
# Join two character vectors element-wise; the length-1 "Letter" is recycled.
str_c("Letter", letters[1:5])
## [1] "Lettera" "Letterb" "Letterc" "Letterd" "Lettere"
# `sep` sets the separator placed between the vectors being joined.
str_c("Letter", letters[1:5], sep = ": ")
## [1] "Letter: a" "Letter: b" "Letter: c" "Letter: d" "Letter: e"
# Joining vectors of unequal length (5, 1 and 2). Older stringr releases
# recycled the length-2 vector with a warning; stringr >= 1.5.0 follows the
# tidyverse recycling rules (only length-1 inputs are recycled) and raises an
# error instead. Repeating the short vector explicitly with rep_len() gives
# the same result on every version.
str_c(letters[1:5], " is for", rep_len(c("...", "***"), 5))
## [1] "a is for..." "b is for***" "c is for..." "d is for***" "e is for..."
# A single vector is returned unchanged.
str_c(letters)
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
# With only one vector, `sep` has no effect.
str_c(letters, sep = "-")
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
# `collapse` joins all elements of the result into one string, using the given
# text between the elements.
str_c(letters, collapse = "")
## [1] "abcdefghijklmnopqrstuvwxyz"
str_c(letters, collapse = ", ")
## [1] "a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z"
# Using `sep` and `collapse` together is equivalent to the two steps below.
str_c("Letter", letters[1:5], sep = ":", collapse = ",")
## [1] "Letter:a,Letter:b,Letter:c,Letter:d,Letter:e"
# Step 1: join element-wise with `sep`.
a <- str_c("Letter", letters[1:5], sep = ":")
a
## [1] "Letter:a" "Letter:b" "Letter:c" "Letter:d" "Letter:e"
# Step 2: collapse the result into a single string.
str_c(a, collapse = ",")
## [1] "Letter:a,Letter:b,Letter:c,Letter:d,Letter:e"
# A missing value in an input makes the corresponding result missing too.
str_c(c("a", NA, "b"), "-d")
## [1] "a-d" NA "b-d"
# str_replace_na() turns the missing value into the string "NA" first.
str_c(str_replace_na(c("a", NA, "b")), "-d")
## [1] "a-d" "NA-d" "b-d"
str_conv(string, encoding)
# Build a one-byte string from the byte value 177 (0xb1); how it reads depends
# on the encoding it is interpreted in.
x <- rawToChar(as.raw(177))
x
## [1] "\xb1"
# Interpreted as ISO-8859-2: Polish 'a with ogonek'.
str_conv(x, encoding = "ISO-8859-2")
## [1] "<U+0105>"
# Interpreted as ISO-8859-1: plus-minus sign.
str_conv(x, encoding = "ISO-8859-1")
## [1] "±"
str_count(string, pattern = "")
fruit <- c("apple", "banana", "pear", "pineappleapple")
# Number of "a" characters in each element of fruit.
str_count(fruit, pattern = "a")
## [1] 1 3 1 2
str_count(fruit, pattern = "p")
## [1] 2 0 1 5
str_count(fruit, pattern = "ap")
## [1] 1 0 0 2
str_count(fruit, pattern = "[a-e]")
## [1] 2 4 2 5
# Patterns are paired with strings element by element: "a" in "apple", "b" in
# "banana", "p" in "pear" and "p" in "pineappleapple".
str_count(fruit, pattern = c("a", "b", "p", "p"))
## [1] 1 1 1 5
# In a regular expression "." stands for any single character, not only the
# "." character itself.
str_count(c("a.", "...", ".a.a"), pattern = ".")
## [1] 2 3 4
# fixed(".") matches only the literal "." character.
str_count(c("a.", "...", ".a.a"), pattern = fixed("."))
## [1] 1 3 2
str_detect(string, pattern),结果返回逻辑向量
fruit <- c("apple1", "banana", "pear", "pinapple")
# Does each element contain "a"?
str_detect(fruit, pattern = "a")
## [1] TRUE TRUE TRUE TRUE
str_detect(fruit, pattern = "pp")
## [1] TRUE FALSE FALSE TRUE
# Does each element start with "a"?
str_detect(fruit, pattern = "^a")
## [1] TRUE FALSE FALSE FALSE
# Does each element end with "a"?
str_detect(fruit, pattern = "a$")
## [1] FALSE TRUE FALSE FALSE
# Does each element contain at least one of the characters in [aeiou]?
str_detect(fruit, pattern = "[aeiou]")
## [1] TRUE TRUE TRUE TRUE
# Matching is vectorised over `string` and `pattern`: inputs are paired element
# by element. Older stringr releases silently cycled a shorter vector; stringr
# >= 1.5.0 follows the tidyverse recycling rules (only length-1 inputs are
# recycled) and raises an error for other length mismatches, so the cycling is
# written out explicitly with rep_len() below.
# Same length: one pattern per string.
str_detect(string = fruit, pattern = c("ap", "ba", "pe", "pin"))
## [1] TRUE TRUE TRUE TRUE
# Fewer patterns than strings: cycle the patterns to the length of `string`;
# the result has one value per string.
str_detect(string = fruit, pattern = rep_len(c("ap", "pin"), length(fruit)))
## [1] TRUE FALSE FALSE TRUE
# More patterns than strings: cycle the strings to the number of patterns, so
# the first string is reused for the fifth pattern "e1"; the result has one
# value per pattern.
str_detect(
  string = rep_len(fruit, 5),
  pattern = c("ap", "ba", "pe", "pin", "e1")
)
## [1] TRUE TRUE TRUE TRUE TRUE
str_dup(string, times)
fruit <- c("apple", "pear", "banana")
# Repeat every element twice and glue the copies together.
str_dup(fruit, times = 2)
## [1] "appleapple" "pearpear" "bananabanana"
# `times` may differ per element.
str_dup(fruit, times = 1:3)
## [1] "apple" "pearpear" "bananabananabanana"
# Zero to five copies of "na" appended to "ba".
str_c("ba", str_dup("na", times = 0:5))
## [1] "ba" "bana" "banana" "bananana"
## [5] "banananana" "bananananana"
str_extract(string, pattern) 提取匹配的第一个字符串
str_extract_all(string, pattern, simplify = FALSE) 提取匹配的所有字符串
shopping_list <- c("apples 4x4", "bag of flour", "bag of sugar", "milk x2")
# str_extract() returns the first match in each element (NA when there is none).
# First digit:
str_extract(shopping_list, pattern = "\\d")
## [1] "4" NA NA "2"
# First run of lowercase letters:
str_extract(shopping_list, pattern = "[a-z]+")
## [1] "apples" "bag" "bag" "milk"
# First run of one to four lowercase letters:
str_extract(shopping_list, pattern = "[a-z]{1,4}")
## [1] "appl" "bag" "bag" "milk"
# The same run, but delimited by word boundaries on both sides:
str_extract(shopping_list, pattern = "\\b[a-z]{1,4}\\b")
## [1] NA "bag" "bag" "milk"
# str_extract_all() returns every match; the result is a list with one
# character vector per input element.
str_extract_all(shopping_list, pattern = "[a-z]+")
## [[1]]
## [1] "apples" "x"
##
## [[2]]
## [1] "bag" "of" "flour"
##
## [[3]]
## [1] "bag" "of" "sugar"
##
## [[4]]
## [1] "milk" "x"
str_extract_all(shopping_list, pattern = "\\b[a-z]+\\b")
## [[1]]
## [1] "apples"
##
## [[2]]
## [1] "bag" "of" "flour"
##
## [[3]]
## [1] "bag" "of" "sugar"
##
## [[4]]
## [1] "milk"
str_extract_all(shopping_list, pattern = "\\d")
## [[1]]
## [1] "4" "4"
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## [1] "2"
# With simplify = TRUE the matches come back as a character matrix instead of
# a list; rows with fewer matches are filled with "".
str_extract_all(shopping_list, pattern = "\\b[a-z]+\\b", simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "apples" "" ""
## [2,] "bag" "of" "flour"
## [3,] "bag" "of" "sugar"
## [4,] "milk" "" ""
str_extract_all(shopping_list, pattern = "\\d", simplify = TRUE)
## [,1] [,2]
## [1,] "4" "4"
## [2,] "" ""
## [3,] "" ""
## [4,] "2" ""
str_length(string)
# Every element of `letters` is one character long.
str_length(letters)
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# The length of a missing value is missing.
str_length(NA)
## [1] NA
# A factor is measured by its label.
str_length(factor("abc"))
## [1] 3
str_length(c("i", "like", "programming", NA))
## [1] 1 4 11 NA
# Two ways of representing a u with an umlaut: a single precomposed character,
# and its NFD form (a "u" followed by a combining mark).
u1 <- "ü"
u2 <- stringi::stri_trans_nfd(u1)
# They are meant to print the same; a console that cannot render the combining
# mark shows it as an escape, as in the output below.
u1
## [1] "ü"
u2
## [1] "u<U+0308>"
# But they have a different length
str_length(u1)
## [1] 1
str_length(u2)
## [1] 2
# Even though they have the same number of characters
str_count(u1)
## [1] 1
str_count(u2)
## [1] 1
str_locate(string, pattern):返回匹配的第一个字符串的位置
str_locate_all(string, pattern):返回匹配的所有位置
fruit <- c("apple", "banana", "pear", "pineapple")
# str_locate() gives the start and end of the first match in each element
# (NA when there is no match).
str_locate(fruit, pattern = "a")
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 5 5
str_locate(fruit, pattern = "ap")
## start end
## [1,] 1 2
## [2,] NA NA
## [3,] NA NA
## [4,] 5 6
# One pattern per element.
str_locate(fruit, pattern = c("a", "b", "p", "p"))
## start end
## [1,] 1 1
## [2,] 1 1
## [3,] 1 1
## [4,] 1 1
# str_locate_all() gives the positions of every match, as a list with one
# start/end matrix per element.
str_locate_all(fruit, pattern = "a")
## [[1]]
## start end
## [1,] 1 1
##
## [[2]]
## start end
## [1,] 2 2
## [2,] 4 4
## [3,] 6 6
##
## [[3]]
## start end
## [1,] 3 3
##
## [[4]]
## start end
## [1,] 5 5
# An element without any match yields a matrix with no rows.
str_locate_all(fruit, pattern = "e")
## [[1]]
## start end
## [1,] 5 5
##
## [[2]]
## start end
##
## [[3]]
## start end
## [1,] 2 2
##
## [[4]]
## start end
## [1,] 4 4
## [2,] 9 9
# One pattern per element.
str_locate_all(fruit, pattern = c("a", "b", "p", "p"))
## [[1]]
## start end
## [1,] 1 1
##
## [[2]]
## start end
## [1,] 1 1
##
## [[3]]
## start end
## [1,] 1 1
##
## [[4]]
## start end
## [1,] 1 1
## [2,] 6 6
## [3,] 7 7
# An empty pattern locates every single character.
str_locate_all(fruit, pattern = "")
## [[1]]
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
## [5,] 5 5
##
## [[2]]
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
## [5,] 5 5
## [6,] 6 6
##
## [[3]]
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
##
## [[4]]
## start end
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
## [4,] 4 4
## [5,] 5 5
## [6,] 6 6
## [7,] 7 7
## [8,] 8 8
## [9,] 9 9
str_match(string, pattern) 提取匹配的第一个字符串
str_match_all(string, pattern) 提取匹配的所有字符串
strings <- c(
  " 219 733 8965", "329-293-8753 ", "banana", "595 794 7569", "387 287 6718",
  "apple", "233.398.9187 ", "482 952 3315", "239 923 8115 and 842 566 4692",
  "Work: 579-499-7527", "$1000", "Home: 543.355.3679"
)
# Three capture groups (3, 3 and 4 digits) separated by "-", " " or ".".
phone <- "([2-9][0-9]{2})[- .]([0-9]{3})[- .]([0-9]{4})"
# str_extract() returns the first complete match in each element; element 9
# holds two numbers but only the first one comes back.
str_extract(strings, pattern = phone)
## [1] "219 733 8965" "329-293-8753" NA "595 794 7569"
## [5] "387 287 6718" NA "233.398.9187" "482 952 3315"
## [9] "239 923 8115" "579-499-7527" NA "543.355.3679"
# str_match() returns the same first match in column 1, followed by one column
# per capture group.
str_match(strings, pattern = phone)
## [,1] [,2] [,3] [,4]
## [1,] "219 733 8965" "219" "733" "8965"
## [2,] "329-293-8753" "329" "293" "8753"
## [3,] NA NA NA NA
## [4,] "595 794 7569" "595" "794" "7569"
## [5,] "387 287 6718" "387" "287" "6718"
## [6,] NA NA NA NA
## [7,] "233.398.9187" "233" "398" "9187"
## [8,] "482 952 3315" "482" "952" "3315"
## [9,] "239 923 8115" "239" "923" "8115"
## [10,] "579-499-7527" "579" "499" "7527"
## [11,] NA NA NA NA
## [12,] "543.355.3679" "543" "355" "3679"
# Extract/match all: every complete match per element, returned as a list.
str_extract_all(strings, pattern = phone)
## [[1]]
## [1] "219 733 8965"
##
## [[2]]
## [1] "329-293-8753"
##
## [[3]]
## character(0)
##
## [[4]]
## [1] "595 794 7569"
##
## [[5]]
## [1] "387 287 6718"
##
## [[6]]
## character(0)
##
## [[7]]
## [1] "233.398.9187"
##
## [[8]]
## [1] "482 952 3315"
##
## [[9]]
## [1] "239 923 8115" "842 566 4692"
##
## [[10]]
## [1] "579-499-7527"
##
## [[11]]
## character(0)
##
## [[12]]
## [1] "543.355.3679"
# Every match per element as a matrix: the full match plus its capture groups.
str_match_all(strings, pattern = phone)
## [[1]]
## [,1] [,2] [,3] [,4]
## [1,] "219 733 8965" "219" "733" "8965"
##
## [[2]]
## [,1] [,2] [,3] [,4]
## [1,] "329-293-8753" "329" "293" "8753"
##
## [[3]]
## [,1] [,2] [,3] [,4]
##
## [[4]]
## [,1] [,2] [,3] [,4]
## [1,] "595 794 7569" "595" "794" "7569"
##
## [[5]]
## [,1] [,2] [,3] [,4]
## [1,] "387 287 6718" "387" "287" "6718"
##
## [[6]]
## [,1] [,2] [,3] [,4]
##
## [[7]]
## [,1] [,2] [,3] [,4]
## [1,] "233.398.9187" "233" "398" "9187"
##
## [[8]]
## [,1] [,2] [,3] [,4]
## [1,] "482 952 3315" "482" "952" "3315"
##
## [[9]]
## [,1] [,2] [,3] [,4]
## [1,] "239 923 8115" "239" "923" "8115"
## [2,] "842 566 4692" "842" "566" "4692"
##
## [[10]]
## [,1] [,2] [,3] [,4]
## [1,] "579-499-7527" "579" "499" "7527"
##
## [[11]]
## [,1] [,2] [,3] [,4]
##
## [[12]]
## [,1] [,2] [,3] [,4]
## [1,] "543.355.3679" "543" "355" "3679"
str_order(x, decreasing = FALSE, na_last = TRUE, locale = "", …)
str_sort(x, decreasing = FALSE, na_last = TRUE, locale = "", …)
# str_order() returns the ordering permutation, str_sort() the sorted values.
# English ordering:
str_order(letters, locale = "en")
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26
str_sort(letters, locale = "en")
## [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "o" "p" "q"
## [18] "r" "s" "t" "u" "v" "w" "x" "y" "z"
# Hawaiian ordering: the vowels come first.
str_order(letters, locale = "haw")
## [1] 1 5 9 15 21 2 3 4 6 7 8 10 11 12 13 14 16 17 18 19 20 22 23
## [24] 24 25 26
str_sort(letters, locale = "haw")
## [1] "a" "e" "i" "o" "u" "b" "c" "d" "f" "g" "h" "j" "k" "l" "m" "n" "p"
## [18] "q" "r" "s" "t" "v" "w" "x" "y" "z"
str_pad(string, width, side = c("left", "right", "both"), pad = " ")
# Pad "hadley" to 30 characters on the left, on the right, and on both sides.
rbind(
  str_pad("hadley", width = 30, side = "left"),
  str_pad("hadley", width = 30, side = "right"),
  str_pad("hadley", width = 30, side = "both")
)
##      [,1]
## [1,] "                        hadley"
## [2,] "hadley                        "
## [3,] "            hadley            "
# Several strings, one width (padding goes on the left by default).
str_pad(c("a", "abc", "abcdef"), width = 10)
## [1] "         a" "       abc" "    abcdef"
# One string, several widths.
str_pad("a", width = c(5, 10, 20))
## [1] "    a" "         a" "                   a"
# One string, several padding characters.
str_pad("a", width = 10, pad = c("-", "_", "+"))
## [1] "---------a" "_________a" "+++++++++a"
# When `width` is smaller than the string's length the string is returned as is.
str_pad("hadley", width = 3, pad = "-")
## [1] "hadley"
str_pad("hadley", width = 8, side = "left", pad = "-")
## [1] "--hadley"
str_pad("hadley", width = 8, side = "right", pad = "-")
## [1] "hadley--"
str_pad("hadley", width = 8, side = "both", pad = "-")
## [1] "-hadley-"
str_replace(string, pattern, replacement)
str_replace_all(string, pattern, replacement)
fruits <- c("one apple", "two pears", "three bananas")
# str_replace() changes only the first match in each element.
str_replace(fruits, pattern = "[aeiou]", replacement = "-")
## [1] "-ne apple" "tw- pears" "thr-e bananas"
# str_replace_all() changes every match.
str_replace_all(fruits, pattern = "[aeiou]", replacement = "-")
## [1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
# An empty replacement deletes the match.
str_replace(fruits, pattern = "([aeiou])", replacement = "")
## [1] "ne apple" "tw pears" "thre bananas"
# "\\1" refers to the first capture group, so the matched vowel is doubled.
str_replace(fruits, pattern = "([aeiou])", replacement = "\\1\\1")
## [1] "oone apple" "twoo pears" "threee bananas"
# One replacement per element.
str_replace(fruits, pattern = "[aeiou]", replacement = c("1", "2", "3"))
## [1] "1ne apple" "tw2 pears" "thr3e bananas"
# One pattern per element.
str_replace(fruits, pattern = c("a", "e", "i"), replacement = "-")
## [1] "one -pple" "two p-ars" "three bananas"
# The same walkthrough again with str_replace_all() for the last four calls.
fruits <- c("one apple", "two pears", "three bananas")
str_replace(fruits, pattern = "[aeiou]", replacement = "-")
## [1] "-ne apple" "tw- pears" "thr-e bananas"
str_replace_all(fruits, pattern = "[aeiou]", replacement = "-")
## [1] "-n- -ppl-" "tw- p--rs" "thr-- b-n-n-s"
str_replace_all(fruits, pattern = "([aeiou])", replacement = "")
## [1] "n ppl" "tw prs" "thr bnns"
str_replace_all(fruits, pattern = "([aeiou])", replacement = "\\1\\1")
## [1] "oonee aapplee" "twoo peeaars" "threeee baanaanaas"
str_replace_all(fruits, pattern = "[aeiou]", replacement = c("1", "2", "3"))
## [1] "1n1 1ppl1" "tw2 p22rs" "thr33 b3n3n3s"
str_replace_all(fruits, pattern = c("a", "e", "i"), replacement = "-")
## [1] "one -pple" "two p-ars" "three bananas"
# Apply several replacement rules to one string in a single call: pass a named
# vector whose names are the patterns and whose values are the replacements.
# The replacements are written as strings, as in the stringr documentation;
# relying on numeric values being coerced is not portable across stringr
# versions.
strings1 <- str_c(fruits, collapse = "---")
strings1
## [1] "one apple---two pears---three bananas"
str_replace_all(
  string = strings1,
  pattern = c(one = "1", two = "2", three = "3")
)
## [1] "1 apple---2 pears---3 bananas"
str_replace_na(string, replacement = "NA")
# The missing value becomes the string "NA"; the other elements are unchanged.
str_replace_na(c(NA, "abc", "def"))
## [1] "NA" "abc" "def"
str_split(string, pattern, n = Inf)#结果返回列表
str_split_fixed(string, pattern, n)#必须设置n,结果返回矩阵
fruits <- c(
  "apples and oranges and pears and bananas",
  "pineapples and mangos and guavas"
)
# str_split() returns a list with one vector of pieces per input string.
str_split(fruits, pattern = " and ")
## [[1]]
## [1] "apples" "oranges" "pears" "bananas"
##
## [[2]]
## [1] "pineapples" "mangos" "guavas"
# str_split_fixed() returns a matrix with `n` columns, filled with "" where a
# string has fewer pieces.
str_split_fixed(fruits, pattern = " and ", n = 4)
## [,1] [,2] [,3] [,4]
## [1,] "apples" "oranges" "pears" "bananas"
## [2,] "pineapples" "mangos" "guavas" ""
# `n` caps the number of pieces; the last piece keeps the unsplit remainder.
# At most 3 pieces:
str_split(fruits, pattern = " and ", n = 3)
## [[1]]
## [1] "apples" "oranges" "pears and bananas"
##
## [[2]]
## [1] "pineapples" "mangos" "guavas"
# At most 2 pieces:
str_split(fruits, pattern = " and ", n = 2)
## [[1]]
## [1] "apples" "oranges and pears and bananas"
##
## [[2]]
## [1] "pineapples" "mangos and guavas"
# At most 5 pieces: more than either string has, so nothing is left unsplit.
str_split(fruits, pattern = " and ", n = 5)
## [[1]]
## [1] "apples" "oranges" "pears" "bananas"
##
## [[2]]
## [1] "pineapples" "mangos" "guavas"
str_split_fixed(fruits, pattern = " and ", n = 3)
## [,1] [,2] [,3]
## [1,] "apples" "oranges" "pears and bananas"
## [2,] "pineapples" "mangos" "guavas"
str_split_fixed(fruits, pattern = " and ", n = 4)
## [,1] [,2] [,3] [,4]
## [1,] "apples" "oranges" "pears" "bananas"
## [2,] "pineapples" "mangos" "guavas" ""
str_split_fixed(fruits, pattern = " and ", n = 6)
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] "apples" "oranges" "pears" "bananas" "" ""
## [2,] "pineapples" "mangos" "guavas" "" "" ""
str_sub(string, start = 1L, end = -1L) 提取子字符串
str_sub(string, start = 1L, end = -1L) <- value 替换子字符串
hw <- "Hadley Wickham"
# Characters 1 to 6.
str_sub(hw, start = 1, end = 6)
## [1] "Hadley"
# `start` left at its default (the first character).
str_sub(hw, end = 6)
## [1] "Hadley"
str_sub(hw, start = 8, end = 14)
## [1] "Wickham"
# `end` left at its default (the last character).
str_sub(hw, start = 8)
## [1] "Wickham"
# Several start/end pairs at once.
str_sub(hw, start = c(1, 8), end = c(6, 14))
## [1] "Hadley" "Wickham"
# Negative positions count from the end of the string.
str_sub(hw, start = -1)
## [1] "m"
str_sub(hw, start = -7)
## [1] "Wickham"
str_sub(hw, end = -7)
## [1] "Hadley W"
# Positions can come straight from str_locate_all().
pos <- str_locate_all(hw, pattern = "[aeio]")[[1L]]
pos
## start end
## [1,] 2 2
## [2,] 5 5
## [3,] 9 9
## [4,] 13 13
# Either pass the two-column matrix as a whole ...
str_sub(hw, pos)
## [1] "a" "e" "i" "a"
# ... or pass its columns separately.
str_sub(hw, start = pos[, 1], end = pos[, 2])
## [1] "a" "e" "i" "a"
# Vectorised over positions: every suffix, then every prefix.
str_sub(hw, start = seq_len(str_length(hw)))
## [1] "Hadley Wickham" "adley Wickham" "dley Wickham" "ley Wickham"
## [5] "ey Wickham" "y Wickham" " Wickham" "Wickham"
## [9] "ickham" "ckham" "kham" "ham"
## [13] "am" "m"
str_sub(hw, end = seq_len(str_length(hw)))
## [1] "H" "Ha" "Had" "Hadl"
## [5] "Hadle" "Hadley" "Hadley " "Hadley W"
## [9] "Hadley Wi" "Hadley Wic" "Hadley Wick" "Hadley Wickh"
## [13] "Hadley Wickha" "Hadley Wickham"
# Replacement: assigning to str_sub() overwrites the selected range in place.
x <- "BBCDEF"
# Replace the first character.
str_sub(x, 1, 1) <- "A"
x
## [1] "ABCDEF"
# Replace the last character.
str_sub(x, -1, -1) <- "K"
x
## [1] "ABCDEK"
# The replacement may be longer than the range it replaces.
str_sub(x, -2, -2) <- "GHIJ"
x
## [1] "ABCDGHIJK"
# An empty replacement deletes the range.
str_sub(x, 2, -2) <- ""
x
## [1] "AK"
str_subset(string, pattern)
fruit <- c("apple", "banana", "pear", "pinapple")
# str_subset() keeps the elements that match the pattern.
str_subset(fruit, pattern = "a")
## [1] "apple" "banana" "pear" "pinapple"
str_subset(fruit, pattern = "ap")
## [1] "apple" "pinapple"
# Elements starting with "a".
str_subset(fruit, pattern = "^a")
## [1] "apple"
# Elements ending with "a".
str_subset(fruit, pattern = "a$")
## [1] "banana"
str_subset(fruit, pattern = "[aeiou]")
## [1] "apple" "banana" "pear" "pinapple"
# Missing values are dropped from the result.
str_subset(c("a", NA, "b"), pattern = ".")
## [1] "a" "b"
str_trim(string, side = c("both", "left", "right"))
# Leading spaces and a trailing tab are removed.
str_trim(" String with trailing and leading white space\t")
## [1] "String with trailing and leading white space"
# Leading and trailing newlines are removed as well.
str_trim("\n\nString with trailing and leading white space\n\n")
## [1] "String with trailing and leading white space"
str_wrap(string, width = 80, indent = 0, exdent = 0)
# Read the THANKS file from R's documentation directory into one string, with
# the lines joined by newlines.
thanks_path <- file.path(R.home("doc"), "THANKS")
thanks <- str_c(readLines(thanks_path), collapse = "\n")
# Keep pieces 1 to 3 of the text, where pieces are separated by a blank line
# ("\n\n").
thanks <- word(thanks, 1, 3, fixed("\n\n"))
# Wrap with the default settings and print the result.
cat(str_wrap(thanks), "\n")
## R would not be what it is today without the invaluable help of these people,
## who contributed by donating code, bug fixes and documentation: Valerio Aimale,
## Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben Bolker, David Brahm, Goran
## Brostrom, Patrick Burns, Vince Carey, Saikat DebRoy, Brian D'Urso, Lyndon
## Drake, Dirk Eddelbuettel, Claus Ekstrom, Sebastian Fischmeister, John Fox,
## Paul Gilbert, Yu Gong, Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,
## Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan de Leeuw,
## Jim Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John Maindonald,
## David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley, Richard O'Keefe,
## Hubert Palme, Roger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony Rossini,
## Jonathan Rougier, Petr Savicky, Guenther Sawitzki, Marc Schwartz, Detlef Steuer,
## Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner,
## Bill Venables, Gregory R. Warnes, Andreas Weingessel, Morten Welinder, James
## Wettenhall, Simon Wood and Achim Zeileis. Others have written code that has been
## adopted by R and is acknowledged in the code files, including
# Wrap at a width of 70.
cat(str_wrap(thanks, width = 70), "\n")
## R would not be what it is today without the invaluable help of these
## people, who contributed by donating code, bug fixes and documentation:
## Valerio Aimale, Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben
## Bolker, David Brahm, Goran Brostrom, Patrick Burns, Vince Carey,
## Saikat DebRoy, Brian D'Urso, Lyndon Drake, Dirk Eddelbuettel, Claus
## Ekstrom, Sebastian Fischmeister, John Fox, Paul Gilbert, Yu Gong,
## Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn, Robert King,
## Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan de Leeuw, Jim
## Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John
## Maindonald, David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve
## Oncley, Richard O'Keefe, Hubert Palme, Roger D. Peng, Jose' C.
## Pinheiro, Tony Plate, Anthony Rossini, Jonathan Rougier, Petr Savicky,
## Guenther Sawitzki, Marc Schwartz, Detlef Steuer, Bill Simpson,
## Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner, Bill
## Venables, Gregory R. Warnes, Andreas Weingessel, Morten Welinder,
## James Wettenhall, Simon Wood and Achim Zeileis. Others have written
## code that has been adopted by R and is acknowledged in the code files,
## including
# Wrap at a width of 60 with indent = 6.
cat(str_wrap(thanks, width = 60, indent = 6), "\n")
## R would not be what it is today without the invaluable help
## of these people, who contributed by donating code, bug fixes
## and documentation: Valerio Aimale, Thomas Baier, Henrik
## Bengtsson, Roger Bivand, Ben Bolker, David Brahm, Goran
## Brostrom, Patrick Burns, Vince Carey, Saikat DebRoy, Brian
## D'Urso, Lyndon Drake, Dirk Eddelbuettel, Claus Ekstrom,
## Sebastian Fischmeister, John Fox, Paul Gilbert, Yu Gong,
## Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,
## Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe
## Lambert, Jan de Leeuw, Jim Lindsey, Patrick Lindsey,
## Catherine Loader, Gordon Maclean, John Maindonald, David
## Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley,
## Richard O'Keefe, Hubert Palme, Roger D. Peng, Jose' C.
## Pinheiro, Tony Plate, Anthony Rossini, Jonathan Rougier,
## Petr Savicky, Guenther Sawitzki, Marc Schwartz, Detlef
## Steuer, Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry
## Therneau, Rolf Turner, Bill Venables, Gregory R. Warnes,
## Andreas Weingessel, Morten Welinder, James Wettenhall, Simon
## Wood and Achim Zeileis. Others have written code that has
## been adopted by R and is acknowledged in the code files,
## including
# Wrap at a width of 80 with indent = 6 and exdent = 2.
cat(str_wrap(thanks, width = 80, indent = 6, exdent = 2), "\n")
## R would not be what it is today without the invaluable help of these people,
## who contributed by donating code, bug fixes and documentation: Valerio Aimale,
## Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben Bolker, David Brahm, Goran
## Brostrom, Patrick Burns, Vince Carey, Saikat DebRoy, Brian D'Urso, Lyndon
## Drake, Dirk Eddelbuettel, Claus Ekstrom, Sebastian Fischmeister, John Fox,
## Paul Gilbert, Yu Gong, Gabor Grothendieck, Frank E Harrell Jr, Torsten Hothorn,
## Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan de Leeuw,
## Jim Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John Maindonald,
## David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley, Richard O'Keefe,
## Hubert Palme, Roger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony Rossini,
## Jonathan Rougier, Petr Savicky, Guenther Sawitzki, Marc Schwartz, Detlef Steuer,
## Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry Therneau, Rolf Turner,
## Bill Venables, Gregory R. Warnes, Andreas Weingessel, Morten Welinder, James
## Wettenhall, Simon Wood and Achim Zeileis. Others have written code that has been
## adopted by R and is acknowledged in the code files, including
word(string, start = 1L, end = start, sep = fixed(" "))
sentences <- c("Jane saw a cat", "Jane sat down")
# First word of each sentence.
word(sentences, start = 1)
## [1] "Jane" "Jane"
# Second word.
word(sentences, start = 2)
## [1] "saw" "sat"
# Last word: negative positions count from the end.
word(sentences, start = -1)
## [1] "cat" "down"
# From the second word through the last one.
word(sentences, start = 2, end = -1)
## [1] "saw a cat" "sat down"
# Vectorised over `start` and `end`:
word(sentences[1], start = 1:3, end = -1)
## [1] "Jane saw a cat" "saw a cat" "a cat"
word(sentences[1], start = 1, end = 1:4)
## [1] "Jane" "Jane saw" "Jane saw a" "Jane saw a cat"
# `sep` sets the separator between "words"; here it is the literal "..".
str <- "abc.def..123.4568.999"
word(str, start = 1, sep = fixed(".."))
## [1] "abc.def"
word(str, start = 2, sep = fixed(".."))
## [1] "123.4568.999"
paste(…, sep = " ", collapse = NULL)
# Join "A" with each number, with nothing in between.
paste("A", 1:6, sep = "")
## [1] "A1" "A2" "A3" "A4" "A5" "A6"
# With `collapse` set, the joined elements are merged into a single string.
paste("A", 1:6, sep = "", collapse = "-")
## [1] "A1-A2-A3-A4-A5-A6"
# Collapsing a single vector.
paste(1:6, collapse = "")
## [1] "123456"
paste(1:6, collapse = "-")
## [1] "1-2-3-4-5-6"
# The default separator is a single space; the output depends on the current
# date and time.
paste("Today is", date())
## [1] "Today is Sat Jun 18 16:29:34 2016"
strsplit(x, split, fixed = FALSE, perl = FALSE, useBytes = FALSE)
# Two of the elements are named; the result carries those names.
x <- c(as = "asfef", qu = "qwerty", "yuiop[", "b", "stuff.blah.yech")
# Split every element at each "e".
strsplit(x, split = "e")
## $as
## [1] "asf" "f"
##
## $qu
## [1] "qw" "rty"
##
## [[3]]
## [1] "yuiop["
##
## [[4]]
## [1] "b"
##
## [[5]]
## [1] "stuff.blah.y" "ch"
# As a regular expression "." matches every character, so only empty strings
# are left over.
unlist(strsplit("a.b.c", split = "."))
## [1] "" "" "" "" ""
# Put the dot in a character class to split on the literal "." character.
unlist(strsplit("a.b.c", split = "[.]"))
## [1] "a" "b" "c"
# Or switch regular expressions off with fixed = TRUE.
unlist(strsplit("a.b.c", split = ".", fixed = TRUE))
## [1] "a" "b" "c"
x <- "ascd123afrwf34535ddggh454fgf5e4"
# Use runs of digits as the separator.
strsplit(x, split = "[0-9]+", perl = TRUE)
## [[1]]
## [1] "ascd" "afrwf" "ddggh" "fgf" "e"
# Use runs of lowercase letters as the separator.
unlist(strsplit(x, split = "[a-z]+"))
## [1] "" "123" "34535" "454" "5" "4"
nchar(x, type = "chars", allowNA = FALSE)
x <- c("asfef", "qwerty", "yuiop[", "b", "stuff.blah.yech")
# Number of characters in each element of x.
nchar(x)
## [1] 5 6 6 1 15
(1)substr(x, start, stop)
(2)substring(text, first, last = 1000000L)
(3)substr(x, start, stop) <- value
(4)substring(text, first, last = 1000000L) <- value
# A single string:
substr("abcdef", start = 2, stop = 4)
## [1] "bcd"
substring("abcdef", first = 2, last = 4)
## [1] "bcd"
# One character per position.
substring("abcdef", first = 1:6, last = 1:6)
## [1] "a" "b" "c" "d" "e" "f"
# Four copies of the string, each with its own start; the two `stop` values
# are cycled.
substr(rep("abcdef", 4), start = 1:4, stop = 4:5)
## [1] "abcd" "bcde" "cd" "de"
# A character vector:
x <- c("asfef", "qwerty", "yuiop[", "b", "stuff.blah.yech")
# Take characters 2 to 5 of every element.
substr(x, start = 2, stop = 5)
## [1] "sfef" "wert" "uiop" "" "tuff"
substring(x, first = 2, last = 4:6)
## [1] "sfe" "wert" "uiop[" "" "tuff"
# Assignment form: overwrite from position 2 onwards; the two replacement
# values are cycled over the elements.
substring(x, first = 2) <- c("..", "+++")
x
## [1] "a..ef" "q+++ty" "y..op[" "b"
## [5] "s..ff.blah.yech"
chartr(old, new, x) tolower(x) toupper(x) casefold(x, upper = FALSE)
x <- "MiXeD cAsE 123"
# Character-by-character translation: i -> w, X -> h, s -> y.
chartr("iXs", "why", x)
## [1] "MwheD cAyE 123"
# Ranges are allowed: a-c map to D-F, and X maps to w.
chartr("a-cX", "D-Fw", x)
## [1] "MiweD FAsE 123"
# Convert to lower case.
tolower(x)
## [1] "mixed case 123"
# Convert to upper case.
toupper(x)
## [1] "MIXED CASE 123"
# casefold() does either, chosen by `upper`.
casefold(x, upper = FALSE)
## [1] "mixed case 123"
casefold(x, upper = TRUE)
## [1] "MIXED CASE 123"
txt <- c("10arm03", "Foot 12", " 678-lefroo.345", "__.bafoobar90..")
# Case-sensitive; returns the indices of the matching elements.
grep("foo", txt, value = FALSE)
## [1] 4
# Case-sensitive; returns the matching elements themselves.
grep("foo", txt, value = TRUE)
## [1] "__.bafoobar90.."
# Case-insensitive; indices.
grep("foo", txt, ignore.case = TRUE)
## [1] 2 4
# Case-insensitive; values.
grep("foo", txt, ignore.case = TRUE, value = TRUE)
## [1] "Foot 12" "__.bafoobar90.."
# Case-insensitive; values of the elements that do NOT match.
grep("foo", txt, ignore.case = TRUE, value = TRUE, invert = TRUE)
## [1] "10arm03" " 678-lefroo.345"
# Indices of the elements that start with digits.
grep("^[0-9]+", txt, perl = TRUE)
## [1] 1
# Elements that end with digits.
grep("[0-9]+$", txt, perl = TRUE, value = TRUE)
## [1] "10arm03" "Foot 12" " 678-lefroo.345"
# The same test written with \\d.
grep("\\d$", txt, perl = TRUE, value = TRUE)
## [1] "10arm03" "Foot 12" " 678-lefroo.345"
# grepl(): one TRUE/FALSE per element.
txt <- c("10arm03", "Foot 12", " 678-lefroo.345", "__.bafoobar90..")
grepl("foo", txt)
## [1] FALSE FALSE FALSE TRUE
grepl("\\d$", txt, perl = TRUE)
## [1] TRUE TRUE TRUE FALSE
# sub(): replace only the first match in each element.
txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
# First "foo" becomes "99".
sub("foo", replacement = "99", x = txt)
## [1] "10arm03" "Foot 12 99t" " 678-lefroo.345"
## [4] "__.ba99bar90foobar.."
# Trailing digits become "+++".
sub("\\d+$", replacement = "+++", x = txt, perl = TRUE)
## [1] "10arm+++" "Foot 12 foot" " 678-lefroo.+++"
## [4] "__.bafoobar90foobar.."
# gsub(): replace every match.
txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
# Every "foo" becomes "99".
gsub("foo", replacement = "99", x = txt)
## [1] "10arm03" "Foot 12 99t" " 678-lefroo.345"
## [4] "__.ba99bar9099bar.."
# Every run of digits becomes "+++".
gsub("\\d+", replacement = "+++", x = txt, perl = TRUE)
## [1] "+++arm+++" "Foot +++ foot"
## [3] " +++-lefroo.+++" "__.bafoobar+++foobar.."
txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
# regexpr(): start of the first match in each element (-1 when there is none),
# with the match lengths attached as an attribute.
regexpr("foo", text = txt)
## [1] -1 9 -1 6
## attr(,"match.length")
## [1] -1 3 -1 3
## attr(,"useBytes")
## [1] TRUE
regexpr("\\d+", text = txt)
## [1] 1 6 2 12
## attr(,"match.length")
## [1] 2 2 3 2
## attr(,"useBytes")
## [1] TRUE
txt <- c("10arm03", "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
# gregexpr(): starts of every match, as a list with one entry per element.
gregexpr("foo", text = txt)
## [[1]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] 9
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE
##
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
## attr(,"useBytes")
## [1] TRUE
##
## [[4]]
## [1] 6 14
## attr(,"match.length")
## [1] 3 3
## attr(,"useBytes")
## [1] TRUE
# Starts of every run of digits in each element.
gregexpr("\\d+", text = txt)
## [[1]]
## [1] 1 6
## attr(,"match.length")
## [1] 2 2
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] 6
## attr(,"match.length")
## [1] 2
## attr(,"useBytes")
## [1] TRUE
##
## [[3]]
## [1] 2 13
## attr(,"match.length")
## [1] 3 3
## attr(,"useBytes")
## [1] TRUE
##
## [[4]]
## [1] 12
## attr(,"match.length")
## [1] 2
## attr(,"useBytes")
## [1] TRUE
# The first element is missing this time.
txt <- c(NA, "Foot 12 foot", " 678-lefroo.345", "__.bafoobar90foobar..")
# regexec(): first match per element, returned as a list; a missing input
# gives NA and an element without a match gives -1.
regexec("foo", text = txt)
## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
##
## [[2]]
## [1] 9
## attr(,"match.length")
## [1] 3
##
## [[3]]
## [1] -1
## attr(,"match.length")
## [1] -1
##
## [[4]]
## [1] 6
## attr(,"match.length")
## [1] 3
# First run of digits in each element.
regexec("\\d+", text = txt)
## [[1]]
## [1] NA
## attr(,"match.length")
## [1] NA
##
## [[2]]
## [1] 6
## attr(,"match.length")
## [1] 2
##
## [[3]]
## [1] 2
## attr(,"match.length")
## [1] 3
##
## [[4]]
## [1] 12
## attr(,"match.length")
## [1] 2