本章では文字列処理の基本と正規表現を学びます。
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.4.3
## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 2.2.1 √ purrr 0.2.4
## √ tibble 1.4.2 √ dplyr 0.7.4
## √ tidyr 0.7.2 √ stringr 1.2.0
## √ readr 1.1.1 √ forcats 0.2.0
## Warning: package 'tibble' was built under R version 3.4.3
## Warning: package 'stringr' was built under R version 3.4.3
## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(stringr)
# --- String basics: creating strings and escaping special characters ---
# A string can be delimited by double quotes ...
string1 <- "This is a string"
string1
## [1] "This is a string"
# ... or by single quotes, which lets it contain double quotes unescaped.
string2 <- 'To put a "quote" inside a string, use single quotes'
string2
## [1] "To put a \"quote\" inside a string, use single quotes"
# A quote character can be written either escaped with a backslash or inside
# the other kind of delimiter; both spellings print identically.
double_quote1 <- "\""
double_quote1
## [1] "\""
double_quote2 <- '"'
double_quote2
## [1] "\""
single_quote1 <- '\''
single_quote1
## [1] "'"
single_quote2 <- "'"
single_quote2
## [1] "'"
# Printing a string shows its escaped representation.
x <- c("\"", "\\")
x
## [1] "\"" "\\"
# writeLines() shows the raw contents of the strings instead.
writeLines(x)
## "
## \
# Characters can also be written as \u escapes.
x <- "\u00b5"
x
## [1] "μ"
# Several strings are stored together in a character vector.
c("one", "two", "three")
## [1] "one" "two" "three"
# --- String length and concatenation ---
# str_length() returns the number of characters; NA stays NA.
str_length(c("a", "R for data science", NA))
## [1] 1 18 NA
# str_c() concatenates its arguments; `sep` is placed between them.
str_c("x", "y")
## [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
str_c("x", "y", sep = ", ")
## [1] "x, y"
x <- c("abc", "def")
str_c("|-", x, "-|")
## [1] "|-abc-|" "|-def-|"
# A missing value makes the corresponding result NA ...
x <- c("abc", NA)
str_c("|-", x, "-|")
## [1] "|-abc-|" NA
# ... unless it is first turned into the string "NA" with str_replace_na().
x <- c("abc", NA)
str_c("|-", str_replace_na(x), "-|")
## [1] "|-abc-|" "|-NA-|"
# str_c() is vectorised: length-one arguments are repeated to match the
# length of the longest argument.
str_c("prefix-", c("a", "b", "c"), "-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
# Zero-length arguments are dropped by str_c(), which pairs well with `if`:
# when `birthday` is FALSE the `if` yields NULL and contributes nothing.
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
# str_c() joins with no separator by default, so the literal pieces must carry
# their own spaces ("Good " and " "); without them the words run together.
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
## [1] "Good morning Hadley."
birthday <- TRUE
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
## [1] "Good morning Hadley and HAPPY BIRTHDAY."
# --- Collapsing, substrings, case conversion and sorting ---
# Use `collapse` to combine a vector of strings into a single string.
str_c(c("x", "y", "z"))
## [1] "x" "y" "z"
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"
str_c(c("x", "y", "z"), collapse = "")
## [1] "xyz"
# str_sub() extracts the characters between a start and an end position.
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"
# Negative positions count from the end of the string.
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"
# A string shorter than the requested range is not an error.
str_sub("a", 1, 5)
## [1] "a"
# The assignment form of str_sub() modifies strings in place.
# Here: lower-case the first character of every element.
x
## [1] "Apple" "Banana" "Pear"
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
## [1] "apple" "banana" "pear"
# Case conversion.
y <- "Data science is a very interesting field."
str_to_upper(y)
## [1] "DATA SCIENCE IS A VERY INTERESTING FIELD."
str_to_title(y)
## [1] "Data Science Is A Very Interesting Field."
# Case conversion with and without an explicit locale ("tr").
# NOTE(review): the recorded output shows <U+....> escapes, presumably because
# the console could not render these characters — re-run to confirm the result.
str_to_upper(c("i", "ı"))
## [1] "I" "<U+0131>"
str_to_upper(c("i", "ı"), locale = "tr")
## [1] "<U+0130>" "<U+0131>"
# Sort order depends on the locale, as the two outputs below show.
x <- c("apple", "eggplant", "banana")
str_sort(x, locale = "en") # English
## [1] "apple" "banana" "eggplant"
str_sort(x, locale = "haw") # Hawaiian
## [1] "apple" "eggplant" "banana"
# --- Exercise: paste() / paste0() compared with str_c() ---
# paste() separates its arguments with a space by default; paste0() uses no
# separator.
paste("aaa", "bbb")
## [1] "aaa bbb"
paste("aaa", "bbb", sep = "")
## [1] "aaabbb"
paste0("aaa", "bbb")
## [1] "aaabbb"
# A single vector argument is returned element by element unless `collapse`
# is given.
paste(c("aaa", "bbb"))
## [1] "aaa" "bbb"
paste0(c("aaa", "bbb"))
## [1] "aaa" "bbb"
paste(c("aaa", "bbb"), collapse = ",")
## [1] "aaa,bbb"
paste0(c("aaa", "bbb"), collapse = ",")
## [1] "aaa,bbb"
# `sep` has nothing to separate when only one vector is passed to paste().
paste(c("aaa", "bbb"), sep = ",")
## [1] "aaa" "bbb"
# As the output shows, paste0() does not treat `sep` as a separator: the ","
# is appended to every element like any other argument.
paste0(c("aaa", "bbb"), sep = ",")
## [1] "aaa," "bbb,"
# str_c() behaves like paste0() by default and also accepts sep / collapse.
str_c("aaa", "bbb")
## [1] "aaabbb"
str_c("aaa", "bbb", sep = ", ")
## [1] "aaa, bbb"
str_c(c("aaa", "bbb"), sep = ", ")
## [1] "aaa" "bbb"
str_c(c("aaa", "bbb"), collapse = ", ")
## [1] "aaa, bbb"
# Missing values: paste()/paste0() turn NA into the text "NA", while str_c()
# returns NA unless str_replace_na() is applied first.
paste("aaa", NA)
## [1] "aaa NA"
paste0("aaa", NA)
## [1] "aaaNA"
str_c("aaa", NA)
## [1] NA
str_c("aaa", str_replace_na(NA))
## [1] "aaaNA"
# `sep` is the separator used when joining several arguments.
str_c("aaa", "bbb", sep = ", ")
## [1] "aaa, bbb"
str_c("aaa", "bbb", collapse = ", ")
## [1] "aaabbb"
str_c("aaa", "bbb", sep = ", ", collapse = ", ")
## [1] "aaa, bbb"
# `collapse` is the separator used when joining the elements of a vector.
str_c(c("aaa", "bbb"), sep = ", ")
## [1] "aaa" "bbb"
str_c(c("aaa", "bbb"), collapse = ", ")
## [1] "aaa, bbb"
str_c(c("aaa", "bbb"), sep = ", ", collapse = ", ")
## [1] "aaa, bbb"
# Exercise: extract the middle character of a string.
# Odd length: halve the length with integer division and step one to the right.
x <- "abcde"
i <- 1 + str_length(x) %/% 2
str_sub(x, start = i, end = i)
## [1] "c"
# Even length: there are two middle characters, at n/2 and n/2 + 1.
x <- "abcdef"
i <- str_length(x) %/% 2
str_sub(x, start = i, end = i)
## [1] "c"
str_sub(x, start = i + 1, end = i + 1)
## [1] "d"
# --- str_wrap(), str_trim() and str_pad() ---
# Usage: str_wrap(string, width = 80, indent = 0, exdent = 0)
# Arguments:
#   string: character vector of strings to reformat.
#   width: positive integer giving target line width in characters. A width less than or equal to 1 will put each word on its own line.
#   indent: non-negative integer giving indentation of first line in each paragraph
#   exdent: non-negative integer giving indentation of following lines in each paragraph
teststr <- "this is 1. this is 2. this is 3."
teststr
## [1] "this is 1. this is 2. this is 3."
# Wrapping inserts "\n" into the string; cat() renders the line breaks.
wrapstr <- str_wrap(teststr, width = 10)
wrapstr
## [1] "this is 1.\nthis is 2.\nthis is 3."
cat(wrapstr, "\n")
## this is 1.
## this is 2.
## this is 3.
# NOTE(review): the recorded outputs for indent = 5 and exdent = 5 below are
# identical to the plain output above, so the indentation seems to have been
# lost from this transcript — re-run to confirm.
wrapstr <- str_wrap(teststr, width = 10, indent = 5)
cat(wrapstr, "\n")
## this is 1.
## this is 2.
## this is 3.
wrapstr <- str_wrap(teststr, width = 10, exdent = 5)
cat(wrapstr, "\n")
## this is 1.
## this is 2.
## this is 3.
# str_trim() removes whitespace from the left, the right, or both sides
# (both is what the call without `side` returns as well).
teststr <- " this is trim test "
str_trim(teststr)
## [1] "this is trim test"
str_trim(teststr, side = "left")
## [1] "this is trim test "
str_trim(teststr, side = "right")
## [1] " this is trim test"
str_trim(teststr, side = "both")
## [1] "this is trim test"
# str_pad() pads up to the given width; a string that is already longer than
# the width is returned unchanged, as the first four outputs show.
teststr <- "this is pad test"
str_pad(teststr, 10)
## [1] "this is pad test"
str_pad(teststr, 10, side = "left")
## [1] "this is pad test"
str_pad(teststr, 10, side = "right")
## [1] "this is pad test"
str_pad(teststr, 10, side = "both")
## [1] "this is pad test"
# NOTE(review): the padding spaces in the next recorded output look collapsed.
str_pad("this", 10)
## [1] " this"
# `pad` chooses the fill character, `side` where it is added.
str_pad("this", 10, side = "left", pad=",")
## [1] ",,,,,,this"
str_pad("this", 10, side = "right", pad=",")
## [1] "this,,,,,,"
str_pad("this", 10, side = "both", pad=",")
## [1] ",,,this,,,"
# Collapse a character vector into a single comma-separated string.
#   x: character vector.
# Returns one string with the elements of `x` joined by ", ".
str_vec_to_char <- function(x) {
  str_c(x, collapse = ", ")
}
str_vec_to_char("")
## [1] ""
str_vec_to_char("a")
## [1] "a"
str_vec_to_char(c("a", "b"))
## [1] "a, b"
str_vec_to_char(c("a", "b", "c"))
## [1] "a, b, c"
# 回答サイト
# Write a function that turns (e.g.) a vector c(“a”, “b”, “c”) into the string a, b, and c. Think carefully about what it should do if given a vector of length 0, 1, or 2.
# Join a character vector into an English-style list such as "a, b, and c".
#
# Arguments:
#   x    : character vector to join.
#   sep  : separator placed between all elements except before the last one.
#   last : separator placed before the final element when there are three or
#          more elements (it carries the serial comma by default).
#   pair : separator used when there are exactly two elements. Defaults to
#          `last` without its leading comma, so two items read "a and b"
#          instead of "a, and b".
# Returns:
#   `x` unchanged when it has zero or one element, otherwise a single string.
str_commasep <- function(x, sep = ", ", last = ", and ",
                         pair = sub("^,", "", last)) {
  n <- length(x)
  if (n <= 1) {
    return(x)
  }
  if (n == 2) {
    # No serial comma between just two items.
    return(str_c(x[1], x[2], sep = pair))
  }
  # Join everything but the final element with `sep`, then attach the final
  # element with `last`.
  str_c(str_c(x[-n], collapse = sep), x[n], sep = last)
}
str_commasep("a")
## [1] "a"
str_commasep(c("a", "b"))
## [1] "a and b"
str_commasep(c("a", "b", "c"))
## [1] "a, b, and c"
# --- Regular expression basics: literals, escapes and anchors ---
x <- c("apple", "banana", "pear")
# Plain literal match
str_view(x, "an")
# . matches any character
str_view(x, ".a.")
# To match a literal ".", the regular expression \. is needed, which is
# written as the R string "\\."
dot <- "\\."
writeLines(dot)
## \.
str_view(c("abc", "a.c", "bef"), "a\\.c")
# To match a literal "\", the regular expression \\ is needed, which is
# written as the R string "\\\\"
x <- "a\\b"
writeLines(x)
## a\b
str_view(x, "\\\\")
# Why "\", "\\" and "\\\" do not work as patterns for a backslash:
# 1. \ is the escape character, so a lone \ with nothing after it to escape
#    is an error.
# 2. \\ is read as a single \.
# 3. In \\\ the first two are read as \, but the third is an error for the
#    same reason as in 1.
# Matching the sequence "'\ : each special character is escaped in turn.
x <- "ab\"'\\def"
writeLines(x)
## ab"'\def
str_view(x, "\"'\\\\")
# Three repetitions of "literal dot followed by any character".
x <- "a.b.c.d.e"
str_view(x, "\\..\\..\\..")
# ^ matches the start of the string
# $ matches the end of the string
x <- c("apple", "banana", "pear")
str_view(x, "^a")
str_view(x, "a$")
# Anchoring both ends forces the pattern to match the whole string.
x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
str_view(x, "^apple$")
# Word-boundary match with \b
x <- c("summarize", "summary", "rowsum", "sum")
str_view(x, "\\bsum\\b")
# Matching the literal string "$^$": each anchor character is escaped.
x <- "aaa$^$bbb"
writeLines(x)
## aaa$^$bbb
str_view(x, "\\$\\^\\$")
# Exercises on the stringr::words vector.
x <- stringr::words
# a. Starts with "y"
str_view(x, "^y", match = TRUE)
# b. Ends with "x"
str_view(x, "x$", match = TRUE)
# c. Exactly three characters
# NOTE(review): "^...$" would express "exactly three characters" more
# directly; \b...\b presumably relies on every element being a single word.
str_view(x, "\\b...\\b", match = TRUE)
# d. Seven characters or more (any seven consecutive characters)
str_view(x, ".......", match = TRUE)
# Alternation inside a group: matches both spellings.
str_view(c("grey", "gray"), "gr(e|a)y")
# Exercises on character classes and alternation.
# "rhythm" is included so that pattern (b) has a word to match.
x <- c(
  "apple", "banana", "car", "deal", "earth", "unity", "showed", "succeed",
  "playing", "realize", "rhythm"
)
# a. Starts with a vowel
str_view(x, "^[aiueo]", match = TRUE)
# b. Consists only of consonants: every character from the start (^) to the
#    end ($) must be a non-vowel. Without "+$" only the first letter is tested.
str_view(x, "^[^aiueo]+$", match = TRUE)
# c. Ends with "ed" but not with "eed". "(^|[^e])" also accepts the bare
#    string "ed", which has no character in front of the "ed".
str_view(x, "(^|[^e])ed$")
# d. Ends with "ing" or "ize"
str_view(x, "ing$|ize$")
# --- Exercises: character classes, repetition and backreferences ---
# "ie" preceded by any character other than "c".
x <- c("abcie", "abie", "abcedei", "abieddd")
str_view(x, "[^c]ie")
x <- stringr::words
# "q" followed by a character other than "u".
str_view(x, "q[^u]", match = TRUE)
# Words containing any of the listed letter sequences / endings.
str_view(x, "ou|ise$|ae|oe|yse$|tre$", match = TRUE)
# Telephone numbers with a 2- or 3-digit first part; the second pattern is the
# same as the first, written with {n} repetition counts.
tel <- c("042-333-1212", "03-333-1212")
str_view(tel, "(\\d\\d|\\d\\d\\d)-\\d\\d\\d-\\d\\d\\d\\d")
str_view(tel, "(\\d{2}|\\d{3})-\\d{3}-\\d{4}")
# Repetition operators demonstrated on a Roman numeral.
x <- "numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
str_view(x, "CC+")
str_view(x, "C[LX]+")
str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
# A trailing ? makes the preceding repetition lazy (shortest match).
str_view(x, "C{2,3}?")
str_view(x, "C[LX]+?")
# Each shorthand is shown next to its {m,n} spelling: ? and {0,1},
# + and {1,}, * and {0,}.
str_view(x, "C?")
str_view(x, "C{0,1}")
str_view(x, "C+")
str_view(x, "C{1,}")
str_view(x, "C*")
str_view(x, "C{0,}")
str_view("abcdefg", "^.*$") # any string
str_view("{a}", "\\{.+\\}") # one or more characters enclosed in {}
str_view("1111-22-33", "\\d{4}-\\d{2}-\\d{2}") # strings shaped like 1111-22-33
str_view("ab\\\\\\\\cd", "\\\\{4}") # four backslashes
# a. Starts with three consonants
str_view("bcde", "^[^aiueo]{3}")
# b. Three or more vowels in a row
str_view("aaaiueo", "^[aiueo]{3,}")
# c. Two or more vowel-consonant pairs in a row
str_view("abicuc", "([aiueo][^aiueo]){2,}")
# \1 refers back to whatever the first group matched: a repeated pair of
# characters.
str_view(fruit, "(..)\\1", match = TRUE)
# a. The same character three times in a row
str_view(c("appple", "apple", "baaanana"), "(.)\\1\\1", match = TRUE)
# b. Two characters followed by the same two characters in reverse order
str_view(c("abba", "kklaalkkk", "baaanana", "mmmm"), "(.)(.)\\2\\1", match = TRUE)
# c. The same two characters twice in a row
str_view(c("abba", "bananan", "baaanana", "mmmm"), "(..)\\1", match = TRUE)
# d. A character, any character, the first character again, any character,
#    and the first character once more
str_view(c("abaka", "bananan", "baaanana", "mmmm"), "(.).\\1.\\1", match = TRUE)
# e. Three characters, then anything, then the same three characters in
#    reverse order
str_view(c("abcdkkkkkcba", "zyabcdkkkkkcbaxz", "abccba", "mmmm"), "(.)(.)(.).*\\3\\2\\1", match = TRUE)
# a. Starts and ends with the same character
str_view(c("appplea", "aa", "aba", "apple", "baaanana"), "^(.).*\\1$", match = TRUE)
# b. Contains a repeated pair of letters ("church" repeats "ch")
str_view(c("church", "akauoakauo", "baba", "aba", "apple", "baaanana"), "(..).*\\1", match = TRUE)
# c. Contains one letter at least three times ("eleven" has three "e"s)
str_view(c("appplea", "eleven", "aba", "apple", "baaanana"), "(.).*\\1.*\\1", match = TRUE)
# --- Detecting matches with str_detect() ---
# str_detect() returns one logical value per input string.
x <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1] TRUE FALSE TRUE
# How many common words start with "t"? (sum() counts the TRUEs)
sum(str_detect(words, "^t"))
## [1] 65
# What proportion of common words end with a vowel? (mean() of TRUE/FALSE)
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306
# Two ways to find words without vowels.
# 1. Find all words containing at least one vowel and negate the result.
no_vowels_1 <- !str_detect(words, "[aeiou]") # strings that contain none of aeiou
# 2. Find all words made up only of consonants (non-vowels).
no_vowels_2 <- str_detect(words, "^[^aeiou]+$") # non-vowels from start to end
identical(no_vowels_1, no_vowels_2)
## [1] TRUE
# Logical subsetting and str_subset() select the same matching elements.
words[str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
## [1] "box" "sex" "six" "tax"
library(tibble)
# Pair every word with its position in the vector.
df <- tibble(
  word = words,
  i = seq_along(word)
)
# Filter on the `word` column of `df` rather than the global `words` vector,
# so the condition always refers to the rows that are being filtered.
df %>% filter(str_detect(word, "x$"))
## # A tibble: 4 x 2
## word i
## <chr> <int>
## 1 box 108
## 2 sex 747
## 3 six 772
## 4 tax 841
# --- Counting matches with str_count() ---
# str_count() returns the number of matches in each string.
x <- c("apple","banana","pear")
str_count(x, "a")
## [1] 1 3 1
# Average number of vowels per word.
mean(str_count(words, "[aeiou]"))
## [1] 1.991837
library(dplyr)
# Add per-word vowel and consonant counts as new columns of `df`.
df %>%
mutate(
vowels = str_count(word, "[aeiou]"),
consonants = str_count(word, "[^aeiou]")
)
## # A tibble: 980 x 4
## word i vowels consonants
## <chr> <int> <int> <int>
## 1 a 1 1 0
## 2 able 2 2 2
## 3 about 3 3 2
## 4 absolute 4 4 4
## 5 accept 5 2 4
## 6 account 6 3 4
## 7 achieve 7 4 3
## 8 across 8 2 4
## 9 act 9 1 2
## 10 active 10 3 3
## # ... with 970 more rows
# Matches never overlap: "abababa" contains "aba" only twice.
str_count("abababa", "aba")
## [1] 2
str_view_all("abababa", "aba")
# a. Find all words that start OR end with "x".
#    "axa" is included as an example that must not match.
x <- c("xax", "xaa", "aax", "axa")
# Single regular expression: "starts with x" or "ends with x".
str_view(x, "^x|x$", match = TRUE)
# Same selection by combining two simple str_detect() calls with `|`.
tmp_x <- x[str_detect(x, "^x") | str_detect(x, "x$")]
str_view(tmp_x, "^x|x$", match = TRUE)
# b. Find all words that start with a vowel AND end with a consonant.
x <- c("axab", "xaa", "aax")
# Single regular expression.
str_view(x, "^[aeiou].*[^aeiou]$", match = TRUE)
# Same selection in two steps: keep words starting with a vowel, then show
# those of them that end with a consonant.
tmp_x <- x[str_detect(x, "^[aeiou]")]
str_view(tmp_x, "[^aeiou]$", match = TRUE)
# c. Are there words that contain each of the five vowels at least once?
#    One str_detect() result per vowel, combined element-wise with `&`.
words[Reduce(`&`, lapply(c("a", "e", "i", "o", "u"), str_detect, string = words))]
## character(0)
# Same check on a small hand-made vector to confirm that the test works.
test_words <- c("abeicoub", "abeicb")
test_words[Reduce(`&`, lapply(c("a", "e", "i", "o", "u"), str_detect, string = test_words))]
## [1] "abeicoub"
# d. Which word has the most vowels? Which word has the highest proportion of
#    vowels? (Hint: the denominator is the length of the word.)
x <- str_count(words, "[aeiou]")
words[which(x == max(x))]
## [1] "appropriate" "associate" "available" "colleague" "encourage"
## [6] "experience" "individual" "television"
x <- str_count(words, "[aeiou]") / str_length(words)
words[which(x == max(x))]
## [1] "a"
# The one-letter word "a" trivially has ratio 1, so repeat without it.
# A logical filter into a separate vector is used because
# `words[-which(...)]` returns an empty vector when nothing matches, and
# because overwriting `words` would replace the word list used elsewhere.
other_words <- words[words != "a"]
x <- str_count(other_words, "[aeiou]") / str_length(other_words)
other_words[which(x == max(x))]
## [1] "area" "idea"
# --- Extracting matches with str_extract() / str_extract_all() ---
length(sentences)
## [1] 720
head(sentences)
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
## [6] "The juice of lemons makes fine punch."
# Build one regular expression from the colour names, joined with "|".
colors <- c("red", "orange", "yellow", "green", "blue", "purple")
color_match <- str_c(colors, collapse = "|")
color_match
## [1] "red|orange|yellow|green|blue|purple"
# Keep the sentences that mention a colour, then extract the matched colour.
has_color <- str_subset(sentences, color_match)
matches <- str_extract(has_color, color_match)
head(matches)
## [1] "blue" "blue" "red" "red" "red" "blue"
# Sentences with more than one match.
more <- sentences[str_count(sentences, color_match) > 1]
str_view_all(more, color_match)
# str_extract() returns only the first match of each string ...
str_extract(more, color_match)
## [1] "blue" "green" "orange"
# ... while str_extract_all() returns every match, as a list ...
str_extract_all(more, color_match)
## [[1]]
## [1] "blue" "red"
##
## [[2]]
## [1] "green" "red"
##
## [[3]]
## [1] "orange" "red"
# ... or, with simplify = TRUE, as a character matrix.
str_extract_all(more, color_match, simplify = TRUE)
## [,1] [,2]
## [1,] "blue" "red"
## [2,] "green" "red"
## [3,] "orange" "red"
# In the matrix form, rows with fewer matches are filled with "".
x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)
## [,1] [,2] [,3]
## [1,] "a" "" ""
## [2,] "a" "b" ""
## [3,] "a" "b" "c"
# Wrap the alternation in \b(...)\b so that only whole words match.
color_match2 <- str_c("\\b(", color_match, ")\\b")
color_match2
## [1] "\\b(red|orange|yellow|green|blue|purple)\\b"
str_view_all(more, color_match2)
# a. First word of each sentence
sentences %>% head()
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
## [6] "The juice of lemons makes fine punch."
str_extract(sentences, "^[a-zA-Z]+") %>% head()
## [1] "The" "Glue" "It" "These" "Rice" "The"
# b. All words ending in "ing". The \\b anchors restrict the match to whole
#    words; without them "stockings" yields the partial match "stocking".
ing_pattern <- "\\b[a-zA-Z]+ing\\b"
ing_match <- str_detect(sentences, ing_pattern)
str_extract_all(sentences[ing_match], ing_pattern) %>% unlist() %>% head()
## (re-run to refresh this output: the previously recorded first result,
##  "stocking", was a partial match inside "stockings" and is no longer returned)
# c. Plural words (heuristic: at least three letters followed by a final "s")
str_extract_all(sentences, "\\b[A-Za-z]{3,}s\\b") %>% head()
## [[1]]
## [1] "planks"
##
## [[2]]
## character(0)
##
## [[3]]
## character(0)
##
## [[4]]
## [1] "days"
##
## [[5]]
## [1] "bowls"
##
## [[6]]
## [1] "lemons" "makes"
# --- Grouped matches: str_extract() vs str_match() vs tidyr::extract() ---
# Heuristic for a noun: "a" or "the", a space, then a run of non-space
# characters.
noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
str_subset(noun) %>%
head(10)
# str_extract() returns the complete match only.
has_noun %>%
str_extract(noun)
## [1] "the smooth" "the sheet" "the depth" "a chicken" "the parked"
## [6] "the sun" "the huge" "the ball" "the woman" "a helps"
# str_match() returns a matrix: the complete match first, then one column per
# parenthesised group.
has_noun %>%
str_match(noun)
## [,1] [,2] [,3]
## [1,] "the smooth" "the" "smooth"
## [2,] "the sheet" "the" "sheet"
## [3,] "the depth" "the" "depth"
## [4,] "a chicken" "a" "chicken"
## [5,] "the parked" "the" "parked"
## [6,] "the sun" "the" "sun"
## [7,] "the huge" "the" "huge"
## [8,] "the ball" "the" "ball"
## [9,] "the woman" "the" "woman"
## [10,] "a helps" "a" "helps"
# For data in a tibble, tidyr::extract() puts each group into a named column;
# remove = FALSE keeps the original `sentence` column.
tibble(sentence = sentences) %>%
tidyr::extract(
sentence, c("article", "noun"), "(a|the) ([^ ]+)",
remove = FALSE
)
## # A tibble: 720 x 3
## sentence article noun
## * <chr> <chr> <chr>
## 1 The birch canoe slid on the smooth planks. the smooth
## 2 Glue the sheet to the dark blue background. the sheet
## 3 It's easy to tell the depth of a well. the depth
## 4 These days a chicken leg is a rare dish. a chicken
## 5 Rice is often served in round bowls. <NA> <NA>
## 6 The juice of lemons makes fine punch. <NA> <NA>
## 7 The box was thrown beside the parked truck. the parked
## 8 The hogs were fed chopped corn and garbage. <NA> <NA>
## 9 Four hours of steady work faced us. <NA> <NA>
## 10 Large size in stockings is hard to sell. <NA> <NA>
## # ... with 710 more rows
# Exercise: a number word followed by the next word.
# The leading \\b stops the number from matching inside a longer word;
# without it "often served" produced the false match "ten served".
numword <- "\\b(one|two|three|four|five|six|seven|eight|nine|ten) ([^ ]+)"
has_numword <- sentences %>%
  str_subset(numword) %>%
  head(10)
has_numword %>% str_extract(numword)
## (re-run to refresh this output: the previously recorded list started with
##  the false match "ten served", which is no longer returned)
# Exercise: find words containing an apostrophe and split them at it.
# Pattern: letters, an apostrophe, letters.
apos_word <- "([A-Za-z]+)'([A-Za-z]+)"
has_apos_word <- sentences %>%
str_subset(apos_word) %>%
head(10)
has_apos_word %>% str_extract(apos_word)
## [1] "It's" "man's" "don't" "store's" "workmen's"
## [6] "Let's" "sun's" "child's" "king's" "It's"
# NOTE(review): `sprit_str` is presumably a misspelling of "split_str".
sprit_str <- has_apos_word %>% str_extract(apos_word)
# Split each extracted word into the parts before and after the apostrophe.
str_split(sprit_str, "'")
## [[1]]
## [1] "It" "s"
##
## [[2]]
## [1] "man" "s"
##
## [[3]]
## [1] "don" "t"
##
## [[4]]
## [1] "store" "s"
##
## [[5]]
## [1] "workmen" "s"
##
## [[6]]
## [1] "Let" "s"
##
## [[7]]
## [1] "sun" "s"
##
## [[8]]
## [1] "child" "s"
##
## [[9]]
## [1] "king" "s"
##
## [[10]]
## [1] "It" "s"
# --- Replacing matches with str_replace() / str_replace_all() ---
x <- c("apple", "pear", "banana")
# str_replace() replaces only the first match in each string ...
str_replace(x, "[aeiou]", "-")
## [1] "-pple" "p-ar" "b-nana"
# ... str_replace_all() replaces every match.
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-" "p--r" "b-n-n-"
# A named vector performs several pattern = replacement substitutions at once.
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house" "two cars" "three people"
# Three space-separated words, each captured in its own group.
pattern <- "([^ ]+) ([^ ]+) ([^ ]+)"
has_pattern <- sentences %>%
str_subset(pattern) %>%
head(5)
has_pattern
## [1] "The birch canoe slid on the smooth planks."
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."
## [4] "These days a chicken leg is a rare dish."
## [5] "Rice is often served in round bowls."
# Backreferences in the replacement swap the second and third words.
sentences %>%
str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
head(5)
## [1] "The canoe birch slid on the smooth planks."
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."
## [4] "These a days chicken leg is a rare dish."
## [5] "Rice often is served in round bowls."
# Exercise: replace every forward slash with a backslash. The replacement
# "\\\\" produces one literal backslash, as the writeLines() output shows.
x <- "aaa/bbb/ccc/ddd"
xreplace <- str_replace_all(x, "\\/", "\\\\")
writeLines(xreplace)
## aaa\bbb\ccc\ddd
# Exercise: a simple lower-casing built from str_replace_all() and a named
# vector (only the letters listed are converted).
x <- "ABCD"
str_replace_all(x, c("A" = "a", "B" = "b", "C" = "c", "D" = "d"))
## [1] "abcd"
# Exercise: swap the first and last letter of each word, then report which of
# the resulting strings are still words.
# The sample lives in its own variable so that `words` is not overwritten.
sample_words <- c(
  "apple", "and", "thus", "banana", "orange", "dna", "eagle", "grape", "shut"
)
# Groups: first letter, middle letters, last letter; the replacement puts them
# back in the order last-middle-first. One-letter strings are left unchanged
# because the pattern needs at least two letters.
ex_words <- str_replace(
  sample_words, "^([a-zA-Z])([a-zA-Z]*)([a-zA-Z])$", "\\3\\2\\1"
)
ex_words
## [1] "eppla" "dna" "shut" "aananb" "erango" "and" "eagle" "erapg"
## [9] "thus"
# Whole-string comparison: a swapped string counts only if it equals one of
# the original words (a regex alternation would also accept partial matches).
intersect(ex_words, sample_words)
## [1] "dna" "shut" "and" "eagle" "thus"
# --- Splitting strings with str_split() ---
# Splitting returns a list with one character vector per input string.
sentences %>%
head(5) %>%
str_split(" ")
## [[1]]
## [1] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [8] "planks."
##
## [[2]]
## [1] "Glue" "the" "sheet" "to" "the"
## [6] "dark" "blue" "background."
##
## [[3]]
## [1] "It's" "easy" "to" "tell" "the" "depth" "of" "a" "well."
##
## [[4]]
## [1] "These" "days" "a" "chicken" "leg" "is" "a"
## [8] "rare" "dish."
##
## [[5]]
## [1] "Rice" "is" "often" "served" "in" "round" "bowls."
# Use [[ to pull one element out of the list ("|" must be escaped because it
# is a regex metacharacter).
"a|b|c|d" %>%
str_split("\\|") %>%
.[[1]]
## [1] "a" "b" "c" "d"
c("a|b|c|d", "e|f|g") %>%
str_split("\\|") %>%
.[[2]]
## [1] "e" "f" "g"
# simplify = TRUE returns a character matrix instead, filled with "" where a
# row has fewer pieces.
sentences %>%
head(5) %>%
str_split(" ", simplify = TRUE)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7]
## [1,] "The" "birch" "canoe" "slid" "on" "the" "smooth"
## [2,] "Glue" "the" "sheet" "to" "the" "dark" "blue"
## [3,] "It's" "easy" "to" "tell" "the" "depth" "of"
## [4,] "These" "days" "a" "chicken" "leg" "is" "a"
## [5,] "Rice" "is" "often" "served" "in" "round" "bowls."
## [,8] [,9]
## [1,] "planks." ""
## [2,] "background." ""
## [3,] "a" "well."
## [4,] "rare" "dish."
## [5,] "" ""
# Pick a single cell of that matrix: row 1, column 2.
sentences %>%
head(5) %>%
str_split(" ", simplify = TRUE) %>%
.[[1,2]]
## [1] "birch"
# `n` limits the number of pieces per string.
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>% str_split(": ", n = 2, simplify = TRUE)
## [,1] [,2]
## [1,] "Name" "Hadley"
## [2,] "Country" "NZ"
## [3,] "Age" "35"
# Splitting on boundary("word") drops the punctuation that splitting on " "
# leaves attached to the words.
x <- "This is a sentence. This is another sentence."
str_view_all(x, boundary("word"))
str_split(x, " ")[[1]]
## [1] "This" "is" "a" "sentence." "This" "is"
## [7] "another" "sentence."
str_split(x, boundary("word"))[[1]]
## [1] "This" "is" "a" "sentence" "This" "is"
## [7] "another" "sentence"
# Exercise: split a list such as "apples, pears, and bananas" into its items.
x <- "apples, pears, and bananas"
str_split(x, ", and |, ")
## [[1]]
## [1] "apples" "pears" "bananas"
# Exercise: compare splitting on " ", on boundary("word") and on "".
x <- "This is a sentence. This is another sentence."
str_split(x, " ")[[1]]
## [1] "This" "is" "a" "sentence." "This" "is"
## [7] "another" "sentence."
str_split(x, boundary("word"))[[1]]
## [1] "This" "is" "a" "sentence" "This" "is"
## [7] "another" "sentence"
# An empty pattern splits the string into individual characters.
str_split(x, "")[[1]]
## [1] "T" "h" "i" "s" " " "i" "s" " " "a" " " "s" "e" "n" "t" "e" "n" "c"
## [18] "e" "." " " "T" "h" "i" "s" " " "i" "s" " " "a" "n" "o" "t" "h" "e"
## [35] "r" " " "s" "e" "n" "t" "e" "n" "c" "e" "."
# --- Controlling matching with regex() ---
# A plain string pattern and the same pattern wrapped in regex() are used
# side by side here.
str_view(fruit, "nana", match = TRUE)
str_view(fruit, regex("nana"), match = TRUE)
# ignore_case = TRUE matches regardless of letter case.
bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")
str_view(bananas, regex("banana", ignore_case = TRUE))
# multiline = TRUE lets ^ match at the start of every line rather than only
# at the start of the whole string, as the two outputs show.
x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, "^Line")[[1]]
## [1] "Line"
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]
## [1] "Line" "Line" "Line"
# A commented regular expression for phone numbers such as "514-791-8141".
# With comments = TRUE, whitespace and "#..." comments inside the pattern are
# ignored, so a literal space has to be written as "\\ ".
# The last group captures four digits so that the full number is matched
# (three digits would cut off the final digit).
phone <- regex("\\(? # オプションの開き括弧
(\\d{3}) # エリア番号
[)\\ -]? # オプションの閉じ括弧、ダッシュ、空白
(\\d{3}) # 3桁の番号
[\\ -]? # オプションの空白かダッシュ
(\\d{4}) # 4桁の番号
", comments = TRUE)
str_match("514-791-8141", phone)
## [,1] [,2] [,3] [,4]
## [1,] "514-791-8141" "514" "791" "8141"
# --- Other pattern types: fixed(), coll() and boundary() ---
# install.packages("microbenchmark")
# Time a fixed() (literal) match against the default regex match, 20 runs each.
microbenchmark::microbenchmark(
fixed = str_detect(sentences, fixed("the")),
regex = str_detect(sentences, "the"),
times = 20
)
## Unit: microseconds
## expr min lq mean median uq max neval
## fixed 117.722 121.947 137.8548 124.6640 141.1145 306.075 20
## regex 249.328 254.307 263.8612 255.9675 256.8735 397.836 20
# Two different encodings of an accented "a": a single code point (U+00E1)
# and "a" followed by a combining accent (U+0301).
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
## [1] "a" "a<U+0301>"
a1 == a2
## [1] FALSE
# fixed() treats the two as different, coll() treats them as equal (see the
# outputs).
str_detect(a1, fixed(a2))
## [1] FALSE
str_detect(a1, coll(a2))
## [1] TRUE
# coll() with ignore_case: which characters count as the same letter depends
# on the locale, as the two outputs show.
i <- c("I", "İ", "i", "ı")
i
## [1] "I" "<U+0130>" "i" "<U+0131>"
str_subset(i, coll("i", ignore_case = TRUE))
## [1] "I" "i"
str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))
## [1] "i"
library(stringi)
# Show the locale information reported by stringi.
stringi::stri_locale_info()
## $Language
## [1] "ja"
##
## $Country
## [1] "JP"
##
## $Variant
## [1] ""
##
## $Name
## [1] "ja_JP"
# boundary("word") matches word by word; punctuation is not included.
x <- "This is a sentence."
str_view_all(x, boundary("word"))
str_extract_all(x, boundary("word"))
## [[1]]
## [1] "This" "is" "a" "sentence"
# Exercise: find strings containing a backslash, once with a regular
# expression (the backslash must be escaped) and once with fixed().
str_subset(c("a\\b", "ab"), "\\\\")
## [1] "a\\b"
str_subset(c("a\\b", "ab"), fixed("\\"))
## [1] "a\\b"
library(tibble)
library(tidyverse)
# Exercise: the five most common words in `sentences`.
# Words are extracted with boundary("word") and lower-cased so that "The" and
# "the" are counted together. count(word, sort = TRUE) tallies each word and
# returns an ungrouped tibble ordered by frequency.
sentences %>%
  str_extract_all(boundary("word")) %>%
  unlist() %>%
  str_to_lower() %>%
  tibble(word = .) %>%
  count(word, sort = TRUE) %>%
  head(5)
## # A tibble: 5 x 2
## word n
## <chr> <int>
## 1 the 751
## 2 a 202
## 3 of 132
## 4 to 123
## 5 and 118
# --- Other uses of regular expressions, and stringi ---
# apropos() lists the available objects whose names match the pattern.
apropos("replace")
## [1] "%+replace%" "replace"
## [3] "replace_na" "setReplaceMethod"
## [5] "str_replace" "str_replace_all"
## [7] "str_replace_na" "stri_replace"
## [9] "stri_replace_all" "stri_replace_all_charclass"
## [11] "stri_replace_all_coll" "stri_replace_all_fixed"
## [13] "stri_replace_all_regex" "stri_replace_first"
## [15] "stri_replace_first_charclass" "stri_replace_first_coll"
## [17] "stri_replace_first_fixed" "stri_replace_first_regex"
## [19] "stri_replace_last" "stri_replace_last_charclass"
## [21] "stri_replace_last_coll" "stri_replace_last_fixed"
## [23] "stri_replace_last_regex" "stri_replace_na"
## [25] "theme_replace" "xreplace"
# dir() lists the files in the working directory whose names match the
# pattern; here, names ending in ".Rmd".
head(dir(pattern = "\\.Rmd$"))
## [1] "chapter11.Rmd" "R_book_korean.Rmd" "sample_markdown.Rmd"
# a. Count the number of words
x <- "this is a test. this is a sample test for stringi functions"
stri_count_words(x)
## [1] 12
# b. Find duplicated strings
x <- c("this is test1.", "this is test2.", "this is test3.", "this is test1.")
# stri_duplicated() flags every element that repeats an earlier one.
stri_duplicated(x)
## [1] FALSE FALSE FALSE TRUE
# Alternative: count each (lower-cased) sentence and keep those that occur
# more than once. count(sentence, ...) returns an ungrouped tibble, so the
# following filter() and head() operate on the whole table.
tibble(sentence = str_to_lower(x)) %>%
  count(sentence, sort = TRUE) %>%
  filter(n > 1) %>%
  head(5)
## # A tibble: 1 x 2
## sentence n
## <chr> <int>
## 1 this is test1. 2
# c. Generate random text
# List the stringi functions whose names contain "stri_rand".
apropos("stri_rand")
## [1] "stri_rand_lipsum" "stri_rand_shuffle" "stri_rand_strings"
# Two paragraphs of lorem-ipsum dummy text (the output differs on every run).
stri_rand_lipsum(2) # dummy text
## [1] "Lorem ipsum dolor sit amet, conubia odio dignissim laoreet volutpat. Venenatis elementum magnis fusce donec malesuada, ridiculus justo quis bibendum morbi. Litora ac pellentesque quisque ac eget purus. Nulla amet. Sapien ornare quam turpis ante sed interdum in dapibus blandit curabitur sed. Tempor urna diam vel hendrerit vivamus mi consectetur, nascetur nulla sed? Hac sed bibendum justo nulla amet vitae, inceptos eros sapien phasellus nulla ac. Nec sed interdum sit finibus donec! Consectetur at eu donec senectus ullamcorper ac et. Porta habitasse vel erat amet risus consectetur. Ligula etiam mauris ut lectus porta lectus. Eros vestibulum penatibus, lobortis platea, ac duis tincidunt vitae leo. A ut vel in orci varius magna nam. Class tempus ad."
## [2] "Est nisi, habitasse quis feugiat, tempor ut enim lobortis ligula libero enim mauris. Ullamcorper urna magna hac class, et. Dolor accumsan etiam egestas lobortis, sem malesuada, pulvinar leo laoreet. Metus, ipsum ut quam rhoncus ac in donec nam. Ultricies non vitae id. Tristique euismod ut dictumst class nibh posuere, erat pellentesque. Blandit diam at mauris est habitant nulla auctor. Cubilia in lacinia erat semper iaculis. Dapibus eros placerat laoreet mus iaculis tempus. Eu amet primis odio habitant inceptos, ac sociosqu diam. Bibendum efficitur integer in vestibulum erat bibendum sit nulla dignissim in et. Sed, pellentesque adipiscing, aliquam fusce non a. Odio phasellus massa, sed dictum donec cum habitasse dolor. Vel ligula consequat inceptos ultricies semper vel mollis quam at. Diam, scelerisque nec neque, tellus venenatis vel dignissim, ac hac nunc amet. Feugiat dolor viverra imperdiet torquent faucibus. Blandit, dictum non vestibulum, risus venenatis vel nunc ac, non."
# Build n random strings, each made of one digit, one lower-case letter, one
# upper-case letter and a random-length run of alphanumerics, then shuffle the
# characters within each string.
n <- 10
stri_rand_shuffle(stri_paste(
stri_rand_strings(n, 1, '[0-9]'),
stri_rand_strings(n, 1, '[a-z]'),
stri_rand_strings(n, 1, '[A-Z]'),
# Only 5 lengths are sampled for n = 10 strings; the recorded output repeats
# the same length pattern for strings 1-5 and 6-10.
stri_rand_strings(n, sample(5:11, 5, replace=TRUE), '[a-zA-Z0-9]')
))
## [1] "pb0wTRphUZ" "7T0SCo5AyKN3Ot" "Gy718CRJK4SiUr" "Sme9mQhwwL"
## [5] "w6bbwQ3wBB" "Vj9OF2ey8P" "KIDMikz1YcVVzv" "jkLqt29WJETcBp"
## [9] "Hk4XQPbs6B" "S3jEYYOdQ7"
# Sorting the same words under two locales gives different orders: with
# "sk_SK", "chladny" is placed after "hladny".
stri_sort(c("hladny", "chladny", "bubble"), locale="ja_JP")
## [1] "bubble" "chladny" "hladny"
stri_sort(c("hladny", "chladny", "bubble"), locale="sk_SK")
## [1] "bubble" "hladny" "chladny"