11.1　はじめに

本章では文字列処理の基本と正規表現を学びます。

11.1.1 準備

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.4.3

## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.2.1 --

## √ ggplot2 2.2.1     √ purrr   0.2.4
## √ tibble  1.4.2     √ dplyr   0.7.4
## √ tidyr   0.7.2     √ stringr 1.2.0
## √ readr   1.1.1     √ forcats 0.2.0

## Warning: package 'tibble' was built under R version 3.4.3

## Warning: package 'stringr' was built under R version 3.4.3

## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(stringr)

11.2 文字列の基本

string1 <- "This is a string"
string1

## [1] "This is a string"

string2 <- 'To put a "quote" inside a string, use single quotes'
string2

## [1] "To put a \"quote\" inside a string, use single quotes"

double_quote1 <- "\""
double_quote1

## [1] "\""

double_quote2 <- '"'
double_quote2

## [1] "\""

single_quote1 <- '\''
single_quote1

## [1] "'"

single_quote2 <- "'"
single_quote2

## [1] "'"

x <- c("\"", "\\")
x

## [1] "\"" "\\"

# 文字列の中身そのものを調べる
writeLines(x)

## "
## \

x <- "\u00b5"
x

## [1] "μ"

c("one", "two", "three")

## [1] "one"   "two"   "three"

11.2.1 文字列の長さ

str_length(c("a", "R for data science", NA))

## [1]  1 18 NA

11.2.2 文字列の連結

str_c("x", "y")

## [1] "xy"

str_c("x", "y", "z")

## [1] "xyz"

str_c("x", "y", sep = ", ")

## [1] "x, y"

x <- c("abc", "def")
str_c("|-", x, "-|")

## [1] "|-abc-|" "|-def-|"

x <- c("abc", NA)
str_c("|-", x, "-|")

## [1] "|-abc-|" NA

x <- c("abc", NA)
str_c("|-", str_replace_na(x), "-|")

## [1] "|-abc-|" "|-NA-|"

# str_cはベクトルの長さに合わせて処理を繰り返す
str_c("prefix-", c("a", "b", "c"), "-suffix")

## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"

# str_c() drops zero-length arguments, which pairs well with `if`:
# when the condition is FALSE the `if` yields NULL and that piece vanishes.
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE

# The literal pieces carry their own spaces because str_c() joins with
# sep = "" by default and would otherwise glue the words together.
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)

## [1] "Good morning Hadley."

birthday <- TRUE
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)

## [1] "Good morning Hadley and HAPPY BIRTHDAY."

# 文字列のベクトルをまとめて1つの文字列にするにはcollapseを使う
str_c(c("x", "y", "z"))

## [1] "x" "y" "z"

str_c(c("x", "y", "z"), collapse = ", ")

## [1] "x, y, z"

str_c(c("x", "y", "z"), collapse = "")

## [1] "xyz"

11.2.3 文字列の一部抽出

x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)

## [1] "App" "Ban" "Pea"

# 負数は末尾からの位置
str_sub(x, -3, -1)

## [1] "ple" "ana" "ear"

# 文字列が短くてもエラーにはならない
str_sub("a", 1, 5)

## [1] "a"

# str_subの代入形式を使って文字列を変更することが出来る
# 先頭1文字を小文字に変更
x

## [1] "Apple"  "Banana" "Pear"

str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x

## [1] "apple"  "banana" "pear"

y <- "Data science is a very interesting field."
str_to_upper(y)

## [1] "DATA SCIENCE IS A VERY INTERESTING FIELD."

str_to_title(y)

## [1] "Data Science Is A Very Interesting Field."

11.2.4 ロケール

str_to_upper(c("i", "ı"))

## [1] "I"        "<U+0131>"

str_to_upper(c("i", "ı"), locale = "tr")

## [1] "<U+0130>" "<U+0131>"

x <- c("apple", "eggplant", "banana")
str_sort(x, locale = "en") #英語

## [1] "apple"    "banana"   "eggplant"

str_sort(x, locale = "haw") #ハワイ語

## [1] "apple"    "eggplant" "banana"

練習問題 p.173

1. stringrを使わないコードでは、paste()とpaste0()をよく使う。この２つの関数は何が違うか。等価なstringrの関数は何か。NAの処理ではどこが違うか。

paste("aaa", "bbb")

## [1] "aaa bbb"

paste("aaa", "bbb", sep = "")

## [1] "aaabbb"

paste0("aaa", "bbb")

## [1] "aaabbb"

paste(c("aaa", "bbb"))

## [1] "aaa" "bbb"

paste0(c("aaa", "bbb"))

## [1] "aaa" "bbb"

paste(c("aaa", "bbb"), collapse = ",")

## [1] "aaa,bbb"

paste0(c("aaa", "bbb"), collapse = ",")

## [1] "aaa,bbb"

paste(c("aaa", "bbb"), sep = ",")

## [1] "aaa" "bbb"

paste0(c("aaa", "bbb"), sep = ",")

## [1] "aaa," "bbb,"

str_c("aaa", "bbb")

## [1] "aaabbb"

str_c("aaa", "bbb", sep = ", ")

## [1] "aaa, bbb"

str_c(c("aaa", "bbb"), sep = ", ")

## [1] "aaa" "bbb"

str_c(c("aaa", "bbb"), collapse = ", ")

## [1] "aaa, bbb"

paste("aaa", NA)

## [1] "aaa NA"

paste0("aaa", NA)

## [1] "aaaNA"

str_c("aaa", NA)

## [1] NA

str_c("aaa", str_replace_na(NA))

## [1] "aaaNA"

2. str_c()の引数sepとcollapseとの違いを自分の言葉で述べなさい。

# 文字列を結合する時の区切り文字はsepで指定
str_c("aaa", "bbb", sep = ", ")

## [1] "aaa, bbb"

str_c("aaa", "bbb", collapse = ", ")

## [1] "aaabbb"

str_c("aaa", "bbb", sep = ", ", collapse = ", ")

## [1] "aaa, bbb"

# 文字列のベクトルを結合する時の区切り文字はcollapseで指定
str_c(c("aaa", "bbb"), sep = ", ")

## [1] "aaa" "bbb"

str_c(c("aaa", "bbb"), collapse = ", ")

## [1] "aaa, bbb"

str_c(c("aaa", "bbb"), sep = ", ", collapse = ", ")

## [1] "aaa, bbb"

3. str_length()とstr_sub()を用いて、文字列の中央の文字を抜き出しなさい。文字列の文字が偶数個の場合には、どうするか。

x <- "abcde"
i <- str_length(x)%/%2+1
str_sub(x, i, i)

## [1] "c"

# 偶数個の場合
x <- "abcdef"
i <- str_length(x)%/%2
str_sub(x, i, i)

## [1] "c"

str_sub(x, i+1, i+1)

## [1] "d"

4. str_wrap()は何をするか。何に使うか。

#Usage: str_wrap(string, width = 80, indent = 0, exdent = 0)

#Arguments:
#string: character vector of strings to reformat.
#width: positive integer giving target line width in characters. A width less than or equal to 1 will put each word on its own line.
#indent: non-negative integer giving indentation of first line in each paragraph
#exdent: non-negative integer giving indentation of following lines in each paragraph

teststr <- "this is 1. this is 2. this is 3."
teststr

## [1] "this is 1. this is 2. this is 3."

wrapstr <- str_wrap(teststr, width = 10)
wrapstr

## [1] "this is 1.\nthis is 2.\nthis is 3."

cat(wrapstr, "\n")

## this is 1.
## this is 2.
## this is 3.

wrapstr <- str_wrap(teststr, width = 10, indent = 5)
cat(wrapstr, "\n")

##      this is 1.
## this is 2.
## this is 3.

wrapstr <- str_wrap(teststr, width = 10, exdent = 5)
cat(wrapstr, "\n")

## this is 1.
##      this is 2.
##      this is 3.

5. str_trim()は何をするか。str_trim()の逆操作は何か。

teststr <- "    this is trim test   "
str_trim(teststr)

## [1] "this is trim test"

str_trim(teststr, side = "left")

## [1] "this is trim test   "

str_trim(teststr, side = "right")

## [1] "    this is trim test"

str_trim(teststr, side = "both")

## [1] "this is trim test"

teststr <- "this is pad test"
str_pad(teststr, 10)

## [1] "this is pad test"

str_pad(teststr, 10, side = "left")

## [1] "this is pad test"

str_pad(teststr, 10, side = "right")

## [1] "this is pad test"

str_pad(teststr, 10, side = "both")

## [1] "this is pad test"

str_pad("this", 10)

## [1] "      this"

str_pad("this", 10, side = "left", pad=",")

## [1] ",,,,,,this"

str_pad("this", 10, side = "right", pad=",")

## [1] "this,,,,,,"

str_pad("this", 10, side = "both", pad=",")

## [1] ",,,this,,,"

6. 例えば、ベクトルc("a", "b", "c")を文字列「a, b, and c」にする関数を書きなさい。ベクトルの長さが0,1,2のときにどうすべきか、よく考えなさい。

# Collapse a character vector into one comma-separated string.
# x: character vector whose elements are joined with ", ".
# Returns the result of str_c(x, collapse = ", ").
str_vec_to_char <- function(x) {
  str_c(x, collapse = ", ")
}
str_vec_to_char(c(""))

## [1] ""

str_vec_to_char(c("a"))

## [1] "a"

str_vec_to_char(c("a","b"))

## [1] "a, b"

str_vec_to_char(c("a","b","c"))

## [1] "a, b, c"

# 回答サイト
# Write a function that turns (e.g.) a vector c(“a”, “b”, “c”) into the string a, b, and c. Think carefully about what it should do if given a vector of length 0, 1, or 2.
# Join the elements of `x` into an English-style list such as "a, b, and c".
#
# x:    character vector to join.
# sep:  separator placed between all elements except the last one.
# last: separator placed before the final element when there are three or
#       more elements (serial comma by default).
# pair: separator used when there are exactly two elements. Defaults to
#       `last` without its leading comma, because "a, and b" is not wanted.
# Returns a single string: "" for an empty vector and the element itself for
# a vector of length one.
str_commasep <- function(x, sep = ", ", last = ", and ",
                         pair = sub("^,", "", last)) {
  n <- length(x)
  if (n == 0) {
    return("")
  }
  if (n == 1) {
    return(x)
  }
  if (n == 2) {
    return(str_c(x[[1]], x[[2]], sep = pair))
  }
  # Three or more: comma-join everything but the last, then attach the last.
  str_c(str_c(x[-n], collapse = sep), x[[n]], sep = last)
}
str_commasep(character(0))

## [1] ""

str_commasep("a")

## [1] "a"

str_commasep(c("a", "b"))

## [1] "a and b"

str_commasep(c("a", "b", "c"))

## [1] "a, b, and c"

11.3 正規表現でパターンマッチ

x <- c("apple", "banana", "pear")
# 単純マッチ
str_view(x, "an")

# .はどんな文字ともマッチする
str_view(x, ".a.")

# .とマッチするときは正規表現\\.が必要
dot <- "\\."
writeLines(dot)

## \.

str_view(c("abc", "a.c", "bef"), "a\\.c")

# \とマッチするときは正規表現\\\\が必要
x <- "a\\b"
writeLines(x)

## a\b

str_view(x, "\\\\")

練習問題 p.175

1. 文字列"\"、"\\"、"\\\"のそれぞれが、なぜ\とマッチしないか説明しなさい。

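# 1. "\" : \はエスケープ文字なので、直後の " をエスケープしてしまい、文字列が閉じられずエラーになる。
# 2. "\\" : 文字列としては\ 1文字になるが、正規表現では\がエスケープ文字として解釈され、後ろにエスケープする文字がないためエラーになる。
# 3. "\\\" : 先頭の\\は\として認識されるが、3番目の\は1と同様に直後の " をエスケープするため、文字列が閉じられずエラーになる。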

2. 文字列「“’\」とはどうマッチするか。

x <- "ab\"'\\def"
writeLines(x)

## ab"'\def

str_view(x, "\"'\\\\")

3. 正規表現\..\..\..はどのようなパターンとマッチするか。文字列でどう表すか。

x <- "a.b.c.d.e"
str_view(x, "\\..\\..\\..")

11.3.2 アンカー

# ^は文字列の先頭とマッチする
# $は文字列の末尾とマッチする
x <- c("apple", "banana", "pear")
str_view(x, "^a")

str_view(x, "a$")

x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")

str_view(x, "^apple$")

# 単語境界マッチ
x <- c("summarize", "summary", "rowsum", "sum")
str_view(x, "\\bsum\\b")

練習問題 p.176

1. 文字列“$^$”とのマッチはどう行うか。

x <- "aaa$^$bbb"
writeLines(x)

## aaa$^$bbb

str_view(x, "\\$\\^\\$")

2. 頻出単語のコーパスがstringr::wordsにある。次のような単語をすべて探し出す正規表現を作りなさい。

x <- stringr::words
# a. "y"で始まる
str_view(x, "^y", match = TRUE)

# b. "x"で終わる
str_view(x, "x$", match = TRUE)

# c. Exactly three letters long. Anchor both ends of the string so nothing
#    may precede or follow the three characters; word boundaries (\\b) would
#    also accept a three-character run inside a string with punctuation.
str_view(x, "^...$", match = TRUE)

# d. 7文字以上
str_view(x, ".......", match = TRUE)

11.3.3 文字のクラスと候補

str_view(c("grey", "gray"), "gr(e|a)y")

練習問題 p.177

1. 次の語を探す正規表現を作りなさい。

x <- c("apple", "banana", "car", "deal", "earth", "unity", "showed", "succeed", "playing", "realize")
# a. Starts with a vowel.
str_view(x, "^[aiueo]", match = TRUE)

# b. Contains only consonants: every character from start to end must be a
#    non-vowel, so the negated class is repeated and anchored on both sides.
#    (Testing only the first character would accept "banana".)
str_view(x, "^[^aiueo]+$", match = TRUE)

# c. Ends with "ed" but not with "eed". The (^|...) alternative also accepts
#    the bare word "ed", which has no preceding character to test.
str_view(x, "(^|[^e])ed$")

# d. Ends with "ing" or "ize".
str_view(x, "ing$|ize$")

2. 「cの後以外はeの前にi」という規則を実際に検証しなさい。

x <- c("abcie", "abie", "abcedei", "abieddd")
str_view(x, "[^c]ie")

3. 「q」の後には常に「u」が続くか。

x <- stringr::words
str_view(x, "q[^u]", match = TRUE)

4. アメリカ英語ではなくイギリス英語で書かれている英単語にマッチする正規表現を書きなさい。

str_view(x, "ou|ise$|ae|oe|yse$|tre$", match = TRUE)

5. 自分の国で普通の書き方の電話番号の正規表現を作りなさい。

tel <- c("042-333-1212", "03-333-1212")
str_view(tel, "(\\d\\d|\\d\\d\\d)-\\d\\d\\d-\\d\\d\\d\\d")

str_view(tel, "(\\d{2}|\\d{3})-\\d{3}-\\d{4}")

11.3.4 繰り返し

x <- "numerals: MDCCCLXXXVIII"
str_view(x, "CC?")

str_view(x, "CC+")

str_view(x, "C[LX]+")

str_view(x, "C{2}")

str_view(x, "C{2,}")

str_view(x, "C{2,3}")

str_view(x, "C{2,3}?")

str_view(x, "C[LX]+?")

練習問題 p.178

1. ?,+,*と等価な{m,n}形式を示しなさい。

str_view(x, "C?")

str_view(x, "C{0,1}")

str_view(x, "C+")

str_view(x, "C{1,}")

str_view(x, "C*")

str_view(x, "C{0,}")

2. 次に示す正規表現とマッチするものを言葉で述べなさい。

str_view("abcdefg", "^.*$") #すべての文字列

str_view("{a}", "\\{.+\\}") #{}に囲まれた1文字以上の文字列

str_view("1111-22-33", "\\d{4}-\\d{2}-\\d{2}") #1111-22-33の形式

str_view("ab\\\\\\\\cd", "\\\\{4}") #\が４つ

3. 次のような語を探し出す正規表現を作りなさい。

# a. Starts with three consonants.
str_view("bcde", "^[^aiueo]{3}")

# b. Has three or more vowels in a row, anywhere in the word. No ^ anchor:
#    the run of vowels does not have to be at the start.
str_view("aaaiueo", "[aiueo]{3,}")

# c. Has two or more vowel-consonant pairs in a row.
str_view("abicuc", "([aiueo][^aiueo]){2,}")

11.3.5 グループ化と後方参照

str_view(fruit, "(..)\\1", match = TRUE)

練習問題 p.179

1. 次の正規表現が何とマッチするか言葉で述べなさい

# a. 同じ文字が3回連続の場合マッチ
str_view(c("appple", "apple", "baaanana"), "(.)\\1\\1", match = TRUE)

# b. 2文字と順番逆の2文字にマッチ
str_view(c("abba", "kklaalkkk", "baaanana", "mmmm"), "(.)(.)\\2\\1", match = TRUE)

# c. 同じ2文字が連続する場合マッチ
str_view(c("abba", "bananan", "baaanana", "mmmm"), "(..)\\1", match = TRUE)

# d. ある文字＋任意の1文字＋最初と同じ文字＋任意の1文字＋最初と同じ文字
str_view(c("abaka", "bananan", "baaanana", "mmmm"), "(.).\\1.\\1", match = TRUE)

# e. 3文字＋任意の文字列＋その3文字を逆順に並べたもの
str_view(c("abcdkkkkkcba", "zyabcdkkkkkcbaxz", "abccba", "mmmm"), "(.)(.)(.).*\\3\\2\\1", match = TRUE)

2. 次のような語にマッチする正規表現を作りなさい

# a. 先頭と末尾が同じ文字
str_view(c("appplea", "aa", "aba", "apple", "baaanana"), "^(.).*\\1$", match = TRUE)

# b. 文字対の繰り返しがある（churchではchを2回繰り返す）
str_view(c("church", "akauoakauo", "baba", "aba", "apple", "baaanana"), "(..).*\\1", match = TRUE)

# c. １つの文字が少なくとも3回繰り返される（elevenには３つのeがある）
str_view(c("appplea", "eleven", "aba", "apple", "baaanana"), "(.).*\\1.*\\1", match = TRUE)

11.4 ツール

11.4.1 マッチの可否

x <- c("apple", "banana", "pear")
str_detect(x, "e")

## [1]  TRUE FALSE  TRUE

# 一般単語でtで始まる単語の個数は
sum(str_detect(words, "^t"))

## [1] 65

# 一般単語で母音で終わる単語の割合は
mean(str_detect(words, "[aeiou]$"))

## [1] 0.2765306

# 少なくとも母音を1つ含む単語をすべて探し出して、補集合を取る
no_vowels_1 <- !str_detect(words, "[aeiou]") #aeiouが含まれてない文字列
# 子音（非母音）だけからなる単語をすべて探し出す
no_vowels_2 <- str_detect(words, "^[^aeiou]+$") #aeiouではない文字から始まり、それで終わる
identical(no_vowels_1, no_vowels_2)

## [1] TRUE

words[str_detect(words, "x$")]

## [1] "box" "sex" "six" "tax"

str_subset(words, "x$")

## [1] "box" "sex" "six" "tax"

library(tibble)
# Keep every word next to its position so the index survives filtering.
df <- tibble(
  word = words,
  i = seq_along(word)
)
# Filter on the `word` column of df, not the global `words` vector: the two
# only line up while df is unfiltered and `words` has not been modified.
df %>% filter(str_detect(word, "x$"))

## # A tibble: 4 x 2
##   word      i
##   <chr> <int>
## 1 box     108
## 2 sex     747
## 3 six     772
## 4 tax     841

x <- c("apple","banana","pear")
str_count(x, "a")

## [1] 1 3 1

mean(str_count(words, "[aeiou]"))

## [1] 1.991837

library(dplyr)
df %>% 
  mutate(
    vowels = str_count(word, "[aeiou]"),
    consonants = str_count(word, "[^aeiou]")
  )

## # A tibble: 980 x 4
##    word         i vowels consonants
##    <chr>    <int>  <int>      <int>
##  1 a            1      1          0
##  2 able         2      2          2
##  3 about        3      3          2
##  4 absolute     4      4          4
##  5 accept       5      2          4
##  6 account      6      3          4
##  7 achieve      7      4          3
##  8 across       8      2          4
##  9 act          9      1          2
## 10 active      10      3          3
## # ... with 970 more rows

str_count("abababa", "aba")

## [1] 2

str_view_all("abababa", "aba")

練習問題 p.183

1. 次の各問題について、単一の正規表現を使った解と複数の呼び出しを組み合わせた解の両方を求めなさい

# a. Find all words that start OR end with x.
x <- c("xax", "xaa", "aax")
# Single regular expression: either anchor may match.
str_view(x, "^x|x$", match = TRUE)

# Multiple calls: test each end separately and combine with a logical OR.
starts_with_x <- str_detect(x, "^x")
ends_with_x <- str_detect(x, "x$")
x[starts_with_x | ends_with_x]

## [1] "xax" "xaa" "aax"

# b. 先頭が母音で末尾が子音の全単語を探し出す
x <- c("axab", "xaa", "aax")
str_view(x, "^[aeiou].*[^aeiou]$", match = TRUE)

tmp_x <- x[str_detect(x, "^[aeiou]")]
str_view(tmp_x, "[^aeiou]$", match = TRUE)

# c. 異なる母音をそれぞれ少なくとも1つ含む単語はあるか
words[str_detect(words, "a") &
        str_detect(words, "e") &
        str_detect(words, "i") &
        str_detect(words, "o") &
        str_detect(words, "u")]

## character(0)

test_words <- c("abeicoub", "abeicb")
test_words[str_detect(test_words, "a") &
        str_detect(test_words, "e") &
        str_detect(test_words, "i") &
        str_detect(test_words, "o") &
        str_detect(test_words, "u")]

## [1] "abeicoub"

# d. 母音数が一番多い単語は何か。母音の割合が最も多い単語は何か（ヒント、分母は何か）
x <- str_count(words, "[aeiou]")
words[which(x == max(x))]

## [1] "appropriate" "associate"   "available"   "colleague"   "encourage"  
## [6] "experience"  "individual"  "television"

x <- str_count(words, "[aeiou]") / str_length(words)
words[which(x == max(x))]

## [1] "a"

# Leave out the one-letter word "a" (trivially 100% vowels) and look again.
# A separate vector is used so that `words` is not overwritten, and logical
# subsetting is used because x[-which(cond)] returns an empty vector when
# nothing matches.
words_no_a <- words[words != "a"]
x <- str_count(words_no_a, "[aeiou]") / str_length(words_no_a)
words_no_a[which(x == max(x))]

## [1] "area" "idea"

11.4.2 マッチの抽出

length(sentences)

## [1] 720

head(sentences)

## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."

# 色名の正規表現作成
colors <- c("red", "orange", "yellow", "green", "blue", "purple")
color_match <- str_c(colors, collapse = "|")
color_match

## [1] "red|orange|yellow|green|blue|purple"

# 文章から正規表現でマッチした色を抽出
has_color <- str_subset(sentences, color_match)
matches <- str_extract(has_color, color_match)
head(matches)

## [1] "blue" "blue" "red"  "red"  "red"  "blue"

more <- sentences[str_count(sentences, color_match) > 1]
str_view_all(more, color_match)

str_extract(more, color_match)

## [1] "blue"   "green"  "orange"

str_extract_all(more, color_match)

## [[1]]
## [1] "blue" "red" 
## 
## [[2]]
## [1] "green" "red"  
## 
## [[3]]
## [1] "orange" "red"

str_extract_all(more, color_match, simplify = TRUE)

##      [,1]     [,2] 
## [1,] "blue"   "red"
## [2,] "green"  "red"
## [3,] "orange" "red"

x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)

##      [,1] [,2] [,3]
## [1,] "a"  ""   ""  
## [2,] "a"  "b"  ""  
## [3,] "a"  "b"  "c"

練習問題 p.185

1. 以前の例題で、マッチした正規表現が色ではないのにflickered（語尾のredを色と勘違い）とマッチしてしまったことに気付いたはずだ。この問題を修正するように正規表現を変更しなさい

color_match2 <- str_c("\\b(", color_match, ")\\b")
color_match2

## [1] "\\b(red|orange|yellow|green|blue|purple)\\b"

str_view_all(more, color_match2)

2. Harvard英語コーパスから次の抽出を行いなさい

# a. 各行の先頭の語
sentences %>% head()

## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."

str_extract(sentences, "^[a-zA-Z]+") %>% head()

## [1] "The"   "Glue"  "It"    "These" "Rice"  "The"

# b. All words ending in "ing". The \\b anchors force each match to be a
#    whole word, so "stockings" no longer yields "stocking".
#    str_extract_all() returns character(0) for sentences without a match and
#    unlist() drops those, so no str_detect() pre-filter is needed.
str_extract_all(sentences, "\\b[A-Za-z]+ing\\b") %>% unlist() %>% head(5)

## [1] "spring"  "evening" "morning" "winding" "living"

# c. 複数を表す語
str_extract_all(sentences, "\\b[A-Za-z]{3,}s\\b") %>% head()

## [[1]]
## [1] "planks"
## 
## [[2]]
## character(0)
## 
## [[3]]
## character(0)
## 
## [[4]]
## [1] "days"
## 
## [[5]]
## [1] "bowls"
## 
## [[6]]
## [1] "lemons" "makes"

11.4.3 グループのマッチ

noun <- "(a|the) ([^ ]+)"

has_noun <- sentences %>%
  str_subset(noun) %>%
  head(10)

has_noun %>% 
  str_extract(noun)

##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"

has_noun %>% 
  str_match(noun)

##       [,1]         [,2]  [,3]     
##  [1,] "the smooth" "the" "smooth" 
##  [2,] "the sheet"  "the" "sheet"  
##  [3,] "the depth"  "the" "depth"  
##  [4,] "a chicken"  "a"   "chicken"
##  [5,] "the parked" "the" "parked" 
##  [6,] "the sun"    "the" "sun"    
##  [7,] "the huge"   "the" "huge"   
##  [8,] "the ball"   "the" "ball"   
##  [9,] "the woman"  "the" "woman"  
## [10,] "a helps"    "a"   "helps"

tibble(sentence = sentences) %>% 
  tidyr::extract(
    sentence, c("article", "noun"), "(a|the) ([^ ]+)", 
    remove = FALSE
  )

## # A tibble: 720 x 3
##    sentence                                    article noun   
##  * <chr>                                       <chr>   <chr>  
##  1 The birch canoe slid on the smooth planks.  the     smooth 
##  2 Glue the sheet to the dark blue background. the     sheet  
##  3 It's easy to tell the depth of a well.      the     depth  
##  4 These days a chicken leg is a rare dish.    a       chicken
##  5 Rice is often served in round bowls.        <NA>    <NA>   
##  6 The juice of lemons makes fine punch.       <NA>    <NA>   
##  7 The box was thrown beside the parked truck. the     parked 
##  8 The hogs were fed chopped corn and garbage. <NA>    <NA>   
##  9 Four hours of steady work faced us.         <NA>    <NA>   
## 10 Large size in stockings is hard to sell.    <NA>    <NA>   
## # ... with 710 more rows

練習問題 p.187

1. one, two, threeのようにnumberの後に続くすべての単語を探し出しなさい。numberと単語の両方を取り出しなさい。

# Pattern: a number word (one..ten), one space, then a run of non-space
# characters; group 1 is the number, group 2 the following word.
# NOTE(review): there is no word-boundary anchor before the alternation, so
# a number word is also found inside a longer word. The first result below,
# "ten served", appears to come from "often served". Prefixing the pattern
# with \\b would restrict it to whole words (the output would then differ).
numword <- "(one|two|three|four|five|six|seven|eight|nine|ten) ([^ ]+)"
# Keep the first ten sentences that contain a match.
has_numword <- sentences %>%
  str_subset(numword) %>%
  head(10)
# Extract the first "number word" pair from each of those sentences.
has_numword %>% str_extract(numword)

##  [1] "ten served"  "one over"    "seven books" "two met"     "two factors"
##  [6] "one and"     "three lists" "seven is"    "two when"    "one floor."

2. アポストロフィを使った短縮形をすべて探そう。アポストロフィの前後の部分を分けなさい。

apos_word <- "([A-Za-z]+)'([A-Za-z]+)"
has_apos_word <- sentences %>%
  str_subset(apos_word) %>%
  head(10)
has_apos_word %>% str_extract(apos_word)

##  [1] "It's"      "man's"     "don't"     "store's"   "workmen's"
##  [6] "Let's"     "sun's"     "child's"   "king's"    "It's"

sprit_str <- has_apos_word %>% str_extract(apos_word)
str_split(sprit_str, "'")

## [[1]]
## [1] "It" "s" 
## 
## [[2]]
## [1] "man" "s"  
## 
## [[3]]
## [1] "don" "t"  
## 
## [[4]]
## [1] "store" "s"    
## 
## [[5]]
## [1] "workmen" "s"      
## 
## [[6]]
## [1] "Let" "s"  
## 
## [[7]]
## [1] "sun" "s"  
## 
## [[8]]
## [1] "child" "s"    
## 
## [[9]]
## [1] "king" "s"   
## 
## [[10]]
## [1] "It" "s"

11.4.4 置換マッチ

x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")

## [1] "-pple"  "p-ar"   "b-nana"

str_replace_all(x, "[aeiou]", "-")

## [1] "-ppl-"  "p--r"   "b-n-n-"

x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))

## [1] "one house"    "two cars"     "three people"

pattern <- "([^ ]+) ([^ ]+) ([^ ]+)"
has_pattern <- sentences %>%
  str_subset(pattern) %>%
  head(5)
has_pattern

## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."

sentences %>% 
  str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>% 
  head(5)

## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."

練習問題 p.188

1. 文字列のスラッシュをすべてバックスラッシュで置き換える。

x <- "aaa/bbb/ccc/ddd"
xreplace <- str_replace_all(x, "\\/", "\\\\")
writeLines(xreplace)

## aaa\bbb\ccc\ddd

2. replace_all()を使って、str_to_lower()の単純版を実装しなさい。

x <- "ABCD"
str_replace_all(x, c("A" = "a", "B" = "b", "C" = "c", "D" = "d"))

## [1] "abcd"

3. wordsの先頭と最後の文字を置き換えなさい。できた文字列でwordsにあるのはどれか。

# Sample word list for this exercise.
# NOTE: this assignment masks stringr::words for the rest of the session.
words <- c("apple", "and", "thus", "banana", "orange", "dna", "eagle", "grape", "shut")
# Swap the first and last letter of every word. The middle group may be
# empty, so two-letter words are handled as well.
ex_words <- str_replace(words, "(^[a-zA-Z])([a-zA-Z]*)([a-zA-Z]$)", "\\3\\2\\1")
ex_words

## [1] "eppla"  "dna"    "shut"   "aananb" "erango" "and"    "eagle"  "erapg" 
## [9] "thus"

# Which swapped strings are themselves in `words`? intersect() compares whole
# strings; matching against an alternation pattern would also accept
# partial (substring) hits.
intersect(ex_words, words)

## [1] "dna"   "shut"  "and"   "eagle" "thus"

11.4.5 分割

sentences %>%
  head(5) %>% 
  str_split(" ")

## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."

"a|b|c|d" %>% 
  str_split("\\|") %>% 
  .[[1]]

## [1] "a" "b" "c" "d"

c("a|b|c|d", "e|f|g") %>% 
  str_split("\\|") %>% 
  .[[2]]

## [1] "e" "f" "g"

sentences %>%
  head(5) %>% 
  str_split(" ", simplify = TRUE)

##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]    
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth"
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"  
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"    
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"     
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls."
##      [,8]          [,9]   
## [1,] "planks."     ""     
## [2,] "background." ""     
## [3,] "a"           "well."
## [4,] "rare"        "dish."
## [5,] ""            ""

sentences %>%
  head(5) %>% 
  str_split(" ", simplify = TRUE) %>% 
  .[[1,2]]

## [1] "birch"

fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>% str_split(": ", n = 2, simplify = TRUE)

##      [,1]      [,2]    
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"    
## [3,] "Age"     "35"

x <- "This is a sentence. This is another sentence."
str_view_all(x, boundary("word"))

str_split(x, " ")[[1]]

## [1] "This"      "is"        "a"         "sentence." "This"      "is"       
## [7] "another"   "sentence."

str_split(x, boundary("word"))[[1]]

## [1] "This"     "is"       "a"        "sentence" "This"     "is"      
## [7] "another"  "sentence"

練習問題 p.190

1. “apples, pears, and bananas”のような文字列を要素に分割しなさい。

x <- "apples, pears, and bananas"
str_split(x, ", and |, ")

## [[1]]
## [1] "apples"  "pears"   "bananas"

2. " "よりもboundary("word")で分割した方がなぜよいのか。

x <- "This is a sentence. This is another sentence."
str_split(x, " ")[[1]]

## [1] "This"      "is"        "a"         "sentence." "This"      "is"       
## [7] "another"   "sentence."

str_split(x, boundary("word"))[[1]]

## [1] "This"     "is"       "a"        "sentence" "This"     "is"      
## [7] "another"  "sentence"

3. 空文字列(“”)で分割すると何が起こるか。実験して、ドキュメントを読みなさい。

str_split(x, "")[[1]]

##  [1] "T" "h" "i" "s" " " "i" "s" " " "a" " " "s" "e" "n" "t" "e" "n" "c"
## [18] "e" "." " " "T" "h" "i" "s" " " "i" "s" " " "a" "n" "o" "t" "h" "e"
## [35] "r" " " "s" "e" "n" "t" "e" "n" "c" "e" "."

11.5 他の種類のパターン

str_view(fruit, "nana", match = TRUE)

str_view(fruit, regex("nana"), match = TRUE)

bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")

str_view(bananas, regex("banana", ignore_case = TRUE))

x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, "^Line")[[1]]

## [1] "Line"

str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]

## [1] "Line" "Line" "Line"

# With comments = TRUE the regex engine ignores unescaped white space and
# everything after "#", so a literal space must be written as "\\ ".
# The last group takes four digits so the full number is captured instead
# of being cut off after "814".
phone <- regex("\\(?    # オプションの開き括弧
               (\\d{3}) # エリア番号
               [)\\ -]? # オプションの閉じ括弧、空白、ダッシュ
               (\\d{3}) # 3桁の番号
               [\\ -]?  # オプションの空白かダッシュ
               (\\d{4}) # 4桁の番号
               ", comments = TRUE)

str_match("514-791-8141", phone)

##      [,1]           [,2]  [,3]  [,4]  
## [1,] "514-791-8141" "514" "791" "8141"

#install.packages("microbenchmark")
microbenchmark::microbenchmark(
  fixed = str_detect(sentences, fixed("the")),
  regex = str_detect(sentences, "the"),
  times = 20
)

## Unit: microseconds
##   expr     min      lq     mean   median       uq     max neval
##  fixed 117.722 121.947 137.8548 124.6640 141.1145 306.075    20
##  regex 249.328 254.307 263.8612 255.9675 256.8735 397.836    20

a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)

## [1] "a"  "a<U+0301>"

a1 == a2

## [1] FALSE

str_detect(a1, fixed(a2))

## [1] FALSE

str_detect(a1, coll(a2))

## [1] TRUE

i <- c("I", "İ", "i", "ı")
i

## [1] "I"        "<U+0130>" "i"        "<U+0131>"

str_subset(i, coll("i", ignore_case = TRUE))

## [1] "I" "i"

str_subset(i, coll("i", ignore_case = TRUE, locale = "tr"))

## [1] "i"

library(stringi)
stringi::stri_locale_info()

## $Language
## [1] "ja"
## 
## $Country
## [1] "JP"
## 
## $Variant
## [1] ""
## 
## $Name
## [1] "ja_JP"

x <- "This is a sentence."
str_view_all(x, boundary("word"))

str_extract_all(x, boundary("word"))

## [[1]]
## [1] "This"     "is"       "a"        "sentence"

練習問題 p.193

1. \を含む全文字列を探し出すのに、regex()とfixed()とでそれぞれどのようにすればよいか。

str_subset(c("a\\b", "ab"), "\\\\")

## [1] "a\\b"

str_subset(c("a\\b", "ab"), fixed("\\"))

## [1] "a\\b"

2. sentencesで最も多く使われている単語を5つ示しなさい。

library(tibble)
library(tidyverse)
# Split every sentence into words, lower-case them so that "The" and "the"
# are tallied together, then count and keep the five most frequent.
# count(word, ...) does the grouping itself and returns an ungrouped tibble,
# so no group_by() is left dangling on the result.
str_extract_all(sentences, boundary("word")) %>%
  unlist() %>%
  str_to_lower() %>%
  tibble(word = .) %>%
  count(word, sort = TRUE) %>%
  head(5)

## # A tibble: 5 x 2
##   word      n
##   <chr> <int>
## 1 the     751
## 2 a       202
## 3 of      132
## 4 to      123
## 5 and     118

11.6 正規表現の別の用途

apropos("replace")

##  [1] "%+replace%"                   "replace"                     
##  [3] "replace_na"                   "setReplaceMethod"            
##  [5] "str_replace"                  "str_replace_all"             
##  [7] "str_replace_na"               "stri_replace"                
##  [9] "stri_replace_all"             "stri_replace_all_charclass"  
## [11] "stri_replace_all_coll"        "stri_replace_all_fixed"      
## [13] "stri_replace_all_regex"       "stri_replace_first"          
## [15] "stri_replace_first_charclass" "stri_replace_first_coll"     
## [17] "stri_replace_first_fixed"     "stri_replace_first_regex"    
## [19] "stri_replace_last"            "stri_replace_last_charclass" 
## [21] "stri_replace_last_coll"       "stri_replace_last_fixed"     
## [23] "stri_replace_last_regex"      "stri_replace_na"             
## [25] "theme_replace"                "xreplace"

head(dir(pattern = "\\.Rmd$"))

## [1] "chapter11.Rmd"       "R_book_korean.Rmd"   "sample_markdown.Rmd"

練習問題 p.194

1. 次のstringi関数を探し出しなさい。

# a. 単語数を数える
x <- "this is a test. this is a sample test for stringi functions"
stri_count_words(x)

## [1] 12

# b. 重複した文字列を探し出す
x <- c("this is test1.", "this is test2.", "this is test3.", "this is test1.")
stri_duplicated(x)

## [1] FALSE FALSE FALSE  TRUE

# Tidyverse alternative: tally each (lower-cased) sentence and keep those
# that occur more than once. count(sentence, ...) returns an ungrouped
# tibble, so the following filter() works on the whole table.
x %>%
  str_to_lower() %>%
  tibble(sentence = .) %>%
  count(sentence, sort = TRUE) %>%
  filter(n > 1) %>%
  head(5)

## # A tibble: 1 x 2
##   sentence           n
##   <chr>          <int>
## 1 this is test1.     2

# c. ランダムな文章を生成する
apropos("stri_rand")

## [1] "stri_rand_lipsum"  "stri_rand_shuffle" "stri_rand_strings"

stri_rand_lipsum(2) # dummy text

## [1] "Lorem ipsum dolor sit amet, conubia odio dignissim laoreet volutpat. Venenatis elementum magnis fusce donec malesuada, ridiculus justo quis bibendum morbi. Litora ac pellentesque quisque ac eget purus. Nulla amet. Sapien ornare quam turpis ante sed interdum in dapibus blandit curabitur sed. Tempor urna diam vel hendrerit vivamus mi consectetur, nascetur nulla sed? Hac sed bibendum justo nulla amet vitae, inceptos eros sapien phasellus nulla ac. Nec sed interdum sit finibus donec! Consectetur at eu donec senectus ullamcorper ac et. Porta habitasse vel erat amet risus consectetur. Ligula etiam mauris ut lectus porta lectus. Eros vestibulum penatibus, lobortis platea, ac duis tincidunt vitae leo. A ut vel in orci varius magna nam. Class tempus ad."                                                                                                                                                                                                                                            
## [2] "Est nisi, habitasse quis feugiat, tempor ut enim lobortis ligula libero enim mauris. Ullamcorper urna magna hac class, et. Dolor accumsan etiam egestas lobortis, sem malesuada, pulvinar leo laoreet. Metus, ipsum ut quam rhoncus ac in donec nam. Ultricies non vitae id. Tristique euismod ut dictumst class nibh posuere, erat pellentesque. Blandit diam at mauris est habitant nulla auctor. Cubilia in lacinia erat semper iaculis. Dapibus eros placerat laoreet mus iaculis tempus. Eu amet primis odio habitant inceptos, ac sociosqu diam. Bibendum efficitur integer in vestibulum erat bibendum sit nulla dignissim in et. Sed, pellentesque adipiscing, aliquam fusce non a. Odio phasellus massa, sed dictum donec cum habitasse dolor. Vel ligula consequat inceptos ultricies semper vel mollis quam at. Diam, scelerisque nec neque, tellus venenatis vel dignissim, ac hac nunc amet. Feugiat dolor viverra imperdiet torquent faucibus. Blandit, dictum non vestibulum, risus venenatis vel nunc ac, non."

n <- 10
stri_rand_shuffle(stri_paste(
  stri_rand_strings(n, 1, '[0-9]'),
  stri_rand_strings(n, 1, '[a-z]'),
  stri_rand_strings(n, 1, '[A-Z]'),
  stri_rand_strings(n, sample(5:11, 5, replace=TRUE), '[a-zA-Z0-9]')
))

##  [1] "pb0wTRphUZ"     "7T0SCo5AyKN3Ot" "Gy718CRJK4SiUr" "Sme9mQhwwL"    
##  [5] "w6bbwQ3wBB"     "Vj9OF2ey8P"     "KIDMikz1YcVVzv" "jkLqt29WJETcBp"
##  [9] "Hk4XQPbs6B"     "S3jEYYOdQ7"

2. stri_sort()が整列に使う言語をどう制御すればよいか。

stri_sort(c("hladny", "chladny", "bubble"), locale="ja_JP")

## [1] "bubble"  "chladny" "hladny"

stri_sort(c("hladny", "chladny", "bubble"), locale="sk_SK")

## [1] "bubble"  "hladny"  "chladny"

R4DS <11章Stringrによる文字列>

11.1 はじめに

11.1.1 準備

11.2 文字列の基本

11.2.1 文字列の長さ

11.2.2 文字列の連結

11.2.3 文字列の一部抽出

11.2.4 ロケール

練習問題 p.173

1. stringrを使わないコードでは、paste()とpaste0()をよく使う。この２つの関数は何が違うか。等価なstringrの関数は何か。NAの処理ではどこが違うか。

2. str_c()の引数sepとcollapseとの違いを自分の言葉で述べなさい。

3. str_length()とstr_sub()を用いて、文字列の中央の文字を抜き出しなさい。文字列の文字が偶数個の場合には、どうするか。

4. str_wrap()は何をするか。何に使うか。

5. str_trim()は何をするか。str_trim()の逆操作は何か。

6. 例えば、ベクトルc("a", "b", "c")を文字列「a, b, and c」にする関数を書きなさい。ベクトルの長さが0,1,2のときにどうすべきか、よく考えなさい。

11.3 正規表現でパターンマッチ

練習問題 p.175

1. 文字列"\"、"\\"、"\\\"のそれぞれが、なぜ\とマッチしないか説明しなさい。

2. 文字列「“’\」とはどうマッチするか。

3. 正規表現\..\..\..はどのようなパターンとマッチするか。文字列でどう表すか。

11.3.2 アンカー

練習問題 p.176

1. 文字列“$^$”とのマッチはどう行うか。

2. 頻出単語のコーパスがstringr::wordsにある。次のような単語をすべて探し出す正規表現を作りなさい。

11.3.3 文字のクラスと候補

練習問題 p.177

1. 次の語を探す正規表現を作りなさい。

2. 「cの後以外はeの前にi」という規則を実際に検証しなさい。

3. 「q」の後には常に「u」が続くか。

4. アメリカ英語ではなくイギリス英語で書かれている英単語にマッチする正規表現を書きなさい。

5. 自分の国で普通の書き方の電話番号の正規表現を作りなさい。

11.3.4 繰り返し

練習問題 p.178

1. ?,+,*と等価な{m,n}形式を示しなさい。

2. 次に示す正規表現とマッチするものを言葉で述べなさい。

3. 次のような語を探し出す正規表現を作りなさい。

11.3.5 グループ化と後方参照

練習問題 p.179

1. 次の正規表現が何とマッチするか言葉で述べなさい

2. 次のような語にマッチする正規表現を作りなさい

11.4 ツール

11.4.1 マッチの可否

練習問題 p.183

1. 次の各問題について、単一の正規表現を使った解と複数の呼び出しを組み合わせた解の両方を求めなさい

11.4.2 マッチの抽出

練習問題 p.185

1. 以前の例題で、マッチした正規表現が色ではないのにflickered（語尾のredを色と勘違い）とマッチしてしまったことに気付いたはずだ。この問題を修正するように正規表現を変更しなさい

2. Harvard英語コーパスから次の抽出を行いなさい

11.4.3 グループのマッチ

練習問題 p.187

1. one, two, threeのようにnumberの後に続くすべての単語を探し出しなさい。numberと単語の両方を取り出しなさい。

2. アポストロフィを使った短縮形をすべて探そう。アポストロフィの前後の部分を分けなさい。

11.4.4 置換マッチ

練習問題 p.188

1. 文字列のスラッシュをすべてバックスラッシュで置き換える。

2. replace_all()を使って、str_to_lower()の単純版を実装しなさい。

3. wordsの先頭と最後の文字を置き換えなさい。できた文字列でwordsにあるのはどれか。

11.4.5 分割

練習問題 p.190

1. “apples, pears, and bananas”のような文字列を要素に分割しなさい。

2. " "よりもboundary("word")で分割した方がなぜよいのか。

3. 空文字列(“”)で分割すると何が起こるか。実験して、ドキュメントを読みなさい。

11.5 他の種類のパターン

練習問題 p.193

1. \を含む全文字列を探し出すのに、regex()とfixed()とでそれぞれどのようにすればよいか。

2. sentencesで最も多く使われている単語を5つ示しなさい。

11.6 正規表現の別の用途

練習問題 p.194

1. 次のstringi関数を探し出しなさい。

2. stri_sort()が整列に使う言語をどう制御すればよいか。

11.1　はじめに