# Chapter 11: Strings with stringr

# 「R for Data Science」 http://r4ds.had.co.nz/strings.html

# 「r4ds-exercise-solutions/strings.Rmd at master · jrnold/r4ds-exercise-solutions」 https://github.com/jrnold/r4ds-exercise-solutions/blob/master/strings.Rmd
# 「Exercise Solutions to R for Data Science」 https://jrnold.github.io/r4ds-exercise-solutions/strings.html

# 「RPubs - r4ds_ch11」 http://rpubs.com/tocci36/r4ds_ch11

# 11.1 Introduction p169

# 11.1.1 Prerequisites

library(tidyverse)
## Loading tidyverse: ggplot2
## Loading tidyverse: tibble
## Loading tidyverse: tidyr
## Loading tidyverse: readr
## Loading tidyverse: purrr
## Loading tidyverse: dplyr
## Conflicts with tidy packages ----------------------------------------------
## filter(): dplyr, stats
## lag():    dplyr, stats
library(stringr)

# 11.2 String Basics
string1 <- "This is a string"
string2 <- 'To put a "quote" inside a string, use single quotes'

# "This is a string without a closing quote
# "

x <- "\u00b5"
x # "µ"
## [1] "µ"
c("one", "two", "three")
## [1] "one"   "two"   "three"
# 11.2.1 String Length
str_length(c("a", "R for data science", NA))
## [1]  1 18 NA
str_length("")
## [1] 0
# str_length()
# Error in stri_length(string) : 
#   argument "string" is missing, with no default

# 11.2.2 Combining Strings
str_c("x", "y")
## [1] "xy"
str_c("x", "y", "z")
## [1] "xyz"
str_c("x", "y", sep = ", ")
## [1] "x, y"
x <- c("abc", NA)
str_c("|-", x, "-|")
## [1] "|-abc-|" NA
str_c("|-", str_replace_na(x), "-|")
## [1] "|-abc-|" "|-NA-|"
str_c("prefix-", c("a", "b", "c"), "-suffix")
## [1] "prefix-a-suffix" "prefix-b-suffix" "prefix-c-suffix"
name <- "Hadley"
time_of_day <- "morning"
birthday <- FALSE
str_c(
  "Good ", time_of_day, " ", name,
  if (birthday) " and HAPPY BIRTHDAY",
  "."
)
## [1] "Good morning Hadley."
str_c(c("x", "y", "z"), collapse = ", ")
## [1] "x, y, z"
#?str_c

# 11.2.3 Subsetting Strings p172
x <- c("Apple", "Banana", "Pear")
str_sub(x, 1, 3)
## [1] "App" "Ban" "Pea"
# Negative numbers count back from the end of the string
str_sub(x, -3, -1)
## [1] "ple" "ana" "ear"
str_sub("a", 1, 5)
## [1] "a"
str_sub(x, 1, 1) <- str_to_lower(str_sub(x, 1, 1))
x
## [1] "apple"  "banana" "pear"
# 11.2.4 Locales
#?str_to_title
dog <- "The quick brown dog"
str_to_upper(dog)
## [1] "THE QUICK BROWN DOG"
str_to_lower(dog)
## [1] "the quick brown dog"
str_to_title(dog)
## [1] "The Quick Brown Dog"
str_to_upper(c("i", "ı"))
## [1] "I" "I"
str_to_upper(c("i", "ı"), locale = "tr")
## [1] "İ" "I"
x <- c("apple", "eggplant", "banana")
str_sort(x, locale = "en") # English
## [1] "apple"    "banana"   "eggplant"
str_sort(x, locale = "haw") # Hawaiian
## [1] "apple"    "eggplant" "banana"
# Exercises p173 (String Basics)
# 1. In code that doesn't use stringr, you'll often see paste() and paste0(). What's the
# difference between the two functions? What stringr function are they equivalent to?
# How do the functions differ in their handling of NA?
paste("foo", "bar")
## [1] "foo bar"
paste0("foo", "bar")
## [1] "foobar"
# Since str_c does not separate strings with spaces by default it is closer in behavior to paste0.
str_c("foo", "bar")
## [1] "foobar"
# However, str_c and paste handle NA differently. str_c propagates NA: if any argument
# is a missing value, the result is a missing value, in line with how numeric functions
# such as sum and mean handle missing values. The paste functions instead convert NA to
# the string "NA" and then treat it like any other character vector.

str_c("foo", NA)
## [1] NA
paste("foo", NA)
## [1] "foo NA"
paste0("foo", NA)
## [1] "fooNA"
#?paste

# 2. In your own words, describe the difference between the sep and collapse arguments to str_c().
#?str_c
# The sep argument is the string inserted between arguments to str_c(), while collapse is
# the string used to join the elements of a character vector when collapsing them into a
# single string of length one.
str_c(c("x", "y", "z"), collapse = ", ") # ◯
## [1] "x, y, z"
str_c("x", "y", "z", collapse = ", ") # ▲
## [1] "xyz"
str_c(c("x", "y", "z"), sep = ", ") # ×
## [1] "x" "y" "z"
str_c("x", "y", "z", sep = ", ") # ◯
## [1] "x, y, z"
# 3. Use str_length() and str_sub() to extract the middle character from a string.
# What will you do if the string has an even number of characters?

# The following function extracts the middle character.
# If the string has an even number of characters the choice is arbitrary.
# We choose to select $\lceil n / 2 \rceil$, because that case works even if the string is only of length one. A more general method would allow the user to select either the floor or ceiling for the middle character of an even string.
#?str_length
#?str_sub

#x <- "12345"
#x <- "1234"
x <- c("a", "abc", "abcd", "abcde", "abcdef")
str_length(x)
## [1] 1 3 4 5 6
str_length(x)/2
## [1] 0.5 1.5 2.0 2.5 3.0
ceiling(str_length(x)/2)
## [1] 1 2 2 3 3
str_sub(x, ceiling(str_length(x)/2), ceiling(str_length(x)/2))
## [1] "a" "b" "b" "c" "c"
# 4. What does str_wrap() do? When might you want to use it?

# The function str_wrap wraps text so that it fits within a certain width. This is useful for wrapping long strings of text to be typeset.
#?str_wrap
thanks_path <- file.path(R.home("doc"), "THANKS")
thanks <- str_c(readLines(thanks_path), collapse = "\n")
thanks <- word(thanks, 1, 3, fixed("\n\n"))
cat(str_wrap(thanks), "\n")
## R would not be what it is today without the invaluable help of these people,
## who contributed by donating code, bug fixes and documentation: Valerio Aimale,
## Thomas Baier, Henrik Bengtsson, Roger Bivand, Ben Bolker, David Brahm, G"oran
## Brostr"om, Patrick Burns, Vince Carey, Saikat DebRoy, Matt Dowle, Brian D'Urso,
## Lyndon Drake, Dirk Eddelbuettel, Claus Ekstrom, Sebastian Fischmeister, John
## Fox, Paul Gilbert, Yu Gong, Gabor Grothendieck, Frank E Harrell Jr, Torsten
## Hothorn, Robert King, Kjetil Kjernsmo, Roger Koenker, Philippe Lambert, Jan
## de Leeuw, Jim Lindsey, Patrick Lindsey, Catherine Loader, Gordon Maclean, John
## Maindonald, David Meyer, Ei-ji Nakama, Jens Oehlschaegel, Steve Oncley, Richard
## O'Keefe, Hubert Palme, Roger D. Peng, Jose' C. Pinheiro, Tony Plate, Anthony
## Rossini, Jonathan Rougier, Petr Savicky, Guenther Sawitzki, Marc Schwartz, Arun
## Srinivasan, Detlef Steuer, Bill Simpson, Gordon Smyth, Adrian Trapletti, Terry
## Therneau, Rolf Turner, Bill Venables, Gregory R. Warnes, Andreas Weingessel,
## Morten Welinder, James Wettenhall, Simon Wood, and Achim Zeileis. Others have
## written code that has been adopted by R and is acknowledged in the code files,
## including
#?cat
cat(str_wrap(thanks, width = 40), "\n")
## R would not be what it is today
## without the invaluable help of these
## people, who contributed by donating
## code, bug fixes and documentation:
## Valerio Aimale, Thomas Baier, Henrik
## Bengtsson, Roger Bivand, Ben Bolker,
## David Brahm, G"oran Brostr"om, Patrick
## Burns, Vince Carey, Saikat DebRoy,
## Matt Dowle, Brian D'Urso, Lyndon Drake,
## Dirk Eddelbuettel, Claus Ekstrom,
## Sebastian Fischmeister, John Fox, Paul
## Gilbert, Yu Gong, Gabor Grothendieck,
## Frank E Harrell Jr, Torsten Hothorn,
## Robert King, Kjetil Kjernsmo, Roger
## Koenker, Philippe Lambert, Jan de
## Leeuw, Jim Lindsey, Patrick Lindsey,
## Catherine Loader, Gordon Maclean, John
## Maindonald, David Meyer, Ei-ji Nakama,
## Jens Oehlschaegel, Steve Oncley, Richard
## O'Keefe, Hubert Palme, Roger D. Peng,
## Jose' C. Pinheiro, Tony Plate, Anthony
## Rossini, Jonathan Rougier, Petr Savicky,
## Guenther Sawitzki, Marc Schwartz, Arun
## Srinivasan, Detlef Steuer, Bill Simpson,
## Gordon Smyth, Adrian Trapletti, Terry
## Therneau, Rolf Turner, Bill Venables,
## Gregory R. Warnes, Andreas Weingessel,
## Morten Welinder, James Wettenhall, Simon
## Wood, and Achim Zeileis. Others have
## written code that has been adopted by R
## and is acknowledged in the code files,
## including
# The indent argument indents the first line of each paragraph (e.g. indent = 10);
# exdent indents the following lines.
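# A quick look at the indent argument mentioned above (the value 10 is just for illustration):
cat(str_wrap(thanks, width = 40, indent = 10), "\n")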

# 5. What does str_trim() do? What's the opposite of str_trim()?

# The function str_trim trims whitespace from the start and/or end of a string;
# its opposite is str_pad(), which pads a string to a given width.

str_trim(" abc ")
## [1] "abc"
str_trim(" abc ", side = "left")
## [1] "abc "
str_trim(" abc ", side = "right")
## [1] " abc"
str_pad("abc", 10, side="both")
## [1] "   abc    "
str_pad("   abc   ", 10, side="both") # width is the total length of the result
## [1] "   abc    "
str_pad("abc", 10, side="both", pad="-")
## [1] "---abc----"
str_pad("abc", 11, side="both", pad="-")
## [1] "----abc----"
?str_pad


# 6. Write a function that turns, e.g., the vector c("a", "b", "c") into the string a, b, and c.
# Think carefully about what it should do if given a vector of length 0, 1, or 2.
str_commasep <- function(x, sep = ", ", last = ", and ") {
  if (length(x) > 1) {
    str_c(str_c(x[-length(x)], collapse = sep),
          x[length(x)],
          sep = last)
  } else {
    x
  }
}
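# A sketch that treats lengths 0, 1, and 2 explicitly (assumption: character(0) for
# empty input and "a and b" for two elements is the behaviour we want):
str_commasep2 <- function(x, sep = ", ", last = ", and ") {
  n <- length(x)
  if (n == 0) {
    character(0)
  } else if (n == 1) {
    x
  } else if (n == 2) {
    str_c(x[1], x[2], sep = " and ")
  } else {
    str_c(str_c(x[-n], collapse = sep), x[n], sep = last)
  }
}
str_commasep2(character(0))
str_commasep2("a")
str_commasep2(c("a", "b"))
str_commasep2(c("a", "b", "c"))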
# Why do str_c() and str_c(c()) return character(0)? c() is NULL, which str_c()
# treats as no input at all, so in both cases there is nothing to combine and a
# zero-length character vector is returned.
str_c()
## character(0)
str_c(c())
## character(0)
typeof(str_c(c()))
## [1] "character"
str_commasep(c())
## NULL
str_commasep("")
## [1] ""
str_commasep("a")
## [1] "a"
str_commasep(c("a", "b"))
## [1] "a, and b"
str_commasep(c("a", "b", "c"))
## [1] "a, b, and c"
?str_c
?stri_c

# 11.3 Matching Patterns with Regular Expressions

#?str_view

# 11.3.1 Basic Matches
x <- c("apple", "banana", "pear")
str_view(x, "an")
str_view(x, ".a.")
dot <- "\\."
writeLines(dot)
## \.
str_view(c("abc", "a.c", "bef"), "a\\.c")
x <- "a\\b"
writeLines(x)
## a\b
str_view(x, "\\\\")
# Exercises p175 (Matching Patterns and Regular Expressions)
# 1. Explain why each of these strings doesn't match a \: "\", "\\", "\\\".
# Escaping: "\" is not even a complete string (the backslash escapes the closing quote);
# "\\" is a string containing a single backslash, which as a regex is an incomplete escape
# sequence and so is not a valid pattern; "\\\" is again incomplete R syntax.
# Matching a literal backslash requires "\\\\".
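# A small check of the escaping levels; the commented-out calls are expected to fail:
# writeLines("\")          # incomplete string: \ escapes the closing quote
writeLines("\\")           # a one-character string containing \
# str_view("a\\b", "\\")   # invalid pattern: the regex is a lone trailing backslash
str_view("a\\b", "\\\\")   # matches the literal backslash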

# 2. How would you match the sequence "'\ ?
str_view("\"'\\", "\"'\\\\")
# 3. What patterns will the regular expression \..\..\.. match? How would you represent it as a string?
# It matches a dot followed by any character, three times in a row (e.g. ".a.b.c");
# as a string it is written "\\..\\..\\..".
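# A quick check (".a.b.c" is just an illustrative match; "abc.def" has only one dot and should not match):
str_view(c(".a.b.c", "abc.def"), "\\..\\..\\..")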

# 11.3.2 Anchors
x <- c("apple", "banana", "pear")
str_view(x, "^a")
x <- c("apple pie", "apple", "apple cake")
str_view(x, "apple")
str_view(x, "^apple$")
# Exercises p176 (Anchors)
# 1. How would you match the literal string "$^$"?
str_view("$^$", "\\$\\^\\$")
# 2. Given the corpus of common words in stringr::words, create regular expressions
# that find all words that:
# a. Start with "y".
str_view(stringr::words, "^y", match =TRUE)
# b. End with "x".
str_view(stringr::words, "x$", match =TRUE)
# c. Are exactly three letters long (don't cheat by using str_length()!).
str_view(stringr::words, "^...$", match =TRUE)
str_view(stringr::words, "\\b...\\b", match = TRUE)
# d. Have seven letters or more.
str_view(stringr::words, ".......", match =TRUE)
str_view(stringr::words, ".{7}", match =TRUE)
# Since this list is long, use the match argument to str_view() to show only the
# matching (or non-matching) words.

# 11.3.3 Character Classes and Alternatives
str_view(c("grey", "gray"), "gr(e|a)y")
# Exercises p177 (Character Classes and Alternatives)

# 1. Create regular expressions to find all words that:
#  a. Start with a vowel.
str_view(stringr::words, "^[aeiou]", match =TRUE)
#  b. Contain only consonants (hint: think about matching "not"-vowels).
str_view(stringr::words, "^[^aeiou]+$", match =TRUE)
#  c. End with ed, but not with eed.
str_view(stringr::words, "[^e]ed$", match =TRUE)
#  d. End with ing or ize.
str_view(stringr::words, "(ing|ize)$", match =TRUE)
# 2. Empirically verify the rule "i before e except after c".
str_view(stringr::words, "(cei|[^c]ie)", match = TRUE)
str_view(stringr::words, "(cie|[^c]ei)", match = TRUE)
sum(str_detect(stringr::words, "(cei|[^c]ie)"))
## [1] 14
sum(str_detect(stringr::words, "(cie|[^c]ei)"))
## [1] 3
# 3. Is "q" always followed by a "u"?
str_view(stringr::words, "q[^u]", match = TRUE)
str_view(stringr::words, "qu", match = TRUE)
# 4. Write a regular expression that matches words written in British rather than American English.
str_view(stringr::words, "ou|ise$|ae|oe|yse$", match = TRUE)
# 5. Create a regular expression that matches telephone numbers as commonly written in your country (Japanese formats here).
x <- c("01-2345-6789", "1235-2351", "090-2345-6789", "04992-1-2345", "03(1234)5678", "04992(1)2345", "1235-2351")
str_view(x, "^(|0\\d{1,4}[-(])\\d{1,4}[-)]\\d{4}$")
str_view(x, "^(0\\d{1,4}[-(])?\\d{1,4}[-)]\\d{4}$")
# 11.3.4 Repetition
x <- "1888 is the longest year in Roman numerals: MDCCCLXXXVIII"
str_view(x, "CC?")
str_view(x, "CC+")
str_view(x, 'C[LX]+')
str_view(x, "C{2}")
str_view(x, "C{2,}")
str_view(x, "C{2,3}")
str_view(x, 'C{2,3}?')
str_view(x, 'C[LX]+?')
# Exercises p178 (Repetition)
# 1. Describe the equivalents of ?, +, and * in {m,n} form.
# ?:{0,1}
# +:{1,}
# *:{0,}
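# A quick check that the brace forms behave like ?, +, and * on the Roman-numeral example above:
str_view(x, "CC{0,1}")
str_view(x, "CC{1,}")
str_view(x, "C[LX]{0,}")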

# 2. Describe in words what these regular expressions match (read carefully to see
# whether each one is a regular expression or a string that represents one).
#  a. ^.*$
#   any string, including the empty string (an anchored match of the whole string)
#  b. "\\{.+\\}"
#   any string with curly braces around at least one character, e.g. "{abc}"
#  c. \d{4}-\d{2}-\d{2}
#   four digits, a dash, two digits, a dash, two digits -- e.g. a date like 2018-01-01
#  d. "\\\\{4}"
#   four backslashes in a row
str_view("\\\\\\\\\\\\", "\\\\{4}")
# 3. Create regular expressions to find all words that:
#  a. Start with three consonants.
str_view(stringr::words, "^[^aeiou]{3}", match = TRUE)
#  b. Have three or more vowels in a row.
str_view(stringr::words, "[aeiou]{3}", match = TRUE)
#  c. Have two or more vowel-consonant pairs in a row.
str_view(stringr::words, "([aeiou][^aeiou]){2}", match = TRUE)
# 4. Solve the beginner regexp crossword puzzles at
# https://regexcrossword.com/challenges/beginner.
# 1.HE 2.BO 3.OO 4.** 5.19
#   LP   BE   OO   //   84

# 11.3.5 Grouping and Backreferences
str_view(fruit, "(..)\\1", match = TRUE)
# Exercises p179 (Grouping and Backreferences)
# 1. Describe, in words, what these expressions will match.
#  a. (.)\1\1
#   aaa
#  b. "(.)(.)\\2\\1"
#   xyyx
#  c. (..)\1
#   xyxy
#  d. "(.).\\1.\\1"
#   xyxzx
#  e. "(.)(.)(.).*\\3\\2\\1"
#   abcxxxcba
# 2. Construct regular expressions to match words that:
#  a. Start and end with the same character.
str_view(stringr::words, "^(.).*\\1$", match = TRUE)
#  b. Contain a repeated pair of letters (e.g. "church" contains "ch" twice).
str_view(stringr::words, "(..).*\\1", match = TRUE)
#  c. Contain one letter repeated in at least three places (e.g. "eleven" contains three "e"s).
str_view(stringr::words, "(.).*\\1.*\\1", match = TRUE)
# (Resume here next session)

# 11.4 Tools p180

# 「Regular Expressions: Now You Have Two Problems」 https://blog.codinghorror.com/regular-expressions-now-you-have-two-problems/
# 「regex - Using a regular expression to validate an email address - Stack Overflow」 https://stackoverflow.com/questions/201323/using-a-regular-expression-to-validate-an-email-address/201378#201378

# 11.4.1 Detect Matches
x <- c("apple", "banana", "pear")
str_detect(x, "e")
## [1]  TRUE FALSE  TRUE
# How many common words start with t?
sum(str_detect(words, "^t"))
## [1] 65
# What proportion of common words end with a vowel?
mean(str_detect(words, "[aeiou]$"))
## [1] 0.2765306
# Find all words containing at least one vowel, then negate
no_vowels_1 <- !str_detect(words, "[aeiou]")
# Find all words consisting only of consonants (non-vowels)
no_vowels_2 <- str_detect(words, "^[^aeiou]+$")
identical(no_vowels_1, no_vowels_2)
## [1] TRUE
words[str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
str_subset(words, "x$")
## [1] "box" "sex" "six" "tax"
df <- tibble(
  word = words,
  i = seq_along(word)
)
df %>%
  filter(str_detect(word, "x$"))
## # A tibble: 4 x 2
##    word     i
##   <chr> <int>
## 1   box   108
## 2   sex   747
## 3   six   772
## 4   tax   841
x <- c("apple", "banana", "pear")
str_count(x, "a")
## [1] 1 3 1
# On average, how many vowels per word?
mean(str_count(words, "[aeiou]"))
## [1] 1.991837
df %>%
  mutate(
    vowels = str_count(word, "[aeiou]"),
    consonants = str_count(word, "[^aeiou]")
  )
## # A tibble: 980 x 4
##        word     i vowels consonants
##       <chr> <int>  <int>      <int>
##  1        a     1      1          0
##  2     able     2      2          2
##  3    about     3      3          2
##  4 absolute     4      4          4
##  5   accept     5      2          4
##  6  account     6      3          4
##  7  achieve     7      4          3
##  8   across     8      2          4
##  9      act     9      1          2
## 10   active    10      3          3
## # ... with 970 more rows

str_count("abababa", "aba")
## [1] 2
str_view_all("abababa", "aba")
# Exercises p183 (below Detect Matches)

# 1. For each of the following challenges, try solving it with both a single regular
# expression and a combination of multiple str_detect() calls.
#  a. Find all words starting or ending with x.

# single regex
str_view(stringr::words, "^x|x$", match = TRUE)
#?str_view
#?str_subset
#?str_detect
words[str_detect(words, "^x|x$")]
## [1] "box" "sex" "six" "tax"
# multiple calls
words[str_detect(words, "^x") | str_detect(words, "x$")]
## [1] "box" "sex" "six" "tax"
#str_detect(words, "^x")
#str_subset(words, "^x")
#words[str_detect(words, "^e") | str_detect(words, "e$")]

#  b. Find all words starting with a vowel and ending with a consonant.
str_view(stringr::words, "^[aeiou].*[^aeiou]$", match = TRUE)
words[str_detect(words, "^[aeiou]") & str_detect(words, "[^aeiou]$")]
##   [1] "about"       "accept"      "account"     "across"      "act"        
##   [6] "actual"      "add"         "address"     "admit"       "affect"     
##  [11] "afford"      "after"       "afternoon"   "again"       "against"    
##  [16] "agent"       "air"         "all"         "allow"       "almost"     
##  [21] "along"       "already"     "alright"     "although"    "always"     
##  [26] "amount"      "and"         "another"     "answer"      "any"        
##  [31] "apart"       "apparent"    "appear"      "apply"       "appoint"    
##  [36] "approach"    "arm"         "around"      "art"         "as"         
##  [41] "ask"         "at"          "attend"      "authority"   "away"       
##  [46] "awful"       "each"        "early"       "east"        "easy"       
##  [51] "eat"         "economy"     "effect"      "egg"         "eight"      
##  [56] "either"      "elect"       "electric"    "eleven"      "employ"     
##  [61] "end"         "english"     "enjoy"       "enough"      "enter"      
##  [66] "environment" "equal"       "especial"    "even"        "evening"    
##  [71] "ever"        "every"       "exact"       "except"      "exist"      
##  [76] "expect"      "explain"     "express"     "identify"    "if"         
##  [81] "important"   "in"          "indeed"      "individual"  "industry"   
##  [86] "inform"      "instead"     "interest"    "invest"      "it"         
##  [91] "item"        "obvious"     "occasion"    "odd"         "of"         
##  [96] "off"         "offer"       "often"       "okay"        "old"        
## [101] "on"          "only"        "open"        "opportunity" "or"         
## [106] "order"       "original"    "other"       "ought"       "out"        
## [111] "over"        "own"         "under"       "understand"  "union"      
## [116] "unit"        "university"  "unless"      "until"       "up"         
## [121] "upon"        "usual"
str_subset(words, "^[aeiou].*[^aeiou]$") %>% head()
## [1] "about"   "accept"  "account" "across"  "act"     "actual"
start_with_vowel <- str_detect(words, "^[aeiou]")
end_with_consonant <- str_detect(words, "[^aeiou]$")
words[start_with_vowel & end_with_consonant] %>% head()
## [1] "about"   "accept"  "account" "across"  "act"     "actual"
#  c. Are there any words that contain at least one of each different vowel?
# single regex
str_view(stringr::words, "a.*e|a.*i|a.*o|a.*u|e.*a|e.*i|e.*o|e.*u|i.*a|i.*e|i.*o|i.*u|o.*a|o.*e|o.*i|o.*u|u.*a|u.*e|u.*i|u.*o", match = TRUE)
# multiple calls?
# Couldn't think of a clean approach -- the following is brute force
words[ 
  str_detect(words, "a") & str_detect(words, "e") |
    str_detect(words, "a") & str_detect(words, "i") |
    str_detect(words, "a") & str_detect(words, "o") |
    str_detect(words, "a") & str_detect(words, "u") |
  str_detect(words, "e") & str_detect(words, "a") |
    str_detect(words, "e") & str_detect(words, "i") |
    str_detect(words, "e") & str_detect(words, "o") |
    str_detect(words, "e") & str_detect(words, "u") |
  str_detect(words, "i") & str_detect(words, "a") |
    str_detect(words, "i") & str_detect(words, "e") |
    str_detect(words, "i") & str_detect(words, "o") |
    str_detect(words, "i") & str_detect(words, "u") |
  str_detect(words, "o") & str_detect(words, "a") |
    str_detect(words, "o") & str_detect(words, "e") |
    str_detect(words, "o") & str_detect(words, "i") |
    str_detect(words, "o") & str_detect(words, "o") |
    str_detect(words, "o") & str_detect(words, "u") |
  str_detect(words, "u") & str_detect(words, "a") |
    str_detect(words, "u") & str_detect(words, "e") |
    str_detect(words, "u") & str_detect(words, "i") |
    str_detect(words, "u") & str_detect(words, "o") |
  str_detect(words, "u") & str_detect(words, "a") ]
##   [1] "able"        "about"       "absolute"    "accept"      "account"    
##   [6] "achieve"     "across"      "active"      "actual"      "address"    
##  [11] "admit"       "advertise"   "affect"      "afford"      "after"      
##  [16] "afternoon"   "again"       "against"     "age"         "agent"      
##  [21] "ago"         "agree"       "air"         "allow"       "almost"     
##  [26] "along"       "already"     "alright"     "also"        "although"   
##  [31] "america"     "amount"      "another"     "answer"      "apparent"   
##  [36] "appear"      "appoint"     "approach"    "appropriate" "area"       
##  [41] "argue"       "around"      "arrange"     "associate"   "assume"     
##  [46] "attend"      "authority"   "available"   "aware"       "awful"      
##  [51] "balance"     "base"        "basis"       "bear"        "beat"       
##  [56] "beauty"      "because"     "become"      "before"      "begin"      
##  [61] "behind"      "believe"     "benefit"     "bloke"       "blood"      
##  [66] "blow"        "blue"        "board"       "boat"        "body"       
##  [71] "book"        "both"        "bother"      "bottle"      "bottom"     
##  [76] "box"         "boy"         "break"       "brief"       "brilliant"  
##  [81] "britain"     "brother"     "budget"      "build"       "business"   
##  [86] "cake"        "care"        "case"        "cause"       "certain"    
##  [91] "chair"       "chairman"    "chance"      "change"      "character"  
##  [96] "charge"      "cheap"       "choice"      "choose"      "Christmas"  
## [101] "claim"       "clean"       "clear"       "client"      "clock"      
## [106] "close"       "closes"      "clothe"      "coffee"      "cold"       
## [111] "colleague"   "collect"     "college"     "colour"      "come"       
## [116] "comment"     "commit"      "committee"   "common"      "community"  
## [121] "company"     "compare"     "complete"    "compute"     "concern"    
## [126] "condition"   "confer"      "consider"    "consult"     "contact"    
## [131] "continue"    "contract"    "control"     "converse"    "cook"       
## [136] "copy"        "corner"      "correct"     "cost"        "could"      
## [141] "council"     "count"       "country"     "county"      "couple"     
## [146] "course"      "court"       "cover"       "create"      "cross"      
## [151] "current"     "danger"      "date"        "dead"        "deal"       
## [156] "dear"        "debate"      "decide"      "decision"    "definite"   
## [161] "department"  "describe"    "design"      "detail"      "develop"    
## [166] "die"         "difference"  "difficult"   "dinner"      "direct"     
## [171] "discuss"     "divide"      "do"          "doctor"      "document"   
## [176] "dog"         "door"        "double"      "doubt"       "down"       
## [181] "drive"       "drop"        "due"         "during"      "each"       
## [186] "early"       "east"        "easy"        "eat"         "economy"    
## [191] "educate"     "eight"       "either"      "electric"    "employ"     
## [196] "encourage"   "engine"      "english"     "enjoy"       "enough"     
## [201] "environment" "equal"       "especial"    "europe"      "evening"    
## [206] "evidence"    "exact"       "example"     "excuse"      "exercise"   
## [211] "exist"       "experience"  "explain"     "extra"       "face"       
## [216] "fair"        "family"      "father"      "favour"      "field"      
## [221] "figure"      "file"        "final"       "finance"     "fine"       
## [226] "fire"        "five"        "floor"       "follow"      "food"       
## [231] "foot"        "for"         "force"       "forget"      "form"       
## [236] "fortune"     "forward"     "four"        "france"      "friday"     
## [241] "friend"      "from"        "front"       "function"    "further"    
## [246] "future"      "game"        "garden"      "general"     "germany"    
## [251] "give"        "go"          "god"         "good"        "goodbye"    
## [256] "govern"      "great"       "ground"      "group"       "grow"       
## [261] "guess"       "hair"        "happen"      "hate"        "have"       
## [266] "head"        "health"      "hear"        "heart"       "heat"       
## [271] "heavy"       "history"     "hold"        "holiday"     "home"       
## [276] "honest"      "hope"        "horse"       "hospital"    "hot"        
## [281] "hour"        "house"       "how"         "however"     "hullo"      
## [286] "hundred"     "husband"     "idea"        "identify"    "imagine"    
## [291] "important"   "improve"     "include"     "income"      "increase"   
## [296] "indeed"      "individual"  "industry"    "inform"      "inside"     
## [301] "instead"     "insure"      "interest"    "into"        "introduce"  
## [306] "invest"      "involve"     "issue"       "item"        "jesus"      
## [311] "job"         "join"        "judge"       "kitchen"     "knock"      
## [316] "know"        "labour"      "language"    "large"       "late"       
## [321] "laugh"       "lead"        "learn"       "leave"       "lie"        
## [326] "life"        "like"        "likely"      "line"        "listen"     
## [331] "little"      "live"        "load"        "local"       "lock"       
## [336] "london"      "long"        "look"        "lord"        "lose"       
## [341] "lot"         "love"        "low"         "machine"     "main"       
## [346] "major"       "make"        "manage"      "market"      "matter"     
## [351] "maybe"       "mean"        "meaning"     "measure"     "mention"    
## [356] "middle"      "mile"        "million"     "minister"    "minus"      
## [361] "minute"      "mister"      "moment"      "monday"      "money"      
## [366] "month"       "more"        "morning"     "most"        "mother"     
## [371] "motion"      "move"        "music"       "name"        "nation"     
## [376] "nature"      "near"        "necessary"   "nice"        "nine"       
## [381] "no"          "non"         "none"        "normal"      "north"      
## [386] "not"         "note"        "notice"      "now"         "number"     
## [391] "obvious"     "occasion"    "odd"         "of"          "off"        
## [396] "offer"       "office"      "often"       "okay"        "old"        
## [401] "on"          "once"        "one"         "only"        "open"       
## [406] "operate"     "opportunity" "oppose"      "or"          "order"      
## [411] "organize"    "original"    "other"       "otherwise"   "ought"      
## [416] "out"         "over"        "own"         "page"        "paint"      
## [421] "pair"        "paper"       "pardon"      "parent"      "particular" 
## [426] "pension"     "people"      "perhaps"     "period"      "person"     
## [431] "photograph"  "picture"     "piece"       "place"       "please"     
## [436] "point"       "police"      "policy"      "politic"     "poor"       
## [441] "position"    "positive"    "possible"    "post"        "pound"      
## [446] "power"       "practise"    "prepare"     "pressure"    "presume"    
## [451] "previous"    "price"       "private"     "probable"    "problem"    
## [456] "proceed"     "process"     "produce"     "product"     "programme"  
## [461] "project"     "proper"      "propose"     "protect"     "provide"    
## [466] "public"      "purpose"     "quality"     "quarter"     "question"   
## [471] "quick"       "quid"        "quiet"       "quite"       "radio"      
## [476] "rail"        "raise"       "range"       "rate"        "rather"     
## [481] "read"        "ready"       "real"        "realise"     "really"     
## [486] "reason"      "receive"     "reckon"      "recognize"   "recommend"  
## [491] "record"      "reduce"      "regard"      "region"      "relation"   
## [496] "report"      "require"     "research"    "resource"    "responsible"
## [501] "result"      "return"      "rise"        "road"        "role"       
## [506] "roll"        "room"        "round"       "rule"        "safe"       
## [511] "sale"        "same"        "saturday"    "save"        "school"     
## [516] "science"     "score"       "scotland"    "seat"        "second"     
## [521] "secretary"   "section"     "secure"      "separate"    "serious"    
## [526] "service"     "share"       "shoe"        "shoot"       "shop"       
## [531] "short"       "should"      "show"        "side"        "similar"    
## [536] "simple"      "since"       "single"      "sister"      "site"       
## [541] "situate"     "size"        "slow"        "smoke"       "so"         
## [546] "social"      "society"     "some"        "son"         "soon"       
## [551] "sorry"       "sort"        "sound"       "south"       "space"      
## [556] "speak"       "special"     "specific"    "square"      "stage"      
## [561] "stairs"      "state"       "station"     "stop"        "story"      
## [566] "straight"    "strategy"    "strike"      "strong"      "structure"  
## [571] "student"     "stupid"      "subject"     "succeed"     "sudden"     
## [576] "suggest"     "suit"        "summer"      "sunday"      "support"    
## [581] "suppose"     "sure"        "surprise"    "table"       "take"       
## [586] "tape"        "tea"         "teach"       "team"        "telephone"  
## [591] "television"  "terrible"    "therefore"   "thirteen"    "thou"       
## [596] "though"      "thousand"    "through"     "throw"       "thursday"   
## [601] "tie"         "time"        "to"          "today"       "together"   
## [606] "tomorrow"    "tonight"     "too"         "top"         "total"      
## [611] "touch"       "toward"      "town"        "trade"       "traffic"    
## [616] "train"       "transport"   "travel"      "treat"       "trouble"    
## [621] "true"        "tuesday"     "two"         "under"       "understand" 
## [626] "union"       "unit"        "unite"       "university"  "unless"     
## [631] "until"       "upon"        "use"         "usual"       "value"      
## [636] "various"     "video"       "view"        "village"     "vote"       
## [641] "wage"        "wait"        "waste"       "water"       "wear"       
## [646] "wednesday"   "weigh"       "welcome"     "while"       "white"      
## [651] "who"         "whole"       "wide"        "wife"        "window"     
## [656] "without"     "woman"       "wonder"      "wood"        "word"       
## [661] "work"        "world"       "worry"       "worse"       "worth"      
## [666] "would"       "write"       "wrong"       "year"        "yesterday"  
## [671] "you"         "young"
# The solutions' cross_n() is deprecated and warns, so cross() is used instead.
# The empty result is actually correct: no word contains all five vowels, as the
# direct str_detect() check below confirms.
pattern <- 
  cross(rerun(5, c("a", "e", "i", "o", "u")),
          .filter = function(...) {
            x <- as.character(unlist(list(...)))
            length(x) != length(unique(x))
          }) %>%
  map_chr(~ str_c(unlist(.x), collapse = ".*")) %>%
  str_c(collapse = "|")

str_subset(words, pattern)
## character(0)
words[str_detect(words, "a") &
        str_detect(words, "e") &
        str_detect(words, "i") &
        str_detect(words, "o") &
        str_detect(words, "u")]
## character(0)
str_subset("aseiouds", pattern)
## [1] "aseiouds"
# I may be misreading the question; included for reference.

#  d. What word has the highest number of vowels? What word has the highest proportion of vowels? (Hint: what is the denominator?)
prop_vowels <- str_count(words, "[aeiou]") / str_length(words)
words[which(prop_vowels == max(prop_vowels))]
## [1] "a"
# Only "a" comes out for the proportion (its single character is a vowel).
# The highest-vowel-count part of the question is answered below.
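# For the highest count, a direct tally works (ties are all returned):
vowel_counts <- str_count(words, "[aeiou]")
words[vowel_counts == max(vowel_counts)]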


# 11.4.2 Extract Matches p184

# 「Harvard sentences - Wikipedia」 https://en.wikipedia.org/wiki/Harvard_sentences

length(sentences)
## [1] 720
head(sentences)
## [1] "The birch canoe slid on the smooth planks." 
## [2] "Glue the sheet to the dark blue background."
## [3] "It's easy to tell the depth of a well."     
## [4] "These days a chicken leg is a rare dish."   
## [5] "Rice is often served in round bowls."       
## [6] "The juice of lemons makes fine punch."
colors <- c(
  "red", "orange", "yellow", "green", "blue", "purple"
)
color_match <- str_c(colors, collapse = "|")
color_match
## [1] "red|orange|yellow|green|blue|purple"
has_color <- str_subset(sentences, color_match)
matches <- str_extract(has_color, color_match)
head(matches)
## [1] "blue" "blue" "red"  "red"  "red"  "blue"
more <- sentences[str_count(sentences, color_match) > 1]
str_view_all(more, color_match)
# ^ referred to again in exercise 1 below

str_extract(more, color_match)
## [1] "blue"   "green"  "orange"
#?str_extract
str_extract_all(more, color_match)
## [[1]]
## [1] "blue" "red" 
## 
## [[2]]
## [1] "green" "red"  
## 
## [[3]]
## [1] "orange" "red"
#?str_extract_all

str_extract_all(more, color_match, simplify = TRUE)
##      [,1]     [,2] 
## [1,] "blue"   "red"
## [2,] "green"  "red"
## [3,] "orange" "red"
x <- c("a", "a b", "a b c")
str_extract_all(x, "[a-z]", simplify = TRUE)
##      [,1] [,2] [,3]
## [1,] "a"  ""   ""  
## [2,] "a"  "b"  ""  
## [3,] "a"  "b"  "c"
# Exercises p185 (Extract Matches)

# 1. In the previous example, you might have noticed that the regular expression matched
# "flickered", which is not a colour (its trailing "red" was mistaken for one).
# Modify the regex to fix the problem.
# colour -> color
color_match2 <- str_c("\\b(", str_c(colors, collapse = "|"), ")\\b")
color_match2
## [1] "\\b(red|orange|yellow|green|blue|purple)\\b"
more2 <- sentences[str_count(sentences, color_match) > 1]
str_view_all(more2, "\\b(red|orange|yellow|green|blue|purple)\\b", match = T)
#str_view_all(more2, "\\borange|red|yellow|green|blue|purple\\b", match = T)
str_view_all(more2, color_match2, match = TRUE)
# This works, but is there a whole-word option like grep's -w?

# 2. From the Harvard sentences data, extract:
#  a. The first word from each sentence.
str_extract(sentences, "[A-Za-z]+") %>% head()
## [1] "The"   "Glue"  "It"    "These" "Rice"  "The"
#  b. All words ending in ing.
pattern <- "\\b[A-Za-z]+ing\\b"
sentences_with_ing <- str_detect(sentences, pattern)
unique(unlist(str_extract_all(sentences[sentences_with_ing], pattern))) %>%
  head()
## [1] "spring"  "evening" "morning" "winding" "living"  "king"
#  c. All plurals.
unique(unlist(str_extract_all(sentences, "\\b[A-Za-z]{3,}s\\b"))) %>%
  head()
## [1] "planks" "days"   "bowls"  "lemons" "makes"  "hogs"
# 11.4.3 Grouped Matches p186

# 「ヒューリスティックスとは - コトバンク」 https://kotobank.jp/word/%E3%83%92%E3%83%A5%E3%83%BC%E3%83%AA%E3%82%B9%E3%83%86%E3%82%A3%E3%83%83%E3%82%AF%E3%82%B9-23094

noun <- "(a|the) ([^ ]+)"
has_noun <- sentences %>%
  str_subset(noun) %>%
  head(10)
has_noun %>%
  str_extract(noun)
##  [1] "the smooth" "the sheet"  "the depth"  "a chicken"  "the parked"
##  [6] "the sun"    "the huge"   "the ball"   "the woman"  "a helps"
tibble(sentence = sentences) %>%
  tidyr::extract(
    sentence, c("article", "noun"), "(a|the) ([^ ]+)",
    remove = FALSE
  )
## # A tibble: 720 x 3
##                                       sentence article    noun
##  *                                       <chr>   <chr>   <chr>
##  1  The birch canoe slid on the smooth planks.     the  smooth
##  2 Glue the sheet to the dark blue background.     the   sheet
##  3      It's easy to tell the depth of a well.     the   depth
##  4    These days a chicken leg is a rare dish.       a chicken
##  5        Rice is often served in round bowls.    <NA>    <NA>
##  6       The juice of lemons makes fine punch.    <NA>    <NA>
##  7 The box was thrown beside the parked truck.     the  parked
##  8 The hogs were fed chopped corn and garbage.    <NA>    <NA>
##  9         Four hours of steady work faced us.    <NA>    <NA>
## 10    Large size in stockings is hard to sell.    <NA>    <NA>
## # ... with 710 more rows
#?tidyr::extract

# Exercises p187 (Grouped Matches)

# 1. Find all words that come after a "number" like "one", "two", "three", etc.
# Pull out both the number and the word.
tibble(sentence = sentences) %>%
  tidyr::extract(
    sentence, c("article", "noun"), "(a|the|[Oo]ne|[Tt]wo|[Tt]hree|[Ff]our|[Ff]ive|[Ss]ix|[Ss]even|[Ee]ight|[Nn]ine|[Tt]en) ([^ ]+)",
    remove = FALSE
  )
## # A tibble: 720 x 3
##                                       sentence article    noun
##  *                                       <chr>   <chr>   <chr>
##  1  The birch canoe slid on the smooth planks.     the  smooth
##  2 Glue the sheet to the dark blue background.     the   sheet
##  3      It's easy to tell the depth of a well.     the   depth
##  4    These days a chicken leg is a rare dish.       a chicken
##  5        Rice is often served in round bowls.     ten  served
##  6       The juice of lemons makes fine punch.    <NA>    <NA>
##  7 The box was thrown beside the parked truck.     the  parked
##  8 The hogs were fed chopped corn and garbage.    <NA>    <NA>
##  9         Four hours of steady work faced us.    Four   hours
## 10    Large size in stockings is hard to sell.    <NA>    <NA>
## # ... with 710 more rows
# Model answer (from the solutions)
numword <- "(one|two|three|four|five|six|seven|eight|nine|ten) +(\\S+)"
sentences[str_detect(sentences, numword)] %>%
  str_extract(numword)
##  [1] "ten served"    "one over"      "seven books"   "two met"      
##  [5] "two factors"   "one and"       "three lists"   "seven is"     
##  [9] "two when"      "one floor."    "ten inches."   "one with"     
## [13] "one war"       "one button"    "six minutes."  "ten years"    
## [17] "one in"        "ten chased"    "one like"      "two shares"   
## [21] "two distinct"  "one costs"     "ten two"       "five robins." 
## [25] "four kinds"    "one rang"      "ten him."      "three story"  
## [29] "ten by"        "one wall."     "three inches"  "ten your"     
## [33] "six comes"     "one before"    "three batches" "two leaves."
# 2. Find all contractions. Separate out the pieces before and after the apostrophe.
contraction <- "([A-Za-z]+)'([A-Za-z]+)"
sentences %>%
  `[`(str_detect(sentences, contraction)) %>%
  str_extract(contraction)
##  [1] "It's"       "man's"      "don't"      "store's"    "workmen's" 
##  [6] "Let's"      "sun's"      "child's"    "king's"     "It's"      
## [11] "don't"      "queen's"    "don't"      "pirate's"   "neighbor's"
# 11.4.4 Replacing Matches

x <- c("apple", "pear", "banana")
str_replace(x, "[aeiou]", "-")
## [1] "-pple"  "p-ar"   "b-nana"
str_replace_all(x, "[aeiou]", "-")
## [1] "-ppl-"  "p--r"   "b-n-n-"
x <- c("1 house", "2 cars", "3 people")
str_replace_all(x, c("1" = "one", "2" = "two", "3" = "three"))
## [1] "one house"    "two cars"     "three people"
sentences %>%
  str_replace("([^ ]+) ([^ ]+) ([^ ]+)", "\\1 \\3 \\2") %>%
  head(5)
## [1] "The canoe birch slid on the smooth planks." 
## [2] "Glue sheet the to the dark blue background."
## [3] "It's to easy tell the depth of a well."     
## [4] "These a days chicken leg is a rare dish."   
## [5] "Rice often is served in round bowls."
# Exercises p188 (Replacing Matches)
# 1. Replace all forward slashes in a string with backslashes.
backslashed <- str_replace_all("past/present/future", "\\/", "\\\\")
writeLines(backslashed)
## past\present\future
# 2. Implement a simple version of str_to_lower() using str_replace_all().
# Model answer (from the solutions)
lower <- str_replace_all(words, c("A"="a", "B"="b", "C"="c", "D"="d", "E"="e", "F"="f", "G"="g", "H"="h", "I"="i", "J"="j", "K"="k", "L"="l", "M"="m", "N"="n", "O"="o", "P"="p", "Q"="q", "R"="r", "S"="s", "T"="t", "U"="u", "V"="v", "W"="w", "X"="x", "Y"="y", "Z"="z"))
head(lower)
## [1] "a"        "able"     "about"    "absolute" "accept"   "account"
# A less tedious approach would be nice; first attempt:
lower2 <- str_replace_all(words, "([A-Z])", str_to_lower("\\1"))
# This does NOT work: str_to_lower("\\1") is evaluated before the replacement and simply
# returns the string "\\1", so each capital letter is replaced with itself. See the sketch below.
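# A working sketch that builds the replacement table programmatically instead of typing
# it out (assumes the base constants letters/LETTERS and purrr::set_names(), attached via
# the tidyverse):
lower3 <- str_replace_all(words, set_names(letters, LETTERS))
identical(lower3, str_to_lower(words))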


# 3. Switch the first and last letters in words. Which of those strings are still words?

# First, make a vector of all the words with first and last letters swapped
swapped <- str_replace_all(words, "^([A-Za-z])(.*)([a-z])$", "\\3\\2\\1")
# Next, find what of "swapped" is also in the original list using intersect() from previous chapter
intersect(swapped, words)
##  [1] "a"          "america"    "area"       "dad"        "dead"      
##  [6] "lead"       "read"       "depend"     "god"        "educate"   
## [11] "else"       "encourage"  "engine"     "europe"     "evidence"  
## [16] "example"    "excuse"     "exercise"   "expense"    "experience"
## [21] "eye"        "dog"        "health"     "high"       "knock"     
## [26] "deal"       "level"      "local"      "nation"     "on"        
## [31] "non"        "no"         "rather"     "dear"       "refer"     
## [36] "remember"   "serious"    "stairs"     "test"       "tonight"   
## [41] "transport"  "treat"      "trust"      "window"     "yesterday"
# 11.4.5 Splitting
sentences %>%
  head(5) %>%
  str_split(" ")
## [[1]]
## [1] "The"     "birch"   "canoe"   "slid"    "on"      "the"     "smooth" 
## [8] "planks."
## 
## [[2]]
## [1] "Glue"        "the"         "sheet"       "to"          "the"        
## [6] "dark"        "blue"        "background."
## 
## [[3]]
## [1] "It's"  "easy"  "to"    "tell"  "the"   "depth" "of"    "a"     "well."
## 
## [[4]]
## [1] "These"   "days"    "a"       "chicken" "leg"     "is"      "a"      
## [8] "rare"    "dish."  
## 
## [[5]]
## [1] "Rice"   "is"     "often"  "served" "in"     "round"  "bowls."
"a|b|c|d" %>%
  str_split("\\|") %>%
  .[[1]]
## [1] "a" "b" "c" "d"
sentences %>%
  head(5) %>%
  str_split(" ", simplify = TRUE)
##      [,1]    [,2]    [,3]    [,4]      [,5]  [,6]    [,7]    
## [1,] "The"   "birch" "canoe" "slid"    "on"  "the"   "smooth"
## [2,] "Glue"  "the"   "sheet" "to"      "the" "dark"  "blue"  
## [3,] "It's"  "easy"  "to"    "tell"    "the" "depth" "of"    
## [4,] "These" "days"  "a"     "chicken" "leg" "is"    "a"     
## [5,] "Rice"  "is"    "often" "served"  "in"  "round" "bowls."
##      [,8]          [,9]   
## [1,] "planks."     ""     
## [2,] "background." ""     
## [3,] "a"           "well."
## [4,] "rare"        "dish."
## [5,] ""            ""
fields <- c("Name: Hadley", "Country: NZ", "Age: 35")
fields %>% str_split(": ", n = 2, simplify = TRUE)
##      [,1]      [,2]    
## [1,] "Name"    "Hadley"
## [2,] "Country" "NZ"    
## [3,] "Age"     "35"
x <- "This is a sentence. This is another sentence."
# x <- "This is a \"sentence\", This is another 'second-sentence'."
str_view_all(x, boundary("word"))
str_split(x, " ")[[1]]
## [1] "This"      "is"        "a"         "sentence." "This"      "is"       
## [7] "another"   "sentence."
str_split(x, boundary("word"))[[1]]
## [1] "This"     "is"       "a"        "sentence" "This"     "is"      
## [7] "another"  "sentence"
str_split(x, "[ .,'\"-]+")[[1]] # leaves a trailing empty element, so boundary("word") is better
## [1] "This"     "is"       "a"        "sentence" "This"     "is"      
## [7] "another"  "sentence" ""
# Exercises p190 (Splitting)
# 1. Split up a string like "apples, pears, and bananas" into individual components.
x <- c("apples, pears, and bananas")
str_split(x, ", +(and +)?")[[1]]
## [1] "apples"  "pears"   "bananas"
# 2. Why is it better to split with boundary("word") than " "?

# Splitting by boundary("word") splits on punctuation and not just whitespace.
# It also avoids the trailing empty element.

# 3. What does splitting with an empty string ("") do? Experiment, then read the documentation.
str_split("ab. cd|agt", "")[[1]]
##  [1] "a" "b" "." " " "c" "d" "|" "a" "g" "t"
#?str_split
# pattern
#  An empty pattern, "", is equivalent to boundary("character").
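# A quick check that "" and boundary("character") split the same way:
str_split("ab. cd|agt", boundary("character"))[[1]]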


# 11.4.6 Find Matches
# ?str_locate
# ?str_locate_all
# Examples
# fruit <- c("apple", "banana", "pear", "pineapple")
# str_locate(fruit, "$")
# str_locate(fruit, "a")
# str_locate(fruit, "e")
# str_locate(fruit, c("a", "b", "p", "p"))
#
# str_locate_all(fruit, "a")
# str_locate_all(fruit, "e")
# str_locate_all(fruit, c("a", "b", "p", "p"))
#
# # Find location of every character
# str_locate_all(fruit, "")

# 11.5 Other Types of Pattern

# The call
str_view(fruit, "nana")
# is shorthand for
str_view(fruit, regex("nana"))
#?stringr::regex

bananas <- c("banana", "Banana", "BANANA")
str_view(bananas, "banana")
str_view(bananas, regex("banana", ignore_case = TRUE))
# str_view(bananas, "banana", ignore_case = TRUE) # ×

x <- "Line 1\nLine 2\nLine 3"
str_extract_all(x, "^Line")[[1]]
## [1] "Line"
str_extract_all(x, regex("^Line", multiline = TRUE))[[1]]
## [1] "Line" "Line" "Line"
phone <- regex("
  \\(?     # optional opening parenthesis
  (\\d{3}) # area code
  [)- ]?   # optional closing parenthesis, dash, or space
  (\\d{3}) # three digits
  [ -]?    # optional space or dash
  (\\d{3}) # three more digits
  ", comments = TRUE)
str_match("514-791-8141", phone)
##      [,1]          [,2]  [,3]  [,4] 
## [1,] "514-791-814" "514" "791" "814"
str_match("514 791 8141", phone)
##      [,1] [,2] [,3] [,4]
## [1,] NA   NA   NA   NA
# The correct version is probably the following (a slip in the original book?):
# with comments = TRUE, unescaped whitespace in the pattern is ignored, so the
# literal spaces have to be escaped.
phone <- regex("
  \\(?       # optional opening parenthesis
  (\\d{3})   # area code
  [-)\\ ]?   # optional closing parenthesis, dash, or space
  (\\d{3})   # three digits
  [\\ -]?    # optional space or dash
  (\\d{3})   # three more digits
  ", comments = TRUE)
str_match("514-791-8141", phone)
##      [,1]          [,2]  [,3]  [,4] 
## [1,] "514-791-814" "514" "791" "814"
str_match("514 791 8141", phone)
##      [,1]          [,2]  [,3]  [,4] 
## [1,] "514 791 814" "514" "791" "814"
# With dotall = TRUE, . matches every character, including \n
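# A quick check of dotall (by default . does not match a newline):
str_detect("Line 1\nLine 2", regex("Line 1.Line 2"))                 # expected FALSE
str_detect("Line 1\nLine 2", regex("Line 1.Line 2", dotall = TRUE))  # expected TRUE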

# Curious how the pattern types compare in speed (fixed vs regex)
#install.packages("microbenchmark")
#library(microbenchmark)
microbenchmark::microbenchmark(
  fixed = str_detect(sentences, fixed("the")),
  regex = str_detect(sentences, "the"),
  times = 20
)
## Unit: microseconds
##   expr     min       lq     mean   median       uq     max neval
##  fixed  96.088 101.7295 168.2302 155.3465 238.1900 331.738    20
##  regex 314.646 321.6380 499.5178 462.3580 643.2225 823.175    20
a1 <- "\u00e1"
a2 <- "a\u0301"
c(a1, a2)
## [1] "á" "á"
a1 == a2
## [1] FALSE
str_detect(a1, fixed(a2))
## [1] FALSE
# Human-style matching, useful for non-English data
str_detect(a1, coll(a2))
## [1] TRUE
# Note that coll() takes a locale argument; collation rules differ between regions
i <- c("I", "İ", "i", "ı")
i
## [1] "I"  "İ" "i"  "ı"
str_subset(i, coll("i", ignore_case = TRUE))
## [1] "I" "i"
str_subset(
  i,
  coll("i", ignore_case = TRUE, locale = "tr")
)
## [1] "İ" "i"
# The results differ subtly

stringi::stri_locale_info()
## $Language
## [1] "ja"
## 
## $Country
## [1] "JP"
## 
## $Variant
## [1] ""
## 
## $Name
## [1] "ja_JP"
# coll() is said to be slow compared with fixed() and regex()

x <- "This is a sentence."
str_view_all(x, boundary("word"))
str_extract_all(x, boundary("word"))
## [[1]]
## [1] "This"     "is"       "a"        "sentence"
# Exercises p193 (Other Types of Pattern)

# 1. How would you find all strings containing \\ with regex() versus with fixed()?
str_subset(c("a\\b", "ab"), "\\\\")
## [1] "a\\b"
str_subset(c("a\\b", "ab"), fixed("\\"))
## [1] "a\\b"
# 2. What are the five most common words in sentences?
str_extract_all(sentences, boundary("word")) %>%
  unlist() %>%
  str_to_lower() %>%
  tibble() %>%
  set_names("word") %>%
  group_by(word) %>%
  count(sort = TRUE) %>%
  head(5)
## # A tibble: 5 x 2
## # Groups:   word [5]
##    word     n
##   <chr> <int>
## 1   the   751
## 2     a   202
## 3    of   132
## 4    to   123
## 5   and   118
# 11.6 Other Uses of Regular Expressions

# Searches every object on the search path; handy when you can't quite remember a name
apropos("replace")
## [1] "%+replace%"       "replace"          "replace_na"      
## [4] "setReplaceMethod" "str_replace"      "str_replace_all" 
## [7] "str_replace_na"   "theme_replace"
head(dir(pattern = "\\.Rmd$"))
## [1] "r4ds_ch11.spin.Rmd" "rmarkdown_test.Rmd"
dir(pattern = glob2rx("*.Rmd"))
## [1] "r4ds_ch11.spin.Rmd" "rmarkdown_test.Rmd"
# 11.7 stringi
#  The stringi package has many more functions than stringr
library("stringi")
# Exercises p194 (stringi)

# 1. Find the stringi functions that:
#  a. Count the number of words.

# ?stri_count_words

#  b. Find duplicated strings.

# ?stri_duplicated
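# Quick checks of a. and b. (assuming stri_count_words() counts words between boundaries
# and stri_duplicated() behaves like base duplicated()):
stri_count_words("The quick brown dog")
stri_duplicated(c("a", "b", "a", "c", "b"))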

#  c. Generate random text.

# ?stri_rand_
# ?stri_rand_lipsum
# ?stri_rand_strings
# ?stri_rand_shuffle
apropos("stri_rand")
## [1] "stri_rand_lipsum"  "stri_rand_shuffle" "stri_rand_strings"
stri_rand_lipsum(2)
## [1] "Lorem ipsum dolor sit amet, porta eu. Magna vitae. Fringilla lorem potenti duis mus ac urna curabitur laoreet ligula. A justo vehicula sodales eget nulla. Dolor ridiculus at dictum maecenas finibus, ultricies. Parturient elit quam dignissim porttitor proin orci, lectus scelerisque velit efficitur ut molestie. Vulputate ut non facilisis. Est, nulla habitasse at in nisi quam. A nostra scelerisque vel sed sed."                                                                                                                                                                                                                                    
## [2] "Elit neque in posuere magna conubia feugiat vulputate libero at urna porta fermentum. Eleifend vel vivamus. Amet sociis nam amet, elementum ridiculus auctor mauris. Rutrum dui non non accumsan eget felis lectus. Et lacinia sit fames luctus ex ut scelerisque convallis faucibus ultricies inceptos et. Malesuada vel enim metus vestibulum eget ac, sodales sociosqu ac eget parturient nulla, sodales. Magna euismod diam convallis aliquam adipiscing. Ac eleifend faucibus magna et magnis sapien consectetur libero eu quisque consectetur, tincidunt! Quam adipiscing ullamcorper sed magna class molestie sociosqu at pharetra ultrices eu aliquam."
# 2. How do you control the language that stri_sort() uses for sorting?

# Use the locale setting of the collator: pass opts_collator = stri_opts_collator(locale = ...),
# or pass locale through ... (see the sketch below).
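# A sketch reusing the Hawaiian example from 11.2.4; the result should match
# str_sort(x, locale = "haw"):
x <- c("apple", "eggplant", "banana")
stri_sort(x, opts_collator = stri_opts_collator(locale = "haw"))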

# ~p194