概要

前書き
- Rで文字列処理をするライブラリである{stringr}と{stringi}について、baseの関数と突き合わせたコードの自分用のメモ

参照サイト
　stringr-vignettes
　stringi
　hadley/stringr
　RPubs - このパッケージがすごい2014: stringr
　stringi package arekore 　stringiで輝く☆テキストショリスト
　 stringr 1.0.0を使ってみる

ライブラリ読み込み

# Packages used throughout this note; each name is listed exactly once.
SET_LOAD_LIB <- c("knitr", "readr", "dplyr", "tidyr", "stringr", "stringi")
# Attach every package. vapply() guarantees a named logical vector:
# one TRUE/FALSE per package as returned by library(logical.return = TRUE).
vapply(
  X = SET_LOAD_LIB, FUN = library, FUN.VALUE = logical(1),
  character.only = TRUE, logical.return = TRUE
)

##   knitr   readr   dplyr   tidyr stringr stringi 
##    TRUE    TRUE    TRUE    TRUE    TRUE    TRUE

knitr::opts_chunk$set(comment = NA)

関数確認

下記を参考に、{stringr}と{stringi}の関数を表示（演算子は除外）。
stringr and stringi

ls("package:stringr") %>%
  stringr::str_subset(pattern = "^[a-zA-Z]")

 [1] "boundary"        "coll"            "fixed"          
 [4] "ignore.case"     "invert_match"    "perl"           
 [7] "regex"           "str_c"           "str_conv"       
[10] "str_count"       "str_detect"      "str_dup"        
[13] "str_extract"     "str_extract_all" "str_join"       
[16] "str_length"      "str_locate"      "str_locate_all" 
[19] "str_match"       "str_match_all"   "str_order"      
[22] "str_pad"         "str_replace"     "str_replace_all"
[25] "str_replace_na"  "str_sort"        "str_split"      
[28] "str_split_fixed" "str_sub"         "str_sub<-"      
[31] "str_subset"      "str_to_lower"    "str_to_title"   
[34] "str_to_upper"    "str_trim"        "str_wrap"       
[37] "word"

ls("package:stringi") %>%
  stringr::str_subset(pattern = "^[a-zA-Z]")

  [1] "stri_c"                       "stri_cmp"                    
  [3] "stri_cmp_eq"                  "stri_cmp_equiv"              
  [5] "stri_cmp_ge"                  "stri_cmp_gt"                 
  [7] "stri_cmp_le"                  "stri_cmp_lt"                 
  [9] "stri_cmp_neq"                 "stri_cmp_nequiv"             
 [11] "stri_compare"                 "stri_conv"                   
 [13] "stri_count"                   "stri_count_boundaries"       
 [15] "stri_count_charclass"         "stri_count_coll"             
 [17] "stri_count_fixed"             "stri_count_regex"            
 [19] "stri_count_words"             "stri_detect"                 
 [21] "stri_detect_charclass"        "stri_detect_coll"            
 [23] "stri_detect_fixed"            "stri_detect_regex"           
 [25] "stri_dup"                     "stri_duplicated"             
 [27] "stri_duplicated_any"          "stri_enc_detect"             
 [29] "stri_enc_detect2"             "stri_enc_fromutf32"          
 [31] "stri_enc_get"                 "stri_enc_info"               
 [33] "stri_enc_isascii"             "stri_enc_isutf16be"          
 [35] "stri_enc_isutf16le"           "stri_enc_isutf32be"          
 [37] "stri_enc_isutf32le"           "stri_enc_isutf8"             
 [39] "stri_enc_list"                "stri_enc_mark"               
 [41] "stri_enc_set"                 "stri_enc_toascii"            
 [43] "stri_enc_tonative"            "stri_enc_toutf32"            
 [45] "stri_enc_toutf8"              "stri_encode"                 
 [47] "stri_endswith"                "stri_endswith_charclass"     
 [49] "stri_endswith_coll"           "stri_endswith_fixed"         
 [51] "stri_escape_unicode"          "stri_extract"                
 [53] "stri_extract_all"             "stri_extract_all_charclass"  
 [55] "stri_extract_all_coll"        "stri_extract_all_fixed"      
 [57] "stri_extract_all_regex"       "stri_extract_all_words"      
 [59] "stri_extract_first"           "stri_extract_first_charclass"
 [61] "stri_extract_first_coll"      "stri_extract_first_fixed"    
 [63] "stri_extract_first_regex"     "stri_extract_first_words"    
 [65] "stri_extract_last"            "stri_extract_last_charclass" 
 [67] "stri_extract_last_coll"       "stri_extract_last_fixed"     
 [69] "stri_extract_last_regex"      "stri_extract_last_words"     
 [71] "stri_flatten"                 "stri_info"                   
 [73] "stri_install_check"           "stri_install_icudt"          
 [75] "stri_isempty"                 "stri_join"                   
 [77] "stri_length"                  "stri_list2matrix"            
 [79] "stri_locale_get"              "stri_locale_info"            
 [81] "stri_locale_list"             "stri_locale_set"             
 [83] "stri_locate"                  "stri_locate_all"             
 [85] "stri_locate_all_boundaries"   "stri_locate_all_charclass"   
 [87] "stri_locate_all_coll"         "stri_locate_all_fixed"       
 [89] "stri_locate_all_regex"        "stri_locate_all_words"       
 [91] "stri_locate_first"            "stri_locate_first_boundaries"
 [93] "stri_locate_first_charclass"  "stri_locate_first_coll"      
 [95] "stri_locate_first_fixed"      "stri_locate_first_regex"     
 [97] "stri_locate_first_words"      "stri_locate_last"            
 [99] "stri_locate_last_boundaries"  "stri_locate_last_charclass"  
[101] "stri_locate_last_coll"        "stri_locate_last_fixed"      
[103] "stri_locate_last_regex"       "stri_locate_last_words"      
[105] "stri_match"                   "stri_match_all"              
[107] "stri_match_all_regex"         "stri_match_first"            
[109] "stri_match_first_regex"       "stri_match_last"             
[111] "stri_match_last_regex"        "stri_numbytes"               
[113] "stri_opts_brkiter"            "stri_opts_collator"          
[115] "stri_opts_fixed"              "stri_opts_regex"             
[117] "stri_order"                   "stri_pad"                    
[119] "stri_pad_both"                "stri_pad_left"               
[121] "stri_pad_right"               "stri_paste"                  
[123] "stri_rand_lipsum"             "stri_rand_shuffle"           
[125] "stri_rand_strings"            "stri_read_lines"             
[127] "stri_read_raw"                "stri_replace"                
[129] "stri_replace_all"             "stri_replace_all_charclass"  
[131] "stri_replace_all_coll"        "stri_replace_all_fixed"      
[133] "stri_replace_all_regex"       "stri_replace_first"          
[135] "stri_replace_first_charclass" "stri_replace_first_coll"     
[137] "stri_replace_first_fixed"     "stri_replace_first_regex"    
[139] "stri_replace_last"            "stri_replace_last_charclass" 
[141] "stri_replace_last_coll"       "stri_replace_last_fixed"     
[143] "stri_replace_last_regex"      "stri_replace_na"             
[145] "stri_reverse"                 "stri_sort"                   
[147] "stri_split"                   "stri_split_boundaries"       
[149] "stri_split_charclass"         "stri_split_coll"             
[151] "stri_split_fixed"             "stri_split_lines"            
[153] "stri_split_lines1"            "stri_split_regex"            
[155] "stri_startswith"              "stri_startswith_charclass"   
[157] "stri_startswith_coll"         "stri_startswith_fixed"       
[159] "stri_stats_general"           "stri_stats_latex"            
[161] "stri_sub"                     "stri_sub<-"                  
[163] "stri_subset"                  "stri_subset_charclass"       
[165] "stri_subset_coll"             "stri_subset_fixed"           
[167] "stri_subset_regex"            "stri_trans_general"          
[169] "stri_trans_isnfc"             "stri_trans_isnfd"            
[171] "stri_trans_isnfkc"            "stri_trans_isnfkc_casefold"  
[173] "stri_trans_isnfkd"            "stri_trans_list"             
[175] "stri_trans_nfc"               "stri_trans_nfd"              
[177] "stri_trans_nfkc"              "stri_trans_nfkc_casefold"    
[179] "stri_trans_nfkd"              "stri_trans_tolower"          
[181] "stri_trans_totitle"           "stri_trans_toupper"          
[183] "stri_trim"                    "stri_trim_both"              
[185] "stri_trim_left"               "stri_trim_right"             
[187] "stri_unescape_unicode"        "stri_unique"                 
[189] "stri_wrap"                    "stri_write_lines"

{stringr}と{stringi}をbaseと合わせて試す

{stringr}は{stringi}のラッパーになったので、基本的には前者の挙動をみる。

文字列連結と分割(str_c/str_split)

文字列連結
base: paste/paste0
stringr: stringr::str_c, ~~stringr::str_join(deprecated)~~
stringi: stringi::stri_join, ~~stringi::stri_c(aliases)~~, ~~stringi::stri_paste(aliases)~~
文字列分割
base: strsplit
stringr: stringr::str_split, stringr::str_split_fixed

文字列連結

month.abb

 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"

n_day <- seq(from = 1, to = length(month.abb))

# stringr::str_cの引数を変えて比較
stringr::str_c(month.abb, sep = ".")

 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"

stringr::str_c(month.abb, collapse = "")

[1] "JanFebMarAprMayJunJulAugSepOctNovDec"

stringr::str_c(month.abb, sep = ".", collapse = "")

[1] "JanFebMarAprMayJunJulAugSepOctNovDec"

# stringr::str_cに複数のベクトルを与え、引数を変えて比較
stringr::str_c(month.abb, n_day, sep = ".")

 [1] "Jan.1"  "Feb.2"  "Mar.3"  "Apr.4"  "May.5"  "Jun.6"  "Jul.7" 
 [8] "Aug.8"  "Sep.9"  "Oct.10" "Nov.11" "Dec.12"

stringr::str_c(month.abb, n_day, collapse = "")

[1] "Jan1Feb2Mar3Apr4May5Jun6Jul7Aug8Sep9Oct10Nov11Dec12"

stringr::str_c(month.abb, n_day, sep = ".", collapse = "")

[1] "Jan.1Feb.2Mar.3Apr.4May.5Jun.6Jul.7Aug.8Sep.9Oct.10Nov.11Dec.12"

# stringr::str_cに複数のベクトルをひとつにまとめて与え、引数を変えて比較
stringr::str_c(c(month.abb, n_day), sep = ".")

 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec" "1"   "2"   "3"   "4"   "5"   "6"   "7"   "8"   "9"   "10" 
[23] "11"  "12"

stringr::str_c(c(month.abb, n_day), collapse = "")

[1] "JanFebMarAprMayJunJulAugSepOctNovDec123456789101112"

stringr::str_c(c(month.abb, n_day), sep = ".", collapse = "")

[1] "JanFebMarAprMayJunJulAugSepOctNovDec123456789101112"

# stringr::str_cはstringi::stri_cを呼んでいる
stringr::str_c

function (..., sep = "", collapse = NULL) 
{
    stri_c(..., sep = sep, collapse = collapse, ignore_null = TRUE)
}
<environment: namespace:stringr>

# stringr::str_cとpaste/paste0を比較

# str_cのsep= ""がデフォルト 
stringr::str_c(month.abb, n_day)

 [1] "Jan1"  "Feb2"  "Mar3"  "Apr4"  "May5"  "Jun6"  "Jul7"  "Aug8" 
 [9] "Sep9"  "Oct10" "Nov11" "Dec12"

# pasteのsep = " "がデフォルト
paste(month.abb, n_day)

 [1] "Jan 1"  "Feb 2"  "Mar 3"  "Apr 4"  "May 5"  "Jun 6"  "Jul 7" 
 [8] "Aug 8"  "Sep 9"  "Oct 10" "Nov 11" "Dec 12"

# stringr::str_cはNAをNAとして扱う
is.na(stringr::str_c(NA))

[1] TRUE

# paste0はNAを"NA"として扱う
is.na(paste0(NA))

[1] FALSE

文字列分割

# stringr::str_splitはパターン毎（先頭一致）に区切った文字列ベクトルからなるリストを返す
# 引数nを指定しないと、デフォルトの引数n = Infを使って全て区切る
month.name %>%
  stringr::str_split(pattern = "e")

[[1]]
[1] "January"

[[2]]
[1] "F"      "bruary"

[[3]]
[1] "March"

[[4]]
[1] "April"

[[5]]
[1] "May"

[[6]]
[1] "Jun" ""   

[[7]]
[1] "July"

[[8]]
[1] "August"

[[9]]
[1] "S"  "pt" "mb" "r" 

[[10]]
[1] "Octob" "r"    

[[11]]
[1] "Nov" "mb"  "r"  

[[12]]
[1] "D"  "c"  "mb" "r"

# 「n = 1」だと区切らない
month.name %>%
  stringr::str_split(pattern = "e", n = 1)

[[1]]
[1] "January"

[[2]]
[1] "February"

[[3]]
[1] "March"

[[4]]
[1] "April"

[[5]]
[1] "May"

[[6]]
[1] "June"

[[7]]
[1] "July"

[[8]]
[1] "August"

[[9]]
[1] "September"

[[10]]
[1] "October"

[[11]]
[1] "November"

[[12]]
[1] "December"

# 「n = 2」
month.name %>%
  stringr::str_split(pattern = "e", n = 2)

[[1]]
[1] "January"

[[2]]
[1] "F"      "bruary"

[[3]]
[1] "March"

[[4]]
[1] "April"

[[5]]
[1] "May"

[[6]]
[1] "Jun" ""   

[[7]]
[1] "July"

[[8]]
[1] "August"

[[9]]
[1] "S"       "ptember"

[[10]]
[1] "Octob" "r"    

[[11]]
[1] "Nov"  "mber"

[[12]]
[1] "D"      "cember"

# stringi::stri_split_fixedと挙動は一緒
month.name %>%
  stringi::stri_split_fixed(pattern = "e", n = 2)

[[1]]
[1] "January"

[[2]]
[1] "F"      "bruary"

[[3]]
[1] "March"

[[4]]
[1] "April"

[[5]]
[1] "May"

[[6]]
[1] "Jun" ""   

[[7]]
[1] "July"

[[8]]
[1] "August"

[[9]]
[1] "S"       "ptember"

[[10]]
[1] "Octob" "r"    

[[11]]
[1] "Nov"  "mber"

[[12]]
[1] "D"      "cember"

# stringr::str_split_fixedはパターン毎（先頭一致）に引数nの個数で区切った次元の行列を返す
# 「n = 1」だと区切らない
month.name %>%
  stringr::str_split_fixed(pattern = "e", n = 1)

      [,1]       
 [1,] "January"  
 [2,] "February" 
 [3,] "March"    
 [4,] "April"    
 [5,] "May"      
 [6,] "June"     
 [7,] "July"     
 [8,] "August"   
 [9,] "September"
[10,] "October"  
[11,] "November" 
[12,] "December"

# 「n = 2」だと2個に区切る
month.name %>%
  stringr::str_split_fixed(pattern = "e", n = 2)

      [,1]      [,2]     
 [1,] "January" ""       
 [2,] "F"       "bruary" 
 [3,] "March"   ""       
 [4,] "April"   ""       
 [5,] "May"     ""       
 [6,] "Jun"     ""       
 [7,] "July"    ""       
 [8,] "August"  ""       
 [9,] "S"       "ptember"
[10,] "Octob"   "r"      
[11,] "Nov"     "mber"   
[12,] "D"       "cember"

month.name %>%
  stringr::str_split_fixed(pattern = "e", n = 3)

      [,1]      [,2]     [,3]  
 [1,] "January" ""       ""    
 [2,] "F"       "bruary" ""    
 [3,] "March"   ""       ""    
 [4,] "April"   ""       ""    
 [5,] "May"     ""       ""    
 [6,] "Jun"     ""       ""    
 [7,] "July"    ""       ""    
 [8,] "August"  ""       ""    
 [9,] "S"       "pt"     "mber"
[10,] "Octob"   "r"      ""    
[11,] "Nov"     "mb"     "r"   
[12,] "D"       "c"      "mber"

month.name %>%
  stringr::str_split_fixed(pattern = "e", n = 4)

      [,1]      [,2]     [,3] [,4]
 [1,] "January" ""       ""   ""  
 [2,] "F"       "bruary" ""   ""  
 [3,] "March"   ""       ""   ""  
 [4,] "April"   ""       ""   ""  
 [5,] "May"     ""       ""   ""  
 [6,] "Jun"     ""       ""   ""  
 [7,] "July"    ""       ""   ""  
 [8,] "August"  ""       ""   ""  
 [9,] "S"       "pt"     "mb" "r" 
[10,] "Octob"   "r"      ""   ""  
[11,] "Nov"     "mb"     "r"  ""  
[12,] "D"       "c"      "mb" "r"

# 各入力に含まれる"e"は最大3個なので、5列目は全て空文字列
month.name %>%
  stringr::str_split_fixed(pattern = "e", n = 5)

      [,1]      [,2]     [,3] [,4] [,5]
 [1,] "January" ""       ""   ""   ""  
 [2,] "F"       "bruary" ""   ""   ""  
 [3,] "March"   ""       ""   ""   ""  
 [4,] "April"   ""       ""   ""   ""  
 [5,] "May"     ""       ""   ""   ""  
 [6,] "Jun"     ""       ""   ""   ""  
 [7,] "July"    ""       ""   ""   ""  
 [8,] "August"  ""       ""   ""   ""  
 [9,] "S"       "pt"     "mb" "r"  ""  
[10,] "Octob"   "r"      ""   ""   ""  
[11,] "Nov"     "mb"     "r"  ""   ""  
[12,] "D"       "c"      "mb" "r"  ""

# base::strsplitはstringr::str_splitと挙動がほぼ同じだが、引数splitを末尾に含んでいた場合は異なる
strsplit(x = month.name, split = "e") %>% 
  sapply(X = ., FUN = length)

 [1] 1 2 1 1 1 1 1 1 4 2 3 4

month.name %>%
  stringr::str_split(pattern = "e") %>%
  sapply(X = ., FUN = length)

 [1] 1 2 1 1 1 2 1 1 4 2 3 4

# base::strsplitは6月(June)が1個のベクトル（stringr::str_splitのときは2個目が空文字列）
strsplit(x = month.name, split = "e")

[[1]]
[1] "January"

[[2]]
[1] "F"      "bruary"

[[3]]
[1] "March"

[[4]]
[1] "April"

[[5]]
[1] "May"

[[6]]
[1] "Jun"

[[7]]
[1] "July"

[[8]]
[1] "August"

[[9]]
[1] "S"  "pt" "mb" "r" 

[[10]]
[1] "Octob" "r"    

[[11]]
[1] "Nov" "mb"  "r"  

[[12]]
[1] "D"  "c"  "mb" "r"

# 戻り値がデータフレームではないので、一度data.frameに変換してから{dplyr}で処理
month.name %>%
  stringr::str_split_fixed(pattern = "e", n = 4) %>%
  as.data.frame() %>%
  dplyr::bind_cols()

Source: local data frame [12 x 4]

        V1     V2 V3 V4
1  January             
2        F bruary      
3    March             
4    April             
5      May             
6      Jun             
7     July             
8   August             
9        S     pt mb  r
10   Octob      r      
11     Nov     mb  r   
12       D      c mb  r

# MeCabによる形態素解析結果をパースするときによく使います
# n = 「","の個数 + 1」
# The piped character vector is supplied as the first argument (`string`);
# n = 9 is the number of commas in each record plus one.
c("名詞,サ変接続,*,*,*,*,テスト,テスト,テスト", "名詞,サ変接続,*,*,*,*,統計,トウケイ,トーケイ") %>%
  stringr::str_split_fixed(pattern = ",", n = 9)

     [,1]   [,2]       [,3] [,4] [,5] [,6] [,7]     [,8]       [,9]      
[1,] "名詞" "サ変接続" "*"  "*"  "*"  "*"  "テスト" "テスト"   "テスト"  
[2,] "名詞" "サ変接続" "*"  "*"  "*"  "*"  "統計"   "トウケイ" "トーケイ"

文字列エンコーディング(str_conv)

base: iconv
stringr: stringr::str_conv
stringi: stringi::stri_encode, ~~stringi::stri_conv(alias)~~

# SHIFT-JISのオープンデータ
# Windows環境ならエンコーディング不要？
shift_jis_str <- readr::read_lines(
  file = "https://www.city.chiba.jp/shimin/shimin/kohokocho/documents/shisetsu.csv",
  n_max = 1
) %>%
  print

[1] "\x83y\x81[\x83W\x83^\x83C\x83g\x83\x8b,\x8e{\x90݃W\x83\x83\x83\x93\x83\x8b,\x8e{\x90݁A\x8fꏊ\x81A\x83C\x83x\x83\x93\x83g\x82̖\xbc\x8f́i\x93ǂ݁j,\x97X\x95֔ԍ\x86,\x8fZ\x8f\x8a,\x83r\x83\x8b\x96\xbc,\x83t\x83\x8d\x83A\x90\x94,\x88ܓx,\x8co\x93x"

# 文字コードを判定すると"Shift_JIS"っぽい（Confidenceが一番高い）
# stringi::stri_enc_detectとstringi::stri_enc_detect2はまだ実験的に作成
estimate_encording <- stringi::stri_enc_detect(str = shift_jis_str) %>% 
  print

[[1]]
[[1]]$Encoding
[1] "Shift_JIS"    "windows-1252" "windows-1250" "GB18030"     
[5] "Big5"         "windows-1253"

[[1]]$Language
[1] "ja" "es" "cs" "zh" "zh" "el"

[[1]]$Confidence
[1] 1.00 0.24 0.18 0.10 0.10 0.04

# エンコーディング
utf_str <- suppressWarnings(stringr::str_conv(
  string = shift_jis_str, 
  encoding = estimate_encording[[1]]$Encoding[which.max(estimate_encording[[1]]$Confidence)]
)) %>% 
  print

[1] "ページタイトル,施設ジャンル,施設、場所、イベントの名称（読み）,郵便番号,住所,ビル名,フロア数,緯度,経度"

# base::iconvと挙動は同じ
iconv(x = shift_jis_str, from = "SHIFT-JIS", to = "UTF-8")

[1] "ページタイトル,施設ジャンル,施設、場所、イベントの名称（読み）,郵便番号,住所,ビル名,フロア数,緯度,経度"

文字列の包含判定(str_detect)

base: grepl
stringr: stringr::str_detect
stringi: stringi::stri_detect_fixed

# パターンを含むかどうかの論理値を返す
stringr::str_detect(string = month.name, pattern = "J")

 [1]  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
[12] FALSE

# base::greplと挙動は同じ
grepl(x = month.name, pattern = "J")

 [1]  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
[12] FALSE

文字列の抽出(str_extract/str_match)

base: regmatches
stringr: stringr::str_extract, stringr::str_extract_all
stringi: stringi::stri_extract_first, stringi::stri_extract_all

stringr: stringr::str_match, stringr::str_match_all
stringi: stringi::stri_match_first, stringi::stri_match_all

# stringr::str_extractはパターンにマッチした箇所のみをベクトルで返す
stringr::str_extract(string = month.name, pattern = "(.{1,3})(.{1,3})")

 [1] "Januar" "Februa" "March"  "April"  "May"    "June"   "July"  
 [8] "August" "Septem" "Octobe" "Novemb" "Decemb"

# stringr::str_match() でグループ化した箇所を、グループに対応する列番号に代入した行列で返す
# 1列目は完全一致した文字列
stringr::str_match(string = month.name, pattern = "(.{1,3})(.{1,3})")

      [,1]     [,2]  [,3] 
 [1,] "Januar" "Jan" "uar"
 [2,] "Februa" "Feb" "rua"
 [3,] "March"  "Mar" "ch" 
 [4,] "April"  "Apr" "il" 
 [5,] "May"    "Ma"  "y"  
 [6,] "June"   "Jun" "e"  
 [7,] "July"   "Jul" "y"  
 [8,] "August" "Aug" "ust"
 [9,] "Septem" "Sep" "tem"
[10,] "Octobe" "Oct" "obe"
[11,] "Novemb" "Nov" "emb"
[12,] "Decemb" "Dec" "emb"

# グループ化しないと同じ結果を出すが、stringr::str_extractはベクトルで、stringr::str_matchは1列の行列
stringr::str_extract(string = month.name, pattern = ".{1,3}.{1,3}")

 [1] "Januar" "Februa" "March"  "April"  "May"    "June"   "July"  
 [8] "August" "Septem" "Octobe" "Novemb" "Decemb"

stringr::str_match(string = month.name, pattern = ".{1,3}.{1,3}")

      [,1]    
 [1,] "Januar"
 [2,] "Februa"
 [3,] "March" 
 [4,] "April" 
 [5,] "May"   
 [6,] "June"  
 [7,] "July"  
 [8,] "August"
 [9,] "Septem"
[10,] "Octobe"
[11,] "Novemb"
[12,] "Decemb"

# stringr::str_extract_allはマッチした全ての箇所を出す
# stringr::str_extractは複数パターンがあった場合は先頭一致
# 引数simplifyで行列とリストの変更が可能（デフォルトはFALSEでリスト）
stringr::str_extract(string = month.name, pattern = "er|em")

 [1] NA   NA   NA   NA   NA   NA   NA   NA   "em" "er" "em" "em"

stringr::str_extract_all(string = month.name, pattern = "er|em", simplify = TRUE)

      [,1] [,2]
 [1,] ""   ""  
 [2,] ""   ""  
 [3,] ""   ""  
 [4,] ""   ""  
 [5,] ""   ""  
 [6,] ""   ""  
 [7,] ""   ""  
 [8,] ""   ""  
 [9,] "em" "er"
[10,] "er" ""  
[11,] "em" "er"
[12,] "em" "er"

# stringr::str_match_allはマッチした全ての箇所を出す
# stringr::str_matchは複数パターンがあった場合は先頭一致
# stringr::str_extract/stringr::str_extract_allと異なりどのパターンにマッチしたかわかる
stringr::str_match(string = month.name, pattern = "(er)|(em)")

      [,1] [,2] [,3]
 [1,] NA   NA   NA  
 [2,] NA   NA   NA  
 [3,] NA   NA   NA  
 [4,] NA   NA   NA  
 [5,] NA   NA   NA  
 [6,] NA   NA   NA  
 [7,] NA   NA   NA  
 [8,] NA   NA   NA  
 [9,] "em" NA   "em"
[10,] "er" "er" NA  
[11,] "em" NA   "em"
[12,] "em" NA   "em"

stringr::str_match_all(string = month.name, pattern = "(er)|(em)")

[[1]]
     [,1] [,2] [,3]

[[2]]
     [,1] [,2] [,3]

[[3]]
     [,1] [,2] [,3]

[[4]]
     [,1] [,2] [,3]

[[5]]
     [,1] [,2] [,3]

[[6]]
     [,1] [,2] [,3]

[[7]]
     [,1] [,2] [,3]

[[8]]
     [,1] [,2] [,3]

[[9]]
     [,1] [,2] [,3]
[1,] "em" ""   "em"
[2,] "er" "er" ""  

[[10]]
     [,1] [,2] [,3]
[1,] "er" "er" ""  

[[11]]
     [,1] [,2] [,3]
[1,] "em" ""   "em"
[2,] "er" "er" ""  

[[12]]
     [,1] [,2] [,3]
[1,] "em" ""   "em"
[2,] "er" "er" ""

# regmatchesはグループ化してもしなくても変わらない
# stringr::str_extract_allに近い結果（stringr::str_extract_allはマッチしない箇所は空文字列）
regmatches(x = month.name, m = gregexpr(text = month.name, pattern = "er|em"))

[[1]]
character(0)

[[2]]
character(0)

[[3]]
character(0)

[[4]]
character(0)

[[5]]
character(0)

[[6]]
character(0)

[[7]]
character(0)

[[8]]
character(0)

[[9]]
[1] "em" "er"

[[10]]
[1] "er"

[[11]]
[1] "em" "er"

[[12]]
[1] "em" "er"

条件を含んだ文字列(str_subset)

base: grep
stringr: stringr::str_subset
stringi: stringi::stri_subset

# stringr::str_subsetは条件を含んだ文字列全体を返す
stringr::str_subset(string = month.name, pattern = "M|J")

[1] "January" "March"   "May"     "June"    "July"

# stringr::str_extract/stringr::str_matchは条件にマッチした部分文字列
stringr::str_extract(string = month.name, pattern = "M|J")

 [1] "J" NA  "M" NA  "M" "J" "J" NA  NA  NA  NA  NA

stringr::str_match(string = month.name, pattern = "M|J") %>%
  t

     [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
[1,] "J"  NA   "M"  NA   "M"  "J"  "J"  NA   NA   NA    NA    NA

# base::grepの引数valueをTRUEにした場合と挙動は同じ
grep(x = month.name, pattern = "M|J", value = TRUE)

[1] "January" "March"   "May"     "June"    "July"

条件を含んだ箇所の数(str_count)

base: -
stringr: stringr::str_count
stringi: stringi::stri_count_fixed

stringr::str_count(string = month.name, pattern = "e")

 [1] 0 1 0 0 0 1 0 0 3 1 2 3

# 複数パターン時はマッチした分だけ加算
stringr::str_count(string = month.name, pattern = "e|J")

 [1] 1 1 0 0 0 2 1 0 3 1 2 3

文字列置換(str_replace)

base: sub, gsub
stringr: stringr::str_replace, stringr::str_replace_all
stringi: stringi::stri_replace, stringi::stri_replace_all

# stringr::str_replaceは引数patternに先頭一致した文字列を、
# stringr::str_replace_allは全てを、
# 引数replacementの値に置換
stringr::str_replace(string = month.name, pattern = "e", replacement = "x")

 [1] "January"   "Fxbruary"  "March"     "April"     "May"      
 [6] "Junx"      "July"      "August"    "Sxptember" "Octobxr"  
[11] "Novxmber"  "Dxcember"

stringr::str_replace_all(string = month.name, pattern = "e", replacement = "x")

 [1] "January"   "Fxbruary"  "March"     "April"     "May"      
 [6] "Junx"      "July"      "August"    "Sxptxmbxr" "Octobxr"  
[11] "Novxmbxr"  "Dxcxmbxr"

# stringr::str_replace_allは次のような対応付ける記述も可能
stringr::str_replace_all(string = month.name, pattern = c("e" = "x", "J" = "K"))

 [1] "Kanuary"   "Fxbruary"  "March"     "April"     "May"      
 [6] "Kunx"      "Kuly"      "August"    "Sxptxmbxr" "Octobxr"  
[11] "Novxmbxr"  "Dxcxmbxr"

# stringr::str_replaceだと引数replacementがないとエラー
try(expr =  stringr::str_replace(string = month.name, pattern = c("e" = "x", "J" = "K")), silent = TRUE)


# base::subとbase::gsubと挙動は同じ
sub(x = month.name, pattern = "e", replacement = "x")

 [1] "January"   "Fxbruary"  "March"     "April"     "May"      
 [6] "Junx"      "July"      "August"    "Sxptember" "Octobxr"  
[11] "Novxmber"  "Dxcember"

gsub(x = month.name, pattern = "e", replacement = "x")

 [1] "January"   "Fxbruary"  "March"     "April"     "May"      
 [6] "Junx"      "July"      "August"    "Sxptxmbxr" "Octobxr"  
[11] "Novxmbxr"  "Dxcxmbxr"

# stringr::str_replace_naでNAの置換も可能
stringr::str_replace_na(string = c(month.name, NA),  replacement = "AN")

 [1] "January"   "February"  "March"     "April"     "May"      
 [6] "June"      "July"      "August"    "September" "October"  
[11] "November"  "December"  "AN"

stringr::str_replace_na(string = c(month.name, "NA"), replacement = "AN")

 [1] "January"   "February"  "March"     "April"     "May"      
 [6] "June"      "July"      "August"    "September" "October"  
[11] "November"  "December"  "NA"

# subだとNAを含むと失敗する
sub(x = c(month.name, NA), pattern = NA, replacement = "AN")

 [1] NA NA NA NA NA NA NA NA NA NA NA NA NA

sub(x = c(month.name, "NA"), pattern = "NA", replacement = "AN")

 [1] "January"   "February"  "March"     "April"     "May"      
 [6] "June"      "July"      "August"    "September" "October"  
[11] "November"  "December"  "AN"

部分文字列抽出と置換(str_sub)

base: substr, substring
stringr: stringr::str_sub
stringi: stringi::stri_sub

month_name_stringr <- month_name_base <- month.name

stringr::str_sub(string = month_name_stringr, start = 1, end = 3)

 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"

# startとendの引数にマイナスの値を指定すると、後ろから数えた文字列になる
stringr::str_sub(string = month_name_stringr, start = -3, end = -1)

 [1] "ary" "ary" "rch" "ril" "May" "une" "uly" "ust" "ber" "ber" "ber"
[12] "ber"

stringr::str_sub(string = month_name_stringr, start = 4, end = -1) <- "ber"
# 4文字目以降を引数valueの"ber"で置換
month_name_stringr

 [1] "Janber" "Febber" "Marber" "Aprber" "Mayber" "Junber" "Julber"
 [8] "Augber" "Sepber" "Octber" "Novber" "Decber"

# base::substr/base::substringとは文字列置換の挙動が異なる
# base::substr/base::substringは引数valueのサイズに合わせて置換
# base::substr/base::substringは同じ
substr(x = month_name_base, start = 1, stop = 3)

 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"

substr(x = month_name_base, start = 4, stop = 10) <- "ber"
# 4文字目以降に引数valueの"ber"を挿入
# 3文字以下の場合は文字の挿入はされない
# 挿入後に元の文字列サイズは越えないように前から切られる
month_name_base

 [1] "Janbery"   "Febberry"  "Marbe"     "Aprbe"     "May"      
 [6] "Junb"      "Julb"      "Augber"    "Sepberber" "Octberr"  
[11] "Novberer"  "Decberer"

month_name_base <- month.name
substring(text = month_name_base, first = 4)  <- "ber"
month_name_base

 [1] "Janbery"   "Febberry"  "Marbe"     "Aprbe"     "May"      
 [6] "Junb"      "Julb"      "Augber"    "Sepberber" "Octberr"  
[11] "Novberer"  "Decberer"

# startとstopの引数にマイナスを指定しても、後ろからにはならない
substr(x = month_name_base, start = -3, stop = -1)

 [1] "" "" "" "" "" "" "" "" "" "" "" ""

substr(x = month_name_base, start = -3, stop = 3)

 [1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"

文字列におけるパターンの位置(str_locate)

base: regexpr, gregexpr
stringr: stringr::str_locate, stringr::str_locate_all
stringi: stringi::stri_locate, stringi::stri_locate_all, (stringi::stri_locate_first, stringi::stri_locate_last)

# stringr::str_locateは複数パターンがあった場合は先頭一致するパターンの位置
# "June"の戻り値は最初に一致したパターン"J"の位置
stringr::str_locate(string = month.name, pattern = "e|J")

      start end
 [1,]     1   1
 [2,]     2   2
 [3,]    NA  NA
 [4,]    NA  NA
 [5,]    NA  NA
 [6,]     1   1
 [7,]     1   1
 [8,]    NA  NA
 [9,]     2   2
[10,]     6   6
[11,]     4   4
[12,]     2   2

# stringr::str_locate_allはマッチした全ての箇所を出す
# 複数時の順番はマッチした順（パターンの記述順は無視）
# "June"の戻り値は最初に一致したパターン"J"の位置と、次に一致したパターン"e"の位置
stringr::str_locate_all(string = month.name, pattern = "e|J")

[[1]]
     start end
[1,]     1   1

[[2]]
     start end
[1,]     2   2

[[3]]
     start end

[[4]]
     start end

[[5]]
     start end

[[6]]
     start end
[1,]     1   1
[2,]     4   4

[[7]]
     start end
[1,]     1   1

[[8]]
     start end

[[9]]
     start end
[1,]     2   2
[2,]     5   5
[3,]     8   8

[[10]]
     start end
[1,]     6   6

[[11]]
     start end
[1,]     4   4
[2,]     7   7

[[12]]
     start end
[1,]     2   2
[2,]     4   4
[3,]     7   7

# stringr::invert_matchにstringr::str_locate_allの戻り値を渡すとマッチしなかった位置の範囲を返す
match_idx <- stringr::invert_match(
  loc = stringr::str_locate_all(string = month.name, pattern = "e|J")[[9]]
)
stringr::str_sub(
  string = month.name[9],
  start = match_idx[, "start"], end = match_idx[, "end"]
)

[1] "S"  "pt" "mb" "r"

# base::regexpr/base::gregexprは似た挙動
# マッチした開始位置とマッチした長さを返す
regexpr(text = month.name, pattern = "e|J")

 [1]  1  2 -1 -1 -1  1  1 -1  2  6  4  2
attr(,"match.length")
 [1]  1  1 -1 -1 -1  1  1 -1  1  1  1  1
attr(,"useBytes")
[1] TRUE

文字列の繰り返し(str_dup)

base: rep
stringr: stringr::str_dup
stringi: stringi::stri_dup

# times引数の各値だけstring引数の各ベクトルを繰り返す
# stringが「"月", "火", ..., "日"」で、timesが「1, 2, ..., 7」なので、
# 「月が1回」「火が2回」という結果になる
stringr::str_dup(
  string = format(as.Date("2015-07-12") + seq(from = 1, to = 7), "%a"), 
  times = seq(from = 1, to = 7)
)

[1] "月"             "火火"           "水水水"         "木木木木"      
[5] "金金金金金"     "土土土土土土"   "日日日日日日日"

# base::repが似た挙動
rep(
  x = format(as.Date("2015-07-12") + seq(from = 1, to = 7), "%a"),
  times = seq(from = 1, to = 7)
)

 [1] "月" "火" "火" "水" "水" "水" "木" "木" "木" "木" "金" "金" "金" "金"
[15] "金" "土" "土" "土" "土" "土" "土" "日" "日" "日" "日" "日" "日" "日"

文字列の長さ(str_length)

base: nchar
stringr: stringr::str_length
stringi: stringi::stri_length

# 各文字列の長さを返す
stringr::str_length(string = month.name)

 [1] 7 8 5 5 3 4 4 6 9 7 8 8

# UTF-8でないとダメっぽい（Windowsの場合は通る。SHIFT-JISを{stringi}側でUTF-8に変換してくれる？）
stringr::str_length(string = shift_jis_str)

Warning in stri_length(string): invalid UTF-8 byte sequence detected.
perhaps you should try calling stri_enc_toutf8()

[1] NA

stringr::str_length(string = utf_str)

[1] 55

# unicode文字列も変換すると正しく数えられる
unicode_str <- "\\u3042\\u3043\\u3045\\u3045"
stringr::str_length(string = unicode_str)

[1] 24

stringr::str_length(string = stringi::stri_unescape_unicode(unicode_str))

[1] 4

# stringi::stri_numbytesだとバイト長を測る
stringr::str_length(string = c("abc", "123", "\u0105\u0104"))

[1] 3 3 2

stringi::stri_numbytes(str = c("abc", "123", "\u0105\u0104"))

[1] 3 3 4

# 長さによる判定でもできるが、空文字列かどうかの判定するstringi::stri_isemptyがある
stringi::stri_isempty(str = c("", "abc", "123", "\u0105\u0104", character(1)))

[1]  TRUE FALSE FALSE FALSE  TRUE

文字列の順序とソート(str_order/str_sort)

base: order, sort
stringr: stringr::str_order, stringr::str_sort
stringi: stringi::stri_order, stringi::stri_sort

# Split the string into single characters. str_split() returns a list with one
# element per input string, so unlist() flattens it into a character vector
# (base replacement for the deprecated dplyr::combine()).
ja_aiueo <- stringr::str_split(string = "あいうえおアイウエオｱｲｳｴｵ", pattern = "") %>% 
  unlist()

stringr::str_sort(x = ja_aiueo)

 [1] "あ" "ア" "ｱ"  "い" "イ" "ｲ"  "う" "ウ" "ｳ"  "え" "エ" "ｴ"  "お" "オ"
[15] "ｵ"

stringr::str_order(x = ja_aiueo)

 [1]  1  6 11  2  7 12  3  8 13  4  9 14  5 10 15

# base::orderは同じ結果だが、base::sortは微妙に異なる結果を出す
# Windowsだと「"ｱ"  "ア" "あ" "ｲ"  "イ" "い" "ｳ"  "ウ" "う" "ｴ"  "エ" "え" "ｵ"  "オ" "お"」になる
sort(x = ja_aiueo)

 [1] "あ" "ｱ"  "ア" "い" "ｲ"  "イ" "ｳ"  "ウ" "う" "エ" "え" "ｴ"  "オ" "お"
[15] "ｵ"

order(x = ja_aiueo)

 [1]  1  6 11  2  7 12  3  8 13  4  9 14  5 10 15

文字列の整形（空白生成と空白除去、折り返し）(str_pad/str_trim/str_wrap)

base: -
stringr: stringr::str_pad, stringr::str_trim, stringr::str_wrap
stringi: stringi::stri_pad_left, stringi::stri_pad_both, stringi::stri_pad_right,
stringi::stri_trim_left, stringi::stri_trim_both, stringi::stri_trim_right,
stringi::stri_wrap

# width引数の長さになるまで、pad引数の文字列を、side引数の箇所（"both"の際は右側優先）に追加
padding_month_name <- stringr::str_pad(
  string = month.name,
  width = 8, side = "both", pad = " "
) %>%
  print

 [1] "January "  "February"  " March  "  " April  "  "  May   " 
 [6] "  June  "  "  July  "  " August "  "September" "October " 
[11] "November"  "December"

# 文字列長が1なら空白以外も引数padに指定可能
stringr::str_pad(
  string = month.name,
  width = 8, side = "both", pad = "-"
)

 [1] "January-"  "February"  "-March--"  "-April--"  "--May---" 
 [6] "--June--"  "--July--"  "-August-"  "September" "October-" 
[11] "November"  "December"

# 空白文字列を除去
stringr::str_trim(string = padding_month_name, side = "both")

 [1] "January"   "February"  "March"     "April"     "May"      
 [6] "June"      "July"      "August"    "September" "October"  
[11] "November"  "December"

# 文字列ベクトルを引数widthくらいの長さで改行
# 段落の先頭行に引数indent分の空白を、2行目以降の各行頭に引数exdent分の空白を追加
stringr::str_wrap(
  string = stringr::str_c(month.name, collapse = " "),
  width = 30, indent = 10, exdent = 3
)

[1] "          January February\n   March April May June July\n   August September October\n   November December"

文字列の大文字・小文字・タイトルケース化(str_to_lower/str_to_title/str_to_upper)

base: tolower, toupper
stringr: stringr::str_to_lower, stringr::str_to_title, stringr::str_to_upper
stringi: stringi::stri_trans_tolower, stringi::stri_trans_totitle, stringi::stri_trans_toupper

stringr::str_to_upper(string = month.name)

 [1] "JANUARY"   "FEBRUARY"  "MARCH"     "APRIL"     "MAY"      
 [6] "JUNE"      "JULY"      "AUGUST"    "SEPTEMBER" "OCTOBER"  
[11] "NOVEMBER"  "DECEMBER"

stringr::str_to_lower(string = month.name)

 [1] "january"   "february"  "march"     "april"     "may"      
 [6] "june"      "july"      "august"    "september" "october"  
[11] "november"  "december"

stringr::str_to_title(string = stringr::str_to_lower(string = month.name))

 [1] "January"   "February"  "March"     "April"     "May"      
 [6] "June"      "July"      "August"    "September" "October"  
[11] "November"  "December"

# baseにはタイトルケース化の関数はない（パッケージの利用やパターンマッチで行える。下記リンクより）
# http://stackoverflow.com/questions/6364783/capitalize-the-first-letter-of-both-words-in-a-two-word-string
# base::tolower/base::toupperで、大文字化と小文字化は可能
tolower(x = month.name)

 [1] "january"   "february"  "march"     "april"     "may"      
 [6] "june"      "july"      "august"    "september" "october"  
[11] "november"  "december"

toupper(x = month.name)

 [1] "JANUARY"   "FEBRUARY"  "MARCH"     "APRIL"     "MAY"      
 [6] "JUNE"      "JULY"      "AUGUST"    "SEPTEMBER" "OCTOBER"  
[11] "NOVEMBER"  "DECEMBER"

{stringr}で定義されているその他の関数

単語抽出(word)

base: -
stringr: stringr::word
stringi: -

str <- c("我輩", "は", "猫", "で", "ある")

# 特定の文字で分かち書きされた文から単語を抽出
stringr::word(
  string = stringi::stri_flatten(str = str, collapse = " "), 
  start = 1, end = seq(from = 1, to = length(str)),
  sep = " "
)

[1] "我輩"               "我輩 は"            "我輩 は 猫"        
[4] "我輩 は 猫 で"      "我輩 は 猫 で ある"

# 単語からなる文字列ベクトルでは期待通りに動作しない（各要素がそのまま返るだけ）
# 単語をある文字列で連結させ、ひとつの文字列にする必要がある(stringi::stri_flattenがこの用途に有用)
# "我輩 は 猫 で ある" => OK
# c("我輩", "は", "猫", "で", "ある") => NG
stringr::word(string = str, start = 1)

[1] "我輩" "は"   "猫"   "で"   "ある"

単語分割パターンの制御(boundary)

base: -
stringr: stringr::boundary
stringi: stringi::stri_opts_brkiter

# stringr::boundaryではstringi::stri_opts_brkiterを呼び出す
# stringr::str_splitの使用時にstringi::stri_split_boundariesにてstringi::stri_opts_brkiterが参照される
# Webページのようなスペースが揃っていないテキストを入手した際、パースに悩むときに使う
web_like_text <- stringr::str_wrap(
  string = stringr::str_c(month.name, collapse = " "),
  width = 30, indent = 10, exdent = 3
) %>%
  print

[1] "          January February\n   March April May June July\n   August September October\n   November December"

# うまくいかない
stringr::str_split(string = web_like_text, pattern = " ")

[[1]]
 [1] ""           ""           ""           ""           ""          
 [6] ""           ""           ""           ""           ""          
[11] "January"    "February\n" ""           ""           "March"     
[16] "April"      "May"        "June"       "July\n"     ""          
[21] ""           "August"     "September"  "October\n"  ""          
[26] ""           "November"   "December"

# type = "word"で単語毎に分ける
stringr::str_split(
  string = web_like_text, 
  pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
) %>% 
  dplyr::combine()

 [1] "January"   "February"  "March"     "April"     "May"      
 [6] "June"      "July"      "August"    "September" "October"  
[11] "November"  "December"

# type = "line_break"は改行文字直前の単語のみを抽出
stringr::str_split(
  string = web_like_text, 
  pattern = stringr::boundary(type = "line_break", skip_word_none = TRUE)
) %>% 
  dplyr::combine()

[1] "February\n" "July\n"     "October\n"

# 改行文字を削除すると機能しない
stringr::str_split(
  string = stringr::str_replace_all(string = web_like_text, pattern = "\\n", replacement = ""), 
  pattern = stringr::boundary(type = "line_break", skip_word_none = TRUE)
) %>% 
  dplyr::combine()

character(0)

# type = "sentence"は文単位（改行）で区切る
stringr::str_split(
  string = web_like_text, 
  pattern = stringr::boundary(type = "sentence", skip_word_none = TRUE)
) %>% 
  dplyr::combine()

[1] "          January February\n"   "   March April May June July\n"
[3] "   August September October\n"  "   November December"

# 改行文字を削除するとそのまま出力
stringr::str_split(
  string = stringr::str_replace_all(string = web_like_text, pattern = "\\n", replacement = ""), 
  pattern = stringr::boundary(type = "sentence", skip_word_none = TRUE)
) %>% 
  dplyr::combine()

[1] "          January February   March April May June July   August September October   November December"

# type = "character"の挙動がよくわからない
stringr::str_split(
  string = stringr::str_wrap(
    string = stringr::str_c(month.abb, n_day, sep = ".", collapse = ""),
    width = 30, indent = 10, exdent = 3
  ), 
  pattern = stringr::boundary(type = "character", skip_word_none = TRUE)
) %>% 
  dplyr::combine()

character(0)

# 日本語でも動作する
web_like_ja_text <- stringr::str_wrap(
  string = stringr::str_c(
    stringr::str_dup(
      string = format(as.Date("2015-07-12") + seq(from = 1, to = 7), "%a"), 
      times = seq(from = 1, to = 7)),
    collapse = " "
  ),
  width = 20, indent = 10, exdent = 3
) %>% 
  print

[1] "          月 火火 水水水 木\n   木木木 金金金金金 土土土土土土\n   日日日日日日日"

stringr::str_split(
  string = web_like_ja_text, 
  pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
)

[[1]]
[1] "月"             "火火"           "水水水"         "木"            
[5] "木木木"         "金金金金金"     "土土土土土土"   "日日日日日日日"

stringr::str_split(
  string = web_like_ja_text, 
  pattern = stringr::boundary(type = "line_break", skip_word_none = TRUE)
)

[[1]]
[1] "木\n" "土\n"

stringr::str_split(
  string = web_like_ja_text, 
  pattern = stringr::boundary(type = "sentence", skip_word_none = TRUE)
)

[[1]]
[1] "          月 火火 水水水 木\n"      
[2] "   木木木 金金金金金 土土土土土土\n"
[3] "   日日日日日日日"

# 「stringi::stri_*_boundaries」を呼び出している関数で使える

# stringr::str_locate/stringr::str_locate_allでも使える
# stri_locate_all_boundaries, stri_locate_first_boundaries, stri_locate_last_boundariesにて参照
word_locate <- stringr::str_locate_all(
  string = web_like_text, 
  pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
)
stringr::str_sub(
  string = web_like_text,
  start = word_locate[[1]][, "start"], end = word_locate[[1]][, "end"]
)

 [1] "January"   "February"  "March"     "April"     "May"      
 [6] "June"      "July"      "August"    "September" "October"  
[11] "November"  "December"

# stringr::str_countでも使える（stringi::stri_count_boundariesで参照）
# 単語で分割された数（元データが暦の英語名）が出力
stringr::str_count(
  string = web_like_text, 
  pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
)

[1] 12

# stringr::str_detect/stringr::str_extract/stringr::str_subsetは未実装（エラー）
# 「stringi::stri_*_boundaries」を呼び出す関数がない
# それぞれのドキュメントには引数patternでstringr::boundary()が使えると書いてあります
try(
  expr = stringr::str_subset(
    string = web_like_text, 
    pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
  )
)

マッチパターンの制御(regex, fixed, coll)

base: -
stringr: stringr::regex, stringr::fixed, stringr::coll
stringi: stringi::stri_opts_regex, stringi::stri_opts_fixed, stringi::stri_opts_collator

# {stringr}のデフォルトの正規表現は{stringi}で使われるICU正規表現エンジンで処理される
# どういう正規表現が使えるかは下記のオンラインマニュアルを参照
# http://docs.rexamine.com/R-man/stringi/stringi-search-regex.html


# stringr::regexはデフォルトのICUのオプションを変更する際に使う
# 変えられるオプションはstringi::stri_opts_regexを参照のこと
multiline_str <- stringr::str_c(
  stringr::str_split(string = web_like_text, pattern = stringr::boundary(type = "line_break", skip_word_none = TRUE)) %>% 
    dplyr::combine(), 
  collapse = ""
) %>%
  print

[1] "February\nJuly\nOctober\n"

# 引数multilineはFALSEがデフォルト
str_extract_all(string = multiline_str, pattern = "^.")

[[1]]
[1] "F"

# 引数multilineはTRUEに変更
str_extract_all(string = multiline_str, pattern = stringr::regex(pattern = "^.", multiline = TRUE))

[[1]]
[1] "F" "J" "O"

# デフォルト仕様と異なるマッチの方法をさせたい場合にstringr::fixed/stringr::collを用いる（前述のstringr::boundaryも同様の用途で使える）
# @kohske先生の記事がとてもわかりやすい
# http://qiita.com/kohske/items/85d49da04571e9055c44#パターン検索

# stringr::fixedではパターンをロケールに依存しないバイト列としてマッチさせる
# Examplesより
strings <- c("abb", "a.b")
pattern <- "a.b"

str_detect(strings, pattern)

[1] TRUE TRUE

str_detect(strings, fixed(pattern))

[1] FALSE  TRUE

str_detect(strings, coll(pattern))

[1] FALSE  TRUE

# stringr::collはロケールを考慮したマッチに役立つ
i <- c("I", "\u0130", "i") %>%
  print

[1] "I"  "İ" "i"

str_detect(i, fixed("i", TRUE))

[1]  TRUE FALSE  TRUE

str_detect(i, coll("i", TRUE))

[1]  TRUE FALSE  TRUE

str_detect(i, coll("i", TRUE, locale = "tr"))

[1] FALSE  TRUE  TRUE

{stringi}で定義されているいろいろな関数

{stringr}でAPI化されていない関数もいくつかあるので、それらを動かしてみる
　要参照

{stringi}の環境関係(stri_info, stri_install_check, …)

stri_info, stri_install_check, stri_install_icudt

# Windowsだと警告文が出る
# Your native charset is not a superset of US-ASCII. This may cause serious problems. Consider switching to UTF-8.
stringi::stri_info()

$Unicode.version
[1] "6.3"

$ICU.version
[1] "52.1"

$Locale
$Locale$Language
[1] "ja"

$Locale$Country
[1] "JP"

$Locale$Variant
[1] ""

$Locale$Name
[1] "ja_JP"


$Charset.internal
[1] "UTF-8"  "UTF-16"

$Charset.native
$Charset.native$Name.friendly
[1] "UTF-8"

$Charset.native$Name.ICU
[1] "UTF-8"

$Charset.native$Name.UTR22
[1] NA

$Charset.native$Name.IBM
[1] "ibm-1208"

$Charset.native$Name.WINDOWS
[1] "windows-65001"

$Charset.native$Name.JAVA
[1] "UTF-8"

$Charset.native$Name.IANA
[1] "UTF-8"

$Charset.native$Name.MIME
[1] "UTF-8"

$Charset.native$ASCII.subset
[1] TRUE

$Charset.native$Unicode.1to1
[1] NA

$Charset.native$CharSize.8bit
[1] FALSE

$Charset.native$CharSize.min
[1] 1

$Charset.native$CharSize.max
[1] 3

stringi::stri_install_check()
stringi::stri_install_icudt()

Localeの設定関係(stri_locale_get, stri_locale_list, …)

stri_locale_get, stri_locale_info, stri_locale_list, stri_locale_set

# 使用できるICUのロケール名
stringi::stri_locale_list()

  [1] "af"          "af_NA"       "af_ZA"       "agq"         "agq_CM"     
  [6] "ak"          "ak_GH"       "am"          "am_ET"       "ar"         
 [11] "ar_001"      "ar_AE"       "ar_BH"       "ar_DJ"       "ar_DZ"      
 [16] "ar_EG"       "ar_EH"       "ar_ER"       "ar_IL"       "ar_IQ"      
 [21] "ar_JO"       "ar_KM"       "ar_KW"       "ar_LB"       "ar_LY"      
 [26] "ar_MA"       "ar_MR"       "ar_OM"       "ar_PS"       "ar_QA"      
 [31] "ar_SA"       "ar_SD"       "ar_SO"       "ar_SS"       "ar_SY"      
 [36] "ar_TD"       "ar_TN"       "ar_YE"       "as"          "as_IN"      
 [41] "asa"         "asa_TZ"      "az"          "az_Cyrl"     "az_Cyrl_AZ" 
 [46] "az_Latn"     "az_Latn_AZ"  "bas"         "bas_CM"      "be"         
 [51] "be_BY"       "bem"         "bem_ZM"      "bez"         "bez_TZ"     
 [56] "bg"          "bg_BG"       "bm"          "bm_ML"       "bn"         
 [61] "bn_BD"       "bn_IN"       "bo"          "bo_CN"       "bo_IN"      
 [66] "br"          "br_FR"       "brx"         "brx_IN"      "bs"         
 [71] "bs_Cyrl"     "bs_Cyrl_BA"  "bs_Latn"     "bs_Latn_BA"  "ca"         
 [76] "ca_AD"       "ca_ES"       "ca_FR"       "ca_IT"       "cgg"        
 [81] "cgg_UG"      "chr"         "chr_US"      "cs"          "cs_CZ"      
 [86] "cy"          "cy_GB"       "da"          "da_DK"       "da_GL"      
 [91] "dav"         "dav_KE"      "de"          "de_AT"       "de_BE"      
 [96] "de_CH"       "de_DE"       "de_LI"       "de_LU"       "dje"        
[101] "dje_NE"      "dua"         "dua_CM"      "dyo"         "dyo_SN"     
[106] "dz"          "dz_BT"       "ebu"         "ebu_KE"      "ee"         
[111] "ee_GH"       "ee_TG"       "el"          "el_CY"       "el_GR"      
[116] "en"          "en_001"      "en_150"      "en_AG"       "en_AI"      
[121] "en_AS"       "en_AU"       "en_BB"       "en_BE"       "en_BM"      
[126] "en_BS"       "en_BW"       "en_BZ"       "en_CA"       "en_CC"      
[131] "en_CK"       "en_CM"       "en_CX"       "en_DG"       "en_DM"      
[136] "en_ER"       "en_FJ"       "en_FK"       "en_FM"       "en_GB"      
[141] "en_GD"       "en_GG"       "en_GH"       "en_GI"       "en_GM"      
[146] "en_GU"       "en_GY"       "en_HK"       "en_IE"       "en_IM"      
[151] "en_IN"       "en_IO"       "en_JE"       "en_JM"       "en_KE"      
[156] "en_KI"       "en_KN"       "en_KY"       "en_LC"       "en_LR"      
[161] "en_LS"       "en_MG"       "en_MH"       "en_MO"       "en_MP"      
[166] "en_MS"       "en_MT"       "en_MU"       "en_MW"       "en_NA"      
[171] "en_NF"       "en_NG"       "en_NR"       "en_NU"       "en_NZ"      
[176] "en_PG"       "en_PH"       "en_PK"       "en_PN"       "en_PR"      
[181] "en_PW"       "en_RW"       "en_SB"       "en_SC"       "en_SD"      
[186] "en_SG"       "en_SH"       "en_SL"       "en_SS"       "en_SX"      
[191] "en_SZ"       "en_TC"       "en_TK"       "en_TO"       "en_TT"      
[196] "en_TV"       "en_TZ"       "en_UG"       "en_UM"       "en_US"      
[201] "en_US_POSIX" "en_VC"       "en_VG"       "en_VI"       "en_VU"      
[206] "en_WS"       "en_ZA"       "en_ZM"       "en_ZW"       "eo"         
[211] "es"          "es_419"      "es_AR"       "es_BO"       "es_CL"      
[216] "es_CO"       "es_CR"       "es_CU"       "es_DO"       "es_EA"      
[221] "es_EC"       "es_ES"       "es_GQ"       "es_GT"       "es_HN"      
[226] "es_IC"       "es_MX"       "es_NI"       "es_PA"       "es_PE"      
[231] "es_PH"       "es_PR"       "es_PY"       "es_SV"       "es_US"      
[236] "es_UY"       "es_VE"       "et"          "et_EE"       "eu"         
[241] "eu_ES"       "ewo"         "ewo_CM"      "fa"          "fa_AF"      
[246] "fa_IR"       "ff"          "ff_SN"       "fi"          "fi_FI"      
[251] "fil"         "fil_PH"      "fo"          "fo_FO"       "fr"         
[256] "fr_BE"       "fr_BF"       "fr_BI"       "fr_BJ"       "fr_BL"      
[261] "fr_CA"       "fr_CD"       "fr_CF"       "fr_CG"       "fr_CH"      
[266] "fr_CI"       "fr_CM"       "fr_DJ"       "fr_DZ"       "fr_FR"      
[271] "fr_GA"       "fr_GF"       "fr_GN"       "fr_GP"       "fr_GQ"      
[276] "fr_HT"       "fr_KM"       "fr_LU"       "fr_MA"       "fr_MC"      
[281] "fr_MF"       "fr_MG"       "fr_ML"       "fr_MQ"       "fr_MR"      
[286] "fr_MU"       "fr_NC"       "fr_NE"       "fr_PF"       "fr_PM"      
[291] "fr_RE"       "fr_RW"       "fr_SC"       "fr_SN"       "fr_SY"      
[296] "fr_TD"       "fr_TG"       "fr_TN"       "fr_VU"       "fr_WF"      
[301] "fr_YT"       "ga"          "ga_IE"       "gl"          "gl_ES"      
[306] "gsw"         "gsw_CH"      "gsw_LI"      "gu"          "gu_IN"      
[311] "guz"         "guz_KE"      "gv"          "gv_IM"       "ha"         
[316] "ha_Latn"     "ha_Latn_GH"  "ha_Latn_NE"  "ha_Latn_NG"  "haw"        
[321] "haw_US"      "he"          "he_IL"       "hi"          "hi_IN"      
[326] "hr"          "hr_BA"       "hr_HR"       "hu"          "hu_HU"      
[331] "hy"          "hy_AM"       "id"          "id_ID"       "ig"         
[336] "ig_NG"       "ii"          "ii_CN"       "is"          "is_IS"      
[341] "it"          "it_CH"       "it_IT"       "it_SM"       "ja"         
[346] "ja_JP"       "jgo"         "jgo_CM"      "jmc"         "jmc_TZ"     
[351] "ka"          "ka_GE"       "kab"         "kab_DZ"      "kam"        
[356] "kam_KE"      "kde"         "kde_TZ"      "kea"         "kea_CV"     
[361] "khq"         "khq_ML"      "ki"          "ki_KE"       "kk"         
[366] "kk_Cyrl"     "kk_Cyrl_KZ"  "kkj"         "kkj_CM"      "kl"         
[371] "kl_GL"       "kln"         "kln_KE"      "km"          "km_KH"      
[376] "kn"          "kn_IN"       "ko"          "ko_KP"       "ko_KR"      
[381] "kok"         "kok_IN"      "ks"          "ks_Arab"     "ks_Arab_IN" 
[386] "ksb"         "ksb_TZ"      "ksf"         "ksf_CM"      "kw"         
[391] "kw_GB"       "ky"          "ky_Cyrl"     "ky_Cyrl_KG"  "lag"        
[396] "lag_TZ"      "lg"          "lg_UG"       "lkt"         "lkt_US"     
[401] "ln"          "ln_AO"       "ln_CD"       "ln_CF"       "ln_CG"      
[406] "lo"          "lo_LA"       "lt"          "lt_LT"       "lu"         
[411] "lu_CD"       "luo"         "luo_KE"      "luy"         "luy_KE"     
[416] "lv"          "lv_LV"       "mas"         "mas_KE"      "mas_TZ"     
[421] "mer"         "mer_KE"      "mfe"         "mfe_MU"      "mg"         
[426] "mg_MG"       "mgh"         "mgh_MZ"      "mgo"         "mgo_CM"     
[431] "mk"          "mk_MK"       "ml"          "ml_IN"       "mn"         
[436] "mn_Cyrl"     "mn_Cyrl_MN"  "mr"          "mr_IN"       "ms"         
[441] "ms_Latn"     "ms_Latn_BN"  "ms_Latn_MY"  "ms_Latn_SG"  "mt"         
[446] "mt_MT"       "mua"         "mua_CM"      "my"          "my_MM"      
[451] "naq"         "naq_NA"      "nb"          "nb_NO"       "nb_SJ"      
[456] "nd"          "nd_ZW"       "ne"          "ne_IN"       "ne_NP"      
[461] "nl"          "nl_AW"       "nl_BE"       "nl_BQ"       "nl_CW"      
[466] "nl_NL"       "nl_SR"       "nl_SX"       "nmg"         "nmg_CM"     
[471] "nn"          "nn_NO"       "nnh"         "nnh_CM"      "nus"        
[476] "nus_SD"      "nyn"         "nyn_UG"      "om"          "om_ET"      
[481] "om_KE"       "or"          "or_IN"       "pa"          "pa_Arab"    
[486] "pa_Arab_PK"  "pa_Guru"     "pa_Guru_IN"  "pl"          "pl_PL"      
[491] "ps"          "ps_AF"       "pt"          "pt_AO"       "pt_BR"      
[496] "pt_CV"       "pt_GW"       "pt_MO"       "pt_MZ"       "pt_PT"      
[501] "pt_ST"       "pt_TL"       "rm"          "rm_CH"       "rn"         
[506] "rn_BI"       "ro"          "ro_MD"       "ro_RO"       "rof"        
[511] "rof_TZ"      "ru"          "ru_BY"       "ru_KG"       "ru_KZ"      
[516] "ru_MD"       "ru_RU"       "ru_UA"       "rw"          "rw_RW"      
[521] "rwk"         "rwk_TZ"      "saq"         "saq_KE"      "sbp"        
[526] "sbp_TZ"      "seh"         "seh_MZ"      "ses"         "ses_ML"     
[531] "sg"          "sg_CF"       "shi"         "shi_Latn"    "shi_Latn_MA"
[536] "shi_Tfng"    "shi_Tfng_MA" "si"          "si_LK"       "sk"         
[541] "sk_SK"       "sl"          "sl_SI"       "sn"          "sn_ZW"      
[546] "so"          "so_DJ"       "so_ET"       "so_KE"       "so_SO"      
[551] "sq"          "sq_AL"       "sq_MK"       "sq_XK"       "sr"         
[556] "sr_Cyrl"     "sr_Cyrl_BA"  "sr_Cyrl_ME"  "sr_Cyrl_RS"  "sr_Cyrl_XK" 
[561] "sr_Latn"     "sr_Latn_BA"  "sr_Latn_ME"  "sr_Latn_RS"  "sr_Latn_XK" 
[566] "sv"          "sv_AX"       "sv_FI"       "sv_SE"       "sw"         
[571] "sw_KE"       "sw_TZ"       "sw_UG"       "swc"         "swc_CD"     
[576] "ta"          "ta_IN"       "ta_LK"       "ta_MY"       "ta_SG"      
[581] "te"          "te_IN"       "teo"         "teo_KE"      "teo_UG"     
[586] "th"          "th_TH"       "ti"          "ti_ER"       "ti_ET"      
[591] "to"          "to_TO"       "tr"          "tr_CY"       "tr_TR"      
[596] "twq"         "twq_NE"      "tzm"         "tzm_Latn"    "tzm_Latn_MA"
[601] "uk"          "uk_UA"       "ur"          "ur_IN"       "ur_PK"      
[606] "uz"          "uz_Arab"     "uz_Arab_AF"  "uz_Cyrl"     "uz_Cyrl_UZ" 
[611] "uz_Latn"     "uz_Latn_UZ"  "vai"         "vai_Latn"    "vai_Latn_LR"
[616] "vai_Vaii"    "vai_Vaii_LR" "vi"          "vi_VN"       "vun"        
[621] "vun_TZ"      "xog"         "xog_UG"      "yav"         "yav_CM"     
[626] "yo"          "yo_BJ"       "yo_NG"       "zgh"         "zgh_MA"     
[631] "zh"          "zh_Hans"     "zh_Hans_CN"  "zh_Hans_HK"  "zh_Hans_MO" 
[636] "zh_Hans_SG"  "zh_Hant"     "zh_Hant_HK"  "zh_Hant_MO"  "zh_Hant_TW" 
[641] "zu"          "zu_ZA"

# ロケール情報の取得と設定
stringi::stri_locale_info()

$Language
[1] "ja"

$Country
[1] "JP"

$Variant
[1] ""

$Name
[1] "ja_JP"

now_locale <- stringi::stri_locale_get()
stringi::stri_locale_set(locale = now_locale)

Unicode文字クラスで文字列マッチ(stri_*_charclass)

stri_*_charclass

# {stringr}ではUnicode文字クラスでのパターンマッチは用意されていない
stringi::stri_subset_charclass(
  str = c("stRRRingi","REXAMINE","123"),
  pattern = c("\\p{Ll}", "\\p{Lu}", "\\p{Zs}")
)

[1] "stRRRingi" "REXAMINE"

文字列比較(stri_cmp*)

stri_cmp*, ~~stri_compare(alias)~~

# stringi::stri_cmp*
ls("package:stringi") %>% 
  stringr::str_subset(pattern = "^stri_cmp")

[1] "stri_cmp"        "stri_cmp_eq"     "stri_cmp_equiv"  "stri_cmp_ge"    
[5] "stri_cmp_gt"     "stri_cmp_le"     "stri_cmp_lt"     "stri_cmp_neq"   
[9] "stri_cmp_nequiv"

# 文字列比較（ロケール依存）
# 「Cのstrcmp()と同じような挙動」らしい
# [e1 < e2]: -1, [e1 == e2]: 0, [e1 > e2]: +1
stringi::stri_cmp(e1 = "number100", e2 = "number2")

[1] -1

stringi::stri_cmp(e1 = "number100", e2 = "number2", opts_collator = stri_opts_collator(numeric = TRUE))

[1] 1

# ロケール非依存
# stringi::stri_cmp_eq/stringi::stri_cmp_neq: exactly the same/different code points
stringi::stri_cmp_eq(e1 = stringi::stri_trans_nfkd("\u0105"), e2 = "\u105")

[1] FALSE

stringi::stri_cmp_neq(e1 = "hladny", e2 = "HLADNY")

[1] TRUE

# ロケール依存
# stringi::stri_cmp_equiv: canonically equivalent
# stringi::stri_cmp_nequiv: not canonically equivalent
# opts_collatorで受け取れる引数(locale, strength, ...)を指定できる
stringi::stri_cmp_equiv(e1 = "hladny", e2 = "HLADNY", strength = 2)

[1] TRUE

stringi::stri_cmp_nequiv(e1 = "hladny", e2 = "HLADNY", strength = 2)

[1] FALSE

stringi::stri_cmp_nequiv(e1 = "hladny", e2 = "HLADNY", strength = 3)

[1] TRUE

# 辞書順の符号比較（ロケール依存）
# stringi::stri_cmp_lt = "<", stringi::stri_cmp_gt = ">"
# stringi::stri_cmp_le = "<=", stringi::stri_cmp_ge = ">="
stringi::stri_cmp_lt(e1 = "hladny", e2 = "chladny", locale = "pl_PL")

[1] FALSE

stringi::stri_cmp_lt(e1 = "hladny", e2 = "chladny", locale = "sk_SK")

[1] TRUE

stringi::stri_cmp_gt(e1 = "hladny", e2 = "chladny", locale = "pl_PL")

[1] TRUE

# 文字列比較用の演算子も定義されている
# default collator optionsが使われる
# %s==%, %s!=%, %s<%, %s<=%, %s>%, %s>=%
# %stri==%, %stri!=%, %stri<%, %stri<=%, %stri>%, %stri>=%

# 「%s==%」「%stri==%」は canonical equivalence, locale-dependent
# 「%s===%」「%stri===%」は コードポイントの完全一致(exactly the same code points), locale-independent(code point-based)
(stringi::stri_trans_nfkd("\u0105")) %s==% "\u105"

[1] TRUE

(stringi::stri_trans_nfkd("\u0105")) %s===% "\u105"

[1] FALSE

# 「%s!=%」「%stri!=%」は not canonical equivalence, locale-dependent
# 「%s!==%」「%stri!==%」は コードポイントの不一致(different code points), locale-independent(code point-based)

文字列の重複について(stri_duplicated, stri_unique)

stri_duplicated, stri_unique

# 重複するか判定
dup_input <- c("a", "b", "a", NA, "a", NA)
stringi::stri_duplicated(str = dup_input)

[1] FALSE FALSE  TRUE FALSE  TRUE  TRUE

# 後ろから判定する場合は引数fromLastをTRUE
stringi::stri_duplicated(str = dup_input, fromLast = TRUE)

[1]  TRUE FALSE  TRUE  TRUE FALSE FALSE

rev(stringi::stri_duplicated(str = rev(dup_input)))

[1]  TRUE FALSE  TRUE  TRUE FALSE FALSE

# base::duplicatedは文字列の正準等価性を見ていないので、正準等価だがコードポイントが異なる文字列を重複と判定できない
dup_str <- c("\u0105", stringi::stri_trans_nfkd("\u0105")) %>%
  print

[1] "ą" "ą"

duplicated(x = dup_str)

[1] FALSE FALSE

stringi::stri_duplicated(str = dup_str)

[1] FALSE  TRUE

# 重複があるか（最初に重複と判定された要素の位置を返す。重複がなければ0）
stringi::stri_duplicated_any(str = dup_input)

[1] 3

# 重複する文字列を削除
stringi::stri_unique(str = dup_input)

[1] "a" "b" NA

stringi::stri_unique(str = dup_str)

[1] "ą"

# 通常の文字列ではbase::uniqueと同じ結果だが、base::uniqueは正準等価な文字列（コードポイントが異なるもの）には対応できない
unique(x = dup_input)

[1] "a" "b" NA

unique(x = dup_str)

[1] "ą" "ą"

文字列の先頭・終端のパターン一致判定(stri_startswith, stri_endswith)

stri_startswith, stri_endswith

stringi::stri_startswith(str = month.name, fixed = "J")

 [1]  TRUE FALSE FALSE FALSE FALSE  TRUE  TRUE FALSE FALSE FALSE FALSE
[12] FALSE

stringi::stri_endswith(str = month.name, fixed = "ber")

 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE  TRUE  TRUE
[12]  TRUE

# 引数fromで開始位置を変えられる
stringi::stri_startswith(str = month.name, fixed = "A", from = 2)

 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE

stringi::stri_startswith(str = month.name, fixed = "a", from = 2)

 [1]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE

stringi::stri_startswith(str = month.name, coll = "A", from = 2)

 [1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE

# stri_opts_collatorの引数strength({1,2,3,4})で照合の強さを設定(1が最も緩い。デフォルトは3)
stringi::stri_startswith(str = month.name, coll = "A", from = 2, strength = 1)

 [1]  TRUE FALSE  TRUE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE

数の取得(stri_count_words, stri_count_boundaries)

stri_count_words, stri_count_boundaries

# 単語数の取得（単語の境界はstringi::stri_count_boundariesで判定）
stringi::stri_count_words(str = stringi::stri_flatten(str = str, collapse = " "))

[1] 5

count_test <- "The\u00a0above-mentioned    features are very useful. Warm thanks to their developers."
stringi::stri_count_words(str = count_test)

[1] 12

# stri_count_boundariesの挙動はよくわからない
stringi::stri_count_boundaries(str = count_test, type = "word")

[1] 28

stringi::stri_count_boundaries(str = count_test, type = "sentence")

[1] 2

stringi::stri_count_boundaries(str = count_test, type = "character")

[1] 81

形式変換関係(stri_list2matrix, stri_flatten)

stri_list2matrix, stri_flatten

input_lst <- list("a", c("b", "c"))

# base::simplify2array
simplify2array(x = input_lst)

[[1]]
[1] "a"

[[2]]
[1] "b" "c"

# stringi::stri_list2matrixでリストを行列へ
stringi::stri_list2matrix(x = input_lst)

     [,1] [,2]
[1,] "a"  "b" 
[2,] NA   "c"

stringi::stri_list2matrix(x = input_lst, fill = "")

     [,1] [,2]
[1,] "a"  "b" 
[2,] ""   "c"

stringi::stri_list2matrix(x = input_lst, fill = "", n_min = 5)

     [,1] [,2]
[1,] "a"  "b" 
[2,] ""   "c" 
[3,] ""   ""  
[4,] ""   ""  
[5,] ""   ""

stringi::stri_list2matrix(x = input_lst, fill = "", n_min = 5, byrow = TRUE)

     [,1] [,2] [,3] [,4] [,5]
[1,] "a"  ""   ""   ""   ""  
[2,] "b"  "c"  ""   ""   ""

# stringi::stri_flattenで文字列ベクトルを一つの文字列に
str

[1] "我輩" "は"   "猫"   "で"   "ある"

stringr::str_length(string = str)

[1] 2 1 1 1 2

stringi::stri_flatten(str = str, collapse = " ")

[1] "我輩 は 猫 で ある"

stringr::str_length(string = stringi::stri_flatten(str = str, collapse = " "))

[1] 11

ダミーテキストの生成(stri_rand_lipsum, stri_rand_strings)

stri_rand_lipsum, stri_rand_strings

# Lorem ipsumに基づくダミーテキストの生成
# https://ja.wikipedia.org/wiki/Lorem_ipsum
# 引数start_lipsumは"Lorem ipsum dolor sit amet"から始めるかどうか
stringi::stri_rand_lipsum(nparagraphs = 2, start_lipsum = TRUE)

[1] "Lorem ipsum dolor sit amet, nec semper netus massa. Imperdiet pellentesque mattis quam eu eu aenean non a in nibh, ut ut, quam! Sed erat dis, mi hac orci condimentum sollicitudin efficitur. Eu vitae dis vestibulum. Est arcu donec sed lectus sem imperdiet dolor malesuada. Dictumst at lobortis tincidunt leo eu vestibulum eu in. Ac, eget proin lorem nulla orci integer. Sapien, senectus accumsan sagittis, felis augue. Integer in sapien potenti, nullam et. Ut sapien odio. Vulputate dolor neque interdum habitant in, sociosqu ut. Non, class ipsum nunc quam nascetur diam."
[2] "In non dignissim integer et ut est sed, ipsum. Convallis, et habitasse sed neque eu pulvinar nec. Non aliquam nec curabitur, sed ut ante tempor, dui. Pellentesque est posuere, ac id sed auctor eu fames vestibulum. Accumsan ultricies maecenas nec ut dui nascetur, ut etiam. Est eu proin torquent. A sed ut ultrices. Tincidunt vehicula suspendisse, ultrices non dui, ex."

stringi::stri_rand_lipsum(nparagraphs = 2, start_lipsum = FALSE)

[1] "Duis dapibus primis fringilla purus massa sed commodo massa fringilla. Tempor, libero morbi penatibus justo purus curabitur nostra. Ut mi sed ultrices nec per a laoreet leo. Nullam tristique velit, nisi hendrerit sociis pellentesque, a eu ipsum dis. Sed ut arcu maximus sed proin, elementum quam diam mauris, iaculis pretium, posuere curae. Ridiculus a nulla tincidunt. In vel ac tincidunt nam tincidunt a hac. Interdum sed habitasse non tortor tempus sed mollis. Varius, magna erat fusce sed. Sed vel tempus augue lobortis morbi molestie purus, ultrices luctus volutpat eros, purus, interdum, leo. Himenaeos interdum platea pharetra."
[2] "Tincidunt dolor volutpat libero nibh himenaeos ex commodo ad nunc magna etiam. Tellus ac feugiat ut massa leo ex amet ut at nunc. Eu leo augue lacus sit ullamcorper vitae. Ac lorem odio ut. Et nisi. Tristique ut purus leo quis amet. Per est blandit diam nibh sed integer nulla elementum, vulputate. Varius a purus vehicula mollis dictum posuere. Vel scelerisque mauris nulla. Fusce eu integer nibh quam eu. Posuere donec ac et, sagittis. Commodo eget velit."

# 長さがlength文字のランダムな文字列をn個生成
stringi::stri_rand_strings(n = 10, length = 5)

 [1] "i1Lyu" "ac5VV" "jgdSs" "TCtwp" "fSqAO" "cOHrF" "wPPcY" "fQYIm"
 [9] "4WvpS" "yXIFc"

# 引数patternで使用する文字列を設定できる
stringi::stri_rand_strings(n = 10, length = 5, pattern = "[a-zあ-ん]")

 [1] "おぺろpe"   "ろめふのろ" "らげeけぉ"  "yhべoつ"    "ねおぜうこ"
 [6] "ゑなをでぞ" "きたぐぜら" "uるっゆf"   "oめrくし"   "ぅへすkぐ"

テキストの並び替え(stri_rand_shuffle, stri_reverse)

stri_rand_shuffle, stri_reverse

# 個々の文字列ベクトルの文字順をランダムに並び替え
stringi::stri_rand_shuffle(str = month.name[1:10])

 [1] "raunJay"   "eabruFry"  "rcaMh"     "liArp"     "rMy"      
 [6] "pJue"      "nJFy"      "rAugut"    "rSeptembe" "ebOcotr"

# 個々の文字列ベクトルの文字順を逆順に並び替え
stringi::stri_reverse(str = month.name)

 [1] "yraunaJ"   "yraurbeF"  "hcraM"     "lirpA"     "yaM"      
 [6] "enuJ"      "yluJ"      "tsuguA"    "rebmetpeS" "rebotcO"  
[11] "rebmevoN"  "rebmeceD"

テキストファイルの入出力([THIS IS AN EXPERIMENTAL FUNCTION])

stri_read_raw, stri_read_lines, stri_write_lines

# [THIS IS AN EXPERIMENTAL FUNCTION]

統計情報(stri_stats_general, stri_stats_latex)

stri_stats_general, stri_stats_latex

lipsum_str <- stringi::stri_rand_lipsum(nparagraphs = 1, start_lipsum = FALSE) %>%
  print

[1] "In semper purus dolor cras, consequat a in. Sed sed mollis, finibus. Vulputate nec facilisis elit quisque a nullam in in nulla eros cursus varius. Integer quisque luctus sed sapien sed vel at. Nunc purus lacus eu eu eleifend maximus mi. At dictumst consectetur sollicitudin nunc tempor vestibulum dolor aliquam non lectus suspendisse. Vestibulum penatibus tempor, nibh metus cum. Blandit vel id eu eleifend tempus. Tortor, commodo, lacinia leo morbi imperdiet maecenas est pharetra justo orci, nec netus. Finibus enim sapien in mauris nec ac vel sem senectus adipiscing. Efficitur per tempus posuere vitae elementum vestibulum netus, luctus. Fermentum maecenas metus natoque eros suspendisse odio nam ac et augue pellentesque. Risus aliquam dictum eu mi quam, a non."

# 統計情報
# \rや\nが含まれていない文字列で、空白文字（Unicode binary property WHITE_SPACE）で単語が区切られている
# Lines: 行数, LinesNEmpty: "WHITE_SPACE"ではない文字を少なくともひとつ含む行の数
# Chars: Unicode符号位置にマッチした総数, CharsNWhite: "WHITE_SPACE"ではないUnicode符号位置の数
stringi::stri_stats_general(str = lipsum_str)

      Lines LinesNEmpty       Chars CharsNWhite 
          1           1         766         649

s <- c("Lorem \\textbf{ipsum} dolor sit \\textit{amet}, consectetur adipisicing elit.",
       "\\begin{small}Proin nibh augue,\\end{small} suscipit a, scelerisque sed, lacinia in, mi.",
       "")

# LaTeXテキストの統計情報
# CharsWord: 文字数（空白を含む記号とコマンドだけを除去）, CharsCmdEnvir: コマンドと単語数（記号を含む）
# CharsWhite: LaTeX white space数({と}を含む)
# Words: 単語数, Cmds: コマンド数, Envirs: 環境数
stringi::stri_stats_latex(str = s)

    CharsWord CharsCmdEnvir    CharsWhite         Words          Cmds 
           96            38            27            18             2 
       Envirs 
            1

エスケープ・アンエスケープ(stri_*escape_unicode)

stri_*escape_unicode

stringi::stri_escape_unicode(str = "㎠")

[1] "\\u33a0"

stringi::stri_unescape_unicode(str = "\\u33a0")

[1] "㎠"

エンコーディング(stri_enc_*)

stri_enc_*

stringi::stri_enc_list() %>%
  head(n = 5)

$`UTF-8`
 [1] "UTF-8"             "ibm-1208"          "ibm-1209"         
 [4] "ibm-5304"          "ibm-5305"          "ibm-13496"        
 [7] "ibm-13497"         "ibm-17592"         "ibm-17593"        
[10] "windows-65001"     "cp1208"            "x-UTF_8J"         
[13] "unicode-1-1-utf-8" "unicode-2-0-utf-8"

$`UTF-16`
[1] "UTF-16"          "ISO-10646-UCS-2" "ibm-1204"        "ibm-1205"       
[5] "unicode"         "csUnicode"       "ucs-2"          

$`UTF-16BE`
 [1] "UTF-16BE"           "x-utf-16be"         "UnicodeBigUnmarked"
 [4] "ibm-1200"           "ibm-1201"           "ibm-13488"         
 [7] "ibm-13489"          "ibm-17584"          "ibm-17585"         
[10] "ibm-21680"          "ibm-21681"          "ibm-25776"         
[13] "ibm-25777"          "ibm-29872"          "ibm-29873"         
[16] "ibm-61955"          "ibm-61956"          "windows-1201"      
[19] "cp1200"             "cp1201"             "UTF16_BigEndian"   

$`UTF-16LE`
 [1] "UTF-16LE"              "x-utf-16le"           
 [3] "UnicodeLittleUnmarked" "ibm-1202"             
 [5] "ibm-1203"              "ibm-13490"            
 [7] "ibm-13491"             "ibm-17586"            
 [9] "ibm-17587"             "ibm-21682"            
[11] "ibm-21683"             "ibm-25778"            
[13] "ibm-25779"             "ibm-29874"            
[15] "ibm-29875"             "UTF16_LittleEndian"   
[17] "windows-1200"         

$`UTF-32`
[1] "UTF-32"          "ISO-10646-UCS-4" "ibm-1236"        "ibm-1237"       
[5] "csUCS4"          "ucs-4"

stringi::stri_enc_info()

$Name.friendly
[1] "UTF-8"

$Name.ICU
[1] "UTF-8"

$Name.UTR22
[1] NA

$Name.IBM
[1] "ibm-1208"

$Name.WINDOWS
[1] "windows-65001"

$Name.JAVA
[1] "UTF-8"

$Name.IANA
[1] "UTF-8"

$Name.MIME
[1] "UTF-8"

$ASCII.subset
[1] TRUE

$Unicode.1to1
[1] NA

$CharSize.8bit
[1] FALSE

$CharSize.min
[1] 1

$CharSize.max
[1] 3

stringi::stri_enc_get()

[1] "UTF-8"

stringi::stri_enc_set(enc = stringi::stri_enc_get())


# 各文字列の宣言しているエンコーディング形式を取得
# ASCII, latin1, bytes, native, UTF-8
stringi::stri_enc_mark(str = month.name)

 [1] "ASCII" "ASCII" "ASCII" "ASCII" "ASCII" "ASCII" "ASCII" "ASCII"
 [9] "ASCII" "ASCII" "ASCII" "ASCII"

# stringi::stri_enc_to*
ls("package:stringi") %>% 
  stringr::str_subset(pattern = "^stri_enc_to")

[1] "stri_enc_toascii"  "stri_enc_tonative" "stri_enc_toutf32" 
[4] "stri_enc_toutf8"

# stringi::stri_enc_is*
ls("package:stringi") %>% 
  stringr::str_subset(pattern = "^stri_enc_is")

[1] "stri_enc_isascii"   "stri_enc_isutf16be" "stri_enc_isutf16le"
[4] "stri_enc_isutf32be" "stri_enc_isutf32le" "stri_enc_isutf8"

stringi::stri_enc_toascii(str = "\x1a")

[1] "\032"

stringi::stri_enc_toascii(str = "\041")

[1] "!"

# 使い方がわからない
# stringi::stri_enc_fromutf32(vec = "")
# (2015.07.18 追記) @kohskeさんよりフォローがありました（ありがとうございます）。
# UTF32コードからutf8文字列生成
stringi::stri_enc_fromutf32(vec = c(0x6771, 0x4EAC, 0x90FD))

[1] "東京都"

文字列変換(stri_trans_*)

stri_trans_*

# 文字列変換で利用可能な識別子
stringi::stri_trans_list()

  [1] "ASCII-Latin"             "Accents-Any"            
  [3] "Amharic-Latin/BGN"       "Any-Accents"            
  [5] "Any-Publishing"          "Arabic-Latin"           
  [7] "Arabic-Latin/BGN"        "Armenian-Latin"         
  [9] "Armenian-Latin/BGN"      "Azerbaijani-Latin/BGN"  
 [11] "Belarusian-Latin/BGN"    "Bengali-Devanagari"     
 [13] "Bengali-Gujarati"        "Bengali-Gurmukhi"       
 [15] "Bengali-Kannada"         "Bengali-Latin"          
 [17] "Bengali-Malayalam"       "Bengali-Oriya"          
 [19] "Bengali-Tamil"           "Bengali-Telugu"         
 [21] "Bopomofo-Latin"          "Bulgarian-Latin/BGN"    
 [23] "Cyrillic-Latin"          "Devanagari-Bengali"     
 [25] "Devanagari-Gujarati"     "Devanagari-Gurmukhi"    
 [27] "Devanagari-Kannada"      "Devanagari-Latin"       
 [29] "Devanagari-Malayalam"    "Devanagari-Oriya"       
 [31] "Devanagari-Tamil"        "Devanagari-Telugu"      
 [33] "Digit-Tone"              "Fullwidth-Halfwidth"    
 [35] "Georgian-Latin"          "Georgian-Latin/BGN"     
 [37] "Greek-Latin"             "Greek-Latin/BGN"        
 [39] "Greek-Latin/UNGEGN"      "Gujarati-Bengali"       
 [41] "Gujarati-Devanagari"     "Gujarati-Gurmukhi"      
 [43] "Gujarati-Kannada"        "Gujarati-Latin"         
 [45] "Gujarati-Malayalam"      "Gujarati-Oriya"         
 [47] "Gujarati-Tamil"          "Gujarati-Telugu"        
 [49] "Gurmukhi-Bengali"        "Gurmukhi-Devanagari"    
 [51] "Gurmukhi-Gujarati"       "Gurmukhi-Kannada"       
 [53] "Gurmukhi-Latin"          "Gurmukhi-Malayalam"     
 [55] "Gurmukhi-Oriya"          "Gurmukhi-Tamil"         
 [57] "Gurmukhi-Telugu"         "Halfwidth-Fullwidth"    
 [59] "Han-Latin"               "Han-Latin/Names"        
 [61] "Hangul-Latin"            "Hans-Hant"              
 [63] "Hant-Hans"               "Hebrew-Latin"           
 [65] "Hebrew-Latin/BGN"        "Hiragana-Katakana"      
 [67] "Hiragana-Latin"          "IPA-XSampa"             
 [69] "Jamo-Latin"              "Kannada-Bengali"        
 [71] "Kannada-Devanagari"      "Kannada-Gujarati"       
 [73] "Kannada-Gurmukhi"        "Kannada-Latin"          
 [75] "Kannada-Malayalam"       "Kannada-Oriya"          
 [77] "Kannada-Tamil"           "Kannada-Telugu"         
 [79] "Katakana-Hiragana"       "Katakana-Latin"         
 [81] "Katakana-Latin/BGN"      "Kazakh-Latin/BGN"       
 [83] "Kirghiz-Latin/BGN"       "Korean-Latin/BGN"       
 [85] "Latin-ASCII"             "Latin-Arabic"           
 [87] "Latin-Armenian"          "Latin-Bengali"          
 [89] "Latin-Bopomofo"          "Latin-Cyrillic"         
 [91] "Latin-Devanagari"        "Latin-Georgian"         
 [93] "Latin-Greek"             "Latin-Greek/UNGEGN"     
 [95] "Latin-Gujarati"          "Latin-Gurmukhi"         
 [97] "Latin-Hangul"            "Latin-Hebrew"           
 [99] "Latin-Hiragana"          "Latin-Jamo"             
[101] "Latin-Kannada"           "Latin-Katakana"         
[103] "Latin-Malayalam"         "Latin-NumericPinyin"    
[105] "Latin-Oriya"             "Latin-Syriac"           
[107] "Latin-Tamil"             "Latin-Telugu"           
[109] "Latin-Thaana"            "Latin-Thai"             
[111] "Macedonian-Latin/BGN"    "Malayalam-Bengali"      
[113] "Malayalam-Devanagari"    "Malayalam-Gujarati"     
[115] "Malayalam-Gurmukhi"      "Malayalam-Kannada"      
[117] "Malayalam-Latin"         "Malayalam-Oriya"        
[119] "Malayalam-Tamil"         "Malayalam-Telugu"       
[121] "Maldivian-Latin/BGN"     "Mongolian-Latin/BGN"    
[123] "NumericPinyin-Latin"     "NumericPinyin-Pinyin"   
[125] "Oriya-Bengali"           "Oriya-Devanagari"       
[127] "Oriya-Gujarati"          "Oriya-Gurmukhi"         
[129] "Oriya-Kannada"           "Oriya-Latin"            
[131] "Oriya-Malayalam"         "Oriya-Tamil"            
[133] "Oriya-Telugu"            "Pashto-Latin/BGN"       
[135] "Persian-Latin/BGN"       "Pinyin-NumericPinyin"   
[137] "Publishing-Any"          "Russian-Latin/BGN"      
[139] "Serbian-Latin/BGN"       "Simplified-Traditional" 
[141] "Syriac-Latin"            "Tamil-Bengali"          
[143] "Tamil-Devanagari"        "Tamil-Gujarati"         
[145] "Tamil-Gurmukhi"          "Tamil-Kannada"          
[147] "Tamil-Latin"             "Tamil-Malayalam"        
[149] "Tamil-Oriya"             "Tamil-Telugu"           
[151] "Telugu-Bengali"          "Telugu-Devanagari"      
[153] "Telugu-Gujarati"         "Telugu-Gurmukhi"        
[155] "Telugu-Kannada"          "Telugu-Latin"           
[157] "Telugu-Malayalam"        "Telugu-Oriya"           
[159] "Telugu-Tamil"            "Thaana-Latin"           
[161] "Thai-Latin"              "Tone-Digit"             
[163] "Traditional-Simplified"  "Turkmen-Latin/BGN"      
[165] "Ukrainian-Latin/BGN"     "Uzbek-Latin/BGN"        
[167] "XSampa-IPA"              "az-Lower"               
[169] "az-Title"                "az-Upper"               
[171] "cs-cs_FONIPA"            "cs-ja"                  
[173] "cs-ko"                   "cs_FONIPA-ja"           
[175] "cs_FONIPA-ko"            "el-Lower"               
[177] "el-Title"                "el-Upper"               
[179] "es-am"                   "es-es_FONIPA"           
[181] "es-ja"                   "es-zh"                  
[183] "es_419-ja"               "es_419-zh"              
[185] "es_FONIPA-am"            "es_FONIPA-es_419_FONIPA"
[187] "es_FONIPA-ja"            "es_FONIPA-zh"           
[189] "it-am"                   "it-ja"                  
[191] "ja_Latn-ko"              "ja_Latn-ru"             
[193] "lt-Lower"                "lt-Title"               
[195] "lt-Upper"                "nl-Title"               
[197] "pl-ja"                   "pl-pl_FONIPA"           
[199] "pl_FONIPA-ja"            "ro-ja"                  
[201] "ro-ro_FONIPA"            "ro_FONIPA-ja"           
[203] "ru-ja"                   "ru-zh"                  
[205] "sk-ja"                   "sk-sk_FONIPA"           
[207] "sk_FONIPA-ja"            "tr-Lower"               
[209] "tr-Title"                "tr-Upper"               
[211] "uz_Cyrl-uz_Latn"         "uz_Latn-uz_Cyrl"        
[213] "zh_Latn_PINYIN-ru"       "Any-Null"               
[215] "Any-Lower"               "Any-Upper"              
[217] "Any-Title"               "Any-Name"               
[219] "Name-Any"                "Any-Remove"             
[221] "Any-Hex/Unicode"         "Any-Hex/Java"           
[223] "Any-Hex/C"               "Any-Hex/XML"            
[225] "Any-Hex/XML10"           "Any-Hex/Perl"           
[227] "Any-Hex"                 "Hex-Any/Unicode"        
[229] "Hex-Any/Java"            "Hex-Any/C"              
[231] "Hex-Any/XML"             "Hex-Any/XML10"          
[233] "Hex-Any/Perl"            "Hex-Any"                
[235] "Any-NFC"                 "Any-NFKC"               
[237] "Any-NFD"                 "Any-NFKD"               
[239] "Any-FCD"                 "Any-FCC"                
[241] "Any-Latin"               "Any-Telugu"             
[243] "Any-Gurmukhi"            "Any-Gujarati"           
[245] "Any-Malayalam"           "Any-Oriya"              
[247] "Any-Devanagari"          "Any-Kannada"            
[249] "Any-Tamil"               "Any-cs_FONIPA"          
[251] "Any-ru"                  "Any-Bengali"            
[253] "Any-uz_Latn"             "Any-Katakana"           
[255] "Any-ro_FONIPA"           "Any-zh"                 
[257] "Any-am"                  "Any-es_419_FONIPA"      
[259] "Any-es_FONIPA"           "Any-sk_FONIPA"          
[261] "Any-Hant"                "Any-Hans"               
[263] "Any-Hiragana"            "Any-Syriac"             
[265] "Any-Greek"               "Any-Greek/UNGEGN"       
[267] "Any-Cyrillic"            "Any-Hangul"             
[269] "Any-Bopomofo"            "Any-Arabic"             
[271] "Any-Thai"                "Any-Armenian"           
[273] "Any-Thaana"              "Any-Georgian"           
[275] "Any-Hebrew"              "Any-uz_Cyrl"            
[277] "Any-pl_FONIPA"

# 汎用の文字列変換関数
# http://userguide.icu-project.org/transforms/general
tsurami_str <- "ツラミ"
stringi::stri_trans_general(str = "stringi", id = "latin-cyrillic")

[1] "стринги"

stringi::stri_trans_general(str = tsurami_str, id = "Katakana-Latin")

[1] "tsurami"

stringi::stri_trans_general(str = tsurami_str, id = "Katakana-Hiragana")

[1] "つらみ"

# On Windows the "cs-ja" result is displayed as 「ストリン<U+0261>イ」.
# Pick every transliterator ID that ends in "ja" and apply each one to the
# literal string "stringi", producing one (id, trans_wd) row per ID.
stringi::stri_trans_list() %>% 
  stringr::str_subset(pattern = "ja$") %>% 
  # Wrap the matching IDs in a one-column data frame so they can be
  # processed row by row.
  data.frame(trans_id = ., stringsAsFactors = FALSE) %>%
  # rowwise() + do() evaluates the expression once per row; inside do(),
  # `.` refers to the current row.
  dplyr::rowwise() %>%
  dplyr::do(
    dplyr::data_frame(
      id = .$trans_id,
      trans_wd = stringi::stri_trans_general(str = "stringi", id = .$trans_id)
    )
  )

Source: local data frame [13 x 2]
Groups: <by row>

             id     trans_wd
1         cs-ja  ストリンɡイ
2  cs_FONIPA-ja ストリングイ
3         es-ja   ストリンヒ
4     es_419-ja   ストリンヒ
5  es_FONIPA-ja ストリングイ
6         it-ja   ストリンジ
7         pl-ja   ストリンギ
8  pl_FONIPA-ja   ストリンギ
9         ro-ja   ストリンジ
10 ro_FONIPA-ja   ストリンギ
11        ru-ja      stringi
12        sk-ja   ストリンギ
13 sk_FONIPA-ja   ストリンギ

# Unicode正規化形式
# NFC (Canonical Decomposition, followed by Canonical Composition)
# NFD (Canonical Decomposition),
# NFKC (Compatibility Decomposition, followed by Canonical Composition),
# NFKC_Casefold (combination of NFKC, case folding, and removing ignorable characters which was introduced with Unicode 5.2)
# NFKD (Compatibility Decomposition)

# Unicode正規化に関しては下記を参照のこと
# http://www.unicode.org/reports/tr15/
# https://ja.wikipedia.org/wiki/Unicode正規化


# stringi::stri_trans_n*
# 文字列をUnicode正規化形式へ変換
ls("package:stringi") %>% 
  stringr::str_subset(pattern = "^stri_trans_n")

[1] "stri_trans_nfc"           "stri_trans_nfd"          
[3] "stri_trans_nfkc"          "stri_trans_nfkc_casefold"
[5] "stri_trans_nfkd"

# Run one string through every Unicode normalization form offered by stringi.
#
# Args:
#   uni:  Character string to normalize or to test.
#   type: "trans" (default) converts `uni` with the stri_trans_nf* functions.
#         Any other value (the examples in this note use "is") tests `uni`
#         with the matching stri_trans_isnf* functions instead.
#
# Returns:
#   For a single input string, a length-5 vector named
#   NFC, NFD, NFKC, NFKC_CASEFOLD, NFKD (in that order): the converted
#   strings when type == "trans", otherwise the results of the is* checks.
transNormalizationForm <- function (uni, type = "trans") {
  if (type == "trans") {
    transd_str <- c(
      stringi::stri_trans_nfc(str = uni),
      stringi::stri_trans_nfd(str = uni),
      stringi::stri_trans_nfkc(str = uni),
      stringi::stri_trans_nfkc_casefold(str = uni),
      stringi::stri_trans_nfkd(str = uni)
    )
  } else {
    transd_str <- c(
      stringi::stri_trans_isnfc(str = uni),
      stringi::stri_trans_isnfd(str = uni),
      stringi::stri_trans_isnfkc(str = uni),
      stringi::stri_trans_isnfkc_casefold(str = uni),
      stringi::stri_trans_isnfkd(str = uni)
    )
  }
  # Labels follow the call order above. The last one was previously
  # misspelled "NKFD"; the form produced by stri_trans_nfkd is NFKD.
  names(transd_str) <- c("NFC", "NFD", "NFKC", "NFKC_CASEFOLD", "NFKD")
  transd_str
}

# 例に挙げた文字列は下記を参考に
# http://nomenclator.la.coocan.jp/unicode/normalization.htm

# 「ダイエレシス付き大文字ユプシロン」はNFCとNFKC以外が異なる
transNormalizationForm(uni = "\u03AB", type = "trans")

          NFC           NFD          NFKC NFKC_CASEFOLD          NKFD 
         "Ϋ"           "Ϋ"          "Ϋ"          "ϋ"           "Ϋ"

# 「㍿」はNFCとNFDが同じ。NFKCとNFKDが同じ
transNormalizationForm(uni = "\u337f", type = "trans")

          NFC           NFD          NFKC NFKC_CASEFOLD          NKFD 
         "㍿"          "㍿"    "株式会社"    "株式会社"    "株式会社"

# 「㌦」はNFCとNFDが同じ
transNormalizationForm(uni = "\u3326", type = "trans")

          NFC           NFD          NFKC NFKC_CASEFOLD          NKFD 
         "㌦"          "㌦"        "ドル"        "ドル"         "ドル"

# NFKCとNFKDは同じようで違う
stringi::stri_trans_nfkc(str = "\u3326") == stringi::stri_trans_nfkd(str = "\u3326")

[1] FALSE

# 半角全角変換（NFCとNFDは半角のまま）
transNormalizationForm(uni = "ﾂﾗﾐ", type = "trans")

          NFC           NFD          NFKC NFKC_CASEFOLD          NKFD 
        "ﾂﾗﾐ"         "ﾂﾗﾐ"      "ツラミ"      "ツラミ"      "ツラミ"

# 半角＋半角濁点はNFKDだと「カタカナ＋濁点」で、「濁点付きカタカナ」ではない
gati_tsurami_str <- "ヅラミ"
han_gati_tsurami_str <- "ﾂﾞﾗﾐ"
transNormalizationForm(uni = han_gati_tsurami_str, type = "trans")

          NFC           NFD          NFKC NFKC_CASEFOLD          NKFD 
       "ﾂﾞﾗﾐ"        "ﾂﾞﾗﾐ"      "ヅラミ"      "ヅラミ"       "ヅラミ"

transNormalizationForm(uni = han_gati_tsurami_str, type = "trans") == gati_tsurami_str

          NFC           NFD          NFKC NFKC_CASEFOLD          NKFD 
        FALSE         FALSE          TRUE          TRUE         FALSE

# stringi::stri_trans_is*
# 文字列がUnicode正規化されているかどうかをチェック
ls("package:stringi") %>% 
  stringr::str_subset(pattern = "^stri_trans_is")

[1] "stri_trans_isnfc"           "stri_trans_isnfd"          
[3] "stri_trans_isnfkc"          "stri_trans_isnfkc_casefold"
[5] "stri_trans_isnfkd"

# trans-is（列が変換時の関数で、行がチェック時の関数）
# 行列の対角は常にTRUE
sapply(
  X = transNormalizationForm(uni = "\u337f", type = "trans"),
  FUN = transNormalizationForm, type = "is"
)

                NFC   NFD NFKC NFKC_CASEFOLD NKFD
NFC            TRUE  TRUE TRUE          TRUE TRUE
NFD            TRUE  TRUE TRUE          TRUE TRUE
NFKC          FALSE FALSE TRUE          TRUE TRUE
NFKC_CASEFOLD FALSE FALSE TRUE          TRUE TRUE
NKFD          FALSE FALSE TRUE          TRUE TRUE

sapply(
  X = transNormalizationForm(uni = "\u3326", type = "trans"),
  FUN = transNormalizationForm, type = "is"
)

                NFC   NFD  NFKC NFKC_CASEFOLD  NKFD
NFC            TRUE  TRUE  TRUE          TRUE FALSE
NFD            TRUE  TRUE FALSE         FALSE  TRUE
NFKC          FALSE FALSE  TRUE          TRUE FALSE
NFKC_CASEFOLD FALSE FALSE  TRUE          TRUE FALSE
NKFD          FALSE FALSE FALSE         FALSE  TRUE

sapply(
  X = transNormalizationForm(uni = "\u03AB", type = "trans"),
  FUN = transNormalizationForm, type = "is"
)

                NFC   NFD  NFKC NFKC_CASEFOLD  NKFD
NFC            TRUE FALSE  TRUE          TRUE FALSE
NFD           FALSE  TRUE FALSE         FALSE  TRUE
NFKC           TRUE FALSE  TRUE          TRUE FALSE
NFKC_CASEFOLD FALSE FALSE FALSE          TRUE FALSE
NKFD          FALSE  TRUE FALSE         FALSE  TRUE

sapply(
  X = transNormalizationForm(uni = "\u03AB", type = "trans"),
  FUN = transNormalizationForm, type = "is"
)

                NFC   NFD  NFKC NFKC_CASEFOLD  NKFD
NFC            TRUE FALSE  TRUE          TRUE FALSE
NFD           FALSE  TRUE FALSE         FALSE  TRUE
NFKC           TRUE FALSE  TRUE          TRUE FALSE
NFKC_CASEFOLD FALSE FALSE FALSE          TRUE FALSE
NKFD          FALSE  TRUE FALSE         FALSE  TRUE

まとめ

{stringr}と{stringi}の関数をざっと触ったが、通常使う分には{stringr}でよいと思われる。
ただし、文字列変換（半角カナから全角カナ）や文字列のエスケープ、形式変換関係などの{stringi}の関数はとても有用なので使っていきたい。
Unicode正規化やICUなど、非常に勉強になったし、これからテキスト処理する際にも役立ちそう。
stringi::stri_enc_fromutf32とstringr::boundary(type = "character")がよくわからなかったので調べる。
(2015.07.18 追記) Windowsに関する挙動は @yutannihilationさんがフォローしてくださいました（感謝です）。
stringiとWindowsと文字コードとかそのへんのメモ

実行環境

library(devtools)
devtools::session_info()

Session info --------------------------------------------------------------

 setting  value                       
 version  R version 3.2.0 (2015-04-16)
 system   x86_64, darwin13.4.0        
 ui       X11                         
 language (EN)                        
 collate  ja_JP.UTF-8                 
 tz       Asia/Tokyo

Packages ------------------------------------------------------------------

 package    * version     date       source                            
 assertthat * 0.1         2013-12-06 CRAN (R 3.2.0)                    
 curl       * 0.5         2015-02-01 CRAN (R 3.2.0)                    
 DBI        * 0.3.1       2014-09-24 CRAN (R 3.2.0)                    
 devtools     1.7.0       2015-01-17 CRAN (R 3.2.0)                    
 digest     * 0.6.8       2014-12-31 CRAN (R 3.2.0)                    
 dplyr        0.4.2.9000  2015-06-17 Github (hadley/dplyr@7763150)     
 evaluate   * 0.7         2015-04-21 CRAN (R 3.2.0)                    
 formatR    * 1.2         2015-04-21 CRAN (R 3.2.0)                    
 htmltools  * 0.2.6       2014-09-08 CRAN (R 3.2.0)                    
 knitr        1.10        2015-04-23 CRAN (R 3.2.0)                    
 lazyeval   * 0.1.10.9000 2015-06-07 Github (hadley/lazyeval@ecb8dc0)  
 magrittr   * 1.5         2014-11-22 CRAN (R 3.2.0)                    
 R6         * 2.0.1       2014-10-29 CRAN (R 3.2.0)                    
 Rcpp       * 0.11.6      2015-05-01 CRAN (R 3.2.0)                    
 readr        0.1.0.9000  2015-06-08 Github (hadley/readr@9006822)     
 rmarkdown  * 0.6.2.4     2015-06-07 Github (rstudio/rmarkdown@8c9e25b)
 rstudioapi * 0.3.1       2015-04-07 CRAN (R 3.2.0)                    
 stringi      0.4-1       2014-12-14 CRAN (R 3.2.0)                    
 stringr      1.0.0       2015-04-30 CRAN (R 3.2.0)                    
 tidyr        0.2.0.9000  2015-06-07 Github (hadley/tidyr@0dc87b2)     
 yaml       * 2.1.13      2014-06-12 CRAN (R 3.2.0)

{stringr}/{stringi}とbaseの文字列処理について

@yamano357

2015-07-15

概要