前書き
- Rで文字列処理をするライブラリである{stringr}と{stringi}について、baseの関数と付き合わせたコードの自分用のメモ
参照サイト
stringr-vignettes
stringi
hadley/stringr
RPubs - このパッケージがすごい2014: stringr
stringi package arekore stringiで輝く☆テキストショリスト
stringr 1.0.0を使ってみる
SET_LOAD_LIB <- c("knitr", "readr", "dplyr", "tidyr", "readr", "stringr", "stringi")
sapply(X = SET_LOAD_LIB, FUN = library, character.only = TRUE, logical.return = TRUE)
## knitr readr dplyr tidyr readr stringr stringi
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE
knitr::opts_chunk$set(comment = NA)
ls("package:stringr") %>%
stringr::str_subset(pattern = "^[a-zA-Z]")
[1] "boundary" "coll" "fixed"
[4] "ignore.case" "invert_match" "perl"
[7] "regex" "str_c" "str_conv"
[10] "str_count" "str_detect" "str_dup"
[13] "str_extract" "str_extract_all" "str_join"
[16] "str_length" "str_locate" "str_locate_all"
[19] "str_match" "str_match_all" "str_order"
[22] "str_pad" "str_replace" "str_replace_all"
[25] "str_replace_na" "str_sort" "str_split"
[28] "str_split_fixed" "str_sub" "str_sub<-"
[31] "str_subset" "str_to_lower" "str_to_title"
[34] "str_to_upper" "str_trim" "str_wrap"
[37] "word"
ls("package:stringi") %>%
stringr::str_subset(pattern = "^[a-zA-Z]")
[1] "stri_c" "stri_cmp"
[3] "stri_cmp_eq" "stri_cmp_equiv"
[5] "stri_cmp_ge" "stri_cmp_gt"
[7] "stri_cmp_le" "stri_cmp_lt"
[9] "stri_cmp_neq" "stri_cmp_nequiv"
[11] "stri_compare" "stri_conv"
[13] "stri_count" "stri_count_boundaries"
[15] "stri_count_charclass" "stri_count_coll"
[17] "stri_count_fixed" "stri_count_regex"
[19] "stri_count_words" "stri_detect"
[21] "stri_detect_charclass" "stri_detect_coll"
[23] "stri_detect_fixed" "stri_detect_regex"
[25] "stri_dup" "stri_duplicated"
[27] "stri_duplicated_any" "stri_enc_detect"
[29] "stri_enc_detect2" "stri_enc_fromutf32"
[31] "stri_enc_get" "stri_enc_info"
[33] "stri_enc_isascii" "stri_enc_isutf16be"
[35] "stri_enc_isutf16le" "stri_enc_isutf32be"
[37] "stri_enc_isutf32le" "stri_enc_isutf8"
[39] "stri_enc_list" "stri_enc_mark"
[41] "stri_enc_set" "stri_enc_toascii"
[43] "stri_enc_tonative" "stri_enc_toutf32"
[45] "stri_enc_toutf8" "stri_encode"
[47] "stri_endswith" "stri_endswith_charclass"
[49] "stri_endswith_coll" "stri_endswith_fixed"
[51] "stri_escape_unicode" "stri_extract"
[53] "stri_extract_all" "stri_extract_all_charclass"
[55] "stri_extract_all_coll" "stri_extract_all_fixed"
[57] "stri_extract_all_regex" "stri_extract_all_words"
[59] "stri_extract_first" "stri_extract_first_charclass"
[61] "stri_extract_first_coll" "stri_extract_first_fixed"
[63] "stri_extract_first_regex" "stri_extract_first_words"
[65] "stri_extract_last" "stri_extract_last_charclass"
[67] "stri_extract_last_coll" "stri_extract_last_fixed"
[69] "stri_extract_last_regex" "stri_extract_last_words"
[71] "stri_flatten" "stri_info"
[73] "stri_install_check" "stri_install_icudt"
[75] "stri_isempty" "stri_join"
[77] "stri_length" "stri_list2matrix"
[79] "stri_locale_get" "stri_locale_info"
[81] "stri_locale_list" "stri_locale_set"
[83] "stri_locate" "stri_locate_all"
[85] "stri_locate_all_boundaries" "stri_locate_all_charclass"
[87] "stri_locate_all_coll" "stri_locate_all_fixed"
[89] "stri_locate_all_regex" "stri_locate_all_words"
[91] "stri_locate_first" "stri_locate_first_boundaries"
[93] "stri_locate_first_charclass" "stri_locate_first_coll"
[95] "stri_locate_first_fixed" "stri_locate_first_regex"
[97] "stri_locate_first_words" "stri_locate_last"
[99] "stri_locate_last_boundaries" "stri_locate_last_charclass"
[101] "stri_locate_last_coll" "stri_locate_last_fixed"
[103] "stri_locate_last_regex" "stri_locate_last_words"
[105] "stri_match" "stri_match_all"
[107] "stri_match_all_regex" "stri_match_first"
[109] "stri_match_first_regex" "stri_match_last"
[111] "stri_match_last_regex" "stri_numbytes"
[113] "stri_opts_brkiter" "stri_opts_collator"
[115] "stri_opts_fixed" "stri_opts_regex"
[117] "stri_order" "stri_pad"
[119] "stri_pad_both" "stri_pad_left"
[121] "stri_pad_right" "stri_paste"
[123] "stri_rand_lipsum" "stri_rand_shuffle"
[125] "stri_rand_strings" "stri_read_lines"
[127] "stri_read_raw" "stri_replace"
[129] "stri_replace_all" "stri_replace_all_charclass"
[131] "stri_replace_all_coll" "stri_replace_all_fixed"
[133] "stri_replace_all_regex" "stri_replace_first"
[135] "stri_replace_first_charclass" "stri_replace_first_coll"
[137] "stri_replace_first_fixed" "stri_replace_first_regex"
[139] "stri_replace_last" "stri_replace_last_charclass"
[141] "stri_replace_last_coll" "stri_replace_last_fixed"
[143] "stri_replace_last_regex" "stri_replace_na"
[145] "stri_reverse" "stri_sort"
[147] "stri_split" "stri_split_boundaries"
[149] "stri_split_charclass" "stri_split_coll"
[151] "stri_split_fixed" "stri_split_lines"
[153] "stri_split_lines1" "stri_split_regex"
[155] "stri_startswith" "stri_startswith_charclass"
[157] "stri_startswith_coll" "stri_startswith_fixed"
[159] "stri_stats_general" "stri_stats_latex"
[161] "stri_sub" "stri_sub<-"
[163] "stri_subset" "stri_subset_charclass"
[165] "stri_subset_coll" "stri_subset_fixed"
[167] "stri_subset_regex" "stri_trans_general"
[169] "stri_trans_isnfc" "stri_trans_isnfd"
[171] "stri_trans_isnfkc" "stri_trans_isnfkc_casefold"
[173] "stri_trans_isnfkd" "stri_trans_list"
[175] "stri_trans_nfc" "stri_trans_nfd"
[177] "stri_trans_nfkc" "stri_trans_nfkc_casefold"
[179] "stri_trans_nfkd" "stri_trans_tolower"
[181] "stri_trans_totitle" "stri_trans_toupper"
[183] "stri_trim" "stri_trim_both"
[185] "stri_trim_left" "stri_trim_right"
[187] "stri_unescape_unicode" "stri_unique"
[189] "stri_wrap" "stri_write_lines"
文字列連結
base: paste/paste0
stringr: stringr::str_c
, stringr::str_join(deprecated)
stringi: stringi::stri_join
, stringi::stri_c(aliases)
, stringi::stri_paste(aliases)
文字列分割
base: strsplit
stringr: stringr::str_split
, stringr::str_split_fixed
month.abb
[1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"
n_day <- seq(from = 1, to = length(month.abb))
# stringr::str_cの引数を変えて比較
stringr::str_c(month.abb, sep = ".")
[1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"
stringr::str_c(month.abb, collapse = "")
[1] "JanFebMarAprMayJunJulAugSepOctNovDec"
stringr::str_c(month.abb, sep = ".", collapse = "")
[1] "JanFebMarAprMayJunJulAugSepOctNovDec"
# stringr::str_cに複数のベクトルを与え、引数を変えて比較
stringr::str_c(month.abb, n_day, sep = ".")
[1] "Jan.1" "Feb.2" "Mar.3" "Apr.4" "May.5" "Jun.6" "Jul.7"
[8] "Aug.8" "Sep.9" "Oct.10" "Nov.11" "Dec.12"
stringr::str_c(month.abb, n_day, collapse = "")
[1] "Jan1Feb2Mar3Apr4May5Jun6Jul7Aug8Sep9Oct10Nov11Dec12"
stringr::str_c(month.abb, n_day, sep = ".", collapse = "")
[1] "Jan.1Feb.2Mar.3Apr.4May.5Jun.6Jul.7Aug.8Sep.9Oct.10Nov.11Dec.12"
# stringr::str_cに複数のベクトルをひとつにまとめて与え、引数を変えて比較
stringr::str_c(c(month.abb, n_day), sep = ".")
[1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec" "1" "2" "3" "4" "5" "6" "7" "8" "9" "10"
[23] "11" "12"
stringr::str_c(c(month.abb, n_day), collapse = "")
[1] "JanFebMarAprMayJunJulAugSepOctNovDec123456789101112"
stringr::str_c(c(month.abb, n_day), sep = ".", collapse = "")
[1] "JanFebMarAprMayJunJulAugSepOctNovDec123456789101112"
# stringr::str_cはstringi::stri_cを呼んでいる
stringr::str_c
function (..., sep = "", collapse = NULL)
{
stri_c(..., sep = sep, collapse = collapse, ignore_null = TRUE)
}
<environment: namespace:stringr>
# stringr::str_cとpaste/paste0を比較
# str_cのsep= ""がデフォルト
stringr::str_c(month.abb, n_day)
[1] "Jan1" "Feb2" "Mar3" "Apr4" "May5" "Jun6" "Jul7" "Aug8"
[9] "Sep9" "Oct10" "Nov11" "Dec12"
# base::pasteはsep= " "がデフォルト
paste(month.abb, n_day)
[1] "Jan 1" "Feb 2" "Mar 3" "Apr 4" "May 5" "Jun 6" "Jul 7"
[8] "Aug 8" "Sep 9" "Oct 10" "Nov 11" "Dec 12"
# stringr::str_cはNAをNAとして扱う
is.na(stringr::str_c(NA))
[1] TRUE
# paste0はNAを"NA"として扱う
is.na(paste0(NA))
[1] FALSE
# stringr::str_splitはパターン毎(先頭一致)に区切った文字列ベクトルからなるリストを返す
# 引数nを指定しないと、デフォルトの引数n = Infを使って全て区切る
month.name %>%
stringr::str_split(pattern = "e")
[[1]]
[1] "January"
[[2]]
[1] "F" "bruary"
[[3]]
[1] "March"
[[4]]
[1] "April"
[[5]]
[1] "May"
[[6]]
[1] "Jun" ""
[[7]]
[1] "July"
[[8]]
[1] "August"
[[9]]
[1] "S" "pt" "mb" "r"
[[10]]
[1] "Octob" "r"
[[11]]
[1] "Nov" "mb" "r"
[[12]]
[1] "D" "c" "mb" "r"
# 「n = 1」だと区切らない
month.name %>%
stringr::str_split(pattern = "e", n = 1)
[[1]]
[1] "January"
[[2]]
[1] "February"
[[3]]
[1] "March"
[[4]]
[1] "April"
[[5]]
[1] "May"
[[6]]
[1] "June"
[[7]]
[1] "July"
[[8]]
[1] "August"
[[9]]
[1] "September"
[[10]]
[1] "October"
[[11]]
[1] "November"
[[12]]
[1] "December"
# 「n = 2」
month.name %>%
stringr::str_split(pattern = "e", n = 2)
[[1]]
[1] "January"
[[2]]
[1] "F" "bruary"
[[3]]
[1] "March"
[[4]]
[1] "April"
[[5]]
[1] "May"
[[6]]
[1] "Jun" ""
[[7]]
[1] "July"
[[8]]
[1] "August"
[[9]]
[1] "S" "ptember"
[[10]]
[1] "Octob" "r"
[[11]]
[1] "Nov" "mber"
[[12]]
[1] "D" "cember"
# stringi::stri_split_fixedと挙動は一緒
month.name %>%
stringi::stri_split_fixed(pattern = "e", n = 2)
[[1]]
[1] "January"
[[2]]
[1] "F" "bruary"
[[3]]
[1] "March"
[[4]]
[1] "April"
[[5]]
[1] "May"
[[6]]
[1] "Jun" ""
[[7]]
[1] "July"
[[8]]
[1] "August"
[[9]]
[1] "S" "ptember"
[[10]]
[1] "Octob" "r"
[[11]]
[1] "Nov" "mber"
[[12]]
[1] "D" "cember"
# stringr::str_split_fixedはパターン毎(先頭一致)に引数nの個数で区切った次元の行列を返す
# 「n = 1」だと区切らない
month.name %>%
stringr::str_split_fixed(pattern = "e", n = 1)
[,1]
[1,] "January"
[2,] "February"
[3,] "March"
[4,] "April"
[5,] "May"
[6,] "June"
[7,] "July"
[8,] "August"
[9,] "September"
[10,] "October"
[11,] "November"
[12,] "December"
# 「n = 2」だと2個に区切る
month.name %>%
stringr::str_split_fixed(pattern = "e", n = 2)
[,1] [,2]
[1,] "January" ""
[2,] "F" "bruary"
[3,] "March" ""
[4,] "April" ""
[5,] "May" ""
[6,] "Jun" ""
[7,] "July" ""
[8,] "August" ""
[9,] "S" "ptember"
[10,] "Octob" "r"
[11,] "Nov" "mber"
[12,] "D" "cember"
month.name %>%
stringr::str_split_fixed(pattern = "e", n = 3)
[,1] [,2] [,3]
[1,] "January" "" ""
[2,] "F" "bruary" ""
[3,] "March" "" ""
[4,] "April" "" ""
[5,] "May" "" ""
[6,] "Jun" "" ""
[7,] "July" "" ""
[8,] "August" "" ""
[9,] "S" "pt" "mber"
[10,] "Octob" "r" ""
[11,] "Nov" "mb" "r"
[12,] "D" "c" "mber"
month.name %>%
stringr::str_split_fixed(pattern = "e", n = 4)
[,1] [,2] [,3] [,4]
[1,] "January" "" "" ""
[2,] "F" "bruary" "" ""
[3,] "March" "" "" ""
[4,] "April" "" "" ""
[5,] "May" "" "" ""
[6,] "Jun" "" "" ""
[7,] "July" "" "" ""
[8,] "August" "" "" ""
[9,] "S" "pt" "mb" "r"
[10,] "Octob" "r" "" ""
[11,] "Nov" "mb" "r" ""
[12,] "D" "c" "mb" "r"
# 各入力に含まれる"e"は最大3個なので、5列目は全て空文字列
month.name %>%
stringr::str_split_fixed(pattern = "e", n = 5)
[,1] [,2] [,3] [,4] [,5]
[1,] "January" "" "" "" ""
[2,] "F" "bruary" "" "" ""
[3,] "March" "" "" "" ""
[4,] "April" "" "" "" ""
[5,] "May" "" "" "" ""
[6,] "Jun" "" "" "" ""
[7,] "July" "" "" "" ""
[8,] "August" "" "" "" ""
[9,] "S" "pt" "mb" "r" ""
[10,] "Octob" "r" "" "" ""
[11,] "Nov" "mb" "r" "" ""
[12,] "D" "c" "mb" "r" ""
# base::strsplitはstringr::str_splitと挙動がほぼ同じだが、引数splitを末尾に含んでいた場合は異なる
strsplit(x = month.name, split = "e") %>%
sapply(X = ., FUN = length)
[1] 1 2 1 1 1 1 1 1 4 2 3 4
month.name %>%
stringr::str_split(pattern = "e") %>%
sapply(X = ., FUN = length)
[1] 1 2 1 1 1 2 1 1 4 2 3 4
# base::strsplitは6月(June)が1個のベクトル(stringr::str_splitのときは2個目が空文字列)
strsplit(x = month.name, split = "e")
[[1]]
[1] "January"
[[2]]
[1] "F" "bruary"
[[3]]
[1] "March"
[[4]]
[1] "April"
[[5]]
[1] "May"
[[6]]
[1] "Jun"
[[7]]
[1] "July"
[[8]]
[1] "August"
[[9]]
[1] "S" "pt" "mb" "r"
[[10]]
[1] "Octob" "r"
[[11]]
[1] "Nov" "mb" "r"
[[12]]
[1] "D" "c" "mb" "r"
# 戻り値がデータフレームではないので、一度data.frameに変換してから{dplyr}で処理
month.name %>%
stringr::str_split_fixed(pattern = "e", n = 4) %>%
as.data.frame() %>%
dplyr::bind_cols()
Source: local data frame [12 x 4]
V1 V2 V3 V4
1 January
2 F bruary
3 March
4 April
5 May
6 Jun
7 July
8 August
9 S pt mb r
10 Octob r
11 Nov mb r
12 D c mb r
# MeCabによる形態素解析結果をパースするときによく使います
# n = 「","の個数 + 1」
c("名詞,サ変接続,*,*,*,*,テスト,テスト,テスト", "名詞,サ変接続,*,*,*,*,統計,トウケイ,トーケイ") %>%
stringr::str_split_fixed(string = , pattern = ",",n = 9)
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9]
[1,] "名詞" "サ変接続" "*" "*" "*" "*" "テスト" "テスト" "テスト"
[2,] "名詞" "サ変接続" "*" "*" "*" "*" "統計" "トウケイ" "トーケイ"
base: iconv
stringr: stringr::str_conv
stringi: stringi::stri_encode
, stringi::stri_conv(alias)
# SHIFT-JISのオープンデータ
# Windows環境ならエンコーディング不要?
shift_jis_str <- readr::read_lines(
file = "https://www.city.chiba.jp/shimin/shimin/kohokocho/documents/shisetsu.csv",
n_max = 1
) %>%
print
[1] "\x83y\x81[\x83W\x83^\x83C\x83g\x83\x8b,\x8e{\x90݃W\x83\x83\x83\x93\x83\x8b,\x8e{\x90݁A\x8fꏊ\x81A\x83C\x83x\x83\x93\x83g\x82̖\xbc\x8f́i\x93ǂ݁j,\x97X\x95֔ԍ\x86,\x8fZ\x8f\x8a,\x83r\x83\x8b\x96\xbc,\x83t\x83\x8d\x83A\x90\x94,\x88ܓx,\x8co\x93x"
# 文字コードを判定すると"Shift_JIS"っぽい(Confidenceが一番高い)
# stringi::stri_enc_detectとstringi::stri_enc_detect2はまだ実験的に作成
estimate_encording <- stringi::stri_enc_detect(str = shift_jis_str) %>%
print
[[1]]
[[1]]$Encoding
[1] "Shift_JIS" "windows-1252" "windows-1250" "GB18030"
[5] "Big5" "windows-1253"
[[1]]$Language
[1] "ja" "es" "cs" "zh" "zh" "el"
[[1]]$Confidence
[1] 1.00 0.24 0.18 0.10 0.10 0.04
# エンコーディング
utf_str <- suppressWarnings(stringr::str_conv(
string = shift_jis_str,
encoding = estimate_encording[[1]]$Encoding[which.max(estimate_encording[[1]]$Confidence)]
)) %>%
print
[1] "ページタイトル,施設ジャンル,施設、場所、イベントの名称(読み),郵便番号,住所,ビル名,フロア数,緯度,経度"
# base::iconvと挙動は同じ
iconv(x = shift_jis_str, from = "SHIFT-JIS", to = "UTF-8")
[1] "ページタイトル,施設ジャンル,施設、場所、イベントの名称(読み),郵便番号,住所,ビル名,フロア数,緯度,経度"
base: grepl
stringr: stringr::str_detect
stringi: stringi::stri_detect_fixed
# パターンを含むかどうかの論理値を返す
stringr::str_detect(string = month.name, pattern = "J")
[1] TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
[12] FALSE
# base::greplと挙動は同じ
grepl(x = month.name, pattern = "J")
[1] TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
[12] FALSE
base: regmatches
stringr: stringr::str_extract
, stringr::str_extract_all
stringi: stringi::stri_extract_first
, stringi::stri_extract_all
stringr: stringr::str_match
, stringr::str_match_all
stringi: stringi::stri_match_first
, stringi::stri_match_all
# stringr::str_extractはパターンにマッチした箇所のみをベクトルで返す
stringr::str_extract(string = month.name, pattern = "(.{1,3})(.{1,3})")
[1] "Januar" "Februa" "March" "April" "May" "June" "July"
[8] "August" "Septem" "Octobe" "Novemb" "Decemb"
# stringr::str_match() でグループ化した箇所を、グループに対応する列番号に代入した行列で返す
# 1列目は完全一致した文字列
stringr::str_match(string = month.name, pattern = "(.{1,3})(.{1,3})")
[,1] [,2] [,3]
[1,] "Januar" "Jan" "uar"
[2,] "Februa" "Feb" "rua"
[3,] "March" "Mar" "ch"
[4,] "April" "Apr" "il"
[5,] "May" "Ma" "y"
[6,] "June" "Jun" "e"
[7,] "July" "Jul" "y"
[8,] "August" "Aug" "ust"
[9,] "Septem" "Sep" "tem"
[10,] "Octobe" "Oct" "obe"
[11,] "Novemb" "Nov" "emb"
[12,] "Decemb" "Dec" "emb"
# グループ化しないと同じ結果を出すが、stringr::str_extractはベクトルで、stringr::str_matchは1列の行列
stringr::str_extract(string = month.name, pattern = ".{1,3}.{1,3}")
[1] "Januar" "Februa" "March" "April" "May" "June" "July"
[8] "August" "Septem" "Octobe" "Novemb" "Decemb"
stringr::str_match(string = month.name, pattern = ".{1,3}.{1,3}")
[,1]
[1,] "Januar"
[2,] "Februa"
[3,] "March"
[4,] "April"
[5,] "May"
[6,] "June"
[7,] "July"
[8,] "August"
[9,] "Septem"
[10,] "Octobe"
[11,] "Novemb"
[12,] "Decemb"
# stringr::str_extract_allはマッチした全ての箇所を出す
# stringr::str_extractは複数パターンがあった場合は先頭一致
# 引数simplifyで行列とリストの変更が可能(デフォルトはFALSEでリスト)
stringr::str_extract(string = month.name, pattern = "er|em")
[1] NA NA NA NA NA NA NA NA "em" "er" "em" "em"
stringr::str_extract_all(string = month.name, pattern = "er|em", simplify = TRUE)
[,1] [,2]
[1,] "" ""
[2,] "" ""
[3,] "" ""
[4,] "" ""
[5,] "" ""
[6,] "" ""
[7,] "" ""
[8,] "" ""
[9,] "em" "er"
[10,] "er" ""
[11,] "em" "er"
[12,] "em" "er"
# stringr::str_match_allはマッチした全ての箇所を出す
# stringr::str_matchは複数パターンがあった場合は先頭一致
# stringr::str_extract/stringr::str_extract_allと異なりどのパターンにマッチしたかわかる
stringr::str_match(string = month.name, pattern = "(er)|(em)")
[,1] [,2] [,3]
[1,] NA NA NA
[2,] NA NA NA
[3,] NA NA NA
[4,] NA NA NA
[5,] NA NA NA
[6,] NA NA NA
[7,] NA NA NA
[8,] NA NA NA
[9,] "em" NA "em"
[10,] "er" "er" NA
[11,] "em" NA "em"
[12,] "em" NA "em"
stringr::str_match_all(string = month.name, pattern = "(er)|(em)")
[[1]]
[,1] [,2] [,3]
[[2]]
[,1] [,2] [,3]
[[3]]
[,1] [,2] [,3]
[[4]]
[,1] [,2] [,3]
[[5]]
[,1] [,2] [,3]
[[6]]
[,1] [,2] [,3]
[[7]]
[,1] [,2] [,3]
[[8]]
[,1] [,2] [,3]
[[9]]
[,1] [,2] [,3]
[1,] "em" "" "em"
[2,] "er" "er" ""
[[10]]
[,1] [,2] [,3]
[1,] "er" "er" ""
[[11]]
[,1] [,2] [,3]
[1,] "em" "" "em"
[2,] "er" "er" ""
[[12]]
[,1] [,2] [,3]
[1,] "em" "" "em"
[2,] "er" "er" ""
# regmatchesはグループ化してもしなくても変わらない
# stringr::str_extract_allに近い結果(stringr::str_extract_allはマッチしない箇所は空文字列)
regmatches(x = month.name, m = gregexpr(text = month.name, pattern = "er|em"))
[[1]]
character(0)
[[2]]
character(0)
[[3]]
character(0)
[[4]]
character(0)
[[5]]
character(0)
[[6]]
character(0)
[[7]]
character(0)
[[8]]
character(0)
[[9]]
[1] "em" "er"
[[10]]
[1] "er"
[[11]]
[1] "em" "er"
[[12]]
[1] "em" "er"
base: grep
stringr: stringr::str_subset
stringi: stringi::stri_subset
# stringr::str_subsetは条件を含んだ文字列全体を返す
stringr::str_subset(string = month.name, pattern = "M|J")
[1] "January" "March" "May" "June" "July"
# stringr::str_extract/stringr::str_matchは条件にマッチした部分文字列
stringr::str_extract(string = month.name, pattern = "M|J")
[1] "J" NA "M" NA "M" "J" "J" NA NA NA NA NA
stringr::str_match(string = month.name, pattern = "M|J") %>%
t
[,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12]
[1,] "J" NA "M" NA "M" "J" "J" NA NA NA NA NA
# base::grepの引数valueをTRUEにした場合と挙動は同じ
grep(x = month.name, pattern = "M|J", value = TRUE)
[1] "January" "March" "May" "June" "July"
base: -
stringr: stringr::str_count
stringi: stringi::stri_count_fixed
stringr::str_count(string = month.name, pattern = "e")
[1] 0 1 0 0 0 1 0 0 3 1 2 3
# 複数パターン時はマッチした分だけ加算
stringr::str_count(string = month.name, pattern = "e|J")
[1] 1 1 0 0 0 2 1 0 3 1 2 3
base: sub
, gsub
stringr: stringr::str_replace
, stringr::str_replace_all
stringi: stringi::stri_replace
, stringi::stri_replace_all
# stringr::str_replaceは引数patternに先頭一致した文字列を、
# stringr::str_replace_allは全てを、
# 引数replacementの値に置換
stringr::str_replace(string = month.name, pattern = "e", replacement = "x")
[1] "January" "Fxbruary" "March" "April" "May"
[6] "Junx" "July" "August" "Sxptember" "Octobxr"
[11] "Novxmber" "Dxcember"
stringr::str_replace_all(string = month.name, pattern = "e", replacement = "x")
[1] "January" "Fxbruary" "March" "April" "May"
[6] "Junx" "July" "August" "Sxptxmbxr" "Octobxr"
[11] "Novxmbxr" "Dxcxmbxr"
# stringr::str_replace_allは次のような対応付ける記述も可能
stringr::str_replace_all(string = month.name, pattern = c("e" = "x", "J" = "K"))
[1] "Kanuary" "Fxbruary" "March" "April" "May"
[6] "Kunx" "Kuly" "August" "Sxptxmbxr" "Octobxr"
[11] "Novxmbxr" "Dxcxmbxr"
# stringr::str_replaceだと引数replacementがないとエラー
try(expr = stringr::str_replace(string = month.name, pattern = c("e" = "x", "J" = "K")), silent = TRUE)
# base::subとbase::gsubと挙動は同じ
sub(x = month.name, pattern = "e", replacement = "x")
[1] "January" "Fxbruary" "March" "April" "May"
[6] "Junx" "July" "August" "Sxptember" "Octobxr"
[11] "Novxmber" "Dxcember"
gsub(x = month.name, pattern = "e", replacement = "x")
[1] "January" "Fxbruary" "March" "April" "May"
[6] "Junx" "July" "August" "Sxptxmbxr" "Octobxr"
[11] "Novxmbxr" "Dxcxmbxr"
# stringr::str_replace_naでNAの置換も可能
stringr::str_replace_na(string = c(month.name, NA), replacement = "AN")
[1] "January" "February" "March" "April" "May"
[6] "June" "July" "August" "September" "October"
[11] "November" "December" "AN"
stringr::str_replace_na(string = c(month.name, "NA"), replacement = "AN")
[1] "January" "February" "March" "April" "May"
[6] "June" "July" "August" "September" "October"
[11] "November" "December" "NA"
# subだとNAを含むと失敗する
sub(x = c(month.name, NA), pattern = NA, replacement = "AN")
[1] NA NA NA NA NA NA NA NA NA NA NA NA NA
sub(x = c(month.name, "NA"), pattern = "NA", replacement = "AN")
[1] "January" "February" "March" "April" "May"
[6] "June" "July" "August" "September" "October"
[11] "November" "December" "AN"
base: substr
, substring
stringr: stringr::str_sub
stringi: stringi::stri_sub
month_name_stringr <- month_name_base <- month.name
stringr::str_sub(string = month_name_stringr, start = 1, end = 3)
[1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"
# startとendの引数にマイナスの値を指定すると、後ろから数えた文字列になる
stringr::str_sub(string = month_name_stringr, start = -3, end = -1)
[1] "ary" "ary" "rch" "ril" "May" "une" "uly" "ust" "ber" "ber" "ber"
[12] "ber"
stringr::str_sub(string = month_name_stringr, start = 4, end = -1) <- "ber"
# 4文字目から末尾までを引数valueの"ber"で置換(元の文字列長に関係なく置き換わる)
month_name_stringr
[1] "Janber" "Febber" "Marber" "Aprber" "Mayber" "Junber" "Julber"
[8] "Augber" "Sepber" "Octber" "Novber" "Decber"
# base::substr/base::substringとは文字列置換の挙動が異なる
# base::substr/base::substringは引数valueのサイズに合わせて置換
# base::substr/base::substringは同じ
substr(x = month_name_base, start = 1, stop = 3)
[1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"
substr(x = month_name_base, start = 4, stop = 10) <- "ber"
# 4文字目以降に引数valueの"ber"を挿入
# 3文字以下の場合は文字の挿入はされない
# 挿入後に元の文字列サイズは越えないように前から切られる
month_name_base
[1] "Janbery" "Febberry" "Marbe" "Aprbe" "May"
[6] "Junb" "Julb" "Augber" "Sepberber" "Octberr"
[11] "Novberer" "Decberer"
month_name_base <- month.name
substring(text = month_name_base, first = 4) <- "ber"
month_name_base
[1] "Janbery" "Febberry" "Marbe" "Aprbe" "May"
[6] "Junb" "Julb" "Augber" "Sepberber" "Octberr"
[11] "Novberer" "Decberer"
# startとstopの引数にマイナスは指定しても、後ろからにはならない
substr(x = month_name_base, start = -3, stop = -1)
[1] "" "" "" "" "" "" "" "" "" "" "" ""
substr(x = month_name_base, start = -3, stop = 3)
[1] "Jan" "Feb" "Mar" "Apr" "May" "Jun" "Jul" "Aug" "Sep" "Oct" "Nov"
[12] "Dec"
base: regexpr
, gregexpr
stringr: stringr::str_locate
, stringr::str_locate_all
stringi: stringi::stri_locate
, stringi::stri_locate_all
, (stringi::stri_locate_first, stringi::stri_locate_last)
# stringr::str_locateは複数パターンがあった場合は先頭一致するパターンの位置
# "June"の戻り値は最初に一致したパターン"J"の位置
stringr::str_locate(string = month.name, pattern = "e|J")
start end
[1,] 1 1
[2,] 2 2
[3,] NA NA
[4,] NA NA
[5,] NA NA
[6,] 1 1
[7,] 1 1
[8,] NA NA
[9,] 2 2
[10,] 6 6
[11,] 4 4
[12,] 2 2
# stringr::str_locate_allはマッチした全ての箇所を出す
# 複数時の順番はマッチした順(パターンの記述順は無視)
# "June"の戻り値は最初に一致したパターン"J"の位置と、次に一致したパターン"e"の位置
stringr::str_locate_all(string = month.name, pattern = "e|J")
[[1]]
start end
[1,] 1 1
[[2]]
start end
[1,] 2 2
[[3]]
start end
[[4]]
start end
[[5]]
start end
[[6]]
start end
[1,] 1 1
[2,] 4 4
[[7]]
start end
[1,] 1 1
[[8]]
start end
[[9]]
start end
[1,] 2 2
[2,] 5 5
[3,] 8 8
[[10]]
start end
[1,] 6 6
[[11]]
start end
[1,] 4 4
[2,] 7 7
[[12]]
start end
[1,] 2 2
[2,] 4 4
[3,] 7 7
# stringr::invert_matchにstringr::str_locate_allの戻り値を渡すとマッチしなかった位置の範囲を返す
match_idx <- stringr::invert_match(
loc = stringr::str_locate_all(string = month.name, pattern = "e|J")[[9]]
)
stringr::str_sub(
string = month.name[9],
start = match_idx[, "start"], end = match_idx[, "end"]
)
[1] "S" "pt" "mb" "r"
# base::regexpr/base::gregexprは似た挙動
# マッチした開始位置とマッチした長さを返す
regexpr(text = month.name, pattern = "e|J")
[1] 1 2 -1 -1 -1 1 1 -1 2 6 4 2
attr(,"match.length")
[1] 1 1 -1 -1 -1 1 1 -1 1 1 1 1
attr(,"useBytes")
[1] TRUE
base: rep
stringr: stringr::str_dup
stringi: stringi::stri_dup
# times引数の各値だけstring引数の各ベクトルを繰り返す
# stringが「"月", "火", ..., "日"」で、timesが「1, 2, ..., 7」なので、
# 「月が1回」「火が2回」という結果になる
stringr::str_dup(
string = format(as.Date("2015-07-12") + seq(from = 1, to = 7), "%a"),
times = seq(from = 1, to = 7)
)
[1] "月" "火火" "水水水" "木木木木"
[5] "金金金金金" "土土土土土土" "日日日日日日日"
# base::repが似た挙動
rep(
x = format(as.Date("2015-07-12") + seq(from = 1, to = 7), "%a"),
times = seq(from = 1, to = 7)
)
[1] "月" "火" "火" "水" "水" "水" "木" "木" "木" "木" "金" "金" "金" "金"
[15] "金" "土" "土" "土" "土" "土" "土" "日" "日" "日" "日" "日" "日" "日"
base: nchar
stringr: stringr::str_length
stringi: stringi::stri_length
# 各文字列の長さを返す
stringr::str_length(string = month.name)
[1] 7 8 5 5 3 4 4 6 9 7 8 8
# UTF-8でないとダメっぽい(Windowsの場合は通る。SHIFT-JISを{stringi}側でUTF-8に変換してくれる?)
stringr::str_length(string = shift_jis_str)
Warning in stri_length(string): invalid UTF-8 byte sequence detected.
perhaps you should try calling stri_enc_toutf8()
[1] NA
stringr::str_length(string = utf_str)
[1] 55
# unicode文字列も変換すると正しく数えられる
unicode_str <- "\\u3042\\u3043\\u3045\\u3045"
stringr::str_length(string = unicode_str)
[1] 24
stringr::str_length(string = stringi::stri_unescape_unicode(unicode_str))
[1] 4
# stringi::stri_numbytesだとバイト長を測る
stringr::str_length(string = c("abc", "123", "\u0105\u0104"))
[1] 3 3 2
stringi::stri_numbytes(str = c("abc", "123", "\u0105\u0104"))
[1] 3 3 4
# 長さによる判定でもできるが、空文字列かどうかの判定するstringi::stri_isemptyがある
stringi::stri_isempty(str = c("", "abc", "123", "\u0105\u0104", character(1)))
[1] TRUE FALSE FALSE FALSE TRUE
base: order
, sort
stringr: stringr::str_order
, stringr::str_sort
stringi: stringi::stri_order
, stringi::stri_sort
ja_aiueo <- stringr::str_split(string = "あいうえおアイウエオアイウエオ", pattern = "") %>%
dplyr::combine()
stringr::str_sort(x = ja_aiueo)
[1] "あ" "ア" "ア" "い" "イ" "イ" "う" "ウ" "ウ" "え" "エ" "エ" "お" "オ"
[15] "オ"
stringr::str_order(x = ja_aiueo)
[1] 1 6 11 2 7 12 3 8 13 4 9 14 5 10 15
# base::orderは同じ結果だが、base::sortは微妙に異なる結果を出す
# Windowsだと「"ア" "ア" "あ" "イ" "イ" "い" "ウ" "ウ" "う" "エ" "エ" "え" "オ" "オ" "お"」になる
sort(x = ja_aiueo)
[1] "あ" "ア" "ア" "い" "イ" "イ" "ウ" "ウ" "う" "エ" "え" "エ" "オ" "お"
[15] "オ"
order(x = ja_aiueo)
[1] 1 6 11 2 7 12 3 8 13 4 9 14 5 10 15
base: -
stringr: stringr::str_pad
, stringr::str_trim
, stringr::str_wrap
stringi: stringi::stri_pad_left
, stringi::stri_pad_both
, stringi::stri_pad_right
, stringi::stri_trim_left
, stringi::stri_trim_both
, stringi::stri_trim_right
, stringi::stri_wrap
# width引数の長さになるまで、pad引数の文字列を、side引数の箇所("both"の際は右側優先)に追加
padding_month_name <- stringr::str_pad(
string = month.name,
width = 8, side = "both", pad = " "
) %>%
print
[1] "January " "February" " March " " April " " May "
[6] " June " " July " " August " "September" "October "
[11] "November" "December"
# 文字列長が1なら空白以外も引数padに指定可能
stringr::str_pad(
string = month.name,
width = 8, side = "both", pad = "-"
)
[1] "January-" "February" "-March--" "-April--" "--May---"
[6] "--June--" "--July--" "-August-" "September" "October-"
[11] "November" "December"
# 空白文字列を除去
stringr::str_trim(string = padding_month_name, side = "both")
[1] "January" "February" "March" "April" "May"
[6] "June" "July" "August" "September" "October"
[11] "November" "December"
# 文字列ベクトルを引数widthくらいの長さで改行
# パラグラフ頭に引数indent分の空白を、各文頭に引数exdent分の空白を、作成
stringr::str_wrap(
string = stringr::str_c(month.name, collapse = " "),
width = 30, indent = 10, exdent = 3
)
[1] " January February\n March April May June July\n August September October\n November December"
base: tolower
, toupper
stringr: stringr::str_to_lower
, stringr::str_to_title
, stringr::str_to_upper
stringi: stringi::stri_trans_tolower
, stringi::stri_trans_totitle
, stringi::stri_trans_toupper
stringr::str_to_upper(string = month.name)
[1] "JANUARY" "FEBRUARY" "MARCH" "APRIL" "MAY"
[6] "JUNE" "JULY" "AUGUST" "SEPTEMBER" "OCTOBER"
[11] "NOVEMBER" "DECEMBER"
stringr::str_to_lower(string = month.name)
[1] "january" "february" "march" "april" "may"
[6] "june" "july" "august" "september" "october"
[11] "november" "december"
stringr::str_to_title(string = stringr::str_to_lower(string = month.name))
[1] "January" "February" "March" "April" "May"
[6] "June" "July" "August" "September" "October"
[11] "November" "December"
# baseにはタイトルケース化の関数はない(パッケージの利用やパターンマッチで行える。下記リンクより)
# http://stackoverflow.com/questions/6364783/capitalize-the-first-letter-of-both-words-in-a-two-word-string
# base::tolower/base::toupperで、大文字化と小文字化は可能
tolower(x = month.name)
[1] "january" "february" "march" "april" "may"
[6] "june" "july" "august" "september" "october"
[11] "november" "december"
toupper(x = month.name)
[1] "JANUARY" "FEBRUARY" "MARCH" "APRIL" "MAY"
[6] "JUNE" "JULY" "AUGUST" "SEPTEMBER" "OCTOBER"
[11] "NOVEMBER" "DECEMBER"
base: -
stringr: stringr::word
stringi: -
str <- c("我輩", "は", "猫", "で", "ある")
# 特定の文字で分かち書きされた文から単語を抽出
stringr::word(
string = stringi::stri_flatten(str = str, collapse = " "),
start = 1, end = seq(from = 1, to = length(str)),
sep = " "
)
[1] "我輩" "我輩 は" "我輩 は 猫"
[4] "我輩 は 猫 で" "我輩 は 猫 で ある"
# 単語からなる文字列ベクトルでは期待通りに動作しない(各要素がそのまま返る)
# 単語をある文字列で連結させ、ひとつの文字列にする必要がある(stringi::stri_flattenがこの用途に有用)
# "我輩 は 猫 で ある" => OK
# c("我輩", "は", "猫", "で", "ある") => NG
stringr::word(string = str, start = 1)
[1] "我輩" "は" "猫" "で" "ある"
base: -
stringr: stringr::boundary
stringi: stringi::stri_opts_brkiter
# stringr::boundaryではstringi::stri_opts_brkiterを呼び出す
# stringr::str_splitの使用時にstringi::stri_split_boundariesにてstringi::stri_opts_brkiterが参照される
# Webページのようなスペースが揃っていないテキストを入手した際、パースに悩むときに使う
web_like_text <- stringr::str_wrap(
string = stringr::str_c(month.name, collapse = " "),
width = 30, indent = 10, exdent = 3
) %>%
print
[1] " January February\n March April May June July\n August September October\n November December"
# うまくいかない
stringr::str_split(string = web_like_text, pattern = " ")
[[1]]
[1] "" "" "" "" ""
[6] "" "" "" "" ""
[11] "January" "February\n" "" "" "March"
[16] "April" "May" "June" "July\n" ""
[21] "" "August" "September" "October\n" ""
[26] "" "November" "December"
# type = "word"で単語毎に分ける
stringr::str_split(
string = web_like_text,
pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
) %>%
dplyr::combine()
[1] "January" "February" "March" "April" "May"
[6] "June" "July" "August" "September" "October"
[11] "November" "December"
# type = "line_break"は改行文字直前の単語のみを抽出
stringr::str_split(
string = web_like_text,
pattern = stringr::boundary(type = "line_break", skip_word_none = TRUE)
) %>%
dplyr::combine()
[1] "February\n" "July\n" "October\n"
# 改行文字を削除すると機能しない
stringr::str_split(
string = stringr::str_replace_all(string = web_like_text, pattern = "\\n", replacement = ""),
pattern = stringr::boundary(type = "line_break", skip_word_none = TRUE)
) %>%
dplyr::combine()
character(0)
# type = "sentence"は文単位(改行)で区切る
stringr::str_split(
string = web_like_text,
pattern = stringr::boundary(type = "sentence", skip_word_none = TRUE)
) %>%
dplyr::combine()
[1] " January February\n" " March April May June July\n"
[3] " August September October\n" " November December"
# 改行文字を削除するとそのまま出力
stringr::str_split(
string = stringr::str_replace_all(string = web_like_text, pattern = "\\n", replacement = ""),
pattern = stringr::boundary(type = "sentence", skip_word_none = TRUE)
) %>%
dplyr::combine()
[1] " January February March April May June July August September October November December"
# type = "character"の挙動がよくわからない
stringr::str_split(
string = stringr::str_wrap(
string = stringr::str_c(month.abb, n_day, sep = ".", collapse = ""),
width = 30, indent = 10, exdent = 3
),
pattern = stringr::boundary(type = "character", skip_word_none = TRUE)
) %>%
dplyr::combine()
character(0)
# 日本語でも動作する
web_like_ja_text <- stringr::str_wrap(
string = stringr::str_c(
stringr::str_dup(
string = format(as.Date("2015-07-12") + seq(from = 1, to = 7), "%a"),
times = seq(from = 1, to = 7)),
collapse = " "
),
width = 20, indent = 10, exdent = 3
) %>%
print
[1] " 月 火火 水水水 木\n 木木木 金金金金金 土土土土土土\n 日日日日日日日"
stringr::str_split(
string = web_like_ja_text,
pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
)
[[1]]
[1] "月" "火火" "水水水" "木"
[5] "木木木" "金金金金金" "土土土土土土" "日日日日日日日"
stringr::str_split(
string = web_like_ja_text,
pattern = stringr::boundary(type = "line_break", skip_word_none = TRUE)
)
[[1]]
[1] "木\n" "土\n"
stringr::str_split(
string = web_like_ja_text,
pattern = stringr::boundary(type = "sentence", skip_word_none = TRUE)
)
[[1]]
[1] " 月 火火 水水水 木\n"
[2] " 木木木 金金金金金 土土土土土土\n"
[3] " 日日日日日日日"
# 「stringi::stri_*_boundaries」を呼び出している関数で使える
# stringr::str_locate/stringr::str_locate_allでも使える
# stri_locate_all_boundaries, stri_locate_first_boundaries, stri_locate_last_boundariesにて参照
word_locate <- stringr::str_locate_all(
string = web_like_text,
pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
)
stringr::str_sub(
string = web_like_text,
start = word_locate[[1]][, "start"], end = word_locate[[1]][, "end"]
)
[1] "January" "February" "March" "April" "May"
[6] "June" "July" "August" "September" "October"
[11] "November" "December"
# stringr::str_countでも使える(stringi::stri_count_boundariesで参照)
# 単語で分割された数(元データが暦の英語名)が出力
stringr::str_count(
string = web_like_text,
pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
)
[1] 12
# stringr::str_detect/stringr::str_extract/stringr::str_subsetは未実装(エラー)
# 「stringi::stri_*_boundaries」を呼び出す関数がない
# それぞれのドキュメントには引数patternでstringr::boundary()が使えると書いてあります
try(
expr = stringr::str_subset(
string = web_like_text,
pattern = stringr::boundary(type = "word", skip_word_none = TRUE)
)
)
base: -
stringr: stringr::regex
, stringr::fixed
, stringr::coll
stringi: stri_opts_regex
, stri_opts_fixed
, stri_opts_collator
# {stringr}のデフォルトの正規表現は{stringi}で使われるICU正規表現エンジンで処理される
# どういう正規表現が使えるかは下記のオンラインマニュアルを参照
# http://docs.rexamine.com/R-man/stringi/stringi-search-regex.html
# stringr::regexはデフォルトのICUのオプションを変更する際に使う
# 変えられるオプションはstringi::stri_opts_regexを参考のこと
multiline_str <- stringr::str_c(
stringr::str_split(string = web_like_text, pattern = stringr::boundary(type = "line_break", skip_word_none = TRUE)) %>%
dplyr::combine(),
collapse = ""
) %>%
print
[1] "February\nJuly\nOctober\n"
# 引数multilineはFALSEがデフォルト
str_extract_all(string = multiline_str, pattern = "^.")
[[1]]
[1] "F"
# 引数multilineはTRUEに変更
str_extract_all(string = multiline_str, pattern = stringr::regex(pattern = "^.", multiline = TRUE))
[[1]]
[1] "F" "J" "O"
# デフォルト仕様と異なるマッチの方法をさせたい場合にstringr::fixed/stringr::collを用いる(前述のstringr::boundaryも同様の用途で使える)
# @kohske先生の記事がとてもわかりやすい
# http://qiita.com/kohske/items/85d49da04571e9055c44#パターン検索
# stringr::fixedではパターンをロケールに依存しないバイト列としてマッチさせる
# Examplesより
strings <- c("abb", "a.b")
pattern <- "a.b"
str_detect(strings, pattern)
[1] TRUE TRUE
str_detect(strings, fixed(pattern))
[1] FALSE TRUE
str_detect(strings, coll(pattern))
[1] FALSE TRUE
# stringr::collではロケールを考慮したマッチに役立つ
i <- c("I", "\u0130", "i") %>%
print
[1] "I" "İ" "i"
str_detect(i, fixed("i", TRUE))
[1] TRUE FALSE TRUE
str_detect(i, coll("i", TRUE))
[1] TRUE FALSE TRUE
str_detect(i, coll("i", TRUE, locale = "tr"))
[1] FALSE TRUE TRUE
stri_info
, stri_install_check
, stri_install_icudt
# Windowsだと警告文が出る
# Your native charset is not a superset of US-ASCII. This may cause serious problems. Consider switching to UTF-8.
stringi::stri_info()
$Unicode.version
[1] "6.3"
$ICU.version
[1] "52.1"
$Locale
$Locale$Language
[1] "ja"
$Locale$Country
[1] "JP"
$Locale$Variant
[1] ""
$Locale$Name
[1] "ja_JP"
$Charset.internal
[1] "UTF-8" "UTF-16"
$Charset.native
$Charset.native$Name.friendly
[1] "UTF-8"
$Charset.native$Name.ICU
[1] "UTF-8"
$Charset.native$Name.UTR22
[1] NA
$Charset.native$Name.IBM
[1] "ibm-1208"
$Charset.native$Name.WINDOWS
[1] "windows-65001"
$Charset.native$Name.JAVA
[1] "UTF-8"
$Charset.native$Name.IANA
[1] "UTF-8"
$Charset.native$Name.MIME
[1] "UTF-8"
$Charset.native$ASCII.subset
[1] TRUE
$Charset.native$Unicode.1to1
[1] NA
$Charset.native$CharSize.8bit
[1] FALSE
$Charset.native$CharSize.min
[1] 1
$Charset.native$CharSize.max
[1] 3
stringi::stri_install_check()
stringi::stri_install_icudt()
stri_locale_get
, stri_locale_info
, stri_locale_list
, stri_locale_set
# 使用できるICUのロケール名
stringi::stri_locale_list()
[1] "af" "af_NA" "af_ZA" "agq" "agq_CM"
[6] "ak" "ak_GH" "am" "am_ET" "ar"
[11] "ar_001" "ar_AE" "ar_BH" "ar_DJ" "ar_DZ"
[16] "ar_EG" "ar_EH" "ar_ER" "ar_IL" "ar_IQ"
[21] "ar_JO" "ar_KM" "ar_KW" "ar_LB" "ar_LY"
[26] "ar_MA" "ar_MR" "ar_OM" "ar_PS" "ar_QA"
[31] "ar_SA" "ar_SD" "ar_SO" "ar_SS" "ar_SY"
[36] "ar_TD" "ar_TN" "ar_YE" "as" "as_IN"
[41] "asa" "asa_TZ" "az" "az_Cyrl" "az_Cyrl_AZ"
[46] "az_Latn" "az_Latn_AZ" "bas" "bas_CM" "be"
[51] "be_BY" "bem" "bem_ZM" "bez" "bez_TZ"
[56] "bg" "bg_BG" "bm" "bm_ML" "bn"
[61] "bn_BD" "bn_IN" "bo" "bo_CN" "bo_IN"
[66] "br" "br_FR" "brx" "brx_IN" "bs"
[71] "bs_Cyrl" "bs_Cyrl_BA" "bs_Latn" "bs_Latn_BA" "ca"
[76] "ca_AD" "ca_ES" "ca_FR" "ca_IT" "cgg"
[81] "cgg_UG" "chr" "chr_US" "cs" "cs_CZ"
[86] "cy" "cy_GB" "da" "da_DK" "da_GL"
[91] "dav" "dav_KE" "de" "de_AT" "de_BE"
[96] "de_CH" "de_DE" "de_LI" "de_LU" "dje"
[101] "dje_NE" "dua" "dua_CM" "dyo" "dyo_SN"
[106] "dz" "dz_BT" "ebu" "ebu_KE" "ee"
[111] "ee_GH" "ee_TG" "el" "el_CY" "el_GR"
[116] "en" "en_001" "en_150" "en_AG" "en_AI"
[121] "en_AS" "en_AU" "en_BB" "en_BE" "en_BM"
[126] "en_BS" "en_BW" "en_BZ" "en_CA" "en_CC"
[131] "en_CK" "en_CM" "en_CX" "en_DG" "en_DM"
[136] "en_ER" "en_FJ" "en_FK" "en_FM" "en_GB"
[141] "en_GD" "en_GG" "en_GH" "en_GI" "en_GM"
[146] "en_GU" "en_GY" "en_HK" "en_IE" "en_IM"
[151] "en_IN" "en_IO" "en_JE" "en_JM" "en_KE"
[156] "en_KI" "en_KN" "en_KY" "en_LC" "en_LR"
[161] "en_LS" "en_MG" "en_MH" "en_MO" "en_MP"
[166] "en_MS" "en_MT" "en_MU" "en_MW" "en_NA"
[171] "en_NF" "en_NG" "en_NR" "en_NU" "en_NZ"
[176] "en_PG" "en_PH" "en_PK" "en_PN" "en_PR"
[181] "en_PW" "en_RW" "en_SB" "en_SC" "en_SD"
[186] "en_SG" "en_SH" "en_SL" "en_SS" "en_SX"
[191] "en_SZ" "en_TC" "en_TK" "en_TO" "en_TT"
[196] "en_TV" "en_TZ" "en_UG" "en_UM" "en_US"
[201] "en_US_POSIX" "en_VC" "en_VG" "en_VI" "en_VU"
[206] "en_WS" "en_ZA" "en_ZM" "en_ZW" "eo"
[211] "es" "es_419" "es_AR" "es_BO" "es_CL"
[216] "es_CO" "es_CR" "es_CU" "es_DO" "es_EA"
[221] "es_EC" "es_ES" "es_GQ" "es_GT" "es_HN"
[226] "es_IC" "es_MX" "es_NI" "es_PA" "es_PE"
[231] "es_PH" "es_PR" "es_PY" "es_SV" "es_US"
[236] "es_UY" "es_VE" "et" "et_EE" "eu"
[241] "eu_ES" "ewo" "ewo_CM" "fa" "fa_AF"
[246] "fa_IR" "ff" "ff_SN" "fi" "fi_FI"
[251] "fil" "fil_PH" "fo" "fo_FO" "fr"
[256] "fr_BE" "fr_BF" "fr_BI" "fr_BJ" "fr_BL"
[261] "fr_CA" "fr_CD" "fr_CF" "fr_CG" "fr_CH"
[266] "fr_CI" "fr_CM" "fr_DJ" "fr_DZ" "fr_FR"
[271] "fr_GA" "fr_GF" "fr_GN" "fr_GP" "fr_GQ"
[276] "fr_HT" "fr_KM" "fr_LU" "fr_MA" "fr_MC"
[281] "fr_MF" "fr_MG" "fr_ML" "fr_MQ" "fr_MR"
[286] "fr_MU" "fr_NC" "fr_NE" "fr_PF" "fr_PM"
[291] "fr_RE" "fr_RW" "fr_SC" "fr_SN" "fr_SY"
[296] "fr_TD" "fr_TG" "fr_TN" "fr_VU" "fr_WF"
[301] "fr_YT" "ga" "ga_IE" "gl" "gl_ES"
[306] "gsw" "gsw_CH" "gsw_LI" "gu" "gu_IN"
[311] "guz" "guz_KE" "gv" "gv_IM" "ha"
[316] "ha_Latn" "ha_Latn_GH" "ha_Latn_NE" "ha_Latn_NG" "haw"
[321] "haw_US" "he" "he_IL" "hi" "hi_IN"
[326] "hr" "hr_BA" "hr_HR" "hu" "hu_HU"
[331] "hy" "hy_AM" "id" "id_ID" "ig"
[336] "ig_NG" "ii" "ii_CN" "is" "is_IS"
[341] "it" "it_CH" "it_IT" "it_SM" "ja"
[346] "ja_JP" "jgo" "jgo_CM" "jmc" "jmc_TZ"
[351] "ka" "ka_GE" "kab" "kab_DZ" "kam"
[356] "kam_KE" "kde" "kde_TZ" "kea" "kea_CV"
[361] "khq" "khq_ML" "ki" "ki_KE" "kk"
[366] "kk_Cyrl" "kk_Cyrl_KZ" "kkj" "kkj_CM" "kl"
[371] "kl_GL" "kln" "kln_KE" "km" "km_KH"
[376] "kn" "kn_IN" "ko" "ko_KP" "ko_KR"
[381] "kok" "kok_IN" "ks" "ks_Arab" "ks_Arab_IN"
[386] "ksb" "ksb_TZ" "ksf" "ksf_CM" "kw"
[391] "kw_GB" "ky" "ky_Cyrl" "ky_Cyrl_KG" "lag"
[396] "lag_TZ" "lg" "lg_UG" "lkt" "lkt_US"
[401] "ln" "ln_AO" "ln_CD" "ln_CF" "ln_CG"
[406] "lo" "lo_LA" "lt" "lt_LT" "lu"
[411] "lu_CD" "luo" "luo_KE" "luy" "luy_KE"
[416] "lv" "lv_LV" "mas" "mas_KE" "mas_TZ"
[421] "mer" "mer_KE" "mfe" "mfe_MU" "mg"
[426] "mg_MG" "mgh" "mgh_MZ" "mgo" "mgo_CM"
[431] "mk" "mk_MK" "ml" "ml_IN" "mn"
[436] "mn_Cyrl" "mn_Cyrl_MN" "mr" "mr_IN" "ms"
[441] "ms_Latn" "ms_Latn_BN" "ms_Latn_MY" "ms_Latn_SG" "mt"
[446] "mt_MT" "mua" "mua_CM" "my" "my_MM"
[451] "naq" "naq_NA" "nb" "nb_NO" "nb_SJ"
[456] "nd" "nd_ZW" "ne" "ne_IN" "ne_NP"
[461] "nl" "nl_AW" "nl_BE" "nl_BQ" "nl_CW"
[466] "nl_NL" "nl_SR" "nl_SX" "nmg" "nmg_CM"
[471] "nn" "nn_NO" "nnh" "nnh_CM" "nus"
[476] "nus_SD" "nyn" "nyn_UG" "om" "om_ET"
[481] "om_KE" "or" "or_IN" "pa" "pa_Arab"
[486] "pa_Arab_PK" "pa_Guru" "pa_Guru_IN" "pl" "pl_PL"
[491] "ps" "ps_AF" "pt" "pt_AO" "pt_BR"
[496] "pt_CV" "pt_GW" "pt_MO" "pt_MZ" "pt_PT"
[501] "pt_ST" "pt_TL" "rm" "rm_CH" "rn"
[506] "rn_BI" "ro" "ro_MD" "ro_RO" "rof"
[511] "rof_TZ" "ru" "ru_BY" "ru_KG" "ru_KZ"
[516] "ru_MD" "ru_RU" "ru_UA" "rw" "rw_RW"
[521] "rwk" "rwk_TZ" "saq" "saq_KE" "sbp"
[526] "sbp_TZ" "seh" "seh_MZ" "ses" "ses_ML"
[531] "sg" "sg_CF" "shi" "shi_Latn" "shi_Latn_MA"
[536] "shi_Tfng" "shi_Tfng_MA" "si" "si_LK" "sk"
[541] "sk_SK" "sl" "sl_SI" "sn" "sn_ZW"
[546] "so" "so_DJ" "so_ET" "so_KE" "so_SO"
[551] "sq" "sq_AL" "sq_MK" "sq_XK" "sr"
[556] "sr_Cyrl" "sr_Cyrl_BA" "sr_Cyrl_ME" "sr_Cyrl_RS" "sr_Cyrl_XK"
[561] "sr_Latn" "sr_Latn_BA" "sr_Latn_ME" "sr_Latn_RS" "sr_Latn_XK"
[566] "sv" "sv_AX" "sv_FI" "sv_SE" "sw"
[571] "sw_KE" "sw_TZ" "sw_UG" "swc" "swc_CD"
[576] "ta" "ta_IN" "ta_LK" "ta_MY" "ta_SG"
[581] "te" "te_IN" "teo" "teo_KE" "teo_UG"
[586] "th" "th_TH" "ti" "ti_ER" "ti_ET"
[591] "to" "to_TO" "tr" "tr_CY" "tr_TR"
[596] "twq" "twq_NE" "tzm" "tzm_Latn" "tzm_Latn_MA"
[601] "uk" "uk_UA" "ur" "ur_IN" "ur_PK"
[606] "uz" "uz_Arab" "uz_Arab_AF" "uz_Cyrl" "uz_Cyrl_UZ"
[611] "uz_Latn" "uz_Latn_UZ" "vai" "vai_Latn" "vai_Latn_LR"
[616] "vai_Vaii" "vai_Vaii_LR" "vi" "vi_VN" "vun"
[621] "vun_TZ" "xog" "xog_UG" "yav" "yav_CM"
[626] "yo" "yo_BJ" "yo_NG" "zgh" "zgh_MA"
[631] "zh" "zh_Hans" "zh_Hans_CN" "zh_Hans_HK" "zh_Hans_MO"
[636] "zh_Hans_SG" "zh_Hant" "zh_Hant_HK" "zh_Hant_MO" "zh_Hant_TW"
[641] "zu" "zu_ZA"
# ロケール情報の取得と設定
stringi::stri_locale_info()
$Language
[1] "ja"
$Country
[1] "JP"
$Variant
[1] ""
$Name
[1] "ja_JP"
now_locale <- stringi::stri_locale_get()
stringi::stri_locale_set(locale = now_locale)
stri_*_charclass
# {stringr}ではUnicode文字クラスでのパターンマッチは用意されていない
stringi::stri_subset_charclass(
str = c("stRRRingi","REXAMINE","123"),
pattern = c("\\p{Ll}", "\\p{Lu}", "\\p{Zs}")
)
[1] "stRRRingi" "REXAMINE"
stri_cmp*, ~~stri_compare(alias)~~
# stringi::stri_cmp*
ls("package:stringi") %>%
stringr::str_subset(pattern = "^stri_cmp")
[1] "stri_cmp" "stri_cmp_eq" "stri_cmp_equiv" "stri_cmp_ge"
[5] "stri_cmp_gt" "stri_cmp_le" "stri_cmp_lt" "stri_cmp_neq"
[9] "stri_cmp_nequiv"
# 文字列比較(ロケール依存)
# 「Cのstrcmp()と同じような挙動」らしい
# [e1 < e2]: -1, [e1 == e2]: 0, [e1 > e2]: +1
stringi::stri_cmp(e1 = "number100", e2 = "number2")
[1] -1
stringi::stri_cmp(e1 = "number100", e2 = "number2", opts_collator = stri_opts_collator(numeric = TRUE))
[1] 1
# ロケール非依存
# stringi::stri_cmp_eq/stringi::stri_cmp_neq: exactly the same / different code points
stringi::stri_cmp_eq(e1 = stringi::stri_trans_nfkd("\u0105"), e2 = "\u105")
[1] FALSE
stringi::stri_cmp_neq(e1 = "hladny", e2 = "HLADNY")
[1] TRUE
# ロケール依存
# stringi::stri_cmp_equiv: canonically equivalent
# stringi::stri_cmp_nequiv: not canonically equivalent
# opts_collatorで受け取れる引数(locale, strength, ...)を指定できる
stringi::stri_cmp_equiv(e1 = "hladny", e2 = "HLADNY", strength = 2)
[1] TRUE
stringi::stri_cmp_nequiv(e1 = "hladny", e2 = "HLADNY", strength = 2)
[1] FALSE
stringi::stri_cmp_nequiv(e1 = "hladny", e2 = "HLADNY", strength = 3)
[1] TRUE
# 辞書順の符号比較(ロケール依存)
# stringi::stri_cmp_lt = "<", stringi::stri_cmp_gt = ">"
# stringi::stri_cmp_le = "<=", stringi::stri_cmp_ge = ">="
stringi::stri_cmp_lt(e1 = "hladny", e2 = "chladny", locale = "pl_PL")
[1] FALSE
stringi::stri_cmp_lt(e1 = "hladny", e2 = "chladny", locale = "sk_SK")
[1] TRUE
stringi::stri_cmp_gt(e1 = "hladny", e2 = "chladny", locale = "pl_PL")
[1] TRUE
# 文字列比較用の演算子も定義されている
# default collator optionsが使われる
# %s==%, %s!=%, %s<%, %s<=%, %s>%, %s>=%
# %stri==%, %stri!=%, %stri<%, %stri<=%, %stri>%, %stri>=%
# 「%s==%」「%stri==%」は canonical equivalence, locale-dependent
# 「%s===%」「%stri===%」は コードポイント列の完全一致(正準等価でも符号位置が違えばFALSE), locale-independent(code point-based)
(stringi::stri_trans_nfkd("\u0105")) %s==% "\u105"
[1] TRUE
(stringi::stri_trans_nfkd("\u0105")) %s===% "\u105"
[1] FALSE
# 「%s!=%」「%stri!=%」は not canonical equivalence, locale-dependent
# 「%s!==%」「%stri!==%」は コードポイント列が完全一致しない(not exactly the same code points), locale-independent(code point-based)
stri_duplicated
, stri_unique
# 重複するか判定
dup_input <- c("a", "b", "a", NA, "a", NA)
stringi::stri_duplicated(str = dup_input)
[1] FALSE FALSE TRUE FALSE TRUE TRUE
# 後ろから判定する場合は引数fromLastをTRUE
stringi::stri_duplicated(str = dup_input, fromLast = TRUE)
[1] TRUE FALSE TRUE TRUE FALSE FALSE
rev(stringi::stri_duplicated(str = rev(dup_input)))
[1] TRUE FALSE TRUE TRUE FALSE FALSE
# base::duplicatedは文字列の正準等価性を見ていない(符号位置が同じかどうかだけで比較する)ので、正準等価な文字列を重複として検出できない
dup_str <- c("\u0105", stringi::stri_trans_nfkd("\u0105")) %>%
print
[1] "ą" "ą"
duplicated(x = dup_str)
[1] FALSE FALSE
stringi::stri_duplicated(str = dup_str)
[1] FALSE TRUE
# 最初に重複と判定された要素のインデックスを返す(重複がなければ0。重複の個数ではない)
stringi::stri_duplicated_any(str = dup_input)
[1] 3
# 重複する文字列を削除
stringi::stri_unique(str = dup_input)
[1] "a" "b" NA
stringi::stri_unique(str = dup_str)
[1] "ą"
# base::uniqueも通常の文字列では同じ結果になるが、正準等価な文字列(合成済みと分解済み)は別の文字列として残してしまう
unique(x = dup_input)
[1] "a" "b" NA
unique(x = dup_str)
[1] "ą" "ą"
stri_startswith
, stri_endswith
stringi::stri_startswith(str = month.name, fixed = "J")
[1] TRUE FALSE FALSE FALSE FALSE TRUE TRUE FALSE FALSE FALSE FALSE
[12] FALSE
stringi::stri_endswith(str = month.name, fixed = "ber")
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE TRUE TRUE
[12] TRUE
# 引数fromで開始位置を変えられる
stringi::stri_startswith(str = month.name, fixed = "A", from = 2)
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE
stringi::stri_startswith(str = month.name, fixed = "a", from = 2)
[1] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE
stringi::stri_startswith(str = month.name, coll = "A", from = 2)
[1] FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE
# stri_opts_collatorの引数strength({1,2,3,4})で照合の強さを設定(1が最も緩い。デフォルトは3)
stringi::stri_startswith(str = month.name, coll = "A", from = 2, strength = 1)
[1] TRUE FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE
[12] FALSE
stri_count_words
, stri_count_boundaries
# 単語数の取得(単語の境界はstringi::stri_count_boundariesで判定)
stringi::stri_count_words(str = stringi::stri_flatten(str = str, collapse = " "))
[1] 5
count_test <- "The\u00a0above-mentioned features are very useful. Warm thanks to their developers."
stringi::stri_count_words(str = count_test)
[1] 12
# stri_count_boundariesの挙動はよくわからない
stringi::stri_count_boundaries(str = count_test, type = "word")
[1] 28
stringi::stri_count_boundaries(str = count_test, type = "sentence")
[1] 2
stringi::stri_count_boundaries(str = count_test, type = "character")
[1] 81
stri_list2matrix
, stri_flatten
input_lst <- list("a", c("b", "c"))
# base::simplify2array
simplify2array(x = input_lst)
[[1]]
[1] "a"
[[2]]
[1] "b" "c"
# stringi::stri_list2matrixでリストを行列へ
stringi::stri_list2matrix(x = input_lst)
[,1] [,2]
[1,] "a" "b"
[2,] NA "c"
stringi::stri_list2matrix(x = input_lst, fill = "")
[,1] [,2]
[1,] "a" "b"
[2,] "" "c"
stringi::stri_list2matrix(x = input_lst, fill = "", n_min = 5)
[,1] [,2]
[1,] "a" "b"
[2,] "" "c"
[3,] "" ""
[4,] "" ""
[5,] "" ""
stringi::stri_list2matrix(x = input_lst, fill = "", n_min = 5, byrow = TRUE)
[,1] [,2] [,3] [,4] [,5]
[1,] "a" "" "" "" ""
[2,] "b" "c" "" "" ""
# stringi::stri_flattenで文字列ベクトルを一つの文字列に
str
[1] "我輩" "は" "猫" "で" "ある"
stringr::str_length(string = str)
[1] 2 1 1 1 2
stringi::stri_flatten(str = str, collapse = " ")
[1] "我輩 は 猫 で ある"
stringr::str_length(string = stringi::stri_flatten(str = str, collapse = " "))
[1] 11
stri_rand_lipsum
, stri_rand_strings
# Lorem ipsumに基づくダミーテキストの生成
# https://ja.wikipedia.org/wiki/Lorem_ipsum
# 引数start_lipsumは"Lorem ipsum dolor sit amet"から始めるかどうか
stringi::stri_rand_lipsum(nparagraphs = 2, start_lipsum = TRUE)
[1] "Lorem ipsum dolor sit amet, nec semper netus massa. Imperdiet pellentesque mattis quam eu eu aenean non a in nibh, ut ut, quam! Sed erat dis, mi hac orci condimentum sollicitudin efficitur. Eu vitae dis vestibulum. Est arcu donec sed lectus sem imperdiet dolor malesuada. Dictumst at lobortis tincidunt leo eu vestibulum eu in. Ac, eget proin lorem nulla orci integer. Sapien, senectus accumsan sagittis, felis augue. Integer in sapien potenti, nullam et. Ut sapien odio. Vulputate dolor neque interdum habitant in, sociosqu ut. Non, class ipsum nunc quam nascetur diam."
[2] "In non dignissim integer et ut est sed, ipsum. Convallis, et habitasse sed neque eu pulvinar nec. Non aliquam nec curabitur, sed ut ante tempor, dui. Pellentesque est posuere, ac id sed auctor eu fames vestibulum. Accumsan ultricies maecenas nec ut dui nascetur, ut etiam. Est eu proin torquent. A sed ut ultrices. Tincidunt vehicula suspendisse, ultrices non dui, ex."
stringi::stri_rand_lipsum(nparagraphs = 2, start_lipsum = FALSE)
[1] "Duis dapibus primis fringilla purus massa sed commodo massa fringilla. Tempor, libero morbi penatibus justo purus curabitur nostra. Ut mi sed ultrices nec per a laoreet leo. Nullam tristique velit, nisi hendrerit sociis pellentesque, a eu ipsum dis. Sed ut arcu maximus sed proin, elementum quam diam mauris, iaculis pretium, posuere curae. Ridiculus a nulla tincidunt. In vel ac tincidunt nam tincidunt a hac. Interdum sed habitasse non tortor tempus sed mollis. Varius, magna erat fusce sed. Sed vel tempus augue lobortis morbi molestie purus, ultrices luctus volutpat eros, purus, interdum, leo. Himenaeos interdum platea pharetra."
[2] "Tincidunt dolor volutpat libero nibh himenaeos ex commodo ad nunc magna etiam. Tellus ac feugiat ut massa leo ex amet ut at nunc. Eu leo augue lacus sit ullamcorper vitae. Ac lorem odio ut. Et nisi. Tristique ut purus leo quis amet. Per est blandit diam nibh sed integer nulla elementum, vulputate. Varius a purus vehicula mollis dictum posuere. Vel scelerisque mauris nulla. Fusce eu integer nibh quam eu. Posuere donec ac et, sagittis. Commodo eget velit."
# n個のlength文字数のランダムな文字ベクトルを生成
stringi::stri_rand_strings(n = 10, length = 5)
[1] "i1Lyu" "ac5VV" "jgdSs" "TCtwp" "fSqAO" "cOHrF" "wPPcY" "fQYIm"
[9] "4WvpS" "yXIFc"
# 引数patternで使用する文字列を設定できる
stringi::stri_rand_strings(n = 10, length = 5, pattern = "[a-zあ-ん]")
[1] "おぺろpe" "ろめふのろ" "らげeけぉ" "yhべoつ" "ねおぜうこ"
[6] "ゑなをでぞ" "きたぐぜら" "uるっゆf" "oめrくし" "ぅへすkぐ"
stri_rand_shuffle
, stri_reverse
# 個々の文字列ベクトルの文字順をランダムに並び替え
stringi::stri_rand_shuffle(str = month.name[1:10])
[1] "raunJay" "eabruFry" "rcaMh" "liArp" "rMy"
[6] "pJue" "nJFy" "rAugut" "rSeptembe" "ebOcotr"
# 個々の文字列ベクトルの文字順を逆順に並び替え
stringi::stri_reverse(str = month.name)
[1] "yraunaJ" "yraurbeF" "hcraM" "lirpA" "yaM"
[6] "enuJ" "yluJ" "tsuguA" "rebmetpeS" "rebotcO"
[11] "rebmevoN" "rebmeceD"
stri_read_raw
, stri_read_lines
, stri_write_lines
# [THIS IS AN EXPERIMENTAL FUNCTION]
stri_stats_general
, stri_stats_latex
lipsum_str <- stringi::stri_rand_lipsum(nparagraphs = 1, start_lipsum = FALSE) %>%
print
[1] "In semper purus dolor cras, consequat a in. Sed sed mollis, finibus. Vulputate nec facilisis elit quisque a nullam in in nulla eros cursus varius. Integer quisque luctus sed sapien sed vel at. Nunc purus lacus eu eu eleifend maximus mi. At dictumst consectetur sollicitudin nunc tempor vestibulum dolor aliquam non lectus suspendisse. Vestibulum penatibus tempor, nibh metus cum. Blandit vel id eu eleifend tempus. Tortor, commodo, lacinia leo morbi imperdiet maecenas est pharetra justo orci, nec netus. Finibus enim sapien in mauris nec ac vel sem senectus adipiscing. Efficitur per tempus posuere vitae elementum vestibulum netus, luctus. Fermentum maecenas metus natoque eros suspendisse odio nam ac et augue pellentesque. Risus aliquam dictum eu mi quam, a non."
# 統計情報
# \rや\nが含まれていない文字列で、空白文字(Unicode binary property WHITE_SPACE)で単語が区切られている
# Lines: 行数, LinesNEmpty: "WHITE_SPACE"ではない文字を少なくともひとつ含む行の数
# Chars: Unicode符号位置にマッチした総数, CharsNWhite: "WHITE_SPACE"ではないUnicode符号位置の数
stringi::stri_stats_general(str = lipsum_str)
Lines LinesNEmpty Chars CharsNWhite
1 1 766 649
s <- c("Lorem \\textbf{ipsum} dolor sit \\textit{amet}, consectetur adipisicing elit.",
"\\begin{small}Proin nibh augue,\\end{small} suscipit a, scelerisque sed, lacinia in, mi.",
"")
# LaTeXテキストの統計情報
# CharsWord: 文字数(空白を含む記号とコマンドだけを除去), CharsCmdEnvir: コマンドと単語数(記号を含む)
# CharsWhite: LaTeX white space数({と}を含む)
# Words: 単語数, Cmds: コマンド数, Envirs: 環境数
stringi::stri_stats_latex(str = s)
CharsWord CharsCmdEnvir CharsWhite Words Cmds
96 38 27 18 2
Envirs
1
stri_*escape_unicode
stringi::stri_escape_unicode(str = "㎠")
[1] "\\u33a0"
stringi::stri_unescape_unicode(str = "\\u33a0")
[1] "㎠"
stri_enc_*
stringi::stri_enc_list() %>%
head(n = 5)
$`UTF-8`
[1] "UTF-8" "ibm-1208" "ibm-1209"
[4] "ibm-5304" "ibm-5305" "ibm-13496"
[7] "ibm-13497" "ibm-17592" "ibm-17593"
[10] "windows-65001" "cp1208" "x-UTF_8J"
[13] "unicode-1-1-utf-8" "unicode-2-0-utf-8"
$`UTF-16`
[1] "UTF-16" "ISO-10646-UCS-2" "ibm-1204" "ibm-1205"
[5] "unicode" "csUnicode" "ucs-2"
$`UTF-16BE`
[1] "UTF-16BE" "x-utf-16be" "UnicodeBigUnmarked"
[4] "ibm-1200" "ibm-1201" "ibm-13488"
[7] "ibm-13489" "ibm-17584" "ibm-17585"
[10] "ibm-21680" "ibm-21681" "ibm-25776"
[13] "ibm-25777" "ibm-29872" "ibm-29873"
[16] "ibm-61955" "ibm-61956" "windows-1201"
[19] "cp1200" "cp1201" "UTF16_BigEndian"
$`UTF-16LE`
[1] "UTF-16LE" "x-utf-16le"
[3] "UnicodeLittleUnmarked" "ibm-1202"
[5] "ibm-1203" "ibm-13490"
[7] "ibm-13491" "ibm-17586"
[9] "ibm-17587" "ibm-21682"
[11] "ibm-21683" "ibm-25778"
[13] "ibm-25779" "ibm-29874"
[15] "ibm-29875" "UTF16_LittleEndian"
[17] "windows-1200"
$`UTF-32`
[1] "UTF-32" "ISO-10646-UCS-4" "ibm-1236" "ibm-1237"
[5] "csUCS4" "ucs-4"
stringi::stri_enc_info()
$Name.friendly
[1] "UTF-8"
$Name.ICU
[1] "UTF-8"
$Name.UTR22
[1] NA
$Name.IBM
[1] "ibm-1208"
$Name.WINDOWS
[1] "windows-65001"
$Name.JAVA
[1] "UTF-8"
$Name.IANA
[1] "UTF-8"
$Name.MIME
[1] "UTF-8"
$ASCII.subset
[1] TRUE
$Unicode.1to1
[1] NA
$CharSize.8bit
[1] FALSE
$CharSize.min
[1] 1
$CharSize.max
[1] 3
stringi::stri_enc_get()
[1] "UTF-8"
stringi::stri_enc_set(enc = stringi::stri_enc_get())
# 各文字列の宣言しているエンコーディング形式を取得
# ASCII, latin1, bytes, native, UTF-8
stringi::stri_enc_mark(str = month.name)
[1] "ASCII" "ASCII" "ASCII" "ASCII" "ASCII" "ASCII" "ASCII" "ASCII"
[9] "ASCII" "ASCII" "ASCII" "ASCII"
# stringi::stri_enc_to*
ls("package:stringi") %>%
stringr::str_subset(pattern = "^stri_enc_to")
[1] "stri_enc_toascii" "stri_enc_tonative" "stri_enc_toutf32"
[4] "stri_enc_toutf8"
# stringi::stri_enc_is*
ls("package:stringi") %>%
stringr::str_subset(pattern = "^stri_enc_is")
[1] "stri_enc_isascii" "stri_enc_isutf16be" "stri_enc_isutf16le"
[4] "stri_enc_isutf32be" "stri_enc_isutf32le" "stri_enc_isutf8"
stringi::stri_enc_toascii(str = "\x1a")
[1] "\032"
stringi::stri_enc_toascii(str = "\041")
[1] "!"
# 使い方がわからない
# stringi::stri_enc_fromutf32(vec = "")
# (2015.07.18 追記) @kohskeさんよりフォローがありました(ありがとうございます)。
# UTF32コードからutf8文字列生成
stringi::stri_enc_fromutf32(vec = c(0x6771, 0x4EAC, 0x90FD))
[1] "東京都"
stri_trans_*
# 文字列変換で利用可能な識別子
stringi::stri_trans_list()
[1] "ASCII-Latin" "Accents-Any"
[3] "Amharic-Latin/BGN" "Any-Accents"
[5] "Any-Publishing" "Arabic-Latin"
[7] "Arabic-Latin/BGN" "Armenian-Latin"
[9] "Armenian-Latin/BGN" "Azerbaijani-Latin/BGN"
[11] "Belarusian-Latin/BGN" "Bengali-Devanagari"
[13] "Bengali-Gujarati" "Bengali-Gurmukhi"
[15] "Bengali-Kannada" "Bengali-Latin"
[17] "Bengali-Malayalam" "Bengali-Oriya"
[19] "Bengali-Tamil" "Bengali-Telugu"
[21] "Bopomofo-Latin" "Bulgarian-Latin/BGN"
[23] "Cyrillic-Latin" "Devanagari-Bengali"
[25] "Devanagari-Gujarati" "Devanagari-Gurmukhi"
[27] "Devanagari-Kannada" "Devanagari-Latin"
[29] "Devanagari-Malayalam" "Devanagari-Oriya"
[31] "Devanagari-Tamil" "Devanagari-Telugu"
[33] "Digit-Tone" "Fullwidth-Halfwidth"
[35] "Georgian-Latin" "Georgian-Latin/BGN"
[37] "Greek-Latin" "Greek-Latin/BGN"
[39] "Greek-Latin/UNGEGN" "Gujarati-Bengali"
[41] "Gujarati-Devanagari" "Gujarati-Gurmukhi"
[43] "Gujarati-Kannada" "Gujarati-Latin"
[45] "Gujarati-Malayalam" "Gujarati-Oriya"
[47] "Gujarati-Tamil" "Gujarati-Telugu"
[49] "Gurmukhi-Bengali" "Gurmukhi-Devanagari"
[51] "Gurmukhi-Gujarati" "Gurmukhi-Kannada"
[53] "Gurmukhi-Latin" "Gurmukhi-Malayalam"
[55] "Gurmukhi-Oriya" "Gurmukhi-Tamil"
[57] "Gurmukhi-Telugu" "Halfwidth-Fullwidth"
[59] "Han-Latin" "Han-Latin/Names"
[61] "Hangul-Latin" "Hans-Hant"
[63] "Hant-Hans" "Hebrew-Latin"
[65] "Hebrew-Latin/BGN" "Hiragana-Katakana"
[67] "Hiragana-Latin" "IPA-XSampa"
[69] "Jamo-Latin" "Kannada-Bengali"
[71] "Kannada-Devanagari" "Kannada-Gujarati"
[73] "Kannada-Gurmukhi" "Kannada-Latin"
[75] "Kannada-Malayalam" "Kannada-Oriya"
[77] "Kannada-Tamil" "Kannada-Telugu"
[79] "Katakana-Hiragana" "Katakana-Latin"
[81] "Katakana-Latin/BGN" "Kazakh-Latin/BGN"
[83] "Kirghiz-Latin/BGN" "Korean-Latin/BGN"
[85] "Latin-ASCII" "Latin-Arabic"
[87] "Latin-Armenian" "Latin-Bengali"
[89] "Latin-Bopomofo" "Latin-Cyrillic"
[91] "Latin-Devanagari" "Latin-Georgian"
[93] "Latin-Greek" "Latin-Greek/UNGEGN"
[95] "Latin-Gujarati" "Latin-Gurmukhi"
[97] "Latin-Hangul" "Latin-Hebrew"
[99] "Latin-Hiragana" "Latin-Jamo"
[101] "Latin-Kannada" "Latin-Katakana"
[103] "Latin-Malayalam" "Latin-NumericPinyin"
[105] "Latin-Oriya" "Latin-Syriac"
[107] "Latin-Tamil" "Latin-Telugu"
[109] "Latin-Thaana" "Latin-Thai"
[111] "Macedonian-Latin/BGN" "Malayalam-Bengali"
[113] "Malayalam-Devanagari" "Malayalam-Gujarati"
[115] "Malayalam-Gurmukhi" "Malayalam-Kannada"
[117] "Malayalam-Latin" "Malayalam-Oriya"
[119] "Malayalam-Tamil" "Malayalam-Telugu"
[121] "Maldivian-Latin/BGN" "Mongolian-Latin/BGN"
[123] "NumericPinyin-Latin" "NumericPinyin-Pinyin"
[125] "Oriya-Bengali" "Oriya-Devanagari"
[127] "Oriya-Gujarati" "Oriya-Gurmukhi"
[129] "Oriya-Kannada" "Oriya-Latin"
[131] "Oriya-Malayalam" "Oriya-Tamil"
[133] "Oriya-Telugu" "Pashto-Latin/BGN"
[135] "Persian-Latin/BGN" "Pinyin-NumericPinyin"
[137] "Publishing-Any" "Russian-Latin/BGN"
[139] "Serbian-Latin/BGN" "Simplified-Traditional"
[141] "Syriac-Latin" "Tamil-Bengali"
[143] "Tamil-Devanagari" "Tamil-Gujarati"
[145] "Tamil-Gurmukhi" "Tamil-Kannada"
[147] "Tamil-Latin" "Tamil-Malayalam"
[149] "Tamil-Oriya" "Tamil-Telugu"
[151] "Telugu-Bengali" "Telugu-Devanagari"
[153] "Telugu-Gujarati" "Telugu-Gurmukhi"
[155] "Telugu-Kannada" "Telugu-Latin"
[157] "Telugu-Malayalam" "Telugu-Oriya"
[159] "Telugu-Tamil" "Thaana-Latin"
[161] "Thai-Latin" "Tone-Digit"
[163] "Traditional-Simplified" "Turkmen-Latin/BGN"
[165] "Ukrainian-Latin/BGN" "Uzbek-Latin/BGN"
[167] "XSampa-IPA" "az-Lower"
[169] "az-Title" "az-Upper"
[171] "cs-cs_FONIPA" "cs-ja"
[173] "cs-ko" "cs_FONIPA-ja"
[175] "cs_FONIPA-ko" "el-Lower"
[177] "el-Title" "el-Upper"
[179] "es-am" "es-es_FONIPA"
[181] "es-ja" "es-zh"
[183] "es_419-ja" "es_419-zh"
[185] "es_FONIPA-am" "es_FONIPA-es_419_FONIPA"
[187] "es_FONIPA-ja" "es_FONIPA-zh"
[189] "it-am" "it-ja"
[191] "ja_Latn-ko" "ja_Latn-ru"
[193] "lt-Lower" "lt-Title"
[195] "lt-Upper" "nl-Title"
[197] "pl-ja" "pl-pl_FONIPA"
[199] "pl_FONIPA-ja" "ro-ja"
[201] "ro-ro_FONIPA" "ro_FONIPA-ja"
[203] "ru-ja" "ru-zh"
[205] "sk-ja" "sk-sk_FONIPA"
[207] "sk_FONIPA-ja" "tr-Lower"
[209] "tr-Title" "tr-Upper"
[211] "uz_Cyrl-uz_Latn" "uz_Latn-uz_Cyrl"
[213] "zh_Latn_PINYIN-ru" "Any-Null"
[215] "Any-Lower" "Any-Upper"
[217] "Any-Title" "Any-Name"
[219] "Name-Any" "Any-Remove"
[221] "Any-Hex/Unicode" "Any-Hex/Java"
[223] "Any-Hex/C" "Any-Hex/XML"
[225] "Any-Hex/XML10" "Any-Hex/Perl"
[227] "Any-Hex" "Hex-Any/Unicode"
[229] "Hex-Any/Java" "Hex-Any/C"
[231] "Hex-Any/XML" "Hex-Any/XML10"
[233] "Hex-Any/Perl" "Hex-Any"
[235] "Any-NFC" "Any-NFKC"
[237] "Any-NFD" "Any-NFKD"
[239] "Any-FCD" "Any-FCC"
[241] "Any-Latin" "Any-Telugu"
[243] "Any-Gurmukhi" "Any-Gujarati"
[245] "Any-Malayalam" "Any-Oriya"
[247] "Any-Devanagari" "Any-Kannada"
[249] "Any-Tamil" "Any-cs_FONIPA"
[251] "Any-ru" "Any-Bengali"
[253] "Any-uz_Latn" "Any-Katakana"
[255] "Any-ro_FONIPA" "Any-zh"
[257] "Any-am" "Any-es_419_FONIPA"
[259] "Any-es_FONIPA" "Any-sk_FONIPA"
[261] "Any-Hant" "Any-Hans"
[263] "Any-Hiragana" "Any-Syriac"
[265] "Any-Greek" "Any-Greek/UNGEGN"
[267] "Any-Cyrillic" "Any-Hangul"
[269] "Any-Bopomofo" "Any-Arabic"
[271] "Any-Thai" "Any-Armenian"
[273] "Any-Thaana" "Any-Georgian"
[275] "Any-Hebrew" "Any-uz_Cyrl"
[277] "Any-pl_FONIPA"
# 汎用の文字列変換関数
# http://userguide.icu-project.org/transforms/general
tsurami_str <- "ツラミ"
stringi::stri_trans_general(str = "stringi", id = "latin-cyrillic")
[1] "стринги"
stringi::stri_trans_general(str = tsurami_str, id = "Katakana-Latin")
[1] "tsurami"
stringi::stri_trans_general(str = tsurami_str, id = "Katakana-Hiragana")
[1] "つらみ"
# Windowsだと「cs-ja」の結果が「ストリン<U+0261>イ」と表示される
stringi::stri_trans_list() %>%
stringr::str_subset(pattern = "ja$") %>%
data.frame(trans_id = ., stringsAsFactors = FALSE) %>%
dplyr::rowwise() %>%
dplyr::do(
dplyr::data_frame(
id = .$trans_id,
trans_wd = stringi::stri_trans_general(str = "stringi", id = .$trans_id)
)
)
Source: local data frame [13 x 2]
Groups: <by row>
id trans_wd
1 cs-ja ストリンɡイ
2 cs_FONIPA-ja ストリングイ
3 es-ja ストリンヒ
4 es_419-ja ストリンヒ
5 es_FONIPA-ja ストリングイ
6 it-ja ストリンジ
7 pl-ja ストリンギ
8 pl_FONIPA-ja ストリンギ
9 ro-ja ストリンジ
10 ro_FONIPA-ja ストリンギ
11 ru-ja stringi
12 sk-ja ストリンギ
13 sk_FONIPA-ja ストリンギ
# Unicode正規化形式
# NFC (Canonical Decomposition, followed by Canonical Composition)
# NFD (Canonical Decomposition),
# NFKC (Compatibility Decomposition, followed by Canonical Composition),
# NFKC_Casefold (combination of NFKC, case folding, and removing ignorable characters which was introduced with Unicode 5.2)
# NFKD (Compatibility Decomposition)
# Unicode正規化に関しては下記を参照のこと
# http://www.unicode.org/reports/tr15/
# https://ja.wikipedia.org/wiki/Unicode正規化
# stringi::stri_trans_n*
# 文字列をUnicode正規化形式へ変換
ls("package:stringi") %>%
stringr::str_subset(pattern = "^stri_trans_n")
[1] "stri_trans_nfc" "stri_trans_nfd"
[3] "stri_trans_nfkc" "stri_trans_nfkc_casefold"
[5] "stri_trans_nfkd"
# Run all five stringi Unicode-normalization helpers on one input.
#
# Args:
#   uni:  String to normalize or to test. Every call in this document passes
#         a single string, which gives exactly one result per form.
#   type: "trans" (the default) converts `uni` with stringi::stri_trans_nf*();
#         any other value (this document passes "is") tests `uni` with
#         stringi::stri_trans_isnf*().
#
# Returns:
#   The five results joined with c() in the order NFC, NFD, NFKC,
#   NFKC_Casefold, NFKD, named with the labels defined below.
transNormalizationForm <- function(uni, type = "trans") {
  # The final label used to be misspelled "NKFD"; output recorded in this
  # document before the correction still prints the old spelling.
  form_labels <- c("NFC", "NFD", "NFKC", "NFKC_CASEFOLD", "NFKD")

  if (type == "trans") {
    # Convert the input into each normalization form.
    results <- c(
      stringi::stri_trans_nfc(str = uni),
      stringi::stri_trans_nfd(str = uni),
      stringi::stri_trans_nfkc(str = uni),
      stringi::stri_trans_nfkc_casefold(str = uni),
      stringi::stri_trans_nfkd(str = uni)
    )
  } else {
    # Check whether the input is already in each normalization form.
    results <- c(
      stringi::stri_trans_isnfc(str = uni),
      stringi::stri_trans_isnfd(str = uni),
      stringi::stri_trans_isnfkc(str = uni),
      stringi::stri_trans_isnfkc_casefold(str = uni),
      stringi::stri_trans_isnfkd(str = uni)
    )
  }

  names(results) <- form_labels
  results
}
# 例に挙げた文字列は下記を参考に
# http://nomenclator.la.coocan.jp/unicode/normalization.htm
# 「ダイエレシス付き大文字ユプシロン」はNFCとNFKCが同じで、それ以外は異なる
transNormalizationForm(uni = "\u03AB", type = "trans")
NFC NFD NFKC NFKC_CASEFOLD NKFD
"Ϋ" "Ϋ" "Ϋ" "ϋ" "Ϋ"
# 「㍿」はNFCとNFDが同じ。NFKCとNFKDが同じ
transNormalizationForm(uni = "\u337f", type = "trans")
NFC NFD NFKC NFKC_CASEFOLD NKFD
"㍿" "㍿" "株式会社" "株式会社" "株式会社"
# 「㌦」はNFCとNFDが同じ
transNormalizationForm(uni = "\u3326", type = "trans")
NFC NFD NFKC NFKC_CASEFOLD NKFD
"㌦" "㌦" "ドル" "ドル" "ドル"
# NFKCとNFKDは同じようで違う
stringi::stri_trans_nfkc(str = "\u3326") == stringi::stri_trans_nfkd(str = "\u3326")
[1] FALSE
# Half-width to full-width kana conversion (NFC and NFD keep the half-width form).
# The input is half-width katakana "tsu-ra-mi" (U+FF82 U+FF97 U+FF90), written
# with escapes so the half-width code points cannot be lost to re-encoding.
transNormalizationForm(uni = "\uff82\uff97\uff90", type = "trans")
NFC NFD NFKC NFKC_CASEFOLD NKFD
"ﾂﾗﾐ" "ﾂﾗﾐ" "ツラミ" "ツラミ" "ツラミ"
# 半角+半角濁点はNFKDだと「カタカナ+濁点」で、「濁点付きカタカナ」ではない
gati_tsurami_str <- "ヅラミ"
# Half-width katakana "tsu" + half-width voiced sound mark + "ra" + "mi"
# (U+FF82 U+FF9E U+FF97 U+FF90). Escapes keep the half-width code points
# intact, so this string differs from the full-width gati_tsurami_str.
han_gati_tsurami_str <- "\uff82\uff9e\uff97\uff90"
transNormalizationForm(uni = han_gati_tsurami_str, type = "trans")
NFC NFD NFKC NFKC_CASEFOLD NKFD
"ﾂﾞﾗﾐ" "ﾂﾞﾗﾐ" "ヅラミ" "ヅラミ" "ヅラミ"
transNormalizationForm(uni = han_gati_tsurami_str, type = "trans") == gati_tsurami_str
NFC NFD NFKC NFKC_CASEFOLD NKFD
FALSE FALSE TRUE TRUE FALSE
# stringi::stri_trans_is*
# 文字列がUnicode正規化されているかどうかをチェック
ls("package:stringi") %>%
stringr::str_subset(pattern = "^stri_trans_is")
[1] "stri_trans_isnfc" "stri_trans_isnfd"
[3] "stri_trans_isnfkc" "stri_trans_isnfkc_casefold"
[5] "stri_trans_isnfkd"
# trans-is(列が変換時の関数で、行がチェック時の関数)
# 行列の対角は常にTRUE
sapply(
X = transNormalizationForm(uni = "\u337f", type = "trans"),
FUN = transNormalizationForm, type = "is"
)
NFC NFD NFKC NFKC_CASEFOLD NKFD
NFC TRUE TRUE TRUE TRUE TRUE
NFD TRUE TRUE TRUE TRUE TRUE
NFKC FALSE FALSE TRUE TRUE TRUE
NFKC_CASEFOLD FALSE FALSE TRUE TRUE TRUE
NKFD FALSE FALSE TRUE TRUE TRUE
sapply(
X = transNormalizationForm(uni = "\u3326", type = "trans"),
FUN = transNormalizationForm, type = "is"
)
NFC NFD NFKC NFKC_CASEFOLD NKFD
NFC TRUE TRUE TRUE TRUE FALSE
NFD TRUE TRUE FALSE FALSE TRUE
NFKC FALSE FALSE TRUE TRUE FALSE
NFKC_CASEFOLD FALSE FALSE TRUE TRUE FALSE
NKFD FALSE FALSE FALSE FALSE TRUE
sapply(
X = transNormalizationForm(uni = "\u03AB", type = "trans"),
FUN = transNormalizationForm, type = "is"
)
NFC NFD NFKC NFKC_CASEFOLD NKFD
NFC TRUE FALSE TRUE TRUE FALSE
NFD FALSE TRUE FALSE FALSE TRUE
NFKC TRUE FALSE TRUE TRUE FALSE
NFKC_CASEFOLD FALSE FALSE FALSE TRUE FALSE
NKFD FALSE TRUE FALSE FALSE TRUE
sapply(
X = transNormalizationForm(uni = "\u03AB", type = "trans"),
FUN = transNormalizationForm, type = "is"
)
NFC NFD NFKC NFKC_CASEFOLD NKFD
NFC TRUE FALSE TRUE TRUE FALSE
NFD FALSE TRUE FALSE FALSE TRUE
NFKC TRUE FALSE TRUE TRUE FALSE
NFKC_CASEFOLD FALSE FALSE FALSE TRUE FALSE
NKFD FALSE TRUE FALSE FALSE TRUE
{stringr}と{stringi}の関数をざっと触ったが、通常使う分には{stringr}でよいと思われる。
ただし、文字列変換(半角カナから全角カナ)や文字列のエスケープ、形式変換関係などの{stringi}の関数はとても有用なので使っていきたい。
Unicode正規化やICUなど、非常に勉強になったし、これからテキスト処理する際にも役立ちそう。
stringi::stri_enc_fromutf32
とstringr::boundary(type = "character")
がよくわからなかったので調べる。
(2015.07.18 追記) Windowsに関する挙動は @yutannihilationさんがフォローしてくださいました(感謝です)。
stringiとWindowsと文字コードとかそのへんのメモ
library(devtools)
devtools::session_info()
Session info --------------------------------------------------------------
setting value
version R version 3.2.0 (2015-04-16)
system x86_64, darwin13.4.0
ui X11
language (EN)
collate ja_JP.UTF-8
tz Asia/Tokyo
Packages ------------------------------------------------------------------
package * version date source
assertthat * 0.1 2013-12-06 CRAN (R 3.2.0)
curl * 0.5 2015-02-01 CRAN (R 3.2.0)
DBI * 0.3.1 2014-09-24 CRAN (R 3.2.0)
devtools 1.7.0 2015-01-17 CRAN (R 3.2.0)
digest * 0.6.8 2014-12-31 CRAN (R 3.2.0)
dplyr 0.4.2.9000 2015-06-17 Github (hadley/dplyr@7763150)
evaluate * 0.7 2015-04-21 CRAN (R 3.2.0)
formatR * 1.2 2015-04-21 CRAN (R 3.2.0)
htmltools * 0.2.6 2014-09-08 CRAN (R 3.2.0)
knitr 1.10 2015-04-23 CRAN (R 3.2.0)
lazyeval * 0.1.10.9000 2015-06-07 Github (hadley/lazyeval@ecb8dc0)
magrittr * 1.5 2014-11-22 CRAN (R 3.2.0)
R6 * 2.0.1 2014-10-29 CRAN (R 3.2.0)
Rcpp * 0.11.6 2015-05-01 CRAN (R 3.2.0)
readr 0.1.0.9000 2015-06-08 Github (hadley/readr@9006822)
rmarkdown * 0.6.2.4 2015-06-07 Github (rstudio/rmarkdown@8c9e25b)
rstudioapi * 0.3.1 2015-04-07 CRAN (R 3.2.0)
stringi 0.4-1 2014-12-14 CRAN (R 3.2.0)
stringr 1.0.0 2015-04-30 CRAN (R 3.2.0)
tidyr 0.2.0.9000 2015-06-07 Github (hadley/tidyr@0dc87b2)
yaml * 2.1.13 2014-06-12 CRAN (R 3.2.0)