教師なしビニング方法として, いくつかの区間の指定の方法がある.
# Equal Width Discretization
ewd <- function(x, k){
w <- (max(x) - min(x)) / k # interval
in_bd <- min(x) + (1:(k - 1)) * w # boundary without min(x) and max(x)
bd <- c(min(x), in_bd, max(x)) # boundary
cut(x, bd, 1:k, include.lowest = T) # discretization with equal width
}
# Equal Frequency Discretization
efd <- function(x, k){
nx <- length(x) # object length
bd_idx <- round(c(1,(1:(k - 1)) * (nx / k), nx)) # boundary of index
efd_idx <- cut(1:nx, bd_idx, 1:k, include.lowest = T) # discretization with equal frequency
efd_idx[order(x)] # original order
}
# example
library(dplyr)
set.seed(2)
x <- rpois(10, 5)
dat <- data.frame(x = x, ewd = ewd(x,3), efd = efd(x,3)) %>%
.[order(.$x),]
# result
kableExtra::kable(dat, format="html", align="c", caption="Equal Width/Frequency Discretization") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position="float_left") %>%
kableExtra::column_spec(1:3, "5em")
# number of each bins
par(mfcol = c(1,2))
barplot(table(dat$ewd), main = "Equal Width")
barplot(table(dat$efd), main = "Equal Frequency")
x | ewd | efd | |
---|---|---|---|
1 | 3 | 1 | 1 |
4 | 3 | 1 | 1 |
7 | 3 | 1 | 1 |
3 | 5 | 1 | 2 |
9 | 5 | 1 | 2 |
10 | 5 | 1 | 2 |
2 | 6 | 2 | 2 |
8 | 7 | 2 | 3 |
5 | 9 | 3 | 3 |
6 | 9 | 3 | 3 |
quantile
を用いて分位数を求めるquantile(x, probs, type)
prob
[0,1]から分位数を求める.デフォルトでは四分位数が返る.type
9つある手法から選ぶ(デフォルト:7).
set.seed(2)
x <- rpois(10, 5)
qrt <- quantile(x) # quartile(default): 0%, 25%, 50%, 75%, 100%
trt <- quantile(x, prob = seq(0, 1, length.out = 4)) # tertile: 0%, 33.33%, 66.67%, 100%
dcl <- quantile(x, prob = seq(0, 1, length.out = 11), type = 5) # deciles
prc <- quantile(x, prob = c(0.50, 0.70, 0.80)) # percentile: 50%, 70%, 80%
par(mfrow = c(2,2), cex.axis = 0.6)
plot(sort(x), main = "quartile")
abline(h = qrt, col = 2); axis(4, qrt, qrt, col = 2)
plot(sort(x), main = "tertile")
abline(h = round(trt,1), col = 2); axis(4, trt, round(trt,1), col = 2 )
plot(sort(x), main = "deciles")
abline(h = dcl, col = 2); axis(4, dcl, dcl, col = 2)
plot(sort(x), main = "percentile")
abline(h = prc, col = 2); axis(4, prc, prc, col = 2)
cut
を使って離散化cut(x, breaks, labels=NULL, right=T, include.lowest=F)
breaks
切断点のベクトル、もしくはxを分割する区間数.
breaks
に区関数を与えた場合, 等間隔区間に分割.labels
カテゴリの水準のラベルinclude.lowest
[default=FALSE]最小(もしくは最大)の分割点値に一致する値を区間に含めるかどうか.right
[default=TRUE] 区間が右に閉じている(そして左に開いている)x <- 0:10 # データ
br <- c(0, 2, 4, 6, 8, 10) # ブレークポイント, 5 bins
ct <- 1:5 # カテゴリラベル
# デフォルトでは右に閉じている状態. (0,2], (2,4], (4,6],(6,8], (8,10]
ct_1 <- cut(x, breaks = br, labels = ct) # default [right = T, include.lowest = F]
ct_2 <- cut(x, breaks = br, labels = ct, right = F, include.lowest = F)
ct_3 <- cut(x, breaks = br, labels = ct, right = T, include.lowest = T)
ct_4 <- cut(x, breaks = br, labels = ct, right = F, include.lowest = T)
ct_5 <- cut(x, breaks = 5, labels = ct) # 区関数を指定する.
cbind(x, ct_1, ct_2, ct_3, ct_4, ct_5)
## x ct_1 ct_2 ct_3 ct_4 ct_5
## [1,] 0 NA 1 1 1 1
## [2,] 1 1 1 1 1 1
## [3,] 2 1 2 1 2 1
## [4,] 3 2 2 2 2 2
## [5,] 4 2 3 2 3 2
## [6,] 5 3 3 3 3 3
## [7,] 6 3 4 3 4 3
## [8,] 7 4 4 4 4 4
## [9,] 8 4 5 4 5 4
## [10,] 9 5 5 5 5 5
## [11,] 10 5 NA 5 5 5
findInterval
を用いて離散化findInterval(x, vec, rightmost.closed = FALSE, all.inside = FALSE, left.open = FALSE)
vec
離散値レンジ,rightmost.closed
, left.open
: 右側が閉じているか, 左側が開いているか.
x <- 0:10 # データ
br <- c(0, 2, 4, 6, 8, 10) # ブレークポイント, 5 bins
# デフォルトでは左に閉じている状態. [0,2), [2,4), [4,6),[6,8), [8,10)
intvl_1 <- findInterval(x, vec = br, rightmost.closed = F, left.open = F) # default
intvl_2 <- findInterval(x, vec = br, rightmost.closed = F, left.open = T)
intvl_3 <- findInterval(x, vec = br, rightmost.closed = T, left.open = F)
# intvl_3 <- findInterval(x, vec = br, rightmost.closed = T, all.inside = T)
intvl_4 <- findInterval(x, vec = br, rightmost.closed = T, left.open = T)
# intvl_4 <- findInterval(x, vec = br, left.open = T, all.inside = T)
cbind(x, intvl_1,intvl_2, intvl_3, intvl_4)
## x intvl_1 intvl_2 intvl_3 intvl_4
## [1,] 0 1 0 1 1
## [2,] 1 1 1 1 1
## [3,] 2 2 1 2 1
## [4,] 3 2 2 2 2
## [5,] 4 3 2 3 2
## [6,] 5 3 3 3 3
## [7,] 6 4 3 4 3
## [8,] 7 4 4 4 4
## [9,] 8 5 4 5 4
## [10,] 9 5 5 5 5
## [11,] 10 6 5 5 5
infotheo::discretize
を用いて離散値化データフレームの各列を等間隔もしくは等頻度区間に基づいて離散値化する.
重複した数値がある場合, equalfreq
では各区間のデータ数が均一な分割にならない.
infotheo::discretize(X, disc, nbins)
X
: データフレームdisc
離散値化のメソッド
equalfreq
: 等頻度区間による(Equal frequency discretization) [default]equalwidth
: 等間隔区間による(Equal width discretization)globalequalwidth
: 与えられた値の範囲nbins
: 区関数[default = N^(1/3)]set.seed(2)
dat <- data.frame(x = rpois(10,5))
ef <- infotheo::discretize(X = dat, disc = "equalfreq", nbins = 3) # default
ew <- infotheo::discretize(X = dat, disc = "equalwidth", nbins = 3)
gew <- infotheo::discretize(X = dat, disc = "globalequalwidth", nbins = 3)
d <- bind_cols(list(dat, ef, ew, gew)) %>%
setNames(., c("x","ef","ew","gew")) %>%
.[order(.$x),]
# result
kableExtra::kable(d, format = "html", align = "c", caption = "equalfreq/equalwidth/globalequalwidth") %>%
kableExtra::kable_styling(bootstrap_options = "striped", full_width = F, position = "left") %>%
kableExtra::column_spec(1:4, "5em")
x | ef | ew | gew | |
---|---|---|---|---|
1 | 3 | 1 | 1 | 1 |
4 | 3 | 1 | 1 | 1 |
7 | 3 | 1 | 1 | 1 |
3 | 5 | 1 | 2 | 1 |
9 | 5 | 1 | 2 | 1 |
10 | 5 | 1 | 2 | 1 |
2 | 6 | 3 | 2 | 2 |
8 | 7 | 3 | 3 | 2 |
5 | 9 | 3 | 3 | 3 |
6 | 9 | 3 | 3 | 3 |
dplyr::ntile
を用いて均等に分割## x ntl
## 1 3 1
## 4 3 1
## 7 3 1
## 3 5 1
## 9 5 2
## 10 5 2
## 2 6 2
## 8 7 3
## 5 9 3
## 6 9 3
## R version 3.5.1 (2018-07-02)
## Platform: x86_64-apple-darwin15.6.0 (64-bit)
## Running under: macOS High Sierra 10.13.1
##
## Matrix products: default
## BLAS: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRblas.0.dylib
## LAPACK: /Library/Frameworks/R.framework/Versions/3.5/Resources/lib/libRlapack.dylib
##
## locale:
## [1] ja_JP.UTF-8/ja_JP.UTF-8/ja_JP.UTF-8/C/ja_JP.UTF-8/ja_JP.UTF-8
##
## attached base packages:
## [1] stats graphics grDevices utils datasets methods base
##
## other attached packages:
## [1] dplyr_0.8.0.1
##
## loaded via a namespace (and not attached):
## [1] Rcpp_1.0.0 rstudioapi_0.9.0 xml2_1.2.0
## [4] knitr_1.21 magrittr_1.5 hms_0.4.2
## [7] munsell_0.5.0 tidyselect_0.2.5 rvest_0.3.2
## [10] viridisLite_0.3.0 colorspace_1.4-0 R6_2.3.0
## [13] rlang_0.3.1 highr_0.7 httr_1.4.0
## [16] stringr_1.3.1 tools_3.5.1 webshot_0.5.1
## [19] xfun_0.4 htmltools_0.3.6 yaml_2.2.0
## [22] assertthat_0.2.0 digest_0.6.18 tibble_2.0.1
## [25] crayon_1.3.4 infotheo_1.2.0 kableExtra_1.0.0
## [28] purrr_0.3.0 readr_1.3.1 glue_1.3.0
## [31] evaluate_0.12 rmarkdown_1.11 stringi_1.2.4
## [34] compiler_3.5.1 pillar_1.3.1 scales_1.0.0
## [37] pkgconfig_2.0.2