binning in R

cut

# Check the observed range first so the break points below cover every
# non-missing value.
range(pg$body_mass_g, na.rm = TRUE)
## [1] 2700 6300
# Bin body mass into fixed 500 g intervals and count the rows per bin.
# dig.lab = 4 sets the number of digits used to format the break values
# in the interval labels.
pg %>%
  mutate(
    body_bin = cut(
      body_mass_g,
      breaks = seq(2500, 6500, by = 500),
      dig.lab = 4
    )
  ) %>%
  group_by(body_bin) %>%
  tally() # equivalent to summarise(n = n())
## # A tibble: 9 x 2
##   body_bin        n
##   <fct>       <int>
## 1 (2500,3000]    11
## 2 (3000,3500]    67
## 3 (3500,4000]    92
## 4 (4000,4500]    57
## 5 (4500,5000]    54
## 6 (5000,5500]    33
## 7 (5500,6000]    26
## 8 (6000,6500]     2
## 9 <NA>            2

ntile

# Assign each penguin to one of five equal-sized rank groups by body mass,
# then count the rows per group (missing masses stay NA).
pg %>%
  mutate(body_bin = ntile(body_mass_g, 5)) %>%
  count(body_bin)
## # A tibble: 6 x 2
##   body_bin     n
##      <int> <int>
## 1        1    69
## 2        2    69
## 3        3    68
## 4        4    68
## 5        5    68
## 6       NA     2

dlookr::binning

library(dlookr)
# Default call: the printed result reports quantile bins, 10 of them.
with(pg, binning(body_mass_g))
## binned type: quantile
## number of bins: 10
## x
## [2700,3289.167] (3289.167,3470]     (3470,3650]     (3650,3800]     (3800,4050] 
##              34              34              36              37              35 
##     (4050,4300]     (4300,4650]     (4650,4955] (4955,5421.667] (5421.667,6300] 
##              31              35              32              34              34 
##            <NA> 
##               2
# Equal-width bins across the observed range.
with(pg, binning(body_mass_g, type = "equal"))
## binned type: equal
## number of bins: 10
## x
## [2700,3060] (3060,3420] (3420,3780] (3780,4140] (4140,4500] (4500,4860] 
##          15          43          71          53          45          38 
## (4860,5220] (5220,5580] (5580,5940] (5940,6300]        <NA> 
##          28          27          16           6           2
# "Pretty" bins: break points land on round numbers.
with(pg, binning(body_mass_g, type = "pretty"))
## binned type: pretty
## number of bins: 8
## x
## [2500,3000] (3000,3500] (3500,4000] (4000,4500] (4500,5000] (5000,5500] 
##          11          67          92          57          54          33 
## (5500,6000] (6000,6500]        <NA> 
##          26           2           2
# Count penguins per species and "pretty" body-mass bin.
# binning() returns a "bins" object; extract() turns it into the ordered
# factor shown as <ord> in the printed result.
# - Use the body_mass_g column rather than pg$body_mass_g so the step keeps
#   working if rows are filtered or grouped earlier in the pipeline.
# - Qualify extract() so a later-attached package exporting its own
#   extract() (e.g. tidyr, magrittr) cannot mask the dlookr method.
pg %>%
  mutate(
    body_bin = binning(body_mass_g, type = "pretty") %>%
      dlookr::extract()
  ) %>%
  group_by(species, body_bin) %>%
  summarise(freq = n())
## # A tibble: 18 x 3
## # Groups:   species [3]
##    species   body_bin     freq
##    <fct>     <ord>       <int>
##  1 Adelie    [2500,3000]     9
##  2 Adelie    (3000,3500]    50
##  3 Adelie    (3500,4000]    57
##  4 Adelie    (4000,4500]    28
##  5 Adelie    (4500,5000]     7
##  6 Adelie    <NA>            1
##  7 Chinstrap [2500,3000]     2
##  8 Chinstrap (3000,3500]    17
##  9 Chinstrap (3500,4000]    34
## 10 Chinstrap (4000,4500]    13
## 11 Chinstrap (4500,5000]     2
## 12 Gentoo    (3500,4000]     1
## 13 Gentoo    (4000,4500]    16
## 14 Gentoo    (4500,5000]    45
## 15 Gentoo    (5000,5500]    33
## 16 Gentoo    (5500,6000]    26
## 17 Gentoo    (6000,6500]     2
## 18 Gentoo    <NA>            1
# Count penguins per species and custom-labelled body-mass bin.
# - `nbins` is spelled out in full instead of relying on partial argument
#   matching (`nbin`).
# - The five labels correspond to the five resulting bins (2K-6K in the
#   printed result).
# - The body_mass_g column is used directly instead of pg$body_mass_g so
#   the step stays correct if rows are filtered or grouped upstream.
# The column is kept as a "bins" object here (no extract()), hence the
# <bins> column type in the output.
pg %>%
  mutate(
    body_bin = binning(
      body_mass_g,
      type = "pretty",
      nbins = 5,
      labels = c("2K", "3K", "4K", "5K", "6K")
    )
  ) %>%
  group_by(species, body_bin) %>%
  summarise(freq = n())
## # A tibble: 12 x 3
## # Groups:   species [3]
##    species   body_bin  freq
##    <fct>     <bins>   <int>
##  1 Adelie    2K           9
##  2 Adelie    3K         107
##  3 Adelie    4K          35
##  4 Adelie    <NA>         1
##  5 Chinstrap 2K           2
##  6 Chinstrap 3K          51
##  7 Chinstrap 4K          15
##  8 Gentoo    3K           1
##  9 Gentoo    4K          61
## 10 Gentoo    5K          59
## 11 Gentoo    6K           2
## 12 Gentoo    <NA>         1
# Quartile bins with base R: quantile() supplies the five cut points
# (min, 25%, 50%, 75%, max). cut() intervals are right-closed, so without
# include.lowest = TRUE the minimum observation falls outside the first
# interval and is silently turned into NA (the counts would sum to 341
# instead of the 342 non-missing values).
table(cut(pg$body_mass_g,
  breaks = quantile(pg$body_mass_g, na.rm = TRUE),
  labels = c("Q1", "Q2", "Q3", "Q4"),
  include.lowest = TRUE
))
## 
## Q1 Q2 Q3 Q4 
## 89 87 81 85