# Attach splitstackshape, which provides stratified().
# The global environment is intentionally NOT cleared here: rm(list = ls())
# silently destroys whatever the caller had in their workspace and does not
# give a truly clean session anyway (attached packages and options persist).
# Restart R if a fresh session is needed.
library(splitstackshape)
## Warning: package 'splitstackshape' was built under R version 4.0.3
# Build a 100-row example data frame:
#   ID - row identifier, 1 to 100
#   A  - grouping column with five levels ("AA" ... "EE")
#   B  - standard-normal draws
#   C  - absolute value of standard-normal draws, rounded to one decimal
#   D  - grouping column with three levels ("CA", "NY", "TX")
#   E  - grouping column with two levels ("M", "F")
# Fix the RNG seed first so sample() and rnorm() generate the same data on
# every run; without it the example is not reproducible.
set.seed(1)
DF <- data.frame(
  ID = seq_len(100),
  A = sample(c("AA", "BB", "CC", "DD", "EE"), 100, replace = TRUE),
  B = rnorm(100),
  C = abs(round(rnorm(100), digits = 1)),
  D = sample(c("CA", "NY", "TX"), 100, replace = TRUE),
  E = sample(c("M", "F"), 100, replace = TRUE)
)
# Print the full data frame for inspection.
DF
##      ID  A           B   C  D E
## 1     1 DD  0.03441729 0.4 CA M
## 2     2 EE -0.48057769 0.7 NY M
## 3     3 BB -0.45370948 1.4 NY M
## 4     4 EE  0.50118422 1.7 TX M
## 5     5 DD  0.82885393 0.0 TX F
## 6     6 CC  0.09621272 0.8 TX F
## 7     7 CC -0.85007721 1.1 CA F
## 8     8 BB -0.30694125 0.2 NY M
## 9     9 CC  0.99978119 0.7 TX F
## 10   10 DD -0.16857996 1.0 CA F
## 11   11 DD -2.37911331 0.9 NY M
## 12   12 DD  0.30485217 0.5 NY F
## 13   13 EE  0.46067009 1.8 NY M
## 14   14 CC  1.08088015 0.6 NY F
## 15   15 CC -0.20780062 1.9 NY F
## 16   16 EE  1.50685545 0.4 NY M
## 17   17 EE -0.66212591 1.3 CA M
## 18   18 EE -0.88294955 0.2 NY M
## 19   19 BB -1.24402533 0.9 TX M
## 20   20 DD  2.32245726 1.8 TX M
## 21   21 EE  0.92080453 1.0 TX M
## 22   22 DD  0.08856214 1.1 TX F
## 23   23 EE  0.22902353 0.3 CA F
## 24   24 EE -0.10527522 0.8 NY M
## 25   25 DD  0.57473822 0.5 CA F
## 26   26 AA  0.60201496 1.4 TX M
## 27   27 AA -1.64180454 0.3 TX M
## 28   28 CC -0.64838537 1.1 CA M
## 29   29 AA  0.42918070 0.1 CA F
## 30   30 DD  0.91106027 0.2 TX F
## 31   31 DD -2.78875159 0.3 TX M
## 32   32 DD -0.34767965 0.4 NY F
## 33   33 AA -1.47237915 0.7 NY F
## 34   34 BB  0.19998324 0.0 TX M
## 35   35 CC  0.51590247 0.9 TX M
## 36   36 CC -1.23449643 0.1 CA F
## 37   37 EE  0.84661286 2.0 TX M
## 38   38 CC  0.46813645 1.8 TX F
## 39   39 AA -1.82074794 0.6 CA M
## 40   40 BB  0.30922620 0.6 CA F
## 41   41 DD  0.12531904 0.1 CA M
## 42   42 DD  0.37900925 0.4 NY M
## 43   43 CC  0.37344147 0.6 NY M
## 44   44 CC -0.06941607 0.3 TX F
## 45   45 BB -1.27215066 1.0 NY F
## 46   46 CC  0.81065422 0.1 CA M
## 47   47 DD  1.57385719 0.1 NY F
## 48   48 BB -1.94361620 1.0 NY M
## 49   49 AA -0.34198650 1.7 NY M
## 50   50 DD -0.22377866 1.0 CA M
## 51   51 CC  0.55976001 0.5 CA F
## 52   52 CC  1.23620996 0.0 NY M
## 53   53 EE  0.01189679 1.0 TX F
## 54   54 BB -1.13708737 0.3 TX F
## 55   55 CC -1.06354794 0.3 NY M
## 56   56 EE  0.39853589 0.4 NY F
## 57   57 AA  0.05145718 1.0 NY M
## 58   58 AA  1.99668303 1.5 NY F
## 59   59 BB -0.11404972 0.0 NY F
## 60   60 EE  1.40360140 0.5 CA F
## 61   61 CC -1.22916923 0.4 CA F
## 62   62 DD -0.03260632 0.8 NY F
## 63   63 AA -0.01727829 0.8 TX F
## 64   64 BB -0.54550994 0.6 CA F
## 65   65 BB -0.90824469 0.4 TX F
## 66   66 EE -0.58431577 0.2 TX F
## 67   67 EE  1.42363420 0.3 CA F
## 68   68 BB -0.12214226 1.0 CA F
## 69   69 DD -1.05689633 1.1 CA M
## 70   70 BB  0.39824874 1.4 CA F
## 71   71 CC -2.26568316 0.9 TX M
## 72   72 EE -0.47527774 0.2 NY F
## 73   73 AA  1.18219827 0.3 NY F
## 74   74 DD  2.07444699 1.7 CA F
## 75   75 BB -0.93770711 1.6 CA F
## 76   76 EE  1.17139689 0.8 TX F
## 77   77 BB -0.19844364 0.9 NY M
## 78   78 DD -1.67040169 0.1 NY F
## 79   79 CC -1.37142627 0.4 TX F
## 80   80 AA -0.46251392 0.9 CA F
## 81   81 BB -1.22163393 0.5 CA M
## 82   82 CC  0.01949581 1.5 TX M
## 83   83 BB  0.24854064 0.1 NY F
## 84   84 EE -0.04593022 0.2 CA F
## 85   85 AA  0.52804947 0.0 NY F
## 86   86 DD  0.67480280 0.8 TX M
## 87   87 EE -0.08791508 0.4 NY M
## 88   88 BB -0.97048108 0.1 TX M
## 89   89 EE  1.13904273 1.2 NY F
## 90   90 CC  1.47874446 0.0 CA M
## 91   91 AA  0.41931573 0.0 CA M
## 92   92 EE  0.12802631 1.6 TX F
## 93   93 BB  0.53974479 0.1 CA M
## 94   94 AA -0.47498459 0.8 CA M
## 95   95 BB  1.13354857 2.3 TX F
## 96   96 BB -2.63403382 1.0 CA F
## 97   97 CC  0.26860337 0.2 CA M
## 98   98 EE  0.15548078 0.8 NY M
## 99   99 AA -0.90728455 1.0 CA F
## 100 100 DD -0.96131054 1.5 CA F
# Proportional sampling: draw 10% of the rows from every group of column -A-
stratified(DF, group = "A", size = 0.1)
##     ID  A          B   C  D E
##  1: 11 DD -2.3791133 0.9 NY M
##  2: 50 DD -0.2237787 1.0 CA M
##  3: 89 EE  1.1390427 1.2 NY F
##  4: 67 EE  1.4236342 0.3 CA F
##  5: 45 BB -1.2721507 1.0 NY F
##  6: 54 BB -1.1370874 0.3 TX F
##  7: 55 CC -1.0635479 0.3 NY M
##  8:  7 CC -0.8500772 1.1 CA F
##  9: 99 AA -0.9072846 1.0 CA F
## 10: 94 AA -0.4749846 0.8 CA M
# Same 10% draw per -A- group, restricted via `select` to the "AA" and "BB"
# groups only
stratified(
  DF,
  group = "A",
  size = 0.1,
  select = list(A = c("AA", "BB"))
)
##    ID  A          B   C  D E
## 1: 81 BB -1.2216339 0.5 CA M
## 2: 83 BB  0.2485406 0.1 NY F
## 3: 26 AA  0.6020150 1.4 TX M
## 4: 85 AA  0.5280495 0.0 NY F
# Fixed-count sampling: draw 5 rows from every -D- group.
# The grouping column is given by position (-D- is the 5th column of DF).
stratified(
  DF,
  group = 5,
  size = 5
)
##     ID  A           B   C  D E
##  1: 61 CC -1.22916923 0.4 CA F
##  2: 84 EE -0.04593022 0.2 CA F
##  3: 81 BB -1.22163393 0.5 CA M
##  4: 29 AA  0.42918070 0.1 CA F
##  5: 68 BB -0.12214226 1.0 CA F
##  6: 89 EE  1.13904273 1.2 NY F
##  7:  2 EE -0.48057769 0.7 NY M
##  8: 57 AA  0.05145718 1.0 NY M
##  9: 49 AA -0.34198650 1.7 NY M
## 10: 56 EE  0.39853589 0.4 NY F
## 11: 82 CC  0.01949581 1.5 TX M
## 12: 95 BB  1.13354857 2.3 TX F
## 13:  5 DD  0.82885393 0.0 TX F
## 14: 44 CC -0.06941607 0.3 TX F
## 15: 34 BB  0.19998324 0.0 TX M
# Strata defined by two columns (-E- and -D-); draw 15% from each combination
stratified(DF, group = c("E", "D"), size = 0.15)
##     ID  A           B   C  D E
##  1: 41 DD  0.12531904 0.1 CA M
##  2: 90 CC  1.47874446 0.0 CA M
##  3: 87 EE -0.08791508 0.4 NY M
##  4:  2 EE -0.48057769 0.7 NY M
##  5: 49 AA -0.34198650 1.7 NY M
##  6: 21 EE  0.92080453 1.0 TX M
##  7: 86 DD  0.67480280 0.8 TX M
##  8: 76 EE  1.17139689 0.8 TX F
##  9: 92 EE  0.12802631 1.6 TX F
## 10:  7 CC -0.85007721 1.1 CA F
## 11: 74 DD  2.07444699 1.7 CA F
## 12: 29 AA  0.42918070 0.1 CA F
## 13: 47 DD  1.57385719 0.1 NY F
## 14: 89 EE  1.13904273 1.2 NY F
## 15: 32 DD -0.34767965 0.4 NY F
# Two-column strata (-E- and -D-) again, keeping only cases where -E- == "M"
stratified(
  DF,
  group = c("E", "D"),
  size = 0.15,
  select = list(E = "M")
)
##    ID  A          B   C  D E
## 1: 69 DD -1.0568963 1.1 CA M
## 2: 93 BB  0.5397448 0.1 CA M
## 3:  3 BB -0.4537095 1.4 NY M
## 4: 52 CC  1.2362100 0.0 NY M
## 5: 16 EE  1.5068554 0.4 NY M
## 6: 71 CC -2.2656832 0.9 TX M
## 7: 19 BB -1.2440253 0.9 TX M
# As above, but keeping only cases where -E- == "M" and -D- is "CA" or "TX"
stratified(
  DF,
  group = c("E", "D"),
  size = 0.15,
  select = list(E = "M", D = c("CA", "TX"))
)
##    ID  A          B   C  D E
## 1: 39 AA -1.8207479 0.6 CA M
## 2: 90 CC  1.4787445 0.0 CA M
## 3: 26 AA  0.6020150 1.4 TX M
## 4: 34 BB  0.1999832 0.0 TX M
# Strata defined by three columns (-E-, -D-, and -A-); draw 2 rows from each
# combination
stratified(DF, group = c("E", "D", "A"), size = 2)
## Groups M CA EE, F TX AA contain fewer rows than requested. Returning all rows.
##     ID  A           B   C  D E
##  1:  1 DD  0.03441729 0.4 CA M
##  2: 50 DD -0.22377866 1.0 CA M
##  3: 16 EE  1.50685545 0.4 NY M
##  4: 24 EE -0.10527522 0.8 NY M
##  5: 77 BB -0.19844364 0.9 NY M
##  6:  3 BB -0.45370948 1.4 NY M
##  7:  4 EE  0.50118422 1.7 TX M
##  8: 37 EE  0.84661286 2.0 TX M
##  9: 22 DD  0.08856214 1.1 TX F
## 10: 30 DD  0.91106027 0.2 TX F
## 11: 44 CC -0.06941607 0.3 TX F
## 12: 38 CC  0.46813645 1.8 TX F
## 13: 61 CC -1.22916923 0.4 CA F
## 14:  7 CC -0.85007721 1.1 CA F
## 15: 74 DD  2.07444699 1.7 CA F
## 16: 10 DD -0.16857996 1.0 CA F
## 17: 42 DD  0.37900925 0.4 NY M
## 18: 11 DD -2.37911331 0.9 NY M
## 19: 47 DD  1.57385719 0.1 NY F
## 20: 78 DD -1.67040169 0.1 NY F
## 21: 14 CC  1.08088015 0.6 NY F
## 22: 15 CC -0.20780062 1.9 NY F
## 23: 17 EE -0.66212591 1.3 CA M
## 24: 34 BB  0.19998324 0.0 TX M
## 25: 88 BB -0.97048108 0.1 TX M
## 26: 86 DD  0.67480280 0.8 TX M
## 27: 20 DD  2.32245726 1.8 TX M
## 28: 67 EE  1.42363420 0.3 CA F
## 29: 23 EE  0.22902353 0.3 CA F
## 30: 27 AA -1.64180454 0.3 TX M
## 31: 26 AA  0.60201496 1.4 TX M
## 32: 97 CC  0.26860337 0.2 CA M
## 33: 46 CC  0.81065422 0.1 CA M
## 34: 80 AA -0.46251392 0.9 CA F
## 35: 99 AA -0.90728455 1.0 CA F
## 36: 33 AA -1.47237915 0.7 NY F
## 37: 85 AA  0.52804947 0.0 NY F
## 38: 71 CC -2.26568316 0.9 TX M
## 39: 35 CC  0.51590247 0.9 TX M
## 40: 39 AA -1.82074794 0.6 CA M
## 41: 91 AA  0.41931573 0.0 CA M
## 42: 75 BB -0.93770711 1.6 CA F
## 43: 96 BB -2.63403382 1.0 CA F
## 44: 43 CC  0.37344147 0.6 NY M
## 45: 55 CC -1.06354794 0.3 NY M
## 46: 59 BB -0.11404972 0.0 NY F
## 47: 83 BB  0.24854064 0.1 NY F
## 48: 57 AA  0.05145718 1.0 NY M
## 49: 49 AA -0.34198650 1.7 NY M
## 50: 76 EE  1.17139689 0.8 TX F
## 51: 92 EE  0.12802631 1.6 TX F
## 52: 95 BB  1.13354857 2.3 TX F
## 53: 65 BB -0.90824469 0.4 TX F
## 54: 72 EE -0.47527774 0.2 NY F
## 55: 89 EE  1.13904273 1.2 NY F
## 56: 63 AA -0.01727829 0.8 TX F
## 57: 93 BB  0.53974479 0.1 CA M
## 58: 81 BB -1.22163393 0.5 CA M
##     ID  A           B   C  D E
## Not run: 
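# The following will produce errors: a `size` with more than one value must
# be a named vector (compare the named-vector call further down)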
#stratified(DF, "D", c(5, 3))
#stratified(DF, "D", c(5, 3, 2))

## End(Not run)

# Per-group sample sizes supplied as a named vector keyed by the -D- values
stratified(DF, group = "D", size = c(CA = 5, NY = 3, TX = 2))
##     ID  A           B   C  D E
##  1: 10 DD -0.16857996 1.0 CA F
##  2: 46 CC  0.81065422 0.1 CA M
##  3: 17 EE -0.66212591 1.3 CA M
##  4: 29 AA  0.42918070 0.1 CA F
##  5: 51 CC  0.55976001 0.5 CA F
##  6: 49 AA -0.34198650 1.7 NY M
##  7: 83 BB  0.24854064 0.1 NY F
##  8: 62 DD -0.03260632 0.8 NY F
##  9: 82 CC  0.01949581 1.5 TX M
## 10:  5 DD  0.82885393 0.0 TX F
# Named sizes also work with several grouping columns: each name is the
# group values joined by a space, in the order the columns are listed
stratified(
  DF,
  group = c("D", "E"),
  size = c(
    "NY F" = 2,
    "NY M" = 3,
    "TX F" = 1,
    "TX M" = 1,
    "CA F" = 5,
    "CA M" = 1
  )
)
##     ID  A           B   C  D E
##  1: 59 BB -0.11404972 0.0 NY F
##  2: 62 DD -0.03260632 0.8 NY F
##  3: 42 DD  0.37900925 0.4 NY M
##  4: 13 EE  0.46067009 1.8 NY M
##  5: 43 CC  0.37344147 0.6 NY M
##  6: 22 DD  0.08856214 1.1 TX F
##  7:  4 EE  0.50118422 1.7 TX M
##  8: 68 BB -0.12214226 1.0 CA F
##  9: 80 AA -0.46251392 0.9 CA F
## 10: 64 BB -0.54550994 0.6 CA F
## 11: 23 EE  0.22902353 0.3 CA F
## 12: 67 EE  1.42363420 0.3 CA F
## 13: 50 DD -0.22377866 1.0 CA M