# Clear every visible object from the current workspace before the demo.
# NOTE(review): this also deletes the objects of anyone who source()s this
# script; consider dropping it and starting from a fresh R session instead.
rm(list = ls())
library(splitstackshape)
## Warning: package 'splitstackshape' was built under R version 4.0.3
# Build a 100-row example data frame to draw stratified samples from.
# The RNG is seeded first so the simulated columns are identical on every
# run; without a seed the data (and anything sampled from it) changes each
# time the script is executed.
set.seed(1)
DF <- data.frame(
  # Row identifier, 1 through 100.
  ID = 1:100,
  # Five-level grouping label, drawn with replacement.
  A = sample(c("AA", "BB", "CC", "DD", "EE"), 100, replace = TRUE),
  # Standard-normal draws.
  B = rnorm(100),
  # Absolute value of normal draws rounded to one decimal place.
  C = abs(round(rnorm(100), digits = 1)),
  # Three-level grouping label, drawn with replacement.
  D = sample(c("CA", "NY", "TX"), 100, replace = TRUE),
  # Two-level grouping label, drawn with replacement.
  E = sample(c("M", "F"), 100, replace = TRUE)
)
# Display the full data frame.
DF
## ID A B C D E
## 1 1 DD 0.03441729 0.4 CA M
## 2 2 EE -0.48057769 0.7 NY M
## 3 3 BB -0.45370948 1.4 NY M
## 4 4 EE 0.50118422 1.7 TX M
## 5 5 DD 0.82885393 0.0 TX F
## 6 6 CC 0.09621272 0.8 TX F
## 7 7 CC -0.85007721 1.1 CA F
## 8 8 BB -0.30694125 0.2 NY M
## 9 9 CC 0.99978119 0.7 TX F
## 10 10 DD -0.16857996 1.0 CA F
## 11 11 DD -2.37911331 0.9 NY M
## 12 12 DD 0.30485217 0.5 NY F
## 13 13 EE 0.46067009 1.8 NY M
## 14 14 CC 1.08088015 0.6 NY F
## 15 15 CC -0.20780062 1.9 NY F
## 16 16 EE 1.50685545 0.4 NY M
## 17 17 EE -0.66212591 1.3 CA M
## 18 18 EE -0.88294955 0.2 NY M
## 19 19 BB -1.24402533 0.9 TX M
## 20 20 DD 2.32245726 1.8 TX M
## 21 21 EE 0.92080453 1.0 TX M
## 22 22 DD 0.08856214 1.1 TX F
## 23 23 EE 0.22902353 0.3 CA F
## 24 24 EE -0.10527522 0.8 NY M
## 25 25 DD 0.57473822 0.5 CA F
## 26 26 AA 0.60201496 1.4 TX M
## 27 27 AA -1.64180454 0.3 TX M
## 28 28 CC -0.64838537 1.1 CA M
## 29 29 AA 0.42918070 0.1 CA F
## 30 30 DD 0.91106027 0.2 TX F
## 31 31 DD -2.78875159 0.3 TX M
## 32 32 DD -0.34767965 0.4 NY F
## 33 33 AA -1.47237915 0.7 NY F
## 34 34 BB 0.19998324 0.0 TX M
## 35 35 CC 0.51590247 0.9 TX M
## 36 36 CC -1.23449643 0.1 CA F
## 37 37 EE 0.84661286 2.0 TX M
## 38 38 CC 0.46813645 1.8 TX F
## 39 39 AA -1.82074794 0.6 CA M
## 40 40 BB 0.30922620 0.6 CA F
## 41 41 DD 0.12531904 0.1 CA M
## 42 42 DD 0.37900925 0.4 NY M
## 43 43 CC 0.37344147 0.6 NY M
## 44 44 CC -0.06941607 0.3 TX F
## 45 45 BB -1.27215066 1.0 NY F
## 46 46 CC 0.81065422 0.1 CA M
## 47 47 DD 1.57385719 0.1 NY F
## 48 48 BB -1.94361620 1.0 NY M
## 49 49 AA -0.34198650 1.7 NY M
## 50 50 DD -0.22377866 1.0 CA M
## 51 51 CC 0.55976001 0.5 CA F
## 52 52 CC 1.23620996 0.0 NY M
## 53 53 EE 0.01189679 1.0 TX F
## 54 54 BB -1.13708737 0.3 TX F
## 55 55 CC -1.06354794 0.3 NY M
## 56 56 EE 0.39853589 0.4 NY F
## 57 57 AA 0.05145718 1.0 NY M
## 58 58 AA 1.99668303 1.5 NY F
## 59 59 BB -0.11404972 0.0 NY F
## 60 60 EE 1.40360140 0.5 CA F
## 61 61 CC -1.22916923 0.4 CA F
## 62 62 DD -0.03260632 0.8 NY F
## 63 63 AA -0.01727829 0.8 TX F
## 64 64 BB -0.54550994 0.6 CA F
## 65 65 BB -0.90824469 0.4 TX F
## 66 66 EE -0.58431577 0.2 TX F
## 67 67 EE 1.42363420 0.3 CA F
## 68 68 BB -0.12214226 1.0 CA F
## 69 69 DD -1.05689633 1.1 CA M
## 70 70 BB 0.39824874 1.4 CA F
## 71 71 CC -2.26568316 0.9 TX M
## 72 72 EE -0.47527774 0.2 NY F
## 73 73 AA 1.18219827 0.3 NY F
## 74 74 DD 2.07444699 1.7 CA F
## 75 75 BB -0.93770711 1.6 CA F
## 76 76 EE 1.17139689 0.8 TX F
## 77 77 BB -0.19844364 0.9 NY M
## 78 78 DD -1.67040169 0.1 NY F
## 79 79 CC -1.37142627 0.4 TX F
## 80 80 AA -0.46251392 0.9 CA F
## 81 81 BB -1.22163393 0.5 CA M
## 82 82 CC 0.01949581 1.5 TX M
## 83 83 BB 0.24854064 0.1 NY F
## 84 84 EE -0.04593022 0.2 CA F
## 85 85 AA 0.52804947 0.0 NY F
## 86 86 DD 0.67480280 0.8 TX M
## 87 87 EE -0.08791508 0.4 NY M
## 88 88 BB -0.97048108 0.1 TX M
## 89 89 EE 1.13904273 1.2 NY F
## 90 90 CC 1.47874446 0.0 CA M
## 91 91 AA 0.41931573 0.0 CA M
## 92 92 EE 0.12802631 1.6 TX F
## 93 93 BB 0.53974479 0.1 CA M
## 94 94 AA -0.47498459 0.8 CA M
## 95 95 BB 1.13354857 2.3 TX F
## 96 96 BB -2.63403382 1.0 CA F
## 97 97 CC 0.26860337 0.2 CA M
## 98 98 EE 0.15548078 0.8 NY M
## 99 99 AA -0.90728455 1.0 CA F
## 100 100 DD -0.96131054 1.5 CA F
# Draw a 10% sample within every group of column A.
stratified(indt = DF, group = "A", size = 0.1)
## ID A B C D E
## 1: 11 DD -2.3791133 0.9 NY M
## 2: 50 DD -0.2237787 1.0 CA M
## 3: 89 EE 1.1390427 1.2 NY F
## 4: 67 EE 1.4236342 0.3 CA F
## 5: 45 BB -1.2721507 1.0 NY F
## 6: 54 BB -1.1370874 0.3 TX F
## 7: 55 CC -1.0635479 0.3 NY M
## 8: 7 CC -0.8500772 1.1 CA F
## 9: 99 AA -0.9072846 1.0 CA F
## 10: 94 AA -0.4749846 0.8 CA M
# Draw a 10% sample by column A, restricted to the "AA" and "BB" groups.
stratified(
  indt = DF,
  group = "A",
  size = 0.1,
  select = list(A = c("AA", "BB"))
)
## ID A B C D E
## 1: 81 BB -1.2216339 0.5 CA M
## 2: 83 BB 0.2485406 0.1 NY F
## 3: 26 AA 0.6020150 1.4 TX M
## 4: 85 AA 0.5280495 0.0 NY F
# Draw 5 rows from every group of column D; the grouping column is given by
# its position (D is the 5th column of DF) rather than by name.
stratified(indt = DF, group = 5, size = 5)
## ID A B C D E
## 1: 61 CC -1.22916923 0.4 CA F
## 2: 84 EE -0.04593022 0.2 CA F
## 3: 81 BB -1.22163393 0.5 CA M
## 4: 29 AA 0.42918070 0.1 CA F
## 5: 68 BB -0.12214226 1.0 CA F
## 6: 89 EE 1.13904273 1.2 NY F
## 7: 2 EE -0.48057769 0.7 NY M
## 8: 57 AA 0.05145718 1.0 NY M
## 9: 49 AA -0.34198650 1.7 NY M
## 10: 56 EE 0.39853589 0.4 NY F
## 11: 82 CC 0.01949581 1.5 TX M
## 12: 95 BB 1.13354857 2.3 TX F
## 13: 5 DD 0.82885393 0.0 TX F
## 14: 44 CC -0.06941607 0.3 TX F
## 15: 34 BB 0.19998324 0.0 TX M
# Stratify on two columns at once (E and D), sampling 15% per combination.
stratified(indt = DF, group = c("E", "D"), size = 0.15)
## ID A B C D E
## 1: 41 DD 0.12531904 0.1 CA M
## 2: 90 CC 1.47874446 0.0 CA M
## 3: 87 EE -0.08791508 0.4 NY M
## 4: 2 EE -0.48057769 0.7 NY M
## 5: 49 AA -0.34198650 1.7 NY M
## 6: 21 EE 0.92080453 1.0 TX M
## 7: 86 DD 0.67480280 0.8 TX M
## 8: 76 EE 1.17139689 0.8 TX F
## 9: 92 EE 0.12802631 1.6 TX F
## 10: 7 CC -0.85007721 1.1 CA F
## 11: 74 DD 2.07444699 1.7 CA F
## 12: 29 AA 0.42918070 0.1 CA F
## 13: 47 DD 1.57385719 0.1 NY F
## 14: 89 EE 1.13904273 1.2 NY F
## 15: 32 DD -0.34767965 0.4 NY F
# Same two-column strata (E and D), keeping only the cases where E is "M".
stratified(
  indt = DF,
  group = c("E", "D"),
  size = 0.15,
  select = list(E = "M")
)
## ID A B C D E
## 1: 69 DD -1.0568963 1.1 CA M
## 2: 93 BB 0.5397448 0.1 CA M
## 3: 3 BB -0.4537095 1.4 NY M
## 4: 52 CC 1.2362100 0.0 NY M
## 5: 16 EE 1.5068554 0.4 NY M
## 6: 71 CC -2.2656832 0.9 TX M
## 7: 19 BB -1.2440253 0.9 TX M
# As above, but keeping only cases where E is "M" and D is "CA" or "TX".
stratified(
  indt = DF,
  group = c("E", "D"),
  size = 0.15,
  select = list(E = "M", D = c("CA", "TX"))
)
## ID A B C D E
## 1: 39 AA -1.8207479 0.6 CA M
## 2: 90 CC 1.4787445 0.0 CA M
## 3: 26 AA 0.6020150 1.4 TX M
## 4: 34 BB 0.1999832 0.0 TX M
# Stratify on three columns (E, D and A), requesting 2 rows per combination.
# Combinations holding fewer than 2 rows are reported and returned in full.
stratified(indt = DF, group = c("E", "D", "A"), size = 2)
## Groups M CA EE, F TX AA contain fewer rows than requested. Returning all rows.
## ID A B C D E
## 1: 1 DD 0.03441729 0.4 CA M
## 2: 50 DD -0.22377866 1.0 CA M
## 3: 16 EE 1.50685545 0.4 NY M
## 4: 24 EE -0.10527522 0.8 NY M
## 5: 77 BB -0.19844364 0.9 NY M
## 6: 3 BB -0.45370948 1.4 NY M
## 7: 4 EE 0.50118422 1.7 TX M
## 8: 37 EE 0.84661286 2.0 TX M
## 9: 22 DD 0.08856214 1.1 TX F
## 10: 30 DD 0.91106027 0.2 TX F
## 11: 44 CC -0.06941607 0.3 TX F
## 12: 38 CC 0.46813645 1.8 TX F
## 13: 61 CC -1.22916923 0.4 CA F
## 14: 7 CC -0.85007721 1.1 CA F
## 15: 74 DD 2.07444699 1.7 CA F
## 16: 10 DD -0.16857996 1.0 CA F
## 17: 42 DD 0.37900925 0.4 NY M
## 18: 11 DD -2.37911331 0.9 NY M
## 19: 47 DD 1.57385719 0.1 NY F
## 20: 78 DD -1.67040169 0.1 NY F
## 21: 14 CC 1.08088015 0.6 NY F
## 22: 15 CC -0.20780062 1.9 NY F
## 23: 17 EE -0.66212591 1.3 CA M
## 24: 34 BB 0.19998324 0.0 TX M
## 25: 88 BB -0.97048108 0.1 TX M
## 26: 86 DD 0.67480280 0.8 TX M
## 27: 20 DD 2.32245726 1.8 TX M
## 28: 67 EE 1.42363420 0.3 CA F
## 29: 23 EE 0.22902353 0.3 CA F
## 30: 27 AA -1.64180454 0.3 TX M
## 31: 26 AA 0.60201496 1.4 TX M
## 32: 97 CC 0.26860337 0.2 CA M
## 33: 46 CC 0.81065422 0.1 CA M
## 34: 80 AA -0.46251392 0.9 CA F
## 35: 99 AA -0.90728455 1.0 CA F
## 36: 33 AA -1.47237915 0.7 NY F
## 37: 85 AA 0.52804947 0.0 NY F
## 38: 71 CC -2.26568316 0.9 TX M
## 39: 35 CC 0.51590247 0.9 TX M
## 40: 39 AA -1.82074794 0.6 CA M
## 41: 91 AA 0.41931573 0.0 CA M
## 42: 75 BB -0.93770711 1.6 CA F
## 43: 96 BB -2.63403382 1.0 CA F
## 44: 43 CC 0.37344147 0.6 NY M
## 45: 55 CC -1.06354794 0.3 NY M
## 46: 59 BB -0.11404972 0.0 NY F
## 47: 83 BB 0.24854064 0.1 NY F
## 48: 57 AA 0.05145718 1.0 NY M
## 49: 49 AA -0.34198650 1.7 NY M
## 50: 76 EE 1.17139689 0.8 TX F
## 51: 92 EE 0.12802631 1.6 TX F
## 52: 95 BB 1.13354857 2.3 TX F
## 53: 65 BB -0.90824469 0.4 TX F
## 54: 72 EE -0.47527774 0.2 NY F
## 55: 89 EE 1.13904273 1.2 NY F
## 56: 63 AA -0.01727829 0.8 TX F
## 57: 93 BB 0.53974479 0.1 CA M
## 58: 81 BB -1.22163393 0.5 CA M
## ID A B C D E
## Not run:
# The following will produce errors: a multi-value size must be a named
# vector with one entry per group
#stratified(DF, "D", c(5, 3))
#stratified(DF, "D", c(5, 3, 2))
## End(Not run)
# Give each group of D its own sample size through a named vector whose
# names are the group values.
stratified(indt = DF, group = "D", size = c(CA = 5, NY = 3, TX = 2))
## ID A B C D E
## 1: 10 DD -0.16857996 1.0 CA F
## 2: 46 CC 0.81065422 0.1 CA M
## 3: 17 EE -0.66212591 1.3 CA M
## 4: 29 AA 0.42918070 0.1 CA F
## 5: 51 CC 0.55976001 0.5 CA F
## 6: 49 AA -0.34198650 1.7 NY M
## 7: 83 BB 0.24854064 0.1 NY F
## 8: 62 DD -0.03260632 0.8 NY F
## 9: 82 CC 0.01949581 1.5 TX M
## 10: 5 DD 0.82885393 0.0 TX F
# Per-group sizes also work with several grouping columns: each name joins
# the D and E values with a space, in the same order as the group columns.
stratified(
  indt = DF,
  group = c("D", "E"),
  size = c(
    "NY F" = 2, "NY M" = 3,
    "TX F" = 1, "TX M" = 1,
    "CA F" = 5, "CA M" = 1
  )
)
## ID A B C D E
## 1: 59 BB -0.11404972 0.0 NY F
## 2: 62 DD -0.03260632 0.8 NY F
## 3: 42 DD 0.37900925 0.4 NY M
## 4: 13 EE 0.46067009 1.8 NY M
## 5: 43 CC 0.37344147 0.6 NY M
## 6: 22 DD 0.08856214 1.1 TX F
## 7: 4 EE 0.50118422 1.7 TX M
## 8: 68 BB -0.12214226 1.0 CA F
## 9: 80 AA -0.46251392 0.9 CA F
## 10: 64 BB -0.54550994 0.6 CA F
## 11: 23 EE 0.22902353 0.3 CA F
## 12: 67 EE 1.42363420 0.3 CA F
## 13: 50 DD -0.22377866 1.0 CA M