This is my external memory bank: notes on how to sample with R.
# Pick one element at random from a character vector.
sample(x = c("rock", "paper", "scissors"), size = 1)
## [1] "scissors"
# Create the integers 1 to 10 and display them.
x <- 1:10
print(x)
## [1] 1 2 3 4 5 6 7 8 9 10
# With only a vector given, sample() returns a random permutation of it.
sample(x)
## [1] 10 6 8 1 5 3 9 4 7 2
# Draw 3 elements without replacement.
sample(x, size = 3)
## [1] 6 1 9
# Without replacement, a sample as large as x is again a permutation.
sample(x, size = 10, replace = FALSE)
## [1] 3 7 4 9 2 6 10 1 8 5
# With replacement, values may repeat.
sample(x, size = 10, replace = TRUE)
## [1] 10 4 6 9 9 8 3 4 5 7
# With replacement, the sample may be larger than the population.
sample(x, size = 30, replace = TRUE)
## [1] 10 10 8 5 9 4 10 2 3 4 8 8 8 2 1 2 10 2 3 3 3 9 3 8 3
## [26] 2 1 1 4 10
An essential property of Bernoulli sampling is that all elements of the population have equal probability of being included in the sample. https://en.wikipedia.org/wiki/Bernoulli_sampling
# 100 draws with replacement from {0, 1}; both values are equally likely
# because no `prob` argument is supplied.
sample(x = c(0, 1), size = 100, replace = TRUE)
## [1] 1 0 1 1 1 0 1 1 1 1 0 1 1 1 1 0 1 0 1 1 1 0 0 1 1 1 0 1 1 1 0 0 0 1 1 1 1
## [38] 1 1 1 1 1 0 0 0 0 0 0 1 1 1 0 0 1 0 1 1 0 1 1 0 0 0 1 0 0 0 0 0 0 0 1 1 1
## [75] 1 0 0 1 0 0 0 0 0 1 0 0 1 1 1 1 0 0 1 0 1 1 1 0 1 1
# The same with two labels instead of 0/1.
sample(x = c("A", "B"), size = 100, replace = TRUE)
## [1] "B" "B" "B" "B" "B" "A" "B" "A" "B" "B" "B" "B" "A" "A" "A" "B" "A" "B"
## [19] "B" "A" "A" "A" "B" "B" "B" "B" "A" "A" "B" "A" "B" "B" "B" "B" "B" "A"
## [37] "A" "A" "A" "A" "B" "B" "B" "B" "A" "B" "B" "A" "A" "B" "B" "A" "A" "B"
## [55] "A" "B" "B" "A" "B" "B" "A" "B" "B" "B" "B" "B" "A" "A" "B" "B" "A" "B"
## [73] "B" "A" "A" "B" "A" "B" "A" "B" "A" "A" "B" "A" "B" "B" "B" "B" "A" "B"
## [91] "A" "B" "B" "B" "A" "A" "A" "B" "A" "B"
# ... and with three labels.
sample(x = c("A", "B", "C"), size = 100, replace = TRUE)
## [1] "A" "C" "A" "C" "B" "A" "A" "B" "C" "A" "C" "B" "A" "C" "C" "C" "B" "A"
## [19] "C" "C" "B" "C" "B" "B" "B" "C" "B" "B" "A" "A" "C" "C" "A" "B" "C" "B"
## [37] "B" "C" "B" "B" "A" "C" "B" "B" "A" "A" "B" "A" "C" "B" "A" "B" "C" "B"
## [55] "B" "C" "C" "A" "A" "C" "A" "B" "C" "B" "B" "B" "A" "B" "A" "A" "A" "B"
## [73] "A" "A" "B" "B" "C" "C" "A" "A" "C" "A" "A" "A" "C" "B" "B" "A" "C" "B"
## [91] "A" "C" "A" "A" "C" "B" "C" "B" "C" "C"
x <- seq_len(10)
print(x)
## [1] 1 2 3 4 5 6 7 8 9 10
# Two elements survive the filter, so they are simply shuffled.
sample(x = x[x > 8]) # length 2
## [1] 10 9
# Pitfall: the filter leaves the single number 10, and sample(10) is treated
# as "sample from 1:10", so ten values come back instead of one.
sample(x = x[x > 9]) # oops -- length 10!
## [1] 3 9 5 6 4 2 8 1 7 10
# An empty input gives an empty result.
sample(x = x[x > 10]) # length 0
## integer(0)
## safer version:
#' Sample from the elements of `x` itself, whatever its length
#'
#' Draws positions with `sample.int()` and indexes `x` with them, so a
#' length-one numeric `x` is never reinterpreted as the range `1:x`.
#'
#' @param x A vector to sample from.
#' @param ... Further arguments forwarded to `sample.int()` after its first
#'   argument (e.g. `size`, `replace`, `prob`).
#' @return The elements of `x` at the sampled positions.
resample <- function(x, ...) {
  picked <- sample.int(length(x), ...)
  x[picked]
}
# resample() shuffles exactly the elements it is given, so the result has the
# same length as the input in all three cases.
resample(x = x[x > 8]) # length 2
## [1] 10 9
resample(x = x[x > 9]) # length 1
## [1] 10
resample(x = x[x > 10]) # length 0
## integer(0)
## R 3.x.y only
# Draw 12 values from 1..1e10, with replacement.
sample.int(n = 1e10, size = 12, replace = TRUE)
## [1] 1452296966 8443241679 390442800 588623543 2689679303 3955951316
## [7] 1899081598 8506790486 2001084239 2900953782 2363777215 4388851199
# The same without replacement.
sample.int(n = 1e10, size = 12) # not that there is much chance of duplicates
## [1] 2237448912 5587494849 4423423507 456190469 2179078875 2558628617
## [7] 6784838698 8731692921 1140971190 7673737654 8966730338 137213113
Caution: Benford’s law. The leading digits of many real-world data sets are not uniformly distributed. https://en.wikipedia.org/wiki/Benford%27s_law
# Leading-digit check: extract the first digit of `hour`, tabulate it and
# plot its distribution.
library(nycflights13)
library(dplyr) # provides %>% and mutate(), which are used below
library(ggplot2)

# Look at the structure of each table in nycflights13.
str(nycflights13::planes)
str(nycflights13::airlines)
str(nycflights13::airports)
str(nycflights13::flights)
str(nycflights13::weather)

# Only one table is analysed. Assigning several tables to `df1` one after the
# other leaves just the last one in effect, so the weather table is assigned
# directly.
# NOTE(review): the derived column is called "first_digit_of_flight" although
# the data is the weather table -- confirm whether nycflights13::flights was
# the intended input.
df1 <- nycflights13::weather

# substr() works on the character form of `hour`; its first character is
# converted back to an integer digit.
df2 <- df1 %>%
  mutate(first_digit_of_flight = as.integer(substr(hour, 1, 1)))
class(df2$first_digit_of_flight)
table(df2$first_digit_of_flight)

# One very narrow bar per digit (binwidth = 0.01 on integer values).
ggplot(df2, aes(first_digit_of_flight)) +
  geom_histogram(
    alpha = 1,
    color = "black",
    fill = "black",
    linetype = "solid",
    size = 0.5,
    binwidth = 0.01
  ) +
  ggtitle("This is the Title.", subtitle = "This is the subtitle") +
  labs(x = "First Digit of Flight", y = "Count.")
library(tidyverse)
# Cars grouped by cylinder count; used by the grouped sampling calls.
by_cyl <- group_by(mtcars, cyl)
# sample_n() draws a fixed number of rows (per group when the input is grouped)
mtcars %>% sample_n(size = 10)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
# 50 rows drawn with replacement, so rows can appear more than once.
mtcars %>% sample_n(size = 50, replace = TRUE)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Porsche 914-2...1 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Merc 230...2 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Merc 230...4 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Datsun 710...6 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## AMC Javelin...7 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Merc 450SL...8 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Ford Pantera L...9 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Datsun 710...11 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## AMC Javelin...12 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Datsun 710...13 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Datsun 710...14 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Chrysler Imperial...15 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Dodge Challenger...16 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla...18 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Dodge Challenger...19 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Volvo 142E...20 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Ford Pantera L...21 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Camaro Z28...22 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Lincoln Continental...23 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial...24 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## AMC Javelin...25 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Camaro Z28...26 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## AMC Javelin...27 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Lincoln Continental...29 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Volvo 142E...30 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Toyota Corolla...31 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag...33 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Fiat X1-9...34 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Mazda RX4 Wag...36 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Duster 360...37 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 450SE...38 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Maserati Bora...39 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Camaro Z28...40 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Duster 360...41 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 450SE...42 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Ford Pantera L...43 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Porsche 914-2...44 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Fiat X1-9...45 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Toyota Corolla...47 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Merc 450SL...48 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Maserati Bora...49 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Chrysler Imperial...50 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
# 10 rows, using the mpg column as sampling weights.
mtcars %>% sample_n(size = 10, weight = mpg)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
# 3 rows from each cyl group.
by_cyl %>% sample_n(size = 3)
## # A tibble: 9 × 11
## # Groups: cyl [3]
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
## 2 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 3 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 4 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 5 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
## 6 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
## 7 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
## 8 10.4 8 472 205 2.93 5.25 18.0 0 0 3 4
## 9 15.5 8 318 150 2.76 3.52 16.9 0 0 3 2
# 10 rows from each cyl group, drawn with replacement.
by_cyl %>% sample_n(size = 10, replace = TRUE)
## # A tibble: 30 × 11
## # Groups: cyl [3]
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1
## 2 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
## 3 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2
## 4 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
## 5 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2
## 6 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
## 7 24.4 4 147. 62 3.69 3.19 20 1 0 4 2
## 8 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1
## 9 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1
## 10 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1
## # … with 20 more rows
# 3 rows from each cyl group, weighted by mpg relative to mean mpg.
by_cyl %>% sample_n(size = 3, weight = mpg / mean(mpg))
## # A tibble: 9 × 11
## # Groups: cyl [3]
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## 2 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
## 3 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
## 4 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6
## 5 21 6 160 110 3.9 2.88 17.0 0 1 4 4
## 6 21 6 160 110 3.9 2.62 16.5 0 1 4 4
## 7 15.2 8 276. 180 3.07 3.78 18 0 0 3 3
## 8 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4
## 9 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2
By default, sample_frac() samples all of the data (size = 1), i.e. it returns every row in random order.
# No fraction given: every row is returned, in random order.
mtcars %>% sample_frac()
## mpg cyl disp hp drat wt qsec vs am gear carb
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
# A random 10% of the rows.
mtcars %>% sample_frac(size = 0.1)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
# A fraction above 1 is only possible when sampling with replacement.
mtcars %>% sample_frac(size = 1.5, replace = TRUE)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Maserati Bora...1 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Datsun 710...2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Toyota Corolla...3 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Lotus Europa...4 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Ferrari Dino...7 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Duster 360...8 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 450SL...9 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Camaro Z28...10 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Duster 360...11 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Datsun 710...12 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Fiat X1-9...13 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Merc 450SLC...14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Merc 450SL...15 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Hornet 4 Drive...16 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Merc 450SL...17 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 240D...18 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 240D...19 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 280...20 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 450SLC...21 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Maserati Bora...24 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Merc 450SL...25 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Datsun 710...26 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Datsun 710...27 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Fiat 128...29 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Lotus Europa...31 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Toyota Corolla...32 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Hornet 4 Drive...33 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Toyota Corolla...34 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Merc 280...35 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Lotus Europa...36 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Fiat 128...37 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Dodge Challenger...38 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Ferrari Dino...40 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Dodge Challenger...42 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Fiat X1-9...43 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Camaro Z28...46 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Maserati Bora...47 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
# A random 10% of the rows, using 1 / mpg as sampling weights.
mtcars %>% sample_frac(size = 0.1, weight = 1 / mpg)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
## AMC Javelin 15.2 8 304 150 3.15 3.435 17.30 0 0 3 2
## Ford Pantera L 15.8 8 351 264 4.22 3.170 14.50 0 1 5 4
# 20% of the rows within each cyl group.
by_cyl %>% sample_frac(size = 0.2)
## # A tibble: 6 × 11
## # Groups: cyl [3]
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
## 2 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
## 3 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4
## 4 14.7 8 440 230 3.23 5.34 17.4 0 0 3 4
## 5 16.4 8 276. 180 3.07 4.07 17.4 0 0 3 3
## 6 15.2 8 304 150 3.15 3.44 17.3 0 0 3 2
# Resample each cyl group at its full size, with replacement.
by_cyl %>% sample_frac(size = 1, replace = TRUE)
## # A tibble: 32 × 11
## # Groups: cyl [3]
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
## 2 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
## 3 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1
## 4 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2
## 5 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1
## 6 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
## 7 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2
## 8 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2
## 9 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1
## 10 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1
## # … with 22 more rows
For example, sample 1% of each month’s flights.
library(nycflights13)
library(tidyverse)
# Count the flights in each month before sampling.
count(group_by(nycflights13::flights, month))
## # A tibble: 12 × 2
## # Groups: month [12]
## month n
## <int> <int>
## 1 1 27004
## 2 2 24951
## 3 3 28834
## 4 4 28330
## 5 5 28796
## 6 6 28243
## 7 7 29425
## 8 8 29327
## 9 9 27574
## 10 10 28889
## 11 11 27268
## 12 12 28135
# Flights grouped by month, so that sampling happens within each month.
by_month <- group_by(nycflights13::flights, month)
print(by_month)
## # A tibble: 336,776 × 19
## # Groups: month [12]
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # … with 336,766 more rows, and 11 more variables: arr_delay <dbl>,
## # carrier <chr>, flight <int>, tailnum <chr>, origin <chr>, dest <chr>,
## # air_time <dbl>, distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# Draw 1% of each month's flights, without replacement.
sample1 <- by_month %>% sample_frac(size = .01, replace = FALSE)
# Check the per-month sample sizes.
count(group_by(sample1, month))
## # A tibble: 12 × 2
## # Groups: month [12]
## month n
## <int> <int>
## 1 1 270
## 2 2 250
## 3 3 288
## 4 4 283
## 5 5 288
## 6 6 282
## 7 7 294
## 8 8 293
## 9 9 276
## 10 10 289
## 11 11 273
## 12 12 281
library(dplyr)
# Seed the random number generator before sampling.
set.seed(123)
# select 5%
gapminder::gapminder %>% slice_sample(prop = 0.05)
## # A tibble: 85 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Denmark Europe 1982 74.6 5117810 21688.
## 2 Egypt Africa 1982 56.0 45681811 3504.
## 3 Brazil Americas 2002 71.0 179914212 8131.
## 4 Finland Europe 1997 77.1 5134406 23724.
## 5 Burkina Faso Africa 1962 37.8 4919632 723.
## 6 Malaysia Asia 1957 52.1 7739235 1810.
## 7 Norway Europe 1957 73.4 3491938 11654.
## 8 Senegal Africa 1962 41.5 3430243 1655.
## 9 Puerto Rico Americas 1972 72.2 2847132 9123.
## 10 Reunion Africa 1987 71.9 562035 5303.
## # … with 75 more rows
# select 5% weighted by life expectancy
gapminder::gapminder %>% slice_sample(prop = 0.05, weight_by = lifeExp)
## # A tibble: 85 × 6
## country continent year lifeExp pop gdpPercap
## <fct> <fct> <int> <dbl> <int> <dbl>
## 1 Uruguay Americas 1952 66.1 2252965 5717.
## 2 Sierra Leone Africa 1987 40.0 3868905 1294.
## 3 Honduras Americas 1992 66.4 5077347 3082.
## 4 Somalia Africa 1987 44.5 6921858 1093.
## 5 Guinea Africa 1982 42.9 4710497 857.
## 6 Philippines Asia 1977 60.1 46850962 2373.
## 7 Jamaica Americas 1972 69 1997616 7434.
## 8 Panama Americas 2002 74.7 2990875 7356.
## 9 Mali Africa 1977 41.7 6491649 686.
## 10 Slovenia Europe 1982 71.1 1861252 17867.
## # … with 75 more rows
# source: https://twitter.com/rfunctionaday/status/1446711797948497922
library(tidyverse)
# Structure of the diamonds data set.
diamonds %>% str()
## tibble [53,940 × 10] (S3: tbl_df/tbl/data.frame)
## $ carat : num [1:53940] 0.23 0.21 0.23 0.29 0.31 0.24 0.24 0.26 0.22 0.23 ...
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 5 4 2 4 2 3 3 3 1 3 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 2 2 2 6 7 7 6 5 2 5 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 2 3 5 4 2 6 7 3 4 5 ...
## $ depth : num [1:53940] 61.5 59.8 56.9 62.4 63.3 62.8 62.3 61.9 65.1 59.4 ...
## $ table : num [1:53940] 55 61 65 58 58 57 57 55 61 61 ...
## $ price : int [1:53940] 326 326 327 334 335 336 336 337 337 338 ...
## $ x : num [1:53940] 3.95 3.89 4.05 4.2 4.34 3.94 3.95 4.07 3.87 4 ...
## $ y : num [1:53940] 3.98 3.84 4.07 4.23 4.35 3.96 3.98 4.11 3.78 4.05 ...
## $ z : num [1:53940] 2.43 2.31 2.31 2.63 2.75 2.48 2.47 2.53 2.49 2.39 ...
# Cross-tabulate cut (rows) against color (columns).
table(diamonds[["cut"]], diamonds[["color"]])
##
## D E F G H I J
## Fair 163 224 312 314 303 175 119
## Good 662 933 909 871 702 522 307
## Very Good 1513 2400 2164 2299 1824 1204 678
## Premium 1603 2337 2331 2924 2360 1428 808
## Ideal 2834 3903 3826 4884 3115 2093 896
# Nest diamonds by (cut, color): each combination becomes one row whose
# `data` list-column holds the remaining columns.
DF1 <- diamonds %>%
  group_by(cut, color) %>%
  nest() %>%
  ungroup()
# Each (cut, color) combination should now appear once.
table(DF1[["cut"]], DF1[["color"]])
##
## D E F G H I J
## Fair 1 1 1 1 1 1 1
## Good 1 1 1 1 1 1 1
## Very Good 1 1 1 1 1 1 1
## Premium 1 1 1 1 1 1 1
## Ideal 1 1 1 1 1 1 1
print(DF1) # have a look
## # A tibble: 35 × 3
## cut color data
## <ord> <ord> <list>
## 1 Ideal E <tibble [3,903 × 8]>
## 2 Premium E <tibble [2,337 × 8]>
## 3 Good E <tibble [933 × 8]>
## 4 Premium I <tibble [1,428 × 8]>
## 5 Good J <tibble [307 × 8]>
## 6 Very Good J <tibble [678 × 8]>
## 7 Very Good I <tibble [1,204 × 8]>
## 8 Very Good H <tibble [1,824 × 8]>
## 9 Fair E <tibble [224 × 8]>
## 10 Ideal J <tibble [896 × 8]>
## # … with 25 more rows
# Show rows 4 and 5
slice(DF1, 4:5)
## # A tibble: 2 × 3
## cut color data
## <ord> <ord> <list>
## 1 Premium I <tibble [1,428 × 8]>
## 2 Good J <tibble [307 × 8]>
# Keep nested rows 4 and 5 (two cut/color combinations) and expand them
# back into ordinary rows.
DF2 <- diamonds %>%
  group_by(cut, color) %>%
  nest() %>%
  ungroup() %>%
  slice(4:5) %>%
  unnest(cols = c(data))
DF2 %>% tibble::glimpse()
## Rows: 1,735
## Columns: 10
## $ cut <ord> Premium, Premium, Premium, Premium, Premium, Premium, Premium,…
## $ color <ord> I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I, I,…
## $ carat <dbl> 0.29, 0.24, 0.42, 0.32, 0.90, 0.76, 0.81, 0.72, 1.00, 0.32, 0.…
## $ clarity <ord> VS2, VS1, SI2, SI1, VS2, VVS1, VVS2, IF, SI2, SI1, SI1, SI1, S…
## $ depth <dbl> 62.4, 62.5, 61.5, 62.9, 63.0, 58.8, 61.9, 63.0, 58.2, 62.7, 62…
## $ table <dbl> 58, 57, 59, 58, 58, 59, 60, 57, 60, 58, 58, 59, 59, 58, 59, 59…
## $ price <int> 334, 355, 552, 554, 2761, 2790, 2795, 2795, 2795, 554, 554, 55…
## $ x <dbl> 4.20, 3.97, 4.78, 4.35, 6.16, 6.00, 5.91, 5.72, 6.61, 4.37, 4.…
## $ y <dbl> 4.23, 3.94, 4.84, 4.33, 6.12, 5.94, 5.86, 5.70, 6.55, 4.34, 4.…
## $ z <dbl> 2.63, 2.47, 2.96, 2.73, 3.87, 3.51, 3.64, 3.60, 3.83, 2.73, 2.…
# Structure of the unnested selection.
DF2 %>% str()
## tibble [1,735 × 10] (S3: tbl_df/tbl/data.frame)
## $ cut : Ord.factor w/ 5 levels "Fair"<"Good"<..: 4 4 4 4 4 4 4 4 4 4 ...
## $ color : Ord.factor w/ 7 levels "D"<"E"<"F"<"G"<..: 6 6 6 6 6 6 6 6 6 6 ...
## $ carat : num [1:1735] 0.29 0.24 0.42 0.32 0.9 0.76 0.81 0.72 1 0.32 ...
## $ clarity: Ord.factor w/ 8 levels "I1"<"SI2"<"SI1"<..: 4 5 2 3 4 7 6 8 2 3 ...
## $ depth : num [1:1735] 62.4 62.5 61.5 62.9 63 58.8 61.9 63 58.2 62.7 ...
## $ table : num [1:1735] 58 57 59 58 58 59 60 57 60 58 ...
## $ price : int [1:1735] 334 355 552 554 2761 2790 2795 2795 2795 554 ...
## $ x : num [1:1735] 4.2 3.97 4.78 4.35 6.16 6 5.91 5.72 6.61 4.37 ...
## $ y : num [1:1735] 4.23 3.94 4.84 4.33 6.12 5.94 5.86 5.7 6.55 4.34 ...
## $ z : num [1:1735] 2.63 2.47 2.96 2.73 3.87 3.51 3.64 3.6 3.83 2.73 ...
# Only the two selected (cut, color) cells should be non-zero.
table(DF2[["cut"]], DF2[["color"]])
##
## D E F G H I J
## Fair 0 0 0 0 0 0 0
## Good 0 0 0 0 0 0 307
## Very Good 0 0 0 0 0 0 0
## Premium 0 0 0 0 0 1428 0
## Ideal 0 0 0 0 0 0 0
library(nycflights13)
# Structure of the flights table.
nycflights13::flights %>% str()
## tibble [336,776 × 19] (S3: tbl_df/tbl/data.frame)
## $ year : int [1:336776] 2013 2013 2013 2013 2013 2013 2013 2013 2013 2013 ...
## $ month : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ day : int [1:336776] 1 1 1 1 1 1 1 1 1 1 ...
## $ dep_time : int [1:336776] 517 533 542 544 554 554 555 557 557 558 ...
## $ sched_dep_time: int [1:336776] 515 529 540 545 600 558 600 600 600 600 ...
## $ dep_delay : num [1:336776] 2 4 2 -1 -6 -4 -5 -3 -3 -2 ...
## $ arr_time : int [1:336776] 830 850 923 1004 812 740 913 709 838 753 ...
## $ sched_arr_time: int [1:336776] 819 830 850 1022 837 728 854 723 846 745 ...
## $ arr_delay : num [1:336776] 11 20 33 -18 -25 12 19 -14 -8 8 ...
## $ carrier : chr [1:336776] "UA" "UA" "AA" "B6" ...
## $ flight : int [1:336776] 1545 1714 1141 725 461 1696 507 5708 79 301 ...
## $ tailnum : chr [1:336776] "N14228" "N24211" "N619AA" "N804JB" ...
## $ origin : chr [1:336776] "EWR" "LGA" "JFK" "JFK" ...
## $ dest : chr [1:336776] "IAH" "IAH" "MIA" "BQN" ...
## $ air_time : num [1:336776] 227 227 160 183 116 150 158 53 140 138 ...
## $ distance : num [1:336776] 1400 1416 1089 1576 762 ...
## $ hour : num [1:336776] 5 5 5 5 6 5 6 6 6 6 ...
## $ minute : num [1:336776] 15 29 40 45 0 58 0 0 0 0 ...
## $ time_hour : POSIXct[1:336776], format: "2013-01-01 05:00:00" "2013-01-01 05:00:00" ...
# Nest the flights into one row per tail number.
flights1 <- flights %>%
  group_by(tailnum) %>%
  nest() %>%
  ungroup()
flights1 %>% head()
## # A tibble: 6 × 2
## tailnum data
## <chr> <list>
## 1 N14228 <tibble [111 × 18]>
## 2 N24211 <tibble [130 × 18]>
## 3 N619AA <tibble [24 × 18]>
## 4 N804JB <tibble [219 × 18]>
## 5 N668DN <tibble [49 × 18]>
## 6 N39463 <tibble [107 × 18]>
# Keep nested rows 4 and 5 (two tail numbers) and expand their flights.
flights2 <- flights %>%
  group_by(tailnum) %>%
  nest() %>%
  ungroup() %>%
  slice(4:5) %>%
  unnest(cols = c(data))
# Number of flights per selected tail number.
table(flights2[["tailnum"]])
##
## N668DN N804JB
## 49 219
# Sample whole aircraft: pick 10 tail numbers at random and keep all of
# their flights.
flights3 <- flights %>%
  group_by(tailnum) %>%
  nest() %>%
  ungroup() %>%
  sample_n(size = 10) %>% # sample a number of rows
  unnest(cols = c(data))
# Number of flights per sampled tail number.
table(flights3[["tailnum"]])
##
## N11535 N355JB N359NW N466WN N515UA N5EGAA N683BR N76522 N8611A N925WN
## 232 282 80 38 81 41 3 122 50 31
# Column-wise preview of the sampled flights.
flights3 %>% tibble::glimpse()
## Rows: 960
## Columns: 19
## $ tailnum <chr> "N76522", "N76522", "N76522", "N76522", "N76522", "N765…
## $ year <int> 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2013, 2…
## $ month <int> 1, 1, 1, 1, 1, 1, 1, 1, 1, 10, 10, 10, 10, 10, 10, 11, …
## $ day <int> 1, 3, 4, 6, 7, 11, 20, 21, 23, 3, 8, 10, 16, 24, 26, 3,…
## $ dep_time <int> 826, 532, 1627, 1922, 2116, 830, 851, 1906, 839, 1858, …
## $ sched_dep_time <int> 817, 530, 1600, 1915, 2035, 830, 840, 1905, 840, 1901, …
## $ dep_delay <dbl> 9, 2, 27, 7, 41, 0, 11, 1, -1, -3, -3, 26, -6, 1, -3, 5…
## $ arr_time <int> 1145, 851, 1907, 2233, 2220, 1201, 1156, 2222, 1146, 22…
## $ sched_arr_time <int> 1158, 831, 1912, 2221, 2154, 1210, 1151, 2235, 1143, 22…
## $ arr_delay <dbl> -13, 20, -5, 12, 26, -9, 5, -13, 3, -2, -5, -1, -16, -1…
## $ carrier <chr> "UA", "UA", "UA", "UA", "UA", "UA", "UA", "UA", "UA", "…
## $ flight <int> 1480, 1136, 1200, 1607, 1243, 1480, 1626, 1054, 1626, 1…
## $ origin <chr> "EWR", "LGA", "EWR", "EWR", "EWR", "EWR", "EWR", "EWR",…
## $ dest <chr> "SFO", "IAH", "SAN", "PBI", "BOS", "SFO", "SAN", "SFO",…
## $ air_time <dbl> 357, 241, 314, 159, 42, 356, 339, 351, 341, 331, 104, 2…
## $ distance <dbl> 2565, 1416, 2425, 1023, 200, 2565, 2425, 2565, 2425, 24…
## $ hour <dbl> 8, 5, 16, 19, 20, 8, 8, 19, 8, 19, 17, 19, 20, 16, 9, 1…
## $ minute <dbl> 17, 30, 0, 15, 35, 30, 40, 5, 40, 1, 0, 34, 22, 42, 7, …
## $ time_hour <dttm> 2013-01-01 08:00:00, 2013-01-03 05:00:00, 2013-01-04 1…
# Sample whole aircraft again, this time keeping a random 10% of the tail
# numbers together with all of their flights.
flights4 <- flights %>%
  group_by(tailnum) %>%
  nest() %>%
  ungroup() %>%
  sample_frac(size = .1) %>% # sample a percentage
  unnest(cols = c(data))
# Number of distinct tail numbers in the full data (table() leaves out NA).
flights$tailnum %>% table() %>% length()
## [1] 4043
# Number of distinct tail numbers in the 10% sample.
flights4$tailnum %>% table() %>% length()
## [1] 404
At the moment, these references are not in any particular order.