# install.packages('dplyr') install.packages('tidyr')
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
add_rownames(df, var = “rowname”)
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
head(mtcars %>% tbl_df())
## Source: local data frame [6 x 11]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 6 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
head(mtcars %>% add_rownames())
## Source: local data frame [6 x 12]
##
## rowname mpg cyl disp hp drat wt qsec vs am
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1
## 2 Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1
## 3 Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1
## 4 Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0
## 5 Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0
## 6 Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0
## Variables not shown: gear <dbl>, carb <dbl>.
all.equal(target, current, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE, …)
all.equal(target, current, ignore_col_order = TRUE, ignore_row_order = TRUE, convert = FALSE, …)
scramble <- function(x) x[sample(nrow(x)), sample(ncol(x))]
mtcars_df <- tbl_df(mtcars)
all.equal(target = mtcars_df, current = scramble(mtcars_df))
## [1] TRUE
all.equal(target = mtcars_df, current = scramble(mtcars_df), ignore_col_order = FALSE)
## [1] "Column names same but in different order"
all.equal(target = mtcars_df, current = scramble(mtcars_df), ignore_row_order = FALSE)
## [1] "Attributes: < Component \"row.names\": Modes: numeric, character >"
## [2] "Attributes: < Component \"row.names\": target is numeric, current is character >"
## [3] "Component \"mpg\": Mean relative difference: 0.3596205"
## [4] "Component \"cyl\": Mean relative difference: 0.5614035"
## [5] "Component \"disp\": Mean relative difference: 0.7018268"
## [6] "Component \"hp\": Mean relative difference: 0.521815"
## [7] "Component \"drat\": Mean relative difference: 0.2149861"
## [8] "Component \"wt\": Mean relative difference: 0.3676665"
## [9] "Component \"qsec\": Mean relative difference: 0.1184257"
## [10] "Component \"vs\": Mean relative difference: 2"
## [11] "Component \"am\": Mean relative difference: 2"
## [12] "Component \"gear\": Mean relative difference: 0.3448276"
## [13] "Component \"carb\": Mean relative difference: 0.7297297"
arrange(.data, …)
mtcars[1:5, ]
## mpg cyl disp hp drat wt qsec vs am gear carb
## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
arrange(.data = mtcars[1:5, ], cyl, disp)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
## 2 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 3 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
arrange(.data = mtcars[1:5, ], desc(cyl), desc(disp))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 2 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 3 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 4 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 5 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
arrange(.data = mtcars[1:5, ], desc(cyl, disp))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 2 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 3 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 5 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
arrange(.data = mtcars[1:5, ], desc(disp))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2
## 2 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1
## 3 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4
## 4 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4
## 5 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1
between(x, left, right)
x <- rnorm(10)
x
## [1] 0.7785705 -0.4961340 0.8021603 -1.3345285 -1.0332436 -0.4169530
## [7] -0.2867346 -0.6100643 0.5070427 -0.6159983
between(x = x, left = -1, right = 1)
## [1] TRUE TRUE TRUE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
x[between(x = x, left = -1, right = 1)]
## [1] 0.7785705 -0.4961340 0.8021603 -0.4169530 -0.2867346 -0.6100643
## [7] 0.5070427 -0.6159983
by_cyl <- group_by(mtcars, cyl)
do(by_cyl, head(., 2))
## Source: local data frame [6 x 11]
## Groups: cyl [3]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 4 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
models <- by_cyl %>% do(mod = lm(mpg ~ disp, data = .))
models[[2]]
## [[1]]
##
## Call:
## lm(formula = mpg ~ disp, data = .)
##
## Coefficients:
## (Intercept) disp
## 40.8720 -0.1351
##
##
## [[2]]
##
## Call:
## lm(formula = mpg ~ disp, data = .)
##
## Coefficients:
## (Intercept) disp
## 19.081987 0.003605
##
##
## [[3]]
##
## Call:
## lm(formula = mpg ~ disp, data = .)
##
## Coefficients:
## (Intercept) disp
## 22.03280 -0.01963
filter(.data = mtcars, cyl == 8)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 2 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 3 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 4 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 5 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 6 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 7 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 8 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 9 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 10 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 11 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 12 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 13 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 14 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
filter(.data = mtcars, cyl < 6)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 4 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 5 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 6 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 7 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 8 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 9 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 10 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 11 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# Multiple criteria
filter(.data = mtcars, cyl < 6 & vs == 1) # and
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 4 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 5 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 6 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 7 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 8 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 9 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 10 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
filter(.data = mtcars, cyl < 6 | vs == 1) # or
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 3 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 4 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 5 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 6 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 7 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 8 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 9 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 10 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 11 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 12 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 13 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 14 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 15 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
# Multiple arguments are equivalent to and
filter(.data = mtcars, cyl < 6, vs == 1)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 4 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 5 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 6 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 7 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 8 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 9 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 10 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
by_cyl <- group_by(mtcars, cyl)
n_groups(by_cyl)
## [1] 3
group_size(by_cyl)
## [1] 11 7 14
summarise(by_cyl, mean(disp), mean(hp))
## Source: local data frame [3 x 3]
##
## cyl mean(disp) mean(hp)
## <dbl> <dbl> <dbl>
## 1 4 105.1364 82.63636
## 2 6 183.3143 122.28571
## 3 8 353.1000 209.21429
filter(by_cyl, disp == max(disp))
## Source: local data frame [3 x 11]
## Groups: cyl [3]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 3 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
# summarise peels off a single layer of grouping
by_vs_am <- group_by(mtcars, vs, am)
by_vs <- summarise(by_vs_am, n = n())
by_vs
## Source: local data frame [4 x 3]
## Groups: vs [?]
##
## vs am n
## (dbl) (dbl) (int)
## 1 0 0 12
## 2 0 1 6
## 3 1 0 7
## 4 1 1 7
summarise(by_vs, n = sum(n))
## Source: local data frame [2 x 2]
##
## vs n
## <dbl> <int>
## 1 0 18
## 2 1 14
# use ungroup() to remove if not wanted
summarise(ungroup(by_vs), n = sum(n))
## Source: local data frame [1 x 1]
##
## n
## <int>
## 1 32
# You can group by expressions: this is just short-hand for a mutate/rename
# followed by a simple group_by
group_by(mtcars, vsam = vs + am)
## Source: local data frame [32 x 12]
## Groups: vsam [3]
##
## mpg cyl disp hp drat wt qsec vs am gear carb vsam
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 1
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 1
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 2
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 1
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 0
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 1
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 0
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 1
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 1
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 1
## .. ... ... ... ... ... ... ... ... ... ... ... ...
group_by(mtcars, vs2 = vs)
## Source: local data frame [32 x 12]
## Groups: vs2 [2]
##
## mpg cyl disp hp drat wt qsec vs am gear carb vs2
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 0
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 0
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 1
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 1
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 0
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 1
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 0
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 1
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 1
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 1
## .. ... ... ... ... ... ... ... ... ... ... ... ...
head(mutate(mtcars, displ_l = disp/61.0237, displ_2 = displ_l/2))
## mpg cyl disp hp drat wt qsec vs am gear carb displ_l displ_2
## 1 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 2.621932 1.3109661
## 2 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 2.621932 1.3109661
## 3 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 1.769804 0.8849021
## 4 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 4.227866 2.1139328
## 5 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 5.899347 2.9496737
## 6 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1 3.687092 1.8435460
head(transmute(mtcars, displ_l = disp/61.0237, displ_2 = displ_l/2))
## displ_l displ_2
## 1 2.621932 1.3109661
## 2 2.621932 1.3109661
## 3 1.769804 0.8849021
## 4 4.227866 2.1139328
## 5 5.899347 2.9496737
## 6 3.687092 1.8435460
df <- data.frame(year = 2000:2005, value = (0:5)^2)
scrambled <- df[sample(nrow(df)), ]
scrambled
## year value
## 3 2002 4
## 2 2001 1
## 4 2003 9
## 1 2000 0
## 5 2004 16
## 6 2005 25
wrong <- mutate(scrambled, running = cumsum(value))
wrong
## year value running
## 1 2002 4 4
## 2 2001 1 5
## 3 2003 9 14
## 4 2000 0 14
## 5 2004 16 30
## 6 2005 25 55
arrange(wrong, year)
## year value running
## 1 2000 0 14
## 2 2001 1 5
## 3 2002 4 4
## 4 2003 9 14
## 5 2004 16 30
## 6 2005 25 55
right <- mutate(scrambled, running = order_by(year, cumsum(value)))
right
## year value running
## 1 2002 4 5
## 2 2001 1 1
## 3 2003 9 14
## 4 2000 0 0
## 5 2004 16 30
## 6 2005 25 55
arrange(right, year)
## year value running
## 1 2000 0 0
## 2 2001 1 1
## 3 2002 4 5
## 4 2003 9 14
## 5 2004 16 30
## 6 2005 25 55
df <- expand.grid(x = 1:3, y = 3:1)
df
## x y
## 1 1 3
## 2 2 3
## 3 3 3
## 4 1 2
## 5 2 2
## 6 3 2
## 7 1 1
## 8 2 1
## 9 3 1
rowwise(df)
## Source: local data frame [9 x 2]
## Groups: <by row>
##
## x y
## (int) (int)
## 1 1 3
## 2 2 3
## 3 3 3
## 4 1 2
## 5 2 2
## 6 3 2
## 7 1 1
## 8 2 1
## 9 3 1
df %>% rowwise() %>% do(i = seq(.$x, .$y))
## Source: local data frame [9 x 1]
## Groups: <by row>
##
## i
## (chr)
## 1 <int[3]>
## 2 <int[2]>
## 3 <int[1]>
## 4 <int[2]>
## 5 <int[1]>
## 6 <int[2]>
## 7 <int[1]>
## 8 <int[2]>
## 9 <int[3]>
用法:
# Sample fixed number per group
sample_n(mtcars, 10)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
sample_n(mtcars, 10, replace = TRUE)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
sample_n(mtcars, 10, weight = mpg)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
by_cyl <- mtcars %>% group_by(cyl)
sample_n(by_cyl, 3)
## Source: local data frame [9 x 11]
## Groups: cyl [3]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 2 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 4 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 5 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 6 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 7 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 8 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 9 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
sample_n(by_cyl, 10, replace = TRUE)
## Source: local data frame [30 x 11]
## Groups: cyl [3]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 2 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 3 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 4 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 5 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 6 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 7 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 8 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 9 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 10 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## .. ... ... ... ... ... ... ... ... ... ... ...
sample_n(by_cyl, 3, weight = mpg/mean(mpg))
## Source: local data frame [9 x 11]
## Groups: cyl [3]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 2 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 3 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 4 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 5 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 7 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 8 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 9 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
# Sample fixed fraction per group Default is to sample all data = randomly
# resample rows sample_frac(mtcars)
sample_frac(mtcars, 0.1)
## mpg cyl disp hp drat wt qsec vs am gear carb
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
# sample_frac(mtcars, 1.5, replace = TRUE)
sample_frac(mtcars, 0.1, weight = 1/mpg)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
sample_frac(by_cyl, 0.2)
## Source: local data frame [6 x 11]
## Groups: cyl [3]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 2 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 3 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 4 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 5 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 6 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
# sample_frac(by_cyl, 1, replace = TRUE)
iris <- tbl_df(iris[1:5, ]) # so it prints a little nicer
select(iris, starts_with("Petal"))
## Source: local data frame [5 x 2]
##
## Petal.Length Petal.Width
## <dbl> <dbl>
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
select(iris, ends_with("Width"))
## Source: local data frame [5 x 2]
##
## Sepal.Width Petal.Width
## <dbl> <dbl>
## 1 3.5 0.2
## 2 3.0 0.2
## 3 3.2 0.2
## 4 3.1 0.2
## 5 3.6 0.2
select(iris, contains("etal"))
## Source: local data frame [5 x 2]
##
## Petal.Length Petal.Width
## <dbl> <dbl>
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
select(iris, matches(".t."))
## Source: local data frame [5 x 4]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
select(iris, Petal.Length, Petal.Width)
## Source: local data frame [5 x 2]
##
## Petal.Length Petal.Width
## <dbl> <dbl>
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
vars <- c("Petal.Length", "Petal.Width")
select(iris, one_of(vars))
## Source: local data frame [5 x 2]
##
## Petal.Length Petal.Width
## <dbl> <dbl>
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
df <- as.data.frame(matrix(runif(60), nrow = 6))
df <- tbl_df(df[c(3, 4, 7, 1, 9, 8, 5, 2, 6, 10)])
select(df, V8:V6)
## Source: local data frame [6 x 4]
##
## V8 V5 V2 V6
## <dbl> <dbl> <dbl> <dbl>
## 1 0.50784733 0.3705222928 0.5708563 0.2725626
## 2 0.42939419 0.4695904597 0.6392353 0.3995365
## 3 0.93737482 0.0724870036 0.4107849 0.6763092
## 4 0.11792486 0.9691334283 0.9426179 0.8371149
## 5 0.05031189 0.0007885925 0.1192577 0.2032340
## 6 0.15951781 0.3957703849 0.4203731 0.2628636
select(df, num_range("V", 4:6))
## Source: local data frame [6 x 3]
##
## V4 V5 V6
## <dbl> <dbl> <dbl>
## 1 0.97624757 0.3705222928 0.2725626
## 2 0.08484239 0.4695904597 0.3995365
## 3 0.58344759 0.0724870036 0.6763092
## 4 0.21230545 0.9691334283 0.8371149
## 5 0.15639510 0.0007885925 0.2032340
## 6 0.80585008 0.3957703849 0.2628636
# Drop variables
select(iris, -starts_with("Petal"))
## Source: local data frame [5 x 3]
##
## Sepal.Length Sepal.Width Species
## <dbl> <dbl> <fctr>
## 1 5.1 3.5 setosa
## 2 4.9 3.0 setosa
## 3 4.7 3.2 setosa
## 4 4.6 3.1 setosa
## 5 5.0 3.6 setosa
select(iris, -ends_with("Width"))
## Source: local data frame [5 x 3]
##
## Sepal.Length Petal.Length Species
## <dbl> <dbl> <fctr>
## 1 5.1 1.4 setosa
## 2 4.9 1.4 setosa
## 3 4.7 1.3 setosa
## 4 4.6 1.5 setosa
## 5 5.0 1.4 setosa
select(iris, -contains("etal"))
## Source: local data frame [5 x 3]
##
## Sepal.Length Sepal.Width Species
## <dbl> <dbl> <fctr>
## 1 5.1 3.5 setosa
## 2 4.9 3.0 setosa
## 3 4.7 3.2 setosa
## 4 4.6 3.1 setosa
## 5 5.0 3.6 setosa
select(iris, -matches(".t."))
## Source: local data frame [5 x 1]
##
## Species
## <fctr>
## 1 setosa
## 2 setosa
## 3 setosa
## 4 setosa
## 5 setosa
select(iris, -Petal.Length, -Petal.Width)
## Source: local data frame [5 x 3]
##
## Sepal.Length Sepal.Width Species
## <dbl> <dbl> <fctr>
## 1 5.1 3.5 setosa
## 2 4.9 3.0 setosa
## 3 4.7 3.2 setosa
## 4 4.6 3.1 setosa
## 5 5.0 3.6 setosa
# Rename variables: * select() keeps only the variables you specify
select(iris, petal_length = Petal.Length)
## Source: local data frame [5 x 1]
##
## petal_length
## <dbl>
## 1 1.4
## 2 1.4
## 3 1.3
## 4 1.5
## 5 1.4
# Renaming multiple variables uses a prefix:
select(iris, petal = starts_with("Petal"))
## Source: local data frame [5 x 2]
##
## petal1 petal2
## <dbl> <dbl>
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
# Reorder variables: keep the variable 'Species' in the front
select(iris, Species, everything())
## Source: local data frame [5 x 5]
##
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 5.1 3.5 1.4 0.2
## 2 setosa 4.9 3.0 1.4 0.2
## 3 setosa 4.7 3.2 1.3 0.2
## 4 setosa 4.6 3.1 1.5 0.2
## 5 setosa 5.0 3.6 1.4 0.2
# * rename() keeps all variables
rename(iris, petal_length = Petal.Length)
## Source: local data frame [5 x 5]
##
## Sepal.Length Sepal.Width petal_length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fctr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3.0 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5.0 3.6 1.4 0.2 setosa
first <- 1:5
second <- 3:8
intersect(first, second)
## [1] 3 4 5
union(first, second)
## [1] 1 2 3 4 5 6 7 8
setdiff(first, second)
## [1] 1 2
setdiff(second, first)
## [1] 6 7 8
setequal(first, second)
## [1] FALSE
setequal(1:5, 5:1)
## [1] TRUE
# install.packages('data.table')
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, last
summarise(mtcars, mean(disp))
## mean(disp)
## 1 230.7219
summarise(group_by(mtcars, cyl), mean(disp))
## Source: local data frame [3 x 2]
##
## cyl mean(disp)
## <dbl> <dbl>
## 1 4 105.1364
## 2 6 183.3143
## 3 8 353.1000
summarise(group_by(mtcars, cyl), m = mean(disp), sd = sd(disp))
## Source: local data frame [3 x 3]
##
## cyl m sd
## <dbl> <dbl> <dbl>
## 1 4 105.1364 26.87159
## 2 6 183.3143 41.56246
## 3 8 353.1000 67.77132
# With data frames, you can create and immediately use summaries
by_cyl <- mtcars %>% group_by(cyl)
by_cyl %>% summarise(a = n(), b = a + 1)
## Source: local data frame [3 x 3]
##
## cyl a b
## <dbl> <int> <dbl>
## 1 4 11 12
## 2 6 7 8
## 3 8 14 15
# One function
by_species <- iris %>% group_by(Species)
by_species %>% summarise_each(funs(length))
## Source: local data frame [1 x 5]
##
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fctr> <int> <int> <int> <int>
## 1 setosa 5 5 5 5
by_species %>% summarise_each(funs(mean))
## Source: local data frame [1 x 5]
##
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 4.86 3.28 1.4 0.2
by_species %>% summarise_each(funs(mean), Petal.Width)
## Source: local data frame [1 x 2]
##
## Species Petal.Width
## <fctr> <dbl>
## 1 setosa 0.2
by_species %>% summarise_each(funs(mean), matches("Width"))
## Source: local data frame [1 x 3]
##
## Species Sepal.Width Petal.Width
## <fctr> <dbl> <dbl>
## 1 setosa 3.28 0.2
by_species %>% mutate_each(funs(half = ./2))
## Source: local data frame [5 x 5]
## Groups: Species [1]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## (dbl) (dbl) (dbl) (dbl) (fctr)
## 1 2.55 1.75 0.70 0.1 setosa
## 2 2.45 1.50 0.70 0.1 setosa
## 3 2.35 1.60 0.65 0.1 setosa
## 4 2.30 1.55 0.75 0.1 setosa
## 5 2.50 1.80 0.70 0.1 setosa
by_species %>% mutate_each(funs(min_rank))
## Source: local data frame [5 x 5]
## Groups: Species [1]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## (int) (int) (int) (int) (fctr)
## 1 5 4 2 1 setosa
## 2 3 1 2 1 setosa
## 3 2 3 1 1 setosa
## 4 1 2 5 1 setosa
## 5 4 5 2 1 setosa
# Two functions
by_species %>% summarise_each(funs(min, max))
## Source: local data frame [1 x 9]
##
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min
## <fctr> <dbl> <dbl> <dbl>
## 1 setosa 4.6 3 1.3
## Variables not shown: Petal.Width_min <dbl>, Sepal.Length_max <dbl>,
## Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>.
by_species %>% summarise_each(funs(min, max), Petal.Width, Sepal.Width)
## Source: local data frame [1 x 5]
##
## Species Petal.Width_min Sepal.Width_min Petal.Width_max Sepal.Width_max
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 0.2 3 0.2 3.6
by_species %>% summarise_each(funs(min, max), matches("Width"))
## Source: local data frame [1 x 5]
##
## Species Sepal.Width_min Petal.Width_min Sepal.Width_max Petal.Width_max
## <fctr> <dbl> <dbl> <dbl> <dbl>
## 1 setosa 3 0.2 3.6 0.2
# Alternative function specification
iris %>% summarise_each(funs(ul = length(unique(.))))
## Source: local data frame [1 x 5]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <int> <int> <int> <int> <int>
## 1 5 5 3 1 1
by_species %>% summarise_each(funs(ul = length(unique(.))))
## Source: local data frame [1 x 5]
##
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## <fctr> <int> <int> <int> <int>
## 1 setosa 5 5 3 1
by_species %>% summarise_each(c("min", "max"))
## Source: local data frame [1 x 9]
##
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min
## <fctr> <dbl> <dbl> <dbl>
## 1 setosa 4.6 3 1.3
## Variables not shown: Petal.Width_min <dbl>, Sepal.Length_max <dbl>,
## Sepal.Width_max <dbl>, Petal.Length_max <dbl>, Petal.Width_max <dbl>.
# Alternative variable specification
summarise_each_(iris, funs(max), names(iris)[-5])
## Source: local data frame [1 x 4]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.6 1.5 0.2
summarise_each_(iris, funs(max), list(quote(-Species)))
## Source: local data frame [1 x 4]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## <dbl> <dbl> <dbl> <dbl>
## 1 5.1 3.6 1.5 0.2
top_n(x, n, wt)
gb_cyl <- group_by(mtcars_df, cyl)
top_n(gb_cyl, 2, desc(disp))
## Source: local data frame [8 x 11]
## Groups: cyl [3]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 4 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 5 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 6 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 7 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 8 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
top_n(gb_cyl, 2, disp)
## Source: local data frame [6 x 11]
## Groups: cyl [3]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl) (dbl)
## 1 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 2 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 3 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 4 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 5 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 6 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
x <- c(5, 1, 3, 2, 2, NA)
row_number(x)
## [1] 5 1 4 2 3 NA
min_rank(x)
## [1] 5 1 4 2 2 NA
dense_rank(x)
## [1] 4 1 3 2 2 NA
percent_rank(x)
## [1] 1.00 0.00 0.75 0.25 0.25 NA
cume_dist(x)
## [1] 1.0 0.2 0.8 0.6 0.6 NA
ntile(x, 2)
## [1] 2 1 2 1 1 NA
ntile(runif(100), 10)
## [1] 9 5 4 5 5 9 6 9 8 7 3 2 7 10 7 8 9 10 3 2 9 5 4
## [24] 8 6 5 6 2 6 4 8 4 9 7 2 1 2 2 2 3 8 9 6 1 10 8
## [47] 5 10 3 9 4 6 1 10 10 10 6 3 1 10 1 8 3 1 10 8 5 7 8
## [70] 1 2 2 4 4 6 7 8 1 4 4 5 6 5 10 1 4 7 6 5 3 9 7
## [93] 2 3 3 7 7 1 3 9
complete(data, …, fill = list())
library(dplyr)
library(tidyr)
df <- data_frame(group = c(1:2, 1), item_id = c(1:2, 2), item_name = c("a",
"b", "b"), value1 = 1:3, value2 = 4:6)
df
## Source: local data frame [3 x 5]
##
## group item_id item_name value1 value2
## <dbl> <dbl> <chr> <int> <int>
## 1 1 1 a 1 4
## 2 2 2 b 2 5
## 3 1 2 b 3 6
df %>% complete(group, nesting(item_id, item_name))
## Source: local data frame [4 x 5]
##
## group item_id item_name value1 value2
## <dbl> <dbl> <chr> <int> <int>
## 1 1 1 a 1 4
## 2 1 2 b 3 6
## 3 2 1 a NA NA
## 4 2 2 b 2 5
# You can also choose to fill in missing values
df %>% complete(group, nesting(item_id, item_name), fill = list(value1 = 0,
value2 = 99))
## Source: local data frame [4 x 5]
##
## group item_id item_name value1 value2
## <dbl> <dbl> <chr> <dbl> <dbl>
## 1 1 1 a 1 4
## 2 1 2 b 3 6
## 3 2 1 a 0 99
## 4 2 2 b 2 5
df <- data_frame(year = c(2010, 2010, 2010, 2010, 2012, 2012, 2012), qtr = c(1,
2, 3, 4, 1, 2, 3), return = rnorm(7))
df
## Source: local data frame [7 x 3]
##
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2010 1 0.4171819
## 2 2010 2 -2.1309442
## 3 2010 3 -0.7005132
## 4 2010 4 0.1543173
## 5 2012 1 -0.3485685
## 6 2012 2 0.3815484
## 7 2012 3 0.7407655
df %>% expand(year, qtr)
## Source: local data frame [8 x 2]
##
## year qtr
## <dbl> <dbl>
## 1 2010 1
## 2 2010 2
## 3 2010 3
## 4 2010 4
## 5 2012 1
## 6 2012 2
## 7 2012 3
## 8 2012 4
df %>% expand(year = 2010:2012, qtr)
## Source: local data frame [12 x 2]
##
## year qtr
## <int> <dbl>
## 1 2010 1
## 2 2010 2
## 3 2010 3
## 4 2010 4
## 5 2011 1
## 6 2011 2
## 7 2011 3
## 8 2011 4
## 9 2012 1
## 10 2012 2
## 11 2012 3
## 12 2012 4
df %>% expand(year = full_seq(year, 1), qtr)
## Source: local data frame [12 x 2]
##
## year qtr
## <dbl> <dbl>
## 1 2010 1
## 2 2010 2
## 3 2010 3
## 4 2010 4
## 5 2011 1
## 6 2011 2
## 7 2011 3
## 8 2011 4
## 9 2012 1
## 10 2012 2
## 11 2012 3
## 12 2012 4
df %>% complete(year = full_seq(year, 1), qtr)
## Source: local data frame [12 x 3]
##
## year qtr return
## <dbl> <dbl> <dbl>
## 1 2010 1 0.4171819
## 2 2010 2 -2.1309442
## 3 2010 3 -0.7005132
## 4 2010 4 0.1543173
## 5 2011 1 NA
## 6 2011 2 NA
## 7 2011 3 NA
## 8 2011 4 NA
## 9 2012 1 -0.3485685
## 10 2012 2 0.3815484
## 11 2012 3 0.7407655
## 12 2012 4 NA
extract(data, col, into, regex = “([[:alnum:]]+)”, remove = TRUE, convert = FALSE, …)
df <- data.frame(x = c(NA, "a-b", "a-d", "b-c", "d-e"))
df
## x
## 1 <NA>
## 2 a-b
## 3 a-d
## 4 b-c
## 5 d-e
df %>% extract(x, "A")
## A
## 1 <NA>
## 2 a
## 3 a
## 4 b
## 5 d
df %>% extract(x, c("A", "B"), "([[:alnum:]]+)-([[:alnum:]]+)")
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
## 5 d e
# If no match, NA:
df %>% extract(x, c("A", "B"), "([a-d]+)-([a-d]+)")
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
## 5 <NA> <NA>
extract_numeric("$1,200.34")
## [1] 1200.34
extract_numeric("-2%")
## [1] -2
extract_numeric("-2-2")
## [1] NA
extract_numeric("12abc34")
## [1] 1234
fill(data, …, .direction = c(“down”, “up”))
df <- data.frame(Month = 1:12, Year = c(2000, rep(NA, 4), 2005, rep(NA, 3),
2006, NA, 2010))
df
## Month Year
## 1 1 2000
## 2 2 NA
## 3 3 NA
## 4 4 NA
## 5 5 NA
## 6 6 2005
## 7 7 NA
## 8 8 NA
## 9 9 NA
## 10 10 2006
## 11 11 NA
## 12 12 2010
df %>% fill(Year, .direction = "down")
## Month Year
## 1 1 2000
## 2 2 2000
## 3 3 2000
## 4 4 2000
## 5 5 2000
## 6 6 2005
## 7 7 2005
## 8 8 2005
## 9 9 2005
## 10 10 2006
## 11 11 2006
## 12 12 2010
df %>% fill(Year, .direction = "up")
## Month Year
## 1 1 2000
## 2 2 2005
## 3 3 2005
## 4 4 2005
## 5 5 2005
## 6 6 2005
## 7 7 2006
## 8 8 2006
## 9 9 2006
## 10 10 2006
## 11 11 2010
## 12 12 2010
full_seq(x, period, tol = 1e-06)
full_seq(c(1, 2, 4, 5, 10), 1)
## [1] 1 2 3 4 5 6 7 8 9 10
full_seq(c(2, 4, 10), 2)
## [1] 2 4 6 8 10
gather(data, key, value, …, na.rm = FALSE, convert = FALSE, factor_key = FALSE)
stocks <- data_frame(time = as.Date("2009-01-01") + 0:3, X = rnorm(4, 0, 1),
Y = rnorm(4, 0, 2), Z = rnorm(4, 0, 4))
stocks
## Source: local data frame [4 x 4]
##
## time X Y Z
## <date> <dbl> <dbl> <dbl>
## 1 2009-01-01 -2.9642095 0.493950 10.318943
## 2 2009-01-02 -0.7862788 -3.017822 2.459886
## 3 2009-01-03 1.6122230 -2.072584 -3.610145
## 4 2009-01-04 -1.1993662 -4.241418 3.852937
gather(data = stocks, key = stock, value = price, -time)
## Source: local data frame [12 x 3]
##
## time stock price
## <date> <chr> <dbl>
## 1 2009-01-01 X -2.9642095
## 2 2009-01-02 X -0.7862788
## 3 2009-01-03 X 1.6122230
## 4 2009-01-04 X -1.1993662
## 5 2009-01-01 Y 0.4939500
## 6 2009-01-02 Y -3.0178216
## 7 2009-01-03 Y -2.0725837
## 8 2009-01-04 Y -4.2414183
## 9 2009-01-01 Z 10.3189432
## 10 2009-01-02 Z 2.4598863
## 11 2009-01-03 Z -3.6101455
## 12 2009-01-04 Z 3.8529366
mini_iris <- iris[c(1, 51, 101), ]
mini_iris
## Source: local data frame [3 x 5]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fctr>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 NA NA NA NA NA
## 3 NA NA NA NA NA
gather(mini_iris, key = flower_att, value = measurement, Sepal.Length, Sepal.Width,
Petal.Length, Petal.Width)
## Source: local data frame [12 x 3]
##
## Species flower_att measurement
## <fctr> <chr> <dbl>
## 1 setosa Sepal.Length 5.1
## 2 NA Sepal.Length NA
## 3 NA Sepal.Length NA
## 4 setosa Sepal.Width 3.5
## 5 NA Sepal.Width NA
## 6 NA Sepal.Width NA
## 7 setosa Petal.Length 1.4
## 8 NA Petal.Length NA
## 9 NA Petal.Length NA
## 10 setosa Petal.Width 0.2
## 11 NA Petal.Width NA
## 12 NA Petal.Width NA
gather(mini_iris, key = flower_att, value = measurement, -Species)
## Source: local data frame [12 x 3]
##
## Species flower_att measurement
## <fctr> <chr> <dbl>
## 1 setosa Sepal.Length 5.1
## 2 NA Sepal.Length NA
## 3 NA Sepal.Length NA
## 4 setosa Sepal.Width 3.5
## 5 NA Sepal.Width NA
## 6 NA Sepal.Width NA
## 7 setosa Petal.Length 1.4
## 8 NA Petal.Length NA
## 9 NA Petal.Length NA
## 10 setosa Petal.Width 0.2
## 11 NA Petal.Width NA
## 12 NA Petal.Width NA
mini_iris <- iris %>% group_by(Species) %>% slice(1)
mini_iris
## Source: local data frame [1 x 5]
## Groups: Species [1]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## (dbl) (dbl) (dbl) (dbl) (fctr)
## 1 5.1 3.5 1.4 0.2 setosa
mini_iris %>% gather(key = flower_att, value = measurement, -Species)
## Source: local data frame [4 x 3]
## Groups: Species [1]
##
## Species flower_att measurement
## (fctr) (chr) (dbl)
## 1 setosa Sepal.Length 5.1
## 2 setosa Sepal.Width 3.5
## 3 setosa Petal.Length 1.4
## 4 setosa Petal.Width 0.2
nest(data, …, .key = data)
iris %>% nest(-Species)
## Source: local data frame [1 x 2]
##
## Species data
## <fctr> <list>
## 1 setosa <tbl_df [5,4]>
chickwts %>% nest(weight)
## Source: local data frame [6 x 2]
##
## feed data
## <fctr> <list>
## 1 horsebean <tbl_df [10,1]>
## 2 linseed <tbl_df [12,1]>
## 3 soybean <tbl_df [14,1]>
## 4 sunflower <tbl_df [12,1]>
## 5 meatmeal <tbl_df [11,1]>
## 6 casein <tbl_df [12,1]>
replace_na(data, replace = list(), …)
library(dplyr)
df <- data_frame(x = c(1, 2, NA), y = c("a", NA, "b"))
df %>% replace_na(list(x = 0, y = "unknown"))
## Source: local data frame [3 x 2]
##
## x y
## <dbl> <chr>
## 1 1 a
## 2 2 unknown
## 3 0 b
separate(data, col, into, sep = “[^[:alnum:]]+”, remove = TRUE, convert = FALSE, extra = “warn”, fill = “warn”, …)
library(dplyr)
df <- data.frame(x = c(NA, "a.b", "a.d", "b.c"))
df %>% separate(x, c("A", "B"))
## A B
## 1 <NA> <NA>
## 2 a b
## 3 a d
## 4 b c
# If every row doesn't split into the same number of pieces, use the extra
# and file arguments to control what happens
df <- data.frame(x = c("a", "a b", "a b c", NA))
df %>% separate(x, c("a", "b"))
## a b
## 1 a <NA>
## 2 a b
## 3 a b
## 4 <NA> <NA>
# The same behaviour but no warnings
df %>% separate(x, c("a", "b"), extra = "drop", fill = "right")
## a b
## 1 a <NA>
## 2 a b
## 3 a b
## 4 <NA> <NA>
# Another option:
df %>% separate(x, c("a", "b"), extra = "merge", fill = "left")
## a b
## 1 <NA> a
## 2 a b
## 3 a b c
## 4 <NA> <NA>
# If only want to split specified number of times use extra = 'merge'
df <- data.frame(x = c("x: 123", "y: error: 7"))
df %>% separate(x, c("key", "value"), ": ", extra = "merge")
## key value
## 1 x 123
## 2 y error: 7
spread(data, key, value, fill = NA, convert = FALSE, drop = TRUE)
library(dplyr)
stocks <- data.frame(time = as.Date("2009-01-01") + 0:9, X = rnorm(10, 0, 1),
Y = rnorm(10, 0, 2), Z = rnorm(10, 0, 4))
stocksm <- stocks %>% gather(stock, price, -time)
stocksm
## time stock price
## 1 2009-01-01 X -0.52189978
## 2 2009-01-02 X 0.80832212
## 3 2009-01-03 X 1.53476105
## 4 2009-01-04 X 1.40697005
## 5 2009-01-05 X -1.26666165
## 6 2009-01-06 X -0.66619804
## 7 2009-01-07 X -0.27924740
## 8 2009-01-08 X 1.48666964
## 9 2009-01-09 X 1.69425614
## 10 2009-01-10 X -1.35129012
## 11 2009-01-01 Y -1.92650802
## 12 2009-01-02 Y -1.04971015
## 13 2009-01-03 Y -0.06182085
## 14 2009-01-04 Y 0.47333560
## 15 2009-01-05 Y -2.79939690
## 16 2009-01-06 Y -5.98040102
## 17 2009-01-07 Y 0.84574146
## 18 2009-01-08 Y -1.33068578
## 19 2009-01-09 Y 0.35614342
## 20 2009-01-10 Y -2.83740229
## 21 2009-01-01 Z -5.27641988
## 22 2009-01-02 Z 3.39574345
## 23 2009-01-03 Z 4.30022453
## 24 2009-01-04 Z -0.39124420
## 25 2009-01-05 Z -3.19956039
## 26 2009-01-06 Z -2.83736630
## 27 2009-01-07 Z -2.80122389
## 28 2009-01-08 Z -3.04052729
## 29 2009-01-09 Z 1.71237941
## 30 2009-01-10 Z -0.08754712
stocksm %>% spread(stock, price)
## time X Y Z
## 1 2009-01-01 -0.5218998 -1.92650802 -5.27641988
## 2 2009-01-02 0.8083221 -1.04971015 3.39574345
## 3 2009-01-03 1.5347610 -0.06182085 4.30022453
## 4 2009-01-04 1.4069700 0.47333560 -0.39124420
## 5 2009-01-05 -1.2666617 -2.79939690 -3.19956039
## 6 2009-01-06 -0.6661980 -5.98040102 -2.83736630
## 7 2009-01-07 -0.2792474 0.84574146 -2.80122389
## 8 2009-01-08 1.4866696 -1.33068578 -3.04052729
## 9 2009-01-09 1.6942561 0.35614342 1.71237941
## 10 2009-01-10 -1.3512901 -2.83740229 -0.08754712
stocksm %>% spread(time, price)
## stock 2009-01-01 2009-01-02 2009-01-03 2009-01-04 2009-01-05 2009-01-06
## 1 X -0.5218998 0.8083221 1.53476105 1.4069700 -1.266662 -0.666198
## 2 Y -1.9265080 -1.0497102 -0.06182085 0.4733356 -2.799397 -5.980401
## 3 Z -5.2764199 3.3957434 4.30022453 -0.3912442 -3.199560 -2.837366
## 2009-01-07 2009-01-08 2009-01-09 2009-01-10
## 1 -0.2792474 1.486670 1.6942561 -1.35129012
## 2 0.8457415 -1.330686 0.3561434 -2.83740229
## 3 -2.8012239 -3.040527 1.7123794 -0.08754712
# Spread and gather are complements
df <- data.frame(x = c("a", "b"), y = c(3, 4), z = c(5, 6))
df
## x y z
## 1 a 3 5
## 2 b 4 6
df %>% spread(x, y) %>% gather(x, y, a:b, na.rm = TRUE)
## z x y
## 1 5 a 3
## 4 6 b 4
# Use 'convert = TRUE' to produce variables of mixed type
df <- data.frame(row = rep(c(1, 51), each = 3), var = c("Sepal.Length", "Species",
"Species_num"), value = c(5.1, "setosa", 1, 7, "versicolor", 2))
df %>% spread(var, value) %>% str
## 'data.frame': 2 obs. of 4 variables:
## $ row : num 1 51
## $ Sepal.Length: Factor w/ 6 levels "1","2","5.1",..: 3 4
## $ Species : Factor w/ 6 levels "1","2","5.1",..: 5 6
## $ Species_num : Factor w/ 6 levels "1","2","5.1",..: 1 2
df %>% spread(var, value, convert = TRUE) %>% str
## 'data.frame': 2 obs. of 4 variables:
## $ row : num 1 51
## $ Sepal.Length: num 5.1 7
## $ Species : chr "setosa" "versicolor"
## $ Species_num : int 1 2