library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
#group_by(Species) %>% summarise(MIN=min(Sepal.Length),
# MEAN=mean(Sepal.Length),
# MEDIAN=median(Sepal.Length),
# MAX=max(Sepal.Length)
#)
iris %>%
group_by(Species) %>%
summarise_each(funs(min, mean, median, max),
Sepal.Length,Sepal.Width)
## Source: local data frame [3 x 9]
##
## Species Sepal.Length_min Sepal.Width_min Sepal.Length_mean
## 1 setosa 4.3 2.3 5.006
## 2 versicolor 4.9 2.0 5.936
## 3 virginica 4.9 2.2 6.588
## Variables not shown: Sepal.Width_mean (dbl), Sepal.Length_median (dbl),
## Sepal.Width_median (dbl), Sepal.Length_max (dbl), Sepal.Width_max (dbl)
#iris %>%
# group_by(Species) %>%
# summarise_each(funs(min, mean, median, max),
# start_with("Sepal"))
—————————————————
#Examples
# One function
by_species <- iris %>% group_by(Species)
by_species %>% summarise_each(funs(length))
## Source: local data frame [3 x 5]
##
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 50 50 50 50
## 2 versicolor 50 50 50 50
## 3 virginica 50 50 50 50
by_species %>% summarise_each(funs(mean))
## Source: local data frame [3 x 5]
##
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 5.006 3.428 1.462 0.246
## 2 versicolor 5.936 2.770 4.260 1.326
## 3 virginica 6.588 2.974 5.552 2.026
by_species %>% summarise_each(funs(mean), Petal.Width)
## Source: local data frame [3 x 2]
##
## Species Petal.Width
## 1 setosa 0.246
## 2 versicolor 1.326
## 3 virginica 2.026
by_species %>% summarise_each(funs(mean), matches("Width"))
## Source: local data frame [3 x 3]
##
## Species Sepal.Width Petal.Width
## 1 setosa 3.428 0.246
## 2 versicolor 2.770 1.326
## 3 virginica 2.974 2.026
by_species %>% mutate_each(funs(half = . / 2))
## Source: local data frame [150 x 5]
## Groups: Species
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 2.55 1.75 0.70 0.10 setosa
## 2 2.45 1.50 0.70 0.10 setosa
## 3 2.35 1.60 0.65 0.10 setosa
## 4 2.30 1.55 0.75 0.10 setosa
## 5 2.50 1.80 0.70 0.10 setosa
## 6 2.70 1.95 0.85 0.20 setosa
## 7 2.30 1.70 0.70 0.15 setosa
## 8 2.50 1.70 0.75 0.10 setosa
## 9 2.20 1.45 0.70 0.10 setosa
## 10 2.45 1.55 0.75 0.05 setosa
## .. ... ... ... ... ...
by_species %>% mutate_each(funs(min_rank))
## Source: local data frame [150 x 5]
## Groups: Species
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 29 29 12 6 setosa
## 2 17 3 12 6 setosa
## 3 10 13 5 6 setosa
## 4 6 9 25 6 setosa
## 5 21 35 12 6 setosa
## 6 41 45 45 42 setosa
## 7 6 20 12 35 setosa
## 8 21 20 25 6 setosa
## 9 2 2 12 6 setosa
## 10 17 9 25 1 setosa
## .. ... ... ... ... ...
# Two functions
by_species %>% summarise_each(funs(min, max))
## Source: local data frame [3 x 9]
##
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min
## 1 setosa 4.3 2.3 1.0
## 2 versicolor 4.9 2.0 3.0
## 3 virginica 4.9 2.2 4.5
## Variables not shown: Petal.Width_min (dbl), Sepal.Length_max (dbl),
## Sepal.Width_max (dbl), Petal.Length_max (dbl), Petal.Width_max (dbl)
by_species %>% summarise_each(funs(min, max), Petal.Width, Sepal.Width)
## Source: local data frame [3 x 5]
##
## Species Petal.Width_min Sepal.Width_min Petal.Width_max
## 1 setosa 0.1 2.3 0.6
## 2 versicolor 1.0 2.0 1.8
## 3 virginica 1.4 2.2 2.5
## Variables not shown: Sepal.Width_max (dbl)
by_species %>% summarise_each(funs(min, max), matches("Width"))
## Source: local data frame [3 x 5]
##
## Species Sepal.Width_min Petal.Width_min Sepal.Width_max
## 1 setosa 2.3 0.1 4.4
## 2 versicolor 2.0 1.0 3.4
## 3 virginica 2.2 1.4 3.8
## Variables not shown: Petal.Width_max (dbl)
# Alternative function specification
iris %>% summarise_each(funs(ul = length(unique(.))))
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## 1 35 23 43 22 3
by_species %>% summarise_each(funs(ul = length(unique(.))))
## Source: local data frame [3 x 5]
##
## Species Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 setosa 15 16 9 6
## 2 versicolor 21 14 19 9
## 3 virginica 21 13 20 12
by_species %>% summarise_each(c("min", "max"))
## Source: local data frame [3 x 9]
##
## Species Sepal.Length_min Sepal.Width_min Petal.Length_min
## 1 setosa 4.3 2.3 1.0
## 2 versicolor 4.9 2.0 3.0
## 3 virginica 4.9 2.2 4.5
## Variables not shown: Petal.Width_min (dbl), Sepal.Length_max (dbl),
## Sepal.Width_max (dbl), Petal.Length_max (dbl), Petal.Width_max (dbl)
# Alternative variable specification
summarise_each_(iris, funs(max), names(iris)[-5])
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 7.9 4.4 6.9 2.5
summarise_each_(iris, funs(max), list(quote(-Species)))
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 7.9 4.4 6.9 2.5
—————————————————–
#Examples
iris <- tbl_df(iris) # so it prints a little nicer
select(iris, starts_with("Petal"))
## Source: local data frame [150 x 2]
##
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
## 7 1.4 0.3
## 8 1.5 0.2
## 9 1.4 0.2
## 10 1.5 0.1
## .. ... ...
select(iris, ends_with("Width"))
## Source: local data frame [150 x 2]
##
## Sepal.Width Petal.Width
## 1 3.5 0.2
## 2 3.0 0.2
## 3 3.2 0.2
## 4 3.1 0.2
## 5 3.6 0.2
## 6 3.9 0.4
## 7 3.4 0.3
## 8 3.4 0.2
## 9 2.9 0.2
## 10 3.1 0.1
## .. ... ...
select(iris, contains("etal"))
## Source: local data frame [150 x 2]
##
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
## 7 1.4 0.3
## 8 1.5 0.2
## 9 1.4 0.2
## 10 1.5 0.1
## .. ... ...
select(iris, matches(".t."))
## Source: local data frame [150 x 4]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
## 7 4.6 3.4 1.4 0.3
## 8 5.0 3.4 1.5 0.2
## 9 4.4 2.9 1.4 0.2
## 10 4.9 3.1 1.5 0.1
## .. ... ... ... ...
select(iris, Petal.Length, Petal.Width)
## Source: local data frame [150 x 2]
##
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
## 7 1.4 0.3
## 8 1.5 0.2
## 9 1.4 0.2
## 10 1.5 0.1
## .. ... ...
#Examples
iris <- tbl_df(iris) # so it prints a little nicer
select(iris, starts_with("Petal"))
## Source: local data frame [150 x 2]
##
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
## 7 1.4 0.3
## 8 1.5 0.2
## 9 1.4 0.2
## 10 1.5 0.1
## .. ... ...
select(iris, ends_with("Width"))
## Source: local data frame [150 x 2]
##
## Sepal.Width Petal.Width
## 1 3.5 0.2
## 2 3.0 0.2
## 3 3.2 0.2
## 4 3.1 0.2
## 5 3.6 0.2
## 6 3.9 0.4
## 7 3.4 0.3
## 8 3.4 0.2
## 9 2.9 0.2
## 10 3.1 0.1
## .. ... ...
select(iris, contains("etal"))
## Source: local data frame [150 x 2]
##
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
## 7 1.4 0.3
## 8 1.5 0.2
## 9 1.4 0.2
## 10 1.5 0.1
## .. ... ...
select(iris, matches(".t."))
## Source: local data frame [150 x 4]
##
## Sepal.Length Sepal.Width Petal.Length Petal.Width
## 1 5.1 3.5 1.4 0.2
## 2 4.9 3.0 1.4 0.2
## 3 4.7 3.2 1.3 0.2
## 4 4.6 3.1 1.5 0.2
## 5 5.0 3.6 1.4 0.2
## 6 5.4 3.9 1.7 0.4
## 7 4.6 3.4 1.4 0.3
## 8 5.0 3.4 1.5 0.2
## 9 4.4 2.9 1.4 0.2
## 10 4.9 3.1 1.5 0.1
## .. ... ... ... ...
select(iris, Petal.Length, Petal.Width)
## Source: local data frame [150 x 2]
##
## Petal.Length Petal.Width
## 1 1.4 0.2
## 2 1.4 0.2
## 3 1.3 0.2
## 4 1.5 0.2
## 5 1.4 0.2
## 6 1.7 0.4
## 7 1.4 0.3
## 8 1.5 0.2
## 9 1.4 0.2
## 10 1.5 0.1
## .. ... ...
#------------------------------------------------
#Examples
library(dplyr)
by_cyl <- mtcars %>% group_by(cyl)
# Sample fixed number per group
sample_n(mtcars, 10)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
sample_n(mtcars, 50, replace = TRUE)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Dodge Challenger.1 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Toyota Corolla.1 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Chrysler Imperial.1 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Chrysler Imperial.2 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Hornet Sportabout.1 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 280.1 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Datsun 710.1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Lincoln Continental.1 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial.3 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Dodge Challenger.2 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino.1 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 450SL.1 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Merc 450SE.1 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Mazda RX4.1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Duster 360.1 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## AMC Javelin.1 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Hornet Sportabout.2 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Dodge Challenger.3 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Mazda RX4.2 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Toyota Corolla.2 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Mazda RX4.3 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Toyota Corolla.3 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## AMC Javelin.2 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Mazda RX4.4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
sample_n(mtcars, 10, weight = mpg)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
sample_n(by_cyl, 3)
## Source: local data frame [9 x 11]
## Groups: cyl
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 2 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 3 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 4 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 5 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 6 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 7 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 9 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
sample_n(by_cyl, 10, replace = TRUE)
## Source: local data frame [30 x 11]
## Groups: cyl
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 2 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 3 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 4 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 5 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 6 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 7 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 8 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 9 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 10 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 11 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 12 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 13 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 14 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 15 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 16 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 17 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 18 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 19 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 20 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 21 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 22 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 23 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 24 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 25 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 26 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 27 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 28 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 29 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 30 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
sample_n(by_cyl, 3, weight = mpg / mean(mpg))
## Source: local data frame [9 x 11]
## Groups: cyl
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 2 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 3 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 4 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 5 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 6 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 7 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 8 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 9 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
# Sample fixed fraction per group
# Default is to sample all data = randomly resample rows
sample_frac(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
sample_frac(mtcars, 0.1)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Ford Pantera L 15.8 8 351 264 4.22 3.17 14.50 0 1 5 4
## Dodge Challenger 15.5 8 318 150 2.76 3.52 16.87 0 0 3 2
## Ferrari Dino 19.7 6 145 175 3.62 2.77 15.50 0 1 5 6
sample_frac(mtcars, 1.5, replace = TRUE)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Ferrari Dino.1 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ford Pantera L.1 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Ford Pantera L.2 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Ferrari Dino.2 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Valiant.1 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## AMC Javelin.1 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 230.1 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Valiant.2 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Toyota Corolla.1 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## AMC Javelin.2 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## Merc 230.2 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Merc 230.3 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Porsche 914-2.1 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## Datsun 710.1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## Merc 450SLC.1 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Ford Pantera L.3 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## Lincoln Continental.1 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Merc 280C.1 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Camaro Z28.1 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## Hornet Sportabout.1 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## Merc 230.4 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## Lincoln Continental.2 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## Chrysler Imperial.1 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## Porsche 914-2.2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
sample_frac(mtcars, 0.1, weight = 1 / mpg)
## mpg cyl disp hp drat wt qsec vs am gear carb
## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
sample_frac(by_cyl, 0.2)
## Source: local data frame [6 x 11]
## Groups: cyl
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 2 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 3 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 4 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 5 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
sample_frac(by_cyl, 1, replace = TRUE)
## Source: local data frame [32 x 11]
## Groups: cyl
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 2 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 3 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 4 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 5 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 6 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 7 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 8 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 9 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 10 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 11 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 12 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 13 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 14 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 15 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 16 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 17 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 18 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 19 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 20 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 21 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 22 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 23 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 24 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 25 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 26 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 27 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 28 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 29 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 30 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 31 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 32 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
———————————————
#Examples
df <- expand.grid(x = 1:3, y = 3:1)
df %>%
rowwise() %>%
do(i = seq(.$x, .$y))
## Source: local data frame [9 x 1]
## Groups: <by row>
##
## i
## 1 <int[3]>
## 2 <int[2]>
## 3 <int[1]>
## 4 <int[2]>
## 5 <int[1]>
## 6 <int[2]>
## 7 <int[1]>
## 8 <int[2]>
## 9 <int[3]>
.Last.value
## $help_type
## [1] "html"
#%>% summarise(n = length(i))
———————————————–
#Examples
one <- mtcars[1:10, ]
two <- mtcars[11:32, ]
rbind_list(one, two)
## Source: local data frame [32 x 11]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 11 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 15 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 16 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 17 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 18 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 19 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 20 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 21 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 22 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 24 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 25 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 26 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 27 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 29 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 30 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 31 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
rbind_all(list(one, two))
## Source: local data frame [32 x 11]
##
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 11 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 15 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 16 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 17 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 18 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 19 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 20 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 21 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 22 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 24 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 25 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 26 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 27 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 29 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 30 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 31 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
———————————————
#Examples
mutate(mtcars, displ_l = disp / 61.0237)
## mpg cyl disp hp drat wt qsec vs am gear carb displ_l
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 2.621932
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 2.621932
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 1.769804
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 4.227866
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 5.899347
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 3.687092
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 5.899347
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 2.403984
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 2.307300
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 2.746474
## 11 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 2.746474
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 4.519556
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 4.519556
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 4.519556
## 15 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 7.734700
## 16 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 7.538055
## 17 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 7.210313
## 18 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 1.289663
## 19 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 1.240502
## 20 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 1.165121
## 21 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 1.968088
## 22 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 5.211090
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 4.981671
## 24 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 5.735477
## 25 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 6.554830
## 26 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 1.294579
## 27 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 1.971365
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 1.558411
## 29 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 5.751864
## 30 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 2.376126
## 31 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 4.932510
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 1.982836
transmute(mtcars, displ_l = disp / 61.0237)
## displ_l
## 1 2.621932
## 2 2.621932
## 3 1.769804
## 4 4.227866
## 5 5.899347
## 6 3.687092
## 7 5.899347
## 8 2.403984
## 9 2.307300
## 10 2.746474
## 11 2.746474
## 12 4.519556
## 13 4.519556
## 14 4.519556
## 15 7.734700
## 16 7.538055
## 17 7.210313
## 18 1.289663
## 19 1.240502
## 20 1.165121
## 21 1.968088
## 22 5.211090
## 23 4.981671
## 24 5.735477
## 25 6.554830
## 26 1.294579
## 27 1.971365
## 28 1.558411
## 29 5.751864
## 30 2.376126
## 31 4.932510
## 32 1.982836
mutate(mtcars, cyl = NULL)
## mpg disp hp drat wt qsec vs am gear carb
## 1 21.0 160.0 110 3.90 2.620 16.46 0 1 4 4
## 2 21.0 160.0 110 3.90 2.875 17.02 0 1 4 4
## 3 22.8 108.0 93 3.85 2.320 18.61 1 1 4 1
## 4 21.4 258.0 110 3.08 3.215 19.44 1 0 3 1
## 5 18.7 360.0 175 3.15 3.440 17.02 0 0 3 2
## 6 18.1 225.0 105 2.76 3.460 20.22 1 0 3 1
## 7 14.3 360.0 245 3.21 3.570 15.84 0 0 3 4
## 8 24.4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 9 22.8 140.8 95 3.92 3.150 22.90 1 0 4 2
## 10 19.2 167.6 123 3.92 3.440 18.30 1 0 4 4
## 11 17.8 167.6 123 3.92 3.440 18.90 1 0 4 4
## 12 16.4 275.8 180 3.07 4.070 17.40 0 0 3 3
## 13 17.3 275.8 180 3.07 3.730 17.60 0 0 3 3
## 14 15.2 275.8 180 3.07 3.780 18.00 0 0 3 3
## 15 10.4 472.0 205 2.93 5.250 17.98 0 0 3 4
## 16 10.4 460.0 215 3.00 5.424 17.82 0 0 3 4
## 17 14.7 440.0 230 3.23 5.345 17.42 0 0 3 4
## 18 32.4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 19 30.4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 20 33.9 71.1 65 4.22 1.835 19.90 1 1 4 1
## 21 21.5 120.1 97 3.70 2.465 20.01 1 0 3 1
## 22 15.5 318.0 150 2.76 3.520 16.87 0 0 3 2
## 23 15.2 304.0 150 3.15 3.435 17.30 0 0 3 2
## 24 13.3 350.0 245 3.73 3.840 15.41 0 0 3 4
## 25 19.2 400.0 175 3.08 3.845 17.05 0 0 3 2
## 26 27.3 79.0 66 4.08 1.935 18.90 1 1 4 1
## 27 26.0 120.3 91 4.43 2.140 16.70 0 1 5 2
## 28 30.4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 29 15.8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 30 19.7 145.0 175 3.62 2.770 15.50 0 1 5 6
## 31 15.0 301.0 335 3.54 3.570 14.60 0 1 5 8
## 32 21.4 121.0 109 4.11 2.780 18.60 1 1 4 2
#---------------------------------------------
#Examples
library(dplyr)
order_by(10:1, cumsum(1:10))
## [1] 55 54 52 49 45 40 34 27 19 10
x <- 10:1
y <- 1:10
order_by(x, cumsum(y))
## [1] 55 54 52 49 45 40 34 27 19 10
df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
scrambled <- df[sample(nrow(df)), ]
wrong <- mutate(scrambled, running = cumsum(value))
arrange(wrong, year)
## year value running
## 1 2000 0 51
## 2 2001 1 1
## 3 2002 4 55
## 4 2003 9 10
## 5 2004 16 51
## 6 2005 25 35
right <- mutate(scrambled, running = order_by(year, cumsum(value)))
arrange(right, year)
## year value running
## 1 2000 0 0
## 2 2001 1 1
## 3 2002 4 5
## 4 2003 9 14
## 5 2004 16 30
## 6 2005 25 55
#------------------------------------------
#Examples
if (require("nycflights13")) {
carriers <- group_by(flights, carrier)
summarise(carriers, n())
mutate(carriers, n = n())
filter(carriers, n() < 100)
}
## Loading required package: nycflights13
## Source: local data frame [32 x 16]
## Groups: carrier
##
## year month day dep_time dep_delay arr_time arr_delay carrier tailnum
## 1 2013 1 30 1222 67 1402 107 OO N978SW
## 2 2013 11 3 1424 -6 1629 -5 OO N813SK
## 3 2013 11 10 1443 13 1701 27 OO N813SK
## 4 2013 11 17 1422 -8 1610 -24 OO N693CA
## 5 2013 11 25 1803 4 2011 -6 OO N803SK
## 6 2013 11 30 1648 1 1814 3 OO N746SK
## 7 2013 6 15 1626 -9 1810 -20 OO N913EV
## 8 2013 6 22 1846 131 2107 157 OO N427SW
## 9 2013 8 27 1755 -10 1956 3 OO N789SK
## 10 2013 8 28 2039 154 2213 140 OO N790SK
## 11 2013 8 29 1832 27 2039 46 OO N797SK
## 12 2013 8 30 1930 85 2102 69 OO N762SK
## 13 2013 9 2 NA NA NA NA OO N768SK
## 14 2013 9 3 1759 -6 1959 6 OO N795SK
## 15 2013 9 4 1803 -2 1941 -12 OO N780SK
## 16 2013 9 5 1758 -7 1938 -15 OO N789SK
## 17 2013 9 6 1803 -2 1945 -8 OO N740SK
## 18 2013 9 8 1754 -11 1946 -7 OO N726SK
## 19 2013 9 9 1757 -8 1951 -2 OO N780SK
## 20 2013 9 10 1759 -6 1945 -8 OO N705SK
## 21 2013 9 11 NA NA NA NA OO N728SK
## 22 2013 9 12 NA NA NA NA OO N789SK
## 23 2013 9 13 1845 40 2041 48 OO N702SK
## 24 2013 9 15 1756 -9 1929 -24 OO N778SK
## 25 2013 9 16 1752 -13 1939 -14 OO N710SK
## 26 2013 9 17 1755 -10 1937 -16 OO N794SK
## 27 2013 9 18 1754 -11 1927 -26 OO N701SK
## 28 2013 9 19 1759 -6 1946 -7 OO N760SK
## 29 2013 9 20 1758 -7 1929 -24 OO N766SK
## 30 2013 9 22 1759 -6 1945 -8 OO N772SK
## 31 2013 9 23 1759 -6 1935 -18 OO N776SK
## 32 2013 9 24 1751 -14 1937 -16 OO N785SK
## Variables not shown: flight (int), origin (chr), dest (chr), air_time
## (dbl), distance (dbl), hour (dbl), minute (dbl)
#-----------------------------------------
#Examples
arrange(mtcars, cyl, disp)
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
## 2 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 3 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 4 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 5 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 6 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 7 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 8 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 9 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 10 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 11 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 12 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 13 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 14 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 15 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 16 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 17 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 18 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 19 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 20 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 21 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 22 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 24 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 25 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 26 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 27 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 28 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 29 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 30 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 31 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 32 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
arrange(mtcars, desc(disp))
## mpg cyl disp hp drat wt qsec vs am gear carb
## 1 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4
## 2 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4
## 3 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4
## 4 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2
## 6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4
## 7 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4
## 8 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4
## 9 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2
## 10 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2
## 11 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3
## 15 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1
## 16 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1
## 17 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4
## 18 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4
## 19 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4
## 20 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4
## 21 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2
## 22 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6
## 23 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2
## 24 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2
## 25 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2
## 26 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1
## 27 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2
## 29 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1
## 30 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1
## 31 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2
## 32 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1
————————————————–
#Examples
library(dplyr)
if (require("microbenchmark") && has_lahman()) {
lahman_local <- lahman_srcs("df", "dt")
teams <- lapply(lahman_local, function(x) x %>% tbl("Teams"))
compare_tbls(teams, function(x) x %>% filter(yearID == 2010))
bench_tbls(teams, function(x) x %>% filter(yearID == 2010))
# You can also supply arbitrary additional arguments to bench_tbls
# if there are other operations you'd like to compare.
bench_tbls(teams, function(x) x %>% filter(yearID == 2010),
base = subset(Lahman::Teams, yearID == 2010))
# A more complicated example using multiple tables
setup <- function(src) {
list(
src %>% tbl("Batting") %>% filter(stint == 1) %>% select(playerID:H),
src %>% tbl("Master") %>% select(playerID, birthYear)
)
}
two_tables <- lapply(lahman_local, setup)
op <- function(tbls) {
semi_join(tbls[[1]], tbls[[2]], by = "playerID")
}
# compare_tbls(two_tables, op)
bench_tbls(two_tables, op, times = 2)
}
————————————–
#Examples
x <- rnorm(1e2)
x[between(x, -1, 1)]
## [1] -0.74514504 -0.88052357 0.85416135 0.60696128 -0.86278670
## [6] -0.48890137 -0.55613603 0.51548160 0.54895192 0.09681148
## [11] -0.55527200 -0.59463210 -0.25493668 -0.59350103 0.47608594
## [16] 0.13219356 -0.09316145 -0.41946417 0.43699721 -0.13482535
## [21] 0.19949309 0.67237610 -0.15521628 -0.07728611 -0.86609228
## [26] -0.71164465 -0.70987854 -0.41268549 -0.13551132 -0.94724260
## [31] 0.50409441 -0.02076468 0.60846707 0.53619117 0.04492938
## [36] 0.62336881 -0.17632775 -0.84539537 0.84669174 -0.11654251
## [41] 0.60434531 0.40685478 0.59458089 -0.63614126 0.54816286
## [46] -0.98417697 -0.85356170 0.31287356 0.57445725 -0.83425817
## [51] 0.10005099 0.65569335 -0.60177045 -0.63651447 0.45174517
## [56] -0.41951984 0.16901617 -0.23651226 0.08232645 -0.49527933
## [61] -0.06836192 -0.05942266 0.60916248 -0.30171271 0.43767429
## [66] -0.50610181 0.35060795
#Examples
build_sql("SELECT * FROM TABLE")
## <SQL> SELECT * FROM TABLE
x <- "TABLE"
build_sql("SELECT * FROM ", x)
## <SQL> SELECT * FROM 'TABLE'
build_sql("SELECT * FROM ", ident(x))
## <SQL> SELECT * FROM "TABLE"
build_sql("SELECT * FROM ", sql(x))
## <SQL> SELECT * FROM TABLE
# http://xkcd.com/327/
name <- "Robert'); DROP TABLE Students;--"
build_sql("INSERT INTO Students (Name) VALUES (", name, ")")
## <SQL> INSERT INTO Students (Name) VALUES ('Robert''); DROP TABLE Students;--')
#Examples
# If you're performing many operations you can either do step by step
if (require("nycflights13")) {
a1 <- group_by(flights, year, month, day)
a2 <- select(a1, arr_delay, dep_delay)
a3 <- summarise(a2,
arr = mean(arr_delay, na.rm = TRUE),
dep = mean(dep_delay, na.rm = TRUE))
a4 <- filter(a3, arr > 30 | dep > 30)
# If you don't want to save the intermediate results, you need to
# wrap the functions:
filter(
summarise(
select(
group_by(flights, year, month, day),
arr_delay, dep_delay
),
arr = mean(arr_delay, na.rm = TRUE),
dep = mean(dep_delay, na.rm = TRUE)
),
arr > 30 | dep > 30
)
# This is difficult to read because the order of the operations is from
# inside to out, and the arguments are a long way away from the function.
# Alternatively you can use chain or %>% to sequence the operations
# linearly:
flights %>%
group_by(year, month, day) %>%
select(arr_delay, dep_delay) %>%
summarise(
arr = mean(arr_delay, na.rm = TRUE),
dep = mean(dep_delay, na.rm = TRUE)
) %>%
filter(arr > 30 | dep > 30)
}
## Source: local data frame [49 x 5]
## Groups: year, month
##
## year month day arr dep
## 1 2013 1 16 34.24736 24.61287
## 2 2013 1 31 32.60285 28.65836
## 3 2013 2 11 36.29009 39.07360
## 4 2013 2 27 31.25249 37.76327
## 5 2013 3 8 85.86216 83.53692
## 6 2013 3 18 41.29189 30.11796
## 7 2013 4 10 38.41231 33.02368
## 8 2013 4 12 36.04814 34.83843
## 9 2013 4 18 36.02848 34.91536
## 10 2013 4 19 47.91170 46.12783
## 11 2013 4 22 37.81217 30.64255
## 12 2013 4 25 33.68125 23.33956
## 13 2013 5 8 39.60918 43.21778
## 14 2013 5 23 61.97090 51.14472
## 15 2013 5 24 24.25742 30.34072
## 16 2013 6 2 26.07552 34.01337
## 17 2013 6 10 28.02229 30.61945
## 18 2013 6 13 63.75369 45.79083
## 19 2013 6 18 37.64803 35.95077
## 20 2013 6 24 51.17681 47.15742
## 21 2013 6 25 41.51368 43.06303
## 22 2013 6 26 27.31741 30.61175
## 23 2013 6 27 44.78330 40.89123
## 24 2013 6 28 44.97685 48.82778
## 25 2013 6 30 43.51028 44.18818
## 26 2013 7 1 58.28050 56.23383
## 27 2013 7 7 40.30638 36.61745
## 28 2013 7 8 29.64885 37.29665
## 29 2013 7 9 31.33437 30.71150
## 30 2013 7 10 59.62648 52.86070
## 31 2013 7 22 62.76340 46.66705
## 32 2013 7 23 44.95982 44.74169
## 33 2013 7 28 49.83178 37.71016
## 34 2013 8 1 35.98926 34.57403
## 35 2013 8 8 55.48116 43.34995
## 36 2013 8 9 43.31364 34.69190
## 37 2013 8 22 29.97674 33.60042
## 38 2013 8 28 35.20307 40.52689
## 39 2013 9 2 45.51843 53.02955
## 40 2013 9 12 58.91242 49.95875
## 41 2013 10 7 39.01726 39.14671
## 42 2013 10 11 18.92299 31.23184
## 43 2013 12 5 51.66625 52.32799
## 44 2013 12 8 36.91180 21.51534
## 45 2013 12 9 42.57556 34.80022
## 46 2013 12 10 44.50880 26.46549
## 47 2013 12 14 46.39750 28.36155
## 48 2013 12 17 55.87186 40.70560
## 49 2013 12 23 32.22604 32.25415
#Examples
if (require("RSQLite") && has_lahman("sqlite")) {
batting <- tbl(lahman_sqlite(), "Batting")
remote <- select(filter(batting, yearID > 2010 && stint == 1), playerID:H)
remote2 <- collapse(remote)
cached <- compute(remote)
local <- collect(remote)
}
## Loading required package: RSQLite
## Loading required package: DBI
## Creating table: AllstarFull
## Creating table: Appearances
## Creating table: AwardsManagers
## Creating table: AwardsPlayers
## Creating table: AwardsShareManagers
## Creating table: AwardsSharePlayers
## Creating table: Batting
## Creating table: BattingPost
## Creating table: Fielding
## Creating table: FieldingOF
## Creating table: FieldingPost
## Creating table: HallOfFame
## Creating table: LahmanData
## Creating table: Managers
## Creating table: ManagersHalf
## Creating table: Master
## Creating table: Pitching
## Creating table: PitchingPost
## Creating table: Salaries
## Creating table: Schools
## Creating table: SchoolsPlayers
## Creating table: SeriesPost
## Creating table: Teams
## Creating table: TeamsFranchises
## Creating table: TeamsHalf
#Examples
if (require("RSQLite") && require("RSQLite.extfuns")) {
db <- src_sqlite(tempfile(), create = TRUE)
iris2 <- copy_to(db, iris)
mtcars$model <- rownames(mtcars)
mtcars2 <- copy_to(db, mtcars, indexes = list("model"))
explain(filter(mtcars2, model == "Hornet 4 Drive"))
# Note that tables are temporary by default, so they're not
# visible from other connections to the same database.
src_tbls(db)
db2 <- src_sqlite(db$path)
src_tbls(db2)
}
## Loading required package: RSQLite.extfuns
## <SQL>
## SELECT "mpg", "cyl", "disp", "hp", "drat", "wt", "qsec", "vs", "am", "gear", "carb", "model"
## FROM "mtcars"
## WHERE "model" = 'Hornet 4 Drive'
##
##
## <PLAN>
## selectid order from
## 1 0 0 0
## detail
## 1 SEARCH TABLE mtcars USING INDEX mtcars_model (model=?)
## character(0)
#Examples
a <- 1:5
data_frame(a, b = a * 2)
## Source: local data frame [5 x 2]
##
## a b
## 1 1 2
## 2 2 4
## 3 3 6
## 4 4 8
## 5 5 10
data_frame(a, b = a * 2, c = 1)
## Source: local data frame [5 x 3]
##
## a b c
## 1 1 2 1
## 2 2 4 1
## 3 3 6 1
## 4 4 8 1
## 5 5 10 1
data_frame(x = runif(10), y = x * 2)
## Source: local data frame [10 x 2]
##
## x y
## 1 0.41226008 0.8245202
## 2 0.55327525 1.1065505
## 3 0.23731705 0.4746341
## 4 0.58561299 1.1712260
## 5 0.08340908 0.1668182
## 6 0.74077732 1.4815546
## 7 0.56287209 1.1257442
## 8 0.97710989 1.9542198
## 9 0.18900110 0.3780022
## 10 0.11197740 0.2239548
# data_frame never coerces its inputs
str(data_frame(letters))
## Classes 'tbl_df', 'tbl' and 'data.frame': 26 obs. of 1 variable:
## $ letters: chr "a" "b" "c" "d" ...
str(data_frame(x = diag(5)))
## Classes 'tbl_df', 'tbl' and 'data.frame': 5 obs. of 1 variable:
## $ x: num [1:5, 1:5] 1 0 0 0 0 0 1 0 0 0 ...
# or munges column names
data_frame(`a + b` = 1:5)
## Source: local data frame [5 x 1]
##
## a + b
## 1 1
## 2 2
## 3 3
## 4 4
## 5 5
#Examples
desc(1:10)
## [1] -1 -2 -3 -4 -5 -6 -7 -8 -9 -10
desc(factor(letters))
## [1] -1 -2 -3 -4 -5 -6 -7 -8 -9 -10 -11 -12 -13 -14 -15 -16 -17
## [18] -18 -19 -20 -21 -22 -23 -24 -25 -26
first_day <- seq(as.Date("1910/1/1"), as.Date("1920/1/1"),
"years")
desc(first_day)
## [1] 21915 21550 21185 20819 20454 20089 19724 19358 18993 18628 18263
#Examples
df <- data.frame(
x = sample(10, 100, rep = TRUE),
y = sample(10, 100, rep = TRUE)
)
nrow(df)
## [1] 100
nrow(distinct(df))
## [1] 65
distinct(df, x)
## x y
## 1 10 5
## 2 8 4
## 3 5 5
## 4 7 8
## 5 2 8
## 6 9 3
## 7 4 2
## 8 1 7
## 9 3 6
## 10 6 3
distinct(df, y)
## x y
## 1 10 5
## 2 8 4
## 3 10 9
## 4 7 8
## 5 7 10
## 6 7 7
## 7 9 3
## 8 4 2
## 9 9 1
## 10 3 6
# You can also use distinct on computed variables
distinct(df, diff = abs(x - y))
## x y diff
## 1 10 5 5
## 2 8 4 4
## 3 10 9 1
## 4 5 5 0
## 5 7 10 3
## 6 2 8 6
## 7 4 2 2
## 8 9 1 8
## 9 10 1 9
## 10 8 1 7
——————————————-
#Examples
by_cyl <- group_by(mtcars, cyl)
do(by_cyl, head(., 2))
## Source: local data frame [6 x 12]
## Groups: cyl
##
## mpg cyl disp hp drat wt qsec vs am gear carb model
## 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 Datsun 710
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 Merc 240D
## 3 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 Mazda RX4
## 4 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 Mazda RX4 Wag
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 Hornet Sportabout
## 6 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 Duster 360
models <- by_cyl %>% do(mod = lm(mpg ~ disp, data = .))
models
## Source: local data frame [3 x 2]
## Groups: <by row>
##
## cyl mod
## 1 4 <S3:lm>
## 2 6 <S3:lm>
## 3 8 <S3:lm>
summarise(models, rsq = summary(mod)$r.squared)
## Source: local data frame [3 x 1]
##
## rsq
## 1 0.64840514
## 2 0.01062604
## 3 0.27015777
models %>% do(data.frame(coef = coef(.$mod)))
## Source: local data frame [6 x 1]
## Groups: <by row>
##
## coef
## 1 40.871955322
## 2 -0.135141815
## 3 19.081987419
## 4 0.003605119
## 5 22.032798914
## 6 -0.019634095
models %>% do(data.frame(
var = names(coef(.$mod)),
coef(summary(.$mod)))
)
## Source: local data frame [6 x 5]
## Groups: <by row>
##
## var Estimate Std..Error t.value Pr...t..
## 1 (Intercept) 40.871955322 3.589605400 11.3861973 1.202715e-06
## 2 disp -0.135141815 0.033171608 -4.0740206 2.782827e-03
## 3 (Intercept) 19.081987419 2.913992892 6.5483988 1.243968e-03
## 4 disp 0.003605119 0.015557115 0.2317344 8.259297e-01
## 5 (Intercept) 22.032798914 3.345241115 6.5863112 2.588765e-05
## 6 disp -0.019634095 0.009315926 -2.1075838 5.677488e-02
models <- by_cyl %>% do(
mod_linear = lm(mpg ~ disp, data = .),
mod_quad = lm(mpg ~ poly(disp, 2), data = .)
)
models
## Source: local data frame [3 x 3]
## Groups: <by row>
##
## cyl mod_linear mod_quad
## 1 4 <S3:lm> <S3:lm>
## 2 6 <S3:lm> <S3:lm>
## 3 8 <S3:lm> <S3:lm>
compare <- models %>% do(aov = anova(.$mod_linear, .$mod_quad))
# compare %>% summarise(p.value = aov$`Pr(>F)`)
if (require("nycflights13")) {
# You can use it to do any arbitrary computation, like fitting a linear
# model. Let's explore how carrier departure delays vary over the time
carriers <- group_by(flights, carrier)
group_size(carriers)
mods <- do(carriers, mod = lm(arr_delay ~ dep_time, data = .))
mods %>% do(as.data.frame(coef(.$mod)))
mods %>% summarise(rsq = summary(mod)$r.squared)
## Not run:
# This longer example shows the progress bar in action
by_dest <- flights %>% group_by(dest) %>% filter(n() > 100)
library(mgcv)
by_dest %>% do(smooth = gam(arr_delay ~ s(dep_time) + month, data = .))
## End(Not run)
}
## Loading required package: nlme
##
## Attaching package: 'nlme'
##
## The following object is masked from 'package:dplyr':
##
## collapse
##
## This is mgcv 1.8-4. For overview type 'help("mgcv-package")'.
## Source: local data frame [93 x 2]
## Groups: <by row>
##
## dest smooth
## 1 ABQ <S3:gam, glm, lm>
## 2 ACK <S3:gam, glm, lm>
## 3 ALB <S3:gam, glm, lm>
## 4 ATL <S3:gam, glm, lm>
## 5 AUS <S3:gam, glm, lm>
## 6 AVL <S3:gam, glm, lm>
## 7 BDL <S3:gam, glm, lm>
## 8 BGR <S3:gam, glm, lm>
## 9 BHM <S3:gam, glm, lm>
## 10 BNA <S3:gam, glm, lm>
## 11 BOS <S3:gam, glm, lm>
## 12 BQN <S3:gam, glm, lm>
## 13 BTV <S3:gam, glm, lm>
## 14 BUF <S3:gam, glm, lm>
## 15 BUR <S3:gam, glm, lm>
## 16 BWI <S3:gam, glm, lm>
## 17 CAE <S3:gam, glm, lm>
## 18 CAK <S3:gam, glm, lm>
## 19 CHS <S3:gam, glm, lm>
## 20 CLE <S3:gam, glm, lm>
## 21 CLT <S3:gam, glm, lm>
## 22 CMH <S3:gam, glm, lm>
## 23 CRW <S3:gam, glm, lm>
## 24 CVG <S3:gam, glm, lm>
## 25 DAY <S3:gam, glm, lm>
## 26 DCA <S3:gam, glm, lm>
## 27 DEN <S3:gam, glm, lm>
## 28 DFW <S3:gam, glm, lm>
## 29 DSM <S3:gam, glm, lm>
## 30 DTW <S3:gam, glm, lm>
## 31 EGE <S3:gam, glm, lm>
## 32 FLL <S3:gam, glm, lm>
## 33 GRR <S3:gam, glm, lm>
## 34 GSO <S3:gam, glm, lm>
## 35 GSP <S3:gam, glm, lm>
## 36 HNL <S3:gam, glm, lm>
## 37 HOU <S3:gam, glm, lm>
## 38 IAD <S3:gam, glm, lm>
## 39 IAH <S3:gam, glm, lm>
## 40 ILM <S3:gam, glm, lm>
## 41 IND <S3:gam, glm, lm>
## 42 JAX <S3:gam, glm, lm>
## 43 LAS <S3:gam, glm, lm>
## 44 LAX <S3:gam, glm, lm>
## 45 LGB <S3:gam, glm, lm>
## 46 MCI <S3:gam, glm, lm>
## 47 MCO <S3:gam, glm, lm>
## 48 MDW <S3:gam, glm, lm>
## 49 MEM <S3:gam, glm, lm>
## 50 MHT <S3:gam, glm, lm>
## 51 MIA <S3:gam, glm, lm>
## 52 MKE <S3:gam, glm, lm>
## 53 MSN <S3:gam, glm, lm>
## 54 MSP <S3:gam, glm, lm>
## 55 MSY <S3:gam, glm, lm>
## 56 MVY <S3:gam, glm, lm>
## 57 OAK <S3:gam, glm, lm>
## 58 OKC <S3:gam, glm, lm>
## 59 OMA <S3:gam, glm, lm>
## 60 ORD <S3:gam, glm, lm>
## 61 ORF <S3:gam, glm, lm>
## 62 PBI <S3:gam, glm, lm>
## 63 PDX <S3:gam, glm, lm>
## 64 PHL <S3:gam, glm, lm>
## 65 PHX <S3:gam, glm, lm>
## 66 PIT <S3:gam, glm, lm>
## 67 PSE <S3:gam, glm, lm>
## 68 PVD <S3:gam, glm, lm>
## 69 PWM <S3:gam, glm, lm>
## 70 RDU <S3:gam, glm, lm>
## 71 RIC <S3:gam, glm, lm>
## 72 ROC <S3:gam, glm, lm>
## 73 RSW <S3:gam, glm, lm>
## 74 SAN <S3:gam, glm, lm>
## 75 SAT <S3:gam, glm, lm>
## 76 SAV <S3:gam, glm, lm>
## 77 SDF <S3:gam, glm, lm>
## 78 SEA <S3:gam, glm, lm>
## 79 SFO <S3:gam, glm, lm>
## 80 SJC <S3:gam, glm, lm>
## 81 SJU <S3:gam, glm, lm>
## 82 SLC <S3:gam, glm, lm>
## 83 SMF <S3:gam, glm, lm>
## 84 SNA <S3:gam, glm, lm>
## 85 SRQ <S3:gam, glm, lm>
## 86 STL <S3:gam, glm, lm>
## 87 STT <S3:gam, glm, lm>
## 88 SYR <S3:gam, glm, lm>
## 89 TPA <S3:gam, glm, lm>
## 90 TUL <S3:gam, glm, lm>
## 91 TVC <S3:gam, glm, lm>
## 92 TYS <S3:gam, glm, lm>
## 93 XNA <S3:gam, glm, lm>
——————————————
#Examples
library(dplyr)
if (require("RSQLite") && has_lahman("sqlite")) {
batting <- tbl(lahman_sqlite(), "Batting")
batting %>% show_query()
batting %>% explain()
# The batting database has indices on all ID variables:
# SQLite automatically picks the most restrictive index
batting %>%
filter(lgID == "NL" & yearID == 2000L) %>%
explain()
# OR's will use multiple indexes
batting %>%
filter(lgID == "NL" | yearID == 2000) %>%
explain()
# Joins will use indexes in both tables
teams <- tbl(lahman_sqlite(), "Teams")
batting %>%
left_join(teams, c("yearID", "teamID")) %>%
explain()
}
## <SQL>
## SELECT "playerID", "yearID", "stint", "teamID", "lgID", "G", "G_batting", "AB", "R", "H", "X2B", "X3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB", "HBP", "SH", "SF", "GIDP", "G_old"
## FROM "Batting"
## <SQL>
## SELECT "playerID", "yearID", "stint", "teamID", "lgID", "G", "G_batting", "AB", "R", "H", "X2B", "X3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB", "HBP", "SH", "SF", "GIDP", "G_old"
## FROM "Batting"
##
##
## <PLAN>
## selectid order from detail
## 1 0 0 0 SCAN TABLE Batting
## <SQL>
## SELECT "playerID", "yearID", "stint", "teamID", "lgID", "G", "G_batting", "AB", "R", "H", "X2B", "X3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB", "HBP", "SH", "SF", "GIDP", "G_old"
## FROM "Batting"
## WHERE "lgID" = 'NL' AND "yearID" = 2000
##
##
## <PLAN>
## selectid order from
## 1 0 0 0
## detail
## 1 SEARCH TABLE Batting USING INDEX Batting_yearID (yearID=?)
## <SQL>
## SELECT "playerID", "yearID", "stint", "teamID", "lgID", "G", "G_batting", "AB", "R", "H", "X2B", "X3B", "HR", "RBI", "SB", "CS", "BB", "SO", "IBB", "HBP", "SH", "SF", "GIDP", "G_old"
## FROM "Batting"
## WHERE "lgID" = 'NL' OR "yearID" = 2000.0
##
##
## <PLAN>
## selectid order from
## 1 0 0 0
## 2 0 0 0
## detail
## 1 SEARCH TABLE Batting USING INDEX Batting_lgID (lgID=?)
## 2 SEARCH TABLE Batting USING INDEX Batting_yearID (yearID=?)
## <SQL>
## SELECT "playerID", "yearID", "stint", "teamID", "lgID.x", "G.x", "G_batting", "AB.x", "R.x", "H.x", "X2B.x", "X3B.x", "HR.x", "RBI", "SB.x", "CS.x", "BB.x", "SO.x", "IBB", "HBP.x", "SH", "SF.x", "GIDP", "G_old", "lgID.y", "franchID", "divID", "Rank", "G.y", "Ghome", "W", "L", "DivWin", "WCWin", "LgWin", "WSWin", "R.y", "AB.y", "H.y", "X2B.y", "X3B.y", "HR.y", "BB.y", "SO.y", "SB.y", "CS.y", "HBP.y", "SF.y", "RA", "ER", "ERA", "CG", "SHO", "SV", "IPouts", "HA", "HRA", "BBA", "SOA", "E", "DP", "FP", "name", "park", "attendance", "BPF", "PPF", "teamIDBR", "teamIDlahman45", "teamIDretro"
## FROM (SELECT * FROM (SELECT "playerID" AS "playerID", "yearID" AS "yearID", "stint" AS "stint", "teamID" AS "teamID", "lgID" AS "lgID.x", "G" AS "G.x", "G_batting" AS "G_batting", "AB" AS "AB.x", "R" AS "R.x", "H" AS "H.x", "X2B" AS "X2B.x", "X3B" AS "X3B.x", "HR" AS "HR.x", "RBI" AS "RBI", "SB" AS "SB.x", "CS" AS "CS.x", "BB" AS "BB.x", "SO" AS "SO.x", "IBB" AS "IBB", "HBP" AS "HBP.x", "SH" AS "SH", "SF" AS "SF.x", "GIDP" AS "GIDP", "G_old" AS "G_old"
## FROM "Batting") AS "_W2"
##
## LEFT JOIN
##
## (SELECT "yearID" AS "yearID", "lgID" AS "lgID.y", "teamID" AS "teamID", "franchID" AS "franchID", "divID" AS "divID", "Rank" AS "Rank", "G" AS "G.y", "Ghome" AS "Ghome", "W" AS "W", "L" AS "L", "DivWin" AS "DivWin", "WCWin" AS "WCWin", "LgWin" AS "LgWin", "WSWin" AS "WSWin", "R" AS "R.y", "AB" AS "AB.y", "H" AS "H.y", "X2B" AS "X2B.y", "X3B" AS "X3B.y", "HR" AS "HR.y", "BB" AS "BB.y", "SO" AS "SO.y", "SB" AS "SB.y", "CS" AS "CS.y", "HBP" AS "HBP.y", "SF" AS "SF.y", "RA" AS "RA", "ER" AS "ER", "ERA" AS "ERA", "CG" AS "CG", "SHO" AS "SHO", "SV" AS "SV", "IPouts" AS "IPouts", "HA" AS "HA", "HRA" AS "HRA", "BBA" AS "BBA", "SOA" AS "SOA", "E" AS "E", "DP" AS "DP", "FP" AS "FP", "name" AS "name", "park" AS "park", "attendance" AS "attendance", "BPF" AS "BPF", "PPF" AS "PPF", "teamIDBR" AS "teamIDBR", "teamIDlahman45" AS "teamIDlahman45", "teamIDretro" AS "teamIDretro"
## FROM "Teams") AS "_W3"
##
## USING ("yearID", "teamID")) AS "_W4"
##
##
## <PLAN>
## selectid order from
## 1 1 0 0
## 2 0 0 0
## 3 0 1 1
## detail
## 1 SCAN TABLE Teams
## 2 SCAN TABLE Batting
## 3 SEARCH SUBQUERY 1 AS _W3 USING AUTOMATIC COVERING INDEX (teamID=? AND yearID=?)
#Examples
library(dplyr)
library(RSQLite)
library(DBI)
library(mgcv)
library(nlme)
f <- function(x) if (x == 1) stop("Error!") else 1
## Not run:
#f(1)
f(2)
## [1] 1
## End(Not run)
safef <- failwith(NULL, f)
#safef(1)
safef(2)
## [1] 1
# Examples
filter(mtcars, cyl == 8)
## mpg cyl disp hp drat wt qsec vs am gear carb model
## 1 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 Hornet Sportabout
## 2 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 Duster 360
## 3 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 Merc 450SE
## 4 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 Merc 450SL
## 5 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 Merc 450SLC
## 6 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 Cadillac Fleetwood
## 7 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 Lincoln Continental
## 8 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 Chrysler Imperial
## 9 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 Dodge Challenger
## 10 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 AMC Javelin
## 11 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 Camaro Z28
## 12 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 Pontiac Firebird
## 13 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 Ford Pantera L
## 14 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 Maserati Bora
filter(mtcars, cyl < 6)
## mpg cyl disp hp drat wt qsec vs am gear carb model
## 1 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 Datsun 710
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 Merc 240D
## 3 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 Merc 230
## 4 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 Fiat 128
## 5 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 Honda Civic
## 6 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 Toyota Corolla
## 7 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 Toyota Corona
## 8 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 Fiat X1-9
## 9 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 Porsche 914-2
## 10 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 Lotus Europa
## 11 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 Volvo 142E
# Examples
funs(mean, "mean", mean(., na.rm = TRUE))
## <fun_calls>
## $ mean: mean(.)
## $ mean: mean(.)
## $ mean: mean(., na.rm = TRUE)
# Overide default names
funs(m1 = mean, m2 = "mean", m3 = mean(., na.rm = TRUE))
## <fun_calls>
## $ m1: mean(.)
## $ m2: mean(.)
## $ m3: mean(., na.rm = TRUE)
# If you have a function names in a vector, use funs_q
fs <- c("min", "max")
funs_(fs)
## <fun_calls>
## $ min: min(.)
## $ max: max(.)
# Examples
glimpse(mtcars)
## Variables:
## $ mpg (dbl) 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19...
## $ cyl (dbl) 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4,...
## $ disp (dbl) 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, ...
## $ hp (dbl) 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180,...
## $ drat (dbl) 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3....
## $ wt (dbl) 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, ...
## $ qsec (dbl) 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, ...
## $ vs (dbl) 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ am (dbl) 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,...
## $ gear (dbl) 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4,...
## $ carb (dbl) 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2,...
## $ model (chr) "Mazda RX4", "Mazda RX4 Wag", "Datsun 710", "Hornet 4 Dr...
if (require("RSQLite") && has_lahman("sqlite")) {
batting <- tbl(lahman_sqlite(), "Batting")
glimpse(batting)
}
## Variables:
## $ playerID (chr) "aardsda01", "aardsda01", "aardsda01", "aardsda01", ...
## $ yearID (int) 2004, 2006, 2007, 2008, 2009, 2010, 2012, 1954, 1955...
## $ stint (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1...
## $ teamID (chr) "SFN", "CHN", "CHA", "BOS", "SEA", "SEA", "NYA", "ML...
## $ lgID (chr) "NL", "NL", "AL", "AL", "AL", "AL", "AL", "NL", "NL"...
## $ G (int) 11, 45, 25, 47, 73, 53, 1, 122, 153, 153, 151, 153, ...
## $ G_batting (int) 11, 43, 2, 5, 3, 4, NA, 122, 153, 153, 151, 153, 154...
## $ AB (int) 0, 2, 0, 1, 0, 0, NA, 468, 602, 609, 615, 601, 629, ...
## $ R (int) 0, 0, 0, 0, 0, 0, NA, 58, 105, 106, 118, 109, 116, 1...
## $ H (int) 0, 0, 0, 0, 0, 0, NA, 131, 189, 200, 198, 196, 223, ...
## $ X2B (int) 0, 0, 0, 0, 0, 0, NA, 27, 37, 34, 27, 34, 46, 20, 39...
## $ X3B (int) 0, 0, 0, 0, 0, 0, NA, 6, 9, 14, 6, 4, 7, 11, 10, 6, ...
## $ HR (int) 0, 0, 0, 0, 0, 0, NA, 13, 27, 26, 44, 30, 39, 40, 34...
## $ RBI (int) 0, 0, 0, 0, 0, 0, NA, 69, 106, 92, 132, 95, 123, 126...
## $ SB (int) 0, 0, 0, 0, 0, 0, NA, 2, 3, 2, 1, 4, 8, 16, 21, 15, ...
## $ CS (int) 0, 0, 0, 0, 0, 0, NA, 2, 1, 4, 1, 1, 0, 7, 9, 7, 5, ...
## $ BB (int) 0, 0, 0, 0, 0, 0, NA, 28, 49, 37, 57, 59, 51, 60, 56...
## $ SO (int) 0, 0, 0, 1, 0, 0, NA, 39, 61, 54, 58, 49, 54, 63, 64...
## $ IBB (int) 0, 0, 0, 0, 0, 0, NA, NA, 5, 6, 15, 16, 17, 13, 20, ...
## $ HBP (int) 0, 0, 0, 0, 0, 0, NA, 3, 3, 2, 0, 1, 4, 2, 2, 3, 0, ...
## $ SH (int) 0, 1, 0, 0, 0, 0, NA, 6, 7, 5, 0, 0, 0, 0, 1, 0, 0, ...
## $ SF (int) 0, 0, 0, 0, 0, 0, NA, 4, 4, 7, 3, 3, 9, 12, 9, 6, 5,...
## $ GIDP (int) 0, 0, 0, 0, 0, 0, NA, 13, 20, 21, 13, 21, 19, 8, 16,...
## $ G_old (int) 11, 45, 2, 5, NA, NA, NA, 122, 153, 153, 151, 153, 1...
#Examples
if (require("data.table") && require("nycflights13")) {
flights_dt <- tbl_dt(flights)
group_size(group_by(flights_dt, year, month, day))
group_size(group_by(flights_dt, dest))
monthly <- group_by(flights_dt, month)
summarise(monthly, n = n(), delay = mean(arr_delay))
}
## Loading required package: data.table
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:dplyr':
##
## between, last
## Source: local data table [12 x 3]
##
## month n delay
## 1 1 27004 NA
## 2 2 24951 NA
## 3 3 28834 NA
## 4 4 28330 NA
## 5 5 28796 NA
## 6 6 28243 NA
## 7 7 29425 NA
## 8 8 29327 NA
## 9 9 27574 NA
## 10 10 28889 NA
## 11 11 27268 NA
## 12 12 28135 NA
#Examples
grouped <- group_by(mtcars, cyl)
groups(grouped)
## [[1]]
## cyl
groups(ungroup(grouped))
## NULL
——————————————-
#Examples
by_cyl <- group_by(mtcars, cyl)
summarise(by_cyl, mean(disp), mean(hp))
## Source: local data frame [3 x 3]
##
## cyl mean(disp) mean(hp)
## 1 4 105.1364 82.63636
## 2 6 183.3143 122.28571
## 3 8 353.1000 209.21429
filter(by_cyl, disp == max(disp))
## Source: local data frame [3 x 12]
## Groups: cyl
##
## mpg cyl disp hp drat wt qsec vs am gear carb model
## 1 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 Hornet 4 Drive
## 2 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 Merc 240D
## 3 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 Cadillac Fleetwood
# summarise peels off a single layer of grouping
by_vs_am <- group_by(mtcars, vs, am)
by_vs <- summarise(by_vs_am, n = n())
by_vs
## Source: local data frame [4 x 3]
## Groups: vs
##
## vs am n
## 1 0 0 12
## 2 0 1 6
## 3 1 0 7
## 4 1 1 7
summarise(by_vs, n = sum(n))
## Source: local data frame [2 x 2]
##
## vs n
## 1 0 18
## 2 1 14
# use ungroup() to remove if not wanted
summarise(ungroup(by_vs), n = sum(n))
## Source: local data frame [1 x 1]
##
## n
## 1 32
# You can group by expressions: this is just short-hand for
# a mutate/rename followed by a simple group_by
group_by(mtcars, vsam = vs + am)
## Source: local data frame [32 x 13]
## Groups: vsam
##
## mpg cyl disp hp drat wt qsec vs am gear carb model
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 Mazda RX4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 Mazda RX4 Wag
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 Datsun 710
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 Hornet 4 Drive
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 Hornet Sportabout
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 Valiant
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 Duster 360
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 Merc 240D
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 Merc 230
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 Merc 280
## 11 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 Merc 280C
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 Merc 450SE
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 Merc 450SL
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 Merc 450SLC
## 15 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 Cadillac Fleetwood
## 16 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 Lincoln Continental
## 17 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 Chrysler Imperial
## 18 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 Fiat 128
## 19 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 Honda Civic
## 20 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 Toyota Corolla
## 21 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 Toyota Corona
## 22 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 Dodge Challenger
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 AMC Javelin
## 24 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 Camaro Z28
## 25 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 Pontiac Firebird
## 26 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 Fiat X1-9
## 27 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 Porsche 914-2
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 Lotus Europa
## 29 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 Ford Pantera L
## 30 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 Ferrari Dino
## 31 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 Maserati Bora
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 Volvo 142E
## Variables not shown: vsam (dbl)
group_by(mtcars, vs2 = vs)
## Source: local data frame [32 x 13]
## Groups: vs2
##
## mpg cyl disp hp drat wt qsec vs am gear carb model
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 Mazda RX4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 Mazda RX4 Wag
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 Datsun 710
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 Hornet 4 Drive
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 Hornet Sportabout
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 Valiant
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 Duster 360
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 Merc 240D
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 Merc 230
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 Merc 280
## 11 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 Merc 280C
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 Merc 450SE
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 Merc 450SL
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 Merc 450SLC
## 15 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 Cadillac Fleetwood
## 16 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 Lincoln Continental
## 17 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 Chrysler Imperial
## 18 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 Fiat 128
## 19 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 Honda Civic
## 20 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 Toyota Corolla
## 21 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 Toyota Corona
## 22 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 Dodge Challenger
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 AMC Javelin
## 24 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 Camaro Z28
## 25 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 Pontiac Firebird
## 26 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 Fiat X1-9
## 27 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 Porsche 914-2
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 Lotus Europa
## 29 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 Ford Pantera L
## 30 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 Ferrari Dino
## 31 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 Maserati Bora
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 Volvo 142E
## Variables not shown: vs2 (dbl)
# You can also group by a constant, but it's not very useful
group_by(mtcars, "vs")
## Source: local data frame [32 x 13]
## Groups: "vs"
##
## mpg cyl disp hp drat wt qsec vs am gear carb model
## 1 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 Mazda RX4
## 2 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 Mazda RX4 Wag
## 3 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 Datsun 710
## 4 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 Hornet 4 Drive
## 5 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 Hornet Sportabout
## 6 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 Valiant
## 7 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 Duster 360
## 8 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 Merc 240D
## 9 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 Merc 230
## 10 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 Merc 280
## 11 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 Merc 280C
## 12 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 Merc 450SE
## 13 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 Merc 450SL
## 14 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 Merc 450SLC
## 15 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 Cadillac Fleetwood
## 16 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 Lincoln Continental
## 17 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 Chrysler Imperial
## 18 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 Fiat 128
## 19 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 Honda Civic
## 20 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 Toyota Corolla
## 21 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 Toyota Corona
## 22 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 Dodge Challenger
## 23 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 AMC Javelin
## 24 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 Camaro Z28
## 25 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 Pontiac Firebird
## 26 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 Fiat X1-9
## 27 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 Porsche 914-2
## 28 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 Lotus Europa
## 29 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 Ford Pantera L
## 30 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 Ferrari Dino
## 31 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 Maserati Bora
## 32 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 Volvo 142E
## Variables not shown: "vs" (chr)
# By default, group_by sets groups. Use add = TRUE to add groups
groups(group_by(by_cyl, vs, am))
## [[1]]
## vs
##
## [[2]]
## am
groups(group_by(by_cyl, vs, am, add = TRUE))
## [[1]]
## cyl
##
## [[2]]
## vs
##
## [[3]]
## am
# Duplicate groups are silently dropped
groups(group_by(by_cyl, cyl, cyl))
## [[1]]
## cyl
#Examples
if (require("nycflights13")) {
by_day <- flights %>% group_by(year, month, day)
n_groups(by_day)
group_size(by_day)
by_dest <- flights %>% group_by(dest)
n_groups(by_dest)
group_size(by_dest)
}
## [1] 254 265 439 8 17215 2439 275 443 375 297 6333
## [12] 15508 896 2589 4681 371 1781 36 116 864 52 2884
## [23] 4573 14064 3524 138 3941 1525 9705 7266 8738 569 9384
## [34] 213 17 12055 765 1606 849 15 707 2115 5700 7198
## [45] 110 2077 25 2720 5997 16174 1 1 668 2008 14082
## [56] 4113 1789 1009 11728 2802 572 7185 3799 15 221 59
## [67] 312 346 849 17283 1536 6554 1354 1632 4656 2875 365
## [78] 19 376 2352 8163 2454 2416 3537 2737 686 804 10
## [89] 1157 3923 13331 329 5819 2467 284 825 1211 4339 522
## [100] 1761 7466 315 101 631 1036
#Examples
if (require("Lahman")) {
batting_df <- tbl_df(Batting)
person_df <- tbl_df(Master)
uperson_df <- tbl_df(Master[!duplicated(Master$playerID), ])
# Inner join: match batting and person data
inner_join(batting_df, person_df)
inner_join(batting_df, uperson_df)
# Left join: match, but preserve batting data
left_join(batting_df, uperson_df)
}
## Loading required package: Lahman
## Joining by: "playerID"
## Joining by: "playerID"
## Joining by: "playerID"
## Source: local data frame [97,889 x 49]
##
## playerID yearID stint teamID lgID G G_batting AB R H X2B X3B HR
## 1 aardsda01 2004 1 SFN NL 11 11 0 0 0 0 0 0
## 2 aardsda01 2006 1 CHN NL 45 43 2 0 0 0 0 0
## 3 aardsda01 2007 1 CHA AL 25 2 0 0 0 0 0 0
## 4 aardsda01 2008 1 BOS AL 47 5 1 0 0 0 0 0
## 5 aardsda01 2009 1 SEA AL 73 3 0 0 0 0 0 0
## 6 aardsda01 2010 1 SEA AL 53 4 0 0 0 0 0 0
## 7 aardsda01 2012 1 NYA AL 1 NA NA NA NA NA NA NA
## 8 aaronha01 1954 1 ML1 NL 122 122 468 58 131 27 6 13
## 9 aaronha01 1955 1 ML1 NL 153 153 602 105 189 37 9 27
## 10 aaronha01 1956 1 ML1 NL 153 153 609 106 200 34 14 26
## .. ... ... ... ... ... ... ... ... ... ... ... ... ..
## Variables not shown: RBI (int), SB (int), CS (int), BB (int), SO (int),
## IBB (int), HBP (int), SH (int), SF (int), GIDP (int), G_old (int),
## birthYear (int), birthMonth (int), birthDay (int), birthCountry (chr),
## birthState (chr), birthCity (chr), deathYear (int), deathMonth (int),
## deathDay (int), deathCountry (chr), deathState (chr), deathCity (chr),
## nameFirst (chr), nameLast (chr), nameGiven (chr), weight (int), height
## (int), bats (fctr), throws (fctr), debut (chr), finalGame (chr), retroID
## (chr), bbrefID (chr), deathDate (date), birthDate (date)
#Examples
if (require("Lahman")) {
batting_df <- tbl_df(Batting)
person_df <- tbl_df(Master)
uperson_df <- tbl_df(Master[!duplicated(Master$playerID), ])
# Inner join: match batting and person data
inner_join(batting_df, person_df)
inner_join(batting_df, uperson_df)
# Left join: match, but preserve batting data
left_join(batting_df, uperson_df)
}
## Joining by: "playerID"
## Joining by: "playerID"
## Joining by: "playerID"
## Source: local data frame [97,889 x 49]
##
## playerID yearID stint teamID lgID G G_batting AB R H X2B X3B HR
## 1 aardsda01 2004 1 SFN NL 11 11 0 0 0 0 0 0
## 2 aardsda01 2006 1 CHN NL 45 43 2 0 0 0 0 0
## 3 aardsda01 2007 1 CHA AL 25 2 0 0 0 0 0 0
## 4 aardsda01 2008 1 BOS AL 47 5 1 0 0 0 0 0
## 5 aardsda01 2009 1 SEA AL 73 3 0 0 0 0 0 0
## 6 aardsda01 2010 1 SEA AL 53 4 0 0 0 0 0 0
## 7 aardsda01 2012 1 NYA AL 1 NA NA NA NA NA NA NA
## 8 aaronha01 1954 1 ML1 NL 122 122 468 58 131 27 6 13
## 9 aaronha01 1955 1 ML1 NL 153 153 602 105 189 37 9 27
## 10 aaronha01 1956 1 ML1 NL 153 153 609 106 200 34 14 26
## .. ... ... ... ... ... ... ... ... ... ... ... ... ..
## Variables not shown: RBI (int), SB (int), CS (int), BB (int), SO (int),
## IBB (int), HBP (int), SH (int), SF (int), GIDP (int), G_old (int),
## birthYear (int), birthMonth (int), birthDay (int), birthCountry (chr),
## birthState (chr), birthCity (chr), deathYear (int), deathMonth (int),
## deathDay (int), deathCountry (chr), deathState (chr), deathCity (chr),
## nameFirst (chr), nameLast (chr), nameGiven (chr), weight (int), height
## (int), bats (fctr), throws (fctr), debut (chr), finalGame (chr), retroID
## (chr), bbrefID (chr), deathDate (date), birthDate (date)
#Examples
if (require("data.table") && require("Lahman")) {
batting_dt <- tbl_dt(Batting)
person_dt <- tbl_dt(Master)
# Inner join: match batting and person data
inner_join(batting_dt, person_dt)
# Left join: keep batting data even if person missing
left_join(batting_dt, person_dt)
# Semi-join: find batting data for top 4 teams, 2010:2012
grid <- expand.grid(
teamID = c("WAS", "ATL", "PHI", "NYA"),
yearID = 2010:2012)
top4 <- semi_join(batting_dt, grid, copy = TRUE)
# Anti-join: find batting data with out player data
anti_join(batting_dt, person_dt)
}
## Joining by: "playerID"
## Joining by: "playerID"
## Joining by: c("yearID", "teamID")
## Joining by: "playerID"
## Source: local data table [0 x 24]
#Examples
library(dplyr)
library(RSQLite)
library(DBI)
library(mgcv)
library(nlme)
library(data.table)
library(Lahman)
#if (require("RSQLite") && has_lahman("sqlite")) {
# Left joins ----------------------------------------------------------------
batting <- tbl(lahman_sqlite(), "Batting")
head(batting,5)
## playerID yearID stint teamID lgID G G_batting AB R H X2B X3B HR RBI SB
## 1 aardsda01 2004 1 SFN NL 11 11 0 0 0 0 0 0 0 0
## 2 aardsda01 2006 1 CHN NL 45 43 2 0 0 0 0 0 0 0
## 3 aardsda01 2007 1 CHA AL 25 2 0 0 0 0 0 0 0 0
## 4 aardsda01 2008 1 BOS AL 47 5 1 0 0 0 0 0 0 0
## 5 aardsda01 2009 1 SEA AL 73 3 0 0 0 0 0 0 0 0
## CS BB SO IBB HBP SH SF GIDP G_old
## 1 0 0 0 0 0 0 0 0 11
## 2 0 0 0 0 0 1 0 0 45
## 3 0 0 0 0 0 0 0 0 2
## 4 0 0 1 0 0 0 0 0 5
## 5 0 0 0 0 0 0 0 0 NA
team_info <- select(tbl(lahman_sqlite(), "Teams"), yearID, lgID, teamID, G, R:H)
head(team_info,5)
## yearID lgID teamID G R AB H
## 1 1871 NA PH1 28 376 1281 410
## 2 1871 NA CH1 28 302 1196 323
## 3 1871 NA BS1 31 401 1372 426
## 4 1871 NA WS3 32 310 1353 375
## 5 1871 NA NY2 33 302 1404 403
# Combine player and whole team statistics
first_stint <- select(filter(batting, stint == 1), playerID:H)
head(first_stint,5)
## playerID yearID stint teamID lgID G G_batting AB R H
## 1 aardsda01 2004 1 SFN NL 11 11 0 0 0
## 2 aardsda01 2006 1 CHN NL 45 43 2 0 0
## 3 aardsda01 2007 1 CHA AL 25 2 0 0 0
## 4 aardsda01 2008 1 BOS AL 47 5 1 0 0
## 5 aardsda01 2009 1 SEA AL 73 3 0 0 0
both <- left_join(first_stint, team_info, type = "inner", by = c("yearID", "teamID", "lgID"))
head(both)
## playerID yearID stint teamID lgID G.x G_batting AB.x R.x H.x G.y R.y
## 1 aardsda01 2004 1 SFN NL 11 11 0 0 0 162 850
## 2 aardsda01 2006 1 CHN NL 45 43 2 0 0 162 716
## 3 aardsda01 2007 1 CHA AL 25 2 0 0 0 162 693
## 4 aardsda01 2008 1 BOS AL 47 5 1 0 0 162 845
## 5 aardsda01 2009 1 SEA AL 73 3 0 0 0 162 640
## 6 aardsda01 2010 1 SEA AL 53 4 0 0 0 162 513
## AB.y H.y
## 1 5546 1500
## 2 5587 1496
## 3 5441 1341
## 4 5596 1565
## 5 5543 1430
## 6 5409 1274
#explain(both)
# Join with a local data frame
grid <- expand.grid(
teamID = c("WAS", "ATL", "PHI", "NYA"),
yearID = 2010:2012)
grid
## teamID yearID
## 1 WAS 2010
## 2 ATL 2010
## 3 PHI 2010
## 4 NYA 2010
## 5 WAS 2011
## 6 ATL 2011
## 7 PHI 2011
## 8 NYA 2011
## 9 WAS 2012
## 10 ATL 2012
## 11 PHI 2012
## 12 NYA 2012
top4a <- left_join(batting, grid, copy = TRUE)
## Joining by: c("yearID", "teamID")
head(top4a)
## playerID yearID stint teamID lgID G G_batting AB R H X2B X3B HR RBI SB
## 1 aardsda01 2004 1 SFN NL 11 11 0 0 0 0 0 0 0 0
## 2 aardsda01 2006 1 CHN NL 45 43 2 0 0 0 0 0 0 0
## 3 aardsda01 2007 1 CHA AL 25 2 0 0 0 0 0 0 0 0
## 4 aardsda01 2008 1 BOS AL 47 5 1 0 0 0 0 0 0 0
## 5 aardsda01 2009 1 SEA AL 73 3 0 0 0 0 0 0 0 0
## 6 aardsda01 2010 1 SEA AL 53 4 0 0 0 0 0 0 0 0
## CS BB SO IBB HBP SH SF GIDP G_old
## 1 0 0 0 0 0 0 0 0 11
## 2 0 0 0 0 0 1 0 0 45
## 3 0 0 0 0 0 0 0 0 2
## 4 0 0 1 0 0 0 0 0 5
## 5 0 0 0 0 0 0 0 0 NA
## 6 0 0 0 0 0 0 0 0 NA
# explain(top4a)
# Indices don't really help here because there's no matching index on
# batting
# top4b <- left_join(batting, grid, copy = TRUE, auto_index = TRUE)
#head(top4b)
# explain(top4b)
# Semi-joins
people <- tbl(lahman_sqlite(), "Master")
head(people)
## playerID birthYear birthMonth birthDay birthCountry birthState
## 1 aardsda01 1981 12 27 USA CO
## 2 aaronha01 1934 2 5 USA AL
## 3 aaronto01 1939 8 5 USA AL
## 4 aasedo01 1954 9 8 USA CA
## 5 abadan01 1972 8 25 USA FL
## 6 abadfe01 1985 12 17 D.R. La Romana
## birthCity deathYear deathMonth deathDay deathCountry deathState
## 1 Denver NA NA NA <NA> <NA>
## 2 Mobile NA NA NA <NA> <NA>
## 3 Mobile 1984 8 16 USA GA
## 4 Orange NA NA NA <NA> <NA>
## 5 Palm Beach NA NA NA <NA> <NA>
## 6 La Romana NA NA NA <NA> <NA>
## deathCity nameFirst nameLast nameGiven weight height bats throws
## 1 <NA> David Aardsma David Allan 205 75 R R
## 2 <NA> Hank Aaron Henry Louis 180 72 R R
## 3 Atlanta Tommie Aaron Tommie Lee 190 75 R R
## 4 <NA> Don Aase Donald William 190 75 R R
## 5 <NA> Andy Abad Fausto Andres 184 73 L L
## 6 <NA> Fernando Abad Fernando Antonio 220 73 L L
## debut finalGame retroID bbrefID deathDate birthDate
## 1 2004-04-06 2013-09-28 aardd001 aardsda01 NA 4378
## 2 1954-04-13 1976-10-03 aaroh101 aaronha01 NA -13114
## 3 1962-04-10 1971-09-26 aarot101 aaronto01 5341 -11107
## 4 1977-07-26 1990-10-03 aased001 aasedo01 NA -5594
## 5 2001-09-10 2006-04-13 abada001 abadan01 NA 967
## 6 2010-07-28 2013-09-27 abadf001 abadfe01 NA 5829
# All people in half of fame
hof <- tbl(lahman_sqlite(), "HallOfFame")
head(hof)
## playerID yearID votedBy ballots needed votes inducted category
## 1 cobbty01 1936 BBWAA 226 170 222 Y Player
## 2 ruthba01 1936 BBWAA 226 170 215 Y Player
## 3 wagneho01 1936 BBWAA 226 170 215 Y Player
## 4 mathech01 1936 BBWAA 226 170 205 Y Player
## 5 johnswa01 1936 BBWAA 226 170 189 Y Player
## 6 lajoina01 1936 BBWAA 226 170 146 N Player
## needed_note
## 1 <NA>
## 2 <NA>
## 3 <NA>
## 4 <NA>
## 5 <NA>
## 6 <NA>
semi_join(people, hof)
## Joining by: "playerID"
## Source: sqlite 3.8.6 [C:\Users\SHIMAD~1\AppData\Local\Temp\RtmpMfAkMc/lahman.sqlite]
## From: <derived table> [?? x 26]
##
## playerID birthYear birthMonth birthDay birthCountry birthState
## 1 aaronha01 1934 2 5 USA AL
## 2 abbotji01 1967 9 19 USA MI
## 3 adamsba01 1882 5 18 USA IN
## 4 adamsbo03 1921 12 14 USA CA
## 5 adamssp01 1894 8 26 USA PA
## 6 ageeto01 1942 8 9 USA AL
## 7 aguilri01 1961 12 31 USA CA
## 8 akerja01 1940 7 13 USA CA
## 9 alexado01 1950 9 4 USA AL
## 10 alexape01 1887 2 26 USA NE
## .. ... ... ... ... ... ...
## Variables not shown: birthCity (chr), deathYear (int), deathMonth (int),
## deathDay (int), deathCountry (chr), deathState (chr), deathCity (chr),
## nameFirst (chr), nameLast (chr), nameGiven (chr), weight (int), height
## (int), bats (chr), throws (chr), debut (chr), finalGame (chr), retroID
## (chr), bbrefID (chr), deathDate (dbl), birthDate (dbl)
anti_join(people, hof)
## Joining by: "playerID"
## Source: sqlite 3.8.6 [C:\Users\SHIMAD~1\AppData\Local\Temp\RtmpMfAkMc/lahman.sqlite]
## From: <derived table> [?? x 26]
##
## playerID birthYear birthMonth birthDay birthCountry birthState
## 1 aardsda01 1981 12 27 USA CO
## 2 aaronto01 1939 8 5 USA AL
## 3 aasedo01 1954 9 8 USA CA
## 4 abadan01 1972 8 25 USA FL
## 5 abadfe01 1985 12 17 D.R. La Romana
## 6 abadijo01 1854 11 4 USA PA
## 7 abbated01 1877 4 15 USA PA
## 8 abbeybe01 1869 11 11 USA VT
## 9 abbeych01 1866 10 14 USA NE
## 10 abbotda01 1862 3 16 USA OH
## .. ... ... ... ... ... ...
## Variables not shown: birthCity (chr), deathYear (int), deathMonth (int),
## deathDay (int), deathCountry (chr), deathState (chr), deathCity (chr),
## nameFirst (chr), nameLast (chr), nameGiven (chr), weight (int), height
## (int), bats (chr), throws (chr), debut (chr), finalGame (chr), retroID
## (chr), bbrefID (chr), deathDate (dbl), birthDate (dbl)
# Find all managers
manager <- tbl(lahman_sqlite(), "Managers")
head(manager)
## playerID yearID teamID lgID inseason G W L rank plyrMgr
## 1 wrighha01 1871 BS1 NA 1 31 20 10 3 Y
## 2 woodji01 1871 CH1 NA 1 28 19 9 2 Y
## 3 paborch01 1871 CL1 NA 1 29 10 19 8 Y
## 4 lennobi01 1871 FW1 NA 1 14 5 9 8 Y
## 5 deaneha01 1871 FW1 NA 2 5 2 3 8 Y
## 6 fergubo01 1871 NY2 NA 1 33 16 17 5 Y
# Find all managers in hall of fame
famous_manager <- semi_join(semi_join(people, manager), hof)
## Joining by: "playerID"
## Joining by: "playerID"
famous_manager
## Source: sqlite 3.8.6 [C:\Users\SHIMAD~1\AppData\Local\Temp\RtmpMfAkMc/lahman.sqlite]
## From: <derived table> [?? x 26]
##
## playerID birthYear birthMonth birthDay birthCountry birthState
## 1 allisdo01 1846 7 12 USA PA
## 2 alomasa02 1966 6 18 P.R. NA
## 3 aloufe01 1935 5 12 D.R. San Cristobal
## 4 alstowa01 1911 12 1 USA OH
## 5 andersp01 1934 2 22 USA SD
## 6 ansonca01 1852 4 17 USA IA
## 7 applilu01 1907 4 2 USA NC
## 8 austiji01 1879 12 8 United Kingdom Swansea
## 9 bakerdu01 1949 6 15 USA CA
## 10 bancrda01 1891 4 20 USA IA
## .. ... ... ... ... ... ...
## Variables not shown: birthCity (chr), deathYear (int), deathMonth (int),
## deathDay (int), deathCountry (chr), deathState (chr), deathCity (chr),
## nameFirst (chr), nameLast (chr), nameGiven (chr), weight (int), height
## (int), bats (chr), throws (chr), debut (chr), finalGame (chr), retroID
## (chr), bbrefID (chr), deathDate (dbl), birthDate (dbl)
# explain(famous_manager)
# Anti-joins ----------------------------------------------------------------
# batters without person covariates
#anti_join(batting, people)
#}
#Examples
lead(1:10, 1)
## [1] 2 3 4 5 6 7 8 9 10 NA
lead(1:10, 2)
## [1] 3 4 5 6 7 8 9 10 NA NA
lag(1:10, 1)
## [1] NA 1 2 3 4 5 6 7 8 9
lead(1:10, 1)
## [1] 2 3 4 5 6 7 8 9 10 NA
x <- runif(5)
cbind(ahead = lead(x), x, behind = lag(x))
## ahead x behind
## [1,] 0.2664911 0.1326000 NA
## [2,] 0.0858742 0.2664911 0.1326000
## [3,] 0.4120477 0.0858742 0.2664911
## [4,] 0.9449989 0.4120477 0.0858742
## [5,] NA 0.9449989 0.4120477
# Use order_by if data not already ordered
df <- data.frame(year = 2000:2005, value = (0:5) ^ 2)
scrambled <- df[sample(nrow(df)), ]
wrong <- mutate(scrambled, prev = lag(value))
arrange(wrong, year)
## year value prev
## 1 2000 0 1
## 2 2001 1 16
## 3 2002 4 9
## 4 2003 9 0
## 5 2004 16 25
## 6 2005 25 NA
right <- mutate(scrambled, prev = lag(value, order_by = year))
arrange(right, year)
## year value prev
## 1 2000 0 NA
## 2 2001 1 0
## 3 2002 4 1
## 4 2003 9 4
## 5 2004 16 9
## 6 2005 25 16
#Examples
location(mtcars)
## <05CF3A20>
## Variables:
## * mpg: <03D70308>
## * cyl: <04D50D40>
## * disp: <03BD45E8>
## * hp: <0391C8E0>
## * drat: <02331CF8>
## * wt: <0384D650>
## * qsec: <03292E50>
## * vs: <03AE4BA0>
## * am: <022EC4E8>
## * gear: <039A0BA8>
## * carb: <035BD618>
## * model: <05FCE8D8>
## Attributes:
## * names: <05CF3A78>
## * row.names: <05FCE8D8>
## * class: <05F00728>
mtcars2 <- mutate(mtcars, cyl2 = cyl * 2)
location(mtcars2)
## <12BB4E50>
## Variables:
## * mpg: <03D70308>
## * cyl: <04D50D40>
## * disp: <03BD45E8>
## * hp: <0391C8E0>
## * drat: <02331CF8>
## * wt: <0384D650>
## * qsec: <03292E50>
## * vs: <03AE4BA0>
## * am: <022EC4E8>
## * gear: <039A0BA8>
## * carb: <035BD618>
## * model: <05FCE8D8>
## * cyl2: <0CE30990>
## Attributes:
## * class: <0C30F8C8>
## * names: <12BB4DF8>
## * row.names: <11649B20>
changes(mtcars, mtcars)
## <identical>
changes(mtcars, mtcars2)
## Changed variables:
## old new
## cyl2 <added> 0CE30990
##
## Changed attributes:
## old new
## names 05CF3A78 12BB4DF8
## row.names 05FCE8D8 0D5FF9D8
## class 05F00728 0C30F8C8