Based on Select Pupils by Number of Pupils per Group, by Andrea Cantieni.
Selecting observations (rows) by the value of a variable (column) is simple, e.g. dfr[dfr$var > 2, ],
but what about selecting by an aggregate of the values for the observations in a group?
# Tell knitr not to tidy (reformat) the code shown in the chunks.
# The call is namespace-qualified so it also works when knitr is loaded
# but not attached to the search path.
knitr::opts_chunk$set(tidy = FALSE)
Ideas on tidiness differ.
# library() stops with an error when dplyr is not installed; require() would
# only return FALSE and let the script fail later at the first dplyr call.
library(dplyr)
# Reproducible toy data: nine pupils in three classes of size 2, 4 and 3,
# each with a math score drawn from a normal distribution (mean 60, sd 10)
# and truncated to an integer.
set.seed(123)
dfr <- data.frame(
  id = 1:9,
  classid = rep(c(1, 2, 3), times = c(2, 4, 3)),
  math = as.integer(rnorm(9, mean = 60, sd = 10))
)
dfr
## id classid math
## 1 1 1 54
## 2 2 1 57
## 3 3 2 75
## 4 4 2 60
## 5 5 2 61
## 6 6 2 77
## 7 7 3 64
## 8 8 3 47
## 9 9 3 53
# Show the structure of the data: id and math are integer columns,
# classid is numeric.
str(dfr)
## 'data.frame': 9 obs. of 3 variables:
## $ id : int 1 2 3 4 5 6 7 8 9
## $ classid: num 1 1 2 2 2 2 3 3 3
## $ math : int 54 57 75 60 61 77 64 47 53
# Step by step: group the pupils by class first, then keep only the rows
# that belong to a class with more than two pupils.
by_class <- group_by(dfr, classid)
filter(by_class, n() > 2)
## Source: local data frame [7 x 3]
## Groups: classid
##
## id classid math
## 1 3 2 75
## 2 4 2 60
## 3 5 2 61
## 4 6 2 77
## 5 7 3 64
## 6 8 3 47
## 7 9 3 53
# The grouped copy was only needed for this one call.
rm(by_class)
# The same selection as one nested call, without an intermediate object.
filter(
  group_by(dfr, classid),
  n() > 2
)
## Source: local data frame [7 x 3]
## Groups: classid
##
## id classid math
## 1 3 2 75
## 2 4 2 60
## 3 5 2 61
## 4 6 2 77
## 5 7 3 64
## 6 8 3 47
## 7 9 3 53
# or with the %>% pipe operator: group by class, then keep the classes with
# more than two pupils. (The older %.% operator was deprecated in dplyr 0.2
# and is no longer available in current releases.)
dfr %>%
  group_by(classid) %>%
  filter(n() > 2)
## Source: local data frame [7 x 3]
## Groups: classid
##
## id classid math
## 1 3 2 75
## 2 4 2 60
## 3 5 2 61
## 4 6 2 77
## 5 7 3 64
## 6 8 3 47
## 7 9 3 53
# Base R equivalent: count the pupils per class with xtabs() and keep the
# rows whose classid is among the classes counted more than twice.
# which() returns positions in the table, not class ids, so the ids are
# taken from names(); matching on the positions is only correct while the
# ids happen to be 1, 2, 3, ...
dfr[dfr$classid %in% names(which(xtabs(~classid, dfr) > 2)), ]
## id classid math
## 3 3 2 75
## 4 4 2 60
## 5 5 2 61
## 6 6 2 77
## 7 7 3 64
## 8 8 3 47
## 9 9 3 53
# Add a character label per class (1 -> "A", 2 -> "B", 3 -> "C"); any other
# classid gets NA. The label is looked up by position in a vector.
dfr <- mutate(dfr, name = c("A", "B", "C")[match(classid, c(1, 2, 3))])
str(dfr$name)
## chr [1:9] "A" "A" "B" "B" "B" "B" "C" "C" "C"
# Same selection grouped by the character label. Uses %>% because the older
# %.% operator was deprecated in dplyr 0.2 and is no longer available.
dfr %>%
  group_by(name) %>%
  filter(n() > 2)
## Source: local data frame [7 x 4]
## Groups: name
##
## id classid math name
## 1 3 2 75 B
## 2 4 2 60 B
## 3 5 2 61 B
## 4 6 2 77 B
## 5 7 3 64 C
## 6 8 3 47 C
## 7 9 3 53 C
# Base R equivalent for the character column: count the pupils per name and
# filter on that same column. The qualifying labels come from names(),
# because which() only returns their positions in the table; comparing
# classid with those positions works only while classid and the position of
# the label coincide.
dfr[dfr$name %in% names(which(xtabs(~name, dfr) > 2)), ]
## id classid math name
## 3 3 2 75 B
## 4 4 2 60 B
## 5 5 2 61 B
## 6 6 2 77 B
## 7 7 3 64 C
## 8 8 3 47 C
## 9 9 3 53 C
# Derive a factor version of the class label and inspect its levels.
dfr[["fac"]] <- factor(dfr[["name"]])
str(dfr[["fac"]])
## Factor w/ 3 levels "A","B","C": 1 1 2 2 2 2 3 3 3
# Same selection grouped by the factor column. Uses %>% because the older
# %.% operator was deprecated in dplyr 0.2 and is no longer available.
dfr %>%
  group_by(fac) %>%
  filter(n() > 2)
## Source: local data frame [7 x 5]
## Groups: fac
##
## id classid math name fac
## 1 3 2 75 B B
## 2 4 2 60 B B
## 3 5 2 61 B B
## 4 6 2 77 B B
## 5 7 3 64 C C
## 6 8 3 47 C C
## 7 9 3 53 C C
# Pitfall: which() yields the integer positions of the qualifying levels,
# whereas %in% compares the values of dfr$fac by their labels
# ("A", "B", "C"). Nothing matches, so no rows are returned.
dfr[dfr$fac %in% which(xtabs(~fac, dfr) > 2), ]
## [1] id classid math name fac
## <0 rows> (or 0-length row.names)
# Correct version: compare the factor with the labels of the levels that
# occur more than twice, not with their positions in the table.
dfr[dfr$fac %in% names(which(table(dfr$fac) > 2)), ]
## id classid math name fac
## 3 3 2 75 B B
## 4 4 2 60 B B
## 5 5 2 61 B B
## 6 6 2 77 B B
## 7 7 3 64 C C
## 8 8 3 47 C C
## 9 9 3 53 C C