Select Observations by an Aggregate

of the Observations per Group

Based on Select Pupils by Number of Pupils per Group, by Andrea Cantieni.

Selecting observations (rows) by the value in a variable (column) is simple, e.g. dfr[ dfr$var > 2, ], but what about an aggregate of values for the obserservations in a group?

opts_chunk$set(tidy = FALSE)

Ideas on tidiness differ.

require(dplyr)

Data frame

set.seed(123)
dfr <- data.frame(id = 1:9
                  , classid = c(rep(1,2), rep(2,4), rep(3,3))
                  , math = as.integer(rnorm(n=9, mean=60, sd=10)))
dfr

##   id classid math
## 1  1       1   54
## 2  2       1   57
## 3  3       2   75
## 4  4       2   60
## 5  5       2   61
## 6  6       2   77
## 7  7       3   64
## 8  8       3   47
## 9  9       3   53

str(dfr)

## 'data.frame':    9 obs. of  3 variables:
##  $ id     : int  1 2 3 4 5 6 7 8 9
##  $ classid: num  1 1 2 2 2 2 3 3 3
##  $ math   : int  54 57 75 60 61 77 64 47 53

dplyr, 3 styles

# step-by-step
dfr.g <- group_by(dfr, classid)
filter(dfr.g, n()>2)

## Source: local data frame [7 x 3]
## Groups: classid
## 
##   id classid math
## 1  3       2   75
## 2  4       2   60
## 3  5       2   61
## 4  6       2   77
## 5  7       3   64
## 6  8       3   47
## 7  9       3   53

remove(dfr.g)

# or nested syntax
filter(group_by(dfr, classid)
       , n()>2)

## Source: local data frame [7 x 3]
## Groups: classid
## 
##   id classid math
## 1  3       2   75
## 2  4       2   60
## 3  5       2   61
## 4  6       2   77
## 5  7       3   64
## 6  8       3   47
## 7  9       3   53


# or with %.% operator
dfr %.% 
  group_by(classid) %.% 
  filter(n()>2)

## Source: local data frame [7 x 3]
## Groups: classid
## 
##   id classid math
## 1  3       2   75
## 2  4       2   60
## 3  5       2   61
## 4  6       2   77
## 5  7       3   64
## 6  8       3   47
## 7  9       3   53

Select by table aggregate in base

dfr[dfr$classid %in% which(xtabs(~classid, dfr) > 2), ]

##   id classid math
## 3  3       2   75
## 4  4       2   60
## 5  5       2   61
## 6  6       2   77
## 7  7       3   64
## 8  8       3   47
## 9  9       3   53

If Group is character, rather than numeric?

dfr <- mutate(dfr, name = ifelse(classid == 1, 'A'
                        , ifelse(classid == 2, 'B'
                        , ifelse(classid == 3, 'C'
                                 , NA))))
str(dfr$name)

##  chr [1:9] "A" "A" "B" "B" "B" "B" "C" "C" "C"

dplyr and base both work

dfr %.% 
  group_by(name) %.% 
  filter(n()>2)

## Source: local data frame [7 x 4]
## Groups: name
## 
##   id classid math name
## 1  3       2   75    B
## 2  4       2   60    B
## 3  5       2   61    B
## 4  6       2   77    B
## 5  7       3   64    C
## 6  8       3   47    C
## 7  9       3   53    C


dfr[dfr$classid %in% which(xtabs(~name, dfr) > 2), ]

##   id classid math name
## 3  3       2   75    B
## 4  4       2   60    B
## 5  5       2   61    B
## 6  6       2   77    B
## 7  7       3   64    C
## 8  8       3   47    C
## 9  9       3   53    C

If Group is a factor (with an internal value that's integer)?

dfr$fac <- as.factor(dfr$name)
str(dfr$fac)

##  Factor w/ 3 levels "A","B","C": 1 1 2 2 2 2 3 3 3

dplyr works, but base won't

dfr %.% 
  group_by(fac) %.% 
  filter(n()>2)

## Source: local data frame [7 x 5]
## Groups: fac
## 
##   id classid math name fac
## 1  3       2   75    B   B
## 2  4       2   60    B   B
## 3  5       2   61    B   B
## 4  6       2   77    B   B
## 5  7       3   64    C   C
## 6  8       3   47    C   C
## 7  9       3   53    C   C


dfr[dfr$fac %in% which(xtabs(~fac, dfr) > 2), ]

## [1] id      classid math    name    fac    
## <0 rows> (or 0-length row.names)

Select by Factor names, not internal value

dfr[dfr$fac %in% names(which(xtabs(~fac, dfr) > 2)), ]

##   id classid math name fac
## 3  3       2   75    B   B
## 4  4       2   60    B   B
## 5  5       2   61    B   B
## 6  6       2   77    B   B
## 7  7       3   64    C   C
## 8  8       3   47    C   C
## 9  9       3   53    C   C