R语言︱数据集分组

1 cut()

日期分组

# cut(): bin a vector of dates into calendar months.
# as.Date() is essential here: without it, R would treat these values as
# plain character strings rather than Date objects.
vDates <- as.Date(
  c("2013-06-01", "2013-07-08", "2013-09-01", "2013-09-15"),
  format = "%Y-%m-%d"
)

# breaks = "month" labels every date with the first day of its month.
vDates.bymonth <- cut(vDates, breaks = "month")

# Show the original dates next to their month bins.
Dates <- data.frame(vDates = vDates, vDates.bymonth = vDates.bymonth)
Dates
##       vDates vDates.bymonth
## 1 2013-06-01     2013-06-01
## 2 2013-07-08     2013-07-01
## 3 2013-09-01     2013-09-01
## 4 2013-09-15     2013-09-01

2 dplyr– filter and select

aggregate()

# aggregate(): grouped summaries of a data frame.
orders <- data.frame(
  ORDERID = 1:5,
  CLIENT = c("WVF VIP", "UFS COM", "SWFR", "JFS PEP", "DSG"),
  SELLERID = c(5, 13, 2, 27, 15),
  AMOUNT = c(440, 1863.4, 1813, 670.8, 3730)
)

# To group by CLIENT first and then by SELLERID, the grouping columns must be
# given in the order c("SELLERID", "CLIENT").
result1 <- aggregate(
  orders$AMOUNT,
  by = orders[, c("SELLERID", "CLIENT")],
  FUN = sum
)
result2 <- aggregate(
  orders$AMOUNT,
  by = orders[, c("SELLERID", "CLIENT")],
  FUN = max
)

# Append the per-group maxima to the per-group sums; the added column keeps
# the name "result2$x".
result <- result1
result[["result2$x"]] <- result2$x
result
##   SELLERID  CLIENT      x result2$x
## 1       15     DSG 3730.0    3730.0
## 2       27 JFS PEP  670.8     670.8
## 3        2    SWFR 1813.0    1813.0
## 4       13 UFS COM 1863.4    1863.4
## 5        5 WVF VIP  440.0     440.0

split-apply-combine

# Split-apply-combine: split the orders into one data frame per
# (SELLERID, CLIENT) pair; drop = TRUE discards combinations with no rows.
sp <- split(orders, orders[, c("SELLERID", "CLIENT")], drop = TRUE)

# Apply a summary to each piece. vapply() is used instead of sapply() so that
# every group must yield exactly one number and the result is always a named
# numeric vector, even when there are no groups.
result1 <- vapply(sp, FUN = function(x) sum(x$AMOUNT), FUN.VALUE = numeric(1))
result2 <- vapply(sp, FUN = function(x) max(x$AMOUNT), FUN.VALUE = numeric(1))

# Combine the two summaries; the group labels become the row names.
result <- data.frame("result1" = result1, "result2" = result2)
result
##            result1 result2
## 15.DSG      3730.0  3730.0
## 27.JFS PEP   670.8   670.8
## 2.SWFR      1813.0  1813.0
## 13.UFS COM  1863.4  1863.4
## 5.WVF VIP    440.0   440.0

subset()

# subset(x, subset, ...)                          # vectors (default method)
# subset(x, subset, select, drop = FALSE, ...)    # matrices
# subset(x, subset, select, drop = FALSE, ...)    # data frames
# x is the object to filter. subset is a logical expression choosing the
# elements or rows to keep; missing values are treated as FALSE.
# select is an expression naming the columns to keep.
x <- data.frame(matrix(1:30, nrow = 5, byrow = TRUE))
rownames(x) <- c("one", "two", "three", "four", "five")
colnames(x) <- c("a", "b", "c", "d", "e", "f")

# Keep the rows where a >= 14 and the columns a through f.
new <- subset(x, a >= 14, select = a:f)
new
##       a  b  c  d  e  f
## four 19 20 21 22 23 24
## five 25 26 27 28 29 30

which

# which(): positions of the TRUE entries of a logical vector.
data <- data.frame(V1 = c(1, 3, 4, 5), V2 = c(1, 4, 5, 3))
# Pick the V1 values on the rows where V2 is less than 5. Like order(),
# which() returns positions that are then used for indexing.
data[which(data$V2 < 5), "V1"]
## [1] 1 3 5

参考文献: R语言︱数据集分组、筛选