library(tidyverse)
library(DT)
library(lubridate)
library(stringr)
library(explore)

# Browse the full iris data set as an interactive table.
datatable(iris)

# Count the unique values of Species (i.e. how many species there are).
n_distinct(iris[["Species"]])

## [1] 3

# Tabulate how often each level of the Species factor occurs.
library(forcats)
fct_count(iris[["Species"]])

分组求均值

# Mean sepal width, restricted to the setosa rows.
mean(pull(filter(iris, Species == "setosa"), Sepal.Width))

## [1] 3.428

# Mean sepal length, restricted to the setosa rows.
mean(pull(filter(iris, Species == "setosa"), Sepal.Length))

## [1] 5.006

接下来看简单方法

汇总数据

summarise

# Whole-table means of sepal length and width, returned as a one-row data frame.
summarise(iris, mean(Sepal.Length), mean(Sepal.Width))

# The same sepal-length mean, computed directly on the column vector.
mean(iris[["Sepal.Length"]])

## [1] 5.843333

# Several summary helpers in a single summarise() call: the row count, the
# number of distinct species, and the species at the first, last and 51st row.
iris %>%
  summarise(
    n = n(),
    unique = n_distinct(Species),
    first = first(Species),
    last = last(Species),
    # Non-syntactic name (starts with a digit), hence the backticks.
    `51st` = nth(Species, 51)
  )

summarize_all

# Number of distinct values in every column.
summarise_all(iris, n_distinct)

summarize_if

# Number of distinct values in each numeric column only.
iris %>%
  summarise_if(is.numeric, n_distinct)

summarize_at

# Number of distinct values in the columns whose name contains "s".
summarise_at(iris, vars(contains("s")), n_distinct)

group_by和summarise(*)

# One row per species holding the mean sepal length of that species.
summarise(
  group_by(iris, Species),
  Avg.Sepal.Length = mean(Sepal.Length)
)

# One row per species with the standard deviation, sum and mean of petal length.
summarise(
  group_by(iris, Species),
  sd.Petal.Length = sd(Petal.Length),
  sum.Petal.Length = sum(Petal.Length),
  mean.Petal.Length = mean(Petal.Length)
)

group_by和mutate(*)

# Group by species first, then create the new column: every row receives the
# mean sepal length of its own species.
DT::datatable(
  mutate(
    group_by(iris, Species),
    Avg.Sepal.Length = mean(Sepal.Length)
  )
)

# Same idea with two per-species columns (mean and sum).
DT::datatable(
  mutate(
    group_by(iris, Species),
    Avg.Sepal.Length = mean(Sepal.Length),
    sum_Sepal.Length = sum(Sepal.Length)
  )
)

# For comparison: create the column without grouping.
# The mean is then the mean of the whole table.
DT::datatable(
  mutate(iris, Avg.Sepal.Length = mean(Sepal.Length))
)

group_by和slice

# Show the first two rows of each species group.
slice(group_by(iris, Species), 1:2)

group_by和arrange

# Sort by sepal length within each species group and show the result as an
# interactive table. `.by_group = TRUE` makes arrange() order by the grouping
# variable first; TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
iris %>%
  group_by(Species) %>%
  arrange(Sepal.Length, .by_group = TRUE) %>%
  datatable()

多个分组

默认情况下，后续的 group_by() 会覆盖现有分组；若要在现有分组的基础上追加分组，可以使用 `.add = TRUE` 参数（旧的 `add` 参数自 dplyr 1.0.0 起已弃用）。

# The second group_by() replaces the Species grouping, so Sepal.LengthN counts
# rows per Sepal.Length value across the whole table.
datatable(
  mutate(
    group_by(
      mutate(group_by(iris, Species), SpeciesN = n()),
      Sepal.Length
    ),
    Sepal.LengthN = n()
  )
)

# Here the second group_by() adds Sepal.Length to the existing Species
# grouping, so Sepal.LengthN counts rows per (Species, Sepal.Length) pair.
# `.add = TRUE` replaces the `add` argument, which is deprecated since
# dplyr 1.0.0.
iris %>%
  group_by(Species) %>%
  mutate(SpeciesN = n()) %>%
  group_by(Sepal.Length, .add = TRUE) %>%
  mutate(Sepal.LengthN = n()) %>%
  datatable()

## Removing groups with ungroup(): once the grouping is dropped, later verbs
## work on the whole table again.

datatable(
  ungroup(
    mutate(group_by(iris, Species), SpeciesN = n())
  )
)

# Structure of the result while it is still grouped.
str(
  mutate(group_by(iris, Species), SpeciesN = n())
)

## tibble [150 x 6] (S3: grouped_df/tbl_df/tbl/data.frame)
##  $ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ SpeciesN    : int [1:150] 50 50 50 50 50 50 50 50 50 50 ...
##  - attr(*, "groups")= tibble [3 x 2] (S3: tbl_df/tbl/data.frame)
##   ..$ Species: Factor w/ 3 levels "setosa","versicolor",..: 1 2 3
##   ..$ .rows  :List of 3
##   .. ..$ : int [1:50] 1 2 3 4 5 6 7 8 9 10 ...
##   .. ..$ : int [1:50] 51 52 53 54 55 56 57 58 59 60 ...
##   .. ..$ : int [1:50] 101 102 103 104 105 106 107 108 109 110 ...
##   ..- attr(*, ".drop")= logi TRUE

# Structure of the same result after ungroup(), for comparison with the
# grouped version.
str(
  ungroup(
    mutate(group_by(iris, Species), SpeciesN = n())
  )
)

## tibble [150 x 6] (S3: tbl_df/tbl/data.frame)
##  $ Sepal.Length: num [1:150] 5.1 4.9 4.7 4.6 5 5.4 4.6 5 4.4 4.9 ...
##  $ Sepal.Width : num [1:150] 3.5 3 3.2 3.1 3.6 3.9 3.4 3.4 2.9 3.1 ...
##  $ Petal.Length: num [1:150] 1.4 1.4 1.3 1.5 1.4 1.7 1.4 1.5 1.4 1.5 ...
##  $ Petal.Width : num [1:150] 0.2 0.2 0.2 0.2 0.2 0.4 0.3 0.2 0.2 0.1 ...
##  $ Species     : Factor w/ 3 levels "setosa","versicolor",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ SpeciesN    : int [1:150] 50 50 50 50 50 50 50 50 50 50 ...

连接数据集

# Two small example tables for the join demos. They share the key columns
# A and B; X additionally carries C and Y carries D (both integer columns).
X <- tribble(
  ~A,  ~B,  ~C,
  "a", "t", 1L,
  "b", "u", 2L,
  "c", "v", 3L
)

Y <- tribble(
  ~A,  ~B,  ~D,
  "a", "t", 3L,
  "b", "u", 2L,
  "d", "w", 1L
)

inner join

# Keep only rows whose keys appear in both tables; without `by`, all columns
# with a common name (A and B) are used as keys.
X %>%
  inner_join(Y)

## Joining, by = c("A", "B")

# Match on column A only.
X %>%
  inner_join(Y, by = "A")

left join

# Join with X as the base table. Columns sharing a name are matched
# automatically; `by` can be used to name specific key columns.
X %>%
  left_join(Y)

## Joining, by = c("A", "B")

# The same join with Y as the base table.
Y %>%
  left_join(X)

## Joining, by = c("A", "B")

full join(相当于取并集)

# Keep the rows of both tables (the union of their keys).
X %>%
  full_join(Y)

## Joining, by = c("A", "B")

anti join

# Rows of X that have no match in Y.
X %>%
  anti_join(Y)

## Joining, by = c("A", "B")

# Rows of Y that have no match in X.
Y %>%
  anti_join(X)

## Joining, by = c("A", "B")

bind_cols

# Glue columns 1-3 and columns 4-5 of iris back together side by side and
# show the first rows of the result.
head(bind_cols(iris[, 1:3], iris[, 4:5]))

bind_rows

# Stack 50 randomly sampled rows without the Species column on top of the
# full iris table, then show a random sample of 100 rows from the stacked
# result as an interactive table.
datatable(
  sample_n(
    bind_rows(
      sample_n(select(iris, -Species), 50),
      iris
    ),
    100
  )
)

union(两个表具有相同结构，联合操作)

如果两个表具有相同结构，union()删除重复数据，union_all()不删除重复数据，完整拼接。

rownames_to_column 将行名变成一列

# Print the original data frame.
mtcars

# Move the row names into a regular column named "car".
mtcars2 <- rownames_to_column(mtcars, "car")

# Print the converted data frame.
mtcars2

Tidyverse2

xxr

2020/8/22

汇总数据

summarise

summarize_all

summarize_if

summarize_at

group_by和summarise(*)

group_by和mutate(*)

group_by和slice

group_by和arrange

多个分组

连接数据集

inner join

left join

full join(相当于取并集)

anti join

bind_cols

bind_rows

union(两个表具有相同结构，联合操作)

rownames_to_column 将行名变成一列