tidyverse包和整洁数据概念

tidyr 名字来自于 tidy(整洁)一词。所谓“整洁数据”,根据 Hadley Wickham 对整洁数据的专门研究,其定义如下:

加载tidyverse包及各个包的功能简介

library(tidyverse)
各个包的作用:

矩形数据:如果每行的元素数等于列数,并且每列的元素数等于行数,那么此类数据为矩形数据。但是数据并非都是这种类型。

tibble程序包引入了一种新的数据结构。

创建tibble,并和数据框进行相互转化

# Create a tibble with an integer column x and a character column y.
Tib <- tibble(
  x = 1:4,
  y = c("Beijing", "Shanghai", "Guangzhou", "Shenzhen")
)
Tib
## # A tibble: 4 × 2
##       x y        
##   <int> <chr>    
## 1     1 Beijing  
## 2     2 Shanghai 
## 3     3 Guangzhou
## 4     4 Shenzhen
# Create the same data as a base data.frame and print it.
Df <- data.frame(
  x = 1:4,
  y = c("Beijing", "Shanghai", "Guangzhou", "Shenzhen")
)
Df
##   x         y
## 1 1   Beijing
## 2 2  Shanghai
## 3 3 Guangzhou
## 4 4  Shenzhen
# Convert the data.frame to a tibble with as_tibble().
DftoTib <- as_tibble(Df)
DftoTib
## # A tibble: 4 × 2
##       x y        
##   <int> <chr>    
## 1     1 Beijing  
## 2     2 Shanghai 
## 3     3 Guangzhou
## 4     4 Shenzhen
# data.frame() converts strings to factors only when stringsAsFactors = TRUE,
# so Dffactor stores y as a factor and DfNotfactor keeps it as character.
Dffactor <- data.frame(
  x = 1:4,
  y = c("Beijing", "Shanghai", "Guangzhou", "Shenzhen"),
  stringsAsFactors = TRUE
)
DfNotfactor <- data.frame(
  x = 1:4,
  y = c("Beijing", "Shanghai", "Guangzhou", "Shenzhen"),
  stringsAsFactors = FALSE
)

# To make y a factor column in a tibble, wrap the c() call in factor().
Tib1 <- tibble(
  x = 1:4,
  y = factor(c("Beijing", "Shanghai", "Guangzhou", "Shenzhen"))
)
Tib1
## # A tibble: 4 × 2
##       x y        
##   <int> <fct>    
## 1     1 Beijing  
## 2     2 Shanghai 
## 3     3 Guangzhou
## 4     4 Shenzhen
# Collect the class of y from each of the five objects for comparison.
a <- c(
  class(Tib$y), class(Df$y), class(DfNotfactor$y),
  class(Dffactor$y), class(Tib1$y)
)
a
## [1] "character" "character" "character" "factor"    "factor"
# Selecting one column of a data.frame with `[` returns a plain vector ...
Df[, 1]
## [1] 1 2 3 4
# ... while the same `[` call on a tibble returns a one-column tibble.
Tib[, 1]
## # A tibble: 4 × 1
##       x
##   <int>
## 1     1
## 2     2
## 3     3
## 4     4
# `[[` and `$` extract a tibble column as a vector.
Tib[[1]]
## [1] 1 2 3 4
Tib$x
## [1] 1 2 3 4

数据框和tibble的区别

当输出数据框时,所有列都将被输出到计算机屏幕。

当输出tibble时,默认情况下只会输出前10行以及计算机屏幕所能显示的列数,未显示的变量名将在输出结果的底部列出。

# Load the starwars data set and print it.
data("starwars")
starwars
## # A tibble: 87 × 14
##    name        height  mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex   gender homew…⁵
##    <chr>        <int> <dbl> <chr>   <chr>   <chr>     <dbl> <chr> <chr>  <chr>  
##  1 Luke Skywa…    172    77 blond   fair    blue       19   male  mascu… Tatooi…
##  2 C-3PO          167    75 <NA>    gold    yellow    112   none  mascu… Tatooi…
##  3 R2-D2           96    32 <NA>    white,… red        33   none  mascu… Naboo  
##  4 Darth Vader    202   136 none    white   yellow     41.9 male  mascu… Tatooi…
##  5 Leia Organa    150    49 brown   light   brown      19   fema… femin… Aldera…
##  6 Owen Lars      178   120 brown,… light   blue       52   male  mascu… Tatooi…
##  7 Beru White…    165    75 brown   light   blue       47   fema… femin… Tatooi…
##  8 R5-D4           97    32 <NA>    white,… red        NA   none  mascu… Tatooi…
##  9 Biggs Dark…    183    84 black   light   brown      24   male  mascu… Tatooi…
## 10 Obi-Wan Ke…    182    77 auburn… fair    blue-g…    57   male  mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## #   vehicles <list>, starships <list>, and abbreviated variable names
## #   ¹​hair_color, ²​skin_color, ³​eye_color, ⁴​birth_year, ⁵​homeworld
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
# A CSV file is flat, so the list-columns (films, vehicles, starships) cannot
# be written; drop them before exporting the remaining columns.
write_csv(select(starwars, !where(is.list)), "starwars.csv")

依照顺序创建变量

# tibble() builds its columns in order, so totalWorth can be computed from the
# nItems and cost columns defined just before it in the same call.
sequentialTib <- tibble(
  nItems = c(12, 45, 107),
  cost = c(0.5, 1.2, 1.8),
  totalWorth = nItems * cost
)
sequentialTib
## # A tibble: 3 × 3
##   nItems  cost totalWorth
##    <dbl> <dbl>      <dbl>
## 1     12   0.5         6 
## 2     45   1.2        54 
## 3    107   1.8       193.

dplyr程序包及其功能介绍

dplyr程序包使我们能够以更直观的方式执行以下操作:

使用dplyr操作CO2数据集

library(tibble)
# Load the CO2 data set, convert it to a tibble and print it.
data("CO2")
CO2tib <- as_tibble(CO2)
CO2tib
## # A tibble: 84 × 5
##    Plant Type   Treatment   conc uptake
##    <ord> <fct>  <fct>      <dbl>  <dbl>
##  1 Qn1   Quebec nonchilled    95   16  
##  2 Qn1   Quebec nonchilled   175   30.4
##  3 Qn1   Quebec nonchilled   250   34.8
##  4 Qn1   Quebec nonchilled   350   37.2
##  5 Qn1   Quebec nonchilled   500   35.3
##  6 Qn1   Quebec nonchilled   675   39.2
##  7 Qn1   Quebec nonchilled  1000   39.7
##  8 Qn2   Quebec nonchilled    95   13.6
##  9 Qn2   Quebec nonchilled   175   27.3
## 10 Qn2   Quebec nonchilled   250   37.1
## # … with 74 more rows
## # ℹ Use `print(n = ...)` to see more rows
# select(): keep columns 1, 2, 3 and 5 (chosen by position).
selectedData <- select(CO2tib, c(1:3, 5))
selectedData
## # A tibble: 84 × 4
##    Plant Type   Treatment  uptake
##    <ord> <fct>  <fct>       <dbl>
##  1 Qn1   Quebec nonchilled   16  
##  2 Qn1   Quebec nonchilled   30.4
##  3 Qn1   Quebec nonchilled   34.8
##  4 Qn1   Quebec nonchilled   37.2
##  5 Qn1   Quebec nonchilled   35.3
##  6 Qn1   Quebec nonchilled   39.2
##  7 Qn1   Quebec nonchilled   39.7
##  8 Qn2   Quebec nonchilled   13.6
##  9 Qn2   Quebec nonchilled   27.3
## 10 Qn2   Quebec nonchilled   37.1
## # … with 74 more rows
## # ℹ Use `print(n = ...)` to see more rows
# filter(): keep only the rows where uptake is greater than 16.
filteredData <- filter(selectedData, uptake > 16)
filteredData
## # A tibble: 66 × 4
##    Plant Type   Treatment  uptake
##    <ord> <fct>  <fct>       <dbl>
##  1 Qn1   Quebec nonchilled   30.4
##  2 Qn1   Quebec nonchilled   34.8
##  3 Qn1   Quebec nonchilled   37.2
##  4 Qn1   Quebec nonchilled   35.3
##  5 Qn1   Quebec nonchilled   39.2
##  6 Qn1   Quebec nonchilled   39.7
##  7 Qn2   Quebec nonchilled   27.3
##  8 Qn2   Quebec nonchilled   37.1
##  9 Qn2   Quebec nonchilled   41.8
## 10 Qn2   Quebec nonchilled   40.6
## # … with 56 more rows
## # ℹ Use `print(n = ...)` to see more rows
# group_by(): group the rows by the Plant variable.
groupData <- group_by(filteredData, Plant)
groupData
## # A tibble: 66 × 4
## # Groups:   Plant [11]
##    Plant Type   Treatment  uptake
##    <ord> <fct>  <fct>       <dbl>
##  1 Qn1   Quebec nonchilled   30.4
##  2 Qn1   Quebec nonchilled   34.8
##  3 Qn1   Quebec nonchilled   37.2
##  4 Qn1   Quebec nonchilled   35.3
##  5 Qn1   Quebec nonchilled   39.2
##  6 Qn1   Quebec nonchilled   39.7
##  7 Qn2   Quebec nonchilled   27.3
##  8 Qn2   Quebec nonchilled   37.1
##  9 Qn2   Quebec nonchilled   41.8
## 10 Qn2   Quebec nonchilled   40.6
## # … with 56 more rows
## # ℹ Use `print(n = ...)` to see more rows
# ungroup(): remove the grouping structure from the tibble.
Data <- ungroup(groupData)
Data
## # A tibble: 66 × 4
##    Plant Type   Treatment  uptake
##    <ord> <fct>  <fct>       <dbl>
##  1 Qn1   Quebec nonchilled   30.4
##  2 Qn1   Quebec nonchilled   34.8
##  3 Qn1   Quebec nonchilled   37.2
##  4 Qn1   Quebec nonchilled   35.3
##  5 Qn1   Quebec nonchilled   39.2
##  6 Qn1   Quebec nonchilled   39.7
##  7 Qn2   Quebec nonchilled   27.3
##  8 Qn2   Quebec nonchilled   37.1
##  9 Qn2   Quebec nonchilled   41.8
## 10 Qn2   Quebec nonchilled   40.6
## # … with 56 more rows
## # ℹ Use `print(n = ...)` to see more rows
# summarize(): one row per Plant group with the mean and the standard
# deviation of uptake.
sumData <- summarize(
  groupData,
  meanup = mean(uptake),
  sdup = sd(uptake)
)
sumData
## # A tibble: 11 × 3
##    Plant meanup   sdup
##    <ord>  <dbl>  <dbl>
##  1 Qn1     36.1  3.42 
##  2 Qn2     38.8  6.07 
##  3 Qn3     37.6 10.3  
##  4 Qc1     32.6  5.03 
##  5 Qc3     35.5  7.52 
##  6 Qc2     36.6  5.14 
##  7 Mn3     26.2  3.49 
##  8 Mn2     29.9  3.92 
##  9 Mn1     29.0  5.70 
## 10 Mc3     18.4  0.826
## 11 Mc1     20.1  1.83
# mutate(): add the coefficient of variation of each group, in percent
# (standard deviation divided by mean, times 100).
muData <- mutate(sumData, CV = (sdup / meanup) * 100)
muData
## # A tibble: 11 × 4
##    Plant meanup   sdup    CV
##    <ord>  <dbl>  <dbl> <dbl>
##  1 Qn1     36.1  3.42   9.48
##  2 Qn2     38.8  6.07  15.7 
##  3 Qn3     37.6 10.3   27.5 
##  4 Qc1     32.6  5.03  15.4 
##  5 Qc3     35.5  7.52  21.2 
##  6 Qc2     36.6  5.14  14.1 
##  7 Mn3     26.2  3.49  13.3 
##  8 Mn2     29.9  3.92  13.1 
##  9 Mn1     29.0  5.70  19.6 
## 10 Mc3     18.4  0.826  4.48
## 11 Mc1     20.1  1.83   9.11
# arrange(): sort the tibble by CV; ascending order is the default.
arrData <- arrange(muData, CV)
arrData
## # A tibble: 11 × 4
##    Plant meanup   sdup    CV
##    <ord>  <dbl>  <dbl> <dbl>
##  1 Mc3     18.4  0.826  4.48
##  2 Mc1     20.1  1.83   9.11
##  3 Qn1     36.1  3.42   9.48
##  4 Mn2     29.9  3.92  13.1 
##  5 Mn3     26.2  3.49  13.3 
##  6 Qc2     36.6  5.14  14.1 
##  7 Qc1     32.6  5.03  15.4 
##  8 Qn2     38.8  6.07  15.7 
##  9 Mn1     29.0  5.70  19.6 
## 10 Qc3     35.5  7.52  21.2 
## 11 Qn3     37.6 10.3   27.5
# Wrap the column in desc() to sort in descending order instead.
arrData1 <- arrange(muData, desc(CV))
arrData1
## # A tibble: 11 × 4
##    Plant meanup   sdup    CV
##    <ord>  <dbl>  <dbl> <dbl>
##  1 Qn3     37.6 10.3   27.5 
##  2 Qc3     35.5  7.52  21.2 
##  3 Mn1     29.0  5.70  19.6 
##  4 Qn2     38.8  6.07  15.7 
##  5 Qc1     32.6  5.03  15.4 
##  6 Qc2     36.6  5.14  14.1 
##  7 Mn3     26.2  3.49  13.3 
##  8 Mn2     29.9  3.92  13.1 
##  9 Qn1     36.1  3.42   9.48
## 10 Mc1     20.1  1.83   9.11
## 11 Mc3     18.4  0.826  4.48

利用管道符 %>% 链接dplyr操作

%>% 快捷键:在RStudio中为 Ctrl+Shift+M(Windows/Linux)或 Cmd+Shift+M(macOS)
# Chain the dplyr steps with %>%: each step receives the previous result as
# its first argument.
arrData2 <- CO2tib %>% # start from the CO2 tibble
  select(c(1:3, 5)) %>% # keep the required columns
  filter(uptake > 16) %>% # keep rows with uptake above 16
  group_by(Plant) %>% # group by Plant
  # per-group mean and standard deviation of uptake
  summarize(meanup = mean(uptake), sdup = sd(uptake)) %>%
  # coefficient of variation in percent: sd / mean * 100
  mutate(CV = (sdup / meanup) * 100) %>%
  arrange(CV) # sort by CV in ascending order

arrData2 # print the result
## # A tibble: 11 × 4
##    Plant meanup   sdup    CV
##    <ord>  <dbl>  <dbl> <dbl>
##  1 Mc3     18.4  0.826  4.48
##  2 Mc1     20.1  1.83   9.11
##  3 Qn1     36.1  3.42   9.48
##  4 Mn2     29.9  3.92  13.1 
##  5 Mn3     26.2  3.49  13.3 
##  6 Qc2     36.6  5.14  14.1 
##  7 Qc1     32.6  5.03  15.4 
##  8 Qn2     38.8  6.07  15.7 
##  9 Mn1     29.0  5.70  19.6 
## 10 Qc3     35.5  7.52  21.2 
## 11 Qn3     37.6 10.3   27.5

备注:在 %>% 之后另起一行便于理解代码

ggplot2程序包及其功能简介

在R语言中,主要的绘图系统如下:

1、绘制iris数据集Sepal.Width 关于Sepal.Length的散点图

library(ggplot2)
data("iris")
# Scatter plot of Sepal.Width against Sepal.Length with the black-and-white
# theme.
fig1 <- ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  theme_bw()

fig1

2、在fig1中直接添加密度等高线(geom_density_2d())和一条具有置信带的平滑曲线对数据进行拟合

# Add 2-D density contours and a smoothed fit on top of fig1.
fig2 <- fig1 + geom_density_2d() + geom_smooth()

fig2

3、突出数据中的分组结构:将Species变量分别映射到shape和col图形属性进行展示

# Map Species to the point shape.
fig3 <- ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, shape = Species)
) +
  geom_point() +
  theme_bw()

fig3

# Map Species to the point colour instead.
fig4 <- ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point() +
  theme_bw()

fig4

4、使用facet_wrap()对子图进行分组

# Same coloured scatter plot, split into one panel per Species.
fig5 <- ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  facet_wrap(~Species) +
  geom_point() +
  theme_bw()

fig5

tidyr程序包及其功能简介

我们先创建一个格式不整洁的tibble,然后将其转换成整洁格式。 任务:虚构患者数据,其中包括干预开始后,于第0,3,6个月分别测量的患者的体重指数 (Body Mass Index,BMI). tibble中包含变量:

1、创建格式不整洁的tibble

#library(tibble)
#library(tidyr)
library(tidyverse)
# Wide (untidy) layout: one row per patient and one column per measurement
# month.
Data <- tibble(
  patient = c("A", "B", "C"),
  Month0 = c(21, 17, 29),
  Month3 = c(20, 21, 27),
  Month6 = c(21, 22, 23)
)
Data
## # A tibble: 3 × 4
##   patient Month0 Month3 Month6
##   <chr>    <dbl>  <dbl>  <dbl>
## 1 A           21     20     21
## 2 B           17     21     22
## 3 C           29     27     23

2、利用gather()函数整理数据:将宽数据转化为窄数据

gather()函数用法:gather(data,key,value),参数说明如下 任务目标:

方式一:

# Gather every column except patient into key/value pairs: the old column
# names go to Month and their values go to BMI.
# NOTE(review): tidyr documents gather() as superseded by pivot_longer().
tidyData <- gather(Data, key = Month, value = BMI, -patient)

tidyData
## # A tibble: 9 × 3
##   patient Month    BMI
##   <chr>   <chr>  <dbl>
## 1 A       Month0    21
## 2 B       Month0    17
## 3 C       Month0    29
## 4 A       Month3    20
## 5 B       Month3    21
## 6 C       Month3    27
## 7 A       Month6    21
## 8 B       Month6    22
## 9 C       Month6    23

方式二:

# Same reshape, selecting the columns to gather as the range Month0:Month6.
tidyData1 <- gather(Data, key = Month, value = BMI, Month0:Month6)

tidyData1
## # A tibble: 9 × 3
##   patient Month    BMI
##   <chr>   <chr>  <dbl>
## 1 A       Month0    21
## 2 B       Month0    17
## 3 C       Month0    29
## 4 A       Month3    20
## 5 B       Month3    21
## 6 C       Month3    27
## 7 A       Month6    21
## 8 B       Month6    22
## 9 C       Month6    23

方式三:

# Same reshape, listing the columns to gather explicitly with c().
tidyData2 <- gather(Data, key = Month, value = BMI, c(Month0, Month3, Month6))

tidyData2
## # A tibble: 9 × 3
##   patient Month    BMI
##   <chr>   <chr>  <dbl>
## 1 A       Month0    21
## 2 B       Month0    17
## 3 C       Month0    29
## 4 A       Month3    20
## 5 B       Month3    21
## 6 C       Month3    27
## 7 A       Month6    21
## 8 B       Month6    22
## 9 C       Month6    23

3、利用spread()函数将窄数据转化为宽数据

spread()函数用法与gather()函数相反。spread()函数用来扩展列,可针对之前由gather()创建的key列和value列,将其中一列拆分成多列。

# Spread the long data back to wide form: each distinct Month value becomes a
# column filled with the matching BMI values.
# NOTE(review): tidyr documents spread() as superseded by pivot_wider().
Data1 <- spread(tidyData, key = Month, value = BMI)

Data1
## # A tibble: 3 × 4
##   patient Month0 Month3 Month6
##   <chr>    <dbl>  <dbl>  <dbl>
## 1 A           21     20     21
## 2 B           17     21     22
## 3 C           29     27     23

purrr程序包及其功能简介

重点介绍purrr程序包中最常用函数的重要性。

使用map()函数替换for循环

# Build a named list of three random numeric vectors; the fixed seed makes the
# values reproducible.
set.seed(1234)
listdata <- list(
  a = rnorm(5),
  b = rnorm(6),
  c = rnorm(8)
)
listdata
## $a
## [1] -1.2070657  0.2774292  1.0844412 -2.3456977  0.4291247
## 
## $b
## [1]  0.5060559 -0.5747400 -0.5466319 -0.5644520 -0.8900378 -0.4771927
## 
## $c
## [1] -0.99838644 -0.77625389  0.06445882  0.95949406 -0.11028549 -0.51100951
## [7] -0.91119542 -0.83717168
# Use a for loop to record the length of every list element.
# The result list is preallocated from length(listdata) instead of a
# hard-coded 3, so the loop stays correct if listdata changes size.
Lengths <- vector("list", length = length(listdata))
for (i in seq_along(listdata)) {
  Lengths[[i]] <- length(listdata[[i]])
}
Lengths
## [[1]]
## [1] 5
## 
## [[2]]
## [1] 6
## 
## [[3]]
## [1] 8
# map() applies length() to every element of the list and returns a list.
library(purrr)
map(listdata, length)
## $a
## [1] 5
## 
## $b
## [1] 6
## 
## $c
## [1] 8

使用map_df()返回tibble而非列表

返回向量而非列表

library(purrr)
# The typed map_*() variants return an atomic vector instead of a list.
map_dbl(listdata, length)
## a b c 
## 5 6 8
# map_chr() must be given strings: purrr 1.0.0 deprecated the automatic
# integer-to-character coercion, so convert the length explicitly.
map_chr(listdata, ~ as.character(length(.)))
##   a   b   c 
## "5" "6" "8"
map_int(listdata, length)
## a b c 
## 5 6 8
# map_lgl() is left disabled: length() returns an integer, not a logical.
#map_lgl(listdata,length)
# map_df() combines the per-element results into a single tibble.
map_df(listdata, length)
## # A tibble: 1 × 3
##       a     b     c
##   <int> <int> <int>
## 1     5     6     8
map_df(listdata, mean)
## # A tibble: 1 × 3
##        a      b      c
##    <dbl>  <dbl>  <dbl>
## 1 -0.352 -0.424 -0.390

在map()系列函数使用匿名函数

在程序运行中定义的函数称为匿名函数。R中只要调用function(.)即可定义一个匿名函数, function(.)函数之后的表达式是函数主体。 另外purrr包中还提供了function(.)的简写形式:~

library(purrr)
# Rebuild the list of three random numeric vectors from a fixed seed.
set.seed(1234)
listdata <- list(
  a = rnorm(5),
  b = rnorm(6),
  c = rnorm(8)
)
listdata
## $a
## [1] -1.2070657  0.2774292  1.0844412 -2.3456977  0.4291247
## 
## $b
## [1]  0.5060559 -0.5747400 -0.5466319 -0.5644520 -0.8900378 -0.4771927
## 
## $c
## [1] -0.99838644 -0.77625389  0.06445882  0.95949406 -0.11028549 -0.51100951
## [7] -0.91119542 -0.83717168
# Anonymous function whose single argument is named `.`; it adds 2 to every
# value of each list element.
map(listdata, function(.) . + 2)
## $a
## [1]  0.7929343  2.2774292  3.0844412 -0.3456977  2.4291247
## 
## $b
## [1] 2.506056 1.425260 1.453368 1.435548 1.109962 1.522807
## 
## $c
## [1] 1.001614 1.223746 2.064459 2.959494 1.889715 1.488990 1.088805 1.162828
# The same call written with `~`, the shorthand for function(.).
map(listdata, ~ . + 2)
## $a
## [1]  0.7929343  2.2774292  3.0844412 -0.3456977  2.4291247
## 
## $b
## [1] 2.506056 1.425260 1.453368 1.435548 1.109962 1.522807
## 
## $c
## [1] 1.001614 1.223746 2.064459 2.959494 1.889715 1.488990 1.088805 1.162828