library(tidyverse)
各个包的作用:
矩形数据:如果每一行的元素个数都等于列数,并且每一列的元素个数都等于行数,那么此类数据称为矩形数据(即规整的"行×列"表格)。但并非所有数据都是这种类型。
tibble程序包引入了一种新的数据结构。
# Create a tibble with an integer column `x` and a character column `y`,
# then print it.
Tib <- tibble(
  x = 1:4,
  y = c("Beijing", "Shanghai", "Guangzhou", "Shenzhen")
)
Tib
## # A tibble: 4 × 2
## x y
## <int> <chr>
## 1 1 Beijing
## 2 2 Shanghai
## 3 3 Guangzhou
## 4 4 Shenzhen
# Create a base data frame holding the same two columns and print it;
# it is converted to a tibble in the next step.
Df <- data.frame(
  x = 1:4,
  y = c("Beijing", "Shanghai", "Guangzhou", "Shenzhen")
)
Df
## x y
## 1 1 Beijing
## 2 2 Shanghai
## 3 3 Guangzhou
## 4 4 Shenzhen
# Convert the data frame to a tibble and print the result.
DftoTib <- Df %>% as_tibble()
DftoTib
## # A tibble: 4 × 2
## x y
## <int> <chr>
## 1 1 Beijing
## 2 2 Shanghai
## 3 3 Guangzhou
## 4 4 Shenzhen
# Two data frames that differ only in how the string column `y` is stored.
# `Dffactor` stores `y` as a factor and `DfNotfactor` stores it as character,
# so each object's name matches its contents (the flags used to be swapped).
Dffactor <- data.frame(
  x = 1:4,
  y = c("Beijing", "Shanghai", "Guangzhou", "Shenzhen"),
  stringsAsFactors = TRUE
)
DfNotfactor <- data.frame(
  x = 1:4,
  y = c("Beijing", "Shanghai", "Guangzhou", "Shenzhen"),
  stringsAsFactors = FALSE
)
# To make `y` a factor column inside a tibble, wrap the c() call in factor().
Tib1 <- tibble(
  x = 1:4,
  y = factor(c("Beijing", "Shanghai", "Guangzhou", "Shenzhen"))
)
Tib1
## # A tibble: 4 × 2
## x y
## <int> <fct>
## 1 1 Beijing
## 2 2 Shanghai
## 3 3 Guangzhou
## 4 4 Shenzhen
# Collect the class of `y` in every object for a side-by-side comparison.
a <- c(
  class(Tib$y), class(Df$y), class(DfNotfactor$y),
  class(Dffactor$y), class(Tib1$y)
)
a
## [1] "character" "character" "character" "factor" "factor"
# Selecting one column with [ , j] returns a plain vector for a data frame ...
Df[, 1]
## [1] 1 2 3 4
# ... but returns a one-column tibble for a tibble.
Tib[, 1]
## # A tibble: 4 × 1
## x
## <int>
## 1 1
## 2 2
## 3 3
## 4 4
# Use [[ ]] or $ to extract a tibble column as a vector.
Tib[[1]]
## [1] 1 2 3 4
Tib$x
## [1] 1 2 3 4
当输出数据框时,所有列都将被输出到计算机屏幕。
当输出tibble时,默认情况下只会输出前10行以及计算机屏幕所能显示的列数,未显示的变量名将在输出结果的底部列出。
# Load the starwars example dataset into the workspace and print it.
data("starwars")
starwars
## # A tibble: 87 × 14
## name height mass hair_…¹ skin_…² eye_c…³ birth…⁴ sex gender homew…⁵
## <chr> <int> <dbl> <chr> <chr> <chr> <dbl> <chr> <chr> <chr>
## 1 Luke Skywa… 172 77 blond fair blue 19 male mascu… Tatooi…
## 2 C-3PO 167 75 <NA> gold yellow 112 none mascu… Tatooi…
## 3 R2-D2 96 32 <NA> white,… red 33 none mascu… Naboo
## 4 Darth Vader 202 136 none white yellow 41.9 male mascu… Tatooi…
## 5 Leia Organa 150 49 brown light brown 19 fema… femin… Aldera…
## 6 Owen Lars 178 120 brown,… light blue 52 male mascu… Tatooi…
## 7 Beru White… 165 75 brown light blue 47 fema… femin… Tatooi…
## 8 R5-D4 97 32 <NA> white,… red NA none mascu… Tatooi…
## 9 Biggs Dark… 183 84 black light brown 24 male mascu… Tatooi…
## 10 Obi-Wan Ke… 182 77 auburn… fair blue-g… 57 male mascu… Stewjon
## # … with 77 more rows, 4 more variables: species <chr>, films <list>,
## # vehicles <list>, starships <list>, and abbreviated variable names
## # ¹hair_color, ²skin_color, ³eye_color, ⁴birth_year, ⁵homeworld
## # ℹ Use `print(n = ...)` to see more rows, and `colnames()` to see all variable names
# Write the dataset to "starwars.csv" in the current working directory.
# NOTE(review): the printed summary lists films/vehicles/starships as <list>
# columns - confirm the installed readr version can write them to a flat file.
write_csv(starwars, "starwars.csv")
依照顺序创建变量:tibble()按书写顺序依次创建各列,因此后面的列可以直接引用前面已创建的列。
# Columns are created in order, so `totalWorth` can be computed from the
# `nItems` and `cost` columns defined just before it.
sequentialTib <- tibble(
  nItems = c(12, 45, 107),
  cost = c(0.5, 1.2, 1.8),
  totalWorth = nItems * cost
)
sequentialTib
## # A tibble: 3 × 3
## nItems cost totalWorth
## <dbl> <dbl> <dbl>
## 1 12 0.5 6
## 2 45 1.2 54
## 3 107 1.8 193.
library(tibble)
# Load the CO2 dataset, convert it to a tibble and print it.
data("CO2")
CO2tib <- CO2 %>% as_tibble()
CO2tib
## # A tibble: 84 × 5
## Plant Type Treatment conc uptake
## <ord> <fct> <fct> <dbl> <dbl>
## 1 Qn1 Quebec nonchilled 95 16
## 2 Qn1 Quebec nonchilled 175 30.4
## 3 Qn1 Quebec nonchilled 250 34.8
## 4 Qn1 Quebec nonchilled 350 37.2
## 5 Qn1 Quebec nonchilled 500 35.3
## 6 Qn1 Quebec nonchilled 675 39.2
## 7 Qn1 Quebec nonchilled 1000 39.7
## 8 Qn2 Quebec nonchilled 95 13.6
## 9 Qn2 Quebec nonchilled 175 27.3
## 10 Qn2 Quebec nonchilled 250 37.1
## # … with 74 more rows
## # ℹ Use `print(n = ...)` to see more rows
# select(): keep columns 1, 2, 3 and 5 (chosen by position).
selectedData <- CO2tib %>% select(1, 2, 3, 5)
selectedData
## # A tibble: 84 × 4
## Plant Type Treatment uptake
## <ord> <fct> <fct> <dbl>
## 1 Qn1 Quebec nonchilled 16
## 2 Qn1 Quebec nonchilled 30.4
## 3 Qn1 Quebec nonchilled 34.8
## 4 Qn1 Quebec nonchilled 37.2
## 5 Qn1 Quebec nonchilled 35.3
## 6 Qn1 Quebec nonchilled 39.2
## 7 Qn1 Quebec nonchilled 39.7
## 8 Qn2 Quebec nonchilled 13.6
## 9 Qn2 Quebec nonchilled 27.3
## 10 Qn2 Quebec nonchilled 37.1
## # … with 74 more rows
## # ℹ Use `print(n = ...)` to see more rows
# filter(): keep only the rows whose uptake is greater than 16.
filteredData <- selectedData %>% filter(uptake > 16)
filteredData
## # A tibble: 66 × 4
## Plant Type Treatment uptake
## <ord> <fct> <fct> <dbl>
## 1 Qn1 Quebec nonchilled 30.4
## 2 Qn1 Quebec nonchilled 34.8
## 3 Qn1 Quebec nonchilled 37.2
## 4 Qn1 Quebec nonchilled 35.3
## 5 Qn1 Quebec nonchilled 39.2
## 6 Qn1 Quebec nonchilled 39.7
## 7 Qn2 Quebec nonchilled 27.3
## 8 Qn2 Quebec nonchilled 37.1
## 9 Qn2 Quebec nonchilled 41.8
## 10 Qn2 Quebec nonchilled 40.6
## # … with 56 more rows
## # ℹ Use `print(n = ...)` to see more rows
# group_by(): group the rows by the Plant variable.
groupData <- filteredData %>% group_by(Plant)
groupData
## # A tibble: 66 × 4
## # Groups: Plant [11]
## Plant Type Treatment uptake
## <ord> <fct> <fct> <dbl>
## 1 Qn1 Quebec nonchilled 30.4
## 2 Qn1 Quebec nonchilled 34.8
## 3 Qn1 Quebec nonchilled 37.2
## 4 Qn1 Quebec nonchilled 35.3
## 5 Qn1 Quebec nonchilled 39.2
## 6 Qn1 Quebec nonchilled 39.7
## 7 Qn2 Quebec nonchilled 27.3
## 8 Qn2 Quebec nonchilled 37.1
## 9 Qn2 Quebec nonchilled 41.8
## 10 Qn2 Quebec nonchilled 40.6
## # … with 56 more rows
## # ℹ Use `print(n = ...)` to see more rows
# ungroup(): remove the grouping structure from the tibble.
Data <- groupData %>% ungroup()
Data
## # A tibble: 66 × 4
## Plant Type Treatment uptake
## <ord> <fct> <fct> <dbl>
## 1 Qn1 Quebec nonchilled 30.4
## 2 Qn1 Quebec nonchilled 34.8
## 3 Qn1 Quebec nonchilled 37.2
## 4 Qn1 Quebec nonchilled 35.3
## 5 Qn1 Quebec nonchilled 39.2
## 6 Qn1 Quebec nonchilled 39.7
## 7 Qn2 Quebec nonchilled 27.3
## 8 Qn2 Quebec nonchilled 37.1
## 9 Qn2 Quebec nonchilled 41.8
## 10 Qn2 Quebec nonchilled 40.6
## # … with 56 more rows
## # ℹ Use `print(n = ...)` to see more rows
# summarize(): compute the mean and standard deviation of uptake per group.
sumData <- groupData %>%
  summarize(
    meanup = mean(uptake),
    sdup = sd(uptake)
  )
sumData
## # A tibble: 11 × 3
## Plant meanup sdup
## <ord> <dbl> <dbl>
## 1 Qn1 36.1 3.42
## 2 Qn2 38.8 6.07
## 3 Qn3 37.6 10.3
## 4 Qc1 32.6 5.03
## 5 Qc3 35.5 7.52
## 6 Qc2 36.6 5.14
## 7 Mn3 26.2 3.49
## 8 Mn2 29.9 3.92
## 9 Mn1 29.0 5.70
## 10 Mc3 18.4 0.826
## 11 Mc1 20.1 1.83
# mutate(): add the per-group coefficient of variation (in percent),
# computed from the summary columns in sumData.
muData <- sumData %>% mutate(CV = (sdup / meanup) * 100)
muData
## # A tibble: 11 × 4
## Plant meanup sdup CV
## <ord> <dbl> <dbl> <dbl>
## 1 Qn1 36.1 3.42 9.48
## 2 Qn2 38.8 6.07 15.7
## 3 Qn3 37.6 10.3 27.5
## 4 Qc1 32.6 5.03 15.4
## 5 Qc3 35.5 7.52 21.2
## 6 Qc2 36.6 5.14 14.1
## 7 Mn3 26.2 3.49 13.3
## 8 Mn2 29.9 3.92 13.1
## 9 Mn1 29.0 5.70 19.6
## 10 Mc3 18.4 0.826 4.48
## 11 Mc1 20.1 1.83 9.11
# arrange(): sort the tibble by CV (ascending order is the default).
arrData <- muData %>% arrange(CV)
arrData
## # A tibble: 11 × 4
## Plant meanup sdup CV
## <ord> <dbl> <dbl> <dbl>
## 1 Mc3 18.4 0.826 4.48
## 2 Mc1 20.1 1.83 9.11
## 3 Qn1 36.1 3.42 9.48
## 4 Mn2 29.9 3.92 13.1
## 5 Mn3 26.2 3.49 13.3
## 6 Qc2 36.6 5.14 14.1
## 7 Qc1 32.6 5.03 15.4
## 8 Qn2 38.8 6.07 15.7
## 9 Mn1 29.0 5.70 19.6
## 10 Qc3 35.5 7.52 21.2
## 11 Qn3 37.6 10.3 27.5
# Sort by CV in descending order by wrapping the column in desc().
arrData1 <- muData %>% arrange(desc(CV))
arrData1
## # A tibble: 11 × 4
## Plant meanup sdup CV
## <ord> <dbl> <dbl> <dbl>
## 1 Qn3 37.6 10.3 27.5
## 2 Qc3 35.5 7.52 21.2
## 3 Mn1 29.0 5.70 19.6
## 4 Qn2 38.8 6.07 15.7
## 5 Qc1 32.6 5.03 15.4
## 6 Qc2 36.6 5.14 14.1
## 7 Mn3 26.2 3.49 13.3
## 8 Mn2 29.9 3.92 13.1
## 9 Qn1 36.1 3.42 9.48
## 10 Mc1 20.1 1.83 9.11
## 11 Mc3 18.4 0.826 4.48
# The whole workflow written as one pipeline: select columns, filter rows,
# group by Plant, summarise, derive CV and sort by it.
# CV is the coefficient of variation in percent, (sd / mean) * 100, the same
# formula used for muData; it was previously divided by 100 by mistake.
arrData2 <- CO2tib %>%                 # start from the CO2 tibble
  select(c(1:3, 5)) %>%                # keep the columns that are needed
  filter(uptake > 16) %>%              # keep the rows meeting the condition
  group_by(Plant) %>%                  # group by Plant
  summarize(meanup = mean(uptake), sdup = sd(uptake)) %>%
  # per-group mean and standard deviation of uptake
  mutate(CV = (sdup / meanup) * 100) %>% # coefficient of variation (percent)
  arrange(CV)                          # sort by CV in ascending order
arrData2 # print the result
## # A tibble: 11 × 4
## Plant meanup sdup CV
## <ord> <dbl> <dbl> <dbl>
## 1 Mc3 18.4 0.826 4.48
## 2 Mc1 20.1 1.83 9.11
## 3 Qn1 36.1 3.42 9.48
## 4 Mn2 29.9 3.92 13.1
## 5 Mn3 26.2 3.49 13.3
## 6 Qc2 36.6 5.14 14.1
## 7 Qc1 32.6 5.03 15.4
## 8 Qn2 38.8 6.07 15.7
## 9 Mn1 29.0 5.70 19.6
## 10 Qc3 35.5 7.52 21.2
## 11 Qn3 37.6 10.3 27.5
备注:在 %>% 之后另起一行便于理解代码
1、绘制iris数据集中Sepal.Width关于Sepal.Length的散点图
library(ggplot2)
# Scatter plot of sepal width against sepal length for the iris data.
data("iris")
fig1 <- ggplot(iris, aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  theme_bw()
fig1
2、在fig1中直接添加密度等高线(geom_density_2d())和一条具有置信带的平滑曲线(geom_smooth())对数据进行拟合
# Add 2-D density contours and a smoothed fit on top of fig1.
fig2 <- fig1 + geom_density_2d() + geom_smooth()
fig2
3、突出数据中的分组结构:将Species变量分别映射到shape(点的形状)和col(颜色)两种图形属性进行展示
# Map Species to the point shape.
fig3 <- ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, shape = Species)
) +
  geom_point() +
  theme_bw()
fig3
# Map Species to the point colour instead.
fig4 <- ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point() +
  theme_bw()
fig4
4、使用facet_wrap()对子图进行分组
# Colour by Species and draw one panel per species with facet_wrap().
fig5 <- ggplot(
  iris,
  aes(x = Sepal.Length, y = Sepal.Width, col = Species)
) +
  geom_point() +
  facet_wrap(~Species) +
  theme_bw()
fig5
1、创建格式不整洁的tibble
#library(tibble)
#library(tidyr)
library(tidyverse)
# An untidy (wide) tibble: one row per patient and one column per
# measurement month.
Data <- tibble(
  patient = c("A", "B", "C"),
  Month0 = c(21, 17, 29),
  Month3 = c(20, 21, 27),
  Month6 = c(21, 22, 23)
)
Data
## # A tibble: 3 × 4
## patient Month0 Month3 Month6
## <chr> <dbl> <dbl> <dbl>
## 1 A 21 20 21
## 2 B 17 21 22
## 3 C 29 27 23
2、利用gather()函数整理数据:将宽数据转化为窄数据
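gather()函数用法:gather(data, key, value, ...),参数说明如下:data为待整理的数据;key为新建的、用于存放原列名的列的名称;value为新建的、用于存放原单元格取值的列的名称;其后的参数用于指定需要收集的列。
任务目标:将Month0、Month3、Month6三列收集为Month和BMI两列。
方式一: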
# Option 1: gather every column except `patient` into Month/BMI pairs.
tidyData <- gather(Data, key = "Month", value = "BMI", -patient)
tidyData
## # A tibble: 9 × 3
## patient Month BMI
## <chr> <chr> <dbl>
## 1 A Month0 21
## 2 B Month0 17
## 3 C Month0 29
## 4 A Month3 20
## 5 B Month3 21
## 6 C Month3 27
## 7 A Month6 21
## 8 B Month6 22
## 9 C Month6 23
方式二:
# Option 2: name the columns to gather as a range.
tidyData1 <- gather(Data, key = "Month", value = "BMI", Month0:Month6)
tidyData1
## # A tibble: 9 × 3
## patient Month BMI
## <chr> <chr> <dbl>
## 1 A Month0 21
## 2 B Month0 17
## 3 C Month0 29
## 4 A Month3 20
## 5 B Month3 21
## 6 C Month3 27
## 7 A Month6 21
## 8 B Month6 22
## 9 C Month6 23
方式三:
# Option 3: list the columns to gather explicitly with c().
tidyData2 <- gather(
  Data,
  key = "Month",
  value = "BMI",
  c(Month0, Month3, Month6)
)
tidyData2
## # A tibble: 9 × 3
## patient Month BMI
## <chr> <chr> <dbl>
## 1 A Month0 21
## 2 B Month0 17
## 3 C Month0 29
## 4 A Month3 20
## 5 B Month3 21
## 6 C Month3 27
## 7 A Month6 21
## 8 B Month6 22
## 9 C Month6 23
3、利用spread()函数将窄数据转化为宽数据
spread()函数的作用与gather()函数相反。spread()函数用来扩展列:针对之前由gather()创建的key列和value列,将key列中的各个取值展开为多个新列,并用value列中对应的值填充。
# Spread the long data back to wide form: one column per Month value,
# filled with the matching BMI.
Data1 <- spread(tidyData, key = Month, value = BMI)
Data1
## # A tibble: 3 × 4
## patient Month0 Month3 Month6
## <chr> <dbl> <dbl> <dbl>
## 1 A 21 20 21
## 2 B 17 21 22
## 3 C 29 27 23
重点介绍purrr程序包中最常用的函数及其重要性。
## 使用map()函数替换for循环
# Build a named list of three random normal vectors of lengths 5, 6 and 8.
# The draws happen in a, b, c order, so the seeded values are reproducible.
set.seed(1234)
listdata <- lapply(c(a = 5, b = 6, c = 8), rnorm)
listdata
## $a
## [1] -1.2070657 0.2774292 1.0844412 -2.3456977 0.4291247
##
## $b
## [1] 0.5060559 -0.5747400 -0.5466319 -0.5644520 -0.8900378 -0.4771927
##
## $c
## [1] -0.99838644 -0.77625389 0.06445882 0.95949406 -0.11028549 -0.51100951
## [7] -0.91119542 -0.83717168
# Count the elements of each list entry with an explicit for loop.
# The result list is preallocated to the length of `listdata` instead of a
# hard-coded 3, so no stray NULL entries remain if the list size changes.
Lengths <- vector("list", length = length(listdata))
for (i in seq_along(listdata)) {
  Lengths[[i]] <- length(listdata[[i]])
}
Lengths
## [[1]]
## [1] 5
##
## [[2]]
## [1] 6
##
## [[3]]
## [1] 8
# Count the elements of each list entry with map(); the result is a list.
library(purrr)
listdata %>% map(length)
## $a
## [1] 5
##
## $b
## [1] 6
##
## $c
## [1] 8
library(purrr)
# The typed variants of map() return an atomic vector instead of a list.
map_dbl(listdata, length)
## a b c
## 5 6 8
# length() returns an integer, so convert it explicitly: purrr 1.0.0
# deprecated map_chr()'s automatic integer-to-character coercion.
map_chr(listdata, ~ as.character(length(.x)))
## a b c
## "5" "6" "8"
map_int(listdata, length)
## a b c
## 5 6 8
# The map_lgl() call is left disabled.
# NOTE(review): presumably because a length is not a logical value - confirm.
#map_lgl(listdata,length)
# map_df() combines the per-element results into a tibble.
map_df(listdata, length)
## # A tibble: 1 × 3
## a b c
## <int> <int> <int>
## 1 5 6 8
map_df(listdata, mean)
## # A tibble: 1 × 3
## a b c
## <dbl> <dbl> <dbl>
## 1 -0.352 -0.424 -0.390
在使用时临时定义、没有名称的函数称为匿名函数。R中只要调用function(.)即可定义一个匿名函数,function(.)之后的表达式是函数主体。另外,purrr包中还提供了function(.)的简写形式:~
library(purrr)
# Recreate the same named list of random normal vectors (lengths 5, 6, 8);
# resetting the seed makes the values reproducible.
set.seed(1234)
listdata <- lapply(c(a = 5, b = 6, c = 8), rnorm)
listdata
## $a
## [1] -1.2070657 0.2774292 1.0844412 -2.3456977 0.4291247
##
## $b
## [1] 0.5060559 -0.5747400 -0.5466319 -0.5644520 -0.8900378 -0.4771927
##
## $c
## [1] -0.99838644 -0.77625389 0.06445882 0.95949406 -0.11028549 -0.51100951
## [7] -0.91119542 -0.83717168
# Anonymous function whose single argument is named `.`: add 2 to every
# element of each vector.
map(listdata, function(.) . + 2)
## $a
## [1] 0.7929343 2.2774292 3.0844412 -0.3456977 2.4291247
##
## $b
## [1] 2.506056 1.425260 1.453368 1.435548 1.109962 1.522807
##
## $c
## [1] 1.001614 1.223746 2.064459 2.959494 1.889715 1.488990 1.088805 1.162828
# The same operation using purrr's `~` shorthand for function(.).
map(listdata, ~ . + 2)
## $a
## [1] 0.7929343 2.2774292 3.0844412 -0.3456977 2.4291247
##
## $b
## [1] 2.506056 1.425260 1.453368 1.435548 1.109962 1.522807
##
## $c
## [1] 1.001614 1.223746 2.064459 2.959494 1.889715 1.488990 1.088805 1.162828