使用tibble实现简单数据框

tibble包

library(tidyverse)
## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.0     √ purrr   0.2.5
## √ tibble  1.4.2     √ dplyr   0.7.8
## √ tidyr   0.8.2     √ stringr 1.3.1
## √ readr   1.3.1     √ forcats 0.3.0
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
# Convert a standard data frame into a tibble
iris %>%
  as_tibble()
## # A tibble: 150 x 5
##    Sepal.Length Sepal.Width Petal.Length Petal.Width Species
##           <dbl>       <dbl>        <dbl>       <dbl> <fct>  
##  1          5.1         3.5          1.4         0.2 setosa 
##  2          4.9         3            1.4         0.2 setosa 
##  3          4.7         3.2          1.3         0.2 setosa 
##  4          4.6         3.1          1.5         0.2 setosa 
##  5          5           3.6          1.4         0.2 setosa 
##  6          5.4         3.9          1.7         0.4 setosa 
##  7          4.6         3.4          1.4         0.3 setosa 
##  8          5           3.4          1.5         0.2 setosa 
##  9          4.4         2.9          1.4         0.2 setosa 
## 10          4.9         3.1          1.5         0.1 setosa 
## # ... with 140 more rows
# tibble() builds a tibble from individual vectors; inputs of length 1
# (here y) are recycled automatically, and z is computed from x and y
tibble(
  x = 1:5,
  y = 2,
  z = x + y
)
## # A tibble: 5 x 3
##       x     y     z
##   <int> <dbl> <dbl>
## 1     1     2     3
## 2     2     2     4
## 3     3     2     5
## 4     4     2     6
## 5     5     2     7
# tibble所做的事情比data.frame少:它不会改变输入的类型(例如不会把字符串转换为因子),不会改变变量名称,也不会创建行名称
# tibble可以使用在R中无效的变量名(非语法名称)作为列名;要引用这类变量,需要用反引号(``)括起来
# 在ggplot2、dplyr等包中使用这些变量时,同样需要用反引号把它们括起来
# Column names that are not syntactic R names must be wrapped in backticks
tb <- tibble(`:)` = "smile", ` ` = "space", `2000` = "number")
print(tb)
## # A tibble: 1 x 3
##   `:)`  ` `   `2000`
##   <chr> <chr> <chr> 
## 1 smile space number
# Another way to create a tibble is tribble(): column headings are written
# as formulas (starting with ~) and the data is entered row by row
tribble(
  ~x,  ~y, ~z,
  #--|---|-----
  "a", 2,  3.6,
  "b", 1,  8.6
)
## # A tibble: 2 x 3
##   x         y     z
##   <chr> <dbl> <dbl>
## 1 a         2   3.6
## 2 b         1   8.6
# Tibbles have a refined print method: only the first 10 rows are shown,
# and only as many columns as fit on screen
tibble(
  a = lubridate::now() + runif(1000) * 86400,
  b = lubridate::today() + runif(1000) * 30,
  c = 1:1000,
  d = runif(1000), # 1000 draws from the uniform distribution on [0, 1]
  e = sample(letters, 1000, replace = TRUE)
)
## # A tibble: 1,000 x 5
##    a                   b              c      d e    
##    <dttm>              <date>     <int>  <dbl> <chr>
##  1 2019-01-25 04:14:02 2019-02-19     1 0.926  o    
##  2 2019-01-25 07:06:59 2019-02-10     2 0.0620 i    
##  3 2019-01-25 15:39:16 2019-01-30     3 0.977  d    
##  4 2019-01-24 18:49:18 2019-01-31     4 0.844  m    
##  5 2019-01-24 20:33:47 2019-02-15     5 0.449  z    
##  6 2019-01-25 00:27:47 2019-01-31     6 0.647  m    
##  7 2019-01-24 20:30:40 2019-02-13     7 0.635  d    
##  8 2019-01-24 21:15:19 2019-02-19     8 0.302  s    
##  9 2019-01-25 17:34:17 2019-02-09     9 0.993  q    
## 10 2019-01-25 16:32:21 2019-02-17    10 0.0666 z    
## # ... with 990 more rows
# print() controls the number of rows shown (n) and the display width;
# width = Inf prints every column
print(nycflights13::flights, n = 10, width = Inf)
## # A tibble: 336,776 x 19
##     year month   day dep_time sched_dep_time dep_delay arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>
##  1  2013     1     1      517            515         2      830
##  2  2013     1     1      533            529         4      850
##  3  2013     1     1      542            540         2      923
##  4  2013     1     1      544            545        -1     1004
##  5  2013     1     1      554            600        -6      812
##  6  2013     1     1      554            558        -4      740
##  7  2013     1     1      555            600        -5      913
##  8  2013     1     1      557            600        -3      709
##  9  2013     1     1      557            600        -3      838
## 10  2013     1     1      558            600        -2      753
##    sched_arr_time arr_delay carrier flight tailnum origin dest  air_time
##             <int>     <dbl> <chr>    <int> <chr>   <chr>  <chr>    <dbl>
##  1            819        11 UA        1545 N14228  EWR    IAH        227
##  2            830        20 UA        1714 N24211  LGA    IAH        227
##  3            850        33 AA        1141 N619AA  JFK    MIA        160
##  4           1022       -18 B6         725 N804JB  JFK    BQN        183
##  5            837       -25 DL         461 N668DN  LGA    ATL        116
##  6            728        12 UA        1696 N39463  EWR    ORD        150
##  7            854        19 B6         507 N516JB  EWR    FLL        158
##  8            723       -14 EV        5708 N829AS  LGA    IAD         53
##  9            846        -8 B6          79 N593JB  JFK    MCO        140
## 10            745         8 AA         301 N3ALAA  LGA    ORD        138
##    distance  hour minute time_hour          
##       <dbl> <dbl>  <dbl> <dttm>             
##  1     1400     5     15 2013-01-01 05:00:00
##  2     1416     5     29 2013-01-01 05:00:00
##  3     1089     5     40 2013-01-01 05:00:00
##  4     1576     5     45 2013-01-01 05:00:00
##  5      762     6      0 2013-01-01 06:00:00
##  6      719     5     58 2013-01-01 05:00:00
##  7     1065     6      0 2013-01-01 06:00:00
##  8      229     6      0 2013-01-01 06:00:00
##  9      944     6      0 2013-01-01 06:00:00
## 10      733     6      0 2013-01-01 06:00:00
## # ... with 3.368e+05 more rows
# Browse the complete data set with View()
nycflights13::flights %>% 
  View()

# Subsetting with $ and [[ ]]: first build a small tibble to extract from
df <- tibble(x = runif(5), y = rnorm(5))
print(df)
## # A tibble: 5 x 2
##         x      y
##     <dbl>  <dbl>
## 1 0.00340 -0.774
## 2 0.242    1.19 
## 3 0.171   -0.454
## 4 0.984    1.26 
## 5 0.773    0.747
# Extract a single column as a vector
df$x
## [1] 0.003400322 0.241965603 0.170880569 0.983685581 0.772722888
df[["x"]]
## [1] 0.003400322 0.241965603 0.170880569 0.983685581 0.772722888
df[[1]]#extract by position
## [1] 0.003400322 0.241965603 0.170880569 0.983685581 0.772722888
df %>% .$x #inside a pipe, use the special placeholder . (a single dot)
## [1] 0.003400322 0.241965603 0.170880569 0.983685581 0.772722888
# Some older functions do not work with tibbles; convert back to a data.frame
class(as.data.frame(tb))
## [1] "data.frame"

使用readr进行数据导入

library(tidyverse)
# Read a local CSV file into a tibble
diamond <- read_csv(file = "diamonds.csv")
## Parsed with column specification:
## cols(
##   carat = col_double(),
##   cut = col_character(),
##   color = col_character(),
##   clarity = col_character(),
##   depth = col_double(),
##   table = col_double(),
##   price = col_double(),
##   x = col_double(),
##   y = col_double(),
##   z = col_double()
## )
# Preview the first 10 rows with every column, then draw box plots of
# price by clarity, coloured by cut
print(diamond, n = 10, width = Inf)
ggplot(diamond, aes(x = clarity, y = price, colour = cut)) +
  geom_boxplot()
## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <chr>     <chr> <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # ... with 5.393e+04 more rows

# Faceting: preview the data, then draw one box-plot panel per cut,
# arranged in two rows
print(diamond, n = 10, width = Inf)
ggplot(diamond, aes(x = clarity, y = price)) +
  geom_boxplot() +
  facet_wrap(~cut, nrow = 2)
## # A tibble: 53,940 x 10
##    carat cut       color clarity depth table price     x     y     z
##    <dbl> <chr>     <chr> <chr>   <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
##  1 0.23  Ideal     E     SI2      61.5    55   326  3.95  3.98  2.43
##  2 0.21  Premium   E     SI1      59.8    61   326  3.89  3.84  2.31
##  3 0.23  Good      E     VS1      56.9    65   327  4.05  4.07  2.31
##  4 0.290 Premium   I     VS2      62.4    58   334  4.2   4.23  2.63
##  5 0.31  Good      J     SI2      63.3    58   335  4.34  4.35  2.75
##  6 0.24  Very Good J     VVS2     62.8    57   336  3.94  3.96  2.48
##  7 0.24  Very Good I     VVS1     62.3    57   336  3.95  3.98  2.47
##  8 0.26  Very Good H     SI1      61.9    55   337  4.07  4.11  2.53
##  9 0.22  Fair      E     VS2      65.1    61   337  3.87  3.78  2.49
## 10 0.23  Very Good H     VS1      59.4    61   338  4     4.05  2.39
## # ... with 5.393e+04 more rows

解析一些read_csv的参数

skip=n 跳过前n行

comment=“#”,丢弃所有以#开头的行

col_names=FALSE 表示数据没有列名(不要把第一行当作列标题)

“\n”(反斜杠加n)用于换行

na:设定哪些值应被当作缺失值处理,例如 na = "." 表示将点(.)视为缺失值

为什么不用R基础包中的read.csv呢?

速度快

可以生成tibble,且不会将字符向量转换为因子

解析向量:parse_*()函数族

# parse_logical(): character vector -> logical vector
str(parse_logical(c("TRUE", "FALSE", "NA")))
##  logi [1:3] TRUE FALSE NA
#>  logi [1:3] TRUE FALSE NA
# parse_integer(): character vector -> integer vector
str(parse_integer(c("1", "2", "3")))
##  int [1:3] 1 2 3
#>  int [1:3] 1 2 3
# parse_date(): character vector -> Date vector
str(parse_date(c("2010-01-01", "1979-10-14")))
##  Date[1:2], format: "2010-01-01" "1979-10-14"
#>  Date[1:2], format: "2010-01-01" "1979-10-14"

# Values that fail to parse come back as missing (NA), with a warning
x <- parse_integer(
  c("123", "345", "abc", "123.45")
)
## Warning: 2 parsing failures.
## row col               expected actual
##   3  -- an integer                abc
##   4  -- no trailing characters    .45
x
## [1] 123 345  NA  NA
## attr(,"problems")
## # A tibble: 2 x 4
##     row   col expected               actual
##   <int> <int> <chr>                  <chr> 
## 1     3    NA an integer             abc   
## 2     4    NA no trailing characters .45
problems(x)#retrieve the complete set of parsing failures
## # A tibble: 2 x 4
##     row   col expected               actual
##   <int> <int> <chr>                  <chr> 
## 1     3    NA an integer             abc   
## 2     4    NA no trailing characters .45

重要的解析函数

parse_logical() parse_integer() 分别解析逻辑值和整数

parse_double严格数值型解析函数 parse_number灵活数值型解析函数

parse_character 字符编码很重要

parse_factor 可创建因子,R使用这种数据结构表示分类变量

parse_datetime parse_date, parse_time 解析日期 时间

数值

# parse_double() is the strict numeric parser
parse_double("1.23")
## [1] 1.23
# Supply a new locale object; decimal_mark overrides the default "."
parse_double("1,23", locale = locale(decimal_mark = ","))
## [1] 1.23
# parse_number() ignores non-numeric characters before and after the number
parse_number("$100")
## [1] 100
parse_number("20%")
## [1] 20
parse_number("It cost $123.45")
## [1] 123.45
# parse_number() also ignores grouping marks
parse_number("$123,456,789")
## [1] 123456789
parse_number("123.456.789", locale = locale(grouping_mark = "."))
## [1] 123456789
parse_number("123'456'789", locale = locale(grouping_mark = "'"))
## [1] 123456789

字符串

# charToRaw() shows the underlying representation of a string: each
# hexadecimal number is one byte, and ASCII maps bytes to English characters
charToRaw("Hadley")
## [1] 48 61 64 6c 65 79
# guess_encoding() helps find out which encoding is in use
# parse_character() lets you specify the encoding

因子

# Known levels; any value outside this set is reported as a parsing failure
fruit <- c("apple", "banana")
c("apple", "banana", "bananana") %>%
  parse_factor(levels = fruit)
## Warning: 1 parsing failure.
## row col           expected   actual
##   3  -- value in level set bananana
## [1] apple  banana <NA>  
## attr(,"problems")
## # A tibble: 1 x 4
##     row   col expected           actual  
##   <int> <int> <chr>              <chr>   
## 1     3    NA value in level set bananana
## Levels: apple banana

日期,日期与时间, 时间

# parse_datetime() expects an ISO 8601 date-time (an international standard)
parse_datetime("2010-10-01T2010")
## [1] "2010-10-01 20:10:00 UTC"
parse_datetime("20101010")
## [1] "2010-10-10 UTC"
# parse_date() expects a four-digit year, then month and day, separated by - or /
parse_date("2010-10-01")
## [1] "2010-10-01"
# parse_time() expects hour:minutes, optionally :seconds, and an optional
# am/pm specifier
library(hms)
parse_time("01:10 am")
## 01:10:00
#> 01:10:00
parse_time("20:10:01")
## 20:10:01
#> 20:10:01

# When the defaults do not fit your data, supply your own format string.
# The same input parses to three different dates depending on the format.
parse_date("01/02/15", format = "%m/%d/%y")
## [1] "2015-01-02"
#> [1] "2015-01-02"
parse_date("01/02/15", format = "%d/%m/%y")
## [1] "2015-02-01"
#> [1] "2015-02-01"
parse_date("01/02/15", format = "%y/%m/%d")
## [1] "2001-02-15"
#> [1] "2001-02-15"
Year
%Y (4 digits).
%y (2 digits); 00-69 -> 2000-2069, 70-99 -> 1970-1999.
Month
%m (2 digits).
%b (abbreviated name, like “Jan”).
%B (full name, “January”).
Day
%d (2 digits).
%e (optional leading space).
Time
%H 0-23 hour.
%I 0-12, must be used with %p.
%p AM/PM indicator.
%M minutes.
%S integer seconds.
%OS real seconds.
%Z Time zone (as name, e.g. America/Chicago). Beware of abbreviations: if you’re American, note that “EST” is a Canadian time zone that does not have daylight savings time. It is not Eastern Standard Time! We’ll come back to this in time zones.
%z (as offset from UTC, e.g. +0800).
Non-digits
%. skips one non-digit character.
%* skips any number of non-digits.

# How readr parses a file
## readr uses a heuristic to work out the type of each column
## guess_parser() returns readr's best guess for a character vector
# Examples
guess_parser("2010-10-01")
## [1] "date"
#> [1] "date"
guess_parser("15:01")
## [1] "time"
#> [1] "time"
guess_parser(c("TRUE", "FALSE"))
## [1] "logical"
#> [1] "logical"
guess_parser(c("1", "5", "9"))
## [1] "double"
#> [1] "double"
guess_parser(c("12,352,561"))
## [1] "number"
#> [1] "number"

# parse_guess() parses the vector using that guess
str(parse_guess("2010-10-10"))
##  Date[1:1], format: "2010-10-10"

默认设置可能出现的问题及其解决:1. 前1000行不能代表所有行;2. 列中有大量缺失值

# Read the challenge.csv sample file located via readr_example()
challenge <- readr_example("challenge.csv") %>%
  read_csv()
## Parsed with column specification:
## cols(
##   x = col_double(),
##   y = col_logical()
## )
## Warning: 1000 parsing failures.
##  row col           expected     actual                                               file
## 1001   y 1/0/T/F/TRUE/FALSE 2015-01-16 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1002   y 1/0/T/F/TRUE/FALSE 2018-05-18 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1003   y 1/0/T/F/TRUE/FALSE 2015-09-05 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1004   y 1/0/T/F/TRUE/FALSE 2012-11-28 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1005   y 1/0/T/F/TRUE/FALSE 2020-01-13 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## .... ... .................. .......... ..................................................
## See problems(...) for more details.
# readr_example() looks up the path of a file shipped inside the readr package;
# problems() lists the parsing failures recorded for challenge
problems(challenge)
## # A tibble: 1,000 x 5
##      row col   expected        actual    file                              
##    <int> <chr> <chr>           <chr>     <chr>                             
##  1  1001 y     1/0/T/F/TRUE/F~ 2015-01-~ 'D:/R/R-3.5.2/library/readr/extda~
##  2  1002 y     1/0/T/F/TRUE/F~ 2018-05-~ 'D:/R/R-3.5.2/library/readr/extda~
##  3  1003 y     1/0/T/F/TRUE/F~ 2015-09-~ 'D:/R/R-3.5.2/library/readr/extda~
##  4  1004 y     1/0/T/F/TRUE/F~ 2012-11-~ 'D:/R/R-3.5.2/library/readr/extda~
##  5  1005 y     1/0/T/F/TRUE/F~ 2020-01-~ 'D:/R/R-3.5.2/library/readr/extda~
##  6  1006 y     1/0/T/F/TRUE/F~ 2016-04-~ 'D:/R/R-3.5.2/library/readr/extda~
##  7  1007 y     1/0/T/F/TRUE/F~ 2011-05-~ 'D:/R/R-3.5.2/library/readr/extda~
##  8  1008 y     1/0/T/F/TRUE/F~ 2020-07-~ 'D:/R/R-3.5.2/library/readr/extda~
##  9  1009 y     1/0/T/F/TRUE/F~ 2011-04-~ 'D:/R/R-3.5.2/library/readr/extda~
## 10  1010 y     1/0/T/F/TRUE/F~ 2010-05-~ 'D:/R/R-3.5.2/library/readr/extda~
## # ... with 990 more rows
# Based on what problems() reported, spell out the column types so that
# readr knows how to load the data (column y holds dates)
challenge <- read_csv(
  readr_example("challenge.csv"),
  col_types = cols(x = col_double(), y = col_date())
)
tail(challenge) # the last six rows
## # A tibble: 6 x 2
##       x y         
##   <dbl> <date>    
## 1 0.805 2019-11-21
## 2 0.164 2018-03-29
## 3 0.472 2014-08-04
## 4 0.718 2015-08-16
## 5 0.270 2020-02-04
## 6 0.608 2019-01-06
# Diagnostic strategy: read every column as a character vector first,
# which makes reading problems easy to spot
challenge2 <- read_csv(
  readr_example("challenge.csv"),
  col_types = cols(.default = col_character())
)
tail(challenge2)
## # A tibble: 6 x 2
##   x                   y         
##   <chr>               <chr>     
## 1 0.805274312151596   2019-11-21
## 2 0.1635163405444473  2018-03-29
## 3 0.47193897631950676 2014-08-04
## 4 0.7183186465408653  2015-08-16
## 5 0.26987858884967864 2020-02-04
## 6 0.608237189007923   2019-01-06
# readr's type_convert() applies readr's parsing heuristics to the
# character columns of an existing data frame
df <- tribble(
  ~x,  ~y,
  "1", "1.21",
  "2", "2.32",
  "3", "4.56"
)
df
## # A tibble: 3 x 2
##   x     y    
##   <chr> <chr>
## 1 1     1.21 
## 2 2     2.32 
## 3 3     4.56
# Use readr::type_convert(), not base R's type.convert(): the base function
# follows different guessing rules (it produced an integer column here) and
# warns on recent R versions when as.is is not supplied
type_convert(df)
## Parsed with column specification:
## cols(
##   x = col_double(),
##   y = col_double()
## )
## # A tibble: 3 x 2
##       x     y
##   <dbl> <dbl>
## 1     1  1.21
## 2     2  2.32
## 3     3  4.56

写入文件

# Write the tibble to a CSV file, then read that file back in
write_csv(challenge,"challenge.csv")
read_csv("challenge.csv")
## Parsed with column specification:
## cols(
##   x = col_double(),
##   y = col_logical()
## )
## Warning: 1000 parsing failures.
##  row col           expected     actual            file
## 1001   y 1/0/T/F/TRUE/FALSE 2015-01-16 'challenge.csv'
## 1002   y 1/0/T/F/TRUE/FALSE 2018-05-18 'challenge.csv'
## 1003   y 1/0/T/F/TRUE/FALSE 2015-09-05 'challenge.csv'
## 1004   y 1/0/T/F/TRUE/FALSE 2012-11-28 'challenge.csv'
## 1005   y 1/0/T/F/TRUE/FALSE 2020-01-13 'challenge.csv'
## .... ... .................. .......... ...............
## See problems(...) for more details.
## # A tibble: 2,000 x 2
##        x y    
##    <dbl> <lgl>
##  1   404 NA   
##  2  4172 NA   
##  3  3004 NA   
##  4   787 NA   
##  5    37 NA   
##  6  2332 NA   
##  7  2489 NA   
##  8  1449 NA   
##  9  3665 NA   
## 10  3863 NA   
## # ... with 1,990 more rows
# The catch: type information is lost when saving to CSV, so the data has to
# be re-parsed every time it is read
# For caching interim results use write_rds() and read_rds(), which wrap the
# base functions saveRDS() and readRDS()
write_rds(challenge,"challenge.rds")
read_rds("challenge.rds")#the column types are still preserved after reading
## # A tibble: 2,000 x 2
##        x y         
##    <dbl> <date>    
##  1   404 NA        
##  2  4172 NA        
##  3  3004 NA        
##  4   787 NA        
##  5    37 NA        
##  6  2332 NA        
##  7  2489 NA        
##  8  1449 NA        
##  9  3665 NA        
## 10  3863 NA        
## # ... with 1,990 more rows
# The feather package implements a fast binary file format that can be
# shared across programming languages
library(feather)
write_feather(challenge, "challenge.feather")
read_feather("challenge.feather")
## # A tibble: 2,000 x 2
##        x y         
##    <dbl> <date>    
##  1   404 NA        
##  2  4172 NA        
##  3  3004 NA        
##  4   787 NA        
##  5    37 NA        
##  6  2332 NA        
##  7  2489 NA        
##  8  1449 NA        
##  9  3665 NA        
## 10  3863 NA        
## # ... with 1,990 more rows
# feather比RDS格式更快,且可在R之外使用