使用tibble实现简单数据框
tibble包
library(tidyverse)
## -- Attaching packages -------------------------------------------------- tidyverse 1.2.1 --
## √ ggplot2 3.1.0 √ purrr 0.2.5
## √ tibble 1.4.2 √ dplyr 0.7.8
## √ tidyr 0.8.2 √ stringr 1.3.1
## √ readr 1.3.1 √ forcats 0.3.0
## -- Conflicts ----------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Convert a standard data frame (the built-in iris data set) into a tibble.
iris %>%
  as_tibble()
## # A tibble: 150 x 5
## Sepal.Length Sepal.Width Petal.Length Petal.Width Species
## <dbl> <dbl> <dbl> <dbl> <fct>
## 1 5.1 3.5 1.4 0.2 setosa
## 2 4.9 3 1.4 0.2 setosa
## 3 4.7 3.2 1.3 0.2 setosa
## 4 4.6 3.1 1.5 0.2 setosa
## 5 5 3.6 1.4 0.2 setosa
## 6 5.4 3.9 1.7 0.4 setosa
## 7 4.6 3.4 1.4 0.3 setosa
## 8 5 3.4 1.5 0.2 setosa
## 9 4.4 2.9 1.4 0.2 setosa
## 10 4.9 3.1 1.5 0.1 setosa
## # ... with 140 more rows
# tibble() builds a tibble from individual vectors. Columns are created in
# order, so z can refer to x and y. Only length-1 inputs are recycled: y = 2
# is repeated for each of the five rows.
tibble(x = 1:5, y = 2, z = x + y)
## # A tibble: 5 x 3
## x y z
## <int> <dbl> <dbl>
## 1 1 2 3
## 2 2 2 4
## 3 3 2 5
## 4 4 2 6
## 5 5 2 7
# tibble()做的事情比data.frame()少:不会改变输入的类型,不会改变变量名称,也不会创建行名称
# tibble可以使用在R中无效的变量名(非语法名称)作为列名;要引用这种变量,需要用反引号(``)括起来
# 如果要在ggplot2,dplyr中使用这些变量,同样使用反引号括起这些变量
# Backticks allow non-syntactic column names: ":)", a single space and "2000".
tb <- tibble(`:)` = "smile", ` ` = "space", `2000` = "number")
print(tb)
## # A tibble: 1 x 3
## `:)` ` ` `2000`
## <chr> <chr> <chr>
## 1 smile space number
# tribble() is another way to build a tibble: column headers are written as
# formulas (they start with ~) and the values are laid out row by row.
tribble(
  ~x,  ~y, ~z,
  # -- | -- | ----
  "a", 2,  3.6,
  "b", 1,  8.6
)
## # A tibble: 2 x 3
## x y z
## <chr> <dbl> <dbl>
## 1 a 2 3.6
## 2 b 1 8.6
# Tibble printing is tuned for large data: only the first 10 rows are shown,
# together with the columns that fit on screen.
tibble(
  a = lubridate::now() + runif(1e3) * 86400,
  b = lubridate::today() + runif(1e3) * 30,
  c = 1:1e3,
  d = runif(1e3), # 1,000 draws from the uniform distribution on [0, 1]
  e = sample(letters, 1e3, replace = TRUE)
)
## # A tibble: 1,000 x 5
## a b c d e
## <dttm> <date> <int> <dbl> <chr>
## 1 2019-01-25 04:14:02 2019-02-19 1 0.926 o
## 2 2019-01-25 07:06:59 2019-02-10 2 0.0620 i
## 3 2019-01-25 15:39:16 2019-01-30 3 0.977 d
## 4 2019-01-24 18:49:18 2019-01-31 4 0.844 m
## 5 2019-01-24 20:33:47 2019-02-15 5 0.449 z
## 6 2019-01-25 00:27:47 2019-01-31 6 0.647 m
## 7 2019-01-24 20:30:40 2019-02-13 7 0.635 d
## 8 2019-01-24 21:15:19 2019-02-19 8 0.302 s
## 9 2019-01-25 17:34:17 2019-02-09 9 0.993 q
## 10 2019-01-25 16:32:21 2019-02-17 10 0.0666 z
## # ... with 990 more rows
# print() controls how a tibble is displayed: n sets the number of rows and
# width the display width; width = Inf prints every column.
print(nycflights13::flights, n = 10, width = Inf)
## # A tibble: 336,776 x 19
## year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <dbl> <int>
## 1 2013 1 1 517 515 2 830
## 2 2013 1 1 533 529 4 850
## 3 2013 1 1 542 540 2 923
## 4 2013 1 1 544 545 -1 1004
## 5 2013 1 1 554 600 -6 812
## 6 2013 1 1 554 558 -4 740
## 7 2013 1 1 555 600 -5 913
## 8 2013 1 1 557 600 -3 709
## 9 2013 1 1 557 600 -3 838
## 10 2013 1 1 558 600 -2 753
## sched_arr_time arr_delay carrier flight tailnum origin dest air_time
## <int> <dbl> <chr> <int> <chr> <chr> <chr> <dbl>
## 1 819 11 UA 1545 N14228 EWR IAH 227
## 2 830 20 UA 1714 N24211 LGA IAH 227
## 3 850 33 AA 1141 N619AA JFK MIA 160
## 4 1022 -18 B6 725 N804JB JFK BQN 183
## 5 837 -25 DL 461 N668DN LGA ATL 116
## 6 728 12 UA 1696 N39463 EWR ORD 150
## 7 854 19 B6 507 N516JB EWR FLL 158
## 8 723 -14 EV 5708 N829AS LGA IAD 53
## 9 846 -8 B6 79 N593JB JFK MCO 140
## 10 745 8 AA 301 N3ALAA LGA ORD 138
## distance hour minute time_hour
## <dbl> <dbl> <dbl> <dttm>
## 1 1400 5 15 2013-01-01 05:00:00
## 2 1416 5 29 2013-01-01 05:00:00
## 3 1089 5 40 2013-01-01 05:00:00
## 4 1576 5 45 2013-01-01 05:00:00
## 5 762 6 0 2013-01-01 06:00:00
## 6 719 5 58 2013-01-01 05:00:00
## 7 1065 6 0 2013-01-01 06:00:00
## 8 229 6 0 2013-01-01 06:00:00
## 9 944 6 0 2013-01-01 06:00:00
## 10 733 6 0 2013-01-01 06:00:00
## # ... with 3.368e+05 more rows
# Browse the whole data set in the data viewer. View() needs an interactive
# session (or at least a display), so skip it when the script is run in batch.
if (interactive()) {
  nycflights13::flights %>%
    View()
}
# Small tibble used below to demonstrate subsetting with $ and [[ ]].
df <- tibble(x = runif(5), y = rnorm(5))
print(df)
## # A tibble: 5 x 2
## x y
## <dbl> <dbl>
## 1 0.00340 -0.774
## 2 0.242 1.19
## 3 0.171 -0.454
## 4 0.984 1.26
## 5 0.773 0.747
# Extract a single column as a plain vector.
# By name, with $:
df$x
## [1] 0.003400322 0.241965603 0.170880569 0.983685581 0.772722888
# By name, with [[ ]]:
df[["x"]]
## [1] 0.003400322 0.241965603 0.170880569 0.983685581 0.772722888
# By position:
df[[1]]
## [1] 0.003400322 0.241965603 0.170880569 0.983685581 0.772722888
df %>% .$x # inside a pipe, use the special placeholder . (a single dot)
## [1] 0.003400322 0.241965603 0.170880569 0.983685581 0.772722888
# Some older functions do not support tibbles; as.data.frame() converts a
# tibble back to a plain data frame.
tb %>%
  as.data.frame() %>%
  class()
## [1] "data.frame"
使用readr进行数据导入
library(tidyverse)
# Read a local CSV file (expected in the working directory) into a tibble.
diamond <- read_csv(file = "diamonds.csv")
## Parsed with column specification:
## cols(
## carat = col_double(),
## cut = col_character(),
## color = col_character(),
## clarity = col_character(),
## depth = col_double(),
## table = col_double(),
## price = col_double(),
## x = col_double(),
## y = col_double(),
## z = col_double()
## )
# Preview the first 10 rows, then draw price by clarity, coloured by cut.
# print() returns its input invisibly, so its result can be used as the data.
ggplot(
  data = print(diamond, n = 10, width = Inf),
  mapping = aes(x = clarity, y = price, color = cut)
) +
  geom_boxplot()
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # ... with 5.393e+04 more rows

# Same preview and boxplot, but with one facet per cut, arranged in two rows.
ggplot(
  data = print(diamond, n = 10, width = Inf),
  mapping = aes(x = clarity, y = price)
) +
  geom_boxplot() +
  facet_wrap(facets = ~cut, nrow = 2)
## # A tibble: 53,940 x 10
## carat cut color clarity depth table price x y z
## <dbl> <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.290 Premium I VS2 62.4 58 334 4.2 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
## 7 0.24 Very Good I VVS1 62.3 57 336 3.95 3.98 2.47
## 8 0.26 Very Good H SI1 61.9 55 337 4.07 4.11 2.53
## 9 0.22 Fair E VS2 65.1 61 337 3.87 3.78 2.49
## 10 0.23 Very Good H VS1 59.4 61 338 4 4.05 2.39
## # ... with 5.393e+04 more rows

解析一些read_csv的参数
skip=n 跳过前n行
col_names=FALSE 数据没有列名(不把第一行当作列标题)
"\n" 反斜杠加n,用于换行
na 设定把哪些值当作缺失值处理,例如 na = "." 表示把点(.)当作缺失值
为什么不用R基础包中的read.csv呢?
速度快
可以生成tibble,不会将字符向量转换为因子
解析向量,parse_*()函数族
# Each parse_*() function takes a character vector and returns a typed vector.
c("TRUE", "FALSE", "NA") %>%
  parse_logical() %>%
  str()
## logi [1:3] TRUE FALSE NA
#> logi [1:3] TRUE FALSE NA
c("1", "2", "3") %>%
  parse_integer() %>%
  str()
## int [1:3] 1 2 3
#> int [1:3] 1 2 3
c("2010-01-01", "1979-10-14") %>%
  parse_date() %>%
  str()
## Date[1:2], format: "2010-01-01" "1979-10-14"
#> Date[1:2], format: "2010-01-01" "1979-10-14"
# When parsing fails, a warning is raised and the failed values become NA.
x <- c("123", "345", "abc", "123.45") %>%
  parse_integer()
## Warning: 2 parsing failures.
## row col expected actual
## 3 -- an integer abc
## 4 -- no trailing characters .45
print(x)
## [1] 123 345 NA NA
## attr(,"problems")
## # A tibble: 2 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA an integer abc
## 2 4 NA no trailing characters .45
# problems() returns the complete set of parsing failures as a tibble.
x %>%
  problems()
## # A tibble: 2 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA an integer abc
## 2 4 NA no trailing characters .45
重要的解析函数
parse_logical() parse_integer() 分别解析逻辑值和整数
parse_double严格数值型解析函数 parse_number灵活数值型解析函数
parse_character 字符编码很重要
parse_factor 可创建因子,R使用这种数据结构表示分类变量
parse_datetime parse_date, parse_time 解析日期 时间
数值
"1.23" %>%
  parse_double()
## [1] 1.23
# Build a locale whose decimal_mark overrides the default ".".
parse_double(
  "1,23",
  locale = locale(decimal_mark = ",")
)
## [1] 1.23
# parse_number() ignores non-numeric characters before and after the number.
"$100" %>%
  parse_number()
## [1] 100
"20%" %>%
  parse_number()
## [1] 20
"It cost $123.45" %>%
  parse_number()
## [1] 123.45
# parse_number() also ignores the grouping mark.
"$123,456,789" %>%
  parse_number()
## [1] 123456789
parse_number(
  "123.456.789",
  locale = locale(grouping_mark = ".")
)
## [1] 123456789
parse_number(
  "123'456'789",
  locale = locale(grouping_mark = "'")
)
## [1] 123456789
字符串
# charToRaw() shows the underlying bytes of a string; each hexadecimal number
# below is the ASCII code of one English character.
charToRaw(x = "Hadley")
## [1] 48 61 64 6c 65 79
# guess_encoding()函数 用于推测字符串的编码方式
# parse_character设定编码方式
因子
# Values outside the known level set ("bananana") trigger a parsing warning.
fruit <- c("apple", "banana")
c("apple", "banana", "bananana") %>%
  parse_factor(levels = fruit)
## Warning: 1 parsing failure.
## row col expected actual
## 3 -- value in level set bananana
## [1] apple banana <NA>
## attr(,"problems")
## # A tibble: 1 x 4
## row col expected actual
## <int> <int> <chr> <chr>
## 1 3 NA value in level set bananana
## Levels: apple banana
日期,日期与时间, 时间
# parse_datetime() expects an ISO 8601 date-time (an international standard).
"2010-10-01T2010" %>%
  parse_datetime()
## [1] "2010-10-01 20:10:00 UTC"
"20101010" %>%
  parse_datetime()
## [1] "2010-10-10 UTC"
# parse_date() expects a four-digit year, then month and day, separated by
# "-" or "/".
"2010-10-01" %>%
  parse_date()
## [1] "2010-10-01"
# parse_time() expects hours, ":", minutes, optionally ":" and seconds, and
# an optional am/pm specifier.
library(hms)
"01:10 am" %>%
  parse_time()
## 01:10:00
#> 01:10:00
"20:10:01" %>%
  parse_time()
## 20:10:01
#> 20:10:01
# If the defaults do not fit the data, supply an explicit format string.
# The same text parses to three different dates depending on the format.
parse_date("01/02/15", format = "%m/%d/%y")
## [1] "2015-01-02"
#> [1] "2015-01-02"
parse_date("01/02/15", format = "%d/%m/%y")
## [1] "2015-02-01"
#> [1] "2015-02-01"
parse_date("01/02/15", format = "%y/%m/%d")
## [1] "2001-02-15"
#> [1] "2001-02-15"
Year
%Y (4 digits).
%y (2 digits); 00-69 -> 2000-2069, 70-99 -> 1970-1999.
Month
%m (2 digits).
%b (abbreviated name, like “Jan”).
%B (full name, “January”).
Day
%d (2 digits).
%e (optional leading space).
Time
%H 0-23 hour.
%I 0-12, must be used with %p.
%p AM/PM indicator.
%M minutes.
%S integer seconds.
%OS real seconds.
%Z Time zone (as name, e.g. America/Chicago). Beware of abbreviations: if you’re American, note that “EST” is a Canadian time zone that does not have daylight savings time. It is not Eastern Standard Time! We’ll come back to this in the section on time zones.
%z (as offset from UTC, e.g. +0800).
Non-digits
%. skips one non-digit character.
%* skips any number of non-digits.
# readr解析文件
## readr以一种启发式过程来确定每列的类型
## 先用guess_parser()函数返回readr最可信的猜测
# 示例
# guess_parser() reports which parser readr would pick for the given strings.
"2010-10-01" %>%
  guess_parser()
## [1] "date"
#> [1] "date"
"15:01" %>%
  guess_parser()
## [1] "time"
#> [1] "time"
c("TRUE", "FALSE") %>%
  guess_parser()
## [1] "logical"
#> [1] "logical"
c("1", "5", "9") %>%
  guess_parser()
## [1] "double"
#> [1] "double"
c("12,352,561") %>%
  guess_parser()
## [1] "number"
#> [1] "number"
# parse_guess() applies the guessed parser to the input.
"2010-10-10" %>%
  parse_guess() %>%
  str()
## Date[1:1], format: "2010-10-10"
默认设置的问题解决 1. 前1000行不能代表所有行,2列中有大量缺失值
# Read readr's bundled challenge.csv with the default column-type guessing.
challenge <- readr_example("challenge.csv") %>%
  read_csv()
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_logical()
## )
## Warning: 1000 parsing failures.
## row col expected actual file
## 1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## 1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 'D:/R/R-3.5.2/library/readr/extdata/challenge.csv'
## .... ... .................. .......... ..................................................
## See problems(...) for more details.
# readr_example() returns the path of a file shipped inside the readr package.
# problems() lists every parsing failure recorded for the import above.
challenge %>%
  problems()
## # A tibble: 1,000 x 5
## row col expected actual file
## <int> <chr> <chr> <chr> <chr>
## 1 1001 y 1/0/T/F/TRUE/F~ 2015-01-~ 'D:/R/R-3.5.2/library/readr/extda~
## 2 1002 y 1/0/T/F/TRUE/F~ 2018-05-~ 'D:/R/R-3.5.2/library/readr/extda~
## 3 1003 y 1/0/T/F/TRUE/F~ 2015-09-~ 'D:/R/R-3.5.2/library/readr/extda~
## 4 1004 y 1/0/T/F/TRUE/F~ 2012-11-~ 'D:/R/R-3.5.2/library/readr/extda~
## 5 1005 y 1/0/T/F/TRUE/F~ 2020-01-~ 'D:/R/R-3.5.2/library/readr/extda~
## 6 1006 y 1/0/T/F/TRUE/F~ 2016-04-~ 'D:/R/R-3.5.2/library/readr/extda~
## 7 1007 y 1/0/T/F/TRUE/F~ 2011-05-~ 'D:/R/R-3.5.2/library/readr/extda~
## 8 1008 y 1/0/T/F/TRUE/F~ 2020-07-~ 'D:/R/R-3.5.2/library/readr/extda~
## 9 1009 y 1/0/T/F/TRUE/F~ 2011-04-~ 'D:/R/R-3.5.2/library/readr/extda~
## 10 1010 y 1/0/T/F/TRUE/F~ 2010-05-~ 'D:/R/R-3.5.2/library/readr/extda~
## # ... with 990 more rows
# Fix the column types based on what problems() reported: tell readr
# explicitly how each column should be parsed.
challenge <- read_csv(
  file = readr_example("challenge.csv"),
  col_types = cols(x = col_double(), y = col_date())
)
# Show the last six rows.
challenge %>%
  tail()
## # A tibble: 6 x 2
## x y
## <dbl> <date>
## 1 0.805 2019-11-21
## 2 0.164 2018-03-29
## 3 0.472 2014-08-04
## 4 0.718 2015-08-16
## 5 0.270 2020-02-04
## 6 0.608 2019-01-06
# Diagnostic strategy: read every column as a character vector first, which
# makes it easier to see where parsing goes wrong.
challenge2 <- read_csv(
  file = readr_example("challenge.csv"),
  col_types = cols(.default = col_character())
)
challenge2 %>%
  tail()
## # A tibble: 6 x 2
## x y
## <chr> <chr>
## 1 0.805274312151596 2019-11-21
## 2 0.1635163405444473 2018-03-29
## 3 0.47193897631950676 2014-08-04
## 4 0.7183186465408653 2015-08-16
## 5 0.26987858884967864 2020-02-04
## 6 0.608237189007923 2019-01-06
# type_convert函数启动启发式解析
# A tibble whose columns are all character vectors holding numeric text.
df <- tribble(
  ~x,  ~y,
  "1", "1.21",
  "2", "2.32",
  "3", "4.56"
)
print(df)
## # A tibble: 3 x 2
## x y
## <chr> <chr>
## 1 1 1.21
## 2 2 2.32
## 3 3 4.56
# Use readr's type_convert() (underscore), not base R's utils::type.convert():
# it re-runs readr's parsing heuristics on the character columns of a data
# frame. readr guesses double (not integer) for whole-number text, so both
# columns come back as <dbl>.
type_convert(df)
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_double()
## )
## # A tibble: 3 x 2
## x y
## <dbl> <dbl>
## 1 1 1.21
## 2 2 2.32
## 3 3 4.56
写入文件
# Write the tibble to a CSV file in the working directory, then read it back
# with the default column guessing.
challenge %>%
  write_csv("challenge.csv")
read_csv(file = "challenge.csv")
## Parsed with column specification:
## cols(
## x = col_double(),
## y = col_logical()
## )
## Warning: 1000 parsing failures.
## row col expected actual file
## 1001 y 1/0/T/F/TRUE/FALSE 2015-01-16 'challenge.csv'
## 1002 y 1/0/T/F/TRUE/FALSE 2018-05-18 'challenge.csv'
## 1003 y 1/0/T/F/TRUE/FALSE 2015-09-05 'challenge.csv'
## 1004 y 1/0/T/F/TRUE/FALSE 2012-11-28 'challenge.csv'
## 1005 y 1/0/T/F/TRUE/FALSE 2020-01-13 'challenge.csv'
## .... ... .................. .......... ...............
## See problems(...) for more details.
## # A tibble: 2,000 x 2
## x y
## <dbl> <lgl>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# The catch: column type information is lost once the data is saved as CSV,
# so it has to be re-parsed on every read.
# For caching interim results use write_rds() / read_rds(), which wrap the
# base functions saveRDS() / readRDS().
challenge %>%
  write_rds("challenge.rds")
read_rds("challenge.rds") # the column types are still intact after reading
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# The feather package implements a fast binary file format that can be shared
# across programming languages.
library(feather)
challenge %>%
  write_feather("challenge.feather")
read_feather(path = "challenge.feather")
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# feather比RDS格式更快,且可在R之外使用