library(readr)
## Warning: package 'readr' was built under R version 4.2.3
#read_delim(file, delim = "|")
# I would use the function read_delim() with the argument delim = "|".
# List the argument names that read_csv() and read_tsv() have in common.
Reduce(
  intersect,
  lapply(list(read_csv, read_tsv), function(reader) names(formals(reader)))
)
## [1] "file" "col_names" "col_types" "col_select"
## [5] "id" "locale" "na" "quoted_na"
## [9] "quote" "comment" "trim_ws" "skip"
## [13] "n_max" "guess_max" "name_repair" "num_threads"
## [17] "progress" "show_col_types" "skip_empty_rows" "lazy"
# The arguments listed above are common to both read_csv() and read_tsv().
# Two header columns but three values per row. The inline CSV text is wrapped
# in I() so readr always treats it as literal data rather than as a file path.
read_csv(I("a,b\n1,2,3\n4,5,6"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 2 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): a
## num (1): b
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 x 2
## a b
## <dbl> <dbl>
## 1 1 23
## 2 4 56
# The header defines two columns (a and b), but each row has three values, so the extra value is merged into the last column, b.
# Three header columns, one short row and one long row. I() marks the string
# explicitly as literal data instead of relying on readr detecting a newline.
read_csv(I("a,b,c\n1,2\n1,2,3,4"))
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 2 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): a, b
## num (1): c
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 x 3
## a b c
## <dbl> <dbl> <dbl>
## 1 1 2 NA
## 2 1 2 34
# The header defines three columns, but the first row has only two values, so column c is NA there; the second row has four values, so the last two are merged into column c.
# The data row opens a double quote that is never closed. I() marks the
# string explicitly as literal data rather than a file path.
read_csv(I("a,b\n\"1"))
## Rows: 0 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): a, b
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 0 x 2
## # ... with 2 variables: a <chr>, b <chr>
# The header defines two columns, but the only data row opens a quote that is never closed, so readr returns no data rows here.
# Each column mixes a number with a letter. I() marks the string explicitly
# as literal data rather than a file path.
read_csv(I("a,b\n1,2\na,b"))
## Rows: 2 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): a, b
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2 x 2
## a b
## <chr> <chr>
## 1 1 2
## 2 a b
# Two columns (a and b) are created; because each one holds both a number and a letter, both columns are read as character.
# Semicolon-separated data passed to the comma-delimited reader. I() marks
# the string explicitly as literal data rather than a file path.
read_csv(I("a;b\n1;3"))
## Rows: 1 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): a;b
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 1 x 1
## `a;b`
## <chr>
## 1 1;3
# read_csv() splits on ",", so the ";"-separated values all land in a single column named "a;b" instead of 1 in a and 3 in b; to split them, use "," in the data or read it with read_csv2() or read_delim(delim = ";").
#locale(decimal_mark = ".", grouping_mark = ".")
# if we set decimal_mark and grouping mark to the same character, it throws an error saying they have to be different
locale(decimal_mark = ",")
## <locale>
## Numbers: 123.456,78
## Formats: %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
## (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
## June (Jun), July (Jul), August (Aug), September (Sep), October
## (Oct), November (Nov), December (Dec)
## AM/PM: AM/PM
# When we set decimal_mark to ",", the grouping mark is set to a period.
locale(grouping_mark = ".")
## <locale>
## Numbers: 123.456,78
## Formats: %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days: Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
## (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
## June (Jun), July (Jul), August (Aug), September (Sep), October
## (Oct), November (Nov), December (Dec)
## AM/PM: AM/PM
# when we set grouping_mark to ".", the decimal mark is set to a comma.
# Sample date strings, each written in a different textual layout.
d1 <- "January 1, 2010"
d2 <- "2015-Mar-07"
d3 <- "06-Jun-2017"
d4 <- c(
  "August 19 (2015)",
  "July 1 (2015)"
)
d5 <- "12/30/14" # meant to be read as 30 December 2014

# Sample time strings: a compact hour-minute value and a 12-hour clock value
# with fractional seconds.
t1 <- "1705"
t2 <- "11:15:10.12 PM"
# Parse each sample string with an explicit format specification that
# matches its layout; the captured console output follows each call.
parse_date(d1, format = "%B %d, %Y")
## [1] "2010-01-01"
parse_date(d2, format = "%Y-%b-%d")
## [1] "2015-03-07"
parse_date(d3, format = "%d-%b-%Y")
## [1] "2017-06-06"
parse_date(d4, format = "%B %d (%Y)")
## [1] "2015-08-19" "2015-07-01"
parse_date(d5, format = "%m/%d/%y")
## [1] "2014-12-30"
parse_time(t1, format = "%H%M")
## 17:05:00
parse_time(t2, format = "%H:%M:%OS %p")
## 23:15:10.12
# Ask readr which parser it would choose for a few sample character vectors.
guess_parser("2010-10-01")
## [1] "date"
guess_parser("15:01")
## [1] "time"
guess_parser(c("TRUE", "FALSE"))
## [1] "logical"
guess_parser(c("1", "5", "9"))
## [1] "double"
guess_parser("12,352,561")
## [1] "number"
# parse_guess() applies the guessed parser; str() shows the resulting type.
str(parse_guess("2010-10-10"))
## Date[1:1], format: "2010-10-10"
# readr guesses the type of each column from a sample of its values (up to 1000 rows by default). In these examples the first call returns a date, the second a time, the third a logical, the fourth a double, the fifth a number, and the last one parses to a Date, which all makes sense.
# Load readr's example file challenge.csv and let readr guess the column
# types.
challenge <- read_csv(file = readr_example("challenge.csv"))
## Rows: 2000 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): x
## date (1): y
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(challenge)
## # A tibble: 6 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
# Re-read the example file, declaring both column types up front instead of
# relying on readr's guesses.
challenge <- read_csv(
  file = readr_example("challenge.csv"),
  col_types = cols(
    x = col_double(),
    y = col_date()
  )
)
tail(challenge)
## # A tibble: 6 x 2
## x y
## <dbl> <date>
## 1 0.805 2019-11-21
## 2 0.164 2018-03-29
## 3 0.472 2014-08-04
## 4 0.718 2015-08-16
## 5 0.270 2020-02-04
## 6 0.608 2019-01-06
# The first rows of y are all NA, so older versions of readr (which guessed from the first 1000 rows only) classified y as logical; passing col_date() in col_types makes the column a date regardless of what the guesser would pick.
# Same file again, this time allowing readr to use up to 1001 rows when
# guessing the column types.
challenge2 <- read_csv(
  file = readr_example("challenge.csv"),
  guess_max = 1001
)
## Rows: 2000 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): x
## date (1): y
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
challenge2
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# Among other strategies, we can make readr guess from more than 1000 rows by passing a larger value to guess_max.
# Read every column as character by making col_character() the default
# column type.
challenge3 <- read_csv(
  file = readr_example("challenge.csv"),
  col_types = cols(.default = col_character())
)
challenge3
## # A tibble: 2,000 x 2
## x y
## <chr> <chr>
## 1 404 <NA>
## 2 4172 <NA>
## 3 3004 <NA>
## 4 787 <NA>
## 5 37 <NA>
## 6 2332 <NA>
## 7 2489 <NA>
## 8 1449 <NA>
## 9 3665 <NA>
## 10 3863 <NA>
## # ... with 1,990 more rows
# Here we force readr to read both columns as character.
library(tibble)
## Warning: package 'tibble' was built under R version 4.2.3
# Build a small tibble column by column; both columns hold numbers stored as
# character strings.
df <- tibble(
  x = c("1", "2", "3"),
  y = c("1.21", "2.32", "4.56")
)
df
## # A tibble: 3 x 2
## x y
## <chr> <chr>
## 1 1 1.21
## 2 2 2.32
## 3 3 4.56
type_convert(df)
##
## -- Column specification --------------------------------------------------------
## cols(
## x = col_double(),
## y = col_double()
## )
## # A tibble: 3 x 2
## x y
## <dbl> <dbl>
## 1 1 1.21
## 2 2 2.32
## 3 3 4.56
# Here we first built a tibble whose columns are all character, then used type_convert() to re-parse them, which turns both columns into doubles.
# Save the challenge data to "challenge.csv" (a relative path), then print
# the data.
write_csv(challenge, file = "challenge.csv")
challenge
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# write_csv() lets us save the challenge dataset to the path given as its second argument.
# Round-trip the data through a CSV file: write it out, then read it back.
write_csv(challenge, file = "challenge-2.csv")
read_csv(file = "challenge-2.csv")
## Rows: 2000 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): x
## date (1): y
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# A CSV file does not store column types, so they have to be guessed again on every read; write_rds() and read_rds() store and restore the R object together with its column types.
# Round-trip the data through an RDS file: write it out, then read it back.
write_rds(challenge, file = "challenge.rds")
read_rds(file = "challenge.rds")
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
library(feather)
## Warning: package 'feather' was built under R version 4.2.3
# Round-trip the data through a feather file: write it out, then read it back.
write_feather(challenge, path = "challenge.feather")
read_feather(path = "challenge.feather")
## # A tibble: 2,000 x 2
## x y
## <dbl> <date>
## 1 404 NA
## 2 4172 NA
## 3 3004 NA
## 4 787 NA
## 5 37 NA
## 6 2332 NA
## 7 2489 NA
## 8 1449 NA
## 9 3665 NA
## 10 3863 NA
## # ... with 1,990 more rows
# Another alternative is the feather package, which uses a fast binary file format that can be shared across programming languages; it is also faster when dealing with significantly larger files.