Week 7 - CRN140 Assignment

library(readr)

## Warning: package 'readr' was built under R version 4.2.3

# read_delim(file, delim = "|")
# I would use the function read_delim() with the argument delim = "|".

# Argument names that read_csv() and read_tsv() have in common.
Reduce(
  intersect,
  lapply(list(read_csv, read_tsv), function(reader) names(formals(reader)))
)

##  [1] "file"            "col_names"       "col_types"       "col_select"     
##  [5] "id"              "locale"          "na"              "quoted_na"      
##  [9] "quote"           "comment"         "trim_ws"         "skip"           
## [13] "n_max"           "guess_max"       "name_repair"     "num_threads"    
## [17] "progress"        "show_col_types"  "skip_empty_rows" "lazy"

# The arguments listed above are common to both read_csv() and read_tsv().

# Inline CSV text: the header row names two columns, but each data row
# contains three fields.
read_csv("a,b\n1,2,3\n4,5,6")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 2 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (1): a
## num (1): b
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## # A tibble: 2 x 2
##       a     b
##   <dbl> <dbl>
## 1     1    23
## 2     4    56

# The header defines only two columns (a and b), but each row has three values, so readr combines the last two values into column b (e.g. 2 and 3 become 23).
# Inline CSV text: three header columns, followed by one row with two fields
# and one row with four fields.
read_csv("a,b,c\n1,2\n1,2,3,4")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 2 Columns: 3
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl (2): a, b
## num (1): c
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## # A tibble: 2 x 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2    NA
## 2     1     2    34

# Three columns are specified in the header, but the first row has only two values, so column c is NA there; the second row has four values, so the last two are combined in column c.
# Inline CSV text: the data line opens a double quote that is never closed.
read_csv("a,b\n\"1")

## Rows: 0 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): a, b
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## # A tibble: 0 x 2
## # ... with 2 variables: a <chr>, b <chr>

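# Two columns are identified in the header, but the value 1 is preceded by an opening quote that is never closed, so the row is dropped and the result has zero rows.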
# Inline CSV text: one row of digits followed by one row of letters.
read_csv("a,b\n1,2\na,b")

## Rows: 2 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (2): a, b
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## # A tibble: 2 x 2
##   a     b    
##   <chr> <chr>
## 1 1     2    
## 2 a     b

# Two columns, a and b, are created; they contain both the numbers 1 and 2 and the letters a and b, so both columns are parsed as character.
# Inline text that uses ";" between fields, passed to the comma-delimited
# reader.
read_csv("a;b\n1;3")

## Rows: 1 Columns: 1
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (1): a;b
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## # A tibble: 1 x 1
##   `a;b`
##   <chr>
## 1 1;3

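# The values are separated by ";", so read_csv() (which expects ",") puts everything into a single column named `a;b` instead of putting 1 in a and 3 in b. To fix this, use "," as the separator, or read the data with read_csv2() or read_delim(delim = ";").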

#locale(decimal_mark = ".", grouping_mark = ".")
# if we set decimal_mark and grouping mark to the same character, it throws an error saying they have to be different
# Build and print a locale in which only the decimal mark is overridden.
locale(decimal_mark = ",")

## <locale>
## Numbers:  123.456,78
## Formats:  %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
##         (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
##         June (Jun), July (Jul), August (Aug), September (Sep), October
##         (Oct), November (Nov), December (Dec)
## AM/PM:  AM/PM

# When we set decimal_mark to ",", the grouping mark is set to a period.
# Build and print a locale in which only the grouping mark is overridden.
locale(grouping_mark = ".")

## <locale>
## Numbers:  123.456,78
## Formats:  %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
##         (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
##         June (Jun), July (Jul), August (Aug), September (Sep), October
##         (Oct), November (Nov), December (Dec)
## AM/PM:  AM/PM

# when we set grouping_mark to ".", the decimal mark is set to a comma.

# Sample strings for the parsing exercises: d1-d5 hold dates in different
# layouts, t1-t2 hold times.
d1 <- "January 1, 2010"
d2 <- "2015-Mar-07"
d3 <- "06-Jun-2017"
# d4 is a vector of two dates with the year in parentheses.
d4 <- c("August 19 (2015)", "July 1 (2015)")
d5 <- "12/30/14" # Dec 30, 2014
t1 <- "1705"
t2 <- "11:15:10.12 PM"

# Parse each sample string with an explicit format specification.
parse_date(d1, format = "%B %d, %Y")

## [1] "2010-01-01"

parse_date(d2, format = "%Y-%b-%d")

## [1] "2015-03-07"

parse_date(d3, format = "%d-%b-%Y")

## [1] "2017-06-06"

# d4 is a vector, so both elements are parsed with the same format.
parse_date(d4, format = "%B %d (%Y)")

## [1] "2015-08-19" "2015-07-01"

# %y is the two-digit year.
parse_date(d5, format = "%m/%d/%y")

## [1] "2014-12-30"

parse_time(t1, format = "%H%M")

## 17:05:00

# t2 carries an AM/PM marker, so the hour is read as a 12-hour value with %I;
# readr's documentation says to use %I (not %H) together with %p.
parse_time(t2, format = "%I:%M:%OS %p")

## 23:15:10.12

# Ask readr which parser it would choose for a few representative inputs.
guess_parser("2010-10-01")

## [1] "date"

guess_parser("15:01")

## [1] "time"

guess_parser(c("TRUE", "FALSE"))

## [1] "logical"

guess_parser(c("1", "5", "9"))

## [1] "double"

guess_parser("12,352,561")

## [1] "number"

# parse_guess() applies the guessed parser; str() shows the resulting type.
str(parse_guess("2010-10-10"))

##  Date[1:1], format: "2010-10-10"

# readr reads the first 1000 rows to guess the type of each column. Here we go through some examples: the first call returns date, the second time, the third logical, the fourth double, the fifth number, and the last one parses the string as a Date, which all makes sense.

# Read the challenge.csv example file located via readr_example(), letting
# readr guess the column types (no col_types supplied).
challenge <- read_csv(readr_example("challenge.csv"))

## Rows: 2000 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl  (1): x
## date (1): y
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Show the first rows of the imported data.
head(challenge)

## # A tibble: 6 x 2
##       x y     
##   <dbl> <date>
## 1   404 NA    
## 2  4172 NA    
## 3  3004 NA    
## 4   787 NA    
## 5    37 NA    
## 6  2332 NA

# Re-read the example file with the column types spelled out, using readr's
# one-letter type abbreviations ("d" = double, "D" = date), then inspect the
# last rows.
challenge <- read_csv(
  readr_example("challenge.csv"),
  col_types = cols(x = "d", y = "D")
)
tail(challenge)

## # A tibble: 6 x 2
##       x y         
##   <dbl> <date>    
## 1 0.805 2019-11-21
## 2 0.164 2018-03-29
## 3 0.472 2014-08-04
## 4 0.718 2015-08-16
## 5 0.270 2020-02-04
## 6 0.608 2019-01-06

# Here the issue is that the first observations in y are all NA, so readr can end up classifying the column as logical; when loading the file we can force it to be a date using col_date().

# Re-read the file with guess_max raised to 1001, so type guessing may look
# at more rows.
challenge2 <- read_csv(readr_example("challenge.csv"), guess_max = 1001)

## Rows: 2000 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl  (1): x
## date (1): y
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print the tibble that was read with the larger guess_max.
challenge2

## # A tibble: 2,000 x 2
##        x y     
##    <dbl> <date>
##  1   404 NA    
##  2  4172 NA    
##  3  3004 NA    
##  4   787 NA    
##  5    37 NA    
##  6  2332 NA    
##  7  2489 NA    
##  8  1449 NA    
##  9  3665 NA    
## 10  3863 NA    
## # ... with 1,990 more rows

# Among other strategies, we can force readr to use more than 1000 observations for guessing by assigning a larger value to guess_max.
# Read every column as character by setting the default column type
# ("c" = character), then print the result.
challenge3 <- read_csv(
  readr_example("challenge.csv"),
  col_types = cols(.default = "c")
)
challenge3

## # A tibble: 2,000 x 2
##    x     y    
##    <chr> <chr>
##  1 404   <NA> 
##  2 4172  <NA> 
##  3 3004  <NA> 
##  4 787   <NA> 
##  5 37    <NA> 
##  6 2332  <NA> 
##  7 2489  <NA> 
##  8 1449  <NA> 
##  9 3665  <NA> 
## 10 3863  <NA> 
## # ... with 1,990 more rows

# Here we are forcing readr to read both columns as character.
library(tibble)

## Warning: package 'tibble' was built under R version 4.2.3

# Small example tibble whose two columns are both stored as character strings.
df <- tibble(
  x = c("1", "2", "3"),
  y = c("1.21", "2.32", "4.56")
)
df

## # A tibble: 3 x 2
##   x     y    
##   <chr> <chr>
## 1 1     1.21 
## 2 2     2.32 
## 3 3     4.56

# Let readr re-parse the character columns of df and convert them to the
# guessed types.
type_convert(df)

## 
## -- Column specification --------------------------------------------------------
## cols(
##   x = col_double(),
##   y = col_double()
## )

## # A tibble: 3 x 2
##       x     y
##   <dbl> <dbl>
## 1     1  1.21
## 2     2  2.32
## 3     3  4.56

# Here we first created a dataset with tribble() whose columns are character, then we used type_convert() to parse them so that they are read as double.

# Save challenge as a CSV file (relative path, so it lands in the current
# working directory), then print the tibble.
write_csv(challenge, "challenge.csv")
challenge

## # A tibble: 2,000 x 2
##        x y     
##    <dbl> <date>
##  1   404 NA    
##  2  4172 NA    
##  3  3004 NA    
##  4   787 NA    
##  5    37 NA    
##  6  2332 NA    
##  7  2489 NA    
##  8  1449 NA    
##  9  3665 NA    
## 10  3863 NA    
## # ... with 1,990 more rows

# write_csv() enables us to save the dataset challenge to the path given as the second argument.
# Round-trip through CSV: write challenge out, then read the file back with
# column types guessed again.
write_csv(challenge, "challenge-2.csv")
read_csv("challenge-2.csv")

## Rows: 2000 Columns: 2
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## dbl  (1): x
## date (1): y
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

## # A tibble: 2,000 x 2
##        x y     
##    <dbl> <date>
##  1   404 NA    
##  2  4172 NA    
##  3  3004 NA    
##  4   787 NA    
##  5    37 NA    
##  6  2332 NA    
##  7  2489 NA    
##  8  1449 NA    
##  9  3665 NA    
## 10  3863 NA    
## # ... with 1,990 more rows

# When we save to CSV, the column type information gets lost; we can instead use write_rds() and read_rds(), which preserve the data type information.
# Round-trip through an RDS file: write challenge out, then read it back.
write_rds(challenge, "challenge.rds")
read_rds("challenge.rds")

## # A tibble: 2,000 x 2
##        x y     
##    <dbl> <date>
##  1   404 NA    
##  2  4172 NA    
##  3  3004 NA    
##  4   787 NA    
##  5    37 NA    
##  6  2332 NA    
##  7  2489 NA    
##  8  1449 NA    
##  9  3665 NA    
## 10  3863 NA    
## # ... with 1,990 more rows

library(feather)

## Warning: package 'feather' was built under R version 4.2.3

# Round-trip through a feather file: write challenge out, then read it back.
write_feather(challenge, "challenge.feather")
read_feather("challenge.feather")

## # A tibble: 2,000 x 2
##        x y     
##    <dbl> <date>
##  1   404 NA    
##  2  4172 NA    
##  3  3004 NA    
##  4   787 NA    
##  5    37 NA    
##  6  2332 NA    
##  7  2489 NA    
##  8  1449 NA    
##  9  3665 NA    
## 10  3863 NA    
## # ... with 1,990 more rows

# Another alternative is the feather package, which uses a fast binary file format that can be shared across multiple programming languages; it is also faster when we are dealing with significantly larger files.

Week 7 - CRN140 Assignment

Mehdi Alaoui

2023-05-06