Wk 7 Assignment: Data import with R

title: "Untitled" author: "Suma Pendyala" date: "6/27/2020" output: html_document

Chapter 11 - Data import

Introduction

library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
read_csv("The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3", skip = 2)
## # A tibble: 1 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
#> # A tibble: 1 x 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3

read_csv("# A comment I want to skip
  x,y,z
  1,2,3", comment = "#")
## # A tibble: 1 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
#> # A tibble: 1 x 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
read_csv("1,2,3\n4,5,6", col_names = FALSE)
## # A tibble: 2 x 3
##      X1    X2    X3
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6
#> # A tibble: 2 x 3
#>      X1    X2    X3
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6
read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))
## # A tibble: 2 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6
#> # A tibble: 2 x 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6
read_csv("a,b,c\n1,2,.", na = ".")
## # A tibble: 1 x 3
##       a     b c    
##   <dbl> <dbl> <lgl>
## 1     1     2 NA
#> # A tibble: 1 x 3
#>       a     b c    
#>   <dbl> <dbl> <lgl>
#> 1     1     2 NA

11.2.2 Exercises

1 - What function would you use to read a file where fields were separated with "|"?

read_delim("a|b|c\n1|2|3\n4|5|6", delim = "|")
## # A tibble: 2 x 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6

2 - Apart from file, skip, and comment, what other arguments do read_csv() and read_tsv() have in common?

intersect(
  read_csv %>% args %>% as.list %>% names,
  read_tsv %>% args %>% as.list %>% names
) %>% setdiff(c("file", "skip", "comment"))
##  [1] "col_names"       "col_types"       "locale"          "na"             
##  [5] "quoted_na"       "quote"           "trim_ws"         "n_max"          
##  [9] "guess_max"       "progress"        "skip_empty_rows" ""

3 - What are the most important arguments to read_fwf()?

col_positions
## Error in eval(expr, envir, enclos): object 'col_positions' not found

4 - Sometimes strings in a CSV file contain commas. To prevent them from causing problems they need to be surrounded by a quoting character, like " or '. By default, read_csv() assumes that the quoting character will be ". What argument to read_csv() do you need to specify to read the following text into a data frame?

read_csv("x,y\n1,'a,b'", quote = "'")
## # A tibble: 1 x 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b
read_delim("x,y\n1,'a,b'", delim = ",", quote = "'")
## # A tibble: 1 x 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b
"x,y\n1,'a,b'"
## [1] "x,y\n1,'a,b'"
read_csv("x,y\n1,'a,b'", quote = "'")
## # A tibble: 1 x 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b

5 - Identify what is wrong with each of the following inline CSV files. What happens when you run the code?

read_csv("a,b\n1,2,3\n4,5,6")
## Warning: 2 parsing failures.
## row col  expected    actual         file
##   1  -- 2 columns 3 columns literal data
##   2  -- 2 columns 3 columns literal data
## # A tibble: 2 x 2
##       a     b
##   <dbl> <dbl>
## 1     1     2
## 2     4     5
read_csv("a,b,c\n1,2\n1,2,3,4")
## Warning: 2 parsing failures.
## row col  expected    actual         file
##   1  -- 3 columns 2 columns literal data
##   2  -- 3 columns 4 columns literal data
## # A tibble: 2 x 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2    NA
## 2     1     2     3
read_csv("a,b\n\"1")
## Warning: 2 parsing failures.
## row col                     expected    actual         file
##   1  a  closing quote at end of file           literal data
##   1  -- 2 columns                    1 columns literal data
## # A tibble: 1 x 2
##       a b    
##   <dbl> <chr>
## 1     1 <NA>
read_csv("a,b\n1,2\na,b")
## # A tibble: 2 x 2
##   a     b    
##   <chr> <chr>
## 1 1     2    
## 2 a     b
read_csv("a;b\n1;3")
## # A tibble: 1 x 1
##   `a;b`
##   <chr>
## 1 1;3

11.3 Parsing a vector

str(parse_logical(c("TRUE", "FALSE", "NA")))
##  logi [1:3] TRUE FALSE NA
str(parse_integer(c("1", "2", "3")))
##  int [1:3] 1 2 3
str(parse_date(c("2010-01-01", "1979-10-14")))
##  Date[1:2], format: "2010-01-01" "1979-10-14"
parse_integer(c("1", "231", ".", "456"), na = ".")
## [1]   1 231  NA 456
x <- parse_integer(c("123", "345", "abc", "123.45"))
## Warning: 2 parsing failures.
## row col               expected actual
##   3  -- an integer                abc
##   4  -- no trailing characters    .45
problems(x)
## # A tibble: 2 x 4
##     row   col expected               actual
##   <int> <int> <chr>                  <chr> 
## 1     3    NA an integer             abc   
## 2     4    NA no trailing characters .45

11.3.1 Numbers

parse_double("1.23")
## [1] 1.23
parse_double("1,23", locale = locale(decimal_mark = ","))
## [1] 1.23
parse_number("$100")
## [1] 100
parse_number("20%")
## [1] 20
parse_number("It cost $123.45")
## [1] 123.45

11.3.2 Strings

charToRaw("Hadley")
## [1] 48 61 64 6c 65 79
x1 <- "El Ni\xf1o was particularly bad this year"
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"

x1
## [1] "El Niño was particularly bad this year"
#> [1] "El Ni\xf1o was particularly bad this year"
x2
## [1] "‚±‚ñ‚É‚¿‚Í"
#> [1] "\x82\xb1\x82\xf1\x82??\xbf\x82\xcd"
parse_character(x1, locale = locale(encoding = "Latin1"))
## [1] "El Niño was particularly bad this year"
parse_character(x2, locale = locale(encoding = "Shift-JIS"))
## [1] "<U+3053><U+3093><U+306B><U+3061><U+306F>"
guess_encoding(charToRaw(x1))
## # A tibble: 2 x 2
##   encoding   confidence
##   <chr>           <dbl>
## 1 ISO-8859-1       0.46
## 2 ISO-8859-9       0.23
#> # A tibble: 2 x 2
#>   encoding   confidence
#>   <chr>           <dbl>
#> 1 ISO-8859-1       0.46
#> 2 ISO-8859-9       0.23
guess_encoding(charToRaw(x2))
## # A tibble: 1 x 2
##   encoding confidence
##   <chr>         <dbl>
## 1 KOI8-R         0.42
#> # A tibble: 1 x 2
#>   encoding confidence
#>   <chr>         <dbl>
#> 1 KOI8-R         0.42

11.3.3 Factors

fruit <- c("apple", "banana")
parse_factor(c("apple", "banana", "bananana"), levels = fruit)
## Warning: 1 parsing failure.
## row col           expected   actual
##   3  -- value in level set bananana
## [1] apple  banana <NA>  
## attr(,"problems")
## # A tibble: 1 x 4
##     row   col expected           actual  
##   <int> <int> <chr>              <chr>   
## 1     3    NA value in level set bananana
## Levels: apple banana
#> Warning: 1 parsing failure.
#> row col           expected   actual
#>   3  -- value in level set bananana
#> [1] apple  banana <NA>  
#> attr(,"problems")
#> # A tibble: 1 x 4
#>     row   col expected           actual  
#>   <int> <int> <chr>              <chr>
#> 1     3    NA value in level set bananana
#> Levels: apple banana

11.3.4 Dates, date-times, and times

parse_datetime("2010-10-01T2010")
## [1] "2010-10-01 20:10:00 UTC"
parse_datetime("20101010")
## [1] "2010-10-10 UTC"
parse_date("2010-10-01")
## [1] "2010-10-01"
library(hms)

parse_time("01:10 am")
## 01:10:00
parse_time("20:10:01")
## 20:10:01
parse_date("01/02/15", "%m/%d/%y")
## [1] "2015-01-02"
parse_date("01/02/15", "%d/%m/%y")
## [1] "2015-02-01"
parse_date("01/02/15", "%y/%m/%d")
## [1] "2001-02-15"
parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))
## [1] "2015-01-01"

11.3.5 Exercises

1- What are the most important arguments to locale()?

2- What happens if you try and set decimal_mark and grouping_mark to the same character? What happens to the default value of grouping_mark when you set decimal_mark to ","? What happens to the default value of decimal_mark when you set the grouping_mark to "."?

locale(decimal_mark = ',')
## <locale>
## Numbers:  123.456,78
## Formats:  %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
##         (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
##         June (Jun), July (Jul), August (Aug), September (Sep), October
##         (Oct), November (Nov), December (Dec)
## AM/PM:  AM/PM
read_csv() and read_tsv()
## Error: <text>:1:12: unexpected symbol
## 1: read_csv() and
##                ^
locale(decimal_mark = ',')
## <locale>
## Numbers:  123.456,78
## Formats:  %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
##         (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
##         June (Jun), July (Jul), August (Aug), September (Sep), October
##         (Oct), November (Nov), December (Dec)
## AM/PM:  AM/PM

3- I didn't discuss the date_format and time_format options to locale(). What do they do? Construct an example that shows when they might be useful.

parse_date("01/02/15", locale = locale(date_format = "%d/%m/%y"))
## [1] "2015-02-01"

4- If you live outside the US, create a new locale object that encapsulates the settings for the types of file you read most commonly.

col_positions

5- What's the difference between read_csv() and read_csv2()?

read_csv() is comma delimited. read_csv2()

6- What are the most common encodings used in Europe? What are the most common encodings used in Asia? Do some googling to find out.

7- Generate the correct format string to parse each of the following dates and times:

d1 <- "January 1, 2010"
parse_date(d1, "%B %d, %Y")
## [1] "2010-01-01"
d2 <- "2015-Mar-07"
parse_date(d2, "%Y-%b-%d")
## [1] "2015-03-07"
d3 <- "06-Jun-2017"
parse_date(d3, "%d-%b-%Y")
## [1] "2017-06-06"
d4 <- c("August 19 (2015)", "July 1 (2015)")
parse_date(d4, "%B %d (%Y)")
## [1] "2015-08-19" "2015-07-01"
d5 <- "12/30/14" # Dec 30, 2014
parse_date(d5, "%m/%d/%y")
## [1] "2014-12-30"
t1 <- "1705"
parse_time(t1, "%H%M")
## 17:05:00
t2 <- "11:15:10.12 PM"
parse_time(t2, "%I:%M:%OS %p")
## 23:15:10.12