---
title: "Untitled"
author: "Suma Pendyala"
date: "6/27/2020"
output: html_document
---

library(tidyverse)

## -- Attaching packages ------------------------------------------------------------------------------- tidyverse 1.3.0 --

## v ggplot2 3.3.1     v purrr   0.3.4
## v tibble  3.0.1     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts ---------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

read_csv("The first line of metadata
  The second line of metadata
  x,y,z
  1,2,3", skip = 2)

## # A tibble: 1 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3

#> # A tibble: 1 x 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3

read_csv("# A comment I want to skip
  x,y,z
  1,2,3", comment = "#")

## # A tibble: 1 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3

#> # A tibble: 1 x 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3

read_csv("1,2,3\n4,5,6", col_names = FALSE)

## # A tibble: 2 x 3
##      X1    X2    X3
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6

#> # A tibble: 2 x 3
#>      X1    X2    X3
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6

read_csv("1,2,3\n4,5,6", col_names = c("x", "y", "z"))

## # A tibble: 2 x 3
##       x     y     z
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6

#> # A tibble: 2 x 3
#>       x     y     z
#>   <dbl> <dbl> <dbl>
#> 1     1     2     3
#> 2     4     5     6

read_csv("a,b,c\n1,2,.", na = ".")

## # A tibble: 1 x 3
##       a     b c    
##   <dbl> <dbl> <lgl>
## 1     1     2 NA

#> # A tibble: 1 x 3
#>       a     b c    
#>   <dbl> <dbl> <lgl>
#> 1     1     2 NA

1 - What function would you use to read a file where fields were separated with "|"?

read_delim("a|b|c\n1|2|3\n4|5|6", delim = "|")

## # A tibble: 2 x 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2     3
## 2     4     5     6

2 - Apart from `file`, `skip`, and `comment`, what other arguments do `read_csv()` and `read_tsv()` have in common?

# List the arguments shared by read_csv() and read_tsv(), excluding the three
# named in the question. formals() returns only the argument list; the previous
# args() + as.list() form also picked up the function body, which showed up as
# a spurious "" entry in the result.
intersect(
  names(formals(read_csv)),
  names(formals(read_tsv))
) %>% setdiff(c("file", "skip", "comment"))

##  [1] "col_names"       "col_types"       "locale"          "na"             
##  [5] "quoted_na"       "quote"           "trim_ws"         "n_max"          
##  [9] "guess_max"       "progress"        "skip_empty_rows"

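3 - What are the most important arguments to `read_fwf()`?

The most important argument is `col_positions`, which tells `read_fwf()` where each fixed-width column begins and ends (built with `fwf_widths()`, `fwf_positions()`, or `fwf_empty()`).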

4 - Sometimes strings in a CSV file contain commas. To prevent them from causing problems, they need to be surrounded by a quoting character, like `"` or `'`. By default, `read_csv()` assumes that the quoting character will be `"`. What argument to `read_csv()` do you need to specify to read the following text into a data frame?

read_csv("x,y\n1,'a,b'", quote = "'")

## # A tibble: 1 x 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b

read_delim("x,y\n1,'a,b'", delim = ",", quote = "'")

## # A tibble: 1 x 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b

"x,y\n1,'a,b'"

## [1] "x,y\n1,'a,b'"

read_csv("x,y\n1,'a,b'", quote = "'")

## # A tibble: 1 x 2
##       x y    
##   <dbl> <chr>
## 1     1 a,b

5 - Identify what is wrong with each of the following inline CSV files. What happens when you run the code?

read_csv("a,b\n1,2,3\n4,5,6")

## Warning: 2 parsing failures.
## row col  expected    actual         file
##   1  -- 2 columns 3 columns literal data
##   2  -- 2 columns 3 columns literal data

## # A tibble: 2 x 2
##       a     b
##   <dbl> <dbl>
## 1     1     2
## 2     4     5

read_csv("a,b,c\n1,2\n1,2,3,4")

## Warning: 2 parsing failures.
## row col  expected    actual         file
##   1  -- 3 columns 2 columns literal data
##   2  -- 3 columns 4 columns literal data

## # A tibble: 2 x 3
##       a     b     c
##   <dbl> <dbl> <dbl>
## 1     1     2    NA
## 2     1     2     3

read_csv("a,b\n\"1")

## Warning: 2 parsing failures.
## row col                     expected    actual         file
##   1  a  closing quote at end of file           literal data
##   1  -- 2 columns                    1 columns literal data

## # A tibble: 1 x 2
##       a b    
##   <dbl> <chr>
## 1     1 <NA>

read_csv("a,b\n1,2\na,b")

## # A tibble: 2 x 2
##   a     b    
##   <chr> <chr>
## 1 1     2    
## 2 a     b

read_csv("a;b\n1;3")

## # A tibble: 1 x 1
##   `a;b`
##   <chr>
## 1 1;3

str(parse_logical(c("TRUE", "FALSE", "NA")))

##  logi [1:3] TRUE FALSE NA

str(parse_integer(c("1", "2", "3")))

##  int [1:3] 1 2 3

str(parse_date(c("2010-01-01", "1979-10-14")))

##  Date[1:2], format: "2010-01-01" "1979-10-14"

parse_integer(c("1", "231", ".", "456"), na = ".")

## [1]   1 231  NA 456

x <- parse_integer(c("123", "345", "abc", "123.45"))

## Warning: 2 parsing failures.
## row col               expected actual
##   3  -- an integer                abc
##   4  -- no trailing characters    .45

problems(x)

## # A tibble: 2 x 4
##     row   col expected               actual
##   <int> <int> <chr>                  <chr> 
## 1     3    NA an integer             abc   
## 2     4    NA no trailing characters .45

parse_double("1.23")

## [1] 1.23

parse_double("1,23", locale = locale(decimal_mark = ","))

## [1] 1.23

parse_number("$100")

## [1] 100

parse_number("20%")

## [1] 20

parse_number("It cost $123.45")

## [1] 123.45

charToRaw("Hadley")

## [1] 48 61 64 6c 65 79

# Two strings written as raw byte escapes rather than UTF-8 text. Printed as-is
# they may come out garbled (x2 does below); the parse_character() calls further
# down decode them with an explicit encoding.
x1 <- "El Ni\xf1o was particularly bad this year"  # decoded below as Latin1
x2 <- "\x82\xb1\x82\xf1\x82\xc9\x82\xbf\x82\xcd"  # decoded below as Shift-JIS

x1

## [1] "El Niño was particularly bad this year"

#> [1] "El Ni\xf1o was particularly bad this year"
x2

## [1] "‚±‚ñ‚É‚¿‚Í"

#> [1] "\x82\xb1\x82\xf1\x82??\xbf\x82\xcd"

parse_character(x1, locale = locale(encoding = "Latin1"))

## [1] "El Niño was particularly bad this year"

parse_character(x2, locale = locale(encoding = "Shift-JIS"))

## [1] "<U+3053><U+3093><U+306B><U+3061><U+306F>"

guess_encoding(charToRaw(x1))

## # A tibble: 2 x 2
##   encoding   confidence
##   <chr>           <dbl>
## 1 ISO-8859-1       0.46
## 2 ISO-8859-9       0.23

#> # A tibble: 2 x 2
#>   encoding   confidence
#>   <chr>           <dbl>
#> 1 ISO-8859-1       0.46
#> 2 ISO-8859-9       0.23
guess_encoding(charToRaw(x2))

## # A tibble: 1 x 2
##   encoding confidence
##   <chr>         <dbl>
## 1 KOI8-R         0.42

#> # A tibble: 1 x 2
#>   encoding confidence
#>   <chr>         <dbl>
#> 1 KOI8-R         0.42

fruit <- c("apple", "banana")
parse_factor(c("apple", "banana", "bananana"), levels = fruit)

## Warning: 1 parsing failure.
## row col           expected   actual
##   3  -- value in level set bananana

## [1] apple  banana <NA>  
## attr(,"problems")
## # A tibble: 1 x 4
##     row   col expected           actual  
##   <int> <int> <chr>              <chr>   
## 1     3    NA value in level set bananana
## Levels: apple banana

#> Warning: 1 parsing failure.
#> row col           expected   actual
#>   3  -- value in level set bananana
#> [1] apple  banana <NA>  
#> attr(,"problems")
#> # A tibble: 1 x 4
#>     row   col expected           actual  
#>   <int> <int> <chr>              <chr>

#> 1     3    NA value in level set bananana
#> Levels: apple banana

parse_datetime("2010-10-01T2010")

## [1] "2010-10-01 20:10:00 UTC"

parse_datetime("20101010")

## [1] "2010-10-10 UTC"

parse_date("2010-10-01")

## [1] "2010-10-01"

library(hms)

parse_time("01:10 am")

## 01:10:00

parse_time("20:10:01")

## 20:10:01

# The same input string yields three different dates depending on the format
# string supplied as the second argument.
parse_date("01/02/15", "%m/%d/%y")  # month / day / two-digit year

## [1] "2015-01-02"

parse_date("01/02/15", "%d/%m/%y")  # day / month / two-digit year

## [1] "2015-02-01"

parse_date("01/02/15", "%y/%m/%d")  # two-digit year / month / day

## [1] "2001-02-15"

parse_date("1 janvier 2015", "%d %B %Y", locale = locale("fr"))

## [1] "2015-01-01"

2 - What happens if you try to set `decimal_mark` and `grouping_mark` to the same character? What happens to the default value of `grouping_mark` when you set `decimal_mark` to `","`? What happens to the default value of `decimal_mark` when you set `grouping_mark` to `"."`?

locale(decimal_mark = ',')

## <locale>
## Numbers:  123.456,78
## Formats:  %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
##         (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
##         June (Jun), July (Jul), August (Aug), September (Sep), October
##         (Oct), November (Nov), December (Dec)
## AM/PM:  AM/PM

read_csv() and read_tsv()

## Error: <text>:1:12: unexpected symbol
## 1: read_csv() and
##                ^

# Third part of the question: set only grouping_mark = "." and inspect the
# resulting decimal mark. The "Numbers" line shows "," as the decimal mark,
# i.e. the default decimal_mark flips to "," just as grouping_mark flipped to
# "." in the decimal_mark = "," call earlier.
locale(grouping_mark = ".")

## <locale>
## Numbers:  123.456,78
## Formats:  %AD / %AT
## Timezone: UTC
## Encoding: UTF-8
## <date_names>
## Days:   Sunday (Sun), Monday (Mon), Tuesday (Tue), Wednesday (Wed), Thursday
##         (Thu), Friday (Fri), Saturday (Sat)
## Months: January (Jan), February (Feb), March (Mar), April (Apr), May (May),
##         June (Jun), July (Jul), August (Aug), September (Sep), October
##         (Oct), November (Nov), December (Dec)
## AM/PM:  AM/PM

3 - I didn't discuss the `date_format` and `time_format` options to `locale()`. What do they do? Construct an example that shows when they might be useful.

parse_date("01/02/15", locale = locale(date_format = "%d/%m/%y"))

## [1] "2015-02-01"

4- If you live outside the US, create a new locale object that encapsulates the settings for the types of file you read most commonly.

6- What are the most common encodings used in Europe? What are the most common encodings used in Asia? Do some googling to find out.

7- Generate the correct format string to parse each of the following dates and times: