1 Import data with readr

The front side of this sheet shows how to read text files into R with readr.

Try one of the following packages to import other types of files:

  • haven - SPSS, Stata, and SAS files

  • readxl - excel files (.xls and .xlsx)

  • DBI - databases

  • jsonlite - json

  • xml2 - XML

  • httr - Web APIs

  • rvest - HTML (Web Scraping)

library(tidyverse)
## -- Attaching packages ------------------------------------------ tidyverse 1.3.0 --
## √ ggplot2 3.2.1     √ purrr   0.3.3
## √ tibble  2.1.3     √ dplyr   0.8.3
## √ tidyr   1.0.0     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.4.0
## -- Conflicts --------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(nycflights13)
library(tidyr)
library(corrplot)
## corrplot 0.84 loaded
library(DT)

1.2 Save Data

1.3 Read Tabular Data

read_*(file, 
       col_names = TRUE, 
       col_types = NULL, 
       locale = default_locale(), 
       na = c("", "NA"),
       quoted_na = TRUE, 
       comment = "", 
       trim_ws = TRUE, 
       skip = 0, 
       n_max = Inf, 
       guess_max = min(1000,n_max), 
       progress = interactive())
write_file(x = "a,b,c\n1,2,3\n4,5,NA", path = "file.csv")

read_csv("file.csv")

write_file(x = "a;b;c\n1;2;3\n4;5;NA", path = "file2.csv")

read_csv2("file2.csv")

write_file(x = "a|b|c\n1|2|3\n4|5|NA", path = "file.txt")

read_delim("file.txt", delim = "|")

write_file(x = "a b c\n1 2 3\n4 5 NA", path = "file.fwf")

read_fwf("file.fwf", col_positions = c(1, 3, 5))

write_file(x = "a\tb\tc\n1\t2\t3\n4\t5\tNA", path = "file.tsv")

read_tsv("file.tsv") # Also read_table()

1.4 Read Non-Tabular Data

1.5 Data types

Use problems() to diagnose problems

x <- read_csv("file.csv");

problems(x)

2 Tidy Data with tidyr

View full data set with View() or glimpse()

Revert to data frame with as.data.frame()

2.1 Reshape Data

tidy_example <- tibble(country = LETTERS[1:20],
                       `1999` = c(0.7,37,212,2,45,8,9,41,12,12,1:10),
                       `2000` = `1999`+rnorm(20,100,1),
                       `2001` = `2000` + rnorm(20,10,10))

corrplot::corrplot(cor(tidy_example[,-1]),method = "number",type = "upper")

tidy_example_gather <- tidy_example %>% 
  gather(key = "year",value = "cases",-country)

tidy_example_gather %>% datatable()
tidy_example_gather %>% 
  spread(key = "year",value = "cases") %>% 
  datatable()

2.2 Handle Missing Values

  • drop_na(data, ...) Drop rows containing NA’s in … columns

  • fill(data, ..., .direction = c("down", "up")) Fill in NA’s in … columns with most recent non-NA values.

  • replace_na(data,replace = list(), ...) Replace NA’s by column

tidy_example[c(2,5),] <- NA

tidy_example %>% datatable()
drop_na(tidy_example) %>% 
  datatable()
fill(tidy_example,country:`2001`) %>% 
  datatable()
tidy_example %>% 
  mutate(x = replace_na(`1999`,0)) %>% datatable()
tidy_example %>% 
  replace_na(list(`2000` = 2)) %>% 
  datatable()

2.3 Expand Tables

mtcars %>% 
  select(cyl,gear,carb,everything()) %>% 
  arrange(cyl,gear) %>% 
  datatable()
mtcars %>% has_rownames()
## [1] TRUE
complete(mtcars, cyl, gear, carb) %>% datatable()
expand(mtcars, cyl, gear, carb) %>% datatable()

2.4 Split Cells

  • separate(data, col, into, sep = "[^[:alnum:]]+", remove = TRUE, convert = FALSE,extra = "warn", fill = "warn", ...) Separate each cell in a column to make several columns.

  • unite(data, col, ..., sep = "_", remove = TRUE) Collapse cells across several columns to make a single column

  • separate_rows(data, ..., sep = "[^[:alnum:].]+", convert = FALSE) Separate each cell in a column to make several rows

split_exam <- nycflights13::flights[1:100,1:8] 

split_exam %>% datatable()
split_exam <- split_exam %>% 
  unite(year,month,day,col = "date",sep = "-")  # 那几列按照什么规则sep合成col
  
split_exam$date <- as.Date(split_exam$date)  

split_exam %>% datatable()
split_exam %>% 
  separate(col = date,sep = "-",into = c("year","month","day"))  # 那一列col按照什么规则sep拆分into
## # A tibble: 100 x 8
##    year  month day   dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <chr> <chr> <chr>    <int>          <int>     <dbl>    <int>          <int>
##  1 2013  01    01         517            515         2      830            819
##  2 2013  01    01         533            529         4      850            830
##  3 2013  01    01         542            540         2      923            850
##  4 2013  01    01         544            545        -1     1004           1022
##  5 2013  01    01         554            600        -6      812            837
##  6 2013  01    01         554            558        -4      740            728
##  7 2013  01    01         555            600        -5      913            854
##  8 2013  01    01         557            600        -3      709            723
##  9 2013  01    01         557            600        -3      838            846
## 10 2013  01    01         558            600        -2      753            745
## # ... with 90 more rows