Importing and Exporting Data with R

Learning Objectives

  • Importing CSV files into R

  • Importing Parquet and RDS files into R

  • Exporting data from R

Let’s start by loading packages for this activity.

library(data.table)
library(tidyverse)
library(arrow)
library(bench)
library(flextable)
library(ggbeeswarm)
# Importing the possums data set from a URL
possums <- read.csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv")
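
After importing, it is worth a quick sanity check that the data arrived as expected. A minimal sketch using base R:

# Quick sanity check: dimensions and structure of the imported data
dim(possums)
str(possums)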

Let’s take a look at the first few rows and columns of the possums data set.

# Printing first 6 columns and first 5 rows
possums |> 
  dplyr::select(1:6) |> 
  slice_head(n = 5) |>
  flextable() |> 
  autofit()

case  site         sex     age  head_length_mm  skull_width_mm
1     Cambarville  Male    8    94.1            60.4
2     Cambarville  Female  6    92.5            57.6
3     Cambarville  Female  6    94.0            60.0
4     Cambarville  Female  6    93.2            57.1
5     Cambarville  Female  2    91.5            56.3

Importing data into R

We can use RStudio’s Import Dataset wizard to import data into R; it generates code that uses the read.csv() function.

possum <- read.csv("C:/Users/user/Desktop/Grand Valley/Winter 2024/STA 518/possum.csv")
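
Absolute paths like the one above only work on a single machine. A more portable sketch, assuming the file has been placed in a data/ folder inside the current working directory (a hypothetical location):

# Importing from a path relative to the working directory (hypothetical location)
possum <- read.csv("data/possum.csv")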

We can also use the read_csv() function from the readr package.

library(readr)
possum <- read_csv("C:/Users/user/Desktop/Grand Valley/Winter 2024/STA 518/possum.csv")

The read_csv() function can likewise import data directly from a URL.

possums <- read_csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv")
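
By default, read_csv() guesses each column’s type and prints a message about its guesses. We can declare types explicitly instead; a minimal sketch using column names from the table above:

# Declaring column types explicitly instead of letting read_csv() guess
possums <- read_csv(
  "https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv",
  col_types = cols(site = col_character(), sex = col_character(),
                   age = col_double(), .default = col_guess())
)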

Next, let’s import the data from a Parquet file, first downloading the file from GitHub and then reading it locally.

library(arrow)

# Downloading file from GitHub
download.file(url = "https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.parquet", destfile = "possum.parquet", mode = "wb")

# Import after manually downloading file
possums <- read_parquet("possum.parquet")
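
One advantage of the columnar Parquet format is that we can read only the columns we need. A minimal sketch using read_parquet()’s col_select argument:

# Reading only selected columns from the Parquet file
possums_subset <- read_parquet("possum.parquet", col_select = c(site, age))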

Lastly, we can use the fread() function from the data.table package.

library(data.table)
# Importing data from a url
possums <- fread("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv")
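
fread() can similarly restrict an import to specific columns via its select argument; a minimal sketch:

# Importing only selected columns with fread()
possums_subset <- fread("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv",
                        select = c("site", "age"))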

Let’s generate a moderately sized data set to time the different import functions.

# Generating 'big data'
set.seed(1994)
x <- runif(5e4)
y <- runif(5e4)
x[sample(5e4, 5e3)] <- NA
y[sample(5e4, 5e3)] <- NA
bigData <- data.frame(x = x, y = y)

# Saving as CSV file w/ data.table
fwrite(bigData, "bigData.csv")

# Saving as parquet file
write_parquet(bigData, "bigData.parquet")

# Saving as RDS file
write_rds(bigData, "bigData.rds")
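
Before timing the reads, we can compare how much disk space each format uses with base R’s file.size():

# Comparing file sizes (in bytes) across the three formats
file.size("bigData.csv")
file.size("bigData.parquet")
file.size("bigData.rds")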

Let’s compare the run times of the different read functions using the bench package.

library(bench)

# Comparing run times
readBmResult <- mark(read.csv("bigData.csv"), 
                     read_csv("bigData.csv", show_col_types = FALSE), 
                     fread("bigData.csv"), 
                     read_rds("bigData.rds"),
                     read_parquet("bigData.parquet", as_data_frame = TRUE),
                     check = FALSE, min_iterations = 5) 

ggObj <- plot(readBmResult)

importTimes <- ggObj$data |> 
  mutate(expression = paste0(map_chr(str_split(expression, pattern = "[(]"), 1), "()"))

# Printing table
importTimes |> 
  arrange(desc(median)) |> 
  dplyr::select(expression:mem_alloc) |> 
  distinct() |> 
  flextable() |> 
  autofit()

expression      min       median    itr/sec      mem_alloc
read.csv()      45.17ms   47.16ms   19.89971     1.7MB
read_csv()      12.83ms   15.64ms   63.90479     359.65KB
read_parquet()  5.56ms    6.07ms    161.29775    9.01KB
fread()         3.58ms    3.87ms    245.35529    795.97KB
read_rds()      549.7µs   795.5µs   1,174.39498  395.83KB

Let’s visualize the performance of the different import functions.

# Creating violin plots
importTimes |> 
  ggplot(aes(x = time, y = fct_reorder(expression, time))) + 
  geom_violin(fill = "dodgerblue") + 
  geom_jitter(height = 0.2, pch = 21, fill = "black", color = "white") + 
  labs(title = "Comparison of read times", y = "", x = "Run time") + 
  theme_bw()
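
To save the plot for use outside of R, a minimal sketch using ggsave() (the file name and dimensions here are arbitrary choices):

# Saving the most recently displayed ggplot to a PNG file
ggsave("import_times.png", width = 7, height = 4, dpi = 300)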

Exporting data from R

Comparing different write functions

Reproduce the table and violin plots below, which compare the export speeds of the various write functions, using the code below.

library(bench)

# Comparing run times
writeBmResult <- mark(write.csv(bigData, "bigData.csv"), 
                      write_csv(bigData, "bigData.csv"), 
                      fwrite(bigData, "bigData.csv"), 
                      write_rds(bigData, "bigData.rds"),
                      write_parquet(bigData, "bigData.parquet"),
                      check = FALSE, min_iterations = 10) 

ggObj <- plot(writeBmResult)

exportTimes <- ggObj$data |> 
  mutate(expression = paste0(map_chr(str_split(expression, pattern = "[(]"), 1), "()"))

# Printing table
exportTimes |> 
  arrange(desc(median)) |> 
  dplyr::select(expression:mem_alloc) |> 
  distinct() |> 
  flextable() |> 
  autofit()

expression       min       median    itr/sec    mem_alloc
write.csv()      239.53ms  247.15ms  3.787915   520.4KB
write_csv()      93.01ms   96.19ms   9.794579   102.99KB
write_rds()      2.45ms    43.08ms   24.483151  8.63KB
write_parquet()  17.46ms   19.66ms   48.826228  14.94KB
fwrite()         6.65ms    9.19ms    91.135579  0B
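
Part of fwrite()’s speed comes from multithreading. data.table lets us check and set how many threads it uses, which can affect these timings:

# Checking and setting the number of threads used by data.table
getDTthreads()
setDTthreads(2)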

Let’s visualize the performance of the different export functions.

# Creating violin plots
exportTimes |> 
  ggplot(aes(x = time, y = fct_reorder(expression, time))) + 
  geom_violin(fill = "dodgerblue") + 
  geom_jitter(height = 0.2, pch = 21, fill = "black", color = "white") + 
  labs(title = "Comparison of write times", y = "", x = "Run time") + 
  theme_bw()
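
Finally, several of these write functions support compression, trading some write speed for smaller files. A minimal sketch, assuming the zstd codec is available in your arrow installation (codec availability varies by build):

# Writing a gzip-compressed RDS file with readr
write_rds(bigData, "bigData_gz.rds", compress = "gz")

# Writing a zstd-compressed Parquet file (codec availability depends on the arrow build)
write_parquet(bigData, "bigData_zstd.parquet", compression = "zstd")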