Importing and Exporting Data with R

Learning Objectives

  • Importing CSV files into R

  • Importing Parquet and RDS files into R

  • Exporting data from R

Let’s start by loading packages for this activity.

library(data.table)
library(tidyverse)
library(arrow)
library(bench)
library(flextable)
library(ggbeeswarm)
# Importing the possums data set from a URL
possums <- read.csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv")
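
After importing, it is worth a quick sanity check that the data arrived as expected. A minimal sketch using base R:

# Quick sanity check: dimensions and structure of the imported data
dim(possums)
str(possums)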

Let’s take a look at the first few rows and columns of the possums data set.

# Printing first 6 columns and first 5 rows
possums |> 
  dplyr::select(1:6) |> 
  slice_head(n = 5) |>
  flextable() |> 
  autofit()

case  site         sex     age  head_length_mm  skull_width_mm
1     Cambarville  Male    8    94.1            60.4
2     Cambarville  Female  6    92.5            57.6
3     Cambarville  Female  6    94.0            60.0
4     Cambarville  Female  6    93.2            57.1
5     Cambarville  Female  2    91.5            56.3

Importing data into R

We can use RStudio’s Import Dataset wizard to import data into R; it generates code that uses the read.csv() function.

possum <- read.csv("C:/Users/user/Desktop/Grand Valley/Winter 2024/STA 518/possum.csv")
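
Absolute paths like the one above only work on a single machine. A more portable sketch, assuming the file has been placed in a data/ folder inside the current working directory (a hypothetical location):

# Importing from a path relative to the working directory (hypothetical location)
possum <- read.csv("data/possum.csv")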

We can also use the read_csv() function from the readr package.

library(readr)
possum <- read_csv("C:/Users/user/Desktop/Grand Valley/Winter 2024/STA 518/possum.csv")

The read_csv() function can likewise import data directly from a URL.

possums <- read_csv("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv")
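
By default, read_csv() guesses each column’s type and prints a message about its guesses. We can declare types explicitly instead; a minimal sketch using column names from the table above:

# Declaring column types explicitly instead of letting read_csv() guess
possums <- read_csv(
  "https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv",
  col_types = cols(site = col_character(), sex = col_character(),
                   age = col_double(), .default = col_guess())
)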

Next, let’s import the data from a Parquet file, first downloading the file from GitHub and then reading it locally.

library(arrow)

# Downloading file from GitHub
download.file(url = "https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.parquet", destfile = "possum.parquet", mode = "wb")

# Import after manually downloading file
possums <- read_parquet("possum.parquet")
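
One advantage of the columnar Parquet format is that we can read only the columns we need. A minimal sketch using read_parquet()’s col_select argument:

# Reading only selected columns from the Parquet file
possums_subset <- read_parquet("possum.parquet", col_select = c(site, age))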

Lastly, we can use the fread() function from the data.table package.

library(data.table)
# Importing data from a url
possums <- fread("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv")
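
fread() can similarly restrict an import to specific columns via its select argument; a minimal sketch:

# Importing only selected columns with fread()
possums_subset <- fread("https://raw.githubusercontent.com/dilernia/STA418-518/main/Data/possum.csv",
                        select = c("site", "age"))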

Let’s generate a moderately sized data set to time the different import functions.

# Generating 'big data'
set.seed(1994)
x <- runif(5e4)
y <- runif(5e4)
x[sample(5e4, 5e3)] <- NA
y[sample(5e4, 5e3)] <- NA
bigData <- data.frame(x = x, y = y)

# Saving as CSV file w/ data.table
fwrite(bigData, "bigData.csv")

# Saving as parquet file
write_parquet(bigData, "bigData.parquet")

# Saving as RDS file
write_rds(bigData, "bigData.rds")
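
Before timing the reads, we can compare how much disk space each format uses with base R’s file.size():

# Comparing file sizes (in bytes) across the three formats
file.size("bigData.csv")
file.size("bigData.parquet")
file.size("bigData.rds")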

Let’s compare the run times of the different read functions using the bench package.

library(bench)

# Comparing run times
readBmResult <- mark(read.csv("bigData.csv"), 
                     read_csv("bigData.csv", show_col_types = FALSE), 
                     fread("bigData.csv"), 
                     read_rds("bigData.rds"),
                     read_parquet("bigData.parquet", as_data_frame = TRUE),
                     check = FALSE, min_iterations = 5) 

ggObj <- plot(readBmResult)

importTimes <- ggObj$data |> 
  mutate(expression = paste0(map_chr(str_split(expression, pattern = "[(]"), 1), "()"))

# Printing table
importTimes |> 
  arrange(desc(median)) |> 
  dplyr::select(expression:mem_alloc) |> 
  distinct() |> 
  flextable() |> 
  autofit()

expression      min       median    itr/sec      mem_alloc
read.csv()      45.17ms   47.16ms   19.89971     1.7MB
read_csv()      12.83ms   15.64ms   63.90479     359.65KB
read_parquet()  5.56ms    6.07ms    161.29775    9.01KB
fread()         3.58ms    3.87ms    245.35529    795.97KB
read_rds()      549.7µs   795.5µs   1,174.39498  395.83KB

Let’s visualize the performance of the different import functions.

# Creating violin plots
importTimes |> 
  ggplot(aes(x = time, y = fct_reorder(expression, time))) + 
  geom_violin(fill = "dodgerblue") + 
  geom_jitter(height = 0.2, pch = 21, fill = "black", color = "white") + 
  labs(title = "Comparison of read times", y = "", x = "Run time") + 
  theme_bw()
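
To save the plot for use outside of R, a minimal sketch using ggsave() (the file name and dimensions here are arbitrary choices):

# Saving the most recently displayed ggplot to a PNG file
ggsave("import_times.png", width = 7, height = 4, dpi = 300)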

Exporting data from R

Comparing different write functions

Reproduce the table and violin plots below, which compare the export speeds of the various write functions, using the code below.

library(bench)

# Comparing run times
writeBmResult <- mark(write.csv(bigData, "bigData.csv"), 
                      write_csv(bigData, "bigData.csv"), 
                      fwrite(bigData, "bigData.csv"), 
                      write_rds(bigData, "bigData.rds"),
                      write_parquet(bigData, "bigData.parquet"),
                      check = FALSE, min_iterations = 10) 

ggObj <- plot(writeBmResult)

exportTimes <- ggObj$data |> 
  mutate(expression = paste0(map_chr(str_split(expression, pattern = "[(]"), 1), "()"))

# Printing table
exportTimes |> 
  arrange(desc(median)) |> 
  dplyr::select(expression:mem_alloc) |> 
  distinct() |> 
  flextable() |> 
  autofit()

expression       min       median    itr/sec    mem_alloc
write.csv()      239.53ms  247.15ms  3.787915   520.4KB
write_csv()      93.01ms   96.19ms   9.794579   102.99KB
write_rds()      2.45ms    43.08ms   24.483151  8.63KB
write_parquet()  17.46ms   19.66ms   48.826228  14.94KB
fwrite()         6.65ms    9.19ms    91.135579  0B
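
Part of fwrite()’s speed comes from multithreading. data.table lets us check and set how many threads it uses, which can affect these timings:

# Checking and setting the number of threads used by data.table
getDTthreads()
setDTthreads(2)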

Let’s visualize the performance of the different export functions.

# Creating violin plots
exportTimes |> 
  ggplot(aes(x = time, y = fct_reorder(expression, time))) + 
  geom_violin(fill = "dodgerblue") + 
  geom_jitter(height = 0.2, pch = 21, fill = "black", color = "white") + 
  labs(title = "Comparison of write times", y = "", x = "Run time") + 
  theme_bw()
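
Finally, several of these write functions support compression, trading some write speed for smaller files. A minimal sketch, assuming the zstd codec is available in your arrow installation (codec availability varies by build):

# Writing a gzip-compressed RDS file with readr
write_rds(bigData, "bigData_gz.rds", compress = "gz")

# Writing a zstd-compressed Parquet file (codec availability depends on the arrow build)
write_parquet(bigData, "bigData_zstd.parquet", compression = "zstd")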