Functional programming with purrr

R Markdown

library(purrr)
library(dplyr)
library(repurrrsive)
library(data.table)
library(ggplot2)

# Names of the CSV files to read. `pattern` is a regular expression, not a
# shell glob, so the dot is escaped and the match is anchored at the end of
# the file name to pick up only names ending in ".csv".
files <- list.files("simulated_data_from_1990_to_2005", pattern = "\\.csv$")

# Loop solution
# Read each CSV into a plain data frame, storing it in a list keyed by the
# file name, and report how long the loop took.

all_csv <- list()

init_time <- Sys.time()
for (file in files) {
  csv_path <- paste0("simulated_data_from_1990_to_2005/", file)
  all_csv[[file]] <- fread(csv_path, data.table = FALSE)
}
Sys.time() - init_time

## Time difference of 0.2639539 secs

# Print the time elapsed since `init_time` was recorded (the bare number,
# without its unit, is what cat() shows for a difftime)
cat("loop function time: ", difftime(Sys.time(), init_time))

## loop function time:  0.268939

# Purrr solution
# Read every CSV with map(). To be comparable with the loop version, each file
# is read as a plain data frame (data.table = FALSE) and the resulting list is
# named by file name.
init_time <- Sys.time()
all_csv_purrr <- files %>%
  set_names() %>%
  map(~ fread(file.path("simulated_data_from_1990_to_2005", .x),
              data.table = FALSE))

cat("map function time: ", Sys.time() - init_time)

## map function time:  0.03615785

Other map functions

# Mean of every column of the first data frame, returned as a list
map(all_csv[[1]], mean)

## $years
## [1] 1990
## 
## $a
## [1] 4.920349
## 
## $b
## [1] 201.0297

# Typed variants: map_dbl() returns a double vector of column means and
# map_lgl() a logical vector flagging the columns whose mean exceeds 5
data.frame(
  average = map_dbl(all_csv[[1]], mean),
  logical = map_lgl(all_csv[[1]], function(col) mean(col) > 5)
)

##           average logical
## years 1990.000000    TRUE
## a        4.920349   FALSE
## b      201.029747    TRUE

#map_chr()

Maps with pipe operators

data("sw_films")

# Name each film entry after its own "title" element
sw_films <- set_names(sw_films, map_chr(sw_films, "title"))


# Four count observations for each Lake Erie site
waterfowl_data <- list(
  LakeErieS = c(0, 0, 10, 5),
  LakeErieN = c(0, 0, 1000, 5),
  LakeErieW = c(10000, 0, 0, 1),
  LakeErieE = c(10, 10, 5, 0)
)

# Natural log of the total count at each site
map(waterfowl_data, function(counts) log(sum(counts)))

## $LakeErieS
## [1] 2.70805
## 
## $LakeErieN
## [1] 6.912743
## 
## $LakeErieW
## [1] 9.21044
## 
## $LakeErieE
## [1] 3.218876

#map_df

# List of sites north, east, and west
sites <- list("north", "east", "west")

# Build one data frame per site with the columns name, a and b;
# a and b hold 200 random normal draws each
list_of_df <- map(sites, function(site) {
  data.frame(
    name = site,
    a = rnorm(n = 200, mean = 5, sd = 5 / 2),
    b = rnorm(n = 200, mean = 200, sd = 15)
  )
})

# Fit a ~ b on every data frame, then print the summary of each fit.
# `data = .` is kept so the printed Call of each model stays the same.
site_models <- map(list_of_df, ~ lm(a ~ b, data = .))
map(site_models, summary)

## [[1]]
## 
## Call:
## lm(formula = a ~ b, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.3707 -1.4360  0.1539  1.5372  8.0580 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  4.60056    2.77597   1.657    0.099 .
## b            0.00242    0.01378   0.176    0.861  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.711 on 198 degrees of freedom
## Multiple R-squared:  0.0001557,  Adjusted R-squared:  -0.004894 
## F-statistic: 0.03084 on 1 and 198 DF,  p-value: 0.8608
## 
## 
## [[2]]
## 
## Call:
## lm(formula = a ~ b, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.5307 -1.6329 -0.0207  1.5045  7.2908 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  6.084166   2.244388   2.711   0.0073 **
## b           -0.005489   0.011307  -0.485   0.6279   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.388 on 198 degrees of freedom
## Multiple R-squared:  0.001189,   Adjusted R-squared:  -0.003855 
## F-statistic: 0.2357 on 1 and 198 DF,  p-value: 0.6279
## 
## 
## [[3]]
## 
## Call:
## lm(formula = a ~ b, data = .)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.3305 -1.5904  0.1304  1.7518  8.9316 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) 3.947289   2.191565   1.801   0.0732 .
## b           0.004746   0.010934   0.434   0.6647  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.569 on 198 degrees of freedom
## Multiple R-squared:  0.0009506,  Adjusted R-squared:  -0.004095 
## F-statistic: 0.1884 on 1 and 198 DF,  p-value: 0.6647

# Extract the "director" element of every film, first as a list (here) and
# then as a character vector (next call)
map(sw_films, "director")

## $`A New Hope`
## [1] "George Lucas"
## 
## $`Attack of the Clones`
## [1] "George Lucas"
## 
## $`The Phantom Menace`
## [1] "George Lucas"
## 
## $`Revenge of the Sith`
## [1] "George Lucas"
## 
## $`Return of the Jedi`
## [1] "Richard Marquand"
## 
## $`The Empire Strikes Back`
## [1] "Irvin Kershner"
## 
## $`The Force Awakens`
## [1] "J. J. Abrams"

# Same extraction, returned as a named character vector
map_chr(sw_films, "director")

##              A New Hope    Attack of the Clones      The Phantom Menace 
##          "George Lucas"          "George Lucas"          "George Lucas" 
##     Revenge of the Sith      Return of the Jedi The Empire Strikes Back 
##          "George Lucas"      "Richard Marquand"        "Irvin Kershner" 
##       The Force Awakens 
##          "J. J. Abrams"

# Check whether each film was directed by George Lucas: list output here,
# logical vector output in the next call
map(sw_films, function(film) film[["director"]] == "George Lucas")

## $`A New Hope`
## [1] TRUE
## 
## $`Attack of the Clones`
## [1] TRUE
## 
## $`The Phantom Menace`
## [1] TRUE
## 
## $`Revenge of the Sith`
## [1] TRUE
## 
## $`Return of the Jedi`
## [1] FALSE
## 
## $`The Empire Strikes Back`
## [1] FALSE
## 
## $`The Force Awakens`
## [1] FALSE

# Same check, returned as a named logical vector
map_lgl(sw_films, function(film) film[["director"]] == "George Lucas")

##              A New Hope    Attack of the Clones      The Phantom Menace 
##                    TRUE                    TRUE                    TRUE 
##     Revenge of the Sith      Return of the Jedi The Empire Strikes Back 
##                    TRUE                   FALSE                   FALSE 
##       The Force Awakens 
##                   FALSE

map2 and pmap

# Means 1, 2 and 3, one per site
means <- list(1, 2, 3)

# Site labels, paired element-wise with `means`
sites <- list("north", "west", "east")

# Walk over both lists in parallel: one data frame per (site, mean) pair,
# holding the site label and 200 normal draws centred on that mean
list_of_files_map2 <- map2(sites, means, function(site, mu) {
  data.frame(sites = site, a = rnorm(n = 200, mean = mu, sd = 5 / 2))
})

# First ten rows of the first data frame
head(list_of_files_map2[[1]], 10)

##    sites            a
## 1  north  0.379258658
## 2  north  1.411342440
## 3  north  6.116933682
## 4  north -0.166818119
## 5  north -0.005043385
## 6  north  2.423186565
## 7  north  0.240921771
## 8  north -1.452356184
## 9  north -0.182885834
## 10 north  0.617260259

list_of_means <- list(5, 2, 10, 15)
list_if_sd <- list(0.6, 0.1, 3, 4)
list_of_smpls <- list(50, 100, 200, 250)

# Loop solutions
# Simulate one normal sample for every combination of mean, sd and sample
# size. Note that the nested loops visit all combinations, whereas pmap()
# pairs the three lists element-wise.
# The output list is preallocated, and the running index `num` is initialised
# once, before the loops, so every draw lands in its own slot instead of
# overwriting slot 1.
n_combos <- length(list_of_means) * length(list_if_sd) * length(list_of_smpls)
sim_data <- vector("list", n_combos)

num <- 1
for (i in list_of_means) {
  for (j in list_if_sd) {
    for (k in list_of_smpls) {
      sim_data[[num]] <- rnorm(mean = i, sd = j, n = k)
      num <- num + 1
    }
  }
}

# purrr solution
# pmap() steps through the three lists in parallel; inside the lambda ..1 is
# the mean, ..2 the standard deviation and ..3 the sample size.

input_list <- list(list_of_means, list_if_sd, list_of_smpls)

sim_data <- pmap(
  input_list,
  ~ data.frame(a = rnorm(n = ..3, mean = ..1, sd = ..2))
)

Safely and possibly

# Apply a safely()-wrapped log to each value; every element of the output
# holds a `result` and an `error` component
a <- map(list(-10, 1, 10, 0), safely(log, otherwise = NA_real_))

## Warning in .f(...): Se han producido NaNs

# Show the full list of result/error pairs
print(a)

## [[1]]
## [[1]]$result
## [1] NaN
## 
## [[1]]$error
## NULL
## 
## 
## [[2]]
## [[2]]$result
## [1] 0
## 
## [[2]]$error
## NULL
## 
## 
## [[3]]
## [[3]]$result
## [1] 2.302585
## 
## [[3]]$error
## NULL
## 
## 
## [[4]]
## [[4]]$result
## [1] -Inf
## 
## [[4]]$error
## NULL

# Print the result element of every entry in the list.
# `a` is a list of result/error pairs, so `a[["result"]]` is always NULL;
# the component has to be extracted from each entry instead.
map(a, "result")

## [[1]]
## [1] NaN
## 
## [[2]]
## [1] 0
## 
## [[3]]
## [1] 2.302585
## 
## [[4]]
## [1] -Inf

# Print the error element of every entry in the list
map(a, "error")

## [[1]]
## NULL
## 
## [[2]]
## NULL
## 
## [[3]]
## NULL
## 
## [[4]]
## NULL

# Log of each value through a possibly() wrapper, which substitutes
# NA_real_ whenever the wrapped call raises an error
a <- map(
  list(-10, 1, 10, 0),
  possibly(function(x) log(x), otherwise = NA_real_)
)

## Warning in log(x): Se han producido NaNs

walk

# Load the gap_split data
data(gap_split)

# One line chart of lifeExp over year for each of the first 10 elements of
# gap_split; imap() passes the element as .x and its name as .y, which is
# used as the plot title
plots <- gap_split[1:10] %>%
  imap(~ ggplot(.x, aes(year, lifeExp)) +
         geom_line() +
         labs(title = .y))

# walk() calls print() on every plot for its side effect only
walk(plots, print)

## final example

# Turn data into correct dataframe format: one row per film/character pair.
# The column to unnest is named explicitly, since calling unnest() without
# columns is deprecated and warns in current tidyr.
film_by_character <- tibble(filmtitle = map_chr(sw_films, "title")) %>%
    mutate(characters = map(sw_films, "characters")) %>%
    tidyr::unnest(characters)

# Keep height, mass, name and url from every sw_people entry and row-bind
# the pieces into a single data frame
sw_characters <- map_df(sw_people, ~ .x[c("height", "mass", "name", "url")])

# Join the two new objects on the character URL
character_data <- film_by_character %>%
    inner_join(sw_characters, by = c("characters" = "url")) %>%
    # Make sure the columns are numbers. Thousands separators are stripped
    # from mass first, so values such as "1,358" are parsed instead of
    # becoming NA; non-numeric entries still coerce to NA with a warning.
    mutate(height = as.numeric(height),
           mass = as.numeric(gsub(",", "", mass, fixed = TRUE)))

## Warning: NAs introducidos por coerción

## Warning: NAs introducidos por coerción

# Plot the heights, faceted by film title.
# geom_bar() counts the rows at each height, which is what
# geom_histogram(stat = "count") did, without the "Ignoring unknown
# parameters" warning caused by the unused binning arguments.
ggplot(character_data, aes(x = height)) +
  geom_bar() +
  facet_wrap(~ filmtitle)

## Warning: Removed 6 rows containing non-finite values (stat_count).

Functional programming with purrr

Miguel Arquez Abdala

1/1/2020

R Markdown

Other map functions

Maps with pipe operators

map2 and pmap

Safely and possibly

walk