Preparing data for analysis: functions and loops

# Document-wide setup for the slides.
# NOTE(review): htmltools.dir.version = FALSE presumably keeps version numbers
# out of htmltools dependency directory names -- confirm against htmltools docs.
options(htmltools.dir.version = FALSE)
# Default knitr chunk options applied to every code chunk that follows.
knitr::opts_chunk$set(
  #fig.width=9, fig.height=3.5, fig.retina=3,
  fig.retina = 3,
  #out.height = "100%",
  cache = FALSE,    # re-run every chunk on each render
  echo = TRUE,      # show the source code of each chunk
  message = FALSE, # hide messages emitted by chunks
  warning = FALSE,  # hide warnings emitted by chunks
  hiline = TRUE     # NOTE(review): not a core knitr option; presumably read by the slide template -- confirm
)
library(here)
library(emo)
library(tidyverse)
library(knitr)
library(flextable)
library(texreg)
library(sjPlot)
library(bp)

Writing a function in R

  • I want to write a function that randomly assigns the students on a list to pairs
  • Here is the list of student initials
initials <- c("SA","KE", "EK", "KL", "ZM", "PS", "ANT", "AMT", "ZLZ", "YZ")
  • I want to end up with
     member1 member2
[1,] "AMT"   "PS"   
[2,] "YZ"    "ANT"  
[3,] "SA"    "KE"   
[4,] "EK"    "ZM"   
[5,] "ZLZ"   "KL"   

Writing a function in R

Start with code that works for one case

# randomly sample half of the students
member1 <- sample(initials, size = length(initials)/2, replace = FALSE)
member1
[1] "ANT" "SA"  "PS"  "EK"  "YZ" 
# shuffle the other half of the students
member2 <- sample(setdiff(initials, member1))
member2
[1] "KL"  "KE"  "ZLZ" "AMT" "ZM" 

Think about the output that you want

cbind(member1, member2)
     member1 member2
[1,] "ANT"   "KL"   
[2,] "SA"    "KE"   
[3,] "PS"    "ZLZ"  
[4,] "EK"    "AMT"  
[5,] "YZ"    "ZM"   

Writing a function in R

  • Turn the code into a function
    • What are the arguments?
    • Give it a simple but informative name
# Randomly split the elements of x into two columns of partners.
# x: vector of student identifiers (e.g. initials).
# Returns the matrix built by cbind(), with columns "member1" and "member2".
makepairs <- function(x){
  n_first <- floor(length(x)/2)
  # draw the first member of each pair
  first_half <- sample(x, size = n_first, replace = FALSE)
  # everyone not drawn above, in random order
  second_half <- sample(setdiff(x, first_half))
  cbind(member1 = first_half, member2 = second_half)
}
makepairs(initials)
     member1 member2
[1,] "KE"    "PS"   
[2,] "KL"    "ZLZ"  
[3,] "ZM"    "YZ"   
[4,] "ANT"   "AMT"  
[5,] "EK"    "SA"   

Writing a function in R

  • Make it generalizable!
newinitials <- c("SA","KE", "EK", "KL", "ZM", "PS", "ANT", "AMT", "ZLZ", "YZ", "AM")
makepairs(newinitials)
     member1 member2
[1,] "KE"    "ZM"   
[2,] "ZLZ"   "SA"   
[3,] "KL"    "EK"   
[4,] "ANT"   "PS"   
[5,] "YZ"    "AMT"  
[6,] "KE"    "AM"   
  • What is wrong with the current function?

Writing a function in R

  • This function works for both odd and even numbers of students: with an odd number, the leftover student is paired with NA
# Randomly pair up the elements of x.
#
# x: vector of student identifiers (e.g. initials); duplicates are allowed.
# Returns a two-column matrix with columns "member1" and "member2", one row
# per pair. When length(x) is odd, member1 is one element short and is padded
# with NA, so the leftover student appears in member2 next to an NA.
makepairs <- function(x){
  # Shuffle positions rather than values: sample(setdiff(x, member1)) would
  # drop students who share initials, and sample() on a single number k
  # draws from 1:k instead of returning k.
  shuffled <- x[sample.int(length(x))]
  n_first <- floor(length(x)/2)
  member1 <- head(shuffled, n_first)
  member2 <- tail(shuffled, length(x) - n_first)
  L <- max(length(member1), length(member2))
  # `length<-` pads the shorter vector with NA so cbind() does not recycle it
  length(member1) <- L
  length(member2) <- L
  return(cbind(member1, member2))
}
makepairs(newinitials)
     member1 member2
[1,] "KL"    "KE"   
[2,] "AMT"   "ZM"   
[3,] "AM"    "SA"   
[4,] "ANT"   "YZ"   
[5,] "PS"    "EK"   
[6,] NA      "ZLZ"  

Writing a function in R

  • To save the output from a function
todayspairs <- makepairs(initials)
todayspairs
     member1 member2
[1,] "KE"    "ZM"   
[2,] "AMT"   "ANT"  
[3,] "KL"    "SA"   
[4,] "ZLZ"   "YZ"   
[5,] "EK"    "PS"   

Writing a function in R

  • If you want your function to return more than one object
# Randomly pair up the elements of x and return the pairing together with
# its two halves.
#
# x: vector of student identifiers (e.g. initials); duplicates are allowed.
# Returns an unnamed list of three objects:
#   [[1]] two-column matrix of pairs (columns "member1" and "member2"),
#   [[2]] the member1 vector, [[3]] the member2 vector.
# When length(x) is odd, member1 is padded with NA to the length of member2.
makepairs <- function(x){
  # Shuffle positions rather than values: sample(setdiff(x, member1)) would
  # drop students who share initials, and sample() on a single number k
  # draws from 1:k instead of returning k.
  shuffled <- x[sample.int(length(x))]
  n_first <- floor(length(x)/2)
  member1 <- head(shuffled, n_first)
  member2 <- tail(shuffled, length(x) - n_first)
  L <- max(length(member1), length(member2))
  # `length<-` pads the shorter vector with NA so cbind() does not recycle it
  length(member1) <- L
  length(member2) <- L
  pairs <- cbind(member1, member2)
  # a function returns a single object, so bundle the three results in a list
  return(list(pairs, member1, member2))
}
makepairs(initials)
[[1]]
     member1 member2
[1,] "AMT"   "ZM"   
[2,] "SA"    "KL"   
[3,] "ZLZ"   "YZ"   
[4,] "PS"    "KE"   
[5,] "EK"    "ANT"  

[[2]]
[1] "AMT" "SA"  "ZLZ" "PS"  "EK" 

[[3]]
[1] "ZM"  "KL"  "YZ"  "KE"  "ANT"

Writing a function in R

  • If your function returns more than one object, save the result and extract each object with [[ ]]
todayspairs <- makepairs(initials)
todayspairs[[1]]
     member1 member2
[1,] "PS"    "SA"   
[2,] "ZM"    "ANT"  
[3,] "AMT"   "YZ"   
[4,] "KE"    "ZLZ"  
[5,] "KL"    "EK"   
todayspairs[[2]]
[1] "PS"  "ZM"  "AMT" "KE"  "KL" 
todayspairs[[3]]
[1] "SA"  "ANT" "YZ"  "ZLZ" "EK" 

Making use of lists in R

  • We have blood pressure data on patients from three different hospitals and want to combine them into one data set
head(hospital1)
  PATID TRT SYST DIAST HR
1     1   0  123    58 72
2     2   1  124    60 76
3     3   0  145    67 75
4     4   1  129    66 72
5     5   0  124    78 67
6     6   0  132    65 69
head(hospital2)
    PATID TRT SYST DIAST HR
101   101   0  140    71 92
102   102   0  130    67 94
103   103   0  145    66 93
104   104   0  139    86 93
105   105   1  129    59 76
106   106   0  129    72 91
head(hospital3)
    PATID TRT SYST DIAST HR
201   201   1  107    60 63
202   202   0  108    65 65
203   203   0  101    65 64
204   204   1  106    66 58
205   205   1  101    60 61
206   206   0  154    57 63

Making use of lists in R

  • But first, some quality checks
# Collect the three hospital data sets in one (unnamed) list so the same
# operation can be applied to each of them.
list_hosp <- list(hospital1, hospital2, hospital3)
# Apply summary() to every element; the result is a list with one summary per data set.
lapply(list_hosp, FUN = summary)
[[1]]
     PATID             TRT            SYST           DIAST      
 Min.   :  1.00   Min.   :0.00   Min.   :108.0   Min.   :48.00  
 1st Qu.: 25.75   1st Qu.:0.00   1st Qu.:125.0   1st Qu.:59.00  
 Median : 50.50   Median :0.00   Median :134.0   Median :65.00  
 Mean   : 50.50   Mean   :0.47   Mean   :137.6   Mean   :64.97  
 3rd Qu.: 75.25   3rd Qu.:1.00   3rd Qu.:148.2   3rd Qu.:69.00  
 Max.   :100.00   Max.   :1.00   Max.   :178.0   Max.   :85.00  
       HR        
 Min.   : 55.00  
 1st Qu.: 63.00  
 Median : 68.00  
 Mean   : 68.86  
 3rd Qu.: 73.00  
 Max.   :102.00  

[[2]]
     PATID            TRT            SYST           DIAST      
 Min.   :101.0   Min.   :0.00   Min.   : 89.0   Min.   :38.00  
 1st Qu.:125.8   1st Qu.:0.00   1st Qu.:115.8   1st Qu.:62.75  
 Median :150.5   Median :0.00   Median :127.0   Median :70.00  
 Mean   :150.5   Mean   :0.49   Mean   :124.8   Mean   :69.89  
 3rd Qu.:175.2   3rd Qu.:1.00   3rd Qu.:133.2   3rd Qu.:78.00  
 Max.   :200.0   Max.   :1.00   Max.   :157.0   Max.   :94.00  
       HR        
 Min.   : 60.00  
 1st Qu.: 74.00  
 Median : 78.00  
 Mean   : 78.47  
 3rd Qu.: 82.00  
 Max.   :103.00  

[[3]]
     PATID            TRT            SYST           DIAST       
 Min.   :201.0   Min.   :0.00   Min.   :101.0   Min.   : 49.00  
 1st Qu.:213.2   1st Qu.:0.00   1st Qu.:139.2   1st Qu.: 56.25  
 Median :225.5   Median :0.00   Median :152.5   Median : 60.00  
 Mean   :225.5   Mean   :0.48   Mean   :149.0   Mean   : 62.34  
 3rd Qu.:237.8   3rd Qu.:1.00   3rd Qu.:161.5   3rd Qu.: 65.00  
 Max.   :250.0   Max.   :1.00   Max.   :183.0   Max.   :133.00  
       HR        
 Min.   : 58.00  
 1st Qu.: 62.00  
 Median : 64.50  
 Mean   : 66.14  
 3rd Qu.: 69.75  
 Max.   :101.00  

Making use of lists in R

  • Create a new variable HOSPID for each data set
# Tag every data set with the position of its hospital in list_hosp.
# seq_along() follows the length of the list, so adding another hospital
# needs no change here, and an empty list runs zero iterations.
for(i in seq_along(list_hosp)){
  list_hosp[[i]]$HOSPID <- i
}
  • Then use reduce() and rbind() to concatenate the three data sets
# Combine the elements of list_hosp into one object by applying rbind()
# across the list with reduce().
allhosp <- reduce(list_hosp, rbind)
head(allhosp)
  PATID TRT SYST DIAST HR HOSPID
1     1   0  123    58 72      1
2     2   1  124    60 76      1
3     3   0  145    67 75      1
4     4   1  129    66 72      1
5     5   0  124    78 67      1
6     6   0  132    65 69      1

In-class assignment

  • Today’s in-class assignment is available in the Week 3 Module