Library

pacman::p_load(
  dplyr,      # data cleaning 
  tidyverse  # data management and visualization
)

Basic Loop

# Loop over an integer sequence and print each value.
for (i in seq_len(3)) {
  print(i)
}

## [1] 1
## [1] 2
## [1] 3

# Loop directly over a character vector written inline.
for (i in c("Reto", "Ben", "Lea")) {
  print(i)
}

## [1] "Reto"
## [1] "Ben"
## [1] "Lea"

# Store the integer sequence first, then loop over the stored vector.
x <- seq_len(3)
for (i in x) {
  print(i)
}

## [1] 1
## [1] 2
## [1] 3

# character vector
participants <- c("Reto", "Ben", "Lea")
# Loop over the participants vector; `name` holds the current element.
for (name in participants) {
  # Print the loop variable itself. Printing `i` here would only repeat the
  # stale value left behind by an earlier loop.
  print(name)
}

## [1] "Reto"
## [1] "Ben"
## [1] "Lea"

# Pre-allocate a numeric vector with one slot per iteration.
output <- numeric(10)

# Square each integer from 1 to 10, report it, and store it by position.
for (i in seq_len(10)) {
  sq <- i^2
  print(paste0("i = ", i, ": sq =", sq))
  output[[i]] <- sq
}

## [1] "i = 1: sq =1"
## [1] "i = 2: sq =4"
## [1] "i = 3: sq =9"
## [1] "i = 4: sq =16"
## [1] "i = 5: sq =25"
## [1] "i = 6: sq =36"
## [1] "i = 7: sq =49"
## [1] "i = 8: sq =64"
## [1] "i = 9: sq =81"
## [1] "i = 10: sq =100"

Loop over variables in a data frame

Loop over an index

# Show the full built-in data set.
mtcars

# Keep only the four columns analysed below and show the subset.
cars <- mtcars[c("mpg", "wt", "disp", "hp")]
cars

# Print each column position followed by that column's standard deviation.
for (i in seq_len(4)) {
  print(i)
  print(sd(cars[[i]]))
}

## [1] 1
## [1] 6.026948
## [1] 2
## [1] 0.9784574
## [1] 3
## [1] 123.9387
## [1] 4
## [1] 68.56287

# Named container for the four standard deviations; filled in by the loop.
v <- setNames(rep(NA, 4), names(cars))

# Variant 1: double-bracket indexing on both the result and the data frame.
for (i in seq_along(v)) {
  v[[i]] <- sd(cars[[i]])
}
v

##         mpg          wt        disp          hp 
##   6.0269481   0.9784574 123.9386938  68.5628685

# Variant 2: single-bracket indexing gives the same values.
for (i in seq_len(4)) {
  v[i] <- sd(cars[, i])
}
v

##         mpg          wt        disp          hp 
##   6.0269481   0.9784574 123.9386938  68.5628685

# Summary table: one row per variable (mpg, wt, disp, hp) and one column per
# statistic.
v <- matrix(
  NA,
  nrow = 4,
  ncol = 3,
  dimnames = list(names(cars), c("mean", "sd", "N"))
)

# Fill one row per column of `cars`: mean, standard deviation, and the number
# of non-missing values, in the same order as the column names.
for (i in seq_len(4)) {
  v[i, ] <- c(mean(cars[[i]]), sd(cars[[i]]), sum(!is.na(cars[[i]])))
}

v

##           mean          sd  N
## mpg   20.09062   6.0269481 32
## wt     3.21725   0.9784574 32
## disp 230.72188 123.9386938 32
## hp   146.68750  68.5628685 32

Loop over variable names

Because a data frame always has named columns, we can index it by column name just as easily as by position. This is especially helpful when we want to analyze a few selected variables from a larger data frame (for example, mtcars).

# Columns of interest, picked by name from the wider mtcars data frame.
analysis_vars <- c("mpg", "wt", "disp", "hp")

# `i` is a column name here: print it, then the standard deviation of that
# column, extracted with double brackets.
for (i in analysis_vars) {
  print(i)
  print(sd(mtcars[[i]], na.rm = FALSE))
}

## [1] "mpg"
## [1] 6.026948
## [1] "wt"
## [1] 0.9784574
## [1] "disp"
## [1] 123.9387
## [1] "hp"
## [1] 68.56287

# Same loop with matrix-style indexing; selecting a single column this way
# drops the data frame and yields a plain vector.
for (i in analysis_vars) {
  print(i)
  print(sd(mtcars[, i, drop = TRUE]))
}

## [1] "mpg"
## [1] 6.026948
## [1] "wt"
## [1] 0.9784574
## [1] "disp"
## [1] 123.9387
## [1] "hp"
## [1] 68.56287

# Summary table for the selected variables: one row per variable, one column
# per statistic. Rows are addressed by variable name inside the loop.
v <- matrix(NA, nrow = length(analysis_vars), ncol = 3)
rownames(v) <- analysis_vars
colnames(v) <- c("mean", "sd", "N")

for (i in analysis_vars) {
  v[i, "mean"] <- mean(mtcars[, i])
  v[i, "sd"] <- sd(mtcars[, i])          # standard deviation, not the mean
  v[i, "N"] <- sum(!is.na(mtcars[, i]))  # number of non-missing values
}

v

##           mean          sd  N
## mpg   20.09062   6.0269481 32
## wt     3.21725   0.9784574 32
## disp 230.72188 123.9386938 32
## hp   146.68750  68.5628685 32

air.data <- airquality # data frame
head(air.data)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

# Manual calculation: one mean per column, ignoring missing values.
mean.1 <- mean(air.data[, 1], na.rm = TRUE)
mean.2 <- mean(air.data[, 2], na.rm = TRUE)
mean.3 <- mean(air.data[, 3], na.rm = TRUE)
mean.4 <- mean(air.data[, 4], na.rm = TRUE)
mean.5 <- mean(air.data[, 5], na.rm = TRUE)
mean.6 <- mean(air.data[, 6], na.rm = TRUE) # column 6 (Day), not column 1

# Collect the six means in column order, each exactly once.
means <- c(mean.1, mean.2, mean.3, mean.4, mean.5, mean.6)
means

## [1]  42.129310 185.931507   9.957516  77.882353   6.993464  15.803922

# For loop - numeric indices: standardise every column of air.data in place.
# seq_len() yields an empty sequence for a zero-column data frame, whereas
# 1:ncol() would count 1, 0. TRUE is spelled out because T can be reassigned.
for (i in seq_len(ncol(air.data))) {
  air.data[, i] <- scale(air.data[, i], center = TRUE, scale = TRUE)
}
head(air.data)

##         Ozone     Solar.R       Wind       Temp     Month       Day
## 1 -0.03423409  0.04517615 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.75430487 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.41008388  0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977  1.41095624  0.4378323 -1.6779609 -1.407294 -1.331592
## 5          NA          NA  1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817          NA  1.4029185 -1.2553634 -1.407294 -1.105973

# For loop - numeric indices using [[ ]]: double square brackets select a
# single data frame column.
air.data2 <- airquality
# Iterate over the columns of air.data2 itself, the object being modified,
# rather than borrowing the column count of a different data frame.
for (i in seq_along(air.data2)) {
  air.data2[[i]] <- scale(air.data2[[i]], center = TRUE, scale = TRUE)
}
head(air.data2)

##         Ozone     Solar.R       Wind       Temp     Month       Day
## 1 -0.03423409  0.04517615 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.75430487 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.41008388  0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977  1.41095624  0.4378323 -1.6779609 -1.407294 -1.331592
## 5          NA          NA  1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817          NA  1.4029185 -1.2553634 -1.407294 -1.105973

# For loop - character indices: address each column by its name.
# NOTE: air.data was already standardised by the numeric-index loop earlier in
# this file, so this second pass leaves the values essentially unchanged.
for (var in names(air.data)) {
  air.data[, var] <- scale(air.data[, var], center = TRUE, scale = TRUE)
}

head(air.data)

##         Ozone     Solar.R       Wind       Temp     Month       Day
## 1 -0.03423409  0.04517615 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.75430487 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.41008388  0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977  1.41095624  0.4378323 -1.6779609 -1.407294 -1.331592
## 5          NA          NA  1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817          NA  1.4029185 -1.2553634 -1.407294 -1.105973

Loops over rows in a data frame

# Simulate 7 respondents x 5 items, each answer drawn from 1..5 or NA.
set.seed(20210205)
q <- as.data.frame(
  matrix(sample(c(1:5, NA), 35, replace = TRUE), ncol = 5)
)
q

##   V1 V2 V3 V4 V5
## 1  4  2  3  1  3
## 2  5  3  1 NA  5
## 3 NA  3  5  3 NA
## 4  5  5  3  2  5
## 5  3  5  2 NA NA
## 6  5  5  1 NA  1
## 7 NA  1  3  5  1

# One scale score per row, labelled with the row names of q.
Vscale <- setNames(rep(NA, nrow(q)), row.names(q))

# Average the non-missing answers of each row.
for (i in seq_len(nrow(q))) {
  v <- as.matrix(q[i, ])
  Vscale[i] <- mean(v, na.rm = TRUE)
}

Vscale

##        1        2        3        4        5        6        7 
## 2.600000 3.500000 3.666667 4.000000 3.333333 3.000000 2.500000

Loops over a tibble

# Download the semicolon-delimited example data and parse it into a tibble.
tb <- readr::read_csv2(file = "http://rpository.com/ds4psy/data/tb.csv")

## ℹ Using "','" as decimal and "'.'" as grouping mark. Use `read_delim()` for more control.

## Rows: 100 Columns: 5
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ";"
## dbl (5): id, age, height, shoesize, IQ
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Number of rows and columns.
dim(tb)

## [1] 100   5

# First rows of the data.
head(tb)

## # A tibble: 6 × 5
##      id   age height shoesize    IQ
##   <dbl> <dbl>  <dbl>    <dbl> <dbl>
## 1     1    21    173       38    89
## 2     2    26    193       43    93
## 3     3    24    171       41    92
## 4     4    32    191       43    97
## 5     5    26    156       36   110
## 6     6    28    172       34   117

# Drop the first column (id) so only the measured variables remain.
tb_2 <- select(tb, -1)
tb_2

## # A tibble: 100 × 4
##      age height shoesize    IQ
##    <dbl>  <dbl>    <dbl> <dbl>
##  1    21    173       38    89
##  2    26    193       43    93
##  3    24    171       41    92
##  4    32    191       43    97
##  5    26    156       36   110
##  6    28    172       34   117
##  7    20    166       35   107
##  8    31    172       34   110
##  9    18    192       32    88
## 10    22    176       39   111
## # … with 90 more rows

# Variant 1: result vector sized with a literal 4 (one slot per column).
output <- numeric(4)

for (i in seq_len(4)) {
  mn <- mean(tb_2[[i]])
  output[[i]] <- mn
}

output

## [1]  26.29 177.78  39.05 104.85

# Variant 2: result vector sized from the length of the index sequence.
output <- numeric(length(seq_len(4)))

for (i in seq_len(4)) {
  mn <- mean(tb_2[[i]])
  output[[i]] <- mn
}

output

## [1]  26.29 177.78  39.05 104.85

# Variant 3: both the result length and the loop bound come from ncol(tb_2),
# so they cannot disagree if tb_2 gains or loses columns.
output <- vector("double", ncol(tb_2))

for (i in seq_len(ncol(tb_2))) {
  mn <- mean(tb_2[[i]])
  output[[i]] <- mn
}

output

## [1]  26.29 177.78  39.05 104.85

# Variant 4: seq_along() yields the integer column positions of tb_2.
output_2 <- vector("double", length(tb_2))

for (i in seq_along(tb_2)) {   # loop through COLUMNS of a df/table
  mn <- mean(tb_2[[i]])
  output_2[[i]] <- mn
}

output_2

## [1]  26.29 177.78  39.05 104.85

# Variant 5: same as variant 4, written with the column count.
output_2 <- vector("double", ncol(tb_2))

for (i in seq_len(ncol(tb_2))) {
  mn <- mean(tb_2[[i]])
  output_2[[i]] <- mn
}

output_2

## [1]  26.29 177.78  39.05 104.85

# Same means taken straight from `tb`: columns 2 to 5, skipping the id column.
output_4 <- numeric(4)

for (i in seq(2, 5)) {
  mn <- mean(tb[[i]])
  output_4[[i - 1]] <- mn  # shift by one so column 2 lands in slot 1
}

output_4

## [1]  26.29 177.78  39.05 104.85

# Both approaches should give the same four means.
all.equal(target = output, current = output_4)

## [1] TRUE

# Six-number summary of each measured variable, one call per column.
summary(tb[["age"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   22.00   24.50   26.29   30.25   46.00

summary(tb[["height"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   150.0   171.8   177.0   177.8   185.0   206.0

summary(tb[["shoesize"]])

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   36.00   39.00   39.05   42.00   47.00

# Keep one summary and check its storage type.
s <- summary(tb[["IQ"]])

typeof(s)

## [1] "double"

head(tb) # tibble

## # A tibble: 6 × 5
##      id   age height shoesize    IQ
##   <dbl> <dbl>  <dbl>    <dbl> <dbl>
## 1     1    21    173       38    89
## 2     2    26    193       43    93
## 3     3    24    171       41    92
## 4     4    32    191       43    97
## 5     5    26    156       36   110
## 6     6    28    172       34   117

# One list slot per measured variable (columns 2 to 5 of tb); a list is used
# because each summary() result is itself a vector of several values.
summarise <- vector("list", 4)

for (i in seq(2, 5)) {
  sm <- summary(tb[[i]])
  summarise[[i - 1]] <- sm  # column 2 goes to slot 1, and so on
}

summarise

## [[1]]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   15.00   22.00   24.50   26.29   30.25   46.00 
## 
## [[2]]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   150.0   171.8   177.0   177.8   185.0   206.0 
## 
## [[3]]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   29.00   36.00   39.00   39.05   42.00   47.00 
## 
## [[4]]
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    85.0    97.0   104.0   104.8   113.0   145.0

Loop over matrix

# Create matrix: four people (rows) by three measurements (columns).
# Values are supplied column by column.
x <- matrix(
  c(
    28, 35, 13, 13,          # Age
    1.62, 1.53, 1.83, 1.71,  # Size
    65, 59, 72, 83           # Weight
  ),
  nrow = 4,
  ncol = 3,
  dimnames = list(
    c("Veronica", "Karl", "Miriam", "Peter"),
    c("Age", "Size", "Weight")
  )
)

x

##          Age Size Weight
## Veronica  28 1.62     65
## Karl      35 1.53     59
## Miriam    13 1.83     72
## Peter     13 1.71     83

# Visit every cell row by row (outer loop: people, inner loop: measurements)
# and print a sentence describing it.
for (rname in rownames(x)) {
  for (cname in colnames(x)) {
    print(paste("The", cname, "of", rname, "is", x[[rname, cname]]))
  }
}

## [1] "The Age of Veronica is 28"
## [1] "The Size of Veronica is 1.62"
## [1] "The Weight of Veronica is 65"
## [1] "The Age of Karl is 35"
## [1] "The Size of Karl is 1.53"
## [1] "The Weight of Karl is 59"
## [1] "The Age of Miriam is 13"
## [1] "The Size of Miriam is 1.83"
## [1] "The Weight of Miriam is 72"
## [1] "The Age of Peter is 13"
## [1] "The Size of Peter is 1.71"
## [1] "The Weight of Peter is 83"

# Report the average of each measurement (one matrix column at a time).
for (var in colnames(x)) {
  m <- mean(x[, var, drop = TRUE])
  print(paste("Average", var, "is", m))
}

## [1] "Average Age is 22.25"
## [1] "Average Size is 1.6725"
## [1] "Average Weight is 69.75"

Collecting the output of loops

Individual values: Individual values can be collected in a named vector by combining functions c() and setNames()

# setNames() (stats package) attaches names to an object and returns the named
# copy; the original object is left unchanged.
x <- 1:5
# Take as many letters as x has elements so names and values stay in step.
x_names <- setNames(x, letters[seq_along(x)])
x_names

## a b c d e 
## 1 2 3 4 5

# Collect one named mean per column: setNames() labels each value with its
# column name and c() appends it to the results gathered so far.
mu <- NULL

for (var in names(air.data)) {
  mu <- c(mu, setNames(mean(air.data[, var], na.rm = TRUE), var))
}

mu

##         Ozone       Solar.R          Wind          Temp         Month 
## -1.754632e-17 -1.763889e-17 -2.275248e-17  4.988747e-18 -2.815949e-17 
##           Day 
## -1.585911e-17

Vectors: Vectors of the same size can be collected in a data.frame

# Start empty; each iteration binds one more standardised column on the right.
scaled_data <- NULL

for (var in names(air.data)) {
  scaled_data <- cbind(
    scaled_data,
    scale(air.data[, var], center = TRUE, scale = TRUE)
  )
}

# convert to data.frame and name
scaled_data <- as.data.frame(scaled_data) %>% setNames(names(air.data))
head(scaled_data)

##         Ozone     Solar.R       Wind       Temp     Month       Day
## 1 -0.03423409  0.04517615 -0.7259482 -1.1497140 -1.407294 -1.670019
## 2 -0.18580489 -0.75430487 -0.5556388 -0.6214670 -1.407294 -1.557210
## 3 -0.91334473 -0.41008388  0.7500660 -0.4101682 -1.407294 -1.444401
## 4 -0.73145977  1.41095624  0.4378323 -1.6779609 -1.407294 -1.331592
## 5          NA          NA  1.2326091 -2.3118573 -1.407294 -1.218782
## 6 -0.42831817          NA  1.4029185 -1.2553634 -1.407294 -1.105973

Nested loops

# Create an empty 5 x 5 matrix (5 rows and 5 columns), initially filled with NA.
mat <- matrix(nrow = 5, ncol = 5)

for (i in seq_len(nrow(mat))) {    # for each row
  for (j in seq_len(ncol(mat))) {  # for each column
    # assign values based on position: product of the two indexes
    mat[i, j] <- i * j
  }
}

Histogram

# Draw a histogram of the shoe sizes.
hist(tb_2$shoesize)

# Draw a histogram of the IQ values and keep the returned histogram object.
# The object records the expression passed to hist() (the printed xname below
# is "tb$IQ"), so the argument is deliberately left exactly as written.
h<-hist(tb$IQ)

## $breaks
## [1]  80  90 100 110 120 130 140 150
## 
## $counts
## [1] 12 29 29 20  8  1  1
## 
## $density
## [1] 0.012 0.029 0.029 0.020 0.008 0.001 0.001
## 
## $mids
## [1]  85  95 105 115 125 135 145
## 
## $xname
## [1] "tb$IQ"
## 
## $equidist
## [1] TRUE
## 
## attr(,"class")
## [1] "histogram"

# Print tb_2 again to recall the columns used for the histograms that follow.
tb_2

## # A tibble: 100 × 4
##      age height shoesize    IQ
##    <dbl>  <dbl>    <dbl> <dbl>
##  1    21    173       38    89
##  2    26    193       43    93
##  3    24    171       41    92
##  4    32    191       43    97
##  5    26    156       36   110
##  6    28    172       34   117
##  7    20    166       35   107
##  8    31    172       34   110
##  9    18    192       32    88
## 10    22    176       39   111
## # … with 90 more rows

# Fill colour for the histograms: the 4th entry of the ds4psy colour palette.
my_blue <- ds4psy::pal_ds4psy[[4]]

# Histogram for 1 column
# The title needs a space before "values:", otherwise the column name and the
# word run together (e.g. "agevalue:"); this also matches the loop version.
hist(tb_2[[1]], col = my_blue,
     main = paste0("Histogram of ", names(tb_2)[1], " values:"),
     xlab = names(tb_2)[1])

# For loop: draw one histogram per column and keep each histogram object.
out <- vector("list", 4) # output of loop

for (i in seq_along(tb_2)) {  # loop through COLUMNS of tb_2
  print(i)
  var_name <- names(tb_2)[i]
  title <- paste0("Histogram of ", var_name, " values:")
  x_lab <- var_name
  out[[i]] <- hist(tb_2[[i]], col = my_blue, main = title, xlab = x_lab)
}  # end for.

## [1] 1

## [1] 2

## [1] 3

## [1] 4

# Re-draw one of the stored histogram objects with a different colour.
plot(out[[2]], col = "gold")

# Fix the random seed so the simulated columns are reproducible.
set.seed(1)

# Four columns (a to d), each holding ten draws from a normal distribution.
tb <- tibble(a = rnorm(10), b = rnorm(10), c = rnorm(10), d = rnorm(10))

tb

## # A tibble: 10 × 4
##         a       b       c       d
##     <dbl>   <dbl>   <dbl>   <dbl>
##  1 -0.626  1.51    0.919   1.36  
##  2  0.184  0.390   0.782  -0.103 
##  3 -0.836 -0.621   0.0746  0.388 
##  4  1.60  -2.21   -1.99   -0.0538
##  5  0.330  1.12    0.620  -1.38  
##  6 -0.820 -0.0449 -0.0561 -0.415 
##  7  0.487 -0.0162 -0.156  -0.394 
##  8  0.738  0.944  -1.47   -0.0593
##  9  0.576  0.821  -0.478   1.10  
## 10 -0.305  0.594   0.418   0.763

# Rescale a vector linearly so that its smallest value maps to 0 and its
# largest to 1. Missing values are ignored when finding the extremes and
# stay missing in the result.
rescale01 <- function(x) {
  lo <- min(x, na.rm = TRUE)
  hi <- max(x, na.rm = TRUE)
  (x - lo) / (hi - lo)
}

# Rescale each column by hand, one statement per column, on a copy of tb.
df <- tb
df[["a"]] <- rescale01(df[["a"]])
df[["b"]] <- rescale01(df[["b"]])
df[["c"]] <- rescale01(df[["c"]])
df[["d"]] <- rescale01(df[["d"]])

df

## # A tibble: 10 × 4
##          a     b     c     d
##      <dbl> <dbl> <dbl> <dbl>
##  1 0.0860  1     1     1    
##  2 0.419   0.699 0.953 0.466
##  3 0       0.428 0.710 0.645
##  4 1       0     0     0.484
##  5 0.479   0.896 0.897 0    
##  6 0.00624 0.582 0.665 0.352
##  7 0.544   0.590 0.630 0.359
##  8 0.647   0.848 0.178 0.482
##  9 0.581   0.815 0.520 0.905
## 10 0.218   0.754 0.828 0.782

# Same rescaling done with a loop over the column positions of a fresh copy.
df2 <- tb

for (i in seq_len(ncol(df2))) {
  df2[[i]] <- rescale01(df2[[i]])
}

df2

## # A tibble: 10 × 4
##          a     b     c     d
##      <dbl> <dbl> <dbl> <dbl>
##  1 0.0860  1     1     1    
##  2 0.419   0.699 0.953 0.466
##  3 0       0.428 0.710 0.645
##  4 1       0     0     0.484
##  5 0.479   0.896 0.897 0    
##  6 0.00624 0.582 0.665 0.352
##  7 0.544   0.590 0.630 0.359
##  8 0.647   0.848 0.178 0.482
##  9 0.581   0.815 0.520 0.905
## 10 0.218   0.754 0.828 0.782

# Verify equality: the manual and the looped rescaling should agree.
all.equal(target = df, current = df2)

## [1] TRUE

Iteration, loops, and lists (1)

2023 April 02