Getting Started

library(profvis)
# Generate data
times <- 4e5
cols <- 150
data <- as.data.frame(x = matrix(rnorm(times * cols, mean = 5), ncol = cols))
data <- cbind(id = paste0("g", seq_len(times)), data)

profvis({
  data1 <- data   # Store in another variable for this run

  # Get column means
  means <- apply(data1[, names(data1) != "id"], 2, mean)

  # Subtract mean from each column
  for (i in seq_along(means)) {
    data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
  }
})

system.time({
  data1 <- data   # Store in another variable for this run

  # Get column means
  means <- apply(data1[, names(data1) != "id"], 2, mean)

  # Subtract mean from each column
  for (i in seq_along(means)) {
    data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
  }
})
##    user  system elapsed 
##   1.716   0.260   1.977

Example for identify the problem

# Generate data
times <- 4e5
cols <- 150
data <- as.data.frame(x = matrix(rnorm(times * cols, mean = 5), ncol = cols))
data <- cbind(id = paste0("g", seq_len(times)), data)

profvis({
  data1 <- data   # Store in another variable for this run

  # Get column means
  means <- apply(data1[, names(data1) != "id"], 2, mean)

  # Subtract mean from each column
  for (i in seq_along(means)) {
    data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
  }
})

system.time({
  data1 <- data   # Store in another variable for this run

  # Get column means
  means <- apply(data1[, names(data1) != "id"], 2, mean)

  # Subtract mean from each column
  for (i in seq_along(means)) {
    data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
  }
})
##    user  system elapsed 
##   1.804   0.060   1.862

Compare the other functions with apply.

profvis({
  data1 <- data
  # Four different ways of getting column means
  means <- apply(data1[, names(data1) != "id"], 2, mean)
  means <- colMeans(data1[, names(data1) != "id"])
  means <- lapply(data1[, names(data1) != "id"], mean)
  means <- vapply(data1[, names(data1) != "id"], mean, numeric(1))
})

system.time(means <- apply(data1[, names(data1) != "id"], 2, mean))
##    user  system elapsed 
##   1.064   0.076   1.143
system.time(means <- colMeans(data1[, names(data1) != "id"]))
##    user  system elapsed 
##   0.168   0.064   0.232
system.time(means <- lapply(data1[, names(data1) != "id"], mean))
##    user  system elapsed 
##   0.128   0.000   0.125
system.time(means <- vapply(data1[, names(data1) != "id"], mean, numeric(1)))
##    user  system elapsed 
##   0.120   0.000   0.123

Replace apply with vapply

profvis({
  data1 <- data
  means <- vapply(data1[, names(data1) != "id"], mean, numeric(1))

  for (i in seq_along(means)) {
    data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
  }
})

system.time({
  data1 <- data
  means <- vapply(data1[, names(data1) != "id"], mean, numeric(1))

  for (i in seq_along(means)) {
    data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
  }
})
##    user  system elapsed 
##   0.744   0.000   0.746

Apply a function over columns that does both steps

profvis({
  data1 <- data

  # Given a column, normalize values and return them
  col_norm <- function(col) {
    col - mean(col)
  }

  # Apply the normalizer function over all columns except id
  data1[, names(data1) != "id"] <- lapply(data1[, names(data1) != "id"], col_norm)
})

system.time({
  data1 <- data

  # Given a column, normalize values and return them
  col_norm <- function(col) {
    col - mean(col)
  }

  # Apply the normalizer function over all columns except id
  data1[, names(data1) != "id"] <- lapply(data1[, names(data1) != "id"], col_norm)
})
##    user  system elapsed 
##   0.196   0.000   0.196