Getting Started
library(profvis)
# Generate data
times <- 4e5
cols <- 150
data <- as.data.frame(x = matrix(rnorm(times * cols, mean = 5), ncol = cols))
data <- cbind(id = paste0("g", seq_len(times)), data)
profvis({
data1 <- data # Store in another variable for this run
# Get column means
means <- apply(data1[, names(data1) != "id"], 2, mean)
# Subtract mean from each column
for (i in seq_along(means)) {
data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
}
})
system.time({
data1 <- data # Store in another variable for this run
# Get column means
means <- apply(data1[, names(data1) != "id"], 2, mean)
# Subtract mean from each column
for (i in seq_along(means)) {
data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
}
})
## user system elapsed
## 1.716 0.260 1.977
Example for identify the problem
# Generate data
times <- 4e5
cols <- 150
data <- as.data.frame(x = matrix(rnorm(times * cols, mean = 5), ncol = cols))
data <- cbind(id = paste0("g", seq_len(times)), data)
profvis({
data1 <- data # Store in another variable for this run
# Get column means
means <- apply(data1[, names(data1) != "id"], 2, mean)
# Subtract mean from each column
for (i in seq_along(means)) {
data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
}
})
system.time({
data1 <- data # Store in another variable for this run
# Get column means
means <- apply(data1[, names(data1) != "id"], 2, mean)
# Subtract mean from each column
for (i in seq_along(means)) {
data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
}
})
## user system elapsed
## 1.804 0.060 1.862
Compare the other functions with apply.
profvis({
data1 <- data
# Four different ways of getting column means
means <- apply(data1[, names(data1) != "id"], 2, mean)
means <- colMeans(data1[, names(data1) != "id"])
means <- lapply(data1[, names(data1) != "id"], mean)
means <- vapply(data1[, names(data1) != "id"], mean, numeric(1))
})
system.time(means <- apply(data1[, names(data1) != "id"], 2, mean))
## user system elapsed
## 1.064 0.076 1.143
system.time(means <- colMeans(data1[, names(data1) != "id"]))
## user system elapsed
## 0.168 0.064 0.232
system.time(means <- lapply(data1[, names(data1) != "id"], mean))
## user system elapsed
## 0.128 0.000 0.125
system.time(means <- vapply(data1[, names(data1) != "id"], mean, numeric(1)))
## user system elapsed
## 0.120 0.000 0.123
Replace apply with vapply
profvis({
data1 <- data
means <- vapply(data1[, names(data1) != "id"], mean, numeric(1))
for (i in seq_along(means)) {
data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
}
})
system.time({
data1 <- data
means <- vapply(data1[, names(data1) != "id"], mean, numeric(1))
for (i in seq_along(means)) {
data1[, names(data1) != "id"][, i] <- data1[, names(data1) != "id"][, i] - means[i]
}
})
## user system elapsed
## 0.744 0.000 0.746
Apply a function over columns that does both steps
profvis({
data1 <- data
# Given a column, normalize values and return them
col_norm <- function(col) {
col - mean(col)
}
# Apply the normalizer function over all columns except id
data1[, names(data1) != "id"] <- lapply(data1[, names(data1) != "id"], col_norm)
})
system.time({
data1 <- data
# Given a column, normalize values and return them
col_norm <- function(col) {
col - mean(col)
}
# Apply the normalizer function over all columns except id
data1[, names(data1) != "id"] <- lapply(data1[, names(data1) != "id"], col_norm)
})
## user system elapsed
## 0.196 0.000 0.196