Normalizing columns
# Simulated input: a 400000 x 150 data frame of normal draws centred on 5
data <- as.data.frame(
  matrix(rnorm(4e5 * 150, mean = 5), nrow = 4e5)
)
# Center every column of `d` by subtracting that column's mean.
#
# d: a data frame (the script passes an all-numeric one).
# Returns `d` with each column shifted by its own mean.
normCols <- function(d) {
  # One mean per column, computed column-wise with apply()
  col_means <- apply(d, MARGIN = 2, FUN = mean)
  # Shift the columns one at a time
  for (j in seq_along(col_means)) {
    d[, j] <- d[, j] - col_means[j]
  }
  d
}
# Run normCols() once on `data`; invisible() keeps the returned data frame
# from being auto-printed at top level
invisible(normCols(data))
With timing
# Time the definition of normCols() plus one run of it over `data`
system.time({
  normCols <- function(d) {
    # Per-column means, computed with apply() over the columns
    col_means <- apply(d, MARGIN = 2, FUN = mean)
    # Remove each column's mean from that column
    for (j in seq_along(col_means)) {
      d[, j] <- d[, j] - col_means[j]
    }
    d
  }
  normCols(data)
})
With profiling
# Subtract each column's mean from that column of `d` and return the result.
normCols <- function(d) {
  # Vector holding one mean per column
  centers <- apply(d, MARGIN = 2, FUN = mean)
  for (k in seq_along(centers)) {
    # Overwrite column k with its mean-shifted values
    d[, k] <- d[, k] - centers[k]
  }
  d
}
# Apply normCols() to `data`; the result is not stored in a variable
normCols(data)
Four different ways of getting column means
library(profvis)
profvis({
  # Profile four alternatives for computing the per-column means of `data`;
  # each one overwrites `means`, so only the last result is kept.
  # apply(): coerces the data frame to a matrix before iterating over columns
  means <- apply(data, MARGIN = 2, FUN = mean)
  # colMeans(): dedicated column-mean routine
  means <- colMeans(data)
  # lapply(): iterates over the columns and returns a list
  means <- lapply(data, FUN = mean)
  # vapply(): like lapply(), but returns a numeric vector of length-1 results
  means <- vapply(data, FUN = mean, FUN.VALUE = numeric(1))
})
Optimized version
profvis({
  # Center every column of `d` by subtracting that column's mean.
  # Same contract as normCols(), but the means come from vapply(), which
  # walks the data frame's columns directly.
  #
  # d: a data frame of numeric columns.
  # Returns `d` with each column shifted by its own mean.
  normCols2 <- function(d) {
    means <- vapply(d, mean, numeric(1))
    for (i in seq_along(means)) {
      d[, i] <- d[, i] - means[i]
    }
    # Return the centered data explicitly: a `for` loop evaluates to NULL,
    # so without this line the function would discard its result
    d
  }
  normCols2(data)
})
Text processing
profvis({
  # Read the file and turn every space-separated token into one row of
  # `proc_data`, with columns row (line number), col and label (the token).
  lines <- readLines("output.prof")
  proc_lines <- list()
  for (i in seq_along(lines)) {
    line <- lines[i]
    # Split the line into its space-separated tokens
    line <- strsplit(line, " ")[[1]]
    linedata <- data.frame(
      # Repeat the line number once per token. A bare `i` cannot be recycled
      # against zero-length columns, so an empty line would make data.frame()
      # fail with "arguments imply differing number of rows".
      row = rep(i, length(line)),
      # Column index counted from the end of the line (last token is 1)
      col = rev(seq_along(line)),
      label = line
    )
    proc_lines[[i]] <- linedata
  }
  # rbind all the per-line data frames together
  proc_data <- do.call(rbind, proc_lines)
})
Faster version, using lists instead of data frames
profvis({
  # Same token table as the data-frame approach, but each line's pieces are
  # kept in a plain list and a single data frame is assembled at the end.
  lines <- readLines("output.prof")
  proc_lines <- list()
  for (i in seq_along(lines)) {
    # Tokens of line i, split on single spaces
    line <- strsplit(lines[i], " ")[[1]]
    # Per-line record: line number, reversed token position, token text
    linedata <- list(
      row = rep(i, times = length(line)),
      col = rev(seq_along(line)),
      label = line
    )
    proc_lines[[i]] <- linedata
  }
  # Pull the element called `name` out of every record and concatenate them
  extract_vector <- function(x, name) {
    do.call(c, lapply(x, function(entry) entry[[name]]))
  }
  proc_data <- data.frame(
    row = extract_vector(proc_lines, "row"),
    col = extract_vector(proc_lines, "col"),
    label = extract_vector(proc_lines, "label")
  )
})