https://github.com/cytomining/cytominer/pull/118 implements cytostats::score_features_sv_entropy below.
library(tidyverse)
library(foreach)
library(doParallel)
library(cytostats)
registerDoParallel(cores = 1)
Reimplement in R
singular_value_entropy_r <- function(A) {
# calculate the svd
singular_values <- svd(A, 0, 0)$d
# normalize relative values
singular_values <- singular_values/sum(singular_values)
# no need to remove negative sv's because the input to score_features_sv_entropy_r will
# always be crossprod(A, A) which is PSD
-sum(singular_values * log10(singular_values))
}
score_features_sv_entropy_r <- function(data) {
# for each feature calculate the contribution to the entropy by leaving that feature out
sv_entropy <-
foreach(i = 1:ncol(data), .combine = c) %dopar% singular_value_entropy_r(data[-i, -i])
singular_value_entropy_r(data) - sv_entropy
}
Verify that the R implementation is identical to the Rcpp
n <- 10000
k <- 100
A <- matrix(rnorm(n * k), n, k)
testthat::expect_equal(score_features_sv_entropy_r(crossprod(A, A)),
cytostats::score_features_sv_entropy(crossprod(A, A)))
Profile the code over a range of values for k
profile_timings <- function(f) {
n <- 10000
timings <-
seq(10, 150, 5) %>%
map_df(function(k) {
A <- matrix(rnorm(n * k), n, k)
te <-
rbenchmark::benchmark(
f(crossprod(A, A)),
replications = 3) %>%
magrittr::extract2("elapsed")
data_frame(k, te)
})
# because this is O(k^2)
m <- lm(te ~ poly(k, 2), timings)
print(timings %>% ggplot(aes(k, te)) + geom_point() + geom_smooth(method = "loess") + xlab("Dimensions") + ylab("Elapsed time (s)"))
print(summary(m))
timings_estimated <- data_frame(k = c(200, 500, 1000, 1500))
timings_estimated$te <- predict(m, timings_estimated)
print(timings_estimated %>% ggplot(aes(k, te)) + geom_line() + xlab("Dimensions") + ylab("Elapsed time, estimated (s)"))
}
Rcpp implementation
profile_timings(f = score_features_sv_entropy)
Call:
lm(formula = te ~ poly(k, 2), data = timings)
Residuals:
Min 1Q Median 3Q Max
-0.098265 -0.017064 0.007229 0.016823 0.129679
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.210931 0.007221 29.21 < 2e-16 ***
poly(k, 2)1 1.149138 0.038885 29.55 < 2e-16 ***
poly(k, 2)2 0.513715 0.038885 13.21 4.83e-13 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.03888 on 26 degrees of freedom
Multiple R-squared: 0.9758, Adjusted R-squared: 0.9739
F-statistic: 523.9 on 2 and 26 DF, p-value: < 2.2e-16
R implementation
profile_timings(f = score_features_sv_entropy_r)
Call:
lm(formula = te ~ poly(k, 2), data = timings)
Residuals:
Min 1Q Median 3Q Max
-0.062419 -0.015226 0.003148 0.018015 0.073077
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.307000 0.005873 52.27 <2e-16 ***
poly(k, 2)1 1.555503 0.031626 49.18 <2e-16 ***
poly(k, 2)2 0.664002 0.031626 21.00 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.03163 on 26 degrees of freedom
Multiple R-squared: 0.991, Adjusted R-squared: 0.9903
F-statistic: 1430 on 2 and 26 DF, p-value: < 2.2e-16
R implementation (4 cores)
registerDoParallel(cores = 4)
profile_timings(f = score_features_sv_entropy_r)
Call:
lm(formula = te ~ poly(k, 2), data = timings)
Residuals:
Min 1Q Median 3Q Max
-0.037804 -0.018163 0.001008 0.011448 0.055553
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.217103 0.003788 57.32 < 2e-16 ***
poly(k, 2)1 0.666822 0.020397 32.69 < 2e-16 ***
poly(k, 2)2 0.243456 0.020397 11.94 4.72e-12 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.0204 on 26 degrees of freedom
Multiple R-squared: 0.979, Adjusted R-squared: 0.9774
F-statistic: 605.6 on 2 and 26 DF, p-value: < 2.2e-16