library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1 ✔ purrr 0.2.4
## ✔ tibble 1.4.2 ✔ dplyr 0.7.4
## ✔ tidyr 0.8.0 ✔ stringr 1.2.0
## ✔ readr 1.1.1 ✔ forcats 0.2.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(foreach)
##
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
##
## accumulate, when
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
library(cytostats)
registerDoParallel(cores = 4)
singular_value_entropy_r <- function(A) {
# calculate the svd
singular_values <- svd(A, 0, 0)$d
# normalize relative values
singular_values <- singular_values/sum(singular_values)
-sum(singular_values * log10(singular_values))
}
score_features_sv_entropy_r_par <- function(data) {
# for each feature calculate the contribution to the entropy by leaving that feature out
sv_entropy <-
foreach(i = 1:ncol(data), .combine = c) %dopar% singular_value_entropy_r(data[-i, -i])
singular_value_entropy_r(data) - sv_entropy
}
score_features_sv_entropy_r <- function(data) {
# for each feature calculate the contribution to the entropy by leaving that feature out
sv_entropy <-
foreach(i = 1:ncol(data), .combine = c) %do% singular_value_entropy_r(data[-i, -i])
singular_value_entropy_r(data) - sv_entropy
}
n <- 10000
k <- 100
A <- matrix(rnorm(n * k), n, k)
testthat::expect_equal(score_features_sv_entropy_r(crossprod(A, A)),
cytostats::score_features_sv_entropy(crossprod(A, A)))
profile_timings <- function(f) {
n <- 10000
timings <-
seq(10, 150, 5) %>%
map_df(function(k) {
A <- matrix(rnorm(n * k), n, k)
te <-
rbenchmark::benchmark(
f(crossprod(A, A)),
replications = 3) %>%
magrittr::extract2("elapsed")
data_frame(k, te)
})
m <- lm(te ~ poly(k, 2), timings)
print(timings %>% ggplot(aes(k, te)) + geom_point() + geom_smooth(method = "loess"))
print(summary(m))
testset <- data_frame(k = c(200, 500, 1000, 1500))
testset$te <- predict(m, testset)
print(testset)
}
profile_timings(f = score_features_sv_entropy_r)

##
## Call:
## lm(formula = te ~ poly(k, 2), data = timings)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.05813 -0.02377 0.01062 0.02140 0.05540
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.32386 0.00581 55.74 <2e-16 ***
## poly(k, 2)1 1.59250 0.03129 50.90 <2e-16 ***
## poly(k, 2)2 0.61669 0.03129 19.71 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03129 on 26 degrees of freedom
## Multiple R-squared: 0.9913, Adjusted R-squared: 0.9907
## F-statistic: 1489 on 2 and 26 DF, p-value: < 2.2e-16
##
## # A tibble: 4 x 2
## k te
## <dbl> <dbl>
## 1 200 2.10
## 2 500 16.1
## 3 1000 68.7
## 4 1500 158
profile_timings(f = score_features_sv_entropy_r_par)

##
## Call:
## lm(formula = te ~ poly(k, 2), data = timings)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.098012 -0.028504 -0.007682 0.005987 0.206515
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.24862 0.01300 19.130 < 2e-16 ***
## poly(k, 2)1 0.85983 0.06999 12.285 2.49e-12 ***
## poly(k, 2)2 0.33307 0.06999 4.759 6.35e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.06999 on 26 degrees of freedom
## Multiple R-squared: 0.8697, Adjusted R-squared: 0.8597
## F-statistic: 86.79 on 2 and 26 DF, p-value: 3.114e-12
##
## # A tibble: 4 x 2
## k te
## <dbl> <dbl>
## 1 200 1.21
## 2 500 8.77
## 3 1000 37.2
## 4 1500 85.4
profile_timings(f = score_features_sv_entropy)

##
## Call:
## lm(formula = te ~ poly(k, 2), data = timings)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.086966 -0.033448 -0.001619 0.019889 0.174698
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.221241 0.008883 24.91 < 2e-16 ***
## poly(k, 2)1 1.230548 0.047834 25.73 < 2e-16 ***
## poly(k, 2)2 0.578820 0.047834 12.10 3.48e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04783 on 26 degrees of freedom
## Multiple R-squared: 0.9688, Adjusted R-squared: 0.9664
## F-statistic: 404.1 on 2 and 26 DF, p-value: < 2.2e-16
##
## # A tibble: 4 x 2
## k te
## <dbl> <dbl>
## 1 200 1.75
## 2 500 14.5
## 3 1000 63.4
## 4 1500 147