library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 2.2.1     ✔ purrr   0.2.4
## ✔ tibble  1.4.2     ✔ dplyr   0.7.4
## ✔ tidyr   0.8.0     ✔ stringr 1.2.0
## ✔ readr   1.1.1     ✔ forcats 0.2.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(foreach)
## 
## Attaching package: 'foreach'
## The following objects are masked from 'package:purrr':
## 
##     accumulate, when
library(doParallel)
## Loading required package: iterators
## Loading required package: parallel
library(cytostats)

registerDoParallel(cores = 4)

singular_value_entropy_r <- function(A) {
  # calculate the svd
  singular_values <- svd(A, 0, 0)$d
  
  # normalize relative values
  singular_values <- singular_values/sum(singular_values)
  
  -sum(singular_values * log10(singular_values))
  
}

score_features_sv_entropy_r_par <- function(data) {
  
  # for each feature calculate the contribution to the entropy by leaving that feature out
  sv_entropy <- 
    foreach(i = 1:ncol(data), .combine = c) %dopar% singular_value_entropy_r(data[-i, -i])
  
  singular_value_entropy_r(data) - sv_entropy
}

score_features_sv_entropy_r <- function(data) {
  
  # for each feature calculate the contribution to the entropy by leaving that feature out
  sv_entropy <- 
    foreach(i = 1:ncol(data), .combine = c) %do% singular_value_entropy_r(data[-i, -i])
  
  singular_value_entropy_r(data) - sv_entropy
}

n <- 10000
k <- 100
A <- matrix(rnorm(n * k), n, k)
testthat::expect_equal(score_features_sv_entropy_r(crossprod(A, A)), 
                       cytostats::score_features_sv_entropy(crossprod(A, A)))

profile_timings <- function(f) {
  n <- 10000
  
  timings <- 
    seq(10, 150, 5) %>%
    map_df(function(k) {
      A <- matrix(rnorm(n * k), n, k)
      
      te <- 
        rbenchmark::benchmark(
          f(crossprod(A, A)),
          replications = 3) %>%
        magrittr::extract2("elapsed")
      
      data_frame(k, te) 
      
    })
  
  
  m <- lm(te ~ poly(k, 2), timings)
  
  print(timings %>%  ggplot(aes(k, te)) + geom_point() + geom_smooth(method = "loess"))

  print(summary(m))
  
  testset <- data_frame(k = c(200, 500, 1000, 1500))
                        
  testset$te <- predict(m, testset)  
  
  print(testset)
}


profile_timings(f = score_features_sv_entropy_r)

## 
## Call:
## lm(formula = te ~ poly(k, 2), data = timings)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.05813 -0.02377  0.01062  0.02140  0.05540 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.32386    0.00581   55.74   <2e-16 ***
## poly(k, 2)1  1.59250    0.03129   50.90   <2e-16 ***
## poly(k, 2)2  0.61669    0.03129   19.71   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03129 on 26 degrees of freedom
## Multiple R-squared:  0.9913, Adjusted R-squared:  0.9907 
## F-statistic:  1489 on 2 and 26 DF,  p-value: < 2.2e-16
## 
## # A tibble: 4 x 2
##       k     te
##   <dbl>  <dbl>
## 1   200   2.10
## 2   500  16.1 
## 3  1000  68.7 
## 4  1500 158
profile_timings(f = score_features_sv_entropy_r_par)

## 
## Call:
## lm(formula = te ~ poly(k, 2), data = timings)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.098012 -0.028504 -0.007682  0.005987  0.206515 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.24862    0.01300  19.130  < 2e-16 ***
## poly(k, 2)1  0.85983    0.06999  12.285 2.49e-12 ***
## poly(k, 2)2  0.33307    0.06999   4.759 6.35e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.06999 on 26 degrees of freedom
## Multiple R-squared:  0.8697, Adjusted R-squared:  0.8597 
## F-statistic: 86.79 on 2 and 26 DF,  p-value: 3.114e-12
## 
## # A tibble: 4 x 2
##       k    te
##   <dbl> <dbl>
## 1   200  1.21
## 2   500  8.77
## 3  1000 37.2 
## 4  1500 85.4
profile_timings(f = score_features_sv_entropy)

## 
## Call:
## lm(formula = te ~ poly(k, 2), data = timings)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.086966 -0.033448 -0.001619  0.019889  0.174698 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 0.221241   0.008883   24.91  < 2e-16 ***
## poly(k, 2)1 1.230548   0.047834   25.73  < 2e-16 ***
## poly(k, 2)2 0.578820   0.047834   12.10 3.48e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.04783 on 26 degrees of freedom
## Multiple R-squared:  0.9688, Adjusted R-squared:  0.9664 
## F-statistic: 404.1 on 2 and 26 DF,  p-value: < 2.2e-16
## 
## # A tibble: 4 x 2
##       k     te
##   <dbl>  <dbl>
## 1   200   1.75
## 2   500  14.5 
## 3  1000  63.4 
## 4  1500 147