Question 1

One of the key observations after reading the R handout is to try using vectorized version of a function rather than using lapply/sapply or looping through (using while/repeat/for).Vectorized version is order of magnitude (12X) faster than sapply or looping thorugh. Here is the experiment and related data:

#Example finding sine of values in a vector.

# get 10 Million numbers beweteen 0, 2* pi randomly
nums = runif(10000000, 0,2*pi)

# use the vector implementation to find sine of 10M points and time it
system.time(sum(sin(nums)))
##    user  system elapsed 
##    0.63    0.04    0.67
# use sapply to iterate and find the sine of the vector of 10 Million numbers
system.time(sum(sapply(nums,sin)))
##    user  system elapsed 
##   10.90    0.35   11.68
# Write a custom iterator to iterate over the 10 Million numbers and find the sum
sin_cal = function(nums){
  sum = 0 
  for (elem in nums){
    sum = sum + sin(elem)
  }
  return(sum)
}

# time the looping iterator
system.time(sin_cal(nums))
##    user  system elapsed 
##    7.32    0.07    7.52

it is clear from the above data that the vectorize verision is 12X faster than the non vectorized version. sappply and the iteration (loop) are having the similar runtime, sapply is being more slower in this particular case.

Another observation is, since R being a interpreted language it is slower in task where looping is done (Function call or operation inside a loop) . If the task can’t be vectorized and is a bottleneck we should think of implementing the bottleneck in C/C++ and calling it through R C/C++ interface, we will observe excellent improvement in speed.

Question 2

# gamma function implementation (iterative version)
# log_gamma = log(!(n-1)) for n > 1
# log_gamma(1) = 0 for n = 1

log_gamma_loop = function (n){
  sum = 0 
  val = n-1
  while(val > 0){
    sum = sum + log(val)
    val = val -1
  }
  return(sum)
}

Question 3

# gamma function implementation (recursive version)
# log_gamma = log(!(n-1)) for n > 1
# log_gamma(1) = 0 for n = 1

# bumpup the expressions limit to avoid the early stack overflow
options(expressions=500000)

log_gamma_recursive = function (n){
  if(n <= 1){ #base case log_gamma(1) == 0
    return(0)
  }else{
    return(log_gamma_recursive(n-1) + log(n-1))
  }
}

Question 4

Returns the sum of gamma function from 1…n Both recursive and iterative function (log_gamma_loop() and log_gamma_recursive()) are used.

sum_log_gamma_loop = function(n){
  sum = 0
  for(i in seq(1,n,by=1)){
    sum = sum + log_gamma_loop(i)
  }
  return(sum)    
}


sum_log_gamma_recursive = function(n){
  if(n > 1){
    return (sum_log_gamma_recursive(n-1) + log_gamma_recursive(n))
  }else{
    return(log_gamma_recursive(n))
  }
}

Question 5

# generate a geometric series for the test input upto max

geomSeries <- function(base, max) {
    base^(0:floor(log(max, base)))
}
test_pattern_midsize = geomSeries(base=2, max=2^14) #2^16

test_pattern_small = geomSeries(base=2, max=2048)

test_pattern_large = geomSeries(base=2, max=2^25)


# Plotting helper function
plotfunc = function(df){
  require(ggplot2)
  if("sum_log_gamma_recursive_time" %in% colnames(df)){
    ggplot(df, aes(size)) + 
      geom_line(aes(y = sum_log_gamma_loop_time, colour = "sum_log_gamma_loop_time")) + 
      geom_line(aes(y = sum_log_gamma_recursive_time, colour = "sum_log_gamma_recursive_time"))+
      geom_line(aes(y = sum_log_gamma_lib_time, colour = "sum_log_gamma_lib_time"))+
      labs(x="Size",y="time(s)") 
    
  }else{
    ggplot(df, aes(size)) + 
      geom_line(aes(y = sum_log_gamma_loop_time, colour = "sum_log_gamma_loop_time")) + 
      geom_line(aes(y = sum_log_gamma_lib_time, colour = "sum_log_gamma_lib_time"))+
      labs(x="Size",y="time(s)") 
  }
  
}

sum_log_gamma_lib = function(n){
  l = (sum(sapply(1:n,FUN=lgamma)))
  return(l)
}

# test runner wrapper, test pattern and the run function is passed as argument
# runtime vector is returned
run_test_vector = function(test_pattern,Func){
  sum_lgamma = vector(mode="numeric", length=0); # holds the runtime of the results
  for (elem in test_pattern){
    l = system.time(Func(elem))[[1]]
    sum_lgamma = c(sum_lgamma,l)
  }
  return(sum_lgamma) 
}

#gather runtime data an display it in table and plot form

# Run small set of test cases where the recursive example does not produce overflow
# generate the runtime comparision of the all three versions sum_log_gamma_lib() sum_log_gamma_loop() and sum_log_gamma_recursive()

sum_log_gamma_lib_time = run_test_vector(test_pattern_small,sum_log_gamma_lib)
sum_log_gamma_loop_time = run_test_vector(test_pattern_small,sum_log_gamma_loop)
sum_log_gamma_recursive_time = run_test_vector(test_pattern_small,sum_log_gamma_recursive)

df = data.frame(sum_log_gamma_lib_time,sum_log_gamma_loop_time,sum_log_gamma_recursive_time)
df =  cbind(size=test_pattern_small,df)

#display the table for comparision purpose
df 
##    size sum_log_gamma_lib_time sum_log_gamma_loop_time
## 1     1                   0.00                    0.00
## 2     2                   0.00                    0.00
## 3     4                   0.00                    0.00
## 4     8                   0.00                    0.00
## 5    16                   0.00                    0.00
## 6    32                   0.00                    0.00
## 7    64                   0.00                    0.00
## 8   128                   0.00                    0.02
## 9   256                   0.02                    0.04
## 10  512                   0.00                    0.16
## 11 1024                   0.00                    0.59
## 12 2048                   0.00                    2.38
##    sum_log_gamma_recursive_time
## 1                          0.00
## 2                          0.00
## 3                          0.00
## 4                          0.00
## 5                          0.00
## 6                          0.00
## 7                          0.01
## 8                          0.03
## 9                          0.10
## 10                         0.40
## 11                         1.75
## 12                         7.54
plotfunc(df)
## Loading required package: ggplot2

it is clear from the above plot and table that

# Run midsize  test case with sum_log_gamma_lib() and sum_log_gamma_loop()

sum_log_gamma_lib_time = run_test_vector(test_pattern_midsize,sum_log_gamma_lib)
sum_log_gamma_loop_time = run_test_vector(test_pattern_midsize,sum_log_gamma_loop)

df_midsize = data.frame(sum_log_gamma_lib_time,sum_log_gamma_loop_time)
df_midsize =  cbind(size=test_pattern_midsize,df_midsize)

df_midsize
##     size sum_log_gamma_lib_time sum_log_gamma_loop_time
## 1      1                      0                    0.00
## 2      2                      0                    0.00
## 3      4                      0                    0.00
## 4      8                      0                    0.00
## 5     16                      0                    0.00
## 6     32                      0                    0.00
## 7     64                      0                    0.00
## 8    128                      0                    0.00
## 9    256                      0                    0.05
## 10   512                      0                    0.16
## 11  1024                      0                    0.59
## 12  2048                      0                    2.45
## 13  4096                      0                    9.68
## 14  8192                      0                   40.08
## 15 16384                      0                  159.46
# now plot the data frames
plotfunc(df_midsize)

# Run large  test case with sum_log_gamma_lib()

sum_log_gamma_lib_time = run_test_vector(test_pattern_large,sum_log_gamma_lib)

df_large = data.frame(sum_log_gamma_lib_time)
df_large = cbind(size=test_pattern_large,df_large)

df_large
##        size sum_log_gamma_lib_time
## 1         1                   0.00
## 2         2                   0.00
## 3         4                   0.00
## 4         8                   0.00
## 5        16                   0.00
## 6        32                   0.00
## 7        64                   0.00
## 8       128                   0.00
## 9       256                   0.00
## 10      512                   0.00
## 11     1024                   0.00
## 12     2048                   0.00
## 13     4096                   0.00
## 14     8192                   0.01
## 15    16384                   0.00
## 16    32768                   0.03
## 17    65536                   0.03
## 18   131072                   0.10
## 19   262144                   0.21
## 20   524288                   0.41
## 21  1048576                   1.11
## 22  2097152                   2.47
## 23  4194304                   5.06
## 24  8388608                  12.32
## 25 16777216                  22.14
## 26 33554432                  45.95
# now plot the data frames

 ggplot(df_large, aes(size)) + 
      geom_line(aes(y = sum_log_gamma_lib_time, colour = "sum_log_gamma_lib_time"))+
      labs(x="Size",y="time(s)")