######################################### Q1 ##############################################
#One of the key observation after reading the R handout is try to use vectorized version of a function
#rather than using lapply/sapply or looping through (using while/repeat/for) Here is the experiment and data for it
#Example finding sin of values in an vector.
#get 10 Million numbers beweteen 0, 2* pi randomly
nums = runif(10000000, 0,2*pi)
# use the vector implementation to find sine of 10M points above and time it
system.time(sin(nums))
## user system elapsed
## 0.64 0.05 0.77
# use sapply to iterate and find the sin of the vector of 10 Million numbers
system.time(sapply(nums,sin))
## user system elapsed
## 13.28 0.22 14.50
# Write a custom iterator to iterate over the 10 Million numbers and find the sum
sin_cal = function(nums){
sum = 0
for (elem in nums){
sum = sum + sin(elem)
}
return(sum)
}
# time the handwritten iterator
system.time(sin_cal(nums))
## user system elapsed
## 8.15 0.04 9.02
#it is clear from the above data that the vectorize verision is 11X faster than the non vectorized version. sappply and the iteration (loop) are having the similar runtime.
#Another observation is, since R being a interpreted language it is slower in task where looping (nested looping) is required. If the task can't be vectorized and is a bottleneck you should think of implementing it in C/C++ and calling it thorugh R, you will see dramatic improvement in speed.
# bumpup the expressions limit to avoid the early stack overflow
options(expressions=500000)
######################################### Q2 ##############################################
# gamma function implementation (iterative version)
# log_gamma = log(!(n-1)) for n > 1
# log_gamma(1) = 0 for n = 1
log_gamma_loop = function (n){
sum = 0
val = n-1
while(val > 0){
sum = sum + log(val)
val = val -1
}
return(sum)
}
test = 0
if(test){
print(log_gamma_loop(5))
print(log_gamma_loop(1))
}
######################################### Q3 ##############################################
# gamma function implementation (recursive version)
# log_gamma = log(!(n-1)) for n > 1
# log_gamma(1) = 0 for n = 1
log_gamma_recursive = function (n){
if(n <= 1){ #base case log_gamma(1) == 0
return(0)
}else{
return(log_gamma_recursive(n-1) + log(n-1))
}
}
#print(log_gamma_recursive(5))
######################################### Q4 ##############################################
sum_log_gamma_loop = function(n){
sum = 0
for(i in seq(1,n,by=1)){
sum = sum + log_gamma_loop(i)
}
return(sum)
}
sum_log_gamma_recursive = function(n){
if(n > 1){
return (sum_log_gamma_recursive(n-1) + log_gamma_recursive(n))
}else{
return(log_gamma_recursive(n))
}
}
sum_log_gamma_recursive(5)
## [1] 5.66296
#system.time(sum_log_gamma_loop(500000))
#system.time(sum_log_gamma_recursive())
#system.time(lgamma(10^8))
val_loop = sum_log_gamma_loop(10^2);val_loop
## [1] 15617.2
#system.time()
#seq = 1:10^5;
#val = sum(sapply(1:10^2,FUN=lgamma));val
######################################### Q5 ##############################################
# generate a geometric series for the test input max upto 10^8
geomSeries <- function(base, max) {
base^(0:floor(log(max, base)))
}
test_pattern_larger = geomSeries(base=2, max=2^16)
test_pattern_small = geomSeries(base=2, max=2048)
# generate the runtime comparision of the three version of sum_log_gamma
# we pass option skip argument which skips the recursive version of the
#sum_lgamma = as.numeric(0) # holds the vector of running time for sum_lgamma / library implementation
sum_log_gamma_lib = function(n){
l = (sum(sapply(1:n,FUN=lgamma)))
return(l)
}
run_test_vector = function(test_pattern,Func){
sum_lgamma = vector(mode="numeric", length=0); # holds the runtime of the results
for (elem in test_pattern){
l = system.time(Func(elem))[[1]]
print(l)
sum_lgamma = c(sum_lgamma,l)
}
return(sum_lgamma)
}
sum_log_gamma_lib_time = 0.0
# Run small set of test case where the recursive example does not overflow
sum_log_gamma_lib_time = run_test_vector(test_pattern_small,sum_log_gamma_lib)
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
sum_log_gamma_loop_time = run_test_vector(test_pattern_small,sum_log_gamma_loop)
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0.05
## [1] 0.19
## [1] 0.78
## [1] 3.05
sum_log_gamma_recursive_time = run_test_vector(test_pattern_small,sum_log_gamma_recursive)
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0.02
## [1] 0.03
## [1] 0.16
## [1] 0.66
## [1] 2.68
## [1] 10
df = data.frame(sum_log_gamma_lib_time,sum_log_gamma_loop_time,sum_log_gamma_recursive_time)
df = cbind(df,size=test_pattern_small)
df
## sum_log_gamma_lib_time sum_log_gamma_loop_time
## 1 0 0.00
## 2 0 0.00
## 3 0 0.00
## 4 0 0.00
## 5 0 0.00
## 6 0 0.00
## 7 0 0.00
## 8 0 0.00
## 9 0 0.05
## 10 0 0.19
## 11 0 0.78
## 12 0 3.05
## sum_log_gamma_recursive_time size
## 1 0.00 1
## 2 0.00 2
## 3 0.00 4
## 4 0.00 8
## 5 0.00 16
## 6 0.00 32
## 7 0.02 64
## 8 0.03 128
## 9 0.16 256
## 10 0.66 512
## 11 2.68 1024
## 12 10.00 2048
# Run larger set of test case with sum_log_gamma_lib() and sum_log_gamma_loop()
sum_log_gamma_lib_time = run_test_vector(test_pattern_larger,sum_log_gamma_lib)
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0.01
## [1] 0
## [1] 0.03
## [1] 0.01
## [1] 0.03
## [1] 0.05
sum_log_gamma_loop_time = run_test_vector(test_pattern_larger,sum_log_gamma_loop)
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0
## [1] 0.02
## [1] 0.04
## [1] 0.28
## [1] 0.93
## [1] 3.66
## [1] 12.19
## [1] 43.26
## [1] 170.82
## [1] 674.97
## [1] 2663.28
df_large = data.frame(sum_log_gamma_lib_time,sum_log_gamma_loop_time)
df_large = cbind(df_large,size=test_pattern_larger)
df_large
## sum_log_gamma_lib_time sum_log_gamma_loop_time size
## 1 0.00 0.00 1
## 2 0.00 0.00 2
## 3 0.00 0.00 4
## 4 0.00 0.00 8
## 5 0.00 0.00 16
## 6 0.00 0.00 32
## 7 0.00 0.00 64
## 8 0.00 0.02 128
## 9 0.00 0.04 256
## 10 0.00 0.28 512
## 11 0.00 0.93 1024
## 12 0.01 3.66 2048
## 13 0.00 12.19 4096
## 14 0.03 43.26 8192
## 15 0.01 170.82 16384
## 16 0.03 674.97 32768
## 17 0.05 2663.28 65536
plotfunc = function(df){
require(ggplot2)
if("sum_log_gamma_recursive_time" %in% colnames(df)){
ggplot(df, aes(size)) +
geom_line(aes(y = sum_log_gamma_loop_time, colour = "sum_log_gamma_loop_time")) +
geom_line(aes(y = sum_log_gamma_recursive_time, colour = "sum_log_gamma_recursive_time"))+
geom_line(aes(y = sum_log_gamma_lib_time, colour = "sum_log_gamma_lib_time"))+
labs(x="Size",y="time(s)")
}else{
ggplot(df, aes(size)) +
geom_line(aes(y = sum_log_gamma_loop_time, colour = "sum_log_gamma_loop_time")) +
geom_line(aes(y = sum_log_gamma_lib_time, colour = "sum_log_gamma_lib_time"))+
labs(x="Size",y="time(s)")
}
}
plotfunc(df)
## Loading required package: ggplot2

plotfunc(df_large)

#Error: C stack usage is too close to the limit