# prof3.R

# Source
# http://adv-r.had.co.nz/Profiling.html
#identify bottlenecks within a function
#Lineprof

# For each sequence of calls, lineprof calculates:
# 
# time: the time in seconds
# 
# alloc: the memory allocated, in megabytes
# release: the memory released, in megabytes. Unless torture = TRUE this release is somewhat random: memory is only released if a garbage collection is triggered by an allocation.
# dups: the number of calls to the internal duplicate function which is called by C code to duplicate R vectors.

#install.packages("lineprof")
#install.packages("stringi")
#devtools::install_github("hadley/lineprof")
library(lineprof)
library(shiny)

## Warning: package 'shiny' was built under R version 3.2.3

# f: entry point of the small call chain that is profiled below.
# It calls pause(0.1), then g(), then h(), one call per line, and returns
# whatever h() returns.
# NOTE(review): `pause` is presumably lineprof's pause() (lineprof is attached
# above) -- confirm. The script rebinds `pause` near the end, so f() has to be
# run before that redefinition.
f <- function() {
  pause(0.1)
  g()
  h()
}
# g: middle level of the profiled call chain.
# It calls pause(0.1) and then h(), returning whatever h() returns.
g <- function() {
  pause(0.1)
  h()
}
# h: leaf of the profiled call chain; it only calls pause(0.1).
# It is called both directly from f() and indirectly through g().
h <- function() {
  pause(0.1)
}

# Profile one call to f() with lineprof() and keep the result in `l`.
# The bare `l` on the next line prints the profile when the script is run
# interactively; the recorded output follows.
l <- lineprof(f())
l

## Reducing depth to 2 (from 4)

##    time alloc release dups                 ref         src
## 1 0.045 0.001       0    2 c("pause", ".Call") pause/.Call
## 2 0.048 0.001       0    0           <text>#27 g/pause    
## 3 0.040 0.001       0    0           <text>#28 g/h        
## 4 0.040 0.000       0    0           <text>#31 h/pause

#shine(l)
#?lineprof
#Comparing functions using microbenchmark
library(microbenchmark)
library(data.table)
library(readr)

# Show the current working directory (the file paths below are absolute, so
# they do not depend on it).
getwd()

## [1] "C:/Users/dell/Desktop/Teaching"

# List the CSV files on the Desktop.
# `pattern` is a regular expression, so the dot is escaped and the match is
# anchored to the end of the file name. The original unanchored ".csv" also
# matched "BigDiamonds.csv.zip", as the recorded listing below (captured
# before this fix) shows.
dir("C:/Users/dell/Desktop/", pattern = "\\.csv$")

## [1] "Analytics decisionstats.com Audience Overview 20110617-20120717.csv"
## [2] "BigDiamonds.csv"                                                    
## [3] "BigDiamonds.csv.zip"                                                
## [4] "Boston.csv"                                                         
## [5] "ccFraud.csv"                                                        
## [6] "test.csv"

# Time fread() against read_csv() on the same file, evaluating each
# expression three times.
# NOTE(review): the recorded run below shows read_csv() warning about parsing
# problems on this file, so the two readers may not be doing equivalent work.
microbenchmark(
  fread("C:/Users/dell/Desktop/BigDiamonds.csv"),
  read_csv("C:/Users/dell/Desktop/BigDiamonds.csv"),
  times = 3
)

## 
## Read 0.0% of 598024 rows
## Read 28.4% of 598024 rows
## Read 53.5% of 598024 rows
## Read 80.3% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:07

## Warning: 597311 problems parsing 'C:/Users/dell/Desktop/BigDiamonds.csv'.
## See problems(...) for more details.

## Warning: 597311 problems parsing 'C:/Users/dell/Desktop/BigDiamonds.csv'.
## See problems(...) for more details.

## 
## Read 21.7% of 598024 rows
## Read 50.2% of 598024 rows
## Read 83.6% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:05
## 
## Read 48.5% of 598024 rows
## Read 76.9% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:04

## Warning: 597311 problems parsing 'C:/Users/dell/Desktop/BigDiamonds.csv'.
## See problems(...) for more details.

## Unit: seconds
##                                               expr      min       lq
##     fread("C:/Users/dell/Desktop/BigDiamonds.csv") 3.980537 4.359082
##  read_csv("C:/Users/dell/Desktop/BigDiamonds.csv") 3.257605 3.285306
##      mean   median       uq      max neval cld
##  5.055004 4.737628 5.592237 6.446847     3   a
##  3.770749 3.313006 4.027321 4.741635     3   a

# Byte-compile fread() with compiler::cmpfun() to check whether that makes it
# faster. The original note claimed a ~5% speedup, but in the recorded run
# below (3 evaluations each) the timings overlap: fread_c has the slower
# minimum and only slightly lower mean/median, so the difference is likely
# noise rather than a measured gain.
fread_c <- compiler::cmpfun(fread)

# Time the plain fread() against the byte-compiled fread_c() on the same
# file, evaluating each expression three times.
microbenchmark(
  fread("C:/Users/dell/Desktop/BigDiamonds.csv"),
  fread_c("C:/Users/dell/Desktop/BigDiamonds.csv"),
  times = 3
)

## 
## Read 53.5% of 598024 rows
## Read 81.9% of 598024 rows
## Read 93.6% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:05
## 
## Read 36.8% of 598024 rows
## Read 68.6% of 598024 rows
## Read 98.7% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:05
## 
## Read 21.7% of 598024 rows
## Read 35.1% of 598024 rows
## Read 58.5% of 598024 rows
## Read 90.3% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:06
## 
## Read 43.5% of 598024 rows
## Read 73.6% of 598024 rows
## Read 98.7% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:05
## 
## Read 13.4% of 598024 rows
## Read 43.5% of 598024 rows
## Read 75.2% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:05
## 
## Read 50.2% of 598024 rows
## Read 73.6% of 598024 rows
## Read 87.0% of 598024 rows
## Read 598024 rows and 13 (of 13) columns from 0.049 GB file in 00:00:06

## Unit: seconds
##                                              expr      min       lq
##    fread("C:/Users/dell/Desktop/BigDiamonds.csv") 4.188466 4.577904
##  fread_c("C:/Users/dell/Desktop/BigDiamonds.csv") 4.320403 4.494925
##      mean   median       uq      max neval cld
##  4.875539 4.967343 5.219075 5.470808     3   a
##  4.724924 4.669447 4.927185 5.184923     3   a

#Parallel Processing
library(parallel)
# Number of CPU cores, used below to size the worker cluster.
# detectCores() returns NA when the count cannot be determined, so fall back
# to a single core instead of passing NA on to makePSOCKcluster().
cores <- detectCores()
if (is.na(cores)) {
  cores <- 1L
}
cores

## [1] 4

# Query the core count again and print it directly; the recorded run shows
# the same value as `cores`.
detectCores()

## [1] 4

# Record the R version, platform and attached/loaded packages for this run.
sessionInfo()

## R version 3.2.2 (2015-08-14)
## Platform: i386-w64-mingw32/i386 (32-bit)
## Running under: Windows 7 (build 7601) Service Pack 1
## 
## locale:
## [1] LC_COLLATE=English_United States.1252 
## [2] LC_CTYPE=English_United States.1252   
## [3] LC_MONETARY=English_United States.1252
## [4] LC_NUMERIC=C                          
## [5] LC_TIME=English_United States.1252    
## 
## attached base packages:
## [1] parallel  stats     graphics  grDevices utils     datasets  methods  
## [8] base     
## 
## other attached packages:
## [1] readr_0.1.1          data.table_1.9.4     microbenchmark_1.4-2
## [4] shiny_0.12.2         lineprof_0.1.9001   
## 
## loaded via a namespace (and not attached):
##  [1] codetools_0.2-14 digest_0.6.8     htmltools_0.3    R6_2.1.1        
##  [5] splines_3.2.2    scales_0.3.0     grid_3.2.2       stringr_1.0.0   
##  [9] knitr_1.10.5     survival_2.38-3  munsell_0.4.2    compiler_3.2.2  
## [13] proto_0.3-10     httpuv_1.3.3     lattice_0.20-33  mime_0.4        
## [17] chron_2.3-47     TH.data_1.0-6    ggplot2_1.0.1    MASS_7.3-43     
## [21] plyr_1.8.3       multcomp_1.4-0   zoo_1.7-12       stringi_1.0-1   
## [25] magrittr_1.5     reshape2_1.4.1   rmarkdown_0.7    evaluate_0.7    
## [29] gtable_0.1.2     sandwich_2.3-3   colorspace_1.2-6 tools_3.2.2     
## [33] mvtnorm_1.0-2    xtable_1.8-0     Rcpp_0.12.2

# pause: build a function that sleeps for a fixed duration.
#
# i: duration handed to Sys.sleep().
# Returns a one-argument function whose argument `x` is ignored, so it can be
# passed straight to lapply()/parLapply() as the per-element worker.
#
# NOTE: this rebinds `pause`, masking the pause() that f(), g() and h() call
# earlier in the script; those functions must not be re-run after this point.
pause <- function(i) {
  # Evaluate `i` now. Without this the closure holds a lazy promise, and a
  # later change to the caller's variable would change the sleep duration.
  force(i)
  function(x) Sys.sleep(i)
}

# Serial baseline: ten 0.25 s sleeps executed one after another.
system.time(lapply(seq_len(10), pause(0.25)))

##    user  system elapsed 
##    0.00    0.00    2.51

# Start `cores` PSOCK worker processes and time ten 1-second sleeps spread
# across them.
cluster <- makePSOCKcluster(cores)
system.time(parLapply(cluster, 1:10, function(i) Sys.sleep(1)))

##    user  system elapsed 
##    0.00    0.00    3.01

# Shut the workers down explicitly; the original script never stopped the
# cluster, leaving the worker R processes running after the timing.
stopCluster(cluster)

# Parallel computing carries communication overhead. If the subproblems are
# very small, parallelisation might hurt rather than help.
# 
# An embarrassingly parallel problem is one that’s made 
# up of many simple problems that can be solved independently.

# prof3.R
#
# dell
#
# Sun Jan 03 20:37:39 2016