The names of functions used in R packages on CRAN were obtained from the source packages and made into a word cloud. Most processing was done in the shell. There were 5457 packages listed as of 2014-04-20. In total, there were 8,453,111 non-comment, non-empty lines of code. Great work folks!
tm Vignettes: http://cran.r-project.org/web/packages/tm/vignettes/tm.pdf
Word Cloud in R: http://onertipaday.blogspot.com/2011/07/word-cloud-in-r.html
## dplyr for summarize
library(dplyr)
## wordcloud
library(wordcloud)
## Load the extracted function names (no header row, so the single column is V1)
cranAllFunctions <- read.csv("../r.group.logo.data/cran_all_functions.txt", header = FALSE)
## number of rows (one row per extracted function-name occurrence)
nrow(cranAllFunctions)
## [1] 6759694
## summarize: count the occurrences of each name, most frequent first.
## `%>%` is used because the older `%.%` chaining operator was deprecated
## and later removed from dplyr.
cranAllFunctionsSummarized <- cranAllFunctions %>%
  group_by(V1) %>%
  summarize(n = n()) %>%
  arrange(desc(n))
## Count number of unique functions
nrow(cranAllFunctionsSummarized)
## [1] 123597
## Check the 20 most frequent names
head(cranAllFunctionsSummarized, 20)
## Source: local data frame [20 x 2]
##
## V1 n
## 1 if 439874
## 2 c 395426
## 3 function 340789
## 4 length 322338
## 5 paste 171423
## 6 return 155903
## 7 is.null 150350
## 8 stop 149603
## 9 list 145974
## 10 cat 140296
## 11 rep 106768
## 12 names 101199
## 13 sum 92626
## 14 matrix 80117
## 15 log 78072
## 16 for 64863
## 17 nrow 63491
## 18 as.integer 59875
## 19 is.na 58657
## 20 max 53662
## Set colors (brewer.pal() comes from RColorBrewer, which wordcloud
## loads as a dependency)
colors <- brewer.pal(8, "Dark2")
## Create cloud with functions appearing at least 3 times (min.freq),
## limited to 1000 words (max.words). random.order = FALSE plots words in
## decreasing order of frequency. The `colors` argument is spelled out in
## full instead of relying on partial matching of `color`.
wordcloud(
  words = cranAllFunctionsSummarized$V1,
  freq = cranAllFunctionsSummarized$n,
  scale = c(4, 0.5),
  min.freq = 3,
  max.words = 1000,
  random.order = FALSE,
  random.color = FALSE,
  colors = colors
)
Data preparation for the function names was carried out in the following shell script. An alphanumeric string (letters, digits, `.` and `_`) immediately preceding a left parenthesis was considered a function name.
#!/bin/sh
### change directory
## Abort if the data directory is missing, so that the downloads and the
## "rm -rf" further down never run in the wrong place.
cd "../r.group.logo.data" || exit 1
### Obtain the index.html
## -O always writes the listing to index.html; without it wget saves
## index.html.1 when an older copy exists and the stale file would be parsed.
## The former "--level 1" only applies to recursive (-r) retrieval, so it
## has been dropped.
wget -O index.html http://cran.r-project.org/src/contrib/
## Count lines containing package names
## (dots are escaped so that only a literal ".tar.gz" matches)
grep '\.tar\.gz' index.html | wc -l
## Check pattern
grep '\.tar\.gz' index.html | head -n 1
## <tr><td valign="top"><img src="/icons/compressed.gif" alt="[ ]"></td><td><a href="A3_0.9.2.tar.gz">A3_0.9.2.tar.gz</a></td><td align="right">26-Mar-2013 19:58 </td><td align="right"> 45K</td><td> </td></tr>
###
### Create regexp
## href="A3_0.9.2.tar.gz"
## ^.*href="\([^"]*\.tar\.gz\)".*$
## [^"]* keeps the capture inside the quoted href value; \. matches a literal dot
## Check
grep '\.tar\.gz' index.html | head -n 10 | sed -e 's/^.*href="\([^"]*\.tar\.gz\)".*$/\1/'
## Create a file containing one package name per line
grep '\.tar\.gz' index.html | sed -e 's/^.*href="\([^"]*\.tar\.gz\)".*$/\1/' > names.tar.gz.txt
## Count lines for number of packages
wc -l names.tar.gz.txt
## sed expression that prefixes each tarball name with "wget <contrib URL>"
to_wget_cmd="s/^/wget http:\/\/cran.r-project.org\/src\/contrib\//"
## Preview the first 10 generated download commands
head -n 10 names.tar.gz.txt | sed -e "$to_wget_cmd"
###
### create a script for actual downloading
## Start the script with its shebang line (overwrites any previous download.sh)
printf '%s\n' '#!/bin/sh' > download.sh
## Check
cat download.sh
## Append one wget command per package
sed -e "$to_wget_cmd" names.tar.gz.txt >> download.sh
## Check script
head download.sh
## Run
sh download.sh
###
### Extract a specific folder only and combine files
## http://www.cyberciti.biz/faq/extracting-single-file-directory-from-tarball-2/
## tar -zxvf mytar.ball.tar.gz directory-name
# A3_0.9.2.tar.gz ABCExtremes_1.0.tar.gz ABCoptim_0.13.11.tar.gz
# for tarfile in A3_0.9.2.tar.gz ABCExtremes_1.0.tar.gz ABCoptim_0.13.11.tar.gz
# do
# ## Extract the R folder only
# tar -zxvf ${tarfile} ${tarfile%_*.tar.gz}/R
#
# ## Move it out
# mv ${tarfile%_*.tar.gz}/R R_dir_${tarfile%_*.tar.gz}
#
# ## remove the enclosing folder
# rm -rf ${tarfile%_*.tar.gz}
# done
## For all packages, extract R directory and cat scripts into one file each.
## Only package tarballs (name_version.tar.gz) are visited. With a bare "*"
## the loop also picked up index.html, names.tar.gz.txt, download.sh and
## earlier r_code_*.R files; the suffix pattern does not match those names,
## so the "rm -rf" below removed the files themselves.
for tarfile in *_*.tar.gz
do
    ## Skip the unexpanded pattern when no tarball is present
    [ -e "${tarfile}" ] || continue
    ## Package directory = tarball name without the "_version.tar.gz" suffix
    pkgdir="${tarfile%_*.tar.gz}"
    [ -n "${pkgdir}" ] || continue
    ## Extract the R folder only; concatenate only when extraction succeeded
    if tar -zxvf "${tarfile}" "${pkgdir}/R"
    then
        ## cat into a single file r_code_packageNameVer.R
        cat "${pkgdir}"/R/* > "r_code_${tarfile%.tar.gz}.R"
    fi
    ## remove the folder
    rm -rf "${pkgdir}"
done
###
### Combine into one file after removing comment lines, trailing comments, and empty lines
## cat
cat r_code_*.R > cran_all_in_one.R
## check
wc -l cran_all_in_one.R
## Use the C locale for the text tools below so that bytes that are invalid
## in the current locale do not abort sed ("illegal byte sequence").
## These exports used to come after the first sed call, too late to protect it.
## http://stackoverflow.com/questions/19242275/sed-re-error-illegal-byte-sequence-on-mac-os-x
export LC_CTYPE=C
export LANG=C
## remove comment-only and blank lines, then strip trailing comments
## (everything from the first "#" on a line, including a "#" inside a string)
grep -v "^ *#\|^ *$" cran_all_in_one.R | sed "s/#.*$//g" > cran_all_in_one_nocomment.R
## check
wc -l cran_all_in_one_nocomment.R
## remove trailing comments
## (repeats the same substitution; kept so that cran_all_in_one_nocomment2.R
## is still produced for the next step)
sed "s/#.*$//g" cran_all_in_one_nocomment.R > cran_all_in_one_nocomment2.R
## check
wc -l cran_all_in_one_nocomment2.R
###
### Further trim down to function names (words immediately left to ( only)
## tr/sed version
## Replace every character that is not alphanumeric, newline, space, ".", "_"
## or "(" with a space, then mark each "(" with a trailing "+".
tr -c '[:alnum:]\n ._(' ' ' < cran_all_in_one_nocomment2.R | sed -e "s/(/(+/g" > cran_all_in_one_nocomment3.txt
## tr
## Break lines at the "+" markers and at spaces, keep the tokens containing "(",
## drop the "(", and keep only tokens that start with a letter, "." or "_".
## The bracket expression used to read [a-zA-z\._]: "A-z" was a typo for "A-Z"
## and the backslash is a literal character inside brackets.
## Any word followed by "(" is kept, so keywords such as if/for/function are counted too.
tr '+' '\n' < cran_all_in_one_nocomment3.txt | tr ' ' '\n' | grep -v "^ *$" | grep "(" | sed -e "s/(//g" | grep -v "^ *$" | grep "^[a-zA-Z._]" > cran_all_functions_dirty.txt
## check
wc -l cran_all_functions_dirty.txt
## Clean: delete every character other than alphanumerics, ".", "_" and newline
## http://stackoverflow.com/questions/20007288/removing-non-alphanumeric-characters-with-sed
tr -cd '[:alnum:]._\n' < cran_all_functions_dirty.txt > cran_all_functions.txt
## check
wc -l cran_all_functions.txt