This is a Markdown document that explains the dplyr package. The dplyr package provides many useful functions for manipulating data.
dplyr is built with C++, so it is faster than the plyr package.
library(dplyr)
##
## Attaching package: 'dplyr'
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RCurl)
## Loading required package: bitops
# Point RCurl at the CA certificate bundle located inside the installed RCurl
# package, so the HTTPS request below can use it.
options(
  RCurlOptions = list(
    cainfo = system.file("CurlSSL", "cacert.pem", package = "RCurl")
  )
)
# NOTE: despite its name, `url` holds the downloaded CSV *text*, not the address.
url <- getURL("https://dl.dropboxusercontent.com/u/54341374/data/2014-07-08.csv")
# Parse the downloaded text through `text =`: read.csv() then creates and
# closes its own connection, whereas a hand-made textConnection() passed as
# `file` is left open after the read.
mydf <- read.csv(text = url)
# Number of rows and columns that were read.
dim(mydf)
## [1] 225468 11
# Preview the first six rows of the raw data frame.
head(mydf, n = 6L)
## X date time size r_version r_arch r_os package
## 1 1 2014-07-08 00:54:41 80589 3.1.0 x86_64 mingw32 htmltools
## 2 2 2014-07-08 00:59:53 321767 3.1.0 x86_64 mingw32 tseries
## 3 3 2014-07-08 00:47:13 748063 3.1.0 x86_64 linux-gnu party
## 4 4 2014-07-08 00:48:05 606104 3.1.0 x86_64 linux-gnu Hmisc
## 5 5 2014-07-08 00:46:50 79825 3.0.2 x86_64 linux-gnu digest
## 6 6 2014-07-08 00:48:04 77681 3.1.0 x86_64 linux-gnu randomForest
## version country ip_id
## 1 0.2.4 US 1
## 2 0.10-32 US 2
## 3 1.0-15 US 3
## 4 3.14-4 US 3
## 5 0.6.4 CA 4
## 6 4.6-7 US 3
# Wrap the raw data frame with dplyr's tbl_df() and display the result.
cran <- mydf %>% tbl_df()
cran
## Source: local data frame [225,468 x 11]
##
## X date time size r_version r_arch r_os package
## 1 1 2014-07-08 00:54:41 80589 3.1.0 x86_64 mingw32 htmltools
## 2 2 2014-07-08 00:59:53 321767 3.1.0 x86_64 mingw32 tseries
## 3 3 2014-07-08 00:47:13 748063 3.1.0 x86_64 linux-gnu party
## 4 4 2014-07-08 00:48:05 606104 3.1.0 x86_64 linux-gnu Hmisc
## 5 5 2014-07-08 00:46:50 79825 3.0.2 x86_64 linux-gnu digest
## 6 6 2014-07-08 00:48:04 77681 3.1.0 x86_64 linux-gnu randomForest
## 7 7 2014-07-08 00:48:35 393754 3.1.0 x86_64 linux-gnu plyr
## 8 8 2014-07-08 00:47:30 28216 3.0.2 x86_64 linux-gnu whisker
## 9 9 2014-07-08 00:54:58 5928 NA NA NA Rcpp
## 10 10 2014-07-08 00:15:35 2206029 3.0.2 x86_64 linux-gnu hflights
## .. .. ... ... ... ... ... ... ...
## Variables not shown: version (fctr), country (fctr), ip_id (int)
# Drop the original data frame from the workspace; `cran` is used from here on.
rm(mydf)
# Keep only the three named columns, in the order they are listed.
cran %>% select(ip_id, package, country)
## Source: local data frame [225,468 x 3]
##
## ip_id package country
## 1 1 htmltools US
## 2 2 tseries US
## 3 3 party US
## 4 3 Hmisc US
## 5 4 digest CA
## 6 3 randomForest US
## 7 3 plyr US
## 8 5 whisker US
## 9 6 Rcpp CN
## 10 7 hflights US
## .. ... ... ...
# Keep every column from `r_arch` through `country` using a name range.
cran %>% select(r_arch:country)
## Source: local data frame [225,468 x 5]
##
## r_arch r_os package version country
## 1 x86_64 mingw32 htmltools 0.2.4 US
## 2 x86_64 mingw32 tseries 0.10-32 US
## 3 x86_64 linux-gnu party 1.0-15 US
## 4 x86_64 linux-gnu Hmisc 3.14-4 US
## 5 x86_64 linux-gnu digest 0.6.4 CA
## 6 x86_64 linux-gnu randomForest 4.6-7 US
## 7 x86_64 linux-gnu plyr 1.8.1 US
## 8 x86_64 linux-gnu whisker 0.3-2 US
## 9 NA NA Rcpp 0.10.4 CN
## 10 x86_64 linux-gnu hflights 0.1 US
## .. ... ... ... ... ...
# Keep the first ten columns, selected by position.
cran %>% select(1:10)
## Source: local data frame [225,468 x 10]
##
## X date time size r_version r_arch r_os package
## 1 1 2014-07-08 00:54:41 80589 3.1.0 x86_64 mingw32 htmltools
## 2 2 2014-07-08 00:59:53 321767 3.1.0 x86_64 mingw32 tseries
## 3 3 2014-07-08 00:47:13 748063 3.1.0 x86_64 linux-gnu party
## 4 4 2014-07-08 00:48:05 606104 3.1.0 x86_64 linux-gnu Hmisc
## 5 5 2014-07-08 00:46:50 79825 3.0.2 x86_64 linux-gnu digest
## 6 6 2014-07-08 00:48:04 77681 3.1.0 x86_64 linux-gnu randomForest
## 7 7 2014-07-08 00:48:35 393754 3.1.0 x86_64 linux-gnu plyr
## 8 8 2014-07-08 00:47:30 28216 3.0.2 x86_64 linux-gnu whisker
## 9 9 2014-07-08 00:54:58 5928 NA NA NA Rcpp
## 10 10 2014-07-08 00:15:35 2206029 3.0.2 x86_64 linux-gnu hflights
## .. .. ... ... ... ... ... ... ...
## Variables not shown: version (fctr), country (fctr)
# Keep every column except `time`.
cran %>% select(-time)
## Source: local data frame [225,468 x 10]
##
## X date size r_version r_arch r_os package version
## 1 1 2014-07-08 80589 3.1.0 x86_64 mingw32 htmltools 0.2.4
## 2 2 2014-07-08 321767 3.1.0 x86_64 mingw32 tseries 0.10-32
## 3 3 2014-07-08 748063 3.1.0 x86_64 linux-gnu party 1.0-15
## 4 4 2014-07-08 606104 3.1.0 x86_64 linux-gnu Hmisc 3.14-4
## 5 5 2014-07-08 79825 3.0.2 x86_64 linux-gnu digest 0.6.4
## 6 6 2014-07-08 77681 3.1.0 x86_64 linux-gnu randomForest 4.6-7
## 7 7 2014-07-08 393754 3.1.0 x86_64 linux-gnu plyr 1.8.1
## 8 8 2014-07-08 28216 3.0.2 x86_64 linux-gnu whisker 0.3-2
## 9 9 2014-07-08 5928 NA NA NA Rcpp 0.10.4
## 10 10 2014-07-08 2206029 3.0.2 x86_64 linux-gnu hflights 0.1
## .. .. ... ... ... ... ... ... ...
## Variables not shown: country (fctr), ip_id (int)
# Keep only the rows whose package is "swirl".
cran %>% filter(package == "swirl")
## Source: local data frame [820 x 11]
##
## X date time size r_version r_arch r_os package
## 1 27 2014-07-08 00:17:16 105350 3.0.2 x86_64 mingw32 swirl
## 2 156 2014-07-08 00:22:53 41261 3.1.0 x86_64 linux-gnu swirl
## 3 358 2014-07-08 00:13:42 105335 2.15.2 x86_64 mingw32 swirl
## 4 593 2014-07-08 00:59:45 105465 3.1.0 x86_64 darwin13.1.0 swirl
## 5 831 2014-07-08 00:55:27 105335 3.0.3 x86_64 mingw32 swirl
## 6 997 2014-07-08 00:33:06 41261 3.1.0 x86_64 mingw32 swirl
## 7 1023 2014-07-08 00:35:36 106393 3.1.0 x86_64 mingw32 swirl
## 8 1144 2014-07-08 00:00:39 106534 3.0.2 x86_64 linux-gnu swirl
## 9 1402 2014-07-08 00:41:41 41261 3.1.0 i386 mingw32 swirl
## 10 1424 2014-07-08 00:44:49 106393 3.1.0 x86_64 linux-gnu swirl
## .. ... ... ... ... ... ... ... ...
## Variables not shown: version (fctr), country (fctr), ip_id (int)
# Keep rows that satisfy both conditions: R version 3.1.1 and country "US".
cran %>% filter(r_version == "3.1.1", country == "US")
## Source: local data frame [1,588 x 11]
##
## X date time size r_version r_arch r_os
## 1 2216 2014-07-08 00:48:58 385112 3.1.1 x86_64 darwin13.1.0
## 2 17332 2014-07-08 03:39:57 197459 3.1.1 x86_64 darwin13.1.0
## 3 17465 2014-07-08 03:25:38 23259 3.1.1 x86_64 darwin13.1.0
## 4 18844 2014-07-08 03:59:17 190594 3.1.1 x86_64 darwin13.1.0
## 5 30182 2014-07-08 04:13:15 77683 3.1.1 i386 mingw32
## 6 30193 2014-07-08 04:06:26 2351969 3.1.1 i386 mingw32
## 7 30195 2014-07-08 04:07:09 299080 3.1.1 i386 mingw32
## 8 30217 2014-07-08 04:32:04 568036 3.1.1 i386 mingw32
## 9 30245 2014-07-08 04:10:41 526858 3.1.1 i386 mingw32
## 10 30354 2014-07-08 04:32:51 1763717 3.1.1 i386 mingw32
## .. ... ... ... ... ... ... ...
## Variables not shown: package (fctr), version (fctr), country (fctr), ip_id
## (int)
# `r_version` is a factor here (the printouts list it as "fctr"). Applying `<=`
# directly to a factor only yields NA together with the warning
# "'<=' not meaningful for factors", so the original call returned 0 rows.
# Convert to character and compare as version numbers instead, so that e.g.
# 2.15.2 sorts before 3.0.2. Rows without a usable R version compare as NA and
# are dropped by filter().
filter(
  cran,
  numeric_version(as.character(r_version), strict = FALSE) <= "3.0.2",
  country == "IN"
)
## (output not recorded for the corrected call; re-run to see the matching rows)
# Keep only the rows where `r_version` is not missing.
cran %>% filter(!is.na(r_version))
## Source: local data frame [207,205 x 11]
##
## X date time size r_version r_arch r_os package
## 1 1 2014-07-08 00:54:41 80589 3.1.0 x86_64 mingw32 htmltools
## 2 2 2014-07-08 00:59:53 321767 3.1.0 x86_64 mingw32 tseries
## 3 3 2014-07-08 00:47:13 748063 3.1.0 x86_64 linux-gnu party
## 4 4 2014-07-08 00:48:05 606104 3.1.0 x86_64 linux-gnu Hmisc
## 5 5 2014-07-08 00:46:50 79825 3.0.2 x86_64 linux-gnu digest
## 6 6 2014-07-08 00:48:04 77681 3.1.0 x86_64 linux-gnu randomForest
## 7 7 2014-07-08 00:48:35 393754 3.1.0 x86_64 linux-gnu plyr
## 8 8 2014-07-08 00:47:30 28216 3.0.2 x86_64 linux-gnu whisker
## 9 10 2014-07-08 00:15:35 2206029 3.0.2 x86_64 linux-gnu hflights
## 10 11 2014-07-08 00:15:25 526858 3.0.2 x86_64 linux-gnu LPCM
## .. .. ... ... ... ... ... ... ...
## Variables not shown: version (fctr), country (fctr), ip_id (int)
# Keep the columns from `size` through `ip_id`, then show the rows sorted by
# ascending `ip_id`.
cran2 <- cran %>% select(size:ip_id)
cran2 %>% arrange(ip_id)
## Source: local data frame [225,468 x 8]
##
## size r_version r_arch r_os package version country ip_id
## 1 80589 3.1.0 x86_64 mingw32 htmltools 0.2.4 US 1
## 2 180562 3.0.2 x86_64 mingw32 yaml 2.1.13 US 1
## 3 190120 3.1.0 i386 mingw32 babel 0.2-6 US 1
## 4 321767 3.1.0 x86_64 mingw32 tseries 0.10-32 US 2
## 5 52281 3.0.3 x86_64 darwin10.8.0 quadprog 1.5-5 US 2
## 6 876702 3.1.0 x86_64 linux-gnu zoo 1.7-11 US 2
## 7 321764 3.0.2 x86_64 linux-gnu tseries 0.10-32 US 2
## 8 876702 3.1.0 x86_64 linux-gnu zoo 1.7-11 US 2
## 9 321768 3.1.0 x86_64 mingw32 tseries 0.10-32 US 2
## 10 784093 3.1.0 x86_64 linux-gnu strucchange 1.5-0 US 2
## .. ... ... ... ... ... ... ... ...
# Sort by `ip_id` from largest to smallest.
cran2 %>% arrange(desc(ip_id))
## Source: local data frame [225,468 x 8]
##
## size r_version r_arch r_os package version country
## 1 5933 NA NA NA CPE 1.4.2 CN
## 2 569241 3.1.0 x86_64 mingw32 multcompView 0.1-5 US
## 3 228444 3.1.0 x86_64 mingw32 tourr 0.5.3 NZ
## 4 308962 3.1.0 x86_64 darwin13.1.0 ctv 0.7-9 CN
## 5 950964 3.0.3 i386 mingw32 knitr 1.6 CA
## 6 80185 3.0.3 i386 mingw32 htmltools 0.2.4 CA
## 7 1431750 3.0.3 i386 mingw32 shiny 0.10.0 CA
## 8 2189695 3.1.0 x86_64 mingw32 RMySQL 0.9-3 US
## 9 4818024 3.1.0 i386 mingw32 igraph 0.7.1 US
## 10 197495 3.1.0 x86_64 mingw32 coda 0.16-1 US
## .. ... ... ... ... ... ... ...
## Variables not shown: ip_id (int)
# Sort by `package` first, breaking ties with `ip_id`.
cran2 %>% arrange(package, ip_id)
## Source: local data frame [225,468 x 8]
##
## size r_version r_arch r_os package version country ip_id
## 1 71677 3.0.3 x86_64 darwin10.8.0 A3 0.9.2 CN 1003
## 2 71672 3.1.0 x86_64 linux-gnu A3 0.9.2 US 1015
## 3 71677 3.1.0 x86_64 mingw32 A3 0.9.2 IN 1054
## 4 70438 3.0.1 x86_64 darwin10.8.0 A3 0.9.2 CN 1513
## 5 71677 NA NA NA A3 0.9.2 BR 1526
## 6 71892 3.0.2 x86_64 linux-gnu A3 0.9.2 IN 1542
## 7 71677 3.1.0 x86_64 linux-gnu A3 0.9.2 ZA 2925
## 8 71672 3.1.0 x86_64 mingw32 A3 0.9.2 IL 3889
## 9 71677 3.0.3 x86_64 mingw32 A3 0.9.2 DE 3917
## 10 71672 3.1.0 x86_64 mingw32 A3 0.9.2 US 4219
## .. ... ... ... ... ... ... ... ...
# Sort by `country`, then by descending `r_version`, then by `ip_id`.
cran2 %>% arrange(country, desc(r_version), ip_id)
## Source: local data frame [225,468 x 8]
##
## size r_version r_arch r_os package version country
## 1 1556858 3.1.1 i386 mingw32 RcppArmadillo 0.4.320.0 A1
## 2 1823512 3.1.0 x86_64 linux-gnu mgcv 1.8-1 A1
## 3 15732 3.1.0 i686 linux-gnu grnn 0.1.0 A1
## 4 3014840 3.1.0 x86_64 mingw32 Rcpp 0.11.2 A1
## 5 660087 3.1.0 i386 mingw32 xts 0.9-7 A1
## 6 522261 3.1.0 i386 mingw32 FNN 1.1 A1
## 7 522263 3.1.0 i386 mingw32 FNN 1.1 A1
## 8 1676627 3.1.0 x86_64 linux-gnu rgeos 0.3-5 A1
## 9 2118530 3.1.0 x86_64 linux-gnu spacetime 1.1-0 A1
## 10 2217180 3.1.0 x86_64 mingw32 gstat 1.0-19 A1
## .. ... ... ... ... ... ... ...
## Variables not shown: ip_id (int)
# Keep three columns, then add `size_kb`: `size` divided by 2^10.
cran3 <- cran %>% select(ip_id, package, size)
cran3 %>% mutate(size_kb = size / 2^10)
## Source: local data frame [225,468 x 4]
##
## ip_id package size size_kb
## 1 1 htmltools 80589 78.700195
## 2 2 tseries 321767 314.225586
## 3 3 party 748063 730.530273
## 4 3 Hmisc 606104 591.898438
## 5 4 digest 79825 77.954102
## 6 3 randomForest 77681 75.860352
## 7 3 plyr 393754 384.525391
## 8 5 whisker 28216 27.554688
## 9 6 Rcpp 5928 5.789062
## 10 7 hflights 2206029 2154.325195
## .. ... ... ... ...
# Add two derived columns in one call; `size_gb` is computed from the
# `size_mb` column created just before it.
cran3 %>%
  mutate(
    size_mb = size / 2^20,
    size_gb = size_mb / 2^10
  )
## Source: local data frame [225,468 x 5]
##
## ip_id package size size_mb size_gb
## 1 1 htmltools 80589 0.076855659 7.505435e-05
## 2 2 tseries 321767 0.306860924 2.996689e-04
## 3 3 party 748063 0.713408470 6.966880e-04
## 4 3 Hmisc 606104 0.578025818 5.644783e-04
## 5 4 digest 79825 0.076127052 7.434282e-05
## 6 3 randomForest 77681 0.074082375 7.234607e-05
## 7 3 plyr 393754 0.375513077 3.667120e-04
## 8 5 whisker 28216 0.026908875 2.627820e-05
## 9 6 Rcpp 5928 0.005653381 5.520880e-06
## 10 7 hflights 2206029 2.103833199 2.054525e-03
## .. ... ... ... ... ...
# Add `correct_size`: every `size` value increased by 1000.
cran3 %>% mutate(correct_size = size + 1000)
## Source: local data frame [225,468 x 4]
##
## ip_id package size correct_size
## 1 1 htmltools 80589 81589
## 2 2 tseries 321767 322767
## 3 3 party 748063 749063
## 4 3 Hmisc 606104 607104
## 5 4 digest 79825 80825
## 6 3 randomForest 77681 78681
## 7 3 plyr 393754 394754
## 8 5 whisker 28216 29216
## 9 6 Rcpp 5928 6928
## 10 7 hflights 2206029 2207029
## .. ... ... ... ...
# Collapse the whole table to one row holding the mean of `size`.
cran %>% summarize(avg_bytes = mean(size))
## Source: local data frame [1 x 1]
##
## avg_bytes
## 1 844086.5