Levi Waldron
June 20, 2017
Introduction to dplyr
A built HTML version of this lecture is available.
The source R Markdown is also available from GitHub.
sample()
set.seed()
Normally distributed random variable with mean \(\mu = 0\) / standard deviation \(\sigma = 1\), and a sample of \(n=100\)
Poisson distributed random variable (\(\lambda = 2\)), and a sample of \(n=100\).
Negative Binomially distributed random variable (\(size=30, \mu=2\)), and a sample of \(n=100\).
Pseudo code | Example code |
---|---|
library(packagename) | library(dplyr) |
?functionname | ?select |
?package::functionname | ?dplyr::select |
? 'Reserved keyword or symbol' | ? '%>%' |
??searchforpossiblyexistingfunctionandortopic | ??simulate |
help(package = "loadedpackage") | help(package = "dplyr") |
browseVignettes("packagename") | browseVignettes("dplyr") |
Slide credit: Marcel Ramos
Pseudo code:
source("https://bioconductor.org/biocLite.R")
packages <- c("packagename", "githubuser/repository", "biopackage")
BiocInstaller::biocLite(packages)
devtools
5 + 2 #addition
5 - 2 #subtraction
5 * 2 #multiplication
5 / 2 #division
5 ^ 2 #exponentiation
5 ** 2 #exponentiation
5 %% 2 #modulus (a.k.a. remainder)
5 < x #less than
5 <= x #less than or equal to
5 > x #greater than
5 >= x #greater than or equal to
5 == x #equal to
5 != x #not equal to
!x #logical NOT
TRUE || FALSE #stepwise logical OR
TRUE && FALSE #stepwise logical AND
x <- 5
x * 2
## [1] 10
x <- x + 1
y <- 4
x * y
## [1] 24
set.seed(1)
rnorm(5)
## [1] -0.6264538 0.1836433 -0.8356286 1.5952808 0.3295078
1:5
## [1] 1 2 3 4 5
sample( 1:5 )
## [1] 2 1 3 4 5
c("yes", "no")
## [1] "yes" "no"
factor(c("yes", "no"))
## [1] yes no
## Levels: no yes
factor(c("good", "very good", "poor"),
levels=c("poor", "good", "very good"),
ordered=TRUE)
## [1] good very good poor
## Levels: poor < good < very good
1:5 %in% 4:5
## [1] FALSE FALSE FALSE TRUE TRUE
c(NA, NaN, -Inf, Inf)
## [1] NA NaN -Inf Inf
Use class() to find the class of a variable.
c( 1, "2", FALSE)
## [1] "1" "2" "FALSE"
c( 1, FALSE )
## [1] 1 0
x <- 1:4
x[ 2 ]
## [1] 2
x <- 1:10
x[ 4:7 ]
## [1] 4 5 6 7
x <- c( "a", "b", "c", "d", "e", "f" )
x[ c(5,3,1) ]
## [1] "e" "c" "a"
x[ -1 ]
## [1] "b" "c" "d" "e" "f"
x[-1:-2]
## [1] "c" "d" "e" "f"
x <- 1:10
y <- x%%2 == 0
x[y]
## [1] 2 4 6 8 10
matrix( 1:20, nrow = 5, ncol = 4 )
## [,1] [,2] [,3] [,4]
## [1,] 1 6 11 16
## [2,] 2 7 12 17
## [3,] 3 8 13 18
## [4,] 4 9 14 19
## [5,] 5 10 15 20
boring.matrix <- matrix( 1:20, nrow = 5, ncol = 4 )
dim( boring.matrix )
## [1] 5 4
boring.matrix[ ,1 ]
## [1] 1 2 3 4 5
boring.matrix[ 2, 1 ]
## [1] 2
boring.matrix[ 2, ]
## [1] 2 7 12 17
boring.matrix
## [,1] [,2] [,3] [,4]
## [1,] 1 6 11 16
## [2,] 2 7 12 17
## [3,] 3 8 13 18
## [4,] 4 9 14 19
## [5,] 5 10 15 20
boring.matrix[ boring.matrix[ ,1 ] ==3,]
## [1] 3 8 13 18
boring.matrix <- matrix(1:9, nrow = 3)
boring.matrix
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
t(boring.matrix)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 4 5 6
## [3,] 7 8 9
boring.matrix + 1
## [,1] [,2] [,3]
## [1,] 2 5 8
## [2,] 3 6 9
## [3,] 4 7 10
boring.matrix + 1:3
## [,1] [,2] [,3]
## [1,] 2 5 8
## [2,] 4 7 10
## [3,] 6 9 12
boring.matrix
## [,1] [,2] [,3]
## [1,] 1 4 7
## [2,] 2 5 8
## [3,] 3 6 9
boring.matrix + boring.matrix
## [,1] [,2] [,3]
## [1,] 2 8 14
## [2,] 4 10 16
## [3,] 6 12 18
boring.matrix * boring.matrix
## [,1] [,2] [,3]
## [1,] 1 16 49
## [2,] 4 25 64
## [3,] 9 36 81
boring.matrix %*% boring.matrix
## [,1] [,2] [,3]
## [1,] 30 66 102
## [2,] 36 81 126
## [3,] 42 96 150
colnames(boring.matrix) <- c("col.1", "col.2", "col.3")
rownames(boring.matrix) <- c("row.1", "row.2", "row.3")
boring.matrix
## col.1 col.2 col.3
## row.1 1 4 7
## row.2 2 5 8
## row.3 3 6 9
boring.matrix["row.1", ]
## col.1 col.2 col.3
## 1 4 7
measurements <- c( 1.3, 1.6, 3.2, 9.8, 10.2 )
self.reporting <- c( 13, 6, 4, 7, 6, 5, 8, 9, 7, 4 )
sex <- FALSE
parents <- c( "Parent1.name", "Parent2.name" )
my.person <- list( measurements, self.reporting,
sex, parents)
my.person
## [[1]]
## [1] 1.3 1.6 3.2 9.8 10.2
##
## [[2]]
## [1] 13 6 4 7 6 5 8 9 7 4
##
## [[3]]
## [1] FALSE
##
## [[4]]
## [1] "Parent1.name" "Parent2.name"
my.person[1:2]
## [[1]]
## [1] 1.3 1.6 3.2 9.8 10.2
##
## [[2]]
## [1] 13 6 4 7 6 5 8 9 7 4
my.person[[1]]
## [1] 1.3 1.6 3.2 9.8 10.2
my.person <- list( measure = measurements,
parents = parents )
my.person
## $measure
## [1] 1.3 1.6 3.2 9.8 10.2
##
## $parents
## [1] "Parent1.name" "Parent2.name"
my.person$parents
## [1] "Parent1.name" "Parent2.name"
data.frame object
list with vector elements of equal length
x <- 11:16
y <- seq(0,1,.2)
z <- c( "one", "two", "three", "four", "five", "six" )
a <- factor( z )
test.dataframe <- data.frame(x,y,z,a)
data.frame elements
test.dataframe[[4]]
## [1] one two three four five six
## Levels: five four one six three two
test.dataframe$parents
## NULL
data.frame May Contain Different Data Modes
class( test.dataframe[[1]] )
## [1] "integer"
class( test.dataframe[[2]] )
## [1] "numeric"
class( test.dataframe[[3]] )
## [1] "factor"
mini.frame.one <- data.frame( "one" = 1:5 )
mini.frame.two <- data.frame( "two" = 6:10 )
cbind( mini.frame.one, mini.frame.two )
## one two
## 1 1 6
## 2 2 7
## 3 3 8
## 4 4 9
## 5 5 10
Alternatively: c( mini.frame.one, mini.frame.two ) (note: this returns a list of the columns rather than a data.frame)
test.dataframe[[1]]
## [1] 11 12 13 14 15 16
test.dataframe[[1]] = 21:26
test.dataframe
## x y z a
## 1 21 0.0 one one
## 2 22 0.2 two two
## 3 23 0.4 three three
## 4 24 0.6 four four
## 5 25 0.8 five five
## 6 26 1.0 six six
DataFrame class
DataFrame: like a data.frame but more flexible: columns can be any atomic vector type such as:
GenomicRanges objects
Rle (run-length encoding)
read.table
read.csv
read.delim
biocLite("data.table")
# fread() function for very large tables
biocLite("hadley/readr")
# fast / convenient / advanced
biocLite("hadley/haven")
# data from other statistical packages
dplyr
dplyr convention aims to ease cognitive burden
dplyr example
library(nycflights13)
library(dplyr)
delays <- flights %>%
filter(!is.na(dep_delay)) %>%
group_by(year, month, day, hour) %>%
summarise(delay = mean(dep_delay), n = n()) %>%
filter(n > 10)
dplyr example (cont’d)
hist(delays$delay, main="Mean hourly delay", xlab="Delay (minutes)")