Applied Statistics for High-throughput Biology: Session 1

Levi Waldron

June 20, 2017

Welcome and outline

A built HTML version of this lecture is available.

The source R Markdown is also available from GitHub.

Learning objectives

A bit about me - research interests

Random Variables and Distributions

Random Variables

Probability Distributions

Random Sample

Random Variables - examples

Normally distributed random variable with mean \(\mu = 0\) / standard deviation \(\sigma = 1\), and a sample of \(n=100\)

Random Variables - examples

Poisson distributed random variable (\(\lambda = 2\)), and a sample of \(n=100\).

Random Variables - examples

Negative Binomially distributed random variable (\(size=30, \mu=2\)), and a sample of \(n=100\).

Random Variables - examples

R - basic usage

Tips for learning R

| Pseudo code | Example code |
|---|---|
| library(packagename) | library(dplyr) |
| ?functionname | ?select |
| ?package::functionname | ?dplyr::select |
| ?"Reserved keyword or symbol" | ?"%>%" |
| ??searchforpossiblyexistingfunctionandortopic | ??simulate |
| help(package = "loadedpackage") | help(package = "dplyr") |
| browseVignettes("packagename") | browseVignettes("dplyr") |

Slide credit: Marcel Ramos

Installing Packages

Pseudo code:

# Install BiocManager from CRAN if it is not already available. It replaces
# the retired biocLite.R script / BiocInstaller package, which no longer
# work on current versions of R and Bioconductor.
if (!requireNamespace("BiocManager", quietly = TRUE)) {
  install.packages("BiocManager")
}
# Placeholder names (pseudo code): substitute the packages you actually need.
packages <- c("packagename", "githubuser/repository", "biopackage")
BiocManager::install(packages)

Note about installing devtools

Introduction to the R language

Logic

Storing Data: The Rules

x <- 5
x * 2
## [1] 10
x <- x + 1
y <- 4
x * y
## [1] 24

Basic Data Types

set.seed(1)
rnorm(5)
## [1] -0.6264538  0.1836433 -0.8356286  1.5952808  0.3295078
1:5
## [1] 1 2 3 4 5
sample( 1:5 )
## [1] 2 1 3 4 5

Basic Data Types (cont’d)

c("yes", "no")
## [1] "yes" "no"
factor(c("yes", "no"))
## [1] yes no 
## Levels: no yes

Basic Data Types (cont’d)

factor(c("good", "very good", "poor"), 
       levels=c("poor", "good", "very good"), 
       ordered=TRUE)
## [1] good      very good poor     
## Levels: poor < good < very good
1:5 %in% 4:5
## [1] FALSE FALSE FALSE  TRUE  TRUE

Basic Data Types (cont’d)

c(NA, NaN, -Inf, Inf)
## [1]   NA  NaN -Inf  Inf

Use class() to find the class of a variable.

Vectors Must Be of One Data Mode

# A vector holds a single data mode, so c() coerces mixed inputs to a common
# one. With a character element present, everything becomes character.
c( 1, "2", FALSE)
## [1] "1"     "2"     "FALSE"
# With only numeric and logical elements, FALSE is coerced to the number 0.
c( 1, FALSE )
## [1] 1 0

Selecting Vector Elements

x <- 1:4
x[ 2 ]
## [1] 2
x <- 1:10
x[ 4:7 ]
## [1] 4 5 6 7

Selecting Vector Elements (cont’d)

x <- c( "a", "b", "c", "d", "e", "f" )
x[ c(5,3,1) ]
## [1] "e" "c" "a"
x[ -1 ]
## [1] "b" "c" "d" "e" "f"
x[-1:-2]
## [1] "c" "d" "e" "f"

Selecting Vector Elements (cont’d)

x <- 1:10
y <- x%%2 == 0
x[y]
## [1]  2  4  6  8 10

2-Dimensional Vectors are Matrices

matrix( 1:20, nrow = 5, ncol = 4 )
##      [,1] [,2] [,3] [,4]
## [1,]    1    6   11   16
## [2,]    2    7   12   17
## [3,]    3    8   13   18
## [4,]    4    9   14   19
## [5,]    5   10   15   20

Indexing Matrices

boring.matrix <- matrix( 1:20, nrow = 5, ncol = 4 )
dim( boring.matrix )
## [1] 5 4
boring.matrix[ ,1 ]
## [1] 1 2 3 4 5
boring.matrix[ 2, 1 ]
## [1] 2
boring.matrix[ 2, ]
## [1]  2  7 12 17

Indexing Matrices (cont’d)

boring.matrix
##      [,1] [,2] [,3] [,4]
## [1,]    1    6   11   16
## [2,]    2    7   12   17
## [3,]    3    8   13   18
## [4,]    4    9   14   19
## [5,]    5   10   15   20
boring.matrix[ boring.matrix[ ,1 ] ==3,]
## [1]  3  8 13 18

Matrix Operations

boring.matrix <- matrix(1:9, nrow = 3)
boring.matrix
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
t(boring.matrix)
##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    4    5    6
## [3,]    7    8    9

Matrix Operations (cont’d)

# Adding a scalar: 1 is added to every element of the matrix.
boring.matrix + 1
##      [,1] [,2] [,3]
## [1,]    2    5    8
## [2,]    3    6    9
## [3,]    4    7   10
# Adding a length-3 vector: it is recycled down each column, so every
# element in row i is increased by i.
boring.matrix + 1:3
##      [,1] [,2] [,3]
## [1,]    2    5    8
## [2,]    4    7   10
## [3,]    6    9   12

Matrix Operations (cont’d)

boring.matrix
##      [,1] [,2] [,3]
## [1,]    1    4    7
## [2,]    2    5    8
## [3,]    3    6    9
boring.matrix + boring.matrix
##      [,1] [,2] [,3]
## [1,]    2    8   14
## [2,]    4   10   16
## [3,]    6   12   18

Matrix Operations (cont’d)

# `*` multiplies element by element (here: each entry squared).
boring.matrix * boring.matrix
##      [,1] [,2] [,3]
## [1,]    1   16   49
## [2,]    4   25   64
## [3,]    9   36   81
# `%*%` is the matrix (row-by-column) product.
boring.matrix %*% boring.matrix
##      [,1] [,2] [,3]
## [1,]   30   66  102
## [2,]   36   81  126
## [3,]   42   96  150

Naming rows and columns

colnames(boring.matrix) <- c("col.1", "col.2", "col.3")
rownames(boring.matrix) <- c("row.1", "row.2", "row.3")
boring.matrix
##       col.1 col.2 col.3
## row.1     1     4     7
## row.2     2     5     8
## row.3     3     6     9
boring.matrix["row.1", ]
## col.1 col.2 col.3 
##     1     4     7

Lists are Like Filing Cabinets

measurements <- c( 1.3, 1.6, 3.2, 9.8, 10.2 )
self.reporting <- c( 13, 6, 4, 7, 6, 5, 8, 9, 7, 4 )
sex <- FALSE
parents <- c( "Parent1.name", "Parent2.name" )

Lists are Like Filing Cabinets (cont’d)

my.person <- list( measurements, self.reporting, 
                   sex, parents)
my.person
## [[1]]
## [1]  1.3  1.6  3.2  9.8 10.2
## 
## [[2]]
##  [1] 13  6  4  7  6  5  8  9  7  4
## 
## [[3]]
## [1] FALSE
## 
## [[4]]
## [1] "Parent1.name" "Parent2.name"

Lists are Like Filing Cabinets (cont’d)

my.person[1:2]
## [[1]]
## [1]  1.3  1.6  3.2  9.8 10.2
## 
## [[2]]
##  [1] 13  6  4  7  6  5  8  9  7  4
my.person[[1]]
## [1]  1.3  1.6  3.2  9.8 10.2

Lists are Like Filing Cabinets (cont’d)

my.person <- list( measure = measurements, 
                   parents = parents )
my.person
## $measure
## [1]  1.3  1.6  3.2  9.8 10.2
## 
## $parents
## [1] "Parent1.name" "Parent2.name"
my.person$parents
## [1] "Parent1.name" "Parent2.name"

The data.frame object

x <- 11:16
y <- seq(0,1,.2)
z <- c( "one", "two", "three", "four", "five", "six" )
a <- factor( z )
test.dataframe <- data.frame(x,y,z,a)

Accessing data.frame elements

# [[4]] extracts the fourth column (the factor `a`) as a vector.
test.dataframe[[4]]
## [1] one   two   three four  five  six  
## Levels: five four one six three two
# test.dataframe was built from x, y, z and a, so it has no column named
# `parents`; `$` returns NULL for a missing column rather than raising an error.
test.dataframe$parents
## NULL

Columns of a data.frame May Contain Different Data Modes

class( test.dataframe[[1]] )
## [1] "integer"
class( test.dataframe[[2]] )
## [1] "numeric"
class( test.dataframe[[3]] )
## [1] "factor"

Combining Data Frames

mini.frame.one <- data.frame( "one" = 1:5 )
mini.frame.two <- data.frame( "two" = 6:10 )
cbind( mini.frame.one, mini.frame.two )
##   one two
## 1   1   6
## 2   2   7
## 3   3   8
## 4   4   9
## 5   5  10

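Alternatively: data.frame( mini.frame.one, mini.frame.two ). Note that c( mini.frame.one, mini.frame.two ) returns a plain list of the columns, not a data frame.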

Updating Data Frames

# Show the first column before it is replaced.
test.dataframe[[1]]
## [1] 11 12 13 14 15 16
# Overwrite the first column in place. `<-` is used for assignment, matching
# every other example in these slides.
test.dataframe[[1]] <- 21:26
test.dataframe
##    x   y     z     a
## 1 21 0.0   one   one
## 2 22 0.2   two   two
## 3 23 0.4 three three
## 4 24 0.6  four  four
## 5 25 0.8  five  five
## 6 26 1.0   six   six

The DataFrame class

R - reading data

Reading in Data

dplyr

Data Manipulation using dplyr

  1. select (Y)
  2. mutate/transmute (add Ys / new Y)
  3. filter (get Xs based on condition)
  4. slice (get Xs specified)
  5. summarise (reduce to single observation)
  6. arrange (re-order observations)

dplyr example

# Build `delays`: mean departure delay and flight count for each
# year/month/day/hour combination.
# `flights` is presumably the data set provided by nycflights13 -- confirm.
library(nycflights13)
library(dplyr)
delays <- flights %>% 
  # drop rows whose departure delay is missing so mean() is not NA
  filter(!is.na(dep_delay)) %>%
  group_by(year, month, day, hour) %>%
  # one row per group: average delay and number of rows (flights) in it
  summarise(delay = mean(dep_delay), n = n()) %>%
  # keep only groups with more than 10 flights
  filter(n > 10)

dplyr example (cont’d)

# Histogram of the per-hour mean departure delays computed above.
# dep_delay in nycflights13::flights is recorded in minutes, so the mean is
# in minutes too; the axis label is corrected accordingly.
hist(delays$delay, main="Mean hourly delay", xlab="Delay (minutes)")

Lab

Lab exercises

  1. Getting Started
  2. dplyr exercises
  3. random variables exercises