R

Performance

Two ways of assigning value to each element in a vector, one is faster, one is slower.

set.seed(1)
letters = c("a", "b", "c", "d", "e")
values = sample(letters, 1e4, replace = TRUE)
vector = character(0) #Create empty character vector
system.time(for(i in seq(length(values))) vector[i] = values[i])

##    user  system elapsed 
##   0.365   0.011   0.382

vector = character(length(values))
system.time(for(i in seq(length(values))) vector[i] = values[i])

##    user  system elapsed 
##    0.01    0.00    0.01

Suppose we have a function accepts a vector of elements and a number, and outputs a matrix of powers of each element in that vector up to the number given as input:

powers1 = function(x, dg) {
  pw = matrix(x, nrow = length(x))
  prod = x
  for (i in 2:dg) {
    prod = prod * x
    pw = cbind(pw, prod)
  }
  return(pw)
}
powers1(c(2, 3, 4), 3)

##        prod prod
## [1,] 2    4    8
## [2,] 3    9   27
## [3,] 4   16   64

One can get a report of approximately how much time each function call is taking by using the Rprof() function. To use the Rprof() properly, follow 4 steps:

Rprof()
Run code
Rprof(NULL)
summaryRprof()

x = runif(1000000)
Rprof()
invisible(powers1(x, 8))
Rprof(NULL)
summaryRprof()

## $by.self
##          self.time self.pct total.time total.pct
## "cbind"       1.38    85.19       1.38     85.19
## "*"           0.12     7.41       0.12      7.41
## "matrix"      0.12     7.41       0.12      7.41
## 
## $by.total
##                       total.time total.pct self.time self.pct
## "<Anonymous>"               1.62    100.00      0.00     0.00
## "block_exec"                1.62    100.00      0.00     0.00
## "call_block"                1.62    100.00      0.00     0.00
## "eval"                      1.62    100.00      0.00     0.00
## "evaluate_call"             1.62    100.00      0.00     0.00
## "handle"                    1.62    100.00      0.00     0.00
## "in_dir"                    1.62    100.00      0.00     0.00
## "powers1"                   1.62    100.00      0.00     0.00
## "process_file"              1.62    100.00      0.00     0.00
## "process_group.block"       1.62    100.00      0.00     0.00
## "process_group"             1.62    100.00      0.00     0.00
## "withCallingHandlers"       1.62    100.00      0.00     0.00
## "withVisible"               1.62    100.00      0.00     0.00
## "cbind"                     1.38     85.19      1.38    85.19
## "*"                         0.12      7.41      0.12     7.41
## "matrix"                    0.12      7.41      0.12     7.41
## 
## $sample.interval
## [1] 0.02
## 
## $sampling.time
## [1] 1.62

Parallel R

To calculate number of cores available on local computer.

library(parallel)
detectCores()

## [1] 4

The foreach packaage The times() function in foreach allows one to repeat a function n number of times. foreach allows one to repeatedly execute a function while systematically varying the input parameters.

library(foreach)
max.eig <- function(N, sigma) {
  d <- matrix(rnorm(N**2, sd = sigma), nrow = N)
  E <- eigen(d)$values
  abs(E)[[1]]
}
times(10) %do% max.eig(5, 1)

##  [1] 2.789116 3.108739 2.447060 1.682317 2.036381 2.923629 3.358250
##  [8] 2.055538 2.515417 2.665701

foreach(n = 1:5) %do% max.eig(n, 1)

## [[1]]
## [1] 0.02652123
## 
## [[2]]
## [1] 1.090449
## 
## [[3]]
## [1] 2.09963
## 
## [[4]]
## [1] 2.019413
## 
## [[5]]
## [1] 2.384514

In foreach, one can specificy in what data structure the results are returned using the .combine parameter.

# To return results in a vector
foreach(n = 1:5, .combine = c) %do% max.eig(n, 1)

## [1] 0.6900221 1.0689973 2.1916111 1.0521352 1.8602131

# Returns results in a nested list
foreach(n = 1:3) %:% foreach(m = 1:3) %do% max.eig(n, m)

## [[1]]
## [[1]][[1]]
## [1] 0.7578154
## 
## [[1]][[2]]
## [1] 2.640217
## 
## [[1]][[3]]
## [1] 0.1195664
## 
## 
## [[2]]
## [[2]][[1]]
## [1] 0.7509613
## 
## [[2]][[2]]
## [1] 2.986557
## 
## [[2]][[3]]
## [1] 1.432597
## 
## 
## [[3]]
## [[3]][[1]]
## [1] 0.8747027
## 
## [[3]][[2]]
## [1] 2.80636
## 
## [[3]][[3]]
## [1] 4.95146

# Returns results in matrix form using rbind
foreach(n = 1:3, .combine = rbind) %:% foreach(m = 1:3) %do% max.eig(n, m)

##          [,1]      [,2]      [,3]    
## result.1 0.7922618 0.4699619 2.249439
## result.2 0.9079341 4.509158  3.774817
## result.3 2.100027  3.238192  4.400374

# Returns results in matrix form using cbind
foreach(n = 1:3, .combine = cbind) %:% foreach(m = 1:3) %do% max.eig(n, m)

##      result.1  result.2 result.3
## [1,] 0.7872542 1.568529 1.788463
## [2,] 3.208785  3.378223 1.366411
## [3,] 1.230553  2.087211 7.420172

# Using foreach() to loop over multiple variables simultaneously
foreach(n = 1:3, m = 1:3) %do% max.eig(n, m)

## [[1]]
## [1] 0.2278165
## 
## [[2]]
## [1] 2.835797
## 
## [[3]]
## [1] 5.93052

Adding a filter in foreach statement.

library(numbers)
foreach(n = 1:50, .combine = c) %:% when (isPrime(n)) %do% n

##  [1]  2  3  5  7 11 13 17 19 23 29 31 37 41 43 47

To execute foreach() statements in parallel, use %dopar% instead of %do%, and register cluster using doParallel package.

library(doParallel)

## Loading required package: iterators

no_cores = detectCores() - 1
cl = makeCluster(no_cores)
registerDoParallel(cl)
times(10) %dopar% max.eig(10, 1)

##  [1] 3.800989 3.475508 3.155365 3.454822 3.241104 3.170539 2.927744
##  [8] 3.115359 3.006906 3.334678

stopCluster(cl)

Setting .multicombine = TRUE can prevent nested lists.

cl = makeCluster(no_cores)
registerDoParallel(cl)
base = 2
foreach(exponent = 2:4, .combine = list) %dopar% base^exponent

## [[1]]
## [[1]][[1]]
## [1] 4
## 
## [[1]][[2]]
## [1] 8
## 
## 
## [[2]]
## [1] 16

foreach(exponent = 2:4, .combine = list, .multicombine = TRUE) %dopar% base^exponent

## [[1]]
## [1] 4
## 
## [[2]]
## [1] 8
## 
## [[3]]
## [1] 16

Variables within the same local environment are available in foreach package, but variables from parent environment are not available. To make a variable in the parent environment available, set the .export parameter.

base = 2
cl = makeCluster(no_cores)
registerDoParallel(cl)
base = 4
test = function(exponent) {
  foreach(exponent = 2:4, .combine = c, .export = "base") %dopar% base^exponent
}
test()

## [1]  16  64 256

stopCluster(cl)

Sorting

Sorting numbers in a vector by increasing/decreasing value.

x = c(4, 2, 1, 0, 3, 2, 1)
sort(x)

## [1] 0 1 1 2 2 3 4

sort(x, decreasing=TRUE)

## [1] 4 3 2 2 1 1 0

order() returns the indices of a sorted numeric vector, either increasing or decreasing.

x = c(4, 2, 1, 0, 3, 2, 1)
sort(x)

## [1] 0 1 1 2 2 3 4

order(x)

## [1] 4 3 7 2 6 5 1

sort(x, decreasing = TRUE)

## [1] 4 3 2 2 1 1 0

order(x, decreasing = TRUE)

## [1] 1 5 2 6 3 7 4

Strings

gregexpr() operates the same as regexpr() but finds all instances of pattern.

gregexpr("iss", "Mississippi")

## [[1]]
## [1] 2 5
## attr(,"match.length")
## [1] 3 3
## attr(,"useBytes")
## [1] TRUE

regexpr() finds the character position of the first instance of pattern within text.

regexpr("uat", "Equator")

## [1] 3
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE

strsplit() splits a string x into an R list of substrings based on what to split on.

strsplit("6-16-2011", split="-")

## [[1]]
## [1] "6"    "16"   "2011"

substr() returns the substring of a string defined by the start and stop positions of that substring.

substring("Equator", 3, 5)

## [1] "uat"

sprintf()

i = 8
s = sprintf("the square of %d is %d", i, i^2)
s

## [1] "the square of 8 is 64"

Counts number of characters in a string.

nchar("South Pole")

## [1] 10

Flow Control

The preferable for loop through a sequence of numbers of a certain length.

x = NULL

#Preferable for loop
for(i in seq(x)){
  print(i)
}
#Less preferable for loop
for(i in 1:length(x)){
  print(i)
}

## [1] 1
## [1] 0

While loops, break, and repeat.

#Method 1
i = 1
while (i <= 10){
  i = i + 4
}
#Method 2
i = 1
while(TRUE){
  i = i + 4
  if(i > 10){
    break
  }
}
#Method 3
i = 1
repeat{
  i = i + 4
  if(i > 10) break
}

Looping over functions.

g1 = function(x) return(sin(x))
g2 = function(x) return(sqrt(x^2+1))
g3 = function(x) return(x)
plot(0, 0)
for(f in c(g1, g2, g3)) plot(f, 0, 5, add=TRUE, xlim = c(0, 5), ylim = c(-1, 5), asp=1) #plots function over domain from 0 to 5. add=TRUE means functions are plotted in the same plot

Nested if…else statements

x = -5
if(x > 0){
  print("Positive number")
}else if(x < 0){
  print("Negative number")
}else if(x == 0){
  print("Zero")
}

Compact if else statements

x = 3
y = if(x == 2) x else x+1
y

## [1] 4

Next statements: in this example only sum up even numbers in a sequene of numbers from 1 to 10.

n = 1
for(i in seq(1, 10)){
  if(i %% 2 != 0) {next}
  else{
    n = n + i
    print(i)
  }
}

## [1] 2
## [1] 4
## [1] 6
## [1] 8
## [1] 10

## [1] 31

I/O

Two ways to read in .csv files. First way:

data = read.table("<pathToFile>", sep = ",", header = TRUE)

Default sep for read.table() is tab delimited. Second way:

data = read.csv("<pathToFile>")

Default sep for read.csv() is the comma “,”.

Other useful parameters for the read functions + quote: indicate any quoted values. + na.strings: set the character that represents a missing value. + nrows: how many rows to read. + skip: number of lines to skip before starting to read.

Use fread() to read in large datasets faster:

library(data.table)

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:reshape2':
## 
##     dcast, melt

## The following objects are masked from 'package:dplyr':
## 
##     between, last

myData = fread("R_data/file3.txt")

Reading xlsx files.

library(xlsx)

#Read file cameras.xlsx and extract rows 1-4 and columns 2-3.
data = read.xlsx(file="R_data/random.xlsx", sheetIndex = 1, colIndex=2:4, rowIndex=2:4)

Writing to a file.

kids = c("Jack", "Jill")
ages = c(12, 10)
d = data.frame(kids, ages, stringsAsFactors = FALSE)
d

##   kids ages
## 1 Jack   12
## 2 Jill   10

#With column/row names included
write.table(d, "R_data/kds1.txt")

#With column/row names excluded
write.table(d, "R_data/kds2.txt", row.names=FALSE, col.names=FALSE)

#Writing and appending to a file
cat("abc\n", file="R_data/u.txt")
cat("de\n", file="R_data/u.txt", append=TRUE)

#concatenating multiple items to a file
cat(file="R_data/v.txt", 1, 2, "xyz\n")

#Writing data frame to csv file.
write.csv(d, file = "file.csv", row.names=FALSE)

The file name v.txt will consist of a single line.

1 2 xyz

The writeLines() function can write each element in the vector as a line once a connection is established.

c = file("R_data/www.txt", "w")
writeLines(c("abc", "de", "f"), c)
close(c)

And the file www.txt will contain the following content:

abc

de

f

You can also establish a connection first before concatenating to a file for convenience’s sake.

con = file("analysisReport.txt", "w")
data = c(1, 2, 3)
results = c("the", "random")
conclusion = "conclusion"
cat(data, file=con)
cat(results, file=con)
cat(conclusion, file=con)
close(con)

Connection functions such as file() and url() can read in a file. Suppose we have a file name z6.txt with the following contents:

John 25

Mary 28

Jim 19

We can establish a connection via the file() function. The function readLines() has the parameter n, which indicates how many lines to read. The “r” parameter of file() indicates reading from file.

c = file("R_data/z6.txt", "r")
readLines(c, n=1)

## [1] "John 25"

readLines(c, n=1)

## [1] "Mary 28"

readLines(c, n=1)

## [1] "Jim 19"

readLines(c, n=1)

## character(0)

close(c)

#"Rewind" to beginning of the file
c = file("R_data/z6.txt", "r")
readLines(c, n=2)

## [1] "John 25" "Mary 28"

seek(con = c, where = 0) #parameter where=0 sets the position of the file pointer zero characters from the start of the file.

## [1] 16

readLines(c, n=1)

## [1] "John 25"

close(c) #closes connection

readLines() allows one to read a file where each line is treated as a string. Suppose we have a file z5.txt with the following content:

John 25

Mary 28

Jim 19

z1 = readLines("R_data/z5.txt")
z1

## [1] "John 25" ""        "Mary 28" " "       "Jim 19"

Suppose we have a file x.txt with the following content:

1 0 1

1 1 1

1 1 0

1 1 0

0 0 1

We can read it into a matrix the following way:

x = matrix(scan("R_data/x.txt", quiet=TRUE), nrow=5, byrow=TRUE)

read.csv() is more suited for reading comma delimited files.

f = read.csv(file="simple.csv", header = =TRUE, sep=",")

If you already know the modes of each column of your csv file, then you can indicate what mode they are with the colClasses parameter.

pollution = read.csv("R_data/avgpm25.csv", colClasses = c("numeric", "character", "factor", "numeric", "numeric"))
dim(pollution)

Any read function will have the parameters:

na.strings: set the character that represents a missing value
nrows: how many rows to read from the file
skip: number of lines to skip before starting to read

Reading a data frame or matrix from a file. Assume we have a file named z.txt with the following content:

name age

John 25

Mary 28

Jim 19

read.table() reads a file into a dataframe.

z = read.table("R_data/z.txt", header=TRUE)
z

##   name age
## 1 John  25
## 2 Mary  28
## 3  Jim  19

To only read in the last two lines of z.txt:

z = read.table("R_data/z.txt", header=TRUE, skip = 1, nrows = 2)
z

##   John X25
## 1 Mary  28
## 2  Jim  19

read.table() can also read gzipped files directly. ******

Using the readline() function to read in a single line from the keyboard.

> w = readline()
abc de f
> w
[1] "abc de f"
#Can also be used with an optional prompt
> inits = readline("type your initials: ")
type your initials: CC
> inits
[1] "CC"

Using the scan() funtion. Suppose we have four documents z1.txt, z2.txt, z3.txt, and z4.txt. The contents of each document is as follows: z1.txt

123

4 5

6

z2.txt

123

4.2 5

6

z3.txt

abc de f g

z4.txt

abc 123 6 y

Let’s see what we can do with these files using the scan() function. Scans items as double mode by default, by can change what mode to scan to by setting the “what” parameter. scan() also assumes that items of the vector are separated by whitespace, but that could be changed by setting the “sep” parameter.

scan("R_data/z1.txt")

## [1] 123   4   5   6

scan("R_data/z2.txt")

## [1] 123.0   4.2   5.0   6.0

scan("R_data/z3.txt", what="")

## [1] "abc" "de"  "f"   "g"

scan("R_data/z4.txt", what="")

## [1] "abc" "123" "6"   "y"

x2 = scan("R_data/z3.txt", what="", sep="\n")
x2

## [1] "abc"  "de f" "g"

x2 = scan("R_data/z3.txt", what="", sep="\n", quiet=TRUE)
x2

## [1] "abc"  "de f" "g"

The scan() function can be used to read numbers from the keyboard. Returns a numeric vector containing typed numbers.

v = scan("")
1: 12 5 13
4: 3 4 5
7: 8
8: 
Read 7 items
> v
[1] 12 5 13 3 4 5 8

For loops can loop over file names. scan() reads the file. Assume “file1.txt” contains numbers 1 to 6 and “file2.txt” contains numbers 5, 12, and 13.

for (fn in c("R_data/file1.txt", "R_data/file2.txt")) print(scan(fn))

## [1] 1 2 3 4 5 6
## [1]  5 12 13

Functions

Specifying default parameters when writing functions.

test = function(x, y=2) {return(x + y)}
test(3)

## [1] 5

test(5, 5)

## [1] 10

Table

Converting elements in a matrix into their proportions

tab = matrix(c(180, 40, 20, 10), nrow = 2)
proportion = prop.table(tab)
proportion

##      [,1] [,2]
## [1,] 0.72 0.08
## [2,] 0.16 0.04

Tables can be used to tabulate counts for a combination of categories. table() accepts a vector, matrix, factor, or dataframe, and returns a contingency table.

#Motvation
u = c(22, 8, 33, 6, 8, 29, -2)
fl = list(c(5, 12, 13, 12, 13, 5, 13), c("a", "bc", "a", "a", "bc", "a", "a"))
tapply(u, fl, length)

##    a bc
## 5  2 NA
## 12 1  1
## 13 2  1

#Counts cannot be NA, table() solves this problem.
table(fl)

##     fl.2
## fl.1 a bc
##   5  2  0
##   12 1  1
##   13 2  1

#Another example1
vote.for.X = c("Yes", "Yes", "No", "Not Sure", "No")
Voted.for.X.Last.Time = c("Yes", "No", "No", "Yes", "No")
d = data.frame(vote.for.X, Voted.for.X.Last.Time)
cttab = table(d)
cttab

##           Voted.for.X.Last.Time
## vote.for.X No Yes
##   No        2   0
##   Not Sure  0   1
##   Yes       1   1

#Another example2
gender = c("M", "M", "F", "M", "F", "F")
race = c("W", "W", "A", "O", "B", "B")
pol = c("L", "L", "C", "L", "L", "C")
d = data.frame(gender, race, pol)
vt = table(d)
vt

## , , pol = C
## 
##       race
## gender A B O W
##      F 1 1 0 0
##      M 0 0 0 0
## 
## , , pol = L
## 
##       race
## gender A B O W
##      F 0 1 0 0
##      M 0 0 1 2

#Another example3
data = factor(c(rep("east", 450), rep("west", 150)))
table(data)

## data
## east west 
##  450  150

#Table operations. Table is its own class but in practice can be treated like a matrix. 
class(cttab)

## [1] "table"

cttab[1, 1]

## [1] 2

cttab/5

##           Voted.for.X.Last.Time
## vote.for.X  No Yes
##   No       0.4 0.0
##   Not Sure 0.0 0.2
##   Yes      0.2 0.2

#Finding marginal values of Vote.for.X variable.
apply(cttab, 1, sum)

##       No Not Sure      Yes 
##        2        1        2

#addmargins() to find marginal totals of a table.
addmargins(cttab)

##           Voted.for.X.Last.Time
## vote.for.X No Yes Sum
##   No        2   0   2
##   Not Sure  0   1   1
##   Yes       1   1   2
##   Sum       3   2   5

#Get names of dimensions and their levels. Columns are dimensions, rows are levels.
dimnames(cttab)

## $vote.for.X
## [1] "No"       "Not Sure" "Yes"     
## 
## $Voted.for.X.Last.Time
## [1] "No"  "Yes"

Plotting

Given a $X$ matrix, plotting the variables for each sample

x = 1:10
A = c(15, 36, 54, 60, 68, 71, 73, 75, 78, 78)
B = c(20, 49, 58, 69, 75, 80, 83, 86, 88, 89)
C = c(24, 58, 68, 75, 83, 90, 93, 93, 95, 96)
Performance = data.frame(A,B,C)
matplot(x,Performance, type="o", pch=c(1,2,3), col=c("red","green","blue"))

In R base plotting, deciding how densely the tick marks of the axes should be plotted.

data = rnorm(100)
hist(data, breaks = 10, axes = FALSE)
# Adds the y-axis
axis(2)
# Adds the x-axis
axis(1, at = seq(-3, 3, 0.5), labels = seq(-3, 3, 0.5))

segments() draws line segments between pairs of points.

set.seed(1)
x = rnorm(20, mean=5, sd=1)
y = rnorm(20, mean=2, sd=1)
plot(rep(0, 20), x, xlim=c(-0.5, 1.5))
points(rep(1, 20), y)
segments(rep(0, 20), x, rep(1, 20), y)

Creating 3D scatter plots using the scatterplot3d package. Let us work with a simulated gene expression data set with 3 genes. There are five clusters in this data set and we wish to color code points based on what cluster they are from.

require(scatterplot3d)
set.seed(1)
n = c(100, 25, 50, 50, 75)
growth_m = c(0, 2, -2, 0, 4)
growth_sd = c(0.5, 1, 1, 0.5, 0.5)
immune_m = c(0, -2, -3, 4, 0)
immune_sd = c(0.5, 1, 0.5, 0.5, 1)
structural_m = c(3, 0, 2, -1, -2)
structural_sd = c(1, 1, 1, 0.7, 1)
data = data.frame(n=n, growth_m=growth_m, growth_sd, immune_m, immune_sd, structural_m, structural_sd)

get_expression = function(data){
  n = data[1]
  growth_m = data[2]
  growth_sd = data[3]
  immune_m = data[4]
  immune_sd = data[5]
  structural_m = data[6]
  structural_sd = data[7]
  growth = rnorm(n, growth_m, growth_sd)
  immune = rnorm(n, immune_m, immune_sd)
  structural = rnorm(n, structural_m, structural_sd)
  expression = data.frame(growth=growth, immune=immune, structural=structural)
  return(expression)
}

expressions = apply(data, 1, get_expression)
expressions = do.call(rbind, expressions)
expressions$cluster = c(rep("1", 100), rep("2", 25), rep("3", 50), rep("4", 50), rep("5", 75))
expressions$pcolor[expressions$cluster=="1"] = "red"
expressions$pcolor[expressions$cluster=="2"] = "blue"
expressions$pcolor[expressions$cluster=="3"] = "darkgreen"
expressions$pcolor[expressions$cluster=="4"] = "black"
expressions$pcolor[expressions$cluster=="5"] = "darkorange"

with(expressions, {plot = scatterplot3d(growth, immune, structural, color = pcolor, main="Scatterplot of Cell Gene Expression")})
legend("topleft", c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5"), fill=c("red", "blue", "darkgreen", "black", "darkorange"))

ggplot2 system: In this example plots displ on the x-axis and hwy on the y-axis.

library(ggplot2)
head(mpg)

## Source: local data frame [6 x 11]
## 
##   manufacturer model displ  year   cyl      trans   drv   cty   hwy    fl
##          (chr) (chr) (dbl) (int) (int)      (chr) (chr) (int) (int) (chr)
## 1         audi    a4   1.8  1999     4   auto(l5)     f    18    29     p
## 2         audi    a4   1.8  1999     4 manual(m5)     f    21    29     p
## 3         audi    a4   2.0  2008     4 manual(m6)     f    20    31     p
## 4         audi    a4   2.0  2008     4   auto(av)     f    21    30     p
## 5         audi    a4   2.8  1999     6   auto(l5)     f    16    26     p
## 6         audi    a4   2.8  1999     6 manual(m5)     f    18    26     p
## Variables not shown: class (chr)

#Regular plot, with x=displ and y=hwy
qplot(displ, hwy, data=mpg)

#Plot with labels
qplot(displ, hwy, data=mpg, color=drv)

#Plotting colored by group with fitted line. Method is set as linear model. 
qplot(displ, hwy, data=mpg, geom=c("point", "smooth"), color=drv)

#geom argument accepts a character vector
#1. point: add points to plot.
#2. smooth: add fitted line with 95% confience interval.
qplot(displ, hwy, data=mpg, geom=c("point", "smooth"))

#If only one data column is specified, then histogram will be plotted. fill=drv color codes different groups
qplot(hwy, data=mpg, fill=drv)

#Plotting a histogram with customized binwidth
qplot(price, data=diamonds, binwidth=18497/30)

#Plotting histogram as a density function
qplot(price, data=diamonds, geom="density")

#Plotting scatterplot with points belonging to different groups having different symbols/colors
qplot(carat, price, data=diamonds, shape=cut, color=cut)

#facets parameter plots separate groups in different plots. Placing "drv" to the right of "~" indicates how many columns instead of rows
qplot(displ, hwy, data=mpg, facets=.~drv)

#Placing "drv" to the left of "~." indicates how many rows instead of columns. binwidth=2 indicates that each bin in the histogram has a width of 2. 
qplot(hwy, data=mpg, facets=drv~., binwidth=2)

#Smooth and color historgrams. geom parameter "density" draws out the smooth histogram and color=drv assigns different color to different group
qplot(hwy, data=mpg, geom="density", color=drv)

#Making boxplots
qplot(drv, hwy, data=mpg, geom="boxplot")

#Making more detailed boxplots
qplot(drv, hwy, data=mpg, geom="boxplot", color=manufacturer)

Using the ggplot2 function

library(ggplot2)

#Store data in object g first without plotting
g = ggplot(mpg, aes(displ, hwy))

#Add layers to the plot. In this case adding points, linear regression, and subplots in grid. 
g + geom_point() + geom_smooth(method="lm") + facet_grid(.~drv)

#Adding layers with modified parameters
g + geom_point(color="pink", size=4, alpha=1/2)

g + geom_point(size=4, alpha=1/2, aes(color=drv))

g + geom_point(aes(color = drv)) + labs(title="Swirl Rules!") + labs(x="Displacement", y="Hwy Mileage")

#For the arguments in geom_smooth(), size specifies size of dashes, linetype=3 specifies dashed instead of continuous line, method="lm" indicates linear regression, and se=FALSE turns off gray shadows indicating standard errors. 
g + geom_point(aes(color = drv), size=2, alpha=1/2) + geom_smooth(size=4, linetype=3, method="lm", se=FALSE)

#Change the background theme and font with "base_family" parameter.
g + geom_point(aes(color=drv)) + theme_bw(base_family="Times")

#Changing the coordinate axes.
g + geom_line() + coord_cartesian(ylim=c(-3, 3))

#Example Illustration. margins=TRUE tells the plot to display marginal totals over each row and column. Since there are 3 factors in drv and 4 factors in cyl, with margins=FALSE will generate a 3x4 scatterplot. 
g + geom_point() + facet_grid(drv~cyl, margins=TRUE) + geom_smooth(method="lm", size=2, se=FALSE, color="black")

#Plotting boxplots.
ggplot(diamonds, aes(carat, price)) + geom_boxplot() + facet_grid(.~cut)

lattice system: xyplot(y ~ x | f*g, data)

y ~ x: y plotted against x
f*g: f and g are optional conditioning variables. The * means the intersection of two variables
data: dataframe/list from which y and x are looked up.

library(datasets)
library(lattice)
xyplot(Ozone ~ Wind, data = airquality, col="red", pch=8, main="Big Apple Data")

In this example, layout specifies dimensions of grid in the form of c(columns, rows) upon which multiple plots will be plotted. Alternatively, one can assign the output of xyplot() to a variable, then print it later to get the plot to appear.

library(datasets)
library(lattice)
airquality = transform(airquality, Month = factor(Month))
p = xyplot(Ozone ~ Wind | Month, data = airquality, layout = c(5, 1))
#lattice functions return objects of class trellis
print(p)

#To view a particular setting of the p object
p[["x.limits"]]

## [1]  0.37 22.03

Another example with more optional conditioning.

library(ggplot2)
library(lattice)
str(diamonds)
table(diamonds$color, diamonds$cut)
xyplot(price~carat | color*cut, data = diamonds, strip=FALSE, pch=20, xlab="Carat", ylab="Price", main="Diamonds are Sparkly!")
#strip=FALSE indicates that the labels for each box is not shown to avoid clutter

Controlling color and opacity of the points and its importance.

#Arguments of rgb() in order: red, green, blue, alpha
x = rnorm(1000)
y = rnorm(1000)
plot(x, y, pch=19, col=rgb(0, 0.5, 0.5, 0.3))

Lattice panel functon

#Generate data
set.seed(10)
x = rnorm(100)
f = rep(0:1, each=50)
y = x + f - f*x + rnorm(100, sd=0.5)
f = factor(f, labels=c("Group 1", "Group 2"))

#Lattice panel function example 1
xyplot(y ~ x | f, panel=function(x, y, ...){
  panel.xyplot(x, y, ...) 
  panel.abline(h=median(y), lty=2)
}, layout=c(2, 1))

#Lattice panel function example 2
xyplot(y ~ x | f, panel=function(x, y, ...){
  panel.xyplot(x, y, ...)
  panel.lmline(x, y, col = 2) #Overlay simple linear regression line. col parameter determines color of line.  
})

Save plots already plotted to file. In this example, the device is specified as png, and the height and width of the png file are specified.

library(datasets)
x = rnorm(100, 0, 3)
hist(x)
dev.copy(png, file="plot.png", height=480, width=480)
dev.off()

Writing plots to pdf.

pdf(file="myplot.pdf")
x = rnorm(100, 0, 3)
hist(x)
dev.off()

Multiple scatter plots: adding general title to multiple scatter plots via mtext. The oma parameter in par() represents outer margin lengths from bottom, left, top, right respectively.

library(datasets)
par(mfrow=c(1, 3), mar=c(4, 4, 2, 1), oma=c(0, 0, 2, 0))
with(airquality, {
  plot(Wind, Ozone, main="Ozone and Wind")
  plot(Temp, Ozone, main="Ozone and Temperature")
  plot(Temp, Ozone, main="Ozone and Temperature")
  mtext("Ozone and Weather in New York City", outer = TRUE)
})

Usage of legend() function to add legend. type parameter in plot() indicates what type of plot to plot(“p”=points, “l”=lines, “n”=no plotting). lm() draws the linear regression line to the graph. In this example, linear regression is performed on the airquality data with Wind on the x-axis and Ozone on the y-axis.

library(datasets)
with(airquality, plot(Wind, Ozone, main="Ozone and Wind in New York City", type="n"))
with(subset(airquality, Month == 5), points(Wind, Ozone, col="blue"))
with(subset(airquality, Month != 5), points(Wind, Ozone, col="red"))
legend("topright", pch=1, col=c("blue", "red"), legend=c("May", "Other Months"))
model = lm(Ozone ~ Wind, airquality)
model

## 
## Call:
## lm(formula = Ozone ~ Wind, data = airquality)
## 
## Coefficients:
## (Intercept)         Wind  
##      96.873       -5.551

abline(model, lwd=2)

Annotating a graph.

lines(): accepts two vectors of x and y values.
text(): accepts x and y positions of text and actual text.
title(): adds title
mtext(): adds text on the margins (1=bottom, 2=left, 3=top, 4=right)

library(datasets)
with(airquality, plot(Temp, Wind))
lines(airquality$Temp, airquality$wind)
text(x=75, y=20, labels="Hello World!")
title("Test Graph")
mtext("Random text", side=4)

Alternatively, rather than specify the coordinates you want your text to be placed, you mouse-click the position you want your text to be placed.

library(datasets)
with(airquality, plot(Temp, Wind))
lines(airquality$Temp, airquality$wind)
text(locator(1), labels="Hello World!")
title("Test Graph")
mtext("Random text", side=4)

Exploratory Data Analyis in 1 dimension. The rug() function plots the points under the histogram.

#summary() gives you a six number summary of a vector of values.  
data = rnorm(100, 10, 3)
summary(data)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.690   8.407  10.210  10.090  12.210  17.290

#Boxplot
boxplot(data, col="blue")

#histogram
par(mfcol = c(1, 2))
hist(data, col="green", main = "Histogram default breaks")
rug(data)
hist(data, col="green", main = "Histogram with 50 breaks", breaks = 50)
rug(data)

par(mfcol = c(1, 1))
#Barplot summarizes categorical variables
data2 = factor(c(rep("east", 450), rep("west", 150)))
barplot(table(data2), col="wheat")

Exploratory data analysis in 2 dimensions

pollution = read.csv("data_sets/avgpm25.csv", colClasses = c("numeric", "character", "factor", "numeric", "numeric"))
head(pollution)

##        pm25  fips region longitude latitude
## 1  9.771185 01003   east -87.74826 30.59278
## 2  9.993817 01027   east -85.84286 33.26581
## 3 10.688618 01033   east -87.72596 34.73148
## 4 11.337424 01049   east -85.79892 34.45913
## 5 12.119764 01055   east -86.03212 34.01860
## 6 10.827805 01069   east -85.35039 31.18973

#Multiple boxplots. In this example. We are making a box plot of the pm25 data from the pollution dataframe, divided by region. 
boxplot(pm25 ~ region, data = pollution, col="red")

#Multiple histograms
par(mfrow = c(2, 1))
hist(subset(pollution, region=="east")$pm25, col="green")
hist(subset(pollution, region=="west")$pm25, col="green")

par(mfrow=c(1, 1))
#Scatterplot with multiple dimensions.

#col.axis determines color of axis number labels. col.lab determines xlab and ylab colors. 
with(pollution, plot(latitude, pm25, col=region, col.axis="blue", col.lab="red"))
abline(h=12)

Making multiple boxplots from a list input. Each boxplot will represent data from an item in a list.

x = list(a=rnorm(20, 0, 1), b=rnorm(20, 0, 2), c=rnorm(20, 0, 3))
boxplot(x)

Create multiple grids on which plots will be generated with par() function. Any charts generated after these commands will go on the grid. The parameter mfrow indicates that plots will be plotted by row and mfcol indicates that plots will be plotted by column. Both mfrow and mfcol accepts a two element vector, with the first number indicating number of rows and second number indicating number of columns. The mar parameter accepts a 4 element vector indicating the margins for the bottom, left, top, right respectively.

x = seq(10)
par(mfrow=c(2, 2), mar=c(4, 4, 4, 4))
plot(x, sin(x))
plot(x, cos(x))
plot(x, x^2)
plot(x, tan(x))

#Resets par() so that one plot is shown at a time
par(mfcol=c(1, 1))

The las parameter in par() can be used to orient the axis value labels.

0: always parallel to axis
1: always horizontal
2: always perpendicular to axis
3: always vertical

The bg parameter in par() can be used to set the background color

library(datasets)
hist(airquality$Ozone)

par(las=1, bg="wheat")
hist(airquality$Ozone)

Adding a line via abline().

x = rnorm(100, 5, 2)
hist(x)
#Line starts at y=5 and has a slope of 5
abline(5, 5, col="green")
#Vertical line with a line width of 4, position at mean of x
abline(v = mean(x), col="red", lwd=4)
#Horizontal line with position at 15
abline(h = 15)

Making a MA plot

#Assume e is our log2 transformed expression matrix. Where rows are genes and columsn are samples
e <- log2(exprs(mas133))
plot((e[,1]+e[,2])/2,e[,2]-e[,1],cex=0.5)

Making a quantile plot

#Assume "e" is our expression matrix with genes as rows and samples as columns.
#Finding the quantiles for each sample given the probabilties.
qs = apply(e, 2, quantile, prob=c(0.05, 0.25, 0.5, 0.75, 0.95))
#dim(qs) will indicate number of quantiles by number of samples
#matplot() function plots columns of a matrix against rows. Since we want the quantiles in our quantile plot to be on the y-axis, we must transpose qs.
qs = t(qs)
#In matplot, type indicates type of plot (in this case line plot), and it is not clear what the lty parameter is. 
matplot(qs, type="l", lty=1, main="Quantile Plot", xlab="Samples", tlab="Quantiles")

Making a volcano plot with an example dataset

res = read.table("results.txt", header=TRUE)
head(res)
class(res)

#with() function makes it more convenient by allowing user to directly reference column name in plot() function
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2.5), asp=1)) 

#pch is a parameter that determines shape of point

#Let us color the points according to the pvalue and log2fold change characteristics. (1) Red if points have an adjusted pvalue < 0.05 (2) orange is points have log2 fold change > abs(1), and (3) green if both conditons 1 and 2 are met. 
with(subset(res, padj <0.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
with(subset(res, abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), pch=20, col="orange"))
with(subset(res, padj<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), pch=20, col="green"))

Basic plotting

plot(x, y, main = title, cex=1)
#cex = controls size of points plotted. 1 means original size, 2 means double the size, and 0.5 means half the size.

Drawing polygons in plots

plot(1, 1, asp = 1) #asp = 1 makes ratio of y-axis and x-axis the same
polygon(c(0.8, 1.2, 1.2, 0.8), c(0.8, 0.8, 1.2, 1.2), density=0, border="red")

#polygon(x_coordinates_of_vertices, y_coordinates_of_vertices, color, density, border)

plyr/dataframe

dplyr is a package that makes it simpler and easier to work with dataframes. It also uses efficient data storage backends. A more detailed tutorial of dplyr can be found here.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

year = c(seq(2012, 2016), 2017)
month = c(1, 5, 6, 2, 9, 1)
dep_time = runif(6, 500, 510)
arr_time = runif(6, 800, 810)
d = data.frame(year=year, month=month, dep_time=dep_time, arr_time=arr_time)
d

##   year month dep_time arr_time
## 1 2012     1 504.3841 808.7472
## 2 2013     5 509.6009 804.9235
## 3 2014     6 503.0224 802.2187
## 4 2015     2 504.6572 800.6766
## 5 2016     9 504.2597 807.5462
## 6 2017     1 508.6572 809.9654

#Subsetting based on conditions
filter(d, dep_time > 505, arr_time < 805)

##   year month dep_time arr_time
## 1 2013     5 509.6009 804.9235

#slice() selects rows of dataframe
slice(d, 2:4)

##   year month dep_time arr_time
## 1 2013     5 509.6009 804.9235
## 2 2014     6 503.0224 802.2187
## 3 2015     2 504.6572 800.6766

#Arrange() reorders dataframe by a column. Additional columns are included to break ties.
arrange(d, month, year)

##   year month dep_time arr_time
## 1 2012     1 504.3841 808.7472
## 2 2017     1 508.6572 809.9654
## 3 2015     2 504.6572 800.6766
## 4 2013     5 509.6009 804.9235
## 5 2014     6 503.0224 802.2187
## 6 2016     9 504.2597 807.5462

arrange(d, desc(month)) #Arrange by month in descending order

##   year month dep_time arr_time
## 1 2016     9 504.2597 807.5462
## 2 2014     6 503.0224 802.2187
## 3 2013     5 509.6009 804.9235
## 4 2015     2 504.6572 800.6766
## 5 2012     1 504.3841 808.7472
## 6 2017     1 508.6572 809.9654

#Select allows you to select columns in dataframe.
select(d, dep_time, arr_time)

##   dep_time arr_time
## 1 504.3841 808.7472
## 2 509.6009 804.9235
## 3 503.0224 802.2187
## 4 504.6572 800.6766
## 5 504.2597 807.5462
## 6 508.6572 809.9654

select(d, month:dep_time)

##   month dep_time
## 1     1 504.3841
## 2     5 509.6009
## 3     6 503.0224
## 4     2 504.6572
## 5     9 504.2597
## 6     1 508.6572

select(d, starts_with("sl"))

## data frame with 0 columns and 6 rows

#distinct() used with select() only returns unique values
distinct(select(d, month))

##   month
## 1     1
## 2     5
## 3     6
## 4     2
## 5     9

#mutate() allows you to add columns to dataframe.
mutate(d, time=arr_time-dep_time, time_unit = time/60)

##   year month dep_time arr_time     time time_unit
## 1 2012     1 504.3841 808.7472 304.3631  5.072718
## 2 2013     5 509.6009 804.9235 295.3226  4.922043
## 3 2014     6 503.0224 802.2187 299.1963  4.986604
## 4 2015     2 504.6572 800.6766 296.0194  4.933657
## 5 2016     9 504.2597 807.5462 303.2865  5.054775
## 6 2017     1 508.6572 809.9654 301.3082  5.021804

#transmuate is like mutate() except you only end up with the newly created columns
transmute(d, time=arr_time-dep_time, time_unit = time/60)

##       time time_unit
## 1 304.3631  5.072718
## 2 295.3226  4.922043
## 3 299.1963  4.986604
## 4 296.0194  4.933657
## 5 303.2865  5.054775
## 6 301.3082  5.021804

#Using the pipe function to chain dplyr functions together.
d %>% select(year, month) %>% head

##   year month
## 1 2012     1
## 2 2013     5
## 3 2014     6
## 4 2015     2
## 5 2016     9
## 6 2017     1

d %>% mutate(time=arr_time-dep_time, time_unit = time/60) %>% filter(time >= 301) %>% head

##   year month dep_time arr_time     time time_unit
## 1 2012     1 504.3841 808.7472 304.3631  5.072718
## 2 2016     9 504.2597 807.5462 303.2865  5.054775
## 3 2017     1 508.6572 809.9654 301.3082  5.021804

Extracting subdata frames and applying functions to dataframes

library(plyr)

## -------------------------------------------------------------------------

## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)

## -------------------------------------------------------------------------

## 
## Attaching package: 'plyr'

## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

exams = as.data.frame(replicate(3, rnorm(5, 4, 1)))
exams = rename(exams, c("V1" = "Exam1", "V2" = "Exam2", "V3" = "Exam3"))
exams

##      Exam1    Exam2    Exam3
## 1 3.522454 4.888425 6.235394
## 2 5.747901 2.804460 4.347655
## 3 5.616139 5.423692 6.003491
## 4 3.322527 3.799218 2.792580
## 5 2.937886 5.328165 3.880153

#Extracting
exams[2:5, 2]

## [1] 2.804460 5.423692 3.799218 5.328165

exams[2:5, 2, drop=FALSE]

##      Exam2
## 2 2.804460
## 3 5.423692
## 4 3.799218
## 5 5.328165

exams[exams$Exam1 >= 4.00, ]

##      Exam1    Exam2    Exam3
## 2 5.747901 2.804460 4.347655
## 3 5.616139 5.423692 6.003491

subset(exams, Exam1 >= 4.00)

##      Exam1    Exam2    Exam3
## 2 5.747901 2.804460 4.347655
## 3 5.616139 5.423692 6.003491

#Applying apply()
apply(exams, 1, max)

## [1] 6.235394 5.747901 6.003491 3.799218 5.328165

Creating, accessing, and renaming data frames

kids = c("jack", "jill")
ages = c(12, 10)
d = data.frame(kids, ages, stringsAsFactors = FALSE) #Each column is a vector rather than a factor
d

##   kids ages
## 1 jack   12
## 2 jill   10

#Various accessing methods
d[[1]]

## [1] "jack" "jill"

d$kids

## [1] "jack" "jill"

d[,1]

## [1] "jack" "jill"

#Structure of dataframe d
str(d)

## 'data.frame':    2 obs. of  2 variables:
##  $ kids: chr  "jack" "jill"
##  $ ages: num  12 10

#New dataframe 
d <- data.frame(alpha=1:3, beta=4:6, gamma=7:9)
names(d)

## [1] "alpha" "beta"  "gamma"

#Rename method 1
rename(d, c("beta"="two", "gamma"="three"))

##   alpha two three
## 1     1   4     7
## 2     2   5     8
## 3     3   6     9

#Rename method 2
names(d)[names(d)=="beta"] <- "two"
names(d)[3] = "three"

List

Initialize a list of specified item names each with empty entries

names = c("a", "b", "c")
mylist = sapply(names, function(x) NULL)
mylist

## $a
## NULL
## 
## $b
## NULL
## 
## $c
## NULL

Recursive lists and how to flatten recursive lists.

b = list(u = 5, v = 12)
c = list(w = 13)
a = list(b, c)
a

## [[1]]
## [[1]]$u
## [1] 5
## 
## [[1]]$v
## [1] 12
## 
## 
## [[2]]
## [[2]]$w
## [1] 13

length(a)

## [1] 2

c(list(u=5, v=12, c=list(d=5, e=9)))

## $u
## [1] 5
## 
## $v
## [1] 12
## 
## $c
## $c$d
## [1] 5
## 
## $c$e
## [1] 9

#flatten option in c()
c(list(u=5, v=12, c=list(d=5, e=9)), recursive = T)

##   u   v c.d c.e 
##   5  12   5   9

Using lapply() and sapply() functions to lists

#lapply returns list
lapply(list(1:3, 25:29), median)

## [[1]]
## [1] 2
## 
## [[2]]
## [1] 27

#sapply returns vector
sapply(list(1:3, 25:29), median)

## [1]  2 27

Lists are considered lower precedence to vectors

c(5, 2, list(a=1, b=4))

## [[1]]
## [1] 5
## 
## [[2]]
## [1] 2
## 
## $a
## [1] 1
## 
## $b
## [1] 4

Converting lists back to vectors and removing names

j = list(name="joe", salary=5500, union=T)
#get names 
names(j)

## [1] "name"   "salary" "union"

class(names(j))

## [1] "character"

#List to vector
ulj = unlist(j)
ulj

##   name salary  union 
##  "joe" "5500" "TRUE"

class(ulj)

## [1] "character"

#Removing names method 1
ulj1 = unname(ulj)
ulj1

## [1] "joe"  "5500" "TRUE"

#Removing names method 2
names(ulj) = NULL
ulj

## [1] "joe"  "5500" "TRUE"

Adding and deleting to lists. Getting size of list.

j = list(name="joe", salary=5500, union=T)
#Add method 1: 
j$activity = "sailing"
j

## $name
## [1] "joe"
## 
## $salary
## [1] 5500
## 
## $union
## [1] TRUE
## 
## $activity
## [1] "sailing"

#Add method 2: 
j[4:5] = c(1, 2)
j

## $name
## [1] "joe"
## 
## $salary
## [1] 5500
## 
## $union
## [1] TRUE
## 
## $activity
## [1] 1
## 
## [[5]]
## [1] 2

#Delete an item: 
j[[5]] = NULL
j$activity = NULL
j

## $name
## [1] "joe"
## 
## $salary
## [1] 5500
## 
## $union
## [1] TRUE

#Getting number of components
length(j)

## [1] 3

Creating lists

j = list(name="joe", salary=5500, union=T)
j

## $name
## [1] "joe"
## 
## $salary
## [1] 5500
## 
## $union
## [1] TRUE

jalt = list("joe", 5500, T)
jalt

## [[1]]
## [1] "joe"
## 
## [[2]]
## [1] 5500
## 
## [[3]]
## [1] TRUE

j$salary

## [1] 5500

j[["salary"]]

## [1] 5500

j[2]

## $salary
## [1] 5500

j[[2]]

## [1] 5500

z = vector(mode="list")
z[["abc"]] = 3
z

## $abc
## [1] 3

Array

Arrays to store higher dimensional data

firsttest = matrix(c(46, 21, 50, 30, 25, 50), nrow=3)
secondtest = matrix(c(46, 41, 50, 43, 35, 50), nrow=3)
tests = array(data=c(firsttest, secondtest), dim = c(3, 2, 2))
tests

## , , 1
## 
##      [,1] [,2]
## [1,]   46   30
## [2,]   21   25
## [3,]   50   50
## 
## , , 2
## 
##      [,1] [,2]
## [1,]   46   43
## [2,]   41   35
## [3,]   50   50

tests[,,1]

##      [,1] [,2]
## [1,]   46   30
## [2,]   21   25
## [3,]   50   50

An array with dimensions $c(x_1, x_2, x_3)$ is composed of $x_3$ numbers of $x_1 x x_2$ matrices.

b = array(1:12, c(2, 3, 2))

# Get the first matrix
b[,,1]

##      [,1] [,2] [,3]
## [1,]    1    3    5
## [2,]    2    4    6

# Get the first row of every matrix
b[1,,]

##      [,1] [,2]
## [1,]    1    7
## [2,]    3    9
## [3,]    5   11

# Get the second column of every matrix
b[,2,]

##      [,1] [,2]
## [1,]    3    9
## [2,]    4   10

# Reassign dimensions
dim(b) = c(6, 2)
b

##      [,1] [,2]
## [1,]    1    7
## [2,]    2    8
## [3,]    3    9
## [4,]    4   10
## [5,]    5   11
## [6,]    6   12

Linear Algebra

Basic linear algebra operations on matrices.

y = matrix(1:4, byrow = T, nrow=2)
y%*%y #Matrix - matrix multiplication

##      [,1] [,2]
## [1,]    7   10
## [2,]   15   22

3*y #Matrix - scalar multiplication

##      [,1] [,2]
## [1,]    3    6
## [2,]    9   12

y + y #matrix - matrix addition

##      [,1] [,2]
## [1,]    2    4
## [2,]    6    8

Find the inverse of a square matrix.

A = matrix(c(1, 2, 3, 4), nrow = 2)
Ain = solve(A)
A%*%Ain

##      [,1] [,2]
## [1,]    1    0
## [2,]    0    1

Dataframe

Combine a list of data frames/matrices into a single matrix

x = list(a=matrix(seq(6), nrow = 3), b=matrix(seq(4), nrow = 2))
do.call(rbind, x)

##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6
## [4,]    1    3
## [5,]    2    4

In-place tranform a column in a dataframe to another class using tranform(). First argument is a dataframe, and any arguments afterwards allows you to redefine the class of the column.

library(datasets)
head(airquality)

##   Ozone Solar.R Wind Temp Month Day
## 1    41     190  7.4   67     5   1
## 2    36     118  8.0   72     5   2
## 3    12     149 12.6   74     5   3
## 4    18     313 11.5   62     5   4
## 5    NA      NA 14.3   56     5   5
## 6    28      NA 14.9   66     5   6

airquality = transform(airquality, Month=factor(Month))

Given that a dataframe has factor columns containing double datatypes, transform columns into vectors.

d = data.frame(one = factor(c(0.45, 0.545)), two = factor(c("high", "low")))
d

##     one  two
## 1  0.45 high
## 2 0.545  low

class(d$one)

## [1] "factor"

d1 = transform(d, one = as.vector(one))
d1

##     one  two
## 1  0.45 high
## 2 0.545  low

class(d1$one)

## [1] "character"

Filtering with the subset() function to remove NAs. subset() has inputs data, condition, and select. Select chooses which columns to select from filtered data frame.

x <- c(6, 1:3, NA, 12)
x[x > 5]

## [1]  6 NA 12

subset(x, x > 5)

## [1]  6 12

exams = replicate(3, rnorm(5, 4, 1))
exams

##          [,1]     [,2]     [,3]
## [1,] 5.230604 5.036754 5.232597
## [2,] 3.555671 4.249525 4.501794
## [3,] 4.066296 4.194820 3.314486
## [4,] 2.171491 4.064353 2.759771
## [5,] 4.342042 3.408740 3.560954

subset(exams, exams[,1] > 4 | exams[,2] > 4 | exams[,3] > 4, select=c(2, 3))

##          [,1]     [,2]
## [1,] 5.036754 5.232597
## [2,] 4.249525 4.501794
## [3,] 4.194820 3.314486
## [4,] 4.064353 2.759771
## [5,] 3.408740 3.560954

Using lapply and sapply to dataframes and converting lists to dataframes.

d = data.frame(kids = c("Jack", "Jill"), ages = c(12, 10))
d

##   kids ages
## 1 Jack   12
## 2 Jill   10

dl = lapply(d, sort)
dl

## $kids
## [1] Jack Jill
## Levels: Jack Jill
## 
## $ages
## [1] 10 12

as.data.frame(dl)

##   kids ages
## 1 Jack   10
## 2 Jill   12

Alternatively, to directly receive datafrmae as output rather than list.

d[] = lapply(d, sort)
d

##   kids ages
## 1 Jack   10
## 2 Jill   12

Merging data frames using merge(). The function merge() will automatically merge similar columns and will only keep rows that exist in both dataframes. If columns for each dataframe contains the same information but different names, user can indicate that.

d1 = data.frame(kids=c("Jack", "Jill", "Jillian", "John"), states=c("CA", "MA", "MA", "HI"))
d2 = data.frame(ages=c(10, 7, 12), kids=c("Jill", "Lillian", "Jack"))
d1

##      kids states
## 1    Jack     CA
## 2    Jill     MA
## 3 Jillian     MA
## 4    John     HI

d2

##   ages    kids
## 1   10    Jill
## 2    7 Lillian
## 3   12    Jack

d = merge(d1, d2)
d

##   kids states ages
## 1 Jack     CA   12
## 2 Jill     MA   10

d3 = data.frame(ages = c(12, 10, 7), pals = c("Jack", "Jill", "Lillian"))
merge(d1, d3, by.x="kids", by.y="pals")

##   kids states ages
## 1 Jack     CA   12
## 2 Jill     MA   10

#Becareful of duplicate entries in a column for one dataframe when using merge()
d2a = rbind(d2, list(15, "Jill"))
merge(d1, d2a)

##   kids states ages
## 1 Jack     CA   12
## 2 Jill     MA   10
## 3 Jill     MA   15

Adding rows and columns to dataframes

kids = c("Jack", "Jill")
ages = c(12, 10)
d = data.frame(kids, ages, stringsAsFactors = F)

#Add row method 1:
rbind(d, list("Laura", 19))

##    kids ages
## 1  Jack   12
## 2  Jill   10
## 3 Laura   19

#Add row method 2: 
rbind(d, c("Laura", 19))

##    kids ages
## 1  Jack   12
## 2  Jill   10
## 3 Laura   19

#Add column method 1: 
cbind(d, happy = c(T, F))

##   kids ages happy
## 1 Jack   12  TRUE
## 2 Jill   10 FALSE

#Add column method 2: 
d$happy = T
d

##   kids ages happy
## 1 Jack   12  TRUE
## 2 Jill   10  TRUE

d$sad = c(F, F)
d

##   kids ages happy   sad
## 1 Jack   12  TRUE FALSE
## 2 Jill   10  TRUE FALSE

Completing removing any rows with NAs in a dataframe

kids = c("Jack", NA, "Jillian", "John")
states = c("CA", "MA", "MA", NA)
d = data.frame(kids, states, stringsAsFactors = F)
d

##      kids states
## 1    Jack     CA
## 2    <NA>     MA
## 3 Jillian     MA
## 4    John   <NA>

complete.cases(d)

## [1]  TRUE FALSE  TRUE FALSE

d2 = d[complete.cases(d), ]
d2

##      kids states
## 1    Jack     CA
## 3 Jillian     MA

Conversion

One way to convert a vector to a matrix.

x = 1:16
x

##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16

dim(x) = c(4, 4)
x

##      [,1] [,2] [,3] [,4]
## [1,]    1    5    9   13
## [2,]    2    6   10   14
## [3,]    3    7   11   15
## [4,]    4    8   12   16

x = 1:5
v = as.matrix(x)
v

##      [,1]
## [1,]    1
## [2,]    2
## [3,]    3
## [4,]    4
## [5,]    5

Ways to convert from one object type to the other.

x = 1:3
mode(x) = "double"
typeof(x)

## [1] "double"

mode(x) = "character"
x

## [1] "1" "2" "3"

#Convert to string
number = 5
as.character(number)

## [1] "5"

#The strtoi() function converts from string to integer
x = c("1", "2", "3", "4")
x

## [1] "1" "2" "3" "4"

strtoi(x)

## [1] 1 2 3 4

Apply

sapply(x, f) applies function f() to every element in x and returns a vector/matrix as a result.

a = function(z) return(c(z, z^2))
a(1:4)

## [1]  1  2  3  4  1  4  9 16

result = sapply(1:4, a)
result

##      [,1] [,2] [,3] [,4]
## [1,]    1    2    3    4
## [2,]    1    4    9   16

class(result)

## [1] "matrix"

Usage of apply(m, dimcode, f, fargs)

m is a matrix
dimcode is dimension. 1 if applied to rows and 2 if applied to columns
f is function to be applied
fargs is an optional set of arguments to be supplied to f.

If the function returns k components, then the result of apply() will have k rows.

z = matrix(seq(1, 6), nrow = 3)
z

##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6

apply(z, 2, mean)

## [1] 2 5

f = function(x) x/c(2, 8)
y = apply(z, 1, f)
y

##      [,1]  [,2] [,3]
## [1,]  0.5 1.000 1.50
## [2,]  0.5 0.625 0.75

t(y)

##      [,1]  [,2]
## [1,]  0.5 0.500
## [2,]  1.0 0.625
## [3,]  1.5 0.750

Usage of apply(m, dimcode, f, fargs) #2!

f = function(IW, d){
  maj = sum(IW[1:d]) / d
  return(if(maj > 0.5) 1 else 0)
}
x = matrix(c(1, 0, 1, 1, 0, 1, 1, 1, 1, 0), nrow=2, ncol=5, byrow = T)
x

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    0    1    1    0
## [2,]    1    1    1    1    0

apply(x, 1, f, 3)

## [1] 1 1

apply(x, 1, f, 2)

## [1] 0 1

Searching

regexpr(<pattern>, <x>) finds index of the first occurence of <pattern> in <x>:

regexpr("a", c("cat", "dag", "wallet", "average", "status"))

## [1] 2 2 2 1 3
## attr(,"match.length")
## [1] 1 1 1 1 1
## attr(,"useBytes")
## [1] TRUE

gregexpr(<pattern>, <x>) finds index of every occurence of <pattern> in <x>

# Finds index of every period
gregexpr("\\.", c("1244.34234.234234", "342.24323.12414"))

## [[1]]
## [1]  5 11
## attr(,"match.length")
## [1] 1 1
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1]  4 10
## attr(,"match.length")
## [1] 1 1
## attr(,"useBytes")
## [1] TRUE

subset() can be used to filter out NAs

x = c(6, 1:3, NA, 12)
x[x > 5]

## [1]  6 NA 12

subset(x, x > 5)

## [1]  6 12

match() finds the index of the first match in the second vector of each element in the first vector.

v1 = c("Cow", "Owl", "Greater")
v2 = c("Cheetah", "Owl", "Mountain", "Greater")
match(v1, v2)

## [1] NA  2  4

grep() searches for a specified substring pattern in a vector x of strings. It returns the indices where the pattern occurs. Setting value = TRUE in grep() allows one to return the whole line containing the pattern.

grep("Pole", c("Equation", "North Pole", "South Pole"))

## [1] 2 3

grep("pole", c("Equator", "North Pole", "South Pole"))

## integer(0)

grepl() searches for a specified substring pattern in a vector x of strings. It returns TRUE where pattern occurs and FALSE where it does not.

grepl("Pole", c("Equation", "North Pole", "South Pole"))

## [1] FALSE  TRUE  TRUE

Application: using grep(), readLines(), and read.table() to only read a file that contains certain dates in its first column. In grep syntax, “^” means the line has to start with the pattern it is searching for. [1, 2] means that either 1 or 2 can be in that position in the pattern. value = TRUE, means that instead of returning indices, the whole line containing the pattern is returned.

f = file("data_sets/household_power_consumption.txt")
data = read.table(text=grep("^[1, 2]/2/2007", readLines(f), value = TRUE),
                  sep = ";", na.strings="?", header=TRUE, col.names = c("Date", "Time",
                  "Global_active_power", "Global_reactive_power", "Voltage",
                  "Global_intensity", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3"))
hist(data$Global_active_power, breaks=12, xlab="Global Active Power (kilowatts)",
     ylab="Frequency", main="Global Active Power", col="red")

any() reports whether any arguments in vector is true.

x = 1:10 
any(x > 8)

## [1] TRUE

any(x > 88)

## [1] FALSE

which() returns the indices for which a condition is fulfilled in a vector.

z = c(5, 2, -3, 8)
which(z*z > 8)

## [1] 1 3 4

Determines index of maximum/minimum value in vector using which()

x = c(1, 4, 5, -5, 99, 3)
which.max(x)

## [1] 5

which.min(x)

## [1] 4

Testing vector inequality using all().

x = 1:3
y = c(1, 3, 4)
x == y

## [1]  TRUE FALSE FALSE

all(x == y)

## [1] FALSE

Identical() tests for both value and type equality.

x = 1:3
y = c(1, 2, 3)
identical(x, y)

## [1] FALSE

typeof(x)

## [1] "integer"

typeof(y)

## [1] "double"

Matrices

Matrices are vectors with special class attributes. By default, elements are stored in column-major order. One can specify during matrix creation to have elements stored in row-major order. When doing matrix addition or subtraction, the number of elements must be equal to each other for an element-wise operation.

x = matrix(seq(1, 4, 1), nrow=2)
x

##      [,1] [,2]
## [1,]    1    3
## [2,]    2    4

y = x + 10:13
y

##      [,1] [,2]
## [1,]   11   15
## [2,]   13   17

x = matrix(seq(1, 4, 1), nrow=2, byrow = T)
x

##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4

Using col() and row()

z = matrix(3:8, nrow = 3)
row(z)

##      [,1] [,2]
## [1,]    1    1
## [2,]    2    2
## [3,]    3    3

col(z)

##      [,1] [,2]
## [1,]    1    2
## [2,]    1    2
## [3,]    1    2

Replacing a row with a certain value

y = matrix(1:6, nrow = 3)
y[2, ] = 0

Remvoing rows and columns from matrices

y = matrix(1:6, nrow = 3)
y[-2, ]

##      [,1] [,2]
## [1,]    1    4
## [2,]    3    6

y[, -2]

## [1] 1 2 3

Assignment of submatrices to matrices

x = matrix(nrow = 3, ncol = 3)
y = matrix(c(4, 5, 2, 3), nrow = 2)
x[2:3, 2:3] = y
x

##      [,1] [,2] [,3]
## [1,]   NA   NA   NA
## [2,]   NA    4    2
## [3,]   NA    5    3

Initializing a matrix, then assigning value to each element

y = matrix(nrow=2, ncol=2)
y

##      [,1] [,2]
## [1,]   NA   NA
## [2,]   NA   NA

y[1, 1] = 1
y[2, 1] = 2
y

##      [,1] [,2]
## [1,]    1   NA
## [2,]    2   NA

#Initializing a matrix of zeros
y = matrix(0, nrow=2, ncol=2)
y

##      [,1] [,2]
## [1,]    0    0
## [2,]    0    0

Matrices: col() and row() functions.

z = matrix(3:8, nrow=3)
row(z)

##      [,1] [,2]
## [1,]    1    1
## [2,]    2    2
## [3,]    3    3

col(z)

##      [,1] [,2]
## [1,]    1    2
## [2,]    1    2
## [3,]    1    2

More complicated matrix subsetting.

m = matrix(seq(1, 6), nrow=3)
m

##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6

m[m[,1] > 1 & m[, 2] > 5, ]

## [1] 3 6

which(m > 2)

## [1] 3 4 5 6

which(m > 2, arr.ind = T)

##      row col
## [1,]   3   1
## [2,]   1   2
## [3,]   2   2
## [4,]   3   2

Matrix element assignment and row/column removal.

x = matrix(nrow=3, ncol=3)
y = matrix(c(4, 5, 2, 3), nrow=2)
y

##      [,1] [,2]
## [1,]    4    2
## [2,]    5    3

x[2:3, 2:3] = y
x

##      [,1] [,2] [,3]
## [1,]   NA   NA   NA
## [2,]   NA    4    2
## [3,]   NA    5    3

y[-2, ]

## [1] 4 2

Finding the column-wise/row-wise means using colMeans(x) or rowMeans(x)

x = matrix(c(1, 0, 1, 1, 0, 1, 1, 1, 1, 0), nrow=2, ncol=5, byrow = T)
x

##      [,1] [,2] [,3] [,4] [,5]
## [1,]    1    0    1    1    0
## [2,]    1    1    1    1    0

colMeans(x)

## [1] 1.0 0.5 1.0 1.0 0.0

rowMeans(x)

## [1] 0.6 0.8

Retrieving dimensions of vectors/matrix distinction.

z = matrix(1:8, nrow=4)
z

##      [,1] [,2]
## [1,]    1    5
## [2,]    2    6
## [3,]    3    7
## [4,]    4    8

str(z)

##  int [1:4, 1:2] 1 2 3 4 5 6 7 8

length(z)

## [1] 8

attributes(z)

## $dim
## [1] 4 2

dim(z)

## [1] 4 2

nrow(z)

## [1] 4

ncol(z)

## [1] 2

Appending columns or rows to matrix using cbind() or rbind().

one = c(1, 1, 1, 1)
z = matrix(seq(1, 8), nrow=4)
z = cbind(one, z)
z

##      one    
## [1,]   1 1 5
## [2,]   1 2 6
## [3,]   1 3 7
## [4,]   1 4 8

z = cbind(2, z)
z

##        one    
## [1,] 2   1 1 5
## [2,] 2   1 2 6
## [3,] 2   1 3 7
## [4,] 2   1 4 8

z = rbind(6, z)
z

##        one    
## [1,] 6   6 6 6
## [2,] 2   1 1 5
## [3,] 2   1 2 6
## [4,] 2   1 3 7
## [5,] 2   1 4 8

cbind()

## NULL

How to avoid dimensionality reduction when extracting a row from a matrix.

z = matrix(1:8, nrow=4)
r = z[2, ]
r

## [1] 2 6

attributes(r) #r is a vector

## NULL

r = z[2,, drop=FALSE]
r

##      [,1] [,2]
## [1,]    2    6

attributes(r) #r is now a matrix

## $dim
## [1] 1 2

Assigning colnames and rownames to matrices

z = matrix(1:4, nrow=2)
colnames(z) = c("a", "b")
rownames(z) = c("c", "d")
z

##   a b
## c 1 3
## d 2 4

Factors

Using tapply() and split() with factors. The inputs for tapply() are tapply(vector, factor(s), function). split() performs the same task as tapply() except for applying a function. The inputs for split() are split(vector, factor(s)). split() returns a list of the splits.

ages = c(25, 26, 55, 37, 21, 42)
#Affils must have the same length as ages
affils = c("R", "D", "D", "R", "U", "D")
#affils is automatically converted to and treated as a factor by tapply()
tapply(ages, affils, mean)

##  D  R  U 
## 41 31 21

#tapply() when there are more than 1 factor 
d = data.frame(gender=c("M", "M", "F", "M", "F", "F"),  age=c(47, 59, 21, 32, 33, 24), income=c(55000, 88000, 32450, 76500, 123000, 45650))
d

##   gender age income
## 1      M  47  55000
## 2      M  59  88000
## 3      F  21  32450
## 4      M  32  76500
## 5      F  33 123000
## 6      F  24  45650

d$over25 = ifelse(d$age>25, 1, 0)
d

##   gender age income over25
## 1      M  47  55000      1
## 2      M  59  88000      1
## 3      F  21  32450      0
## 4      M  32  76500      1
## 5      F  33 123000      1
## 6      F  24  45650      0

tapply(d$income, list(d$gender, d$over25), mean)

##       0         1
## F 39050 123000.00
## M    NA  73166.67

#split()
split(d$income, list(d$gender, d$over25))

## $F.0
## [1] 32450 45650
## 
## $M.0
## numeric(0)
## 
## $F.1
## [1] 123000
## 
## $M.1
## [1] 55000 88000 76500

#Another application of split(): finding indices
g = c("M", "F", "F", "I", "M", "M", "F")
split(1:7, g)

## $F
## [1] 2 3 7
## 
## $I
## [1] 4
## 
## $M
## [1] 1 5 6

Generating factor levels. gl(number of levels, number of replications, optional character label vector).

gl(2, 3, labels=c("Male", "Female"))

## [1] Male   Male   Male   Female Female Female
## Levels: Male Female

Creating a factor with repeated elements

g = factor(rep(c(0, 1), each=12))

Creating, examining, and manipulating factors

x = c(5, 12, 13, 12)
xf = factor(x)
xf

## [1] 5  12 13 12
## Levels: 5 12 13

str(xf) #str() displays internal structure of R object

##  Factor w/ 3 levels "5","12","13": 1 2 3 2

unclass(xf)

## [1] 1 2 3 2
## attr(,"levels")
## [1] "5"  "12" "13"

length(xf)

## [1] 4

#Adding new value entry to factor. Cannot add new value that is not a level. 
xff = factor(x, levels=c(5, 12, 13, 88))
xff

## [1] 5  12 13 12
## Levels: 5 12 13 88

xff[2] = 88
xff

## [1] 5  88 13 12
## Levels: 5 12 13 88

#Transforming a vector to a factor
x = c("Jan", "Feb", "Jan", "April")
as.factor(x)

## [1] Jan   Feb   Jan   April
## Levels: April Feb Jan

Vectors

Filtering on matrices

x = matrix(c(1, 2, 3, 2, 3, 4), nrow = 3)
x[x[, 2] >= 3, ]

##      [,1] [,2]
## [1,]    2    3
## [2,]    3    4

z = c(5, 12, 13)
x[z %% 2 == 1, ]

##      [,1] [,2]
## [1,]    1    2
## [2,]    3    4

Filtering on vectors

z = c(5, 2, -3, 8)
z*z > 8

## [1]  TRUE FALSE  TRUE  TRUE

Transforming a vector of TRUE and FALSE to 1 and 0.

x = c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE)
x

## [1]  TRUE  TRUE FALSE  TRUE  TRUE FALSE FALSE FALSE  TRUE

as.numeric(x)

## [1] 1 1 0 1 1 0 0 0 1

Remove all the NA from a vector

x = c(1, 2, 3, NA, 3, 2, NA)
is.na(x)

## [1] FALSE FALSE FALSE  TRUE FALSE FALSE  TRUE

x[!is.na(x)]

## [1] 1 2 3 3 2

Flattening effect of c().

c(5, 2, c(1.5, 6))

## [1] 5.0 2.0 1.5 6.0

Vector element can be assigned names and used as reference to corresponding element.

x = c(1, 2, 3)
names(x)

## NULL

names(x) = c("a", "b", "c")
x

## a b c 
## 1 2 3

x["b"]

## b 
## 2

names(x) = NULL
x

## [1] 1 2 3

Vectors names can alternatively be assigned this way:

x = c()
x["a"] = 1
x["b"] = 2
x

## a b 
## 1 2

All elements in a vector must be of the same mode, or variable type. If elements of different modes are created during vector creation, then R will automatically convert elements to same mode.

x = c(2, "j")
typeof(x)

## [1] "character"

Vector initialization of specificed length.

x = vector(length=4)
x

## [1] FALSE FALSE FALSE FALSE

Given a vector, create a new vector with repeated elements.

x = c(4, 2, 17, 5)
y = x[c(1, 1, 3)]
y

## [1]  4  4 17

seq() can be used to (1) generate numeric vector of evenly-spaced numbers from minimum to maximum or (2) genereate a sequence of numbers from 1 to length of input vector.

qs = seq(0, 20, 2)
qs

##  [1]  0  2  4  6  8 10 12 14 16 18 20

x = c(5, 12, 13, 14)
qs = seq(x)
qs

## [1] 1 2 3 4

x = NULL
qs = seq(x)
#Empty integer vector
qs

## integer(0)

#Creating a sequence of values of specificed length
lambdas = seq(0, 10, len=10)
lambdas

##  [1]  0.000000  1.111111  2.222222  3.333333  4.444444  5.555556  6.666667
##  [8]  7.777778  8.888889 10.000000

Removing elements from vector using negative indexing

x = c(5, 12, 13, 3, 6, 0, 1, 15, 16, 8, 88)
x[-1]

##  [1] 12 13  3  6  0  1 15 16  8 88

x[-length(x)]

##  [1]  5 12 13  3  6  0  1 15 16  8

Miscellaneous

Given a vector, generate indices

Given a function with multiple inputs and a list of values for each type of input, outer() feeds every unique combination of input values into a function and outputs a matrix.

x = 1:3
y = 1:3
f1 = function(x, y) return(x*y)
outer(x, y, f1)

##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    2    4    6
## [3,]    3    6    9

To generate all unique combinations of elements in two vectors.

expand.grid(seq(3), seq(3))

##   Var1 Var2
## 1    1    1
## 2    2    1
## 3    3    1
## 4    1    2
## 5    2    2
## 6    3    2
## 7    1    3
## 8    2    3
## 9    3    3

Reorganizing dataframe feature variables: sometimes tasks such as boxplotting requires reformatting an existing dataframe. In a $m$ by $n$ data matrix, if you want to combine features $i$ and $j$ into one column and create another column indicating whether the values are of feature $i$ or feature $j$, you can use melt() in reshape2 package. Let us work through an example on the diamonds dataset from ggplot2.

library(reshape2)
library(ggplot2)

head(diamonds)

## Source: local data frame [6 x 10]
## 
##   carat       cut  color clarity depth table price     x     y     z
##   (dbl)    (fctr) (fctr)  (fctr) (dbl) (dbl) (int) (dbl) (dbl) (dbl)
## 1  0.23     Ideal      E     SI2  61.5    55   326  3.95  3.98  2.43
## 2  0.21   Premium      E     SI1  59.8    61   326  3.89  3.84  2.31
## 3  0.23      Good      E     VS1  56.9    65   327  4.05  4.07  2.31
## 4  0.29   Premium      I     VS2  62.4    58   334  4.20  4.23  2.63
## 5  0.31      Good      J     SI2  63.3    58   335  4.34  4.35  2.75
## 6  0.24 Very Good      J    VVS2  62.8    57   336  3.94  3.96  2.48

Say we are interested in combining columns x, y, and z into one column while creating another column to indicate whether the value in the combined column comes from x, y, or z, we would:

dat2 = melt(diamonds, id.vars = c("carat", "cut", "color", "clarity", "depth", "table", "price"), measure.vars = c("x", "y", "z"))

head(dat2)

##   carat       cut color clarity depth table price variable value
## 1  0.23     Ideal     E     SI2  61.5    55   326        x  3.95
## 2  0.21   Premium     E     SI1  59.8    61   326        x  3.89
## 3  0.23      Good     E     VS1  56.9    65   327        x  4.05
## 4  0.29   Premium     I     VS2  62.4    58   334        x  4.20
## 5  0.31      Good     J     SI2  63.3    58   335        x  4.34
## 6  0.24 Very Good     J    VVS2  62.8    57   336        x  3.94

The combined column is named value and the column designating x, y, or z assignments is named variable. id.vars parameter specifies columns that remains unchanged while measure.vars parameter specifies names of columns to combine.

To suppress warnings of a function

suppressWarnings(<function>)

Converting a string to a variable name

assign("x", 5)

invisible() can be used to suppress output

invisible(max(2, 4))

pmax() or pmin() allows one to calculate maxima or minima of all input values in parallel.

set.seed(1)
n = 10
mat = matrix(rnorm(2*n), ncol = 2)
head(mat)

##            [,1]        [,2]
## [1,] -0.6264538  1.51178117
## [2,]  0.1836433  0.38984324
## [3,] -0.8356286 -0.62124058
## [4,]  1.5952808 -2.21469989
## [5,]  0.3295078  1.12493092
## [6,] -0.8204684 -0.04493361

maxs = pmax(mat[,1], mat[, 2])
maxs

##  [1]  1.51178117  0.38984324 -0.62124058  1.59528080  1.12493092
##  [6] -0.04493361  0.48742905  0.94383621  0.82122120  0.59390132

The locator() function will return the x and y positions of the position you click your mouse on in the plot window. To get the x and y positions of two separate mouse-clicks on the graph:

locator(2)

To download a file from the internet to a destination folder.

fileUrl = "<url>"
download.file(fileUrl, destfile = "<path_destination_file>", method = "curl")

To create a vector containing a specificed number of evenly-spaced points between 0 and 1 using ppoints().

#Create a vector of 10 points evenly spaced between 0 and 1.
ppoints(10)

##  [1] 0.06097561 0.15853659 0.25609756 0.35365854 0.45121951 0.54878049
##  [7] 0.64634146 0.74390244 0.84146341 0.93902439

To get the arguments for a function, use args(). To get the help files for a function, one way is to use help().

args(qnorm)

## function (p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE) 
## NULL

help(qnorm)

expand.grid() creates a dataframe of all combinations of parameters fed to it:

ms=2^c(1:3)
ks=seq(1,9,2)
params = expand.grid(k=ks,m=ms)
params

##    k m
## 1  1 2
## 2  3 2
## 3  5 2
## 4  7 2
## 5  9 2
## 6  1 4
## 7  3 4
## 8  5 4
## 9  7 4
## 10 9 4
## 11 1 8
## 12 3 8
## 13 5 8
## 14 7 8
## 15 9 8

replicate(n, expr) is used for repeated evaluation of an expression.

group = replicate(3, rnorm(5, 5, 1))
group

##          [,1]     [,2]     [,3]
## [1,] 5.918977 4.943871 6.358680
## [2,] 5.782136 4.844204 4.897212
## [3,] 5.074565 3.529248 5.387672
## [4,] 3.010648 4.521850 4.946195
## [5,] 5.619826 5.417942 3.622940

gsub() performs replacements for all patterns in a character vector.

x = c("Bad stuff happens", "Bad food", "Bad air")
gsub("Bad", "Good", x)

## [1] "Good stuff happens" "Good food"          "Good air"

Integration.

integrand = function(x){x^2}
area = integrate(f = integrand, lower = -1, upper = 1)
area

## 0.6666667 with absolute error < 7.4e-15

Tests to see if all elements in one data structure is equal to all elements in the other.

x = runif(10, 0, 10)
y = x
all.equal(y, x)

## [1] TRUE

If $x$ is a matrix and $y$ is a vector, then sweep(x, 1, y, FUN="*") applies the function * to x[i, ] and element y[i], and in this case x[i, ]*y[i]. If the second argument in sweep() is 2 instead of 1, then sweep() applies this column-wise.

set.seed(2)
data = 1:10
x = matrix(sample(data, 9, replace = TRUE), byrow = TRUE, nrow = 3)
x

##      [,1] [,2] [,3]
## [1,]    2    8    6
## [2,]    2   10   10
## [3,]    2    9    5

y = sample(c(-1, 1), nrow(x), replace=TRUE)
y

## [1]  1  1 -1

z = sweep(x, 1, y, FUN="*")
z

##      [,1] [,2] [,3]
## [1,]    2    8    6
## [2,]    2   10   10
## [3,]   -2   -9   -5

Set operations

x = c(1, 2, 4, 5, 6, 7, 10)
y = c(5, 3, 2, 6, 24, 34)
union(x, y)

##  [1]  1  2  4  5  6  7 10  3 24 34

intersect(x, y)

## [1] 2 5 6

#Subtract y from x
setdiff(x, y)

## [1]  1  4  7 10

#Subtract x from y
setdiff(y, x)

## [1]  3 24 34

setequal(x, y)

## [1] FALSE

Estimate of the memory used to store R object.

object.size(x)

Returns a vector, data frame, or array but with duplicate elements/rows removed.

x = c(rep(1, 3), rep(2, 5))
x

## [1] 1 1 1 2 2 2 2 2

unique(x)

## [1] 1 2

Find the factorial of a number.

factorial(3)

## [1] 6

crossprod() is a function that returns the inner-crossproduct of a vector with itself.

n = 1:10
x = sample(n, 25, replace=TRUE)
y = sample(n, 25, replace = TRUE)
crossprod(y-x)

##      [,1]
## [1,]  440

Check the version of a R package.

packageVersion("ggplot2")

## [1] '2.1.0'

Importing dates from character format using as.Date(). Returns a Date object and accepts a character vector.

dates <- c("05/27/84", "07/07/05")
#Format refers to format of input date format. 
betterDates <- as.Date(dates, format = "%m/%d/%y")
betterDates

## [1] "1984-05-27" "2005-07-07"

months = format(betterDates, "%m")
months

## [1] "05" "07"

class(betterDates)

## [1] "Date"

Finding the rank of each value in a vector:

x = c(1, 4, 3, 6, 10)
rank(x)

## [1] 1 3 2 4 5

range() function returns a vector containing the minimum and maximum value of a given vector or vectors.

x = rpois(50, 3)
range(x, na.rm=TRUE)

## [1] 0 7

Save dataframe to a file. Assume we have a data frame called expressions.

save(expressions, file="expressions.Rda")
load("expressions.Rda")

Viewing the current setting of a function.

#Examples
par("bg")

## [1] "white"

par("mar")

## [1] 5.1 4.1 4.1 2.1

par("mfrow")

## [1] 1 1

View list of parameters for a function. Example is par().

parameters = par()
class(parameters)

## [1] "list"

length(parameters)

## [1] 72

names(parameters)

##  [1] "xlog"      "ylog"      "adj"       "ann"       "ask"      
##  [6] "bg"        "bty"       "cex"       "cex.axis"  "cex.lab"  
## [11] "cex.main"  "cex.sub"   "cin"       "col"       "col.axis" 
## [16] "col.lab"   "col.main"  "col.sub"   "cra"       "crt"      
## [21] "csi"       "cxy"       "din"       "err"       "family"   
## [26] "fg"        "fig"       "fin"       "font"      "font.axis"
## [31] "font.lab"  "font.main" "font.sub"  "lab"       "las"      
## [36] "lend"      "lheight"   "ljoin"     "lmitre"    "lty"      
## [41] "lwd"       "mai"       "mar"       "mex"       "mfcol"    
## [46] "mfg"       "mfrow"     "mgp"       "mkh"       "new"      
## [51] "oma"       "omd"       "omi"       "page"      "pch"      
## [56] "pin"       "plt"       "ps"        "pty"       "smo"      
## [61] "srt"       "tck"       "tcl"       "usr"       "xaxp"     
## [66] "xaxs"      "xaxt"      "xpd"       "yaxp"      "yaxs"     
## [71] "yaxt"      "ylbias"

List files in current directory.

dir()

##  [1] "analysisReport.txt"       "Bash.ipynb"              
##  [3] "C"                        "data_sets"               
##  [5] "file.csv"                 "Git_Tutorial.ipynb"      
##  [7] "Git.txt"                  "HTML:CSS"                
##  [9] "Java_Programs"            "Java.docx"               
## [11] "Java.ipynb"               "Java.txt"                
## [13] "Latex"                    "latex-guide.pdf"         
## [15] "Latex.txt"                "note.txt"                
## [17] "python"                   "Python.ipynb"            
## [19] "Python.txt"               "R_cache"                 
## [21] "R_data"                   "R_figure"                
## [23] "R_package.txt"            "R-plotting.html"         
## [25] "R.html"                   "R.R"                     
## [27] "R.Rmd"                    "R.txt"                   
## [29] "results.txt"              "rmarkdown-cheatsheet.pdf"
## [31] "Rprof.out"                "rsconnect"               
## [33] "SQL.ipynb"                "Unix.txt"

Two ways of timing a program - one with proc.time(), one with system.time().

g = rnorm(1e5)
h = rep(NA, 1e5)

#Stop watch style
start = proc.time()
for(i in 1:1e5){
  h[i] = g[i] + 1
}
end = proc.time()
end - start

##    user  system elapsed 
##   0.290   0.005   0.302

#System
system.time(for(i in 1:1e5){h[i]=g[i]+1})

##    user  system elapsed 
##   0.128   0.004   0.133

Install package from github

library(devtools)
install_github('epurdom/clusterCells')

Install a bioconductor package

library(BiocInstaller)
biocLite(<package>)

Get help files of a package.

help(cluster01)

Check if a package is installed. Returns boolean value.

require("devtools")

Check current version of R

R.Version()

## $platform
## [1] "x86_64-apple-darwin13.4.0"
## 
## $arch
## [1] "x86_64"
## 
## $os
## [1] "darwin13.4.0"
## 
## $system
## [1] "x86_64, darwin13.4.0"
## 
## $status
## [1] ""
## 
## $major
## [1] "3"
## 
## $minor
## [1] "2.4"
## 
## $year
## [1] "2016"
## 
## $month
## [1] "03"
## 
## $day
## [1] "10"
## 
## $`svn rev`
## [1] "70301"
## 
## $language
## [1] "R"
## 
## $version.string
## [1] "R version 3.2.4 (2016-03-10)"
## 
## $nickname
## [1] "Very Secure Dishes"

Installing and loading R packages.

install.packages('dplyr')
library('dplyr')

Get current date in a more detailed format.

date()

## [1] "Wed Jan  4 19:32:12 2017"

Get current date in yyyy-mm-dd format.

Sys.Date()

## [1] "2017-01-04"

print() and cat() functions. Both functions are useful when one needs to print to the screen within the body of a function. The output of print() is numbered whereas the output of cat() is not.

print("abc")

## [1] "abc"

cat("abc\n") #\n is necessary to end on the next line. Default separater is a space.

## abc

cat("abc", "de\n")

## abc de

#Set separater between items. 
cat("abc", "de\n", sep="")

## abcde

cat("abc", "de\n", sep="\n")

## abc
## de

x = c(5, 12, 13, 8, 88)
cat(x, sep=c(".", ".", ".", "\n", "\n"))

## 5.12.13.8
## 88

A closure consists of a function that sets up a local variable and then creates another function that accesses that variable.<<- assigns value to the local variable.

counter = function(){
  ctr = 0
  f = function(){
    ctr <<- ctr + 1
    cat("this count currently has value", ctr, "\n") #cat() concentates variables of different modes together, with a space automatically included in between. 
  }
}
c1 = counter()
c2 = counter()
c1()

## this count currently has value 1

c1()

## this count currently has value 2

c2()

## this count currently has value 1

c2()

## this count currently has value 2

Writing upstairs. Normally, write access to variables at higher levels via the standard assignment operator is not possible. In order to write to variables at higher levels, one can use the “<<-” operator or the assign() function.

two = function(u){
  u <<- 2*u
  z = 2*z
}

x = 1
z = 3
two(x)
x

## [1] 1

## [1] 3

## [1] 2

In this example, because x was copied to the formal argument u, x did not change. z did not change because z is a global variable and code within the function two does not have write access to the variable z. u did not exist prior to calling two(), but it was created as a top-level variable by superassignment.

The superassignment operator assigns to the first variable it is called to assign to as it searches up the levels.

f = function(){
  inc = function(){x <<- x + 1}
  x = 3
  inc()
  return(x)
}
f()

## [1] 4

## [1] 1

Alternatively you could also perform superassignment using the assign() function.

x = 1
two = function(u){
  assign("u", 2*u, pos=.GlobalEnv) #pos=.GlobalEnv makes u a global variable
}
two(x)
u

## [1] 2

w = 12
f = function(y){
  d = 8
  h = function(){
    return(d*(w+y))
  }
  return(h())
}

Using ls() and ls.str() to display top-level environment

ls()
#More detailed
ls.str()

Using the formals() and body() functions to break down a function. formals() indicates the function argument where body() indicates the function expression.

g = function(x){
  return(x + 1)
}
formals(g)

## $x

body(g)

## {
##     return(x + 1)
## }

Functions can return functions.

g = function(){
  t = function(x) return(x^2)
  return(t)
}
g()

## function(x) return(x^2)
## <environment: 0x7fbd42f830d8>

Return statements in a R function can be written in two ways. return() will be slower than simply expressing the result. For clarity it is customary to include return() statements in the middle of code block and omit return at the very end of the code block. return() statements can return any R object, even functions!

return(x)

#Or

x

The get() function takes as an argument a character string representing the name of some object and returns the object of that name.

u = matrix(seq(1, 6), nrow=3)
u

##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6

h = get("u")
h

##      [,1] [,2]
## [1,]    1    4
## [2,]    2    5
## [3,]    3    6

class(h)

## [1] "matrix"

cut() function can be used to generate factors by determining which bin each element in x falls into. The form of cut() is cut(data_vector, binmarks, labels=F).User defined bins are inclusive of the last number but exclusive of the first number.c

z = c(1, 1, 1, 2, 3, 4, 5, 6)
binmarks = seq(0.0, 6, 1)
c1 = cut(z, binmarks, labels=F)
table(c1)

## c1
## 1 2 3 4 5 6 
## 3 1 1 1 1 1

List the files in a directory with list.files(), which returns a character vector.

list.files("./data")

Check if a directory exists, if it doesn’t, then create that directory

if (!file.exists("data")){
  dir.create("data")
}

Get and set current working directory.

getwd()
setwd("../")
#On Mac...
setwd("/Users/Documents")
#On
setwd("C:\\Users\\Calvin\\Downloads")

Rounding numbers

round(123.456, digits=2) #or simply round(123.456, 2)

## [1] 123.46

round(123.456, -2)

## [1] 100

#signif() rounds to the number of significant digits provided
signif(-123.456, 4)

## [1] -123.5

Paste0(x) is a shortcut for paste(x, sep=“”)

paste("a", "b")

## [1] "a b"

paste0("a", "b")

## [1] "ab"

Remove all environment variables

rm(list=ls())
x = c(0, 3, 2)
rm(x)

Download file from the internet

#tmpfile is the destination file
download.file("http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip", "tmpfile.zip")

Paste function

paste(c("Hello", "World!"))

## [1] "Hello"  "World!"

Paste and concatenate result into single string.

paste(c(1, 2, 3, 4))

## [1] "1" "2" "3" "4"

paste(c(1, 2, 3, 4), collapse = "")

## [1] "1234"

Using the break key word to break out of a loop. In this example, break is used to find the index of the first 1 in a vector

x = c(4, 5, 6, 1, 9)
first1 = function(x){
  for (i in 1:length(x)){
    if (x[i] == 1) break
  }
  return(i)
}
first1(x)

## [1] 4

R recycling. Applying an operation to two vectors requires them to be the same length. If they are not, R repeats the shorter one until it is as long as the longer vector.

x = seq(1, 5, 1)
x - 1

## [1] 0 1 2 3 4

y = matrix(seq(1, 6), nrow = 3)
y + c(1, 2)

##      [,1] [,2]
## [1,]    2    6
## [2,]    4    6
## [3,]    4    8

Operator precedence

i = 2
1:i-1

## [1] 0 1

1:(i-1)

## [1] 1

Rounding numbers in R.

x = c(1.2, 3.9, 0.4)
z = round(x)
z

## [1] 1 4 0

Using stop() in R as a check. In this example, we are stopping if the input c in function f is not a scalar.

x = c(1:3)
f = function(x, c){
  if(length(c) != 1) stop("vector c not allowed")
  else{
    return((x+c)^2)
  }
}
f(x, 2)

## [1]  9 16 25

NA means that the value exists but is not known. When operating on vectors that have NAs in them, it is possible to tell the operator to ignore the NAs.

x = c(88, NA, 12, 168)
mean(x)

## [1] NA

mean(x, na.rm=TRUE)

## [1] 89.33333

mode() vs class() vs typeof(). mode() and typeof() are similar and returns how an object is stored in memory, while class() returns an object’s abstract data type. A more detailed discussion of this can be found here.

x = matrix(1:16, nrow=4)
mode(x)

## [1] "numeric"

typeof(x)

## [1] "integer"

class(x)

## [1] "matrix"

NULL means that the value does not exist. NULL is useful in the application of building vectors.

z = NULL
for (i in 1:10) if (i %% 2 == 0) z = c(z, i) # %% is modulo operator
z

## [1]  2  4  6  8 10

u = NULL
length(u)

## [1] 0

v = NA
length(v)

## [1] 1

ifelse() allows one to write a condensed version of if else statements. ifelse() takes in 3 inputs - condition, value returned if condition is fulfilled, and value returned otherwise. ifelse() statements can be nested.

x = 1:10
y = ifelse(x %% 2 == 0, 5, 12) #if x%% 2 == 0, then 5, else 12
y

##  [1] 12  5 12  5 12  5 12  5 12  5

g <- c("M", "F", "F", "I", "M", "M", "F")
h = ifelse(g=="M", 1, ifelse(g=="F", 2, 3))
h

## [1] 1 2 2 3 1 1 2

diff() function by default calculates the difference between each element and the element before it in position.

u = c(1, 6, 7, 2, 3, 5)
diff(u)

## [1]  5  1 -5  1  2

sign() converts numbers in its argument vector to 1, 0, or -1, depending on whether they are positive, zero, or negative.

x = c(5, 1, -5, 1, 2)
sign(x)

## [1]  1  1 -1  1  1

Statistics/Machine Learning

Classical multidimensional scaling of a matrix. Requires as minimum input a dissimilarity matrix. By default returns a reduced feature vector (features = 2) for each sample.

library(tissuesGeneExpression)
data(tissuesGeneExpression)
d = dist(t(e))
mds = cmdscale(d)
plot(mds[,1], mds[,2])

Performing PCA. Usually want to perform PCA after calling scale() on data matrix. The principle components of scaled data are the columns of V in the svd of the same scaled data.

mat = scale(matrix(c(1, 2, 2, 5, 3, 7), nrow=2))
prcomp(mat)

## Standard deviations:
## [1] 1.732051 0.000000
## 
## Rotation:
##            PC1        PC2
## [1,] 0.5773503 -0.5773503
## [2,] 0.5773503  0.7886751
## [3,] 0.5773503 -0.2113249

Say we have a $n x m$ data matrix X, calling scale() on it will subtract the mean from each column and divide each column by its standard deviation.

mat = matrix(c(1, 2, 2, 5, 3, 7), nrow=2)
scale(mat)

Performing singular value decomposition by calling the svd() function, which returns a list containing U, S, and V (no transpose). S is a vector. To convert S to a diagonal matrix, use diag() function. The right singular vectors represent the features.

mat = matrix(c(1, 2, 2, 5, 3, 7), nrow=2)
mat

##      [,1] [,2] [,3]
## [1,]    1    2    3
## [2,]    2    5    7

result = svd(mat)
class(result)

## [1] "list"

result

## $d
## [1] 9.5899624 0.1806108
## 
## $u
##            [,1]       [,2]
## [1,] -0.3897782 -0.9209087
## [2,] -0.9209087  0.3897782
## 
## $v
##            [,1]       [,2]
## [1,] -0.2327012 -0.7826345
## [2,] -0.5614308  0.5928424
## [3,] -0.7941320 -0.1897921

#To diagonalize eigenvalues
diag(result$d)

##          [,1]      [,2]
## [1,] 9.589962 0.0000000
## [2,] 0.000000 0.1806108

To perform hierarchical clustering:

Calculate distance between all points using dist() function.
Perform hierarchical clustering with the hclust() function.

Code demonstration. Note that hierarchical clustering is not deterministic.

#Code for generating clusters.
set.seed(1234)
x = rnorm(12, mean=rep(1:3, each=4), sd=0.2)
y = rnorm(12, mean=rep(c(1, 2, 1), each=4), sd=0.2)
plot(x, y, col="blue", pch=19, cex=2)
text(x+0.05, y+0.05, labels=as.character(1:12))

#To perform hierarchical clustering. Default uses complete linkage. 
dataFrame = data.frame(x=x, y=y)
distxy = dist(dataFrame)
hClustering = hclust(distxy)
plot(hClustering)

#To determine the number of clusters/cluster assignments when tree is cut at a certain height specified by parameter h in 
cut = cutree(hClustering, h=1)
cut

##  [1] 1 1 1 1 2 2 2 2 3 3 3 3

#To plot dendrogram without labels
plot(as.dendrogram(hClustering))

#To perform hierarchical clustering using average linkage.
hClustering = hclust(distxy, method = "average")
plot(hClustering)

#To perform k-means clustering. The kmeans() function returns an object that contains many useful results. 
kmeansObj = kmeans(dataFrame, centers=3)
#Items in the kmeans object. 
names(kmeansObj)

## [1] "cluster"      "centers"      "totss"        "withinss"    
## [5] "tot.withinss" "betweenss"    "size"         "iter"        
## [9] "ifault"

#Cluster assignments
kmeansObj$cluster

##  [1] 3 3 3 3 1 1 1 1 2 2 2 2

#Number of iterations gone through
kmeansObj$iter

## [1] 2

#Centroids
kmeansObj$centers

##           x         y
## 1 1.9906904 2.0078229
## 2 2.8534966 0.9831222
## 3 0.8904553 1.0068707

#cluster centroids in KmeansObj$centers
par(mar=rep(0.2, 4))
plot(x, y, col=kmeansObj$cluster, pch=19, cex=2)
points(kmeansObj$centers, col=1:3, pch = 3, cex=3, lwd=3)

Chi-square test of independence with $\alpha = 0.05$.

library(MASS) #loads survey dataset
tbl = table(survey$Smoke, survey$Exer)
tbl

##        
##         Freq None Some
##   Heavy    7    1    3
##   Never   87   18   84
##   Occas   12    3    4
##   Regul    9    1    7

chisq.test(tbl)

## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect

## 
##  Pearson's Chi-squared test
## 
## data:  tbl
## X-squared = 5.4885, df = 6, p-value = 0.4828

Since the p-value $\geq$ 0.05, do not reject null hypothesis that smoking habit is independent of exercise level of students.

Using the quantile() function to produce sample quantiles corresponding to given probabilities.

set.seed(1)
x = rpois(100, 4)
quantile(x, probs = c(0.25, 0.5, 0.75))

## 25% 50% 75% 
##   3   4   5

PAM clustering via pam() function, which accepts as input a distance matrix. In this example, result is an object and its components can be accessed via result$.

require(cluster)
group1 = replicate(3, rnorm(5, 5, 1))
group2 = replicate(3, rnorm(5, 2, 1))
data = rbind(t(group1), t(group2))
result = pam(data, 2)
result

## Medoids:
##      ID                                             
## [1,]  2 6.980400 4.632779 3.955865 5.569720 4.864945
## [2,]  6 2.291446 1.556708 2.001105 2.074341 1.410479
## Clustering vector:
## [1] 1 1 1 2 2 2
## Objective function:
##    build     swap 
## 1.654503 1.654503 
## 
## Available components:
##  [1] "medoids"    "id.med"     "clustering" "objective"  "isolation" 
##  [6] "clusinfo"   "silinfo"    "diss"       "call"       "data"

Generate a random permutation of a given vector

x = 1:10
sample(x)

##  [1]  3  9  4  6  7 10  1  2  5  8

Uniformly sample with/without replacement from a vector of elements.

set.seed(1)
x = rnorm(10, 4, 1)
x

##  [1] 3.373546 4.183643 3.164371 5.595281 4.329508 3.179532 4.487429
##  [8] 4.738325 4.575781 3.694612

sample(x, 5)

## [1] 3.694612 4.183643 3.179532 3.373546 4.575781

sample(x, 5, replace = TRUE)

## [1] 5.595281 3.373546 5.595281 4.575781 5.595281

Given a sample x feature matrix, the function dist() will compute the distance (default = Euclidean) matrix for the samples. A distance matrix can have dimensions sample x sample and lists the distance between each sample. One can use the function image() to visualize the distance matrix.

#prepare mxn matrix
group1 = replicate(3, rnorm(5, 5, 1))
group2 = replicate(3, rnorm(5, 2, 1))
data = rbind(t(group1), t(group2))
dim(data)

## [1] 6 5

#Most concise
dist(data)

##          1        2        3        4        5
## 2 3.197198                                    
## 3 2.751214 2.557937                           
## 4 8.027959 7.414114 6.490518                  
## 5 7.362879 7.363116 5.776047 3.056570         
## 6 7.729928 7.575169 6.377825 1.812434 1.748685

#To see how many distances are calculated
length(dist(data))

## [1] 15

#With diagonal entries
dist(data, diag=T)

##          1        2        3        4        5        6
## 1 0.000000                                             
## 2 3.197198 0.000000                                    
## 3 2.751214 2.557937 0.000000                           
## 4 8.027959 7.414114 6.490518 0.000000                  
## 5 7.362879 7.363116 5.776047 3.056570 0.000000         
## 6 7.729928 7.575169 6.377825 1.812434 1.748685 0.000000

#Complete 
dist(data, diag=T, upper = T)

##          1        2        3        4        5        6
## 1 0.000000 3.197198 2.751214 8.027959 7.362879 7.729928
## 2 3.197198 0.000000 2.557937 7.414114 7.363116 7.575169
## 3 2.751214 2.557937 0.000000 6.490518 5.776047 6.377825
## 4 8.027959 7.414114 6.490518 0.000000 3.056570 1.812434
## 5 7.362879 7.363116 5.776047 3.056570 0.000000 1.748685
## 6 7.729928 7.575169 6.377825 1.812434 1.748685 0.000000

#Visualize distance matrix
image(as.matrix(dist(data)))

Here are the key functions that a heatmap performs:

Plots values in terms of colors
Clusters by rearranging both order of features and samples
Colors are assigned based on relative values within a row, so to get clear looking clusters, samples should be organized by column for heatmap().
By default plots dendrograms for (1) rows represent samples and (2) columns represent samples
In general m > n for a m x n matrix for heatmap to work.

Input: m x n matrix with or without colnames/rownames

#prepare mxn matrix
group1 = replicate(3, rnorm(5, 5, 1))
group2 = replicate(3, rnorm(5, 2, 1))
data = rbind(t(group1), t(group2))
data

##          [,1]      [,2]     [,3]     [,4]      [,5]
## [1,] 4.292505 5.3645820 5.768533 4.887654 5.8811077
## [2,] 5.398106 4.3879736 5.341120 3.870637 6.4330237
## [3,] 6.980400 4.6327785 3.955865 5.569720 4.8649454
## [4,] 4.401618 1.9607600 2.689739 2.028002 1.2567268
## [5,] 2.188792 0.1950414 3.465555 2.153253 4.1726117
## [6,] 2.475510 1.2900536 2.610726 1.065902 0.7463666

#matrix labels
sampleNames = vector(length=6)
for(i in seq(6)){
  sampleNames[i] = paste("Sample", i)
}
rownames(data) = sampleNames
data = t(data) #Want feature by sample because then samples will be clustered.

#Plot heatmap
heatmap(data)

#Plot heatmap with color key and greater flexibility than R's built-in heatmap(). In heatmap.2()'s heatmaps, colors are based on absolute value, so it does not matter whether samples are by row or by column. scale="none" means no centering of values by row or by column. 
require(gplots)
heatmap.2(data, col=redgreen(75), scale="none", density.info="none", trace="none")

Compute all permutations of a subset of a set with repeats.Returns a matrix with each row representing a permutation. In this example, 3 is the size of set of elements to permutate, 2 is the size of subset, and x is the vector containing elements.

library(gtools)
x = c("A", "G", "C")
permutations(3, 2, x, repeats.allowed = TRUE)

##       [,1] [,2]
##  [1,] "A"  "A" 
##  [2,] "A"  "C" 
##  [3,] "A"  "G" 
##  [4,] "C"  "A" 
##  [5,] "C"  "C" 
##  [6,] "C"  "G" 
##  [7,] "G"  "A" 
##  [8,] "G"  "C" 
##  [9,] "G"  "G"

Calculating correlation

x = seq(1, 10)
y = seq(1, 10) + runif(10, -0.5, 0.5)

cor(x, y)

## [1] 0.99897

m = matrix(c(x, y), nrow = 10, ncol = 2)
m

##       [,1]     [,2]
##  [1,]    1 1.114645
##  [2,]    2 2.057160
##  [3,]    3 2.828777
##  [4,]    4 3.953131
##  [5,]    5 5.000441
##  [6,]    6 5.680866
##  [7,]    7 7.029631
##  [8,]    8 7.575276
##  [9,]    9 8.777756
## [10,]   10 9.712700

#If cor() is called on a matrix, then it will return a correlation matrix
cor(m)

##         [,1]    [,2]
## [1,] 1.00000 0.99897
## [2,] 0.99897 1.00000

Find parameters of F-distribution that fit the data using fitFDist(), which will return a list of the scale factor (shifts plot horizontally) and the df2 factor (controls shape). Suppose X is the data we are trying to fit, and we known that the data has a degree of freedom of 14.

library(limma)
fitFDist(x, 14)

Computing variance of values in a vector.

x = c(1, 2, 1.4, 0.5)
var(x)

## [1] 0.4025

How to determine if the data fits any known distribution using Q-Q plots. In this example, we are going to see if data sampled from a poisson distribution indeed fits the poisson distribution.

x = rpois(100, 4) #Generates samples drawn from poisson distribution with lambda = 4. 
ps = seq(0, 1, len=100) #Generates CDF probabilities to feed to qpois()
qpoisq = qpois(ps, 4) #Generates values of Poisson random variable that correspond to cumulative probability
qqplot(qpoisq, x)

How to determine if the data fits a normal distribution using Q-Q plots.

x = rnorm(100, 5, 2) #Generates 100 values drawn from normal distribution with mean=5 and standard deviation=2
qqnorm(x) #Creates Normal Q-Q Plot
qqline(x) #Fits a line to the Normal Q-Q plot

Application: Generating a covariance matrix given known variance of 1 and user provided covariance.

makecov = function(rho, n){
  m = matrix(nrow=n, ncol=n)
  m = ifelse(col(m)==row(m), 1, rho)
  return(m)
}
cov = makecov(0.2, 3)
cov

##      [,1] [,2] [,3]
## [1,]  1.0  0.2  0.2
## [2,]  0.2  1.0  0.2
## [3,]  0.2  0.2  1.0

Binomial distribution density. $p = \binom{N}{x}p^{x}(1-p)^{N-x}$.

N = 10
p = 0.5
x = 1
dbinom(x, N, p)

## [1] 0.009765625

Cumulative probability density for the binomial distribution. In this example, the probability of 4 or less successes is calculated for $binom(0.2)$

pbinom(4, size=12, prob=0.2)

## [1] 0.9274445

Poisson distribution density. $p = e^{-\lambda}\frac{\lambda^{k}}{k!}$

lambda = 1.07
k = 0
dpois(k, lambda)

## [1] 0.3430085

#Log likelihood
dpois(k, lambda, log=TRUE)

## [1] -1.07

#CDF
k = 4
ppois(k, lambda)

## [1] 0.9951501

In multiple hypothesis testing, one can adjust p-values by calculating corresponding q-values to control the false discovery rate. This approach is less conservative than the Bonferroni correction, yielding less false negatives. There are two ways of calculating q-values in R–using qvalue() function from the qvalue library or by using p.adjust(). These two different methods yield slightly different results, but seek to accomplish the same goal of controlling for the false discovery rate in multiple hypothesis testing. Below is test code calculating the false positives and false negatives from a simulation where the ground truth is known, using Bonferroni correction and the two methods for calculating q-values.

#Initializing parameters. There are 24 samples and 8793 genes. Of those genes, 500 of those are set to be different in the treatment versus control by adding 2 to their values. This Monte Carlo simulation is run 1000 times. 
library(qvalue)
library(genefilter)
n = 24
m = 8793
delta = 2
positives = 500
B = 100
set.seed(1)

#Of the 24 samples, 12 are controls and 12 are treatments. 
results = replicate(B, {
  g <- factor(rep(c(0, 1), each=(n/2)))
  mat = matrix(rnorm(n*m), m, n) #Generate n*m numbers from a   standard normal distribution
  mat[1:positives, 1:(n/2)] = mat[1:positives, 1:(n/2)] + delta
  pvals = rowttests(mat, g)$p.val
  qvals1 = p.adjust(pvals, method = "fdr")
  fd1 = sum(qvals1[-(1:positives)] <= 0.05)
  fdr1 = fd1/(m-positives)
  fn1 = sum(qvals1[1:positives] > 0.05)
  fnr1 = fn1/(positives)
  qvals2 = qvalue(pvals)$qvalue
  fd2 = sum(qvals2[-(1:positives)] <= 0.05)
  fdr2 = fd2/(m-positives)
  fn2 = sum(qvals2[1:positives] > 0.05)
  fnr2 = fn2/(positives)
  fd3 = sum(pvals[-(1:positives)] <= 0.05/m)
  fdr3 = fd1/(m-positives)
  fn3 = sum(pvals[1:positives] > 0.05/m)
  fnr3 = fn3/(positives)
  c(fdr1, fnr1, fdr2, fnr2, fdr3, fnr3)})

False Positive Rate (q-value method 1):

## [1] 0.00268178

False Negative Rate (q-value method 1):

## [1] 0.08292

False Discovery Rate (q-value method 2):

## [1] 0.002920535

False Negative Rate (q-value method 2):

## [1] 0.07854

False Discovery Rate (Bonferroni Correction):

## [1] 0.00268178

False Negative Rate (Bonferroni Correction):

## [1] 0.7649

R

Calvin Chi

2017-01-04