Two ways of assigning value to each element in a vector, one is faster, one is slower.
set.seed(1)
letters = c("a", "b", "c", "d", "e")
values = sample(letters, 1e4, replace = TRUE)
vector = character(0) #Create empty character vector
system.time(for(i in seq(length(values))) vector[i] = values[i])
## user system elapsed
## 0.365 0.011 0.382
vector = character(length(values))
system.time(for(i in seq(length(values))) vector[i] = values[i])
## user system elapsed
## 0.01 0.00 0.01
Suppose we have a function accepts a vector of elements and a number, and outputs a matrix of powers of each element in that vector up to the number given as input:
powers1 = function(x, dg) {
pw = matrix(x, nrow = length(x))
prod = x
for (i in 2:dg) {
prod = prod * x
pw = cbind(pw, prod)
}
return(pw)
}
powers1(c(2, 3, 4), 3)
## prod prod
## [1,] 2 4 8
## [2,] 3 9 27
## [3,] 4 16 64
One can get a report of approximately how much time each function call is taking by using the Rprof() function. To use the Rprof() properly, follow 4 steps:
x = runif(1000000)
Rprof()
invisible(powers1(x, 8))
Rprof(NULL)
summaryRprof()
## $by.self
## self.time self.pct total.time total.pct
## "cbind" 1.38 85.19 1.38 85.19
## "*" 0.12 7.41 0.12 7.41
## "matrix" 0.12 7.41 0.12 7.41
##
## $by.total
## total.time total.pct self.time self.pct
## "<Anonymous>" 1.62 100.00 0.00 0.00
## "block_exec" 1.62 100.00 0.00 0.00
## "call_block" 1.62 100.00 0.00 0.00
## "eval" 1.62 100.00 0.00 0.00
## "evaluate_call" 1.62 100.00 0.00 0.00
## "handle" 1.62 100.00 0.00 0.00
## "in_dir" 1.62 100.00 0.00 0.00
## "powers1" 1.62 100.00 0.00 0.00
## "process_file" 1.62 100.00 0.00 0.00
## "process_group.block" 1.62 100.00 0.00 0.00
## "process_group" 1.62 100.00 0.00 0.00
## "withCallingHandlers" 1.62 100.00 0.00 0.00
## "withVisible" 1.62 100.00 0.00 0.00
## "cbind" 1.38 85.19 1.38 85.19
## "*" 0.12 7.41 0.12 7.41
## "matrix" 0.12 7.41 0.12 7.41
##
## $sample.interval
## [1] 0.02
##
## $sampling.time
## [1] 1.62
To calculate number of cores available on local computer.
library(parallel)
detectCores()
## [1] 4
The foreach packaage The times() function in foreach allows one to repeat a function n number of times. foreach allows one to repeatedly execute a function while systematically varying the input parameters.
library(foreach)
max.eig <- function(N, sigma) {
d <- matrix(rnorm(N**2, sd = sigma), nrow = N)
E <- eigen(d)$values
abs(E)[[1]]
}
times(10) %do% max.eig(5, 1)
## [1] 2.789116 3.108739 2.447060 1.682317 2.036381 2.923629 3.358250
## [8] 2.055538 2.515417 2.665701
foreach(n = 1:5) %do% max.eig(n, 1)
## [[1]]
## [1] 0.02652123
##
## [[2]]
## [1] 1.090449
##
## [[3]]
## [1] 2.09963
##
## [[4]]
## [1] 2.019413
##
## [[5]]
## [1] 2.384514
In foreach, one can specificy in what data structure the results are returned using the .combine parameter.
# To return results in a vector
foreach(n = 1:5, .combine = c) %do% max.eig(n, 1)
## [1] 0.6900221 1.0689973 2.1916111 1.0521352 1.8602131
# Returns results in a nested list
foreach(n = 1:3) %:% foreach(m = 1:3) %do% max.eig(n, m)
## [[1]]
## [[1]][[1]]
## [1] 0.7578154
##
## [[1]][[2]]
## [1] 2.640217
##
## [[1]][[3]]
## [1] 0.1195664
##
##
## [[2]]
## [[2]][[1]]
## [1] 0.7509613
##
## [[2]][[2]]
## [1] 2.986557
##
## [[2]][[3]]
## [1] 1.432597
##
##
## [[3]]
## [[3]][[1]]
## [1] 0.8747027
##
## [[3]][[2]]
## [1] 2.80636
##
## [[3]][[3]]
## [1] 4.95146
# Returns results in matrix form using rbind
foreach(n = 1:3, .combine = rbind) %:% foreach(m = 1:3) %do% max.eig(n, m)
## [,1] [,2] [,3]
## result.1 0.7922618 0.4699619 2.249439
## result.2 0.9079341 4.509158 3.774817
## result.3 2.100027 3.238192 4.400374
# Returns results in matrix form using cbind
foreach(n = 1:3, .combine = cbind) %:% foreach(m = 1:3) %do% max.eig(n, m)
## result.1 result.2 result.3
## [1,] 0.7872542 1.568529 1.788463
## [2,] 3.208785 3.378223 1.366411
## [3,] 1.230553 2.087211 7.420172
# Using foreach() to loop over multiple variables simultaneously
foreach(n = 1:3, m = 1:3) %do% max.eig(n, m)
## [[1]]
## [1] 0.2278165
##
## [[2]]
## [1] 2.835797
##
## [[3]]
## [1] 5.93052
Adding a filter in foreach statement.
library(numbers)
foreach(n = 1:50, .combine = c) %:% when (isPrime(n)) %do% n
## [1] 2 3 5 7 11 13 17 19 23 29 31 37 41 43 47
To execute foreach() statements in parallel, use %dopar% instead of %do%, and register cluster using doParallel package.
library(doParallel)
## Loading required package: iterators
no_cores = detectCores() - 1
cl = makeCluster(no_cores)
registerDoParallel(cl)
times(10) %dopar% max.eig(10, 1)
## [1] 3.800989 3.475508 3.155365 3.454822 3.241104 3.170539 2.927744
## [8] 3.115359 3.006906 3.334678
stopCluster(cl)
Setting .multicombine = TRUE can prevent nested lists.
cl = makeCluster(no_cores)
registerDoParallel(cl)
base = 2
foreach(exponent = 2:4, .combine = list) %dopar% base^exponent
## [[1]]
## [[1]][[1]]
## [1] 4
##
## [[1]][[2]]
## [1] 8
##
##
## [[2]]
## [1] 16
foreach(exponent = 2:4, .combine = list, .multicombine = TRUE) %dopar% base^exponent
## [[1]]
## [1] 4
##
## [[2]]
## [1] 8
##
## [[3]]
## [1] 16
Variables within the same local environment are available in foreach package, but variables from parent environment are not available. To make a variable in the parent environment available, set the .export parameter.
base = 2
cl = makeCluster(no_cores)
registerDoParallel(cl)
base = 4
test = function(exponent) {
foreach(exponent = 2:4, .combine = c, .export = "base") %dopar% base^exponent
}
test()
## [1] 16 64 256
stopCluster(cl)
Sorting numbers in a vector by increasing/decreasing value.
x = c(4, 2, 1, 0, 3, 2, 1)
sort(x)
## [1] 0 1 1 2 2 3 4
sort(x, decreasing=TRUE)
## [1] 4 3 2 2 1 1 0
order() returns the indices of a sorted numeric vector, either increasing or decreasing.
x = c(4, 2, 1, 0, 3, 2, 1)
sort(x)
## [1] 0 1 1 2 2 3 4
order(x)
## [1] 4 3 7 2 6 5 1
sort(x, decreasing = TRUE)
## [1] 4 3 2 2 1 1 0
order(x, decreasing = TRUE)
## [1] 1 5 2 6 3 7 4
gregexpr() operates the same as regexpr() but finds all instances of pattern.
gregexpr("iss", "Mississippi")
## [[1]]
## [1] 2 5
## attr(,"match.length")
## [1] 3 3
## attr(,"useBytes")
## [1] TRUE
regexpr() finds the character position of the first instance of pattern within text.
regexpr("uat", "Equator")
## [1] 3
## attr(,"match.length")
## [1] 3
## attr(,"useBytes")
## [1] TRUE
strsplit() splits a string x into an R list of substrings based on what to split on.
strsplit("6-16-2011", split="-")
## [[1]]
## [1] "6" "16" "2011"
substr() returns the substring of a string defined by the start and stop positions of that substring.
substring("Equator", 3, 5)
## [1] "uat"
sprintf()
i = 8
s = sprintf("the square of %d is %d", i, i^2)
s
## [1] "the square of 8 is 64"
Counts number of characters in a string.
nchar("South Pole")
## [1] 10
The preferable for loop through a sequence of numbers of a certain length.
x = NULL
#Preferable for loop
for(i in seq(x)){
print(i)
}
#Less preferable for loop
for(i in 1:length(x)){
print(i)
}
## [1] 1
## [1] 0
While loops, break, and repeat.
#Method 1
i = 1
while (i <= 10){
i = i + 4
}
#Method 2
i = 1
while(TRUE){
i = i + 4
if(i > 10){
break
}
}
#Method 3
i = 1
repeat{
i = i + 4
if(i > 10) break
}
Looping over functions.
g1 = function(x) return(sin(x))
g2 = function(x) return(sqrt(x^2+1))
g3 = function(x) return(x)
plot(0, 0)
for(f in c(g1, g2, g3)) plot(f, 0, 5, add=TRUE, xlim = c(0, 5), ylim = c(-1, 5), asp=1) #plots function over domain from 0 to 5. add=TRUE means functions are plotted in the same plot
Nested if…else statements
x = -5
if(x > 0){
print("Positive number")
}else if(x < 0){
print("Negative number")
}else if(x == 0){
print("Zero")
}
Compact if else statements
x = 3
y = if(x == 2) x else x+1
y
## [1] 4
Next statements: in this example only sum up even numbers in a sequene of numbers from 1 to 10.
n = 1
for(i in seq(1, 10)){
if(i %% 2 != 0) {next}
else{
n = n + i
print(i)
}
}
## [1] 2
## [1] 4
## [1] 6
## [1] 8
## [1] 10
n
## [1] 31
Two ways to read in .csv files. First way:
data = read.table("<pathToFile>", sep = ",", header = TRUE)
Default sep for read.table() is tab delimited. Second way:
data = read.csv("<pathToFile>")
Default sep for read.csv() is the comma “,”.
Other useful parameters for the read functions + quote: indicate any quoted values. + na.strings: set the character that represents a missing value. + nrows: how many rows to read. + skip: number of lines to skip before starting to read.
Use fread() to read in large datasets faster:
library(data.table)
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:reshape2':
##
## dcast, melt
## The following objects are masked from 'package:dplyr':
##
## between, last
myData = fread("R_data/file3.txt")
Reading xlsx files.
library(xlsx)
#Read file cameras.xlsx and extract rows 1-4 and columns 2-3.
data = read.xlsx(file="R_data/random.xlsx", sheetIndex = 1, colIndex=2:4, rowIndex=2:4)
Writing to a file.
kids = c("Jack", "Jill")
ages = c(12, 10)
d = data.frame(kids, ages, stringsAsFactors = FALSE)
d
## kids ages
## 1 Jack 12
## 2 Jill 10
#With column/row names included
write.table(d, "R_data/kds1.txt")
#With column/row names excluded
write.table(d, "R_data/kds2.txt", row.names=FALSE, col.names=FALSE)
#Writing and appending to a file
cat("abc\n", file="R_data/u.txt")
cat("de\n", file="R_data/u.txt", append=TRUE)
#concatenating multiple items to a file
cat(file="R_data/v.txt", 1, 2, "xyz\n")
#Writing data frame to csv file.
write.csv(d, file = "file.csv", row.names=FALSE)
The file name v.txt will consist of a single line.
1 2 xyz
The writeLines() function can write each element in the vector as a line once a connection is established.
c = file("R_data/www.txt", "w")
writeLines(c("abc", "de", "f"), c)
close(c)
And the file www.txt will contain the following content:
abc
de
f
You can also establish a connection first before concatenating to a file for convenience’s sake.
con = file("analysisReport.txt", "w")
data = c(1, 2, 3)
results = c("the", "random")
conclusion = "conclusion"
cat(data, file=con)
cat(results, file=con)
cat(conclusion, file=con)
close(con)
Connection functions such as file() and url() can read in a file. Suppose we have a file name z6.txt with the following contents:
John 25
Mary 28
Jim 19
We can establish a connection via the file() function. The function readLines() has the parameter n, which indicates how many lines to read. The “r” parameter of file() indicates reading from file.
c = file("R_data/z6.txt", "r")
readLines(c, n=1)
## [1] "John 25"
readLines(c, n=1)
## [1] "Mary 28"
readLines(c, n=1)
## [1] "Jim 19"
readLines(c, n=1)
## character(0)
close(c)
#"Rewind" to beginning of the file
c = file("R_data/z6.txt", "r")
readLines(c, n=2)
## [1] "John 25" "Mary 28"
seek(con = c, where = 0) #parameter where=0 sets the position of the file pointer zero characters from the start of the file.
## [1] 16
readLines(c, n=1)
## [1] "John 25"
close(c) #closes connection
readLines() allows one to read a file where each line is treated as a string. Suppose we have a file z5.txt with the following content:
John 25
Mary 28
Jim 19
z1 = readLines("R_data/z5.txt")
z1
## [1] "John 25" "" "Mary 28" " " "Jim 19"
Suppose we have a file x.txt with the following content:
1 0 1
1 1 1
1 1 0
1 1 0
0 0 1
We can read it into a matrix the following way:
x = matrix(scan("R_data/x.txt", quiet=TRUE), nrow=5, byrow=TRUE)
read.csv() is more suited for reading comma delimited files.
f = read.csv(file="simple.csv", header = =TRUE, sep=",")
If you already know the modes of each column of your csv file, then you can indicate what mode they are with the colClasses parameter.
pollution = read.csv("R_data/avgpm25.csv", colClasses = c("numeric", "character", "factor", "numeric", "numeric"))
dim(pollution)
Any read function will have the parameters:
Reading a data frame or matrix from a file. Assume we have a file named z.txt with the following content:
name age
John 25
Mary 28
Jim 19
read.table() reads a file into a dataframe.
z = read.table("R_data/z.txt", header=TRUE)
z
## name age
## 1 John 25
## 2 Mary 28
## 3 Jim 19
To only read in the last two lines of z.txt:
z = read.table("R_data/z.txt", header=TRUE, skip = 1, nrows = 2)
z
## John X25
## 1 Mary 28
## 2 Jim 19
read.table() can also read gzipped files directly. ******
Using the readline() function to read in a single line from the keyboard.
> w = readline()
abc de f
> w
[1] "abc de f"
#Can also be used with an optional prompt
> inits = readline("type your initials: ")
type your initials: CC
> inits
[1] "CC"
Using the scan() funtion. Suppose we have four documents z1.txt, z2.txt, z3.txt, and z4.txt. The contents of each document is as follows: z1.txt
123
4 5
6
z2.txt
123
4.2 5
6
z3.txt
abc de f g
z4.txt
abc 123 6 y
Let’s see what we can do with these files using the scan() function. Scans items as double mode by default, by can change what mode to scan to by setting the “what” parameter. scan() also assumes that items of the vector are separated by whitespace, but that could be changed by setting the “sep” parameter.
scan("R_data/z1.txt")
## [1] 123 4 5 6
scan("R_data/z2.txt")
## [1] 123.0 4.2 5.0 6.0
scan("R_data/z3.txt", what="")
## [1] "abc" "de" "f" "g"
scan("R_data/z4.txt", what="")
## [1] "abc" "123" "6" "y"
x2 = scan("R_data/z3.txt", what="", sep="\n")
x2
## [1] "abc" "de f" "g"
x2 = scan("R_data/z3.txt", what="", sep="\n", quiet=TRUE)
x2
## [1] "abc" "de f" "g"
The scan() function can be used to read numbers from the keyboard. Returns a numeric vector containing typed numbers.
v = scan("")
1: 12 5 13
4: 3 4 5
7: 8
8:
Read 7 items
> v
[1] 12 5 13 3 4 5 8
For loops can loop over file names. scan() reads the file. Assume “file1.txt” contains numbers 1 to 6 and “file2.txt” contains numbers 5, 12, and 13.
for (fn in c("R_data/file1.txt", "R_data/file2.txt")) print(scan(fn))
## [1] 1 2 3 4 5 6
## [1] 5 12 13
Specifying default parameters when writing functions.
test = function(x, y=2) {return(x + y)}
test(3)
## [1] 5
test(5, 5)
## [1] 10
Converting elements in a matrix into their proportions
tab = matrix(c(180, 40, 20, 10), nrow = 2)
proportion = prop.table(tab)
proportion
## [,1] [,2]
## [1,] 0.72 0.08
## [2,] 0.16 0.04
Tables can be used to tabulate counts for a combination of categories. table() accepts a vector, matrix, factor, or dataframe, and returns a contingency table.
#Motvation
u = c(22, 8, 33, 6, 8, 29, -2)
fl = list(c(5, 12, 13, 12, 13, 5, 13), c("a", "bc", "a", "a", "bc", "a", "a"))
tapply(u, fl, length)
## a bc
## 5 2 NA
## 12 1 1
## 13 2 1
#Counts cannot be NA, table() solves this problem.
table(fl)
## fl.2
## fl.1 a bc
## 5 2 0
## 12 1 1
## 13 2 1
#Another example1
vote.for.X = c("Yes", "Yes", "No", "Not Sure", "No")
Voted.for.X.Last.Time = c("Yes", "No", "No", "Yes", "No")
d = data.frame(vote.for.X, Voted.for.X.Last.Time)
cttab = table(d)
cttab
## Voted.for.X.Last.Time
## vote.for.X No Yes
## No 2 0
## Not Sure 0 1
## Yes 1 1
#Another example2
gender = c("M", "M", "F", "M", "F", "F")
race = c("W", "W", "A", "O", "B", "B")
pol = c("L", "L", "C", "L", "L", "C")
d = data.frame(gender, race, pol)
vt = table(d)
vt
## , , pol = C
##
## race
## gender A B O W
## F 1 1 0 0
## M 0 0 0 0
##
## , , pol = L
##
## race
## gender A B O W
## F 0 1 0 0
## M 0 0 1 2
#Another example3
data = factor(c(rep("east", 450), rep("west", 150)))
table(data)
## data
## east west
## 450 150
#Table operations. Table is its own class but in practice can be treated like a matrix.
class(cttab)
## [1] "table"
cttab[1, 1]
## [1] 2
cttab/5
## Voted.for.X.Last.Time
## vote.for.X No Yes
## No 0.4 0.0
## Not Sure 0.0 0.2
## Yes 0.2 0.2
#Finding marginal values of Vote.for.X variable.
apply(cttab, 1, sum)
## No Not Sure Yes
## 2 1 2
#addmargins() to find marginal totals of a table.
addmargins(cttab)
## Voted.for.X.Last.Time
## vote.for.X No Yes Sum
## No 2 0 2
## Not Sure 0 1 1
## Yes 1 1 2
## Sum 3 2 5
#Get names of dimensions and their levels. Columns are dimensions, rows are levels.
dimnames(cttab)
## $vote.for.X
## [1] "No" "Not Sure" "Yes"
##
## $Voted.for.X.Last.Time
## [1] "No" "Yes"
Given a \(X\) matrix, plotting the variables for each sample
x = 1:10
A = c(15, 36, 54, 60, 68, 71, 73, 75, 78, 78)
B = c(20, 49, 58, 69, 75, 80, 83, 86, 88, 89)
C = c(24, 58, 68, 75, 83, 90, 93, 93, 95, 96)
Performance = data.frame(A,B,C)
matplot(x,Performance, type="o", pch=c(1,2,3), col=c("red","green","blue"))
In R base plotting, deciding how densely the tick marks of the axes should be plotted.
data = rnorm(100)
hist(data, breaks = 10, axes = FALSE)
# Adds the y-axis
axis(2)
# Adds the x-axis
axis(1, at = seq(-3, 3, 0.5), labels = seq(-3, 3, 0.5))
segments() draws line segments between pairs of points.
set.seed(1)
x = rnorm(20, mean=5, sd=1)
y = rnorm(20, mean=2, sd=1)
plot(rep(0, 20), x, xlim=c(-0.5, 1.5))
points(rep(1, 20), y)
segments(rep(0, 20), x, rep(1, 20), y)
Creating 3D scatter plots using the scatterplot3d package. Let us work with a simulated gene expression data set with 3 genes. There are five clusters in this data set and we wish to color code points based on what cluster they are from.
require(scatterplot3d)
set.seed(1)
n = c(100, 25, 50, 50, 75)
growth_m = c(0, 2, -2, 0, 4)
growth_sd = c(0.5, 1, 1, 0.5, 0.5)
immune_m = c(0, -2, -3, 4, 0)
immune_sd = c(0.5, 1, 0.5, 0.5, 1)
structural_m = c(3, 0, 2, -1, -2)
structural_sd = c(1, 1, 1, 0.7, 1)
data = data.frame(n=n, growth_m=growth_m, growth_sd, immune_m, immune_sd, structural_m, structural_sd)
get_expression = function(data){
n = data[1]
growth_m = data[2]
growth_sd = data[3]
immune_m = data[4]
immune_sd = data[5]
structural_m = data[6]
structural_sd = data[7]
growth = rnorm(n, growth_m, growth_sd)
immune = rnorm(n, immune_m, immune_sd)
structural = rnorm(n, structural_m, structural_sd)
expression = data.frame(growth=growth, immune=immune, structural=structural)
return(expression)
}
expressions = apply(data, 1, get_expression)
expressions = do.call(rbind, expressions)
expressions$cluster = c(rep("1", 100), rep("2", 25), rep("3", 50), rep("4", 50), rep("5", 75))
expressions$pcolor[expressions$cluster=="1"] = "red"
expressions$pcolor[expressions$cluster=="2"] = "blue"
expressions$pcolor[expressions$cluster=="3"] = "darkgreen"
expressions$pcolor[expressions$cluster=="4"] = "black"
expressions$pcolor[expressions$cluster=="5"] = "darkorange"
with(expressions, {plot = scatterplot3d(growth, immune, structural, color = pcolor, main="Scatterplot of Cell Gene Expression")})
legend("topleft", c("Cluster 1", "Cluster 2", "Cluster 3", "Cluster 4", "Cluster 5"), fill=c("red", "blue", "darkgreen", "black", "darkorange"))
ggplot2 system: In this example plots displ on the x-axis and hwy on the y-axis.
library(ggplot2)
head(mpg)
## Source: local data frame [6 x 11]
##
## manufacturer model displ year cyl trans drv cty hwy fl
## (chr) (chr) (dbl) (int) (int) (chr) (chr) (int) (int) (chr)
## 1 audi a4 1.8 1999 4 auto(l5) f 18 29 p
## 2 audi a4 1.8 1999 4 manual(m5) f 21 29 p
## 3 audi a4 2.0 2008 4 manual(m6) f 20 31 p
## 4 audi a4 2.0 2008 4 auto(av) f 21 30 p
## 5 audi a4 2.8 1999 6 auto(l5) f 16 26 p
## 6 audi a4 2.8 1999 6 manual(m5) f 18 26 p
## Variables not shown: class (chr)
#Regular plot, with x=displ and y=hwy
qplot(displ, hwy, data=mpg)
#Plot with labels
qplot(displ, hwy, data=mpg, color=drv)
#Plotting colored by group with fitted line. Method is set as linear model.
qplot(displ, hwy, data=mpg, geom=c("point", "smooth"), color=drv)
#geom argument accepts a character vector
#1. point: add points to plot.
#2. smooth: add fitted line with 95% confience interval.
qplot(displ, hwy, data=mpg, geom=c("point", "smooth"))
#If only one data column is specified, then histogram will be plotted. fill=drv color codes different groups
qplot(hwy, data=mpg, fill=drv)
#Plotting a histogram with customized binwidth
qplot(price, data=diamonds, binwidth=18497/30)
#Plotting histogram as a density function
qplot(price, data=diamonds, geom="density")
#Plotting scatterplot with points belonging to different groups having different symbols/colors
qplot(carat, price, data=diamonds, shape=cut, color=cut)
#facets parameter plots separate groups in different plots. Placing "drv" to the right of "~" indicates how many columns instead of rows
qplot(displ, hwy, data=mpg, facets=.~drv)
#Placing "drv" to the left of "~." indicates how many rows instead of columns. binwidth=2 indicates that each bin in the histogram has a width of 2.
qplot(hwy, data=mpg, facets=drv~., binwidth=2)
#Smooth and color historgrams. geom parameter "density" draws out the smooth histogram and color=drv assigns different color to different group
qplot(hwy, data=mpg, geom="density", color=drv)
#Making boxplots
qplot(drv, hwy, data=mpg, geom="boxplot")
#Making more detailed boxplots
qplot(drv, hwy, data=mpg, geom="boxplot", color=manufacturer)
Using the ggplot2 function
library(ggplot2)
#Store data in object g first without plotting
g = ggplot(mpg, aes(displ, hwy))
#Add layers to the plot. In this case adding points, linear regression, and subplots in grid.
g + geom_point() + geom_smooth(method="lm") + facet_grid(.~drv)
#Adding layers with modified parameters
g + geom_point(color="pink", size=4, alpha=1/2)
g + geom_point(size=4, alpha=1/2, aes(color=drv))
g + geom_point(aes(color = drv)) + labs(title="Swirl Rules!") + labs(x="Displacement", y="Hwy Mileage")
#For the arguments in geom_smooth(), size specifies size of dashes, linetype=3 specifies dashed instead of continuous line, method="lm" indicates linear regression, and se=FALSE turns off gray shadows indicating standard errors.
g + geom_point(aes(color = drv), size=2, alpha=1/2) + geom_smooth(size=4, linetype=3, method="lm", se=FALSE)
#Change the background theme and font with "base_family" parameter.
g + geom_point(aes(color=drv)) + theme_bw(base_family="Times")
#Changing the coordinate axes.
g + geom_line() + coord_cartesian(ylim=c(-3, 3))
#Example Illustration. margins=TRUE tells the plot to display marginal totals over each row and column. Since there are 3 factors in drv and 4 factors in cyl, with margins=FALSE will generate a 3x4 scatterplot.
g + geom_point() + facet_grid(drv~cyl, margins=TRUE) + geom_smooth(method="lm", size=2, se=FALSE, color="black")
#Plotting boxplots.
ggplot(diamonds, aes(carat, price)) + geom_boxplot() + facet_grid(.~cut)
lattice system: xyplot(y ~ x | f*g, data)
y ~ x: y plotted against xf*g: f and g are optional conditioning variables. The * means the intersection of two variablesy and x are looked up.library(datasets)
library(lattice)
xyplot(Ozone ~ Wind, data = airquality, col="red", pch=8, main="Big Apple Data")
In this example, layout specifies dimensions of grid in the form of c(columns, rows) upon which multiple plots will be plotted. Alternatively, one can assign the output of xyplot() to a variable, then print it later to get the plot to appear.
library(datasets)
library(lattice)
airquality = transform(airquality, Month = factor(Month))
p = xyplot(Ozone ~ Wind | Month, data = airquality, layout = c(5, 1))
#lattice functions return objects of class trellis
print(p)
#To view a particular setting of the p object
p[["x.limits"]]
## [1] 0.37 22.03
Another example with more optional conditioning.
library(ggplot2)
library(lattice)
str(diamonds)
table(diamonds$color, diamonds$cut)
xyplot(price~carat | color*cut, data = diamonds, strip=FALSE, pch=20, xlab="Carat", ylab="Price", main="Diamonds are Sparkly!")
#strip=FALSE indicates that the labels for each box is not shown to avoid clutter
Controlling color and opacity of the points and its importance.
#Arguments of rgb() in order: red, green, blue, alpha
x = rnorm(1000)
y = rnorm(1000)
plot(x, y, pch=19, col=rgb(0, 0.5, 0.5, 0.3))
Lattice panel functon
#Generate data
set.seed(10)
x = rnorm(100)
f = rep(0:1, each=50)
y = x + f - f*x + rnorm(100, sd=0.5)
f = factor(f, labels=c("Group 1", "Group 2"))
#Lattice panel function example 1
xyplot(y ~ x | f, panel=function(x, y, ...){
panel.xyplot(x, y, ...)
panel.abline(h=median(y), lty=2)
}, layout=c(2, 1))
#Lattice panel function example 2
xyplot(y ~ x | f, panel=function(x, y, ...){
panel.xyplot(x, y, ...)
panel.lmline(x, y, col = 2) #Overlay simple linear regression line. col parameter determines color of line.
})
Save plots already plotted to file. In this example, the device is specified as png, and the height and width of the png file are specified.
library(datasets)
x = rnorm(100, 0, 3)
hist(x)
dev.copy(png, file="plot.png", height=480, width=480)
dev.off()
Writing plots to pdf.
pdf(file="myplot.pdf")
x = rnorm(100, 0, 3)
hist(x)
dev.off()
Multiple scatter plots: adding general title to multiple scatter plots via mtext. The oma parameter in par() represents outer margin lengths from bottom, left, top, right respectively.
library(datasets)
par(mfrow=c(1, 3), mar=c(4, 4, 2, 1), oma=c(0, 0, 2, 0))
with(airquality, {
plot(Wind, Ozone, main="Ozone and Wind")
plot(Temp, Ozone, main="Ozone and Temperature")
plot(Temp, Ozone, main="Ozone and Temperature")
mtext("Ozone and Weather in New York City", outer = TRUE)
})
Usage of legend() function to add legend. type parameter in plot() indicates what type of plot to plot(“p”=points, “l”=lines, “n”=no plotting). lm() draws the linear regression line to the graph. In this example, linear regression is performed on the airquality data with Wind on the x-axis and Ozone on the y-axis.
library(datasets)
with(airquality, plot(Wind, Ozone, main="Ozone and Wind in New York City", type="n"))
with(subset(airquality, Month == 5), points(Wind, Ozone, col="blue"))
with(subset(airquality, Month != 5), points(Wind, Ozone, col="red"))
legend("topright", pch=1, col=c("blue", "red"), legend=c("May", "Other Months"))
model = lm(Ozone ~ Wind, airquality)
model
##
## Call:
## lm(formula = Ozone ~ Wind, data = airquality)
##
## Coefficients:
## (Intercept) Wind
## 96.873 -5.551
abline(model, lwd=2)
Annotating a graph.
library(datasets)
with(airquality, plot(Temp, Wind))
lines(airquality$Temp, airquality$wind)
text(x=75, y=20, labels="Hello World!")
title("Test Graph")
mtext("Random text", side=4)
Alternatively, rather than specify the coordinates you want your text to be placed, you mouse-click the position you want your text to be placed.
library(datasets)
with(airquality, plot(Temp, Wind))
lines(airquality$Temp, airquality$wind)
text(locator(1), labels="Hello World!")
title("Test Graph")
mtext("Random text", side=4)
Exploratory Data Analyis in 1 dimension. The rug() function plots the points under the histogram.
#summary() gives you a six number summary of a vector of values.
data = rnorm(100, 10, 3)
summary(data)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.690 8.407 10.210 10.090 12.210 17.290
#Boxplot
boxplot(data, col="blue")
#histogram
par(mfcol = c(1, 2))
hist(data, col="green", main = "Histogram default breaks")
rug(data)
hist(data, col="green", main = "Histogram with 50 breaks", breaks = 50)
rug(data)
par(mfcol = c(1, 1))
#Barplot summarizes categorical variables
data2 = factor(c(rep("east", 450), rep("west", 150)))
barplot(table(data2), col="wheat")
Exploratory data analysis in 2 dimensions
pollution = read.csv("data_sets/avgpm25.csv", colClasses = c("numeric", "character", "factor", "numeric", "numeric"))
head(pollution)
## pm25 fips region longitude latitude
## 1 9.771185 01003 east -87.74826 30.59278
## 2 9.993817 01027 east -85.84286 33.26581
## 3 10.688618 01033 east -87.72596 34.73148
## 4 11.337424 01049 east -85.79892 34.45913
## 5 12.119764 01055 east -86.03212 34.01860
## 6 10.827805 01069 east -85.35039 31.18973
#Multiple boxplots. In this example. We are making a box plot of the pm25 data from the pollution dataframe, divided by region.
boxplot(pm25 ~ region, data = pollution, col="red")
#Multiple histograms
par(mfrow = c(2, 1))
hist(subset(pollution, region=="east")$pm25, col="green")
hist(subset(pollution, region=="west")$pm25, col="green")
par(mfrow=c(1, 1))
#Scatterplot with multiple dimensions.
#col.axis determines color of axis number labels. col.lab determines xlab and ylab colors.
with(pollution, plot(latitude, pm25, col=region, col.axis="blue", col.lab="red"))
abline(h=12)
Making multiple boxplots from a list input. Each boxplot will represent data from an item in a list.
x = list(a=rnorm(20, 0, 1), b=rnorm(20, 0, 2), c=rnorm(20, 0, 3))
boxplot(x)
Create multiple grids on which plots will be generated with par() function. Any charts generated after these commands will go on the grid. The parameter mfrow indicates that plots will be plotted by row and mfcol indicates that plots will be plotted by column. Both mfrow and mfcol accepts a two element vector, with the first number indicating number of rows and second number indicating number of columns. The mar parameter accepts a 4 element vector indicating the margins for the bottom, left, top, right respectively.
x = seq(10)
par(mfrow=c(2, 2), mar=c(4, 4, 4, 4))
plot(x, sin(x))
plot(x, cos(x))
plot(x, x^2)
plot(x, tan(x))
#Resets par() so that one plot is shown at a time
par(mfcol=c(1, 1))
The las parameter in par() can be used to orient the axis value labels.
The bg parameter in par() can be used to set the background color
library(datasets)
hist(airquality$Ozone)
par(las=1, bg="wheat")
hist(airquality$Ozone)
Adding a line via abline().
x = rnorm(100, 5, 2)
hist(x)
#Line starts at y=5 and has a slope of 5
abline(5, 5, col="green")
#Vertical line with a line width of 4, position at mean of x
abline(v = mean(x), col="red", lwd=4)
#Horizontal line with position at 15
abline(h = 15)
Making a MA plot
#Assume e is our log2 transformed expression matrix. Where rows are genes and columsn are samples
e <- log2(exprs(mas133))
plot((e[,1]+e[,2])/2,e[,2]-e[,1],cex=0.5)
Making a quantile plot
#Assume "e" is our expression matrix with genes as rows and samples as columns.
#Finding the quantiles for each sample given the probabilties.
qs = apply(e, 2, quantile, prob=c(0.05, 0.25, 0.5, 0.75, 0.95))
#dim(qs) will indicate number of quantiles by number of samples
#matplot() function plots columns of a matrix against rows. Since we want the quantiles in our quantile plot to be on the y-axis, we must transpose qs.
qs = t(qs)
#In matplot, type indicates type of plot (in this case line plot), and it is not clear what the lty parameter is.
matplot(qs, type="l", lty=1, main="Quantile Plot", xlab="Samples", tlab="Quantiles")
Making a volcano plot with an example dataset
res = read.table("results.txt", header=TRUE)
head(res)
class(res)
#with() function makes it more convenient by allowing user to directly reference column name in plot() function
with(res, plot(log2FoldChange, -log10(pvalue), pch=20, main="Volcano plot", xlim=c(-2.5,2.5), asp=1))
#pch is a parameter that determines shape of point
#Let us color the points according to the pvalue and log2fold change characteristics. (1) Red if points have an adjusted pvalue < 0.05 (2) orange is points have log2 fold change > abs(1), and (3) green if both conditons 1 and 2 are met.
with(subset(res, padj <0.05 ), points(log2FoldChange, -log10(pvalue), pch=20, col="red"))
with(subset(res, abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), pch=20, col="orange"))
with(subset(res, padj<.05 & abs(log2FoldChange)>1), points(log2FoldChange, -log10(pvalue), pch=20, col="green"))
Basic plotting
plot(x, y, main = title, cex=1)
#cex = controls size of points plotted. 1 means original size, 2 means double the size, and 0.5 means half the size.
Drawing polygons in plots
plot(1, 1, asp = 1) #asp = 1 makes ratio of y-axis and x-axis the same
polygon(c(0.8, 1.2, 1.2, 0.8), c(0.8, 0.8, 1.2, 1.2), density=0, border="red")
#polygon(x_coordinates_of_vertices, y_coordinates_of_vertices, color, density, border)
dplyr is a package that makes it simpler and easier to work with dataframes. It also uses efficient data storage backends. A more detailed tutorial of dplyr can be found here.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
year = c(seq(2012, 2016), 2017)
month = c(1, 5, 6, 2, 9, 1)
dep_time = runif(6, 500, 510)
arr_time = runif(6, 800, 810)
d = data.frame(year=year, month=month, dep_time=dep_time, arr_time=arr_time)
d
## year month dep_time arr_time
## 1 2012 1 504.3841 808.7472
## 2 2013 5 509.6009 804.9235
## 3 2014 6 503.0224 802.2187
## 4 2015 2 504.6572 800.6766
## 5 2016 9 504.2597 807.5462
## 6 2017 1 508.6572 809.9654
#Subsetting based on conditions
filter(d, dep_time > 505, arr_time < 805)
## year month dep_time arr_time
## 1 2013 5 509.6009 804.9235
#slice() selects rows of dataframe
slice(d, 2:4)
## year month dep_time arr_time
## 1 2013 5 509.6009 804.9235
## 2 2014 6 503.0224 802.2187
## 3 2015 2 504.6572 800.6766
#Arrange() reorders dataframe by a column. Additional columns are included to break ties.
arrange(d, month, year)
## year month dep_time arr_time
## 1 2012 1 504.3841 808.7472
## 2 2017 1 508.6572 809.9654
## 3 2015 2 504.6572 800.6766
## 4 2013 5 509.6009 804.9235
## 5 2014 6 503.0224 802.2187
## 6 2016 9 504.2597 807.5462
arrange(d, desc(month)) #Arrange by month in descending order
## year month dep_time arr_time
## 1 2016 9 504.2597 807.5462
## 2 2014 6 503.0224 802.2187
## 3 2013 5 509.6009 804.9235
## 4 2015 2 504.6572 800.6766
## 5 2012 1 504.3841 808.7472
## 6 2017 1 508.6572 809.9654
#Select allows you to select columns in dataframe.
select(d, dep_time, arr_time)
## dep_time arr_time
## 1 504.3841 808.7472
## 2 509.6009 804.9235
## 3 503.0224 802.2187
## 4 504.6572 800.6766
## 5 504.2597 807.5462
## 6 508.6572 809.9654
select(d, month:dep_time)
## month dep_time
## 1 1 504.3841
## 2 5 509.6009
## 3 6 503.0224
## 4 2 504.6572
## 5 9 504.2597
## 6 1 508.6572
select(d, starts_with("sl"))
## data frame with 0 columns and 6 rows
#distinct() used with select() only returns unique values
distinct(select(d, month))
## month
## 1 1
## 2 5
## 3 6
## 4 2
## 5 9
#mutate() allows you to add columns to dataframe.
mutate(d, time=arr_time-dep_time, time_unit = time/60)
## year month dep_time arr_time time time_unit
## 1 2012 1 504.3841 808.7472 304.3631 5.072718
## 2 2013 5 509.6009 804.9235 295.3226 4.922043
## 3 2014 6 503.0224 802.2187 299.1963 4.986604
## 4 2015 2 504.6572 800.6766 296.0194 4.933657
## 5 2016 9 504.2597 807.5462 303.2865 5.054775
## 6 2017 1 508.6572 809.9654 301.3082 5.021804
#transmuate is like mutate() except you only end up with the newly created columns
transmute(d, time=arr_time-dep_time, time_unit = time/60)
## time time_unit
## 1 304.3631 5.072718
## 2 295.3226 4.922043
## 3 299.1963 4.986604
## 4 296.0194 4.933657
## 5 303.2865 5.054775
## 6 301.3082 5.021804
#Using the pipe function to chain dplyr functions together.
d %>% select(year, month) %>% head
## year month
## 1 2012 1
## 2 2013 5
## 3 2014 6
## 4 2015 2
## 5 2016 9
## 6 2017 1
d %>% mutate(time=arr_time-dep_time, time_unit = time/60) %>% filter(time >= 301) %>% head
## year month dep_time arr_time time time_unit
## 1 2012 1 504.3841 808.7472 304.3631 5.072718
## 2 2016 9 504.2597 807.5462 303.2865 5.054775
## 3 2017 1 508.6572 809.9654 301.3082 5.021804
Extracting subdata frames and applying functions to dataframes
library(plyr)
## -------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## -------------------------------------------------------------------------
##
## Attaching package: 'plyr'
## The following objects are masked from 'package:dplyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
exams = as.data.frame(replicate(3, rnorm(5, 4, 1)))
exams = rename(exams, c("V1" = "Exam1", "V2" = "Exam2", "V3" = "Exam3"))
exams
## Exam1 Exam2 Exam3
## 1 3.522454 4.888425 6.235394
## 2 5.747901 2.804460 4.347655
## 3 5.616139 5.423692 6.003491
## 4 3.322527 3.799218 2.792580
## 5 2.937886 5.328165 3.880153
#Extracting
exams[2:5, 2]
## [1] 2.804460 5.423692 3.799218 5.328165
exams[2:5, 2, drop=FALSE]
## Exam2
## 2 2.804460
## 3 5.423692
## 4 3.799218
## 5 5.328165
exams[exams$Exam1 >= 4.00, ]
## Exam1 Exam2 Exam3
## 2 5.747901 2.804460 4.347655
## 3 5.616139 5.423692 6.003491
subset(exams, Exam1 >= 4.00)
## Exam1 Exam2 Exam3
## 2 5.747901 2.804460 4.347655
## 3 5.616139 5.423692 6.003491
#Applying apply()
apply(exams, 1, max)
## [1] 6.235394 5.747901 6.003491 3.799218 5.328165
Creating, accessing, and renaming data frames
kids = c("jack", "jill")
ages = c(12, 10)
d = data.frame(kids, ages, stringsAsFactors = FALSE) #Each column is a vector rather than a factor
d
## kids ages
## 1 jack 12
## 2 jill 10
#Various accessing methods
d[[1]]
## [1] "jack" "jill"
d$kids
## [1] "jack" "jill"
d[,1]
## [1] "jack" "jill"
#Structure of dataframe d
str(d)
## 'data.frame': 2 obs. of 2 variables:
## $ kids: chr "jack" "jill"
## $ ages: num 12 10
#New dataframe
d <- data.frame(alpha=1:3, beta=4:6, gamma=7:9)
names(d)
## [1] "alpha" "beta" "gamma"
#Rename method 1
rename(d, c("beta"="two", "gamma"="three"))
## alpha two three
## 1 1 4 7
## 2 2 5 8
## 3 3 6 9
#Rename method 2
names(d)[names(d)=="beta"] <- "two"
names(d)[3] = "three"
Initialize a list of specified item names each with empty entries
names = c("a", "b", "c")
mylist = sapply(names, function(x) NULL)
mylist
## $a
## NULL
##
## $b
## NULL
##
## $c
## NULL
Recursive lists and how to flatten recursive lists.
b = list(u = 5, v = 12)
c = list(w = 13)
a = list(b, c)
a
## [[1]]
## [[1]]$u
## [1] 5
##
## [[1]]$v
## [1] 12
##
##
## [[2]]
## [[2]]$w
## [1] 13
length(a)
## [1] 2
c(list(u=5, v=12, c=list(d=5, e=9)))
## $u
## [1] 5
##
## $v
## [1] 12
##
## $c
## $c$d
## [1] 5
##
## $c$e
## [1] 9
#flatten option in c()
c(list(u=5, v=12, c=list(d=5, e=9)), recursive = T)
## u v c.d c.e
## 5 12 5 9
Using lapply() and sapply() functions to lists
#lapply returns list
lapply(list(1:3, 25:29), median)
## [[1]]
## [1] 2
##
## [[2]]
## [1] 27
#sapply returns vector
sapply(list(1:3, 25:29), median)
## [1] 2 27
Lists are considered lower precedence to vectors
c(5, 2, list(a=1, b=4))
## [[1]]
## [1] 5
##
## [[2]]
## [1] 2
##
## $a
## [1] 1
##
## $b
## [1] 4
Converting lists back to vectors and removing names
j = list(name="joe", salary=5500, union=T)
#get names
names(j)
## [1] "name" "salary" "union"
class(names(j))
## [1] "character"
#List to vector
ulj = unlist(j)
ulj
## name salary union
## "joe" "5500" "TRUE"
class(ulj)
## [1] "character"
#Removing names method 1
ulj1 = unname(ulj)
ulj1
## [1] "joe" "5500" "TRUE"
#Removing names method 2
names(ulj) = NULL
ulj
## [1] "joe" "5500" "TRUE"
Adding and deleting to lists. Getting size of list.
j = list(name="joe", salary=5500, union=T)
#Add method 1:
j$activity = "sailing"
j
## $name
## [1] "joe"
##
## $salary
## [1] 5500
##
## $union
## [1] TRUE
##
## $activity
## [1] "sailing"
#Add method 2:
j[4:5] = c(1, 2)
j
## $name
## [1] "joe"
##
## $salary
## [1] 5500
##
## $union
## [1] TRUE
##
## $activity
## [1] 1
##
## [[5]]
## [1] 2
#Delete an item:
j[[5]] = NULL
j$activity = NULL
j
## $name
## [1] "joe"
##
## $salary
## [1] 5500
##
## $union
## [1] TRUE
#Getting number of components
length(j)
## [1] 3
Creating lists
j = list(name="joe", salary=5500, union=T)
j
## $name
## [1] "joe"
##
## $salary
## [1] 5500
##
## $union
## [1] TRUE
jalt = list("joe", 5500, T)
jalt
## [[1]]
## [1] "joe"
##
## [[2]]
## [1] 5500
##
## [[3]]
## [1] TRUE
j$salary
## [1] 5500
j[["salary"]]
## [1] 5500
j[2]
## $salary
## [1] 5500
j[[2]]
## [1] 5500
z = vector(mode="list")
z[["abc"]] = 3
z
## $abc
## [1] 3
Arrays to store higher dimensional data
firsttest = matrix(c(46, 21, 50, 30, 25, 50), nrow=3)
secondtest = matrix(c(46, 41, 50, 43, 35, 50), nrow=3)
tests = array(data=c(firsttest, secondtest), dim = c(3, 2, 2))
tests
## , , 1
##
## [,1] [,2]
## [1,] 46 30
## [2,] 21 25
## [3,] 50 50
##
## , , 2
##
## [,1] [,2]
## [1,] 46 43
## [2,] 41 35
## [3,] 50 50
tests[,,1]
## [,1] [,2]
## [1,] 46 30
## [2,] 21 25
## [3,] 50 50
An array with dimensions \(c(x_1, x_2, x_3)\) is composed of \(x_3\) numbers of \(x_1 x x_2\) matrices.
b = array(1:12, c(2, 3, 2))
# Get the first matrix
b[,,1]
## [,1] [,2] [,3]
## [1,] 1 3 5
## [2,] 2 4 6
# Get the first row of every matrix
b[1,,]
## [,1] [,2]
## [1,] 1 7
## [2,] 3 9
## [3,] 5 11
# Get the second column of every matrix
b[,2,]
## [,1] [,2]
## [1,] 3 9
## [2,] 4 10
# Reassign dimensions
dim(b) = c(6, 2)
b
## [,1] [,2]
## [1,] 1 7
## [2,] 2 8
## [3,] 3 9
## [4,] 4 10
## [5,] 5 11
## [6,] 6 12
Basic linear algebra operations on matrices.
y = matrix(1:4, byrow = T, nrow=2)
y%*%y #Matrix - matrix multiplication
## [,1] [,2]
## [1,] 7 10
## [2,] 15 22
3*y #Matrix - scalar multiplication
## [,1] [,2]
## [1,] 3 6
## [2,] 9 12
y + y #matrix - matrix addition
## [,1] [,2]
## [1,] 2 4
## [2,] 6 8
Find the inverse of a square matrix.
A = matrix(c(1, 2, 3, 4), nrow = 2)
Ain = solve(A)
A%*%Ain
## [,1] [,2]
## [1,] 1 0
## [2,] 0 1
Combine a list of data frames/matrices into a single matrix
x = list(a=matrix(seq(6), nrow = 3), b=matrix(seq(4), nrow = 2))
do.call(rbind, x)
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
## [4,] 1 3
## [5,] 2 4
In-place tranform a column in a dataframe to another class using tranform(). First argument is a dataframe, and any arguments afterwards allows you to redefine the class of the column.
library(datasets)
head(airquality)
## Ozone Solar.R Wind Temp Month Day
## 1 41 190 7.4 67 5 1
## 2 36 118 8.0 72 5 2
## 3 12 149 12.6 74 5 3
## 4 18 313 11.5 62 5 4
## 5 NA NA 14.3 56 5 5
## 6 28 NA 14.9 66 5 6
airquality = transform(airquality, Month=factor(Month))
Given that a dataframe has factor columns containing double datatypes, transform columns into vectors.
d = data.frame(one = factor(c(0.45, 0.545)), two = factor(c("high", "low")))
d
## one two
## 1 0.45 high
## 2 0.545 low
class(d$one)
## [1] "factor"
d1 = transform(d, one = as.vector(one))
d1
## one two
## 1 0.45 high
## 2 0.545 low
class(d1$one)
## [1] "character"
Filtering with the subset() function to remove NAs. subset() has inputs data, condition, and select. Select chooses which columns to select from filtered data frame.
x <- c(6, 1:3, NA, 12)
x[x > 5]
## [1] 6 NA 12
subset(x, x > 5)
## [1] 6 12
exams = replicate(3, rnorm(5, 4, 1))
exams
## [,1] [,2] [,3]
## [1,] 5.230604 5.036754 5.232597
## [2,] 3.555671 4.249525 4.501794
## [3,] 4.066296 4.194820 3.314486
## [4,] 2.171491 4.064353 2.759771
## [5,] 4.342042 3.408740 3.560954
subset(exams, exams[,1] > 4 | exams[,2] > 4 | exams[,3] > 4, select=c(2, 3))
## [,1] [,2]
## [1,] 5.036754 5.232597
## [2,] 4.249525 4.501794
## [3,] 4.194820 3.314486
## [4,] 4.064353 2.759771
## [5,] 3.408740 3.560954
Using lapply and sapply to dataframes and converting lists to dataframes.
d = data.frame(kids = c("Jack", "Jill"), ages = c(12, 10))
d
## kids ages
## 1 Jack 12
## 2 Jill 10
dl = lapply(d, sort)
dl
## $kids
## [1] Jack Jill
## Levels: Jack Jill
##
## $ages
## [1] 10 12
as.data.frame(dl)
## kids ages
## 1 Jack 10
## 2 Jill 12
Alternatively, to directly receive datafrmae as output rather than list.
d[] = lapply(d, sort)
d
## kids ages
## 1 Jack 10
## 2 Jill 12
Merging data frames using merge(). The function merge() will automatically merge similar columns and will only keep rows that exist in both dataframes. If columns for each dataframe contains the same information but different names, user can indicate that.
d1 = data.frame(kids=c("Jack", "Jill", "Jillian", "John"), states=c("CA", "MA", "MA", "HI"))
d2 = data.frame(ages=c(10, 7, 12), kids=c("Jill", "Lillian", "Jack"))
d1
## kids states
## 1 Jack CA
## 2 Jill MA
## 3 Jillian MA
## 4 John HI
d2
## ages kids
## 1 10 Jill
## 2 7 Lillian
## 3 12 Jack
d = merge(d1, d2)
d
## kids states ages
## 1 Jack CA 12
## 2 Jill MA 10
d3 = data.frame(ages = c(12, 10, 7), pals = c("Jack", "Jill", "Lillian"))
merge(d1, d3, by.x="kids", by.y="pals")
## kids states ages
## 1 Jack CA 12
## 2 Jill MA 10
#Becareful of duplicate entries in a column for one dataframe when using merge()
d2a = rbind(d2, list(15, "Jill"))
merge(d1, d2a)
## kids states ages
## 1 Jack CA 12
## 2 Jill MA 10
## 3 Jill MA 15
Adding rows and columns to dataframes
kids = c("Jack", "Jill")
ages = c(12, 10)
d = data.frame(kids, ages, stringsAsFactors = F)
#Add row method 1:
rbind(d, list("Laura", 19))
## kids ages
## 1 Jack 12
## 2 Jill 10
## 3 Laura 19
#Add row method 2:
rbind(d, c("Laura", 19))
## kids ages
## 1 Jack 12
## 2 Jill 10
## 3 Laura 19
#Add column method 1:
cbind(d, happy = c(T, F))
## kids ages happy
## 1 Jack 12 TRUE
## 2 Jill 10 FALSE
#Add column method 2:
d$happy = T
d
## kids ages happy
## 1 Jack 12 TRUE
## 2 Jill 10 TRUE
d$sad = c(F, F)
d
## kids ages happy sad
## 1 Jack 12 TRUE FALSE
## 2 Jill 10 TRUE FALSE
Completing removing any rows with NAs in a dataframe
kids = c("Jack", NA, "Jillian", "John")
states = c("CA", "MA", "MA", NA)
d = data.frame(kids, states, stringsAsFactors = F)
d
## kids states
## 1 Jack CA
## 2 <NA> MA
## 3 Jillian MA
## 4 John <NA>
complete.cases(d)
## [1] TRUE FALSE TRUE FALSE
d2 = d[complete.cases(d), ]
d2
## kids states
## 1 Jack CA
## 3 Jillian MA
One way to convert a vector to a matrix.
x = 1:16
x
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16
dim(x) = c(4, 4)
x
## [,1] [,2] [,3] [,4]
## [1,] 1 5 9 13
## [2,] 2 6 10 14
## [3,] 3 7 11 15
## [4,] 4 8 12 16
x = 1:5
v = as.matrix(x)
v
## [,1]
## [1,] 1
## [2,] 2
## [3,] 3
## [4,] 4
## [5,] 5
Ways to convert from one object type to the other.
x = 1:3
mode(x) = "double"
typeof(x)
## [1] "double"
mode(x) = "character"
x
## [1] "1" "2" "3"
#Convert to string
number = 5
as.character(number)
## [1] "5"
#The strtoi() function converts from string to integer
x = c("1", "2", "3", "4")
x
## [1] "1" "2" "3" "4"
strtoi(x)
## [1] 1 2 3 4
sapply(x, f) applies function f() to every element in x and returns a vector/matrix as a result.
a = function(z) return(c(z, z^2))
a(1:4)
## [1] 1 2 3 4 1 4 9 16
result = sapply(1:4, a)
result
## [,1] [,2] [,3] [,4]
## [1,] 1 2 3 4
## [2,] 1 4 9 16
class(result)
## [1] "matrix"
Usage of apply(m, dimcode, f, fargs)
If the function returns k components, then the result of apply() will have k rows.
z = matrix(seq(1, 6), nrow = 3)
z
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
apply(z, 2, mean)
## [1] 2 5
f = function(x) x/c(2, 8)
y = apply(z, 1, f)
y
## [,1] [,2] [,3]
## [1,] 0.5 1.000 1.50
## [2,] 0.5 0.625 0.75
t(y)
## [,1] [,2]
## [1,] 0.5 0.500
## [2,] 1.0 0.625
## [3,] 1.5 0.750
Usage of apply(m, dimcode, f, fargs) #2!
f = function(IW, d){
maj = sum(IW[1:d]) / d
return(if(maj > 0.5) 1 else 0)
}
x = matrix(c(1, 0, 1, 1, 0, 1, 1, 1, 1, 0), nrow=2, ncol=5, byrow = T)
x
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 0 1 1 0
## [2,] 1 1 1 1 0
apply(x, 1, f, 3)
## [1] 1 1
apply(x, 1, f, 2)
## [1] 0 1
regexpr(<pattern>, <x>) finds index of the first occurence of <pattern> in <x>:
regexpr("a", c("cat", "dag", "wallet", "average", "status"))
## [1] 2 2 2 1 3
## attr(,"match.length")
## [1] 1 1 1 1 1
## attr(,"useBytes")
## [1] TRUE
gregexpr(<pattern>, <x>) finds index of every occurence of <pattern> in <x>
# Finds index of every period
gregexpr("\\.", c("1244.34234.234234", "342.24323.12414"))
## [[1]]
## [1] 5 11
## attr(,"match.length")
## [1] 1 1
## attr(,"useBytes")
## [1] TRUE
##
## [[2]]
## [1] 4 10
## attr(,"match.length")
## [1] 1 1
## attr(,"useBytes")
## [1] TRUE
subset() can be used to filter out NAs
x = c(6, 1:3, NA, 12)
x[x > 5]
## [1] 6 NA 12
subset(x, x > 5)
## [1] 6 12
match() finds the index of the first match in the second vector of each element in the first vector.
v1 = c("Cow", "Owl", "Greater")
v2 = c("Cheetah", "Owl", "Mountain", "Greater")
match(v1, v2)
## [1] NA 2 4
grep() searches for a specified substring pattern in a vector x of strings. It returns the indices where the pattern occurs. Setting value = TRUE in grep() allows one to return the whole line containing the pattern.
grep("Pole", c("Equation", "North Pole", "South Pole"))
## [1] 2 3
grep("pole", c("Equator", "North Pole", "South Pole"))
## integer(0)
grepl() searches for a specified substring pattern in a vector x of strings. It returns TRUE where pattern occurs and FALSE where it does not.
grepl("Pole", c("Equation", "North Pole", "South Pole"))
## [1] FALSE TRUE TRUE
Application: using grep(), readLines(), and read.table() to only read a file that contains certain dates in its first column. In grep syntax, “^” means the line has to start with the pattern it is searching for. [1, 2] means that either 1 or 2 can be in that position in the pattern. value = TRUE, means that instead of returning indices, the whole line containing the pattern is returned.
f = file("data_sets/household_power_consumption.txt")
data = read.table(text=grep("^[1, 2]/2/2007", readLines(f), value = TRUE),
sep = ";", na.strings="?", header=TRUE, col.names = c("Date", "Time",
"Global_active_power", "Global_reactive_power", "Voltage",
"Global_intensity", "Sub_metering_1", "Sub_metering_2", "Sub_metering_3"))
hist(data$Global_active_power, breaks=12, xlab="Global Active Power (kilowatts)",
ylab="Frequency", main="Global Active Power", col="red")
any() reports whether any arguments in vector is true.
x = 1:10
any(x > 8)
## [1] TRUE
any(x > 88)
## [1] FALSE
which() returns the indices for which a condition is fulfilled in a vector.
z = c(5, 2, -3, 8)
which(z*z > 8)
## [1] 1 3 4
Determines index of maximum/minimum value in vector using which()
x = c(1, 4, 5, -5, 99, 3)
which.max(x)
## [1] 5
which.min(x)
## [1] 4
Testing vector inequality using all().
x = 1:3
y = c(1, 3, 4)
x == y
## [1] TRUE FALSE FALSE
all(x == y)
## [1] FALSE
Identical() tests for both value and type equality.
x = 1:3
y = c(1, 2, 3)
identical(x, y)
## [1] FALSE
typeof(x)
## [1] "integer"
typeof(y)
## [1] "double"
Matrices are vectors with special class attributes. By default, elements are stored in column-major order. One can specify during matrix creation to have elements stored in row-major order. When doing matrix addition or subtraction, the number of elements must be equal to each other for an element-wise operation.
x = matrix(seq(1, 4, 1), nrow=2)
x
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
y = x + 10:13
y
## [,1] [,2]
## [1,] 11 15
## [2,] 13 17
x = matrix(seq(1, 4, 1), nrow=2, byrow = T)
x
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
Using col() and row()
z = matrix(3:8, nrow = 3)
row(z)
## [,1] [,2]
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
col(z)
## [,1] [,2]
## [1,] 1 2
## [2,] 1 2
## [3,] 1 2
Replacing a row with a certain value
y = matrix(1:6, nrow = 3)
y[2, ] = 0
Remvoing rows and columns from matrices
y = matrix(1:6, nrow = 3)
y[-2, ]
## [,1] [,2]
## [1,] 1 4
## [2,] 3 6
y[, -2]
## [1] 1 2 3
Assignment of submatrices to matrices
x = matrix(nrow = 3, ncol = 3)
y = matrix(c(4, 5, 2, 3), nrow = 2)
x[2:3, 2:3] = y
x
## [,1] [,2] [,3]
## [1,] NA NA NA
## [2,] NA 4 2
## [3,] NA 5 3
Initializing a matrix, then assigning value to each element
y = matrix(nrow=2, ncol=2)
y
## [,1] [,2]
## [1,] NA NA
## [2,] NA NA
y[1, 1] = 1
y[2, 1] = 2
y
## [,1] [,2]
## [1,] 1 NA
## [2,] 2 NA
#Initializing a matrix of zeros
y = matrix(0, nrow=2, ncol=2)
y
## [,1] [,2]
## [1,] 0 0
## [2,] 0 0
Matrices: col() and row() functions.
z = matrix(3:8, nrow=3)
row(z)
## [,1] [,2]
## [1,] 1 1
## [2,] 2 2
## [3,] 3 3
col(z)
## [,1] [,2]
## [1,] 1 2
## [2,] 1 2
## [3,] 1 2
More complicated matrix subsetting.
m = matrix(seq(1, 6), nrow=3)
m
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
m[m[,1] > 1 & m[, 2] > 5, ]
## [1] 3 6
which(m > 2)
## [1] 3 4 5 6
which(m > 2, arr.ind = T)
## row col
## [1,] 3 1
## [2,] 1 2
## [3,] 2 2
## [4,] 3 2
Matrix element assignment and row/column removal.
x = matrix(nrow=3, ncol=3)
y = matrix(c(4, 5, 2, 3), nrow=2)
y
## [,1] [,2]
## [1,] 4 2
## [2,] 5 3
x[2:3, 2:3] = y
x
## [,1] [,2] [,3]
## [1,] NA NA NA
## [2,] NA 4 2
## [3,] NA 5 3
y[-2, ]
## [1] 4 2
Finding the column-wise/row-wise means using colMeans(x) or rowMeans(x)
x = matrix(c(1, 0, 1, 1, 0, 1, 1, 1, 1, 0), nrow=2, ncol=5, byrow = T)
x
## [,1] [,2] [,3] [,4] [,5]
## [1,] 1 0 1 1 0
## [2,] 1 1 1 1 0
colMeans(x)
## [1] 1.0 0.5 1.0 1.0 0.0
rowMeans(x)
## [1] 0.6 0.8
Retrieving dimensions of vectors/matrix distinction.
z = matrix(1:8, nrow=4)
z
## [,1] [,2]
## [1,] 1 5
## [2,] 2 6
## [3,] 3 7
## [4,] 4 8
str(z)
## int [1:4, 1:2] 1 2 3 4 5 6 7 8
length(z)
## [1] 8
attributes(z)
## $dim
## [1] 4 2
dim(z)
## [1] 4 2
nrow(z)
## [1] 4
ncol(z)
## [1] 2
Appending columns or rows to matrix using cbind() or rbind().
one = c(1, 1, 1, 1)
z = matrix(seq(1, 8), nrow=4)
z = cbind(one, z)
z
## one
## [1,] 1 1 5
## [2,] 1 2 6
## [3,] 1 3 7
## [4,] 1 4 8
z = cbind(2, z)
z
## one
## [1,] 2 1 1 5
## [2,] 2 1 2 6
## [3,] 2 1 3 7
## [4,] 2 1 4 8
z = rbind(6, z)
z
## one
## [1,] 6 6 6 6
## [2,] 2 1 1 5
## [3,] 2 1 2 6
## [4,] 2 1 3 7
## [5,] 2 1 4 8
cbind()
## NULL
How to avoid dimensionality reduction when extracting a row from a matrix.
z = matrix(1:8, nrow=4)
r = z[2, ]
r
## [1] 2 6
attributes(r) #r is a vector
## NULL
r = z[2,, drop=FALSE]
r
## [,1] [,2]
## [1,] 2 6
attributes(r) #r is now a matrix
## $dim
## [1] 1 2
Assigning colnames and rownames to matrices
z = matrix(1:4, nrow=2)
colnames(z) = c("a", "b")
rownames(z) = c("c", "d")
z
## a b
## c 1 3
## d 2 4
Using tapply() and split() with factors. The inputs for tapply() are tapply(vector, factor(s), function). split() performs the same task as tapply() except for applying a function. The inputs for split() are split(vector, factor(s)). split() returns a list of the splits.
ages = c(25, 26, 55, 37, 21, 42)
#Affils must have the same length as ages
affils = c("R", "D", "D", "R", "U", "D")
#affils is automatically converted to and treated as a factor by tapply()
tapply(ages, affils, mean)
## D R U
## 41 31 21
#tapply() when there are more than 1 factor
d = data.frame(gender=c("M", "M", "F", "M", "F", "F"), age=c(47, 59, 21, 32, 33, 24), income=c(55000, 88000, 32450, 76500, 123000, 45650))
d
## gender age income
## 1 M 47 55000
## 2 M 59 88000
## 3 F 21 32450
## 4 M 32 76500
## 5 F 33 123000
## 6 F 24 45650
d$over25 = ifelse(d$age>25, 1, 0)
d
## gender age income over25
## 1 M 47 55000 1
## 2 M 59 88000 1
## 3 F 21 32450 0
## 4 M 32 76500 1
## 5 F 33 123000 1
## 6 F 24 45650 0
tapply(d$income, list(d$gender, d$over25), mean)
## 0 1
## F 39050 123000.00
## M NA 73166.67
#split()
split(d$income, list(d$gender, d$over25))
## $F.0
## [1] 32450 45650
##
## $M.0
## numeric(0)
##
## $F.1
## [1] 123000
##
## $M.1
## [1] 55000 88000 76500
#Another application of split(): finding indices
g = c("M", "F", "F", "I", "M", "M", "F")
split(1:7, g)
## $F
## [1] 2 3 7
##
## $I
## [1] 4
##
## $M
## [1] 1 5 6
Generating factor levels. gl(number of levels, number of replications, optional character label vector).
gl(2, 3, labels=c("Male", "Female"))
## [1] Male Male Male Female Female Female
## Levels: Male Female
Creating a factor with repeated elements
g = factor(rep(c(0, 1), each=12))
Creating, examining, and manipulating factors
x = c(5, 12, 13, 12)
xf = factor(x)
xf
## [1] 5 12 13 12
## Levels: 5 12 13
str(xf) #str() displays internal structure of R object
## Factor w/ 3 levels "5","12","13": 1 2 3 2
unclass(xf)
## [1] 1 2 3 2
## attr(,"levels")
## [1] "5" "12" "13"
length(xf)
## [1] 4
#Adding new value entry to factor. Cannot add new value that is not a level.
xff = factor(x, levels=c(5, 12, 13, 88))
xff
## [1] 5 12 13 12
## Levels: 5 12 13 88
xff[2] = 88
xff
## [1] 5 88 13 12
## Levels: 5 12 13 88
#Transforming a vector to a factor
x = c("Jan", "Feb", "Jan", "April")
as.factor(x)
## [1] Jan Feb Jan April
## Levels: April Feb Jan
Filtering on matrices
x = matrix(c(1, 2, 3, 2, 3, 4), nrow = 3)
x[x[, 2] >= 3, ]
## [,1] [,2]
## [1,] 2 3
## [2,] 3 4
z = c(5, 12, 13)
x[z %% 2 == 1, ]
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
Filtering on vectors
z = c(5, 2, -3, 8)
z*z > 8
## [1] TRUE FALSE TRUE TRUE
Transforming a vector of TRUE and FALSE to 1 and 0.
x = c(TRUE, TRUE, FALSE, TRUE, TRUE, FALSE, FALSE, FALSE, TRUE)
x
## [1] TRUE TRUE FALSE TRUE TRUE FALSE FALSE FALSE TRUE
as.numeric(x)
## [1] 1 1 0 1 1 0 0 0 1
Remove all the NA from a vector
x = c(1, 2, 3, NA, 3, 2, NA)
is.na(x)
## [1] FALSE FALSE FALSE TRUE FALSE FALSE TRUE
x[!is.na(x)]
## [1] 1 2 3 3 2
Flattening effect of c().
c(5, 2, c(1.5, 6))
## [1] 5.0 2.0 1.5 6.0
Vector element can be assigned names and used as reference to corresponding element.
x = c(1, 2, 3)
names(x)
## NULL
names(x) = c("a", "b", "c")
x
## a b c
## 1 2 3
x["b"]
## b
## 2
names(x) = NULL
x
## [1] 1 2 3
Vectors names can alternatively be assigned this way:
x = c()
x["a"] = 1
x["b"] = 2
x
## a b
## 1 2
All elements in a vector must be of the same mode, or variable type. If elements of different modes are created during vector creation, then R will automatically convert elements to same mode.
x = c(2, "j")
typeof(x)
## [1] "character"
Vector initialization of specificed length.
x = vector(length=4)
x
## [1] FALSE FALSE FALSE FALSE
Given a vector, create a new vector with repeated elements.
x = c(4, 2, 17, 5)
y = x[c(1, 1, 3)]
y
## [1] 4 4 17
seq() can be used to (1) generate numeric vector of evenly-spaced numbers from minimum to maximum or (2) genereate a sequence of numbers from 1 to length of input vector.
qs = seq(0, 20, 2)
qs
## [1] 0 2 4 6 8 10 12 14 16 18 20
x = c(5, 12, 13, 14)
qs = seq(x)
qs
## [1] 1 2 3 4
x = NULL
qs = seq(x)
#Empty integer vector
qs
## integer(0)
#Creating a sequence of values of specificed length
lambdas = seq(0, 10, len=10)
lambdas
## [1] 0.000000 1.111111 2.222222 3.333333 4.444444 5.555556 6.666667
## [8] 7.777778 8.888889 10.000000
Removing elements from vector using negative indexing
x = c(5, 12, 13, 3, 6, 0, 1, 15, 16, 8, 88)
x[-1]
## [1] 12 13 3 6 0 1 15 16 8 88
x[-length(x)]
## [1] 5 12 13 3 6 0 1 15 16 8
Given a vector, generate indices
Given a function with multiple inputs and a list of values for each type of input, outer() feeds every unique combination of input values into a function and outputs a matrix.
x = 1:3
y = 1:3
f1 = function(x, y) return(x*y)
outer(x, y, f1)
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 2 4 6
## [3,] 3 6 9
To generate all unique combinations of elements in two vectors.
expand.grid(seq(3), seq(3))
## Var1 Var2
## 1 1 1
## 2 2 1
## 3 3 1
## 4 1 2
## 5 2 2
## 6 3 2
## 7 1 3
## 8 2 3
## 9 3 3
Reorganizing dataframe feature variables: sometimes tasks such as boxplotting requires reformatting an existing dataframe. In a \(m\) by \(n\) data matrix, if you want to combine features \(i\) and \(j\) into one column and create another column indicating whether the values are of feature \(i\) or feature \(j\), you can use melt() in reshape2 package. Let us work through an example on the diamonds dataset from ggplot2.
library(reshape2)
library(ggplot2)
head(diamonds)
## Source: local data frame [6 x 10]
##
## carat cut color clarity depth table price x y z
## (dbl) (fctr) (fctr) (fctr) (dbl) (dbl) (int) (dbl) (dbl) (dbl)
## 1 0.23 Ideal E SI2 61.5 55 326 3.95 3.98 2.43
## 2 0.21 Premium E SI1 59.8 61 326 3.89 3.84 2.31
## 3 0.23 Good E VS1 56.9 65 327 4.05 4.07 2.31
## 4 0.29 Premium I VS2 62.4 58 334 4.20 4.23 2.63
## 5 0.31 Good J SI2 63.3 58 335 4.34 4.35 2.75
## 6 0.24 Very Good J VVS2 62.8 57 336 3.94 3.96 2.48
Say we are interested in combining columns x, y, and z into one column while creating another column to indicate whether the value in the combined column comes from x, y, or z, we would:
dat2 = melt(diamonds, id.vars = c("carat", "cut", "color", "clarity", "depth", "table", "price"), measure.vars = c("x", "y", "z"))
head(dat2)
## carat cut color clarity depth table price variable value
## 1 0.23 Ideal E SI2 61.5 55 326 x 3.95
## 2 0.21 Premium E SI1 59.8 61 326 x 3.89
## 3 0.23 Good E VS1 56.9 65 327 x 4.05
## 4 0.29 Premium I VS2 62.4 58 334 x 4.20
## 5 0.31 Good J SI2 63.3 58 335 x 4.34
## 6 0.24 Very Good J VVS2 62.8 57 336 x 3.94
The combined column is named value and the column designating x, y, or z assignments is named variable. id.vars parameter specifies columns that remains unchanged while measure.vars parameter specifies names of columns to combine.
To suppress warnings of a function
suppressWarnings(<function>)
Converting a string to a variable name
assign("x", 5)
invisible() can be used to suppress output
invisible(max(2, 4))
pmax() or pmin() allows one to calculate maxima or minima of all input values in parallel.
set.seed(1)
n = 10
mat = matrix(rnorm(2*n), ncol = 2)
head(mat)
## [,1] [,2]
## [1,] -0.6264538 1.51178117
## [2,] 0.1836433 0.38984324
## [3,] -0.8356286 -0.62124058
## [4,] 1.5952808 -2.21469989
## [5,] 0.3295078 1.12493092
## [6,] -0.8204684 -0.04493361
maxs = pmax(mat[,1], mat[, 2])
maxs
## [1] 1.51178117 0.38984324 -0.62124058 1.59528080 1.12493092
## [6] -0.04493361 0.48742905 0.94383621 0.82122120 0.59390132
The locator() function will return the x and y positions of the position you click your mouse on in the plot window. To get the x and y positions of two separate mouse-clicks on the graph:
locator(2)
To download a file from the internet to a destination folder.
fileUrl = "<url>"
download.file(fileUrl, destfile = "<path_destination_file>", method = "curl")
To create a vector containing a specificed number of evenly-spaced points between 0 and 1 using ppoints().
#Create a vector of 10 points evenly spaced between 0 and 1.
ppoints(10)
## [1] 0.06097561 0.15853659 0.25609756 0.35365854 0.45121951 0.54878049
## [7] 0.64634146 0.74390244 0.84146341 0.93902439
To get the arguments for a function, use args(). To get the help files for a function, one way is to use help().
args(qnorm)
## function (p, mean = 0, sd = 1, lower.tail = TRUE, log.p = FALSE)
## NULL
help(qnorm)
expand.grid() creates a dataframe of all combinations of parameters fed to it:
ms=2^c(1:3)
ks=seq(1,9,2)
params = expand.grid(k=ks,m=ms)
params
## k m
## 1 1 2
## 2 3 2
## 3 5 2
## 4 7 2
## 5 9 2
## 6 1 4
## 7 3 4
## 8 5 4
## 9 7 4
## 10 9 4
## 11 1 8
## 12 3 8
## 13 5 8
## 14 7 8
## 15 9 8
replicate(n, expr) is used for repeated evaluation of an expression.
group = replicate(3, rnorm(5, 5, 1))
group
## [,1] [,2] [,3]
## [1,] 5.918977 4.943871 6.358680
## [2,] 5.782136 4.844204 4.897212
## [3,] 5.074565 3.529248 5.387672
## [4,] 3.010648 4.521850 4.946195
## [5,] 5.619826 5.417942 3.622940
gsub() performs replacements for all patterns in a character vector.
x = c("Bad stuff happens", "Bad food", "Bad air")
gsub("Bad", "Good", x)
## [1] "Good stuff happens" "Good food" "Good air"
Integration.
integrand = function(x){x^2}
area = integrate(f = integrand, lower = -1, upper = 1)
area
## 0.6666667 with absolute error < 7.4e-15
Tests to see if all elements in one data structure is equal to all elements in the other.
x = runif(10, 0, 10)
y = x
all.equal(y, x)
## [1] TRUE
If \(x\) is a matrix and \(y\) is a vector, then sweep(x, 1, y, FUN="*") applies the function * to x[i, ] and element y[i], and in this case x[i, ]*y[i]. If the second argument in sweep() is 2 instead of 1, then sweep() applies this column-wise.
set.seed(2)
data = 1:10
x = matrix(sample(data, 9, replace = TRUE), byrow = TRUE, nrow = 3)
x
## [,1] [,2] [,3]
## [1,] 2 8 6
## [2,] 2 10 10
## [3,] 2 9 5
y = sample(c(-1, 1), nrow(x), replace=TRUE)
y
## [1] 1 1 -1
z = sweep(x, 1, y, FUN="*")
z
## [,1] [,2] [,3]
## [1,] 2 8 6
## [2,] 2 10 10
## [3,] -2 -9 -5
Set operations
x = c(1, 2, 4, 5, 6, 7, 10)
y = c(5, 3, 2, 6, 24, 34)
union(x, y)
## [1] 1 2 4 5 6 7 10 3 24 34
intersect(x, y)
## [1] 2 5 6
#Subtract y from x
setdiff(x, y)
## [1] 1 4 7 10
#Subtract x from y
setdiff(y, x)
## [1] 3 24 34
setequal(x, y)
## [1] FALSE
Estimate of the memory used to store R object.
object.size(x)
Returns a vector, data frame, or array but with duplicate elements/rows removed.
x = c(rep(1, 3), rep(2, 5))
x
## [1] 1 1 1 2 2 2 2 2
unique(x)
## [1] 1 2
Find the factorial of a number.
factorial(3)
## [1] 6
crossprod() is a function that returns the inner-crossproduct of a vector with itself.
n = 1:10
x = sample(n, 25, replace=TRUE)
y = sample(n, 25, replace = TRUE)
crossprod(y-x)
## [,1]
## [1,] 440
Check the version of a R package.
packageVersion("ggplot2")
## [1] '2.1.0'
Importing dates from character format using as.Date(). Returns a Date object and accepts a character vector.
dates <- c("05/27/84", "07/07/05")
#Format refers to format of input date format.
betterDates <- as.Date(dates, format = "%m/%d/%y")
betterDates
## [1] "1984-05-27" "2005-07-07"
months = format(betterDates, "%m")
months
## [1] "05" "07"
class(betterDates)
## [1] "Date"
Finding the rank of each value in a vector:
x = c(1, 4, 3, 6, 10)
rank(x)
## [1] 1 3 2 4 5
range() function returns a vector containing the minimum and maximum value of a given vector or vectors.
x = rpois(50, 3)
range(x, na.rm=TRUE)
## [1] 0 7
Save dataframe to a file. Assume we have a data frame called expressions.
save(expressions, file="expressions.Rda")
load("expressions.Rda")
Viewing the current setting of a function.
#Examples
par("bg")
## [1] "white"
par("mar")
## [1] 5.1 4.1 4.1 2.1
par("mfrow")
## [1] 1 1
View list of parameters for a function. Example is par().
parameters = par()
class(parameters)
## [1] "list"
length(parameters)
## [1] 72
names(parameters)
## [1] "xlog" "ylog" "adj" "ann" "ask"
## [6] "bg" "bty" "cex" "cex.axis" "cex.lab"
## [11] "cex.main" "cex.sub" "cin" "col" "col.axis"
## [16] "col.lab" "col.main" "col.sub" "cra" "crt"
## [21] "csi" "cxy" "din" "err" "family"
## [26] "fg" "fig" "fin" "font" "font.axis"
## [31] "font.lab" "font.main" "font.sub" "lab" "las"
## [36] "lend" "lheight" "ljoin" "lmitre" "lty"
## [41] "lwd" "mai" "mar" "mex" "mfcol"
## [46] "mfg" "mfrow" "mgp" "mkh" "new"
## [51] "oma" "omd" "omi" "page" "pch"
## [56] "pin" "plt" "ps" "pty" "smo"
## [61] "srt" "tck" "tcl" "usr" "xaxp"
## [66] "xaxs" "xaxt" "xpd" "yaxp" "yaxs"
## [71] "yaxt" "ylbias"
List files in current directory.
dir()
## [1] "analysisReport.txt" "Bash.ipynb"
## [3] "C" "data_sets"
## [5] "file.csv" "Git_Tutorial.ipynb"
## [7] "Git.txt" "HTML:CSS"
## [9] "Java_Programs" "Java.docx"
## [11] "Java.ipynb" "Java.txt"
## [13] "Latex" "latex-guide.pdf"
## [15] "Latex.txt" "note.txt"
## [17] "python" "Python.ipynb"
## [19] "Python.txt" "R_cache"
## [21] "R_data" "R_figure"
## [23] "R_package.txt" "R-plotting.html"
## [25] "R.html" "R.R"
## [27] "R.Rmd" "R.txt"
## [29] "results.txt" "rmarkdown-cheatsheet.pdf"
## [31] "Rprof.out" "rsconnect"
## [33] "SQL.ipynb" "Unix.txt"
Two ways of timing a program - one with proc.time(), one with system.time().
g = rnorm(1e5)
h = rep(NA, 1e5)
#Stop watch style
start = proc.time()
for(i in 1:1e5){
h[i] = g[i] + 1
}
end = proc.time()
end - start
## user system elapsed
## 0.290 0.005 0.302
#System
system.time(for(i in 1:1e5){h[i]=g[i]+1})
## user system elapsed
## 0.128 0.004 0.133
Install package from github
library(devtools)
install_github('epurdom/clusterCells')
Install a bioconductor package
library(BiocInstaller)
biocLite(<package>)
Get help files of a package.
help(cluster01)
Check if a package is installed. Returns boolean value.
require("devtools")
Check current version of R
R.Version()
## $platform
## [1] "x86_64-apple-darwin13.4.0"
##
## $arch
## [1] "x86_64"
##
## $os
## [1] "darwin13.4.0"
##
## $system
## [1] "x86_64, darwin13.4.0"
##
## $status
## [1] ""
##
## $major
## [1] "3"
##
## $minor
## [1] "2.4"
##
## $year
## [1] "2016"
##
## $month
## [1] "03"
##
## $day
## [1] "10"
##
## $`svn rev`
## [1] "70301"
##
## $language
## [1] "R"
##
## $version.string
## [1] "R version 3.2.4 (2016-03-10)"
##
## $nickname
## [1] "Very Secure Dishes"
Installing and loading R packages.
install.packages('dplyr')
library('dplyr')
Get current date in a more detailed format.
date()
## [1] "Wed Jan 4 19:32:12 2017"
Get current date in yyyy-mm-dd format.
Sys.Date()
## [1] "2017-01-04"
print() and cat() functions. Both functions are useful when one needs to print to the screen within the body of a function. The output of print() is numbered whereas the output of cat() is not.
print("abc")
## [1] "abc"
cat("abc\n") #\n is necessary to end on the next line. Default separater is a space.
## abc
cat("abc", "de\n")
## abc de
#Set separater between items.
cat("abc", "de\n", sep="")
## abcde
cat("abc", "de\n", sep="\n")
## abc
## de
x = c(5, 12, 13, 8, 88)
cat(x, sep=c(".", ".", ".", "\n", "\n"))
## 5.12.13.8
## 88
A closure consists of a function that sets up a local variable and then creates another function that accesses that variable.<<- assigns value to the local variable.
counter = function(){
ctr = 0
f = function(){
ctr <<- ctr + 1
cat("this count currently has value", ctr, "\n") #cat() concentates variables of different modes together, with a space automatically included in between.
}
}
c1 = counter()
c2 = counter()
c1()
## this count currently has value 1
c1()
## this count currently has value 2
c2()
## this count currently has value 1
c2()
## this count currently has value 2
Writing upstairs. Normally, write access to variables at higher levels via the standard assignment operator is not possible. In order to write to variables at higher levels, one can use the “<<-” operator or the assign() function.
two = function(u){
u <<- 2*u
z = 2*z
}
x = 1
z = 3
two(x)
x
## [1] 1
z
## [1] 3
u
## [1] 2
In this example, because x was copied to the formal argument u, x did not change. z did not change because z is a global variable and code within the function two does not have write access to the variable z. u did not exist prior to calling two(), but it was created as a top-level variable by superassignment.
The superassignment operator assigns to the first variable it is called to assign to as it searches up the levels.
f = function(){
inc = function(){x <<- x + 1}
x = 3
inc()
return(x)
}
f()
## [1] 4
x
## [1] 1
Alternatively you could also perform superassignment using the assign() function.
x = 1
two = function(u){
assign("u", 2*u, pos=.GlobalEnv) #pos=.GlobalEnv makes u a global variable
}
two(x)
u
## [1] 2
w = 12
f = function(y){
d = 8
h = function(){
return(d*(w+y))
}
return(h())
}
Using ls() and ls.str() to display top-level environment
ls()
#More detailed
ls.str()
Using the formals() and body() functions to break down a function. formals() indicates the function argument where body() indicates the function expression.
g = function(x){
return(x + 1)
}
formals(g)
## $x
body(g)
## {
## return(x + 1)
## }
Functions can return functions.
g = function(){
t = function(x) return(x^2)
return(t)
}
g()
## function(x) return(x^2)
## <environment: 0x7fbd42f830d8>
Return statements in a R function can be written in two ways. return() will be slower than simply expressing the result. For clarity it is customary to include return() statements in the middle of code block and omit return at the very end of the code block. return() statements can return any R object, even functions!
return(x)
#Or
x
The get() function takes as an argument a character string representing the name of some object and returns the object of that name.
u = matrix(seq(1, 6), nrow=3)
u
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
h = get("u")
h
## [,1] [,2]
## [1,] 1 4
## [2,] 2 5
## [3,] 3 6
class(h)
## [1] "matrix"
cut() function can be used to generate factors by determining which bin each element in x falls into. The form of cut() is cut(data_vector, binmarks, labels=F).User defined bins are inclusive of the last number but exclusive of the first number.c
z = c(1, 1, 1, 2, 3, 4, 5, 6)
binmarks = seq(0.0, 6, 1)
c1 = cut(z, binmarks, labels=F)
table(c1)
## c1
## 1 2 3 4 5 6
## 3 1 1 1 1 1
List the files in a directory with list.files(), which returns a character vector.
list.files("./data")
Check if a directory exists, if it doesn’t, then create that directory
if (!file.exists("data")){
dir.create("data")
}
Get and set current working directory.
getwd()
setwd("../")
#On Mac...
setwd("/Users/Documents")
#On
setwd("C:\\Users\\Calvin\\Downloads")
Rounding numbers
round(123.456, digits=2) #or simply round(123.456, 2)
## [1] 123.46
round(123.456, -2)
## [1] 100
#signif() rounds to the number of significant digits provided
signif(-123.456, 4)
## [1] -123.5
Paste0(x) is a shortcut for paste(x, sep=“”)
paste("a", "b")
## [1] "a b"
paste0("a", "b")
## [1] "ab"
Remove all environment variables
rm(list=ls())
x = c(0, 3, 2)
rm(x)
Download file from the internet
#tmpfile is the destination file
download.file("http://seanlahman.com/files/database/lahman-csv_2014-02-14.zip", "tmpfile.zip")
Paste function
paste(c("Hello", "World!"))
## [1] "Hello" "World!"
Paste and concatenate result into single string.
paste(c(1, 2, 3, 4))
## [1] "1" "2" "3" "4"
paste(c(1, 2, 3, 4), collapse = "")
## [1] "1234"
Using the break key word to break out of a loop. In this example, break is used to find the index of the first 1 in a vector
x = c(4, 5, 6, 1, 9)
first1 = function(x){
for (i in 1:length(x)){
if (x[i] == 1) break
}
return(i)
}
first1(x)
## [1] 4
R recycling. Applying an operation to two vectors requires them to be the same length. If they are not, R repeats the shorter one until it is as long as the longer vector.
x = seq(1, 5, 1)
x - 1
## [1] 0 1 2 3 4
y = matrix(seq(1, 6), nrow = 3)
y + c(1, 2)
## [,1] [,2]
## [1,] 2 6
## [2,] 4 6
## [3,] 4 8
Operator precedence
i = 2
1:i-1
## [1] 0 1
1:(i-1)
## [1] 1
Rounding numbers in R.
x = c(1.2, 3.9, 0.4)
z = round(x)
z
## [1] 1 4 0
Using stop() in R as a check. In this example, we are stopping if the input c in function f is not a scalar.
x = c(1:3)
f = function(x, c){
if(length(c) != 1) stop("vector c not allowed")
else{
return((x+c)^2)
}
}
f(x, 2)
## [1] 9 16 25
NA means that the value exists but is not known. When operating on vectors that have NAs in them, it is possible to tell the operator to ignore the NAs.
x = c(88, NA, 12, 168)
mean(x)
## [1] NA
mean(x, na.rm=TRUE)
## [1] 89.33333
mode() vs class() vs typeof(). mode() and typeof() are similar and returns how an object is stored in memory, while class() returns an object’s abstract data type. A more detailed discussion of this can be found here.
x = matrix(1:16, nrow=4)
mode(x)
## [1] "numeric"
typeof(x)
## [1] "integer"
class(x)
## [1] "matrix"
NULL means that the value does not exist. NULL is useful in the application of building vectors.
z = NULL
for (i in 1:10) if (i %% 2 == 0) z = c(z, i) # %% is modulo operator
z
## [1] 2 4 6 8 10
u = NULL
length(u)
## [1] 0
v = NA
length(v)
## [1] 1
ifelse() allows one to write a condensed version of if else statements. ifelse() takes in 3 inputs - condition, value returned if condition is fulfilled, and value returned otherwise. ifelse() statements can be nested.
x = 1:10
y = ifelse(x %% 2 == 0, 5, 12) #if x%% 2 == 0, then 5, else 12
y
## [1] 12 5 12 5 12 5 12 5 12 5
g <- c("M", "F", "F", "I", "M", "M", "F")
h = ifelse(g=="M", 1, ifelse(g=="F", 2, 3))
h
## [1] 1 2 2 3 1 1 2
diff() function by default calculates the difference between each element and the element before it in position.
u = c(1, 6, 7, 2, 3, 5)
diff(u)
## [1] 5 1 -5 1 2
sign() converts numbers in its argument vector to 1, 0, or -1, depending on whether they are positive, zero, or negative.
x = c(5, 1, -5, 1, 2)
sign(x)
## [1] 1 1 -1 1 1
Classical multidimensional scaling of a matrix. Requires as minimum input a dissimilarity matrix. By default returns a reduced feature vector (features = 2) for each sample.
library(tissuesGeneExpression)
data(tissuesGeneExpression)
d = dist(t(e))
mds = cmdscale(d)
plot(mds[,1], mds[,2])
Performing PCA. Usually want to perform PCA after calling scale() on data matrix. The principle components of scaled data are the columns of V in the svd of the same scaled data.
mat = scale(matrix(c(1, 2, 2, 5, 3, 7), nrow=2))
prcomp(mat)
## Standard deviations:
## [1] 1.732051 0.000000
##
## Rotation:
## PC1 PC2
## [1,] 0.5773503 -0.5773503
## [2,] 0.5773503 0.7886751
## [3,] 0.5773503 -0.2113249
Say we have a \(n x m\) data matrix X, calling scale() on it will subtract the mean from each column and divide each column by its standard deviation.
mat = matrix(c(1, 2, 2, 5, 3, 7), nrow=2)
scale(mat)
Performing singular value decomposition by calling the svd() function, which returns a list containing U, S, and V (no transpose). S is a vector. To convert S to a diagonal matrix, use diag() function. The right singular vectors represent the features.
mat = matrix(c(1, 2, 2, 5, 3, 7), nrow=2)
mat
## [,1] [,2] [,3]
## [1,] 1 2 3
## [2,] 2 5 7
result = svd(mat)
class(result)
## [1] "list"
result
## $d
## [1] 9.5899624 0.1806108
##
## $u
## [,1] [,2]
## [1,] -0.3897782 -0.9209087
## [2,] -0.9209087 0.3897782
##
## $v
## [,1] [,2]
## [1,] -0.2327012 -0.7826345
## [2,] -0.5614308 0.5928424
## [3,] -0.7941320 -0.1897921
#To diagonalize eigenvalues
diag(result$d)
## [,1] [,2]
## [1,] 9.589962 0.0000000
## [2,] 0.000000 0.1806108
To perform hierarchical clustering:
dist() function.hclust() function.Code demonstration. Note that hierarchical clustering is not deterministic.
#Code for generating clusters.
set.seed(1234)
x = rnorm(12, mean=rep(1:3, each=4), sd=0.2)
y = rnorm(12, mean=rep(c(1, 2, 1), each=4), sd=0.2)
plot(x, y, col="blue", pch=19, cex=2)
text(x+0.05, y+0.05, labels=as.character(1:12))
#To perform hierarchical clustering. Default uses complete linkage.
dataFrame = data.frame(x=x, y=y)
distxy = dist(dataFrame)
hClustering = hclust(distxy)
plot(hClustering)
#To determine the number of clusters/cluster assignments when tree is cut at a certain height specified by parameter h in
cut = cutree(hClustering, h=1)
cut
## [1] 1 1 1 1 2 2 2 2 3 3 3 3
#To plot dendrogram without labels
plot(as.dendrogram(hClustering))
#To perform hierarchical clustering using average linkage.
hClustering = hclust(distxy, method = "average")
plot(hClustering)
#To perform k-means clustering. The kmeans() function returns an object that contains many useful results.
kmeansObj = kmeans(dataFrame, centers=3)
#Items in the kmeans object.
names(kmeansObj)
## [1] "cluster" "centers" "totss" "withinss"
## [5] "tot.withinss" "betweenss" "size" "iter"
## [9] "ifault"
#Cluster assignments
kmeansObj$cluster
## [1] 3 3 3 3 1 1 1 1 2 2 2 2
#Number of iterations gone through
kmeansObj$iter
## [1] 2
#Centroids
kmeansObj$centers
## x y
## 1 1.9906904 2.0078229
## 2 2.8534966 0.9831222
## 3 0.8904553 1.0068707
#cluster centroids in KmeansObj$centers
par(mar=rep(0.2, 4))
plot(x, y, col=kmeansObj$cluster, pch=19, cex=2)
points(kmeansObj$centers, col=1:3, pch = 3, cex=3, lwd=3)
Chi-square test of independence with \(\alpha = 0.05\).
library(MASS) #loads survey dataset
tbl = table(survey$Smoke, survey$Exer)
tbl
##
## Freq None Some
## Heavy 7 1 3
## Never 87 18 84
## Occas 12 3 4
## Regul 9 1 7
chisq.test(tbl)
## Warning in chisq.test(tbl): Chi-squared approximation may be incorrect
##
## Pearson's Chi-squared test
##
## data: tbl
## X-squared = 5.4885, df = 6, p-value = 0.4828
Since the p-value \(\geq\) 0.05, do not reject null hypothesis that smoking habit is independent of exercise level of students.
Using the quantile() function to produce sample quantiles corresponding to given probabilities.
set.seed(1)
x = rpois(100, 4)
quantile(x, probs = c(0.25, 0.5, 0.75))
## 25% 50% 75%
## 3 4 5
PAM clustering via pam() function, which accepts as input a distance matrix. In this example, result is an object and its components can be accessed via result$.
require(cluster)
group1 = replicate(3, rnorm(5, 5, 1))
group2 = replicate(3, rnorm(5, 2, 1))
data = rbind(t(group1), t(group2))
result = pam(data, 2)
result
## Medoids:
## ID
## [1,] 2 6.980400 4.632779 3.955865 5.569720 4.864945
## [2,] 6 2.291446 1.556708 2.001105 2.074341 1.410479
## Clustering vector:
## [1] 1 1 1 2 2 2
## Objective function:
## build swap
## 1.654503 1.654503
##
## Available components:
## [1] "medoids" "id.med" "clustering" "objective" "isolation"
## [6] "clusinfo" "silinfo" "diss" "call" "data"
Generate a random permutation of a given vector
x = 1:10
sample(x)
## [1] 3 9 4 6 7 10 1 2 5 8
Uniformly sample with/without replacement from a vector of elements.
set.seed(1)
x = rnorm(10, 4, 1)
x
## [1] 3.373546 4.183643 3.164371 5.595281 4.329508 3.179532 4.487429
## [8] 4.738325 4.575781 3.694612
sample(x, 5)
## [1] 3.694612 4.183643 3.179532 3.373546 4.575781
sample(x, 5, replace = TRUE)
## [1] 5.595281 3.373546 5.595281 4.575781 5.595281
Given a sample x feature matrix, the function dist() will compute the distance (default = Euclidean) matrix for the samples. A distance matrix can have dimensions sample x sample and lists the distance between each sample. One can use the function image() to visualize the distance matrix.
#prepare mxn matrix
group1 = replicate(3, rnorm(5, 5, 1))
group2 = replicate(3, rnorm(5, 2, 1))
data = rbind(t(group1), t(group2))
dim(data)
## [1] 6 5
#Most concise
dist(data)
## 1 2 3 4 5
## 2 3.197198
## 3 2.751214 2.557937
## 4 8.027959 7.414114 6.490518
## 5 7.362879 7.363116 5.776047 3.056570
## 6 7.729928 7.575169 6.377825 1.812434 1.748685
#To see how many distances are calculated
length(dist(data))
## [1] 15
#With diagonal entries
dist(data, diag=T)
## 1 2 3 4 5 6
## 1 0.000000
## 2 3.197198 0.000000
## 3 2.751214 2.557937 0.000000
## 4 8.027959 7.414114 6.490518 0.000000
## 5 7.362879 7.363116 5.776047 3.056570 0.000000
## 6 7.729928 7.575169 6.377825 1.812434 1.748685 0.000000
#Complete
dist(data, diag=T, upper = T)
## 1 2 3 4 5 6
## 1 0.000000 3.197198 2.751214 8.027959 7.362879 7.729928
## 2 3.197198 0.000000 2.557937 7.414114 7.363116 7.575169
## 3 2.751214 2.557937 0.000000 6.490518 5.776047 6.377825
## 4 8.027959 7.414114 6.490518 0.000000 3.056570 1.812434
## 5 7.362879 7.363116 5.776047 3.056570 0.000000 1.748685
## 6 7.729928 7.575169 6.377825 1.812434 1.748685 0.000000
#Visualize distance matrix
image(as.matrix(dist(data)))
Here are the key functions that a heatmap performs:
heatmap().m x n matrix for heatmap to work.Input: m x n matrix with or without colnames/rownames
#prepare mxn matrix
group1 = replicate(3, rnorm(5, 5, 1))
group2 = replicate(3, rnorm(5, 2, 1))
data = rbind(t(group1), t(group2))
data
## [,1] [,2] [,3] [,4] [,5]
## [1,] 4.292505 5.3645820 5.768533 4.887654 5.8811077
## [2,] 5.398106 4.3879736 5.341120 3.870637 6.4330237
## [3,] 6.980400 4.6327785 3.955865 5.569720 4.8649454
## [4,] 4.401618 1.9607600 2.689739 2.028002 1.2567268
## [5,] 2.188792 0.1950414 3.465555 2.153253 4.1726117
## [6,] 2.475510 1.2900536 2.610726 1.065902 0.7463666
#matrix labels
sampleNames = vector(length=6)
for(i in seq(6)){
sampleNames[i] = paste("Sample", i)
}
rownames(data) = sampleNames
data = t(data) #Want feature by sample because then samples will be clustered.
#Plot heatmap
heatmap(data)
#Plot heatmap with color key and greater flexibility than R's built-in heatmap(). In heatmap.2()'s heatmaps, colors are based on absolute value, so it does not matter whether samples are by row or by column. scale="none" means no centering of values by row or by column.
require(gplots)
heatmap.2(data, col=redgreen(75), scale="none", density.info="none", trace="none")
Compute all permutations of a subset of a set with repeats.Returns a matrix with each row representing a permutation. In this example, 3 is the size of set of elements to permutate, 2 is the size of subset, and x is the vector containing elements.
library(gtools)
x = c("A", "G", "C")
permutations(3, 2, x, repeats.allowed = TRUE)
## [,1] [,2]
## [1,] "A" "A"
## [2,] "A" "C"
## [3,] "A" "G"
## [4,] "C" "A"
## [5,] "C" "C"
## [6,] "C" "G"
## [7,] "G" "A"
## [8,] "G" "C"
## [9,] "G" "G"
Calculating correlation
x = seq(1, 10)
y = seq(1, 10) + runif(10, -0.5, 0.5)
cor(x, y)
## [1] 0.99897
m = matrix(c(x, y), nrow = 10, ncol = 2)
m
## [,1] [,2]
## [1,] 1 1.114645
## [2,] 2 2.057160
## [3,] 3 2.828777
## [4,] 4 3.953131
## [5,] 5 5.000441
## [6,] 6 5.680866
## [7,] 7 7.029631
## [8,] 8 7.575276
## [9,] 9 8.777756
## [10,] 10 9.712700
#If cor() is called on a matrix, then it will return a correlation matrix
cor(m)
## [,1] [,2]
## [1,] 1.00000 0.99897
## [2,] 0.99897 1.00000
Find parameters of F-distribution that fit the data using fitFDist(), which will return a list of the scale factor (shifts plot horizontally) and the df2 factor (controls shape). Suppose X is the data we are trying to fit, and we known that the data has a degree of freedom of 14.
library(limma)
fitFDist(x, 14)
Computing variance of values in a vector.
x = c(1, 2, 1.4, 0.5)
var(x)
## [1] 0.4025
How to determine if the data fits any known distribution using Q-Q plots. In this example, we are going to see if data sampled from a poisson distribution indeed fits the poisson distribution.
x = rpois(100, 4) #Generates samples drawn from poisson distribution with lambda = 4.
ps = seq(0, 1, len=100) #Generates CDF probabilities to feed to qpois()
qpoisq = qpois(ps, 4) #Generates values of Poisson random variable that correspond to cumulative probability
qqplot(qpoisq, x)
How to determine if the data fits a normal distribution using Q-Q plots.
x = rnorm(100, 5, 2) #Generates 100 values drawn from normal distribution with mean=5 and standard deviation=2
qqnorm(x) #Creates Normal Q-Q Plot
qqline(x) #Fits a line to the Normal Q-Q plot
Application: Generating a covariance matrix given known variance of 1 and user provided covariance.
makecov = function(rho, n){
m = matrix(nrow=n, ncol=n)
m = ifelse(col(m)==row(m), 1, rho)
return(m)
}
cov = makecov(0.2, 3)
cov
## [,1] [,2] [,3]
## [1,] 1.0 0.2 0.2
## [2,] 0.2 1.0 0.2
## [3,] 0.2 0.2 1.0
Binomial distribution density. \(p = \binom{N}{x}p^{x}(1-p)^{N-x}\).
N = 10
p = 0.5
x = 1
dbinom(x, N, p)
## [1] 0.009765625
Cumulative probability density for the binomial distribution. In this example, the probability of 4 or less successes is calculated for \(binom(0.2)\)
pbinom(4, size=12, prob=0.2)
## [1] 0.9274445
Poisson distribution density. \(p = e^{-\lambda}\frac{\lambda^{k}}{k!}\)
lambda = 1.07
k = 0
dpois(k, lambda)
## [1] 0.3430085
#Log likelihood
dpois(k, lambda, log=TRUE)
## [1] -1.07
#CDF
k = 4
ppois(k, lambda)
## [1] 0.9951501
In multiple hypothesis testing, one can adjust p-values by calculating corresponding q-values to control the false discovery rate. This approach is less conservative than the Bonferroni correction, yielding less false negatives. There are two ways of calculating q-values in R–using qvalue() function from the qvalue library or by using p.adjust(). These two different methods yield slightly different results, but seek to accomplish the same goal of controlling for the false discovery rate in multiple hypothesis testing. Below is test code calculating the false positives and false negatives from a simulation where the ground truth is known, using Bonferroni correction and the two methods for calculating q-values.
#Initializing parameters. There are 24 samples and 8793 genes. Of those genes, 500 of those are set to be different in the treatment versus control by adding 2 to their values. This Monte Carlo simulation is run 1000 times.
library(qvalue)
library(genefilter)
n = 24
m = 8793
delta = 2
positives = 500
B = 100
set.seed(1)
#Of the 24 samples, 12 are controls and 12 are treatments.
results = replicate(B, {
g <- factor(rep(c(0, 1), each=(n/2)))
mat = matrix(rnorm(n*m), m, n) #Generate n*m numbers from a standard normal distribution
mat[1:positives, 1:(n/2)] = mat[1:positives, 1:(n/2)] + delta
pvals = rowttests(mat, g)$p.val
qvals1 = p.adjust(pvals, method = "fdr")
fd1 = sum(qvals1[-(1:positives)] <= 0.05)
fdr1 = fd1/(m-positives)
fn1 = sum(qvals1[1:positives] > 0.05)
fnr1 = fn1/(positives)
qvals2 = qvalue(pvals)$qvalue
fd2 = sum(qvals2[-(1:positives)] <= 0.05)
fdr2 = fd2/(m-positives)
fn2 = sum(qvals2[1:positives] > 0.05)
fnr2 = fn2/(positives)
fd3 = sum(pvals[-(1:positives)] <= 0.05/m)
fdr3 = fd1/(m-positives)
fn3 = sum(pvals[1:positives] > 0.05/m)
fnr3 = fn3/(positives)
c(fdr1, fnr1, fdr2, fnr2, fdr3, fnr3)})
False Positive Rate (q-value method 1):
## [1] 0.00268178
False Negative Rate (q-value method 1):
## [1] 0.08292
False Discovery Rate (q-value method 2):
## [1] 0.002920535
False Negative Rate (q-value method 2):
## [1] 0.07854
False Discovery Rate (Bonferroni Correction):
## [1] 0.00268178
False Negative Rate (Bonferroni Correction):
## [1] 0.7649