Alencar Xavier (alenxav@gmail.com)
July 21, 2017

INTRODUCTION TO R

A brief introduction to R for plant breeding and quantitative genetics students

AGRY611, Spring 2017

AGRY620, Fall 2017

Why to use R?

1. Functional

2. Fast

3. Free

Outline

R & IDE
Environments
Load data & packages
Object oriented structure
Basic commands
Operators, sets and subset
Names and characters
Plots

Basic statistics
Function and loops
ply's
Regular expressions
Probability
File manipulation
Other languages
Suggested packages

Installing R and IDE

R software

R project (https://www.r-project.org/)
Microsoft R Open (https://mran.microsoft.com/)

IDE (Integrated Development Environment)

R Studio (https://www.rstudio.com/)
R Eclipse (http://www.eclipse.org/)

Load and save data

read – read.csv – read.table – readClipboard
write – write.csv
view – ?
load – save – save.image
setwd – getwd
str – head – table – summary

data( tpod, package = "NAM" );
ls()

[1] "chr" "fam" "gen" "y"

summary( y ); getwd()

   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
0.00000 0.00000 0.03606 0.16010 0.32040 0.70910

[1] "C:/Users/u787935/Google Drive/DOW080816/Lecture"

save( gen, file = "genotypes.RData" ); Gen = load( "genotypes.RData" )

Install and load packages

There are R packages for all sorts of things

Repositories

CRAN – or MRAN
bioconductor
GitHub

Commands

install.packages
library – require

# install.packages( "NAM" )
require( NAM )

# source("https://bioconductor.org/biocLite.R")
# biocLite("impute")
require( impute )

base::abbreviate( "Soybean Cyst Nematode" )

Soybean Cyst Nematode 
               "SyCN"

Data structure

NUMERIC

matrix, vector, scalar, array

Non-numeric

character
list – data.frame
logical – NA – NULL

my_stuff = list( phenotype = y, genotype = gen );
class( my_stuff )

[1] "list"

class(gen);

[1] "matrix"

is.numeric( gen )

[1] TRUE

ls( my_stuff )

[1] "genotype"  "phenotype"

class( "Hello!" )

[1] "character"

class( my_stuff$phenotype )

[1] "numeric"

typeof( my_stuff$phenotype )

[1] "double"

is.list( data.frame(y) )

[1] TRUE

class( factor(fam) )

[1] "factor"

Basic commands

rm – c( ) – matrix – array – data.frame – length – dim – seq – ncol – nrow – sort – order – mean – sd – var – median – sum – sqrt – exp – dist – max –min – range – round – head – tail – : + - * / ^ $

pheno_stat = c( mean(y), median(y), sd(y), var(y) )  
pheno_stat

[1] 0.16011322 0.03606238 0.19346780 0.03742979

a = c(2, 0.4, pi, -5, Inf, NA)
b <- 5
sort( a + b )

[1] 0.000000 5.400000 7.000000 8.141593      Inf

rm( a, b ); length( pheno_stat )

[1] 4

dim( gen )

[1] 196 376

y[1:5]; head(y);

[1] 0.1391304 0.0000000 0.3723404 0.0000000 0.1680672

[1] 0.1391304 0.0000000 0.3723404 0.0000000 0.1680672 0.1297710

round( tail(y), digits = 2)

[1] 0.00 0.02 0.01 0.00 0.21 0.00

gen[ seq(from=1,to=5,by=2), 1:2 ]

     Gm01_3321482 Gm01_39533385
[1,]            2             0
[2,]            2             0
[3,]            1             0

Logical operators

is.na – any – all
== ! > < | &
ifelse

5 > 4

[1] TRUE

T + TRUE + 1 + FALSE + F

[1] 3

TRUE & 1>0 & TRUE

[1] TRUE

TRUE | FALSE

[1] TRUE

sum( y == 0 )

[1] 53

c( any( 0.8 < y ), all( y >= 0 ) )

[1] FALSE  TRUE

"R" %in% c( "I", "LIKE", "R", 22 )

[1] TRUE

is.numeric( c("I","LIKE","R",22) )

[1] FALSE

ifelse( (TRUE == 'TRUE') & (1 == T) , yes = 'NICE', no = 'BOO')

[1] "NICE"

Set & Subset

Set

unique – union – intersect – setdiff – setequal – is.element

a = 1:5; b = 4:7
union( a, b );

[1] 1 2 3 4 5 6 7

intersect( a, b )

[1] 4 5

setdiff( a, b ); setdiff( b, a )

[1] 1 2 3

[1] 6 7

unique( c(1,2,2,3,3,3) )

[1] 1 2 3

Subset

which – subsets – which.max – which.min

# logicals
y1 = subset( y , fam==1 )
y2 = y[ which( y > 0 ) ]
# how many of each?
length( y1 ); length( y2 )

[1] 76

[1] 143

# mean of new pop
MEANS = c( population_A = mean( y1 ), population_B = mean( y2))
print(MEANS)

population_A population_B 
   0.1295266    0.2194559

which(is.na(c(1,2,NA,"ninja",6)))

[1] 3

Names and characters

colnames – rownames – paste – cat – print

a = c('Not bad', "keep going")
cat(a); print(a)

Not bad keep going

[1] "Not bad"    "keep going"

paste(a,1:2)

[1] "Not bad 1"    "keep going 2"

paste(a,collapse = ', ')

[1] "Not bad, keep going"

head(colnames(gen))

[1] "Gm01_3321482"  "Gm01_39533385" "Gm01_42085990" "Gm01_44260009"
[5] "Gm01_44893034" "Gm01_45171495"

Plots 1

plot – par – boxplot – hist – curve – image – pie – barplot – boxplot – abline – lines – density – heat

par(mfrow = c(1,4),cex=1.5)
hist(y[y>0.05], main = 'A nice histogram',col=3)
plot( y, y^2 ,  main = "That is a scatter plot",col=1+(y>.4),pch=20)
curve(dnorm(x),-3,3,main='Curve a Gaussian',lwd=3); abline(v=1,lty=2,lwd=2,col=4)
plot(density(y[y>0.05]), "A density plot",type='h',col=rainbow(150)); lines(c(0.5,0.5,0.8,0.8),c(0,1,1,0),col=1,lwd=4);

plot of chunk unnamed-chunk-12

Plots 2

# Four plots together
par(mfrow = c(1,4),cex=1.5)
image(t(gen[1:50,1:30]),main='Haplotype Heatmap',xaxt='n',yaxt='n',ylab='ID',xlab = 'Genome')
pie(c(0.4,0.6),c('Var(G)','Var(E)'),main='Pie chart',col=c(3,5)); text(0,0,"h2 = 0.4",cex=2,col=2)
barplot(table(fam),main='Bar plot',ylab="n# of individuals",xlab="Family")
boxplot(y[y>0.05]~factor(fam[y>0.05]),col=c(7,6),main="Box plot",ylab='Phenotype',xlab="Family")

plot of chunk unnamed-chunk-13

Plots 3

# Four plots together
par(mfrow = c(1,4),cex=1.1)
image(tcrossprod(gen[1:30,]-1),main='G matrix Heatmap',col=heat.colors(20),xaxt='n',yaxt='n')
plot(colMeans(gen[,1:50])/2,xlab='SNP',ylab='pi',col=4,main='Allele frq.',type='h')
plot(hclust(dist(gen[seq(1,200,20),])),col=2,lwd=3,xaxt='n',yaxt='n',xlab='')
plot(ecdf(y),main="empirical CDF"); abline(v=0.5,lwd=4,col=rgb(.5,.75,1))

plot of chunk unnamed-chunk-14

Statistics 1 - Normality and t-test

lm – glm – anova – aov – princomp – prcomp – t.test – shapiro.test – TukeyHSD – cor – cov

shapiro.test(y) # Shapiro-Wilk test of normality


    Shapiro-Wilk normality test

data:  y
W = 0.79956, p-value = 3.931e-15

t.test(y~fam) # t-test


    Welch Two Sample t-test

data:  y by fam
t = -1.8709, df = 185.58, p-value = 0.06294
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 -0.10263941  0.00272309
sample estimates:
mean in group 1 mean in group 2 
      0.1295266       0.1794847

Statistics 2 - Fit linear model

fit = lm(y~factor(fam)+gen[,1]+gen[,361]) # fit linear model
fit


Call:
lm(formula = y ~ factor(fam) + gen[, 1] + gen[, 361])

Coefficients:
 (Intercept)  factor(fam)2      gen[, 1]    gen[, 361]  
     0.05307       0.03937       0.01284       0.07823

anova(fit)

Analysis of Variance Table

Response: y
             Df Sum Sq Mean Sq F value   Pr(>F)    
factor(fam)   1 0.1161 0.11613  3.6958  0.05603 .  
gen[, 1]      1 0.0484 0.04838  1.5397  0.21618    
gen[, 361]    1 1.1012 1.10117 35.0440 1.46e-08 ***
Residuals   192 6.0331 0.03142                     
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Statistics 3 - ANOVA diagnostics

par(mfrow = c(1,4),cex=1.1)
plot(fit)

plot of chunk unnamed-chunk-17

# LRT - compare previous model with a new one
anova(fit,glm(y~gen[,361]))$`Pr(>F)`[2]

[1] 0.2277624

Statistics 4 - Correlation

# Load another dataset
data(met,package='NAM')
COR = cor(Obs[,6:8])
COR

          YLD       DTM       ACC
YLD 1.0000000 0.4446657 0.5091640
DTM 0.4446657 1.0000000 0.4237344
ACC 0.5091640 0.4237344 1.0000000

cov(Obs[,6:8])

          YLD      DTM       ACC
YLD 213.32002 39.34281 113.59453
DTM  39.34281 36.69711  39.20971
ACC 113.59453 39.20971 233.32855

image(COR,col = topo.colors(500))

plot of chunk unnamed-chunk-19

Statistics 5 - Principal components

PCA = princomp(Obs[,6:8],subset = Obs$Block==1,cor = T)
par(mfrow = c(1,2),cex=1.1)
plot(PCA)
biplot(PCA)

plot of chunk unnamed-chunk-20

eigen(COR)$values

[1] 1.9193634 0.5913541 0.4892825

Functions & Loops

Functions

function

Normalization = function( x ){
  x_new = ( x-mean(x) ) / sd( x )
  return(x_new)
}
head(y) # pre-normalization

[1] 0.1391304 0.0000000 0.3723404 0.0000000 0.1680672 0.1297710

Y = Normalization(y)
head(Y) # pos-normalization

[1] -0.10845620 -0.82759622  1.09696400 -0.82759622  0.04111283 -0.15683347

mean(Y); sd(Y)

[1] 4.112666e-17

[1] 1

Loops

for
while

# For
x = rep(0,2)
for(i in 1:2){
  x[i] = mean(y[fam==i])
  }
x

[1] 0.1295266 0.1794847

# While
while( i >= 0 ){ print(i); i = i-1 }

[1] 2
[1] 1
[1] 0

ply's

apply – tapply – lapply – mapply – colMeans – rowMeans

# apply - run a function by a dimension
gen2 = apply(X = gen, MARGIN = 2, FUN = function(x) x-mean(x) )
gen2[1:2,1:4]

     Gm01_3321482 Gm01_39533385 Gm01_42085990 Gm01_44260009
[1,]    0.9489796    -1.0408163     1.0408163     1.0204082
[2,]   -1.0510204     0.9591837    -0.9591837    -0.9795918

# tapply - run a function by class
tapply(X = Obs$YLD, INDEX = Obs$Year, FUN = mean)

    2013     2014     2015 
74.76657 55.72440 53.14087

# colMeans / rowMeans
head( colMeans(Obs[,6:8]) )

      YLD       DTM       ACC 
 63.65921 126.79314  56.27061

Regular expressions & related functions

strsplit – substr – grep – gsub – regexpr
names – dimnames – colnames – rownames – get

# obtain names from columns (aka SNPs)
snp = colnames(Gen);
head(snp,3)

[1] "Gm01_3321482" "Gm01_4755976" "Gm01_8340015"

 # name of initial SNPs from Chr5
grep("Gm05",snp,value = T)[ 1:5 ]

[1] "Gm05_389226" "Gm05_419473" "Gm05_545441" "Gm05_605301" "Gm05_620109"

# change names Gm to Chr
gsub("Gm","Chr",snp)[ 1:5 ]

[1] "Chr01_3321482"  "Chr01_4755976"  "Chr01_8340015"  "Chr01_9714061" 
[5] "Chr01_10510009"

# count SNPs/chr (first 12 chr)
table(substr(snp,3,4))[1:12]


 01  02  03  04  05  06  07  08  09  10  11  12 
190 228 188 167 205 233 236 274 161 250 223 171

# Get an object by the name
class(get('y'))

[1] "numeric"

Probability and sampling

r d p ?distributions
beta – norm – gamma – chisq – t

rnorm(n = 5, mean = 50, sd = 10)

[1] 43.37740 53.28237 36.16922 48.09699 52.56085

dnorm(x = 40, mean = 50, sd = 10)

[1] 0.02419707

pnorm(q = 40, mean = 50, sd = 10)

[1] 0.1586553

sample(x = y,size = 5,replace = FALSE)

[1] 0.47368421 0.00000000 0.18064516 0.05128205 0.06293706

hist(x = rnorm(n = 3000,mean = 50,sd = 10),breaks = 50)

plot of chunk unnamed-chunk-27

File manipulation & other languages

dir – file.remove – file.rename – file.copy – dir.create – system

setwd("C:/Users/u787935/Desktop/")
dir.create("A_NEW_FOLDER")
dir()[1:3]

[1] "~$UBLE EXPONENTIAL MODEL.docx" "A_NEW_FOLDER"                 
[3] "Alencar.lnk"

file.remove("A_NEW_FOLDER")

[1] FALSE

You can run other software from R: system('airemlf90.exe',input='par')
Other languages (java, F90, C, C++, Python, pearl, Scala): .Call, cppFunction, inline

What else is out there?

Some useful packages

Mixed model – lme4 – nlme – rrBLUP – EMMREML – MCMCglmm – pedigremm
Hyperdimensional models – BGLR – bWGR – glmnet – VIGoR – ranger – kernlab
Association analysis – GenABEL – rrBLUP – NAM – GAPIT
Parallel computing – snow – snowfall – foreach – Rmpi – doMC – gputools
Big data – Matrix – bigmemrory – ff – bigpca – Matrix – amap – h5
Fast computation – RcppEigen – RcppArmadillo
Other languages – Rcpp – rJava – rPython – rscala – Rcpp
Data management – ggplot2 – reshape2 – dplyr – data.table
R outside the box – markdown – knitr – roxygen2 – shiny – RODBC

Some remarks

Google is the programmer's best friend
If you want to really LEARN, reinvent the wheel
Programming skills come with practice
Comment your code in a way easy to understand