Introduction to R
Alencar Xavier (alenxav@gmail.com)
July 21, 2017
A brief introduction to R for plant breeding and quantitative genetics students
AGRY611, Spring 2017
AGRY620, Fall 2017
R project (https://www.r-project.org/)
Microsoft R Open (https://mran.microsoft.com/)
R Studio (https://www.rstudio.com/)
R Eclipse (http://www.eclipse.org/)
data( tpod, package = "NAM" );
ls()
[1] "chr" "fam" "gen" "y"
summary( y ); getwd()
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00000 0.00000 0.03606 0.16010 0.32040 0.70910
[1] "C:/Users/u787935/Google Drive/DOW080816/Lecture"
save( gen, file = "genotypes.RData" ); Gen = load( "genotypes.RData" )
# install.packages( "NAM" )
require( NAM )
# source("https://bioconductor.org/biocLite.R")
# biocLite("impute")
require( impute )
base::abbreviate( "Soybean Cyst Nematode" )
Soybean Cyst Nematode
"SyCN"
my_stuff = list( phenotype = y, genotype = gen );
class( my_stuff )
[1] "list"
class(gen);
[1] "matrix"
is.numeric( gen )
[1] TRUE
ls( my_stuff )
[1] "genotype" "phenotype"
class( "Hello!" )
[1] "character"
class( my_stuff$phenotype )
[1] "numeric"
typeof( my_stuff$phenotype )
[1] "double"
is.list( data.frame(y) )
[1] TRUE
class( factor(fam) )
[1] "factor"
pheno_stat = c( mean(y), median(y), sd(y), var(y) )
pheno_stat
[1] 0.16011322 0.03606238 0.19346780 0.03742979
a = c(2, 0.4, pi, -5, Inf, NA)
b <- 5
sort( a + b )
[1] 0.000000 5.400000 7.000000 8.141593 Inf
rm( a, b ); length( pheno_stat )
[1] 4
dim( gen )
[1] 196 376
y[1:5]; head(y);
[1] 0.1391304 0.0000000 0.3723404 0.0000000 0.1680672
[1] 0.1391304 0.0000000 0.3723404 0.0000000 0.1680672 0.1297710
round( tail(y), digits = 2)
[1] 0.00 0.02 0.01 0.00 0.21 0.00
gen[ seq(from=1,to=5,by=2), 1:2 ]
Gm01_3321482 Gm01_39533385
[1,] 2 0
[2,] 2 0
[3,] 1 0
5 > 4
[1] TRUE
T + TRUE + 1 + FALSE + F
[1] 3
TRUE & 1>0 & TRUE
[1] TRUE
TRUE | FALSE
[1] TRUE
sum( y == 0 )
[1] 53
c( any( 0.8 < y ), all( y >= 0 ) )
[1] FALSE TRUE
"R" %in% c( "I", "LIKE", "R", 22 )
[1] TRUE
is.numeric( c("I","LIKE","R",22) )
[1] FALSE
ifelse( (TRUE == 'TRUE') & (1 == T) , yes = 'NICE', no = 'BOO')
[1] "NICE"
a = 1:5; b = 4:7
union( a, b );
[1] 1 2 3 4 5 6 7
intersect( a, b )
[1] 4 5
setdiff( a, b ); setdiff( b, a )
[1] 1 2 3
[1] 6 7
unique( c(1,2,2,3,3,3) )
[1] 1 2 3
# logicals
y1 = subset( y , fam==1 )
y2 = y[ which( y > 0 ) ]
# how many of each?
length( y1 ); length( y2 )
[1] 76
[1] 143
# mean of new pop
MEANS = c( population_A = mean( y1 ), population_B = mean( y2))
print(MEANS)
population_A population_B
0.1295266 0.2194559
which(is.na(c(1,2,NA,"ninja",6)))
[1] 3
a = c('Not bad', "keep going")
cat(a); print(a)
Not bad keep going
[1] "Not bad" "keep going"
paste(a,1:2)
[1] "Not bad 1" "keep going 2"
paste(a,collapse = ', ')
[1] "Not bad, keep going"
head(colnames(gen))
[1] "Gm01_3321482" "Gm01_39533385" "Gm01_42085990" "Gm01_44260009"
[5] "Gm01_44893034" "Gm01_45171495"
par(mfrow = c(1,4),cex=1.5)
hist(y[y>0.05], main = 'A nice histogram',col=3)
plot( y, y^2 , main = "That is a scatter plot",col=1+(y>.4),pch=20)
curve(dnorm(x),-3,3,main='Curve a Gaussian',lwd=3); abline(v=1,lty=2,lwd=2,col=4)
plot(density(y[y>0.05]), "A density plot",type='h',col=rainbow(150)); lines(c(0.5,0.5,0.8,0.8),c(0,1,1,0),col=1,lwd=4);
# Four plots together
par(mfrow = c(1,4),cex=1.5)
image(t(gen[1:50,1:30]),main='Haplotype Heatmap',xaxt='n',yaxt='n',ylab='ID',xlab = 'Genome')
pie(c(0.4,0.6),c('Var(G)','Var(E)'),main='Pie chart',col=c(3,5)); text(0,0,"h2 = 0.4",cex=2,col=2)
barplot(table(fam),main='Bar plot',ylab="n# of individuals",xlab="Family")
boxplot(y[y>0.05]~factor(fam[y>0.05]),col=c(7,6),main="Box plot",ylab='Phenotype',xlab="Family")
# Four plots together
par(mfrow = c(1,4),cex=1.1)
image(tcrossprod(gen[1:30,]-1),main='G matrix Heatmap',col=heat.colors(20),xaxt='n',yaxt='n')
plot(colMeans(gen[,1:50])/2,xlab='SNP',ylab='pi',col=4,main='Allele frq.',type='h')
plot(hclust(dist(gen[seq(1,200,20),])),col=2,lwd=3,xaxt='n',yaxt='n',xlab='')
plot(ecdf(y),main="empirical CDF"); abline(v=0.5,lwd=4,col=rgb(.5,.75,1))
shapiro.test(y) # Shapiro-Wilk test of normality
Shapiro-Wilk normality test
data: y
W = 0.79956, p-value = 3.931e-15
t.test(y~fam) # t-test
Welch Two Sample t-test
data: y by fam
t = -1.8709, df = 185.58, p-value = 0.06294
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-0.10263941 0.00272309
sample estimates:
mean in group 1 mean in group 2
0.1295266 0.1794847
fit = lm(y~factor(fam)+gen[,1]+gen[,361]) # fit linear model
fit
Call:
lm(formula = y ~ factor(fam) + gen[, 1] + gen[, 361])
Coefficients:
(Intercept) factor(fam)2 gen[, 1] gen[, 361]
0.05307 0.03937 0.01284 0.07823
anova(fit)
Analysis of Variance Table
Response: y
Df Sum Sq Mean Sq F value Pr(>F)
factor(fam) 1 0.1161 0.11613 3.6958 0.05603 .
gen[, 1] 1 0.0484 0.04838 1.5397 0.21618
gen[, 361] 1 1.1012 1.10117 35.0440 1.46e-08 ***
Residuals 192 6.0331 0.03142
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow = c(1,4),cex=1.1)
plot(fit)
# LRT - compare previous model with a new one
anova(fit,glm(y~gen[,361]))$`Pr(>F)`[2]
[1] 0.2277624
# Load another dataset
data(met,package='NAM')
COR = cor(Obs[,6:8])
COR
YLD DTM ACC
YLD 1.0000000 0.4446657 0.5091640
DTM 0.4446657 1.0000000 0.4237344
ACC 0.5091640 0.4237344 1.0000000
cov(Obs[,6:8])
YLD DTM ACC
YLD 213.32002 39.34281 113.59453
DTM 39.34281 36.69711 39.20971
ACC 113.59453 39.20971 233.32855
image(COR,col = topo.colors(500))
PCA = princomp(Obs[,6:8],subset = Obs$Block==1,cor = T)
par(mfrow = c(1,2),cex=1.1)
plot(PCA)
biplot(PCA)
eigen(COR)$values
[1] 1.9193634 0.5913541 0.4892825
Normalization = function( x ){
x_new = ( x-mean(x) ) / sd( x )
return(x_new)
}
head(y) # pre-normalization
[1] 0.1391304 0.0000000 0.3723404 0.0000000 0.1680672 0.1297710
Y = Normalization(y)
head(Y) # pos-normalization
[1] -0.10845620 -0.82759622 1.09696400 -0.82759622 0.04111283 -0.15683347
mean(Y); sd(Y)
[1] 4.112666e-17
[1] 1
# For
x = rep(0,2)
for(i in 1:2){
x[i] = mean(y[fam==i])
}
x
[1] 0.1295266 0.1794847
# While
while( i >= 0 ){ print(i); i = i-1 }
[1] 2
[1] 1
[1] 0
# apply - run a function by a dimension
gen2 = apply(X = gen, MARGIN = 2, FUN = function(x) x-mean(x) )
gen2[1:2,1:4]
Gm01_3321482 Gm01_39533385 Gm01_42085990 Gm01_44260009
[1,] 0.9489796 -1.0408163 1.0408163 1.0204082
[2,] -1.0510204 0.9591837 -0.9591837 -0.9795918
# tapply - run a function by class
tapply(X = Obs$YLD, INDEX = Obs$Year, FUN = mean)
2013 2014 2015
74.76657 55.72440 53.14087
# colMeans / rowMeans
head( colMeans(Obs[,6:8]) )
YLD DTM ACC
63.65921 126.79314 56.27061
# obtain names from columns (aka SNPs)
snp = colnames(Gen);
head(snp,3)
[1] "Gm01_3321482" "Gm01_4755976" "Gm01_8340015"
# name of initial SNPs from Chr5
grep("Gm05",snp,value = T)[ 1:5 ]
[1] "Gm05_389226" "Gm05_419473" "Gm05_545441" "Gm05_605301" "Gm05_620109"
# change names Gm to Chr
gsub("Gm","Chr",snp)[ 1:5 ]
[1] "Chr01_3321482" "Chr01_4755976" "Chr01_8340015" "Chr01_9714061"
[5] "Chr01_10510009"
# count SNPs/chr (first 12 chr)
table(substr(snp,3,4))[1:12]
01 02 03 04 05 06 07 08 09 10 11 12
190 228 188 167 205 233 236 274 161 250 223 171
# Get an object by the name
class(get('y'))
[1] "numeric"
rnorm(n = 5, mean = 50, sd = 10)
[1] 43.37740 53.28237 36.16922 48.09699 52.56085
dnorm(x = 40, mean = 50, sd = 10)
[1] 0.02419707
pnorm(q = 40, mean = 50, sd = 10)
[1] 0.1586553
sample(x = y,size = 5,replace = FALSE)
[1] 0.47368421 0.00000000 0.18064516 0.05128205 0.06293706
hist(x = rnorm(n = 3000,mean = 50,sd = 10),breaks = 50)
setwd("C:/Users/u787935/Desktop/")
dir.create("A_NEW_FOLDER")
dir()[1:3]
[1] "~$UBLE EXPONENTIAL MODEL.docx" "A_NEW_FOLDER"
[3] "Alencar.lnk"
file.remove("A_NEW_FOLDER")
[1] FALSE
You can run other software from R: system('airemlf90.exe',input='par')
Other languages (java, F90, C, C++, Python, pearl, Scala): .Call, cppFunction, inline