Set up files

#setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2015")    # Set current working directory
setwd("C:/Users/Juan Steibel/SkyDrive/Documents/job/ans824/2015")    # Set current working directory
rm(list=ls())       # Remove all objects from work environment

library(synbreed)

## Warning: package 'synbreed' was built under R version 3.0.3

## Loading required package: doBy

## Warning: package 'doBy' was built under R version 3.0.3

## Loading required package: survival
## Loading required package: splines
## Loading required package: BLR
## Loading required package: SuppDists
## Loading required package: regress

## Warning: package 'regress' was built under R version 3.0.3

## Loading required package: abind

library(synbreedData)
library(genetics)

## Warning: package 'genetics' was built under R version 3.0.3

## Loading required package: combinat
## 
## Attaching package: 'combinat'
## 
## The following object is masked from 'package:utils':
## 
##     combn
## 
## Loading required package: gdata

## Warning: package 'gdata' was built under R version 3.0.3

## gdata: Unable to locate valid perl interpreter
## gdata: 
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata: 
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
## 
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
## 
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
## 
## Attaching package: 'gdata'
## 
## The following object is masked from 'package:stats':
## 
##     nobs
## 
## The following object is masked from 'package:utils':
## 
##     object.size
## 
## Loading required package: gtools

## Warning: package 'gtools' was built under R version 3.0.3

## Loading required package: MASS
## Loading required package: mvtnorm

## Warning: package 'mvtnorm' was built under R version 3.0.3

## 
## 
## NOTE: THIS PACKAGE IS NOW OBSOLETE.
## 
## 
## 
##   The R-Genetics project has developed an set of enhanced genetics
## 
##   packages to replace 'genetics'. Please visit the project homepage
## 
##   at http://rgenetics.org for informtion.
## 
## 
## 
## 
## Attaching package: 'genetics'
## 
## The following objects are masked from 'package:base':
## 
##     %in%, as.factor, order

library(regress)

the dataset

let’s load cattle dataset and obtain a few descriptive statistics

data(cattle)
head(cattle$pedigree)

##        ID Par1 Par2 gener
## 1 ID10001    0    0     0
## 2 ID10002    0    0     0
## 3 ID10003    0    0     0
## 4 ID10004    0    0     0
## 5 ID10005    0    0     0
## 6 ID10006    0    0     0

dim(cattle$pedigree)

## [1] 1929    4

head(cattle$covar)

##        id phenotyped genotyped
## 1 ID10001      FALSE     FALSE
## 2 ID10002      FALSE     FALSE
## 3 ID10003      FALSE     FALSE
## 4 ID10004      FALSE     FALSE
## 5 ID10005      FALSE     FALSE
## 6 ID10006      FALSE     FALSE

sum(cattle$covar$phenotyped)

## [1] 500

sum(cattle$covar$genotyped)

## [1] 500

summary(cattle$pheno[,,1])

##    Phenotype1          Phenotype2     
##  Min.   :-50.07000   Min.   :-424.43  
##  1st Qu.:-10.25750   1st Qu.: -94.40  
##  Median :  0.13500   Median :  -0.84  
##  Mean   : -0.00104   Mean   :   0.05  
##  3rd Qu.: 10.35250   3rd Qu.:  96.99  
##  Max.   : 49.41000   Max.   : 422.63

estimate kinship

Let’s compute the numerator relationship matrix\ This matrix contains: a) in the diagonal 1+Expected inbreeding coeffients and b) in the off-diagonals, the expected proportion of alleles shared identical by descent
The expected inbreeding and ibd sharing is computed conditional on pedigree information
Note Many other packages can be used to compute this matrix

Amat<-kin(cattle,ret="add")
dim(Amat)

## [1] 1929 1929

summary(diag(Amat-1)) #let's look at the distribution of inbreeding

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## 0.0000000 0.0000000 0.0000000 0.0001944 0.0000000 0.1250000

summary(Amat) #a more general descriptive analysis

##  dimension                     1929 x 1929 
##  rank                          1929 
##  range of off-diagonal values  0 -- 0.625 
##  mean off-diagonal values      0.002926408 
##  range of diagonal values      1 -- 1.125 
##  mean diagonal values          1.000194 
##  number of unique values       25

save(Amat,file="A_cattle.Rdata") #save to avoid costly re-computation
head(rownames(Amat)) #names of animals are kept in rows and columns, very useful for matching

## [1] "ID10001" "ID10002" "ID10003" "ID10004" "ID10005" "ID10006"

Extract the subset of A that corresponds to phenotyped animals and re-order it don’t assume the order matches!!! check it!

indx<-match(rownames(cattle$pheno[,,1]),rownames(Amat))
phenotypes<-cattle$pheno[,,1]
A_sub<-Amat[indx,indx]
sum(rownames(A_sub)!=rownames(phenotypes))

## [1] 0

Fit animal model

We can now fit an animal model
In this case the only fixed effect is the population mean, because data are pre-corrected

lmm<-regress(phenotypes[,1]~1,~A_sub)
summary(lmm)

## Likelihood kernel: K = (Intercept)
## 
## Maximized log likelihood with kernel K is  -1613.348 
## 
## Linear Coefficients:
##              Estimate Std. Error
##  (Intercept)    0.526      1.009
## 
## Variance Coefficients:
##              Estimate Std. Error
##        A_sub   99.565     40.589
##        In     142.788     37.391

names(lmm)

##  [1] "trace"              "llik"               "cycle"             
##  [4] "rdf"                "beta"               "beta.cov"          
##  [7] "beta.se"            "sigma"              "sigma.cov"         
## [10] "W"                  "Q"                  "fitted"            
## [13] "predicted"          "predictedVariance"  "predictedVariance2"
## [16] "pos"                "Vnames"             "formula"           
## [19] "Vformula"           "Kcolnames"          "model"             
## [22] "Z"                  "X"                  "Sigma"

lmm$sigma[1]/(sum(lmm$sigma)) #estimate heritability

##     A_sub 
## 0.4108264

Predict breeding values (EBV)

To select the best bulls we could estimate breeding values
And then sort them

EBV<-BLUP(lmm)
head(sort(EBV$Mean,decreasing=T),10)

## A_sub.ID11821 A_sub.ID11451 A_sub.ID11738 A_sub.ID11677 A_sub.ID11530 
##      21.12254      16.54415      15.69283      14.72569      14.20865 
## A_sub.ID11496 A_sub.ID11563 A_sub.ID11645 A_sub.ID11814 A_sub.ID11620 
##      13.60020      13.35008      13.34789      13.25690      12.97709

But notice this prediction only involves the animals with records. It is more interesting to include all pedigreed animals. To get an EBV (estimated breeding value) for everyone in the pedigree!
It can be done with regress, but it is a bit more involved. A much better package to deal with that is pedigreemm
notice: there is a class conflict between pedigreemm and synbreeed, no big deal if you are aware of it
Due to time constraints, we will skip this task, but if you are interested, come see me and we could implement it togetehr (outside class hours)

###Breeding values of individuals without records An alternative solution is to derive the other animals’ EBV using multivariate normal distribution theory and to condition the expected breeding value of animals without records on EBV of animals with records.
\[ \begin{bmatrix} EBV_{no.record}\\ EBV_{record} \end{bmatrix} \sim N \left ( \begin{bmatrix} 0\\ 0 \end{bmatrix} , \sigma _{a}^{2} \begin{bmatrix} A_{no.record,no.record} & A_{no.record,record} \\ A_{record,no.record} & A_{record,record} \end{bmatrix} \right ) \]

Consequently:

\[ E(EBV_{no.record} | \widehat{EBV}_{record}) = A_{no.record,record} A_{record,record}^{-1} \widehat{EBV}_{record} \]

EBV_norec<- Amat[-indx,indx]%*%solve(A_sub)%*%EBV$Mean
head(sort(EBV_norec[,1],decreasing=T),10)

##   ID10764   ID10880   ID10897   ID11399   ID10906   ID11416   ID11299 
## 13.208223 11.240781 10.560951  9.678954  9.299042  8.885645  8.841818 
##   ID10770   ID10772   ID11110 
##  8.754462  8.749780  8.381084

We now could bind together both predictions and get an overall ranking.

EBV_all<-c(EBV$Mean,EBV_norec[,1])
head(sort(EBV_all,decreasing=T),10)

## A_sub.ID11821 A_sub.ID11451 A_sub.ID11738 A_sub.ID11677 A_sub.ID11530 
##      21.12254      16.54415      15.69283      14.72569      14.20865 
## A_sub.ID11496 A_sub.ID11563 A_sub.ID11645 A_sub.ID11814       ID10764 
##      13.60020      13.35008      13.34789      13.25690      13.20822

Re-name EBV to match animal ID

Notice a problem with the labeling… it would be easy to solve using string functions
I will show this because it is a good way of practicing writing functions and using sapply
we know:

nms<-names(EBV$Mean)
head(nms) #we need to extract second word after '.'

## [1] "A_sub.ID11430" "A_sub.ID11431" "A_sub.ID11432" "A_sub.ID11433"
## [5] "A_sub.ID11434" "A_sub.ID11435"

strsplit(nms[1],"\\.")

## [[1]]
## [1] "A_sub"   "ID11430"

strsplit(nms[1],"\\.")[[1]][2] #this is it!

## [1] "ID11430"

new_name<-sapply(nms,function(x) strsplit(nms[1],"\\.")[[1]][2])
head(new_name)

## A_sub.ID11430 A_sub.ID11431 A_sub.ID11432 A_sub.ID11433 A_sub.ID11434 
##     "ID11430"     "ID11430"     "ID11430"     "ID11430"     "ID11430" 
## A_sub.ID11435 
##     "ID11430"

names(EBV$Mean)<-new_name #let's rename it
EBV_all<-c(EBV$Mean,EBV_norec[,1])
head(sort(EBV_all,decreasing=T),10) #what we wanted!!!