Set up files

setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2015")    # Set current working directory
#setwd("C:/Users/Juan Steibel/SkyDrive/Documents/job/ans824/2015")    # Set current working directory
rm(list=ls())        # Remove all objects from work environment

library(synbreed)

## Loading required package: doBy
## Loading required package: survival
## Loading required package: splines
## Loading required package: BLR
## Loading required package: SuppDists
## Loading required package: regress
## Loading required package: abind

library(synbreedData)
library(regress)

the dataset

let’s load mice dataset and impute

data(mice)
is.heter<-function(x){
  substr(x,1,1)!=substr(x,3,3)
}
miceCimp<-codeGeno(mice, label.heter = is.heter,impute=T,impute.type="random",
                   maf=0.05,nmiss=0.01,verbose=T)

##    step 1  : 400 marker(s) removed with > 1 % missing values 
##    step 2  : Recoding alleles 
##    step 4  : 2148 marker(s) removed with maf < 0.05 
##    step 7  : Imputing of missing values 
##    step 7d : Random imputing of missing values 
##    step 8  : Recode alleles due to imputation 
##    step 9  : 0 marker(s) removed with maf < 0.05 
##    step 10 : No duplicated markers removed 
##    End     : 9997 marker(s) remain after the check
## 
##      Summary of imputation 
##     total number of missing values                : 20141 
##     number of random imputations                  : 20141

summary(miceCimp)

## object of class 'gpData' 
## covar 
##   No. of individuals 2527 
##           phenotyped 2527 
##            genotyped 1940 
## pheno 
##   No. of traits:        2 
## 
##      weight      growth.slope     
##  Min.   :11.9   Min.   :-0.08889  
##  1st Qu.:17.8   1st Qu.: 0.04556  
##  Median :19.9   Median : 0.08024  
##  Mean   :20.3   Mean   : 0.08659  
##  3rd Qu.:22.6   3rd Qu.: 0.12569  
##  Max.   :30.2   Max.   : 0.26408  
##  NA's   :16     NA's   :53        
## 
## geno 
##   No. of markers 9997 
##   genotypes 1 0 2 
##   frequencies 0.367 0.532 0.101 
##   NA's 0.000 %
## map 
##   No. of mapped markers  9997 
##   No. of chromosomes     20 
## 
##   markers per chromosome 
##  
##   1  10  11  12  13  14  15  16  17  18  19   2   3   4   5   6   7   8 
## 875 332 646 490 410 435 427 437 375 346 248 801 757 716 553 628 509 479 
##   9   X 
## 527   6 
## 
## pedigree 
## NULL

genomic relationship

Let’s compute genomic relationship matrix, and check dimension of phenotypes and genotypes

Gmat<-kin(miceCimp,ret="realized")
plot(Gmat)

summary(Gmat)

##  dimension                     1940 x 1940 
##  rank                          1939 
##  range of off-diagonal values  -0.2796866 -- 1.219104 
##  mean off-diagonal values      -0.0005252561 
##  range of diagonal values      0.812 -- 1.325 
##  mean diagonal values          1.018472 
##  number of unique values       1873911

dim(Gmat)

## [1] 1940 1940

dim(miceCimp$pheno)

## [1] 2527    2    1

phenotypes<-miceCimp$pheno[,1,1]
summary(phenotypes)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    11.9    17.8    19.9    20.3    22.6    30.2      16

length(phenotypes)

## [1] 2527

phenotypes<-phenotypes[!is.na(phenotypes)]
sum(rownames(Gmat)%in%names(phenotypes))

## [1] 1928

prepare and match all model components

Use positional indexes

idg<-rownames(Gmat)%in%names(phenotypes)
Gpheno<-Gmat[idg,idg]
dim(Gpheno)

## [1] 1928 1928

idp<-match(rownames(Gpheno),names(phenotypes))
phenoG<-phenotypes[idp]
sum(names(phenoG)!=rownames(Gpheno))

## [1] 0

idcv<-match(rownames(Gpheno),miceCimp$covar$id)
cvg<-miceCimp$covar[idcv,]
sum(cvg$id!=rownames(Gpheno))

## [1] 0

prepare and match all model components

Use positional indexes

idg<-rownames(Gmat)%in%names(phenotypes)
Gpheno<-Gmat[idg,idg]
dim(Gpheno)

## [1] 1928 1928

idp<-match(rownames(Gpheno),names(phenotypes))
phenoG<-phenotypes[idp]
sum(names(phenoG)!=rownames(Gpheno))

## [1] 0

idcv<-match(rownames(Gpheno),miceCimp$covar$id)
cvg<-miceCimp$covar[idcv,]
sum(cvg$id!=rownames(Gpheno))

## [1] 0

Model fitting

lmmg<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex,~Gpheno)
lmmg

## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity)3+as.factor(cvg$CageDensity)4+as.factor(cvg$CageDensity)5+as.factor(cvg$CageDensity)6+as.factor(cvg$CageDensity)7+cvg$sexM
## 
## Maximized log likelihood with kernel K is  -2559.913 
## 
## Linear Coefficients:
##                              Estimate Std. Error
##  (Intercept)                   19.326      0.492
##  as.factor(cvg$CageDensity)3   -0.958      0.525
##  as.factor(cvg$CageDensity)4   -0.946      0.497
##  as.factor(cvg$CageDensity)5   -1.193      0.501
##  as.factor(cvg$CageDensity)6   -1.389      0.500
##  as.factor(cvg$CageDensity)7   -2.353      0.584
##  cvg$sexM                       4.208      0.103
## 
## Variance Coefficients:
##                              Estimate Std. Error
##                       Gpheno    3.182      0.351
##                       In        3.900      0.158

lmmg$sigma[1]/(sum(lmmg$sigma)) #estimate heritability

##    Gpheno 
## 0.4493161

EBVg<-BLUP(lmmg)

Evaluate prediction accuracy

Compare GEBV to phenotypes and to corrected phenotypes

plot(EBVg$Mean,phenoG)
abline(0,1)

cor(phenoG,EBVg$Mean)

## [1] 0.5988551

corrected<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex)
Ycor<-phenoG-corrected$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)

cor(Ycor,EBVg$Mean)

##           [,1]
## [1,] 0.7454928

Prediction accuracy with new phenotypes

Let’s set aside some observations to prediction and refit model.

#random vector of missing values
Nobs<-length(phenoG)
Nmis<-round(length(phenoG)*0.1) #10% missing
tomissing<-sample(1:Nobs,Nmis)

phenoGna<-phenoG[-tomissing]
lmmgna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing],
                        ~Gpheno[-tomissing,-tomissing])
lmmgna

## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity[-tomissing])3+as.factor(cvg$CageDensity[-tomissing])4+as.factor(cvg$CageDensity[-tomissing])5+as.factor(cvg$CageDensity[-tomissing])6+as.factor(cvg$CageDensity[-tomissing])7+cvg$sex[-tomissing]M
## 
## Maximized log likelihood with kernel K is  -2323.978 
## 
## Linear Coefficients:
##                                          Estimate Std. Error
##  (Intercept)                               19.256      0.509
##  as.factor(cvg$CageDensity[-tomissing])3   -0.876      0.544
##  as.factor(cvg$CageDensity[-tomissing])4   -0.840      0.514
##  as.factor(cvg$CageDensity[-tomissing])5   -1.165      0.519
##  as.factor(cvg$CageDensity[-tomissing])6   -1.278      0.519
##  as.factor(cvg$CageDensity[-tomissing])7   -2.237      0.604
##  cvg$sex[-tomissing]M                       4.164      0.109
## 
## Variance Coefficients:
##                                          Estimate Std. Error
##           Gpheno[-tomissing, -tomissing]    3.223      0.371
##           In                                3.959      0.171

lmmgna$sigma[1]/(sum(lmmgna$sigma)) #estimate heritability

## Gpheno[-tomissing, -tomissing] 
##                      0.4487875

EBVgna<-BLUP(lmmgna)
plot(EBVgna$Mean,phenoGna)
abline(0,1)

cor(phenoGna,EBVgna$Mean)

## [1] 0.6108896

correctedna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing])
Ycorna<-phenoGna-correctedna$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)

cor(Ycorna,EBVgna$Mean)

##           [,1]
## [1,] 0.7493676

EBV_norec<- Gpheno[tomissing,-tomissing]%*%solve(Gpheno[-tomissing,-tomissing])%*%EBVgna$Mean
head(sort(EBV_norec[,1],decreasing=T),10)

## A063867785 A084126609 A064096308 A084289027 A067106356 A053073028 
##   3.547340   3.411871   2.425728   2.329499   2.317289   2.241341 
## A063528111 A064094852 A084286885 A067043333 
##   2.211431   2.066039   2.046795   1.912738

plot(EBV_norec,phenoG[tomissing])

cor(EBV_norec,phenoG[tomissing])

##           [,1]
## [1,] 0.3761723

plot(EBV_norec,Ycor[tomissing])
abline(0,1)

cor(EBV_norec,Ycor[tomissing])

##           [,1]
## [1,] 0.5311374

Fitting an genomic animal model

Set up files

the dataset

genomic relationship

prepare and match all model components

prepare and match all model components

Model fitting

Evaluate prediction accuracy

Prediction accuracy with new phenotypes