Set up files

setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2015")    # Set current working directory
#setwd("C:/Users/Juan Steibel/SkyDrive/Documents/job/ans824/2015")    # Set current working directory
rm(list=ls())        # Remove all objects from work environment

library(synbreed)
## Loading required package: doBy
## Loading required package: survival
## Loading required package: splines
## Loading required package: BLR
## Loading required package: SuppDists
## Loading required package: regress
## Loading required package: abind
library(synbreedData)
library(regress)

the dataset

let’s load mice dataset and impute

data(mice)
is.heter<-function(x){
  substr(x,1,1)!=substr(x,3,3)
}
miceCimp<-codeGeno(mice, label.heter = is.heter,impute=T,impute.type="random",
                   maf=0.05,nmiss=0.01,verbose=T)
##    step 1  : 400 marker(s) removed with > 1 % missing values 
##    step 2  : Recoding alleles 
##    step 4  : 2148 marker(s) removed with maf < 0.05 
##    step 7  : Imputing of missing values 
##    step 7d : Random imputing of missing values 
##    step 8  : Recode alleles due to imputation 
##    step 9  : 0 marker(s) removed with maf < 0.05 
##    step 10 : No duplicated markers removed 
##    End     : 9997 marker(s) remain after the check
## 
##      Summary of imputation 
##     total number of missing values                : 20141 
##     number of random imputations                  : 20141
summary(miceCimp)
## object of class 'gpData' 
## covar 
##   No. of individuals 2527 
##           phenotyped 2527 
##            genotyped 1940 
## pheno 
##   No. of traits:        2 
## 
##      weight      growth.slope     
##  Min.   :11.9   Min.   :-0.08889  
##  1st Qu.:17.8   1st Qu.: 0.04556  
##  Median :19.9   Median : 0.08024  
##  Mean   :20.3   Mean   : 0.08659  
##  3rd Qu.:22.6   3rd Qu.: 0.12569  
##  Max.   :30.2   Max.   : 0.26408  
##  NA's   :16     NA's   :53        
## 
## geno 
##   No. of markers 9997 
##   genotypes 1 0 2 
##   frequencies 0.367 0.532 0.101 
##   NA's 0.000 %
## map 
##   No. of mapped markers  9997 
##   No. of chromosomes     20 
## 
##   markers per chromosome 
##  
##   1  10  11  12  13  14  15  16  17  18  19   2   3   4   5   6   7   8 
## 875 332 646 490 410 435 427 437 375 346 248 801 757 716 553 628 509 479 
##   9   X 
## 527   6 
## 
## pedigree 
## NULL

genomic relationship

Let’s compute genomic relationship matrix, and check dimension of phenotypes and genotypes

Gmat<-kin(miceCimp,ret="realized")
plot(Gmat)

summary(Gmat)
##  dimension                     1940 x 1940 
##  rank                          1939 
##  range of off-diagonal values  -0.2796866 -- 1.219104 
##  mean off-diagonal values      -0.0005252561 
##  range of diagonal values      0.812 -- 1.325 
##  mean diagonal values          1.018472 
##  number of unique values       1873911
dim(Gmat)
## [1] 1940 1940
dim(miceCimp$pheno)
## [1] 2527    2    1
phenotypes<-miceCimp$pheno[,1,1]
summary(phenotypes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    11.9    17.8    19.9    20.3    22.6    30.2      16
length(phenotypes)
## [1] 2527
phenotypes<-phenotypes[!is.na(phenotypes)]
sum(rownames(Gmat)%in%names(phenotypes))
## [1] 1928

prepare and match all model components

Use positional indexes

idg<-rownames(Gmat)%in%names(phenotypes)
Gpheno<-Gmat[idg,idg]
dim(Gpheno)
## [1] 1928 1928
idp<-match(rownames(Gpheno),names(phenotypes))
phenoG<-phenotypes[idp]
sum(names(phenoG)!=rownames(Gpheno))
## [1] 0
idcv<-match(rownames(Gpheno),miceCimp$covar$id)
cvg<-miceCimp$covar[idcv,]
sum(cvg$id!=rownames(Gpheno))
## [1] 0

prepare and match all model components

Use positional indexes

idg<-rownames(Gmat)%in%names(phenotypes)
Gpheno<-Gmat[idg,idg]
dim(Gpheno)
## [1] 1928 1928
idp<-match(rownames(Gpheno),names(phenotypes))
phenoG<-phenotypes[idp]
sum(names(phenoG)!=rownames(Gpheno))
## [1] 0
idcv<-match(rownames(Gpheno),miceCimp$covar$id)
cvg<-miceCimp$covar[idcv,]
sum(cvg$id!=rownames(Gpheno))
## [1] 0

Model fitting

lmmg<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex,~Gpheno)
lmmg
## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity)3+as.factor(cvg$CageDensity)4+as.factor(cvg$CageDensity)5+as.factor(cvg$CageDensity)6+as.factor(cvg$CageDensity)7+cvg$sexM
## 
## Maximized log likelihood with kernel K is  -2559.913 
## 
## Linear Coefficients:
##                              Estimate Std. Error
##  (Intercept)                   19.326      0.492
##  as.factor(cvg$CageDensity)3   -0.958      0.525
##  as.factor(cvg$CageDensity)4   -0.946      0.497
##  as.factor(cvg$CageDensity)5   -1.193      0.501
##  as.factor(cvg$CageDensity)6   -1.389      0.500
##  as.factor(cvg$CageDensity)7   -2.353      0.584
##  cvg$sexM                       4.208      0.103
## 
## Variance Coefficients:
##                              Estimate Std. Error
##                       Gpheno    3.182      0.351
##                       In        3.900      0.158
lmmg$sigma[1]/(sum(lmmg$sigma)) #estimate heritability
##    Gpheno 
## 0.4493161
EBVg<-BLUP(lmmg)

Evaluate prediction accuracy

Compare GEBV to phenotypes and to corrected phenotypes

plot(EBVg$Mean,phenoG)
abline(0,1)

cor(phenoG,EBVg$Mean)
## [1] 0.5988551
corrected<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex)
Ycor<-phenoG-corrected$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)

cor(Ycor,EBVg$Mean)
##           [,1]
## [1,] 0.7454928

Prediction accuracy with new phenotypes

Let’s set aside some observations to prediction and refit model.

#random vector of missing values
Nobs<-length(phenoG)
Nmis<-round(length(phenoG)*0.1) #10% missing
tomissing<-sample(1:Nobs,Nmis)

phenoGna<-phenoG[-tomissing]
lmmgna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing],
                        ~Gpheno[-tomissing,-tomissing])
lmmgna
## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity[-tomissing])3+as.factor(cvg$CageDensity[-tomissing])4+as.factor(cvg$CageDensity[-tomissing])5+as.factor(cvg$CageDensity[-tomissing])6+as.factor(cvg$CageDensity[-tomissing])7+cvg$sex[-tomissing]M
## 
## Maximized log likelihood with kernel K is  -2323.978 
## 
## Linear Coefficients:
##                                          Estimate Std. Error
##  (Intercept)                               19.256      0.509
##  as.factor(cvg$CageDensity[-tomissing])3   -0.876      0.544
##  as.factor(cvg$CageDensity[-tomissing])4   -0.840      0.514
##  as.factor(cvg$CageDensity[-tomissing])5   -1.165      0.519
##  as.factor(cvg$CageDensity[-tomissing])6   -1.278      0.519
##  as.factor(cvg$CageDensity[-tomissing])7   -2.237      0.604
##  cvg$sex[-tomissing]M                       4.164      0.109
## 
## Variance Coefficients:
##                                          Estimate Std. Error
##           Gpheno[-tomissing, -tomissing]    3.223      0.371
##           In                                3.959      0.171
lmmgna$sigma[1]/(sum(lmmgna$sigma)) #estimate heritability
## Gpheno[-tomissing, -tomissing] 
##                      0.4487875
EBVgna<-BLUP(lmmgna)
plot(EBVgna$Mean,phenoGna)
abline(0,1)

cor(phenoGna,EBVgna$Mean)
## [1] 0.6108896
correctedna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing])
Ycorna<-phenoGna-correctedna$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)

cor(Ycorna,EBVgna$Mean)
##           [,1]
## [1,] 0.7493676
EBV_norec<- Gpheno[tomissing,-tomissing]%*%solve(Gpheno[-tomissing,-tomissing])%*%EBVgna$Mean
head(sort(EBV_norec[,1],decreasing=T),10)
## A063867785 A084126609 A064096308 A084289027 A067106356 A053073028 
##   3.547340   3.411871   2.425728   2.329499   2.317289   2.241341 
## A063528111 A064094852 A084286885 A067043333 
##   2.211431   2.066039   2.046795   1.912738
plot(EBV_norec,phenoG[tomissing])

cor(EBV_norec,phenoG[tomissing])
##           [,1]
## [1,] 0.3761723
plot(EBV_norec,Ycor[tomissing])
abline(0,1)

cor(EBV_norec,Ycor[tomissing])
##           [,1]
## [1,] 0.5311374