Set up files

setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2017")    # Set current working directory
rm(list=ls())        # Remove all objects from work environment

library(synbreed)
library(synbreedData)
library(regress)
library(gwaR)

the dataset

let’s load mice dataset and impute

data(mice)
is.heter<-function(x){
  substr(x,1,1)!=substr(x,3,3)
}
miceCimp<-codeGeno(mice, label.heter = is.heter,impute=T,impute.type="random",
                   maf=0.05,nmiss=0.01,verbose=T)

##    step 1  : 400 marker(s) removed with > 1 % missing values 
##    step 2  : Recoding alleles 
##    step 4  : 2148 marker(s) removed with maf < 0.05 
##    step 7  : Imputing of missing values 
##    step 7d : Random imputing of missing values 
##    step 8  : Recode alleles due to imputation 
##    step 9  : 0 marker(s) removed with maf < 0.05 
##    step 10 : No duplicated markers removed 
##    End     : 9997 marker(s) remain after the check
## 
##      Summary of imputation 
##     total number of missing values                : 20141 
##     number of random imputations                  : 20141

summary(miceCimp)

## object of class 'gpData' 
## covar 
##   No. of individuals 2527 
##           phenotyped 2527 
##            genotyped 1940 
## pheno 
##   No. of traits:        2 
## 
##      weight      growth.slope     
##  Min.   :11.9   Min.   :-0.08889  
##  1st Qu.:17.8   1st Qu.: 0.04556  
##  Median :19.9   Median : 0.08024  
##  Mean   :20.3   Mean   : 0.08659  
##  3rd Qu.:22.6   3rd Qu.: 0.12569  
##  Max.   :30.2   Max.   : 0.26408  
##  NA's   :16     NA's   :53        
## 
## geno 
##   No. of markers 9997 
##   genotypes 1 0 2 
##   frequencies 0.367 0.532 0.101 
##   NA's 0.000 %
## map 
##   No. of mapped markers  9997 
##   No. of chromosomes     20 
## 
##   markers per chromosome 
##  
##   1  10  11  12  13  14  15  16  17  18  19   2   3   4   5   6   7   8 
## 875 332 646 490 410 435 427 437 375 346 248 801 757 716 553 628 509 479 
##   9   X 
## 527   6 
## 
## pedigree 
## NULL

genomic relationship

Let’s compute genomic relationship matrix, and check dimension of phenotypes and genotypes

Gmat<-kin(miceCimp,ret="realized")
plot(Gmat)

summary(Gmat)

##  dimension                     1940 x 1940 
##  rank                          1939 
##  range of off-diagonal values  -0.2797885 -- 1.218987 
##  mean off-diagonal values      -0.0005252645 
##  range of diagonal values      0.8121 -- 1.325 
##  mean diagonal values          1.018488 
##  number of unique values       1871722

dim(Gmat)

## [1] 1940 1940

Model fitting

miceCimp$covar$CageDensity<-as.factor(miceCimp$covar$CageDensity)
miceCimp$covar$birthYear<-as.factor(miceCimp$covar$birthYear)
miceCimp$covar$birthMonth<-as.factor(miceCimp$covar$birthMonth)

gb<-gblup(rsp="weight",data=miceCimp,design=c(y~CageDensity+sex+birthYear),G=Gmat)
print(gb)

## gblup analysis of trait: weight 
## 
## fixed effects equation:
## y ~ CageDensity + sex + birthYear
## 
## random effects equation:
## ~G + In
## 
## log-likelihood: -2496.608 converged in: 7 iterations 
## 
## estimated fixed effects:
##                 Estimate  StdError    test.st      p.value
## (Intercept)   18.6358719 0.5693513 32.7317636 0.000000e+00
## CageDensity3  -1.0177320 0.5142108 -1.9792114 4.779221e-02
## CageDensity4  -1.1366604 0.4874889 -2.3316641 1.971837e-02
## CageDensity5  -1.3513646 0.4914227 -2.7499028 5.961295e-03
## CageDensity6  -1.5536356 0.4906843 -3.1662632 1.544110e-03
## CageDensity7  -2.2883748 0.5719152 -4.0012487 6.300910e-05
## sexM           4.2143385 0.1006135 41.8864280 0.000000e+00
## birthYear2003  0.3318858 0.3335067  0.9951399 3.196682e-01
## birthYear2004  1.9788235 0.3575527  5.5343551 3.123755e-08
## 
## estimated variance components:
##    Estimate  StdError  prop.var
## G  2.123722 0.2697555 0.3509632
## In 3.927402 0.1536502 0.6490368

varcomp(gb)

##    Estimate  StdError  prop.var         se
## G  2.123722 0.2697555 0.3509632 0.03440950
## In 3.927402 0.1536502 0.6490368 0.03756528

EBVg<-summary(gb)$uhat

Evaluate prediction accuracy

Compare GEBV to phenotypes and to corrected phenotypes

plot(EBVg,gb$model$y)
abline(0,1)

cor(gb$model$y,EBVg)

##          [,1]
## [1,] 0.554764

Ycor<-gb$ehat #this is the phenotype minus fixed effects
plot(EBVg~Ycor)
abline(0,1)

cor(Ycor,EBVg)

##           [,1]
## [1,] 0.6867677

Prediction accuracy with new phenotypes

Let’s set aside some observations to prediction and refit model.

#random vector of missing values
Nobs<-nrow(miceCimp$pheno)
Nmis<-round(Nobs*0.1) #10% missing
tomissing<-sample(rownames(miceCimp$pheno),Nmis)
delG<-!(rownames(Gmat)%in%tomissing)
Gmiss<-Gmat[delG,delG]

gbmiss<-gblup(rsp="weight",data=miceCimp,design=c(y~CageDensity+sex+birthYear),G=Gmiss)

print(gbmiss)

## gblup analysis of trait: weight 
## 
## fixed effects equation:
## y ~ CageDensity + sex + birthYear
## 
## random effects equation:
## ~G + In
## 
## log-likelihood: -2245.489 converged in: 7 iterations 
## 
## estimated fixed effects:
##                 Estimate  StdError    test.st      p.value
## (Intercept)   18.8168901 0.6577242 28.6090878 0.000000e+00
## CageDensity3  -1.1199982 0.5850741 -1.9142842 5.558385e-02
## CageDensity4  -1.1850424 0.5586844 -2.1211304 3.391083e-02
## CageDensity5  -1.4599564 0.5617177 -2.5990929 9.347048e-03
## CageDensity6  -1.6767818 0.5629528 -2.9785476 2.896181e-03
## CageDensity7  -2.3589715 0.6339190 -3.7212509 1.982384e-04
## sexM           4.2336245 0.1050824 40.2886268 0.000000e+00
## birthYear2003  0.2094526 0.3526029  0.5940183 5.524999e-01
## birthYear2004  1.9067436 0.3768627  5.0595181 4.203174e-07
## 
## estimated variance components:
##    Estimate  StdError  prop.var
## G  2.070753 0.2754597 0.3489157
## In 3.864071 0.1609608 0.6510843

varcomp(gbmiss)

##    Estimate  StdError  prop.var         se
## G  2.070753 0.2754597 0.3489157 0.03621222
## In 3.864071 0.1609608 0.6510843 0.03973085

In-class activity

write a function to predict breeding values of animals with missing data, but related to animals with records
Use the function and predict breeding value for animals who were assumed missing
compare (plot and correlation coefficient) predicted breeding value using this function versus +phenotype +predicted breeding value with complete data +corrected phenotypes obtained with complete data