Set up files

#setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2015")    # Set current working directory
setwd("C:/Users/Juan Steibel/SkyDrive/Documents/job/ans824/2015")    # Set current working directory
rm(list=ls())        # Remove all objects from work environment

library(synbreed)
## Warning: package 'synbreed' was built under R version 3.0.3
## Loading required package: doBy
## Warning: package 'doBy' was built under R version 3.0.3
## Loading required package: survival
## Loading required package: splines
## Loading required package: BLR
## Loading required package: SuppDists
## Loading required package: regress
## Warning: package 'regress' was built under R version 3.0.3
## Loading required package: abind
library(synbreedData)
library(regress)

the dataset

let’s load cattle mice dataset obtain a few descriptive statistics

data(mice)
is.heter<-function(x){
  substr(x,1,1)!=substr(x,3,3)
}
miceCimp<-codeGeno(mice, label.heter = is.heter,impute=T,impute.type="random",
                   maf=0.05,nmiss=0.01,verbose=T)
##    step 1  : 400 marker(s) removed with > 1 % missing values 
##    step 2  : Recoding alleles 
##    step 4  : 2148 marker(s) removed with maf < 0.05 
##    step 7  : Imputing of missing values 
##    step 7d : Random imputing of missing values 
##    step 8  : No recoding of alleles necessary after imputation 
##    step 9  : 0 marker(s) removed with maf < 0.05 
##    step 10 : No duplicated markers removed 
##    End     : 9997 marker(s) remain after the check
## 
##      Summary of imputation 
##     total number of missing values                : 20141 
##     number of random imputations                  : 20141
summary(miceCimp)
## object of class 'gpData' 
## covar 
##   No. of individuals 2527 
##           phenotyped 2527 
##            genotyped 1940 
## pheno 
##   No. of traits:        2 
## 
##      weight      growth.slope     
##  Min.   :11.9   Min.   :-0.08889  
##  1st Qu.:17.8   1st Qu.: 0.04556  
##  Median :19.9   Median : 0.08024  
##  Mean   :20.3   Mean   : 0.08659  
##  3rd Qu.:22.6   3rd Qu.: 0.12569  
##  Max.   :30.2   Max.   : 0.26408  
##  NA's   :16     NA's   :53        
## 
## geno 
##   No. of markers 9997 
##   genotypes 1 0 2 
##   frequencies 0.367 0.532 0.101 
##   NA's 0.000 %
## map 
##   No. of mapped markers  9997 
##   No. of chromosomes     20 
## 
##   markers per chromosome 
##  
##   1  10  11  12  13  14  15  16  17  18  19   2   3   4   5   6   7   8 
## 875 332 646 490 410 435 427 437 375 346 248 801 757 716 553 628 509 479 
##   9   X 
## 527   6 
## 
## pedigree 
## NULL
Gmat<-kin(miceCimp,ret="realized")
plot(Gmat)

summary(Gmat)
##  dimension                     1940 x 1940 
##  rank                          1939 
##  range of off-diagonal values  -0.2796827 -- 1.218864 
##  mean off-diagonal values      -0.0005252555 
##  range of diagonal values      0.812 -- 1.325 
##  mean diagonal values          1.01847 
##  number of unique values       1874008
dim(Gmat)
## [1] 1940 1940
dim(miceCimp$pheno)
## [1] 2527    2    1
phenotypes<-miceCimp$pheno[,1,1]
summary(phenotypes)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    11.9    17.8    19.9    20.3    22.6    30.2      16
length(phenotypes)
## [1] 2527
phenotypes<-phenotypes[!is.na(phenotypes)]
sum(rownames(Gmat)%in%names(phenotypes))
## [1] 1928
idg<-rownames(Gmat)%in%names(phenotypes)
Gpheno<-Gmat[idg,idg]
dim(Gpheno)
## [1] 1928 1928
idp<-match(rownames(Gpheno),names(phenotypes))
phenoG<-phenotypes[idp]
sum(names(phenoG)!=rownames(Gpheno))
## [1] 0
idcv<-match(rownames(Gpheno),miceCimp$covar$id)
cvg<-miceCimp$covar[idcv,]
sum(cvg$id!=rownames(Gpheno))
## [1] 0

Now let’s fit the model

lmmg<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex,~Gpheno)
lmmg
## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity)3+as.factor(cvg$CageDensity)4+as.factor(cvg$CageDensity)5+as.factor(cvg$CageDensity)6+as.factor(cvg$CageDensity)7+cvg$sexM
## 
## Maximized log likelihood with kernel K is  -2559.798 
## 
## Linear Coefficients:
##                              Estimate Std. Error
##  (Intercept)                   19.324      0.492
##  as.factor(cvg$CageDensity)3   -0.956      0.525
##  as.factor(cvg$CageDensity)4   -0.945      0.497
##  as.factor(cvg$CageDensity)5   -1.191      0.501
##  as.factor(cvg$CageDensity)6   -1.384      0.500
##  as.factor(cvg$CageDensity)7   -2.354      0.584
##  cvg$sexM                       4.208      0.103
## 
## Variance Coefficients:
##                              Estimate Std. Error
##                       Gpheno    3.182      0.351
##                       In        3.900      0.158
names(lmmg)
##  [1] "trace"              "llik"               "cycle"             
##  [4] "rdf"                "beta"               "beta.cov"          
##  [7] "beta.se"            "sigma"              "sigma.cov"         
## [10] "W"                  "Q"                  "fitted"            
## [13] "predicted"          "predictedVariance"  "predictedVariance2"
## [16] "pos"                "Vnames"             "formula"           
## [19] "Vformula"           "Kcolnames"          "model"             
## [22] "Z"                  "X"                  "Sigma"
lmmg$sigma[1]/(sum(lmmg$sigma)) #estimate heritability
##    Gpheno 
## 0.4492891
EBVg<-BLUP(lmmg)
plot(EBVg$Mean,phenoG)
abline(0,1)

cor(phenoG,EBVg$Mean)
## [1] 0.5989608
corrected<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex)
Ycor<-phenoG-corrected$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)

cor(Ycor,EBVg$Mean)
##           [,1]
## [1,] 0.7455167

Predictive ability under (cross) validation

#random vector of missing values
Nobs<-length(phenoG)
Nmis<-round(length(phenoG)*0.1) #10% missing
tomissing<-sample(1:Nobs,Nmis)

phenoGna<-phenoG[-tomissing]
lmmgna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing],
                        ~Gpheno[-tomissing,-tomissing])
lmmgna
## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity[-tomissing])3+as.factor(cvg$CageDensity[-tomissing])4+as.factor(cvg$CageDensity[-tomissing])5+as.factor(cvg$CageDensity[-tomissing])6+as.factor(cvg$CageDensity[-tomissing])7+cvg$sex[-tomissing]M
## 
## Maximized log likelihood with kernel K is  -2327.643 
## 
## Linear Coefficients:
##                                          Estimate Std. Error
##  (Intercept)                               19.312      0.539
##  as.factor(cvg$CageDensity[-tomissing])3   -0.923      0.573
##  as.factor(cvg$CageDensity[-tomissing])4   -0.954      0.543
##  as.factor(cvg$CageDensity[-tomissing])5   -1.201      0.549
##  as.factor(cvg$CageDensity[-tomissing])6   -1.368      0.547
##  as.factor(cvg$CageDensity[-tomissing])7   -2.459      0.633
##  cvg$sex[-tomissing]M                       4.242      0.110
## 
## Variance Coefficients:
##                                          Estimate Std. Error
##           Gpheno[-tomissing, -tomissing]    3.249      0.373
##           In                                3.972      0.172
lmmgna$sigma[1]/(sum(lmmgna$sigma)) #estimate heritability
## Gpheno[-tomissing, -tomissing] 
##                      0.4498726
EBVgna<-BLUP(lmmgna)
plot(EBVgna$Mean,phenoGna)
abline(0,1)

cor(phenoGna,EBVgna$Mean)
## [1] 0.6006532
correctedna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing])
Ycorna<-phenoGna-correctedna$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)

cor(Ycorna,EBVgna$Mean)
##           [,1]
## [1,] 0.7526971
EBV_norec<- Gpheno[tomissing,-tomissing]%*%solve(Gpheno[-tomissing,-tomissing])%*%EBVgna$Mean
head(sort(EBV_norec[,1],decreasing=T),10)
## A063867785 A067119610 A063571604 A084281887 A084286365 A084279851 
##   3.420143   3.091214   2.958639   2.807412   2.782590   2.712283 
## A067026328 A063544096 A084286265 A084285591 
##   2.541050   2.475320   2.320364   2.251947
plot(EBV_norec,phenoG[tomissing])
cor(EBV_norec,phenoG[tomissing])
##           [,1]
## [1,] 0.4831398
abline(0,1)

plot(EBV_norec,Ycor[tomissing])
abline(0,1)

cor(EBV_norec,Ycor[tomissing])
##           [,1]
## [1,] 0.5167684