#setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2015") # Set current working directory
setwd("C:/Users/Juan Steibel/SkyDrive/Documents/job/ans824/2015") # Set current working directory
rm(list=ls()) # Remove all objects from work environment
library(synbreed)
## Warning: package 'synbreed' was built under R version 3.0.3
## Loading required package: doBy
## Warning: package 'doBy' was built under R version 3.0.3
## Loading required package: survival
## Loading required package: splines
## Loading required package: BLR
## Loading required package: SuppDists
## Loading required package: regress
## Warning: package 'regress' was built under R version 3.0.3
## Loading required package: abind
library(synbreedData)
library(regress)
let’s load cattle mice dataset obtain a few descriptive statistics
data(mice)
is.heter<-function(x){
substr(x,1,1)!=substr(x,3,3)
}
miceCimp<-codeGeno(mice, label.heter = is.heter,impute=T,impute.type="random",
maf=0.05,nmiss=0.01,verbose=T)
## step 1 : 400 marker(s) removed with > 1 % missing values
## step 2 : Recoding alleles
## step 4 : 2148 marker(s) removed with maf < 0.05
## step 7 : Imputing of missing values
## step 7d : Random imputing of missing values
## step 8 : No recoding of alleles necessary after imputation
## step 9 : 0 marker(s) removed with maf < 0.05
## step 10 : No duplicated markers removed
## End : 9997 marker(s) remain after the check
##
## Summary of imputation
## total number of missing values : 20141
## number of random imputations : 20141
summary(miceCimp)
## object of class 'gpData'
## covar
## No. of individuals 2527
## phenotyped 2527
## genotyped 1940
## pheno
## No. of traits: 2
##
## weight growth.slope
## Min. :11.9 Min. :-0.08889
## 1st Qu.:17.8 1st Qu.: 0.04556
## Median :19.9 Median : 0.08024
## Mean :20.3 Mean : 0.08659
## 3rd Qu.:22.6 3rd Qu.: 0.12569
## Max. :30.2 Max. : 0.26408
## NA's :16 NA's :53
##
## geno
## No. of markers 9997
## genotypes 1 0 2
## frequencies 0.367 0.532 0.101
## NA's 0.000 %
## map
## No. of mapped markers 9997
## No. of chromosomes 20
##
## markers per chromosome
##
## 1 10 11 12 13 14 15 16 17 18 19 2 3 4 5 6 7 8
## 875 332 646 490 410 435 427 437 375 346 248 801 757 716 553 628 509 479
## 9 X
## 527 6
##
## pedigree
## NULL
Gmat<-kin(miceCimp,ret="realized")
plot(Gmat)
summary(Gmat)
## dimension 1940 x 1940
## rank 1939
## range of off-diagonal values -0.2796827 -- 1.218864
## mean off-diagonal values -0.0005252555
## range of diagonal values 0.812 -- 1.325
## mean diagonal values 1.01847
## number of unique values 1874008
dim(Gmat)
## [1] 1940 1940
dim(miceCimp$pheno)
## [1] 2527 2 1
phenotypes<-miceCimp$pheno[,1,1]
summary(phenotypes)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 11.9 17.8 19.9 20.3 22.6 30.2 16
length(phenotypes)
## [1] 2527
phenotypes<-phenotypes[!is.na(phenotypes)]
sum(rownames(Gmat)%in%names(phenotypes))
## [1] 1928
idg<-rownames(Gmat)%in%names(phenotypes)
Gpheno<-Gmat[idg,idg]
dim(Gpheno)
## [1] 1928 1928
idp<-match(rownames(Gpheno),names(phenotypes))
phenoG<-phenotypes[idp]
sum(names(phenoG)!=rownames(Gpheno))
## [1] 0
idcv<-match(rownames(Gpheno),miceCimp$covar$id)
cvg<-miceCimp$covar[idcv,]
sum(cvg$id!=rownames(Gpheno))
## [1] 0
Now let’s fit the model
lmmg<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex,~Gpheno)
lmmg
## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity)3+as.factor(cvg$CageDensity)4+as.factor(cvg$CageDensity)5+as.factor(cvg$CageDensity)6+as.factor(cvg$CageDensity)7+cvg$sexM
##
## Maximized log likelihood with kernel K is -2559.798
##
## Linear Coefficients:
## Estimate Std. Error
## (Intercept) 19.324 0.492
## as.factor(cvg$CageDensity)3 -0.956 0.525
## as.factor(cvg$CageDensity)4 -0.945 0.497
## as.factor(cvg$CageDensity)5 -1.191 0.501
## as.factor(cvg$CageDensity)6 -1.384 0.500
## as.factor(cvg$CageDensity)7 -2.354 0.584
## cvg$sexM 4.208 0.103
##
## Variance Coefficients:
## Estimate Std. Error
## Gpheno 3.182 0.351
## In 3.900 0.158
names(lmmg)
## [1] "trace" "llik" "cycle"
## [4] "rdf" "beta" "beta.cov"
## [7] "beta.se" "sigma" "sigma.cov"
## [10] "W" "Q" "fitted"
## [13] "predicted" "predictedVariance" "predictedVariance2"
## [16] "pos" "Vnames" "formula"
## [19] "Vformula" "Kcolnames" "model"
## [22] "Z" "X" "Sigma"
lmmg$sigma[1]/(sum(lmmg$sigma)) #estimate heritability
## Gpheno
## 0.4492891
EBVg<-BLUP(lmmg)
plot(EBVg$Mean,phenoG)
abline(0,1)
cor(phenoG,EBVg$Mean)
## [1] 0.5989608
corrected<-regress(phenoG~as.factor(cvg$CageDensity)+cvg$sex)
Ycor<-phenoG-corrected$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)
cor(Ycor,EBVg$Mean)
## [,1]
## [1,] 0.7455167
Predictive ability under (cross) validation
#random vector of missing values
Nobs<-length(phenoG)
Nmis<-round(length(phenoG)*0.1) #10% missing
tomissing<-sample(1:Nobs,Nmis)
phenoGna<-phenoG[-tomissing]
lmmgna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing],
~Gpheno[-tomissing,-tomissing])
lmmgna
## Likelihood kernel: K = (Intercept)+as.factor(cvg$CageDensity[-tomissing])3+as.factor(cvg$CageDensity[-tomissing])4+as.factor(cvg$CageDensity[-tomissing])5+as.factor(cvg$CageDensity[-tomissing])6+as.factor(cvg$CageDensity[-tomissing])7+cvg$sex[-tomissing]M
##
## Maximized log likelihood with kernel K is -2327.643
##
## Linear Coefficients:
## Estimate Std. Error
## (Intercept) 19.312 0.539
## as.factor(cvg$CageDensity[-tomissing])3 -0.923 0.573
## as.factor(cvg$CageDensity[-tomissing])4 -0.954 0.543
## as.factor(cvg$CageDensity[-tomissing])5 -1.201 0.549
## as.factor(cvg$CageDensity[-tomissing])6 -1.368 0.547
## as.factor(cvg$CageDensity[-tomissing])7 -2.459 0.633
## cvg$sex[-tomissing]M 4.242 0.110
##
## Variance Coefficients:
## Estimate Std. Error
## Gpheno[-tomissing, -tomissing] 3.249 0.373
## In 3.972 0.172
lmmgna$sigma[1]/(sum(lmmgna$sigma)) #estimate heritability
## Gpheno[-tomissing, -tomissing]
## 0.4498726
EBVgna<-BLUP(lmmgna)
plot(EBVgna$Mean,phenoGna)
abline(0,1)
cor(phenoGna,EBVgna$Mean)
## [1] 0.6006532
correctedna<-regress(phenoGna~as.factor(cvg$CageDensity[-tomissing])+cvg$sex[-tomissing])
Ycorna<-phenoGna-correctedna$predicted
plot(EBVg$Mean~Ycor)
abline(0,1)
cor(Ycorna,EBVgna$Mean)
## [,1]
## [1,] 0.7526971
EBV_norec<- Gpheno[tomissing,-tomissing]%*%solve(Gpheno[-tomissing,-tomissing])%*%EBVgna$Mean
head(sort(EBV_norec[,1],decreasing=T),10)
## A063867785 A067119610 A063571604 A084281887 A084286365 A084279851
## 3.420143 3.091214 2.958639 2.807412 2.782590 2.712283
## A067026328 A063544096 A084286265 A084285591
## 2.541050 2.475320 2.320364 2.251947
plot(EBV_norec,phenoG[tomissing])
cor(EBV_norec,phenoG[tomissing])
## [,1]
## [1,] 0.4831398
abline(0,1)
plot(EBV_norec,Ycor[tomissing])
abline(0,1)
cor(EBV_norec,Ycor[tomissing])
## [,1]
## [1,] 0.5167684