setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2017") # Set current working directory
rm(list=ls()) # Remove all objects from work environment
library(synbreed)
library(synbreedData)
library(genetics)
## Loading required package: combinat
##
## Attaching package: 'combinat'
## The following object is masked from 'package:utils':
##
## combn
## Loading required package: gdata
## gdata: Unable to locate valid perl interpreter
## gdata:
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata:
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
##
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
##
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
##
## Attaching package: 'gdata'
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
## Loading required package: gtools
## Loading required package: MASS
## Loading required package: mvtnorm
##
## NOTE: THIS PACKAGE IS NOW OBSOLETE.
##
## The R-Genetics project has developed an set of enhanced genetics
## packages to replace 'genetics'. Please visit the project homepage
## at http://rgenetics.org for informtion.
##
##
## Attaching package: 'genetics'
## The following objects are masked from 'package:base':
##
## %in%, as.factor, order
There are three datasets in gpData format
data(cattle)
summary(cattle)
## object of class 'gpData'
## covar
## No. of individuals 1929
## phenotyped 500
## genotyped 500
## pheno
## No. of traits: 2
##
## Phenotype1 Phenotype2
## Min. :-50.07000 Min. :-424.43
## 1st Qu.:-10.25750 1st Qu.: -94.40
## Median : 0.13500 Median : -0.84
## Mean : -0.00104 Mean : 0.05
## 3rd Qu.: 10.35250 3rd Qu.: 96.99
## Max. : 49.41000 Max. : 422.63
##
## geno
## No. of markers 7250
## genotypes AA AB BB
## frequencies 0.2889283 0.3456006 0.3627126
## NA's 0.276 %
## map
## No. of mapped markers 7250
## No. of chromosomes 29
##
## markers per chromosome
##
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18
## 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250
## 19 20 21 22 23 24 25 26 27 28 29
## 250 250 250 250 250 250 250 250 250 250 250
##
## pedigree
## Number of
## individuals 1929
## Par 1 376
## Par 2 1053
## generations 6
data(mice)
summary(mice)
## object of class 'gpData'
## covar
## No. of individuals 2527
## phenotyped 2527
## genotyped 1940
## pheno
## No. of traits: 2
##
## weight growth.slope
## Min. :11.9 Min. :-0.08889
## 1st Qu.:17.8 1st Qu.: 0.04556
## Median :19.9 Median : 0.08024
## Mean :20.3 Mean : 0.08659
## 3rd Qu.:22.6 3rd Qu.: 0.12569
## Max. :30.2 Max. : 0.26408
## NA's :16 NA's :53
##
## geno
## No. of markers 12545
## genotypes A/G G/G A/A C/C C/A A/T T/T G/C G/A C/G A/C T/A
## frequencies 0.15 0.277 0.311 0.081 0.015 0.016 0.026 0.004 0.063 0.012 0.036 0.006
## NA's 0.444 %
## map
## No. of mapped markers 12545
## No. of chromosomes 20
##
## markers per chromosome
##
## 1 10 11 12 13 14 15 16 17 18 19 2 3 4 5
## 1044 481 706 550 573 590 527 497 535 456 302 948 857 778 770
## 6 7 8 9 X
## 709 658 615 630 319
##
## pedigree
## NULL
data(maize)
summary(maize)
## object of class 'gpData'
## covar
## No. of individuals 1610
## phenotyped 1250
## genotyped 1250
## pheno
## No. of traits: 1
##
## IDTrait
## Min. :120.7
## 1st Qu.:142.8
## Median :148.9
## Mean :148.9
## 3rd Qu.:154.9
## Max. :181.8
##
## geno
## No. of markers 1117
## genotypes 0 1
## frequencies 0.339995 0.660005
## NA's 0.000 %
## map
## No. of mapped markers 1117
## No. of chromosomes 10
##
## markers per chromosome
##
## 1 2 3 4 5 6 7 8 9 10
## 76 96 99 122 85 106 154 130 121 128
##
## pedigree
## Number of
## individuals 1610
## Par 1 219
## Par 2 221
## generations 15
The package has specific plotting functions
plotGenMap(mice,dense=TRUE,nmarker=FALSE,ylab="pos [cM]")
plotGenMap(maize)
#how to create your own gpData object? We will not do this in this class, but it is really easy if you have a suitable dataset. See the synbreed vignette (in D2L website)
We need to recode genotypes to counts of a reference allele.
That task may be very easy when no heterozygotes are recorded
maizeC<-codeGeno(maize) #it is easy because genotypes are only AA or BB
summary(maizeC)
## object of class 'gpData'
## covar
## No. of individuals 1610
## phenotyped 1250
## genotyped 1250
## pheno
## No. of traits: 1
##
## IDTrait
## Min. :120.7
## 1st Qu.:142.8
## Median :148.9
## Mean :148.9
## 3rd Qu.:154.9
## Max. :181.8
##
## geno
## No. of markers 1117
## genotypes 0 2
## frequencies 0.7442206 0.2557794
## NA's 0.000 %
## map
## No. of mapped markers 1117
## No. of chromosomes 10
##
## markers per chromosome
##
## 1 2 3 4 5 6 7 8 9 10
## 76 96 99 122 85 106 154 130 121 128
##
## pedigree
## Number of
## individuals 1610
## Par 1 219
## Par 2 221
## generations 15
For the mice dataset, we need a function to distinguish heterozygotes
is.heter<-function(x){
substr(x,1,1)!=substr(x,3,3)
}
is.heter("A A")
## [1] FALSE
is.heter("A/A")
## [1] FALSE
is.heter("B B")
## [1] FALSE
is.heter("A B")
## [1] TRUE
miceC <- codeGeno(mice, label.heter = is.heter)
summary(miceC)
## object of class 'gpData'
## covar
## No. of individuals 2527
## phenotyped 2527
## genotyped 1940
## pheno
## No. of traits: 2
##
## weight growth.slope
## Min. :11.9 Min. :-0.08889
## 1st Qu.:17.8 1st Qu.: 0.04556
## Median :19.9 Median : 0.08024
## Mean :20.3 Mean : 0.08659
## 3rd Qu.:22.6 3rd Qu.: 0.12569
## Max. :30.2 Max. : 0.26408
## NA's :16 NA's :53
##
## geno
## No. of markers 12545
## genotypes 1 0 2
## frequencies 0.302 0.61 0.084
## NA's 0.444 %
## map
## No. of mapped markers 12545
## No. of chromosomes 20
##
## markers per chromosome
##
## 1 10 11 12 13 14 15 16 17 18 19 2 3 4 5
## 1044 481 706 550 573 590 527 497 535 456 302 948 857 778 770
## 6 7 8 9 X
## 709 658 615 630 319
##
## pedigree
## NULL
Let’s check if this worked correctly. First a few manual checks.
table(mice$geno[,1],miceC$geno[,1])
##
## 0 1 2
## A/A 0 0 358
## A/G 0 1003 0
## G/G 577 0 0
table(mice$geno[,17],miceC$geno[,17])
##
## 0 1
## G/A 0 40
## G/G 1896 0
table(mice$geno[,230],miceC$geno[,230])
##
## 0 1 2
## A/A 1309 0 0
## A/C 0 568 0
## C/C 0 0 63
compute variability measure of dosage within each genotype (it should be zero)
(by(miceC$geno[,230],mice$geno[,230],sd))
## mice$geno[, 230]: A/A
## [1] 0
## --------------------------------------------------------
## mice$geno[, 230]: A/C
## [1] 0
## --------------------------------------------------------
## mice$geno[, 230]: C/C
## [1] 0
notice: “dosage” should be first vector.
now, we can sum over genotypes
sum(by(miceC$geno[,230],mice$geno[,230],sd))
## [1] 0
now, let’s build a general function that given two matrices (v1,v2) and a column index (nc) will produce a 0 if there is agreement
agree_gen<-function(nc=1,v1,v2){
rs<-sum(by(v1[,nc],v2[,nc],sd,na.rm=T))
return(rs)
}
let’ test it:
agree_gen(1,miceC$geno,mice$geno)
## [1] 0
and we can use sapply
chk<-sapply(1:ncol(mice$geno),agree_gen,v1=miceC$geno,mice$geno)
summary(chk)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.0000 0.5065 1.4600 1.2240 1.9560 2.5430 89
imputation WARNING: this way of imputing genotypes is not optimal, because it will just assign random genotypes to missing values!!
We use it here to “fill in genotypes”
This is also a good opportunity to delete SNP that are close to fixation using maf=
And to delete SNP with too many missing values using nmiss=
Please refer to the help of codeGeno for details and more options
miceCimp<-codeGeno(mice, label.heter = is.heter,impute=T,impute.type="random",
maf=0.05,nmiss=0.01,verbose=T)
## step 1 : 400 marker(s) removed with > 1 % missing values
## step 2 : Recoding alleles
## step 4 : 2148 marker(s) removed with maf < 0.05
## step 7 : Imputing of missing values
## step 7d : Random imputing of missing values
## step 8 : Recode alleles due to imputation
## step 9 : 0 marker(s) removed with maf < 0.05
## step 10 : No duplicated markers removed
## End : 9997 marker(s) remain after the check
##
## Summary of imputation
## total number of missing values : 20141
## number of random imputations : 20141
summary(miceCimp)
## object of class 'gpData'
## covar
## No. of individuals 2527
## phenotyped 2527
## genotyped 1940
## pheno
## No. of traits: 2
##
## weight growth.slope
## Min. :11.9 Min. :-0.08889
## 1st Qu.:17.8 1st Qu.: 0.04556
## Median :19.9 Median : 0.08024
## Mean :20.3 Mean : 0.08659
## 3rd Qu.:22.6 3rd Qu.: 0.12569
## Max. :30.2 Max. : 0.26408
## NA's :16 NA's :53
##
## geno
## No. of markers 9997
## genotypes 1 0 2
## frequencies 0.367 0.532 0.101
## NA's 0.000 %
## map
## No. of mapped markers 9997
## No. of chromosomes 20
##
## markers per chromosome
##
## 1 10 11 12 13 14 15 16 17 18 19 2 3 4 5 6 7 8
## 875 332 646 490 410 435 427 437 375 346 248 801 757 716 553 628 509 479
## 9 X
## 527 6
##
## pedigree
## NULL
save(miceCimp,file="mice_clean.Rdata")
warning FOR all IMPUTATIONS: RECODING OF ALLELES IS performed!!!
Do you notice something else that is odd here?
(miceCimp$pheno[1:10,,1])
## weight growth.slope
## A048005080 20.3 0.139375629
## A048005112 16.1 -0.006243233
## A048006063 26.7 0.110238908
## A048006555 19.5 0.195829042
## A048007096 22.2 0.139949477
## A048010273 17.3 0.110281690
## A048010371 18.1 0.133378840
## A048011040 NA 0.102957907
## A048011287 25.6 0.068639480
## A048011554 18.6 0.058543689
qqnorm(miceCimp$pheno[,1,1])
qqnorm(miceCimp$pheno[,2,1])
dim(miceCimp$pheno)
## [1] 2527 2 1
dim(miceCimp$geno)
## [1] 1940 9997
head(miceC$covar)
## id phenotyped genotyped sex birthMonth birthYear CoatColour
## 1 A048005080 TRUE TRUE F 3 2003 black
## 2 A048005112 TRUE FALSE F 3 2003 chocolate
## 3 A048006063 TRUE TRUE M 2 2003 dark.brown
## 4 A048006555 TRUE TRUE M 3 2003 silverado
## 5 A048007096 TRUE TRUE M 1 2003 chocolate
## 6 A048010273 TRUE TRUE F 12 2002 chocolate
## CageDensity Litter
## 1 5 2
## 2 3 4
## 3 5 4
## 4 4 1
## 5 4 4
## 6 6 1
sum(miceC$covar$genotyped)
## [1] 1940
First let’s explore possible environmental effects
sum(rownames(miceCimp$pheno)!=(miceCimp$covar$id))
## [1] 0
boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$CageDensity)
boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$birthMonth)
boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$birthYear)
boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$CoatColour)
boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$sex)
We could also fit linear models (ANOVA) to see which of these factors are worth controling.
Let’s fit this model: \[ y_{ijk}=cage.density_{i}+sex_{j}+birth.year_{k}+e_{ijk} \\ e_{ijk}\sim N(0,\sigma^{2} ) \]
anova(lm(miceCimp$pheno[,1,1]~miceCimp$covar$sex+
as.factor(miceCimp$covar$CageDensity)+
as.factor(miceCimp$covar$birthYear)))
## Analysis of Variance Table
##
## Response: miceCimp$pheno[, 1, 1]
## Df Sum Sq Mean Sq F value
## miceCimp$covar$sex 1 11483.8 11483.8 1991.2456
## as.factor(miceCimp$covar$CageDensity) 5 250.1 50.0 8.6727
## as.factor(miceCimp$covar$birthYear) 2 2446.1 1223.1 212.0735
## Residuals 2423 13973.8 5.8
## Pr(>F)
## miceCimp$covar$sex < 2.2e-16 ***
## as.factor(miceCimp$covar$CageDensity) 3.684e-08 ***
## as.factor(miceCimp$covar$birthYear) < 2.2e-16 ***
## Residuals
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
We will keep these fixed effects in our subsequent analyses of this dataset.