Set up files

setwd("C:/Users/steibelj/OneDrive/Documents/job/ans824/2017")    # Set current working directory
rm(list=ls())           # Remove all objects from work environment

library(synbreed)
library(synbreedData)
library(genetics)

## Loading required package: combinat

## 
## Attaching package: 'combinat'

## The following object is masked from 'package:utils':
## 
##     combn

## Loading required package: gdata

## gdata: Unable to locate valid perl interpreter
## gdata: 
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata: 
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)

## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.

##

## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.

##

## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.

## 
## Attaching package: 'gdata'

## The following object is masked from 'package:stats':
## 
##     nobs

## The following object is masked from 'package:utils':
## 
##     object.size

## The following object is masked from 'package:base':
## 
##     startsWith

## Loading required package: gtools

## Loading required package: MASS

## Loading required package: mvtnorm

##

## NOTE: THIS PACKAGE IS NOW OBSOLETE.

##

##   The R-Genetics project has developed an set of enhanced genetics

##   packages to replace 'genetics'. Please visit the project homepage

##   at http://rgenetics.org for informtion.

##

## 
## Attaching package: 'genetics'

## The following objects are masked from 'package:base':
## 
##     %in%, as.factor, order

the datasets

There are three datasets in gpData format

data(cattle)
summary(cattle)

## object of class 'gpData' 
## covar 
##   No. of individuals 1929 
##           phenotyped 500 
##            genotyped 500 
## pheno 
##   No. of traits:        2 
## 
##    Phenotype1          Phenotype2     
##  Min.   :-50.07000   Min.   :-424.43  
##  1st Qu.:-10.25750   1st Qu.: -94.40  
##  Median :  0.13500   Median :  -0.84  
##  Mean   : -0.00104   Mean   :   0.05  
##  3rd Qu.: 10.35250   3rd Qu.:  96.99  
##  Max.   : 49.41000   Max.   : 422.63  
## 
## geno 
##   No. of markers 7250 
##   genotypes AA AB BB 
##   frequencies 0.2889283 0.3456006 0.3627126 
##   NA's 0.276 %
## map 
##   No. of mapped markers  7250 
##   No. of chromosomes     29 
## 
##   markers per chromosome 
##  
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18 
## 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 250 
##  19  20  21  22  23  24  25  26  27  28  29 
## 250 250 250 250 250 250 250 250 250 250 250 
## 
## pedigree 
## Number of 
##   individuals  1929 
##   Par 1        376 
##   Par 2        1053 
##   generations  6

data(mice)
summary(mice)

## object of class 'gpData' 
## covar 
##   No. of individuals 2527 
##           phenotyped 2527 
##            genotyped 1940 
## pheno 
##   No. of traits:        2 
## 
##      weight      growth.slope     
##  Min.   :11.9   Min.   :-0.08889  
##  1st Qu.:17.8   1st Qu.: 0.04556  
##  Median :19.9   Median : 0.08024  
##  Mean   :20.3   Mean   : 0.08659  
##  3rd Qu.:22.6   3rd Qu.: 0.12569  
##  Max.   :30.2   Max.   : 0.26408  
##  NA's   :16     NA's   :53        
## 
## geno 
##   No. of markers 12545 
##   genotypes A/G G/G A/A C/C C/A A/T T/T G/C G/A C/G A/C T/A 
##   frequencies 0.15 0.277 0.311 0.081 0.015 0.016 0.026 0.004 0.063 0.012 0.036 0.006 
##   NA's 0.444 %
## map 
##   No. of mapped markers  12545 
##   No. of chromosomes     20 
## 
##   markers per chromosome 
##  
##    1   10   11   12   13   14   15   16   17   18   19    2    3    4    5 
## 1044  481  706  550  573  590  527  497  535  456  302  948  857  778  770 
##    6    7    8    9    X 
##  709  658  615  630  319 
## 
## pedigree 
## NULL

data(maize)
summary(maize)

## object of class 'gpData' 
## covar 
##   No. of individuals 1610 
##           phenotyped 1250 
##            genotyped 1250 
## pheno 
##   No. of traits:        1 
## 
##     IDTrait     
##  Min.   :120.7  
##  1st Qu.:142.8  
##  Median :148.9  
##  Mean   :148.9  
##  3rd Qu.:154.9  
##  Max.   :181.8  
## 
## geno 
##   No. of markers 1117 
##   genotypes 0 1 
##   frequencies 0.339995 0.660005 
##   NA's 0.000 %
## map 
##   No. of mapped markers  1117 
##   No. of chromosomes     10 
## 
##   markers per chromosome 
##  
##   1   2   3   4   5   6   7   8   9  10 
##  76  96  99 122  85 106 154 130 121 128 
## 
## pedigree 
## Number of 
##   individuals  1610 
##   Par 1        219 
##   Par 2        221 
##   generations  15

The package has specific plotting functions

plotGenMap(mice,dense=TRUE,nmarker=FALSE,ylab="pos [cM]")

plotGenMap(maize)

#how to create your own gpData object? We will not do this in this class, but it is really easy if you have a suitable dataset. See the synbreed vignette (in D2L website)

recoding genotype

We need to recode genotypes to counts of a reference allele.

That task may be very easy when no heterozygotes are recorded

maizeC<-codeGeno(maize) #it is easy because genotypes are only AA or BB
summary(maizeC)

## object of class 'gpData' 
## covar 
##   No. of individuals 1610 
##           phenotyped 1250 
##            genotyped 1250 
## pheno 
##   No. of traits:        1 
## 
##     IDTrait     
##  Min.   :120.7  
##  1st Qu.:142.8  
##  Median :148.9  
##  Mean   :148.9  
##  3rd Qu.:154.9  
##  Max.   :181.8  
## 
## geno 
##   No. of markers 1117 
##   genotypes 0 2 
##   frequencies 0.7442206 0.2557794 
##   NA's 0.000 %
## map 
##   No. of mapped markers  1117 
##   No. of chromosomes     10 
## 
##   markers per chromosome 
##  
##   1   2   3   4   5   6   7   8   9  10 
##  76  96  99 122  85 106 154 130 121 128 
## 
## pedigree 
## Number of 
##   individuals  1610 
##   Par 1        219 
##   Par 2        221 
##   generations  15

For the mice dataset, we need a function to distinguish heterozygotes

is.heter<-function(x){
  substr(x,1,1)!=substr(x,3,3)
}
is.heter("A A")

## [1] FALSE

is.heter("A/A")

## [1] FALSE

is.heter("B B")

## [1] FALSE

is.heter("A B")

## [1] TRUE

miceC <- codeGeno(mice, label.heter = is.heter)
summary(miceC)

## object of class 'gpData' 
## covar 
##   No. of individuals 2527 
##           phenotyped 2527 
##            genotyped 1940 
## pheno 
##   No. of traits:        2 
## 
##      weight      growth.slope     
##  Min.   :11.9   Min.   :-0.08889  
##  1st Qu.:17.8   1st Qu.: 0.04556  
##  Median :19.9   Median : 0.08024  
##  Mean   :20.3   Mean   : 0.08659  
##  3rd Qu.:22.6   3rd Qu.: 0.12569  
##  Max.   :30.2   Max.   : 0.26408  
##  NA's   :16     NA's   :53        
## 
## geno 
##   No. of markers 12545 
##   genotypes 1 0 2 
##   frequencies 0.302 0.61 0.084 
##   NA's 0.444 %
## map 
##   No. of mapped markers  12545 
##   No. of chromosomes     20 
## 
##   markers per chromosome 
##  
##    1   10   11   12   13   14   15   16   17   18   19    2    3    4    5 
## 1044  481  706  550  573  590  527  497  535  456  302  948  857  778  770 
##    6    7    8    9    X 
##  709  658  615  630  319 
## 
## pedigree 
## NULL

checking recoding

Let’s check if this worked correctly. First a few manual checks.

table(mice$geno[,1],miceC$geno[,1])

##      
##          0    1    2
##   A/A    0    0  358
##   A/G    0 1003    0
##   G/G  577    0    0

table(mice$geno[,17],miceC$geno[,17])

##      
##          0    1
##   G/A    0   40
##   G/G 1896    0

table(mice$geno[,230],miceC$geno[,230])

##      
##          0    1    2
##   A/A 1309    0    0
##   A/C    0  568    0
##   C/C    0    0   63

compute variability measure of dosage within each genotype (it should be zero)

(by(miceC$geno[,230],mice$geno[,230],sd))

## mice$geno[, 230]: A/A
## [1] 0
## -------------------------------------------------------- 
## mice$geno[, 230]: A/C
## [1] 0
## -------------------------------------------------------- 
## mice$geno[, 230]: C/C
## [1] 0

notice: “dosage” should be first vector.

now, we can sum over genotypes

sum(by(miceC$geno[,230],mice$geno[,230],sd))

## [1] 0

now, let’s build a general function that given two matrices (v1,v2) and a column index (nc) will produce a 0 if there is agreement

agree_gen<-function(nc=1,v1,v2){
  rs<-sum(by(v1[,nc],v2[,nc],sd,na.rm=T))
  return(rs)
}

let’ test it:

agree_gen(1,miceC$geno,mice$geno)

## [1] 0

and we can use sapply

chk<-sapply(1:ncol(mice$geno),agree_gen,v1=miceC$geno,mice$geno)
summary(chk)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  0.0000  0.5065  1.4600  1.2240  1.9560  2.5430      89

Imputation of allelic dosages

imputation WARNING: this way of imputing genotypes is not optimal, because it will just assign random genotypes to missing values!!

We use it here to “fill in genotypes”
This is also a good opportunity to delete SNP that are close to fixation using maf=
And to delete SNP with too many missing values using nmiss=
Please refer to the help of codeGeno for details and more options

miceCimp<-codeGeno(mice, label.heter = is.heter,impute=T,impute.type="random",
                   maf=0.05,nmiss=0.01,verbose=T)

##    step 1  : 400 marker(s) removed with > 1 % missing values 
##    step 2  : Recoding alleles 
##    step 4  : 2148 marker(s) removed with maf < 0.05 
##    step 7  : Imputing of missing values 
##    step 7d : Random imputing of missing values 
##    step 8  : Recode alleles due to imputation 
##    step 9  : 0 marker(s) removed with maf < 0.05 
##    step 10 : No duplicated markers removed 
##    End     : 9997 marker(s) remain after the check
## 
##      Summary of imputation 
##     total number of missing values                : 20141 
##     number of random imputations                  : 20141

summary(miceCimp)

## object of class 'gpData' 
## covar 
##   No. of individuals 2527 
##           phenotyped 2527 
##            genotyped 1940 
## pheno 
##   No. of traits:        2 
## 
##      weight      growth.slope     
##  Min.   :11.9   Min.   :-0.08889  
##  1st Qu.:17.8   1st Qu.: 0.04556  
##  Median :19.9   Median : 0.08024  
##  Mean   :20.3   Mean   : 0.08659  
##  3rd Qu.:22.6   3rd Qu.: 0.12569  
##  Max.   :30.2   Max.   : 0.26408  
##  NA's   :16     NA's   :53        
## 
## geno 
##   No. of markers 9997 
##   genotypes 1 0 2 
##   frequencies 0.367 0.532 0.101 
##   NA's 0.000 %
## map 
##   No. of mapped markers  9997 
##   No. of chromosomes     20 
## 
##   markers per chromosome 
##  
##   1  10  11  12  13  14  15  16  17  18  19   2   3   4   5   6   7   8 
## 875 332 646 490 410 435 427 437 375 346 248 801 757 716 553 628 509 479 
##   9   X 
## 527   6 
## 
## pedigree 
## NULL

save(miceCimp,file="mice_clean.Rdata")

warning FOR all IMPUTATIONS: RECODING OF ALLELES IS performed!!!

Do you notice something else that is odd here?

(miceCimp$pheno[1:10,,1])

##            weight growth.slope
## A048005080   20.3  0.139375629
## A048005112   16.1 -0.006243233
## A048006063   26.7  0.110238908
## A048006555   19.5  0.195829042
## A048007096   22.2  0.139949477
## A048010273   17.3  0.110281690
## A048010371   18.1  0.133378840
## A048011040     NA  0.102957907
## A048011287   25.6  0.068639480
## A048011554   18.6  0.058543689

qqnorm(miceCimp$pheno[,1,1])

qqnorm(miceCimp$pheno[,2,1])

dim(miceCimp$pheno)

## [1] 2527    2    1

dim(miceCimp$geno)

## [1] 1940 9997

head(miceC$covar)

##           id phenotyped genotyped sex birthMonth birthYear CoatColour
## 1 A048005080       TRUE      TRUE   F          3      2003      black
## 2 A048005112       TRUE     FALSE   F          3      2003  chocolate
## 3 A048006063       TRUE      TRUE   M          2      2003 dark.brown
## 4 A048006555       TRUE      TRUE   M          3      2003  silverado
## 5 A048007096       TRUE      TRUE   M          1      2003  chocolate
## 6 A048010273       TRUE      TRUE   F         12      2002  chocolate
##   CageDensity Litter
## 1           5      2
## 2           3      4
## 3           5      4
## 4           4      1
## 5           4      4
## 6           6      1

sum(miceC$covar$genotyped)

## [1] 1940

First let’s explore possible environmental effects

sum(rownames(miceCimp$pheno)!=(miceCimp$covar$id))

## [1] 0

boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$CageDensity)

boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$birthMonth)

boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$birthYear)

boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$CoatColour)

boxplot(miceCimp$pheno[,1,1]~miceCimp$covar$sex)

We could also fit linear models (ANOVA) to see which of these factors are worth controling.

Let’s fit this model: \[ y_{ijk}=cage.density_{i}+sex_{j}+birth.year_{k}+e_{ijk} \\ e_{ijk}\sim N(0,\sigma^{2} ) \]

anova(lm(miceCimp$pheno[,1,1]~miceCimp$covar$sex+
        as.factor(miceCimp$covar$CageDensity)+
        as.factor(miceCimp$covar$birthYear)))

## Analysis of Variance Table
## 
## Response: miceCimp$pheno[, 1, 1]
##                                         Df  Sum Sq Mean Sq   F value
## miceCimp$covar$sex                       1 11483.8 11483.8 1991.2456
## as.factor(miceCimp$covar$CageDensity)    5   250.1    50.0    8.6727
## as.factor(miceCimp$covar$birthYear)      2  2446.1  1223.1  212.0735
## Residuals                             2423 13973.8     5.8          
##                                          Pr(>F)    
## miceCimp$covar$sex                    < 2.2e-16 ***
## as.factor(miceCimp$covar$CageDensity) 3.684e-08 ***
## as.factor(miceCimp$covar$birthYear)   < 2.2e-16 ***
## Residuals                                          
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

We will keep these fixed effects in our subsequent analyses of this dataset.

Processing genotype and phenotype files

Set up files

the datasets

recoding genotype

checking recoding

Imputation of allelic dosages