Example to Estimate Genetic Relatedness

Directory

setwd("~/LabWork/Patuxent/BWTE_2/BWTE2/PhyloExample")

Load Libraries

suppressMessages(library(rentrez))
suppressMessages(library(ape))
suppressMessages(library(ips))
suppressMessages(library(phangorn))
suppressMessages(library(geiger))
suppressMessages(library(dendextend))
suppressMessages(library(ggplot2))
suppressMessages(library(ggtree))
suppressMessages(library(phylobase))
suppressMessages(library(insect))
suppressMessages(library(stringr))
suppressMessages(library(dplyr))

Load Alignment

Previously aligned using MUSCLE. Includes Multiple species surveillance data, for 135 influenza positive H5 isolates from the IRD database.

HA_segs = read.dna("Strain_Alignment_AIH5.fasta", format = "fasta")

#Remove Duplicate Names
duplicates = duplicated.DNAbin(HA_segs, point = TRUE)
attr(duplicates, "pointers")
##   [1]   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
##  [18]  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  32
##  [35]  34  35  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50
##  [52]  51  52  53  54  55  56  57  58  59  60  61  62  63  64  65  66  67
##  [69]  68  69  70  66  71  72  73  74  75  76  77  78  79  80  81  82  83
##  [86]  84  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100
## [103] 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
## [120] 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132  48 133
## [137] 134 135
HA_segs = subset.DNAbin(HA_segs, subset = !duplicates)


write.nexus.data(HA_segs , file = "HA_032619.nex", #Save copy
                 format = "dna",
                 datablock = TRUE, 
                 interleaved = FALSE, 
                 gap = "-", 
                 missing = "N")

Get Strain Name

Cleaning-up names for consistency.

Strains = unlist(attr(HA_segs, "dimnames")[1])

Strains.tab = as.data.frame(cbind(Strains, Strains))
names(Strains.tab) = c("Strain","SampID")

Strains.tab$Strain =  as.character(Strains.tab$Strain) 
Strains.tab$Strain = gsub("_","",as.character(Strains.tab$Strain))
Strains.tab$Strain = gsub("-","",as.character(Strains.tab$Strain))
Strains.tab$Strain = str_replace_all(Strains.tab$Strain, fixed(" "), "")
Strains.tab$SampID = NA

Read in Surveillance Data

Also from the IRD database, but this file Includes Long and Lat information.

Surveil.df = read.csv("./Surveillance_032619.csv", stringsAsFactors=FALSE)



Surveil.df$SampID = paste("Samp_", 1:dim(Surveil.df)[1], sep="")
Surveil.df$Strain.Name2 = str_replace(Surveil.df$Strain.Name, " \\(.*\\)", "")

Surveil.df$Strain.Name2 = gsub("\\s*\\([^\\)]+\\)","",as.character(Surveil.df$Strain.Name))
Surveil.df$Strain.Name2 = gsub("_","",as.character(Surveil.df$Strain.Name2))
Surveil.df$Strain.Name2 = gsub("-","",as.character(Surveil.df$Strain.Name2))
Surveil.df$Strain.Name2 = str_replace_all(Surveil.df$Strain.Name2, fixed(" "), "")

dim(Surveil.df)
## [1] 137  18
head(Surveil.df)
##   Collector.Institution Collection.Date Country State...Province
## 1 Ohio State University      12/08/2011     USA      Mississippi
## 2 Ohio State University      08/19/2011     USA             Ohio
## 3 Ohio State University      10/16/2011     USA        Wisconsin
## 4 Ohio State University      01/16/2012     USA      Mississippi
## 5 Ohio State University      01/17/2012     USA      Mississippi
## 6 Ohio State University      10/03/2010     USA        Wisconsin
##         Host.Species  Host.Common.Name Subtype Flu.Test.Status Type
## 1      Anas strepera           Gadwall    H5N3        Positive    A
## 2 Anas platyrhynchos           Mallard    H5N2        Positive    A
## 3  Anas carolinensis Green-Winged Teal    H5N2        Positive    A
## 4  Anas carolinensis Green-Winged Teal    H5N3        Positive    A
## 5  Anas carolinensis Green-Winged Teal    H5N3        Positive    A
## 6  Anas carolinensis Green-Winged Teal    H5N2        Positive    A
##                                                   Strain.Name Latitude
## 1                   A/gadwall/Mississippi/11OS5840/2011(H5N3)  32.8305
## 2                          A/mallard/Ohio/11OS2229/2011(H5N2)  41.4345
## 3  A/American green-winged teal/Wisconsin/11OS3432/2011(H5N2)  43.5497
## 4 A/American green-winged teal/Mississippi/12OS191/2012(H5N3)  32.5486
## 5 A/American green-winged teal/Mississippi/12OS311/2012(H5N3)  32.8305
## 6  A/American green-winged teal/Wisconsin/10OS2955/2010(H5N2)  43.5497
##   Longitude Sample.Identifier Sample.Accession          Age  Health SampID
## 1  -90.9812          11OS5840 IRD_UMN000093739   Hatch Year Healthy Samp_1
## 2  -82.9622          11OS2229 IRD_UMN000091639   Hatch Year Healthy Samp_2
## 3  -88.6559          11OS3432 IRD_UMN000092298   Hatch Year Healthy Samp_3
## 4  -90.8707           12OS191 IRD_UMN000094423   Hatch Year Healthy Samp_4
## 5  -90.9812           12OS311 IRD_UMN000094543 Undetermined Healthy Samp_5
## 6  -88.6559          10OS2955 IRD_UMN000055620   Hatch Year Healthy Samp_6
##                                         Strain.Name2
## 1                A/gadwall/Mississippi/11OS5840/2011
## 2                       A/mallard/Ohio/11OS2229/2011
## 3  A/Americangreenwingedteal/Wisconsin/11OS3432/2011
## 4 A/Americangreenwingedteal/Mississippi/12OS191/2012
## 5 A/Americangreenwingedteal/Mississippi/12OS311/2012
## 6  A/Americangreenwingedteal/Wisconsin/10OS2955/2010

Unique ID

Creating unique IDs

Strains.tab$SampID = with(Surveil.df, 
                              SampID[match(
                                Strains.tab$Strain,
                                           Strain.Name2)])   

Strains.tab$dup = duplicated(Strains.tab$SampID)

Strains.tab$SampID = ifelse(Strains.tab$dup == TRUE, paste(Strains.tab$SampID, "B", sep=""), Strains.tab$SampID)

ReName

One sample (Missing) in tree in alignment data, but, not Long Lat file.

Strains.tab$SampID[is.na(Strains.tab$SampID)] = "Missing"

attr(HA_segs, "dimnames")[[1]] = Strains.tab$SampID

View Alignment

image(HA_segs, show.labels = TRUE, cex = 0.5)

Evaluate Substitution Models

HA_segs_phy = as.phyDat(HA_segs)

mt = modelTest(HA_segs_phy)
## negative edges length changed to 0!
## [1] "JC+I"
## [1] "JC+G"
## [1] "JC+G+I"
## [1] "F81+I"
## [1] "F81+G"
## [1] "F81+G+I"
## [1] "K80+I"
## [1] "K80+G"
## [1] "K80+G+I"
## [1] "HKY+I"
## [1] "HKY+G"
## [1] "HKY+G+I"
## [1] "SYM+I"
## [1] "SYM+G"
## [1] "SYM+G+I"
## [1] "GTR+I"
## [1] "GTR+G"
## [1] "GTR+G+I"

Check Results by BIC

arrange(mt, BIC) #Model summary , sorted by BIC
##      Model  df     logLik      AIC          AICw     AICc         AICcw
## 1    GTR+G 276  -9369.476 19290.95  6.357194e-02 19393.57  9.250886e-02
## 2  GTR+G+I 277  -9365.786 19285.57  9.364281e-01 19389.00  9.074911e-01
## 3    GTR+I 276  -9390.282 19332.56  5.851571e-11 19435.18  8.515112e-11
## 4    HKY+G 272  -9407.421 19358.84  1.150502e-16 19458.25  8.352712e-16
## 5  HKY+G+I 273  -9403.735 19353.47  1.689523e-15 19453.67  8.230423e-15
## 6    HKY+I 272  -9427.901 19399.80  1.467820e-25 19499.21  1.065646e-24
## 7  SYM+G+I 274  -9442.133 19432.27  1.309495e-32 19533.27  4.272332e-32
## 8    SYM+G 273  -9446.911 19439.82  2.995885e-34 19540.03  1.459429e-33
## 9    SYM+I 273  -9472.232 19490.46  3.017251e-45 19590.67  1.469838e-44
## 10 K80+G+I 270  -9486.852 19513.70  2.712670e-50 19611.52  4.349757e-49
## 11   K80+G 269  -9491.327 19520.65  8.396304e-52 19617.69  1.995285e-50
## 12   K80+I 269  -9514.454 19566.91  7.587387e-62 19663.94  1.803055e-60
## 13     GTR 275  -9641.368 19832.74 1.433161e-119 19934.55 3.125672e-119
## 14     HKY 271  -9682.221 19906.44 1.417145e-135 20005.05 1.530467e-134
## 15     SYM 272  -9738.833 20021.67 1.351380e-160 20121.07 9.811097e-160
## 16     K80 268  -9778.020 20092.04 7.063887e-176 20188.29 2.483140e-174
## 17 F81+G+I 272  -9965.285 20474.57 6.079399e-259 20573.98 4.413681e-258
## 18   F81+G 271  -9969.250 20480.50 3.135234e-260 20579.11 3.385944e-259
## 19   F81+I 271  -9989.691 20521.38 4.156759e-269 20619.99 4.489154e-268
## 20  JC+G+I 269 -10055.065 20648.13 1.245973e-296 20745.16 2.960911e-295
## 21    JC+G 268 -10059.348 20654.70 4.677189e-298 20750.95 1.644154e-296
## 22    JC+I 268 -10081.611 20699.22 1.002725e-307 20795.47 3.524839e-306
## 23     F81 270 -10242.822 21025.64  0.000000e+00 21123.47  0.000000e+00
## 24      JC 267 -10340.048 21214.10  0.000000e+00 21309.57  0.000000e+00
##         BIC
## 1  20802.61
## 2  20802.71
## 3  20844.23
## 4  20848.60
## 5  20848.70
## 6  20889.56
## 7  20932.98
## 8  20935.05
## 9  20985.70
## 10 20992.50
## 11 20993.98
## 12 21040.23
## 13 21338.92
## 14 21390.72
## 15 21511.42
## 16 21559.89
## 17 21964.32
## 18 21964.78
## 19 22005.66
## 20 22121.45
## 21 22122.54
## 22 22167.07
## 23 22504.45
## 24 22676.47

Select Best Model

env = attr(mt, "env")
ls(envir=env)
##  [1] "data"         "F81"          "F81+G"        "F81+G+I"     
##  [5] "F81+I"        "GTR"          "GTR+G"        "GTR+G+I"     
##  [9] "GTR+I"        "HKY"          "HKY+G"        "HKY+G+I"     
## [13] "HKY+I"        "JC"           "JC+G"         "JC+G+I"      
## [17] "JC+I"         "K80"          "K80+G"        "K80+G+I"     
## [21] "K80+I"        "SYM"          "SYM+G"        "SYM+G+I"     
## [25] "SYM+I"        "tree_F81"     "tree_F81+G"   "tree_F81+G+I"
## [29] "tree_F81+I"   "tree_GTR"     "tree_GTR+G"   "tree_GTR+G+I"
## [33] "tree_GTR+I"   "tree_HKY"     "tree_HKY+G"   "tree_HKY+G+I"
## [37] "tree_HKY+I"   "tree_JC"      "tree_JC+G"    "tree_JC+G+I" 
## [41] "tree_JC+I"    "tree_K80"     "tree_K80+G"   "tree_K80+G+I"
## [45] "tree_K80+I"   "tree_SYM"     "tree_SYM+G"   "tree_SYM+G+I"
## [49] "tree_SYM+I"
(fit = eval(get("GTR+G", env), env)) #select model, save as "fit"
## 
##  loglikelihood: -9369.476 
## 
## unconstrained loglikelihood: -8245.543 
## Discrete gamma model
## Number of rate categories: 4 
## Shape parameter: 0.3507208 
## 
## Rate matrix:
##          a          c          g        t
## a 0.000000 1.30753461 6.64974164 0.537365
## c 1.307535 0.00000000 0.07174399 8.479255
## g 6.649742 0.07174399 0.00000000 1.000000
## t 0.537365 8.47925498 1.00000000 0.000000
## 
## Base frequencies:  
## 0.3587083 0.1773652 0.2182906 0.2456359

Optimize

#Optimise model parameters without fitting edges
fit1 = optim.pml(fit, 
                  optNni = FALSE, optBf = TRUE, 
                  optQ = TRUE, optInv = TRUE, 
                  optGamma = TRUE, optEdge = FALSE, 
                  optRate = TRUE, 
                  control = pml.control(epsilon = 1e-08, 
                                        maxit = 10, trace = 0))

#Fix substitution model and fit tree
fit2 = optim.pml(fit1, 
                 optNni = TRUE, optBf = FALSE,
                 optQ = FALSE, optInv = FALSE, 
                 optGamma = FALSE, optEdge = TRUE,
                 control = pml.control(epsilon = 1e-08, 
                                       maxit = 10, trace = 0))

#Fine tune
fit3 = optim.pml(fit2, 
                 optNni = TRUE, optBf = TRUE,
                 optQ = TRUE, optInv = TRUE, 
                 optGamma = TRUE, optEdge = TRUE, 
                 optRate = FALSE,
                 control = pml.control(epsilon = 1e-08, 
                                       maxit = 10, trace = 0))


fit4 = fit3$tree


#fit4$tip.label
fit4 = root(fit4, 47, resolve.root = TRUE) # reroots the tree

Plot Tree

p1 = ggtree(fit4,  
            ladderize=TRUE) +
            geom_tiplab(cex = 1.75) +
            geom_treescale(x=0.3, y=0.5, width=0.005, 
                           offset=-1, fontsize = 3, color = "red") +
            ggtitle("03-26-19 AI H5 ")

p1 

Variance-CoVariance Matrix

Creating a variance covariance matrix from tree under an assumption of Brownian motion.

VCV.mat = vcv.phylo(fit4, model = "Brownian", corr = FALSE)

dim(VCV.mat)
## [1] 135 135

Model

States = map("state", 
            fill = TRUE,
            plot = FALSE)

IDs = sapply(strsplit(States$names, ":"),
             function(x) x[1])

LL84 = "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"

States = map2SpatialPolygons(States, IDs = IDs,
                              proj4string = CRS(LL84))

#Add a dataframe  
pid = sapply(slot(States, "polygons"), 
             function(x) slot(x, "ID"))

p.df = data.frame(ID=1:length(States), 
                  row.names = pid)

States = SpatialPolygonsDataFrame(States, p.df)

US = gUnaryUnion(gBuffer(States, width=0))
## Warning in gBuffer(States, width = 0): Spatial object is not projected;
## GEOS expects planar coordinates
Ras = raster(res = 0.05, ext = extent(US),
             crs = proj4string(US))

Domain.r = rasterize(US, Ras, 
                     field = 0, 
                     background = NA)

#Point grid version
Grd.pnt = rasterToPoints(Domain.r, spatial = TRUE)

Grd.pnt@data = Grd.pnt@data %>%
  mutate(Long = Grd.pnt@coords[,1],
         Lat = Grd.pnt@coords[,2]) 

Sample Points

Surveil.df2 = Surveil.df %>%
              mutate(Long = as.numeric(Longitude),
                     Lat = as.numeric(Latitude),
                     Spp = Host.Common.Name,
                     Date = as.POSIXct(Collection.Date, 
                                        tz = "GMT", 
                                        format = "%m/%d/%Y"),
                     Year = year(Date),
                     Month = month(Date)) %>%
              filter(is.na(Long) == FALSE,
                     is.na(Lat) == FALSE)
## Warning: NAs introduced by coercion

## Warning: NAs introduced by coercion
Surveil.pnt = SpatialPointsDataFrame(cbind(Surveil.df2$Long, Surveil.df2$Lat), Surveil.df2)
proj4string(Surveil.pnt) = proj4string(US)

Outside of U.S.

Surveil.pnt$Domain = over(Surveil.pnt, US)

Surveil.pnt = subset(Surveil.pnt, Domain == TRUE)

Quick Plot

Isolate sample locations.

plot(States)
plot(Surveil.pnt, add=T, cex=1, pch=19, col="red")

Domain Tesselation

NA.bf = gBuffer(US, 
                width = 1, 
                byid = FALSE) 
## Warning in gBuffer(US, width = 1, byid = FALSE): Spatial object is not
## projected; GEOS expects planar coordinates
NA.bnds = inla.sp2segment(NA.bf)

MaxEdge = 0.5

mesh0 = inla.mesh.2d(boundary = NA.bnds,
                     loc = Surveil.pnt,
                     cutoff = 1, 
                     max.edge = MaxEdge,
                     min.angle = 21) 

MeshLocs = cbind(mesh0$loc[,1], mesh0$loc[,2]) 

xyz = as.data.frame(                
          inla.mesh.map(MeshLocs, 
                        projection = "longlat", 
                        inverse = TRUE))

true.radius.of.earth = 6371 #km
radius.of.earth = 1

mesh.dom = inla.mesh.create(loc = xyz,
                     cutoff = 35/true.radius.of.earth, 
                     refine=list(max.edge = 1000/true.radius.of.earth, 
                                 min.angle = 26))

plot(mesh.dom, rgl = TRUE, main = " ")

Set Integer Matrix Index

Changing names to an integer value for ease of cross-referencing in model.

CNames = colnames(VCV.mat)

colnames(VCV.mat) = 1:135
rownames(VCV.mat) = 1:135

VCV.mat2 = as.matrix(VCV.mat)

LuTable = as.data.frame(cbind(1:135, CNames))
LuTable$CNames = as.character(LuTable$CNames)

Surveil.pnt$SampID2 = with(LuTable,
                         V1[match(Surveil.pnt$SampID,
                                              CNames)])

Surveil.pnt$SampID2 = as.numeric(as.character(Surveil.pnt$SampID2))

Set up Model Spatial Index

locs = cbind(Surveil.pnt@coords[,1], Surveil.pnt@coords[,2]) #point locations

locs = inla.mesh.map(locs, 
                     projection = "longlat", 
                     inverse = TRUE)

A.det = inla.spde.make.A(mesh.dom, #the mesh
                         alpha = 2, #default setting
                         loc=locs) #our locations

spde = inla.spde2.pcmatern(mesh.dom, 
                           prior.range = c(1, 0.9), 
                           prior.sigma = c(1, 0.9))


field.det = inla.spde.make.index("field.det", spde$n.spde) #index 

Run Model in Loop

Estimating genetic realtedness iteratively between all pairs. Note that formula (Frm.M1) below includes space, species, year, month, and site location. Only 8-pairwise combinations are modeled here as an example.

DT.df = Surveil.pnt@data

DT.df %>%
  group_by(Month) %>%
  summarise(Count = length(Month))

DT.df %>%
  group_by(Year) %>%
  summarise(Count = length(Year))


DT.df$Site = 1:dim(DT.df)[1]

DT.lst = list(c(field.det,                            #Spatial index
                list(intercept1 = 1)),                #Intercept
                list(Lat = DT.df[,"Lat"],
                     Long = DT.df[,"Long"],
                     SampID = DT.df[,"SampID2"],
                     Subtype = DT.df[,"Subtype"],
                     Spp = DT.df[,"Spp"],
                     Month = DT.df[,"Month"],
                     Year = DT.df[,"Year"],
                     Site = DT.df[,"Site"]))   


for(i in 1:8){
  
Target.vect = as.data.frame(cbind(1:135, VCV.mat2[1:135,i])) 


DT.df$Target = with(Target.vect,
                    V2[match(Surveil.pnt$SampID2,
                                            V1)])  


DT.df$Target[DT.df$Target == 1] = NA #Remove focal location (self-relation)

DT.df$Target[is.na(DT.df$Target)] = mean(DT.df$Target, na.rm=T)


surv.stk = inla.stack(data = list(Y = DT.df$Target), 
                           A = list(A.det, 1),        #Projection matrix
                     effects = DT.lst,                #Spatial index and covariates
                         tag = "det.0")   


Frm.M1 = Y ~ -1 + intercept1 + 
                   f(field.det, model=spde) + #Spatial
                   f(Spp, model = "iid") + #Species
                   f(Year, model = "iid") + #Year
                   f(Month, model = "iid") + #Month
                   f(Site, model = "iid")  #Site

Model.M1 = inla(Frm.M1, 
               data = inla.stack.data(surv.stk), 
               family = "gaussian", 
               verbose = TRUE,
               control.fixed = list(prec = 0.001, prec.intercept = 0.0001), 
               control.predictor = list(
                                      A = inla.stack.A(surv.stk), 
                                      compute = TRUE, 
                                      link = 1), 
               control.results = list(return.marginals.random = TRUE,
                                      return.marginals.predictor = TRUE),
               control.compute=list(dic = TRUE, cpo = TRUE, waic = TRUE)) 


#Map model output to a raster object
Pred.Grid = Grd.pnt
pLocs = cbind(Pred.Grid$Long, Pred.Grid$Lat)

pLocs = inla.mesh.map(pLocs, 
                      projection = "longlat", 
                      inverse = TRUE)

Ap = inla.spde.make.A(mesh.dom, loc=pLocs)


Pred.Grid$Intercept =  Model.M1$summary.fitted.values[1,1]
Pred.Grid$M1 = drop(Ap %*% Model.M1$summary.random$field.det$mean) 




Pred.Grid@data = Pred.Grid@data %>%
            mutate(M2 = M1 + Intercept)


M1.r = rasterize(Pred.Grid, 
                 Domain.r, "M2", 
                 background = NA)

        if(i == 1){GenDist.stk = M1.r}
          else{GenDist.stk = stack(GenDist.stk, M1.r)}

}


Sum_gDist = mean(GenDist.stk) #Find average of all rasters

Scale = function(x){(x-min(x, na.rm=T))/(max(x, na.rm=T)-min(x, na.rm=T))}
values(Sum_gDist) = Scale(values(Sum_gDist)) #Scale for ease of interpretation

Individual Plots

Pairwise combinations can be plotted, like these:

plot(GenDist.stk[[1]])

plot(GenDist.stk[[2]])

plot(GenDist.stk[[4]])

plot(GenDist.stk[[8]])

Mean Distance

Values range from 0 (not realted) to 1 (highly related). Points indicate locations for 120 isolate samples.

Contours = rasterToContour(Sum_gDist, maxpixels=500000, levels = seq(0,1,0.10))

rng = seq(0, 1, 0.001)

mCols = c("#A50026", "#D73027", "#F46D43", "#FDAE61", 
          "#E0F3F8", "#ABD9E9", "#74ADD1", "#4575B4","#313695") 


cr = colorRampPalette(c(rev(mCols)),  
                        bias=1,space = "rgb")

M2.p = levelplot(Sum_gDist, 
               margin = FALSE,
               xlab = " ", ylab = NULL, 
               col.regions = cr, at = rng,
               colorkey = list(labels=list(cex=1.5),
                               space = "bottom"),
                par.strip.text = list(fontface='bold', cex=1),
                par.settings = list(axis.line = list(col = "black"),
                                   strip.background = list(col = 'transparent'), 
                                   strip.border = list(col = 'transparent')),
                                   scales = list(cex = 1.5)) + 
                latticeExtra::layer(sp.polygons(Contours, col = "black", lwd = 1)) +
                latticeExtra::layer(sp.polygons(Surveil.pnt, col = "black", pch = 19))
M2.p