setwd("~/LabWork/Patuxent/BWTE_2/BWTE2/PhyloExample")
suppressMessages(library(rentrez))
suppressMessages(library(ape))
suppressMessages(library(ips))
suppressMessages(library(phangorn))
suppressMessages(library(geiger))
suppressMessages(library(dendextend))
suppressMessages(library(ggplot2))
suppressMessages(library(ggtree))
suppressMessages(library(phylobase))
suppressMessages(library(insect))
suppressMessages(library(stringr))
suppressMessages(library(dplyr))
Previously aligned using MUSCLE. Includes Multiple species surveillance data, for 135 influenza positive H5 isolates from the IRD database.
HA_segs = read.dna("Strain_Alignment_AIH5.fasta", format = "fasta")
#Remove Duplicate Names
duplicates = duplicated.DNAbin(HA_segs, point = TRUE)
attr(duplicates, "pointers")
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17
## [18] 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 32
## [35] 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50
## [52] 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67
## [69] 68 69 70 66 71 72 73 74 75 76 77 78 79 80 81 82 83
## [86] 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100
## [103] 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117
## [120] 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 48 133
## [137] 134 135
HA_segs = subset.DNAbin(HA_segs, subset = !duplicates)
write.nexus.data(HA_segs , file = "HA_032619.nex", #Save copy
format = "dna",
datablock = TRUE,
interleaved = FALSE,
gap = "-",
missing = "N")
Cleaning-up names for consistency.
Strains = unlist(attr(HA_segs, "dimnames")[1])
Strains.tab = as.data.frame(cbind(Strains, Strains))
names(Strains.tab) = c("Strain","SampID")
Strains.tab$Strain = as.character(Strains.tab$Strain)
Strains.tab$Strain = gsub("_","",as.character(Strains.tab$Strain))
Strains.tab$Strain = gsub("-","",as.character(Strains.tab$Strain))
Strains.tab$Strain = str_replace_all(Strains.tab$Strain, fixed(" "), "")
Strains.tab$SampID = NA
Also from the IRD database, but this file Includes Long and Lat information.
Surveil.df = read.csv("./Surveillance_032619.csv", stringsAsFactors=FALSE)
Surveil.df$SampID = paste("Samp_", 1:dim(Surveil.df)[1], sep="")
Surveil.df$Strain.Name2 = str_replace(Surveil.df$Strain.Name, " \\(.*\\)", "")
Surveil.df$Strain.Name2 = gsub("\\s*\\([^\\)]+\\)","",as.character(Surveil.df$Strain.Name))
Surveil.df$Strain.Name2 = gsub("_","",as.character(Surveil.df$Strain.Name2))
Surveil.df$Strain.Name2 = gsub("-","",as.character(Surveil.df$Strain.Name2))
Surveil.df$Strain.Name2 = str_replace_all(Surveil.df$Strain.Name2, fixed(" "), "")
dim(Surveil.df)
## [1] 137 18
head(Surveil.df)
## Collector.Institution Collection.Date Country State...Province
## 1 Ohio State University 12/08/2011 USA Mississippi
## 2 Ohio State University 08/19/2011 USA Ohio
## 3 Ohio State University 10/16/2011 USA Wisconsin
## 4 Ohio State University 01/16/2012 USA Mississippi
## 5 Ohio State University 01/17/2012 USA Mississippi
## 6 Ohio State University 10/03/2010 USA Wisconsin
## Host.Species Host.Common.Name Subtype Flu.Test.Status Type
## 1 Anas strepera Gadwall H5N3 Positive A
## 2 Anas platyrhynchos Mallard H5N2 Positive A
## 3 Anas carolinensis Green-Winged Teal H5N2 Positive A
## 4 Anas carolinensis Green-Winged Teal H5N3 Positive A
## 5 Anas carolinensis Green-Winged Teal H5N3 Positive A
## 6 Anas carolinensis Green-Winged Teal H5N2 Positive A
## Strain.Name Latitude
## 1 A/gadwall/Mississippi/11OS5840/2011(H5N3) 32.8305
## 2 A/mallard/Ohio/11OS2229/2011(H5N2) 41.4345
## 3 A/American green-winged teal/Wisconsin/11OS3432/2011(H5N2) 43.5497
## 4 A/American green-winged teal/Mississippi/12OS191/2012(H5N3) 32.5486
## 5 A/American green-winged teal/Mississippi/12OS311/2012(H5N3) 32.8305
## 6 A/American green-winged teal/Wisconsin/10OS2955/2010(H5N2) 43.5497
## Longitude Sample.Identifier Sample.Accession Age Health SampID
## 1 -90.9812 11OS5840 IRD_UMN000093739 Hatch Year Healthy Samp_1
## 2 -82.9622 11OS2229 IRD_UMN000091639 Hatch Year Healthy Samp_2
## 3 -88.6559 11OS3432 IRD_UMN000092298 Hatch Year Healthy Samp_3
## 4 -90.8707 12OS191 IRD_UMN000094423 Hatch Year Healthy Samp_4
## 5 -90.9812 12OS311 IRD_UMN000094543 Undetermined Healthy Samp_5
## 6 -88.6559 10OS2955 IRD_UMN000055620 Hatch Year Healthy Samp_6
## Strain.Name2
## 1 A/gadwall/Mississippi/11OS5840/2011
## 2 A/mallard/Ohio/11OS2229/2011
## 3 A/Americangreenwingedteal/Wisconsin/11OS3432/2011
## 4 A/Americangreenwingedteal/Mississippi/12OS191/2012
## 5 A/Americangreenwingedteal/Mississippi/12OS311/2012
## 6 A/Americangreenwingedteal/Wisconsin/10OS2955/2010
Creating unique IDs
Strains.tab$SampID = with(Surveil.df,
SampID[match(
Strains.tab$Strain,
Strain.Name2)])
Strains.tab$dup = duplicated(Strains.tab$SampID)
Strains.tab$SampID = ifelse(Strains.tab$dup == TRUE, paste(Strains.tab$SampID, "B", sep=""), Strains.tab$SampID)
One sample (Missing) in tree in alignment data, but, not Long Lat file.
Strains.tab$SampID[is.na(Strains.tab$SampID)] = "Missing"
attr(HA_segs, "dimnames")[[1]] = Strains.tab$SampID
image(HA_segs, show.labels = TRUE, cex = 0.5)
HA_segs_phy = as.phyDat(HA_segs)
mt = modelTest(HA_segs_phy)
## negative edges length changed to 0!
## [1] "JC+I"
## [1] "JC+G"
## [1] "JC+G+I"
## [1] "F81+I"
## [1] "F81+G"
## [1] "F81+G+I"
## [1] "K80+I"
## [1] "K80+G"
## [1] "K80+G+I"
## [1] "HKY+I"
## [1] "HKY+G"
## [1] "HKY+G+I"
## [1] "SYM+I"
## [1] "SYM+G"
## [1] "SYM+G+I"
## [1] "GTR+I"
## [1] "GTR+G"
## [1] "GTR+G+I"
arrange(mt, BIC) #Model summary , sorted by BIC
## Model df logLik AIC AICw AICc AICcw
## 1 GTR+G 276 -9369.476 19290.95 6.357194e-02 19393.57 9.250886e-02
## 2 GTR+G+I 277 -9365.786 19285.57 9.364281e-01 19389.00 9.074911e-01
## 3 GTR+I 276 -9390.282 19332.56 5.851571e-11 19435.18 8.515112e-11
## 4 HKY+G 272 -9407.421 19358.84 1.150502e-16 19458.25 8.352712e-16
## 5 HKY+G+I 273 -9403.735 19353.47 1.689523e-15 19453.67 8.230423e-15
## 6 HKY+I 272 -9427.901 19399.80 1.467820e-25 19499.21 1.065646e-24
## 7 SYM+G+I 274 -9442.133 19432.27 1.309495e-32 19533.27 4.272332e-32
## 8 SYM+G 273 -9446.911 19439.82 2.995885e-34 19540.03 1.459429e-33
## 9 SYM+I 273 -9472.232 19490.46 3.017251e-45 19590.67 1.469838e-44
## 10 K80+G+I 270 -9486.852 19513.70 2.712670e-50 19611.52 4.349757e-49
## 11 K80+G 269 -9491.327 19520.65 8.396304e-52 19617.69 1.995285e-50
## 12 K80+I 269 -9514.454 19566.91 7.587387e-62 19663.94 1.803055e-60
## 13 GTR 275 -9641.368 19832.74 1.433161e-119 19934.55 3.125672e-119
## 14 HKY 271 -9682.221 19906.44 1.417145e-135 20005.05 1.530467e-134
## 15 SYM 272 -9738.833 20021.67 1.351380e-160 20121.07 9.811097e-160
## 16 K80 268 -9778.020 20092.04 7.063887e-176 20188.29 2.483140e-174
## 17 F81+G+I 272 -9965.285 20474.57 6.079399e-259 20573.98 4.413681e-258
## 18 F81+G 271 -9969.250 20480.50 3.135234e-260 20579.11 3.385944e-259
## 19 F81+I 271 -9989.691 20521.38 4.156759e-269 20619.99 4.489154e-268
## 20 JC+G+I 269 -10055.065 20648.13 1.245973e-296 20745.16 2.960911e-295
## 21 JC+G 268 -10059.348 20654.70 4.677189e-298 20750.95 1.644154e-296
## 22 JC+I 268 -10081.611 20699.22 1.002725e-307 20795.47 3.524839e-306
## 23 F81 270 -10242.822 21025.64 0.000000e+00 21123.47 0.000000e+00
## 24 JC 267 -10340.048 21214.10 0.000000e+00 21309.57 0.000000e+00
## BIC
## 1 20802.61
## 2 20802.71
## 3 20844.23
## 4 20848.60
## 5 20848.70
## 6 20889.56
## 7 20932.98
## 8 20935.05
## 9 20985.70
## 10 20992.50
## 11 20993.98
## 12 21040.23
## 13 21338.92
## 14 21390.72
## 15 21511.42
## 16 21559.89
## 17 21964.32
## 18 21964.78
## 19 22005.66
## 20 22121.45
## 21 22122.54
## 22 22167.07
## 23 22504.45
## 24 22676.47
env = attr(mt, "env")
ls(envir=env)
## [1] "data" "F81" "F81+G" "F81+G+I"
## [5] "F81+I" "GTR" "GTR+G" "GTR+G+I"
## [9] "GTR+I" "HKY" "HKY+G" "HKY+G+I"
## [13] "HKY+I" "JC" "JC+G" "JC+G+I"
## [17] "JC+I" "K80" "K80+G" "K80+G+I"
## [21] "K80+I" "SYM" "SYM+G" "SYM+G+I"
## [25] "SYM+I" "tree_F81" "tree_F81+G" "tree_F81+G+I"
## [29] "tree_F81+I" "tree_GTR" "tree_GTR+G" "tree_GTR+G+I"
## [33] "tree_GTR+I" "tree_HKY" "tree_HKY+G" "tree_HKY+G+I"
## [37] "tree_HKY+I" "tree_JC" "tree_JC+G" "tree_JC+G+I"
## [41] "tree_JC+I" "tree_K80" "tree_K80+G" "tree_K80+G+I"
## [45] "tree_K80+I" "tree_SYM" "tree_SYM+G" "tree_SYM+G+I"
## [49] "tree_SYM+I"
(fit = eval(get("GTR+G", env), env)) #select model, save as "fit"
##
## loglikelihood: -9369.476
##
## unconstrained loglikelihood: -8245.543
## Discrete gamma model
## Number of rate categories: 4
## Shape parameter: 0.3507208
##
## Rate matrix:
## a c g t
## a 0.000000 1.30753461 6.64974164 0.537365
## c 1.307535 0.00000000 0.07174399 8.479255
## g 6.649742 0.07174399 0.00000000 1.000000
## t 0.537365 8.47925498 1.00000000 0.000000
##
## Base frequencies:
## 0.3587083 0.1773652 0.2182906 0.2456359
#Optimise model parameters without fitting edges
fit1 = optim.pml(fit,
optNni = FALSE, optBf = TRUE,
optQ = TRUE, optInv = TRUE,
optGamma = TRUE, optEdge = FALSE,
optRate = TRUE,
control = pml.control(epsilon = 1e-08,
maxit = 10, trace = 0))
#Fix substitution model and fit tree
fit2 = optim.pml(fit1,
optNni = TRUE, optBf = FALSE,
optQ = FALSE, optInv = FALSE,
optGamma = FALSE, optEdge = TRUE,
control = pml.control(epsilon = 1e-08,
maxit = 10, trace = 0))
#Fine tune
fit3 = optim.pml(fit2,
optNni = TRUE, optBf = TRUE,
optQ = TRUE, optInv = TRUE,
optGamma = TRUE, optEdge = TRUE,
optRate = FALSE,
control = pml.control(epsilon = 1e-08,
maxit = 10, trace = 0))
fit4 = fit3$tree
#fit4$tip.label
fit4 = root(fit4, 47, resolve.root = TRUE) # reroots the tree
p1 = ggtree(fit4,
ladderize=TRUE) +
geom_tiplab(cex = 1.75) +
geom_treescale(x=0.3, y=0.5, width=0.005,
offset=-1, fontsize = 3, color = "red") +
ggtitle("03-26-19 AI H5 ")
p1
Creating a variance covariance matrix from tree under an assumption of Brownian motion.
VCV.mat = vcv.phylo(fit4, model = "Brownian", corr = FALSE)
dim(VCV.mat)
## [1] 135 135
States = map("state",
fill = TRUE,
plot = FALSE)
IDs = sapply(strsplit(States$names, ":"),
function(x) x[1])
LL84 = "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
States = map2SpatialPolygons(States, IDs = IDs,
proj4string = CRS(LL84))
#Add a dataframe
pid = sapply(slot(States, "polygons"),
function(x) slot(x, "ID"))
p.df = data.frame(ID=1:length(States),
row.names = pid)
States = SpatialPolygonsDataFrame(States, p.df)
US = gUnaryUnion(gBuffer(States, width=0))
## Warning in gBuffer(States, width = 0): Spatial object is not projected;
## GEOS expects planar coordinates
Ras = raster(res = 0.05, ext = extent(US),
crs = proj4string(US))
Domain.r = rasterize(US, Ras,
field = 0,
background = NA)
#Point grid version
Grd.pnt = rasterToPoints(Domain.r, spatial = TRUE)
Grd.pnt@data = Grd.pnt@data %>%
mutate(Long = Grd.pnt@coords[,1],
Lat = Grd.pnt@coords[,2])
Surveil.df2 = Surveil.df %>%
mutate(Long = as.numeric(Longitude),
Lat = as.numeric(Latitude),
Spp = Host.Common.Name,
Date = as.POSIXct(Collection.Date,
tz = "GMT",
format = "%m/%d/%Y"),
Year = year(Date),
Month = month(Date)) %>%
filter(is.na(Long) == FALSE,
is.na(Lat) == FALSE)
## Warning: NAs introduced by coercion
## Warning: NAs introduced by coercion
Surveil.pnt = SpatialPointsDataFrame(cbind(Surveil.df2$Long, Surveil.df2$Lat), Surveil.df2)
proj4string(Surveil.pnt) = proj4string(US)
Surveil.pnt$Domain = over(Surveil.pnt, US)
Surveil.pnt = subset(Surveil.pnt, Domain == TRUE)
Isolate sample locations.
plot(States)
plot(Surveil.pnt, add=T, cex=1, pch=19, col="red")
NA.bf = gBuffer(US,
width = 1,
byid = FALSE)
## Warning in gBuffer(US, width = 1, byid = FALSE): Spatial object is not
## projected; GEOS expects planar coordinates
NA.bnds = inla.sp2segment(NA.bf)
MaxEdge = 0.5
mesh0 = inla.mesh.2d(boundary = NA.bnds,
loc = Surveil.pnt,
cutoff = 1,
max.edge = MaxEdge,
min.angle = 21)
MeshLocs = cbind(mesh0$loc[,1], mesh0$loc[,2])
xyz = as.data.frame(
inla.mesh.map(MeshLocs,
projection = "longlat",
inverse = TRUE))
true.radius.of.earth = 6371 #km
radius.of.earth = 1
mesh.dom = inla.mesh.create(loc = xyz,
cutoff = 35/true.radius.of.earth,
refine=list(max.edge = 1000/true.radius.of.earth,
min.angle = 26))
plot(mesh.dom, rgl = TRUE, main = " ")
Changing names to an integer value for ease of cross-referencing in model.
CNames = colnames(VCV.mat)
colnames(VCV.mat) = 1:135
rownames(VCV.mat) = 1:135
VCV.mat2 = as.matrix(VCV.mat)
LuTable = as.data.frame(cbind(1:135, CNames))
LuTable$CNames = as.character(LuTable$CNames)
Surveil.pnt$SampID2 = with(LuTable,
V1[match(Surveil.pnt$SampID,
CNames)])
Surveil.pnt$SampID2 = as.numeric(as.character(Surveil.pnt$SampID2))
locs = cbind(Surveil.pnt@coords[,1], Surveil.pnt@coords[,2]) #point locations
locs = inla.mesh.map(locs,
projection = "longlat",
inverse = TRUE)
A.det = inla.spde.make.A(mesh.dom, #the mesh
alpha = 2, #default setting
loc=locs) #our locations
spde = inla.spde2.pcmatern(mesh.dom,
prior.range = c(1, 0.9),
prior.sigma = c(1, 0.9))
field.det = inla.spde.make.index("field.det", spde$n.spde) #index
Estimating genetic realtedness iteratively between all pairs. Note that formula (Frm.M1) below includes space, species, year, month, and site location. Only 8-pairwise combinations are modeled here as an example.
DT.df = Surveil.pnt@data
DT.df %>%
group_by(Month) %>%
summarise(Count = length(Month))
DT.df %>%
group_by(Year) %>%
summarise(Count = length(Year))
DT.df$Site = 1:dim(DT.df)[1]
DT.lst = list(c(field.det, #Spatial index
list(intercept1 = 1)), #Intercept
list(Lat = DT.df[,"Lat"],
Long = DT.df[,"Long"],
SampID = DT.df[,"SampID2"],
Subtype = DT.df[,"Subtype"],
Spp = DT.df[,"Spp"],
Month = DT.df[,"Month"],
Year = DT.df[,"Year"],
Site = DT.df[,"Site"]))
for(i in 1:8){
Target.vect = as.data.frame(cbind(1:135, VCV.mat2[1:135,i]))
DT.df$Target = with(Target.vect,
V2[match(Surveil.pnt$SampID2,
V1)])
DT.df$Target[DT.df$Target == 1] = NA #Remove focal location (self-relation)
DT.df$Target[is.na(DT.df$Target)] = mean(DT.df$Target, na.rm=T)
surv.stk = inla.stack(data = list(Y = DT.df$Target),
A = list(A.det, 1), #Projection matrix
effects = DT.lst, #Spatial index and covariates
tag = "det.0")
Frm.M1 = Y ~ -1 + intercept1 +
f(field.det, model=spde) + #Spatial
f(Spp, model = "iid") + #Species
f(Year, model = "iid") + #Year
f(Month, model = "iid") + #Month
f(Site, model = "iid") #Site
Model.M1 = inla(Frm.M1,
data = inla.stack.data(surv.stk),
family = "gaussian",
verbose = TRUE,
control.fixed = list(prec = 0.001, prec.intercept = 0.0001),
control.predictor = list(
A = inla.stack.A(surv.stk),
compute = TRUE,
link = 1),
control.results = list(return.marginals.random = TRUE,
return.marginals.predictor = TRUE),
control.compute=list(dic = TRUE, cpo = TRUE, waic = TRUE))
#Map model output to a raster object
Pred.Grid = Grd.pnt
pLocs = cbind(Pred.Grid$Long, Pred.Grid$Lat)
pLocs = inla.mesh.map(pLocs,
projection = "longlat",
inverse = TRUE)
Ap = inla.spde.make.A(mesh.dom, loc=pLocs)
Pred.Grid$Intercept = Model.M1$summary.fitted.values[1,1]
Pred.Grid$M1 = drop(Ap %*% Model.M1$summary.random$field.det$mean)
Pred.Grid@data = Pred.Grid@data %>%
mutate(M2 = M1 + Intercept)
M1.r = rasterize(Pred.Grid,
Domain.r, "M2",
background = NA)
if(i == 1){GenDist.stk = M1.r}
else{GenDist.stk = stack(GenDist.stk, M1.r)}
}
Sum_gDist = mean(GenDist.stk) #Find average of all rasters
Scale = function(x){(x-min(x, na.rm=T))/(max(x, na.rm=T)-min(x, na.rm=T))}
values(Sum_gDist) = Scale(values(Sum_gDist)) #Scale for ease of interpretation
Pairwise combinations can be plotted, like these:
plot(GenDist.stk[[1]])
plot(GenDist.stk[[2]])
plot(GenDist.stk[[4]])
plot(GenDist.stk[[8]])
Values range from 0 (not realted) to 1 (highly related). Points indicate locations for 120 isolate samples.
Contours = rasterToContour(Sum_gDist, maxpixels=500000, levels = seq(0,1,0.10))
rng = seq(0, 1, 0.001)
mCols = c("#A50026", "#D73027", "#F46D43", "#FDAE61",
"#E0F3F8", "#ABD9E9", "#74ADD1", "#4575B4","#313695")
cr = colorRampPalette(c(rev(mCols)),
bias=1,space = "rgb")
M2.p = levelplot(Sum_gDist,
margin = FALSE,
xlab = " ", ylab = NULL,
col.regions = cr, at = rng,
colorkey = list(labels=list(cex=1.5),
space = "bottom"),
par.strip.text = list(fontface='bold', cex=1),
par.settings = list(axis.line = list(col = "black"),
strip.background = list(col = 'transparent'),
strip.border = list(col = 'transparent')),
scales = list(cex = 1.5)) +
latticeExtra::layer(sp.polygons(Contours, col = "black", lwd = 1)) +
latticeExtra::layer(sp.polygons(Surveil.pnt, col = "black", pch = 19))
M2.p