Full documentation for: An alternative size variable for allometric investigations in subadults
Overview
This research compendium provides code and steps needed to replicate the results of the publication:
Chu, E.Y., Stull, K.E., & Sylvester, A.D. (2022). An alternative size variable for allometric investigations in subadults. In review.
To run the code for this paper locally, please ensure that R, and optionally RStudio, are installed on the local system.
Additionally, the following packages are used and should be installed using install.packages("package_name"):
How to cite
Please cite this compendium as:
Chu, E.Y., Stull, K.E., & Sylvester, A.D. (2022). Compendium of R code and data for An alternative size variable for allometric investigations in subadults. Accessed Current Date.
The Data
All data are from the Subadult Virtual Anthropology Database. Specifically, this project uses portions of data from the United States and South Africa, which are also directly provided in the data folder.
Organization
This document is divided into three sections:
1. Data Manipulation
2. Data Analysis
3. Data Visualization
Setup
After the repository is on your local system, the following code is to setup your local R/RStudio enviornment:
rm(list=ls()) # clear global environment
if (!(grepl("subadult_sv_2022",getwd()))) {
stop("Please set your working directory to 'subadult_sv_2022'.")
}
## Load required libraries
library(MVN)
library(caret)
library(lmodel2)
library(tidyverse)
library(gridExtra)
## Load personal ggplot2 theme
source("elaine_theme.R")Data Manipulation
1. Import data
- Located in data folder
- Stored as
.rdsfiles
us <- readRDS("data/us.rds") # United States
za <- readRDS("data/za.rds") # South Africa (Zuid-Afrika)2. Calculate the geometric mean of long bone lengths
- Geometric mean == “GM”
- Long bone == “lb”
# U.S.
GM_len_us <- rep(NA, nrow(us)) # initialize empty vector for all us individuals
for(i in 1:nrow(us)){
# extract lb lengths
vars <- c(us[[i,"FDL"]],us[[i,"TDL"]],us[[i,"HDL"]],us[[i,"RDL"]])
# calculate GM using function from EnvStats
GM_len_us[i] <- geoMean(vars)
}
# Z.A.
GM_len_za <- rep(NA, nrow(za)) # initialize empty vector for all za individuals
for(i in 1:nrow(za)){
# extract lb lengths
vars <- c(za[[i,"FDL"]],za[[i,"TDL"]],za[[i,"HDL"]],za[[i,"RDL"]])
# calculate GM using function from EnvStats
GM_len_za[i] <- geoMean(vars)
}3. Separate demographic and metric data
# U.S.
demo_us <- us[1:4] # demographic columns
metric_us <- cbind(us[5:ncol(us)],GM_len) # metric columns, including GM_len
# Z.A.
demo_za <- za[1:4] # demographic columns
metric_za <- cbind(za[5:ncol(za)],GM_len) # metric columns, including GM_len4. Combine demographic and logged metric data into a new data frame
- Save as
*_log.rdsin data folder
# Combine columns, log metric data
us_log <- cbind(demo_us,log(metric_us))
za_log <- cbind(demo_za,log(metric_za)) # combine columns, log metric data
# Save data
saveRDS(us_log,"data/us_log.rds")
saveRDS(za_log,"data/za_log.rds")5. Set up for k-fold cross-validation with 10 folds
- Set the seed for reproducibility
- Save as
k_folds_list.rds
set.seed(2021) # set seed for reproducibility
# k-Fold model construction using reduced (standard) major axis
k_folds <- createFolds(us_log$medrec, k=10) # generate 10 folds
saveRDS(k_folds, "results/k_folds_list.rds") # saveData Analysis
1. Test for multivariate normality
Mardia Test and Q-Q Plot:
- Retain Q-Q Plot for Figure 2
mvn(us[5:ncol(us)],mvnTest="mardia",covariance=TRUE,
multivariatePlot="qq",showOutliers=TRUE)Henze-Zinkler Test:
mvn(us[5:ncol(us)],mvnTest="hz",covariance=TRUE,showOutliers=TRUE)2. Kendall’s Rank Correlation Coefficient
- Retain correlation values with stature as Table 5
- Save all correlations as
variable_correlations.csv
corr <- cor(us[5:ncol(us)], method="kendall")
corr[,1] # correlation values with stature - Table 5
write.csv(corr,"results/variable_correlations.csv",row.names=FALSE) # save3. Reduced Major-Axis (RMA) Regression
- Bivariate regression with size variable ~ stature
- Storing each slope and intercept in the list
params
- Save as
rma_params_k_fold.rds
for(c in 6:ncol(us_log)) {
size_var <- colnames(us_log)[c]
params[[size_var]] <- list()
slope <- c()
int <- c()
for(kk in 1:length(k_folds)) {
row_idx <- k_folds[[kk]]
model <- suppressMessages(lmodel2(us_log[-row_idx,c] ~
us_log[-row_idx,"stature"]))
slope <- c(slope, model$regression.results$Slope[3])
int <- c(int, model$regression.results$Intercept[3])
}
params[[size_var]]$slope <- slope
params[[size_var]]$int <- int
}
saveRDS(params, "results/rma_params_k_fold.rds") # save parameter list4. Compile the contents of params into a data frame
- Each row is the potential size variable, with mean slope and intercept as the two columns
- Add a new row with distance of mean slope from 1 for each variable
- Retain as Table 5
- Save file as
rma_coefficients.csv
# Initialize data frame with column `size_var` as each variable in `params`
rma_results <- data.frame(size_var=names(params))
for(v in 1:nrow(rma_results)) {
rma_results$slope[v] <- mean(params[[v]]$slope) # mean slope
rma_results$intercept[v] <- mean(params[[v]]$int) # mean intercept
}
length_idx <- which(rma_results$size_var %in%
c("FDL","TDL","HDL","RDL")) # identify lb lengths
rma_results <- rma_results[-length_idx,] # remove lb lengths (n/a as size var)
rma_results$diff <- abs(1-rma_results$slope) # calculate slope distance from 1
rownames(rma_results) <- NULL # reset row numbers
write.csv(rma_results,"results/rma_coefficients.csv",row.names=FALSE) # saveIn this step, RMSB, FMSB, and GM_msb are identified as having the closest slopes to 1 and are therefore retained for the rest of the analyses.
5. Calculate allometry coefficients for each long bone length (U.S. data)
- Stature, RMSB, FMSB, GM_msb, GM_len
## Allometric Coefficients - FDL
fdl_stature <- lmodel2(us_log[["FDL"]]~
us_log[["stature"]])$regression.results[3,]
fdl_rmsb <- lmodel2(us_log[["FDL"]]~
us_log[["RMSB"]])$regression.results[3,]
fdl_fmsb <- lmodel2(us_log[["FDL"]]~
us_log[["FMSB"]])$regression.results[3,]
fdl_gm_msb <- lmodel2(us_log[["FDL"]]~
us_log[["GM_msb"]])$regression.results[3,]
fdl_gm_len <- lmodel2(us_log[["FDL"]]~
us_log[["GM_len"]])$regression.results[3,]
## Allometric Coefficients - TDL
tdl_stature <- lmodel2(us_log[["TDL"]]~
us_log[["stature"]])$regression.results[3,]
tdl_rmsb <- lmodel2(us_log[["TDL"]]~
us_log[["RMSB"]])$regression.results[3,]
tdl_fmsb <- lmodel2(us_log[["TDL"]]~
us_log[["FMSB"]])$regression.results[3,]
tdl_gm_msb <- lmodel2(us_log[["TDL"]]~
us_log[["GM_msb"]])$regression.results[3,]
tdl_gm_len <- lmodel2(us_log[["TDL"]]~
us_log[["GM_len"]])$regression.results[3,]
## Allometric Coefficients - HDL
hdl_stature <- lmodel2(us_log[["HDL"]]~
us_log[["stature"]])$regression.results[3,]
hdl_rmsb <- lmodel2(us_log[["HDL"]]~
us_log[["RMSB"]])$regression.results[3,]
hdl_fmsb <- lmodel2(us_log[["HDL"]]~
us_log[["FMSB"]])$regression.results[3,]
hdl_gm_msb <- lmodel2(us_log[["HDL"]]~
us_log[["GM_msb"]])$regression.results[3,]
hdl_gm_len <- lmodel2(us_log[["HDL"]]~
us_log[["GM_len"]])$regression.results[3,]
## Allometric Coefficients - RDL
rdl_stature <- lmodel2(us_log[["RDL"]]~
us_log[["stature"]])$regression.results[3,]
rdl_rmsb <- lmodel2(us_log[["RDL"]]~
us_log[["RMSB"]])$regression.results[3,]
rdl_fmsb <- lmodel2(us_log[["RDL"]]~
us_log[["FMSB"]])$regression.results[3,]
rdl_gm_msb <- lmodel2(us_log[["RDL"]]~
us_log[["GM_msb"]])$regression.results[3,]
rdl_gm_len <- lmodel2(us_log[["RDL"]]~
us_log[["GM_len"]])$regression.results[3,]6. Compile all allometry coefficients into a single data frame (U.S. data)
- Save as
us_allometry_coefs.csv
long_bone <- c(rep("Humerus",5),rep("Radius",5),
rep("Femur",5),rep("Tibia",5))
method <- rep(c("Stature","RMSB","FMSB","GM_Midshaft","GM_Length"),4)
slope <- c(hdl_stature$Slope,hdl_rmsb$Slope,hdl_fmsb$Slope,
hdl_gm_msb$Slope,hdl_gm_len$Slope,
rdl_stature$Slope,rdl_rmsb$Slope,rdl_fmsb$Slope,
rdl_gm_msb$Slope,rdl_gm_len$Slope,
fdl_stature$Slope,fdl_rmsb$Slope,fdl_fmsb$Slope,
fdl_gm_msb$Slope,fdl_gm_len$Slope,
tdl_stature$Slope,tdl_rmsb$Slope,tdl_fmsb$Slope,
tdl_gm_msb$Slope,tdl_gm_len$Slope)
compDF <- data.frame(long_bone, method, slope) # combine into compDf
compDF$slope <- as.numeric(as.character(compDF$slope)) # make numeric
compDF$long_bone <- factor(compDF$long_bone,
levels=c("Humerus","Radius","Femur","Tibia"))
compDF$method <- factor(compDF$method,
levels=c("Stature","RMSB","FMSB",
"GM_Midshaft","GM_Length"))
write.csv(compDF,"results/us_allometry_coefs.csv",row.names=FALSE) # save7. Calculate allometry coefficients for each long bone length (Z.A. data)
- Stature, RMSB, FMSB, GM_msb, GM_len
## Allometric Coefficients - FDL
fdl_stature <- lmodel2(za_log[["FDL"]]~
za_log[["stature"]])$regression.results[3,]
fdl_rmsb <- lmodel2(za_log[["FDL"]]~
za_log[["RMSB"]])$regression.results[3,]
fdl_fmsb <- lmodel2(za_log[["FDL"]]~
za_log[["FMSB"]])$regression.results[3,]
fdl_gm_msb <- lmodel2(za_log[["FDL"]]~
za_log[["GM_msb"]])$regression.results[3,]
fdl_gm_len <- lmodel2(za_log[["FDL"]]~
za_log[["GM_len"]])$regression.results[3,]
## Allometric Coefficients - TDL
tdl_stature <- lmodel2(za_log[["TDL"]]~
za_log[["stature"]])$regression.results[3,]
tdl_rmsb <- lmodel2(za_log[["TDL"]]~
za_log[["RMSB"]])$regression.results[3,]
tdl_fmsb <- lmodel2(za_log[["TDL"]]~
za_log[["FMSB"]])$regression.results[3,]
tdl_gm_msb <- lmodel2(za_log[["TDL"]]~
za_log[["GM_msb"]])$regression.results[3,]
tdl_gm_len <- lmodel2(za_log[["TDL"]]~
za_log[["GM_len"]])$regression.results[3,]
## Allometric Coefficients - HDL
hdl_stature <- lmodel2(za_log[["HDL"]]~
za_log[["stature"]])$regression.results[3,]
hdl_rmsb <- lmodel2(za_log[["HDL"]]~
za_log[["RMSB"]])$regression.results[3,]
hdl_fmsb <- lmodel2(za_log[["HDL"]]~
za_log[["FMSB"]])$regression.results[3,]
hdl_gm_msb <- lmodel2(za_log[["HDL"]]~
za_log[["GM_msb"]])$regression.results[3,]
hdl_gm_len <- lmodel2(za_log[["HDL"]]~
za_log[["GM_len"]])$regression.results[3,]
## Allometric Coefficients - RDL
rdl_stature <- lmodel2(za_log[["RDL"]]~
za_log[["stature"]])$regression.results[3,]
rdl_rmsb <- lmodel2(za_log[["RDL"]]~
za_log[["RMSB"]])$regression.results[3,]
rdl_fmsb <- lmodel2(za_log[["RDL"]]~
za_log[["FMSB"]])$regression.results[3,]
rdl_gm_msb <- lmodel2(za_log[["RDL"]]~
za_log[["GM_msb"]])$regression.results[3,]
rdl_gm_len <- lmodel2(za_log[["RDL"]]~
za_log[["GM_len"]])$regression.results[3,]8. Compile all allometry coefficients into a single data frame (Z.A. data)
- Save as
za_allometry_coefs.csv
long_bone <- c(rep("Humerus",5),rep("Radius",5),rep("Femur",5),rep("Tibia",5))
method <- rep(c("Stature","RMSB","FMSB","GM_Midshaft","GM_Length"),4)
slope <- c(hdl_stature$Slope,hdl_rmsb$Slope,hdl_fmsb$Slope,
hdl_gm_msb$Slope,hdl_gm_len$Slope,
rdl_stature$Slope,rdl_rmsb$Slope,rdl_fmsb$Slope,
rdl_gm_msb$Slope,rdl_gm_len$Slope,
fdl_stature$Slope,fdl_rmsb$Slope,fdl_fmsb$Slope,
fdl_gm_msb$Slope,fdl_gm_len$Slope,
tdl_stature$Slope,tdl_rmsb$Slope,tdl_fmsb$Slope,
tdl_gm_msb$Slope,tdl_gm_len$Slope)
compDF <- data.frame(long_bone, method, slope)
compDF$slope <- as.numeric(as.character(compDF$slope))
compDF$long_bone <- factor(compDF$long_bone,
levels=c("Humerus","Radius","Femur","Tibia"))
compDF$method <- factor(compDF$method,
levels=c("Stature","RMSB","FMSB",
"GM_Midshaft","GM_Length"))
write.csv(compDF,"results/za_allometry_coefs.csv",row.names=FALSE) # saveData Visualization
Figure 1
Caption: Sample age distributions by country.
# Add location column and subset for medrec, agey, and location
us_sub <- us %>% mutate(location="U.S.") %>% select(medrec,agey,location)
za_sub <- za %>%mutate(location="Z.A.") %>% select(medrec,agey,location)
# Combine into full data frame
full_data <- rbind(us_sub, za_sub)
# Plot sample distribution as bar chart
ggplot(full_data, aes(x=as.integer(agey))) +
geom_bar(fill="grey75") +
elaine_theme + labs(x="Age (years)", y="Count") +
scale_x_continuous(breaks=0:12) +
geom_text(stat="count", aes(label=..count..), color="black") +
facet_grid(rows=vars(location), scale="free_y")Figure 2
Caption: Chi-square Q-Q plot to visualize multivariate normality. The solid black line represents multivariate normality, and the deviation of the pattern of filled circles indicates non-normality of the U.S. data.
Figure generated when testing for multivariate normality
Figure 3
Caption: Inter-long bone allometric relationships. The solid red line across slope=1.0 represents isometry. Solid line = stature, dashed line = RMSB, dotted line = FMSB, dot-dash line = GM midshaft breadth, two-dash line = GM diaphyseal length.
## Import allometry coefficient data
us_coef <- read.csv("results/us_allometry_coefs.csv")
za_coef <- read.csv("results/za_allometry_coefs.csv")
# Assign long bones as factors
us_coef$long_bone <- factor(us_coef$long_bone,
levels=c("Humerus","Radius","Femur","Tibia"))
za_coef$long_bone <- factor(za_coef$long_bone,
levels=c("Humerus","Radius","Femur","Tibia"))
# Plot U.S. data as point and line graph
us_plot <- ggplot(us_coef, aes(x=long_bone, y=slope, group=method)) +
geom_hline(yintercept=1, linetype="solid", size=0.5, col="red",alpha=0.5) +
geom_point(pch=1,size=3) + geom_line(aes(linetype=method), size=1) +
scale_y_continuous(limits=c(0.8,1.6),breaks=seq(0.8,1.6,0.2)) +
elaine_theme + scale_linetype_manual(values=c("solid","dashed","dotted",
"dotdash","twodash")) +
labs(x="Long Bone", y="Allometry Coefficient", title="a) U.S. Sample") +
theme(legend.position="none", plot.title=element_text(hjust=0))
# Plot Z.A. data as point and line graph
za_plot <- ggplot(za_coef, aes(x=long_bone, y=slope, group=method)) +
geom_hline(yintercept=1, linetype="solid", size=0.5, col="red",alpha=0.5) +
geom_point(pch=1,size=3) + geom_line(aes(linetype=method), size=1) +
scale_y_continuous(limits=c(0.8,1.6),breaks=seq(0.8,1.6,0.2)) +
elaine_theme + scale_linetype_manual(values=c("solid","dashed","dotted",
"dotdash","twodash")) +
labs(x="Long Bone", y="Allometry Coefficient", title="b) Z.A. Sample") +
theme(legend.position="none", plot.title=element_text(hjust=0))
# Arrange Plots Side-by-Side
grid.arrange(us_plot, za_plot, ncol=2)Figure 4
Caption: Bivariate relationship between stature (x) and alternative size variables (y) in log-normal space. The solid green line represents isometry. Note the magnitude and directionality of the black, non-solid line crossing the isometry line.
# Import RMA coefficient results with slope and intercept
rma_results <- read.csv("results/rma_coefficients.csv")
# Import U.S. Log data
us_log <- readRDS("data/us_log.rds")
# Individual plot for RMSB v. Stature
rmsb_idx <- which(rma_results$size_var=="RMSB") # index for RMSB
c_rmsb <- ggplot(us_log, aes(x=stature,y=RMSB)) +
geom_point(pch=1,col="grey") +
geom_abline(slope=1,
intercept=mean(us_log$RMSB)-mean(us_log$stature),
size=1.15,col="#009E73") +
geom_abline(slope=rma_results$slope[rmsb_idx],
intercept=rma_results$intercept[rmsb_idx],
size=1.15,linetype="dashed") +
labs(x=expression(paste(italic(ln),"(Stature)")),
y=expression(paste(italic(ln),"(RMSB)"))) + elaine_theme
# Individual plot for FMSB v. Stature
fmsb_idx <- which(rma_results$size_var=="FMSB") # index for FMSB
c_fmsb <- ggplot(us_log, aes(x=stature,y=FMSB)) +
geom_point(pch=1,col="grey") +
geom_abline(slope=1,
intercept=mean(us_log$FMSB)-mean(us_log$stature),
size=1.15,col="#009E73") +
geom_abline(slope=rma_results$slope[fmsb_idx],
intercept=rma_results$intercept[fmsb_idx],
size=1.15,linetype="dotted") +
labs(x=expression(paste(italic(ln),"(Stature)")),
y=expression(paste(italic(ln),"(FMSB)"))) + elaine_theme
# Individual plot for GM_msb v. Stature
gm_msb_idx <- which(rma_results$size_var=="GM_msb") # index for GM_msb
c_gm_msb <- ggplot(us_log, aes(x=stature,y=GM_msb)) +
geom_point(pch=1,col="grey") +
geom_abline(slope=1,
intercept=mean(us_log$GM_msb)-mean(us_log$stature),
size=1.15,col="#009E73") +
geom_abline(slope=rma_results$slope[gm_msb_idx],
intercept=rma_results$intercept[gm_msb_idx],
size=1.15,linetype="dotdash") +
labs(x=expression(paste(italic(ln),"(Stature)")),
y=expression(paste(italic(ln),"(GM_Midshaft)"))) + elaine_theme
# Individual plot for GM_len v. Stature
gm_len_idx <- which(rma_results$size_var=="GM_len") # index for GM_len
c_gm_len <- ggplot(us_log, aes(x=stature,y=GM_len)) +
geom_point(pch=1,col="grey") +
geom_abline(slope=1,
intercept=mean(us_log$GM_len)-mean(us_log$stature),
size=1.15,col="#009E73") +
geom_abline(slope=rma_results$slope[gm_len_idx],
intercept=rma_results$intercept[gm_len_idx],
size=1.15,linetype="twodash") +
labs(x=expression(paste(italic(ln),"(Stature)")),
y=expression(paste(italic(ln),"(GM_Length)"))) + elaine_theme
# Arrange plots as 2x2
grid.arrange(c_rmsb,c_fmsb,c_gm_msb,c_gm_len,nrow=2,
layout_matrix=rbind(c(1,1,2,2),
c(3,3,4,4)))Figure 5
Caption: Allometry coefficient relationships between long bone diaphyseal lengths from averaged coefficients from Auerbach & Sylvester (2011) on the left and the current study on the right. The solid red line across slope=1.0 represents isometry. Solid line = stature, dotted line = GM diaphyseal length.
# Import U.S. and Auerbach & Sylvester coefficient data
us_coef <- read.csv("results/us_allometry_coefs.csv")
as_coef <- read.csv("data/Auerbach&Sylvester_2011.csv")
# Add study column
us_coef$study <- "Current Study"
# Combine data
comp_coef <- rbind(us_coef[which(us_coef$method %in%
c("Stature","GM_Length")),], as_coef)
# Define long bone as factor
comp_coef$long_bone <- factor(comp_coef$long_bone,
levels=c("Humerus","Radius","Femur","Tibia"))
## Plot
ggplot(comp_coef, aes(x=long_bone, y=slope, group=method)) +
geom_hline(yintercept=1, linetype="solid", size=0.5, col="red", alpha=0.5) +
geom_line(aes(linetype=method), size=1) + geom_point(pch=1, size=3) +
scale_y_continuous(breaks=seq(0.8,1.6,by=0.2)) +
scale_linetype_manual(values=c("dotted","solid")) +
labs(x="Long Bone", y="Allometry Coefficient") +
facet_grid(cols=vars(study)) + elaine_theme + theme(legend.position="none")