This note is to study the ‘plotRCS’ package, developed by Dr. Rongrui Huo at Guangxi Medical University Cancer Hospital in Nanning, China. All functions were originally written by Dr. Huo, with minor modifications made by me to suit my specific needs.
rcsplot_check <- function(data,
outcome = NULL,
time = NULL,
exposure = NULL,
covariates = NULL,
positive = NULL,
group = NULL,
knots = c(0.05, 0.35, 0.65, 0.95),
knots.line = FALSE,
ref.value = "k1",
ref.line = TRUE,
conf.int = TRUE,
conf.level = 0.95,
conf.type = c("shape", "line"),
pvalue = TRUE,
pvalue.digits = 3,
pvalue.position = c(0.02, 0.98),
pvalue.label.overall = "P for overall",
pvalue.label.nonlinear = "P for nonlinear",
fontsize = 12,
fontfamily = "serif",
linesize = 0.25,
linecolor = "#0072B5FF",
alpha = 0.1,
xbreaks = NULL,
ybreaks = NULL,
xlab = "",
ylab = "",
explain = TRUE,
...) {
# Select variables and check
outcome <- select_variable(data, outcome)
exposure <- select_variable(data, exposure)
if(!is.null(time)){
time <- select_variable(data, time)
}
if(is.null(group)){
group <- select_variable(data, group)
}
if(!is.null(covariates)){
covariates <- select_variable(data, covariates)
covariates <- setdiff(covariates, outcome)
covariates <- setdiff(covariates, exposure)
covariates <- setdiff(covariates, time)
}
if(!is.numeric(data[[exposure]])){
stop("The exposure variable must be numeric.", call. = FALSE)
}
if(length(unique(data[[outcome]])) == 2L){
# Set positive event
if(is.null(positive)){
if(is.numeric(data[[outcome]])){
positive <- max(data[[outcome]])
}else if(is.factor(data[[outcome]])){
positive <- levels(data[[outcome]])[2]
}else{
stop("You need to specify the positive event of outcome.", call. = FALSE)
}
}
data[[outcome]] <- ifelse(data[[outcome]] == positive, 1, 0)
}else{
if(!is.numeric(data[[outcome]])){
stop("The outcome variable must be numeric or the level must be 2.", call. = FALSE)
}
}
# Set data to environments
pos <- 1
envir = as.environment(pos)
assign("ddist_", rms::datadist(data), envir = envir)
options(datadist = "ddist_")
# Set reference
assign("m_", stats::quantile(data[[exposure]], knots), envir = envir)
if(is.character(ref.value)){
if(ref.value == "min"){
ref.value1 <- min(data[[exposure]], na.rm = TRUE)
}else if(ref.value == "median"){
ref.value1 <- stats::median(data[[exposure]], na.rm = TRUE)
}else if(ref.value == "mean"){
ref.value1 <- mean(data[[exposure]], na.rm = TRUE)
}else{
if(regex_detect(ref.value, pattern = "^k\\d+", ignore.case = TRUE)){
ref.value1 <- regex_extract(ref.value, pattern = "\\d+")
ref.value1 <- as.numeric(ref.value1)
ref.value1 <- m_[ref.value1]
}else{
stop("Reference knot must start with k.", call. = FALSE)
}
}
}else{
ref.value1 <- ref.value
}
eval(parse(text = "ddist_$limits['Adjust to', exposure] <<- ref.value1"))
# Fit models
if(length(unique(data[[outcome]])) == 2L){
if(is.null(time)){
formula <- sprintf("%s ~ rms::rcs(%s, m_)", outcome, exposure)
if(length(covariates) != 0){
formula <- paste(formula, paste(covariates, collapse = " + "), sep = " + ")
}
formula <- stats::as.formula(formula)
model <- rms::lrm(formula = formula, data = data)
}else{
formula <- sprintf("survival::Surv(%s, %s) ~ rms::rcs(%s, m_)", time, outcome, exposure)
if(length(covariates) != 0){
formula <- paste(formula, paste(covariates, collapse = " + "), sep = " + ")
}
formula <- stats::as.formula(formula)
model <- rms::cph(formula = formula, data = data)
}
}else{
formula <- sprintf("%s ~ rms::rcs(%s, m_)", outcome, exposure)
if(length(covariates) != 0){
formula <- paste(formula, paste(covariates, collapse = " + "), sep = " + ")
}
formula <- stats::as.formula(formula)
model <- rms::ols(formula = formula, data = data)
}
# Check conf.level.
if(conf.level < 0 | conf.level > 1){
stop("The conf.level must be strictly between 0 and 1.", call. = FALSE)
}
# Get plotdata from models
if(length(unique(data[[outcome]])) == 2L){
if(is.null(group)){
eval.text <- sprintf("rms::Predict(model, %s, conf.int = %f, ref.zero = TRUE, fun = exp)",
exposure,
conf.level)
}else{
eval.text <- sprintf("rms::Predict(model, %s, %s, conf.int = %f, ref.zero = TRUE, fun = exp)",
exposure,
group,
conf.level)
}
}else{
eval.text <- sprintf("rms::Predict(model, %s, conf.int = %f, ref.zero = TRUE)",
exposure,
conf.level)
}
plotdata <- eval(parse(text = eval.text))
plotdata <- as.data.frame(plotdata)
# Delete data from environments
if(exists("ddist_")){
rm("ddist_", inherits = TRUE, envir = envir)
}
if(exists("m_")){
rm("m_", inherits = TRUE, envir = envir)
}
# Set breaks for x-axis or y-axis
if(is.null(xbreaks)){
xbreaks <- pretty(plotdata[[exposure]])
}
if(is.null(ybreaks)){
if(length(unique(data[[outcome]])) == 2L){
if(conf.int){
ybreaks <- pretty(c(0, plotdata$upper))
}else{
ybreaks <- pretty(c(0, plotdata$yhat))
}
}else{
if(conf.int){
ybreaks <- pretty(c(plotdata$lower, plotdata$upper))
}else{
ybreaks <- pretty(plotdata$yhat)
}
}
}
# Set labels for x-axis or y-axis
if(xlab == ""){
xlab <- attr(data[[exposure]], "label")
if (is.null(xlab)) {
xlab <- exposure
}
}
if(ylab == ""){
if(length(unique(data[[outcome]])) == 2L){
if (is.null(time)) {
ylab <- "Odds ratio"
if(conf.int){
ylab <- sprintf("%s (%s%% CI)", ylab, as.character(conf.level * 100))
}
} else{
ylab <- "Hazard ratio"
if(conf.int){
ylab <- sprintf("%s (%s%% CI)", ylab, as.character(conf.level * 100))
}
}
}else{
ylab <- "\u3b2"
if(conf.int){
ylab <- sprintf("%s (%s%% CI)", ylab, as.character(conf.level * 100))
}
}
}
conf.type <- match.arg(conf.type)
# Plot by ggplot2
if (is.null(group)) {
plot <- ggplot2::ggplot(plotdata) +
ggplot2::geom_line(ggplot2::aes_string(x = exposure, y = "yhat"),
color = linecolor,
linewidth = linesize)
if(conf.int){
if(conf.type == "shape"){
plot <- plot +
ggplot2::geom_ribbon(ggplot2::aes_string(exposure, ymin = "lower", ymax = "upper"),
alpha = alpha,
fill = linecolor)
}else{
plot <- plot +
ggplot2::geom_line(ggplot2::aes_string(x = exposure, y = "lower"),
color = linecolor,
linewidth = linesize,
linetype = 2) +
ggplot2::geom_line(ggplot2::aes_string(x = exposure, y = "upper"),
color = linecolor,
linewidth = linesize,
linetype = 2)
}
}
} else{
plot <- ggplot2::ggplot(plotdata) +
ggplot2::geom_line(ggplot2::aes_string(x = exposure, y = "yhat", color = group),
linewidth = linesize)
if(conf.int){
if(conf.type == "shape"){
plot <- plot +
ggplot2::geom_ribbon(ggplot2::aes_string(exposure, ymin = "lower", ymax = "upper", fill = group),
alpha = alpha)
}else{
plot <- plot +
ggplot2::geom_line(ggplot2::aes_string(x = exposure, y = "lower", color = group),
linewidth = linesize,
linetype = 2) +
ggplot2::geom_line(ggplot2::aes_string(x = exposure, y = "upper", color = group),
linewidth = linesize,
linetype = 2)
}
}
}
if(ref.line){
if(length(unique(data[[outcome]])) == 2L){
plot <- plot +
ggplot2::geom_hline(yintercept = 1, linetype = 2, linewidth = linesize)
}else{
plot <- plot +
ggplot2::geom_hline(yintercept = 0, linetype = 2, linewidth = linesize)
}
}
plot <- plot +
gg_theme_sci(font.size = fontsize, font.family = fontfamily) +
ggplot2::xlab(xlab) +
ggplot2::ylab(ylab) +
ggplot2::coord_cartesian(expand = FALSE, clip = "off") +
ggplot2::scale_x_continuous(breaks = xbreaks, limits = c(min(xbreaks), max(xbreaks))) +
ggplot2::scale_y_continuous(breaks = ybreaks, limits = c(min(ybreaks), max(ybreaks)))
if(knots.line){
plot <- plot +
ggplot2::geom_vline(xintercept = stats::quantile(data[[exposure]], knots),
linetype = 3,
linewidth = linesize)
}
# Show P value
if (pvalue) {
pdata <- stats::anova(model)
pdata <- as.data.frame(pdata)
if(length(unique(data[[outcome]])) == 2L){
p.value <- pdata[2, 3]
p.overall <- pdata[1, 3]
}else{
p.value <- pdata[2, 5]
p.overall <- pdata[1, 5]
}
p.value <- format_pvalue(p.value, digits = pvalue.digits)
if (!regex_detect(p.value, "<", fixed = TRUE)) {
p.value <- paste0(pvalue.label.nonlinear, " = ", p.value)
} else{
p.value <- regex_replace(p.value, "<", replacement = "", fixed = TRUE)
p.value <- paste0(pvalue.label.nonlinear, " < ", p.value)
}
p.overall <- format_pvalue(p.overall, digits = pvalue.digits)
if (!regex_detect(p.overall, "<", fixed = TRUE)) {
p.overall <- paste0(pvalue.label.overall, " = ", p.overall)
} else{
p.overall <-
regex_replace(p.overall, "<", replacement = "", fixed = TRUE)
p.overall <-
paste0(pvalue.label.overall, " < ", p.overall)
}
p.string <- paste(p.overall, p.value, sep = "\n")
# Check p value position.
if(length(pvalue.position) != 2L){
stop("The pvalue.position must of length 2.", call. = FALSE)
}
if(pvalue.position[1] < 0 | pvalue.position[1] > 1){
stop("The first element of pvalue.position must be strictly between 0 and 1.", call. = FALSE)
}
if(pvalue.position[2] < 0 | pvalue.position[2] > 1){
stop("The Second element of pvalue.position must be strictly between 0 and 1.", call. = FALSE)
}
px <- min(xbreaks) + (max(xbreaks) - min(xbreaks)) * pvalue.position[1]
py <- min(ybreaks) + (max(ybreaks) - min(ybreaks)) * pvalue.position[2]
plot <- plot + draw_label(p.string,
size = fontsize,
fontfamily = fontfamily,
x = px,
y = py,
hjust = 0,
vjust = 1)
}
# Explain the figures, title and note.
if(explain){
title <- sprintf("Figure: Association Between %s and %s Using a Restricted Cubic Spline Regression Model.",
exposure,
outcome)
if(length(unique(data[[outcome]])) == 2L){
abbr <- ifelse(is.null(time), "ORs", "HRs")
}else{
abbr <- "\u3b2"
}
note <- sprintf("Graphs show %s for %s according to %s",
abbr,
outcome,
exposure)
if(is.null(covariates)){
note <- paste0(note, ".")
}else{
note <- sprintf("%s adjusted for %s.", note, paste(covariates, collapse = ", "))
}
if(length(unique(data[[outcome]])) == 2L){
if(is.null(time)){
note <- paste(note, "Data were fitted by a logistic regression model,", sep = " ")
}else{
note <- paste(note, "Data were fitted by a restricted cubic spline Cox proportional hazards regression model,", sep = " ")
}
}else{
note <- paste(note, "Data were fitted by a linear regression model,", sep = " ")
}
if(is.character(ref.value)){
if(ref.value == "min"){
reference <- "the minimum"
}else if(ref.value == "median"){
reference <- "the median"
}else if(ref.value == "mean"){
reference <- "mean"
}else{
reference <- regex_extract(ref.value, pattern = "\\d+")
reference <- paste("the", paste0(knots * 100, "th")[as.numeric(reference)], "percentile", sep = " ")
}
}else{
reference <- ref.value
}
tmp <- sprintf("and the model was conducted with %d knots at the %s percentiles of %s (reference is %s).",
length(knots),
paste(paste0(knots * 100, "th"), collapse = ", "),
exposure,
reference)
note <- paste(note, tmp, sep = " ")
# The range of TSH was restricted to 0.34 to 7.5 mIU/L because predictions
# greater than 7.5 mIU/L (95th percentile) are based on too few data points
# note <- paste(note, sprintf("The %s ranges from %.1f to %.1f.",
# exposure,
# min(data[[exposure]], na.rm = TRUE),
# max(data[[exposure]], na.rm = TRUE)), sep = " ")
note <- paste(note, sprintf("Solid lines indicate %s, and %s indicate %s%% CIs.",
abbr,
ifelse(conf.type == "shape", "shadow shape", "dashed lines"),
as.character(conf.level * 100)), sep = " ")
if(length(unique(data[[outcome]])) == 2L){
note <- paste(note,
ifelse(is.null(time),
"OR, odds ratio; CI, confidence interval.",
"HR, hazard ratio; CI, confidence interval." ),
sep = " ")
}else{
note <- paste(note, "CI, confidence interval.", sep = " ")
}
attr(plot, "title") <- title
attr(plot, "note") <- note
}
attr(plot, "explain") <- explain
class(plot) <- c("rcsplot", class(plot))
plot
return(list('plot'=plot,'plotdata'=plotdata))
}
Utility function
select_variable <- function(data, ..., type = c("name", "data", "index")){
type <- match.arg(type)
if(length(c(...)) == 0L){
return(NULL)
}
index <- .col_index(data, ...)
if(length(index) == 0L){
return(NULL)
}
switch(type,
data = data[index],
name = {
varname <- names(data)[index]
names(varname) <- varname
varname
},
index = index)
}
.col_index <- function(data, ...){
varnames <- list(...)
res <- lapply(varnames, function(x){
if(is.numeric(x)){
if(max(x) > ncol(data) | min(x) <= 0){
stop("Out of range for column index.", call. = FALSE)
}
x
}else{
sapply(x, function(i){
if(regex_detect(i, pattern = ":", fixed = TRUE)){
st <- regex_split(i, pattern = ":", fixed = TRUE)[[1]]
check_name(data, st[1])
check_name(data, st[2])
start <- which(names(data) == st[1])
end <- which(names(data) == st[2])
start:end
}else{
check_name(data, i)
which(names(data) == i)
}
})
}
})
res <- unique(unlist(res))
names(res) <- names(data)[res]
res
}
check_name <- function(data, varnames){
tmp <- varnames %in% names(data)
if(!all(tmp)){
tmpname <- varnames[!tmp]
tmpname <- paste(tmpname, collapse = ", ")
message <- sprintf("%s are (is) not included in the data frame.", tmpname)
stop(message, call. = FALSE)
}
}
regex_replace <- function(string,
pattern,
replacement,
ignore.case = FALSE,
perl = FALSE,
fixed = FALSE,
useBytes = FALSE){
sub(pattern = pattern,
replacement = replacement,
x = string,
ignore.case = ignore.case,
perl = perl,
fixed = fixed,
useBytes = useBytes)
}
regex_split <- function(string,
pattern,
perl = FALSE,
fixed = FALSE,
useBytes = FALSE){
strsplit(
string,
pattern,
perl = perl,
fixed = fixed,
useBytes = useBytes)
}
regex_detect <- function(string,
pattern,
ignore.case = FALSE,
perl = FALSE,
fixed = FALSE,
useBytes = FALSE){
grepl(
pattern,
string,
ignore.case = ignore.case,
perl = perl,
fixed = fixed,
useBytes = useBytes)
}
regex_extract <- function(string,
pattern,
ignore.case = FALSE,
perl = FALSE,
fixed = FALSE,
useBytes = FALSE){
regmatches(string,
regexpr(pattern,
string,
ignore.case = ignore.case,
perl = perl,
fixed = fixed,
useBytes = useBytes))
}
format_pvalue <- function(x, digits) {
fmt <- paste0("%.", digits, "f")
pVec <- sapply(x, function(i){
if(is.na(i)){
NA
}else{
ifelse(i == -1, "NA", sprintf(fmt = fmt, i))
}
})
smallPString <- paste0("<0.", paste0(rep("0", digits - 1), collapse = ""), "1")
posAllZeros <- grepl("^0\\.0*$", pVec)
pVec[posAllZeros] <- smallPString
return(pVec)
}
draw_label <- function (label,
x = 0.5,
y = 0.5,
hjust = 0,
vjust = 1,
fontfamily = "serif",
fontface = "plain",
color = "black",
size = 12,
angle = 0,
lineheight = 0.9,
alpha = 1,
colour) {
if (!missing(colour)) {
color <- colour
}
text_par <- grid::gpar(col = color,
fontsize = size,
fontfamily = fontfamily,
fontface = fontface,
lineheight = lineheight,
alpha = alpha)
text.grob <- grid::textGrob(label,
x = grid::unit(0.5, "npc"),
y = grid::unit(0.5, "npc"),
hjust = hjust,
vjust = vjust,
rot = angle,
gp = text_par)
ggplot2::annotation_custom(text.grob,
xmin = x,
xmax = x,
ymin = y,
ymax = y)
}
gg_theme_sci <- function(font.size = 12,
font.family = "serif",
axis.line.size = 0.25,
axis.ticks.length = 0.12,
legend.key.size = 1.0,
face.bold = FALSE,
panel.grid.major = FALSE,
panel.grid.minor = FALSE,
panel.border = FALSE,
panel.spacing = 0.6,
strip.background = "gray90",
aspect.ratio = NULL,
plot.margin = c(0.4, 0.6, 0.4, 0.4),
...) {
face <- ifelse(face.bold, "bold", "plain")
if(panel.grid.major){
pg.major = ggplot2::element_line(color = "gray90", size = axis.line.size)
}else{
pg.major = ggplot2::element_blank()
}
if(panel.grid.minor){
pg.minor = ggplot2::element_line(color = "gray90", size = axis.line.size, linetype = "dashed")
}else{
pg.minor = ggplot2::element_blank()
}
if(panel.border){
pborder = ggplot2::element_rect(color = "black", size = axis.line.size)
}else{
pborder = ggplot2::element_rect(color = "NA")
}
ggplot2::theme_bw(
base_size = font.size,
base_family = font.family,
base_line_size = axis.line.size,
base_rect_size = axis.line.size) +
ggplot2::theme(
panel.background = ggplot2::element_rect(fill = NA),
panel.grid = ggplot2::element_blank(),
panel.border = pborder,
panel.grid.major = pg.major,
panel.grid.minor = pg.minor,
panel.spacing = ggplot2::unit(panel.spacing, "cm"),
strip.background = ggplot2::element_rect(fill = strip.background, size = axis.line.size),
axis.line = ggplot2::element_line(size = axis.line.size, color = "black",lineend = "square"),
axis.ticks.length = ggplot2::unit(axis.ticks.length, "cm"),
axis.ticks = ggplot2::element_line(color = "black", size = axis.line.size),
axis.text = ggplot2::element_text(color = "black", size = font.size),
axis.title = ggplot2::element_text(color = "black", size = font.size, face = face),
legend.background = ggplot2::element_rect(fill = "NA"),
legend.text = ggplot2::element_text(color = "black", size = font.size),
legend.title = ggplot2::element_text(face = face),
legend.key.size = ggplot2::unit(legend.key.size, "lines"),
plot.title = ggplot2::element_text(size = font.size + 2, face = face),
plot.title.position = "plot",
plot.margin = ggplot2::unit(plot.margin, "cm"), # top, right, bottom, left
strip.text = ggplot2::element_text(color = "black", size = font.size, face = face),
aspect.ratio = aspect.ratio,
complete = FALSE,
...
)
}
library(plotRCS) # we only need cancer data from this library
p_check<-rcsplot_check(data = cancer,
outcome = "status",
time = "time",
exposure = "age",
covariates = c("sex", "race", "size", "metastasis"))
p_check$plot
##
## Figure: Association Between age and status Using a Restricted Cubic Spline Regression Model.
## Graphs show HRs for status according to age adjusted for sex, race, size, metastasis. Data were fitted by a restricted cubic spline Cox proportional hazards regression model, and the model was conducted with 4 knots at the 5th, 35th, 65th, 95th percentiles of age (reference is the 5th percentile). Solid lines indicate HRs, and shadow shape indicate 95% CIs. HR, hazard ratio; CI, confidence interval.
plotdata<-p_check$plotdata
median(cancer$size)
## [1] 45
Another example using simulated survival data.
require(survival)
library(Hmisc)
n <- 1000
set.seed(731)
age <- 50 + 12*rnorm(n)
sd(age)
## [1] 12.31327
label(age) <- "Age"
sex <- factor(sample(c('Male','Female'), n,
rep=TRUE, prob=c(.6, .4)))
table(sex)
## sex
## Female Male
## 380 620
blood.pressure <- rnorm(n, 120, 15)
cholesterol <- rnorm(n, 200, 25)
cens <- 15*runif(n)
h <- .02*exp(.04*(age-50)+.8*(sex=='Female'))
t <- -log(runif(n))/h
label(t) <- 'Follow-up Time'
e <- ifelse(t<=cens,1,0)
t <- pmin(t, cens)
units(t) <- "Year"
my_plotdata<-data.frame(age,sex, blood.pressure,cholesterol,e,t)
class(my_plotdata)
## [1] "data.frame"
p_check2<-rcsplot_check(data = my_plotdata,
outcome = "e",
time = "t",
exposure = "age",
covariates = c("sex", "blood.pressure", "cholesterol"))
p_check2$plot
##
## Figure: Association Between age and e Using a Restricted Cubic Spline Regression Model.
## Graphs show HRs for e according to age adjusted for sex, blood.pressure, cholesterol. Data were fitted by a restricted cubic spline Cox proportional hazards regression model, and the model was conducted with 4 knots at the 5th, 35th, 65th, 95th percentiles of age (reference is the 5th percentile). Solid lines indicate HRs, and shadow shape indicate 95% CIs. HR, hazard ratio; CI, confidence interval.
plotdata2<-p_check2$plotdata
median(plotdata2$blood.pressure)
## [1] 119.3027
median(plotdata2$cholesterol)
## [1] 199.609