Code
##############################################################
# easytable-1 function #
##############################################################
#Hit ctrl+A, select all the text here and run this entire script (easytable1.R)
#The first time you run this, it might take a while to install a
#the expss and flextable packagage. Please be patient! It gets much
#quicker on subsequent runs.
#Read through the instructions here for details on using easytable1().
#This script also generates a sample dataset called mydat, and demonstrates,
#with examples, how to format the input dataset.
#Once the entire script is run successfully, open up your script
#and use the function to get a formatted table1 !
#############################
# About easytable1()
#############################
#easytable1 function: this is a function I wrote to build descriptive statistics that
#are usually seen in the table1s that my colleagues use at BIDMC, Boston MA.
#we usually use this type of table for clinical trials where the subjects are randomized
#between two groups, a treatment and a control.
#This is a personal project that I embarked on to make my day to day routine work easier.
#This isn't an official BIDMC endorsed projec.
# This function relies on the expss package by Gregory Demin a
# https://cran.r-project.org/web/packages/expss/expss.pdf
# to format the input labels, and flextable package by David Gohel
# https://davidgohel.github.io/flextable/articles/overview.html) to format the output.
# This function would not have been possible without their wonderful code.
# Mac users may have trouble with installing the flextable package.
# please refer to this post for a fix.
# https://stackoverflow.com/questions/48199737/cant-install-flextable
# Sorry for the inconvenience.
#The general structure of easytable1() function is as follows:
#easytable1(numerics = c("insertnumericvarnamehere", "insertnumericvarnamehere", ...),
# logicals = c("insertlogicalvarnamehere", "insertlogicalvarnamehere", ...),
# factors = c("insertfactorvarnamehere", "insertfactorvarnamehere", ...), # Atleast one of numerics,logicals,or factors is mandatory.
# by = "inserttreatgroupvarnamehere", # A by group is mandatory
# data = dataframename, # This is your input dataframe. Enter the name of the dataframe without quotes. This is mandatory.
# title = "inserttabletitlehere") # Insert table title here. This is optional.
#The output of the easytable1() function is:
# A mean (SD) if the numeric variable is normally distributed, or a median and IQR if the numeric variable is not normally distributed.
# A count and % for levels of factor variables, and for binary (logical variables)
# These statistics are stratified by treatment groups.
# #Missing values are excluded for calculations of variable level counts, percentages, means, SDs, medians and IQRs.
#Load your sample dataset (mydat)
#---------------------------------------------------------------
#Create dataset
id <- seq(from = 1,
to = 50,
by = 1 )
#set seed
set.seed(125)
#create treatment group codes.
treatgrp <- sample(x = c(0, 1), #0 represents placebo, 1 represents treated
size = 50,
replace = TRUE,
prob = c(0.5, 0.5))
#continuous variable:normally distributed
hemoglobin <- rnorm(n = 50,
mean = 11.0,
sd = 1.5)
shapiro.test(hemoglobin)
##
## Shapiro-Wilk normality test
##
## data: hemoglobin
## W = 0.97231, p-value = 0.2868
#continous variable:skew distribution
weights <- sort(rnorm(n = 50,
mean = 172.3,
sd = 29))
skewedweights <- sample(weights,
size = 50,
replace = TRUE,
prob = c(50:31, 30:16, 1:15))
#hist(skewedweights)
shapiro.test(skewedweights)
##
## Shapiro-Wilk normality test
##
## data: skewedweights
## W = 0.92818, p-value = 0.004719
#Create binary (dichotomous) variable
received_blood <- sample(x = c(0, 1),
size = 50,
replace = TRUE,
prob = c(0.5, 0.5))
#Another binary variable
insurance_status <- sample(x = c(0, 1),
size = 50,
replace = TRUE,
prob = c(0.7, 0.3))
#Another binary variable
survived <- sample(x = c(0, 1),
size = 50,
replace = TRUE,
prob = c(0.5, 0.5))
#create factor variables - race and hospital disposition
racevec <- c("White", "African-American", "Asian", "Alaska Native/American Indian")
race <- sample( x = racevec,
size = 50,
replace = TRUE,
prob = c(0.7, 0.2, 0.6, 0.4))
hospvec <- c("Home", "Home with home care", "Nursing facility", "Rehab", "Retained")
hospdisp <- sample( x = hospvec,
size = 50,
replace = TRUE,
prob = c(rep(0.20, 5)))
#bind all variables together in a dataset
mydat <- data.frame(ID = id,
treat = treatgrp,
hb = hemoglobin,
weight = skewedweights,
blood_rcv = received_blood,
insured = insurance_status,
race = race,
disp = hospdisp)
#make sure mydat is cleaned and coded appropriately.
str(mydat)
## 'data.frame': 50 obs. of 8 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : num 0 1 1 1 0 0 0 1 0 0 ...
## $ hb : num 11.54 8.24 11.84 12.34 11.59 ...
## $ weight : num 163 156 166 182 120 ...
## $ blood_rcv: num 0 0 1 0 0 0 1 0 0 1 ...
## $ insured : num 0 0 1 1 0 1 0 0 0 0 ...
## $ race : Factor w/ 4 levels "African-American",..: 4 1 3 4 4 4 2 2 4 1 ...
## $ disp : Factor w/ 5 levels "Home","Home with home care",..: 2 2 1 2 5 4 3 2 4 5 ...
mydat$ID <- as.integer(mydat$ID)
#-----------------------------------------------------------------
#View the structure of the sample dataset mydat here
str(mydat)
## 'data.frame': 50 obs. of 8 variables:
## $ ID : int 1 2 3 4 5 6 7 8 9 10 ...
## $ treat : num 0 1 1 1 0 0 0 1 0 0 ...
## $ hb : num 11.54 8.24 11.84 12.34 11.59 ...
## $ weight : num 163 156 166 182 120 ...
## $ blood_rcv: num 0 0 1 0 0 0 1 0 0 1 ...
## $ insured : num 0 0 1 1 0 1 0 0 0 0 ...
## $ race : Factor w/ 4 levels "African-American",..: 4 1 3 4 4 4 2 2 4 1 ...
## $ disp : Factor w/ 5 levels "Home","Home with home care",..: 2 2 1 2 5 4 3 2 4 5 ...
#----------------------------------------------------------------------------------------------------------------
# How to use the function #
#----------------------------------------------------------------------------------------------------------------
#For this function to work smoothly, mydat's input should:
#-----------> have variables supplied as components of one of the three input arguments. For example
# if you wish to have stats for the binary variable, 'insured', you should ensure it has values 1 for
# presence of insurance and 0 for absence of insurance, and then supply it to function as:
# easytable1(logicals = "insured", by = "treat", data = mydat).
# Any variable that is coded 1 for the presence of a characteristic and 0 for the absence can be fed into
# logicals = argument, regardless of whether it is actually numeric or integer.
# If you wanted to add stats for factor variable 'race', you should ensure that its levels are labelled,
# and then that the variable itself is labelled (see assigning variable labels), and then feed it to the
# function along with 'insured' as:
# easytable1( factors = "race", logicals = "insured", by = "treat", data = mydat)
# if you wanted to add stats for numeric variable weight, you don't have to do any labelling or value coding,
# just supply it directly as :
# easytable1(numerics = "weight", logicals = "insured", factors = "race", by = "treat", data = mydat)
# Numeric variables that take on integer values (like IQ score) can also be supplied as part of the numerics argument.
#----------> should always have input dataframe and a by variable provided. Notice in the previous examples,
# we provided a by = "treat" and data = mydat regardless of what the input was.
# By variable should only have values 1 and 0 (1 for intervention, 0 for placebo). This function
# does not support presence of more groups.
# ---------> as the last step, have variable labels for input variables applied. For example, if
# disp and race are your input variables, ensure that they have appropriate variable labels applied,
# something like "Hospital Disposition", and "Racial Group". It is important to remember that
# the individual levels of disp and race be labelled before the disp and race themselves are labelled.
# This is demonstrated below.
#-----------------------------------------------------------------------------------------------#
# Assigning variable lables to input variables (Assign just before using easytable1() function)
#-----------------------------------------------------------------------------------------------#
# Demonstration of labelling: order of labelling
# ---------------------------------------------------
# Say your input variables from mydat are disp (factor), race (factor), and hb (numeric)
# Before setting variable labels in your dataset, label the individual labels of the factor variables.
# mydat$disp <- factor(mydat$disp, levels = c("hme", "hhc", "nrsing", "rehb", retain"),
# labels = c("Home", "Home with home care", "Nursing facility", "Rehab", "Retained")
# mydat$race <- factor(mydat$race, levels = c("afam", "an/ai", "asi", white"), labels = c("African-American", "Alaska Native/American Indian",
# "Asian", "White"))
# Now, label the factor variables (and other input variables, if present) themselves. You use the apply_labels() function from
# the expss package. This script will automatically apply the
# mydat <- apply_labels(mydat,
# race = "Race",
# disp = "Disposition",
# hb = "Hemoglobin (g/dL)")
# Notice, you do not have to label variables that you aren't inputting into the easytable1() function.
# For example, I did not label "insured" because I do not to see it in my output table.
#Now, you are ready to use the easytable1() function.
# easytable1(numerics = "hb",
# factors = c("race", "disp"),
# by = "treat",
# data = mydat,
# title = "Table1: Only 3 variables")
#Loading expss package and assigning variable labels to sample dataset (sample dataset factors already have labels applied to levels)
#-------------------------------------------------------------------------------------------------------------------------------------
#library needed
labellibrary <- "expss"
for (l in labellibrary) {
if(!require(l, character.only = TRUE))
install.packages(l, dependencies = TRUE)
library(l,character.only = TRUE)
}
## Loading required package: expss
##
## Use 'expss_output_rnotebook()' to display tables inside R Notebooks.
## To return to the console output, use 'expss_output_default()'.
#this installs and calls expss for user who do not have it already
#You need to use the apply_labels function from this package to have all your input variables
#labelled.
#for example: if mydat is your input dataset
#and you want a table that calculates statistics for variables hb, weight, blood_rcv, insured, race and disp
#all 6 of these these variables MUST be labelled. Other variables that are not being fed into
#easytable1() do not need to be labelled.
#label your variables as demonstrated below:
mydat <- apply_labels(mydat,
hb = "Hemoglobin (g/dL)",
weight = "Weight (lbs)",
blood_rcv = "n (%) received blood",
insured = "n (%) with insurance",
race = "Race",
disp = "Disposition")
# Notice, we did not have to label variables that we were not
# inputting into the function. For example, ID was not being fed into
# the easytable1 function, and so it did not need to be labelled.
#-------------------------------------------------------------------------------------------------------------------
#function begins
easytable1 <- function(numerics, logicals, factors, data, by, title){ #input dataset should be a labelled dataset
#----------------------------------------------------------------------------------------
#blanket warning for missing inputs
if ((missing(factors) & missing(numerics) & missing(logicals))|missing(data)|missing(by)) {
stop("Please provide the name of atleast one input column, a 'by' column, and a dataset")
}
#install required packages
requiredPackages <- c("flextable", "magrittr")
for (p in requiredPackages) {
if(!require(p, character.only = TRUE))
install.packages(p, dependencies = TRUE)
library(p,character.only = TRUE)
}
#-----------------------------------------------------------------------------------------
#make 2 subsets of data using the 'by' input
x <- data[data[[by]] == "1", ] #by represents a column name with quotes. Provided in input.
y <- data[data[[by]] == "0", ]
#save lengths to use later in table labels
num_int <- nrow(x)
num_con <- nrow(y)
#-----------------------------------------------------------------------------------------
#empty dataframe
emptydat <- data.frame( Type = character(),
var = character(),
metric = character())
#-------------------------------------------------------------------------------------
#1st loop: For numerics from x
#------------------------------------------------------------------------------------
if (!(missing(numerics))) { #open if clause - allows numerics to be missing in input
for (i in numerics){ # open numerics loop
#extract the p value
shaptest <- shapiro.test(x[,i])
pvalue <- shaptest[[2]]
#if data is normal
if (pvalue > 0.05) {
mean <- round(mean(x[,i], na.rm = TRUE), 1) #NAs omitted from calculations, in both mean and SD
sd <- round(sd(x[,i], na.rm = TRUE), 1)
meanchar <- formatC(mean, format = "f", digits = 1) #Convert data to character for formatted display
sdchar <- formatC(sd, format = "f", digits = 1)
combine <- paste0(meanchar, " ", "(", sdchar, ")")
#create loop dataframe for current iteration
loopdata <- data.frame(Type = "Numeric", #This shows up in row label (Flextable)
var = var_lab(x[,i]), #This is display label for the variable.
metric = combine)
#add this iteration's data to empty dataset
emptydat <- rbind(emptydat, loopdata)
} #if ends: for normal data
#if data is not normal
else {
#calculate median and IQR
median <- median(x[, i], na.rm = TRUE)#NAs omitted.
quart1 <- round(quantile(x[, i] , 0.25), 1)
quart3 <- round(quantile(x[, i] , 0.75), 1)
#change them to characters, and combine to display with formatting.
medchar <- formatC(median, format = "f", digits = 1)
quart1char <- formatC(quart1, format = "f", digits = 1)
quart3char <- formatC(quart3, format = "f", digits = 1)
combine <- paste0(medchar, " ", "(", quart1char, ", ", quart3char, ")")
#Create the loop dataframe
loopdata <- data.frame(Type = "Numeric", #This shows up in row label (Flextable)
var = var_lab(x[,i]), #This is display label for the variable.
metric = combine)
#add this iteration's data to empty dataset
emptydat <- rbind(emptydat, loopdata)
} #else ends: for non-normal data
} #close numerics loop
} #close if clause to allow missing numerics in input
#-------------------------------------------------------------------------------------
# 2nd loop: For logicals from x
#------------------------------------------------------------------------------------
if (!(missing(logicals))) { #open if clause - allows logicals to be missing in input
for (i in logicals) { #open loop for logicals
count <- sum(x[, i]) #get count for 1s in logical column
table <- table(x[, i], useNA = "no") #prepare a table. Using a sum on this table gets you counts of this logical vector with NAs excluded.
denom <- sum(table) #get the denominator
pct <- round((count*100/denom),digits = 1) #get pctages (rounded to 1 decimal point)
countchar <- formatC(count) #convert the counts and pcts to character, and combine them with appropriate formatting.
pctchar <- formatC(pct, format = "f", digits = 1)
combine <- paste0(countchar, " ", "(", pctchar, " ", "%", ")")
#Create the loop dataframe
loopdata <- data.frame(Type = "Logical", #This shows up in row label (Flextable)
var = var_lab(x[,i]), #This is display label for the variable.
metric = combine)
#Bind this iteration of logical data to the skeleton data. Note: It should already have the values for the numeric variables.
emptydat <- rbind(emptydat, loopdata)
} #close logicals loop
} #close if clause to allow missing logicals in input
#-------------------------------------------------------------------------------------
# 3rd loop: For factor from x
#------------------------------------------------------------------------------------
if (!(missing(factors))) { #open if clause - allows factors to be missing in input
for (i in factors) {
table_fac <- table(x[ , i], useNA = "no") #create a table of factor variable currently in loop. Make sure missing values are not counted in denominator.
nn <- names(table_fac) #Store the names of the levels of factor in nn. They should have already been labelled.
list_fac <- lapply(table_fac, function(r) list(FREQ = r,
PCTnoNA = 100*r/sum(table_fac)))
# r just accesses frequencies from table
# look closely at the PCTnoNA denominator
# it is the sum(table_fac)
# This is the sum of the counts of the factors
# listed out in the table. Since we excluded
# missing values from the table using useNA = "no"
# the denominator is calculated excluding NAs.
unlist_fac <- unlist(list_fac) #unlist to break up into components that can be arranged in a matrix
fac_matrix <- t(matrix(unlist_fac, nrow = 2, byrow = FALSE)) #create the matrix
rownames(fac_matrix) <- nn #apply names of factors (from table) to matrix
fac_df <- data.frame(fac_matrix)
#Manipulate dataframe to hold counts and percentages. Look into making code cleaner.
fac_df$freqchar <- formatC(fac_df$X1)
fac_df$pctchar <- formatC(fac_df$X2, format = "f", digits = 1)
fac_df$combine <- paste0(fac_df$freqchar, " ", "(", fac_df$pctchar, " ", "%", ")")
#keep only column which is called combine.
fac_df <- data.frame(fac_df[, which(names(fac_df) == "combine")])
#This subsetting changes column name, unfortunately. So rename it.
names(fac_df) <- "metric"
fac_df$var <- nn # Factor labels applied here
fac_df$Type <- paste0("Categorical -", var_lab(data[,i])) # Variable Label applied here. It is combined with a string that calls it Categorical.
#reorder #This is done to ensure that columns are in the same order as empty dataset. Fcilitates rbind.
order <- c("Type", "var", "metric")
fac_df <- fac_df[, order]
#add to empty dataset
emptydat <- rbind(emptydat, fac_df)
} #close factor loop
} #close if clause - allows factors to be missing in input
#Save intervention data:
intervention <- emptydat
#-----------------------------------------------------------------------------------------
#******************************************************************************************
#Group 2: Working with the control group (y)
#******************************************************************************************
#Repeat all the above steps, but make sure to change:
# x to y (calculate stats only on the subset of data that has treatment code = 0)
# Do not drop Type and var. We will drop it at the end. Idea is to avoid changing code.
# Drop Type and var. These already exist in intervention, do not have to repeated here, will be the same for the two groups.
#No NEED TO CHANGE:
# emptydat_X to emptydat_Y.
# metric to metric0 , to reflect that these are being calculated for treatment code = 0
# Add suffix _y to all objects within function to distinguish these objects between 2 groups.
#BECAUSE RESULTS FROM FIRST DATASET ARE ALREADY SAVED IN INTERVENTION
#empty dataframe
emptydat <- data.frame( Type = character(),
var = character(),
metric = character())
#-------------------------------------------------------------------------------------
#1st loop: For numerics from y
#------------------------------------------------------------------------------------
if (!(missing(numerics))) { #open if clause - allows numerics to be missing in input
for (i in numerics){ # open numerics loop
#extract the p value
shaptest <- shapiro.test(y[,i])
pvalue <- shaptest[[2]]
#if data is normal
if (pvalue > 0.05) {
mean <- round(mean(y[,i], na.rm = TRUE), 1) #NAs omitted from calculations, in both mean and SD
sd <- round(sd(y[,i], na.rm = TRUE), 1)
meanchar <- formatC(mean, format = "f", digits = 1) #Convert data to character for formatted display
sdchar <- formatC(sd, format = "f", digits = 1)
combine <- paste0(meanchar, " ", "(", sdchar, ")")
#create loop dataframe for current iteration
loopdata <- data.frame(Type = "Numeric", #This shows up in row label (Flextable)
var = var_lab(y[,i]), #This is display label for the variable.
metric = combine)
#add this iteration's data to empty dataset
emptydat <- rbind(emptydat, loopdata)
} #if ends: for normal data
#if data is not normal
else {
#calculate median and IQR
median <- median(y[, i], na.rm = TRUE)#NAs omitted.
quart1 <- round(quantile(data[, i] , 0.25), 1)
quart3 <- round(quantile(data[, i] , 0.75), 1)
#change them to characters, and combine to display with formatting.
medchar <- formatC(median, format = "f", digits = 1)
quart1char <- formatC(quart1, format = "f", digits = 1)
quart3char <- formatC(quart3, format = "f", digits = 1)
combine <- paste0(medchar, " ", "(", quart1char, ", ", quart3char, ")")
#Create the loop dataframe
loopdata <- data.frame(Type = "Numeric", #This shows up in row label (Flextable)
var = var_lab(y[,i]), #This is display label for the variable.
metric = combine)
#add this iteration's data to empty dataset
emptydat <- rbind(emptydat, loopdata)
} #else ends: for non-normal data
} #close numerics loop
} #close if clause to allow missing numerics in input
#-------------------------------------------------------------------------------------
# 2nd loop: For logicals
#------------------------------------------------------------------------------------
if (!(missing(logicals))) { #open if clause - allows logicals to be missing in input
for (i in logicals) { #open loop for logicals
count <- sum(y[, i]) #get count for 1s in logical column
table <- table(y[, i], useNA = "no") #prepare a table. Using a sum on this table gets you counts of this logical vector with NAs excluded.
denom <- sum(table) #get the denominator
pct <- round((count*100/denom),digits = 1) #get pctages (rounded to 1 decimal point)
countchar <- formatC(count) #convert the counts and pcts to character, and combine them with appropriate formatting.
pctchar <- formatC(pct, format = "f", digits = 1)
combine <- paste0(countchar, " ", "(", pctchar, " ", "%", ")")
#Create the loop dataframe
loopdata <- data.frame(Type = "Logical", #This shows up in row label (Flextable)
var = var_lab(y[,i]), #This is display label for the variable.
metric = combine)
#Bind this iteration of logical data to the skeleton data. Note: It should already have the values for the numeric variables.
emptydat <- rbind(emptydat, loopdata)
} #close logicals loop
} #close if clause to allow missing logicals in input
#-------------------------------------------------------------------------------------
# 3rd loop: For factor
#------------------------------------------------------------------------------------
if (!(missing(factors))) { #open if clause - allows factors to be missing in input
for (i in factors) {
table_fac <- table(y[ , i], useNA = "no") #create a table of factor variable currently in loop. Make sure missing values are not counted in denominator.
nn <- names(table_fac) #Store the names of the levels of factor in nn. They should have already been labelled.
list_fac <- lapply(table_fac, function(r) list(FREQ = r,
PCTnoNA = 100*r/sum(table_fac)))
# r just accesses frequencies from table
# look closely at the PCTnoNA denominator
# it is the sum(table_fac)
# This is the sum of the counts of the factors
# listed out in the table. Since we excluded
# missing values from the table using useNA = "no"
# the denominator is calculated excluding NAs.
unlist_fac <- unlist(list_fac) #unlist to break up into components that can be arranged in a matrix
fac_matrix <- t(matrix(unlist_fac, nrow = 2, byrow = FALSE)) #create the matrix
rownames(fac_matrix) <- nn #apply names of factors (from table) to matrix
fac_df <- data.frame(fac_matrix)
#Manipulate dataframe to hold counts and percentages. Look into making code cleaner.
fac_df$freqchar <- formatC(fac_df$X1)
fac_df$pctchar <- formatC(fac_df$X2, format = "f", digits = 1)
fac_df$combine <- paste0(fac_df$freqchar, " ", "(", fac_df$pctchar, " ", "%", ")")
#keep only column which is called combine.
fac_df <- data.frame(fac_df[, which(names(fac_df) == "combine")])
#This subsetting changes column name, unfortunately. So rename it.
names(fac_df) <- "metric"
fac_df$var <- nn # Factor labels applied here
fac_df$Type <- paste0("Categorical -", var_lab(y[,i])) # Variable Label applied here. It is combined with a string that calls it Categorical.
#reorder #This is done to ensure that columns are in the same order as empty dataset. Fcilitates rbind.
order <- c("Type", "var", "metric")
fac_df <- fac_df[, order]
#add to empty dataset
emptydat <- rbind(emptydat, fac_df)
} #close factor loop
} #close if clause - allows factors to be missing in input
#Save control data:
control <- emptydat
#drop Type and var from control (These contain exactly same info as 1st 2 columns in intervention)
control <- control[, -c(1,2)]
#-----------------------------------------------------------------------------------------
#Bind both datasets:
out <- cbind(intervention, control)
names(out)[2] <- "Characteristic"
names(out)[3] <- paste0("Intervention\nN = ", nrow(x))
names(out)[4] <- paste0("Placebo\nN = ", nrow(y))
#--------------------------------------------------------------------------------------------
#return(out) #If you uncomment this, all subsequent lines dont run in function, and the output of function is a dataset
#Create grouped dataset
mygroupeddata <- as_grouped_data( x= out,
groups = c("Type"))
#return(mygroupeddata) #If you uncomment this, all subsequent lines dont run in function, and the output of function is a grouped dataset
#Create flextable
myflextable <- as_flextable( mygroupeddata ) %>%
bold(j = 1, i = ~ !is.na(Type), bold = TRUE, part = "body" ) %>%
bold(part = "header", bold = TRUE ) %>% autofit()
if (!(missing(title))) {
myflextable <- set_caption(myflextable, title) %>% autofit()
}
#end function: this is the only output
return(myflextable)
} #function ends
#----------------------------------------------------------------------------------------------
#Testing function on sample dataset.
#out <- easytable1(numerics = c("hb", "weight"),
# logicals = c("insured", "blood_rcv"),
# factors = c("race"),
# data = mydat,
# by = "treat")
#out