# Data management
library(tidyverse) # data management & other
library(tidyr) # organize tabular data
library(janitor) # data cleaning like clean_names
library(data.table) # rbindlist function "makes one data.table from a list of many"
# Data summary
library(psych) # numeric data summary
library(crosstable) # cross-tabulation
library(summarytools) # data summary tools
library(epitools) # epidemiology tools
library(ggsci) # themes for plots
# Data analysis
library(Hmisc) # many functions useful for analysis, graphics, computing sample size, simulation, importing and annotating, imputation
library(stats) # statistical tests
library(pROC) # ROC analysis
# Power
library(pwr) # Power calculations
library(WebPower) # basic and advanced statistical power analysis
library(rpact) # adaptive trial design
# Data sets
library(medicaldata)
Data Summary in R
Run Date: 2024-10-09
Performing data summary involves using various methods to explore and understand data sets. In Base R, we could use functions like summary(), str(), and head() to provide quick insights into the data and key statistics. We could use other packages like psych and crosstable to create nicer summary tables. Additionally, packages like dplyr and tidyverse enable the analyst to clean and manage data sets. Visualization is a crucial part of summarizing and understanding the data. The ggplot2 function allow us to create different types of figures. Together, these techniques (tables, figures, listings …etc) form a comprehensive approach to the data analysis.
1 Library
Load some useful libraries for the data management and data analysis.
Use results=‘hide’ in the chunk options to hide the results and message=FALSE to suppress messages like this … {r lib, results=‘hide’, message=FALSE}
2 Summary
To summarize numeric data, we could use the describeBy function of the psych package. To create a summary table by group, we could use the crosstable function of the crosstable package
# let's check what data are available to us
data()
# load the Indomethacin RCT data
data(indo_rct)
colnames(indo_rct)
[1] "id" "site" "age" "risk" "gender"
[6] "outcome" "sod" "pep" "recpanc" "psphinc"
[11] "precut" "difcan" "pneudil" "amp" "paninj"
[16] "acinar" "brush" "asa81" "asa325" "asa"
[21] "prophystent" "therastent" "pdstent" "sodsom" "bsphinc"
[26] "bstent" "chole" "pbmal" "train" "status"
[31] "type" "rx" "bleed"
# let's summarize age and risk by gender
describeBy( age + risk ~ gender , data = indo_rct)
Descriptive statistics by group
gender: 1_female
vars n mean sd median trimmed mad min max range skew kurtosis
age 1 476 45.08 13.01 45.0 44.72 14.08 19 90.0 71.0 0.23 -0.43
risk 2 476 2.44 0.87 2.5 2.41 0.74 1 5.5 4.5 0.38 -0.05
se
age 0.60
risk 0.04
------------------------------------------------------------
gender: 2_male
vars n mean sd median trimmed mad min max range skew kurtosis
age 1 126 45.99 14.37 46 45.87 17.79 19 76.0 57.0 0.08 -0.84
risk 2 126 2.14 0.89 2 2.06 0.74 1 4.5 3.5 0.64 0.00
se
age 1.28
risk 0.08
# we could also create a data frame for the results
= describeBy( age + risk ~ gender , data = indo_rct,mat = T) res
# Now let's create a summary table by treatment group
crosstable( data = indo_rct ,
cols = c(age,risk,gender,site,sod,pep,bleed,outcome),
by = rx,
total = "both",
percent_pattern = "{n}/{n_col} ({p_col})" ,
percent_digits = 1,
num_digits = 1,
showNA = "always",
unique_numeric = 5
)
# A tibble: 34 × 7
.id label variable `0_placebo` `1_indomethacin` `NA` Total
<chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 age age Min / Max 19.0 / 90.0 19.0 / 80.0 no NA 19.0 / …
2 age age Med [IQR] 46.0 [36.0;55.0] 44.0 [33.0;54.0] no NA 45.0 [3…
3 age age Mean (std) 46.0 (13.1) 44.5 (13.5) no NA 45.3 (1…
4 age age N (NA) 307 (0) 295 (0) no NA 602 (0)
5 risk risk score Min / Max 1.0 / 4.5 1.0 / 5.5 no NA 1.0 / 5…
6 risk risk score Med [IQR] 2.5 [1.5;3.0] 2.5 [2.0;3.0] no NA 2.5 [1.…
7 risk risk score Mean (std) 2.3 (0.9) 2.4 (0.9) no NA 2.4 (0.…
8 risk risk score N (NA) 307 (0) 295 (0) no NA 602 (0)
9 gender gender 1_female 247/307 (80.5%) 229/295 (77.6%) 0 476 (79…
10 gender gender 2_male 60/307 (19.5%) 66/295 (22.4%) 0 126 (20…
# ℹ 24 more rows
3 Plots
This is an example for a bar chart summarizing toxicity proportions (four types) by BMI (three levels)
# BMI plot data: wt is the BMI category, name is toxicity, p is the proportion
= data.frame(
wt.data wt = c(rep("Below 18",4), rep("18-29",4) , rep("30 or More",4) ) ,
name = rep( c("adrenal","diab", "hyper","hypo" ),3 ) ,
p = c(0.05,0.07,0.12,0.32,0,0.01,0.09,0.25,0.03,0.07,0.11,0.35))
head( wt.data)
wt name p
1 Below 18 adrenal 0.05
2 Below 18 diab 0.07
3 Below 18 hyper 0.12
4 Below 18 hypo 0.32
5 18-29 adrenal 0.00
6 18-29 diab 0.01
# Prepare the data for ggplot2
= wt.data %>%
bmiplotdata
# create tox.order, a column for ordering toxicity categories in the plot
mutate (tox.order = case_when(
=="hypo"~1 ,
name =="hyper"~2 ,
name =="adrenal"~3 ,
name =="diab"~4 )) %>%
name
mutate (name = case_when(
=="hypo"~"Hypotension" ,
name =="hyper"~"Hypertention" ,
name =="adrenal"~"Adrenal" ,
name =="diab"~"Diabetes" )) %>%
name
mutate( percent_true = round(p*100,1) ,
label.col = paste0(round(percent_true,0),"%"),
percent = percent_true+0.1 , # for aesthetics, if you have 0% it will show a thin line
wt.ord = case_when(wt=="30 or More"~3,wt=="Below 18"~1,TRUE ~2) # a column for ordering BMI categories in the plot
)
bmiplotdata
wt name p tox.order percent_true label.col percent wt.ord
1 Below 18 Adrenal 0.05 3 5 5% 5.1 1
2 Below 18 Diabetes 0.07 4 7 7% 7.1 1
3 Below 18 Hypertention 0.12 2 12 12% 12.1 1
4 Below 18 Hypotension 0.32 1 32 32% 32.1 1
5 18-29 Adrenal 0.00 3 0 0% 0.1 2
6 18-29 Diabetes 0.01 4 1 1% 1.1 2
7 18-29 Hypertention 0.09 2 9 9% 9.1 2
8 18-29 Hypotension 0.25 1 25 25% 25.1 2
9 30 or More Adrenal 0.03 3 3 3% 3.1 3
10 30 or More Diabetes 0.07 4 7 7% 7.1 3
11 30 or More Hypertention 0.11 2 11 11% 11.1 3
12 30 or More Hypotension 0.35 1 35 35% 35.1 3
# Create the bar plot using the ggplot function
= ggplot( bmiplotdata,
wtplot aes(x = reorder(name,tox.order) ,
y = percent ,
fill = reorder(wt,wt.ord))) +
# geom_bar: create a bar chart
geom_bar(stat = "identity" ,
position = "dodge" ,
width = 0.75) +
# geom_text
geom_text(aes(
label = label.col),
vjust = -0.5,
position = position_dodge(.75),
color="black",
size=3,
fontface = "plain",
family = "Fira Sans") +
# scale_y_continuous
scale_y_continuous(breaks = seq(0, 40, by = 5)) +
ylab ("Percentage of patients")+
xlab(element_blank())+
# theme
theme(
axis.title.x = element_text(size = 12,color = "black", face = "bold" ,vjust=1) ,
axis.title.y = element_text(size = 12,color = "black", face = "bold",vjust = 3) ,
axis.text.x = element_text(size = 11,color = "black",face = "bold") ,
axis.text.y = element_text(size = 11,color = "black") ,
legend.text = element_text(size = 10,color = "black",face = "bold") ,
legend.title = element_text(size = 10,color = "black",face = "bold") ,
legend.position = "bottom" )+
scale_fill_jama(name="BMI")
wtplot
Here, I create two time-series plots for male and female populations. Lines represent mortality rates (per 100,000) attributed to two risk factors. The time frame is 2016 to 2020.
= data.frame(
ts.plot.data sex_name = c( rep("Male",2), rep("Female",3) ),
risk = c(rep("High BMI",2), rep("Hyperglycemia",3)),
year = c(seq(2016,2020)) ) %>% # create rows for all possible combination
complete(sex_name, risk , year) %>%
arrange(sex_name, risk , year)
# let's create some values (rates)
$val = c(10,11,11.5,11.3,12,
ts.plot.data12,12.5,13,13,13.5,
15,16,17.5,18,19,
13,14,13.5,14,15)
=
ts
ggplot( ts.plot.data , aes(x = year , y = val, group = risk , fill=risk )) +
geom_line(aes(color = risk),linewidth=1.3)+
facet_wrap(~ sex_name , ncol=2)+
scale_y_continuous(name="Rate",breaks=seq(0,20,5),limits = c(0,20) )+
scale_x_continuous(name="Year",breaks=seq(2016,2020,1) )+
guides(color = guide_legend(title = "Risk"))+
theme(legend.position = "bottom" ,
legend.justification = "center",
legend.text = element_text(size = 11,color = "black",face = "bold") ,
axis.text = element_text(size = 11,color = "black",face = "bold") ,
panel.border = element_rect(colour = "black",fill = NA) ,
legend.title = element_text(size = 11,color = "black",face = "bold"),
strip.text = element_text(size = 11,color = "black",face = "bold"),
panel.spacing.x = unit(1, "lines"))+
scale_color_jama()
ts