Data Summary in R

Run Date: 2024-10-09

Performing data summary involves using various methods to explore and understand data sets. In Base R, we could use functions like summary(), str(), and head() to provide quick insights into the data and key statistics. We could use other packages like psych and crosstable to create nicer summary tables. Additionally, packages like dplyr and tidyverse enable the analyst to clean and manage data sets. Visualization is a crucial part of summarizing and understanding the data. The ggplot2 function allow us to create different types of figures. Together, these techniques (tables, figures, listings …etc) form a comprehensive approach to the data analysis.

1 Library

Load some useful libraries for the data management and data analysis.
Use results=‘hide’ in the chunk options to hide the results and message=FALSE to suppress messages like this … {r lib, results=‘hide’, message=FALSE}

# Data management 
library(tidyverse) # data management & other 
library(tidyr) # organize tabular data 
library(janitor)  # data cleaning like clean_names 
library(data.table) # rbindlist function "makes one data.table from a list of many"


# Data summary 
library(psych) # numeric data summary 
library(crosstable) # cross-tabulation 
library(summarytools)  # data summary tools 
library(epitools)  # epidemiology tools 
library(ggsci) # themes for plots 


# Data analysis 
library(Hmisc)  #  many functions useful for  analysis, graphics, computing sample size, simulation, importing and annotating, imputation 
library(stats) # statistical tests 
library(pROC) # ROC analysis 

# Power 
library(pwr) # Power calculations  
library(WebPower) # basic and advanced statistical power analysis 
library(rpact) # adaptive trial design 


# Data sets 
library(medicaldata)

2 Summary

To summarize numeric data, we could use the describeBy function of the psych package. To create a summary table by group, we could use the crosstable function of the crosstable package

# let's check what data are available to us 
data()

# load the Indomethacin RCT data
data(indo_rct)
colnames(indo_rct)

 [1] "id"          "site"        "age"         "risk"        "gender"     
 [6] "outcome"     "sod"         "pep"         "recpanc"     "psphinc"    
[11] "precut"      "difcan"      "pneudil"     "amp"         "paninj"     
[16] "acinar"      "brush"       "asa81"       "asa325"      "asa"        
[21] "prophystent" "therastent"  "pdstent"     "sodsom"      "bsphinc"    
[26] "bstent"      "chole"       "pbmal"       "train"       "status"     
[31] "type"        "rx"          "bleed"

# let's summarize age and risk by gender 
describeBy( age + risk ~  gender , data = indo_rct)


 Descriptive statistics by group 
gender: 1_female
     vars   n  mean    sd median trimmed   mad min  max range skew kurtosis
age     1 476 45.08 13.01   45.0   44.72 14.08  19 90.0  71.0 0.23    -0.43
risk    2 476  2.44  0.87    2.5    2.41  0.74   1  5.5   4.5 0.38    -0.05
       se
age  0.60
risk 0.04
------------------------------------------------------------ 
gender: 2_male
     vars   n  mean    sd median trimmed   mad min  max range skew kurtosis
age     1 126 45.99 14.37     46   45.87 17.79  19 76.0  57.0 0.08    -0.84
risk    2 126  2.14  0.89      2    2.06  0.74   1  4.5   3.5 0.64     0.00
       se
age  1.28
risk 0.08

# we could also create a data frame for the results 
res = describeBy( age + risk ~  gender , data = indo_rct,mat = T)

# Now let's create a summary table by treatment group
crosstable( data =  indo_rct , 
            cols = c(age,risk,gender,site,sod,pep,bleed,outcome),
            by = rx,
            total = "both",
            percent_pattern = "{n}/{n_col} ({p_col})" ,
            percent_digits = 1,
            num_digits = 1,
            showNA = "always",
            unique_numeric = 5
)

# A tibble: 34 × 7
   .id    label      variable   `0_placebo`      `1_indomethacin` `NA`  Total   
   <chr>  <chr>      <chr>      <chr>            <chr>            <chr> <chr>   
 1 age    age        Min / Max  19.0 / 90.0      19.0 / 80.0      no NA 19.0 / …
 2 age    age        Med [IQR]  46.0 [36.0;55.0] 44.0 [33.0;54.0] no NA 45.0 [3…
 3 age    age        Mean (std) 46.0 (13.1)      44.5 (13.5)      no NA 45.3 (1…
 4 age    age        N (NA)     307 (0)          295 (0)          no NA 602 (0) 
 5 risk   risk score Min / Max  1.0 / 4.5        1.0 / 5.5        no NA 1.0 / 5…
 6 risk   risk score Med [IQR]  2.5 [1.5;3.0]    2.5 [2.0;3.0]    no NA 2.5 [1.…
 7 risk   risk score Mean (std) 2.3 (0.9)        2.4 (0.9)        no NA 2.4 (0.…
 8 risk   risk score N (NA)     307 (0)          295 (0)          no NA 602 (0) 
 9 gender gender     1_female   247/307 (80.5%)  229/295 (77.6%)  0     476 (79…
10 gender gender     2_male     60/307 (19.5%)   66/295 (22.4%)   0     126 (20…
# ℹ 24 more rows

3 Plots

This is an example for a bar chart summarizing toxicity proportions (four types) by BMI (three levels)

# BMI plot data: wt is the BMI category, name is toxicity, p is the proportion 

wt.data = data.frame(  
  wt = c(rep("Below 18",4), rep("18-29",4) , rep("30 or More",4)  ) , 
  name = rep( c("adrenal","diab", "hyper","hypo" ),3 ) , 
  p =  c(0.05,0.07,0.12,0.32,0,0.01,0.09,0.25,0.03,0.07,0.11,0.35)) 


head( wt.data)

        wt    name    p
1 Below 18 adrenal 0.05
2 Below 18    diab 0.07
3 Below 18   hyper 0.12
4 Below 18    hypo 0.32
5    18-29 adrenal 0.00
6    18-29    diab 0.01

# Prepare the data for ggplot2 
bmiplotdata = wt.data %>% 
  
  # create tox.order, a column for ordering toxicity categories in the plot
  mutate (tox.order = case_when( 
    name =="hypo"~1 ,
    name =="hyper"~2 , 
    name =="adrenal"~3 , 
    name =="diab"~4 )) %>%
  
  mutate (name = case_when( 
    name =="hypo"~"Hypotension" ,
    name =="hyper"~"Hypertention" , 
    name =="adrenal"~"Adrenal" , 
    name =="diab"~"Diabetes" ))  %>% 
  
  
  mutate( percent_true = round(p*100,1) , 
          label.col = paste0(round(percent_true,0),"%"),
          percent = percent_true+0.1 , # for aesthetics, if you have 0% it will show a thin line 
          wt.ord = case_when(wt=="30 or More"~3,wt=="Below 18"~1,TRUE ~2) # a column for ordering BMI categories in the plot 
          ) 

bmiplotdata

           wt         name    p tox.order percent_true label.col percent wt.ord
1    Below 18      Adrenal 0.05         3            5        5%     5.1      1
2    Below 18     Diabetes 0.07         4            7        7%     7.1      1
3    Below 18 Hypertention 0.12         2           12       12%    12.1      1
4    Below 18  Hypotension 0.32         1           32       32%    32.1      1
5       18-29      Adrenal 0.00         3            0        0%     0.1      2
6       18-29     Diabetes 0.01         4            1        1%     1.1      2
7       18-29 Hypertention 0.09         2            9        9%     9.1      2
8       18-29  Hypotension 0.25         1           25       25%    25.1      2
9  30 or More      Adrenal 0.03         3            3        3%     3.1      3
10 30 or More     Diabetes 0.07         4            7        7%     7.1      3
11 30 or More Hypertention 0.11         2           11       11%    11.1      3
12 30 or More  Hypotension 0.35         1           35       35%    35.1      3

# Create the bar plot using the ggplot function 
wtplot = ggplot(  bmiplotdata, 
                  aes(x = reorder(name,tox.order)  , 
                      y = percent  , 
                      fill = reorder(wt,wt.ord))) + 
  
  # geom_bar: create a bar chart 
  geom_bar(stat = "identity" , 
           position = "dodge" ,
           width = 0.75) +
  
  
  # geom_text 
  geom_text(aes(
    label = label.col), 
    vjust = -0.5,
    position = position_dodge(.75),
    color="black",
    size=3,
    fontface = "plain", 
    family = "Fira Sans") +
  
  
  # scale_y_continuous 
  scale_y_continuous(breaks = seq(0, 40, by = 5)) +
  ylab ("Percentage of patients")+
  xlab(element_blank())+ 
  
  
  # theme 
  theme(
    axis.title.x = element_text(size = 12,color = "black",  face = "bold" ,vjust=1)  ,
    axis.title.y = element_text(size = 12,color = "black",  face = "bold",vjust = 3) ,
    
    axis.text.x = element_text(size = 11,color = "black",face = "bold") ,
    axis.text.y = element_text(size = 11,color = "black") ,
    
    legend.text = element_text(size = 10,color = "black",face = "bold") ,
    legend.title = element_text(size = 10,color = "black",face = "bold")    ,
    legend.position = "bottom"   )+

    
    scale_fill_jama(name="BMI") 


wtplot

Here, I create two time-series plots for male and female populations. Lines represent mortality rates (per 100,000) attributed to two risk factors. The time frame is 2016 to 2020.

ts.plot.data = data.frame(
  sex_name = c( rep("Male",2), rep("Female",3) ), 
   risk = c(rep("High BMI",2), rep("Hyperglycemia",3)),
  year = c(seq(2016,2020)) ) %>% # create rows for all possible combination 
  complete(sex_name, risk , year)  %>% 
  arrange(sex_name, risk , year)  

# let's create some values (rates) 
ts.plot.data$val = c(10,11,11.5,11.3,12,
                     12,12.5,13,13,13.5,
                     15,16,17.5,18,19,
                     13,14,13.5,14,15)

  
ts =   
  
  ggplot( ts.plot.data , aes(x = year , y = val, group = risk , fill=risk )) +
  geom_line(aes(color = risk),linewidth=1.3)+
  facet_wrap(~ sex_name , ncol=2)+
  
 
  scale_y_continuous(name="Rate",breaks=seq(0,20,5),limits = c(0,20)  )+
  scale_x_continuous(name="Year",breaks=seq(2016,2020,1)  )+
  
  guides(color = guide_legend(title = "Risk"))+
 
  
  theme(legend.position = "bottom" ,
        legend.justification = "center",
        legend.text = element_text(size = 11,color = "black",face = "bold") ,
        axis.text  = element_text(size = 11,color = "black",face = "bold") ,
        panel.border = element_rect(colour = "black",fill = NA) ,
        legend.title = element_text(size = 11,color = "black",face = "bold"),
        strip.text = element_text(size = 11,color = "black",face = "bold"),
        panel.spacing.x = unit(1, "lines"))+
  
  scale_color_jama()


ts