Module 12 Advanced Summary and Data Visualization

required library

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(readr)
library(readxl)
library(haven)

Data

Data and data shummary

data(mtcars)
head(mtcars)

##                    mpg cyl disp  hp drat    wt  qsec vs am gear carb
## Mazda RX4         21.0   6  160 110 3.90 2.620 16.46  0  1    4    4
## Mazda RX4 Wag     21.0   6  160 110 3.90 2.875 17.02  0  1    4    4
## Datsun 710        22.8   4  108  93 3.85 2.320 18.61  1  1    4    1
## Hornet 4 Drive    21.4   6  258 110 3.08 3.215 19.44  1  0    3    1
## Hornet Sportabout 18.7   8  360 175 3.15 3.440 17.02  0  0    3    2
## Valiant           18.1   6  225 105 2.76 3.460 20.22  1  0    3    1

Basic Summary Stats

summary(mtcars)

##       mpg             cyl             disp             hp       
##  Min.   :10.40   Min.   :4.000   Min.   : 71.1   Min.   : 52.0  
##  1st Qu.:15.43   1st Qu.:4.000   1st Qu.:120.8   1st Qu.: 96.5  
##  Median :19.20   Median :6.000   Median :196.3   Median :123.0  
##  Mean   :20.09   Mean   :6.188   Mean   :230.7   Mean   :146.7  
##  3rd Qu.:22.80   3rd Qu.:8.000   3rd Qu.:326.0   3rd Qu.:180.0  
##  Max.   :33.90   Max.   :8.000   Max.   :472.0   Max.   :335.0  
##       drat             wt             qsec             vs        
##  Min.   :2.760   Min.   :1.513   Min.   :14.50   Min.   :0.0000  
##  1st Qu.:3.080   1st Qu.:2.581   1st Qu.:16.89   1st Qu.:0.0000  
##  Median :3.695   Median :3.325   Median :17.71   Median :0.0000  
##  Mean   :3.597   Mean   :3.217   Mean   :17.85   Mean   :0.4375  
##  3rd Qu.:3.920   3rd Qu.:3.610   3rd Qu.:18.90   3rd Qu.:1.0000  
##  Max.   :4.930   Max.   :5.424   Max.   :22.90   Max.   :1.0000  
##        am              gear            carb      
##  Min.   :0.0000   Min.   :3.000   Min.   :1.000  
##  1st Qu.:0.0000   1st Qu.:3.000   1st Qu.:2.000  
##  Median :0.0000   Median :4.000   Median :2.000  
##  Mean   :0.4062   Mean   :3.688   Mean   :2.812  
##  3rd Qu.:1.0000   3rd Qu.:4.000   3rd Qu.:4.000  
##  Max.   :1.0000   Max.   :5.000   Max.   :8.000

Skimr for more details of data summary

# install.packages("skimr")
library(skimr)

## Warning: package 'skimr' was built under R version 4.4.3

skim(mtcars)

Data summary
Name	mtcars
Number of rows	32
Number of columns	11
_______________________
Column type frequency:
numeric	11
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
mpg	1	20.09	6.03	10.40	15.43	19.20	22.80	33.90	▃▇▅▁▂
cyl	1	6.19	1.79	4.00	4.00	6.00	8.00	8.00	▆▁▃▁▇
disp	1	230.72	123.94	71.10	120.83	196.30	326.00	472.00	▇▃▃▃▂
hp	1	146.69	68.56	52.00	96.50	123.00	180.00	335.00	▇▇▆▃▁
drat	1	3.60	0.53	2.76	3.08	3.70	3.92	4.93	▇▃▇▅▁
wt	1	3.22	0.98	1.51	2.58	3.33	3.61	5.42	▃▃▇▁▂
qsec	1	17.85	1.79	14.50	16.89	17.71	18.90	22.90	▃▇▇▂▁
vs	1	0.44	0.50	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▆
am	1	0.41	0.50	0.00	0.00	0.00	1.00	1.00	▇▁▁▁▆
gear	1	3.69	0.74	3.00	3.00	4.00	4.00	5.00	▇▁▆▁▂
carb	1	2.81	1.62	1.00	2.00	2.00	4.00	8.00	▇▂▅▁▁

Group-wise Summaries: dplyr

library(dplyr)

mtcars %>%
  group_by(cyl) %>%
  summarise(across(mpg:hp, list(mean = mean, sd = sd)))

## # A tibble: 3 × 7
##     cyl mpg_mean mpg_sd disp_mean disp_sd hp_mean hp_sd
##   <dbl>    <dbl>  <dbl>     <dbl>   <dbl>   <dbl> <dbl>
## 1     4     26.7   4.51      105.    26.9    82.6  20.9
## 2     6     19.7   1.45      183.    41.6   122.   24.3
## 3     8     15.1   2.56      353.    67.8   209.   51.0

Group-wise Summaries: dplyr

library(dplyr)

mtcars %>%
  group_by(cyl, carb) %>%
  summarise(across(mpg:hp, list(mean = mean, sd = sd)))

## `summarise()` has grouped output by 'cyl'. You can override using the `.groups`
## argument.

## # A tibble: 9 × 8
## # Groups:   cyl [3]
##     cyl  carb mpg_mean mpg_sd disp_mean disp_sd hp_mean hp_sd
##   <dbl> <dbl>    <dbl>  <dbl>     <dbl>   <dbl>   <dbl> <dbl>
## 1     4     1     27.6   5.55      91.4   21.4     77.4 16.1 
## 2     4     2     25.9   3.81     117.    27.1     87   24.9 
## 3     6     1     19.8   2.33     242.    23.3    108.   3.54
## 4     6     4     19.8   1.55     164.     4.39   116.   7.51
## 5     6     6     19.7  NA        145     NA      175   NA   
## 6     8     2     17.2   2.09     346.    43.4    162.  14.4 
## 7     8     3     16.3   1.05     276.     0      180    0   
## 8     8     4     13.2   2.28     406.    57.8    234   21.7 
## 9     8     8     15    NA        301     NA      335   NA

Basic Visualization: Base R

hist(mtcars$mpg, main="MPG Distribution")

Using ggplot2: Intro

library(ggplot2)

ggplot(mtcars, aes(x=mpg)) +
  geom_histogram(bins=10, fill="skyblue", color="black") + theme_dark()

Boxplots for Group Comparisons

ggplot(mtcars, aes(x=factor(cyl), y=mpg)) +
  geom_boxplot(fill="steelblue") +
  labs(x="Cylinders", y="Miles Per Gallon") + theme_bw()

Scatter Plot with Trend Line

ggplot(mtcars, aes(x=wt, y=mpg)) +
  geom_point() +
  geom_smooth(method="lm") + theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

Density Plots

ggplot(mtcars, aes(x=mpg, fill=factor(cyl))) +
  geom_density(alpha=0.5) + theme_classic()

Bar plot

ggplot(mtcars, aes(x=factor(gear))) +
  geom_bar(fill="darkblue") + theme_light()

Violin Plots

ggplot(mtcars, aes(x=factor(cyl), y=mpg)) +
  geom_violin(fill="lightgreen") + theme_classic()

Faceting: Multi-panel Plots

ggplot(mtcars, aes(x=mpg)) +
  geom_histogram(fill="blue") +
  facet_wrap(~cyl) + theme_classic()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Themes and Customizations

ggplot(mtcars, aes(x=wt, y=mpg)) +
  geom_point() +
  theme_minimal()

Interactive Plots: plotly

library(plotly)

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

p <- ggplot(mtcars, aes(x=wt, y=mpg, color = factor(cyl))) + geom_point() + theme_classic()
ggplotly(p)

Linear Model Forest Plot

# Linear Regression Example
data(mtcars)
model_lm <- lm(mpg ~ wt + hp + cyl, data = mtcars)

# Extract coefficients and confidence intervals
coef_df <- as.data.frame(summary(model_lm)$coefficients)
coef_df$term <- rownames(coef_df)
confint_df <- confint(model_lm)
coef_df$conf.low <- confint_df[,1]
coef_df$conf.high <- confint_df[,2]

# Forest Plot with ggplot2
library(ggplot2)
ggplot(coef_df[-1,], aes(x=term, y=Estimate)) +  # Remove intercept
  geom_point() +
  geom_errorbar(aes(ymin=conf.low, ymax=conf.high), width=0.2) +
  coord_flip() +
  labs(title="Linear Model - Forest Plot", y="Coefficient Estimate", x="Variable") +
  theme_minimal()

Logistic Regression Forest Plot

# Logistic Regression Example using mtcars
mtcars$am <- factor(mtcars$am)
model_logit <- glm(am ~ wt + hp + cyl, data = mtcars, family = "binomial")

# Tidy output using broom
library(broom)
logit_df <- tidy(model_logit, conf.int = TRUE, exponentiate = TRUE)  # ORs

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Forest Plot
ggplot(logit_df[-1,], aes(x=term, y=estimate)) +  # Exclude intercept
  geom_point() +
  geom_errorbar(aes(ymin=conf.low, ymax=conf.high), width=0.2) +
  coord_flip() +
  labs(title="Logistic Regression - Odds Ratios", y="Odds Ratio", x="Variable") +
  theme_minimal()

Cox Proportional Hazards Model Forest Plot

# Survival Analysis Example using survival package
library(survival)
library(survminer)

## Loading required package: ggpubr

## 
## Attaching package: 'survminer'

## The following object is masked from 'package:survival':
## 
##     myeloma

# Simulate survival data
set.seed(123)
n <- 100
surv_data <- data.frame(
  time = rexp(n, 0.1),
  status = sample(0:1, n, replace = TRUE),
  age = rnorm(n, 50, 10),
  sex = factor(sample(c("Male", "Female"), n, replace = TRUE))
)

cox_model <- coxph(Surv(time, status) ~ age + sex, data = surv_data)

library(forestmodel)

## Warning: package 'forestmodel' was built under R version 4.4.3

forest_model(cox_model)

The easiest way is with the usmap package to plot US Map and your results

Basic Maps

library(usmap)

## Warning: package 'usmap' was built under R version 4.4.3

plot_usmap()

Color by Population in 2022

plot_usmap(data = statepop, values = "pop_2022") + theme(legend.position = "right")

MS map by county

plot_usmap(include = "MS",
           regions = "county")

Deep SOuth

plot_usmap(include = c("MS", "AL", "GA", "LA", "SC", "FL"),
           regions = "county")

Other plot with differnt GIS

# Output is ggplot object so it can be extended # with any number of ggplot layers
library(ggplot2) 
plot_usmap(include = c("CA", "NV", "ID", "OR", "WA")) + labs(title = "Western States")

# Color maps with data 
plot_usmap(data = statepop, values = "pop_2022")

# Include labels on map (e.g. state abbreviations) 
plot_usmap(data = statepop, values = "pop_2022", labels = TRUE)

# Choose color for labels 
plot_usmap(data = statepop, values = "pop_2022", labels = TRUE, label_color = "white")

Some random sample USA Map

library(usmap)
library(ggplot2)

# Sample data: Population by state
data <- data.frame(
  state = state.name,
  prevalence = sample(10:100, 10)  # Random values for demonstration
)

# Create the map
plot_usmap(data = data, values = "prevalence", color = "white") +
  scale_fill_continuous(
    low = "lightblue", high = "darkblue", name = "prevalence", label = scales::comma
  ) +
  labs(title = "Sample Data by State", subtitle = "Random values across USA") +
  theme(legend.position = "right")

Module 12 Advanced Summary and Data Visualization

Md Roungu Ahmmad PhD, Assistant Professor, Department of Public Health-Epidemiology and Biostatistics, School of Health Professions, University of Southern Mississippi, Hattiesburg, MS, USA 39401

April 17, 2025

required library

Data

Data and data shummary

Basic Summary Stats

Skimr for more details of data summary

Group-wise Summaries: dplyr

Group-wise Summaries: dplyr

Basic Visualization: Base R

Using ggplot2: Intro

Boxplots for Group Comparisons

Scatter Plot with Trend Line

Density Plots

Bar plot

Violin Plots

Faceting: Multi-panel Plots

Themes and Customizations

Interactive Plots: plotly

Linear Model Forest Plot

Logistic Regression Forest Plot

Cox Proportional Hazards Model Forest Plot

The easiest way is with the usmap package to plot US Map and your results

Basic Maps

Color by Population in 2022

MS map by county

Deep SOuth

Other plot with differnt GIS

Some random sample USA Map