Colorful Correlation Plot

Overview of data visualization

This code will help provide a colorful correlation plot displaying the Pearson correlation coefficient, a 95% confidence interval (CI) around the correlation coefficient, the p-value, and a indicator of effect size magnitude following Cohen’s (1988) suggested small, medium, and large effect size thresholds.

Data

For illustration purposes, a data frame with five continuous variables is created with each variable ranging from 1 to 100.

# Set seed for reproducibility
set.seed(117)

# Create a data frame of five continuous variables
dat.r <- data.frame(
  Var1 = runif(100, min = 1, max = 100),
  Var2 = runif(100, min = 1, max = 100),
  Var3 = runif(100, min = 1, max = 100),
  Var4 = runif(100, min = 1, max = 100),
  Var5 = runif(100, min = 1, max = 100)
)

# Display the generated data frame
head(dat.r)

##        Var1      Var2     Var3      Var4      Var5
## 1 72.063691 98.463387 28.18272 81.365885 86.878495
## 2  5.492145 73.791096 72.39332 32.448097  3.623536
## 3 53.995668 37.554953 46.93986 55.552445 69.746189
## 4 47.039836 93.398847 68.89414 40.309063 52.585050
## 5 75.814536  4.376957  6.19421 50.713018 68.477886
## 6 14.308855 84.997090 79.40497  8.629912 31.651792

Organize data order

Next, organize variables so that they are plotted in a desired position within the correlation plot.

dat.r <- dat.r %>% dplyr::select(Var1, Var2, Var3, Var4, Var5) 

# Position of the variable in the ordering will determine where plotted. The variable you wish to be in the top corner should be placed second in the ordering.
dat.r <- dat.r %>% dplyr::select(Var5, Var1, Var2, Var3, Var4)

Optional: Rename variables for proper plotting labels

In some cases, your variable names might not be what you would like displayed either due to shorthand notation or excessive name length. The code below will provide a way to change variable names as you would like to see displayed in the final plot.

# Rename variables for plotting. The first quoted name is the original name and the second is the desired variable name.
dat.r <- dat.r %>% dplyr::rename_with(~ gsub("Var1", "Cognitive Reflection Test\n(CRT)", .x)) %>% 
  dplyr::rename_with(~ gsub("Var2", "Beck Depression Inventory\n(BDI)", .x)) %>% 
  dplyr::rename_with(~ gsub("Var3", "Extraversion\n(BFI)", .x)) %>% 
  dplyr::rename_with(~ gsub("Var4", "Positive Affect\n(PANAS)", .x)) %>%
  dplyr::rename_with(~ gsub("Var5", "Psychological Safety", .x))

Obtain sample size range for all comparisons

# Obtain an estimate of the sample size across all comparisons to be used later. 
approx_sample <- sum(apply(dat.r, 1, function(row) any(!is.na(row))))

Ensure all variables are numeric

The code below will ensure that all variables in the data frame are numeric so that the correlations can be calculated.

# Ensure all variables are numeric
dat.r <- dat.r %>% dplyr::mutate(across(everything(), as.numeric))

Correlation function

The code provides a function to calculate the Pearson’s correlation and associated statistics (correlation coefficient, p-value, lower and upper 95% CI, sample size n) for all comparisons within the data frame.

# Correlation Calculation Function (accounts for NAs)
corr_ci_na.fun <- function(x, y, alpha = 0.05) {
  # Compute correlation using pairwise complete observations
  corr <- cor(x, y, use = "pairwise.complete.obs")
  
  # Calculate the sample size for pairwise complete observations
  n <- sum(!is.na(x) & !is.na(y))
  
  # Fisher's Z transformation
  z <- 0.5 * log((1 + corr) / (1 - corr))
  
  # Standard error of Z
  se <- 1 / sqrt(n - 3)
  
  # Z critical value for the given alpha
  z_crit <- qnorm(1 - alpha / 2)
  
  # Confidence interval for Z
  z_ci_lower <- z - z_crit * se
  z_ci_upper <- z + z_crit * se
  
  # Transform Z confidence interval back to correlation scale
  ci_lower <- (exp(2 * z_ci_lower) - 1) / (exp(2 * z_ci_lower) + 1)
  ci_upper <- (exp(2 * z_ci_upper) - 1) / (exp(2 * z_ci_upper) + 1)
  
  # Compute t-statistic for correlation
  t_stat <- corr * sqrt((n - 2) / (1 - corr^2))
  
  # Compute p-value from t-statistic
  p_value <- 2 * pt(-abs(t_stat), df = n - 2)
  
  # Return correlation, p-value, and confidence interval
  return(c(corr, p_value, ci_lower, ci_upper, n))
}

Loop Across all pairs of variables and store results in a new data frame

The code below shows how a loop is used to calculate all the correlations between variables and stores the results in a new data frame called “cor.results”.

# Create a list to store the results
cor.results <- list()

# Loop through all pairs of columns in the dataset
for (i in 1:(ncol(dat.r) - 1)) {
  for (j in (i + 1):ncol(dat.r)) {
    result <- corr_ci_na.fun(dat.r[[i]], dat.r[[j]])
    cor.results <- rbind(cor.results, c(colnames(dat.r)[i], colnames(dat.r)[j], result))
  }
}

# Convert the results to a data frame for easier viewing
cor.results <- as.data.frame(cor.results)
colnames(cor.results) <- c("X1", "X2", "R", "P", "CI.Lower", "CI.Upper", "N")


df.cor <- as.data.frame(cor.results, stringsAsFactors = FALSE)
#colnames(df.cor) <- c("X1", "X2", "R", "P", "CI.Lower", "CI.Upper")
df.cor <- df.cor %>% dplyr::mutate(across(R:`CI.Upper`, as.numeric))
df.cor

##                                  X1                               X2
## 1              Psychological Safety Cognitive Reflection Test\n(CRT)
## 2              Psychological Safety Beck Depression Inventory\n(BDI)
## 3              Psychological Safety              Extraversion\n(BFI)
## 4              Psychological Safety         Positive Affect\n(PANAS)
## 5  Cognitive Reflection Test\n(CRT) Beck Depression Inventory\n(BDI)
## 6  Cognitive Reflection Test\n(CRT)              Extraversion\n(BFI)
## 7  Cognitive Reflection Test\n(CRT)         Positive Affect\n(PANAS)
## 8  Beck Depression Inventory\n(BDI)              Extraversion\n(BFI)
## 9  Beck Depression Inventory\n(BDI)         Positive Affect\n(PANAS)
## 10              Extraversion\n(BFI)         Positive Affect\n(PANAS)
##               R          P    CI.Lower     CI.Upper   N
## 1   0.055209675 0.58536024 -0.14275652 0.2489283681 100
## 2   0.001785332 0.98593492 -0.19470106 0.1981339697 100
## 3  -0.195698289 0.05102253 -0.37760188 0.0007486043 100
## 4   0.033129072 0.74350497 -0.16435855 0.2280631484 100
## 5   0.078493459 0.43759268 -0.11977124 0.2707374711 100
## 6   0.040472791 0.68930135 -0.15719496 0.2350225761 100
## 7  -0.027548167 0.78556834 -0.22276093 0.1697886706 100
## 8  -0.055656146 0.58232576 -0.24934842 0.1423177721 100
## 9   0.105037053 0.29831899 -0.09330608 0.2953615139 100
## 10 -0.174168020 0.08308588 -0.35832783 0.0230382296 100

Pre-plot formating and results categorization

The code below formats the data by rounding decimal points, labeling missing data as “NA”, and categorizes the magnitude of the correlation coefficients into trivial, small, medium, and large effect sizes.

# using p-value (italic)
# Format r value
df.cor$label.r = sprintf("%.2f", round(df.cor$R, 2))
df.cor$label.r <- gsub("-", "\u2212", as.character(df.cor$label.r))
df.cor <- df.cor %>% dplyr::mutate(label.r = dplyr::if_else(is.na(label.r), "NA", as.character(label.r)))
df.cor$label.r[df.cor$label.r == "NaN"] <- "NA"
# Format p-value
df.cor <- df.cor %>% dplyr::mutate(label.p = 
                                     dplyr::case_when(
                                       df.cor$P < .001 ~ paste0(" < .001"),
                                       df.cor$P >= .001 ~ paste0(" = ", sprintf("%.3f", round(df.cor$P, 3))),
                                       TRUE ~ NA_character_))
italic_p.fun <- function(text) {bquote(italic(p) ~ .(text))}
df.cor <- df.cor %>% dplyr::mutate(label.p = lapply(label.p, italic_p.fun))
df.cor$label.p[df.cor$label.p == "italic(p) ~ NA"] <- "NA"
# Format 95% CI
df.cor$label.lowerci = sprintf("%.2f", round(df.cor$CI.Lower, 2))
df.cor$label.lowerci <- gsub("-", "\u2212", as.character(df.cor$label.lowerci))
df.cor <- df.cor %>% dplyr::mutate(label.lowerci = dplyr::if_else(is.na(label.lowerci), "NA", as.character(label.lowerci)))
df.cor$label.lowerci[df.cor$label.lowerci == "NaN"] <- "NA"

df.cor$label.upperci = sprintf("%.2f", round(df.cor$CI.Upper, 2))
df.cor$label.upperci <- gsub("-", "\u2212", as.character(df.cor$label.upperci))
df.cor <- df.cor %>% dplyr::mutate(label.upperci = dplyr::if_else(is.na(label.upperci), "NA", as.character(label.upperci)))
df.cor$label.upperci[df.cor$label.upperci == "NaN"] <- "NA"
# Combine CIs
df.cor <- df.cor %>% dplyr:: mutate(label.ci = paste0("[", df.cor$label.lowerci, ", ", df.cor$label.upperci, "]"))
# P-Value Asterisk
df.cor <- df.cor %>% dplyr::mutate(asterisk.p = dplyr::case_when(
  df.cor$P > .05 ~ paste0(" "),
  df.cor$P > .01 ~ paste0("\n*"),
  df.cor$P > .001 ~ paste0("\n**"),
  !is.na(df.cor$P) ~ paste0("\n***"),
  TRUE ~ NA_character_))
# Effect Size Asterisk
df.cor <- df.cor %>% dplyr::mutate(asterisk.es = dplyr::case_when(
  abs(df.cor$R) < .10 ~ paste0(" "),
  abs(df.cor$R) < .30 ~ paste0("\n*"),
  abs(df.cor$R) < .50 ~ paste0("\n**"),
  !is.na(df.cor$R) ~ paste0("\n***"),
  TRUE ~ NA_character_))

# order factors
df.cor$X1 = as.character(df.cor$X1)
df.cor$X2 = as.character(df.cor$X2)
cor <- Hmisc::rcorr(dat.r %>% as.matrix())
nm = rownames(cor$r)
df.cor$X1 = factor(df.cor$X1, nm)
df.cor$X2 = factor(df.cor$X2, rev(nm))
df.cor

##                                  X1                               X2
## 1              Psychological Safety Cognitive Reflection Test\n(CRT)
## 2              Psychological Safety Beck Depression Inventory\n(BDI)
## 3              Psychological Safety              Extraversion\n(BFI)
## 4              Psychological Safety         Positive Affect\n(PANAS)
## 5  Cognitive Reflection Test\n(CRT) Beck Depression Inventory\n(BDI)
## 6  Cognitive Reflection Test\n(CRT)              Extraversion\n(BFI)
## 7  Cognitive Reflection Test\n(CRT)         Positive Affect\n(PANAS)
## 8  Beck Depression Inventory\n(BDI)              Extraversion\n(BFI)
## 9  Beck Depression Inventory\n(BDI)         Positive Affect\n(PANAS)
## 10              Extraversion\n(BFI)         Positive Affect\n(PANAS)
##               R          P    CI.Lower     CI.Upper   N label.r
## 1   0.055209675 0.58536024 -0.14275652 0.2489283681 100    0.06
## 2   0.001785332 0.98593492 -0.19470106 0.1981339697 100    0.00
## 3  -0.195698289 0.05102253 -0.37760188 0.0007486043 100   −0.20
## 4   0.033129072 0.74350497 -0.16435855 0.2280631484 100    0.03
## 5   0.078493459 0.43759268 -0.11977124 0.2707374711 100    0.08
## 6   0.040472791 0.68930135 -0.15719496 0.2350225761 100    0.04
## 7  -0.027548167 0.78556834 -0.22276093 0.1697886706 100   −0.03
## 8  -0.055656146 0.58232576 -0.24934842 0.1423177721 100   −0.06
## 9   0.105037053 0.29831899 -0.09330608 0.2953615139 100    0.11
## 10 -0.174168020 0.08308588 -0.35832783 0.0230382296 100   −0.17
##                   label.p label.lowerci label.upperci      label.ci asterisk.p
## 1  italic(p) ~ " = 0.585"         −0.14          0.25 [−0.14, 0.25]           
## 2  italic(p) ~ " = 0.986"         −0.19          0.20 [−0.19, 0.20]           
## 3  italic(p) ~ " = 0.051"         −0.38          0.00 [−0.38, 0.00]           
## 4  italic(p) ~ " = 0.744"         −0.16          0.23 [−0.16, 0.23]           
## 5  italic(p) ~ " = 0.438"         −0.12          0.27 [−0.12, 0.27]           
## 6  italic(p) ~ " = 0.689"         −0.16          0.24 [−0.16, 0.24]           
## 7  italic(p) ~ " = 0.786"         −0.22          0.17 [−0.22, 0.17]           
## 8  italic(p) ~ " = 0.582"         −0.25          0.14 [−0.25, 0.14]           
## 9  italic(p) ~ " = 0.298"         −0.09          0.30 [−0.09, 0.30]           
## 10 italic(p) ~ " = 0.083"         −0.36          0.02 [−0.36, 0.02]           
##    asterisk.es
## 1             
## 2             
## 3          \n*
## 4             
## 5             
## 6             
## 7             
## 8             
## 9          \n*
## 10         \n*

Correlation plot using ggplot2

The code below provides the correlation matrix plot with correlation coefficients, 95% confidence intervals, p-values, and a indicator of effect size magnitude. Darker blue boxes indicate stronger positive correlations while darker red boxes represent stronger negative correlations.

# Plot
corplot1 <- ggplot(df.cor, aes(X1, X2, fill = R)) +
  geom_tile(color = "white") +
  scale_fill_gradient2(low = '#D55E00', mid = '#FFFFFF', high = '#0072B2', midpoint = 0, aesthetics = 'fill', guide = 'colourbar', name = expression('Pearson Correlation ('~italic('r')~')'), limits = c(-1, 1), 
                       breaks = c(-1, -.8, -.6, -.4, -.2, 0, .2, .4, .6, .8, 1)) +
  geom_text(aes(label = label.r), fontface = 'bold', color = 'Black', size = 5, parse = FALSE, vjust = -0.75) +
  geom_text(aes(label = label.p), fontface = 'bold', color = 'Black', size = 4, parse = TRUE, vjust = 3) +
  geom_text(aes(label = label.ci), fontface = 'bold', color = 'Black', size = 4, parse = FALSE, vjust = 1) +
  geom_text(aes(label = asterisk.es), fontface = 'bold', color = 'Black', size = 6, parse = FALSE, vjust = -.50) +
  annotate("text", x = 3.0, y = 3, label = "Cohen (1988)\n* = Small Effect Size (±0.1)\n** = Medium Effect Size (±0.3)\n*** = Large Effect Size (±0.5)", color = "#000000", size = 5.5, hjust = 0) +
  labs(title = "Colorful Correlation Plot", subtitle = bquote(italic(N) ~ "≈" ~ .(approx_sample) ~ "total observations")) +
  theme_classic() +
  coord_equal(xlim = c(1, 4.1), clip = "off") +
  theme(axis.title.x = element_blank()) +
  theme(axis.title.y = element_blank()) +
  theme(axis.text.x = element_text(face = "bold", angle = 50, vjust = 1, hjust = 1, size = 14)) +
  theme(axis.text.y = element_text(face = "bold", angle = 0, vjust = 0.5, hjust = 1, size = 14)) +
  theme(legend.position = "bottom", legend.title = element_text(size = 14), legend.text = element_text(size = 14)) +
  theme(plot.title = element_text(hjust = 0.5, color = "black", size = 22, face = "bold")) +
  theme(plot.subtitle = element_text(hjust = 0.5, color = "black", size = 16, face = "italic")) +
  guides(fill = guide_colorbar(label.position = 'bottom', title.position = 'top', title.hjust = .5, barwidth = 30))
corplot1