ggpubr is a package that offers a sophisticated interface to produce nice histograms and violin plots for numeric variables and easily ads significance values to the plots. oetteR::f_plot_hist takes a data_ls list and plots a histogram or a bar plot of frequencies of all variables but the histograms are lacking some of ggpubr features, which are not so easily replicated. Here we want to try whether we can modify oetteR::fplot_hist to use most of ggpubr features

suppressPackageStartupMessages( require(oetteR) )
suppressPackageStartupMessages( require(ggpubr) )
suppressPackageStartupMessages( require(tidyverse) )

Data

data_ls = ISLR::Auto %>%
  f_clean_data()

## [1] "Number of excluded observations: 0"

Density Histograms

ggdensity( data = data_ls$data
           , x = 'displacement'
           , y = '..count..'
           , color = 'cylinders'
           , fill = 'cylinders'
           , palette = f_plot_col_vector74()
           , rug = T
           , add = 'mean'
           )

f_plot_hist( data_ls
             , variable = 'displacement'
             , group = 'cylinders'
             , y_axis = 'count'
             , graph_type = 'line'
             , rug = T )

Bar Histograms

gghistogram( data = data_ls$data
           , x = 'displacement'
           , y = '..count..'
           , color = 'cylinders'
           , fill = 'cylinders'
           , palette = f_plot_col_vector74()
           , rug = T
           , add = 'mean'
           )

## Warning: Using `bins = 30` by default. Pick better value with the argument
## `bins`.

f_plot_hist( data_ls
             , variable = 'displacement'
             , group = 'cylinders'
             , y_axis = 'count'
             , graph_type = 'bar'
             , rug = T )

Violin Plots

v1 = ggviolin( data = data_ls$data
           , x = 'cylinders'
           , y = 'displacement'
           #, color = 'cylinders'
           , fill = 'cylinders'
           , palette = f_plot_col_vector74()
           , rug = T
           , add = 'boxplot'
           , add.params = list( fill = 'white')
           )

v1

v2 = f_plot_hist( data_ls
             , variable = 'displacement'
             , group = 'cylinders'
             , y_axis = 'count'
             , graph_type = 'violin'
             , rug = T )

v2

Add stats

all combinations

lvl = levels(data_ls$data$cylinders) 

compare = expand.grid( a = lvl
                       , b = lvl
                       , stringsAsFactors = F) %>%
  filter( a != b) %>%
  mutate( comb = map2( a, b, function(x,y) sort( c(x,y) ) )
          , comb = map( comb, paste0, collapse = ',' )
          ) %>%
  unnest( comb ) %>%
  group_by( comb ) %>%
  summarise() %>%
  mutate( comb = stringr::str_split(comb, ',') ) %>%
  .$comb

v1_1 = v1 + 
  stat_compare_means(comparisons = compare, label = "p.signif") 

v1_1

ggplot( data_ls$data, aes( cylinders
                           , displacement
                           , fill = cylinders) ) +
  geom_violin() + 
  stat_compare_means(comparisons = compare, label = "p.signif")

p = f_plot_pretty_points( df = data_ls$data
                          , col_y = 'displacement'
                          , col_x = 'weight'
                          , col_facet = 'origin') +
  geom_smooth() + 
  ggpubr::stat_cor()

## [1] "Number of excluded observations: 0"

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Hide NS values

Make random grouping

data_ls$data$rnd = as.factor( sample( LETTERS[1:5], nrow(data_ls$data), replace = T ) )

filter all pairs that have a t_test p_val over 0.05

f_compare_means = function(comb, data, col_var, col_group){
  
  sym_group = as.name(col_group)
  
  lvl1 = stringr::str_split(comb, ',')[[1]][1]
  lvl2 = stringr::str_split(comb, ',')[[1]][2]
  
  x = data %>%
    filter( rlang::UQ(sym_group) == lvl1 ) %>%
    .[[col_var]]
  
  y = data %>%
    filter( rlang::UQ(sym_group) == lvl2 ) %>%
    .[[col_var]]
  
  t_test = t.test(x,y)
  
  return( t_test$p.value )
}

lvl = levels(data_ls$data$rnd) 

compare_filt = expand.grid( a = lvl
                       , b = lvl
                       , stringsAsFactors = F) %>%
  filter( a != b) %>%
  mutate( comb = map2( a, b, function(x,y) sort( c(x,y) ) )
          , comb = map( comb, paste0, collapse = ',' )
          ) %>%
  unnest( comb ) %>%
  group_by( comb ) %>%
  summarise() %>%
  mutate( p_val  = map_dbl( comb, f_compare_means
                            , data = data_ls$data
                            , col_var = 'displacement'
                            , col_group = 'rnd' )
          , comb = stringr::str_split(comb, ',') ) %>%
  filter( p_val <= 0.05 ) %>%
  .$comb

ggviolin( data = data_ls$data
           , x = 'rnd'
           , y = 'displacement'
           , color = 'rnd'
           , fill = 'rnd'
           , palette = f_plot_col_vector74()
           , rug = T
           , add = 'boxplot'
           , add.params = list( fill = 'white')
           ) + 
  stat_compare_means(comparisons = compare, label = "p.signif")

## Warning: Computation failed in `stat_signif()`:
## missing value where TRUE/FALSE needed

ggviolin( data = data_ls$data
           , x = 'rnd'
           , y = 'displacement'
           , color = 'rnd'
           , fill = 'rnd'
           , palette = f_plot_col_vector74()
           , rug = T
           , add = 'boxplot'
           , add.params = list( fill = 'white')
           ) + 
  stat_compare_means(comparisons = compare_filt, label = "p.signif")

plotly compatibility

v1_1 %>%
  plotly::ggplotly()

## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomSignif() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues

v1 %>%
  plotly::ggplotly()

v2 %>%
  plotly::ggplotly()

## Warning in geom2trace.default(dots[[1L]][[1L]], dots[[2L]][[1L]], dots[[3L]][[1L]]): geom_GeomSignif() has yet to be implemented in plotly.
##   If you'd like to see this geom implemented,
##   Please open an issue with your example code at
##   https://github.com/ropensci/plotly/issues

ggdensity( data = data_ls$data
           , x = 'displacement'
           , y = '..count..'
           , color = 'cylinders'
           , fill = 'cylinders'
           , palette = f_plot_col_vector74()
           , rug = T
           , add = 'mean'
           ) %>%
  plotly::ggplotly()

p %>%
  plotly::ggplotly()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

Summary

ggpubr is a great to add p values to any ggplot object and the histograms do look better than the ones from my default function. Unfortunately the p values dont show in plotly. ggpubr also does not plot frequency distributions of factor variables and does not offer a stat_ function to add a chi square p value.

GGpubr

OEB

January 6, 2018