suppressPackageStartupMessages( require(oetteR) )
suppressPackageStartupMessages( require(tidyverse) )

1 Introduction

Classically regression models are judged by their performance based on a calculated performance metric like rtSME and usually we will pick the least complex model with the best performance. However the best performance is often given by complex nonlinear models that are difficult to interpret in terms of what aspects of the data they are capturing, which makes it more difficult to trust them. Here certain visualisations can help us to judge the model and build up more confidence.

2 Visualising the model in the data space

In linear regression we can rely on a simple easily interpretable function to judge the impact of one predictor variable on the response variable. We can set up a empirical method to come up with a similar visualisation. For most of the statistical models we have methods to determine variable importance. So we generally have an idea which variables to investigate. The trick is to generate artifical data where we change the values of only one predictor while keeping all other predictors constant at their median.

2.1 Train nonlinear regression models

We are using the pipelearner package. We want the models to be trained on the full data set. Unfortunately we can only do this in pipelearner if we set the test section to a really low value.


data = ISLR::Auto %>%
  mutate(idx = row_number() )

data_ls = f_clean_data( data, id_cols = 'idx' )
#> [1] "Number of excluded observations: 0"
form = displacement~horsepower+cylinders+weight+acceleration+mpg+year
variable_color_code = f_plot_color_code_variables(data_ls)
limit            = 10

pl = pipelearner::pipelearner( data_ls$data ) %>%
  pipelearner::learn_models( rpart::rpart, form ) %>%
  pipelearner::learn_models( randomForest::randomForest, form ) %>%
  pipelearner::learn_models( e1071::svm, form ) %>%
  pipelearner::learn_cvpairs( pipelearner::crossv_mc, test = 0.000001, n = 1 ) %>%
  pipelearner::learn() 
 
 
 
 

2.2 Calculate Predictor importance


pl = pl %>%
  mutate( imp = map2(fit, train, f_model_importance) )

pl$imp  
#> [[1]]
#> # A tibble: 6 x 3
#>      row_names     value  rank
#>          <chr>     <dbl> <dbl>
#> 1    cylinders 3862523.0     1
#> 2   horsepower 3288862.6     2
#> 3       weight 3048387.5     3
#> 4          mpg 2665815.9     4
#> 5 acceleration 1534727.5     5
#> 6         year  232089.4     6
#> 
#> [[2]]
#> # A tibble: 6 x 3
#>      row_names      value  rank
#>          <chr>      <dbl> <dbl>
#> 1    cylinders 1432297.95     1
#> 2       weight 1142004.57     2
#> 3   horsepower  849124.27     3
#> 4          mpg  549087.85     4
#> 5 acceleration  229828.10     5
#> 6         year   33041.16     6
#> 
#> [[3]]
#> # A tibble: 6 x 3
#>      row_names      value  rank
#>          <chr>      <dbl> <dbl>
#> 1       weight 0.38800322     1
#> 2    cylinders 0.27951744     2
#> 3   horsepower 0.17575962     3
#> 4 acceleration 0.06555102     4
#> 5          mpg 0.05631134     5
#> 6         year 0.03485736     6

2.3 Plot Variable dependencies

pl = pl %>%
  mutate( plot = pmap( list( m = fit, ranked_variables = imp, title = model, data = train)
                        , .f = f_model_plot_variable_dependency_regression
                        , formula = form
                        , data_ls = data_ls
                        , variable_color_code = variable_color_code
                       , limit = limit )
  )

pl$plot %>%
  walk( print )

These plots are already quite informative. However we know the models are capaple of capturing nonlinear relationships meaning that the changes in response variable based on changes in one predictor can be quite different depending on the other predictors. For example the change in displacement in response to acceleration could be different even opposite in small cars compared to large cars

2.4 Binning the data

Here we could start to split the data into groups and repeat the above exercise for each group. We can either split the data on some notion we already have or simply split on the most important variable.

pl = pl %>%
  mutate( range_var = map_chr(imp, function(x) head(x,1)$row_names )
          , grid = pmap( list( m = fit
                            , title = model
                            , variables = imp
                            , range_variable = range_var
                     )
                     , f_model_plot_var_dep_over_spec_var_range
                     , formula = form
                     , data_ls = data_ls
                     , data = data_ls$data
                     , variable_color_code = variable_color_code
                     , log_y = F
                     , limit = 12
                     )
  )

pl$grid %>%
  walk( gridExtra::grid.arrange )

3 Visualising the data in the model space

We can also put the dataset into the modelling space and trace observations that are particularly bad good or differently predicted by a group of models.

3.1 Add Predictions and Residuals


pl_pred = pl %>%
  f_predict_pl_regression( cols_id = 'idx', data_test = 'train') %>%
  unnest(preds) %>%
  mutate( title = model ) %>%
  group_by( title ) %>%
  arrange( idx ) %>%
  ungroup()



f_plot_hist( 'resid', f_clean_data_no_changes(pl_pred) )

f_plot_hist( 'resid', f_clean_data_no_changes(pl_pred), group = 'title', graph_type = 'violin' )

3.2 Plot alluvial plot

The range of all residuals is binned into 5 equidistant groups. LL Low-Low, ML - Medium, M Medium, MH - Medium-High, HH High-High. And the predictions of an individual observation are tracked between models and grouped into flows.


p = f_plot_alluvial_1v1( data = pl_pred
                     , col_x = 'title'
                     , col_y = 'resid'
                     , col_id = 'idx'
                     , fill_by = 'all_flows'
                     , order_levels_x = c('randomForest', 'svm') )
#> [1] "Number of flows: 41"
#> [1] "Original Dataframe reduced to 10.5 %"
#> [1] "Maximum weight of a singfle flow 14.1 %"

p

We can see that the residuals of single observations fall mostly into the same category for each model. Transisition from a High (MH, HH) to a Low range (LM,LL) or the other way around are very rare if wee look at the to better performing models randomFores and svm. We can thus conclude that the two models catch up on similar traits of the data. We also find a consistency in the observations that are poorly predicted. In a next step we will investigate what sets them apart from the other observations that are predicted better.

3.3 Tag observations that dont predict well

f_tag = function(x,y,z){
  
  if( x == 'M' | y == 'M' | z == 'M'){
    return( 'Med' )
  } else if( stringr::str_detect(x,'L')
             & stringr::str_detect(y,'L')
             & stringr::str_detect(z,'L')){
    return( 'Low' )
  }else if( stringr::str_detect(x,'H')
             & stringr::str_detect(y,'H')
             & stringr::str_detect(z,'H')){
    return( 'High' )
  } else { 'Cross' }
}

df = p$data_key %>%
  mutate_if( is.factor, as.character ) %>%  ## factors will be passed as integers py pmap
  mutate( tag = pmap_chr( list( x = randomForest, y = rpart, z = svm), f_tag )
          , tag = as.factor(tag) 
          , idx = as.integer(idx) ) %>%
  select( idx, tag )

pl_tag = pl_pred %>%
  left_join( df )

p = f_plot_alluvial_1v1( data = pl_tag
                     , col_x = 'title'
                     , col_y = 'resid'
                     , col_fill = 'tag'
                     , fill_right = F
                     , col_id = 'idx'
                     , order_levels_x = c('randomForest', 'svm')
                     , order_levels_fill = c('Low', 'Cross', 'Med', 'High')
                     )
#> [1] "Number of flows: 41"
#> [1] "Original Dataframe reduced to 10.5 %"
#> [1] "Maximum weight of a singfle flow 14.1 %"

p

3.4 Follow the tags in the dataset

variables are ordered by importance from left to right


order_variables = pl %>%
  unnest(imp) %>%
  group_by( row_names ) %>%
  summarise( rank_sum = sum(rank) ) %>%
  arrange( rank_sum ) %>%
  .$row_names

# will exclude the one observation which we put in the test set
data_ls_new = data %>%
  left_join(df) %>%
  select( tag, displacement, order_variables ) %>%
  f_clean_data()
#> [1] "Number of excluded observations: 1"

f_plot_alluvial(data_ls_new$data, order_levels = c('Low', 'Cross', 'Med', 'High') ) +
    theme( axis.text.x = element_text(angle = 90))
#> [1] "Number of flows: 270"
#> [1] "Original Dataframe reduced to 69.1 %"
#> [1] "Maximum weight of a singfle flow 1.8 %"

3.5 Perform Group Analysis of differences





taglist = f_stat_group_ana(data_ls_new
                          , 'tag'
                          , alluvial = F
                          , tabplot = F 
                          , static_plots = F
                          , return_taglist = T )

taglist

Differences Between " tag " Groups

Number and percentage of observations per group




Table: All features

grouped by: tag




Links to static plots

Dynamic Plots

P < 0.05 , % difference > 3 , sorted by % difference





Plot1: cylinders ***

Counts

Probabilities

* P:0.05, ** P:0,005, *** P:0.001




Plot2: displacement ***

* P:0.05, ** P:0,005, *** P:0.001




Plot3: mpg ***

* P:0.05, ** P:0,005, *** P:0.001




Plot4: horsepower ***

* P:0.05, ** P:0,005, *** P:0.001




Plot5: weight ***

* P:0.05, ** P:0,005, *** P:0.001




Plot6: acceleration *

* P:0.05, ** P:0,005, *** P:0.001




Plot7: year ***

* P:0.05, ** P:0,005, *** P:0.001





Summary Tables




Table: Means and Medians of numerical features

grouped by: tag

Table: Counts and percentages of categorical features

grouped by: tag