# load libraries

library(tidymodels)
library(kknn)
library(tidyverse)
library(themis) # for addressing class imbalance
library(dlookr)
library(janitor)
library(lubridate)
library(flextable)
library(magrittr)
library(viridis)
library(patchwork)

1 Introduction

This worksheet provides an analysis of sales data hosted at https://eforexcel.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/.

The purpose of this work is to build a classification model that can be used to predict a target variable from the dataset(s).

While the exercise is designed to encourage analytical reasoning, its results are trivial: the datasets consist almost entirely of zero- and near-zero-variance variables.

As a result, I’ve taken this opportunity to build three classification models (logistic regression, random forest, and K-nearest neighbors) using the tidymodels package, with 5-fold cross-validation to support model comparison.

Results of each model were equivalent to random guessing.

# load sales datasets - 10,000 and 100,000 obs datasets

s.sales<-read_csv('https://raw.githubusercontent.com/sconnin/Model_Compare/main/10000%20Sales%20Records.csv')

# read larger sales dataset

l.sales<-read_csv('https://raw.githubusercontent.com/sconnin/Model_Compare/main/100000%20Sales%20Records.csv')

2 Data Preprocessing and Exploration

# create working dataframe for both datasets

small.sales<-s.sales
large.sales<-l.sales

# convert character cols to factor

small.sales[,1:4]<-map(small.sales[,1:4], factor)
large.sales[,1:4]<-map(large.sales[,1:4], factor)

# clean data 

clean<-function(df){
    
    df%>%
    clean_names%>% # initial clean of col names
    remove_empty(c("rows", "cols"))%>%  # remove any empty rows and cols
    distinct()    # remove duplicate rows
}

small.sales<-clean(small.sales)
large.sales<-clean(large.sales)

# check for NA -- none found

#map(small.sales, ~sum(is.na(.)))  # tag to avoid printing
#map(large.sales, ~sum(is.na(.)))

# convert order and shipping date char col to mdy

small.sales$order_date<-mdy(small.sales$order_date)
large.sales$order_date<-mdy(large.sales$order_date)

small.sales$ship_date<-mdy(small.sales$ship_date)
large.sales$ship_date<-mdy(large.sales$ship_date)

# relocate cols and calc order turnaround time - bin this data

turn<-function(df){
  df%>%
    relocate(c(order_id, country, item_type, sales_channel, order_date, ship_date, order_priority, units_sold, unit_price, unit_cost, total_revenue, total_cost, total_profit), .before = region)%>%
    mutate(turnaround_time = as.integer(ship_date - order_date), .after = ship_date)%>% # days between order and shipment
    mutate(turnaround_time = cut(turnaround_time, breaks = c(0,10,20,30,40,50), include.lowest = TRUE))%>% # bin turnaround time into 10-day intervals; cut() returns a factor
    arrange(desc(order_date))
}

small.sales<-turn(small.sales)
large.sales<-turn(large.sales)

Evaluate univariate statistics for numerical and factor variables. All downstream analyses will be restricted to the larger dataset.

# summarize numeric variables

large.sales%>%
    diagnose_numeric()%>%
    dplyr::select(variables, min, mean, median, max, zero, minus, outlier)%>%
    flextable()%>%
    set_caption("Summary Statistics for Large Dataset")
#summarize factor variables

large.sales%>%
    diagnose_category()%>%
    flextable()%>%
    set_caption("Summary Statistics for Categorical Variables: large dataset")

Pairwise comparisons for numeric variables

cor.tabl<-function(df, titles){
    df%>%
    filter_if(is.numeric, all_vars((.) != 0))%>%
    correlate()%>%
    filter(coef_corr > .6 | coef_corr < -.6)%>% # set thresholds to limit output 
    arrange(desc(coef_corr))%>%
    flextable()%>%
    set_caption(titles)
}


cor.tabl(large.sales, "Pairwise Correlation: Numerical Covariates: Large Dataset")

Four of the six numerical predictors are zero-variance features: units sold, total revenue, total cost, and total profit. These covariates should be removed from the analysis.

The remaining two predictors (unit price and unit cost) are near-zero-variance variables. They are also highly collinear (R = .98), so we can drop either one from the analysis.
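As a sanity check on these claims, the variability of the numeric columns can be screened directly. The sketch below is illustrative and not part of the original workflow (nzv_check is a name introduced here); it reports the standard deviation, the frequency ratio of the two most common values, and the percent of unique values for each numeric predictor, mirroring the criteria used by caret's nearZeroVar().

# hedged sketch: screen numeric predictors for zero/near-zero variance
# (nzv_check is an illustrative object name)

nzv_check <- large.sales %>%
  select(where(is.numeric), -order_id) %>%
  imap_dfr(function(x, nm) {
    tab <- sort(table(x), decreasing = TRUE)
    tibble(
      variable   = nm,
      sd         = sd(x, na.rm = TRUE),
      freq_ratio = unname(if (length(tab) > 1) tab[1] / tab[2] else Inf),
      pct_unique = 100 * n_distinct(x) / length(x)
    )
  })

nzv_check %>%
  flextable() %>%
  set_caption("Zero/Near-Zero Variance Screen: Large Dataset")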

# Subset numerical variables

num_box<-select_if(large.sales, is.numeric)%>%
  select(!order_id)

# create working df for boxplots

order<-large.sales[8] # column 8 = order_priority

num_box<-cbind(order, num_box)

# Plot using boxplots

response = names(num_box)[1] # order_priority (needs to be a factor for these plots)
response = purrr::set_names(response)


explain <- names(num_box)[2:7] #explanatory variables
explain = purrr::set_names(explain)

box_fun = function(x) {
    ggplot(num_box, aes(x = .data[[x]], y = order_priority)) +
    geom_boxplot(aes(fill = order_priority), alpha = 0.4, outlier.color =
    'red', show.legend = FALSE)+
    scale_fill_viridis(discrete = TRUE, option = "E")+
    coord_flip()+
    labs(title = 'Distribution of Continuous Variables Relative to Order Priority: Large Sales')+
    theme_classic()
}

box_plots<-map(explain, ~box_fun(.x)) #creates a list of plots using purrr

plot(reduce(box_plots,`+`)) # layout with patchwork

Evaluate categorical variables

Finding: all of the categorical predictors are zero variance. They should be removed from our analysis.

# Subset categorical variables into a df of factor vars

mosaic<-large.sales%>% # subset factor variables, with order_priority as the response
    mutate(order_priority = as_factor(order_priority))%>%
    dplyr::select(where(is.factor))%>%
    relocate(order_priority, .before=country)

target <- names(mosaic)[1] # response: order_priority
predictors <- names(mosaic)[3:6] # item_type, sales_channel, turnaround_time, region

#generate mosaic plots with purrr and dlookr

(plots<-predictors%>%map(function(predictors) mosaic%>%target_by(target)%>%relate(predictors)%>%plot()))

Review of both the numerical and categorical variables indicates that only one covariate (unit_price or unit_cost) can be used to model the response variable, order_priority. As a result, the classification problem reduces to whether that single covariate has sufficient variance to discriminate the class levels of the response.

This is obviously a trivial exercise. However, a logical approach is to convert order_priority to a binary form (H or Other) and attempt to discriminate these classes using a single covariate. I will use unit_price for this purpose.

Aggregating levels within order_priority produces imbalanced class frequencies - 25% H vs. 75% Other.

# remove zero variance predictors

sales<-large.sales%>%
  select(order_priority, unit_price)%>%
  mutate(order_priority = factor(ifelse(order_priority == 'H', 'H', 'Other'))) # classification models require a factor response

# assess class distributions

table(sales$order_priority)%>%
  as.data.frame()%>%
  flextable%>%
  set_caption('Class Distribution for Sales: Large Dataset')

3 Model Development

The first step in our modeling process is to create training and test sets. Note that the modeling process that follows draws on the tidymodels package, which is designed as a successor to caret.

set.seed(5345459)

# create random data split

data_split <- initial_split(sales, prop = .75, strata = order_priority)

# create train and test sets from split

train_data <- training(data_split)
test_data <- testing(data_split)

With training and test sets in hand, we build a modeling recipe and finalize our data preparation process. I have included steps that are not relevant to the current dataset but which illustrate the options available for the recipe function.

# pre-model processing using recipes

attr_recipe<-
  recipe(order_priority ~., data=train_data)%>%
  step_normalize(all_numeric())%>% # center and scale numeric variables
  #step_dummy(all_nominal(), -all_outcomes())%>% # convert factor predictors to dummy variables
  #step_zv(all_numeric())%>% # remove numeric variables with zero variance (a single unique value)
  #step_corr(all_predictors(), threshold = 0.7, method = 'spearman')%>% # remove predictors that are highly correlated with other covariates
  step_smote(order_priority) # rebalance the response using SMOTE (themis package)
  
  
# review type, roles, source
  
summary(attr_recipe)
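Because the recipe applies SMOTE, it is worth confirming that the processed training data are in fact balanced. A minimal sketch, assuming the recipe is prepped on the training set (attr_prepped is an illustrative name):

# hedged sketch: prep the recipe on the training data and check class
# balance after SMOTE (attr_prepped is an illustrative object name)

attr_prepped <- prep(attr_recipe, training = train_data)

bake(attr_prepped, new_data = NULL) %>% # new_data = NULL returns the processed training set
  count(order_priority) %>%
  mutate(prop = n / sum(n))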

In order to compare classification models, I will include five-fold cross-validation in our modeling workflow.

set.seed(11)

cv_folds<-
  vfold_cv(train_data, 
           v=5,
           strata= order_priority)

For the purpose of this exercise, I will build classifiers based on the following methods: logistic regression, random forests, and KNN.

# Random Forest

library(ranger)

rf <- rand_forest(trees = 500)%>% # given the trivial nature of the data, 500 is reasonable
  set_engine('ranger', importance='impurity')%>% # impurity will provide variable importance scores
  set_mode('classification')

# K-nearest neighbors with k = 4

knn<- 
  nearest_neighbor(neighbors = 4)%>% # neighbors could also be tuned (see the sketch below)
  set_engine('kknn')%>%
  set_mode('classification')

# Logistic Regression

logit<-
  logistic_reg()%>%
  set_engine(engine = 'glm')%>%
  set_mode('classification')

The next step is to initiate a workflow that combines the recipe and models, and to set up the metrics for model evaluation.

# initiate a workflow

sales.wf<- 
  workflow()%>%
  add_recipe(attr_recipe)

# select metrics for model evaluation

sales.metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)
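The KNN specification above flags the number of neighbors as a candidate for tuning. A minimal sketch of how that could be done with tune_grid(), under the assumption that a small grid of k values is sufficient (knn_tune_spec and knn_tuned are illustrative names; this step is not run as part of the main workflow):

# hedged sketch: tune the number of neighbors over a small grid
# (knn_tune_spec and knn_tuned are illustrative object names)

knn_tune_spec <-
  nearest_neighbor(neighbors = tune()) %>%
  set_engine('kknn') %>%
  set_mode('classification')

knn_tuned <-
  sales.wf %>%
  add_model(knn_tune_spec) %>%
  tune_grid(
    resamples = cv_folds,
    grid = tibble(neighbors = c(3, 5, 7, 9)),
    metrics = sales.metrics
  )

select_best(knn_tuned, metric = 'roc_auc') # best k by cross-validated ROC AUC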

Now we combine the workflow, models, metrics, and cross-validation folds.

# train random forest model


rf.train <- 
  sales.wf %>%
  add_model(rf) %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = sales.metrics,
    control = control_resamples(save_pred = TRUE)
  )


# train knn model

knn.train <- 
  sales.wf %>%
  add_model(knn) %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = sales.metrics,
    control = control_resamples(save_pred = TRUE)
  )


# train logistic model

logit.train <- 
  sales.wf %>%
  add_model(logit) %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = sales.metrics,
    control = control_resamples(save_pred = TRUE)
  )

4 Model Evaluation

With model training completed, we can evaluate the results. First we evaluate our Random Forest model.

From the training metrics and class predictions, it’s clear that the random forest model is unable to discriminate classes of order_priority. This isn’t surprising given the near-zero variance of our predictor. The ROC curve confirms this assessment and indicates that the results are equivalent to random guessing.

# training metrics for random forest

collect_metrics(rf.train)%>%
  flextable()%>%
  set_caption('Random Forest Training Metrics')
# RF - check class predictions

rf.train %>%
  conf_mat_resampled()%>%
  flextable()%>%
  set_caption('Random Forest Class Predictions')
# RF ROC curve

rf.train %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(order_priority, .pred_H) %>% # see  rf.train%>%collect_predictions()
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  labs(title = 'Random Forest ROC')+
  coord_equal()
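The random forest specification requested impurity-based importance scores, although they are never extracted above. With only one predictor the scores are uninformative here, but for completeness, a hedged sketch of how they could be pulled from a single fit on the training data (rf.fit is an illustrative name):

# hedged sketch: fit the RF workflow once on the training data and pull
# the impurity-based importance scores (rf.fit is an illustrative name)

rf.fit <- sales.wf %>%
  add_model(rf) %>%
  fit(data = train_data)

rf.fit %>%
  extract_fit_parsnip() %>% # underlying parsnip model object
  purrr::pluck("fit") %>%   # the ranger fit itself
  ranger::importance()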

We now evaluate our nearest neighbors model.

The KNN model was not able to discriminate classes. With a sensitivity of 1.0, the model captured all true positives; however, it identified no true negatives.

# training metrics for KNN

collect_metrics(knn.train)%>%
  flextable()%>%
  set_caption('KNN Training Metrics')
# knn - check class predictions

knn.train %>%
  conf_mat_resampled()%>%
  flextable()%>%
  set_caption('KNN Predictions')
# KNN ROC curve

knn.train %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(order_priority, .pred_H) %>% # see  knn.train%>%collect_predictions()
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  labs(title = 'KNN ROC')+
  coord_equal()

Now we evaluate training results for the logistic regression model.

Like the previous models, the logistic model performed no better than random guessing. Compared with the random forest model, the logistic model’s accuracy was lower, although its sensitivity was higher.

# training metrics for logistic regression

collect_metrics(logit.train)%>%
  flextable()%>%
  set_caption('Logistic Regression Training Metrics')
# logistic regression - check class predictions

logit.train %>%
  conf_mat_resampled()%>%
  flextable()%>%
  set_caption('Logistic Regression Class Predictions')
# Logistic Regression ROC curve

logit.train %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(order_priority, .pred_H) %>% # see  logit.train%>%collect_predictions()
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  labs(title = 'Logistic Regression ROC')+
  coord_equal()
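Since the stated aim is model comparison, it also helps to place the cross-validated metrics for all three models side by side. A minimal sketch (the model labels are added here for readability):

# hedged sketch: combine cross-validated metrics for all three models
# into a single comparison table

bind_rows(
  collect_metrics(rf.train)    %>% mutate(model = 'Random Forest'),
  collect_metrics(knn.train)   %>% mutate(model = 'KNN'),
  collect_metrics(logit.train) %>% mutate(model = 'Logistic Regression')
) %>%
  select(model, .metric, mean, std_err) %>%
  flextable() %>%
  set_caption('Cross-Validated Training Metrics: Model Comparison')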

Applying any of our models to the test dataset is a trivial exercise, since none of them effectively discriminates the target class. However, I will apply the random forest model for the purpose of demonstration.

# model the test set

sales.final<- sales.wf %>%
  add_model(rf) %>%
  last_fit(data_split)

# collect test metrics

collect_metrics(sales.final)%>%
  flextable()%>%
  set_caption('Random Forest Test Metrics')
# collect predictions and create confusion matrix

collect_predictions(sales.final) %>%
  conf_mat(order_priority, .pred_class)
##           Truth
## Prediction    H Other
##      H     3074  9484
##      Other 3163  9280
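
Working the arithmetic on these counts makes the point concrete: the test-set accuracy is roughly a coin flip.

# accuracy computed directly from the confusion matrix counts above
(3074 + 9280) / (3074 + 9484 + 3163 + 9280) # ~0.49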

As expected, our random forest model is unable to discriminate classes in our response variable, order_priority.