# open libraries
library(tidymodels)
library(kknn)
library(tidyverse)
library(themis) # for addressing class imbalance
library(dlookr)
library(janitor)
library(lubridate)
library(flextable)
library(magrittr)
library(viridis)
library(patchwork)
This worksheet provides an analysis of sales data hosted at https://eforexcel.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/.
The purpose of this work is to build a classification model that can be used to predict a target variable from the dataset(s).
While designed to encourage analytical reasoning, the results of this exercise are trivial, because the datasets consist almost entirely of zero- or near-zero-variance variables.
As a result, I've taken this opportunity to encode three classification models (logistic regression, random forest, and K-nearest neighbors) using the tidymodels package and 5-fold cross-validation to support model comparison.
Results of each model were equivalent to random guessing.
# load sales datasets - 10,000 and 100,000 obs datasets
s.sales <- read_csv('https://raw.githubusercontent.com/sconnin/Model_Compare/main/10000%20Sales%20Records.csv')

# read larger sales dataset
l.sales <- read_csv('https://raw.githubusercontent.com/sconnin/Model_Compare/main/100000%20Sales%20Records.csv')
# create working dataframes for both datasets
small.sales <- s.sales
large.sales <- l.sales

# convert character cols to factor
small.sales[, 1:4] <- map(small.sales[, 1:4], factor)
large.sales[, 1:4] <- map(large.sales[, 1:4], factor)
# clean data
clean <- function(df){
  df %>%
    clean_names() %>%                    # initial clean of col names
    remove_empty(c("rows", "cols")) %>%  # remove any empty rows and cols
    distinct()                           # remove duplicate rows
}

small.sales <- clean(small.sales)
large.sales <- clean(large.sales)
# check for NA -- none found
#map(small.sales, ~sum(is.na(.))) # tag to avoid printing
#map(large.sales, ~sum(is.na(.)))
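An equivalent check can be written with dplyr's across(), shown here as an optional one-liner:

# alternative NA tally per column using across()
large.sales %>% summarise(across(everything(), ~ sum(is.na(.x))))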
# convert order and ship date char cols to mdy
small.sales$order_date <- mdy(small.sales$order_date)
large.sales$order_date <- mdy(large.sales$order_date)

small.sales$ship_date <- mdy(small.sales$ship_date)
large.sales$ship_date <- mdy(large.sales$ship_date)
# relocate cols and calc order turnaround time - bin this data
turn <- function(df){
  df %>%
    relocate(c(order_id, country, item_type, sales_channel, order_date, ship_date,
               order_priority, units_sold, unit_price, unit_cost, total_revenue,
               total_cost, total_profit), .before = region) %>%
    mutate(turnaround_time = ship_date - order_date, .after = ship_date) %>%
    mutate(turnaround_time = str_replace(turnaround_time, ' days', '')) %>%
    mutate(turnaround_time = as.integer(turnaround_time)) %>%
    mutate(turnaround_time = cut(turnaround_time, breaks = c(0, 10, 20, 30, 40, 50),
                                 include.lowest = TRUE)) %>%
    mutate(turnaround_time = factor(turnaround_time)) %>%
    arrange(desc(order_date))
}

small.sales <- turn(small.sales)
large.sales <- turn(large.sales)
Evaluate univariate statistics for numerical and factor variables. All downstream analyses will be restricted to the larger dataset.
# summarize numeric variables
large.sales %>%
  diagnose_numeric() %>%
  dplyr::select(variables, min, mean, median, max, zero, minus, outlier) %>%
  flextable() %>%
  set_caption("Summary Statistics for Large Dataset")
variables | min | mean | median | max | zero | minus | outlier |
--- | --- | --- | --- | --- | --- | --- | --- |
order_id | 100,008,904.00 | 550,395,554.1761 | 547,718,512.50 | 999,996,459.00 | 0 | 0 | 0 |
units_sold | 1.00 | 5,001.4462 | 5,007.00 | 10,000.00 | 0 | 0 | 0 |
unit_price | 9.33 | 266.7040 | 205.70 | 668.27 | 0 | 0 | 0 |
unit_cost | 6.92 | 188.0197 | 117.11 | 524.96 | 0 | 0 | 0 |
total_revenue | 18.66 | 1,336,066.7307 | 789,891.57 | 6,682,700.00 | 0 | 0 | 6,597 |
total_cost | 13.84 | 941,975.4934 | 467,937.41 | 5,249,075.04 | 0 | 0 | 9,678 |
total_profit | 4.82 | 394,091.2373 | 283,657.46 | 1,738,700.00 | 0 | 0 | 4,125 |
# summarize factor variables
large.sales %>%
  diagnose_category() %>%
  flextable() %>%
  set_caption("Summary Statistics for Categorical Variables: large dataset")
variables | levels | N | freq | ratio | rank |
--- | --- | --- | --- | --- | --- |
country | Sudan | 100,000 | 623 | 0.623 | 1 |
country | New Zealand | 100,000 | 593 | 0.593 | 2 |
country | Vatican City | 100,000 | 590 | 0.590 | 3 |
country | Malta | 100,000 | 589 | 0.589 | 4 |
country | Mozambique | 100,000 | 589 | 0.589 | 4 |
country | Cambodia | 100,000 | 584 | 0.584 | 6 |
country | Tunisia | 100,000 | 584 | 0.584 | 6 |
country | Panama | 100,000 | 578 | 0.578 | 8 |
country | Rwanda | 100,000 | 576 | 0.576 | 9 |
country | Cote d'Ivoire | 100,000 | 575 | 0.575 | 10 |
item_type | Office Supplies | 100,000 | 8,426 | 8.426 | 1 |
item_type | Cereal | 100,000 | 8,421 | 8.421 | 2 |
item_type | Baby Food | 100,000 | 8,407 | 8.407 | 3 |
item_type | Cosmetics | 100,000 | 8,370 | 8.370 | 4 |
item_type | Personal Care | 100,000 | 8,364 | 8.364 | 5 |
item_type | Meat | 100,000 | 8,320 | 8.320 | 6 |
item_type | Snacks | 100,000 | 8,308 | 8.308 | 7 |
item_type | Clothes | 100,000 | 8,304 | 8.304 | 8 |
item_type | Vegetables | 100,000 | 8,282 | 8.282 | 9 |
item_type | Household | 100,000 | 8,278 | 8.278 | 10 |
sales_channel | Online | 100,000 | 50,054 | 50.054 | 1 |
sales_channel | Offline | 100,000 | 49,946 | 49.946 | 2 |
order_date | 2010-11-27 | 100,000 | 57 | 0.057 | 1 |
order_date | 2010-10-03 | 100,000 | 56 | 0.056 | 2 |
order_date | 2011-03-23 | 100,000 | 56 | 0.056 | 2 |
order_date | 2017-05-22 | 100,000 | 56 | 0.056 | 2 |
order_date | 2014-02-19 | 100,000 | 55 | 0.055 | 5 |
order_date | 2015-09-20 | 100,000 | 55 | 0.055 | 5 |
order_date | 2016-10-22 | 100,000 | 55 | 0.055 | 5 |
order_date | 2016-12-06 | 100,000 | 55 | 0.055 | 5 |
order_date | 2013-07-15 | 100,000 | 54 | 0.054 | 9 |
order_date | 2010-09-02 | 100,000 | 53 | 0.053 | 10 |
ship_date | 2015-10-04 | 100,000 | 61 | 0.061 | 1 |
ship_date | 2013-08-04 | 100,000 | 60 | 0.060 | 2 |
ship_date | 2010-10-26 | 100,000 | 59 | 0.059 | 3 |
ship_date | 2014-11-06 | 100,000 | 57 | 0.057 | 4 |
ship_date | 2011-12-20 | 100,000 | 56 | 0.056 | 5 |
ship_date | 2011-04-29 | 100,000 | 55 | 0.055 | 6 |
ship_date | 2011-05-09 | 100,000 | 55 | 0.055 | 6 |
ship_date | 2013-12-13 | 100,000 | 55 | 0.055 | 6 |
ship_date | 2013-04-04 | 100,000 | 54 | 0.054 | 9 |
ship_date | 2015-06-02 | 100,000 | 54 | 0.054 | 9 |
turnaround_time | [0,10] | 100,000 | 21,566 | 21.566 | 1 |
turnaround_time | (40,50] | 100,000 | 19,866 | 19.866 | 2 |
turnaround_time | (10,20] | 100,000 | 19,676 | 19.676 | 3 |
turnaround_time | (30,40] | 100,000 | 19,447 | 19.447 | 4 |
turnaround_time | (20,30] | 100,000 | 19,445 | 19.445 | 5 |
order_priority | M | 100,000 | 25,088 | 25.088 | 1 |
order_priority | L | 100,000 | 25,016 | 25.016 | 2 |
order_priority | C | 100,000 | 24,951 | 24.951 | 3 |
order_priority | H | 100,000 | 24,945 | 24.945 | 4 |
region | Sub-Saharan Africa | 100,000 | 26,019 | 26.019 | 1 |
region | Europe | 100,000 | 25,877 | 25.877 | 2 |
region | Asia | 100,000 | 14,547 | 14.547 | 3 |
region | Middle East and North Africa | 100,000 | 12,580 | 12.580 | 4 |
region | Central America and the Caribbean | 100,000 | 10,731 | 10.731 | 5 |
region | Australia and Oceania | 100,000 | 8,113 | 8.113 | 6 |
region | North America | 100,000 | 2,133 | 2.133 | 7 |
Pairwise comparisons for numeric variables
cor.tabl <- function(df, titles){
  df %>%
    filter_if(is.numeric, all_vars((.) != 0)) %>%
    correlate() %>%
    filter(coef_corr > .6 | coef_corr < -.6) %>%  # set thresholds to limit output
    arrange(desc(coef_corr)) %>%
    flextable() %>%
    set_caption(titles)
}

cor.tabl(large.sales, "Pairwise Correlation: Numerical Covariates: Large Dataset")
var1 | var2 | coef_corr |
--- | --- | --- |
total_cost | total_revenue | 0.9876907 |
total_revenue | total_cost | 0.9876907 |
unit_cost | unit_price | 0.9860299 |
unit_price | unit_cost | 0.9860299 |
total_profit | total_revenue | 0.8801859 |
total_revenue | total_profit | 0.8801859 |
total_profit | total_cost | 0.7951102 |
total_cost | total_profit | 0.7951102 |
total_cost | unit_cost | 0.7652115 |
unit_cost | total_cost | 0.7652115 |
total_cost | unit_price | 0.7544118 |
unit_price | total_cost | 0.7544118 |
total_revenue | unit_price | 0.7392584 |
unit_price | total_revenue | 0.7392584 |
total_revenue | unit_cost | 0.7290550 |
unit_cost | total_revenue | 0.7290550 |
total_profit | units_sold | 0.6016244 |
units_sold | total_profit | 0.6016244 |
Four of the six numerical predictors are zero-variance features: units_sold, total_revenue, total_cost, and total_profit. These covariates should be removed from the analysis.
The remaining two predictors (unit_price and unit_cost) are near-zero-variance variables. They are also highly collinear (r ≈ 0.99), so we can drop either one from our analysis.
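For completeness, tidymodels offers recipe steps that automate this kind of screening. A minimal sketch (illustrative only; this throwaway recipe is not part of the modeling pipeline below):

# sketch: dropping zero- and near-zero-variance predictors inside a recipe
nzv_demo <-
  recipe(order_priority ~ ., data = large.sales) %>%
  step_zv(all_predictors()) %>%   # drop predictors with a single unique value
  step_nzv(all_predictors())      # drop near-zero-variance predictors

# inspect which columns survive the screening
prep(nzv_demo) %>% bake(new_data = NULL) %>% names()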
# Subset numerical variables
num_box <- select_if(large.sales, is.numeric) %>%
  select(!order_id)

# create working df for boxplots
order <- large.sales[8]               # order_priority column
num_box <- cbind(order, num_box)

# Plot using boxplots
response <- names(num_box)[1]         # order_priority (needs to be fct for these plots)
response <- purrr::set_names(response)

explain <- names(num_box)[2:7]        # explanatory variables
explain <- purrr::set_names(explain)

box_fun <- function(x) {
  ggplot(num_box, aes_string(x = x, y = 'order_priority')) +
    geom_boxplot(aes(fill = order_priority, alpha = 0.4),
                 outlier.color = 'red', show.legend = FALSE) +
    scale_fill_viridis(discrete = TRUE, option = "E") +
    coord_flip() +
    labs(title = 'Distribution of Continuous Variables Relative to Order Priority: Large Sales') +
    theme_classic()
}

box_plots <- map(explain, ~box_fun(.x))  # creates a list of plots using purrr

plot(reduce(box_plots, `+`))             # layout with patchwork
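As a side note, aes_string() is soft-deprecated in current ggplot2 releases; an equivalent box_fun using the .data pronoun would look like this (a sketch; behavior is unchanged apart from moving alpha out of aes()):

# equivalent plotting function using the .data pronoun instead of aes_string()
box_fun <- function(x) {
  ggplot(num_box, aes(x = .data[[x]], y = order_priority)) +
    geom_boxplot(aes(fill = order_priority), alpha = 0.4,
                 outlier.color = 'red', show.legend = FALSE) +
    scale_fill_viridis(discrete = TRUE, option = "E") +
    coord_flip() +
    labs(title = 'Distribution of Continuous Variables Relative to Order Priority: Large Sales') +
    theme_classic()
}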
Evaluate categorical variables
Finding: all of the categorical predictors are zero-variance features and should be removed from our analysis.
# Subset categorical variables into a df of factor vars
mosaic <- large.sales %>%
  mutate(order_priority = as_factor(order_priority)) %>%  # subset order_priority as factor
  dplyr::select(where(is.factor)) %>%
  relocate(order_priority, .before = country)

target <- all_of(names(mosaic)[1])       # set name for the target, order_priority
predictors <- all_of(names(mosaic)[3:6])

# generate mosaic plots with purrr and dlookr
(plots <- predictors %>%
   map(function(predictors) mosaic %>% target_by(target) %>% relate(predictors) %>% plot()))
(Four mosaic plots, one per categorical predictor against order_priority, render here.)
Review of both numerical and categorical variables indicates that only one covariate (unit_price or unit_cost) can be used to model the response variable, order_priority. As a result, the classification problem is reduced to the question of whether one of these covariates has sufficient variance to discriminate class levels for the response, order_priority.
This is obviously a trivial exercise. However, a logical approach is to convert order_priority to a binary form (H or Other) and attempt to discriminate these classes using a single covariate. I will use unit_price for this purpose.
Aggregating levels within order_priority produces imbalanced class frequencies - 25% H vs. 75% Other.
# remove zero variance predictors
sales <- large.sales %>%
  select(order_priority, unit_price) %>%
  mutate(order_priority = ifelse(order_priority == 'H', 'H', 'Other'))

# assess class distributions
table(sales$order_priority) %>%
  as.data.frame() %>%
  flextable() %>%
  set_caption('Class Distribution for Sales: Large Dataset')
Var1 | Freq |
--- | --- |
H | 24,945 |
Other | 75,055 |
The first step in our modeling process is to create training and test sets. Note that the modeling process that follows draws on the tidymodels package, which is designed as a successor to caret.
set.seed(5345459)

# create random data split
data_split <- initial_split(sales, prop = .75, strata = order_priority)

# create train and test sets from split
train_data <- training(data_split)
test_data <- testing(data_split)
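As a quick sanity check (optional; not part of the original workflow), the stratified split should preserve the roughly 25/75 class ratio in both partitions:

# confirm that stratification preserved class proportions
train_data %>% count(order_priority) %>% mutate(prop = n / sum(n))
test_data %>% count(order_priority) %>% mutate(prop = n / sum(n))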
With training and test sets in hand, we build a modeling recipe and finalize our data preparation process. I have included steps that are not relevant to the current dataset but which illustrate the options available for the recipe function.
# pre-model processing using recipes
attr_recipe <-
  recipe(order_priority ~ ., data = sales) %>%
  step_normalize(all_numeric()) %>%  # center and scale numerical vars
  #step_dummy(all_nominal(), -all_outcomes()) %>%  # convert factor cols to numeric
  #step_zv(all_numeric()) %>%  # remove numeric vars that have zero variance (single unique value)
  #step_corr(all_predictors(), threshold = 0.7, method = 'spearman') %>%  # remove predictors that have large correlations with other covariates
  step_smote(order_priority)  # rebalance response using the SMOTE method (themis package)

# review type, roles, source
summary(attr_recipe)
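To confirm that SMOTE does what we expect, the recipe can be prepped and baked on the training data; a quick sketch, assuming the recipe preps cleanly (the class counts should come back roughly 50/50):

# prep the recipe on the training set and inspect the rebalanced classes
attr_recipe %>%
  prep(training = train_data) %>%
  bake(new_data = NULL) %>%
  count(order_priority)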
In order to compare classification models, I will include five-fold cross-validation in our modeling workflow.
set.seed(11)

cv_folds <-
  vfold_cv(train_data,
           v = 5,
           strata = order_priority)
For the purpose of this exercise, I will build classifiers based on the following methods: logistic regression, random forests, and KNN.
# Random Forest
library(ranger)

rf <- rand_forest(trees = 500) %>%  # given the trivial nature of the data, 500 is reasonable
  set_engine('ranger', importance = 'impurity') %>%  # impurity will provide variable importance scores
  set_mode('classification')

# K-nearest neighbors
knn <-
  nearest_neighbor(neighbors = 4) %>%  # can tune this parameter
  set_engine('kknn') %>%
  set_mode('classification')

# Logistic Regression
logit <-
  logistic_reg() %>%
  set_engine(engine = 'glm') %>%
  set_mode('classification')
The next step is to initiate a workflow that combines recipe and models, and to set up our metrics for model evaluation.
# initiate a workflow
sales.wf <-
  workflow() %>%
  add_recipe(attr_recipe)

# select metrics for model evaluation
sales.metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)
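As an aside, the KNN spec above fixes neighbors at 4; if tuning were worthwhile here, it would look roughly like this (a sketch only; the grid range is illustrative and this is not run as part of the exercise):

# sketch: tune k over a small grid instead of fixing it
knn_tune <-
  nearest_neighbor(neighbors = tune()) %>%
  set_engine('kknn') %>%
  set_mode('classification')

knn_grid <- grid_regular(neighbors(range = c(1, 15)), levels = 8)

knn_tuned <-
  sales.wf %>%
  add_model(knn_tune) %>%
  tune_grid(resamples = cv_folds, grid = knn_grid, metrics = sales.metrics)

select_best(knn_tuned, metric = 'roc_auc')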
Now we combine workflow, model, metrics, and cross-validation.
# train random forest model
rf.train <-
  sales.wf %>%
  add_model(rf) %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = sales.metrics,
    control = control_resamples(save_pred = TRUE)
  )

# train knn model
knn.train <-
  sales.wf %>%
  add_model(knn) %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = sales.metrics,
    control = control_resamples(save_pred = TRUE)
  )

# train logistic model
logit.train <-
  sales.wf %>%
  add_model(logit) %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = sales.metrics,
    control = control_resamples(save_pred = TRUE)
  )
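Before inspecting each model individually, it can be convenient to stack the resampled metrics for a side-by-side view (a small convenience sketch; the model labels are mine):

# combine resampled metrics from all three models for comparison
bind_rows(
  collect_metrics(rf.train) %>% mutate(model = 'random forest'),
  collect_metrics(knn.train) %>% mutate(model = 'knn'),
  collect_metrics(logit.train) %>% mutate(model = 'logistic')
) %>%
  select(model, .metric, mean, std_err) %>%
  arrange(.metric, desc(mean))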
With model training completed, we can evaluate the results. First we evaluate our Random Forest model.
From the training metrics and class predictions, it's clear that the random forest model is unable to discriminate classes of order_priority. This isn't surprising given the near-zero variance of our predictor. Our ROC curve confirms this assessment and indicates that the results are equivalent to random guessing.
# training metrics for random forest
collect_metrics(rf.train) %>%
  flextable() %>%
  set_caption('Random Forest Training Metrics')
.metric | .estimator | mean | n | std_err | .config |
--- | --- | --- | --- | --- | --- |
accuracy | binary | 0.4700996 | 5 | 0.015816637 | Preprocessor1_Model1 |
roc_auc | binary | 0.5031038 | 5 | 0.001138368 | Preprocessor1_Model1 |
sensitivity | binary | 0.5714643 | 5 | 0.030810977 | Preprocessor1_Model1 |
specificity | binary | 0.4364097 | 5 | 0.031280306 | Preprocessor1_Model1 |
# RF - check class predictions
rf.train %>%
  conf_mat_resampled() %>%
  flextable() %>%
  set_caption('Random Forest Class Predictions')
Prediction | Truth | Freq |
--- | --- | --- |
H | H | 2,138.2 |
H | Other | 6,345.0 |
Other | H | 1,603.4 |
Other | Other | 4,913.2 |
# RF ROC curve
rf.train %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(order_priority, .pred_H) %>%
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  labs(title = 'Random Forest ROC') +
  coord_equal()
We now evaluate our nearest neighbors model.
The KNN model was also unable to discriminate classes. With a sensitivity of 1.0 and a specificity of 0.0, the model simply predicted every observation as the positive class (H), capturing all true positives and no true negatives.
# training metrics for KNN
collect_metrics(knn.train) %>%
  flextable() %>%
  set_caption('KNN Training Metrics')
.metric | .estimator | mean | n | std_err | .config |
--- | --- | --- | --- | --- | --- |
accuracy | binary | 0.2494433 | 5 | 0.00001131414 | Preprocessor1_Model1 |
roc_auc | binary | 0.5000000 | 5 | 0.00000000000 | Preprocessor1_Model1 |
sensitivity | binary | 1.0000000 | 5 | 0.00000000000 | Preprocessor1_Model1 |
specificity | binary | 0.0000000 | 5 | 0.00000000000 | Preprocessor1_Model1 |
# knn - check class predictions
knn.train %>%
  conf_mat_resampled() %>%
  flextable() %>%
  set_caption('KNN Predictions')
Prediction | Truth | Freq |
--- | --- | --- |
H | H | 3,741.6 |
H | Other | 11,258.2 |
Other | H | 0.0 |
Other | Other | 0.0 |
# KNN ROC curve
knn.train %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(order_priority, .pred_H) %>%
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  labs(title = 'KNN ROC') +
  coord_equal()
Now we evaluate training results for the logistic regression model.
Similar to the previous models, the logistic model performed no better than random guessing. Compared with the random forest model, the logistic model's accuracy was lower, but its sensitivity was higher.
# training metrics for logistic regression
collect_metrics(logit.train) %>%
  flextable() %>%
  set_caption('Logistic Regression Training Metrics')
.metric | .estimator | mean | n | std_err | .config |
--- | --- | --- | --- | --- | --- |
accuracy | binary | 0.4201790 | 5 | 0.001051367 | Preprocessor1_Model1 |
roc_auc | binary | 0.5045179 | 5 | 0.001001783 | Preprocessor1_Model1 |
sensitivity | binary | 0.6731345 | 5 | 0.001001477 | Preprocessor1_Model1 |
specificity | binary | 0.3361107 | 5 | 0.001509571 | Preprocessor1_Model1 |
# logistic regression - check class predictions
logit.train %>%
  conf_mat_resampled() %>%
  flextable() %>%
  set_caption('Logistic Regression Class Predictions')
Prediction | Truth | Freq |
--- | --- | --- |
H | H | 2,518.6 |
H | Other | 7,474.2 |
Other | H | 1,223.0 |
Other | Other | 3,784.0 |
# Logistic Regression ROC curve
logit.train %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(order_priority, .pred_H) %>%
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  labs(title = 'Logistic Regression ROC') +
  coord_equal()
Applying any of our models to the test dataset is a trivial exercise, since none of them effectively discriminates our target class. However, I will apply the random forest model for the purpose of demonstration.
# model the test set
sales.final <- sales.wf %>%
  add_model(rf) %>%
  last_fit(data_split)

# collect test metrics
collect_metrics(sales.final) %>%
  flextable() %>%
  set_caption('Random Forest Test Metrics')
.metric | .estimator | .estimate | .config |
--- | --- | --- | --- |
accuracy | binary | 0.4941402 | Preprocessor1_Model1 |
roc_auc | binary | 0.4924767 | Preprocessor1_Model1 |
# collect predictions and create confusion matrix
collect_predictions(sales.final) %>%
conf_mat(order_priority, .pred_class)
## Truth
## Prediction H Other
## H 3074 9484
## Other 3163 9280
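Finally, since the ranger engine was fit with importance = 'impurity', the impurity-based importance scores can be pulled from the final fit. A short sketch (extract_fit_engine() is available in recent tidymodels releases; with a single predictor the output is uninformative, but it shows the mechanics):

# pull the underlying ranger fit and report impurity-based importance
sales.final %>%
  extract_fit_engine() %>%
  ranger::importance()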
As expected, our random forest model is unable to discriminate classes in our response variable, order_priority.