# load libraries
library(tidymodels)
library(kknn)
library(tidyverse)
library(themis) # for addressing class imbalance
library(dlookr)
library(janitor)
library(lubridate)
library(flextable)
library(magrittr)
library(viridis)
library(patchwork)

This worksheet provides an analysis of sales data hosted at https://eforexcel.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/.
The purpose of this work is to build a classification model that can be used to predict a target variable from the dataset(s).
While designed to encourage analytical reasoning, the results of this exercise are trivial: the datasets, in their entirety, consist of zero and near-zero variance variables.
As a result, I've taken this opportunity to fit three classification models (logistic regression, random forest, and K-nearest neighbors) using the tidymodels package, with 5-fold cross-validation to assist model comparison.
Results of each model were equivalent to random guessing.
# load sales datasets - 10,000 and 100,000 obs datasets
s.sales<-read_csv('https://raw.githubusercontent.com/sconnin/Model_Compare/main/10000%20Sales%20Records.csv')
# read larger sales dataset
l.sales<-read_csv('https://raw.githubusercontent.com/sconnin/Model_Compare/main/100000%20Sales%20Records.csv')

# create working dataframes for both datasets
small.sales<-s.sales
large.sales<-l.sales
# convert character cols to factor
small.sales[,1:4]<-map(small.sales[,1:4], factor)
large.sales[,1:4]<-map(large.sales[,1:4], factor)
# clean data
clean<-function(df){
df%>%
clean_names()%>% # initial clean of col names
remove_empty(c("rows", "cols"))%>% # remove any empty rows and cols
distinct() # remove duplicate rows
}
small.sales<-clean(small.sales)
large.sales<-clean(large.sales)
# check for NA -- none found
#map(small.sales, ~sum(is.na(.))) # tag to avoid printing
#map(large.sales, ~sum(is.na(.)))
# convert order and shipping date char col to mdy
small.sales$order_date<-mdy(small.sales$order_date)
large.sales$order_date<-mdy(large.sales$order_date)
small.sales$ship_date<-mdy(small.sales$ship_date)
large.sales$ship_date<-mdy(large.sales$ship_date)
# relocate cols and calc order turnaround time - bin this data
turn<-function(df){
df%>%
relocate(c(order_id, country, item_type, sales_channel, order_date, ship_date, order_priority, units_sold, unit_price, unit_cost, total_revenue, total_cost, total_profit), .before = region)%>%
mutate(turnaround_time = as.integer(ship_date-order_date), .after=ship_date)%>% # shipping turnaround in days
mutate(turnaround_time = cut(turnaround_time, breaks = c(0,10,20,30,40,50), include.lowest=TRUE))%>% # bin into 10-day intervals; cut() returns a factor
arrange(desc(order_date))
}
small.sales<-turn(small.sales)
large.sales<-turn(large.sales)

Evaluate univariate statistics for numerical and factor variables. All downstream analyses will be restricted to the larger dataset.
# summarize numeric variables
large.sales%>%
diagnose_numeric()%>%
dplyr::select(variables, min, mean, median, max, zero, minus, outlier)%>%
flextable()%>%
set_caption("Summary Statistics for Large Dataset")variables | min | mean | median | max | zero | minus | outlier |
order_id | 100,008,904.00 | 550,395,554.1761 | 547,718,512.50 | 999,996,459.00 | 0 | 0 | 0 |
units_sold | 1.00 | 5,001.4462 | 5,007.00 | 10,000.00 | 0 | 0 | 0 |
unit_price | 9.33 | 266.7040 | 205.70 | 668.27 | 0 | 0 | 0 |
unit_cost | 6.92 | 188.0197 | 117.11 | 524.96 | 0 | 0 | 0 |
total_revenue | 18.66 | 1,336,066.7307 | 789,891.57 | 6,682,700.00 | 0 | 0 | 6,597 |
total_cost | 13.84 | 941,975.4934 | 467,937.41 | 5,249,075.04 | 0 | 0 | 9,678 |
total_profit | 4.82 | 394,091.2373 | 283,657.46 | 1,738,700.00 | 0 | 0 | 4,125 |
#summarize factor variables
large.sales%>%
diagnose_category()%>%
flextable()%>%
set_caption("Summary Statistics for Categorical Variables: large dataset")variables | levels | N | freq | ratio | rank |
country | Sudan | 100,000 | 623 | 0.623 | 1 |
country | New Zealand | 100,000 | 593 | 0.593 | 2 |
country | Vatican City | 100,000 | 590 | 0.590 | 3 |
country | Malta | 100,000 | 589 | 0.589 | 4 |
country | Mozambique | 100,000 | 589 | 0.589 | 4 |
country | Cambodia | 100,000 | 584 | 0.584 | 6 |
country | Tunisia | 100,000 | 584 | 0.584 | 6 |
country | Panama | 100,000 | 578 | 0.578 | 8 |
country | Rwanda | 100,000 | 576 | 0.576 | 9 |
country | Cote d'Ivoire | 100,000 | 575 | 0.575 | 10 |
item_type | Office Supplies | 100,000 | 8,426 | 8.426 | 1 |
item_type | Cereal | 100,000 | 8,421 | 8.421 | 2 |
item_type | Baby Food | 100,000 | 8,407 | 8.407 | 3 |
item_type | Cosmetics | 100,000 | 8,370 | 8.370 | 4 |
item_type | Personal Care | 100,000 | 8,364 | 8.364 | 5 |
item_type | Meat | 100,000 | 8,320 | 8.320 | 6 |
item_type | Snacks | 100,000 | 8,308 | 8.308 | 7 |
item_type | Clothes | 100,000 | 8,304 | 8.304 | 8 |
item_type | Vegetables | 100,000 | 8,282 | 8.282 | 9 |
item_type | Household | 100,000 | 8,278 | 8.278 | 10 |
sales_channel | Online | 100,000 | 50,054 | 50.054 | 1 |
sales_channel | Offline | 100,000 | 49,946 | 49.946 | 2 |
order_date | 2010-11-27 | 100,000 | 57 | 0.057 | 1 |
order_date | 2010-10-03 | 100,000 | 56 | 0.056 | 2 |
order_date | 2011-03-23 | 100,000 | 56 | 0.056 | 2 |
order_date | 2017-05-22 | 100,000 | 56 | 0.056 | 2 |
order_date | 2014-02-19 | 100,000 | 55 | 0.055 | 5 |
order_date | 2015-09-20 | 100,000 | 55 | 0.055 | 5 |
order_date | 2016-10-22 | 100,000 | 55 | 0.055 | 5 |
order_date | 2016-12-06 | 100,000 | 55 | 0.055 | 5 |
order_date | 2013-07-15 | 100,000 | 54 | 0.054 | 9 |
order_date | 2010-09-02 | 100,000 | 53 | 0.053 | 10 |
ship_date | 2015-10-04 | 100,000 | 61 | 0.061 | 1 |
ship_date | 2013-08-04 | 100,000 | 60 | 0.060 | 2 |
ship_date | 2010-10-26 | 100,000 | 59 | 0.059 | 3 |
ship_date | 2014-11-06 | 100,000 | 57 | 0.057 | 4 |
ship_date | 2011-12-20 | 100,000 | 56 | 0.056 | 5 |
ship_date | 2011-04-29 | 100,000 | 55 | 0.055 | 6 |
ship_date | 2011-05-09 | 100,000 | 55 | 0.055 | 6 |
ship_date | 2013-12-13 | 100,000 | 55 | 0.055 | 6 |
ship_date | 2013-04-04 | 100,000 | 54 | 0.054 | 9 |
ship_date | 2015-06-02 | 100,000 | 54 | 0.054 | 9 |
turnaround_time | [0,10] | 100,000 | 21,566 | 21.566 | 1 |
turnaround_time | (40,50] | 100,000 | 19,866 | 19.866 | 2 |
turnaround_time | (10,20] | 100,000 | 19,676 | 19.676 | 3 |
turnaround_time | (30,40] | 100,000 | 19,447 | 19.447 | 4 |
turnaround_time | (20,30] | 100,000 | 19,445 | 19.445 | 5 |
order_priority | M | 100,000 | 25,088 | 25.088 | 1 |
order_priority | L | 100,000 | 25,016 | 25.016 | 2 |
order_priority | C | 100,000 | 24,951 | 24.951 | 3 |
order_priority | H | 100,000 | 24,945 | 24.945 | 4 |
region | Sub-Saharan Africa | 100,000 | 26,019 | 26.019 | 1 |
region | Europe | 100,000 | 25,877 | 25.877 | 2 |
region | Asia | 100,000 | 14,547 | 14.547 | 3 |
region | Middle East and North Africa | 100,000 | 12,580 | 12.580 | 4 |
region | Central America and the Caribbean | 100,000 | 10,731 | 10.731 | 5 |
region | Australia and Oceania | 100,000 | 8,113 | 8.113 | 6 |
region | North America | 100,000 | 2,133 | 2.133 | 7 |
Pairwise correlations for numeric variables
cor.tabl<-function(df, titles){
df%>%
filter_if(is.numeric, all_vars((.) != 0))%>%
correlate()%>%
filter(coef_corr > .6 | coef_corr < -.6)%>% # set thresholds to limit output
arrange(desc(coef_corr))%>%
flextable()%>%
set_caption(titles)
}
cor.tabl(large.sales, "Pairwise Correlation: Numerical Covariates: Large Dataset")

var1 | var2 | coef_corr |
total_cost | total_revenue | 0.9876907 |
total_revenue | total_cost | 0.9876907 |
unit_cost | unit_price | 0.9860299 |
unit_price | unit_cost | 0.9860299 |
total_profit | total_revenue | 0.8801859 |
total_revenue | total_profit | 0.8801859 |
total_profit | total_cost | 0.7951102 |
total_cost | total_profit | 0.7951102 |
total_cost | unit_cost | 0.7652115 |
unit_cost | total_cost | 0.7652115 |
total_cost | unit_price | 0.7544118 |
unit_price | total_cost | 0.7544118 |
total_revenue | unit_price | 0.7392584 |
unit_price | total_revenue | 0.7392584 |
total_revenue | unit_cost | 0.7290550 |
unit_cost | total_revenue | 0.7290550 |
total_profit | units_sold | 0.6016244 |
units_sold | total_profit | 0.6016244 |
Four of the six numerical predictors are zero variance features: units_sold, total_revenue, total_cost, and total_profit. These covariates should be removed from the analysis.
The two remaining predictors (unit_price and unit_cost) are near-zero variance variables. They are also highly collinear (r = 0.98), so we can drop either one from our analysis.
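These judgments can be checked programmatically. A minimal sketch, assuming the caret package is installed (caret is not used elsewhere in this worksheet); nearZeroVar() reports the frequency-ratio and percent-unique diagnostics on which zero and near-zero variance calls are based.

# flag zero- and near-zero-variance numeric predictors
caret::nearZeroVar(select_if(large.sales, is.numeric), saveMetrics = TRUE)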
# Subset numerical variables
num_box<-select_if(large.sales, is.numeric)%>%
select(!order_id)
# create working df for boxplots
order<-large.sales[8] # column 8 = order_priority
num_box<-cbind(order, num_box)
# Plot using boxplots
response = names(num_box)[1] # response variable: order_priority
response = purrr::set_names(response)
explain <- names(num_box)[2:7] # explanatory variables
explain = purrr::set_names(explain)
box_fun = function(x) {
ggplot(num_box, aes(x = .data[[x]], y = order_priority)) +
geom_boxplot(aes(fill = order_priority), alpha = 0.4,
outlier.color = 'red', show.legend = FALSE)+
scale_fill_viridis(discrete = TRUE, option = "E")+
coord_flip()+
labs(title = 'Distribution of Continuous Variables Relative to Order Priority: Large Sales')+
theme_classic()
}
box_plots<-map(explain, ~box_fun(.x)) #creates a list of plots using purrr
plot(reduce(box_plots,`+`)) # layout with patchwork

Evaluate categorical variables.
Finding: all of the categorical predictors are zero variance. They should be removed from our analysis.
# Subset categorical variables into a df of factor vars
mosaic<-large.sales%>% #subset target_flag as factor
mutate(order_priority = as_factor(order_priority))%>%
dplyr::select(where(is.factor))%>%
relocate(order_priority, .before=country)
target <- all_of(names(mosaic)[1]) # set name for target_flag
predictors <- all_of(names(mosaic)[3:6])
#generate mosaic plots with purrr and dlookr
(plots<-predictors%>%map(function(predictors) mosaic%>%target_by(target)%>%relate(predictors)%>%plot()))
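As a numeric complement to the mosaic plots, each categorical predictor can be tested for association with order_priority. A minimal sketch using base R's chisq.test(); this check is my addition and not part of the original analysis.

# chi-squared tests of association between each categorical predictor and the response
mosaic%>%
select(all_of(predictors))%>%
map_dbl(~ chisq.test(table(.x, mosaic$order_priority))$p.value)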
Review of both numerical and categorical variables indicates that only one covariate (unit_price or unit_cost) can be used to model the response variable, order_priority. As a result, the classification problem is reduced to the question of whether one of these covariates has sufficient variance to discriminate class levels for the response, order_priority.
This is obviously a trivial exercise. However, a logical approach is to convert order_priority to a binary form (H or Other) and attempt to discriminate these classes using a single covariate. I will use unit_price for this purpose.
Aggregating levels within order_priority produces imbalanced class frequencies - 25% H vs. 75% Other.
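Note that forcats (loaded with the tidyverse) provides fct_other() as a more declarative way to collapse factor levels. A minimal sketch equivalent to the ifelse() call below; sales_alt is an illustrative name only.

# collapse all non-H levels into 'Other'; fct_other() coerces the character column to a factor
sales_alt<-large.sales%>%
select(order_priority, unit_price)%>%
mutate(order_priority = fct_other(order_priority, keep = 'H', other_level = 'Other'))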
# remove zero variance predictors and binarize the response
sales<-large.sales%>%
select(order_priority, unit_price)%>%
mutate(order_priority = factor(ifelse(order_priority == 'H', 'H', 'Other'))) # a factor outcome is required for stratified splits, step_smote, and the classification engines
# assess class distributions
table(sales$order_priority)%>%
as.data.frame()%>%
flextable%>%
set_caption('Class Distribution for Sales: Large Dataset')

Var1 | Freq |
H | 24,945 |
Other | 75,055 |
The first step in our modeling process is to create training and test sets. Note that the modeling process that follows draws on the tidymodels package, which was designed as the successor to caret.
set.seed(5345459)
# create random data split
data_split <- initial_split(sales, prop = .75, strata = order_priority)
# create train and test sets from split
train_data <- training(data_split)
test_data <- testing(data_split)

With training and test sets in hand, we build a modeling recipe and finalize our data preparation process. I have included commented-out steps that are not relevant to the current dataset but which illustrate the options available for the recipe function.
# pre-model processing using recipes
attr_recipe<-
recipe(order_priority ~., data=train_data)%>% # define the recipe on the training set
step_normalize(all_numeric())%>% # center and scale numerical vars
#step_dummy(all_nominal(), -all_outcomes())%>% # convert factor cols to dummy variables
#step_zv(all_numeric())%>% # remove numeric vars that have zero variance (a single unique value)
#step_corr(all_predictors(), threshold = 0.7, method = 'spearman')%>% # remove predictors that have large correlations with other covariates
step_smote(order_priority) # rebalance response using the SMOTE method (themis package)
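Before building models, it is worth confirming what the recipe does to the data. The following is a minimal sketch (not part of the original workflow): prep() estimates the recipe steps on the training set, and bake(new_data = NULL) returns the processed training data, in which the SMOTE step should have balanced the two classes.

# prep the recipe and inspect the processed training set
attr_prepped<-prep(attr_recipe, training = train_data)
bake(attr_prepped, new_data = NULL)%>%
count(order_priority) # class counts should be roughly balanced after SMOTE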
# review type, roles, source
summary(attr_recipe)

In order to compare classification models, I will include five-fold cross-validation in the modeling workflow.
set.seed(11)
cv_folds<-
vfold_cv(train_data,
v=5,
strata = order_priority)

For the purpose of this exercise, I will build classifiers based on the following methods: logistic regression, random forests, and K-nearest neighbors (KNN).
# Random Forest
library(ranger)
rf <- rand_forest(trees = 500)%>% # given the trivial nature of the data, 500 is reasonable
set_engine('ranger', importance='impurity')%>% # impurity will provide variable importance scores
set_mode('classification')
# K-nearest neighbors with k = 4
knn<-
nearest_neighbor(neighbors = 4)%>% # neighbors could also be tuned; see the sketch below
set_engine('kknn')%>%
set_mode('classification')
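As noted above, the number of neighbors can be tuned rather than fixed. The following is a minimal sketch of that approach using tune() and tune_grid(); the knn_tuned name and the grid values are illustrative only and are not part of the models fit below.

# tunable KNN specification
knn_tuned<-
nearest_neighbor(neighbors = tune())%>%
set_engine('kknn')%>%
set_mode('classification')
# tune_grid() would then stand in for fit_resamples() in the workflow below, e.g.:
# sales.wf%>%add_model(knn_tuned)%>%
#   tune_grid(resamples = cv_folds, grid = tibble(neighbors = c(3, 5, 7, 9)), metrics = sales.metrics)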
# Logistic Regression
logit<-
logistic_reg()%>%
set_engine(engine = 'glm')%>%
set_mode('classification')

The next step is to initiate a workflow that combines the recipe and models. We can also set up our metrics for model evaluation.
# initiate a workflow
sales.wf<-
workflow()%>%
add_recipe(attr_recipe)
# select metrics for model evaluation
sales.metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)

Now we combine the workflow, models, metrics, and cross-validation folds.
# train random forest model
rf.train <-
sales.wf %>%
add_model(rf) %>%
fit_resamples(
resamples = cv_folds,
metrics = sales.metrics,
control = control_resamples(save_pred = TRUE)
)
# train knn model
knn.train <-
sales.wf %>%
add_model(knn) %>%
fit_resamples(
resamples = cv_folds,
metrics = sales.metrics,
control = control_resamples(save_pred = TRUE)
)
# train logistic model
logit.train <-
sales.wf %>%
add_model(logit) %>%
fit_resamples(
resamples = cv_folds,
metrics = sales.metrics,
control = control_resamples(save_pred = TRUE)
)

With model training completed, we can evaluate the results. First we evaluate the random forest model.
From the training metrics and class predictions, it is clear that the random forest model is unable to discriminate classes of order_priority. This isn't surprising given the near-zero variance of our predictor. The ROC curve confirms this assessment and indicates that the results are equivalent to random guessing.
# training metrics for random forest
collect_metrics(rf.train)%>%
flextable()%>%
set_caption('Random Forest Training Metrics')

.metric | .estimator | mean | n | std_err | .config |
accuracy | binary | 0.4700996 | 5 | 0.015816637 | Preprocessor1_Model1 |
roc_auc | binary | 0.5031038 | 5 | 0.001138368 | Preprocessor1_Model1 |
sensitivity | binary | 0.5714643 | 5 | 0.030810977 | Preprocessor1_Model1 |
specificity | binary | 0.4364097 | 5 | 0.031280306 | Preprocessor1_Model1 |
# RF - check class predictions
rf.train %>%
conf_mat_resampled()%>%
flextable()%>%
set_caption('Random Forest Class Predictions')

Prediction | Truth | Freq |
H | H | 2,138.2 |
H | Other | 6,345.0 |
Other | H | 1,603.4 |
Other | Other | 4,913.2 |
# RF ROC curve
rf.train %>%
collect_predictions() %>%
group_by(id) %>%
roc_curve(order_priority, .pred_H) %>% # see rf.train%>%collect_predictions()
ggplot(aes(1 - specificity, sensitivity, color = id)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
labs(title = 'Random Forest ROC')+
coord_equal()

We now evaluate our nearest neighbors model.
The KNN model was not able to discriminate classes. With a sensitivity of 1.0 and a specificity of 0.0, the model predicted every observation as the positive class (H), capturing all true positives but no true negatives.
# training metrics for KNN
collect_metrics(knn.train)%>%
flextable()%>%
set_caption('KNN Training Metrics')

.metric | .estimator | mean | n | std_err | .config |
accuracy | binary | 0.2494433 | 5 | 0.00001131414 | Preprocessor1_Model1 |
roc_auc | binary | 0.5000000 | 5 | 0.00000000000 | Preprocessor1_Model1 |
sensitivity | binary | 1.0000000 | 5 | 0.00000000000 | Preprocessor1_Model1 |
specificity | binary | 0.0000000 | 5 | 0.00000000000 | Preprocessor1_Model1 |
# knn - check class predictions
knn.train %>%
conf_mat_resampled()%>%
flextable()%>%
set_caption('KNN Predictions')

Prediction | Truth | Freq |
H | H | 3,741.6 |
H | Other | 11,258.2 |
Other | H | 0.0 |
Other | Other | 0.0 |
# KNN ROC curve
knn.train %>%
collect_predictions() %>%
group_by(id) %>%
roc_curve(order_priority, .pred_H) %>% # see knn.train%>%collect_predictions()
ggplot(aes(1 - specificity, sensitivity, color = id)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
labs(title = 'KNN ROC')+
coord_equal()

Now we evaluate training results for the logistic regression model.
Similar to the previous models, the logistic model performed no better than random guessing. Compared with the random forest model, the logistic model's accuracy was lower, but its sensitivity was higher.
# training metrics for logistic regression
collect_metrics(logit.train)%>%
flextable()%>%
set_caption('Logistic Regression Training Metrics')

.metric | .estimator | mean | n | std_err | .config |
accuracy | binary | 0.4201790 | 5 | 0.001051367 | Preprocessor1_Model1 |
roc_auc | binary | 0.5045179 | 5 | 0.001001783 | Preprocessor1_Model1 |
sensitivity | binary | 0.6731345 | 5 | 0.001001477 | Preprocessor1_Model1 |
specificity | binary | 0.3361107 | 5 | 0.001509571 | Preprocessor1_Model1 |
# logistic regression - check class predictions
logit.train %>%
conf_mat_resampled()%>%
flextable()%>%
set_caption('Logistic Regression Class Predictions')

Prediction | Truth | Freq |
H | H | 2,518.6 |
H | Other | 7,474.2 |
Other | H | 1,223.0 |
Other | Other | 3,784.0 |
# Logistic Regression ROC curve
logit.train %>%
collect_predictions() %>%
group_by(id) %>%
roc_curve(order_priority, .pred_H) %>% # see logit.train%>%collect_predictions()
ggplot(aes(1 - specificity, sensitivity, color = id)) +
geom_abline(lty = 2, color = "gray80", size = 1.5) +
geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
labs(title = 'Logistic Regression ROC')+
coord_equal()

Applying any of our models to the test dataset is a trivial exercise, since none of them effectively discriminates the target class. However, I will apply the random forest model for the purpose of demonstration.
# model the test set
sales.final<- sales.wf %>%
add_model(rf) %>%
last_fit(data_split)
# collect test metrics
collect_metrics(sales.final)%>%
flextable()%>%
set_caption('Random Forest Test Metrics')

.metric | .estimator | .estimate | .config |
accuracy | binary | 0.4941402 | Preprocessor1_Model1 |
roc_auc | binary | 0.4924767 | Preprocessor1_Model1 |
# collect predictions and create confusion matrix
collect_predictions(sales.final) %>%
conf_mat(order_priority, .pred_class)

## Truth
## Prediction H Other
## H 3074 9484
## Other 3163 9280
As expected, our random forest model is unable to discriminate classes in our response variable, order_priority.
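Finally, because the random forest specification requested impurity-based importance (importance = 'impurity' in the engine setup), those scores can be retrieved from the last_fit() result. A minimal sketch, assuming a recent version of tune that provides extract_fit_parsnip(); with a single predictor, the scores are of course trivial.

# retrieve the fitted ranger model and its impurity-based importance scores
final_rf<-extract_fit_parsnip(sales.final)
final_rf$fit$variable.importance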