# load libraries

library(tidymodels)
library(kknn)
library(tidyverse)
library(themis) # for addressing class imbalance
library(dlookr)
library(janitor)
library(lubridate)
library(flextable)
library(magrittr)
library(viridis)
library(patchwork)

1 Introduction

This worksheet provides an analysis of sales data hosted at https://eforexcel.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/.

The purpose of this work is to build a classification model that can be used to predict a target variable from the dataset(s).

While the exercise is designed to encourage analytical reasoning, its results are trivial: the datasets consist almost entirely of zero- and near-zero-variance variables.

As a result, I’ve taken this opportunity to build three classification models (logistic regression, random forest, and K-nearest neighbors) using the tidymodels package, with 5-fold cross-validation to support model comparison.

Results of each model were equivalent to random guessing.

# load sales datasets - 10,000 and 100,000 obs datasets

s.sales<-read_csv('https://raw.githubusercontent.com/sconnin/Model_Compare/main/10000%20Sales%20Records.csv')

# read larger sales dataset

l.sales<-read_csv('https://raw.githubusercontent.com/sconnin/Model_Compare/main/100000%20Sales%20Records.csv')

2 Data Preprocessing and Exploration

# create working dataframe for both datasets

small.sales<-s.sales
large.sales<-l.sales

# convert character cols to factor

small.sales[,1:4]<-map(small.sales[,1:4], factor)
large.sales[,1:4]<-map(large.sales[,1:4], factor)

# clean data 

clean<-function(df){
    
    df%>%
    clean_names%>% # initial clean of col names
    remove_empty(c("rows", "cols"))%>%  # remove any empty rows and cols
    distinct()    # remove duplicate rows
}

small.sales<-clean(small.sales)
large.sales<-clean(large.sales)

# check for NA -- none found

#map(small.sales, ~sum(is.na(.)))  # tag to avoid printing
#map(large.sales, ~sum(is.na(.)))

# convert order and shipping date char col to mdy

small.sales$order_date<-mdy(small.sales$order_date)
large.sales$order_date<-mdy(large.sales$order_date)

small.sales$ship_date<-mdy(small.sales$ship_date)
large.sales$ship_date<-mdy(large.sales$ship_date)

# relocate cols and calc order turnaround time - bin this data

turn<-function(df){
  df%>%
    relocate(c(order_id, country, item_type, sales_channel, order_date, ship_date, order_priority, units_sold, unit_price, unit_cost, total_revenue, total_cost, total_profit), .before = region)%>%
    mutate(turnaround_time = as.integer(ship_date - order_date), .after = ship_date)%>% # days between order and shipment
    mutate(turnaround_time = cut(turnaround_time, breaks = c(0,10,20,30,40,50), include.lowest = TRUE))%>% # bin turnaround time into 10-day intervals; cut() returns a factor
    arrange(desc(order_date))
}

small.sales<-turn(small.sales)
large.sales<-turn(large.sales)

Evaluate univariate statistics for numerical and factor variables. All downstream analyses will be restricted to the larger dataset.

# summarize numeric variables

large.sales%>%
    diagnose_numeric()%>%
    dplyr::select(variables, min, mean, median, max, zero, minus, outlier)%>%
    flextable()%>%
    set_caption("Summary Statistics for Large Dataset")
#summarize factor variables

large.sales%>%
    diagnose_category()%>%
    flextable()%>%
    set_caption("Summary Statistics for Categorical Variables: large dataset")

Pairwise comparisons for numeric variables

cor.tabl<-function(df, titles){
    df%>%
    filter_if(is.numeric, all_vars((.) != 0))%>%
    correlate()%>%
    filter(coef_corr > .6 | coef_corr < -.6)%>% # set thresholds to limit output 
    arrange(desc(coef_corr))%>%
    flextable()%>%
    set_caption(titles)
}


cor.tabl(large.sales, "Pairwise Correlation: Numerical Covariates: Large Dataset")

Four of the six numerical predictors are zero-variance features: units sold, total revenue, total cost, and total profit. These covariates should be removed from the analysis.

The remaining two predictors (unit price and unit cost) are near-zero-variance variables. They are also highly collinear (R = .98), so we can drop either one from the analysis.
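As a sanity check on these claims, the variability of the numeric columns can be screened directly. The sketch below is illustrative and not part of the original workflow (nzv_check is a name introduced here); it reports the standard deviation, the frequency ratio of the two most common values, and the percent of unique values for each numeric predictor, mirroring the criteria used by caret's nearZeroVar().

# hedged sketch: screen numeric predictors for zero/near-zero variance
# (nzv_check is an illustrative object name)

nzv_check <- large.sales %>%
  select(where(is.numeric), -order_id) %>%
  imap_dfr(function(x, nm) {
    tab <- sort(table(x), decreasing = TRUE)
    tibble(
      variable   = nm,
      sd         = sd(x, na.rm = TRUE),
      freq_ratio = unname(if (length(tab) > 1) tab[1] / tab[2] else Inf),
      pct_unique = 100 * n_distinct(x) / length(x)
    )
  })

nzv_check %>%
  flextable() %>%
  set_caption("Zero/Near-Zero Variance Screen: Large Dataset")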

# Subset numerical variables

num_box<-select_if(large.sales, is.numeric)%>%
  select(!order_id)

# create working df for boxplots

order<-large.sales[8] # column 8 = order_priority

num_box<-cbind(order, num_box)

# Plot using boxplots

response = names(num_box)[1] # order_priority (needs to be a factor for these plots)
response = purrr::set_names(response)


explain <- names(num_box)[2:7] #explanatory variables
explain = purrr::set_names(explain)

box_fun = function(x) {
    ggplot(num_box, aes(x = .data[[x]], y = order_priority)) +
    geom_boxplot(aes(fill = order_priority), alpha = 0.4, outlier.color =
    'red', show.legend = FALSE)+
    scale_fill_viridis(discrete = TRUE, option = "E")+
    coord_flip()+
    labs(title = 'Distribution of Continuous Variables Relative to Order Priority: Large Sales')+
    theme_classic()
}

box_plots<-map(explain, ~box_fun(.x)) #creates a list of plots using purrr

plot(reduce(box_plots,`+`)) # layout with patchwork

Evaluate categorical variables

Finding: all of the categorical predictors are zero variance. They should be removed from our analysis.

# Subset categorical variables into a df of factor vars

mosaic<-large.sales%>% # subset factor variables, with order_priority as the response
    mutate(order_priority = as_factor(order_priority))%>%
    dplyr::select(where(is.factor))%>%
    relocate(order_priority, .before=country)

target <- names(mosaic)[1] # response: order_priority
predictors <- names(mosaic)[3:6] # item_type, sales_channel, turnaround_time, region

#generate mosaic plots with purrr and dlookr

(plots<-predictors%>%map(function(predictors) mosaic%>%target_by(target)%>%relate(predictors)%>%plot()))

Review of both the numerical and categorical variables indicates that only one covariate (unit_price or unit_cost) can be used to model the response variable, order_priority. As a result, the classification problem reduces to whether that single covariate has sufficient variance to discriminate the class levels of the response.

This is obviously a trivial exercise. However, a logical approach is to convert order_priority to a binary form (H or Other) and attempt to discriminate these classes using a single covariate. I will use unit_price for this purpose.

Aggregating levels within order_priority produces imbalanced class frequencies - 25% H vs. 75% Other.

# remove zero variance predictors

sales<-large.sales%>%
  select(order_priority, unit_price)%>%
  mutate(order_priority = factor(ifelse(order_priority == 'H', 'H', 'Other'))) # classification models require a factor response

# assess class distributions

table(sales$order_priority)%>%
  as.data.frame()%>%
  flextable%>%
  set_caption('Class Distribution for Sales: Large Dataset')

3 Model Development

The first step in our modeling process is to create training and test sets. Note that the modeling process that follows draws on the tidymodels package, which is designed as a successor to caret.

set.seed(5345459)

# create random data split

data_split <- initial_split(sales, prop = .75, strata = order_priority)

# create train and test sets from split

train_data <- training(data_split)
test_data <- testing(data_split)

With training and test sets in hand, we build a modeling recipe and finalize our data preparation process. I have included steps that are not relevant to the current dataset but which illustrate the options available for the recipe function.

# pre-model processing using recipes

attr_recipe<-
  recipe(order_priority ~., data=train_data)%>%
  step_normalize(all_numeric())%>% # center and scale numeric variables
  #step_dummy(all_nominal(), -all_outcomes())%>% # convert factor predictors to dummy variables
  #step_zv(all_numeric())%>% # remove numeric variables with zero variance (a single unique value)
  #step_corr(all_predictors(), threshold = 0.7, method = 'spearman')%>% # remove predictors that are highly correlated with other covariates
  step_smote(order_priority) # rebalance the response using SMOTE (themis package)
  
  
# review type, roles, source
  
summary(attr_recipe)
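Because the recipe applies SMOTE, it is worth confirming that the processed training data are in fact balanced. A minimal sketch, assuming the recipe is prepped on the training set (attr_prepped is an illustrative name):

# hedged sketch: prep the recipe on the training data and check class
# balance after SMOTE (attr_prepped is an illustrative object name)

attr_prepped <- prep(attr_recipe, training = train_data)

bake(attr_prepped, new_data = NULL) %>% # new_data = NULL returns the processed training set
  count(order_priority) %>%
  mutate(prop = n / sum(n))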

In order to compare classification models, I will include five-fold cross-validation in our modeling workflow.

set.seed(11)

cv_folds<-
  vfold_cv(train_data, 
           v=5,
           strata= order_priority)

For the purpose of this exercise, I will build classifiers based on the following methods: logistic regression, random forests, and KNN.

# Random Forest

library(ranger)

rf <- rand_forest(trees = 500)%>% # given the trivial nature of the data, 500 is reasonable
  set_engine('ranger', importance='impurity')%>% # impurity will provide variable importance scores
  set_mode('classification')

# K-nearest neighbors with k = 4

knn<- 
  nearest_neighbor(neighbors = 4)%>% # neighbors could also be tuned (see the sketch below)
  set_engine('kknn')%>%
  set_mode('classification')

# Logistic Regression

logit<-
  logistic_reg()%>%
  set_engine(engine = 'glm')%>%
  set_mode('classification')

The next step is to initiate a workflow that combines the recipe and models, and to set up the metrics for model evaluation.

# initiate a workflow

sales.wf<- 
  workflow()%>%
  add_recipe(attr_recipe)

# select metrics for model evaluation

sales.metrics <- metric_set(roc_auc, accuracy, sensitivity, specificity)
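The KNN specification above flags the number of neighbors as a candidate for tuning. A minimal sketch of how that could be done with tune_grid(), under the assumption that a small grid of k values is sufficient (knn_tune_spec and knn_tuned are illustrative names; this step is not run as part of the main workflow):

# hedged sketch: tune the number of neighbors over a small grid
# (knn_tune_spec and knn_tuned are illustrative object names)

knn_tune_spec <-
  nearest_neighbor(neighbors = tune()) %>%
  set_engine('kknn') %>%
  set_mode('classification')

knn_tuned <-
  sales.wf %>%
  add_model(knn_tune_spec) %>%
  tune_grid(
    resamples = cv_folds,
    grid = tibble(neighbors = c(3, 5, 7, 9)),
    metrics = sales.metrics
  )

select_best(knn_tuned, metric = 'roc_auc') # best k by cross-validated ROC AUC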

Now we combine the workflow, models, metrics, and cross-validation folds.

# train random forest model


rf.train <- 
  sales.wf %>%
  add_model(rf) %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = sales.metrics,
    control = control_resamples(save_pred = TRUE)
  )


# train knn model

knn.train <- 
  sales.wf %>%
  add_model(knn) %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = sales.metrics,
    control = control_resamples(save_pred = TRUE)
  )


# train logistic model

logit.train <- 
  sales.wf %>%
  add_model(logit) %>%
  fit_resamples(
    resamples = cv_folds,
    metrics = sales.metrics,
    control = control_resamples(save_pred = TRUE)
  )

4 Model Evaluation

With model training completed, we can evaluate the results. First we evaluate our Random Forest model.

From the training metrics and class predictions, it’s clear that the random forest model is unable to discriminate classes of order_priority. This isn’t surprising given the near-zero variance of our predictor. The ROC curve confirms this assessment and indicates that the results are equivalent to random guessing.

# training metrics for random forest

collect_metrics(rf.train)%>%
  flextable()%>%
  set_caption('Random Forest Training Metrics')
# RF - check class predictions

rf.train %>%
  conf_mat_resampled()%>%
  flextable()%>%
  set_caption('Random Forest Class Predictions')
# RF ROC curve

rf.train %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(order_priority, .pred_H) %>% # see  rf.train%>%collect_predictions()
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  labs(title = 'Random Forest ROC')+
  coord_equal()
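The random forest specification requested impurity-based importance scores, although they are never extracted above. With only one predictor the scores are uninformative here, but for completeness, a hedged sketch of how they could be pulled from a single fit on the training data (rf.fit is an illustrative name):

# hedged sketch: fit the RF workflow once on the training data and pull
# the impurity-based importance scores (rf.fit is an illustrative name)

rf.fit <- sales.wf %>%
  add_model(rf) %>%
  fit(data = train_data)

rf.fit %>%
  extract_fit_parsnip() %>% # underlying parsnip model object
  purrr::pluck("fit") %>%   # the ranger fit itself
  ranger::importance()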

We now evaluate our nearest neighbors model.

The KNN model was not able to discriminate classes. With a sensitivity of 1.0, the model captured all true positives; however, it identified no true negatives.

# training metrics for KNN

collect_metrics(knn.train)%>%
  flextable()%>%
  set_caption('KNN Training Metrics')
# knn - check class predictions

knn.train %>%
  conf_mat_resampled()%>%
  flextable()%>%
  set_caption('KNN Predictions')
# KNN ROC curve

knn.train %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(order_priority, .pred_H) %>% # see  knn.train%>%collect_predictions()
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  labs(title = 'KNN ROC')+
  coord_equal()

Now we evaluate training results for the logistic regression model.

Like the previous models, the logistic model performed no better than random guessing. Compared with the random forest model, the logistic model’s accuracy was lower, although its sensitivity was higher.

# training metrics for logistic regression

collect_metrics(logit.train)%>%
  flextable()%>%
  set_caption('Logistic Regression Training Metrics')
# logistic regression - check class predictions

logit.train %>%
  conf_mat_resampled()%>%
  flextable()%>%
  set_caption('Logistic Regression Class Predictions')
# Logistic Regression ROC curve

logit.train %>%
  collect_predictions() %>%
  group_by(id) %>%
  roc_curve(order_priority, .pred_H) %>% # see  logit.train%>%collect_predictions()
  ggplot(aes(1 - specificity, sensitivity, color = id)) +
  geom_abline(lty = 2, color = "gray80", size = 1.5) +
  geom_path(show.legend = FALSE, alpha = 0.6, size = 1.2) +
  labs(title = 'Logistic Regression ROC')+
  coord_equal()
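Since the stated aim is model comparison, it also helps to place the cross-validated metrics for all three models side by side. A minimal sketch (the model labels are added here for readability):

# hedged sketch: combine cross-validated metrics for all three models
# into a single comparison table

bind_rows(
  collect_metrics(rf.train)    %>% mutate(model = 'Random Forest'),
  collect_metrics(knn.train)   %>% mutate(model = 'KNN'),
  collect_metrics(logit.train) %>% mutate(model = 'Logistic Regression')
) %>%
  select(model, .metric, mean, std_err) %>%
  flextable() %>%
  set_caption('Cross-Validated Training Metrics: Model Comparison')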

Applying any of our models to the test dataset is a trivial exercise, since none of them effectively discriminates the target class. However, I will apply the random forest model for the purpose of demonstration.

# model the test set

sales.final<- sales.wf %>%
  add_model(rf) %>%
  last_fit(data_split)

# collect test metrics

collect_metrics(sales.final)%>%
  flextable()%>%
  set_caption('Random Forest Test Metrics')
# collect predictions and create confusion matrix

collect_predictions(sales.final) %>%
  conf_mat(order_priority, .pred_class)
##           Truth
## Prediction    H Other
##      H     3074  9484
##      Other 3163  9280
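
Working the arithmetic on these counts makes the point concrete: the test-set accuracy is roughly a coin flip.

# accuracy computed directly from the confusion matrix counts above
(3074 + 9280) / (3074 + 9484 + 3163 + 9280) # ~0.49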

As expected, our random forest model is unable to discriminate classes in our response variable, order_priority.