library(tidyverse)   # data wrangling (dplyr) and plotting (ggplot2)
library(ggpubr)      # ggarrange() for arranging multiple plots
library(caret)       # confusionMatrix() and other modeling utilities
library(class)       # knn() classifier
library(lubridate)   # year() and month() helpers for dates
library(MASS)        # polr() for ordinal logistic regression
library(BBmisc)      # normalize() for scaling numeric predictors
library(rpart)       # recursive partitioning trees
library(fastDummies) # dummy_cols() for dummy/one-hot encoding
library(rpart.plot)  # plotting rpart trees

Data Exploration

Read in Data

I elected to use the 100-record and 50,000-record datasets from https://excelbianalytics.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/.

# read in data
data.small <- read.csv("https://raw.githubusercontent.com/SaneSky109/DATA622/main/HW1/Data/100%20Sales%20Records.csv")

data.large <- read.csv("https://raw.githubusercontent.com/SaneSky109/DATA622/main/HW1/Data/50000%20Sales%20Records.csv")

Explore Data Structure

At a quick glance, the small dataset contains 100 records and 14 variables, and the large dataset contains 50,000 records with the same 14 variables. The datasets contain a mix of data types (categorical, numeric, and date), and the data types of several variables need to be adjusted before any further analysis.

glimpse(data.small)
## Rows: 100
## Columns: 14
## $ Region         <chr> "Australia and Oceania", "Central America and the Carib~
## $ Country        <chr> "Tuvalu", "Grenada", "Russia", "Sao Tome and Principe",~
## $ Item.Type      <chr> "Baby Food", "Cereal", "Office Supplies", "Fruits", "Of~
## $ Sales.Channel  <chr> "Offline", "Online", "Offline", "Online", "Offline", "O~
## $ Order.Priority <chr> "H", "C", "L", "C", "L", "C", "M", "H", "M", "H", "H", ~
## $ Order.Date     <chr> "5/28/2010", "8/22/2012", "5/2/2014", "6/20/2014", "2/1~
## $ Order.ID       <int> 669165933, 963881480, 341417157, 514321792, 115456712, ~
## $ Ship.Date      <chr> "6/27/2010", "9/15/2012", "5/8/2014", "7/5/2014", "2/6/~
## $ Units.Sold     <int> 9925, 2804, 1779, 8102, 5062, 2974, 4187, 8082, 6070, 6~
## $ Unit.Price     <dbl> 255.28, 205.70, 651.21, 9.33, 651.21, 255.28, 668.27, 1~
## $ Unit.Cost      <dbl> 159.42, 117.11, 524.96, 6.92, 524.96, 159.42, 502.54, 9~
## $ Total.Revenue  <dbl> 2533654.00, 576782.80, 1158502.59, 75591.66, 3296425.02~
## $ Total.Cost     <dbl> 1582243.50, 328376.44, 933903.84, 56065.84, 2657347.52,~
## $ Total.Profit   <dbl> 951410.50, 248406.36, 224598.75, 19525.82, 639077.50, 2~
glimpse(data.large)
## Rows: 50,000
## Columns: 14
## $ Region         <chr> "Sub-Saharan Africa", "Europe", "Europe", "Europe", "Eu~
## $ Country        <chr> "Namibia", "Iceland", "Russia", "Moldova ", "Malta", "I~
## $ Item.Type      <chr> "Household", "Baby Food", "Meat", "Meat", "Cereal", "Me~
## $ Sales.Channel  <chr> "Offline", "Online", "Online", "Online", "Online", "Onl~
## $ Order.Priority <chr> "M", "H", "L", "L", "M", "H", "M", "L", "M", "C", "M", ~
## $ Order.Date     <chr> "8/31/2015", "11/20/2010", "6/22/2017", "2/28/2012", "8~
## $ Order.ID       <int> 897751939, 599480426, 538911855, 459845054, 626391351, ~
## $ Ship.Date      <chr> "10/12/2015", "1/9/2011", "6/25/2017", "3/20/2012", "9/~
## $ Units.Sold     <int> 3604, 8435, 4848, 7225, 1975, 2542, 4398, 49, 4031, 791~
## $ Unit.Price     <dbl> 668.27, 255.28, 421.89, 421.89, 205.70, 421.89, 668.27,~
## $ Unit.Cost      <dbl> 502.54, 159.42, 364.69, 364.69, 117.11, 364.69, 502.54,~
## $ Total.Revenue  <dbl> 2408445.08, 2153286.80, 2045322.72, 3048155.25, 406257.~
## $ Total.Cost     <dbl> 1811154.16, 1344707.70, 1768017.12, 2634885.25, 231292.~
## $ Total.Profit   <dbl> 597290.92, 808579.10, 277305.60, 413270.00, 174965.25, ~

Adjusting the Variable Data Types and Removing Irrelevant Variables

The categorical variables were stored as character data types and need to be converted to factors before proceeding with the analysis. The numeric variables were also coerced to a uniform numeric type instead of a mix of integer and double. The variable Order.ID was removed because it is an identifier and does not aid the analysis.

# convert character variables to factors, dates to Date, and the remaining
# measures to a uniform numeric type (applied identically to both datasets)
clean_types <- function(df) {
  df %>%
    mutate(across(c(Region, Country, Item.Type, Sales.Channel, Order.Priority), as.factor),
           across(c(Order.Date, Ship.Date), ~ as.Date(.x, "%m/%d/%Y")),
           Order.ID = as.character(Order.ID),
           across(c(Units.Sold, Unit.Price, Unit.Cost,
                    Total.Revenue, Total.Cost, Total.Profit), as.numeric))
}

data.small <- clean_types(data.small)
data.large <- clean_types(data.large)

# drop Order.ID (column 7), which does not aid the analysis
data.small <- data.small[,-7]
data.large <- data.large[,-7]

Adding Variables Derived from Data

I elected to create three new variables for this analysis: year, month, and Order.Size. The year and month variables are derived from Order.Date and could reveal whether sales are more prevalent at certain times of the year. The Order.Size variable bins Units.Sold into five groups and will serve as the target variable in the machine learning classification analysis later in this report. Predicting Order.Size can aid sales management by anticipating the general order size a business should expect given location, item type, price, cost, and so on.

data.small <-  data.small %>%
  mutate(year = as.factor(year(Order.Date))) %>% 
  mutate(month = as.factor(month(Order.Date))) %>%
  mutate(Order.Size = cut(Units.Sold, breaks = c(0, 1999, 3999, 5999, 7999, 10000)))

data.large <-  data.large %>%
  mutate(year = as.factor(year(Order.Date))) %>% 
  mutate(month = as.factor(month(Order.Date))) %>%
  mutate(Order.Size = cut(Units.Sold, breaks = c(0, 1999, 3999, 5999, 7999, 10000)))
levels(data.small$Order.Size) <- c("0-1999", "2000-3999", "4000-5999", "6000-7999","8000+")

levels(data.large$Order.Size) <- c("0-1999", "2000-3999", "4000-5999", "6000-7999","8000+")

data.small$Order.Size <- factor(data.small$Order.Size, ordered = T)
data.large$Order.Size <- factor(data.large$Order.Size, ordered = T)
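
As an optional sanity check (not part of the original analysis), the relabeled bins can be cross-tabulated against a fresh cut() of Units.Sold to confirm that the new labels line up with the intended breaks:

# cross-tabulate the relabeled bins against a fresh cut() of Units.Sold;
# all counts should fall on the diagonal if the labels match the breaks
table(cut(data.small$Units.Sold, breaks = c(0, 1999, 3999, 5999, 7999, 10000)),
      data.small$Order.Size)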

Summary Table of Data

Small Dataset

Some notable descriptive statistics:

  • There are many Countries (76) and Item Types (12) in the small dataset of only 100 rows. This could create problems for machine learning models if not all factor levels are represented in the training data.

  • There is a wide disparity in unit and total price, cost, revenue, and profit across the data.

summary(data.small)
##                                Region                    Country  
##  Asia                             :11   The Gambia           : 4  
##  Australia and Oceania            :11   Australia            : 3  
##  Central America and the Caribbean: 7   Djibouti             : 3  
##  Europe                           :22   Mexico               : 3  
##  Middle East and North Africa     :10   Sao Tome and Principe: 3  
##  North America                    : 3   Sierra Leone         : 3  
##  Sub-Saharan Africa               :36   (Other)              :81  
##            Item.Type  Sales.Channel Order.Priority   Order.Date        
##  Clothes        :13   Offline:50    C:22           Min.   :2010-02-02  
##  Cosmetics      :13   Online :50    H:30           1st Qu.:2012-02-14  
##  Office Supplies:12                 L:27           Median :2013-07-12  
##  Fruits         :10                 M:21           Mean   :2013-09-16  
##  Personal Care  :10                                3rd Qu.:2015-04-07  
##  Household      : 9                                Max.   :2017-05-22  
##  (Other)        :33                                                    
##    Ship.Date            Units.Sold     Unit.Price       Unit.Cost     
##  Min.   :2010-02-25   Min.   : 124   Min.   :  9.33   Min.   :  6.92  
##  1st Qu.:2012-02-24   1st Qu.:2836   1st Qu.: 81.73   1st Qu.: 35.84  
##  Median :2013-08-11   Median :5382   Median :179.88   Median :107.28  
##  Mean   :2013-10-09   Mean   :5129   Mean   :276.76   Mean   :191.05  
##  3rd Qu.:2015-04-28   3rd Qu.:7369   3rd Qu.:437.20   3rd Qu.:263.33  
##  Max.   :2017-06-17   Max.   :9925   Max.   :668.27   Max.   :524.96  
##                                                                       
##  Total.Revenue       Total.Cost       Total.Profit          year   
##  Min.   :   4870   Min.   :   3612   Min.   :   1258   2012   :22  
##  1st Qu.: 268721   1st Qu.: 168868   1st Qu.: 121444   2014   :15  
##  Median : 752314   Median : 363566   Median : 290768   2011   :12  
##  Mean   :1373488   Mean   : 931806   Mean   : 441682   2013   :12  
##  3rd Qu.:2212045   3rd Qu.:1613870   3rd Qu.: 635829   2015   :11  
##  Max.   :5997055   Max.   :4509794   Max.   :1719922   2010   :10  
##                                                        (Other):18  
##      month        Order.Size
##  2      :13   0-1999   :17  
##  7      :12   2000-3999:18  
##  5      :11   4000-5999:25  
##  10     :11   6000-7999:20  
##  6      :10   8000+    :20  
##  4      : 9                 
##  (Other):34
paste0("Number of Unique Countries: ", length(unique(data.small$Country)))
## [1] "Number of Unique Countries: 76"
paste0("Number of Unique Item.Types: ", length(unique(data.small$Item.Type)))
## [1] "Number of Unique Item.Types: 12"

Large Dataset

Notable summary statistics are:

  • Similar to the small dataset, the large dataset contains a large number of Countries (185). However, its 50,000 rows make it more resilient to the resulting high dimensionality.

  • The number of Item Types is the same as in the small dataset.

  • The cost and price of products vary widely across the data.

summary(data.large)
##                                Region     
##  Asia                             : 7348  
##  Australia and Oceania            : 4017  
##  Central America and the Caribbean: 5451  
##  Europe                           :12841  
##  Middle East and North Africa     : 6128  
##  North America                    : 1099  
##  Sub-Saharan Africa               :13116  
##                              Country              Item.Type     Sales.Channel  
##  Trinidad and Tobago             :  321   Fruits       : 4221   Offline:24966  
##  Guinea                          :  318   Meat         : 4221   Online :25034  
##  Cape Verde                      :  315   Cosmetics    : 4193                  
##  Maldives                        :  311   Vegetables   : 4191                  
##  Finland                         :  310   Personal Care: 4186                  
##  Democratic Republic of the Congo:  308   Beverages    : 4173                  
##  (Other)                         :48117   (Other)      :24815                  
##  Order.Priority   Order.Date           Ship.Date            Units.Sold   
##  C:12446        Min.   :2010-01-01   Min.   :2010-01-02   Min.   :    1  
##  H:12471        1st Qu.:2011-11-15   1st Qu.:2011-12-11   1st Qu.: 2498  
##  L:12588        Median :2013-10-09   Median :2013-11-02   Median : 5018  
##  M:12495        Mean   :2013-10-11   Mean   :2013-11-05   Mean   : 5000  
##                 3rd Qu.:2015-09-04   3rd Qu.:2015-09-30   3rd Qu.: 7493  
##                 Max.   :2017-07-28   Max.   :2017-09-16   Max.   :10000  
##                                                                          
##    Unit.Price       Unit.Cost      Total.Revenue       Total.Cost     
##  Min.   :  9.33   Min.   :  6.92   Min.   :     28   Min.   :     21  
##  1st Qu.: 81.73   1st Qu.: 35.84   1st Qu.: 276487   1st Qu.: 160637  
##  Median :154.06   Median : 97.44   Median : 781325   Median : 467104  
##  Mean   :265.65   Mean   :187.32   Mean   :1323716   Mean   : 933157  
##  3rd Qu.:421.89   3rd Qu.:263.33   3rd Qu.:1808642   3rd Qu.:1190390  
##  Max.   :668.27   Max.   :524.96   Max.   :6682032   Max.   :5249075  
##                                                                       
##   Total.Profit            year           month           Order.Size   
##  Min.   :      7.2   2011   : 6757   5      : 4571   0-1999   :10065  
##  1st Qu.:  94150.9   2012   : 6634   1      : 4408   2000-3999: 9762  
##  Median : 279536.4   2014   : 6596   3      : 4354   4000-5999:10040  
##  Mean   : 390558.7   2010   : 6594   7      : 4341   6000-7999:10234  
##  3rd Qu.: 564286.7   2015   : 6570   6      : 4311   8000+    : 9899  
##  Max.   :1738178.4   2016   : 6551   4      : 4277                    
##                      (Other):10298   (Other):23738
paste0("Number of Unique Countries: ", length(unique(data.large$Country)))
## [1] "Number of Unique Countries: 185"
paste0("Number of Unique Item.Types: ", length(unique(data.large$Item.Type)))
## [1] "Number of Unique Item.Types: 12"

Visualizations

The visualizations below explore the data distributions of many of the variables in the dataset and compare the small dataset to the large dataset.

  • The numeric variables in the small dataset mainly follow right-skewed distributions, apart from Unit.Price and Unit.Cost, which appear to be bimodal (a quick numeric check follows this list).

  • The numeric data distributions for the large dataset are:

    • Unit.Price and Unit.Cost - Multimodal
    • Total.Revenue, Total.Cost, and Total.Profit - Right Skewed
  • The categorical variables are much more balanced in the large dataset than in the small dataset. For example, Country and Item.Type are sparsely populated in the small dataset, with only a few records per level in some instances, whereas the large dataset contains roughly 4,000 records for each Item.Type category.
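
As a quick numeric check of the right-skew claim (a small addition, not part of the original analysis), the means of the monetary totals in the small dataset sit well above their medians:

# mean vs. median for the monetary totals; mean >> median indicates right skew
data.small %>%
  summarise(across(c(Total.Revenue, Total.Cost, Total.Profit),
                   list(mean = mean, median = median)))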

plot2 <- ggplot() +
  geom_density(aes(Unit.Price, fill = "Small"), alpha = .2, data = data.small) +
  geom_density(aes(Unit.Price, fill = "Large"), alpha = .2, data = data.large) +
  scale_fill_manual(name = "Dataset", values = c(Small = "blue", Large = "orange")) +
  ggtitle("Density Plot: Unit.Price")


plot3 <- ggplot() +
  geom_density(aes(Unit.Cost, fill = "Small"), alpha = .2, data = data.small) +
  geom_density(aes(Unit.Cost, fill = "Large"), alpha = .2, data = data.large) +
  scale_fill_manual(name = "Dataset", values = c(Small = "blue", Large = "orange")) +
  ggtitle("Density Plot: Unit.Cost")

plot4 <- ggplot() +
  geom_density(aes(Total.Revenue, fill = "Small"), alpha = .2, data = data.small) +
  geom_density(aes(Total.Revenue, fill = "Large"), alpha = .2, data = data.large) +
  scale_fill_manual(name = "Dataset", values = c(Small = "blue", Large = "orange")) +
  ggtitle("Density Plot: Total.Revenue")

plot5 <- ggplot() +
  geom_density(aes(Total.Cost, fill = "Small"), alpha = .2, data = data.small) +
  geom_density(aes(Total.Cost, fill = "Large"), alpha = .2, data = data.large) +
  scale_fill_manual(name = "Dataset", values = c(Small = "blue", Large = "orange")) +
  ggtitle("Density Plot: Total.Cost")

plot6 <- ggplot() +
  geom_density(aes(Total.Profit, fill = "Small"), alpha = .2, data = data.small) +
  geom_density(aes(Total.Profit, fill = "Large"), alpha = .2, data = data.large) +
  scale_fill_manual(name = "Dataset", values = c(Small = "blue", Large = "orange")) +
  ggtitle("Density Plot: Total.Profit")


ggarrange(plot2,
          plot3, plot4,
          plot5, plot6,
          ncol = 2, nrow = 3)

p1 <- ggplot(data.small, aes(x = Region)) +
  geom_bar() +
  coord_flip()

p2 <- ggplot(data.large, aes(x = Region)) +
  geom_bar() +
  coord_flip()


p3 <- ggplot(data.small, aes(x = Country)) +
  geom_bar() +
  coord_flip()

p4 <- ggplot(data.large, aes(x = Country)) +
  geom_bar() +
  coord_flip()


p5 <- ggplot(data.small, aes(x = Item.Type)) +
  geom_bar() +
  coord_flip()

p6 <- ggplot(data.large, aes(x = Item.Type)) +
  geom_bar() +
  coord_flip()

ggarrange(p1, p2,
          p3, p4,
          p5, p6,
          ncol = 2, nrow = 3)

p7 <- ggplot(data.small, aes(x = Order.Priority)) +
  geom_bar() +
  coord_flip()

p8 <- ggplot(data.large, aes(x = Order.Priority)) +
  geom_bar() +
  coord_flip()


p9 <- ggplot(data.small, aes(x = Sales.Channel)) +
  geom_bar() +
  coord_flip()

p10 <- ggplot(data.large, aes(x = Sales.Channel)) +
  geom_bar() +
  coord_flip()


p11 <- ggplot(data.small, aes(x = Order.Size)) +
  geom_bar() +
  coord_flip()

p12 <- ggplot(data.large, aes(x = Order.Size)) +
  geom_bar() +
  coord_flip()

ggarrange(p7, p8,
          p9, p10,
          p11, p12,
          ncol = 2, nrow = 3)

Data Analysis

The main goal of this analysis is to determine whether Order.Size can be predicted from a number of inputs. Order.Size is an ordered categorical variable, so a classification algorithm is needed. The two algorithms used are Ordinal Logistic Regression and the K-Nearest Neighbors (KNN) classifier.
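
Before modeling, it is worth confirming how balanced the target classes are in each dataset; this is a brief check, not part of the original write-up:

# proportion of records in each Order.Size class
prop.table(table(data.small$Order.Size))
prop.table(table(data.large$Order.Size))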

Small Dataset

Ordinal Logistic Regression

Partition Data

The data must be divided into training and testing sets to properly assess model performance. I elected to use an 80% training set and a 20% test set.

set.seed(123)

smp_size <- floor(0.8 * nrow(data.small))

train_ind <- sample(seq_len(nrow(data.small)), size = smp_size)

train.small <- data.small[train_ind, ]
test.small <- data.small[-train_ind, ]
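
As an aside, the simple random split above does not stratify by Order.Size, so with only 100 rows a rare class could be underrepresented in training. A stratified split via caret::createDataPartition() is one alternative; the sketch below illustrates that option but is not the approach used for the results that follow.

# alternative: stratified 80/20 split on Order.Size (sketch only, not used below)
set.seed(123)
strat_ind <- createDataPartition(data.small$Order.Size, p = 0.8, list = FALSE)
train.strat <- data.small[strat_ind, ]
test.strat <- data.small[-strat_ind, ]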

Create Ordinal Logistic Regression Model

An ordinal logistic regression can be trained using the polr() function from the MASS library. The model being fit is \(Order.Size \sim Unit.Cost + Region + Sales.Channel + year + month\).

m1 <- polr(Order.Size ~ Unit.Cost + Region + Sales.Channel + year + month, data = train.small, Hess = TRUE)

summary(m1)
## Call:
## polr(formula = Order.Size ~ Unit.Cost + Region + Sales.Channel + 
##     year + month, data = train.small, Hess = TRUE)
## 
## Coefficients:
##                                             Value Std. Error  t value
## Unit.Cost                               -0.001939   0.001255 -1.54504
## RegionAustralia and Oceania              0.076412   1.114411  0.06857
## RegionCentral America and the Caribbean  0.758842   1.349892  0.56215
## RegionEurope                            -0.818208   0.899989 -0.90913
## RegionMiddle East and North Africa      -0.355552   0.972363 -0.36566
## RegionNorth America                     -0.531397   1.179354 -0.45058
## RegionSub-Saharan Africa                -0.352368   0.729990 -0.48270
## Sales.ChannelOnline                     -0.475615   0.523238 -0.90898
## year2011                                -2.180317   1.143210 -1.90719
## year2012                                -1.015061   0.970144 -1.04630
## year2013                                -1.077440   0.947148 -1.13756
## year2014                                -0.460335   0.981443 -0.46904
## year2015                                -2.141481   1.017703 -2.10423
## year2016                                -1.541606   1.011621 -1.52390
## year2017                                -1.546337   1.338827 -1.15499
## month2                                  -1.754835   1.216819 -1.44215
## month3                                  -2.547239   1.337307 -1.90475
## month4                                  -1.669375   1.191154 -1.40148
## month5                                  -1.880260   1.201987 -1.56429
## month6                                  -2.569503   1.391772 -1.84621
## month7                                  -0.845618   1.251734 -0.67556
## month8                                  -3.870313   1.752344 -2.20865
## month9                                  -0.915637   1.595083 -0.57404
## month10                                 -1.905161   1.302916 -1.46223
## month11                                 -0.717994   1.161975 -0.61791
## month12                                 -3.428982   1.555205 -2.20484
## 
## Intercepts:
##                     Value   Std. Error t value
## 0-1999|2000-3999    -5.9417  1.5444    -3.8473
## 2000-3999|4000-5999 -4.7665  1.4961    -3.1860
## 4000-5999|6000-7999 -3.4304  1.4416    -2.3795
## 6000-7999|8000+     -2.1951  1.4097    -1.5571
## 
## Residual Deviance: 232.5209 
## AIC: 292.5209
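
polr() reports t-values but not p-values. If p-values are wanted, one common approach (an optional addition, not in the original analysis) is to approximate two-sided p-values from the t-values using a normal reference distribution:

# approximate two-sided p-values for the polr coefficients and intercepts
ctable <- coef(summary(m1))
p.values <- 2 * pnorm(abs(ctable[, "t value"]), lower.tail = FALSE)
head(cbind(ctable, "p value" = p.values))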

Assess Model Performance

The model achieved an overall accuracy of 25%.

test.predictions <- predict(m1, newdata = test.small)


cm <- table(actual = test.small$Order.Size, predicted =  test.predictions)

cm1 <- confusionMatrix(cm)

cm1
## Confusion Matrix and Statistics
## 
##            predicted
## actual      0-1999 2000-3999 4000-5999 6000-7999 8000+
##   0-1999         1         0         3         1     0
##   2000-3999      0         0         3         0     1
##   4000-5999      0         0         2         0     3
##   6000-7999      0         0         1         1     1
##   8000+          1         0         1         0     1
## 
## Overall Statistics
##                                          
##                Accuracy : 0.25           
##                  95% CI : (0.0866, 0.491)
##     No Information Rate : 0.5            
##     P-Value [Acc > NIR] : 0.9941         
##                                          
##                   Kappa : 0.0506         
##                                          
##  Mcnemar's Test P-Value : NA             
## 
## Statistics by Class:
## 
##                      Class: 0-1999 Class: 2000-3999 Class: 4000-5999
## Sensitivity                 0.5000               NA           0.2000
## Specificity                 0.7778              0.8           0.7000
## Pos Pred Value              0.2000               NA           0.4000
## Neg Pred Value              0.9333               NA           0.4667
## Prevalence                  0.1000              0.0           0.5000
## Detection Rate              0.0500              0.0           0.1000
## Detection Prevalence        0.2500              0.2           0.2500
## Balanced Accuracy           0.6389               NA           0.4500
##                      Class: 6000-7999 Class: 8000+
## Sensitivity                    0.5000       0.1667
## Specificity                    0.8889       0.8571
## Pos Pred Value                 0.3333       0.3333
## Neg Pred Value                 0.9412       0.7059
## Prevalence                     0.1000       0.3000
## Detection Rate                 0.0500       0.0500
## Detection Prevalence           0.1500       0.1500
## Balanced Accuracy              0.6944       0.5119

KNN

Prepare Data for KNN

KNN uses distance to determine the closest neighbors, so all numeric predictors should be normalized and all categorical predictors converted to dummy variables. I also removed variables that can be derived from other variables, such as Total.Cost, Total.Profit, and Total.Revenue.

# drop Country, the date columns, Units.Sold, and the Total.* variables,
# then scale the numeric columns with normalize()
data.knn <- data.small[-c(2,6:8, 11:13)]
data.knn <- normalize(data.knn)

data.knn$Order.Size <- as.numeric(as.integer(factor(data.knn$Order.Size)))
data.knn <- dummy_cols(data.knn, select_columns = c('Region', 'Item.Type', 'Sales.Channel', 'Order.Priority', 'year', 'month'),
           remove_selected_columns = TRUE)

Partition Data

set.seed(123)

smp_size <- floor(0.8 * nrow(data.knn))

train_ind <- sample(seq_len(nrow(data.knn)), size = smp_size)

train.small <- data.knn[train_ind, ]
test.small <- data.knn[-train_ind, ]

train.labels <- train.small$Order.Size
test.labels <- test.small$Order.Size

train.small <- train.small[c(-6)]
test.small <- test.small[c(-6)]

Create KNN Classifier

I elected to use the square root of the training set size as the value for k.

m2 <- knn(train.small, test.small, cl = train.labels, k = sqrt(nrow(train.small)))
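
For reference, a quick check (not in the original output) of what the square-root heuristic works out to here:

# with 80 training rows, the square-root heuristic gives a k of roughly 9
sqrt(nrow(train.small))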

Assess Model Performance

The KNN Classifier had an overall accuracy of 60%.

cm <- table(actual = test.labels, predicted =  m2)

cm2 <- confusionMatrix(cm)

cm2
## Confusion Matrix and Statistics
## 
##       predicted
## actual 1 2 3 4 5
##      1 3 2 0 0 0
##      2 1 2 1 0 0
##      3 0 1 3 1 0
##      4 0 0 0 3 0
##      5 0 0 0 2 1
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6             
##                  95% CI : (0.3605, 0.8088)
##     No Information Rate : 0.3             
##     P-Value [Acc > NIR] : 0.005138        
##                                           
##                   Kappa : 0.4984          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.7500   0.4000   0.7500   0.5000   1.0000
## Specificity            0.8750   0.8667   0.8750   1.0000   0.8947
## Pos Pred Value         0.6000   0.5000   0.6000   1.0000   0.3333
## Neg Pred Value         0.9333   0.8125   0.9333   0.8235   1.0000
## Prevalence             0.2000   0.2500   0.2000   0.3000   0.0500
## Detection Rate         0.1500   0.1000   0.1500   0.1500   0.0500
## Detection Prevalence   0.2500   0.2000   0.2500   0.1500   0.1500
## Balanced Accuracy      0.8125   0.6333   0.8125   0.7500   0.9474

Large Dataset

Ordinal Logistic Regression

Partition Data

set.seed(123)

smp_size <- floor(0.8 * nrow(data.large))

train_ind <- sample(seq_len(nrow(data.large)), size = smp_size)

train.large <- data.large[train_ind, ]
test.large <- data.large[-train_ind, ]

Create Ordinal Logistic Regression Model

The model uses the same predictors as the ordinal logistic regression model fit on the small dataset.

m3 <- polr(Order.Size ~ Unit.Cost + Region + Sales.Channel + year + month, data = train.large, Hess = TRUE)
summary(m3)
## Call:
## polr(formula = Order.Size ~ Unit.Cost + Region + Sales.Channel + 
##     year + month, data = train.large, Hess = TRUE)
## 
## Coefficients:
##                                              Value Std. Error  t value
## Unit.Cost                               -7.586e-05  5.066e-05 -1.49752
## RegionAustralia and Oceania             -9.210e-04  3.878e-02 -0.02375
## RegionCentral America and the Caribbean -1.465e-02  3.531e-02 -0.41479
## RegionEurope                             1.279e-02  2.900e-02  0.44112
## RegionMiddle East and North Africa       8.945e-03  3.429e-02  0.26091
## RegionNorth America                      8.409e-02  6.326e-02  1.32926
## RegionSub-Saharan Africa                 2.664e-02  2.890e-02  0.92190
## Sales.ChannelOnline                      7.917e-03  1.768e-02  0.44768
## year2011                                -2.890e-02  3.417e-02 -0.84558
## year2012                                -2.771e-02  3.450e-02 -0.80327
## year2013                                -9.721e-03  3.446e-02 -0.28213
## year2014                                 1.868e-02  3.431e-02  0.54452
## year2015                                -3.973e-02  3.453e-02 -1.15048
## year2016                                 3.315e-02  3.443e-02  0.96271
## year2017                                -3.767e-02  4.114e-02 -0.91554
## month2                                  -3.994e-02  4.284e-02 -0.93218
## month3                                  -7.140e-02  4.231e-02 -1.68745
## month4                                  -5.043e-02  4.257e-02 -1.18477
## month5                                   2.652e-02  4.186e-02  0.63336
## month6                                   3.510e-03  4.260e-02  0.08240
## month7                                  -1.919e-02  4.221e-02 -0.45453
## month8                                   2.384e-02  4.350e-02  0.54809
## month9                                   5.620e-03  4.399e-02  0.12776
## month10                                 -5.826e-02  4.363e-02 -1.33515
## month11                                 -6.425e-02  4.373e-02 -1.46948
## month12                                  3.254e-02  4.350e-02  0.74811
## 
## Intercepts:
##                     Value    Std. Error t value 
## 0-1999|2000-3999     -1.4018   0.0461   -30.3912
## 2000-3999|4000-5999  -0.4469   0.0456    -9.8043
## 4000-5999|6000-7999   0.3682   0.0456     8.0781
## 6000-7999|8000+       1.3678   0.0461    29.6504
## 
## Residual Deviance: 128714.62 
## AIC: 128774.62

Assess Model Performance

The overall accuracy is 20.23%.

test.predictions <- predict(m3, newdata = test.large)


cm <- table(actual = test.large$Order.Size, predicted =  test.predictions)

cm3 <- confusionMatrix(cm)

cm3
## Confusion Matrix and Statistics
## 
##            predicted
## actual      0-1999 2000-3999 4000-5999 6000-7999 8000+
##   0-1999       807         0         0       935   249
##   2000-3999    873         0         0       901   206
##   4000-5999    846         0         0       926   233
##   6000-7999    893         0         0       998   188
##   8000+        821         0         0       906   218
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2023          
##                  95% CI : (0.1945, 0.2103)
##     No Information Rate : 0.4666          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : -5e-04          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0-1999 Class: 2000-3999 Class: 4000-5999
## Sensitivity                 0.1903               NA               NA
## Specificity                 0.7944            0.802           0.7995
## Pos Pred Value              0.4053               NA               NA
## Neg Pred Value              0.5714               NA               NA
## Prevalence                  0.4240            0.000           0.0000
## Detection Rate              0.0807            0.000           0.0000
## Detection Prevalence        0.1991            0.198           0.2005
## Balanced Accuracy           0.4924               NA               NA
##                      Class: 6000-7999 Class: 8000+
## Sensitivity                    0.2139       0.1993
## Specificity                    0.7973       0.8061
## Pos Pred Value                 0.4800       0.1121
## Neg Pred Value                 0.5369       0.8912
## Prevalence                     0.4666       0.1094
## Detection Rate                 0.0998       0.0218
## Detection Prevalence           0.2079       0.1945
## Balanced Accuracy              0.5056       0.5027

KNN

Prepare Data for KNN

data.knn <- data.large[-c(2,6:8, 11:13)]
data.knn <- normalize(data.knn)

data.knn$Order.Size <- as.numeric(as.integer(factor(data.knn$Order.Size)))
data.knn <- dummy_cols(data.knn, select_columns = c('Region', 'Item.Type', 'Sales.Channel', 'Order.Priority', 'year', 'month'),
           remove_selected_columns = TRUE)

Partition Data

set.seed(123)

smp_size <- floor(0.8 * nrow(data.knn))

train_ind <- sample(seq_len(nrow(data.knn)), size = smp_size)

train.large <- data.knn[train_ind, ]
test.large <- data.knn[-train_ind, ]

train.labels <- train.large$Order.Size
test.labels <- test.large$Order.Size

train.large <- train.large[c(-6)]
test.large <- test.large[c(-6)]

Create KNN Model

m4 <- knn(train.large, test.large, cl = train.labels, k = sqrt(nrow(train.large)))

Assess Model Performance

The overall accuracy is 99.1%.

cm <- table(actual = test.labels, predicted =  m4)

cm4 <- confusionMatrix(cm)

cm4
## Confusion Matrix and Statistics
## 
##       predicted
## actual    1    2    3    4    5
##      1 1990    1    0    0    0
##      2   26 1945    9    0    0
##      3    0    5 1993    7    0
##      4    0    0   16 2043   20
##      5    0    0    0    6 1939
## 
## Overall Statistics
##                                           
##                Accuracy : 0.991           
##                  95% CI : (0.9889, 0.9928)
##     No Information Rate : 0.2056          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.9887          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            0.9871   0.9969   0.9876   0.9937   0.9898
## Specificity            0.9999   0.9957   0.9985   0.9955   0.9993
## Pos Pred Value         0.9995   0.9823   0.9940   0.9827   0.9969
## Neg Pred Value         0.9968   0.9993   0.9969   0.9984   0.9975
## Prevalence             0.2016   0.1951   0.2018   0.2056   0.1959
## Detection Rate         0.1990   0.1945   0.1993   0.2043   0.1939
## Detection Prevalence   0.1991   0.1980   0.2005   0.2079   0.1945
## Balanced Accuracy      0.9935   0.9963   0.9931   0.9946   0.9945

Comparison Between Model Performance

The results echo the effect that data size has on parametric and non-parametric algorithms. Parametric models are more constrained and do not need a large amount of data to train, but their inherent restrictiveness means they are often outperformed by non-parametric methods. Non-parametric models, on the other hand, make fewer assumptions and can be very effective when enough data is available. KNN outperformed the ordinal logistic regression model on both datasets, and its performance improved markedly with more data. This improvement likely reflects the larger dataset covering the high-dimensional, dummy-encoded feature space far more densely, mitigating the curse of dimensionality.

accuracy <- c(cm1$overall[1], cm2$overall[1], cm3$overall[1], cm4$overall[1])
model.type <- c("Ordinal Logistic Regression", "KNN", "Ordinal Logistic Regression", "KNN")
dataset <- c("Small", "Small", "Large", "Large")
results <- data.frame(dataset,
           model.type,
           accuracy)

kableExtra::kable(results)
dataset   model.type                    accuracy
Small     Ordinal Logistic Regression     0.2500
Small     KNN                             0.6000
Large     Ordinal Logistic Regression     0.2023
Large     KNN                             0.9910