DATA622 Homework1

Libraries

library(rcompanion)
library(datasets)
library(caTools)
library(dplyr)
library(party)
library(rpart.plot)
library(caret)

Import Data

df100 <- read.csv("https://raw.githubusercontent.com/ltcancel/DATA622/main/Homework1/1000%20Sales%20Records.csv")
df1000 <- read.csv("https://raw.githubusercontent.com/ltcancel/DATA622/main/Homework1/100000%20Sales%20Records.csv")
head(df100)

##                         Region Country  Item.Type Sales.Channel Order.Priority
## 1 Middle East and North Africa   Libya  Cosmetics       Offline              M
## 2                North America  Canada Vegetables        Online              M
## 3 Middle East and North Africa   Libya  Baby Food       Offline              C
## 4                         Asia   Japan     Cereal       Offline              C
## 5           Sub-Saharan Africa    Chad     Fruits       Offline              H
## 6                       Europe Armenia     Cereal        Online              H
##   Order.Date  Order.ID  Ship.Date Units.Sold Unit.Price Unit.Cost Total.Revenue
## 1 10/18/2014 686800706 10/31/2014       8446     437.20    263.33    3692591.20
## 2  11/7/2011 185941302  12/8/2011       3018     154.06     90.93     464953.08
## 3 10/31/2016 246222341  12/9/2016       1517     255.28    159.42     387259.76
## 4  4/10/2010 161442649  5/12/2010       3322     205.70    117.11     683335.40
## 5  8/16/2011 645713555  8/31/2011       9845       9.33      6.92      91853.85
## 6 11/24/2014 683458888 12/28/2014       9528     205.70    117.11    1959909.60
##   Total.Cost Total.Profit
## 1  2224085.2   1468506.02
## 2   274426.7    190526.34
## 3   241840.1    145419.62
## 4   389039.4    294295.98
## 5    68127.4     23726.45
## 6  1115824.1    844085.52

head(df1000)

##                              Region               Country     Item.Type
## 1      Middle East and North Africa            Azerbaijan        Snacks
## 2 Central America and the Caribbean                Panama     Cosmetics
## 3                Sub-Saharan Africa Sao Tome and Principe        Fruits
## 4                Sub-Saharan Africa Sao Tome and Principe Personal Care
## 5 Central America and the Caribbean                Belize     Household
## 6                            Europe               Denmark       Clothes
##   Sales.Channel Order.Priority Order.Date  Order.ID  Ship.Date Units.Sold
## 1        Online              C  10/8/2014 535113847 10/23/2014        934
## 2       Offline              L  2/22/2015 874708545  2/27/2015       4551
## 3       Offline              M  12/9/2015 854349935  1/18/2016       9986
## 4        Online              M  9/17/2014 892836844 10/12/2014       9118
## 5       Offline              H   2/4/2010 129280602   3/5/2010       5858
## 6        Online              C  2/20/2013 473105037  2/28/2013       1149
##   Unit.Price Unit.Cost Total.Revenue Total.Cost Total.Profit
## 1     152.58     97.44     142509.72   91008.96     51500.76
## 2     437.20    263.33    1989697.20 1198414.83    791282.37
## 3       9.33      6.92      93169.38   69103.12     24066.26
## 4      81.73     56.67     745214.14  516717.06    228497.08
## 5     668.27    502.54    3914725.66 2943879.32    970846.34
## 6     109.28     35.84     125562.72   41180.16     84382.56

Data Structure

The first dataframe contains 1000 observations and 14 variables. 7 variables are characters, 2 are integers, and the remaining are numbers.

str(df100)

## 'data.frame':    1000 obs. of  14 variables:
##  $ Region        : chr  "Middle East and North Africa" "North America" "Middle East and North Africa" "Asia" ...
##  $ Country       : chr  "Libya" "Canada" "Libya" "Japan" ...
##  $ Item.Type     : chr  "Cosmetics" "Vegetables" "Baby Food" "Cereal" ...
##  $ Sales.Channel : chr  "Offline" "Online" "Offline" "Offline" ...
##  $ Order.Priority: chr  "M" "M" "C" "C" ...
##  $ Order.Date    : chr  "10/18/2014" "11/7/2011" "10/31/2016" "4/10/2010" ...
##  $ Order.ID      : int  686800706 185941302 246222341 161442649 645713555 683458888 679414975 208630645 266467225 118598544 ...
##  $ Ship.Date     : chr  "10/31/2014" "12/8/2011" "12/9/2016" "5/12/2010" ...
##  $ Units.Sold    : int  8446 3018 1517 3322 9845 9528 2844 7299 2428 4800 ...
##  $ Unit.Price    : num  437.2 154.06 255.28 205.7 9.33 ...
##  $ Unit.Cost     : num  263.33 90.93 159.42 117.11 6.92 ...
##  $ Total.Revenue : num  3692591 464953 387260 683335 91854 ...
##  $ Total.Cost    : num  2224085 274427 241840 389039 68127 ...
##  $ Total.Profit  : num  1468506 190526 145420 294296 23726 ...

The second dataframe contains 100,000 observations and 14 variables. The variable datatypes are exactly the same as the first dataframe.

str(df1000)

## 'data.frame':    100000 obs. of  14 variables:
##  $ Region        : chr  "Middle East and North Africa" "Central America and the Caribbean" "Sub-Saharan Africa" "Sub-Saharan Africa" ...
##  $ Country       : chr  "Azerbaijan" "Panama" "Sao Tome and Principe" "Sao Tome and Principe" ...
##  $ Item.Type     : chr  "Snacks" "Cosmetics" "Fruits" "Personal Care" ...
##  $ Sales.Channel : chr  "Online" "Offline" "Offline" "Online" ...
##  $ Order.Priority: chr  "C" "L" "M" "M" ...
##  $ Order.Date    : chr  "10/8/2014" "2/22/2015" "12/9/2015" "9/17/2014" ...
##  $ Order.ID      : int  535113847 874708545 854349935 892836844 129280602 473105037 754046475 772153747 847788178 471623599 ...
##  $ Ship.Date     : chr  "10/23/2014" "2/27/2015" "1/18/2016" "10/12/2014" ...
##  $ Units.Sold    : int  934 4551 9986 9118 5858 1149 7964 6307 8217 2758 ...
##  $ Unit.Price    : num  152.58 437.2 9.33 81.73 668.27 ...
##  $ Unit.Cost     : num  97.44 263.33 6.92 56.67 502.54 ...
##  $ Total.Revenue : num  142510 1989697 93169 745214 3914726 ...
##  $ Total.Cost    : num  91009 1198415 69103 516717 2943879 ...
##  $ Total.Profit  : num  51501 791282 24066 228497 970846 ...

Data Comparison

Both dataframes look nearly identical, other than the obvious difference in the number of observations. If we look a little closer to the summary of each data set we can find some more differences. The minimum values for the Total.Revenue variable differs. The minimum for the first dataframe is 2043 and the minimum for the second dataframe is 19. Total.Cost and Total.Profit also show a significant difference with a minimum value of 1417 and 532.6 for respectively for the first dataframe and 14 and 4.8 for the second dataframe. Neither dataframe has NA values. The minimum and maximum values do not look off. They all look like valid values based on the variable names.

From the summaries below we can see that both datasets seem relatively simple with mostly categorical data so I will use Decision Tree and SVM models.

#check for NA values
summary(df100)

##     Region            Country           Item.Type         Sales.Channel     
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Order.Priority      Order.Date           Order.ID          Ship.Date        
##  Length:1000        Length:1000        Min.   :102928006   Length:1000       
##  Class :character   Class :character   1st Qu.:328074026   Class :character  
##  Mode  :character   Mode  :character   Median :556609714   Mode  :character  
##                                        Mean   :549681325                     
##                                        3rd Qu.:769694483                     
##                                        Max.   :995529830                     
##    Units.Sold     Unit.Price       Unit.Cost      Total.Revenue    
##  Min.   :  13   Min.   :  9.33   Min.   :  6.92   Min.   :   2043  
##  1st Qu.:2420   1st Qu.: 81.73   1st Qu.: 56.67   1st Qu.: 281192  
##  Median :5184   Median :154.06   Median : 97.44   Median : 754939  
##  Mean   :5054   Mean   :262.11   Mean   :184.97   Mean   :1327322  
##  3rd Qu.:7537   3rd Qu.:421.89   3rd Qu.:263.33   3rd Qu.:1733503  
##  Max.   :9998   Max.   :668.27   Max.   :524.96   Max.   :6617210  
##    Total.Cost       Total.Profit      
##  Min.   :   1417   Min.   :    532.6  
##  1st Qu.: 164932   1st Qu.:  98376.1  
##  Median : 464726   Median : 277226.0  
##  Mean   : 936119   Mean   : 391202.6  
##  3rd Qu.:1141750   3rd Qu.: 548456.8  
##  Max.   :5204978   Max.   :1726181.4

summary(df1000)

##     Region            Country           Item.Type         Sales.Channel     
##  Length:100000      Length:100000      Length:100000      Length:100000     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Order.Priority      Order.Date           Order.ID          Ship.Date        
##  Length:100000      Length:100000      Min.   :100008904   Length:100000     
##  Class :character   Class :character   1st Qu.:326046383   Class :character  
##  Mode  :character   Mode  :character   Median :547718512   Mode  :character  
##                                        Mean   :550395554                     
##                                        3rd Qu.:775078534                     
##                                        Max.   :999996459                     
##    Units.Sold      Unit.Price       Unit.Cost      Total.Revenue    
##  Min.   :    1   Min.   :  9.33   Min.   :  6.92   Min.   :     19  
##  1st Qu.: 2505   1st Qu.:109.28   1st Qu.: 56.67   1st Qu.: 279753  
##  Median : 5007   Median :205.70   Median :117.11   Median : 789892  
##  Mean   : 5001   Mean   :266.70   Mean   :188.02   Mean   :1336067  
##  3rd Qu.: 7495   3rd Qu.:437.20   3rd Qu.:364.69   3rd Qu.:1836490  
##  Max.   :10000   Max.   :668.27   Max.   :524.96   Max.   :6682700  
##    Total.Cost       Total.Profit      
##  Min.   :     14   Min.   :      4.8  
##  1st Qu.: 162928   1st Qu.:  95900.0  
##  Median : 467937   Median : 283657.5  
##  Mean   : 941975   Mean   : 394091.2  
##  3rd Qu.:1209475   3rd Qu.: 568384.1  
##  Max.   :5249075   Max.   :1738700.0

Decision Tree

The modeling for this data is very experimental because it is not clear what the business problem or question is. What are we attempting to solve for? Since we have two clear distinct categorical variables, Sales Channel and Order Priority, we can use one to be our predictor variable.

We will use a Decision Tree as our modeling technique for both datasets.

df100$Sales.Channel <- as.factor(df100$Sales.Channel)
df100$Order.Priority <- as.factor(df100$Order.Priority)
df1000$Sales.Channel <- as.factor(df1000$Sales.Channel)
df1000$Order.Priority <- as.factor(df1000$Order.Priority)
str(df1000)

## 'data.frame':    100000 obs. of  14 variables:
##  $ Region        : chr  "Middle East and North Africa" "Central America and the Caribbean" "Sub-Saharan Africa" "Sub-Saharan Africa" ...
##  $ Country       : chr  "Azerbaijan" "Panama" "Sao Tome and Principe" "Sao Tome and Principe" ...
##  $ Item.Type     : chr  "Snacks" "Cosmetics" "Fruits" "Personal Care" ...
##  $ Sales.Channel : Factor w/ 2 levels "Offline","Online": 2 1 1 2 1 2 1 2 2 2 ...
##  $ Order.Priority: Factor w/ 4 levels "C","H","L","M": 1 3 4 4 2 1 4 1 2 2 ...
##  $ Order.Date    : chr  "10/8/2014" "2/22/2015" "12/9/2015" "9/17/2014" ...
##  $ Order.ID      : int  535113847 874708545 854349935 892836844 129280602 473105037 754046475 772153747 847788178 471623599 ...
##  $ Ship.Date     : chr  "10/23/2014" "2/27/2015" "1/18/2016" "10/12/2014" ...
##  $ Units.Sold    : int  934 4551 9986 9118 5858 1149 7964 6307 8217 2758 ...
##  $ Unit.Price    : num  152.58 437.2 9.33 81.73 668.27 ...
##  $ Unit.Cost     : num  97.44 263.33 6.92 56.67 502.54 ...
##  $ Total.Revenue : num  142510 1989697 93169 745214 3914726 ...
##  $ Total.Cost    : num  91009 1198415 69103 516717 2943879 ...
##  $ Total.Profit  : num  51501 791282 24066 228497 970846 ...

#sample set
df100_data <- df100 %>%
  select(Order.Priority, Units.Sold, Unit.Price, Unit.Cost, Total.Revenue, Total.Cost, Total.Profit)

set.seed(1234)

#dt <- sort(sample(nrow(df100_data),nrow(df100_data)*.7))
#train_data <- df100_data[df,]

sample_data <- sample.split(df100_data, SplitRatio = 0.80)
train_data <- subset(df100_data, sample_data == TRUE)
test_data <- subset(df100_data, sample_data == FALSE)
head(train_data)

##   Order.Priority Units.Sold Unit.Price Unit.Cost Total.Revenue Total.Cost
## 1              M       8446     437.20    263.33     3692591.2  2224085.2
## 2              M       3018     154.06     90.93      464953.1   274426.7
## 3              C       1517     255.28    159.42      387259.8   241840.1
## 4              C       3322     205.70    117.11      683335.4   389039.4
## 7              H       2844     205.70    117.11      585010.8   333060.8
## 8              M       7299     109.28     35.84      797634.7   261596.2
##   Total.Profit
## 1    1468506.0
## 2     190526.3
## 3     145419.6
## 4     294296.0
## 7     251950.0
## 8     536038.6

Build tree for first set

rtree <- rpart(Order.Priority ~ ., data = train_data, method="class", control = rpart.control(minsplit = 20, minbucket = 7, maxdepth = 10, usesurrogate = 2, xval = 10))
rtree

## n= 714 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 714 521 L (0.26610644 0.22689076 0.27030812 0.23669468)  
##    2) Units.Sold< 6835 468 327 C (0.30128205 0.23504274 0.22863248 0.23504274)  
##      4) Units.Sold>=6233.5 46  25 C (0.45652174 0.19565217 0.26086957 0.08695652) *
##      5) Units.Sold< 6233.5 422 302 C (0.28436019 0.23933649 0.22511848 0.25118483)  
##       10) Total.Cost< 1481696 377 275 C (0.27055703 0.25994695 0.22811671 0.24137931)  
##         20) Total.Profit< 217526 250 169 C (0.32400000 0.24400000 0.19200000 0.24000000) *
##         21) Total.Profit>=217526 127  89 L (0.16535433 0.29133858 0.29921260 0.24409449)  
##           42) Total.Revenue>=1549050 16   8 H (0.31250000 0.50000000 0.12500000 0.06250000) *
##           43) Total.Revenue< 1549050 111  75 L (0.14414414 0.26126126 0.32432432 0.27027027)  
##             86) Total.Revenue< 951215.4 66  44 H (0.13636364 0.33333333 0.24242424 0.28787879) *
##             87) Total.Revenue>=951215.4 45  25 L (0.15555556 0.15555556 0.44444444 0.24444444) *
##       11) Total.Cost>=1481696 45  27 C (0.40000000 0.06666667 0.20000000 0.33333333) *
##    3) Units.Sold>=6835 246 160 L (0.19918699 0.21138211 0.34959350 0.23983740) *

Plot the tree

rpart.plot(rtree)

Prediction of first dataset

pred <- predict(rtree, test_data, type = "class")
pred_table <- table(test_data$Order.Priority, pred)
pred_table

##    pred
##      C  H  L  M
##   C 42 11 19  0
##   H 28  9 29  0
##   L 34 17 24  0
##   M 30 11 32  0

Confusion Matrix to test for accuracy

confusionMatrix(test_data$Order.Priority, pred)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  C  H  L  M
##          C 42 11 19  0
##          H 28  9 29  0
##          L 34 17 24  0
##          M 30 11 32  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2622          
##                  95% CI : (0.2122, 0.3173)
##     No Information Rate : 0.4685          
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0.0136          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
## 
## Statistics by Class:
## 
##                      Class: C Class: H Class: L Class: M
## Sensitivity            0.3134  0.18750  0.23077       NA
## Specificity            0.8026  0.76050  0.71978   0.7448
## Pos Pred Value         0.5833  0.13636  0.32000       NA
## Neg Pred Value         0.5701  0.82273  0.62085       NA
## Prevalence             0.4685  0.16783  0.36364   0.0000
## Detection Rate         0.1469  0.03147  0.08392   0.0000
## Detection Prevalence   0.2517  0.23077  0.26224   0.2552
## Balanced Accuracy      0.5580  0.47400  0.47527       NA

We will build the same decision tree with the second dataset

#sample set
df1000_data <- df1000 %>%
  select(Order.Priority, Units.Sold, Unit.Price, Unit.Cost, Total.Revenue, Total.Cost, Total.Profit)

set.seed(1234)


sample_data2 <- sample.split(df1000_data, SplitRatio = 0.80)
train_data2 <- subset(df1000_data, sample_data2 == TRUE)
test_data2 <- subset(df1000_data, sample_data2 == FALSE)
head(train_data2)

##   Order.Priority Units.Sold Unit.Price Unit.Cost Total.Revenue Total.Cost
## 1              C        934     152.58     97.44     142509.72   91008.96
## 2              L       4551     437.20    263.33    1989697.20 1198414.83
## 3              M       9986       9.33      6.92      93169.38   69103.12
## 4              M       9118      81.73     56.67     745214.14  516717.06
## 7              M       7964     437.20    263.33    3481860.80 2097160.12
## 8              C       6307       9.33      6.92      58844.31   43644.44
##   Total.Profit
## 1     51500.76
## 2    791282.37
## 3     24066.26
## 4    228497.08
## 7   1384700.68
## 8     15199.87

Build tree for second set

rtree2 <- rpart(Order.Priority ~ ., data = train_data2, method="class", control = rpart.control(minsplit = 4, minbucket = round(5/3), maxdepth = 3))
rtree2

## n= 71429 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
## 1) root 71429 53521 M (0.2506685 0.2483445 0.2502765 0.2507105) *

Plot the tree for second set

#rpart.plot(rtree2)