Pre-work

Visit the following website and explore the range of sizes of this dataset (from 100 to 5 million records): https://excelbianalytics.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/ Select 2 files to download: based on your computer’s capabilities (memory, CPU), select 2 files you can handle (recommended: one small, one large). Download the files. Review the structure and content of the tables, and think about the data sets (structure, size, dependencies, labels, etc.). Consider the similarities and differences in the two data sets you have downloaded. Think about how to analyze and predict an outcome based on the datasets available. Based on the data you have, think about which two machine learning algorithms presented so far could be used to analyze the data. Deliverable

Data Selection

As part of this exercise, I will be analyzing two sales data sets: one with 1,000 records and one with 100,000 records.

Load library

library(stats)
library(corrplot)
library(dplyr)
library(tidyverse)
library(tidymodels)
library(caret)
library(rpart.plot)

Load the data

# Small dataset: 1,000 sales records (gzip-compressed CSV)
smallSalesData <- readr::read_csv(file = "1000_Sales_Records.csv.gz")

# Large dataset: 100,000 sales records (gzip-compressed CSV)
bigSalesData <- readr::read_csv(file = "100000_Sales_Records.csv.gz")

Exploratory Data analysis

The following is a glimpse and summary of the data. There are no missing values. To get a quick view of our data, we use the glimpse() command to show us our variable names, data types, and some sample data.

# Snippet of the data
head(smallSalesData)
## # A tibble: 6 x 14
##   Region       Country `Item Type` `Sales Channel` `Order Priority` `Order Date`
##   <chr>        <chr>   <chr>       <chr>           <chr>            <chr>       
## 1 Middle East~ Libya   Cosmetics   Offline         M                10/18/2014  
## 2 North Ameri~ Canada  Vegetables  Online          M                11/7/2011   
## 3 Middle East~ Libya   Baby Food   Offline         C                10/31/2016  
## 4 Asia         Japan   Cereal      Offline         C                4/10/2010   
## 5 Sub-Saharan~ Chad    Fruits      Offline         H                8/16/2011   
## 6 Europe       Armenia Cereal      Online          H                11/24/2014  
## # ... with 8 more variables: `Order ID` <dbl>, `Ship Date` <chr>,
## #   `Units Sold` <dbl>, `Unit Price` <dbl>, `Unit Cost` <dbl>,
## #   `Total Revenue` <dbl>, `Total Cost` <dbl>, `Total Profit` <dbl>
# glimpse and summary of the data
glimpse(smallSalesData)
## Rows: 1,000
## Columns: 14
## $ Region           <chr> "Middle East and North Africa", "North America", "Mid~
## $ Country          <chr> "Libya", "Canada", "Libya", "Japan", "Chad", "Armenia~
## $ `Item Type`      <chr> "Cosmetics", "Vegetables", "Baby Food", "Cereal", "Fr~
## $ `Sales Channel`  <chr> "Offline", "Online", "Offline", "Offline", "Offline",~
## $ `Order Priority` <chr> "M", "M", "C", "C", "H", "H", "H", "M", "H", "H", "M"~
## $ `Order Date`     <chr> "10/18/2014", "11/7/2011", "10/31/2016", "4/10/2010",~
## $ `Order ID`       <dbl> 686800706, 185941302, 246222341, 161442649, 645713555~
## $ `Ship Date`      <chr> "10/31/2014", "12/8/2011", "12/9/2016", "5/12/2010", ~
## $ `Units Sold`     <dbl> 8446, 3018, 1517, 3322, 9845, 9528, 2844, 7299, 2428,~
## $ `Unit Price`     <dbl> 437.20, 154.06, 255.28, 205.70, 9.33, 205.70, 205.70,~
## $ `Unit Cost`      <dbl> 263.33, 90.93, 159.42, 117.11, 6.92, 117.11, 117.11, ~
## $ `Total Revenue`  <dbl> 3692591.20, 464953.08, 387259.76, 683335.40, 91853.85~
## $ `Total Cost`     <dbl> 2224085.18, 274426.74, 241840.14, 389039.42, 68127.40~
## $ `Total Profit`   <dbl> 1468506.02, 190526.34, 145419.62, 294295.98, 23726.45~
summary(smallSalesData)
##     Region            Country           Item Type         Sales Channel     
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Order Priority      Order Date           Order ID          Ship Date        
##  Length:1000        Length:1000        Min.   :102928006   Length:1000       
##  Class :character   Class :character   1st Qu.:328074026   Class :character  
##  Mode  :character   Mode  :character   Median :556609714   Mode  :character  
##                                        Mean   :549681325                     
##                                        3rd Qu.:769694483                     
##                                        Max.   :995529830                     
##    Units Sold     Unit Price       Unit Cost      Total Revenue    
##  Min.   :  13   Min.   :  9.33   Min.   :  6.92   Min.   :   2043  
##  1st Qu.:2420   1st Qu.: 81.73   1st Qu.: 56.67   1st Qu.: 281192  
##  Median :5184   Median :154.06   Median : 97.44   Median : 754939  
##  Mean   :5054   Mean   :262.11   Mean   :184.97   Mean   :1327322  
##  3rd Qu.:7537   3rd Qu.:421.89   3rd Qu.:263.33   3rd Qu.:1733503  
##  Max.   :9998   Max.   :668.27   Max.   :524.96   Max.   :6617210  
##    Total Cost       Total Profit      
##  Min.   :   1417   Min.   :    532.6  
##  1st Qu.: 164932   1st Qu.:  98376.1  
##  Median : 464726   Median : 277226.0  
##  Mean   : 936119   Mean   : 391202.6  
##  3rd Qu.:1141750   3rd Qu.: 548456.8  
##  Max.   :5204978   Max.   :1726181.4
#summary of the bigdata
glimpse(bigSalesData)
## Rows: 100,000
## Columns: 14
## $ Region           <chr> "Middle East and North Africa", "Central America and ~
## $ Country          <chr> "Azerbaijan", "Panama", "Sao Tome and Principe", "Sao~
## $ `Item Type`      <chr> "Snacks", "Cosmetics", "Fruits", "Personal Care", "Ho~
## $ `Sales Channel`  <chr> "Online", "Offline", "Offline", "Online", "Offline", ~
## $ `Order Priority` <chr> "C", "L", "M", "M", "H", "C", "M", "C", "H", "H", "C"~
## $ `Order Date`     <chr> "10/8/2014", "2/22/2015", "12/9/2015", "9/17/2014", "~
## $ `Order ID`       <dbl> 535113847, 874708545, 854349935, 892836844, 129280602~
## $ `Ship Date`      <chr> "10/23/2014", "2/27/2015", "1/18/2016", "10/12/2014",~
## $ `Units Sold`     <dbl> 934, 4551, 9986, 9118, 5858, 1149, 7964, 6307, 8217, ~
## $ `Unit Price`     <dbl> 152.58, 437.20, 9.33, 81.73, 668.27, 109.28, 437.20, ~
## $ `Unit Cost`      <dbl> 97.44, 263.33, 6.92, 56.67, 502.54, 35.84, 263.33, 6.~
## $ `Total Revenue`  <dbl> 142509.72, 1989697.20, 93169.38, 745214.14, 3914725.6~
## $ `Total Cost`     <dbl> 91008.96, 1198414.83, 69103.12, 516717.06, 2943879.32~
## $ `Total Profit`   <dbl> 51500.76, 791282.37, 24066.26, 228497.08, 970846.34, ~
summary(bigSalesData)
##     Region            Country           Item Type         Sales Channel     
##  Length:100000      Length:100000      Length:100000      Length:100000     
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Order Priority      Order Date           Order ID          Ship Date        
##  Length:100000      Length:100000      Min.   :100008904   Length:100000     
##  Class :character   Class :character   1st Qu.:326046383   Class :character  
##  Mode  :character   Mode  :character   Median :547718512   Mode  :character  
##                                        Mean   :550395554                     
##                                        3rd Qu.:775078534                     
##                                        Max.   :999996459                     
##    Units Sold      Unit Price       Unit Cost      Total Revenue    
##  Min.   :    1   Min.   :  9.33   Min.   :  6.92   Min.   :     19  
##  1st Qu.: 2505   1st Qu.:109.28   1st Qu.: 56.67   1st Qu.: 279753  
##  Median : 5007   Median :205.70   Median :117.11   Median : 789892  
##  Mean   : 5001   Mean   :266.70   Mean   :188.02   Mean   :1336067  
##  3rd Qu.: 7495   3rd Qu.:437.20   3rd Qu.:364.69   3rd Qu.:1836490  
##  Max.   :10000   Max.   :668.27   Max.   :524.96   Max.   :6682700  
##    Total Cost       Total Profit      
##  Min.   :     14   Min.   :      4.8  
##  1st Qu.: 162928   1st Qu.:  95900.0  
##  Median : 467937   Median : 283657.5  
##  Mean   : 941975   Mean   : 394091.2  
##  3rd Qu.:1209475   3rd Qu.: 568384.1  
##  Max.   :5249075   Max.   :1738700.0

The data set contains the following 14 features

Region, Country, Item Type, Sales Channel, Order Priority, Order Date, Order ID, Ship Date, Units Sold, Unit Price, Unit Cost, Total Revenue, Total Cost, Total Profit.

The dataset contains sales data for different items across different regions and countries.

Dealing with Multicollinearity

# Correlation plot of the numeric measures in the small dataset.
# `Order ID` is stored as a number but is only an identifier, so it is
# excluded: its correlation with the sales figures carries no meaning.
smallSalesData %>%
  dplyr::select(where(is.numeric), -`Order ID`) %>%
  cor() %>%
  corrplot()

From our analysis it is clear that the variables Units Sold, Unit Price, and Unit Cost directly influence Total Revenue, Total Cost, and Total Profit. Since collinearity exists between these predictor variables, we can ignore Units Sold, Unit Price, and Unit Cost and use Total Profit instead.

Data clean up

Since there are no missing values in the data, no imputation is required. Because several variables are derived from one another, we can remove those predictors to keep the dataset simple.

# Keep only the categorical descriptors, the order identifier and the profit
# figure. Each column is listed once and referenced with backticks.
smallSalesData <- smallSalesData %>%
  select(
    Region, Country, `Item Type`, `Sales Channel`, `Order Priority`,
    `Order ID`, `Total Profit`
  )

bigSalesData <- bigSalesData %>%
  select(
    Region, Country, `Item Type`, `Sales Channel`, `Order Priority`,
    `Order ID`, `Total Profit`
  )


# Convert column types.
# `Order ID` is an identifier, so it is stored as text. as.character()
# converts element by element; toString() would instead collapse the whole
# column into one comma-separated string and recycle it onto every row.
# `Sales Channel` is the classification target, so it is stored as a factor.

smallSalesData[["Order ID"]] <- as.character(smallSalesData[["Order ID"]])
smallSalesData[["Sales Channel"]] <- as.factor(smallSalesData[["Sales Channel"]])


bigSalesData[["Order ID"]] <- as.character(bigSalesData[["Order ID"]])
bigSalesData[["Sales Channel"]] <- as.factor(bigSalesData[["Sales Channel"]])

# Bar chart of the number of orders per sales channel (small dataset)
plot(smallSalesData$`Sales Channel`)

Machine Learning Algorithms

Looking at the data, I think it is most useful to build a machine learning model that predicts the sales channel from the item type, country, region, and total profit. Such a model could help anyone who already runs a business, or is planning to start a new one, in these regions.

Since this is a classification problem to predict the sales channel, I decided to use Decision tree algorithm.

Decision Tree

For this assignment, I decided to attempt a classification problem using a decision tree. The only two possible outcomes are Offline and Online.

Decision trees use a tree-like structure to represent the relationship between predictors and potential outcomes. A decision tree begins with a single partition known as the root node, which is then followed by progressively smaller partitions as the tree splits and grows. At each point where the tree splits, a decision is made in terms of how to further partition the data based on the values of a particular predictor.

The end or terminal nodes of the tree are known as the leaf nodes. These nodes represent the predicted outcome based on the set of decisions made from the root node, through the decision nodes to the leaf node.

The plan is to build a training and testing set using the data. Build a model using the training data and evaluate its performance using the testing data. The following code sets the seed and partitions the data into training and testing sets. I will do this on both the small and large data sets.

Splitting the Data

We split our dataset by partitioning 80 percent of the original data as training data and the remaining 20 percent as test data.

# Splitting the data 80/20
set.seed(1234)

# createDataPartition() samples within each Sales Channel level; with
# list = FALSE it returns the selected row numbers as a one-column matrix.
training.samples <- smallSalesData$`Sales Channel` %>%
  createDataPartition(p = 0.8, list = FALSE)

# Tibbles deprecate matrix row subscripts (tibble >= 3.0.0), so index with a
# plain integer vector instead.
train_rows <- as.vector(training.samples)

train.data <- smallSalesData[train_rows, ]
test.data <- smallSalesData[-train_rows, ]

round(prop.table(table(select(train.data, `Sales Channel`))),2)
## 
## Offline  Online 
##    0.52    0.48
round(prop.table(table(select(test.data, `Sales Channel`))),2)
## 
## Offline  Online 
##    0.52    0.48

Building the Decision Tree Classification Model

# Classification tree for Sales Channel, fitted on the small training split
small_tree_model <- rpart(
  formula = `Sales Channel` ~ Region + `Item Type` + `Order Priority` +
    `Total Profit`,
  data = train.data,
  method = "class"
)
small_tree_model
## n= 800 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 800 384 Offline (0.5200000 0.4800000)  
##    2) Item Type=Cereal,Clothes,Cosmetics,Fruits,Meat,Office Supplies,Snacks 441 178 Offline (0.5963719 0.4036281)  
##      4) Total Profit>=13887.8 403 155 Offline (0.6153846 0.3846154)  
##        8) Region=Asia,Australia and Oceania,Central America and the Caribbean,Sub-Saharan Africa 239  81 Offline (0.6610879 0.3389121) *
##        9) Region=Europe,Middle East and North Africa,North America 164  74 Offline (0.5487805 0.4512195)  
##         18) Item Type=Cereal,Clothes,Fruits,Snacks 94  33 Offline (0.6489362 0.3510638) *
##         19) Item Type=Cosmetics,Meat,Office Supplies 70  29 Online (0.4142857 0.5857143) *
##      5) Total Profit< 13887.8 38  15 Online (0.3947368 0.6052632) *
##    3) Item Type=Baby Food,Beverages,Household,Personal Care,Vegetables 359 153 Online (0.4261838 0.5738162)  
##      6) Region=Australia and Oceania,Central America and the Caribbean,Europe,North America 170  83 Online (0.4882353 0.5117647)  
##       12) Order Priority=C,H,L 129  59 Offline (0.5426357 0.4573643)  
##         24) Item Type=Baby Food,Beverages,Household,Personal Care 101  40 Offline (0.6039604 0.3960396) *
##         25) Item Type=Vegetables 28   9 Online (0.3214286 0.6785714) *
##       13) Order Priority=M 41  13 Online (0.3170732 0.6829268) *
##      7) Region=Asia,Middle East and North Africa,Sub-Saharan Africa 189  70 Online (0.3703704 0.6296296) *

Performance of the Model

The performance of the model can be assessed using the confusion matrix and by measuring the accuracy of its predictions.

Confusion Matrix

 rpart.plot(small_tree_model)

# Predicted class labels for the held-out rows of the small dataset
permits_pred <- predict(small_tree_model, newdata = test.data, type = "class")
permits_pred_table <- table(test.data[["Sales Channel"]], permits_pred)

# Confusion matrix (rows: actual channel, columns: predicted channel)
permits_pred_table
##          permits_pred
##           Offline Online
##   Offline      56     48
##   Online       47     49
sum(diag(permits_pred_table)) / nrow(test.data)
## [1] 0.525

The decision tree model for small dataset has an accuracy of 52.5%

Apply the Decision tree model for big data

# Split the large dataset 80/20 on Sales Channel.
# No seed is set in this chunk, so the partition depends on the RNG state left
# by the code that ran before it.

trainingBigsamples <- bigSalesData$`Sales Channel` %>%
  createDataPartition(p = 0.8, list = FALSE)

# createDataPartition(list = FALSE) returns a one-column matrix; tibbles
# deprecate matrix row subscripts (tibble >= 3.0.0), so index with a vector.
train_big_rows <- as.vector(trainingBigsamples)

trainBigdata <- bigSalesData[train_big_rows, ]
testBigdata <- bigSalesData[-train_big_rows, ]

round(prop.table(table(select(trainBigdata, `Sales Channel`))),2)
## 
## Offline  Online 
##     0.5     0.5
round(prop.table(table(select(testBigdata, `Sales Channel`))),2)
## 
## Offline  Online 
##     0.5     0.5
# Classification tree for Sales Channel, fitted on the large training split
big_tree_model <- rpart(
  formula = `Sales Channel` ~ Region + Country + `Item Type` +
    `Order Priority` + `Total Profit`,
  data = trainBigdata,
  method = "class"
)

# Predicted class labels for the held-out rows of the large dataset
big_sale_pred <- predict(big_tree_model, newdata = testBigdata, type = "class")
big_sales_pred_table <- table(testBigdata[["Sales Channel"]], big_sale_pred)

# Confusion matrix (rows: actual channel, columns: predicted channel)
big_sales_pred_table
##          big_sale_pred
##           Offline Online
##   Offline    4170   5819
##   Online     4161   5849
sum(diag(big_sales_pred_table)) / nrow(testBigdata)
## [1] 0.500975

The decision tree model for the large dataset has an accuracy of 50.1%

Logistic Regression

The Logistic Regression is a regression model in which the response variable (dependent variable) has binary value. It actually measures the probability of a binary response as the value of response variable based on the mathematical equation relating it with the predictor variables.

Model creation

# Binomial logistic regression of Sales Channel on Region, Country, Item Type,
# Order Priority and Total Profit.
# NOTE(review): despite the "small" prefix in the name, this model is fitted on
# trainBigdata (the large training split).
# NOTE(review): the summary reports coefficients "not defined because of
# singularities"; presumably Region is redundant once Country is in the model
# -- confirm, and consider dropping one of the two.
smallLRModel <- glm(`Sales Channel` ~ `Region` + `Country`+ `Item Type` + `Order Priority` + `Total Profit`, data =trainBigdata, family = 'binomial')
# Print the call, coefficient table and deviance statistics
summary(smallLRModel)
## 
## Call:
## glm(formula = `Sales Channel` ~ Region + Country + `Item Type` + 
##     `Order Priority` + `Total Profit`, family = "binomial", data = trainBigdata)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -1.305  -1.179   1.062   1.171   1.318  
## 
## Coefficients: (6 not defined because of singularities)
##                                           Estimate Std. Error z value Pr(>|z|)
## (Intercept)                             -6.493e-02  1.018e-01  -0.638  0.52341
## RegionAustralia and Oceania              6.728e-02  1.377e-01   0.489  0.62517
## RegionCentral America and the Caribbean -2.889e-02  1.384e-01  -0.209  0.83470
## RegionEurope                            -9.405e-02  1.339e-01  -0.702  0.48260
## RegionMiddle East and North Africa       8.965e-02  1.364e-01   0.657  0.51103
## RegionNorth America                     -5.388e-02  1.399e-01  -0.385  0.70007
## RegionSub-Saharan Africa                 2.697e-01  1.386e-01   1.945  0.05176
## CountryAlbania                           1.564e-01  1.342e-01   1.166  0.24367
## CountryAlgeria                          -2.538e-01  1.377e-01  -1.843  0.06538
## CountryAndorra                           2.230e-01  1.344e-01   1.659  0.09702
## CountryAngola                           -3.155e-01  1.417e-01  -2.227  0.02595
## CountryAntigua and Barbuda              -3.028e-02  1.392e-01  -0.218  0.82781
## CountryArmenia                           1.647e-01  1.340e-01   1.229  0.21900
## CountryAustralia                         2.102e-02  1.367e-01   0.154  0.87781
## CountryAustria                           1.224e-01  1.350e-01   0.907  0.36455
## CountryAzerbaijan                       -3.035e-04  1.350e-01  -0.002  0.99821
## CountryBahrain                          -8.872e-02  1.351e-01  -0.657  0.51145
## CountryBangladesh                        1.802e-01  1.364e-01   1.321  0.18638
## CountryBarbados                          1.688e-01  1.407e-01   1.200  0.23033
## CountryBelarus                           1.829e-01  1.319e-01   1.386  0.16570
## CountryBelgium                           4.042e-01  1.326e-01   3.049  0.00229
## CountryBelize                            1.244e-01  1.388e-01   0.897  0.36997
## CountryBenin                            -1.983e-01  1.376e-01  -1.441  0.14950
## CountryBhutan                            2.628e-02  1.364e-01   0.193  0.84724
## CountryBosnia and Herzegovina            1.884e-01  1.323e-01   1.424  0.15452
## CountryBotswana                         -1.226e-01  1.360e-01  -0.901  0.36736
## CountryBrunei                            8.413e-02  1.359e-01   0.619  0.53583
## CountryBulgaria                          1.424e-01  1.302e-01   1.093  0.27436
## CountryBurkina Faso                     -2.886e-01  1.382e-01  -2.089  0.03669
## CountryBurundi                          -1.701e-01  1.379e-01  -1.233  0.21745
## CountryCambodia                          1.126e-01  1.344e-01   0.838  0.40208
## CountryCameroon                         -2.720e-01  1.382e-01  -1.968  0.04908
## CountryCanada                            9.713e-02  1.396e-01   0.696  0.48663
## CountryCape Verde                       -1.325e-01  1.365e-01  -0.971  0.33166
## CountryCentral African Republic         -1.169e-01  1.375e-01  -0.850  0.39514
## CountryChad                             -4.288e-01  1.392e-01  -3.080  0.00207
## CountryChina                             4.779e-02  1.363e-01   0.351  0.72587
## CountryComoros                          -6.194e-02  1.392e-01  -0.445  0.65641
## CountryCosta Rica                       -2.088e-02  1.362e-01  -0.153  0.87820
## CountryCote d'Ivoire                    -2.364e-01  1.365e-01  -1.731  0.08337
## CountryCroatia                          -1.376e-02  1.328e-01  -0.104  0.91751
## CountryCuba                              1.156e-01  1.369e-01   0.845  0.39822
## CountryCyprus                            1.923e-01  1.329e-01   1.447  0.14793
## CountryCzech Republic                    9.706e-02  1.319e-01   0.736  0.46172
## CountryDemocratic Republic of the Congo -3.687e-01  1.402e-01  -2.630  0.00853
## CountryDenmark                           7.893e-02  1.336e-01   0.591  0.55453
## CountryDjibouti                         -2.889e-01  1.373e-01  -2.104  0.03538
## CountryDominica                         -3.843e-02  1.374e-01  -0.280  0.77967
## CountryDominican Republic               -8.651e-02  1.393e-01  -0.621  0.53469
## CountryEast Timor                        2.250e-01  1.370e-01   1.642  0.10058
## CountryEgypt                            -4.598e-03  1.359e-01  -0.034  0.97301
## CountryEl Salvador                       3.921e-02  1.370e-01   0.286  0.77477
## CountryEquatorial Guinea                -2.400e-01  1.370e-01  -1.752  0.07985
## CountryEritrea                          -2.425e-01  1.374e-01  -1.766  0.07742
## CountryEstonia                           2.418e-01  1.324e-01   1.826  0.06782
## CountryEthiopia                         -1.678e-01  1.363e-01  -1.231  0.21831
## CountryFederated States of Micronesia   -1.032e-02  1.347e-01  -0.077  0.93891
## CountryFiji                              1.793e-02  1.377e-01   0.130  0.89643
## CountryFinland                           2.966e-01  1.332e-01   2.226  0.02598
## CountryFrance                            5.359e-02  1.329e-01   0.403  0.68686
## CountryGabon                            -2.330e-01  1.386e-01  -1.681  0.09276
## CountryGeorgia                           2.780e-01  1.361e-01   2.043  0.04110
## CountryGermany                           7.904e-02  1.365e-01   0.579  0.56250
## CountryGhana                            -1.516e-02  1.413e-01  -0.107  0.91459
## CountryGreece                           -1.329e-01  1.343e-01  -0.990  0.32226
## CountryGreenland                         2.866e-01  1.384e-01   2.072  0.03831
## CountryGrenada                           9.841e-03  1.368e-01   0.072  0.94266
## CountryGuatemala                        -2.766e-02  1.370e-01  -0.202  0.84001
## CountryGuinea                           -4.030e-01  1.379e-01  -2.923  0.00347
## CountryGuinea-Bissau                    -1.981e-01  1.374e-01  -1.441  0.14945
## CountryHaiti                             2.636e-01  1.360e-01   1.939  0.05251
## CountryHonduras                          1.541e-01  1.390e-01   1.109  0.26747
## CountryHungary                           2.873e-01  1.311e-01   2.193  0.02834
## CountryIceland                           2.447e-01  1.353e-01   1.808  0.07055
## CountryIndia                             1.842e-01  1.366e-01   1.348  0.17755
## CountryIndonesia                        -6.279e-02  1.355e-01  -0.463  0.64302
## CountryIran                              7.695e-02  1.375e-01   0.560  0.57575
## CountryIraq                             -2.276e-02  1.343e-01  -0.169  0.86544
## CountryIreland                          -2.812e-02  1.320e-01  -0.213  0.83127
## CountryIsrael                           -4.445e-02  1.328e-01  -0.335  0.73790
## CountryItaly                             2.499e-01  1.339e-01   1.866  0.06203
## CountryJamaica                          -9.266e-02  1.391e-01  -0.666  0.50531
## CountryJapan                             2.673e-01  1.354e-01   1.974  0.04836
## CountryJordan                           -5.940e-03  1.353e-01  -0.044  0.96497
## CountryKazakhstan                        1.453e-01  1.369e-01   1.061  0.28856
## CountryKenya                            -2.452e-01  1.385e-01  -1.770  0.07673
## CountryKiribati                         -4.551e-02  1.401e-01  -0.325  0.74524
## CountryKosovo                            2.664e-01  1.315e-01   2.026  0.04278
## CountryKuwait                            4.369e-02  1.363e-01   0.321  0.74850
## CountryKyrgyzstan                        1.418e-01  1.396e-01   1.016  0.30954
## CountryLaos                              1.299e-01  1.364e-01   0.953  0.34074
## CountryLatvia                            1.608e-01  1.311e-01   1.227  0.21990
## CountryLebanon                          -1.273e-01  1.349e-01  -0.944  0.34533
## CountryLesotho                          -1.936e-01  1.394e-01  -1.388  0.16503
## CountryLiberia                          -2.205e-01  1.366e-01  -1.614  0.10648
## CountryLibya                             3.560e-02  1.352e-01   0.263  0.79235
## CountryLiechtenstein                     4.125e-02  1.345e-01   0.307  0.75898
## CountryLithuania                         2.546e-01  1.316e-01   1.935  0.05296
## CountryLuxembourg                        1.235e-01  1.334e-01   0.925  0.35483
## CountryMacedonia                         2.085e-01  1.312e-01   1.589  0.11203
## CountryMadagascar                       -1.702e-01  1.389e-01  -1.225  0.22050
## CountryMalawi                           -3.251e-01  1.381e-01  -2.355  0.01852
## CountryMalaysia                         -1.376e-03  1.396e-01  -0.010  0.99214
## CountryMaldives                         -7.130e-02  1.379e-01  -0.517  0.60521
## CountryMali                             -2.556e-01  1.377e-01  -1.856  0.06349
## CountryMalta                             3.089e-03  1.301e-01   0.024  0.98106
## CountryMarshall Islands                 -1.648e-01  1.366e-01  -1.206  0.22763
## CountryMauritania                       -1.417e-01  1.379e-01  -1.027  0.30436
## CountryMauritius                        -6.839e-02  1.368e-01  -0.500  0.61704
## CountryMexico                           -5.416e-02  1.409e-01  -0.384  0.70073
## CountryMoldova                           1.612e-01  1.350e-01   1.194  0.23239
## CountryMonaco                            6.471e-02  1.324e-01   0.489  0.62514
## CountryMongolia                         -8.362e-02  1.374e-01  -0.609  0.54280
## CountryMontenegro                        1.192e-01  1.326e-01   0.899  0.36849
## CountryMorocco                          -8.789e-02  1.351e-01  -0.650  0.51539
## CountryMozambique                       -3.324e-01  1.346e-01  -2.470  0.01351
## CountryMyanmar                           1.199e-01  1.362e-01   0.880  0.37869
## CountryNamibia                          -2.253e-01  1.355e-01  -1.662  0.09649
## CountryNauru                             1.052e-01  1.380e-01   0.762  0.44596
## CountryNepal                             1.482e-01  1.374e-01   1.079  0.28075
## CountryNetherlands                       2.359e-01  1.340e-01   1.761  0.07829
## CountryNew Zealand                      -6.424e-02  1.336e-01  -0.481  0.63050
## CountryNicaragua                        -3.552e-02  1.396e-01  -0.254  0.79921
## CountryNiger                            -1.621e-01  1.376e-01  -1.178  0.23872
## CountryNigeria                          -1.864e-01  1.415e-01  -1.317  0.18782
## CountryNorth Korea                       1.788e-01  1.377e-01   1.298  0.19423
## CountryNorway                            2.029e-01  1.329e-01   1.527  0.12676
## CountryOman                             -8.773e-02  1.358e-01  -0.646  0.51811
## CountryPakistan                         -6.478e-02  1.349e-01  -0.480  0.63104
## CountryPalau                            -6.457e-02  1.390e-01  -0.465  0.64227
## CountryPanama                           -1.329e-01  1.359e-01  -0.977  0.32834
## CountryPapua New Guinea                  8.648e-02  1.371e-01   0.631  0.52808
## CountryPhilippines                       1.215e-01  1.394e-01   0.872  0.38324
## CountryPoland                            1.119e-02  1.349e-01   0.083  0.93386
## CountryPortugal                          2.726e-01  1.312e-01   2.077  0.03782
## CountryQatar                            -3.264e-02  1.351e-01  -0.242  0.80906
## CountryRepublic of the Congo            -3.384e-01  1.392e-01  -2.430  0.01508
## CountryRomania                           2.005e-01  1.317e-01   1.523  0.12780
## CountryRussia                            1.895e-01  1.346e-01   1.407  0.15932
## CountryRwanda                           -1.994e-01  1.358e-01  -1.468  0.14209
## CountrySaint Kitts and Nevis             1.674e-01  1.382e-01   1.211  0.22598
## CountrySaint Lucia                      -1.006e-01  1.409e-01  -0.714  0.47526
## CountrySaint Vincent and the Grenadines  1.444e-01  1.364e-01   1.059  0.28982
## CountrySamoa                             1.857e-01  1.360e-01   1.365  0.17220
## CountrySan Marino                        5.395e-02  1.331e-01   0.405  0.68522
## CountrySao Tome and Principe            -2.779e-01  1.354e-01  -2.052  0.04015
## CountrySaudi Arabia                      4.784e-02  1.360e-01   0.352  0.72507
## CountrySenegal                          -1.872e-01  1.377e-01  -1.359  0.17408
## CountrySerbia                            1.512e-01  1.336e-01   1.132  0.25765
## CountrySeychelles                       -1.862e-01  1.360e-01  -1.369  0.17110
## CountrySierra Leone                     -3.512e-01  1.378e-01  -2.548  0.01083
## CountrySingapore                        -1.011e-02  1.358e-01  -0.074  0.94067
## CountrySlovakia                          2.312e-01  1.321e-01   1.751  0.08000
## CountrySlovenia                          1.903e-01  1.352e-01   1.408  0.15918
## CountrySolomon Islands                  -1.603e-01  1.398e-01  -1.147  0.25139
## CountrySomalia                           1.613e-02  1.358e-01   0.119  0.90540
## CountrySouth Africa                     -1.135e-01  1.367e-01  -0.830  0.40627
## CountrySouth Korea                       8.127e-02  1.339e-01   0.607  0.54393
## CountrySouth Sudan                      -3.535e-02  1.363e-01  -0.259  0.79533
## CountrySpain                             2.716e-01  1.332e-01   2.038  0.04153
## CountrySri Lanka                         2.679e-02  1.389e-01   0.193  0.84705
## CountrySudan                            -1.625e-01  1.323e-01  -1.228  0.21932
## CountrySwaziland                        -1.276e-01  1.382e-01  -0.923  0.35591
## CountrySweden                            1.876e-01  1.336e-01   1.404  0.16035
## CountrySwitzerland                       1.749e-02  1.354e-01   0.129  0.89723
## CountrySyria                            -1.248e-01  1.340e-01  -0.931  0.35179
## CountryTaiwan                           -3.752e-03  1.373e-01  -0.027  0.97821
## CountryTajikistan                        6.164e-02  1.365e-01   0.451  0.65167
## CountryTanzania                         -3.200e-01  1.392e-01  -2.299  0.02152
## CountryThailand                          5.806e-02  1.377e-01   0.422  0.67335
## CountryThe Bahamas                       7.275e-02  1.387e-01   0.525  0.59984
## CountryThe Gambia                       -1.883e-01  1.380e-01  -1.364  0.17261
## CountryTogo                             -3.236e-01  1.395e-01  -2.321  0.02031
## CountryTonga                             1.646e-02  1.381e-01   0.119  0.90509
## CountryTrinidad and Tobago                      NA         NA      NA       NA
## CountryTunisia                          -4.468e-02  1.327e-01  -0.337  0.73632
## CountryTurkey                           -5.231e-02  1.359e-01  -0.385  0.70031
## CountryTurkmenistan                     -1.093e-01  1.372e-01  -0.797  0.42543
## CountryTuvalu                           -7.778e-02  1.384e-01  -0.562  0.57412
## CountryUganda                           -1.893e-01  1.374e-01  -1.377  0.16838
## CountryUkraine                           6.311e-02  1.345e-01   0.469  0.63890
## CountryUnited Arab Emirates              6.882e-02  1.339e-01   0.514  0.60730
## CountryUnited Kingdom                    9.758e-02  1.347e-01   0.724  0.46885
## CountryUnited States of America                 NA         NA      NA       NA
## CountryUzbekistan                        1.253e-01  1.386e-01   0.904  0.36610
## CountryVanuatu                                  NA         NA      NA       NA
## CountryVatican City                             NA         NA      NA       NA
## CountryVietnam                                  NA         NA      NA       NA
## CountryYemen                             1.899e-02  1.329e-01   0.143  0.88643
## CountryZambia                           -2.172e-01  1.373e-01  -1.582  0.11363
## CountryZimbabwe                                 NA         NA      NA       NA
## `Item Type`Beverages                     1.161e-02  3.622e-02   0.321  0.74851
## `Item Type`Cereal                        3.655e-02  3.455e-02   1.058  0.29016
## `Item Type`Clothes                       1.394e-04  3.469e-02   0.004  0.99679
## `Item Type`Cosmetics                     3.652e-02  3.609e-02   1.012  0.31163
## `Item Type`Fruits                        2.181e-02  3.678e-02   0.593  0.55323
## `Item Type`Household                     3.391e-02  3.582e-02   0.947  0.34382
## `Item Type`Meat                         -2.290e-02  3.503e-02  -0.654  0.51319
## `Item Type`Office Supplies               2.297e-02  3.482e-02   0.660  0.50940
## `Item Type`Personal Care                 3.959e-02  3.568e-02   1.110  0.26719
## `Item Type`Snacks                       -5.558e-03  3.498e-02  -0.159  0.87373
## `Item Type`Vegetables                    2.878e-02  3.494e-02   0.824  0.41004
## `Order Priority`H                       -6.158e-03  2.008e-02  -0.307  0.75914
## `Order Priority`L                        1.155e-02  2.006e-02   0.576  0.56490
## `Order Priority`M                        2.662e-04  2.007e-02   0.013  0.98942
## `Total Profit`                          -1.534e-08  2.596e-08  -0.591  0.55465
##                                           
## (Intercept)                               
## RegionAustralia and Oceania               
## RegionCentral America and the Caribbean   
## RegionEurope                              
## RegionMiddle East and North Africa        
## RegionNorth America                       
## RegionSub-Saharan Africa                . 
## CountryAlbania                            
## CountryAlgeria                          . 
## CountryAndorra                          . 
## CountryAngola                           * 
## CountryAntigua and Barbuda                
## CountryArmenia                            
## CountryAustralia                          
## CountryAustria                            
## CountryAzerbaijan                         
## CountryBahrain                            
## CountryBangladesh                         
## CountryBarbados                           
## CountryBelarus                            
## CountryBelgium                          **
## CountryBelize                             
## CountryBenin                              
## CountryBhutan                             
## CountryBosnia and Herzegovina             
## CountryBotswana                           
## CountryBrunei                             
## CountryBulgaria                           
## CountryBurkina Faso                     * 
## CountryBurundi                            
## CountryCambodia                           
## CountryCameroon                         * 
## CountryCanada                             
## CountryCape Verde                         
## CountryCentral African Republic           
## CountryChad                             **
## CountryChina                              
## CountryComoros                            
## CountryCosta Rica                         
## CountryCote d'Ivoire                    . 
## CountryCroatia                            
## CountryCuba                               
## CountryCyprus                             
## CountryCzech Republic                     
## CountryDemocratic Republic of the Congo **
## CountryDenmark                            
## CountryDjibouti                         * 
## CountryDominica                           
## CountryDominican Republic                 
## CountryEast Timor                         
## CountryEgypt                              
## CountryEl Salvador                        
## CountryEquatorial Guinea                . 
## CountryEritrea                          . 
## CountryEstonia                          . 
## CountryEthiopia                           
## CountryFederated States of Micronesia     
## CountryFiji                               
## CountryFinland                          * 
## CountryFrance                             
## CountryGabon                            . 
## CountryGeorgia                          * 
## CountryGermany                            
## CountryGhana                              
## CountryGreece                             
## CountryGreenland                        * 
## CountryGrenada                            
## CountryGuatemala                          
## CountryGuinea                           **
## CountryGuinea-Bissau                      
## CountryHaiti                            . 
## CountryHonduras                           
## CountryHungary                          * 
## CountryIceland                          . 
## CountryIndia                              
## CountryIndonesia                          
## CountryIran                               
## CountryIraq                               
## CountryIreland                            
## CountryIsrael                             
## CountryItaly                            . 
## CountryJamaica                            
## CountryJapan                            * 
## CountryJordan                             
## CountryKazakhstan                         
## CountryKenya                            . 
## CountryKiribati                           
## CountryKosovo                           * 
## CountryKuwait                             
## CountryKyrgyzstan                         
## CountryLaos                               
## CountryLatvia                             
## CountryLebanon                            
## CountryLesotho                            
## CountryLiberia                            
## CountryLibya                              
## CountryLiechtenstein                      
## CountryLithuania                        . 
## CountryLuxembourg                         
## CountryMacedonia                          
## CountryMadagascar                         
## CountryMalawi                           * 
## CountryMalaysia                           
## CountryMaldives                           
## CountryMali                             . 
## CountryMalta                              
## CountryMarshall Islands                   
## CountryMauritania                         
## CountryMauritius                          
## CountryMexico                             
## CountryMoldova                            
## CountryMonaco                             
## CountryMongolia                           
## CountryMontenegro                         
## CountryMorocco                            
## CountryMozambique                       * 
## CountryMyanmar                            
## CountryNamibia                          . 
## CountryNauru                              
## CountryNepal                              
## CountryNetherlands                      . 
## CountryNew Zealand                        
## CountryNicaragua                          
## CountryNiger                              
## CountryNigeria                            
## CountryNorth Korea                        
## CountryNorway                             
## CountryOman                               
## CountryPakistan                           
## CountryPalau                              
## CountryPanama                             
## CountryPapua New Guinea                   
## CountryPhilippines                        
## CountryPoland                             
## CountryPortugal                         * 
## CountryQatar                              
## CountryRepublic of the Congo            * 
## CountryRomania                            
## CountryRussia                             
## CountryRwanda                             
## CountrySaint Kitts and Nevis              
## CountrySaint Lucia                        
## CountrySaint Vincent and the Grenadines   
## CountrySamoa                              
## CountrySan Marino                         
## CountrySao Tome and Principe            * 
## CountrySaudi Arabia                       
## CountrySenegal                            
## CountrySerbia                             
## CountrySeychelles                         
## CountrySierra Leone                     * 
## CountrySingapore                          
## CountrySlovakia                         . 
## CountrySlovenia                           
## CountrySolomon Islands                    
## CountrySomalia                            
## CountrySouth Africa                       
## CountrySouth Korea                        
## CountrySouth Sudan                        
## CountrySpain                            * 
## CountrySri Lanka                          
## CountrySudan                              
## CountrySwaziland                          
## CountrySweden                             
## CountrySwitzerland                        
## CountrySyria                              
## CountryTaiwan                             
## CountryTajikistan                         
## CountryTanzania                         * 
## CountryThailand                           
## CountryThe Bahamas                        
## CountryThe Gambia                         
## CountryTogo                             * 
## CountryTonga                              
## CountryTrinidad and Tobago                
## CountryTunisia                            
## CountryTurkey                             
## CountryTurkmenistan                       
## CountryTuvalu                             
## CountryUganda                             
## CountryUkraine                            
## CountryUnited Arab Emirates               
## CountryUnited Kingdom                     
## CountryUnited States of America           
## CountryUzbekistan                         
## CountryVanuatu                            
## CountryVatican City                       
## CountryVietnam                            
## CountryYemen                              
## CountryZambia                             
## CountryZimbabwe                           
## `Item Type`Beverages                      
## `Item Type`Cereal                         
## `Item Type`Clothes                        
## `Item Type`Cosmetics                      
## `Item Type`Fruits                         
## `Item Type`Household                      
## `Item Type`Meat                           
## `Item Type`Office Supplies                
## `Item Type`Personal Care                  
## `Item Type`Snacks                         
## `Item Type`Vegetables                     
## `Order Priority`H                         
## `Order Priority`L                         
## `Order Priority`M                         
## `Total Profit`                            
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 110905  on 80000  degrees of freedom
## Residual deviance: 110698  on 79801  degrees of freedom
## AIC: 111098
## 
## Number of Fisher Scoring iterations: 3
# Score trainBigdata with smallLRModel; type = "response" asks predict() for
# values on the response scale rather than the link scale.
predictions1 <- predict(
  object = smallLRModel,
  newdata = trainBigdata,
  type = "response"
)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Peek at the first six predicted values.
head(predictions1, n = 6L)
##         1         2         3         4         5         6 
## 0.5045163 0.4524548 0.4871299 0.4907894 0.4797097 0.4984959
# Confusion matrix
# Label a row 1 when its predicted value exceeds the 0.5 cut-off, else 0.
pred1 <- ifelse(predictions1 > 0.5, 1, 0)
# Pin the predicted levels to 0/1 so the table always has both rows, even if
# the model never predicts one of the classes; otherwise the [[2, 2]] lookup
# in the accuracy calculation below would fail with a subscript error.
confMetrixLR <- table(
  Predicted = factor(pred1, levels = c(0, 1)),
  Actual = trainBigdata$`Sales Channel`
)
confMetrixLR
##          Actual
## Predicted Offline Online
##         0   19303  17666
##         1   20654  22378
# Accuracy: diagonal cells (predicted class agrees with actual) over all rows.
# NOTE(review): this pairs row 0 with Offline and row 1 with Online, i.e. it
# assumes the predicted value is the probability of Online -- confirm against
# how `Sales Channel` was encoded when the model was fitted.
(confMetrixLR[[1, 1]] + confMetrixLR[[2, 2]]) / sum(confMetrixLR)
## [1] 0.521006

Now let’s apply the model to the test data.

# Score testBigdata with the same model, again on the response scale.
p2 <- predict(
  object = smallLRModel,
  newdata = testBigdata,
  type = "response"
)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Label a test row 1 when its predicted value exceeds the 0.5 cut-off, else 0.
pred2 <- ifelse(p2 > 0.5, 1, 0)
# Pin the predicted levels to 0/1 so the table always has both rows, even if
# the model never predicts one of the classes; otherwise the [[2, 2]] lookup
# in the accuracy calculation below would fail with a subscript error.
confMetrixLR2 <- table(
  Predicted = factor(pred2, levels = c(0, 1)),
  Actual = testBigdata$`Sales Channel`
)
confMetrixLR2
##          Actual
## Predicted Offline Online
##         0    4654   4626
##         1    5335   5384
# Accuracy: diagonal cells (predicted class agrees with actual) over all rows.
# NOTE(review): this pairs row 0 with Offline and row 1 with Online, i.e. it
# assumes the predicted value is the probability of Online -- confirm against
# how `Sales Channel` was encoded when the model was fitted.
(confMetrixLR2[[1, 1]] + confMetrixLR2[[2, 2]]) / sum(confMetrixLR2)
## [1] 0.5019251

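We can see that the logistic regression model achieves an accuracy of 50.19% on the test data, which is about what random guessing between the two sales channels would achieve.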

Conclusion

I applied Decision Tree and Logistic Regression models to the same data and tried to predict the Sales Channel. After running the different models on data sets of different sizes, I didn’t observe any improvement in the efficacy of the models. The data we have probably does not contain enough information to predict the Sales Channel.

We may be able to predict other information, such as Total Profit, based on other predictor variables.