Visit the following website and explore the range of sizes of this dataset (from 100 to 5 million records): https://excelbianalytics.com/wp/downloads-18-sample-csv-files-data-sets-for-testing-sales/ Select 2 files to download Based on your computer’s capabilities (memory, CPU), select 2 files you can handle (recommended one small, one large) Download the files Review the structure and content of the tables, and think about the data sets (structure, size, dependencies, labels, etc) Consider the similarities and differences in the two data sets you have downloaded Think about how to analyze and predict an outcome based on the datasets available Based on the data you have, think which two machine learning algorithms presented so far could be used to analyze the data Deliverable
As part of this exercise, I will be analyzing two sales datasets: one with 1,000 records and one with 100,000 records.
library(stats)
library(corrplot)
library(dplyr)
library(tidyverse)
library(tidymodels)
library(caret)
library(rpart.plot)
# Small dataset: 1,000 sales records, read from the working directory.
smallSalesData <- read_csv("1000_Sales_Records.csv.gz")
# Large dataset: 100,000 sales records, read from the working directory.
bigSalesData <- read_csv("100000_Sales_Records.csv.gz")
The following is a glimpse and summary of the data. There are no missing values. To get a quick view of our data, we use the glimpse() command to show us our variable names, data types, and some sample data.
# Preview the first rows of the small dataset
head(smallSalesData)
## # A tibble: 6 x 14
## Region Country `Item Type` `Sales Channel` `Order Priority` `Order Date`
## <chr> <chr> <chr> <chr> <chr> <chr>
## 1 Middle East~ Libya Cosmetics Offline M 10/18/2014
## 2 North Ameri~ Canada Vegetables Online M 11/7/2011
## 3 Middle East~ Libya Baby Food Offline C 10/31/2016
## 4 Asia Japan Cereal Offline C 4/10/2010
## 5 Sub-Saharan~ Chad Fruits Offline H 8/16/2011
## 6 Europe Armenia Cereal Online H 11/24/2014
## # ... with 8 more variables: `Order ID` <dbl>, `Ship Date` <chr>,
## # `Units Sold` <dbl>, `Unit Price` <dbl>, `Unit Cost` <dbl>,
## # `Total Revenue` <dbl>, `Total Cost` <dbl>, `Total Profit` <dbl>
# Column names, types and sample values of the small dataset
glimpse(smallSalesData)
## Rows: 1,000
## Columns: 14
## $ Region <chr> "Middle East and North Africa", "North America", "Mid~
## $ Country <chr> "Libya", "Canada", "Libya", "Japan", "Chad", "Armenia~
## $ `Item Type` <chr> "Cosmetics", "Vegetables", "Baby Food", "Cereal", "Fr~
## $ `Sales Channel` <chr> "Offline", "Online", "Offline", "Offline", "Offline",~
## $ `Order Priority` <chr> "M", "M", "C", "C", "H", "H", "H", "M", "H", "H", "M"~
## $ `Order Date` <chr> "10/18/2014", "11/7/2011", "10/31/2016", "4/10/2010",~
## $ `Order ID` <dbl> 686800706, 185941302, 246222341, 161442649, 645713555~
## $ `Ship Date` <chr> "10/31/2014", "12/8/2011", "12/9/2016", "5/12/2010", ~
## $ `Units Sold` <dbl> 8446, 3018, 1517, 3322, 9845, 9528, 2844, 7299, 2428,~
## $ `Unit Price` <dbl> 437.20, 154.06, 255.28, 205.70, 9.33, 205.70, 205.70,~
## $ `Unit Cost` <dbl> 263.33, 90.93, 159.42, 117.11, 6.92, 117.11, 117.11, ~
## $ `Total Revenue` <dbl> 3692591.20, 464953.08, 387259.76, 683335.40, 91853.85~
## $ `Total Cost` <dbl> 2224085.18, 274426.74, 241840.14, 389039.42, 68127.40~
## $ `Total Profit` <dbl> 1468506.02, 190526.34, 145419.62, 294295.98, 23726.45~
# Per-column summary statistics of the small dataset
summary(smallSalesData)
## Region Country Item Type Sales Channel
## Length:1000 Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Order Priority Order Date Order ID Ship Date
## Length:1000 Length:1000 Min. :102928006 Length:1000
## Class :character Class :character 1st Qu.:328074026 Class :character
## Mode :character Mode :character Median :556609714 Mode :character
## Mean :549681325
## 3rd Qu.:769694483
## Max. :995529830
## Units Sold Unit Price Unit Cost Total Revenue
## Min. : 13 Min. : 9.33 Min. : 6.92 Min. : 2043
## 1st Qu.:2420 1st Qu.: 81.73 1st Qu.: 56.67 1st Qu.: 281192
## Median :5184 Median :154.06 Median : 97.44 Median : 754939
## Mean :5054 Mean :262.11 Mean :184.97 Mean :1327322
## 3rd Qu.:7537 3rd Qu.:421.89 3rd Qu.:263.33 3rd Qu.:1733503
## Max. :9998 Max. :668.27 Max. :524.96 Max. :6617210
## Total Cost Total Profit
## Min. : 1417 Min. : 532.6
## 1st Qu.: 164932 1st Qu.: 98376.1
## Median : 464726 Median : 277226.0
## Mean : 936119 Mean : 391202.6
## 3rd Qu.:1141750 3rd Qu.: 548456.8
## Max. :5204978 Max. :1726181.4
# Column names, types and sample values of the large dataset
glimpse(bigSalesData)
## Rows: 100,000
## Columns: 14
## $ Region <chr> "Middle East and North Africa", "Central America and ~
## $ Country <chr> "Azerbaijan", "Panama", "Sao Tome and Principe", "Sao~
## $ `Item Type` <chr> "Snacks", "Cosmetics", "Fruits", "Personal Care", "Ho~
## $ `Sales Channel` <chr> "Online", "Offline", "Offline", "Online", "Offline", ~
## $ `Order Priority` <chr> "C", "L", "M", "M", "H", "C", "M", "C", "H", "H", "C"~
## $ `Order Date` <chr> "10/8/2014", "2/22/2015", "12/9/2015", "9/17/2014", "~
## $ `Order ID` <dbl> 535113847, 874708545, 854349935, 892836844, 129280602~
## $ `Ship Date` <chr> "10/23/2014", "2/27/2015", "1/18/2016", "10/12/2014",~
## $ `Units Sold` <dbl> 934, 4551, 9986, 9118, 5858, 1149, 7964, 6307, 8217, ~
## $ `Unit Price` <dbl> 152.58, 437.20, 9.33, 81.73, 668.27, 109.28, 437.20, ~
## $ `Unit Cost` <dbl> 97.44, 263.33, 6.92, 56.67, 502.54, 35.84, 263.33, 6.~
## $ `Total Revenue` <dbl> 142509.72, 1989697.20, 93169.38, 745214.14, 3914725.6~
## $ `Total Cost` <dbl> 91008.96, 1198414.83, 69103.12, 516717.06, 2943879.32~
## $ `Total Profit` <dbl> 51500.76, 791282.37, 24066.26, 228497.08, 970846.34, ~
# Per-column summary statistics of the large dataset
summary(bigSalesData)
## Region Country Item Type Sales Channel
## Length:100000 Length:100000 Length:100000 Length:100000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Order Priority Order Date Order ID Ship Date
## Length:100000 Length:100000 Min. :100008904 Length:100000
## Class :character Class :character 1st Qu.:326046383 Class :character
## Mode :character Mode :character Median :547718512 Mode :character
## Mean :550395554
## 3rd Qu.:775078534
## Max. :999996459
## Units Sold Unit Price Unit Cost Total Revenue
## Min. : 1 Min. : 9.33 Min. : 6.92 Min. : 19
## 1st Qu.: 2505 1st Qu.:109.28 1st Qu.: 56.67 1st Qu.: 279753
## Median : 5007 Median :205.70 Median :117.11 Median : 789892
## Mean : 5001 Mean :266.70 Mean :188.02 Mean :1336067
## 3rd Qu.: 7495 3rd Qu.:437.20 3rd Qu.:364.69 3rd Qu.:1836490
## Max. :10000 Max. :668.27 Max. :524.96 Max. :6682700
## Total Cost Total Profit
## Min. : 14 Min. : 4.8
## 1st Qu.: 162928 1st Qu.: 95900.0
## Median : 467937 Median : 283657.5
## Mean : 941975 Mean : 394091.2
## 3rd Qu.:1209475 3rd Qu.: 568384.1
## Max. :5249075 Max. :1738700.0
The data set contains the following 14 features
Region, Country, Item Type, Sales Channel, Order Priority, Order Date, Order ID, Ship Date, Units Sold, Unit Price, Unit Cost, Total Revenue, Total Cost, Total Profit.
The dataset contains the sales data of different items across different regions and countries.
# Pairwise correlations between the numeric columns of the small dataset,
# drawn as a correlogram.
smallSalesData %>%
  keep(is.numeric) %>%
  cor() %>%
  corrplot()
From the correlation plot it is clear that Units Sold, Unit Price and Unit Cost directly influence Total Revenue, Total Cost and Total Profit. Since there is collinearity between these predictor variables, we can ignore Units Sold, Unit Price and Unit Cost and use Total Profit.
Since there are no missing values in the data, it doesn’t require any clean-up. Since several variables depend on one another, we can remove those predictors to keep the dataset simple.
# Keep only the categorical descriptors, the order identifier and
# Total Profit. The same column subset is applied to both datasets.
smallSalesData <- select(
  smallSalesData,
  Region, Country, `Item Type`, `Sales Channel`, `Order Priority`,
  `Order ID`, `Total Profit`
)
bigSalesData <- select(
  bigSalesData,
  Region, Country, `Item Type`, `Sales Channel`, `Order Priority`,
  `Order ID`, `Total Profit`
)
# Recode the identifier and the response variable in both datasets.
#
# Order ID is a label, not a quantity, so it is stored as text. The conversion
# has to be element-wise: toString() would collapse the whole column into one
# comma-separated string that is then recycled into every row.
# sprintf("%.0f") is used rather than as.character() so that round values
# such as 300000000 are not rendered in scientific notation ("3e+08").
# The summaries above show no missing values in this column.
smallSalesData[["Order ID"]] <- sprintf("%.0f", smallSalesData[["Order ID"]])
bigSalesData[["Order ID"]] <- sprintf("%.0f", bigSalesData[["Order ID"]])
# Sales Channel is the classification target; store it as a factor for the
# models fitted below.
smallSalesData[["Sales Channel"]] <- as.factor(smallSalesData[["Sales Channel"]])
bigSalesData[["Sales Channel"]] <- as.factor(bigSalesData[["Sales Channel"]])
# Bar chart of the response classes in the small dataset.
plot(smallSalesData$`Sales Channel`)
Looking at the data, I feel it is better to build a machine learning model that predicts the sales channel from the item type, country, region and total profit. This would help anyone who is already running a business or planning to start a new business in these regions.
Since this is a classification problem to predict the sales channel, I decided to use Decision tree algorithm.
For this assignment, I decided to attempt a classification problem utilizing decision tree.The only two possibilities are Offline and Online.
Decision trees use a tree-like structure to represent the relationship between predictors and potential outcomes. A decision tree begins with a single partition known as the root node, which is then followed by progressively smaller partitions as the tree splits and grows. At each point where the tree splits, a decision is made in terms of how to further partition the data based on the values of a particular predictor.
The end or terminal nodes of the tree are known as the leaf nodes. These nodes represent the predicted outcome based on the set of decisions made from the root node, through the decision nodes to the leaf node.
The plan is to build a training and testing set using the data. Build a model using the training data and evaluate its performance using the testing data. The following code sets the seed and partitions the data into training and testing sets. I will do this on both the small and large data sets.
We split our dataset by partitioning 80 percent of the original data as training data and the remaining 20 percent as test data.
# 80/20 train/test split of the small dataset, partitioned on Sales Channel.
# The seed is fixed so that the partition is reproducible.
set.seed(1234)
training.samples <- createDataPartition(
  smallSalesData$`Sales Channel`,
  p = 0.8,
  list = FALSE
)
train.data <- smallSalesData[training.samples, ]
test.data <- smallSalesData[-training.samples, ]
# Class proportions in the training set.
train.data %>%
  select(`Sales Channel`) %>%
  table() %>%
  prop.table() %>%
  round(2)
##
## Offline Online
## 0.52 0.48
# Class proportions in the test set.
test.data %>%
  select(`Sales Channel`) %>%
  table() %>%
  prop.table() %>%
  round(2)
##
## Offline Online
## 0.52 0.48
# Classification tree for Sales Channel, fitted on the small training set.
# Note that Country is not among the predictors here, unlike the tree fitted
# on the large dataset further down.
small_tree_model <- rpart(
  formula = `Sales Channel` ~
    Region + `Item Type` + `Order Priority` + `Total Profit`,
  data = train.data,
  method = "class"
)
# Print the fitted splits.
small_tree_model
## n= 800
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 800 384 Offline (0.5200000 0.4800000)
## 2) Item Type=Cereal,Clothes,Cosmetics,Fruits,Meat,Office Supplies,Snacks 441 178 Offline (0.5963719 0.4036281)
## 4) Total Profit>=13887.8 403 155 Offline (0.6153846 0.3846154)
## 8) Region=Asia,Australia and Oceania,Central America and the Caribbean,Sub-Saharan Africa 239 81 Offline (0.6610879 0.3389121) *
## 9) Region=Europe,Middle East and North Africa,North America 164 74 Offline (0.5487805 0.4512195)
## 18) Item Type=Cereal,Clothes,Fruits,Snacks 94 33 Offline (0.6489362 0.3510638) *
## 19) Item Type=Cosmetics,Meat,Office Supplies 70 29 Online (0.4142857 0.5857143) *
## 5) Total Profit< 13887.8 38 15 Online (0.3947368 0.6052632) *
## 3) Item Type=Baby Food,Beverages,Household,Personal Care,Vegetables 359 153 Online (0.4261838 0.5738162)
## 6) Region=Australia and Oceania,Central America and the Caribbean,Europe,North America 170 83 Online (0.4882353 0.5117647)
## 12) Order Priority=C,H,L 129 59 Offline (0.5426357 0.4573643)
## 24) Item Type=Baby Food,Beverages,Household,Personal Care 101 40 Offline (0.6039604 0.3960396) *
## 25) Item Type=Vegetables 28 9 Online (0.3214286 0.6785714) *
## 13) Order Priority=M 41 13 Online (0.3170732 0.6829268) *
## 7) Region=Asia,Middle East and North Africa,Sub-Saharan Africa 189 70 Online (0.3703704 0.6296296) *
The performance of the model can be assessed using the confusion matrix and by measuring the accuracy of its predictions.
Confusion Matrix
# Draw the fitted tree.
rpart.plot(small_tree_model)
# Predict the class of every held-out row.
permits_pred <- predict(
  small_tree_model,
  newdata = test.data,
  type = "class"
)
# Confusion matrix: actual classes in rows, predicted classes in columns.
permits_pred_table <- table(test.data$`Sales Channel`, permits_pred)
permits_pred_table
## permits_pred
## Offline Online
## Offline 56 48
## Online 47 49
# Accuracy: correct predictions (the diagonal) divided by the number of test rows
sum(diag(permits_pred_table)) / nrow(test.data)
## [1] 0.525
The decision tree model for the small dataset has an accuracy of 52.5%.
Apply the Decision tree model for big data
# 80/20 train/test split of the large dataset, partitioned on Sales Channel.
# No new seed is set here; the split continues the random stream seeded
# for the small-dataset split.
trainingBigsamples <- createDataPartition(
  bigSalesData$`Sales Channel`,
  p = 0.8,
  list = FALSE
)
trainBigdata <- bigSalesData[trainingBigsamples, ]
testBigdata <- bigSalesData[-trainingBigsamples, ]
# Class proportions in the training set.
trainBigdata %>%
  select(`Sales Channel`) %>%
  table() %>%
  prop.table() %>%
  round(2)
##
## Offline Online
## 0.5 0.5
# Class proportions in the test set.
testBigdata %>%
  select(`Sales Channel`) %>%
  table() %>%
  prop.table() %>%
  round(2)
##
## Offline Online
## 0.5 0.5
# Classification tree for Sales Channel, fitted on the large training set.
# Country is included as a predictor here.
big_tree_model <- rpart(
  formula = `Sales Channel` ~
    Region + Country + `Item Type` + `Order Priority` + `Total Profit`,
  data = trainBigdata,
  method = "class"
)
# Predict the class of every held-out row.
big_sale_pred <- predict(
  big_tree_model,
  newdata = testBigdata,
  type = "class"
)
# Confusion matrix: actual classes in rows, predicted classes in columns.
big_sales_pred_table <- table(testBigdata$`Sales Channel`, big_sale_pred)
big_sales_pred_table
## big_sale_pred
## Offline Online
## Offline 4170 5819
## Online 4161 5849
# Accuracy: correct predictions (the diagonal) divided by the number of test rows
sum(diag(big_sales_pred_table)) / nrow(testBigdata)
## [1] 0.500975
The decision tree model for the large dataset has an accuracy of 50.1%.
The Logistic Regression is a regression model in which the response variable (dependent variable) has binary value. It actually measures the probability of a binary response as the value of response variable based on the mathematical equation relating it with the predictor variables.
Model creation
# Logistic regression for Sales Channel.
# NOTE: despite the "small" prefix in its name, this model is fitted on the
# large training set (trainBigdata).
smallLRModel <- glm(
  formula = `Sales Channel` ~
    Region + Country + `Item Type` + `Order Priority` + `Total Profit`,
  family = "binomial",
  data = trainBigdata
)
# Coefficient table and deviance statistics.
summary(smallLRModel)
##
## Call:
## glm(formula = `Sales Channel` ~ Region + Country + `Item Type` +
## `Order Priority` + `Total Profit`, family = "binomial", data = trainBigdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.305 -1.179 1.062 1.171 1.318
##
## Coefficients: (6 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.493e-02 1.018e-01 -0.638 0.52341
## RegionAustralia and Oceania 6.728e-02 1.377e-01 0.489 0.62517
## RegionCentral America and the Caribbean -2.889e-02 1.384e-01 -0.209 0.83470
## RegionEurope -9.405e-02 1.339e-01 -0.702 0.48260
## RegionMiddle East and North Africa 8.965e-02 1.364e-01 0.657 0.51103
## RegionNorth America -5.388e-02 1.399e-01 -0.385 0.70007
## RegionSub-Saharan Africa 2.697e-01 1.386e-01 1.945 0.05176
## CountryAlbania 1.564e-01 1.342e-01 1.166 0.24367
## CountryAlgeria -2.538e-01 1.377e-01 -1.843 0.06538
## CountryAndorra 2.230e-01 1.344e-01 1.659 0.09702
## CountryAngola -3.155e-01 1.417e-01 -2.227 0.02595
## CountryAntigua and Barbuda -3.028e-02 1.392e-01 -0.218 0.82781
## CountryArmenia 1.647e-01 1.340e-01 1.229 0.21900
## CountryAustralia 2.102e-02 1.367e-01 0.154 0.87781
## CountryAustria 1.224e-01 1.350e-01 0.907 0.36455
## CountryAzerbaijan -3.035e-04 1.350e-01 -0.002 0.99821
## CountryBahrain -8.872e-02 1.351e-01 -0.657 0.51145
## CountryBangladesh 1.802e-01 1.364e-01 1.321 0.18638
## CountryBarbados 1.688e-01 1.407e-01 1.200 0.23033
## CountryBelarus 1.829e-01 1.319e-01 1.386 0.16570
## CountryBelgium 4.042e-01 1.326e-01 3.049 0.00229
## CountryBelize 1.244e-01 1.388e-01 0.897 0.36997
## CountryBenin -1.983e-01 1.376e-01 -1.441 0.14950
## CountryBhutan 2.628e-02 1.364e-01 0.193 0.84724
## CountryBosnia and Herzegovina 1.884e-01 1.323e-01 1.424 0.15452
## CountryBotswana -1.226e-01 1.360e-01 -0.901 0.36736
## CountryBrunei 8.413e-02 1.359e-01 0.619 0.53583
## CountryBulgaria 1.424e-01 1.302e-01 1.093 0.27436
## CountryBurkina Faso -2.886e-01 1.382e-01 -2.089 0.03669
## CountryBurundi -1.701e-01 1.379e-01 -1.233 0.21745
## CountryCambodia 1.126e-01 1.344e-01 0.838 0.40208
## CountryCameroon -2.720e-01 1.382e-01 -1.968 0.04908
## CountryCanada 9.713e-02 1.396e-01 0.696 0.48663
## CountryCape Verde -1.325e-01 1.365e-01 -0.971 0.33166
## CountryCentral African Republic -1.169e-01 1.375e-01 -0.850 0.39514
## CountryChad -4.288e-01 1.392e-01 -3.080 0.00207
## CountryChina 4.779e-02 1.363e-01 0.351 0.72587
## CountryComoros -6.194e-02 1.392e-01 -0.445 0.65641
## CountryCosta Rica -2.088e-02 1.362e-01 -0.153 0.87820
## CountryCote d'Ivoire -2.364e-01 1.365e-01 -1.731 0.08337
## CountryCroatia -1.376e-02 1.328e-01 -0.104 0.91751
## CountryCuba 1.156e-01 1.369e-01 0.845 0.39822
## CountryCyprus 1.923e-01 1.329e-01 1.447 0.14793
## CountryCzech Republic 9.706e-02 1.319e-01 0.736 0.46172
## CountryDemocratic Republic of the Congo -3.687e-01 1.402e-01 -2.630 0.00853
## CountryDenmark 7.893e-02 1.336e-01 0.591 0.55453
## CountryDjibouti -2.889e-01 1.373e-01 -2.104 0.03538
## CountryDominica -3.843e-02 1.374e-01 -0.280 0.77967
## CountryDominican Republic -8.651e-02 1.393e-01 -0.621 0.53469
## CountryEast Timor 2.250e-01 1.370e-01 1.642 0.10058
## CountryEgypt -4.598e-03 1.359e-01 -0.034 0.97301
## CountryEl Salvador 3.921e-02 1.370e-01 0.286 0.77477
## CountryEquatorial Guinea -2.400e-01 1.370e-01 -1.752 0.07985
## CountryEritrea -2.425e-01 1.374e-01 -1.766 0.07742
## CountryEstonia 2.418e-01 1.324e-01 1.826 0.06782
## CountryEthiopia -1.678e-01 1.363e-01 -1.231 0.21831
## CountryFederated States of Micronesia -1.032e-02 1.347e-01 -0.077 0.93891
## CountryFiji 1.793e-02 1.377e-01 0.130 0.89643
## CountryFinland 2.966e-01 1.332e-01 2.226 0.02598
## CountryFrance 5.359e-02 1.329e-01 0.403 0.68686
## CountryGabon -2.330e-01 1.386e-01 -1.681 0.09276
## CountryGeorgia 2.780e-01 1.361e-01 2.043 0.04110
## CountryGermany 7.904e-02 1.365e-01 0.579 0.56250
## CountryGhana -1.516e-02 1.413e-01 -0.107 0.91459
## CountryGreece -1.329e-01 1.343e-01 -0.990 0.32226
## CountryGreenland 2.866e-01 1.384e-01 2.072 0.03831
## CountryGrenada 9.841e-03 1.368e-01 0.072 0.94266
## CountryGuatemala -2.766e-02 1.370e-01 -0.202 0.84001
## CountryGuinea -4.030e-01 1.379e-01 -2.923 0.00347
## CountryGuinea-Bissau -1.981e-01 1.374e-01 -1.441 0.14945
## CountryHaiti 2.636e-01 1.360e-01 1.939 0.05251
## CountryHonduras 1.541e-01 1.390e-01 1.109 0.26747
## CountryHungary 2.873e-01 1.311e-01 2.193 0.02834
## CountryIceland 2.447e-01 1.353e-01 1.808 0.07055
## CountryIndia 1.842e-01 1.366e-01 1.348 0.17755
## CountryIndonesia -6.279e-02 1.355e-01 -0.463 0.64302
## CountryIran 7.695e-02 1.375e-01 0.560 0.57575
## CountryIraq -2.276e-02 1.343e-01 -0.169 0.86544
## CountryIreland -2.812e-02 1.320e-01 -0.213 0.83127
## CountryIsrael -4.445e-02 1.328e-01 -0.335 0.73790
## CountryItaly 2.499e-01 1.339e-01 1.866 0.06203
## CountryJamaica -9.266e-02 1.391e-01 -0.666 0.50531
## CountryJapan 2.673e-01 1.354e-01 1.974 0.04836
## CountryJordan -5.940e-03 1.353e-01 -0.044 0.96497
## CountryKazakhstan 1.453e-01 1.369e-01 1.061 0.28856
## CountryKenya -2.452e-01 1.385e-01 -1.770 0.07673
## CountryKiribati -4.551e-02 1.401e-01 -0.325 0.74524
## CountryKosovo 2.664e-01 1.315e-01 2.026 0.04278
## CountryKuwait 4.369e-02 1.363e-01 0.321 0.74850
## CountryKyrgyzstan 1.418e-01 1.396e-01 1.016 0.30954
## CountryLaos 1.299e-01 1.364e-01 0.953 0.34074
## CountryLatvia 1.608e-01 1.311e-01 1.227 0.21990
## CountryLebanon -1.273e-01 1.349e-01 -0.944 0.34533
## CountryLesotho -1.936e-01 1.394e-01 -1.388 0.16503
## CountryLiberia -2.205e-01 1.366e-01 -1.614 0.10648
## CountryLibya 3.560e-02 1.352e-01 0.263 0.79235
## CountryLiechtenstein 4.125e-02 1.345e-01 0.307 0.75898
## CountryLithuania 2.546e-01 1.316e-01 1.935 0.05296
## CountryLuxembourg 1.235e-01 1.334e-01 0.925 0.35483
## CountryMacedonia 2.085e-01 1.312e-01 1.589 0.11203
## CountryMadagascar -1.702e-01 1.389e-01 -1.225 0.22050
## CountryMalawi -3.251e-01 1.381e-01 -2.355 0.01852
## CountryMalaysia -1.376e-03 1.396e-01 -0.010 0.99214
## CountryMaldives -7.130e-02 1.379e-01 -0.517 0.60521
## CountryMali -2.556e-01 1.377e-01 -1.856 0.06349
## CountryMalta 3.089e-03 1.301e-01 0.024 0.98106
## CountryMarshall Islands -1.648e-01 1.366e-01 -1.206 0.22763
## CountryMauritania -1.417e-01 1.379e-01 -1.027 0.30436
## CountryMauritius -6.839e-02 1.368e-01 -0.500 0.61704
## CountryMexico -5.416e-02 1.409e-01 -0.384 0.70073
## CountryMoldova 1.612e-01 1.350e-01 1.194 0.23239
## CountryMonaco 6.471e-02 1.324e-01 0.489 0.62514
## CountryMongolia -8.362e-02 1.374e-01 -0.609 0.54280
## CountryMontenegro 1.192e-01 1.326e-01 0.899 0.36849
## CountryMorocco -8.789e-02 1.351e-01 -0.650 0.51539
## CountryMozambique -3.324e-01 1.346e-01 -2.470 0.01351
## CountryMyanmar 1.199e-01 1.362e-01 0.880 0.37869
## CountryNamibia -2.253e-01 1.355e-01 -1.662 0.09649
## CountryNauru 1.052e-01 1.380e-01 0.762 0.44596
## CountryNepal 1.482e-01 1.374e-01 1.079 0.28075
## CountryNetherlands 2.359e-01 1.340e-01 1.761 0.07829
## CountryNew Zealand -6.424e-02 1.336e-01 -0.481 0.63050
## CountryNicaragua -3.552e-02 1.396e-01 -0.254 0.79921
## CountryNiger -1.621e-01 1.376e-01 -1.178 0.23872
## CountryNigeria -1.864e-01 1.415e-01 -1.317 0.18782
## CountryNorth Korea 1.788e-01 1.377e-01 1.298 0.19423
## CountryNorway 2.029e-01 1.329e-01 1.527 0.12676
## CountryOman -8.773e-02 1.358e-01 -0.646 0.51811
## CountryPakistan -6.478e-02 1.349e-01 -0.480 0.63104
## CountryPalau -6.457e-02 1.390e-01 -0.465 0.64227
## CountryPanama -1.329e-01 1.359e-01 -0.977 0.32834
## CountryPapua New Guinea 8.648e-02 1.371e-01 0.631 0.52808
## CountryPhilippines 1.215e-01 1.394e-01 0.872 0.38324
## CountryPoland 1.119e-02 1.349e-01 0.083 0.93386
## CountryPortugal 2.726e-01 1.312e-01 2.077 0.03782
## CountryQatar -3.264e-02 1.351e-01 -0.242 0.80906
## CountryRepublic of the Congo -3.384e-01 1.392e-01 -2.430 0.01508
## CountryRomania 2.005e-01 1.317e-01 1.523 0.12780
## CountryRussia 1.895e-01 1.346e-01 1.407 0.15932
## CountryRwanda -1.994e-01 1.358e-01 -1.468 0.14209
## CountrySaint Kitts and Nevis 1.674e-01 1.382e-01 1.211 0.22598
## CountrySaint Lucia -1.006e-01 1.409e-01 -0.714 0.47526
## CountrySaint Vincent and the Grenadines 1.444e-01 1.364e-01 1.059 0.28982
## CountrySamoa 1.857e-01 1.360e-01 1.365 0.17220
## CountrySan Marino 5.395e-02 1.331e-01 0.405 0.68522
## CountrySao Tome and Principe -2.779e-01 1.354e-01 -2.052 0.04015
## CountrySaudi Arabia 4.784e-02 1.360e-01 0.352 0.72507
## CountrySenegal -1.872e-01 1.377e-01 -1.359 0.17408
## CountrySerbia 1.512e-01 1.336e-01 1.132 0.25765
## CountrySeychelles -1.862e-01 1.360e-01 -1.369 0.17110
## CountrySierra Leone -3.512e-01 1.378e-01 -2.548 0.01083
## CountrySingapore -1.011e-02 1.358e-01 -0.074 0.94067
## CountrySlovakia 2.312e-01 1.321e-01 1.751 0.08000
## CountrySlovenia 1.903e-01 1.352e-01 1.408 0.15918
## CountrySolomon Islands -1.603e-01 1.398e-01 -1.147 0.25139
## CountrySomalia 1.613e-02 1.358e-01 0.119 0.90540
## CountrySouth Africa -1.135e-01 1.367e-01 -0.830 0.40627
## CountrySouth Korea 8.127e-02 1.339e-01 0.607 0.54393
## CountrySouth Sudan -3.535e-02 1.363e-01 -0.259 0.79533
## CountrySpain 2.716e-01 1.332e-01 2.038 0.04153
## CountrySri Lanka 2.679e-02 1.389e-01 0.193 0.84705
## CountrySudan -1.625e-01 1.323e-01 -1.228 0.21932
## CountrySwaziland -1.276e-01 1.382e-01 -0.923 0.35591
## CountrySweden 1.876e-01 1.336e-01 1.404 0.16035
## CountrySwitzerland 1.749e-02 1.354e-01 0.129 0.89723
## CountrySyria -1.248e-01 1.340e-01 -0.931 0.35179
## CountryTaiwan -3.752e-03 1.373e-01 -0.027 0.97821
## CountryTajikistan 6.164e-02 1.365e-01 0.451 0.65167
## CountryTanzania -3.200e-01 1.392e-01 -2.299 0.02152
## CountryThailand 5.806e-02 1.377e-01 0.422 0.67335
## CountryThe Bahamas 7.275e-02 1.387e-01 0.525 0.59984
## CountryThe Gambia -1.883e-01 1.380e-01 -1.364 0.17261
## CountryTogo -3.236e-01 1.395e-01 -2.321 0.02031
## CountryTonga 1.646e-02 1.381e-01 0.119 0.90509
## CountryTrinidad and Tobago NA NA NA NA
## CountryTunisia -4.468e-02 1.327e-01 -0.337 0.73632
## CountryTurkey -5.231e-02 1.359e-01 -0.385 0.70031
## CountryTurkmenistan -1.093e-01 1.372e-01 -0.797 0.42543
## CountryTuvalu -7.778e-02 1.384e-01 -0.562 0.57412
## CountryUganda -1.893e-01 1.374e-01 -1.377 0.16838
## CountryUkraine 6.311e-02 1.345e-01 0.469 0.63890
## CountryUnited Arab Emirates 6.882e-02 1.339e-01 0.514 0.60730
## CountryUnited Kingdom 9.758e-02 1.347e-01 0.724 0.46885
## CountryUnited States of America NA NA NA NA
## CountryUzbekistan 1.253e-01 1.386e-01 0.904 0.36610
## CountryVanuatu NA NA NA NA
## CountryVatican City NA NA NA NA
## CountryVietnam NA NA NA NA
## CountryYemen 1.899e-02 1.329e-01 0.143 0.88643
## CountryZambia -2.172e-01 1.373e-01 -1.582 0.11363
## CountryZimbabwe NA NA NA NA
## `Item Type`Beverages 1.161e-02 3.622e-02 0.321 0.74851
## `Item Type`Cereal 3.655e-02 3.455e-02 1.058 0.29016
## `Item Type`Clothes 1.394e-04 3.469e-02 0.004 0.99679
## `Item Type`Cosmetics 3.652e-02 3.609e-02 1.012 0.31163
## `Item Type`Fruits 2.181e-02 3.678e-02 0.593 0.55323
## `Item Type`Household 3.391e-02 3.582e-02 0.947 0.34382
## `Item Type`Meat -2.290e-02 3.503e-02 -0.654 0.51319
## `Item Type`Office Supplies 2.297e-02 3.482e-02 0.660 0.50940
## `Item Type`Personal Care 3.959e-02 3.568e-02 1.110 0.26719
## `Item Type`Snacks -5.558e-03 3.498e-02 -0.159 0.87373
## `Item Type`Vegetables 2.878e-02 3.494e-02 0.824 0.41004
## `Order Priority`H -6.158e-03 2.008e-02 -0.307 0.75914
## `Order Priority`L 1.155e-02 2.006e-02 0.576 0.56490
## `Order Priority`M 2.662e-04 2.007e-02 0.013 0.98942
## `Total Profit` -1.534e-08 2.596e-08 -0.591 0.55465
##
## (Intercept)
## RegionAustralia and Oceania
## RegionCentral America and the Caribbean
## RegionEurope
## RegionMiddle East and North Africa
## RegionNorth America
## RegionSub-Saharan Africa .
## CountryAlbania
## CountryAlgeria .
## CountryAndorra .
## CountryAngola *
## CountryAntigua and Barbuda
## CountryArmenia
## CountryAustralia
## CountryAustria
## CountryAzerbaijan
## CountryBahrain
## CountryBangladesh
## CountryBarbados
## CountryBelarus
## CountryBelgium **
## CountryBelize
## CountryBenin
## CountryBhutan
## CountryBosnia and Herzegovina
## CountryBotswana
## CountryBrunei
## CountryBulgaria
## CountryBurkina Faso *
## CountryBurundi
## CountryCambodia
## CountryCameroon *
## CountryCanada
## CountryCape Verde
## CountryCentral African Republic
## CountryChad **
## CountryChina
## CountryComoros
## CountryCosta Rica
## CountryCote d'Ivoire .
## CountryCroatia
## CountryCuba
## CountryCyprus
## CountryCzech Republic
## CountryDemocratic Republic of the Congo **
## CountryDenmark
## CountryDjibouti *
## CountryDominica
## CountryDominican Republic
## CountryEast Timor
## CountryEgypt
## CountryEl Salvador
## CountryEquatorial Guinea .
## CountryEritrea .
## CountryEstonia .
## CountryEthiopia
## CountryFederated States of Micronesia
## CountryFiji
## CountryFinland *
## CountryFrance
## CountryGabon .
## CountryGeorgia *
## CountryGermany
## CountryGhana
## CountryGreece
## CountryGreenland *
## CountryGrenada
## CountryGuatemala
## CountryGuinea **
## CountryGuinea-Bissau
## CountryHaiti .
## CountryHonduras
## CountryHungary *
## CountryIceland .
## CountryIndia
## CountryIndonesia
## CountryIran
## CountryIraq
## CountryIreland
## CountryIsrael
## CountryItaly .
## CountryJamaica
## CountryJapan *
## CountryJordan
## CountryKazakhstan
## CountryKenya .
## CountryKiribati
## CountryKosovo *
## CountryKuwait
## CountryKyrgyzstan
## CountryLaos
## CountryLatvia
## CountryLebanon
## CountryLesotho
## CountryLiberia
## CountryLibya
## CountryLiechtenstein
## CountryLithuania .
## CountryLuxembourg
## CountryMacedonia
## CountryMadagascar
## CountryMalawi *
## CountryMalaysia
## CountryMaldives
## CountryMali .
## CountryMalta
## CountryMarshall Islands
## CountryMauritania
## CountryMauritius
## CountryMexico
## CountryMoldova
## CountryMonaco
## CountryMongolia
## CountryMontenegro
## CountryMorocco
## CountryMozambique *
## CountryMyanmar
## CountryNamibia .
## CountryNauru
## CountryNepal
## CountryNetherlands .
## CountryNew Zealand
## CountryNicaragua
## CountryNiger
## CountryNigeria
## CountryNorth Korea
## CountryNorway
## CountryOman
## CountryPakistan
## CountryPalau
## CountryPanama
## CountryPapua New Guinea
## CountryPhilippines
## CountryPoland
## CountryPortugal *
## CountryQatar
## CountryRepublic of the Congo *
## CountryRomania
## CountryRussia
## CountryRwanda
## CountrySaint Kitts and Nevis
## CountrySaint Lucia
## CountrySaint Vincent and the Grenadines
## CountrySamoa
## CountrySan Marino
## CountrySao Tome and Principe *
## CountrySaudi Arabia
## CountrySenegal
## CountrySerbia
## CountrySeychelles
## CountrySierra Leone *
## CountrySingapore
## CountrySlovakia .
## CountrySlovenia
## CountrySolomon Islands
## CountrySomalia
## CountrySouth Africa
## CountrySouth Korea
## CountrySouth Sudan
## CountrySpain *
## CountrySri Lanka
## CountrySudan
## CountrySwaziland
## CountrySweden
## CountrySwitzerland
## CountrySyria
## CountryTaiwan
## CountryTajikistan
## CountryTanzania *
## CountryThailand
## CountryThe Bahamas
## CountryThe Gambia
## CountryTogo *
## CountryTonga
## CountryTrinidad and Tobago
## CountryTunisia
## CountryTurkey
## CountryTurkmenistan
## CountryTuvalu
## CountryUganda
## CountryUkraine
## CountryUnited Arab Emirates
## CountryUnited Kingdom
## CountryUnited States of America
## CountryUzbekistan
## CountryVanuatu
## CountryVatican City
## CountryVietnam
## CountryYemen
## CountryZambia
## CountryZimbabwe
## `Item Type`Beverages
## `Item Type`Cereal
## `Item Type`Clothes
## `Item Type`Cosmetics
## `Item Type`Fruits
## `Item Type`Household
## `Item Type`Meat
## `Item Type`Office Supplies
## `Item Type`Personal Care
## `Item Type`Snacks
## `Item Type`Vegetables
## `Order Priority`H
## `Order Priority`L
## `Order Priority`M
## `Total Profit`
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 110905 on 80000 degrees of freedom
## Residual deviance: 110698 on 79801 degrees of freedom
## AIC: 111098
##
## Number of Fisher Scoring iterations: 3
# Fitted probabilities for the training rows (type = 'response' requests
# values on the probability scale)
predictions1 <- predict(smallLRModel, trainBigdata, type = 'response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# First few fitted probabilities
head(predictions1)
## 1 2 3 4 5 6
## 0.5045163 0.4524548 0.4871299 0.4907894 0.4797097 0.4984959
# Confusion matrix on the training data: fitted probabilities above 0.5 are
# coded 1, all others 0; predicted codes in rows, actual classes in columns.
pred1 <- ifelse(predictions1 > 0.5, 1, 0)
confMetrixLR <- table(
  Predicted = pred1,
  Actual = trainBigdata$`Sales Channel`
)
confMetrixLR
## Actual
## Predicted Offline Online
## 0 19303 17666
## 1 20654 22378
# Training accuracy: diagonal of the confusion matrix over the total count.
sum(diag(confMetrixLR)) / sum(confMetrixLR)
## [1] 0.521006
Let’s apply the prediction to test data now
# Fitted probabilities for the held-out rows of the large dataset
p2 <- predict(smallLRModel, testBigdata, type = 'response')
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from a rank-deficient fit may be misleading
# Confusion matrix on the test data: fitted probabilities above 0.5 are
# coded 1, all others 0; predicted codes in rows, actual classes in columns.
pred2 <- ifelse(p2 > 0.5, 1, 0)
confMetrixLR2 <- table(
  Predicted = pred2,
  Actual = testBigdata$`Sales Channel`
)
confMetrixLR2
## Actual
## Predicted Offline Online
## 0 4654 4626
## 1 5335 5384
# Test accuracy: diagonal of the confusion matrix over the total count.
sum(diag(confMetrixLR2)) / sum(confMetrixLR2)
## [1] 0.5019251
We could see that the logistic regression provides an accuracy of 50.19%
I have applied the Decision Tree and Logistic Regression models to the same data and tried to predict the Sales Channel. After running the models on datasets of different sizes, I didn’t observe any improvement in the accuracy of the models. The data we have probably does not contain enough information to predict the Sales Channel.
We may probably be able to predict other information like Total profits based on some other predictor variable.