library(rcompanion)
library(datasets)
library(caTools)
library(dplyr)
library(party)
library(rpart.plot)
library(caret)
df100 <- read.csv("https://raw.githubusercontent.com/ltcancel/DATA622/main/Homework1/1000%20Sales%20Records.csv")
df1000 <- read.csv("https://raw.githubusercontent.com/ltcancel/DATA622/main/Homework1/100000%20Sales%20Records.csv")
head(df100)
## Region Country Item.Type Sales.Channel Order.Priority
## 1 Middle East and North Africa Libya Cosmetics Offline M
## 2 North America Canada Vegetables Online M
## 3 Middle East and North Africa Libya Baby Food Offline C
## 4 Asia Japan Cereal Offline C
## 5 Sub-Saharan Africa Chad Fruits Offline H
## 6 Europe Armenia Cereal Online H
## Order.Date Order.ID Ship.Date Units.Sold Unit.Price Unit.Cost Total.Revenue
## 1 10/18/2014 686800706 10/31/2014 8446 437.20 263.33 3692591.20
## 2 11/7/2011 185941302 12/8/2011 3018 154.06 90.93 464953.08
## 3 10/31/2016 246222341 12/9/2016 1517 255.28 159.42 387259.76
## 4 4/10/2010 161442649 5/12/2010 3322 205.70 117.11 683335.40
## 5 8/16/2011 645713555 8/31/2011 9845 9.33 6.92 91853.85
## 6 11/24/2014 683458888 12/28/2014 9528 205.70 117.11 1959909.60
## Total.Cost Total.Profit
## 1 2224085.2 1468506.02
## 2 274426.7 190526.34
## 3 241840.1 145419.62
## 4 389039.4 294295.98
## 5 68127.4 23726.45
## 6 1115824.1 844085.52
head(df1000)
## Region Country Item.Type
## 1 Middle East and North Africa Azerbaijan Snacks
## 2 Central America and the Caribbean Panama Cosmetics
## 3 Sub-Saharan Africa Sao Tome and Principe Fruits
## 4 Sub-Saharan Africa Sao Tome and Principe Personal Care
## 5 Central America and the Caribbean Belize Household
## 6 Europe Denmark Clothes
## Sales.Channel Order.Priority Order.Date Order.ID Ship.Date Units.Sold
## 1 Online C 10/8/2014 535113847 10/23/2014 934
## 2 Offline L 2/22/2015 874708545 2/27/2015 4551
## 3 Offline M 12/9/2015 854349935 1/18/2016 9986
## 4 Online M 9/17/2014 892836844 10/12/2014 9118
## 5 Offline H 2/4/2010 129280602 3/5/2010 5858
## 6 Online C 2/20/2013 473105037 2/28/2013 1149
## Unit.Price Unit.Cost Total.Revenue Total.Cost Total.Profit
## 1 152.58 97.44 142509.72 91008.96 51500.76
## 2 437.20 263.33 1989697.20 1198414.83 791282.37
## 3 9.33 6.92 93169.38 69103.12 24066.26
## 4 81.73 56.67 745214.14 516717.06 228497.08
## 5 668.27 502.54 3914725.66 2943879.32 970846.34
## 6 109.28 35.84 125562.72 41180.16 84382.56
The first dataframe contains 1000 observations and 14 variables. 7 variables are characters, 2 are integers, and the remaining are numbers.
str(df100)
## 'data.frame': 1000 obs. of 14 variables:
## $ Region : chr "Middle East and North Africa" "North America" "Middle East and North Africa" "Asia" ...
## $ Country : chr "Libya" "Canada" "Libya" "Japan" ...
## $ Item.Type : chr "Cosmetics" "Vegetables" "Baby Food" "Cereal" ...
## $ Sales.Channel : chr "Offline" "Online" "Offline" "Offline" ...
## $ Order.Priority: chr "M" "M" "C" "C" ...
## $ Order.Date : chr "10/18/2014" "11/7/2011" "10/31/2016" "4/10/2010" ...
## $ Order.ID : int 686800706 185941302 246222341 161442649 645713555 683458888 679414975 208630645 266467225 118598544 ...
## $ Ship.Date : chr "10/31/2014" "12/8/2011" "12/9/2016" "5/12/2010" ...
## $ Units.Sold : int 8446 3018 1517 3322 9845 9528 2844 7299 2428 4800 ...
## $ Unit.Price : num 437.2 154.06 255.28 205.7 9.33 ...
## $ Unit.Cost : num 263.33 90.93 159.42 117.11 6.92 ...
## $ Total.Revenue : num 3692591 464953 387260 683335 91854 ...
## $ Total.Cost : num 2224085 274427 241840 389039 68127 ...
## $ Total.Profit : num 1468506 190526 145420 294296 23726 ...
The second dataframe contains 100,000 observations and 14 variables. The variable datatypes are exactly the same as the first dataframe.
str(df1000)
## 'data.frame': 100000 obs. of 14 variables:
## $ Region : chr "Middle East and North Africa" "Central America and the Caribbean" "Sub-Saharan Africa" "Sub-Saharan Africa" ...
## $ Country : chr "Azerbaijan" "Panama" "Sao Tome and Principe" "Sao Tome and Principe" ...
## $ Item.Type : chr "Snacks" "Cosmetics" "Fruits" "Personal Care" ...
## $ Sales.Channel : chr "Online" "Offline" "Offline" "Online" ...
## $ Order.Priority: chr "C" "L" "M" "M" ...
## $ Order.Date : chr "10/8/2014" "2/22/2015" "12/9/2015" "9/17/2014" ...
## $ Order.ID : int 535113847 874708545 854349935 892836844 129280602 473105037 754046475 772153747 847788178 471623599 ...
## $ Ship.Date : chr "10/23/2014" "2/27/2015" "1/18/2016" "10/12/2014" ...
## $ Units.Sold : int 934 4551 9986 9118 5858 1149 7964 6307 8217 2758 ...
## $ Unit.Price : num 152.58 437.2 9.33 81.73 668.27 ...
## $ Unit.Cost : num 97.44 263.33 6.92 56.67 502.54 ...
## $ Total.Revenue : num 142510 1989697 93169 745214 3914726 ...
## $ Total.Cost : num 91009 1198415 69103 516717 2943879 ...
## $ Total.Profit : num 51501 791282 24066 228497 970846 ...
Both dataframes look nearly identical, other than the obvious difference in the number of observations. If we look a little closer to the summary of each data set we can find some more differences. The minimum values for the Total.Revenue variable differs. The minimum for the first dataframe is 2043 and the minimum for the second dataframe is 19. Total.Cost and Total.Profit also show a significant difference with a minimum value of 1417 and 532.6 for respectively for the first dataframe and 14 and 4.8 for the second dataframe. Neither dataframe has NA values. The minimum and maximum values do not look off. They all look like valid values based on the variable names.
From the summaries below we can see that both datasets seem relatively simple with mostly categorical data so I will use Decision Tree and SVM models.
#check for NA values
summary(df100)
## Region Country Item.Type Sales.Channel
## Length:1000 Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Order.Priority Order.Date Order.ID Ship.Date
## Length:1000 Length:1000 Min. :102928006 Length:1000
## Class :character Class :character 1st Qu.:328074026 Class :character
## Mode :character Mode :character Median :556609714 Mode :character
## Mean :549681325
## 3rd Qu.:769694483
## Max. :995529830
## Units.Sold Unit.Price Unit.Cost Total.Revenue
## Min. : 13 Min. : 9.33 Min. : 6.92 Min. : 2043
## 1st Qu.:2420 1st Qu.: 81.73 1st Qu.: 56.67 1st Qu.: 281192
## Median :5184 Median :154.06 Median : 97.44 Median : 754939
## Mean :5054 Mean :262.11 Mean :184.97 Mean :1327322
## 3rd Qu.:7537 3rd Qu.:421.89 3rd Qu.:263.33 3rd Qu.:1733503
## Max. :9998 Max. :668.27 Max. :524.96 Max. :6617210
## Total.Cost Total.Profit
## Min. : 1417 Min. : 532.6
## 1st Qu.: 164932 1st Qu.: 98376.1
## Median : 464726 Median : 277226.0
## Mean : 936119 Mean : 391202.6
## 3rd Qu.:1141750 3rd Qu.: 548456.8
## Max. :5204978 Max. :1726181.4
summary(df1000)
## Region Country Item.Type Sales.Channel
## Length:100000 Length:100000 Length:100000 Length:100000
## Class :character Class :character Class :character Class :character
## Mode :character Mode :character Mode :character Mode :character
##
##
##
## Order.Priority Order.Date Order.ID Ship.Date
## Length:100000 Length:100000 Min. :100008904 Length:100000
## Class :character Class :character 1st Qu.:326046383 Class :character
## Mode :character Mode :character Median :547718512 Mode :character
## Mean :550395554
## 3rd Qu.:775078534
## Max. :999996459
## Units.Sold Unit.Price Unit.Cost Total.Revenue
## Min. : 1 Min. : 9.33 Min. : 6.92 Min. : 19
## 1st Qu.: 2505 1st Qu.:109.28 1st Qu.: 56.67 1st Qu.: 279753
## Median : 5007 Median :205.70 Median :117.11 Median : 789892
## Mean : 5001 Mean :266.70 Mean :188.02 Mean :1336067
## 3rd Qu.: 7495 3rd Qu.:437.20 3rd Qu.:364.69 3rd Qu.:1836490
## Max. :10000 Max. :668.27 Max. :524.96 Max. :6682700
## Total.Cost Total.Profit
## Min. : 14 Min. : 4.8
## 1st Qu.: 162928 1st Qu.: 95900.0
## Median : 467937 Median : 283657.5
## Mean : 941975 Mean : 394091.2
## 3rd Qu.:1209475 3rd Qu.: 568384.1
## Max. :5249075 Max. :1738700.0
The modeling for this data is very experimental because it is not clear what the business problem or question is. What are we attempting to solve for? Since we have two clear distinct categorical variables, Sales Channel and Order Priority, we can use one to be our predictor variable.
We will use a Decision Tree as our modeling technique for both datasets.
df100$Sales.Channel <- as.factor(df100$Sales.Channel)
df100$Order.Priority <- as.factor(df100$Order.Priority)
df1000$Sales.Channel <- as.factor(df1000$Sales.Channel)
df1000$Order.Priority <- as.factor(df1000$Order.Priority)
str(df1000)
## 'data.frame': 100000 obs. of 14 variables:
## $ Region : chr "Middle East and North Africa" "Central America and the Caribbean" "Sub-Saharan Africa" "Sub-Saharan Africa" ...
## $ Country : chr "Azerbaijan" "Panama" "Sao Tome and Principe" "Sao Tome and Principe" ...
## $ Item.Type : chr "Snacks" "Cosmetics" "Fruits" "Personal Care" ...
## $ Sales.Channel : Factor w/ 2 levels "Offline","Online": 2 1 1 2 1 2 1 2 2 2 ...
## $ Order.Priority: Factor w/ 4 levels "C","H","L","M": 1 3 4 4 2 1 4 1 2 2 ...
## $ Order.Date : chr "10/8/2014" "2/22/2015" "12/9/2015" "9/17/2014" ...
## $ Order.ID : int 535113847 874708545 854349935 892836844 129280602 473105037 754046475 772153747 847788178 471623599 ...
## $ Ship.Date : chr "10/23/2014" "2/27/2015" "1/18/2016" "10/12/2014" ...
## $ Units.Sold : int 934 4551 9986 9118 5858 1149 7964 6307 8217 2758 ...
## $ Unit.Price : num 152.58 437.2 9.33 81.73 668.27 ...
## $ Unit.Cost : num 97.44 263.33 6.92 56.67 502.54 ...
## $ Total.Revenue : num 142510 1989697 93169 745214 3914726 ...
## $ Total.Cost : num 91009 1198415 69103 516717 2943879 ...
## $ Total.Profit : num 51501 791282 24066 228497 970846 ...
#sample set
df100_data <- df100 %>%
select(Order.Priority, Units.Sold, Unit.Price, Unit.Cost, Total.Revenue, Total.Cost, Total.Profit)
set.seed(1234)
#dt <- sort(sample(nrow(df100_data),nrow(df100_data)*.7))
#train_data <- df100_data[df,]
sample_data <- sample.split(df100_data, SplitRatio = 0.80)
train_data <- subset(df100_data, sample_data == TRUE)
test_data <- subset(df100_data, sample_data == FALSE)
head(train_data)
## Order.Priority Units.Sold Unit.Price Unit.Cost Total.Revenue Total.Cost
## 1 M 8446 437.20 263.33 3692591.2 2224085.2
## 2 M 3018 154.06 90.93 464953.1 274426.7
## 3 C 1517 255.28 159.42 387259.8 241840.1
## 4 C 3322 205.70 117.11 683335.4 389039.4
## 7 H 2844 205.70 117.11 585010.8 333060.8
## 8 M 7299 109.28 35.84 797634.7 261596.2
## Total.Profit
## 1 1468506.0
## 2 190526.3
## 3 145419.6
## 4 294296.0
## 7 251950.0
## 8 536038.6
Build tree for first set
rtree <- rpart(Order.Priority ~ ., data = train_data, method="class", control = rpart.control(minsplit = 20, minbucket = 7, maxdepth = 10, usesurrogate = 2, xval = 10))
rtree
## n= 714
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 714 521 L (0.26610644 0.22689076 0.27030812 0.23669468)
## 2) Units.Sold< 6835 468 327 C (0.30128205 0.23504274 0.22863248 0.23504274)
## 4) Units.Sold>=6233.5 46 25 C (0.45652174 0.19565217 0.26086957 0.08695652) *
## 5) Units.Sold< 6233.5 422 302 C (0.28436019 0.23933649 0.22511848 0.25118483)
## 10) Total.Cost< 1481696 377 275 C (0.27055703 0.25994695 0.22811671 0.24137931)
## 20) Total.Profit< 217526 250 169 C (0.32400000 0.24400000 0.19200000 0.24000000) *
## 21) Total.Profit>=217526 127 89 L (0.16535433 0.29133858 0.29921260 0.24409449)
## 42) Total.Revenue>=1549050 16 8 H (0.31250000 0.50000000 0.12500000 0.06250000) *
## 43) Total.Revenue< 1549050 111 75 L (0.14414414 0.26126126 0.32432432 0.27027027)
## 86) Total.Revenue< 951215.4 66 44 H (0.13636364 0.33333333 0.24242424 0.28787879) *
## 87) Total.Revenue>=951215.4 45 25 L (0.15555556 0.15555556 0.44444444 0.24444444) *
## 11) Total.Cost>=1481696 45 27 C (0.40000000 0.06666667 0.20000000 0.33333333) *
## 3) Units.Sold>=6835 246 160 L (0.19918699 0.21138211 0.34959350 0.23983740) *
Plot the tree
rpart.plot(rtree)
Prediction of first dataset
pred <- predict(rtree, test_data, type = "class")
pred_table <- table(test_data$Order.Priority, pred)
pred_table
## pred
## C H L M
## C 42 11 19 0
## H 28 9 29 0
## L 34 17 24 0
## M 30 11 32 0
Confusion Matrix to test for accuracy
confusionMatrix(test_data$Order.Priority, pred)
## Confusion Matrix and Statistics
##
## Reference
## Prediction C H L M
## C 42 11 19 0
## H 28 9 29 0
## L 34 17 24 0
## M 30 11 32 0
##
## Overall Statistics
##
## Accuracy : 0.2622
## 95% CI : (0.2122, 0.3173)
## No Information Rate : 0.4685
## P-Value [Acc > NIR] : 1
##
## Kappa : 0.0136
##
## Mcnemar's Test P-Value : <2e-16
##
## Statistics by Class:
##
## Class: C Class: H Class: L Class: M
## Sensitivity 0.3134 0.18750 0.23077 NA
## Specificity 0.8026 0.76050 0.71978 0.7448
## Pos Pred Value 0.5833 0.13636 0.32000 NA
## Neg Pred Value 0.5701 0.82273 0.62085 NA
## Prevalence 0.4685 0.16783 0.36364 0.0000
## Detection Rate 0.1469 0.03147 0.08392 0.0000
## Detection Prevalence 0.2517 0.23077 0.26224 0.2552
## Balanced Accuracy 0.5580 0.47400 0.47527 NA
We will build the same decision tree with the second dataset
#sample set
df1000_data <- df1000 %>%
select(Order.Priority, Units.Sold, Unit.Price, Unit.Cost, Total.Revenue, Total.Cost, Total.Profit)
set.seed(1234)
sample_data2 <- sample.split(df1000_data, SplitRatio = 0.80)
train_data2 <- subset(df1000_data, sample_data2 == TRUE)
test_data2 <- subset(df1000_data, sample_data2 == FALSE)
head(train_data2)
## Order.Priority Units.Sold Unit.Price Unit.Cost Total.Revenue Total.Cost
## 1 C 934 152.58 97.44 142509.72 91008.96
## 2 L 4551 437.20 263.33 1989697.20 1198414.83
## 3 M 9986 9.33 6.92 93169.38 69103.12
## 4 M 9118 81.73 56.67 745214.14 516717.06
## 7 M 7964 437.20 263.33 3481860.80 2097160.12
## 8 C 6307 9.33 6.92 58844.31 43644.44
## Total.Profit
## 1 51500.76
## 2 791282.37
## 3 24066.26
## 4 228497.08
## 7 1384700.68
## 8 15199.87
Build tree for second set
rtree2 <- rpart(Order.Priority ~ ., data = train_data2, method="class", control = rpart.control(minsplit = 4, minbucket = round(5/3), maxdepth = 3))
rtree2
## n= 71429
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 71429 53521 M (0.2506685 0.2483445 0.2502765 0.2507105) *
Plot the tree for second set
#rpart.plot(rtree2)