Data 622 Homework 1

Data Exploration

First we use the glimpse function to get a better understanding of our data. First Sales record file is (1000, 14) and the second is (1000000, 14). Both data sets have the same number of columns and the column names/data types are the same.

df1 <- read.csv("1000 Sales Records.csv")
df2 <- read.csv("50000 Sales Records.csv")

glimpse(df1)

## Rows: 1,000
## Columns: 14
## $ Region         <chr> "Middle East and North Africa", "North America", "Middl…
## $ Country        <chr> "Libya", "Canada", "Libya", "Japan", "Chad", "Armenia",…
## $ Item.Type      <chr> "Cosmetics", "Vegetables", "Baby Food", "Cereal", "Frui…
## $ Sales.Channel  <chr> "Offline", "Online", "Offline", "Offline", "Offline", "…
## $ Order.Priority <chr> "M", "M", "C", "C", "H", "H", "H", "M", "H", "H", "M", …
## $ Order.Date     <chr> "10/18/2014", "11/7/2011", "10/31/2016", "4/10/2010", "…
## $ Order.ID       <int> 686800706, 185941302, 246222341, 161442649, 645713555, …
## $ Ship.Date      <chr> "10/31/2014", "12/8/2011", "12/9/2016", "5/12/2010", "8…
## $ Units.Sold     <int> 8446, 3018, 1517, 3322, 9845, 9528, 2844, 7299, 2428, 4…
## $ Unit.Price     <dbl> 437.20, 154.06, 255.28, 205.70, 9.33, 205.70, 205.70, 1…
## $ Unit.Cost      <dbl> 263.33, 90.93, 159.42, 117.11, 6.92, 117.11, 117.11, 35…
## $ Total.Revenue  <dbl> 3692591.20, 464953.08, 387259.76, 683335.40, 91853.85, …
## $ Total.Cost     <dbl> 2224085.18, 274426.74, 241840.14, 389039.42, 68127.40, …
## $ Total.Profit   <dbl> 1468506.02, 190526.34, 145419.62, 294295.98, 23726.45, …

glimpse(df2)

## Rows: 50,000
## Columns: 14
## $ Region         <chr> "Sub-Saharan Africa", "Europe", "Europe", "Europe", "Eu…
## $ Country        <chr> "Namibia", "Iceland", "Russia", "Moldova ", "Malta", "I…
## $ Item.Type      <chr> "Household", "Baby Food", "Meat", "Meat", "Cereal", "Me…
## $ Sales.Channel  <chr> "Offline", "Online", "Online", "Online", "Online", "Onl…
## $ Order.Priority <chr> "M", "H", "L", "L", "M", "H", "M", "L", "M", "C", "M", …
## $ Order.Date     <chr> "8/31/2015", "11/20/2010", "6/22/2017", "2/28/2012", "8…
## $ Order.ID       <int> 897751939, 599480426, 538911855, 459845054, 626391351, …
## $ Ship.Date      <chr> "10/12/2015", "1/9/2011", "6/25/2017", "3/20/2012", "9/…
## $ Units.Sold     <int> 3604, 8435, 4848, 7225, 1975, 2542, 4398, 49, 4031, 791…
## $ Unit.Price     <dbl> 668.27, 255.28, 421.89, 421.89, 205.70, 421.89, 668.27,…
## $ Unit.Cost      <dbl> 502.54, 159.42, 364.69, 364.69, 117.11, 364.69, 502.54,…
## $ Total.Revenue  <dbl> 2408445.08, 2153286.80, 2045322.72, 3048155.25, 406257.…
## $ Total.Cost     <dbl> 1811154.16, 1344707.70, 1768017.12, 2634885.25, 231292.…
## $ Total.Profit   <dbl> 597290.92, 808579.10, 277305.60, 413270.00, 174965.25, …

Looking at a summary of both data sets shows that even though df2 has 1,000 orders of magnitude more than df1 the means and medians of the columns are similar. This is useful to know - because it means they likely have similar distributions.

summary(df1)

##     Region            Country           Item.Type         Sales.Channel     
##  Length:1000        Length:1000        Length:1000        Length:1000       
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Order.Priority      Order.Date           Order.ID          Ship.Date        
##  Length:1000        Length:1000        Min.   :102928006   Length:1000       
##  Class :character   Class :character   1st Qu.:328074026   Class :character  
##  Mode  :character   Mode  :character   Median :556609714   Mode  :character  
##                                        Mean   :549681325                     
##                                        3rd Qu.:769694483                     
##                                        Max.   :995529830                     
##    Units.Sold     Unit.Price       Unit.Cost      Total.Revenue    
##  Min.   :  13   Min.   :  9.33   Min.   :  6.92   Min.   :   2043  
##  1st Qu.:2420   1st Qu.: 81.73   1st Qu.: 56.67   1st Qu.: 281192  
##  Median :5184   Median :154.06   Median : 97.44   Median : 754939  
##  Mean   :5054   Mean   :262.11   Mean   :184.97   Mean   :1327322  
##  3rd Qu.:7537   3rd Qu.:421.89   3rd Qu.:263.33   3rd Qu.:1733503  
##  Max.   :9998   Max.   :668.27   Max.   :524.96   Max.   :6617210  
##    Total.Cost       Total.Profit      
##  Min.   :   1417   Min.   :    532.6  
##  1st Qu.: 164932   1st Qu.:  98376.1  
##  Median : 464726   Median : 277226.0  
##  Mean   : 936119   Mean   : 391202.6  
##  3rd Qu.:1141750   3rd Qu.: 548456.8  
##  Max.   :5204978   Max.   :1726181.4

summary(df2)

##     Region            Country           Item.Type         Sales.Channel     
##  Length:50000       Length:50000       Length:50000       Length:50000      
##  Class :character   Class :character   Class :character   Class :character  
##  Mode  :character   Mode  :character   Mode  :character   Mode  :character  
##                                                                             
##                                                                             
##                                                                             
##  Order.Priority      Order.Date           Order.ID          Ship.Date        
##  Length:50000       Length:50000       Min.   :100013196   Length:50000      
##  Class :character   Class :character   1st Qu.:324007046   Class :character  
##  Mode  :character   Mode  :character   Median :550422394   Mode  :character  
##                                        Mean   :549733027                     
##                                        3rd Qu.:776782381                     
##                                        Max.   :999999463                     
##    Units.Sold      Unit.Price       Unit.Cost      Total.Revenue    
##  Min.   :    1   Min.   :  9.33   Min.   :  6.92   Min.   :     28  
##  1st Qu.: 2498   1st Qu.: 81.73   1st Qu.: 35.84   1st Qu.: 276487  
##  Median : 5018   Median :154.06   Median : 97.44   Median : 781325  
##  Mean   : 5000   Mean   :265.65   Mean   :187.32   Mean   :1323716  
##  3rd Qu.: 7493   3rd Qu.:421.89   3rd Qu.:263.33   3rd Qu.:1808642  
##  Max.   :10000   Max.   :668.27   Max.   :524.96   Max.   :6682032  
##    Total.Cost       Total.Profit      
##  Min.   :     21   Min.   :      7.2  
##  1st Qu.: 160637   1st Qu.:  94150.9  
##  Median : 467104   Median : 279536.4  
##  Mean   : 933157   Mean   : 390558.7  
##  3rd Qu.:1190390   3rd Qu.: 564286.7  
##  Max.   :5249075   Max.   :1738178.4

Every columns in both data sets has no missing values, which means we do not have to interpolate any values.

vis_miss(df1)

Histograms of each predictor shows all do not have a normal distribution. This will rule out using LDA or QDA for our model.

plot_histogram(df1)

plot_histogram(df2)

Here we remove Order Id and the dates from the data frames. Also, because there are too many countries for our model to handle as a categorical variable we label encode the column.

df1 <- df1[, !names(df1) %in% c("Order.ID", "Order.Date", "Ship.Date")]
df2 <- df2[, !names(df2) %in% c("Order.ID", "Order.Date", "Ship.Date")] 

df1 <- mutate(df1, across(where(is.character), as.factor))
df2 <- mutate(df2, across(where(is.character), as.factor))

enc <- LabelEncoder.fit(df1$Country)
df1$Country <- transform(enc, df1$Country)

enc <- LabelEncoder.fit(df2$Country)
df2$Country <- transform(enc, df2$Country)

The box plots show quite a few outliers and the scale between Cost, Price, and Sold is magnitudes smaller than the other features. If we decide to use KNN we will need to scale our variables. We won’t if we use Random Forrest.

ggplot(data = reshape2::melt(df1) , aes(x=variable, y=value)) + 
  geom_boxplot(outlier.colour="red", outlier.shape=3, outlier.size=5,aes(fill=variable)) +
  coord_flip() + theme(legend.position = "none")

## Using Region, Item.Type, Sales.Channel, Order.Priority as id variables

ggplot(data = reshape2::melt(df2) , aes(x=variable, y=value)) + 
  geom_boxplot(outlier.colour="red", outlier.shape=3, outlier.size=5,aes(fill=variable)) +
  coord_flip() + theme(legend.position = "none")

## Using Region, Item.Type, Sales.Channel, Order.Priority as id variables

Checking the correlation between the variables shows that most variables are highly correlated. For example Total Cost and Total Revenue are 0.99 correlated. Which makes sense.

dev.off()

## null device 
##           1

df1_corr <- unlist(lapply(df1, is.numeric), use.names = FALSE)  
df1_corr <- df1[, df1_corr]
corPlot(df1_corr)

dev.off()

## null device 
##           1

df2_corr <- unlist(lapply(df2, is.numeric), use.names = FALSE)  
df2_corr <- df2[, df2_corr]
corPlot(df2_corr)

Data Split

smp_size1 <- floor(0.75 * nrow(df1))

## set the seed to make your partition reproducible
set.seed(123)
train_ind1 <- sample(seq_len(nrow(df1)), size = smp_size1)

df1_train <- df1[train_ind1, ]
df1_test <- df1[-train_ind1, ]

smp_size2 <- floor(0.75 * nrow(df2))

## set the seed to make your partition reproducible
set.seed(123)
train_ind2 <- sample(seq_len(nrow(df2)), size = smp_size2)

df2_train <- df2[train_ind2, ]
df2_test <- df2[-train_ind2, ]

Model Selection

For our model we will be predicting the Item Type. Since we are trying to classify I will select a model accordingly. LDA and QDA will not work because the features/predictors do not meet the necessary condition of being of a normal distribution.

rfm = randomForest(Item.Type~., data = df1_train)

pred <- predict(rfm, df1_test)

df1_test$Item.Type.Pred <- pred

table(df1_test$Item.Type, df1_test$Item.Type.Pred)

##                  
##                   Baby Food Beverages Cereal Clothes Cosmetics Fruits Household
##   Baby Food              25         0      0       0         0      0         0
##   Beverages               0        28      0       0         0      0         0
##   Cereal                  0         0     19       0         0      0         0
##   Clothes                 0         0      0      21         0      0         0
##   Cosmetics               0         0      0       0        15      0         0
##   Fruits                  0         0      0       0         0     16         0
##   Household               0         0      0       0         0      0        14
##   Meat                    0         0      0       0         0      0         0
##   Office Supplies         0         0      0       0         0      0         0
##   Personal Care           0         0      0       0         0      0         0
##   Snacks                  0         0      0       0         0      0         0
##   Vegetables              0         0      0       0         0      0         0
##                  
##                   Meat Office Supplies Personal Care Snacks Vegetables
##   Baby Food          0               0             0      0          0
##   Beverages          0               0             0      0          0
##   Cereal             0               0             0      0          0
##   Clothes            0               0             0      0          0
##   Cosmetics          0               0             0      0          0
##   Fruits             0               0             0      0          0
##   Household          0               0             0      0          0
##   Meat              22               0             0      0          0
##   Office Supplies    0              16             0      0          0
##   Personal Care      0               0            25      0          0
##   Snacks             0               0             0     25          0
##   Vegetables         0               0             0      0         24

Accuracy(df1_test$Item.Type, df1_test$Item.Type.Pred)

## [1] 1

rfm = randomForest(Item.Type~., data = df2_train)

pred <- predict(rfm, df2_test)

df2_test$Item.Type.Pred <- pred

table(df2_test$Item.Type, df2_test$Item.Type.Pred)

##                  
##                   Baby Food Beverages Cereal Clothes Cosmetics Fruits Household
##   Baby Food            1020         0      0       0         0      0         0
##   Beverages               0      1051      0       0         0      0         0
##   Cereal                  0         0   1064       0         0      0         0
##   Clothes                 0         0      0    1044         0      0         0
##   Cosmetics               0         0      0       0      1036      0         0
##   Fruits                  0         0      0       0         0   1080         0
##   Household               0         0      0       0         0      0      1080
##   Meat                    0         0      0       0         0      0         0
##   Office Supplies         0         0      0       0         0      0         0
##   Personal Care           0         0      0       0         0      0         0
##   Snacks                  0         0      0       0         0      0         0
##   Vegetables              0         0      0       0         0      0         0
##                  
##                   Meat Office Supplies Personal Care Snacks Vegetables
##   Baby Food          0               0             0      0          0
##   Beverages          0               0             0      0          0
##   Cereal             0               0             0      0          0
##   Clothes            0               0             0      0          0
##   Cosmetics          0               0             0      0          0
##   Fruits             0               0             0      0          0
##   Household          0               0             0      0          0
##   Meat            1019               0             0      0          0
##   Office Supplies    0             991             0      0          0
##   Personal Care      0               0          1039      0          0
##   Snacks             0               0             0   1030          0
##   Vegetables         0               0             0      0       1046

Accuracy(df2_test$Item.Type, df2_test$Item.Type.Pred)

## [1] 1

Data 622 Homework 1

Jordan Glendrange

Data Exploration

Data Split

Model Selection