1 Synopsis

This report analyzes data from a retail chain called Big Mart. It explores which locations produce the most sales. It also explores characteristics of these locations such as Outlet Type and Outlet Size. In the end, a prediction model will be built in order to predict the sales of each item at each outlet.

2 Exploratory Analysis

2.1 Loading Packages and Data

library(dplyr)
library(ggplot2)
library(caret)
library(caretEnsemble)
library(VIM)
library(gridExtra)

# Loading data
# stringsAsFactors = TRUE keeps the text columns as factors on every R version
# (read.csv stopped converting strings to factors by default in R 4.0.0).
# The factor-based summaries shown in this report depend on that conversion.
big_mart <- read.csv("train-file.csv", stringsAsFactors = TRUE)

2.2 Previewing Data

2.2.1 Structure of Data

glimpse(big_mart)
## Observations: 8,523
## Variables: 12
## $ Item_Identifier           <fctr> FDA15, DRC01, FDN15, FDX07, NCD19, ...
## $ Item_Weight               <dbl> 9.300, 5.920, 17.500, 19.200, 8.930,...
## $ Item_Fat_Content          <fctr> Low Fat, Regular, Low Fat, Regular,...
## $ Item_Visibility           <dbl> 0.016047301, 0.019278216, 0.01676007...
## $ Item_Type                 <fctr> Dairy, Soft Drinks, Meat, Fruits an...
## $ Item_MRP                  <dbl> 249.8092, 48.2692, 141.6180, 182.095...
## $ Outlet_Identifier         <fctr> OUT049, OUT018, OUT049, OUT010, OUT...
## $ Outlet_Establishment_Year <int> 1999, 2009, 1999, 1998, 1987, 2009, ...
## $ Outlet_Size               <fctr> Medium, Medium, Medium, , High, Med...
## $ Outlet_Location_Type      <fctr> Tier 1, Tier 3, Tier 1, Tier 3, Tie...
## $ Outlet_Type               <fctr> Supermarket Type1, Supermarket Type...
## $ Item_Outlet_Sales         <dbl> 3735.1380, 443.4228, 2097.2700, 732....

2.2.2 Head of Data

head(big_mart)
##   Item_Identifier Item_Weight Item_Fat_Content Item_Visibility
## 1           FDA15       9.300          Low Fat      0.01604730
## 2           DRC01       5.920          Regular      0.01927822
## 3           FDN15      17.500          Low Fat      0.01676007
## 4           FDX07      19.200          Regular      0.00000000
## 5           NCD19       8.930          Low Fat      0.00000000
## 6           FDP36      10.395          Regular      0.00000000
##               Item_Type Item_MRP Outlet_Identifier
## 1                 Dairy 249.8092            OUT049
## 2           Soft Drinks  48.2692            OUT018
## 3                  Meat 141.6180            OUT049
## 4 Fruits and Vegetables 182.0950            OUT010
## 5             Household  53.8614            OUT013
## 6          Baking Goods  51.4008            OUT018
##   Outlet_Establishment_Year Outlet_Size Outlet_Location_Type
## 1                      1999      Medium               Tier 1
## 2                      2009      Medium               Tier 3
## 3                      1999      Medium               Tier 1
## 4                      1998                           Tier 3
## 5                      1987        High               Tier 3
## 6                      2009      Medium               Tier 3
##         Outlet_Type Item_Outlet_Sales
## 1 Supermarket Type1         3735.1380
## 2 Supermarket Type2          443.4228
## 3 Supermarket Type1         2097.2700
## 4     Grocery Store          732.3800
## 5 Supermarket Type1          994.7052
## 6 Supermarket Type2          556.6088

2.2.3 Summary of Data

summary(big_mart)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  FDG33  :  10    Min.   : 4.555   LF     : 316     Min.   :0.00000  
##  FDW13  :  10    1st Qu.: 8.774   low fat: 112     1st Qu.:0.02699  
##  DRE49  :   9    Median :12.600   Low Fat:5089     Median :0.05393  
##  DRN47  :   9    Mean   :12.858   reg    : 117     Mean   :0.06613  
##  FDD38  :   9    3rd Qu.:16.850   Regular:2889     3rd Qu.:0.09459  
##  FDF52  :   9    Max.   :21.350                    Max.   :0.32839  
##  (Other):8467    NA's   :1463                                       
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Fruits and Vegetables:1232   Min.   : 31.29   OUT027 : 935     
##  Snack Foods          :1200   1st Qu.: 93.83   OUT013 : 932     
##  Household            : 910   Median :143.01   OUT035 : 930     
##  Frozen Foods         : 856   Mean   :140.99   OUT046 : 930     
##  Dairy                : 682   3rd Qu.:185.64   OUT049 : 930     
##  Canned               : 649   Max.   :266.89   OUT045 : 929     
##  (Other)              :2994                    (Other):2937     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :2410   Tier 1:2388         
##  1st Qu.:1987              High  : 932   Tier 2:2785         
##  Median :1999              Medium:2793   Tier 3:3350         
##  Mean   :1998              Small :2388                       
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type   Item_Outlet_Sales 
##  Grocery Store    :1083   Min.   :   33.29  
##  Supermarket Type1:5577   1st Qu.:  834.25  
##  Supermarket Type2: 928   Median : 1794.33  
##  Supermarket Type3: 935   Mean   : 2181.29  
##                           3rd Qu.: 3101.30  
##                           Max.   :13086.97  
## 

2.3 Manipulating Dataset

2.3.1 Cleaning Item_Fat_Content Variable

# Collapse the spelling variants of Item_Fat_Content into the two real values.
# "LF" and "low fat" are both rewritten as "Low Fat".
index <- which(big_mart$Item_Fat_Content %in% c("LF", "low fat"))
big_mart$Item_Fat_Content[index] <- "Low Fat"

# "reg" is rewritten as "Regular".
index2 <- which(big_mart$Item_Fat_Content %in% "reg")
big_mart$Item_Fat_Content[index2] <- "Regular"

# Rebuild the factor so that only the values still present remain as levels.
big_mart$Item_Fat_Content <- factor(big_mart$Item_Fat_Content)

2.3.2 Imputing Missing Values

# Impute missing values with kNN(), then keep only the original columns
# (Item_Identifier through Item_Outlet_Sales). The selection discards any
# extra columns kNN() appends (presumably its "_imp" indicator columns).
# NOTE(review): imputation runs on the whole data set, before the train/test
# split in section 3.2 -- confirm that this is acceptable.
big_mart_imputed <- select(kNN(big_mart), Item_Identifier:Item_Outlet_Sales)

summary(big_mart_imputed)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  FDG33  :  10    Min.   : 4.555   Low Fat:5517     Min.   :0.00000  
##  FDW13  :  10    1st Qu.: 8.880   Regular:3006     1st Qu.:0.02699  
##  DRE49  :   9    Median :12.650                    Median :0.05393  
##  DRN47  :   9    Mean   :12.886                    Mean   :0.06613  
##  FDD38  :   9    3rd Qu.:16.850                    3rd Qu.:0.09459  
##  FDF52  :   9    Max.   :21.350                    Max.   :0.32839  
##  (Other):8467                                                       
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Fruits and Vegetables:1232   Min.   : 31.29   OUT027 : 935     
##  Snack Foods          :1200   1st Qu.: 93.83   OUT013 : 932     
##  Household            : 910   Median :143.01   OUT035 : 930     
##  Frozen Foods         : 856   Mean   :140.99   OUT046 : 930     
##  Dairy                : 682   3rd Qu.:185.64   OUT049 : 930     
##  Canned               : 649   Max.   :266.89   OUT045 : 929     
##  (Other)              :2994                    (Other):2937     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :2410   Tier 1:2388         
##  1st Qu.:1987              High  : 932   Tier 2:2785         
##  Median :1999              Medium:2793   Tier 3:3350         
##  Mean   :1998              Small :2388                       
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type   Item_Outlet_Sales 
##  Grocery Store    :1083   Min.   :   33.29  
##  Supermarket Type1:5577   1st Qu.:  834.25  
##  Supermarket Type2: 928   Median : 1794.33  
##  Supermarket Type3: 935   Mean   : 2181.29  
##                           3rd Qu.: 3101.30  
##                           Max.   :13086.97  
## 

In the Item_Fat_Content column there were several observations that needed cleaning. All of the content in this column was either Low Fat or Regular. However, some of the observations were stored as LF, low fat or reg. The cleaning made sure all observations were entered as Low Fat or Regular.

There were also 1463 missing values for the Item_Weight column. These missing values will present problems when trying to create a Machine Learning Model. In this report, kNN imputation was used to impute values for the missing observations. This method imputes a value based on other observations with similar values for the other variables in the dataset.

2.3.3 Discovering Way to Impute Values for Outlet_Size

2.3.3.1 Outlet Identifier by Outlet Size Table

table(big_mart_imputed$Outlet_Identifier, big_mart_imputed$Outlet_Size)
##         
##              High Medium Small
##   OUT010 555    0      0     0
##   OUT013   0  932      0     0
##   OUT017 926    0      0     0
##   OUT018   0    0    928     0
##   OUT019   0    0      0   528
##   OUT027   0    0    935     0
##   OUT035   0    0      0   930
##   OUT045 929    0      0     0
##   OUT046   0    0      0   930
##   OUT049   0    0    930     0

2.3.3.2 Outlet Identifier by Outlet_Type Table

table(big_mart_imputed$Outlet_Identifier, big_mart_imputed$Outlet_Type)
##         
##          Grocery Store Supermarket Type1 Supermarket Type2
##   OUT010           555                 0                 0
##   OUT013             0               932                 0
##   OUT017             0               926                 0
##   OUT018             0                 0               928
##   OUT019           528                 0                 0
##   OUT027             0                 0                 0
##   OUT035             0               930                 0
##   OUT045             0               929                 0
##   OUT046             0               930                 0
##   OUT049             0               930                 0
##         
##          Supermarket Type3
##   OUT010                 0
##   OUT013                 0
##   OUT017                 0
##   OUT018                 0
##   OUT019                 0
##   OUT027               935
##   OUT035                 0
##   OUT045                 0
##   OUT046                 0
##   OUT049                 0

2.3.3.3 Outlet Type by Outlet Size Table

# Cross-tabulate outlet type against outlet size. Both columns are taken from
# big_mart_imputed so the table is built from a single data frame.
table(big_mart_imputed$Outlet_Type, big_mart_imputed$Outlet_Size)
##                    
##                          High Medium Small
##   Grocery Store      555    0      0   528
##   Supermarket Type1 1855  932    930  1860
##   Supermarket Type2    0    0    928     0
##   Supermarket Type3    0    0    935     0

2.3.3.4 Imputing Small for OUT010 Location

# Every row belonging to outlet OUT010 gets Outlet_Size "Small".
index3 <- which(big_mart_imputed$Outlet_Identifier %in% "OUT010")
big_mart_imputed$Outlet_Size[index3] <- "Small"

2.3.3.5 Imputing Small for OUT017 Location

# Every row belonging to outlet OUT017 gets Outlet_Size "Small".
index4 <- which(big_mart_imputed$Outlet_Identifier %in% "OUT017")
big_mart_imputed$Outlet_Size[index4] <- "Small"

2.3.3.6 Imputing Medium for OUT045 Location

# Every row belonging to outlet OUT045 gets Outlet_Size "Medium".
index5 <- which(big_mart_imputed$Outlet_Identifier %in% "OUT045")
big_mart_imputed$Outlet_Size[index5] <- "Medium"

2.3.3.7 Dropping Unused Levels for Outlet Size Variable

big_mart_imputed$Outlet_Size <- factor(big_mart_imputed$Outlet_Size)

2.3.3.8 Summary Cleaned Dataset

summary(big_mart_imputed)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  FDG33  :  10    Min.   : 4.555   Low Fat:5517     Min.   :0.00000  
##  FDW13  :  10    1st Qu.: 8.880   Regular:3006     1st Qu.:0.02699  
##  DRE49  :   9    Median :12.650                    Median :0.05393  
##  DRN47  :   9    Mean   :12.886                    Mean   :0.06613  
##  FDD38  :   9    3rd Qu.:16.850                    3rd Qu.:0.09459  
##  FDF52  :   9    Max.   :21.350                    Max.   :0.32839  
##  (Other):8467                                                       
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Fruits and Vegetables:1232   Min.   : 31.29   OUT027 : 935     
##  Snack Foods          :1200   1st Qu.: 93.83   OUT013 : 932     
##  Household            : 910   Median :143.01   OUT035 : 930     
##  Frozen Foods         : 856   Mean   :140.99   OUT046 : 930     
##  Dairy                : 682   3rd Qu.:185.64   OUT049 : 930     
##  Canned               : 649   Max.   :266.89   OUT045 : 929     
##  (Other)              :2994                    (Other):2937     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985              High  : 932   Tier 1:2388         
##  1st Qu.:1987              Medium:3722   Tier 2:2785         
##  Median :1999              Small :3869   Tier 3:3350         
##  Mean   :1998                                                
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type   Item_Outlet_Sales 
##  Grocery Store    :1083   Min.   :   33.29  
##  Supermarket Type1:5577   1st Qu.:  834.25  
##  Supermarket Type2: 928   Median : 1794.33  
##  Supermarket Type3: 935   Mean   : 2181.29  
##                           3rd Qu.: 3101.30  
##                           Max.   :13086.97  
## 

These tables show that there are 10 different Big Mart outlets in the dataset. Each outlet size is either Small, Medium or High. Also, each outlet type is either Grocery Store, Supermarket Type1, Supermarket Type2 or Supermarket Type3. The Outlet Type by Outlet Size Table shows that every Grocery Store location with a recorded size is Small. Since the OUT010 location is a Grocery Store, all observations for this location will have the Outlet_Size variable imputed as Small. Unfortunately, the Outlet Type for both the OUT017 and OUT045 locations is Supermarket Type1, and the Outlet Size for Supermarket Type1 locations is either Small, Medium or High. Since the Outlet Size is High for only one location, in this report the Outlet Size variable will be set to Small for the OUT017 location and to Medium for the OUT045 location. All the changes can be seen when comparing the summary of the cleaned dataset with the summary of the original dataset.

2.4 Visualizing Data

2.4.1 Item Outlet Sales Histogram

 # Histogram of Item_Outlet_Sales using bins that are 200 wide.
 ggplot(data = big_mart_imputed, mapping = aes(Item_Outlet_Sales)) +
  geom_histogram(binwidth = 200) +
  ggtitle("Item Outlet Sales Histogram") +
  xlab("Item Outlet Sales")

2.4.2 Item Outlet Sales Histogram by Outlet Identifier

 # Histogram of Item_Outlet_Sales (bins 200 wide), one panel per outlet and
 # filled by outlet. The title names the faceting variable so this figure is
 # distinguishable from the overall histogram.
 ggplot(big_mart_imputed, aes(x = Item_Outlet_Sales,
                              fill = Outlet_Identifier)) +
  geom_histogram(binwidth = 200) +
  facet_wrap(~ Outlet_Identifier) +
  labs(title = "Item Outlet Sales Histogram by Outlet Identifier",
       x = "Item Outlet Sales")

2.4.3 Sales by Outlet Identifier

# Box plot of Item_Outlet_Sales for each outlet; the x-axis tick labels are
# rotated 90 degrees.
ggplot(data = big_mart_imputed,
       mapping = aes(Outlet_Identifier, Item_Outlet_Sales)) +
  geom_boxplot() +
  ggtitle("Sales by Outlet Identifier") +
  xlab("Outlet Identifier") +
  ylab("Item Outlet Sales") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

2.4.4 Item Outlet Sales by Item MRP and Outlet Identifier

# 2-D binned counts of Item_Outlet_Sales against Item_MRP, one panel per outlet.
ggplot(data = big_mart_imputed,
       mapping = aes(Item_MRP, Item_Outlet_Sales)) +
  geom_bin2d() +
  facet_wrap(~ Outlet_Identifier) +
  ggtitle("Item Outlet Sales by Item MRP and Outlet Identifier") +
  xlab("Item MRP") +
  ylab("Item Outlet Sales")

2.4.5 Further Investigation

2.4.5.1 Median Sales by Location

# Median Item_Outlet_Sales per outlet, sorted from highest to lowest.
arrange(
  summarize(
    group_by(big_mart_imputed, Outlet_Identifier),
    median_sales = median(Item_Outlet_Sales)
  ),
  desc(median_sales)
)
## # A tibble: 10 x 2
##    Outlet_Identifier median_sales
##               <fctr>        <dbl>
##  1            OUT027    3364.9532
##  2            OUT035    2109.2544
##  3            OUT013    2050.6640
##  4            OUT017    2005.0567
##  5            OUT049    1966.1074
##  6            OUT046    1945.8005
##  7            OUT045    1834.9448
##  8            OUT018    1655.1788
##  9            OUT019     265.3213
## 10            OUT010     250.3408

2.4.5.2 Correlation of Item Outlet Sales and Item MRP

cor(big_mart_imputed$Item_MRP, big_mart_imputed$Item_Outlet_Sales)
## [1] 0.5675744

These charts show that most Item Outlet Sales occur within the range of 0 to 5000. The histogram of item outlet sales broken down by Outlet Identifier shows that most of the low item outlet sales were in the OUT010 and OUT019 locations. Further examination shows that these two locations were the only two locations that were Grocery Stores. Therefore, it should be no surprise that they have the lowest sales. The boxplot shows that these two locations had the lowest sales all around. The outlet that produced the highest sales was the OUT027 location.

Although a person might assume that this outlet was the biggest, its size was only Medium. However, it was the only outlet that had an Outlet Type of Supermarket Type3. Another item worth noting is that the biggest location was ranked third when looking at median sales by location.

Finally, when looking at the final graph, there appears to be a moderate positive correlation between Item Outlet Sales and Item MRP. This impression is corroborated by computing the correlation coefficient between these two variables: the value of 0.5675744 indicates a moderate positive relationship. Now it is time to build the Machine Learning Model.

3 Machine Learning Models

3.1 Removing Identifier Variables

# Preparing data for machine learning: drop the two identifier columns.
big_mart_sub <- select(big_mart_imputed,
                       -c(Item_Identifier, Outlet_Identifier))

3.2 Partitioning The Data

# Fix the random seed before drawing the partition.
set.seed(366284)
# Row indices of the training portion, computed from Item_Outlet_Sales with
# p = 0.7 and returned as a matrix rather than a list.
inTrain <- createDataPartition(big_mart_sub[["Item_Outlet_Sales"]],
                               p = 0.7, list = FALSE)
# Rows listed in inTrain form the training set; all other rows form the test set.
test <- big_mart_sub[-inTrain, ]
train <- big_mart_sub[inTrain, ]

3.3 Caret List

3.3.1 Building List

# Resampling scheme shared by every base model: 10-fold cross-validation
# repeated 3 times.
# classProbs is not set: Item_Outlet_Sales is numeric, so this is a regression
# problem and class probabilities do not apply.
# savePredictions = "final" keeps the held-out predictions for the selected
# tuning parameters so the fitted models can be stacked afterwards.
control <- trainControl(
  method = "repeatedcv",
  number = 10,
  repeats = 3,
  savePredictions = "final"
)

# Base learners to fit.
# NOTE(review): 'glm' and 'lm' report identical resampling results in the
# output of this report, so one of them looks redundant -- confirm before
# removing either.
algorithmList <- c('glm', 'glmnet', 'lm', 'ranger', 'treebag', 'gbm', 'bagEarth')

# Fit every algorithm in algorithmList on the training set with the same
# formula and resampling control.
models <- caretList(
  Item_Outlet_Sales ~ .,
  data = train,
  trControl = control,
  methodList = algorithmList
)

3.3.2 Model Performance

results <- resamples(models)
summary(results)
## 
## Call:
## summary.resamples(object = results)
## 
## Models: glm, glmnet, lm, ranger, treebag, gbm, bagEarth 
## Number of resamples: 30 
## 
## MAE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## glm      775.7543 817.1962 838.8328 839.5734 854.0810 933.1969    0
## glmnet   772.9690 812.9446 839.1800 837.0200 850.0924 933.4574    0
## lm       775.7543 817.1962 838.8328 839.5734 854.0810 933.1969    0
## ranger   707.0019 768.8223 778.7816 779.6686 789.4152 856.1491    0
## treebag  741.2490 782.0189 792.2324 796.6315 813.3684 877.1345    0
## gbm      709.7937 755.3484 768.8886 770.1752 783.7864 852.0749    0
## bagEarth 777.8254 815.9635 841.5288 838.1882 852.6282 934.3312    0
## 
## RMSE 
##              Min.  1st Qu.   Median     Mean  3rd Qu.     Max. NA's
## glm      1028.452 1097.926 1119.933 1131.884 1162.996 1260.873    0
## glmnet   1027.890 1099.636 1118.539 1131.576 1159.846 1262.671    0
## lm       1028.452 1097.926 1119.933 1131.884 1162.996 1260.873    0
## ranger   1011.118 1083.284 1112.091 1112.333 1132.856 1226.679    0
## treebag  1011.821 1075.789 1097.563 1106.159 1128.304 1205.002    0
## gbm      1000.838 1058.929 1076.016 1085.227 1110.155 1196.828    0
## bagEarth 1031.267 1098.347 1116.633 1130.683 1160.184 1258.743    0
## 
## Rsquared 
##               Min.   1st Qu.    Median      Mean   3rd Qu.      Max. NA's
## glm      0.5295138 0.5456362 0.5567858 0.5602667 0.5650994 0.6157971    0
## glmnet   0.5307685 0.5483504 0.5567145 0.5609628 0.5671603 0.6169494    0
## lm       0.5295138 0.5456362 0.5567858 0.5602667 0.5650994 0.6157971    0
## ranger   0.5384171 0.5635603 0.5743481 0.5771507 0.5869593 0.6324654    0
## treebag  0.5392784 0.5648766 0.5730870 0.5802179 0.5978366 0.6292508    0
## gbm      0.5646607 0.5812455 0.5919053 0.5960426 0.6084908 0.6500075    0
## bagEarth 0.5312773 0.5511528 0.5575486 0.5612730 0.5668677 0.6175535    0

3.3.3 Seeing Models

models
## $glm
## Generalized Linear Model 
## 
## 5967 samples
##    9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 5370, 5370, 5370, 5371, 5370, 5371, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1131.884  0.5602667  839.5734
## 
## 
## $glmnet
## glmnet 
## 
## 5967 samples
##    9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 5370, 5370, 5370, 5371, 5370, 5371, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda      RMSE      Rsquared   MAE     
##   0.10     1.935134  1131.910  0.5602403  839.1737
##   0.10    19.351336  1132.510  0.5599704  838.5245
##   0.10   193.513361  1164.512  0.5470946  859.3739
##   0.55     1.935134  1131.996  0.5601687  839.0071
##   0.55    19.351336  1131.576  0.5609628  837.0200
##   0.55   193.513361  1214.361  0.5170014  899.9145
##   1.00     1.935134  1131.741  0.5603706  838.7042
##   1.00    19.351336  1132.109  0.5608816  836.9101
##   1.00   193.513361  1265.689  0.4855106  942.3760
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were alpha = 0.55 and lambda
##  = 19.35134.
## 
## $lm
## Linear Regression 
## 
## 5967 samples
##    9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 5370, 5370, 5370, 5371, 5370, 5371, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1131.884  0.5602667  839.5734
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
## 
## $ranger
## Random Forest 
## 
## 5967 samples
##    9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 5370, 5370, 5370, 5371, 5370, 5371, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE      Rsquared   MAE      
##    2    variance    1275.068  0.5258551   966.8857
##    2    extratrees  1340.958  0.4750376  1026.2169
##   14    variance    1112.333  0.5771507   779.6686
##   14    extratrees  1116.907  0.5731479   780.9075
##   27    variance    1132.820  0.5636043   794.3238
##   27    extratrees  1128.780  0.5661096   788.6844
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were mtry = 14 and splitrule
##  = variance.
## 
## $treebag
## Bagged CART 
## 
## 5967 samples
##    9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 5370, 5370, 5370, 5371, 5370, 5371, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   1106.159  0.5802179  796.6315
## 
## 
## $gbm
## Stochastic Gradient Boosting 
## 
## 5967 samples
##    9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 5370, 5370, 5370, 5371, 5370, 5371, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  RMSE      Rsquared   MAE     
##   1                   50      1258.920  0.4913048  944.3025
##   1                  100      1178.567  0.5353710  872.8744
##   1                  150      1154.202  0.5462613  856.2617
##   2                   50      1127.343  0.5741295  829.6320
##   2                  100      1100.549  0.5853602  802.5415
##   2                  150      1097.342  0.5872170  796.9014
##   3                   50      1089.362  0.5951151  780.0931
##   3                  100      1085.227  0.5960426  770.1752
##   3                  150      1087.609  0.5941960  770.5545
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were n.trees = 100,
##  interaction.depth = 3, shrinkage = 0.1 and n.minobsinnode = 10.
## 
## $bagEarth
## Bagged MARS 
## 
## 5967 samples
##    9 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 5370, 5370, 5370, 5371, 5370, 5371, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE      
##   2       1404.038  0.3229708  1028.0708
##   4       1197.874  0.5072018   902.9631
##   7       1130.683  0.5612730   838.1882
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were nprune = 7 and degree = 1.
## 
## attr(,"class")
## [1] "caretList"

3.3.4 GLMNET Ensemble

# Stack the base models with a glmnet meta-model, resampled with 10-fold
# cross-validation repeated 3 times, then print the fitted ensemble.
stack_glmnet <- caretStack(
  models,
  method = "glmnet",
  trControl = trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 3,
    savePredictions = TRUE
  )
)
print(stack_glmnet)
## A glmnet ensemble of 2 base models: glm, glmnet, lm, ranger, treebag, gbm, bagEarth
## 
## Ensemble results:
## glmnet 
## 
## 17901 samples
##     7 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 16111, 16110, 16112, 16110, 16112, 16111, ... 
## Resampling results across tuning parameters:
## 
##   alpha  lambda     RMSE      Rsquared   MAE     
##   0.10     2.63106  1078.625  0.6002746  761.3230
##   0.10    26.31060  1078.616  0.6002820  761.3661
##   0.10   263.10601  1082.263  0.5986693  770.6628
##   0.55     2.63106  1078.519  0.6003606  760.4564
##   0.55    26.31060  1078.629  0.6003519  760.8731
##   0.55   263.10601  1091.404  0.5998297  780.4231
##   1.00     2.63106  1078.513  0.6003692  760.3125
##   1.00    26.31060  1078.808  0.6003647  760.9179
##   1.00   263.10601  1111.595  0.5996916  812.4758
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were alpha = 1 and lambda = 2.63106.

3.3.4.1 Testing Model

3.3.4.1.1 Getting Predictions
predictions_glmnet <- predict(stack_glmnet, test)
error <- predictions_glmnet - test$Item_Outlet_Sales
3.3.4.1.2 Calculating RMSE
sqrt(mean(error^2))
## [1] 1083.242

3.3.5 Random Forest Ensemble

# Stack the base models with a ranger (random forest) meta-model, resampled
# with 10-fold cross-validation repeated 3 times.
stack_rf <- caretStack(
  models,
  method = "ranger",
  trControl = trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 3,
    savePredictions = TRUE
  )
)
## Growing trees.. Progress: 97%. Estimated remaining time: 1 seconds.
## Growing trees.. Progress: 96%. Estimated remaining time: 1 seconds.
## Growing trees.. Progress: 89%. Estimated remaining time: 3 seconds.
## Growing trees.. Progress: 96%. Estimated remaining time: 1 seconds.
## Growing trees.. Progress: 92%. Estimated remaining time: 2 seconds.
## Growing trees.. Progress: 94%. Estimated remaining time: 1 seconds.
## Growing trees.. Progress: 98%. Estimated remaining time: 0 seconds.
## Growing trees.. Progress: 81%. Estimated remaining time: 7 seconds.
## Growing trees.. Progress: 91%. Estimated remaining time: 3 seconds.
## Growing trees.. Progress: 96%. Estimated remaining time: 1 seconds.
## Growing trees.. Progress: 93%. Estimated remaining time: 2 seconds.
## Growing trees.. Progress: 97%. Estimated remaining time: 1 seconds.
## Growing trees.. Progress: 90%. Estimated remaining time: 3 seconds.
## Growing trees.. Progress: 97%. Estimated remaining time: 1 seconds.
## Growing trees.. Progress: 98%. Estimated remaining time: 0 seconds.
## Growing trees.. Progress: 92%. Estimated remaining time: 2 seconds.
## Growing trees.. Progress: 92%. Estimated remaining time: 2 seconds.
## Growing trees.. Progress: 96%. Estimated remaining time: 1 seconds.
## Growing trees.. Progress: 97%. Estimated remaining time: 0 seconds.
stack_rf
## A ranger ensemble of 2 base models: glm, glmnet, lm, ranger, treebag, gbm, bagEarth
## 
## Ensemble results:
## Random Forest 
## 
## 17901 samples
##     7 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 16111, 16112, 16111, 16110, 16109, 16111, ... 
## Resampling results across tuning parameters:
## 
##   mtry  splitrule   RMSE      Rsquared   MAE     
##   2     variance    1019.680  0.6428196  711.9778
##   2     extratrees  1013.522  0.6472031  710.1280
##   4     variance    1021.081  0.6418997  711.6762
##   4     extratrees  1010.648  0.6491610  707.1745
##   7     variance    1025.489  0.6388922  714.4244
##   7     extratrees  1010.546  0.6492227  706.0768
## 
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were mtry = 7 and splitrule
##  = extratrees.

3.3.5.1 Testing Model

3.3.5.1.1 Getting Predictions
predictions_rf <- predict(stack_rf, test)
error <- predictions_rf - test$Item_Outlet_Sales
3.3.5.1.2 Calculating RMSE
sqrt(mean(error^2))
## [1] 1113.06

3.3.6 Bagging Ensemble

# Stack the base models with a bagEarth meta-model, resampled with 10-fold
# cross-validation repeated 3 times, then print the fitted ensemble.
stack_bag <- caretStack(
  models,
  method = "bagEarth",
  trControl = trainControl(
    method = "repeatedcv",
    number = 10,
    repeats = 3,
    savePredictions = TRUE
  )
)
print(stack_bag)
## A bagEarth ensemble of 2 base models: glm, glmnet, lm, ranger, treebag, gbm, bagEarth
## 
## Ensemble results:
## Bagged MARS 
## 
## 17901 samples
##     7 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 16111, 16111, 16111, 16110, 16111, 16110, ... 
## Resampling results across tuning parameters:
## 
##   nprune  RMSE      Rsquared   MAE     
##   2       1098.918  0.5889377  798.7656
##   5       1073.015  0.6046466  758.2355
##   9       1068.837  0.6077034  754.5428
## 
## Tuning parameter 'degree' was held constant at a value of 1
## RMSE was used to select the optimal model using  the smallest value.
## The final values used for the model were nprune = 9 and degree = 1.

3.3.6.1 Testing Model

3.3.6.1.1 Getting Predictions
predictions_bag <- predict(stack_bag, test)
error <- predictions_bag - test$Item_Outlet_Sales
3.3.6.1.2 Calculating RMSE
sqrt(mean(error^2))
## [1] 1083.072

Before the model could be built, the columns Item_Identifier and Outlet_Identifier were removed. These columns are identifiers that are particular to each item and each outlet, so they were left out of the set of predictors. Next, the data was split into a train set and a test set. The train set contains 70% of the data, selected randomly. The rest of the data is in the test set. The test set is used to test the accuracy of the model.

The next step in building the machine learning model to predict future Item_Outlet_Sales was to compare a list of machine learning models. The algorithms in this list included lm, glm, glmnet, treebag, bagEarth, random forest (ranger) and gbm. All of these model types are suitable for regression analysis. When comparing the RMSE, or out-of-sample error, the best performing model was the gbm model. This model had an out-of-sample error of 1085.227.

Although the gbm model could be used for predictions, combining these models should produce better results. Hopefully, an ensemble of the models in the list will use the best parts of each model.

The three different types of ensemble for this report were a glmnet ensemble, a random forest ensemble and a bagEarth ensemble. After these ensembles were created, they were each tested to see which produced the best RMSE. The glmnet ensemble produced an RMSE of 1083.242. The random forest ensemble produced an RMSE of 1113.06. Finally, the bagEarth ensemble produced an RMSE of 1083.072.

4 Conclusion

In the end, the bagEarth ensemble was used to make the final predictions for Item Outlet Sales at Big Mart. This model produced the lowest RMSE. Therefore, it should be the model that will make the best predictions.

There were other conclusions that can be made from this report’s analysis. First there is a moderate correlation between an Item’s MRP at a Big Mart location and that item’s sales at that location. Also the smallest locations produced the lowest sales. However, the largest location did not produce the highest sales. The location that produced the highest sales was the OUT027 location. This location was Supermarket Type3 and its size was medium. This outlet performed much better than any other location. Its median Item_Outlet_Sales were 3364.95. The location that was second was the OUT035 location, which had a median Item_Outlet_Sales of 2109.25.

If Big Mart were to try to increase sales at all locations, it may consider switching more locations to Supermarket Type3. Another thing Big Mart could do to increase sales is to see which items had the highest sales. It may also consider how product visibility affected outlet sales. However, the model built in this report should be good for helping Big Mart predict future sales at its locations.

5 Testing Model

5.1 Loading Test Set

# stringsAsFactors = TRUE keeps the text columns as factors on every R version
# (read.csv stopped converting strings to factors by default in R 4.0.0).
# The factor-based summary shown for this data set depends on that conversion.
testing <- read.csv("test-file.csv", stringsAsFactors = TRUE)

5.2 Manipulating Dataset

# Collapse the inconsistent fat-content labels onto the two canonical ones:
# "LF" and "low fat" become "Low Fat", and "reg" becomes "Regular"
is_low_fat <- testing$Item_Fat_Content %in% c("LF", "low fat")
testing$Item_Fat_Content[is_low_fat] <- "Low Fat"

is_regular <- testing$Item_Fat_Content %in% "reg"
testing$Item_Fat_Content[is_regular] <- "Regular"

# Rebuild the factor so that levels no longer in use are dropped
testing$Item_Fat_Content <- factor(testing$Item_Fat_Content)

# Impute missing values with kNN, then keep only the columns from
# Item_Identifier through Outlet_Type (dropping anything kNN() appended
# after them)
testing_imputed <- kNN(testing) %>%
  select(Item_Identifier:Outlet_Type)

summary(testing_imputed)
##  Item_Identifier  Item_Weight     Item_Fat_Content Item_Visibility  
##  DRF48  :   8    Min.   : 4.555   Low Fat:3668     Min.   :0.00000  
##  FDK57  :   8    1st Qu.: 8.710   Regular:2013     1st Qu.:0.02705  
##  FDN52  :   8    Median :12.350                    Median :0.05415  
##  FDP15  :   8    Mean   :12.650                    Mean   :0.06568  
##  FDQ60  :   8    3rd Qu.:16.500                    3rd Qu.:0.09346  
##  FDW10  :   8    Max.   :21.350                    Max.   :0.32364  
##  (Other):5633                                                       
##                  Item_Type       Item_MRP      Outlet_Identifier
##  Snack Foods          : 789   Min.   : 31.99   OUT027 : 624     
##  Fruits and Vegetables: 781   1st Qu.: 94.41   OUT013 : 621     
##  Household            : 638   Median :141.42   OUT035 : 620     
##  Frozen Foods         : 570   Mean   :141.02   OUT046 : 620     
##  Dairy                : 454   3rd Qu.:186.03   OUT049 : 620     
##  Baking Goods         : 438   Max.   :266.59   OUT045 : 619     
##  (Other)              :2011                    (Other):1957     
##  Outlet_Establishment_Year Outlet_Size   Outlet_Location_Type
##  Min.   :1985                    :1606   Tier 1:1592         
##  1st Qu.:1987              High  : 621   Tier 2:1856         
##  Median :1999              Medium:1862   Tier 3:2233         
##  Mean   :1998              Small :1592                       
##  3rd Qu.:2004                                                
##  Max.   :2009                                                
##                                                              
##             Outlet_Type  
##  Grocery Store    : 722  
##  Supermarket Type1:3717  
##  Supermarket Type2: 618  
##  Supermarket Type3: 624  
##                          
##                          
## 
# The summary above still lists 1606 rows with a blank Outlet_Size.
# Outlet_Size is therefore set per outlet for OUT010, OUT017 and OUT045.
# NOTE(review): the size chosen for each outlet is hard-coded here;
# presumably it mirrors the values used for the training data -- confirm.
small_outlets <- c("OUT010", "OUT017")
is_small <- testing_imputed$Outlet_Identifier %in% small_outlets
testing_imputed$Outlet_Size[is_small] <- "Small"

is_medium <- testing_imputed$Outlet_Identifier %in% "OUT045"
testing_imputed$Outlet_Size[is_medium] <- "Medium"

# Rebuild the factor so that unused levels are dropped from Outlet_Size
testing_imputed$Outlet_Size <- factor(testing_imputed$Outlet_Size)

5.3 Testing Predictions

# Predict Item_Outlet_Sales for the test set and store the predictions as a
# new column.
# NOTE(review): the "_bag" names and the conclusion text refer to the
# bagEarth ensemble, but the model used here is stack_glmnet -- confirm
# which ensemble is intended.
testing_predictions_bag <- predict(stack_glmnet, testing_imputed)
testing_imputed$Item_Outlet_Sales <- testing_predictions_bag

# Keep only the two identifier columns and the predicted sales
submission_bag <- testing_imputed %>%
  select(Item_Identifier, Outlet_Identifier, Item_Outlet_Sales)

dim(submission_bag)
## [1] 5681    3
# Preview the first six rows of the submission table
head(submission_bag, n = 6L)
##   Item_Identifier Outlet_Identifier Item_Outlet_Sales
## 1           FDW58            OUT049         1653.5209
## 2           FDW14            OUT017         1479.9194
## 3           NCN55            OUT010          901.2456
## 4           FDQ58            OUT017         2625.4842
## 5           FDY38            OUT027         5985.9292
## 6           FDH56            OUT046         1781.3209
# Write the submission to CSV without the row-name column
write.csv(
  submission_bag,
  file = "big_mart_predictions.csv",
  row.names = FALSE
)