library(tidyverse)
library(gtools)
library(zoo)
library(e1071)
library(regclass)
library(randomForest)
library(rpart)
library(caTools)
library(fitdistrplus)
library(caret)
con <- read_csv("Desktop/Nov 2020 Data Scientist_Homework Data/Consumers.csv")
con <- as.data.frame(con)
trans <- read.csv("Desktop/Nov 2020 Data Scientist_Homework Data/Transaction Line Items.csv")

Removing Transactions With Returns

returns <- subset(trans, sale_or_return_indicator == "R")

## drop the entire transaction if any line item in it was a return
trans <- trans[!(trans$transaction_id %in% returns$transaction_id), ]
unique(trans$sale_or_return_indicator)
## [1] "S"

Add Category Count Columns

trans$Footwear    <- ifelse(trans$Category == "Footwear", 1, 0)
trans$Apparel     <- ifelse(trans$Category == "Apparel", 1, 0)
trans$Accessories <- ifelse(trans$Category == "Accessories", 1, 0)
head(trans)
##   transaction_date transaction_id customer_id sale_or_return_indicator
## 1       2019-01-01     1557447392          NA                        S
## 2       2019-01-01     1557447593  1371878520                        S
## 3       2019-01-01     1557447846  1378604859                        S
## 4       2019-01-01     1557447846  1378604859                        S
## 5       2019-01-01     1557447846  1378604859                        S
## 6       2019-01-01     1557447920  1375140094                        S
##   tender_type net_retail quantity product_code    Category Footwear Apparel
## 1        GFTC      36.08        1 448512831124     Apparel        0       1
## 2        VISA      49.92        1 449556730252    Footwear        1       0
## 3        PPAL       9.85        1 447979167652 Accessories        0       0
## 4        PPAL      16.15        2 447979167296 Accessories        0       0
## 5        PPAL      21.20        2 448511665642 Accessories        0       0
## 6        PPAL      31.24        1 895832901841    Footwear        1       0
##   Accessories
## 1           0
## 2           0
## 3           1
## 4           1
## 5           1
## 6           0

Sum Up Transactions To Join On Consumer Data

x <- trans %>% group_by(customer_id) %>% 
  summarise(sum_retail = sum(net_retail), 
            sum_quantity = sum(quantity),
            sum_trans = n_distinct(transaction_id),
            sum_footwear = sum(Footwear),
            sum_apparel = sum(Apparel),
            sum_accessories = sum(Accessories))
x <- as.data.frame(x)
head(x)
##   customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 1  1312310343       8.79            1         1            0           0
## 2  1312311029      64.63            2         1            1           0
## 3  1312311101      41.22            1         1            1           0
## 4  1312311169      49.28            2         1            0           2
## 5  1312312035      41.68            1         1            1           0
## 6  1312312250     131.10            6         2            3           2
##   sum_accessories
## 1               1
## 2               1
## 3               0
## 4               0
## 5               0
## 6               0
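
The helper 0/1 columns are convenient, but the same per-customer summary can be computed straight from Category; a sketch with the same column names, equivalent to x above:

x2 <- trans %>% group_by(customer_id) %>% 
  summarise(sum_retail = sum(net_retail), 
            sum_quantity = sum(quantity),
            sum_trans = n_distinct(transaction_id),
            sum_footwear = sum(Category == "Footwear"),
            sum_apparel = sum(Category == "Apparel"),
            sum_accessories = sum(Category == "Accessories"))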

Join on Consumer Data

dat <- left_join(x,con, by = "customer_id")

dat <- dat[order(-dat$sum_retail),]

head(dat)
##        customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 250662          NA 8682173.39       232782    138023       128737       44154
## 21428   1331753343  132337.76         4612       381         2464        1194
## 84750   1374526609   27465.38          462       101          404          26
## 12075   1323568849   20250.58          706        91          346         158
## 52247   1358781919   18793.94          876        20          592         198
## 85065   1374673179   14158.08          516        66          334         142
##        sum_accessories Gender State Has App Age MARITAL STATUS
## 250662           38739   <NA>  <NA>      NA  NA           <NA>
## 21428              440      M    CA       0  31           <NA>
## 84750                0      M    AL       1  24           <NA>
## 12075               38      M    CA       0  35           <NA>
## 52247               80      M    TX       1  NA           <NA>
## 85065               36      M    CA       1  33           <NA>
##        EST HOUSEHOLD INCOME V5 NUMBER OF PERSONS IN LIVING UNIT
## 250662                    <NA>                               NA
## 21428                        E                                1
## 84750                     <NA>                               NA
## 12075                        U                                0
## 52247                     <NA>                               NA
## 85065                     <NA>                               NA
##        NUMBER OF ADULTS IN LIVING UNIT CHILDREN: AGE 0-18 VERSION 3
## 250662                              NA                         <NA>
## 21428                                1                           5U
## 84750                               NA                         <NA>
## 12075                                0                           00
## 52247                               NA                         <NA>
## 85065                               NA                         <NA>
##        Nearest Outlet DistanceMiles Nearest Retail DistanceMiles
## 250662                           NA                           NA
## 21428                           9.3                          3.7
## 84750                            NA                           NA
## 12075                          10.4                          1.6
## 52247                            NA                           NA
## 85065                            NA                           NA
## drop rows with a missing customer_id (transactions with no customer attached)
dat <- subset(dat, !is.na(customer_id))
options(scipen=10000)
ggplot(dat, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
              ylab("Sum of Retail")+
              theme(plot.title = element_text(hjust = 0.5))

## get rid of very extreme outlier
dat2 <- dat[-1,]

summary(dat2$sum_retail)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00    47.65    65.57    87.85   102.41 27465.38

Look For Outliers

## 2.5 sd above the mean
mean(dat2$sum_retail) + 2.5*sd(dat2$sum_retail)
## [1] 407.8555
dat2 <- dat2[dat2$sum_retail < 408,]
summary(dat2$sum_retail)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   47.48   65.22   82.22  100.72  407.93
options(scipen=10000)
ggplot(dat2, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
              ylab("Sum of Retail")+
              theme(plot.title = element_text(hjust = 0.5))

We Still Have Some Outliers, but the Boxplot Looks Better Now and We Have Retained 99% of the Data

nrow(dat2) / nrow(dat)
## [1] 0.9911514
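
As a cross-check on the 2.5 standard deviation rule, a Tukey-style IQR cutoff (not used in this analysis) can be computed the same way:

q <- quantile(dat2$sum_retail, c(0.25, 0.75))
q[2] + 1.5 * (q[2] - q[1])   ## IQR-based upper fence, for comparison with the 408 cutoff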

Retail Plot

ggplot(dat2, aes(x = sum_retail)) + 
  geom_density() +
  ggtitle("Retail Distribution") +
  ylab("Density") + xlab("Retail") +
  theme(plot.title = element_text(hjust = 0.5))

High Value Customers

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   47.48   65.22   82.22  100.72  407.93
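
Rather than hardcoding the cutoff in the next step, the 75th percentile can be taken directly from the data (a sketch; it is roughly 100.72 here, which is why 100 is used below):

cutoff <- quantile(dat2$sum_retail, 0.75)
cutoff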

Let's Look at the Top 25%

dat2 <- dat2[dat2$sum_retail > 100, ]
summary(dat2$sum_retail)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   100.0   117.1   140.8   161.6   185.8   407.9
ggplot(dat2, aes(x = sum_retail)) + 
  geom_density() +
  ggtitle("Top 25th Percent Retail Distribution") +
  ylab("Density") + xlab("Retail") +
  theme(plot.title = element_text(hjust = 0.5))

Gender

library(kableExtra)

## gender subsets; "Unknown" is taken to be a missing value or the "U" code
u <- subset(dat2, is.na(Gender) | Gender == "U")
f <- subset(dat2, Gender == "F")
m <- subset(dat2, Gender == "M")

gender <- matrix(c(length(u$customer_id)/length(dat2$customer_id),
                   length(f$customer_id)/length(dat2$customer_id),
                   length(m$customer_id)/length(dat2$customer_id)),
                 ncol = 1)
gender <- round(gender, 2)

colnames(gender) <- c("Gender")
rownames(gender) <- c("Unknown", "Female", "Male")
gender %>% kbl(caption = "<center><strong>Gender Breakdown</strong></center>") %>%
  kable_styling(bootstrap_options = c("striped", "border"))
Gender Breakdown

          Gender
Unknown     0.19
Female      0.53
Male        0.27
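
The same breakdown can be produced in one line with base R, treating missing gender as its own level:

round(prop.table(table(dat2$Gender, useNA = "ifany")), 2)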

Has App

library(kableExtra)
## impute missing Has App with the rounded mean (effectively the majority class)
dat2$`Has App` <- round(na.aggregate(dat2$`Has App`), 0)
no_app <- subset(dat2, dat2$`Has App` == 0)
has_app <- subset(dat2, dat2$`Has App` == 1)



app <- matrix(c(length(has_app$customer_id)/length(dat2$customer_id),
                length(no_app$customer_id)/length(dat2$customer_id)))
app <- round(app,2)
colnames(app) <- "Has App"
rownames(app) <- c("Has App", "No App")

app %>% kbl(caption = "<center><strong>App Breakdown</strong></center>") %>%
  kable_styling(bootstrap_options = c("striped", "border"))
App Breakdown

          Has App
Has App      0.12
No App       0.88

Age

age_noNA <- subset(dat2, !is.na(Age))
summary(age_noNA$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   23.00   33.00   33.89   43.00   83.00
ggplot(age_noNA, aes(x = Age)) + 
  geom_density() +
  ggtitle("Top 25th Percent Age Distribution") +
  ylab("Density") + xlab("Age") +
  theme(plot.title = element_text(hjust = 0.5))

Marital Status

No Sign of Marital Status Being Significant

unique(dat2$`MARITAL STATUS`)
## [1] NA   "1M" "5M" "0U" "5S" "5U"
marital_status <- subset(dat2, !is.na(`MARITAL STATUS`))
marital_status$`MARITAL STATUS` <- as.factor(marital_status$`MARITAL STATUS`)
plot(marital_status$`MARITAL STATUS`, marital_status$sum_retail,
     xlab = "Marital Status", ylab = "Retail")
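
A one-way ANOVA gives a quick formal check of the visual impression (a sketch, not part of the original write-up):

summary(aov(sum_retail ~ `MARITAL STATUS`, data = marital_status))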

Household Income

No Sign of Income Being Significant

unique(dat2$`EST HOUSEHOLD INCOME V5`)
##  [1] "U" "J" "F" NA  "E" "K" "A" "G" "H" "B" "L" "D" "C" "I"
income <- subset(dat2, !is.na(`EST HOUSEHOLD INCOME V5`))
income$`EST HOUSEHOLD INCOME V5` <- as.factor(income$`EST HOUSEHOLD INCOME V5`)
plot(income$`EST HOUSEHOLD INCOME V5`, income$sum_retail,
     xlab = "Estimated Household Income Code", ylab = "Retail")

Distance to Outlet or Retail

There Does Seem to Be a Grouping Among Customers Who Are Closer to a Retail or Outlet Store

## Nearest Outlet
nearest <- subset(dat2, !is.na(`Nearest Outlet DistanceMiles`))
plot(nearest$`Nearest Outlet DistanceMiles`, nearest$sum_retail,
     xlab = "Nearest Outlet Distance in Miles",
     ylab = "Retail")

## Nearest Retail
retail <- subset(dat2, !is.na(`Nearest Retail DistanceMiles`))
plot(retail$`Nearest Retail DistanceMiles`, retail$sum_retail, 
     xlab = "Nearest Retail Distance in Miles",
     ylab = "Retail")

Replacing NAs

# Age
## reset dat2 to the full customer set (minus the extreme outlier removed earlier),
## so the NA imputation below runs on all customers rather than just the top 25%
dat2 <- dat[-1,]


age <- subset(dat2, !is.na(Age))

summary(age$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   22.00   31.00   32.77   42.00   90.00
## replacing NA with median or mean won't make much difference
dat2$Age <- round(na.aggregate(dat2$Age),0) 


# Has App
dat2$`Has App` <- round(na.aggregate(dat2$`Has App`), 0)
# Nearest Outlet
nearest <- subset(dat2, !is.na(`Nearest Outlet DistanceMiles`))
summary(nearest$`Nearest Outlet DistanceMiles`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.10   10.50   19.60   41.62   41.40 1809.80
##plot(density(nearest$`Nearest Outlet DistanceMiles`))

## replace with the median not the mean since it is skewed

dat2$`Nearest Outlet DistanceMiles` <- na.aggregate(dat2$`Nearest Outlet DistanceMiles`, FUN = median)

# Nearest Retail
retail <- subset(dat2, !is.na(`Nearest Retail DistanceMiles`))
summary(retail$`Nearest Retail DistanceMiles`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.30    6.30   14.28   13.10  647.20
##plot(density(retail$`Nearest Retail DistanceMiles`))

## will replace with median also as the data is skewed
dat2$`Nearest Retail DistanceMiles` <- na.aggregate(dat2$`Nearest Retail DistanceMiles`, FUN = median)
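
A quick sanity check that the imputation left no missing values in the columns used for modelling:

colSums(is.na(dat2[, c("Age", "Has App",
                       "Nearest Outlet DistanceMiles",
                       "Nearest Retail DistanceMiles")]))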

Model

library(e1071)
library(regclass)
library(randomForest)
library(rpart)
## drop customers with unknown gender ("U" or missing)
dat2 <- dat2[!is.na(dat2$Gender) & dat2$Gender != "U", ]
## keep the response, the transaction summaries, Gender, Has App, Age, and the two distance columns
data_final <- dat2[, c(2:8, 10:11, 17, 18)]
data_final <- data_final[data_final$sum_retail > 0 ,]
data_final$Gender <- as.factor(data_final$Gender)
data_final$`Has App` <- as.factor(data_final$`Has App`)
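
For reference, the positional index above corresponds to selecting these columns by name, which is less fragile if the file layout ever changes (a sketch, kept separate from data_final):

model_cols <- c("sum_retail", "sum_quantity", "sum_trans",
                "sum_footwear", "sum_apparel", "sum_accessories",
                "Gender", "Has App", "Age",
                "Nearest Outlet DistanceMiles", "Nearest Retail DistanceMiles")
data_final_by_name <- dat2[, model_cols]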

Linear Model

## lm
datout <- lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear + 
    sum_apparel + sum_accessories + Gender + `Has App` + Age + 
    `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`, 
    data = data_final)

summary(datout)
## 
## Call:
## lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear + 
##     sum_apparel + sum_accessories + Gender + `Has App` + Age + 
##     `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`, 
##     data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4369.1   -19.8    -4.5    14.8 11178.7 
## 
## Coefficients:
##                                  Estimate Std. Error  t value
## (Intercept)                     10.731919   0.644607   16.649
## sum_quantity                    33.721403   0.150554  223.983
## sum_trans                        8.723931   0.132603   65.790
## sum_footwear                     1.039640   0.167544    6.205
## sum_apparel                    -23.424797   0.183919 -127.365
## sum_accessories                -31.826785   0.186683 -170.486
## GenderM                          2.259359   0.266508    8.478
## `Has App`1                      -3.530016   0.404177   -8.734
## Age                              0.238447   0.017943   13.289
## `Nearest Outlet DistanceMiles`   0.015577   0.002997    5.198
## `Nearest Retail DistanceMiles`  -0.032689   0.008210   -3.981
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## sum_quantity                   < 0.0000000000000002 ***
## sum_trans                      < 0.0000000000000002 ***
## sum_footwear                         0.000000000547 ***
## sum_apparel                    < 0.0000000000000002 ***
## sum_accessories                < 0.0000000000000002 ***
## GenderM                        < 0.0000000000000002 ***
## `Has App`1                     < 0.0000000000000002 ***
## Age                            < 0.0000000000000002 ***
## `Nearest Outlet DistanceMiles`       0.000000201469 ***
## `Nearest Retail DistanceMiles`       0.000068539930 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54.51 on 185164 degrees of freedom
## Multiple R-squared:  0.8576, Adjusted R-squared:  0.8576 
## F-statistic: 1.116e+05 on 10 and 185164 DF,  p-value: < 0.00000000000000022
plot(datout$fitted.values, datout$residuals) ## Not linear
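
The full set of base-R diagnostics for the same fit makes the problem easier to see:

par(mfrow = c(2, 2))
plot(datout)   ## residuals vs fitted, normal Q-Q, scale-location, leverage
par(mfrow = c(1, 1))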

The Data Is Not Normal, so I Will Try a Log Transformation and a Box-Cox Transformation

The Box-Cox transformation looks the best, but it is still not great, with an R^2 of 0.56

library(MASS) ## boxcox() comes from MASS
boxcox(sum_retail ~ ., data = data_final)

datout  <- lm(log(sum_retail) ~ ., data = data_final)    ## log model
datout2 <- lm((sum_retail)^0.6 ~ ., data = data_final)   ## power model suggested by the Box-Cox plot
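
The 0.6 exponent is read off the peak of the boxcox curve; it can also be pulled out programmatically (a sketch, assuming the peak is what motivated the exponent above):

bc <- boxcox(sum_retail ~ ., data = data_final, plotit = FALSE)
bc$x[which.max(bc$y)]   ## lambda with the highest profile log-likelihood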

summary(datout)
## 
## Call:
## lm(formula = log(sum_retail) ~ ., data = data_final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29.3397  -0.2812   0.0244   0.4125   2.5424 
## 
## Coefficients:
##                                   Estimate  Std. Error t value
## (Intercept)                     3.82765800  0.00789983 484.524
## sum_quantity                    0.02094955  0.00184508  11.354
## sum_trans                       0.13508326  0.00162509  83.123
## sum_footwear                    0.02424722  0.00205330  11.809
## sum_apparel                     0.00158193  0.00225398   0.702
## sum_accessories                -0.01198392  0.00228785  -5.238
## GenderM                        -0.00596852  0.00326613  -1.827
## `Has App`1                      0.00622478  0.00495330   1.257
## Age                             0.00476511  0.00021990  21.669
## `Nearest Outlet DistanceMiles`  0.00007065  0.00003672   1.924
## `Nearest Retail DistanceMiles` -0.00038147  0.00010062  -3.791
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## sum_quantity                   < 0.0000000000000002 ***
## sum_trans                      < 0.0000000000000002 ***
## sum_footwear                   < 0.0000000000000002 ***
## sum_apparel                                 0.48278    
## sum_accessories                         0.000000162 ***
## GenderM                                     0.06764 .  
## `Has App`1                                  0.20887    
## Age                            < 0.0000000000000002 ***
## `Nearest Outlet DistanceMiles`              0.05437 .  
## `Nearest Retail DistanceMiles`              0.00015 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.668 on 185164 degrees of freedom
## Multiple R-squared:  0.2033, Adjusted R-squared:  0.2033 
## F-statistic:  4725 on 10 and 185164 DF,  p-value: < 0.00000000000000022
summary(datout2)
## 
## Call:
## lm(formula = (sum_retail)^0.6 ~ ., data = data_final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -296.420   -2.579   -0.612    2.206   81.288 
## 
## Coefficients:
##                                  Estimate Std. Error t value
## (Intercept)                     8.8291599  0.0559730 157.740
## sum_quantity                    0.8279460  0.0130730  63.333
## sum_trans                       1.6078287  0.0115143 139.637
## sum_footwear                    0.0350833  0.0145483   2.412
## sum_apparel                    -0.3951664  0.0159702 -24.744
## sum_accessories                -0.5896443  0.0162102 -36.375
## GenderM                         0.0320045  0.0231417   1.383
## `Has App`1                      0.2001191  0.0350958   5.702
## Age                             0.0411432  0.0015581  26.406
## `Nearest Outlet DistanceMiles`  0.0006042  0.0002602   2.322
## `Nearest Retail DistanceMiles` -0.0029088  0.0007129  -4.080
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## sum_quantity                   < 0.0000000000000002 ***
## sum_trans                      < 0.0000000000000002 ***
## sum_footwear                                 0.0159 *  
## sum_apparel                    < 0.0000000000000002 ***
## sum_accessories                < 0.0000000000000002 ***
## GenderM                                      0.1667    
## `Has App`1                             0.0000000119 ***
## Age                            < 0.0000000000000002 ***
## `Nearest Outlet DistanceMiles`               0.0202 *  
## `Nearest Retail DistanceMiles`         0.0000450564 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.733 on 185164 degrees of freedom
## Multiple R-squared:  0.5619, Adjusted R-squared:  0.5619 
## F-statistic: 2.375e+04 on 10 and 185164 DF,  p-value: < 0.00000000000000022
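
The 0.56 R^2 is on the transformed scale; a rough check on the original dollar scale (a sketch) is to back-transform the fitted values and correlate them with the observed retail:

obs_retail  <- model.frame(datout2)[[1]]^(1 / 0.6)
pred_retail <- pmax(fitted(datout2), 0)^(1 / 0.6)   ## clamp at zero before back-transforming
cor(pred_retail, obs_retail)^2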

Random Forest

## train and testing split (set up here, though the saved forest below was fit on the full data_final)
# sample = sample.split(data_final$sum_retail, SplitRatio = .75)
# train.data = subset(data_final, sample == TRUE)
# test.data  = subset(data_final, sample == FALSE)
# datout4 <- randomForest(data_final$sum_retail~data_final$sum_quantity +
#                           data_final$sum_trans + data_final$sum_footwear +
#                           data_final$sum_apparel + data_final$sum_accessories +
#                           data_final$Gender + data_final$`Has App` +
#                           data_final$Age +
#                           data_final$`Nearest Outlet DistanceMiles` +
#                           data_final$`Nearest Retail DistanceMiles`,
#                         ntree = 30, mtry = 5, max.depth = 5, min.node.size = 5)
# datout4
## load the pre-trained forest saved from the commented-out call above
datoutFinal <- readRDS("./vansFinalModel.rds")
datoutFinal
## 
## Call:
##  randomForest(formula = data_final$sum_retail ~ data_final$sum_quantity +      data_final$sum_trans + data_final$sum_footwear + data_final$sum_apparel +      data_final$sum_accessories + data_final$Gender + data_final$`Has App` +      data_final$Age + data_final$`Nearest Outlet DistanceMiles` +      data_final$`Nearest Retail DistanceMiles`, ntree = 30, mtry = 5,      max.depth = 5, min.node.size = 5) 
##                Type of random forest: regression
##                      Number of trees: 30
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 4037.324
##                     % Var explained: 80.65
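
A natural follow-up on the saved forest is variable importance, using the randomForest helpers (a sketch; output not shown):

importance(datoutFinal)   ## IncNodePurity per predictor
varImpPlot(datoutFinal, main = "Random Forest Variable Importance")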