library(tidyverse)
library(gtools)
library(zoo)
library(e1071)
library(regclass)
library(randomForest)
library(rpart)
library(caTools)
library(fitdistrplus)
library(caret)
con <- read_csv("Desktop/Nov 2020 Data Scientist_Homework Data/Consumers.csv")
con <- as.data.frame(con)
trans <- read.csv("Desktop/Nov 2020 Data Scientist_Homework Data/Transaction Line Items.csv")

Removing Transactions With Returns

returns <- subset(trans, trans$sale_or_return_indicator == "R")

trans = trans[!(trans$transaction_id %in% returns$transaction_id),]
unique(trans$sale_or_return_indicator)

## [1] "S"

Add Category Count Columns

trans$Footwear = ifelse(trans$Category == "Footwear", 1,0)
trans$Apparel <- ifelse(trans$Category == "Apparel", 1,0)
trans$Accessories <- ifelse(trans$Category == "Accessories", 1,0)
head(trans)

##   transaction_date transaction_id customer_id sale_or_return_indicator
## 1       2019-01-01     1557447392          NA                        S
## 2       2019-01-01     1557447593  1371878520                        S
## 3       2019-01-01     1557447846  1378604859                        S
## 4       2019-01-01     1557447846  1378604859                        S
## 5       2019-01-01     1557447846  1378604859                        S
## 6       2019-01-01     1557447920  1375140094                        S
##   tender_type net_retail quantity product_code    Category Footwear Apparel
## 1        GFTC      36.08        1 448512831124     Apparel        0       1
## 2        VISA      49.92        1 449556730252    Footwear        1       0
## 3        PPAL       9.85        1 447979167652 Accessories        0       0
## 4        PPAL      16.15        2 447979167296 Accessories        0       0
## 5        PPAL      21.20        2 448511665642 Accessories        0       0
## 6        PPAL      31.24        1 895832901841    Footwear        1       0
##   Accessories
## 1           0
## 2           0
## 3           1
## 4           1
## 5           1
## 6           0

Sum Up Transactions To Join On Consumer Data

x <- trans %>% group_by(customer_id) %>% 
  summarise(sum_retail = sum(net_retail), 
            sum_quantity = sum(quantity),
            sum_trans = n_distinct(transaction_id),
            sum_footwear = sum(Footwear),
            sum_apparel = sum(Apparel),
            sum_accessories = sum(Accessories))
x <- as.data.frame(x)
head(x)

##   customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 1  1312310343       8.79            1         1            0           0
## 2  1312311029      64.63            2         1            1           0
## 3  1312311101      41.22            1         1            1           0
## 4  1312311169      49.28            2         1            0           2
## 5  1312312035      41.68            1         1            1           0
## 6  1312312250     131.10            6         2            3           2
##   sum_accessories
## 1               1
## 2               1
## 3               0
## 4               0
## 5               0
## 6               0

Join on Consumer Data

dat <- left_join(x,con, by = "customer_id")

dat <- dat[order(-dat$sum_retail),]

head(dat)

##        customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 250662          NA 8682173.39       232782    138023       128737       44154
## 21428   1331753343  132337.76         4612       381         2464        1194
## 84750   1374526609   27465.38          462       101          404          26
## 12075   1323568849   20250.58          706        91          346         158
## 52247   1358781919   18793.94          876        20          592         198
## 85065   1374673179   14158.08          516        66          334         142
##        sum_accessories Gender State Has App Age MARITAL STATUS
## 250662           38739   <NA>  <NA>      NA  NA           <NA>
## 21428              440      M    CA       0  31           <NA>
## 84750                0      M    AL       1  24           <NA>
## 12075               38      M    CA       0  35           <NA>
## 52247               80      M    TX       1  NA           <NA>
## 85065               36      M    CA       1  33           <NA>
##        EST HOUSEHOLD INCOME V5 NUMBER OF PERSONS IN LIVING UNIT
## 250662                    <NA>                               NA
## 21428                        E                                1
## 84750                     <NA>                               NA
## 12075                        U                                0
## 52247                     <NA>                               NA
## 85065                     <NA>                               NA
##        NUMBER OF ADULTS IN LIVING UNIT CHILDREN: AGE 0-18 VERSION 3
## 250662                              NA                         <NA>
## 21428                                1                           5U
## 84750                               NA                         <NA>
## 12075                                0                           00
## 52247                               NA                         <NA>
## 85065                               NA                         <NA>
##        Nearest Outlet DistanceMiles Nearest Retail DistanceMiles
## 250662                           NA                           NA
## 21428                           9.3                          3.7
## 84750                            NA                           NA
## 12075                          10.4                          1.6
## 52247                            NA                           NA
## 85065                            NA                           NA

## get rid of NA Customer ID's
dat <- subset(dat, dat$customer_id != "NA")

options(scipen=10000)
ggplot(dat, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
              ylab("Sum of Retail")+
              theme(plot.title = element_text(hjust = 0.5))

## get rid of very extreme outlier
dat2 <- dat[-1,]

summary(dat2$sum_retail)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00    47.65    65.57    87.85   102.41 27465.38

Look For Outliers

## 2.5 sd above the mean
mean(dat2$sum_retail) + 2.5*sd(dat2$sum_retail)

## [1] 407.8555

dat2 <- dat2[dat2$sum_retail < 408,]

summary(dat2$sum_retail)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   47.48   65.22   82.22  100.72  407.93

options(scipen=10000)
ggplot(dat2, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
              ylab("Sum of Retail")+
              theme(plot.title = element_text(hjust = 0.5))

We Still Have Some Outliers, but the Boxplot Looks Better Now and We Have Maintained 99% of the Data

length(dat2$customer_id)/
length(dat$customer_id)

## [1] 0.9911514

Density Plot

ggplot(dat2, aes(x = sum_retail)) + 
  geom_density(outlier.colour="black",
               outlier.shape=16,
               outlier.size=2, notch=FALSE) +
  ggtitle("Retail Distribution") +
  ylab("Density") + xlab("Retail")+
              theme(plot.title = element_text(hjust = 0.5))

## Warning: Ignoring unknown parameters: outlier.colour, outlier.shape,
## outlier.size, notch

High Value Customers

summary(dat2$sum_retail)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   47.48   65.22   82.22  100.72  407.93

Lets Look At the Top 25%

dat2 = dat2[dat2$sum_retail > 100,]
summary(dat2$sum_retail)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   100.0   117.1   140.8   161.6   185.8   407.9

ggplot(dat2, aes(x = sum_retail)) + 
  geom_density(outlier.colour="black",
               outlier.shape=16,
               outlier.size=2, notch=FALSE) +
  ggtitle("Top 25th Percent Retail Distribution") +
  ylab("Density") + xlab("Retail")+
              theme(plot.title = element_text(hjust = 0.5))

## Warning: Ignoring unknown parameters: outlier.colour, outlier.shape,
## outlier.size, notch

Gender

# library(kableExtra)
# u <- subset(dat2, dat2$Gender == "U")
# f <- subset(dat2, dat2$Gender == "F")
# m <- subset(dat2, dat2$Gender == "M")
# 
# length(u$customer_id)/length(dat2$customer_id)
# length(f$customer_id)/length(dat2$customer_id)
# length(m$customer_id)/length(dat2$customer_id)
# 
# gender <- matrix(c(length(u$customer_id)/length(dat2$customer_id),
#                    length(f$customer_id)/length(dat2$customer_id),
#                    length(m$customer_id)/length(dat2$customer_id)),
#                  ncol = 1)
# gender <- round(gender,2)
# 
# colnames(gender) <- c("Gender")
# rownames(gender) <- c("Unknown", "Female", "Male")
# gender %>% kbl(caption = "<center><strong>Gender Breakdown<strong><center") %>% 
#   kable_styling(bootstrap_options = c("striped", "border"))
#

Has App

# library(kableExtra)
# dat2$`Has App` <- round(na.aggregate(dat2$`Has App`),0)
# no_app <- subset(dat2, dat2$`Has App` == 0)
# has_app <- subset(dat2, dat2$`Has App` == 1)
# 
# length(has_app$customer_id)/length(dat2$customer_id) ## 13.5 of high customers have app
# 
# app <- matrix(c(length(has_app$customer_id)/length(dat2$customer_id),
#                 length(no_app$customer_id)/length(dat2$customer_id)))
# app <- round(app,2)
# colnames(app) <- "Has App"
# rownames(app) <- c("Has App", "No App")
# 
# app %>% kbl(caption = "<center><strong>App Breakdown<strong><center") %>% 
#   kable_styling(bootstrap_options = c("striped", "border"))
# 
# 
# ggplot(dat2, aes(x="", y=dat2$`Has App`, fill=dat2$`Has App`)) +
#   geom_bar(stat="identity", width=1) +
#   coord_polar("y", start=0)

Age

age_noNA <- subset(dat2, dat2$Age != "NA")
summary(age_noNA$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   23.00   33.00   33.89   43.00   83.00

plot(density(age_noNA$Age))

ggplot(age_noNA, aes(x = Age)) + 
  geom_density(outlier.colour="black",
               outlier.shape=16,
               outlier.size=2, notch=FALSE) +
  ggtitle("Top 25th Percent Age Distribution") +
  ylab("Density") + xlab("Age")+
              theme(plot.title = element_text(hjust = 0.5))

## Warning: Ignoring unknown parameters: outlier.colour, outlier.shape,
## outlier.size, notch

Marital Status

unique(dat2$`MARITAL STATUS`)

## [1] NA   "1M" "5M" "0U" "5S" "5U"

marital_status <- subset(dat2, dat2$`MARITAL STATUS` != "<NA>")
marital_status$`MARITAL STATUS` <- as.factor(marital_status$`MARITAL STATUS`)
plot(marital_status$`MARITAL STATUS`, marital_status$sum_retail)

## No Sign of Marital Status Being Significant

Household Income

unique(dat2$`EST HOUSEHOLD INCOME V5`)

##  [1] "U" "J" "F" NA  "E" "K" "A" "G" "H" "B" "L" "D" "C" "I"

income <- subset(dat2, dat2$`EST HOUSEHOLD INCOME V5` != "NA")
income$`EST HOUSEHOLD INCOME V5` <- as.factor(income$`EST HOUSEHOLD INCOME V5`)
plot(income$`EST HOUSEHOLD INCOME V5`, income$sum_retail)

## No Sign of Income Being Significant

Distance to Outlet or Retail

## Nearest Outlet
nearest <- subset(dat2, dat2$`Nearest Outlet DistanceMiles` != "NA")
plot(nearest$`Nearest Outlet DistanceMiles`, nearest$sum_retail)

## Nearest Retail
retail <- subset(dat2, dat2$`Nearest Retail DistanceMiles` != "NA")
plot(retail$`Nearest Retail DistanceMiles`, retail$sum_retail)

## There Does Seem to Be a Grouping When Closer to Retail or Outlet

Replacing NA’s

# Age
## get rid of very extreme outlier
dat2 <- dat[-1,]
dat2 <- dat2[dat2$sum_retail < 408,]

age <- subset(dat2, dat2$Age != "NA")
plot(density(age$Age))

summary(age$Age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   22.00   31.00   32.75   42.00   90.00

## replacing NA with median or mean won't make much difference
dat2$Age <- round(na.aggregate(dat2$Age),0) 


dat2$`Has App` <- round(na.aggregate(dat2$`Has App`),0)
# Nearest Outlet
nearest <- subset(dat2, dat2$`Nearest Outlet DistanceMiles` != "NA")
summary(nearest$`Nearest Outlet DistanceMiles`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.10   10.60   19.70   41.73   41.60 1809.80

plot(density(nearest$`Nearest Outlet DistanceMiles`))

## replace with the median not the mean since it is skewed

dat2$`Nearest Outlet DistanceMiles` <- na.aggregate(dat2$`Nearest Outlet DistanceMiles`, FUN = median)

# Nearest Retail
retail <- subset(dat2, dat2$`Nearest Retail DistanceMiles` != "NA")
summary(retail$`Nearest Retail DistanceMiles`)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.30    6.30   14.34   13.10  647.20

plot(density(retail$`Nearest Retail DistanceMiles`))

## will replace with median also as the data is skewed
dat2$`Nearest Retail DistanceMiles` <- na.aggregate(dat2$`Nearest Retail DistanceMiles`, FUN = median)

Model

library(e1071)
library(regclass)
library(randomForest)
library(rpart)
dat2 <- dat2[dat2$Gender != "U",]
data_final <- dat2[,c(2:8,10:11,17,18)]
data_final <- data_final[data_final$sum_retail > 0 ,]
data_final$Gender <- as.factor(data_final$Gender)
data_final$`Has App` <- as.factor(data_final$`Has App`)

LM

## lm
datout <- lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear + 
    sum_apparel + sum_accessories + Gender + `Has App` + Age + 
    `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`, 
    data = data_final)

summary(datout)

## 
## Call:
## lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear + 
##     sum_apparel + sum_accessories + Gender + `Has App` + Age + 
##     `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`, 
##     data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -706.84  -17.03   -2.91   13.77  302.76 
## 
## Coefficients:
##                                 Estimate Std. Error t value
## (Intercept)                     8.289412   0.388054  21.362
## sum_quantity                   11.889159   0.130557  91.065
## sum_trans                      11.595716   0.142089  81.609
## sum_footwear                   24.877895   0.153452 162.121
## sum_apparel                     6.937089   0.154520  44.894
## sum_accessories                -4.499599   0.164970 -27.275
## GenderM                         2.443701   0.154533  15.813
## `Has App`1                     -5.385020   0.236712 -22.749
## Age                             0.060565   0.010461   5.790
## `Nearest Outlet DistanceMiles`  0.016461   0.001737   9.479
## `Nearest Retail DistanceMiles` -0.045285   0.004750  -9.534
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## sum_quantity                   < 0.0000000000000002 ***
## sum_trans                      < 0.0000000000000002 ***
## sum_footwear                   < 0.0000000000000002 ***
## sum_apparel                    < 0.0000000000000002 ***
## sum_accessories                < 0.0000000000000002 ***
## GenderM                        < 0.0000000000000002 ***
## `Has App`1                     < 0.0000000000000002 ***
## Age                                   0.00000000706 ***
## `Nearest Outlet DistanceMiles` < 0.0000000000000002 ***
## `Nearest Retail DistanceMiles` < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.4 on 183108 degrees of freedom
## Multiple R-squared:  0.7372, Adjusted R-squared:  0.7372 
## F-statistic: 5.136e+04 on 10 and 183108 DF,  p-value: < 0.00000000000000022

plot(datout$fitted.values, datout$residuals) ## Not linear

Data is not normal so I will try a LogNormal and a Gamma Model after checking the Cullen Frey Graph

library(caret)
boxcox(sum_retail~.,data=data_final)

datout <- lm(log(sum_retail)~.,data = data_final) ## Log Model
datout2 <- lm((sum_retail)^0.6~.,data=data_final) ##

Random Forest

## train and testing
# sample = sample.split(data_final$sum_retail, SplitRatio = .75)
# train.data = subset(data_final, sample == TRUE)
# test.data  = subset(data_final, sample == FALSE)
# datout4 <- randomForest(data_final$sum_retail~data_final$sum_quantity +
#                           data_final$sum_trans + data_final$sum_footwear +
#                           data_final$sum_apparel + data_final$sum_accessories +
#                           data_final$Gender + data_final$`Has App` +
#                           data_final$Age +
#                           data_final$`Nearest Outlet DistanceMiles` +
#                           data_final$`Nearest Retail DistanceMiles`,
#                         ntree = 30, mtry = 5, max.depth = 5, min.node.size = 5)
# datout4

datoutFinal <- readRDS("./vansFinalModel.rds")
datoutFinal

## 
## Call:
##  randomForest(formula = data_final$sum_retail ~ data_final$sum_quantity +      data_final$sum_trans + data_final$sum_footwear + data_final$sum_apparel +      data_final$sum_accessories + data_final$Gender + data_final$`Has App` +      data_final$Age + data_final$`Nearest Outlet DistanceMiles` +      data_final$`Nearest Retail DistanceMiles`, ntree = 30, mtry = 5,      max.depth = 5, min.node.size = 5) 
##                Type of random forest: regression
##                      Number of trees: 30
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 930.4626
##                     % Var explained: 75.2

Vans Project

Removing Transactions With Returns

Add Category Count Columns

Sum Up Transactions To Join On Consumer Data

Join on Consumer Data

Look For Outliers

We Still Have Some Outliers, but the Boxplot Looks Better Now and We Have Maintained 99% of the Data

Density Plot

High Value Customers

Lets Look At the Top 25%

Gender

Has App

Age

Marital Status

Household Income

Distance to Outlet or Retail

Replacing NA’s

Model

LM

Data is not normal so I will try a LogNormal and a Gamma Model after checking the Cullen Frey Graph

Random Forest