library(tidyverse)
library(gtools)
library(zoo)
library(e1071)
library(regclass)
library(randomForest)
library(rpart)
library(caTools)
library(fitdistrplus)
library(caret)
con <- read_csv("Desktop/Nov 2020 Data Scientist_Homework Data/Consumers.csv")
con <- as.data.frame(con)
trans <- read.csv("Desktop/Nov 2020 Data Scientist_Homework Data/Transaction Line Items.csv")

Removing Transactions With Returns

returns <- subset(trans, trans$sale_or_return_indicator == "R")

trans = trans[!(trans$transaction_id %in% returns$transaction_id),]
unique(trans$sale_or_return_indicator)
## [1] "S"

Add Category Count Columns

trans$Footwear = ifelse(trans$Category == "Footwear", 1,0)
trans$Apparel <- ifelse(trans$Category == "Apparel", 1,0)
trans$Accessories <- ifelse(trans$Category == "Accessories", 1,0)
head(trans)
##   transaction_date transaction_id customer_id sale_or_return_indicator
## 1       2019-01-01     1557447392          NA                        S
## 2       2019-01-01     1557447593  1371878520                        S
## 3       2019-01-01     1557447846  1378604859                        S
## 4       2019-01-01     1557447846  1378604859                        S
## 5       2019-01-01     1557447846  1378604859                        S
## 6       2019-01-01     1557447920  1375140094                        S
##   tender_type net_retail quantity product_code    Category Footwear Apparel
## 1        GFTC      36.08        1 448512831124     Apparel        0       1
## 2        VISA      49.92        1 449556730252    Footwear        1       0
## 3        PPAL       9.85        1 447979167652 Accessories        0       0
## 4        PPAL      16.15        2 447979167296 Accessories        0       0
## 5        PPAL      21.20        2 448511665642 Accessories        0       0
## 6        PPAL      31.24        1 895832901841    Footwear        1       0
##   Accessories
## 1           0
## 2           0
## 3           1
## 4           1
## 5           1
## 6           0

Sum Up Transactions To Join On Consumer Data

x <- trans %>% group_by(customer_id) %>% 
  summarise(sum_retail = sum(net_retail), 
            sum_quantity = sum(quantity),
            sum_trans = n_distinct(transaction_id),
            sum_footwear = sum(Footwear),
            sum_apparel = sum(Apparel),
            sum_accessories = sum(Accessories))
x <- as.data.frame(x)
head(x)
##   customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 1  1312310343       8.79            1         1            0           0
## 2  1312311029      64.63            2         1            1           0
## 3  1312311101      41.22            1         1            1           0
## 4  1312311169      49.28            2         1            0           2
## 5  1312312035      41.68            1         1            1           0
## 6  1312312250     131.10            6         2            3           2
##   sum_accessories
## 1               1
## 2               1
## 3               0
## 4               0
## 5               0
## 6               0

Join on Consumer Data

dat <- left_join(x,con, by = "customer_id")

dat <- dat[order(-dat$sum_retail),]

head(dat)
##        customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 250662          NA 8682173.39       232782    138023       128737       44154
## 21428   1331753343  132337.76         4612       381         2464        1194
## 84750   1374526609   27465.38          462       101          404          26
## 12075   1323568849   20250.58          706        91          346         158
## 52247   1358781919   18793.94          876        20          592         198
## 85065   1374673179   14158.08          516        66          334         142
##        sum_accessories Gender State Has App Age MARITAL STATUS
## 250662           38739   <NA>  <NA>      NA  NA           <NA>
## 21428              440      M    CA       0  31           <NA>
## 84750                0      M    AL       1  24           <NA>
## 12075               38      M    CA       0  35           <NA>
## 52247               80      M    TX       1  NA           <NA>
## 85065               36      M    CA       1  33           <NA>
##        EST HOUSEHOLD INCOME V5 NUMBER OF PERSONS IN LIVING UNIT
## 250662                    <NA>                               NA
## 21428                        E                                1
## 84750                     <NA>                               NA
## 12075                        U                                0
## 52247                     <NA>                               NA
## 85065                     <NA>                               NA
##        NUMBER OF ADULTS IN LIVING UNIT CHILDREN: AGE 0-18 VERSION 3
## 250662                              NA                         <NA>
## 21428                                1                           5U
## 84750                               NA                         <NA>
## 12075                                0                           00
## 52247                               NA                         <NA>
## 85065                               NA                         <NA>
##        Nearest Outlet DistanceMiles Nearest Retail DistanceMiles
## 250662                           NA                           NA
## 21428                           9.3                          3.7
## 84750                            NA                           NA
## 12075                          10.4                          1.6
## 52247                            NA                           NA
## 85065                            NA                           NA
## get rid of NA Customer ID's
dat <- subset(dat, dat$customer_id != "NA")
options(scipen=10000)
ggplot(dat, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
              ylab("Sum of Retail")+
              theme(plot.title = element_text(hjust = 0.5))

## get rid of very extreme outlier
dat2 <- dat[-1,]

summary(dat2$sum_retail)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00    47.65    65.57    87.85   102.41 27465.38

Look For Outliers

## 2.5 sd above the mean
mean(dat2$sum_retail) + 2.5*sd(dat2$sum_retail)
## [1] 407.8555
dat2 <- dat2[dat2$sum_retail < 408,]
summary(dat2$sum_retail)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   47.48   65.22   82.22  100.72  407.93
options(scipen=10000)
ggplot(dat2, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
              ylab("Sum of Retail")+
              theme(plot.title = element_text(hjust = 0.5))

We Still Have Some Outliers, but the Boxplot Looks Better Now and We Have Maintained 99% of the Data

length(dat2$customer_id)/
length(dat$customer_id)
## [1] 0.9911514

Density Plot

ggplot(dat2, aes(x = sum_retail)) + 
  geom_density(outlier.colour="black",
               outlier.shape=16,
               outlier.size=2, notch=FALSE) +
  ggtitle("Retail Distribution") +
  ylab("Density") + xlab("Retail")+
              theme(plot.title = element_text(hjust = 0.5))
## Warning: Ignoring unknown parameters: outlier.colour, outlier.shape,
## outlier.size, notch

High Value Customers

summary(dat2$sum_retail)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   47.48   65.22   82.22  100.72  407.93

Lets Look At the Top 25%

dat2 = dat2[dat2$sum_retail > 100,]
summary(dat2$sum_retail)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   100.0   117.1   140.8   161.6   185.8   407.9
ggplot(dat2, aes(x = sum_retail)) + 
  geom_density(outlier.colour="black",
               outlier.shape=16,
               outlier.size=2, notch=FALSE) +
  ggtitle("Top 25th Percent Retail Distribution") +
  ylab("Density") + xlab("Retail")+
              theme(plot.title = element_text(hjust = 0.5))
## Warning: Ignoring unknown parameters: outlier.colour, outlier.shape,
## outlier.size, notch

Gender

# library(kableExtra)
# u <- subset(dat2, dat2$Gender == "U")
# f <- subset(dat2, dat2$Gender == "F")
# m <- subset(dat2, dat2$Gender == "M")
# 
# length(u$customer_id)/length(dat2$customer_id)
# length(f$customer_id)/length(dat2$customer_id)
# length(m$customer_id)/length(dat2$customer_id)
# 
# gender <- matrix(c(length(u$customer_id)/length(dat2$customer_id),
#                    length(f$customer_id)/length(dat2$customer_id),
#                    length(m$customer_id)/length(dat2$customer_id)),
#                  ncol = 1)
# gender <- round(gender,2)
# 
# colnames(gender) <- c("Gender")
# rownames(gender) <- c("Unknown", "Female", "Male")
# gender %>% kbl(caption = "<center><strong>Gender Breakdown<strong><center") %>% 
#   kable_styling(bootstrap_options = c("striped", "border"))
# 

Has App

# library(kableExtra)
# dat2$`Has App` <- round(na.aggregate(dat2$`Has App`),0)
# no_app <- subset(dat2, dat2$`Has App` == 0)
# has_app <- subset(dat2, dat2$`Has App` == 1)
# 
# length(has_app$customer_id)/length(dat2$customer_id) ## 13.5 of high customers have app
# 
# app <- matrix(c(length(has_app$customer_id)/length(dat2$customer_id),
#                 length(no_app$customer_id)/length(dat2$customer_id)))
# app <- round(app,2)
# colnames(app) <- "Has App"
# rownames(app) <- c("Has App", "No App")
# 
# app %>% kbl(caption = "<center><strong>App Breakdown<strong><center") %>% 
#   kable_styling(bootstrap_options = c("striped", "border"))
# 
# 
# ggplot(dat2, aes(x="", y=dat2$`Has App`, fill=dat2$`Has App`)) +
#   geom_bar(stat="identity", width=1) +
#   coord_polar("y", start=0)

Age

age_noNA <- subset(dat2, dat2$Age != "NA")
summary(age_noNA$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   23.00   33.00   33.89   43.00   83.00
plot(density(age_noNA$Age))

ggplot(age_noNA, aes(x = Age)) + 
  geom_density(outlier.colour="black",
               outlier.shape=16,
               outlier.size=2, notch=FALSE) +
  ggtitle("Top 25th Percent Age Distribution") +
  ylab("Density") + xlab("Age")+
              theme(plot.title = element_text(hjust = 0.5))
## Warning: Ignoring unknown parameters: outlier.colour, outlier.shape,
## outlier.size, notch

Marital Status

unique(dat2$`MARITAL STATUS`)
## [1] NA   "1M" "5M" "0U" "5S" "5U"
marital_status <- subset(dat2, dat2$`MARITAL STATUS` != "<NA>")
marital_status$`MARITAL STATUS` <- as.factor(marital_status$`MARITAL STATUS`)
plot(marital_status$`MARITAL STATUS`, marital_status$sum_retail)

## No Sign of Marital Status Being Significant

Household Income

unique(dat2$`EST HOUSEHOLD INCOME V5`)
##  [1] "U" "J" "F" NA  "E" "K" "A" "G" "H" "B" "L" "D" "C" "I"
income <- subset(dat2, dat2$`EST HOUSEHOLD INCOME V5` != "NA")
income$`EST HOUSEHOLD INCOME V5` <- as.factor(income$`EST HOUSEHOLD INCOME V5`)
plot(income$`EST HOUSEHOLD INCOME V5`, income$sum_retail)

## No Sign of Income Being Significant

Distance to Outlet or Retail

## Nearest Outlet
nearest <- subset(dat2, dat2$`Nearest Outlet DistanceMiles` != "NA")
plot(nearest$`Nearest Outlet DistanceMiles`, nearest$sum_retail)

## Nearest Retail
retail <- subset(dat2, dat2$`Nearest Retail DistanceMiles` != "NA")
plot(retail$`Nearest Retail DistanceMiles`, retail$sum_retail)

## There Does Seem to Be a Grouping When Closer to Retail or Outlet

Replacing NA’s

# Age
## get rid of very extreme outlier
dat2 <- dat[-1,]
dat2 <- dat2[dat2$sum_retail < 408,]

age <- subset(dat2, dat2$Age != "NA")
plot(density(age$Age))

summary(age$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   22.00   31.00   32.75   42.00   90.00
## replacing NA with median or mean won't make much difference
dat2$Age <- round(na.aggregate(dat2$Age),0) 


dat2$`Has App` <- round(na.aggregate(dat2$`Has App`),0)
# Nearest Outlet
nearest <- subset(dat2, dat2$`Nearest Outlet DistanceMiles` != "NA")
summary(nearest$`Nearest Outlet DistanceMiles`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.10   10.60   19.70   41.73   41.60 1809.80
plot(density(nearest$`Nearest Outlet DistanceMiles`))

## replace with the median not the mean since it is skewed

dat2$`Nearest Outlet DistanceMiles` <- na.aggregate(dat2$`Nearest Outlet DistanceMiles`, FUN = median)

# Nearest Retail
retail <- subset(dat2, dat2$`Nearest Retail DistanceMiles` != "NA")
summary(retail$`Nearest Retail DistanceMiles`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.30    6.30   14.34   13.10  647.20
plot(density(retail$`Nearest Retail DistanceMiles`))

## will replace with median also as the data is skewed
dat2$`Nearest Retail DistanceMiles` <- na.aggregate(dat2$`Nearest Retail DistanceMiles`, FUN = median)

Model

library(e1071)
library(regclass)
library(randomForest)
library(rpart)
dat2 <- dat2[dat2$Gender != "U",]
data_final <- dat2[,c(2:8,10:11,17,18)]
data_final <- data_final[data_final$sum_retail > 0 ,]
data_final$Gender <- as.factor(data_final$Gender)
data_final$`Has App` <- as.factor(data_final$`Has App`)

LM

## lm
datout <- lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear + 
    sum_apparel + sum_accessories + Gender + `Has App` + Age + 
    `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`, 
    data = data_final)

summary(datout)
## 
## Call:
## lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear + 
##     sum_apparel + sum_accessories + Gender + `Has App` + Age + 
##     `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`, 
##     data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -706.84  -17.03   -2.91   13.77  302.76 
## 
## Coefficients:
##                                 Estimate Std. Error t value
## (Intercept)                     8.289412   0.388054  21.362
## sum_quantity                   11.889159   0.130557  91.065
## sum_trans                      11.595716   0.142089  81.609
## sum_footwear                   24.877895   0.153452 162.121
## sum_apparel                     6.937089   0.154520  44.894
## sum_accessories                -4.499599   0.164970 -27.275
## GenderM                         2.443701   0.154533  15.813
## `Has App`1                     -5.385020   0.236712 -22.749
## Age                             0.060565   0.010461   5.790
## `Nearest Outlet DistanceMiles`  0.016461   0.001737   9.479
## `Nearest Retail DistanceMiles` -0.045285   0.004750  -9.534
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## sum_quantity                   < 0.0000000000000002 ***
## sum_trans                      < 0.0000000000000002 ***
## sum_footwear                   < 0.0000000000000002 ***
## sum_apparel                    < 0.0000000000000002 ***
## sum_accessories                < 0.0000000000000002 ***
## GenderM                        < 0.0000000000000002 ***
## `Has App`1                     < 0.0000000000000002 ***
## Age                                   0.00000000706 ***
## `Nearest Outlet DistanceMiles` < 0.0000000000000002 ***
## `Nearest Retail DistanceMiles` < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 31.4 on 183108 degrees of freedom
## Multiple R-squared:  0.7372, Adjusted R-squared:  0.7372 
## F-statistic: 5.136e+04 on 10 and 183108 DF,  p-value: < 0.00000000000000022
plot(datout$fitted.values, datout$residuals) ## Not linear

Data is not normal so I will try a LogNormal and a Gamma Model after checking the Cullen Frey Graph

library(caret)
boxcox(sum_retail~.,data=data_final)

datout <- lm(log(sum_retail)~.,data = data_final) ## Log Model
datout2 <- lm((sum_retail)^0.6~.,data=data_final) ## 

Random Forest

## train and testing
# sample = sample.split(data_final$sum_retail, SplitRatio = .75)
# train.data = subset(data_final, sample == TRUE)
# test.data  = subset(data_final, sample == FALSE)
# datout4 <- randomForest(data_final$sum_retail~data_final$sum_quantity +
#                           data_final$sum_trans + data_final$sum_footwear +
#                           data_final$sum_apparel + data_final$sum_accessories +
#                           data_final$Gender + data_final$`Has App` +
#                           data_final$Age +
#                           data_final$`Nearest Outlet DistanceMiles` +
#                           data_final$`Nearest Retail DistanceMiles`,
#                         ntree = 30, mtry = 5, max.depth = 5, min.node.size = 5)
# datout4
datoutFinal <- readRDS("./vansFinalModel.rds")
datoutFinal
## 
## Call:
##  randomForest(formula = data_final$sum_retail ~ data_final$sum_quantity +      data_final$sum_trans + data_final$sum_footwear + data_final$sum_apparel +      data_final$sum_accessories + data_final$Gender + data_final$`Has App` +      data_final$Age + data_final$`Nearest Outlet DistanceMiles` +      data_final$`Nearest Retail DistanceMiles`, ntree = 30, mtry = 5,      max.depth = 5, min.node.size = 5) 
##                Type of random forest: regression
##                      Number of trees: 30
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 930.4626
##                     % Var explained: 75.2