library(tidyverse)
library(gtools)
library(zoo)
library(e1071)
library(regclass)
library(randomForest)
library(rpart)
library(caTools)
library(fitdistrplus)
library(caret)
con <- read_csv("Desktop/Nov 2020 Data Scientist_Homework Data/Consumers.csv")
con <- as.data.frame(con)
trans <- read.csv("Desktop/Nov 2020 Data Scientist_Homework Data/Transaction Line Items.csv")

Removing Transactions With Returns

returns <- subset(trans, sale_or_return_indicator == "R")

## drop the entire transaction if any line item in it was a return
trans <- trans[!(trans$transaction_id %in% returns$transaction_id), ]
unique(trans$sale_or_return_indicator)
## [1] "S"

Add Category Count Columns

trans$Footwear    <- ifelse(trans$Category == "Footwear", 1, 0)
trans$Apparel     <- ifelse(trans$Category == "Apparel", 1, 0)
trans$Accessories <- ifelse(trans$Category == "Accessories", 1, 0)
head(trans)
##   transaction_date transaction_id customer_id sale_or_return_indicator
## 1       2019-01-01     1557447392          NA                        S
## 2       2019-01-01     1557447593  1371878520                        S
## 3       2019-01-01     1557447846  1378604859                        S
## 4       2019-01-01     1557447846  1378604859                        S
## 5       2019-01-01     1557447846  1378604859                        S
## 6       2019-01-01     1557447920  1375140094                        S
##   tender_type net_retail quantity product_code    Category Footwear Apparel
## 1        GFTC      36.08        1 448512831124     Apparel        0       1
## 2        VISA      49.92        1 449556730252    Footwear        1       0
## 3        PPAL       9.85        1 447979167652 Accessories        0       0
## 4        PPAL      16.15        2 447979167296 Accessories        0       0
## 5        PPAL      21.20        2 448511665642 Accessories        0       0
## 6        PPAL      31.24        1 895832901841    Footwear        1       0
##   Accessories
## 1           0
## 2           0
## 3           1
## 4           1
## 5           1
## 6           0

Sum Up Transactions To Join On Consumer Data

x <- trans %>% group_by(customer_id) %>% 
  summarise(sum_retail = sum(net_retail), 
            sum_quantity = sum(quantity),
            sum_trans = n_distinct(transaction_id),
            sum_footwear = sum(Footwear),
            sum_apparel = sum(Apparel),
            sum_accessories = sum(Accessories))
x <- as.data.frame(x)
head(x)
##   customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 1  1312310343       8.79            1         1            0           0
## 2  1312311029      64.63            2         1            1           0
## 3  1312311101      41.22            1         1            1           0
## 4  1312311169      49.28            2         1            0           2
## 5  1312312035      41.68            1         1            1           0
## 6  1312312250     131.10            6         2            3           2
##   sum_accessories
## 1               1
## 2               1
## 3               0
## 4               0
## 5               0
## 6               0
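
The helper 0/1 columns are convenient, but the same per-customer summary can be computed straight from Category; a sketch with the same column names, equivalent to x above:

x2 <- trans %>% group_by(customer_id) %>% 
  summarise(sum_retail = sum(net_retail), 
            sum_quantity = sum(quantity),
            sum_trans = n_distinct(transaction_id),
            sum_footwear = sum(Category == "Footwear"),
            sum_apparel = sum(Category == "Apparel"),
            sum_accessories = sum(Category == "Accessories"))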

Join on Consumer Data

dat <- left_join(x,con, by = "customer_id")

dat <- dat[order(-dat$sum_retail),]

head(dat)
##        customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 250662          NA 8682173.39       232782    138023       128737       44154
## 21428   1331753343  132337.76         4612       381         2464        1194
## 84750   1374526609   27465.38          462       101          404          26
## 12075   1323568849   20250.58          706        91          346         158
## 52247   1358781919   18793.94          876        20          592         198
## 85065   1374673179   14158.08          516        66          334         142
##        sum_accessories Gender State Has App Age MARITAL STATUS
## 250662           38739   <NA>  <NA>      NA  NA           <NA>
## 21428              440      M    CA       0  31           <NA>
## 84750                0      M    AL       1  24           <NA>
## 12075               38      M    CA       0  35           <NA>
## 52247               80      M    TX       1  NA           <NA>
## 85065               36      M    CA       1  33           <NA>
##        EST HOUSEHOLD INCOME V5 NUMBER OF PERSONS IN LIVING UNIT
## 250662                    <NA>                               NA
## 21428                        E                                1
## 84750                     <NA>                               NA
## 12075                        U                                0
## 52247                     <NA>                               NA
## 85065                     <NA>                               NA
##        NUMBER OF ADULTS IN LIVING UNIT CHILDREN: AGE 0-18 VERSION 3
## 250662                              NA                         <NA>
## 21428                                1                           5U
## 84750                               NA                         <NA>
## 12075                                0                           00
## 52247                               NA                         <NA>
## 85065                               NA                         <NA>
##        Nearest Outlet DistanceMiles Nearest Retail DistanceMiles
## 250662                           NA                           NA
## 21428                           9.3                          3.7
## 84750                            NA                           NA
## 12075                          10.4                          1.6
## 52247                            NA                           NA
## 85065                            NA                           NA
## drop rows with a missing customer_id (transactions with no customer attached)
dat <- subset(dat, !is.na(customer_id))
options(scipen=10000)
ggplot(dat, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
              ylab("Sum of Retail")+
              theme(plot.title = element_text(hjust = 0.5))

## get rid of very extreme outlier
dat2 <- dat[-1,]

summary(dat2$sum_retail)
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##     0.00    47.65    65.57    87.85   102.41 27465.38

Look For Outliers

## 2.5 sd above the mean
mean(dat2$sum_retail) + 2.5*sd(dat2$sum_retail)
## [1] 407.8555
dat2 <- dat2[dat2$sum_retail < 408,]
summary(dat2$sum_retail)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   47.48   65.22   82.22  100.72  407.93
options(scipen=10000)
ggplot(dat2, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
             outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
              ylab("Sum of Retail")+
              theme(plot.title = element_text(hjust = 0.5))

We Still Have Some Outliers, but the Boxplot Looks Better Now and We Have Retained 99% of the Data

nrow(dat2) / nrow(dat)
## [1] 0.9911514
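
As a cross-check on the 2.5 standard deviation rule, a Tukey-style IQR cutoff (not used in this analysis) can be computed the same way:

q <- quantile(dat2$sum_retail, c(0.25, 0.75))
q[2] + 1.5 * (q[2] - q[1])   ## IQR-based upper fence, for comparison with the 408 cutoff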

Retail Plot

ggplot(dat2, aes(x = sum_retail)) + 
  geom_density() +
  ggtitle("Retail Distribution") +
  ylab("Density") + xlab("Retail") +
  theme(plot.title = element_text(hjust = 0.5))

High Value Customers

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   47.48   65.22   82.22  100.72  407.93
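
Rather than hardcoding the cutoff in the next step, the 75th percentile can be taken directly from the data (a sketch; it is roughly 100.72 here, which is why 100 is used below):

cutoff <- quantile(dat2$sum_retail, 0.75)
cutoff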

Let's Look at the Top 25%

dat2 <- dat2[dat2$sum_retail > 100, ]
summary(dat2$sum_retail)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   100.0   117.1   140.8   161.6   185.8   407.9
ggplot(dat2, aes(x = sum_retail)) + 
  geom_density() +
  ggtitle("Top 25th Percent Retail Distribution") +
  ylab("Density") + xlab("Retail") +
  theme(plot.title = element_text(hjust = 0.5))

Gender

library(kableExtra)

## gender subsets; "Unknown" is taken to be a missing value or the "U" code
u <- subset(dat2, is.na(Gender) | Gender == "U")
f <- subset(dat2, Gender == "F")
m <- subset(dat2, Gender == "M")

gender <- matrix(c(length(u$customer_id)/length(dat2$customer_id),
                   length(f$customer_id)/length(dat2$customer_id),
                   length(m$customer_id)/length(dat2$customer_id)),
                 ncol = 1)
gender <- round(gender, 2)

colnames(gender) <- c("Gender")
rownames(gender) <- c("Unknown", "Female", "Male")
gender %>% kbl(caption = "<center><strong>Gender Breakdown</strong></center>") %>%
  kable_styling(bootstrap_options = c("striped", "border"))
Gender Breakdown

          Gender
Unknown     0.19
Female      0.53
Male        0.27
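
The same breakdown can be produced in one line with base R, treating missing gender as its own level:

round(prop.table(table(dat2$Gender, useNA = "ifany")), 2)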

Has App

library(kableExtra)
## impute missing Has App with the rounded mean (effectively the majority class)
dat2$`Has App` <- round(na.aggregate(dat2$`Has App`), 0)
no_app <- subset(dat2, dat2$`Has App` == 0)
has_app <- subset(dat2, dat2$`Has App` == 1)



app <- matrix(c(length(has_app$customer_id)/length(dat2$customer_id),
                length(no_app$customer_id)/length(dat2$customer_id)))
app <- round(app,2)
colnames(app) <- "Has App"
rownames(app) <- c("Has App", "No App")

app %>% kbl(caption = "<center><strong>App Breakdown</strong></center>") %>%
  kable_styling(bootstrap_options = c("striped", "border"))
App Breakdown

          Has App
Has App      0.12
No App       0.88

Age

age_noNA <- subset(dat2, !is.na(Age))
summary(age_noNA$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   23.00   33.00   33.89   43.00   83.00
ggplot(age_noNA, aes(x = Age)) + 
  geom_density() +
  ggtitle("Top 25th Percent Age Distribution") +
  ylab("Density") + xlab("Age") +
  theme(plot.title = element_text(hjust = 0.5))

Marital Status

No Sign of Marital Status Being Significant

unique(dat2$`MARITAL STATUS`)
## [1] NA   "1M" "5M" "0U" "5S" "5U"
marital_status <- subset(dat2, !is.na(`MARITAL STATUS`))
marital_status$`MARITAL STATUS` <- as.factor(marital_status$`MARITAL STATUS`)
plot(marital_status$`MARITAL STATUS`, marital_status$sum_retail,
     xlab = "Marital Status", ylab = "Retail")
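
A one-way ANOVA gives a quick formal check of the visual impression (a sketch, not part of the original write-up):

summary(aov(sum_retail ~ `MARITAL STATUS`, data = marital_status))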

Household Income

No Sign of Income Being Significant

unique(dat2$`EST HOUSEHOLD INCOME V5`)
##  [1] "U" "J" "F" NA  "E" "K" "A" "G" "H" "B" "L" "D" "C" "I"
income <- subset(dat2, !is.na(`EST HOUSEHOLD INCOME V5`))
income$`EST HOUSEHOLD INCOME V5` <- as.factor(income$`EST HOUSEHOLD INCOME V5`)
plot(income$`EST HOUSEHOLD INCOME V5`, income$sum_retail,
     xlab = "Estimated Household Income Code", ylab = "Retail")

Distance to Outlet or Retail

There Does Seem to Be a Grouping Among Customers Who Are Closer to a Retail or Outlet Store

## Nearest Outlet
nearest <- subset(dat2, !is.na(`Nearest Outlet DistanceMiles`))
plot(nearest$`Nearest Outlet DistanceMiles`, nearest$sum_retail,
     xlab = "Nearest Outlet Distance in Miles",
     ylab = "Retail")

## Nearest Retail
retail <- subset(dat2, !is.na(`Nearest Retail DistanceMiles`))
plot(retail$`Nearest Retail DistanceMiles`, retail$sum_retail, 
     xlab = "Nearest Retail Distance in Miles",
     ylab = "Retail")

Replacing NAs

# Age
## reset dat2 to the full customer set (minus the extreme outlier removed earlier),
## so the NA imputation below runs on all customers rather than just the top 25%
dat2 <- dat[-1,]


age <- subset(dat2, !is.na(Age))

summary(age$Age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   16.00   22.00   31.00   32.77   42.00   90.00
## replacing NA with median or mean won't make much difference
dat2$Age <- round(na.aggregate(dat2$Age),0) 


# Has App
dat2$`Has App` <- round(na.aggregate(dat2$`Has App`), 0)
# Nearest Outlet
nearest <- subset(dat2, !is.na(`Nearest Outlet DistanceMiles`))
summary(nearest$`Nearest Outlet DistanceMiles`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.10   10.50   19.60   41.62   41.40 1809.80
##plot(density(nearest$`Nearest Outlet DistanceMiles`))

## replace with the median not the mean since it is skewed

dat2$`Nearest Outlet DistanceMiles` <- na.aggregate(dat2$`Nearest Outlet DistanceMiles`, FUN = median)

# Nearest Retail
retail <- subset(dat2, !is.na(`Nearest Retail DistanceMiles`))
summary(retail$`Nearest Retail DistanceMiles`)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    3.30    6.30   14.28   13.10  647.20
##plot(density(retail$`Nearest Retail DistanceMiles`))

## will replace with median also as the data is skewed
dat2$`Nearest Retail DistanceMiles` <- na.aggregate(dat2$`Nearest Retail DistanceMiles`, FUN = median)
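
A quick sanity check that the imputation left no missing values in the columns used for modelling:

colSums(is.na(dat2[, c("Age", "Has App",
                       "Nearest Outlet DistanceMiles",
                       "Nearest Retail DistanceMiles")]))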

Model

library(e1071)
library(regclass)
library(randomForest)
library(rpart)
## drop customers with unknown gender ("U" or missing)
dat2 <- dat2[!is.na(dat2$Gender) & dat2$Gender != "U", ]
## keep the response, the transaction summaries, Gender, Has App, Age, and the two distance columns
data_final <- dat2[, c(2:8, 10:11, 17, 18)]
data_final <- data_final[data_final$sum_retail > 0 ,]
data_final$Gender <- as.factor(data_final$Gender)
data_final$`Has App` <- as.factor(data_final$`Has App`)
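
For reference, the positional index above corresponds to selecting these columns by name, which is less fragile if the file layout ever changes (a sketch, kept separate from data_final):

model_cols <- c("sum_retail", "sum_quantity", "sum_trans",
                "sum_footwear", "sum_apparel", "sum_accessories",
                "Gender", "Has App", "Age",
                "Nearest Outlet DistanceMiles", "Nearest Retail DistanceMiles")
data_final_by_name <- dat2[, model_cols]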

Linear Model

## lm
datout <- lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear + 
    sum_apparel + sum_accessories + Gender + `Has App` + Age + 
    `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`, 
    data = data_final)

summary(datout)
## 
## Call:
## lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear + 
##     sum_apparel + sum_accessories + Gender + `Has App` + Age + 
##     `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`, 
##     data = data_final)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4369.1   -19.8    -4.5    14.8 11178.7 
## 
## Coefficients:
##                                  Estimate Std. Error  t value
## (Intercept)                     10.731919   0.644607   16.649
## sum_quantity                    33.721403   0.150554  223.983
## sum_trans                        8.723931   0.132603   65.790
## sum_footwear                     1.039640   0.167544    6.205
## sum_apparel                    -23.424797   0.183919 -127.365
## sum_accessories                -31.826785   0.186683 -170.486
## GenderM                          2.259359   0.266508    8.478
## `Has App`1                      -3.530016   0.404177   -8.734
## Age                              0.238447   0.017943   13.289
## `Nearest Outlet DistanceMiles`   0.015577   0.002997    5.198
## `Nearest Retail DistanceMiles`  -0.032689   0.008210   -3.981
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## sum_quantity                   < 0.0000000000000002 ***
## sum_trans                      < 0.0000000000000002 ***
## sum_footwear                         0.000000000547 ***
## sum_apparel                    < 0.0000000000000002 ***
## sum_accessories                < 0.0000000000000002 ***
## GenderM                        < 0.0000000000000002 ***
## `Has App`1                     < 0.0000000000000002 ***
## Age                            < 0.0000000000000002 ***
## `Nearest Outlet DistanceMiles`       0.000000201469 ***
## `Nearest Retail DistanceMiles`       0.000068539930 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 54.51 on 185164 degrees of freedom
## Multiple R-squared:  0.8576, Adjusted R-squared:  0.8576 
## F-statistic: 1.116e+05 on 10 and 185164 DF,  p-value: < 0.00000000000000022
plot(datout$fitted.values, datout$residuals) ## Not linear
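
The full set of base-R diagnostics for the same fit makes the problem easier to see:

par(mfrow = c(2, 2))
plot(datout)   ## residuals vs fitted, normal Q-Q, scale-location, leverage
par(mfrow = c(1, 1))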

The Data Is Not Normal, so I Will Try a Log Transformation and a Box-Cox Transformation

The Box-Cox transformation looks the best, but it is still not great, with an R^2 of 0.56

library(MASS) ## boxcox() comes from MASS
boxcox(sum_retail ~ ., data = data_final)

datout  <- lm(log(sum_retail) ~ ., data = data_final)    ## log model
datout2 <- lm((sum_retail)^0.6 ~ ., data = data_final)   ## power model suggested by the Box-Cox plot
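
The 0.6 exponent is read off the peak of the boxcox curve; it can also be pulled out programmatically (a sketch, assuming the peak is what motivated the exponent above):

bc <- boxcox(sum_retail ~ ., data = data_final, plotit = FALSE)
bc$x[which.max(bc$y)]   ## lambda with the highest profile log-likelihood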

summary(datout)
## 
## Call:
## lm(formula = log(sum_retail) ~ ., data = data_final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -29.3397  -0.2812   0.0244   0.4125   2.5424 
## 
## Coefficients:
##                                   Estimate  Std. Error t value
## (Intercept)                     3.82765800  0.00789983 484.524
## sum_quantity                    0.02094955  0.00184508  11.354
## sum_trans                       0.13508326  0.00162509  83.123
## sum_footwear                    0.02424722  0.00205330  11.809
## sum_apparel                     0.00158193  0.00225398   0.702
## sum_accessories                -0.01198392  0.00228785  -5.238
## GenderM                        -0.00596852  0.00326613  -1.827
## `Has App`1                      0.00622478  0.00495330   1.257
## Age                             0.00476511  0.00021990  21.669
## `Nearest Outlet DistanceMiles`  0.00007065  0.00003672   1.924
## `Nearest Retail DistanceMiles` -0.00038147  0.00010062  -3.791
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## sum_quantity                   < 0.0000000000000002 ***
## sum_trans                      < 0.0000000000000002 ***
## sum_footwear                   < 0.0000000000000002 ***
## sum_apparel                                 0.48278    
## sum_accessories                         0.000000162 ***
## GenderM                                     0.06764 .  
## `Has App`1                                  0.20887    
## Age                            < 0.0000000000000002 ***
## `Nearest Outlet DistanceMiles`              0.05437 .  
## `Nearest Retail DistanceMiles`              0.00015 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.668 on 185164 degrees of freedom
## Multiple R-squared:  0.2033, Adjusted R-squared:  0.2033 
## F-statistic:  4725 on 10 and 185164 DF,  p-value: < 0.00000000000000022
summary(datout2)
## 
## Call:
## lm(formula = (sum_retail)^0.6 ~ ., data = data_final)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -296.420   -2.579   -0.612    2.206   81.288 
## 
## Coefficients:
##                                  Estimate Std. Error t value
## (Intercept)                     8.8291599  0.0559730 157.740
## sum_quantity                    0.8279460  0.0130730  63.333
## sum_trans                       1.6078287  0.0115143 139.637
## sum_footwear                    0.0350833  0.0145483   2.412
## sum_apparel                    -0.3951664  0.0159702 -24.744
## sum_accessories                -0.5896443  0.0162102 -36.375
## GenderM                         0.0320045  0.0231417   1.383
## `Has App`1                      0.2001191  0.0350958   5.702
## Age                             0.0411432  0.0015581  26.406
## `Nearest Outlet DistanceMiles`  0.0006042  0.0002602   2.322
## `Nearest Retail DistanceMiles` -0.0029088  0.0007129  -4.080
##                                            Pr(>|t|)    
## (Intercept)                    < 0.0000000000000002 ***
## sum_quantity                   < 0.0000000000000002 ***
## sum_trans                      < 0.0000000000000002 ***
## sum_footwear                                 0.0159 *  
## sum_apparel                    < 0.0000000000000002 ***
## sum_accessories                < 0.0000000000000002 ***
## GenderM                                      0.1667    
## `Has App`1                             0.0000000119 ***
## Age                            < 0.0000000000000002 ***
## `Nearest Outlet DistanceMiles`               0.0202 *  
## `Nearest Retail DistanceMiles`         0.0000450564 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.733 on 185164 degrees of freedom
## Multiple R-squared:  0.5619, Adjusted R-squared:  0.5619 
## F-statistic: 2.375e+04 on 10 and 185164 DF,  p-value: < 0.00000000000000022
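
The 0.56 R^2 is on the transformed scale; a rough check on the original dollar scale (a sketch) is to back-transform the fitted values and correlate them with the observed retail:

obs_retail  <- model.frame(datout2)[[1]]^(1 / 0.6)
pred_retail <- pmax(fitted(datout2), 0)^(1 / 0.6)   ## clamp at zero before back-transforming
cor(pred_retail, obs_retail)^2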

Random Forest

## train and testing split (set up here, though the saved forest below was fit on the full data_final)
# sample = sample.split(data_final$sum_retail, SplitRatio = .75)
# train.data = subset(data_final, sample == TRUE)
# test.data  = subset(data_final, sample == FALSE)
# datout4 <- randomForest(data_final$sum_retail~data_final$sum_quantity +
#                           data_final$sum_trans + data_final$sum_footwear +
#                           data_final$sum_apparel + data_final$sum_accessories +
#                           data_final$Gender + data_final$`Has App` +
#                           data_final$Age +
#                           data_final$`Nearest Outlet DistanceMiles` +
#                           data_final$`Nearest Retail DistanceMiles`,
#                         ntree = 30, mtry = 5, max.depth = 5, min.node.size = 5)
# datout4
## load the pre-trained forest saved from the commented-out call above
datoutFinal <- readRDS("./vansFinalModel.rds")
datoutFinal
## 
## Call:
##  randomForest(formula = data_final$sum_retail ~ data_final$sum_quantity +      data_final$sum_trans + data_final$sum_footwear + data_final$sum_apparel +      data_final$sum_accessories + data_final$Gender + data_final$`Has App` +      data_final$Age + data_final$`Nearest Outlet DistanceMiles` +      data_final$`Nearest Retail DistanceMiles`, ntree = 30, mtry = 5,      max.depth = 5, min.node.size = 5) 
##                Type of random forest: regression
##                      Number of trees: 30
## No. of variables tried at each split: 5
## 
##           Mean of squared residuals: 4037.324
##                     % Var explained: 80.65
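
A natural follow-up on the saved forest is variable importance, using the randomForest helpers (a sketch; output not shown):

importance(datoutFinal)   ## IncNodePurity per predictor
varImpPlot(datoutFinal, main = "Random Forest Variable Importance")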