library(tidyverse)
library(gtools)
library(zoo)
library(e1071)
library(regclass)
library(randomForest)
library(rpart)
library(caTools)
library(fitdistrplus)
library(caret)
con <- read_csv("Desktop/Nov 2020 Data Scientist_Homework Data/Consumers.csv")
con <- as.data.frame(con)
trans <- read.csv("Desktop/Nov 2020 Data Scientist_Homework Data/Transaction Line Items.csv")
Removing Transactions With Returns
returns <- subset(trans, trans$sale_or_return_indicator == "R")
trans = trans[!(trans$transaction_id %in% returns$transaction_id),]
unique(trans$sale_or_return_indicator)
## [1] "S"
Add Category Count Columns
trans$Footwear = ifelse(trans$Category == "Footwear", 1,0)
trans$Apparel <- ifelse(trans$Category == "Apparel", 1,0)
trans$Accessories <- ifelse(trans$Category == "Accessories", 1,0)
head(trans)
## transaction_date transaction_id customer_id sale_or_return_indicator
## 1 2019-01-01 1557447392 NA S
## 2 2019-01-01 1557447593 1371878520 S
## 3 2019-01-01 1557447846 1378604859 S
## 4 2019-01-01 1557447846 1378604859 S
## 5 2019-01-01 1557447846 1378604859 S
## 6 2019-01-01 1557447920 1375140094 S
## tender_type net_retail quantity product_code Category Footwear Apparel
## 1 GFTC 36.08 1 448512831124 Apparel 0 1
## 2 VISA 49.92 1 449556730252 Footwear 1 0
## 3 PPAL 9.85 1 447979167652 Accessories 0 0
## 4 PPAL 16.15 2 447979167296 Accessories 0 0
## 5 PPAL 21.20 2 448511665642 Accessories 0 0
## 6 PPAL 31.24 1 895832901841 Footwear 1 0
## Accessories
## 1 0
## 2 0
## 3 1
## 4 1
## 5 1
## 6 0
Sum Up Transactions To Join On Consumer Data
x <- trans %>% group_by(customer_id) %>%
summarise(sum_retail = sum(net_retail),
sum_quantity = sum(quantity),
sum_trans = n_distinct(transaction_id),
sum_footwear = sum(Footwear),
sum_apparel = sum(Apparel),
sum_accessories = sum(Accessories))
x <- as.data.frame(x)
head(x)
## customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 1 1312310343 8.79 1 1 0 0
## 2 1312311029 64.63 2 1 1 0
## 3 1312311101 41.22 1 1 1 0
## 4 1312311169 49.28 2 1 0 2
## 5 1312312035 41.68 1 1 1 0
## 6 1312312250 131.10 6 2 3 2
## sum_accessories
## 1 1
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
Join on Consumer Data
dat <- left_join(x,con, by = "customer_id")
dat <- dat[order(-dat$sum_retail),]
head(dat)
## customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 250662 NA 8682173.39 232782 138023 128737 44154
## 21428 1331753343 132337.76 4612 381 2464 1194
## 84750 1374526609 27465.38 462 101 404 26
## 12075 1323568849 20250.58 706 91 346 158
## 52247 1358781919 18793.94 876 20 592 198
## 85065 1374673179 14158.08 516 66 334 142
## sum_accessories Gender State Has App Age MARITAL STATUS
## 250662 38739 <NA> <NA> NA NA <NA>
## 21428 440 M CA 0 31 <NA>
## 84750 0 M AL 1 24 <NA>
## 12075 38 M CA 0 35 <NA>
## 52247 80 M TX 1 NA <NA>
## 85065 36 M CA 1 33 <NA>
## EST HOUSEHOLD INCOME V5 NUMBER OF PERSONS IN LIVING UNIT
## 250662 <NA> NA
## 21428 E 1
## 84750 <NA> NA
## 12075 U 0
## 52247 <NA> NA
## 85065 <NA> NA
## NUMBER OF ADULTS IN LIVING UNIT CHILDREN: AGE 0-18 VERSION 3
## 250662 NA <NA>
## 21428 1 5U
## 84750 NA <NA>
## 12075 0 00
## 52247 NA <NA>
## 85065 NA <NA>
## Nearest Outlet DistanceMiles Nearest Retail DistanceMiles
## 250662 NA NA
## 21428 9.3 3.7
## 84750 NA NA
## 12075 10.4 1.6
## 52247 NA NA
## 85065 NA NA
## get rid of NA Customer ID's
dat <- subset(dat, dat$customer_id != "NA")
options(scipen=10000)
ggplot(dat, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
ylab("Sum of Retail")+
theme(plot.title = element_text(hjust = 0.5))

## get rid of very extreme outlier
dat2 <- dat[-1,]
summary(dat2$sum_retail)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 47.65 65.57 87.85 102.41 27465.38
Look For Outliers
## 2.5 sd above the mean
mean(dat2$sum_retail) + 2.5*sd(dat2$sum_retail)
## [1] 407.8555
dat2 <- dat2[dat2$sum_retail < 408,]
summary(dat2$sum_retail)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 47.48 65.22 82.22 100.72 407.93
options(scipen=10000)
ggplot(dat2, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
ylab("Sum of Retail")+
theme(plot.title = element_text(hjust = 0.5))

We Still Have Some Outliers, but the Boxplot Looks Better Now and We Have Maintained 99% of the Data
length(dat2$customer_id)/
length(dat$customer_id)
## [1] 0.9911514
Retail Plot
ggplot(dat2, aes(x = sum_retail)) +
geom_density(outlier.colour="black",
outlier.shape=16,
outlier.size=2, notch=FALSE) +
ggtitle("Retail Distribution") +
ylab("Density") + xlab("Retail")+
theme(plot.title = element_text(hjust = 0.5))

High Value Customers
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 47.48 65.22 82.22 100.72 407.93
Lets Look At the Top 25%
dat2 = dat2[dat2$sum_retail > 100,]
summary(dat2$sum_retail)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100.0 117.1 140.8 161.6 185.8 407.9
ggplot(dat2, aes(x = sum_retail)) +
geom_density(outlier.colour="black",
outlier.shape=16,
outlier.size=2, notch=FALSE) +
ggtitle("Top 25th Percent Retail Distribution") +
ylab("Density") + xlab("Retail")+
theme(plot.title = element_text(hjust = 0.5))

Gender
gender <- matrix(c(length(u$customer_id)/length(dat2$customer_id),
length(f$customer_id)/length(dat2$customer_id),
length(m$customer_id)/length(dat2$customer_id)),
ncol = 1)
gender <- round(gender,2)
colnames(gender) <- c("Gender")
rownames(gender) <- c("Unknown", "Female", "Male")
gender %>% kbl(caption = "<center><strong>Gender Breakdown<strong><center>") %>%
kable_styling(bootstrap_options = c("striped", "border"))
Gender Breakdown
|
|
Gender
|
|
Unknown
|
0.19
|
|
Female
|
0.53
|
|
Male
|
0.27
|
Has App
library(kableExtra)
dat2$`Has App` <- round(na.aggregate(dat2$`Has App`),0)
no_app <- subset(dat2, dat2$`Has App` == 0)
has_app <- subset(dat2, dat2$`Has App` == 1)
app <- matrix(c(length(has_app$customer_id)/length(dat2$customer_id),
length(no_app$customer_id)/length(dat2$customer_id)))
app <- round(app,2)
colnames(app) <- "Has App"
rownames(app) <- c("Has App", "No App")
app %>% kbl(caption = "<center><strong>App Breakdown<strong><center>") %>%
kable_styling(bootstrap_options = c("striped", "border"))
App Breakdown
|
|
Has App
|
|
Has App
|
0.12
|
|
No App
|
0.88
|
Age
age_noNA <- subset(dat2, dat2$Age != "NA")
summary(age_noNA$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 23.00 33.00 33.89 43.00 83.00
ggplot(age_noNA, aes(x = Age)) +
geom_density(outlier.colour="black",
outlier.shape=16,
outlier.size=2, notch=FALSE) +
ggtitle("Top 25th Percent Age Distribution") +
ylab("Density") + xlab("Age")+
theme(plot.title = element_text(hjust = 0.5))

Marital Status
No Sign of Marital Status Being Significant
unique(dat2$`MARITAL STATUS`)
## [1] NA "1M" "5M" "0U" "5S" "5U"
marital_status <- subset(dat2, dat2$`MARITAL STATUS` != "<NA>")
marital_status$`MARITAL STATUS` <- as.factor(marital_status$`MARITAL STATUS`)
plot(marital_status$`MARITAL STATUS`, marital_status$sum_retail)

Household Income
No Sign of Income Being Significant
unique(dat2$`EST HOUSEHOLD INCOME V5`)
## [1] "U" "J" "F" NA "E" "K" "A" "G" "H" "B" "L" "D" "C" "I"
income <- subset(dat2, dat2$`EST HOUSEHOLD INCOME V5` != "NA")
income$`EST HOUSEHOLD INCOME V5` <- as.factor(income$`EST HOUSEHOLD INCOME V5`)
plot(income$`EST HOUSEHOLD INCOME V5`, income$sum_retail)

Distance to Outlet or Retail
There Does Seem to Be a Grouping When Closer to Retail or Outlet
## Nearest Outlet
nearest <- subset(dat2, dat2$`Nearest Outlet DistanceMiles` != "NA")
plot(nearest$`Nearest Outlet DistanceMiles`, nearest$sum_retail,
xlab = "Nearest Outlet Distance in Miles",
ylab = "Retail")

## Nearest Retail
retail <- subset(dat2, dat2$`Nearest Retail DistanceMiles` != "NA")
plot(retail$`Nearest Retail DistanceMiles`, retail$sum_retail,
xlab = "Nearest Retail Distance in Miles",
ylab = "Retail")

Replacing NA’s
# Age
## get rid of very extreme outlier
dat2 <- dat[-1,]
age <- subset(dat2, dat2$Age != "NA")
summary(age$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 22.00 31.00 32.77 42.00 90.00
## replacing NA with median or mean won't make much difference
dat2$Age <- round(na.aggregate(dat2$Age),0)
dat2$`Has App` <- round(na.aggregate(dat2$`Has App`),0)
# Nearest Outlet
nearest <- subset(dat2, dat2$`Nearest Outlet DistanceMiles` != "NA")
summary(nearest$`Nearest Outlet DistanceMiles`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.10 10.50 19.60 41.62 41.40 1809.80
##plot(density(nearest$`Nearest Outlet DistanceMiles`))
## replace with the median not the mean since it is skewed
dat2$`Nearest Outlet DistanceMiles` <- na.aggregate(dat2$`Nearest Outlet DistanceMiles`, FUN = median)
# Nearest Retail
retail <- subset(dat2, dat2$`Nearest Retail DistanceMiles` != "NA")
summary(retail$`Nearest Retail DistanceMiles`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 3.30 6.30 14.28 13.10 647.20
##plot(density(retail$`Nearest Retail DistanceMiles`))
## will replace with median also as the data is skewed
dat2$`Nearest Retail DistanceMiles` <- na.aggregate(dat2$`Nearest Retail DistanceMiles`, FUN = median)
Model
library(e1071)
library(regclass)
library(randomForest)
library(rpart)
dat2 <- dat2[dat2$Gender != "U",]
data_final <- dat2[,c(2:8,10:11,17,18)]
data_final <- data_final[data_final$sum_retail > 0 ,]
data_final$Gender <- as.factor(data_final$Gender)
data_final$`Has App` <- as.factor(data_final$`Has App`)
Linear Model
## lm
datout <- lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear +
sum_apparel + sum_accessories + Gender + `Has App` + Age +
`Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`,
data = data_final)
summary(datout)
##
## Call:
## lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear +
## sum_apparel + sum_accessories + Gender + `Has App` + Age +
## `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`,
## data = data_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4369.1 -19.8 -4.5 14.8 11178.7
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 10.731919 0.644607 16.649
## sum_quantity 33.721403 0.150554 223.983
## sum_trans 8.723931 0.132603 65.790
## sum_footwear 1.039640 0.167544 6.205
## sum_apparel -23.424797 0.183919 -127.365
## sum_accessories -31.826785 0.186683 -170.486
## GenderM 2.259359 0.266508 8.478
## `Has App`1 -3.530016 0.404177 -8.734
## Age 0.238447 0.017943 13.289
## `Nearest Outlet DistanceMiles` 0.015577 0.002997 5.198
## `Nearest Retail DistanceMiles` -0.032689 0.008210 -3.981
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## sum_quantity < 0.0000000000000002 ***
## sum_trans < 0.0000000000000002 ***
## sum_footwear 0.000000000547 ***
## sum_apparel < 0.0000000000000002 ***
## sum_accessories < 0.0000000000000002 ***
## GenderM < 0.0000000000000002 ***
## `Has App`1 < 0.0000000000000002 ***
## Age < 0.0000000000000002 ***
## `Nearest Outlet DistanceMiles` 0.000000201469 ***
## `Nearest Retail DistanceMiles` 0.000068539930 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 54.51 on 185164 degrees of freedom
## Multiple R-squared: 0.8576, Adjusted R-squared: 0.8576
## F-statistic: 1.116e+05 on 10 and 185164 DF, p-value: < 0.00000000000000022
plot(datout$fitted.values, datout$residuals) ## Not linear

Random Forest
## train and testing
# sample = sample.split(data_final$sum_retail, SplitRatio = .75)
# train.data = subset(data_final, sample == TRUE)
# test.data = subset(data_final, sample == FALSE)
# datout4 <- randomForest(data_final$sum_retail~data_final$sum_quantity +
# data_final$sum_trans + data_final$sum_footwear +
# data_final$sum_apparel + data_final$sum_accessories +
# data_final$Gender + data_final$`Has App` +
# data_final$Age +
# data_final$`Nearest Outlet DistanceMiles` +
# data_final$`Nearest Retail DistanceMiles`,
# ntree = 30, mtry = 5, max.depth = 5, min.node.size = 5)
# datout4
datoutFinal <- readRDS("./vansFinalModel.rds")
datoutFinal
##
## Call:
## randomForest(formula = data_final$sum_retail ~ data_final$sum_quantity + data_final$sum_trans + data_final$sum_footwear + data_final$sum_apparel + data_final$sum_accessories + data_final$Gender + data_final$`Has App` + data_final$Age + data_final$`Nearest Outlet DistanceMiles` + data_final$`Nearest Retail DistanceMiles`, ntree = 30, mtry = 5, max.depth = 5, min.node.size = 5)
## Type of random forest: regression
## Number of trees: 30
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 4037.324
## % Var explained: 80.65