library(tidyverse)
library(gtools)
library(zoo)
library(e1071)
library(regclass)
library(randomForest)
library(rpart)
library(caTools)
library(fitdistrplus)
library(caret)
con <- read_csv("Desktop/Nov 2020 Data Scientist_Homework Data/Consumers.csv")
con <- as.data.frame(con)
trans <- read.csv("Desktop/Nov 2020 Data Scientist_Homework Data/Transaction Line Items.csv")
returns <- subset(trans, trans$sale_or_return_indicator == "R")
trans = trans[!(trans$transaction_id %in% returns$transaction_id),]
unique(trans$sale_or_return_indicator)
## [1] "S"
trans$Footwear = ifelse(trans$Category == "Footwear", 1,0)
trans$Apparel <- ifelse(trans$Category == "Apparel", 1,0)
trans$Accessories <- ifelse(trans$Category == "Accessories", 1,0)
head(trans)
## transaction_date transaction_id customer_id sale_or_return_indicator
## 1 2019-01-01 1557447392 NA S
## 2 2019-01-01 1557447593 1371878520 S
## 3 2019-01-01 1557447846 1378604859 S
## 4 2019-01-01 1557447846 1378604859 S
## 5 2019-01-01 1557447846 1378604859 S
## 6 2019-01-01 1557447920 1375140094 S
## tender_type net_retail quantity product_code Category Footwear Apparel
## 1 GFTC 36.08 1 448512831124 Apparel 0 1
## 2 VISA 49.92 1 449556730252 Footwear 1 0
## 3 PPAL 9.85 1 447979167652 Accessories 0 0
## 4 PPAL 16.15 2 447979167296 Accessories 0 0
## 5 PPAL 21.20 2 448511665642 Accessories 0 0
## 6 PPAL 31.24 1 895832901841 Footwear 1 0
## Accessories
## 1 0
## 2 0
## 3 1
## 4 1
## 5 1
## 6 0
x <- trans %>% group_by(customer_id) %>%
summarise(sum_retail = sum(net_retail),
sum_quantity = sum(quantity),
sum_trans = n_distinct(transaction_id),
sum_footwear = sum(Footwear),
sum_apparel = sum(Apparel),
sum_accessories = sum(Accessories))
x <- as.data.frame(x)
head(x)
## customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 1 1312310343 8.79 1 1 0 0
## 2 1312311029 64.63 2 1 1 0
## 3 1312311101 41.22 1 1 1 0
## 4 1312311169 49.28 2 1 0 2
## 5 1312312035 41.68 1 1 1 0
## 6 1312312250 131.10 6 2 3 2
## sum_accessories
## 1 1
## 2 1
## 3 0
## 4 0
## 5 0
## 6 0
dat <- left_join(x,con, by = "customer_id")
dat <- dat[order(-dat$sum_retail),]
head(dat)
## customer_id sum_retail sum_quantity sum_trans sum_footwear sum_apparel
## 250662 NA 8682173.39 232782 138023 128737 44154
## 21428 1331753343 132337.76 4612 381 2464 1194
## 84750 1374526609 27465.38 462 101 404 26
## 12075 1323568849 20250.58 706 91 346 158
## 52247 1358781919 18793.94 876 20 592 198
## 85065 1374673179 14158.08 516 66 334 142
## sum_accessories Gender State Has App Age MARITAL STATUS
## 250662 38739 <NA> <NA> NA NA <NA>
## 21428 440 M CA 0 31 <NA>
## 84750 0 M AL 1 24 <NA>
## 12075 38 M CA 0 35 <NA>
## 52247 80 M TX 1 NA <NA>
## 85065 36 M CA 1 33 <NA>
## EST HOUSEHOLD INCOME V5 NUMBER OF PERSONS IN LIVING UNIT
## 250662 <NA> NA
## 21428 E 1
## 84750 <NA> NA
## 12075 U 0
## 52247 <NA> NA
## 85065 <NA> NA
## NUMBER OF ADULTS IN LIVING UNIT CHILDREN: AGE 0-18 VERSION 3
## 250662 NA <NA>
## 21428 1 5U
## 84750 NA <NA>
## 12075 0 00
## 52247 NA <NA>
## 85065 NA <NA>
## Nearest Outlet DistanceMiles Nearest Retail DistanceMiles
## 250662 NA NA
## 21428 9.3 3.7
## 84750 NA NA
## 12075 10.4 1.6
## 52247 NA NA
## 85065 NA NA
## get rid of NA Customer ID's
dat <- subset(dat, dat$customer_id != "NA")
options(scipen=10000)
ggplot(dat, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
ylab("Sum of Retail")+
theme(plot.title = element_text(hjust = 0.5))
## get rid of very extreme outlier
dat2 <- dat[-1,]
summary(dat2$sum_retail)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 47.65 65.57 87.85 102.41 27465.38
## 2.5 sd above the mean
mean(dat2$sum_retail) + 2.5*sd(dat2$sum_retail)
## [1] 407.8555
dat2 <- dat2[dat2$sum_retail < 408,]
summary(dat2$sum_retail)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 47.48 65.22 82.22 100.72 407.93
options(scipen=10000)
ggplot(dat2, aes(y = sum_retail)) + geom_boxplot(outlier.colour="black", outlier.shape=16,
outlier.size=2, notch=FALSE) + ggtitle("Retail Boxplot") +
ylab("Sum of Retail")+
theme(plot.title = element_text(hjust = 0.5))
length(dat2$customer_id)/
length(dat$customer_id)
## [1] 0.9911514
ggplot(dat2, aes(x = sum_retail)) +
geom_density(outlier.colour="black",
outlier.shape=16,
outlier.size=2, notch=FALSE) +
ggtitle("Retail Distribution") +
ylab("Density") + xlab("Retail")+
theme(plot.title = element_text(hjust = 0.5))
## Warning: Ignoring unknown parameters: outlier.colour, outlier.shape,
## outlier.size, notch
summary(dat2$sum_retail)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 47.48 65.22 82.22 100.72 407.93
dat2 = dat2[dat2$sum_retail > 100,]
summary(dat2$sum_retail)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 100.0 117.1 140.8 161.6 185.8 407.9
ggplot(dat2, aes(x = sum_retail)) +
geom_density(outlier.colour="black",
outlier.shape=16,
outlier.size=2, notch=FALSE) +
ggtitle("Top 25th Percent Retail Distribution") +
ylab("Density") + xlab("Retail")+
theme(plot.title = element_text(hjust = 0.5))
## Warning: Ignoring unknown parameters: outlier.colour, outlier.shape,
## outlier.size, notch
# library(kableExtra)
# u <- subset(dat2, dat2$Gender == "U")
# f <- subset(dat2, dat2$Gender == "F")
# m <- subset(dat2, dat2$Gender == "M")
#
# length(u$customer_id)/length(dat2$customer_id)
# length(f$customer_id)/length(dat2$customer_id)
# length(m$customer_id)/length(dat2$customer_id)
#
# gender <- matrix(c(length(u$customer_id)/length(dat2$customer_id),
# length(f$customer_id)/length(dat2$customer_id),
# length(m$customer_id)/length(dat2$customer_id)),
# ncol = 1)
# gender <- round(gender,2)
#
# colnames(gender) <- c("Gender")
# rownames(gender) <- c("Unknown", "Female", "Male")
# gender %>% kbl(caption = "<center><strong>Gender Breakdown<strong><center") %>%
# kable_styling(bootstrap_options = c("striped", "border"))
#
# library(kableExtra)
# dat2$`Has App` <- round(na.aggregate(dat2$`Has App`),0)
# no_app <- subset(dat2, dat2$`Has App` == 0)
# has_app <- subset(dat2, dat2$`Has App` == 1)
#
# length(has_app$customer_id)/length(dat2$customer_id) ## 13.5 of high customers have app
#
# app <- matrix(c(length(has_app$customer_id)/length(dat2$customer_id),
# length(no_app$customer_id)/length(dat2$customer_id)))
# app <- round(app,2)
# colnames(app) <- "Has App"
# rownames(app) <- c("Has App", "No App")
#
# app %>% kbl(caption = "<center><strong>App Breakdown<strong><center") %>%
# kable_styling(bootstrap_options = c("striped", "border"))
#
#
# ggplot(dat2, aes(x="", y=dat2$`Has App`, fill=dat2$`Has App`)) +
# geom_bar(stat="identity", width=1) +
# coord_polar("y", start=0)
age_noNA <- subset(dat2, dat2$Age != "NA")
summary(age_noNA$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 23.00 33.00 33.89 43.00 83.00
plot(density(age_noNA$Age))
ggplot(age_noNA, aes(x = Age)) +
geom_density(outlier.colour="black",
outlier.shape=16,
outlier.size=2, notch=FALSE) +
ggtitle("Top 25th Percent Age Distribution") +
ylab("Density") + xlab("Age")+
theme(plot.title = element_text(hjust = 0.5))
## Warning: Ignoring unknown parameters: outlier.colour, outlier.shape,
## outlier.size, notch
unique(dat2$`MARITAL STATUS`)
## [1] NA "1M" "5M" "0U" "5S" "5U"
marital_status <- subset(dat2, dat2$`MARITAL STATUS` != "<NA>")
marital_status$`MARITAL STATUS` <- as.factor(marital_status$`MARITAL STATUS`)
plot(marital_status$`MARITAL STATUS`, marital_status$sum_retail)
## No Sign of Marital Status Being Significant
unique(dat2$`EST HOUSEHOLD INCOME V5`)
## [1] "U" "J" "F" NA "E" "K" "A" "G" "H" "B" "L" "D" "C" "I"
income <- subset(dat2, dat2$`EST HOUSEHOLD INCOME V5` != "NA")
income$`EST HOUSEHOLD INCOME V5` <- as.factor(income$`EST HOUSEHOLD INCOME V5`)
plot(income$`EST HOUSEHOLD INCOME V5`, income$sum_retail)
## No Sign of Income Being Significant
## Nearest Outlet
nearest <- subset(dat2, dat2$`Nearest Outlet DistanceMiles` != "NA")
plot(nearest$`Nearest Outlet DistanceMiles`, nearest$sum_retail)
## Nearest Retail
retail <- subset(dat2, dat2$`Nearest Retail DistanceMiles` != "NA")
plot(retail$`Nearest Retail DistanceMiles`, retail$sum_retail)
## There Does Seem to Be a Grouping When Closer to Retail or Outlet
# Age
## get rid of very extreme outlier
dat2 <- dat[-1,]
dat2 <- dat2[dat2$sum_retail < 408,]
age <- subset(dat2, dat2$Age != "NA")
plot(density(age$Age))
summary(age$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 16.00 22.00 31.00 32.75 42.00 90.00
## replacing NA with median or mean won't make much difference
dat2$Age <- round(na.aggregate(dat2$Age),0)
dat2$`Has App` <- round(na.aggregate(dat2$`Has App`),0)
# Nearest Outlet
nearest <- subset(dat2, dat2$`Nearest Outlet DistanceMiles` != "NA")
summary(nearest$`Nearest Outlet DistanceMiles`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.10 10.60 19.70 41.73 41.60 1809.80
plot(density(nearest$`Nearest Outlet DistanceMiles`))
## replace with the median not the mean since it is skewed
dat2$`Nearest Outlet DistanceMiles` <- na.aggregate(dat2$`Nearest Outlet DistanceMiles`, FUN = median)
# Nearest Retail
retail <- subset(dat2, dat2$`Nearest Retail DistanceMiles` != "NA")
summary(retail$`Nearest Retail DistanceMiles`)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 3.30 6.30 14.34 13.10 647.20
plot(density(retail$`Nearest Retail DistanceMiles`))
## will replace with median also as the data is skewed
dat2$`Nearest Retail DistanceMiles` <- na.aggregate(dat2$`Nearest Retail DistanceMiles`, FUN = median)
library(e1071)
library(regclass)
library(randomForest)
library(rpart)
dat2 <- dat2[dat2$Gender != "U",]
data_final <- dat2[,c(2:8,10:11,17,18)]
data_final <- data_final[data_final$sum_retail > 0 ,]
data_final$Gender <- as.factor(data_final$Gender)
data_final$`Has App` <- as.factor(data_final$`Has App`)
## lm
datout <- lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear +
sum_apparel + sum_accessories + Gender + `Has App` + Age +
`Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`,
data = data_final)
summary(datout)
##
## Call:
## lm(formula = sum_retail ~ sum_quantity + sum_trans + sum_footwear +
## sum_apparel + sum_accessories + Gender + `Has App` + Age +
## `Nearest Outlet DistanceMiles` + `Nearest Retail DistanceMiles`,
## data = data_final)
##
## Residuals:
## Min 1Q Median 3Q Max
## -706.84 -17.03 -2.91 13.77 302.76
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 8.289412 0.388054 21.362
## sum_quantity 11.889159 0.130557 91.065
## sum_trans 11.595716 0.142089 81.609
## sum_footwear 24.877895 0.153452 162.121
## sum_apparel 6.937089 0.154520 44.894
## sum_accessories -4.499599 0.164970 -27.275
## GenderM 2.443701 0.154533 15.813
## `Has App`1 -5.385020 0.236712 -22.749
## Age 0.060565 0.010461 5.790
## `Nearest Outlet DistanceMiles` 0.016461 0.001737 9.479
## `Nearest Retail DistanceMiles` -0.045285 0.004750 -9.534
## Pr(>|t|)
## (Intercept) < 0.0000000000000002 ***
## sum_quantity < 0.0000000000000002 ***
## sum_trans < 0.0000000000000002 ***
## sum_footwear < 0.0000000000000002 ***
## sum_apparel < 0.0000000000000002 ***
## sum_accessories < 0.0000000000000002 ***
## GenderM < 0.0000000000000002 ***
## `Has App`1 < 0.0000000000000002 ***
## Age 0.00000000706 ***
## `Nearest Outlet DistanceMiles` < 0.0000000000000002 ***
## `Nearest Retail DistanceMiles` < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 31.4 on 183108 degrees of freedom
## Multiple R-squared: 0.7372, Adjusted R-squared: 0.7372
## F-statistic: 5.136e+04 on 10 and 183108 DF, p-value: < 0.00000000000000022
plot(datout$fitted.values, datout$residuals) ## Not linear
library(caret)
boxcox(sum_retail~.,data=data_final)
datout <- lm(log(sum_retail)~.,data = data_final) ## Log Model
datout2 <- lm((sum_retail)^0.6~.,data=data_final) ##
## train and testing
# sample = sample.split(data_final$sum_retail, SplitRatio = .75)
# train.data = subset(data_final, sample == TRUE)
# test.data = subset(data_final, sample == FALSE)
# datout4 <- randomForest(data_final$sum_retail~data_final$sum_quantity +
# data_final$sum_trans + data_final$sum_footwear +
# data_final$sum_apparel + data_final$sum_accessories +
# data_final$Gender + data_final$`Has App` +
# data_final$Age +
# data_final$`Nearest Outlet DistanceMiles` +
# data_final$`Nearest Retail DistanceMiles`,
# ntree = 30, mtry = 5, max.depth = 5, min.node.size = 5)
# datout4
datoutFinal <- readRDS("./vansFinalModel.rds")
datoutFinal
##
## Call:
## randomForest(formula = data_final$sum_retail ~ data_final$sum_quantity + data_final$sum_trans + data_final$sum_footwear + data_final$sum_apparel + data_final$sum_accessories + data_final$Gender + data_final$`Has App` + data_final$Age + data_final$`Nearest Outlet DistanceMiles` + data_final$`Nearest Retail DistanceMiles`, ntree = 30, mtry = 5, max.depth = 5, min.node.size = 5)
## Type of random forest: regression
## Number of trees: 30
## No. of variables tried at each split: 5
##
## Mean of squared residuals: 930.4626
## % Var explained: 75.2