df <- read_csv("../Datasets/Blackwell_Demographic_Data.csv")
## Parsed with column specification:
## cols(
## `in-store` = col_integer(),
## age = col_integer(),
## items = col_integer(),
## amount = col_double(),
## region = col_integer()
## )
This dataset is a data frame of 80000 observations and 5 variables. The variables are in-store, age, items, amount, region:
In-store: The “in-store” column indicates where each transaction was made online (0), or in-store (1).
Age: The “age” column indicates the age of the customer who made the transaction.
Items: The “items” column tracks the number of items the customer purchased.
Amount: The “amount” column records the amount of money spent on the transaction.
Region: The “region” column indicates in which of the four regions the purchase was made (1 = East, 2 = West, 3 = South, 4 = Central).
Some information about the company: Blackwell has been a successful electronics retailer for over 40 years, with over 30 stores in the Southeast. A little over a year ago we launched our eCommerce website.
Total amount: 6.687357610^{7}$.
Do customers in different regions spend more per transaction? Which regions spend the most/least?
Are there differences in the age of customers between regions? If so, can we predict the age of a customer in a region based on other demographic data?
We need to investigate Martin’s hypothesis: Is there any correlation between age of a customer and if the transaction was made online or in the store? Do any other factors predict if a customer will buy online or in our stores?
Finally, is there a relationship between number of items purchased and amount spent?
There is 0 missing values.
Summary of the dataset:
knitr::kable (summary(df))
|
|
|
|
| |
|---|---|---|---|---|---|
| Min. :0.0000 | Min. :18.00 | Min. :1.000 | Min. : 5.0 | Min. :1.000 | |
| 1st Qu.:0.0000 | 1st Qu.:33.00 | 1st Qu.:3.000 | 1st Qu.: 285.1 | 1st Qu.:2.000 | |
| Median :0.0000 | Median :45.00 | Median :4.000 | Median : 582.3 | Median :3.000 | |
| Mean :0.4561 | Mean :45.76 | Mean :4.505 | Mean : 835.9 | Mean :2.675 | |
| 3rd Qu.:1.0000 | 3rd Qu.:56.00 | 3rd Qu.:6.000 | 3rd Qu.:1233.7 | 3rd Qu.:4.000 | |
| Max. :1.0000 | Max. :85.00 | Max. :8.000 | Max. :3000.0 | Max. :4.000 |
Notes:
In store, items and region are a factors with different levels –> change that.
Which is the distribution of age for all the dataset? And for each region?
Distribution of amount by region and for all the dataset.
Factorizing variables:
df$`in-store` <- factor(df$`in-store`, levels = c(0,1), labels = c("Online","In-store"))
df$items <- factor(df$items)
df$region <- factor(df$region, levels = c(1,2,3,4), labels = c("East","West","South","Central"))
Now the information is easier to understand:
kable (summary(df))
|
|
|
|
| |
|---|---|---|---|---|---|
| Online :43516 | Min. :18.00 | 4 :11596 | Min. : 5.0 | East :16000 | |
| In-store:36484 | 1st Qu.:33.00 | 6 :11522 | 1st Qu.: 285.1 | West :20000 | |
| NA | Median :45.00 | 3 :11487 | Median : 582.3 | South :18000 | |
| NA | Mean :45.76 | 7 :11378 | Mean : 835.9 | Central:26000 | |
| NA | 3rd Qu.:56.00 | 2 :11290 | 3rd Qu.:1233.7 | NA | |
| NA | Max. :85.00 | 5 :11238 | Max. :3000.0 | NA | |
| NA | NA | (Other):11489 | NA | NA |
df %>%
group_by(region) %>%
summarise(TotalAmount = sum(amount)) %>%
arrange(desc(TotalAmount)) %>%
print() %>%
ggplot(aes(region,TotalAmount, fill = region))+
geom_bar(stat = "identity")
## # A tibble: 4 x 2
## region TotalAmount
## <fct> <dbl>
## 1 Central 33385352.
## 2 South 16523454.
## 3 East 11922584.
## 4 West 5042186.
df %>%
ggplot(aes(amount, color = region))+
geom_freqpoly(binwidth = 50)
kable(df %>%
filter(region == "West") %>%
filter(amount == max(amount) | amount == min(amount)) %>%
select(amount) %>%
arrange(desc(amount)))
| amount |
|---|
| 499.94 |
| 5.00 |
| amount |
|---|
| 1999.80 |
| 50.05 |
| amount |
|---|
| 3000.00 |
| 50.13 |
| amount |
|---|
| 3000.0 |
| 50.6 |
df %>%
group_by(`in-store`) %>%
summarise(TotalAmount = sum(amount)) %>%
arrange(desc(TotalAmount)) %>%
print() %>%
ggplot(aes(`in-store`,TotalAmount, fill = `in-store`))+
geom_bar(stat = "identity")
## # A tibble: 2 x 2
## `in-store` TotalAmount
## <fct> <dbl>
## 1 Online 38531657.
## 2 In-store 28341919.
df %>%
ggplot(aes(age)) +
geom_histogram(bins = 30, color = "blue", fill = "white") +
facet_wrap(~region, ncol = 2)
kable(df %>%
filter(region == "Central") %>%
filter(age == max(age) | age == min(age)) %>%
select(age) %>%
unique() %>%
arrange(desc(age)))
| age |
|---|
| 63 |
| 18 |
| age |
|---|
| 74 |
| 19 |
| age |
|---|
| 74 |
| 18 |
| age |
|---|
| 85 |
| 28 |
To make data more understand, we will discretize the age by different groups:
df$age <- cut(df$age, breaks = 4)
summary(df$age)
## (17.9,34.8] (34.8,51.5] (51.5,68.2] (68.2,85.1]
## 22454 29723 19848 7975
df %>%
group_by(region, age) %>%
summarise(TotalAmount = sum(amount)) %>%
arrange(desc(TotalAmount)) %>%
print() %>%
ggplot(aes(age, TotalAmount)) +
geom_bar(stat = "identity", aes(fill = age), show.legend = TRUE, width = .5) +
facet_wrap(~region, ncol = 2)
## # A tibble: 15 x 3
## # Groups: region [4]
## region age TotalAmount
## <fct> <fct> <dbl>
## 1 Central (34.8,51.5] 13904361.
## 2 Central (17.9,34.8] 13081734.
## 3 Central (51.5,68.2] 6399258.
## 4 South (34.8,51.5] 6047671.
## 5 South (17.9,34.8] 5161244.
## 6 East (34.8,51.5] 4990490.
## 7 South (51.5,68.2] 4697042.
## 8 East (17.9,34.8] 4128797.
## 9 East (51.5,68.2] 2298870.
## 10 West (51.5,68.2] 1529149.
## 11 West (68.2,85.1] 1471742.
## 12 West (34.8,51.5] 1461438.
## 13 South (68.2,85.1] 617497.
## 14 West (17.9,34.8] 579857.
## 15 East (68.2,85.1] 504427.
library(rpart)
library(rpart.plot)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
training.ids <- createDataPartition(df$amount,
p = 0.7,
list = F)
mod <- rpart(amount ~.,
data = df[training.ids,],
method = "anova",
control = rpart.control(minsplit = 20, cp = 0.01))
prp(mod, type = 2, extra = 100, nn = TRUE,
fallen.leaves = TRUE, faclen = 4, varlen = 8,
shadow.col = "gray", roundint=FALSE)