setwd("C:\\Users\\hi\\Documents")
dataset <- read.csv("BlackFriday.csv")
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 3.5.2
## -- Attaching packages ---------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 3.1.1 v purrr 0.3.2
## v tibble 2.1.3 v dplyr 0.8.1
## v tidyr 0.8.3 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.4.0
## Warning: package 'ggplot2' was built under R version 3.5.3
## Warning: package 'tidyr' was built under R version 3.5.3
## Warning: package 'readr' was built under R version 3.5.2
## Warning: package 'purrr' was built under R version 3.5.3
## Warning: package 'dplyr' was built under R version 3.5.3
## Warning: package 'stringr' was built under R version 3.5.2
## Warning: package 'forcats' was built under R version 3.5.3
## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(scales)
## Warning: package 'scales' was built under R version 3.5.2
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(gridExtra)
## Warning: package 'gridExtra' was built under R version 3.5.2
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
#The tidyverse package is what we will use for visualizing and exploring our dataset.
#It is knows for easy-to-read syntax and massive amounts of useful functions.
#The scales package will be used mainly to customize plot axis
summary(dataset)
## User_ID Product_ID Gender Age
## Min. :1000001 P00265242: 1858 F:132197 0-17 : 14707
## 1st Qu.:1001495 P00110742: 1591 M:405380 18-25: 97634
## Median :1003031 P00025442: 1586 26-35:214690
## Mean :1002992 P00112142: 1539 36-45:107499
## 3rd Qu.:1004417 P00057642: 1430 46-50: 44526
## Max. :1006040 P00184942: 1424 51-55: 37618
## (Other) :528149 55+ : 20903
## Occupation City_Category Stay_In_Current_City_Years
## Min. : 0.000 A:144638 0 : 72725
## 1st Qu.: 2.000 B:226493 1 :189192
## Median : 7.000 C:166446 2 : 99459
## Mean : 8.083 3 : 93312
## 3rd Qu.:14.000 4+: 82889
## Max. :20.000
##
## Marital_Status Product_Category_1 Product_Category_2 Product_Category_3
## Min. :0.0000 Min. : 1.000 Min. : 2.00 Min. : 3.0
## 1st Qu.:0.0000 1st Qu.: 1.000 1st Qu.: 5.00 1st Qu.: 9.0
## Median :0.0000 Median : 5.000 Median : 9.00 Median :14.0
## Mean :0.4088 Mean : 5.296 Mean : 9.84 Mean :12.7
## 3rd Qu.:1.0000 3rd Qu.: 8.000 3rd Qu.:15.00 3rd Qu.:16.0
## Max. :1.0000 Max. :18.000 Max. :18.00 Max. :18.0
## NA's :166986 NA's :373299
## Purchase
## Min. : 185
## 1st Qu.: 5866
## Median : 8062
## Mean : 9334
## 3rd Qu.:12073
## Max. :23961
##
head(dataset, 5)
## User_ID Product_ID Gender Age Occupation City_Category
## 1 1000001 P00069042 F 0-17 10 A
## 2 1000001 P00248942 F 0-17 10 A
## 3 1000001 P00087842 F 0-17 10 A
## 4 1000001 P00085442 F 0-17 10 A
## 5 1000002 P00285442 M 55+ 16 C
## Stay_In_Current_City_Years Marital_Status Product_Category_1
## 1 2 0 3
## 2 2 0 1
## 3 2 0 12
## 4 2 0 12
## 5 4+ 0 8
## Product_Category_2 Product_Category_3 Purchase
## 1 NA NA 8370
## 2 6 14 15200
## 3 NA NA 1422
## 4 14 NA 1057
## 5 NA NA 7969
#It looks like we have 12 different columns, each representing a corresponding variable below.
#User_ID: Unique identifier of shopper.
#Product_ID: Unique identifier of product. (No key given)
#Gender: Sex of shopper.
#Age: Age of shopper split into bins.
#Occupation: Occupation of shopper. (No key given)
#City_Category: Residence location of shopper. (No key given)
#Stay_In_Current_City_Years: Number of years stay in current city.
#Marital_Status: Marital status of shopper.
#Product_Category_1: Product category of purchase.
#Product_Category_2: Product may belong to other category.
#Product_Category_3: Product may belong to other category.
#Purchase: Purchase amount in dollars.
########################TO BEGAIN OUR EXPLORATION ##############################################################################3
######## GENDER ##########################
dataset_gender =dataset %>% select(User_ID,Gender) %>% group_by(User_ID) %>% distinct()
head(dataset_gender )
## # A tibble: 6 x 2
## # Groups: User_ID [6]
## User_ID Gender
## <int> <fct>
## 1 1000001 F
## 2 1000002 M
## 3 1000003 M
## 4 1000004 M
## 5 1000005 M
## 6 1000006 F
View(dataset_gender )
summary(dataset_gender$Gender)
## F M
## 1666 4225
plot(dataset_gender$Gender)

genderDist = ggplot(data = dataset_gender) +
geom_bar(mapping = aes(x = Gender, fill = Gender)) +
labs(title = 'Gender of Customers') +
scale_fill_brewer(palette = 'PuBuGn')
print(genderDist)

#As we can see, there are quite a few more males than females shopping at our store on Black Friday
#Average purchase by gender
dataset_sum = dataset %>% select(User_ID,Gender,Purchase) %>% group_by(Gender) %>% summarise(total_sum=as.numeric(sum(Purchase))) %>% arrange()
dataset_avg_M = 3853044357/4225
dataset_avg_F =1164624021/1666
Value <- c (dataset_avg_M,dataset_avg_F)
gender <- c ('M','F')
average_spending_gender <- data.frame(Value,gender)
View(average_spending_gender)
str(average_spending_gender)
## 'data.frame': 2 obs. of 2 variables:
## $ Value : num 911963 699054
## $ gender: Factor w/ 2 levels "F","M": 2 1
#We can see that that the average transaction for Females was 699054.00 and the average transaction for Males was 911963.20.
#Let visualize our results
class(average_spending_gender)
## [1] "data.frame"
genderAverage = ggplot(data = average_spending_gender) +
geom_bar(mapping = aes(x = gender, y = Value, fill = gender), stat = 'identity') +
labs(title = 'Average Spending by Gender') +
scale_fill_brewer(palette = 'PuBuGn')
print(genderAverage)

################## TOP SELLERS #########################################
TOP_seller <- dataset %>% select(Product_ID,Purchase) %>% group_by(Product_ID) %>% summarise(sumproduct = sum(Purchase)) %>% arrange()
View(head(TOP_seller, 10))
#Now that we have Identified our top 5 best selling products, lets examine the best selling product, P00265242.
best_seller = dataset[dataset$Product_ID == 'P00265242', ]
head(best_seller)
## User_ID Product_ID Gender Age Occupation City_Category
## 400 1000066 P00265242 M 26-35 18 C
## 1192 1000196 P00265242 F 36-45 9 C
## 1373 1000222 P00265242 M 26-35 1 A
## 1846 1000301 P00265242 M 18-25 4 B
## 2210 1000345 P00265242 M 26-35 12 A
## 2405 1000383 P00265242 F 26-35 7 A
## Stay_In_Current_City_Years Marital_Status Product_Category_1
## 400 2 0 5
## 1192 4+ 0 5
## 1373 1 0 5
## 1846 4+ 0 5
## 2210 2 1 5
## 2405 4+ 1 5
## Product_Category_2 Product_Category_3 Purchase
## 400 8 NA 8652
## 1192 8 NA 8767
## 1373 8 NA 6944
## 1846 8 NA 8628
## 2210 8 NA 8593
## 2405 8 NA 6998
#We can see that this product fits into Product_Category_1 = 5 and Product_Category_2 = 8. As mentioned in the introduction,
#it would be useful to have a key to reference the item name in order to determine what it is.
#Another interesting finding is that even though people are purchasing the same product,
#they are paying different prices. This could be due to various Black Friday promotions,
#discounts, or coupon codes. Otherwise, investigation would need to be done regarding the reason for different purchase prices of the same product between customers.
###Lets continue to analyze our best seller to see if any relationship to Gender exits.
genderDist_bs = ggplot(data = best_seller) +
geom_bar(mapping = aes(x = Gender, y = ..count.., fill = Gender)) +
labs(title = 'Gender of Customers (Best Seller)') +
scale_fill_brewer(palette = 'PuBuGn')
print(genderDist_bs)

################# AGE ###################################
#Lets begin examining Age by creating a table of each individual age group and their respective counts.
customers_age = dataset %>% select(Age, User_ID) %>% group_by(Age) %>% distinct(User_ID) %>% summarise(n())
names(customers_age) <- c("age","num")
customers_age_vis = ggplot(data = customers_age) +
geom_bar(color = 'black', stat = 'identity', mapping = aes(x = age, y = num, fill = age)) +
labs(title = 'Age of Customers') +
theme(axis.text.x = element_text(size = 10)) +
scale_fill_brewer(palette = 'Blues') +
theme(legend.position="none")
print(customers_age_vis)

#We can also plot a similar chart depicting the distribution of age within our "best seller" category.
#This will show us if there is a specific age category that purchased the best selling product more than other shoppers.
###########################################################################################################################
##################################### city #################################################
##################################### #################################################
customers_location = dataset %>%
select(User_ID, City_Category) %>%
distinct()
head(customers_location)
## User_ID City_Category
## 1 1000001 A
## 2 1000002 C
## 3 1000003 A
## 4 1000004 B
## 5 1000005 A
## 6 1000006 A
customers_location_vis = ggplot(data = customers_location) +
geom_bar(color = 'white', mapping = aes(x = City_Category, y = ..count.., fill = City_Category)) +
labs(title = 'Location of Customers') +
scale_fill_brewer(palette = "Dark2") +
theme(legend.position="none")
print(customers_location_vis)

#We can see that most of our customers live in City C.
#Now, we can compute the total purchase amount by City to see the which city's customers spent the most at our store.
purchases_city = dataset %>%
group_by(City_Category) %>%
summarise(Purchases = sum(Purchase))
purchases_city_1000s = purchases_city %>%
mutate(purchasesThousands = purchases_city$Purchases / 1000)
purchases_city_1000s
## # A tibble: 3 x 3
## City_Category Purchases purchasesThousands
## <fct> <int> <dbl>
## 1 A 1295668797 1295669.
## 2 B 2083431612 2083432.
## 3 C 1638567969 1638568.
#In order to work with larger numbers, we divided the Purchases column/1000.
#This is a common practice within the business and acounting world, and it makes large numbers easier to read and chart.
purchaseCity_vis = ggplot(data = purchases_city_1000s, aes(x = City_Category, y = purchasesThousands, fill = City_Category)) +
geom_bar(color = 'white', stat = 'identity') +
labs(title = 'Total Customer Purchase Amount (by City)', y = '($000s)', x = 'City Category') +
scale_fill_brewer(palette = "Dark2") +
theme(legend.position="none", plot.title = element_text(size = 9))
print(purchaseCity_vis)

grid.arrange(customers_location_vis, purchaseCity_vis, ncol=2)

#Here we can see that customers from City C were the most frequent shoppers at our store on Black Friday
#but Customers from City B had the highest amount of total purchases.
#Let's continue to investigate and try to determine the reason for this observation.
#Lets find how many purchases were made by customers from each city.
#First, we will get the total number of purchases for each corresponding User_ID.
customers = dataset %>%
group_by(User_ID) %>%
count(User_ID)
head(customers)
## # A tibble: 6 x 2
## # Groups: User_ID [6]
## User_ID n
## <int> <int>
## 1 1000001 34
## 2 1000002 76
## 3 1000003 29
## 4 1000004 13
## 5 1000005 106
## 6 1000006 46
#This tells us how many times a certain user made a purchase.
#dive deeper lets compute the total purchase amount for each user, then join it with the other table
customers_City = dataset %>%
select(User_ID, City_Category) %>%
group_by(User_ID) %>%
distinct() %>%
ungroup() %>%
left_join(customers, customers_City, by = 'User_ID')
head(customers_City)
## # A tibble: 6 x 3
## User_ID City_Category n
## <int> <fct> <int>
## 1 1000001 A 34
## 2 1000002 C 76
## 3 1000003 A 29
## 4 1000004 B 13
## 5 1000005 A 106
## 6 1000006 A 46
city_purchases_count = customers_City %>%
select(City_Category, n) %>%
group_by(City_Category) %>%
summarise(CountOfPurchases = sum(n))
city_purchases_count
## # A tibble: 3 x 2
## City_Category CountOfPurchases
## <fct> <int>
## 1 A 144638
## 2 B 226493
## 3 C 166446
city_count_purchases_vis = ggplot(data = city_purchases_count, aes(x = City_Category, y = CountOfPurchases, fill = City_Category)) +
geom_bar(color = 'white', stat = 'identity') +
labs(title = 'Total Purchase Count (by City)', y = 'Count', x = 'City Category') +
scale_fill_brewer(palette = "Dark2") +
theme(legend.position="none", plot.title = element_text(size = 9))
print(city_count_purchases_vis)

grid.arrange(purchaseCity_vis, city_count_purchases_vis, ncol = 2)

#One inference we can make from these charts is that customers from City B are simply making more purchases than residence of City A + City C,
#and not necessarily buying more expensive products.
#We can make this assumption due to the fact that the "Total Count of Purchases" chart has a very similar appearance to the "Total Customer Purchase Amount" chart.
#If it were the other case, then customers from City B would most likely have a lower count of total purchases corresponding to a higher total purchase amount.
head(best_seller)
## User_ID Product_ID Gender Age Occupation City_Category
## 400 1000066 P00265242 M 26-35 18 C
## 1192 1000196 P00265242 F 36-45 9 C
## 1373 1000222 P00265242 M 26-35 1 A
## 1846 1000301 P00265242 M 18-25 4 B
## 2210 1000345 P00265242 M 26-35 12 A
## 2405 1000383 P00265242 F 26-35 7 A
## Stay_In_Current_City_Years Marital_Status Product_Category_1
## 400 2 0 5
## 1192 4+ 0 5
## 1373 1 0 5
## 1846 4+ 0 5
## 2210 2 1 5
## 2405 4+ 1 5
## Product_Category_2 Product_Category_3 Purchase
## 400 8 NA 8652
## 1192 8 NA 8767
## 1373 8 NA 6944
## 1846 8 NA 8628
## 2210 8 NA 8593
## 2405 8 NA 6998
best_seller_city = best_seller %>%
select(User_ID, City_Category) %>%
distinct() %>%
count(City_Category)
best_seller_city
## # A tibble: 3 x 2
## City_Category n
## <fct> <int>
## 1 A 396
## 2 B 626
## 3 C 836
best_seller_city_vis = ggplot(data = best_seller_city, aes(x = City_Category, y = n, fill = City_Category)) +
geom_bar(color = 'white', stat = 'identity') +
labs(title = 'Best Seller Purchase Count (by City)', y = 'Count', x = 'City Category') +
scale_fill_brewer(palette = "Blues") +
theme(legend.position="none", plot.title = element_text(size = 9))
grid.arrange(city_count_purchases_vis,best_seller_city_vis, ncol = 2)

#An interesting revelation has been made! Although customers residing in City C purchase more of our "best seller" than City A + B,
#residents of City C fall behind City B in overall number of purchases.
################################################################################################################################
#######################3 Stay in Current City ###########################
######################## ############################
customers_stay = dataset %>%
select(User_ID, City_Category, Stay_In_Current_City_Years) %>%
group_by(User_ID) %>%
distinct()
head(customers_stay)
## # A tibble: 6 x 3
## # Groups: User_ID [6]
## User_ID City_Category Stay_In_Current_City_Years
## <int> <fct> <fct>
## 1 1000001 A 2
## 2 1000002 C 4+
## 3 1000003 A 3
## 4 1000004 B 2
## 5 1000005 A 1
## 6 1000006 A 1
#Now that we have our dataset in order, we can plot and explore.
residence = customers_stay %>%
group_by(City_Category) %>%
tally()
head(residence)
## # A tibble: 3 x 2
## City_Category n
## <fct> <int>
## 1 A 1045
## 2 B 1707
## 3 C 3139
#Looks like most of our customers are living in City C. Now, lets investigate further
customers_stay_vis = ggplot(data = customers_stay, aes(x = Stay_In_Current_City_Years, y = ..count.., fill = Stay_In_Current_City_Years)) +
geom_bar(stat = 'count') +
scale_fill_brewer(palette = 15) +
labs(title = 'Customers Stay in Current City', y = 'Count', x = 'Stay in Current City', fill = 'Number of Years in Current City')
print(customers_stay_vis)

#It looks like most of our customers have only been living in their respective cities for 1 year.
#In order to see a better distribution, lets make a stacked bar chart according to each City_Category.
stay_cities = customers_stay %>%
group_by(City_Category, Stay_In_Current_City_Years) %>%
tally() %>%
mutate(Percentage = (n/sum(n))*100)
head(stay_cities)
## # A tibble: 6 x 4
## # Groups: City_Category [2]
## City_Category Stay_In_Current_City_Years n Percentage
## <fct> <fct> <int> <dbl>
## 1 A 0 147 14.1
## 2 A 1 370 35.4
## 3 A 2 183 17.5
## 4 A 3 180 17.2
## 5 A 4+ 165 15.8
## 6 B 0 211 12.4
ggplot(data = stay_cities, aes(x = City_Category, y = n, fill = Stay_In_Current_City_Years)) +
geom_bar(stat = "identity", color = 'white') +
scale_fill_brewer(palette = 2) +
labs(title = "City Category + Stay in Current City",
y = "Total Count (Years)",
x = "City",
fill = "Stay Years")

#Looking at this chart we can see the distribution of the total customer base and their respective city residences,
#split by the amount of time they have lived there.
#Here, we can notice that in every City_Category, the most common stay length seems to be 1 year.
###################################################################################################################
################################ Marital Status ####################
dataset_maritalStatus = dataset %>%
select(User_ID, Marital_Status) %>%
group_by(User_ID) %>%
distinct()
head(dataset_maritalStatus)
## # A tibble: 6 x 2
## # Groups: User_ID [6]
## User_ID Marital_Status
## <int> <int>
## 1 1000001 0
## 2 1000002 0
## 3 1000003 0
## 4 1000004 1
## 5 1000005 1
## 6 1000006 0
#Note, we need to quickly change Marital_Status from a numeric variable to a categorical type
dataset_maritalStatus$Marital_Status = as.character(dataset_maritalStatus$Marital_Status)
typeof(dataset_maritalStatus$Marital_Status)
## [1] "character"
#If we look back at the variable descriptions of the dataset,
#we don't have a clear guide for marital status.
#In other cases,
#it would be best to reach out to the provider of the data to be completely sure
#of what the values in a column represent but in this case, we will assume that 1 = married and 0 = single.
marital_vis = ggplot(data = dataset_maritalStatus) +
geom_bar(mapping = aes(x = Marital_Status, y = ..count.., fill = Marital_Status)) +
labs(title = 'Marital Status') +
scale_fill_brewer(palette = 'Pastel2')
print(marital_vis)

# looks like most of our shoppers happen to be single or unmarried.
#Similar to our investigation of age groups, we can look at the makeup of Marital_Status in each City_Category.
dataset_maritalStatus = dataset_maritalStatus %>%
full_join(customers_stay, by = 'User_ID')
head(dataset_maritalStatus)
## # A tibble: 6 x 4
## # Groups: User_ID [6]
## User_ID Marital_Status City_Category Stay_In_Current_City_Years
## <int> <chr> <fct> <fct>
## 1 1000001 0 A 2
## 2 1000002 0 C 4+
## 3 1000003 0 A 3
## 4 1000004 1 B 2
## 5 1000005 1 A 1
## 6 1000006 0 A 1
maritalStatus_cities = dataset_maritalStatus %>%
group_by(City_Category, Marital_Status) %>%
tally()
head(maritalStatus_cities)
## # A tibble: 6 x 3
## # Groups: City_Category [3]
## City_Category Marital_Status n
## <fct> <chr> <int>
## 1 A 0 652
## 2 A 1 393
## 3 B 0 1004
## 4 B 1 703
## 5 C 0 1761
## 6 C 1 1378
top_sellers = dataset %>%
count(Product_ID, sort = TRUE)
top_5 = head(top_sellers, 5)
best_seller = dataset[dataset$Product_ID == 'P00265242', ]
head(best_seller)
## User_ID Product_ID Gender Age Occupation City_Category
## 400 1000066 P00265242 M 26-35 18 C
## 1192 1000196 P00265242 F 36-45 9 C
## 1373 1000222 P00265242 M 26-35 1 A
## 1846 1000301 P00265242 M 18-25 4 B
## 2210 1000345 P00265242 M 26-35 12 A
## 2405 1000383 P00265242 F 26-35 7 A
## Stay_In_Current_City_Years Marital_Status Product_Category_1
## 400 2 0 5
## 1192 4+ 0 5
## 1373 1 0 5
## 1846 4+ 0 5
## 2210 2 1 5
## 2405 4+ 1 5
## Product_Category_2 Product_Category_3 Purchase
## 400 8 NA 8652
## 1192 8 NA 8767
## 1373 8 NA 6944
## 1846 8 NA 8628
## 2210 8 NA 8593
## 2405 8 NA 6998
genderDist_bs = ggplot(data = best_seller) +
geom_bar(mapping = aes(x = Gender, y = ..count.., fill = Gender)) +
labs(title = 'Gender of Customers (Best Seller)') +
scale_fill_brewer(palette = 'PuBuGn')
print(genderDist_bs)

genderDist_bs_prop = ggplot(data = best_seller) +
geom_bar(fill = 'lightblue', mapping = aes(x = Gender, y = ..prop.., group = 1, fill = Gender)) +
labs(title = 'Gender of Customers (Best Seller - Proportion)') +
theme(plot.title = element_text(size=9.5))
genderDist_prop = ggplot(data = dataset_gender) +
geom_bar(fill = "lightblue4", mapping = aes(x = Gender, y = ..prop.., group = 1)) +
labs(title = 'Gender of Customers (Total Proportion)') +
theme(plot.title = element_text(size=9.5))
grid.arrange(genderDist_prop, genderDist_bs_prop, ncol=2)

customers_age = dataset %>%
select(User_ID, Age) %>%
distinct() %>%
count(Age)
customers_age
## # A tibble: 7 x 2
## Age n
## <fct> <int>
## 1 0-17 218
## 2 18-25 1069
## 3 26-35 2053
## 4 36-45 1167
## 5 46-50 531
## 6 51-55 481
## 7 55+ 372
customers_age_vis = ggplot(data = customers_age) +
geom_bar(color = 'black', stat = 'identity', mapping = aes(x = Age, y = n, fill = Age)) +
labs(title = 'Age of Customers') +
theme(axis.text.x = element_text(size = 10)) +
scale_fill_brewer(palette = 'Blues') +
theme(legend.position="none")
print(customers_age_vis)

ageDist_bs = ggplot(data = best_seller) +
geom_bar(color = 'black', mapping = aes(x = Age, y = ..count.., fill = Age)) +
labs(title = 'Age of Customers (Best Seller)') +
theme(axis.text.x = element_text(size = 10)) +
scale_fill_brewer(palette = 'GnBu') +
theme(legend.position="none")
print(ageDist_bs)

grid.arrange(customers_age_vis, ageDist_bs, ncol=2)

customers_location = dataset %>%
select(User_ID, City_Category) %>%
distinct()
head(customers_location)
## User_ID City_Category
## 1 1000001 A
## 2 1000002 C
## 3 1000003 A
## 4 1000004 B
## 5 1000005 A
## 6 1000006 A
customers_location_vis = ggplot(data = customers_location) +
geom_bar(color = 'white', mapping = aes(x = City_Category, y = ..count.., fill = City_Category)) +
labs(title = 'Location of Customers') +
scale_fill_brewer(palette = "Dark2") +
theme(legend.position="none")
print(customers_location_vis)

purchases_city = dataset %>%
group_by(City_Category) %>%
summarise(Purchases = sum(Purchase))
purchases_city_1000s = purchases_city %>%
mutate(purchasesThousands = purchases_city$Purchases / 1000)
purchases_city_1000s
## # A tibble: 3 x 3
## City_Category Purchases purchasesThousands
## <fct> <int> <dbl>
## 1 A 1295668797 1295669.
## 2 B 2083431612 2083432.
## 3 C 1638567969 1638568.
purchaseCity_vis = ggplot(data = purchases_city_1000s, aes(x = City_Category, y = purchasesThousands, fill = City_Category)) +
geom_bar(color = 'white', stat = 'identity') +
labs(title = 'Total Customer Purchase Amount (by City)', y = '($000s)', x = 'City Category') +
scale_fill_brewer(palette = "Dark2") +
theme(legend.position="none", plot.title = element_text(size = 9))
print(purchaseCity_vis)

grid.arrange(customers_location_vis, purchaseCity_vis, ncol=2)

customers = dataset %>%
group_by(User_ID) %>%
count(User_ID)
head(customers)
## # A tibble: 6 x 2
## # Groups: User_ID [6]
## User_ID n
## <int> <int>
## 1 1000001 34
## 2 1000002 76
## 3 1000003 29
## 4 1000004 13
## 5 1000005 106
## 6 1000006 46
customers_City = dataset %>%
select(User_ID, City_Category) %>%
group_by(User_ID) %>%
distinct() %>%
ungroup() %>%
left_join(customers, customers_City, by = 'User_ID')
head(customers_City)
## # A tibble: 6 x 3
## User_ID City_Category n
## <int> <fct> <int>
## 1 1000001 A 34
## 2 1000002 C 76
## 3 1000003 A 29
## 4 1000004 B 13
## 5 1000005 A 106
## 6 1000006 A 46
city_purchases_count = customers_City %>%
select(City_Category, n) %>%
group_by(City_Category) %>%
summarise(CountOfPurchases = sum(n))
city_purchases_count
## # A tibble: 3 x 2
## City_Category CountOfPurchases
## <fct> <int>
## 1 A 144638
## 2 B 226493
## 3 C 166446
city_count_purchases_vis = ggplot(data = city_purchases_count, aes(x = City_Category, y = CountOfPurchases, fill = City_Category)) +
geom_bar(color = 'white', stat = 'identity') +
labs(title = 'Total Purchase Count (by City)', y = 'Count', x = 'City Category') +
scale_fill_brewer(palette = "Dark2") +
theme(legend.position="none", plot.title = element_text(size = 9))
print(city_count_purchases_vis)

grid.arrange(purchaseCity_vis, city_count_purchases_vis, ncol = 2)

head(best_seller)
## User_ID Product_ID Gender Age Occupation City_Category
## 400 1000066 P00265242 M 26-35 18 C
## 1192 1000196 P00265242 F 36-45 9 C
## 1373 1000222 P00265242 M 26-35 1 A
## 1846 1000301 P00265242 M 18-25 4 B
## 2210 1000345 P00265242 M 26-35 12 A
## 2405 1000383 P00265242 F 26-35 7 A
## Stay_In_Current_City_Years Marital_Status Product_Category_1
## 400 2 0 5
## 1192 4+ 0 5
## 1373 1 0 5
## 1846 4+ 0 5
## 2210 2 1 5
## 2405 4+ 1 5
## Product_Category_2 Product_Category_3 Purchase
## 400 8 NA 8652
## 1192 8 NA 8767
## 1373 8 NA 6944
## 1846 8 NA 8628
## 2210 8 NA 8593
## 2405 8 NA 6998
best_seller_city = best_seller %>%
select(User_ID, City_Category) %>%
distinct() %>%
count(City_Category)
best_seller_city
## # A tibble: 3 x 2
## City_Category n
## <fct> <int>
## 1 A 396
## 2 B 626
## 3 C 836
best_seller_city_vis = ggplot(data = best_seller_city, aes(x = City_Category, y = n, fill = City_Category)) +
geom_bar(color = 'white', stat = 'identity') +
labs(title = 'Best Seller Purchase Count (by City)', y = 'Count', x = 'City Category') +
scale_fill_brewer(palette = "Blues") +
theme(legend.position="none", plot.title = element_text(size = 9))
grid.arrange(city_count_purchases_vis,best_seller_city_vis, ncol = 2)

customers_stay = dataset %>%
select(User_ID, City_Category, Stay_In_Current_City_Years) %>%
group_by(User_ID) %>%
distinct()
head(customers_stay)
## # A tibble: 6 x 3
## # Groups: User_ID [6]
## User_ID City_Category Stay_In_Current_City_Years
## <int> <fct> <fct>
## 1 1000001 A 2
## 2 1000002 C 4+
## 3 1000003 A 3
## 4 1000004 B 2
## 5 1000005 A 1
## 6 1000006 A 1
residence = customers_stay %>%
group_by(City_Category) %>%
tally()
head(residence)
## # A tibble: 3 x 2
## City_Category n
## <fct> <int>
## 1 A 1045
## 2 B 1707
## 3 C 3139
customers_stay_vis = ggplot(data = customers_stay, aes(x = Stay_In_Current_City_Years, y = ..count.., fill = Stay_In_Current_City_Years)) +
geom_bar(stat = 'count') +
scale_fill_brewer(palette = 15) +
labs(title = 'Customers Stay in Current City', y = 'Count', x = 'Stay in Current City', fill = 'Number of Years in Current City')
print(customers_stay_vis)

stay_cities = customers_stay %>%
group_by(City_Category, Stay_In_Current_City_Years) %>%
tally() %>%
mutate(Percentage = (n/sum(n))*100)
head(stay_cities)
## # A tibble: 6 x 4
## # Groups: City_Category [2]
## City_Category Stay_In_Current_City_Years n Percentage
## <fct> <fct> <int> <dbl>
## 1 A 0 147 14.1
## 2 A 1 370 35.4
## 3 A 2 183 17.5
## 4 A 3 180 17.2
## 5 A 4+ 165 15.8
## 6 B 0 211 12.4
ggplot(data = stay_cities, aes(x = City_Category, y = n, fill = Stay_In_Current_City_Years)) +
geom_bar(stat = "identity", color = 'white') +
scale_fill_brewer(palette = 2) +
labs(title = "City Category + Stay in Current City",
y = "Total Count (Years)",
x = "City",
fill = "Stay Years")

customers_total_purchase_amount = dataset %>%
group_by(User_ID) %>%
summarise(Purchase_Amount = sum(Purchase))
head(customers_total_purchase_amount)
## # A tibble: 6 x 2
## User_ID Purchase_Amount
## <int> <int>
## 1 1000001 333481
## 2 1000002 810353
## 3 1000003 341635
## 4 1000004 205987
## 5 1000005 821001
## 6 1000006 379450
customers_total_purchase_amount = arrange(customers_total_purchase_amount, desc((Purchase_Amount)))
head(customers_total_purchase_amount)
## # A tibble: 6 x 2
## User_ID Purchase_Amount
## <int> <int>
## 1 1004277 10536783
## 2 1001680 8699232
## 3 1002909 7577505
## 4 1001941 6817493
## 5 1000424 6573609
## 6 1004448 6565878
summary(customers_total_purchase_amount)
## User_ID Purchase_Amount
## Min. :1000001 Min. : 44108
## 1st Qu.:1001518 1st Qu.: 234914
## Median :1003026 Median : 512612
## Mean :1003025 Mean : 851752
## 3rd Qu.:1004532 3rd Qu.: 1099005
## Max. :1006040 Max. :10536783
ggplot(customers_total_purchase_amount, aes(Purchase_Amount)) +
geom_density(adjust = 1) +
geom_vline(aes(xintercept=median(Purchase_Amount)),
color="blue", linetype="dashed", size=1) +
geom_vline(aes(xintercept=mean(Purchase_Amount)),
color="red", linetype="dashed", size=1) +
geom_text(aes(x=mean(Purchase_Amount), label=round(mean(Purchase_Amount)), y=1.2e-06), color = 'red', angle=360,
size=4, vjust=3, hjust=-.1) +
geom_text(aes(x=median(Purchase_Amount), label=round(median(Purchase_Amount)), y=1.2e-06), color = 'blue', angle=360,
size=4, vjust=0, hjust=-.1) +
scale_x_continuous(name="Purchase Amount", limits=c(0, 7500000), breaks = seq(0,7500000, by = 1000000), expand = c(0,0)) +
scale_y_continuous(name="Density", limits=c(0, .00000125), labels = scientific, expand = c(0,0))
## Warning: Removed 3 rows containing non-finite values (stat_density).

################################################## Marital Status ######################
#Lets now examine the marital status of store customers.
dataset_maritalStatus = dataset %>%
select(User_ID, Marital_Status) %>%
group_by(User_ID) %>%
distinct()
head(dataset_maritalStatus)
## # A tibble: 6 x 2
## # Groups: User_ID [6]
## User_ID Marital_Status
## <int> <int>
## 1 1000001 0
## 2 1000002 0
## 3 1000003 0
## 4 1000004 1
## 5 1000005 1
## 6 1000006 0
dataset_maritalStatus$Marital_Status = as.character(dataset_maritalStatus$Marital_Status)
typeof(dataset_maritalStatus$Marital_Status)
## [1] "character"
marital_vis = ggplot(data = dataset_maritalStatus) +
geom_bar(mapping = aes(x = Marital_Status, y = ..count.., fill = Marital_Status)) +
labs(title = 'Marital Status') +
scale_fill_brewer(palette = 'Pastel2')
print(marital_vis)

#It looks like most of our shoppers happen to be single or unmarried. Similar to our investigation of age groups, we can look at the makeup of Marital_Status in each City_Category.
dataset_maritalStatus = dataset_maritalStatus %>%
full_join(customers_stay, by = 'User_ID')
head(dataset_maritalStatus)
## # A tibble: 6 x 4
## # Groups: User_ID [6]
## User_ID Marital_Status City_Category Stay_In_Current_City_Years
## <int> <chr> <fct> <fct>
## 1 1000001 0 A 2
## 2 1000002 0 C 4+
## 3 1000003 0 A 3
## 4 1000004 1 B 2
## 5 1000005 1 A 1
## 6 1000006 0 A 1
#User_ID Marital_Status City_Category Stay_In_Current_City_Years
maritalStatus_cities = dataset_maritalStatus %>%
group_by(City_Category, Marital_Status) %>%
tally()
head(maritalStatus_cities)
## # A tibble: 6 x 3
## # Groups: City_Category [3]
## City_Category Marital_Status n
## <fct> <chr> <int>
## 1 A 0 652
## 2 A 1 393
## 3 B 0 1004
## 4 B 1 703
## 5 C 0 1761
## 6 C 1 1378
ggplot(data = maritalStatus_cities, aes(x = City_Category, y = n, fill = Marital_Status)) +
geom_bar(stat = "identity", color = 'black') +
scale_fill_brewer(palette = 2) +
labs(title = "City + Marital Status",
y = "Total Count (Shoppers)",
x = "City",
fill = "Marital Status")

#Here, we can see that out off all Cities, the highest proportion of single shoppers seems to be in City A.
#Now, lets investigate the Stay_in_Current_City distribution within each City_Category
Users_Age = dataset %>%
select(User_ID, Age) %>%
distinct()
head(Users_Age)
## User_ID Age
## 1 1000001 0-17
## 2 1000002 55+
## 3 1000003 26-35
## 4 1000004 46-50
## 5 1000005 26-35
## 6 1000006 51-55
dataset_maritalStatus = dataset_maritalStatus %>%
full_join(Users_Age, by = 'User_ID')
head(dataset_maritalStatus)
## # A tibble: 6 x 5
## # Groups: User_ID [6]
## User_ID Marital_Status City_Category Stay_In_Current_City_Years Age
## <int> <chr> <fct> <fct> <fct>
## 1 1000001 0 A 2 0-17
## 2 1000002 0 C 4+ 55+
## 3 1000003 0 A 3 26-35
## 4 1000004 1 B 2 46-50
## 5 1000005 1 A 1 26-35
## 6 1000006 0 A 1 51-55
City_A = dataset_maritalStatus %>%
filter(City_Category == 'A')
City_B = dataset_maritalStatus %>%
filter(City_Category == 'B')
City_C = dataset_maritalStatus %>%
filter(City_Category == 'C')
head(City_A)
## # A tibble: 6 x 5
## # Groups: User_ID [6]
## User_ID Marital_Status City_Category Stay_In_Current_City_Years Age
## <int> <chr> <fct> <fct> <fct>
## 1 1000001 0 A 2 0-17
## 2 1000003 0 A 3 26-35
## 3 1000005 1 A 1 26-35
## 4 1000006 0 A 1 51-55
## 5 1000015 0 A 1 26-35
## 6 1000019 0 A 3 0-17
head(City_B)
## # A tibble: 6 x 5
## # Groups: User_ID [6]
## User_ID Marital_Status City_Category Stay_In_Current_City_Years Age
## <int> <chr> <fct> <fct> <fct>
## 1 1000004 1 B 2 46-50
## 2 1000007 1 B 1 36-45
## 3 1000010 1 B 4+ 36-45
## 4 1000018 0 B 3 18-25
## 5 1000021 0 B 0 18-25
## 6 1000023 1 B 3 36-45
head(City_C)
## # A tibble: 6 x 5
## # Groups: User_ID [6]
## User_ID Marital_Status City_Category Stay_In_Current_City_Years Age
## <int> <chr> <fct> <fct> <fct>
## 1 1000002 0 C 4+ 55+
## 2 1000008 1 C 4+ 26-35
## 3 1000009 0 C 0 26-35
## 4 1000011 0 C 1 26-35
## 5 1000012 0 C 2 26-35
## 6 1000013 1 C 3 46-50
City_A_stay_vis = ggplot(data = City_A, aes(x = Age, y = ..count.., fill = Age)) +
geom_bar(stat = 'count') +
scale_fill_brewer(palette = 8) +
theme(legend.position="none", axis.text = element_text(size = 6)) +
labs(title = 'City A', y = 'Count', x = 'Age', fill = 'Age')
City_B_stay_vis = ggplot(data = City_B, aes(x = Age, y = ..count.., fill = Age)) +
geom_bar(stat = 'count') +
scale_fill_brewer(palette = 9) +
theme(legend.position="none", axis.text = element_text(size = 6)) +
labs(title = 'City B', y = 'Count', x = 'Age', fill = 'Age')
City_C_stay_vis = ggplot(data = City_C, aes(x = Age, y = ..count.., fill = Age)) +
geom_bar(stat = 'count') +
scale_fill_brewer(palette = 11) +
theme(legend.position="none", axis.text = element_text(size = 6)) +
labs(title = 'City C', y = 'Count', x = 'Age', fill = 'Age')
grid.arrange(City_A_stay_vis, City_B_stay_vis, City_C_stay_vis, ncol = 3)

#It looks as though City A has less shoppers living there over the age of 45 compared to the other cities.
#This could be a factor in the resulting levels of Marital_Status within each individual city.
######################################################################################################################
######################### Top Shoppers #########################
top_shoppers = dataset %>%
count(User_ID, sort = TRUE)
head(top_shoppers)
## # A tibble: 6 x 2
## User_ID n
## <int> <int>
## 1 1001680 1025
## 2 1004277 978
## 3 1001941 898
## 4 1001181 861
## 5 1000889 822
## 6 1003618 766
#Looks like User_ID 1001680 shows up the most on our master ledger of shopper data.
#Since each individual row represents a different transaction/product, it looks like this user made over 1000 total transactions!
#We can join together this top shoppers dataset with our total customer purchases dataset to see them combined.
top_shoppers = top_shoppers %>%
select(User_ID, n) %>%
left_join(customers_total_purchase_amount, Purchase_Amount, by = 'User_ID')
head(top_shoppers)
## # A tibble: 6 x 3
## User_ID n Purchase_Amount
## <int> <int> <int>
## 1 1001680 1025 8699232
## 2 1004277 978 10536783
## 3 1001941 898 6817493
## 4 1001181 861 6387899
## 5 1000889 822 5499812
## 6 1003618 766 5961987
#Now that we have joined the two tables together,
#we can see that although User_ID 1001680 has the highest number of total purchases,
#User_ID 1004277 has the highest Purchase_Amount as identified in our earlier charts as well.
#compute the average Purchase_Amount for each user.
top_shoppers = mutate(top_shoppers,
Average_Purchase_Amount = Purchase_Amount/n)
head(top_shoppers)
## # A tibble: 6 x 4
## User_ID n Purchase_Amount Average_Purchase_Amount
## <int> <int> <int> <dbl>
## 1 1001680 1025 8699232 8487.
## 2 1004277 978 10536783 10774.
## 3 1001941 898 6817493 7592.
## 4 1001181 861 6387899 7419.
## 5 1000889 822 5499812 6691.
## 6 1003618 766 5961987 7783.
#From here, we can also compute the average Purchase_Amount for each user
#we can sort according to Average_Purchase_Amount to see which customers, on average, are spending the most.
top_shoppers_averagePurchase = top_shoppers %>%
arrange(desc(Average_Purchase_Amount))
head(top_shoppers_averagePurchase)
## # A tibble: 6 x 4
## User_ID n Purchase_Amount Average_Purchase_Amount
## <int> <int> <int> <dbl>
## 1 1005069 16 308454 19278.
## 2 1003902 93 1746284 18777.
## 3 1005999 18 330227 18346.
## 4 1001349 23 417743 18163.
## 5 1000101 65 1138239 17511.
## 6 1003461 20 350174 17509.
#Looks like User_ID 1005069 has the highest Average_Purchase_Amount
#and a total Purchase_Amount of 308454. User_ID 1003902 is right behind User_ID 1005069
#in Average_Purchase_Amount, but has a much higher total Purchase_Amount of 1746284.
#################################################################################################
######################## Occupation ##########################
customers_Occupation = dataset %>%
select(User_ID, Occupation) %>%
group_by(User_ID) %>%
distinct() %>%
left_join(customers_total_purchase_amount, Occupation, by = 'User_ID')
head(customers_Occupation)
## # A tibble: 6 x 3
## # Groups: User_ID [6]
## User_ID Occupation Purchase_Amount
## <int> <int> <int>
## 1 1000001 10 333481
## 2 1000002 16 810353
## 3 1000003 15 341635
## 4 1000004 7 205987
## 5 1000005 20 821001
## 6 1000006 9 379450
#Now that we have our dataset necessary, we can group together the total Purchase_Amount for
#each Occupation identifier. We will then convert Occupation to a charater data type
totalPurchases_Occupation = customers_Occupation %>%
group_by(Occupation) %>%
summarise(Purchase_Amount = sum(Purchase_Amount)) %>%
arrange(desc(Purchase_Amount))
totalPurchases_Occupation$Occupation = as.character(totalPurchases_Occupation$Occupation)
typeof(totalPurchases_Occupation$Occupation)
## [1] "character"
head(totalPurchases_Occupation)
## # A tibble: 6 x 2
## Occupation Purchase_Amount
## <chr> <int>
## 1 4 657530393
## 2 0 625814811
## 3 7 549282744
## 4 1 414552829
## 5 17 387240355
## 6 12 300672105
occupation = ggplot(data = totalPurchases_Occupation) +
geom_bar(mapping = aes(x = reorder(Occupation, -Purchase_Amount), y = Purchase_Amount, fill = Occupation), stat = 'identity') +
scale_x_discrete(name="Occupation", breaks = seq(0,20, by = 1), expand = c(0,0)) +
scale_y_continuous(name="Purchase Amount ($)", expand = c(0,0), limits = c(0, 750000000)) +
labs(title = 'Total Purchase Amount by Occupation') +
theme(legend.position="none")
print(occupation)

#Looks like customers labeled as Occupation 4 spent the most at our store on Black Friday,
#with customers of Occupation 0 + 7 closely behind. Here,
#if a key was given, we could use that information to classify our shoppers accordingly.