kaggle_project_blackFriday.R

setwd("C:\\Users\\hi\\Documents")
dataset <- read.csv("BlackFriday.csv")
library(tidyverse)

## Warning: package 'tidyverse' was built under R version 3.5.2

## -- Attaching packages ---------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 3.1.1     v purrr   0.3.2
## v tibble  2.1.3     v dplyr   0.8.1
## v tidyr   0.8.3     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.4.0

## Warning: package 'ggplot2' was built under R version 3.5.3

## Warning: package 'tidyr' was built under R version 3.5.3

## Warning: package 'readr' was built under R version 3.5.2

## Warning: package 'purrr' was built under R version 3.5.3

## Warning: package 'dplyr' was built under R version 3.5.3

## Warning: package 'stringr' was built under R version 3.5.2

## Warning: package 'forcats' was built under R version 3.5.3

## -- Conflicts ------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(scales)

## Warning: package 'scales' was built under R version 3.5.2

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

library(gridExtra)

## Warning: package 'gridExtra' was built under R version 3.5.2

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

#The tidyverse package is what we will use for visualizing and exploring our dataset.
#It is knows for easy-to-read syntax and massive amounts of useful functions. 
#The scales package will be used mainly to customize plot axis
summary(dataset)

##     User_ID            Product_ID     Gender        Age        
##  Min.   :1000001   P00265242:  1858   F:132197   0-17 : 14707  
##  1st Qu.:1001495   P00110742:  1591   M:405380   18-25: 97634  
##  Median :1003031   P00025442:  1586              26-35:214690  
##  Mean   :1002992   P00112142:  1539              36-45:107499  
##  3rd Qu.:1004417   P00057642:  1430              46-50: 44526  
##  Max.   :1006040   P00184942:  1424              51-55: 37618  
##                    (Other)  :528149              55+  : 20903  
##    Occupation     City_Category Stay_In_Current_City_Years
##  Min.   : 0.000   A:144638      0 : 72725                 
##  1st Qu.: 2.000   B:226493      1 :189192                 
##  Median : 7.000   C:166446      2 : 99459                 
##  Mean   : 8.083                 3 : 93312                 
##  3rd Qu.:14.000                 4+: 82889                 
##  Max.   :20.000                                           
##                                                           
##  Marital_Status   Product_Category_1 Product_Category_2 Product_Category_3
##  Min.   :0.0000   Min.   : 1.000     Min.   : 2.00      Min.   : 3.0      
##  1st Qu.:0.0000   1st Qu.: 1.000     1st Qu.: 5.00      1st Qu.: 9.0      
##  Median :0.0000   Median : 5.000     Median : 9.00      Median :14.0      
##  Mean   :0.4088   Mean   : 5.296     Mean   : 9.84      Mean   :12.7      
##  3rd Qu.:1.0000   3rd Qu.: 8.000     3rd Qu.:15.00      3rd Qu.:16.0      
##  Max.   :1.0000   Max.   :18.000     Max.   :18.00      Max.   :18.0      
##                                      NA's   :166986     NA's   :373299    
##     Purchase    
##  Min.   :  185  
##  1st Qu.: 5866  
##  Median : 8062  
##  Mean   : 9334  
##  3rd Qu.:12073  
##  Max.   :23961  
##

head(dataset, 5)

##   User_ID Product_ID Gender  Age Occupation City_Category
## 1 1000001  P00069042      F 0-17         10             A
## 2 1000001  P00248942      F 0-17         10             A
## 3 1000001  P00087842      F 0-17         10             A
## 4 1000001  P00085442      F 0-17         10             A
## 5 1000002  P00285442      M  55+         16             C
##   Stay_In_Current_City_Years Marital_Status Product_Category_1
## 1                          2              0                  3
## 2                          2              0                  1
## 3                          2              0                 12
## 4                          2              0                 12
## 5                         4+              0                  8
##   Product_Category_2 Product_Category_3 Purchase
## 1                 NA                 NA     8370
## 2                  6                 14    15200
## 3                 NA                 NA     1422
## 4                 14                 NA     1057
## 5                 NA                 NA     7969

#It looks like we have 12 different columns, each representing a corresponding variable below.
#User_ID: Unique identifier of shopper.
#Product_ID: Unique identifier of product. (No key given)
#Gender: Sex of shopper.
#Age: Age of shopper split into bins.
#Occupation: Occupation of shopper. (No key given)
#City_Category: Residence location of shopper. (No key given)
#Stay_In_Current_City_Years: Number of years stay in current city.
#Marital_Status: Marital status of shopper.
#Product_Category_1: Product category of purchase.
#Product_Category_2: Product may belong to other category.
#Product_Category_3: Product may belong to other category.
#Purchase: Purchase amount in dollars.

########################TO BEGAIN OUR EXPLORATION ##############################################################################3
 ########                 GENDER                     ##########################
dataset_gender =dataset %>% select(User_ID,Gender) %>% group_by(User_ID) %>% distinct() 
head(dataset_gender )

## # A tibble: 6 x 2
## # Groups:   User_ID [6]
##   User_ID Gender
##     <int> <fct> 
## 1 1000001 F     
## 2 1000002 M     
## 3 1000003 M     
## 4 1000004 M     
## 5 1000005 M     
## 6 1000006 F

View(dataset_gender )
summary(dataset_gender$Gender)

##    F    M 
## 1666 4225

plot(dataset_gender$Gender)

genderDist  = ggplot(data = dataset_gender) +
  geom_bar(mapping = aes(x = Gender, fill = Gender)) +
  labs(title = 'Gender of Customers') + 
  scale_fill_brewer(palette = 'PuBuGn')

print(genderDist)

#As we can see, there are quite a few more males than females shopping at our store on Black Friday
#Average purchase by gender
dataset_sum = dataset %>% select(User_ID,Gender,Purchase) %>% group_by(Gender) %>% summarise(total_sum=as.numeric(sum(Purchase))) %>% arrange()
dataset_avg_M = 3853044357/4225
dataset_avg_F =1164624021/1666
Value <- c (dataset_avg_M,dataset_avg_F)
gender <- c ('M','F')

average_spending_gender <- data.frame(Value,gender)
View(average_spending_gender)
str(average_spending_gender)

## 'data.frame':    2 obs. of  2 variables:
##  $ Value : num  911963 699054
##  $ gender: Factor w/ 2 levels "F","M": 2 1

#We can see that that the average transaction for Females was 699054.00 and the average transaction for Males was 911963.20. 
#Let visualize our results

class(average_spending_gender)

## [1] "data.frame"

genderAverage  = ggplot(data = average_spending_gender) +
  geom_bar(mapping = aes(x = gender, y = Value, fill = gender), stat = 'identity') +
  labs(title = 'Average Spending by Gender') +
  scale_fill_brewer(palette = 'PuBuGn')
print(genderAverage)

##################                        TOP SELLERS                        #########################################
TOP_seller <- dataset %>% select(Product_ID,Purchase) %>% group_by(Product_ID) %>% summarise(sumproduct = sum(Purchase)) %>% arrange()
View(head(TOP_seller, 10)) 
#Now that we have Identified our top 5 best selling products, lets examine the best selling product, P00265242.
best_seller = dataset[dataset$Product_ID == 'P00265242', ]
head(best_seller)

##      User_ID Product_ID Gender   Age Occupation City_Category
## 400  1000066  P00265242      M 26-35         18             C
## 1192 1000196  P00265242      F 36-45          9             C
## 1373 1000222  P00265242      M 26-35          1             A
## 1846 1000301  P00265242      M 18-25          4             B
## 2210 1000345  P00265242      M 26-35         12             A
## 2405 1000383  P00265242      F 26-35          7             A
##      Stay_In_Current_City_Years Marital_Status Product_Category_1
## 400                           2              0                  5
## 1192                         4+              0                  5
## 1373                          1              0                  5
## 1846                         4+              0                  5
## 2210                          2              1                  5
## 2405                         4+              1                  5
##      Product_Category_2 Product_Category_3 Purchase
## 400                   8                 NA     8652
## 1192                  8                 NA     8767
## 1373                  8                 NA     6944
## 1846                  8                 NA     8628
## 2210                  8                 NA     8593
## 2405                  8                 NA     6998

#We can see that this product fits into Product_Category_1 = 5 and Product_Category_2 = 8. As mentioned in the introduction, 
#it would be useful to have a key to reference the item name in order to determine what it is.
#Another interesting finding is that even though people are purchasing the same product,
#they are paying different prices. This could be due to various Black Friday promotions, 
#discounts, or coupon codes. Otherwise, investigation would need to be done regarding the reason for different purchase prices of the same product between customers.

###Lets continue to analyze our best seller to see if any relationship to Gender exits.
genderDist_bs  = ggplot(data = best_seller) +
  geom_bar(mapping = aes(x = Gender, y = ..count.., fill = Gender)) +
  labs(title = 'Gender of Customers (Best Seller)') +
  scale_fill_brewer(palette = 'PuBuGn')
print(genderDist_bs)

#################   AGE                              ###################################
#Lets begin examining Age by creating a table of each individual age group and their respective counts.
customers_age = dataset %>% select(Age, User_ID) %>% group_by(Age) %>% distinct(User_ID) %>% summarise(n())
names(customers_age) <- c("age","num")
customers_age_vis = ggplot(data = customers_age) + 
  geom_bar(color = 'black', stat = 'identity', mapping = aes(x = age, y = num, fill = age)) +
  labs(title = 'Age of Customers') +
  theme(axis.text.x = element_text(size = 10)) +
  scale_fill_brewer(palette = 'Blues') +
  theme(legend.position="none")
print(customers_age_vis)

#We can also plot a similar chart depicting the distribution of age within our "best seller" category.
#This will show us if there is a specific age category that purchased the best selling product more than other shoppers.

###########################################################################################################################
#####################################    city        #################################################
#####################################            #################################################

customers_location =  dataset %>%
  select(User_ID, City_Category) %>%
  distinct()
head(customers_location)

##   User_ID City_Category
## 1 1000001             A
## 2 1000002             C
## 3 1000003             A
## 4 1000004             B
## 5 1000005             A
## 6 1000006             A

customers_location_vis = ggplot(data = customers_location) +
  geom_bar(color = 'white', mapping = aes(x = City_Category, y = ..count.., fill = City_Category)) +
  labs(title = 'Location of Customers') + 
  scale_fill_brewer(palette = "Dark2") + 
  theme(legend.position="none")
print(customers_location_vis)

#We can see that most of our customers live in City C. 
#Now, we can compute the total purchase amount by City to see the which city's customers spent the most at our store.
purchases_city = dataset %>%
  group_by(City_Category) %>%
  summarise(Purchases = sum(Purchase))

purchases_city_1000s = purchases_city %>%
  mutate(purchasesThousands = purchases_city$Purchases / 1000)

purchases_city_1000s

## # A tibble: 3 x 3
##   City_Category  Purchases purchasesThousands
##   <fct>              <int>              <dbl>
## 1 A             1295668797           1295669.
## 2 B             2083431612           2083432.
## 3 C             1638567969           1638568.

#In order to work with larger numbers, we divided the Purchases column/1000.
#This is a common practice within the business and acounting world, and it makes large numbers easier to read and chart.
purchaseCity_vis = ggplot(data = purchases_city_1000s, aes(x = City_Category, y = purchasesThousands, fill = City_Category)) +
  geom_bar(color = 'white', stat = 'identity') +
  labs(title = 'Total Customer Purchase Amount (by City)', y = '($000s)', x = 'City Category') +
  scale_fill_brewer(palette = "Dark2") + 
  theme(legend.position="none", plot.title = element_text(size = 9))
print(purchaseCity_vis)

grid.arrange(customers_location_vis, purchaseCity_vis, ncol=2)

#Here we can see that customers from City C were the most frequent shoppers at our store on Black Friday 
#but Customers from City B had the highest amount of total purchases.
#Let's continue to investigate and try to determine the reason for this observation.

#Lets find how many purchases were made by customers from each city.
#First, we will get the total number of purchases for each corresponding User_ID.
customers = dataset %>%
  group_by(User_ID) %>%
  count(User_ID)
head(customers)

## # A tibble: 6 x 2
## # Groups:   User_ID [6]
##   User_ID     n
##     <int> <int>
## 1 1000001    34
## 2 1000002    76
## 3 1000003    29
## 4 1000004    13
## 5 1000005   106
## 6 1000006    46

#This tells us how many times a certain user made a purchase. 
#dive deeper lets compute the total purchase amount for each user, then join it with the other table
customers_City =  dataset %>%
  select(User_ID, City_Category) %>%
  group_by(User_ID) %>%
  distinct() %>%
  ungroup() %>%
  left_join(customers, customers_City, by = 'User_ID') 
head(customers_City)

## # A tibble: 6 x 3
##   User_ID City_Category     n
##     <int> <fct>         <int>
## 1 1000001 A                34
## 2 1000002 C                76
## 3 1000003 A                29
## 4 1000004 B                13
## 5 1000005 A               106
## 6 1000006 A                46

city_purchases_count = customers_City %>%
  select(City_Category, n) %>%
  group_by(City_Category) %>%
  summarise(CountOfPurchases = sum(n))
city_purchases_count

## # A tibble: 3 x 2
##   City_Category CountOfPurchases
##   <fct>                    <int>
## 1 A                       144638
## 2 B                       226493
## 3 C                       166446

city_count_purchases_vis = ggplot(data = city_purchases_count, aes(x = City_Category, y = CountOfPurchases, fill = City_Category)) +
  geom_bar(color = 'white', stat = 'identity') +
  labs(title = 'Total Purchase Count (by City)', y = 'Count', x = 'City Category') +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position="none", plot.title = element_text(size = 9))
print(city_count_purchases_vis)

grid.arrange(purchaseCity_vis, city_count_purchases_vis, ncol = 2)

#One inference we can make from these charts is that customers from City B are simply making more purchases than residence of City A + City C,
#and not necessarily buying more expensive products. 
#We can make this assumption due to the fact that the "Total Count of Purchases" chart has a very similar appearance to the "Total Customer Purchase Amount" chart. 
#If it were the other case, then customers from City B would most likely have a lower count of total purchases corresponding to a higher total purchase amount.
head(best_seller)

##      User_ID Product_ID Gender   Age Occupation City_Category
## 400  1000066  P00265242      M 26-35         18             C
## 1192 1000196  P00265242      F 36-45          9             C
## 1373 1000222  P00265242      M 26-35          1             A
## 1846 1000301  P00265242      M 18-25          4             B
## 2210 1000345  P00265242      M 26-35         12             A
## 2405 1000383  P00265242      F 26-35          7             A
##      Stay_In_Current_City_Years Marital_Status Product_Category_1
## 400                           2              0                  5
## 1192                         4+              0                  5
## 1373                          1              0                  5
## 1846                         4+              0                  5
## 2210                          2              1                  5
## 2405                         4+              1                  5
##      Product_Category_2 Product_Category_3 Purchase
## 400                   8                 NA     8652
## 1192                  8                 NA     8767
## 1373                  8                 NA     6944
## 1846                  8                 NA     8628
## 2210                  8                 NA     8593
## 2405                  8                 NA     6998

best_seller_city = best_seller %>%
  select(User_ID, City_Category) %>%
  distinct() %>%
  count(City_Category)
best_seller_city

## # A tibble: 3 x 2
##   City_Category     n
##   <fct>         <int>
## 1 A               396
## 2 B               626
## 3 C               836

best_seller_city_vis = ggplot(data = best_seller_city, aes(x = City_Category, y = n, fill = City_Category)) +
  geom_bar(color = 'white', stat = 'identity') +
  labs(title = 'Best Seller Purchase Count (by City)', y = 'Count', x = 'City Category') +
  scale_fill_brewer(palette = "Blues") +
  theme(legend.position="none", plot.title = element_text(size = 9))
grid.arrange(city_count_purchases_vis,best_seller_city_vis, ncol = 2)

#An interesting revelation has been made! Although customers residing in City C purchase more of our "best seller" than City A + B, 
#residents of City C fall behind City B in overall number of purchases.

################################################################################################################################
#######################3     Stay in Current City   ###########################
########################                           ############################

customers_stay = dataset %>%
  select(User_ID, City_Category, Stay_In_Current_City_Years) %>%
  group_by(User_ID) %>%
  distinct()
head(customers_stay)

## # A tibble: 6 x 3
## # Groups:   User_ID [6]
##   User_ID City_Category Stay_In_Current_City_Years
##     <int> <fct>         <fct>                     
## 1 1000001 A             2                         
## 2 1000002 C             4+                        
## 3 1000003 A             3                         
## 4 1000004 B             2                         
## 5 1000005 A             1                         
## 6 1000006 A             1

#Now that we have our dataset in order, we can plot and explore.
residence = customers_stay %>%
  group_by(City_Category) %>%
  tally()
head(residence)

## # A tibble: 3 x 2
##   City_Category     n
##   <fct>         <int>
## 1 A              1045
## 2 B              1707
## 3 C              3139

#Looks like most of our customers are living in City C. Now, lets investigate further
customers_stay_vis = ggplot(data = customers_stay, aes(x = Stay_In_Current_City_Years, y = ..count.., fill = Stay_In_Current_City_Years)) +
  geom_bar(stat = 'count') +
  scale_fill_brewer(palette = 15) +
  labs(title = 'Customers Stay in Current City', y = 'Count', x = 'Stay in Current City', fill = 'Number of Years in Current City')
print(customers_stay_vis)

#It looks like most of our customers have only been living in their respective cities for 1 year. 
#In order to see a better distribution, lets make a stacked bar chart according to each City_Category.
stay_cities = customers_stay %>%
  group_by(City_Category, Stay_In_Current_City_Years) %>%
  tally() %>%
  mutate(Percentage = (n/sum(n))*100)
head(stay_cities)

## # A tibble: 6 x 4
## # Groups:   City_Category [2]
##   City_Category Stay_In_Current_City_Years     n Percentage
##   <fct>         <fct>                      <int>      <dbl>
## 1 A             0                            147       14.1
## 2 A             1                            370       35.4
## 3 A             2                            183       17.5
## 4 A             3                            180       17.2
## 5 A             4+                           165       15.8
## 6 B             0                            211       12.4

ggplot(data = stay_cities, aes(x = City_Category, y = n, fill = Stay_In_Current_City_Years)) + 
  geom_bar(stat = "identity", color = 'white') + 
  scale_fill_brewer(palette = 2) + 
  labs(title = "City Category + Stay in Current City", 
       y = "Total Count (Years)", 
       x = "City", 
       fill = "Stay Years")

#Looking at this chart we can see the distribution of the total customer base and their respective city residences, 
#split by the amount of time they have lived there. 
#Here, we can notice that in every City_Category, the most common stay length seems to be 1 year.

###################################################################################################################
################################   Marital Status   ####################
dataset_maritalStatus = dataset %>%
  select(User_ID, Marital_Status) %>%
  group_by(User_ID) %>%
  distinct()

head(dataset_maritalStatus)

## # A tibble: 6 x 2
## # Groups:   User_ID [6]
##   User_ID Marital_Status
##     <int>          <int>
## 1 1000001              0
## 2 1000002              0
## 3 1000003              0
## 4 1000004              1
## 5 1000005              1
## 6 1000006              0

#Note, we need to quickly change Marital_Status from a numeric variable to a categorical type
dataset_maritalStatus$Marital_Status = as.character(dataset_maritalStatus$Marital_Status)
typeof(dataset_maritalStatus$Marital_Status)

## [1] "character"

#If we look back at the variable descriptions of the dataset,
#we don't have a clear guide for marital status. 
#In other cases, 
#it would be best to reach out to the provider of the data to be completely sure
#of what the values in a column represent but in this case, we will assume that 1 = married and 0 = single.
marital_vis = ggplot(data = dataset_maritalStatus) +
  geom_bar(mapping = aes(x = Marital_Status, y = ..count.., fill = Marital_Status)) +
  labs(title = 'Marital Status') +
  scale_fill_brewer(palette = 'Pastel2')
print(marital_vis)

# looks like most of our shoppers happen to be single or unmarried.
#Similar to our investigation of age groups, we can look at the makeup of Marital_Status in each City_Category.
dataset_maritalStatus = dataset_maritalStatus %>%
  full_join(customers_stay, by = 'User_ID') 
head(dataset_maritalStatus)

## # A tibble: 6 x 4
## # Groups:   User_ID [6]
##   User_ID Marital_Status City_Category Stay_In_Current_City_Years
##     <int> <chr>          <fct>         <fct>                     
## 1 1000001 0              A             2                         
## 2 1000002 0              C             4+                        
## 3 1000003 0              A             3                         
## 4 1000004 1              B             2                         
## 5 1000005 1              A             1                         
## 6 1000006 0              A             1

maritalStatus_cities = dataset_maritalStatus %>%
  group_by(City_Category, Marital_Status) %>%
  tally()
head(maritalStatus_cities)

## # A tibble: 6 x 3
## # Groups:   City_Category [3]
##   City_Category Marital_Status     n
##   <fct>         <chr>          <int>
## 1 A             0                652
## 2 A             1                393
## 3 B             0               1004
## 4 B             1                703
## 5 C             0               1761
## 6 C             1               1378

top_sellers = dataset %>%
  count(Product_ID, sort = TRUE)

top_5 = head(top_sellers, 5)
best_seller = dataset[dataset$Product_ID == 'P00265242', ]
head(best_seller)

##      User_ID Product_ID Gender   Age Occupation City_Category
## 400  1000066  P00265242      M 26-35         18             C
## 1192 1000196  P00265242      F 36-45          9             C
## 1373 1000222  P00265242      M 26-35          1             A
## 1846 1000301  P00265242      M 18-25          4             B
## 2210 1000345  P00265242      M 26-35         12             A
## 2405 1000383  P00265242      F 26-35          7             A
##      Stay_In_Current_City_Years Marital_Status Product_Category_1
## 400                           2              0                  5
## 1192                         4+              0                  5
## 1373                          1              0                  5
## 1846                         4+              0                  5
## 2210                          2              1                  5
## 2405                         4+              1                  5
##      Product_Category_2 Product_Category_3 Purchase
## 400                   8                 NA     8652
## 1192                  8                 NA     8767
## 1373                  8                 NA     6944
## 1846                  8                 NA     8628
## 2210                  8                 NA     8593
## 2405                  8                 NA     6998

genderDist_bs  = ggplot(data = best_seller) +
  geom_bar(mapping = aes(x = Gender, y = ..count.., fill = Gender)) +
  labs(title = 'Gender of Customers (Best Seller)') +
  scale_fill_brewer(palette = 'PuBuGn')
print(genderDist_bs)

genderDist_bs_prop = ggplot(data = best_seller) + 
  geom_bar(fill = 'lightblue', mapping = aes(x = Gender, y = ..prop.., group = 1, fill = Gender)) +
  labs(title = 'Gender of Customers (Best Seller - Proportion)') +
  theme(plot.title = element_text(size=9.5))

genderDist_prop = ggplot(data = dataset_gender) + 
  geom_bar(fill = "lightblue4", mapping = aes(x = Gender, y = ..prop.., group = 1)) +
  labs(title = 'Gender of Customers (Total Proportion)') +
  theme(plot.title = element_text(size=9.5)) 

grid.arrange(genderDist_prop, genderDist_bs_prop, ncol=2)

customers_age = dataset %>%
select(User_ID, Age) %>%
distinct() %>%
count(Age)
customers_age

## # A tibble: 7 x 2
##   Age       n
##   <fct> <int>
## 1 0-17    218
## 2 18-25  1069
## 3 26-35  2053
## 4 36-45  1167
## 5 46-50   531
## 6 51-55   481
## 7 55+     372

customers_age_vis = ggplot(data = customers_age) + 
geom_bar(color = 'black', stat = 'identity', mapping = aes(x = Age, y = n, fill = Age)) +
labs(title = 'Age of Customers') +
theme(axis.text.x = element_text(size = 10)) +
scale_fill_brewer(palette = 'Blues') +
theme(legend.position="none")
print(customers_age_vis)

ageDist_bs  = ggplot(data = best_seller) +
geom_bar(color = 'black', mapping = aes(x = Age, y = ..count.., fill = Age)) +
labs(title = 'Age of Customers (Best Seller)') +
theme(axis.text.x = element_text(size = 10)) +
scale_fill_brewer(palette = 'GnBu') + 
theme(legend.position="none")
print(ageDist_bs)

grid.arrange(customers_age_vis, ageDist_bs, ncol=2)

customers_location =  dataset %>%
  select(User_ID, City_Category) %>%
  distinct()
head(customers_location)

##   User_ID City_Category
## 1 1000001             A
## 2 1000002             C
## 3 1000003             A
## 4 1000004             B
## 5 1000005             A
## 6 1000006             A

customers_location_vis = ggplot(data = customers_location) +
  geom_bar(color = 'white', mapping = aes(x = City_Category, y = ..count.., fill = City_Category)) +
  labs(title = 'Location of Customers') + 
  scale_fill_brewer(palette = "Dark2") + 
  theme(legend.position="none")
print(customers_location_vis)

purchases_city = dataset %>%
group_by(City_Category) %>%
summarise(Purchases = sum(Purchase))
purchases_city_1000s = purchases_city %>%
mutate(purchasesThousands = purchases_city$Purchases / 1000)
purchases_city_1000s

## # A tibble: 3 x 3
##   City_Category  Purchases purchasesThousands
##   <fct>              <int>              <dbl>
## 1 A             1295668797           1295669.
## 2 B             2083431612           2083432.
## 3 C             1638567969           1638568.

purchaseCity_vis = ggplot(data = purchases_city_1000s, aes(x = City_Category, y = purchasesThousands, fill = City_Category)) +
geom_bar(color = 'white', stat = 'identity') +
labs(title = 'Total Customer Purchase Amount (by City)', y = '($000s)', x = 'City Category') +
scale_fill_brewer(palette = "Dark2") + 
theme(legend.position="none", plot.title = element_text(size = 9))
print(purchaseCity_vis)

grid.arrange(customers_location_vis, purchaseCity_vis, ncol=2)

customers = dataset %>%
  group_by(User_ID) %>%
  count(User_ID)
head(customers)

## # A tibble: 6 x 2
## # Groups:   User_ID [6]
##   User_ID     n
##     <int> <int>
## 1 1000001    34
## 2 1000002    76
## 3 1000003    29
## 4 1000004    13
## 5 1000005   106
## 6 1000006    46

customers_City =  dataset %>%
  select(User_ID, City_Category) %>%
  group_by(User_ID) %>%
  distinct() %>%
  ungroup() %>%
  left_join(customers, customers_City, by = 'User_ID') 
head(customers_City)

## # A tibble: 6 x 3
##   User_ID City_Category     n
##     <int> <fct>         <int>
## 1 1000001 A                34
## 2 1000002 C                76
## 3 1000003 A                29
## 4 1000004 B                13
## 5 1000005 A               106
## 6 1000006 A                46

city_purchases_count = customers_City %>%
  select(City_Category, n) %>%
  group_by(City_Category) %>%
  summarise(CountOfPurchases = sum(n))
city_purchases_count

## # A tibble: 3 x 2
##   City_Category CountOfPurchases
##   <fct>                    <int>
## 1 A                       144638
## 2 B                       226493
## 3 C                       166446

city_count_purchases_vis = ggplot(data = city_purchases_count, aes(x = City_Category, y = CountOfPurchases, fill = City_Category)) +
  geom_bar(color = 'white', stat = 'identity') +
  labs(title = 'Total Purchase Count (by City)', y = 'Count', x = 'City Category') +
  scale_fill_brewer(palette = "Dark2") +
  theme(legend.position="none", plot.title = element_text(size = 9))
print(city_count_purchases_vis)

grid.arrange(purchaseCity_vis, city_count_purchases_vis, ncol = 2)

head(best_seller)

##      User_ID Product_ID Gender   Age Occupation City_Category
## 400  1000066  P00265242      M 26-35         18             C
## 1192 1000196  P00265242      F 36-45          9             C
## 1373 1000222  P00265242      M 26-35          1             A
## 1846 1000301  P00265242      M 18-25          4             B
## 2210 1000345  P00265242      M 26-35         12             A
## 2405 1000383  P00265242      F 26-35          7             A
##      Stay_In_Current_City_Years Marital_Status Product_Category_1
## 400                           2              0                  5
## 1192                         4+              0                  5
## 1373                          1              0                  5
## 1846                         4+              0                  5
## 2210                          2              1                  5
## 2405                         4+              1                  5
##      Product_Category_2 Product_Category_3 Purchase
## 400                   8                 NA     8652
## 1192                  8                 NA     8767
## 1373                  8                 NA     6944
## 1846                  8                 NA     8628
## 2210                  8                 NA     8593
## 2405                  8                 NA     6998

best_seller_city = best_seller %>%
  select(User_ID, City_Category) %>%
  distinct() %>%
  count(City_Category)
best_seller_city

## # A tibble: 3 x 2
##   City_Category     n
##   <fct>         <int>
## 1 A               396
## 2 B               626
## 3 C               836

best_seller_city_vis = ggplot(data = best_seller_city, aes(x = City_Category, y = n, fill = City_Category)) +
  geom_bar(color = 'white', stat = 'identity') +
  labs(title = 'Best Seller Purchase Count (by City)', y = 'Count', x = 'City Category') +
  scale_fill_brewer(palette = "Blues") +
  theme(legend.position="none", plot.title = element_text(size = 9))
grid.arrange(city_count_purchases_vis,best_seller_city_vis, ncol = 2)

customers_stay = dataset %>%
  select(User_ID, City_Category, Stay_In_Current_City_Years) %>%
  group_by(User_ID) %>%
  distinct()
head(customers_stay)

## # A tibble: 6 x 3
## # Groups:   User_ID [6]
##   User_ID City_Category Stay_In_Current_City_Years
##     <int> <fct>         <fct>                     
## 1 1000001 A             2                         
## 2 1000002 C             4+                        
## 3 1000003 A             3                         
## 4 1000004 B             2                         
## 5 1000005 A             1                         
## 6 1000006 A             1

residence = customers_stay %>%
  group_by(City_Category) %>%
  tally()
head(residence)

## # A tibble: 3 x 2
##   City_Category     n
##   <fct>         <int>
## 1 A              1045
## 2 B              1707
## 3 C              3139

customers_stay_vis = ggplot(data = customers_stay, aes(x = Stay_In_Current_City_Years, y = ..count.., fill = Stay_In_Current_City_Years)) +
  geom_bar(stat = 'count') +
  scale_fill_brewer(palette = 15) +
  labs(title = 'Customers Stay in Current City', y = 'Count', x = 'Stay in Current City', fill = 'Number of Years in Current City')
print(customers_stay_vis)

stay_cities = customers_stay %>%
  group_by(City_Category, Stay_In_Current_City_Years) %>%
  tally() %>%
  mutate(Percentage = (n/sum(n))*100)
head(stay_cities)

## # A tibble: 6 x 4
## # Groups:   City_Category [2]
##   City_Category Stay_In_Current_City_Years     n Percentage
##   <fct>         <fct>                      <int>      <dbl>
## 1 A             0                            147       14.1
## 2 A             1                            370       35.4
## 3 A             2                            183       17.5
## 4 A             3                            180       17.2
## 5 A             4+                           165       15.8
## 6 B             0                            211       12.4

ggplot(data = stay_cities, aes(x = City_Category, y = n, fill = Stay_In_Current_City_Years)) + 
  geom_bar(stat = "identity", color = 'white') + 
  scale_fill_brewer(palette = 2) + 
  labs(title = "City Category + Stay in Current City", 
       y = "Total Count (Years)", 
       x = "City", 
       fill = "Stay Years")

customers_total_purchase_amount = dataset %>%
  group_by(User_ID) %>%
  summarise(Purchase_Amount = sum(Purchase))

head(customers_total_purchase_amount)

## # A tibble: 6 x 2
##   User_ID Purchase_Amount
##     <int>           <int>
## 1 1000001          333481
## 2 1000002          810353
## 3 1000003          341635
## 4 1000004          205987
## 5 1000005          821001
## 6 1000006          379450

customers_total_purchase_amount = arrange(customers_total_purchase_amount, desc((Purchase_Amount)))

head(customers_total_purchase_amount)

## # A tibble: 6 x 2
##   User_ID Purchase_Amount
##     <int>           <int>
## 1 1004277        10536783
## 2 1001680         8699232
## 3 1002909         7577505
## 4 1001941         6817493
## 5 1000424         6573609
## 6 1004448         6565878

summary(customers_total_purchase_amount)

##     User_ID        Purchase_Amount   
##  Min.   :1000001   Min.   :   44108  
##  1st Qu.:1001518   1st Qu.:  234914  
##  Median :1003026   Median :  512612  
##  Mean   :1003025   Mean   :  851752  
##  3rd Qu.:1004532   3rd Qu.: 1099005  
##  Max.   :1006040   Max.   :10536783

ggplot(customers_total_purchase_amount, aes(Purchase_Amount)) +
  geom_density(adjust = 1) +
  geom_vline(aes(xintercept=median(Purchase_Amount)),
             color="blue", linetype="dashed", size=1) +
  geom_vline(aes(xintercept=mean(Purchase_Amount)),
             color="red", linetype="dashed", size=1) +
  geom_text(aes(x=mean(Purchase_Amount), label=round(mean(Purchase_Amount)), y=1.2e-06), color = 'red', angle=360,
            size=4, vjust=3, hjust=-.1) +
  geom_text(aes(x=median(Purchase_Amount), label=round(median(Purchase_Amount)), y=1.2e-06), color = 'blue', angle=360,
            size=4, vjust=0, hjust=-.1) +
  scale_x_continuous(name="Purchase Amount", limits=c(0, 7500000), breaks = seq(0,7500000, by = 1000000), expand = c(0,0)) +
  scale_y_continuous(name="Density", limits=c(0, .00000125), labels = scientific, expand = c(0,0))

## Warning: Removed 3 rows containing non-finite values (stat_density).

##################################################      Marital Status    ######################
#Lets now examine the marital status of store customers.

dataset_maritalStatus = dataset %>%
select(User_ID, Marital_Status) %>%
group_by(User_ID) %>%
distinct()

head(dataset_maritalStatus)

## # A tibble: 6 x 2
## # Groups:   User_ID [6]
##   User_ID Marital_Status
##     <int>          <int>
## 1 1000001              0
## 2 1000002              0
## 3 1000003              0
## 4 1000004              1
## 5 1000005              1
## 6 1000006              0

dataset_maritalStatus$Marital_Status = as.character(dataset_maritalStatus$Marital_Status)
typeof(dataset_maritalStatus$Marital_Status)

## [1] "character"

marital_vis = ggplot(data = dataset_maritalStatus) +
  geom_bar(mapping = aes(x = Marital_Status, y = ..count.., fill = Marital_Status)) +
  labs(title = 'Marital Status') +
  scale_fill_brewer(palette = 'Pastel2')
print(marital_vis)

#It looks like most of our shoppers happen to be single or unmarried. Similar to our investigation of age groups, we can look at the makeup of Marital_Status in each City_Category.

dataset_maritalStatus = dataset_maritalStatus %>%
  full_join(customers_stay, by = 'User_ID') 
head(dataset_maritalStatus)

## # A tibble: 6 x 4
## # Groups:   User_ID [6]
##   User_ID Marital_Status City_Category Stay_In_Current_City_Years
##     <int> <chr>          <fct>         <fct>                     
## 1 1000001 0              A             2                         
## 2 1000002 0              C             4+                        
## 3 1000003 0              A             3                         
## 4 1000004 1              B             2                         
## 5 1000005 1              A             1                         
## 6 1000006 0              A             1

#User_ID    Marital_Status  City_Category   Stay_In_Current_City_Years

maritalStatus_cities = dataset_maritalStatus %>%
  group_by(City_Category, Marital_Status) %>%
  tally()
head(maritalStatus_cities)

## # A tibble: 6 x 3
## # Groups:   City_Category [3]
##   City_Category Marital_Status     n
##   <fct>         <chr>          <int>
## 1 A             0                652
## 2 A             1                393
## 3 B             0               1004
## 4 B             1                703
## 5 C             0               1761
## 6 C             1               1378

ggplot(data = maritalStatus_cities, aes(x = City_Category, y = n, fill = Marital_Status)) + 
  geom_bar(stat = "identity", color = 'black') + 
  scale_fill_brewer(palette = 2) + 
  labs(title = "City + Marital Status", 
       y = "Total Count (Shoppers)", 
       x = "City", 
       fill = "Marital Status")

#Here, we can see that out off all Cities, the highest proportion of single shoppers seems to be in City A.
#Now, lets investigate the Stay_in_Current_City distribution within each City_Category
Users_Age = dataset %>%
  select(User_ID, Age) %>%
  distinct()
head(Users_Age)

##   User_ID   Age
## 1 1000001  0-17
## 2 1000002   55+
## 3 1000003 26-35
## 4 1000004 46-50
## 5 1000005 26-35
## 6 1000006 51-55

dataset_maritalStatus = dataset_maritalStatus %>%
  full_join(Users_Age, by = 'User_ID')
head(dataset_maritalStatus)

## # A tibble: 6 x 5
## # Groups:   User_ID [6]
##   User_ID Marital_Status City_Category Stay_In_Current_City_Years Age  
##     <int> <chr>          <fct>         <fct>                      <fct>
## 1 1000001 0              A             2                          0-17 
## 2 1000002 0              C             4+                         55+  
## 3 1000003 0              A             3                          26-35
## 4 1000004 1              B             2                          46-50
## 5 1000005 1              A             1                          26-35
## 6 1000006 0              A             1                          51-55

City_A = dataset_maritalStatus %>%
  filter(City_Category == 'A')
City_B = dataset_maritalStatus %>%
  filter(City_Category == 'B')
City_C = dataset_maritalStatus %>%
  filter(City_Category == 'C')
head(City_A)

## # A tibble: 6 x 5
## # Groups:   User_ID [6]
##   User_ID Marital_Status City_Category Stay_In_Current_City_Years Age  
##     <int> <chr>          <fct>         <fct>                      <fct>
## 1 1000001 0              A             2                          0-17 
## 2 1000003 0              A             3                          26-35
## 3 1000005 1              A             1                          26-35
## 4 1000006 0              A             1                          51-55
## 5 1000015 0              A             1                          26-35
## 6 1000019 0              A             3                          0-17

head(City_B)

## # A tibble: 6 x 5
## # Groups:   User_ID [6]
##   User_ID Marital_Status City_Category Stay_In_Current_City_Years Age  
##     <int> <chr>          <fct>         <fct>                      <fct>
## 1 1000004 1              B             2                          46-50
## 2 1000007 1              B             1                          36-45
## 3 1000010 1              B             4+                         36-45
## 4 1000018 0              B             3                          18-25
## 5 1000021 0              B             0                          18-25
## 6 1000023 1              B             3                          36-45

head(City_C)

## # A tibble: 6 x 5
## # Groups:   User_ID [6]
##   User_ID Marital_Status City_Category Stay_In_Current_City_Years Age  
##     <int> <chr>          <fct>         <fct>                      <fct>
## 1 1000002 0              C             4+                         55+  
## 2 1000008 1              C             4+                         26-35
## 3 1000009 0              C             0                          26-35
## 4 1000011 0              C             1                          26-35
## 5 1000012 0              C             2                          26-35
## 6 1000013 1              C             3                          46-50

City_A_stay_vis = ggplot(data = City_A, aes(x = Age, y = ..count.., fill = Age)) + 
  geom_bar(stat = 'count') +
  scale_fill_brewer(palette = 8) +
  theme(legend.position="none", axis.text = element_text(size = 6)) +
  labs(title = 'City A', y = 'Count', x = 'Age', fill = 'Age')
City_B_stay_vis = ggplot(data = City_B, aes(x = Age, y = ..count.., fill = Age)) +
  geom_bar(stat = 'count') +
  scale_fill_brewer(palette = 9) +
  theme(legend.position="none", axis.text = element_text(size = 6)) +
  labs(title = 'City B', y = 'Count', x = 'Age', fill = 'Age')
City_C_stay_vis = ggplot(data = City_C, aes(x = Age, y = ..count.., fill = Age)) +
  geom_bar(stat = 'count') +
  scale_fill_brewer(palette = 11) +
  theme(legend.position="none", axis.text = element_text(size = 6)) +
  labs(title = 'City C', y = 'Count', x = 'Age', fill = 'Age')
grid.arrange(City_A_stay_vis, City_B_stay_vis, City_C_stay_vis, ncol = 3)

#It looks as though City A has less shoppers living there over the age of 45 compared to the other cities. 
#This could be a factor in the resulting levels of Marital_Status within each individual city.
######################################################################################################################
#########################      Top Shoppers        #########################

top_shoppers = dataset %>%
  count(User_ID, sort = TRUE)

head(top_shoppers)

## # A tibble: 6 x 2
##   User_ID     n
##     <int> <int>
## 1 1001680  1025
## 2 1004277   978
## 3 1001941   898
## 4 1001181   861
## 5 1000889   822
## 6 1003618   766

#Looks like User_ID 1001680 shows up the most on our master ledger of shopper data.
#Since each individual row represents a different transaction/product, it looks like this user made over 1000 total transactions! 
#We can join together this top shoppers dataset with our total customer purchases dataset to see them combined.

top_shoppers =  top_shoppers %>%
  select(User_ID, n) %>%
  left_join(customers_total_purchase_amount, Purchase_Amount, by = 'User_ID')

head(top_shoppers)

## # A tibble: 6 x 3
##   User_ID     n Purchase_Amount
##     <int> <int>           <int>
## 1 1001680  1025         8699232
## 2 1004277   978        10536783
## 3 1001941   898         6817493
## 4 1001181   861         6387899
## 5 1000889   822         5499812
## 6 1003618   766         5961987

#Now that we have joined the two tables together, 
#we can see that although User_ID 1001680 has the highest number of total purchases, 
#User_ID 1004277 has the highest Purchase_Amount as identified in our earlier charts as well.
 #compute the average Purchase_Amount for each user.

top_shoppers = mutate(top_shoppers,
                      Average_Purchase_Amount = Purchase_Amount/n)

head(top_shoppers)

## # A tibble: 6 x 4
##   User_ID     n Purchase_Amount Average_Purchase_Amount
##     <int> <int>           <int>                   <dbl>
## 1 1001680  1025         8699232                   8487.
## 2 1004277   978        10536783                  10774.
## 3 1001941   898         6817493                   7592.
## 4 1001181   861         6387899                   7419.
## 5 1000889   822         5499812                   6691.
## 6 1003618   766         5961987                   7783.

#From here, we can also compute the average Purchase_Amount for each user
#we can sort according to Average_Purchase_Amount to see which customers, on average, are spending the most.
top_shoppers_averagePurchase = top_shoppers %>%
  arrange(desc(Average_Purchase_Amount))

head(top_shoppers_averagePurchase)

## # A tibble: 6 x 4
##   User_ID     n Purchase_Amount Average_Purchase_Amount
##     <int> <int>           <int>                   <dbl>
## 1 1005069    16          308454                  19278.
## 2 1003902    93         1746284                  18777.
## 3 1005999    18          330227                  18346.
## 4 1001349    23          417743                  18163.
## 5 1000101    65         1138239                  17511.
## 6 1003461    20          350174                  17509.

#Looks like User_ID 1005069 has the highest Average_Purchase_Amount 
#and a total Purchase_Amount of 308454. User_ID 1003902 is right behind User_ID 1005069 
#in Average_Purchase_Amount, but has a much higher total Purchase_Amount of 1746284.

#################################################################################################
########################             Occupation       ##########################
customers_Occupation =  dataset %>%
  select(User_ID, Occupation) %>%
  group_by(User_ID) %>%
  distinct() %>%
  left_join(customers_total_purchase_amount, Occupation, by = 'User_ID')
head(customers_Occupation)

## # A tibble: 6 x 3
## # Groups:   User_ID [6]
##   User_ID Occupation Purchase_Amount
##     <int>      <int>           <int>
## 1 1000001         10          333481
## 2 1000002         16          810353
## 3 1000003         15          341635
## 4 1000004          7          205987
## 5 1000005         20          821001
## 6 1000006          9          379450

#Now that we have our dataset necessary, we can group together the total Purchase_Amount for
#each Occupation identifier. We will then convert Occupation to a charater data type

totalPurchases_Occupation = customers_Occupation %>%
  group_by(Occupation) %>%
  summarise(Purchase_Amount = sum(Purchase_Amount)) %>%
  arrange(desc(Purchase_Amount))

totalPurchases_Occupation$Occupation = as.character(totalPurchases_Occupation$Occupation)
typeof(totalPurchases_Occupation$Occupation)

## [1] "character"

head(totalPurchases_Occupation)

## # A tibble: 6 x 2
##   Occupation Purchase_Amount
##   <chr>                <int>
## 1 4                657530393
## 2 0                625814811
## 3 7                549282744
## 4 1                414552829
## 5 17               387240355
## 6 12               300672105

occupation = ggplot(data = totalPurchases_Occupation) +
  geom_bar(mapping = aes(x = reorder(Occupation, -Purchase_Amount), y = Purchase_Amount, fill = Occupation), stat = 'identity') +
  scale_x_discrete(name="Occupation", breaks = seq(0,20, by = 1), expand = c(0,0)) +
  scale_y_continuous(name="Purchase Amount ($)", expand = c(0,0), limits = c(0, 750000000)) +
  labs(title = 'Total Purchase Amount by Occupation') + 
  theme(legend.position="none")
print(occupation)

#Looks like customers labeled as Occupation 4 spent the most at our store on Black Friday,
#with customers of Occupation 0 + 7 closely behind. Here, 
#if a key was given, we could use that information to classify our shoppers accordingly.

kaggle_project_blackFriday.R

hi

2019-06-24