Segmentation

#Libraries needed
#install.packages("ggthemes")
library(sqldf)

## Loading required package: gsubfn

## Loading required package: proto

## Loading required package: RSQLite

library(tidyverse)

## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──

## ✓ ggplot2 3.3.2     ✓ purrr   0.3.4
## ✓ tibble  3.0.3     ✓ dplyr   1.0.2
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.5.0

## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

library(dplyr)
library(ggthemes)
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

library(hrbrthemes)

## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.

##       Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and

##       if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow

library(ggplot2)
library(scales)

## 
## Attaching package: 'scales'

## The following object is masked from 'package:purrr':
## 
##     discard

## The following object is masked from 'package:readr':
## 
##     col_factor

library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

library(corrplot)

## corrplot 0.84 loaded

events<- read.csv("segmentation_event_data.csv", header = T)
head(events)

events <-data.frame(events)
summary(events)

##     month            event_type            url             total_events   
##  Length:1048575     Length:1048575     Length:1048575     Min.   :     1  
##  Class :character   Class :character   Class :character   1st Qu.:     1  
##  Mode  :character   Mode  :character   Mode  :character   Median :     3  
##                                                           Mean   :    74  
##                                                           3rd Qu.:    12  
##                                                           Max.   :817705  
##      users         
##  Min.   :     0.0  
##  1st Qu.:     1.0  
##  Median :     2.0  
##  Mean   :    59.4  
##  3rd Qu.:    11.0  
##  Max.   :581382.0

str(events)

## 'data.frame':    1048575 obs. of  5 variables:
##  $ month       : chr  "4/1/19" "6/1/19" "1/1/19" "2/1/19" ...
##  $ event_type  : chr  "page_view" "page_view" "page_view" "page_view" ...
##  $ url         : chr  "/foodrecipe/12345-the-chocolate-cake" "/foodrecipe/65428-grilled-octopus" "/foodrecipe/95043-avocado-toast" "/foodrecipe/12356-shallots-thyme" ...
##  $ total_events: int  10957 806 1150 375 93 144 271 2995 160 1846 ...
##  $ users       : int  9577 696 982 282 77 131 227 2391 131 1344 ...

products <- read.csv("segmentation_product_data.csv", header = T)
head(products)

summary(products)

##    product_id   product_name        brand_name         total_orders    
##  Min.   :  39   Length:3033        Length:3033        Min.   :   1.00  
##  1st Qu.:2544   Class :character   Class :character   1st Qu.:   3.00  
##  Median :3850   Mode  :character   Mode  :character   Median :   8.00  
##  Mean   :3586                                         Mean   :  27.85  
##  3rd Qu.:4837                                         3rd Qu.:  22.00  
##  Max.   :5772                                         Max.   :2531.00  
##   total_units       total_buyers      total_sales    
##  Min.   :   1.00   Min.   :   1.00   Min.   :    10  
##  1st Qu.:   3.00   1st Qu.:   3.00   1st Qu.:   220  
##  Median :   9.00   Median :   8.00   Median :   600  
##  Mean   :  32.71   Mean   :  27.31   Mean   :  2167  
##  3rd Qu.:  25.00   3rd Qu.:  21.00   3rd Qu.:  1562  
##  Max.   :2910.00   Max.   :2489.00   Max.   :272770

str(products)

## 'data.frame':    3033 obs. of  7 variables:
##  $ product_id  : int  39 49 53 54 56 58 59 61 63 67 ...
##  $ product_name: chr  "PieBox" "Jacobsen Salt Co. Slide Tin (4 pack)" "N. 19 - Salvador (Pimenton, Saffron, Seafood Essence)" "N. 25 - Escabeche (Lemon, Saffron, Coriander, Fennel)" ...
##  $ brand_name  : chr  "PieBox" "Jacobsen Salt Co." "La Boîte" "La Boîte" ...
##  $ total_orders: int  43 9 1 17 18 3 6 21 26 1 ...
##  $ total_units : int  45 10 1 17 18 3 6 21 33 1 ...
##  $ total_buyers: int  42 9 1 17 17 3 5 20 25 1 ...
##  $ total_sales : int  1921 239 27 395 440 135 150 505 1056 75 ...

library(sqldf)
sql<- sqldf('select brand_name,
            count (distinct product_name) No_of_Products,
            sum(total_orders) Total_orders,
            sum(total_units) Units,
            sum(total_sales) Total_sales,
            dense_rank() over (order by sum(total_sales) desc) Rank,
            case when dense_rank() over (order by sum(total_sales) desc) between 1 and 100 then "Group1"
            when dense_rank() over (order by sum(total_sales) desc) between 101 and 200 then "Group2"
            when dense_rank() over (order by sum(total_sales) desc) between 201 and 300 then "Group3"
            when dense_rank() over (order by sum(total_sales) desc) between 301 and 400 then "Group4"
            else "Group5" end as Segments,
            sum(total_buyers) Total_buyers,
            sum(total_sales)/count(distinct product_name) Average_product_Sale,
            sum(total_Sales)/sum(total_orders) AOV
            from products p
            group by brand_name
            order by sum(total_sales) desc')
head(sql, 10)

head(sql)

#AOV by Groups
Group1<- filter(sql, Segments=="Group1")
Group2<- filter(sql, Segments=="Group2")
Group3<- filter(sql, Segments=="Group3")
sum(Group1$Total_sales)/sum(Group1$Total_orders)

## [1] 80.21903

sum(Group2$Total_sales)/sum(Group2$Total_orders)

## [1] 70.30956

sum(Group3$Total_sales)/sum(Group3$Total_orders)

## [1] 52.71673

sql1.1 <- sql %>%
  group_by(Segments)%>%
  summarise(total_orders = sum(Total_orders),
            total_sales = sum(Total_sales))%>%
  mutate(AOV = total_sales / total_orders)

## `summarise()` ungrouping output (override with `.groups` argument)

ggplot(sql1.1, aes(x=Segments,y=AOV, fill = Segments ))+
  geom_bar(stat = "identity", width = 0.3)+
  geom_text(label = paste("$",round(sql1.1$AOV,2)),vjust =-0.2, color = "black")+ theme_economist()

f1<- filter(sql, Segments== "Group1") %>%
  top_n (n=20, Total_sales) %>%
  ggplot(aes(reorder(brand_name,-Total_sales), Total_sales))+
  geom_bar(stat = "identity")+
  scale_x_discrete(labels=abbreviate)+
  geom_text(aes(label=Total_sales), position=position_dodge(width=0.9), vjust=-0.25)+
  theme_economist()+
  scale_fill_viridis_c()+
  labs(title = "Top Brands by Total Sales Group1",
       caption = "source: Data 2019",
       x = "Brand Name",
       y="Total Sales"
  )
f1

## Warning in f(...): abbreviate used with non-ASCII chars

f2<- filter(sql, Segments== "Group2") %>%
  top_n (n=20, Total_sales) %>%
  ggplot(aes(reorder(brand_name,-Total_sales), Total_sales))+
  geom_bar(stat = "identity")+
  scale_x_discrete(labels=abbreviate)+
  geom_text(aes(label=Total_sales), position=position_dodge(width=0.9), vjust=-0.25)+
  theme_economist()+
  scale_fill_viridis_c()+
  labs(title = "Top Brands by Total Sales Group2",
       caption = "source: Data 2019",
       x = "Brand Name",
       y="Total Sales"
  )
f2

## Warning in f(...): abbreviate used with non-ASCII chars

f3<- filter(sql, Segments== "Group3") %>%
  top_n (n=20, Total_sales) %>%
  ggplot(aes(reorder(brand_name,-Total_sales), Total_sales))+
  geom_bar(stat = "identity")+
  scale_x_discrete(labels=abbreviate)+
  geom_text(aes(label=Total_sales), position=position_dodge(width=0.9), vjust=-0.25)+
  theme_economist()+
  scale_fill_viridis_c()+
  labs(title = "Top Brands by Total Sales Group3",
       caption = "source: Food52 2019",
       x = "Brand Name",
       y="Total Sales"
  )
f3

f4<- filter(sql, Segments== "Group4") %>%
  top_n (n=20, Total_sales) %>%
  ggplot(aes(reorder(brand_name,-Total_sales), Total_sales))+
  geom_bar(stat = "identity")+
  scale_x_discrete(labels=abbreviate)+
  geom_text(aes(label=Total_sales), position=position_dodge(width=0.9), vjust=-0.25)+
  theme_economist()+
  scale_fill_viridis_c()+
  labs(title = "Top Brands by Total Sales Group4",
       caption = "source: Food52 2019",
       x = "Brand Name",
       y="Total Sales"
  )
f4

## Warning in f(...): abbreviate used with non-ASCII chars

g1<-grid.arrange(f1,f2, ncol=1)

## Warning in f(...): abbreviate used with non-ASCII chars

## Warning in f(...): abbreviate used with non-ASCII chars

g2<-grid.arrange(f3,f4, ncol=1)

## Warning in f(...): abbreviate used with non-ASCII chars

Segmentation

Urooj

4/1/2020