#Libraries needed
#install.packages("ggthemes")
library(sqldf)
## Loading required package: gsubfn
## Loading required package: proto
## Loading required package: RSQLite
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.2 ✓ purrr 0.3.4
## ✓ tibble 3.0.3 ✓ dplyr 1.0.2
## ✓ tidyr 1.1.1 ✓ stringr 1.4.0
## ✓ readr 1.3.1 ✓ forcats 0.5.0
## ── Conflicts ─────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(dplyr)
library(ggthemes)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(ggplot2)
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(corrplot)
## corrplot 0.84 loaded
events<- read.csv("segmentation_event_data.csv", header = T)
head(events)
events <-data.frame(events)
summary(events)
## month event_type url total_events
## Length:1048575 Length:1048575 Length:1048575 Min. : 1
## Class :character Class :character Class :character 1st Qu.: 1
## Mode :character Mode :character Mode :character Median : 3
## Mean : 74
## 3rd Qu.: 12
## Max. :817705
## users
## Min. : 0.0
## 1st Qu.: 1.0
## Median : 2.0
## Mean : 59.4
## 3rd Qu.: 11.0
## Max. :581382.0
str(events)
## 'data.frame': 1048575 obs. of 5 variables:
## $ month : chr "4/1/19" "6/1/19" "1/1/19" "2/1/19" ...
## $ event_type : chr "page_view" "page_view" "page_view" "page_view" ...
## $ url : chr "/foodrecipe/12345-the-chocolate-cake" "/foodrecipe/65428-grilled-octopus" "/foodrecipe/95043-avocado-toast" "/foodrecipe/12356-shallots-thyme" ...
## $ total_events: int 10957 806 1150 375 93 144 271 2995 160 1846 ...
## $ users : int 9577 696 982 282 77 131 227 2391 131 1344 ...
products <- read.csv("segmentation_product_data.csv", header = T)
head(products)
summary(products)
## product_id product_name brand_name total_orders
## Min. : 39 Length:3033 Length:3033 Min. : 1.00
## 1st Qu.:2544 Class :character Class :character 1st Qu.: 3.00
## Median :3850 Mode :character Mode :character Median : 8.00
## Mean :3586 Mean : 27.85
## 3rd Qu.:4837 3rd Qu.: 22.00
## Max. :5772 Max. :2531.00
## total_units total_buyers total_sales
## Min. : 1.00 Min. : 1.00 Min. : 10
## 1st Qu.: 3.00 1st Qu.: 3.00 1st Qu.: 220
## Median : 9.00 Median : 8.00 Median : 600
## Mean : 32.71 Mean : 27.31 Mean : 2167
## 3rd Qu.: 25.00 3rd Qu.: 21.00 3rd Qu.: 1562
## Max. :2910.00 Max. :2489.00 Max. :272770
str(products)
## 'data.frame': 3033 obs. of 7 variables:
## $ product_id : int 39 49 53 54 56 58 59 61 63 67 ...
## $ product_name: chr "PieBox" "Jacobsen Salt Co. Slide Tin (4 pack)" "N. 19 - Salvador (Pimenton, Saffron, Seafood Essence)" "N. 25 - Escabeche (Lemon, Saffron, Coriander, Fennel)" ...
## $ brand_name : chr "PieBox" "Jacobsen Salt Co." "La Boîte" "La Boîte" ...
## $ total_orders: int 43 9 1 17 18 3 6 21 26 1 ...
## $ total_units : int 45 10 1 17 18 3 6 21 33 1 ...
## $ total_buyers: int 42 9 1 17 17 3 5 20 25 1 ...
## $ total_sales : int 1921 239 27 395 440 135 150 505 1056 75 ...
library(sqldf)
sql<- sqldf('select brand_name,
count (distinct product_name) No_of_Products,
sum(total_orders) Total_orders,
sum(total_units) Units,
sum(total_sales) Total_sales,
dense_rank() over (order by sum(total_sales) desc) Rank,
case when dense_rank() over (order by sum(total_sales) desc) between 1 and 100 then "Group1"
when dense_rank() over (order by sum(total_sales) desc) between 101 and 200 then "Group2"
when dense_rank() over (order by sum(total_sales) desc) between 201 and 300 then "Group3"
when dense_rank() over (order by sum(total_sales) desc) between 301 and 400 then "Group4"
else "Group5" end as Segments,
sum(total_buyers) Total_buyers,
sum(total_sales)/count(distinct product_name) Average_product_Sale,
sum(total_Sales)/sum(total_orders) AOV
from products p
group by brand_name
order by sum(total_sales) desc')
head(sql, 10)
head(sql)
#AOV by Groups
Group1<- filter(sql, Segments=="Group1")
Group2<- filter(sql, Segments=="Group2")
Group3<- filter(sql, Segments=="Group3")
sum(Group1$Total_sales)/sum(Group1$Total_orders)
## [1] 80.21903
sum(Group2$Total_sales)/sum(Group2$Total_orders)
## [1] 70.30956
sum(Group3$Total_sales)/sum(Group3$Total_orders)
## [1] 52.71673
sql1.1 <- sql %>%
group_by(Segments)%>%
summarise(total_orders = sum(Total_orders),
total_sales = sum(Total_sales))%>%
mutate(AOV = total_sales / total_orders)
## `summarise()` ungrouping output (override with `.groups` argument)
ggplot(sql1.1, aes(x=Segments,y=AOV, fill = Segments ))+
geom_bar(stat = "identity", width = 0.3)+
geom_text(label = paste("$",round(sql1.1$AOV,2)),vjust =-0.2, color = "black")+ theme_economist()

f1<- filter(sql, Segments== "Group1") %>%
top_n (n=20, Total_sales) %>%
ggplot(aes(reorder(brand_name,-Total_sales), Total_sales))+
geom_bar(stat = "identity")+
scale_x_discrete(labels=abbreviate)+
geom_text(aes(label=Total_sales), position=position_dodge(width=0.9), vjust=-0.25)+
theme_economist()+
scale_fill_viridis_c()+
labs(title = "Top Brands by Total Sales Group1",
caption = "source: Data 2019",
x = "Brand Name",
y="Total Sales"
)
f1
## Warning in f(...): abbreviate used with non-ASCII chars

f2<- filter(sql, Segments== "Group2") %>%
top_n (n=20, Total_sales) %>%
ggplot(aes(reorder(brand_name,-Total_sales), Total_sales))+
geom_bar(stat = "identity")+
scale_x_discrete(labels=abbreviate)+
geom_text(aes(label=Total_sales), position=position_dodge(width=0.9), vjust=-0.25)+
theme_economist()+
scale_fill_viridis_c()+
labs(title = "Top Brands by Total Sales Group2",
caption = "source: Data 2019",
x = "Brand Name",
y="Total Sales"
)
f2
## Warning in f(...): abbreviate used with non-ASCII chars

f3<- filter(sql, Segments== "Group3") %>%
top_n (n=20, Total_sales) %>%
ggplot(aes(reorder(brand_name,-Total_sales), Total_sales))+
geom_bar(stat = "identity")+
scale_x_discrete(labels=abbreviate)+
geom_text(aes(label=Total_sales), position=position_dodge(width=0.9), vjust=-0.25)+
theme_economist()+
scale_fill_viridis_c()+
labs(title = "Top Brands by Total Sales Group3",
caption = "source: Food52 2019",
x = "Brand Name",
y="Total Sales"
)
f3

f4<- filter(sql, Segments== "Group4") %>%
top_n (n=20, Total_sales) %>%
ggplot(aes(reorder(brand_name,-Total_sales), Total_sales))+
geom_bar(stat = "identity")+
scale_x_discrete(labels=abbreviate)+
geom_text(aes(label=Total_sales), position=position_dodge(width=0.9), vjust=-0.25)+
theme_economist()+
scale_fill_viridis_c()+
labs(title = "Top Brands by Total Sales Group4",
caption = "source: Food52 2019",
x = "Brand Name",
y="Total Sales"
)
f4
## Warning in f(...): abbreviate used with non-ASCII chars

g1<-grid.arrange(f1,f2, ncol=1)
## Warning in f(...): abbreviate used with non-ASCII chars
## Warning in f(...): abbreviate used with non-ASCII chars

g2<-grid.arrange(f3,f4, ncol=1)
## Warning in f(...): abbreviate used with non-ASCII chars
