Marketing Data Analysis: Data Summarizing, Wrangling, and Visualization (pt.1)

Introduction

This simple report* is part one of my data analysis with R showcase. The goal of this showcase is to help others with R coding, practice marketing analytics, as well as showcasing my data crunching skills to potential employers. The complexity of the data analysis will coincide with the sequence of the posts.

The sales data being analyzed for this post is of two fictional hardware stores; “Uptown” and “Downtown” from the book ‘Marketing Analytics, Data-Driven Techniques with Microsoft Excel’ by Wayne L. Winston. Each location sells 10 types of tapes, 10 types of adhesives, and 10 types of safety equipment.

Note: To view the code that produced the output, look for the button labeled “Code” to the top-right of the output.

This report was produced via R and RMarkdown. RMarkdown is a product of the RStudio team. To learn more, visit the following link http://rmarkdown.rstudio.com, or read ‘bookdown: Authoring Books and Technical Documents with R Markdown’ by Yihui Xie https://www.crcpress.com/bookdown-Authoring-Books-and-Technical-Documents-with-R-Markdown/Xie/p/book/9781138700109.

The packages utilized for this report are readxl, knitr, ggplot2, formattable, DT, and directlabels.

hw.store <- as.data.frame(hw.store)
hw.store$Store <- gsub(pattern = "uptown","Uptown_Store",hw.store$Store,ignore.case = FALSE)
hw.store$Store <- gsub(pattern = "downtown","Downtown_Store",hw.store$Store, ignore.case = FALSE)
hw.store$Month <- factor(hw.store$Month, ordered = T, levels = c("January",
                                                                         "February",
                                                                         "March",
                                                                         "April",
                                                                         "May",
                                                                         "June",
                                                                         "July",
                                                                         "August",
                                                                         "September",
                                                                         "October",
                                                                         "November",
                                                                         "December"))

DT::datatable(hw.store,
              filter = 'top',
              rownames = FALSE,
              caption = htmltools::tags$caption(
                style = 'caption-side:bottom; text-align: right;',
                'Source:', htmltools::em('Marketing Analytics by Wayne L. Winston.')))

% of Sales Per Store

The table below shows the ratio of sales between both locations. As one can see, there is only a 2.56% difference between both locations.

location <- unique(hw.store$Store)
proportion <- sprintf("%.2f%%", 100*round(prop.table(table(hw.store$Store)),4))
proportion_unformatted <- as.numeric(round(prop.table(table(hw.store$Store)),4))*100

df1 <- data.frame(location,proportion)

colnames(df1) <- c("Store","Proportion")

as.datatable(rownames = FALSE,
             caption = htmltools::tags$caption(
                style = 'caption-center:top; text-align: center;',
                'Overall Sales Proportion by Location'),
             options = list(
  pageLength = 12,
  dom = 't'),
  formattable(df1))

Monthly Revenue Table

The table below shows the accumulated revenue per month over the span of a year. March, August, October, and April performed below the monthly average of $799.33, with March being the lowest performer ($631.00) and November performing the best ($873.50).

The monthly revenue column is colored based on the average monthly revenue:

Red indicates below the average
Green indicates equal to or greater than the average

m.revenue <- aggregate(Price~Month,hw.store,sum,na.rm=T)
colnames(m.revenue)[2] <- "Revenue"
m.revenue2 <- m.revenue

Revenue_unformatted <- m.revenue2$Revenue
Ratio_of_Sales_unformatted <- prop.table(table(hw.store$Month))


m.revenue2$Revenue <- sprintf("$%.2f",Revenue_unformatted)

m.revenue2$Ratio_of_Sales <- sprintf("%.2f%%", 100*round(prop.table(table(hw.store$Month)),4))


colnames(m.revenue2)[3] <- "Revenue_Ratio"

revenue.total <- sum(Revenue_unformatted)
ratio.total <- sum(Ratio_of_Sales_unformatted)

revenue.total <- sprintf("$%.2f",revenue.total)
ratio.total <- sprintf("%.2f%%",100*ratio.total)
m.revenue2$Month <- as.character(m.revenue2$Month)

gt <- c(ratio.total,revenue.total)
m.revenue.total <- rbind(m.revenue2,gt)

m.revenue.total[[13,1]] <- "Grand_Total"

as.datatable(rownames = FALSE,
             caption = htmltools::tags$caption(
                style = 'caption-center:top; text-align: center;',
                'Monthly Revenue'),
             options = list(
               ordering=F,
               pageLength = 13,
               dom = 't'),
             formattable(m.revenue.total,list(
               Revenue = formatter("span",
                                 style = ~ style(color = ifelse(Revenue_unformatted >=mean(Revenue_unformatted), "green", "red"))))))

## Warning in .mapply(function(...) {: longer argument not a multiple of
## length of shorter

Monthly Revenue Visualized

This graph is a visual representation of the above revenue table.

The green bins repesent the Months that generated revenue above or equal to the monthly average
The red bins repesent the Months that generated revenue below the monthly average

ggplot(NULL, aes(m.revenue$Month,Revenue_unformatted))+
  geom_bar(stat = 'identity', fill =ifelse(Revenue_unformatted >= mean(Revenue_unformatted), "lightgreen", "red"))+
  geom_text(aes(label = sprintf("$%.2f",Revenue_unformatted), hjust = 1)) +
  labs(x=NULL,y=NULL,
       title="Monthly Revenue ",
       subtitle=paste("Average Monthly Revenue:",sprintf("$%.2f",mean(Revenue_unformatted))))+
  coord_flip()+
  theme_light()

Product Line Performance

This section breaks down the units sold per product, product revenue, and respective revenue percentile.

product.perf <- aggregate(Price~Product,hw.store,sum,na.rm=TRUE)
product.perf$Price_unformatted <- product.perf$Price

percent.rank.udf <- function(vector) {
  round((rank(vector)/length(vector))*100, digits = 2)
}

product.perf$Product_Rank_Unformatted <- percent.rank.udf(product.perf$Price_unformatted)

product.perf$Product_Rank <- sprintf("%.2f%%", product.perf$Product_Rank_Unformatted)

product.perf$Price <- sprintf("$%.2f",product.perf$Price)

Products <- names(table(hw.store$Product))
Units_Sold <- as.vector(table(hw.store$Product))

x <- unique(hw.store[,c("Product","Price")])
Price <- x[order(x$Product),"Price"]
Price <- sprintf("$%.2f",Price)

prod.table <- data.frame(Product=product.perf$Product,
                         Price,
                         Units_Sold,
                         Product_Revenue=product.perf$Price,
                         Revenue_Percentile=product.perf$Product_Rank)
                         


as.datatable(rownames = FALSE,
             filter = 'top',
             caption = htmltools::tags$caption(
                style = 'caption-center:top; text-align: center;',
                'Product Line Performance'),
             options = list(
  pageLength = 10,
  lengthMenu = c(10, 20, 30)),
  formattable(prod.table))

Pareto Table

Here we see a list of the products that generated 80% of the revenue. “Safety 8” outperformed all other products by at least $4,300.00, in turn making up 52.64% of the revenue.

pareto.table <- product.perf[product.perf$Product_Rank_Unformatted >=80.00,c(1,2,5)]

colnames(pareto.table)[2] <- "Product_Revenue"

pareto.table$Product_Revenue <- gsub(pattern = "\\$","",pareto.table$Product_Revenue)
pareto.table$Product_Rank <- gsub(pattern = "%","",pareto.table$Product_Rank)

pareto.table$Product_Revenue <- as.numeric(pareto.table$Product_Revenue)
pareto.table$Product_Rank <- as.numeric(pareto.table$Product_Rank)

pareto.table <- pareto.table[order(pareto.table$Product_Revenue, decreasing = TRUE),]

totals <- apply(pareto.table[,-1],2,sum)
totals[1] <- sprintf("$%.2f",totals[1])
totals[2] <- sprintf("%.2f%%",as.numeric(totals[2]))
totals <- c("Grand_Total",totals)

pareto.table$Product_Revenue <- sprintf("$%.2f",pareto.table$Product_Revenue)
pareto.table$Product_Rank <- sprintf("%.2f%%", pareto.table$Product_Rank)

pareto.table <- rbind(pareto.table,totals)

as.datatable(rownames = FALSE,
             caption = htmltools::tags$caption(
                style = 'caption-center:top; text-align: center;',
                'Top 80 Percentile by Revenue'),
             options = list(
               ordering=F,
               pageLength = 8,
               dom = 't'),
             formattable(pareto.table))

Product Ranking

The products that generated 80% of the overall revenue brought in $8,010.00, while the remaining 20% generated $1,582.00, which makes a difference of $6,428.00. If we exclude ‘Safety 8’, the 80% generated $2,960.00, making the difference between 80% of the revenue (excluding ‘Safety 8’) and the remaining 20% $1,378.00.

The blue bins represent the products that generated 80% of the overall revenue
The grey bins represent the products that generated the remaining revenue

product.df <- aggregate(Price~Product,hw.store,sum,na.rm=TRUE)
colnames(product.df)[2] <- "Revenue"

percent.rank.udf <- function(vector) {
  round((rank(vector)/length(vector))*100, digits = 2)
}

product.df$Rank <- percent.rank.udf(product.df$Revenue)

product.df <- product.df[order(product.df$Rank, decreasing = TRUE),]

below80 <- product.df[product.df$Rank < 80.00,]
above80 <- product.df[product.df$Rank >= 80.00,]

caption1 <- paste("< 80% Ranking Grand Total:",sprintf("$%.2f",sum(below80$Revenue)))
caption2 <- paste(">= 80% Ranking Grand Total:",sprintf("$%.2f",sum(above80$Revenue)))

ggplot(product.df, aes(x= reorder(Product, -Rank),y= Rank))+
  geom_bar(stat = 'identity',show.legend = FALSE, fill= ifelse(product.df$Rank >= 80.00,"lightblue","lightgrey"))+
    labs(x=NULL,y="Product Ranking by Revenue")+
    geom_text(aes(angle = 90,label = sprintf("$%.2f",Revenue), hjust = 1))+
  annotate(geom = "text", x = 22, y = 80,  label = caption1)+
  annotate(geom = "text", x = 22, y = 95,  label = caption2)+
  theme_bw()+
  theme(axis.text.x = element_text(angle = 90, hjust = 1))+
  scale_y_continuous(breaks = seq(0, 100, 20),labels = c("0%","20%","40%","60%","80%","100%"))

Store Performance Table

In terms of both revenue and sales, Uptown and Downtown stores performed similarly to one another, with Downtown outperforming Uptown by $379.00. We can see that the Downtown store only performs above its own average in January, July, and September. Seeing as how it outperforms the Uptown store, this is likely due to the Downtown store having a higher variance with its sales when compared to the Uptown store.

In terms of both stores’ monthly revenue, the months of March, April, August and October were below the monthly average, while the remaining 8 months performed above the monthly average.

Each store, and the monthly revenue is colored based on its own average monthly revenue:

Red indicates below the average
Green indicates equal to or greater than the average

store.perf <- aggregate(Price~Store+Month,hw.store,sum,na.rm=TRUE)
uptown.store <- store.perf[store.perf$Store=="Uptown_Store",]
downtown.store <- store.perf[store.perf$Store=="Downtown_Store",]

colnames(uptown.store)[3] <- "Uptown_Store"
colnames(downtown.store)[3] <- "Downtown_Store"

store.revenue <- merge(uptown.store,downtown.store,by.x = "Month",by.y = "Month", all.x = TRUE)

store.revenue <- store.revenue[,c(1,3,5)]

store.revenue$Monthly_Total <- store.revenue$Uptown_Store+store.revenue$Downtown_Store

store.revenue <- store.revenue[order(store.revenue$Month),]

Uptown_Store_unformatted <- store.revenue$Uptown_Store
Downtown_Store_unformatted <- store.revenue$Downtown_Store
Month_Total_unformatted <- store.revenue$Monthly_Total

store.revenue$Uptown_Store <- sprintf("$%.2f",store.revenue$Uptown_Store)
store.revenue$Downtown_Store <- sprintf("$%.2f",store.revenue$Downtown_Store)
store.revenue$Monthly_Total <- sprintf("$%.2f",store.revenue$Monthly_Total)

rownames(store.revenue) <- NULL
store.revenue$Index <- 1:nrow(store.revenue)
store.revenue <- store.revenue[,c(5,1,2,3,4)]


as.datatable(rownames = FALSE,
             caption = htmltools::tags$caption(
                style = 'caption-center:top; text-align: center;',
                'Monthly Performance Per Store'),
             options = list(
               pageLength = 12,
               dom = 't'),
             formattable(store.revenue,
                         list(
                           Uptown_Store = formatter("span",
                                              style = ~ style(color = ifelse(Uptown_Store_unformatted >= mean(Uptown_Store_unformatted), "green","red"))),
  Downtown_Store = formatter("span", 
    style = ~ style(color = ifelse(Downtown_Store_unformatted >= mean(Downtown_Store_unformatted), "green","red"))),
  Monthly_Total = formatter("span", 
    style = ~ style(color = ifelse(Month_Total_unformatted >= mean(Month_Total_unformatted), "green","red")))
  )))

Store Performance Visualized

Below, we get a more intuitive visual representation of how both locations perform on a month to month basis. We can see that both March and August require some form of promotional attention. I hypothesize that a new marketing campaign targeting customers with a low retention rate during the first week of March and the first week of August will increase the average monthly revenue by at least 20%.

df.agg <- aggregate(Price~Store+Month, hw.store, sum, na.rm=TRUE)
df.agg$Store <- gsub(pattern = "_Store","",df.agg$Store) #Removing to clean up the plot

caption3 <- paste(strwrap("March is our weakest month, revenue wise.",25),collapse = "\n")

caption4 <- paste(strwrap("The month of August is our second weakest month.",25),collapse = "\n")


ggplot(df.agg,aes(Month,Price, color = Store,  group = Store))+
  directlabels::geom_dl(aes(label = Store),method = list("last.qp", cex = 0.85,hjust = .8))+
  geom_line(show.legend = FALSE)+
  geom_point(show.legend = FALSE,alpha=0.5)+
  labs(x=NULL,y=NULL)+
  annotate(geom = "text", x = 3.8, y = 460,  label = caption3)+
  #annotate(geom = "segment", x = 3.5, xend = 3, y = 445, yend = 345, arrow=arrow())+
  annotate(geom = "text", x = 8, y = 480,  label = caption4)+
  #annotate(geom = "segment", x = 8, xend = 8, y = 460, yend = 395, arrow=arrow())+
  scale_x_discrete(labels = c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct","Nov","Dec"))+
  scale_y_continuous(breaks = seq(300, 500, 50),labels = c("$300","$350","$400","$450","$500"))+
  scale_y_continuous(sec.axis = dup_axis())+
  scale_color_manual(values=c("#CC6666", "#9999CC"))+
  theme_light()

Store Summary

# Totals
uptown.total <- sum(uptown.store$Uptown)
downtown.total <- sum(downtown.store$Downtown)
store.gt <- sum(uptown.store$Uptown+downtown.store$Downtown)


uptown.total <- sprintf("$%.2f",uptown.total)
downtown.total <- sprintf("$%.2f",downtown.total)
store.gt <- sprintf("$%.2f",store.gt)
store.totals <- c(uptown.total,downtown.total,store.gt)


# Averages
uptown.avg <- mean(uptown.store$Uptown)
downtown.avg <- mean(downtown.store$Downtown)
gt.store.avg <- uptown.avg+downtown.avg

uptown.average <- sprintf("$%.2f",mean(uptown.avg))
downtown.average <- sprintf("$%.2f",mean(downtown.avg))
gt.store.avg <- sprintf("$%.2f",mean(gt.store.avg))

store.averages <- c(uptown.average,downtown.average,gt.store.avg)

store.summary <- data.frame(Totals=store.totals,Averages=store.averages)
rownames(store.summary) <- c("Uptown_Store","Downtown_Store","Grand Total")


as.datatable(
  caption = htmltools::tags$caption(
                style = 'caption-center:top; text-align: center;',
                'Store Summary'),
  options = list(
    ordering=F,
    dom = 't'),
  formattable(store.summary))