Background Context:

Business Objective:

Importing Libraries & Dataset

library(readr)
library(dplyr)
library(lubridate)
library(tidyr)
library(ggplot2)
library(tibble)
library(corrplot)
library(gt)
library(broom)

Data Overview

head(df)
## # A tibble: 6 × 17
##   `Client ID` `Client Type`   `Number of Customers` `Montly Target` `Zip Code`
##   <chr>       <chr>                           <dbl>           <dbl>      <dbl>
## 1 ID-987275   Medium Facility                  2800             125       1003
## 2 ID-987275   Medium Facility                  2800             125       1003
## 3 ID-987275   Medium Facility                  2800             125       1003
## 4 ID-987275   Medium Facility                  2800             125       1003
## 5 ID-987275   Medium Facility                  2800             125       1003
## 6 ID-987275   Medium Facility                  2800             125       1003
## # ℹ 12 more variables: Calendardate <chr>, `Amount Collected` <dbl>,
## #   `Unit Sold` <dbl>, `Campaign (Email)` <dbl>, `Campaign (Flyer)` <dbl>,
## #   `Campaign (Phone)` <dbl>, `Sales Contact 1` <dbl>, `Sales Contact 2` <dbl>,
## #   `Sales Contact 3` <dbl>, `Sales Contact 4` <dbl>, `Sales Contact 5` <dbl>,
## #   `Number of Competition` <chr>
str(df)
## spc_tbl_ [2,976 × 17] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
##  $ Client ID            : chr [1:2976] "ID-987275" "ID-987275" "ID-987275" "ID-987275" ...
##  $ Client Type          : chr [1:2976] "Medium Facility" "Medium Facility" "Medium Facility" "Medium Facility" ...
##  $ Number of Customers  : num [1:2976] 2800 2800 2800 2800 2800 2800 2800 2800 2800 2800 ...
##  $ Montly Target        : num [1:2976] 125 125 125 125 125 125 125 125 125 125 ...
##  $ Zip Code             : num [1:2976] 1003 1003 1003 1003 1003 ...
##  $ Calendardate         : chr [1:2976] "16-01-2014" "16-02-2014" "18-03-2014" "18-04-2014" ...
##  $ Amount Collected     : num [1:2976] 0 3409460 10228384 17047304 23866224 ...
##  $ Unit Sold            : num [1:2976] 0 24 75 123 171 198 123 123 123 198 ...
##  $ Campaign (Email)     : num [1:2976] 0 0 0 0 0 ...
##  $ Campaign (Flyer)     : num [1:2976] 0 0 0 0 0 ...
##  $ Campaign (Phone)     : num [1:2976] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Sales Contact 1      : num [1:2976] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Sales Contact 2      : num [1:2976] 0 0 0 3547500 0 ...
##  $ Sales Contact 3      : num [1:2976] 0 0 0 1290000 0 ...
##  $ Sales Contact 4      : num [1:2976] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Sales Contact 5      : num [1:2976] 0 322500 0 0 0 ...
##  $ Number of Competition: chr [1:2976] "Low" "Low" "Low" "Low" ...
##  - attr(*, "spec")=
##   .. cols(
##   ..   `Client ID` = col_character(),
##   ..   `Client Type` = col_character(),
##   ..   `Number of Customers` = col_double(),
##   ..   `Montly Target` = col_double(),
##   ..   `Zip Code` = col_double(),
##   ..   Calendardate = col_character(),
##   ..   `Amount Collected` = col_double(),
##   ..   `Unit Sold` = col_double(),
##   ..   `Campaign (Email)` = col_double(),
##   ..   `Campaign (Flyer)` = col_double(),
##   ..   `Campaign (Phone)` = col_double(),
##   ..   `Sales Contact 1` = col_double(),
##   ..   `Sales Contact 2` = col_double(),
##   ..   `Sales Contact 3` = col_double(),
##   ..   `Sales Contact 4` = col_double(),
##   ..   `Sales Contact 5` = col_double(),
##   ..   `Number of Competition` = col_character()
##   .. )
##  - attr(*, "problems")=<externalptr>

Data Cleaning

## Cleaning the CalendarDate Column from (Chr) to Date.
df <- df %>%
  mutate(Calendardate = as.Date(Calendardate, format = "%d-%m-%Y"),
         Calendar_Month = month(Calendardate),
         Calendar_Year = year(Calendardate))
# Replacing spaces with underscores in column names
colnames(df) <- gsub(" ", "_", colnames(df))
colnames(df) <- gsub("\\(|\\)", "", colnames(df))

# Check the updated column names
colnames(df)
##  [1] "Client_ID"             "Client_Type"           "Number_of_Customers"  
##  [4] "Montly_Target"         "Zip_Code"              "Calendardate"         
##  [7] "Amount_Collected"      "Unit_Sold"             "Campaign_Email"       
## [10] "Campaign_Flyer"        "Campaign_Phone"        "Sales_Contact_1"      
## [13] "Sales_Contact_2"       "Sales_Contact_3"       "Sales_Contact_4"      
## [16] "Sales_Contact_5"       "Number_of_Competition" "Calendar_Month"       
## [19] "Calendar_Year"

Simple EDA

# Distribution of Client Types
df %>% count(Client_Type) %>% mutate(Percentage = n / sum(n) * 100)
## # A tibble: 4 × 3
##   Client_Type          n Percentage
##   <chr>            <int>      <dbl>
## 1 Large Facility    1368      46.0 
## 2 Medium Facility    504      16.9 
## 3 Private Facility   264       8.87
## 4 Small Facility     840      28.2
# Mean Amount Collected by Client Type
df %>% group_by(Client_Type) %>% summarise(Avg_Amount_Collected = mean(Amount_Collected, na.rm = TRUE))
## # A tibble: 4 × 2
##   Client_Type      Avg_Amount_Collected
##   <chr>                           <dbl>
## 1 Large Facility              19998805.
## 2 Medium Facility             40759968.
## 3 Private Facility             5030246.
## 4 Small Facility               1637759.

Statistical Analysis:

Impact of (Marketing Strategies & Sales Contacts) on (Sales):

# Correlation between marketing strategies and Amount Collected
cor_matrix <- cor(df %>% select(Amount_Collected, Campaign_Email, Campaign_Flyer, Campaign_Phone, Sales_Contact_1, Sales_Contact_2, Sales_Contact_3, Sales_Contact_4, Sales_Contact_5), use = "complete.obs")
cor_matrix['Amount_Collected',]
## Amount_Collected   Campaign_Email   Campaign_Flyer   Campaign_Phone 
##       1.00000000       0.24823455       0.44433668       0.03485845 
##  Sales_Contact_1  Sales_Contact_2  Sales_Contact_3  Sales_Contact_4 
##       0.27747840       0.55211154       0.35788696       0.23616507 
##  Sales_Contact_5 
##       0.09579479

The correlation between which (marketing campagin strategies) & (amount_collected) from most to least successful:

  • Flyers
  • Email
  • Phone

The correlation between which (Sales_Contact) & (Amount_Collected) from most to least successful:

  • Contact 2
  • Contact 3
  • Contact 1
  • Contact 4
  • Contact 5

Visualization: “Organizational correlation of (Marketing Strategies & Sales Contacts) with (Sales)

# Removing total 'Amount_collected', which would be a correlation of itself
cor_data <- cor_matrix['Amount_Collected', -1]

# Transform into a data frame for plotting
cor_df <- data.frame(
  Strategy = names(cor_data),
  Correlation = cor_data
)

# Create the bar plot
ggplot(cor_df, aes(x = reorder(Strategy, Correlation), y = Correlation, fill = Correlation)) +
  geom_bar(stat = "identity") +
  coord_flip() +  # Flip coordinates to make it horizontal
  scale_fill_gradient(low = "red", high = "green") +  # Use red to green gradient
  labs(title = "Correlation of Marketing Strategies with Amount Collected",
       x = "Marketing Strategy",
       y = "Degree of Correlation") +
  theme_minimal() +
  theme(legend.position = "none")  # Remove the legend for the fill

Visualization: “Organizational Facilities’ correlation of (Marketing Strategies & Sales Contacts) with (Sales)”

# Define a simple function to create a correlation plot for a given subset of data
create_correlation_plot <- function(data_subset, title_suffix) {
  cor_matrix <- cor(data_subset %>% 
                    select(Amount_Collected, Campaign_Email, Campaign_Flyer, Campaign_Phone, 
                           Sales_Contact_1, Sales_Contact_2, Sales_Contact_3, Sales_Contact_4, Sales_Contact_5),
                    use = "complete.obs")
  cor_data <- cor_matrix['Amount_Collected', -1]  # Exclude self-correlation

  cor_df <- data.frame(
    Strategy = names(cor_data),
    Correlation = cor_data
  )

  ggplot(cor_df, aes(x = reorder(Strategy, Correlation), y = Correlation, fill = Correlation)) +
    geom_bar(stat = "identity") +
    coord_flip() +  # Flip coordinates to make it horizontal
    scale_fill_gradient(low = "red", high = "green") +
    labs(title = paste("Correlation of Marketing Sales & Strategies -", title_suffix),
         x = "Marketing Strategy", y = "Degree of Correlation") +
    theme_minimal() +
    theme(legend.position = "none")
}

# Create a list of facility types you want to plot
facility_types <- c("Small Facility", "Medium Facility", "Large Facility", "Private Facility")

# Loop through the facility types and create a plot for each
plots <- list()
for (type in facility_types) {
  data_subset <- df %>% filter(Client_Type == type)
  plots[[type]] <- create_correlation_plot(data_subset, type)
}

# Display the plots
plots$`Small Facility`

plots$`Medium Facility`

plots$`Large Facility`

plots$`Private Facility`

  • Key-Takeaways:
    • Sales_Contact_1: Overall has a negative impact in various facilities Sectors
    • Sales_Contact_5: Under performing within the Smaller & Larger Facilities Sector
    • Campaign_Email: Under performing within the Private Facilities Sector

Linear Regression Analysis (Market Sales and Strategies)

# Modeling a Linear Regression Analysis on Sales on Marketing Strategies & Sales Contacts
model <- lm(Amount_Collected ~ Campaign_Email + Campaign_Flyer + Campaign_Phone +
            Sales_Contact_1 + Sales_Contact_2 + Sales_Contact_3 + 
            Sales_Contact_4 + Sales_Contact_5, data = df)

summary(model)
## 
## Call:
## lm(formula = Amount_Collected ~ Campaign_Email + Campaign_Flyer + 
##     Campaign_Phone + Sales_Contact_1 + Sales_Contact_2 + Sales_Contact_3 + 
##     Sales_Contact_4 + Sales_Contact_5, data = df)
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -111513076   -8521151   -1481206    2067059  147916803 
## 
## Coefficients:
##                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)     1.481e+06  5.123e+05   2.891  0.00387 ** 
## Campaign_Email  7.932e-01  5.968e-01   1.329  0.18388    
## Campaign_Flyer  3.338e+00  2.601e-01  12.831  < 2e-16 ***
## Campaign_Phone  7.336e-02  1.053e+00   0.070  0.94444    
## Sales_Contact_1 4.237e+00  4.151e-01  10.207  < 2e-16 ***
## Sales_Contact_2 3.638e+00  1.292e-01  28.155  < 2e-16 ***
## Sales_Contact_3 2.343e+00  1.307e-01  17.925  < 2e-16 ***
## Sales_Contact_4 1.095e+01  1.060e+00  10.331  < 2e-16 ***
## Sales_Contact_5 3.508e+00  4.549e+00   0.771  0.44072    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 21850000 on 2967 degrees of freedom
## Multiple R-squared:  0.4799, Adjusted R-squared:  0.4785 
## F-statistic: 342.1 on 8 and 2967 DF,  p-value: < 2.2e-16
# Extracting significant coefficients where p-value is less than 0.05
coef_df <- broom::tidy(model) %>% 
  filter(p.value < 0.05) %>% 
  select(term, estimate)

# View the filtered coefficients dataframe
print(coef_df)
## # A tibble: 6 × 2
##   term              estimate
##   <chr>                <dbl>
## 1 (Intercept)     1481206.  
## 2 Campaign_Flyer        3.34
## 3 Sales_Contact_1       4.24
## 4 Sales_Contact_2       3.64
## 5 Sales_Contact_3       2.34
## 6 Sales_Contact_4      10.9

Linear Regression by Client_Types (Small, Medium, Large, Private)

# Modeling a Linear regression analysis By each Client Type (Small, Medium, Large, Private)
consolidated_summary <- df %>%
  group_by(Client_Type) %>%
  do({
    model <- lm(Amount_Collected ~ Campaign_Email + Campaign_Flyer + Campaign_Phone +
                Sales_Contact_1 + Sales_Contact_2 + Sales_Contact_3 +
                Sales_Contact_4 + Sales_Contact_5, data = .)
    tidy(model) %>%
      filter(p.value < 0.05) %>%
      select(term, estimate) %>%
      arrange(desc(estimate))
  }) %>%
  ungroup() %>%
  rename(Variable = term, `Coefficent (Impact)` = estimate)

print(consolidated_summary)
## # A tibble: 15 × 3
##    Client_Type      Variable        `Coefficent (Impact)`
##    <chr>            <chr>                           <dbl>
##  1 Large Facility   (Intercept)               2811701.   
##  2 Large Facility   Sales_Contact_1                11.7  
##  3 Large Facility   Sales_Contact_4                10.6  
##  4 Large Facility   Sales_Contact_2                 4.00 
##  5 Large Facility   Campaign_Flyer                  2.72 
##  6 Large Facility   Sales_Contact_3                 2.03 
##  7 Large Facility   Campaign_Phone                 -3.54 
##  8 Medium Facility  (Intercept)               5681904.   
##  9 Medium Facility  Campaign_Flyer                  4.11 
## 10 Medium Facility  Sales_Contact_2                 3.58 
## 11 Medium Facility  Sales_Contact_1                 3.14 
## 12 Medium Facility  Sales_Contact_3                 2.12 
## 13 Private Facility Sales_Contact_2                 6.62 
## 14 Small Facility   (Intercept)                878923.   
## 15 Small Facility   Sales_Contact_2                 0.810
# This will provide a more granular insight into each Marketing Strategy & Sales Contact from each facility.
# Editing Summarized Table for Business
consolidated_summary_modified <- consolidated_summary %>%
  rename(Account_Type = Client_Type, `Return on Investment` = `Coefficent (Impact)`) %>% # Correcting the column names
  mutate(`Return on Investment` = round(`Return on Investment`, 2)) %>% # Rounding the 'Return on Investment' to 2 decimal places
  filter(Variable != "(Intercept)") # Removing unnecessary grand total rows with the variable "(Intercept)"

print(consolidated_summary_modified)
## # A tibble: 12 × 3
##    Account_Type     Variable        `Return on Investment`
##    <chr>            <chr>                            <dbl>
##  1 Large Facility   Sales_Contact_1                  11.7 
##  2 Large Facility   Sales_Contact_4                  10.6 
##  3 Large Facility   Sales_Contact_2                   4   
##  4 Large Facility   Campaign_Flyer                    2.72
##  5 Large Facility   Sales_Contact_3                   2.03
##  6 Large Facility   Campaign_Phone                   -3.54
##  7 Medium Facility  Campaign_Flyer                    4.11
##  8 Medium Facility  Sales_Contact_2                   3.58
##  9 Medium Facility  Sales_Contact_1                   3.14
## 10 Medium Facility  Sales_Contact_3                   2.12
## 11 Private Facility Sales_Contact_2                   6.62
## 12 Small Facility   Sales_Contact_2                   0.81

Conclusion:

  • From the table above, we can visualize how much ROI we can derive for each dollar spent within each avenue of marketing.
  • We can clearly see that for each different marketing strategy, sales contact, and even facility requires a tailored adjustment for improvements.
  • For Example Medium Facility shows decent results with Flyer Campaigns and each dollar spend return 4 dollars on average. Sales Contact 2 is highly effective followed by Sales Contact 1 and Sales Contact 3. Else all other strategy shows no impact can be dropped to save cost.