Working Directory

Loading neccessary libraries

library(readxl)       # For reading Excel files
library(skimr)        # For quick and concise data summaries
library(dplyr)        # For data manipulation
library(tidyr)        # For reshaping data
library(ggplot2)      # For visualization
library(cluster)      # For clustering analysis
library(fastDummies)  # For dummy variable creation
library(writexl)      # Save dataframe as excel
library(pROC)         # For ROC curve and AUC in classification
library(psych)        # Summarize the dataset "data"
library(ggrepel)      # Position label to avoid overlap

Read dataset

tmp <- read_excel("Business_Universe_Data_Assessment_2024.xlsx")

Inspecting dataset

skim(tmp)
Data summary
Name tmp
Number of rows 1788
Number of columns 11
_______________________
Column type frequency:
character 3
numeric 8
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
company name 0 1 3 73 0 1788 0
Country 0 1 14 14 0 1 0
Sub sector 2 1 7 375 0 822 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
CompanyURN 0 1.00 101434.18 117950.64 41.00 19092.75 59731.00 139070.75 475789.00 ▇▃▁▁▁
Turnover (£) 0 1.00 1763.33 8488.24 0.30 160.10 331.44 817.12 180074.00 ▇▁▁▁▁
No of staff 0 1.00 9693.96 99387.33 1000.00 1554.50 2592.00 5245.00 4052806.00 ▇▁▁▁▁
Profit/Loss as % of Turnover 0 1.00 -0.27 10.23 -296.23 0.00 0.02 0.08 105.04 ▁▁▁▇▁
Year founded 0 1.00 1976.09 37.27 1770.00 1966.00 1988.00 2000.00 2017.00 ▁▁▁▂▇
No of Sites 0 1.00 68.34 241.71 1.00 3.00 9.00 34.00 5253.00 ▇▁▁▁▁
Customer? 1299 0.27 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
Displaying 1st party intent last 3 months? 1123 0.37 1.00 0.00 1.00 1.00 1.00 1.00 1.00 ▁▁▇▁▁
str(tmp)
## tibble [1,788 × 11] (S3: tbl_df/tbl/data.frame)
##  $ CompanyURN                                : num [1:1788] 24138 25240 149774 26890 9759 ...
##  $ company name                              : chr [1:1788] "BBA Aviation plc" "GKN Aerospace Services Limited" "Leonardo MW Ltd" "Senior plc" ...
##  $ Country                                   : chr [1:1788] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
##  $ Turnover (£)                              : num [1:1788] 2149 787 990 1023 14955 ...
##  $ No of staff                               : num [1:1788] 13002 4002 4256 7658 49900 ...
##  $ Sub sector                                : chr [1:1788] "Manufacture-Aerospace," "Manufacture-Aerospace,Manufacture-Defence" "Manufacture-Aerospace,Manufacture-Defence" "Manufacture-Aerospace,Manufacture-Defence,Manufacture-Electronics/Electrical,Manufacture-Engineering" ...
##  $ Profit/Loss as % of Turnover              : num [1:1788] 0.0598 0.1524 0.0669 0.0589 0.258 ...
##  $ Year founded                              : num [1:1788] 1897 1939 1950 1933 1884 ...
##  $ No of Sites                               : num [1:1788] 6 6 9 3 23 8 25 826 6 79 ...
##  $ Customer?                                 : num [1:1788] NA NA NA NA 1 NA 1 NA NA 1 ...
##  $ Displaying 1st party intent last 3 months?: num [1:1788] 1 NA NA NA NA NA NA NA 1 1 ...

Checking for missing values

colSums(is.na(tmp))
##                                 CompanyURN 
##                                          0 
##                               company name 
##                                          0 
##                                    Country 
##                                          0 
##                               Turnover (£) 
##                                          0 
##                                No of staff 
##                                          0 
##                                 Sub sector 
##                                          2 
##               Profit/Loss as % of Turnover 
##                                          0 
##                               Year founded 
##                                          0 
##                                No of Sites 
##                                          0 
##                                  Customer? 
##                                       1299 
## Displaying 1st party intent last 3 months? 
##                                       1123
sum(is.na(tmp))
## [1] 2424

DEMOGRAPHY

summary_stats <- tmp %>%
  summarise(
    Min_Turnover = min(`Turnover (£)`, na.rm = TRUE),
    Max_Turnover = max(`Turnover (£)`, na.rm = TRUE),
    Avg_Turnover = mean(`Turnover (£)`, na.rm = TRUE),
    Min_Staff = min(`No of staff`, na.rm = TRUE),
    Max_Staff = max(`No of staff`, na.rm = TRUE),
    Avg_Staff = mean(`No of staff`, na.rm = TRUE),
    Min_Year = min(`Year founded`, na.rm = TRUE),
    Max_Year = max(`Year founded`, na.rm = TRUE),
    Avg_Year = mean(`Year founded`, na.rm = TRUE)
  )

print(summary_stats)
## # A tibble: 1 × 9
##   Min_Turnover Max_Turnover Avg_Turnover Min_Staff Max_Staff Avg_Staff Min_Year
##          <dbl>        <dbl>        <dbl>     <dbl>     <dbl>     <dbl>    <dbl>
## 1          0.3       180074        1763.      1000   4052806     9694.     1770
## # ℹ 2 more variables: Max_Year <dbl>, Avg_Year <dbl>

Country and Customer

country_plot <- ggplot(tmp, aes(x = Country)) +
  geom_bar(fill = "#F64668", alpha = 0.8) +
  labs(title = "Country Distribution", x = "Country", y = "Count") +
  theme_minimal()

customer_plot <- ggplot(tmp, aes(x = factor(`Customer?`))) +
  geom_bar(fill = "#F64668", alpha = 0.7) +
  labs(title = "Customer Conversion Status", x = "Conversion Status", y = "Count") +
  theme_minimal()

print(country_plot)

print(customer_plot)

AUTOMATED SELECTION

Data Quality Validation:

-Data Quality Check Summary -Lead Scoring Model -Lead Tier Assignment -Lead Summary by Tier -Lead Tier Visualization

auto_data_qual <- tmp %>%
  mutate(
    Turnover_Valid = ifelse(`Turnover (£)` > 0, TRUE, FALSE),
    Staff_Valid = ifelse(`No of staff` > 0, TRUE, FALSE),
    Profit_Loss_Valid = ifelse(`Profit/Loss as % of Turnover` >= -100 & `Profit/Loss as % of Turnover` <= 100, TRUE, FALSE),
    Intent_Valid = !is.na(`Displaying 1st party intent last 3 months?`)
  ) %>%
  filter(Turnover_Valid & Staff_Valid & Profit_Loss_Valid & Intent_Valid)
cat("Data Quality Check Summary:\n")
## Data Quality Check Summary:
print(summary(auto_data_qual))
##    CompanyURN     company name         Country           Turnover (£)     
##  Min.   :    42   Length:664         Length:664         Min.   :     0.3  
##  1st Qu.: 19312   Class :character   Class :character   1st Qu.:   152.9  
##  Median : 70123   Mode  :character   Mode  :character   Median :   319.1  
##  Mean   :108344                                         Mean   :  1665.5  
##  3rd Qu.:146731                                         3rd Qu.:   781.2  
##  Max.   :475753                                         Max.   :141450.0  
##                                                                           
##   No of staff      Sub sector        Profit/Loss as % of Turnover
##  Min.   :  1000   Length:664         Min.   :-66.12010           
##  1st Qu.:  1526   Class :character   1st Qu.:  0.00000           
##  Median :  2348   Mode  :character   Median :  0.03013           
##  Mean   :  7580                      Mean   :  0.21179           
##  3rd Qu.:  5022                      3rd Qu.:  0.08936           
##  Max.   :588112                      Max.   : 76.52657           
##                                                                  
##   Year founded   No of Sites        Customer?  
##  Min.   :1770   Min.   :   1.00   Min.   :1    
##  1st Qu.:1968   1st Qu.:   3.00   1st Qu.:1    
##  Median :1988   Median :   8.50   Median :1    
##  Mean   :1977   Mean   :  51.73   Mean   :1    
##  3rd Qu.:2000   3rd Qu.:  28.00   3rd Qu.:1    
##  Max.   :2017   Max.   :1577.00   Max.   :1    
##                                   NA's   :427  
##  Displaying 1st party intent last 3 months? Turnover_Valid Staff_Valid   
##  Min.   :1                                  Mode:logical   Mode:logical  
##  1st Qu.:1                                  TRUE:664       TRUE:664      
##  Median :1                                                               
##  Mean   :1                                                               
##  3rd Qu.:1                                                               
##  Max.   :1                                                               
##                                                                          
##  Profit_Loss_Valid Intent_Valid  
##  Mode:logical      Mode:logical  
##  TRUE:664          TRUE:664      
##                                  
##                                  
##                                  
##                                  
## 

Lead Scoring Model:

auto_data_qual <- auto_data_qual %>%
  mutate(
    Lead_Score = (0.4 * `Turnover (£)` / max(`Turnover (£)`, na.rm = TRUE)) +
                 (0.3 * `No of staff` / max(`No of staff`, na.rm = TRUE)) +
                 (0.2 * `Profit/Loss as % of Turnover` / max(`Profit/Loss as % of Turnover`, na.rm = TRUE)) +
                 (0.1 * as.numeric(`Displaying 1st party intent last 3 months?`))
  ) %>%
  arrange(desc(Lead_Score))

Automated Lead Tier Assignment – Quality-Checked Scoring by Lead Score Thresholds (Customer Status Agnostic)

auto_data_qual <- auto_data_qual %>%
  mutate(
    Lead_Tier = case_when(
      Lead_Score >= 0.7 ~ "High",
      Lead_Score >= 0.4 ~ "Medium",
      TRUE ~ "Low"
    )
  )

top_20 <- auto_data_qual %>%
  arrange(desc(Lead_Score)) %>%  
  slice_head(n = 20)             

print(top_20)
## # A tibble: 20 × 17
##    CompanyURN `company name`   Country `Turnover (£)` `No of staff` `Sub sector`
##         <dbl> <chr>            <chr>            <dbl>         <dbl> <chr>       
##  1      19196 BP p.l.c.        United…        141450.          3805 Oil & Gas-E…
##  2      95782 Compass Group P… United…         22568         588112 Business Se…
##  3       8956 HSBC Holdings p… United…         60408         246933 Finance-Ban…
##  4      99129 Avis Budget EME… United…          1329.          5163 Consumer Se…
##  5      11258 Aviva plc        United…         55292          29653 Insurance-G…
##  6      15774 Barclays Bank p… United…         29754         129400 Finance-Con…
##  7     343548 ASDA Group Limi… United…         21666.        161451 Insurance-M…
##  8      14254 ASDA Stores Lim… United…         21408.        157207 Retail-Opti…
##  9     133780 Associated Brit… United…         15357         132590 Manufacture…
## 10      19271 BP Internationa… United…         33418.          3805 Oil & Gas-/…
## 11      10062 Centrica plc     United…         27102          38278 Utilities-G…
## 12      20118 BHP Billiton plc United…         28829.         26146 Constructio…
## 13      19189 Anglo American … United…         16502.         80000 Constructio…
## 14     155889 Abacus Group Li… United…           287           1100 Wholesale-E…
## 15      16217 British America… United…         13140          87577 Manufacture…
## 16      96592 AstraZeneca plc  United…         18271          60100 Manufacture…
## 17      21504 Churchill Insur… United…          1207.          8500 Insurance-M…
## 18      20278 HSBC Bank plc    United…         18994          55346 Finance-Ban…
## 19      24853 ABI SAB Group H… United…         14949          70089 Manufacture…
## 20      13190 FirstGroup plc   United…          5653.        100891 Transport-P…
## # ℹ 11 more variables: `Profit/Loss as % of Turnover` <dbl>,
## #   `Year founded` <dbl>, `No of Sites` <dbl>, `Customer?` <dbl>,
## #   `Displaying 1st party intent last 3 months?` <dbl>, Turnover_Valid <lgl>,
## #   Staff_Valid <lgl>, Profit_Loss_Valid <lgl>, Intent_Valid <lgl>,
## #   Lead_Score <dbl>, Lead_Tier <chr>

Lead Summary by Tier:

new_lead_summary <- auto_data_qual %>%
  group_by(Lead_Tier) %>%
  summarise(
    Avg_Turnover = mean(`Turnover (£)`, na.rm = TRUE),
    Avg_Staff = mean(`No of staff`, na.rm = TRUE),
    Avg_Profit_Margin = mean(`Profit/Loss as % of Turnover`, na.rm = TRUE),
    Intent_Rate = mean(as.numeric(`Displaying 1st party intent last 3 months?`), na.rm = TRUE),
    Companies_in_Tier = paste(unique(`company name`), collapse = ", ")  
  )

#Lead Tier Visualization:

ggplot(auto_data_qual, aes(x = Lead_Tier, fill = Lead_Tier)) + 
  geom_bar() + 
  labs(title = "Lead Tiers by Count", x = "Lead Tier", y = "Count") +
  scale_fill_manual(values = c("High" = "green", "Medium" = "orange", "Low" = "red")) +
  geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5)

MANUAL SELECTION

Converting Binary data to factors

tmp <- tmp %>%
  mutate(`Customer?` = factor(`Customer?`, levels = c(0, 1), labels = c("No", "Yes"), exclude = NULL),
         `Displaying 1st party intent last 3 months?` = factor(`Displaying 1st party intent last 3 months?`, 
                                                               levels = c(0, 1), labels = c("No", "Yes"), exclude = NULL))

Handling missing values

tmp$`Customer?`[is.na(tmp$`Customer?`)] <- "No"
tmp$`Displaying 1st party intent last 3 months?`[is.na(tmp$`Displaying 1st party intent last 3 months?`)] <- "No"

Conditioning sub sector

tmp <- tmp %>%
  mutate(`Sub sector` = ifelse(is.na(`Sub sector`) & grepl("Trust", `company name`), "Education-Nonprofit", `Sub sector`))

The P&L had scientific notation which made it unreadable hence….

tmp <- tmp %>%
  mutate(`Profit/Loss as % of Turnover` = round(`Profit/Loss as % of Turnover`, 1))

Checking missing values (just to be sure the data is ready)

colSums(is.na(tmp))
##                                 CompanyURN 
##                                          0 
##                               company name 
##                                          0 
##                                    Country 
##                                          0 
##                               Turnover (£) 
##                                          0 
##                                No of staff 
##                                          0 
##                                 Sub sector 
##                                          0 
##               Profit/Loss as % of Turnover 
##                                          0 
##                               Year founded 
##                                          0 
##                                No of Sites 
##                                          0 
##                                  Customer? 
##                                          0 
## Displaying 1st party intent last 3 months? 
##                                          0

View the dataset

summary(tmp)
##    CompanyURN     company name         Country           Turnover (£)     
##  Min.   :    41   Length:1788        Length:1788        Min.   :     0.3  
##  1st Qu.: 19093   Class :character   Class :character   1st Qu.:   160.1  
##  Median : 59731   Mode  :character   Mode  :character   Median :   331.4  
##  Mean   :101434                                         Mean   :  1763.3  
##  3rd Qu.:139071                                         3rd Qu.:   817.1  
##  Max.   :475789                                         Max.   :180074.0  
##   No of staff       Sub sector        Profit/Loss as % of Turnover
##  Min.   :   1000   Length:1788        Min.   :-296.2000           
##  1st Qu.:   1554   Class :character   1st Qu.:   0.0000           
##  Median :   2592   Mode  :character   Median :   0.0000           
##  Mean   :   9694                      Mean   :  -0.2751           
##  3rd Qu.:   5245                      3rd Qu.:   0.1000           
##  Max.   :4052806                      Max.   : 105.0000           
##   Year founded   No of Sites      Customer? 
##  Min.   :1770   Min.   :   1.00   No :1299  
##  1st Qu.:1966   1st Qu.:   3.00   Yes: 489  
##  Median :1988   Median :   9.00             
##  Mean   :1976   Mean   :  68.34             
##  3rd Qu.:2000   3rd Qu.:  34.00             
##  Max.   :2017   Max.   :5253.00             
##  Displaying 1st party intent last 3 months?
##  No :1123                                  
##  Yes: 665                                  
##                                            
##                                            
##                                            
## 

Examine - Current state of the business

ggplot(tmp, aes(x = factor(`Customer?`))) +
  geom_bar(fill = "#F64668", color = "black") +
  labs(
    title = "Existing Customers vs Potential Leads",
    x = "Customer Status",
    y = "Count"
  ) +
  scale_x_discrete(labels = c("No", "Yes")) +

  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 5, color = "black") +
  
  annotate("text", x = 2.5, y = -max(table(tmp$`Customer?`)) * 0.1, 
           label = paste("Sample size:", nrow(tmp)), color = "black", size = 4, hjust = 1, vjust = 0) +

  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

ggplot(tmp, aes(x = factor(`Displaying 1st party intent last 3 months?`))) +
  geom_bar(fill = "#F64668", color = "black") +
  labs(
    title = "Displaying 1st Party Intent last 3 months",
    x = "Displaying Intent",
    y = "Count"
  ) +
  scale_x_discrete(labels = c("No", "Yes")) +
 
  geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 5, color = "black") +
 
  annotate("text", x = 2.5, y = -max(table(tmp$`Displaying 1st party intent last 3 months?`)) * 0.1,
           label = paste("Sample size:", nrow(tmp)), color = "black", size = 4, hjust = 1, vjust = 0) +

  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold"),
    axis.title = element_text(size = 12),
    axis.text = element_text(size = 10)
  )

Descriptive Analysis - Understanding New Leads

tmp_newlead <- tmp %>% filter(`Customer?` == "No")
describe(tmp_newlead)
##                                             vars    n      mean        sd
## CompanyURN                                     1 1299 118481.11 120996.51
## company name*                                  2 1299    650.00    375.13
## Country*                                       3 1299      1.00      0.00
## Turnover (£)                                   4 1299   1290.70   7144.27
## No of staff                                    5 1299   6759.58  26063.03
## Sub sector*                                    6 1299    312.26    169.46
## Profit/Loss as % of Turnover                   7 1299     -0.38     11.96
## Year founded                                   8 1299   1978.14     34.90
## No of Sites                                    9 1299     60.89    208.06
## Customer?*                                    10 1299      1.00      0.00
## Displaying 1st party intent last 3 months?*   11 1299      1.33      0.47
##                                             median  trimmed       mad    min
## CompanyURN                                   95094 93275.67 102678.95   41.0
## company name*                                  650   650.00    481.84    1.0
## Country*                                         1     1.00      0.00    1.0
## Turnover (£)                                   294   431.70    282.73    0.3
## No of staff                                   2508  3362.01   1820.63 1000.0
## Sub sector*                                    357   317.01    166.05    1.0
## Profit/Loss as % of Turnover                     0     0.03      0.00 -296.2
## Year founded                                  1989  1984.68     22.24 1777.0
## No of Sites                                      8    21.39     10.38    1.0
## Customer?*                                       1     1.00      0.00    1.0
## Displaying 1st party intent last 3 months?*      1     1.29      0.00    1.0
##                                                max    range   skew kurtosis
## CompanyURN                                  475789 475748.0   1.60     1.75
## company name*                                 1299   1298.0   0.00    -1.20
## Country*                                         1      0.0    NaN      NaN
## Turnover (£)                                180074 180073.7  17.34   367.33
## No of staff                                 592897 591897.0  18.02   387.76
## Sub sector*                                    605    604.0  -0.34    -1.17
## Profit/Loss as % of Turnover                   105    401.2 -16.09   370.60
## Year founded                                  2017    240.0  -1.96     4.30
## No of Sites                                   5253   5252.0  13.99   306.83
## Customer?*                                       1      0.0    NaN      NaN
## Displaying 1st party intent last 3 months?*      2      1.0   0.72    -1.48
##                                                  se
## CompanyURN                                  3357.13
## company name*                                 10.41
## Country*                                       0.00
## Turnover (£)                                 198.22
## No of staff                                  723.14
## Sub sector*                                    4.70
## Profit/Loss as % of Turnover                   0.33
## Year founded                                   0.97
## No of Sites                                    5.77
## Customer?*                                     0.00
## Displaying 1st party intent last 3 months?*    0.01

Define workforce size categories

tmp_newlead <- tmp_newlead %>%
  mutate(Staff_Size_Category = case_when(
    `No of staff` <= 2500 ~ "Small",
    `No of staff` <= 10000 ~ "Medium",
    TRUE ~ "Large"
  ))

Plot Turnover vs. Staff Size, segmented by Workforce Size

ggplot(tmp_newlead, aes(x = `Turnover (£)`, y = `No of staff`, color = Staff_Size_Category)) +
  geom_point(alpha = 0.5) + 
  labs(
    title = "Turnover vs. Staff Size by Workforce Category", 
    x = "Turnover (£)", 
    y = "Number of Staff", 
    color = "Workforce Size Category"
  ) +
  scale_y_continuous(labels = scales::comma) + 
  theme_minimal() +
  scale_color_manual(values = c("blue", "green", "red")) +
  theme(
    panel.grid.major = element_line(color = "gray", linewidth = 0.5), 
    panel.grid.minor = element_blank(),
    axis.text.x = element_text(angle = 45, hjust = 1)
  )

tmp_newlead_output <- tmp_newlead %>%
  select(`Turnover (£)`, `No of staff`, Staff_Size_Category)
print(tmp_newlead_output)
## # A tibble: 1,299 × 3
##    `Turnover (£)` `No of staff` Staff_Size_Category
##             <dbl>         <dbl> <chr>              
##  1          2149.         13002 Large              
##  2           787.          4002 Medium             
##  3           990.          4256 Medium             
##  4          1023.          7658 Medium             
##  5           518.          2474 Small              
##  6           937.         13501 Large              
##  7           838.          2881 Medium             
##  8          2916.          3901 Medium             
##  9           633.          9723 Medium             
## 10          1992.         11479 Large              
## # ℹ 1,289 more rows
head(tmp_newlead_output)
## # A tibble: 6 × 3
##   `Turnover (£)` `No of staff` Staff_Size_Category
##            <dbl>         <dbl> <chr>              
## 1          2149.         13002 Large              
## 2           787.          4002 Medium             
## 3           990.          4256 Medium             
## 4          1023.          7658 Medium             
## 5           518.          2474 Small              
## 6           937.         13501 Large
tmp_newlead_output %>%
  group_by(Staff_Size_Category) %>%
  summarise(
    Mean_Turnover = mean(`Turnover (£)`),
    Median_Turnover = median(`Turnover (£)`),
    Min_Turnover = min(`Turnover (£)`),
    Max_Turnover = max(`Turnover (£)`),
    Mean_Staff_Size = mean(`No of staff`),
    Median_Staff_Size = median(`No of staff`),
    Min_Staff_Size = min(`No of staff`),
    Max_Staff_Size = max(`No of staff`)
  )
## # A tibble: 3 × 9
##   Staff_Size_Category Mean_Turnover Median_Turnover Min_Turnover Max_Turnover
##   <chr>                       <dbl>           <dbl>        <dbl>        <dbl>
## 1 Large                       5949.           2010.        280.       180074 
## 2 Medium                       877.            425.         16.2       77969 
## 3 Small                        618.            174.          0.3      115312.
## # ℹ 4 more variables: Mean_Staff_Size <dbl>, Median_Staff_Size <dbl>,
## #   Min_Staff_Size <dbl>, Max_Staff_Size <dbl>

Summary

summary(tmp_newlead)
##    CompanyURN     company name         Country           Turnover (£)     
##  Min.   :    41   Length:1299        Length:1299        Min.   :     0.3  
##  1st Qu.: 24804   Class :character   Class :character   1st Qu.:   143.3  
##  Median : 95094   Mode  :character   Mode  :character   Median :   294.0  
##  Mean   :118481                                         Mean   :  1290.7  
##  3rd Qu.:155926                                         3rd Qu.:   711.9  
##  Max.   :475789                                         Max.   :180074.0  
##   No of staff      Sub sector        Profit/Loss as % of Turnover
##  Min.   :  1000   Length:1299        Min.   :-296.2000           
##  1st Qu.:  1520   Class :character   1st Qu.:   0.0000           
##  Median :  2508   Mode  :character   Median :   0.0000           
##  Mean   :  6760                      Mean   :  -0.3757           
##  3rd Qu.:  5086                      3rd Qu.:   0.1000           
##  Max.   :592897                      Max.   : 105.0000           
##   Year founded   No of Sites      Customer? 
##  Min.   :1777   Min.   :   1.00   No :1299  
##  1st Qu.:1969   1st Qu.:   3.00   Yes:   0  
##  Median :1989   Median :   8.00             
##  Mean   :1978   Mean   :  60.89             
##  3rd Qu.:2000   3rd Qu.:  33.50             
##  Max.   :2017   Max.   :5253.00             
##  Displaying 1st party intent last 3 months? Staff_Size_Category
##  No :871                                    Length:1299        
##  Yes:428                                    Class :character   
##                                             Mode  :character   
##                                                                
##                                                                
## 

Profitability Analysis - top 10 companies based on turnover and staff size

top_10_profitable_companies <- tmp_newlead %>%
  arrange(desc(`Turnover (£)`), desc(`No of staff`)) %>%
  head(10)
top_10_profitable_companies %>%
  select(CompanyURN, `company name`, `Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`) %>%
  print()
## # A tibble: 10 × 5
##    CompanyURN `company name` `Turnover (£)` `No of staff` Profit/Loss as % of …¹
##         <dbl> <chr>                   <dbl>         <dbl>                  <dbl>
##  1      23524 Department fo…        180074          79224                    0  
##  2      12347 Department of…        115312.          1887                  -12.3
##  3     150720 Legal & Gener…         77969           8939                    0.3
##  4     389165 Education and…         60302           1919                    0  
##  5      17556 HM Revenue & …         56627.         59857                 -205. 
##  6     160582 Lloyds Bankin…         48313          80418                    0.1
##  7      24183 Rio Tinto plc          25126.         54938                    0.2
##  8      96091 J Sainsbury p…         23506         108300                    0  
##  9      95782 Compass Group…         22568         588112                    0.1
## 10     343548 ASDA Group Li…         21666.        161451                    0  
## # ℹ abbreviated name: ¹​`Profit/Loss as % of Turnover`
write_xlsx(top_10_profitable_companies, "top_10_profitable_companies.xlsx")
tmp <- tmp %>%
  mutate(`Profit/Loss as % of Turnover` = round(`Profit/Loss as % of Turnover` * 100, 2))
ggplot(top_10_profitable_companies, aes(x = reorder(`company name`, `Turnover (£)`), 
                                       y = `Turnover (£)`, fill = `Profit/Loss as % of Turnover`)) +
  geom_bar(stat = "identity", show.legend = TRUE, alpha = 0.8) +
  geom_text(aes(label = scales::comma(`Turnover (£)`), y = `Turnover (£)` + 5000), 
            size = 4, color = "black", fontface = "bold") +
  labs(
    title = "Top 10 Profitable Companies",
    subtitle = "Turnover and Profit Margin Comparison for Top 10 Companies\n(Original dataset: n = 1788)",
    x = "Company Name",
    y = "Turnover (£)",
    fill = "Profit Margin (%)"
) +
  scale_fill_gradient(low = "red", high = "green") +
  coord_flip() +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    plot.subtitle = element_text(size = 14),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    panel.grid = element_blank(),
    legend.position = "top"
  )

Intent-Based Prioritization

top_10_intent_based <- tmp_newlead %>%
  filter(`Displaying 1st party intent last 3 months?` == "Yes") %>%
  arrange(desc(`Turnover (£)`), desc(`No of staff`)) %>%
  head(10)
top_10_intent_based %>%
  select(CompanyURN, `company name`, `Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`) %>%
  print()
## # A tibble: 10 × 5
##    CompanyURN `company name` `Turnover (£)` `No of staff` Profit/Loss as % of …¹
##         <dbl> <chr>                   <dbl>         <dbl>                  <dbl>
##  1      95782 Compass Group…         22568         588112                    0.1
##  2     343548 ASDA Group Li…         21666.        161451                    0  
##  3      14254 ASDA Stores L…         21408.        157207                    0  
##  4      20278 HSBC Bank plc          18994          55346                    0.1
##  5      23594 Department fo…         18380          19227                    0  
##  6      19189 Anglo America…         16502.         80000                    0.2
##  7      98901 Bank of Scotl…         11091          35752                    0.2
##  8      99183 Dixons Carpho…         10585          45461                    0  
##  9     435309 Cisco Interna…          9170.          3303                    0  
## 10     409120 Aon plc                 9005.         69316                    0  
## # ℹ abbreviated name: ¹​`Profit/Loss as % of Turnover`
write_xlsx(top_10_intent_based, "top_10_intent_based.xlsx")
ggplot(top_10_intent_based, aes(x = reorder(`company name`, `Turnover (£)`), 
                                y = `Turnover (£)`, fill = `Profit/Loss as % of Turnover`)) +
  geom_bar(stat = "identity", show.legend = TRUE, alpha = 0.8) +
  geom_text(aes(label = scales::comma(`Turnover (£)`), y = `Turnover (£)` + 5000), 
            size = 4, color = "black", fontface = "bold") +
  labs(
    title = "Top 10 Intent-Based Companies",
    subtitle = "Prioritized by Turnover and Profit Margin\n(Original dataset: n = 1788)",
    x = "Company Name",
    y = "Turnover (£)",
    fill = "Profit Margin (%)"
) +
  scale_fill_gradient(low = "red", high = "green") +
  coord_flip() +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, face = "bold"),
    plot.subtitle = element_text(size = 14),
    axis.text = element_text(size = 10),
    axis.title = element_text(size = 12),
    panel.grid = element_blank(),
    legend.position = "top"
  )

Cluster Analysis

cluster_data <- tmp_newlead %>%
  select(`Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`) %>%
  mutate(across(everything(), scale))  
set.seed(123)  
kmeans_result <- kmeans(cluster_data, centers = 4)
tmp_newlead$Cluster <- kmeans_result$cluster  

cluster_summary <- tmp_newlead %>%
  group_by(Cluster) %>%
  summarise(
    Avg_Turnover = mean(`Turnover (£)`, na.rm = TRUE),
    Avg_Staff_Size = mean(`No of staff`, na.rm = TRUE),
    Avg_Profit_Margin = mean(`Profit/Loss as % of Turnover`, na.rm = TRUE),
    Count = n()  
  )

print(cluster_summary)
## # A tibble: 4 × 5
##   Cluster Avg_Turnover Avg_Staff_Size Avg_Profit_Margin Count
##     <int>        <dbl>          <dbl>             <dbl> <int>
## 1       1       23462.         80106.            0.0556    18
## 2       2       15079         590504.            0.05       2
## 3       3       90142.         35613          -128.         4
## 4       4         677.          4718.            0.0188  1275
ggplot(tmp_newlead, aes(x = `Turnover (£)`, y = `No of staff`, color = as.factor(Cluster))) +
  geom_point(alpha = 0.6) +  
  labs(
    title = "Clustering of New Leads by Turnover and Staff Size",
    subtitle = "Clusters are based on company revenue, staff size, and profitability",
    x = "Turnover (£)",
    y = "Number of Staff",
    color = "Cluster"
  ) +
  scale_y_continuous(labels = scales::comma) +  
  theme_minimal()

clustered_companies <- tmp_newlead %>%
  select(`company name`, `Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`, Cluster) %>%
  arrange(Cluster)  

print(clustered_companies)
## # A tibble: 1,299 × 5
##    `company name`    `Turnover (£)` `No of staff` Profit/Loss as % of …¹ Cluster
##    <chr>                      <dbl>         <dbl>                  <dbl>   <int>
##  1 J Sainsbury plc           23506         108300                    0         1
##  2 Wittington Inves…         15526         133913                    0.1       1
##  3 ASDA Stores Limi…         21408.        157207                    0         1
##  4 TUI Travel Limit…         14619          57389                   -0.1       1
##  5 Standard Charter…         15270.         84916                    0.1       1
##  6 HSBC Bank plc             18994          55346                    0.1       1
##  7 Capita plc                 4909.         74755                   -0.1       1
##  8 Marks & Spencer …         10622          84939                    0         1
##  9 Rio Tinto plc             25126.         54938                    0.2       1
## 10 Anglo American p…         16502.         80000                    0.2       1
## # ℹ 1,289 more rows
## # ℹ abbreviated name: ¹​`Profit/Loss as % of Turnover`
cluster_summary_companies <- tmp_newlead %>%
  group_by(Cluster) %>%
  summarise(
    Count = n(),
    Companies = paste(`company name`, collapse = ", ")  
  )

print(cluster_summary_companies)
## # A tibble: 4 × 3
##   Cluster Count Companies                                                       
##     <int> <int> <chr>                                                           
## 1       1    18 J Sainsbury plc, Wittington Investments Limited, ASDA Stores Li…
## 2       2     2 G4S plc, Compass Group PLC                                      
## 3       3     4 Department for Work and Pensions, HM Revenue & Customs, Departm…
## 4       4  1275 BBA Aviation plc, GKN Aerospace Services Limited, Leonardo MW L…

Creating an Explanatory Variable - Conversion Probability

if("converted" %in% names(tmp_newlead)) {
  model <- glm(converted ~ `Turnover (£)` + `No of staff` + `Profit/Loss as % of Turnover` + `Displaying 1st Party Intent Last 3 Months?`,
               data = tmp_newlead, family = binomial)
  
   tmp_newlead$conversion_prob <- predict(model, tmp_newlead, type = "response")
  
   ggplot(tmp_newlead, aes(x = conversion_prob)) +
    geom_histogram(fill = "#F64668", bins = 20, alpha = 0.7) +
    labs(title = "Lead Conversion Probability Distribution", x = "Conversion Probability", y = "Count") +
    theme_minimal()
} else {
  message("Conversion data is not available for probability modeling.")
}
## Conversion data is not available for probability modeling.

Lets Go to the initial Data for Logistic Regression Analysis

tmp$`Customer?` <- ifelse(tmp$`Customer?` == "Yes", 1, 0)
sum(is.na(tmp))
## [1] 0
tmp_lga <- tmp %>%
  mutate(`Displaying 1st party intent last 3 months?` = ifelse(`Displaying 1st party intent last 3 months?` == "Yes", 1, 0))

Convert Sub sector to dummy variables

tmp_lga <- tmp_lga %>%
  mutate(across(`Sub sector`, as.factor)) %>%
  fastDummies::dummy_cols(select_columns = "Sub sector", remove_first_dummy = TRUE)

Scale numeric variables

tmp_lga <- tmp_lga %>%
  mutate(across(c(`Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`), scale))

Logistic regression model

model <- glm(`Customer?` ~ `Turnover (£)` + `No of staff` + `Profit/Loss as % of Turnover` + `Displaying 1st party intent last 3 months?`,
             data = tmp_lga, family = binomial)

summary(model)
## 
## Call:
## glm(formula = `Customer?` ~ `Turnover (£)` + `No of staff` + 
##     `Profit/Loss as % of Turnover` + `Displaying 1st party intent last 3 months?`, 
##     family = binomial, data = tmp_lga)
## 
## Coefficients:
##                                              Estimate Std. Error z value
## (Intercept)                                  -1.24811    0.07210 -17.311
## `Turnover (£)`                                0.19726    0.06370   3.097
## `No of staff`                                 0.12016    0.12026   0.999
## `Profit/Loss as % of Turnover`                0.06202    0.06861   0.904
## `Displaying 1st party intent last 3 months?`  0.65976    0.10865   6.072
##                                              Pr(>|z|)    
## (Intercept)                                   < 2e-16 ***
## `Turnover (£)`                                0.00196 ** 
## `No of staff`                                 0.31769    
## `Profit/Loss as % of Turnover`                0.36599    
## `Displaying 1st party intent last 3 months?` 1.26e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2098.0  on 1787  degrees of freedom
## Residual deviance: 2043.9  on 1783  degrees of freedom
## AIC: 2053.9
## 
## Number of Fisher Scoring iterations: 5
tmp_lga$conversion_prob <- predict(model, tmp_lga, type = "response")
ggplot(tmp_lga, aes(x = conversion_prob)) +
  geom_histogram(fill = "#F64668", bins = 20, alpha = 0.7) +
  labs(title = "Lead Conversion Probability Distribution", x = "Conversion Probability", y = "Count") +
  theme_minimal()

tmp_lga <- tmp_lga %>%
  arrange(desc(conversion_prob))  

top_companies <- tmp_lga %>% head(5)

ggplot(top_companies, aes(x = conversion_prob, y = reorder(`company name`, conversion_prob))) +
  geom_bar(stat = "identity", fill = "#F64668", alpha = 0.7) +
  labs(title = "Top 5 Companies by Lead Conversion Probability", 
       x = "Conversion Probability", 
       y = "Company Name") +
  theme_minimal() +
  theme(axis.text.y = element_text(size = 10)) 

Model Evaluation

tmp_lga$conversion_prob <- predict(model, tmp_lga, type = "response")

ggplot(tmp_lga, aes(x = conversion_prob)) +
  geom_histogram(fill = "#F64668", bins = 20, alpha = 0.7) +
  labs(title = "Lead Conversion Probability Distribution", x = "Conversion Probability", y = "Count") +
  theme_minimal()

roc_curve <- roc(tmp_lga$'Customer?', tmp_lga$conversion_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve)

auc(roc_curve)
## Area under the curve: 0.633
print(roc_curve)
## 
## Call:
## roc.default(response = tmp_lga$"Customer?", predictor = tmp_lga$conversion_prob)
## 
## Data: tmp_lga$conversion_prob in 1299 controls (tmp_lga$"Customer?" 0) < 489 cases (tmp_lga$"Customer?" 1).
## Area under the curve: 0.633
plot(roc_curve)

auc_value <- auc(roc_curve)
print(paste("AUC:", auc_value))
## [1] "AUC: 0.632958182399234"
print(roc_curve)
## 
## Call:
## roc.default(response = tmp_lga$"Customer?", predictor = tmp_lga$conversion_prob)
## 
## Data: tmp_lga$conversion_prob in 1299 controls (tmp_lga$"Customer?" 0) < 489 cases (tmp_lga$"Customer?" 1).
## Area under the curve: 0.633