library(readxl) # For reading Excel files
library(skimr) # For quick and concise data summaries
library(dplyr) # For data manipulation
library(tidyr) # For reshaping data
library(ggplot2) # For visualization
library(cluster) # For clustering analysis
library(fastDummies) # For dummy variable creation
library(writexl) # Save dataframe as excel
library(pROC) # For ROC curve and AUC in classification
library(psych) # Summarize the dataset "data"
library(ggrepel) # Position label to avoid overlap
tmp <- read_excel("Business_Universe_Data_Assessment_2024.xlsx")
skim(tmp)
| Name | tmp |
| Number of rows | 1788 |
| Number of columns | 11 |
| _______________________ | |
| Column type frequency: | |
| character | 3 |
| numeric | 8 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| company name | 0 | 1 | 3 | 73 | 0 | 1788 | 0 |
| Country | 0 | 1 | 14 | 14 | 0 | 1 | 0 |
| Sub sector | 2 | 1 | 7 | 375 | 0 | 822 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| CompanyURN | 0 | 1.00 | 101434.18 | 117950.64 | 41.00 | 19092.75 | 59731.00 | 139070.75 | 475789.00 | ▇▃▁▁▁ |
| Turnover (£) | 0 | 1.00 | 1763.33 | 8488.24 | 0.30 | 160.10 | 331.44 | 817.12 | 180074.00 | ▇▁▁▁▁ |
| No of staff | 0 | 1.00 | 9693.96 | 99387.33 | 1000.00 | 1554.50 | 2592.00 | 5245.00 | 4052806.00 | ▇▁▁▁▁ |
| Profit/Loss as % of Turnover | 0 | 1.00 | -0.27 | 10.23 | -296.23 | 0.00 | 0.02 | 0.08 | 105.04 | ▁▁▁▇▁ |
| Year founded | 0 | 1.00 | 1976.09 | 37.27 | 1770.00 | 1966.00 | 1988.00 | 2000.00 | 2017.00 | ▁▁▁▂▇ |
| No of Sites | 0 | 1.00 | 68.34 | 241.71 | 1.00 | 3.00 | 9.00 | 34.00 | 5253.00 | ▇▁▁▁▁ |
| Customer? | 1299 | 0.27 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▇▁▁ |
| Displaying 1st party intent last 3 months? | 1123 | 0.37 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▇▁▁ |
str(tmp)
## tibble [1,788 × 11] (S3: tbl_df/tbl/data.frame)
## $ CompanyURN : num [1:1788] 24138 25240 149774 26890 9759 ...
## $ company name : chr [1:1788] "BBA Aviation plc" "GKN Aerospace Services Limited" "Leonardo MW Ltd" "Senior plc" ...
## $ Country : chr [1:1788] "United Kingdom" "United Kingdom" "United Kingdom" "United Kingdom" ...
## $ Turnover (£) : num [1:1788] 2149 787 990 1023 14955 ...
## $ No of staff : num [1:1788] 13002 4002 4256 7658 49900 ...
## $ Sub sector : chr [1:1788] "Manufacture-Aerospace," "Manufacture-Aerospace,Manufacture-Defence" "Manufacture-Aerospace,Manufacture-Defence" "Manufacture-Aerospace,Manufacture-Defence,Manufacture-Electronics/Electrical,Manufacture-Engineering" ...
## $ Profit/Loss as % of Turnover : num [1:1788] 0.0598 0.1524 0.0669 0.0589 0.258 ...
## $ Year founded : num [1:1788] 1897 1939 1950 1933 1884 ...
## $ No of Sites : num [1:1788] 6 6 9 3 23 8 25 826 6 79 ...
## $ Customer? : num [1:1788] NA NA NA NA 1 NA 1 NA NA 1 ...
## $ Displaying 1st party intent last 3 months?: num [1:1788] 1 NA NA NA NA NA NA NA 1 1 ...
colSums(is.na(tmp))
## CompanyURN
## 0
## company name
## 0
## Country
## 0
## Turnover (£)
## 0
## No of staff
## 0
## Sub sector
## 2
## Profit/Loss as % of Turnover
## 0
## Year founded
## 0
## No of Sites
## 0
## Customer?
## 1299
## Displaying 1st party intent last 3 months?
## 1123
sum(is.na(tmp))
## [1] 2424
summary_stats <- tmp %>%
summarise(
Min_Turnover = min(`Turnover (£)`, na.rm = TRUE),
Max_Turnover = max(`Turnover (£)`, na.rm = TRUE),
Avg_Turnover = mean(`Turnover (£)`, na.rm = TRUE),
Min_Staff = min(`No of staff`, na.rm = TRUE),
Max_Staff = max(`No of staff`, na.rm = TRUE),
Avg_Staff = mean(`No of staff`, na.rm = TRUE),
Min_Year = min(`Year founded`, na.rm = TRUE),
Max_Year = max(`Year founded`, na.rm = TRUE),
Avg_Year = mean(`Year founded`, na.rm = TRUE)
)
print(summary_stats)
## # A tibble: 1 × 9
## Min_Turnover Max_Turnover Avg_Turnover Min_Staff Max_Staff Avg_Staff Min_Year
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.3 180074 1763. 1000 4052806 9694. 1770
## # ℹ 2 more variables: Max_Year <dbl>, Avg_Year <dbl>
country_plot <- ggplot(tmp, aes(x = Country)) +
geom_bar(fill = "#F64668", alpha = 0.8) +
labs(title = "Country Distribution", x = "Country", y = "Count") +
theme_minimal()
customer_plot <- ggplot(tmp, aes(x = factor(`Customer?`))) +
geom_bar(fill = "#F64668", alpha = 0.7) +
labs(title = "Customer Conversion Status", x = "Conversion Status", y = "Count") +
theme_minimal()
print(country_plot)
print(customer_plot)
-Data Quality Check Summary -Lead Scoring Model -Lead Tier Assignment -Lead Summary by Tier -Lead Tier Visualization
auto_data_qual <- tmp %>%
mutate(
Turnover_Valid = ifelse(`Turnover (£)` > 0, TRUE, FALSE),
Staff_Valid = ifelse(`No of staff` > 0, TRUE, FALSE),
Profit_Loss_Valid = ifelse(`Profit/Loss as % of Turnover` >= -100 & `Profit/Loss as % of Turnover` <= 100, TRUE, FALSE),
Intent_Valid = !is.na(`Displaying 1st party intent last 3 months?`)
) %>%
filter(Turnover_Valid & Staff_Valid & Profit_Loss_Valid & Intent_Valid)
cat("Data Quality Check Summary:\n")
## Data Quality Check Summary:
print(summary(auto_data_qual))
## CompanyURN company name Country Turnover (£)
## Min. : 42 Length:664 Length:664 Min. : 0.3
## 1st Qu.: 19312 Class :character Class :character 1st Qu.: 152.9
## Median : 70123 Mode :character Mode :character Median : 319.1
## Mean :108344 Mean : 1665.5
## 3rd Qu.:146731 3rd Qu.: 781.2
## Max. :475753 Max. :141450.0
##
## No of staff Sub sector Profit/Loss as % of Turnover
## Min. : 1000 Length:664 Min. :-66.12010
## 1st Qu.: 1526 Class :character 1st Qu.: 0.00000
## Median : 2348 Mode :character Median : 0.03013
## Mean : 7580 Mean : 0.21179
## 3rd Qu.: 5022 3rd Qu.: 0.08936
## Max. :588112 Max. : 76.52657
##
## Year founded No of Sites Customer?
## Min. :1770 Min. : 1.00 Min. :1
## 1st Qu.:1968 1st Qu.: 3.00 1st Qu.:1
## Median :1988 Median : 8.50 Median :1
## Mean :1977 Mean : 51.73 Mean :1
## 3rd Qu.:2000 3rd Qu.: 28.00 3rd Qu.:1
## Max. :2017 Max. :1577.00 Max. :1
## NA's :427
## Displaying 1st party intent last 3 months? Turnover_Valid Staff_Valid
## Min. :1 Mode:logical Mode:logical
## 1st Qu.:1 TRUE:664 TRUE:664
## Median :1
## Mean :1
## 3rd Qu.:1
## Max. :1
##
## Profit_Loss_Valid Intent_Valid
## Mode:logical Mode:logical
## TRUE:664 TRUE:664
##
##
##
##
##
auto_data_qual <- auto_data_qual %>%
mutate(
Lead_Score = (0.4 * `Turnover (£)` / max(`Turnover (£)`, na.rm = TRUE)) +
(0.3 * `No of staff` / max(`No of staff`, na.rm = TRUE)) +
(0.2 * `Profit/Loss as % of Turnover` / max(`Profit/Loss as % of Turnover`, na.rm = TRUE)) +
(0.1 * as.numeric(`Displaying 1st party intent last 3 months?`))
) %>%
arrange(desc(Lead_Score))
auto_data_qual <- auto_data_qual %>%
mutate(
Lead_Tier = case_when(
Lead_Score >= 0.7 ~ "High",
Lead_Score >= 0.4 ~ "Medium",
TRUE ~ "Low"
)
)
top_20 <- auto_data_qual %>%
arrange(desc(Lead_Score)) %>%
slice_head(n = 20)
print(top_20)
## # A tibble: 20 × 17
## CompanyURN `company name` Country `Turnover (£)` `No of staff` `Sub sector`
## <dbl> <chr> <chr> <dbl> <dbl> <chr>
## 1 19196 BP p.l.c. United… 141450. 3805 Oil & Gas-E…
## 2 95782 Compass Group P… United… 22568 588112 Business Se…
## 3 8956 HSBC Holdings p… United… 60408 246933 Finance-Ban…
## 4 99129 Avis Budget EME… United… 1329. 5163 Consumer Se…
## 5 11258 Aviva plc United… 55292 29653 Insurance-G…
## 6 15774 Barclays Bank p… United… 29754 129400 Finance-Con…
## 7 343548 ASDA Group Limi… United… 21666. 161451 Insurance-M…
## 8 14254 ASDA Stores Lim… United… 21408. 157207 Retail-Opti…
## 9 133780 Associated Brit… United… 15357 132590 Manufacture…
## 10 19271 BP Internationa… United… 33418. 3805 Oil & Gas-/…
## 11 10062 Centrica plc United… 27102 38278 Utilities-G…
## 12 20118 BHP Billiton plc United… 28829. 26146 Constructio…
## 13 19189 Anglo American … United… 16502. 80000 Constructio…
## 14 155889 Abacus Group Li… United… 287 1100 Wholesale-E…
## 15 16217 British America… United… 13140 87577 Manufacture…
## 16 96592 AstraZeneca plc United… 18271 60100 Manufacture…
## 17 21504 Churchill Insur… United… 1207. 8500 Insurance-M…
## 18 20278 HSBC Bank plc United… 18994 55346 Finance-Ban…
## 19 24853 ABI SAB Group H… United… 14949 70089 Manufacture…
## 20 13190 FirstGroup plc United… 5653. 100891 Transport-P…
## # ℹ 11 more variables: `Profit/Loss as % of Turnover` <dbl>,
## # `Year founded` <dbl>, `No of Sites` <dbl>, `Customer?` <dbl>,
## # `Displaying 1st party intent last 3 months?` <dbl>, Turnover_Valid <lgl>,
## # Staff_Valid <lgl>, Profit_Loss_Valid <lgl>, Intent_Valid <lgl>,
## # Lead_Score <dbl>, Lead_Tier <chr>
new_lead_summary <- auto_data_qual %>%
group_by(Lead_Tier) %>%
summarise(
Avg_Turnover = mean(`Turnover (£)`, na.rm = TRUE),
Avg_Staff = mean(`No of staff`, na.rm = TRUE),
Avg_Profit_Margin = mean(`Profit/Loss as % of Turnover`, na.rm = TRUE),
Intent_Rate = mean(as.numeric(`Displaying 1st party intent last 3 months?`), na.rm = TRUE),
Companies_in_Tier = paste(unique(`company name`), collapse = ", ")
)
#Lead Tier Visualization:
ggplot(auto_data_qual, aes(x = Lead_Tier, fill = Lead_Tier)) +
geom_bar() +
labs(title = "Lead Tiers by Count", x = "Lead Tier", y = "Count") +
scale_fill_manual(values = c("High" = "green", "Medium" = "orange", "Low" = "red")) +
geom_text(stat = 'count', aes(label = after_stat(count)), vjust = -0.5)
tmp <- tmp %>%
mutate(`Customer?` = factor(`Customer?`, levels = c(0, 1), labels = c("No", "Yes"), exclude = NULL),
`Displaying 1st party intent last 3 months?` = factor(`Displaying 1st party intent last 3 months?`,
levels = c(0, 1), labels = c("No", "Yes"), exclude = NULL))
tmp$`Customer?`[is.na(tmp$`Customer?`)] <- "No"
tmp$`Displaying 1st party intent last 3 months?`[is.na(tmp$`Displaying 1st party intent last 3 months?`)] <- "No"
tmp <- tmp %>%
mutate(`Sub sector` = ifelse(is.na(`Sub sector`) & grepl("Trust", `company name`), "Education-Nonprofit", `Sub sector`))
tmp <- tmp %>%
mutate(`Profit/Loss as % of Turnover` = round(`Profit/Loss as % of Turnover`, 1))
colSums(is.na(tmp))
## CompanyURN
## 0
## company name
## 0
## Country
## 0
## Turnover (£)
## 0
## No of staff
## 0
## Sub sector
## 0
## Profit/Loss as % of Turnover
## 0
## Year founded
## 0
## No of Sites
## 0
## Customer?
## 0
## Displaying 1st party intent last 3 months?
## 0
summary(tmp)
## CompanyURN company name Country Turnover (£)
## Min. : 41 Length:1788 Length:1788 Min. : 0.3
## 1st Qu.: 19093 Class :character Class :character 1st Qu.: 160.1
## Median : 59731 Mode :character Mode :character Median : 331.4
## Mean :101434 Mean : 1763.3
## 3rd Qu.:139071 3rd Qu.: 817.1
## Max. :475789 Max. :180074.0
## No of staff Sub sector Profit/Loss as % of Turnover
## Min. : 1000 Length:1788 Min. :-296.2000
## 1st Qu.: 1554 Class :character 1st Qu.: 0.0000
## Median : 2592 Mode :character Median : 0.0000
## Mean : 9694 Mean : -0.2751
## 3rd Qu.: 5245 3rd Qu.: 0.1000
## Max. :4052806 Max. : 105.0000
## Year founded No of Sites Customer?
## Min. :1770 Min. : 1.00 No :1299
## 1st Qu.:1966 1st Qu.: 3.00 Yes: 489
## Median :1988 Median : 9.00
## Mean :1976 Mean : 68.34
## 3rd Qu.:2000 3rd Qu.: 34.00
## Max. :2017 Max. :5253.00
## Displaying 1st party intent last 3 months?
## No :1123
## Yes: 665
##
##
##
##
ggplot(tmp, aes(x = factor(`Customer?`))) +
geom_bar(fill = "#F64668", color = "black") +
labs(
title = "Existing Customers vs Potential Leads",
x = "Customer Status",
y = "Count"
) +
scale_x_discrete(labels = c("No", "Yes")) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 5, color = "black") +
annotate("text", x = 2.5, y = -max(table(tmp$`Customer?`)) * 0.1,
label = paste("Sample size:", nrow(tmp)), color = "black", size = 4, hjust = 1, vjust = 0) +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)
)
ggplot(tmp, aes(x = factor(`Displaying 1st party intent last 3 months?`))) +
geom_bar(fill = "#F64668", color = "black") +
labs(
title = "Displaying 1st Party Intent last 3 months",
x = "Displaying Intent",
y = "Count"
) +
scale_x_discrete(labels = c("No", "Yes")) +
geom_text(stat = "count", aes(label = after_stat(count)), vjust = -0.5, size = 5, color = "black") +
annotate("text", x = 2.5, y = -max(table(tmp$`Displaying 1st party intent last 3 months?`)) * 0.1,
label = paste("Sample size:", nrow(tmp)), color = "black", size = 4, hjust = 1, vjust = 0) +
theme_minimal() +
theme(
plot.title = element_text(size = 16, face = "bold"),
axis.title = element_text(size = 12),
axis.text = element_text(size = 10)
)
tmp_newlead <- tmp %>% filter(`Customer?` == "No")
describe(tmp_newlead)
## vars n mean sd
## CompanyURN 1 1299 118481.11 120996.51
## company name* 2 1299 650.00 375.13
## Country* 3 1299 1.00 0.00
## Turnover (£) 4 1299 1290.70 7144.27
## No of staff 5 1299 6759.58 26063.03
## Sub sector* 6 1299 312.26 169.46
## Profit/Loss as % of Turnover 7 1299 -0.38 11.96
## Year founded 8 1299 1978.14 34.90
## No of Sites 9 1299 60.89 208.06
## Customer?* 10 1299 1.00 0.00
## Displaying 1st party intent last 3 months?* 11 1299 1.33 0.47
## median trimmed mad min
## CompanyURN 95094 93275.67 102678.95 41.0
## company name* 650 650.00 481.84 1.0
## Country* 1 1.00 0.00 1.0
## Turnover (£) 294 431.70 282.73 0.3
## No of staff 2508 3362.01 1820.63 1000.0
## Sub sector* 357 317.01 166.05 1.0
## Profit/Loss as % of Turnover 0 0.03 0.00 -296.2
## Year founded 1989 1984.68 22.24 1777.0
## No of Sites 8 21.39 10.38 1.0
## Customer?* 1 1.00 0.00 1.0
## Displaying 1st party intent last 3 months?* 1 1.29 0.00 1.0
## max range skew kurtosis
## CompanyURN 475789 475748.0 1.60 1.75
## company name* 1299 1298.0 0.00 -1.20
## Country* 1 0.0 NaN NaN
## Turnover (£) 180074 180073.7 17.34 367.33
## No of staff 592897 591897.0 18.02 387.76
## Sub sector* 605 604.0 -0.34 -1.17
## Profit/Loss as % of Turnover 105 401.2 -16.09 370.60
## Year founded 2017 240.0 -1.96 4.30
## No of Sites 5253 5252.0 13.99 306.83
## Customer?* 1 0.0 NaN NaN
## Displaying 1st party intent last 3 months?* 2 1.0 0.72 -1.48
## se
## CompanyURN 3357.13
## company name* 10.41
## Country* 0.00
## Turnover (£) 198.22
## No of staff 723.14
## Sub sector* 4.70
## Profit/Loss as % of Turnover 0.33
## Year founded 0.97
## No of Sites 5.77
## Customer?* 0.00
## Displaying 1st party intent last 3 months?* 0.01
tmp_newlead <- tmp_newlead %>%
mutate(Staff_Size_Category = case_when(
`No of staff` <= 2500 ~ "Small",
`No of staff` <= 10000 ~ "Medium",
TRUE ~ "Large"
))
ggplot(tmp_newlead, aes(x = `Turnover (£)`, y = `No of staff`, color = Staff_Size_Category)) +
geom_point(alpha = 0.5) +
labs(
title = "Turnover vs. Staff Size by Workforce Category",
x = "Turnover (£)",
y = "Number of Staff",
color = "Workforce Size Category"
) +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
scale_color_manual(values = c("blue", "green", "red")) +
theme(
panel.grid.major = element_line(color = "gray", linewidth = 0.5),
panel.grid.minor = element_blank(),
axis.text.x = element_text(angle = 45, hjust = 1)
)
tmp_newlead_output <- tmp_newlead %>%
select(`Turnover (£)`, `No of staff`, Staff_Size_Category)
print(tmp_newlead_output)
## # A tibble: 1,299 × 3
## `Turnover (£)` `No of staff` Staff_Size_Category
## <dbl> <dbl> <chr>
## 1 2149. 13002 Large
## 2 787. 4002 Medium
## 3 990. 4256 Medium
## 4 1023. 7658 Medium
## 5 518. 2474 Small
## 6 937. 13501 Large
## 7 838. 2881 Medium
## 8 2916. 3901 Medium
## 9 633. 9723 Medium
## 10 1992. 11479 Large
## # ℹ 1,289 more rows
head(tmp_newlead_output)
## # A tibble: 6 × 3
## `Turnover (£)` `No of staff` Staff_Size_Category
## <dbl> <dbl> <chr>
## 1 2149. 13002 Large
## 2 787. 4002 Medium
## 3 990. 4256 Medium
## 4 1023. 7658 Medium
## 5 518. 2474 Small
## 6 937. 13501 Large
tmp_newlead_output %>%
group_by(Staff_Size_Category) %>%
summarise(
Mean_Turnover = mean(`Turnover (£)`),
Median_Turnover = median(`Turnover (£)`),
Min_Turnover = min(`Turnover (£)`),
Max_Turnover = max(`Turnover (£)`),
Mean_Staff_Size = mean(`No of staff`),
Median_Staff_Size = median(`No of staff`),
Min_Staff_Size = min(`No of staff`),
Max_Staff_Size = max(`No of staff`)
)
## # A tibble: 3 × 9
## Staff_Size_Category Mean_Turnover Median_Turnover Min_Turnover Max_Turnover
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Large 5949. 2010. 280. 180074
## 2 Medium 877. 425. 16.2 77969
## 3 Small 618. 174. 0.3 115312.
## # ℹ 4 more variables: Mean_Staff_Size <dbl>, Median_Staff_Size <dbl>,
## # Min_Staff_Size <dbl>, Max_Staff_Size <dbl>
summary(tmp_newlead)
## CompanyURN company name Country Turnover (£)
## Min. : 41 Length:1299 Length:1299 Min. : 0.3
## 1st Qu.: 24804 Class :character Class :character 1st Qu.: 143.3
## Median : 95094 Mode :character Mode :character Median : 294.0
## Mean :118481 Mean : 1290.7
## 3rd Qu.:155926 3rd Qu.: 711.9
## Max. :475789 Max. :180074.0
## No of staff Sub sector Profit/Loss as % of Turnover
## Min. : 1000 Length:1299 Min. :-296.2000
## 1st Qu.: 1520 Class :character 1st Qu.: 0.0000
## Median : 2508 Mode :character Median : 0.0000
## Mean : 6760 Mean : -0.3757
## 3rd Qu.: 5086 3rd Qu.: 0.1000
## Max. :592897 Max. : 105.0000
## Year founded No of Sites Customer?
## Min. :1777 Min. : 1.00 No :1299
## 1st Qu.:1969 1st Qu.: 3.00 Yes: 0
## Median :1989 Median : 8.00
## Mean :1978 Mean : 60.89
## 3rd Qu.:2000 3rd Qu.: 33.50
## Max. :2017 Max. :5253.00
## Displaying 1st party intent last 3 months? Staff_Size_Category
## No :871 Length:1299
## Yes:428 Class :character
## Mode :character
##
##
##
top_10_profitable_companies <- tmp_newlead %>%
arrange(desc(`Turnover (£)`), desc(`No of staff`)) %>%
head(10)
top_10_profitable_companies %>%
select(CompanyURN, `company name`, `Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`) %>%
print()
## # A tibble: 10 × 5
## CompanyURN `company name` `Turnover (£)` `No of staff` Profit/Loss as % of …¹
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 23524 Department fo… 180074 79224 0
## 2 12347 Department of… 115312. 1887 -12.3
## 3 150720 Legal & Gener… 77969 8939 0.3
## 4 389165 Education and… 60302 1919 0
## 5 17556 HM Revenue & … 56627. 59857 -205.
## 6 160582 Lloyds Bankin… 48313 80418 0.1
## 7 24183 Rio Tinto plc 25126. 54938 0.2
## 8 96091 J Sainsbury p… 23506 108300 0
## 9 95782 Compass Group… 22568 588112 0.1
## 10 343548 ASDA Group Li… 21666. 161451 0
## # ℹ abbreviated name: ¹`Profit/Loss as % of Turnover`
write_xlsx(top_10_profitable_companies, "top_10_profitable_companies.xlsx")
tmp <- tmp %>%
mutate(`Profit/Loss as % of Turnover` = round(`Profit/Loss as % of Turnover` * 100, 2))
ggplot(top_10_profitable_companies, aes(x = reorder(`company name`, `Turnover (£)`),
y = `Turnover (£)`, fill = `Profit/Loss as % of Turnover`)) +
geom_bar(stat = "identity", show.legend = TRUE, alpha = 0.8) +
geom_text(aes(label = scales::comma(`Turnover (£)`), y = `Turnover (£)` + 5000),
size = 4, color = "black", fontface = "bold") +
labs(
title = "Top 10 Profitable Companies",
subtitle = "Turnover and Profit Margin Comparison for Top 10 Companies\n(Original dataset: n = 1788)",
x = "Company Name",
y = "Turnover (£)",
fill = "Profit Margin (%)"
) +
scale_fill_gradient(low = "red", high = "green") +
coord_flip() +
theme_minimal() +
theme(
plot.title = element_text(size = 18, face = "bold"),
plot.subtitle = element_text(size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
panel.grid = element_blank(),
legend.position = "top"
)
top_10_intent_based <- tmp_newlead %>%
filter(`Displaying 1st party intent last 3 months?` == "Yes") %>%
arrange(desc(`Turnover (£)`), desc(`No of staff`)) %>%
head(10)
top_10_intent_based %>%
select(CompanyURN, `company name`, `Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`) %>%
print()
## # A tibble: 10 × 5
## CompanyURN `company name` `Turnover (£)` `No of staff` Profit/Loss as % of …¹
## <dbl> <chr> <dbl> <dbl> <dbl>
## 1 95782 Compass Group… 22568 588112 0.1
## 2 343548 ASDA Group Li… 21666. 161451 0
## 3 14254 ASDA Stores L… 21408. 157207 0
## 4 20278 HSBC Bank plc 18994 55346 0.1
## 5 23594 Department fo… 18380 19227 0
## 6 19189 Anglo America… 16502. 80000 0.2
## 7 98901 Bank of Scotl… 11091 35752 0.2
## 8 99183 Dixons Carpho… 10585 45461 0
## 9 435309 Cisco Interna… 9170. 3303 0
## 10 409120 Aon plc 9005. 69316 0
## # ℹ abbreviated name: ¹`Profit/Loss as % of Turnover`
write_xlsx(top_10_intent_based, "top_10_intent_based.xlsx")
ggplot(top_10_intent_based, aes(x = reorder(`company name`, `Turnover (£)`),
y = `Turnover (£)`, fill = `Profit/Loss as % of Turnover`)) +
geom_bar(stat = "identity", show.legend = TRUE, alpha = 0.8) +
geom_text(aes(label = scales::comma(`Turnover (£)`), y = `Turnover (£)` + 5000),
size = 4, color = "black", fontface = "bold") +
labs(
title = "Top 10 Intent-Based Companies",
subtitle = "Prioritized by Turnover and Profit Margin\n(Original dataset: n = 1788)",
x = "Company Name",
y = "Turnover (£)",
fill = "Profit Margin (%)"
) +
scale_fill_gradient(low = "red", high = "green") +
coord_flip() +
theme_minimal() +
theme(
plot.title = element_text(size = 18, face = "bold"),
plot.subtitle = element_text(size = 14),
axis.text = element_text(size = 10),
axis.title = element_text(size = 12),
panel.grid = element_blank(),
legend.position = "top"
)
cluster_data <- tmp_newlead %>%
select(`Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`) %>%
mutate(across(everything(), scale))
set.seed(123)
kmeans_result <- kmeans(cluster_data, centers = 4)
tmp_newlead$Cluster <- kmeans_result$cluster
cluster_summary <- tmp_newlead %>%
group_by(Cluster) %>%
summarise(
Avg_Turnover = mean(`Turnover (£)`, na.rm = TRUE),
Avg_Staff_Size = mean(`No of staff`, na.rm = TRUE),
Avg_Profit_Margin = mean(`Profit/Loss as % of Turnover`, na.rm = TRUE),
Count = n()
)
print(cluster_summary)
## # A tibble: 4 × 5
## Cluster Avg_Turnover Avg_Staff_Size Avg_Profit_Margin Count
## <int> <dbl> <dbl> <dbl> <int>
## 1 1 23462. 80106. 0.0556 18
## 2 2 15079 590504. 0.05 2
## 3 3 90142. 35613 -128. 4
## 4 4 677. 4718. 0.0188 1275
ggplot(tmp_newlead, aes(x = `Turnover (£)`, y = `No of staff`, color = as.factor(Cluster))) +
geom_point(alpha = 0.6) +
labs(
title = "Clustering of New Leads by Turnover and Staff Size",
subtitle = "Clusters are based on company revenue, staff size, and profitability",
x = "Turnover (£)",
y = "Number of Staff",
color = "Cluster"
) +
scale_y_continuous(labels = scales::comma) +
theme_minimal()
clustered_companies <- tmp_newlead %>%
select(`company name`, `Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`, Cluster) %>%
arrange(Cluster)
print(clustered_companies)
## # A tibble: 1,299 × 5
## `company name` `Turnover (£)` `No of staff` Profit/Loss as % of …¹ Cluster
## <chr> <dbl> <dbl> <dbl> <int>
## 1 J Sainsbury plc 23506 108300 0 1
## 2 Wittington Inves… 15526 133913 0.1 1
## 3 ASDA Stores Limi… 21408. 157207 0 1
## 4 TUI Travel Limit… 14619 57389 -0.1 1
## 5 Standard Charter… 15270. 84916 0.1 1
## 6 HSBC Bank plc 18994 55346 0.1 1
## 7 Capita plc 4909. 74755 -0.1 1
## 8 Marks & Spencer … 10622 84939 0 1
## 9 Rio Tinto plc 25126. 54938 0.2 1
## 10 Anglo American p… 16502. 80000 0.2 1
## # ℹ 1,289 more rows
## # ℹ abbreviated name: ¹`Profit/Loss as % of Turnover`
cluster_summary_companies <- tmp_newlead %>%
group_by(Cluster) %>%
summarise(
Count = n(),
Companies = paste(`company name`, collapse = ", ")
)
print(cluster_summary_companies)
## # A tibble: 4 × 3
## Cluster Count Companies
## <int> <int> <chr>
## 1 1 18 J Sainsbury plc, Wittington Investments Limited, ASDA Stores Li…
## 2 2 2 G4S plc, Compass Group PLC
## 3 3 4 Department for Work and Pensions, HM Revenue & Customs, Departm…
## 4 4 1275 BBA Aviation plc, GKN Aerospace Services Limited, Leonardo MW L…
if("converted" %in% names(tmp_newlead)) {
model <- glm(converted ~ `Turnover (£)` + `No of staff` + `Profit/Loss as % of Turnover` + `Displaying 1st Party Intent Last 3 Months?`,
data = tmp_newlead, family = binomial)
tmp_newlead$conversion_prob <- predict(model, tmp_newlead, type = "response")
ggplot(tmp_newlead, aes(x = conversion_prob)) +
geom_histogram(fill = "#F64668", bins = 20, alpha = 0.7) +
labs(title = "Lead Conversion Probability Distribution", x = "Conversion Probability", y = "Count") +
theme_minimal()
} else {
message("Conversion data is not available for probability modeling.")
}
## Conversion data is not available for probability modeling.
tmp$`Customer?` <- ifelse(tmp$`Customer?` == "Yes", 1, 0)
sum(is.na(tmp))
## [1] 0
tmp_lga <- tmp %>%
mutate(`Displaying 1st party intent last 3 months?` = ifelse(`Displaying 1st party intent last 3 months?` == "Yes", 1, 0))
tmp_lga <- tmp_lga %>%
mutate(across(`Sub sector`, as.factor)) %>%
fastDummies::dummy_cols(select_columns = "Sub sector", remove_first_dummy = TRUE)
tmp_lga <- tmp_lga %>%
mutate(across(c(`Turnover (£)`, `No of staff`, `Profit/Loss as % of Turnover`), scale))
model <- glm(`Customer?` ~ `Turnover (£)` + `No of staff` + `Profit/Loss as % of Turnover` + `Displaying 1st party intent last 3 months?`,
data = tmp_lga, family = binomial)
summary(model)
##
## Call:
## glm(formula = `Customer?` ~ `Turnover (£)` + `No of staff` +
## `Profit/Loss as % of Turnover` + `Displaying 1st party intent last 3 months?`,
## family = binomial, data = tmp_lga)
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) -1.24811 0.07210 -17.311
## `Turnover (£)` 0.19726 0.06370 3.097
## `No of staff` 0.12016 0.12026 0.999
## `Profit/Loss as % of Turnover` 0.06202 0.06861 0.904
## `Displaying 1st party intent last 3 months?` 0.65976 0.10865 6.072
## Pr(>|z|)
## (Intercept) < 2e-16 ***
## `Turnover (£)` 0.00196 **
## `No of staff` 0.31769
## `Profit/Loss as % of Turnover` 0.36599
## `Displaying 1st party intent last 3 months?` 1.26e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2098.0 on 1787 degrees of freedom
## Residual deviance: 2043.9 on 1783 degrees of freedom
## AIC: 2053.9
##
## Number of Fisher Scoring iterations: 5
tmp_lga$conversion_prob <- predict(model, tmp_lga, type = "response")
ggplot(tmp_lga, aes(x = conversion_prob)) +
geom_histogram(fill = "#F64668", bins = 20, alpha = 0.7) +
labs(title = "Lead Conversion Probability Distribution", x = "Conversion Probability", y = "Count") +
theme_minimal()
tmp_lga <- tmp_lga %>%
arrange(desc(conversion_prob))
top_companies <- tmp_lga %>% head(5)
ggplot(top_companies, aes(x = conversion_prob, y = reorder(`company name`, conversion_prob))) +
geom_bar(stat = "identity", fill = "#F64668", alpha = 0.7) +
labs(title = "Top 5 Companies by Lead Conversion Probability",
x = "Conversion Probability",
y = "Company Name") +
theme_minimal() +
theme(axis.text.y = element_text(size = 10))
tmp_lga$conversion_prob <- predict(model, tmp_lga, type = "response")
ggplot(tmp_lga, aes(x = conversion_prob)) +
geom_histogram(fill = "#F64668", bins = 20, alpha = 0.7) +
labs(title = "Lead Conversion Probability Distribution", x = "Conversion Probability", y = "Count") +
theme_minimal()
roc_curve <- roc(tmp_lga$'Customer?', tmp_lga$conversion_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(roc_curve)
auc(roc_curve)
## Area under the curve: 0.633
print(roc_curve)
##
## Call:
## roc.default(response = tmp_lga$"Customer?", predictor = tmp_lga$conversion_prob)
##
## Data: tmp_lga$conversion_prob in 1299 controls (tmp_lga$"Customer?" 0) < 489 cases (tmp_lga$"Customer?" 1).
## Area under the curve: 0.633
plot(roc_curve)
auc_value <- auc(roc_curve)
print(paste("AUC:", auc_value))
## [1] "AUC: 0.632958182399234"
print(roc_curve)
##
## Call:
## roc.default(response = tmp_lga$"Customer?", predictor = tmp_lga$conversion_prob)
##
## Data: tmp_lga$conversion_prob in 1299 controls (tmp_lga$"Customer?" 0) < 489 cases (tmp_lga$"Customer?" 1).
## Area under the curve: 0.633