# load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)
# load customer dataset
customers <- read.csv("/cloud/project/Employees/customer_engagement.csv")
colnames(customers)
## [1] "Employee.ID" "Month"
## [3] "Total.Customer.Calls" "Total.Customer.Meetings"
## [5] "Email.Outreach" "Follow.up.Engagements"
## [7] "Customer.Feedback.Score"
# load employee dataset
employees <- read.csv("/cloud/project/Employees/employee_details.csv")
colnames(employees)
## [1] "Employee.ID" "Employee.Name" "Gender" "Age"
## [5] "Region.Location" "Sales.Team" "Position.Role" "Salary...."
## [9] "Tenure..Years."
# load sales dataset
sales <- read.csv("/cloud/project/Employees/sales_performance.csv")
colnames(sales)
## [1] "Employee.ID" "Month"
## [3] "Total.Sales...." "Sales.Target...."
## [5] "X..of.Sales.Target.Achieved" "Total.Units.Sold"
## [7] "New.Customers.Acquired" "Average.Deal.Size...."
## [9] "Largest.Deal.Closed...."
# Do employees with longer tenures tend to have higher total sales or larger deals closed compared to newer employees?
# join the employees and sales tables
employees_sales_table <- left_join(employees,sales,by='Employee.ID')
# create tenure category
tenure_group <- cut(employees_sales_table$Tenure..Years.,
breaks=c(1,4,8,12,16,20),
labels=c("1-4","4.1-8","8.1-12","12.1-16","16.1-20"),
include.lowest='TRUE')
# create a tenure category column
employees_sales_table <- employees_sales_table %>%
mutate (Tenure_Category = tenure_group)
# Establish relationship
employee_tenure <- employees_sales_table %>%
group_by (Tenure_Category)%>%
summarise(total_sales=sum(Total.Sales....),
total_deals=sum(Largest.Deal.Closed....)
,.groups='drop')
# Print results
print(employee_tenure)
## # A tibble: 5 × 3
## Tenure_Category total_sales total_deals
## <fct> <int> <int>
## 1 1-4 5617910 1430003
## 2 4.1-8 5023130 1451952
## 3 8.1-12 7708116 1816503
## 4 12.1-16 5066087 1482638
## 5 16.1-20 6609907 1701676
# Visualize results
ggplot(data=employee_tenure,mapping=aes(x=Tenure_Category,y=total_sales,fill=total_deals))+
geom_col()

# Correlation between total number of customer calls and customer feedback scores?
cor_score <- cor(customers$Total.Customer.Calls,customers$Customer.Feedback.Score)
print(cor_score)
## [1] 0.002517845
# What is the frequency of salary distribution
ggplot(data=employees, mapping=aes(x=Salary....))+
geom_histogram(color='lightblue')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Print results
print(employees)
## Employee.ID Employee.Name Gender Age Region.Location Sales.Team
## 1 E001 Ava Anderson Female 31 Texas North
## 2 E002 William Hall Male 35 Texas West
## 3 E003 Oliver Brown Male 55 Texas North
## 4 E004 John Smith Female 23 Florida South
## 5 E005 Henry Martinez Female 40 California Central
## 6 E006 Benjamin Young Female 48 New York East
## 7 E007 Mia Jackson Male 56 Pennsylvania South
## 8 E008 Isabella Miller Female 30 Michigan West
## 9 E009 Evelyn King Female 52 Texas North
## 10 E010 Amelia Garcia Male 44 Georgia West
## 11 E011 Harper Lewis Male 48 Illinois North
## 12 E012 David Wright Male 50 Pennsylvania Central
## 13 E013 Harper Lewis Male 23 Texas East
## 14 E014 John Smith Female 40 Florida Central
## 15 E015 Amelia Garcia Male 34 North Carolina Central
## 16 E016 Amelia Garcia Male 26 New York East
## 17 E017 Sebastian Scott Female 23 Pennsylvania West
## 18 E018 Liam Wilson Male 36 Pennsylvania East
## 19 E019 Evelyn King Male 55 Texas North
## 20 E020 Charlotte Martin Female 43 Pennsylvania North
## 21 E021 Isabella Miller Female 48 Georgia Central
## 22 E022 Lucas Harris Female 59 North Carolina South
## 23 E023 William Hall Female 43 Texas North
## 24 E024 Amelia Garcia Female 42 North Carolina Central
## 25 E025 Evelyn King Male 53 Illinois South
## 26 E026 William Hall Male 59 California East
## 27 E027 John Smith Female 55 New York South
## 28 E028 William Hall Female 57 Ohio West
## 29 E029 Oliver Brown Female 50 Georgia Central
## 30 E030 Charlotte Martin Female 51 Texas Central
## 31 E031 Evelyn King Male 41 Georgia Central
## 32 E032 James White Female 58 Pennsylvania North
## 33 E033 Emily Robinson Female 30 Texas West
## 34 E034 Emma Johnson Female 41 New York South
## 35 E035 Mia Jackson Female 32 North Carolina East
## 36 E036 Elijah Thompson Male 38 North Carolina South
## 37 E037 Sebastian Scott Male 36 Pennsylvania West
## 38 E038 Ethan Thomas Female 52 Illinois South
## 39 E039 Evelyn King Male 45 Georgia West
## 40 E040 Michael Walker Male 40 Illinois North
## 41 E041 Alexander Clark Female 25 Michigan West
## 42 E042 Harper Lewis Male 31 Texas West
## 43 E043 Mason Taylor Female 44 Michigan West
## 44 E044 Benjamin Young Male 40 Georgia West
## 45 E045 Oliver Brown Male 51 Pennsylvania South
## 46 E046 Michael Walker Male 32 California Central
## 47 E047 James White Female 32 Georgia North
## 48 E048 David Wright Male 56 New York East
## 49 E049 Emma Johnson Male 40 Texas South
## 50 E050 Emily Robinson Female 33 North Carolina South
## Position.Role Salary.... Tenure..Years.
## 1 Team Leader 94677 17.6
## 2 Sales Executive 63510 16.7
## 3 Account Manager 104472 14.0
## 4 Team Leader 72079 6.7
## 5 Team Leader 43728 14.0
## 6 Account Manager 55644 14.1
## 7 Regional Manager 76335 2.8
## 8 Regional Manager 104625 12.0
## 9 Sales Representative 48745 16.6
## 10 Account Manager 118497 1.2
## 11 Account Manager 87416 10.8
## 12 Sales Executive 116161 8.1
## 13 Team Leader 102978 10.2
## 14 Regional Manager 114130 7.5
## 15 Sales Executive 97156 18.3
## 16 Sales Representative 73804 2.9
## 17 Regional Manager 102395 3.9
## 18 Team Leader 55674 5.2
## 19 Sales Representative 88916 4.6
## 20 Sales Representative 107177 18.0
## 21 Team Leader 115400 12.7
## 22 Sales Executive 42299 16.8
## 23 Team Leader 74018 11.0
## 24 Account Manager 61344 11.1
## 25 Regional Manager 116238 11.5
## 26 Team Leader 47669 15.4
## 27 Regional Manager 59154 3.2
## 28 Account Manager 81901 9.7
## 29 Account Manager 100429 2.5
## 30 Account Manager 57759 16.6
## 31 Team Leader 86843 18.5
## 32 Sales Executive 61405 10.1
## 33 Regional Manager 46932 6.9
## 34 Team Leader 75739 7.9
## 35 Sales Representative 87338 16.5
## 36 Sales Executive 94986 17.9
## 37 Regional Manager 86659 12.4
## 38 Sales Executive 46385 15.5
## 39 Team Leader 77992 7.5
## 40 Regional Manager 78228 14.5
## 41 Team Leader 109417 1.5
## 42 Account Manager 50069 16.5
## 43 Sales Representative 107353 5.2
## 44 Sales Representative 65403 17.3
## 45 Account Manager 64018 3.7
## 46 Sales Executive 52387 5.3
## 47 Sales Executive 118480 3.4
## 48 Team Leader 61906 3.5
## 49 Sales Representative 111265 2.5
## 50 Sales Executive 82284 14.1
# How employee age influences total sales generated & average deal size closed?
# Join the employees table and sales table
employee_sales <- left_join(employees, sales, by = 'Employee.ID')
# Create age categories
age_category <- cut(employee_sales$Age,
breaks=c(18,29,39,49,59),
labels=c("18-29","30-39","40-49","50-59"),
include.lowest='TRUE')
# Create new age categories column
employee_sales <- employee_sales %>%
mutate(Age_Category = age_category)
# Establish the relationship
employee.age.influence <- employee_sales %>%
group_by (Age_Category)%>%
summarise(total.sales = sum(Total.Sales....,na.rm=TRUE),
avg.deal.size = mean(Average.Deal.Size....,na.rm=TRUE)
,.groups = 'drop')
# Print results
print(employee.age.influence)
## # A tibble: 4 × 3
## Age_Category total.sales avg.deal.size
## <fct> <int> <dbl>
## 1 18-29 2235740 4455.
## 2 30-39 7336775 4756.
## 3 40-49 9217243 4981.
## 4 50-59 11235392 4777.
# Visualize results
ggplot(data=employee.age.influence, mapping=aes(x=total.sales, y=avg.deal.size,fill=Age_Category))+
geom_bar(stat='identity')

# Regions with highest average customer engagement in terms of email outreach and follow-up engagements
# Join the employees table and customers table
employee_customer <- right_join(customers,employees,by = 'Employee.ID')
# Establish customer engagement by region
top_regions <- employee_customer %>%
group_by (Region.Location)%>%
summarise(avg_email.outreach = mean(Email.Outreach,na.rm=TRUE),
avg_follow.up = mean(Follow.up.Engagements,na.rm=TRUE)
,.groups='drop')
#print results
print(top_regions)
## # A tibble: 10 × 3
## Region.Location avg_email.outreach avg_follow.up
## <chr> <dbl> <dbl>
## 1 California 50.7 18.9
## 2 Florida 31.3 25.4
## 3 Georgia 54.4 26.9
## 4 Illinois 61.2 32.4
## 5 Michigan 42.5 26.4
## 6 New York 46.1 21.2
## 7 North Carolina 49.0 25.7
## 8 Ohio 41.6 17.6
## 9 Pennsylvania 52.9 21.6
## 10 Texas 54.2 26.1
# Visualize output
ggplot(data=top_regions, mapping=aes(x=avg_email.outreach,y=avg_follow.up, color=Region.Location))+
geom_point(size=3) + theme_minimal() +
labs(title='Regions with highest Engagement',
x='Avg email outreach',
y='Avg follow up',)

# Relationship between employee salary & percentage of sales target?
# Join the employee and sales tables
employee_sales <- right_join(employees,sales,by='Employee.ID')
# Establish the correlation
cor_salary_sales.target <- cor(employee_sales$Salary....,employee_sales$X..of.Sales.Target.Achieved)
# Print results
print(cor_salary_sales.target)
## [1] -0.0728787
# Sales team with highest % of new customers in comparison to their total customer calls
# join the employees and customer table
employees_customers <- right_join(customers,employees,by='Employee.ID')
# join the employee_customer joined table to sales table
employees_customers_sales <- left_join(employees_customers,sales,by='Employee.ID')
## Warning in left_join(employees_customers, sales, by = "Employee.ID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 36 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Establish the relationship
top.sales.team <- employees_customers_sales%>%
group_by(Sales.Team)%>%
summarise(total.customers = sum(New.Customers.Acquired,na.rm=TRUE),
total.calls = sum(Total.Customer.Calls,na.rm = TRUE)
,.groups='drop') %>%
arrange(desc(total.customers))%>%
slice(1:10)
# Print the results
print(top.sales.team)
## # A tibble: 5 × 3
## Sales.Team total.customers total.calls
## <chr> <int> <int>
## 1 West 5325 36651
## 2 South 3685 31637
## 3 Central 3222 21974
## 4 North 3154 28192
## 5 East 2396 16359
# Visualize the results
ggplot(data=top.sales.team,mapping=aes(x=Sales.Team, y=total.customers, fill=total.calls))+
geom_col()

# Top 5 employees in terms of largest deals closed, and what are their average customer feedback scores?
# Join employee table and sales table
employees_sales <- left_join(employees,sales,by='Employee.ID')
# Join customer table to the employee sales table above
customers_employees_sales <- left_join(employees_sales,customers,by='Employee.ID')
## Warning in left_join(employees_sales, customers, by = "Employee.ID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 88 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
# Establish relationship
top_employees <- customers_employees_sales %>%
group_by(Employee.Name)%>%
summarise(total.closed.deal=sum(Largest.Deal.Closed....),
avg.feedback.score=mean(Customer.Feedback.Score)
,.group='drop')%>%
arrange(desc(total.closed.deal))%>%
slice(1:5)
# Print results
print(top_employees)
## # A tibble: 5 × 4
## Employee.Name total.closed.deal avg.feedback.score .group
## <chr> <int> <dbl> <chr>
## 1 Evelyn King 5277327 2.80 drop
## 2 William Hall 4457687 3.14 drop
## 3 Isabella Miller 4136465 3.32 drop
## 4 Oliver Brown 3095435 2.91 drop
## 5 Mason Taylor 3074440 3.05 drop
# Visualize results
ggplot(data=top_employees,mapping=aes(x=total.closed.deal,y=avg.feedback.score,color=Employee.Name))+
geom_point(size=3)
