# load packages
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
library(ggplot2)

# load customer dataset
customers <- read.csv("/cloud/project/Employees/customer_engagement.csv")
colnames(customers)
## [1] "Employee.ID"             "Month"                  
## [3] "Total.Customer.Calls"    "Total.Customer.Meetings"
## [5] "Email.Outreach"          "Follow.up.Engagements"  
## [7] "Customer.Feedback.Score"
# load employee dataset
employees <- read.csv("/cloud/project/Employees/employee_details.csv")
colnames(employees)
## [1] "Employee.ID"     "Employee.Name"   "Gender"          "Age"            
## [5] "Region.Location" "Sales.Team"      "Position.Role"   "Salary...."     
## [9] "Tenure..Years."
# load sales dataset
sales <- read.csv("/cloud/project/Employees/sales_performance.csv")
colnames(sales)
## [1] "Employee.ID"                 "Month"                      
## [3] "Total.Sales...."             "Sales.Target...."           
## [5] "X..of.Sales.Target.Achieved" "Total.Units.Sold"           
## [7] "New.Customers.Acquired"      "Average.Deal.Size...."      
## [9] "Largest.Deal.Closed...."
# Do employees with longer tenures tend to have higher total sales or larger deals closed compared to newer employees?

# join the employees and sales tables
employees_sales_table <- left_join(employees,sales,by='Employee.ID')

# create tenure category
tenure_group <- cut(employees_sales_table$Tenure..Years.,
                    breaks=c(1,4,8,12,16,20),
                    labels=c("1-4","4.1-8","8.1-12","12.1-16","16.1-20"),
                    include.lowest='TRUE')
                    
# create a tenure category column
employees_sales_table <- employees_sales_table %>%
  mutate (Tenure_Category = tenure_group)

# Establish relationship
employee_tenure <- employees_sales_table %>%
  group_by (Tenure_Category)%>% 
  summarise(total_sales=sum(Total.Sales....),
            total_deals=sum(Largest.Deal.Closed....)
            ,.groups='drop')

# Print results
print(employee_tenure)
## # A tibble: 5 × 3
##   Tenure_Category total_sales total_deals
##   <fct>                 <int>       <int>
## 1 1-4                 5617910     1430003
## 2 4.1-8               5023130     1451952
## 3 8.1-12              7708116     1816503
## 4 12.1-16             5066087     1482638
## 5 16.1-20             6609907     1701676
# Visualize results
ggplot(data=employee_tenure,mapping=aes(x=Tenure_Category,y=total_sales,fill=total_deals))+
  geom_col()

# Correlation between total number of customer calls and customer feedback scores?
cor_score <- cor(customers$Total.Customer.Calls,customers$Customer.Feedback.Score)
print(cor_score)
## [1] 0.002517845
# What is the frequency of salary distribution

ggplot(data=employees, mapping=aes(x=Salary....))+
  geom_histogram(color='lightblue')
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Print results
print(employees)
##    Employee.ID    Employee.Name Gender Age Region.Location Sales.Team
## 1         E001     Ava Anderson Female  31           Texas      North
## 2         E002     William Hall   Male  35           Texas       West
## 3         E003     Oliver Brown   Male  55           Texas      North
## 4         E004       John Smith Female  23         Florida      South
## 5         E005   Henry Martinez Female  40      California    Central
## 6         E006   Benjamin Young Female  48        New York       East
## 7         E007      Mia Jackson   Male  56    Pennsylvania      South
## 8         E008  Isabella Miller Female  30        Michigan       West
## 9         E009      Evelyn King Female  52           Texas      North
## 10        E010    Amelia Garcia   Male  44         Georgia       West
## 11        E011     Harper Lewis   Male  48        Illinois      North
## 12        E012     David Wright   Male  50    Pennsylvania    Central
## 13        E013     Harper Lewis   Male  23           Texas       East
## 14        E014       John Smith Female  40         Florida    Central
## 15        E015    Amelia Garcia   Male  34  North Carolina    Central
## 16        E016    Amelia Garcia   Male  26        New York       East
## 17        E017  Sebastian Scott Female  23    Pennsylvania       West
## 18        E018      Liam Wilson   Male  36    Pennsylvania       East
## 19        E019      Evelyn King   Male  55           Texas      North
## 20        E020 Charlotte Martin Female  43    Pennsylvania      North
## 21        E021  Isabella Miller Female  48         Georgia    Central
## 22        E022     Lucas Harris Female  59  North Carolina      South
## 23        E023     William Hall Female  43           Texas      North
## 24        E024    Amelia Garcia Female  42  North Carolina    Central
## 25        E025      Evelyn King   Male  53        Illinois      South
## 26        E026     William Hall   Male  59      California       East
## 27        E027       John Smith Female  55        New York      South
## 28        E028     William Hall Female  57            Ohio       West
## 29        E029     Oliver Brown Female  50         Georgia    Central
## 30        E030 Charlotte Martin Female  51           Texas    Central
## 31        E031      Evelyn King   Male  41         Georgia    Central
## 32        E032      James White Female  58    Pennsylvania      North
## 33        E033   Emily Robinson Female  30           Texas       West
## 34        E034     Emma Johnson Female  41        New York      South
## 35        E035      Mia Jackson Female  32  North Carolina       East
## 36        E036  Elijah Thompson   Male  38  North Carolina      South
## 37        E037  Sebastian Scott   Male  36    Pennsylvania       West
## 38        E038     Ethan Thomas Female  52        Illinois      South
## 39        E039      Evelyn King   Male  45         Georgia       West
## 40        E040   Michael Walker   Male  40        Illinois      North
## 41        E041  Alexander Clark Female  25        Michigan       West
## 42        E042     Harper Lewis   Male  31           Texas       West
## 43        E043     Mason Taylor Female  44        Michigan       West
## 44        E044   Benjamin Young   Male  40         Georgia       West
## 45        E045     Oliver Brown   Male  51    Pennsylvania      South
## 46        E046   Michael Walker   Male  32      California    Central
## 47        E047      James White Female  32         Georgia      North
## 48        E048     David Wright   Male  56        New York       East
## 49        E049     Emma Johnson   Male  40           Texas      South
## 50        E050   Emily Robinson Female  33  North Carolina      South
##           Position.Role Salary.... Tenure..Years.
## 1           Team Leader      94677           17.6
## 2       Sales Executive      63510           16.7
## 3       Account Manager     104472           14.0
## 4           Team Leader      72079            6.7
## 5           Team Leader      43728           14.0
## 6       Account Manager      55644           14.1
## 7      Regional Manager      76335            2.8
## 8      Regional Manager     104625           12.0
## 9  Sales Representative      48745           16.6
## 10      Account Manager     118497            1.2
## 11      Account Manager      87416           10.8
## 12      Sales Executive     116161            8.1
## 13          Team Leader     102978           10.2
## 14     Regional Manager     114130            7.5
## 15      Sales Executive      97156           18.3
## 16 Sales Representative      73804            2.9
## 17     Regional Manager     102395            3.9
## 18          Team Leader      55674            5.2
## 19 Sales Representative      88916            4.6
## 20 Sales Representative     107177           18.0
## 21          Team Leader     115400           12.7
## 22      Sales Executive      42299           16.8
## 23          Team Leader      74018           11.0
## 24      Account Manager      61344           11.1
## 25     Regional Manager     116238           11.5
## 26          Team Leader      47669           15.4
## 27     Regional Manager      59154            3.2
## 28      Account Manager      81901            9.7
## 29      Account Manager     100429            2.5
## 30      Account Manager      57759           16.6
## 31          Team Leader      86843           18.5
## 32      Sales Executive      61405           10.1
## 33     Regional Manager      46932            6.9
## 34          Team Leader      75739            7.9
## 35 Sales Representative      87338           16.5
## 36      Sales Executive      94986           17.9
## 37     Regional Manager      86659           12.4
## 38      Sales Executive      46385           15.5
## 39          Team Leader      77992            7.5
## 40     Regional Manager      78228           14.5
## 41          Team Leader     109417            1.5
## 42      Account Manager      50069           16.5
## 43 Sales Representative     107353            5.2
## 44 Sales Representative      65403           17.3
## 45      Account Manager      64018            3.7
## 46      Sales Executive      52387            5.3
## 47      Sales Executive     118480            3.4
## 48          Team Leader      61906            3.5
## 49 Sales Representative     111265            2.5
## 50      Sales Executive      82284           14.1
# How employee age influences total sales generated & average deal size closed?

# Join the employees table and sales table
employee_sales <- left_join(employees, sales, by = 'Employee.ID')

# Create age categories
age_category <- cut(employee_sales$Age,
                    breaks=c(18,29,39,49,59),
                    labels=c("18-29","30-39","40-49","50-59"),
                    include.lowest='TRUE')

# Create new age categories column
employee_sales <- employee_sales %>%
  mutate(Age_Category = age_category)

# Establish the relationship
employee.age.influence <- employee_sales %>%
  group_by (Age_Category)%>%
  summarise(total.sales = sum(Total.Sales....,na.rm=TRUE),
            avg.deal.size = mean(Average.Deal.Size....,na.rm=TRUE)
            ,.groups = 'drop')

# Print results
print(employee.age.influence)
## # A tibble: 4 × 3
##   Age_Category total.sales avg.deal.size
##   <fct>              <int>         <dbl>
## 1 18-29            2235740         4455.
## 2 30-39            7336775         4756.
## 3 40-49            9217243         4981.
## 4 50-59           11235392         4777.
# Visualize results
ggplot(data=employee.age.influence, mapping=aes(x=total.sales, y=avg.deal.size,fill=Age_Category))+
  geom_bar(stat='identity')

# Regions with highest average customer engagement in terms of email outreach and follow-up engagements

# Join the employees table and customers table
employee_customer <- right_join(customers,employees,by = 'Employee.ID')

# Establish customer engagement by region
top_regions <- employee_customer %>%
  group_by (Region.Location)%>%
  summarise(avg_email.outreach = mean(Email.Outreach,na.rm=TRUE),
            avg_follow.up = mean(Follow.up.Engagements,na.rm=TRUE)
            ,.groups='drop')
#print results
print(top_regions)
## # A tibble: 10 × 3
##    Region.Location avg_email.outreach avg_follow.up
##    <chr>                        <dbl>         <dbl>
##  1 California                    50.7          18.9
##  2 Florida                       31.3          25.4
##  3 Georgia                       54.4          26.9
##  4 Illinois                      61.2          32.4
##  5 Michigan                      42.5          26.4
##  6 New York                      46.1          21.2
##  7 North Carolina                49.0          25.7
##  8 Ohio                          41.6          17.6
##  9 Pennsylvania                  52.9          21.6
## 10 Texas                         54.2          26.1
# Visualize output
ggplot(data=top_regions, mapping=aes(x=avg_email.outreach,y=avg_follow.up, color=Region.Location))+
geom_point(size=3) + theme_minimal() +
  labs(title='Regions with highest Engagement',
       x='Avg email outreach',
       y='Avg follow up',)

# Relationship between employee salary & percentage of sales target?

# Join the employee and sales tables
employee_sales <- right_join(employees,sales,by='Employee.ID')

# Establish the correlation
cor_salary_sales.target <- cor(employee_sales$Salary....,employee_sales$X..of.Sales.Target.Achieved)

# Print results
print(cor_salary_sales.target)
## [1] -0.0728787
# Sales team with highest % of new customers in comparison to their total customer calls

# join the employees and customer table
employees_customers <- right_join(customers,employees,by='Employee.ID')

# join the employee_customer joined table to sales table
employees_customers_sales <- left_join(employees_customers,sales,by='Employee.ID')
## Warning in left_join(employees_customers, sales, by = "Employee.ID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 36 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Establish the relationship
top.sales.team <- employees_customers_sales%>%
  group_by(Sales.Team)%>%
  summarise(total.customers = sum(New.Customers.Acquired,na.rm=TRUE),
           total.calls = sum(Total.Customer.Calls,na.rm = TRUE)
  ,.groups='drop') %>%
    arrange(desc(total.customers))%>%
    slice(1:10)
  
# Print the results
print(top.sales.team)
## # A tibble: 5 × 3
##   Sales.Team total.customers total.calls
##   <chr>                <int>       <int>
## 1 West                  5325       36651
## 2 South                 3685       31637
## 3 Central               3222       21974
## 4 North                 3154       28192
## 5 East                  2396       16359
# Visualize the results
ggplot(data=top.sales.team,mapping=aes(x=Sales.Team, y=total.customers, fill=total.calls))+
geom_col()

# Top 5 employees in terms of largest deals closed, and what are their average customer feedback scores?
  
# Join employee table and sales table
employees_sales <- left_join(employees,sales,by='Employee.ID')

# Join customer table to the employee sales table above
customers_employees_sales <- left_join(employees_sales,customers,by='Employee.ID')
## Warning in left_join(employees_sales, customers, by = "Employee.ID"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 1 of `x` matches multiple rows in `y`.
## ℹ Row 88 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Establish relationship
top_employees <- customers_employees_sales %>%
  group_by(Employee.Name)%>%
  summarise(total.closed.deal=sum(Largest.Deal.Closed....),
            avg.feedback.score=mean(Customer.Feedback.Score)
  ,.group='drop')%>%
arrange(desc(total.closed.deal))%>%
  slice(1:5)

# Print results
print(top_employees)
## # A tibble: 5 × 4
##   Employee.Name   total.closed.deal avg.feedback.score .group
##   <chr>                       <int>              <dbl> <chr> 
## 1 Evelyn King               5277327               2.80 drop  
## 2 William Hall              4457687               3.14 drop  
## 3 Isabella Miller           4136465               3.32 drop  
## 4 Oliver Brown              3095435               2.91 drop  
## 5 Mason Taylor              3074440               3.05 drop
# Visualize results
ggplot(data=top_employees,mapping=aes(x=total.closed.deal,y=avg.feedback.score,color=Employee.Name))+
  geom_point(size=3)