ChengTamrinMidterm

#Loading libraries
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

library(psych)

## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(viridis)

## Loading required package: viridisLite

library(scales)

## 
## Attaching package: 'scales'
## 
## The following object is masked from 'package:viridis':
## 
##     viridis_pal
## 
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
## 
## The following object is masked from 'package:purrr':
## 
##     discard
## 
## The following object is masked from 'package:readr':
## 
##     col_factor

library(purrr)
library(flextable)

## 
## Attaching package: 'flextable'
## 
## The following object is masked from 'package:purrr':
## 
##     compose

# Load the HR data set; the CSV is expected in the working directory
HR_Analytics <- read_csv(file = "HR_Analytics.csv")

## Rows: 1470 Columns: 35
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (9): Attrition, BusinessTravel, Department, EducationField, Gender, Job...
## dbl (26): Age, DailyRate, DistanceFromHome, Education, EmployeeCount, Employ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Work on a random sample of 500 rows so the plots and clustering below run
# quickly. The seed is fixed so the same rows are drawn on every run.
# seq_len() replaces 1:nrow(): it builds the same index vector for any
# non-empty data set and an empty one (not c(1, 0)) when there are no rows.
set.seed(123)
rows.to.select <- sample(seq_len(nrow(HR_Analytics)), size = 500, replace = FALSE)
HR_Analytics <- HR_Analytics[rows.to.select, ]

# Column-by-column summary of the sampled data
HR_Analytics |> summary()

##       Age         Attrition         BusinessTravel       DailyRate     
##  Min.   :18.00   Length:500         Length:500         Min.   : 102.0  
##  1st Qu.:31.00   Class :character   Class :character   1st Qu.: 439.8  
##  Median :36.00   Mode  :character   Mode  :character   Median : 787.5  
##  Mean   :37.44                                         Mean   : 795.5  
##  3rd Qu.:44.00                                         3rd Qu.:1167.5  
##  Max.   :60.00                                         Max.   :1498.0  
##   Department        DistanceFromHome   Education     EducationField    
##  Length:500         Min.   : 1.000   Min.   :1.000   Length:500        
##  Class :character   1st Qu.: 2.000   1st Qu.:2.000   Class :character  
##  Mode  :character   Median : 7.000   Median :3.000   Mode  :character  
##                     Mean   : 9.538   Mean   :2.996                     
##                     3rd Qu.:15.000   3rd Qu.:4.000                     
##                     Max.   :29.000   Max.   :5.000                     
##  EmployeeCount EmployeeNumber   EnvironmentSatisfaction    Gender         
##  Min.   :1     Min.   :   8.0   Min.   :1.00            Length:500        
##  1st Qu.:1     1st Qu.: 474.8   1st Qu.:2.00            Class :character  
##  Median :1     Median : 974.5   Median :3.00            Mode  :character  
##  Mean   :1     Mean   :1004.6   Mean   :2.76                              
##  3rd Qu.:1     3rd Qu.:1542.5   3rd Qu.:4.00                              
##  Max.   :1     Max.   :2068.0   Max.   :4.00                              
##    HourlyRate     JobInvolvement     JobLevel       JobRole         
##  Min.   : 30.00   Min.   :1.000   Min.   :1.000   Length:500        
##  1st Qu.: 49.00   1st Qu.:2.000   1st Qu.:1.000   Class :character  
##  Median : 65.00   Median :3.000   Median :2.000   Mode  :character  
##  Mean   : 65.71   Mean   :2.752   Mean   :2.158                     
##  3rd Qu.: 83.00   3rd Qu.:3.000   3rd Qu.:3.000                     
##  Max.   :100.00   Max.   :4.000   Max.   :5.000                     
##  JobSatisfaction MaritalStatus      MonthlyIncome    MonthlyRate   
##  Min.   :1.00    Length:500         Min.   : 1052   Min.   : 2097  
##  1st Qu.:2.00    Class :character   1st Qu.: 2898   1st Qu.: 8677  
##  Median :3.00    Mode  :character   Median : 5044   Median :14550  
##  Mean   :2.74                       Mean   : 6833   Mean   :14366  
##  3rd Qu.:4.00                       3rd Qu.: 9390   3rd Qu.:20329  
##  Max.   :4.00                       Max.   :19999   Max.   :26959  
##  NumCompaniesWorked    Over18            OverTime         PercentSalaryHike
##  Min.   :0.000      Length:500         Length:500         Min.   :11.00    
##  1st Qu.:1.000      Class :character   Class :character   1st Qu.:12.00    
##  Median :2.000      Mode  :character   Mode  :character   Median :14.00    
##  Mean   :2.862                                            Mean   :15.33    
##  3rd Qu.:4.000                                            3rd Qu.:18.00    
##  Max.   :9.000                                            Max.   :25.00    
##  PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
##  Min.   :3.000     Min.   :1.000            Min.   :80    Min.   :0.000   
##  1st Qu.:3.000     1st Qu.:2.000            1st Qu.:80    1st Qu.:0.000   
##  Median :3.000     Median :3.000            Median :80    Median :1.000   
##  Mean   :3.162     Mean   :2.754            Mean   :80    Mean   :0.782   
##  3rd Qu.:3.000     3rd Qu.:4.000            3rd Qu.:80    3rd Qu.:1.000   
##  Max.   :4.000     Max.   :4.000            Max.   :80    Max.   :3.000   
##  TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany  
##  Min.   : 0.00     Min.   :0.00          Min.   :1.000   Min.   : 0.000  
##  1st Qu.: 6.00     1st Qu.:2.00          1st Qu.:2.000   1st Qu.: 3.000  
##  Median :10.00     Median :3.00          Median :3.000   Median : 5.000  
##  Mean   :12.02     Mean   :2.87          Mean   :2.768   Mean   : 7.366  
##  3rd Qu.:17.00     3rd Qu.:3.00          3rd Qu.:3.000   3rd Qu.:10.000  
##  Max.   :40.00     Max.   :6.00          Max.   :4.000   Max.   :40.000  
##  YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
##  Min.   : 0.00      Min.   : 0.000          Min.   : 0.000      
##  1st Qu.: 2.00      1st Qu.: 0.000          1st Qu.: 2.000      
##  Median : 3.00      Median : 1.000          Median : 3.000      
##  Mean   : 4.42      Mean   : 2.476          Mean   : 4.302      
##  3rd Qu.: 7.00      3rd Qu.: 4.000          3rd Qu.: 7.000      
##  Max.   :18.00      Max.   :15.000          Max.   :17.000

# How many sampled employees left (Yes) versus stayed (No)
HR_Analytics |> count(Attrition)

## # A tibble: 2 × 2
##   Attrition     n
##   <chr>     <int>
## 1 No          415
## 2 Yes          85

# Same split, further broken down by gender
HR_Analytics |> count(Attrition, Gender)

## # A tibble: 4 × 3
##   Attrition Gender     n
##   <chr>     <chr>  <int>
## 1 No        Female   156
## 2 No        Male     259
## 3 Yes       Female    35
## 4 Yes       Male      50

# Inventory of every column in the data set
all.var <- names(HR_Analytics)

# Four different pay measures; to be checked for mutual correlation
money.var <- c("DailyRate", "HourlyRate", "MonthlyIncome", "MonthlyRate")

# Categorical (character) columns
cat.var <- c(
  "Attrition", "BusinessTravel", "Department", "EducationField", "Gender",
  "JobRole", "MaritalStatus", "Over18", "OverTime"
)

# Columns set aside as not helpful for the analysis
not.helpful.var <- c(
  "EmployeeCount", "EmployeeNumber", "PerformanceRating", "StandardHours"
)

# Numeric candidates: everything that is neither categorical nor set aside
num.var <- setdiff(all.var, c(cat.var, not.helpful.var))

# Keep just the four pay variables listed in money.var
df_subset <- select(HR_Analytics, all_of(money.var))

# Pair plot of the pay variables:
#   upper triangle - correlation coefficients
#   lower triangle - semi-transparent scatterplots
#   diagonal       - density curves
ggpairs(
  df_subset,
  title = "Pair Plot for DailyRate, HourlyRate, MonthlyIncome, and MonthlyRate",
  upper = list(continuous = wrap("cor", size = 5)),
  lower = list(continuous = wrap("points", alpha = 0.5)),
  diag = list(continuous = wrap("densityDiag"))
) +
  theme_minimal()

# The pair plot shows barely any correlation among the four pay variables, so for simplicity (and for practice purposes) MonthlyIncome is used as the sole indicator of income.

# Table of descriptive statistics for the numeric analysis variables

# Keep only the variables used in the analysis
all.included.var <- HR_Analytics |>
  select(JobSatisfaction, EnvironmentSatisfaction, MonthlyIncome,
         WorkLifeBalance, YearsAtCompany, YearsInCurrentRole, OverTime, Gender)

# Split into numeric and categorical columns
numeric_cols <- all.included.var |>
  select(where(is.numeric))

categorical_cols <- all.included.var |>
  select(OverTime, Gender)

# One row of Min / Max / Mean / SD per numeric column, rounded to 2 decimals.
# The variable name comes from the list names via `names_to`; the earlier
# `deparse(substitute(.))` column was always overwritten and has been removed.
# map() + list_rbind() replaces the superseded map_dfr().
numeric_descriptives <- numeric_cols |>
  map(\(x) tibble(
    Min = round(min(x, na.rm = TRUE), 2),
    Max = round(max(x, na.rm = TRUE), 2),
    Mean = round(mean(x, na.rm = TRUE), 2),
    SD = round(sd(x, na.rm = TRUE), 2)
  )) |>
  list_rbind(names_to = "Variable")

# Create the flextable
descriptives_table <- flextable(numeric_descriptives)

# Display the table
descriptives_table

Variable	Min	Max	Mean	SD
JobSatisfaction	1	4	2.74	1.09
EnvironmentSatisfaction	1	4	2.76	1.08
MonthlyIncome	1,052	19,999	6,832.89	5,008.53
WorkLifeBalance	1	4	2.77	0.73
YearsAtCompany	0	40	7.37	6.52
YearsInCurrentRole	0	18	4.42	3.79

# Create bar plot showing attrition by gender

# Count employees per Gender x Attrition cell, then express each count as a
# percentage of that gender's total (so the two bars for a gender sum to 100).
# The per-gender grouping is stated explicitly with `.by` instead of relying
# on summarise()'s default of silently keeping the first grouping variable.
attrition_by_gender <- HR_Analytics |>
  count(Gender, Attrition, name = "count") |>
  mutate(percentage = count / sum(count) * 100, .by = Gender)

# Side-by-side bars of the within-gender attrition percentages, each bar
# labelled with its value to one decimal place
ggplot(
  data = attrition_by_gender,
  mapping = aes(x = Gender, y = percentage, fill = Attrition)
) +
  geom_col(position = "dodge") +
  geom_text(
    mapping = aes(label = sprintf("%.1f%%", percentage)),
    position = position_dodge(0.9),
    vjust = -0.5
  ) +
  labs(
    title = "Attrition by Gender (Percentage)",
    x = "Gender",
    y = "Percentage of Employees",
    fill = "Attrition"
  ) +
  theme_minimal()

# Attrition (the grouping variable) plus the numeric variables to compare
df_selected <- select(
  HR_Analytics,
  all_of(c(
    "Attrition", "JobSatisfaction", "EnvironmentSatisfaction",
    "MonthlyIncome", "WorkLifeBalance", "YearsAtCompany",
    "YearsInCurrentRole", "Age"
  ))
)

# Box plots of each numeric variable by Attrition ----
# The seven plots differ only in the y variable and its display label, so the
# shared layers live in one helper instead of seven copies.

# Build a summary function for stat_summary(fun.data = ...).
#   stat:   function reducing a numeric vector to one value (e.g. median)
#   prefix: text placed before the rounded value (e.g. "Median:")
# Returns a function of y giving a one-row data frame with the label position
# (`y`) and the label text (`label`).
stat_label <- function(stat, prefix) {
  force(stat)
  force(prefix)
  function(y) {
    value <- stat(y)
    data.frame(y = value, label = paste(prefix, round(value, 2)))
  }
}

# Box plot of one variable split by Attrition.
#   data:    data frame with an `Attrition` column and the column `y_var`
#   y_var:   unquoted name of the column to plot on the y axis
#   y_label: display name, used for the y-axis title and the plot title
# Returns a ggplot object (printed automatically when called at top level).
plot_by_attrition <- function(data, y_var, y_label) {
  ggplot(data, aes(x = Attrition, y = {{ y_var }}, fill = Attrition)) +
    # Outliers are hidden here because every observation is drawn by the
    # jitter layer below
    geom_boxplot(outlier.shape = NA) +
    geom_jitter(width = 0.2, alpha = 0.3, color = "gray40") +
    # Red diamond marks the group mean
    stat_summary(fun = mean, geom = "point", shape = 18, size = 3,
                 color = "red", show.legend = FALSE) +
    # Text labels for the median and mean
    stat_summary(fun.data = stat_label(median, "Median:"),
                 geom = "text", vjust = -1.5, size = 4) +
    stat_summary(fun.data = stat_label(mean, "Mean:"),
                 geom = "text", vjust = 1.5, color = "red", size = 4) +
    # Labels at the data minimum and maximum of each group (the observed
    # extremes, which are not necessarily the whisker ends)
    stat_summary(fun.data = stat_label(min, "Min:"),
                 geom = "text", vjust = 1.5, hjust = -0.2, size = 4,
                 color = "blue") +
    stat_summary(fun.data = stat_label(max, "Max:"),
                 geom = "text", vjust = -1.5, hjust = 1.2, size = 4,
                 color = "blue") +
    scale_fill_viridis(discrete = TRUE, option = "D") +
    labs(title = paste(y_label, "by Attrition"), x = "Attrition", y = y_label) +
    theme_minimal(base_size = 14) +
    # No legend: Attrition is already shown on the x axis
    theme(legend.position = "none",
          plot.title = element_text(size = 16, face = "bold"),
          axis.text = element_text(size = 12),
          axis.title = element_text(size = 14))
}

# JobSatisfaction by Attrition
plot_by_attrition(df_selected, JobSatisfaction, "Job Satisfaction")

# EnvironmentSatisfaction by Attrition
plot_by_attrition(df_selected, EnvironmentSatisfaction, "Environment Satisfaction")

# MonthlyIncome by Attrition
plot_by_attrition(df_selected, MonthlyIncome, "Monthly Income")

# WorkLifeBalance by Attrition
plot_by_attrition(df_selected, WorkLifeBalance, "Work-Life Balance")

# YearsAtCompany by Attrition
plot_by_attrition(df_selected, YearsAtCompany, "Years at Company")

# YearsInCurrentRole by Attrition
plot_by_attrition(df_selected, YearsInCurrentRole, "Years in Current Role")

# Age by Attrition
plot_by_attrition(df_selected, Age, "Age")

# Load necessary libraries
library(ComplexHeatmap)

## Loading required package: grid

## ========================================
## ComplexHeatmap version 2.21.1
## Bioconductor page: http://bioconductor.org/packages/ComplexHeatmap/
## Github page: https://github.com/jokergoo/ComplexHeatmap
## Documentation: http://jokergoo.github.io/ComplexHeatmap-reference
## 
## If you use it in published research, please cite either one:
## - Gu, Z. Complex Heatmap Visualization. iMeta 2022.
## - Gu, Z. Complex heatmaps reveal patterns and correlations in multidimensional 
##     genomic data. Bioinformatics 2016.
## 
## 
## The new InteractiveComplexHeatmap package can directly export static 
## complex heatmaps into an interactive Shiny app with zero effort. Have a try!
## 
## This message can be suppressed by:
##   suppressPackageStartupMessages(library(ComplexHeatmap))
## ========================================

library(dplyr)

# Numeric variables that feed the clustering and heatmap
numeric_columns <- c(
  "Age", "MonthlyIncome", "EnvironmentSatisfaction", "JobSatisfaction",
  "WorkLifeBalance", "YearsAtCompany", "YearsInCurrentRole"
)

# Pull out those columns, and keep Attrition separately for the row annotation
hr_numeric_data <- select(HR_Analytics, all_of(numeric_columns))
attrition_data <- HR_Analytics[["Attrition"]]

# Standardise each column with scale() (centre and scale); returns a matrix
hr_data_scaled <- scale(hr_numeric_data)

# Uncentered cosine distance between the rows of a matrix.
#
# m: numeric matrix; every row is treated as one vector.
# Returns a "dist" object holding 1 - cosine similarity for each pair of rows.
# Stops with an error if any row is all zeros: its cosine is undefined, and
# the NaN it would produce otherwise only surfaces later (e.g. in hclust()).
uncenter.dist <- function(m) {
  # Euclidean length of every row
  norm <- sqrt(rowSums(m^2))
  if (any(norm == 0, na.rm = TRUE)) {
    stop("uncenter.dist(): rows with zero norm have no defined cosine distance",
         call. = FALSE)
  }
  # Pairwise dot products divided by the product of the two row lengths
  similarity <- (m %*% t(m)) / outer(norm, norm)
  # Rounding error can push values fractionally outside [-1, 1]; clamp so the
  # distances stay within [0, 2]
  similarity <- pmin(pmax(similarity, -1), 1)
  # Convert cosine similarity to distance
  as.dist(1 - similarity)
}

# Cluster the variables (columns) by cosine distance between their scaled
# value profiles
col.clus <- hclust(uncenter.dist(t(hr_data_scaled)), method = "average")

# Create a k-means clustering solution
set.seed(123)  # Setting seed for reproducibility
k <- 3  # Number of clusters for k-means
kmeans_result <- kmeans(hr_data_scaled, centers = k)

# Reorder rows (and the matching attrition labels) by k-means cluster
ordered_indices <- order(kmeans_result$cluster)
hr_data_scaled_ordered <- hr_data_scaled[ordered_indices, ]
attrition_data_ordered <- attrition_data[ordered_indices]

# Cluster the employees (rows). The tree must be built from the SAME matrix
# that is passed to Heatmap(): the leaf indices of a supplied hclust object are
# applied to that matrix's rows. Building it from the un-reordered matrix (as
# before) attached the dendrogram to the wrong rows.
# NOTE(review): with a row dendrogram supplied, the displayed row order comes
# from the dendrogram, so the k-means order above is not what is shown. If the
# k-means groups are meant to be visible, consider `row_split` - TODO confirm.
row.clus <- hclust(uncenter.dist(hr_data_scaled_ordered), method = "average")

# Create a color palette for the attrition status
attrition_colors <- c("Yes" = "black", "No" = "white")

# Create a row annotation for the attrition column
attrition_annotation <- rowAnnotation(
  Attrition = attrition_data_ordered,
  col = list(Attrition = attrition_colors),
  annotation_legend_param = list(Attrition = list(title = "Attrition Status"))
)

# Create the heatmap with the attrition row annotation
heatmap_ordered <- Heatmap(hr_data_scaled_ordered,
                           name = "HR Analytics Heatmap",
                           cluster_rows = row.clus,
                           cluster_columns = col.clus,
                           right_annotation = attrition_annotation)

# Draw the heatmap with the attrition row annotation
draw(heatmap_ordered)

ChengTamrinMidterm

Tamrin Cheng

2024-11-04