# Read the HR attrition CSV (expected in the working directory) into a data frame
hrdata <- read.csv(file = "HR-Employee-Attrition.csv")
# Sanity-check the import by looking at the first 6 rows
head(x = hrdata, n = 6)
# Numeric columns of interest, defined once so the preview and the
# correlation table cannot drift apart.
numeric_cols <- c(
  "Age", "DailyRate", "DistanceFromHome", "Education", "HourlyRate",
  "MonthlyIncome", "MonthlyRate", "NumCompaniesWorked", "TotalWorkingYears",
  "TrainingTimesLastYear"
)
# Show the data restricted to the columns of interest. Gender is included
# for viewing only; it is left out of the correlation below.
hrdata[, c(numeric_cols, "Gender")]
# Correlation table of the numeric columns
cor(hrdata[, numeric_cols])
Age DailyRate DistanceFromHome Education HourlyRate MonthlyIncome MonthlyRate
Age 1.00000000 0.010660943 -0.001686120 0.20803373 0.024286543 0.497854567 0.028051167
DailyRate 0.01066094 1.000000000 -0.004985337 -0.01680643 0.023381422 0.007707059 -0.032181602
DistanceFromHome -0.00168612 -0.004985337 1.000000000 0.02104183 0.031130586 -0.017014445 0.027472864
Education 0.20803373 -0.016806433 0.021041826 1.00000000 0.016774829 0.094960677 -0.026084197
HourlyRate 0.02428654 0.023381422 0.031130586 0.01677483 1.000000000 -0.015794304 -0.015296750
MonthlyIncome 0.49785457 0.007707059 -0.017014445 0.09496068 -0.015794304 1.000000000 0.034813626
MonthlyRate 0.02805117 -0.032181602 0.027472864 -0.02608420 -0.015296750 0.034813626 1.000000000
NumCompaniesWorked 0.29963476 0.038153434 -0.029250804 0.12631656 0.022156883 0.149515216 0.017521353
TotalWorkingYears 0.68038054 0.014514739 0.004628426 0.14827970 -0.002333682 0.772893246 0.026442471
TrainingTimesLastYear -0.01962082 0.002452543 -0.036942234 -0.02510024 -0.008547685 -0.021736277 0.001466881
NumCompaniesWorked TotalWorkingYears TrainingTimesLastYear
Age 0.29963476 0.680380536 -0.019620819
DailyRate 0.03815343 0.014514739 0.002452543
DistanceFromHome -0.02925080 0.004628426 -0.036942234
Education 0.12631656 0.148279697 -0.025100241
HourlyRate 0.02215688 -0.002333682 -0.008547685
MonthlyIncome 0.14951522 0.772893246 -0.021736277
MonthlyRate 0.01752135 0.026442471 0.001466881
NumCompaniesWorked 1.00000000 0.237638590 -0.066054072
TotalWorkingYears 0.23763859 1.000000000 -0.035661571
TrainingTimesLastYear -0.06605407 -0.035661571 1.000000000
# Scatterplot matrix of MonthlyIncome, Age, TotalWorkingYears and Education,
# to inspect the relationships with the highest correlation values visually
pairs(
  ~ MonthlyIncome + Age + TotalWorkingYears + Education,
  main = "Scatterplot Matrix",
  data = hrdata
)

# Ages of employees whose Attrition is "Yes" vs. everyone else
yes_age <- hrdata$Age[hrdata$Attrition == "Yes"]
no_age <- hrdata$Age[hrdata$Attrition != "Yes"]
# Two-sample t-test on mean age; x is the "Yes" group, y is the other group
t.test(x = yes_age, y = no_age)
Welch Two Sample t-test
data: yes_age and no_age
t = -5.828, df = 316.93, p-value = 1.38e-08
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
-5.288346 -2.618930
sample estimates:
mean of x mean of y
33.60759 37.56123
#I ran a hypothesis test to check whether attrition is related to age, so that any claim that departures happened for pragmatic reasons rather than ageism rests on evidence. Because the p-value (1.38e-08) is less than 0.05, there is a statistically significant difference between the mean ages of the two samples. Those who left were younger on average than those who stayed, as the sample estimates at the bottom of the output show (about 33.6 vs. 37.6 years). Note that x is the first vector we passed in (yes_age) and y is the second (no_age). The confidence interval confirms this: both of its bounds are negative, so we can say with 95% confidence that the mean of the first group is lower than the mean of the second.
# Simple linear regression: monthly income explained by age alone
model1 <- lm(formula = MonthlyIncome ~ Age, data = hrdata)
# Report coefficients, residual quantiles and fit statistics
summary(object = model1)
Call:
lm(formula = MonthlyIncome ~ Age, data = hrdata)
Residuals:
Min 1Q Median 3Q Max
-9990.1 -2592.7 -677.9 1810.5 12540.8
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -2970.67 443.70 -6.695 3.06e-11 ***
Age 256.57 11.67 21.995 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 4084 on 1468 degrees of freedom
Multiple R-squared: 0.2479, Adjusted R-squared: 0.2473
F-statistic: 483.8 on 1 and 1468 DF, p-value: < 2.2e-16
# Scatter plot of monthly income against age
plot(
  MonthlyIncome ~ Age,
  data = hrdata,
  xlab = "Age",
  ylab = "Monthly Income",
  main = "Monthly Income vs. Age"
)
# Overlay the fitted line from model1 in red
abline(reg = model1, col = "red")

# ggplot2 is needed for the violin plots (a more detailed distribution
# visualization). Install it only when it is missing, so re-running the
# script does not download the package again every time.
if (!requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages("ggplot2")
}
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/ggplot2_3.5.1.tgz'
Content type 'application/x-gzip' length 4974305 bytes (4.7 MB)
==================================================
downloaded 4.7 MB
The downloaded binary packages are in
/var/folders/x3/54ss49b16vj4lr0y1qs2rk8w0000gn/T//RtmpPh1IO9/downloaded_packages
# Install readr only when it is missing, so re-running the script does not
# download the package again every time.
# NOTE(review): the visible script loads its data with read.csv(), so readr
# may be unused -- confirm before dropping it.
if (!requireNamespace("readr", quietly = TRUE)) {
  install.packages("readr")
}
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/readr_2.1.5.tgz'
Content type 'application/x-gzip' length 1970418 bytes (1.9 MB)
==================================================
downloaded 1.9 MB
The downloaded binary packages are in
/var/folders/x3/54ss49b16vj4lr0y1qs2rk8w0000gn/T//RtmpPh1IO9/downloaded_packages
library(ggplot2)
library(readr)
# Violin plot of monthly income per attrition group, with a narrow
# white-filled box plot drawn on top of each violin
ggplot(data = hrdata, mapping = aes(Attrition, MonthlyIncome, fill = Attrition)) +
  geom_violin() +
  geom_boxplot(fill = "white", width = 0.1) +
  ggtitle("Monthly Income vs. Attrition Rates") +
  xlab("Attrition") +
  ylab("Monthly Income")

#Shape of Distribution: For the "No Attrition" group, the violin plot is slightly right-skewed, indicating that a majority of employees who stayed have lower to moderate monthly incomes, with a smaller proportion having higher incomes. For the "Yes Attrition" group, the violin plot appears more symmetrical, suggesting a more even distribution of monthly incomes among employees who have left the company.
#Central Tendency: The median monthly income (indicated by the horizontal line within the white box) appears to be higher for the "No Attrition" group than for the "Yes Attrition" group. This suggests that employees with higher monthly incomes are less likely to leave the company.
#Spread: The interquartile range (IQR, represented by the box) is wider for the "No Attrition" group, indicating greater variability in monthly incomes among employees who stay with the company. The "Yes Attrition" group shows a narrower IQR, suggesting a more concentrated range of monthly incomes among those who leave.
#Outliers: There appear to be a few potential outliers at the higher end of the income range for both groups, but they are more prominent in the "No Attrition" group.
#The violin plot suggests a potential relationship between monthly income and employee attrition. Employees with higher monthly incomes tend to stay with the company more often, while those with lower to moderate incomes are more likely to leave. The wider spread of incomes among those who stay might indicate that the company offers a wider range of compensation packages to retain its higher-earning employees.
#Looking at the box plot, we can see that the employees who left or were let go from IBM have a median income of about $4500 per month, while the employees who stayed have a median income of about $5000. Furthermore, the employees who stayed with IBM have 25% and 75% quartile values that are higher than the 25% and 75% quartile values of the employees who left IBM. There are also some anomalies displayed in this graph: a few employees above the 75% quartile, with an income of about $20000 a month, still left the company. This analysis is consistent with my assumption that people with a higher monthly income tend to stay. However, down the line, I want to find out what causes these employees to leave or be let go.
# Same kind of view for age: violin plot of Age per attrition group, with a
# narrow white-filled box plot drawn on top of each violin
ggplot(data = hrdata, mapping = aes(Attrition, Age, fill = Attrition)) +
  geom_violin() +
  geom_boxplot(fill = "white", width = 0.1) +
  ggtitle("Age vs. Attrition Rates") +
  xlab("Attrition") +
  ylab("Age")

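#You can see that most of the employees who left or were let go were around their early thirties.
#To check whether this age difference is statistically significant, we use a hypothesis test called the Welch Two Sample t-test (see the t.test() call on yes_age and no_age).
#Having looked at the relationships between age and attrition, and between monthly income and attrition, the next step is to model monthly income with more than one predictor.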
# Multiple linear regression: predict MonthlyIncome from two x variables
# (Age and TotalWorkingYears) at once
model2 <- lm(MonthlyIncome ~ Age + TotalWorkingYears, data = hrdata)
# Report coefficients, residual quantiles and fit statistics
summary(model2)
Call:
lm(formula = MonthlyIncome ~ Age + TotalWorkingYears, data = hrdata)
Residuals:
Min 1Q Median 3Q Max
-11310.8 -1690.8 -91.4 1428.3 11461.5
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1978.08 352.36 5.614 2.36e-08 ***
Age -26.87 11.63 -2.311 0.021 *
TotalWorkingYears 489.13 13.65 35.824 < 2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 2984 on 1467 degrees of freedom
Multiple R-squared: 0.5988, Adjusted R-squared: 0.5983
F-statistic: 1095 on 2 and 1467 DF, p-value: < 2.2e-16
#The adjusted R-squared increased substantially once TotalWorkingYears was added as a second x variable, from 0.2473 to 0.5983. The p-value of the overall F-test is below 2.2e-16, far smaller than 0.05, which means the relationship is statistically significant.
library(ggplot2)
# Grouped (dodged) bar chart counting employees at each job-satisfaction
# level within every job role
ggplot(data = hrdata, mapping = aes(x = JobRole, fill = factor(JobSatisfaction))) +
  geom_bar(position = "dodge") +
  # Swap the axes so the bars run horizontally
  coord_flip() +
  ggtitle("Job Satisfaction Distribution by Job Role") +
  xlab("Job Role") +
  ylab("Satisfaction Count") +
  # Legend title
  scale_fill_discrete(name = "Job Satisfaction") +
  theme_bw()

library(ggplot2)
# Horizontal violin plot of DistanceFromHome for each job role in hrdata
ggplot(data = hrdata, mapping = aes(JobRole, DistanceFromHome, fill = JobRole)) +
  geom_violin() +
  # Swap the axes so the violins run horizontally
  coord_flip() +
  ggtitle("Distance From Home by Job Role") +
  xlab("Job Role") +
  ylab("Distance From Home") +
  theme_bw() +
  # Hide the legend: each job role is already labelled on the axis
  theme(legend.position = "none")

library(ggplot2)
library(RColorBrewer)
# Mean YearsSinceLastPromotion for every job role
hrdata_summary <- aggregate(
  YearsSinceLastPromotion ~ JobRole,
  FUN = mean,
  data = hrdata
)
# Horizontal bar chart of those per-role means
ggplot(data = hrdata_summary, mapping = aes(YearsSinceLastPromotion, JobRole)) +
  geom_col(fill = "salmon") +
  ggtitle("Years Since Last Promotion by Job Role") +
  xlab("Average Years Since Last Promotion") +
  ylab("Job Role") +
  theme_bw()

library(ggplot2)
# Grouped (dodged) bar chart counting employees per Attrition value within
# every job role
ggplot(data = hrdata, mapping = aes(x = JobRole, fill = factor(Attrition))) +
  geom_bar(position = "dodge") +
  # Swap the axes so the bars run horizontally
  coord_flip() +
  ggtitle("Attrition Count by Job Role") +
  xlab("Job Role") +
  ylab("Attrition Count") +
  # Legend title
  scale_fill_discrete(name = "Attrition") +
  theme_bw()

library(ggplot2)
# Grouped (dodged) bar chart counting employees per OverTime value within
# every job role
ggplot(data = hrdata, mapping = aes(x = JobRole, fill = factor(OverTime))) +
  geom_bar(position = "dodge") +
  # Swap the axes so the bars run horizontally
  coord_flip() +
  ggtitle("Overtime Count by Job Role") +
  xlab("Job Role") +
  ylab("Overtime Count") +
  # Legend title
  scale_fill_discrete(name = "Overtime") +
  theme_bw()

# Encode three categorical columns as 0/1 indicator columns:
# 1 when the value equals the named level, 0 otherwise
hrdata$Attrition_values <- as.numeric(hrdata$Attrition == "Yes")
hrdata$Overtime_values <- as.numeric(hrdata$OverTime == "Yes")
hrdata$Gender_values <- as.numeric(hrdata$Gender == "Female")
# Logistic regression (binomial family) of the attrition indicator on the
# gender and overtime indicators
model_logistic <- glm(
  formula = Attrition_values ~ Gender_values + Overtime_values,
  family = binomial,
  data = hrdata
)
# Report the fitted coefficients and deviance statistics
summary(object = model_logistic)
Call:
glm(formula = Attrition_values ~ Gender_values + Overtime_values,
family = binomial, data = hrdata)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) -2.0629 0.1135 -18.182 <2e-16 ***
Gender_values -0.2378 0.1517 -1.568 0.117
Overtime_values 1.3408 0.1471 9.117 <2e-16 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 1298.6 on 1469 degrees of freedom
Residual deviance: 1214.7 on 1467 degrees of freedom
AIC: 1220.7
Number of Fisher Scoring iterations: 4
# Exponentiate the fitted coefficients and keep the Overtime_values entry,
# i.e. the odds ratio associated with the overtime indicator
odds_ratio <- exp(coef(model_logistic))["Overtime_values"]
print(x = odds_ratio)
Overtime_values
3.821998
# Report the odds ratio as a labelled one-line message
cat(
  "Odds Ratio for Attrition rates based on Overtime responses:",
  odds_ratio,
  "\n",
  sep = " "
)
Odds Ratio for Attrition rates based on Overtime responses: 3.821998
