The data set used in this project is the IBM HR Analytics Employee Attrition & Performance data set from Kaggle. It contains data on 1,470 employees with 35 variables related to demographics, job satisfaction, performance, and work conditions. The main objective is to analyze factors influencing employee attrition and identify patterns that contribute to turnover. This analysis includes data cleaning, visualization, and predictive modeling to help HR teams understand and reduce employee attrition.
# Load essential packages
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(readxl)
library(readr)
# load the data set
IBM_HR_Employee_Attrition <- read_csv("C:/Users/san82/OneDrive/Desktop/R-PROGRAMMING/IBM HR-Employee-Attrition.csv",show_col_types = FALSE)
# display first few rows
head(IBM_HR_Employee_Attrition)
## # A tibble: 6 × 35
## Age Attrition BusinessTravel DailyRate Department DistanceFromHome Education
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 41 Yes Travel_Rarely 1102 Sales 1 2
## 2 49 No Travel_Freque… 279 Research … 8 1
## 3 37 Yes Travel_Rarely 1373 Research … 2 2
## 4 33 No Travel_Freque… 1392 Research … 3 4
## 5 27 No Travel_Rarely 591 Research … 2 1
## 6 32 No Travel_Freque… 1005 Research … 2 2
## # ℹ 28 more variables: EducationField <chr>, EmployeeCount <dbl>,
## # EmployeeNumber <dbl>, EnvironmentSatisfaction <dbl>, Gender <chr>,
## # HourlyRate <dbl>, JobInvolvement <dbl>, JobLevel <dbl>, JobRole <chr>,
## # JobSatisfaction <dbl>, MaritalStatus <chr>, MonthlyIncome <dbl>,
## # MonthlyRate <dbl>, NumCompaniesWorked <dbl>, Over18 <chr>, OverTime <chr>,
## # PercentSalaryHike <dbl>, PerformanceRating <dbl>,
## # RelationshipSatisfaction <dbl>, StandardHours <dbl>, …
Interpretation: The IBM HR Employee Attrition dataset was successfully loaded, and the first few rows were displayed. This provides an initial understanding of the variables, data types, and general content of the dataset, which focuses on employee demographics, work experience, and attrition-related information.
str(IBM_HR_Employee_Attrition)
## spc_tbl_ [1,470 × 35] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ Age : num [1:1470] 41 49 37 33 27 32 59 30 38 36 ...
## $ Attrition : chr [1:1470] "Yes" "No" "Yes" "No" ...
## $ BusinessTravel : chr [1:1470] "Travel_Rarely" "Travel_Frequently" "Travel_Rarely" "Travel_Frequently" ...
## $ DailyRate : num [1:1470] 1102 279 1373 1392 591 ...
## $ Department : chr [1:1470] "Sales" "Research & Development" "Research & Development" "Research & Development" ...
## $ DistanceFromHome : num [1:1470] 1 8 2 3 2 2 3 24 23 27 ...
## $ Education : num [1:1470] 2 1 2 4 1 2 3 1 3 3 ...
## $ EducationField : chr [1:1470] "Life Sciences" "Life Sciences" "Other" "Life Sciences" ...
## $ EmployeeCount : num [1:1470] 1 1 1 1 1 1 1 1 1 1 ...
## $ EmployeeNumber : num [1:1470] 1 2 4 5 7 8 10 11 12 13 ...
## $ EnvironmentSatisfaction : num [1:1470] 2 3 4 4 1 4 3 4 4 3 ...
## $ Gender : chr [1:1470] "Female" "Male" "Male" "Female" ...
## $ HourlyRate : num [1:1470] 94 61 92 56 40 79 81 67 44 94 ...
## $ JobInvolvement : num [1:1470] 3 2 2 3 3 3 4 3 2 3 ...
## $ JobLevel : num [1:1470] 2 2 1 1 1 1 1 1 3 2 ...
## $ JobRole : chr [1:1470] "Sales Executive" "Research Scientist" "Laboratory Technician" "Research Scientist" ...
## $ JobSatisfaction : num [1:1470] 4 2 3 3 2 4 1 3 3 3 ...
## $ MaritalStatus : chr [1:1470] "Single" "Married" "Single" "Married" ...
## $ MonthlyIncome : num [1:1470] 5993 5130 2090 2909 3468 ...
## $ MonthlyRate : num [1:1470] 19479 24907 2396 23159 16632 ...
## $ NumCompaniesWorked : num [1:1470] 8 1 6 1 9 0 4 1 0 6 ...
## $ Over18 : chr [1:1470] "Y" "Y" "Y" "Y" ...
## $ OverTime : chr [1:1470] "Yes" "No" "Yes" "Yes" ...
## $ PercentSalaryHike : num [1:1470] 11 23 15 11 12 13 20 22 21 13 ...
## $ PerformanceRating : num [1:1470] 3 4 3 3 3 3 4 4 4 3 ...
## $ RelationshipSatisfaction: num [1:1470] 1 4 2 3 4 3 1 2 2 2 ...
## $ StandardHours : num [1:1470] 80 80 80 80 80 80 80 80 80 80 ...
## $ StockOptionLevel : num [1:1470] 0 1 0 0 1 0 3 1 0 2 ...
## $ TotalWorkingYears : num [1:1470] 8 10 7 8 6 8 12 1 10 17 ...
## $ TrainingTimesLastYear : num [1:1470] 0 3 3 3 3 2 3 2 2 3 ...
## $ WorkLifeBalance : num [1:1470] 1 3 3 3 3 2 2 3 3 2 ...
## $ YearsAtCompany : num [1:1470] 6 10 0 8 2 7 1 1 9 7 ...
## $ YearsInCurrentRole : num [1:1470] 4 7 0 7 2 7 0 0 7 7 ...
## $ YearsSinceLastPromotion : num [1:1470] 0 1 0 3 2 3 0 0 1 7 ...
## $ YearsWithCurrManager : num [1:1470] 5 7 0 0 2 6 0 0 8 7 ...
## - attr(*, "spec")=
## .. cols(
## .. Age = col_double(),
## .. Attrition = col_character(),
## .. BusinessTravel = col_character(),
## .. DailyRate = col_double(),
## .. Department = col_character(),
## .. DistanceFromHome = col_double(),
## .. Education = col_double(),
## .. EducationField = col_character(),
## .. EmployeeCount = col_double(),
## .. EmployeeNumber = col_double(),
## .. EnvironmentSatisfaction = col_double(),
## .. Gender = col_character(),
## .. HourlyRate = col_double(),
## .. JobInvolvement = col_double(),
## .. JobLevel = col_double(),
## .. JobRole = col_character(),
## .. JobSatisfaction = col_double(),
## .. MaritalStatus = col_character(),
## .. MonthlyIncome = col_double(),
## .. MonthlyRate = col_double(),
## .. NumCompaniesWorked = col_double(),
## .. Over18 = col_character(),
## .. OverTime = col_character(),
## .. PercentSalaryHike = col_double(),
## .. PerformanceRating = col_double(),
## .. RelationshipSatisfaction = col_double(),
## .. StandardHours = col_double(),
## .. StockOptionLevel = col_double(),
## .. TotalWorkingYears = col_double(),
## .. TrainingTimesLastYear = col_double(),
## .. WorkLifeBalance = col_double(),
## .. YearsAtCompany = col_double(),
## .. YearsInCurrentRole = col_double(),
## .. YearsSinceLastPromotion = col_double(),
## .. YearsWithCurrManager = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
summary(IBM_HR_Employee_Attrition)
## Age Attrition BusinessTravel DailyRate
## Min. :18.00 Length:1470 Length:1470 Min. : 102.0
## 1st Qu.:30.00 Class :character Class :character 1st Qu.: 465.0
## Median :36.00 Mode :character Mode :character Median : 802.0
## Mean :36.92 Mean : 802.5
## 3rd Qu.:43.00 3rd Qu.:1157.0
## Max. :60.00 Max. :1499.0
## Department DistanceFromHome Education EducationField
## Length:1470 Min. : 1.000 Min. :1.000 Length:1470
## Class :character 1st Qu.: 2.000 1st Qu.:2.000 Class :character
## Mode :character Median : 7.000 Median :3.000 Mode :character
## Mean : 9.193 Mean :2.913
## 3rd Qu.:14.000 3rd Qu.:4.000
## Max. :29.000 Max. :5.000
## EmployeeCount EmployeeNumber EnvironmentSatisfaction Gender
## Min. :1 Min. : 1.0 Min. :1.000 Length:1470
## 1st Qu.:1 1st Qu.: 491.2 1st Qu.:2.000 Class :character
## Median :1 Median :1020.5 Median :3.000 Mode :character
## Mean :1 Mean :1024.9 Mean :2.722
## 3rd Qu.:1 3rd Qu.:1555.8 3rd Qu.:4.000
## Max. :1 Max. :2068.0 Max. :4.000
## HourlyRate JobInvolvement JobLevel JobRole
## Min. : 30.00 Min. :1.00 Min. :1.000 Length:1470
## 1st Qu.: 48.00 1st Qu.:2.00 1st Qu.:1.000 Class :character
## Median : 66.00 Median :3.00 Median :2.000 Mode :character
## Mean : 65.89 Mean :2.73 Mean :2.064
## 3rd Qu.: 83.75 3rd Qu.:3.00 3rd Qu.:3.000
## Max. :100.00 Max. :4.00 Max. :5.000
## JobSatisfaction MaritalStatus MonthlyIncome MonthlyRate
## Min. :1.000 Length:1470 Min. : 1009 Min. : 2094
## 1st Qu.:2.000 Class :character 1st Qu.: 2911 1st Qu.: 8047
## Median :3.000 Mode :character Median : 4919 Median :14236
## Mean :2.729 Mean : 6503 Mean :14313
## 3rd Qu.:4.000 3rd Qu.: 8379 3rd Qu.:20462
## Max. :4.000 Max. :19999 Max. :26999
## NumCompaniesWorked Over18 OverTime PercentSalaryHike
## Min. :0.000 Length:1470 Length:1470 Min. :11.00
## 1st Qu.:1.000 Class :character Class :character 1st Qu.:12.00
## Median :2.000 Mode :character Mode :character Median :14.00
## Mean :2.693 Mean :15.21
## 3rd Qu.:4.000 3rd Qu.:18.00
## Max. :9.000 Max. :25.00
## PerformanceRating RelationshipSatisfaction StandardHours StockOptionLevel
## Min. :3.000 Min. :1.000 Min. :80 Min. :0.0000
## 1st Qu.:3.000 1st Qu.:2.000 1st Qu.:80 1st Qu.:0.0000
## Median :3.000 Median :3.000 Median :80 Median :1.0000
## Mean :3.154 Mean :2.712 Mean :80 Mean :0.7939
## 3rd Qu.:3.000 3rd Qu.:4.000 3rd Qu.:80 3rd Qu.:1.0000
## Max. :4.000 Max. :4.000 Max. :80 Max. :3.0000
## TotalWorkingYears TrainingTimesLastYear WorkLifeBalance YearsAtCompany
## Min. : 0.00 Min. :0.000 Min. :1.000 Min. : 0.000
## 1st Qu.: 6.00 1st Qu.:2.000 1st Qu.:2.000 1st Qu.: 3.000
## Median :10.00 Median :3.000 Median :3.000 Median : 5.000
## Mean :11.28 Mean :2.799 Mean :2.761 Mean : 7.008
## 3rd Qu.:15.00 3rd Qu.:3.000 3rd Qu.:3.000 3rd Qu.: 9.000
## Max. :40.00 Max. :6.000 Max. :4.000 Max. :40.000
## YearsInCurrentRole YearsSinceLastPromotion YearsWithCurrManager
## Min. : 0.000 Min. : 0.000 Min. : 0.000
## 1st Qu.: 2.000 1st Qu.: 0.000 1st Qu.: 2.000
## Median : 3.000 Median : 1.000 Median : 3.000
## Mean : 4.229 Mean : 2.188 Mean : 4.123
## 3rd Qu.: 7.000 3rd Qu.: 3.000 3rd Qu.: 7.000
## Max. :18.000 Max. :15.000 Max. :17.000
Interpretation: The structure (str()) of the dataset confirms that it contains both numerical and categorical variables such as Age, JobRole, and MonthlyIncome. The summary (summary()) provides statistical details, including minimum, median, mean, and maximum values, helping to identify patterns like salary distribution or potential outliers. ———————————————–
cat("Number of Rows: ", nrow(IBM_HR_Employee_Attrition), "\n")
## Number of Rows: 1470
cat("Number of Columns: ", ncol(IBM_HR_Employee_Attrition))
## Number of Columns: 35
Interpretation: The dataset’s dimensions reveal the number of rows and columns, showing how many employees (observations) and variables (features) are included. This helps gauge the dataset’s size and suitability for statistical and predictive analysis.
IBM_HR_Employee_Attrition %>%
summarise(across(where(is.numeric),
list(
mean = ~mean(.x, na.rm = TRUE),
median = ~median(.x, na.rm = TRUE),
sd = ~sd(.x, na.rm = TRUE)
)))
## # A tibble: 1 × 78
## Age_mean Age_median Age_sd DailyRate_mean DailyRate_median DailyRate_sd
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 36.9 36 9.14 802. 802 404.
## # ℹ 72 more variables: DistanceFromHome_mean <dbl>,
## # DistanceFromHome_median <dbl>, DistanceFromHome_sd <dbl>,
## # Education_mean <dbl>, Education_median <dbl>, Education_sd <dbl>,
## # EmployeeCount_mean <dbl>, EmployeeCount_median <dbl>,
## # EmployeeCount_sd <dbl>, EmployeeNumber_mean <dbl>,
## # EmployeeNumber_median <dbl>, EmployeeNumber_sd <dbl>,
## # EnvironmentSatisfaction_mean <dbl>, EnvironmentSatisfaction_median <dbl>, …
Interpretation: The computed mean, median, and standard deviation for Age show the central tendency and variability among employees. A close mean and median indicate a symmetric distribution, while a large standard deviation suggests wider diversity in employee ages.
IBM_HR_Employee_Attrition$JobInvolvement_cat <- factor(IBM_HR_Employee_Attrition$JobInvolvement,
levels = c(1, 2, 3, 4),
labels = c("Low", "Medium", "High", "Very High"))
IBM_HR_Employee_Attrition$JobSatisfaction_cat <- factor(IBM_HR_Employee_Attrition$JobSatisfaction,
levels = c(1, 2, 3, 4),
labels = c("Low", "Medium", "High", "Very High"))
IBM_HR_Employee_Attrition$PerformanceRating_cat <- factor(IBM_HR_Employee_Attrition$PerformanceRating,
levels = c(1, 2, 3, 4),
labels = c("Low", "Good", "Excellent", "Outstanding"))
# View first few rows to confirm the new columns
head(IBM_HR_Employee_Attrition[c("JobInvolvement_cat",
"JobSatisfaction_cat",
"PerformanceRating_cat")])
## # A tibble: 6 × 3
## JobInvolvement_cat JobSatisfaction_cat PerformanceRating_cat
## <fct> <fct> <fct>
## 1 High Very High Excellent
## 2 Medium Medium Outstanding
## 3 Medium High Excellent
## 4 High High Excellent
## 5 High Medium Excellent
## 6 High Very High Excellent
Interpretation: Numeric rating columns such as JobInvolvement, JobSatisfaction, and PerformanceRating were converted into categorical factors (Low, Medium, High, Very High). This transformation enhances interpretability, making future visualizations and statistical analyses more meaningful by representing employee engagement and satisfaction levels in clearer, qualitative terms.
Question 1: Does employee income have any relationship with attrition? Are lower or higher-income employees more likely to leave the company?
ggplot(IBM_HR_Employee_Attrition, aes(x = MonthlyIncome, fill = Attrition)) +
geom_density(alpha = 0.5) +
labs(title = "Density Plot of Monthly Income by Attrition",
x = "Monthly Income", y = "Density") +
theme_minimal()
Interpretation:
Question 2: How are age, income, and experience-related variables correlated? Do employees who have been at the company longer or have higher income show less attrition?
hr_num <- IBM_HR_Employee_Attrition[, c("Age", "MonthlyIncome", "DistanceFromHome",
"TotalWorkingYears", "YearsAtCompany")]
pairs(hr_num,
main = "Pair Plot of IBM HR Employee Features",
pch = 21,
bg = c("red", "green3")[as.factor(IBM_HR_Employee_Attrition$Attrition)],
labels = c("Age", "MonthlyIncome", "DistanceFromHome", "TotalWorkingYears", "YearsAtCompany"))
Interpretation: The pair plot visualizes relationships among numeric variables (Age, MonthlyIncome, DistanceFromHome, TotalWorkingYears, YearsAtCompany).
Question 3:Does the time since last promotion influence attrition? How does pay rate interact with promotion frequency?
ggplot(IBM_HR_Employee_Attrition, aes(x = YearsSinceLastPromotion, y = MonthlyRate)) +
geom_point(aes(color = Department), alpha = 0.7, size = 2) +
facet_wrap(~Attrition) +
labs(title = "Years Since Last Promotion vs Monthly Rate by Attrition",
x = "Years Since Last Promotion",
y = "Monthly Rate") +
theme_minimal()
Interpretation:
The facet plot separates employees by attrition.
In the Attrition = Yes group, employees often have
longer years since last promotion with moderate
monthly rates, implying that delayed
promotions may increase attrition.
Employees with frequent promotions and better pay are
less likely to resign.
Question 4:Which job roles experience the most employee turnover?
ggplot(IBM_HR_Employee_Attrition, aes(x = JobRole, fill = Attrition)) +
geom_bar(position = "dodge") +
labs(title = "Attrition by Job Role",
x = "Job Role", y = "Count") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1))
Interpretation:
The bar chart compares attrition counts across job roles.
Sales Representatives and Laboratory Technicians may have higher
attrition, possibly due to job pressure or limited growth
opportunities.
These roles could be prioritized for retention programs..
Question 5:How does work-life balance affect attrition rates?
ggplot(IBM_HR_Employee_Attrition, aes(x = WorkLifeBalance, fill = Attrition)) +
geom_bar(position = "fill", color = "black") + # "fill" makes bars show proportions
labs(
title = "Work-Life Balance vs Attrition",
x = "Work-Life Balance (1 = Poor, 4 = Excellent)",
y = "Proportion of Employees"
) +
scale_fill_manual(values = c("orange", "turquoise")) + # simpler colors
theme_minimal()
Interpretation:
The stacked bar chart shows attrition proportions by WorkLifeBalance
ratings.
Employees with poor (1) or average (2) work-life balance show higher
attrition, while those rating **3 or 4 tend to stay longer.
Encouraging flexible schedules and reduced stress can improve
retention.
Question: Does staying longer in a company mean you stay longer in the same role?
model_slr <- lm(YearsInCurrentRole ~ YearsAtCompany, data = IBM_HR_Employee_Attrition)
summary(model_slr)
##
## Call:
## lm(formula = YearsInCurrentRole ~ YearsAtCompany, data = IBM_HR_Employee_Attrition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -12.7512 -1.3282 -0.3282 1.4283 7.9411
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.08458 0.09357 11.59 <2e-16 ***
## YearsAtCompany 0.44872 0.01005 44.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.361 on 1468 degrees of freedom
## Multiple R-squared: 0.5757, Adjusted R-squared: 0.5754
## F-statistic: 1992 on 1 and 1468 DF, p-value: < 2.2e-16
ggplot(IBM_HR_Employee_Attrition, aes(x = YearsAtCompany, y = YearsInCurrentRole)) +
geom_point(color = "purple", alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE, color = "orange") +
labs(
title = "Simple Linear Regression: Years in Current Role vs Years at Company",
x = "Years at Company",
y = "Years in Current Role"
) +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
Interpretation: The simple linear regression analysis shows a strong positive relationship between the number of years an employee has spent at the company and the number of years they have been in their current role. The regression coefficient for YearsAtCompany is 0.448, meaning that for every additional year an employee stays in the company, they tend to spend about 0.45 more years in the same role. This relationship is statistically significant, as indicated by the very small p-value (< 2e-16). The model explains about 58% of the variation in YearsInCurrentRole (R² = 0.5757), which indicates a strong fit for HR-related data. Overall, the results suggest that employees who stay longer in the company generally remain longer in their current roles, reflecting career stability but possibly also slower role rotation or promotion rates.
Question: How does Monthly Income change with Job Level?
# Fit polynomial regression model (degree = 2)
model_poly_income <- lm(MonthlyIncome ~ poly(JobLevel, 2, raw = TRUE),
data = IBM_HR_Employee_Attrition)
# Model summary
summary(model_poly_income)
##
## Call:
## lm(formula = MonthlyIncome ~ poly(JobLevel, 2, raw = TRUE), data = IBM_HR_Employee_Attrition)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4494.1 -802.0 -146.7 717.0 4227.7
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 381.29 152.05 2.508 0.0123 *
## poly(JobLevel, 2, raw = TRUE)1 1868.33 133.72 13.971 <2e-16 ***
## poly(JobLevel, 2, raw = TRUE)2 413.09 24.69 16.729 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1344 on 1467 degrees of freedom
## Multiple R-squared: 0.9186, Adjusted R-squared: 0.9185
## F-statistic: 8277 on 2 and 1467 DF, p-value: < 2.2e-16
# Plot the relationship
plot(IBM_HR_Employee_Attrition$JobLevel, IBM_HR_Employee_Attrition$MonthlyIncome,
pch = 20, col = "skyblue",
main = "Polynomial Regression: Monthly Income vs Job Level",
xlab = "Job Level",
ylab = "Monthly Income")
# Add polynomial regression line
lines(sort(IBM_HR_Employee_Attrition$JobLevel),
predict(model_poly_income,
newdata = data.frame(JobLevel = sort(IBM_HR_Employee_Attrition$JobLevel))),
col = "red", lwd = 2)
# Prediction for new Job Levels
new_data <- data.frame(JobLevel = c(1, 3, 5))
predict_poly_income <- predict(model_poly_income, newdata = new_data)
predict_poly_income
## 1 2 3
## 2662.703 9704.056 20050.110
Interpretation: The scatter plot shows the distribution of Monthly Income across Job Levels._ The red curve represents a polynomial relationship (degree 2) capturing non-linear trends. Predictions example: MonthlyIncome at Job Levels 1, 3, and 5 are r predict_poly_income.
Question: Can we predict employee attrition based on Age, MonthlyIncome, DistanceFromHome, and YearsAtCompany?
# Select only relevant columns for ANOVA
hr_data <- data.frame(
JobRole = IBM_HR_Employee_Attrition$JobRole,
MonthlyIncome = IBM_HR_Employee_Attrition$MonthlyIncome
)
# Step 3: Group-wise summary (mean, sd, count)
group_summary <- hr_data %>%
group_by(JobRole) %>%
summarise(
mean = mean(MonthlyIncome),
sd = sd(MonthlyIncome),
count = n()
)
print("Group-wise summary (Monthly Income by Job Role):")
## [1] "Group-wise summary (Monthly Income by Job Role):"
print(group_summary)
## # A tibble: 9 × 4
## JobRole mean sd count
## <chr> <dbl> <dbl> <int>
## 1 Healthcare Representative 7529. 2543. 131
## 2 Human Resources 4236. 2439. 52
## 3 Laboratory Technician 3237. 1150. 259
## 4 Manager 17182. 2317. 102
## 5 Manufacturing Director 7295. 2677. 145
## 6 Research Director 16034. 2828. 80
## 7 Research Scientist 3240. 1198. 292
## 8 Sales Executive 6924. 2367. 326
## 9 Sales Representative 2626 855. 83
# Step 4: Perform One-Way ANOVA
anova_model <- aov(MonthlyIncome ~ JobRole, data = hr_data)
anova_res <- summary(anova_model)
print("ANOVA Table:")
## [1] "ANOVA Table:"
print(anova_res)
## Df Sum Sq Mean Sq F value Pr(>F)
## JobRole 8 2.657e+10 3.321e+09 810.2 <2e-16 ***
## Residuals 1461 5.989e+09 4.099e+06
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Step 5: Compare calculated F with critical F
f_calculated <- anova_res[[1]]$`F value`[1]
f_critical <- 2.10 # Approximate F-critical for df(8, 1460) at α = 0.05
cat("\nCalculated F =", f_calculated)
##
## Calculated F = 810.2141
cat("\nCritical F =", f_critical)
##
## Critical F = 2.1
if (f_calculated > f_critical) {
cat("\nDecision: Reject H0 → Significant difference in mean Monthly Income among Job Roles.\n")
} else {
cat("\nDecision: Fail to Reject H0 → No significant difference in mean Monthly Income.\n")
}
##
## Decision: Reject H0 → Significant difference in mean Monthly Income among Job Roles.
# Step 6: Box plot visualization
boxplot(MonthlyIncome ~ JobRole, data = hr_data,
main = "Monthly Income Comparison Across Job Roles",
ylab = "Monthly Income",
xlab = "Job Role",
col = c("skyblue", "lightgreen", "lightpink", "lightyellow", "orange", "lightgray", "lightblue", "plum", "tan"),
las = 1, # rotate x-axis labels
cex.axis = 0.8, # smaller tick labels
cex.lab = 1.1, # axis label size
cex.main = 1.2) #title size
Interpretation: The boxplot shows clear salary differences across job roles. Manager and Research Director roles have the highest monthly incomes with wide variation, whereas Healthcare Representatives, Sales Executives, and similar junior roles have much lower income ranges.
Each point represents a test employee with actual and predicted attrition. Color = predicted label, Shape = actual label. Confusion matrix shows correct and incorrect predictions. Model accuracy for k = 5: r round(accuracy * 100, 2)%.
Hypotheses
Null Hypothesis (H)₀: There is no significant difference in the average Monthly Income among the different Job Roles.
Alternative Hypothesis (H₁): There is a significant difference in the average Monthly Income for at least one Job Role.
F_calculated > F_critical
Reject the Null Hypothesis (H₀) Accept the Alternative Hypothesis (H₁)
Question: Can we predict employee attrition based on Age, MonthlyIncome, DistanceFromHome, and YearsAtCompany?
library(caret)
## Loading required package: lattice
library(class)
# Use only first 100 rows
IBM_HR_EMPLOYEE_ATTRITION <- IBM_HR_Employee_Attrition[1:100, ]
#3. Select numeric columns for normalization
hr_data <- IBM_HR_Employee_Attrition[, c("Age", "MonthlyIncome", "DistanceFromHome", "YearsAtCompany")]
hr_data$Attrition <- IBM_HR_Employee_Attrition$Attrition
#4. Normalize numeric values
normalize <- function(x){
return((x - min(x)) / (max(x) - min(x)))
}
hr_norm <- as.data.frame(lapply(hr_data[, 1:4], normalize))
hr_norm$Attrition <- hr_data$Attrition
#5. Split data (80% training and 20% testing)
set.seed(123)
train_index <- createDataPartition(hr_norm$Attrition, p = 0.8, list = FALSE)
train_data <- hr_norm[train_index, 1:4]
test_data <- hr_norm[-train_index, 1:4]
train_labels <- hr_norm[train_index, 5]
test_labels <- hr_norm[-train_index, 5]
#6. Build and run KNN model
k_value <- 5
knn_pred <- knn(train = train_data, test = test_data, cl = train_labels, k = k_value)
#7. Evaluate model performance
conf_mat <- table(predicted = knn_pred, actual = test_labels)
print(conf_mat)
## actual
## predicted No Yes
## No 236 44
## Yes 10 3
accuracy <- sum(diag(conf_mat)) / sum(conf_mat)
cat("\nModel Accuracy (k =", k_value, "):", round(accuracy * 100, 2), "%\n")
##
## Model Accuracy (k = 5 ): 81.57 %
#8. Visualization of predictions
results <- cbind(test_data, actual = test_labels, predicted = knn_pred)
ggplot(results, aes(x = MonthlyIncome, y = Age)) +
geom_point(aes(color = predicted, shape = actual), size = 4) +
labs(title = paste("KNN Classification Results (First 100 Rows, k =", k_value, ")"),
subtitle = "Color = Predicted Attrition | Shape = Actual Attrition",
x = "Monthly Income (Normalized)",
y = "Age (Normalized)") +
theme_minimal()
#9. Check accuracy for different k values
accuracy_results <- c()
for (k in 1:15) {
pred <- knn(train_data, test_data, train_labels, k = k)
acc <- sum(pred == test_labels) / length(test_labels)
accuracy_results <- c(accuracy_results, acc)
}
#10. Plot accuracy vs k values
plot(1:15, accuracy_results, type = "b", pch = 19,
xlab = "K Value", ylab = "Accuracy",
main = "KNN Accuracy for Different K Values (First 100 Rows)")
Interpretation: The scatter plot shows actual vs predicted attrition. Color indicates predicted labels; shape indicates actual labels. Accuracy for k = 5: r round(accuracy * 100, 2)%. High overlap indicates good model performance.