### Step 0: Why and what
### Step 1 and 2: Get and clean data
# Load the HR data set; the path is relative to the working directory.
hr <- read.csv(file = "hrdata.csv")
# Show column names, types and leading values of the loaded data.
str(object = hr)
## 'data.frame': 14999 obs. of 11 variables:
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ sales : chr "sales" "sales" "sales" "sales" ...
## $ salary : chr "low" "medium" "medium" "low" ...
# Clean columns: drop `X` (appears to be a row index) and rename `sales`
# to `dept`. `dept` is appended as a new last column rather than renamed in
# place, so it ends up after `salary`; later steps see that column order.
hr[["X"]] <- NULL
hr[["dept"]] <- hr[["sales"]]
hr[["sales"]] <- NULL
# Inspect the cleaned data; salary and dept are still character here.
str(object = hr)
## 'data.frame': 14999 obs. of 10 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ salary : chr "low" "medium" "medium" "low" ...
## $ dept : chr "sales" "sales" "sales" "sales" ...
# Factorize columns: convert the two character columns to factors in one go.
hr[c("salary", "dept")] <- lapply(hr[c("salary", "dept")], as.factor)
# Confirm that salary and dept are now factors.
str(object = hr)
## 'data.frame': 14999 obs. of 10 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : int 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : int 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : int 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : int 0 0 0 0 0 0 0 0 0 0 ...
## $ left : int 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: int 0 0 0 0 0 0 0 0 0 0 ...
## $ salary : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...
## $ dept : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...
# Summarise every column: quantiles for numerics, level counts for factors.
summary(object = hr)
## satisfaction_level last_evaluation number_project average_montly_hours
## Min. :0.0900 Min. :0.3600 Min. :2.000 Min. : 96.0
## 1st Qu.:0.4400 1st Qu.:0.5600 1st Qu.:3.000 1st Qu.:156.0
## Median :0.6400 Median :0.7200 Median :4.000 Median :200.0
## Mean :0.6128 Mean :0.7161 Mean :3.803 Mean :201.1
## 3rd Qu.:0.8200 3rd Qu.:0.8700 3rd Qu.:5.000 3rd Qu.:245.0
## Max. :1.0000 Max. :1.0000 Max. :7.000 Max. :310.0
##
## time_spend_company Work_accident left promotion_last_5years
## Min. : 2.000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.: 3.000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median : 3.000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean : 3.498 Mean :0.1446 Mean :0.2381 Mean :0.02127
## 3rd Qu.: 4.000 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :10.000 Max. :1.0000 Max. :1.0000 Max. :1.00000
##
## salary dept
## high :1237 sales :4140
## low :7316 technical :2720
## medium:6446 support :2229
## IT :1227
## product_mng: 902
## marketing : 858
## (Other) :2923
#Converting Factors into Dummies
# Expand the data frame into an all-numeric design matrix without an
# intercept column, then turn the matrix back into a data frame.
# NOTE(review): with the intercept removed, only the first factor (salary)
# keeps all of its levels; dept loses its first level (accounting), as the
# str() output shows. For a distance-based model this encodes the two
# factors unevenly -- confirm whether full one-hot encoding is wanted.
hr_dummies <- model.matrix(~ . - 1, data = hr) |>
  as.data.frame()
str(object = hr_dummies)
## 'data.frame': 14999 obs. of 20 variables:
## $ satisfaction_level : num 0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
## $ last_evaluation : num 0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
## $ number_project : num 2 5 7 5 2 2 6 5 5 2 ...
## $ average_montly_hours : num 157 262 272 223 159 153 247 259 224 142 ...
## $ time_spend_company : num 3 6 4 5 3 3 4 5 5 3 ...
## $ Work_accident : num 0 0 0 0 0 0 0 0 0 0 ...
## $ left : num 1 1 1 1 1 1 1 1 1 1 ...
## $ promotion_last_5years: num 0 0 0 0 0 0 0 0 0 0 ...
## $ salaryhigh : num 0 0 0 0 0 0 0 0 0 0 ...
## $ salarylow : num 1 0 0 1 1 1 1 1 1 1 ...
## $ salarymedium : num 0 1 1 0 0 0 0 0 0 0 ...
## $ depthr : num 0 0 0 0 0 0 0 0 0 0 ...
## $ deptIT : num 0 0 0 0 0 0 0 0 0 0 ...
## $ deptmanagement : num 0 0 0 0 0 0 0 0 0 0 ...
## $ deptmarketing : num 0 0 0 0 0 0 0 0 0 0 ...
## $ deptproduct_mng : num 0 0 0 0 0 0 0 0 0 0 ...
## $ deptRandD : num 0 0 0 0 0 0 0 0 0 0 ...
## $ deptsales : num 1 1 1 1 1 1 1 1 1 1 ...
## $ deptsupport : num 0 0 0 0 0 0 0 0 0 0 ...
## $ depttechnical : num 0 0 0 0 0 0 0 0 0 0 ...
#Normalize data
# Min-max normalization: linearly rescale a numeric vector to [0, 1].
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE (default), missing values are ignored when computing the
#          range and stay NA in the result. If FALSE, any NA in `x` makes the
#          whole result NA.
#
# Returns a double vector of the same length as `x`. A constant vector maps
# to 0 instead of NaN (0 / 0), so a zero-variance column cannot inject
# NaN values into later steps.
minmax <- function(x, na.rm = TRUE) {
  # Nothing to scale: empty input or no observed values at all.
  if (length(x) == 0L || all(is.na(x))) {
    return(as.numeric(x))
  }

  rng <- range(x, na.rm = na.rm)
  span <- rng[2] - rng[1]

  # Reached only when na.rm = FALSE and `x` contains missing values.
  if (is.na(span)) {
    return(rep(NA_real_, length(x)))
  }

  # Constant input: avoid dividing by zero, keep NA positions as NA.
  if (span == 0) {
    out <- numeric(length(x))
    out[is.na(x)] <- NA_real_
    return(out)
  }

  (x - rng[1]) / span
}
# Rescale every column independently (columns that are already 0/1 come out
# unchanged), then rebuild a data frame from the resulting list.
hr_norm <- hr_dummies |>
  lapply(minmax) |>
  as.data.frame()
summary(object = hr_norm)
## satisfaction_level last_evaluation number_project average_montly_hours
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3846 1st Qu.:0.3125 1st Qu.:0.2000 1st Qu.:0.2804
## Median :0.6044 Median :0.5625 Median :0.4000 Median :0.4860
## Mean :0.5745 Mean :0.5564 Mean :0.3606 Mean :0.4909
## 3rd Qu.:0.8022 3rd Qu.:0.7969 3rd Qu.:0.6000 3rd Qu.:0.6963
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## time_spend_company Work_accident left promotion_last_5years
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.1250 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.1250 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.1873 Mean :0.1446 Mean :0.2381 Mean :0.02127
## 3rd Qu.:0.2500 3rd Qu.:0.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## salaryhigh salarylow salarymedium depthr
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.08247 Mean :0.4878 Mean :0.4298 Mean :0.04927
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## deptIT deptmanagement deptmarketing deptproduct_mng
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.000 Median :0.0000 Median :0.00000
## Mean :0.08181 Mean :0.042 Mean :0.0572 Mean :0.06014
## 3rd Qu.:0.00000 3rd Qu.:0.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.00000
## deptRandD deptsales deptsupport depttechnical
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.000 Median :0.0000 Median :0.0000
## Mean :0.05247 Mean :0.276 Mean :0.1486 Mean :0.1813
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.0000
#Step 3: Split data
# Hold out 30% of the rows for testing; the seed makes the split reproducible.
set.seed(12345)
n_obs <- nrow(hr_norm)
# floor() makes the truncation of the fractional size explicit, and
# sample.int() avoids the 1:n pitfall when the data frame has no rows.
# For the same seed this draws the same rows as sample(1:n, 0.3 * n).
test_rows <- sample.int(n_obs, size = floor(0.3 * n_obs))
# Build the training index as a positive complement: `-test_rows` would
# select zero rows (instead of all rows) if `test_rows` were empty.
train_rows <- setdiff(seq_len(n_obs), test_rows)

# Keep the class label `left` out of the predictor sets so it cannot
# contribute to the distance computation.
predictors <- setdiff(names(hr_norm), "left")
hr_test <- hr_norm[test_rows, predictors, drop = FALSE]
hr_train <- hr_norm[train_rows, predictors, drop = FALSE]
hr_test_y <- hr_norm[["left"]][test_rows]
hr_train_y <- hr_norm[["left"]][train_rows]
#Step 4 and 5: Make Model, Make prediction
library(class)
## Warning: package 'class' was built under R version 4.4.3
# Predict the class of each test row from its k nearest training rows.
# An odd k avoids most voting ties between the two classes.
# (The former `knn_pred <- knn` line only bound the function object and was
# overwritten immediately, so it has been removed.)
knn_pred <- knn(train = hr_train, test = hr_test, cl = hr_train_y, k = 51)
#Step 6: Evaluate Model
library (caret)
## Loading required package: ggplot2
## Loading required package: lattice
# Compare the predictions with the held-out labels, treating level "1" of
# `left` as the positive class.
confusionMatrix(
  data = as.factor(knn_pred),
  reference = as.factor(hr_test_y),
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 3252 369
## 1 153 725
##
## Accuracy : 0.884
## 95% CI : (0.8743, 0.8932)
## No Information Rate : 0.7568
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6621
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.6627
## Specificity : 0.9551
## Pos Pred Value : 0.8257
## Neg Pred Value : 0.8981
## Prevalence : 0.2432
## Detection Rate : 0.1611
## Detection Prevalence : 0.1952
## Balanced Accuracy : 0.8089
##
## 'Positive' Class : 1
##