# hrdata_Nam.R

### Step 0: Why and what
### Step 1 and 2: Get and clean data
# Load the HR data set from the working directory and print its structure
# (column names, types and first values) as a quick sanity check.
hr <- read.csv(file = "hrdata.csv")
str(object = hr)

## 'data.frame':    14999 obs. of  11 variables:
##  $ X                    : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ satisfaction_level   : num  0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
##  $ last_evaluation      : num  0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ sales                : chr  "sales" "sales" "sales" "sales" ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...

# Tidy columns
# - `X` looks like a row counter carried over from the CSV; it is dropped.
# - `sales` holds the department label, so it is stored under `dept` instead.
#   `dept` is appended as the last column.
hr <- cbind(hr[!(names(hr) %in% c("X", "sales"))], dept = hr[["sales"]])

# Inspect the structure before converting the text columns to factors
str(hr)

## 'data.frame':    14999 obs. of  10 variables:
##  $ satisfaction_level   : num  0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
##  $ last_evaluation      : num  0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ salary               : chr  "low" "medium" "medium" "low" ...
##  $ dept                 : chr  "sales" "sales" "sales" "sales" ...

# Convert the two text columns into categorical variables (factors).
hr[c("salary", "dept")] <- lapply(hr[c("salary", "dept")], as.factor)

str(hr)

## 'data.frame':    14999 obs. of  10 variables:
##  $ satisfaction_level   : num  0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
##  $ last_evaluation      : num  0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
##  $ number_project       : int  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : int  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : int  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ salary               : Factor w/ 3 levels "high","low","medium": 2 3 3 2 2 2 2 2 2 2 ...
##  $ dept                 : Factor w/ 10 levels "accounting","hr",..: 8 8 8 8 8 8 8 8 8 8 ...

# Per-column summary: quartiles for numeric columns, level counts for factors.
summary(hr)

##  satisfaction_level last_evaluation  number_project  average_montly_hours
##  Min.   :0.0900     Min.   :0.3600   Min.   :2.000   Min.   : 96.0       
##  1st Qu.:0.4400     1st Qu.:0.5600   1st Qu.:3.000   1st Qu.:156.0       
##  Median :0.6400     Median :0.7200   Median :4.000   Median :200.0       
##  Mean   :0.6128     Mean   :0.7161   Mean   :3.803   Mean   :201.1       
##  3rd Qu.:0.8200     3rd Qu.:0.8700   3rd Qu.:5.000   3rd Qu.:245.0       
##  Max.   :1.0000     Max.   :1.0000   Max.   :7.000   Max.   :310.0       
##                                                                          
##  time_spend_company Work_accident         left        promotion_last_5years
##  Min.   : 2.000     Min.   :0.0000   Min.   :0.0000   Min.   :0.00000      
##  1st Qu.: 3.000     1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000      
##  Median : 3.000     Median :0.0000   Median :0.0000   Median :0.00000      
##  Mean   : 3.498     Mean   :0.1446   Mean   :0.2381   Mean   :0.02127      
##  3rd Qu.: 4.000     3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000      
##  Max.   :10.000     Max.   :1.0000   Max.   :1.0000   Max.   :1.00000      
##                                                                            
##     salary              dept     
##  high  :1237   sales      :4140  
##  low   :7316   technical  :2720  
##  medium:6446   support    :2229  
##                IT         :1227  
##                product_mng: 902  
##                marketing  : 858  
##                (Other)    :2923

# Expand factors into 0/1 indicator columns
# The formula `~ . - 1` uses every column of `hr` and omits the intercept.
# NOTE(review): per the recorded str() output, salary keeps all three levels
# while dept loses its first level (accounting), so the two factors are not
# encoded symmetrically. Confirm this is intended for a distance-based model.
design_matrix <- model.matrix(~ . - 1, data = hr)
hr_dummies <- as.data.frame(design_matrix)
str(hr_dummies)

## 'data.frame':    14999 obs. of  20 variables:
##  $ satisfaction_level   : num  0.38 0.8 0.11 0.72 0.37 0.41 0.1 0.92 0.89 0.42 ...
##  $ last_evaluation      : num  0.53 0.86 0.88 0.87 0.52 0.5 0.77 0.85 1 0.53 ...
##  $ number_project       : num  2 5 7 5 2 2 6 5 5 2 ...
##  $ average_montly_hours : num  157 262 272 223 159 153 247 259 224 142 ...
##  $ time_spend_company   : num  3 6 4 5 3 3 4 5 5 3 ...
##  $ Work_accident        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ left                 : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ promotion_last_5years: num  0 0 0 0 0 0 0 0 0 0 ...
##  $ salaryhigh           : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ salarylow            : num  1 0 0 1 1 1 1 1 1 1 ...
##  $ salarymedium         : num  0 1 1 0 0 0 0 0 0 0 ...
##  $ depthr               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ deptIT               : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ deptmanagement       : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ deptmarketing        : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ deptproduct_mng      : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ deptRandD            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ deptsales            : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ deptsupport          : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ depttechnical        : num  0 0 0 0 0 0 0 0 0 0 ...

#Normalize data 
# Min Max Normalization
# Rescale a numeric vector linearly so its minimum maps to 0 and its maximum
# maps to 1.
#
# Arguments:
#   x      Numeric vector to rescale.
#   na.rm  If TRUE, missing values are ignored when finding the minimum and
#          maximum and stay NA in the result. With the default FALSE, any NA
#          in `x` makes the whole result NA.
#
# Returns a numeric vector the same length as `x`. A constant vector
# (max == min) is returned as all zeros rather than NaN.
minmax <- function(x, na.rm = FALSE) {
  # Nothing to scale; also avoids the warnings min()/max() give on empty input.
  if (length(x) == 0L) {
    return(numeric(0))
  }

  bounds <- range(x, na.rm = na.rm)
  lower <- bounds[1]
  span <- bounds[2] - lower

  # Constant input would give 0 / 0 = NaN; dividing by 1 yields zeros instead.
  if (!is.na(span) && span == 0) {
    span <- 1
  }

  (x - lower) / span
}

# Rescale every column independently with minmax(), then rebuild the data frame
# and print a summary of the rescaled columns.
scaled_columns <- lapply(X = hr_dummies, FUN = minmax)
hr_norm <- as.data.frame(scaled_columns)
summary(hr_norm)

##  satisfaction_level last_evaluation  number_project   average_montly_hours
##  Min.   :0.0000     Min.   :0.0000   Min.   :0.0000   Min.   :0.0000      
##  1st Qu.:0.3846     1st Qu.:0.3125   1st Qu.:0.2000   1st Qu.:0.2804      
##  Median :0.6044     Median :0.5625   Median :0.4000   Median :0.4860      
##  Mean   :0.5745     Mean   :0.5564   Mean   :0.3606   Mean   :0.4909      
##  3rd Qu.:0.8022     3rd Qu.:0.7969   3rd Qu.:0.6000   3rd Qu.:0.6963      
##  Max.   :1.0000     Max.   :1.0000   Max.   :1.0000   Max.   :1.0000      
##  time_spend_company Work_accident         left        promotion_last_5years
##  Min.   :0.0000     Min.   :0.0000   Min.   :0.0000   Min.   :0.00000      
##  1st Qu.:0.1250     1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000      
##  Median :0.1250     Median :0.0000   Median :0.0000   Median :0.00000      
##  Mean   :0.1873     Mean   :0.1446   Mean   :0.2381   Mean   :0.02127      
##  3rd Qu.:0.2500     3rd Qu.:0.0000   3rd Qu.:0.0000   3rd Qu.:0.00000      
##  Max.   :1.0000     Max.   :1.0000   Max.   :1.0000   Max.   :1.00000      
##    salaryhigh        salarylow       salarymedium        depthr       
##  Min.   :0.00000   Min.   :0.0000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.0000   Median :0.0000   Median :0.00000  
##  Mean   :0.08247   Mean   :0.4878   Mean   :0.4298   Mean   :0.04927  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.0000   Max.   :1.0000   Max.   :1.00000  
##      deptIT        deptmanagement  deptmarketing    deptproduct_mng  
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.00000  
##  1st Qu.:0.00000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.00000  
##  Median :0.00000   Median :0.000   Median :0.0000   Median :0.00000  
##  Mean   :0.08181   Mean   :0.042   Mean   :0.0572   Mean   :0.06014  
##  3rd Qu.:0.00000   3rd Qu.:0.000   3rd Qu.:0.0000   3rd Qu.:0.00000  
##  Max.   :1.00000   Max.   :1.000   Max.   :1.0000   Max.   :1.00000  
##    deptRandD         deptsales      deptsupport     depttechnical   
##  Min.   :0.00000   Min.   :0.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000   1st Qu.:0.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000   Median :0.000   Median :0.0000   Median :0.0000  
##  Mean   :0.05247   Mean   :0.276   Mean   :0.1486   Mean   :0.1813  
##  3rd Qu.:0.00000   3rd Qu.:1.000   3rd Qu.:0.0000   3rd Qu.:0.0000  
##  Max.   :1.00000   Max.   :1.000   Max.   :1.0000   Max.   :1.0000

#Step 3: Split data

# Hold out 30% of the rows as the test set. The fixed seed makes the random
# split reproducible.
set.seed(12345)
n_rows <- nrow(hr_norm)
# seq_len() is safe for zero rows (1:0 would yield c(1, 0)); floor() makes the
# truncation of the fractional sample size explicit.
test_rows <- sample(seq_len(n_rows), size = floor(0.3 * n_rows))

hr_test <- hr_norm[test_rows, ]
# setdiff() rather than `-test_rows`: negative indexing with an empty vector
# would select no rows at all instead of all of them.
hr_train <- hr_norm[setdiff(seq_len(n_rows), test_rows), ]

# Keep the outcome (`left`) separately as the class labels ...
hr_test_y <- hr_test$left
hr_train_y <- hr_train$left

# ... and remove it from the predictor tables so it cannot be used as a feature.
hr_test$left <- NULL
hr_train$left <- NULL

#Step 4 and 5: Make Model, Make prediction

library(class)

## Warning: package 'class' was built under R version 4.4.3

# Predict the class of every test row from its k = 51 nearest training rows.
# `cl` supplies the known classes of the training rows.
knn_pred <- knn(train = hr_train, test = hr_test, cl = hr_train_y, k = 51)

#Step 6: Evaluate Model
library (caret)

## Loading required package: ggplot2

## Loading required package: lattice

# Compare predicted and actual classes; "1" (employee left) is the positive
# class for sensitivity, specificity and related statistics.
confusionMatrix(
  data = as.factor(knn_pred),
  reference = as.factor(hr_test_y),
  positive = "1"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 3252  369
##          1  153  725
##                                           
##                Accuracy : 0.884           
##                  95% CI : (0.8743, 0.8932)
##     No Information Rate : 0.7568          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6621          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.6627          
##             Specificity : 0.9551          
##          Pos Pred Value : 0.8257          
##          Neg Pred Value : 0.8981          
##              Prevalence : 0.2432          
##          Detection Rate : 0.1611          
##    Detection Prevalence : 0.1952          
##       Balanced Accuracy : 0.8089          
##                                           
##        'Positive' Class : 1               
##

# hrdata_Nam.R

# namho

# 2025-03-26