IMPORTING DATA

Reading data into R

# Load the cleaned HR dataset; the relative path is resolved against the
# current working directory.
# The file name is passed directly (paste() around a single literal is a no-op),
# and stringsAsFactors = FALSE keeps text columns as character vectors
# regardless of the R version's default.
HR.df <- read.csv("CleanHRData.csv", stringsAsFactors = FALSE)
# number of rows and columns in the dataset
dim(HR.df)
## [1] 8995   17

Structure of the dataframe

# compact per-column summary: type and leading values of every variable
utils::str(HR.df)
## 'data.frame':    8995 obs. of  17 variables:
##  $ CandidateRef            : int  2110407 2112635 2112838 2115021 2115125 2117167 2119124 2127572 2138169 2143362 ...
##  $ DOJExtended             : chr  "Yes" "No" "No" "No" ...
##  $ DurationToAcceptOffer   : int  14 18 3 26 1 17 37 16 1 6 ...
##  $ NoticePeriod            : int  30 30 45 30 120 30 30 0 30 30 ...
##  $ OfferedBand             : chr  "E2" "E2" "E2" "E2" ...
##  $ PercentHikeExpectedInCTC: num  -20.8 50 42.8 42.8 42.6 ...
##  $ PercentHikeOfferedInCTC : num  13.2 320 42.8 42.8 42.6 ...
##  $ PercentDifferenceCTC    : num  42.9 180 0 0 0 ...
##  $ JoiningBonus            : chr  "No" "No" "No" "No" ...
##  $ CandidateRelocateActual : chr  "No" "No" "No" "No" ...
##  $ Gender                  : chr  "Female" "Male" "Male" "Male" ...
##  $ CandidateSource         : chr  "Agency" "Employee Referral" "Agency" "Employee Referral" ...
##  $ RexInYrs                : int  7 8 4 4 6 2 7 8 3 3 ...
##  $ LOB                     : chr  "ERS" "INFRA" "INFRA" "INFRA" ...
##  $ Location                : chr  "Noida" "Chennai" "Noida" "Noida" ...
##  $ Age                     : int  34 34 27 34 34 34 32 34 26 34 ...
##  $ Status                  : chr  "Joined" "Joined" "Joined" "Joined" ...

Training (80%) and Testing (20%) Data

library(caret)
# Fix the RNG seed so the split below is reproducible
set.seed(2341)
# Row indices for the training set: 80% of the rows, drawn by caret's
# createDataPartition() from the Status outcome column (list = FALSE returns
# the indices as a matrix rather than a list)
trainIndex <- createDataPartition(
  y = HR.df[["Status"]],
  p = 0.8,
  list = FALSE
)
# Training set: the rows picked by the partition
trainHR.df <- HR.df[trainIndex, ]
dim(trainHR.df)
## [1] 7197   17
# Testing set: every row that was not picked for training
testHR.df <- HR.df[-trainIndex, ]
dim(testHR.df)
## [1] 1798   17