Mudit Gupta
---
Consider a simple dataset for the case study “HR Data (Analysis and Visualization)”.
# Load the plotting library and read the cleaned HR data into `hrdata`
library(ggplot2)
# NOTE(review): setwd() to a user-specific path makes the script non-portable;
# kept because code elsewhere in the file may rely on the working directory.
setwd("~/Desktop/Dropbox/MLM/R_Workspace/Session 4")
# stringsAsFactors = TRUE makes text columns factors on R >= 4.0 as well
# (it was the default before 4.0), so the data keeps its factor columns.
hrdata <- read.csv("CleanHRData.csv", stringsAsFactors = TRUE)
# NOTE(review): attach() is kept because code elsewhere in the file may refer
# to columns by bare name; prefer hrdata$col or with(hrdata, ...) in new code.
attach(hrdata)
# Size of the data: number of rows, then number of columns
c(nrow(hrdata), ncol(hrdata))
[1] 8995 17
# Compact overview of every column: its type and first few values
utils::str(object = hrdata)
'data.frame': 8995 obs. of 17 variables:
$ CandidateRef : int 2110407 2112635 2112838 2115021 2115125 2117167 2119124 2127572 2138169 2143362 ...
$ DOJExtended : Factor w/ 2 levels "No","Yes": 2 1 1 1 2 2 2 2 1 1 ...
$ DurationToAcceptOffer : int 14 18 3 26 1 17 37 16 1 6 ...
$ NoticePeriod : int 30 30 45 30 120 30 30 0 30 30 ...
$ OfferedBand : Factor w/ 4 levels "E0","E1","E2",..: 3 3 3 3 3 2 3 2 2 2 ...
$ PercentHikeExpectedInCTC: num -20.8 50 42.8 42.8 42.6 ...
$ PercentHikeOfferedInCTC : num 13.2 320 42.8 42.8 42.6 ...
$ PercentDifferenceCTC : num 42.9 180 0 0 0 ...
$ JoiningBonus : Factor w/ 2 levels "No","Yes": 1 1 1 1 1 1 1 1 1 1 ...
$ CandidateRelocateActual : Factor w/ 2 levels "No","Yes": 1 1 1 1 2 1 1 1 1 1 ...
$ Gender : Factor w/ 2 levels "Female","Male": 1 2 2 2 2 2 2 1 1 2 ...
$ CandidateSource : Factor w/ 3 levels "Agency","Direct",..: 1 3 1 3 3 3 3 2 3 3 ...
$ RexInYrs : int 7 8 4 4 6 2 7 8 3 3 ...
$ LOB : Factor w/ 9 levels "AXON","BFSI",..: 5 8 8 8 8 8 8 7 2 3 ...
$ Location : Factor w/ 11 levels "Ahmedabad","Bangalore",..: 9 3 9 9 9 9 9 9 5 3 ...
$ Age : int 34 34 27 34 34 34 32 34 26 34 ...
$ Status : Factor w/ 2 levels "Joined","Not Joined": 1 1 1 1 1 1 1 1 1 1 ...
# Descriptive statistics for every column (n, mean, sd, median, skew, ...)
psych::describe(x = hrdata)
vars n mean sd median trimmed
CandidateRef 1 8995 2843647.38 486344.77 2807482 2831309.11
DOJExtended* 2 8995 1.47 0.50 1 1.46
DurationToAcceptOffer 3 8995 21.43 25.81 10 16.64
NoticePeriod 4 8995 39.29 22.22 30 37.91
OfferedBand* 5 8995 2.39 0.63 2 2.32
PercentHikeExpectedInCTC 6 8995 43.86 29.79 40 40.92
PercentHikeOfferedInCTC 7 8995 40.66 36.06 36 36.70
PercentDifferenceCTC 8 8995 -1.57 19.61 0 -3.08
JoiningBonus* 9 8995 1.05 0.21 1 1.00
CandidateRelocateActual* 10 8995 1.14 0.35 1 1.05
Gender* 11 8995 1.83 0.38 2 1.91
CandidateSource* 12 8995 1.89 0.67 2 1.86
RexInYrs 13 8995 4.24 2.55 4 4.01
LOB* 14 8995 5.18 2.38 5 5.30
Location* 15 8995 4.94 3.00 3 4.78
Age 16 8995 29.91 4.10 29 29.86
Status* 17 8995 1.19 0.39 1 1.11
mad min max range skew
CandidateRef 668050.66 2109586.00 3836076.00 1726490.00 0.16
DOJExtended* 0.00 1.00 2.00 1.00 0.13
DurationToAcceptOffer 13.34 0.00 224.00 224.00 1.61
NoticePeriod 0.00 0.00 120.00 120.00 0.74
OfferedBand* 0.00 1.00 4.00 3.00 0.81
PercentHikeExpectedInCTC 19.53 -68.83 359.77 428.60 2.29
PercentHikeOfferedInCTC 20.76 -60.53 471.43 531.96 2.75
PercentDifferenceCTC 6.85 -67.27 300.00 367.27 4.80
JoiningBonus* 0.00 1.00 2.00 1.00 4.31
CandidateRelocateActual* 0.00 1.00 2.00 1.00 2.03
Gender* 0.00 1.00 2.00 1.00 -1.73
CandidateSource* 0.00 1.00 3.00 2.00 0.13
RexInYrs 1.48 0.00 24.00 24.00 1.29
LOB* 4.45 1.00 9.00 8.00 -0.22
Location* 1.48 1.00 11.00 10.00 0.50
Age 4.45 20.00 60.00 40.00 0.42
Status* 0.00 1.00 2.00 1.00 1.61
kurtosis se
CandidateRef -1.46 5127.95
DOJExtended* -1.98 0.01
DurationToAcceptOffer 2.65 0.27
NoticePeriod 0.79 0.23
OfferedBand* 0.30 0.01
PercentHikeExpectedInCTC 12.44 0.31
PercentHikeOfferedInCTC 15.92 0.38
PercentDifferenceCTC 45.55 0.21
JoiningBonus* 16.61 0.00
CandidateRelocateActual* 2.14 0.00
Gender* 1.01 0.00
CandidateSource* -0.81 0.01
RexInYrs 3.20 0.03
LOB* -1.23 0.03
Location* -1.56 0.03
Age 0.34 0.04
Status* 0.58 0.00
# Pie chart of the share of candidates who joined vs. did not join.
# Columns are looked up inside hrdata explicitly instead of through the
# attach()ed search path, so a same-named global variable cannot mask them.
mytable <- with(hrdata, table(Status))
pie(
  mytable,
  main = "Pie Chart of Percentage of Candidates who Joined / Not Joined"
)
# Cross-tabulate joining status (rows) against gender (columns) and print it
statusByGender <- with(hrdata, table(Status, Gender))
statusByGender
Gender
Status Female Male
Joined 1278 6035
Not Joined 273 1409
# Grouped bar chart of joining status within each gender.
# NOTE: bar heights are the raw counts held in statusByGender, not percentages.
barplot(
  statusByGender,
  legend.text = TRUE, # full argument name; `legend` relied on partial matching
  beside = TRUE,      # TRUE rather than T, which can be reassigned
  main = "Bar Chart of Percentage of the candidates (Joined / Not joined) the Company by Gender"
)