Data Exploration and Preprocessing
## Structure of the data
str(absentdata)
## 'data.frame': 740 obs. of 21 variables:
## $ ID : int 11 36 3 7 11 3 10 20 14 1 ...
## $ Reason.for.absence : int 26 0 23 7 23 23 22 23 19 22 ...
## $ Month.of.absence : int 7 7 7 7 7 7 7 7 7 7 ...
## $ Day.of.the.week : int 3 3 4 5 5 6 6 6 2 2 ...
## $ Seasons : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Transportation.expense : int 289 118 179 279 289 179 361 260 155 235 ...
## $ Distance.from.Residence.to.Work: int 36 13 51 5 36 51 52 50 12 11 ...
## $ Service.time : int 13 18 18 14 13 18 3 11 14 14 ...
## $ Age : int 33 50 38 39 33 38 28 36 34 37 ...
## $ Work.load.Average.day : num 240 240 240 240 240 ...
## $ Hit.target : int 97 97 97 97 97 97 97 97 97 97 ...
## $ Disciplinary.failure : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Education : int 1 1 1 1 1 1 1 1 1 3 ...
## $ Son : int 2 1 0 2 2 0 1 4 2 1 ...
## $ Social.drinker : int 1 1 1 1 1 1 1 1 1 0 ...
## $ Social.smoker : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Pet : int 1 0 0 0 1 0 4 0 0 1 ...
## $ Weight : int 90 98 89 68 90 89 80 65 95 88 ...
## $ Height : int 172 178 170 168 172 170 172 168 196 172 ...
## $ Body.mass.index : int 30 31 31 24 30 31 27 23 25 29 ...
## $ Absenteeism.time.in.hours : int 4 0 2 4 2 2 8 4 40 8 ...
## Summary statistics
summary(absentdata)
## ID Reason.for.absence Month.of.absence Day.of.the.week
## Min. : 1.00 Min. : 0.00 Min. : 0.000 Min. :2.000
## 1st Qu.: 9.00 1st Qu.:13.00 1st Qu.: 3.000 1st Qu.:3.000
## Median :18.00 Median :23.00 Median : 6.000 Median :4.000
## Mean :18.02 Mean :19.22 Mean : 6.324 Mean :3.915
## 3rd Qu.:28.00 3rd Qu.:26.00 3rd Qu.: 9.000 3rd Qu.:5.000
## Max. :36.00 Max. :28.00 Max. :12.000 Max. :6.000
## Seasons Transportation.expense Distance.from.Residence.to.Work
## Min. :1.000 Min. :118.0 Min. : 5.00
## 1st Qu.:2.000 1st Qu.:179.0 1st Qu.:16.00
## Median :3.000 Median :225.0 Median :26.00
## Mean :2.545 Mean :221.3 Mean :29.63
## 3rd Qu.:4.000 3rd Qu.:260.0 3rd Qu.:50.00
## Max. :4.000 Max. :388.0 Max. :52.00
## Service.time Age Work.load.Average.day Hit.target
## Min. : 1.00 Min. :27.00 Min. :205.9 Min. : 81.00
## 1st Qu.: 9.00 1st Qu.:31.00 1st Qu.:244.4 1st Qu.: 93.00
## Median :13.00 Median :37.00 Median :264.2 Median : 95.00
## Mean :12.55 Mean :36.45 Mean :271.5 Mean : 94.59
## 3rd Qu.:16.00 3rd Qu.:40.00 3rd Qu.:294.2 3rd Qu.: 97.00
## Max. :29.00 Max. :58.00 Max. :378.9 Max. :100.00
## Disciplinary.failure Education Son Social.drinker
## Min. :0.00000 Min. :1.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.00000 Median :1.000 Median :1.000 Median :1.0000
## Mean :0.05405 Mean :1.292 Mean :1.019 Mean :0.5676
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.00000 Max. :4.000 Max. :4.000 Max. :1.0000
## Social.smoker Pet Weight Height
## Min. :0.00000 Min. :0.0000 Min. : 56.00 Min. :163.0
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 69.00 1st Qu.:169.0
## Median :0.00000 Median :0.0000 Median : 83.00 Median :170.0
## Mean :0.07297 Mean :0.7459 Mean : 79.04 Mean :172.1
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.: 89.00 3rd Qu.:172.0
## Max. :1.00000 Max. :8.0000 Max. :108.00 Max. :196.0
## Body.mass.index Absenteeism.time.in.hours
## Min. :19.00 Min. : 0.000
## 1st Qu.:24.00 1st Qu.: 2.000
## Median :25.00 Median : 3.000
## Mean :26.68 Mean : 6.924
## 3rd Qu.:31.00 3rd Qu.: 8.000
## Max. :38.00 Max. :120.000
## A value of zero in Reason.for.absence is not a valid reason code: neither the ICD nor the non-ICD code lists define it.
## Observations whose reason code is zero are therefore removed.
range(absentdata$Reason.for.absence)
## [1] 0 28
absentdata <- absentdata[absentdata$Reason.for.absence != 0, ]
## Inconsistent records to inspect:
##   a - reason code greater than zero but no absenteeism hours recorded;
##   b - reason code zero and no absenteeism hours recorded.
## Rows with reason code zero were dropped on the line above, so b selects no rows here.
a <- absentdata[
  which(absentdata$Absenteeism.time.in.hours <= 0 & absentdata$Reason.for.absence > 0),
  c("ID", "Reason.for.absence", "Absenteeism.time.in.hours")
]
b <- absentdata[
  which(absentdata$Absenteeism.time.in.hours <= 0 & absentdata$Reason.for.absence == 0),
  c("ID", "Reason.for.absence", "Absenteeism.time.in.hours")
]
as.matrix(rbind(a, b))
## ID Reason.for.absence Absenteeism.time.in.hours
## 135 34 27 0
## Reason code 27 is the only one where the absenteeism time is zero. That observation is removed.
absentdata <- absentdata[
  absentdata$Absenteeism.time.in.hours != 0 | absentdata$Reason.for.absence <= 0,
]
## At this point we have 696 observations and 21 attributes.
#dim(absentdata)
#summary(absentdata)
## Disciplinary.failure carries no information any more: only zeros remain in it.
range(absentdata$Disciplinary.failure)
## [1] 0 0
## Removed the Disciplinary.failure attribute. It is dropped by name so that a
## change in column order cannot silently remove a different attribute.
absentdata$Disciplinary.failure <- NULL
#str(absentdata)
## Now we have 696 observations and 20 attributes.
## Missing value analysis
as.matrix(colSums(is.na(absentdata)))
## [,1]
## ID 0
## Reason.for.absence 0
## Month.of.absence 0
## Day.of.the.week 0
## Seasons 0
## Transportation.expense 0
## Distance.from.Residence.to.Work 0
## Service.time 0
## Age 0
## Work.load.Average.day 0
## Hit.target 0
## Education 0
## Son 0
## Social.drinker 0
## Social.smoker 0
## Pet 0
## Weight 0
## Height 0
## Body.mass.index 0
## Absenteeism.time.in.hours 0
# There are no missing values in any attribute.
## Box plot of absenteeism time in hours for each reason for absence, used to
## inspect the outliers within every reason code.
# Columns are mapped by name with aes(); aes_string() is deprecated and expects
# strings naming columns, not the data vectors themselves.
ggplot(absentdata,
       aes(x = as.factor(Reason.for.absence), y = Absenteeism.time.in.hours)) +
  geom_boxplot() +
  xlab('Reason.for.absence') +
  ylab('Absenteeism.time.in.hours')

## Distribution of the continuous variables.
## Absenteeism time is highly right-skewed because of the outliers.
hist(absentdata$Absenteeism.time.in.hours, breaks = 40, #prob = TRUE,
     xlab = 'Absenteeism time in hours', main = " Absenteeism time Distribution", col = "grey")

boxplot(absentdata$Absenteeism.time.in.hours, main = "Box plot of Absenteeism time in hours")

# Outlier analysis
# Box plots for Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target
# (TRUE is spelled out: T is an ordinary variable that can be reassigned.)
boxplot(absentdata[, c('Transportation.expense', 'Distance.from.Residence.to.Work', 'Service.time', 'Age', 'Hit.target')],
        varwidth = TRUE, col = "dark grey")

# Box plots for Weight, Height, Body.mass.index, Absenteeism.time.in.hours
boxplot(absentdata[, c('Weight', 'Height', 'Body.mass.index', 'Absenteeism.time.in.hours')],
        col = "grey", varwidth = TRUE)

# Box plot for Work.load.Average.day
boxplot(absentdata[, c('Work.load.Average.day')], col = "grey")

## Outliers are not deleted; they are capped instead.
## For each listed column, Q1 and Q3 are the 25th and 75th percentiles and
## IQR = Q3 - Q1. Values below Q1 - 1.5 * IQR are raised to that lower fence and
## values above Q3 + 1.5 * IQR are lowered to that upper fence.
for (i in c('Transportation.expense', 'Service.time', 'Age', 'Work.load.Average.day',
            'Hit.target', 'Height', 'Absenteeism.time.in.hours')) {
  q <- quantile(absentdata[[i]], c(0.25, 0.75), names = FALSE)
  iqr1 <- q[2] - q[1]
  min1 <- q[1] - 1.5 * iqr1
  max1 <- q[2] + 1.5 * iqr1
  # Clamp the column into [min1, max1]; values already inside are unchanged.
  absentdata[[i]] <- pmin(pmax(absentdata[[i]], min1), max1)
}
## Reason for absence vs absenteeism time after outlier capping
# Columns are mapped by name with aes(); aes_string() is deprecated and expects
# strings naming columns, not the data vectors themselves.
ggplot(absentdata,
       aes(x = as.factor(Reason.for.absence), y = Absenteeism.time.in.hours)) +
  geom_boxplot() +
  xlab('Reason.for.absence') +
  ylab('Absenteeism.time.in.hours')

# Box plots for Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target
# (TRUE is spelled out: T is an ordinary variable that can be reassigned.)
boxplot(absentdata[, c('Transportation.expense', 'Distance.from.Residence.to.Work', 'Service.time', 'Age', 'Hit.target')],
        varwidth = TRUE, col = "dark grey")

# Box plots for Weight, Height, Body.mass.index, Absenteeism.time.in.hours
boxplot(absentdata[, c('Weight', 'Height', 'Body.mass.index', 'Absenteeism.time.in.hours')],
        col = "grey", varwidth = TRUE)

# Box plot for Work.load.Average.day
boxplot(absentdata[, c('Work.load.Average.day')], col = "grey")

## Data independence / multicollinearity checks.
## Categorical variables first.
categorical_var <- c("Reason.for.absence", "Month.of.absence", "Day.of.the.week",
                     "Seasons", "Education", "Social.drinker",
                     "Social.smoker", "Son", "Pet")
## Transform the categorical variables into factors.
absentdata[categorical_var] <- lapply(absentdata[categorical_var], factor)
#str(absentdata)
# Chi-square test of independence for every ordered pair of categorical attributes.
# The p-values are stored in the vector pvalue, with the first variable of each
# pair varying slowest. The vector is preallocated instead of being grown with
# c() on every iteration.
pvalue <- numeric(length(categorical_var)^2)
k <- 0L
for (i in categorical_var) {
  for (j in categorical_var) {
    k <- k + 1L
    chi2 <- chisq.test(absentdata[, i], absentdata[, j]) #, simulate.p.value = T)
    pvalue[k] <- chi2$p.value
  }
}
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
length(pvalue)
## [1] 81
# One row and one column per categorical variable. pvalue was collected with the
# first variable of each pair varying slowest, so the matrix is filled row by
# row; the size comes from categorical_var instead of a hard-coded 9.
m1 <- matrix(pvalue, ncol = length(categorical_var), byrow = TRUE)
df <- data.frame(m1)
row.names(df) <- categorical_var
colnames(df) <- categorical_var
print(df)
## Reason.for.absence Month.of.absence Day.of.the.week
## Reason.for.absence 0.000000e+00 2.191275e-14 5.170971e-02
## Month.of.absence 2.191275e-14 0.000000e+00 6.765307e-01
## Day.of.the.week 5.170971e-02 6.765307e-01 0.000000e+00
## Seasons 6.493726e-19 0.000000e+00 4.086874e-01
## Education 1.895058e-10 8.873931e-03 6.369421e-01
## Social.drinker 2.253390e-08 3.040382e-02 3.040911e-01
## Social.smoker 1.742354e-09 1.838631e-02 5.388357e-01
## Son 1.788850e-19 4.226723e-05 1.889736e-09
## Pet 1.386394e-18 6.376062e-05 4.012904e-01
## Seasons Education Social.drinker Social.smoker
## Reason.for.absence 6.493726e-19 1.895058e-10 2.253390e-08 1.742354e-09
## Month.of.absence 0.000000e+00 8.873931e-03 3.040382e-02 1.838631e-02
## Day.of.the.week 4.086874e-01 6.369421e-01 3.040911e-01 5.388357e-01
## Seasons 0.000000e+00 8.936680e-02 1.473154e-01 6.615252e-02
## Education 8.936680e-02 0.000000e+00 1.426601e-33 5.635243e-26
## Social.drinker 1.473154e-01 1.426601e-33 1.688763e-152 1.031400e-02
## Social.smoker 6.615252e-02 5.635243e-26 1.031400e-02 7.049811e-150
## Son 1.795308e-05 8.888863e-12 4.393599e-09 2.868110e-20
## Pet 1.090705e-04 3.323522e-29 1.303426e-26 6.587414e-14
## Son Pet
## Reason.for.absence 1.788850e-19 1.386394e-18
## Month.of.absence 4.226723e-05 6.376062e-05
## Day.of.the.week 1.889736e-09 4.012904e-01
## Seasons 1.795308e-05 1.090705e-04
## Education 8.888863e-12 3.323522e-29
## Social.drinker 4.393599e-09 1.303426e-26
## Social.smoker 2.868110e-20 6.587414e-14
## Son 0.000000e+00 8.122222e-90
## Pet 8.122222e-90 0.000000e+00
## According to the chi-square tests, every categorical variable except Day.of.the.week is related to Reason.for.absence (p-values below 0.005). All categorical variables associated with Reason.for.absence are therefore removed; Day.of.the.week is kept.
# Columns are dropped by name (previously by positions 3, 5 and 12 to 16) so the
# step does not depend on the current column order.
absentdata <- absentdata[, setdiff(names(absentdata),
                                   c("Month.of.absence", "Seasons", "Education", "Son",
                                     "Social.drinker", "Social.smoker", "Pet"))]
## Correlation matrix for the continuous attributes
# Every column except the identifier and the two factor columns, selected by name.
m <- cor(absentdata[, setdiff(names(absentdata),
                              c("ID", "Reason.for.absence", "Day.of.the.week"))])
corrplot(m, order = "hclust", tl.srt = 30, tl.col = "black", addrect = 3, method = "number" )

## Correlations between Absenteeism.time.in.hours and the predictors are below 0.1, but high collinearity was found between Weight and Body.mass.index. So Weight is removed from the data frame.
absentdata$Weight <- NULL
## After data pre-processing we are left with 696 observations and 12 variables, including the target variable.
## Test for linearity in the data
pairs(absentdata[, setdiff(names(absentdata),
                           c("ID", "Reason.for.absence", "Day.of.the.week"))])

## Data is not linear. So, linear models will not be a good choice for this data.
######################## End of Data Preprocessing ############################
# Total absenteeism hours per reason code, expressed as a percentage of all
# absenteeism hours and sorted from the largest share to the smallest.
Reasons <- aggregate(absentdata$Absenteeism.time.in.hours,
                     by = list(Category = absentdata$Reason.for.absence), FUN = sum)
#print(as.data.frame(Reasons))
Reasons$Absence <- (Reasons$x / sum(absentdata$Absenteeism.time.in.hours)) * 100
# TRUE is spelled out: T is an ordinary variable that can be reassigned.
Reasons <- Reasons[order(Reasons$Absence, decreasing = TRUE), ]
#print(Reasons)
barplot(Reasons$Absence, names.arg = Reasons$Category, xlab = "Reason for absence", ylab = "Absence", col = "dark grey",
        main = "How much proportion each reason code plays in absenteeism")

## Taking backup of preprocessed data
#write.csv(modeldata, "modeldata.csv", row.names = F)
Model building using Machine Learning Algorithms.
### We will check how many groups there are in the data set by means of K-means clustering.
# Cluster on the numeric attributes only: the identifier and the two factor
# columns are dropped by name rather than by position.
modeldata <- absentdata[, setdiff(names(absentdata),
                                  c("ID", "Reason.for.absence", "Day.of.the.week"))]
# scale() centres every column and divides it by its standard deviation.
df <- scale(modeldata)
## NbClust method
## wssplot(): elbow-method helper. Plots the within-cluster sum of squares
## against the number of clusters k = 1..nc.
##   data : numeric matrix or data frame, one observation per row.
##   nc   : largest number of clusters to try (default 20); must be at least 1.
##   seed : RNG seed set before every kmeans() run (default 1234).
## Returns the vector of within-cluster sums of squares, invisibly.
wssplot <- function(data, nc = 20, seed = 1234) {
  stopifnot(is.numeric(nc), length(nc) == 1, nc >= 1)
  wss <- numeric(nc)
  # k = 1: total sum of squares around the column means.
  wss[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
  # seq_len(nc)[-1] is empty when nc is 1, whereas 2:nc would count down 2, 1.
  for (i in seq_len(nc)[-1]) {
    set.seed(seed)
    # The component is spelled out in full; `$withins` only resolved to
    # `withinss` through partial matching.
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
  invisible(wss)
}
# Elbow plot for the scaled data.
wssplot(data = df)

# Run NbClust with k-means for k = 3..20; the seed is fixed first.
set.seed(1234)
nc <- NbClust(data = df, min.nc = 3, max.nc = 20, method = "kmeans")

## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##

## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 7 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 4 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 2 proposed 8 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 2 proposed 17 as the best number of clusters
## * 1 proposed 19 as the best number of clusters
## * 2 proposed 20 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
# Votes per candidate number of clusters across the NbClust indices.
barplot(table(nc$Best.nc[1, ]))
## According to the NbClust method, 3 would be the optimal value of K.

set.seed(1234)
### Elbow method: K = 3 is the optimal value
fviz_nbclust(x = df, FUNcluster = kmeans, method = "wss")

## Average silhouette method: k = 9 is the optimal value
set.seed(1234)
fviz_nbclust(x = df, FUNcluster = kmeans, method = "silhouette")

## Comparison of k-values
# One seed for the four fits, run in this order.
set.seed(1234)
k3 <- kmeans(x = df, centers = 3, nstart = 25)
k4 <- kmeans(x = df, centers = 4, nstart = 25)
k7 <- kmeans(x = df, centers = 7, nstart = 25)
k9 <- kmeans(x = df, centers = 9, nstart = 25)
# Cluster plots to compare, shown in a 2 x 2 grid
p1 <- fviz_cluster(k3, data = df, geom = "point") + ggtitle("k = 3")
p2 <- fviz_cluster(k4, data = df, geom = "point") + ggtitle("k = 4")
p3 <- fviz_cluster(k7, data = df, geom = "point") + ggtitle("k = 7")
p4 <- fviz_cluster(k9, data = df, geom = "point") + ggtitle("k = 9")
grid.arrange(p1, p2, p3, p4, nrow = 2)

# Compute the final k-means clustering with k = 3 (25 random starts, fixed seed).
set.seed(1234)
final <- kmeans(df, 3, nstart = 25)
# The component is named `centers`; `final$center` only worked through partial matching.
final$centers
## Transportation.expense Distance.from.Residence.to.Work Service.time
## 1 -0.3192529 0.9399145 1.0407530
## 2 0.5716311 0.1896419 -0.7329393
## 3 -0.6703766 -1.0206664 0.3776036
## Age Work.load.Average.day Hit.target Height Body.mass.index
## 1 0.6674743 -0.22119948 0.04639312 -0.72124669 1.28823361
## 2 -0.7305156 0.03503184 -0.06509454 -0.07121779 -0.58346249
## 3 0.6586461 0.11282116 0.06867719 0.66439646 -0.05031317
## Absenteeism.time.in.hours
## 1 -0.26202258
## 2 0.04704534
## 3 0.12477083
fviz_cluster(final, data = df)

#print(final)
## Build models the supervised-learning way.
## The response variable is Absenteeism.time.in.hours. It is binned into three categories (low, moderate, high) below, and models are built to predict that class.
modeldata <- absentdata
# Frequency of every distinct absenteeism duration.
temp_table <- table(as.factor(modeldata[["Absenteeism.time.in.hours"]]))
barplot(temp_table, main = "Absenteeism frequency", xlab = "Absenteeism in Hours", ylab = "Frequency")

## The most frequent absence duration is 8 hours, i.e. a full-day absence is the common trend.
## According to the K-means clustering, the data splits into 3 groups. So three class labels are created for Absenteeism.time.in.hours: low for 1 to 4 hours, moderate for more than 4 and up to 8 hours, and high otherwise.
# NOTE(review): this two-level label is not read by any statement visible in this
# section; it is kept unchanged in case later code uses it.
absentgroup <- ifelse((modeldata$Absenteeism.time.in.hours >= 1 & modeldata$Absenteeism.time.in.hours <=4), "low", "high")
tempdata <- as.integer(as.character(modeldata$Absenteeism.time.in.hours))
# Vectorised labelling. The previous row-by-row loop only worked because the
# first `modeldata$absentgroup[i] <-` assignment was recycled over the whole
# new column, and `1:length(tempdata)` misbehaves on empty input.
modeldata$absentgroup <- ifelse(
  tempdata >= 1 & tempdata <= 4, "low",
  ifelse(tempdata > 4 & tempdata <= 8, "moderate", "high")
)
table(modeldata$absentgroup)
##
## high low moderate
## 63 417 216
modeldata$absentgroup <- factor(modeldata$absentgroup)
## We use the validation-set approach for resampling: 80% of the observations for training and 20% for testing.
## Removing the Absenteeism.time.in.hours and ID attributes (by name, not by position).
modeldata <- modeldata[, setdiff(names(modeldata), c("Absenteeism.time.in.hours", "ID"))]
#smp_size <- floor(0.75 * nrow(modeldata))
## set the seed to make partition reproducible
set.seed(1234)
# floor() makes the training-set size an explicit whole number instead of
# leaving a fractional size to sample(); seq_len() is safe for zero rows.
train_index <- sample(seq_len(nrow(modeldata)), floor(0.8 * nrow(modeldata)))
train <- modeldata[train_index, ]
test <- modeldata[-train_index, ]
# Reference labels of the test set, used to score every model.
test.group <- test$absentgroup
## First model: a simple classification tree fitted with tree() from the "tree" package,
## predicting absentgroup from all remaining columns of the training set.
model_tree <- tree(formula = absentgroup ~ ., data = train)
summary(model_tree)
##
## Classification tree:
## tree(formula = absentgroup ~ ., data = train)
## Variables actually used in tree construction:
## [1] "Reason.for.absence" "Body.mass.index"
## [3] "Transportation.expense" "Hit.target"
## [5] "Height" "Age"
## Number of terminal nodes: 13
## Residual mean deviance: 0.9118 = 495.1 / 543
## Misclassification error rate: 0.1727 = 96 / 556
# Draw the fitted tree and label its splits.
plot(model_tree)
text(model_tree, pretty = 0, cex = 0.8)

# Predicted class for every test observation.
model_tree_pred <- predict(model_tree, newdata = test, type = "class")
#conf_matrix = table(model_tree_pred, test.group)
#model_tree_acu = sum(diag(conf_matrix))/sum(conf_matrix)
print(postResample(pred = model_tree_pred, obs = test.group))
## Accuracy Kappa
## 0.6571429 0.3761604
confusionMatrix(data = model_tree_pred, reference = test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 3 3 4
## low 3 64 14
## moderate 10 14 25
##
## Overall Statistics
##
## Accuracy : 0.6571
## 95% CI : (0.5723, 0.7352)
## No Information Rate : 0.5786
## P-Value [Acc > NIR] : 0.03515
##
## Kappa : 0.3762
## Mcnemar's Test P-Value : 0.46252
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.18750 0.7901 0.5814
## Specificity 0.94355 0.7119 0.7526
## Pos Pred Value 0.30000 0.7901 0.5102
## Neg Pred Value 0.90000 0.7119 0.8022
## Prevalence 0.11429 0.5786 0.3071
## Detection Rate 0.02143 0.4571 0.1786
## Detection Prevalence 0.07143 0.5786 0.3500
## Balanced Accuracy 0.56552 0.7510 0.6670
## Linear Discriminant Analysis: absentgroup modelled on all remaining training columns.
lda.fit <- lda(absentgroup ~ ., data = train)
print(lda.fit)
## Call:
## lda(absentgroup ~ ., data = train)
##
## Prior probabilities of groups:
## high low moderate
## 0.08453237 0.60431655 0.31115108
##
## Group means:
## Reason.for.absence2 Reason.for.absence3 Reason.for.absence4
## high 0.0212766 0.000000000 0.000000000
## low 0.0000000 0.000000000 0.002976190
## moderate 0.0000000 0.005780347 0.005780347
## Reason.for.absence5 Reason.for.absence6 Reason.for.absence7
## high 0.00000000 0.02127660 0.04255319
## low 0.00297619 0.00297619 0.01190476
## moderate 0.01156069 0.01734104 0.02890173
## Reason.for.absence8 Reason.for.absence9 Reason.for.absence10
## high 0.000000000 0.042553191 0.063829787
## low 0.008928571 0.000000000 0.008928571
## moderate 0.011560694 0.005780347 0.080924855
## Reason.for.absence11 Reason.for.absence12 Reason.for.absence13
## high 0.08510638 0.063829787 0.19148936
## low 0.02976190 0.005952381 0.03869048
## moderate 0.04624277 0.000000000 0.09248555
## Reason.for.absence14 Reason.for.absence15 Reason.for.absence16
## high 0.04255319 0.000000000 0.000000000
## low 0.02380952 0.000000000 0.008928571
## moderate 0.02890173 0.005780347 0.000000000
## Reason.for.absence17 Reason.for.absence18 Reason.for.absence19
## high 0.000000000 0.02127660 0.23404255
## low 0.000000000 0.00297619 0.01785714
## moderate 0.005780347 0.06936416 0.08670520
## Reason.for.absence21 Reason.for.absence22 Reason.for.absence23
## high 0.000000000 0.021276596 0.04255319
## low 0.005952381 0.008928571 0.31250000
## moderate 0.017341040 0.173410405 0.05202312
## Reason.for.absence24 Reason.for.absence25 Reason.for.absence26
## high 0.00000000 0.00000000 0.02127660
## low 0.00000000 0.06845238 0.02083333
## moderate 0.01734104 0.01734104 0.12716763
## Reason.for.absence27 Reason.for.absence28 Day.of.the.week3
## high 0.0000000 0.04255319 0.1914894
## low 0.1636905 0.25000000 0.2142857
## moderate 0.0000000 0.02312139 0.2080925
## Day.of.the.week4 Day.of.the.week5 Day.of.the.week6
## high 0.2553191 0.1702128 0.06382979
## low 0.1696429 0.1934524 0.23214286
## moderate 0.2196532 0.1502890 0.16763006
## Transportation.expense Distance.from.Residence.to.Work
## high 225.7447 26.93617
## low 203.9077 30.15476
## moderate 250.0694 29.93642
## Service.time Age Work.load.Average.day Hit.target Height
## high 12.68085 35.80851 286.3948 94.72340 171.6809
## low 12.90476 36.44494 266.8456 94.85417 170.9360
## moderate 11.74277 35.65607 274.2928 94.97110 171.0838
## Body.mass.index
## high 25.74468
## low 26.56548
## moderate 26.74566
##
## Coefficients of linear discriminants:
## LD1 LD2
## Reason.for.absence2 0.4825247666 -8.8433185127
## Reason.for.absence3 -0.0332466691 1.4400404680
## Reason.for.absence4 1.9721431796 0.5578624263
## Reason.for.absence5 0.6587052967 0.6138922986
## Reason.for.absence6 0.5934601018 -0.9185998238
## Reason.for.absence7 1.3615778466 -1.1293384262
## Reason.for.absence8 1.8079169709 0.3248714944
## Reason.for.absence9 -0.0837661785 -5.4164221088
## Reason.for.absence10 0.3669252281 -0.3372586633
## Reason.for.absence11 1.6590260064 -1.1788767276
## Reason.for.absence12 1.4968567203 -5.2848492973
## Reason.for.absence13 1.1833600886 -1.6410287542
## Reason.for.absence14 2.0115292277 -1.1175940256
## Reason.for.absence15 -0.4710651836 0.8465044508
## Reason.for.absence16 3.6913390966 -0.4305365211
## Reason.for.absence17 -0.3338521155 1.8149828314
## Reason.for.absence18 -0.1225305400 0.4216789577
## Reason.for.absence19 0.5794519426 -2.5508155148
## Reason.for.absence21 1.2317651664 0.7256574734
## Reason.for.absence22 0.2366217296 0.5913152240
## Reason.for.absence23 3.3836776656 -0.5781132636
## Reason.for.absence24 0.0006886533 1.2486003490
## Reason.for.absence25 3.2146943220 -0.2736031074
## Reason.for.absence26 0.7475143736 0.3218743778
## Reason.for.absence27 3.6095836480 -0.5492118374
## Reason.for.absence28 3.4855830349 -0.7054934841
## Day.of.the.week3 -0.0320237693 0.4249944232
## Day.of.the.week4 -0.0781944490 0.1230870433
## Day.of.the.week5 0.0953873608 0.3752321668
## Day.of.the.week6 0.1574424590 0.4888155395
## Transportation.expense -0.0043092780 0.0004016558
## Distance.from.Residence.to.Work 0.0041536634 0.0070208063
## Service.time -0.0356503696 -0.0813113277
## Age 0.0185138276 0.0238804089
## Work.load.Average.day -0.0012479059 -0.0027344620
## Hit.target 0.0016184504 0.0066126378
## Height 0.0139270637 -0.0330145373
## Body.mass.index -0.0086373798 0.0419995730
##
## Proportion of trace:
## LD1 LD2
## 0.8515 0.1485
#summary(lda.fit)
# Training observations on the discriminants, coloured by class code.
plot(lda.fit, col = as.integer(train[["absentgroup"]]))

# First discriminant only.
plot(lda.fit, dimen = 1, type = 'b')

# Predict the test set; the predicted class is stored as an extra column of `test`.
lda.test <- predict(lda.fit, newdata = test)
test[["lda"]] <- lda.test[["class"]]
table(test[["lda"]], test[["absentgroup"]])
##
## high low moderate
## high 7 4 3
## low 2 66 15
## moderate 7 11 25
# Accuracy and kappa of the LDA predictions against the test labels.
print(postResample(pred = test[["lda"]], obs = test.group))
## Accuracy Kappa
## 0.7000000 0.4557571
confusionMatrix(data = test[["lda"]], reference = test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 7 4 3
## low 2 66 15
## moderate 7 11 25
##
## Overall Statistics
##
## Accuracy : 0.7
## 95% CI : (0.6168, 0.7745)
## No Information Rate : 0.5786
## P-Value [Acc > NIR] : 0.002055
##
## Kappa : 0.4558
## Mcnemar's Test P-Value : 0.410170
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.4375 0.8148 0.5814
## Specificity 0.9435 0.7119 0.8144
## Pos Pred Value 0.5000 0.7952 0.5814
## Neg Pred Value 0.9286 0.7368 0.8144
## Prevalence 0.1143 0.5786 0.3071
## Detection Rate 0.0500 0.4714 0.1786
## Detection Prevalence 0.1000 0.5929 0.3071
## Balanced Accuracy 0.6905 0.7633 0.6979
# Histograms of the first discriminant score, one panel per true class.
ldahist(data = lda.test$x[, 1], g = test.group)

# Test observations on the two discriminants, labelled with their true class.
plot(lda.test$x[, 1], lda.test$x[, 2])
# Colours are indexed by the class code so each class gets one colour; passing
# the bare three-colour vector recycled it by row position.
text(lda.test$x[, 1], lda.test$x[, 2], test$absentgroup, cex = 0.7, pos = 4,
     col = c("red", "green", "blue")[as.integer(test$absentgroup)])

# Remove the LDA prediction column from the test set by name rather than by
# position 12.
test$lda <- NULL
## Random forest
set.seed(1234)
# Default settings; the knitted output below reports 3 variables tried per split
# and an OOB error rate of 27.88%.
rf.fit <- randomForest(absentgroup~., data = train, importance = TRUE)
print(rf.fit)
##
## Call:
## randomForest(formula = absentgroup ~ ., data = train, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 27.88%
## Confusion matrix:
## high low moderate class.error
## high 1 15 31 0.9787234
## low 4 290 42 0.1369048
## moderate 12 51 110 0.3641618
# rf.fit.pred <- predict(rf.fit, test, type = "class")
# Tuned forest: 6 variables tried at each split instead of the default 3
# (OOB error 26.44% in the recorded run). No new seed is set here, so the fit
# continues from the RNG state left by the previous forest.
rf.fit1 <- randomForest(
  absentgroup ~ .,
  data = train,
  mtry = 6,
  ntree = 500,
  importance = TRUE
)
rf.fit1
##
## Call:
## randomForest(formula = absentgroup ~ ., data = train, ntree = 500, mtry = 6, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 26.44%
## Confusion matrix:
## high low moderate class.error
## high 3 11 33 0.9361702
## low 6 286 44 0.1488095
## moderate 14 39 120 0.3063584
# Class predictions of the mtry = 6 forest for the rows of `test`.
rf.fit1.pred <- predict(rf.fit1, newdata = test, type = "class")
# Accuracy and kappa of those predictions against test.group.
print(postResample(obs = test.group, pred = rf.fit1.pred))
## Accuracy Kappa
## 0.7000000 0.4339078
# Confusion matrix plus overall and per-class statistics, mtry = 6 forest.
confusionMatrix(data = rf.fit1.pred, reference = test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 5 1 1
## low 2 69 18
## moderate 9 11 24
##
## Overall Statistics
##
## Accuracy : 0.7
## 95% CI : (0.6168, 0.7745)
## No Information Rate : 0.5786
## P-Value [Acc > NIR] : 0.002055
##
## Kappa : 0.4339
## Mcnemar's Test P-Value : 0.038033
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.31250 0.8519 0.5581
## Specificity 0.98387 0.6610 0.7938
## Pos Pred Value 0.71429 0.7753 0.5455
## Neg Pred Value 0.91729 0.7647 0.8021
## Prevalence 0.11429 0.5786 0.3071
## Detection Rate 0.03571 0.4929 0.1714
## Detection Prevalence 0.05000 0.6357 0.3143
## Balanced Accuracy 0.64819 0.7564 0.6760
# Per-class and overall variable importance (MeanDecreaseAccuracy and
# MeanDecreaseGini columns) of the mtry = 6 forest.
importance(x = rf.fit1)
## high low moderate
## Reason.for.absence 16.4707898 91.958915 52.722060264
## Day.of.the.week -2.6444497 -2.101536 -6.290773639
## Transportation.expense 2.4960105 22.916939 10.265611666
## Distance.from.Residence.to.Work -1.2800162 3.458159 0.688356662
## Service.time -2.3836102 10.839848 1.648585822
## Age -2.4761537 10.701084 -0.175352608
## Work.load.Average.day -5.1675358 4.720620 -1.093288773
## Hit.target 2.4093100 -1.412450 -3.307227354
## Height 1.8859505 8.974606 -0.006048795
## Body.mass.index 0.3430315 8.863048 -0.624923531
## MeanDecreaseAccuracy MeanDecreaseGini
## Reason.for.absence 97.5505864 135.22912
## Day.of.the.week -6.6601649 23.70837
## Transportation.expense 24.0868828 24.48099
## Distance.from.Residence.to.Work 2.9637681 10.19540
## Service.time 10.8946338 11.44768
## Age 8.9082448 11.17669
## Work.load.Average.day 0.9409773 31.24681
## Hit.target -2.4529034 21.27021
## Height 8.0866204 10.23405
## Body.mass.index 8.2537315 10.04194
# Dot charts of the variable importance measures of the mtry = 6 forest.
varImpPlot(rf.fit1)

# Compare the test-set accuracy of forests grown with different mtry values.
# The candidates are 3:8 so that each fit has a valid slot in `a` and the six
# accuracies line up with the x-axis used when they are plotted against 3:8.
# (The loop previously started at mtry = 1 while storing at index i - 2; the
# mtry = 1 and 2 fits mapped to indices -1 and 0, which are not storage
# positions, so those two forests were grown without being recorded.)
# NOTE(review): dropping the two wasted fits shifts the RNG stream, so the
# accuracies may differ slightly from a previously recorded run.
# NOTE(review): mtry is being selected on the test set, which makes the test
# accuracy of the chosen model optimistic; OOB error or cross-validation on
# `train` would avoid that.
mtry_values <- 3:8
a <- numeric(length(mtry_values))
for (k in seq_along(mtry_values)) {
  rf.fit2 <- randomForest(
    absentgroup ~ .,
    data = train,
    ntree = 500,
    mtry = mtry_values[k],
    importance = TRUE
  )
  rf.fit2.pred <- predict(rf.fit2, test, type = "class")
  # Share of test rows whose predicted group equals the true group.
  a[k] <- mean(rf.fit2.pred == test.group)
}
a
## [1] 0.7071429 0.7214286 0.7071429 0.7071429 0.6928571 0.6857143
# Test accuracy (stored in `a`) against the mtry values 3 to 8, drawn as
# connected points.
plot(x = 3:8, y = a, type = "b")

# Forest with 5 variables tried at each split, then its test-set accuracy.
# NOTE(review): in the recorded run the highest value in `a` was its second
# entry (mtry = 4), yet mtry = 5 is used here -- confirm the intended choice.
rf.fit5 <- randomForest(
  absentgroup ~ .,
  data = train,
  mtry = 5,
  ntree = 500,
  importance = TRUE
)
rf.fit5.pred <- predict(rf.fit5, newdata = test, type = "class")
print(postResample(obs = test.group, pred = rf.fit5.pred))
## Accuracy Kappa
## 0.7071429 0.4439062
# Confusion matrix plus overall and per-class statistics, mtry = 5 forest.
confusionMatrix(data = rf.fit5.pred, reference = test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 5 0 1
## low 2 70 18
## moderate 9 11 24
##
## Overall Statistics
##
## Accuracy : 0.7071
## 95% CI : (0.6243, 0.7809)
## No Information Rate : 0.5786
## P-Value [Acc > NIR] : 0.001151
##
## Kappa : 0.4439
## Mcnemar's Test P-Value : 0.017819
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.31250 0.8642 0.5581
## Specificity 0.99194 0.6610 0.7938
## Pos Pred Value 0.83333 0.7778 0.5455
## Neg Pred Value 0.91791 0.7800 0.8021
## Prevalence 0.11429 0.5786 0.3071
## Detection Rate 0.03571 0.5000 0.1714
## Detection Prevalence 0.04286 0.6429 0.3143
## Balanced Accuracy 0.65222 0.7626 0.6760
## Building the Classification Tree Models using Quinlan's C5.0 algorithm
# Boosted C5.0 classification tree with 10 boosting trials.
# The predictor set is built by excluding the outcome column by name instead
# of by its position, so the outcome cannot end up among the predictors if the
# column order of `train` ever changes.
c50.fit <- C5.0(
  x = train[setdiff(names(train), "absentgroup")],
  y = train$absentgroup,
  trials = 10
)
# Prints every boosting trial's tree, the training-set error per trial and
# the attribute usage.
summary(c50.fit)
##
## Call:
## C5.0.default(x = train[-11], y = train$absentgroup, trials = 10)
##
##
## C5.0 [Release 2.07 GPL Edition] Thu Feb 14 19:35:53 2019
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 556 cases (11 attributes) from undefined.data
##
## ----- Trial 0: -----
##
## Decision tree:
##
## Reason.for.absence in {1,3,5,6,10,15,17,18,21,22,24}: moderate (102/20)
## Reason.for.absence in {2,9}: high (4/1)
## Reason.for.absence in {4,16,23,25,27,28}: low (292/21)
## Reason.for.absence = 8:
## :...Hit.target <= 95: moderate (2)
## : Hit.target > 95: low (3)
## Reason.for.absence = 12:
## :...Hit.target <= 94: low (2)
## : Hit.target > 94: high (3)
## Reason.for.absence = 26:
## :...Age <= 43: moderate (28/6)
## : Age > 43: low (2)
## Reason.for.absence = 7:
## :...Service.time <= 9: moderate (7/2)
## : Service.time > 9:
## : :...Transportation.expense <= 260: low (2)
## : Transportation.expense > 260: high (2)
## Reason.for.absence = 14:
## :...Day.of.the.week = 2: high (2/1)
## : Day.of.the.week in {4,5}: low (5/1)
## : Day.of.the.week = 3:
## : :...Transportation.expense <= 260: low (2)
## : : Transportation.expense > 260: moderate (2)
## : Day.of.the.week = 6:
## : :...Transportation.expense <= 246: moderate (2)
## : Transportation.expense > 246: low (2)
## Reason.for.absence = 11:
## :...Transportation.expense <= 157: high (2)
## : Transportation.expense > 157:
## : :...Work.load.Average.day <= 284.853: low (10/2)
## : Work.load.Average.day > 284.853:
## : :...Work.load.Average.day <= 343.253: moderate (7/2)
## : Work.load.Average.day > 343.253: low (3/1)
## Reason.for.absence = 13:
## :...Body.mass.index <= 21: low (7/1)
## : Body.mass.index > 21:
## : :...Day.of.the.week in {3,6}: moderate (10/3)
## : Day.of.the.week = 2:
## : :...Service.time <= 13: high (2)
## : : Service.time > 13: low (2/1)
## : Day.of.the.week = 4:
## : :...Hit.target <= 95: moderate (6/2)
## : : Hit.target > 95: high (4/1)
## : Day.of.the.week = 5:
## : :...Distance.from.Residence.to.Work <= 27: low (4)
## : Distance.from.Residence.to.Work > 27: moderate (3/1)
## Reason.for.absence = 19:
## :...Height > 175: low (7/2)
## Height <= 175:
## :...Transportation.expense <= 118: high (3/1)
## Transportation.expense > 118:
## :...Hit.target <= 88: high (2)
## Hit.target > 88: moderate (20/5)
##
## ----- Trial 1: -----
##
## Decision tree:
##
## Reason.for.absence in {5,8,16,21,23,25,27,28}:
## :...Transportation.expense <= 291: low (260.4/41.2)
## : Transportation.expense > 291: moderate (15/5.5)
## Reason.for.absence in {1,2,3,4,6,7,9,10,11,12,13,14,15,17,18,19,22,24,26}:
## :...Transportation.expense <= 235:
## :...Day.of.the.week = 5: moderate (20.5/10.3)
## : Day.of.the.week = 6: low (18.2/7.1)
## : Day.of.the.week = 3:
## : :...Body.mass.index <= 28: high (22.1/10.3)
## : : Body.mass.index > 28: low (12.6/6.3)
## : Day.of.the.week = 4:
## : :...Hit.target <= 88: moderate (5.5/2.4)
## : : Hit.target > 88: low (29.3/11.1)
## : Day.of.the.week = 2:
## : :...Transportation.expense > 184: moderate (21.3/7.9)
## : Transportation.expense <= 184:
## : :...Hit.target <= 94: moderate (6.3/1.6)
## : Hit.target > 94: high (18.2/8.7)
## Transportation.expense > 235:
## :...Age > 41: low (9.5/3.9)
## Age <= 41:
## :...Age > 34: moderate (50.6/19)
## Age <= 34:
## :...Reason.for.absence in {1,3,4,6,9,10,11,12,14,15,17,18,22,24,
## : 26}: moderate (43.4/11.9)
## Reason.for.absence in {2,7,13,19}: high (22.9/8.7)
##
## ----- Trial 2: -----
##
## Decision tree:
##
## Reason.for.absence in {16,23,27,28}: low (213.9/46.4)
## Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,
## : 25,26}:
## :...Work.load.Average.day > 313.532:
## :...Body.mass.index <= 28:
## : :...Transportation.expense <= 279: high (35.6/15.2)
## : : Transportation.expense > 279: moderate (4)
## : Body.mass.index > 28:
## : :...Service.time <= 13: moderate (8.6/4)
## : Service.time > 13: low (21/11.7)
## Work.load.Average.day <= 313.532:
## :...Transportation.expense > 233:
## :...Reason.for.absence in {1,3,4,5,6,8,11,12,13,14,15,17,18,21,22,24,
## : : 25,26}: moderate (99.4/33.5)
## : Reason.for.absence in {2,7,9,10,19}:
## : :...Body.mass.index <= 24: high (11.9/2)
## : Body.mass.index > 24: moderate (20.4/7.4)
## Transportation.expense <= 233:
## :...Day.of.the.week = 3:
## :...Body.mass.index > 29: low (6/1.3)
## : Body.mass.index <= 29:
## : :...Service.time <= 11: moderate (12.9/2)
## : Service.time > 11: high (13.3/4.5)
## Day.of.the.week in {2,4,5,6}:
## :...Height <= 170:
## :...Transportation.expense > 189: low (20.1/3.3)
## : Transportation.expense <= 189:
## : :...Work.load.Average.day <= 222.196: low (4.2)
## : Work.load.Average.day > 222.196: moderate (24/8.7)
## Height > 170:
## :...Distance.from.Residence.to.Work <= 11: low (17.3/5.8)
## Distance.from.Residence.to.Work > 11:
## :...Hit.target <= 88: high (4.9/2)
## Hit.target > 88:
## :...Distance.from.Residence.to.Work <= 12: high (5.9/2)
## Distance.from.Residence.to.Work > 12: low (32.5/10.1)
##
## ----- Trial 3: -----
##
## Decision tree:
##
## Reason.for.absence in {16,23,25,27,28}:
## :...Transportation.expense > 291: moderate (17.1/7.8)
## : Transportation.expense <= 291:
## : :...Height <= 167: moderate (19.3/9.1)
## : Height > 167:
## : :...Work.load.Average.day > 268.519: low (48.5)
## : Work.load.Average.day <= 268.519:
## : :...Transportation.expense > 118: low (115.8/26.2)
## : Transportation.expense <= 118:
## : :...Age <= 48: moderate (20/7.4)
## : Age > 48: low (5.7)
## Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,
## : 26}:
## :...Work.load.Average.day > 284.853:
## :...Transportation.expense <= 248:
## : :...Distance.from.Residence.to.Work <= 11: low (8.7/4)
## : : Distance.from.Residence.to.Work > 11: moderate (62.4/28)
## : Transportation.expense > 248:
## : :...Transportation.expense > 300: moderate (5)
## : Transportation.expense <= 300:
## : :...Hit.target <= 94: high (7.8/1.1)
## : Hit.target > 94:
## : :...Day.of.the.week in {2,5,6}: high (15.2/5.3)
## : Day.of.the.week in {3,4}: moderate (7.6)
## Work.load.Average.day <= 284.853:
## :...Distance.from.Residence.to.Work <= 11:
## :...Hit.target <= 88: low (4.5)
## : Hit.target > 88: moderate (32.3/9.2)
## Distance.from.Residence.to.Work > 11:
## :...Distance.from.Residence.to.Work <= 12: high (8.6/1.8)
## Distance.from.Residence.to.Work > 12:
## :...Hit.target <= 88: moderate (10/3.6)
## Hit.target > 88:
## :...Height <= 168: moderate (28.9/10.8)
## Height > 168:
## :...Age > 40:
## :...Height <= 171: moderate (11.2/4.9)
## : Height > 171: low (20.1/8.7)
## Age <= 40:
## :...Day.of.the.week in {3,4}: moderate (38.6/19.1)
## Day.of.the.week in {5,6}: low (33.6/13.6)
## Day.of.the.week = 2:
## :...Height <= 171: low (22/5.2)
## Height > 171: high (13/6.8)
##
## ----- Trial 4: -----
##
## Decision tree:
##
## Reason.for.absence in {1,2,3,6,7,9,10,11,12,13,14,15,17,18,19,22,24}:
## :...Reason.for.absence in {1,3,6,7,10,15,17,18,22,24}: moderate (120.6/46.2)
## : Reason.for.absence in {2,9,12}: high (13.4/4)
## : Reason.for.absence = 14: low (17.5/9.1)
## : Reason.for.absence = 11:
## : :...Transportation.expense <= 157: high (2.7)
## : : Transportation.expense > 157:
## : : :...Work.load.Average.day <= 284.853: low (14.9/3)
## : : Work.load.Average.day > 284.853: moderate (14/6.9)
## : Reason.for.absence = 13:
## : :...Body.mass.index <= 21: low (8/2.2)
## : : Body.mass.index > 21:
## : : :...Age > 43: low (4.1/1.6)
## : : Age <= 43:
## : : :...Work.load.Average.day <= 313.532: moderate (19.1/7.6)
## : : Work.load.Average.day > 313.532: high (23.3/7.7)
## : Reason.for.absence = 19:
## : :...Height > 175: high (9.2/3.9)
## : Height <= 175:
## : :...Transportation.expense <= 118: high (4.6/2.1)
## : Transportation.expense > 118:
## : :...Age <= 30: moderate (5.6)
## : Age > 30: high (25.2/10.4)
## Reason.for.absence in {4,5,8,16,21,23,25,26,27,28}:
## :...Day.of.the.week in {3,4,6}:
## :...Reason.for.absence in {4,16,21,27}: low (24.3)
## : Reason.for.absence in {5,8,23,25,26,28}:
## : :...Body.mass.index <= 24: low (41.7/6.8)
## : Body.mass.index > 24:
## : :...Height > 175: low (18.4/2.7)
## : Height <= 175:
## : :...Day.of.the.week = 3: low (30.5/10.1)
## : Day.of.the.week = 4:
## : :...Service.time <= 12: low (3.9)
## : : Service.time > 12: moderate (21.6/6.8)
## : Day.of.the.week = 6:
## : :...Reason.for.absence in {5,25,26,28}: moderate (20.5/5)
## : Reason.for.absence in {8,23}: low (7.3)
## Day.of.the.week in {2,5}:
## :...Distance.from.Residence.to.Work <= 14: low (23/3.9)
## Distance.from.Residence.to.Work > 14:
## :...Height <= 167: low (13.7/4.4)
## Height > 167:
## :...Transportation.expense > 330: low (3/1.5)
## Transportation.expense <= 330:
## :...Reason.for.absence in {25,27}: low (15.9)
## Reason.for.absence in {4,5,8,16,21,26}: moderate (12.5/4.1)
## Reason.for.absence in {23,28}:
## :...Work.load.Average.day > 265.017: low (7.8)
## Work.load.Average.day <= 265.017:
## :...Work.load.Average.day <= 239.554: low (4.4)
## Work.load.Average.day > 239.554: high (25.3/10.2)
##
## ----- Trial 5: -----
##
## Decision tree:
##
## Reason.for.absence in {4,5,8,16,21,23,25,27,28}:
## :...Body.mass.index <= 19: low (14.1/3.2)
## : Body.mass.index > 19:
## : :...Reason.for.absence in {4,8,16,27,28}: low (94/21.4)
## : Reason.for.absence in {5,21}: moderate (8.7/3.1)
## : Reason.for.absence = 25:
## : :...Work.load.Average.day <= 241.476: moderate (13.1/5.2)
## : : Work.load.Average.day > 241.476: low (11.2)
## : Reason.for.absence = 23:
## : :...Service.time <= 4: low (9.7/5.9)
## : Service.time > 4:
## : :...Hit.target <= 98: low (60.6/13.6)
## : Hit.target > 98: moderate (17.1/4.6)
## Reason.for.absence in {1,2,3,6,7,9,10,11,12,13,14,15,17,18,19,22,24,26}:
## :...Reason.for.absence in {1,3,6,14,15,17,18,22,24}: moderate (95.1/42.3)
## Reason.for.absence in {2,9,12}: high (13.5/5.5)
## Reason.for.absence = 7: low (16.3/8.6)
## Reason.for.absence = 10:
## :...Body.mass.index <= 23: high (10.1/3.4)
## : Body.mass.index > 23: moderate (19/9.1)
## Reason.for.absence = 11:
## :...Hit.target <= 93: low (6.7/0.4)
## : Hit.target > 93:
## : :...Body.mass.index <= 22: high (5.8/1.3)
## : Body.mass.index > 22: moderate (18.8/9.2)
## Reason.for.absence = 19:
## :...Height > 175: low (10.8/4.5)
## : Height <= 175:
## : :...Transportation.expense <= 118: low (4.8/2.2)
## : Transportation.expense > 118: moderate (33.1/12.6)
## Reason.for.absence = 13:
## :...Body.mass.index <= 21: low (7.6/2.7)
## : Body.mass.index > 21:
## : :...Transportation.expense <= 118: low (4.8/2)
## : Transportation.expense > 118:
## : :...Age <= 36: moderate (17.5/8.1)
## : Age > 36:
## : :...Transportation.expense > 246: low (2.8)
## : Transportation.expense <= 246:
## : :...Work.load.Average.day <= 308.593: moderate (8.5/2.7)
## : Work.load.Average.day > 308.593: high (14.1/4.6)
## Reason.for.absence = 26:
## :...Work.load.Average.day > 330.061: high (2.6)
## Work.load.Average.day <= 330.061:
## :...Distance.from.Residence.to.Work > 42: moderate (2.8)
## Distance.from.Residence.to.Work <= 42:
## :...Age > 43: low (2.6)
## Age <= 43:
## :...Work.load.Average.day > 275.312: moderate (5)
## Work.load.Average.day <= 275.312:
## :...Work.load.Average.day <= 237.656: moderate (3.1)
## Work.load.Average.day > 237.656: low (22.2/8)
##
## ----- Trial 6: -----
##
## Decision tree:
##
## Reason.for.absence in {4,5,8,16,21,23,25,27,28}:
## :...Transportation.expense <= 291: low (199.2/55.5)
## : Transportation.expense > 291: moderate (18.4/8.1)
## Reason.for.absence in {1,2,3,6,7,9,10,11,12,13,14,15,17,18,19,22,24,26}:
## :...Reason.for.absence in {1,3,7,9,15,17,18,24}: moderate (60.3/28.9)
## Reason.for.absence in {2,6,19}: high (59.8/31.5)
## Reason.for.absence in {11,12}: low (39.6/21.3)
## Reason.for.absence = 14:
## :...Body.mass.index <= 25: moderate (14.1/8.1)
## : Body.mass.index > 25: low (9/0.9)
## Reason.for.absence = 26:
## :...Work.load.Average.day <= 330.061: moderate (36.6/14.7)
## : Work.load.Average.day > 330.061: high (2.3)
## Reason.for.absence = 10:
## :...Hit.target <= 91: low (6.9/1.5)
## : Hit.target > 91:
## : :...Work.load.Average.day <= 222.196: low (3.4/0.4)
## : Work.load.Average.day > 222.196: moderate (20.2/8.3)
## Reason.for.absence = 22:
## :...Height <= 167: high (3.8/0.7)
## : Height > 167:
## : :...Hit.target <= 95: moderate (6.6)
## : Hit.target > 95: low (19.3/8.5)
## Reason.for.absence = 13:
## :...Body.mass.index <= 21: low (7.4/3.1)
## Body.mass.index > 21:
## :...Hit.target > 99: low (2.4)
## Hit.target <= 99:
## :...Transportation.expense <= 118: moderate (4.9/2.4)
## Transportation.expense > 118: high (40.8/20.8)
##
## ----- Trial 7: -----
##
## Decision tree:
##
## Reason.for.absence in {16,23,25,27,28}:
## :...Distance.from.Residence.to.Work > 42: low (41.2/1.6)
## : Distance.from.Residence.to.Work <= 42:
## : :...Hit.target <= 97: low (78.8/13.2)
## : Hit.target > 97:
## : :...Height > 175: low (11.9/2.9)
## : Height <= 175:
## : :...Height > 172: moderate (4.7)
## : Height <= 172:
## : :...Distance.from.Residence.to.Work <= 27: low (17/3.4)
## : Distance.from.Residence.to.Work > 27: moderate (12.4/2.2)
## Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,
## : 26}:
## :...Work.load.Average.day > 284.853:
## :...Body.mass.index > 27:
## : :...Transportation.expense <= 268: low (35.6/17.4)
## : : Transportation.expense > 268: moderate (17.8/7.8)
## : Body.mass.index <= 27:
## : :...Day.of.the.week in {3,5,6}: moderate (33.7/11)
## : Day.of.the.week = 4: high (10.8/3.9)
## : Day.of.the.week = 2:
## : :...Transportation.expense <= 184: high (8.8/1)
## : Transportation.expense > 184: moderate (14.3/5)
## Work.load.Average.day <= 284.853:
## :...Service.time <= 10:
## :...Work.load.Average.day <= 230.29: moderate (9.3/4)
## : Work.load.Average.day > 230.29:
## : :...Reason.for.absence in {6,7,10,11,12,13,14}: low (46/14.2)
## : Reason.for.absence in {1,2,3,4,5,8,9,15,17,18,19,21,22,24,
## : 26}: moderate (45.2/18.2)
## Service.time > 10:
## :...Service.time <= 11: moderate (13.8/2.7)
## Service.time > 11:
## :...Distance.from.Residence.to.Work > 42: low (16/6.5)
## Distance.from.Residence.to.Work <= 42:
## :...Height > 172:
## :...Age <= 37: high (11.8/1.5)
## : Age > 37: low (15.8/4.9)
## Height <= 172:
## :...Reason.for.absence in {1,2,3,4,5,8,9,11,13,14,15,17,18,
## : 21,22,24,
## : 26}: moderate (69.1/33.8)
## Reason.for.absence in {6,7,10,12,19}:
## :...Work.load.Average.day <= 265.017: high (23.4/6.7)
## Work.load.Average.day > 265.017: moderate (8.7/2.1)
##
## ----- Trial 8: -----
##
## Decision tree:
##
## Reason.for.absence in {16,23,25,27,28}:
## :...Transportation.expense <= 291: low (124.8/11.9)
## : Transportation.expense > 291: moderate (13.5/5.3)
## Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,
## : 26}:
## :...Transportation.expense <= 248:
## :...Reason.for.absence in {1,2,3,4,5,6,10,15,17,18,21,22,
## : : 24}: moderate (90.8/40.6)
## : Reason.for.absence in {7,8,14}: low (31.4/11.3)
## : Reason.for.absence in {9,12}: high (14.6/5.6)
## : Reason.for.absence = 11:
## : :...Body.mass.index <= 27: high (17.1/6.8)
## : : Body.mass.index > 27: moderate (13.5/6.6)
## : Reason.for.absence = 19:
## : :...Height <= 171: moderate (14.2/2.3)
## : : Height > 171: low (19.2/7)
## : Reason.for.absence = 26:
## : :...Body.mass.index <= 33: low (16.6/5.6)
## : : Body.mass.index > 33: moderate (6.9)
## : Reason.for.absence = 13:
## : :...Service.time <= 11: low (12.5/3.2)
## : Service.time > 11:
## : :...Height <= 165: low (2.4)
## : Height > 165:
## : :...Work.load.Average.day <= 308.593: moderate (12.8/2.5)
## : Work.load.Average.day > 308.593: high (20.7/10.2)
## Transportation.expense > 248:
## :...Height > 172:
## :...Service.time <= 12: moderate (24.1/7.4)
## : Service.time > 12: low (3)
## Height <= 172:
## :...Day.of.the.week = 3: moderate (9.4)
## Day.of.the.week = 5: high (20.8/8.2)
## Day.of.the.week in {2,4,6}:
## :...Hit.target <= 93:
## :...Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
## : : 17,19,21}: high (20.2/2.5)
## : Reason.for.absence in {18,22,24,26}: moderate (7)
## Hit.target > 93:
## :...Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
## : 17,18,21,22,24}: moderate (22.3)
## Reason.for.absence in {19,26}: high (16.3/5.4)
##
## ----- Trial 9: -----
##
## Decision tree:
##
## Reason.for.absence in {1,3,5,6,8,9,10,15,17,18,22,24}: moderate (111.9/37.4)
## Reason.for.absence = 2: high (3.3)
## Reason.for.absence in {4,12,16,21,23,25,27,28}: low (137.2/14.1)
## Reason.for.absence = 7:
## :...Service.time <= 11: moderate (17.9/5.4)
## : Service.time > 11: high (6.1/0.9)
## Reason.for.absence = 14:
## :...Body.mass.index <= 25: high (16.8/9.5)
## : Body.mass.index > 25: low (9.1/0.9)
## Reason.for.absence = 11:
## :...Transportation.expense <= 157: high (3.5)
## : Transportation.expense > 157:
## : :...Work.load.Average.day <= 265.017: low (21/1.6)
## : Work.load.Average.day > 265.017: moderate (27.1/12.7)
## Reason.for.absence = 19:
## :...Height > 175: low (11.2/2.7)
## : Height <= 175:
## : :...Transportation.expense <= 118: high (9.3/2.1)
## : Transportation.expense > 118: moderate (43.7/16.7)
## Reason.for.absence = 26:
## :...Work.load.Average.day > 330.061: high (2.6)
## : Work.load.Average.day <= 330.061:
## : :...Age <= 43: moderate (35.1/6.3)
## : Age > 43: low (3.3)
## Reason.for.absence = 13:
## :...Body.mass.index <= 21: low (4.7)
## Body.mass.index > 21:
## :...Transportation.expense <= 118: low (3.8)
## Transportation.expense > 118:
## :...Hit.target > 99: low (3.3)
## Hit.target <= 99:
## :...Age > 41: low (4.2/0.8)
## Age <= 41:
## :...Work.load.Average.day <= 222.196: high (5.2)
## Work.load.Average.day > 222.196:
## :...Work.load.Average.day <= 313.532: moderate (15.4/1.8)
## Work.load.Average.day > 313.532: high (24.3/10)
##
##
## Evaluation on training data (556 cases):
##
## Trial Decision Tree
## ----- ----------------
## Size Errors
##
## 0 34 74(13.3%)
## 1 15 121(21.8%)
## 2 18 116(20.9%)
## 3 23 134(24.1%)
## 4 30 121(21.8%)
## 5 31 112(20.1%)
## 6 19 137(24.6%)
## 7 22 104(18.7%)
## 8 23 98(17.6%)
## 9 23 91(16.4%)
## boost 54( 9.7%) <<
##
##
## (a) (b) (c) <-classified as
## ---- ---- ----
## 26 5 16 (a): class high
## 2 322 12 (b): class low
## 2 17 154 (c): class moderate
##
##
## Attribute usage:
##
## 100.00% Reason.for.absence
## 100.00% Transportation.expense
## 98.74% Day.of.the.week
## 95.68% Work.load.Average.day
## 95.50% Distance.from.Residence.to.Work
## 95.14% Height
## 88.13% Body.mass.index
## 83.27% Hit.target
## 61.69% Service.time
## 42.99% Age
##
##
## Time: 0.0 secs
# Draw the fitted C5.0 model.
plot(c50.fit)

# Class predictions for the rows of `test`, then accuracy and kappa against
# test.group.
c50.fit.pred <- predict(c50.fit, newdata = test)
print(postResample(obs = test.group, pred = c50.fit.pred))
## Accuracy Kappa
## 0.7071429 0.4427184
# Confusion matrix plus overall and per-class statistics for the C5.0 model.
confusionMatrix(data = c50.fit.pred, reference = test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 5 1 2
## low 5 70 17
## moderate 6 10 24
##
## Overall Statistics
##
## Accuracy : 0.7071
## 95% CI : (0.6243, 0.7809)
## No Information Rate : 0.5786
## P-Value [Acc > NIR] : 0.001151
##
## Kappa : 0.4427
## Mcnemar's Test P-Value : 0.090396
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.31250 0.8642 0.5581
## Specificity 0.97581 0.6271 0.8351
## Pos Pred Value 0.62500 0.7609 0.6000
## Neg Pred Value 0.91667 0.7708 0.8100
## Prevalence 0.11429 0.5786 0.3071
## Detection Rate 0.03571 0.5000 0.1714
## Detection Prevalence 0.05714 0.6571 0.2857
## Balanced Accuracy 0.64415 0.7457 0.6966
## Recursive Partitioning and Regression Trees - RPART
# Single classification tree (method = "class") on all predictors in `train`.
m2 <- rpart(absentgroup ~ ., data = train, method = "class")
# Predicted group for each row of `test`, then accuracy and kappa against
# test.group.
m2.pred <- predict(m2, newdata = test, type = "class")
print(postResample(obs = test.group, pred = m2.pred))
## Accuracy Kappa
## 0.6785714 0.3765463
# Confusion matrix plus overall and per-class statistics for the rpart tree.
confusionMatrix(data = m2.pred, reference = test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 4 0 1
## low 6 69 20
## moderate 6 12 22
##
## Overall Statistics
##
## Accuracy : 0.6786
## 95% CI : (0.5945, 0.7549)
## No Information Rate : 0.5786
## P-Value [Acc > NIR] : 0.009715
##
## Kappa : 0.3765
## Mcnemar's Test P-Value : 0.009005
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.25000 0.8519 0.5116
## Specificity 0.99194 0.5593 0.8144
## Pos Pred Value 0.80000 0.7263 0.5500
## Neg Pred Value 0.91111 0.7333 0.7900
## Prevalence 0.11429 0.5786 0.3071
## Detection Rate 0.02857 0.4929 0.1571
## Detection Prevalence 0.03571 0.6786 0.2857
## Balanced Accuracy 0.62097 0.7056 0.6630
# Base-graphics drawing of the rpart tree, with text labels added on top.
plot(m2)
text(m2, cex = 0.8, pretty = 0)

# Second rendering of the same tree via prp(). NOTE(review): varlen = 4
# presumably abbreviates variable names and extra = 2 presumably adds per-node
# classification counts -- confirm against the prp() documentation.
prp(m2, extra = 2, varlen = 4)

## Support Vector Machine
# Support vector classifier with the linear ("vanilladot") kernel, trained on
# all predictors in `train`; printing the object shows the cost parameter,
# the number of support vectors and the training error.
absent_classifier <- ksvm(
  absentgroup ~ .,
  data = train,
  kernel = "vanilladot"
)
## Setting default kernel parameters
absent_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 307
##
## Objective Function Value : -77.7838 -88.6522 -159.4792
## Training error : 0.208633
# Evaluate the SVM: predict the group of every row of `test` and cross-tabulate
# the predictions (rows) against the true groups in test.group (columns).
absent_predictions <- predict(absent_classifier, newdata = test)
table(absent_predictions, test.group)
## test.group
## absent_predictions high low moderate
## high 2 1 0
## low 1 67 15
## moderate 13 13 28
# Agreement tally for the SVM: TRUE where the predicted group equals the true
# group, FALSE otherwise (a hit/miss count, not a full confusion matrix).
agreement <- (absent_predictions == test.group)
table(agreement)
## agreement
## FALSE TRUE
## 43 97
# Overall accuracy and kappa of the SVM predictions against test.group.
print(postResample(obs = test.group, pred = absent_predictions))
## Accuracy Kappa
## 0.6928571 0.4270486
############## Random Forest is our Best Performer ##################
############### Final Prediction on entire data set ##################
# Stack the training and test rows and score all of them with rf.fit5.
# NOTE(review): rf.fit5 was fit on `train`, so most of these rows were seen
# during training; metrics computed on finalData are therefore optimistic
# compared with the test-only results reported for the same model.
finalData <- rbind(train, test)
final_fit <- predict(rf.fit5, newdata = finalData, type = "class")
# Number of rows predicted into each group.
summary(final_fit)
## high low moderate
## 48 430 218
# table(final_fit, modeldata$absentgroup)
# Accuracy and kappa over the combined train + test rows.
print(postResample(obs = finalData$absentgroup, pred = final_fit))
## Accuracy Kappa
## 0.9324713 0.8717146
# Confusion matrix plus overall and per-class statistics over the combined
# train + test rows.
confusionMatrix(data = final_fit, reference = finalData$absentgroup)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 47 0 1
## low 5 406 19
## moderate 11 11 196
##
## Overall Statistics
##
## Accuracy : 0.9325
## 95% CI : (0.9112, 0.95)
## No Information Rate : 0.5991
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.8717
## Mcnemar's Test P-Value : 0.001458
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.74603 0.9736 0.9074
## Specificity 0.99842 0.9140 0.9542
## Pos Pred Value 0.97917 0.9442 0.8991
## Neg Pred Value 0.97531 0.9586 0.9582
## Prevalence 0.09052 0.5991 0.3103
## Detection Rate 0.06753 0.5833 0.2816
## Detection Prevalence 0.06897 0.6178 0.3132
## Balanced Accuracy 0.87223 0.9438 0.9308
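### Best performer is Random Forest with mtry = 5 (5 variables tried at each split). On the entire data set, the random forest's prediction accuracy is 93% and kappa is 0.87, which would indicate almost perfect agreement in predicting the absenteeism group. However, 556 of those 696 rows are the training data the model was fit on, so this figure is optimistic; on the 140 held-out test rows the same model reaches 70.7% accuracy (kappa 0.44), which is the more realistic estimate of its predictive performance.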
################################## End of Project: Absenteeism at Work #####################################