## Libraries required for this project
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(corrplot)
## corrplot 0.84 loaded
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
library(cluster)
library(purrr)
## Warning: package 'purrr' was built under R version 3.6.3
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.3
library(gridExtra)
library(tree)
## Warning: package 'tree' was built under R version 3.6.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 3.6.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(C50)
## Warning: package 'C50' was built under R version 3.6.3
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.3
## Read the input file Absenteeism_at_work_train.csv from the working directory.
absentdata <- read.csv('Absenteeism_at_work_train.csv')
summary(absentdata)
## ID Reason.for.absence Month.of.absence Day.of.the.week
## Min. : 1.00 Min. : 0.00 Min. : 1.000 Min. :2.000
## 1st Qu.: 7.00 1st Qu.:13.00 1st Qu.: 3.000 1st Qu.:3.000
## Median :18.00 Median :23.00 Median : 7.000 Median :4.000
## Mean :17.67 Mean :19.47 Mean : 6.441 Mean :3.893
## 3rd Qu.:28.00 3rd Qu.:26.00 3rd Qu.:10.000 3rd Qu.:5.000
## Max. :36.00 Max. :28.00 Max. :12.000 Max. :6.000
##
## Seasons Transportation.expense Distance.from.Residence.to.Work
## Min. :1.000 Min. : 0.0 Min. : 5.00
## 1st Qu.:2.000 1st Qu.:179.0 1st Qu.:17.00
## Median :2.000 Median :225.0 Median :26.00
## Mean :2.553 Mean :222.8 Mean :30.37
## 3rd Qu.:4.000 3rd Qu.:260.0 3rd Qu.:50.00
## Max. :4.000 Max. :388.0 Max. :52.00
##
## Service.time Age Work.load.Average.day Hit.target
## Min. : 1.0 38 :112 222,196: 35 Min. : 81.00
## 1st Qu.: 9.0 28 :109 264,249: 33 1st Qu.: 92.00
## Median :13.0 37 : 67 343,253: 29 Median : 95.00
## Mean :12.7 40 : 50 265,017: 28 Mean : 94.41
## 3rd Qu.:16.0 33 : 48 284,853: 25 3rd Qu.: 97.00
## Max. :29.0 36 : 47 308,593: 24 Max. :100.00
## (Other):233 (Other):492 NA's :1
## Disciplinary.failure Education Son Social.drinker
## Min. :0.00000 Min. :1.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.00000 Median :1.000 Median :1.000 Median :1.0000
## Mean :0.05405 Mean :1.246 Mean :1.029 Mean :0.5841
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.00000 Max. :4.000 Max. :4.000 Max. :1.0000
##
## Social.smoker Pet Weight Height
## Min. :0.00000 Min. :0.0000 Min. : 56.00 Min. :163.0
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 69.00 1st Qu.:169.0
## Median :0.00000 Median :0.0000 Median : 83.00 Median :170.0
## Mean :0.06907 Mean :0.6907 Mean : 79.21 Mean :171.9
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.: 89.00 3rd Qu.:172.0
## Max. :1.00000 Max. :8.0000 Max. :108.00 Max. :196.0
## NA's :2
## Body.mass.index Absenteeism.time.in.hours
## Min. :19.00 Min. : 0.000
## 1st Qu.:24.00 1st Qu.: 2.000
## Median :25.00 Median : 3.000
## Mean :26.82 Mean : 6.752
## 3rd Qu.:31.00 3rd Qu.: 8.000
## Max. :38.00 Max. :120.000
##
str(absentdata)
## 'data.frame': 666 obs. of 21 variables:
## $ ID : int 11 36 3 7 11 3 10 20 14 1 ...
## $ Reason.for.absence : int 26 0 23 7 23 23 22 23 19 22 ...
## $ Month.of.absence : int 7 7 7 7 7 7 7 7 7 7 ...
## $ Day.of.the.week : int 3 3 4 5 5 6 6 6 2 2 ...
## $ Seasons : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Transportation.expense : int 289 118 179 279 289 179 361 260 155 235 ...
## $ Distance.from.Residence.to.Work: int 36 13 51 5 36 51 52 50 12 11 ...
## $ Service.time : int 13 18 18 14 13 18 3 11 14 14 ...
## $ Age : Factor w/ 23 levels "0","27","28",..: 8 21 12 13 8 12 3 10 9 11 ...
## $ Work.load.Average.day : Factor w/ 36 levels "0","12","205,917",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ Hit.target : int 97 97 97 97 97 97 97 97 97 97 ...
## $ Disciplinary.failure : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Education : int 1 1 1 1 1 1 1 1 1 3 ...
## $ Son : int 2 1 0 2 2 0 1 4 2 1 ...
## $ Social.drinker : int 1 1 1 1 1 1 1 1 1 0 ...
## $ Social.smoker : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Pet : int 1 0 0 0 1 0 4 0 0 1 ...
## $ Weight : int 90 98 89 68 90 89 80 65 95 88 ...
## $ Height : int 172 178 170 168 172 170 172 168 196 172 ...
## $ Body.mass.index : int 30 31 31 24 30 31 27 23 25 29 ...
## $ Absenteeism.time.in.hours : int 4 0 2 4 2 2 8 4 40 8 ...
## Observations in which the reason code is greater than zero but the absenteeism time is zero, plus
## observations in which both the reason code and the absenteeism time are zero.
a <- subset(absentdata, Absenteeism.time.in.hours <= 0 & Reason.for.absence > 0, c(ID, Reason.for.absence, Absenteeism.time.in.hours))
b <- subset(absentdata, Absenteeism.time.in.hours <= 0 & Reason.for.absence == 0, c(ID, Reason.for.absence, Absenteeism.time.in.hours))
as.matrix(rbind(a,b))
## ID Reason.for.absence Absenteeism.time.in.hours
## 135 34 27 0
## 2 36 0 0
## 51 20 0 0
## 52 29 0 0
## 55 11 0 0
## 56 36 0 0
## 59 13 0 0
## 65 36 0 0
## 204 2 0 0
## 214 7 0 0
## 215 18 0 0
## 216 23 0 0
## 217 31 0 0
## 252 20 0 0
## 274 5 0 0
## 277 8 0 0
## 278 19 0 0
## 286 5 0 0
## 294 36 0 0
## 295 33 0 0
## 301 5 0 0
## 304 5 0 0
## 312 20 0 0
## 313 15 0 0
## 314 30 0 0
## 326 18 0 0
## 337 23 0 0
## 338 7 0 0
## 401 13 0 0
## 406 1 0 0
## 407 24 0 0
## 408 36 0 0
## 447 3 0 0
## 531 28 0 0
## 549 15 0 0
## 550 11 0 0
## 552 5 0 0
## Reason code 27 is the only case with a non-zero reason code but zero absenteeism time. We remove that observation.
absentdata = absentdata[!(absentdata$Absenteeism.time.in.hours==0 & absentdata$Reason.for.absence > 0) ,]
## At this point we have 665 observations and 21 attributes.
dim(absentdata)
## [1] 665 21
summary(absentdata)
## ID Reason.for.absence Month.of.absence Day.of.the.week
## Min. : 1.00 Min. : 0.00 Min. : 1.00 Min. :2.000
## 1st Qu.: 7.00 1st Qu.:13.00 1st Qu.: 3.00 1st Qu.:3.000
## Median :18.00 Median :23.00 Median : 7.00 Median :4.000
## Mean :17.65 Mean :19.46 Mean : 6.45 Mean :3.896
## 3rd Qu.:28.00 3rd Qu.:26.00 3rd Qu.:10.00 3rd Qu.:5.000
## Max. :36.00 Max. :28.00 Max. :12.00 Max. :6.000
##
## Seasons Transportation.expense Distance.from.Residence.to.Work
## Min. :1.000 Min. : 0 Min. : 5.0
## 1st Qu.:2.000 1st Qu.:179 1st Qu.:17.0
## Median :2.000 Median :225 Median :26.0
## Mean :2.553 Mean :223 Mean :30.4
## 3rd Qu.:4.000 3rd Qu.:260 3rd Qu.:50.0
## Max. :4.000 Max. :388 Max. :52.0
##
## Service.time Age Work.load.Average.day Hit.target
## Min. : 1.0 38 :112 222,196: 35 Min. : 81.00
## 1st Qu.: 9.0 28 :109 264,249: 33 1st Qu.: 92.00
## Median :13.0 37 : 66 343,253: 29 Median : 95.00
## Mean :12.7 40 : 50 265,017: 28 Mean : 94.41
## 3rd Qu.:16.0 33 : 48 284,853: 25 3rd Qu.: 97.00
## Max. :29.0 36 : 47 268,519: 23 Max. :100.00
## (Other):233 (Other):492 NA's :1
## Disciplinary.failure Education Son Social.drinker
## Min. :0.00000 Min. :1.000 Min. :0.00 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.00 1st Qu.:0.000
## Median :0.00000 Median :1.000 Median :1.00 Median :1.000
## Mean :0.05414 Mean :1.247 Mean :1.03 Mean :0.585
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:2.00 3rd Qu.:1.000
## Max. :1.00000 Max. :4.000 Max. :4.00 Max. :1.000
##
## Social.smoker Pet Weight Height
## Min. :0.00000 Min. :0.0000 Min. : 56.00 Min. :163.0
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 69.00 1st Qu.:169.0
## Median :0.00000 Median :0.0000 Median : 83.00 Median :170.0
## Mean :0.06917 Mean :0.6917 Mean : 79.21 Mean :171.9
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.: 89.00 3rd Qu.:172.0
## Max. :1.00000 Max. :8.0000 Max. :108.00 Max. :196.0
## NA's :2
## Body.mass.index Absenteeism.time.in.hours
## Min. :19.00 Min. : 0.000
## 1st Qu.:24.00 1st Qu.: 2.000
## Median :25.00 Median : 3.000
## Mean :26.81 Mean : 6.762
## 3rd Qu.:31.00 3rd Qu.: 8.000
## Max. :38.00 Max. :120.000
##
## Disciplinary.failure is treated as noise: it is almost entirely zeros (about 5% ones), so it carries little information.
range(absentdata$Disciplinary.failure)
## [1] 0 1
## Remove the Disciplinary.failure attribute.
absentdata <- absentdata[,-12]
str(absentdata)
## 'data.frame': 665 obs. of 20 variables:
## $ ID : int 11 36 3 7 11 3 10 20 14 1 ...
## $ Reason.for.absence : int 26 0 23 7 23 23 22 23 19 22 ...
## $ Month.of.absence : int 7 7 7 7 7 7 7 7 7 7 ...
## $ Day.of.the.week : int 3 3 4 5 5 6 6 6 2 2 ...
## $ Seasons : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Transportation.expense : int 289 118 179 279 289 179 361 260 155 235 ...
## $ Distance.from.Residence.to.Work: int 36 13 51 5 36 51 52 50 12 11 ...
## $ Service.time : int 13 18 18 14 13 18 3 11 14 14 ...
## $ Age : Factor w/ 23 levels "0","27","28",..: 8 21 12 13 8 12 3 10 9 11 ...
## $ Work.load.Average.day : Factor w/ 36 levels "0","12","205,917",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ Hit.target : int 97 97 97 97 97 97 97 97 97 97 ...
## $ Education : int 1 1 1 1 1 1 1 1 1 3 ...
## $ Son : int 2 1 0 2 2 0 1 4 2 1 ...
## $ Social.drinker : int 1 1 1 1 1 1 1 1 1 0 ...
## $ Social.smoker : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Pet : int 1 0 0 0 1 0 4 0 0 1 ...
## $ Weight : int 90 98 89 68 90 89 80 65 95 88 ...
## $ Height : int 172 178 170 168 172 170 172 168 196 172 ...
## $ Body.mass.index : int 30 31 31 24 30 31 27 23 25 29 ...
## $ Absenteeism.time.in.hours : int 4 0 2 4 2 2 8 4 40 8 ...
## Now we have 665 observations and 20 attributes.
## Missing value analysis
as.matrix(colSums(is.na(absentdata)))
## [,1]
## ID 0
## Reason.for.absence 0
## Month.of.absence 0
## Day.of.the.week 0
## Seasons 0
## Transportation.expense 0
## Distance.from.Residence.to.Work 0
## Service.time 0
## Age 0
## Work.load.Average.day 0
## Hit.target 1
## Education 0
## Son 0
## Social.drinker 0
## Social.smoker 0
## Pet 0
## Weight 2
## Height 0
## Body.mass.index 0
## Absenteeism.time.in.hours 0
# There are a few missing values: one in Hit.target and two in Weight.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Exclude the missing observations
absentdata <- absentdata %>%
  na.omit()
dim(absentdata)
## [1] 662 20
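## If dropping rows were not desirable, median imputation would be an alternative. A minimal sketch (the
## affected rows have already been removed above, so running this here is a no-op):
absentdata$Hit.target[is.na(absentdata$Hit.target)] <- median(absentdata$Hit.target, na.rm = TRUE)
absentdata$Weight[is.na(absentdata$Weight)] <- median(absentdata$Weight, na.rm = TRUE)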
## Box plot of Absenteeism.time.in.hours by Reason.for.absence, to check for outliers within each reason code.
ggplot(absentdata,
       aes(y = Absenteeism.time.in.hours, x = as.factor(Reason.for.absence))) +
  geom_boxplot() +
  xlab('Reason.for.absence') +
  ylab('Absenteeism.time.in.hours')
## Let's look at the distributions of the continuous variables.
## Absenteeism time is highly right-skewed due to the presence of outliers.
hist(absentdata$Absenteeism.time.in.hours, breaks = 40, #prob = TRUE,
xlab = 'Absenteeism time in hours', main = " Absenteeism time Distribution", col = "grey")
boxplot(absentdata$Absenteeism.time.in.hours, main = "Box plot of Absenteeism time in hours")
#Outlier Analysis
#boxplot for Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target
boxplot(absentdata[,c('Transportation.expense','Distance.from.Residence.to.Work', 'Service.time', 'Age','Hit.target')], varwidth = T,
col = "dark grey")
#boxplot for Weight,Height,Body.mass.index,Absenteeism.time.in.hours
boxplot(absentdata[,c('Weight', 'Height', 'Body.mass.index','Absenteeism.time.in.hours')], col = "grey", varwidth = T)
#boxplot for Work.load.Average.day
# Note: Work.load.Average.day was read in as a factor, so as.numeric() yields its integer level codes
absentdata$Work.load.Average.day <- as.numeric(absentdata$Work.load.Average.day)
boxplot(absentdata[,c('Work.load.Average.day')], col = "grey")
## We do not delete the outliers; instead we cap them at the boxplot whisker limits.
## Capping outliers - values beyond Q1 - 1.5*IQR or Q3 + 1.5*IQR are replaced with those limits.
# Note: Age is also a factor at this point, so as.numeric() returns its integer level codes rather than the original ages
absentdata$Transportation.expense <- as.numeric(absentdata$Transportation.expense)
absentdata$Service.time <- as.numeric(absentdata$Service.time)
absentdata$Age <- as.numeric(absentdata$Age)
absentdata$Hit.target <- as.numeric(absentdata$Hit.target)
absentdata$Height <- as.numeric(absentdata$Height)
absentdata$Absenteeism.time.in.hours <- as.numeric(absentdata$Absenteeism.time.in.hours)
for (i in c('Transportation.expense','Service.time','Age','Work.load.Average.day','Hit.target','Height','Absenteeism.time.in.hours')){
q = quantile(absentdata[,i],c(0.25,0.75))
iqr1 = q[2]-q[1]
min1 = q[1]-1.5*iqr1
max1 = q[2]+1.5*iqr1
absentdata[,i][absentdata[,i]<min1] = min1
absentdata[,i][absentdata[,i]>max1] = max1
}
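## A quick sanity check (a minimal sketch reusing the same quantile logic): after capping, the treated
## columns should contain no values beyond the Q1 - 1.5*IQR / Q3 + 1.5*IQR whiskers (all entries TRUE).
capped_cols <- c('Transportation.expense','Service.time','Age','Work.load.Average.day',
                 'Hit.target','Height','Absenteeism.time.in.hours')
sapply(capped_cols, function(col) {
  q <- quantile(absentdata[, col], c(0.25, 0.75))
  iqr1 <- q[2] - q[1]
  all(absentdata[, col] >= q[1] - 1.5 * iqr1 & absentdata[, col] <= q[2] + 1.5 * iqr1)
})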
## Reason for absence vs absenteeism time after outlier capping
ggplot(absentdata,
       aes(y = Absenteeism.time.in.hours, x = as.factor(Reason.for.absence))) +
  geom_boxplot() +
  xlab('Reason.for.absence') +
  ylab('Absenteeism.time.in.hours')
#boxplot for Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target
boxplot(absentdata[,c('Transportation.expense','Distance.from.Residence.to.Work', 'Service.time', 'Age','Hit.target')], varwidth = T,
col = "dark grey")
#boxplot for Weight,Height,Body.mass.index,Absenteeism.time.in.hours
boxplot(absentdata[,c('Weight', 'Height', 'Body.mass.index','Absenteeism.time.in.hours')], col = "grey", varwidth = T)
#boxplot for Work.load.Average.day
boxplot(absentdata[,c('Work.load.Average.day')], col = "grey")
## Data independence and multicollinearity checks.
## First, the categorical variables.
categorical_var = c("Reason.for.absence","Month.of.absence","Day.of.the.week",
"Seasons", "Education", "Social.drinker",
"Social.smoker", "Son", "Pet")
## Transform categorical variables into factors.
absentdata[,categorical_var ] <- lapply(absentdata[,categorical_var], factor)
#str(absentdata)
# Chi-square test for relationship between attributes.
pvalue = c()
#Calculating and storing the p-values from the chi-square tests in the vector pvalue
for(i in categorical_var){
for(j in categorical_var){
chi2 = chisq.test(absentdata[,i],absentdata[,j]) #, simulate.p.value = T)
pvalue = c(pvalue,chi2$p.value)
}
}
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## (The same warning is raised for many attribute pairs because some contingency-table cells are sparse.)
length(pvalue)
## [1] 81
m1 <- matrix(pvalue, ncol = 9)
df <- data.frame(m1)
row.names(df) <- categorical_var
colnames(df) <- categorical_var
print(df)
## Reason.for.absence Month.of.absence Day.of.the.week
## Reason.for.absence 0.000000e+00 5.030070e-16 6.576851e-02
## Month.of.absence 5.030070e-16 0.000000e+00 3.455722e-01
## Day.of.the.week 6.576851e-02 3.455722e-01 0.000000e+00
## Seasons 5.214522e-21 1.245723e-317 1.656180e-01
## Education 6.534526e-09 3.901758e-01 5.162065e-01
## Social.drinker 5.270888e-08 3.251838e-03 2.821174e-01
## Social.smoker 5.803942e-09 2.075713e-02 7.591291e-01
## Son 4.854183e-18 1.186792e-04 6.611599e-08
## Pet 5.427211e-16 5.725378e-04 4.217188e-01
## Seasons Education Social.drinker Social.smoker
## Reason.for.absence 5.214522e-21 6.534526e-09 5.270888e-08 5.803942e-09
## Month.of.absence 1.245723e-317 3.901758e-01 3.251838e-03 2.075713e-02
## Day.of.the.week 1.656180e-01 5.162065e-01 2.821174e-01 7.591291e-01
## Seasons 0.000000e+00 5.460428e-01 2.494963e-02 1.559893e-01
## Education 5.460428e-01 0.000000e+00 6.103574e-32 2.400489e-20
## Social.drinker 2.494963e-02 6.103574e-32 4.319588e-145 8.081862e-03
## Social.smoker 1.559893e-01 2.400489e-20 8.081862e-03 1.211298e-142
## Son 6.721641e-06 1.778511e-07 5.257573e-09 1.017876e-17
## Pet 4.104513e-04 3.628847e-14 7.886952e-29 5.794183e-17
## Son Pet
## Reason.for.absence 4.854183e-18 5.427211e-16
## Month.of.absence 1.186792e-04 5.725378e-04
## Day.of.the.week 6.611599e-08 4.217188e-01
## Seasons 6.721641e-06 4.104513e-04
## Education 1.778511e-07 3.628847e-14
## Social.drinker 5.257573e-09 7.886952e-29
## Social.smoker 1.017876e-17 5.794183e-17
## Son 0.000000e+00 5.764606e-85
## Pet 5.764606e-85 0.000000e+00
## According to the chi-square tests, every categorical variable except Day.of.the.week is significantly associated with Reason.for.absence (p-values well below 0.05). We therefore drop those correlated categorical variables and keep only Reason.for.absence and Day.of.the.week.
absentdata <- absentdata[, -c(3, 5, 12,13,14, 15, 16)]
## Correlation matrix for the continuous attributes
m <- cor(absentdata[,4:13])
corrplot(m, order = "hclust", tl.srt = 30, tl.col = "black", addrect = 3, method = "number" )
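## As a programmatic cross-check, caret's findCorrelation() can flag predictor pairs with high pairwise
## correlation (a minimal sketch; the 0.75 cutoff is an illustrative choice):
high_corr <- findCorrelation(m, cutoff = 0.75)  # column indices of m with problematic correlations
colnames(m)[high_corr]                          # expected to point at the Weight / Body.mass.index pair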
## The correlations between Absenteeism.time.in.hours and the predictors are all below 0.1, but Weight and Body.mass.index are highly collinear, so Weight is removed from the data frame.
absentdata = absentdata[,-10]
## After data pre-processing we are left with 662 observations and 12 variables, including the target variable.
## Test for linearity in the data
pairs(absentdata[, -c(1:3)])
## Data is not linear. So, linear models will not be a good choice for this data.
######################## End of Data Preprocessing #########################
# Aggregating Absenteeism.time.in.hours by Reason.for.absence
Reasons = aggregate(absentdata$Absenteeism.time.in.hours, by=list(Category=absentdata$Reason.for.absence), FUN=sum)
#print(as.data.frame(Reasons))
Reasons$Absence = (Reasons$x/sum(absentdata$Absenteeism.time.in.hours))*100
Reasons = Reasons[order(Reasons$Absence, decreasing = T),]
#print(Reasons)
barplot(Reasons$Absence, names.arg = Reasons$Category, xlab = "Reason for absence", ylab = "Absence (%)", col = "dark grey",
        main = "Share of total absenteeism by reason code")
## Taking a backup of the pre-processed data
#write.csv(modeldata, "modeldata.csv", row.names = F)
Model building using Machine Learning Algorithms.
### We will see how many groups there are in the data set by means of k-means clustering.
modeldata = absentdata[,-c(1,2,3)]
df = scale(modeldata)
## NbClust method
## wssplot suggests a value of K via the elbow method, using the within-cluster sum of squares.
wssplot <- function(data, nc = 20, seed = 1234) {
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(1:nc, wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
}
wssplot(df)
set.seed(1234)
nc <- NbClust(df, min.nc = 3, max.nc = 20, method = "kmeans" )
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 7 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 5 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 1 proposed 17 as the best number of clusters
## * 1 proposed 18 as the best number of clusters
## * 1 proposed 19 as the best number of clusters
## * 3 proposed 20 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 7 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 4 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 2 proposed 8 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 2 proposed 17 as the best number of clusters
## * 1 proposed 19 as the best number of clusters
## * 2 proposed 20 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
barplot(table(nc$Best.nc[1,]))
## According to the NbClust method, 3 is the optimal value of K.
set.seed(1234)
### Elbow method - K = 3 optimal value
fviz_nbclust(df, kmeans, method = "wss")
## Average Silhouette Method - k = 9 optimal value
set.seed(1234)
fviz_nbclust(df, kmeans, method = "silhouette")
## Comparison of k-values
set.seed(1234)
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k7 <- kmeans(df, centers = 7, nstart = 25)
k9 <- kmeans(df, centers = 9, nstart = 25)
# plots to compare
p1 = fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p2 = fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p3 = fviz_cluster(k7, geom = "point", data = df) + ggtitle("k = 7")
p4 = fviz_cluster(k9, geom = "point", data = df) + ggtitle("k = 9")
grid.arrange(p1, p2, p3, p4, nrow = 2)
# Compute k-means clustering with k = 3
set.seed(1234)
final <- kmeans(df, 3, nstart = 25)
final$center
## Transportation.expense Distance.from.Residence.to.Work Service.time
## 1 0.5826762 0.1788144 -0.7447137
## 2 -0.3515032 0.8551632 0.9905166
## 3 -0.6607796 -1.0473691 0.3698462
## Age Work.load.Average.day Hit.target Height Body.mass.index
## 1 -0.7571524 0.07953133 -0.04578003 -0.05033513 -0.5905792
## 2 0.6596816 -0.25491756 0.01288722 -0.69154040 1.2463275
## 3 0.6806771 0.09129538 0.06483467 0.69019707 -0.1108426
## Absenteeism.time.in.hours
## 1 0.06533443
## 2 -0.24888579
## 3 0.10961637
fviz_cluster(final, data = df)
#print(final)
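## To gauge how well separated the three clusters are, we can look at the average silhouette width
## (a minimal sketch using the already-loaded cluster package):
sil <- silhouette(final$cluster, dist(df))  # silhouette width per observation
mean(sil[, "sil_width"])                    # near 1 = well separated, near 0 = overlapping clusters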
## Build the models in a supervised-learning fashion.
## The response variable is Absenteeism.time.in.hours. We will create 3 categories and build models to predict the class.
modeldata <- absentdata
temp_table = table(as.factor(modeldata$Absenteeism.time.in.hours))
barplot(temp_table, xlab = "Absenteeism in Hours", ylab = "Frequency", main = "Absenteeism frequency")
## The most common absence duration is 8 hours, i.e. full-day absenteeism is the most frequent pattern.
## According to the k-means clustering, the data comprise 3 groups. So we create three class labels for Absenteeism.time.in.hours: low when absenteeism is 1 to 4 hours, moderate when 5 to 8 hours, and high when greater than 8 hours (the remaining zero-hour observations, all with reason code 0, also fall into the high group).
## Two-level version (not used further; superseded by the three-level labels created in the loop below).
absentgroup <- ifelse((modeldata$Absenteeism.time.in.hours >= 1 & modeldata$Absenteeism.time.in.hours <= 4), "low", "high")
tempdata <- as.integer(as.character(modeldata$Absenteeism.time.in.hours))
for (i in 1:length(tempdata)) {
  if (tempdata[i] >= 1 & tempdata[i] <= 4) {
    modeldata$absentgroup[i] = "low"
  } else if (tempdata[i] > 4 & tempdata[i] <= 8) {
    modeldata$absentgroup[i] = "moderate"
  } else {
    modeldata$absentgroup[i] = "high"
  }
}
table(modeldata$absentgroup)
##
## high low moderate
## 93 378 191
modeldata$absentgroup = factor(modeldata$absentgroup)
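## The labeling loop above can also be written in vectorized form; a minimal sketch with nested ifelse()
## that should reproduce the same three labels:
absent_vec <- ifelse(tempdata >= 1 & tempdata <= 4, "low",
                     ifelse(tempdata > 4 & tempdata <= 8, "moderate", "high"))
table(absent_vec)  # should match the table of modeldata$absentgroup above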
## We use the validation-set approach for resampling: 80% of the observations for training and 20% for testing.
## Remove the Absenteeism.time.in.hours and ID attributes.
modeldata = modeldata[, -12]
modeldata = modeldata[,-1]
#smp_size <- floor(0.75 * nrow(modeldata))
## set the seed to make partition reproducible
set.seed(1234)
train_index = sample(1:nrow(modeldata), 0.8*nrow(modeldata))
train = modeldata[train_index,]
test = modeldata[-train_index,]
test.group <- test$absentgroup
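## Note that sample() ignores class balance. If a stratified split were preferred, caret's
## createDataPartition() keeps the class proportions; a minimal sketch (not used for the models below):
set.seed(1234)
strat_index <- createDataPartition(modeldata$absentgroup, p = 0.8, list = FALSE)
strat_train <- modeldata[strat_index, ]
strat_test  <- modeldata[-strat_index, ]
prop.table(table(strat_train$absentgroup))  # class shares stay close to those of the full data set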
## First model - a simple classification tree built with the tree() function from the "tree" package.
model_tree <- tree(absentgroup ~ . , data = train)
summary(model_tree)
##
## Classification tree:
## tree(formula = absentgroup ~ ., data = train)
## Variables actually used in tree construction:
## [1] "Reason.for.absence" "Transportation.expense" "Height"
## [4] "Day.of.the.week"
## Number of terminal nodes: 8
## Residual mean deviance: 0.9707 = 505.7 / 521
## Misclassification error rate: 0.172 = 91 / 529
plot(model_tree)
text(model_tree, pretty = 0, cex = 0.8)
model_tree_pred = predict(model_tree, test, type = "class")
#conf_matrix = table(model_tree_pred, test.group)
#model_tree_acu = sum(diag(conf_matrix))/sum(conf_matrix)
print(postResample(pred = model_tree_pred, obs = test.group))
## Accuracy Kappa
## 0.7067669 0.4438130
confusionMatrix(model_tree_pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 4 0 0
## low 9 67 14
## moderate 7 9 23
##
## Overall Statistics
##
## Accuracy : 0.7068
## 95% CI : (0.6216, 0.7825)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.0009020
##
## Kappa : 0.4438
##
## Mcnemar's Test P-Value : 0.0006782
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.20000 0.8816 0.6216
## Specificity 1.00000 0.5965 0.8333
## Pos Pred Value 1.00000 0.7444 0.5897
## Neg Pred Value 0.87597 0.7907 0.8511
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.03008 0.5038 0.1729
## Detection Prevalence 0.03008 0.6767 0.2932
## Balanced Accuracy 0.60000 0.7390 0.7275
## Random forest
set.seed(1234)
# default mtry = 3; OOB error rate 22.5%
rf.fit = randomForest(absentgroup~., data = train, importance = TRUE)
rf.fit
##
## Call:
## randomForest(formula = absentgroup ~ ., data = train, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 22.5%
## Confusion matrix:
## high low moderate class.error
## high 41 13 19 0.4383562
## low 5 268 29 0.1125828
## moderate 9 44 101 0.3441558
#rf.fit.pred <- predict(rf.fit, test, type = "class")
# Fine-tuning the random forest: mtry = 6; OOB error rate 23.44%
rf.fit1 <- randomForest(absentgroup ~ ., data = train, ntree = 500, mtry = 6, importance = TRUE)
rf.fit1
##
## Call:
## randomForest(formula = absentgroup ~ ., data = train, ntree = 500, mtry = 6, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 23.44%
## Confusion matrix:
## high low moderate class.error
## high 40 11 22 0.4520548
## low 10 257 35 0.1490066
## moderate 10 36 108 0.2987013
rf.fit1.pred <- predict(rf.fit1, test, type = "class")
# Checking classification accuracy
print(postResample(pred = rf.fit1.pred, obs = test.group))
## Accuracy Kappa
## 0.7218045 0.5052780
confusionMatrix(rf.fit1.pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 8 1 4
## low 5 64 9
## moderate 7 11 24
##
## Overall Statistics
##
## Accuracy : 0.7218
## 95% CI : (0.6375, 0.796)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.0002423
##
## Kappa : 0.5053
##
## Mcnemar's Test P-Value : 0.2975673
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.40000 0.8421 0.6486
## Specificity 0.95575 0.7544 0.8125
## Pos Pred Value 0.61538 0.8205 0.5714
## Neg Pred Value 0.90000 0.7818 0.8571
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.06015 0.4812 0.1805
## Detection Prevalence 0.09774 0.5865 0.3158
## Balanced Accuracy 0.67788 0.7982 0.7306
importance(rf.fit1)
## high low moderate
## Reason.for.absence 70.5375810 101.313533 63.2393467
## Day.of.the.week 1.7246321 -4.088889 -3.5912094
## Transportation.expense 7.8281458 21.860152 13.1987024
## Distance.from.Residence.to.Work 2.8208051 8.521092 4.8244581
## Service.time 2.7315962 9.912955 3.2589291
## Age -0.1799845 10.398948 0.9287517
## Work.load.Average.day 1.1818111 4.621594 0.4533569
## Hit.target 2.9746632 3.774605 -0.0791509
## Height -0.3565491 17.905394 3.0534402
## Body.mass.index 0.5132413 9.346885 0.5666345
## MeanDecreaseAccuracy MeanDecreaseGini
## Reason.for.absence 118.840701 155.843193
## Day.of.the.week -4.321063 18.669516
## Transportation.expense 24.861364 21.794455
## Distance.from.Residence.to.Work 10.520234 11.278804
## Service.time 10.565270 9.813166
## Age 9.330569 9.052956
## Work.load.Average.day 3.940339 26.191008
## Hit.target 3.842166 20.944596
## Height 16.480953 11.680924
## Body.mass.index 8.671397 10.107328
varImpPlot(rf.fit1)
# Using a for loop to identify the right mtry for the model (mtry from 3 to 8)
a = c()
for (i in 3:8) {
  rf.fit2 <- randomForest(absentgroup ~ ., data = train, ntree = 500, mtry = i, importance = TRUE)
  rf.fit2.pred <- predict(rf.fit2, test, type = "class")
  a[i-2] = mean(rf.fit2.pred == test.group)
}
a
## [1] 0.7293233 0.7142857 0.7142857 0.7293233 0.7293233 0.7293233
plot(3:8,a, type = "b")
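## The mtry search above relies on a single train/test split. As a sketch of an alternative, caret::train
## can tune mtry with cross-validation (slower, since it re-fits the forest several times; not used for
## the results below):
set.seed(1234)
ctrl <- trainControl(method = "cv", number = 5)
rf_cv <- train(x = train[, -11], y = train$absentgroup, method = "rf",   # x/y interface keeps factors intact
               tuneGrid = expand.grid(mtry = 3:8), trControl = ctrl)
rf_cv$bestTune  # cross-validated choice of mtry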
rf.fit5 <- randomForest(absentgroup ~ ., data = train, ntree = 500, mtry = 5, importance = TRUE)
rf.fit5.pred <- predict(rf.fit5, test, type = "class")
print(postResample(pred = rf.fit5.pred, obs = test.group))
## Accuracy Kappa
## 0.7218045 0.5052780
confusionMatrix(rf.fit5.pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 8 1 4
## low 5 64 9
## moderate 7 11 24
##
## Overall Statistics
##
## Accuracy : 0.7218
## 95% CI : (0.6375, 0.796)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.0002423
##
## Kappa : 0.5053
##
## Mcnemar's Test P-Value : 0.2975673
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.40000 0.8421 0.6486
## Specificity 0.95575 0.7544 0.8125
## Pos Pred Value 0.61538 0.8205 0.5714
## Neg Pred Value 0.90000 0.7818 0.8571
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.06015 0.4812 0.1805
## Detection Prevalence 0.09774 0.5865 0.3158
## Balanced Accuracy 0.67788 0.7982 0.7306
## Building classification tree models using Quinlan's C5.0 algorithm with 10 boosting trials
c50.fit <- C5.0(train[-11], train$absentgroup, trials = 10)
summary(c50.fit)
##
## Call:
## C5.0.default(x = train[-11], y = train$absentgroup, trials = 10)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Oct 03 13:09:54 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 529 cases (11 attributes) from undefined.data
##
## ----- Trial 0: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence = 0: high (32)
## Reason.for.absence in {16,23,25,27,28}:
## :...Transportation.expense <= 291: low (257/17)
## : Transportation.expense > 291:
## : :...Body.mass.index <= 25: moderate (8/3)
## : Body.mass.index > 25: low (5)
## Reason.for.absence in {1,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,26}:
## :...Reason.for.absence = 9: high (4/1)
## Reason.for.absence in {1,4,5,6,8,10,15,17,18,21,22,24,
## : 26}: moderate (115/22)
## Reason.for.absence = 12:
## :...Distance.from.Residence.to.Work <= 20: high (2)
## : Distance.from.Residence.to.Work > 20: low (4/1)
## Reason.for.absence = 7:
## :...Transportation.expense > 279: high (3)
## : Transportation.expense <= 279:
## : :...Work.load.Average.day <= 6: moderate (2)
## : Work.load.Average.day > 6: low (6/1)
## Reason.for.absence = 11:
## :...Height <= 169: moderate (5/1)
## : Height > 169:
## : :...Body.mass.index <= 23: high (3/1)
## : Body.mass.index > 23: low (9/2)
## Reason.for.absence = 14:
## :...Body.mass.index > 25: low (7)
## : Body.mass.index <= 25:
## : :...Day.of.the.week in {2,4}: high (2)
## : Day.of.the.week in {3,6}: moderate (2)
## : Day.of.the.week = 5: low (3/1)
## Reason.for.absence = 19:
## :...Height > 174: low (3)
## : Height <= 174:
## : :...Distance.from.Residence.to.Work <= 17: high (3/1)
## : Distance.from.Residence.to.Work > 17: moderate (15/4)
## Reason.for.absence = 13:
## :...Day.of.the.week = 3:
## :...Work.load.Average.day <= 21: moderate (4)
## : Work.load.Average.day > 21: high (3/1)
## Day.of.the.week = 4:
## :...Height <= 171: moderate (4/2)
## : Height > 171: high (6/2)
## Day.of.the.week = 5:
## :...Height <= 171: moderate (5/1)
## : Height > 171: low (4/1)
## Day.of.the.week = 6:
## :...Work.load.Average.day <= 29: moderate (3)
## : Work.load.Average.day > 29: high (2)
## Day.of.the.week = 2:
## :...Distance.from.Residence.to.Work > 27: high (2)
## Distance.from.Residence.to.Work <= 27:
## :...Transportation.expense <= 189: low (2)
## Transportation.expense > 189:
## :...Age <= 15: moderate (2)
## Age > 15: high (2/1)
##
## ----- Trial 1: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence in {4,5,6,8,15,16,17,18,21,22,23,24,25,26,27,28}:
## :...Reason.for.absence in {4,8,16,21,23,27}: low (166.2/34.6)
## : Reason.for.absence in {5,6,15,17,18,24}: moderate (26.6/7.8)
## : Reason.for.absence = 22:
## : :...Body.mass.index <= 29: moderate (14.9)
## : : Body.mass.index > 29: low (7.5/2.4)
## : Reason.for.absence = 25:
## : :...Transportation.expense <= 189: moderate (12.5/4.7)
## : : Transportation.expense > 189: low (12.5/0.8)
## : Reason.for.absence = 28:
## : :...Height <= 165: moderate (6/0.8)
## : : Height > 165: low (67.4/8.6)
## : Reason.for.absence = 26:
## : :...Work.load.Average.day > 33: high (2.6)
## : Work.load.Average.day <= 33:
## : :...Age > 17: low (5.2)
## : Age <= 17:
## : :...Height <= 171: moderate (9.4)
## : Height > 171: low (13.3/5.5)
## Reason.for.absence in {0,1,7,9,10,11,12,13,14,19}:
## :...Reason.for.absence = 0: high (25.1)
## Reason.for.absence in {1,7,9,10,11,12,13,14,19}:
## :...Transportation.expense > 248:
## :...Height <= 172: high (40.9/18.3)
## : Height > 172: moderate (9.7/2.4)
## Transportation.expense <= 248:
## :...Body.mass.index > 25:
## :...Work.load.Average.day <= 10: low (10.7/1.6)
## : Work.load.Average.day > 10:
## : :...Work.load.Average.day <= 28: moderate (22.7/9.7)
## : Work.load.Average.day > 28: low (20.1/8.3)
## Body.mass.index <= 25:
## :...Distance.from.Residence.to.Work <= 15: high (7.5)
## Distance.from.Residence.to.Work > 15:
## :...Transportation.expense > 235: moderate (9.7/4.2)
## Transportation.expense <= 235:
## :...Height <= 168: moderate (2.4)
## Height > 168:
## :...Hit.target <= 96: high (26.3/16.2)
## Hit.target > 96: low (9.9/1.6)
##
## ----- Trial 2: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence = 0: high (20.8)
## Reason.for.absence in {16,23,25,27,28}:
## :...Body.mass.index <= 19: low (13.1/3.2)
## : Body.mass.index > 19:
## : :...Transportation.expense > 189:
## : :...Hit.target > 99: moderate (3.2)
## : : Hit.target <= 99:
## : : :...Height <= 164.5: moderate (2.2)
## : : Height > 164.5: low (98.7/15.7)
## : Transportation.expense <= 189:
## : :...Transportation.expense > 184: moderate (6.5)
## : Transportation.expense <= 184:
## : :...Day.of.the.week in {5,6}: low (28.8)
## : Day.of.the.week in {2,3,4}:
## : :...Service.time > 18: moderate (3.2)
## : Service.time <= 18:
## : :...Work.load.Average.day > 29: low (10.5)
## : Work.load.Average.day <= 29:
## : :...Body.mass.index <= 29: moderate (31.8/12.5)
## : Body.mass.index > 29: low (30.6/6.5)
## Reason.for.absence in {1,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,26}:
## :...Transportation.expense > 330: moderate (23.7/2.7)
## Transportation.expense <= 330:
## :...Work.load.Average.day > 26:
## :...Day.of.the.week = 3: moderate (12.5/4.2)
## : Day.of.the.week = 6: low (10.6/4.8)
## : Day.of.the.week in {2,4,5}:
## : :...Hit.target <= 94:
## : :...Distance.from.Residence.to.Work > 42: moderate (4.2)
## : : Distance.from.Residence.to.Work <= 42:
## : : :...Hit.target <= 88: high (5.6)
## : : Hit.target > 88: moderate (18.2/6.9)
## : Hit.target > 94:
## : :...Transportation.expense <= 118: low (4.3)
## : Transportation.expense > 118:
## : :...Hit.target <= 97: high (30.1/12.7)
## : Hit.target > 97: moderate (4)
## Work.load.Average.day <= 26:
## :...Work.load.Average.day > 24: low (19.5/6.6)
## Work.load.Average.day <= 24:
## :...Work.load.Average.day > 22: moderate (12.4/5)
## Work.load.Average.day <= 22:
## :...Height > 172: low (15.8/4.7)
## Height <= 172:
## :...Distance.from.Residence.to.Work > 27: moderate (46.4/15.5)
## Distance.from.Residence.to.Work <= 27:
## :...Day.of.the.week in {3,6}: moderate (17.7/8)
## Day.of.the.week in {2,4,5}:
## :...Body.mass.index > 35: high (2.1/0.6)
## Body.mass.index <= 35:
## :...Hit.target <= 88: moderate (10.4/5.7)
## Hit.target > 88:
## :...Transportation.expense <= 189: low (15.2/0.6)
## Transportation.expense > 189:
## :...Height <= 169: low (19.4/4.1)
## Height > 169: moderate (7.5)
##
## ----- Trial 3: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence in {0,1,7,9,10,11,12,13,14,19}:
## :...Reason.for.absence in {1,7}: moderate (30/13.5)
## : Reason.for.absence in {0,9,12}: high (29.4/4.9)
## : Reason.for.absence in {11,14}: low (38.3/19.4)
## : Reason.for.absence = 10:
## : :...Transportation.expense <= 235: low (13/4.8)
## : : Transportation.expense > 235: moderate (12.9/4)
## : Reason.for.absence = 13:
## : :...Hit.target <= 99: high (59.2/35.9)
## : : Hit.target > 99: low (2.9)
## : Reason.for.absence = 19:
## : :...Height > 174: low (3.8)
## : Height <= 174:
## : :...Age <= 5: moderate (3.5)
## : Age > 5: high (24.8/10.1)
## Reason.for.absence in {4,5,6,8,15,16,17,18,21,22,23,24,25,26,27,28}:
## :...Transportation.expense > 291:
## :...Day.of.the.week in {2,3,4,6}: moderate (29.4/7.5)
## : Day.of.the.week = 5: high (5/1.1)
## Transportation.expense <= 291:
## :...Body.mass.index <= 19: low (14.4/4.5)
## Body.mass.index > 19:
## :...Reason.for.absence in {4,16,18,23,25,27,28}:
## :...Height > 167: low (176.4/37)
## : Height <= 167:
## : :...Distance.from.Residence.to.Work <= 27: low (11.5/2.9)
## : Distance.from.Residence.to.Work > 27: moderate (12.2/1.9)
## Reason.for.absence in {5,6,8,15,17,21,22,24,26}:
## :...Service.time <= 12: moderate (10.9/1.8)
## Service.time > 12:
## :...Age <= 17: moderate (47.8/16.2)
## Age > 17: low (3.5)
##
## ----- Trial 4: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence = 0: high (14.8)
## Reason.for.absence in {1,7,9,10,11,12,13,14,19}:
## :...Transportation.expense <= 235:
## : :...Day.of.the.week = 5: moderate (18.2/9)
## : : Day.of.the.week = 6: low (8.5/3.7)
## : : Day.of.the.week = 3:
## : : :...Work.load.Average.day <= 28: high (18.9/9.1)
## : : : Work.load.Average.day > 28: low (8.8/2.6)
## : : Day.of.the.week = 4:
## : : :...Distance.from.Residence.to.Work <= 11: low (3.8)
## : : : Distance.from.Residence.to.Work > 11: high (29.6/14.6)
## : : Day.of.the.week = 2:
## : : :...Height > 174: high (6/0.5)
## : : Height <= 174:
## : : :...Transportation.expense <= 189: low (18.7/5.1)
## : : Transportation.expense > 189: moderate (11.1/3.6)
## : Transportation.expense > 235:
## : :...Age > 15: low (14.7/6.9)
## : Age <= 15:
## : :...Day.of.the.week = 3: moderate (8.3/1)
## : Day.of.the.week = 5: high (12.4/6.1)
## : Day.of.the.week in {2,4,6}:
## : :...Service.time <= 11: moderate (17.1/2.4)
## : Service.time > 11:
## : :...Work.load.Average.day <= 30: high (30.9/8.4)
## : Work.load.Average.day > 30: moderate (8.8/1.2)
## Reason.for.absence in {4,5,6,8,15,16,17,18,21,22,23,24,25,26,27,28}:
## :...Reason.for.absence in {4,5,6,8,15,17,18,22,24,26}: moderate (86.3/31.6)
## Reason.for.absence in {16,21,25,27}: low (55.1/12.6)
## Reason.for.absence = 28:
## :...Body.mass.index <= 24: low (16.3)
## : Body.mass.index > 24:
## : :...Day.of.the.week in {2,5}: low (11.5)
## : Day.of.the.week in {3,4,6}:
## : :...Height <= 167: moderate (4.9)
## : Height > 167:
## : :...Work.load.Average.day <= 13: moderate (15.6/3)
## : Work.load.Average.day > 13: low (10.5)
## Reason.for.absence = 23:
## :...Body.mass.index <= 19: high (7.7/3.2)
## Body.mass.index > 19:
## :...Service.time <= 4: low (11.5/5.3)
## Service.time > 4:
## :...Hit.target > 98: moderate (14.1/3.2)
## Hit.target <= 98:
## :...Hit.target > 93: low (17.9)
## Hit.target <= 93:
## :...Body.mass.index <= 24: low (8.3)
## Body.mass.index > 24:
## :...Hit.target <= 92: low (22.8/6)
## Hit.target > 92: moderate (15.9/4)
##
## ----- Trial 5: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence = 0: high (12.5)
## Reason.for.absence in {4,7,8,12,14,16,18,21,22,23,25,26,27,28}:
## :...Reason.for.absence in {4,7,12,14,16,27}: low (69/22.2)
## : Reason.for.absence in {8,18,21,22}: moderate (54.8/23.5)
## : Reason.for.absence = 25:
## : :...Distance.from.Residence.to.Work <= 22: moderate (17.7/6.8)
## : : Distance.from.Residence.to.Work > 22: low (4.8)
## : Reason.for.absence = 28:
## : :...Height <= 165: moderate (5/0.8)
## : : Height > 165: low (48/10.5)
## : Reason.for.absence = 23:
## : :...Day.of.the.week in {2,3,4,6}:
## : : :...Service.time <= 18: low (73.8/21.1)
## : : : Service.time > 18: moderate (2.6)
## : : Day.of.the.week = 5:
## : : :...Service.time <= 10: high (13.3/5.6)
## : : Service.time > 10: low (11.7/5.7)
## : Reason.for.absence = 26:
## : :...Work.load.Average.day > 33: high (2.8)
## : Work.load.Average.day <= 33:
## : :...Body.mass.index <= 35: low (24.2/9.3)
## : Body.mass.index > 35: moderate (3.2)
## Reason.for.absence in {1,5,6,9,10,11,13,15,17,19,24}:
## :...Transportation.expense > 179: moderate (108.3/39.1)
## Transportation.expense <= 179:
## :...Hit.target <= 84.5: moderate (4.4)
## Hit.target > 84.5:
## :...Day.of.the.week in {2,5,6}: low (32.1/14.3)
## Day.of.the.week = 3: moderate (15.8/7.8)
## Day.of.the.week = 4:
## :...Height <= 174: low (15.6/6.7)
## Height > 174: high (9.4/3.7)
##
## ----- Trial 6: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3,4,8,16,18,21,22,23,25,26,27,28}: low (282.4/114.2)
## Reason.for.absence in {0,1,5,6,7,9,10,11,12,13,14,15,17,19,24}:
## :...Reason.for.absence in {0,9,12}: high (26.6/7.1)
## Reason.for.absence in {1,5,6,13,15,17,24}: moderate (97.8/48.7)
## Reason.for.absence in {7,14}: low (38.4/21.6)
## Reason.for.absence = 10:
## :...Height <= 174: moderate (21.7/10)
## : Height > 174: high (3.2)
## Reason.for.absence = 11:
## :...Body.mass.index <= 24: moderate (14.2/6.7)
## : Body.mass.index > 24: low (11/3.2)
## Reason.for.absence = 19:
## :...Height > 174: low (4.1)
## Height <= 174:
## :...Distance.from.Residence.to.Work <= 17: high (5.6/1.6)
## Distance.from.Residence.to.Work > 17: moderate (23.8/9.2)
##
## ----- Trial 7: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence in {4,8,10,16,18,21,22,23,25,26,27,28}:
## :...Transportation.expense > 291: moderate (42.1/13.1)
## : Transportation.expense <= 291:
## : :...Reason.for.absence in {4,10,18,21}: moderate (52.2/25.2)
## : Reason.for.absence in {8,16,22,25,27,28}: low (102.6/29.1)
## : Reason.for.absence = 23:
## : :...Work.load.Average.day <= 21:
## : : :...Service.time <= 18: low (37.9/3.5)
## : : : Service.time > 18: moderate (2.5)
## : : Work.load.Average.day > 21:
## : : :...Service.time <= 13: moderate (26.1/10.9)
## : : Service.time > 13: low (4.6)
## : Reason.for.absence = 26:
## : :...Work.load.Average.day > 33: high (2.7)
## : Work.load.Average.day <= 33:
## : :...Age <= 17: moderate (24.6/6.5)
## : Age > 17: low (3.5)
## Reason.for.absence in {0,1,5,6,7,9,11,12,13,14,15,17,19,24}:
## :...Reason.for.absence in {0,9}: high (16.6/2.7)
## Reason.for.absence in {1,5,6,15,17,24}: moderate (22.7/6)
## Reason.for.absence in {7,12,14}: low (49.8/28.6)
## Reason.for.absence = 11:
## :...Work.load.Average.day <= 26: low (12/3.6)
## : Work.load.Average.day > 26: moderate (13.7/5)
## Reason.for.absence = 19:
## :...Height > 174: low (3.9)
## : Height <= 174:
## : :...Distance.from.Residence.to.Work <= 17: high (5.6/1.9)
## : Distance.from.Residence.to.Work > 17: moderate (23.9/10.1)
## Reason.for.absence = 13:
## :...Work.load.Average.day > 34: high (14.7/6)
## Work.load.Average.day <= 34:
## :...Transportation.expense <= 118: low (4.3)
## Transportation.expense > 118:
## :...Hit.target <= 93: moderate (10.8/2.4)
## Hit.target > 93:
## :...Body.mass.index > 35: high (2.2)
## Body.mass.index <= 35:
## :...Age > 15: low (4.5)
## Age <= 15:
## :...Hit.target > 99: low (2.7)
## Hit.target <= 99:
## :...Transportation.expense > 189: moderate (14.3/6)
## Transportation.expense <= 189:
## :...Hit.target <= 97: low (16.1/7.5)
## Hit.target > 97: high (8.2/2.8)
##
## ----- Trial 8: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence in {0,1,5,6,7,9,11,12,13,14,15,17,19,24}:
## :...Reason.for.absence in {0,9,12,14}: high (44.2/17.4)
## : Reason.for.absence in {1,5,6,15,17,24}: moderate (17.7/3.1)
## : Reason.for.absence = 7:
## : :...Transportation.expense <= 279: moderate (15.6/6.5)
## : : Transportation.expense > 279: high (7.5)
## : Reason.for.absence = 11:
## : :...Body.mass.index <= 24: high (16.1/8.1)
## : : Body.mass.index > 24: low (10.9/3.9)
## : Reason.for.absence = 19:
## : :...Height <= 174: high (30.2/14.5)
## : : Height > 174: low (3.4)
## : Reason.for.absence = 13:
## : :...Day.of.the.week in {2,6}: high (25.6/12.8)
## : Day.of.the.week = 3: moderate (13.2/6.3)
## : Day.of.the.week = 5: low (17.2/9.2)
## : Day.of.the.week = 4:
## : :...Hit.target <= 95: moderate (18.1/8.2)
## : Hit.target > 95: high (4.1)
## Reason.for.absence in {4,8,10,16,18,21,22,23,25,26,27,28}:
## :...Height > 175: low (39/19.5)
## Height <= 175:
## :...Work.load.Average.day > 31: moderate (23.6/8.3)
## Work.load.Average.day <= 31:
## :...Service.time <= 9: low (57/7.4)
## Service.time > 9:
## :...Distance.from.Residence.to.Work > 50: low (29.4/4.1)
## Distance.from.Residence.to.Work <= 50:
## :...Work.load.Average.day > 29: low (11.3/0.8)
## Work.load.Average.day <= 29:
## :...Age <= 9: moderate (38.6/10.1)
## Age > 9:
## :...Height > 172: moderate (4.6)
## Height <= 172:
## :...Hit.target <= 88: low (6.2)
## Hit.target > 88:
## :...Transportation.expense > 268: low (6.8)
## Transportation.expense <= 268:
## :...Body.mass.index > 32: moderate (5.2)
## Body.mass.index <= 32:
## :...Height <= 165: low (9.1/1.3)
## Height > 165: [S1]
##
## SubTree [S1]
##
## Reason.for.absence in {4,21,22,26}: moderate (16.9)
## Reason.for.absence in {8,10,16,18,23,25,27,28}:
## :...Body.mass.index > 28: low (7)
## Body.mass.index <= 28:
## :...Day.of.the.week in {5,6}: low (5.2)
## Day.of.the.week in {2,3,4}:
## :...Work.load.Average.day > 18: moderate (7.7/1.4)
## Work.load.Average.day <= 18:
## :...Reason.for.absence in {8,16,23,27,28}: low (12.5)
## Reason.for.absence in {10,18,25}: moderate (15.9/5.4)
##
## ----- Trial 9: -----
##
## Decision tree:
##
## Reason.for.absence in {0,9}: high (10.9)
## Reason.for.absence in {4,5,6,8,15,17,18,21,22,24}: moderate (74.1/22.5)
## Reason.for.absence in {2,3,16,23,25,27,28}: low (147.2/23.3)
## Reason.for.absence in {1,7,10,11,12,13,14,19,26}:
## :...Transportation.expense <= 157:
## :...Height <= 174: low (17.2/6.1)
## : Height > 174:
## : :...Day.of.the.week in {2,4}: high (17.6/1.4)
## : Day.of.the.week in {3,5,6}: low (5.1)
## Transportation.expense > 157:
## :...Reason.for.absence in {1,10,11,19}: moderate (77.1/26.8)
## Reason.for.absence = 12: low (11/4.3)
## Reason.for.absence = 7:
## :...Transportation.expense <= 279: moderate (15.6/7.5)
## : Transportation.expense > 279: high (6.7)
## Reason.for.absence = 14:
## :...Body.mass.index <= 25: high (17.4/9.7)
## : Body.mass.index > 25: low (8.6)
## Reason.for.absence = 26:
## :...Work.load.Average.day <= 33: moderate (24.6/7.3)
## : Work.load.Average.day > 33: high (3.1)
## Reason.for.absence = 13:
## :...Hit.target <= 93: moderate (14.9/2.8)
## Hit.target > 93:
## :...Work.load.Average.day <= 7: low (7.2/2.1)
## Work.load.Average.day > 7:
## :...Hit.target > 99: low (3.1)
## Hit.target <= 99:
## :...Hit.target > 97: high (8.7/1.8)
## Hit.target <= 97:
## :...Hit.target > 96: moderate (11/2.1)
## Hit.target <= 96:
## :...Body.mass.index <= 21: moderate (6.3/2.6)
## Body.mass.index > 21: high (21/8)
##
##
## Evaluation on training data (529 cases):
##
## Trial Decision Tree
## ----- ----------------
## Size Errors
##
## 0 33 63(11.9%)
## 1 23 109(20.6%)
## 2 30 98(18.5%)
## 3 19 109(20.6%)
## 4 30 103(19.5%)
## 5 20 112(21.2%)
## 6 11 140(26.5%)
## 7 27 107(20.2%)
## 8 30 119(22.5%)
## 9 21 80(15.1%)
## boost 45( 8.5%) <<
##
##
## (a) (b) (c) <-classified as
## ---- ---- ----
## 62 3 8 (a): class high
## 1 291 10 (b): class low
## 2 21 131 (c): class moderate
##
##
## Attribute usage:
##
## 100.00% Reason.for.absence
## 93.95% Transportation.expense
## 90.74% Work.load.Average.day
## 90.36% Height
## 88.09% Body.mass.index
## 80.15% Day.of.the.week
## 69.94% Service.time
## 64.84% Distance.from.Residence.to.Work
## 62.19% Hit.target
## 44.23% Age
##
##
## Time: 0.0 secs
plot(c50.fit)
c50.fit.pred <- predict(c50.fit, test)
print(postResample(pred = c50.fit.pred, obs = test.group))
## Accuracy Kappa
## 0.6992481 0.4493894
confusionMatrix(c50.fit.pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 5 2 3
## low 8 65 11
## moderate 7 9 23
##
## Overall Statistics
##
## Accuracy : 0.6992
## 95% CI : (0.6137, 0.7757)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.001653
##
## Kappa : 0.4494
##
## Mcnemar's Test P-Value : 0.144744
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.25000 0.8553 0.6216
## Specificity 0.95575 0.6667 0.8333
## Pos Pred Value 0.50000 0.7738 0.5897
## Neg Pred Value 0.87805 0.7755 0.8511
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.03759 0.4887 0.1729
## Detection Prevalence 0.07519 0.6316 0.2932
## Balanced Accuracy 0.60288 0.7610 0.7275
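## C5imp() from the C50 package summarizes predictor importance for the boosted model; a minimal sketch
## (its "usage" metric mirrors the attribute-usage table in the summary above):
C5imp(c50.fit, metric = "usage")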
## Recursive partitioning classification tree - rpart
m2 = rpart(absentgroup ~ .,train, method = "class")
m2.pred = predict(m2, test, type = "class")
print(postResample(pred = m2.pred, obs = test.group))
## Accuracy Kappa
## 0.7142857 0.4774607
confusionMatrix(m2.pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 4 0 2
## low 8 65 9
## moderate 8 11 26
##
## Overall Statistics
##
## Accuracy : 0.7143
## 95% CI : (0.6295, 0.7892)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.0004757
##
## Kappa : 0.4775
##
## Mcnemar's Test P-Value : 0.0081006
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.20000 0.8553 0.7027
## Specificity 0.98230 0.7018 0.8021
## Pos Pred Value 0.66667 0.7927 0.5778
## Neg Pred Value 0.87402 0.7843 0.8750
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.03008 0.4887 0.1955
## Detection Prevalence 0.04511 0.6165 0.3383
## Balanced Accuracy 0.59115 0.7785 0.7524
plot(m2)
text(m2, pretty = 0, cex = 0.8)
prp(m2, varlen = 4, extra = 2)
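## If further tuning of the rpart tree were desired, the complexity-parameter table shows how the
## cross-validated error changes with tree size; a minimal sketch (no pruning is applied to the results above):
printcp(m2)                                                     # cp table with cross-validated error (xerror)
best_cp <- m2$cptable[which.min(m2$cptable[, "xerror"]), "CP"]  # cp with the lowest xerror
m2_pruned <- prune(m2, cp = best_cp)                            # pruned tree, for comparison only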
## Support Vector Machine
absent_classifier <- ksvm(absentgroup ~ ., data = train, kernel = "vanilladot")
## Setting default kernel parameters
absent_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 295
##
## Objective Function Value : -66.3712 -76.3851 -135.2174
## Training error : 0.190926
#Evaluating the SVM Model Performance
absent_predictions <- predict(absent_classifier, test)
table(absent_predictions, test.group )
## test.group
## absent_predictions high low moderate
## high 4 0 0
## low 4 67 11
## moderate 12 9 26
#Agreement between SVM predictions and the actual classes
agreement <- absent_predictions == test.group
table(agreement)
## agreement
## FALSE TRUE
## 36 97
print(postResample(pred = absent_predictions, obs = test.group))
## Accuracy Kappa
## 0.7293233 0.5032164
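## Since the earlier pairs plot suggested the data are not linear, a non-linear kernel is worth a try;
## a minimal sketch with kernlab's RBF kernel ("rbfdot"), not used for the final model:
set.seed(1234)
absent_classifier_rbf <- ksvm(absentgroup ~ ., data = train, kernel = "rbfdot")
rbf_pred <- predict(absent_classifier_rbf, test)
print(postResample(pred = rbf_pred, obs = test.group))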
############## Random Forest is our Best Performer ##################
############### Final Prediction on entire data set ##################
finalData = rbind(train, test)
final_fit <- predict(rf.fit5, finalData, type = "class")
summary(final_fit)
## high low moderate
## 89 382 191
#table(final_fit, modeldata$absentgroup)
print(postResample(pred = final_fit, obs = finalData$absentgroup))
## Accuracy Kappa
## 0.9335347 0.8830623
confusionMatrix(final_fit, finalData$absentgroup)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 81 1 7
## low 5 365 12
## moderate 7 12 172
##
## Overall Statistics
##
## Accuracy : 0.9335
## 95% CI : (0.9118, 0.9513)
## No Information Rate : 0.571
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8831
##
## Mcnemar's Test P-Value : 0.4459
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.8710 0.9656 0.9005
## Specificity 0.9859 0.9401 0.9597
## Pos Pred Value 0.9101 0.9555 0.9005
## Neg Pred Value 0.9791 0.9536 0.9597
## Prevalence 0.1405 0.5710 0.2885
## Detection Rate 0.1224 0.5514 0.2598
## Detection Prevalence 0.1344 0.5770 0.2885
## Balanced Accuracy 0.9285 0.9529 0.9301
### The best performer is the random forest with mtry = 5. On the entire data set (training plus test), its prediction accuracy is about 93% with a Kappa of 0.88. Note that this evaluation includes the training observations, so it overstates out-of-sample performance; the held-out test accuracy was about 72%.
################################## End of Project Absenteeism at Work #####################################