Prediction of Absenteeism at Workplace

## Start from an empty global environment
rm(list = ls())

## Point the R session at the project folder (edit this path for your machine)
workingdirectory <- "C:/Users/MANISHA/Desktop/ANLY530-ML1-Project/ANLY530-Final/Final_Dliverables"
setwd(workingdirectory)

## Libraries required for this project
library(ggplot2)

library(corrplot)

## corrplot 0.84 loaded

library(factoextra)

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

library(NbClust)

library(cluster)

library(purrr)

library(MASS)

library(gridExtra)

library(tree)

library(Metrics)

library(randomForest)

## randomForest 4.6-14

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:gridExtra':
## 
##     combine

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(C50) 

library(kernlab)

## 
## Attaching package: 'kernlab'

## The following object is masked from 'package:purrr':
## 
##     cross

## The following object is masked from 'package:ggplot2':
## 
##     alpha

library(caret)

## Loading required package: lattice

## 
## Attaching package: 'caret'

## The following objects are masked from 'package:Metrics':
## 
##     precision, recall

## The following object is masked from 'package:purrr':
## 
##     lift

library(rpart)

library(rpart.plot)

## Load the semicolon-delimited Absenteeism_at_work.csv (with a header row) from the working directory.
absentdata <- read.csv("Absenteeism_at_work.csv", header = TRUE, sep = ";")

Data Exploration and Preprocessing

## Structure of the data: lists each column's name, type and first few values
str(absentdata)

## 'data.frame':    740 obs. of  21 variables:
##  $ ID                             : int  11 36 3 7 11 3 10 20 14 1 ...
##  $ Reason.for.absence             : int  26 0 23 7 23 23 22 23 19 22 ...
##  $ Month.of.absence               : int  7 7 7 7 7 7 7 7 7 7 ...
##  $ Day.of.the.week                : int  3 3 4 5 5 6 6 6 2 2 ...
##  $ Seasons                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Transportation.expense         : int  289 118 179 279 289 179 361 260 155 235 ...
##  $ Distance.from.Residence.to.Work: int  36 13 51 5 36 51 52 50 12 11 ...
##  $ Service.time                   : int  13 18 18 14 13 18 3 11 14 14 ...
##  $ Age                            : int  33 50 38 39 33 38 28 36 34 37 ...
##  $ Work.load.Average.day          : num  240 240 240 240 240 ...
##  $ Hit.target                     : int  97 97 97 97 97 97 97 97 97 97 ...
##  $ Disciplinary.failure           : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Education                      : int  1 1 1 1 1 1 1 1 1 3 ...
##  $ Son                            : int  2 1 0 2 2 0 1 4 2 1 ...
##  $ Social.drinker                 : int  1 1 1 1 1 1 1 1 1 0 ...
##  $ Social.smoker                  : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ Pet                            : int  1 0 0 0 1 0 4 0 0 1 ...
##  $ Weight                         : int  90 98 89 68 90 89 80 65 95 88 ...
##  $ Height                         : int  172 178 170 168 172 170 172 168 196 172 ...
##  $ Body.mass.index                : int  30 31 31 24 30 31 27 23 25 29 ...
##  $ Absenteeism.time.in.hours      : int  4 0 2 4 2 2 8 4 40 8 ...

## Summary statistics (minimum, quartiles, mean and maximum) for every column
summary(absentdata)

##        ID        Reason.for.absence Month.of.absence Day.of.the.week
##  Min.   : 1.00   Min.   : 0.00      Min.   : 0.000   Min.   :2.000  
##  1st Qu.: 9.00   1st Qu.:13.00      1st Qu.: 3.000   1st Qu.:3.000  
##  Median :18.00   Median :23.00      Median : 6.000   Median :4.000  
##  Mean   :18.02   Mean   :19.22      Mean   : 6.324   Mean   :3.915  
##  3rd Qu.:28.00   3rd Qu.:26.00      3rd Qu.: 9.000   3rd Qu.:5.000  
##  Max.   :36.00   Max.   :28.00      Max.   :12.000   Max.   :6.000  
##     Seasons      Transportation.expense Distance.from.Residence.to.Work
##  Min.   :1.000   Min.   :118.0          Min.   : 5.00                  
##  1st Qu.:2.000   1st Qu.:179.0          1st Qu.:16.00                  
##  Median :3.000   Median :225.0          Median :26.00                  
##  Mean   :2.545   Mean   :221.3          Mean   :29.63                  
##  3rd Qu.:4.000   3rd Qu.:260.0          3rd Qu.:50.00                  
##  Max.   :4.000   Max.   :388.0          Max.   :52.00                  
##   Service.time        Age        Work.load.Average.day   Hit.target    
##  Min.   : 1.00   Min.   :27.00   Min.   :205.9         Min.   : 81.00  
##  1st Qu.: 9.00   1st Qu.:31.00   1st Qu.:244.4         1st Qu.: 93.00  
##  Median :13.00   Median :37.00   Median :264.2         Median : 95.00  
##  Mean   :12.55   Mean   :36.45   Mean   :271.5         Mean   : 94.59  
##  3rd Qu.:16.00   3rd Qu.:40.00   3rd Qu.:294.2         3rd Qu.: 97.00  
##  Max.   :29.00   Max.   :58.00   Max.   :378.9         Max.   :100.00  
##  Disciplinary.failure   Education          Son        Social.drinker  
##  Min.   :0.00000      Min.   :1.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0.00000      1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0.00000      Median :1.000   Median :1.000   Median :1.0000  
##  Mean   :0.05405      Mean   :1.292   Mean   :1.019   Mean   :0.5676  
##  3rd Qu.:0.00000      3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :1.00000      Max.   :4.000   Max.   :4.000   Max.   :1.0000  
##  Social.smoker          Pet             Weight           Height     
##  Min.   :0.00000   Min.   :0.0000   Min.   : 56.00   Min.   :163.0  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.: 69.00   1st Qu.:169.0  
##  Median :0.00000   Median :0.0000   Median : 83.00   Median :170.0  
##  Mean   :0.07297   Mean   :0.7459   Mean   : 79.04   Mean   :172.1  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.: 89.00   3rd Qu.:172.0  
##  Max.   :1.00000   Max.   :8.0000   Max.   :108.00   Max.   :196.0  
##  Body.mass.index Absenteeism.time.in.hours
##  Min.   :19.00   Min.   :  0.000          
##  1st Qu.:24.00   1st Qu.:  2.000          
##  Median :25.00   Median :  3.000          
##  Mean   :26.68   Mean   :  6.924          
##  3rd Qu.:31.00   3rd Qu.:  8.000          
##  Max.   :38.00   Max.   :120.000

## A reason code of zero is not valid: neither the ICD nor the non-ICD code list supports it.
## Every observation with a zero reason code is therefore removed.
range(absentdata$Reason.for.absence)

## [1]  0 28

absentdata <- absentdata[absentdata$Reason.for.absence != 0, ]

## List the observations whose absenteeism time is zero hours:
##   a - reason code greater than zero but no absence time recorded
##   b - reason code zero and no absence time recorded
a <- subset(absentdata,
            Reason.for.absence > 0 & Absenteeism.time.in.hours <= 0,
            select = c(ID, Reason.for.absence, Absenteeism.time.in.hours))
b <- subset(absentdata,
            Reason.for.absence == 0 & Absenteeism.time.in.hours <= 0,
            select = c(ID, Reason.for.absence, Absenteeism.time.in.hours))
as.matrix(rbind(a, b))

##     ID Reason.for.absence Absenteeism.time.in.hours
## 135 34                 27                         0

## The row with reason code 27 is the only one with zero absenteeism time; remove it.
## A row is kept when its absence time is non-zero or its reason code is not positive.
absentdata <- absentdata[absentdata$Absenteeism.time.in.hours != 0 | absentdata$Reason.for.absence <= 0, ]

## At this point we have 696 observations and 21 attributes. 
#dim(absentdata)
#summary(absentdata)

## Disciplinary.failure is noise: only zeros remain in that column.
range(absentdata$Disciplinary.failure)

## [1] 0 0

## Remove the Disciplinary.failure attribute (column 12 of the data frame).
absentdata[["Disciplinary.failure"]] <- NULL

#str(absentdata)
## Now we have 696 observations and 20 attributes.

## Missing value analysis: number of NA entries in each column, shown as a one-column matrix
as.matrix(vapply(absentdata, function(column) sum(is.na(column)), numeric(1)))

##                                 [,1]
## ID                                 0
## Reason.for.absence                 0
## Month.of.absence                   0
## Day.of.the.week                    0
## Seasons                            0
## Transportation.expense             0
## Distance.from.Residence.to.Work    0
## Service.time                       0
## Age                                0
## Work.load.Average.day              0
## Hit.target                         0
## Education                          0
## Son                                0
## Social.drinker                     0
## Social.smoker                      0
## Pet                                0
## Weight                             0
## Height                             0
## Body.mass.index                    0
## Absenteeism.time.in.hours          0

# There are no missing values in any attribute.

## Box plot of absenteeism time (hours) for each reason code, used to check for outliers
## within each reason code.
## Columns are mapped by name with aes(); aes_string() expects strings rather than data
## vectors and is deprecated in ggplot2. Axis titles are set explicitly below.
ggplot(absentdata,
    aes(x = as.factor(Reason.for.absence), y = Absenteeism.time.in.hours)) +
    geom_boxplot() +
    xlab('Reason.for.absence') +
    ylab('Absenteeism.time.in.hours')

## Distribution of the continuous variables.

## Histogram of absenteeism time: highly right-skewed because of outliers.
with(absentdata,
     hist(Absenteeism.time.in.hours,
          breaks = 40,
          main = " Absenteeism time Distribution",
          xlab = "Absenteeism time in hours",
          col = "grey"))

## Box plot of the same variable.
with(absentdata,
     boxplot(Absenteeism.time.in.hours,
             main = "Box plot of Absenteeism time in hours"))

# Outlier analysis (before capping)

# Box plots for Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target
boxplot(absentdata[c("Transportation.expense", "Distance.from.Residence.to.Work",
                     "Service.time", "Age", "Hit.target")],
        col = "dark grey", varwidth = TRUE)

# Box plots for Weight, Height, Body.mass.index, Absenteeism.time.in.hours
boxplot(absentdata[c("Weight", "Height", "Body.mass.index", "Absenteeism.time.in.hours")],
        varwidth = TRUE, col = "grey")

# Box plot for Work.load.Average.day
boxplot(absentdata[["Work.load.Average.day"]], col = "grey")

## Outliers are not deleted; they are capped instead.
## For each listed column the fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR (Q1 and Q3 being
## the 25th and 75th percentiles). Values below the lower fence are set to the lower fence
## and values above the upper fence are set to the upper fence.

for (col_name in c('Transportation.expense','Service.time','Age','Work.load.Average.day','Hit.target','Height','Absenteeism.time.in.hours')) {
  values <- absentdata[[col_name]]
  quartiles <- quantile(values, probs = c(0.25, 0.75), names = FALSE)
  whisker <- 1.5 * (quartiles[2] - quartiles[1])
  lower_fence <- quartiles[1] - whisker
  upper_fence <- quartiles[2] + whisker
  values[values < lower_fence] <- lower_fence
  values[values > upper_fence] <- upper_fence
  absentdata[[col_name]] <- values
}

## Reason for absence vs absenteeism time (hours) after outlier capping.
## Columns are mapped by name with aes(); aes_string() expects strings rather than data
## vectors and is deprecated in ggplot2. Axis titles are set explicitly below.
ggplot(absentdata,
    aes(x = as.factor(Reason.for.absence), y = Absenteeism.time.in.hours)) +
    geom_boxplot() +
    xlab('Reason.for.absence') +
    ylab('Absenteeism.time.in.hours')

# Box plots for Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target
boxplot(absentdata[c("Transportation.expense", "Distance.from.Residence.to.Work",
                     "Service.time", "Age", "Hit.target")],
        col = "dark grey", varwidth = TRUE)

# Box plots for Weight, Height, Body.mass.index, Absenteeism.time.in.hours
boxplot(absentdata[c("Weight", "Height", "Body.mass.index", "Absenteeism.time.in.hours")],
        varwidth = TRUE, col = "grey")

# Box plot for Work.load.Average.day
boxplot(absentdata[["Work.load.Average.day"]], col = "grey")

## Data independence / multicollinearity checks.
## Start with the categorical variables.
categorical_var <- c("Reason.for.absence", "Month.of.absence", "Day.of.the.week",
                     "Seasons", "Education", "Social.drinker",
                     "Social.smoker", "Son", "Pet")

## Convert each categorical column to a factor.
absentdata[categorical_var] <- lapply(absentdata[categorical_var], factor)
#str(absentdata)

# Chi-square test of independence for every ordered pair of categorical attributes.
# pvalue holds one p-value per pair, in the order the pairs are visited:
# all pairs for the first attribute, then all pairs for the second, and so on.
pvalue <- numeric(length(categorical_var)^2)
slot <- 0L

for (i in categorical_var) {
  for (j in categorical_var) {
    slot <- slot + 1L
    pvalue[slot] <- chisq.test(absentdata[, i], absentdata[, j])$p.value
  }
}

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect

length(pvalue)

## [1] 81

# Arrange the p-values in a square table, one row and one column per categorical attribute.
m1 <- matrix(pvalue, ncol = length(categorical_var))
df <- as.data.frame(m1)
dimnames(df) <- list(categorical_var, categorical_var)
print(df)

##                    Reason.for.absence Month.of.absence Day.of.the.week
## Reason.for.absence       0.000000e+00     2.191275e-14    5.170971e-02
## Month.of.absence         2.191275e-14     0.000000e+00    6.765307e-01
## Day.of.the.week          5.170971e-02     6.765307e-01    0.000000e+00
## Seasons                  6.493726e-19     0.000000e+00    4.086874e-01
## Education                1.895058e-10     8.873931e-03    6.369421e-01
## Social.drinker           2.253390e-08     3.040382e-02    3.040911e-01
## Social.smoker            1.742354e-09     1.838631e-02    5.388357e-01
## Son                      1.788850e-19     4.226723e-05    1.889736e-09
## Pet                      1.386394e-18     6.376062e-05    4.012904e-01
##                         Seasons    Education Social.drinker Social.smoker
## Reason.for.absence 6.493726e-19 1.895058e-10   2.253390e-08  1.742354e-09
## Month.of.absence   0.000000e+00 8.873931e-03   3.040382e-02  1.838631e-02
## Day.of.the.week    4.086874e-01 6.369421e-01   3.040911e-01  5.388357e-01
## Seasons            0.000000e+00 8.936680e-02   1.473154e-01  6.615252e-02
## Education          8.936680e-02 0.000000e+00   1.426601e-33  5.635243e-26
## Social.drinker     1.473154e-01 1.426601e-33  1.688763e-152  1.031400e-02
## Social.smoker      6.615252e-02 5.635243e-26   1.031400e-02 7.049811e-150
## Son                1.795308e-05 8.888863e-12   4.393599e-09  2.868110e-20
## Pet                1.090705e-04 3.323522e-29   1.303426e-26  6.587414e-14
##                             Son          Pet
## Reason.for.absence 1.788850e-19 1.386394e-18
## Month.of.absence   4.226723e-05 6.376062e-05
## Day.of.the.week    1.889736e-09 4.012904e-01
## Seasons            1.795308e-05 1.090705e-04
## Education          8.888863e-12 3.323522e-29
## Social.drinker     4.393599e-09 1.303426e-26
## Social.smoker      2.868110e-20 6.587414e-14
## Son                0.000000e+00 8.122222e-90
## Pet                8.122222e-90 0.000000e+00

## According to the chi-square tests, every categorical variable except Day.of.the.week is
## related to Reason.for.absence (p-values below 0.005). Those variables are removed, so
## Reason.for.absence and Day.of.the.week are the only categorical variables kept.
absentdata <- absentdata[, !(names(absentdata) %in% c("Month.of.absence", "Seasons", "Education",
                                                       "Son", "Social.drinker", "Social.smoker",
                                                       "Pet"))]


## Correlation matrix for the continuous attributes (columns 4 to 13)
m <- cor(absentdata[4:13])
corrplot(m, method = "number", order = "hclust", addrect = 3, tl.col = "black", tl.srt = 30)

## Correlations between Absenteeism.time.in.hours and the predictors are below 0.1, but
## Weight and Body.mass.index are highly collinear, so Weight (column 10) is removed.
absentdata[["Weight"]] <- NULL

## After data pre-processing we are left with 696 observations and 12 variables, including the target variable.

## Test for linearity in the data (the first three columns are left out)
pairs(absentdata[-(1:3)])

## The data is not linear, so linear models will not be a good choice for this data.


######################## End of Data Preprocessing ############################

# Total absenteeism hours per reason code, then each code's percentage share of all hours
Reasons <- aggregate(absentdata$Absenteeism.time.in.hours,
                     by = list(Category = absentdata$Reason.for.absence),
                     FUN = sum)
#print(as.data.frame(Reasons))
Reasons$Absence <- (Reasons$x / sum(absentdata$Absenteeism.time.in.hours)) * 100
# Largest share first
Reasons <- Reasons[order(Reasons$Absence, decreasing = TRUE), ]
#print(Reasons)
barplot(Reasons$Absence,
        names.arg = Reasons$Category,
        main = "How much proportion each reason code plays in absenteeism",
        xlab = "Reason for absence",
        ylab = "Absence",
        col = "dark grey")

## Taking backup of preprocessed data

#write.csv(modeldata, "modeldata.csv", row.names = F)

Model building using Machine Learning Algorithms.

### Use k-means clustering to see how many groups there are in the data set.
### The first three columns are left out, and the remaining columns are standardised.
modeldata <- absentdata[, -(1:3)]

df <- scale(modeldata)
## Choosing the number of clusters.
## wssplot: elbow-method helper based on the within-cluster sum of squares (WSS).
##   data - numeric matrix or data frame with one row per observation
##   nc   - largest number of clusters to evaluate (default 20, must be at least 1)
##   seed - RNG seed set before every kmeans() run so the plot is reproducible;
##          note that this resets the session's RNG state
## Plots WSS against k = 1..nc and invisibly returns the WSS vector.
wssplot <- function(data, nc = 20, seed = 1234) {
    stopifnot(length(nc) == 1, nc >= 1)
    wss <- numeric(nc)
    # k = 1: total sum of squares around the column means
    wss[1] <- (nrow(data) - 1) * sum(apply(data, 2, var))
    # seq_len(nc)[-1] is empty when nc is 1, whereas 2:nc would count backwards
    for (i in seq_len(nc)[-1]) {
        set.seed(seed)
        # exact component name; "$withins" only worked through partial matching
        wss[i] <- sum(kmeans(data, centers = i)$withinss)
    }
    plot(seq_len(nc), wss, type = "b", xlab = "Number of Clusters",
         ylab = "Within groups sum of squares")
    invisible(wss)
}

## Draw the elbow plot for the scaled data using the wssplot defaults.
wssplot(df)

## Fix the RNG seed, then run NbClust with k-means for 3 to 20 clusters.
set.seed(1234)
nc <- NbClust(df, min.nc = 3, max.nc = 20, method = "kmeans" )

## *** : The Hubert index is a graphical method of determining the number of clusters.
##                 In the plot of Hubert index, we seek a significant knee that corresponds to a 
##                 significant increase of the value of the measure i.e the significant peak in Hubert
##                 index second differences plot. 
##

## *** : The D index is a graphical method of determining the number of clusters. 
##                 In the plot of D index, we seek a significant knee (the significant peak in Dindex
##                 second differences plot) that corresponds to a significant increase of the value of
##                 the measure. 
##  
## ******************************************************************* 
## * Among all indices:                                                
## * 7 proposed 3 as the best number of clusters 
## * 1 proposed 4 as the best number of clusters 
## * 4 proposed 6 as the best number of clusters 
## * 1 proposed 7 as the best number of clusters 
## * 2 proposed 8 as the best number of clusters 
## * 1 proposed 13 as the best number of clusters 
## * 2 proposed 14 as the best number of clusters 
## * 2 proposed 17 as the best number of clusters 
## * 1 proposed 19 as the best number of clusters 
## * 2 proposed 20 as the best number of clusters 
## 
##                    ***** Conclusion *****                            
##  
## * According to the majority rule, the best number of clusters is  3 
##  
##  
## *******************************************************************

## Bar plot of how often each value appears in the first row of nc$Best.nc.
barplot(table(nc$Best.nc[1,]))
## According to the NbClust method, 3 would be the optimal value of K.

## The seed is reset before each fviz_nbclust call.
set.seed(1234)
### Elbow method - K = 3 optimal value
fviz_nbclust(df, kmeans, method = "wss")

## Average Silhouette Method - k = 9, optimal value
set.seed(1234)
fviz_nbclust(df, kmeans, method = "silhouette")

## Comparison of k-values

## The seed is set once; the four fits below then run in this order, each with 25 random starts.
set.seed(1234)
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k7 <- kmeans(df, centers = 7, nstart = 25)
k9 <- kmeans(df, centers = 9, nstart = 25)

# One cluster plot per candidate k, titled with its k, for side-by-side comparison
p1 = fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p2 = fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p3 = fviz_cluster(k7, geom = "point",  data = df) + ggtitle("k = 7")
p4 = fviz_cluster(k9, geom = "point",  data = df) + ggtitle("k = 9")

# Arrange the four plots in two rows
grid.arrange(p1, p2, p3, p4, nrow = 2)

# Compute the final k-means clustering with k = 3 (25 random starts, fixed seed)
set.seed(1234)
final <- kmeans(df, 3, nstart = 25)
# Print the cluster centres; the component is named "centers", and "$center"
# only worked through partial matching.
final$centers

##   Transportation.expense Distance.from.Residence.to.Work Service.time
## 1             -0.3192529                       0.9399145    1.0407530
## 2              0.5716311                       0.1896419   -0.7329393
## 3             -0.6703766                      -1.0206664    0.3776036
##          Age Work.load.Average.day  Hit.target      Height Body.mass.index
## 1  0.6674743           -0.22119948  0.04639312 -0.72124669      1.28823361
## 2 -0.7305156            0.03503184 -0.06509454 -0.07121779     -0.58346249
## 3  0.6586461            0.11282116  0.06867719  0.66439646     -0.05031317
##   Absenteeism.time.in.hours
## 1               -0.26202258
## 2                0.04704534
## 3                0.12477083

## Visualise the final 3-cluster solution on the scaled data
fviz_cluster(final, data = df)

#print(final)

## Build models the supervised-learning way.
## The response variable is Absenteeism.time.in.hours. It is turned into three classes
## (low, moderate, high) further below, and models are built to predict the class.
modeldata <- absentdata

## Frequency of each distinct absenteeism time
temp_table <- table(as.factor(modeldata$Absenteeism.time.in.hours))
barplot(temp_table,
        main = "Absenteeism frequency",
        xlab = "Absenteeism in Hours",
        ylab = "Frequency")

## Most often people are absent from work for 8 hours, which means full-day absenteeism is a common trend.

## According to the k-means clustering, this data comprises 3 groups, so three class
## labels are created for Absenteeism.time.in.hours:
##   "low"      - 1 to 4 hours
##   "moderate" - more than 4 and up to 8 hours
##   "high"     - everything else (more than 8 hours, or less than 1 hour)

## Two-level (low / high) labelling kept from the original script; nothing in the visible
## part of the script reads this vector afterwards.
absentgroup <- ifelse((modeldata$Absenteeism.time.in.hours >= 1 & modeldata$Absenteeism.time.in.hours <=4), "low", "high")

## Hours as whole numbers
tempdata <- as.integer(as.character(modeldata$Absenteeism.time.in.hours))

## Vectorised three-way labelling; replaces the row-by-row loop and uses the same
## thresholds and the same fall-through to "high".
modeldata$absentgroup <- ifelse(tempdata >= 1 & tempdata <= 4, "low",
                                ifelse(tempdata > 4 & tempdata <= 8, "moderate", "high"))

table(modeldata$absentgroup)

## 
##     high      low moderate 
##       63      417      216

## The class label becomes a factor so the models treat it as a categorical response.
modeldata$absentgroup <- factor(modeldata$absentgroup)

## Validation-set approach: 80% of the observations are used for training, the rest for testing.
## Absenteeism.time.in.hours (column 12) and ID (column 1) are removed first.
modeldata <- modeldata[, !(names(modeldata) %in% c("ID", "Absenteeism.time.in.hours"))]

#smp_size <- floor(0.75 * nrow(modeldata))

## Set the seed so the partition is reproducible
set.seed(1234)
train_index <- sample(seq_len(nrow(modeldata)), 0.8 * nrow(modeldata))
train <- modeldata[train_index, ]
test <- modeldata[-train_index, ]
## True class labels of the held-out rows, used to score predictions
test.group <- test$absentgroup

## First model - simple classification tree built with the tree() function of the "tree" package.
## absentgroup is modelled from all other columns of the training set.

model_tree <- tree(absentgroup ~ . , data = train)
summary(model_tree)

## 
## Classification tree:
## tree(formula = absentgroup ~ ., data = train)
## Variables actually used in tree construction:
## [1] "Reason.for.absence"     "Body.mass.index"       
## [3] "Transportation.expense" "Hit.target"            
## [5] "Height"                 "Age"                   
## Number of terminal nodes:  13 
## Residual mean deviance:  0.9118 = 495.1 / 543 
## Misclassification error rate: 0.1727 = 96 / 556

## Draw the fitted tree and add its text labels
plot(model_tree)
text(model_tree, pretty = 0, cex = 0.8)

## Predicted class for each row of the test set
model_tree_pred = predict(model_tree, test, type = "class")

#conf_matrix = table(model_tree_pred, test.group)
#model_tree_acu = sum(diag(conf_matrix))/sum(conf_matrix)


## Accuracy and Kappa of the predictions against the true test labels
print(postResample(pred = model_tree_pred, obs = test.group))

##  Accuracy     Kappa 
## 0.6571429 0.3761604

## Confusion matrix and per-class statistics for the tree's test-set predictions
confusionMatrix(model_tree_pred, test.group)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low moderate
##   high        3   3        4
##   low         3  64       14
##   moderate   10  14       25
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6571          
##                  95% CI : (0.5723, 0.7352)
##     No Information Rate : 0.5786          
##     P-Value [Acc > NIR] : 0.03515         
##                                           
##                   Kappa : 0.3762          
##  Mcnemar's Test P-Value : 0.46252         
## 
## Statistics by Class:
## 
##                      Class: high Class: low Class: moderate
## Sensitivity              0.18750     0.7901          0.5814
## Specificity              0.94355     0.7119          0.7526
## Pos Pred Value           0.30000     0.7901          0.5102
## Neg Pred Value           0.90000     0.7119          0.8022
## Prevalence               0.11429     0.5786          0.3071
## Detection Rate           0.02143     0.4571          0.1786
## Detection Prevalence     0.07143     0.5786          0.3500
## Balanced Accuracy        0.56552     0.7510          0.6670

## Linear Discriminant Analysis
## absentgroup is modelled from all other columns of the training set; the fit is then printed.

lda.fit = lda(absentgroup ~ ., data = train)
lda.fit

## Call:
## lda(absentgroup ~ ., data = train)
## 
## Prior probabilities of groups:
##       high        low   moderate 
## 0.08453237 0.60431655 0.31115108 
## 
## Group means:
##          Reason.for.absence2 Reason.for.absence3 Reason.for.absence4
## high               0.0212766         0.000000000         0.000000000
## low                0.0000000         0.000000000         0.002976190
## moderate           0.0000000         0.005780347         0.005780347
##          Reason.for.absence5 Reason.for.absence6 Reason.for.absence7
## high              0.00000000          0.02127660          0.04255319
## low               0.00297619          0.00297619          0.01190476
## moderate          0.01156069          0.01734104          0.02890173
##          Reason.for.absence8 Reason.for.absence9 Reason.for.absence10
## high             0.000000000         0.042553191          0.063829787
## low              0.008928571         0.000000000          0.008928571
## moderate         0.011560694         0.005780347          0.080924855
##          Reason.for.absence11 Reason.for.absence12 Reason.for.absence13
## high               0.08510638          0.063829787           0.19148936
## low                0.02976190          0.005952381           0.03869048
## moderate           0.04624277          0.000000000           0.09248555
##          Reason.for.absence14 Reason.for.absence15 Reason.for.absence16
## high               0.04255319          0.000000000          0.000000000
## low                0.02380952          0.000000000          0.008928571
## moderate           0.02890173          0.005780347          0.000000000
##          Reason.for.absence17 Reason.for.absence18 Reason.for.absence19
## high              0.000000000           0.02127660           0.23404255
## low               0.000000000           0.00297619           0.01785714
## moderate          0.005780347           0.06936416           0.08670520
##          Reason.for.absence21 Reason.for.absence22 Reason.for.absence23
## high              0.000000000          0.021276596           0.04255319
## low               0.005952381          0.008928571           0.31250000
## moderate          0.017341040          0.173410405           0.05202312
##          Reason.for.absence24 Reason.for.absence25 Reason.for.absence26
## high               0.00000000           0.00000000           0.02127660
## low                0.00000000           0.06845238           0.02083333
## moderate           0.01734104           0.01734104           0.12716763
##          Reason.for.absence27 Reason.for.absence28 Day.of.the.week3
## high                0.0000000           0.04255319        0.1914894
## low                 0.1636905           0.25000000        0.2142857
## moderate            0.0000000           0.02312139        0.2080925
##          Day.of.the.week4 Day.of.the.week5 Day.of.the.week6
## high            0.2553191        0.1702128       0.06382979
## low             0.1696429        0.1934524       0.23214286
## moderate        0.2196532        0.1502890       0.16763006
##          Transportation.expense Distance.from.Residence.to.Work
## high                   225.7447                        26.93617
## low                    203.9077                        30.15476
## moderate               250.0694                        29.93642
##          Service.time      Age Work.load.Average.day Hit.target   Height
## high         12.68085 35.80851              286.3948   94.72340 171.6809
## low          12.90476 36.44494              266.8456   94.85417 170.9360
## moderate     11.74277 35.65607              274.2928   94.97110 171.0838
##          Body.mass.index
## high            25.74468
## low             26.56548
## moderate        26.74566
## 
## Coefficients of linear discriminants:
##                                           LD1           LD2
## Reason.for.absence2              0.4825247666 -8.8433185127
## Reason.for.absence3             -0.0332466691  1.4400404680
## Reason.for.absence4              1.9721431796  0.5578624263
## Reason.for.absence5              0.6587052967  0.6138922986
## Reason.for.absence6              0.5934601018 -0.9185998238
## Reason.for.absence7              1.3615778466 -1.1293384262
## Reason.for.absence8              1.8079169709  0.3248714944
## Reason.for.absence9             -0.0837661785 -5.4164221088
## Reason.for.absence10             0.3669252281 -0.3372586633
## Reason.for.absence11             1.6590260064 -1.1788767276
## Reason.for.absence12             1.4968567203 -5.2848492973
## Reason.for.absence13             1.1833600886 -1.6410287542
## Reason.for.absence14             2.0115292277 -1.1175940256
## Reason.for.absence15            -0.4710651836  0.8465044508
## Reason.for.absence16             3.6913390966 -0.4305365211
## Reason.for.absence17            -0.3338521155  1.8149828314
## Reason.for.absence18            -0.1225305400  0.4216789577
## Reason.for.absence19             0.5794519426 -2.5508155148
## Reason.for.absence21             1.2317651664  0.7256574734
## Reason.for.absence22             0.2366217296  0.5913152240
## Reason.for.absence23             3.3836776656 -0.5781132636
## Reason.for.absence24             0.0006886533  1.2486003490
## Reason.for.absence25             3.2146943220 -0.2736031074
## Reason.for.absence26             0.7475143736  0.3218743778
## Reason.for.absence27             3.6095836480 -0.5492118374
## Reason.for.absence28             3.4855830349 -0.7054934841
## Day.of.the.week3                -0.0320237693  0.4249944232
## Day.of.the.week4                -0.0781944490  0.1230870433
## Day.of.the.week5                 0.0953873608  0.3752321668
## Day.of.the.week6                 0.1574424590  0.4888155395
## Transportation.expense          -0.0043092780  0.0004016558
## Distance.from.Residence.to.Work  0.0041536634  0.0070208063
## Service.time                    -0.0356503696 -0.0813113277
## Age                              0.0185138276  0.0238804089
## Work.load.Average.day           -0.0012479059 -0.0027344620
## Hit.target                       0.0016184504  0.0066126378
## Height                           0.0139270637 -0.0330145373
## Body.mass.index                 -0.0086373798  0.0419995730
## 
## Proportion of trace:
##    LD1    LD2 
## 0.8515 0.1485

# summary(lda.fit)
# Training rows on the discriminant axes, coloured by the integer code of
# their absence group.
plot(lda.fit, col = as.integer(train[["absentgroup"]]))

# Same model restricted to the first discriminant.
plot(lda.fit, dimen = 1, type = "b")

# Score the hold-out rows, keep the predicted class as a temporary `lda`
# column on `test`, and cross-tabulate predicted (rows) vs actual (columns).
lda.test <- predict(lda.fit, newdata = test)
test[["lda"]] <- lda.test[["class"]]
print(table(test[["lda"]], test[["absentgroup"]]))

##           
##            high low moderate
##   high        7   4        3
##   low         2  66       15
##   moderate    7  11       25

# Accuracy and Kappa of the LDA class predictions against the hold-out labels.
print(caret::postResample(pred = test[["lda"]], obs = test.group))

##  Accuracy     Kappa 
## 0.7000000 0.4557571

# Full confusion matrix and per-class statistics for the LDA predictions.
print(caret::confusionMatrix(data = test[["lda"]], reference = test.group))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low moderate
##   high        7   4        3
##   low         2  66       15
##   moderate    7  11       25
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7             
##                  95% CI : (0.6168, 0.7745)
##     No Information Rate : 0.5786          
##     P-Value [Acc > NIR] : 0.002055        
##                                           
##                   Kappa : 0.4558          
##  Mcnemar's Test P-Value : 0.410170        
## 
## Statistics by Class:
## 
##                      Class: high Class: low Class: moderate
## Sensitivity               0.4375     0.8148          0.5814
## Specificity               0.9435     0.7119          0.8144
## Pos Pred Value            0.5000     0.7952          0.5814
## Neg Pred Value            0.9286     0.7368          0.8144
## Prevalence                0.1143     0.5786          0.3071
## Detection Rate            0.0500     0.4714          0.1786
## Detection Prevalence      0.1000     0.5929          0.3071
## Balanced Accuracy         0.6905     0.7633          0.6979

# Histograms of the first discriminant score of the test rows, one panel per
# actual absence group.
ldahist(data = lda.test$x[, 1], g = test.group)

# Test rows in the LD1/LD2 plane, each labelled with its actual group.
plot(lda.test$x[, 1], lda.test$x[, 2])
# Pick the colour by class code so every group has exactly one colour. A bare
# three-colour vector would be recycled over the rows in order, which makes the
# colour depend on row position instead of on the group.
text(lda.test$x[, 1], lda.test$x[, 2], test$absentgroup, cex = 0.7, pos = 4,
     col = c("red", "green", "blue")[as.integer(as.factor(test$absentgroup))])

# Drop the temporary LDA prediction column by name so the later models see the
# original predictors only; removing it by name does not depend on how many
# columns `test` happens to have.
test$lda <- NULL

## Random forest
# Fix the RNG seed so the forest below is repeatable from run to run.
set.seed(1234)
# Baseline forest with the package defaults. In the run recorded below this
# means 500 trees and 3 variables tried per split, with an OOB error of 27.88%.
rf.fit <- randomForest(absentgroup ~ ., data = train, importance = TRUE)
print(rf.fit)

## 
## Call:
##  randomForest(formula = absentgroup ~ ., data = train, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 3
## 
##         OOB estimate of  error rate: 27.88%
## Confusion matrix:
##          high low moderate class.error
## high        1  15       31   0.9787234
## low         4 290       42   0.1369048
## moderate   12  51      110   0.3641618

# rf.fit.pred <- predict(rf.fit, test, type = "class")

# Tuned forest: same formula, 500 trees, six variables tried per split.
# The run recorded below reports an OOB error of 26.44%.
rf.fit1 <- randomForest(
  absentgroup ~ .,
  data = train,
  ntree = 500,
  mtry = 6,
  importance = TRUE
)
print(rf.fit1)

## 
## Call:
##  randomForest(formula = absentgroup ~ ., data = train, ntree = 500,      mtry = 6, importance = TRUE) 
##                Type of random forest: classification
##                      Number of trees: 500
## No. of variables tried at each split: 6
## 
##         OOB estimate of  error rate: 26.44%
## Confusion matrix:
##          high low moderate class.error
## high        3  11       33   0.9361702
## low         6 286       44   0.1488095
## moderate   14  39      120   0.3063584

# Class predictions of the mtry = 6 forest for the hold-out rows.
rf.fit1.pred <- predict(rf.fit1, newdata = test, type = "class")

# Hold-out Accuracy and Kappa for those predictions.
print(caret::postResample(pred = rf.fit1.pred, obs = test.group))

##  Accuracy     Kappa 
## 0.7000000 0.4339078

# Confusion matrix and per-class statistics for the mtry = 6 forest.
print(caret::confusionMatrix(data = rf.fit1.pred, reference = test.group))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low moderate
##   high        5   1        1
##   low         2  69       18
##   moderate    9  11       24
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7             
##                  95% CI : (0.6168, 0.7745)
##     No Information Rate : 0.5786          
##     P-Value [Acc > NIR] : 0.002055        
##                                           
##                   Kappa : 0.4339          
##  Mcnemar's Test P-Value : 0.038033        
## 
## Statistics by Class:
## 
##                      Class: high Class: low Class: moderate
## Sensitivity              0.31250     0.8519          0.5581
## Specificity              0.98387     0.6610          0.7938
## Pos Pred Value           0.71429     0.7753          0.5455
## Neg Pred Value           0.91729     0.7647          0.8021
## Prevalence               0.11429     0.5786          0.3071
## Detection Rate           0.03571     0.4929          0.1714
## Detection Prevalence     0.05000     0.6357          0.3143
## Balanced Accuracy        0.64819     0.7564          0.6760

# Per-class and overall variable importance measures of the mtry = 6 forest.
print(randomForest::importance(rf.fit1))

##                                       high       low     moderate
## Reason.for.absence              16.4707898 91.958915 52.722060264
## Day.of.the.week                 -2.6444497 -2.101536 -6.290773639
## Transportation.expense           2.4960105 22.916939 10.265611666
## Distance.from.Residence.to.Work -1.2800162  3.458159  0.688356662
## Service.time                    -2.3836102 10.839848  1.648585822
## Age                             -2.4761537 10.701084 -0.175352608
## Work.load.Average.day           -5.1675358  4.720620 -1.093288773
## Hit.target                       2.4093100 -1.412450 -3.307227354
## Height                           1.8859505  8.974606 -0.006048795
## Body.mass.index                  0.3430315  8.863048 -0.624923531
##                                 MeanDecreaseAccuracy MeanDecreaseGini
## Reason.for.absence                        97.5505864        135.22912
## Day.of.the.week                           -6.6601649         23.70837
## Transportation.expense                    24.0868828         24.48099
## Distance.from.Residence.to.Work            2.9637681         10.19540
## Service.time                              10.8946338         11.44768
## Age                                        8.9082448         11.17669
## Work.load.Average.day                      0.9409773         31.24681
## Hit.target                                -2.4529034         21.27021
## Height                                     8.0866204         10.23405
## Body.mass.index                            8.2537315         10.04194

# Dot chart of the variable importance measures of the mtry = 6 forest.
randomForest::varImpPlot(rf.fit1)

# Compare hold-out accuracy of forests grown with mtry = 3, ..., 8.
# `a` holds one accuracy per setting, in the same order as 3:8.
# The loop starts at 3 because the slot is a[i - 2]: mtry 3..8 map onto
# positions 1..6, whereas mtry 1 or 2 would index -1 or 0 and store nothing,
# so fitting those forests would only waste time and random draws.
# NOTE(review): accuracy is measured on `test`, which is also the set used for
# the final evaluation further down; consider tuning on OOB error or
# cross-validation instead.
a <- numeric(6)
# i = 5

for (i in 3:8) {
  rf.fit2 <- randomForest(absentgroup ~ ., data = train, ntree = 500,
                          mtry = i, importance = TRUE)
  rf.fit2.pred <- predict(rf.fit2, test, type = "class")
  a[i - 2] <- mean(rf.fit2.pred == test.group)
}
print(a)

## [1] 0.7071429 0.7214286 0.7071429 0.7071429 0.6928571 0.6857143

# Hold-out accuracy against mtry for the settings 3 to 8.
plot(3:8, a, type = "b")

# Refit with five variables tried per split and score the hold-out rows.
rf.fit5 <- randomForest(
  absentgroup ~ .,
  data = train,
  ntree = 500,
  mtry = 5,
  importance = TRUE
)
rf.fit5.pred <- predict(rf.fit5, newdata = test, type = "class")
print(caret::postResample(pred = rf.fit5.pred, obs = test.group))

##  Accuracy     Kappa 
## 0.7071429 0.4439062

# Confusion matrix and per-class statistics for the mtry = 5 forest.
print(caret::confusionMatrix(data = rf.fit5.pred, reference = test.group))

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low moderate
##   high        5   0        1
##   low         2  70       18
##   moderate    9  11       24
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7071          
##                  95% CI : (0.6243, 0.7809)
##     No Information Rate : 0.5786          
##     P-Value [Acc > NIR] : 0.001151        
##                                           
##                   Kappa : 0.4439          
##  Mcnemar's Test P-Value : 0.017819        
## 
## Statistics by Class:
## 
##                      Class: high Class: low Class: moderate
## Sensitivity              0.31250     0.8642          0.5581
## Specificity              0.99194     0.6610          0.7938
## Pos Pred Value           0.83333     0.7778          0.5455
## Neg Pred Value           0.91791     0.7800          0.8021
## Prevalence               0.11429     0.5786          0.3071
## Detection Rate           0.03571     0.5000          0.1714
## Detection Prevalence     0.04286     0.6429          0.3143
## Balanced Accuracy        0.65222     0.7626          0.6760

## Building the Classification Tree Models using the Quinlan's C5.0 algorithm
# Predictors are every column of `train` except column 11; the outcome is
# train$absentgroup. trials = 10 requests ten boosting iterations, which the
# summary below lists as Trial 0 to Trial 9.
c50.fit <- C5.0(x = train[-11], y = train$absentgroup, trials = 10)
print(summary(c50.fit))

## 
## Call:
## C5.0.default(x = train[-11], y = train$absentgroup, trials = 10)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Thu Feb 14 19:35:53 2019
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 556 cases (11 attributes) from undefined.data
## 
## -----  Trial 0:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {1,3,5,6,10,15,17,18,21,22,24}: moderate (102/20)
## Reason.for.absence in {2,9}: high (4/1)
## Reason.for.absence in {4,16,23,25,27,28}: low (292/21)
## Reason.for.absence = 8:
## :...Hit.target <= 95: moderate (2)
## :   Hit.target > 95: low (3)
## Reason.for.absence = 12:
## :...Hit.target <= 94: low (2)
## :   Hit.target > 94: high (3)
## Reason.for.absence = 26:
## :...Age <= 43: moderate (28/6)
## :   Age > 43: low (2)
## Reason.for.absence = 7:
## :...Service.time <= 9: moderate (7/2)
## :   Service.time > 9:
## :   :...Transportation.expense <= 260: low (2)
## :       Transportation.expense > 260: high (2)
## Reason.for.absence = 14:
## :...Day.of.the.week = 2: high (2/1)
## :   Day.of.the.week in {4,5}: low (5/1)
## :   Day.of.the.week = 3:
## :   :...Transportation.expense <= 260: low (2)
## :   :   Transportation.expense > 260: moderate (2)
## :   Day.of.the.week = 6:
## :   :...Transportation.expense <= 246: moderate (2)
## :       Transportation.expense > 246: low (2)
## Reason.for.absence = 11:
## :...Transportation.expense <= 157: high (2)
## :   Transportation.expense > 157:
## :   :...Work.load.Average.day <= 284.853: low (10/2)
## :       Work.load.Average.day > 284.853:
## :       :...Work.load.Average.day <= 343.253: moderate (7/2)
## :           Work.load.Average.day > 343.253: low (3/1)
## Reason.for.absence = 13:
## :...Body.mass.index <= 21: low (7/1)
## :   Body.mass.index > 21:
## :   :...Day.of.the.week in {3,6}: moderate (10/3)
## :       Day.of.the.week = 2:
## :       :...Service.time <= 13: high (2)
## :       :   Service.time > 13: low (2/1)
## :       Day.of.the.week = 4:
## :       :...Hit.target <= 95: moderate (6/2)
## :       :   Hit.target > 95: high (4/1)
## :       Day.of.the.week = 5:
## :       :...Distance.from.Residence.to.Work <= 27: low (4)
## :           Distance.from.Residence.to.Work > 27: moderate (3/1)
## Reason.for.absence = 19:
## :...Height > 175: low (7/2)
##     Height <= 175:
##     :...Transportation.expense <= 118: high (3/1)
##         Transportation.expense > 118:
##         :...Hit.target <= 88: high (2)
##             Hit.target > 88: moderate (20/5)
## 
## -----  Trial 1:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {5,8,16,21,23,25,27,28}:
## :...Transportation.expense <= 291: low (260.4/41.2)
## :   Transportation.expense > 291: moderate (15/5.5)
## Reason.for.absence in {1,2,3,4,6,7,9,10,11,12,13,14,15,17,18,19,22,24,26}:
## :...Transportation.expense <= 235:
##     :...Day.of.the.week = 5: moderate (20.5/10.3)
##     :   Day.of.the.week = 6: low (18.2/7.1)
##     :   Day.of.the.week = 3:
##     :   :...Body.mass.index <= 28: high (22.1/10.3)
##     :   :   Body.mass.index > 28: low (12.6/6.3)
##     :   Day.of.the.week = 4:
##     :   :...Hit.target <= 88: moderate (5.5/2.4)
##     :   :   Hit.target > 88: low (29.3/11.1)
##     :   Day.of.the.week = 2:
##     :   :...Transportation.expense > 184: moderate (21.3/7.9)
##     :       Transportation.expense <= 184:
##     :       :...Hit.target <= 94: moderate (6.3/1.6)
##     :           Hit.target > 94: high (18.2/8.7)
##     Transportation.expense > 235:
##     :...Age > 41: low (9.5/3.9)
##         Age <= 41:
##         :...Age > 34: moderate (50.6/19)
##             Age <= 34:
##             :...Reason.for.absence in {1,3,4,6,9,10,11,12,14,15,17,18,22,24,
##                 :                      26}: moderate (43.4/11.9)
##                 Reason.for.absence in {2,7,13,19}: high (22.9/8.7)
## 
## -----  Trial 2:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {16,23,27,28}: low (213.9/46.4)
## Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,
## :                      25,26}:
## :...Work.load.Average.day > 313.532:
##     :...Body.mass.index <= 28:
##     :   :...Transportation.expense <= 279: high (35.6/15.2)
##     :   :   Transportation.expense > 279: moderate (4)
##     :   Body.mass.index > 28:
##     :   :...Service.time <= 13: moderate (8.6/4)
##     :       Service.time > 13: low (21/11.7)
##     Work.load.Average.day <= 313.532:
##     :...Transportation.expense > 233:
##         :...Reason.for.absence in {1,3,4,5,6,8,11,12,13,14,15,17,18,21,22,24,
##         :   :                      25,26}: moderate (99.4/33.5)
##         :   Reason.for.absence in {2,7,9,10,19}:
##         :   :...Body.mass.index <= 24: high (11.9/2)
##         :       Body.mass.index > 24: moderate (20.4/7.4)
##         Transportation.expense <= 233:
##         :...Day.of.the.week = 3:
##             :...Body.mass.index > 29: low (6/1.3)
##             :   Body.mass.index <= 29:
##             :   :...Service.time <= 11: moderate (12.9/2)
##             :       Service.time > 11: high (13.3/4.5)
##             Day.of.the.week in {2,4,5,6}:
##             :...Height <= 170:
##                 :...Transportation.expense > 189: low (20.1/3.3)
##                 :   Transportation.expense <= 189:
##                 :   :...Work.load.Average.day <= 222.196: low (4.2)
##                 :       Work.load.Average.day > 222.196: moderate (24/8.7)
##                 Height > 170:
##                 :...Distance.from.Residence.to.Work <= 11: low (17.3/5.8)
##                     Distance.from.Residence.to.Work > 11:
##                     :...Hit.target <= 88: high (4.9/2)
##                         Hit.target > 88:
##                         :...Distance.from.Residence.to.Work <= 12: high (5.9/2)
##                             Distance.from.Residence.to.Work > 12: low (32.5/10.1)
## 
## -----  Trial 3:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {16,23,25,27,28}:
## :...Transportation.expense > 291: moderate (17.1/7.8)
## :   Transportation.expense <= 291:
## :   :...Height <= 167: moderate (19.3/9.1)
## :       Height > 167:
## :       :...Work.load.Average.day > 268.519: low (48.5)
## :           Work.load.Average.day <= 268.519:
## :           :...Transportation.expense > 118: low (115.8/26.2)
## :               Transportation.expense <= 118:
## :               :...Age <= 48: moderate (20/7.4)
## :                   Age > 48: low (5.7)
## Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,
## :                      26}:
## :...Work.load.Average.day > 284.853:
##     :...Transportation.expense <= 248:
##     :   :...Distance.from.Residence.to.Work <= 11: low (8.7/4)
##     :   :   Distance.from.Residence.to.Work > 11: moderate (62.4/28)
##     :   Transportation.expense > 248:
##     :   :...Transportation.expense > 300: moderate (5)
##     :       Transportation.expense <= 300:
##     :       :...Hit.target <= 94: high (7.8/1.1)
##     :           Hit.target > 94:
##     :           :...Day.of.the.week in {2,5,6}: high (15.2/5.3)
##     :               Day.of.the.week in {3,4}: moderate (7.6)
##     Work.load.Average.day <= 284.853:
##     :...Distance.from.Residence.to.Work <= 11:
##         :...Hit.target <= 88: low (4.5)
##         :   Hit.target > 88: moderate (32.3/9.2)
##         Distance.from.Residence.to.Work > 11:
##         :...Distance.from.Residence.to.Work <= 12: high (8.6/1.8)
##             Distance.from.Residence.to.Work > 12:
##             :...Hit.target <= 88: moderate (10/3.6)
##                 Hit.target > 88:
##                 :...Height <= 168: moderate (28.9/10.8)
##                     Height > 168:
##                     :...Age > 40:
##                         :...Height <= 171: moderate (11.2/4.9)
##                         :   Height > 171: low (20.1/8.7)
##                         Age <= 40:
##                         :...Day.of.the.week in {3,4}: moderate (38.6/19.1)
##                             Day.of.the.week in {5,6}: low (33.6/13.6)
##                             Day.of.the.week = 2:
##                             :...Height <= 171: low (22/5.2)
##                                 Height > 171: high (13/6.8)
## 
## -----  Trial 4:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {1,2,3,6,7,9,10,11,12,13,14,15,17,18,19,22,24}:
## :...Reason.for.absence in {1,3,6,7,10,15,17,18,22,24}: moderate (120.6/46.2)
## :   Reason.for.absence in {2,9,12}: high (13.4/4)
## :   Reason.for.absence = 14: low (17.5/9.1)
## :   Reason.for.absence = 11:
## :   :...Transportation.expense <= 157: high (2.7)
## :   :   Transportation.expense > 157:
## :   :   :...Work.load.Average.day <= 284.853: low (14.9/3)
## :   :       Work.load.Average.day > 284.853: moderate (14/6.9)
## :   Reason.for.absence = 13:
## :   :...Body.mass.index <= 21: low (8/2.2)
## :   :   Body.mass.index > 21:
## :   :   :...Age > 43: low (4.1/1.6)
## :   :       Age <= 43:
## :   :       :...Work.load.Average.day <= 313.532: moderate (19.1/7.6)
## :   :           Work.load.Average.day > 313.532: high (23.3/7.7)
## :   Reason.for.absence = 19:
## :   :...Height > 175: high (9.2/3.9)
## :       Height <= 175:
## :       :...Transportation.expense <= 118: high (4.6/2.1)
## :           Transportation.expense > 118:
## :           :...Age <= 30: moderate (5.6)
## :               Age > 30: high (25.2/10.4)
## Reason.for.absence in {4,5,8,16,21,23,25,26,27,28}:
## :...Day.of.the.week in {3,4,6}:
##     :...Reason.for.absence in {4,16,21,27}: low (24.3)
##     :   Reason.for.absence in {5,8,23,25,26,28}:
##     :   :...Body.mass.index <= 24: low (41.7/6.8)
##     :       Body.mass.index > 24:
##     :       :...Height > 175: low (18.4/2.7)
##     :           Height <= 175:
##     :           :...Day.of.the.week = 3: low (30.5/10.1)
##     :               Day.of.the.week = 4:
##     :               :...Service.time <= 12: low (3.9)
##     :               :   Service.time > 12: moderate (21.6/6.8)
##     :               Day.of.the.week = 6:
##     :               :...Reason.for.absence in {5,25,26,28}: moderate (20.5/5)
##     :                   Reason.for.absence in {8,23}: low (7.3)
##     Day.of.the.week in {2,5}:
##     :...Distance.from.Residence.to.Work <= 14: low (23/3.9)
##         Distance.from.Residence.to.Work > 14:
##         :...Height <= 167: low (13.7/4.4)
##             Height > 167:
##             :...Transportation.expense > 330: low (3/1.5)
##                 Transportation.expense <= 330:
##                 :...Reason.for.absence in {25,27}: low (15.9)
##                     Reason.for.absence in {4,5,8,16,21,26}: moderate (12.5/4.1)
##                     Reason.for.absence in {23,28}:
##                     :...Work.load.Average.day > 265.017: low (7.8)
##                         Work.load.Average.day <= 265.017:
##                         :...Work.load.Average.day <= 239.554: low (4.4)
##                             Work.load.Average.day > 239.554: high (25.3/10.2)
## 
## -----  Trial 5:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {4,5,8,16,21,23,25,27,28}:
## :...Body.mass.index <= 19: low (14.1/3.2)
## :   Body.mass.index > 19:
## :   :...Reason.for.absence in {4,8,16,27,28}: low (94/21.4)
## :       Reason.for.absence in {5,21}: moderate (8.7/3.1)
## :       Reason.for.absence = 25:
## :       :...Work.load.Average.day <= 241.476: moderate (13.1/5.2)
## :       :   Work.load.Average.day > 241.476: low (11.2)
## :       Reason.for.absence = 23:
## :       :...Service.time <= 4: low (9.7/5.9)
## :           Service.time > 4:
## :           :...Hit.target <= 98: low (60.6/13.6)
## :               Hit.target > 98: moderate (17.1/4.6)
## Reason.for.absence in {1,2,3,6,7,9,10,11,12,13,14,15,17,18,19,22,24,26}:
## :...Reason.for.absence in {1,3,6,14,15,17,18,22,24}: moderate (95.1/42.3)
##     Reason.for.absence in {2,9,12}: high (13.5/5.5)
##     Reason.for.absence = 7: low (16.3/8.6)
##     Reason.for.absence = 10:
##     :...Body.mass.index <= 23: high (10.1/3.4)
##     :   Body.mass.index > 23: moderate (19/9.1)
##     Reason.for.absence = 11:
##     :...Hit.target <= 93: low (6.7/0.4)
##     :   Hit.target > 93:
##     :   :...Body.mass.index <= 22: high (5.8/1.3)
##     :       Body.mass.index > 22: moderate (18.8/9.2)
##     Reason.for.absence = 19:
##     :...Height > 175: low (10.8/4.5)
##     :   Height <= 175:
##     :   :...Transportation.expense <= 118: low (4.8/2.2)
##     :       Transportation.expense > 118: moderate (33.1/12.6)
##     Reason.for.absence = 13:
##     :...Body.mass.index <= 21: low (7.6/2.7)
##     :   Body.mass.index > 21:
##     :   :...Transportation.expense <= 118: low (4.8/2)
##     :       Transportation.expense > 118:
##     :       :...Age <= 36: moderate (17.5/8.1)
##     :           Age > 36:
##     :           :...Transportation.expense > 246: low (2.8)
##     :               Transportation.expense <= 246:
##     :               :...Work.load.Average.day <= 308.593: moderate (8.5/2.7)
##     :                   Work.load.Average.day > 308.593: high (14.1/4.6)
##     Reason.for.absence = 26:
##     :...Work.load.Average.day > 330.061: high (2.6)
##         Work.load.Average.day <= 330.061:
##         :...Distance.from.Residence.to.Work > 42: moderate (2.8)
##             Distance.from.Residence.to.Work <= 42:
##             :...Age > 43: low (2.6)
##                 Age <= 43:
##                 :...Work.load.Average.day > 275.312: moderate (5)
##                     Work.load.Average.day <= 275.312:
##                     :...Work.load.Average.day <= 237.656: moderate (3.1)
##                         Work.load.Average.day > 237.656: low (22.2/8)
## 
## -----  Trial 6:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {4,5,8,16,21,23,25,27,28}:
## :...Transportation.expense <= 291: low (199.2/55.5)
## :   Transportation.expense > 291: moderate (18.4/8.1)
## Reason.for.absence in {1,2,3,6,7,9,10,11,12,13,14,15,17,18,19,22,24,26}:
## :...Reason.for.absence in {1,3,7,9,15,17,18,24}: moderate (60.3/28.9)
##     Reason.for.absence in {2,6,19}: high (59.8/31.5)
##     Reason.for.absence in {11,12}: low (39.6/21.3)
##     Reason.for.absence = 14:
##     :...Body.mass.index <= 25: moderate (14.1/8.1)
##     :   Body.mass.index > 25: low (9/0.9)
##     Reason.for.absence = 26:
##     :...Work.load.Average.day <= 330.061: moderate (36.6/14.7)
##     :   Work.load.Average.day > 330.061: high (2.3)
##     Reason.for.absence = 10:
##     :...Hit.target <= 91: low (6.9/1.5)
##     :   Hit.target > 91:
##     :   :...Work.load.Average.day <= 222.196: low (3.4/0.4)
##     :       Work.load.Average.day > 222.196: moderate (20.2/8.3)
##     Reason.for.absence = 22:
##     :...Height <= 167: high (3.8/0.7)
##     :   Height > 167:
##     :   :...Hit.target <= 95: moderate (6.6)
##     :       Hit.target > 95: low (19.3/8.5)
##     Reason.for.absence = 13:
##     :...Body.mass.index <= 21: low (7.4/3.1)
##         Body.mass.index > 21:
##         :...Hit.target > 99: low (2.4)
##             Hit.target <= 99:
##             :...Transportation.expense <= 118: moderate (4.9/2.4)
##                 Transportation.expense > 118: high (40.8/20.8)
## 
## -----  Trial 7:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {16,23,25,27,28}:
## :...Distance.from.Residence.to.Work > 42: low (41.2/1.6)
## :   Distance.from.Residence.to.Work <= 42:
## :   :...Hit.target <= 97: low (78.8/13.2)
## :       Hit.target > 97:
## :       :...Height > 175: low (11.9/2.9)
## :           Height <= 175:
## :           :...Height > 172: moderate (4.7)
## :               Height <= 172:
## :               :...Distance.from.Residence.to.Work <= 27: low (17/3.4)
## :                   Distance.from.Residence.to.Work > 27: moderate (12.4/2.2)
## Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,
## :                      26}:
## :...Work.load.Average.day > 284.853:
##     :...Body.mass.index > 27:
##     :   :...Transportation.expense <= 268: low (35.6/17.4)
##     :   :   Transportation.expense > 268: moderate (17.8/7.8)
##     :   Body.mass.index <= 27:
##     :   :...Day.of.the.week in {3,5,6}: moderate (33.7/11)
##     :       Day.of.the.week = 4: high (10.8/3.9)
##     :       Day.of.the.week = 2:
##     :       :...Transportation.expense <= 184: high (8.8/1)
##     :           Transportation.expense > 184: moderate (14.3/5)
##     Work.load.Average.day <= 284.853:
##     :...Service.time <= 10:
##         :...Work.load.Average.day <= 230.29: moderate (9.3/4)
##         :   Work.load.Average.day > 230.29:
##         :   :...Reason.for.absence in {6,7,10,11,12,13,14}: low (46/14.2)
##         :       Reason.for.absence in {1,2,3,4,5,8,9,15,17,18,19,21,22,24,
##         :                              26}: moderate (45.2/18.2)
##         Service.time > 10:
##         :...Service.time <= 11: moderate (13.8/2.7)
##             Service.time > 11:
##             :...Distance.from.Residence.to.Work > 42: low (16/6.5)
##                 Distance.from.Residence.to.Work <= 42:
##                 :...Height > 172:
##                     :...Age <= 37: high (11.8/1.5)
##                     :   Age > 37: low (15.8/4.9)
##                     Height <= 172:
##                     :...Reason.for.absence in {1,2,3,4,5,8,9,11,13,14,15,17,18,
##                         :                      21,22,24,
##                         :                      26}: moderate (69.1/33.8)
##                         Reason.for.absence in {6,7,10,12,19}:
##                         :...Work.load.Average.day <= 265.017: high (23.4/6.7)
##                             Work.load.Average.day > 265.017: moderate (8.7/2.1)
## 
## -----  Trial 8:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {16,23,25,27,28}:
## :...Transportation.expense <= 291: low (124.8/11.9)
## :   Transportation.expense > 291: moderate (13.5/5.3)
## Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,
## :                      26}:
## :...Transportation.expense <= 248:
##     :...Reason.for.absence in {1,2,3,4,5,6,10,15,17,18,21,22,
##     :   :                      24}: moderate (90.8/40.6)
##     :   Reason.for.absence in {7,8,14}: low (31.4/11.3)
##     :   Reason.for.absence in {9,12}: high (14.6/5.6)
##     :   Reason.for.absence = 11:
##     :   :...Body.mass.index <= 27: high (17.1/6.8)
##     :   :   Body.mass.index > 27: moderate (13.5/6.6)
##     :   Reason.for.absence = 19:
##     :   :...Height <= 171: moderate (14.2/2.3)
##     :   :   Height > 171: low (19.2/7)
##     :   Reason.for.absence = 26:
##     :   :...Body.mass.index <= 33: low (16.6/5.6)
##     :   :   Body.mass.index > 33: moderate (6.9)
##     :   Reason.for.absence = 13:
##     :   :...Service.time <= 11: low (12.5/3.2)
##     :       Service.time > 11:
##     :       :...Height <= 165: low (2.4)
##     :           Height > 165:
##     :           :...Work.load.Average.day <= 308.593: moderate (12.8/2.5)
##     :               Work.load.Average.day > 308.593: high (20.7/10.2)
##     Transportation.expense > 248:
##     :...Height > 172:
##         :...Service.time <= 12: moderate (24.1/7.4)
##         :   Service.time > 12: low (3)
##         Height <= 172:
##         :...Day.of.the.week = 3: moderate (9.4)
##             Day.of.the.week = 5: high (20.8/8.2)
##             Day.of.the.week in {2,4,6}:
##             :...Hit.target <= 93:
##                 :...Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
##                 :   :                      17,19,21}: high (20.2/2.5)
##                 :   Reason.for.absence in {18,22,24,26}: moderate (7)
##                 Hit.target > 93:
##                 :...Reason.for.absence in {1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
##                     :                      17,18,21,22,24}: moderate (22.3)
##                     Reason.for.absence in {19,26}: high (16.3/5.4)
## 
## -----  Trial 9:  -----
## 
## Decision tree:
## 
## Reason.for.absence in {1,3,5,6,8,9,10,15,17,18,22,24}: moderate (111.9/37.4)
## Reason.for.absence = 2: high (3.3)
## Reason.for.absence in {4,12,16,21,23,25,27,28}: low (137.2/14.1)
## Reason.for.absence = 7:
## :...Service.time <= 11: moderate (17.9/5.4)
## :   Service.time > 11: high (6.1/0.9)
## Reason.for.absence = 14:
## :...Body.mass.index <= 25: high (16.8/9.5)
## :   Body.mass.index > 25: low (9.1/0.9)
## Reason.for.absence = 11:
## :...Transportation.expense <= 157: high (3.5)
## :   Transportation.expense > 157:
## :   :...Work.load.Average.day <= 265.017: low (21/1.6)
## :       Work.load.Average.day > 265.017: moderate (27.1/12.7)
## Reason.for.absence = 19:
## :...Height > 175: low (11.2/2.7)
## :   Height <= 175:
## :   :...Transportation.expense <= 118: high (9.3/2.1)
## :       Transportation.expense > 118: moderate (43.7/16.7)
## Reason.for.absence = 26:
## :...Work.load.Average.day > 330.061: high (2.6)
## :   Work.load.Average.day <= 330.061:
## :   :...Age <= 43: moderate (35.1/6.3)
## :       Age > 43: low (3.3)
## Reason.for.absence = 13:
## :...Body.mass.index <= 21: low (4.7)
##     Body.mass.index > 21:
##     :...Transportation.expense <= 118: low (3.8)
##         Transportation.expense > 118:
##         :...Hit.target > 99: low (3.3)
##             Hit.target <= 99:
##             :...Age > 41: low (4.2/0.8)
##                 Age <= 41:
##                 :...Work.load.Average.day <= 222.196: high (5.2)
##                     Work.load.Average.day > 222.196:
##                     :...Work.load.Average.day <= 313.532: moderate (15.4/1.8)
##                         Work.load.Average.day > 313.532: high (24.3/10)
## 
## 
## Evaluation on training data (556 cases):
## 
## Trial        Decision Tree   
## -----      ----------------  
##    Size      Errors  
## 
##    0     34   74(13.3%)
##    1     15  121(21.8%)
##    2     18  116(20.9%)
##    3     23  134(24.1%)
##    4     30  121(21.8%)
##    5     31  112(20.1%)
##    6     19  137(24.6%)
##    7     22  104(18.7%)
##    8     23   98(17.6%)
##    9     23   91(16.4%)
## boost             54( 9.7%)   <<
## 
## 
##     (a)   (b)   (c)    <-classified as
##    ----  ----  ----
##      26     5    16    (a): class high
##       2   322    12    (b): class low
##       2    17   154    (c): class moderate
## 
## 
##  Attribute usage:
## 
##  100.00% Reason.for.absence
##  100.00% Transportation.expense
##   98.74% Day.of.the.week
##   95.68% Work.load.Average.day
##   95.50% Distance.from.Residence.to.Work
##   95.14% Height
##   88.13% Body.mass.index
##   83.27% Hit.target
##   61.69% Service.time
##   42.99% Age
## 
## 
## Time: 0.0 secs

# Draw the fitted c50.fit model, then predict on `test` and report Accuracy/Kappa.
plot(c50.fit)

c50.fit.pred <- predict(c50.fit, newdata = test)

print(postResample(obs = test.group, pred = c50.fit.pred))

##  Accuracy     Kappa 
## 0.7071429 0.4427184

# Cross-tabulate the c50.fit predictions against the reference labels in test.group.
confusionMatrix(data = c50.fit.pred, reference = test.group)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low moderate
##   high        5   1        2
##   low         5  70       17
##   moderate    6  10       24
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7071          
##                  95% CI : (0.6243, 0.7809)
##     No Information Rate : 0.5786          
##     P-Value [Acc > NIR] : 0.001151        
##                                           
##                   Kappa : 0.4427          
##  Mcnemar's Test P-Value : 0.090396        
## 
## Statistics by Class:
## 
##                      Class: high Class: low Class: moderate
## Sensitivity              0.31250     0.8642          0.5581
## Specificity              0.97581     0.6271          0.8351
## Pos Pred Value           0.62500     0.7609          0.6000
## Neg Pred Value           0.91667     0.7708          0.8100
## Prevalence               0.11429     0.5786          0.3071
## Detection Rate           0.03571     0.5000          0.1714
## Detection Prevalence     0.05714     0.6571          0.2857
## Balanced Accuracy        0.64415     0.7457          0.6966

## Recursive Partitioning and Regression Trees (RPART)

# Fit an rpart classification tree on `train`, predict classes for `test`,
# and report Accuracy/Kappa against test.group.
m2 <- rpart(formula = absentgroup ~ ., data = train, method = "class")
m2.pred <- predict(m2, newdata = test, type = "class")
print(postResample(obs = test.group, pred = m2.pred))

##  Accuracy     Kappa 
## 0.6785714 0.3765463

# Cross-tabulate the rpart predictions against the reference labels in test.group.
confusionMatrix(data = m2.pred, reference = test.group)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low moderate
##   high        4   0        1
##   low         6  69       20
##   moderate    6  12       22
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6786          
##                  95% CI : (0.5945, 0.7549)
##     No Information Rate : 0.5786          
##     P-Value [Acc > NIR] : 0.009715        
##                                           
##                   Kappa : 0.3765          
##  Mcnemar's Test P-Value : 0.009005        
## 
## Statistics by Class:
## 
##                      Class: high Class: low Class: moderate
## Sensitivity              0.25000     0.8519          0.5116
## Specificity              0.99194     0.5593          0.8144
## Pos Pred Value           0.80000     0.7263          0.5500
## Neg Pred Value           0.91111     0.7333          0.7900
## Prevalence               0.11429     0.5786          0.3071
## Detection Rate           0.02857     0.4929          0.1571
## Detection Prevalence     0.03571     0.6786          0.2857
## Balanced Accuracy        0.62097     0.7056          0.6630

# Base-graphics drawing of the rpart tree, with split labels added by text().
plot(m2)
text(m2, cex = 0.8, pretty = 0)

# Same tree drawn with rpart.plot's prp(); per the rpart.plot documentation,
# varlen controls variable-name abbreviation and extra selects the node annotation.
prp(m2, extra = 2, varlen = 4)

## Support Vector Machine

# Train a kernlab SVM on `train` using the linear ("vanilladot") kernel.
absent_classifier <- ksvm(
  absentgroup ~ .,
  data = train,
  kernel = "vanilladot"
)

##  Setting default kernel parameters

# Display the fitted SVM summary.
print(absent_classifier)

## Support Vector Machine object of class "ksvm" 
## 
## SV type: C-svc  (classification) 
##  parameter : cost C = 1 
## 
## Linear (vanilla) kernel function. 
## 
## Number of Support Vectors : 307 
## 
## Objective Function Value : -77.7838 -88.6522 -159.4792 
## Training error : 0.208633

# Evaluate the SVM: predict on `test` and cross-tabulate predictions against test.group.

absent_predictions <- predict(absent_classifier, newdata = test)
table(absent_predictions, test.group)

##                   test.group
## absent_predictions high low moderate
##           high        2   1        0
##           low         1  67       15
##           moderate   13  13       28

# Tally how many SVM predictions match (TRUE) or miss (FALSE) the labels in test.group.

agreement <- test.group == absent_predictions
table(agreement)

## agreement
## FALSE  TRUE 
##    43    97

# Accuracy/Kappa of the SVM predictions against test.group.
print(postResample(obs = test.group, pred = absent_predictions))

##  Accuracy     Kappa 
## 0.6928571 0.4270486

############## Random Forest is our Best Performer ##################

############### Final Prediction on entire data set ##################

# Stack both partitions back together and predict a class for every row with rf.fit5.
# NOTE(review): finalData contains the `train` rows; if rf.fit5 was fit on `train`,
# metrics computed on finalData will be optimistic -- confirm before reporting them.
finalData <- rbind(train, test)

final_fit <- predict(rf.fit5, newdata = finalData, type = "class")

# Count of rows assigned to each predicted class.
summary(final_fit)

##     high      low moderate 
##       48      430      218

#table(final_fit, modeldata$absentgroup)

# Accuracy/Kappa of the rf.fit5 predictions over all rows of finalData.
print(postResample(obs = finalData[["absentgroup"]], pred = final_fit))

##  Accuracy     Kappa 
## 0.9324713 0.8717146

# Cross-tabulate the rf.fit5 predictions against the recorded absentgroup labels.
confusionMatrix(data = final_fit, reference = finalData[["absentgroup"]])

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction high low moderate
##   high       47   0        1
##   low         5 406       19
##   moderate   11  11      196
## 
## Overall Statistics
##                                         
##                Accuracy : 0.9325        
##                  95% CI : (0.9112, 0.95)
##     No Information Rate : 0.5991        
##     P-Value [Acc > NIR] : < 2.2e-16     
##                                         
##                   Kappa : 0.8717        
##  Mcnemar's Test P-Value : 0.001458      
## 
## Statistics by Class:
## 
##                      Class: high Class: low Class: moderate
## Sensitivity              0.74603     0.9736          0.9074
## Specificity              0.99842     0.9140          0.9542
## Pos Pred Value           0.97917     0.9442          0.8991
## Neg Pred Value           0.97531     0.9586          0.9582
## Prevalence               0.09052     0.5991          0.3103
## Detection Rate           0.06753     0.5833          0.2816
## Detection Prevalence     0.06897     0.6178          0.3132
## Balanced Accuracy        0.87223     0.9438          0.9308

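### Best performer is Random Forest with 5 splits. On the entire data set, the random forest's prediction accuracy is 93%. A Kappa of 0.87 indicates almost perfect agreement between the predicted and actual absenteeism groups. Note: the entire data set is train + test combined, so if the model was fit on the training rows these figures are optimistic relative to the hold-out results.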


################################## End of Project: Absenteeism at Work #####################################

Prediction of Absenteeism at Workplace

Team7-Himangshu, Junaid, Sankshiptha

February 6, 2019

Data Exploration and Preprocessing

Model building using Machine Learning Algorithms.