## Libraries required for this project
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.3
library(corrplot)
## corrplot 0.84 loaded
library(factoextra)
## Warning: package 'factoextra' was built under R version 3.6.3
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(NbClust)
library(cluster)
library(purrr)
## Warning: package 'purrr' was built under R version 3.6.3
library(MASS)
## Warning: package 'MASS' was built under R version 3.6.3
library(gridExtra)
library(tree)
## Warning: package 'tree' was built under R version 3.6.3
library(Metrics)
## Warning: package 'Metrics' was built under R version 3.6.3
library(randomForest)
## Warning: package 'randomForest' was built under R version 3.6.3
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(C50)
## Warning: package 'C50' was built under R version 3.6.3
library(kernlab)
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:purrr':
##
## cross
## The following object is masked from 'package:ggplot2':
##
## alpha
library(caret)
## Warning: package 'caret' was built under R version 3.6.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 3.6.3
##
## Attaching package: 'caret'
## The following objects are masked from 'package:Metrics':
##
## precision, recall
## The following object is masked from 'package:purrr':
##
## lift
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 3.6.3
## Read the input file Absenteeism_at_work_train.csv from the working directory.
absentdata <- read.csv('Absenteeism_at_work_train.csv')
summary(absentdata)
## ID Reason.for.absence Month.of.absence Day.of.the.week
## Min. : 1.00 Min. : 0.00 Min. : 1.000 Min. :2.000
## 1st Qu.: 7.00 1st Qu.:13.00 1st Qu.: 3.000 1st Qu.:3.000
## Median :18.00 Median :23.00 Median : 7.000 Median :4.000
## Mean :17.67 Mean :19.47 Mean : 6.441 Mean :3.893
## 3rd Qu.:28.00 3rd Qu.:26.00 3rd Qu.:10.000 3rd Qu.:5.000
## Max. :36.00 Max. :28.00 Max. :12.000 Max. :6.000
##
## Seasons Transportation.expense Distance.from.Residence.to.Work
## Min. :1.000 Min. : 0.0 Min. : 5.00
## 1st Qu.:2.000 1st Qu.:179.0 1st Qu.:17.00
## Median :2.000 Median :225.0 Median :26.00
## Mean :2.553 Mean :222.8 Mean :30.37
## 3rd Qu.:4.000 3rd Qu.:260.0 3rd Qu.:50.00
## Max. :4.000 Max. :388.0 Max. :52.00
##
## Service.time Age Work.load.Average.day Hit.target
## Min. : 1.0 38 :112 222,196: 35 Min. : 81.00
## 1st Qu.: 9.0 28 :109 264,249: 33 1st Qu.: 92.00
## Median :13.0 37 : 67 343,253: 29 Median : 95.00
## Mean :12.7 40 : 50 265,017: 28 Mean : 94.41
## 3rd Qu.:16.0 33 : 48 284,853: 25 3rd Qu.: 97.00
## Max. :29.0 36 : 47 308,593: 24 Max. :100.00
## (Other):233 (Other):492 NA's :1
## Disciplinary.failure Education Son Social.drinker
## Min. :0.00000 Min. :1.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000
## Median :0.00000 Median :1.000 Median :1.000 Median :1.0000
## Mean :0.05405 Mean :1.246 Mean :1.029 Mean :0.5841
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :1.00000 Max. :4.000 Max. :4.000 Max. :1.0000
##
## Social.smoker Pet Weight Height
## Min. :0.00000 Min. :0.0000 Min. : 56.00 Min. :163.0
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 69.00 1st Qu.:169.0
## Median :0.00000 Median :0.0000 Median : 83.00 Median :170.0
## Mean :0.06907 Mean :0.6907 Mean : 79.21 Mean :171.9
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.: 89.00 3rd Qu.:172.0
## Max. :1.00000 Max. :8.0000 Max. :108.00 Max. :196.0
## NA's :2
## Body.mass.index Absenteeism.time.in.hours
## Min. :19.00 Min. : 0.000
## 1st Qu.:24.00 1st Qu.: 2.000
## Median :25.00 Median : 3.000
## Mean :26.82 Mean : 6.752
## 3rd Qu.:31.00 3rd Qu.: 8.000
## Max. :38.00 Max. :120.000
##
str(absentdata)
## 'data.frame': 666 obs. of 21 variables:
## $ ID : int 11 36 3 7 11 3 10 20 14 1 ...
## $ Reason.for.absence : int 26 0 23 7 23 23 22 23 19 22 ...
## $ Month.of.absence : int 7 7 7 7 7 7 7 7 7 7 ...
## $ Day.of.the.week : int 3 3 4 5 5 6 6 6 2 2 ...
## $ Seasons : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Transportation.expense : int 289 118 179 279 289 179 361 260 155 235 ...
## $ Distance.from.Residence.to.Work: int 36 13 51 5 36 51 52 50 12 11 ...
## $ Service.time : int 13 18 18 14 13 18 3 11 14 14 ...
## $ Age : Factor w/ 23 levels "0","27","28",..: 8 21 12 13 8 12 3 10 9 11 ...
## $ Work.load.Average.day : Factor w/ 36 levels "0","12","205,917",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ Hit.target : int 97 97 97 97 97 97 97 97 97 97 ...
## $ Disciplinary.failure : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Education : int 1 1 1 1 1 1 1 1 1 3 ...
## $ Son : int 2 1 0 2 2 0 1 4 2 1 ...
## $ Social.drinker : int 1 1 1 1 1 1 1 1 1 0 ...
## $ Social.smoker : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Pet : int 1 0 0 0 1 0 4 0 0 1 ...
## $ Weight : int 90 98 89 68 90 89 80 65 95 88 ...
## $ Height : int 172 178 170 168 172 170 172 168 196 172 ...
## $ Body.mass.index : int 30 31 31 24 30 31 27 23 25 29 ...
## $ Absenteeism.time.in.hours : int 4 0 2 4 2 2 8 4 40 8 ...
## Observations in which the reason code is greater than zero but the absenteeism time is zero, plus
## observations in which both the reason code and the absenteeism time are zero.
a <- subset(absentdata, Absenteeism.time.in.hours <= 0 & Reason.for.absence > 0, c(ID, Reason.for.absence, Absenteeism.time.in.hours))
b <- subset(absentdata, Absenteeism.time.in.hours <= 0 & Reason.for.absence == 0, c(ID, Reason.for.absence, Absenteeism.time.in.hours))
as.matrix(rbind(a,b))
## ID Reason.for.absence Absenteeism.time.in.hours
## 135 34 27 0
## 2 36 0 0
## 51 20 0 0
## 52 29 0 0
## 55 11 0 0
## 56 36 0 0
## 59 13 0 0
## 65 36 0 0
## 204 2 0 0
## 214 7 0 0
## 215 18 0 0
## 216 23 0 0
## 217 31 0 0
## 252 20 0 0
## 274 5 0 0
## 277 8 0 0
## 278 19 0 0
## 286 5 0 0
## 294 36 0 0
## 295 33 0 0
## 301 5 0 0
## 304 5 0 0
## 312 20 0 0
## 313 15 0 0
## 314 30 0 0
## 326 18 0 0
## 337 23 0 0
## 338 7 0 0
## 401 13 0 0
## 406 1 0 0
## 407 24 0 0
## 408 36 0 0
## 447 3 0 0
## 531 28 0 0
## 549 15 0 0
## 550 11 0 0
## 552 5 0 0
## Reason code 27 is the only case with a non-zero reason code but zero absenteeism time. We remove that observation.
absentdata = absentdata[!(absentdata$Absenteeism.time.in.hours==0 & absentdata$Reason.for.absence > 0) ,]
## At this point we have 665 observations and 21 attributes.
dim(absentdata)
## [1] 665 21
summary(absentdata)
## ID Reason.for.absence Month.of.absence Day.of.the.week
## Min. : 1.00 Min. : 0.00 Min. : 1.00 Min. :2.000
## 1st Qu.: 7.00 1st Qu.:13.00 1st Qu.: 3.00 1st Qu.:3.000
## Median :18.00 Median :23.00 Median : 7.00 Median :4.000
## Mean :17.65 Mean :19.46 Mean : 6.45 Mean :3.896
## 3rd Qu.:28.00 3rd Qu.:26.00 3rd Qu.:10.00 3rd Qu.:5.000
## Max. :36.00 Max. :28.00 Max. :12.00 Max. :6.000
##
## Seasons Transportation.expense Distance.from.Residence.to.Work
## Min. :1.000 Min. : 0 Min. : 5.0
## 1st Qu.:2.000 1st Qu.:179 1st Qu.:17.0
## Median :2.000 Median :225 Median :26.0
## Mean :2.553 Mean :223 Mean :30.4
## 3rd Qu.:4.000 3rd Qu.:260 3rd Qu.:50.0
## Max. :4.000 Max. :388 Max. :52.0
##
## Service.time Age Work.load.Average.day Hit.target
## Min. : 1.0 38 :112 222,196: 35 Min. : 81.00
## 1st Qu.: 9.0 28 :109 264,249: 33 1st Qu.: 92.00
## Median :13.0 37 : 66 343,253: 29 Median : 95.00
## Mean :12.7 40 : 50 265,017: 28 Mean : 94.41
## 3rd Qu.:16.0 33 : 48 284,853: 25 3rd Qu.: 97.00
## Max. :29.0 36 : 47 268,519: 23 Max. :100.00
## (Other):233 (Other):492 NA's :1
## Disciplinary.failure Education Son Social.drinker
## Min. :0.00000 Min. :1.000 Min. :0.00 Min. :0.000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.00 1st Qu.:0.000
## Median :0.00000 Median :1.000 Median :1.00 Median :1.000
## Mean :0.05414 Mean :1.247 Mean :1.03 Mean :0.585
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:2.00 3rd Qu.:1.000
## Max. :1.00000 Max. :4.000 Max. :4.00 Max. :1.000
##
## Social.smoker Pet Weight Height
## Min. :0.00000 Min. :0.0000 Min. : 56.00 Min. :163.0
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 69.00 1st Qu.:169.0
## Median :0.00000 Median :0.0000 Median : 83.00 Median :170.0
## Mean :0.06917 Mean :0.6917 Mean : 79.21 Mean :171.9
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.: 89.00 3rd Qu.:172.0
## Max. :1.00000 Max. :8.0000 Max. :108.00 Max. :196.0
## NA's :2
## Body.mass.index Absenteeism.time.in.hours
## Min. :19.00 Min. : 0.000
## 1st Qu.:24.00 1st Qu.: 2.000
## Median :25.00 Median : 3.000
## Mean :26.81 Mean : 6.762
## 3rd Qu.:31.00 3rd Qu.: 8.000
## Max. :38.00 Max. :120.000
##
## Disciplinary.failure is treated as noise: it is almost entirely zeros (about 5% ones), so it carries little information.
range(absentdata$Disciplinary.failure)
## [1] 0 1
## Remove the Disciplinary.failure attribute.
absentdata <- absentdata[,-12]
str(absentdata)
## 'data.frame': 665 obs. of 20 variables:
## $ ID : int 11 36 3 7 11 3 10 20 14 1 ...
## $ Reason.for.absence : int 26 0 23 7 23 23 22 23 19 22 ...
## $ Month.of.absence : int 7 7 7 7 7 7 7 7 7 7 ...
## $ Day.of.the.week : int 3 3 4 5 5 6 6 6 2 2 ...
## $ Seasons : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Transportation.expense : int 289 118 179 279 289 179 361 260 155 235 ...
## $ Distance.from.Residence.to.Work: int 36 13 51 5 36 51 52 50 12 11 ...
## $ Service.time : int 13 18 18 14 13 18 3 11 14 14 ...
## $ Age : Factor w/ 23 levels "0","27","28",..: 8 21 12 13 8 12 3 10 9 11 ...
## $ Work.load.Average.day : Factor w/ 36 levels "0","12","205,917",..: 8 8 8 8 8 8 8 8 8 8 ...
## $ Hit.target : int 97 97 97 97 97 97 97 97 97 97 ...
## $ Education : int 1 1 1 1 1 1 1 1 1 3 ...
## $ Son : int 2 1 0 2 2 0 1 4 2 1 ...
## $ Social.drinker : int 1 1 1 1 1 1 1 1 1 0 ...
## $ Social.smoker : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Pet : int 1 0 0 0 1 0 4 0 0 1 ...
## $ Weight : int 90 98 89 68 90 89 80 65 95 88 ...
## $ Height : int 172 178 170 168 172 170 172 168 196 172 ...
## $ Body.mass.index : int 30 31 31 24 30 31 27 23 25 29 ...
## $ Absenteeism.time.in.hours : int 4 0 2 4 2 2 8 4 40 8 ...
## Now we have 665 observations and 20 attributes.
## Missing value analysis
as.matrix(colSums(is.na(absentdata)))
## [,1]
## ID 0
## Reason.for.absence 0
## Month.of.absence 0
## Day.of.the.week 0
## Seasons 0
## Transportation.expense 0
## Distance.from.Residence.to.Work 0
## Service.time 0
## Age 0
## Work.load.Average.day 0
## Hit.target 1
## Education 0
## Son 0
## Social.drinker 0
## Social.smoker 0
## Pet 0
## Weight 2
## Height 0
## Body.mass.index 0
## Absenteeism.time.in.hours 0
# There are a few missing values: one in Hit.target and two in Weight.
library(dplyr)
## Warning: package 'dplyr' was built under R version 3.6.3
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
##
## combine
## The following object is masked from 'package:gridExtra':
##
## combine
## The following object is masked from 'package:MASS':
##
## select
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Exclude the missing observations
absentdata <- absentdata %>%
  na.omit()
dim(absentdata)
## [1] 662 20
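## If dropping rows were not desirable, median imputation would be an alternative. A minimal sketch (the
## affected rows have already been removed above, so running this here is a no-op):
absentdata$Hit.target[is.na(absentdata$Hit.target)] <- median(absentdata$Hit.target, na.rm = TRUE)
absentdata$Weight[is.na(absentdata$Weight)] <- median(absentdata$Weight, na.rm = TRUE)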
## Box plot of Absenteeism.time.in.hours by Reason.for.absence, to check for outliers within each reason code.
ggplot(absentdata,
       aes(y = Absenteeism.time.in.hours, x = as.factor(Reason.for.absence))) +
  geom_boxplot() +
  xlab('Reason.for.absence') +
  ylab('Absenteeism.time.in.hours')
## Let's look at the distributions of the continuous variables.
## Absenteeism time is highly right-skewed due to the presence of outliers.
hist(absentdata$Absenteeism.time.in.hours, breaks = 40, #prob = TRUE,
xlab = 'Absenteeism time in hours', main = " Absenteeism time Distribution", col = "grey")
boxplot(absentdata$Absenteeism.time.in.hours, main = "Box plot of Absenteeism time in hours")
#Outlier Analysis
#boxplot for Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target
boxplot(absentdata[,c('Transportation.expense','Distance.from.Residence.to.Work', 'Service.time', 'Age','Hit.target')], varwidth = T,
col = "dark grey")
#boxplot for Weight,Height,Body.mass.index,Absenteeism.time.in.hours
boxplot(absentdata[,c('Weight', 'Height', 'Body.mass.index','Absenteeism.time.in.hours')], col = "grey", varwidth = T)
#boxplot for Work.load.Average.day
# Note: Work.load.Average.day was read in as a factor, so as.numeric() yields its integer level codes
absentdata$Work.load.Average.day <- as.numeric(absentdata$Work.load.Average.day)
boxplot(absentdata[,c('Work.load.Average.day')], col = "grey")
## We do not delete the outliers; instead we cap them at the boxplot whisker limits.
## Capping outliers - values beyond Q1 - 1.5*IQR or Q3 + 1.5*IQR are replaced with those limits.
# Note: Age is also a factor at this point, so as.numeric() returns its integer level codes rather than the original ages
absentdata$Transportation.expense <- as.numeric(absentdata$Transportation.expense)
absentdata$Service.time <- as.numeric(absentdata$Service.time)
absentdata$Age <- as.numeric(absentdata$Age)
absentdata$Hit.target <- as.numeric(absentdata$Hit.target)
absentdata$Height <- as.numeric(absentdata$Height)
absentdata$Absenteeism.time.in.hours <- as.numeric(absentdata$Absenteeism.time.in.hours)
for (i in c('Transportation.expense','Service.time','Age','Work.load.Average.day','Hit.target','Height','Absenteeism.time.in.hours')){
q = quantile(absentdata[,i],c(0.25,0.75))
iqr1 = q[2]-q[1]
min1 = q[1]-1.5*iqr1
max1 = q[2]+1.5*iqr1
absentdata[,i][absentdata[,i]<min1] = min1
absentdata[,i][absentdata[,i]>max1] = max1
}
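## A quick sanity check (a minimal sketch reusing the same quantile logic): after capping, the treated
## columns should contain no values beyond the Q1 - 1.5*IQR / Q3 + 1.5*IQR whiskers (all entries TRUE).
capped_cols <- c('Transportation.expense','Service.time','Age','Work.load.Average.day',
                 'Hit.target','Height','Absenteeism.time.in.hours')
sapply(capped_cols, function(col) {
  q <- quantile(absentdata[, col], c(0.25, 0.75))
  iqr1 <- q[2] - q[1]
  all(absentdata[, col] >= q[1] - 1.5 * iqr1 & absentdata[, col] <= q[2] + 1.5 * iqr1)
})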
## Reason for absence vs absenteeism time after outlier capping
ggplot(absentdata,
       aes(y = Absenteeism.time.in.hours, x = as.factor(Reason.for.absence))) +
  geom_boxplot() +
  xlab('Reason.for.absence') +
  ylab('Absenteeism.time.in.hours')
#boxplot for Transportation.expense, Distance.from.Residence.to.Work, Service.time, Age, Hit.target
boxplot(absentdata[,c('Transportation.expense','Distance.from.Residence.to.Work', 'Service.time', 'Age','Hit.target')], varwidth = T,
col = "dark grey")
#boxplot for Weight,Height,Body.mass.index,Absenteeism.time.in.hours
boxplot(absentdata[,c('Weight', 'Height', 'Body.mass.index','Absenteeism.time.in.hours')], col = "grey", varwidth = T)
#boxplot for Work.load.Average.day
boxplot(absentdata[,c('Work.load.Average.day')], col = "grey")
## Data independence and multicollinearity checks.
## First, the categorical variables.
categorical_var = c("Reason.for.absence","Month.of.absence","Day.of.the.week",
"Seasons", "Education", "Social.drinker",
"Social.smoker", "Son", "Pet")
## Transform categorical variables into factors.
absentdata[,categorical_var ] <- lapply(absentdata[,categorical_var], factor)
#str(absentdata)
# Chi-square test for relationship between attributes.
pvalue = c()
#Calculating and storing the p-values from the chi-square tests in the vector pvalue
for(i in categorical_var){
for(j in categorical_var){
chi2 = chisq.test(absentdata[,i],absentdata[,j]) #, simulate.p.value = T)
pvalue = c(pvalue,chi2$p.value)
}
}
## Warning in chisq.test(absentdata[, i], absentdata[, j]): Chi-squared
## approximation may be incorrect
## (The same warning is raised for many attribute pairs because some contingency-table cells are sparse.)
length(pvalue)
## [1] 81
m1 <- matrix(pvalue, ncol = 9)
df <- data.frame(m1)
row.names(df) <- categorical_var
colnames(df) <- categorical_var
print(df)
## Reason.for.absence Month.of.absence Day.of.the.week
## Reason.for.absence 0.000000e+00 5.030070e-16 6.576851e-02
## Month.of.absence 5.030070e-16 0.000000e+00 3.455722e-01
## Day.of.the.week 6.576851e-02 3.455722e-01 0.000000e+00
## Seasons 5.214522e-21 1.245723e-317 1.656180e-01
## Education 6.534526e-09 3.901758e-01 5.162065e-01
## Social.drinker 5.270888e-08 3.251838e-03 2.821174e-01
## Social.smoker 5.803942e-09 2.075713e-02 7.591291e-01
## Son 4.854183e-18 1.186792e-04 6.611599e-08
## Pet 5.427211e-16 5.725378e-04 4.217188e-01
## Seasons Education Social.drinker Social.smoker
## Reason.for.absence 5.214522e-21 6.534526e-09 5.270888e-08 5.803942e-09
## Month.of.absence 1.245723e-317 3.901758e-01 3.251838e-03 2.075713e-02
## Day.of.the.week 1.656180e-01 5.162065e-01 2.821174e-01 7.591291e-01
## Seasons 0.000000e+00 5.460428e-01 2.494963e-02 1.559893e-01
## Education 5.460428e-01 0.000000e+00 6.103574e-32 2.400489e-20
## Social.drinker 2.494963e-02 6.103574e-32 4.319588e-145 8.081862e-03
## Social.smoker 1.559893e-01 2.400489e-20 8.081862e-03 1.211298e-142
## Son 6.721641e-06 1.778511e-07 5.257573e-09 1.017876e-17
## Pet 4.104513e-04 3.628847e-14 7.886952e-29 5.794183e-17
## Son Pet
## Reason.for.absence 4.854183e-18 5.427211e-16
## Month.of.absence 1.186792e-04 5.725378e-04
## Day.of.the.week 6.611599e-08 4.217188e-01
## Seasons 6.721641e-06 4.104513e-04
## Education 1.778511e-07 3.628847e-14
## Social.drinker 5.257573e-09 7.886952e-29
## Social.smoker 1.017876e-17 5.794183e-17
## Son 0.000000e+00 5.764606e-85
## Pet 5.764606e-85 0.000000e+00
## According to the chi-square tests, every categorical variable except Day.of.the.week is significantly associated with Reason.for.absence (p-values well below 0.05). We therefore drop those correlated categorical variables and keep only Reason.for.absence and Day.of.the.week.
absentdata <- absentdata[, -c(3, 5, 12,13,14, 15, 16)]
## Correlation matrix for the continuous attributes
m <- cor(absentdata[,4:13])
corrplot(m, order = "hclust", tl.srt = 30, tl.col = "black", addrect = 3, method = "number" )
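## As a programmatic cross-check, caret's findCorrelation() can flag predictor pairs with high pairwise
## correlation (a minimal sketch; the 0.75 cutoff is an illustrative choice):
high_corr <- findCorrelation(m, cutoff = 0.75)  # column indices of m with problematic correlations
colnames(m)[high_corr]                          # expected to point at the Weight / Body.mass.index pair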
## The correlations between Absenteeism.time.in.hours and the predictors are all below 0.1, but Weight and Body.mass.index are highly collinear, so Weight is removed from the data frame.
absentdata = absentdata[,-10]
## After data pre-processing we are left with 662 observations and 12 variables, including the target variable.
## Test for linearity in the data
pairs(absentdata[, -c(1:3)])
## Data is not linear. So, linear models will not be a good choice for this data.
######################## End of Data Preprocessing #########################
# Aggregating Absenteeism.time.in.hours by Reason.for.absence
Reasons = aggregate(absentdata$Absenteeism.time.in.hours, by=list(Category=absentdata$Reason.for.absence), FUN=sum)
#print(as.data.frame(Reasons))
Reasons$Absence = (Reasons$x/sum(absentdata$Absenteeism.time.in.hours))*100
Reasons = Reasons[order(Reasons$Absence, decreasing = T),]
#print(Reasons)
barplot(Reasons$Absence, names.arg = Reasons$Category, xlab = "Reason for absence", ylab = "Absence (%)", col = "dark grey",
        main = "Share of total absenteeism by reason code")
## Taking a backup of the pre-processed data
#write.csv(modeldata, "modeldata.csv", row.names = F)
Model building using Machine Learning Algorithms.
### We will see how many groups there are in the data set by means of k-means clustering.
modeldata = absentdata[,-c(1,2,3)]
df = scale(modeldata)
## NbClust method
## wssplot suggests a value of K via the elbow method, using the within-cluster sum of squares.
wssplot <- function(data, nc = 20, seed = 1234) {
  wss <- (nrow(data) - 1) * sum(apply(data, 2, var))
  for (i in 2:nc) {
    set.seed(seed)
    wss[i] <- sum(kmeans(data, centers = i)$withinss)
  }
  plot(1:nc, wss, type = "b", xlab = "Number of Clusters",
       ylab = "Within groups sum of squares")
}
wssplot(df)
set.seed(1234)
nc <- NbClust(df, min.nc = 3, max.nc = 20, method = "kmeans" )
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 7 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 3 proposed 5 as the best number of clusters
## * 5 proposed 7 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 1 proposed 17 as the best number of clusters
## * 1 proposed 18 as the best number of clusters
## * 1 proposed 19 as the best number of clusters
## * 3 proposed 20 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 7 proposed 3 as the best number of clusters
## * 1 proposed 4 as the best number of clusters
## * 4 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 2 proposed 8 as the best number of clusters
## * 1 proposed 13 as the best number of clusters
## * 2 proposed 14 as the best number of clusters
## * 2 proposed 17 as the best number of clusters
## * 1 proposed 19 as the best number of clusters
## * 2 proposed 20 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 3
##
##
## *******************************************************************
barplot(table(nc$Best.nc[1,]))
## According to the NbClust method, 3 is the optimal value of K.
set.seed(1234)
### Elbow method - K = 3 optimal value
fviz_nbclust(df, kmeans, method = "wss")
## Average Silhouette Method - k = 9 optimal value
set.seed(1234)
fviz_nbclust(df, kmeans, method = "silhouette")
## Comparison of k-values
set.seed(1234)
k3 <- kmeans(df, centers = 3, nstart = 25)
k4 <- kmeans(df, centers = 4, nstart = 25)
k7 <- kmeans(df, centers = 7, nstart = 25)
k9 <- kmeans(df, centers = 9, nstart = 25)
# plots to compare
p1 = fviz_cluster(k3, geom = "point", data = df) + ggtitle("k = 3")
p2 = fviz_cluster(k4, geom = "point", data = df) + ggtitle("k = 4")
p3 = fviz_cluster(k7, geom = "point", data = df) + ggtitle("k = 7")
p4 = fviz_cluster(k9, geom = "point", data = df) + ggtitle("k = 9")
grid.arrange(p1, p2, p3, p4, nrow = 2)
# Compute k-means clustering with k = 3
set.seed(1234)
final <- kmeans(df, 3, nstart = 25)
final$center
## Transportation.expense Distance.from.Residence.to.Work Service.time
## 1 0.5826762 0.1788144 -0.7447137
## 2 -0.3515032 0.8551632 0.9905166
## 3 -0.6607796 -1.0473691 0.3698462
## Age Work.load.Average.day Hit.target Height Body.mass.index
## 1 -0.7571524 0.07953133 -0.04578003 -0.05033513 -0.5905792
## 2 0.6596816 -0.25491756 0.01288722 -0.69154040 1.2463275
## 3 0.6806771 0.09129538 0.06483467 0.69019707 -0.1108426
## Absenteeism.time.in.hours
## 1 0.06533443
## 2 -0.24888579
## 3 0.10961637
fviz_cluster(final, data = df)
#print(final)
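## To gauge how well separated the three clusters are, we can look at the average silhouette width
## (a minimal sketch using the already-loaded cluster package):
sil <- silhouette(final$cluster, dist(df))  # silhouette width per observation
mean(sil[, "sil_width"])                    # near 1 = well separated, near 0 = overlapping clusters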
## Build the models in a supervised-learning fashion.
## The response variable is Absenteeism.time.in.hours. We will create 3 categories and build models to predict the class.
modeldata <- absentdata
temp_table = table(as.factor(modeldata$Absenteeism.time.in.hours))
barplot(temp_table, xlab = "Absenteeism in Hours", ylab = "Frequency", main = "Absenteeism frequency")
## The most common absence duration is 8 hours, i.e. full-day absenteeism is the most frequent pattern.
## According to the k-means clustering, the data comprise 3 groups. So we create three class labels for Absenteeism.time.in.hours: low when absenteeism is 1 to 4 hours, moderate when 5 to 8 hours, and high when greater than 8 hours (the remaining zero-hour observations, all with reason code 0, also fall into the high group).
## Two-level version (not used further; superseded by the three-level labels created in the loop below).
absentgroup <- ifelse((modeldata$Absenteeism.time.in.hours >= 1 & modeldata$Absenteeism.time.in.hours <= 4), "low", "high")
tempdata <- as.integer(as.character(modeldata$Absenteeism.time.in.hours))
for (i in 1:length(tempdata)) {
  if (tempdata[i] >= 1 & tempdata[i] <= 4) {
    modeldata$absentgroup[i] = "low"
  } else if (tempdata[i] > 4 & tempdata[i] <= 8) {
    modeldata$absentgroup[i] = "moderate"
  } else {
    modeldata$absentgroup[i] = "high"
  }
}
table(modeldata$absentgroup)
##
## high low moderate
## 93 378 191
modeldata$absentgroup = factor(modeldata$absentgroup)
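## The labeling loop above can also be written in vectorized form; a minimal sketch with nested ifelse()
## that should reproduce the same three labels:
absent_vec <- ifelse(tempdata >= 1 & tempdata <= 4, "low",
                     ifelse(tempdata > 4 & tempdata <= 8, "moderate", "high"))
table(absent_vec)  # should match the table of modeldata$absentgroup above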
## We use the validation-set approach for resampling: 80% of the observations for training and 20% for testing.
## Remove the Absenteeism.time.in.hours and ID attributes.
modeldata = modeldata[, -12]
modeldata = modeldata[,-1]
#smp_size <- floor(0.75 * nrow(modeldata))
## set the seed to make partition reproducible
set.seed(1234)
train_index = sample(1:nrow(modeldata), 0.8*nrow(modeldata))
train = modeldata[train_index,]
test = modeldata[-train_index,]
test.group <- test$absentgroup
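## Note that sample() ignores class balance. If a stratified split were preferred, caret's
## createDataPartition() keeps the class proportions; a minimal sketch (not used for the models below):
set.seed(1234)
strat_index <- createDataPartition(modeldata$absentgroup, p = 0.8, list = FALSE)
strat_train <- modeldata[strat_index, ]
strat_test  <- modeldata[-strat_index, ]
prop.table(table(strat_train$absentgroup))  # class shares stay close to those of the full data set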
## First model - a simple classification tree built with the tree() function from the "tree" package.
model_tree <- tree(absentgroup ~ . , data = train)
summary(model_tree)
##
## Classification tree:
## tree(formula = absentgroup ~ ., data = train)
## Variables actually used in tree construction:
## [1] "Reason.for.absence" "Transportation.expense" "Height"
## [4] "Day.of.the.week"
## Number of terminal nodes: 8
## Residual mean deviance: 0.9707 = 505.7 / 521
## Misclassification error rate: 0.172 = 91 / 529
plot(model_tree)
text(model_tree, pretty = 0, cex = 0.8)
model_tree_pred = predict(model_tree, test, type = "class")
#conf_matrix = table(model_tree_pred, test.group)
#model_tree_acu = sum(diag(conf_matrix))/sum(conf_matrix)
print(postResample(pred = model_tree_pred, obs = test.group))
## Accuracy Kappa
## 0.7067669 0.4438130
confusionMatrix(model_tree_pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 4 0 0
## low 9 67 14
## moderate 7 9 23
##
## Overall Statistics
##
## Accuracy : 0.7068
## 95% CI : (0.6216, 0.7825)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.0009020
##
## Kappa : 0.4438
##
## Mcnemar's Test P-Value : 0.0006782
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.20000 0.8816 0.6216
## Specificity 1.00000 0.5965 0.8333
## Pos Pred Value 1.00000 0.7444 0.5897
## Neg Pred Value 0.87597 0.7907 0.8511
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.03008 0.5038 0.1729
## Detection Prevalence 0.03008 0.6767 0.2932
## Balanced Accuracy 0.60000 0.7390 0.7275
## Random forest
set.seed(1234)
# default mtry = 3; OOB error rate 22.5%
rf.fit = randomForest(absentgroup~., data = train, importance = TRUE)
rf.fit
##
## Call:
## randomForest(formula = absentgroup ~ ., data = train, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 22.5%
## Confusion matrix:
## high low moderate class.error
## high 41 13 19 0.4383562
## low 5 268 29 0.1125828
## moderate 9 44 101 0.3441558
#rf.fit.pred <- predict(rf.fit, test, type = "class")
# Fine-tuning the random forest: mtry = 6; OOB error rate 23.44%
rf.fit1 <- randomForest(absentgroup ~ ., data = train, ntree = 500, mtry = 6, importance = TRUE)
rf.fit1
##
## Call:
## randomForest(formula = absentgroup ~ ., data = train, ntree = 500, mtry = 6, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 6
##
## OOB estimate of error rate: 23.44%
## Confusion matrix:
## high low moderate class.error
## high 40 11 22 0.4520548
## low 10 257 35 0.1490066
## moderate 10 36 108 0.2987013
rf.fit1.pred <- predict(rf.fit1, test, type = "class")
# Checking classification accuracy
print(postResample(pred = rf.fit1.pred, obs = test.group))
## Accuracy Kappa
## 0.7218045 0.5052780
confusionMatrix(rf.fit1.pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 8 1 4
## low 5 64 9
## moderate 7 11 24
##
## Overall Statistics
##
## Accuracy : 0.7218
## 95% CI : (0.6375, 0.796)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.0002423
##
## Kappa : 0.5053
##
## Mcnemar's Test P-Value : 0.2975673
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.40000 0.8421 0.6486
## Specificity 0.95575 0.7544 0.8125
## Pos Pred Value 0.61538 0.8205 0.5714
## Neg Pred Value 0.90000 0.7818 0.8571
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.06015 0.4812 0.1805
## Detection Prevalence 0.09774 0.5865 0.3158
## Balanced Accuracy 0.67788 0.7982 0.7306
importance(rf.fit1)
## high low moderate
## Reason.for.absence 70.5375810 101.313533 63.2393467
## Day.of.the.week 1.7246321 -4.088889 -3.5912094
## Transportation.expense 7.8281458 21.860152 13.1987024
## Distance.from.Residence.to.Work 2.8208051 8.521092 4.8244581
## Service.time 2.7315962 9.912955 3.2589291
## Age -0.1799845 10.398948 0.9287517
## Work.load.Average.day 1.1818111 4.621594 0.4533569
## Hit.target 2.9746632 3.774605 -0.0791509
## Height -0.3565491 17.905394 3.0534402
## Body.mass.index 0.5132413 9.346885 0.5666345
## MeanDecreaseAccuracy MeanDecreaseGini
## Reason.for.absence 118.840701 155.843193
## Day.of.the.week -4.321063 18.669516
## Transportation.expense 24.861364 21.794455
## Distance.from.Residence.to.Work 10.520234 11.278804
## Service.time 10.565270 9.813166
## Age 9.330569 9.052956
## Work.load.Average.day 3.940339 26.191008
## Hit.target 3.842166 20.944596
## Height 16.480953 11.680924
## Body.mass.index 8.671397 10.107328
varImpPlot(rf.fit1)
# Using a for loop to identify the right mtry for the model (mtry from 3 to 8)
a = c()
for (i in 3:8) {
  rf.fit2 <- randomForest(absentgroup ~ ., data = train, ntree = 500, mtry = i, importance = TRUE)
  rf.fit2.pred <- predict(rf.fit2, test, type = "class")
  a[i-2] = mean(rf.fit2.pred == test.group)
}
a
## [1] 0.7293233 0.7142857 0.7142857 0.7293233 0.7293233 0.7293233
plot(3:8,a, type = "b")
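## The mtry search above relies on a single train/test split. As a sketch of an alternative, caret::train
## can tune mtry with cross-validation (slower, since it re-fits the forest several times; not used for
## the results below):
set.seed(1234)
ctrl <- trainControl(method = "cv", number = 5)
rf_cv <- train(x = train[, -11], y = train$absentgroup, method = "rf",   # x/y interface keeps factors intact
               tuneGrid = expand.grid(mtry = 3:8), trControl = ctrl)
rf_cv$bestTune  # cross-validated choice of mtry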
rf.fit5 <- randomForest(absentgroup ~ ., data = train, ntree = 500, mtry = 5, importance = TRUE)
rf.fit5.pred <- predict(rf.fit5, test, type = "class")
print(postResample(pred = rf.fit5.pred, obs = test.group))
## Accuracy Kappa
## 0.7218045 0.5052780
confusionMatrix(rf.fit5.pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 8 1 4
## low 5 64 9
## moderate 7 11 24
##
## Overall Statistics
##
## Accuracy : 0.7218
## 95% CI : (0.6375, 0.796)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.0002423
##
## Kappa : 0.5053
##
## Mcnemar's Test P-Value : 0.2975673
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.40000 0.8421 0.6486
## Specificity 0.95575 0.7544 0.8125
## Pos Pred Value 0.61538 0.8205 0.5714
## Neg Pred Value 0.90000 0.7818 0.8571
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.06015 0.4812 0.1805
## Detection Prevalence 0.09774 0.5865 0.3158
## Balanced Accuracy 0.67788 0.7982 0.7306
## Building classification tree models using Quinlan's C5.0 algorithm with 10 boosting trials
c50.fit <- C5.0(train[-11], train$absentgroup, trials = 10)
summary(c50.fit)
##
## Call:
## C5.0.default(x = train[-11], y = train$absentgroup, trials = 10)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Oct 03 13:09:54 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 529 cases (11 attributes) from undefined.data
##
## ----- Trial 0: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence = 0: high (32)
## Reason.for.absence in {16,23,25,27,28}:
## :...Transportation.expense <= 291: low (257/17)
## : Transportation.expense > 291:
## : :...Body.mass.index <= 25: moderate (8/3)
## : Body.mass.index > 25: low (5)
## Reason.for.absence in {1,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,26}:
## :...Reason.for.absence = 9: high (4/1)
## Reason.for.absence in {1,4,5,6,8,10,15,17,18,21,22,24,
## : 26}: moderate (115/22)
## Reason.for.absence = 12:
## :...Distance.from.Residence.to.Work <= 20: high (2)
## : Distance.from.Residence.to.Work > 20: low (4/1)
## Reason.for.absence = 7:
## :...Transportation.expense > 279: high (3)
## : Transportation.expense <= 279:
## : :...Work.load.Average.day <= 6: moderate (2)
## : Work.load.Average.day > 6: low (6/1)
## Reason.for.absence = 11:
## :...Height <= 169: moderate (5/1)
## : Height > 169:
## : :...Body.mass.index <= 23: high (3/1)
## : Body.mass.index > 23: low (9/2)
## Reason.for.absence = 14:
## :...Body.mass.index > 25: low (7)
## : Body.mass.index <= 25:
## : :...Day.of.the.week in {2,4}: high (2)
## : Day.of.the.week in {3,6}: moderate (2)
## : Day.of.the.week = 5: low (3/1)
## Reason.for.absence = 19:
## :...Height > 174: low (3)
## : Height <= 174:
## : :...Distance.from.Residence.to.Work <= 17: high (3/1)
## : Distance.from.Residence.to.Work > 17: moderate (15/4)
## Reason.for.absence = 13:
## :...Day.of.the.week = 3:
## :...Work.load.Average.day <= 21: moderate (4)
## : Work.load.Average.day > 21: high (3/1)
## Day.of.the.week = 4:
## :...Height <= 171: moderate (4/2)
## : Height > 171: high (6/2)
## Day.of.the.week = 5:
## :...Height <= 171: moderate (5/1)
## : Height > 171: low (4/1)
## Day.of.the.week = 6:
## :...Work.load.Average.day <= 29: moderate (3)
## : Work.load.Average.day > 29: high (2)
## Day.of.the.week = 2:
## :...Distance.from.Residence.to.Work > 27: high (2)
## Distance.from.Residence.to.Work <= 27:
## :...Transportation.expense <= 189: low (2)
## Transportation.expense > 189:
## :...Age <= 15: moderate (2)
## Age > 15: high (2/1)
##
## ----- Trial 1: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence in {4,5,6,8,15,16,17,18,21,22,23,24,25,26,27,28}:
## :...Reason.for.absence in {4,8,16,21,23,27}: low (166.2/34.6)
## : Reason.for.absence in {5,6,15,17,18,24}: moderate (26.6/7.8)
## : Reason.for.absence = 22:
## : :...Body.mass.index <= 29: moderate (14.9)
## : : Body.mass.index > 29: low (7.5/2.4)
## : Reason.for.absence = 25:
## : :...Transportation.expense <= 189: moderate (12.5/4.7)
## : : Transportation.expense > 189: low (12.5/0.8)
## : Reason.for.absence = 28:
## : :...Height <= 165: moderate (6/0.8)
## : : Height > 165: low (67.4/8.6)
## : Reason.for.absence = 26:
## : :...Work.load.Average.day > 33: high (2.6)
## : Work.load.Average.day <= 33:
## : :...Age > 17: low (5.2)
## : Age <= 17:
## : :...Height <= 171: moderate (9.4)
## : Height > 171: low (13.3/5.5)
## Reason.for.absence in {0,1,7,9,10,11,12,13,14,19}:
## :...Reason.for.absence = 0: high (25.1)
## Reason.for.absence in {1,7,9,10,11,12,13,14,19}:
## :...Transportation.expense > 248:
## :...Height <= 172: high (40.9/18.3)
## : Height > 172: moderate (9.7/2.4)
## Transportation.expense <= 248:
## :...Body.mass.index > 25:
## :...Work.load.Average.day <= 10: low (10.7/1.6)
## : Work.load.Average.day > 10:
## : :...Work.load.Average.day <= 28: moderate (22.7/9.7)
## : Work.load.Average.day > 28: low (20.1/8.3)
## Body.mass.index <= 25:
## :...Distance.from.Residence.to.Work <= 15: high (7.5)
## Distance.from.Residence.to.Work > 15:
## :...Transportation.expense > 235: moderate (9.7/4.2)
## Transportation.expense <= 235:
## :...Height <= 168: moderate (2.4)
## Height > 168:
## :...Hit.target <= 96: high (26.3/16.2)
## Hit.target > 96: low (9.9/1.6)
##
## ----- Trial 2: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence = 0: high (20.8)
## Reason.for.absence in {16,23,25,27,28}:
## :...Body.mass.index <= 19: low (13.1/3.2)
## : Body.mass.index > 19:
## : :...Transportation.expense > 189:
## : :...Hit.target > 99: moderate (3.2)
## : : Hit.target <= 99:
## : : :...Height <= 164.5: moderate (2.2)
## : : Height > 164.5: low (98.7/15.7)
## : Transportation.expense <= 189:
## : :...Transportation.expense > 184: moderate (6.5)
## : Transportation.expense <= 184:
## : :...Day.of.the.week in {5,6}: low (28.8)
## : Day.of.the.week in {2,3,4}:
## : :...Service.time > 18: moderate (3.2)
## : Service.time <= 18:
## : :...Work.load.Average.day > 29: low (10.5)
## : Work.load.Average.day <= 29:
## : :...Body.mass.index <= 29: moderate (31.8/12.5)
## : Body.mass.index > 29: low (30.6/6.5)
## Reason.for.absence in {1,4,5,6,7,8,9,10,11,12,13,14,15,17,18,19,21,22,24,26}:
## :...Transportation.expense > 330: moderate (23.7/2.7)
## Transportation.expense <= 330:
## :...Work.load.Average.day > 26:
## :...Day.of.the.week = 3: moderate (12.5/4.2)
## : Day.of.the.week = 6: low (10.6/4.8)
## : Day.of.the.week in {2,4,5}:
## : :...Hit.target <= 94:
## : :...Distance.from.Residence.to.Work > 42: moderate (4.2)
## : : Distance.from.Residence.to.Work <= 42:
## : : :...Hit.target <= 88: high (5.6)
## : : Hit.target > 88: moderate (18.2/6.9)
## : Hit.target > 94:
## : :...Transportation.expense <= 118: low (4.3)
## : Transportation.expense > 118:
## : :...Hit.target <= 97: high (30.1/12.7)
## : Hit.target > 97: moderate (4)
## Work.load.Average.day <= 26:
## :...Work.load.Average.day > 24: low (19.5/6.6)
## Work.load.Average.day <= 24:
## :...Work.load.Average.day > 22: moderate (12.4/5)
## Work.load.Average.day <= 22:
## :...Height > 172: low (15.8/4.7)
## Height <= 172:
## :...Distance.from.Residence.to.Work > 27: moderate (46.4/15.5)
## Distance.from.Residence.to.Work <= 27:
## :...Day.of.the.week in {3,6}: moderate (17.7/8)
## Day.of.the.week in {2,4,5}:
## :...Body.mass.index > 35: high (2.1/0.6)
## Body.mass.index <= 35:
## :...Hit.target <= 88: moderate (10.4/5.7)
## Hit.target > 88:
## :...Transportation.expense <= 189: low (15.2/0.6)
## Transportation.expense > 189:
## :...Height <= 169: low (19.4/4.1)
## Height > 169: moderate (7.5)
##
## ----- Trial 3: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence in {0,1,7,9,10,11,12,13,14,19}:
## :...Reason.for.absence in {1,7}: moderate (30/13.5)
## : Reason.for.absence in {0,9,12}: high (29.4/4.9)
## : Reason.for.absence in {11,14}: low (38.3/19.4)
## : Reason.for.absence = 10:
## : :...Transportation.expense <= 235: low (13/4.8)
## : : Transportation.expense > 235: moderate (12.9/4)
## : Reason.for.absence = 13:
## : :...Hit.target <= 99: high (59.2/35.9)
## : : Hit.target > 99: low (2.9)
## : Reason.for.absence = 19:
## : :...Height > 174: low (3.8)
## : Height <= 174:
## : :...Age <= 5: moderate (3.5)
## : Age > 5: high (24.8/10.1)
## Reason.for.absence in {4,5,6,8,15,16,17,18,21,22,23,24,25,26,27,28}:
## :...Transportation.expense > 291:
## :...Day.of.the.week in {2,3,4,6}: moderate (29.4/7.5)
## : Day.of.the.week = 5: high (5/1.1)
## Transportation.expense <= 291:
## :...Body.mass.index <= 19: low (14.4/4.5)
## Body.mass.index > 19:
## :...Reason.for.absence in {4,16,18,23,25,27,28}:
## :...Height > 167: low (176.4/37)
## : Height <= 167:
## : :...Distance.from.Residence.to.Work <= 27: low (11.5/2.9)
## : Distance.from.Residence.to.Work > 27: moderate (12.2/1.9)
## Reason.for.absence in {5,6,8,15,17,21,22,24,26}:
## :...Service.time <= 12: moderate (10.9/1.8)
## Service.time > 12:
## :...Age <= 17: moderate (47.8/16.2)
## Age > 17: low (3.5)
##
## ----- Trial 4: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence = 0: high (14.8)
## Reason.for.absence in {1,7,9,10,11,12,13,14,19}:
## :...Transportation.expense <= 235:
## : :...Day.of.the.week = 5: moderate (18.2/9)
## : : Day.of.the.week = 6: low (8.5/3.7)
## : : Day.of.the.week = 3:
## : : :...Work.load.Average.day <= 28: high (18.9/9.1)
## : : : Work.load.Average.day > 28: low (8.8/2.6)
## : : Day.of.the.week = 4:
## : : :...Distance.from.Residence.to.Work <= 11: low (3.8)
## : : : Distance.from.Residence.to.Work > 11: high (29.6/14.6)
## : : Day.of.the.week = 2:
## : : :...Height > 174: high (6/0.5)
## : : Height <= 174:
## : : :...Transportation.expense <= 189: low (18.7/5.1)
## : : Transportation.expense > 189: moderate (11.1/3.6)
## : Transportation.expense > 235:
## : :...Age > 15: low (14.7/6.9)
## : Age <= 15:
## : :...Day.of.the.week = 3: moderate (8.3/1)
## : Day.of.the.week = 5: high (12.4/6.1)
## : Day.of.the.week in {2,4,6}:
## : :...Service.time <= 11: moderate (17.1/2.4)
## : Service.time > 11:
## : :...Work.load.Average.day <= 30: high (30.9/8.4)
## : Work.load.Average.day > 30: moderate (8.8/1.2)
## Reason.for.absence in {4,5,6,8,15,16,17,18,21,22,23,24,25,26,27,28}:
## :...Reason.for.absence in {4,5,6,8,15,17,18,22,24,26}: moderate (86.3/31.6)
## Reason.for.absence in {16,21,25,27}: low (55.1/12.6)
## Reason.for.absence = 28:
## :...Body.mass.index <= 24: low (16.3)
## : Body.mass.index > 24:
## : :...Day.of.the.week in {2,5}: low (11.5)
## : Day.of.the.week in {3,4,6}:
## : :...Height <= 167: moderate (4.9)
## : Height > 167:
## : :...Work.load.Average.day <= 13: moderate (15.6/3)
## : Work.load.Average.day > 13: low (10.5)
## Reason.for.absence = 23:
## :...Body.mass.index <= 19: high (7.7/3.2)
## Body.mass.index > 19:
## :...Service.time <= 4: low (11.5/5.3)
## Service.time > 4:
## :...Hit.target > 98: moderate (14.1/3.2)
## Hit.target <= 98:
## :...Hit.target > 93: low (17.9)
## Hit.target <= 93:
## :...Body.mass.index <= 24: low (8.3)
## Body.mass.index > 24:
## :...Hit.target <= 92: low (22.8/6)
## Hit.target > 92: moderate (15.9/4)
##
## ----- Trial 5: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence = 0: high (12.5)
## Reason.for.absence in {4,7,8,12,14,16,18,21,22,23,25,26,27,28}:
## :...Reason.for.absence in {4,7,12,14,16,27}: low (69/22.2)
## : Reason.for.absence in {8,18,21,22}: moderate (54.8/23.5)
## : Reason.for.absence = 25:
## : :...Distance.from.Residence.to.Work <= 22: moderate (17.7/6.8)
## : : Distance.from.Residence.to.Work > 22: low (4.8)
## : Reason.for.absence = 28:
## : :...Height <= 165: moderate (5/0.8)
## : : Height > 165: low (48/10.5)
## : Reason.for.absence = 23:
## : :...Day.of.the.week in {2,3,4,6}:
## : : :...Service.time <= 18: low (73.8/21.1)
## : : : Service.time > 18: moderate (2.6)
## : : Day.of.the.week = 5:
## : : :...Service.time <= 10: high (13.3/5.6)
## : : Service.time > 10: low (11.7/5.7)
## : Reason.for.absence = 26:
## : :...Work.load.Average.day > 33: high (2.8)
## : Work.load.Average.day <= 33:
## : :...Body.mass.index <= 35: low (24.2/9.3)
## : Body.mass.index > 35: moderate (3.2)
## Reason.for.absence in {1,5,6,9,10,11,13,15,17,19,24}:
## :...Transportation.expense > 179: moderate (108.3/39.1)
## Transportation.expense <= 179:
## :...Hit.target <= 84.5: moderate (4.4)
## Hit.target > 84.5:
## :...Day.of.the.week in {2,5,6}: low (32.1/14.3)
## Day.of.the.week = 3: moderate (15.8/7.8)
## Day.of.the.week = 4:
## :...Height <= 174: low (15.6/6.7)
## Height > 174: high (9.4/3.7)
##
## ----- Trial 6: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3,4,8,16,18,21,22,23,25,26,27,28}: low (282.4/114.2)
## Reason.for.absence in {0,1,5,6,7,9,10,11,12,13,14,15,17,19,24}:
## :...Reason.for.absence in {0,9,12}: high (26.6/7.1)
## Reason.for.absence in {1,5,6,13,15,17,24}: moderate (97.8/48.7)
## Reason.for.absence in {7,14}: low (38.4/21.6)
## Reason.for.absence = 10:
## :...Height <= 174: moderate (21.7/10)
## : Height > 174: high (3.2)
## Reason.for.absence = 11:
## :...Body.mass.index <= 24: moderate (14.2/6.7)
## : Body.mass.index > 24: low (11/3.2)
## Reason.for.absence = 19:
## :...Height > 174: low (4.1)
## Height <= 174:
## :...Distance.from.Residence.to.Work <= 17: high (5.6/1.6)
## Distance.from.Residence.to.Work > 17: moderate (23.8/9.2)
##
## ----- Trial 7: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence in {4,8,10,16,18,21,22,23,25,26,27,28}:
## :...Transportation.expense > 291: moderate (42.1/13.1)
## : Transportation.expense <= 291:
## : :...Reason.for.absence in {4,10,18,21}: moderate (52.2/25.2)
## : Reason.for.absence in {8,16,22,25,27,28}: low (102.6/29.1)
## : Reason.for.absence = 23:
## : :...Work.load.Average.day <= 21:
## : : :...Service.time <= 18: low (37.9/3.5)
## : : : Service.time > 18: moderate (2.5)
## : : Work.load.Average.day > 21:
## : : :...Service.time <= 13: moderate (26.1/10.9)
## : : Service.time > 13: low (4.6)
## : Reason.for.absence = 26:
## : :...Work.load.Average.day > 33: high (2.7)
## : Work.load.Average.day <= 33:
## : :...Age <= 17: moderate (24.6/6.5)
## : Age > 17: low (3.5)
## Reason.for.absence in {0,1,5,6,7,9,11,12,13,14,15,17,19,24}:
## :...Reason.for.absence in {0,9}: high (16.6/2.7)
## Reason.for.absence in {1,5,6,15,17,24}: moderate (22.7/6)
## Reason.for.absence in {7,12,14}: low (49.8/28.6)
## Reason.for.absence = 11:
## :...Work.load.Average.day <= 26: low (12/3.6)
## : Work.load.Average.day > 26: moderate (13.7/5)
## Reason.for.absence = 19:
## :...Height > 174: low (3.9)
## : Height <= 174:
## : :...Distance.from.Residence.to.Work <= 17: high (5.6/1.9)
## : Distance.from.Residence.to.Work > 17: moderate (23.9/10.1)
## Reason.for.absence = 13:
## :...Work.load.Average.day > 34: high (14.7/6)
## Work.load.Average.day <= 34:
## :...Transportation.expense <= 118: low (4.3)
## Transportation.expense > 118:
## :...Hit.target <= 93: moderate (10.8/2.4)
## Hit.target > 93:
## :...Body.mass.index > 35: high (2.2)
## Body.mass.index <= 35:
## :...Age > 15: low (4.5)
## Age <= 15:
## :...Hit.target > 99: low (2.7)
## Hit.target <= 99:
## :...Transportation.expense > 189: moderate (14.3/6)
## Transportation.expense <= 189:
## :...Hit.target <= 97: low (16.1/7.5)
## Hit.target > 97: high (8.2/2.8)
##
## ----- Trial 8: -----
##
## Decision tree:
##
## Reason.for.absence in {2,3}: low (0)
## Reason.for.absence in {0,1,5,6,7,9,11,12,13,14,15,17,19,24}:
## :...Reason.for.absence in {0,9,12,14}: high (44.2/17.4)
## : Reason.for.absence in {1,5,6,15,17,24}: moderate (17.7/3.1)
## : Reason.for.absence = 7:
## : :...Transportation.expense <= 279: moderate (15.6/6.5)
## : : Transportation.expense > 279: high (7.5)
## : Reason.for.absence = 11:
## : :...Body.mass.index <= 24: high (16.1/8.1)
## : : Body.mass.index > 24: low (10.9/3.9)
## : Reason.for.absence = 19:
## : :...Height <= 174: high (30.2/14.5)
## : : Height > 174: low (3.4)
## : Reason.for.absence = 13:
## : :...Day.of.the.week in {2,6}: high (25.6/12.8)
## : Day.of.the.week = 3: moderate (13.2/6.3)
## : Day.of.the.week = 5: low (17.2/9.2)
## : Day.of.the.week = 4:
## : :...Hit.target <= 95: moderate (18.1/8.2)
## : Hit.target > 95: high (4.1)
## Reason.for.absence in {4,8,10,16,18,21,22,23,25,26,27,28}:
## :...Height > 175: low (39/19.5)
## Height <= 175:
## :...Work.load.Average.day > 31: moderate (23.6/8.3)
## Work.load.Average.day <= 31:
## :...Service.time <= 9: low (57/7.4)
## Service.time > 9:
## :...Distance.from.Residence.to.Work > 50: low (29.4/4.1)
## Distance.from.Residence.to.Work <= 50:
## :...Work.load.Average.day > 29: low (11.3/0.8)
## Work.load.Average.day <= 29:
## :...Age <= 9: moderate (38.6/10.1)
## Age > 9:
## :...Height > 172: moderate (4.6)
## Height <= 172:
## :...Hit.target <= 88: low (6.2)
## Hit.target > 88:
## :...Transportation.expense > 268: low (6.8)
## Transportation.expense <= 268:
## :...Body.mass.index > 32: moderate (5.2)
## Body.mass.index <= 32:
## :...Height <= 165: low (9.1/1.3)
## Height > 165: [S1]
##
## SubTree [S1]
##
## Reason.for.absence in {4,21,22,26}: moderate (16.9)
## Reason.for.absence in {8,10,16,18,23,25,27,28}:
## :...Body.mass.index > 28: low (7)
## Body.mass.index <= 28:
## :...Day.of.the.week in {5,6}: low (5.2)
## Day.of.the.week in {2,3,4}:
## :...Work.load.Average.day > 18: moderate (7.7/1.4)
## Work.load.Average.day <= 18:
## :...Reason.for.absence in {8,16,23,27,28}: low (12.5)
## Reason.for.absence in {10,18,25}: moderate (15.9/5.4)
##
## ----- Trial 9: -----
##
## Decision tree:
##
## Reason.for.absence in {0,9}: high (10.9)
## Reason.for.absence in {4,5,6,8,15,17,18,21,22,24}: moderate (74.1/22.5)
## Reason.for.absence in {2,3,16,23,25,27,28}: low (147.2/23.3)
## Reason.for.absence in {1,7,10,11,12,13,14,19,26}:
## :...Transportation.expense <= 157:
## :...Height <= 174: low (17.2/6.1)
## : Height > 174:
## : :...Day.of.the.week in {2,4}: high (17.6/1.4)
## : Day.of.the.week in {3,5,6}: low (5.1)
## Transportation.expense > 157:
## :...Reason.for.absence in {1,10,11,19}: moderate (77.1/26.8)
## Reason.for.absence = 12: low (11/4.3)
## Reason.for.absence = 7:
## :...Transportation.expense <= 279: moderate (15.6/7.5)
## : Transportation.expense > 279: high (6.7)
## Reason.for.absence = 14:
## :...Body.mass.index <= 25: high (17.4/9.7)
## : Body.mass.index > 25: low (8.6)
## Reason.for.absence = 26:
## :...Work.load.Average.day <= 33: moderate (24.6/7.3)
## : Work.load.Average.day > 33: high (3.1)
## Reason.for.absence = 13:
## :...Hit.target <= 93: moderate (14.9/2.8)
## Hit.target > 93:
## :...Work.load.Average.day <= 7: low (7.2/2.1)
## Work.load.Average.day > 7:
## :...Hit.target > 99: low (3.1)
## Hit.target <= 99:
## :...Hit.target > 97: high (8.7/1.8)
## Hit.target <= 97:
## :...Hit.target > 96: moderate (11/2.1)
## Hit.target <= 96:
## :...Body.mass.index <= 21: moderate (6.3/2.6)
## Body.mass.index > 21: high (21/8)
##
##
## Evaluation on training data (529 cases):
##
## Trial Decision Tree
## ----- ----------------
## Size Errors
##
## 0 33 63(11.9%)
## 1 23 109(20.6%)
## 2 30 98(18.5%)
## 3 19 109(20.6%)
## 4 30 103(19.5%)
## 5 20 112(21.2%)
## 6 11 140(26.5%)
## 7 27 107(20.2%)
## 8 30 119(22.5%)
## 9 21 80(15.1%)
## boost 45( 8.5%) <<
##
##
## (a) (b) (c) <-classified as
## ---- ---- ----
## 62 3 8 (a): class high
## 1 291 10 (b): class low
## 2 21 131 (c): class moderate
##
##
## Attribute usage:
##
## 100.00% Reason.for.absence
## 93.95% Transportation.expense
## 90.74% Work.load.Average.day
## 90.36% Height
## 88.09% Body.mass.index
## 80.15% Day.of.the.week
## 69.94% Service.time
## 64.84% Distance.from.Residence.to.Work
## 62.19% Hit.target
## 44.23% Age
##
##
## Time: 0.0 secs
plot(c50.fit)
c50.fit.pred <- predict(c50.fit, test)
print(postResample(pred = c50.fit.pred, obs = test.group))
## Accuracy Kappa
## 0.6992481 0.4493894
confusionMatrix(c50.fit.pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 5 2 3
## low 8 65 11
## moderate 7 9 23
##
## Overall Statistics
##
## Accuracy : 0.6992
## 95% CI : (0.6137, 0.7757)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.001653
##
## Kappa : 0.4494
##
## Mcnemar's Test P-Value : 0.144744
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.25000 0.8553 0.6216
## Specificity 0.95575 0.6667 0.8333
## Pos Pred Value 0.50000 0.7738 0.5897
## Neg Pred Value 0.87805 0.7755 0.8511
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.03759 0.4887 0.1729
## Detection Prevalence 0.07519 0.6316 0.2932
## Balanced Accuracy 0.60288 0.7610 0.7275
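## C5imp() from the C50 package summarizes predictor importance for the boosted model; a minimal sketch
## (its "usage" metric mirrors the attribute-usage table in the summary above):
C5imp(c50.fit, metric = "usage")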
## Recursive partitioning classification tree - rpart
m2 = rpart(absentgroup ~ .,train, method = "class")
m2.pred = predict(m2, test, type = "class")
print(postResample(pred = m2.pred, obs = test.group))
## Accuracy Kappa
## 0.7142857 0.4774607
confusionMatrix(m2.pred, test.group)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 4 0 2
## low 8 65 9
## moderate 8 11 26
##
## Overall Statistics
##
## Accuracy : 0.7143
## 95% CI : (0.6295, 0.7892)
## No Information Rate : 0.5714
## P-Value [Acc > NIR] : 0.0004757
##
## Kappa : 0.4775
##
## Mcnemar's Test P-Value : 0.0081006
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.20000 0.8553 0.7027
## Specificity 0.98230 0.7018 0.8021
## Pos Pred Value 0.66667 0.7927 0.5778
## Neg Pred Value 0.87402 0.7843 0.8750
## Prevalence 0.15038 0.5714 0.2782
## Detection Rate 0.03008 0.4887 0.1955
## Detection Prevalence 0.04511 0.6165 0.3383
## Balanced Accuracy 0.59115 0.7785 0.7524
plot(m2)
text(m2, pretty = 0, cex = 0.8)
prp(m2, varlen = 4, extra = 2)
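## If further tuning of the rpart tree were desired, the complexity-parameter table shows how the
## cross-validated error changes with tree size; a minimal sketch (no pruning is applied to the results above):
printcp(m2)                                                     # cp table with cross-validated error (xerror)
best_cp <- m2$cptable[which.min(m2$cptable[, "xerror"]), "CP"]  # cp with the lowest xerror
m2_pruned <- prune(m2, cp = best_cp)                            # pruned tree, for comparison only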
## Support Vector Machine
absent_classifier <- ksvm(absentgroup ~ ., data = train, kernel = "vanilladot")
## Setting default kernel parameters
absent_classifier
## Support Vector Machine object of class "ksvm"
##
## SV type: C-svc (classification)
## parameter : cost C = 1
##
## Linear (vanilla) kernel function.
##
## Number of Support Vectors : 295
##
## Objective Function Value : -66.3712 -76.3851 -135.2174
## Training error : 0.190926
#Evaluating the SVM Model Performance
absent_predictions <- predict(absent_classifier, test)
table(absent_predictions, test.group )
## test.group
## absent_predictions high low moderate
## high 4 0 0
## low 4 67 11
## moderate 12 9 26
#Agreement between SVM predictions and the actual classes
agreement <- absent_predictions == test.group
table(agreement)
## agreement
## FALSE TRUE
## 36 97
print(postResample(pred = absent_predictions, obs = test.group))
## Accuracy Kappa
## 0.7293233 0.5032164
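## Since the earlier pairs plot suggested the data are not linear, a non-linear kernel is worth a try;
## a minimal sketch with kernlab's RBF kernel ("rbfdot"), not used for the final model:
set.seed(1234)
absent_classifier_rbf <- ksvm(absentgroup ~ ., data = train, kernel = "rbfdot")
rbf_pred <- predict(absent_classifier_rbf, test)
print(postResample(pred = rbf_pred, obs = test.group))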
############## Random Forest is our Best Performer ##################
############### Final Prediction on entire data set ##################
finalData = rbind(train, test)
final_fit <- predict(rf.fit5, finalData, type = "class")
summary(final_fit)
## high low moderate
## 89 382 191
#table(final_fit, modeldata$absentgroup)
print(postResample(pred = final_fit, obs = finalData$absentgroup))
## Accuracy Kappa
## 0.9335347 0.8830623
confusionMatrix(final_fit, finalData$absentgroup)
## Confusion Matrix and Statistics
##
## Reference
## Prediction high low moderate
## high 81 1 7
## low 5 365 12
## moderate 7 12 172
##
## Overall Statistics
##
## Accuracy : 0.9335
## 95% CI : (0.9118, 0.9513)
## No Information Rate : 0.571
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.8831
##
## Mcnemar's Test P-Value : 0.4459
##
## Statistics by Class:
##
## Class: high Class: low Class: moderate
## Sensitivity 0.8710 0.9656 0.9005
## Specificity 0.9859 0.9401 0.9597
## Pos Pred Value 0.9101 0.9555 0.9005
## Neg Pred Value 0.9791 0.9536 0.9597
## Prevalence 0.1405 0.5710 0.2885
## Detection Rate 0.1224 0.5514 0.2598
## Detection Prevalence 0.1344 0.5770 0.2885
## Balanced Accuracy 0.9285 0.9529 0.9301
### The best performer is the random forest with mtry = 5. On the entire data set (training plus test), its prediction accuracy is about 93% with a Kappa of 0.88. Note that this evaluation includes the training observations, so it overstates out-of-sample performance; the held-out test accuracy was about 72%.
################################## End of Project Absenteeism at Work #####################################