Data Manipulation

Setting Factors

Reason for absence, month, day of the week, season, disciplinary failure, education, social drinker, & social smoker should be set as factors, based on the documentation.

Son (children) and Pets are debatable since it isn’t possible to have .1 - .9 of a kid/pet, but they are continuous variables.

ATW_factors <- c(2:5, 12:13, 15:16)
ATW2 <- ATW
ATW2[ATW_factors] <- lapply(ATW2[ATW_factors], factor)

Factor Levels

There was a mild complication with ‘Reason for absence’ since 0s are present in the data, however the research article doesn’t list ‘0’ as a possible code. Also, ‘20’ doesn’t exist in the data, however it exists as a code. Based on other papers on this data, it seems that the codes for 1 through 20 should be applied to 0 through 19 (21 through 28 are fine).

‘dz’ is shorthand for ‘diseases’

# set factor levels & rename columns
# 0s are present in 'Reason for absence' while 20 is not. However, there is no level 0 in the documentation
ATW2 <- ATW2 %>%
  mutate(`Reason for absence` = fct_recode(`Reason for absence`,
                                           `Infectious & parasitic (dz)`="0", 
                                           `Neoplasms`="1",
                                           `blood (dz)`="2",
                                           `Endocrine, Nutritional, & metabolic diseases`="3",
                                           `Mental & behavioural disorders`="4",
                                           `nervous system (dz)`="5",
                                           `eye & adnexa (dz)`="6",
                                           `ear & mastoid process (dz)`="7",
                                           `circulatory system (dz)`="8",
                                           `respiratory system (dz)`="9",
                                           `digestive system (dz)`="10", 
                                           `integumentary (dz)`="11",
                                           `musculoskeletal system (dz)`="12", 
                                           `genitourinary system (dz)`="13",
                                           `Pregnancy, childbirth and & puerperium`="14",
                                           `perinatal period conditions`="15",  
                                           `congenital abnormalities`= "16",
                                           `abnormal clinical findings`="17", 
                                           `external injury`= "18",
                                           `morbidity and mortality`="19", 
                                           `health status factors`="21",
                                           `patient follow-up`="22",
                                           `medical consultation`="23",
                                           `blood donation`="24", 
                                           `laboratory examination`="25", 
                                           `unjustified absence`="26", 
                                           `physiotherapy`="27",
                                           `dental consultation`="28")) %>% 
   
  mutate(`Month of absence` = fct_recode(`Month of absence`,
                                          None = "0",
                                          January = "1",
                                          February = "2",
                                          March = "3",
                                          April = "4",
                                          May = "5",
                                          June = "6",
                                          July = "7",
                                          August = "8",
                                          September = "9",
                                          October = "10",
                                          November = "11",
                                          Decemeber = "12")) %>% 
  
   mutate(`Day of the week` = fct_recode(`Day of the week`,
                                         Monday = "2",
                                         Tuesday = "3",
                                         Wednesday = "4",
                                         Thursday = "5",
                                         Friday = "6")) %>% 
  
   mutate(Seasons = fct_recode(Seasons,
                              summer = "1",
                              fall = "2",
                              winter = "3",
                              spring = "4")) %>% 

   mutate(Education = fct_recode(Education,
                                 `high school` = "1",
                                 graduate = "2",
                                 `post-graduate` = "3",
                                 `master & doctorate` = "4")) %>% 
  
   mutate(`Disciplinary failure` = fct_recode(`Disciplinary failure`,
                                              no = "0",
                                              yes = "1")) %>% 

  mutate(`Social drinker` = fct_recode(`Social drinker`,
                                        no = "0",
                                        yes = "1")) %>% 
  
   mutate(`Social smoker` = fct_recode(`Social smoker`,
                                       no = "0",
                                       yes = "1"))

Rename Columns

# rename columns
ATW2 <- ATW2 %>% rename(
  Reason = `Reason for absence`,
  Month = `Month of absence`,
  Weekday = `Day of the week`,
  TransExpense = `Transportation expense`,
  Distance = `Distance from Residence to Work`,
  ServiceTime = `Service time`,
  Workload = `Work load Average/day`,
  Hit = `Hit target`,
  DiscpFail = `Disciplinary failure`,
  Children = `Son`,
  SocDrinker = `Social drinker`,
  SocSmoker = `Social smoker`,
  BMI = `Body mass index`,
  Absenteeism = `Absenteeism time in hours`
)

datatable(ATW2, rownames = F)

EDA, Visualization, & Exploration

Pie Charts

  • The top 4 reasons for absence are medical consultations, dental consultations, physiotherapy, and genitourinary system diseases

  • Genitourinary system diseases, issues relating to morbidity & death, medical consultations, dental consultations, and integumentary system diseases account for roughly 50% of absentee hours. The top 2 (genitourinary dz & morbidity/death related factors) make sense since both reasons require a lot of attention.

# Paired plots using 'add_pie'
Reason_donut_pair <- plot_ly()

Reason_donut_pair <- Reason_donut_pair %>%
  add_pie(data = count(ATW2, Reason), labels = ~Reason, values = ~n, hole = 0.6, textinfo = 'percent', textposition = 'inside',
          name = "Frequency",
          hoverinfo = 'text',
          text = ~paste('Frequency of', Reason, 'for absenteeism'),
          domain = list(x = c(0.51, 0.99))) %>% 
  add_pie(data = ATW2, labels = ~Reason, values = ~Absenteeism, hole = 0.6, textinfo = 'value', textposition = 'inside',
          hoverinfo = 'text',
          text = ~paste('Total hours absent because of', Reason),
          name = "Total Hours",
          domain = list(x = c(0.01, 0.46))) %>% 
  layout(title = "Reasons for Absenteeism by Total Hours Absent & Frequency", showlegend = T,
         xaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE),
         yaxis = list(showgrid = FALSE, zeroline = FALSE, showticklabels = FALSE))

Reason_donut_pair

Bar charts

# Reorder by percent descending

Scatterplots

Age & Absenteeism

Based on the concentration of points, it looks like the majority of employees are absent for 8 hours or less.

ATW2 %>% plot_ly(
  x = ~Age, y = ~Absenteeism,
  color = ~Age, size = ~Absenteeism
)

Distplot

Distribution of Age

Visualize the density & distribution of age using a histogram, rug plot, & normal curve (or KDE)

age_dist <- ggplot(ATW2, aes(x = Age, color = 'density')) +
  geom_histogram(aes(y = ..density..), bins = 8, fill = '#67B7D1', alpha = 0.5) +
  geom_density(color = '#67B7D1') +
  geom_rug(color = '#67B7D1') +
  ylab("")+
  xlab("") + theme(legend.title = element_blank()) +
  scale_color_manual(values = c('density' = '#67B7D1'))

ggplotly(age_dist) %>% 
  layout(plot_bgcolor = '#e5ecf6')

Modeling Pt 1

Feature Importance

Linear Regression (ATW Original)

Using varImp (caret) on a linear regression model, the most important variables by far are reasons for absence & disciplinary failure. This is followed by Children (‘Son’), Education, & Age

set.seed(100)
# 5-fold cross validation, 2 repeats
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2)
train <- createDataPartition(y = ATW$`Absenteeism time in hours`, p = 0.7, list = F)
training <- ATW[train, ]
testing <- ATW[-train, ]
lm1 <- train(`Absenteeism time in hours` ~ . -ID, data = ATW, method = "lm", prob.model = T)
importance1 <- varImp(lm1, scale = F)
print(importance1)
## lm variable importance
## 
##                                         Overall
## `\\`Reason for absence\\``              7.29283
## `\\`Disciplinary failure\\``            7.28016
## `\\`Day of the week\\``                 2.80132
## Son                                     2.05225
## Education                               1.61704
## Age                                     1.58835
## `\\`Social drinker\\``                  1.18414
## `\\`Body mass index\\``                 0.94568
## `\\`Social smoker\\``                   0.94294
## `\\`Distance from Residence to Work\\`` 0.78791
## `\\`Hit target\\``                      0.76261
## `\\`Month of absence\\``                0.75446
## Weight                                  0.74169
## `\\`Transportation expense\\``          0.63088
## Pet                                     0.62458
## `\\`Work load Average/day\\``           0.42733
## Height                                  0.28440
## Seasons                                 0.08603
## `\\`Service time\\``                    0.02898
plot(importance1)

Linear Regression 2(ATW2)

Using the factored data (ATW2) shows that the most important variables are still reasons for absence, children, and the day of the week. More specifically, the most influential reasons seem to be: respiratory system diseases, factors relating to morbidity & mortality, genitourinary system diseases, and musculoskeletal system diseases.

set.seed(100)
# 10-fold cross validation, 3 repeats
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2)

train <- createDataPartition(y = ATW2$Absenteeism, p = 0.7, list = F)
training <- ATW2[train, ]
testing <- ATW2[-train, ]

lm2 <- train(Absenteeism ~ . -ID, data = ATW2, method = "lm", prob.model = T)
importance2 <- varImp(lm2, scale = F)

print(importance2)
## lm variable importance
## 
##   only 20 most important variables shown (out of 62)
## 
##                                                Overall
## `Reasonrespiratory system (dz)`                  6.572
## `Reasonmorbidity and mortality`                  6.545
## `Reasongenitourinary system (dz)`                6.335
## `Reasonmusculoskeletal system (dz)`              5.107
## `Reasoneye & adnexa (dz)`                        4.215
## `Reasonintegumentary (dz)`                       4.043
## `Reasondigestive system (dz)`                    3.500
## `Reasonexternal injury`                          3.169
## `Reasonpatient follow-up`                        3.073
## `Reasonear & mastoid process (dz)`               2.909
## ReasonNeoplasms                                  2.681
## `ReasonPregnancy, childbirth and & puerperium`   2.509
## `Reasonunjustified absence`                      2.390
## WeekdayThursday                                  2.307
## Reasonphysiotherapy                              2.254
## `Reasonblood (dz)`                               2.046
## Children                                         1.893
## `Reasonmedical consultation`                     1.794
## WeekdayFriday                                    1.636
## `Reasoncirculatory system (dz)`                  1.576
plot(importance2)

Taking out ‘Reasons’

When reasons for absence are removed, the most important variables are discipline failure, day of the week, age, season, and children

set.seed(100)
# 10-fold cross validation, 3 repeats
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2)

train <- createDataPartition(y = ATW2$Absenteeism, p = 0.7, list = F)
training <- ATW2[train, ]
testing <- ATW2[-train, ]

lm3 <- train(Absenteeism ~ . -ID -Reason, data = ATW2, method = "lm", prob.model = T)
importance3 <- varImp(lm3, scale = F)

print(importance3)
## lm variable importance
## 
##   only 20 most important variables shown (out of 36)
## 
##                  Overall
## DiscpFailyes      3.8736
## WeekdayFriday     2.9581
## WeekdayThursday   2.9414
## Age               1.9853
## Seasonswinter     1.7695
## MonthJuly         1.7653
## Children          1.7085
## Seasonsfall       1.3254
## TransExpense      1.3127
## MonthDecemeber    1.2852
## MonthNovember     1.2068
## MonthAugust       1.0988
## MonthSeptember    1.0899
## Distance          1.0737
## WeekdayWednesday  1.0408
## MonthMarch        0.9753
## MonthJune         0.9277
## WeekdayTuesday    0.8905
## BMI               0.8709
## MonthOctober      0.8654
plot(importance3)

Clustering

K-means

Elbow Method

After plotting the total w/in-clusters sum of squares, it looks like 2 through 4 clusters is optimal

set.seed(100)

k.max <- 15
ATW_wss <- sapply(1:k.max,
                   function (k) {kmeans(ATW, k, nstart = 50, iter.max = 15)$tot.withinss})

plot(1:k.max, ATW_wss,
     type = "b", pch = 19, frame = F,
     xlab = "Number of k Clusters",
     ylab = "Total within-clusters sum of squares")

2 clusters

ATW3 <- ATW %>% select(`Reason for absence`, `Disciplinary failure`, `Son`, `Education`, `Age`, `Month of absence`, `Seasons`, `Day of the week`)

set.seed(100)

km2 <- kmeans(ATW3, 2, nstart = 50)
ATW3$cluster <- km2$cluster

fviz_cluster(object = km2,
             data = ATW3,
             ellipse.type = "norm",
             geom = "point",
             palette = "jco",
             main = "",
             ggtheme = theme_minimal())

3 Clusters

set.seed(100)

km3 <- kmeans(ATW3, 3, nstart = 50)
ATW3$cluster <- km3$cluster

fviz_cluster(object = km3,
             data = ATW3,
             ellipse.type = "norm",
             geom = "point",
             palette = "jco",
             main = "",
             ggtheme = theme_minimal())

4 Clusters

# Copy of dataframe for cluster values

set.seed(100)

km4 <- kmeans(ATW3, 4, nstart = 50)

fviz_cluster(object = km4,
             data = ATW3,
             ellipse.type = "norm",
             geom = "point",
             palette = "jco",
             main = "",
             ggtheme = theme_minimal())

5 Clusters

set.seed(100)

km5 <- kmeans(ATW3, 5, nstart = 50)

fviz_cluster(object = km5,
             data = ATW3,
             ellipse.type = "norm",
             geom = "point",
             palette = "jco",
             main = "",
             ggtheme = theme_minimal())

Cluster assignments

Add cluster assignments to ATW2

# ATW2 <- ATW2 %>% select(- c(cluster))
# Add cluster assignment to ATW2
ATW2$k2Cluster <- km2$cluster
ATW2$k3Cluster <- km3$cluster
ATW2$k4Cluster <- km4$cluster
ATW2$k5Cluster <- km5$cluster
ATW2
## # A tibble: 740 x 25
##       ID Reason    Month Weekday Seasons TransExpense Distance ServiceTime   Age
##    <dbl> <fct>     <fct> <fct>   <fct>          <dbl>    <dbl>       <dbl> <dbl>
##  1    11 unjustif~ July  Tuesday summer           289       36          13    33
##  2    36 Infectio~ July  Tuesday summer           118       13          18    50
##  3     3 medical ~ July  Wednes~ summer           179       51          18    38
##  4     7 ear & ma~ July  Thursd~ summer           279        5          14    39
##  5    11 medical ~ July  Thursd~ summer           289       36          13    33
##  6     3 medical ~ July  Friday  summer           179       51          18    38
##  7    10 patient ~ July  Friday  summer           361       52           3    28
##  8    20 medical ~ July  Friday  summer           260       50          11    36
##  9    14 morbidit~ July  Monday  summer           155       12          14    34
## 10     1 patient ~ July  Monday  summer           235       11          14    37
## # ... with 730 more rows, and 16 more variables: Workload <dbl>, Hit <dbl>,
## #   DiscpFail <fct>, Education <fct>, Children <dbl>, SocDrinker <fct>,
## #   SocSmoker <fct>, Pet <dbl>, Weight <dbl>, Height <dbl>, BMI <dbl>,
## #   Absenteeism <dbl>, k2Cluster <int>, k3Cluster <int>, k4Cluster <int>,
## #   k5Cluster <int>

Hierarchical clustering

set.seed(100)

ATW_dissimilarity <- dist(ATW3, method = 'euclidean')
hc1 <- hclust(ATW_dissimilarity, method = 'average')

plot(hc1)

Feature Engineering (post-Clustering & Feature Importance)

Absent

There are 45 rows where people were not absent, but I don’t think that was a significant contributor to clustering (see “2 Clusters”). Based on earlier plots, it looks like most individuals take a half-day (3 to 4 hours) to day of absence (8 hours). After these points, ‘Absenteeism’ (lenght of absence in hours) increases in intervals of 8, suggesting that people take multi-day absences.

# There are 45 rows where people were not absent
# ATW2 %>% filter(Absenteeism < 1)

# Create 2 categoires for AbsentType: 1 day & Multi-day

ATW2 <- ATW2 %>% mutate(
  AbsentType = case_when(
    Absenteeism <= 8 ~ 'One day',
    Absenteeism >= 9 ~ 'Multi-day'
  )
)

Age Groups

ATW2 <- ATW2 %>% mutate(
  AgeGroup = case_when(
    Age >= 18 & Age <= 25 ~ 'Emerging Adult',
    Age >= 26 & Age <= 40 ~ 'Young Adult',
    Age >= 41 & Age <= 55 ~ 'Middle Adult',
    Age >= 56 & Age <= 58 ~ 'Young-Old Adult'
  )
)

datatable(ATW2, rownames = F)

Modeling Pt 2

Predicting Absenteeism

All of my linear model variations were pretty poor. I’ll need to try more combinations in the future, but this was the best one I could produce at the time:

  • \(RSME = 9.652\)

  • \(R^{2} = .499\)

  • \(MAE = 4.242\)

#ATW4 <- ATW2 %>% select(Reason, DiscpFail, Education, Children, Age, Seasons, Weekday, Month, Absenteeism, AbsentType, k2Cluster)

set.seed(100)
# 10-fold cross validation, 3 repeats
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2)

train <- createDataPartition(y = ATW2$Absenteeism, p = 0.7, list = F)
training <- ATW2[train, ]
testing <- ATW2[-train, ]

lm4 <- train(Absenteeism ~ 
               Reason + DiscpFail + Children + Age + Seasons + Weekday + AbsentType + k2Cluster + k3Cluster,
             data = ATW2, 
             method = "lm", 
             prob.model = T)

lm4_pred <- predict(lm4, testing)

lm4
## Linear Regression 
## 
## 740 samples
##   9 predictor
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 740, 740, 740, 740, 740, 740, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   9.607287  0.4930916  4.269613
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE

Predicting Absent Type (single vs multiday absences)

In terms of predicting one day vs multi-day absences, the model was able to produce high accuracy with a pretty good ROC curve/AUC value.

  • including k2Cluster: Accuracy = .905, Specificity = .971, AUC = .808,

  • including k4Cluster: Accuracy = .919, Specificity = 1.000, AUC = .788

However, the precision, recall, & sensitivity scores were low, suggesting that this model is good at finding the positive (or predicted) class (Multi-day), but it is too cautious and under-predicting the negative class (One day absences)

set.seed(100)
# 10-fold cross validation, 3 repeats
control <- trainControl(method = "repeatedcv", number = 5, repeats = 2)

train <- createDataPartition(y = ATW2$AbsentType, p = 0.7, list = F)
training <- ATW2[train, ]
testing <- ATW2[-train, ]

LogitBoost1 <- train(AbsentType ~ 
               Reason + DiscpFail + Children + Age + Seasons + Weekday+ k4Cluster,
             data = ATW2, 
             method = "LogitBoost", 
             prob.model = T)

test_pred <- predict(LogitBoost1, newdata = testing)
confusionMatrix(test_pred, as.factor(testing$AbsentType), mode = 'everything')
## Confusion Matrix and Statistics
## 
##            Reference
## Prediction  Multi-day One day
##   Multi-day         0       0
##   One day          18     203
##                                          
##                Accuracy : 0.9186         
##                  95% CI : (0.8743, 0.951)
##     No Information Rate : 0.9186         
##     P-Value [Acc > NIR] : 0.5623         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : 6.151e-05      
##                                          
##             Sensitivity : 0.00000        
##             Specificity : 1.00000        
##          Pos Pred Value :     NaN        
##          Neg Pred Value : 0.91855        
##               Precision :      NA        
##                  Recall : 0.00000        
##                      F1 :      NA        
##              Prevalence : 0.08145        
##          Detection Rate : 0.00000        
##    Detection Prevalence : 0.00000        
##       Balanced Accuracy : 0.50000        
##                                          
##        'Positive' Class : Multi-day      
## 
test_pred_prob <- predict(LogitBoost1, newdata = testing, type = 'prob')
logit_boost_roc1 <- roc(testing$AbsentType, test_pred_prob$`Multi-day`)
logit_boost_roc1
## 
## Call:
## roc.default(response = testing$AbsentType, predictor = test_pred_prob$`Multi-day`)
## 
## Data: test_pred_prob$`Multi-day` in 18 controls (testing$AbsentType Multi-day) > 203 cases (testing$AbsentType One day).
## Area under the curve: 0.7875
plot(logit_boost_roc1)