Data processing
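The code below assumes the required packages are already attached (possibly in an earlier setup chunk); for completeness, these are the packages used in this report:

library(C50)          # C5.0() decision trees
library(randomForest) # randomForest(), varImpPlot()
library(naivebayes)   # naive_bayes()
library(kernlab)      # ksvm()
library(class)        # knn()
library(corrplot)     # corrplot()
library(ggplot2)      # plotting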

##read the train data
absent_train<-data.frame(read.csv("Absenteeism_at_work_train.csv"))
summary(absent_train)
##        ID       Reason.for.absence Month.of.absence Day.of.the.week
##  Min.   : 1.0   Min.   : 0.00      Min.   : 1.000   Min.   :2.000  
##  1st Qu.: 7.0   1st Qu.:13.00      1st Qu.: 3.000   1st Qu.:3.000  
##  Median :18.0   Median :23.00      Median : 7.000   Median :4.000  
##  Mean   :17.7   Mean   :19.46      Mean   : 6.448   Mean   :3.893  
##  3rd Qu.:28.0   3rd Qu.:26.00      3rd Qu.:10.000   3rd Qu.:5.000  
##  Max.   :36.0   Max.   :28.00      Max.   :12.000   Max.   :6.000  
##                                                                    
##     Seasons      Transportation.expense Distance.from.Residence.to.Work
##  Min.   :1.000   Min.   :  0.0          Min.   : 5.00                  
##  1st Qu.:2.000   1st Qu.:179.0          1st Qu.:17.00                  
##  Median :2.000   Median :225.0          Median :26.00                  
##  Mean   :2.553   Mean   :222.9          Mean   :30.34                  
##  3rd Qu.:4.000   3rd Qu.:260.0          3rd Qu.:50.00                  
##  Max.   :4.000   Max.   :388.0          Max.   :52.00                  
##                                                                        
##   Service.time        Age        Work.load.Average.day   Hit.target    
##  Min.   : 1.00   Min.   : 0.00   Min.   :     0        Min.   : 81.00  
##  1st Qu.: 9.00   1st Qu.:31.00   1st Qu.:244387        1st Qu.: 92.00  
##  Median :13.00   Median :37.00   Median :264249        Median : 95.00  
##  Mean   :12.69   Mean   :36.35   Mean   :272668        Mean   : 94.41  
##  3rd Qu.:16.00   3rd Qu.:40.00   3rd Qu.:294217        3rd Qu.: 97.00  
##  Max.   :29.00   Max.   :58.00   Max.   :378884        Max.   :100.00  
##                                                        NA's   :1       
##  Disciplinary.failure   Education          Son       Social.drinker  
##  Min.   :0.00000      Min.   :1.000   Min.   :0.00   Min.   :0.0000  
##  1st Qu.:0.00000      1st Qu.:1.000   1st Qu.:0.00   1st Qu.:0.0000  
##  Median :0.00000      Median :1.000   Median :1.00   Median :1.0000  
##  Mean   :0.05414      Mean   :1.247   Mean   :1.03   Mean   :0.5835  
##  3rd Qu.:0.00000      3rd Qu.:1.000   3rd Qu.:2.00   3rd Qu.:1.0000  
##  Max.   :1.00000      Max.   :4.000   Max.   :4.00   Max.   :1.0000  
##                                                                      
##  Social.smoker          Pet             Weight          Height     
##  Min.   :0.00000   Min.   :0.0000   Min.   : 56.0   Min.   :163.0  
##  1st Qu.:0.00000   1st Qu.:0.0000   1st Qu.: 69.0   1st Qu.:169.0  
##  Median :0.00000   Median :0.0000   Median : 83.0   Median :170.0  
##  Mean   :0.06917   Mean   :0.6917   Mean   : 79.2   Mean   :171.9  
##  3rd Qu.:0.00000   3rd Qu.:1.0000   3rd Qu.: 89.0   3rd Qu.:172.0  
##  Max.   :1.00000   Max.   :8.0000   Max.   :108.0   Max.   :196.0  
##                                     NA's   :2                      
##  Body.mass.index Absenteeism.time.in.hours
##  Min.   :19.00   Min.   :  0.000          
##  1st Qu.:24.00   1st Qu.:  2.000          
##  Median :25.00   Median :  3.000          
##  Mean   :26.81   Mean   :  6.758          
##  3rd Qu.:31.00   3rd Qu.:  8.000          
##  Max.   :38.00   Max.   :120.000          
## 
table(absent_train$Absenteeism.time.in.hours)
## 
##   0   1   2   3   4   5   7   8  16  24  32  40  56  64  80 104 112 120 
##  37  84 138 100  56   7   1 185  18  15   6   7   2   2   2   1   2   2
#simplify dv column name
colnames(absent_train)[21]<-"hours"

#remove unreasonable observations
#reason for absence should not be 0 since it is not a valid code
absent_train <- absent_train[absent_train$Reason.for.absence != 0, ]
#age cannot be 0
absent_train <- absent_train[absent_train$Age != 0, ]
#average daily workload cannot be 0: there is nothing to be absent from without a workload
absent_train <- absent_train[absent_train$Work.load.Average.day != 0, ]
#remove observations with NA values
absent_train <- absent_train[!is.na(absent_train$Hit.target), ]
absent_train <- absent_train[!is.na(absent_train$Weight), ]


#check the data again
summary(absent_train)
##        ID        Reason.for.absence Month.of.absence Day.of.the.week
##  Min.   : 1.00   Min.   : 1.00      Min.   : 1.000   Min.   :2.000  
##  1st Qu.: 7.00   1st Qu.:14.00      1st Qu.: 3.000   1st Qu.:3.000  
##  Median :18.00   Median :23.00      Median : 7.000   Median :4.000  
##  Mean   :17.65   Mean   :20.59      Mean   : 6.359   Mean   :3.886  
##  3rd Qu.:28.00   3rd Qu.:27.00      3rd Qu.:10.000   3rd Qu.:5.000  
##  Max.   :36.00   Max.   :28.00      Max.   :12.000   Max.   :6.000  
##     Seasons      Transportation.expense Distance.from.Residence.to.Work
##  Min.   :1.000   Min.   :  0            Min.   : 5.0                   
##  1st Qu.:2.000   1st Qu.:179            1st Qu.:17.0                   
##  Median :2.000   Median :225            Median :26.0                   
##  Mean   :2.513   Mean   :221            Mean   :30.5                   
##  3rd Qu.:4.000   3rd Qu.:260            3rd Qu.:50.0                   
##  Max.   :4.000   Max.   :388            Max.   :52.0                   
##   Service.time        Age        Work.load.Average.day   Hit.target    
##  Min.   : 1.00   Min.   :27.00   Min.   :    12        Min.   : 81.00  
##  1st Qu.: 9.00   1st Qu.:31.00   1st Qu.:244387        1st Qu.: 93.00  
##  Median :13.00   Median :37.00   Median :264249        Median : 95.00  
##  Mean   :12.68   Mean   :36.26   Mean   :272619        Mean   : 94.57  
##  3rd Qu.:16.00   3rd Qu.:40.00   3rd Qu.:302585        3rd Qu.: 97.00  
##  Max.   :29.00   Max.   :58.00   Max.   :378884        Max.   :100.00  
##  Disciplinary.failure   Education          Son        Social.drinker  
##  Min.   :0            Min.   :1.000   Min.   :0.000   Min.   :0.0000  
##  1st Qu.:0            1st Qu.:1.000   1st Qu.:0.000   1st Qu.:0.0000  
##  Median :0            Median :1.000   Median :1.000   Median :1.0000  
##  Mean   :0            Mean   :1.253   Mean   :1.013   Mean   :0.5785  
##  3rd Qu.:0            3rd Qu.:1.000   3rd Qu.:2.000   3rd Qu.:1.0000  
##  Max.   :0            Max.   :4.000   Max.   :4.000   Max.   :1.0000  
##  Social.smoker         Pet             Weight        Height     
##  Min.   :0.0000   Min.   :0.0000   Min.   : 56   Min.   :163.0  
##  1st Qu.:0.0000   1st Qu.:0.0000   1st Qu.: 69   1st Qu.:169.0  
##  Median :0.0000   Median :0.0000   Median : 83   Median :170.0  
##  Mean   :0.0625   Mean   :0.6843   Mean   : 79   Mean   :171.9  
##  3rd Qu.:0.0000   3rd Qu.:1.0000   3rd Qu.: 89   3rd Qu.:172.0  
##  Max.   :1.0000   Max.   :8.0000   Max.   :108   Max.   :196.0  
##  Body.mass.index     hours        
##  Min.   :19.00   Min.   :  0.000  
##  1st Qu.:24.00   1st Qu.:  2.000  
##  Median :25.00   Median :  3.000  
##  Mean   :26.74   Mean   :  7.155  
##  3rd Qu.:31.00   3rd Qu.:  8.000  
##  Max.   :38.00   Max.   :120.000
table(absent_train$hours)
## 
##   0   1   2   3   4   5   7   8  16  24  32  40  56  64  80 104 112 120 
##   1  84 137  99  56   7   1 182  18  15   6   7   2   2   2   1   2   2
#after checking the data, we found every value in the Disciplinary.failure column equals 0,
#so we drop it since a constant column cannot help classification; we also drop the ID column
#only one observation has 0 hours of absence, so we drop it as an outlier; after binning,
#the dependent variable will then have exactly two levels
absent_train <- absent_train[, -c(1, 12)]
absent_train <- absent_train[absent_train$hours != 0, ]

#convert the dv into a categorical variable: "low" for 1-6 hours, "high" otherwise
for (i in 1:nrow(absent_train)) {
    if (absent_train$hours[i] <= 6 & absent_train$hours[i] > 0) {
        absent_train$hours[i] = "low"
    } else {
        absent_train$hours[i] = "high"
    }
}
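#note: an equivalent vectorized alternative (a sketch; it would replace the loop above,
#not follow it) bins the numeric hours in a single step:
#   absent_train$hours <- ifelse(absent_train$hours <= 6, "low", "high")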
table(absent_train$hours)
## 
## high  low 
##  187  436
#factorize a few columns
cols_fac <- c(1,2,3,4,11,13,14,19)
absent_train[,cols_fac] <- lapply(absent_train[,cols_fac], as.factor)
sapply(absent_train, class)
##              Reason.for.absence                Month.of.absence 
##                        "factor"                        "factor" 
##                 Day.of.the.week                         Seasons 
##                        "factor"                        "factor" 
##          Transportation.expense Distance.from.Residence.to.Work 
##                       "integer"                       "integer" 
##                    Service.time                             Age 
##                       "integer"                       "integer" 
##           Work.load.Average.day                      Hit.target 
##                       "integer"                       "integer" 
##                       Education                             Son 
##                        "factor"                       "integer" 
##                  Social.drinker                   Social.smoker 
##                        "factor"                        "factor" 
##                             Pet                          Weight 
##                       "integer"                       "integer" 
##                          Height                 Body.mass.index 
##                       "integer"                       "integer" 
##                           hours 
##                        "factor"
#normalize the data
normalize <- function(x) {
    return ((x - min(x)) / (max(x) - min(x)))
}
col_nor<-c(5,6,7,8,9,10,12,15,16,17,18)
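#optional (a sketch): record the training-set ranges before normalizing, so the test set
#could later be rescaled with the same minima and maxima instead of its own; train_ranges
#is a name introduced here only for illustration and is not used further below
train_ranges <- lapply(absent_train[col_nor], range)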
absent_train[col_nor] <- as.data.frame(lapply(absent_train[col_nor], normalize))

#check for existence of highly correlated variables
#check correlation between numerical variables
m <- cor(absent_train[,col_nor])
corrplot(m, order = "hclust", tl.srt = 30, tl.col = "black", addrect = 3, method = "number" )

#BMI and Weight are highly correlated, so we drop Weight (the removal happens below,
#together with the redundant categorical columns)
#check correlation between categorical variables
absent_fac <- absent_train[,c(1,2,3,4,11,13,14)]
chisq_result <- matrix(0L, nrow = 7, ncol = 7)
row.names(chisq_result)<-colnames(absent_fac)
colnames(chisq_result)<-colnames(absent_fac)
for(i in 1:ncol(absent_fac)) {
    for(j in 1:ncol(absent_fac)){
        chisq_result[i,j] <- (chisq.test(absent_fac[,i],absent_fac[,j]))$p.value
    }
}
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
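#the warning above is emitted once for most of the variable pairs: several contingency
#tables have small expected cell counts, so the chi-squared approximation is questionable
#there; a Monte Carlo version of the test (a sketch, not run here) avoids that approximation:
#   chisq.test(absent_fac[,i], absent_fac[,j], simulate.p.value = TRUE, B = 2000)$p.value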
#the chi-square tests suggest every categorical variable is strongly associated with
#Reason.for.absence except Day.of.the.week
#so among the categorical variables we keep only Reason.for.absence and Day.of.the.week;
#we also drop Weight here (columns removed: Month.of.absence, Seasons, Education,
#Social.drinker, Social.smoker, Weight)
col_remove <- c(2, 4, 11, 13, 14, 16)
absent_train <- absent_train[, -col_remove]






## apply the same preprocessing pipeline to the test set
absent_test<-data.frame(read.csv("Absenteeism_at_work_test.csv"))
summary(absent_test)
##        ID        Reason.for.absence Month.of.absence Day.of.the.week
##  Min.   : 1.00   Min.   : 0.00      Min.   :0.00     Min.   :2.000  
##  1st Qu.:14.00   1st Qu.:10.00      1st Qu.:5.00     1st Qu.:3.000  
##  Median :22.00   Median :19.00      Median :5.00     Median :4.000  
##  Mean   :21.11   Mean   :16.93      Mean   :5.27     Mean   :4.108  
##  3rd Qu.:28.75   3rd Qu.:25.00      3rd Qu.:6.00     3rd Qu.:5.000  
##  Max.   :36.00   Max.   :28.00      Max.   :7.00     Max.   :6.000  
##     Seasons      Transportation.expense Distance.from.Residence.to.Work
##  Min.   :1.000   Min.   :118.0          Min.   :10.00                  
##  1st Qu.:1.250   1st Qu.:155.0          1st Qu.:13.00                  
##  Median :3.000   Median :207.0          Median :21.00                  
##  Mean   :2.473   Mean   :205.8          Mean   :22.97                  
##  3rd Qu.:3.000   3rd Qu.:235.0          3rd Qu.:26.00                  
##  Max.   :3.000   Max.   :378.0          Max.   :52.00                  
##   Service.time        Age        Work.load.Average.day   Hit.target   
##  Min.   : 1.00   Min.   :28.00   Min.   :237656        Min.   :91.00  
##  1st Qu.: 9.00   1st Qu.:30.25   1st Qu.:237656        1st Qu.:93.00  
##  Median :10.50   Median :36.50   Median :246288        Median :96.00  
##  Mean   :11.24   Mean   :36.82   Mean   :253931        Mean   :96.23  
##  3rd Qu.:14.00   3rd Qu.:40.00   3rd Qu.:274122        3rd Qu.:99.00  
##  Max.   :24.00   Max.   :58.00   Max.   :275089        Max.   :99.00  
##  Disciplinary.failure   Education          Son         Social.drinker  
##  Min.   :0.00000      Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.00000      1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.00000      Median :1.000   Median :1.0000   Median :0.0000  
##  Mean   :0.05405      Mean   :1.703   Mean   :0.9324   Mean   :0.4189  
##  3rd Qu.:0.00000      3rd Qu.:3.000   3rd Qu.:2.0000   3rd Qu.:1.0000  
##  Max.   :1.00000      Max.   :4.000   Max.   :3.0000   Max.   :1.0000  
##  Social.smoker         Pet            Weight           Height     
##  Min.   :0.0000   Min.   :0.000   Min.   : 56.00   Min.   :163.0  
##  1st Qu.:0.0000   1st Qu.:0.000   1st Qu.: 68.00   1st Qu.:171.0  
##  Median :0.0000   Median :0.000   Median : 75.00   Median :172.0  
##  Mean   :0.1081   Mean   :1.243   Mean   : 77.38   Mean   :174.3  
##  3rd Qu.:0.0000   3rd Qu.:2.000   3rd Qu.: 88.00   3rd Qu.:178.0  
##  Max.   :1.0000   Max.   :8.000   Max.   :106.00   Max.   :196.0  
##  Body.mass.index Absenteeism.time.in.hours
##  Min.   :19.00   Min.   :  0.000          
##  1st Qu.:22.00   1st Qu.:  2.000          
##  Median :25.00   Median :  3.000          
##  Mean   :25.43   Mean   :  8.473          
##  3rd Qu.:28.00   3rd Qu.:  8.000          
##  Max.   :38.00   Max.   :120.000
colnames(absent_test)[21]<-"hours"

#remove unreasonable observations
#reason for absence should not be 0 since it is not a valid code
absent_test <- absent_test[absent_test$Reason.for.absence != 0, ]
#month of absence cannot be 0
absent_test <- absent_test[absent_test$Month.of.absence != 0, ]

#check the data again
summary(absent_test)
##        ID        Reason.for.absence Month.of.absence Day.of.the.week
##  Min.   : 1.00   Min.   : 5.0       Min.   :4.000    Min.   :2.000  
##  1st Qu.:14.00   1st Qu.:12.0       1st Qu.:5.000    1st Qu.:3.000  
##  Median :22.00   Median :19.0       Median :5.000    Median :4.000  
##  Mean   :21.39   Mean   :18.7       Mean   :5.493    Mean   :4.179  
##  3rd Qu.:28.50   3rd Qu.:26.0       3rd Qu.:6.000    3rd Qu.:5.000  
##  Max.   :36.00   Max.   :28.0       Max.   :7.000    Max.   :6.000  
##     Seasons      Transportation.expense Distance.from.Residence.to.Work
##  Min.   :1.000   Min.   :118.0          Min.   :10.00                  
##  1st Qu.:1.000   1st Qu.:155.0          1st Qu.:13.00                  
##  Median :3.000   Median :189.0          Median :20.00                  
##  Mean   :2.463   Mean   :204.6          Mean   :22.45                  
##  3rd Qu.:3.000   3rd Qu.:235.0          3rd Qu.:26.00                  
##  Max.   :3.000   Max.   :378.0          Max.   :52.00                  
##   Service.time        Age        Work.load.Average.day   Hit.target   
##  Min.   : 1.00   Min.   :28.00   Min.   :237656        Min.   :91.00  
##  1st Qu.: 9.00   1st Qu.:30.00   1st Qu.:237656        1st Qu.:93.00  
##  Median :10.00   Median :34.00   Median :246288        Median :96.00  
##  Mean   :11.07   Mean   :36.21   Mean   :253011        Mean   :96.21  
##  3rd Qu.:14.00   3rd Qu.:40.00   3rd Qu.:269847        3rd Qu.:99.00  
##  Max.   :24.00   Max.   :58.00   Max.   :275089        Max.   :99.00  
##  Disciplinary.failure   Education          Son         Social.drinker  
##  Min.   :0            Min.   :1.000   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0            1st Qu.:1.000   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0            Median :1.000   Median :1.0000   Median :0.0000  
##  Mean   :0            Mean   :1.761   Mean   :0.9104   Mean   :0.4179  
##  3rd Qu.:0            3rd Qu.:3.000   3rd Qu.:2.0000   3rd Qu.:1.0000  
##  Max.   :0            Max.   :4.000   Max.   :3.0000   Max.   :1.0000  
##  Social.smoker          Pet            Weight           Height     
##  Min.   :0.00000   Min.   :0.000   Min.   : 56.00   Min.   :167.0  
##  1st Qu.:0.00000   1st Qu.:0.000   1st Qu.: 66.50   1st Qu.:171.0  
##  Median :0.00000   Median :0.000   Median : 75.00   Median :172.0  
##  Mean   :0.08955   Mean   :1.075   Mean   : 76.43   Mean   :174.5  
##  3rd Qu.:0.00000   3rd Qu.:1.000   3rd Qu.: 86.00   3rd Qu.:178.0  
##  Max.   :1.00000   Max.   :8.000   Max.   :106.00   Max.   :196.0  
##  Body.mass.index     hours        
##  Min.   :19.00   Min.   :  1.000  
##  1st Qu.:22.00   1st Qu.:  2.000  
##  Median :25.00   Median :  3.000  
##  Mean   :25.04   Mean   :  9.358  
##  3rd Qu.:28.00   3rd Qu.:  8.000  
##  Max.   :38.00   Max.   :120.000
table(absent_test$hours)
## 
##   1   2   3   4   8  16  24  48  64  80 120 
##   4  19  11   4  23   1   1   1   1   1   1
#as in the training set, every value of Disciplinary.failure equals 0, so we drop that
#column along with the ID column
absent_test <- absent_test[, -c(1, 12)]

#convert the dv into a categorical variable: "low" for 1-6 hours, "high" otherwise
for (i in 1:nrow(absent_test)) {
    if (absent_test$hours[i] <= 6 & absent_test$hours[i] > 0) {
        absent_test$hours[i] = "low"
    } else {
        absent_test$hours[i] = "high"
    }
}
table(absent_test$hours)
## 
## high  low 
##   25   42
#factorize a few columns
absent_test[,cols_fac] <- lapply(absent_test[,cols_fac], as.factor)
sapply(absent_test, class)
##              Reason.for.absence                Month.of.absence 
##                        "factor"                        "factor" 
##                 Day.of.the.week                         Seasons 
##                        "factor"                        "factor" 
##          Transportation.expense Distance.from.Residence.to.Work 
##                       "integer"                       "integer" 
##                    Service.time                             Age 
##                       "integer"                       "integer" 
##           Work.load.Average.day                      Hit.target 
##                       "integer"                       "integer" 
##                       Education                             Son 
##                        "factor"                       "integer" 
##                  Social.drinker                   Social.smoker 
##                        "factor"                        "factor" 
##                             Pet                          Weight 
##                       "integer"                       "integer" 
##                          Height                 Body.mass.index 
##                       "integer"                       "integer" 
##                           hours 
##                        "factor"
#normalize the data
absent_test[col_nor] <- as.data.frame(lapply(absent_test[col_nor], normalize))
#remove the same columns as in the training set
absent_test<-absent_test[,-col_remove]

Model type 1: decision tree / random forest

#decision tree model
set.seed(20201205)
absent_c50 <- C5.0(x = absent_train[-13], y = absent_train$hours)
summary(absent_c50)
## 
## Call:
## C5.0.default(x = absent_train[-13], y = absent_train$hours)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sat Apr 03 15:53:34 2021
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 623 cases (13 attributes) from undefined.data
## 
## Decision tree:
## 
## Reason.for.absence in {1,3,4,5,6,8,10,11,13,15,17,18,19,21,22,24,
## :                      26}: high (244/85)
## Reason.for.absence in {2,7,9,12,14,16,23,25,27,28}: low (379/28)
## 
## 
## Evaluation on training data (623 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2  113(18.1%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     159    28    (a): class high
##      85   351    (b): class low
## 
## 
##  Attribute usage:
## 
##  100.00% Reason.for.absence
## 
## 
## Time: 0.0 secs
#prediction in test set
absent_pred <- predict(absent_c50, absent_test)
(p <- table(absent_pred, absent_test$hours))
##            
## absent_pred high low
##        high   20  19
##        low     5  23
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 64.1791
#we decided a regression tree does not fit this case now that the dv is binary, so we skipped it

#random forest model
set.seed(20201205)
absent_randomforest <- randomForest(hours ~ . , data = absent_train, ntree = 1200)
#the two lines below work around a factor-level mismatch between the training and test sets
#(rbinding one training row forces the test factors onto the training levels); the trick was
#found on Stack Overflow
absent_test <- rbind(absent_train[1, ] , absent_test)
absent_test <- absent_test[-1,]
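#a more explicit alternative (a sketch; note it would map any test-only factor levels to NA)
#is to re-level the test factors with the training levels directly:
#   for (col in c("Reason.for.absence", "Day.of.the.week")) {
#     absent_test[[col]] <- factor(absent_test[[col]], levels = levels(absent_train[[col]]))
#   }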
#prediction on test set
absent_pred <- predict(absent_randomforest, absent_test)
(p <- table(absent_pred, absent_test$hours))
##            
## absent_pred high low
##        high    9   5
##        low    16  37
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 68.65672
#check variable importance
varImpPlot(absent_randomforest)

#Reason.for.absence is the most important variable, so let's analyze which reasons account
#for the most absence hours
#we reload the raw training data, since hours has already been binned in absent_train
absent_plot<-data.frame(read.csv("Absenteeism_at_work_train.csv"))
colnames(absent_plot)[21]<-"hours"
absent_plot <- absent_plot[absent_plot$Reason.for.absence != 0, ]
absent_plot <- absent_plot[absent_plot$Age != 0, ]
absent_plot <- absent_plot[absent_plot$Work.load.Average.day != 0, ]
absent_plot <- absent_plot[!is.na(absent_plot$Hit.target), ]
absent_plot <- absent_plot[!is.na(absent_plot$Weight), ]
#aggregate hours for the same reason
absent_reason <- data.frame(aggregate(absent_plot$hours, by=list(Category = absent_plot$Reason.for.absence), FUN=sum))
absent_reason <- absent_reason[order(absent_reason$x, decreasing = T),]
#plot the reason vs hours
ggplot(data = absent_reason, aes(reorder(Category, x), x)) +
  geom_col(aes(fill = x)) +
  scale_fill_gradient2(low = "white",
                       high = "red3",
                       midpoint = median(absent_reason$x)) +
  coord_flip() +
  labs(x = 'Reason for absence',
       y = 'Aggregated hours',
       title = 'Aggregated hours for each reason')

We can conclude that the accuracy of the random forest (68.66%) is higher than that of the decision tree (64.18%). Among all the reasons, reason #13 (diseases of the musculoskeletal system and connective tissue), #19 (injury, poisoning and certain other consequences of external causes) and #23 (medical consultation) account for the largest share of absence hours.

Model type 2: naive Bayes and SVM

#give naive bayes a try
absent_nb <- naive_bayes(hours ~ ., data = absent_train)
## Warning: naive_bayes(): Feature Reason.for.absence - zero probabilities are
## present. Consider Laplace smoothing.
absent_nb
## 
## ================================== Naive Bayes ================================== 
##  
##  Call: 
## naive_bayes.formula(formula = hours ~ ., data = absent_train)
## 
## --------------------------------------------------------------------------------- 
##  
## Laplace smoothing: 0
## 
## --------------------------------------------------------------------------------- 
##  
##  A priori probabilities: 
## 
##      high       low 
## 0.3001605 0.6998395 
## 
## --------------------------------------------------------------------------------- 
##  
##  Tables: 
## 
## --------------------------------------------------------------------------------- 
##  ::: Reason.for.absence (Categorical) 
## --------------------------------------------------------------------------------- 
##                   
## Reason.for.absence        high         low
##                 1  0.069518717 0.006880734
##                 2  0.000000000 0.002293578
##                 3  0.005347594 0.000000000
##                 4  0.005347594 0.002293578
##                 5  0.010695187 0.000000000
##                 6  0.026737968 0.000000000
##                 7  0.016042781 0.022935780
##                 8  0.016042781 0.002293578
##                 9  0.005347594 0.006880734
##                 10 0.058823529 0.013761468
##                 11 0.053475936 0.032110092
##                 12 0.000000000 0.018348624
##                 13 0.117647059 0.064220183
##                 14 0.021390374 0.027522936
##                 15 0.010695187 0.000000000
##                 16 0.000000000 0.002293578
##                 17 0.005347594 0.000000000
##                 18 0.090909091 0.009174312
##                 19 0.085561497 0.034403670
##                 21 0.021390374 0.004587156
##                 22 0.139037433 0.006880734
##                 23 0.053475936 0.307339450
##                 24 0.016042781 0.000000000
##                 25 0.021390374 0.057339450
##                 26 0.117647059 0.018348624
##                 27 0.000000000 0.133027523
##                 28 0.032085561 0.227064220
## 
## --------------------------------------------------------------------------------- 
##  ::: Day.of.the.week (Categorical) 
## --------------------------------------------------------------------------------- 
##                
## Day.of.the.week      high       low
##               2 0.2620321 0.2133028
##               3 0.2513369 0.1949541
##               4 0.2032086 0.1972477
##               5 0.1176471 0.1857798
##               6 0.1657754 0.2087156
## 
## --------------------------------------------------------------------------------- 
##  ::: Transportation.expense (Gaussian) 
## --------------------------------------------------------------------------------- 
##                       
## Transportation.expense      high       low
##                   mean 0.6369976 0.5412076
##                   sd   0.1905748 0.1508206
## 
## --------------------------------------------------------------------------------- 
##  ::: Distance.from.Residence.to.Work (Gaussian) 
## --------------------------------------------------------------------------------- 
##                                
## Distance.from.Residence.to.Work      high       low
##                            mean 0.5324838 0.5479211
##                            sd   0.3161333 0.3178227
## 
## --------------------------------------------------------------------------------- 
##  ::: Service.time (Gaussian) 
## --------------------------------------------------------------------------------- 
##             
## Service.time      high       low
##         mean 0.3930481 0.4275885
##         sd   0.1570563 0.1539578
## 
## ---------------------------------------------------------------------------------
## 
## # ... and 7 more tables
## 
## ---------------------------------------------------------------------------------
(p_train<-table(predict(absent_nb, absent_train), absent_train$hours))
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
##       
##        high low
##   high  119  43
##   low    68 393
(accuracy_training <- sum(diag(p_train)/sum(p_train)*100))
## [1] 82.18299
(p_test <- table(predict(absent_nb, absent_test), absent_test$hours))
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
##       
##        high low
##   high   19  12
##   low     6  30
(accuracy <- sum(diag(p_test))/sum(p_test)*100)
## [1] 73.13433
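#the zero-probability warning during training suggests Laplace smoothing; a quick variant
#to try (a sketch, output not shown here) would be:
#   absent_nb_smooth <- naive_bayes(hours ~ ., data = absent_train, laplace = 1)
#   table(predict(absent_nb_smooth, absent_test), absent_test$hours)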
#not much improvement, let's try ksvm

#svm with linear kernel
absent_svm <- ksvm(hours ~ ., data = absent_train, kernel = 'vanilladot')
##  Setting default kernel parameters
summary(absent_svm)
## Length  Class   Mode 
##      1   ksvm     S4
absent_svm_predictions <- predict(absent_svm, absent_test) 

(p <- table(absent_svm_predictions,absent_test$hours))
##                       
## absent_svm_predictions high low
##                   high   19  14
##                   low     6  28
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 70.14925
#svm with polynomial kernel
absent_svm_1 <- ksvm(hours ~ ., data = absent_train, kernel = 'polydot')
##  Setting default kernel parameters
summary(absent_svm_1)
## Length  Class   Mode 
##      1   ksvm     S4
absent_svm_predictions_1 <- predict(absent_svm_1, absent_test) 

(p <- table(absent_svm_predictions_1,absent_test$hours))
##                         
## absent_svm_predictions_1 high low
##                     high   19  14
##                     low     6  28
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 70.14925
#gaussian kernel
absent_svm_2 <- ksvm(hours ~ ., data = absent_train, kernel = 'rbfdot')
summary(absent_svm_2)
## Length  Class   Mode 
##      1   ksvm     S4
absent_svm_predictions_2 <- predict(absent_svm_2, absent_test) 

(p <- table(absent_svm_predictions_2,absent_test$hours))
##                         
## absent_svm_predictions_2 high low
##                     high    4   2
##                     low    21  40
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 65.67164

The best accuracy among these models belongs to the naive Bayes model, at 73.13%.

Model type 3: KNN

KNN is a distance-based method that only applies to numerical variables, while the most important variable, Reason.for.absence, is categorical. So in this part we remove the remaining categorical variables and look at how the numerical variables alone affect the absence hours, i.e. which non-medical factors are important for absence.

#build numerical-only versions of the train and test sets (drop the remaining categorical columns)
absent_knn_train <- absent_train[,-c(1,2)]
absent_knn_test <- absent_test[,-c(1,2)]

#build the model
absent_knn_train_labels <- absent_knn_train[, 11]
absent_knn_test_labels <- absent_knn_test[, 11]
#set the k to sqrt(623) which is around 25
absent_knn_test_pred <- knn(train = absent_knn_train[,-11], test = absent_knn_test[-11], 
                            cl = absent_knn_train_labels, k = 25)
(abs_tbl <- table(absent_knn_test_pred, absent_knn_test_labels))
##                     absent_knn_test_labels
## absent_knn_test_pred high low
##                 high    8   2
##                 low    17  40
(Accuracy <- (abs_tbl[1]+abs_tbl[4])/sum(abs_tbl)*100)
## [1] 71.64179
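#optional (a sketch, output not shown here): check how sensitive KNN is to the choice of k
#by scanning a few values around sqrt(n)
#   sapply(c(5, 15, 25, 35, 45), function(k) {
#     pred <- knn(train = absent_knn_train[,-11], test = absent_knn_test[,-11],
#                 cl = absent_knn_train_labels, k = k)
#     mean(pred == absent_knn_test_labels) * 100
#   })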

The KNN model achieves an accuracy of 71.64%.

Conclusion

Based on all the methods, the highest accuracy belongs to the naive Bayes model, but we decided to choose the random forest. First, the differences in accuracy are not large, so there is no clearly “best” model. Second, the random forest result makes more sense because it is consistent with common sense. If a person gets a bone fracture or a knee injury, or gets poisoned, he or she will probably just stay at home for at least the first few days, even without being on vacation. Beyond that, if someone simply feels unwell, especially when the symptoms persist, and happens to find a time slot for a doctor’s appointment, they will likely go for a consultation without giving the company much notice. What I really want to emphasize is that the US, as a developed nation, still has a long way to go in improving social benefits such as paid days off, sick leave, and unemployment insurance. Compared to Canada, US residents face higher medical costs and are more afraid of losing their jobs because of absences, let alone compared with Northern European countries. Poverty is not a crime, nor a consequence of laziness, irresponsibility, or lack of intelligence; it is a social welfare issue that can affect anyone. Thus, the government needs to focus more on improving workers’ benefits instead of just calling for “law and order”.