##read the train data
absent_train<-data.frame(read.csv("Absenteeism_at_work_train.csv"))
summary(absent_train)
## ID Reason.for.absence Month.of.absence Day.of.the.week
## Min. : 1.0 Min. : 0.00 Min. : 1.000 Min. :2.000
## 1st Qu.: 7.0 1st Qu.:13.00 1st Qu.: 3.000 1st Qu.:3.000
## Median :18.0 Median :23.00 Median : 7.000 Median :4.000
## Mean :17.7 Mean :19.46 Mean : 6.448 Mean :3.893
## 3rd Qu.:28.0 3rd Qu.:26.00 3rd Qu.:10.000 3rd Qu.:5.000
## Max. :36.0 Max. :28.00 Max. :12.000 Max. :6.000
##
## Seasons Transportation.expense Distance.from.Residence.to.Work
## Min. :1.000 Min. : 0.0 Min. : 5.00
## 1st Qu.:2.000 1st Qu.:179.0 1st Qu.:17.00
## Median :2.000 Median :225.0 Median :26.00
## Mean :2.553 Mean :222.9 Mean :30.34
## 3rd Qu.:4.000 3rd Qu.:260.0 3rd Qu.:50.00
## Max. :4.000 Max. :388.0 Max. :52.00
##
## Service.time Age Work.load.Average.day Hit.target
## Min. : 1.00 Min. : 0.00 Min. : 0 Min. : 81.00
## 1st Qu.: 9.00 1st Qu.:31.00 1st Qu.:244387 1st Qu.: 92.00
## Median :13.00 Median :37.00 Median :264249 Median : 95.00
## Mean :12.69 Mean :36.35 Mean :272668 Mean : 94.41
## 3rd Qu.:16.00 3rd Qu.:40.00 3rd Qu.:294217 3rd Qu.: 97.00
## Max. :29.00 Max. :58.00 Max. :378884 Max. :100.00
## NA's :1
## Disciplinary.failure Education Son Social.drinker
## Min. :0.00000 Min. :1.000 Min. :0.00 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.00 1st Qu.:0.0000
## Median :0.00000 Median :1.000 Median :1.00 Median :1.0000
## Mean :0.05414 Mean :1.247 Mean :1.03 Mean :0.5835
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:2.00 3rd Qu.:1.0000
## Max. :1.00000 Max. :4.000 Max. :4.00 Max. :1.0000
##
## Social.smoker Pet Weight Height
## Min. :0.00000 Min. :0.0000 Min. : 56.0 Min. :163.0
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.: 69.0 1st Qu.:169.0
## Median :0.00000 Median :0.0000 Median : 83.0 Median :170.0
## Mean :0.06917 Mean :0.6917 Mean : 79.2 Mean :171.9
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.: 89.0 3rd Qu.:172.0
## Max. :1.00000 Max. :8.0000 Max. :108.0 Max. :196.0
## NA's :2
## Body.mass.index Absenteeism.time.in.hours
## Min. :19.00 Min. : 0.000
## 1st Qu.:24.00 1st Qu.: 2.000
## Median :25.00 Median : 3.000
## Mean :26.81 Mean : 6.758
## 3rd Qu.:31.00 3rd Qu.: 8.000
## Max. :38.00 Max. :120.000
##
table(absent_train$Absenteeism.time.in.hours)
##
## 0 1 2 3 4 5 7 8 16 24 32 40 56 64 80 104 112 120
## 37 84 138 100 56 7 1 185 18 15 6 7 2 2 2 1 2 2
#simplify dv column name
colnames(absent_train)[21]<-"hours"
#remove unreasonable observations
#reason for absense should not be 0 since it's not a valid code
absent_train <- absent_train[!(absent_train$Reason.for.absence == 0) ,]
#age cannot be 0 since it make no sense
absent_train <- absent_train[!(absent_train$Age == 0) ,]
#work load per day cannot be 0 since it doesn't make sense to absent if you have no workload
absent_train <- absent_train[!(absent_train$Work.load.Average.day == 0) ,]
#remove NA value observations
absent_train <- absent_train[!(is.na(absent_train$Hit.target) == T) ,]
absent_train <- absent_train[!(is.na(absent_train$Weight) == T) ,]
#check the data again
summary(absent_train)
## ID Reason.for.absence Month.of.absence Day.of.the.week
## Min. : 1.00 Min. : 1.00 Min. : 1.000 Min. :2.000
## 1st Qu.: 7.00 1st Qu.:14.00 1st Qu.: 3.000 1st Qu.:3.000
## Median :18.00 Median :23.00 Median : 7.000 Median :4.000
## Mean :17.65 Mean :20.59 Mean : 6.359 Mean :3.886
## 3rd Qu.:28.00 3rd Qu.:27.00 3rd Qu.:10.000 3rd Qu.:5.000
## Max. :36.00 Max. :28.00 Max. :12.000 Max. :6.000
## Seasons Transportation.expense Distance.from.Residence.to.Work
## Min. :1.000 Min. : 0 Min. : 5.0
## 1st Qu.:2.000 1st Qu.:179 1st Qu.:17.0
## Median :2.000 Median :225 Median :26.0
## Mean :2.513 Mean :221 Mean :30.5
## 3rd Qu.:4.000 3rd Qu.:260 3rd Qu.:50.0
## Max. :4.000 Max. :388 Max. :52.0
## Service.time Age Work.load.Average.day Hit.target
## Min. : 1.00 Min. :27.00 Min. : 12 Min. : 81.00
## 1st Qu.: 9.00 1st Qu.:31.00 1st Qu.:244387 1st Qu.: 93.00
## Median :13.00 Median :37.00 Median :264249 Median : 95.00
## Mean :12.68 Mean :36.26 Mean :272619 Mean : 94.57
## 3rd Qu.:16.00 3rd Qu.:40.00 3rd Qu.:302585 3rd Qu.: 97.00
## Max. :29.00 Max. :58.00 Max. :378884 Max. :100.00
## Disciplinary.failure Education Son Social.drinker
## Min. :0 Min. :1.000 Min. :0.000 Min. :0.0000
## 1st Qu.:0 1st Qu.:1.000 1st Qu.:0.000 1st Qu.:0.0000
## Median :0 Median :1.000 Median :1.000 Median :1.0000
## Mean :0 Mean :1.253 Mean :1.013 Mean :0.5785
## 3rd Qu.:0 3rd Qu.:1.000 3rd Qu.:2.000 3rd Qu.:1.0000
## Max. :0 Max. :4.000 Max. :4.000 Max. :1.0000
## Social.smoker Pet Weight Height
## Min. :0.0000 Min. :0.0000 Min. : 56 Min. :163.0
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.: 69 1st Qu.:169.0
## Median :0.0000 Median :0.0000 Median : 83 Median :170.0
## Mean :0.0625 Mean :0.6843 Mean : 79 Mean :171.9
## 3rd Qu.:0.0000 3rd Qu.:1.0000 3rd Qu.: 89 3rd Qu.:172.0
## Max. :1.0000 Max. :8.0000 Max. :108 Max. :196.0
## Body.mass.index hours
## Min. :19.00 Min. : 0.000
## 1st Qu.:24.00 1st Qu.: 2.000
## Median :25.00 Median : 3.000
## Mean :26.74 Mean : 7.155
## 3rd Qu.:31.00 3rd Qu.: 8.000
## Max. :38.00 Max. :120.000
table(absent_train$hours)
##
## 0 1 2 3 4 5 7 8 16 24 32 40 56 64 80 104 112 120
## 1 84 137 99 56 7 1 182 18 15 6 7 2 2 2 1 2 2
#after checking the data, we found all values for column disciplinary isequal to 0, so we should remove this column as it cannot be a good classifier & also remove ID column
#and also we found only 1 observation with 0 hour of absence, so we decided to remove this observation since it's consider an outlier, which means there will be only 2 values for dependent variables
absent_train<-absent_train[,-c(1,12)]
absent_train <- absent_train[!(absent_train$hours == 0) ,]
#change dv to categoricl value
for(i in 1:nrow(absent_train)) {
if(absent_train$hours[i] <= 6 & absent_train$hours[i] >0) {
absent_train$hours[i] = "low"
}
else {absent_train$hours[i] = "high"}
}
table(absent_train$hours)
##
## high low
## 187 436
#factorize a few columns
cols_fac <- c(1,2,3,4,11,13,14,19)
absent_train[,cols_fac] <- lapply(absent_train[,cols_fac], as.factor)
sapply(absent_train, class)
## Reason.for.absence Month.of.absence
## "factor" "factor"
## Day.of.the.week Seasons
## "factor" "factor"
## Transportation.expense Distance.from.Residence.to.Work
## "integer" "integer"
## Service.time Age
## "integer" "integer"
## Work.load.Average.day Hit.target
## "integer" "integer"
## Education Son
## "factor" "integer"
## Social.drinker Social.smoker
## "factor" "factor"
## Pet Weight
## "integer" "integer"
## Height Body.mass.index
## "integer" "integer"
## hours
## "factor"
#normalize the data
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
col_nor<-c(5,6,7,8,9,10,12,15,16,17,18)
absent_train[col_nor] <- as.data.frame(lapply(absent_train[col_nor], normalize))
#check for existence of highly correlated variables
#check correlation between numerical variables
m <- cor(absent_train[,col_nor])
corrplot(m, order = "hclust", tl.srt = 30, tl.col = "black", addrect = 3, method = "number" )
#BMI and weight are highly correlated, so we removed weight
#check correlation between categorical variables
absent_fac <- absent_train[,c(1,2,3,4,11,13,14)]
chisq_result <- matrix(0L, nrow = 7, ncol = 7)
row.names(chisq_result)<-colnames(absent_fac)
colnames(chisq_result)<-colnames(absent_fac)
for(i in 1:ncol(absent_fac)) {
for(j in 1:ncol(absent_fac)){
chisq_result[i,j] <- (chisq.test(absent_fac[,i],absent_fac[,j]))$p.value
}
}
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
## Warning in chisq.test(absent_fac[, i], absent_fac[, j]): Chi-squared
## approximation may be incorrect
#we can conclude that all variables are highly correlated with reason.for.absence except day.of.the.week
#so we removed all categorical variables except reason for absense & day of the week
col_remove <- c(2,4,11,13,14,16)
absent_train<-absent_train[,-col_remove]
##same pattern for test set
absent_test<-data.frame(read.csv("Absenteeism_at_work_test.csv"))
summary(absent_test)
## ID Reason.for.absence Month.of.absence Day.of.the.week
## Min. : 1.00 Min. : 0.00 Min. :0.00 Min. :2.000
## 1st Qu.:14.00 1st Qu.:10.00 1st Qu.:5.00 1st Qu.:3.000
## Median :22.00 Median :19.00 Median :5.00 Median :4.000
## Mean :21.11 Mean :16.93 Mean :5.27 Mean :4.108
## 3rd Qu.:28.75 3rd Qu.:25.00 3rd Qu.:6.00 3rd Qu.:5.000
## Max. :36.00 Max. :28.00 Max. :7.00 Max. :6.000
## Seasons Transportation.expense Distance.from.Residence.to.Work
## Min. :1.000 Min. :118.0 Min. :10.00
## 1st Qu.:1.250 1st Qu.:155.0 1st Qu.:13.00
## Median :3.000 Median :207.0 Median :21.00
## Mean :2.473 Mean :205.8 Mean :22.97
## 3rd Qu.:3.000 3rd Qu.:235.0 3rd Qu.:26.00
## Max. :3.000 Max. :378.0 Max. :52.00
## Service.time Age Work.load.Average.day Hit.target
## Min. : 1.00 Min. :28.00 Min. :237656 Min. :91.00
## 1st Qu.: 9.00 1st Qu.:30.25 1st Qu.:237656 1st Qu.:93.00
## Median :10.50 Median :36.50 Median :246288 Median :96.00
## Mean :11.24 Mean :36.82 Mean :253931 Mean :96.23
## 3rd Qu.:14.00 3rd Qu.:40.00 3rd Qu.:274122 3rd Qu.:99.00
## Max. :24.00 Max. :58.00 Max. :275089 Max. :99.00
## Disciplinary.failure Education Son Social.drinker
## Min. :0.00000 Min. :1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :1.000 Median :1.0000 Median :0.0000
## Mean :0.05405 Mean :1.703 Mean :0.9324 Mean :0.4189
## 3rd Qu.:0.00000 3rd Qu.:3.000 3rd Qu.:2.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :4.000 Max. :3.0000 Max. :1.0000
## Social.smoker Pet Weight Height
## Min. :0.0000 Min. :0.000 Min. : 56.00 Min. :163.0
## 1st Qu.:0.0000 1st Qu.:0.000 1st Qu.: 68.00 1st Qu.:171.0
## Median :0.0000 Median :0.000 Median : 75.00 Median :172.0
## Mean :0.1081 Mean :1.243 Mean : 77.38 Mean :174.3
## 3rd Qu.:0.0000 3rd Qu.:2.000 3rd Qu.: 88.00 3rd Qu.:178.0
## Max. :1.0000 Max. :8.000 Max. :106.00 Max. :196.0
## Body.mass.index Absenteeism.time.in.hours
## Min. :19.00 Min. : 0.000
## 1st Qu.:22.00 1st Qu.: 2.000
## Median :25.00 Median : 3.000
## Mean :25.43 Mean : 8.473
## 3rd Qu.:28.00 3rd Qu.: 8.000
## Max. :38.00 Max. :120.000
colnames(absent_test)[21]<-"hours"
#remove unreasonable observations
#reason for absense should not be 0 since it's not a valid code
absent_test <- absent_test[!(absent_test$Reason.for.absence == 0) ,]
#month of the year cannot be 0
absent_test <- absent_test[!(absent_test$Month.of.absence == 0) ,]
#check the data again
summary(absent_test)
## ID Reason.for.absence Month.of.absence Day.of.the.week
## Min. : 1.00 Min. : 5.0 Min. :4.000 Min. :2.000
## 1st Qu.:14.00 1st Qu.:12.0 1st Qu.:5.000 1st Qu.:3.000
## Median :22.00 Median :19.0 Median :5.000 Median :4.000
## Mean :21.39 Mean :18.7 Mean :5.493 Mean :4.179
## 3rd Qu.:28.50 3rd Qu.:26.0 3rd Qu.:6.000 3rd Qu.:5.000
## Max. :36.00 Max. :28.0 Max. :7.000 Max. :6.000
## Seasons Transportation.expense Distance.from.Residence.to.Work
## Min. :1.000 Min. :118.0 Min. :10.00
## 1st Qu.:1.000 1st Qu.:155.0 1st Qu.:13.00
## Median :3.000 Median :189.0 Median :20.00
## Mean :2.463 Mean :204.6 Mean :22.45
## 3rd Qu.:3.000 3rd Qu.:235.0 3rd Qu.:26.00
## Max. :3.000 Max. :378.0 Max. :52.00
## Service.time Age Work.load.Average.day Hit.target
## Min. : 1.00 Min. :28.00 Min. :237656 Min. :91.00
## 1st Qu.: 9.00 1st Qu.:30.00 1st Qu.:237656 1st Qu.:93.00
## Median :10.00 Median :34.00 Median :246288 Median :96.00
## Mean :11.07 Mean :36.21 Mean :253011 Mean :96.21
## 3rd Qu.:14.00 3rd Qu.:40.00 3rd Qu.:269847 3rd Qu.:99.00
## Max. :24.00 Max. :58.00 Max. :275089 Max. :99.00
## Disciplinary.failure Education Son Social.drinker
## Min. :0 Min. :1.000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0 Median :1.000 Median :1.0000 Median :0.0000
## Mean :0 Mean :1.761 Mean :0.9104 Mean :0.4179
## 3rd Qu.:0 3rd Qu.:3.000 3rd Qu.:2.0000 3rd Qu.:1.0000
## Max. :0 Max. :4.000 Max. :3.0000 Max. :1.0000
## Social.smoker Pet Weight Height
## Min. :0.00000 Min. :0.000 Min. : 56.00 Min. :167.0
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.: 66.50 1st Qu.:171.0
## Median :0.00000 Median :0.000 Median : 75.00 Median :172.0
## Mean :0.08955 Mean :1.075 Mean : 76.43 Mean :174.5
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.: 86.00 3rd Qu.:178.0
## Max. :1.00000 Max. :8.000 Max. :106.00 Max. :196.0
## Body.mass.index hours
## Min. :19.00 Min. : 1.000
## 1st Qu.:22.00 1st Qu.: 2.000
## Median :25.00 Median : 3.000
## Mean :25.04 Mean : 9.358
## 3rd Qu.:28.00 3rd Qu.: 8.000
## Max. :38.00 Max. :120.000
table(absent_test$hours)
##
## 1 2 3 4 8 16 24 48 64 80 120
## 4 19 11 4 23 1 1 1 1 1 1
#after checking the data, we found all values for column disciplinary isequal to 0, so we should remove this column as it cannot be a good classifier & also remove ID column
absent_test<-absent_test[,-c(1,12)]
#change dv to categoricl value
for(i in 1:nrow(absent_test)) {
if(absent_test$hours[i] <= 6 & absent_test$hours[i] >0) {
absent_test$hours[i] = "low"
}
else {absent_test$hours[i] = "high"}
}
table(absent_test$hours)
##
## high low
## 25 42
#factorize a few columns
absent_test[,cols_fac] <- lapply(absent_test[,cols_fac], as.factor)
sapply(absent_test, class)
## Reason.for.absence Month.of.absence
## "factor" "factor"
## Day.of.the.week Seasons
## "factor" "factor"
## Transportation.expense Distance.from.Residence.to.Work
## "integer" "integer"
## Service.time Age
## "integer" "integer"
## Work.load.Average.day Hit.target
## "integer" "integer"
## Education Son
## "factor" "integer"
## Social.drinker Social.smoker
## "factor" "factor"
## Pet Weight
## "integer" "integer"
## Height Body.mass.index
## "integer" "integer"
## hours
## "factor"
#normalize the data
absent_test[col_nor] <- as.data.frame(lapply(absent_test[col_nor], normalize))
#remove same columns
absent_test<-absent_test[,-col_remove]
#decision tree model
set.seed(20201205)
absent_c50 <- C5.0(x = absent_train[-13], y = absent_train$hours)
summary(absent_c50)
##
## Call:
## C5.0.default(x = absent_train[-13], y = absent_train$hours)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Apr 03 15:53:34 2021
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 623 cases (13 attributes) from undefined.data
##
## Decision tree:
##
## Reason.for.absence in {1,3,4,5,6,8,10,11,13,15,17,18,19,21,22,24,
## : 26}: high (244/85)
## Reason.for.absence in {2,7,9,12,14,16,23,25,27,28}: low (379/28)
##
##
## Evaluation on training data (623 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 113(18.1%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 159 28 (a): class high
## 85 351 (b): class low
##
##
## Attribute usage:
##
## 100.00% Reason.for.absence
##
##
## Time: 0.0 secs
#prediction in test set
absent_pred <- predict(absent_c50, absent_test)
(p <- table(absent_pred, absent_test$hours))
##
## absent_pred high low
## high 20 19
## low 5 23
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 64.1791
#We decided regression tree model does not fit this case, so we skipped it
#random forest model
set.seed(20201205)
absent_randomforest <- randomForest(hours ~ . , data = absent_train, ntree = 1200)
#belong 2 lines are written for a wired error, found wired solution on stack overlow :P
absent_test <- rbind(absent_train[1, ] , absent_test)
absent_test <- absent_test[-1,]
#prediction on test set
absent_pred <- predict(absent_randomforest, absent_test)
(p <- table(absent_pred, absent_test$hours))
##
## absent_pred high low
## high 9 5
## low 16 37
(accuracy <- sum(diag(p))/sum(p)*100)
## [1] 68.65672
#check variables importance
varImpPlot(absent_randomforest)
#reason for absence is the most important varibles among all of them, so let's analysis which one is the most frequent reason
#we have to get the raw data again
absent_plot<-data.frame(read.csv("Absenteeism_at_work_train.csv"))
colnames(absent_plot)[21]<-"hours"
absent_plot <- absent_plot[!(absent_plot$Reason.for.absence == 0) ,]
absent_plot <- absent_plot[!(absent_plot$Age == 0) ,]
absent_plot <- absent_plot[!(absent_plot$Work.load.Average.day == 0) ,]
absent_plot <- absent_plot[!(is.na(absent_plot$Hit.target) == T) ,]
absent_plot <- absent_plot[!(is.na(absent_plot$Weight) == T) ,]
#aggregate hours for the same reason
absent_reason <- data.frame(aggregate(absent_plot$hours, by=list(Category = absent_plot$Reason.for.absence), FUN=sum))
absent_reason <- absent_reason[order(absent_reason$x, decreasing = T),]
#plot the reason vs hours
ggplot(data = absent_reason, aes(reorder(Category,x), x)) +
geom_bar(stat="identity") +
geom_col(aes(fill = x)) +
scale_fill_gradient2(low = "white",
high = "red3",
midpoint = median(absent_reason$Category)) +
coord_flip() +
labs(x = 'Hours aggregated',
y = 'Reason for absence',
title = 'Aggregated hours for each reason'
)
We can conclude that accuracy of random forest (71.64%) is slightly higher than decision tree (64.18%). Among all the reasons, reason #13 (Diseases of the musculoskeletal system and connective tissue), #19 (Injury, poisoning and certain other consequences of external causes) and #23 (medical consultation) are the most important reasons for absence.
#give naive bayes a try
absent_nb <- naive_bayes(hours ~ ., data = absent_train)
## Warning: naive_bayes(): Feature Reason.for.absence - zero probabilities are
## present. Consider Laplace smoothing.
absent_nb
##
## ================================== Naive Bayes ==================================
##
## Call:
## naive_bayes.formula(formula = hours ~ ., data = absent_train)
##
## ---------------------------------------------------------------------------------
##
## Laplace smoothing: 0
##
## ---------------------------------------------------------------------------------
##
## A priori probabilities:
##
## high low
## 0.3001605 0.6998395
##
## ---------------------------------------------------------------------------------
##
## Tables:
##
## ---------------------------------------------------------------------------------
## ::: Reason.for.absence (Categorical)
## ---------------------------------------------------------------------------------
##
## Reason.for.absence high low
## 1 0.069518717 0.006880734
## 2 0.000000000 0.002293578
## 3 0.005347594 0.000000000
## 4 0.005347594 0.002293578
## 5 0.010695187 0.000000000
## 6 0.026737968 0.000000000
## 7 0.016042781 0.022935780
## 8 0.016042781 0.002293578
## 9 0.005347594 0.006880734
## 10 0.058823529 0.013761468
## 11 0.053475936 0.032110092
## 12 0.000000000 0.018348624
## 13 0.117647059 0.064220183
## 14 0.021390374 0.027522936
## 15 0.010695187 0.000000000
## 16 0.000000000 0.002293578
## 17 0.005347594 0.000000000
## 18 0.090909091 0.009174312
## 19 0.085561497 0.034403670
## 21 0.021390374 0.004587156
## 22 0.139037433 0.006880734
## 23 0.053475936 0.307339450
## 24 0.016042781 0.000000000
## 25 0.021390374 0.057339450
## 26 0.117647059 0.018348624
## 27 0.000000000 0.133027523
## 28 0.032085561 0.227064220
##
## ---------------------------------------------------------------------------------
## ::: Day.of.the.week (Categorical)
## ---------------------------------------------------------------------------------
##
## Day.of.the.week high low
## 2 0.2620321 0.2133028
## 3 0.2513369 0.1949541
## 4 0.2032086 0.1972477
## 5 0.1176471 0.1857798
## 6 0.1657754 0.2087156
##
## ---------------------------------------------------------------------------------
## ::: Transportation.expense (Gaussian)
## ---------------------------------------------------------------------------------
##
## Transportation.expense high low
## mean 0.6369976 0.5412076
## sd 0.1905748 0.1508206
##
## ---------------------------------------------------------------------------------
## ::: Distance.from.Residence.to.Work (Gaussian)
## ---------------------------------------------------------------------------------
##
## Distance.from.Residence.to.Work high low
## mean 0.5324838 0.5479211
## sd 0.3161333 0.3178227
##
## ---------------------------------------------------------------------------------
## ::: Service.time (Gaussian)
## ---------------------------------------------------------------------------------
##
## Service.time high low
## mean 0.3930481 0.4275885
## sd 0.1570563 0.1539578
##
## ---------------------------------------------------------------------------------
##
## # ... and 7 more tables
##
## ---------------------------------------------------------------------------------
(p_train<-table(predict(absent_nb, absent_train), absent_train$hours))
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
##
## high low
## high 119 43
## low 68 393
(accuracy_training <- sum(diag(p_train)/sum(p_train)*100))
## [1] 82.18299
(p_test <- table(predict(absent_nb, absent_test), absent_test$hours))
## Warning: predict.naive_bayes(): more features in the newdata are provided as
## there are probability tables in the object. Calculation is performed based on
## features to be found in the tables.
##
## high low
## high 19 12
## low 6 30
(accuracy <- sum(diag(p_test))/sum(p_test)*100)
## [1] 73.13433
#not much improvement, let's try ksvm
#svm with linear kernel
absent_svm <- ksvm(hours ~ ., data = absent_train, kernel = 'vanilladot')
## Setting default kernel parameters
summary(absent_svm)
## Length Class Mode
## 1 ksvm S4
absent_svm_predictions <- predict(absent_svm, absent_test)
(p <- table(absent_svm_predictions,absent_test$hours))
##
## absent_svm_predictions high low
## high 19 14
## low 6 28
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 70.14925
#svm with polynomial kernel
absent_svm_1 <- ksvm(hours ~ ., data = absent_train, kernel = 'polydot')
## Setting default kernel parameters
summary(absent_svm_1)
## Length Class Mode
## 1 ksvm S4
absent_svm_predictions_1 <- predict(absent_svm_1, absent_test)
(p <- table(absent_svm_predictions_1,absent_test$hours))
##
## absent_svm_predictions_1 high low
## high 19 14
## low 6 28
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 70.14925
#gaussian kernel
absent_svm_2 <- ksvm(hours ~ ., data = absent_train, kernel = 'rbfdot')
summary(absent_svm_2)
## Length Class Mode
## 1 ksvm S4
absent_svm_predictions_2 <- predict(absent_svm_2, absent_test)
(p <- table(absent_svm_predictions_2,absent_test$hours))
##
## absent_svm_predictions_2 high low
## high 4 2
## low 21 40
(Accuracy <- sum(diag(p))/sum(p)*100)
## [1] 65.67164
The best accuracy belongs to naive bayes method which is 73.13%.
K-means method only applies to non-categorical variables, and the most important variable is the categorical variable reason for absence. So in this part, we’ll remove all categorical variables and take a look at how other numerical variables will affect the absence hours, which means determine which are important non-medical reason for absence.
#make a specific dataset again for numerical-only variables
absent_knn_train <- absent_train[,-c(1,2)]
absent_knn_test <- absent_test[,-c(1,2)]
#build the model
absent_knn_train_labels <- absent_knn_train[, 11]
absent_knn_test_labels <- absent_knn_test[, 11]
#set the k to sqrt(623) which is around 25
absent_knn_test_pred <- knn(train = absent_knn_train[,-11], test = absent_knn_test[-11],
cl = absent_knn_train_labels, k = 25)
(abs_tbl <- table(absent_knn_test_pred, absent_knn_test_labels))
## absent_knn_test_labels
## absent_knn_test_pred high low
## high 8 2
## low 17 40
(Accuracy <- (abs_tbl[1]+abs_tbl[4])/sum(abs_tbl)*100)
## [1] 71.64179
The KNN method gave us accuracy of 71.64%.
Based on all methods, the highest accuracy belongs to naive Bayes analysis, but we decided to choose the random forest method. First, the difference of accuracy is not significant, which doesn’t really have the “best” one. Second, the random forest result make more sense since the result is inconsistent with our common sense. If a person got a bone fracture or a knee injury, or got poisoned, he/she might just stay at home, even if they’re not on vacation, at least for the first few days. Other than that, if someone just feel uncomfortable, especially when symptoms last long, and he/she happened to squeeze into a time window for a doctor’s interview, he/she will probably go for consultation without notice to his/her company. The thing I really want to emphasize is that the US, as a developed nation, has a long way to go for improve social benefit, such as paid day-off, sick leave and unemployment insurance, etc. Compared to Canada, the resident of US has a higher cost of medical expense and are more fearful to lose their job due to absence, not even mention about North European countries. Poverty is not a crime or a consequence of laziness, irresponsible or intelligence problem. It’s a social welfare issue that may affect everyone. Thus, the government needs to focus more on how to improve worker’s benefit instead of just calling “law and order”.