Type of linear model - should we use GLM?

cat('Range of values of deep sleep:' ,range(cleaned_sleep_df$Deep.Sleep.duration))
FALSE Range of values of deep sleep: 12 83
ggplot(cleaned_sleep_df, aes(x = Deep.Sleep.duration)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.5, fill = "blue", color = "black") +# Adjust binwidth as needed
  geom_density(alpha = 0.2, fill = "#FF6666") +
  ggtitle("Distribution of Deep Sleep Duration") +
  xlab("Deep Sleep Duration (mins)") +
  ylab("Density") +
  xlim(c(0, max(cleaned_sleep_df$Deep.Sleep.duration)+10))
Distribution of deep sleep duration

Distribution of deep sleep duration

The outcome distribution looks somewhat normally distributed.

colnames(cleaned_sleep_df)
FALSE  [1] "X"                        "Timestamp"               
FALSE  [3] "Sleep.quality"            "Deep.Sleep.duration"     
FALSE  [5] "Alcohol"                  "Alcohol.hours.before.bed"
FALSE  [7] "Full.before.bed."         "Music.while.sleeping."   
FALSE  [9] "Type.of.music"            "Caffiene"                
FALSE [11] "Exercise"                 "Calories.count"          
FALSE [13] "Game"                     "Date.of.the.night"       
FALSE [15] "chamomile"                "Coffee.Type"             
FALSE [17] "Other"                    "Amount.of.sleep"         
FALSE [19] "X.1"                      "blackout.eye.mask."      
FALSE [21] "Melatonin."               "REM.sleep"               
FALSE [23] "Other.1"                  "Date.of.night"           
FALSE [25] "day_of_week"              "Amount.of.sleep.hours"   
FALSE [27] "Sleep.quality.ordinal"    "Game.past.10pm"          
FALSE [29] "chamomile_after_10pm"     "chamomile_after_10pm_bin"
FALSE [31] "Calories.count.100"
df_model <- cleaned_sleep_df %>% select('Deep.Sleep.duration', 'Alcohol', 'Game.past.10pm', 'Exercise', 'Calories.count.100',
'chamomile_after_10pm', 'Amount.of.sleep.hours', 'blackout.eye.mask.', 'day_of_week')
kable(summary(df_model)) ## NA spotted in calories count 
Deep.Sleep.duration Alcohol Game.past.10pm Exercise Calories.count.100 chamomile_after_10pm Amount.of.sleep.hours blackout.eye.mask. day_of_week
Min. :12.00 Length:53 Length:53 Length:53 Min. : 3.000 Length:53 Min. :5.500 Length:53 Length:53
1st Qu.:40.00 Class :character Class :character Class :character 1st Qu.: 4.782 Class :character 1st Qu.:7.333 Class :character Class :character
Median :50.00 Mode :character Mode :character Mode :character Median : 5.700 Mode :character Median :7.750 Mode :character Mode :character
Mean :49.85 NA NA NA Mean : 7.060 NA Mean :7.779 NA NA
3rd Qu.:59.00 NA NA NA 3rd Qu.:10.000 NA 3rd Qu.:8.183 NA NA
Max. :83.00 NA NA NA Max. :15.000 NA Max. :9.733 NA NA
NA NA NA NA NA’s :1 NA NA NA NA
## Check for the missing row 
missing_index <- which(is.na(df_model['Calories.count.100']))
kable(df_model[missing_index,]) ## Missing row is for the first obs with exercise = gym
Deep.Sleep.duration Alcohol Game.past.10pm Exercise Calories.count.100 chamomile_after_10pm Amount.of.sleep.hours blackout.eye.mask. day_of_week
19 No False Gym NA True 7.283333 No Tue
## Impute using Exercise information
df_model$Calories.count.100 <- ave(df_model$Calories.count, df_model$Exercise, FUN=function(x) 
  ifelse(is.na(x), mean(x, na.rm=TRUE), x))

## Check result 
df_model[missing_index,] 
FALSE   Deep.Sleep.duration Alcohol Game.past.10pm Exercise Calories.count.100
FALSE 1                  19      No          False      Gym           5.611364
FALSE   chamomile_after_10pm Amount.of.sleep.hours blackout.eye.mask. day_of_week
FALSE 1                 True              7.283333                 No         Tue

Model 1: Full Standard Linear Model (normal errors)

model_1 <- df_model %>% lm(Deep.Sleep.duration ~ ., data = .)
# (xtable(model_1, type = 'pdf'))
(xtable(summary(model_1), type = 'html')) %>% kable(digits = 2)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 12.68 34.57 0.37 0.72
AlcoholYes -11.59 7.60 -1.52 0.14
Game.past.10pmTrue -12.92 7.48 -1.73 0.09
ExerciseFootball + Gym -6.76 11.76 -0.57 0.57
ExerciseGym -3.37 11.81 -0.28 0.78
ExerciseNone 1.77 12.82 0.14 0.89
ExerciseSwim 3.25 21.56 0.15 0.88
Calories.count.100 0.59 1.92 0.31 0.76
chamomile_after_10pmTrue -4.65 6.38 -0.73 0.47
Amount.of.sleep.hours 4.99 3.18 1.57 0.13
blackout.eye.mask.Yes 2.74 5.83 0.47 0.64
day_of_weekMon -2.11 10.10 -0.21 0.84
day_of_weekSat -1.75 11.61 -0.15 0.88
day_of_weekSun 6.94 10.78 0.64 0.52
day_of_weekThu 6.95 9.11 0.76 0.45
day_of_weekTue -3.26 9.19 -0.35 0.72
day_of_weekWed -1.27 8.64 -0.15 0.88
m1_sum <- summary(model_1)

## Model scores
m1_sum$r.squared
FALSE [1] 0.3801823
m1_sum$adj.r.squared
FALSE [1] 0.1047078
## Evaluate model 
library(car)
kable(vif(model_1), caption = 'VIF for model 1 predictors')
VIF for model 1 predictors
GVIF Df GVIF^(1/(2*Df))
Alcohol 1.496252 1 1.223214
Game.past.10pm 1.937812 1 1.392053
Exercise 18.918500 4 1.444145
Calories.count.100 6.764168 1 2.600801
chamomile_after_10pm 2.115609 1 1.454513
Amount.of.sleep.hours 1.907979 1 1.381296
blackout.eye.mask. 1.346811 1 1.160522
day_of_week 17.590731 6 1.269912

We can see that the VIF of calories is really high, suggesting that it can be linearly predicted by the rest of the variables. We can try a second model that excludes

Feature selection

reduced_df_model <-  df_model %>% select(-Exercise, -day_of_week)
model_2 <- lm(Deep.Sleep.duration ~ . , data = reduced_df_model)
# print(model_2)
(xtable(summary(model_2), type = 'html')) %>% kable(digits = 2)
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.30 18.32 -0.07 0.94
AlcoholYes -15.69 6.04 -2.60 0.01
Game.past.10pmTrue -12.51 5.20 -2.41 0.02
Calories.count.100 0.80 0.73 1.10 0.28
chamomile_after_10pmTrue -4.87 4.44 -1.10 0.28
Amount.of.sleep.hours 6.57 2.26 2.91 0.01
blackout.eye.mask.Yes 2.58 4.78 0.54 0.59
m2_sum <- summary(model_2)

## Model scores
m2_sum$r.squared

[1] 0.3113852

m2_sum$adj.r.squared

[1] 0.2215659

## Evaluate model 
kable(vif(model_2), caption = 'VIF for model 2 predictors')
VIF for model 2 predictors
x
Alcohol 1.086868
Game.past.10pm 1.075199
Calories.count.100 1.120191
chamomile_after_10pm 1.178987
Amount.of.sleep.hours 1.101311
blackout.eye.mask. 1.040981

The adjusted R^2 has significantly improved.

Random forests

library(randomForest)
library(varImp)
# df_model <- df_model %>%
#   mutate(across(where(is.character), as.factor))

## Split data into predictors and response
predictors_df <- df_model %>% select(-Deep.Sleep.duration)
response_df <- df_model$Deep.Sleep.duration

## Fit Random Forest model
rf_model <- randomForest(x = predictors_df, y = response_df, ntree = 500, mtry = floor(sqrt(ncol(predictors_df))), importance = TRUE)

## Show results
print(rf_model)
FALSE 
FALSE Call:
FALSE  randomForest(x = predictors_df, y = response_df, ntree = 500,      mtry = floor(sqrt(ncol(predictors_df))), importance = TRUE) 
FALSE                Type of random forest: regression
FALSE                      Number of trees: 500
FALSE No. of variables tried at each split: 2
FALSE 
FALSE           Mean of squared residuals: 230.9377
FALSE                     % Var explained: 10.09
## importance
kable(importance(rf_model), caption = 'Feature importance from RF')
Feature importance from RF
%IncMSE IncNodePurity
Alcohol 8.1936972 796.9705
Game.past.10pm 2.6103778 785.5702
Exercise 0.4363052 868.5519
Calories.count.100 -3.6083176 2009.5532
chamomile_after_10pm -0.2567942 588.4329
Amount.of.sleep.hours 11.7117086 3136.6855
blackout.eye.mask. -1.9961237 313.5721
day_of_week 3.1976341 1469.9132
varImpPlot(rf_model)

?importance

Compared to the second linear model, there is a drop in the percentage of variance explained.

On both accounts of importance (increase in error, and increase in node purity), amount of sleep edges out above the rest. It proves to be important to not skimp on sleep, i.e. it is hard to control what kind of sleep you get from the total duration. Best is just to sleep more. Alcohol is agreeably detrimental to sleep.

Additional questions: Are all calories equal? - Football (cardivascular) vs Gym (resistance)

cleaned_sleep_df$Exercise <- factor(cleaned_sleep_df$Exercise, levels = c("None", "Gym", "Football"))
cleaned_sleep_df$Exercise <- as.factor(cleaned_sleep_df$Exercise)
levels(cleaned_sleep_df$Exercise)
FALSE [1] "None"     "Gym"      "Football"
exercise_subset <- cleaned_sleep_df |>  filter(Exercise %in% c("None", "Football", "Gym"))

library(ggplot2)
ggplot(data= exercise_subset, aes(x=Calories.count
, y=Deep.Sleep.duration, color=Exercise)) +
  geom_point() +
  geom_smooth(method="lm") +
  # facet_wrap(~Exercise) +
  labs(title="Effect of Calories Burned on Deep Sleep by Exercise Type")

Visually, we can see that the per calorie effect on deep sleep is different for the different exercises, although they do not share the same domain (Football is much more taxing and has a much higher calorie count).

exercise_model <- lm(data = exercise_subset, formula = Deep.Sleep.duration~ Exercise*Calories.count.100)
(xtable(summary(exercise_model), type = 'html')) %>% kable(digits = 2)
Estimate Std. Error t value Pr(>|t|)
(Intercept) 82.38 29.62 2.78 0.01
ExerciseGym -40.21 32.86 -1.22 0.23
ExerciseFootball -58.59 40.92 -1.43 0.16
Calories.count.100 -6.83 6.53 -1.05 0.30
ExerciseGym:Calories.count.100 7.58 6.98 1.09 0.28
ExerciseFootball:Calories.count.100 9.95 7.04 1.41 0.17


We see here that: