Type of linear model - should we use GLM?

cat('Range of values of deep sleep:' ,range(cleaned_sleep_df$Deep.Sleep.duration))

FALSE Range of values of deep sleep: 12 83

ggplot(cleaned_sleep_df, aes(x = Deep.Sleep.duration)) +
  geom_histogram(aes(y = after_stat(density)), binwidth = 0.5, fill = "blue", color = "black") +# Adjust binwidth as needed
  geom_density(alpha = 0.2, fill = "#FF6666") +
  ggtitle("Distribution of Deep Sleep Duration") +
  xlab("Deep Sleep Duration (mins)") +
  ylab("Density") +
  xlim(c(0, max(cleaned_sleep_df$Deep.Sleep.duration)+10))

Distribution of deep sleep duration

The outcome distribution looks somewhat normally distributed.

colnames(cleaned_sleep_df)

FALSE  [1] "X"                        "Timestamp"               
FALSE  [3] "Sleep.quality"            "Deep.Sleep.duration"     
FALSE  [5] "Alcohol"                  "Alcohol.hours.before.bed"
FALSE  [7] "Full.before.bed."         "Music.while.sleeping."   
FALSE  [9] "Type.of.music"            "Caffiene"                
FALSE [11] "Exercise"                 "Calories.count"          
FALSE [13] "Game"                     "Date.of.the.night"       
FALSE [15] "chamomile"                "Coffee.Type"             
FALSE [17] "Other"                    "Amount.of.sleep"         
FALSE [19] "X.1"                      "blackout.eye.mask."      
FALSE [21] "Melatonin."               "REM.sleep"               
FALSE [23] "Other.1"                  "Date.of.night"           
FALSE [25] "day_of_week"              "Amount.of.sleep.hours"   
FALSE [27] "Sleep.quality.ordinal"    "Game.past.10pm"          
FALSE [29] "chamomile_after_10pm"     "chamomile_after_10pm_bin"
FALSE [31] "Calories.count.100"

df_model <- cleaned_sleep_df %>% select('Deep.Sleep.duration', 'Alcohol', 'Game.past.10pm', 'Exercise', 'Calories.count.100',
'chamomile_after_10pm', 'Amount.of.sleep.hours', 'blackout.eye.mask.', 'day_of_week')

kable(summary(df_model)) ## NA spotted in calories count

Deep.Sleep.duration	Alcohol	Game.past.10pm	Exercise	Calories.count.100	chamomile_after_10pm	Amount.of.sleep.hours	blackout.eye.mask.	day_of_week
Min. :12.00	Length:53	Length:53	Length:53	Min. : 3.000	Length:53	Min. :5.500	Length:53	Length:53
1st Qu.:40.00	Class :character	Class :character	Class :character	1st Qu.: 4.782	Class :character	1st Qu.:7.333	Class :character	Class :character
Median :50.00	Mode :character	Mode :character	Mode :character	Median : 5.700	Mode :character	Median :7.750	Mode :character	Mode :character
Mean :49.85	NA	NA	NA	Mean : 7.060	NA	Mean :7.779	NA	NA
3rd Qu.:59.00	NA	NA	NA	3rd Qu.:10.000	NA	3rd Qu.:8.183	NA	NA
Max. :83.00	NA	NA	NA	Max. :15.000	NA	Max. :9.733	NA	NA
NA	NA	NA	NA	NA’s :1	NA	NA	NA	NA

## Check for the missing row 
missing_index <- which(is.na(df_model['Calories.count.100']))
kable(df_model[missing_index,]) ## Missing row is for the first obs with exercise = gym

Deep.Sleep.duration	Alcohol	Game.past.10pm	Exercise	Calories.count.100	chamomile_after_10pm	Amount.of.sleep.hours	blackout.eye.mask.	day_of_week
19	No	False	Gym	NA	True	7.283333	No	Tue

## Impute using Exercise information
df_model$Calories.count.100 <- ave(df_model$Calories.count, df_model$Exercise, FUN=function(x) 
  ifelse(is.na(x), mean(x, na.rm=TRUE), x))

## Check result 
df_model[missing_index,]

FALSE   Deep.Sleep.duration Alcohol Game.past.10pm Exercise Calories.count.100
FALSE 1                  19      No          False      Gym           5.611364
FALSE   chamomile_after_10pm Amount.of.sleep.hours blackout.eye.mask. day_of_week
FALSE 1                 True              7.283333                 No         Tue

Model 1: Full Standard Linear Model (normal errors)

model_1 <- df_model %>% lm(Deep.Sleep.duration ~ ., data = .)
# (xtable(model_1, type = 'pdf'))
(xtable(summary(model_1), type = 'html')) %>% kable(digits = 2)

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	12.68	34.57	0.37	0.72
AlcoholYes	-11.59	7.60	-1.52	0.14
Game.past.10pmTrue	-12.92	7.48	-1.73	0.09
ExerciseFootball + Gym	-6.76	11.76	-0.57	0.57
ExerciseGym	-3.37	11.81	-0.28	0.78
ExerciseNone	1.77	12.82	0.14	0.89
ExerciseSwim	3.25	21.56	0.15	0.88
Calories.count.100	0.59	1.92	0.31	0.76
chamomile_after_10pmTrue	-4.65	6.38	-0.73	0.47
Amount.of.sleep.hours	4.99	3.18	1.57	0.13
blackout.eye.mask.Yes	2.74	5.83	0.47	0.64
day_of_weekMon	-2.11	10.10	-0.21	0.84
day_of_weekSat	-1.75	11.61	-0.15	0.88
day_of_weekSun	6.94	10.78	0.64	0.52
day_of_weekThu	6.95	9.11	0.76	0.45
day_of_weekTue	-3.26	9.19	-0.35	0.72
day_of_weekWed	-1.27	8.64	-0.15	0.88

m1_sum <- summary(model_1)

## Model scores
m1_sum$r.squared

FALSE [1] 0.3801823

m1_sum$adj.r.squared

FALSE [1] 0.1047078

## Evaluate model 
library(car)
kable(vif(model_1), caption = 'VIF for model 1 predictors')

VIF for model 1 predictors
	GVIF	Df	GVIF^(1/(2*Df))
Alcohol	1.496252	1	1.223214
Game.past.10pm	1.937812	1	1.392053
Exercise	18.918500	4	1.444145
Calories.count.100	6.764168	1	2.600801
chamomile_after_10pm	2.115609	1	1.454513
Amount.of.sleep.hours	1.907979	1	1.381296
blackout.eye.mask.	1.346811	1	1.160522
day_of_week	17.590731	6	1.269912

We can see that the VIF of calories is really high, suggesting that it can be linearly predicted by the rest of the variables. We can try a second model that excludes

Feature selection

reduced_df_model <-  df_model %>% select(-Exercise, -day_of_week)
model_2 <- lm(Deep.Sleep.duration ~ . , data = reduced_df_model)
# print(model_2)
(xtable(summary(model_2), type = 'html')) %>% kable(digits = 2)

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	-1.30	18.32	-0.07	0.94
AlcoholYes	-15.69	6.04	-2.60	0.01
Game.past.10pmTrue	-12.51	5.20	-2.41	0.02
Calories.count.100	0.80	0.73	1.10	0.28
chamomile_after_10pmTrue	-4.87	4.44	-1.10	0.28
Amount.of.sleep.hours	6.57	2.26	2.91	0.01
blackout.eye.mask.Yes	2.58	4.78	0.54	0.59

m2_sum <- summary(model_2)

## Model scores
m2_sum$r.squared

[1] 0.3113852

m2_sum$adj.r.squared

[1] 0.2215659

## Evaluate model 
kable(vif(model_2), caption = 'VIF for model 2 predictors')

VIF for model 2 predictors
	x
Alcohol	1.086868
Game.past.10pm	1.075199
Calories.count.100	1.120191
chamomile_after_10pm	1.178987
Amount.of.sleep.hours	1.101311
blackout.eye.mask.	1.040981

The adjusted R^2 has significantly improved.

Random forests

library(randomForest)
library(varImp)
# df_model <- df_model %>%
#   mutate(across(where(is.character), as.factor))

## Split data into predictors and response
predictors_df <- df_model %>% select(-Deep.Sleep.duration)
response_df <- df_model$Deep.Sleep.duration

## Fit Random Forest model
rf_model <- randomForest(x = predictors_df, y = response_df, ntree = 500, mtry = floor(sqrt(ncol(predictors_df))), importance = TRUE)

## Show results
print(rf_model)

FALSE 
FALSE Call:
FALSE  randomForest(x = predictors_df, y = response_df, ntree = 500,      mtry = floor(sqrt(ncol(predictors_df))), importance = TRUE) 
FALSE                Type of random forest: regression
FALSE                      Number of trees: 500
FALSE No. of variables tried at each split: 2
FALSE 
FALSE           Mean of squared residuals: 230.9377
FALSE                     % Var explained: 10.09

## importance
kable(importance(rf_model), caption = 'Feature importance from RF')

Feature importance from RF
	%IncMSE	IncNodePurity
Alcohol	8.1936972	796.9705
Game.past.10pm	2.6103778	785.5702
Exercise	0.4363052	868.5519
Calories.count.100	-3.6083176	2009.5532
chamomile_after_10pm	-0.2567942	588.4329
Amount.of.sleep.hours	11.7117086	3136.6855
blackout.eye.mask.	-1.9961237	313.5721
day_of_week	3.1976341	1469.9132

varImpPlot(rf_model)

?importance

Compared to the second linear model, there is a drop in the percentage of variance explained.

On both accounts of importance (increase in error, and increase in node purity), amount of sleep edges out above the rest. It proves to be important to not skimp on sleep, i.e. it is hard to control what kind of sleep you get from the total duration. Best is just to sleep more. Alcohol is agreeably detrimental to sleep.

Additional questions: Are all calories equal? - Football (cardivascular) vs Gym (resistance)

cleaned_sleep_df$Exercise <- factor(cleaned_sleep_df$Exercise, levels = c("None", "Gym", "Football"))
cleaned_sleep_df$Exercise <- as.factor(cleaned_sleep_df$Exercise)
levels(cleaned_sleep_df$Exercise)

FALSE [1] "None"     "Gym"      "Football"

exercise_subset <- cleaned_sleep_df |>  filter(Exercise %in% c("None", "Football", "Gym"))

library(ggplot2)
ggplot(data= exercise_subset, aes(x=Calories.count
, y=Deep.Sleep.duration, color=Exercise)) +
  geom_point() +
  geom_smooth(method="lm") +
  # facet_wrap(~Exercise) +
  labs(title="Effect of Calories Burned on Deep Sleep by Exercise Type")

Visually, we can see that the per calorie effect on deep sleep is different for the different exercises, although they do not share the same domain (Football is much more taxing and has a much higher calorie count).

exercise_model <- lm(data = exercise_subset, formula = Deep.Sleep.duration~ Exercise*Calories.count.100)
(xtable(summary(exercise_model), type = 'html')) %>% kable(digits = 2)

	Estimate	Std. Error	t value	Pr(>\|t\|)
(Intercept)	82.38	29.62	2.78	0.01
ExerciseGym	-40.21	32.86	-1.22	0.23
ExerciseFootball	-58.59	40.92	-1.43	0.16
Calories.count.100	-6.83	6.53	-1.05	0.30
ExerciseGym:Calories.count.100	7.58	6.98	1.09	0.28
ExerciseFootball:Calories.count.100	9.95	7.04	1.41	0.17

We see here that:

per 100 calories effect with no exercise is -6.83
per 100 calories effect with gym is 0.75
per 100 calories effect with no exercise is 3.12

Modelling Deep Sleep data

Ian Petrus Tan

2024-06-15

Type of linear model - should we use GLM?

Model 1: Full Standard Linear Model (normal errors)

Feature selection

Random forests

Additional questions: Are all calories equal? - Football (cardivascular) vs Gym (resistance)