cat('Range of values of deep sleep:' ,range(cleaned_sleep_df$Deep.Sleep.duration))
FALSE Range of values of deep sleep: 12 83
ggplot(cleaned_sleep_df, aes(x = Deep.Sleep.duration)) +
geom_histogram(aes(y = after_stat(density)), binwidth = 0.5, fill = "blue", color = "black") +# Adjust binwidth as needed
geom_density(alpha = 0.2, fill = "#FF6666") +
ggtitle("Distribution of Deep Sleep Duration") +
xlab("Deep Sleep Duration (mins)") +
ylab("Density") +
xlim(c(0, max(cleaned_sleep_df$Deep.Sleep.duration)+10))
Distribution of deep sleep duration
The outcome distribution looks somewhat normally distributed.
colnames(cleaned_sleep_df)
FALSE [1] "X" "Timestamp"
FALSE [3] "Sleep.quality" "Deep.Sleep.duration"
FALSE [5] "Alcohol" "Alcohol.hours.before.bed"
FALSE [7] "Full.before.bed." "Music.while.sleeping."
FALSE [9] "Type.of.music" "Caffiene"
FALSE [11] "Exercise" "Calories.count"
FALSE [13] "Game" "Date.of.the.night"
FALSE [15] "chamomile" "Coffee.Type"
FALSE [17] "Other" "Amount.of.sleep"
FALSE [19] "X.1" "blackout.eye.mask."
FALSE [21] "Melatonin." "REM.sleep"
FALSE [23] "Other.1" "Date.of.night"
FALSE [25] "day_of_week" "Amount.of.sleep.hours"
FALSE [27] "Sleep.quality.ordinal" "Game.past.10pm"
FALSE [29] "chamomile_after_10pm" "chamomile_after_10pm_bin"
FALSE [31] "Calories.count.100"
df_model <- cleaned_sleep_df %>% select('Deep.Sleep.duration', 'Alcohol', 'Game.past.10pm', 'Exercise', 'Calories.count.100',
'chamomile_after_10pm', 'Amount.of.sleep.hours', 'blackout.eye.mask.', 'day_of_week')
kable(summary(df_model)) ## NA spotted in calories count
Deep.Sleep.duration | Alcohol | Game.past.10pm | Exercise | Calories.count.100 | chamomile_after_10pm | Amount.of.sleep.hours | blackout.eye.mask. | day_of_week | |
---|---|---|---|---|---|---|---|---|---|
Min. :12.00 | Length:53 | Length:53 | Length:53 | Min. : 3.000 | Length:53 | Min. :5.500 | Length:53 | Length:53 | |
1st Qu.:40.00 | Class :character | Class :character | Class :character | 1st Qu.: 4.782 | Class :character | 1st Qu.:7.333 | Class :character | Class :character | |
Median :50.00 | Mode :character | Mode :character | Mode :character | Median : 5.700 | Mode :character | Median :7.750 | Mode :character | Mode :character | |
Mean :49.85 | NA | NA | NA | Mean : 7.060 | NA | Mean :7.779 | NA | NA | |
3rd Qu.:59.00 | NA | NA | NA | 3rd Qu.:10.000 | NA | 3rd Qu.:8.183 | NA | NA | |
Max. :83.00 | NA | NA | NA | Max. :15.000 | NA | Max. :9.733 | NA | NA | |
NA | NA | NA | NA | NA’s :1 | NA | NA | NA | NA |
## Check for the missing row
missing_index <- which(is.na(df_model['Calories.count.100']))
kable(df_model[missing_index,]) ## Missing row is for the first obs with exercise = gym
Deep.Sleep.duration | Alcohol | Game.past.10pm | Exercise | Calories.count.100 | chamomile_after_10pm | Amount.of.sleep.hours | blackout.eye.mask. | day_of_week |
---|---|---|---|---|---|---|---|---|
19 | No | False | Gym | NA | True | 7.283333 | No | Tue |
## Impute using Exercise information
df_model$Calories.count.100 <- ave(df_model$Calories.count, df_model$Exercise, FUN=function(x)
ifelse(is.na(x), mean(x, na.rm=TRUE), x))
## Check result
df_model[missing_index,]
FALSE Deep.Sleep.duration Alcohol Game.past.10pm Exercise Calories.count.100
FALSE 1 19 No False Gym 5.611364
FALSE chamomile_after_10pm Amount.of.sleep.hours blackout.eye.mask. day_of_week
FALSE 1 True 7.283333 No Tue
model_1 <- df_model %>% lm(Deep.Sleep.duration ~ ., data = .)
# (xtable(model_1, type = 'pdf'))
(xtable(summary(model_1), type = 'html')) %>% kable(digits = 2)
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 12.68 | 34.57 | 0.37 | 0.72 |
AlcoholYes | -11.59 | 7.60 | -1.52 | 0.14 |
Game.past.10pmTrue | -12.92 | 7.48 | -1.73 | 0.09 |
ExerciseFootball + Gym | -6.76 | 11.76 | -0.57 | 0.57 |
ExerciseGym | -3.37 | 11.81 | -0.28 | 0.78 |
ExerciseNone | 1.77 | 12.82 | 0.14 | 0.89 |
ExerciseSwim | 3.25 | 21.56 | 0.15 | 0.88 |
Calories.count.100 | 0.59 | 1.92 | 0.31 | 0.76 |
chamomile_after_10pmTrue | -4.65 | 6.38 | -0.73 | 0.47 |
Amount.of.sleep.hours | 4.99 | 3.18 | 1.57 | 0.13 |
blackout.eye.mask.Yes | 2.74 | 5.83 | 0.47 | 0.64 |
day_of_weekMon | -2.11 | 10.10 | -0.21 | 0.84 |
day_of_weekSat | -1.75 | 11.61 | -0.15 | 0.88 |
day_of_weekSun | 6.94 | 10.78 | 0.64 | 0.52 |
day_of_weekThu | 6.95 | 9.11 | 0.76 | 0.45 |
day_of_weekTue | -3.26 | 9.19 | -0.35 | 0.72 |
day_of_weekWed | -1.27 | 8.64 | -0.15 | 0.88 |
m1_sum <- summary(model_1)
## Model scores
m1_sum$r.squared
FALSE [1] 0.3801823
m1_sum$adj.r.squared
FALSE [1] 0.1047078
## Evaluate model
library(car)
kable(vif(model_1), caption = 'VIF for model 1 predictors')
GVIF | Df | GVIF^(1/(2*Df)) | |
---|---|---|---|
Alcohol | 1.496252 | 1 | 1.223214 |
Game.past.10pm | 1.937812 | 1 | 1.392053 |
Exercise | 18.918500 | 4 | 1.444145 |
Calories.count.100 | 6.764168 | 1 | 2.600801 |
chamomile_after_10pm | 2.115609 | 1 | 1.454513 |
Amount.of.sleep.hours | 1.907979 | 1 | 1.381296 |
blackout.eye.mask. | 1.346811 | 1 | 1.160522 |
day_of_week | 17.590731 | 6 | 1.269912 |
We can see that the VIF of calories is really high, suggesting that it can be linearly predicted by the rest of the variables. We can try a second model that excludes
reduced_df_model <- df_model %>% select(-Exercise, -day_of_week)
model_2 <- lm(Deep.Sleep.duration ~ . , data = reduced_df_model)
# print(model_2)
(xtable(summary(model_2), type = 'html')) %>% kable(digits = 2)
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | -1.30 | 18.32 | -0.07 | 0.94 |
AlcoholYes | -15.69 | 6.04 | -2.60 | 0.01 |
Game.past.10pmTrue | -12.51 | 5.20 | -2.41 | 0.02 |
Calories.count.100 | 0.80 | 0.73 | 1.10 | 0.28 |
chamomile_after_10pmTrue | -4.87 | 4.44 | -1.10 | 0.28 |
Amount.of.sleep.hours | 6.57 | 2.26 | 2.91 | 0.01 |
blackout.eye.mask.Yes | 2.58 | 4.78 | 0.54 | 0.59 |
m2_sum <- summary(model_2)
## Model scores
m2_sum$r.squared
[1] 0.3113852
m2_sum$adj.r.squared
[1] 0.2215659
## Evaluate model
kable(vif(model_2), caption = 'VIF for model 2 predictors')
x | |
---|---|
Alcohol | 1.086868 |
Game.past.10pm | 1.075199 |
Calories.count.100 | 1.120191 |
chamomile_after_10pm | 1.178987 |
Amount.of.sleep.hours | 1.101311 |
blackout.eye.mask. | 1.040981 |
The adjusted R^2 has significantly improved.
library(randomForest)
library(varImp)
# df_model <- df_model %>%
# mutate(across(where(is.character), as.factor))
## Split data into predictors and response
predictors_df <- df_model %>% select(-Deep.Sleep.duration)
response_df <- df_model$Deep.Sleep.duration
## Fit Random Forest model
rf_model <- randomForest(x = predictors_df, y = response_df, ntree = 500, mtry = floor(sqrt(ncol(predictors_df))), importance = TRUE)
## Show results
print(rf_model)
FALSE
FALSE Call:
FALSE randomForest(x = predictors_df, y = response_df, ntree = 500, mtry = floor(sqrt(ncol(predictors_df))), importance = TRUE)
FALSE Type of random forest: regression
FALSE Number of trees: 500
FALSE No. of variables tried at each split: 2
FALSE
FALSE Mean of squared residuals: 230.9377
FALSE % Var explained: 10.09
## importance
kable(importance(rf_model), caption = 'Feature importance from RF')
%IncMSE | IncNodePurity | |
---|---|---|
Alcohol | 8.1936972 | 796.9705 |
Game.past.10pm | 2.6103778 | 785.5702 |
Exercise | 0.4363052 | 868.5519 |
Calories.count.100 | -3.6083176 | 2009.5532 |
chamomile_after_10pm | -0.2567942 | 588.4329 |
Amount.of.sleep.hours | 11.7117086 | 3136.6855 |
blackout.eye.mask. | -1.9961237 | 313.5721 |
day_of_week | 3.1976341 | 1469.9132 |
varImpPlot(rf_model)
?importance
Compared to the second linear model, there is a drop in the percentage of variance explained.
On both accounts of importance (increase in error, and increase in node purity), amount of sleep edges out above the rest. It proves to be important to not skimp on sleep, i.e. it is hard to control what kind of sleep you get from the total duration. Best is just to sleep more. Alcohol is agreeably detrimental to sleep.
cleaned_sleep_df$Exercise <- factor(cleaned_sleep_df$Exercise, levels = c("None", "Gym", "Football"))
cleaned_sleep_df$Exercise <- as.factor(cleaned_sleep_df$Exercise)
levels(cleaned_sleep_df$Exercise)
FALSE [1] "None" "Gym" "Football"
exercise_subset <- cleaned_sleep_df |> filter(Exercise %in% c("None", "Football", "Gym"))
library(ggplot2)
ggplot(data= exercise_subset, aes(x=Calories.count
, y=Deep.Sleep.duration, color=Exercise)) +
geom_point() +
geom_smooth(method="lm") +
# facet_wrap(~Exercise) +
labs(title="Effect of Calories Burned on Deep Sleep by Exercise Type")
Visually, we can see that the per calorie effect on deep sleep is
different for the different exercises, although they do not share the
same domain (Football is much more taxing and has a much higher calorie
count).
exercise_model <- lm(data = exercise_subset, formula = Deep.Sleep.duration~ Exercise*Calories.count.100)
(xtable(summary(exercise_model), type = 'html')) %>% kable(digits = 2)
Estimate | Std. Error | t value | Pr(>|t|) | |
---|---|---|---|---|
(Intercept) | 82.38 | 29.62 | 2.78 | 0.01 |
ExerciseGym | -40.21 | 32.86 | -1.22 | 0.23 |
ExerciseFootball | -58.59 | 40.92 | -1.43 | 0.16 |
Calories.count.100 | -6.83 | 6.53 | -1.05 | 0.30 |
ExerciseGym:Calories.count.100 | 7.58 | 6.98 | 1.09 | 0.28 |
ExerciseFootball:Calories.count.100 | 9.95 | 7.04 | 1.41 | 0.17 |
We see here that: