library(tidyverse)
library(GGally)
# Fix the RNG seed (no statement visible in this script draws random numbers,
# so this only matters if a stochastic step is added).
set.seed(123)

# Read the Sleep_Efficiency CSV into a tibble and print per-column summaries.
# NOTE(review): the path is absolute and machine-specific; the script only
# runs where this exact file exists.
data1 <- read_csv(
  file = "/Users/mohamedhassan/Downloads/Sleep_Efficiency.csv"
)
summary(object = data1)
## ID Age Gender
## Min. : 1.0 Min. : 9.00 Length:452
## 1st Qu.:113.8 1st Qu.:29.00 Class :character
## Median :226.5 Median :40.00 Mode :character
## Mean :226.5 Mean :40.29
## 3rd Qu.:339.2 3rd Qu.:52.00
## Max. :452.0 Max. :69.00
##
## Bedtime Wakeup time
## Min. :2021-01-03 00:30:00.00 Min. :2021-01-03 08:30:00.00
## 1st Qu.:2021-04-14 01:07:30.00 1st Qu.:2021-04-14 07:52:30.00
## Median :2021-07-20 23:30:00.00 Median :2021-07-20 16:00:00.00
## Mean :2021-07-13 00:03:39.02 Mean :2021-07-12 20:19:22.82
## 3rd Qu.:2021-10-11 05:22:30.00 3rd Qu.:2021-10-11 05:52:30.00
## Max. :2021-12-31 21:00:00.00 Max. :2021-12-31 06:30:00.00
##
## Sleep duration Sleep efficiency REM sleep percentage Deep sleep percentage
## Min. : 5.000 Min. :0.5000 Min. :15.00 Min. :18.00
## 1st Qu.: 7.000 1st Qu.:0.6975 1st Qu.:20.00 1st Qu.:48.25
## Median : 7.500 Median :0.8200 Median :22.00 Median :58.00
## Mean : 7.466 Mean :0.7889 Mean :22.62 Mean :52.82
## 3rd Qu.: 8.000 3rd Qu.:0.9000 3rd Qu.:25.00 3rd Qu.:63.00
## Max. :10.000 Max. :0.9900 Max. :30.00 Max. :75.00
##
## Light sleep percentage Awakenings Caffeine consumption
## Min. : 7.00 Min. :0.000 Min. : 0.00
## 1st Qu.:15.00 1st Qu.:1.000 1st Qu.: 0.00
## Median :18.00 Median :1.000 Median : 25.00
## Mean :24.56 Mean :1.641 Mean : 23.65
## 3rd Qu.:32.50 3rd Qu.:3.000 3rd Qu.: 50.00
## Max. :63.00 Max. :4.000 Max. :200.00
## NA's :20 NA's :25
## Alcohol consumption Smoking status Exercise frequency
## Min. :0.000 Length:452 Min. :0.000
## 1st Qu.:0.000 Class :character 1st Qu.:0.000
## Median :0.000 Mode :character Median :2.000
## Mean :1.174 Mean :1.791
## 3rd Qu.:2.000 3rd Qu.:3.000
## Max. :5.000 Max. :5.000
## NA's :14 NA's :6
# Derive the three extra regressors used by the models below, one per step.
data2 <- data1 %>%
  # quadratic term: REM sleep percentage squared
  mutate(rem_sleep_squared = `REM sleep percentage`^2) %>%
  # smoking status as a 0/1 indicator ("Yes" -> 1, "No" -> 0)
  mutate(smoke_dichotomous = recode(`Smoking status`, Yes = 1, No = 0)) %>%
  # interaction term: sleep efficiency multiplied by the smoking indicator
  mutate(sleep_eff_smoke_interaction = `Sleep efficiency` * smoke_dichotomous)
data2
## # A tibble: 452 × 18
## ID Age Gender Bedtime `Wakeup time` `Sleep duration`
## <dbl> <dbl> <chr> <dttm> <dttm> <dbl>
## 1 1 65 Female 2021-03-06 01:00:00 2021-03-06 07:00:00 6
## 2 2 69 Male 2021-12-05 02:00:00 2021-12-05 09:00:00 7
## 3 3 40 Female 2021-05-25 21:30:00 2021-05-25 05:30:00 8
## 4 4 40 Female 2021-11-03 02:30:00 2021-11-03 08:30:00 6
## 5 5 57 Male 2021-03-13 01:00:00 2021-03-13 09:00:00 8
## 6 6 36 Female 2021-07-01 21:00:00 2021-07-01 04:30:00 7.5
## 7 7 27 Female 2021-07-21 21:00:00 2021-07-21 03:00:00 6
## 8 8 53 Male 2021-08-16 00:30:00 2021-08-16 10:30:00 10
## 9 9 41 Female 2021-04-05 02:30:00 2021-04-05 08:30:00 6
## 10 10 11 Female 2021-09-16 01:00:00 2021-09-16 10:00:00 9
## # ℹ 442 more rows
## # ℹ 12 more variables: `Sleep efficiency` <dbl>, `REM sleep percentage` <dbl>,
## # `Deep sleep percentage` <dbl>, `Light sleep percentage` <dbl>,
## # Awakenings <dbl>, `Caffeine consumption` <dbl>,
## # `Alcohol consumption` <dbl>, `Smoking status` <chr>,
## # `Exercise frequency` <dbl>, rem_sleep_squared <dbl>,
## # smoke_dichotomous <dbl>, sleep_eff_smoke_interaction <dbl>
# Full model: Sleep duration is the dependent variable. Predictors are the
# squared REM term, the smoking indicator, the efficiency-by-smoking
# interaction, sleep efficiency, deep sleep percentage and age.
model1 <- lm(
  formula = `Sleep duration` ~ rem_sleep_squared + smoke_dichotomous +
    sleep_eff_smoke_interaction + `Sleep efficiency` +
    `Deep sleep percentage` + Age,
  data = data2
)
summary(model1)
##
## Call:
## lm(formula = `Sleep duration` ~ rem_sleep_squared + smoke_dichotomous +
## sleep_eff_smoke_interaction + `Sleep efficiency` + `Deep sleep percentage` +
## Age, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.56320 -0.45588 0.00158 0.56214 2.65085
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.2806861 0.3995824 20.723 <2e-16 ***
## rem_sleep_squared -0.0001525 0.0002783 -0.548 0.5840
## smoke_dichotomous -1.0133272 0.5022720 -2.017 0.0442 *
## sleep_eff_smoke_interaction 1.3058531 0.6346333 2.058 0.0402 *
## `Sleep efficiency` -0.3858172 0.6212145 -0.621 0.5349
## `Deep sleep percentage` -0.0047025 0.0046649 -1.008 0.3140
## Age -0.0040664 0.0031302 -1.299 0.1946
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8657 on 445 degrees of freedom
## Multiple R-squared: 0.01552, Adjusted R-squared: 0.002247
## F-statistic: 1.169 on 6 and 445 DF, p-value: 0.3216
# Refit model1 without the squared REM sleep term.
# update() reuses the data argument stored in model1's call (data2).
model2 <- update(model1, formula. = . ~ . - rem_sleep_squared)
summary(model2)
##
## Call:
## lm(formula = `Sleep duration` ~ smoke_dichotomous + sleep_eff_smoke_interaction +
## `Sleep efficiency` + `Deep sleep percentage` + Age, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.57663 -0.44891 0.00218 0.56329 2.59922
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.236321 0.390987 21.065 <2e-16 ***
## smoke_dichotomous -1.024727 0.501447 -2.044 0.0416 *
## sleep_eff_smoke_interaction 1.316814 0.633820 2.078 0.0383 *
## `Sleep efficiency` -0.500989 0.584122 -0.858 0.3915
## `Deep sleep percentage` -0.003663 0.004258 -0.860 0.3902
## Age -0.004028 0.003127 -1.288 0.1984
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.865 on 446 degrees of freedom
## Multiple R-squared: 0.01486, Adjusted R-squared: 0.003812
## F-statistic: 1.345 on 5 and 446 DF, p-value: 0.2441
# Refit model2 without deep sleep percentage.
# update() reuses the data argument stored in model2's call (data2).
model3 <- update(model2, formula. = . ~ . - `Deep sleep percentage`)
summary(model3)
##
## Call:
## lm(formula = `Sleep duration` ~ smoke_dichotomous + sleep_eff_smoke_interaction +
## `Sleep efficiency` + Age, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.57271 -0.46611 -0.01631 0.55245 2.61113
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.276003 0.388143 21.322 <2e-16 ***
## smoke_dichotomous -0.983804 0.499040 -1.971 0.0493 *
## sleep_eff_smoke_interaction 1.263409 0.630589 2.004 0.0457 *
## `Sleep efficiency` -0.809467 0.460941 -1.756 0.0798 .
## Age -0.003789 0.003114 -1.217 0.2243
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8647 on 447 degrees of freedom
## Multiple R-squared: 0.01322, Adjusted R-squared: 0.004392
## F-statistic: 1.497 on 4 and 447 DF, p-value: 0.2019
# Refit model3 without age.
# update() reuses the data argument stored in model3's call (data2).
model4 <- update(model3, formula. = . ~ . - Age)
summary(model4)
##
## Call:
## lm(formula = `Sleep duration` ~ smoke_dichotomous + sleep_eff_smoke_interaction +
## `Sleep efficiency`, data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.54472 -0.46056 -0.02788 0.54160 2.62589
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.1695 0.3783 21.593 <2e-16 ***
## smoke_dichotomous -1.0080 0.4989 -2.020 0.0439 *
## sleep_eff_smoke_interaction 1.2856 0.6307 2.039 0.0421 *
## `Sleep efficiency` -0.8645 0.4590 -1.884 0.0603 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8652 on 448 degrees of freedom
## Multiple R-squared: 0.009953, Adjusted R-squared: 0.003324
## F-statistic: 1.501 on 3 and 448 DF, p-value: 0.2135
# Refit model4 without the sleep efficiency main effect.
# NOTE(review): sleep_eff_smoke_interaction (sleep efficiency times the
# smoking indicator) stays in the model while its main effect is removed;
# confirm that this is intended.
# update() reuses the data argument stored in model4's call (data2).
model5 <- update(model4, formula. = . ~ . - `Sleep efficiency`)
summary(model5)
##
## Call:
## lm(formula = `Sleep duration` ~ smoke_dichotomous + sleep_eff_smoke_interaction,
## data = data2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.54472 -0.46309 0.03691 0.53691 2.59004
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.46309 0.05026 148.487 <2e-16 ***
## smoke_dichotomous -0.30158 0.32999 -0.914 0.361
## sleep_eff_smoke_interaction 0.42111 0.43376 0.971 0.332
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8676 on 449 degrees of freedom
## Multiple R-squared: 0.002112, Adjusted R-squared: -0.002332
## F-statistic: 0.4752 on 2 and 449 DF, p-value: 0.622
# Residual vs. Fitted Values Plot
# Residuals against fitted values for model4, drawn with ggplot2.
# A dashed horizontal line marks zero residual.
ggplot(model4, aes(x = .fitted, y = .resid)) +
  geom_point() +
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(
    title = "Residual vs. Fitted Values Plot",
    x = "Fitted values",
    y = "Residuals"
  )

# The same diagnostic drawn with base graphics.
plot(x = fitted(model4), y = resid(model4))

# Plotting Original Data with Fitted Line
# Histogram of Residual Values
# Histogram of model4's residuals, drawn with ggplot2.
# ggplot() fortifies the lm object into a data frame that carries the
# residuals as `.resid`; map that column rather than reaching outside the
# plot data with `model4$residuals`.
ggplot(data = model4, aes(x = .resid)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "black") +
  labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency")

# The same distribution drawn with base graphics.
hist(model4$residuals, breaks = 10)

# Q-Q Plot of Residual Values
# Normal Q-Q plot of model4's residuals, with qqline()'s reference line
# drawn on top.
qqnorm(y = resid(model4))
qqline(y = resid(model4))

# Draw plot()'s diagnostic plots for the fitted lm on a 2x2 grid.
# par() returns the previous settings; restore them afterwards so the grid
# layout does not leak into plots drawn later in the session.
old_par <- par(mfrow = c(2, 2))
plot(model4)
par(old_par)
