library(tidyverse)
library(GGally)
# Fix the RNG seed. Nothing in the visible part of the script draws random
# numbers, so this only matters for code outside this view.
set.seed(123)

# Read the sleep-efficiency CSV into a tibble.
# NOTE(review): absolute, user-specific path -- the script only runs on a
# machine that has the file at exactly this location.
data1 <- read_csv(file = "/Users/mohamedhassan/Downloads/Sleep_Efficiency.csv")

# Column-by-column overview (quartiles, means, NA counts).
data1 %>% summary()
##        ID             Age           Gender         
##  Min.   :  1.0   Min.   : 9.00   Length:452        
##  1st Qu.:113.8   1st Qu.:29.00   Class :character  
##  Median :226.5   Median :40.00   Mode  :character  
##  Mean   :226.5   Mean   :40.29                     
##  3rd Qu.:339.2   3rd Qu.:52.00                     
##  Max.   :452.0   Max.   :69.00                     
##                                                    
##     Bedtime                        Wakeup time                    
##  Min.   :2021-01-03 00:30:00.00   Min.   :2021-01-03 08:30:00.00  
##  1st Qu.:2021-04-14 01:07:30.00   1st Qu.:2021-04-14 07:52:30.00  
##  Median :2021-07-20 23:30:00.00   Median :2021-07-20 16:00:00.00  
##  Mean   :2021-07-13 00:03:39.02   Mean   :2021-07-12 20:19:22.82  
##  3rd Qu.:2021-10-11 05:22:30.00   3rd Qu.:2021-10-11 05:52:30.00  
##  Max.   :2021-12-31 21:00:00.00   Max.   :2021-12-31 06:30:00.00  
##                                                                   
##  Sleep duration   Sleep efficiency REM sleep percentage Deep sleep percentage
##  Min.   : 5.000   Min.   :0.5000   Min.   :15.00        Min.   :18.00        
##  1st Qu.: 7.000   1st Qu.:0.6975   1st Qu.:20.00        1st Qu.:48.25        
##  Median : 7.500   Median :0.8200   Median :22.00        Median :58.00        
##  Mean   : 7.466   Mean   :0.7889   Mean   :22.62        Mean   :52.82        
##  3rd Qu.: 8.000   3rd Qu.:0.9000   3rd Qu.:25.00        3rd Qu.:63.00        
##  Max.   :10.000   Max.   :0.9900   Max.   :30.00        Max.   :75.00        
##                                                                              
##  Light sleep percentage   Awakenings    Caffeine consumption
##  Min.   : 7.00          Min.   :0.000   Min.   :  0.00      
##  1st Qu.:15.00          1st Qu.:1.000   1st Qu.:  0.00      
##  Median :18.00          Median :1.000   Median : 25.00      
##  Mean   :24.56          Mean   :1.641   Mean   : 23.65      
##  3rd Qu.:32.50          3rd Qu.:3.000   3rd Qu.: 50.00      
##  Max.   :63.00          Max.   :4.000   Max.   :200.00      
##                         NA's   :20      NA's   :25          
##  Alcohol consumption Smoking status     Exercise frequency
##  Min.   :0.000       Length:452         Min.   :0.000     
##  1st Qu.:0.000       Class :character   1st Qu.:0.000     
##  Median :0.000       Mode  :character   Median :2.000     
##  Mean   :1.174                          Mean   :1.791     
##  3rd Qu.:2.000                          3rd Qu.:3.000     
##  Max.   :5.000                          Max.   :5.000     
##  NA's   :14                             NA's   :6
# Derive the additional regression terms from the raw columns of data1.
data2 <- data1 %>%
  mutate(
    # Quadratic term: REM sleep percentage squared.
    rem_sleep_squared = `REM sleep percentage`^2,
    # Dummy-code smoking status: "Yes" -> 1, "No" -> 0. Any other value
    # (including a missing one) matches neither condition and becomes NA.
    # case_when() is used because recode() is superseded in dplyr.
    smoke_dichotomous = case_when(
      `Smoking status` == "Yes" ~ 1,
      `Smoking status` == "No" ~ 0
    ),
    # Interaction term: sleep efficiency (continuous, not squared) multiplied
    # by the smoking dummy.
    sleep_eff_smoke_interaction = `Sleep efficiency` * smoke_dichotomous
  )
data2
## # A tibble: 452 × 18
##       ID   Age Gender Bedtime             `Wakeup time`       `Sleep duration`
##    <dbl> <dbl> <chr>  <dttm>              <dttm>                         <dbl>
##  1     1    65 Female 2021-03-06 01:00:00 2021-03-06 07:00:00              6  
##  2     2    69 Male   2021-12-05 02:00:00 2021-12-05 09:00:00              7  
##  3     3    40 Female 2021-05-25 21:30:00 2021-05-25 05:30:00              8  
##  4     4    40 Female 2021-11-03 02:30:00 2021-11-03 08:30:00              6  
##  5     5    57 Male   2021-03-13 01:00:00 2021-03-13 09:00:00              8  
##  6     6    36 Female 2021-07-01 21:00:00 2021-07-01 04:30:00              7.5
##  7     7    27 Female 2021-07-21 21:00:00 2021-07-21 03:00:00              6  
##  8     8    53 Male   2021-08-16 00:30:00 2021-08-16 10:30:00             10  
##  9     9    41 Female 2021-04-05 02:30:00 2021-04-05 08:30:00              6  
## 10    10    11 Female 2021-09-16 01:00:00 2021-09-16 10:00:00              9  
## # ℹ 442 more rows
## # ℹ 12 more variables: `Sleep efficiency` <dbl>, `REM sleep percentage` <dbl>,
## #   `Deep sleep percentage` <dbl>, `Light sleep percentage` <dbl>,
## #   Awakenings <dbl>, `Caffeine consumption` <dbl>,
## #   `Alcohol consumption` <dbl>, `Smoking status` <chr>,
## #   `Exercise frequency` <dbl>, rem_sleep_squared <dbl>,
## #   smoke_dichotomous <dbl>, sleep_eff_smoke_interaction <dbl>
# Full model: Sleep duration is the response; the predictors are the squared
# REM term, the smoking dummy, the efficiency-by-smoking product, sleep
# efficiency, deep sleep percentage and age.
model1 <- lm(
  `Sleep duration` ~
    rem_sleep_squared +
    smoke_dichotomous +
    sleep_eff_smoke_interaction +
    `Sleep efficiency` +
    `Deep sleep percentage` +
    Age,
  data = data2
)
summary(model1)
## 
## Call:
## lm(formula = `Sleep duration` ~ rem_sleep_squared + smoke_dichotomous + 
##     sleep_eff_smoke_interaction + `Sleep efficiency` + `Deep sleep percentage` + 
##     Age, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.56320 -0.45588  0.00158  0.56214  2.65085 
## 
## Coefficients:
##                               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  8.2806861  0.3995824  20.723   <2e-16 ***
## rem_sleep_squared           -0.0001525  0.0002783  -0.548   0.5840    
## smoke_dichotomous           -1.0133272  0.5022720  -2.017   0.0442 *  
## sleep_eff_smoke_interaction  1.3058531  0.6346333   2.058   0.0402 *  
## `Sleep efficiency`          -0.3858172  0.6212145  -0.621   0.5349    
## `Deep sleep percentage`     -0.0047025  0.0046649  -1.008   0.3140    
## Age                         -0.0040664  0.0031302  -1.299   0.1946    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8657 on 445 degrees of freedom
## Multiple R-squared:  0.01552,    Adjusted R-squared:  0.002247 
## F-statistic: 1.169 on 6 and 445 DF,  p-value: 0.3216
# Refit model1 with the squared REM term removed. update() reuses the data
# argument stored in model1's call, so it is not repeated here.
model2 <- update(model1, . ~ . - rem_sleep_squared)
summary(model2)
## 
## Call:
## lm(formula = `Sleep duration` ~ smoke_dichotomous + sleep_eff_smoke_interaction + 
##     `Sleep efficiency` + `Deep sleep percentage` + Age, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.57663 -0.44891  0.00218  0.56329  2.59922 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  8.236321   0.390987  21.065   <2e-16 ***
## smoke_dichotomous           -1.024727   0.501447  -2.044   0.0416 *  
## sleep_eff_smoke_interaction  1.316814   0.633820   2.078   0.0383 *  
## `Sleep efficiency`          -0.500989   0.584122  -0.858   0.3915    
## `Deep sleep percentage`     -0.003663   0.004258  -0.860   0.3902    
## Age                         -0.004028   0.003127  -1.288   0.1984    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.865 on 446 degrees of freedom
## Multiple R-squared:  0.01486,    Adjusted R-squared:  0.003812 
## F-statistic: 1.345 on 5 and 446 DF,  p-value: 0.2441
# Refit model2 with deep sleep percentage removed. update() reuses the data
# argument stored in model2's call, so it is not repeated here.
model3 <- update(model2, . ~ . - `Deep sleep percentage`)
summary(model3)
## 
## Call:
## lm(formula = `Sleep duration` ~ smoke_dichotomous + sleep_eff_smoke_interaction + 
##     `Sleep efficiency` + Age, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.57271 -0.46611 -0.01631  0.55245  2.61113 
## 
## Coefficients:
##                              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  8.276003   0.388143  21.322   <2e-16 ***
## smoke_dichotomous           -0.983804   0.499040  -1.971   0.0493 *  
## sleep_eff_smoke_interaction  1.263409   0.630589   2.004   0.0457 *  
## `Sleep efficiency`          -0.809467   0.460941  -1.756   0.0798 .  
## Age                         -0.003789   0.003114  -1.217   0.2243    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8647 on 447 degrees of freedom
## Multiple R-squared:  0.01322,    Adjusted R-squared:  0.004392 
## F-statistic: 1.497 on 4 and 447 DF,  p-value: 0.2019
# Refit model3 with Age removed; this is the model the diagnostic plots
# further down are drawn from. update() reuses the stored data argument.
model4 <- update(model3, . ~ . - Age)
summary(model4)
## 
## Call:
## lm(formula = `Sleep duration` ~ smoke_dichotomous + sleep_eff_smoke_interaction + 
##     `Sleep efficiency`, data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.54472 -0.46056 -0.02788  0.54160  2.62589 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                   8.1695     0.3783  21.593   <2e-16 ***
## smoke_dichotomous            -1.0080     0.4989  -2.020   0.0439 *  
## sleep_eff_smoke_interaction   1.2856     0.6307   2.039   0.0421 *  
## `Sleep efficiency`           -0.8645     0.4590  -1.884   0.0603 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8652 on 448 degrees of freedom
## Multiple R-squared:  0.009953,   Adjusted R-squared:  0.003324 
## F-statistic: 1.501 on 3 and 448 DF,  p-value: 0.2135
# Refit model4 with the sleep efficiency main effect removed. update() reuses
# the stored data argument.
# NOTE(review): this keeps sleep_eff_smoke_interaction in the model without
# the `Sleep efficiency` main effect -- confirm that is intended.
model5 <- update(model4, . ~ . - `Sleep efficiency`)
summary(model5)
## 
## Call:
## lm(formula = `Sleep duration` ~ smoke_dichotomous + sleep_eff_smoke_interaction, 
##     data = data2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.54472 -0.46309  0.03691  0.53691  2.59004 
## 
## Coefficients:
##                             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                  7.46309    0.05026 148.487   <2e-16 ***
## smoke_dichotomous           -0.30158    0.32999  -0.914    0.361    
## sleep_eff_smoke_interaction  0.42111    0.43376   0.971    0.332    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8676 on 449 degrees of freedom
## Multiple R-squared:  0.002112,   Adjusted R-squared:  -0.002332 
## F-statistic: 0.4752 on 2 and 449 DF,  p-value: 0.622

# Residuals vs. Fitted Values Plot ----

# Residuals vs. fitted values for model4, drawn once with ggplot2 and once
# with base graphics.
# The ggplot data frame is built explicitly from fitted()/resid() instead of
# passing the lm object to ggplot(), which depends on the deprecated
# fortify.lm() conversion to create the .fitted/.resid columns.
ggplot(
  data.frame(.fitted = fitted(model4), .resid = resid(model4)),
  aes(x = .fitted, y = .resid)
) +
  geom_point() +
  # Dashed reference line at zero residual.
  geom_hline(yintercept = 0, linetype = "dashed") +
  labs(title = "Residual vs. Fitted Values Plot") +
  xlab("Fitted values") +
  ylab("Residuals")

plot(fitted(model4), resid(model4))

# Plotting Original Data with Fitted Line ----

# Histogram of Residual Values ----

# Histogram of model4's residuals, drawn once with ggplot2 and once with base
# graphics.
# The residuals are placed in an explicit data frame and mapped by column
# name; the previous aes(x = model4$residuals) reached outside the plot data
# and the lm object was only converted through the deprecated fortify.lm().
ggplot(data.frame(.resid = resid(model4)), aes(x = .resid)) +
  geom_histogram(bins = 10, fill = "steelblue", color = "black") +
  labs(title = "Histogram of Residuals", x = "Residuals", y = "Frequency")

# breaks = 10 is a suggestion to hist(), not an exact bin count.
hist(model4$residuals, breaks = 10)

# Q-Q Plot of Residual Values ----

# Normal Q-Q plot of model4's residuals, with the reference line added on top.
qqnorm(resid(model4))
qqline(resid(model4))

# Default plot() output for the lm fit, arranged in a 2 x 2 grid.
# par() returns the previous settings; they are restored afterwards so the
# grid layout does not leak into any plots drawn later in the session.
old_par <- par(mfrow = c(2, 2))
plot(model4)
par(old_par)