# Import the data.
# read.csv() uses CSV-appropriate defaults (header = TRUE, sep = ",", only the
# double quote as quoting character, no comment character), so an apostrophe or
# a `#` inside a field cannot silently corrupt the parse the way the defaults
# of read.table() can.
# check.names = FALSE keeps the original column names, including the spaces
# (e.g. `Sleep Duration`), which the rest of the analysis refers to.
mydata <- read.csv("Sleep_health_and_lifestyle_dataset.csv",
                   check.names = FALSE)
head(mydata)
##   Person ID Gender Age           Occupation Sleep Duration Quality of Sleep
## 1         1   Male  27    Software Engineer            6.1                6
## 2         2   Male  28               Doctor            6.2                6
## 3         3   Male  28               Doctor            6.2                6
## 4         4   Male  28 Sales Representative            5.9                4
## 5         5   Male  28 Sales Representative            5.9                4
## 6         6   Male  28    Software Engineer            5.9                4
##   Physical Activity Level Stress Level BMI Category Blood Pressure Heart Rate
## 1                      42            6   Overweight         126/83         77
## 2                      60            8       Normal         125/80         75
## 3                      60            8       Normal         125/80         75
## 4                      30            8        Obese         140/90         85
## 5                      30            8        Obese         140/90         85
## 6                      30            8        Obese         140/90         85
##   Daily Steps Sleep Disorder
## 1        4200           None
## 2       10000           None
## 3       10000           None
## 4        3000    Sleep Apnea
## 5        3000    Sleep Apnea
## 6        3000       Insomnia

A data frame with 374 observations on 13 variables. Unit of observation: a person.

- Person ID: a unique number for each person.

Source: LAKSIKA THARMALINGAM (2023) Sleep Health and Lifestyle Dataset. Kaggle. https://www.kaggle.com/datasets/uom190346a/sleep-health-and-lifestyle-dataset

# Remove units (rows) with missing data
library(tidyr)
mydata <- drop_na(mydata)
library(psych)
# Descriptive statistics, leaving out columns 1, 2, 4, 9, 10 and 13
# (Person ID, Gender, Occupation, BMI Category, Blood Pressure, Sleep Disorder)
describe(mydata[, -c(1, 2, 4, 9, 10, 13)])
##                         vars   n    mean      sd median trimmed     mad    min
## Age                        1 374   42.18    8.67   43.0   41.84   10.38   27.0
## Sleep Duration             2 374    7.13    0.80    7.2    7.12    1.04    5.8
## Quality of Sleep           3 374    7.31    1.20    7.0    7.32    1.48    4.0
## Physical Activity Level    4 374   59.17   20.83   60.0   58.97   22.24   30.0
## Stress Level               5 374    5.39    1.77    5.0    5.36    2.97    3.0
## Heart Rate                 6 374   70.17    4.14   70.0   69.74    2.97   65.0
## Daily Steps                7 374 6816.84 1617.92 7000.0 6732.67 1482.60 3000.0
##                             max  range  skew kurtosis    se
## Age                        59.0   32.0  0.26    -0.92  0.45
## Sleep Duration              8.5    2.7  0.04    -1.29  0.04
## Quality of Sleep            9.0    5.0 -0.21    -0.77  0.06
## Physical Activity Level    90.0   60.0  0.07    -1.27  1.08
## Stress Level                8.0    5.0  0.15    -1.33  0.09
## Heart Rate                 86.0   21.0  1.22     2.21  0.21
## Daily Steps             10000.0 7000.0  0.18    -0.42 83.66

1. RQ1

library(psych)
# Descriptive statistics of Sleep Duration, computed separately for each gender.
# The grouping argument is spelled out in full (`group`) instead of relying on
# partial argument matching of `g`.
describeBy(mydata$`Sleep Duration`, group = mydata$Gender)
## 
##  Descriptive statistics by group 
## group: Female
##    vars   n mean   sd median trimmed  mad min max range skew kurtosis   se
## X1    1 185 7.23 0.88    7.2    7.23 1.33 5.8 8.5   2.7 0.07    -1.48 0.06
## ------------------------------------------------------------ 
## group: Male
##    vars   n mean   sd median trimmed  mad min max range  skew kurtosis   se
## X1    1 189 7.04 0.69    7.2    7.06 0.89 5.9 8.1   2.2 -0.27    -1.53 0.05
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
# Histogram of Sleep Duration (bin width 0.2), one panel per gender,
# panels stacked in a single column
ggplot(data = mydata, mapping = aes(x = `Sleep Duration`)) +
  geom_histogram(colour = "gray", binwidth = 0.2) +
  facet_wrap(vars(Gender), ncol = 1) +
  labs(y = "Frequency")

# Shapiro-Wilk normality test of Sleep Duration within the Female group
shapiro.test(mydata$`Sleep Duration`[mydata$Gender == "Female"])
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$`Sleep Duration`[mydata$Gender == "Female"]
## W = 0.89858, p-value = 6.361e-10
# Shapiro-Wilk normality test of Sleep Duration within the Male group
shapiro.test(mydata$`Sleep Duration`[mydata$Gender == "Male"])
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$`Sleep Duration`[mydata$Gender == "Male"]
## W = 0.87248, p-value = 1.513e-11
# Wilcoxon rank-sum test of Sleep Duration by Gender (non-parametric).
# correct = FALSE: no continuity correction; exact = FALSE: no exact p-value
# is computed (normal approximation is used instead).
# alternative = "greater" makes the test ONE-SIDED. The direction depends on
# the order of the Gender groups -- presumably Female first (alphabetical),
# i.e. H1: females sleep longer than males; confirm the level order.
wilcox.test(mydata$`Sleep Duration` ~ mydata$Gender,
            correct = FALSE,
            exact = FALSE,
            alternative = "greater")
## 
##  Wilcoxon rank sum test
## 
## data:  mydata$`Sleep Duration` by mydata$Gender
## W = 20036, p-value = 0.007211
## alternative hypothesis: true location shift is greater than 0
# Parametric counterpart of the test above: one-sided two-sample t-test.
# var.equal = TRUE requests the pooled-variance (classic) t-test rather than
# the Welch test, i.e. it assumes equal variances in both groups.
t.test(mydata$`Sleep Duration` ~ mydata$Gender,
       alternative = "greater",
       var.equal = TRUE)
## 
##  Two Sample t-test
## 
## data:  mydata$`Sleep Duration` by mydata$Gender
## t = 2.3624, df = 372, p-value = 0.009334
## alternative hypothesis: true difference in means between group Female and group Male is greater than 0
## 95 percent confidence interval:
##  0.05835527        Inf
## sample estimates:
## mean in group Female   mean in group Male 
##             7.229730             7.036508
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
# Effect size for the Wilcoxon rank-sum test.
# The test is re-run here with alternative = "two.sided" (unlike the one-sided
# test used for the p-value), and the resulting test object is passed to
# effectsize().
effectsize(wilcox.test(mydata$`Sleep Duration` ~ mydata$Gender,
                       correct = FALSE,
                       exact = FALSE,
                       alternative = "two.sided"))
## r (rank biserial) |       95% CI
## --------------------------------
## 0.15              | [0.03, 0.26]
# Verbal label for the effect size; 0.15 is entered by hand and must be kept
# in sync with the estimate computed by effectsize()
interpret_rank_biserial(0.15)
## [1] "small"
## (Rules: funder2019)

Conclusion: Using the sample data, we find that sleep duration differs between females and males (one-sided Wilcoxon rank-sum test, p = 0.007): sleep duration is higher for females. The effect size is small (r = 0.15). The variable Sleep Duration is numeric, but the Shapiro-Wilk test showed that it is not normally distributed in either group (p < 0.001 for both females and males). Because of this, a non-parametric test was more suitable than the t-test. In conclusion, females tend to sleep longer than males.

2. RQ2

Research question: Is there a correlation between Age and Sleep Duration?

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Scatterplot matrix of Age and Sleep Duration.
# Columns are selected by name rather than by position (3 and 5), so the plot
# does not silently change if the column order of the data changes.
ggpairs(mydata[, c("Age", "Sleep Duration")])

library(Hmisc)
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units
# Spearman rank correlation (with p-values) between Age and Sleep Duration.
# Columns are selected by name rather than by position (3 and 5), so the
# result does not silently change if the column order of the data changes.
rcorr(as.matrix(mydata[, c("Age", "Sleep Duration")]),
      type = "spearman")
##                 Age Sleep Duration
## Age            1.00           0.31
## Sleep Duration 0.31           1.00
## 
## n= 374 
## 
## 
## P
##                Age Sleep Duration
## Age                 0            
## Sleep Duration  0

Conclusion: There is a statistically significant correlation between Age and Sleep Duration (p < 0.001). Using the Spearman correlation, we found a semi-strong positive correlation (ρ = 0.31) between Age and Sleep Duration. The choice of Spearman was appropriate because the variables are not normally distributed. We would use the Pearson correlation if the variables were normally distributed.

3. RQ3

Research Question: Is there an association between Gender and the type of Sleep Disorder?

# Pearson chi-squared test of independence between Gender and Sleep Disorder.
# NOTE: per the chisq.test() documentation, `correct` (continuity correction)
# only applies to 2 x 2 tables, so it has no effect on this 2 x 3 table.
results <- chisq.test(mydata$Gender, mydata$`Sleep Disorder`, 
                      correct = TRUE)
results
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$Gender and mydata$`Sleep Disorder`
## X-squared = 54.306, df = 2, p-value = 1.613e-12
# Observed (empirical) frequencies, with row and column totals added
addmargins(results$observed)
##              mydata$`Sleep Disorder`
## mydata$Gender Insomnia None Sleep Apnea Sum
##        Female       36   82          67 185
##        Male         41  137          11 189
##        Sum          77  219          78 374

Explanation of Observed/Empirical Frequencies:

The observed value of 36 means that, in the sample, there are 36 Females with Insomnia. Similarly, the observed value of 11 means that there are 11 Males with Sleep Apnea. These values represent the actual counts recorded in the data.

# Expected (theoretical) frequencies under the null hypothesis of no
# association, rounded to two decimals
round(results$expected, 2)
##              mydata$`Sleep Disorder`
## mydata$Gender Insomnia   None Sleep Apnea
##        Female    38.09 108.33       38.58
##        Male      38.91 110.67       39.42

Explanation of Expected/Theoretical Frequencies:

If there was no association between Gender and the type of Sleep Disorder, the expected number of Females with Insomnia would be 38.09, and the expected number of Males with Sleep Apnea would be 39.42.

# Pearson residuals, (observed - expected) / sqrt(expected), rounded to two
# decimals. The component name is written out in full: `$res` only worked via
# partial matching of `residuals`. Note that these are NOT the standardized
# residuals, which chisq.test() stores separately in `$stdres`.
round(results$residuals, 2)
##              mydata$`Sleep Disorder`
## mydata$Gender Insomnia  None Sleep Apnea
##        Female    -0.34 -2.53        4.57
##        Male       0.33  2.50       -4.53

Explanation of Pearson Residuals:

There are fewer males with Sleep Apnea than expected (residual = -4.53), at a significance level of α = 0.1%.

There are more females with Sleep Apnea than expected (residual = 4.57), at a significance level of α = 0.1%.

library(effectsize)
# Effect size of the association between Gender and Sleep Disorder (Cramer's V)
effectsize::cramers_v(mydata$Gender, mydata$`Sleep Disorder`)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.37              | [0.28, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
# Verbal label for the effect size; 0.37 is entered by hand and must be kept
# in sync with the estimate computed by cramers_v()
interpret_cramers_v(0.37)
## [1] "large"
## (Rules: funder2019)

Based on the Chi-Square test, there is a statistically significant association between Gender and the type of Sleep Disorder (p<0.001) with a large effect size (V=0.37).