library(readr)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ purrr     1.0.2
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the sleep-health survey CSV into a tibble.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# before running this script on another computer.
sleep_health_and_lifestyle_dataset <- read_csv(
  file = "C:/Users/acer/Desktop/Sleep_health_and_lifestyle_dataset.csv"
)
## Rows: 374 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Occupation, BMI Category, Blood Pressure, Sleep Disorder
## dbl (8): Person ID, Age, Sleep Duration, Quality of Sleep, Physical Activity...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the freshly loaded dataset in the spreadsheet-style viewer, but only
# when the session is interactive; a batch run has no viewer to display it in.
if (interactive()) {
  View(sleep_health_and_lifestyle_dataset)
}


# Number of rows: the first element of the dataset's dimensions.
no_of_rows <- dim(sleep_health_and_lifestyle_dataset)[1L]
print(no_of_rows)
## [1] 374
# Number of columns: the second element of the dataset's dimensions.
no_of_columns <- dim(sleep_health_and_lifestyle_dataset)[2L]
print(no_of_columns)
## [1] 13
# Preview the first ten records.
sleep_health_and_lifestyle_dataset %>%
  head(n = 10)
## # A tibble: 10 × 13
##    `Person ID` Gender   Age Occupation       `Sleep Duration` `Quality of Sleep`
##          <dbl> <chr>  <dbl> <chr>                       <dbl>              <dbl>
##  1           1 Male      27 Software Engine…              6.1                  6
##  2           2 Male      28 Doctor                        6.2                  6
##  3           3 Male      28 Doctor                        6.2                  6
##  4           4 Male      28 Sales Represent…              5.9                  4
##  5           5 Male      28 Sales Represent…              5.9                  4
##  6           6 Male      28 Software Engine…              5.9                  4
##  7           7 Male      29 Teacher                       6.3                  6
##  8           8 Male      29 Doctor                        7.8                  7
##  9           9 Male      29 Doctor                        7.8                  7
## 10          10 Male      29 Doctor                        7.8                  7
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## #   `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## #   `Daily Steps` <dbl>, `Sleep Disorder` <chr>
# Preview the last fifteen records.
sleep_health_and_lifestyle_dataset %>%
  tail(n = 15)
## # A tibble: 15 × 13
##    `Person ID` Gender   Age Occupation `Sleep Duration` `Quality of Sleep`
##          <dbl> <chr>  <dbl> <chr>                 <dbl>              <dbl>
##  1         360 Female    59 Nurse                   8.1                  9
##  2         361 Female    59 Nurse                   8.2                  9
##  3         362 Female    59 Nurse                   8.2                  9
##  4         363 Female    59 Nurse                   8.2                  9
##  5         364 Female    59 Nurse                   8.2                  9
##  6         365 Female    59 Nurse                   8                    9
##  7         366 Female    59 Nurse                   8                    9
##  8         367 Female    59 Nurse                   8.1                  9
##  9         368 Female    59 Nurse                   8                    9
## 10         369 Female    59 Nurse                   8.1                  9
## 11         370 Female    59 Nurse                   8.1                  9
## 12         371 Female    59 Nurse                   8                    9
## 13         372 Female    59 Nurse                   8.1                  9
## 14         373 Female    59 Nurse                   8.1                  9
## 15         374 Female    59 Nurse                   8.1                  9
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## #   `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## #   `Daily Steps` <dbl>, `Sleep Disorder` <chr>
#STATISTICAL TECHNIQUES

# Arithmetic mean of the Age column.
mean(sleep_health_and_lifestyle_dataset[["Age"]])
## [1] 42.18449
# Median of the Age column.
median(sleep_health_and_lifestyle_dataset[["Age"]])
## [1] 43
# Standard deviation of the Age column, stored and then printed.
standard_deviation <- sd(
  sleep_health_and_lifestyle_dataset[["Age"]]
)
print(standard_deviation)
## [1] 8.673133
# Variance of the Age column, stored and then printed.
variance <- var(
  sleep_health_and_lifestyle_dataset[["Age"]]
)
print(variance)
## [1] 75.22324
# Total daily steps per occupation, ordered from the largest total down.
Occupation_Daily_steps <- sleep_health_and_lifestyle_dataset %>%
  group_by(Occupation) %>%
  summarise(Total_daily_steps = sum(`Daily Steps`)) %>%
  arrange(desc(Total_daily_steps))
# Show the per-occupation step totals in the data viewer, but only when the
# session is interactive; a batch run has no viewer to display them in.
if (interactive()) {
  View(Occupation_Daily_steps)
}


# Per-gender summary: mean sleep duration, total daily steps, mean quality
# of sleep and mean stress level.
Gender_base <- sleep_health_and_lifestyle_dataset %>%
  group_by(Gender) %>%
  summarise(
    Average_sleep_duration = mean(`Sleep Duration`),
    Total_steps_taken = sum(`Daily Steps`),
    Average_quality_of_sleep = mean(`Quality of Sleep`),
    Average_Stress_level = mean(`Stress Level`)
  )
# Show the per-gender summary in the data viewer, but only when the session
# is interactive; a batch run has no viewer to display it in.
if (interactive()) {
  View(Gender_base)
}


#FILTERING
# Keep only the records whose stress level is strictly greater than 5.
# filter() drops rows where the comparison evaluates to NA, whereas logical
# `[` indexing would return an all-NA row for each of them.
high_stress_data <- sleep_health_and_lifestyle_dataset %>%
  filter(`Stress Level` > 5)
print(high_stress_data)
## # A tibble: 166 × 13
##    `Person ID` Gender   Age Occupation       `Sleep Duration` `Quality of Sleep`
##          <dbl> <chr>  <dbl> <chr>                       <dbl>              <dbl>
##  1           1 Male      27 Software Engine…              6.1                  6
##  2           2 Male      28 Doctor                        6.2                  6
##  3           3 Male      28 Doctor                        6.2                  6
##  4           4 Male      28 Sales Represent…              5.9                  4
##  5           5 Male      28 Sales Represent…              5.9                  4
##  6           6 Male      28 Software Engine…              5.9                  4
##  7           7 Male      29 Teacher                       6.3                  6
##  8           8 Male      29 Doctor                        7.8                  7
##  9           9 Male      29 Doctor                        7.8                  7
## 10          10 Male      29 Doctor                        7.8                  7
## # ℹ 156 more rows
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## #   `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## #   `Daily Steps` <dbl>, `Sleep Disorder` <chr>
#DATA CLEANING

# REMOVING DUPLICATES
# Keep the first occurrence of every distinct row and print the result.
# NOTE(review): all columns, including `Person ID`, take part in the
# comparison, so only fully identical rows are removed; confirm that this is
# the intended notion of "duplicate".
sleep_health_and_lifestyle_dataset <-
  sleep_health_and_lifestyle_dataset[
    !duplicated(sleep_health_and_lifestyle_dataset),
  ]
print(sleep_health_and_lifestyle_dataset)
## # A tibble: 374 × 13
##    `Person ID` Gender   Age Occupation       `Sleep Duration` `Quality of Sleep`
##          <dbl> <chr>  <dbl> <chr>                       <dbl>              <dbl>
##  1           1 Male      27 Software Engine…              6.1                  6
##  2           2 Male      28 Doctor                        6.2                  6
##  3           3 Male      28 Doctor                        6.2                  6
##  4           4 Male      28 Sales Represent…              5.9                  4
##  5           5 Male      28 Sales Represent…              5.9                  4
##  6           6 Male      28 Software Engine…              5.9                  4
##  7           7 Male      29 Teacher                       6.3                  6
##  8           8 Male      29 Doctor                        7.8                  7
##  9           9 Male      29 Doctor                        7.8                  7
## 10          10 Male      29 Doctor                        7.8                  7
## # ℹ 364 more rows
## # ℹ 7 more variables: `Physical Activity Level` <dbl>, `Stress Level` <dbl>,
## #   `BMI Category` <chr>, `Blood Pressure` <chr>, `Heart Rate` <dbl>,
## #   `Daily Steps` <dbl>, `Sleep Disorder` <chr>
#DATA TRANSFORMING

# Scaling numerical variables
# scale() returns a one-column matrix that carries centring/scaling
# attributes; as.numeric() flattens it so Age_scaled is stored as a plain
# numeric vector rather than a matrix column.
sleep_health_and_lifestyle_dataset$Age_scaled <- as.numeric(
  scale(sleep_health_and_lifestyle_dataset$Age)
)

# Encoding categorical variables
# Replace the Gender column with its factor representation.
sleep_health_and_lifestyle_dataset[["Gender"]] <- as.factor(
  sleep_health_and_lifestyle_dataset[["Gender"]]
)



#Data exploration and visualization

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggplot2)
library(tidyverse)
# Inspect the transformed dataset in the data viewer, but only when the
# session is interactive; a batch run has no viewer to display it in.
if (interactive()) {
  View(sleep_health_and_lifestyle_dataset)
}


# Column-by-column summary of the dataset.
sleep_health_and_lifestyle_dataset %>%
  summary()
##    Person ID         Gender         Age         Occupation       
##  Min.   :  1.00   Female:185   Min.   :27.00   Length:374        
##  1st Qu.: 94.25   Male  :189   1st Qu.:35.25   Class :character  
##  Median :187.50                Median :43.00   Mode  :character  
##  Mean   :187.50                Mean   :42.18                     
##  3rd Qu.:280.75                3rd Qu.:50.00                     
##  Max.   :374.00                Max.   :59.00                     
##  Sleep Duration  Quality of Sleep Physical Activity Level  Stress Level  
##  Min.   :5.800   Min.   :4.000    Min.   :30.00           Min.   :3.000  
##  1st Qu.:6.400   1st Qu.:6.000    1st Qu.:45.00           1st Qu.:4.000  
##  Median :7.200   Median :7.000    Median :60.00           Median :5.000  
##  Mean   :7.132   Mean   :7.313    Mean   :59.17           Mean   :5.385  
##  3rd Qu.:7.800   3rd Qu.:8.000    3rd Qu.:75.00           3rd Qu.:7.000  
##  Max.   :8.500   Max.   :9.000    Max.   :90.00           Max.   :8.000  
##  BMI Category       Blood Pressure       Heart Rate     Daily Steps   
##  Length:374         Length:374         Min.   :65.00   Min.   : 3000  
##  Class :character   Class :character   1st Qu.:68.00   1st Qu.: 5600  
##  Mode  :character   Mode  :character   Median :70.00   Median : 7000  
##                                        Mean   :70.17   Mean   : 6817  
##                                        3rd Qu.:72.00   3rd Qu.: 8000  
##                                        Max.   :86.00   Max.   :10000  
##  Sleep Disorder        Age_scaled.V1    
##  Length:374         Min.   :-1.7507504  
##  Class :character   1st Qu.:-0.7995371  
##  Mode  :character   Median : 0.0940269  
##                     Mean   : 0.0000000  
##                     3rd Qu.: 0.9011170  
##                     Max.   : 1.9388042
# CORRELATION Analysis
# Pairwise correlations between sleep duration, physical activity level and
# quality of sleep.
cor_matrix <- sleep_health_and_lifestyle_dataset %>%
  dplyr::select(
    `Sleep Duration`,
    `Physical Activity Level`,
    `Quality of Sleep`
  ) %>%
  cor()
print(cor_matrix)
##                         Sleep Duration Physical Activity Level Quality of Sleep
## Sleep Duration               1.0000000               0.2123603        0.8832130
## Physical Activity Level      0.2123603               1.0000000        0.1928965
## Quality of Sleep             0.8832130               0.1928965        1.0000000
# Scatter plot
# Sleep Duration on the x axis against Age on the y axis, drawn as purple
# points of size 4.
ggplot(
  data = sleep_health_and_lifestyle_dataset,
  mapping = aes(x = `Sleep Duration`, y = `Age`)
) +
  geom_point(colour = "purple", size = 4)

# Histogram
# Distribution of heart rate, with three fill colours supplied for the bars.
# The data expression is kept exactly as written because hist() builds its
# default title and axis label from it.
hist(
  sleep_health_and_lifestyle_dataset$`Heart Rate`,
  col = c("blue", "purple", "orange")
)

# Calculate the mean stress level for each combination of physical activity
# level, quality of sleep and sleep duration.
# `.groups = "drop"` returns an ungrouped tibble, so later verbs are not
# silently applied per group and summarise() emits no regrouping message.
mean_Stress_Level <- sleep_health_and_lifestyle_dataset %>%
  group_by(`Physical Activity Level`, `Quality of Sleep`, `Sleep Duration`) %>%
  summarise(Mean_Stress_Level = mean(`Stress Level`), .groups = "drop")
mean_Stress_Level
## # A tibble: 61 × 4
##    Physical Activity Lev…¹ `Quality of Sleep` `Sleep Duration` Mean_Stress_Level
##                      <dbl>              <dbl>            <dbl>             <dbl>
##  1                      30                  4              5.9                 8
##  2                      30                  6              6                   8
##  3                      30                  6              6.1                 8
##  4                      30                  6              6.2                 8
##  5                      30                  7              6.8                 6
##  6                      30                  9              8.3                 3
##  7                      30                  9              8.4                 3
##  8                      30                  9              8.5                 3
##  9                      32                  4              5.8                 8
## 10                      35                  5              6.4                 7
## # ℹ 51 more rows
## # ℹ abbreviated name: ¹​`Physical Activity Level`