library(readr)
library(estadistica)
library(ggplot2)

# Read the sleep health and lifestyle data set from the CSV file.
Sleep <- read_csv(file = "Sleep_health_and_lifestyle_dataset.csv")
## Rows: 374 Columns: 13
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Occupation, BMI Category, Blood Pressure, Sleep Disorder
## dbl (8): Person ID, Age, Sleep Duration, Quality of Sleep, Physical Activity...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Rebuild the data as a base data.frame; character columns become factors
# (see the str() output that follows).
Sleep <- as.data.frame(
  unclass(Sleep),
  stringsAsFactors = TRUE
)
# Inspect the resulting column types.
str(object = Sleep)
## 'data.frame':    374 obs. of  13 variables:
##  $ Person.ID              : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ Gender                 : Factor w/ 2 levels "Female","Male": 2 2 2 2 2 2 2 2 2 2 ...
##  $ Age                    : num  27 28 28 28 28 28 29 29 29 29 ...
##  $ Occupation             : Factor w/ 11 levels "Accountant","Doctor",..: 10 2 2 7 7 10 11 2 2 2 ...
##  $ Sleep.Duration         : num  6.1 6.2 6.2 5.9 5.9 5.9 6.3 7.8 7.8 7.8 ...
##  $ Quality.of.Sleep       : num  6 6 6 4 4 4 6 7 7 7 ...
##  $ Physical.Activity.Level: num  42 60 60 30 30 30 40 75 75 75 ...
##  $ Stress.Level           : num  6 8 8 8 8 8 7 6 6 6 ...
##  $ BMI.Category           : Factor w/ 4 levels "Normal","Normal Weight",..: 4 1 1 3 3 3 3 1 1 1 ...
##  $ Blood.Pressure         : Factor w/ 25 levels "115/75","115/78",..: 12 10 10 23 23 23 23 7 7 7 ...
##  $ Heart.Rate             : num  77 75 75 85 85 85 82 70 70 70 ...
##  $ Daily.Steps            : num  4200 10000 10000 3000 3000 3000 3500 8000 8000 8000 ...
##  $ Sleep.Disorder         : Factor w/ 3 levels "Insomnia","None",..: 2 2 2 3 3 1 1 2 2 2 ...
# Treat the sleep quality score as a categorical variable.
Sleep[["Quality.of.Sleep"]] <- as.factor(Sleep[["Quality.of.Sleep"]])

# Column-by-column summary of the data set.
summary(object = Sleep)
##    Person.ID         Gender         Age             Occupation Sleep.Duration 
##  Min.   :  1.00   Female:185   Min.   :27.00   Nurse     :73   Min.   :5.800  
##  1st Qu.: 94.25   Male  :189   1st Qu.:35.25   Doctor    :71   1st Qu.:6.400  
##  Median :187.50                Median :43.00   Engineer  :63   Median :7.200  
##  Mean   :187.50                Mean   :42.18   Lawyer    :47   Mean   :7.132  
##  3rd Qu.:280.75                3rd Qu.:50.00   Teacher   :40   3rd Qu.:7.800  
##  Max.   :374.00                Max.   :59.00   Accountant:37   Max.   :8.500  
##                                                (Other)   :43                  
##  Quality.of.Sleep Physical.Activity.Level  Stress.Level          BMI.Category
##  4:  5            Min.   :30.00           Min.   :3.000   Normal       :195  
##  5:  7            1st Qu.:45.00           1st Qu.:4.000   Normal Weight: 21  
##  6:105            Median :60.00           Median :5.000   Obese        : 10  
##  7: 77            Mean   :59.17           Mean   :5.385   Overweight   :148  
##  8:109            3rd Qu.:75.00           3rd Qu.:7.000                      
##  9: 71            Max.   :90.00           Max.   :8.000                      
##                                                                              
##  Blood.Pressure   Heart.Rate     Daily.Steps        Sleep.Disorder
##  130/85 :99     Min.   :65.00   Min.   : 3000   Insomnia   : 77   
##  125/80 :65     1st Qu.:68.00   1st Qu.: 5600   None       :219   
##  140/95 :65     Median :70.00   Median : 7000   Sleep Apnea: 78   
##  120/80 :45     Mean   :70.17   Mean   : 6817                     
##  115/75 :32     3rd Qu.:72.00   3rd Qu.: 8000                     
##  135/90 :27     Max.   :86.00   Max.   :10000                     
##  (Other):41
###
# Sleep.Disorder may contain missing values; in this data set they are treated
# as absence of a sleep disorder.
# The column is a factor, so assigning a label that is not an existing level
# only raises "invalid factor level, NA generated" and leaves the NAs in place.
# Recode through character instead, and use "None": it is the label the data
# already uses for "no disorder" and the one the binary indicator compares
# against.
Sleep$Sleep.Disorder <- as.character(Sleep$Sleep.Disorder)
Sleep$Sleep.Disorder[is.na(Sleep$Sleep.Disorder)] <- "None"
Sleep$Sleep.Disorder <- factor(Sleep$Sleep.Disorder)

# Split blood pressure ("systolic/diastolic") into two numeric columns.
# vapply() with character(1) guarantees one string per row, so the result type
# cannot silently change the way it can with sapply().
bp_split <- strsplit(as.character(Sleep$Blood.Pressure), "/", fixed = TRUE)
Sleep$Systolic <- as.numeric(vapply(bp_split, `[`, character(1), 1))
Sleep$Diastolic <- as.numeric(vapply(bp_split, `[`, character(1), 2))

# Auxiliary binary indicator for logistic modelling, stored as a factor:
# 0 = no disorder (Sleep.Disorder is "None"), 1 = some disorder is present.
Sleep$Sleep.Disorder.Binary <- factor(
  as.integer(Sleep$Sleep.Disorder != "None")
)

# Descriptive statistics for Age
# Sample size (number of rows).
n <- nrow(Sleep)
# Extremes
max(Sleep$Age)
## [1] 59
min(Sleep$Age)
## [1] 27
# Central tendency: mean (one decimal), median and mode
round(mean(Sleep$Age), digits = 1)
## [1] 42.2
median(Sleep$Age)
## [1] 43
as.numeric(moda(Sleep$Age))
## [1] 43
# Dispersion: range width, variance, standard deviation and
# coefficient of variation (scaled to percent)
max(Sleep$Age) - min(Sleep$Age)
## [1] 32
var(Sleep$Age)
## [1] 75.22324
sd(Sleep$Age)
## [1] 8.673133
coeficiente.variacion(Sleep$Age) * 100
##   variable
## 1    20.53
# Quartiles and the statistics used to draw a box plot
quantile(Sleep$Age, probs = seq(0, 1, by = 0.25))
##    0%   25%   50%   75%  100% 
## 27.00 35.25 43.00 50.00 59.00
boxplot.stats(Sleep$Age)
## $stats
## [1] 27 35 43 50 59
## 
## $n
## [1] 374
## 
## $conf
## [1] 41.7745 44.2255
## 
## $out
## numeric(0)
# Box plot of patient ages with default styling.
ggplot(data = Sleep, mapping = aes(x = Age)) +
  geom_boxplot() +
  ggtitle("Diagrama de boxplot de las edades") +
  xlab("Edades de los pacientes")

# Same box plot with a red fill.
ggplot(data = Sleep, mapping = aes(x = Age)) +
  geom_boxplot(fill = "red") +
  ggtitle("Diagrama de boxplot de las edades") +
  xlab("Edades de los pacientes")

# One box per gender, distinguished by fill colour.
ggplot(data = Sleep, mapping = aes(x = Age, fill = Gender)) +
  geom_boxplot() +
  ggtitle("Diagrama de boxplot de las edades") +
  xlab("Edades de los pacientes")

# List the available columns, including the derived ones.
colnames(Sleep)
##  [1] "Person.ID"               "Gender"                 
##  [3] "Age"                     "Occupation"             
##  [5] "Sleep.Duration"          "Quality.of.Sleep"       
##  [7] "Physical.Activity.Level" "Stress.Level"           
##  [9] "BMI.Category"            "Blood.Pressure"         
## [11] "Heart.Rate"              "Daily.Steps"            
## [13] "Sleep.Disorder"          "Systolic"               
## [15] "Diastolic"               "Sleep.Disorder.Binary"
# Box plot of age, one box per value of the binary disorder indicator.
ggplot(data = Sleep, mapping = aes(x = Age, fill = Sleep.Disorder.Binary)) +
  geom_boxplot() +
  ggtitle("Diagrama de boxplot de las edades") +
  xlab("Edades de los pacientes")

# Histogram of age with ggplot2's default binning
ggplot(data = Sleep, mapping = aes(x = Age)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Histogram of age with the number of classes taken from a Sturges-style rule.
# n is the number of rows in Sleep (computed in the summary section).
Clase <- 1 + 3.33 * log10(n)
Clase
## [1] 9.567662
# The bin count is derived from Clase (rounded up) instead of being typed in
# by hand, so it follows the data if the sample size changes.
ggplot(Sleep, aes(x = Age)) +
  geom_histogram(
    bins = ceiling(Clase),
    color = "white",
    fill = "red"
  ) +
  labs(
    title = "Histograma de edad de los pacientes",
    x = "Edad",
    y = "Frecuencia Absoluta"
  )

# Histogram of age (10 bins), filled by the binary disorder indicator.
ggplot(data = Sleep, mapping = aes(x = Age, fill = Sleep.Disorder.Binary)) +
  geom_histogram(bins = 10, color = "white") +
  ggtitle("Histograma de edad de los pacientes") +
  xlab("Edad") +
  ylab("Frecuencia Absoluta")

# Histogram of age (10 bins), filled by the binary disorder indicator, with
# vertical reference lines at the mean (blue) and the median (yellow) of Age.
# The positions are computed from the data rather than typed in as rounded
# constants, and are passed outside aes() so each statistic is drawn as a
# single line. The duplicated yellow layer was removed.
ggplot(Sleep, aes(x = Age, fill = Sleep.Disorder.Binary)) +
  geom_histogram(bins = 10, color = "white") +
  labs(
    title = "Histograma de edad de los pacientes",
    x = "Edad",
    y = "Frecuencia Absoluta"
  ) +
  geom_vline(xintercept = mean(Sleep$Age, na.rm = TRUE), color = "blue") +
  geom_vline(xintercept = median(Sleep$Age, na.rm = TRUE), color = "yellow")

# Density of age, one semi-transparent curve per BMI category.
# NOTE(review): the summary of BMI.Category lists both "Normal" and
# "Normal Weight" as separate levels; confirm whether they should be merged.
ggplot(data = Sleep, mapping = aes(x = Age, fill = BMI.Category)) +
  geom_density(alpha = 0.3) +
  ggtitle("Densidad de edad de los pacientes") +
  xlab("Edad") +
  ylab("Densidad")

# Same density plot, slightly more transparent and with a manual fill palette.
ggplot(data = Sleep, mapping = aes(x = Age, fill = BMI.Category)) +
  geom_density(alpha = 0.2) +
  ggtitle("Densidad de edad de los pacientes") +
  xlab("Edad") +
  ylab("Densidad") +
  scale_fill_manual(values = c("red", "blue", "yellow", "cyan"))