#https://www.kaggle.com/datasets/abdallaahmed77/healthcare-risk-factors-dataset
library(readr)
library(ggplot2)
# Load the raw healthcare risk-factor CSV from the working directory.
Healthcare <- read_csv(file = "dirty_v3_path.csv")
## Rows: 30000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): Gender, Medical Condition, random_notes
## dbl (17): Age, Glucose, Blood Pressure, BMI, Oxygen Saturation, LengthOfStay...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in an interactive session: View()
# relies on a data viewer that may not be available when the script is run in
# batch mode (Rscript, knitting), so it is skipped there.
if (interactive()) {
  View(Healthcare)
}

# Convert the imported table to a base data.frame. With
# stringsAsFactors = TRUE the character columns (Gender, Medical Condition,
# random_notes) become factors, and the column names are made syntactic
# (e.g. "Medical Condition" -> "Medical.Condition").
Healthcare <- as.data.frame(unclass(Healthcare),
                            stringsAsFactors = TRUE)

# Per-column overview: quartiles for numeric columns, level counts for
# factors, and the number of NA values in each column.
summary(Healthcare)
##       Age           Gender         Medical.Condition    Glucose      
##  Min.   :10.00   Female:12865   Hypertension:7120    Min.   : 20.32  
##  1st Qu.:45.00   Male  :12635   Diabetes    :6417    1st Qu.: 96.28  
##  Median :55.00   NA's  : 4500   Obesity     :3857    Median :110.50  
##  Mean   :54.62                  Healthy     :3039    Mean   :123.62  
##  3rd Qu.:66.00                  Asthma      :2037    3rd Qu.:136.61  
##  Max.   :89.00                  (Other)     :3030    Max.   :318.51  
##  NA's   :4500                   NA's        :4500    NA's   :4500    
##  Blood.Pressure        BMI        Oxygen.Saturation  LengthOfStay   
##  Min.   : 74.24   Min.   : 7.67   Min.   : 67.51    Min.   : 1.000  
##  1st Qu.:125.14   1st Qu.:24.59   1st Qu.: 93.00    1st Qu.: 3.000  
##  Median :138.32   Median :28.05   Median : 95.30    Median : 4.000  
##  Mean   :140.46   Mean   :28.48   Mean   : 94.95    Mean   : 4.414  
##  3rd Qu.:153.79   3rd Qu.:31.81   3rd Qu.: 97.38    3rd Qu.: 5.000  
##  Max.   :226.38   Max.   :56.85   Max.   :110.07    Max.   :19.000  
##  NA's   :4500                                                       
##   Cholesterol     Triglycerides        HbA1c           Smoking      
##  Min.   : 95.73   Min.   :-22.48   Min.   : 3.280   Min.   :0.0000  
##  1st Qu.:189.50   1st Qu.:141.28   1st Qu.: 5.330   1st Qu.:0.0000  
##  Median :211.84   Median :173.37   Median : 5.970   Median :0.0000  
##  Mean   :213.03   Mean   :176.84   Mean   : 6.294   Mean   :0.2798  
##  3rd Qu.:235.31   3rd Qu.:208.63   3rd Qu.: 6.920   3rd Qu.:1.0000  
##  Max.   :358.37   Max.   :421.51   Max.   :12.360   Max.   :1.0000  
##                                                                     
##     Alcohol       Physical.Activity   Diet.Score    Family.History  
##  Min.   :0.0000   Min.   :-3.680    Min.   :-1.75   Min.   :0.0000  
##  1st Qu.:0.0000   1st Qu.: 2.350    1st Qu.: 2.77   1st Qu.:0.0000  
##  Median :0.0000   Median : 3.590    Median : 3.79   Median :0.0000  
##  Mean   :0.2385   Mean   : 3.803    Mean   : 4.03   Mean   :0.4394  
##  3rd Qu.:0.0000   3rd Qu.: 5.060    3rd Qu.: 5.02   3rd Qu.:1.0000  
##  Max.   :1.0000   Max.   :12.410    Max.   :12.06   Max.   :1.0000  
##                                                                     
##   Stress.Level     Sleep.Hours    random_notes   noise_col        
##  Min.   :-2.440   Min.   : 1.59   ###  :7575   Min.   :-412.1696  
##  1st Qu.: 4.370   1st Qu.: 5.41   ??   :7547   1st Qu.: -68.2708  
##  Median : 5.900   Median : 6.23   ipsum:7476   Median :  -0.5107  
##  Mean   : 5.917   Mean   : 6.23   lorem:7402   Mean   :  -0.5170  
##  3rd Qu.: 7.440   3rd Qu.: 7.05                3rd Qu.:  66.8114  
##  Max.   :15.450   Max.   :10.35                Max.   : 467.8949  
## 
# Mean of the observed ages, used below to fill in the missing ones.
media_x <- mean(Healthcare$Age, na.rm = TRUE)
media_x
## [1] 54.61678
Healthcare[is.na(Healthcare$Age), "Age"] <- media_x

# Remove every row that still contains an NA in any column.
Healthcare <- na.omit(Healthcare)

# The 0/1 indicator columns are categorical, so store them as factors.
Healthcare[["Smoking"]] <- as.factor(Healthcare[["Smoking"]])
Healthcare[["Alcohol"]] <- as.factor(Healthcare[["Alcohol"]])
Healthcare[["Family.History"]] <- as.factor(Healthcare[["Family.History"]])

# Preview the first 20 cleaned rows.
head(Healthcare, 20)
##         Age Gender Medical.Condition Glucose Blood.Pressure   BMI
## 1  46.00000   Male          Diabetes  137.04         135.27 28.90
## 2  22.00000   Male           Healthy   71.58         113.27 26.29
## 5  66.00000 Female      Hypertension   95.15         178.17 31.12
## 6  50.00000   Male      Hypertension  107.00         171.80 25.33
## 10 13.00000 Female            Asthma   90.60         126.88 29.24
## 12 32.00000   Male           Healthy  114.41         122.99 24.92
## 14 60.00000   Male      Hypertension   92.30         128.30 27.33
## 15 74.00000   Male      Hypertension  104.78         125.27 29.19
## 17 45.00000   Male          Diabetes  204.82         135.38 29.67
## 18 64.00000 Female          Diabetes  278.41         151.13 33.49
## 19 54.61678 Female          Diabetes  219.73         157.39 29.21
## 22 78.00000   Male      Hypertension  118.70         118.00 32.42
## 23 46.00000   Male          Diabetes  147.24         135.78 34.65
## 24 63.00000   Male          Diabetes  198.52         142.32 33.84
## 25 58.00000   Male          Diabetes  184.13         143.06 33.54
## 26 82.00000 Female         Arthritis   80.65         124.50 42.44
## 27 74.00000 Female      Hypertension  114.31         154.22 29.55
## 29 33.00000   Male           Obesity  115.23         149.68 28.39
## 30 46.00000 Female      Hypertension  103.56         154.95 20.11
## 32 46.00000 Female      Hypertension  100.21         161.52 26.48
##    Oxygen.Saturation LengthOfStay Cholesterol Triglycerides HbA1c Smoking
## 1              96.04            6      231.88        210.56  7.61       0
## 2              97.54            2      165.57        129.41  4.91       0
## 5              94.90            4      259.53        115.85  5.98       0
## 6              95.21            5      233.10        203.67  5.48       1
## 10             97.96            6      193.44        163.65  6.18       1
## 12             95.71            1      153.71        135.57  4.77       1
## 14             96.30            4      198.27        164.57  4.30       1
## 15             99.43            6      184.59        195.14  4.34       1
## 17             88.88            5      194.29         94.98  8.70       1
## 18             95.79            6      222.42        148.66  7.68       0
## 19             95.93            5      213.50        219.72  7.06       0
## 22             98.07            4      235.06        275.50  4.86       1
## 23             88.59            6      222.02        195.05  6.80       0
## 24             90.77            3      177.28         64.13  7.92       0
## 25             88.40            4      223.54        141.01  7.37       1
## 26             92.68            2      210.98        189.89  5.53       0
## 27             93.09            4      206.79        220.85  5.05       0
## 29             95.34            3      217.10        168.43  5.29       0
## 30             94.51            6      235.27        139.26  6.09       0
## 32             94.94            2      251.62        189.48  6.91       0
##    Alcohol Physical.Activity Diet.Score Family.History Stress.Level Sleep.Hours
## 1        0             -0.20       3.54              0         5.07        6.05
## 2        0              8.12       5.90              0         5.87        7.72
## 5        1              3.56       3.40              0         6.38        6.64
## 6        0              5.01       4.65              0         7.25        6.35
## 10       0              5.27       5.63              0         7.09        5.41
## 12       0              4.92       6.55              1         3.40        7.99
## 14       0              6.78       2.95              1         6.60        7.24
## 15       1              4.97       4.17              0         7.73        6.47
## 17       1              3.98       2.58              1         5.91        6.03
## 18       1              1.10       3.40              0         7.52        5.94
## 19       0              4.38       2.21              1         7.92        6.06
## 22       0              2.09       5.44              1         8.24        5.37
## 23       0              1.60       2.20              0         6.03        5.10
## 24       0              2.25       2.56              0         4.89        6.17
## 25       0              3.69       2.73              1         8.02        7.11
## 26       1              3.10       4.78              0         6.30        6.87
## 27       0              2.95       5.50              1         8.32        6.43
## 29       1              1.75       3.07              0         4.67        6.23
## 30       0              5.95       1.68              1         8.62        4.98
## 32       0              4.91       3.37              1         6.50        6.88
##    random_notes   noise_col
## 1         lorem -137.057211
## 2         ipsum  -11.230610
## 5         lorem   44.831426
## 6         ipsum  108.411983
## 10        lorem  -45.916696
## 12           ??  -41.851833
## 14           ??  102.442797
## 15          ### -145.328903
## 17          ###  -32.371503
## 18           ??  130.220928
## 19        ipsum  105.291688
## 22        lorem -116.117153
## 23        ipsum   45.698900
## 24          ###  -32.924575
## 25        lorem  220.066222
## 26        lorem -164.764043
## 27        lorem   -9.454445
## 29           ??  -47.137590
## 30        lorem   46.769174
## 32          ###  -20.407802
# Work on plain text rather than factor codes.
Healthcare$random_notes <- as.character(Healthcare$random_notes)

# Notes that are missing, made up only of punctuation/whitespace, or empty
# after trimming carry no information; replace them with a fixed placeholder.
Healthcare$random_notes[
  is.na(Healthcare$random_notes) |
    grepl("^[[:punct:][:space:]]+$", Healthcare$random_notes) |
    trimws(Healthcare$random_notes) == ""
] <- "no aplica!"


# Plots ----
# List the available column names before plotting.
names(Healthcare)
##  [1] "Age"               "Gender"            "Medical.Condition"
##  [4] "Glucose"           "Blood.Pressure"    "BMI"              
##  [7] "Oxygen.Saturation" "LengthOfStay"      "Cholesterol"      
## [10] "Triglycerides"     "HbA1c"             "Smoking"          
## [13] "Alcohol"           "Physical.Activity" "Diet.Score"       
## [16] "Family.History"    "Stress.Level"      "Sleep.Hours"      
## [19] "random_notes"      "noise_col"
# Histogram of BMI, one panel per smoking status.
ggplot(data = Healthcare, mapping = aes(x = BMI)) +
  geom_histogram(bins = 10, colour = "white") +
  facet_wrap(~Smoking)

#
# Overlaid, semi-transparent BMI histograms for each smoking status, with a
# dashed red reference line at the mean BMI of the whole data set.
ggplot(Healthcare, aes(x = BMI, fill = Smoking)) +
  geom_histogram(bins = 10,
                 col = "white",
                 position = "identity",
                 alpha = 0.6) +
  # The mean is a single constant, so it is passed as a fixed parameter
  # instead of through aes(): a mapped scalar is repeated for every data row,
  # which stacks one identical line per observation on top of each other.
  geom_vline(xintercept = mean(Healthcare$BMI, na.rm = TRUE),
             color = "red", linewidth = 1.2,
             linetype = "dashed")

# Density plots ----

# BMI density curves, filled by smoking status and overlaid in one panel.
ggplot(data = Healthcare, mapping = aes(x = BMI, fill = Smoking)) +
  geom_density(alpha = 0.5)

# The same BMI density, split into one panel per smoking status.
ggplot(data = Healthcare, mapping = aes(x = BMI)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~Smoking) +
  labs(title = " ")

# Scatter plots ----

# Blood pressure against BMI, points coloured by smoking status.
ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_point()

# Same scatter plot, with a separate panel for each smoking status.
ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_point() +
  facet_wrap(~Smoking) +
  labs(title = " ")

# Boxplots ----
# BMI boxplots, one per smoking status (outline colour follows Smoking).
# A y = Blood.Pressure mapping was tried here and left disabled.
ggplot(data = Healthcare, mapping = aes(x = BMI, colour = Smoking)) +
  geom_boxplot()

#

# Blood-pressure boxplots, one panel per smoking status, with a dashed red
# horizontal line at the mean of Blood.Pressure.
ggplot(Healthcare, aes(y = Blood.Pressure,
                       color = Smoking)) +
  geom_boxplot() +
  facet_wrap(~Smoking) +
  geom_hline(aes(yintercept = mean(Blood.Pressure, na.rm = TRUE)),
             color = "red", linewidth = 1.2,
             linetype = "dashed") +
  # labs() must be chained with `+`; as a separate statement it is never
  # applied to the plot and only prints a bare labels object to the console.
  labs(title = " ")
library(GGally)

# Pairwise plot matrix for Age, Glucose, Blood.Pressure, BMI and Smoking
# (columns 1, 4, 5, 6 and 12 of the data frame).
ggpairs(
  Healthcare,
  columns = c("Age", "Glucose", "Blood.Pressure", "BMI", "Smoking")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same pairwise plot matrix, with every panel coloured by smoking status.
ggpairs(
  Healthcare,
  mapping = aes(colour = Smoking),
  columns = c("Age", "Glucose", "Blood.Pressure", "BMI", "Smoking")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Interactive plots
library(plotly)
## 
## Adjuntando el paquete: 'plotly'
## 
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## 
## The following object is masked from 'package:stats':
## 
##     filter
## 
## The following object is masked from 'package:graphics':
## 
##     layout
# Static scatter plot of blood pressure against BMI, coloured by smoking
# status, kept in `p` so it can be converted to an interactive widget.
p <- ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_point()

# Render `p` as an interactive plotly figure.
ggplotly(p)
# Smoothed trend of blood pressure over BMI using geom_smooth() defaults,
# one panel per smoking status.
p1 <- ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_smooth() +
  facet_wrap(~Smoking) +
  labs(title = " ")

# Render `p1` as an interactive plotly figure.
ggplotly(p1)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Straight-line (lm) fit of blood pressure on BMI without the confidence
# band, one panel per smoking status.
p2 <- ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(~Smoking) +
  labs(title = " ")

# Render `p2` as an interactive plotly figure.
ggplotly(p2)
## `geom_smooth()` using formula = 'y ~ x'
######

# BMI boxplots per medical condition, with a red point marking the mean BMI
# of each condition. coord_flip() swaps the axes for display.
ggplot(Healthcare, aes(x = BMI,
                       y = Medical.Condition,
                       fill = Medical.Condition)) +
  geom_boxplot() +
  labs(x = "BMI",
       y = "Medical.Condition",
       fill = "Medical.Condition") +
  # `fun` is the current name of the summary-function argument. The former
  # `fun.y` is ignored with a warning, which silently fell back to mean_se()
  # instead of the requested mean.
  geom_point(stat = "summary",
             fun = mean,
             shape = 16, size = 4,
             color = "red") +
  coord_flip()