#https://www.kaggle.com/datasets/abdallaahmed77/healthcare-risk-factors-dataset
library(readr)
library(ggplot2)
# Load the raw ("dirty") healthcare risk-factor data from the working directory.
Healthcare <- readr::read_csv(file = "dirty_v3_path.csv")
## Rows: 30000 Columns: 20
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): Gender, Medical Condition, random_notes
## dbl (17): Age, Glucose, Blood Pressure, BMI, Oxygen Saturation, LengthOfStay...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Open the spreadsheet-style viewer only in interactive sessions, so the
# script can also be run unattended (Rscript, batch jobs, rendering).
if (interactive()) {
  View(Healthcare)
}
# Rebuild the data as a base data.frame so that every character column becomes
# a factor. as.data.frame() also makes the column names syntactic
# (e.g. "Medical Condition" -> "Medical.Condition").
Healthcare <- as.data.frame(unclass(Healthcare), stringsAsFactors = TRUE)
summary(Healthcare)
## Age Gender Medical.Condition Glucose
## Min. :10.00 Female:12865 Hypertension:7120 Min. : 20.32
## 1st Qu.:45.00 Male :12635 Diabetes :6417 1st Qu.: 96.28
## Median :55.00 NA's : 4500 Obesity :3857 Median :110.50
## Mean :54.62 Healthy :3039 Mean :123.62
## 3rd Qu.:66.00 Asthma :2037 3rd Qu.:136.61
## Max. :89.00 (Other) :3030 Max. :318.51
## NA's :4500 NA's :4500 NA's :4500
## Blood.Pressure BMI Oxygen.Saturation LengthOfStay
## Min. : 74.24 Min. : 7.67 Min. : 67.51 Min. : 1.000
## 1st Qu.:125.14 1st Qu.:24.59 1st Qu.: 93.00 1st Qu.: 3.000
## Median :138.32 Median :28.05 Median : 95.30 Median : 4.000
## Mean :140.46 Mean :28.48 Mean : 94.95 Mean : 4.414
## 3rd Qu.:153.79 3rd Qu.:31.81 3rd Qu.: 97.38 3rd Qu.: 5.000
## Max. :226.38 Max. :56.85 Max. :110.07 Max. :19.000
## NA's :4500
## Cholesterol Triglycerides HbA1c Smoking
## Min. : 95.73 Min. :-22.48 Min. : 3.280 Min. :0.0000
## 1st Qu.:189.50 1st Qu.:141.28 1st Qu.: 5.330 1st Qu.:0.0000
## Median :211.84 Median :173.37 Median : 5.970 Median :0.0000
## Mean :213.03 Mean :176.84 Mean : 6.294 Mean :0.2798
## 3rd Qu.:235.31 3rd Qu.:208.63 3rd Qu.: 6.920 3rd Qu.:1.0000
## Max. :358.37 Max. :421.51 Max. :12.360 Max. :1.0000
##
## Alcohol Physical.Activity Diet.Score Family.History
## Min. :0.0000 Min. :-3.680 Min. :-1.75 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.: 2.350 1st Qu.: 2.77 1st Qu.:0.0000
## Median :0.0000 Median : 3.590 Median : 3.79 Median :0.0000
## Mean :0.2385 Mean : 3.803 Mean : 4.03 Mean :0.4394
## 3rd Qu.:0.0000 3rd Qu.: 5.060 3rd Qu.: 5.02 3rd Qu.:1.0000
## Max. :1.0000 Max. :12.410 Max. :12.06 Max. :1.0000
##
## Stress.Level Sleep.Hours random_notes noise_col
## Min. :-2.440 Min. : 1.59 ### :7575 Min. :-412.1696
## 1st Qu.: 4.370 1st Qu.: 5.41 ?? :7547 1st Qu.: -68.2708
## Median : 5.900 Median : 6.23 ipsum:7476 Median : -0.5107
## Mean : 5.917 Mean : 6.23 lorem:7402 Mean : -0.5170
## 3rd Qu.: 7.440 3rd Qu.: 7.05 3rd Qu.: 66.8114
## Max. :15.450 Max. :10.35 Max. : 467.8949
##
# Mean age over the observed (non-missing) values; used below for imputation.
media_x <- mean(Healthcare$Age, na.rm = TRUE)
media_x
## [1] 54.61678
# Fill the missing ages with that mean.
Healthcare$Age <- replace(Healthcare$Age, is.na(Healthcare$Age), media_x)
# Drop every row that still has a missing value in any column.
Healthcare <- na.omit(Healthcare)
# Treat the 0/1 indicator columns as categorical variables.
indicator_cols <- c("Smoking", "Alcohol", "Family.History")
Healthcare[indicator_cols] <- lapply(Healthcare[indicator_cols], as.factor)
head(Healthcare, 20)
## Age Gender Medical.Condition Glucose Blood.Pressure BMI
## 1 46.00000 Male Diabetes 137.04 135.27 28.90
## 2 22.00000 Male Healthy 71.58 113.27 26.29
## 5 66.00000 Female Hypertension 95.15 178.17 31.12
## 6 50.00000 Male Hypertension 107.00 171.80 25.33
## 10 13.00000 Female Asthma 90.60 126.88 29.24
## 12 32.00000 Male Healthy 114.41 122.99 24.92
## 14 60.00000 Male Hypertension 92.30 128.30 27.33
## 15 74.00000 Male Hypertension 104.78 125.27 29.19
## 17 45.00000 Male Diabetes 204.82 135.38 29.67
## 18 64.00000 Female Diabetes 278.41 151.13 33.49
## 19 54.61678 Female Diabetes 219.73 157.39 29.21
## 22 78.00000 Male Hypertension 118.70 118.00 32.42
## 23 46.00000 Male Diabetes 147.24 135.78 34.65
## 24 63.00000 Male Diabetes 198.52 142.32 33.84
## 25 58.00000 Male Diabetes 184.13 143.06 33.54
## 26 82.00000 Female Arthritis 80.65 124.50 42.44
## 27 74.00000 Female Hypertension 114.31 154.22 29.55
## 29 33.00000 Male Obesity 115.23 149.68 28.39
## 30 46.00000 Female Hypertension 103.56 154.95 20.11
## 32 46.00000 Female Hypertension 100.21 161.52 26.48
## Oxygen.Saturation LengthOfStay Cholesterol Triglycerides HbA1c Smoking
## 1 96.04 6 231.88 210.56 7.61 0
## 2 97.54 2 165.57 129.41 4.91 0
## 5 94.90 4 259.53 115.85 5.98 0
## 6 95.21 5 233.10 203.67 5.48 1
## 10 97.96 6 193.44 163.65 6.18 1
## 12 95.71 1 153.71 135.57 4.77 1
## 14 96.30 4 198.27 164.57 4.30 1
## 15 99.43 6 184.59 195.14 4.34 1
## 17 88.88 5 194.29 94.98 8.70 1
## 18 95.79 6 222.42 148.66 7.68 0
## 19 95.93 5 213.50 219.72 7.06 0
## 22 98.07 4 235.06 275.50 4.86 1
## 23 88.59 6 222.02 195.05 6.80 0
## 24 90.77 3 177.28 64.13 7.92 0
## 25 88.40 4 223.54 141.01 7.37 1
## 26 92.68 2 210.98 189.89 5.53 0
## 27 93.09 4 206.79 220.85 5.05 0
## 29 95.34 3 217.10 168.43 5.29 0
## 30 94.51 6 235.27 139.26 6.09 0
## 32 94.94 2 251.62 189.48 6.91 0
## Alcohol Physical.Activity Diet.Score Family.History Stress.Level Sleep.Hours
## 1 0 -0.20 3.54 0 5.07 6.05
## 2 0 8.12 5.90 0 5.87 7.72
## 5 1 3.56 3.40 0 6.38 6.64
## 6 0 5.01 4.65 0 7.25 6.35
## 10 0 5.27 5.63 0 7.09 5.41
## 12 0 4.92 6.55 1 3.40 7.99
## 14 0 6.78 2.95 1 6.60 7.24
## 15 1 4.97 4.17 0 7.73 6.47
## 17 1 3.98 2.58 1 5.91 6.03
## 18 1 1.10 3.40 0 7.52 5.94
## 19 0 4.38 2.21 1 7.92 6.06
## 22 0 2.09 5.44 1 8.24 5.37
## 23 0 1.60 2.20 0 6.03 5.10
## 24 0 2.25 2.56 0 4.89 6.17
## 25 0 3.69 2.73 1 8.02 7.11
## 26 1 3.10 4.78 0 6.30 6.87
## 27 0 2.95 5.50 1 8.32 6.43
## 29 1 1.75 3.07 0 4.67 6.23
## 30 0 5.95 1.68 1 8.62 4.98
## 32 0 4.91 3.37 1 6.50 6.88
## random_notes noise_col
## 1 lorem -137.057211
## 2 ipsum -11.230610
## 5 lorem 44.831426
## 6 ipsum 108.411983
## 10 lorem -45.916696
## 12 ?? -41.851833
## 14 ?? 102.442797
## 15 ### -145.328903
## 17 ### -32.371503
## 18 ?? 130.220928
## 19 ipsum 105.291688
## 22 lorem -116.117153
## 23 ipsum 45.698900
## 24 ### -32.924575
## 25 lorem 220.066222
## 26 lorem -164.764043
## 27 lorem -9.454445
## 29 ?? -47.137590
## 30 lorem 46.769174
## 32 ### -20.407802
# Replace notes that are empty, or made up solely of punctuation/whitespace
# (e.g. "###", "??"), as well as missing notes, with a fixed placeholder.
# The column ends up as a character vector.
Healthcare$random_notes <- local({
  notes <- as.character(Healthcare$random_notes)
  only_symbols <- grepl("^[[:punct:][:space:]]+$", notes)
  is_empty <- trimws(notes) == ""
  notes[is_empty | only_symbols | is.na(notes)] <- "no aplica!"
  notes
})
# Plots
colnames(Healthcare)
## [1] "Age" "Gender" "Medical.Condition"
## [4] "Glucose" "Blood.Pressure" "BMI"
## [7] "Oxygen.Saturation" "LengthOfStay" "Cholesterol"
## [10] "Triglycerides" "HbA1c" "Smoking"
## [13] "Alcohol" "Physical.Activity" "Diet.Score"
## [16] "Family.History" "Stress.Level" "Sleep.Hours"
## [19] "random_notes" "noise_col"
# Histogram of BMI (10 bins), one panel per smoking status
ggplot(data = Healthcare, mapping = aes(x = BMI)) +
  geom_histogram(colour = "white", bins = 10) +
  facet_wrap(vars(Smoking))

# Overlaid, semi-transparent BMI histograms filled by smoking status, with the
# overall mean BMI marked by a dashed red vertical line.
ggplot(Healthcare, aes(x = BMI, fill = Smoking)) +
  geom_histogram(
    bins = 10,
    colour = "white",
    position = "identity",
    alpha = 0.6
  ) +
  # Pass the mean as a constant: mapping it inside aes() repeats the value for
  # every row, so one overlapping line is drawn per observation.
  geom_vline(
    xintercept = mean(Healthcare$BMI, na.rm = TRUE),
    colour = "red", linewidth = 1.2, linetype = "dashed"
  )

# Density plots
# BMI density curves overlaid in one panel, filled by smoking status
ggplot(data = Healthcare, mapping = aes(x = BMI, fill = Smoking)) +
  geom_density(alpha = 0.5)

# BMI density in a separate panel per smoking status (title left blank)
ggplot(data = Healthcare, mapping = aes(x = BMI)) +
  geom_density(alpha = 0.5) +
  facet_wrap(vars(Smoking)) +
  labs(title = " ")

# Scatter plots
# Blood pressure against BMI, points coloured by smoking status
ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_point()

# Same scatter plot, split into one panel per smoking status (title left blank)
ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_point() +
  facet_wrap(vars(Smoking)) +
  labs(title = " ")

# Box plots
# Box plot of BMI along the x axis, one box per smoking status
# (Blood.Pressure is not mapped in this plot).
ggplot(data = Healthcare, mapping = aes(x = BMI, colour = Smoking)) +
  geom_boxplot()

# Box plot of blood pressure in one panel per smoking status, with the overall
# mean blood pressure drawn as a dashed red reference line in every panel.
ggplot(Healthcare, aes(y = Blood.Pressure, colour = Smoking)) +
  geom_boxplot() +
  facet_wrap(vars(Smoking)) +
  # A constant intercept draws a single line per panel; mapping the mean inside
  # aes() would repeat it for every row of the data.
  geom_hline(
    yintercept = mean(Healthcare$Blood.Pressure, na.rm = TRUE),
    colour = "red", linewidth = 1.2, linetype = "dashed"
  ) +
  # labs() has to be chained with `+`; as a stand-alone statement it is never
  # applied to the plot and only prints a bare labels object.
  labs(title = " ")
library(GGally)
# Pairwise plot matrix of selected variables. Columns are chosen by name rather
# than by position, so the selection stays correct if the column order of the
# input file changes.
ggpairs(
  Healthcare,
  columns = c("Age", "Glucose", "Blood.Pressure", "BMI", "Smoking")
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same pairwise plot matrix, coloured by smoking status. Columns are chosen by
# name rather than by position, so the selection stays correct if the column
# order of the input file changes.
ggpairs(
  Healthcare,
  columns = c("Age", "Glucose", "Blood.Pressure", "BMI", "Smoking"),
  mapping = aes(colour = Smoking)
)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

#Graficos interactivos
library(plotly)
##
## Adjuntando el paquete: 'plotly'
##
## The following object is masked from 'package:ggplot2':
##
## last_plot
##
## The following object is masked from 'package:stats':
##
## filter
##
## The following object is masked from 'package:graphics':
##
## layout
# Interactive scatter plot of blood pressure against BMI, coloured by smoking
# status.
p <- ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_point()
ggplotly(p)
# Interactive smoothed trend (geom_smooth() defaults) of blood pressure against
# BMI, one panel per smoking status; the title is left blank.
p1 <- ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_smooth() +
  facet_wrap(vars(Smoking)) +
  labs(title = " ")
ggplotly(p1)
## `geom_smooth()` using method = 'gam' and formula = 'y ~ s(x, bs = "cs")'
# Interactive straight-line (lm) fit of blood pressure against BMI without the
# confidence band, one panel per smoking status; the title is left blank.
p2 <- ggplot(
  data = Healthcare,
  mapping = aes(x = BMI, y = Blood.Pressure, colour = Smoking)
) +
  geom_smooth(method = "lm", se = FALSE) +
  facet_wrap(vars(Smoking)) +
  labs(title = " ")
ggplotly(p2)
## `geom_smooth()` using formula = 'y ~ x'
######
# Box plot of BMI for each medical condition, with each group's mean overlaid
# as a red dot. coord_flip() swaps the axes of the finished plot.
ggplot(
  Healthcare,
  aes(x = BMI, y = Medical.Condition, fill = Medical.Condition)
) +
  geom_boxplot() +
  labs(
    x = "BMI",
    y = "Medical.Condition",
    fill = "Medical.Condition"
  ) +
  # Supply the summary function through `fun`: the old `fun.y` argument is not
  # recognised by the layer, which then warns and falls back to mean_se().
  stat_summary(
    fun = mean, geom = "point",
    shape = 16, size = 4, colour = "red"
  ) +
  coord_flip()
