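El presente informe desarrolla un análisis estadístico completo del conjunto de datos Smokers Health Data, con enfoque en la limpieza y exploración de los datos, el análisis descriptivo y gráfico, y las pruebas de hipótesis sobre frecuencia cardíaca, colesterol y hábito de fumar.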
library(moments)
library(BSDA)
library(readr)
library(dplyr)
library(tidyverse)
library(broom)
library(ggplot2)
library(tidyr)
library(ggpubr)
library(DescTools)
# Load the dataset from the CSV file in the working directory.
Dta <- read_csv(file = "smoking_health_data_final.csv")
# First look at the data: dimensions, column names, leading rows, structure.
Dta %>% dim()
## [1] 3900 7
colnames(Dta)
## [1] "age" "sex" "current_smoker" "heart_rate"
## [5] "blood_pressure" "cigs_per_day" "chol"
head(Dta, n = 6)
str(object = Dta)
## spc_tbl_ [3,900 × 7] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ age : num [1:3900] 54 45 58 42 42 57 43 42 37 49 ...
## $ sex : chr [1:3900] "male" "male" "male" "male" ...
## $ current_smoker: chr [1:3900] "yes" "yes" "yes" "yes" ...
## $ heart_rate : num [1:3900] 95 64 81 90 62 62 75 66 65 93 ...
## $ blood_pressure: chr [1:3900] "110/72" "121/72" "127.5/76" "122.5/80" ...
## $ cigs_per_day : num [1:3900] NA NA NA NA NA NA NA NA NA NA ...
## $ chol : num [1:3900] 219 248 235 225 226 223 222 196 188 256 ...
## - attr(*, "spec")=
## .. cols(
## .. age = col_double(),
## .. sex = col_character(),
## .. current_smoker = col_character(),
## .. heart_rate = col_double(),
## .. blood_pressure = col_character(),
## .. cigs_per_day = col_double(),
## .. chol = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
# Per-column descriptive summary of the raw data, before any cleaning.
Dta %>% summary()
## age sex current_smoker heart_rate
## Min. :32.00 Length:3900 Length:3900 Min. : 44.00
## 1st Qu.:42.00 Class :character Class :character 1st Qu.: 68.00
## Median :49.00 Mode :character Mode :character Median : 75.00
## Mean :49.54 Mean : 75.69
## 3rd Qu.:56.00 3rd Qu.: 82.00
## Max. :70.00 Max. :143.00
##
## blood_pressure cigs_per_day chol
## Length:3900 Min. : 0.000 Min. :113.0
## Class :character 1st Qu.: 0.000 1st Qu.:206.0
## Mode :character Median : 0.000 Median :234.0
## Mean : 9.169 Mean :236.6
## 3rd Qu.:20.000 3rd Qu.:263.0
## Max. :70.000 Max. :696.0
## NA's :14 NA's :7
# Drop rows that are exact duplicates of an earlier row.
Dta <- distinct(Dta)
# Count the missing values in every column.
Dta %>%
  is.na() %>%
  colSums()
## age sex current_smoker heart_rate blood_pressure
## 0 0 0 0 0
## cigs_per_day chol
## 14 7
# Standardize the columns used downstream:
#   * current_smoker -> factor built from a 0/1 recoding of the raw labels
#   * heart_rate, chol, cigs_per_day, age -> numeric
#   * blood_pressure ("systolic/diastolic" text) -> two numeric columns
Dta <- mutate(
  Dta,
  # Recognised "yes" spellings map to 1, "no" spellings to 0; any other
  # value becomes NA.
  current_smoker = factor(case_when(
    current_smoker %in% c("yes", "Yes", "YES", "1", "si", "Si", "SI") ~ 1,
    current_smoker %in% c("no", "No", "NO", "0") ~ 0,
    TRUE ~ NA_real_
  )),
  across(c(heart_rate, chol, cigs_per_day, age), as.numeric),
  # Text before the slash is kept as systolic, text after it as diastolic.
  systolic = as.numeric(
    sub(pattern = "/.*", replacement = "", x = blood_pressure)
  ),
  diastolic = as.numeric(
    sub(pattern = ".*/", replacement = "", x = blood_pressure)
  )
)
# Median imputation: every NA in a numeric column is replaced by that
# column's median, computed over the non-missing values.
# NOTE(review): the median of cigs_per_day is 0 and the rows shown with a
# missing cigs_per_day are current smokers, so they end up with 0 cigarettes
# per day -- confirm this is intended, or impute within smoker groups.
num_cols <- names(Dta)[vapply(Dta, is.numeric, logical(1))]
Dta <- Dta %>%
  mutate(
    across(
      all_of(num_cols),
      function(v) replace(v, is.na(v), median(v, na.rm = TRUE))
    )
  )
# Derived 0/1 indicator columns:
#   chol_high -> 1 when chol is above 240
#   tachy     -> 1 when heart_rate is above 100
Dta <- mutate(
  Dta,
  chol_high = as.numeric(chol > 240),
  tachy = as.numeric(heart_rate > 100)
)
glimpse(Dta)
## Rows: 3,900
## Columns: 11
## $ age <dbl> 54, 45, 58, 42, 42, 57, 43, 42, 37, 49, 55, 39, 53, 45,…
## $ sex <chr> "male", "male", "male", "male", "male", "male", "male",…
## $ current_smoker <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0…
## $ heart_rate <dbl> 95, 64, 81, 90, 62, 62, 75, 66, 65, 93, 70, 85, 58, 83,…
## $ blood_pressure <chr> "110/72", "121/72", "127.5/76", "122.5/80", "119/80", "…
## $ cigs_per_day <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ chol <dbl> 219, 248, 235, 225, 226, 223, 222, 196, 188, 256, 214, …
## $ systolic <dbl> 110.0, 121.0, 127.5, 122.5, 119.0, 107.5, 109.5, 123.0,…
## $ diastolic <dbl> 72.0, 72.0, 76.0, 80.0, 80.0, 72.5, 69.0, 73.0, 77.0, 8…
## $ chol_high <dbl> 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1…
## $ tachy <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
# Descriptive summary of the cleaned numeric measurements.
Dta %>%
  select(age, heart_rate, chol, systolic, diastolic, cigs_per_day) %>%
  summary()
## age heart_rate chol systolic
## Min. :32.00 Min. : 44.00 Min. :113.0 Min. : 83.5
## 1st Qu.:42.00 1st Qu.: 68.00 1st Qu.:206.0 1st Qu.:117.0
## Median :49.00 Median : 75.00 Median :234.0 Median :128.0
## Mean :49.54 Mean : 75.69 Mean :236.6 Mean :132.4
## 3rd Qu.:56.00 3rd Qu.: 82.00 3rd Qu.:263.0 3rd Qu.:144.0
## Max. :70.00 Max. :143.00 Max. :696.0 Max. :295.0
## diastolic cigs_per_day
## Min. : 48.00 Min. : 0.000
## 1st Qu.: 75.00 1st Qu.: 0.000
## Median : 82.00 Median : 0.000
## Mean : 82.99 Mean : 9.136
## 3rd Qu.: 90.00 3rd Qu.:20.000
## Max. :142.50 Max. :70.000
# Per smoker status: group size, plus mean and standard deviation of
# heart rate and cholesterol.
summarise(
  group_by(Dta, current_smoker),
  n = n(),
  mean_hr = mean(heart_rate),
  sd_hr = sd(heart_rate),
  mean_chol = mean(chol),
  sd_chol = sd(chol)
)
# Heart-rate distribution: density-scaled histogram with a kernel density
# overlay. The dashed red line marks 75, the value tested against later.
# `after_stat(density)` and `linewidth` replace the deprecated `..density..`
# notation and the deprecated `size` argument for line geoms.
ggplot(Dta, aes(x = heart_rate)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "skyblue", alpha = 0.7
  ) +
  geom_density(linewidth = 1.2, color = "blue") +
  geom_vline(
    xintercept = 75, color = "red", linetype = "dashed", linewidth = 1.2
  ) +
  theme_minimal()
# Cholesterol distribution, drawn the same way; the dashed red line marks
# 200, the value tested against later.
ggplot(Dta, aes(x = chol)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30, fill = "orange", alpha = 0.7
  ) +
  geom_density(linewidth = 1.2, color = "darkorange") +
  geom_vline(
    xintercept = 200, color = "red", linetype = "dashed", linewidth = 1.2
  ) +
  theme_minimal()
# Row counts by cholesterol indicator, with a display label per category.
df_chol_high <- mutate(
  count(Dta, chol_high),
  label = if_else(chol_high == 1, "Alto (>240)", "Normal")
)
# Bar chart of those counts; each bar carries its count above it and the
# redundant fill legend is hidden.
df_chol_high %>%
  ggplot(aes(x = label, y = n, fill = label)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = n), vjust = -0.5, size = 6) +
  theme_minimal() +
  theme(legend.position = "none")
# Same table and chart for the tachycardia indicator.
df_tachy <- mutate(
  count(Dta, tachy),
  label = if_else(tachy == 1, "Taquicardia", "Normal")
)
df_tachy %>%
  ggplot(aes(x = label, y = n, fill = label)) +
  geom_col(alpha = 0.8) +
  geom_text(aes(label = n), vjust = -0.5, size = 6) +
  theme_minimal() +
  theme(legend.position = "none")
# Cholesterol by smoker status (box plot).
# Axis labels are given as a named vector keyed by factor level, so each
# label stays attached to the right group whatever the level order or set
# of levels present.
ggplot(Dta, aes(x = current_smoker, y = chol, fill = current_smoker)) +
  geom_boxplot(alpha = 0.8) +
  scale_x_discrete(labels = c("0" = "No fumador", "1" = "Fumador")) +
  theme_minimal() +
  theme(legend.position = "none")
# Heart rate by smoker status (violin plot with the raw points overlaid).
# height = 0 restricts the jitter to the horizontal axis: by default
# geom_jitter also displaces points vertically, which would alter the
# plotted heart-rate values.
ggplot(Dta, aes(x = current_smoker, y = heart_rate, fill = current_smoker)) +
  geom_violin(alpha = 0.7) +
  geom_jitter(width = 0.1, height = 0, alpha = 0.2) +
  scale_x_discrete(labels = c("0" = "No fumador", "1" = "Fumador")) +
  theme_minimal() +
  theme(legend.position = "none")
# One-sample t-test on heart rate. H0: mean = 75; the alternative is
# two-sided because `alternative` is left at its default.
# The call is kept verbatim: the "data:" label in the printed result is
# built from the argument expression.
t.test(Dta$heart_rate, mu=75)
##
## One Sample t-test
##
## data: Dta$heart_rate
## t = 3.5809, df = 3899, p-value = 0.0003465
## alternative hypothesis: true mean is not equal to 75
## 95 percent confidence interval:
## 75.31176 76.06619
## sample estimates:
## mean of x
## 75.68897
# One-sample, one-sided t-test on cholesterol.
# H0: mean = 200; H1: mean > 200.
t.test(Dta$chol, mu=200, alternative="greater")
##
## One Sample t-test
##
## data: Dta$chol
## t = 51.541, df = 3899, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 200
## 95 percent confidence interval:
## 235.4233 Inf
## sample estimates:
## mean of x
## 236.5913
# Exact binomial test on the share of rows flagged chol_high == 1.
# Successes = flagged rows, trials = all rows.
# H0: proportion = 0.20; H1: proportion > 0.20.
binom.test(sum(Dta$chol_high==1), nrow(Dta), p=0.20, alternative="greater")
##
## Exact binomial test
##
## data: sum(Dta$chol_high == 1) and nrow(Dta)
## number of successes = 1669, number of trials = 3900, p-value < 2.2e-16
## alternative hypothesis: true probability of success is greater than 0.2
## 95 percent confidence interval:
## 0.4148317 1.0000000
## sample estimates:
## probability of success
## 0.4279487
# Exact binomial test on the share of rows flagged tachy == 1.
# H0: proportion = 0.05; two-sided alternative (the default).
binom.test(sum(Dta$tachy==1), nrow(Dta), p=0.05)
##
## Exact binomial test
##
## data: sum(Dta$tachy == 1) and nrow(Dta)
## number of successes = 93, number of trials = 3900, p-value < 2.2e-16
## alternative hypothesis: true probability of success is not equal to 0.05
## 95 percent confidence interval:
## 0.01928904 0.02913444
## sample estimates:
## probability of success
## 0.02384615
# Welch two-sample t-test on cholesterol: smokers (x, current_smoker == 1)
# against non-smokers (y, current_smoker == 0), two-sided.
# The reported difference in means is therefore smokers minus non-smokers.
t.test(
Dta$chol[Dta$current_smoker==1],
Dta$chol[Dta$current_smoker==0]
)
##
## Welch Two Sample t-test
##
## data: Dta$chol[Dta$current_smoker == 1] and Dta$chol[Dta$current_smoker == 0]
## t = -2.9129, df = 3892, p-value = 0.003601
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.914781 -1.351271
## sample estimates:
## mean of x mean of y
## 234.5057 238.6387
# Welch two-sample t-test on heart rate: smokers (x) against non-smokers (y).
# One-sided: H1 is that the smokers' mean is greater than the non-smokers'.
t.test(
Dta$heart_rate[Dta$current_smoker==1],
Dta$heart_rate[Dta$current_smoker==0],
alternative="greater"
)
##
## Welch Two Sample t-test
##
## data: Dta$heart_rate[Dta$current_smoker == 1] and Dta$heart_rate[Dta$current_smoker == 0]
## t = 3.5809, df = 3896.4, p-value = 0.0001733
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.7434658 Inf
## sample estimates:
## mean of x mean of y
## 76.38302 75.00762
# Cross-tabulate smoker status (rows) against the high-cholesterol flag
# (columns).
tabla <- table(Dta$current_smoker, Dta$chol_high)
# Two-sample test for equality of proportions, without continuity
# correction: share of chol_high == 1 among smokers vs. among non-smokers.
# Cells are addressed by position: row 2 / row 1 are taken as smoker /
# non-smoker and column 2 as chol_high == 1. This assumes both levels of each
# variable are present, in 0, 1 order -- TODO confirm if the data change.
prop.test(
x = c(tabla[2,2], tabla[1,2]),
n = c(sum(tabla[2,]), sum(tabla[1,])),
correct = FALSE
)
##
## 2-sample test for equality of proportions without continuity correction
##
## data: c(tabla[2, 2], tabla[1, 2]) out of c(sum(tabla[2, ]), sum(tabla[1, ]))
## X-squared = 5.6732, df = 1, p-value = 0.01723
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## -0.068776154 -0.006711146
## sample estimates:
## prop 1 prop 2
## 0.4089027 0.4466463
La gráfica de frecuencia cardíaca muestra una distribución aproximadamente simétrica, concentrada entre los 70 y 80 latidos por minuto. La densidad es mayor en torno a los 75–77 lpm. La línea de referencia (75 lpm) queda por debajo del promedio estimado (75.69 lpm), lo cual concuerda con la prueba t de una muestra: se rechaza que la media sea igual a 75 y el intervalo de confianza del 95% (75.31–76.07) queda enteramente por encima de ese valor.
Esto significa que la muestra presenta frecuencias cardíacas mayormente normales, con una ligera tendencia hacia valores superiores a 75 lpm; es decir, la frecuencia cardíaca promedio está apenas por encima del valor de referencia.
La distribución del colesterol presenta un desplazamiento claro hacia valores altos. El promedio alrededor de 236 mg/dL está muy por encima del valor umbral de 200 mg/dL. Además, la dispersión evidencia que una parte considerable de la población presenta niveles potencialmente peligrosos.
La muestra presenta un riesgo aumentado de enfermedades cardiovasculares, dado que la mayoría de los valores están por encima de niveles saludables.
El 42.8% de los individuos tiene niveles de colesterol alto. Este valor es más del doble del 20% esperado para poblaciones sin factores de riesgo importantes.
Puede existir una prevalencia elevada de hipercolesterolemia, lo que indica un problema significativo de salud pública en la muestra.
La proporción de taquicardia es baja (2.38%). El gráfico confirma la poca frecuencia del evento en esta población.
Los no fumadores presentan una media de colesterol ligeramente más alta (238.6 mg/dL) que los fumadores (234.5 mg/dL). Aunque la diferencia es pequeña, es estadísticamente significativa.
La diferencia puede deberse a factores externos como edad, dieta, medicación o sesgos en la muestra. No se concluye una relación causal entre fumar y menor colesterol.
Los fumadores presentan frecuencias cardíacas mayores (76.38 vs 75.00 lpm). La gráfica muestra mayor densidad de valores altos para ellos. El aumento de frecuencia cardíaca en fumadores es fisiológicamente coherente con los efectos estimulantes de la nicotina.