Este informe presenta el análisis de un conjunto de datos sobre salud de personas fumadoras y no fumadoras. Se aplican pruebas de hipótesis para evaluar diferencias en frecuencia cardíaca, colesterol y proporciones clínicas.
# Install (once, if needed) and load the required packages.
# install.packages("readr")
# install.packages("ggplot2")
library(readr)
# NOTE(review): the plots visible in this section use base graphics only
# (hist, barplot, boxplot); confirm whether ggplot2 is still needed.
library(ggplot2)
# Read the data set from a CSV file given by a relative path. No column types
# are supplied here, so read_csv prints the column specification it chose.
df <- read_csv("smoking_health_data_final.csv")
## Rows: 3900 Columns: 7
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): sex, current_smoker, blood_pressure
## dbl (4): age, heart_rate, cigs_per_day, chol
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first six rows of the data set
head(df, n = 6L)
## # A tibble: 6 × 7
## age sex current_smoker heart_rate blood_pressure cigs_per_day chol
## <dbl> <chr> <chr> <dbl> <chr> <dbl> <dbl>
## 1 54 male yes 95 110/72 NA 219
## 2 45 male yes 64 121/72 NA 248
## 3 58 male yes 81 127.5/76 NA 235
## 4 42 male yes 90 122.5/80 NA 225
## 5 42 male yes 62 119/80 NA 226
## 6 57 male yes 62 107.5/72.5 NA 223
# Replace decimal commas with dots and coerce both measurement columns to
# numeric, applying the same conversion to each column in one pass.
df[c("heart_rate", "chol")] <- lapply(
  df[c("heart_rate", "chol")],
  function(values) as.numeric(gsub(",", ".", values))
)
# One-sample t-test: is the mean heart rate different from 75 (two-sided)?
# The argument expression is echoed in the printed "data:" line, so it is
# kept exactly as written.
t.test(df$heart_rate, mu = 75)
##
## One Sample t-test
##
## data: df$heart_rate
## t = 3.5809, df = 3899, p-value = 0.0003465
## alternative hypothesis: true mean is not equal to 75
## 95 percent confidence interval:
## 75.31176 76.06619
## sample estimates:
## mean of x
## 75.68897
# One-sample, one-sided t-test: is the mean cholesterol greater than 200?
# The argument expression is echoed in the printed "data:" line, so it is
# kept exactly as written.
t.test(df$chol, mu = 200, alternative = "greater")
##
## One Sample t-test
##
## data: df$chol
## t = 51.456, df = 3892, p-value < 2.2e-16
## alternative hypothesis: true mean is greater than 200
## 95 percent confidence interval:
## 235.4258 Inf
## sample estimates:
## mean of x
## 236.5959
# Plot: cholesterol histogram with a red vertical reference line at 200
hist(
  x = df$chol,
  main = "Histograma de Colesterol",
  xlab = "Colesterol",
  col = "skyblue",
  breaks = 20
)
abline(v = 200, lwd = 2, col = "red")
# Flag high cholesterol: 1 when chol > 240, 0 otherwise. A missing chol value
# yields NA (instead of being silently counted as "not high"), so rows without
# a measurement can be excluded from the test below.
df$colesterol_alto <- ifelse(df$chol > 240, 1, 0)
# Keep only the rows whose cholesterol status is known
df_clean <- df[!is.na(df$colesterol_alto), ]
# Counts: x = rows flagged as high cholesterol, n = rows with a known status
x <- sum(df_clean$colesterol_alto)
n <- nrow(df_clean)
# One-sided proportion test: is the share of high cholesterol greater than 0.2?
# The names x and n are echoed in the printed "data:" line, so they are kept.
# NOTE: output recorded before this change used every row as n (including rows
# with missing chol); re-run the report to refresh the figures.
if (n > 0 && x > 0) {
  prop.test(x, n, p = 0.2, alternative = "greater")
} else {
  cat("No hay suficientes datos válidos para realizar la prueba de proporción.\n")
}
##
## 1-sample proportions test with continuity correction
##
## data: x out of n, null probability 0.2
## X-squared = 1265.1, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is greater than 0.2
## 95 percent confidence interval:
## 0.4148435 1.0000000
## sample estimates:
## p
## 0.4279487
# Flag tachycardia: 1 when heart rate exceeds 100, 0 otherwise.
df$taquicardia <- ifelse(df$heart_rate > 100, 1, 0)
# Two-sided one-sample proportion test against a null proportion of 0.05.
# The argument expressions are echoed in the printed "data:" line, so they are
# kept exactly as written.
# NOTE(review): sum() and length() do not drop missing values here; a missing
# heart_rate would make the count NA. Confirm the column is complete before
# reusing this on other data.
prop.test(sum(df$taquicardia), length(df$taquicardia), p = 0.05)
##
## 1-sample proportions test with continuity correction
##
## data: sum(df$taquicardia) out of length(df$taquicardia), null probability 0.05
## X-squared = 55.613, df = 1, p-value = 8.825e-14
## alternative hypothesis: true p is not equal to 0.05
## 95 percent confidence interval:
## 0.01939026 0.02926409
## sample estimates:
## p
## 0.02384615
# Plot: bar chart of the number of rows per tachycardia flag
barplot(
  table(df$taquicardia),
  main = "Proporciones de Taquicardia",
  names.arg = c("Normal", "Taquicardia"),
  col = c("skyblue", "salmon")
)
# Welch two-sample t-test (two-sided): does mean cholesterol differ between
# current smokers ("yes") and non-smokers ("no")?
# The argument expressions are echoed in the printed "data:" line, so they are
# kept exactly as written.
t.test(df$chol[df$current_smoker == "yes"], df$chol[df$current_smoker == "no"])
##
## Welch Two Sample t-test
##
## data: df$chol[df$current_smoker == "yes"] and df$chol[df$current_smoker == "no"]
## t = -2.9119, df = 3884.8, p-value = 0.003612
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.925837 -1.352281
## sample estimates:
## mean of x mean of y
## 234.5067 238.6458
# Plot: cholesterol distribution for each smoking status
boxplot(
  chol ~ current_smoker,
  data = df,
  main = "Colesterol por hábito de fumar",
  col = c("orange", "green")
)
# Welch two-sample t-test (one-sided): is the mean heart rate of current
# smokers ("yes") greater than that of non-smokers ("no")?
# The argument expressions are echoed in the printed "data:" line, so they are
# kept exactly as written.
t.test(df$heart_rate[df$current_smoker == "yes"],
df$heart_rate[df$current_smoker == "no"],
alternative = "greater")
##
## Welch Two Sample t-test
##
## data: df$heart_rate[df$current_smoker == "yes"] and df$heart_rate[df$current_smoker == "no"]
## t = 3.5809, df = 3896.4, p-value = 0.0001733
## alternative hypothesis: true difference in means is greater than 0
## 95 percent confidence interval:
## 0.7434658 Inf
## sample estimates:
## mean of x mean of y
## 76.38302 75.00762
# Plot: heart-rate distribution for each smoking status
boxplot(
  heart_rate ~ current_smoker,
  data = df,
  main = "Frecuencia cardíaca por hábito de fumar",
  col = c("lightblue", "pink")
)
Con base en los resultados estadísticos y el análisis visual, se establecen las siguientes conclusiones: