Prueba de hipótesis

Prueba de hipótesis para la media poblacional

Se va a trabajar con la base de datos titulada diabetes_risk_dataset.

#Paquetes
library(readr)
library(ggplot2)
library(BSDA)

Carga de la base de datos

# Read the CSV and rebuild it as a base data.frame so that character columns
# become factors (stringsAsFactors = TRUE).
# NOTE(review): columns stored as numeric codes are not touched by
# stringsAsFactors -- confirm whether any of them still need factor() before
# being used as grouping variables.
diabetes <- as.data.frame(
  unclass(read_csv("diabetes_risk_dataset.csv")),
  stringsAsFactors = TRUE
)

Transformación de datos codificados a variables tipo factor

Prueba de hipótesis para la edad

\[ \left\{\begin{matrix} H_{0}: \mu = \mu_{0}\\ H_{a}: \mu \neq \mu_{0} \end{matrix}\right. \qquad \text{con } \mu_{0} = 50 \]

# One-sample, two-sided z-test of H0: mean age = 50, at 95% confidence.
# The sample standard deviation is supplied as sigma.x, i.e. it stands in for
# the population standard deviation.
z.test(
  x = diabetes$age,
  sigma.x = sd(diabetes$age),
  mu = 50,
  # Spelled out in full: "two.side" was only accepted via partial matching.
  alternative = "two.sided",
  conf.level = 0.95
)
## 
##  One-sample z-Test
## 
## data:  diabetes$age
## z = -8.3075, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 50
## 95 percent confidence interval:
##  49.43794 49.65252
## sample estimates:
## mean of x 
##  49.54523

Con un nivel de confianza del 95 % (p-valor < 2.2e-16), existe evidencia suficiente para rechazar la hipótesis nula; es decir, la edad media de los pacientes no es igual a 50 años.

# Histogram of patient age (10 bins) with a dashed red line at the sample mean.
# The mean is passed as a constant `xintercept` instead of through aes():
# referencing `diabetes$age` inside aes() is discouraged by ggplot2, and a
# scalar mapped via aes() is recycled over every row of the data.
ggplot(diabetes, aes(x = age)) +
  geom_histogram(bins = 10, color = "white") +
  labs(
    title = "Histograma de edad de pacientes",
    x = "Clases de las edades",
    y = "Frecuencia Absoluta"
  ) +
  geom_vline(
    xintercept = mean(diabetes$age),
    color = "red",
    linetype = "dashed"
  )

Prueba de diferencia de medias

\[ \left\{\begin{matrix} H_{0}: \mu_{x1} = \mu_{x2}\\ H_{a}: \mu_{x1} \neq \mu_{x2} \end{matrix}\right. \]

### Diagnóstico gráfico

# BMI histograms (10 bins each), one panel per level of `smoker`.
ggplot(data = diabetes, mapping = aes(x = bmi)) +
  geom_histogram(color = "white", bins = 10) +
  facet_wrap(~smoker)

# Semi-transparent BMI histograms (10 bins) filled by `smoker`, with a dashed
# red line at the overall sample mean of BMI.
# The mean is a constant `xintercept` rather than an aes() mapping: using
# `diabetes$bmi` inside aes() is discouraged by ggplot2, and a scalar mapped
# via aes() is recycled over every row of the data.
ggplot(diabetes, aes(x = bmi, fill = smoker)) +
  geom_histogram(bins = 10, color = "white", alpha = 0.5) +
  geom_vline(
    xintercept = mean(diabetes$bmi),
    color = "red",
    linetype = "dashed"
  )

# Split the data by smoker status; the resulting list is indexed below by the
# level names `0` and `1`.
smokerIBM <- split(diabetes, diabetes$smoker)

# Two-sample, two-sided z-test of H0: mean BMI is equal in both groups
# (difference = 0), at 95% confidence. Each group's sample standard deviation
# is supplied as its sigma.
z.test(
  x = smokerIBM$`0`$bmi,
  y = smokerIBM$`1`$bmi,
  sigma.x = sd(smokerIBM$`0`$bmi),
  sigma.y = sd(smokerIBM$`1`$bmi),
  # Explicit null difference; the original left this argument empty (`mu=,`)
  # and silently relied on the default.
  mu = 0,
  # Spelled out in full: "two.side" was only accepted via partial matching.
  alternative = "two.sided",
  conf.level = 0.95
)
## 
##  Two-sample z-Test
## 
## data:  smokerIBM$`0`$bmi and smokerIBM$`1`$bmi
## z = -0.006229, p-value = 0.995
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.07664373  0.07615810
## sample estimates:
## mean of x mean of y 
##  27.03396  27.03420
# Overlaid, semi-transparent BMI density curves, one per level of `smoker`.
ggplot(data = diabetes, mapping = aes(bmi, fill = smoker)) +
  geom_density(alpha = 0.5) +
  labs(
    title = "Diagrama de densidad de BMI de los pacientes",
    x = "BMI",
    y = "Densidad"
  )

# BMI density curves filled by `smoker`, drawn in one panel per smoker level.
ggplot(data = diabetes, mapping = aes(bmi, fill = smoker)) +
  geom_density(alpha = 0.5) +
  facet_wrap(~smoker) +
  labs(
    title = "Diagrama de densidad de BMI de los pacientes",
    x = "BMI",
    y = "Densidad"
  )

# BMI boxplots filled by `smoker`, with a thick dashed red line at the overall
# sample mean of BMI.
# The mean is a constant `yintercept` rather than an aes() mapping; the
# original `aes(yintercept = mean(diabetes$bmi))` triggered ggplot2's
# "Use of `diabetes$bmi` is discouraged" warning.
ggplot(diabetes, aes(y = bmi, fill = smoker)) +
  geom_boxplot() +
  labs(
    title = "Diagrama de boxplot BMI de los pacientes",
    y = "BMI"
  ) +
  geom_hline(
    yintercept = mean(diabetes$bmi),
    color = "red",
    linewidth = 1.2,
    linetype = "dashed"
  )

# BMI boxplots filled by `smoker`, one panel per smoker level, each with a
# thick dashed red line at the overall (not per-panel) sample mean of BMI.
# The mean is a constant `yintercept` rather than an aes() mapping; the
# original `aes(yintercept = mean(diabetes$bmi))` triggered ggplot2's
# "Use of `diabetes$bmi` is discouraged" warning.
ggplot(diabetes, aes(y = bmi, fill = smoker)) +
  geom_boxplot() +
  labs(
    title = "Diagrama de boxplot BMI de los pacientes",
    y = "BMI"
  ) +
  geom_hline(
    yintercept = mean(diabetes$bmi),
    color = "red",
    linewidth = 1.2,
    linetype = "dashed"
  ) +
  facet_wrap(. ~ smoker)

# Prueba de hipótesis para proporciones

Diferencia de proporciones

\[ \left\{\begin{matrix} H_{0}: \pi_{x1} = \pi_{x2}\\ H_{a}: \pi_{x1} \neq \pi_{x2} \end{matrix}\right. \]

# Absolute frequency of each smoker level; printed for inspection.
tablasmoker <- table(diabetes$smoker)
print(tablasmoker)
## 
##     0     1 
## 79800 20200
# Two-sample test of equal proportions (continuity-corrected, two-sided, 95%).
# Counts and sample size are taken from `tablasmoker` instead of being typed in
# by hand, so the call stays in sync with the data.
# NOTE(review): both "groups" are complementary categories of the same sample
# (each count is divided by the full sample size), so they are not independent
# samples as this test assumes -- confirm this is the intended comparison. A
# one-sample test of a single share against p = 0.5 asks the same question
# without that assumption.
prop.test(
  x = as.vector(tablasmoker),
  n = rep(sum(tablasmoker), length(tablasmoker)),
  alternative = "two.sided",
  conf.level = 0.95,
  correct = TRUE
)
## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(79800, 20200) out of c(1e+05, 1e+05)
## X-squared = 71041, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  0.5924708 0.5995292
## sample estimates:
## prop 1 prop 2 
##  0.798  0.202
# Bar chart with the number of rows for each value of `smoker`.
ggplot(data = diabetes, mapping = aes(smoker)) +
  geom_bar()

# H0: p = 0.5  vs  Ha: p != 0.5
# One-sample proportion test (continuity-corrected, two-sided, 95%).
# The one-dimensional, two-entry table is passed directly: prop.test takes its
# first entry (level `0`) as the success count and the table total as the
# number of trials, so no counts are hard-coded.
prop.test(
  x = tablasmoker,
  p = 0.5,
  alternative = "two.sided",
  conf.level = 0.95,
  correct = TRUE
)
## 
##  1-sample proportions test with continuity correction
## 
## data:  79800 out of 1e+05, null probability 0.5
## X-squared = 35520, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.7954951 0.8004819
## sample estimates:
##     p 
## 0.798