La prueba de hipótesis es una técnica estadística que permite validar la significancia de los diseños de experimentos.
# Load the medical (diabetes risk) dataset from CSV.
# NOTE(review): read_csv() is not base R (presumably readr, attached in a
# chunk that is not visible here) -- confirm.
diabetes <- read_csv("diabetes_risk_dataset.csv")

# Strip the class of the object returned by read_csv() and rebuild it as a
# plain data.frame, converting character columns to factors on the way.
diabetes <- as.data.frame(unclass(diabetes), stringsAsFactors = TRUE)

# These columns are used as categorical variables, so force them to factors
# in a single pass instead of repeating one as.factor() line per column.
factor_cols <- c("family_history", "smoker", "at_risk_diabetes")
diabetes[factor_cols] <- lapply(diabetes[factor_cols], as.factor)
A continuación, se realizará una prueba de hipótesis para la media de una muestra:
\[ \left\{\begin{matrix} Ho: \mu =\mu _{x}\\ Ha: \mu \neq \mu _{x} \end{matrix}\right. \]
# One-sample, two-sided z-test of H0: mean(bmi) = 30 at a 95% confidence
# level. The sample standard deviation is supplied as sigma.x.
# NOTE(review): z.test() is not base R (presumably BSDA) -- confirm.
z.test(
  x = diabetes$bmi,
  sigma.x = sd(diabetes$bmi),
  mu = 30,
  # Spelled out in full: "two.side" only worked through partial matching.
  alternative = "two.sided",
  conf.level = 0.95
)
##
## One-sample z-Test
##
## data: diabetes$bmi
## z = -189.18, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 30
## 95 percent confidence interval:
## 27.00328 27.06474
## sample estimates:
## mean of x
## 27.03401
Con un nivel de confianza del 95 %, existe evidencia suficiente para rechazar Ho; es decir, la media del BMI no es igual a 30.
# Histogram of BMI (10 bins) with two dashed reference lines:
# red = sample mean of BMI, blue = hypothesised mean (30).
ggplot(diabetes, aes(x = bmi)) +
  geom_histogram(bins = 10, color = "white") +
  # The reference values are constants, so they are passed as plain arguments
  # rather than mapped inside aes(); this avoids evaluating them against
  # every row of the inherited data and avoids `diabetes$bmi` inside aes().
  geom_vline(
    xintercept = mean(diabetes$bmi),
    color = "red",
    linetype = "dashed"
  ) +
  geom_vline(xintercept = 30, color = "blue", linetype = "dashed") +
  labs(
    title = "Histograma de BMI de los pacientes",
    x = "Clases de BMI", # was "MBI": typo, the variable is BMI
    y = "Frecuencia Absoluta"
  )
# Box-plot statistics for BMI: whisker/hinge/median values ($stats), sample
# size ($n), notch interval ($conf) and the points beyond the whiskers ($out).
boxplot.stats(diabetes[["bmi"]])
## $stats
## [1] 15.00000 23.65709 27.00269 30.38185 40.46415
##
## $n
## [1] 100000
##
## $conf
## [1] 26.96909 27.03629
##
## $out
## [1] 41.80731 43.07589 40.61697 43.28871 40.58229 41.34413 40.56431 40.61561
## [9] 43.87327 42.09486 41.49260 44.37620 41.93864 40.49541 42.54965 40.59158
## [17] 40.83831 42.87602 41.20967 41.22930 42.75067 41.30092 42.14534 41.73040
## [25] 43.56828 41.22570 40.47542 40.60735 44.02603 42.38393 42.80473 42.00414
## [33] 41.65213 40.94578 42.67159 42.48751 41.82918 41.60928 43.97061 41.42286
## [41] 41.60750 41.79138 41.75191 43.11307 40.76528 40.74452 40.92929 41.91751
## [49] 41.47698 41.06362 42.46653 45.00000 42.01355 45.00000 42.94339 41.37895
## [57] 40.53330 42.97575 42.97608 40.87106 40.49964 42.39011 41.46570 41.19736
## [65] 40.87715 40.96856 42.42753 40.75440 41.41013 42.69973 42.90710 41.26074
## [73] 41.52821 42.01927 43.51768 41.90717 42.39776 42.40130 40.97398 41.06727
## [81] 41.24893 43.02476 42.81135 41.69732 43.90272 41.95614 42.00346 41.58712
## [89] 40.56515 42.94851 45.00000 40.88133 41.48848 40.79726 41.33535 43.04059
## [97] 44.99939 41.90229 41.95207 41.49458 40.60387 42.67736 41.05823 41.76999
## [105] 41.62840 40.90563 41.39812 41.84913 40.85448 42.25650 40.77332 43.30979
## [113] 43.51290 41.05501 45.00000 40.93366 40.74724 40.99690 45.00000 41.93016
## [121] 41.28702 40.74724 41.41221 40.88098 43.09700 42.94686 42.18680 40.53374
## [129] 40.91030 40.61525 41.23367 41.02181 42.35228 42.53081 44.30287 41.48568
## [137] 41.42824 42.55225 41.31226 40.71764 41.27276 40.49428 40.61265 41.03276
## [145] 42.55151 40.81737 44.49142 41.44629 41.37076 41.03194 42.77387 42.20048
## [153] 41.74788 41.15774 41.77526 42.22779 41.80682 42.26300 41.16168 42.88566
## [161] 41.30143 41.43337 40.50955 41.66166 41.92837 40.97119 42.14117 41.56911
## [169] 41.23049 40.68436 42.56821 41.61115 43.60098 42.75876 40.63862 40.60712
## [177] 40.96252 40.82450 40.92155 40.49013 41.57397 40.58847 44.75314 41.74531
## [185] 41.58159 40.98523 40.83006 41.70730 43.32711 40.59903 42.90144 41.17445
## [193] 41.07219 41.01215 41.85393 41.27929 41.42298 41.28693 40.56237 44.30638
## [201] 44.30683 41.02486 41.42569 42.49447 41.93508 42.78386 40.56124 41.40631
## [209] 41.88290 41.98697 41.25116 45.00000 41.19635 40.75621 44.92736 41.82001
## [217] 42.57334 40.69644 41.07915 41.70478 40.86009 41.30251 45.00000 41.20844
## [225] 45.00000 44.02829 43.45899 41.10081 41.05571 41.31457 45.00000 41.31244
## [233] 45.00000 42.27829 41.62035 42.50094 40.87540 44.65661 42.14843 45.00000
## [241] 40.99199 44.09775 42.38914 42.05548 43.08789 41.51423 41.25042 41.58192
## [249] 41.14282 41.16365 40.48203 40.51707 42.29717 42.57459 41.58005 42.95829
## [257] 41.33482 40.52631 45.00000 42.36815 41.14295 42.05732 42.07777 41.06823
## [265] 40.81157 42.03859 40.77750 41.93397 41.90005 42.98692 44.29767 40.72228
## [273] 41.01993 43.53056 44.52895 40.83605 40.74830 40.58078 42.04631 40.81755
## [281] 43.18399 42.03394 40.50202 41.72665 43.28322 42.46872 44.04133 42.62374
## [289] 41.36594 40.56883 41.86064 40.52633 41.55858 41.06132 41.17185 40.93733
## [297] 43.19990 40.61366 42.82675 42.68821 43.06159 41.83901 41.00460 44.40106
## [305] 43.16153 41.02473 40.69424 41.07784 41.80756 41.25283 40.70282 41.14681
## [313] 41.78803 40.49078 40.93820 41.09686 40.61625 41.32215 43.53522 44.54634
## [321] 42.50237 42.79511 41.79572 41.87012 41.90851 40.82233 41.03002 40.93098
## [329] 41.68710 41.55570 42.59338 40.80517 40.56201 42.01067 40.90585 41.01659
## [337] 40.61018 43.31600 41.58855 41.37773 41.24042 40.89653 42.24426
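Interpretación: la mediana del BMI es 27.00 y el 50 % central de los pacientes se ubica entre 23.66 y 30.38. De las 100 000 observaciones, 343 son valores atípicos, todos por encima del bigote superior (40.46).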
# Horizontal box plot of BMI with a dashed blue reference line at the
# hypothesised mean (30).
ggplot(diabetes, aes(x = bmi)) +
  geom_boxplot() +
  labs(
    title = "Boxplot del BMI de los pacientes",
    x = "BMI" # was "MBI": typo, the variable is BMI
  ) +
  # Constant reference value: passed as an argument instead of being mapped
  # inside aes(), so it is not evaluated against every row of the data.
  geom_vline(xintercept = 30, color = "blue", linetype = "dashed")
# Density curve of BMI with two dashed reference lines:
# red = sample mean of BMI, blue = hypothesised mean (30).
ggplot(diabetes, aes(x = bmi)) +
  geom_density(alpha = 0.6) +
  # Both reference values are constants, so they are set as arguments rather
  # than mapped inside aes() (which also avoids `diabetes$bmi` inside aes()).
  geom_vline(
    xintercept = mean(diabetes$bmi),
    color = "red",
    linetype = "dashed"
  ) +
  geom_vline(xintercept = 30, color = "blue", linetype = "dashed")
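Interpretación: la línea roja (media muestral del BMI, 27.03) queda a la izquierda de la línea azul (valor de referencia, 30), lo que concuerda con el rechazo de Ho en la prueba z.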
\[ \left\{\begin{matrix} Ho: \mu_{x1} =\mu _{x2}\\ Ha: \mu_{x1} \neq \mu _{x2} \end{matrix}\right. \]
# dividir la base
# Split the data into one data frame per level of family_history; the
# elements are then accessed by level name (`0` and `1`).
datahistory <- split(diabetes, diabetes$family_history)

# Two-sample, two-sided z-test of H0: mean(bmi | 0) - mean(bmi | 1) = 0 at a
# 95% confidence level, using each group's sample standard deviation.
# NOTE(review): z.test() is not base R (presumably BSDA) -- confirm.
z.test(
  x = datahistory$`0`$bmi,
  y = datahistory$`1`$bmi,
  sigma.x = sd(datahistory$`0`$bmi),
  sigma.y = sd(datahistory$`1`$bmi),
  mu = 0,
  # Spelled out in full: "two.side" only worked through partial matching.
  alternative = "two.sided",
  conf.level = 0.95
)
##
## Two-sample z-Test
##
## data: datahistory$`0`$bmi and datahistory$`1`$bmi
## z = -0.76561, p-value = 0.4439
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -0.09338595 0.04092189
## sample estimates:
## mean of x mean of y
## 27.02616 27.05239
Con un nivel de confianza del 95 %, no existe evidencia suficiente para rechazar Ho (p = 0.4439); es decir, no se detecta una diferencia entre el BMI medio de los pacientes con antecedentes familiares de diabetes y el de los pacientes sin antecedentes. Por consiguiente, la diferencia de medias no es significativa.
# Histogram of BMI (10 bins) with bars filled by family_history, plus a
# dashed red line at the overall sample mean of BMI.
ggplot(diabetes, aes(x = bmi, fill = family_history)) +
  geom_histogram(bins = 10, color = "white") +
  # The mean is a single constant, so it is passed as an argument instead of
  # being mapped inside aes() with `diabetes$bmi`.
  geom_vline(
    xintercept = mean(diabetes$bmi),
    color = "red",
    linetype = "dashed"
  )
# Histogram of BMI (10 bins), one panel per family_history level, with a
# dashed red line at the overall sample mean of BMI.
ggplot(diabetes, aes(x = bmi, fill = family_history)) +
  geom_histogram(bins = 10, color = "white") +
  # The mean is a single constant, so it is passed as an argument instead of
  # being mapped inside aes(); a line set this way is the same in all facets.
  geom_vline(
    xintercept = mean(diabetes$bmi),
    color = "red",
    linetype = "dashed"
  ) +
  facet_wrap(. ~ family_history)
# Box plots of BMI filled by family_history, with a dashed red line at the
# overall sample mean of BMI.
ggplot(diabetes, aes(y = bmi, fill = family_history)) +
  geom_boxplot() +
  # The mean is a single constant, so it is passed as an argument instead of
  # being mapped inside aes() with `diabetes$bmi`.
  geom_hline(
    yintercept = mean(diabetes$bmi),
    color = "red",
    linetype = "dashed"
  )
# Box plots of BMI, one panel per family_history level, with a dashed red
# line at the overall sample mean of BMI.
ggplot(diabetes, aes(y = bmi, fill = family_history)) +
  geom_boxplot() +
  # The mean is a single constant, so it is passed as an argument instead of
  # being mapped inside aes(); a line set this way is the same in all facets.
  geom_hline(
    yintercept = mean(diabetes$bmi),
    color = "red",
    linetype = "dashed"
  ) +
  facet_wrap(. ~ family_history)
# Overlaid, semi-transparent density curves of BMI by family_history, with a
# dashed red line at the overall sample mean of BMI.
ggplot(diabetes, aes(x = bmi, fill = family_history)) +
  geom_density(alpha = 0.2) +
  # The mean is a single constant, so it is passed as an argument instead of
  # being mapped inside aes() with `diabetes$bmi`.
  geom_vline(
    xintercept = mean(diabetes$bmi),
    color = "red",
    linetype = "dashed"
  )
# Density curves of BMI, one panel per family_history level, with a dashed
# red line at the overall sample mean of BMI.
ggplot(diabetes, aes(x = bmi, fill = family_history)) +
  geom_density(alpha = 0.2) +
  # The mean is a single constant, so it is passed as an argument instead of
  # being mapped inside aes(); a line set this way is the same in all facets.
  geom_vline(
    xintercept = mean(diabetes$bmi),
    color = "red",
    linetype = "dashed"
  ) +
  facet_wrap(. ~ family_history)
\[ \left\{\begin{matrix} Ho: \pi =\pi _{x}\\ Ha: \pi \neq \pi _{x} \end{matrix}\right. \]
# Frequency table of family_history (number of patients per level), printed
# so the counts can be read off for the proportion test.
tablaHistoria <- table(diabetes[["family_history"]])
tablaHistoria
##
## 0 1
## 70068 29932
# One-sample, two-sided proportion test of H0: p = 0.2 at a 95% confidence
# level, where p is the proportion of patients with family_history == 1.
prop.test(
  x = 29932, # successes: count of family_history == 1
  n = 100000, # total number of events (patients)
  p = 0.2, # hypothesised proportion
  # Spelled out in full: "two.side" only worked through partial matching.
  alternative = "two.sided",
  conf.level = 0.95
)
##
## 1-sample proportions test with continuity correction
##
## data: 29932 out of 1e+05, null probability 0.2
## X-squared = 6164.7, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.2
## 95 percent confidence interval:
## 0.2964844 0.3021711
## sample estimates:
## p
## 0.29932
# Bar chart with one bar per family_history level (bar height = row count).
ggplot(data = diabetes, mapping = aes(x = family_history)) +
  geom_bar()
\[ \left\{\begin{matrix} Ho: \pi_{x1} =\pi _{x2}\\ Ha: \pi_{x1} \neq \pi _{x2} \end{matrix}\right. \]
# Recompute and print the frequency table of family_history; its counts are
# the inputs of the proportion test that follows.
tablaHistoria <- table(diabetes[["family_history"]])
tablaHistoria
##
## 0 1
## 70068 29932
# Two-sample, two-sided test for equality of proportions at a 95% confidence
# level, comparing 70068/100000 against 29932/100000.
# NOTE(review): both counts are the two levels of family_history in the same
# 100000-row sample (they add up to n), so they are complementary rather than
# two independent groups as this test assumes. Confirm the intended analysis:
# either test one proportion against p = 0.5, or compare a proportion between
# two genuinely separate groups.
prop.test(
  x = c(70068, 29932), # successes in each group
  n = c(100000, 100000), # total number of events in each group
  # Spelled out in full: "two.side" only worked through partial matching.
  alternative = "two.sided",
  conf.level = 0.95
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(70068, 29932) out of c(1e+05, 1e+05)
## X-squared = 32216, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.3973359 0.4053841
## sample estimates:
## prop 1 prop 2
## 0.70068 0.29932