Estadística Descriptiva
library(readxl)
sifil=read_excel("C:\\Users\\Alumno\\Desktop\\baseTaller\\Sífilis.xlsx")
sifil=data.frame(sifil)
head(sifil)
## id Ficha Edad Sexo Raza Estado
## 1 1 1 De 20 a 24 años Hombre Negra Soltero
## 2 2 2 De 25 a 29 años Hombre Negra Soltero
## 3 3 3 De 30 a 34 años Mujer Blanca Soltero
## 4 4 4 De 20 a 24 años Mujer Mestiza Soltero
## 5 5 5 De 20 a 24 años Hombre Mestiza Soltero
## 6 6 6 De 20 a 24 años Hombre Mestiza Soltero
## Sífilis Curación Estrato DH
## 1 Sífilis Reciente Adquirida Sintomática 49 De 16 a 24 años 30
## 2 Sífilis Reciente Adquirida Sintomática 39 De 25 a 34 años 79
## 3 Sífilis Reciente Adquirida Latente 39 De 25 a 34 años 79
## 4 Sífilis Reciente Adquirida Sintomática 35 De 16 a 24 años 71
## 5 Sífilis Reciente Adquirida Sintomática 21 De 16 a 24 años 43
## 6 Sífilis Reciente Adquirida Latente 20 De 16 a 24 años 41
Creando Tablas Estadísticas
# Frequency table for column 4 of `sifil` (Sexo): counts and percentages per
# category plus a final "Total" row.
fi <- table(sifil[, 4])
# Named `pct` rather than `pi` so the built-in constant `pi` is not masked
# for the rest of the session.
pct <- round(prop.table(fi) * 100, 2)
f <- data.frame(fi)
p <- data.frame(pct)
# Column 4 holds sex (Hombre/Mujer), so the key column is labelled "Sexo".
names(f) <- c("Sexo", "Personas")
names(p) <- c("Sexo", "Porc.")
tb <- merge(f, p, by = "Sexo")
tl <- data.frame(Sexo = "Total", Personas = sum(fi), Porc. = sum(pct))
rbind(tb, tl)
## Sexo Personas Porc.
## 1 Hombre 564 66.35
## 2 Mujer 286 33.65
## 3 Total 850 100.00
creando una función para elaborar tablas estadísticas
#' Build a frequency table with counts, percentages and a "Total" row.
#'
#' @param vec Vector of categorical values, tabulated with `table()`.
#' @param x Name to give the category column of the result.
#' @param y Name to give the count column of the result.
#' @return A data frame with one row per category followed by a "Total" row.
#'   Columns: the categories (factor, named `x`), the counts (named `y`) and
#'   the percentages rounded to two decimals (named "Porc.").
tabfrec <- function(vec, x, y) {
  fi <- table(vec)
  # Keep the unrounded percentages: summing already-rounded values can make
  # the "Total" row show 99.99 instead of 100.
  pct <- as.vector(prop.table(fi)) * 100
  labels <- c(names(fi), "Total")
  # Rows are assembled directly in the order produced by table(), so the
  # categories are not re-sorted and "Total" always comes last.
  t2 <- data.frame(
    x1 = factor(labels, levels = unique(labels)),
    y1 = c(as.vector(fi), sum(fi)),
    Porc. = round(c(pct, sum(pct)), 2)
  )
  names(t2) <- c(x, y, "Porc.")
  t2
}
tabfrec(sifil[,3],'Edad','Personas')
## Edad Personas Porc.
## 1 De 16 a 19 años 120 14.12
## 2 De 20 a 24 años 380 44.71
## 3 De 25 a 29 años 168 19.76
## 4 De 30 a 34 años 66 7.76
## 5 De 35 a 39 años 41 4.82
## 6 De 40 a 44 años 47 5.53
## 7 De 45 a 49 años 28 3.29
## 8 Total 850 99.99
tabfrec(sifil[,4],'Sexo','Personas')
## Sexo Personas Porc.
## 1 Hombre 564 66.35
## 2 Mujer 286 33.65
## 3 Total 850 100.00
tabfrec(sifil[,5],'Raza','Personas')
## Raza Personas Porc.
## 1 Blanca 309 36.35
## 2 Mestiza 212 24.94
## 3 Negra 329 38.71
## 4 Total 850 100.00
Gráficos Estadísticos.
library(ggplot2)
# The categorical columns are character vectors (summary(sifil) reports
# "Class :character"), so levels() on the raw column returns NULL. Convert to
# a factor to list the categories.
levels(factor(sifil$Sexo))
## [1] "Hombre" "Mujer"
# All plots map columns by name inside aes() instead of indexing the data
# frame (sifil[, k]); this keeps the mapping tied to `data` and gives readable
# axis and legend titles. Columns used: Edad (3), Sexo (4), Raza (5),
# Curación (8), DH (10).

# Pie chart: stacked bar of Curación filled by Sexo, in polar coordinates
ggplot(sifil, aes(x = 1, y = Curación, fill = Sexo)) +
  geom_bar(stat = "identity") +
  coord_polar(theta = "y")
# Bar chart of one categorical variable, then stacked by a second one
ggplot(sifil, aes(x = Sexo)) +
  geom_bar()
ggplot(sifil, aes(x = Sexo, fill = Raza)) +
  geom_bar()
# Side-by-side bar chart of two qualitative variables
ggplot(sifil, aes(x = Sexo, fill = Raza)) +
  geom_bar(position = "dodge", color = "blue")
# One qualitative and one quantitative variable
ggplot(sifil, aes(x = Edad, y = Curación)) +
  geom_bar(stat = "identity") +
  theme(text = element_text(size = 9))
# Box plot
ggplot(sifil, aes(x = Raza, y = Curación)) +
  geom_boxplot()
# Box plots with two variables
ggplot(sifil, aes(x = Sexo, y = Curación)) +
  geom_boxplot()
ggplot(sifil, aes(x = Raza, y = Curación, fill = Sexo)) +
  geom_boxplot(position = "dodge")
# Scatter plot (correlation diagram)
ggplot(sifil, aes(x = Curación, y = DH)) +
  geom_point()
# Scatter plot coloured by sex
ggplot(sifil, aes(x = Curación, y = DH, color = Sexo)) +
  geom_point()
# Lines and points
ggplot(sifil, aes(x = Curación, y = DH)) +
  geom_line() +
  geom_point(size = 3)
#
Estadísticos descriptivos. Explotación de base de datos
summary(sifil)
## id Ficha Edad Sexo
## Min. : 1.0 Min. : 1.0 Length:850 Length:850
## 1st Qu.:213.2 1st Qu.:213.2 Class :character Class :character
## Median :425.5 Median :425.5 Mode :character Mode :character
## Mean :425.5 Mean :425.5
## 3rd Qu.:637.8 3rd Qu.:637.8
## Max. :850.0 Max. :850.0
## Raza Estado Sífilis Curación
## Length:850 Length:850 Length:850 Min. :15.00
## Class :character Class :character Class :character 1st Qu.:24.00
## Mode :character Mode :character Mode :character Median :33.00
## Mean :32.93
## 3rd Qu.:42.00
## Max. :50.00
## Estrato DH
## Length:850 Min. : 21.00
## Class :character 1st Qu.: 49.00
## Mode :character Median : 67.00
## Mean : 66.17
## 3rd Qu.: 85.00
## Max. :101.00
min(sifil[,8])
## [1] 15
max(sifil[,8])
## [1] 50
range(sifil[,8])
## [1] 15 50
mean(sifil[,8])
## [1] 32.93059
median(sifil[,8])
## [1] 33
var(sifil[,8])
## [1] 109.1295
sd(sifil[,8])
## [1] 10.4465
cor(sifil[,8],sifil[,10])
## [1] 0.9488814
length(sifil[,10])
## [1] 850
Estadísticas por grupo
m=tapply(sifil[,8],sifil[,3],mean)
s=tapply(sifil[,8],sifil[,3],sd)
me=tapply(sifil[,8],sifil[,3],median)
n=tapply(sifil[,8],sifil[,3],length)
cbind(media=round(m,2),desv=round(s,2),mediana=round(me,2),muestra=round(n,2))
## media desv mediana muestra
## De 16 a 19 años 32.42 10.69 32.5 120
## De 20 a 24 años 33.13 10.44 34.0 380
## De 25 a 29 años 32.61 10.60 32.5 168
## De 30 a 34 años 32.55 10.47 31.0 66
## De 35 a 39 años 33.07 10.42 33.0 41
## De 40 a 44 años 33.04 10.92 33.0 47
## De 45 a 49 años 34.82 8.35 37.5 28
Creando función para estadística por grupo
#' Descriptive statistics of a numeric variable split by a grouping variable.
#'
#' @param vector Numeric values to summarise.
#' @param cate Grouping variable, passed to `tapply()` as the index.
#' @return A matrix with one row per group and the columns `media` (mean),
#'   `desv` (standard deviation), `mediana` (median), `cv` (standard deviation
#'   divided by the mean) and `muestra` (group size), all rounded to two
#'   decimals.
estgru <- function(vector, cate) {
  # Apply one summary function to every group.
  por_grupo <- function(estadistico) tapply(vector, cate, estadistico)

  promedio <- por_grupo(mean)
  desviacion <- por_grupo(sd)

  cbind(
    media = round(promedio, 2),
    desv = round(desviacion, 2),
    mediana = round(por_grupo(median), 2),
    cv = round(desviacion / promedio, 2),
    muestra = round(por_grupo(length), 2)
  )
}
estgru(sifil[,8],sifil[,5])
## media desv mediana cv muestra
## Blanca 32.43 10.44 32.0 0.32 309
## Mestiza 33.95 10.46 35.5 0.31 212
## Negra 32.74 10.43 33.0 0.32 329
estgru(sifil[,8],sifil[,4])
## media desv mediana cv muestra
## Hombre 32.54 10.62 33 0.33 564
## Mujer 33.70 10.07 34 0.30 286
estgru(sifil[,8],sifil[,6])
## media desv mediana cv muestra
## casado 32.83 10.28 32 0.31 157
## Otro 33.12 11.78 34 0.36 17
## Soltero 32.95 10.47 33 0.32 676
Probabilidades: lanzar una moneda n veces
omega=c('c','s')
x=sample(omega,7,replace=TRUE)
t=table(x)
prop.table(t)
## x
## c s
## 0.2857143 0.7142857
10 maceteros con un determinado tipo de tierra 1, 15 con tipo de tierra 2, 25 con tipo de tierra 3
# Labels for the 50 plant pots: A1..A10 (soil type 1), B1..B15 (soil type 2)
# and C1..C25 (soil type 3). paste0() is the idiomatic form of
# paste(..., sep = "").
x <- c(paste0("A", 1:10), paste0("B", 1:15), paste0("C", 1:25))
x
## [1] "A1" "A2" "A3" "A4" "A5" "A6" "A7" "A8" "A9" "A10" "B1"
## [12] "B2" "B3" "B4" "B5" "B6" "B7" "B8" "B9" "B10" "B11" "B12"
## [23] "B13" "B14" "B15" "C1" "C2" "C3" "C4" "C5" "C6" "C7" "C8"
## [34] "C9" "C10" "C11" "C12" "C13" "C14" "C15" "C16" "C17" "C18" "C19"
## [45] "C20" "C21" "C22" "C23" "C24" "C25"
prop=c(rep(10/50,10),rep(15/50,15),rep(25/50,25))
sample(x,15,replace=TRUE,prop)
## [1] "C24" "B10" "C6" "B8" "C9" "C21" "A4" "C9" "C22" "C15" "C19"
## [12] "B13" "C19" "A5" "C25"
sample(x,25,replace=FALSE,prop)
## [1] "C5" "C11" "C3" "A5" "B13" "C6" "B8" "C24" "C1" "C16" "C7"
## [12] "B14" "C22" "B2" "C17" "C10" "C14" "B3" "C2" "C25" "B12" "C18"
## [23] "C19" "A10" "C15"
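Probabilidad binomial: en una investigación en el Nevado de Toluca en 1990, el 30% de los pinos tenía presencia de hongos. En una muestra de 325, ¿cuál es la probabilidad de que a lo sumo 90 estén infectados? (pbinom devuelve la probabilidad acumulada P(X ≤ 90); para exactamente 90 se usaría dbinom.)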
pbinom(90,325,0.30)
## [1] 0.1990518
pbinom(10,325,0.30)
## [1] 3.224579e-36
Probabilidad normal
pnorm(13,mean=23,sd=8)
## [1] 0.1056498
pnorm(1.23)
## [1] 0.8906514
#inverse
qnorm(0.05)
## [1] -1.644854
qnorm(0.97)
## [1] 1.880794
qnorm(0.97,23,8)
## [1] 38.04635
probabilidad t student
pt(12,12,3)
## [1] 0.9999203
pt(1.2,12)
## [1] 0.8733526
#inversa
qt(0.40,12,3)
## [1] 2.771237
qt(0.05,12)
## [1] -1.782288
probabilidad chi-cuadrado
pchisq(4.2,12)
## [1] 0.02044908
qchisq(0.05,12)
## [1] 5.226029
probabilidad F
pf(4.4,12,13)
## [1] 0.9936507
qf(0.90,12,13)
## [1] 2.096588
Estimación de parámetros
# Draw a simple random sample of n = 30 rows, without replacement, from sifil.
N <- nrow(sifil)
n <- 30
slc <- sample(seq_len(N), n, replace = FALSE)
# Select rows directly by the sampled positions. Matching the positions
# against the `id` column only works when id equals the row number. Sorting
# keeps the selected rows in their original order.
msifil <- sifil[sort(slc), ]
#Estimación de Parámetros para la proporciones
table(msifil[,4])
##
## Hombre Mujer
## 18 12
# Take the count from the sample itself instead of a hard-coded 20: the sample
# is random, and the table above shows a different count for this draw.
# NOTE(review): assumes the category of interest is "Hombre" — confirm.
# The interval echoed below was produced with the fixed count of 20, so it will
# differ after re-running.
prop.test(sum(msifil[, 4] == "Hombre"), n, conf.level = 0.95)$conf.int
## [1] 0.4713741 0.8206242
## attr(,"conf.level")
## [1] 0.95
prop.table(table(sifil[,4]))
##
## Hombre Mujer
## 0.6635294 0.3364706
#otro
table(msifil[,5])
##
## Blanca Mestiza Negra
## 12 9 9
prop.test(10,n,conf.level=0.95)$conf.int
## [1] 0.1793758 0.5286259
## attr(,"conf.level")
## [1] 0.95
prop.table(table(sifil[,5]))
##
## Blanca Mestiza Negra
## 0.3635294 0.2494118 0.3870588
#Estimación de parámetros para el promedio
t.test(msifil[,8],conf.level=0.95)$conf.int
## [1] 30.33477 38.46523
## attr(,"conf.level")
## [1] 0.95
mean(sifil[,8])
## [1] 32.93059
#estimación de parámetros para la varianza
#' Chi-squared confidence interval for a variance.
#'
#' @param vector Numeric sample.
#' @param alpha Significance level; the reported confidence level is
#'   `1 - alpha`.
#' @return A matrix with the columns `inf` (lower limit) and `sup` (upper
#'   limit), both rounded to two decimals, and `level.conf` (`1 - alpha`).
test.var <- function(vector, alpha) {
  grados <- length(vector) - 1
  # (n - 1) * s^2, the numerator shared by both limits.
  suma_cuadrados <- grados * var(vector)

  limite_inf <- suma_cuadrados / qchisq(1 - alpha / 2, grados)
  limite_sup <- suma_cuadrados / qchisq(alpha / 2, grados)

  cbind(
    inf = round(limite_inf, 2),
    sup = round(limite_sup, 2),
    level.conf = 1 - alpha
  )
}
test.var(msifil[,8],0.01)
## inf sup level.conf
## [1,] 65.68 261.96 0.99
var(sifil[,8])
## [1] 109.1295
test.var(msifil[,10],0.05)
## inf sup level.conf
## [1,] 300.7 856.78 0.95
var(sifil[,10])
## [1] 441.2104
#Prueba de hipótesis
#Para la proporción
table(msifil[,5])
##
## Blanca Mestiza Negra
## 12 9 9
prop.test(13,20,0.2,conf.level=0.95,alternative="two.side")
## Warning in prop.test(13, 20, 0.2, conf.level = 0.95, alternative =
## "two.side"): Chi-squared approximation may be incorrect
##
## 1-sample proportions test with continuity correction
##
## data: 13 out of 20, null probability 0.2
## X-squared = 22.578, df = 1, p-value = 2.018e-06
## alternative hypothesis: true p is not equal to 0.2
## 95 percent confidence interval:
## 0.4094896 0.8369133
## sample estimates:
## p
## 0.65
#menor
prop.test(13,20,0.2,conf.level=0.95,alternative="less")
## Warning in prop.test(13, 20, 0.2, conf.level = 0.95, alternative = "less"):
## Chi-squared approximation may be incorrect
##
## 1-sample proportions test with continuity correction
##
## data: 13 out of 20, null probability 0.2
## X-squared = 22.578, df = 1, p-value = 1
## alternative hypothesis: true p is less than 0.2
## 95 percent confidence interval:
## 0.000000 0.817166
## sample estimates:
## p
## 0.65
#mayor
prop.test(13,20,0.2,conf.level=0.95,alternative="greater")
## Warning in prop.test(13, 20, 0.2, conf.level = 0.95, alternative =
## "greater"): Chi-squared approximation may be incorrect
##
## 1-sample proportions test with continuity correction
##
## data: 13 out of 20, null probability 0.2
## X-squared = 22.578, df = 1, p-value = 1.009e-06
## alternative hypothesis: true p is greater than 0.2
## 95 percent confidence interval:
## 0.4423272 1.0000000
## sample estimates:
## p
## 0.65
#para la media
t.test(msifil[,8],mu=33,conf.level=0.98,alternative="less")
##
## One Sample t-test
##
## data: msifil[, 8]
## t = 0.70434, df = 29, p-value = 0.7566
## alternative hypothesis: true mean is less than 33
## 98 percent confidence interval:
## -Inf 38.67412
## sample estimates:
## mean of x
## 34.4
t.test(msifil[,10],mu=50,conf.level=0.95,alternative="two.side")
##
## One Sample t-test
##
## data: msifil[, 10]
## t = 4.9807, df = 29, p-value = 2.676e-05
## alternative hypothesis: true mean is not equal to 50
## 95 percent confidence interval:
## 61.66955 77.93045
## sample estimates:
## mean of x
## 69.8
#para la varianza
#install.packages("TeachingDemos")
library(TeachingDemos)
# The hypothesised value can be given either as
#   a standard deviation: sigma = k
#   a variance:           sigmasq = k
# The confidence level needs a decimal point (0.98). The previous
# `conf.level=0,98` was read as conf.level = 0 plus a stray positional 98,
# which is why the output echoed below reports "true variance is greater
# than 98" and a "0 percent confidence interval". Re-run to refresh it.
sigma.test(msifil[, 8], sigma = 10.4, conf.level = 0.98, alternative = "greater")
##
## One sample Chi-squared test for variance
##
## data: msifil[, 8]
## X-squared = 35.073, df = 29, p-value = 0.2021
## alternative hypothesis: true variance is greater than 98
## 0 percent confidence interval:
## Inf Inf
## sample estimates:
## var of msifil[, 8]
## 118.5241
sigma.test(msifil[,10],sigma=21.1,conf.level=0.95,alternative="less")
##
## One sample Chi-squared test for variance
##
## data: msifil[, 10]
## X-squared = 30.882, df = 29, p-value = 0.629
## alternative hypothesis: true variance is less than 445.21
## 95 percent confidence interval:
## 0.0000 776.4014
## sample estimates:
## var of msifil[, 10]
## 474.0966
#Para dos Muestras
table(msifil[,5])
##
## Blanca Mestiza Negra
## 12 9 9
prop.test(c(13,10),c(20,20),NULL,conf.level=0.90,alternative="two.side")
##
## 2-sample test for equality of proportions with continuity
## correction
##
## data: c(13, 10) out of c(20, 20)
## X-squared = 0.40921, df = 1, p-value = 0.5224
## alternative hypothesis: two.sided
## 90 percent confidence interval:
## -0.1541552 0.4541552
## sample estimates:
## prop 1 prop 2
## 0.65 0.50
#para dos varianzas
mN=msifil[msifil[,5]=="Negra" | msifil[,5]=="Mestiza",5]
mM=msifil[msifil[,5]=="Negra" | msifil[,5]=="Mestiza",8]
var.test(mM~mN,conf.level=0.95, alternative="greater")
##
## F test to compare two variances
##
## data: mM by mN
## F = 1.1132, num df = 8, denom df = 8, p-value = 0.4416
## alternative hypothesis: true ratio of variances is greater than 1
## 95 percent confidence interval:
## 0.323796 Inf
## sample estimates:
## ratio of variances
## 1.113243
#para dos muestras varianzas iguales
t.test(mM~mN, mu=0,conf.level=0.95, var.equal=TRUE, alternative="two.sided")
##
## Two Sample t-test
##
## data: mM by mN
## t = 0.46588, df = 16, p-value = 0.6476
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.678669 13.567558
## sample estimates:
## mean in group Mestiza mean in group Negra
## 37.44444 35.00000
#differences
t.test(mM~mN, mu=0,conf.level=0.95, var.equal=FALSE, alternative="two.sided")
##
## Welch Two Sample t-test
##
## data: mM by mN
## t = 0.46588, df = 15.954, p-value = 0.6476
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -8.681266 13.570155
## sample estimates:
## mean in group Mestiza mean in group Negra
## 37.44444 35.00000
#para muestras apareadas
t.test(msifil[,8],msifil[,10], paired=TRUE, conf.level=0.95,alternative="less")
##
## Paired t-test
##
## data: msifil[, 8] and msifil[, 10]
## t = -17.81, df = 29, p-value < 2.2e-16
## alternative hypothesis: true difference in means is less than 0
## 95 percent confidence interval:
## -Inf -32.02271
## sample estimates:
## mean of the differences
## -35.4
library(readxl)
conta=read_excel("C:\\Users\\Alumno\\Desktop\\baseTaller\\contamina.xlsx")
conta=data.frame(conta)
head(conta)
## CONTAMINACION
## 1 23
## 2 25
## 3 24
## 4 24
## 5 21
## 6 21
Prueba de Normalidad.
library(stats)
# Kolmogorov-Smirnov test of conta[,1] against a normal distribution whose
# mean and standard deviation are computed from that same column.
# NOTE(review): the ks.test() help page states that the distribution
# parameters must be pre-specified, not estimated from the data, so this
# p-value should be read as approximate — confirm whether a Lilliefors-type
# correction is wanted here.
ks.test(conta[,1], pnorm, mean(conta[,1]), sd(conta[,1]))
## Warning in ks.test(conta[, 1], pnorm, mean(conta[, 1]), sd(conta[, 1])):
## ties should not be present for the Kolmogorov-Smirnov test
##
## One-sample Kolmogorov-Smirnov test
##
## data: conta[, 1]
## D = 0.16742, p-value = 0.2652
## alternative hypothesis: two-sided
#shapiro
shapiro.test(conta[,1])
##
## Shapiro-Wilk normality test
##
## data: conta[, 1]
## W = 0.94046, p-value = 0.05249
diagrama de qqplot
scale(conta[,1])
## [,1]
## [1,] -0.7220002
## [2,] 0.2972942
## [3,] -0.2123530
## [4,] -0.2123530
## [5,] -1.7412946
## [6,] -1.7412946
## [7,] -0.2123530
## [8,] 1.8262358
## [9,] 0.2972942
## [10,] -0.2123530
## [11,] -0.2123530
## [12,] -0.7220002
## [13,] 1.8262358
## [14,] -0.7220002
## [15,] -0.2123530
## [16,] -1.7412946
## [17,] -1.2316474
## [18,] 1.3165886
## [19,] 0.8069414
## [20,] 0.8069414
## [21,] 0.2972942
## [22,] -0.7220002
## [23,] 1.8262358
## [24,] -0.2123530
## [25,] 0.2972942
## [26,] 1.8262358
## [27,] 0.2972942
## [28,] -0.2123530
## [29,] 0.8069414
## [30,] -0.7220002
## [31,] 0.2972942
## [32,] 1.3165886
## [33,] -1.2316474
## [34,] -0.2123530
## [35,] -0.7220002
## [36,] -0.2123530
## attr(,"scaled:center")
## [1] 24.41667
## attr(,"scaled:scale")
## [1] 1.962142
estgor=data.frame(scale(conta[,1]))
plot(conta[,1],estgor[,1],xlab="Valores Observados",ylab="Normalizados")
qqnorm(conta[,1])
qqline(conta[,1])
Estadísticas Descriptivas
summary(conta[,1])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 23.00 24.00 24.42 25.25 28.00
prueba de hipótesis
# One-sample t-test of H0: mean = 20 against a two-sided alternative.
# The alternative is spelled out in full ("two.sided") rather than relying on
# partial matching of "two.side".
t.test(conta[, 1], mu = 20, conf.level = 0.95, alternative = "two.sided")
##
## One Sample t-test
##
## data: conta[, 1]
## t = 13.506, df = 35, p-value = 1.913e-15
## alternative hypothesis: true mean is not equal to 20
## 95 percent confidence interval:
## 23.75277 25.08056
## sample estimates:
## mean of x
## 24.41667
Prueba de Hipótesis para dos muestras
library(readxl)
tort=read_excel("C:\\Users\\Alumno\\Desktop\\baseTaller\\tortugas.xlsx")
tort=data.frame(tort)
head(tort)
## diametro tortuga
## 1 34.44891 chelomiamydas
## 2 38.81394 chelomiamydas
## 3 37.62226 chelomiamydas
## 4 37.19779 chelomiamydas
## 5 29.72025 chelomiamydas
## 6 30.16471 chelomiamydas
llamada a la función creada en las clases anteriores.
estgru(tort[,1],tort[,2])
## media desv mediana cv muestra
## chelomiamydas 38.06 4.82 37.62 0.13 31
## demochelys 41.81 4.94 42.27 0.12 33
summary(tort)
## diametro tortuga
## Min. :29.72 Length:64
## 1st Qu.:36.96 Class :character
## Median :39.03 Mode :character
## Mean :39.99
## 3rd Qu.:43.83
## Max. :52.56
library(stats)
# Shapiro-Wilk normality test on column 1 of tort (diametro), all rows pooled
shapiro.test(tort[,1])
##
## Shapiro-Wilk normality test
##
## data: tort[, 1]
## W = 0.98117, p-value = 0.4353
# Column 2 (tortuga) is a character vector, so levels() on it returns NULL;
# convert it to a factor to list the group names used in the tests below.
levels(factor(tort[, 2]))
## [1] "chelomiamydas" "demochelys"
shapiro.test(tort[,1][tort[,2]=="chelomiamydas"])
##
## Shapiro-Wilk normality test
##
## data: tort[, 1][tort[, 2] == "chelomiamydas"]
## W = 0.9502, p-value = 0.1581
shapiro.test(tort[,1][tort[,2]=="demochelys" ])
##
## Shapiro-Wilk normality test
##
## data: tort[, 1][tort[, 2] == "demochelys"]
## W = 0.98814, p-value = 0.97
Homocedasticidad. Homogeneidad de varianzas
library(TeachingDemos)
# F test comparing the variance of column 1 (diametro) between the two groups
# in column 2 (tortuga). The alternative is spelled out in full ("two.sided")
# rather than relying on partial matching of "two.side".
var.test(tort[, 1] ~ tort[, 2], conf.level = 0.95, alternative = "two.sided")
##
## F test to compare two variances
##
## data: tort[, 1] by tort[, 2]
## F = 0.95382, num df = 30, denom df = 32, p-value = 0.8993
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.4673696 1.9629641
## sample estimates:
## ratio of variances
## 0.9538249
# The package name must be in plain ASCII quotes; the typographic quotes
# (“car”) used before are a syntax error in R. Install only when the package
# is missing so the script does not reinstall it on every run.
if (!requireNamespace("car", quietly = TRUE)) {
  install.packages("car")
}
library(car)
## Loading required package: carData
bartlett.test(tort[,1]~tort[,2],tort)
##
## Bartlett test of homogeneity of variances
##
## data: tort[, 1] by tort[, 2]
## Bartlett's K-squared = 0.017018, df = 1, p-value = 0.8962
prueba de hipótesis
t.test(tort[,1]~tort[,2], mu=0,conf.level=0.95, var.equal=TRUE, alternative="two.sided")
##
## Two Sample t-test
##
## data: tort[, 1] by tort[, 2]
## t = -3.0773, df = 62, p-value = 0.003108
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -6.196034 -1.316223
## sample estimates:
## mean in group chelomiamydas mean in group demochelys
## 38.05614 41.81227