# Load the student performance data set from the author's local OneDrive folder.
# NOTE(review): absolute, machine-specific path; the script only runs where this file exists.
Students <- read.csv(
  file = "C:/Users/pedro/OneDrive/Escritorio/Especializacion Estadistica 2023/Metodos estadisticos/StudentsPerformance.csv"
)
# Make the column names syntactically valid (special characters become dots).
colnames(Students) <- make.names(colnames(Students))
# Preview the first five rows.
head(Students, 5)

##   gender race.ethnicity parental.level.of.education        lunch
## 1 female        group B           bachelor's degree     standard
## 2 female        group C                some college     standard
## 3 female        group B             master's degree     standard
## 4   male        group A          associate's degree free/reduced
## 5   male        group C                some college     standard
##   test.preparation.course math.score reading.score writing.score
## 1                    none         72            72            74
## 2               completed         69            90            88
## 3                    none         90            95            93
## 4                    none         47            57            44
## 5                    none         76            78            75

# Draw a sample of 100 math scores (without replacement) to build a confidence interval.
# The seed makes the draw reproducible.
set.seed(12345)
mdf <- data.frame(mdf = sample(Students$math.score, size = 100))
media <- mean(mdf[["mdf"]]) # sample mean
desv <- sd(mdf[["mdf"]])    # sample standard deviation
head(mdf, 5)

##   mdf
## 1  59
## 2  53
## 3  91
## 4  53
## 5  61

1. Test para una muestra

1.1. No se le esta dando un valor contra el cual comparar

library(lattice)
library(BSDA)
# One-sample z-test with no reference value supplied: the printed result tests
# against a mean of 0, so only the 95% confidence interval is of interest here.
test1 <- z.test(
  x = mdf$mdf,           # numeric sample (required)
  conf.level = 0.95,     # confidence level of the interval
  sigma.x = sd(mdf$mdf)  # standard deviation passed as sigma.x; the sample SD is plugged in
)
test1

## 
##  One-sample z-Test
## 
## data:  mdf$mdf
## z = 42.831, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
##  63.78139 69.89861
## sample estimates:
## mean of x 
##     66.84

# Por defecto se contrasta Ha: true mean is not equal to 0, por eso el estadistico z es tan alto (42.8)
# Es decir, se esta comparando la media muestral contra una media de cero, lo cual no tiene sentido aqui
# Esta prueba se realizo unicamente para obtener el intervalo de confianza del 95% para la media
# La media va a estar entre 63.78139 y 69.89861 (estos son los valores que importan en este caso)

1.2. Dandole un valor puntual contra el cual comparar

# One-sample z-test of H0: mean == 70 against a two-sided alternative.
test2 <- z.test(
  x = mdf$mdf,                # numeric sample (required)
  mu = 70,                    # hypothesised mean under H0
  alternative = "two.sided",  # point hypothesis; "greater" or "less" give one-sided tests
  sigma.x = sd(mdf$mdf),      # sample SD passed as sigma.x
  conf.level = 0.95
)
test2 # z = -2.02 lies in the rejection region (|z| > 1.96 at the 5% level)

## 
##  One-sample z-Test
## 
## data:  mdf$mdf
## z = -2.0249, p-value = 0.04287
## alternative hypothesis: true mean is not equal to 70
## 95 percent confidence interval:
##  63.78139 69.89861
## sample estimates:
## mean of x 
##     66.84

# Existen evidencias suficientes para rechazar Ho
# 70 no esta contenido dentro del intervalo

1.3. Dandole un valor por encima del cual comparar

# One-sample, one-sided z-test: H0 mean <= 70 against Ha mean > 70.
test3 <- z.test(
  x = mdf$mdf,              # numeric sample (required)
  mu = 70,                  # reference value under H0
  alternative = "greater",  # one-tailed test towards larger values
  sigma.x = sd(mdf$mdf),    # sample SD passed as sigma.x
  conf.level = 0.95
)
test3 # fail to reject the null hypothesis (p-value = 0.9786)

## 
##  One-sample z-Test
## 
## data:  mdf$mdf
## z = -2.0249, p-value = 0.9786
## alternative hypothesis: true mean is greater than 70
## 95 percent confidence interval:
##  64.27313       NA
## sample estimates:
## mean of x 
##     66.84

1.4. Comparacion de una muestra usando prueba t ()

Se usa con menos de 30 datos, aunque luego de un numero de datos alto, z y t sirven

# Draw 25 row indices with replacement; the seed makes the draw reproducible.
set.seed(1234)
# seq_len() instead of 1:nrow(): on an empty data frame 1:0 would yield c(1, 0)
# and silently sample invalid rows.
dat.t <- sample(seq_len(nrow(Students)), 25, replace = TRUE)
dat.t # the selected row indices

##  [1] 284 848 918 101 623 905 645 934 400 900  98 103 726 602 326  79 974 884 270
## [20] 382 184 574   4 900 661

# Extract the sampled rows (repeated indices yield repeated rows).
dat.mt <- Students[dat.t, ]

# One-sample t-test of H0: mean math score == 60 (two-sided, 95% confidence).
# t.test() estimates the standard deviation from x itself, so no SD argument is
# passed: the former `sd = ...` is not a t.test() parameter and was silently
# ignored.
t.test(x = dat.mt$math.score,
       alternative = "two.sided",
       mu = 60,
       conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  dat.mt$math.score
## t = 1.6123, df = 24, p-value = 0.12
## alternative hypothesis: true mean is not equal to 60
## 95 percent confidence interval:
##  58.75637 70.12363
## sample estimates:
## mean of x 
##     64.44

1.5. Test para una proporcion

# Sample 100 ethnicity labels (without replacement) for the one-proportion test.
set.seed(12345)
pdfa <- data.frame(pdfa = sample(Students$race.ethnicity, size = 100))
head(pdfa)

##      pdfa
## 1 group C
## 2 group E
## 3 group E
## 4 group C
## 5 group B
## 6 group C

# Frequency table of the ethnicity groups in the sample, then print it.
t.race <- table(pdfa)
t.race

## pdfa
## group A group B group C group D group E 
##      10      16      33      24      17

# Count for group C, selected by label so it does not depend on the level order.
t.race["group C"]

## group C 
##      33

# 95% confidence interval for the proportion of group C in the sample.
# No `p` is supplied, so the reported test is against the default null of 0.5.
prop.test(
  x = t.race[3],     # count of the group under study (third cell of the table, "group C")
  n = nrow(pdfa),    # sample size
  conf.level = 0.95  # confidence level
)

## 
##  1-sample proportions test with continuity correction
## 
## data:  t.race[3] out of nrow(pdfa), null probability 0.5
## X-squared = 10.89, df = 1, p-value = 0.0009668
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.2411558 0.4320901
## sample estimates:
##    p 
## 0.33

# Intervalo entre 0.2411558 y 0.4320901

2. Tests para dos muestras

library(BSDA)
# Split the data by gender for the two-sample comparisons.
dat <- split(Students, Students$gender)
dat.F <- dat$female
dat.M <- dat$male

# Math score by gender, with the overall mean score as a red reference line.
boxplot(Students$math.score~Students$gender)
# Scores are on the y axis, so the mean has to be a horizontal line (h =);
# with v = the line was drawn at x ~ 66, far outside the two group positions.
abline(h = mean(Students$math.score), col = "red")

2.1. Las medias en dos grupos son iguales (two.side)

ho: u1==u2 -> ho: u1-u2==0

ha: u1!=u2 -> ha: u1-u2!=0

2.1. Ejemplo: La media de los dos grupos es igual?

# H0: u1 - u2 == 0  vs  Ha: u1 - u2 != 0  (x = female, y = male math scores)
z.test(
  x = dat.F$math.score,
  y = dat.M$math.score,
  alternative = "two.sided",
  mu = 0,                          # hypothesised difference in means
  sigma.x = sd(dat.F$math.score),  # sample SDs passed as sigma.x / sigma.y
  sigma.y = sd(dat.M$math.score),
  conf.level = 0.95
)

## 
##  Two-sample z-Test
## 
## data:  dat.F$math.score and dat.M$math.score
## z = -5.398, p-value = 6.739e-08
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.944962 -3.245060
## sample estimates:
## mean of x mean of y 
##  63.63320  68.72822

# como -6.94 y -3.24 no cruzan el cero, se rechaza que la diferencia de medias es igual a cero
# Los dos grupos son estadisticamente diferentes

2.1. Ejemplo. Los dos grupos presentan una diferencia de 5 entre las notas de hombres y mujeres?

# H0: u1 - u2 == -5  vs  Ha: u1 - u2 != -5  (x = female, y = male math scores)
z.test(
  x = dat.F$math.score,
  y = dat.M$math.score,
  alternative = "two.sided",
  mu = -5,                         # hypothesised difference in means
  sigma.x = sd(dat.F$math.score),  # sample SDs passed as sigma.x / sigma.y
  sigma.y = sd(dat.M$math.score),
  conf.level = 0.95
)

## 
##  Two-sample z-Test
## 
## data:  dat.F$math.score and dat.M$math.score
## z = -0.10066, p-value = 0.9198
## alternative hypothesis: true difference in means is not equal to -5
## 95 percent confidence interval:
##  -6.944962 -3.245060
## sample estimates:
## mean of x mean of y 
##  63.63320  68.72822

# como el intervalo (-6.94, -3.24) si contiene a -5, no se rechaza que la diferencia de medias sea -5
# Este es un caso particular, pues generalmente se establece un mu de 0 para verificar si las medias son iguales o diferentes

2.2. Greater

ho: u1-u2<=0

ha: u1-u2>0

2.3. Less

ho: u1-u2>=0

ha: u1-u2<0

2.4. Comparamos dos proporciones

# Counts of each gender in the full data set.
table(Students[["gender"]])

## 
## female   male 
##    518    482

# Total number of observations.
length(Students[["gender"]])

## [1] 1000

# Two-sample test for equality of proportions: 518 females vs 482 males, each out of 1000.
# NOTE(review): both counts come from the same 1000 students, so the two "samples" are not
# independent; a one-sample prop.test(518, 1000, p = 0.5) may be the intended test - confirm.
prop.test(
  x = c(518, 482),
  n = c(1000, 1000),
  conf.level = 0.95,
  alternative = "two.sided"
)

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  c(518, 482) out of c(1000, 1000)
## X-squared = 2.45, df = 1, p-value = 0.1175
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.008797718  0.080797718
## sample estimates:
## prop 1 prop 2 
##  0.518  0.482

# No existe diferencias entre las proporciones
# En el intervalo de confianza, esta contenido el cero

2.2. Comparacion de dos grupos con la prueba t

ho: var1/var2==1

Primero revisamos que cumplan el supuesto de igualdad de varianzas

# F test of H0: var(math) / var(reading) == 1 on the 25 sampled rows.
# The F reference distribution is indexed by its numerator and denominator
# degrees of freedom.
var.test(
  x = dat.mt$math.score,
  y = dat.mt$reading.score,
  ratio = 1,
  conf.level = 0.95,
  alternative = "two.sided"
)

## 
##  F test to compare two variances
## 
## data:  dat.mt$math.score and dat.mt$reading.score
## F = 0.81425, num df = 24, denom df = 24, p-value = 0.6187
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3588147 1.8477593
## sample estimates:
## ratio of variances 
##          0.8142501

# No existe evidencia suficiente para rechazar que las varianzas poblacionales son iguales (p-value = 0.6187)
# El intervalo de confianza del ratio de varianzas contiene a 1

Segundo. comparamos los grupos usando la prueba de t

# Two-sample t-test of H0: mean(math) - mean(reading) == 0 (two-sided, 95%).
# The former sigma.x / sigma.y arguments were dropped: t.test() has no such
# parameters (they were silently ignored), and sigma.y was also computed from
# math.score instead of reading.score.
# NOTE(review): both columns come from the same rows of dat.mt, so the
# observations are paired; an independent two-sample test ignores that.
# Consider paired = TRUE - confirm the intended design.
t.test(x = dat.mt$math.score, y = dat.mt$reading.score,
       alternative = "two.sided",
       mu = 0,
       var.equal = TRUE, # equal variances assumed, per the preceding F test
       conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  dat.mt$math.score and dat.mt$reading.score
## t = -1.2164, df = 48, p-value = 0.2298
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -13.264956   3.264956
## sample estimates:
## mean of x mean of y 
##     64.44     69.44

# se acepta la hipotesis nula, el cero esta contenido entre el IC: -13.2 a 3.2
# ademas el pvalor es mayor a 0.05

2.2. Ejemplo. Comparacion de dos grupos con la prueba t

# Draw 50 rows with replacement and split the sample by gender.
# NOTE(review): no set.seed() precedes this draw, so the result depends on the
# RNG state left by earlier calls.
# TRUE instead of T (T can be reassigned); seq_len() instead of 1:nrow(), which
# would yield c(1, 0) on an empty data frame.
dat.t2 <- sample(seq_len(nrow(Students)), 50, replace = TRUE)
dat.mt2 <- Students[dat.t2, ]
dat.m <- split(dat.mt2, dat.mt2$gender)

## Variance test: H0 var(female) / var(male) == 1 for the math scores
var.test(
  x = dat.m$female$math.score,
  y = dat.m$male$math.score,
  ratio = 1,
  conf.level = 0.95,
  alternative = "two.sided"
)

## 
##  F test to compare two variances
## 
## data:  dat.m$female$math.score and dat.m$male$math.score
## F = 1.2767, num df = 30, denom df = 18, p-value = 0.5951
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.5222808 2.8514115
## sample estimates:
## ratio of variances 
##           1.276718

# Las varianzas son iguales

## Comparison test: H0 mean(female) - mean(male) == 0 for the math scores
# The former sigma.x / sigma.y arguments were dropped: t.test() has no such
# parameters (they were silently ignored), and they were being given the whole
# score vectors rather than standard deviations.
t.test(x = dat.m$female$math.score, y = dat.m$male$math.score,
       alternative = "two.sided",
       mu = 0,
       var.equal = TRUE, # equal variances assumed, per the preceding variance test
       conf.level = 0.95)

## 
##  Two Sample t-test
## 
## data:  dat.m$female$math.score and dat.m$male$math.score
## t = 0.090585, df = 48, p-value = 0.9282
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -9.140568 10.003047
## sample estimates:
## mean of x mean of y 
##  65.48387  65.05263

# No se rechaza la hipotesis nula de que las medias son iguales (p valor mayor a 0.05); el IC si contiene al cero
# No hay evidencia de que hombres y mujeres tengan puntajes distintos en la prueba de matematicas

3. test para mas de dos muestras (PRELIMINAR)

# Linear model of math score on ethnicity group (one coefficient per group
# relative to the baseline level), followed by its summary.
ml <- lm(Students$math.score ~ Students$race.ethnicity)
summary(ml)

## 
## Call:
## lm(formula = Students$math.score ~ Students$race.ethnicity)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -64.464  -9.453   0.536  10.179  38.371 
## 
## Coefficients:
##                                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                      61.629      1.565  39.373  < 2e-16 ***
## Students$race.ethnicitygroup B    1.823      1.897   0.961   0.3366    
## Students$race.ethnicitygroup C    2.835      1.770   1.601   0.1096    
## Students$race.ethnicitygroup D    5.733      1.812   3.165   0.0016 ** 
## Students$race.ethnicitygroup E   12.192      2.002   6.090 1.61e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 14.77 on 995 degrees of freedom
## Multiple R-squared:  0.05542,    Adjusted R-squared:  0.05162 
## F-statistic: 14.59 on 4 and 995 DF,  p-value: 1.373e-11

# ANOVA table for the fitted model.
summary(aov(ml))

##                          Df Sum Sq Mean Sq F value   Pr(>F)    
## Students$race.ethnicity   4  12729    3182   14.59 1.37e-11 ***
## Residuals               995 216960     218                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Intervalos de confianza

Pedro Lizarazo

2023-04-22

1. Test para una muestra

1.1. No se le esta dando un valor contra el cual comparar

1.2. Dandole un valor puntual contra el cual comparar

1.3. Dandole un valor por encima del cual comparar

1.4. Comparacion de una muestra usando prueba t ()

Se usa con menos de 30 datos, aunque luego de un numero de datos alto, z y t sirven

1.5. Test para una proporcion

2. Tests para dos muestras

2.1. Las medias en dos grupos son iguales (two.side)

ho: u1==u2 -> ho: u1-u2==0

ha: u1!=u2 -> ha: u1-u2!=0

2.1. Ejemplo: La media de los dos grupos es igual?

2.1. Ejemplo. Los dos grupos presentan una diferencia de 5 entre las notas de hombres y mujeres?

2.2. Greater

ho: u1-u2<=0

ha: u1-u2>0

2.3. Less

ho: u1-u2>=0

ha: u1-u2<0

2.4. Comparamos dos proporciones

2.2. Comparacion de dos grupos con la prueba t

ho: var1/var2==1

ho: var1/var2==1

Primero revisamos que cumplan el supuesto de igualdad de varianzas

Segundo. comparamos los grupos usando la prueba de t

2.2. Ejemplo. Comparacion de dos grupos con la prueba t

3. test para mas de dos muestras (PRELIMINAR)