SP <- read.csv("C:/Documentos Pedro/Especializacion Estadistica 2023/Metodos estadisticos/StudentsPerformance.csv")
head(SP, n = 5)
## gender race.ethnicity parental.level.of.education lunch
## 1 female group B bachelor's degree standard
## 2 female group C some college standard
## 3 female group B master's degree standard
## 4 male group A associate's degree free/reduced
## 5 male group C some college standard
## test.preparation.course math.score reading.score writing.score
## 1 none 72 72 74
## 2 completed 69 90 88
## 3 none 90 95 93
## 4 none 47 57 44
## 5 none 76 78 75
La presente es la actividad correspondiente a los temas de muestreo sistemático y estratificado.
# Margen de error o de precisión
MargenError <- seq(0, 0.1, by = 0.001) # Nivel de precisión hasta 10% (0.1)
# Cuál es el nivel de confianza de la muestra???
NC <- 0.95 # Nivel de confianza
# El valor del Z para el nivel de confianza
z <- qnorm((1 - NC) / 2, mean = 0, sd = 1) # Valor de Z
# Qué proporción de éxito es necesaria para la toma de la muestra??
P <- 0.5 # Probabilidad de éxito (es en la relacion p*q el valor mas alto posible, es decir la varianza mas alta)
# Cálculo de la muestra para población infinita
n0 <- ((z^2) * P * (1 - P)) / MargenError^2 # Cuando
# Cálculo de la muestra ajustada para población finita
N <- nrow(SP)
n <- n0 / (1 + (n0 / N))
# Creación del dataframe para el gráfico
data <- data.frame(MargenError, n=round(n, digits = 0) )
head(data, n = 10)
## MargenError n
## 1 0.000 NaN
## 2 0.001 999
## 3 0.002 996
## 4 0.003 991
## 5 0.004 984
## 6 0.005 975
## 7 0.006 964
## 8 0.007 951
## 9 0.008 938
## 10 0.009 922
# Gráfico utilizando ggplot2
library(ggplot2)
ggplot(data, aes(x = n, y = MargenError)) +
geom_line() +
geom_point() +
labs(x = "Tamaño de muestra (ajustado para población finita)",
y = "Margen de error") +
ggtitle("Relación entre el tamaño de muestra y el margen de error") +
theme_classic()+
geom_vline(xintercept = 248, colour="red", size=0.5)+
geom_hline(yintercept = 0.054, colour="blue", size=0.5)
### Seleccionamos una muestra. Mediante Muestreo aleatorio simple
n <- 248 # numero de muestras seleccionado
ID_MUESTRA <- sample(nrow(SP), size = n, replace = TRUE) # Estraemos los ID de las observaciones
muestra_MAS <- SP[ID_MUESTRA, ] # Extraemos los valores del dataframe a partir de los ID seleccionados
head(muestra_MAS, n = 10)
## gender race.ethnicity parental.level.of.education lunch
## 647 female group D associate's degree standard
## 587 female group A high school standard
## 155 male group D some high school standard
## 33 female group E master's degree free/reduced
## 489 male group B some high school standard
## 432 female group C high school standard
## 99 female group D some college free/reduced
## 37 female group D associate's degree standard
## 133 male group E some college free/reduced
## 741 male group D bachelor's degree standard
## test.preparation.course math.score reading.score writing.score
## 647 none 59 70 65
## 587 none 55 73 73
## 155 none 62 67 61
## 33 none 56 72 65
## 489 completed 64 53 57
## 432 none 61 72 70
## 99 none 58 67 62
## 37 none 74 81 83
## 133 completed 87 74 70
## 741 none 80 73 72
Seleccione una base de datos de su interés y realice un análisis de muestreo (Prueba piloto) para seleccionar la muestra adecuada para un diseño sistemático y realice los cálculos de interés.
Seleccione una base de su interés donde contenga una variable cualitativa que se pueda estratificar y una variable cuantitativa que este asociada. Realice un análisis de muestreo (Prueba piloto) para seleccionar la muestra adecuada para un diseño estratificado y realice los cálculos de interés.
estratos <- levels(as.factor(SP$race.ethnicity) )
estrato1 <- subset(x=SP, race.ethnicity=="group A"); head(estrato1, n=10)
## gender race.ethnicity parental.level.of.education lunch
## 4 male group A associate's degree free/reduced
## 14 male group A some college standard
## 15 female group A master's degree standard
## 26 male group A master's degree free/reduced
## 47 female group A associate's degree standard
## 62 male group A some high school free/reduced
## 63 male group A associate's degree free/reduced
## 73 female group A associate's degree free/reduced
## 78 male group A bachelor's degree standard
## 83 male group A some college free/reduced
## test.preparation.course math.score reading.score writing.score
## 4 none 47 57 44
## 14 completed 78 72 70
## 15 none 50 53 58
## 26 none 73 74 72
## 47 completed 55 65 62
## 62 none 39 39 34
## 63 none 62 61 55
## 73 none 41 51 48
## 78 completed 80 78 81
## 83 completed 50 47 54
estrato2 <- subset(x=SP, race.ethnicity=="group B"); head(estrato2, n=10)
## gender race.ethnicity parental.level.of.education lunch
## 1 female group B bachelor's degree standard
## 3 female group B master's degree standard
## 6 female group B associate's degree standard
## 7 female group B some college standard
## 8 male group B some college free/reduced
## 10 female group B high school free/reduced
## 13 female group B high school standard
## 18 female group B some high school free/reduced
## 22 female group B some college free/reduced
## 27 male group B some college standard
## test.preparation.course math.score reading.score writing.score
## 1 none 72 72 74
## 3 none 90 95 93
## 6 none 71 83 78
## 7 completed 88 95 92
## 8 none 40 43 39
## 10 none 38 60 50
## 13 none 65 81 73
## 18 none 18 32 28
## 22 completed 65 75 70
## 27 none 69 54 55
estrato3 <- subset(x=SP, race.ethnicity=="group C"); head(estrato3, n=10)
## gender race.ethnicity parental.level.of.education lunch
## 2 female group C some college standard
## 5 male group C some college standard
## 11 male group C associate's degree standard
## 16 female group C some high school standard
## 17 male group C high school standard
## 19 male group C master's degree free/reduced
## 20 female group C associate's degree free/reduced
## 24 female group C some high school standard
## 28 female group C bachelor's degree standard
## 29 male group C high school standard
## test.preparation.course math.score reading.score writing.score
## 2 completed 69 90 88
## 5 none 76 78 75
## 11 none 58 54 52
## 16 none 69 75 78
## 17 none 88 89 86
## 19 completed 46 42 46
## 20 none 54 58 61
## 24 none 69 73 73
## 28 none 67 69 75
## 29 none 70 70 65
estrato4 <- subset(x=SP, race.ethnicity=="group D"); head(estrato4, n=10)
## gender race.ethnicity parental.level.of.education lunch
## 9 male group D high school free/reduced
## 12 male group D associate's degree standard
## 21 male group D high school standard
## 23 male group D some college standard
## 25 male group D bachelor's degree free/reduced
## 30 female group D master's degree standard
## 31 female group D some college standard
## 34 male group D some college standard
## 37 female group D associate's degree standard
## 38 female group D some high school free/reduced
## test.preparation.course math.score reading.score writing.score
## 9 completed 64 64 67
## 12 none 40 52 43
## 21 none 66 69 63
## 23 none 44 54 53
## 25 completed 74 71 80
## 30 none 62 70 75
## 31 none 69 74 74
## 34 none 40 42 38
## 37 none 74 81 83
## 38 none 50 64 59
estrato5 <- subset(x=SP, race.ethnicity=="group E"); head(estrato5, n=10)
## gender race.ethnicity parental.level.of.education lunch
## 33 female group E master's degree free/reduced
## 35 male group E some college standard
## 36 male group E associate's degree standard
## 45 female group E associate's degree free/reduced
## 51 male group E some college standard
## 52 male group E associate's degree free/reduced
## 57 female group E associate's degree standard
## 61 male group E bachelor's degree free/reduced
## 77 male group E some high school standard
## 80 female group E master's degree standard
## test.preparation.course math.score reading.score writing.score
## 33 none 56 72 65
## 35 none 97 87 82
## 36 completed 81 81 79
## 45 none 50 56 54
## 51 none 53 55 48
## 52 completed 77 69 68
## 57 completed 82 85 86
## 61 completed 79 74 72
## 77 none 30 26 22
## 80 none 62 68 68
n_muestra <- nrow(SP)*0.1 #Se toma eñ 10% del total de la poblacion
n1 <- round(n_muestra * nrow(estrato1)/ nrow(SP), digits = 0); n1
## [1] 9
n2 <- round(n_muestra * nrow(estrato2)/ nrow(SP), digits = 0); n2
## [1] 19
n3 <- round(n_muestra * nrow(estrato3)/ nrow(SP), digits = 0); n3
## [1] 32
n4 <- round(n_muestra * nrow(estrato4)/ nrow(SP), digits = 0); n4
## [1] 26
n5 <- round(n_muestra * nrow(estrato5)/ nrow(SP), digits = 0); n5
## [1] 14
library(sampling)
sum(n1, n2, n3, n4, n5)
## [1] 100
estratos <- strata(SP,
stratanames = "race.ethnicity", # variable cualitativa que estratifica
size = c(n1, n2, n3, n4, n5), # vector con los tamanos estimados por estrato
method = "srswor") # simple random sampling without replacement
muestreado <- getdata(SP, estratos)
View(muestreado)
table(muestreado$race.ethnicity)
##
## group A group B group C group D group E
## 32 9 19 26 14
# Para una muestra total de 100 datos, estas son las muestras a tomar por cada estrato.
#Se extrae la media por grupo para la variable math.score
me1 <- mean(muestreado[muestreado$race.ethnicity == "group A","math.score"])
me2 <- mean(muestreado[muestreado$race.ethnicity == "group B","math.score"])
me3 <- mean(muestreado[muestreado$race.ethnicity == "group C","math.score"])
me4 <- mean(muestreado[muestreado$race.ethnicity == "group D","math.score"])
me5 <- mean(muestreado[muestreado$race.ethnicity == "group E","math.score"])
# Calculo de la media estimada total
media.est <- (1/nrow(SP))*(sum(SP$race.ethnicity == "group A")*me1+
sum(SP$race.ethnicity == "group B")*me2+
sum(SP$race.ethnicity == "group C")*me3+
sum(SP$race.ethnicity == "group D")*me4+
sum(SP$race.ethnicity == "group E")*me5)
cbind(me1, me2, me3, me4, me5, media.est)
## me1 me2 me3 me4 me5 media.est
## [1,] 60.9375 62.33333 59.21053 66.38462 73.92857 63.8977
#A partir de la muestra estratificada piloto (media.est) calculamos el tamaño de la muestra
E <- 0.1*media.est # Error admitido, en este caso seleccionamos 10%
MargenError2 <- seq(0, E, by= E/10) # diferentes niveles de error que vamos a evaluar
MargenError2
## [1] 0.000000 0.638977 1.277954 1.916931 2.555908 3.194885 3.833862 4.472839
## [9] 5.111816 5.750793 6.389770
NC2 <- 0.95 # Nivel de confianza
Z2 <- abs(qnorm((1-NC2)/2,mean = 0, sd=1)) # estimacion del valor en la distribucion Z
Nt <- nrow(SP) # Numero total de observaciones
#Calcula la varianza estimada de cada estratos
Var_st <-
((
sum(SP$race.ethnicity == "group A") / Nt) * var(muestreado[muestreado$race.ethnicity == "group A", "math.score"])
+ (sum(SP$race.ethnicity == "group B") / Nt) * var(muestreado[muestreado$race.ethnicity == "group B", "math.score"])
+ (sum(SP$race.ethnicity == "group C") / Nt) * var(muestreado[muestreado$race.ethnicity == "group C", "math.score"])
+ (sum(SP$race.ethnicity == "group D") / Nt) * var(muestreado[muestreado$race.ethnicity == "group D", "math.score"])
+ (sum(SP$race.ethnicity == "group E") / Nt) * var(muestreado[muestreado$race.ethnicity == "group E", "math.score"]) )
n0 <- Var_st * (1 / (MargenError2 / Z2)^2)
n <- n0 / (1 + n0 / Nt)
data <- data.frame(
MargenError2=round(MargenError2, digits = 3),
num.datos=round(n, digits = 0)
)
head(data, n = 11)
## MargenError2 num.datos
## 1 0.000 NaN
## 2 0.639 636
## 3 1.278 304
## 4 1.917 163
## 5 2.556 99
## 6 3.195 65
## 7 3.834 46
## 8 4.473 34
## 9 5.112 27
## 10 5.751 21
## 11 6.390 17
# Grafico
library(ggplot2)
ggplot(data, aes(x = MargenError2, y = num.datos) ) +
geom_line() +
geom_point() +
labs(x = "Margen de Error", y = "Tamaño de muestra") +
ggtitle("Relación entre Margen de Error y Tamaño de muestra") +theme_minimal()+
geom_vline(xintercept = 1.327, colour="red", size=0.5)+
geom_hline(yintercept = 396, colour="blue", size=0.5)
n2 <- 396 # numero de muestras seleccionado
ID_MUESTRA_ESTRA <- sample(nrow(SP), size = n2, replace = TRUE)
muestra_ESTRA <- SP[ID_MUESTRA_ESTRA, ]
head(muestra_ESTRA, n=10)
## gender race.ethnicity parental.level.of.education lunch
## 647 female group D associate's degree standard
## 267 female group C bachelor's degree standard
## 97 male group B some high school standard
## 896 female group E some high school free/reduced
## 125 male group E some college standard
## 912 female group A some college standard
## 834 female group B high school standard
## 527 male group C some high school free/reduced
## 952 female group D some college standard
## 281 male group D high school standard
## test.preparation.course math.score reading.score writing.score
## 647 none 59 70 65
## 267 none 63 75 81
## 97 completed 65 66 62
## 896 none 32 34 38
## 125 none 83 80 73
## 912 none 69 84 82
## 834 completed 77 82 89
## 527 completed 56 61 60
## 952 completed 75 77 83
## 281 none 53 52 42
# Resumen muestra aleatoria simple
summary(muestra_MAS)
## gender race.ethnicity parental.level.of.education
## Length:248 Length:248 Length:248
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## lunch test.preparation.course math.score reading.score
## Length:248 Length:248 Min. : 27.00 Min. : 24.00
## Class :character Class :character 1st Qu.: 58.75 1st Qu.: 62.00
## Mode :character Mode :character Median : 67.50 Median : 72.00
## Mean : 67.88 Mean : 70.85
## 3rd Qu.: 79.00 3rd Qu.: 82.00
## Max. :100.00 Max. :100.00
## writing.score
## Min. : 15.00
## 1st Qu.: 60.00
## Median : 70.00
## Mean : 69.31
## 3rd Qu.: 78.25
## Max. :100.00
# Resumen muestra estratificada
summary(muestra_ESTRA)
## gender race.ethnicity parental.level.of.education
## Length:396 Length:396 Length:396
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## lunch test.preparation.course math.score reading.score
## Length:396 Length:396 Min. : 19.00 Min. : 23.00
## Class :character Class :character 1st Qu.: 57.00 1st Qu.: 60.00
## Mode :character Mode :character Median : 65.50 Median : 70.50
## Mean : 66.38 Mean : 69.46
## 3rd Qu.: 78.00 3rd Qu.: 79.25
## Max. :100.00 Max. :100.00
## writing.score
## Min. : 19.00
## 1st Qu.: 59.00
## Median : 69.00
## Mean : 68.16
## 3rd Qu.: 80.00
## Max. :100.00
data.frame(Media_muestral=mean(muestra_ESTRA$math.score),
Media.poblacional=mean(SP$math.score) )
## Media_muestral Media.poblacional
## 1 66.37626 66.089
# Resumen base data original
summary(SP)
## gender race.ethnicity parental.level.of.education
## Length:1000 Length:1000 Length:1000
## Class :character Class :character Class :character
## Mode :character Mode :character Mode :character
##
##
##
## lunch test.preparation.course math.score reading.score
## Length:1000 Length:1000 Min. : 0.00 Min. : 17.00
## Class :character Class :character 1st Qu.: 57.00 1st Qu.: 59.00
## Mode :character Mode :character Median : 66.00 Median : 70.00
## Mean : 66.09 Mean : 69.17
## 3rd Qu.: 77.00 3rd Qu.: 79.00
## Max. :100.00 Max. :100.00
## writing.score
## Min. : 10.00
## 1st Qu.: 57.75
## Median : 69.00
## Mean : 68.05
## 3rd Qu.: 79.00
## Max. :100.00