Aplicar y comprender distintas pruebas de hipótesis (para una y dos
muestras), utilizando variables cuantitativas y cualitativas del
conjunto de datos Credit Score Classification Dataset.csv.
# Numeric vector built from an integer-valued literal, a decimal and a fraction
variable <- c(12, 0.23, 4 / 5)
print(variable)
## [1] 12.00 0.23 0.80
# Character vector (plain strings, not converted to a factor yet)
factor <- c(
  "Manuel",
  "2",
  "red",
  "este espacio es para aprender Rstudio"
)
print(factor)
## [1] "Manuel"
## [2] "2"
## [3] "red"
## [4] "este espacio es para aprender Rstudio"
# Basic descriptive operations on a numeric vector.
# Each call is followed by the output recorded when the document was rendered.
Var1 <- c(20,25,25,12,28,26,27,29,29,29,30,40)
# Number of elements
length(Var1)
## [1] 12
# Sum of all elements
sum(Var1)
## [1] 320
# Product of all elements
prod(Var1)
## [1] 8.629023e+16
# Largest value
max(Var1)
## [1] 40
# Smallest value
min(Var1)
## [1] 12
diff(range(Var1)) # width of the range (max - min)
## [1] 28
# Arithmetic mean
mean(Var1)
## [1] 26.66667
# Median
median(Var1)
## [1] 27.5
# Sample variance (n - 1 denominator)
var(Var1)
## [1] 42.9697
# Sample standard deviation (square root of the variance above)
sd(Var1)
## [1] 6.555128
library(readr)
## Warning: package 'readr' was built under R version 4.5.1
library(BSDA)
## Warning: package 'BSDA' was built under R version 4.5.1
## Cargando paquete requerido: lattice
##
## Adjuntando el paquete: 'BSDA'
## The following object is masked from 'package:datasets':
##
## Orange
# Load the dataset with readr::read_csv; the path is relative, so the CSV must
# be in the current working directory.
Credit <- read_csv("Credit Score Classification Dataset.csv")
## Rows: 164 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): Gender, Education, Marital Status, Home Ownership, Credit Score
## dbl (3): Age, Income, Number of Children
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Strip the class readr attached and rebuild a base data.frame, converting the
# character columns to factors. As the str() output below shows, column names
# containing spaces come out dotted (e.g. "Marital Status" -> Marital.Status).
Credit <- as.data.frame(unclass(Credit), stringsAsFactors=TRUE)
# Inspect the resulting column types
str(Credit)
## 'data.frame': 164 obs. of 8 variables:
## $ Age : num 25 30 35 40 45 50 26 31 36 41 ...
## $ Gender : Factor w/ 2 levels "Female","Male": 1 2 1 2 1 2 1 2 1 2 ...
## $ Income : num 50000 100000 75000 125000 100000 150000 40000 60000 80000 105000 ...
## $ Education : Factor w/ 5 levels "Associate's Degree",..: 2 5 3 4 2 5 1 2 5 3 ...
## $ Marital.Status : Factor w/ 2 levels "Married","Single": 2 1 1 2 1 1 2 2 1 2 ...
## $ Number.of.Children: num 0 2 1 0 3 0 0 0 2 0 ...
## $ Home.Ownership : Factor w/ 2 levels "Owned","Rented": 2 1 1 1 1 1 2 2 1 1 ...
## $ Credit.Score : Factor w/ 3 levels "Average","High",..: 2 2 2 2 2 2 1 1 2 2 ...
# Simple random sample of rows, drawn without replacement
set.seed(1020) # fixed seed so the same rows are drawn on every run
n <- 80 # sample size
N <- nrow(Credit) # population size: rows in the full dataset
# seq_len(N) rather than 1:N: 1:N evaluates to c(1, 0) when N is 0, which would
# sample from bogus indices on an empty dataset. For N >= 1 both forms produce
# the same index vector, so the draw for this seed is unchanged.
Muestra.x1 <- sample(seq_len(N), n, replace = FALSE)
# Keep only the sampled rows; row names retain the original row numbers
data.m <- Credit[Muestra.x1, ]
head(data.m, 10)
## Age Gender Income Education Marital.Status Number.of.Children
## 80 28 Female 32500 Associate's Degree Single 0
## 114 38 Female 67500 Bachelor's Degree Married 2
## 112 28 Female 32500 Associate's Degree Single 0
## 47 52 Male 130000 High School Diploma Married 0
## 33 40 Male 130000 High School Diploma Single 0
## 127 30 Male 117500 Master's Degree Married 2
## 81 33 Male 52500 High School Diploma Single 0
## 35 50 Male 155000 Master's Degree Married 0
## 19 28 Female 30000 Associate's Degree Single 0
## 70 36 Female 90000 Master's Degree Married 2
## Home.Ownership Credit.Score
## 80 Rented Low
## 114 Owned High
## 112 Rented Low
## 47 Owned High
## 33 Owned High
## 127 Owned High
## 81 Rented Average
## 35 Owned High
## 19 Rented Low
## 70 Owned High
Prueba de hipótesis para una media (una muestra): Income
# Sample summaries for Income; the sample standard deviation is passed to
# z.test() below as sigma.x.
sd.income <- sd(data.m$Income)
media.income <- mean(data.m$Income)
# H0: mean = 75000  vs  Ha: mean != 75000 (two-sided, 95% confidence level)
z.test(
  x = data.m$Income,
  mu = 75000,
  sigma.x = sd.income,
  alternative = "two.sided",
  conf.level = 0.95
)
##
## One-sample z-Test
##
## data: data.m$Income
## z = 2.3358, p-value = 0.0195
## alternative hypothesis: true mean is not equal to 75000
## 95 percent confidence interval:
## 76308.21 89954.29
## sample estimates:
## mean of x
## 83131.25
Prueba de hipótesis para una proporción (una muestra): Gender == Female
# Frequency table of Gender in the sample
table(data.m$Gender)
##
## Female Male
## 44 36
# Number of "successes": sampled rows whose Gender is Female
x_female <- sum(data.m$Gender == "Female")
# H0: p = 0.5  vs  Ha: p != 0.5
# The number of trials is read from the sample itself instead of repeating the
# literal 80, so it cannot drift from the actual sample size. The numeric
# results are unchanged; only the "data:" label of the printed test will show
# the expression instead of 80.
prop.test(
  x = x_female,
  n = nrow(data.m),
  p = 0.5,
  alternative = "two.sided",
  conf.level = 0.95,
  correct = TRUE # apply the continuity correction
)
##
## 1-sample proportions test with continuity correction
##
## data: x_female out of 80, null probability 0.5
## X-squared = 0.6125, df = 1, p-value = 0.4338
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.4351113 0.6600504
## sample estimates:
## p
## 0.55
Prueba de hipótesis para la diferencia de medias (dos muestras): Income por Gender
set.seed(1234) # separate fixed seed for the two-sample exercises
# Second random sample: 110 rows drawn without replacement from the full data.
# seq_len(N) rather than 1:N, because 1:N evaluates to c(1, 0) when N is 0; for
# N >= 1 the index vector, and therefore the draw, is identical.
data.m1 <- Credit[sample(seq_len(N), 110), ]
# One data frame per Gender level, accessed below as $Female and $Male
data.Gender <- split(data.m1, data.m1$Gender)
# H0: mean(Female) - mean(Male) = 0  vs  Ha: the difference is not 0
# Each group's sample standard deviation is supplied as its sigma.
z.test(
  x = data.Gender$Female$Income,
  y = data.Gender$Male$Income,
  sigma.x = sd(data.Gender$Female$Income),
  sigma.y = sd(data.Gender$Male$Income),
  mu = 0,
  alternative = "two.sided",
  conf.level = 0.95
)
##
## Two-sample z-Test
##
## data: data.Gender$Female$Income and data.Gender$Male$Income
## z = -5.3184, p-value = 1.047e-07
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -39288.09 -18128.57
## sample estimates:
## mean of x mean of y
## 69591.67 98300.00
Prueba de hipótesis para la diferencia de proporciones (dos muestras): Marital.Status == Married por Gender
# Marital status counts within each Gender group
table(data.Gender$Female$Marital.Status)
##
## Married Single
## 43 17
table(data.Gender$Male$Marital.Status)
##
## Married Single
## 18 32
# Married counts and group sizes are computed from the split sample instead of
# being typed in from the tables above (43 of 60 women, 18 of 50 men), so the
# test stays correct if the seed or the sample size changes. The numeric
# results are unchanged; only the "data:" label of the printed test will show
# the variable names instead of the literals.
married_counts <- c(
  sum(data.Gender$Female$Marital.Status == "Married"),
  sum(data.Gender$Male$Marital.Status == "Married")
)
group_sizes <- c(nrow(data.Gender$Female), nrow(data.Gender$Male))
# H0: p_female = p_male  vs  Ha: p_female != p_male
prop.test(
  x = married_counts,
  n = group_sizes,
  alternative = "two.sided",
  conf.level = 0.95,
  correct = TRUE # apply the continuity correction
)
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(43, 18) out of c(60, 50)
## X-squared = 12.638, df = 1, p-value = 0.000378
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.1631137 0.5502197
## sample estimates:
## prop 1 prop 2
## 0.7166667 0.3600000
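Objetivo: Aplicar lo aprendido para formular y probar hipótesis con otras variables del mismo conjunto de datos.
1. Realice una prueba para la media con una variable cuantitativa distinta de Income (por ejemplo: Age o Loan.Amount).
2. Elija una variable cualitativa (por ejemplo: Occupation o Credit.Score) y realice una prueba para proporción.
3. Repita las pruebas para dos muestras agrupando por otra variable (Occupation, Gender, etc.).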