Taller Estadística II - Análisis en R

Carga de Datos

# Load the online-sales data set and convert every text column to a factor.
# across(where(...)) replaces the superseded scoped verb mutate_if().
datos <- read.csv("datos_negocios_adicional_1_ventas_online.csv") %>%
  mutate(across(where(is.character), as.factor))
# Inspect the resulting column types.
str(datos)

## 'data.frame':    100 obs. of  4 variables:
##  $ Plataforma        : Factor w/ 4 levels "Amazon","eBay",..: 4 3 4 1 3 1 4 3 3 3 ...
##  $ Ingresos          : num  12396 75450 42504 7550 11774 ...
##  $ Pedidos_Realizados: int  787 730 998 419 590 935 204 248 217 803 ...
##  $ Devoluciones      : int  55 4 148 12 14 9 30 42 39 88 ...

# Per-column overview of the whole data set.
datos %>% summary()

##         Plataforma    Ingresos     Pedidos_Realizados  Devoluciones   
##  Amazon      :29   Min.   : 5281   Min.   : 12.0      Min.   :  0.00  
##  eBay        :16   1st Qu.:22764   1st Qu.:251.0      1st Qu.: 12.00  
##  MercadoLibre:29   Median :48572   Median :479.5      Median : 32.00  
##  Shopify     :26   Mean   :48942   Mean   :490.6      Mean   : 41.63  
##                    3rd Qu.:69834   3rd Qu.:709.5      3rd Qu.: 64.25  
##                    Max.   :98912   Max.   :998.0      Max.   :148.00

Análisis Descriptivo

# Summary of the numeric variables.
# select(where(...)) replaces the superseded scoped verb select_if().
datos %>%
  select(where(is.numeric)) %>%
  summary()

##     Ingresos     Pedidos_Realizados  Devoluciones   
##  Min.   : 5281   Min.   : 12.0      Min.   :  0.00  
##  1st Qu.:22764   1st Qu.:251.0      1st Qu.: 12.00  
##  Median :48572   Median :479.5      Median : 32.00  
##  Mean   :48942   Mean   :490.6      Mean   : 41.63  
##  3rd Qu.:69834   3rd Qu.:709.5      3rd Qu.: 64.25  
##  Max.   :98912   Max.   :998.0      Max.   :148.00

# Frequency table for each categorical variable.
# select(where(...)) replaces the superseded scoped verb select_if().
datos %>%
  select(where(is.factor)) %>%
  map(table)

## $Plataforma
## 
##       Amazon         eBay MercadoLibre      Shopify 
##           29           16           29           26

# Histograms: one facet per numeric variable.
# select(where(...)) replaces the superseded scoped verb select_if().
datos %>%
  select(where(is.numeric)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "valor"
  ) %>%
  ggplot(aes(x = valor)) +
  geom_histogram(bins = 30, fill = "steelblue", color = "white") +
  # Free scales: each variable gets its own axis range.
  facet_wrap(~variable, scales = "free")

# Bar charts of the categorical variables, one facet per variable.
# select(where(...)) replaces the superseded scoped verb select_if().
datos %>%
  select(where(is.factor)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "valor"
  ) %>%
  ggplot(aes(x = valor)) +
  geom_bar(fill = "coral") +
  facet_wrap(~variable, scales = "free") +
  # Rotate the category labels on the x axis by 45 degrees.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Boxplots of the numeric variables, drawn horizontally.
# select(where(...)) replaces the superseded scoped verb select_if().
# NOTE(review): all variables share one value axis here, so variables with a
# small range are drawn very compressed next to Ingresos.
datos %>%
  select(where(is.numeric)) %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "valor"
  ) %>%
  ggplot(aes(x = variable, y = valor)) +
  geom_boxplot(fill = "lightgreen") +
  coord_flip()

Estimaciones Estadísticas

var_numerica <- datos$Ingresos

# Mean and 95% confidence interval.
# The standard deviation is estimated from the sample with sd(), so the
# critical value is taken from Student's t with n - 1 degrees of freedom
# instead of the normal value 1.96. This is the same interval that
# t.test(var_numerica) reports.
n_media <- sum(!is.na(var_numerica))
media <- mean(var_numerica, na.rm = TRUE)
error <- sd(var_numerica, na.rm = TRUE) / sqrt(n_media)
t_crit <- qt(0.975, df = n_media - 1)
ic_media <- c(media - t_crit * error, media + t_crit * error)
ic_media

## [1] 43505.52 54378.60

# Proportion of records from Amazon and its 95% Wald confidence interval.
prop_table <- prop.table(table(datos$Plataforma))
n <- sum(!is.na(datos$Plataforma))
# Select the level by name rather than by position, and strip the table
# attributes so the level name is not carried into the interval limits.
p <- as.numeric(prop_table["Amazon"])
error_p <- sqrt(p * (1 - p) / n)
ic_prop <- c(p - 1.96 * error_p, p + 1.96 * error_p)
ic_prop

## [1] 0.2010626 0.3789374

# 95% confidence interval for the variance, based on the chi-squared
# distribution with n - 1 degrees of freedom.
n_var <- sum(!is.na(var_numerica))
s2 <- var(var_numerica, na.rm = TRUE)
gl_var <- n_var - 1
# The 0.975 quantile gives the lower limit and the 0.025 quantile the upper.
ic_var <- gl_var * s2 / qchisq(c(0.975, 0.025), df = gl_var)
ic_var

## [1]  578712669 1013063488

Comparaciones entre Plataformas

# Keep only the two platforms being compared and drop the unused factor
# levels so that Plataforma has exactly two groups.
datos_dos <- datos %>%
  filter(Plataforma %in% c("Amazon", "MercadoLibre")) %>%
  mutate(Plataforma = droplevels(Plataforma))

# Difference in means: Welch two-sample t-test (t.test() defaults spelled out).
t.test(
  Ingresos ~ Plataforma,
  data = datos_dos,
  alternative = "two.sided",
  mu = 0,
  var.equal = FALSE,
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  Ingresos by Plataforma
## t = -1.9096, df = 55.889, p-value = 0.06132
## alternative hypothesis: true difference in means between group Amazon and group MercadoLibre is not equal to 0
## 95 percent confidence interval:
##  -28715.3929    687.7632
## sample estimates:
##       mean in group Amazon mean in group MercadoLibre 
##                   39110.96                   53124.78

# Difference in proportions: share of records with at least one return.
datos_dos$Hubo_Devolucion <- ifelse(datos_dos$Devoluciones > 0, 1, 0)
tabla_prop <- table(datos_dos$Plataforma, datos_dos$Hubo_Devolucion)
# prop.test() treats the first column of a two-column table as the successes.
# table() puts "0" (no returns) first, so reorder the columns to make "1"
# (returns) the success; otherwise the estimates describe records WITHOUT
# returns. tabla_prop itself is left unchanged.
tabla_dev <- tabla_prop[, c("1", "0")]
prop.test(tabla_dev)

## Warning in prop.test(tabla_dev): Chi-squared approximation may be incorrect

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  tabla_dev
## X-squared = 0, df = 1, p-value = 1
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.1826125  0.1136470
## sample estimates:
##    prop 1    prop 2 
## 0.9310345 0.9655172

# Ratio of variances: F test between the two platforms
# (var.test() defaults spelled out).
var.test(
  Ingresos ~ Plataforma,
  data = datos_dos,
  ratio = 1,
  alternative = "two.sided",
  conf.level = 0.95
)

## 
##  F test to compare two variances
## 
## data:  Ingresos by Plataforma
## F = 0.91475, num df = 28, denom df = 28, p-value = 0.8153
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4294754 1.9483484
## sample estimates:
## ratio of variances 
##          0.9147501

Pruebas de Hipótesis

# Hypothesis test for the mean: H0 mean = 10000, two-sided.
t.test(var_numerica, mu = 10000, alternative = "two.sided", conf.level = 0.95)

## 
##  One Sample t-test
## 
## data:  var_numerica
## t = 14.213, df = 99, p-value < 2.2e-16
## alternative hypothesis: true mean is not equal to 10000
## 95 percent confidence interval:
##  43505.52 54378.60
## sample estimates:
## mean of x 
##  48942.06

# Hypothesis test for the proportion: H0 p = 0.5, two-sided, with the
# continuity correction (prop.test() defaults spelled out).
prop.test(
  x = round(p * n),
  n = n,
  p = 0.5,
  alternative = "two.sided",
  conf.level = 0.95,
  correct = TRUE
)

## 
##  1-sample proportions test with continuity correction
## 
## data:  round(p * n) out of n, null probability 0.5
## X-squared = 16.81, df = 1, p-value = 4.132e-05
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
##  0.2057664 0.3906248
## sample estimates:
##    p 
## 0.29

# Hypothesis test for the variance: H0 sigma^2 = sigma2_0, two-sided.
sigma2_0 <- 1000000
chi_valor <- (n_var - 1) * s2 / sigma2_0
# Ask pchisq() for each tail directly. Computing the upper tail as
# 1 - pchisq() loses precision once the lower tail is close to 1.
cola_inf <- pchisq(chi_valor, df = n_var - 1)
cola_sup <- pchisq(chi_valor, df = n_var - 1, lower.tail = FALSE)
p_valor <- 2 * min(cola_inf, cola_sup)
p_valor

## [1] 0

# Re-run of the key tests.
# Welch two-sample t-test on Ingresos by Plataforma (defaults spelled out).
t.test(
  Ingresos ~ Plataforma,
  data = datos_dos,
  alternative = "two.sided",
  mu = 0,
  var.equal = FALSE,
  conf.level = 0.95
)

## 
##  Welch Two Sample t-test
## 
## data:  Ingresos by Plataforma
## t = -1.9096, df = 55.889, p-value = 0.06132
## alternative hypothesis: true difference in means between group Amazon and group MercadoLibre is not equal to 0
## 95 percent confidence interval:
##  -28715.3929    687.7632
## sample estimates:
##       mean in group Amazon mean in group MercadoLibre 
##                   39110.96                   53124.78

# prop.test() treats the first column of a two-column table as the successes.
# tabla_prop has "0" (no returns) first, so put "1" (returns) first; otherwise
# the estimates describe records WITHOUT returns.
tabla_dev <- tabla_prop[, c("1", "0")]
prop.test(tabla_dev)

## Warning in prop.test(tabla_dev): Chi-squared approximation may be incorrect

## 
##  2-sample test for equality of proportions with continuity correction
## 
## data:  tabla_dev
## X-squared = 0, df = 1, p-value = 1
## alternative hypothesis: two.sided
## 95 percent confidence interval:
##  -0.1826125  0.1136470
## sample estimates:
##    prop 1    prop 2 
## 0.9310345 0.9655172

# F test for the ratio of variances (var.test() defaults spelled out).
var.test(
  Ingresos ~ Plataforma,
  data = datos_dos,
  ratio = 1,
  alternative = "two.sided",
  conf.level = 0.95
)

## 
##  F test to compare two variances
## 
## data:  Ingresos by Plataforma
## F = 0.91475, num df = 28, denom df = 28, p-value = 0.8153
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.4294754 1.9483484
## sample estimates:
## ratio of variances 
##          0.9147501