Aplicación de estadistico

TALLER_2A

Autor/a
Afiliación

Carlos Mario González Guerra

Fecha de publicación

24 de noviembre de 2025

Territorial Tolima

Código para instalar

Ver código
# Packages used throughout the document, attached in this order.
# Each one is installed only when it is missing and is then attached with
# library(), which stops with an error if the package still cannot be loaded
# (require() would only return FALSE).
paquetes <- c(
  "tidyverse", "palmerpenguins", "psych", "modeest",
  "ggthemes", "ggplot2", "e1071"
)
for (paquete in paquetes) {
  if (!requireNamespace(paquete, quietly = TRUE)) {
    install.packages(paquete)
  }
  library(paquete, character.only = TRUE)
}

0.1 1. ESTADISTICOS

En el contexto de las ciencias, la estadística es una disciplina que se ocupa de la recolección, análisis e interpretación de datos. Los estadísticos son medidas de resumen que describen aspectos clave de un conjunto de datos. Estas medidas permiten hacer inferencias, identificar patrones y comprender la variabilidad de los datos.

\bar{x} = \frac{\sum x_i}{n}

Ver código
set.seed(222)
datos <- sample(1:10, size = 20, replace = TRUE)
datos
 [1]  2  7  8  6 10  4  9 10  9  2  1  8 10  7  9 10  5  9  1  5
Ver código
palmerpenguins::penguins$body_mass_g %>% mean(na.rm = TRUE)
[1] 4201.754
Ver código
# Mean "by hand": sum of the non-missing body masses divided by how many
# non-missing values there are (counted from the data, not typed in).
masa <- palmerpenguins::penguins$body_mass_g
sum(masa, na.rm = TRUE) / sum(!is.na(masa))
[1] 4201.754
Ver código
sum(datos)/length(datos)
[1] 6.6
Ver código
mean(datos)
[1] 6.6
Ver código
# Plot each observation against the sample mean (dashed line); the vertical
# segments run from the mean to each value.
# seq_along() is used for the index and the layers map the tibble column `y`
# rather than reaching back to the global `datos`.
tibble(x = seq_along(datos), y = datos) %>%
  ggplot() +
  geom_hline(yintercept = mean(datos), color = 2, linetype = "dashed") +
  geom_segment(aes(x = x, xend = x, y = mean(datos), yend = y)) +
  geom_point(aes(x = x, y = y), color = 2, size = 4) +
  scale_y_continuous(breaks = seq(0, 10, by = 2)) +
  labs(
    x = "", y = "Datos"
  )

\bar{y} = \frac{\sum y_i \, n_i}{n}

n = \sum_{i=1}^{k} n_i

Ver código
set.seed(555)
peso <- round(rnorm(200, mean = 60, sd = 5), 1)
peso
  [1] 58.4 62.5 61.9 69.4 51.1 64.4 59.2 66.8 60.2 63.1 58.6 56.7 55.0 55.7 56.6
 [16] 59.6 61.3 56.2 52.8 62.6 63.5 62.3 54.2 66.4 54.7 60.2 63.4 59.5 59.0 67.2
 [31] 59.0 58.5 61.3 58.7 58.5 60.9 57.8 62.8 57.0 55.1 71.1 67.1 64.4 58.2 55.5
 [46] 61.7 62.0 58.3 71.3 44.4 65.7 60.4 61.4 58.9 59.2 55.2 65.5 56.7 54.4 60.3
 [61] 61.9 63.2 53.4 65.0 67.2 56.8 59.6 64.2 70.3 55.2 54.3 58.0 63.9 59.2 52.5
 [76] 56.3 61.7 61.6 66.3 59.6 50.8 66.5 63.0 54.4 60.9 66.7 61.1 53.8 55.7 61.1
 [91] 60.4 52.2 61.7 66.0 60.9 63.8 63.8 58.5 52.3 65.0 56.0 52.4 60.2 63.2 77.1
[106] 56.1 59.4 60.4 62.7 60.6 61.3 57.2 63.7 51.3 53.6 60.8 65.9 63.2 60.7 51.6
[121] 59.6 57.4 63.4 66.2 65.4 56.5 55.0 63.3 61.1 59.3 70.2 59.1 66.5 56.4 56.2
[136] 50.5 53.1 61.7 59.3 65.3 54.9 48.4 55.5 59.6 56.7 56.8 71.8 64.9 55.6 55.4
[151] 52.5 59.0 72.8 69.0 57.1 64.1 59.8 58.1 59.1 64.2 61.7 53.1 59.1 62.9 60.0
[166] 64.0 60.3 66.2 61.6 69.1 60.8 57.9 52.8 60.9 60.1 54.4 63.2 60.9 60.7 57.4
[181] 57.6 65.5 62.5 57.9 66.8 58.4 59.7 56.8 64.6 63.9 61.0 58.8 60.0 64.3 52.4
[196] 51.6 59.0 52.8 65.1 65.9
Ver código
# Grouped frequency table for `peso`.
# The class limits are defined once so the midpoints (mi) are derived from
# them instead of being typed in separately.
limites <- seq(43, 79, by = 4)
interv <- cut(x = peso, breaks = limites, include.lowest = TRUE, right = TRUE)
tabla_1 <- tibble(Peso = interv) %>%
  # .drop = FALSE keeps classes with no observations (fi = 0), so the table
  # always has one row per class and lines up with the midpoints below.
  count(Peso, name = "fi", .drop = FALSE) %>%
  mutate(
    mi = (head(limites, -1) + tail(limites, -1)) / 2 # class midpoints
  ) %>%
  relocate(Peso, mi, fi) %>%
  mutate(
    hi = fi / sum(fi), # relative frequency
    Fi = cumsum(fi),   # cumulative absolute frequency
    Hi = cumsum(hi),   # cumulative relative frequency
    mifi = mi * fi     # midpoint times frequency
  )
tabla_1
# A tibble: 9 × 7
  Peso       mi    fi    hi    Fi    Hi  mifi
  <fct>   <dbl> <int> <dbl> <int> <dbl> <dbl>
1 [43,47]    45     1 0.005     1 0.005    45
2 (47,51]    49     3 0.015     4 0.02    147
3 (51,55]    53    27 0.135    31 0.155  1431
4 (55,59]    57    49 0.245    80 0.4    2793
5 (59,63]    61    64 0.32    144 0.72   3904
6 (63,67]    65    43 0.215   187 0.935  2795
7 (67,71]    69     8 0.04    195 0.975   552
8 (71,75]    73     4 0.02    199 0.995   292
9 (75,79]    77     1 0.005   200 1        77
Ver código
# Two interval labels stored as a factor.
etiquetas <- c("[1,20]", "(47,51]")
Peso1 <- factor(etiquetas)
Ver código
sum(tabla_1$mifi)/length(peso)
[1] 60.18
Ver código
mean(peso)
[1] 60.1125

1 Media Geométrica

2 La media geométrica es una medida de tendencia central que se usa para promediar valores positivos que están relacionados multiplicativamente, no aditivamente.

En lugar de sumar los datos como en la media aritmética, la media geométrica:

Multiplica todos los valores.

Toma la raíz n-ésima (según la cantidad de datos).

G = \sqrt[n]{X_1 \cdot X_2 \cdot X_3 \cdots X_n}

Ver código
# Draw 5000 values from 1..10 with replacement. The previous call used c(),
# which does not sample: it only built the 12-element vector 1:10, 5000, TRUE.
set.seed(222)
x <- sample(1:10, size = 5000, replace = TRUE)
# Geometric mean as "product and n-th root". The root is applied to every
# factor before multiplying because prod(x) on its own overflows to Inf
# with this many values.
prod(x^(1 / length(x)))
[1] 7.159693
Ver código
exp(mean(log(x)))
[1] 7.159693
Ver código
psych::geometric.mean(x)
[1] 7.159693
Ver código
x <- c(5,8.2,6.1)
1+x/100
[1] 1.050 1.082 1.061
Ver código
(psych::geometric.mean(1+x/100)-1)*100
[1] 6.425079

Datos Agrupados

Ver código
set.seed(555)
peso <- round(rnorm(200, mean = 60, sd = 5), 1)
peso
  [1] 58.4 62.5 61.9 69.4 51.1 64.4 59.2 66.8 60.2 63.1 58.6 56.7 55.0 55.7 56.6
 [16] 59.6 61.3 56.2 52.8 62.6 63.5 62.3 54.2 66.4 54.7 60.2 63.4 59.5 59.0 67.2
 [31] 59.0 58.5 61.3 58.7 58.5 60.9 57.8 62.8 57.0 55.1 71.1 67.1 64.4 58.2 55.5
 [46] 61.7 62.0 58.3 71.3 44.4 65.7 60.4 61.4 58.9 59.2 55.2 65.5 56.7 54.4 60.3
 [61] 61.9 63.2 53.4 65.0 67.2 56.8 59.6 64.2 70.3 55.2 54.3 58.0 63.9 59.2 52.5
 [76] 56.3 61.7 61.6 66.3 59.6 50.8 66.5 63.0 54.4 60.9 66.7 61.1 53.8 55.7 61.1
 [91] 60.4 52.2 61.7 66.0 60.9 63.8 63.8 58.5 52.3 65.0 56.0 52.4 60.2 63.2 77.1
[106] 56.1 59.4 60.4 62.7 60.6 61.3 57.2 63.7 51.3 53.6 60.8 65.9 63.2 60.7 51.6
[121] 59.6 57.4 63.4 66.2 65.4 56.5 55.0 63.3 61.1 59.3 70.2 59.1 66.5 56.4 56.2
[136] 50.5 53.1 61.7 59.3 65.3 54.9 48.4 55.5 59.6 56.7 56.8 71.8 64.9 55.6 55.4
[151] 52.5 59.0 72.8 69.0 57.1 64.1 59.8 58.1 59.1 64.2 61.7 53.1 59.1 62.9 60.0
[166] 64.0 60.3 66.2 61.6 69.1 60.8 57.9 52.8 60.9 60.1 54.4 63.2 60.9 60.7 57.4
[181] 57.6 65.5 62.5 57.9 66.8 58.4 59.7 56.8 64.6 63.9 61.0 58.8 60.0 64.3 52.4
[196] 51.6 59.0 52.8 65.1 65.9
Ver código
interv = cut(x = peso, breaks = seq(43, 79, by = 4), include.lowest = TRUE, right = TRUE)
tabla_2 <- tibble(Peso = interv) %>% 
  group_by(Peso) %>%
  summarise(fi = n()) %>% 
  mutate(
    mi = seq(45, 77, by = 4)
  ) %>% 
  relocate(Peso, mi, fi) %>% 
  mutate(
    hi = fi/sum(fi),
    Fi = cumsum(fi),
    Hi = cumsum(hi),
    mifi = mi*fi
  )
tabla_2
# A tibble: 9 × 7
  Peso       mi    fi    hi    Fi    Hi  mifi
  <fct>   <dbl> <int> <dbl> <int> <dbl> <dbl>
1 [43,47]    45     1 0.005     1 0.005    45
2 (47,51]    49     3 0.015     4 0.02    147
3 (51,55]    53    27 0.135    31 0.155  1431
4 (55,59]    57    49 0.245    80 0.4    2793
5 (59,63]    61    64 0.32    144 0.72   3904
6 (63,67]    65    43 0.215   187 0.935  2795
7 (67,71]    69     8 0.04    195 0.975   552
8 (71,75]    73     4 0.02    199 0.995   292
9 (75,79]    77     1 0.005   200 1        77
Ver código
prod(tabla_2$mi^tabla_2$fi)^(1/length(peso))
[1] Inf

G = \exp\left( \frac{\sum f_i \ln m_i}{n} \right)

Ver código
exp(sum(tabla_2$fi*log(tabla_2$mi))/length(peso))
[1] 59.96811

La media armónica

es una medida de tendencia central que se usa cuando queremos promediar tasas, razones o velocidades, es decir, valores que están en el denominador de una razón.

H = \frac{n}{\frac{1}{X_1} + \frac{1}{X_2} + \cdots + \frac{1}{X_n}}

Datos no Agrupados

Ver código
x <- c(4,7,2,3,4,6)
length(x)/sum(1/x)
[1] 3.652174
Ver código
psych::harmonic.mean(x)
[1] 3.652174
Ver código
palmerpenguins::penguins$bill_length_mm %>% psych::harmonic.mean(na.rm = TRUE)
[1] 43.2384

2 Datos Agrupados

Ver código
set.seed(555)
peso <- round(rnorm(200, mean = 60, sd = 5), 1)
peso
  [1] 58.4 62.5 61.9 69.4 51.1 64.4 59.2 66.8 60.2 63.1 58.6 56.7 55.0 55.7 56.6
 [16] 59.6 61.3 56.2 52.8 62.6 63.5 62.3 54.2 66.4 54.7 60.2 63.4 59.5 59.0 67.2
 [31] 59.0 58.5 61.3 58.7 58.5 60.9 57.8 62.8 57.0 55.1 71.1 67.1 64.4 58.2 55.5
 [46] 61.7 62.0 58.3 71.3 44.4 65.7 60.4 61.4 58.9 59.2 55.2 65.5 56.7 54.4 60.3
 [61] 61.9 63.2 53.4 65.0 67.2 56.8 59.6 64.2 70.3 55.2 54.3 58.0 63.9 59.2 52.5
 [76] 56.3 61.7 61.6 66.3 59.6 50.8 66.5 63.0 54.4 60.9 66.7 61.1 53.8 55.7 61.1
 [91] 60.4 52.2 61.7 66.0 60.9 63.8 63.8 58.5 52.3 65.0 56.0 52.4 60.2 63.2 77.1
[106] 56.1 59.4 60.4 62.7 60.6 61.3 57.2 63.7 51.3 53.6 60.8 65.9 63.2 60.7 51.6
[121] 59.6 57.4 63.4 66.2 65.4 56.5 55.0 63.3 61.1 59.3 70.2 59.1 66.5 56.4 56.2
[136] 50.5 53.1 61.7 59.3 65.3 54.9 48.4 55.5 59.6 56.7 56.8 71.8 64.9 55.6 55.4
[151] 52.5 59.0 72.8 69.0 57.1 64.1 59.8 58.1 59.1 64.2 61.7 53.1 59.1 62.9 60.0
[166] 64.0 60.3 66.2 61.6 69.1 60.8 57.9 52.8 60.9 60.1 54.4 63.2 60.9 60.7 57.4
[181] 57.6 65.5 62.5 57.9 66.8 58.4 59.7 56.8 64.6 63.9 61.0 58.8 60.0 64.3 52.4
[196] 51.6 59.0 52.8 65.1 65.9
Ver código
interv = cut(x = peso, breaks = seq(43, 79, by = 4), include.lowest = TRUE, right = TRUE)
tabla_3 <- tibble(Peso = interv) %>% 
  group_by(Peso) %>%
  summarise(fi = n()) %>% 
  mutate(
    mi = seq(45, 77, by = 4)
  ) %>% 
  relocate(Peso, mi, fi) %>% 
  mutate(
    hi = fi/sum(fi),
    Fi = cumsum(fi),
    Hi = cumsum(hi),
    mifi = mi*fi
  )
tabla_3
# A tibble: 9 × 7
  Peso       mi    fi    hi    Fi    Hi  mifi
  <fct>   <dbl> <int> <dbl> <int> <dbl> <dbl>
1 [43,47]    45     1 0.005     1 0.005    45
2 (47,51]    49     3 0.015     4 0.02    147
3 (51,55]    53    27 0.135    31 0.155  1431
4 (55,59]    57    49 0.245    80 0.4    2793
5 (59,63]    61    64 0.32    144 0.72   3904
6 (63,67]    65    43 0.215   187 0.935  2795
7 (67,71]    69     8 0.04    195 0.975   552
8 (71,75]    73     4 0.02    199 0.995   292
9 (75,79]    77     1 0.005   200 1        77
Ver código
psych::harmonic.mean(peso)
[1] 59.70164
Ver código
# Harmonic mean for grouped data: n / sum(fi / mi), with n taken from the
# table (sum of the class frequencies) instead of a typed-in constant.
sum(tabla_3$fi) / sum(tabla_3$fi / tabla_3$mi)
[1] 59.7555
Ver código
mean(c(40, 60))
[1] 50
Ver código
psych::harmonic.mean(c(40, 60))
[1] 48
Ver código
120/40
[1] 3
Ver código
120/60
[1] 2
Ver código
240/5
[1] 48

3 La media aritmética

ponderada es una medida de tendencia central que se usa cuando cada dato tiene un peso o frecuencia distinta. Es decir, algunos valores cuentan más que otros.

\bar{x}_p = \frac{\sum_{i=1}^{n} w_i x_i}{\sum_{i=1}^{n} w_i}

Ver código
p <- c(50,30,10,10)
x <- c(2.5,4.0,2.0,5.0)
sum(x*p)/sum(p)
[1] 3.15
Ver código
weighted.mean(x = x, w = p)
[1] 3.15
Ver código
p <- c(5,10,20,30)
x <- c(5,8,10,25)
sum(x*p)/sum(p)
[1] 16.23077
Ver código
weighted.mean(x = x, w = p)
[1] 16.23077

#Datos agrupados

Ver código
set.seed(555)
peso <- round(rnorm(200, mean = 60, sd = 5), 1)
interv = cut(x = peso, breaks = seq(43,79, by = 4), include.lowest = TRUE, right = TRUE)
tabla_4 <- tibble(Peso = interv) %>% 
  group_by(Peso) %>% 
  summarise(fi = n()) %>% 
  mutate(
    mi = seq(45,77, by = 4)
  ) %>% 
  relocate(Peso, mi, fi) %>% 
  mutate(
    hi = fi/sum(fi),
    Fi = cumsum(fi),
    Hi = cumsum(hi),
    mifi = mi*fi
  )
tabla_4
# A tibble: 9 × 7
  Peso       mi    fi    hi    Fi    Hi  mifi
  <fct>   <dbl> <int> <dbl> <int> <dbl> <dbl>
1 [43,47]    45     1 0.005     1 0.005    45
2 (47,51]    49     3 0.015     4 0.02    147
3 (51,55]    53    27 0.135    31 0.155  1431
4 (55,59]    57    49 0.245    80 0.4    2793
5 (59,63]    61    64 0.32    144 0.72   3904
6 (63,67]    65    43 0.215   187 0.935  2795
7 (67,71]    69     8 0.04    195 0.975   552
8 (71,75]    73     4 0.02    199 0.995   292
9 (75,79]    77     1 0.005   200 1        77
Ver código
sum(tabla_4$mifi)/200
[1] 60.18
Ver código
sum(tabla_4$mi*tabla_4$hi)
[1] 60.18
Ver código
mean(peso)
[1] 60.1125

3.1 Media ponderada normalizada

Para cada peso wi del dato xi se define su peso normalizado como:

w'_i = \frac{w_i}{\sum_{k=1}^n w_k}

Se tiene que la suma de los pesos normalizados es 1 y, por lo tanto, la media ponderada (con pesos wi) es

\bar{x}_p = \sum_{i=1}^n x_i \, w'_i

Ver código
p <- c(5,10,20,30)/65
sum(p)
[1] 1
Ver código
x <- c(5,8,10,25)
sum(x*p)
[1] 16.23077
Ver código
set.seed(555)
personas <- round(rnorm(250, mean = 4, sd = 1))
tabla_5 <- tibble(Personas = personas) %>% 
  group_by(Personas) %>% 
  summarise(fi = n()) %>% 
  mutate(
    hi = round(fi/sum(fi), 4),
    Fi = cumsum(fi),
    Hi = cumsum(hi),
    xifi = Personas*fi
  )
tabla_5
# A tibble: 7 × 6
  Personas    fi    hi    Fi    Hi  xifi
     <dbl> <int> <dbl> <int> <dbl> <dbl>
1        1     2 0.008     2 0.008     2
2        2    14 0.056    16 0.064    28
3        3    57 0.228    73 0.292   171
4        4    99 0.396   172 0.688   396
5        5    63 0.252   235 0.94    315
6        6    13 0.052   248 0.992    78
7        7     2 0.008   250 1        14
Ver código
mean(personas)
[1] 4.016
Ver código
sum(tabla_5$xifi)/250
[1] 4.016
Ver código
sum(tabla_5$Personas*tabla_5$hi)
[1] 4.016

moda

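La moda es el valor que aparece con mayor frecuencia en un conjunto de datos. Para datos agrupados se calcula a partir de la clase modal (la de mayor frecuencia), donde L_1 es su límite inferior, \Delta_1 y \Delta_2 son las diferencias entre su frecuencia y las de las clases anterior y siguiente, y C es la amplitud de la clase: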

Moda = L_1 + \left( \frac{\Delta_1}{\Delta_1 + \Delta_2} \right) C

Ver código
set.seed(55)
votacion <- sample(c("Pedro", "María", "Daniel", "Ana"), 
                   size = 40, replace = TRUE)
table(votacion)
votacion
   Ana Daniel  María  Pedro 
    10      5     18      7 

Ejemplo 1

Ver código
tibble(votacion = votacion) %>% 
  ggplot(aes(x = votacion)) +
  geom_bar()

Ejemplo 2

Ver código
n <- 10000
set.seed(777)
dados <- sample(1:6, size = n, replace = TRUE) + sample(1:6, size = n, replace = TRUE)
tibble(dados = dados) %>% 
  ggplot(aes(x = as.factor(dados))) +
  geom_bar(fill = "orange", color = "black")

Ver código
names(which.max(table(dados)))
[1] "7"

Ejemplo 3

Ver código
mlv(dados)
[1] 7
Ver código
mlv(votacion)
[1] "María"

Ejemplo 4

Ver código
pen <- palmerpenguins::penguins %>% drop_na()
pen
# A tibble: 333 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           36.7          19.3               193        3450
 5 Adelie  Torgersen           39.3          20.6               190        3650
 6 Adelie  Torgersen           38.9          17.8               181        3625
 7 Adelie  Torgersen           39.2          19.6               195        4675
 8 Adelie  Torgersen           41.1          17.6               182        3200
 9 Adelie  Torgersen           38.6          21.2               191        3800
10 Adelie  Torgersen           34.6          21.1               198        4400
# ℹ 323 more rows
# ℹ 2 more variables: sex <fct>, year <int>
Ver código
names(which.max(table(pen$bill_length_mm)))
[1] "41.1"
Ver código
pen$bill_length_mm %>% summary()
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  32.10   39.50   44.50   43.99   48.60   59.60 
Ver código
interv <- cut(pen$bill_length_mm, breaks = seq(32, 60, by = 2.8))
tibble(Intervalos = interv) %>% 
group_by(Intervalos) %>% count()
# A tibble: 10 × 2
# Groups:   Intervalos [10]
   Intervalos      n
   <fct>       <int>
 1 (32,34.8]       8
 2 (34.8,37.6]    42
 3 (37.6,40.4]    52
 4 (40.4,43.2]    52
 5 (43.2,46]      45
 6 (46,48.8]      55
 7 (48.8,51.6]    59
 8 (51.6,54.4]    15
 9 (54.4,57.2]     3
10 (57.2,60]       2
Ver código
# Histogram of bill length using the same class limits as the frequency
# table (32 to 60 in steps of 2.8).
pen %>%
  ggplot(aes(x = bill_length_mm)) +
  geom_histogram(
    breaks = seq(32, 60, by = 2.8),
    fill = "orange", # bar fill colour
    color = "black"  # optional: black bar outlines
  ) +
  # Axis ticks on the class limits. The scale has to be added to the plot
  # with `+`; called on its own it only prints a scale object.
  scale_x_continuous(breaks = seq(32, 60, by = 2.8))

Ver código
scale_x_continuous(breaks = seq(32, 60, by = 2.8))
<ScaleContinuousPosition>
 Range:  
 Limits:    0 --    1
Ver código
48.8 + (59-55)/((59-55)+(59-15))*2.8
[1] 49.03333
Ver código
mlv(pen$bill_length_mm, method = "meanshift")
[1] 46.55969
attr(,"iterations")
[1] 68
Ver código
pen %>% 
  ggplot(aes(x = bill_length_mm)) +
  geom_density() +
  geom_vline(xintercept = 46.55969, color = "red") +
scale_x_continuous(breaks = seq(32, 60, by = 2.8))

Ver código
mlv(as.numeric(pen$body_mass_g), method = "meanshift")
[1] 3658.442
attr(,"iterations")
[1] 56
Ver código
# Density of body mass with the mean-shift mode marked in red.
pen %>%
  ggplot(aes(x = body_mass_g)) +
  geom_density() +
  geom_vline(xintercept = 3658.442, color = "red") +
  # Body mass is recorded in grams (values in the thousands), so the
  # 32-60 breaks used for bill length would leave this axis without ticks.
  scale_x_continuous(breaks = seq(2500, 6500, by = 500))

4 Mediana

La mediana es una medida de tendencia central que divide un conjunto de datos en dos partes iguales.

Datos no Agrupados

Si n es impar

Formula: M_e = x_{((n+1)/2)}

Ver código
set.seed(33)
datos <- sample(1:100, size = 55, replace = TRUE)
datos
 [1] 42  8 86 60  9 92 53 48 29 39 42 96 86 32 66  4  3 60 34 61 79 59 85 81 51
[26] 94 43 92 65 41 66 15 79 86 77 83 42 16 57 82 50 58 97 74 13 68 63 89  4  5
[51] 53  3 86 92 81
Ver código
sort(datos)
 [1]  3  3  4  4  5  8  9 13 15 16 29 32 34 39 41 42 42 42 43 48 50 51 53 53 57
[26] 58 59 60 60 61 63 65 66 66 68 74 77 79 79 81 81 82 83 85 86 86 86 86 89 92
[51] 92 92 94 96 97
Ver código
median(datos)
[1] 60
Ver código
n <- length(datos)
sort(datos)[(n+1)/2]
[1] 60
Ver código
set.seed(444)
datos <- sample(1:100, size = 15, replace = TRUE)
datos
 [1]  6 99 67 17  3 42 56 88 93 71 95 45 30 49 46
Ver código
sort(datos)
 [1]  3  6 17 30 42 45 46 49 56 67 71 88 93 95 99
Ver código
median(datos)
[1] 49
Ver código
tibble(x = 1:length(datos), y = sort(datos)) %>% 
  ggplot(aes(x = x, y = y)) +
  geom_hline(yintercept = median(datos), color = 2
             , linetype = "dashed") +
  geom_point(color = 2, size = 3) +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  labs(
    x = "", y = "Datos"
  )

Si n es par

Formula:

M_e = \frac{x_{(n/2)} + x_{((n/2)+1)}}{2}

Ver código
set.seed(33)
datos <- sample(1:100, size = 60, replace = TRUE)
datos
 [1] 42  8 86 60  9 92 53 48 29 39 42 96 86 32 66  4  3 60 34 61 79 59 85 81 51
[26] 94 43 92 65 41 66 15 79 86 77 83 42 16 57 82 50 58 97 74 13 68 63 89  4  5
[51] 53  3 86 92 81 15 63 28 41 59
Ver código
sort(datos)
 [1]  3  3  4  4  5  8  9 13 15 15 16 28 29 32 34 39 41 41 42 42 42 43 48 50 51
[26] 53 53 57 58 59 59 60 60 61 63 63 65 66 66 68 74 77 79 79 81 81 82 83 85 86
[51] 86 86 86 89 92 92 92 94 96 97
Ver código
median(datos)
[1] 59
Ver código
n <- length(datos)
(sort(datos)[n/2] + sort(datos)[n/2+1])/2
[1] 59
Ver código
tibble(x = 1:length(datos), y = sort(datos)) %>% 
  ggplot(aes(x = x, y = y)) +
  geom_hline(yintercept = median(datos), color = 2
             , linetype = "dashed") +
  geom_point(color = 2, size = 3) +
  scale_y_continuous(breaks = seq(0, 100, by = 10)) +
  labs(
    x = "", y = "Datos"
  )

Ejemplo

Ver código
pen <- palmerpenguins::penguins %>% drop_na()
pen
# A tibble: 333 × 8
   species island    bill_length_mm bill_depth_mm flipper_length_mm body_mass_g
   <fct>   <fct>              <dbl>         <dbl>             <int>       <int>
 1 Adelie  Torgersen           39.1          18.7               181        3750
 2 Adelie  Torgersen           39.5          17.4               186        3800
 3 Adelie  Torgersen           40.3          18                 195        3250
 4 Adelie  Torgersen           36.7          19.3               193        3450
 5 Adelie  Torgersen           39.3          20.6               190        3650
 6 Adelie  Torgersen           38.9          17.8               181        3625
 7 Adelie  Torgersen           39.2          19.6               195        4675
 8 Adelie  Torgersen           41.1          17.6               182        3200
 9 Adelie  Torgersen           38.6          21.2               191        3800
10 Adelie  Torgersen           34.6          21.1               198        4400
# ℹ 323 more rows
# ℹ 2 more variables: sex <fct>, year <int>
Ver código
median(pen$bill_length_mm)
[1] 44.5
Ver código
median(palmerpenguins::penguins$bill_length_mm, na.rm = TRUE)
[1] 44.45

Datos Agrupados

Formula:

M_e = L_i + \frac{\frac{N}{2} - F_{i-1}}{f_i} * a_i

Ver código
set.seed(555)
peso <- round(rnorm(200, mean = 60, sd = 5), 1)
interv = cut(x = peso, breaks = seq(43, 79, by = 4), include.lowest = TRUE, right = TRUE)
tabla_7<- tibble(Peso = interv) %>% 
  group_by(Peso) %>% 
  summarise(fi = n()) %>% 
  mutate(
    mi = seq(45, 77, by = 4)
  ) %>% 
  relocate(Peso, mi, fi) %>% 
  mutate(
    hi = fi/sum(fi),
    Fi = cumsum(fi),
    Hi = cumsum(hi),
    mifi = mi*fi
  )
tabla_7
# A tibble: 9 × 7
  Peso       mi    fi    hi    Fi    Hi  mifi
  <fct>   <dbl> <int> <dbl> <int> <dbl> <dbl>
1 [43,47]    45     1 0.005     1 0.005    45
2 (47,51]    49     3 0.015     4 0.02    147
3 (51,55]    53    27 0.135    31 0.155  1431
4 (55,59]    57    49 0.245    80 0.4    2793
5 (59,63]    61    64 0.32    144 0.72   3904
6 (63,67]    65    43 0.215   187 0.935  2795
7 (67,71]    69     8 0.04    195 0.975   552
8 (71,75]    73     4 0.02    199 0.995   292
9 (75,79]    77     1 0.005   200 1        77
Ver código
peso
  [1] 58.4 62.5 61.9 69.4 51.1 64.4 59.2 66.8 60.2 63.1 58.6 56.7 55.0 55.7 56.6
 [16] 59.6 61.3 56.2 52.8 62.6 63.5 62.3 54.2 66.4 54.7 60.2 63.4 59.5 59.0 67.2
 [31] 59.0 58.5 61.3 58.7 58.5 60.9 57.8 62.8 57.0 55.1 71.1 67.1 64.4 58.2 55.5
 [46] 61.7 62.0 58.3 71.3 44.4 65.7 60.4 61.4 58.9 59.2 55.2 65.5 56.7 54.4 60.3
 [61] 61.9 63.2 53.4 65.0 67.2 56.8 59.6 64.2 70.3 55.2 54.3 58.0 63.9 59.2 52.5
 [76] 56.3 61.7 61.6 66.3 59.6 50.8 66.5 63.0 54.4 60.9 66.7 61.1 53.8 55.7 61.1
 [91] 60.4 52.2 61.7 66.0 60.9 63.8 63.8 58.5 52.3 65.0 56.0 52.4 60.2 63.2 77.1
[106] 56.1 59.4 60.4 62.7 60.6 61.3 57.2 63.7 51.3 53.6 60.8 65.9 63.2 60.7 51.6
[121] 59.6 57.4 63.4 66.2 65.4 56.5 55.0 63.3 61.1 59.3 70.2 59.1 66.5 56.4 56.2
[136] 50.5 53.1 61.7 59.3 65.3 54.9 48.4 55.5 59.6 56.7 56.8 71.8 64.9 55.6 55.4
[151] 52.5 59.0 72.8 69.0 57.1 64.1 59.8 58.1 59.1 64.2 61.7 53.1 59.1 62.9 60.0
[166] 64.0 60.3 66.2 61.6 69.1 60.8 57.9 52.8 60.9 60.1 54.4 63.2 60.9 60.7 57.4
[181] 57.6 65.5 62.5 57.9 66.8 58.4 59.7 56.8 64.6 63.9 61.0 58.8 60.0 64.3 52.4
[196] 51.6 59.0 52.8 65.1 65.9
Ver código
59 + (200/2-80)/(64)*4
[1] 60.25
Ver código
median(peso)
[1] 60.15

#Medidas de posición

Cuantiles

Los cuantiles son valores que dividen una distribución de datos en partes iguales. Son una generalización de conceptos como cuartiles, deciles y percentiles.

Ver código
# Dot plot of 250 simulated, rounded values (no fill mapping in this plot).
set.seed(1)
n <- 250
q <- 10
x <- sort(round(rnorm(n, mean = 40, sd = 8)))
# Group index (1..q) of each ordered observation.
color <- floor(q * seq_len(n) / (n + 1)) + 1
tibble(x = x, color = factor(color)) %>%
  ggplot(aes(x = x)) + # `fill = color` is intentionally left out here
  geom_dotplot(binwidth = 1, show.legend = FALSE) +
  scale_x_continuous(breaks = seq(0, 80, by = 2)) +
  # A complete theme is added with `+`; wrapping it in theme() does not apply it.
  ggthemes::theme_fivethirtyeight() +
  theme(
    axis.text.y = element_blank(), # was misspelled `axis.txet.y`
    axis.title = element_blank()
  )

Ver código
# Dot plot of 250 simulated, rounded values, filled by half (q = 2).
set.seed(1)
n <- 250
q <- 2
x <- sort(round(rnorm(n, mean = 40, sd = 8)))
# Group index (1..q) of each ordered observation.
color <- floor(q * seq_len(n) / (n + 1)) + 1
tibble(x = x, color = factor(color)) %>%
  ggplot(aes(x = x, fill = color)) +
  geom_dotplot(binwidth = 1, show.legend = FALSE) +
  scale_x_continuous(breaks = seq(0, 80, by = 2)) +
  # A complete theme is added with `+`; wrapping it in theme() does not apply it.
  ggthemes::theme_fivethirtyeight() +
  theme(
    axis.text.y = element_blank(), # was misspelled `axis.txet.y`
    axis.title = element_blank()
  )

Datos no Agrupados

Cuartil

Formula:

Q_i = \frac{i(n+1)}{4}, \quad i \in \{1, 2, 3\}

Ejemplo

Ver código
# Dot plot of 250 simulated, rounded values, filled by quartile group (q = 4).
set.seed(1)
n <- 250
q <- 4
x <- sort(round(rnorm(n, mean = 40, sd = 8)))
# Group index (1..q) of each ordered observation.
color <- floor(q * seq_len(n) / (n + 1)) + 1
tibble(x = x, color = factor(color)) %>%
  ggplot(aes(x = x, fill = color)) +
  geom_dotplot(binwidth = 1, show.legend = FALSE) +
  scale_x_continuous(breaks = seq(0, 80, by = 2)) +
  # A complete theme is added with `+`; wrapping it in theme() does not apply it.
  ggthemes::theme_fivethirtyeight() +
  theme(
    axis.text.y = element_blank(), # was misspelled `axis.txet.y`
    axis.title = element_blank()
  )

Ver código
3*(250+1)/4
[1] 188.25
Ver código
x[c(188,189)]
[1] 45 45
Ver código
1*(250+1)/4
[1] 62.75
Ver código
x[c(62,63)]
[1] 35 35

Quintil

Formula:

Qu_i = \frac{i(n+1)}{5}, i \in (1, 2, 3, 4)

Ejemplo

Ver código
# Dot plot of 250 simulated, rounded values, filled by quintile group (q = 5).
set.seed(1)
n <- 250
q <- 5
x <- sort(round(rnorm(n, mean = 40, sd = 8)))
# Group index (1..q) of each ordered observation.
color <- floor(q * seq_len(n) / (n + 1)) + 1
tibble(x = x, color = factor(color)) %>%
  ggplot(aes(x = x, fill = color)) +
  geom_dotplot(binwidth = 1, show.legend = FALSE) +
  scale_x_continuous(breaks = seq(0, 80, by = 2)) +
  # A complete theme is added with `+`; wrapping it in theme() does not apply it.
  ggthemes::theme_fivethirtyeight() +
  theme(
    axis.text.y = element_blank(), # was misspelled `axis.txet.y`
    axis.title = element_blank()
  )

Ver código
3*(250+1)/5
[1] 150.6
Ver código
x[c(150,151)]
[1] 42 42

Decil

Formula:

D_i = \frac{i(n+1)}{10}, i \in (1, 2, ..., 9)

Ejemplo 7

Ver código
# Dot plot of 250 simulated, rounded values, filled by decile group (q = 10).
set.seed(1)
n <- 250
q <- 10
x <- sort(round(rnorm(n, mean = 40, sd = 8)))
# Group index (1..q) of each ordered observation.
color <- floor(q * seq_len(n) / (n + 1)) + 1
tibble(x = x, color = factor(color)) %>%
  ggplot(aes(x = x, fill = color)) +
  geom_dotplot(binwidth = 1, show.legend = FALSE) +
  scale_x_continuous(breaks = seq(0, 80, by = 2)) +
  # A complete theme is added with `+`; wrapping it in theme() does not apply it.
  ggthemes::theme_fivethirtyeight() +
  theme(
    axis.text.y = element_blank(), # was misspelled `axis.txet.y`
    axis.title = element_blank()
  )

Ver código
9*(250+1)/10
[1] 225.9
Ver código
x[c(225,226)]
[1] 50 51
Ver código
50+0.9*(51-50)
[1] 50.9

Percentiles

Formula: P_i = \frac{i(n+1)}{100}, i \in (1, 2, ..., 99)

Datos Agrupados

Ver código
set.seed(555)
peso <- round(rnorm(200, mean = 60, sd = 5), 1)
interv = cut(x = peso, breaks = seq(43,79, by = 4), include.lowest = TRUE, right = TRUE)
Ver código
tabla_6 <- tibble(Peso = interv) %>% 
  group_by(Peso) %>% 
  summarise(fi = n()) %>% 
  mutate(
    mi = seq(45,77, by = 4)
  ) %>% 
  relocate(Peso, mi, fi) %>% 
  mutate(
    hi = fi/sum(fi),
    Fi = cumsum(fi),
    Hi = cumsum(hi),
    mifi = mi*fi
  )
tabla_6
# A tibble: 9 × 7
  Peso       mi    fi    hi    Fi    Hi  mifi
  <fct>   <dbl> <int> <dbl> <int> <dbl> <dbl>
1 [43,47]    45     1 0.005     1 0.005    45
2 (47,51]    49     3 0.015     4 0.02    147
3 (51,55]    53    27 0.135    31 0.155  1431
4 (55,59]    57    49 0.245    80 0.4    2793
5 (59,63]    61    64 0.32    144 0.72   3904
6 (63,67]    65    43 0.215   187 0.935  2795
7 (67,71]    69     8 0.04    195 0.975   552
8 (71,75]    73     4 0.02    199 0.995   292
9 (75,79]    77     1 0.005   200 1        77

Cuartil

formula Q_i = L_i + \frac{\frac{i \cdot n}{4} - F_{i-1}}{f_i} a_i, i \in (1, 2, 3)

Tercer cuartil

Ver código
3*200/4 #(63,67]
[1] 150
Ver código
63+(3*200/4-144)/43*4
[1] 63.55814

En R:

Ver código
quantile(x = peso, 3/4)
   75% 
63.325 
Ver código
quantile(x = peso, c(1,2,3)/4)
   25%    50%    75% 
56.700 60.150 63.325 
Ver código
quantile(x, 3/4, type = 3)
75% 
 45 

Quintil

Formula Qu_i = L_i + \frac{\frac{i \cdot n}{5} - F_{i-1}}{f_i} a_i, i \in (1, 2, 3, 4)

Tercer quintil:

Ver código
3*200/5 # (59,63]
[1] 120
Ver código
59+(3*200/5-80)/64*4
[1] 61.5

En R:

Ver código
quantile(x = peso, 3/5)
 60% 
61.1 
Ver código
# The four quintiles are the 20%, 40%, 60% and 80% quantiles. The previous
# probabilities ended in 5/5, which returns the maximum instead of the 4th
# quintile (printed output needs re-rendering after this change).
quantile(x = peso, probs = (1:4) / 5)
  20%   40%   60%  100% 
55.94 59.06 61.10 77.10 
Ver código
quantile(x, 3/5, type = 3)
60% 
 42 

Decil

Formula

D_i = L_i + \frac{\frac{i \cdot n}{10} - F_{i-1}}{f_i} a_i, i \in (1, 2, ..., 9)

Tercer decil:

Ver código
3*200/10 # (55,59]
[1] 60
Ver código
# Third decile from the grouped table, class (55,59]:
# L_i = 55, F_{i-1} = 31, f_i = 49, a_i = 4.
# The divisor is the class frequency (49); 57 is the class midpoint.
# Printed output needs re-rendering after this change.
55 + (3 * 200 / 10 - 31) / 49 * 4
[1] 57.03509

En R:

Ver código
quantile(x = peso, 3/10)
  30% 
57.74 
Ver código
quantile(x = peso, c(1:9)/10)
  10%   20%   30%   40%   50%   60%   70%   80%   90% 
53.58 55.94 57.74 59.06 60.15 61.10 62.63 64.12 66.21 
Ver código
quantile(x, 8/10, type = 8)
80% 
 46 

Percentil

Formula

P_i = L_i + \frac{\frac{i \cdot n}{100} - F_{i-1}}{f_i} a_i, i \in (1, 2, ..., 99)

Tercer percentil

Ver código
3*200/100 # (51,55]: position 6 is past Fi = 4 and within Fi = 31
[1] 6
Ver código
# Third percentile from the grouped table. Position 3 * 200 / 100 = 6 falls
# in class (51,55] (cumulative frequencies 1, 4, 31), so
# L_i = 51, F_{i-1} = 4, f_i = 27, a_i = 4.
# Printed output needs re-rendering after this change.
51 + (3 * 200 / 100 - 4) / 27 * 4
[1] 47.40816

En R:

Ver código
quantile(x = peso, 3/100)
    3% 
51.591 
Ver código
quantile(x = peso, c(1:99)/100)
    1%     2%     3%     4%     5%     6%     7%     8%     9%    10%    11% 
50.479 51.094 51.591 52.176 52.395 52.494 52.779 52.800 53.100 53.580 54.156 
   12%    13%    14%    15%    16%    17%    18%    19%    20%    21%    22% 
54.388 54.400 54.872 55.000 55.184 55.366 55.500 55.681 55.940 56.179 56.278 
   23%    24%    25%    26%    27%    28%    29%    30%    31%    32%    33% 
56.477 56.676 56.700 56.800 56.946 57.172 57.400 57.740 57.900 58.068 58.267 
   34%    35%    36%    37%    38%    39%    40%    41%    42%    43%    44% 
58.400 58.500 58.564 58.763 58.962 59.000 59.060 59.100 59.200 59.257 59.356 
   45%    46%    47%    48%    49%    50%    51%    52%    53%    54%    55% 
59.555 59.600 59.600 59.752 60.000 60.150 60.200 60.300 60.400 60.492 60.700 
   56%    57%    58%    59%    60%    61%    62%    63%    64%    65%    66% 
60.800 60.900 60.900 60.941 61.100 61.178 61.300 61.474 61.636 61.700 61.700 
   67%    68%    69%    70%    71%    72%    73%    74%    75%    76%    77% 
61.900 62.096 62.500 62.630 62.829 63.028 63.200 63.200 63.325 63.424 63.723 
   78%    79%    80%    81%    82%    83%    84%    85%    86%    87%    88% 
63.822 63.921 64.120 64.219 64.400 64.651 65.000 65.130 65.414 65.526 65.900 
   89%    90%    91%    92%    93%    94%    95%    96%    97%    98%    99% 
66.022 66.210 66.409 66.516 66.800 67.106 67.290 69.112 70.203 71.104 71.810 
Ver código
quantile(x, 3/100, type = 3)
3% 
26 

5 Medidas de dispersión

Varianza

La Varianza es una de las medidas de dispersión más importantes en estadística. Mide la dispersión promedio de los valores de un conjunto de datos con respecto a su media.

Datos no agrupados

Formula para varianza poblacional:

\sigma^2 = \frac{\sum_{i=1}^n (x_i - \mu)^2}{n}

Ver código
set.seed(222)
x <- sample(10:50, size = 50, replace = TRUE)
x
 [1] 24 27 32 33 31 18 19 50 42 33 35 16 50 22 25 46 18 10 14 31 50 18 25 14 47
[26] 13 23 48 47 46 10 24 36 38 31 48 32 37 39 13 21 32 39 27 18 39 48 22 49 29
Ver código
varpob <- sum((x - mean(x))^2)/length(x)
varpob
[1] 147.3716

Formula para varianza muestral:

Formula

s^2 = \frac{\sum_{i=1}^n (x_i - \bar{x})^2}{n - 1}

Ver código
set.seed(222)
x <- sample(10:50, size = 50, replace = TRUE)
x
 [1] 24 27 32 33 31 18 19 50 42 33 35 16 50 22 25 46 18 10 14 31 50 18 25 14 47
[26] 13 23 48 47 46 10 24 36 38 31 48 32 37 39 13 21 32 39 27 18 39 48 22 49 29
Ver código
varmue <- sum((x - mean(x))^2)/(length(x)-1)
varmue
[1] 150.3792
Ver código
var(x) #varianza muestral
[1] 150.3792
Ver código
var(x)*(length(x)-1)/length(x) #varianza poblacional
[1] 147.3716

Datos agrupados

Formula para varianza poblacional:

\sigma^2 = \frac{\sum_{i=1}^n (m_i - \mu)^2 f_i}{n}

Ver código
set.seed(555)
peso <- round(rnorm(200, mean = 60, sd = 5), 1)
interv = cut(x = peso, breaks = seq(43,79, by = 4), include.lowest = TRUE, right = TRUE)
tabla_8 <- tibble(Peso = interv) %>% 
  group_by(Peso) %>% 
  summarise(fi = n()) %>% 
  mutate(
    mi = seq(45,77, by = 4)
  ) %>% 
  relocate(Peso, mi, fi) %>% 
  mutate(
    hi = fi/sum(fi),
    Fi = cumsum(fi),
    Hi = cumsum(hi),
    mifi = mi*fi
  )
tabla_8
# A tibble: 9 × 7
  Peso       mi    fi    hi    Fi    Hi  mifi
  <fct>   <dbl> <int> <dbl> <int> <dbl> <dbl>
1 [43,47]    45     1 0.005     1 0.005    45
2 (47,51]    49     3 0.015     4 0.02    147
3 (51,55]    53    27 0.135    31 0.155  1431
4 (55,59]    57    49 0.245    80 0.4    2793
5 (59,63]    61    64 0.32    144 0.72   3904
6 (63,67]    65    43 0.215   187 0.935  2795
7 (67,71]    69     8 0.04    195 0.975   552
8 (71,75]    73     4 0.02    199 0.995   292
9 (75,79]    77     1 0.005   200 1        77
Ver código
prompeso <- sum(tabla_8$mifi)/200
prompeso
[1] 60.18
Ver código
# Variance from the grouped table: squared deviations of the class midpoints
# from the grouped mean, weighted by the class frequencies.
# n comes from the table rather than a typed-in 200. The divisor n - 1 makes
# this the sample variance (the form var() computes); the population formula
# would divide by n instead.
n_peso <- sum(tabla_8$fi)
prompeso <- sum(tabla_8$mifi) / n_peso
sum((tabla_8$mi - prompeso)^2 * tabla_8$fi) / (n_peso - 1)
[1] 25.61568
Ver código
var(peso) #Utiliza todos los datos sin operación
[1] 24.70753

Formula para varianza muestral:

s^2 = \frac{\sum_{i=1}^n (m_i - \bar{x})^2 f_i}{n - 1}

Ver código
set.seed(555)
personas <- round(rnorm(250, mean = 4, sd = 1))
tabla_9 <- tibble(Personas = personas) %>% 
  group_by(Personas) %>% 
  summarise(fi = n()) %>% 
  mutate(
    hi = round(fi/sum(fi), 4),
    Fi = cumsum(fi),
    Hi = cumsum(hi),
    xifi = Personas*fi
  )
tabla_9
# A tibble: 7 × 6
  Personas    fi    hi    Fi    Hi  xifi
     <dbl> <int> <dbl> <int> <dbl> <dbl>
1        1     2 0.008     2 0.008     2
2        2    14 0.056    16 0.064    28
3        3    57 0.228    73 0.292   171
4        4    99 0.396   172 0.688   396
5        5    63 0.252   235 0.94    315
6        6    13 0.052   248 0.992    78
7        7     2 0.008   250 1        14
Ver código
# Mean from the frequency table: sum of (value x frequency) divided by the
# total frequency, read from the table instead of a hard-coded sample size.
propersonas <- sum(tabla_9$xifi) / sum(tabla_9$fi)
propersonas
[1] 4.016
Ver código
# Sample variance from the frequency table (divisor n - 1), with n taken from
# the table rather than hard-coded.
n_personas <- sum(tabla_9$fi)
propersonas <- sum(tabla_9$xifi) / n_personas
sum((tabla_9$Personas - propersonas)^2 * tabla_9$fi) / (n_personas - 1)
[1] 1.059984
Ver código
var(personas) # sample variance computed directly on the raw values
[1] 1.059984

6 Desviación estándar

La Desviación Estándar es la medida de dispersión más utilizada porque nos da la dispersión de los datos en las unidades originales de los mismos, a diferencia de la Varianza, que está en unidades cuadradas.

Datos no agrupados

Fórmula para desviación estándar poblacional:

\sigma = \sqrt{\frac{\sum_{i=1}^n (x_i - \mu)^2}{n}}

Fórmula para desviación estándar muestral:

s = \sqrt{\frac{\sum_{i=1}^n (x_i - \bar{x})^2}{n - 1}}

Ejemplo

Ver código
set.seed(222) # fixed seed so the same sample is drawn on every run
x <- sample(10:50, size = 50, replace = TRUE) # 50 draws with replacement from 10..50
x
 [1] 24 27 32 33 31 18 19 50 42 33 35 16 50 22 25 46 18 10 14 31 50 18 25 14 47
[26] 13 23 48 47 46 10 24 36 38 31 48 32 37 39 13 21 32 39 27 18 39 48 22 49 29
Ver código
sqrt(var(x)) # standard deviation as the square root of the sample variance
[1] 12.26292
Ver código
sd(x) # built-in sample standard deviation
[1] 12.26292

Datos agrupados

Fórmula para desviación estándar poblacional:

\sigma = \sqrt{\frac{\sum_{i=1}^n (m_i - \mu)^2 f_i}{n}}

Fórmula para desviación estándar muestral:

s = \sqrt{\frac{\sum_{i=1}^n (m_i - \bar{x})^2 f_i}{n - 1}}

Ejemplo

Ver código
# Grouped-data example: 200 simulated weights binned into classes of width 4.
set.seed(555)
peso <- round(rnorm(200, mean = 60, sd = 5), 1)
# Class limits; the class marks are the midpoints of consecutive limits.
limites <- seq(43, 79, by = 4)
marcas <- (head(limites, -1) + tail(limites, -1)) / 2
interv <- cut(x = peso, breaks = limites, include.lowest = TRUE, right = TRUE)
tabla_10 <- tibble(Peso = interv) %>%
  group_by(Peso) %>%
  summarise(fi = n()) %>%
  mutate(
    # Look the mark up by factor level so every class keeps its own midpoint
    # even when an empty class is dropped by the grouping.
    mi = marcas[as.integer(Peso)]
  ) %>%
  relocate(Peso, mi, fi) %>%
  mutate(
    hi = fi / sum(fi), # relative frequency
    Fi = cumsum(fi),   # cumulative absolute frequency
    Hi = cumsum(hi),   # cumulative relative frequency
    mifi = mi * fi     # class mark times frequency, used for the grouped mean
  )
tabla_10
# A tibble: 9 × 7
  Peso       mi    fi    hi    Fi    Hi  mifi
  <fct>   <dbl> <int> <dbl> <int> <dbl> <dbl>
1 [43,47]    45     1 0.005     1 0.005    45
2 (47,51]    49     3 0.015     4 0.02    147
3 (51,55]    53    27 0.135    31 0.155  1431
4 (55,59]    57    49 0.245    80 0.4    2793
5 (59,63]    61    64 0.32    144 0.72   3904
6 (63,67]    65    43 0.215   187 0.935  2795
7 (67,71]    69     8 0.04    195 0.975   552
8 (71,75]    73     4 0.02    199 0.995   292
9 (75,79]    77     1 0.005   200 1        77
Ver código
# Grouped mean: sum of (class mark x frequency) divided by the total
# frequency, read from the table instead of a hard-coded sample size.
prompeso <- sum(tabla_10$mifi) / sum(tabla_10$fi)
prompeso
[1] 60.18
Ver código
# Grouped sample standard deviation: square root of the frequency-weighted
# squared deviations from prompeso, divided by n - 1 with n read from the table.
sqrt(sum((tabla_10$mi - prompeso)^2 * tabla_10$fi) / (sum(tabla_10$fi) - 1))
[1] 5.061193

7 Coeficiente de variación

El coeficiente de variación es una medida de dispersión relativa muy útil. Su objetivo principal es permitir la comparación de la dispersión entre dos o más conjuntos de datos que tienen unidades de medida diferentes o medias muy distintas.

Fórmula:

CV = \frac{\sigma}{\mu}

Ver código
# Two samples of 10 values each. The values of baloncesto are those of
# futbol shifted by 85 (in a different order), so every absolute dispersion
# measure coincides while the means differ.
# NOTE(review): presumably points scored per match in each sport; confirm.
futbol <- c(2, 3, 4, 0, 5, 0, 1, 1, 5, 2)
baloncesto <- c(90, 85, 87, 88, 89, 85, 86, 86, 90, 87)
Ver código
# Range (maximum minus minimum)
diff(range(futbol))
[1] 5
Ver código
diff(range(baloncesto))
[1] 5
Ver código
# Interquartile range
IQR(futbol)
[1] 2.75
Ver código
IQR(baloncesto)
[1] 2.75
Ver código
# Variance
var(futbol)
[1] 3.566667
Ver código
var(baloncesto)
[1] 3.566667
Ver código
# Standard deviation
sd(futbol)
[1] 1.888562
Ver código
sd(baloncesto)
[1] 1.888562
Ver código
# Coefficient of variation: standard deviation relative to the mean
sd(futbol)/mean(futbol)
[1] 0.8211139
Ver código
sd(baloncesto)/mean(baloncesto)
[1] 0.02163301
Ver código
# Coefficient of variation of futbol as a percentage label, 2 decimals.
sprintf("Cv: %s%%", round(100 * sd(futbol) / mean(futbol), 2))
[1] "Cv: 82.11%"
Ver código
# Coefficient of variation of baloncesto as a percentage label, 2 decimals.
sprintf("Cv: %s%%", round(100 * sd(baloncesto) / mean(baloncesto), 2))
[1] "Cv: 2.16%"

8 Medidas de forma

8.1 Medidas de asimetría

Las medidas de asimetría (o coeficientes de asimetría) son indicadores que muestran qué tan simétrica o inclinada está una distribución de datos respecto a la media.

Sirven para saber si los datos se “inclinan” hacia la derecha, hacia la izquierda o si son simétricos.

\text{asimetría} = \frac{\text{media} - \text{moda}}{\sigma}

Coeficiente de asimetría

El coeficiente de asimetría es una medida estadística que indica qué tan simétrica o inclinada está una distribución de datos respecto a su media.

Ver código
# Simulate three samples of the same size from a chi-square, a normal and a
# beta distribution; they are used below to compare skewness coefficients.
set.seed(123)
n <- 1000

df <- data.frame(
  x1 = rchisq(n, df = 7),
  x2 = rnorm(n, 10, 2),
  x3 = rbeta(n, 5, 2)
)
Ver código
# Histogram of x1 on the density scale with a kernel density curve on top.
# after_stat(density) replaces the deprecated ..density.. notation.
df %>%
  ggplot(aes(x = x1)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 25,
    fill = "orange",
    color = "black"
  ) +
  geom_density(color = "blue", linewidth = 1) +
  ggthemes::theme_fivethirtyeight()

Ver código
# Histogram of x2 on the density scale with a kernel density curve on top.
# after_stat(density) replaces the deprecated ..density.. notation.
df %>%
  ggplot(aes(x = x2)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 25,
    fill = "orange",
    color = "white"
  ) +
  geom_density(color = "red", linewidth = 1) +
  ggthemes::theme_fivethirtyeight()

Ver código
# Histogram of x3 on the density scale with a kernel density curve on top.
# after_stat(density) replaces the deprecated ..density.. notation.
df %>%
  ggplot(aes(x = x3)) +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 25,
    fill = "orange",
    color = "white"
  ) +
  geom_density(color = "blue", linewidth = 1) +
  ggthemes::theme_fivethirtyeight()

Fórmula del coeficiente de asimetría de Fisher:

\gamma_1 = \frac{\mu_3}{\sigma^3}, \qquad \mu_3 = \frac{1}{n} \sum_{i=1}^n (x_i - \mu)^3

Ver código
# Third central moment of x1: average of the cubed deviations from the mean.
mu3 <- (1/length(df$x1))*sum((df$x1-mean(df$x1))^3)
# sd() is the sample standard deviation (divisor n - 1).
de <- sd(df$x1)
# Skewness: third central moment over the cubed standard deviation.
mu3/de^3
[1] 0.9283745
Ver código
# Third central moment of x2: average of the cubed deviations from the mean.
mu3 <- (1/length(df$x2))*sum((df$x2-mean(df$x2))^3)
# sd() is the sample standard deviation (divisor n - 1).
de <- sd(df$x2)
# Skewness: third central moment over the cubed standard deviation.
mu3/de^3
[1] -0.07075422
Ver código
# Third central moment of x3: average of the cubed deviations from the mean.
mu3 <- (1/length(df$x3))*sum((df$x3-mean(df$x3))^3)
# sd() is the sample standard deviation (divisor n - 1).
de <- sd(df$x3)
# Skewness: third central moment over the cubed standard deviation.
mu3/de^3
[1] -0.6718215

Fórmula del coeficiente de asimetría de Pearson:

A_{\rho} = \frac{\mu - M_0}{\sigma}

Ver código
# Pearson skewness of x1: (mean - mode) / sd, with the mode estimated by
# modeest::mlv() using the mean-shift method.
with(
  df,
  (mean(x1) - as.numeric(modeest::mlv(x = x1, method = "meanshift"))) / sd(x1)
)
[1] 0.2508712
Ver código
# Pearson skewness of x2: (mean - mode) / sd, with the mode estimated by
# modeest::mlv() using the mean-shift method.
with(
  df,
  (mean(x2) - as.numeric(modeest::mlv(x = x2, method = "meanshift"))) / sd(x2)
)
[1] -0.1880093
Ver código
# Pearson skewness of x3: (mean - mode) / sd, with the mode estimated by
# modeest::mlv() using the mean-shift method.
with(
  df,
  (mean(x3) - as.numeric(modeest::mlv(x = x3, method = "meanshift"))) / sd(x3)
)
[1] -0.1772704

Fórmula del coeficiente de asimetría de Bowley-Yule:

A_{BY} = \frac{Q_{3/4} + Q_{1/4} - 2M_e}{Q_{3/4} - Q_{1/4}}

Ver código
# Bowley-Yule skewness of x1: (Q3 + Q1 - 2 * median) / (Q3 - Q1).
local({
  cuartiles <- quantile(df$x1, c(1/4, 3/4))
  q1 <- cuartiles[[1]]
  q3 <- cuartiles[[2]]
  (q3 + q1 - 2 * median(df$x1)) / (q3 - q1)
})
[1] 0.05660247
Ver código
# Bowley-Yule skewness of x2: (Q3 + Q1 - 2 * median) / (Q3 - Q1).
local({
  cuartiles <- quantile(df$x2, c(1/4, 3/4))
  q1 <- cuartiles[[1]]
  q3 <- cuartiles[[2]]
  (q3 + q1 - 2 * median(df$x2)) / (q3 - q1)
})
[1] -0.07035129
Ver código
# Bowley-Yule skewness of x3: (Q3 + Q1 - 2 * median) / (Q3 - Q1).
local({
  cuartiles <- quantile(df$x3, c(1/4, 3/4))
  q1 <- cuartiles[[1]]
  q3 <- cuartiles[[2]]
  (q3 + q1 - 2 * median(df$x3)) / (q3 - q1)
})
[1] -0.1329906

Coeficiente de asimetría calculado con R:

Ver código
e1071::skewness(x = df$x1, type = 3) # type 3 is m3 / s^3 with s = sd(), i.e. the Fisher formula
[1] 0.9283745
Ver código
e1071::skewness(x = df$x2, type = 1) # type 1 per the e1071 docs: g1 = m3 / m2^(3/2), moments with divisor n
[1] -0.07086048
Ver código
e1071::skewness(x = df$x3, type = 2) # type 2 per the e1071 docs: bias-adjusted G1
[1] -0.6738417

9 Medidas de apuntamiento

Las medidas de apuntamiento describen qué tan “picada” o “aplanada” está una distribución en comparación con la distribución normal. Evalúan la concentración de los datos alrededor de la media y el peso de las colas.

Ver código
# Three zero-mean normal density curves with sd 1, 0.5 and 1.5, each with a
# colour-matched text label.
ggplot(data = data.frame(x = c(-5, 5)), aes(x)) +
  stat_function(fun = dnorm, n = 201, args = list(mean = 0, sd = 1), color = "red") +
  stat_function(fun = dnorm, n = 201, args = list(mean = 0, sd = 0.5), color = "blue") +
  stat_function(fun = dnorm, n = 201, args = list(mean = 0, sd = 1.5), color = "orange") +
  # annotate() draws each label exactly once; geom_text() with constant
  # coordinates would redraw it for every row of the plot data.
  annotate("text", x = 2.5, y = 0.65, label = "Leptocúrtica", color = "blue") +
  annotate("text", x = 2.5, y = 0.6, label = "Mesocúrtica", color = "red") +
  annotate("text", x = 2.5, y = 0.55, label = "Platicúrtica", color = "orange") +
  ggthemes::theme_fivethirtyeight()

Ver código
# Simulate three samples of size N from a normal (M), a rescaled uniform (P)
# and a shifted exponential (L) distribution.
# NOTE(review): the names presumably stand for meso-, plati- and leptokurtic.
# The sample size is taken from N, defined here, instead of relying on an `n`
# left over from an earlier chunk.
N <- 1000
set.seed(123)
df <- tibble(
  M = rnorm(N, mean = 50, sd = 10),
  P = 2 * runif(N, min = 40, max = 60) - 50,
  L = rexp(N, rate = 0.1) + 50
)
Ver código
# Overlay the density of every column: reshape to long format (one row per
# value, labelled by its source column) and colour by that label.
# pivot_longer() replaces the superseded gather().
df %>%
  pivot_longer(cols = everything(), names_to = "Tipo", values_to = "x") %>%
  ggplot(aes(x = x, color = Tipo)) +
  geom_density(linewidth = 1) +
  scale_x_continuous(limits = c(0, 100)) +
  ggthemes::theme_fivethirtyeight()

Fórmulas:

\beta_2 = \frac{\mu_4}{\sigma^4}, \qquad \mu_4 = \frac{1}{n} \sum_{i=1}^n (x_i - \mu)^4, \qquad g_2 = \beta_2 - 3

Ver código
# Fourth central moment of M: average of the fourth-power deviations.
mu4 <- (1/length(df$M))*sum((df$M-mean(df$M))^4)
# Excess kurtosis: var()^2 is the fourth power of the sample sd; subtracting 3
# centres the coefficient at 0 for a normal distribution.
mu4/var(df$M)^2 - 3
[1] -0.08010201
Ver código
# Fourth central moment of L. The deviations must be taken from df$L itself
# (not df$M) around the mean of L.
mu4 <- (1/length(df$L))*sum((df$L-mean(df$L))^4)
# Excess kurtosis of L; the result should agree with the L row of the grouped
# summary computed with the same formula (about 6.62).
mu4/var(df$L)^2 - 3
[1] 4.957105
Ver código
# Fourth central moment of P: average of the fourth-power deviations.
mu4 <- (1/length(df$P))*sum((df$P-mean(df$P))^4)
# Excess kurtosis: fourth moment over the squared sample variance, minus 3.
mu4/var(df$P)^2 -3
[1] -1.254742
Ver código
# Excess kurtosis (m4 / s^4 - 3) of every column at once: reshape to long
# format, then summarise per source column.
# pivot_longer() replaces the superseded gather(); group_by() keeps the rows
# sorted by Tipo.
df %>%
  pivot_longer(cols = everything(), names_to = "Tipo", values_to = "x") %>%
  group_by(Tipo) %>%
  summarise(
    k = (1 / length(x)) * sum((x - mean(x))^4) / var(x)^2 - 3
  )
# A tibble: 3 × 2
  Tipo        k
  <chr>   <dbl>
1 L      6.62  
2 M     -0.0801
3 P     -1.25  

Tipo 1

Ver código
# Excess kurtosis per column with e1071::kurtosis(type = 1).
# pivot_longer() replaces the superseded gather(); group_by() keeps the rows
# sorted by Tipo.
df %>%
  pivot_longer(cols = everything(), names_to = "Tipo", values_to = "x") %>%
  group_by(Tipo) %>%
  summarise(
    k = e1071::kurtosis(x, type = 1)
  )
# A tibble: 3 × 2
  Tipo        k
  <chr>   <dbl>
1 L      6.64  
2 M     -0.0743
3 P     -1.25  

Tipo 2

Ver código
# Excess kurtosis per column with e1071::kurtosis(type = 2).
# pivot_longer() replaces the superseded gather(); group_by() keeps the rows
# sorted by Tipo.
df %>%
  pivot_longer(cols = everything(), names_to = "Tipo", values_to = "x") %>%
  group_by(Tipo) %>%
  summarise(
    k = e1071::kurtosis(x, type = 2)
  )
# A tibble: 3 × 2
  Tipo        k
  <chr>   <dbl>
1 L      6.68  
2 M     -0.0686
3 P     -1.25  

Tipo 3

Ver código
# Excess kurtosis per column with e1071::kurtosis(type = 3).
# pivot_longer() replaces the superseded gather(); group_by() keeps the rows
# sorted by Tipo.
df %>%
  pivot_longer(cols = everything(), names_to = "Tipo", values_to = "x") %>%
  group_by(Tipo) %>%
  summarise(
    k = e1071::kurtosis(x, type = 3)
  )
# A tibble: 3 × 2
  Tipo        k
  <chr>   <dbl>
1 L      6.62  
2 M     -0.0801
3 P     -1.25