Análisis Inferencial de la Distribución Geográfica por País

CARGA DE DATOS Y LIBRERIAS

CARGA DE DATOS Y LIBRERIAS

knitr::opts_chunk$set(
    echo = TRUE,                   # Muestra el código R en el reporte final.
    message = FALSE,
    warning = FALSE,               # Message y warning evitan que se impriman alertas o mensajes de carga estorbosos en el HTML.
    fig.align = "center"           # Centra automáticamente todas las gráficas generadas.
)

datos <- read.csv("C:/Users/USER/Documents/PROYECTO ESTADISTICA/CMDB_Data.csv", 
                  header = TRUE, # Indica que la primera fila contienen los nombres de las variables.
                  sep = ";",     # Define que los puntos y comas es el separador de las columnas del archivo.
                  dec = ".",     # Establece el punto como el operador decimal para los números.
                  fileEncoding = "latin1")

# Verificación inicial del set de datos
str(datos)

## 'data.frame':    1366 obs. of  103 variables:
##  $ ï..LAB_ID            : chr  "C355417" "C360759" "C360762" "C360763" ...
##  $ PREVIOUS_LAB_ID1     : chr  "" "" "" "" ...
##  $ PREVIOUS_LAB_ID2     : chr  "" "" "" "" ...
##  $ PREVIOUS_LAB_ID3     : chr  "" "" "" "" ...
##  $ FIELD_ID             : chr  "RM0001" "RM0027" "RM0030" "RM0031" ...
##  $ JOB_ID               : chr  "MRP11968" "MRP12307" "MRP12307" "MRP12307" ...
##  $ PREVIOUS_JOB_ID1     : chr  "" "" "" "" ...
##  $ PREVIOUS_JOB_ID2     : chr  "" "" "" "" ...
##  $ PREVIOUS_JOB_ID3     : chr  "" "" "" "" ...
##  $ SUBMITTER            : chr  "Rare Metals Task" "Rare Metals Task" "Rare Metals Task" "Rare Metals Task" ...
##  $ PROJECT_NAME         : chr  "Critical and Rare Metals" "Critical and Rare Metals" "Critical and Rare Metals" "Critical and Rare Metals" ...
##  $ DATE_SUBMITTED       : chr  "30/6/2011" "31/8/2011" "31/8/2011" "31/8/2011" ...
##  $ COLLECTION           : chr  "Mackay-Keck Ore Deposits Collection" "Mackay-Stanford Ore Deposits Collection" "Mackay-Stanford Ore Deposits Collection" "Mackay-Stanford Ore Deposits Collection" ...
##  $ COLLECTION_ID        : chr  "PHNC08_39_1183" "OD21441" "OD22811" "OD25716" ...
##  $ CONTINENT            : chr  "North America" "South America" "South America" "Africa" ...
##  $ COUNTRY              : chr  "United States" "Chile" "Chile" "South Africa" ...
##  $ STATE_PROVINCE       : chr  "Nevada" "Antofagasta" "TarapacÃ¡" "Transvaal" ...
##  $ COUNTY               : chr  "Lyon" "El Loa" "El Tamarugal" "" ...
##  $ DISTRICT_NAME        : chr  "Yerington" "Chuquicamata" "Collahuasi/Quebrada Blanca" "" ...
##  $ DEPOSIT_NAME         : chr  "Pumpkin Hollow" "" "" "" ...
##  $ MINE_NAME            : chr  "Pumpkin Hollow" "Chuquicamata mine" "Collahuasi district" "" ...
##  $ DISTRICT_NAME_COLLECT: chr  "Yerington" "" "" "" ...
##  $ DEPOSIT_NAME_COLLECT : chr  "" "" "" "" ...
##  $ MINE_NAME_COLLECT    : chr  "Pumpkin Hollow" "Chuquicamata" "Poduosa mine" "Messina Mines Ltd." ...
##  $ LOCATE_DESC          : chr  "" "" "Level 25" "" ...
##  $ LATITUDE             : chr  "38,94021" "-22,2871" "-21,0309" "-24,7" ...
##  $ LONGITUDE            : chr  "-119,05178" "-68,8991" "-68,74951" "29,3" ...
##  $ DATUM                : chr  "WGS84" "WGS84" "WGS84" "" ...
##  $ LATITUDE_COLLECT     : chr  "38,92492" "22,28944" "" "" ...
##  $ LONGITUDE_COLLECT    : chr  "-119,1071" "-68,90111" "" "" ...
##  $ DATUM_COLLECT        : chr  "" "WGS84" "" "" ...
##  $ COORDINATES_QUAL     : chr  "100 m" "Exact" "" "" ...
##  $ COORDINATES_SOURCE   : chr  "1) iTouchMap.com, approx, A. Orkild-Norton; 2) Mineral Resource Deposit Database Deposit ID 10174173, ore body, M. Granitto" "1) Mindat.org, approx, A. Orkild-Norton; 2) Open-File Report 2017-1079 ID 549, mine, M. Granitto" "1) No coordinates; 2) Mineral Resource Deposit Database Deposit ID 10057511, district, M. Granitto" "1) No coordinates; 2) Google Earth Pro, approx ctr of former province of Transvaal, M. Granitto" ...
##  $ PRIMARY_CLASS        : chr  "rock" "rock" "rock" "rock" ...
##  $ SYSTEM_TYPE          : chr  "IOA-IOCG" "Porphyry Cu-Mo-Au" "Porphyry Cu-Mo-Au" "IOA-IOCG" ...
##  $ DEPOSIT_TYPE         : chr  "IOCG" "Supergene Cu" "Porphyry Cu" "IOCG" ...
##  $ SAMPLE_DESC          : chr  "Nearly solid chalcopyrite mixed with small light brown irregular inclusions of unknown mineralogy; clouds of ma"| __truncated__ "Chalcocite-bronchatite-antlerite(?); highly microfractured igneous rock with green copper sulfates coating microfractures" "Bornite-chalcopyrite; mostly massive chalcopyrite with numerous inclusions of micro-chalcopyrite and widely sca"| __truncated__ "Massive chalcopyrite, IOCG in shear zone; mostly massive fine grain cuprite with widely distributed malachite t"| __truncated__ ...
##  $ Al_pct_AES_ST        : chr  "0,33" "6,65" "0,46" "0,7" ...
##  $ Ca_pct_AES_ST        : chr  "1,1" "0,4" "-0,1" "0,3" ...
##  $ Fe_pct_AES_ST        : chr  "42,4" "0,25" "6,98" "27,8" ...
##  $ K_pct_AES_ST         : chr  "-0,1" "6,1" "0,2" "-0,1" ...
##  $ Mg_pct_AES_ST        : chr  "0,57" "0,1" "0,01" "0,33" ...
##  $ Mn_pct_AES_ST        : chr  "0,02" "-0,01" "-0,01" "-0,01" ...
##  $ P_pct_AES_ST         : chr  "-0,01" "0,01" "0,05" "0,01" ...
##  $ S_pct_AES_ST         : chr  "" "" "" "" ...
##  $ Si_pct_AES_ST        : chr  "" "" "" "" ...
##  $ Ti_pct_AES_ST        : chr  "0,01" "0,11" "-0,01" "-0,01" ...
##  $ F_pct_ISE_Fuse       : chr  "" "" "" "" ...
##  $ Ag_ppm_MS_ST         : chr  "58" "6" "468" "16" ...
##  $ As_ppm_MS_ST         : chr  "-30" "-30" "90" "-30" ...
##  $ Au_ppm               : chr  "" "" "" "" ...
##  $ Au_AM                : chr  "" "" "" "" ...
##  $ B_ppm_AES_ST         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Ba_ppm_AES_ST        : chr  "-0,5" "924" "121" "174" ...
##  $ Be_ppm_AES_ST        : int  -5 -5 -5 -5 -5 -5 -5 -5 -5 -5 ...
##  $ Bi_ppm_MS_ST         : chr  "1,5" "3,6" "190" "0,4" ...
##  $ Cd_ppm_MS_ST         : chr  "3,6" "-0,2" "0,9" "-0,2" ...
##  $ Ce_ppm_MS_ST         : chr  "0,4" "8,8" "16,3" "3,5" ...
##  $ Co_ppm_MS_ST         : chr  "209" "-0,5" "1,3" "44,8" ...
##  $ Cr_ppm_AES_ST        : int  -10 -10 -10 30 20 20 60 40 20 10 ...
##  $ Cs_ppm_MS_ST         : chr  "0,5" "1,4" "0,2" "-0,1" ...
##  $ Cu_ppm_AES_ST        : chr  "50000,11111" "23300" "50000,11111" "50000,11111" ...
##  $ Dy_ppm_MS_ST         : chr  "-0,05" "0,32" "1,38" "0,37" ...
##  $ Er_ppm_MS_ST         : chr  "-0,05" "0,22" "0,77" "0,23" ...
##  $ Eu_ppm_MS_ST         : chr  "-0,05" "0,14" "0,17" "0,1" ...
##  $ Ga_ppm_MS_ST         : chr  "5" "15" "6" "3" ...
##  $ Gd_ppm_MS_ST         : chr  "-0,05" "0,45" "1,5" "0,39" ...
##  $ Ge_ppm_MS_ST         : int  -1 5 -1 -1 3 8 8 1 2 2 ...
##  $ Hf_ppm_MS_ST         : int  -1 4 -1 -1 5 13 12 2 3 6 ...
##  $ Ho_ppm_MS_ST         : chr  "-0,05" "0,07" "0,25" "0,07" ...
##  $ In_ppm_MS_ST         : chr  "6,4" "-0,2" "3,7" "0,2" ...
##  $ La_ppm_MS_ST         : chr  "0,2" "4,6" "7,2" "1,7" ...
##  $ Li_ppm_AES_ST        : int  -10 -10 -10 -10 30 20 20 20 -10 20 ...
##  $ Lu_ppm_MS_ST         : chr  "-0,05" "-0,05" "0,08" "-0,05" ...
##  $ Mo_ppm_MS_ST         : chr  "-2" "60" "3" "2" ...
##  $ Nb_ppm_MS_ST         : chr  "-1" "4" "-1" "-1" ...
##  $ Nd_ppm_MS_ST         : chr  "0,2" "3,8" "9,1" "1,7" ...
##  $ Ni_ppm_AES_ST        : chr  "144" "6" "-5" "48" ...
##  $ Pb_ppm_MS_ST         : chr  "23" "16" "188" "39" ...
##  $ Pd_ppm_FA_MS         : chr  "" "" "" "" ...
##  $ Pr_ppm_MS_ST         : chr  "-0,05" "1,09" "2,21" "0,46" ...
##  $ Pt_ppm_FA_MS         : chr  "" "" "" "" ...
##  $ Rb_ppm_MS_ST         : chr  "1,2" "148" "7,1" "0,7" ...
##  $ Re_ppm_MS_HF         : chr  "" "" "" "" ...
##  $ Sb_ppm_MS_ST         : chr  "1,2" "2,4" "2,9" "0,3" ...
##  $ Sc_ppm_AES_ST        : int  -5 -5 -5 -5 11 6 15 10 5 6 ...
##  $ Se_ppm_MS_ST         : int  NA NA NA NA NA NA NA NA NA NA ...
##  $ Sm_ppm_MS_ST         : chr  "-0,1" "0,6" "1,6" "0,4" ...
##  $ Sn_ppm_MS_ST         : chr  "2" "3" "106" "-1" ...
##  $ Sr_ppm_AES_ST        : chr  "26,6" "114" "22,5" "38,4" ...
##  $ Ta_ppm_MS_ST         : chr  "-0,5" "-0,5" "-0,5" "-0,5" ...
##  $ Tb_ppm_MS_ST         : chr  "-0,05" "0,07" "0,23" "-0,05" ...
##  $ Te_ppm_MS_ST         : chr  "" "" "" "" ...
##  $ Th_ppm_MS_ST         : chr  "0,2" "9,7" "2,6" "0,2" ...
##  $ Tl_ppm_MS_ST         : chr  "-0,5" "0,5" "-0,5" "-0,5" ...
##  $ Tm_ppm_MS_ST         : chr  "-0,05" "-0,05" "0,08" "-0,05" ...
##  $ U_ppm_MS_ST          : chr  "0,3" "1,75" "0,63" "34,8" ...
##  $ V_ppm_AES_ST         : int  51 24 -5 493 68 20 40 159 39 61 ...
##  $ W_ppm_MS_ST          : chr  "-1" "28" "22" "11" ...
##   [list output truncated]

TABLA DE DISTRIBUCIÓN DE PROBABILIDAD

# Cargar las librerías necesarias
library(dplyr)
library(gt)

# 1. CREAR LA TABLA DE FRECUENCIAS Y RANGOS (TOP)
tabla_pais <- datos %>%
  count(COUNTRY, name = "ni") %>%
  mutate(
    hi = ni / sum(ni),
    P  = round(hi * 100, 2)
  ) %>%
  arrange(desc(ni)) %>%
  mutate(rank = row_number())

# 2. CREAR LA FILA DE TOTALES
total_pais <- data.frame(
  COUNTRY = "TOTAL",
  ni      = sum(tabla_pais$ni),
  hi      = 1.0000,
  P       = 100.00,
  rank    = NA
)

# 3. UNIR TODO EN UNA SOLA TABLA FINAL
tabla_pais_final <- bind_rows(tabla_pais, total_pais)

# 4. GENERAR LA TABLA VISUAL FILTRANDO EL TOP 20
tabla_pais_gt <- tabla_pais_final %>%
  filter(rank <= 20 | is.na(rank)) %>% # Muestra los primeros 20 países y la fila de TOTAL
  select(-rank) %>%                    # Oculta la columna de rango en la visualización
  gt() %>%
  tab_header(
    title = md("**Tabla N° 1**"),
    subtitle = md("Distribución de probabilidad por País (Top 20)")
  ) %>%
  tab_source_note(
    source_note = md("Autores: Grupo 1 <br> Semestre 2026 - 2026")
  ) %>%
  cols_label(
    COUNTRY = "País",
    ni      = "Frecuencia absoluta (ni)",
    hi      = "Frec. relativa",
    P       = "Probabilidad (%)"
  ) %>%
  fmt_number(columns = c(hi, P), decimals = 2) %>%
  fmt_number(columns = ni, decimals = 0) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = COUNTRY == "TOTAL") # Pone en negrita la fila de TOTAL
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black",
    row.striping.include_table_body = TRUE
  )

# Renderizar la tabla en el documento
tabla_pais_gt

País	Frecuencia absoluta (ni)	Frec. relativa	Probabilidad (%)
Tabla N° 1
Distribución de probabilidad por País (Top 20)
United States	1,021	0.75	74.74
Canada	59	0.04	4.32
Chile	53	0.04	3.88
Australia	39	0.03	2.86
Sweden	38	0.03	2.78
Mexico	30	0.02	2.20
Peru	19	0.01	1.39
Japan	18	0.01	1.32
Brazil	17	0.01	1.24
South Africa	10	0.01	0.73
Zambia	7	0.01	0.51
Vietnam	6	0.00	0.44
Argentina	4	0.00	0.29
Democratic Republic of Congo	4	0.00	0.29
Finland	4	0.00	0.29
Indonesia	4	0.00	0.29
Mauritania	4	0.00	0.29
Poland	4	0.00	0.29
Portugal	4	0.00	0.29
China	3	0.00	0.22
TOTAL	1,366	1.00	100.00
Autores: Grupo 1 Semestre 2026 - 2026

TABLA DE DISTRIBUCIÓN DE PROBABILIDAD AGRUPADA

# Cargar librerías necesarias
library(dplyr)
library(gt)

# 1. CREAR LA NUEVA COLUMNA Y CLASIFICAR USANDO case_when()
datos <- datos %>%
  mutate(
    REGION = case_when(
      COUNTRY %in% c("United States", "Canada", "Mexico") ~ "Norte America",
      COUNTRY %in% c("Brazil", "Chile", "Peru", "Argentina", "Colombia", "Ecuador") ~ "Latino America",
      COUNTRY %in% c("South Africa", "Democratic Republic of the Congo", "Ghana", "Mali") ~ "Africa",
      COUNTRY %in% c("China", "India", "Indonesia", "Kazakhstan") ~ "Asia",
      COUNTRY %in% c("Russia", "Sweden", "Poland", "Germany") ~ "Europa",
      COUNTRY %in% c("Australia", "Papua New Guinea", "New Zealand", "New Caledonia") ~ "Oceania",
      TRUE ~ "Otra / No Clasificado" # El TRUE funciona como un "si no cumple nada de lo anterior"
    )
  )

# 2. CONTEO Y CÁLCULO DE PROBABILIDADES/PORCENTAJES
tabla_regiones <- datos %>%
  count(REGION, name = "ni") %>%
  mutate(
    hi = ni / sum(ni),
    hi_porc = round(hi * 100, 2)
  ) %>%
  arrange(desc(ni)) # Ordenamos la tabla de mayor a menor frecuencia

# 3. FILA DE TOTALES GENERALES
Total_Region <- data.frame(
  REGION = "TOTAL", 
  ni = sum(tabla_regiones$ni), 
  hi = 1.0000, 
  hi_porc = 100.00
)

# 4. ACOPLAR TODO EN UNA TABLA FINAL
tabla_final_regiones <- bind_rows(tabla_regiones, Total_Region)

# 5. GENERAR LA TABLA VISUAL CON 'gt'
tabla_region_gt <- tabla_final_regiones %>%
  gt() %>%
  tab_header(
    title = md("**Tabla N° 2**"),
    subtitle = md("Distribución de muestras de minerales críticos por región <br> en minas archivadas")
  ) %>%
  tab_source_note(
    source_note = md("Autores: Grupo 1 <br> Semestre 2026 - 2026")
  ) %>%
  cols_label(
    REGION = "Región",
    ni = "Frecuencia absoluta (ni)",
    hi = "Frecuencia relativa (hi)",
    hi_porc = "Probabilidad (%)"
  ) %>%
  fmt_number(columns = c(hi, hi_porc), decimals = 2) %>%
  fmt_number(columns = ni, decimals = 0) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = REGION == "TOTAL")
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black",
    row.striping.include_table_body = TRUE
  )

# Renderizar la tabla en el documento
tabla_region_gt

Región	Frecuencia absoluta (ni)	Frecuencia relativa (hi)	Probabilidad (%)
Tabla N° 2
Distribución de muestras de minerales críticos por región en minas archivadas
Norte America	1,110	0.81	81.26
Latino America	93	0.07	6.81
Otra / No Clasificado	60	0.04	4.39
Europa	45	0.03	3.29
Oceania	40	0.03	2.93
Africa	10	0.01	0.73
Asia	8	0.01	0.59
TOTAL	1,366	1.00	100.00
Autores: Grupo 1 Semestre 2026 - 2026

GRÁFICAS DE DISTRIBUCIÓN PROBABILIDAD

# 1. PREPARAR DATOS PARA EL GRÁFICO (EXCLUIR TOTAL Y CREAR IDs)
region_plot <- tabla_final_regiones %>%
  filter(REGION != "TOTAL") %>%
  mutate(
    id = row_number(),
    etiqueta = paste0(id, ". ", REGION)
  )

# 2. AJUSTAR LOS MÁRGENES DEL LIENZO
# Aumentamos el margen derecho (el cuarto número) para que entre la leyenda sin cortarse
par(mar = c(5, 5, 4, 12), xpd = TRUE)

# 3. DIBUJAR LA GRÁFICA DE BARRAS
barras_region <- barplot(
  height    = region_plot$hi_porc,
  main      = "Gráfica N° 1: Distribución de probabilidad por Región (%)",
  xlab      = "Región (ID)",
  ylab      = "Probabilidad (%)",
  col       = "orchid",        # Mantengo el color que venías usando, o pon "#4DB6AC"
  border    = NA,
  names.arg = region_plot$id,  # Pone los números en el eje X
  cex.names = 1.0,
  las       = 1,               # Números del eje Y horizontales
  ylim      = c(0, max(region_plot$hi_porc) * 1.20) # Deja un 20% de espacio arriba para los textos
)

# 4. COLOCAR PORCENTAJES ENCIMA DE LAS BARRAS
text(
  x      = barras_region,               # Usa las coordenadas exactas de las barras
  y      = region_plot$hi_porc + 2.5,   # Sube un poquito el texto
  labels = paste0(region_plot$hi_porc, "%"),
  cex    = 0.85,
  col    = "black"
)

# 5. AGREGAR LA LEYENDA A LA DERECHA
legend(
  "topright",
  inset     = c(-0.55, 0),     # Empuja la leyenda hacia la derecha, fuera del gráfico
  legend    = region_plot$etiqueta,
  fill      = "orchid",        # Muestra un cuadrito de color al lado del nombre
  border    = NA,
  bty       = "n",             # Quita el recuadro negro alrededor de la leyenda
  cex       = 0.8
)

CÁLCULO DE PROBABILIDAD

# 1. IDENTIFICAR LA REGIÓN CON MAYOR PROBABILIDAD (excluyendo la fila de TOTAL)
tabla_sin_total <- tabla_final_regiones %>%
  filter(REGION != "TOTAL")

# Encontrar el nombre de la región y su probabilidad máxima
region_mayor <- tabla_sin_total$REGION[which.max(tabla_sin_total$hi_porc)]
prob_mayor <- tabla_sin_total$hi_porc[which.max(tabla_sin_total$hi_porc)]

# 2. CONFIGURAR EL LIENZO DEL GRÁFICO DE TEXTO
par(mar = c(2, 2, 2, 2))  # Márgenes amplios para evitar cortes

# Crear un plot vacío (type = "n") sin ejes
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "", 
     xlim = c(0, 1), ylim = c(0, 1.1))

# 3. AGREGAR LOS TEXTOS AL LIENZO
# Título principal
text(
  x   = 0.5, 
  y   = 0.85, 
  labels = "REGIÓN CON MAYOR PROBABILIDAD\n(estimación muestral)", 
  cex = 1.6, 
  col = "navy", 
  font = 2, 
  adj = 0.5
)

# Pregunta explicativa
text(
  x   = 0.5, 
  y   = 0.60, 
  labels = paste0(
    "¿Qué región es más probable que concentre\n", 
    "la mayor cantidad de muestras de minerales críticos?\n"
  ), 
  cex = 1.3, 
  col = "black", 
  font = 1, 
  adj = 0.5
)

# Respuesta dinámica (usa las variables calculadas arriba)
text(
  x   = 0.5, 
  y   = 0.40, 
  labels = paste0(
    "Respuesta: ", region_mayor, "\n", 
    "Probabilidad estimada = ", sprintf("%.2f", prob_mayor), " %"
  ), 
  cex = 1.8, 
  col = "darkblue", 
  font = 2, 
  adj = 0.5
)

# 4. LÍNEA DECORATIVA SUTIL
abline(h = 0.75, col = "gray70", lty = 2, lwd = 1)

CONCLUSIÓN

Los resultados muestran que Norte América es el área con mayor concentración de muestras de minerales críticos con una probabilidad estimada del 81,26%.

Análisis Inferencial de la Distribución Geográfica por País

Grupo 1

2026-06-13

CARGA DE DATOS Y LIBRERIAS

TABLA DE DISTRIBUCIÓN DE PROBABILIDAD

TABLA DE DISTRIBUCIÓN DE PROBABILIDAD AGRUPADA

GRÁFICAS DE DISTRIBUCIÓN PROBABILIDAD

CÁLCULO DE PROBABILIDAD

CONCLUSIÓN