Análisis inferencial de Área de Recolección de Sedimentos Marinos

CARGA DE DATOS Y LIBRERÍAS

library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(gt)
library(magrittr)
Sedimentos_Marinos <- read.csv(
  "ESTADISTICA/dataset_geologico_limpio_80.csv",
  header = TRUE,
  sep = ",",
  dec = ".",
  stringsAsFactors = FALSE
)

# Librerías necesarias
library(dplyr)
library(gt)
library(knitr)
# Lectura del archivo
Sedimentos_Marinos <- read.csv(
  "ESTADISTICA/dataset_geologico_limpio_80.csv",
  header = TRUE,
  sep = ",",
  dec = ".",
  stringsAsFactors = FALSE
)

TABLA DE DISTRIBUCION DE PROBABILIDAD

TABLA DE DISTRIBUCION DE PROBABILIDAD POR CIUDAD

tabla_original <- Sedimentos_Marinos %>%
  count(AREA, name = "ni") %>%
  mutate(
    hi = ni / sum(ni),
    P  = round(hi * 100, 2)
  ) %>%
  arrange(desc(ni)) %>%
  mutate(rank = row_number())

total_original <- data.frame(
  AREA = "TOTAL",
  ni   = sum(tabla_original$ni),
  hi   = 1.0000,
  P    = 100.00,
  rank = NA
)

tabla_original_final <- bind_rows(tabla_original, total_original)

tabla_original_final %>%
  filter(rank <= 20 | is.na(rank)) %>%
  select(-rank) %>%
  gt() %>%
  tab_header(
    title = md("**Tabla N°1**"),
    subtitle = md("**Distribución de probabilidad por Área  (sin agrupar)**")
  ) %>%
  tab_source_note(md("Autor: Grupo")) %>%
  cols_label(
    AREA = "Área (valor original)",
    ni   = "Frecuencia absoluta (ni)",
    hi   = "Frec. relativa",
    P    = "Probabilidad (%)"
  ) %>%
  fmt_number(columns = c(hi, P), decimals = 2) %>%
  fmt_number(columns = ni, decimals = 0) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = AREA == "TOTAL")
  )

Área (valor original)	Frecuencia absoluta (ni)	Frec. relativa	Probabilidad (%)
Tabla N°1
Distribución de probabilidad por Área (sin agrupar)
MASSACHUSETTS	1,653	0.06	5.95
GEORGES BANK	1,648	0.06	5.93
LAKE MICHIGAN	1,240	0.04	4.46
LONG ISLAND SOUND	1,219	0.04	4.39
Stellwagen Bank	1,138	0.04	4.10
GULF OF MEXICO	1,133	0.04	4.08
STELLWAGEN BANK	1,079	0.04	3.88
MASS BAY	950	0.03	3.42
LOUISIANA	712	0.03	2.56
Long Island Sound	698	0.03	2.51
NORTH CAROLINA	694	0.02	2.50
PUERTO RICO	508	0.02	1.83
HUDSON SHELF VALLEY	500	0.02	1.80
LYDONIA CANYON	482	0.02	1.73
MASSBAY	470	0.02	1.69
LONG ISLAND SOUND (Connecticut)	446	0.02	1.61
NEW JERSEY	429	0.02	1.54
LAKE MEAD	334	0.01	1.20
LAKE BAIKAL	320	0.01	1.15
COASTAL MASSACHUSETTS	319	0.01	1.15
TOTAL	27,784	1.00	100.00
Autor: Grupo

TABLA DE DISTRIBUCIÓN DE PROBABILIDAD AGRUPADA

TABLA DE DISTRIBUCIÓN DE PROBABILIDAD AGRUPADA Debido a que la tabla presenta numerosos registros de ciudades donde se recolecto muestras de sedimentos , se decidió agruparlos por áreas en común o cercanas entre si

# Limpieza y agrupación de áreas
Sedimentos_Marinos <- Sedimentos_Marinos %>%
  mutate(
    AREA_clean = toupper(trimws(AREA)),
    Area = case_when(
      grepl("STELLWAGEN|MASS BAY|MASSBAY|MASSACHUSETTS BAY|BOSTON HARBOR|MERRIMACK|MARTHA'S VINEYARD|BUZZARDS BAY|CAPE COD BAY|COASTAL MA", AREA_clean) ~ "New England Shelf",
      
      grepl("LONG ISLAND|NEW YORK BIGHT|BLOCK ISLAND|NEW YORK|FIRE ISLAND|LIS", AREA_clean) ~ "Long Island & New York Bight",
      
      grepl("NORTH CAROLINA|CAPE HATTERAS|BARNEGAT BAY|HUDSON CANYON|HUDSON SHELF VALLEY", AREA_clean) ~ "Mid-Atlantic Coast",
      
      grepl("GULF OF MAINE|MAINE|BAY OF FUNDY|NEW HAMPSHIRE", AREA_clean) ~ "Gulf of Maine Region",
      
      grepl("RHODE ISLAND SOUND|NANTUCKET SOUND|NANTUCKET", AREA_clean) ~ "Southern New England & Rhode Island",
      
      grepl("FLORIDA|SOUTH CAROLINA|APALACHICOLA BAY", AREA_clean) ~ "Southeast USA",
      
      grepl("GULF OF MEXICO", AREA_clean) ~ "Gulf of Mexico",
      
      grepl("PUERTO RICO|CARIBBEAN|MONA CANYON", AREA_clean) ~ "Caribbean Region",
      
      grepl("CALIFORNIA|WASHINGTON|ALASKA|CASCADIA", AREA_clean) ~ "Western USA",
      
      TRUE ~ NA_character_
    )
  )

# Filtrar registros válidos
Sedimentos_Marinos_Area <- Sedimentos_Marinos %>%
  filter(!is.na(Area))
# Conteo y cálculo de probabilidades
TablaArea <- as.data.frame(table(Sedimentos_Marinos_Area$Area))
colnames(TablaArea) <- c("Area", "ni")

TablaArea <- TablaArea %>%
  mutate(
    hi = ni / sum(ni),
    P  = round(hi * 100, 2)
  ) %>%
  arrange(desc(P))

# Corrección para que sume exactamente 100%
suma_actual <- sum(TablaArea$P)
diferencia   <- 100.00 - suma_actual
TablaArea$P[nrow(TablaArea)] <- round(TablaArea$P[nrow(TablaArea)] + diferencia, 2)

# Fila total
total_area <- data.frame(
  Area = "Total",
  ni   = sum(TablaArea$ni),
  hi   = 1.0000,
  P    = 100.00
)

TablaAreaFinal <- rbind(TablaArea, total_area)

# Tabla gt
TablaAreaFinal %>%
  gt() %>%
  tab_header(
    title = md("**Tabla N°2**"),
    subtitle = md("**Distribución de probabilidad de las muestras de sedimentos marinos por Área de recolección )**")
  ) %>%
  tab_source_note(md("Autor: GRUPO")) %>%
  cols_label(
    Area = "Área de recolección",
    ni   = "Frecuencia absoluta (ni)",
    hi   = "Frecuencia relativa (hi)",
    P    = "Probabilidad (%)"
  ) %>%
  fmt_number(columns = c(hi, P), decimals = 2) %>%
  fmt_number(columns = ni, decimals = 0) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = Area == "Total")
  ) %>%
  tab_options(
    table.border.top.color = "black",
    table.border.bottom.color = "black",
    heading.border.bottom.color = "black",
    heading.border.bottom.width = px(2),
    column_labels.border.top.color = "black",
    column_labels.border.bottom.color = "black",
    column_labels.border.bottom.width = px(2),
    table_body.hlines.color = "gray",
    table_body.border.bottom.color = "black",
    row.striping.include_table_body = TRUE
  )

Área de recolección	Frecuencia absoluta (ni)	Frecuencia relativa (hi)	Probabilidad (%)
Tabla N°2
Distribución de probabilidad de las muestras de sedimentos marinos por Área de recolección )
New England Shelf	5,443	0.36	35.89
Long Island & New York Bight	3,569	0.24	23.53
Mid-Atlantic Coast	1,759	0.12	11.60
Gulf of Mexico	1,133	0.07	7.47
Gulf of Maine Region	1,123	0.07	7.40
Caribbean Region	925	0.06	6.10
Southeast USA	725	0.05	4.78
Western USA	287	0.02	1.89
Southern New England & Rhode Island	203	0.01	1.34
Total	15,167	1.00	100.00
Autor: GRUPO

GRÁFICAS DE DISTRIBUCION DE PROBABILIDAD

# Preparar datos para gráfico
area_plot <- TablaAreaFinal %>%
  filter(Area != "Total") %>%
  mutate(
    id = row_number(),
    etiqueta = paste0(id, ". ", Area)
  )

# Gráfico de barras
barplot(
  height    = area_plot$P,
  main      = "Gráfica N° 1: Distribución de probabilidad por Área de recolección (%)",
  xlab      = "Área de recolección ",
  ylab      = "Probabilidad (%)",
  col       = "#4DB6AC",
  border    = NA,
  names.arg = area_plot$id,
  cex.names = 1.2,
  las       = 1,
  ylim      = c(0, max(area_plot$P) * 1.18)
)

# Porcentajes encima de barras
text(
  x      = 1:nrow(area_plot),
  y      = area_plot$P + 2.5,
  labels = paste0(area_plot$P, "%"),
  cex    = 0.95,
  col    = "black"
)

# Leyenda
legend(
  "topright",
  legend    = area_plot$etiqueta,
  fill      = NA,
  border    = NA,
  bty       = "n",
  cex       = 0.8,
  xpd       = TRUE,
  inset     = c(-0.28, 0)
)

CÁLCULO DE PROBABILIDAD

# 8. IDENTIFICAR EL ÁREA CON MAYOR PROBABILIDAD (excluyendo Total)
tabla_sin_total <- TablaAreaFinal %>%
  filter(Area != "Total")

area_mayor <- tabla_sin_total$Area[which.max(tabla_sin_total$P)]
prob_mayor <- tabla_sin_total$P[which.max(tabla_sin_total$P)]

# Gráfico de texto explicativo - versión mejorada (evita corte del texto)
par(mar = c(2, 2, 2, 2))  # márgenes más amplios

plot(1, type = "n", axes = FALSE, xlab = "", ylab = "",
     xlim = c(0, 1), ylim = c(0, 1.1))

text(
  x   = 0.5,
  y   = 0.85,
  labels = "ÁREA CON MAYOR PROBABILIDAD\n(estimación muestral)",
  cex = 1.6,
  col = "navy",
  font = 2,
  adj = 0.5
)

text(
  x   = 0.5,
  y   = 0.60,
  labels = paste0(
    "¿Qué área es más probable que concentre\n",
    "la mayor cantidad de muestras de sedimentos marinos?\n"
  ),
  cex = 1.3,
  col = "black",
  font = 1,
  adj = 0.5
)

text(
  x   = 0.5,
  y   = 0.40,
  labels = paste0(
    "Respuesta: ", area_mayor, "\n",
    "Probabilidad estimada = ", sprintf("%.2f", prob_mayor), " %"
  ),
  cex = 1.8,
  col = "darkblue",
  font = 2,
  adj = 0.5
)

# Opcional: línea decorativa sutil
abline(h = 0.75, col = "gray70", lty = 2, lwd = 1)

CONCLUSIONES

Los resultados muestran que New England Shelf es el área con mayor concentración de muestras de sedimentos marinos con una probabilidad estimada del 35.89%.