##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(gt)
library(magrittr)
Sedimentos_Marinos <- read.csv(
"ESTADISTICA/dataset_geologico_limpio_80.csv",
header = TRUE,
sep = ",",
dec = ".",
stringsAsFactors = FALSE
)
# Librerías necesarias
library(dplyr)
library(gt)
library(knitr)
# Lectura del archivo
Sedimentos_Marinos <- read.csv(
"ESTADISTICA/dataset_geologico_limpio_80.csv",
header = TRUE,
sep = ",",
dec = ".",
stringsAsFactors = FALSE
)TABLA DE DISTRIBUCION DE PROBABILIDAD POR CIUDAD
tabla_original <- Sedimentos_Marinos %>%
count(AREA, name = "ni") %>%
mutate(
hi = ni / sum(ni),
P = round(hi * 100, 2)
) %>%
arrange(desc(ni)) %>%
mutate(rank = row_number())
total_original <- data.frame(
AREA = "TOTAL",
ni = sum(tabla_original$ni),
hi = 1.0000,
P = 100.00,
rank = NA
)
tabla_original_final <- bind_rows(tabla_original, total_original)
tabla_original_final %>%
filter(rank <= 20 | is.na(rank)) %>%
select(-rank) %>%
gt() %>%
tab_header(
title = md("**Tabla N°1**"),
subtitle = md("**Distribución de probabilidad por Área (sin agrupar)**")
) %>%
tab_source_note(md("Autor: Grupo")) %>%
cols_label(
AREA = "Área (valor original)",
ni = "Frecuencia absoluta (ni)",
hi = "Frec. relativa",
P = "Probabilidad (%)"
) %>%
fmt_number(columns = c(hi, P), decimals = 2) %>%
fmt_number(columns = ni, decimals = 0) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(rows = AREA == "TOTAL")
)| Tabla N°1 | |||
| Distribución de probabilidad por Área (sin agrupar) | |||
| Área (valor original) | Frecuencia absoluta (ni) | Frec. relativa | Probabilidad (%) |
|---|---|---|---|
| MASSACHUSETTS | 1,653 | 0.06 | 5.95 |
| GEORGES BANK | 1,648 | 0.06 | 5.93 |
| LAKE MICHIGAN | 1,240 | 0.04 | 4.46 |
| LONG ISLAND SOUND | 1,219 | 0.04 | 4.39 |
| Stellwagen Bank | 1,138 | 0.04 | 4.10 |
| GULF OF MEXICO | 1,133 | 0.04 | 4.08 |
| STELLWAGEN BANK | 1,079 | 0.04 | 3.88 |
| MASS BAY | 950 | 0.03 | 3.42 |
| LOUISIANA | 712 | 0.03 | 2.56 |
| Long Island Sound | 698 | 0.03 | 2.51 |
| NORTH CAROLINA | 694 | 0.02 | 2.50 |
| PUERTO RICO | 508 | 0.02 | 1.83 |
| HUDSON SHELF VALLEY | 500 | 0.02 | 1.80 |
| LYDONIA CANYON | 482 | 0.02 | 1.73 |
| MASSBAY | 470 | 0.02 | 1.69 |
| LONG ISLAND SOUND (Connecticut) | 446 | 0.02 | 1.61 |
| NEW JERSEY | 429 | 0.02 | 1.54 |
| LAKE MEAD | 334 | 0.01 | 1.20 |
| LAKE BAIKAL | 320 | 0.01 | 1.15 |
| COASTAL MASSACHUSETTS | 319 | 0.01 | 1.15 |
| TOTAL | 27,784 | 1.00 | 100.00 |
| Autor: Grupo | |||
TABLA DE DISTRIBUCIÓN DE PROBABILIDAD AGRUPADA Debido a que la tabla presenta numerosos registros de ciudades donde se recolecto muestras de sedimentos , se decidió agruparlos por áreas en común o cercanas entre si
# Limpieza y agrupación de áreas
Sedimentos_Marinos <- Sedimentos_Marinos %>%
mutate(
AREA_clean = toupper(trimws(AREA)),
Area = case_when(
grepl("STELLWAGEN|MASS BAY|MASSBAY|MASSACHUSETTS BAY|BOSTON HARBOR|MERRIMACK|MARTHA'S VINEYARD|BUZZARDS BAY|CAPE COD BAY|COASTAL MA", AREA_clean) ~ "New England Shelf",
grepl("LONG ISLAND|NEW YORK BIGHT|BLOCK ISLAND|NEW YORK|FIRE ISLAND|LIS", AREA_clean) ~ "Long Island & New York Bight",
grepl("NORTH CAROLINA|CAPE HATTERAS|BARNEGAT BAY|HUDSON CANYON|HUDSON SHELF VALLEY", AREA_clean) ~ "Mid-Atlantic Coast",
grepl("GULF OF MAINE|MAINE|BAY OF FUNDY|NEW HAMPSHIRE", AREA_clean) ~ "Gulf of Maine Region",
grepl("RHODE ISLAND SOUND|NANTUCKET SOUND|NANTUCKET", AREA_clean) ~ "Southern New England & Rhode Island",
grepl("FLORIDA|SOUTH CAROLINA|APALACHICOLA BAY", AREA_clean) ~ "Southeast USA",
grepl("GULF OF MEXICO", AREA_clean) ~ "Gulf of Mexico",
grepl("PUERTO RICO|CARIBBEAN|MONA CANYON", AREA_clean) ~ "Caribbean Region",
grepl("CALIFORNIA|WASHINGTON|ALASKA|CASCADIA", AREA_clean) ~ "Western USA",
TRUE ~ NA_character_
)
)
# Filtrar registros válidos
Sedimentos_Marinos_Area <- Sedimentos_Marinos %>%
filter(!is.na(Area))
# Conteo y cálculo de probabilidades
TablaArea <- as.data.frame(table(Sedimentos_Marinos_Area$Area))
colnames(TablaArea) <- c("Area", "ni")
TablaArea <- TablaArea %>%
mutate(
hi = ni / sum(ni),
P = round(hi * 100, 2)
) %>%
arrange(desc(P))
# Corrección para que sume exactamente 100%
suma_actual <- sum(TablaArea$P)
diferencia <- 100.00 - suma_actual
TablaArea$P[nrow(TablaArea)] <- round(TablaArea$P[nrow(TablaArea)] + diferencia, 2)
# Fila total
total_area <- data.frame(
Area = "Total",
ni = sum(TablaArea$ni),
hi = 1.0000,
P = 100.00
)
TablaAreaFinal <- rbind(TablaArea, total_area)
# Tabla gt
TablaAreaFinal %>%
gt() %>%
tab_header(
title = md("**Tabla N°2**"),
subtitle = md("**Distribución de probabilidad de las muestras de sedimentos marinos por Área de recolección )**")
) %>%
tab_source_note(md("Autor: GRUPO")) %>%
cols_label(
Area = "Área de recolección",
ni = "Frecuencia absoluta (ni)",
hi = "Frecuencia relativa (hi)",
P = "Probabilidad (%)"
) %>%
fmt_number(columns = c(hi, P), decimals = 2) %>%
fmt_number(columns = ni, decimals = 0) %>%
tab_style(
style = cell_text(weight = "bold"),
locations = cells_body(rows = Area == "Total")
) %>%
tab_options(
table.border.top.color = "black",
table.border.bottom.color = "black",
heading.border.bottom.color = "black",
heading.border.bottom.width = px(2),
column_labels.border.top.color = "black",
column_labels.border.bottom.color = "black",
column_labels.border.bottom.width = px(2),
table_body.hlines.color = "gray",
table_body.border.bottom.color = "black",
row.striping.include_table_body = TRUE
)| Tabla N°2 | |||
| Distribución de probabilidad de las muestras de sedimentos marinos por Área de recolección ) | |||
| Área de recolección | Frecuencia absoluta (ni) | Frecuencia relativa (hi) | Probabilidad (%) |
|---|---|---|---|
| New England Shelf | 5,443 | 0.36 | 35.89 |
| Long Island & New York Bight | 3,569 | 0.24 | 23.53 |
| Mid-Atlantic Coast | 1,759 | 0.12 | 11.60 |
| Gulf of Mexico | 1,133 | 0.07 | 7.47 |
| Gulf of Maine Region | 1,123 | 0.07 | 7.40 |
| Caribbean Region | 925 | 0.06 | 6.10 |
| Southeast USA | 725 | 0.05 | 4.78 |
| Western USA | 287 | 0.02 | 1.89 |
| Southern New England & Rhode Island | 203 | 0.01 | 1.34 |
| Total | 15,167 | 1.00 | 100.00 |
| Autor: GRUPO | |||
# Preparar datos para gráfico
area_plot <- TablaAreaFinal %>%
filter(Area != "Total") %>%
mutate(
id = row_number(),
etiqueta = paste0(id, ". ", Area)
)
# Gráfico de barras
barplot(
height = area_plot$P,
main = "Gráfica N° 1: Distribución de probabilidad por Área de recolección (%)",
xlab = "Área de recolección ",
ylab = "Probabilidad (%)",
col = "#4DB6AC",
border = NA,
names.arg = area_plot$id,
cex.names = 1.2,
las = 1,
ylim = c(0, max(area_plot$P) * 1.18)
)
# Porcentajes encima de barras
text(
x = 1:nrow(area_plot),
y = area_plot$P + 2.5,
labels = paste0(area_plot$P, "%"),
cex = 0.95,
col = "black"
)
# Leyenda
legend(
"topright",
legend = area_plot$etiqueta,
fill = NA,
border = NA,
bty = "n",
cex = 0.8,
xpd = TRUE,
inset = c(-0.28, 0)
)# 8. IDENTIFICAR EL ÁREA CON MAYOR PROBABILIDAD (excluyendo Total)
tabla_sin_total <- TablaAreaFinal %>%
filter(Area != "Total")
area_mayor <- tabla_sin_total$Area[which.max(tabla_sin_total$P)]
prob_mayor <- tabla_sin_total$P[which.max(tabla_sin_total$P)]
# Gráfico de texto explicativo - versión mejorada (evita corte del texto)
par(mar = c(2, 2, 2, 2)) # márgenes más amplios
plot(1, type = "n", axes = FALSE, xlab = "", ylab = "",
xlim = c(0, 1), ylim = c(0, 1.1))
text(
x = 0.5,
y = 0.85,
labels = "ÁREA CON MAYOR PROBABILIDAD\n(estimación muestral)",
cex = 1.6,
col = "navy",
font = 2,
adj = 0.5
)
text(
x = 0.5,
y = 0.60,
labels = paste0(
"¿Qué área es más probable que concentre\n",
"la mayor cantidad de muestras de sedimentos marinos?\n"
),
cex = 1.3,
col = "black",
font = 1,
adj = 0.5
)
text(
x = 0.5,
y = 0.40,
labels = paste0(
"Respuesta: ", area_mayor, "\n",
"Probabilidad estimada = ", sprintf("%.2f", prob_mayor), " %"
),
cex = 1.8,
col = "darkblue",
font = 2,
adj = 0.5
)
# Opcional: línea decorativa sutil
abline(h = 0.75, col = "gray70", lty = 2, lwd = 1)Los resultados muestran que New England Shelf es el área con mayor concentración de muestras de sedimentos marinos con una probabilidad estimada del 35.89%.