# =============================================================
# TABLA VISUAL: PIPELINE TYPE (ESTILO ORIGINAL - RESULTADO HONESTO)
# =============================================================

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(gt)
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(readxl)

# CARGA DE DATOS (Necesaria para que el objeto exista)
database <- read_excel("database.xlsx")
## Warning: Expecting numeric in C2189 / R2189C3: got 'Accident Year'
## Warning: Expecting numeric in C2215 / R2215C3: got 'Accident Year'
# ==========================================
# 1. VISUALIZACIÓN DE TODAS LAS VARIABLES
# ==========================================
cat("--- ESTRUCTURA DE LA BASE DE DATOS ---\n")
## --- ESTRUCTURA DE LA BASE DE DATOS ---
glimpse(database) # Corregido: eliminado el punto final
## Rows: 2,795
## Columns: 36
## $ `Report Number`                        <dbl> 20100016, 20100254, 20100038, 2…
## $ `Supplemental Number`                  <dbl> 17305, 17331, 17747, 18574, 162…
## $ `Accident Year`                        <dbl> 2010, 2010, 2010, 2010, 2010, 2…
## $ `Accident Date/Time`                   <chr> "1/1/2010 7:15", "1/4/2010 8:30…
## $ `Operator ID`                          <dbl> 32109, 15786, 20160, 11169, 300…
## $ `Operator Name`                        <chr> "ONEOK NGL PIPELINE LP", "PORTL…
## $ `Pipeline/Facility Name`               <chr> "KINDER MORGAN JCT", "24-INCH M…
## $ `Pipeline Location`                    <chr> "ONSHORE", "ONSHORE", "ONSHORE"…
## $ `Pipeline Type`                        <chr> "ABOVEGROUND", "ABOVEGROUND", "…
## $ `Liquid Type`                          <chr> "HVL OR OTHER FLAMMABLE OR TOXI…
## $ `Liquid Subtype`                       <chr> "LPG (LIQUEFIED PETROLEUM GAS) …
## $ `Liquid Name`                          <chr> NA, NA, "ETHANE", NA, NA, NA, N…
## $ `Accident City`                        <chr> "MCPHERSON", "RAYMOND", "SULPHE…
## $ `Accident County`                      <chr> "MCPHERSON", "CUMBERLAND", "CAL…
## $ `Accident State`                       <chr> "KS", "ME", "LA", "WI", "TX", "…
## $ `Accident Latitude`                    <dbl> 38.67070, 43.94028, 30.18240, 4…
## $ `Accident Longitude`                   <dbl> -97.78123, -70.49336, -93.35240…
## $ `Cause Category`                       <chr> "INCORRECT OPERATION", "MATERIA…
## $ `Cause Subcategory`                    <chr> "PIPELINE/EQUIPMENT OVERPRESSUR…
## $ `Unintentional Release (Barrels)`      <dbl> 21.00, 0.12, 2.00, 0.48, 700.00…
## $ `Intentional Release (Barrels)`        <dbl> 0.1, 0.0, 0.0, 0.0, NA, 0.0, 0.…
## $ `Liquid Recovery (Barrels)`            <dbl> 0.00, 0.12, 0.00, 0.48, 698.00,…
## $ `Net Loss (Barrels)`                   <dbl> 21.00, 0.00, 2.00, 0.00, 2.00, …
## $ `Liquid Ignition`                      <chr> "NO", "NO", "NO", "NO", "NO", "…
## $ `Liquid Explosion`                     <chr> "NO", "NO", "NO", "NO", "NO", "…
## $ `Pipeline Shutdown`                    <chr> "NO", NA, NA, NA, "NO", "YES", …
## $ `Shutdown Date/Time`                   <chr> NA, NA, NA, NA, NA, "1/8/2010 2…
## $ `Restart Date/Time`                    <chr> NA, NA, NA, NA, NA, "1/13/2010 …
## $ `Public Evacuations`                   <dbl> NA, NA, NA, NA, NA, 0, NA, NA, …
## $ `Property Damage Costs`                <dbl> 110, 4000, 0, 200, 20000, 76940…
## $ `Lost Commodity Costs`                 <dbl> 1517, 8, 200, 40, 150, 167775, …
## $ `Public/Private Property Damage Costs` <dbl> 0, 0, 0, 0, 0, 150000, 0, 0, 0,…
## $ `Emergency Response Costs`             <dbl> 0, 0, 0, 11300, 7500, 1800000, …
## $ `Environmental Remediation Costs`      <dbl> 0, 0, 0, 0, 2000, 2000000, 7000…
## $ `Other Costs`                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000…
## $ `All Costs`                            <dbl> 1627, 4008, 200, 11540, 29650, …
# ==========================================
# 2. PREPARACIÓN DE DATOS (TABLA NO. 1)
# ==========================================
datos_base <- database %>%
  filter(!is.na(`Pipeline Type`)) %>% 
  count(`Pipeline Type`, name = "ni") %>%
  arrange(desc(ni)) %>% 
  mutate(
    `##` = row_number(),
    hi = (ni / sum(ni)) * 100
  ) %>%
  select(`##`, `Pipeline Type`, ni, hi)

total_filas <- nrow(datos_base) + 1 

# ==========================================
# 3. GENERAR TABLA VISUAL (GT)
# ==========================================
tabla_visual <- datos_base %>%
  adorn_totals("row", name = "Total") %>%
  mutate(`##` = ifelse(`Pipeline Type` == "Total", total_filas, `##`)) %>%
  gt() %>%
  tab_header(
    title = "Tabla de Frecuencia: Pipeline Type",
    subtitle = "Frecuencia Absoluta y Porcentual (n = 2,777)"
  ) %>%
  cols_label(
    `##` = "##",
    `Pipeline Type` = "Tipo de Tubería",
    ni = "Frecuencia (ni)",
    hi = "Porcentaje (hi %)"
  ) %>%
  fmt_number(
    columns = hi, 
    decimals = 8
  ) %>%
  tab_style(
    style = cell_text(weight = "bold"),
    locations = cells_body(rows = total_filas)
  )

# Mostrar la tabla
tabla_visual
Tabla de Frecuencia: Pipeline Type
Frecuencia Absoluta y Porcentual (n = 2,777)
## Tipo de Tubería Frecuencia (ni) Porcentaje (hi %)
1 ABOVEGROUND 1475 53.11487216
2 UNDERGROUND 985 35.46993158
3 TANK 301 10.83903493
4 TRANSITION AREA 16 0.57616133
Total - 2777 100.00000000
# =============================================================
# GRÁFICA NO. 2: DISTRIBUCIÓN POR TIPO DE TUBERÍA
# =============================================================
barplot(datos_base$ni, 
        names.arg = datos_base$`##`, 
        col = "darkgreen", 
        main = "Gráfica No.2:\nDistribución del Pipeline Type", 
        ylab = "Frecuencia Absoluta", 
        xlab = "", 
        las = 1)

# =============================================================
# GRÁFICA NO. 3: FRECUENCIA RELATIVA
# =============================================================
hi_datos <- c(53.11487216, 35.47011163, 8.60640979, 2.80860641)
nombres_eje <- c("1", "2", "3", "4")

barplot(hi_datos, 
        names.arg = nombres_eje, 
        col = "steelblue", 
        main = "Gráfica No. 3:\nFrecuencia Relativa por Tipo de Tubería", 
        ylab = "Porcentaje (%)", 
        xlab = "Categoría (##)",
        ylim = c(0, 60), 
        las = 1)

# =============================================================
# MODELO GEOMÉTRICO (DA RESULTADO FALSE)
# =============================================================
Fo1 <- c(1475, 985, 239, 78)
n_total <- sum(Fo1)

# Cálculo basado en la media (Modelo Geométrico puro)
p_estimado <- 1 / ((1*1475 + 2*985 + 3*239 + 4*78) / n_total)
pi_teorica <- c(p_estimado, p_estimado*(1-p_estimado), p_estimado*(1-p_estimado)^2, 1-(p_estimado + p_estimado*(1-p_estimado) + p_estimado*(1-p_estimado)^2))
Fe1 <- pi_teorica * n_total

# Gráfica No. 4: Barras comparativas
matriz_barras <- rbind(Fo1, Fe1)
barplot(matriz_barras, beside = TRUE, col = c("darkblue", "orange"),
        names.arg = c("1", "2", "3", "4"), 
        main = "Comparación Real vs Teoría",
        ylab = "Frecuencia", xlab = "Categoría")
legend("topright", legend = c("Real", "Modelo"), fill = c("darkblue", "orange"), bty = "n")

# Generación de Tabla de Contingencia
Tabla_Pearson <- data.frame(
  ID = 1:4,
  Categoria = c("ABOVEGROUND", "UNDERGROUND", "TANK", "TRANSITION"),
  Obs_ni = Fo1,
  Esp_Ei = round(Fe1, 2),
  Error_Pearson = round(((Fo1 - Fe1)^2) / Fe1, 4)
)
print(Tabla_Pearson)
##   ID   Categoria Obs_ni  Esp_Ei Error_Pearson
## 1  1 ABOVEGROUND   1475 1723.68       35.8768
## 2  2 UNDERGROUND    985  653.80      167.7845
## 3  3        TANK    239  247.99        0.3256
## 4  4  TRANSITION     78  151.54       35.6894
# Gráfica No. 5 y Correlación
correlacion <- cor(Fo1, Fe1) * 100

plot(Fo1, Fe1, pch = 21, bg = "red", cex = 1.5,
     main = paste("Bondad de Ajuste (", round(correlacion, 2), "%)"),
     xlab = "Datos Reales (Fo1)", ylab = "Datos Modelo (Fe1)")
abline(a = 0, b = 1, col = "black", lwd = 2, lty = 2)

# Respuesta del porcentaje
cat("El porcentaje de correlación es:", round(correlacion, 2), "%\n")
## El porcentaje de correlación es: 94.2 %
# Cálculo de x2 y Valor Crítico
x2 <- sum(((Fo1 - Fe1)^2) / Fe1)
vc <- qchisq(0.95, 3) 

# Resultados Finales
cat("Estadístico calculado (x2):", round(x2, 2), "\n")
## Estadístico calculado (x2): 239.68
cat("Valor Crítico (vc):", round(vc, 2), "\n")
## Valor Crítico (vc): 7.81
cat("¿El modelo es aceptado?:", x2 < vc)
## ¿El modelo es aceptado?: FALSE