# 1. CARGA DE LIBRER??AS
library(dplyr)
## 
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
library(kableExtra)
## 
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(tidyr)
library(readr)

# 2. CARGA DE DATOS
# Usando la ruta directa de tu carpeta de descargas
database_xlsx_Sheet1 <- read_csv("C:/Users/Usuario/Downloads/database.xlsx - Sheet1.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)
## Rows: 2795 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): Accident Date/Time, Operator Name, Pipeline/Facility Name, Pipelin...
## dbl (18): Report Number, Supplemental Number, Accident Year, Operator ID, Ac...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# --- ESTO ES LO QUE TE SAL??A AL COMIENZO (VISTA DE VARIABLES) ---
glimpse(database_xlsx_Sheet1) 
## Rows: 2,795
## Columns: 36
## $ `Report Number`                        <dbl> 20100016, 20100254, 20100038, 2…
## $ `Supplemental Number`                  <dbl> 17305, 17331, 17747, 18574, 162…
## $ `Accident Year`                        <dbl> 2010, 2010, 2010, 2010, 2010, 2…
## $ `Accident Date/Time`                   <chr> "1/1/2010 7:15", "1/4/2010 8:30…
## $ `Operator ID`                          <dbl> 32109, 15786, 20160, 11169, 300…
## $ `Operator Name`                        <chr> "ONEOK NGL PIPELINE LP", "PORTL…
## $ `Pipeline/Facility Name`               <chr> "KINDER MORGAN JCT", "24-INCH M…
## $ `Pipeline Location`                    <chr> "ONSHORE", "ONSHORE", "ONSHORE"…
## $ `Pipeline Type`                        <chr> "ABOVEGROUND", "ABOVEGROUND", "…
## $ `Liquid Type`                          <chr> "HVL OR OTHER FLAMMABLE OR TOXI…
## $ `Liquid Subtype`                       <chr> "LPG (LIQUEFIED PETROLEUM GAS) …
## $ `Liquid Name`                          <chr> NA, NA, "ETHANE", NA, NA, NA, N…
## $ `Accident City`                        <chr> "MCPHERSON", "RAYMOND", "SULPHE…
## $ `Accident County`                      <chr> "MCPHERSON", "CUMBERLAND", "CAL…
## $ `Accident State`                       <chr> "KS", "ME", "LA", "WI", "TX", "…
## $ `Accident Latitude`                    <dbl> 38.67070, 43.94028, 30.18240, 4…
## $ `Accident Longitude`                   <dbl> -97.78123, -70.49336, -93.35240…
## $ `Cause Category`                       <chr> "INCORRECT OPERATION", "MATERIA…
## $ `Cause Subcategory`                    <chr> "PIPELINE/EQUIPMENT OVERPRESSUR…
## $ `Unintentional Release (Barrels)`      <dbl> 21.00, 0.12, 2.00, 0.48, 700.00…
## $ `Intentional Release (Barrels)`        <dbl> 0.1, 0.0, 0.0, 0.0, NA, 0.0, 0.…
## $ `Liquid Recovery (Barrels)`            <dbl> 0.00, 0.12, 0.00, 0.48, 698.00,…
## $ `Net Loss (Barrels)`                   <dbl> 21.00, 0.00, 2.00, 0.00, 2.00, …
## $ `Liquid Ignition`                      <chr> "NO", "NO", "NO", "NO", "NO", "…
## $ `Liquid Explosion`                     <chr> "NO", "NO", "NO", "NO", "NO", "…
## $ `Pipeline Shutdown`                    <chr> "NO", NA, NA, NA, "NO", "YES", …
## $ `Shutdown Date/Time`                   <chr> NA, NA, NA, NA, NA, "1/8/2010 2…
## $ `Restart Date/Time`                    <chr> NA, NA, NA, NA, NA, "1/13/2010 …
## $ `Public Evacuations`                   <dbl> NA, NA, NA, NA, NA, 0, NA, NA, …
## $ `Property Damage Costs`                <dbl> 110, 4000, 0, 200, 20000, 76940…
## $ `Lost Commodity Costs`                 <dbl> 1517, 8, 200, 40, 150, 167775, …
## $ `Public/Private Property Damage Costs` <dbl> 0, 0, 0, 0, 0, 150000, 0, 0, 0,…
## $ `Emergency Response Costs`             <dbl> 0, 0, 0, 11300, 7500, 1800000, …
## $ `Environmental Remediation Costs`      <dbl> 0, 0, 0, 0, 2000, 2000000, 7000…
## $ `Other Costs`                          <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000…
## $ `All Costs`                            <dbl> 1627, 4008, 200, 11540, 29650, …
# 3. TABLA DE DISTRIBUCI??N DE FRECUENCIA
datos <- database_xlsx_Sheet1

TDF_agrupada <- datos %>%
  filter(!is.na(`Pipeline Type`)) %>%
  count(`Pipeline Type`, name = "ni") %>%
  arrange(desc(ni)) %>%
  mutate(Nivel = row_number())

ni_total <- sum(TDF_agrupada$ni)
TDF_agrupada$hi <- (TDF_agrupada$ni / ni_total) * 100

# Formateo de tabla final estilo kable
TDF_final <- TDF_agrupada %>%
  mutate(hi = sprintf("%.2f", round(hi, 2))) %>%
  bind_rows(data.frame(`Pipeline Type` = "TOTAL", ni = ni_total, hi = "100.00", Nivel = NA))

colnames(TDF_final) <- c("Tipo de Tuberia", "ni", "Nivel", "hi (%)")

kable(TDF_final, align = 'c', caption = "Tabla 1: Frecuencia por Tipo de Tuberia") %>%
  kable_styling(full_width = FALSE, position = "center", 
                bootstrap_options = c("striped", "hover", "condensed")) %>%
  row_spec(nrow(TDF_final), bold = TRUE, background = "#f2f2f2")
Tabla 1: Frecuencia por Tipo de Tuberia
Tipo de Tuberia ni Nivel hi (%) NA
ABOVEGROUND 1475 1 53.11 NA
UNDERGROUND 985 2 35.47 NA
TANK 301 3 10.84 NA
TRANSITION AREA 16 4 0.58 NA
NA 2777 NA 100.00 TOTAL
# 4. GR??FICA N??1: CANTIDAD DE EVENTOS
TDF_agrupada %>%
  ggplot(aes(x = reorder(`Pipeline Type`, -ni), y = ni, fill = as.factor(Nivel))) + 
  geom_col(width = 0.65, color = "black") +
  scale_fill_manual(values = c("#AED6F1", "#3498DB", "#2E86C1", "#154360")) +
  labs(title = "Grafica N1: Cantidad de Eventos por Tipo de Tuberia", 
       x = "Tipo de Tuberia", y = "Cantidad") +
  theme_light() + 
  theme(axis.text.x = element_text(angle = 35, hjust = 1), legend.position = "none")

# 5. C??LCULOS DEL MODELO GEOM??TRICO
df_geo <- TDF_agrupada
df_geo$x <- 1:nrow(df_geo)
df_geo$Real <- df_geo$ni / sum(df_geo$ni)
media_obs <- sum(df_geo$x * df_geo$ni) / sum(df_geo$ni)
p_geo <- 1 / media_obs
df_geo$Geometrica <- dgeom(df_geo$x - 1, prob = p_geo)

# 6. GR??FICA N??4: COMPARACI??N DE BARRAS (Como tu image_9fbe6e.png)
df_grafica <- df_geo %>%
  select(`Pipeline Type`, Real, Geometrica) %>%
  pivot_longer(cols = c("Real", "Geometrica"), names_to = "Tipo", values_to = "Probabilidad")

ggplot(df_grafica, aes(x = `Pipeline Type`, y = Probabilidad, fill = Tipo)) +
  geom_col(position = "dodge", color = "black") +
  scale_fill_manual(values = c("Real" = "#66CCE6", "Geometrica" = "#0072B2"),
                    labels = c("Real" = "Probabilidad Observada", "Geometrica" = "Modelo Geometrico")) +
  labs(title = "Grafica N8: Comparacion Real vs. Modelo", x = "", y = "Probabilidad") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")

# 7. GR??FICA N??5: EVALUACI??N VISUAL (Como tu image_9fc24d.png)
Fo1 <- df_geo$Real
Fe1 <- df_geo$Geometrica

plot(Fo1, Fe1, 
     main = "Grafica N9: Comparacion Observada vs. Esperada",
     xlab = "Frecuencia Observada (Fo)", 
     ylab = "Frecuencia Esperada (Fe)",
     pch = 21, bg = "white", col = "black", cex = 1.5,
     xlim = c(0, 1), ylim = c(0, 1))
abline(a = 0, b = 1, col = "red", lwd = 2)
text(Fo1, Fe1, labels = df_geo$`Pipeline Type`, pos = 4, cex = 0.8)

# 8. TEST DE PEARSON FINAL
Correlacion <- cor(Fo1, Fe1) * 100
x2 <- sum(((Fo1 - Fe1)^2) / Fe1)
vc <- qchisq(0.95, df = (nrow(df_geo) - 1)) 

tabla_resumen <- data.frame(
  Variable = "Pipeline Type",
  Pearson = round(Correlacion, 2),
  Chi_Cuadrado = round(x2, 2),
  Umbral = round(vc, 2),
  Resultado = ifelse(x2 < vc, "ACEPTADO (TRUE)", "RECHAZADO (FALSE)")
)

kable(tabla_resumen, format = "markdown", caption = "Resumen Test de Bondad de Ajuste")
Resumen Test de Bondad de Ajuste
Variable Pearson Chi_Cuadrado Umbral Resultado
Pipeline Type 94.18 0.11 7.81 ACEPTADO (TRUE)