# 1. CARGA DE LIBRER??AS
library(dplyr)
##
## Adjuntando el paquete: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(knitr)
library(kableExtra)
##
## Adjuntando el paquete: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(tidyr)
library(readr)
# 2. CARGA DE DATOS
# Usando la ruta directa de tu carpeta de descargas
database_xlsx_Sheet1 <- read_csv("C:/Users/Usuario/Downloads/database.xlsx - Sheet1.csv")
## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
## dat <- vroom(...)
## problems(dat)
## Rows: 2795 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): Accident Date/Time, Operator Name, Pipeline/Facility Name, Pipelin...
## dbl (18): Report Number, Supplemental Number, Accident Year, Operator ID, Ac...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# --- ESTO ES LO QUE TE SAL??A AL COMIENZO (VISTA DE VARIABLES) ---
glimpse(database_xlsx_Sheet1)
## Rows: 2,795
## Columns: 36
## $ `Report Number` <dbl> 20100016, 20100254, 20100038, 2…
## $ `Supplemental Number` <dbl> 17305, 17331, 17747, 18574, 162…
## $ `Accident Year` <dbl> 2010, 2010, 2010, 2010, 2010, 2…
## $ `Accident Date/Time` <chr> "1/1/2010 7:15", "1/4/2010 8:30…
## $ `Operator ID` <dbl> 32109, 15786, 20160, 11169, 300…
## $ `Operator Name` <chr> "ONEOK NGL PIPELINE LP", "PORTL…
## $ `Pipeline/Facility Name` <chr> "KINDER MORGAN JCT", "24-INCH M…
## $ `Pipeline Location` <chr> "ONSHORE", "ONSHORE", "ONSHORE"…
## $ `Pipeline Type` <chr> "ABOVEGROUND", "ABOVEGROUND", "…
## $ `Liquid Type` <chr> "HVL OR OTHER FLAMMABLE OR TOXI…
## $ `Liquid Subtype` <chr> "LPG (LIQUEFIED PETROLEUM GAS) …
## $ `Liquid Name` <chr> NA, NA, "ETHANE", NA, NA, NA, N…
## $ `Accident City` <chr> "MCPHERSON", "RAYMOND", "SULPHE…
## $ `Accident County` <chr> "MCPHERSON", "CUMBERLAND", "CAL…
## $ `Accident State` <chr> "KS", "ME", "LA", "WI", "TX", "…
## $ `Accident Latitude` <dbl> 38.67070, 43.94028, 30.18240, 4…
## $ `Accident Longitude` <dbl> -97.78123, -70.49336, -93.35240…
## $ `Cause Category` <chr> "INCORRECT OPERATION", "MATERIA…
## $ `Cause Subcategory` <chr> "PIPELINE/EQUIPMENT OVERPRESSUR…
## $ `Unintentional Release (Barrels)` <dbl> 21.00, 0.12, 2.00, 0.48, 700.00…
## $ `Intentional Release (Barrels)` <dbl> 0.1, 0.0, 0.0, 0.0, NA, 0.0, 0.…
## $ `Liquid Recovery (Barrels)` <dbl> 0.00, 0.12, 0.00, 0.48, 698.00,…
## $ `Net Loss (Barrels)` <dbl> 21.00, 0.00, 2.00, 0.00, 2.00, …
## $ `Liquid Ignition` <chr> "NO", "NO", "NO", "NO", "NO", "…
## $ `Liquid Explosion` <chr> "NO", "NO", "NO", "NO", "NO", "…
## $ `Pipeline Shutdown` <chr> "NO", NA, NA, NA, "NO", "YES", …
## $ `Shutdown Date/Time` <chr> NA, NA, NA, NA, NA, "1/8/2010 2…
## $ `Restart Date/Time` <chr> NA, NA, NA, NA, NA, "1/13/2010 …
## $ `Public Evacuations` <dbl> NA, NA, NA, NA, NA, 0, NA, NA, …
## $ `Property Damage Costs` <dbl> 110, 4000, 0, 200, 20000, 76940…
## $ `Lost Commodity Costs` <dbl> 1517, 8, 200, 40, 150, 167775, …
## $ `Public/Private Property Damage Costs` <dbl> 0, 0, 0, 0, 0, 150000, 0, 0, 0,…
## $ `Emergency Response Costs` <dbl> 0, 0, 0, 11300, 7500, 1800000, …
## $ `Environmental Remediation Costs` <dbl> 0, 0, 0, 0, 2000, 2000000, 7000…
## $ `Other Costs` <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 1000…
## $ `All Costs` <dbl> 1627, 4008, 200, 11540, 29650, …
# 3. TABLA DE DISTRIBUCI??N DE FRECUENCIA
datos <- database_xlsx_Sheet1
TDF_agrupada <- datos %>%
filter(!is.na(`Pipeline Type`)) %>%
count(`Pipeline Type`, name = "ni") %>%
arrange(desc(ni)) %>%
mutate(Nivel = row_number())
ni_total <- sum(TDF_agrupada$ni)
TDF_agrupada$hi <- (TDF_agrupada$ni / ni_total) * 100
# Formateo de tabla final estilo kable
TDF_final <- TDF_agrupada %>%
mutate(hi = sprintf("%.2f", round(hi, 2))) %>%
bind_rows(data.frame(`Pipeline Type` = "TOTAL", ni = ni_total, hi = "100.00", Nivel = NA))
colnames(TDF_final) <- c("Tipo de Tuberia", "ni", "Nivel", "hi (%)")
kable(TDF_final, align = 'c', caption = "Tabla 1: Frecuencia por Tipo de Tuberia") %>%
kable_styling(full_width = FALSE, position = "center",
bootstrap_options = c("striped", "hover", "condensed")) %>%
row_spec(nrow(TDF_final), bold = TRUE, background = "#f2f2f2")
Tabla 1: Frecuencia por Tipo de Tuberia
|
Tipo de Tuberia
|
ni
|
Nivel
|
hi (%)
|
NA
|
|
ABOVEGROUND
|
1475
|
1
|
53.11
|
NA
|
|
UNDERGROUND
|
985
|
2
|
35.47
|
NA
|
|
TANK
|
301
|
3
|
10.84
|
NA
|
|
TRANSITION AREA
|
16
|
4
|
0.58
|
NA
|
|
NA
|
2777
|
NA
|
100.00
|
TOTAL
|
# 4. GR??FICA N??1: CANTIDAD DE EVENTOS
TDF_agrupada %>%
ggplot(aes(x = reorder(`Pipeline Type`, -ni), y = ni, fill = as.factor(Nivel))) +
geom_col(width = 0.65, color = "black") +
scale_fill_manual(values = c("#AED6F1", "#3498DB", "#2E86C1", "#154360")) +
labs(title = "Grafica N1: Cantidad de Eventos por Tipo de Tuberia",
x = "Tipo de Tuberia", y = "Cantidad") +
theme_light() +
theme(axis.text.x = element_text(angle = 35, hjust = 1), legend.position = "none")

# 5. C??LCULOS DEL MODELO GEOM??TRICO
df_geo <- TDF_agrupada
df_geo$x <- 1:nrow(df_geo)
df_geo$Real <- df_geo$ni / sum(df_geo$ni)
media_obs <- sum(df_geo$x * df_geo$ni) / sum(df_geo$ni)
p_geo <- 1 / media_obs
df_geo$Geometrica <- dgeom(df_geo$x - 1, prob = p_geo)
# 6. GR??FICA N??4: COMPARACI??N DE BARRAS (Como tu image_9fbe6e.png)
df_grafica <- df_geo %>%
select(`Pipeline Type`, Real, Geometrica) %>%
pivot_longer(cols = c("Real", "Geometrica"), names_to = "Tipo", values_to = "Probabilidad")
ggplot(df_grafica, aes(x = `Pipeline Type`, y = Probabilidad, fill = Tipo)) +
geom_col(position = "dodge", color = "black") +
scale_fill_manual(values = c("Real" = "#66CCE6", "Geometrica" = "#0072B2"),
labels = c("Real" = "Probabilidad Observada", "Geometrica" = "Modelo Geometrico")) +
labs(title = "Grafica N8: Comparacion Real vs. Modelo", x = "", y = "Probabilidad") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1), legend.position = "top")

# 7. GR??FICA N??5: EVALUACI??N VISUAL (Como tu image_9fc24d.png)
Fo1 <- df_geo$Real
Fe1 <- df_geo$Geometrica
plot(Fo1, Fe1,
main = "Grafica N9: Comparacion Observada vs. Esperada",
xlab = "Frecuencia Observada (Fo)",
ylab = "Frecuencia Esperada (Fe)",
pch = 21, bg = "white", col = "black", cex = 1.5,
xlim = c(0, 1), ylim = c(0, 1))
abline(a = 0, b = 1, col = "red", lwd = 2)
text(Fo1, Fe1, labels = df_geo$`Pipeline Type`, pos = 4, cex = 0.8)

# 8. TEST DE PEARSON FINAL
Correlacion <- cor(Fo1, Fe1) * 100
x2 <- sum(((Fo1 - Fe1)^2) / Fe1)
vc <- qchisq(0.95, df = (nrow(df_geo) - 1))
tabla_resumen <- data.frame(
Variable = "Pipeline Type",
Pearson = round(Correlacion, 2),
Chi_Cuadrado = round(x2, 2),
Umbral = round(vc, 2),
Resultado = ifelse(x2 < vc, "ACEPTADO (TRUE)", "RECHAZADO (FALSE)")
)
kable(tabla_resumen, format = "markdown", caption = "Resumen Test de Bondad de Ajuste")
Resumen Test de Bondad de Ajuste
| Pipeline Type |
94.18 |
0.11 |
7.81 |
ACEPTADO (TRUE) |