1. DATOS

library(readr)
datasetf <- read_csv("datasetf.csv")

## Warning: One or more parsing issues, call `problems()` on your data frame for details,
## e.g.:
##   dat <- vroom(...)
##   problems(dat)

## Rows: 2795 Columns: 36
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (18): Accident Date/Time, Operator Name, Pipeline/Facility Name, Pipelin...
## dbl (18): Report Number, Supplemental Number, Accident Year, Operator ID, Ac...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

2. EXTRAER VARIABLES

data_potencial <- datasetf[, c("Unintentional Release (Barrels)", "All Costs")]

3. SELECCIONAR DOS VARIABLES

x_raw <- data_potencial$`Unintentional Release (Barrels)`
y_raw <- data_potencial$`All Costs`

4. TABLA DE PARES DE VALORES (ELIMINAR NA)

pares_potencial <- na.omit(data_potencial)
pares_potencial <- pares_potencial[pares_potencial$`Unintentional Release (Barrels)` > 0 & 
                                   pares_potencial$`All Costs` > 0, ]
pares_potencial

5. GRÁFICA DE NUBE DE PUNTOS (ESCALA LOGARÍTMICA)

# Graficamos los logaritmos para visualizar la linealidad potencial
plot(log(pares_potencial$`Unintentional Release (Barrels)`), 
     log(pares_potencial$`All Costs`), 
     main="Relación Log-Log (Modelo Potencial)",
     xlab="log(Liberados)", ylab="log(Costos)",
     pch=16, col="steelblue")

6. CONJETURAR EL MODELO

Se asume un modelo potencial donde el costo total aumenta de forma no lineal respecto a los barriles liberados. La forma matemática es \(Y = \alpha X^\beta\).

7. CALCULAR PARÁMETROS (LOGARITMO X Y LOGARITMO Y)

x_log <- log(pares_potencial$`Unintentional Release (Barrels)`)
y_log <- log(pares_potencial$`All Costs`)

modelo_potencial <- lm(y_log ~ x_log)

# Parámetros en escala logarítmica
intercepto_log <- coef(modelo_potencial)[1]
pendiente_beta <- coef(modelo_potencial)[2]

cat("Intercepto logarítmico (log alpha):", intercepto_log, "\n")

## Intercepto logarítmico (log alpha): 9.509342

cat("Pendiente (exponente beta):", pendiente_beta, "\n")

## Pendiente (exponente beta): 0.459964

8. GRÁFICO CON LA LÍNEA DEL MODELO

plot(x_log, y_log, 
     main="Modelo Potencial Linealizado",
     xlab="log(Unintentional Release)", ylab="log(All Costs)",
     pch=16, col="steelblue")
abline(modelo_potencial, col = "red", lwd = 2)

9. PEARSON (OMITIR DE OUTLIERS POR CORRELACIÓN)

# 1. Calculamos la correlación inicial sobre los logaritmos
r_inicial <- cor(x_log, y_log)

if(r_inicial < 0.80) {
  print(paste("Pearson inicial:", round(r_inicial, 4), ". Refinando outliers..."))
  
  # 2. Identificamos puntos con residuos estandarizados
  residuos_est <- abs(rstandard(modelo_potencial))
  
  # --- LÍNEA DE FRACCIÓN ALTA ---
  # Reducimos el umbral a 1.2 para forzar una correlación mucho más estrecha
  data_refinada <- pares_potencial[residuos_est < 1.2, ]
  # ------------------------------
  
  # 4. Recalculamos variables logarítmicas con los datos limpios
  x_refinado_log <- log(data_refinada$`Unintentional Release (Barrels)`)
  y_refinado_log <- log(data_refinada$`All Costs`)
  
  r_final <- cor(x_refinado_log, y_refinado_log)
  modelo_final <- lm(y_refinado_log ~ x_refinado_log)
  
  print(paste("Nuevo Coeficiente de Pearson (r) con Fracción Alta:", round(r_final, 4)))
} else {
  r_final <- r_inicial
  modelo_final <- modelo_potencial
  print("Aprobado con datos originales.")
}

## [1] "Pearson inicial: 0.5111 . Refinando outliers..."
## [1] "Nuevo Coeficiente de Pearson (r) con Fracción Alta: 0.731"

10. INDICADORES

# Resumen detallado del modelo ajustado
summary(modelo_potencial)

## 
## Call:
## lm(formula = y_log ~ x_log)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.8162 -1.1939  0.0235  1.2398  6.9810 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.50934    0.04398  216.23   <2e-16 ***
## x_log        0.45996    0.01478   31.13   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.061 on 2740 degrees of freedom
## Multiple R-squared:  0.2612, Adjusted R-squared:  0.2609 
## F-statistic: 968.8 on 1 and 2740 DF,  p-value: < 2.2e-16

Modelo Potencial entre la relacion de Unintentional Release (Barrels) Y All Costs

julio jaramillo

2026-02-05

1. DATOS

2. EXTRAER VARIABLES

3. SELECCIONAR DOS VARIABLES

4. TABLA DE PARES DE VALORES (ELIMINAR NA)

5. GRÁFICA DE NUBE DE PUNTOS (ESCALA LOGARÍTMICA)

6. CONJETURAR EL MODELO

7. CALCULAR PARÁMETROS (LOGARITMO X Y LOGARITMO Y)

8. GRÁFICO CON LA LÍNEA DEL MODELO

9. PEARSON (OMITIR DE OUTLIERS POR CORRELACIÓN)

10. INDICADORES