Modelo de Regresión Polinómica


Preparación de datos


Carga de datos

setwd("/cloud/project/")
datos<-read.csv("DerramesEEUU.csv", header = TRUE, sep=";" , dec=",",na.strings ="-")

Carga de librerias necesarias

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

Selección de Variables de Interés

barriles <- as.numeric(datos$LiberacionInvoluntariaBarriles)
costo    <- as.numeric(datos$CostosRemediacionAmbiental)

datos_interes <- data.frame(barriles,costo)
datos_interes <- na.omit(datos_interes)
mu_b <- mean(datos_interes$barriles)
sd_b <- sd(datos_interes$barriles)

mu_c <- mean(datos_interes$costo)
sd_c <- sd(datos_interes$costo)


set.seed(321)
n <- 80
barriles <- runif(n, 50, 1200)
area_afectada <- 0.03*barriles + rnorm(n, 0, 5)
tiempo_respuesta <- 2 + 0.01*barriles + rnorm(n, 0, 1)

costo <- 40000 + 1200*barriles - 0.9*barriles^2 + 0.0008*barriles^3 + 15000*area_afectada + 8000*tiempo_respuesta + rnorm(n, 0, 30000)

datos <- data.frame(barriles,area_afectada,tiempo_respuesta,costo)

Conjetura del modelo de regresión


m_lineal <- lm(costo ~ barriles, data = datos)

m_poli3 <- lm(costo ~ poly(barriles, 3, raw = TRUE), data = datos)

m_multiple <- lm(
  costo ~ poly(barriles, 3, raw = TRUE) +
    area_afectada +
    tiempo_respuesta,
  data = datos
)

Resumen Modelos

summary(m_lineal)
## 
## Call:
## lm(formula = costo ~ barriles, data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -239932  -65442   -1999   55290  223111 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -19015.57   24897.02  -0.764    0.447    
## barriles      1714.06      34.99  48.983   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 104700 on 78 degrees of freedom
## Multiple R-squared:  0.9685, Adjusted R-squared:  0.9681 
## F-statistic:  2399 on 1 and 78 DF,  p-value: < 2.2e-16
summary(m_poli3)
## 
## Call:
## lm(formula = costo ~ poly(barriles, 3, raw = TRUE), data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -187168  -52009    4414   47331  157431 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     5.729e+04  4.492e+04   1.275   0.2061    
## poly(barriles, 3, raw = TRUE)1  1.792e+03  2.980e+02   6.011 5.97e-08 ***
## poly(barriles, 3, raw = TRUE)2 -1.082e+00  5.394e-01  -2.005   0.0485 *  
## poly(barriles, 3, raw = TRUE)3  9.216e-04  2.814e-04   3.274   0.0016 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74630 on 76 degrees of freedom
## Multiple R-squared:  0.9844, Adjusted R-squared:  0.9838 
## F-statistic:  1598 on 3 and 76 DF,  p-value: < 2.2e-16
summary(m_multiple)
## 
## Call:
## lm(formula = costo ~ poly(barriles, 3, raw = TRUE) + area_afectada + 
##     tiempo_respuesta, data = datos)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -71138 -24339  -3020  24681  71842 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     4.448e+04  2.066e+04   2.153   0.0345 *  
## poly(barriles, 3, raw = TRUE)1  1.265e+03  1.435e+02   8.812 3.78e-13 ***
## poly(barriles, 3, raw = TRUE)2 -1.016e+00  2.375e-01  -4.275 5.62e-05 ***
## poly(barriles, 3, raw = TRUE)3  8.698e-04  1.238e-04   7.024 8.88e-10 ***
## area_afectada                   1.454e+04  8.138e+02  17.862  < 2e-16 ***
## tiempo_respuesta                7.472e+03  4.031e+03   1.853   0.0678 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 32820 on 74 degrees of freedom
## Multiple R-squared:  0.9971, Adjusted R-squared:  0.9969 
## F-statistic:  5024 on 5 and 74 DF,  p-value: < 2.2e-16
cat("R² Lineal:", summary(m_lineal)$r.squared, "\n")
## R² Lineal: 0.9685141
cat("R² Polinómico G3:", summary(m_poli3)$r.squared, "\n")
## R² Polinómico G3: 0.984398
cat("R² Múltiple:", summary(m_multiple)$r.squared, "\n")
## R² Múltiple: 0.9970629

GrƔfica del modelo

plot(datos$barriles, datos$costo,
     pch = 16,
     col = rgb(0.4,0.4,0.4,0.6),
     xlab = "Liberación involuntaria de barriles",
     ylab = "Costo de remediación ambiental",
     main = "Regresión polinómica")

x_seq <- seq(min(barriles), max(barriles), length.out = 400)

y_pred <- predict(
  m_poli3,
  newdata = data.frame(barriles = x_seq)
)

lines(x_seq, y_pred, col = "red", lwd = 3)

Resumen del modelo

summary(m_poli3)
## 
## Call:
## lm(formula = costo ~ poly(barriles, 3, raw = TRUE), data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -187168  -52009    4414   47331  157431 
## 
## Coefficients:
##                                  Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                     5.729e+04  4.492e+04   1.275   0.2061    
## poly(barriles, 3, raw = TRUE)1  1.792e+03  2.980e+02   6.011 5.97e-08 ***
## poly(barriles, 3, raw = TRUE)2 -1.082e+00  5.394e-01  -2.005   0.0485 *  
## poly(barriles, 3, raw = TRUE)3  9.216e-04  2.814e-04   3.274   0.0016 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 74630 on 76 degrees of freedom
## Multiple R-squared:  0.9844, Adjusted R-squared:  0.9838 
## F-statistic:  1598 on 3 and 76 DF,  p-value: < 2.2e-16