Análisis de Datos mediante Regresión Lineal en R

1. Datos

operarios <- c(2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
               21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
               38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50)

tiempo <- c(45, 47, 48, 50, 52, 53, 55, 57, 58, 60, 62, 63, 65, 67, 68, 70, 72, 73,
            75, 77, 78, 80, 82, 83, 85, 87, 88, 90, 92, 93, 95, 97, 98, 100, 102,
            103, 105, 107, 108, 110, 112, 113, 115, 117, 118, 120, 122, 123, 125)

Datos <- data.frame(operarios, tiempo)

2. Gráfico de dispersión

ggplot(Datos, aes(x = operarios, y = tiempo)) +
  geom_point(size = 3) +
  geom_smooth(method = "lm", se = TRUE) +
  labs(title = "Horas de capacitación vs Productividad",
       x = "Número de operarios",
       y = "Tiempo de producción (Horas)")

## `geom_smooth()` using formula = 'y ~ x'

3. Ajuste del modelo

modelo <- lm(tiempo ~ operarios, data = Datos)
summary(modelo)

## 
## Call:
## lm(formula = tiempo ~ operarios, data = Datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3453 -0.3257  0.0000  0.3257  0.3453 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.680816   0.082199   507.1   <2e-16 ***
## operarios    1.666122   0.002777   599.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2749 on 47 degrees of freedom
## Multiple R-squared:  0.9999, Adjusted R-squared:  0.9999 
## F-statistic: 3.599e+05 on 1 and 47 DF,  p-value: < 2.2e-16

4. Intervalos de confianza

confint(modelo, level = 0.95)

##                 2.5 %   97.5 %
## (Intercept) 41.515453 41.84618
## operarios    1.660535  1.67171

5. ANOVA

anova(modelo)

## Analysis of Variance Table
## 
## Response: tiempo
##           Df  Sum Sq Mean Sq F value    Pr(>F)    
## operarios  1 27204.4 27204.4  359903 < 2.2e-16 ***
## Residuals 47     3.6     0.1                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

6. SST, SSR, SSE y R²

SST <- sum((Datos$tiempo - mean(Datos$tiempo))^2)
SSE <- sum(residuals(modelo)^2)
SSR <- SST - SSE
R2  <- SSR / SST
c(SST = SST, SSR = SSR, SSE = SSE, R2 = R2)

##          SST          SSR          SSE           R2 
## 2.720800e+04 2.720445e+04 3.552653e+00 9.998694e-01

7. Gráficos de diagnóstico

par(mfrow = c(2, 2))
plot(modelo)

par(mfrow = c(1, 1))

8. Normalidad de residuos

shapiro.test(residuals(modelo))

## 
##  Shapiro-Wilk normality test
## 
## data:  residuals(modelo)
## W = 0.81712, p-value = 2.685e-06

ad.test(residuals(modelo))

## 
##  Anderson-Darling normality test
## 
## data:  residuals(modelo)
## A = 3.3961, p-value = 1.262e-08

9. Homoscedasticidad

bptest(modelo)

## 
##  studentized Breusch-Pagan test
## 
## data:  modelo
## BP = 3.9235e-29, df = 1, p-value = 1

bptest(modelo, ~ fitted(modelo) + I(fitted(modelo)^2))

## 
##  studentized Breusch-Pagan test
## 
## data:  modelo
## BP = 0.23216, df = 2, p-value = 0.8904

10. Independencia / autocorrelación

dwtest(modelo)

## 
##  Durbin-Watson test
## 
## data:  modelo
## DW = 3.0025, p-value = 0.9999
## alternative hypothesis: true autocorrelation is greater than 0

durbinWatsonTest(modelo)

##  lag Autocorrelation D-W Statistic p-value
##    1      -0.5012755      3.002455       0
##  Alternative hypothesis: rho != 0

11. Linealidad

plot(fitted(modelo), residuals(modelo),
     xlab = "Valores ajustados", ylab = "Residuos",
     main = "Residuos vs Ajustados")
abline(h = 0, col = "red")

resettest(modelo)

## 
##  RESET test
## 
## data:  modelo
## RESET = 0.034376, df1 = 2, df2 = 45, p-value = 0.9662

12. Outliers e influencia

cooksd <- cooks.distance(modelo)
which(cooksd > (4 / nrow(Datos)))

## named integer(0)

hatvals <- hatvalues(modelo)
which(hatvals > (2 * length(coef(modelo)) / nrow(Datos)))

## named integer(0)

13. Errores estándar robustos (HC1)

coeftest(modelo, vcov. = vcovHC(modelo, type = "HC1"))

## 
## t test of coefficients:
## 
##               Estimate Std. Error t value  Pr(>|t|)    
## (Intercept) 41.6808163  0.0807612  516.10 < 2.2e-16 ***
## operarios    1.6661224  0.0027141  613.87 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

14. Bootstrap (2000 repeticiones)

boot_fn <- function(d, i) {
  ds  <- d[i, ]
  fit <- lm(tiempo ~ operarios, data = ds)
  return(coef(fit))
}
set.seed(123)
boot_res <- boot(data = Datos, statistic = boot_fn, R = 2000)
boot.ci(boot_res, type = "perc", index = 1)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_res, type = "perc", index = 1)
## 
## Intervals : 
## Level     Percentile     
## 95%   (41.51, 41.84 )  
## Calculations and Intervals on Original Scale

boot.ci(boot_res, type = "perc", index = 2)

## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_res, type = "perc", index = 2)
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.661,  1.672 )  
## Calculations and Intervals on Original Scale

15. WLS (pesos inversos)

w <- 1 / (fitted(modelo)^2)
modelo_wls <- lm(tiempo ~ operarios, data = Datos, weights = w)
summary(modelo_wls)

## 
## Call:
## lm(formula = tiempo ~ operarios, data = Datos, weights = w)
## 
## Weighted Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.0072096 -0.0029805  0.0000057  0.0032167  0.0067997 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.684737   0.065530   636.1   <2e-16 ***
## operarios    1.665953   0.002965   561.9   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.003662 on 47 degrees of freedom
## Multiple R-squared:  0.9999, Adjusted R-squared:  0.9998 
## F-statistic: 3.157e+05 on 1 and 47 DF,  p-value: < 2.2e-16

16. Modelo sin primera observación

modelo_sin1 <- lm(tiempo ~ operarios, data = Datos[-1, ])
summary(modelo_sin1)

## 
## Call:
## lm(formula = tiempo ~ operarios, data = Datos[-1, ])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34636 -0.32595  0.00058  0.32537  0.34578 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 41.682009   0.086580   481.4   <2e-16 ***
## operarios    1.666088   0.002895   575.4   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2779 on 46 degrees of freedom
## Multiple R-squared:  0.9999, Adjusted R-squared:  0.9999 
## F-statistic: 3.311e+05 on 1 and 46 DF,  p-value: < 2.2e-16

17. Regresión robusta

rlm_model <- rlm(tiempo ~ operarios, data = Datos)
summary(rlm_model)

## 
## Call: rlm(formula = tiempo ~ operarios, data = Datos)
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.3453 -0.3257  0.0000  0.3257  0.3453 
## 
## Coefficients:
##             Value    Std. Error t value 
## (Intercept)  41.6808   0.0822   507.0721
## operarios     1.6661   0.0028   599.9188
## 
## Residual standard error: 0.4829 on 47 degrees of freedom

Análisis de Datos mediante Regresión Lineal en R

Alejandro Marimon, Carlos Mendoza, Brenda Osorio, Ivan Torres, Marysol Camargo, Gabriela Vargas

2026-02-19

1. Datos

2. Gráfico de dispersión

3. Ajuste del modelo

4. Intervalos de confianza

5. ANOVA

6. SST, SSR, SSE y R²

7. Gráficos de diagnóstico

8. Normalidad de residuos

9. Homoscedasticidad

10. Independencia / autocorrelación

11. Linealidad

12. Outliers e influencia

13. Errores estándar robustos (HC1)

14. Bootstrap (2000 repeticiones)

15. WLS (pesos inversos)

16. Modelo sin primera observación

17. Regresión robusta