La regresión lineal se utiliza para predecir una variable dependiente basándose en una o más variables predictoras. El objetivo es establecer una relación lineal entre la variable Y y las variables predictoras, es decir encontrar una fórmula matemática que las relacione. Luego podemos usar dicha fórmula para predecir Y para valores dados de las variables predictoras. Cuando hay una sola variable predictora se tiene un modelo de regresión simple. Asumamos que la relación está dada por:
\[ Y = \beta_{0} + \beta_{1}X + \epsilon\]
Donde \(\beta_{0}\) es el intercepto y \(\beta_{1}\) es la pendiente de dicha recta.
Utilizando el conjunto de datos cars
de r:
head(cars)
## speed dist
## 1 4 2
## 2 4 10
## 3 7 4
## 4 7 22
## 5 8 16
## 6 9 10
Previamente se debe hacer un diagrama de dispersión para ver el tipo de asociación de las dos variables.
scatter.smooth(x= cars$speed, y=cars$distance, main="speed vs distance")
qqnorm(cars$dist)
qqline(cars$dist, col=2)
Luego podemos ver un diagrama de caja de las variables:
par(mfrow=c(1, 2)) # divide el área en una matriz 1 X 2
b1 <- boxplot(cars$speed, main="Speed", col="yellow")
b2 <- boxplot(cars$dist, main="Distance", col="purple")
str(b1)
## List of 6
## $ stats: num [1:5, 1] 4 12 15 19 25
## $ n : num 50
## $ conf : num [1:2, 1] 13.4 16.6
## $ out : num(0)
## $ group: num(0)
## $ names: chr ""
b1$stats
## [,1]
## [1,] 4
## [2,] 12
## [3,] 15
## [4,] 19
## [5,] 25
b2$out
## [1] 120
reg <- lm( dist ~ speed, data = cars)
reg
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Coefficients:
## (Intercept) speed
## -17.579 3.932
plot( cars$speed, cars$dist, col="gray")
abline(reg, col = "blue")
str(reg)
## List of 12
## $ coefficients : Named num [1:2] -17.58 3.93
## ..- attr(*, "names")= chr [1:2] "(Intercept)" "speed"
## $ residuals : Named num [1:50] 3.85 11.85 -5.95 12.05 2.12 ...
## ..- attr(*, "names")= chr [1:50] "1" "2" "3" "4" ...
## $ effects : Named num [1:50] -303.914 145.552 -8.115 9.885 0.194 ...
## ..- attr(*, "names")= chr [1:50] "(Intercept)" "speed" "" "" ...
## $ rank : int 2
## $ fitted.values: Named num [1:50] -1.85 -1.85 9.95 9.95 13.88 ...
## ..- attr(*, "names")= chr [1:50] "1" "2" "3" "4" ...
## $ assign : int [1:2] 0 1
## $ qr :List of 5
## ..$ qr : num [1:50, 1:2] -7.071 0.141 0.141 0.141 0.141 ...
## .. ..- attr(*, "dimnames")=List of 2
## .. .. ..$ : chr [1:50] "1" "2" "3" "4" ...
## .. .. ..$ : chr [1:2] "(Intercept)" "speed"
## .. ..- attr(*, "assign")= int [1:2] 0 1
## ..$ qraux: num [1:2] 1.14 1.27
## ..$ pivot: int [1:2] 1 2
## ..$ tol : num 1e-07
## ..$ rank : int 2
## ..- attr(*, "class")= chr "qr"
## $ df.residual : int 48
## $ xlevels : Named list()
## $ call : language lm(formula = dist ~ speed, data = cars)
## $ terms :Classes 'terms', 'formula' language dist ~ speed
## .. ..- attr(*, "variables")= language list(dist, speed)
## .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. ..$ : chr [1:2] "dist" "speed"
## .. .. .. ..$ : chr "speed"
## .. ..- attr(*, "term.labels")= chr "speed"
## .. ..- attr(*, "order")= int 1
## .. ..- attr(*, "intercept")= int 1
## .. ..- attr(*, "response")= int 1
## .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. ..- attr(*, "predvars")= language list(dist, speed)
## .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. ..- attr(*, "names")= chr [1:2] "dist" "speed"
## $ model :'data.frame': 50 obs. of 2 variables:
## ..$ dist : num [1:50] 2 10 4 22 16 10 18 26 34 17 ...
## ..$ speed: num [1:50] 4 4 7 7 8 9 10 10 10 11 ...
## ..- attr(*, "terms")=Classes 'terms', 'formula' language dist ~ speed
## .. .. ..- attr(*, "variables")= language list(dist, speed)
## .. .. ..- attr(*, "factors")= int [1:2, 1] 0 1
## .. .. .. ..- attr(*, "dimnames")=List of 2
## .. .. .. .. ..$ : chr [1:2] "dist" "speed"
## .. .. .. .. ..$ : chr "speed"
## .. .. ..- attr(*, "term.labels")= chr "speed"
## .. .. ..- attr(*, "order")= int 1
## .. .. ..- attr(*, "intercept")= int 1
## .. .. ..- attr(*, "response")= int 1
## .. .. ..- attr(*, ".Environment")=<environment: R_GlobalEnv>
## .. .. ..- attr(*, "predvars")= language list(dist, speed)
## .. .. ..- attr(*, "dataClasses")= Named chr [1:2] "numeric" "numeric"
## .. .. .. ..- attr(*, "names")= chr [1:2] "dist" "speed"
## - attr(*, "class")= chr "lm"
summary(reg)
##
## Call:
## lm(formula = dist ~ speed, data = cars)
##
## Residuals:
## Min 1Q Median 3Q Max
## -29.069 -9.525 -2.272 9.215 43.201
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -17.5791 6.7584 -2.601 0.0123 *
## speed 3.9324 0.4155 9.464 1.49e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.38 on 48 degrees of freedom
## Multiple R-squared: 0.6511, Adjusted R-squared: 0.6438
## F-statistic: 89.57 on 1 and 48 DF, p-value: 1.49e-12
par(mfrow=c(2,2))
plot(reg)
AIC(reg)
## [1] 419.1569
BIC(reg)
## [1] 424.8929
coefficients(reg) # model coefficients
## (Intercept) speed
## -17.579095 3.932409
confint(reg, level=0.95) # intervalos de confianza para los parámetros
## 2.5 % 97.5 %
## (Intercept) -31.167850 -3.990340
## speed 3.096964 4.767853
fitted(reg) # valores predichos
## 1 2 3 4 5 6 7
## -1.849460 -1.849460 9.947766 9.947766 13.880175 17.812584 21.744993
## 8 9 10 11 12 13 14
## 21.744993 21.744993 25.677401 25.677401 29.609810 29.609810 29.609810
## 15 16 17 18 19 20 21
## 29.609810 33.542219 33.542219 33.542219 33.542219 37.474628 37.474628
## 22 23 24 25 26 27 28
## 37.474628 37.474628 41.407036 41.407036 41.407036 45.339445 45.339445
## 29 30 31 32 33 34 35
## 49.271854 49.271854 49.271854 53.204263 53.204263 53.204263 53.204263
## 36 37 38 39 40 41 42
## 57.136672 57.136672 57.136672 61.069080 61.069080 61.069080 61.069080
## 43 44 45 46 47 48 49
## 61.069080 68.933898 72.866307 76.798715 76.798715 76.798715 76.798715
## 50
## 80.731124
residuals(reg) # residuos
## 1 2 3 4 5 6
## 3.849460 11.849460 -5.947766 12.052234 2.119825 -7.812584
## 7 8 9 10 11 12
## -3.744993 4.255007 12.255007 -8.677401 2.322599 -15.609810
## 13 14 15 16 17 18
## -9.609810 -5.609810 -1.609810 -7.542219 0.457781 0.457781
## 19 20 21 22 23 24
## 12.457781 -11.474628 -1.474628 22.525372 42.525372 -21.407036
## 25 26 27 28 29 30
## -15.407036 12.592964 -13.339445 -5.339445 -17.271854 -9.271854
## 31 32 33 34 35 36
## 0.728146 -11.204263 2.795737 22.795737 30.795737 -21.136672
## 37 38 39 40 41 42
## -11.136672 10.863328 -29.069080 -13.069080 -9.069080 -5.069080
## 43 44 45 46 47 48
## 2.930920 -2.933898 -18.866307 -6.798715 15.201285 16.201285
## 49 50
## 43.201285 4.268876
anova(reg) # tabla anova
## Analysis of Variance Table
##
## Response: dist
## Df Sum Sq Mean Sq F value Pr(>F)
## speed 1 21186 21185.5 89.567 1.49e-12 ***
## Residuals 48 11354 236.5
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
vcov(reg)
## (Intercept) speed
## (Intercept) 45.676514 -2.6588234
## speed -2.658823 0.1726509
broom
El paquete broom
organiza la salida de algunos procedimientos, en este caso la regresión, en dataframes. Utiliza las funciones tidy()
, augment()
y glance()
.
library(broom)
est <- tidy(reg)
aum <- augment(reg)
res <- glance(reg)
str(est)
## Classes 'tbl_df', 'tbl' and 'data.frame': 2 obs. of 5 variables:
## $ term : chr "(Intercept)" "speed"
## $ estimate : num -17.58 3.93
## $ std.error: num 6.758 0.416
## $ statistic: num -2.6 9.46
## $ p.value : num 1.23e-02 1.49e-12
est
## # A tibble: 2 x 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 (Intercept) -17.6 6.76 -2.60 1.23e- 2
## 2 speed 3.93 0.416 9.46 1.49e-12
aum
## # A tibble: 50 x 9
## dist speed .fitted .se.fit .resid .hat .sigma .cooksd .std.resid
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2 4 -1.85 5.21 3.85 0.115 15.5 0.00459 0.266
## 2 10 4 -1.85 5.21 11.8 0.115 15.4 0.0435 0.819
## 3 4 7 9.95 4.11 -5.95 0.0715 15.5 0.00620 -0.401
## 4 22 7 9.95 4.11 12.1 0.0715 15.4 0.0255 0.813
## 5 16 8 13.9 3.77 2.12 0.0600 15.5 0.000645 0.142
## 6 10 9 17.8 3.44 -7.81 0.0499 15.5 0.00713 -0.521
## 7 18 10 21.7 3.12 -3.74 0.0413 15.5 0.00133 -0.249
## 8 26 10 21.7 3.12 4.26 0.0413 15.5 0.00172 0.283
## 9 34 10 21.7 3.12 12.3 0.0413 15.4 0.0143 0.814
## 10 17 11 25.7 2.84 -8.68 0.0341 15.5 0.00582 -0.574
## # … with 40 more rows
res
## # A tibble: 1 x 11
## r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC
## <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1 0.651 0.644 15.4 89.6 1.49e-12 2 -207. 419. 425.
## # … with 2 more variables: deviance <dbl>, df.residual <int>