library(readxl)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(olsrr)
##
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
##
## rivers
resumen <- read_excel('Resumen-Censo-EEUU-1977.xlsx')
resumen <- resumen[1:30,]
V. dependiente:
V. predictoras cuantitativas:
datos <- data.frame(expectativa = resumen$c2, ingreso = resumen$c4, analfabetismo = resumen$c5)
attach(datos)
lm(expectativa ~ ., data = datos) -> modelo
modelo |> summary()
##
## Call:
## lm(formula = expectativa ~ ., data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.6360 -0.7919 -0.0958 0.8333 3.4537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 70.9942346 1.9271010 36.840 <2e-16 ***
## ingreso 0.0002329 0.0003673 0.634 0.5314
## analfabetismo -1.0545618 0.4034764 -2.614 0.0145 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.231 on 27 degrees of freedom
## Multiple R-squared: 0.2808, Adjusted R-squared: 0.2276
## F-statistic: 5.272 on 2 and 27 DF, p-value: 0.01167
modelo |> residuals() -> r
# Prueba de normalidad: Shapiro-Wilk
r |> shapiro.test()
##
## Shapiro-Wilk normality test
##
## data: r
## W = 0.96533, p-value = 0.4205
#plot(modelo)
summary(modelo)$sigma -> s
modelo |> model.matrix() -> X
X %*% solve(t(X) %*% X) %*% t(X) -> H
(r/(s*sqrt(1-diag(H)))) -> res_stand
res_stand |> round(3)|> abs() ->res_stand
res_stand > 2
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 27 28 29 30
## FALSE TRUE FALSE FALSE
30 -> n; 3 -> k
diag(H) > 2*k/n
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## 27 28 29 30
## FALSE FALSE FALSE FALSE
modelo |> ols_plot_resid_lev()
data.frame(leverage = diag(H)>2*k/n,
outlier = abs(modelo |> rstandard())>2)
## leverage outlier
## 1 FALSE FALSE
## 2 TRUE FALSE
## 3 FALSE FALSE
## 4 FALSE FALSE
## 5 FALSE FALSE
## 6 FALSE FALSE
## 7 FALSE FALSE
## 8 FALSE FALSE
## 9 FALSE FALSE
## 10 FALSE FALSE
## 11 FALSE TRUE
## 12 FALSE FALSE
## 13 FALSE FALSE
## 14 FALSE FALSE
## 15 FALSE FALSE
## 16 FALSE FALSE
## 17 FALSE FALSE
## 18 TRUE FALSE
## 19 FALSE FALSE
## 20 FALSE FALSE
## 21 FALSE FALSE
## 22 FALSE FALSE
## 23 FALSE FALSE
## 24 TRUE FALSE
## 25 FALSE FALSE
## 26 FALSE FALSE
## 27 FALSE FALSE
## 28 FALSE TRUE
## 29 FALSE FALSE
## 30 FALSE FALSE
modelo_sin = lm(expectativa[-1]~ingreso[-1]+analfabetismo[-1])
modelo |> coef() -> beta
modelo_sin |> coef() -> beta_sin
modelo |> model.matrix() -> X
(modelo |> sigma())**2 -> cme
(t(beta-beta_sin)%*%t(X)%*%X%*%(beta-beta_sin))/(k*cme)
## [,1]
## [1,] 0.01255555
modelo |> rstandard() -> rsta
modelo |> hatvalues() -> h
(rsta[1]**2*h[1])/(k*(1-h[1]))
## 1
## 0.01255555
modelo |> cooks.distance()|> round(4)
## 1 2 3 4 5 6 7 8 9 10 11
## 0.0126 0.5269 0.0031 0.0326 0.0070 0.0059 0.0456 0.0121 0.0001 0.0432 0.4678
## 12 13 14 15 16 17 18 19 20 21 22
## 0.0085 0.0169 0.0026 0.0203 0.0202 0.0001 0.0013 0.0246 0.0219 0.0069 0.0026
## 23 24 25 26 27 28 29 30
## 0.0358 0.1056 0.0032 0.0137 0.0247 0.1449 0.0000 0.0003
Se corrobora el valor “D1” calculado y usando la función directa. Ademas se cumple el supuesto ideal que los “Di” son menores a 1.
Para α = 40%
qf(0.40,k,n-k)
## [1] 0.6332443
(modelo |> cooks.distance()) > (qf(0.40,k,n-k))
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 27 28 29 30
## FALSE FALSE FALSE FALSE
Según la comparación ningún Di cumple que es mayor a 0.6332.
Para α = 15%
qf(0.15,k,n-k)
## [1] 0.2649987
(modelo |> cooks.distance()) > (qf(0.15,k,n-k))
## 1 2 3 4 5 6 7 8 9 10 11 12 13
## FALSE TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE TRUE FALSE FALSE
## 14 15 16 17 18 19 20 21 22 23 24 25 26
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
## 27 28 29 30
## FALSE FALSE FALSE FALSE
(abs(modelo |> dfbetas()))>= 2/sqrt(n)
## (Intercept) ingreso analfabetismo
## 1 FALSE FALSE FALSE
## 2 TRUE TRUE TRUE
## 3 FALSE FALSE FALSE
## 4 FALSE FALSE FALSE
## 5 FALSE FALSE FALSE
## 6 FALSE FALSE FALSE
## 7 FALSE FALSE FALSE
## 8 FALSE FALSE FALSE
## 9 FALSE FALSE FALSE
## 10 FALSE FALSE FALSE
## 11 TRUE TRUE TRUE
## 12 FALSE FALSE FALSE
## 13 FALSE FALSE FALSE
## 14 FALSE FALSE FALSE
## 15 FALSE FALSE FALSE
## 16 FALSE FALSE FALSE
## 17 FALSE FALSE FALSE
## 18 FALSE FALSE FALSE
## 19 FALSE FALSE FALSE
## 20 FALSE FALSE FALSE
## 21 FALSE FALSE FALSE
## 22 FALSE FALSE FALSE
## 23 FALSE FALSE FALSE
## 24 FALSE FALSE FALSE
## 25 FALSE FALSE FALSE
## 26 FALSE FALSE FALSE
## 27 FALSE FALSE FALSE
## 28 FALSE FALSE FALSE
## 29 FALSE FALSE FALSE
## 30 FALSE FALSE FALSE