library(readxl)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(olsrr)
## 
## Attaching package: 'olsrr'
## The following object is masked from 'package:datasets':
## 
##     rivers
resumen <- read_excel('Resumen-Censo-EEUU-1977.xlsx')
resumen <- resumen[1:30, ]  # keep only the first 30 observations

1. Use a data set with 30 observations and 2 explanatory variables (y, x1, x2)

Dependent variable: expectativa (life expectancy).

Quantitative predictor variables: ingreso (income) and analfabetismo (illiteracy rate).

datos <- data.frame(expectativa = resumen$c2, ingreso = resumen$c4, analfabetismo = resumen$c5)
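
A quick structure check confirms the setup (a minimal sketch; the column names c2, c4 and c5 come from the spreadsheet as read above):

# Sanity check: 30 rows, one response plus two quantitative predictors
str(datos)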

2. Use standardized or studentized residuals, as appropriate, to identify possible outliers.

# Fit the full model: expectativa ~ ingreso + analfabetismo
lm(expectativa ~ ., data = datos) -> modelo
modelo |> summary()
## 
## Call:
## lm(formula = expectativa ~ ., data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.6360 -0.7919 -0.0958  0.8333  3.4537 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   70.9942346  1.9271010  36.840   <2e-16 ***
## ingreso        0.0002329  0.0003673   0.634   0.5314    
## analfabetismo -1.0545618  0.4034764  -2.614   0.0145 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.231 on 27 degrees of freedom
## Multiple R-squared:  0.2808, Adjusted R-squared:  0.2276 
## F-statistic: 5.272 on 2 and 27 DF,  p-value: 0.01167
modelo |> residuals() -> r
  
# Normality test on the residuals: Shapiro-Wilk (H0: normality)
r |> shapiro.test()
## 
##  Shapiro-Wilk normality test
## 
## data:  r
## W = 0.96533, p-value = 0.4205
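
Beyond the formal test, a QQ plot of the residuals gives a graphical normality check (a minimal sketch):

# Graphical check: points should follow the reference line if residuals are normal
qqnorm(r); qqline(r)
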
# plot(modelo)  # base R diagnostic plots
summary(modelo)$sigma -> s             # residual standard error
modelo |> model.matrix() -> X          # design matrix
X %*% solve(t(X) %*% X) %*% t(X) -> H  # hat matrix H = X(X'X)^(-1)X'
(r / (s * sqrt(1 - diag(H)))) -> res_stand  # standardized (internally studentized) residuals
res_stand |> round(3) |> abs() -> res_stand

res_stand > 2  # flag observations with |standardized residual| > 2
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    27    28    29    30 
## FALSE  TRUE FALSE FALSE
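
As a cross-check, the built-in rstandard() computes these same internally studentized residuals, so the manual values above should agree after the same rounding (a minimal sketch):

# The manual computation should match rstandard() up to rounding
all.equal(as.numeric(res_stand), abs(round(as.numeric(rstandard(modelo)), 3)))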

3. Identify the possible leverage observations

30 -> n; 3 -> k  # n = observations, k = number of model parameters (incl. intercept)
diag(H) > 2*k/n  # rule of thumb: h_ii > 2k/n flags a leverage point
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## FALSE FALSE FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE 
##    27    28    29    30 
## FALSE FALSE FALSE FALSE
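
The same flags follow from hatvalues(), which returns diag(H) from the fitted model without forming H explicitly (a minimal sketch):

# hatvalues(modelo) equals diag(H); list only the flagged observations
which(hatvalues(modelo) > 2*k/n)
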
modelo |> ols_plot_resid_lev()  # studentized residuals vs. leverage plot (olsrr)

# Side-by-side flags: leverage vs. outlier for each observation
data.frame(leverage = diag(H) > 2*k/n,
           outlier = abs(modelo |> rstandard()) > 2)
##    leverage outlier
## 1     FALSE   FALSE
## 2      TRUE   FALSE
## 3     FALSE   FALSE
## 4     FALSE   FALSE
## 5     FALSE   FALSE
## 6     FALSE   FALSE
## 7     FALSE   FALSE
## 8     FALSE   FALSE
## 9     FALSE   FALSE
## 10    FALSE   FALSE
## 11    FALSE    TRUE
## 12    FALSE   FALSE
## 13    FALSE   FALSE
## 14    FALSE   FALSE
## 15    FALSE   FALSE
## 16    FALSE   FALSE
## 17    FALSE   FALSE
## 18     TRUE   FALSE
## 19    FALSE   FALSE
## 20    FALSE   FALSE
## 21    FALSE   FALSE
## 22    FALSE   FALSE
## 23    FALSE   FALSE
## 24     TRUE   FALSE
## 25    FALSE   FALSE
## 26    FALSE   FALSE
## 27    FALSE   FALSE
## 28    FALSE    TRUE
## 29    FALSE   FALSE
## 30    FALSE   FALSE

4. Identify the possible influential values according to Cook's distance (interpret for at least 2 alpha levels)

lm(expectativa ~ ingreso + analfabetismo, data = datos[-1, ]) -> modelo_sin  # refit excluding observation 1
modelo |> coef() -> beta          # full-model coefficients
modelo_sin |> coef() -> beta_sin  # coefficients without observation 1
modelo |> model.matrix() -> X
(modelo |> sigma())**2 -> cme     # mean squared error
# Cook's distance for observation 1: D_1 = (b - b_(1))' X'X (b - b_(1)) / (k * MSE)
(t(beta - beta_sin) %*% t(X) %*% X %*% (beta - beta_sin)) / (k * cme)
##            [,1]
## [1,] 0.01255555
modelo |> rstandard() -> rsta  # standardized residuals
modelo |> hatvalues() -> h     # leverage values diag(H)
# Equivalent form: D_i = rsta_i^2 * h_i / (k * (1 - h_i))
(rsta[1]**2 * h[1]) / (k * (1 - h[1]))
##          1 
## 0.01255555
modelo |> cooks.distance() |> round(4)  # built-in Cook's distances
##      1      2      3      4      5      6      7      8      9     10     11 
## 0.0126 0.5269 0.0031 0.0326 0.0070 0.0059 0.0456 0.0121 0.0001 0.0432 0.4678 
##     12     13     14     15     16     17     18     19     20     21     22 
## 0.0085 0.0169 0.0026 0.0203 0.0202 0.0001 0.0013 0.0246 0.0219 0.0069 0.0026 
##     23     24     25     26     27     28     29     30 
## 0.0358 0.1056 0.0032 0.0137 0.0247 0.1449 0.0000 0.0003
qf(0.40, k, n-k)  # cutoff: 40th percentile of F(k, n-k)
## [1] 0.6332443
(modelo |> cooks.distance()) > (qf(0.40,k,n-k))
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    27    28    29    30 
## FALSE FALSE FALSE FALSE
qf(0.15, k, n-k)  # lower, more sensitive cutoff: 15th percentile of F(k, n-k)
## [1] 0.2649987
(modelo |> cooks.distance()) > (qf(0.15,k,n-k))
##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE  TRUE FALSE FALSE 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    27    28    29    30 
## FALSE FALSE FALSE FALSE
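
Equivalently, pf() converts each Cook's distance into its percentile under F(k, n-k); a common guideline treats values near or above the 50th percentile as strongly influential (a minimal sketch):

# Percentile of each Cook's distance under the F(k, n-k) reference distribution
pf(modelo |> cooks.distance(), k, n - k) |> round(3)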

5. Identify the possible influential values according to DFBETAS

(abs(modelo |> dfbetas())) >= 2/sqrt(n)  # rule of thumb: |DFBETAS| >= 2/sqrt(n)
##    (Intercept) ingreso analfabetismo
## 1        FALSE   FALSE         FALSE
## 2         TRUE    TRUE          TRUE
## 3        FALSE   FALSE         FALSE
## 4        FALSE   FALSE         FALSE
## 5        FALSE   FALSE         FALSE
## 6        FALSE   FALSE         FALSE
## 7        FALSE   FALSE         FALSE
## 8        FALSE   FALSE         FALSE
## 9        FALSE   FALSE         FALSE
## 10       FALSE   FALSE         FALSE
## 11        TRUE    TRUE          TRUE
## 12       FALSE   FALSE         FALSE
## 13       FALSE   FALSE         FALSE
## 14       FALSE   FALSE         FALSE
## 15       FALSE   FALSE         FALSE
## 16       FALSE   FALSE         FALSE
## 17       FALSE   FALSE         FALSE
## 18       FALSE   FALSE         FALSE
## 19       FALSE   FALSE         FALSE
## 20       FALSE   FALSE         FALSE
## 21       FALSE   FALSE         FALSE
## 22       FALSE   FALSE         FALSE
## 23       FALSE   FALSE         FALSE
## 24       FALSE   FALSE         FALSE
## 25       FALSE   FALSE         FALSE
## 26       FALSE   FALSE         FALSE
## 27       FALSE   FALSE         FALSE
## 28       FALSE   FALSE         FALSE
## 29       FALSE   FALSE         FALSE
## 30       FALSE   FALSE         FALSE
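
As a consolidated check (a minimal sketch), influence.measures() collects DFBETAS, DFFITS, covariance ratios, Cook's distances and hat values in one table, starring the observations R itself flags as influential:

# All standard influence diagnostics at once; flagged rows are starred
modelo |> influence.measures() |> summary()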