library(readxl)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(olsrr)

## 
## Attaching package: 'olsrr'

## The following object is masked from 'package:datasets':
## 
##     rivers

# Read the data set from the Excel workbook and preview its first 10 rows.
# attach() is deliberately not called: the model below receives the data
# through `data = datos`, so putting the columns on the search path would
# only risk masking (or being masked by) other objects named y, x1, x2, x3.
datos <- read_excel("datos3.xlsx")
head(datos, 10)

## # A tibble: 10 × 4
##        y    x1    x2    x3
##    <dbl> <dbl> <dbl> <dbl>
##  1  24.2     8  6.56     8
##  2  24       9  6.56     7
##  3  21.9    11  7.42    10
##  4  24.2     8  7.8      8
##  5  21.2    10  5.09     7
##  6  23.5    10  5.3      9
##  7  20.6    10  2.98     7
##  8  23.4    12  2.99     7
##  9  22.3     8  6.72     8
## 10  25.1    13  6.51     7

# Regress y on every other column of datos, then print the coefficient
# table together with the overall fit statistics.
modelo <- lm(y ~ ., data = datos)
summary(modelo)

## 
## Call:
## lm(formula = y ~ ., data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1091 -0.3640  0.0634  0.8775  2.1345 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  9.62063    2.31786   4.151  0.00024 ***
## x1           0.78425    0.14206   5.521 4.83e-06 ***
## x2           0.97364    0.12963   7.511 1.83e-08 ***
## x3           0.05373    0.22669   0.237  0.81420    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.266 on 31 degrees of freedom
## Multiple R-squared:  0.715,  Adjusted R-squared:  0.6874 
## F-statistic: 25.93 on 3 and 31 DF,  p-value: 1.38e-08

# Ordinary residuals of the fitted model, kept for the normality test.
r <- residuals(modelo)

Prueba de normalidad (Shapiro-Wilk). H0: los errores se distribuyen normalmente. H1: los errores no se distribuyen normalmente.

Nivel de significancia: 5%

# Shapiro-Wilk normality test applied to the model residuals.
shapiro.test(r)

## 
##  Shapiro-Wilk normality test
## 
## data:  r
## W = 0.91712, p-value = 0.01179

Decisión: como el p-valor (0.01179) es menor que el nivel de significancia (0.05), se rechaza H0.
Conclusión: los errores no se distribuyen normalmente.

Identificando los posibles outliers

# Flag observations whose externally studentized residual exceeds 2 in
# absolute value. The comparison uses the unrounded values: rounding before
# thresholding could misclassify a residual that lies just above the cutoff.
abs(rstudent(modelo)) > 2

##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    27    28    29    30    31    32    33    34    35 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# Plot the studentized residuals of the fitted model (olsrr).
ols_plot_resid_stud(modelo)

Por lo tanto, la observación 3 se puede considerar un outlier (su residuo estudentizado supera 2 en valor absoluto); esto se comprueba tanto numérica como gráficamente.

Identificando las posibles observaciones leverage

# Design matrix of the fitted model (intercept column included).
X <- model.matrix(modelo)

# Hat matrix H = X (X'X)^{-1} X'. Solving the system X'X B = X' directly
# avoids forming the explicit inverse of X'X.
H <- X %*% solve(crossprod(X), t(X))

# Take the number of observations and of estimated coefficients from the
# design matrix itself, so both always agree with the dimensions of H
# instead of relying on a hard-coded coefficient count.
n <- nrow(X)
k <- ncol(X)

# Flag observations whose leverage (diagonal of H) exceeds the 2k/n cutoff.
diag(H) > 2 * k / n

##     1     2     3     4     5     6     7     8     9    10    11    12    13 
## FALSE FALSE  TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    14    15    16    17    18    19    20    21    22    23    24    25    26 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE 
##    27    28    29    30    31    32    33    34    35 
## FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE

# Plot studentized residuals against leverage for the fitted model (olsrr).
ols_plot_resid_lev(modelo)

El valor de la observación 3 es TRUE; según el criterio $h_{ii} > 2k/n$, se puede considerar una observación con leverage alto.

PC3_preg4

Luis Salsavilca

2022-07-14

Identificando los posibles outliers

Identificando las posibles observaciones leverage