library(car)

library(skimr)

library(ggplot2)

library(ggcorrplot)

library(tidyverse)

library(texreg)

library(prediction)

library(lmtest)

library(sandwich)

library(miceadds)

library(gridExtra)

library(ggpubr)

library(pastecs)

library(olsrr)

library(coefplot)

library(stargazer)

library(psych)

library(modelsummary)

1) Setting up the regression with matrices

# create the data set
x0 <- c(1,1,1,1,1,1,1,1,1,1)
x1 <- c(5,25,45,35,42,33,20,55,17,8)
x2 <- c(15,35,9,7,6,6,4,10,3,2)
y <- c(10,50,90,70,85,60,40,110,35,15)

data <- data.frame(y,x0,x1,x2)
data
##      y x0 x1 x2
## 1   10  1  5 15
## 2   50  1 25 35
## 3   90  1 45  9
## 4   70  1 35  7
## 5   85  1 42  6
## 6   60  1 33  6
## 7   40  1 20  4
## 8  110  1 55 10
## 9   35  1 17  3
## 10  15  1  8  2
# create the design matrix of regressors X (here named k)
k <- data.matrix(data[,c("x0","x1","x2")])
k
##       x0 x1 x2
##  [1,]  1  5 15
##  [2,]  1 25 35
##  [3,]  1 45  9
##  [4,]  1 35  7
##  [5,]  1 42  6
##  [6,]  1 33  6
##  [7,]  1 20  4
##  [8,]  1 55 10
##  [9,]  1 17  3
## [10,]  1  8  2
# create the Y matrix
Y <- data.matrix(data[,c("y")])
Y
##       [,1]
##  [1,]   10
##  [2,]   50
##  [3,]   90
##  [4,]   70
##  [5,]   85
##  [6,]   60
##  [7,]   40
##  [8,]  110
##  [9,]   35
## [10,]   15
# compute the transpose of X
TRx <- t(k)
TRx
##    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## x0    1    1    1    1    1    1    1    1    1     1
## x1    5   25   45   35   42   33   20   55   17     8
## x2   15   35    9    7    6    6    4   10    3     2
# compute X'X (transpose of X times X)
TRX_x_X = TRx %*% k
TRX_x_X
##     x0    x1   x2
## x0  10   285   97
## x1 285 10531 2747
## x2  97  2747 1781
# invert X'X
invtrXxX = solve(TRX_x_X)
invtrXxX # matrix 1
##             x0            x1            x2
## x0  0.55409237 -0.0119187894 -0.0117945230
## x1 -0.01191879  0.0004152590  0.0000086502
## x2 -0.01179452  0.0000086502  0.0011905147
# multiply the transpose of X by the Y matrix
trXxY = TRx %*% Y
trXxY # matrix 2
##     [,1]
## x0   565
## x1 20915
## x2  5465
# multiply matrix 1 by matrix 2 to obtain the estimated betas
betas_estimados = invtrXxX %*% trXxY
betas_estimados
##           [,1]
## x0 -0.67635865
## x1  1.99830001
## x2  0.02317611
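As a quick sanity check, lm() should reproduce the same coefficients (a minimal sketch; fit is a throwaway name):
# verify the matrix-algebra betas against lm()
fit <- lm(y ~ x1 + x2, data = data)
coef(fit)  # should match betas_estimados: -0.676, 1.998, 0.023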
# multiply the X matrix by the estimated betas to obtain the fitted values (Y-hat)
Y_gorro <- k %*% betas_estimados
Y_gorro
##             [,1]
##  [1,]   9.662783
##  [2,]  50.092306
##  [3,]  89.455727
##  [4,]  69.426375
##  [5,]  83.391299
##  [6,]  65.406598
##  [7,]  39.382346
##  [8,] 109.461903
##  [9,]  33.364270
## [10,]  15.356394
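The fitted values can also be checked against lm() (sketch):
# fitted() should reproduce Y_gorro
fitted(lm(y ~ x1 + x2, data = data))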
# compute the estimated residuals
error <- Y-Y_gorro
error
##              [,1]
##  [1,]  0.33721693
##  [2,] -0.09230554
##  [3,]  0.54427306
##  [4,]  0.57362541
##  [5,]  1.60870143
##  [6,] -5.40659845
##  [7,]  0.61765394
##  [8,]  0.53809682
##  [9,]  1.63573009
## [10,] -0.35639368
# transpose of the residual vector
Tr_ee <- t(error)
Tr_ee
##           [,1]        [,2]      [,3]      [,4]     [,5]      [,6]      [,7]
## [1,] 0.3372169 -0.09230554 0.5442731 0.5736254 1.608701 -5.406598 0.6176539
##           [,8]    [,9]      [,10]
## [1,] 0.5380968 1.63573 -0.3563937
#multiplicamos "transpuesta del error" * "erro"
Tr_ee_x_ee <- Tr_ee %*% error
Tr_ee_x_ee
##          [,1]
## [1,] 36.04042
# constant for the variance estimator: 1/(n - k) = 1/(10 - 3)
a <- 1/(10-3)
a
## [1] 0.1428571
# multiply the constant by e'e to obtain the residual variance
Varianza <- a* Tr_ee_x_ee
Varianza
##          [,1]
## [1,] 5.148631
# double-check the variance
Varianza_modelo <- sum(error^2)/(10-3)
Varianza_modelo
## [1] 5.148631
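The residual standard error reported by lm() squares to the same value, which gives a one-line check (sketch):
# residual variance via lm: sigma()^2 = e'e / (n - k)
sigma(lm(y ~ x1 + x2, data = data))^2  # approx. 5.1486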
# variance-covariance matrix of the betas
V_b <- Varianza_modelo*invtrXxX
V_b
##             x0             x1             x2
## x0  2.85281707 -0.06136544648 -0.06072564516
## x1 -0.06136545  0.00213801546  0.00004453669
## x2 -0.06072565  0.00004453669  0.00612952058
# standard errors of the betas: square roots of the diagonal
errores_betas <- sqrt(diag(V_b))
errores_betas
##         x0         x1         x2 
## 1.68902844 0.04623868 0.07829125
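These standard errors should coincide with the diagonal of vcov() on the lm fit (sketch):
# standard errors of the betas via vcov()
sqrt(diag(vcov(lm(y ~ x1 + x2, data = data))))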
# compute the mean of y
media_y <- mean(y)
media_y
## [1] 56.5
# the intercept column, written as a matrix
I <- data.matrix(data[,c("x0")])
I
##       [,1]
##  [1,]    1
##  [2,]    1
##  [3,]    1
##  [4,]    1
##  [5,]    1
##  [6,]    1
##  [7,]    1
##  [8,]    1
##  [9,]    1
## [10,]    1
# multiply the mean of y by I
IxY <- I*media_y
IxY
##       [,1]
##  [1,] 56.5
##  [2,] 56.5
##  [3,] 56.5
##  [4,] 56.5
##  [5,] 56.5
##  [6,] 56.5
##  [7,] 56.5
##  [8,] 56.5
##  [9,] 56.5
## [10,] 56.5
# subtract the mean of y times I from the Y matrix
Y_IxY <- Y-IxY
Y_IxY # matrix 3
##        [,1]
##  [1,] -46.5
##  [2,]  -6.5
##  [3,]  33.5
##  [4,]  13.5
##  [5,]  28.5
##  [6,]   3.5
##  [7,] -16.5
##  [8,]  53.5
##  [9,] -21.5
## [10,] -41.5
# transpose of matrix 3
TR_Y_IxY <- t(Y_IxY)
TR_Y_IxY # matrix 4
##       [,1] [,2] [,3] [,4] [,5] [,6]  [,7] [,8]  [,9] [,10]
## [1,] -46.5 -6.5 33.5 13.5 28.5  3.5 -16.5 53.5 -21.5 -41.5
# multiply matrix 4 by matrix 3 (total variation)
Variacion_total <- TR_Y_IxY %*% Y_IxY
Variacion_total
##        [,1]
## [1,] 9652.5
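Since the total variation is just (n - 1) times the sample variance of y, a one-line check is (sketch):
# total sum of squares = (n - 1) * var(y)
(10 - 1) * var(y)  # 9652.5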
# compute the explained variation
Variacion_explicada <- sum((Y_gorro-IxY)^2)
Variacion_explicada
## [1] 9616.46
# compute R squared
R_2 <- Variacion_explicada/Variacion_total
R_2
##           [,1]
## [1,] 0.9962662
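The same R squared comes straight out of the lm summary (sketch):
# R squared from lm, for comparison
summary(lm(y ~ x1 + x2, data = data))$r.squared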

2) Setting up the regression with commands

a) Analyze the data

# inspect the variables
# note: the data set in this example is called "data"
skimr::skim(data)
Data summary
Name                       data
Number of rows             10
Number of columns          4
Column type frequency:
  numeric                  4
Group variables            None

Variable type: numeric

skim_variable  n_missing  complete_rate  mean     sd  p0    p25   p50    p75  p100  hist
y                      0              1  56.5  32.75  10  36.25  55.0  81.25   110  ▅▇▅▅▂
x0                     0              1   1.0   0.00   1   1.00   1.0   1.00     1  ▁▁▇▁▁
x1                     0              1  28.5  16.36   5  17.75  29.0  40.25    55  ▅▇▅▅▂
x2                     0              1   9.7   9.66   2   4.50   6.5   9.75    35  ▇▃▁▁▁
stat.desc(data)
##                         y x0          x1         x2
## nbr.val        10.0000000 10  10.0000000 10.0000000
## nbr.null        0.0000000  0   0.0000000  0.0000000
## nbr.na          0.0000000  0   0.0000000  0.0000000
## min            10.0000000  1   5.0000000  2.0000000
## max           110.0000000  1  55.0000000 35.0000000
## range         100.0000000  0  50.0000000 33.0000000
## sum           565.0000000 10 285.0000000 97.0000000
## median         55.0000000  1  29.0000000  6.5000000
## mean           56.5000000  1  28.5000000  9.7000000
## SE.mean        10.3561576  0   5.1731143  3.0552323
## CI.mean.0.95   23.4272561  0  11.7023975  6.9114156
## var          1072.5000000  0 267.6111111 93.3444444
## std.dev        32.7490458  0  16.3588236  9.6614929
## coef.var        0.5796291  0   0.5739938  0.9960302
  • Correlation matrix of the variables
# correlation with full decimals
cor(data)
## Warning in cor(data): the standard deviation is zero
##             y x0          x1          x2
## y   1.0000000 NA  0.99810794 -0.00544310
## x0         NA  1          NA          NA
## x1  0.9981079 NA  1.00000000 -0.01230267
## x2 -0.0054431 NA -0.01230267  1.00000000
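The warning and the NAs come from x0, which is constant (zero standard deviation). Dropping that column gives a clean matrix (sketch):
# exclude the constant column x0 before correlating
cor(data[, c("y", "x1", "x2")])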
# plot of the correlation matrix
corPlot(data, cex = 1.2, main = "Correlation matrix")
## Warning in cor(x, use = use, method = method): the standard deviation is zero

  • Scatter plot analysis
ggplot(data, aes(x1, y)) +
  geom_point() +
  labs(x = "Explanatory variable", y = "Dependent variable",
       caption = "Source: class example")
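To see the linear fit on the same plot, an OLS line can be overlaid (sketch using geom_smooth):
ggplot(data, aes(x1, y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +  # overlay the fitted OLS line
  labs(x = "Explanatory variable", y = "Dependent variable",
       caption = "Source: class example")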

b) Specify the model

  • Fit the regressions
ols <- lm(y ~ x1 + x2 , data = data) # regression 1
ols2 <- lm(y ~ x1 , data = data) # regression 2
  • Summary of regression 1
ols # option 1: print the coefficients
## 
## Call:
## lm(formula = y ~ x1 + x2, data = data)
## 
## Coefficients:
## (Intercept)           x1           x2  
##    -0.67636      1.99830      0.02318
summary(ols) # option 2: full summary
## 
## Call:
## lm(formula = y ~ x1 + x2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4066  0.0151  0.5412  0.6066  1.6357 
## 
## Coefficients:
##             Estimate Std. Error t value       Pr(>|t|)    
## (Intercept) -0.67636    1.68903  -0.400          0.701    
## x1           1.99830    0.04624  43.217 0.000000000927 ***
## x2           0.02318    0.07829   0.296          0.776    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.269 on 7 degrees of freedom
## Multiple R-squared:  0.9963, Adjusted R-squared:  0.9952 
## F-statistic: 933.9 on 2 and 7 DF,  p-value: 0.000000003181
stargazer(ols, type = "text") # option 3: formatted table
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                  y             
## -----------------------------------------------
## x1                           1.998***          
##                               (0.046)          
##                                                
## x2                             0.023           
##                               (0.078)          
##                                                
## Constant                      -0.676           
##                               (1.689)          
##                                                
## -----------------------------------------------
## Observations                    10             
## R2                             0.996           
## Adjusted R2                    0.995           
## Residual Std. Error       2.269 (df = 7)       
## F Statistic           933.885*** (df = 2; 7)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
  • Summary of regression 2
ols2 # option 1: print the coefficients
## 
## Call:
## lm(formula = y ~ x1, data = data)
## 
## Coefficients:
## (Intercept)           x1  
##     -0.4468       1.9981
summary(ols2) # option 2: full summary
## 
## Call:
## lm(formula = y ~ x1, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4916  0.4631  0.5028  0.5448  1.5252 
## 
## Coefficients:
##             Estimate Std. Error t value        Pr(>|t|)    
## (Intercept) -0.44675    1.41225  -0.316            0.76    
## x1           1.99813    0.04352  45.914 0.0000000000559 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.136 on 8 degrees of freedom
## Multiple R-squared:  0.9962, Adjusted R-squared:  0.9957 
## F-statistic:  2108 on 1 and 8 DF,  p-value: 0.00000000005594
stargazer(ols2, type = "text") # option 3: formatted table
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                  y             
## -----------------------------------------------
## x1                           1.998***          
##                               (0.044)          
##                                                
## Constant                      -0.447           
##                               (1.412)          
##                                                
## -----------------------------------------------
## Observations                    10             
## R2                             0.996           
## Adjusted R2                    0.996           
## Residual Std. Error       2.136 (df = 8)       
## F Statistic          2,108.104*** (df = 1; 8)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
  • Model comparison
# note: the simple regression is ols2 (the original listed ols twice, which duplicated the columns)
m_list <- list(Regresion_Simple = ols2, Regresion_Multiple = ols)
msummary(m_list)
             Regresion_Simple  Regresion_Multiple
(Intercept)  -0.447            -0.676
             (1.412)           (1.689)
x1           1.998             1.998
             (0.044)           (0.046)
x2                             0.023
                               (0.078)
Num.Obs.     10                10
R2           0.996             0.996
R2 Adj.      0.996             0.995
AIC          47.3              49.2
BIC          48.2              50.4
Log.Lik.     -20.662           -20.600
F            2108.104          933.885
RMSE         1.91              1.90
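Since texreg is already loaded, screenreg() gives an alternative console comparison of the two models (sketch):
# side-by-side comparison with texreg
screenreg(list(ols2, ols), custom.model.names = c("Simple", "Multiple"))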

c) Diagnostics and graphical representation

ols_test_breusch_pagan(ols) # test for heteroskedasticity; if H0 (constant variance) is rejected, there is heteroskedasticity
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##             Data              
##  -----------------------------
##  Response : y 
##  Variables: fitted values of y 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    0.3144014 
##  Prob > Chi2   =    0.5749918
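The same hypothesis can be tested with bptest() from lmtest (by default it uses the model regressors rather than the fitted values, so the statistic may differ slightly); if heteroskedasticity were detected, robust standard errors from sandwich would be the usual remedy (sketch):
# Breusch-Pagan test via lmtest
bptest(ols)
# heteroskedasticity-robust (HC1) standard errors, if needed
coeftest(ols, vcov = vcovHC(ols, type = "HC1"))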
ols_coll_diag(ols) # collinearity diagnostics: tolerance, VIF, and condition indices
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##   Variables Tolerance      VIF
## 1        x1 0.9998486 1.000151
## 2        x2 0.9998486 1.000151
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##   Eigenvalue Condition Index  intercept         x1         x2
## 1  2.4975532        1.000000 0.02645886 0.03122997 0.05528154
## 2  0.3923799        2.522923 0.02563905 0.15600789 0.81320758
## 3  0.1100669        4.763530 0.94790209 0.81276214 0.13151088
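Since car is loaded, vif() returns the same variance inflation factors; values near 1 indicate no collinearity problem (sketch):
# VIFs via car, for comparison with ols_coll_diag()
vif(ols)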
ols_plot_resid_qq(ols) # QQ plot of the residuals; points should lie close to the reference line

ols_plot_resid_lev(ols) # residuals vs leverage; checks whether observations fall inside the threshold bands

ols_step_both_aic(ols) # stepwise selection using the Akaike information criterion (AIC) to compare models
## 
## 
##                                Stepwise Summary                               
## ----------------------------------------------------------------------------
## Variable     Method      AIC       RSS       Sum Sq      R-Sq      Adj. R-Sq 
## ----------------------------------------------------------------------------
## x1          addition    47.324    36.492    9616.008    0.99622      0.99575 
## ----------------------------------------------------------------------------
ols_test_normality(ols) # battery of normality tests; inspect each statistic and its p-value
## -----------------------------------------------
##        Test             Statistic       pvalue  
## -----------------------------------------------
## Shapiro-Wilk              0.6576         0.0003 
## Kolmogorov-Smirnov        0.3293         0.1818 
## Cramer-von Mises          0.1521         0.3881 
## Anderson-Darling          1.4495         0.0004 
## -----------------------------------------------
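The Shapiro-Wilk result can be reproduced with base R on the residuals (sketch):
# base-R Shapiro-Wilk test on the regression residuals
shapiro.test(residuals(ols))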
ols_plot_resid_hist(ols) # histogram of the regression residuals against the fitted distribution

ols_plot_resid_stand(ols) # standardized residuals; flags outliers and shows whether they cluster in a particular range

ols_plot_resid_fit(ols) # residuals vs fitted values; detects nonlinearity, unequal error variances, and outliers

ols_plot_cooksd_chart(ols) # Cook's distance chart; identifies observations that are influential on the fit

ols_plot_dfbetas(ols) # panel of plots to detect influential observations using DFBETAs

ols_plot_diagnostics(ols) #Panel of plots for regression diagnostics.

ols_step_both_p(ols) # builds the model by entering and removing candidate predictors based on p-values
## 
##                             Stepwise Selection Summary                              
## -----------------------------------------------------------------------------------
##                      Added/                   Adj.                                     
## Step    Variable    Removed     R-Square    R-Square     C(p)       AIC       RMSE     
## -----------------------------------------------------------------------------------
##    1       x1       addition       0.996       0.996    1.0880    47.3237    2.1358    
## -----------------------------------------------------------------------------------
coefplot(ols, title = "ols Regression") # plots the estimated coefficients with their confidence intervals