library(car)

library(skimr)

library(ggplot2)

library(ggcorrplot)

library(tidyverse)

library(texreg)

library(prediction)

library(lmtest)

library(sandwich)

library(miceadds)

library(gridExtra)

library(ggpubr)

library(pastecs)

library(olsrr)

library(coefplot)

library(stargazer)

library(psych)

library(modelsummary)

1) Setting up the regression with matrices

# create the data set
x0 <- c(1,1,1,1,1,1,1,1,1,1)
x1 <- c(5,25,45,35,42,33,20,55,17,8)
x2 <- c(15,35,9,7,6,6,4,10,3,2)
y <- c(10,50,90,70,85,60,40,110,35,15)

data <- data.frame(y,x0,x1,x2)
data
##      y x0 x1 x2
## 1   10  1  5 15
## 2   50  1 25 35
## 3   90  1 45  9
## 4   70  1 35  7
## 5   85  1 42  6
## 6   60  1 33  6
## 7   40  1 20  4
## 8  110  1 55 10
## 9   35  1 17  3
## 10  15  1  8  2
# create the design matrix of regressors X (here named k)
k <- data.matrix(data[,c("x0","x1","x2")])
k
##       x0 x1 x2
##  [1,]  1  5 15
##  [2,]  1 25 35
##  [3,]  1 45  9
##  [4,]  1 35  7
##  [5,]  1 42  6
##  [6,]  1 33  6
##  [7,]  1 20  4
##  [8,]  1 55 10
##  [9,]  1 17  3
## [10,]  1  8  2
# create the Y matrix
Y <- data.matrix(data[,c("y")])
Y
##       [,1]
##  [1,]   10
##  [2,]   50
##  [3,]   90
##  [4,]   70
##  [5,]   85
##  [6,]   60
##  [7,]   40
##  [8,]  110
##  [9,]   35
## [10,]   15
# compute the transpose of X
TRx <- t(k)
TRx
##    [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## x0    1    1    1    1    1    1    1    1    1     1
## x1    5   25   45   35   42   33   20   55   17     8
## x2   15   35    9    7    6    6    4   10    3     2
# compute X'X (transpose of X times X)
TRX_x_X = TRx %*% k
TRX_x_X
##     x0    x1   x2
## x0  10   285   97
## x1 285 10531 2747
## x2  97  2747 1781
# invert X'X
invtrXxX = solve(TRX_x_X)
invtrXxX # matrix 1
##             x0            x1            x2
## x0  0.55409237 -0.0119187894 -0.0117945230
## x1 -0.01191879  0.0004152590  0.0000086502
## x2 -0.01179452  0.0000086502  0.0011905147
# multiply the transpose of X by the Y matrix
trXxY = TRx %*% Y
trXxY # matrix 2
##     [,1]
## x0   565
## x1 20915
## x2  5465
# multiply matrix 1 by matrix 2 to obtain the estimated betas
betas_estimados = invtrXxX %*% trXxY
betas_estimados
##           [,1]
## x0 -0.67635865
## x1  1.99830001
## x2  0.02317611
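As a quick sanity check, lm() should reproduce the same coefficients (a minimal sketch; fit is a throwaway name):
# verify the matrix-algebra betas against lm()
fit <- lm(y ~ x1 + x2, data = data)
coef(fit)  # should match betas_estimados: -0.676, 1.998, 0.023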
# multiply the X matrix by the estimated betas to obtain the fitted values (Y-hat)
Y_gorro <- k %*% betas_estimados
Y_gorro
##             [,1]
##  [1,]   9.662783
##  [2,]  50.092306
##  [3,]  89.455727
##  [4,]  69.426375
##  [5,]  83.391299
##  [6,]  65.406598
##  [7,]  39.382346
##  [8,] 109.461903
##  [9,]  33.364270
## [10,]  15.356394
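The fitted values can also be checked against lm() (sketch):
# fitted() should reproduce Y_gorro
fitted(lm(y ~ x1 + x2, data = data))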
# compute the estimated residuals
error <- Y-Y_gorro
error
##              [,1]
##  [1,]  0.33721693
##  [2,] -0.09230554
##  [3,]  0.54427306
##  [4,]  0.57362541
##  [5,]  1.60870143
##  [6,] -5.40659845
##  [7,]  0.61765394
##  [8,]  0.53809682
##  [9,]  1.63573009
## [10,] -0.35639368
# transpose of the residual vector
Tr_ee <- t(error)
Tr_ee
##           [,1]        [,2]      [,3]      [,4]     [,5]      [,6]      [,7]
## [1,] 0.3372169 -0.09230554 0.5442731 0.5736254 1.608701 -5.406598 0.6176539
##           [,8]    [,9]      [,10]
## [1,] 0.5380968 1.63573 -0.3563937
#multiplicamos "transpuesta del error" * "erro"
Tr_ee_x_ee <- Tr_ee %*% error
Tr_ee_x_ee
##          [,1]
## [1,] 36.04042
# constant for the variance estimator: 1/(n - k) = 1/(10 - 3)
a <- 1/(10-3)
a
## [1] 0.1428571
# multiply the constant by e'e to obtain the residual variance
Varianza <- a* Tr_ee_x_ee
Varianza
##          [,1]
## [1,] 5.148631
# double-check the variance
Varianza_modelo <- sum(error^2)/(10-3)
Varianza_modelo
## [1] 5.148631
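The residual standard error reported by lm() squares to the same value, which gives a one-line check (sketch):
# residual variance via lm: sigma()^2 = e'e / (n - k)
sigma(lm(y ~ x1 + x2, data = data))^2  # approx. 5.1486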
# variance-covariance matrix of the betas
V_b <- Varianza_modelo*invtrXxX
V_b
##             x0             x1             x2
## x0  2.85281707 -0.06136544648 -0.06072564516
## x1 -0.06136545  0.00213801546  0.00004453669
## x2 -0.06072565  0.00004453669  0.00612952058
# standard errors of the betas: square roots of the diagonal
errores_betas <- sqrt(diag(V_b))
errores_betas
##         x0         x1         x2 
## 1.68902844 0.04623868 0.07829125
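These standard errors should coincide with the diagonal of vcov() on the lm fit (sketch):
# standard errors of the betas via vcov()
sqrt(diag(vcov(lm(y ~ x1 + x2, data = data))))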
# compute the mean of y
media_y <- mean(y)
media_y
## [1] 56.5
# the intercept column, written as a matrix
I <- data.matrix(data[,c("x0")])
I
##       [,1]
##  [1,]    1
##  [2,]    1
##  [3,]    1
##  [4,]    1
##  [5,]    1
##  [6,]    1
##  [7,]    1
##  [8,]    1
##  [9,]    1
## [10,]    1
# multiply the mean of y by I
IxY <- I*media_y
IxY
##       [,1]
##  [1,] 56.5
##  [2,] 56.5
##  [3,] 56.5
##  [4,] 56.5
##  [5,] 56.5
##  [6,] 56.5
##  [7,] 56.5
##  [8,] 56.5
##  [9,] 56.5
## [10,] 56.5
# subtract the mean of y times I from the Y matrix
Y_IxY <- Y-IxY
Y_IxY # matrix 3
##        [,1]
##  [1,] -46.5
##  [2,]  -6.5
##  [3,]  33.5
##  [4,]  13.5
##  [5,]  28.5
##  [6,]   3.5
##  [7,] -16.5
##  [8,]  53.5
##  [9,] -21.5
## [10,] -41.5
# transpose of matrix 3
TR_Y_IxY <- t(Y_IxY)
TR_Y_IxY # matrix 4
##       [,1] [,2] [,3] [,4] [,5] [,6]  [,7] [,8]  [,9] [,10]
## [1,] -46.5 -6.5 33.5 13.5 28.5  3.5 -16.5 53.5 -21.5 -41.5
# multiply matrix 4 by matrix 3 (total variation)
Variacion_total <- TR_Y_IxY %*% Y_IxY
Variacion_total
##        [,1]
## [1,] 9652.5
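Since the total variation is just (n - 1) times the sample variance of y, a one-line check is (sketch):
# total sum of squares = (n - 1) * var(y)
(10 - 1) * var(y)  # 9652.5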
# compute the explained variation
Variacion_explicada <- sum((Y_gorro-IxY)^2)
Variacion_explicada
## [1] 9616.46
# compute R squared
R_2 <- Variacion_explicada/Variacion_total
R_2
##           [,1]
## [1,] 0.9962662
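The same R squared comes straight out of the lm summary (sketch):
# R squared from lm, for comparison
summary(lm(y ~ x1 + x2, data = data))$r.squared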

2) Setting up the regression with commands

a) Analyze the data

# inspect the variables
# note: the data set in this example is called "data"
skimr::skim(data)
Data summary
Name                       data
Number of rows             10
Number of columns          4
Column type frequency:
  numeric                  4
Group variables            None

Variable type: numeric

skim_variable  n_missing  complete_rate  mean     sd  p0    p25   p50    p75  p100  hist
y                      0              1  56.5  32.75  10  36.25  55.0  81.25   110  ▅▇▅▅▂
x0                     0              1   1.0   0.00   1   1.00   1.0   1.00     1  ▁▁▇▁▁
x1                     0              1  28.5  16.36   5  17.75  29.0  40.25    55  ▅▇▅▅▂
x2                     0              1   9.7   9.66   2   4.50   6.5   9.75    35  ▇▃▁▁▁
stat.desc(data)
##                         y x0          x1         x2
## nbr.val        10.0000000 10  10.0000000 10.0000000
## nbr.null        0.0000000  0   0.0000000  0.0000000
## nbr.na          0.0000000  0   0.0000000  0.0000000
## min            10.0000000  1   5.0000000  2.0000000
## max           110.0000000  1  55.0000000 35.0000000
## range         100.0000000  0  50.0000000 33.0000000
## sum           565.0000000 10 285.0000000 97.0000000
## median         55.0000000  1  29.0000000  6.5000000
## mean           56.5000000  1  28.5000000  9.7000000
## SE.mean        10.3561576  0   5.1731143  3.0552323
## CI.mean.0.95   23.4272561  0  11.7023975  6.9114156
## var          1072.5000000  0 267.6111111 93.3444444
## std.dev        32.7490458  0  16.3588236  9.6614929
## coef.var        0.5796291  0   0.5739938  0.9960302
  • Correlation matrix of the variables
# correlation with full decimals
cor(data)
## Warning in cor(data): the standard deviation is zero
##             y x0          x1          x2
## y   1.0000000 NA  0.99810794 -0.00544310
## x0         NA  1          NA          NA
## x1  0.9981079 NA  1.00000000 -0.01230267
## x2 -0.0054431 NA -0.01230267  1.00000000
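The warning and the NAs come from x0, which is constant (zero standard deviation). Dropping that column gives a clean matrix (sketch):
# exclude the constant column x0 before correlating
cor(data[, c("y", "x1", "x2")])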
# plot of the correlation matrix
corPlot(data, cex = 1.2, main = "Correlation matrix")
## Warning in cor(x, use = use, method = method): the standard deviation is zero

  • Scatter plot analysis
ggplot(data, aes(x1, y)) +
  geom_point() +
  labs(x = "Explanatory variable", y = "Dependent variable",
       caption = "Source: class example")
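To see the linear fit on the same plot, an OLS line can be overlaid (sketch using geom_smooth):
ggplot(data, aes(x1, y)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE) +  # overlay the fitted OLS line
  labs(x = "Explanatory variable", y = "Dependent variable",
       caption = "Source: class example")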

b) Specify the model

  • Fit the regressions
ols <- lm(y ~ x1 + x2 , data = data) # regression 1
ols2 <- lm(y ~ x1 , data = data) # regression 2
  • Summary of regression 1
ols # option 1: print the coefficients
## 
## Call:
## lm(formula = y ~ x1 + x2, data = data)
## 
## Coefficients:
## (Intercept)           x1           x2  
##    -0.67636      1.99830      0.02318
summary(ols) # option 2: full summary
## 
## Call:
## lm(formula = y ~ x1 + x2, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4066  0.0151  0.5412  0.6066  1.6357 
## 
## Coefficients:
##             Estimate Std. Error t value       Pr(>|t|)    
## (Intercept) -0.67636    1.68903  -0.400          0.701    
## x1           1.99830    0.04624  43.217 0.000000000927 ***
## x2           0.02318    0.07829   0.296          0.776    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.269 on 7 degrees of freedom
## Multiple R-squared:  0.9963, Adjusted R-squared:  0.9952 
## F-statistic: 933.9 on 2 and 7 DF,  p-value: 0.000000003181
stargazer(ols, type = "text") # option 3: formatted table
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                  y             
## -----------------------------------------------
## x1                           1.998***          
##                               (0.046)          
##                                                
## x2                             0.023           
##                               (0.078)          
##                                                
## Constant                      -0.676           
##                               (1.689)          
##                                                
## -----------------------------------------------
## Observations                    10             
## R2                             0.996           
## Adjusted R2                    0.995           
## Residual Std. Error       2.269 (df = 7)       
## F Statistic           933.885*** (df = 2; 7)   
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
  • Summary of regression 2
ols2 # option 1: print the coefficients
## 
## Call:
## lm(formula = y ~ x1, data = data)
## 
## Coefficients:
## (Intercept)           x1  
##     -0.4468       1.9981
summary(ols2) # option 2: full summary
## 
## Call:
## lm(formula = y ~ x1, data = data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.4916  0.4631  0.5028  0.5448  1.5252 
## 
## Coefficients:
##             Estimate Std. Error t value        Pr(>|t|)    
## (Intercept) -0.44675    1.41225  -0.316            0.76    
## x1           1.99813    0.04352  45.914 0.0000000000559 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.136 on 8 degrees of freedom
## Multiple R-squared:  0.9962, Adjusted R-squared:  0.9957 
## F-statistic:  2108 on 1 and 8 DF,  p-value: 0.00000000005594
stargazer(ols2, type = "text") # option 3: formatted table
## 
## ===============================================
##                         Dependent variable:    
##                     ---------------------------
##                                  y             
## -----------------------------------------------
## x1                           1.998***          
##                               (0.044)          
##                                                
## Constant                      -0.447           
##                               (1.412)          
##                                                
## -----------------------------------------------
## Observations                    10             
## R2                             0.996           
## Adjusted R2                    0.996           
## Residual Std. Error       2.136 (df = 8)       
## F Statistic          2,108.104*** (df = 1; 8)  
## ===============================================
## Note:               *p<0.1; **p<0.05; ***p<0.01
  • Model comparison
# note: the simple regression is ols2 (the original listed ols twice, which duplicated the columns)
m_list <- list(Regresion_Simple = ols2, Regresion_Multiple = ols)
msummary(m_list)
             Regresion_Simple  Regresion_Multiple
(Intercept)  -0.447            -0.676
             (1.412)           (1.689)
x1           1.998             1.998
             (0.044)           (0.046)
x2                             0.023
                               (0.078)
Num.Obs.     10                10
R2           0.996             0.996
R2 Adj.      0.996             0.995
AIC          47.3              49.2
BIC          48.2              50.4
Log.Lik.     -20.662           -20.600
F            2108.104          933.885
RMSE         1.91              1.90
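Since texreg is already loaded, screenreg() gives an alternative console comparison of the two models (sketch):
# side-by-side comparison with texreg
screenreg(list(ols2, ols), custom.model.names = c("Simple", "Multiple"))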

c) Diagnostics and graphical representation

ols_test_breusch_pagan(ols) # test for heteroskedasticity; if H0 (constant variance) is rejected, there is heteroskedasticity
## 
##  Breusch Pagan Test for Heteroskedasticity
##  -----------------------------------------
##  Ho: the variance is constant            
##  Ha: the variance is not constant        
## 
##             Data              
##  -----------------------------
##  Response : y 
##  Variables: fitted values of y 
## 
##         Test Summary         
##  ----------------------------
##  DF            =    1 
##  Chi2          =    0.3144014 
##  Prob > Chi2   =    0.5749918
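The same hypothesis can be tested with bptest() from lmtest (by default it uses the model regressors rather than the fitted values, so the statistic may differ slightly); if heteroskedasticity were detected, robust standard errors from sandwich would be the usual remedy (sketch):
# Breusch-Pagan test via lmtest
bptest(ols)
# heteroskedasticity-robust (HC1) standard errors, if needed
coeftest(ols, vcov = vcovHC(ols, type = "HC1"))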
ols_coll_diag(ols) # collinearity diagnostics: tolerance, VIF, and condition indices
## Tolerance and Variance Inflation Factor
## ---------------------------------------
##   Variables Tolerance      VIF
## 1        x1 0.9998486 1.000151
## 2        x2 0.9998486 1.000151
## 
## 
## Eigenvalue and Condition Index
## ------------------------------
##   Eigenvalue Condition Index  intercept         x1         x2
## 1  2.4975532        1.000000 0.02645886 0.03122997 0.05528154
## 2  0.3923799        2.522923 0.02563905 0.15600789 0.81320758
## 3  0.1100669        4.763530 0.94790209 0.81276214 0.13151088
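Since car is loaded, vif() returns the same variance inflation factors; values near 1 indicate no collinearity problem (sketch):
# VIFs via car, for comparison with ols_coll_diag()
vif(ols)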
ols_plot_resid_qq(ols) # QQ plot of the residuals; points should lie close to the reference line

ols_plot_resid_lev(ols) # residuals vs leverage; checks whether observations fall inside the threshold bands

ols_step_both_aic(ols) # stepwise selection using the Akaike information criterion (AIC) to compare models
## 
## 
##                                Stepwise Summary                               
## ----------------------------------------------------------------------------
## Variable     Method      AIC       RSS       Sum Sq      R-Sq      Adj. R-Sq 
## ----------------------------------------------------------------------------
## x1          addition    47.324    36.492    9616.008    0.99622      0.99575 
## ----------------------------------------------------------------------------
ols_test_normality(ols) # battery of normality tests; inspect each statistic and its p-value
## -----------------------------------------------
##        Test             Statistic       pvalue  
## -----------------------------------------------
## Shapiro-Wilk              0.6576         0.0003 
## Kolmogorov-Smirnov        0.3293         0.1818 
## Cramer-von Mises          0.1521         0.3881 
## Anderson-Darling          1.4495         0.0004 
## -----------------------------------------------
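The Shapiro-Wilk result can be reproduced with base R on the residuals (sketch):
# base-R Shapiro-Wilk test on the regression residuals
shapiro.test(residuals(ols))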
ols_plot_resid_hist(ols) # histogram of the regression residuals against the fitted distribution

ols_plot_resid_stand(ols) # standardized residuals; flags outliers and shows whether they cluster in a particular range

ols_plot_resid_fit(ols) # residuals vs fitted values; detects nonlinearity, unequal error variances, and outliers

ols_plot_cooksd_chart(ols) # Cook's distance chart; identifies observations that are influential on the fit

ols_plot_dfbetas(ols) # panel of plots to detect influential observations using DFBETAs

ols_plot_diagnostics(ols) #Panel of plots for regression diagnostics.

ols_step_both_p(ols) # builds the model by entering and removing candidate predictors based on p-values
## 
##                             Stepwise Selection Summary                              
## -----------------------------------------------------------------------------------
##                      Added/                   Adj.                                     
## Step    Variable    Removed     R-Square    R-Square     C(p)       AIC       RMSE     
## -----------------------------------------------------------------------------------
##    1       x1       addition       0.996       0.996    1.0880    47.3237    2.1358    
## -----------------------------------------------------------------------------------
coefplot(ols, title = "ols Regression") # plots the estimated coefficients with their confidence intervals