TRABAJO PRACTICO REGRESION

CARGA DE LA BASE

Explorar las variables de la base de datos

library(haven)
# Read the .sav data file into `hatco` with haven's read_sav().
# NOTE(review): the path is absolute and machine-specific; the script only
# runs on a machine where this exact file exists.
hatco <- read_sav("D:/OneDrive/FACEN/Hatco.sav")
#str(hatco)
# Print the column names of the loaded table.
names(hatco)
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "y"
#View(hatco)
# Descriptive labels for the seven predictors, keyed by column name.
descripciones <- c(
  x1 = "X1: Velocidad de entrega",
  x2 = "X2: Nivel de precio",
  x3 = "X3: Flexibilidad de precios",
  x4 = "X4: Imagen del fabricante",
  x5 = "X5: Servicio conjunto",
  x6 = "X6: Imagen de la fuerza de ventas",
  x7 = "X7: Calidad del producto"
)

# Attach each description as the column's "label" attribute instead of
# overwriting the column names: the rest of the script addresses the columns
# as x1..x7 and y, and seven labels cannot rename an eight-column table.
stopifnot(all(names(descripciones) %in% names(hatco)))
for (columna in names(descripciones)) {
  attr(hatco[[columna]], "label") <- descripciones[[columna]]
}

# The column names are unchanged by the labelling step.
names(hatco)
## [1] "x1" "x2" "x3" "x4" "x5" "x6" "x7" "y"

variable X1

#"X1: Velocidad de entrega"
# x1: min, quartiles, median, mean and max, followed by a boxplot.
summary(hatco[["x1"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.500   3.400   3.515   4.600   6.100
boxplot(hatco[["x1"]])

## Variable X2

# x2: same numeric summary and boxplot.
summary(hatco[["x2"]])
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.200   1.475   2.150   2.364   3.225   5.400
boxplot(hatco[["x2"]])

## Verificando el cumplimiento del requisito de homocedasticidad

# Boxplots of the seven predictors x1..x7 drawn side by side in one panel,
# so their spreads can be compared on a common axis.
boxplot(hatco$x1, hatco$x2, hatco$x3, hatco$x4, hatco$x5, hatco$x6, hatco$x7)

Verificar gráficamente la normalidad de la distribución de las variables x e y

# Histograms of x1..x7, stacked three per page.
# par() returns the previous settings when it changes them; keep them so the
# layout can be restored once the histograms are drawn.
parametros_previos <- par(mfrow = c(3, 1))

# The hist() arguments are written as hatco$xN on purpose: hist() builds the
# plot title from the expression it receives.
h1 <- hist(hatco$x1)
h2 <- hist(hatco$x2)
h3 <- hist(hatco$x3)

h4 <- hist(hatco$x4)
h5 <- hist(hatco$x5)
h6 <- hist(hatco$x6)

h7 <- hist(hatco$x7)

# Restore the previous layout so that later plots are not squeezed into the
# 3 x 1 grid.
par(parametros_previos)

Gráfica

# Scatterplot matrix of the first three predictors together with y.
hatco1 <- hatco[c("x1", "x2", "x3", "y")]
plot(hatco1)

# Scatterplot matrix of the remaining predictors together with y
# (hatco1 is overwritten).
hatco1 <- hatco[c("x4", "x5", "x6", "x7", "y")]
plot(hatco1)

# Scatterplot matrix of every column at once.
plot(hatco)

Análisis de correlaciones

library(psych)

# Pairs panel of all columns (pairs.panels() from psych), requesting
# ellipses, linear fits (lm) and correlation values (cor), with light blue
# histograms. TRUE is spelled out because `T` is an ordinary variable that
# can be reassigned.
pairs.panels(hatco,
             ellipses = TRUE,
             hist.col = "lightblue",
             lm = TRUE,
             cor = TRUE,
             cex = 1)

#03-Correlación 

Pruebas de normalidad en la distribución de las variables

# Convert hatco (created by read_sav()) to a base data.frame before the
# normality tests and correlations that follow.
hatco<-data.frame(hatco)

#' Shapiro-Wilk normality test for every numeric column of a data frame
#'
#' Non-numeric columns are skipped, so a data frame that also holds
#' character or factor columns no longer makes the function fail. The result
#' is assembled in a single step rather than grown row by row, so its row
#' names are the plain sequence 1..n.
#'
#' @param data A data frame.
#' @return A data frame with one row per numeric column of `data` and the
#'   columns `Variable` (column name), `Estadístico` (the W statistic) and
#'   `Valor_p` (the p-value). It has zero rows when `data` contains no
#'   numeric column.
test_shapiro.test <- function(data) {
  es_numerica <- vapply(data, is.numeric, logical(1))
  pruebas <- lapply(data[es_numerica], shapiro.test)

  data.frame(
    Variable = names(data)[es_numerica],
    # unname(): the statistic is returned as a named value ("W").
    Estadístico = vapply(
      pruebas,
      function(prueba) unname(prueba$statistic),
      numeric(1),
      USE.NAMES = FALSE
    ),
    Valor_p = vapply(
      pruebas,
      function(prueba) prueba$p.value,
      numeric(1),
      USE.NAMES = FALSE
    ),
    stringsAsFactors = FALSE
  )
}


# Run the Shapiro-Wilk helper on the columns of hatco and print the table.
test_shapiro.test(hatco)
##    Variable Estadístico      Valor_p
## W        x1   0.9854246 0.3406333975
## W1       x2   0.9685102 0.0170193024
## W2       x3   0.9503082 0.0008693206
## W3       x4   0.9818003 0.1831623707
## W4       x5   0.9858613 0.3657020308
## W5       x6   0.9634128 0.0070943660
## W6       x7   0.9713664 0.0281514400
## W7        y   0.9850479 0.3201266472
# The test above is applied to each of the numeric variables.
# Pairwise correlations between all columns of hatco, computed with
# correlation() from the correlation package.
library(correlation)
correlation(hatco)
## # Correlation Matrix (pearson-method)
## 
## Parameter1 | Parameter2 |     r |         95% CI | t(98) |         p
## --------------------------------------------------------------------
## x1         |         x2 | -0.35 | [-0.51, -0.16] | -3.69 | 0.006**  
## x1         |         x3 |  0.51 | [ 0.35,  0.64] |  5.86 | < .001***
## x1         |         x4 |  0.05 | [-0.15,  0.24] |  0.50 | > .999   
## x1         |         x5 |  0.61 | [ 0.47,  0.72] |  7.66 | < .001***
## x1         |         x6 |  0.08 | [-0.12,  0.27] |  0.77 | > .999   
## x1         |         x7 | -0.48 | [-0.62, -0.32] | -5.46 | < .001***
## x1         |          y |  0.68 | [ 0.55,  0.77] |  9.09 | < .001***
## x2         |         x3 | -0.49 | [-0.62, -0.32] | -5.52 | < .001***
## x2         |         x4 |  0.27 | [ 0.08,  0.44] |  2.80 | 0.092    
## x2         |         x5 |  0.51 | [ 0.35,  0.64] |  5.92 | < .001***
## x2         |         x6 |  0.19 | [-0.01,  0.37] |  1.88 | 0.572    
## x2         |         x7 |  0.47 | [ 0.30,  0.61] |  5.27 | < .001***
## x2         |          y |  0.08 | [-0.12,  0.27] |  0.81 | > .999   
## x3         |         x4 | -0.12 | [-0.31,  0.08] | -1.16 | > .999   
## x3         |         x5 |  0.07 | [-0.13,  0.26] |  0.66 | > .999   
## x3         |         x6 | -0.03 | [-0.23,  0.16] | -0.34 | > .999   
## x3         |         x7 | -0.45 | [-0.59, -0.28] | -4.96 | < .001***
## x3         |          y |  0.56 | [ 0.41,  0.68] |  6.67 | < .001***
## x4         |         x5 |  0.30 | [ 0.11,  0.47] |  3.10 | 0.041*   
## x4         |         x6 |  0.79 | [ 0.70,  0.85] | 12.68 | < .001***
## x4         |         x7 |  0.20 | [ 0.00,  0.38] |  2.02 | 0.507    
## x4         |          y |  0.22 | [ 0.03,  0.40] |  2.28 | 0.299    
## x5         |         x6 |  0.24 | [ 0.05,  0.42] |  2.46 | 0.205    
## x5         |         x7 | -0.06 | [-0.25,  0.14] | -0.55 | > .999   
## x5         |          y |  0.70 | [ 0.58,  0.79] |  9.72 | < .001***
## x6         |         x7 |  0.18 | [-0.02,  0.36] |  1.78 | 0.621    
## x6         |          y |  0.26 | [ 0.06,  0.43] |  2.62 | 0.142    
## x7         |          y | -0.19 | [-0.37,  0.00] | -1.94 | 0.551    
## 
## p-value adjustment method: Holm (1979)
## Observations: 100

Construcción del modelo de regresión lineal

Modelo saturado M0: con todas las variables disponibles

# Full model M0: linear regression of y on all seven predictors x1..x7.
modelo <- lm(y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7, data = hatco)
# Print the coefficient table and fit statistics for M0.
summary(modelo)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7, data = hatco)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9589  -1.9284   0.5978   2.8182   6.7565 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.18687    4.97678  -2.047   0.0435 *  
## x1           -0.05758    2.01266  -0.029   0.9772    
## x2           -0.69691    2.09017  -0.333   0.7396    
## x3            3.36822    0.41123   8.191 1.44e-12 ***
## x4           -0.04220    0.66681  -0.063   0.9497    
## x5            8.36914    3.91815   2.136   0.0353 *  
## x6            1.28067    0.94717   1.352   0.1797    
## x7            0.56693    0.35543   1.595   0.1141    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.424 on 92 degrees of freedom
## Multiple R-squared:  0.7749, Adjusted R-squared:  0.7578 
## F-statistic: 45.25 on 7 and 92 DF,  p-value: < 2.2e-16

Eliminando del modelo la variable X1

# Model 1 (M1): the full model with x1 removed.

modelo1 <- lm(y ~ x2 + x3 + x4 + x5 + x6 + x7, data = hatco)

summary(modelo1)
## 
## Call:
## lm(formula = y ~ x2 + x3 + x4 + x5 + x6 + x7, data = hatco)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.9606  -1.9296   0.5967   2.7858   6.7584 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -10.21610    4.84455  -2.109   0.0377 *  
## x2           -0.63969    0.60400  -1.059   0.2923    
## x3            3.36843    0.40895   8.237 1.08e-12 ***
## x4           -0.03936    0.65585  -0.060   0.9523    
## x5            8.25956    0.82176  10.051  < 2e-16 ***
## x6            1.27904    0.94037   1.360   0.1771    
## x7            0.56677    0.35347   1.603   0.1122    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.4 on 93 degrees of freedom
## Multiple R-squared:  0.7749, Adjusted R-squared:  0.7604 
## F-statistic: 53.37 on 6 and 93 DF,  p-value: < 2.2e-16
# Model 2 (M2): the full model with x3 removed (x1 is kept in this one).

modelo2 <- lm(y ~ x1+x2 + x4 + x5 + x6 + x7, data = hatco)

summary(modelo2)
## 
## Call:
## lm(formula = y ~ x1 + x2 + x4 + x5 + x6 + x7, data = hatco)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.2891  -3.4401   0.2615   4.7243  10.2024 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  18.9959     4.5445   4.180 6.58e-05 ***
## x1           -0.3585     2.6319  -0.136   0.8920    
## x2           -3.3523     2.7006  -1.241   0.2176    
## x4           -0.5618     0.8682  -0.647   0.5191    
## x5           11.2980     5.1031   2.214   0.0293 *  
## x6            1.9416     1.2343   1.573   0.1191    
## x7            0.1604     0.4603   0.348   0.7283    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 5.786 on 93 degrees of freedom
## Multiple R-squared:  0.6108, Adjusted R-squared:  0.5857 
## F-statistic: 24.33 on 6 and 93 DF,  p-value: < 2.2e-16
# Stepwise selection with step(), starting from the full model `modelo` and
# allowing terms to be both dropped and re-added.
modelo_step <- step(modelo, direction = "both")   # alternatives for direction: "backward", "forward", "both"
## Start:  AIC=305.06
## y ~ x1 + x2 + x3 + x4 + x5 + x6 + x7
## 
##        Df Sum of Sq    RSS    AIC
## - x1    1      0.02 1800.3 303.06
## - x4    1      0.08 1800.4 303.06
## - x2    1      2.18 1802.5 303.18
## - x6    1     35.78 1836.1 305.02
## <none>              1800.3 305.06
## - x7    1     49.79 1850.1 305.78
## - x5    1     89.28 1889.6 307.90
## - x3    1   1312.76 3113.1 357.82
## 
## Step:  AIC=303.06
## y ~ x2 + x3 + x4 + x5 + x6 + x7
## 
##        Df Sum of Sq    RSS    AIC
## - x4    1      0.07 1800.4 301.06
## - x2    1     21.71 1822.1 302.25
## - x6    1     35.81 1836.2 303.03
## <none>              1800.3 303.06
## - x7    1     49.77 1850.1 303.78
## + x1    1      0.02 1800.3 305.06
## - x3    1   1313.37 3113.7 355.84
## - x5    1   1955.68 3756.0 374.59
## 
## Step:  AIC=301.06
## y ~ x2 + x3 + x5 + x6 + x7
## 
##        Df Sum of Sq    RSS    AIC
## - x2    1     21.80 1822.2 300.26
## <none>              1800.4 301.06
## - x7    1     49.70 1850.1 301.78
## + x4    1      0.07 1800.3 303.06
## + x1    1      0.01 1800.4 303.06
## - x6    1     81.13 1881.5 303.47
## - x3    1   1326.75 3127.2 354.27
## - x5    1   1987.41 3787.8 373.44
## 
## Step:  AIC=300.26
## y ~ x3 + x5 + x6 + x7
## 
##        Df Sum of Sq    RSS    AIC
## - x7    1     31.09 1853.3 299.96
## <none>              1822.2 300.26
## + x2    1     21.80 1800.4 301.06
## + x1    1     19.71 1802.5 301.18
## + x4    1      0.16 1822.1 302.25
## - x6    1     84.06 1906.3 302.77
## - x3    1   1939.30 3761.5 370.74
## - x5    1   3068.33 4890.5 396.99
## 
## Step:  AIC=299.96
## y ~ x3 + x5 + x6
## 
##        Df Sum of Sq    RSS    AIC
## <none>              1853.3 299.96
## + x7    1     31.09 1822.2 300.26
## + x2    1      3.19 1850.1 301.78
## + x1    1      2.95 1850.3 301.80
## + x4    1      0.01 1853.3 301.95
## - x6    1    109.19 1962.5 303.68
## - x3    1   2153.64 4006.9 375.06
## - x5    1   3039.11 4892.4 395.03
# Coefficient table and fit statistics of the model returned by step().
summary(modelo_step)
## 
## Call:
## lm(formula = y ~ x3 + x5 + x6, data = hatco)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.5520  -2.1298   0.1947   2.9815   7.5736 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  -6.5201     3.2467  -2.008   0.0474 *  
## x3            3.3760     0.3196  10.562   <2e-16 ***
## x5            7.6214     0.6074  12.547   <2e-16 ***
## x6            1.4056     0.5910   2.378   0.0194 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.394 on 96 degrees of freedom
## Multiple R-squared:  0.7683, Adjusted R-squared:  0.7611 
## F-statistic: 106.1 on 3 and 96 DF,  p-value: < 2.2e-16
# Standardized residuals of the stepwise-selected model.
residuos <- rstandard(modelo_step)

# Standardized residuals against fitted values, with a dashed red reference
# line at zero.
plot(
  x = fitted(modelo_step),
  y = residuos,
  main = "Gráfico de residuos vs. valores ajustados",
  xlab = "Valores ajustados",
  ylab = "Residuos estandarizados"
)
abline(h = 0, lty = 2, col = "red")

#08  Normality of the residuals
# Shapiro-Wilk test on the residuals stored in the stepwise model.
shapiro.test(modelo_step$residuals)
## 
##  Shapiro-Wilk normality test
## 
## data:  modelo_step$residuals
## W = 0.9623, p-value = 0.005881
# Second normality check on the same residuals: Lilliefors
# (Kolmogorov-Smirnov) test from the nortest package.
library(nortest)
lillie.test(modelo_step$residuals)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  modelo_step$residuals
## D = 0.079017, p-value = 0.1298