Análisis de Regresión Lineal Múltiple

#install.packages("remotes")
#remotes::install_github("kassambara/datarium")

data("marketing", package = "datarium")
head(marketing, 4)

##   youtube facebook newspaper sales
## 1  276.12    45.36     83.04 26.52
## 2   53.40    47.16     54.12 12.48
## 3   20.64    55.08     83.16 11.16
## 4  181.80    49.56     70.20 22.20

library(tidyverse)

## -- Attaching packages ------------------------------ tidyverse 1.3.0 --

## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.0
## v tidyr   1.1.0     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0

## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# 1.Analizar la relación entre variables

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

multi.hist(marketing,dcol = c("blue", "red"), 
           dlty = c("dotted", "solid"))

ggpairs(marketing,lower = list(continuous = "smooth"),
        diag = list(continuous = "bar"), axisLabels = "none")

## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'bar' to 'barDiag'

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 2.Generar el modelo



modelo<-lm(sales ~  youtube + facebook,data=marketing)
summary(modelo)

## 
## Call:
## lm(formula = sales ~ youtube + facebook, data = marketing)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -10.5572  -1.0502   0.2906   1.4049   3.3994 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.50532    0.35339   9.919   <2e-16 ***
## youtube      0.04575    0.00139  32.909   <2e-16 ***
## facebook     0.18799    0.00804  23.382   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.018 on 197 degrees of freedom
## Multiple R-squared:  0.8972, Adjusted R-squared:  0.8962 
## F-statistic: 859.6 on 2 and 197 DF,  p-value: < 2.2e-16

#Selección de los mejores predictores

step(modelo, direction = "both", trace = 1)

## Start:  AIC=283.75
## sales ~ youtube + facebook
## 
##            Df Sum of Sq    RSS    AIC
## <none>                   802.0 283.75
## - facebook  1    2225.7 3027.6 547.44
## - youtube   1    4408.7 5210.6 656.03

## 
## Call:
## lm(formula = sales ~ youtube + facebook, data = marketing)
## 
## Coefficients:
## (Intercept)      youtube     facebook  
##     3.50532      0.04575      0.18799

# 4.Validación de condiciones para la regresión múltiple lineal

library(ggplot2)
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

plot1 <- ggplot(data = marketing, aes(youtube, modelo$residuals)) +
  geom_point() + geom_smooth(color = "firebrick") +
  geom_hline(yintercept = 0) + theme_bw() 

plot2 <- ggplot(data = marketing, aes(facebook, modelo$residuals)) +
  geom_point() + geom_smooth(color = "firebrick") + 
  geom_hline(yintercept = 0) + theme_bw()





grid.arrange(plot1, plot2)

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Distribución normal de los residuos
qqnorm(modelo$residuals) 
qqline(modelo$residuals)

shapiro.test(modelo$residuals)

## 
##  Shapiro-Wilk normality test
## 
## data:  modelo$residuals
## W = 0.91804, p-value = 4.19e-09

# Varianza constante de los residuos (homocedasticidad):

ggplot(data = marketing, aes(modelo$fitted.values, modelo$residuals)) +
  geom_point() + geom_smooth(color = "firebrick", se = FALSE) + 
  geom_hline(yintercept = 0) + theme_bw()

## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

library(lmtest)

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

bptest(modelo)

## 
##  studentized Breusch-Pagan test
## 
## data:  modelo
## BP = 4.8093, df = 2, p-value = 0.0903

# No multicolinealidad:

# Análisis de Inflación de Varianza (VIF)

library(car)

## Loading required package: carData

## 
## Attaching package: 'car'

## The following object is masked from 'package:psych':
## 
##     logit

## The following object is masked from 'package:dplyr':
## 
##     recode

## The following object is masked from 'package:purrr':
## 
##     some

vif(modelo)

##  youtube facebook 
## 1.003013 1.003013

#Autocorrelación: require(car) 

dwt(modelo, alternative = "two.sided")

##  lag Autocorrelation D-W Statistic p-value
##    1     -0.04530537      2.080781   0.544
##  Alternative hypothesis: rho != 0

# 5.Identificación de posibles valores atípicos o influyentes

library(dplyr) 

marketing$studentized_residual <- rstudent(modelo)
ggplot(data = marketing, aes(x = predict(modelo),
                             y = abs(studentized_residual))) +
  geom_hline(yintercept = 3, color = "grey", linetype = "dashed") +
  # se identifican en rojo observaciones con residuos estandarizados absolutos > 3 
  geom_point(aes(color = ifelse(abs(studentized_residual) > 3, 'red', 'black'))) +
  scale_color_identity() + 
  labs(title = "Distribución de los residuos studentized", x = "predicción modelo") +
  theme_bw() + theme(plot.title = element_text(hjust = 0.5))

which(abs(marketing$studentized_residual) > 3)

## [1]   6 131

summary(influence.measures(modelo))

## Potentially influential observations of
##   lm(formula = sales ~ youtube + facebook, data = marketing) :
## 
##     dfb.1_ dfb.yotb dfb.fcbk dffit   cov.r   cook.d hat  
## 6   -0.10   0.41    -0.43    -0.62_*  0.89_*  0.12   0.03
## 36  -0.01  -0.32     0.26    -0.44_*  0.95_*  0.06   0.03
## 127 -0.15   0.29    -0.20    -0.39_*  0.95_*  0.05   0.02
## 131 -0.36   0.73    -0.49    -0.95_*  0.66_*  0.26   0.03
## 179 -0.05  -0.29     0.28    -0.44_*  0.94_*  0.06   0.03

influencePlot(modelo)

##       StudRes        Hat      CookD
## 6   -3.295069 0.03465182 0.12372141
## 131 -5.714235 0.02678270 0.25806539
## 176  1.341284 0.03011550 0.01854523

Análisis de Regresión Lineal Múltiple

Manuel

31-07-2020