#install.packages("remotes")
#remotes::install_github("kassambara/datarium")
data("marketing", package = "datarium")
head(marketing, 4)
## youtube facebook newspaper sales
## 1 276.12 45.36 83.04 26.52
## 2 53.40 47.16 54.12 12.48
## 3 20.64 55.08 83.16 11.16
## 4 181.80 49.56 70.20 22.20
library(tidyverse)
## -- Attaching packages ------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.0
## v tidyr 1.1.0 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# 1.Analizar la relación entre variables
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
multi.hist(marketing,dcol = c("blue", "red"),
dlty = c("dotted", "solid"))

ggpairs(marketing,lower = list(continuous = "smooth"),
diag = list(continuous = "bar"), axisLabels = "none")
## Warning in check_and_set_ggpairs_defaults("diag", diag, continuous =
## "densityDiag", : Changing diag$continuous from 'bar' to 'barDiag'
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# 2.Generar el modelo
modelo<-lm(sales ~ youtube + facebook,data=marketing)
summary(modelo)
##
## Call:
## lm(formula = sales ~ youtube + facebook, data = marketing)
##
## Residuals:
## Min 1Q Median 3Q Max
## -10.5572 -1.0502 0.2906 1.4049 3.3994
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.50532 0.35339 9.919 <2e-16 ***
## youtube 0.04575 0.00139 32.909 <2e-16 ***
## facebook 0.18799 0.00804 23.382 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.018 on 197 degrees of freedom
## Multiple R-squared: 0.8972, Adjusted R-squared: 0.8962
## F-statistic: 859.6 on 2 and 197 DF, p-value: < 2.2e-16
#Selección de los mejores predictores
step(modelo, direction = "both", trace = 1)
## Start: AIC=283.75
## sales ~ youtube + facebook
##
## Df Sum of Sq RSS AIC
## <none> 802.0 283.75
## - facebook 1 2225.7 3027.6 547.44
## - youtube 1 4408.7 5210.6 656.03
##
## Call:
## lm(formula = sales ~ youtube + facebook, data = marketing)
##
## Coefficients:
## (Intercept) youtube facebook
## 3.50532 0.04575 0.18799
# 4.Validación de condiciones para la regresión múltiple lineal
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
plot1 <- ggplot(data = marketing, aes(youtube, modelo$residuals)) +
geom_point() + geom_smooth(color = "firebrick") +
geom_hline(yintercept = 0) + theme_bw()
plot2 <- ggplot(data = marketing, aes(facebook, modelo$residuals)) +
geom_point() + geom_smooth(color = "firebrick") +
geom_hline(yintercept = 0) + theme_bw()
grid.arrange(plot1, plot2)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Distribución normal de los residuos
qqnorm(modelo$residuals)
qqline(modelo$residuals)

shapiro.test(modelo$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo$residuals
## W = 0.91804, p-value = 4.19e-09
# Varianza constante de los residuos (homocedasticidad):
ggplot(data = marketing, aes(modelo$fitted.values, modelo$residuals)) +
geom_point() + geom_smooth(color = "firebrick", se = FALSE) +
geom_hline(yintercept = 0) + theme_bw()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
bptest(modelo)
##
## studentized Breusch-Pagan test
##
## data: modelo
## BP = 4.8093, df = 2, p-value = 0.0903
# No multicolinealidad:
# Análisis de Inflación de Varianza (VIF)
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
vif(modelo)
## youtube facebook
## 1.003013 1.003013
#Autocorrelación: require(car)
dwt(modelo, alternative = "two.sided")
## lag Autocorrelation D-W Statistic p-value
## 1 -0.04530537 2.080781 0.544
## Alternative hypothesis: rho != 0
# 5.Identificación de posibles valores atípicos o influyentes
library(dplyr)
marketing$studentized_residual <- rstudent(modelo)
ggplot(data = marketing, aes(x = predict(modelo),
y = abs(studentized_residual))) +
geom_hline(yintercept = 3, color = "grey", linetype = "dashed") +
# se identifican en rojo observaciones con residuos estandarizados absolutos > 3
geom_point(aes(color = ifelse(abs(studentized_residual) > 3, 'red', 'black'))) +
scale_color_identity() +
labs(title = "Distribución de los residuos studentized", x = "predicción modelo") +
theme_bw() + theme(plot.title = element_text(hjust = 0.5))

which(abs(marketing$studentized_residual) > 3)
## [1] 6 131
summary(influence.measures(modelo))
## Potentially influential observations of
## lm(formula = sales ~ youtube + facebook, data = marketing) :
##
## dfb.1_ dfb.yotb dfb.fcbk dffit cov.r cook.d hat
## 6 -0.10 0.41 -0.43 -0.62_* 0.89_* 0.12 0.03
## 36 -0.01 -0.32 0.26 -0.44_* 0.95_* 0.06 0.03
## 127 -0.15 0.29 -0.20 -0.39_* 0.95_* 0.05 0.02
## 131 -0.36 0.73 -0.49 -0.95_* 0.66_* 0.26 0.03
## 179 -0.05 -0.29 0.28 -0.44_* 0.94_* 0.06 0.03
influencePlot(modelo)

## StudRes Hat CookD
## 6 -3.295069 0.03465182 0.12372141
## 131 -5.714235 0.02678270 0.25806539
## 176 1.341284 0.03011550 0.01854523