################################################################################
##############------- Statistical Essentials ------------#######################
################################################################################
# Autor: Mg. Ing. Ernesto D. Cancho-RodrÃguez, MBA George Washington University
# Email: ecr@gwu.edu
# Tema: Modelos de Pronostico : Analisis Regresion Simple y Multiple
# Version: 1.0
#########################################################################
#---------------------------------------------------------
# Cambiar el directorio de trabajo
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
getwd()
## [1] "/cloud/project"
#--------------------------------------------
############################################
# Analisis de Regresion Simple #############
############################################
# Librerias basicas para el estudio de series temporales
library(ggplot2) # Graficas y visualizacion
library(TSA) # Formato y trabajar con series de tiempo
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(forecast) # Estimaciones y pronosticos de series de tiempo
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## fitted.Arima TSA
## plot.Arima TSA
library(scales) # Preprocesamiento de los datos
library(stats) # Preprocesamiento mas pruebas estadisticas
# Leemos la data y nos hacemos las siguientes preguntas:
# Hipotesis!
# ?La relacion es lineal?
# ?Hay complementariedad entre los tipos de medio?
# ?Existe relacion entre el presupuesto de marketing y las ventas?
# ?Cuan fuerte es la relacion (si existe)?
# ?Que canal contribuye mas a las ventas?
# ?Cuan precisamente podemos estimar el efecto de cada uno de los tipos de medios sobre las ventas?
# ?Cuan recisamente podemos predecir las ventas futuras?
library(readxl)
data <- read.csv("bostonvivienda.csv")
#--------------------------------------------------------
# 1. Deteccion de valores perdidos
# Deteccion de valores perdidos con el paquete DataExplorer
library(DataExplorer)
plot_missing(data)

# Analisis Univariado de la data
summary(data)
## crim zn indus nox
## Min. : 0.00632 Min. : 0.00 Min. : 0.46 Min. :0.3850
## 1st Qu.: 0.08205 1st Qu.: 0.00 1st Qu.: 5.19 1st Qu.:0.4490
## Median : 0.25651 Median : 0.00 Median : 9.69 Median :0.5380
## Mean : 3.61352 Mean : 11.36 Mean :11.14 Mean :0.5547
## 3rd Qu.: 3.67708 3rd Qu.: 12.50 3rd Qu.:18.10 3rd Qu.:0.6240
## Max. :88.97620 Max. :100.00 Max. :27.74 Max. :0.8710
## rm edad dis rad
## Min. :3.561 Min. : 2.90 Min. : 1.130 Min. : 1.000
## 1st Qu.:5.886 1st Qu.: 45.02 1st Qu.: 2.100 1st Qu.: 4.000
## Median :6.208 Median : 77.50 Median : 3.207 Median : 5.000
## Mean :6.285 Mean : 68.57 Mean : 3.795 Mean : 9.549
## 3rd Qu.:6.623 3rd Qu.: 94.08 3rd Qu.: 5.188 3rd Qu.:24.000
## Max. :8.780 Max. :100.00 Max. :12.127 Max. :24.000
## impuesto ptratio negro lstat
## Min. :187.0 Min. :12.60 Min. : 0.32 Min. : 1.73
## 1st Qu.:279.0 1st Qu.:17.40 1st Qu.:375.38 1st Qu.: 6.95
## Median :330.0 Median :19.05 Median :391.44 Median :11.36
## Mean :408.2 Mean :18.46 Mean :356.67 Mean :12.65
## 3rd Qu.:666.0 3rd Qu.:20.20 3rd Qu.:396.23 3rd Qu.:16.95
## Max. :711.0 Max. :22.00 Max. :396.90 Max. :37.97
## medv
## Min. : 5.00
## 1st Qu.:17.02
## Median :21.20
## Mean :22.53
## 3rd Qu.:25.00
## Max. :50.00
boxplot(data)

#--------------------------------------------------------
# 2. Analisis Bivariado de la data
# Correlacion de Pearson
# 0 < r < 0.30 Debil
# 0.30 < r < 0.50 Considerable (Leve)
# 0.50 < r < 0.70 Fuerte (Moderada)
# 0.70 < r < 0.85 Muy fuerte (Fuerte)
# r > 0.85 Linealidad casi perfecta (Muy fuerte)
cor(data$medv,data$crim,method = "pearson") # Cor. lineal directa pero debil.
## [1] -0.3883046
cor(data$medv,data$nox,method = "pearson") # Cor. lineal directa y fuerte.
## [1] -0.4273208
cor(data$medv,data$rm,method = "pearson") # Cor. lineal directa y muy fuerte.
## [1] 0.6953599
# Analisis Bivariado de la data
correlacion<-cor(data)
library(corrplot)
## corrplot 0.92 loaded
corrplot(correlacion, method="number", type="upper")

library("PerformanceAnalytics")
## Loading required package: xts
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
## ################################### WARNING ###################################
## # We noticed you have dplyr installed. The dplyr lag() function breaks how #
## # base R's lag() function is supposed to work, which breaks lag(my_xts). #
## # #
## # If you call library(dplyr) later in this session, then calls to lag(my_xts) #
## # that you enter or source() into this session won't work correctly. #
## # #
## # All package code is unaffected because it is protected by the R namespace #
## # mechanism. #
## # #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## # You can use stats::lag() to make sure you're not using dplyr::lag(), or you #
## # can add conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## ################################### WARNING ###################################
##
## Attaching package: 'PerformanceAnalytics'
## The following objects are masked from 'package:TSA':
##
## kurtosis, skewness
## The following object is masked from 'package:graphics':
##
## legend
chart.Correlation(data, histogram=TRUE, pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter

library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:scales':
##
## alpha, rescale
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
pairs.panels(data, scale=TRUE)

library(corrplot)
corrplot.mixed(cor(data), order="hclust", tl.col="black")

library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
ggpairs(data)

ggcorr(data, nbreaks=8, palette='RdGy', label=TRUE, label_size=5, label_color='white')

#--------------------------------------------------------
# 3. Colinealidad o Multicolinealidad
# correlacion2<-cor(data[,1:3])
# altaCorr <- findCorrelation(correlacion2, cutoff = .60, names=TRUE)
# altaCorr
# No deberia existir relacion entre las variables independientes.
# La SBS considera multicolinealidad a partir de 0.50.
#-------------------------------------------------------------------
# 4. Seleccion de muestra de entrenamiento (70%) y de prueba (30%)
str(data)
## 'data.frame': 506 obs. of 13 variables:
## $ crim : num 0.00632 0.02731 0.02729 0.03237 0.06905 ...
## $ zn : num 18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
## $ indus : num 2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
## $ nox : num 0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
## $ rm : num 6.58 6.42 7.18 7 7.15 ...
## $ edad : num 65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
## $ dis : num 4.09 4.97 4.97 6.06 6.06 ...
## $ rad : int 1 2 2 3 3 3 5 5 5 5 ...
## $ impuesto: int 296 242 242 222 222 222 311 311 311 311 ...
## $ ptratio : num 15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
## $ negro : num 397 397 393 395 397 ...
## $ lstat : num 4.98 9.14 4.03 2.94 5.33 ...
## $ medv : num 24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...
library(caret)
## Loading required package: lattice
set.seed(2021) # Semilla aleatoria!
index <- createDataPartition(data$medv, p=0.7, list=FALSE)
data.train <- data[ index, ] # 356 datos de entrenamiento
data.test <- data[-index, ] # 150 datos de testing
#-------------------------------------------------------------------
# 5. Modelos Parametricos
# Ajustamos un modelo lineal entre las ventas y el monto invertido en publicidad por TV
m <- lm(medv ~ rm,
data = data.train)
# Y ~ X Regresion Lineal Simple
# Vemos un resumen del modelo
summary(m)
##
## Call:
## lm(formula = medv ~ rm, data = data.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -22.7766 -2.6823 0.0688 3.1601 31.1278
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -33.3138 3.0718 -10.85 <2e-16 ***
## rm 8.8827 0.4854 18.30 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.61 on 354 degrees of freedom
## Multiple R-squared: 0.4861, Adjusted R-squared: 0.4847
## F-statistic: 334.9 on 1 and 354 DF, p-value: < 2.2e-16
# Predecir sobre nuevos registros
x_nuevos<-data.frame(rm=c(14,150,100,250))
# El objetivo final es el pronostico o prediccion
predict(m,x_nuevos)
## 1 2 3 4
## 91.04441 1299.09543 854.95903 2187.36825
# El objetivo final es el pronostico o prediccion
pred <- predict(m,data.test)
# Comparamos los valores reales y predichos
library(forecast)
accuracy(data.test$medv,pred)
## ME RMSE MAE MPE MAPE
## Test set -0.07385454 6.63796 4.26086 -1.042913 20.01008
# Metodologia comparacion
Comparacion <- data.frame(VentasReales = data.test$medv,
VentasEstimadas = round(pred,1))
# Exportar csv
write.csv(Comparacion,
"Comparativa_Mod_Regresion.csv",row.names = F)
# Obtenemos los valores ajustados o predichos
data.train$fitted <- m$fitted.values
# Podemos ver también los residuales
data.train$residual <- m$residuals
ggplot(data = data.train, aes(x = rm, y = medv)) + geom_point(color = "red") +
geom_line(aes(y = fitted), color = "blue") +
geom_segment(aes(x = rm, xend = rm, y = medv, yend = fitted, color="Distancia"), color = "grey80") +
labs(xlab = "Número de habitaciones por vivienda", ylab = "Precio mediano") +
theme_bw()

# Guardar un Modelo Predictivo
saveRDS(m,"Modelo_Regresion.rds")
# Implemento el modelo!
# Predecir sobre nuevos registros
x_nuevos<-data.frame(rm=c(84))
# Utilizar un modelo desarrollado!
m_implementacion <-readRDS("Modelo_Regresion.rds")
# El objetivo final es el pronostico o prediccion
predict(m_implementacion,x_nuevos)
## 1
## 712.8354
##############################################
# ANALISIS DE REGRESION MULTIPLE#############
##############################################
#---------------------------------------------------------
library(readxl)
data2 <- read.csv("bostonvivienda.csv")
#-------------------------------------------------------------------
# Seleccion de muestra de entrenamiento (70%) y de prueba (30%)
library(caret)
set.seed(2021)
##############################################
# ANALISIS DE REGRESION MULTIPLE#############
##############################################
#---------------------------------------------------------
library(readxl)
data2 <- read.csv("bostonvivienda.csv")
#-------------------------------------------------------------------
# Seleccion de muestra de entrenamiento (70%) y de prueba (30%)
library(caret)
set.seed(2021)
index <- createDataPartition(data2$medv, p=0.7, list=FALSE)
data.train2 <- data2[ index, ] # 142 datos trainig
data.test2 <- data2[-index, ] # 58 datos testing
# Ajustamos un modelo lineal entre las ventas y el monto invertido en publicidad por TV
mm <- lm(medv ~ . , data = data.train2)
# Vemos un resumen del modelo
summary(mm)
##
## Call:
## lm(formula = medv ~ ., data = data.train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -13.1086 -2.8061 -0.4131 2.0603 26.7825
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.614e+01 5.911e+00 6.114 2.64e-09 ***
## crim -1.088e-01 4.893e-02 -2.223 0.026838 *
## zn 5.467e-02 1.693e-02 3.228 0.001365 **
## indus 4.287e-02 7.269e-02 0.590 0.555675
## nox -1.682e+01 4.531e+00 -3.713 0.000239 ***
## rm 3.926e+00 4.824e-01 8.140 7.37e-15 ***
## edad 7.849e-04 1.544e-02 0.051 0.959494
## dis -1.519e+00 2.459e-01 -6.176 1.86e-09 ***
## rad 3.306e-01 8.174e-02 4.044 6.48e-05 ***
## impuesto -1.554e-02 4.633e-03 -3.354 0.000886 ***
## ptratio -9.569e-01 1.555e-01 -6.153 2.12e-09 ***
## negro 9.406e-03 3.357e-03 2.802 0.005365 **
## lstat -5.081e-01 6.076e-02 -8.364 1.55e-15 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 4.781 on 343 degrees of freedom
## Multiple R-squared: 0.7395, Adjusted R-squared: 0.7304
## F-statistic: 81.14 on 12 and 343 DF, p-value: < 2.2e-16
# El objetivo final es el pronostico o prediccion
pred2 <- predict(mm,data.test2)
# Comparamos los valores reales y predichos
library(forecast)
accuracy(data.test2$medv,pred2)
## ME RMSE MAE MPE MAPE
## Test set 0.0885859 4.818987 3.378731 0.9444371 18.76841
# Metodologia comparacion!
Comparacion2 <- data.frame(data.test2$medv,pred2)
write.csv(Comparacion2,"Comparativa_Mod_Regresion2.csv")
# Obtenemos los valores ajustados o predichos
data.train2$fitted <- mm$fitted.values
# Podemos ver también los residuales
data.train2$residual <- mm$residuals
### FIN ####