# R Forecasting Model with Multiple Regression Analysis, Mg. Ing. Ernesto D. Cancho-Rodríguez, MBA George Washington University

################################################################################
##############------- Statistical Essentials ------------#######################
################################################################################
# Autor: Mg. Ing. Ernesto D. Cancho-Rodríguez, MBA George Washington University
# Email: ecr@gwu.edu
# Tema: Modelos de Pronostico : Analisis Regresion Simple y Multiple
# Version: 1.0
#########################################################################

#---------------------------------------------------------
# Working directory.
# The disabled line below would switch to the folder of the active document
# (it relies on rstudioapi); it is left commented out, so the script only
# reports the current working directory.
# setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
getwd()

## [1] "/cloud/project"

#--------------------------------------------
############################################
# Analisis de Regresion Simple #############
############################################

# Librerias basicas para el estudio de series temporales

library(ggplot2)  # Graficas y visualizacion
library(TSA)      # Formato y trabajar con series de tiempo

## 
## Attaching package: 'TSA'

## The following objects are masked from 'package:stats':
## 
##     acf, arima

## The following object is masked from 'package:utils':
## 
##     tar

library(forecast) # Estimaciones y pronosticos de series de tiempo

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Registered S3 methods overwritten by 'forecast':
##   method       from
##   fitted.Arima TSA 
##   plot.Arima   TSA

library(scales)   # Preprocesamiento de los datos
library(stats)    # Preprocesamiento  mas pruebas estadisticas

# Leemos la data y nos hacemos las siguientes preguntas:

# Hipotesis!
# ¿La relacion es lineal?
# ¿Hay complementariedad entre los tipos de medio?
# ¿Existe relacion entre el presupuesto de marketing y las ventas?
# ¿Cuan fuerte es la relacion (si existe)?
# ¿Que canal contribuye mas a las ventas?
# ¿Cuan precisamente podemos estimar el efecto de cada uno de los tipos de medios sobre las ventas?
# ¿Cuan precisamente podemos predecir las ventas futuras?

library(readxl)
library(DataExplorer)

# Read the data set from the CSV file in the working directory.
data <- read.csv(file = "bostonvivienda.csv")

#--------------------------------------------------------
# 1. Missing-value detection
# DataExplorer::plot_missing() draws the missing-value profile of each column.
plot_missing(data = data)

# Univariate analysis: summary statistics for every column.
summary(data)

##       crim                zn             indus            nox        
##  Min.   : 0.00632   Min.   :  0.00   Min.   : 0.46   Min.   :0.3850  
##  1st Qu.: 0.08205   1st Qu.:  0.00   1st Qu.: 5.19   1st Qu.:0.4490  
##  Median : 0.25651   Median :  0.00   Median : 9.69   Median :0.5380  
##  Mean   : 3.61352   Mean   : 11.36   Mean   :11.14   Mean   :0.5547  
##  3rd Qu.: 3.67708   3rd Qu.: 12.50   3rd Qu.:18.10   3rd Qu.:0.6240  
##  Max.   :88.97620   Max.   :100.00   Max.   :27.74   Max.   :0.8710  
##        rm             edad             dis              rad        
##  Min.   :3.561   Min.   :  2.90   Min.   : 1.130   Min.   : 1.000  
##  1st Qu.:5.886   1st Qu.: 45.02   1st Qu.: 2.100   1st Qu.: 4.000  
##  Median :6.208   Median : 77.50   Median : 3.207   Median : 5.000  
##  Mean   :6.285   Mean   : 68.57   Mean   : 3.795   Mean   : 9.549  
##  3rd Qu.:6.623   3rd Qu.: 94.08   3rd Qu.: 5.188   3rd Qu.:24.000  
##  Max.   :8.780   Max.   :100.00   Max.   :12.127   Max.   :24.000  
##     impuesto        ptratio          negro            lstat      
##  Min.   :187.0   Min.   :12.60   Min.   :  0.32   Min.   : 1.73  
##  1st Qu.:279.0   1st Qu.:17.40   1st Qu.:375.38   1st Qu.: 6.95  
##  Median :330.0   Median :19.05   Median :391.44   Median :11.36  
##  Mean   :408.2   Mean   :18.46   Mean   :356.67   Mean   :12.65  
##  3rd Qu.:666.0   3rd Qu.:20.20   3rd Qu.:396.23   3rd Qu.:16.95  
##  Max.   :711.0   Max.   :22.00   Max.   :396.90   Max.   :37.97  
##       medv      
##  Min.   : 5.00  
##  1st Qu.:17.02  
##  Median :21.20  
##  Mean   :22.53  
##  3rd Qu.:25.00  
##  Max.   :50.00

# One box per column of `data`.
boxplot(data)

#--------------------------------------------------------
# 2. Bivariate analysis
# Pearson correlation

# Interpretation scale applied to the magnitude of r in the notes below:
# 0     < |r| < 0.30  weak
# 0.30  < |r| < 0.50  considerable (mild)
# 0.50  < |r| < 0.70  strong (moderate)
# 0.70  < |r| < 0.85  very strong (strong)
#         |r| > 0.85  almost perfectly linear (very strong)
# The sign gives the direction: r > 0 is a direct relation, r < 0 an inverse one.

cor(data$medv,data$crim,method = "pearson") # Inverse (negative) linear correlation, considerable (0.30-0.50).

## [1] -0.3883046

cor(data$medv,data$nox,method = "pearson")  # Inverse (negative) linear correlation, considerable (0.30-0.50).

## [1] -0.4273208

cor(data$medv,data$rm,method = "pearson")     # Direct (positive) linear correlation, strong (0.50-0.70).

## [1] 0.6953599

# Bivariate analysis: correlation matrix across all columns of `data`,
# displayed as numbers in the upper triangle.
library(corrplot)

## corrplot 0.92 loaded

correlacion <- cor(data)
corrplot(
  correlacion,
  method = "number",
  type = "upper"
)

library("PerformanceAnalytics")

## Loading required package: xts

## Loading required package: zoo

## 
## Attaching package: 'zoo'

## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric

## 
## ################################### WARNING ###################################
## # We noticed you have dplyr installed. The dplyr lag() function breaks how    #
## # base R's lag() function is supposed to work, which breaks lag(my_xts).      #
## #                                                                             #
## # If you call library(dplyr) later in this session, then calls to lag(my_xts) #
## # that you enter or source() into this session won't work correctly.          #
## #                                                                             #
## # All package code is unaffected because it is protected by the R namespace   #
## # mechanism.                                                                  #
## #                                                                             #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## # You can use stats::lag() to make sure you're not using dplyr::lag(), or you #
## # can add conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop   #
## # dplyr from breaking base R's lag() function.                                #
## ################################### WARNING ###################################

## 
## Attaching package: 'PerformanceAnalytics'

## The following objects are masked from 'package:TSA':
## 
##     kurtosis, skewness

## The following object is masked from 'package:graphics':
## 
##     legend

# PerformanceAnalytics correlation chart of every column, histograms enabled
# and solid points (pch 19).
chart.Correlation(R = data, histogram = TRUE, pch = 19)

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

library(psych)

## 
## Attaching package: 'psych'

## The following objects are masked from 'package:scales':
## 
##     alpha, rescale

## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha

# psych: pairs-panel view of all columns.
pairs.panels(x = data, scale = TRUE)

# corrplot: mixed correlation plot, variables ordered by hierarchical
# clustering, black text labels.
library(corrplot)
corrplot.mixed(
  corr = cor(data),
  order = "hclust",
  tl.col = "black"
)

library(GGally)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

# GGally: pairwise plot matrix of every column.
ggpairs(data = data)

# GGally: correlation heat map with labelled cells.
ggcorr(
  data = data,
  nbreaks = 8,
  palette = "RdGy",
  label = TRUE,
  label_size = 5,
  label_color = "white"
)

#--------------------------------------------------------
# 3. Colinealidad o Multicolinealidad
# correlacion2<-cor(data[,1:3])
# altaCorr <- findCorrelation(correlacion2, cutoff = .60, names=TRUE)
# altaCorr

# No deberia existir relacion entre las variables independientes.
# La SBS considera multicolinealidad a partir de 0.50.

#-------------------------------------------------------------------
# 4. Train/test split: 70 % of the rows for training, 30 % for testing
str(data)

## 'data.frame':    506 obs. of  13 variables:
##  $ crim    : num  0.00632 0.02731 0.02729 0.03237 0.06905 ...
##  $ zn      : num  18 0 0 0 0 0 12.5 12.5 12.5 12.5 ...
##  $ indus   : num  2.31 7.07 7.07 2.18 2.18 2.18 7.87 7.87 7.87 7.87 ...
##  $ nox     : num  0.538 0.469 0.469 0.458 0.458 0.458 0.524 0.524 0.524 0.524 ...
##  $ rm      : num  6.58 6.42 7.18 7 7.15 ...
##  $ edad    : num  65.2 78.9 61.1 45.8 54.2 58.7 66.6 96.1 100 85.9 ...
##  $ dis     : num  4.09 4.97 4.97 6.06 6.06 ...
##  $ rad     : int  1 2 2 3 3 3 5 5 5 5 ...
##  $ impuesto: int  296 242 242 222 222 222 311 311 311 311 ...
##  $ ptratio : num  15.3 17.8 17.8 18.7 18.7 18.7 15.2 15.2 15.2 15.2 ...
##  $ negro   : num  397 397 393 395 397 ...
##  $ lstat   : num  4.98 9.14 4.03 2.94 5.33 ...
##  $ medv    : num  24 21.6 34.7 33.4 36.2 28.7 22.9 27.1 16.5 18.9 ...

library(caret)

## Loading required package: lattice

# Fixed seed so the partition below is reproducible.
set.seed(seed = 2021)

# Training-row indices drawn by caret from the response `medv`.
index <- createDataPartition(y = data$medv, p = 0.7, list = FALSE)

# Rows selected by `index` form the training set; all the others are held out.
data.train <- data[index, ]    # 356 rows in the recorded run
data.test <- data[-index, ]    # 150 rows in the recorded run

#-------------------------------------------------------------------
# 5. Parametric models
# Simple linear regression (Y ~ X): `medv` explained by `rm` alone,
# fitted on the training sample.
m <- lm(formula = medv ~ rm, data = data.train)

# Model summary: coefficients, residual spread and goodness of fit.
summary(m)

## 
## Call:
## lm(formula = medv ~ rm, data = data.train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -22.7766  -2.6823   0.0688   3.1601  31.1278 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -33.3138     3.0718  -10.85   <2e-16 ***
## rm            8.8827     0.4854   18.30   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 6.61 on 354 degrees of freedom
## Multiple R-squared:  0.4861, Adjusted R-squared:  0.4847 
## F-statistic: 334.9 on 1 and 354 DF,  p-value: < 2.2e-16

# Predict on new records.
# NOTE(review): summary(data) shows `rm` only ranges from 3.561 to 8.780, so
# the values below lie far outside the observed range and the resulting
# predictions are pure extrapolation. They are kept as in the original demo.
x_nuevos <- data.frame(rm = c(14, 150, 100, 250))

predict(m, x_nuevos)

##          1          2          3          4 
##   91.04441 1299.09543  854.95903 2187.36825

# Predictions for the held-out test set.
pred <- predict(m, data.test)

# Compare predicted against actual values.
# forecast::accuracy() expects the forecasts first and the actual values
# second; the errors are computed as actual - forecast and the percentage
# errors are scaled by the actual values.
library(forecast)
accuracy(pred, data.test$medv)

## NOTE: the output below was recorded with the two arguments swapped; re-run
## to refresh it. RMSE and MAE do not depend on the order, ME changes sign,
## and MPE/MAPE change because they must be scaled by the actual values.
##                   ME    RMSE     MAE       MPE     MAPE
## Test set -0.07385454 6.63796 4.26086 -1.042913 20.01008

# Side-by-side comparison of actual and predicted test-set values.
# The column names are written to the CSV header, so they are left unchanged.
Comparacion <- data.frame(
  VentasReales = data.test$medv,
  VentasEstimadas = round(pred, 1)
)

# Export to CSV without the row-name column.
# FALSE is spelled out: `F` is an ordinary variable that can be reassigned.
write.csv(
  Comparacion,
  "Comparativa_Mod_Regresion.csv",
  row.names = FALSE
)


# Store the fitted values and the residuals of `m` next to the training rows.
data.train$fitted <- m$fitted.values
data.train$residual <- m$residuals

# Scatter of the training data (red), fitted regression line (blue) and a grey
# vertical segment from every observation to its fitted value.
# - labs() takes `x` / `y`; the previous `xlab` / `ylab` arguments did not set
#   the axis titles.
# - The segment colour is a constant, so the former aes(color = "Distancia")
#   mapping, which the constant overrode, is dropped.
ggplot(data = data.train, aes(x = rm, y = medv)) +
  geom_point(color = "red") +
  geom_line(aes(y = fitted), color = "blue") +
  geom_segment(
    aes(x = rm, xend = rm, y = medv, yend = fitted),
    color = "grey80"
  ) +
  labs(x = "Número de habitaciones por vivienda", y = "Precio mediano") +
  theme_bw()

# Persist the fitted model to disk.
saveRDS(object = m, file = "Modelo_Regresion.rds")

# Deployment: load the stored model back and use it on a new record.
m_implementacion <- readRDS(file = "Modelo_Regresion.rds")

# New record to score.
x_nuevos <- data.frame(rm = 84)

# Prediction from the reloaded model.
predict(object = m_implementacion, newdata = x_nuevos)

##        1 
## 712.8354

##############################################
# MULTIPLE REGRESSION ANALYSIS ###############
##############################################

#---------------------------------------------------------
library(readxl)
# Fresh copy of the data set for the multiple-regression section.
data2 <- read.csv("bostonvivienda.csv")

#-------------------------------------------------------------------
# Train/test split: 70 % of the rows for training, 30 % for testing.
# (This header, the read and the seed were previously repeated twice in a row;
# one copy is enough because repeating them does not change the result.)
library(caret)
set.seed(2021)

index       <- createDataPartition(data2$medv, p = 0.7, list = FALSE)
# Sizes in the recorded run: summary(mm) reports 343 residual degrees of
# freedom with 13 coefficients, i.e. 356 training rows out of 506.
data.train2 <- data2[ index, ]            # 356 training rows
data.test2  <- data2[-index, ]            # 150 test rows

# Multiple linear regression: `medv` explained by every other column of the
# training sample (the `.` in the formula).
mm <- lm(formula = medv ~ ., data = data.train2)

# Model summary: coefficients, residual spread and goodness of fit.
summary(mm)

## 
## Call:
## lm(formula = medv ~ ., data = data.train2)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -13.1086  -2.8061  -0.4131   2.0603  26.7825 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  3.614e+01  5.911e+00   6.114 2.64e-09 ***
## crim        -1.088e-01  4.893e-02  -2.223 0.026838 *  
## zn           5.467e-02  1.693e-02   3.228 0.001365 ** 
## indus        4.287e-02  7.269e-02   0.590 0.555675    
## nox         -1.682e+01  4.531e+00  -3.713 0.000239 ***
## rm           3.926e+00  4.824e-01   8.140 7.37e-15 ***
## edad         7.849e-04  1.544e-02   0.051 0.959494    
## dis         -1.519e+00  2.459e-01  -6.176 1.86e-09 ***
## rad          3.306e-01  8.174e-02   4.044 6.48e-05 ***
## impuesto    -1.554e-02  4.633e-03  -3.354 0.000886 ***
## ptratio     -9.569e-01  1.555e-01  -6.153 2.12e-09 ***
## negro        9.406e-03  3.357e-03   2.802 0.005365 ** 
## lstat       -5.081e-01  6.076e-02  -8.364 1.55e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.781 on 343 degrees of freedom
## Multiple R-squared:  0.7395, Adjusted R-squared:  0.7304 
## F-statistic: 81.14 on 12 and 343 DF,  p-value: < 2.2e-16

# Predictions of the multiple-regression model for the held-out test set.
pred2 <- predict(mm, data.test2)

# Compare predicted against actual values.
# forecast::accuracy() expects the forecasts first and the actual values
# second; the errors are computed as actual - forecast and the percentage
# errors are scaled by the actual values.
library(forecast)
accuracy(pred2, data.test2$medv)

## NOTE: the output below was recorded with the two arguments swapped; re-run
## to refresh it. RMSE and MAE do not depend on the order, ME changes sign,
## and MPE/MAPE change because they must be scaled by the actual values.
##                 ME     RMSE      MAE       MPE     MAPE
## Test set 0.0885859 4.818987 3.378731 0.9444371 18.76841

# Actual vs. predicted test-set values, exported to CSV.
Comparacion2 <- data.frame(data.test2$medv, pred2)
write.csv(x = Comparacion2, file = "Comparativa_Mod_Regresion2.csv")

# Keep the residuals and the fitted values of `mm` next to the training rows
# (columns are added in the same order as before: fitted first).
data.train2[["fitted"]] <- mm$fitted.values
data.train2[["residual"]] <- mm$residuals


### FIN ####