options(scipen = 999)
library(lmtest)
## Warning: package 'lmtest' was built under R version 4.5.2
## Cargando paquete requerido: zoo
## Warning: package 'zoo' was built under R version 4.5.2
##
## Adjuntando el paquete: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(stargazer)
## Warning: package 'stargazer' was built under R version 4.5.2
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
library(equatiomatic) # optativo remotes::install_github("datalorax/equatiomatic")
## Warning: package 'equatiomatic' was built under R version 4.5.3
##
## Adjuntando el paquete: 'equatiomatic'
## The following object is masked from 'package:datasets':
##
## penguins
library(mlbench) #esta librería tiene el data frame BostonHousing
## Warning: package 'mlbench' was built under R version 4.5.3
data(BostonHousing)
#Modelo estimado medv~. indica "medv" en función del resto de variables del dataframe
modelo_boston<-lm(formula = medv~.,data=BostonHousing)
extract_eq(modelo_boston,wrap = TRUE) #optativo
\[ \begin{aligned} \operatorname{medv} &= \alpha + \beta_{1}(\operatorname{crim}) + \beta_{2}(\operatorname{zn}) + \beta_{3}(\operatorname{indus})\ + \\ &\quad \beta_{4}(\operatorname{chas}_{\operatorname{1}}) + \beta_{5}(\operatorname{nox}) + \beta_{6}(\operatorname{rm}) + \beta_{7}(\operatorname{age})\ + \\ &\quad \beta_{8}(\operatorname{dis}) + \beta_{9}(\operatorname{rad}) + \beta_{10}(\operatorname{tax}) + \beta_{11}(\operatorname{ptratio})\ + \\ &\quad \beta_{12}(\operatorname{b}) + \beta_{13}(\operatorname{lstat}) + \epsilon \end{aligned} \]
coeftest(modelo_boston)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 36.45948839 5.10345881 7.1441 0.0000000000032834 ***
## crim -0.10801136 0.03286499 -3.2865 0.0010868 **
## zn 0.04642046 0.01372746 3.3816 0.0007781 ***
## indus 0.02055863 0.06149569 0.3343 0.7382881
## chas1 2.68673382 0.86157976 3.1184 0.0019250 **
## nox -17.76661123 3.81974371 -4.6513 0.0000042456438076 ***
## rm 3.80986521 0.41792525 9.1161 < 0.00000000000000022 ***
## age 0.00069222 0.01320978 0.0524 0.9582293
## dis -1.47556685 0.19945473 -7.3980 0.0000000000006013 ***
## rad 0.30604948 0.06634644 4.6129 0.0000050705290227 ***
## tax -0.01233459 0.00376054 -3.2800 0.0011116 **
## ptratio -0.95274723 0.13082676 -7.2825 0.0000000000013088 ***
## b 0.00931168 0.00268596 3.4668 0.0005729 ***
## lstat -0.52475838 0.05071528 -10.3471 < 0.00000000000000022 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(stargazer)
#Data para la predicción X'm
X_m<-data.frame(crim=0.05,zn=15,indus=2,chas="0",nox=0.004,
rm=5,age=85,dis=5.56,rad=2,tax=300,ptratio=17,b=0.00005,lstat=5)
# Intervalos de Confianza del 95% y del 99%
confidense<-c(0.95,0.99)
#Predicción usando predict
predict(object = modelo_boston,
newdata = X_m,
interval = "prediction",
level = confidense,
se.fit =TRUE)->predicciones
rownames(predicciones$fit)<-as.character(confidense*100)
colnames(predicciones$fit)<-c("Ym","Li","Ls")
stargazer(predicciones$fit,
title = "Pronósticos e intervalos de confianza",
type = "text") #Poner results='asis' en opciones del chunk
95 26.116 15.558 36.673 99 26.116 12.221 40.010 ———————– ## Usando la libreria Forescat
library(forecast)
## Warning: package 'forecast' was built under R version 4.5.3
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.5.3
#Data para la predicción X'm
X_m<-data.frame(crim=0.05,zn=15,indus=2,chas="0",nox=0.004,
rm=5,age=85,dis=5.56,rad=2,tax=300,ptratio=17,b=0.00005,lstat=5)
#Nivel de confianza para el intervalo de confianza
confidense<-c(0.95,0.99)
#Realizando el pronóstico con forecast
pronosticos<-forecast(object = modelo_boston,
level = confidense,
newdata = X_m,ts = FALSE)
kable(pronosticos,
caption = "Pronóstico e intervalos de confianza:",
digits = 2,format = "html") #Poner results='asis' en opciones del chunk
| Point Forecast | Lo 95 | Hi 95 | Lo 99 | Hi 99 |
|---|---|---|---|---|
| 26.12 | 15.56 | 36.67 | 12.22 | 40.01 |
#Bias Proportion
Um<-function(pronosticado,observado){
library(DescTools)
((mean(pronosticado)-mean(observado))^2)/MSE(pronosticado,observado)
}
#Variance Proportion
Us<-function(pronosticado,observado){
library(DescTools)
((sd(pronosticado)-sd(observado))^2)/MSE(pronosticado,observado)
}
#Covariance Proportion
Uc<-function(pronosticado,observado){
library(DescTools)
(2*(1-cor(pronosticado,observado))*sd(pronosticado)*sd(observado))/MSE(pronosticado,observado)}
#Coeficiente U de Theil (también aparece en la librería "DescTools")
THEIL_U<-function(pronosticado,observado){
library(DescTools)
RMSE(pronosticado,observado)/(sqrt(mean(pronosticado^2))+sqrt(mean(observado^2)))
}
# =====================================================================
# PARTE 1: REPRODUCCIÓN DE LA SIMULACIÓN DE LA PRESENTACIÓN
# =====================================================================
options(scipen = 99)
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.3
##
## Adjuntando el paquete: 'dplyr'
## The following object is masked from 'package:kableExtra':
##
## group_rows
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(caret)
## Warning: package 'caret' was built under R version 4.5.3
## Cargando paquete requerido: ggplot2
## Warning: package 'ggplot2' was built under R version 4.5.3
## Cargando paquete requerido: lattice
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.5.3
##
## Adjuntando el paquete: 'DescTools'
## The following objects are masked from 'package:caret':
##
## MAE, RMSE
## The following object is masked from 'package:forecast':
##
## BoxCox
library(stargazer)
library(mlbench)
# Cargar datos de la presentación
data(BostonHousing)
# Definición de las funciones de Theil de tu clase
Um <- function(pronosticado, observado){
((mean(pronosticado) - mean(observado))^2) / MSE(pronosticado, observado)
}
Us <- function(pronosticado, observado){
((sd(pronosticado) - sd(observado))^2) / MSE(pronosticado, observado)
}
Uc <- function(pronosticado, observado){
(2 * (1 - cor(pronosticado, observado)) * sd(pronosticado) * sd(observado)) / MSE(pronosticado, observado)
}
# Configuración de parámetros
set.seed(50)
numero_de_muestras <- 1000
# Partición del 80% para entrenamiento
muestras <- BostonHousing$medv %>%
createDataPartition(p = 0.8, times = numero_de_muestras, list = TRUE)
# Inicializar listas contenedoras
Modelos_Entrenamiento <- vector(mode = "list", length = numero_de_muestras)
Pronostico_Prueba <- vector(mode = "list", length = numero_de_muestras)
Resultados_Performance_data_entrenamiento <- vector(mode = "list", length = numero_de_muestras)
Resultados_Performance <- vector(mode = "list", length = numero_de_muestras)
# Bucle de la simulación
for (j in 1:numero_de_muestras){
Datos_Entrenamiento <- BostonHousing[muestras[[j]], ]
Datos_Prueba <- BostonHousing[-muestras[[j]], ]
# Modelo de la fórmula: medv en función de todas las variables
Modelos_Entrenamiento[[j]] <- lm(formula = medv ~ ., data = Datos_Entrenamiento)
modelo_actual <- Modelos_Entrenamiento[[j]]
Pronostico_Prueba[[j]] <- predict(modelo_actual, newdata = Datos_Prueba)
# Métricas entrenamiento
Resultados_Performance_data_entrenamiento[[j]] <- data.frame(
R2 = R2(modelo_actual$fitted.values, Datos_Entrenamiento$medv),
RMSE = RMSE(modelo_actual$fitted.values, Datos_Entrenamiento$medv),
MAE = MAE(modelo_actual$fitted.values, Datos_Entrenamiento$medv),
MAPE = MAPE(modelo_actual$fitted.values, Datos_Entrenamiento$medv) * 100,
THEIL = TheilU(modelo_actual$fitted.values, Datos_Entrenamiento$medv, type = 1),
Um = Um(modelo_actual$fitted.values, Datos_Entrenamiento$medv),
Us = Us(modelo_actual$fitted.values, Datos_Entrenamiento$medv),
Uc = Uc(modelo_actual$fitted.values, Datos_Entrenamiento$medv)
)
# Métricas prueba
Resultados_Performance[[j]] <- data.frame(
R2 = R2(Pronostico_Prueba[[j]], Datos_Prueba$medv),
RMSE = RMSE(Pronostico_Prueba[[j]], Datos_Prueba$medv),
MAE = MAE(Pronostico_Prueba[[j]], Datos_Prueba$medv),
MAPE = MAPE(Pronostico_Prueba[[j]], Datos_Prueba$medv) * 100,
THEIL = TheilU(Pronostico_Prueba[[j]], Datos_Prueba$medv, type = 1),
Um = Um(Pronostico_Prueba[[j]], Datos_Prueba$medv),
Us = Us(Pronostico_Prueba[[j]], Datos_Prueba$medv),
Uc = Uc(Pronostico_Prueba[[j]], Datos_Prueba$medv)
)
}
# Consolidación de tablas resumen estadísticas
bind_rows(Resultados_Performance_data_entrenamiento) %>%
stargazer(title = "Performance - Datos de Entrenamiento (Boston)", type = "text", digits = 3)
##
## Performance - Datos de Entrenamiento (Boston)
## =============================================
## Statistic N Mean St. Dev. Min Max
## ---------------------------------------------
## R2 1,000 0.743 0.013 0.713 0.794
## RMSE 1,000 4.653 0.141 4.177 4.948
## MAE 1,000 3.265 0.095 2.905 3.512
## MAPE 1,000 16.387 0.464 14.813 17.691
## THEIL 1,000 0.096 0.003 0.087 0.102
## Um 1,000 0.000 0.000 0 0
## Us 1,000 0.074 0.004 0.058 0.085
## Uc 1,000 0.928 0.004 0.918 0.945
## ---------------------------------------------
bind_rows(Resultados_Performance) %>%
stargazer(title = "Performance - Datos Desconocidos / Prueba (Boston)", type = "text", digits = 3)
##
## Performance - Datos Desconocidos / Prueba (Boston)
## ==============================================
## Statistic N Mean St. Dev. Min Max
## ----------------------------------------------
## R2 1,000 0.723 0.056 0.452 0.840
## RMSE 1,000 4.862 0.575 3.465 6.961
## MAE 1,000 3.411 0.281 2.633 4.492
## MAPE 1,000 17.197 1.618 12.875 23.137
## THEIL 1,000 0.101 0.012 0.073 0.148
## Um 1,000 0.011 0.016 0.000 0.205
## Us 1,000 0.081 0.066 0.00000 0.333
## Uc 1,000 0.918 0.066 0.667 1.010
## ----------------------------------------------
# =====================================================================
# PARTE 2: SIMULACIÓN CON DATOS HAC - 500 RÉPLICAS
# =====================================================================
load("C:/Users/MINEDUCYT/Downloads/smoke.RData")
summary(data)
## educ cigpric white age
## Min. : 6.00 Min. :44.00 Min. :0.0000 Min. :17.00
## 1st Qu.:10.00 1st Qu.:58.14 1st Qu.:1.0000 1st Qu.:28.00
## Median :12.00 Median :61.05 Median :1.0000 Median :38.00
## Mean :12.47 Mean :60.30 Mean :0.8786 Mean :41.24
## 3rd Qu.:13.50 3rd Qu.:63.18 3rd Qu.:1.0000 3rd Qu.:54.00
## Max. :18.00 Max. :70.13 Max. :1.0000 Max. :88.00
## income cigs restaurn lincome
## Min. : 500 Min. : 0.000 Min. :0.0000 Min. : 6.215
## 1st Qu.:12500 1st Qu.: 0.000 1st Qu.:0.0000 1st Qu.: 9.433
## Median :20000 Median : 0.000 Median :0.0000 Median : 9.903
## Mean :19305 Mean : 8.686 Mean :0.2466 Mean : 9.687
## 3rd Qu.:30000 3rd Qu.:20.000 3rd Qu.:0.0000 3rd Qu.:10.309
## Max. :30000 Max. :80.000 Max. :1.0000 Max. :10.309
## agesq lcigpric
## Min. : 289 Min. :3.784
## 1st Qu.: 784 1st Qu.:4.063
## Median :1444 Median :4.112
## Mean :1990 Mean :4.096
## 3rd Qu.:2916 3rd Qu.:4.146
## Max. :7744 Max. :4.250
library(stargazer)
library(htmltools)
## Warning: package 'htmltools' was built under R version 4.5.2
modelo_cigs <- lm(cigs ~ cigpric + lcigpric + income + lincome + age + agesq + educ + white + restaurn, data = data)
stargazer(modelo_cigs,title = "modelo estimado",type = "text")
##
## modelo estimado
## ===============================================
## Dependent variable:
## ---------------------------
## cigs
## -----------------------------------------------
## cigpric 2.002
## (1.493)
##
## lcigpric -115.273
## (85.424)
##
## income -0.00005
## (0.0001)
##
## lincome 1.404
## (1.708)
##
## age 0.778***
## (0.161)
##
## agesq -0.009***
## (0.002)
##
## educ -0.495***
## (0.168)
##
## white -0.531
## (1.461)
##
## restaurn -2.644**
## (1.130)
##
## Constant 340.804
## (260.016)
##
## -----------------------------------------------
## Observations 807
## R2 0.055
## Adjusted R2 0.044
## Residual Std. Error 13.413 (df = 797)
## F Statistic 5.169*** (df = 9; 797)
## ===============================================
## Note: *p<0.1; **p<0.05; ***p<0.01
extract_eq(modelo_cigs, wrap = TRUE)
\[ \begin{aligned} \operatorname{cigs} &= \alpha + \beta_{1}(\operatorname{cigpric}) + \beta_{2}(\operatorname{lcigpric}) + \beta_{3}(\operatorname{income})\ + \\ &\quad \beta_{4}(\operatorname{lincome}) + \beta_{5}(\operatorname{age}) + \beta_{6}(\operatorname{agesq}) + \beta_{7}(\operatorname{educ})\ + \\ &\quad \beta_{8}(\operatorname{white}) + \beta_{9}(\operatorname{restaurn}) + \epsilon \end{aligned} \]
coeftest(modelo_cigs)
##
## t test of coefficients:
##
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 340.804374604 260.015587269 1.3107 0.190334
## cigpric 2.002267667 1.492831189 1.3413 0.180220
## lcigpric -115.273464445 85.424315195 -1.3494 0.177585
## income -0.000046194 0.000133491 -0.3460 0.729402
## lincome 1.404061178 1.708165841 0.8220 0.411340
## age 0.778359013 0.160555612 4.8479 0.0000015001 ***
## agesq -0.009150353 0.001749292 -5.2309 0.0000002158 ***
## educ -0.494780616 0.168180198 -2.9420 0.003356 **
## white -0.531051635 1.460721806 -0.3636 0.716287
## restaurn -2.644241351 1.129998690 -2.3400 0.019528 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
library(stargazer)
#Data para la predicción X'm
X_m2 <- data.frame(cigpric = 100,
lcigpric = 4.60,
income = 25000,
lincome = 10.12,
age = 40,
agesq = 1600,
educ = 12,
white = 1,
restaurn = 0)
# Intervalos de Confianza del 95% y del 99%
confidense<-c(0.95,0.99)
#Predicción usando predict
predict(object = modelo_cigs,
newdata = X_m2,
interval = "prediction",
level = confidense,
se.fit =TRUE)->predicciones
rownames(predicciones$fit)<-as.character(confidense*100)
colnames(predicciones$fit)<-c("Ym","Li","Ls")
stargazer(predicciones$fit,
title = "Pronósticos e intervalos de confianza",
type = "text") #Poner results='asis' en opciones del chunk
##
## Pronósticos e intervalos de confianza
## ========================
## Ym Li Ls
## ------------------------
## 95 33.853 -8.004 75.710
## 99 33.853 -21.205 88.910
## ------------------------
# Nivel de confianza para el intervalo de confianza
confidense <- c(0.95,0.99)
# Realizando el pronóstico con forecast
pronosticos <- forecast(object = modelo_cigs,
level = confidense,
newdata = X_m2,
ts = FALSE)
kable(pronosticos,
caption = "Pronóstico e intervalos de confianza:",
digits = 2,
format = "html")
| Point Forecast | Lo 95 | Hi 95 | Lo 99 | Hi 99 |
|---|---|---|---|---|
| 33.85 | -8 | 75.71 | -21.2 | 88.91 |
options(scipen = 999999) # No mostrar notación científica
## Warning in options(scipen = 999999): invalid 'scipen' 999999, used 9999
# 1. Cargar librerías necesarias
library(dplyr) # Para manejo de datos y activar el operador pipe %>%
library(caret) # Permite realizar el muestreo particionado
library(DescTools) # Contiene la función TheilU
library(stargazer) # Para formatos y resúmenes estadísticos
# =========================================================================
# ASIGNACIÓN EN BASE A TU MODELO REAL
# =========================================================================
dataset_trabajo <- data # Tu base de datos se llama 'data'
set.seed(50) # Fijar la semilla aleatoria para reproducibilidad
numero_de_muestras <- 500
# Se crea la lista con las 1000 muestras basadas en tu variable real: cigs
muestras <- dataset_trabajo$cigs %>%
createDataPartition(p = 0.75,
times = numero_de_muestras,
list = TRUE)
# Listas vacías para almacenar los resultados de la simulación
Modelos_Entrenamiento <- vector(mode = "list", length = numero_de_muestras)
Pronostico_Prueba <- vector(mode = "list", length = numero_de_muestras)
Resultados_Performance_data_entrenamiento <- vector(mode = "list", length = numero_de_muestras)
Resultados_Performance <- vector(mode = "list", length = numero_de_muestras)
# Estimación de los modelos lineales, pronósticos y cálculo de métricas de performance
for(j in 1:numero_de_muestras){
# Partición de datos en Entrenamiento (80%) y Prueba (20%)
Datos_Entrenamiento <- dataset_trabajo[muestras[[j]], ]
Datos_Prueba <- dataset_trabajo[-muestras[[j]], ]
# Estimación del modelo con tus variables exactas
Modelos_Entrenamiento[[j]] <- lm(formula = cigs ~ cigpric + lcigpric + income + lincome + age + agesq + educ + white + restaurn,
data = Datos_Entrenamiento)
# Pronóstico sobre el conjunto de prueba
Pronostico_Prueba[[j]] <- Modelos_Entrenamiento[[j]] %>% predict(Datos_Prueba)
# --- CÁLCULO DE DESCOMPOSICIÓN DE THEIL (ENTRENAMIENTO) ---
y_act_ent <- Datos_Entrenamiento$cigs
y_pred_ent <- Modelos_Entrenamiento[[j]]$fitted.values
rmse_ent <- sqrt(mean((y_pred_ent - y_act_ent)^2))
um_ent <- (mean(y_pred_ent) - mean(y_act_ent))^2 / (rmse_ent^2)
us_ent <- (sd(y_pred_ent) - sd(y_act_ent))^2 / (rmse_ent^2)
uc_ent <- 2 * (1 - cor(y_pred_ent, y_act_ent)) * sd(y_pred_ent) * sd(y_act_ent) / (rmse_ent^2)
Resultados_Performance_data_entrenamiento[[j]] <- data.frame(
R2 = R2(y_pred_ent, y_act_ent),
RMSE = RMSE(y_pred_ent, y_act_ent),
MAE = MAE(y_pred_ent, y_act_ent),
MAPE = MAPE(y_pred_ent, y_act_ent) * 100,
THEIL = TheilU(y_pred_ent, y_act_ent, type = 1),
Um = um_ent,
Us = us_ent,
Uc = uc_ent
)
# --- CÁLCULO DE DESCOMPOSICIÓN DE THEIL (PRUEBA) ---
y_act_pru <- Datos_Prueba$cigs
y_pred_pru <- Pronostico_Prueba[[j]]
rmse_pru <- sqrt(mean((y_pred_pru - y_act_pru)^2))
um_pru <- (mean(y_pred_pru) - mean(y_act_pru))^2 / (rmse_pru^2)
us_pru <- (sd(y_pred_pru) - sd(y_act_pru))^2 / (rmse_pru^2)
uc_pru <- 2 * (1 - cor(y_pred_pru, y_act_pru)) * sd(y_pred_pru) * sd(y_act_pru) / (rmse_pru^2)
Resultados_Performance[[j]] <- data.frame(
R2 = R2(y_pred_pru, y_act_pru),
RMSE = RMSE(y_pred_pru, y_act_pru),
MAE = MAE(y_pred_pru, y_act_pru),
MAPE = MAPE(y_pred_pru, y_act_pru) * 100,
THEIL = TheilU(y_pred_pru, y_act_pru, type = 1),
Um = um_pru,
Us = us_pru,
Uc = uc_pru
)
}
# 2. Unificar los resultados de las 1000 simulaciones en dataframes
df_performance_entrenamiento <- do.call(rbind, Resultados_Performance_data_entrenamiento)
df_performance_prueba <- do.call(rbind, Resultados_Performance)
# 3. Mostrar el resumen estadístico final condensado en la consola
bind_rows(Resultados_Performance_data_entrenamiento) %>%
stargazer(title = "Medidas de Performance Datos del Modelo",
type = "text",
digits = 3)
##
## Medidas de Performance Datos del Modelo
## ============================================
## Statistic N Mean St. Dev. Min Max
## --------------------------------------------
## R2 500 0.058 0.007 0.040 0.083
## RMSE 500 13.304 0.198 12.629 13.828
## MAE 500 10.562 0.133 10.068 10.969
## MAPE 500 Inf.000 Inf Inf
## THEIL 500 0.522 0.006 0.505 0.541
## Um 500 0.000 0.000 0 0
## Us 500 0.613 0.020 0.554 0.669
## Uc 500 0.389 0.020 0.332 0.448
## --------------------------------------------
options(warn = -1)
bind_rows(Resultados_Performance) %>%
stargazer(title = "Medidas de Performance Simulación",
type = "text",
digits = 3)
##
## Medidas de Performance Simulación
## =============================================
## Statistic N Mean St. Dev. Min Max
## ---------------------------------------------
## R2 500 0.039 0.018 0.002 0.099
## RMSE 500 13.479 0.595 11.734 15.361
## MAE 500 10.728 0.290 9.834 11.626
## MAPE 500 Inf.000 Inf Inf
## THEIL 500 0.528 0.013 0.491 0.564
## Um 500 0.002 0.003 0.00000 0.021
## Us 500 0.597 0.051 0.476 0.751
## Uc 500 0.405 0.051 0.253 0.529
## ---------------------------------------------