prec_ecop=c(1090, 1170, 1160, 1230, 1155, 1165, 1205, 1170, 1150, 1130, 1110, 1105,
1085, 1060, 1035, 1015, 955, 961)
prec_petr=c(35.62,36.31,37.35,34.95,34.53,35.81,36.14,37.50,37.80,36.81,37.87,37.04,
36.76,35.97,33.97,33.27,31.41,30.44)
data_petroleo = data.frame(prec_ecop, prec_petr)
head(data_petroleo)
| prec_ecop | prec_petr |
|---|---|
| 1090 | 35.62 |
| 1170 | 36.31 |
| 1160 | 37.35 |
| 1230 | 34.95 |
| 1155 | 34.53 |
| 1165 | 35.81 |
require(ggplot2)
require(plotly)
g1= ggplot(data = data_petroleo, mapping = aes(x=prec_petr, y= prec_ecop))+ geom_point()+geom_smooth()
ggplotly(g1)
library('PerformanceAnalytics')
library(dplyr)
chart.Correlation(select_if(data_petroleo, is.numeric), histogram = F, pch = 19)
cor(data_petroleo$prec_petr, data_petroleo$prec_ecop)
## [1] 0.7074373
mod_ecopetrol = lm(formula= prec_ecop~prec_petr, data=data_petroleo)
summary(mod_ecopetrol)
##
## Call:
## lm(formula = prec_ecop ~ prec_petr, data = data_petroleo)
##
## Residuals:
## Min 1Q Median 3Q Max
## -59.90 -40.74 -15.94 33.40 136.82
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 177.768 232.828 0.764 0.45627
## prec_petr 26.192 6.542 4.004 0.00102 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 57.13 on 16 degrees of freedom
## Multiple R-squared: 0.5005, Adjusted R-squared: 0.4692
## F-statistic: 16.03 on 1 and 16 DF, p-value: 0.001024
\[\begin{equation*} \widehat{E[y|_{x_i}]} = \widehat{y_{i}}=\widehat{\beta_{0}}+\widehat{\beta_{1}}x_{i} \end{equation*}\]
\[PrecioAccion = 177.768 + 26.192 \times PrecioBarrilPetroleo\] \[ R^2_{adj} = 0.4692 \]
\[H_{0} : \beta_{0} = 0 , H_{a} : \beta_{0} \neq 0 \]
\[\text{p-value} = 0.45627 \]
\[H_{0} : \beta_{1} = 0 , H_{a} : \beta_{1} \neq 0 \]
\[\text{p-value} = 0.00102\ (**) \]
\[ \beta_{0} = 177.768 , \beta_{1}= 26.192\]
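As a quick illustration of how the fitted equation is used, here is a minimal prediction sketch; the barrel price of 35 USD is a hypothetical value chosen only for the example, not one of the observed data points.
# Predicted share price for a hypothetical barrel price of 35 USD, with a 95% confidence interval
predict(mod_ecopetrol, newdata = data.frame(prec_petr = 35), interval = "confidence")
# Point prediction is roughly 177.768 + 26.192 * 35 ≈ 1094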
| \(Ho\) : \(\mu_{u} =0\) |
| \(Ha\) : \(\mu_{u} \neq 0\) |
summary(mod_ecopetrol$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -59.90 -40.74 -15.94 0.00 33.40 136.82
t.test(mod_ecopetrol$residuals, mu=0)
##
## One Sample t-test
##
## data: mod_ecopetrol$residuals
## t = -4.2309e-16, df = 17, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -27.56364 27.56364
## sample estimates:
## mean of x
## -5.527407e-15
Goldfeld-Quandt test
| \(Ho\) : there is no heteroscedasticity (homoscedasticity) |
| \(Ha\) : there is heteroscedasticity |
library(lmtest)
lmtest::gqtest(mod_ecopetrol)
##
## Goldfeld-Quandt test
##
## data: mod_ecopetrol
## GQ = 0.17924, df1 = 7, df2 = 7, p-value = 0.9813
## alternative hypothesis: variance increases from segment 1 to 2
Shapiro-Wilk test
| \(Ho\) : \(u \sim norm\) |
| \(Ha\) : \(u \not\sim norm\) |
shapiro.test(mod_ecopetrol$residuals)
##
## Shapiro-Wilk normality test
##
## data: mod_ecopetrol$residuals
## W = 0.89259, p-value = 0.04276
Durbin-Watson Test
| \(Ho\) : \(E[u_{i}u_{j}] = 0\) : the errors are independent |
| \(Ha\) : \(E[u_{i}u_{j}] \neq 0\) : the errors are not independent |
lmtest::dwtest(mod_ecopetrol)
##
## Durbin-Watson test
##
## data: mod_ecopetrol
## DW = 0.74504, p-value = 0.0004666
## alternative hypothesis: true autocorrelation is greater than 0
Regarding the parameter hypotheses, there is sufficient evidence to conclude that \(\beta_{1}\) is different from zero, so it is significant in the model. The adjusted \(R^{2}\) = 0.4692 indicates that the oil barrel price explains approximately 47% of the variability in the share price; on its own this variable is therefore not enough to predict the share price accurately, although it does allow its impact on the dependent variable to be estimated. Of the four assumptions, two are not met (normality of the residuals and independence of the errors), so the model is not sufficiently valid for predicting the behavior of the dependent variable as a function of the independent variable.
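To quantify that impact with an interval rather than a point estimate, one option (a sketch that simply reuses the mod_ecopetrol object fitted above) is:
# 95% confidence intervals for the intercept and the prec_petr slope
confint(mod_ecopetrol, level = 0.95)
# An interval for prec_petr that excludes zero is consistent with the significant t-test reported above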
anio = 1999:2015
infl=c(9.23, 8.75, 7.65, 6.99, 6.49, 5.50, 4.85, 4.48, 5.69, 7.67, 2.00, 3.17,
3.73, 2.44, 1.94, 3.66, 6.77)
smlm=c(236460, 260100, 286000, 309000, 332000, 358000, 381500, 408000, 433700,
461500, 496900, 515000, 535600, 566700, 589500, 616027, 644350)
data_inflacion = data.frame(anio, infl, smlm)
head(data_inflacion, 5)
| anio | infl | smlm |
|---|---|---|
| 1999 | 9.23 | 236460 |
| 2000 | 8.75 | 260100 |
| 2001 | 7.65 | 286000 |
| 2002 | 6.99 | 309000 |
| 2003 | 6.49 | 332000 |
The goal is to fit a regression model that describes the relationship between these two variables, taking the legal monthly minimum wage (SMLM) as the dependent variable and inflation as the independent variable. Fit a simple linear regression model and answer the questions below.
require(ggplot2)
require(plotly)
g2 = ggplot(data = data_inflacion, mapping = aes(x= infl, y=smlm))+geom_point()+geom_smooth()+theme_bw()
ggplotly(g2)
mod_inflacion= lm(formula = smlm~infl, data=data_inflacion)
summary(mod_inflacion)
##
## Call:
## lm(formula = smlm ~ infl, data = data_inflacion)
##
## Residuals:
## Min 1Q Median 3Q Max
## -75463 -63456 -42854 17623 263207
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 648486 58947 11.00 1.4e-08 ***
## infl -39489 10151 -3.89 0.00145 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 94130 on 15 degrees of freedom
## Multiple R-squared: 0.5022, Adjusted R-squared: 0.469
## F-statistic: 15.13 on 1 and 15 DF, p-value: 0.00145
\[\begin{equation*} \widehat{E[y|_{x_i}]} = \widehat{y_{i}}=\widehat{\beta_{0}}+\widehat{\beta_{1}}x_{i} \end{equation*}\]
\[SMLM = 648486 - 39489 \times INFL\] \[ R^2_{adj} = 0.469 \]
\[H_{0} : \beta_{0} = 0 , H_{a} : \beta_{0} \neq 0 \]
\[\text{p-value} = 1.4e-08\ (***) \]
\[H_{0} : \beta_{1} = 0 , H_{a} : \beta_{1} \neq 0 \]
\[\text{p-value} = 0.00145\ (**) \]
cor(data_inflacion$infl, data_inflacion$smlm)
## [1] -0.7086581
\[ \beta_{0} = 648486 , \beta_{1}= -39489 \]
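As a usage illustration of the fitted equation, a minimal sketch follows; the 5% inflation rate is a hypothetical value chosen only for the example.
# Predicted SMLM for a hypothetical inflation rate of 5%, with a 95% confidence interval
predict(mod_inflacion, newdata = data.frame(infl = 5), interval = "confidence")
# Point prediction is roughly 648486 - 39489 * 5 = 451041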
par(mfrow = c(2, 2))
plot(mod_inflacion)
| \(Ho\) : \(\mu_{u} =0\) |
| \(Ha\) : \(\mu_{u} \neq 0\) |
summary(mod_inflacion$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -75463 -63456 -42854 0 17623 263207
t.test(mod_inflacion$residuals, mu=0)
##
## One Sample t-test
##
## data: mod_inflacion$residuals
## t = -6.7462e-17, df = 16, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -46862.45 46862.45
## sample estimates:
## mean of x
## -1.491304e-12
Goldfeld-Quandt test
| \(Ho\) : there is no heteroscedasticity (homoscedasticity) |
| \(Ha\) : there is heteroscedasticity |
library(lmtest)
lmtest::gqtest(mod_inflacion)
##
## Goldfeld-Quandt test
##
## data: mod_inflacion
## GQ = 140.68, df1 = 7, df2 = 6, p-value = 3.171e-06
## alternative hypothesis: variance increases from segment 1 to 2
Shapiro-Wilk test
| \(Ho\) : \(u \sim norm\) |
| \(Ha\) : \(u \not\sim norm\) |
shapiro.test(mod_inflacion$residuals)
##
## Shapiro-Wilk normality test
##
## data: mod_inflacion$residuals
## W = 0.78826, p-value = 0.001407
Durbin-Watson Test
| \(Ho\) : \(E[u_{i}u_{j}] = 0\) : the errors are independent |
| \(Ha\) : \(E[u_{i}u_{j}] \neq 0\) : the errors are not independent |
lmtest::dwtest(mod_inflacion)
##
## Durbin-Watson test
##
## data: mod_inflacion
## DW = 0.68432, p-value = 0.0002714
## alternative hypothesis: true autocorrelation is greater than 0
library(readxl)
Datos_Vivienda <- read_excel("D:/ESTUDIO/MASTER IN DATA SCIENCE/Semester 2/Metodos estadisticos para la toma de decisiones/MOD1/UND 1-Conceptos de Regresion Lineal Multiple/Datos_Vivienda.xlsx")
head(Datos_Vivienda, 3)
| Zona | piso | Estrato | precio_millon | Area_contruida | parqueaderos | Banos | Habitaciones | Tipo | Barrio | cordenada_longitud | Cordenada_latitud |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Zona Sur | 2 | 6 | 880 | 237 | 2 | 5 | 4 | Casa | pance | -76.463 | 3.430 |
| Zona Oeste | 2 | 4 | 1200 | 800 | 3 | 6 | 7 | Casa | miraflores | -76.464 | 3.428 |
| Zona Sur | 3 | 5 | 250 | 86 | NA | 2 | 3 | Apartamento | multicentro | -76.464 | 3.429 |
# Filter: Apartamento in Zona Norte, precio_millon < 500, Area_contruida < 300
pos = which ( Datos_Vivienda$Tipo == "Apartamento" & Datos_Vivienda$Zona == "Zona Norte" & Datos_Vivienda$precio_millon < 500 & Datos_Vivienda$Area_contruida < 300)
# filtered data:
viviendas = Datos_Vivienda[pos,]
# Convert the parqueaderos column from character to numeric
viviendas <- transform(viviendas, parqueaderos = as.numeric(parqueaderos))
# Replace remaining NA values with 0
viviendas[is.na(viviendas)] <- 0
head(viviendas, 3)
| Zona | piso | Estrato | precio_millon | Area_contruida | parqueaderos | Banos | Habitaciones | Tipo | Barrio | cordenada_longitud | Cordenada_latitud |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Zona Norte | 2 | 3 | 135 | 56 | 1 | 1 | 3 | Apartamento | torres de comfandi | -76.46745 | 3.40763 |
| Zona Norte | NA | 3 | 78 | 54 | 2 | 1 | 3 | Apartamento | chiminangos | -76.47820 | 3.44898 |
| Zona Norte | NA | 5 | 340 | 106 | 2 | 2 | 3 | Apartamento | la flora | -76.48200 | 3.43500 |
# Describe the data:
Hmisc::describe(viviendas)
## viviendas
##
## 12 Variables 1077 Observations
## --------------------------------------------------------------------------------
## Zona
## n missing distinct value
## 1077 0 1 Zona Norte
##
## Value Zona Norte
## Frequency 1077
## Proportion 1
## --------------------------------------------------------------------------------
## piso
## n missing distinct
## 1077 0 13
##
## lowest : 1 10 11 12 2 , highest: 6 7 8 9 NA
##
## Value 1 10 11 12 2 3 4 5 6 7 8
## Frequency 91 21 27 31 86 112 115 108 28 29 34
## Proportion 0.084 0.019 0.025 0.029 0.080 0.104 0.107 0.100 0.026 0.027 0.032
##
## Value 9 NA
## Frequency 28 367
## Proportion 0.026 0.341
## --------------------------------------------------------------------------------
## Estrato
## n missing distinct Info Mean Gmd
## 1077 0 4 0.885 4.195 1.013
##
## Value 3 4 5 6
## Frequency 337 241 451 48
## Proportion 0.313 0.224 0.419 0.045
## --------------------------------------------------------------------------------
## precio_millon
## n missing distinct Info Mean Gmd .05 .10
## 1077 0 183 1 233.8 125.6 95.0 108.0
## .25 .50 .75 .90 .95
## 132.0 220.0 320.0 400.0 420.6
##
## lowest : 65 69 70 72 73, highest: 460 470 480 490 495
## --------------------------------------------------------------------------------
## Area_contruida
## n missing distinct Info Mean Gmd .05 .10
## 1077 0 148 0.999 85.94 34.52 54.0 55.0
## .25 .50 .75 .90 .95
## 60.0 76.0 100.0 126.0 149.2
##
## lowest : 35 42 43 44 45, highest: 235 236 240 270 287
## --------------------------------------------------------------------------------
## parqueaderos
## n missing distinct Info Mean Gmd
## 1077 0 5 0.828 0.8886 0.7287
##
## lowest : 0 1 2 3 4, highest: 0 1 2 3 4
##
## Value 0 1 2 3 4
## Frequency 322 559 191 4 1
## Proportion 0.299 0.519 0.177 0.004 0.001
## --------------------------------------------------------------------------------
## Banos
## n missing distinct Info Mean Gmd
## 1077 0 6 0.778 2.138 0.7428
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 2 169 640 214 48 4
## Proportion 0.002 0.157 0.594 0.199 0.045 0.004
## --------------------------------------------------------------------------------
## Habitaciones
## n missing distinct Info Mean Gmd
## 1077 0 6 0.628 2.851 0.5317
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## Value 0 1 2 3 4 5
## Frequency 7 14 201 770 81 4
## Proportion 0.006 0.013 0.187 0.715 0.075 0.004
## --------------------------------------------------------------------------------
## Tipo
## n missing distinct value
## 1077 0 1 Apartamento
##
## Value Apartamento
## Frequency 1077
## Proportion 1
## --------------------------------------------------------------------------------
## Barrio
## n missing distinct
## 1077 0 99
##
## lowest : acopi alameda del rio alameda del río alamos alcazares
## highest: villa del sol villas de veracruz zona norte zona norte los zona residencial
## --------------------------------------------------------------------------------
## cordenada_longitud
## n missing distinct Info Mean Gmd .05 .10
## 1077 0 628 1 -76.52 0.01697 -76.54 -76.53
## .25 .50 .75 .90 .95
## -76.53 -76.52 -76.50 -76.50 -76.49
##
## lowest : -76.58732 -76.56409 -76.56213 -76.56107 -76.55559
## highest: -76.48381 -76.48347 -76.48200 -76.47820 -76.46745
## --------------------------------------------------------------------------------
## Cordenada_latitud
## n missing distinct Info Mean Gmd .05 .10
## 1077 0 653 1 3.467 0.02934 3.388 3.421
## .25 .50 .75 .90 .95
## 3.462 3.476 3.488 3.491 3.492
##
## lowest : 3.34525 3.35073 3.36394 3.36400 3.36403
## highest: 3.49600 3.49659 3.49666 3.49700 3.49770
## --------------------------------------------------------------------------------
require(leaflet)
leaflet() %>% addCircleMarkers(lng= viviendas$cordenada_longitud, lat=viviendas$Cordenada_latitud, radius = 4, color = "blue") %>% addTiles()
require(ggplot2)
require(plotly)
g4= ggplot(data = viviendas, mapping = aes(x=Area_contruida, y= precio_millon))+ geom_point()+geom_smooth()
ggplotly(g4)
library('PerformanceAnalytics')
library(dplyr)
cor(viviendas$Area_contruida, viviendas$precio_millon)
## [1] 0.6937608
require(ggplot2)
require(plotly)
g5= ggplot(data = viviendas, mapping = aes(x=Estrato, y= precio_millon))+ geom_point()+geom_smooth()
ggplotly(g5)
cor(viviendas$Estrato, viviendas$precio_millon)
## [1] 0.8230851
require(ggplot2)
require(plotly)
g6= ggplot(data = viviendas, mapping = aes(x=parqueaderos, y= precio_millon))+ geom_point()+geom_smooth()
ggplotly(g6)
cor(viviendas$parqueaderos, viviendas$precio_millon)
## [1] 0.5472535
library(dplyr)
round(cor( select_if(viviendas, is.numeric), method = "pearson"),3)
## Estrato precio_millon Area_contruida parqueaderos Banos
## Estrato 1.000 0.823 0.581 0.476 0.546
## precio_millon 0.823 1.000 0.694 0.547 0.639
## Area_contruida 0.581 0.694 1.000 0.428 0.651
## parqueaderos 0.476 0.547 0.428 1.000 0.437
## Banos 0.546 0.639 0.651 0.437 1.000
## Habitaciones -0.027 0.053 0.321 0.126 0.289
## cordenada_longitud -0.544 -0.486 -0.442 -0.217 -0.365
## Cordenada_latitud -0.071 -0.002 -0.056 0.208 -0.053
## Habitaciones cordenada_longitud Cordenada_latitud
## Estrato -0.027 -0.544 -0.071
## precio_millon 0.053 -0.486 -0.002
## Area_contruida 0.321 -0.442 -0.056
## parqueaderos 0.126 -0.217 0.208
## Banos 0.289 -0.365 -0.053
## Habitaciones 1.000 0.090 0.061
## cordenada_longitud 0.090 1.000 0.362
## Cordenada_latitud 0.061 0.362 1.000
library(GGally)
library(dplyr)
ggpairs(select_if(viviendas, is.numeric), lower = list(continuous = "smooth"),
diag = list(continuous = "barDiag"), axisLabels = "none")
modelo_viviendas = lm(formula= precio_millon ~ Area_contruida + Estrato + parqueaderos, data = viviendas )
summary(modelo_viviendas)
##
## Call:
## lm(formula = precio_millon ~ Area_contruida + Estrato + parqueaderos,
## data = viviendas)
##
## Residuals:
## Min 1Q Median 3Q Max
## -216.571 -31.564 -1.213 27.889 224.053
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -157.38220 7.71190 -20.408 < 2e-16 ***
## Area_contruida 0.94938 0.06054 15.682 < 2e-16 ***
## Estrato 68.99436 2.26623 30.445 < 2e-16 ***
## parqueaderos 22.64906 2.73064 8.294 3.24e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 53.75 on 1073 degrees of freedom
## Multiple R-squared: 0.7629, Adjusted R-squared: 0.7623
## F-statistic: 1151 on 3 and 1073 DF, p-value: < 2.2e-16
Estimated multiple linear regression model:
\[\begin{equation*} \widehat{E[y|_{x_i}]} = \widehat{y_{i}}=\widehat{\beta_{0}}+ \widehat{\beta_{1}}x_{1i} +\widehat{\beta_{2}}x_{2i}+\widehat{\beta_{3}}x_{3i} \end{equation*}\]
\[PrecioMillon = -157.38220 + 0.94938\,x_{1i} + 68.99436\,x_{2i} + 22.64906\,x_{3i} \] \[ R^2_{adj} = 0.7623 \]
with \(x_{1}\) = Area_contruida, \(x_{2}\) = Estrato and \(x_{3}\) = parqueaderos.
\[H_{0} : \beta_{0} = 0 , H_{a} : \beta_{0} \neq 0 \]
\[\text{p-value} < 2e-16\ (***) \]
\[H_{0} : \beta_{1} = 0 , H_{a} : \beta_{1} \neq 0 \]
\[\text{p-value} < 2e-16\ (***) \]
\[H_{0} : \beta_{2} = 0 , H_{a} : \beta_{2} \neq 0 \]
\[\text{p-value} < 2e-16\ (***) \]
\[H_{0} : \beta_{3} = 0 , H_{a} : \beta_{3} \neq 0 \]
\[\text{p-value} = 3.24e-16\ (***) \]
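As a sanity check of the fitted equation, the prediction for a 100 m² apartment in estrato 4 with one parking space (the same combination passed to predict() further below) can be reproduced by hand:
# Manual evaluation of the fitted equation for Area_contruida = 100, Estrato = 4, parqueaderos = 1
-157.38220 + 0.94938 * 100 + 68.99436 * 4 + 22.64906 * 1
# ≈ 236.18 million, matching the predict() output shown later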
par(mfrow = c(2, 2))
plot(modelo_viviendas)
##### 1: In the first plot (residuals vs. fitted values), for a model that predicts the data adequately the residuals should be scattered randomly around the dashed line. Here the points are not clearly random and cluster slightly toward the center, so this assumption may not hold.
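A complementary check (a sketch, not part of the original diagnostics) is to plot standardized residuals against fitted values directly:
# Standardized residuals vs fitted values; a funnel shape would point to heteroscedasticity
plot(fitted(modelo_viviendas), rstandard(modelo_viviendas),
     xlab = "Fitted values", ylab = "Standardized residuals")
abline(h = 0, lty = 2)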
| \(Ho\) : \(\mu_{u} =0\) |
| \(Ha\) : \(\mu_{u} \neq 0\) |
summary(modelo_viviendas$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -216.571 -31.564 -1.213 0.000 27.889 224.053
t.test(modelo_viviendas$residuals, mu=0)
##
## One Sample t-test
##
## data: modelo_viviendas$residuals
## t = 6.246e-16, df = 1076, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -3.209419 3.209419
## sample estimates:
## mean of x
## 1.021631e-15
Goldfeld-Quandt test
| \(Ho\) : there is no heteroscedasticity (homoscedasticity) |
| \(Ha\) : there is heteroscedasticity |
library(lmtest)
lmtest::gqtest(modelo_viviendas)
##
## Goldfeld-Quandt test
##
## data: modelo_viviendas
## GQ = 1.8151, df1 = 535, df2 = 534, p-value = 4.04e-12
## alternative hypothesis: variance increases from segment 1 to 2
Shapiro-Wilk test
| \(Ho\) : \(u \sim norm\) |
| \(Ha\) : \(u \not\sim norm\) |
shapiro.test(modelo_viviendas$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo_viviendas$residuals
## W = 0.98702, p-value = 3.509e-08
Durbin-Watson Test
| \(Ho\) : \(E[u_{i}u_{j}] = 0\) : the errors are independent |
| \(Ha\) : \(E[u_{i}u_{j}] \neq 0\) : the errors are not independent |
lmtest::dwtest(modelo_viviendas)
##
## Durbin-Watson test
##
## data: modelo_viviendas
## DW = 1.757, p-value = 2.992e-05
## alternative hypothesis: true autocorrelation is greater than 0
# with one parking space:
predict(modelo_viviendas,list(Area_contruida=100, Estrato = 4, parqueaderos = 1), interval = "confidence" )
## fit lwr upr
## 1 236.1822 232.2449 240.1195
# with two parking spaces:
predict(modelo_viviendas,list(Area_contruida=100, Estrato = 4, parqueaderos = 2), interval = "confidence" )
## fit lwr upr
## 1 258.8313 251.7806 265.8819
# with three parking spaces:
predict(modelo_viviendas,list(Area_contruida=100, Estrato = 4, parqueaderos = 3), interval = "confidence" )
## fit lwr upr
## 1 281.4803 269.5918 293.3689
# with four parking spaces:
predict(modelo_viviendas,list(Area_contruida=100, Estrato = 4, parqueaderos = 4), interval = "confidence" )
## fit lwr upr
## 1 304.1294 287.0889 321.1699
pos2 = which ( viviendas$Area_contruida >100 & viviendas$Estrato == 4 & viviendas$parqueaderos >=1 & viviendas$precio_millon <= 400 & viviendas$precio_millon >= 236.1822)
# filtered data:
viviendas_f = viviendas[pos2,]
min(viviendas_f$precio_millon)
## [1] 240
max(viviendas_f$Area_contruida)
## [1] 287
max(viviendas_f$Banos)
## [1] 4
max(viviendas_f$Habitaciones)
## [1] 4
pos3 = which(viviendas_f$precio_millon == 240 | viviendas_f$Area_contruida == 287 | viviendas_f$Banos ==4 | viviendas_f$Habitaciones ==4 )
viviendas_5 = viviendas_f[pos3,]
pos4 = which(viviendas_5$Barrio =="la flora" | viviendas_5$Barrio == "versalles")
viviendas_5 = viviendas_5[pos4,]
head(viviendas_5)
| Zona | piso | Estrato | precio_millon | Area_contruida | parqueaderos | Banos | Habitaciones | Tipo | Barrio | cordenada_longitud | Cordenada_latitud | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 755 | Zona Norte | NA | 4 | 265 | 125 | 2 | 3 | 4 | Apartamento | la flora | -76.52353 | 3.48157 |
| 788 | Zona Norte | 2 | 4 | 380 | 126 | 2 | 3 | 4 | Apartamento | la flora | -76.52432 | 3.48254 |
| 800 | Zona Norte | 6 | 4 | 270 | 152 | 1 | 3 | 4 | Apartamento | versalles | -76.52515 | 3.46334 |
| 844 | Zona Norte | 2 | 4 | 240 | 103 | 1 | 2 | 3 | Apartamento | versalles | -76.52700 | 3.46500 |
| 921 | Zona Norte | 7 | 4 | 300 | 126 | 2 | 4 | 4 | Apartamento | versalles | -76.52953 | 3.45926 |
require(leaflet)
leaflet() %>% addCircleMarkers(lng= viviendas_5$cordenada_longitud, lat=viviendas_5$Cordenada_latitud, radius = 15, color = "blue") %>% addTiles()
library(readxl)
data_arboles <- read_excel("D:/ESTUDIO/MASTER IN DATA SCIENCE/Semester 2/Metodos estadisticos para la toma de decisiones/MOD1/UND 1-Conceptos de Regresion Lineal Multiple/data arboles.xlsx",
col_types = c("text", "text", "numeric",
"numeric", "numeric"))
head(data_arboles, 5)
| finca | mg | peso | diametro | altura |
|---|---|---|---|---|
| FINCA_1 | GENOTIPO_1 | 13.73 | 4.7 | 5.0 |
| FINCA_1 | GENOTIPO_1 | 14.58 | 5.3 | 5.6 |
| FINCA_1 | GENOTIPO_1 | 15.88 | 4.8 | 5.8 |
| FINCA_1 | GENOTIPO_1 | 8.99 | 3.2 | 4.3 |
| FINCA_1 | GENOTIPO_1 | 6.99 | 2.2 | 3.3 |
library(dplyr)
round(cor( select_if(data_arboles, is.numeric), method = "pearson"),3)
## peso diametro altura
## peso 1.000 0.908 0.858
## diametro 0.908 1.000 0.936
## altura 0.858 0.936 1.000
library('PerformanceAnalytics')
library(dplyr)
chart.Correlation(select_if(data_arboles, is.numeric), histogram = F, pch = 19)
library(GGally)
library(dplyr)
ggpairs(select_if(data_arboles, is.numeric), lower = list(continuous = "smooth"),
diag = list(continuous = "barDiag"), axisLabels = "none")
# Split of the 90 observations: 72 (80%) for modeling and 18 (20%) for validation
# Note: sample() is used without set.seed(), so the split (and all downstream results) changes on each run
id_modelar= sample(1:90, size=72)
id_modelar
## [1] 85 55 36 10 27 5 1 56 16 13 75 68 69 6 44 63 29 59 41 24 73 26 11 7 18
## [26] 2 9 90 43 82 19 32 65 67 88 78 8 81 50 76 80 84 40 49 33 47 45 60 61 34
## [51] 4 53 71 22 54 25 20 87 77 3 70 83 52 74 15 38 62 72 31 17 46 14
# data set with the 72 observations used for modeling:
arboles_modelar= data_arboles[id_modelar,]
head(arboles_modelar)
| finca | mg | peso | diametro | altura |
|---|---|---|---|---|
| FINCA_3 | GENOTIPO_1 | 13.34 | 3.5 | 5.0 |
| FINCA_2 | GENOTIPO_2 | 47.87 | 8.8 | 11.3 |
| FINCA_2 | GENOTIPO_1 | 9.97 | 3.7 | 4.4 |
| FINCA_1 | GENOTIPO_2 | 16.62 | 5.9 | 7.1 |
| FINCA_1 | GENOTIPO_2 | 9.89 | 4.3 | 6.3 |
| FINCA_1 | GENOTIPO_1 | 6.99 | 2.2 | 3.3 |
# data set with the 18 observations held out for validation:
arboles_validar= data_arboles[-id_modelar,]
head(arboles_validar)
| finca | mg | peso | diametro | altura |
|---|---|---|---|---|
| FINCA_1 | GENOTIPO_1 | 15.83 | 4.7 | 5.7 |
| FINCA_1 | GENOTIPO_1 | 14.09 | 4.2 | 5.2 |
| FINCA_1 | GENOTIPO_1 | 18.16 | 5.6 | 6.2 |
| FINCA_1 | GENOTIPO_2 | 6.98 | 3.0 | 4.8 |
| FINCA_1 | GENOTIPO_2 | 8.73 | 4.0 | 5.3 |
| FINCA_2 | GENOTIPO_2 | 26.38 | 7.1 | 9.2 |
modelo_arboles = lm(formula= peso ~ diametro + altura, data = arboles_modelar)
summary(modelo_arboles)
##
## Call:
## lm(formula = peso ~ diametro + altura, data = arboles_modelar)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1924 -3.1280 0.2355 2.0416 11.4413
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -9.6020 1.6088 -5.968 9.27e-08 ***
## diametro 5.4490 0.9097 5.990 8.49e-08 ***
## altura -0.1699 0.7233 -0.235 0.815
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.635 on 69 degrees of freedom
## Multiple R-squared: 0.8304, Adjusted R-squared: 0.8255
## F-statistic: 168.9 on 2 and 69 DF, p-value: < 2.2e-16
\[\begin{equation*} \widehat{E[y|_{x_i}]} = \widehat{y_{i}}=\widehat{\beta_{0}}+\widehat{\beta_{1}}x_{1i}+\widehat{\beta_{2}}x_{2i} \end{equation*}\]
B0 = data.frame(modelo_arboles$coefficients)[1,]
B1 = data.frame(modelo_arboles$coefficients)[2,]
B2 = data.frame(modelo_arboles$coefficients)[3,]
data.frame(B0, B1, B2)
| B0 | B1 | B2 |
|---|---|---|
| -9.602012 | 5.448967 | -0.1699267 |
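Written out with the coefficients above, the estimated equation is:
\[\widehat{peso} = -9.602 + 5.449\,diametro - 0.170\,altura\]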
\[H_{0} : \beta_{0} = 0 , H_{a} : \beta_{0} \neq 0 \]
\[\text{p-value} = 9.27e-08\ (***) \]
\[H_{0} : \beta_{1} = 0 , H_{a} : \beta_{1} \neq 0 \]
\[\text{p-value} = 8.49e-08\ (***) \]
\[H_{0} : \beta_{2} = 0 , H_{a} : \beta_{2} \neq 0 \]
\[\text{p-value} = 0.815 \]
confint(modelo_arboles)
## 2.5 % 97.5 %
## (Intercept) -12.811451 -6.392573
## diametro 3.634232 7.263701
## altura -1.612906 1.273052
par(mfrow = c(2, 2))
plot(modelo_arboles)
| \(Ho\) : \(\mu_{u} =0\) |
| \(Ha\) : \(\mu_{u} \neq 0\) |
summary(modelo_arboles$residuals)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -5.1924 -3.1280 0.2355 0.0000 2.0416 11.4413
t.test(modelo_arboles$residuals, mu=0)
##
## One Sample t-test
##
## data: modelo_arboles$residuals
## t = 4.8336e-16, df = 71, p-value = 1
## alternative hypothesis: true mean is not equal to 0
## 95 percent confidence interval:
## -0.8421815 0.8421815
## sample estimates:
## mean of x
## 2.041553e-16
Goldfeld-Quandt test
| \(Ho\) : there is no heteroscedasticity (homoscedasticity) |
| \(Ha\) : there is heteroscedasticity |
library(lmtest)
lmtest::gqtest(modelo_arboles)
##
## Goldfeld-Quandt test
##
## data: modelo_arboles
## GQ = 0.70051, df1 = 33, df2 = 33, p-value = 0.8442
## alternative hypothesis: variance increases from segment 1 to 2
Shapiro-Wilk test
| \(Ho\) : \(u \sim norm\) |
| \(Ha\) : \(u \not\sim norm\) |
shapiro.test(modelo_arboles$residuals)
##
## Shapiro-Wilk normality test
##
## data: modelo_arboles$residuals
## W = 0.94232, p-value = 0.00248
Durbin-Watson Test
| \(Ho\) : \(E[u_{i}u_{j}] = 0\) : the errors are independent |
| \(Ha\) : \(E[u_{i}u_{j}] \neq 0\) : the errors are not independent |
lmtest::dwtest(modelo_arboles)
##
## Durbin-Watson test
##
## data: modelo_arboles
## DW = 1.4817, p-value = 0.01351
## alternative hypothesis: true autocorrelation is greater than 0
predict_peso = predict(modelo_arboles, list(diametro=arboles_validar$diametro, altura=arboles_validar$altura))
predict_peso
## 1 2 3 4 5 6 7 8
## 15.03955 12.40003 19.85866 5.92924 11.29324 27.52233 18.76886 23.02608
## 9 10 11 12 13 14 15 16
## 26.61945 24.62678 28.03324 20.98243 25.32461 26.21050 21.85132 18.25795
## 17 18
## 11.95709 18.80285
peso_real = arboles_validar$peso
error = peso_real-predict_peso
resultado = data.frame(peso_real, predict_peso, error)
head(resultado, 5)
| peso_real | predict_peso | error |
|---|---|---|
| 15.83 | 15.03955 | 0.7904513 |
| 14.09 | 12.40003 | 1.6899713 |
| 18.16 | 19.85866 | -1.6986553 |
| 6.98 | 5.92924 | 1.0507605 |
| 8.73 | 11.29324 | -2.5632428 |
# Mean Absolute Error (MAE)
MAE = mean(abs(error))
MAE
## [1] 2.350093
#RMSE
library(Metrics)
rmse(resultado$peso_real, resultado$predict_peso)
## [1] 2.77038
# The RMSE is small compared with the scale of the observed weights, so the regression model can be considered a good predictor
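To put the RMSE in context (a sketch that reuses the resultado data frame built above), it can be compared with the mean observed weight in the validation set:
# RMSE computed by hand and expressed relative to the mean observed weight
sqrt(mean(resultado$error^2)) / mean(resultado$peso_real)
# A small ratio supports the conclusion that the model predicts the validation data well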