11/6/2020

Datos de Árboles

La base de datos contiene información sobre el peso, la altura y otras características de una muestra de árboles de la compañía XYZ.

library(readxl)
# Read the tree sample from the Excel workbook and preview its first three
# rows.
# NOTE(review): the path is absolute and machine-specific; consider a
# project-relative path so the script runs on other machines.
data_biomasa <- read_excel(
  path = "D:/PUJ/2do Semestre/4. Analitica financiera/data biomasa.xlsx"
)
head(x = data_biomasa, n = 3)
## # A tibble: 3 x 8
##   finca   mg         bio_aerea bio_sub bio_total area_foliar diametro altura
##   <chr>   <chr>          <dbl>   <dbl>     <dbl>       <dbl>    <dbl>  <dbl>
## 1 FINCA_1 GENOTIPO_1      12.8    0.93      13.7        44.5      4.7    5  
## 2 FINCA_1 GENOTIPO_1      13.9    0.69      14.6        39.7      5.3    5.6
## 3 FINCA_1 GENOTIPO_1      15.1    0.78      15.9        45.6      4.8    5.8

Estimación de modelo múltiple

# Attach ggplot2 with library() so that a missing package stops the script
# right here; require() would only return FALSE with a warning and the failure
# would surface later at the ggplot() call.
library(ggplot2)

# Scatter plot of log(bio_total) against diametro, coloured by finca.
ggplot(data_biomasa, aes(x = diametro, y = log(bio_total), col = finca)) +
  geom_point()

# Fit a linear model of log(bio_total) on diametro and finca, then print the
# coefficient table and fit statistics.
mod1 <- lm(
  formula = log(bio_total) ~ diametro + finca,
  data = data_biomasa
)
summary(object = mod1)
## 
## Call:
## lm(formula = log(bio_total) ~ diametro + finca, data = data_biomasa)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.269702 -0.058652  0.000277  0.074438  0.233730 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.260860   0.045803  27.528  < 2e-16 ***
## diametro     0.274018   0.009047  30.287  < 2e-16 ***
## fincaFINCA_2 0.042002   0.031983   1.313    0.193    
## fincaFINCA_3 0.227430   0.028554   7.965 6.23e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1069 on 86 degrees of freedom
## Multiple R-squared:   0.94,  Adjusted R-squared:  0.9379 
## F-statistic: 449.1 on 3 and 86 DF,  p-value: < 2.2e-16

Estimación de modelo

# Refit the model with mg as an additional predictor. This overwrites mod1, so
# every later use of mod1 refers to this fit.
mod1 <- lm(
  formula = log(bio_total) ~ diametro + finca + mg,
  data = data_biomasa
)

summary(object = mod1)
## 
## Call:
## lm(formula = log(bio_total) ~ diametro + finca + mg, data = data_biomasa)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.25355 -0.06083 -0.01964  0.06352  0.18167 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.16651    0.04512  25.856  < 2e-16 ***
## diametro      0.30888    0.01077  28.674  < 2e-16 ***
## fincaFINCA_2 -0.02028    0.03119  -0.650    0.517    
## fincaFINCA_3  0.19919    0.02605   7.646 2.90e-11 ***
## mgGENOTIPO_2 -0.13061    0.02683  -4.868 5.14e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.09507 on 85 degrees of freedom
## Multiple R-squared:  0.9531, Adjusted R-squared:  0.9509 
## F-statistic: 431.7 on 4 and 85 DF,  p-value: < 2.2e-16

Predecir con el modelo

¿Cuál es el peso estimado de un árbol con diámetro 7.2, de la finca 1 y variedad mg2?

# Predict log(bio_total) for one new tree (diametro 7.2, FINCA_1, GENOTIPO_2)
# and back-transform the prediction to the original scale with exp().
log_peso <- predict(
  object = mod1,
  newdata = list(diametro = 7.2, finca = "FINCA_1", mg = "GENOTIPO_2")
)
exp(log_peso)
##        1 
## 26.04482

Validación cruzada

# Leave-one-out cross-validation: for every row, refit the model on all the
# other rows and predict the held-out tree, back-transformed with exp().
# The number of rows is taken from the data instead of a hard-coded 90, so the
# loop stays correct if the sample changes, and seq_len() runs zero iterations
# on an empty table (1:0 would iterate over 1 and 0).
n_obs <- nrow(data_biomasa)
peso_modelo <- rep(NA_real_, n_obs)

for (i in seq_len(n_obs)) {
  valida <- data_biomasa[i, ]   # held-out row
  entrena <- data_biomasa[-i, ] # all remaining rows
  mod_entrena <- lm(log(bio_total) ~ diametro + finca + mg, data = entrena)

  # predict() looks up diametro, finca and mg in the held-out row.
  log_peso <- predict(mod_entrena, newdata = valida)
  peso_modelo[i] <- exp(log_peso)
}

# Observed bio_total of the first row, for comparison with the first
# cross-validated prediction printed below.
data_biomasa[["bio_total"]][1]
## [1] 13.73
# All cross-validated predictions, one per row of data_biomasa.
print(peso_modelo)
##  [1] 13.710339 16.670361 14.039981  8.609108  6.284030 19.746287 21.652261
##  [8] 14.514837 12.847368 17.473024  6.944202 13.595121  6.250099  8.391582
## [15] 10.024804 21.017542 20.359826 19.785362 12.840894 11.723706 11.644433
## [22]  9.735849 18.100319 13.276719  9.443274 13.258834 10.683722  7.136069
## [29] 15.452105  9.769583 16.586247 17.154557 24.789701 31.728408 24.671964
## [36]  9.855800 16.663909 15.225782 21.386914  6.984800 21.015045 23.442362
## [43] 31.642024 30.619863 14.437771 20.129047 15.714039 23.515365 22.865357
## [50] 20.076062 25.310892 29.612729 40.201068 34.584343 41.364190 13.908666
## [57] 18.929724 24.254152 25.899871 14.777620 14.742738 21.828413 27.477934
## [64] 28.459107 21.437947 21.899068 21.886042 24.714662 29.921740 17.129806
## [71] 29.651047 27.205776 28.124635 27.055041 21.178947 15.288009 14.406636
## [78] 16.759562 20.223839 15.354839 14.354145 12.639013 20.266398 19.574182
## [85] 11.447299 13.832661 12.979600 18.389891 20.753223 15.613316

Métricas de validación

# Pair the observed values with the cross-validated predictions and plot one
# against the other, with a smoothed trend line on a black-and-white theme.
peso_real <- data_biomasa$bio_total
resultados <- data.frame(peso_real = peso_real, peso_modelo = peso_modelo)
ggplot(resultados, aes(x = peso_modelo, y = peso_real)) +
  geom_point() +
  theme_bw() +
  geom_smooth()
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'

# Mean absolute error (MAE) between observed and predicted values.
MAE <- mean(abs(peso_modelo - peso_real))
print(MAE)
## [1] 1.546798

El modelo tiene un error medio absoluto de predicción de aproximadamente 1.55 toneladas por árbol.