Estadistica espacial

#Carga de datos
library(readxl)
datosqgiz <- read_excel("XPABLO_raw.xlsx")
attach(datosqgiz)

#Mapear la materia organica, con ggplot
library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

datosqgiz |>
  ggplot(aes(x=Lat, y=Long, color=MO))+
  geom_point()+
  labs(x="Longitud", y="Latitud", title = "Datos espaciales de Materia Organica ")+
  theme_classic()

#Ahora se hace un modelo de regresión multiple, para la variable MO

modelo.lm <- lm(MO ~ Ca + Mg )
summary(modelo.lm)

## 
## Call:
## lm(formula = MO ~ Ca + Mg)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.10451 -0.43644 -0.05345  0.47432  2.08162 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.450337   0.078924  18.376   <2e-16 ***
## Ca           0.089591   0.009465   9.466   <2e-16 ***
## Mg          -0.002043   0.031346  -0.065    0.948    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6212 on 400 degrees of freedom
## Multiple R-squared:  0.221,  Adjusted R-squared:  0.2171 
## F-statistic: 56.74 on 2 and 400 DF,  p-value: < 2.2e-16

#Y se extraen los residuos de este modelo
res <-residuals(modelo.lm)
hist(res)

shapiro.test(res)

## 
##  Shapiro-Wilk normality test
## 
## data:  res
## W = 0.99226, p-value = 0.03472

#Ahora se hacen otras pruebas para verificar si se cuimple la normalidad, ya que con shapiro no se pudo

library(nortest)
ad.test(res);cvm.test(res);lillie.test(res)

## 
##  Anderson-Darling normality test
## 
## data:  res
## A = 0.97011, p-value = 0.01444

## 
##  Cramer-von Mises normality test
## 
## data:  res
## W = 0.15644, p-value = 0.01949

## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  res
## D = 0.04024, p-value = 0.1164

#Ahora que los datos son "normales", ahora se hace la
boxplot(res)

#Ahora se verifica la dependencia espacial de los residuos, mediante una matriz de los residuos, teniendo en cuena la longitud y latitud:

dist = as.matrix(dist(cbind(Long, Lat)))
dist_inv <- 1/dist
diag(dist_inv)<- 0
dist_inv[1:5,1:5]

##           1         2        3        4         5
## 1   0.00000 158.90672 158.9067 112.3640  79.45336
## 2 158.90672   0.00000 112.3640 158.9067  71.06525
## 3 158.90672 112.36402   0.0000 158.9067 158.90672
## 4 112.36402 158.90672 158.9067   0.0000 112.36402
## 5  79.45336  71.06525 158.9067 112.3640   0.00000

#Ahora se hace el indice de moran, el cual sirve para verificar autocorrelación espacial:

library(ape)

## 
## Attaching package: 'ape'

## The following object is masked from 'package:dplyr':
## 
##     where

Moran.I (res, dist_inv)

## $observed
## [1] 0.02244296
## 
## $expected
## [1] -0.002487562
## 
## $sd
## [1] 0.004260633
## 
## $p.value
## [1] 4.875545e-09

#Ahora en vez de mapear la materia organica, ahora se ampean los residuales del modelo:

datosqgiz |>
  ggplot(aes(x=Lat, y=Long, color=res))+
  geom_point()+
  labs(x="Longitud", y="Latitud", title = "Datos espaciales de los residuos")+
  theme_classic()

#Existen diversos modelos espaciales, y de gran interés saber cual me sirve más para ajustar la materia organica 

library(spatialreg)

## Loading required package: spData

## To access larger datasets in this package, install the spDataLarge
## package with: `install.packages('spDataLarge',
## repos='https://nowosad.github.io/drat/', type='source')`

## Loading required package: Matrix

## 
## Attaching package: 'Matrix'

## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack

## Loading required package: sf

## Linking to GEOS 3.8.0, GDAL 3.0.4, PROJ 6.3.1; sf_use_s2() is TRUE

## Registered S3 method overwritten by 'spdep':
##   method   from
##   plot.mst ape

library(spdep)

## 
## Attaching package: 'spdep'

## The following objects are masked from 'package:spatialreg':
## 
##     get.ClusterOption, get.coresOption, get.mcOption,
##     get.VerboseOption, get.ZeroPolicyOption, set.ClusterOption,
##     set.coresOption, set.mcOption, set.VerboseOption,
##     set.ZeroPolicyOption

library(sp)

#Ahora se construye la matriz de pesos espaciales
XYdata=data.frame(Lat,Long)
contnb=dnearneigh(coordinates(XYdata),0,380000,longlat = T)
dlist <- nbdists(contnb, XYdata)                   #Distance
dlist <- lapply(dlist, function(x) 1/x)            #inverse distance
Wve=nb2listw(contnb,glist=dlist,style = "W")       #W matriz-standarized

#Ahora se corre un modelo espacial con la matriz anterior, se usa el sacsar
mod_esp=sacsarlm(MO~Ca+Mg,listw = Wve)
summary(mod_esp)

## 
## Call:sacsarlm(formula = MO ~ Ca + Mg, listw = Wve)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.99262 -0.40366 -0.02078  0.45603  2.03462 
## 
## Type: sac 
## Coefficients: (asymptotic standard errors) 
##              Estimate Std. Error z value  Pr(>|z|)
## (Intercept) 0.0132573  1.4928887  0.0089    0.9929
## Ca          0.0852854  0.0110719  7.7028 1.332e-14
## Mg          0.0010619  0.0338908  0.0313    0.9750
## 
## Rho: 0.66767
## Asymptotic standard error: 0.72032
##     z-value: 0.92691, p-value: 0.35397
## Lambda: 0.70367
## Asymptotic standard error: 0.68067
##     z-value: 1.0338, p-value: 0.30123
## 
## LR test value: 13.13, p-value: 0.0014091
## 
## Log likelihood: -371.8637 for sac model
## ML residual variance (sigma squared): 0.36747, (sigma: 0.6062)
## Number of observations: 403 
## Number of parameters estimated: 6 
## AIC: 755.73, (AIC for lm: 764.86)

res_esp <- residuals(mod_esp)
hist(res_esp)

Moran.I(res_esp, dist_inv)

## $observed
## [1] 0.005040485
## 
## $expected
## [1] -0.002487562
## 
## $sd
## [1] 0.004260315
## 
## $p.value
## [1] 0.07722546

#Ahora se buscan los valores ajustados y predichos

Valores_predichos <- mod_esp$fitted.values

#Ahora se mapean en gg plot:

datosqgiz |>
  ggplot(aes(x=Lat, y=Long, color=Valores_predichos))+
  geom_point()+
  labs(x="Longitud", y="Latitud", title = "Datos espaciales de los valores predichos")+
  theme_classic()

#Como el magnesio no sirve, ahora solo se hace con el calcio:

mod_espca=sacsarlm(MO~Ca,listw = Wve)
summary(mod_espca)

## 
## Call:sacsarlm(formula = MO ~ Ca, listw = Wve)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -1.994648 -0.403813 -0.021071  0.455428  2.034905 
## 
## Type: sac 
## Coefficients: (asymptotic standard errors) 
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.0146609  1.4817690  0.0099   0.9921
## Ca          0.0854420  0.0096792  8.8274   <2e-16
## 
## Rho: 0.66737
## Asymptotic standard error: 0.71638
##     z-value: 0.93159, p-value: 0.35155
## Lambda: 0.70386
## Asymptotic standard error: 0.6764
##     z-value: 1.0406, p-value: 0.29806
## 
## LR test value: 13.133, p-value: 0.0014068
## 
## Log likelihood: -371.8642 for sac model
## ML residual variance (sigma squared): 0.36748, (sigma: 0.6062)
## Number of observations: 403 
## Number of parameters estimated: 5 
## AIC: 753.73, (AIC for lm: 762.86)

res_espca <- residuals(mod_espca)
hist(res_espca)

Moran.I(res_espca, dist_inv)

## $observed
## [1] 0.005047208
## 
## $expected
## [1] -0.002487562
## 
## $sd
## [1] 0.00426031
## 
## $p.value
## [1] 0.07696118

#Ahora que pasa si hago el modelo con todas las variables explicativas:

colnames(datosqgiz)

##  [1] "id"   "Long" "Lat"  "z"    "MO"   "Ca"   "Mg"   "K"    "Na"   "CICE"
## [11] "CE"   "Fe"   "Cu"   "Zn"

mod_all <- sacsarlm(MO ~ Ca + I(Ca**2) + z + K + Na,listw = Wve)
summary(mod_all)

## 
## Call:sacsarlm(formula = MO ~ Ca + I(Ca^2) + z + K + Na, listw = Wve)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -1.581684 -0.384202 -0.060031  0.417791  1.950593 
## 
## Type: sac 
## Coefficients: (asymptotic standard errors) 
##                Estimate  Std. Error z value  Pr(>|z|)
## (Intercept) -0.28019999  1.49616122 -0.1873    0.8514
## Ca           0.12428195  0.03125885  3.9759 7.011e-05
## I(Ca^2)     -0.00316672  0.00155496 -2.0365    0.0417
## z            0.00043946  0.00285613  0.1539    0.8777
## K            1.05861460  0.24210268  4.3726 1.228e-05
## Na           0.03565224  0.24589010  0.1450    0.8847
## 
## Rho: 0.63124
## Asymptotic standard error: 0.67663
##     z-value: 0.93291, p-value: 0.35087
## Lambda: 0.72372
## Asymptotic standard error: 0.5585
##     z-value: 1.2958, p-value: 0.19503
## 
## LR test value: 12.434, p-value: 0.001995
## 
## Log likelihood: -358.1374 for sac model
## ML residual variance (sigma squared): 0.34333, (sigma: 0.58595)
## Number of observations: 403 
## Number of parameters estimated: 9 
## AIC: 734.27, (AIC for lm: 742.71)

res_all <- residuals(mod_all)
hist(res_all)

shapiro.test(res_all) #se toma un 0,01

## 
##  Shapiro-Wilk normality test
## 
## data:  res_all
## W = 0.98683, p-value = 0.001032

Moran.I(res_all, dist_inv)

## $observed
## [1] 0.005423791
## 
## $expected
## [1] -0.002487562
## 
## $sd
## [1] 0.004260256
## 
## $p.value
## [1] 0.06330921

#Ahora se hace la correlación entre el valor observado y los valores predichos o ajustados:

a <- cor(MO, mod_all$fitted.values) #Coeficiente de determinación
cor(MO, mod_all$fitted.values)**2 #Coeficiente de determinación R2

## [1] 0.3018563

#La variabilad explicada es muy baja
#Se hace
plot(MO,mod_all$fitted.values,pch=19)

#De tarea ahora se selcionan otras cuatro variables explicativas, y se hace la pauta para los otros modelos.

Estadistica espacial

2023-11-20