options(digits = 12)
library(rworldmap)
library(rworldxtra)
library(ggmap)
library(sf)       
library(spdep)
library(ape)
library(sp)
library(MVA)
library(Hmisc)
library(normtest)
library(nortest)
library(corrplot)
library(psych) 
library(crayon)
library(pastecs)
library(readxl)
library(clhs)
library(spatialreg)
XPABLO <- read_excel("XPABLO.xlsx", 
                     sheet = "tabla_1");
View(XPABLO)
dfCOS=data.frame(XPABLO)

Se trabajara con la variable Na

  1. Muestreo espacial del 80% de los datos
set.seed(123)
n = 0.8*403;
n = round(n,0)

df_Luis = data.frame(x = dfCOS$Long,
                     y = dfCOS$Lat,
                     Na = dfCOS$Na)

muestra <- clhs(df_Luis, size = n,
           iter = 100,progress = FALSE,
           simple = TRUE)

df_Luis = df_Luis[muestra,]

#View(df_Luis)
plot(dfCOS$Long,dfCOS$Lat)
points(df_Luis$x,df_Luis$y,pch=16,col=10*df_Luis$Na)

df_Luis = dfCOS[muestra,]

XY = as.matrix(df_Luis[,2:3])
COS.d=as.matrix(dist(XY, diag=T, upper=T))
##matriz inversa de distancias
COS.d.inv <-as.matrix(1/COS.d)
##asignando cero a la diag
diag(COS.d.inv) <- 0
##__la matriz de pesos basado en distancias
W=as.matrix(COS.d.inv)
##__verificando sumas por filas
SUMAS=apply(W,1,sum)
##__estandarizando la metriz
We=W/SUMAS

2)Modelos de regresión espacial

# Construccion de matriz de pesos estandarizada
contnb=dnearneigh(coordinates(XY),0,380000,longlat = F)
dlist <- nbdists(contnb, XY)
dlist <- lapply(dlist, function(x) 1/x)
Wve=nb2listw(contnb,glist=dlist,style = "W")

\[y = \lambda W Y + \epsilon\]

mod1=spautolm(Na~1,data=df_Luis,listw=Wve)
summary(mod1)
## 
## Call: spautolm(formula = Na ~ 1, data = df_Luis, listw = Wve)
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.2123726484 -0.0905937180 -0.0255299050  0.0481573667  0.5308942631 
## 
## Coefficients: 
##                 Estimate   Std. Error z value   Pr(>|z|)
## (Intercept) 0.2629027342 0.0569102211  4.6196 3.8447e-06
## 
## Lambda: 0.870360093 LR test value: 14.4436105 p-value: 0.00014441897 
## Numerical Hessian standard error of lambda: 0.121859108 
## 
## Log likelihood: 192.306001079 
## ML residual variance (sigma squared): 0.0175272523, (sigma: 0.13239053)
## Number of observations: 322 
## Number of parameters estimated: 3 
## AIC: -378.612002
res1 = mod1$fit$residuals
Moran.I(res1,COS.d.inv)
## $observed
## [1] 0.0177687856432
## 
## $expected
## [1] -0.00311526479751
## 
## $sd
## [1] 0.00537455945061
## 
## $p.value
## [1] 0.000102025697095

Sigue habiendo dependencia espacial, ya que el p-value es muy pequeƱo, por lo cual este modelo no sirve * Modelo autorregresivo puro SARAR

\[y = \lambda W Y + u \\ u = \rho+\epsilon\]

mod2 = sacsarlm(Na~1,data=df_Luis,listw=Wve)
summary(mod2)
## 
## Call:sacsarlm(formula = Na ~ 1, data = df_Luis, listw = Wve)
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.2196210708 -0.0893525947 -0.0225830485  0.0424488364  0.5393924353 
## 
## Type: sac 
## Coefficients: (numerical Hessian approximate standard errors) 
##                 Estimate   Std. Error z value Pr(>|z|)
## (Intercept) 0.0596432588 0.0677178638 0.88076  0.37845
## 
## Rho: 0.752832996
## Approximate (numerical Hessian) standard error: 0.219922719
##     z-value: 3.42317065, p-value: 0.000618952078
## Lambda: 0.752832993
## Approximate (numerical Hessian) standard error: 0.220644832
##     z-value: 3.41196749, p-value: 0.000644958043
## 
## LR test value: 19.8433073, p-value: 4.90998945e-05
## 
## Log likelihood: 195.005849517 for sac model
## ML residual variance (sigma squared): 0.0171898365, (sigma: 0.131110017)
## Number of observations: 322 
## Number of parameters estimated: 4 
## AIC: -382.011699, (AIC for lm: -366.168392)
res2 = mod2$residuals
Moran.I(res2,COS.d.inv)
## $observed
## [1] 0.00935727155204
## 
## $expected
## [1] -0.00311526479751
## 
## $sd
## [1] 0.00537345304063
## 
## $p.value
## [1] 0.0202792935938

Aunque el p-value mejoro mucho con respecto al modelo anterior, el modelo SARAR todavia no logra quitar la dependencia espacial del Na

\[y = \lambda W Y + X\beta + \epsilon\]

mod3=errorsarlm(formula=Na~Ca+z+Mg+K+cos+CICE+Cu+Zn,data=df_Luis,listw=Wve)
summary(mod3)
## 
## Call:errorsarlm(formula = Na ~ Ca + z + Mg + K + cos + CICE + Cu + 
##     Zn, data = df_Luis, listw = Wve)
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.2883324808 -0.0683821219 -0.0247995767  0.0390753171  0.5350846508 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                    Estimate      Std. Error  z value   Pr(>|z|)
## (Intercept)  0.111905382794  0.057324083362  1.95215    0.05092
## Ca          -0.000723752063  0.004543927168 -0.15928    0.87345
## z            0.000110808906  0.000603978727  0.18346    0.85443
## Mg           0.050417931462  0.010248337286  4.91962 8.6712e-07
## K            0.045339966559  0.061402169975  0.73841    0.46027
## cos          0.031451743932  0.021722610925  1.44788    0.14765
## CICE         0.001609579682  0.004103586086  0.39224    0.69488
## Cu           0.001415662996  0.005458437833  0.25935    0.79536
## Zn          -0.002274320943  0.004367090068 -0.52079    0.60252
## 
## Lambda: 0.725914611, LR test value: 4.73236018, p-value: 0.0296002402
## Asymptotic standard error: 0.176477794
##     z-value: 4.11334817, p-value: 3.89961553e-05
## Wald statistic: 16.9196332, p-value: 3.89961553e-05
## 
## Log likelihood: 234.397530537 for error model
## ML residual variance (sigma squared): 0.0135656615, (sigma: 0.11647172)
## Number of observations: 322 
## Number of parameters estimated: 11 
## AIC: -446.795061, (AIC for lm: -444.062701)
res3 = mod3$residuals
Moran.I(res3,COS.d.inv)
## $observed
## [1] 0.00860535594717
## 
## $expected
## [1] -0.00311526479751
## 
## $sd
## [1] 0.00536921283723
## 
## $p.value
## [1] 0.0290408972473

Este modelo sigue sin quitar la dependencia espacial, a continuacion se prueba quitando las variables con mayor p valor (z,CICE,Ca).

mod3=errorsarlm(formula=Na~Mg+K+cos+Cu+Zn,data=df_Luis,listw=Wve)
summary(mod3)
## 
## Call:errorsarlm(formula = Na ~ Mg + K + cos + Cu + Zn, data = df_Luis, 
##     listw = Wve)
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.2898367055 -0.0694423677 -0.0237357807  0.0387324675  0.5318167847 
## 
## Type: error 
## Coefficients: (asymptotic standard errors) 
##                   Estimate     Std. Error  z value   Pr(>|z|)
## (Intercept)  0.12378410647  0.03237340009  3.82364  0.0001315
## Mg           0.05316904297  0.00864735224  6.14859 7.8174e-10
## K            0.04834861720  0.06100589604  0.79252  0.4280554
## cos          0.03456221120  0.01941173497  1.78048  0.0749974
## Cu           0.00111964090  0.00541028324  0.20695  0.8360514
## Zn          -0.00234986657  0.00418756283 -0.56115  0.5746927
## 
## Lambda: 0.728728621, LR test value: 5.06657386, p-value: 0.0243915474
## Asymptotic standard error: 0.174918966
##     z-value: 4.16609266, p-value: 3.09864796e-05
## Wald statistic: 17.3563281, p-value: 3.09864796e-05
## 
## Log likelihood: 234.255864324 for error model
## ML residual variance (sigma squared): 0.0135766295, (sigma: 0.116518795)
## Number of observations: 322 
## Number of parameters estimated: 8 
## AIC: -452.511729, (AIC for lm: -449.445155)
res3 = mod3$residuals
Moran.I(res3,COS.d.inv)
## $observed
## [1] 0.00868238683685
## 
## $expected
## [1] -0.00311526479751
## 
## $sd
## [1] 0.00536923515636
## 
## $p.value
## [1] 0.0280012712128

Como se puede observar el p valor sigue disminuyendo, lo que indica que este modelo no es suficiente para quitar la dependencia espacial.

mod4=lagsarlm(formula=Na~Ca+z+Mg+K+cos+CICE+Cu+Zn,data=df_Luis,listw=Wve)
summary(mod4)
## 
## Call:lagsarlm(formula = Na ~ Ca + z + Mg + K + cos + CICE + Cu + Zn, 
##     data = df_Luis, listw = Wve)
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.2890802644 -0.0709917430 -0.0247214478  0.0410004533  0.5409346051 
## 
## Type: lag 
## Coefficients: (asymptotic standard errors) 
##                    Estimate      Std. Error  z value   Pr(>|z|)
## (Intercept) -6.10047829e-02  7.44290164e-02 -0.81964    0.41242
## Ca           9.00245073e-05  4.45232519e-03  0.02022    0.98387
## z            1.55289925e-04  4.13023124e-04  0.37598    0.70693
## Mg           4.68757330e-02  9.92064909e-03  4.72507 2.3004e-06
## K            4.10407316e-02  6.12900794e-02  0.66961    0.50310
## cos          2.98446294e-02  2.11559465e-02  1.41070    0.15833
## CICE         1.46722676e-03  4.05484285e-03  0.36185    0.71747
## Cu           3.28072185e-04  5.41078637e-03  0.06063    0.95165
## Zn          -1.13401737e-03  4.30940074e-03 -0.26315    0.79244
## 
## Rho: 0.62027996, LR test value: 3.89815162, p-value: 0.048339263
## Asymptotic standard error: 0.216055692
##     z-value: 2.87092626, p-value: 0.00409270985
## Wald statistic: 8.24221759, p-value: 0.00409270985
## 
## Log likelihood: 233.980426255 for lag model
## ML residual variance (sigma squared): 0.0136310653, (sigma: 0.116752153)
## Number of observations: 322 
## Number of parameters estimated: 11 
## AIC: -445.960853, (AIC for lm: -444.062701)
## LM test for residual autocorrelation
## test value: 7.06828635, p-value: 0.0078460263
res4 = mod4$residuals
Moran.I(res4, COS.d.inv)
## $observed
## [1] 0.00904752141909
## 
## $expected
## [1] -0.00311526479751
## 
## $sd
## [1] 0.00536876521086
## 
## $p.value
## [1] 0.0234837422836

\[y = \lambda W Y +X \beta + u \\ u = \rho W u + \epsilon\]

mod5 = sacsarlm(formula=Na~Mg+cos+Zn,data=df_Luis,listw=Wve)
summary(mod5)
## 
## Call:sacsarlm(formula = Na ~ Mg + cos + Zn, data = df_Luis, listw = Wve)
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.2974616944 -0.0679780211 -0.0234432416  0.0372436693  0.5364515744 
## 
## Type: sac 
## Coefficients: (asymptotic standard errors) 
##                   Estimate     Std. Error  z value   Pr(>|z|)
## (Intercept)  0.00621211161  0.18087927102  0.03434   0.972603
## Mg           0.05552405394  0.00714579390  7.77017 7.7716e-15
## cos          0.03923030881  0.01753786162  2.23689   0.025293
## Zn          -0.00174727314  0.00336705971 -0.51893   0.603809
## 
## Rho: 0.432400616
## Asymptotic standard error: 0.654186993
##     z-value: 0.660974035, p-value: 0.508628965
## Lambda: 0.61748175
## Asymptotic standard error: 0.53542037
##     z-value: 1.15326533, p-value: 0.248801496
## 
## LR test value: 6.24537133, p-value: 0.0440387361
## 
## Log likelihood: 234.53659292 for sac model
## ML residual variance (sigma squared): 0.0135604009, (sigma: 0.116449135)
## Number of observations: 322 
## Number of parameters estimated: 7 
## AIC: -455.073186, (AIC for lm: -452.827815)
res5 = mod5$residuals
Moran.I(res5, COS.d.inv)
## $observed
## [1] 0.00574075949208
## 
## $expected
## [1] -0.00311526479751
## 
## $sd
## [1] 0.00536794136665
## 
## $p.value
## [1] 0.0989840537342

Por ahora este es el mejor modelo, que logra quitar la dependencia espacial

\[y = \lambda W Y +X \beta_1 + X \beta_2 + u \\ u = \rho W u + \epsilon\]

mod6 = sacsarlm(formula=Na~Ca+z+Mg+K+cos+CICE+Cu+Zn,data=df_Luis,listw=Wve, type = "mixed")
summary(mod6)
## 
## Call:sacsarlm(formula = Na ~ Ca + z + Mg + K + cos + CICE + Cu + Zn, 
##     data = df_Luis, listw = Wve, type = "mixed")
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.2887692371 -0.0698291898 -0.0246955759  0.0395861094  0.5366135568 
## 
## Type: sacmixed 
## Coefficients: (asymptotic standard errors) 
##                    Estimate      Std. Error  z value   Pr(>|z|)
## (Intercept)  0.111985052690  0.653533805678  0.17135    0.86395
## Ca          -0.003227266591  0.004853645631 -0.66492    0.50610
## z            0.000625485528  0.001346961242  0.46437    0.64238
## Mg           0.051807845318  0.011223895880  4.61585 3.9149e-06
## K            0.046859682663  0.062851876439  0.74556    0.45593
## cos          0.033952724703  0.022544537006  1.50603    0.13206
## CICE         0.002459146890  0.004241250401  0.57982    0.56204
## Cu           0.002150551085  0.005694622116  0.37765    0.70569
## Zn          -0.003497627646  0.004722535064 -0.74063    0.45892
## lag.Ca       0.086220208105  0.072380408620  1.19121    0.23357
## lag.z       -0.002904630429  0.003893551643 -0.74601    0.45566
## lag.Mg       0.113262357762  0.305024773699  0.37132    0.71040
## lag.K       -0.688550544745  1.150310341951 -0.59858    0.54945
## lag.cos      0.049553461890  0.299740399690  0.16532    0.86869
## lag.CICE    -0.067883447879  0.062584575397 -1.08467    0.27807
## lag.Cu      -0.073470529698  0.101112448076 -0.72662    0.46746
## lag.Zn       0.087635929150  0.085419915386  1.02594    0.30492
## 
## Rho: 0.511707812
## Asymptotic standard error: 4.06579784
##     z-value: 0.125856679, p-value: 0.899845375
## Lambda: 0.550940175
## Asymptotic standard error: 3.82751736
##     z-value: 0.143941914, p-value: 0.885546338
## 
## LR test value: 11.2694243, p-value: 0.336916296
## 
## Log likelihood: 237.66606261 for sacmixed model
## ML residual variance (sigma squared): 0.0133017106, (sigma: 0.115333042)
## Number of observations: 322 
## Number of parameters estimated: 20 
## AIC: -435.332125, (AIC for lm: -444.062701)

El \(\rho\) y \(\lambda\) no sirven, ya que tienen p valor muy alto, por ello no es necesario seguir analizando.

\[y = X\beta_1 \]

mod7=lagsarlm(formula=Na~Ca+z+Mg+K+cos+CICE+Zn,data=df_Luis,listw=Wve,type = "mixed")
summary(mod7)
## 
## Call:
## lagsarlm(formula = Na ~ Ca + z + Mg + K + cos + CICE + Zn, data = df_Luis, 
##     listw = Wve, type = "mixed")
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.2961314254 -0.0676003255 -0.0216031464  0.0426787293  0.5339073716 
## 
## Type: mixed 
## Coefficients: (asymptotic standard errors) 
##                   Estimate     Std. Error  z value   Pr(>|z|)
## (Intercept)  0.01233664686  0.40714975284  0.03030   0.975828
## Ca          -0.00288940393  0.00477113315 -0.60560   0.544780
## z            0.00068537459  0.00132174269  0.51854   0.604083
## Mg           0.05281006279  0.01056425152  4.99894 5.7646e-07
## K            0.05048927147  0.06120104167  0.82497   0.409386
## cos          0.03729994892  0.02163425422  1.72412   0.084687
## CICE         0.00200400564  0.00415753207  0.48202   0.629793
## Zn          -0.00251148428  0.00372440297 -0.67433   0.500100
## lag.Ca       0.09453150218  0.05723992158  1.65150   0.098637
## lag.z       -0.00211611640  0.00340173158 -0.62207   0.533896
## lag.Mg       0.02404027231  0.08739547906  0.27507   0.783259
## lag.K       -0.34257656476  0.97584704977 -0.35106   0.725547
## lag.cos     -0.05311935531  0.22896121810 -0.23200   0.816537
## lag.CICE    -0.07406135285  0.05595506100 -1.32359   0.185640
## lag.Zn       0.05349938386  0.05189087420  1.03100   0.302542
## 
## Rho: 0.649687779, LR test value: 3.13386846, p-value: 0.0766815811
## Asymptotic standard error: 0.216376476
##     z-value: 3.00258046, p-value: 0.00267701195
## Wald statistic: 9.01548944, p-value: 0.00267701195
## 
## Log likelihood: 236.632404787 for mixed model
## ML residual variance (sigma squared): 0.0134011732, (sigma: 0.115763436)
## Number of observations: 322 
## Number of parameters estimated: 17 
## AIC: -439.26481, (AIC for lm: -438.130941)
## LM test for residual autocorrelation
## test value: 5.74454693, p-value: 0.0165399211
res7 = mod7$residuals
Moran.I(res7, COS.d.inv)
## $observed
## [1] 0.00647660043906
## 
## $expected
## [1] -0.00311526479751
## 
## $sd
## [1] 0.00536871587202
## 
## $p.value
## [1] 0.0739986231389

Este modelo tambien logra quitar la dependencia espacial, pero el modelo con mejores resultados fue el modelo SAC con un p valor cercano al 10 %. Ahora se analizara la normalidad de los residuales.

shapiro.test(res5)
## 
##  Shapiro-Wilk normality test
## 
## data:  res5
## W = 0.9054100172, p-value = 2.56722564e-13
ad.test(res5)
## 
##  Anderson-Darling normality test
## 
## data:  res5
## A = 8.53550967, p-value < 2.220446e-16
sf.test(res5)
## 
##  Shapiro-Francia normality test
## 
## data:  res5
## W = 0.9033039359, p-value = 7.21970111e-12
cvm.test(res5)
## Warning in cvm.test(res5): p-value is smaller than 7.37e-10, cannot be computed
## more accurately
## 
##  Cramer-von Mises normality test
## 
## data:  res5
## W = 1.568308832, p-value = 7.37e-10
hist(res5)

Como se puede observar, no se cumple el supuesto de normalidad, para mejorar esto, es necesario revisar si hay valores atipicos en los datos.

outliers::grubbs.test(res7)
## 
##  Grubbs test for one outlier
## 
## data:  res7
## G.402 = 4.6048881676, U = 0.9337350336, p-value = 0.000461011601
## alternative hypothesis: highest value 0.53390737156921 is an outlier

Se esta rechazando la hipotesis nula, lo que significa que el mas alto valor si es un outlier

which.max(res5)
## 402 
##   8
plot(df_Luis$Na,mod5$fitted.values)

Como se puede observar en el grafico anterior, es posible eliminar los datos mayores a 0.65, ya que estos pueden ser datos atipicos y esten daƱando la normalidad.

df_Luis2 = df_Luis
atipicos = which(df_Luis2$Na>0.65)
df_Luis2 = df_Luis2[-atipicos,]


XY = as.matrix(df_Luis2[,2:3])
COS.d=as.matrix(dist(XY, diag=T, upper=T))
COS.d.inv <-as.matrix(1/COS.d)
diag(COS.d.inv) <- 0
W=as.matrix(COS.d.inv)
SUMAS=apply(W,1,sum)
We=W/SUMAS


# Construccion de matriz de pesos estandarizada
contnb=dnearneigh(coordinates(XY),0,380000,longlat = F)
dlist <- nbdists(contnb, XY)
dlist <- lapply(dlist, function(x) 1/x)
Wve=nb2listw(contnb,glist=dlist,style = "W")
mod5b = sacsarlm(formula=Na~Mg+cos+Zn,data=df_Luis2,listw=Wve)
summary(mod5b)
## 
## Call:sacsarlm(formula = Na ~ Mg + cos + Zn, data = df_Luis2, listw = Wve)
## 
## Residuals:
##           Min            1Q        Median            3Q           Max 
## -0.2362630400 -0.0646864377 -0.0144796640  0.0395669444  0.3434738875 
## 
## Type: sac 
## Coefficients: (asymptotic standard errors) 
##                   Estimate     Std. Error  z value   Pr(>|z|)
## (Intercept) -0.01406199523  0.20623010248 -0.06819 0.94563761
## Mg           0.04327803134  0.00641836124  6.74285 1.5531e-11
## cos          0.05070388181  0.01488327905  3.40677 0.00065737
## Zn          -0.00320177764  0.00285582197 -1.12114 0.26222807
## 
## Rho: 0.543050158
## Asymptotic standard error: 0.789768189
##     z-value: 0.687607029, p-value: 0.491700281
## Lambda: 0.633182272
## Asymptotic standard error: 0.701051404
##     z-value: 0.903189508, p-value: 0.366425325
## 
## LR test value: 8.53559108, p-value: 0.0140126395
## 
## Log likelihood: 282.975583749 for sac model
## ML residual variance (sigma squared): 0.0095256362, (sigma: 0.0975993658)
## Number of observations: 313 
## Number of parameters estimated: 7 
## AIC: -551.951167, (AIC for lm: -547.415576)
res5b = mod5b$residuals
Moran.I(res5b, COS.d.inv)
## $observed
## [1] 0.00600294061219
## 
## $expected
## [1] -0.00320512820513
## 
## $sd
## [1] 0.00555144536492
## 
## $p.value
## [1] 0.0971803853349

Como se puede observar, al quitar los datos atipicos, el modelo sigue siviendo y quita la dependencia espacial. Se vuelve a probar el supuesto de normalidad en los residuales.

shapiro.test(res5b)
## 
##  Shapiro-Wilk normality test
## 
## data:  res5b
## W = 0.9450033194, p-value = 2.08655947e-09
ad.test(res5b)
## 
##  Anderson-Darling normality test
## 
## data:  res5b
## A = 5.513199329, p-value = 1.27685734e-13
sf.test(res5b)
## 
##  Shapiro-Francia normality test
## 
## data:  res5b
## W = 0.9447483891, p-value = 1.92845962e-08
cvm.test(res5b)
## 
##  Cramer-von Mises normality test
## 
## data:  res5b
## W = 0.9405557475, p-value = 2.62313595e-09
hist(res5b)