options(digits = 12)
library(rworldmap)
library(rworldxtra)
library(ggmap)
library(sf)
library(spdep)
library(ape)
library(sp)
library(MVA)
library(Hmisc)
library(normtest)
library(nortest)
library(corrplot)
library(psych)
library(crayon)
library(pastecs)
library(readxl)
library(clhs)
library(spatialreg)
XPABLO <- read_excel("XPABLO.xlsx",
sheet = "tabla_1");
View(XPABLO)
dfCOS=data.frame(XPABLO)
Se trabajara con la variable Na
set.seed(123)
n = 0.8*403;
n = round(n,0)
df_Luis = data.frame(x = dfCOS$Long,
y = dfCOS$Lat,
Na = dfCOS$Na)
muestra <- clhs(df_Luis, size = n,
iter = 100,progress = FALSE,
simple = TRUE)
df_Luis = df_Luis[muestra,]
#View(df_Luis)
plot(dfCOS$Long,dfCOS$Lat)
points(df_Luis$x,df_Luis$y,pch=16,col=10*df_Luis$Na)
df_Luis = dfCOS[muestra,]
XY = as.matrix(df_Luis[,2:3])
COS.d=as.matrix(dist(XY, diag=T, upper=T))
##matriz inversa de distancias
COS.d.inv <-as.matrix(1/COS.d)
##asignando cero a la diag
diag(COS.d.inv) <- 0
##__la matriz de pesos basado en distancias
W=as.matrix(COS.d.inv)
##__verificando sumas por filas
SUMAS=apply(W,1,sum)
##__estandarizando la metriz
We=W/SUMAS
2)Modelos de regresión espacial
# Construccion de matriz de pesos estandarizada
contnb=dnearneigh(coordinates(XY),0,380000,longlat = F)
dlist <- nbdists(contnb, XY)
dlist <- lapply(dlist, function(x) 1/x)
Wve=nb2listw(contnb,glist=dlist,style = "W")
\[y = \lambda W Y + \epsilon\]
mod1=spautolm(Na~1,data=df_Luis,listw=Wve)
summary(mod1)
##
## Call: spautolm(formula = Na ~ 1, data = df_Luis, listw = Wve)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2123726484 -0.0905937180 -0.0255299050 0.0481573667 0.5308942631
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.2629027342 0.0569102211 4.6196 3.8447e-06
##
## Lambda: 0.870360093 LR test value: 14.4436105 p-value: 0.00014441897
## Numerical Hessian standard error of lambda: 0.121859108
##
## Log likelihood: 192.306001079
## ML residual variance (sigma squared): 0.0175272523, (sigma: 0.13239053)
## Number of observations: 322
## Number of parameters estimated: 3
## AIC: -378.612002
res1 = mod1$fit$residuals
Moran.I(res1,COS.d.inv)
## $observed
## [1] 0.0177687856432
##
## $expected
## [1] -0.00311526479751
##
## $sd
## [1] 0.00537455945061
##
## $p.value
## [1] 0.000102025697095
Sigue habiendo dependencia espacial, ya que el p-value es muy pequeƱo, por lo cual este modelo no sirve * Modelo autorregresivo puro SARAR
\[y = \lambda W Y + u \\ u = \rho+\epsilon\]
mod2 = sacsarlm(Na~1,data=df_Luis,listw=Wve)
summary(mod2)
##
## Call:sacsarlm(formula = Na ~ 1, data = df_Luis, listw = Wve)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2196210708 -0.0893525947 -0.0225830485 0.0424488364 0.5393924353
##
## Type: sac
## Coefficients: (numerical Hessian approximate standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.0596432588 0.0677178638 0.88076 0.37845
##
## Rho: 0.752832996
## Approximate (numerical Hessian) standard error: 0.219922719
## z-value: 3.42317065, p-value: 0.000618952078
## Lambda: 0.752832993
## Approximate (numerical Hessian) standard error: 0.220644832
## z-value: 3.41196749, p-value: 0.000644958043
##
## LR test value: 19.8433073, p-value: 4.90998945e-05
##
## Log likelihood: 195.005849517 for sac model
## ML residual variance (sigma squared): 0.0171898365, (sigma: 0.131110017)
## Number of observations: 322
## Number of parameters estimated: 4
## AIC: -382.011699, (AIC for lm: -366.168392)
res2 = mod2$residuals
Moran.I(res2,COS.d.inv)
## $observed
## [1] 0.00935727155204
##
## $expected
## [1] -0.00311526479751
##
## $sd
## [1] 0.00537345304063
##
## $p.value
## [1] 0.0202792935938
Aunque el p-value mejoro mucho con respecto al modelo anterior, el modelo SARAR todavia no logra quitar la dependencia espacial del Na
\[y = \lambda W Y + X\beta + \epsilon\]
mod3=errorsarlm(formula=Na~Ca+z+Mg+K+cos+CICE+Cu+Zn,data=df_Luis,listw=Wve)
summary(mod3)
##
## Call:errorsarlm(formula = Na ~ Ca + z + Mg + K + cos + CICE + Cu +
## Zn, data = df_Luis, listw = Wve)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2883324808 -0.0683821219 -0.0247995767 0.0390753171 0.5350846508
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.111905382794 0.057324083362 1.95215 0.05092
## Ca -0.000723752063 0.004543927168 -0.15928 0.87345
## z 0.000110808906 0.000603978727 0.18346 0.85443
## Mg 0.050417931462 0.010248337286 4.91962 8.6712e-07
## K 0.045339966559 0.061402169975 0.73841 0.46027
## cos 0.031451743932 0.021722610925 1.44788 0.14765
## CICE 0.001609579682 0.004103586086 0.39224 0.69488
## Cu 0.001415662996 0.005458437833 0.25935 0.79536
## Zn -0.002274320943 0.004367090068 -0.52079 0.60252
##
## Lambda: 0.725914611, LR test value: 4.73236018, p-value: 0.0296002402
## Asymptotic standard error: 0.176477794
## z-value: 4.11334817, p-value: 3.89961553e-05
## Wald statistic: 16.9196332, p-value: 3.89961553e-05
##
## Log likelihood: 234.397530537 for error model
## ML residual variance (sigma squared): 0.0135656615, (sigma: 0.11647172)
## Number of observations: 322
## Number of parameters estimated: 11
## AIC: -446.795061, (AIC for lm: -444.062701)
res3 = mod3$residuals
Moran.I(res3,COS.d.inv)
## $observed
## [1] 0.00860535594717
##
## $expected
## [1] -0.00311526479751
##
## $sd
## [1] 0.00536921283723
##
## $p.value
## [1] 0.0290408972473
Este modelo sigue sin quitar la dependencia espacial, a continuacion se prueba quitando las variables con mayor p valor (z,CICE,Ca).
mod3=errorsarlm(formula=Na~Mg+K+cos+Cu+Zn,data=df_Luis,listw=Wve)
summary(mod3)
##
## Call:errorsarlm(formula = Na ~ Mg + K + cos + Cu + Zn, data = df_Luis,
## listw = Wve)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2898367055 -0.0694423677 -0.0237357807 0.0387324675 0.5318167847
##
## Type: error
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.12378410647 0.03237340009 3.82364 0.0001315
## Mg 0.05316904297 0.00864735224 6.14859 7.8174e-10
## K 0.04834861720 0.06100589604 0.79252 0.4280554
## cos 0.03456221120 0.01941173497 1.78048 0.0749974
## Cu 0.00111964090 0.00541028324 0.20695 0.8360514
## Zn -0.00234986657 0.00418756283 -0.56115 0.5746927
##
## Lambda: 0.728728621, LR test value: 5.06657386, p-value: 0.0243915474
## Asymptotic standard error: 0.174918966
## z-value: 4.16609266, p-value: 3.09864796e-05
## Wald statistic: 17.3563281, p-value: 3.09864796e-05
##
## Log likelihood: 234.255864324 for error model
## ML residual variance (sigma squared): 0.0135766295, (sigma: 0.116518795)
## Number of observations: 322
## Number of parameters estimated: 8
## AIC: -452.511729, (AIC for lm: -449.445155)
res3 = mod3$residuals
Moran.I(res3,COS.d.inv)
## $observed
## [1] 0.00868238683685
##
## $expected
## [1] -0.00311526479751
##
## $sd
## [1] 0.00536923515636
##
## $p.value
## [1] 0.0280012712128
Como se puede observar el p valor sigue disminuyendo, lo que indica que este modelo no es suficiente para quitar la dependencia espacial.
mod4=lagsarlm(formula=Na~Ca+z+Mg+K+cos+CICE+Cu+Zn,data=df_Luis,listw=Wve)
summary(mod4)
##
## Call:lagsarlm(formula = Na ~ Ca + z + Mg + K + cos + CICE + Cu + Zn,
## data = df_Luis, listw = Wve)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2890802644 -0.0709917430 -0.0247214478 0.0410004533 0.5409346051
##
## Type: lag
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.10047829e-02 7.44290164e-02 -0.81964 0.41242
## Ca 9.00245073e-05 4.45232519e-03 0.02022 0.98387
## z 1.55289925e-04 4.13023124e-04 0.37598 0.70693
## Mg 4.68757330e-02 9.92064909e-03 4.72507 2.3004e-06
## K 4.10407316e-02 6.12900794e-02 0.66961 0.50310
## cos 2.98446294e-02 2.11559465e-02 1.41070 0.15833
## CICE 1.46722676e-03 4.05484285e-03 0.36185 0.71747
## Cu 3.28072185e-04 5.41078637e-03 0.06063 0.95165
## Zn -1.13401737e-03 4.30940074e-03 -0.26315 0.79244
##
## Rho: 0.62027996, LR test value: 3.89815162, p-value: 0.048339263
## Asymptotic standard error: 0.216055692
## z-value: 2.87092626, p-value: 0.00409270985
## Wald statistic: 8.24221759, p-value: 0.00409270985
##
## Log likelihood: 233.980426255 for lag model
## ML residual variance (sigma squared): 0.0136310653, (sigma: 0.116752153)
## Number of observations: 322
## Number of parameters estimated: 11
## AIC: -445.960853, (AIC for lm: -444.062701)
## LM test for residual autocorrelation
## test value: 7.06828635, p-value: 0.0078460263
res4 = mod4$residuals
Moran.I(res4, COS.d.inv)
## $observed
## [1] 0.00904752141909
##
## $expected
## [1] -0.00311526479751
##
## $sd
## [1] 0.00536876521086
##
## $p.value
## [1] 0.0234837422836
\[y = \lambda W Y +X \beta + u \\ u = \rho W u + \epsilon\]
mod5 = sacsarlm(formula=Na~Mg+cos+Zn,data=df_Luis,listw=Wve)
summary(mod5)
##
## Call:sacsarlm(formula = Na ~ Mg + cos + Zn, data = df_Luis, listw = Wve)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2974616944 -0.0679780211 -0.0234432416 0.0372436693 0.5364515744
##
## Type: sac
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.00621211161 0.18087927102 0.03434 0.972603
## Mg 0.05552405394 0.00714579390 7.77017 7.7716e-15
## cos 0.03923030881 0.01753786162 2.23689 0.025293
## Zn -0.00174727314 0.00336705971 -0.51893 0.603809
##
## Rho: 0.432400616
## Asymptotic standard error: 0.654186993
## z-value: 0.660974035, p-value: 0.508628965
## Lambda: 0.61748175
## Asymptotic standard error: 0.53542037
## z-value: 1.15326533, p-value: 0.248801496
##
## LR test value: 6.24537133, p-value: 0.0440387361
##
## Log likelihood: 234.53659292 for sac model
## ML residual variance (sigma squared): 0.0135604009, (sigma: 0.116449135)
## Number of observations: 322
## Number of parameters estimated: 7
## AIC: -455.073186, (AIC for lm: -452.827815)
res5 = mod5$residuals
Moran.I(res5, COS.d.inv)
## $observed
## [1] 0.00574075949208
##
## $expected
## [1] -0.00311526479751
##
## $sd
## [1] 0.00536794136665
##
## $p.value
## [1] 0.0989840537342
Por ahora este es el mejor modelo, que logra quitar la dependencia espacial
\[y = \lambda W Y +X \beta_1 + X \beta_2 + u \\ u = \rho W u + \epsilon\]
mod6 = sacsarlm(formula=Na~Ca+z+Mg+K+cos+CICE+Cu+Zn,data=df_Luis,listw=Wve, type = "mixed")
summary(mod6)
##
## Call:sacsarlm(formula = Na ~ Ca + z + Mg + K + cos + CICE + Cu + Zn,
## data = df_Luis, listw = Wve, type = "mixed")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2887692371 -0.0698291898 -0.0246955759 0.0395861094 0.5366135568
##
## Type: sacmixed
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.111985052690 0.653533805678 0.17135 0.86395
## Ca -0.003227266591 0.004853645631 -0.66492 0.50610
## z 0.000625485528 0.001346961242 0.46437 0.64238
## Mg 0.051807845318 0.011223895880 4.61585 3.9149e-06
## K 0.046859682663 0.062851876439 0.74556 0.45593
## cos 0.033952724703 0.022544537006 1.50603 0.13206
## CICE 0.002459146890 0.004241250401 0.57982 0.56204
## Cu 0.002150551085 0.005694622116 0.37765 0.70569
## Zn -0.003497627646 0.004722535064 -0.74063 0.45892
## lag.Ca 0.086220208105 0.072380408620 1.19121 0.23357
## lag.z -0.002904630429 0.003893551643 -0.74601 0.45566
## lag.Mg 0.113262357762 0.305024773699 0.37132 0.71040
## lag.K -0.688550544745 1.150310341951 -0.59858 0.54945
## lag.cos 0.049553461890 0.299740399690 0.16532 0.86869
## lag.CICE -0.067883447879 0.062584575397 -1.08467 0.27807
## lag.Cu -0.073470529698 0.101112448076 -0.72662 0.46746
## lag.Zn 0.087635929150 0.085419915386 1.02594 0.30492
##
## Rho: 0.511707812
## Asymptotic standard error: 4.06579784
## z-value: 0.125856679, p-value: 0.899845375
## Lambda: 0.550940175
## Asymptotic standard error: 3.82751736
## z-value: 0.143941914, p-value: 0.885546338
##
## LR test value: 11.2694243, p-value: 0.336916296
##
## Log likelihood: 237.66606261 for sacmixed model
## ML residual variance (sigma squared): 0.0133017106, (sigma: 0.115333042)
## Number of observations: 322
## Number of parameters estimated: 20
## AIC: -435.332125, (AIC for lm: -444.062701)
El \(\rho\) y \(\lambda\) no sirven, ya que tienen p valor muy alto, por ello no es necesario seguir analizando.
\[y = X\beta_1 \]
mod7=lagsarlm(formula=Na~Ca+z+Mg+K+cos+CICE+Zn,data=df_Luis,listw=Wve,type = "mixed")
summary(mod7)
##
## Call:
## lagsarlm(formula = Na ~ Ca + z + Mg + K + cos + CICE + Zn, data = df_Luis,
## listw = Wve, type = "mixed")
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2961314254 -0.0676003255 -0.0216031464 0.0426787293 0.5339073716
##
## Type: mixed
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.01233664686 0.40714975284 0.03030 0.975828
## Ca -0.00288940393 0.00477113315 -0.60560 0.544780
## z 0.00068537459 0.00132174269 0.51854 0.604083
## Mg 0.05281006279 0.01056425152 4.99894 5.7646e-07
## K 0.05048927147 0.06120104167 0.82497 0.409386
## cos 0.03729994892 0.02163425422 1.72412 0.084687
## CICE 0.00200400564 0.00415753207 0.48202 0.629793
## Zn -0.00251148428 0.00372440297 -0.67433 0.500100
## lag.Ca 0.09453150218 0.05723992158 1.65150 0.098637
## lag.z -0.00211611640 0.00340173158 -0.62207 0.533896
## lag.Mg 0.02404027231 0.08739547906 0.27507 0.783259
## lag.K -0.34257656476 0.97584704977 -0.35106 0.725547
## lag.cos -0.05311935531 0.22896121810 -0.23200 0.816537
## lag.CICE -0.07406135285 0.05595506100 -1.32359 0.185640
## lag.Zn 0.05349938386 0.05189087420 1.03100 0.302542
##
## Rho: 0.649687779, LR test value: 3.13386846, p-value: 0.0766815811
## Asymptotic standard error: 0.216376476
## z-value: 3.00258046, p-value: 0.00267701195
## Wald statistic: 9.01548944, p-value: 0.00267701195
##
## Log likelihood: 236.632404787 for mixed model
## ML residual variance (sigma squared): 0.0134011732, (sigma: 0.115763436)
## Number of observations: 322
## Number of parameters estimated: 17
## AIC: -439.26481, (AIC for lm: -438.130941)
## LM test for residual autocorrelation
## test value: 5.74454693, p-value: 0.0165399211
res7 = mod7$residuals
Moran.I(res7, COS.d.inv)
## $observed
## [1] 0.00647660043906
##
## $expected
## [1] -0.00311526479751
##
## $sd
## [1] 0.00536871587202
##
## $p.value
## [1] 0.0739986231389
Este modelo tambien logra quitar la dependencia espacial, pero el modelo con mejores resultados fue el modelo SAC con un p valor cercano al 10 %. Ahora se analizara la normalidad de los residuales.
shapiro.test(res5)
##
## Shapiro-Wilk normality test
##
## data: res5
## W = 0.9054100172, p-value = 2.56722564e-13
ad.test(res5)
##
## Anderson-Darling normality test
##
## data: res5
## A = 8.53550967, p-value < 2.220446e-16
sf.test(res5)
##
## Shapiro-Francia normality test
##
## data: res5
## W = 0.9033039359, p-value = 7.21970111e-12
cvm.test(res5)
## Warning in cvm.test(res5): p-value is smaller than 7.37e-10, cannot be computed
## more accurately
##
## Cramer-von Mises normality test
##
## data: res5
## W = 1.568308832, p-value = 7.37e-10
hist(res5)
Como se puede observar, no se cumple el supuesto de normalidad, para mejorar esto, es necesario revisar si hay valores atipicos en los datos.
outliers::grubbs.test(res7)
##
## Grubbs test for one outlier
##
## data: res7
## G.402 = 4.6048881676, U = 0.9337350336, p-value = 0.000461011601
## alternative hypothesis: highest value 0.53390737156921 is an outlier
Se esta rechazando la hipotesis nula, lo que significa que el mas alto valor si es un outlier
which.max(res5)
## 402
## 8
plot(df_Luis$Na,mod5$fitted.values)
Como se puede observar en el grafico anterior, es posible eliminar los datos mayores a 0.65, ya que estos pueden ser datos atipicos y esten daƱando la normalidad.
df_Luis2 = df_Luis
atipicos = which(df_Luis2$Na>0.65)
df_Luis2 = df_Luis2[-atipicos,]
XY = as.matrix(df_Luis2[,2:3])
COS.d=as.matrix(dist(XY, diag=T, upper=T))
COS.d.inv <-as.matrix(1/COS.d)
diag(COS.d.inv) <- 0
W=as.matrix(COS.d.inv)
SUMAS=apply(W,1,sum)
We=W/SUMAS
# Construccion de matriz de pesos estandarizada
contnb=dnearneigh(coordinates(XY),0,380000,longlat = F)
dlist <- nbdists(contnb, XY)
dlist <- lapply(dlist, function(x) 1/x)
Wve=nb2listw(contnb,glist=dlist,style = "W")
mod5b = sacsarlm(formula=Na~Mg+cos+Zn,data=df_Luis2,listw=Wve)
summary(mod5b)
##
## Call:sacsarlm(formula = Na ~ Mg + cos + Zn, data = df_Luis2, listw = Wve)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.2362630400 -0.0646864377 -0.0144796640 0.0395669444 0.3434738875
##
## Type: sac
## Coefficients: (asymptotic standard errors)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.01406199523 0.20623010248 -0.06819 0.94563761
## Mg 0.04327803134 0.00641836124 6.74285 1.5531e-11
## cos 0.05070388181 0.01488327905 3.40677 0.00065737
## Zn -0.00320177764 0.00285582197 -1.12114 0.26222807
##
## Rho: 0.543050158
## Asymptotic standard error: 0.789768189
## z-value: 0.687607029, p-value: 0.491700281
## Lambda: 0.633182272
## Asymptotic standard error: 0.701051404
## z-value: 0.903189508, p-value: 0.366425325
##
## LR test value: 8.53559108, p-value: 0.0140126395
##
## Log likelihood: 282.975583749 for sac model
## ML residual variance (sigma squared): 0.0095256362, (sigma: 0.0975993658)
## Number of observations: 313
## Number of parameters estimated: 7
## AIC: -551.951167, (AIC for lm: -547.415576)
res5b = mod5b$residuals
Moran.I(res5b, COS.d.inv)
## $observed
## [1] 0.00600294061219
##
## $expected
## [1] -0.00320512820513
##
## $sd
## [1] 0.00555144536492
##
## $p.value
## [1] 0.0971803853349
Como se puede observar, al quitar los datos atipicos, el modelo sigue siviendo y quita la dependencia espacial. Se vuelve a probar el supuesto de normalidad en los residuales.
shapiro.test(res5b)
##
## Shapiro-Wilk normality test
##
## data: res5b
## W = 0.9450033194, p-value = 2.08655947e-09
ad.test(res5b)
##
## Anderson-Darling normality test
##
## data: res5b
## A = 5.513199329, p-value = 1.27685734e-13
sf.test(res5b)
##
## Shapiro-Francia normality test
##
## data: res5b
## W = 0.9447483891, p-value = 1.92845962e-08
cvm.test(res5b)
##
## Cramer-von Mises normality test
##
## data: res5b
## W = 0.9405557475, p-value = 2.62313595e-09
hist(res5b)