Markdown Author: Jessie Bell
Download this Rmd: Top right corner → Code → Download Rmd
sf to rast conversion function:
# use the homemade function:
sf_2_rast <-function(sfObject,variableIndex = 1){
# coerce sf to a data.frame
dfObject <- data.frame(st_coordinates(sfObject),
z=as.data.frame(sfObject)[,variableIndex])
# coerce data.frame to SpatRaster
rastObject <- rast(dfObject,crs=crs(sfObject))
names(rastObject) <- names(sfObject)[variableIndex]
return(rastObject)
}
Read in and convert the data, then plot it to get a grasp on what is there:
#read in your data
prcpCA <- readRDS("prcpCA.rds")
#now the empty grid to interpolate the datad
gridCA <- readRDS("gridCA.rds")
#set the CRS and convert foreign object to SF
prcpCAsf <- prcpCA %>% st_as_sf(coords = c("X", "Y")) %>%
st_set_crs(value = 3310)
gridCA_sf <- st_as_sf(gridCA,
coords=c("X", "Y"),
crs=st_crs(prcpCAsf))
# plot it
prcpCAsf %>% ggplot() +
geom_sf(aes(fill=ANNUAL,size=ANNUAL),color="white",
shape=21,alpha=0.8) +
scale_fill_continuous(type = "viridis",name="mm") +
labs(title="Total Annual Precipitation") +
scale_size(guide="none")
Use Kriging to create a surface of the total annual precip in
Cali. Fit the model using the variogram, vgm
and fit.variogram like the tutorial did.
#plot the variogram
CAVar <- variogram(ANNUAL~1, prcpCAsf)
plot(CAVar, pch=20, cex=1.5, col="black",
ylab=expression("Semivariance ("*gamma*")"), xlab="Distance (m)", main="CA Annual Precipitation (mm)")
Range: ~100,000 m
Nugget: \(\gamma\) = 1000
Sill: \(\gamma\) = 110,000
Create an Exponential Model using the variogram (see step 4.5 as to why I chose Exp).
exp.model <- vgm(psill=158188, model="Exp", range=89112, nugget=0)
exp.fit <- fit.variogram(object = CAVar, model = exp.model)
exp.fit
## model psill range
## 1 Nug 0.0 0.00
## 2 Exp 156446.9 87158.59
Create a theoretical variogram.
plot(CAVar,model=exp.fit,pch=20,cex=1.5,col="black",
ylab=expression("Semivariance ("*gamma*")"),
xlab="Distance (m)", main = "Annual Precipitation (mm)",
sub="Points: Empirical, Line: Exponential Model")
Above you can see our observed (empirical/categorical) data alongside a fitted continuous model.
Do the last 3 steps faster with
autofitVariogram, check each model shape to find the best
fit:
#The exponential Model
CAVarAuto <- autofitVariogram(formula = ANNUAL~1,input_data = prcpCAsf, model="Exp")
plot(CAVarAuto)
The best model fit seems to be the Exponential Model (Exp), so I will plug the terms into the Exp Equation:
b = Nugget
C\(_0\) = Sill
d is the distance in question
r = Range
a = \(\frac{r}{3}\) = 29704
⭐ \(\hat{\gamma}(d)= 0+158188(1-e^{-d/29704})\) ⭐
#Checking all model types:
#the Matern, M. Stein's parameterization (Ste)
CAVarAuto <- autofitVariogram(formula = ANNUAL~1,input_data = prcpCAsf, model="Ste")
plot(CAVarAuto)
#The exponential Model
CAVarAuto <- autofitVariogram(formula = ANNUAL~1,input_data = prcpCAsf, model="Mat")
plot(CAVarAuto)
#The Gaussian Model
CAVarAuto <- autofitVariogram(formula = ANNUAL~1,input_data = prcpCAsf, model="Gau")
plot(CAVarAuto)
##### sph
CAVarAuto <- autofitVariogram(formula = ANNUAL~1,input_data = prcpCAsf, model="Sph")
plot(CAVarAuto)
Krige the data.
CAGstat <- gstat(formula = ANNUAL~1, locations = prcpCAsf,
model = exp.fit)
CAKrige_sf <- predict(CAGstat, newdata=gridCA_sf)
## [using ordinary kriging]
Convert sf to rast using homemade
function sf_2_rast.
CAKrige_rast <- sf_2_rast(CAKrige_sf)
Plot the prediction surface.
ggplot() +
geom_spatraster(data=CAKrige_rast, mapping = aes(fill=var1.pred),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="Precip. (mm)",na.value = "transparent") +
labs(title="CA Annual Precipitation (Exp)") +
theme_minimal()
Check out the variance around the predictions and plot it.
CAKrige_sf$var1.var.sqrt <- sqrt(CAKrige_sf$var1.var)
CAKrige_rast <- sf_2_rast(CAKrige_sf,variableIndex = 4)
ggplot() +
geom_spatraster(data=CAKrige_rast, mapping = aes(fill=var1.var.sqrt ),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="precip.(mm)",na.value = "transparent") +
labs(title="Variance of annual precip. (Exp)") +
theme_minimal()
Cross-validate using krige.cv and calculate
\(R^2\).
#cross validate with LOOCV
ExpKrigeLOOCV_sf <- krige.cv(formula = ANNUAL~1,
locations = prcpCAsf,
model = exp.fit, verbose = FALSE)
##R^2 for exp model
Exp_r2_LOOCV <- cor(ExpKrigeLOOCV_sf$observed, ExpKrigeLOOCV_sf$var1.pred)^2
# RMSE
Exp_rmse_LOOCV <-sqrt(mean(ExpKrigeLOOCV_sf$residual^2))
#cross validate with nfold
ExpKrigenfold_sf <- krige.cv(formula = ANNUAL~1,
nfold=10,
locations = prcpCAsf,
model = exp.fit, verbose = FALSE)
##R^2 for ste model
Exp_r2_nfold <- cor(ExpKrigenfold_sf$observed, ExpKrigenfold_sf$var1.pred)^2
# RMSE
Exp_rmse_nfold <- sqrt(mean(ExpKrigenfold_sf$residual^2))
Model5 <- c("Exponential")
#make a nice table
stats1 <- as.data.frame(cbind(Model5, Exp_r2_LOOCV, Exp_rmse_LOOCV, Exp_r2_nfold, Exp_rmse_nfold))
stats1 <- type.convert(stats1, as.is = TRUE) #makes it so that numeric stays numeric and characters stay as characters in df
colnames(stats1) <- c("Model", "R-Sq LOOCV","RMSE LOOCV", "R-Sq 10fold", "RMSE nfold") #rename columns
gt(stats1)|>
tab_options(table.font.size = px(10L))%>%
opt_table_font("Corbel")%>%
opt_stylize(style=4)%>%
fmt_number(columns = where(is.numeric), n_sigfig = 3, drop_trailing_zeros = T)
| Model | R-Sq LOOCV | RMSE LOOCV | R-Sq 10fold | RMSE nfold |
|---|---|---|---|---|
| Exponential | 0.902 | 138 | 0.890 | 147 |
The LOOCV \(R^2\) is 0.9020, that seems pretty, pretty good! The RMSE is 138, which is lower than the RMSE for an nfold of 10. I think this is because our precipitation dataset is small, and so using LOOCV works better here.
Tune and assess the surface fit as you like. Interpret and explain your surface and anything you notice in particular about it.
Please see interpretation in step 11 below.
#create the model
ste.model <- vgm(psill=150643, model="Ste", range=104784, nugget=0)
ste.fit <- fit.variogram(object = CAVar, model = ste.model)
#Krige the data
CAGstat <- gstat(formula = ANNUAL~1, locations = prcpCAsf,
model = ste.fit)
CAKrige_sf <- predict(CAGstat, newdata=gridCA_sf)
## [using ordinary kriging]
#convert to raster
CAKrige_rast <- sf_2_rast(CAKrige_sf)
#plot prediction
a <- ggplot() +
geom_spatraster(data=CAKrige_rast, mapping = aes(fill=var1.pred),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="Precip. (mm)",na.value = "transparent") +
labs(title="CA Annual Precipitation (Ste)") +
theme_minimal()
#plot variance
CAKrige_sf$var1.var.sqrt <- sqrt(CAKrige_sf$var1.var)
CAKrige_rast <- sf_2_rast(CAKrige_sf,variableIndex = 4)
b <- ggplot() +
geom_spatraster(data=CAKrige_rast, mapping = aes(fill=var1.var.sqrt ),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="precip.(mm)",na.value = "transparent") +
labs(title="Variance of annual precip. (Ste)") +
theme_minimal()
ggarrange(a,b)
#cross validate with LOOCV
SteKrigeLOOCV_sf <- krige.cv(formula = ANNUAL~1,
locations = prcpCAsf,
model = ste.fit, verbose = FALSE)
##R^2 for ste model
SteR2LOOCV <- cor(SteKrigeLOOCV_sf$observed, SteKrigeLOOCV_sf$var1.pred)^2
# RMSE
StermseLOOCV <- sqrt(mean(SteKrigeLOOCV_sf$residual^2))
#cross validate with nfold
SteKrigenfold_sf <- krige.cv(formula = ANNUAL~1,
nfold=10,
locations = prcpCAsf,
model = ste.fit, verbose = FALSE)
##R^2 for ste model
Ster2nfold <- cor(SteKrigenfold_sf$observed, SteKrigenfold_sf$var1.pred)^2
# RMSE
Stermsenfold <- sqrt(mean(SteKrigenfold_sf$residual^2))
Model2 <- c("M. Stein's Parameterization")
#make a nice table
stats2 <- as.data.frame(cbind(Model2, SteR2LOOCV, StermseLOOCV, Ster2nfold, Stermsenfold))
stats2 <- type.convert(stats2, as.is = TRUE) #makes it so that numeric stays numeric and characters stay as characters in df
colnames(stats2) <- c("Model", "R-Sq LOOCV","RMSE LOOCV", "R-Sq 10fold", "RMSE nfold") #rename columns
gt(stats2)|>
tab_options(table.font.size = px(10L))%>%
opt_table_font("Corbel")%>%
opt_stylize(style=4)%>%
fmt_number(columns = where(is.numeric), n_sigfig = 3, drop_trailing_zeros = T)
| Model | R-Sq LOOCV | RMSE LOOCV | R-Sq 10fold | RMSE nfold |
|---|---|---|---|---|
| M. Stein's Parameterization | 0.902 | 138 | 0.888 | 148 |
#create the model
gau.model <- vgm(psill=125428, model="Gau", range=54097, nugget=9733)
gau.fit <- fit.variogram(object = CAVar, model = gau.model)
#Krige the data
CAGstat <- gstat(formula = ANNUAL~1, locations = prcpCAsf,
model = gau.fit)
CAKrige_sf <- predict(CAGstat, newdata=gridCA_sf)
## [using ordinary kriging]
#convert to raster
CAKrige_rast <- sf_2_rast(CAKrige_sf)
#plot prediction
a <- ggplot() +
geom_spatraster(data=CAKrige_rast, mapping = aes(fill=var1.pred),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="Precip. (mm)",na.value = "transparent") +
labs(title="CA Annual Precipitation (Gau)") +
theme_minimal()
#plot variance
CAKrige_sf$var1.var.sqrt <- sqrt(CAKrige_sf$var1.var)
CAKrige_rast <- sf_2_rast(CAKrige_sf,variableIndex = 4)
b <- ggplot() +
geom_spatraster(data=CAKrige_rast, mapping = aes(fill=var1.var.sqrt ),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="precip.(mm)",na.value = "transparent") +
labs(title="Variance of annual precip. (Gau)") +
theme_minimal()
ggarrange(a,b)
#cross validate with LOOCV
GauKrigeLOOCV_sf <- krige.cv(formula = ANNUAL~1,
locations = prcpCAsf,
model = gau.fit, verbose = FALSE)
##R^2 for ste model
Gaur2a <- cor(GauKrigeLOOCV_sf$observed, GauKrigeLOOCV_sf$var1.pred)^2
# RMSE
GauRMSEb <- sqrt(mean(GauKrigeLOOCV_sf$residual^2))
#cross validate with nfold
GauKrigenfold_sf <- krige.cv(formula = ANNUAL~1,
nfold=10,
locations = prcpCAsf,
model = gau.fit, verbose = FALSE)
##R^2 for ste model
Gaur2b <- cor(GauKrigenfold_sf$observed, GauKrigenfold_sf$var1.pred)^2
# RMSE
Gaurmseb <- sqrt(mean(GauKrigenfold_sf$residual^2))
Model3 <- c("Gaussian")
#make a nice table
stats3 <- as.data.frame(cbind(Model3, Gaur2a, GauRMSEb, Gaur2b, Gaurmseb))
stats3 <- type.convert(stats3, as.is = TRUE) #makes it so that numeric stays numeric and characters stay as characters in df
colnames(stats3) <- c("Model", "R-Sq LOOCV","RMSE LOOCV", "R-Sq 10fold", "RMSE nfold") #rename columns
gt(stats3)|>
tab_options(table.font.size = px(10L))%>%
opt_table_font("Corbel")%>%
opt_stylize(style=4)%>%
fmt_number(columns = where(is.numeric), n_sigfig = 3, drop_trailing_zeros = T)
| Model | R-Sq LOOCV | RMSE LOOCV | R-Sq 10fold | RMSE nfold |
|---|---|---|---|---|
| Gaussian | 0.869 | 160 | 0.865 | 162 |
#create the model
sph.model <- vgm(psill=135302, model="Sph", range=144746, nugget=630)
sph.fit <- fit.variogram(object = CAVar, model = sph.model)
#Krige the data
CAGstat <- gstat(formula = ANNUAL~1, locations = prcpCAsf,
model = sph.fit)
CAKrige_sf <- predict(CAGstat, newdata=gridCA_sf)
## [using ordinary kriging]
#convert to raster
CAKrige_rast <- sf_2_rast(CAKrige_sf)
#plot prediction
a <- ggplot() +
geom_spatraster(data=CAKrige_rast, mapping = aes(fill=var1.pred),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="Precip. (mm)",na.value = "transparent") +
labs(title="CA Annual Precipitation (Sph)") +
theme_minimal()
#plot variance
CAKrige_sf$var1.var.sqrt <- sqrt(CAKrige_sf$var1.var)
CAKrige_rast <- sf_2_rast(CAKrige_sf,variableIndex = 4)
b <- ggplot() +
geom_spatraster(data=CAKrige_rast, mapping = aes(fill=var1.var.sqrt ),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="precip.(mm)",na.value = "transparent") +
labs(title="Variance of annual precip. (Sph)") +
theme_minimal()
ggarrange(a,b)
#cross validate with LOOCV
SphKrigeLOOCV_sf <- krige.cv(formula = ANNUAL~1,
locations = prcpCAsf,
model = sph.fit, verbose = FALSE)
##R^2 for ste model
sphr2a <- cor(SphKrigeLOOCV_sf$observed, SphKrigeLOOCV_sf$var1.pred)^2
# RMSE
sphrmsea <- sqrt(mean(SphKrigeLOOCV_sf$residual^2))
#cross validate with nfold
SphKrigenfold_sf <- krige.cv(formula = ANNUAL~1,
nfold=10,
locations = prcpCAsf,
model = sph.fit, verbose = FALSE)
##R^2 for ste model
Sphr2b <- cor(SphKrigenfold_sf$observed, SphKrigenfold_sf$var1.pred)^2
# RMSE
Sphrmseb <- sqrt(mean(SphKrigenfold_sf$residual^2))
Model4 <- c("Spherical")
#make a nice table
stats4 <- as.data.frame(cbind(Model4, sphr2a, sphrmsea, Sphr2b, Sphrmseb))
stats4 <- type.convert(stats4, as.is = TRUE) #makes it so that numeric stays numeric and characters stay as characters in df
colnames(stats4) <- c("Model", "R-Sq LOOCV","RMSE LOOCV", "R-Sq 10fold", "RMSE nfold") #rename columns
gt(stats4)|>
tab_options(table.font.size = px(10L))%>%
opt_table_font("Corbel")%>%
opt_stylize(style=4)%>%
fmt_number(columns = where(is.numeric), n_sigfig = 3, drop_trailing_zeros = T)
| Model | R-Sq LOOCV | RMSE LOOCV | R-Sq 10fold | RMSE nfold |
|---|---|---|---|---|
| Spherical | 0.896 | 143 | 0.887 | 149 |
finalstats <- rbind(stats1, stats2, stats3, stats4)
finalstats <- type.convert(finalstats, as.is = TRUE) #makes it so that numeric stays numeric and characters stay as characters in df
gt(finalstats)|>
tab_options(table.font.size = px(10L))%>%
opt_table_font("Corbel")%>%
opt_stylize(style=4)%>%
fmt_number(columns = where(is.numeric), n_sigfig = 3, drop_trailing_zeros = T)
| Model | R-Sq LOOCV | RMSE LOOCV | R-Sq nfold | RMSE nfold |
|---|---|---|---|---|
| Exponential | 0.902 | 138 | 0.890 | 147 |
| M. Stein's Parameterization | 0.902 | 138 | 0.888 | 148 |
| Gaussian | 0.869 | 160 | 0.865 | 162 |
| Spherical | 0.896 | 143 | 0.887 | 149 |
LOOCV Method Just by looking at the variograms, I originally thought the exponential model was best. In step 10 above, I made surface prediction and variance plots for each of the models to help with the interpretation. I found that the Exp Model and Ste Model had the same LOOCV \(R^2\) value. After reassessing the variogram models – they each seemed to fit lines of similar distances between the 11649 and 10504 points on the variogram. I think that each of these models has a similar compromise between the two points, rendering them the same fit overall.
10fold Method Using this method, the exponential model seems to have the best fit. It is still not as good as the fit for the LOOCV method, though. After reading through the Stackoverflow about these methods, it seems that the LOOCV has less bias and in the table above, you can see it also has less root mean square error. I think this means LOOCV is the best method?
Question I still have: What makes a dataset “small” or “large”? Is there a standard size? Maybe n=10000 is large and anything below that is small? And how do you choose the number of best folds when using the nfold function?
IDW has no error term associated with it. Kriging is a geostatistical method of interpolation that uses spatial autocorrelation – and it works best when there is strong autocorrelation in the data. When kriging we use the distance between sample points (and sometimes direction) to explain variation in a variable.
With IDW the weight was a deterministic function of distance. But with kriging, we use the distance between the measured points and the prediction location as well as the overall spatial structure of the measured points themselves.
Here, we demonstrate a variogram with the meuse dataset.
data(meuse.all)
data(meuse.grid, package="sp")
#make a variable to work with
meuse.all$logLead <- log(meuse.all$lead)
#make into an sf
meuse_sf <- st_as_sf(meuse.all,
coords=c("x", "y"))%>%
st_set_crs(value=28992)
meuse.grid <- st_as_sf(meuse.grid,
coords=c("x", "y"),
crs=st_crs(meuse_sf))
#print meusegrid data to see that it is now a simple feature collection with 5 fields, x and y points, Amersfoort CRS, etc...
#now plot it in ggplot
p1 <- ggplot(data=meuse_sf)+
geom_sf(aes(fill=logLead), size=4,
shape=21, color="white", alpha=0.8)+
scale_fill_continuous(type="viridis", name="log(ppm)")+
labs(title="Lead concentrations (DISCRETE VARIOGRAM")
p1
#now use the variogram function from gstat to determine autocorrelation
#sample variogram, observed variogram, etc...is the name of the variogram without the line
leadVar <- variogram(logLead~1, meuse_sf)
plot(leadVar, pch=20, cex=1.5, col="black",
ylab=expression("Semivariance ("*gamma*")"), xlab="Distance (m)", main="Lead concentrations (log(ppm))")
Below is a diagram of how to interpret these variograms:
The range is the distance beyond which the data are no longer correlated.
The sill is the variance of the variable.
The nugget is the autocorrelation at very small scales. The nugget represents independent error, measurement error and/or micro-scale variation at fine spatial scales. In a continuous variable we would expect that the nugget effect will be zero because at distance zero the values will be the same. However some variables can change in an abrupt manner and so in very short distance there is a difference.
If there is no structure (no correlation) and the variable is therefore a purely random variable, the variogram will be flat with a nugget effect.
After plotting the sample variogram we can then fit a statistical model to the points. The model is a continuous estimate of \(\gamma\) as a function of distance: \(\hat{\gamma}=f(d)\).
We will fit two different functions a spherical model and an exponential model. The text has descriptions of the various functions. In practice, we tend to look at the general shape of the plot and then choose a mathematical model that fits.
# note out initial estimates for the sill, range, and nugget
sph.model <- vgm(psill=0.6, model="Sph", range=750, nugget=0.05)
sph.fit <- fit.variogram(object = leadVar, model = sph.model)
sph.fit
## model psill range
## 1 Nug 0.1095742 0.000
## 2 Sph 0.4560318 1002.626
plot(leadVar,model=sph.fit,pch=20,cex=1.5,col="black",
ylab=expression("Semivariance ("*gamma*")"),
xlab="Distance (m)", main = "Lead concentrations (log(ppm))",
sub="Points: Empirical, Line: Spherical Model/THEORETICAL VARIOGRAM")
#this is a nonlinear function. This is a 3 term model. The nugget = y intercept (b in y=mx+b)
#the range x axis
#the sill telsyou the values of semivariance when there is no autocorrelation. the y axis
#these terms give you the distance at which point the curve flatens out. Values past the range are not autocorrelated
#the shape of teh variogram below
The solid line above is a fitted, theoretical, variogram model. This is done by iterations. We have gone from our observed data (empirical categorical data) and we have fitted a line so that it is now a continuous model! COOL. The spherical model is:
\(\hat{\gamma}(d)=b+C_0(1.5(\frac{d}{r})-0.5(\frac{d^3}{r}))\)
b is the nugget
\(C_0\) is the sill
d is the distance
r is the range
There are also exponential models:
\(\hat{\gamma}(d)= b+C_0(1-e^{-d/a})\) where \(a = r/3\) and all other terms are the same as above.
Imagine you have a dataframe with x, y, and z:
#any data frame with xyz
foo <- data.frame(x=c(1, 3, 1, 4, 5),
y=c(5, 4, 3, 5, 1),
z=c(100, 105, 105, 100, 115))
#plot it
p2 <- ggplot() +
geom_point(data=foo,aes(x=x,y=y,size=z)) +
lims(x=c(0,6),y=c(0,6))
p2
Now, imagine you don’t know z at (2,4):
p2 <- p2 +
geom_point(aes(x=2,y=4),color="plum",size=10,shape=0) +
geom_point(aes(x=2,y=4),color="plum",size=6,shape=63)
p2
Interpolate with Kriging to find z at (2,4). The Euclidean distance to our unknown point from each of our known points is a vector:
\(d_i=\) \[\begin{bmatrix} 1 & 1.414 \\ 2 & 1.000 \\ 3 & 1.414 \\ 4 & 2.236 \\ 5 & 4.243 \\ \end{bmatrix}\]To get \(\hat{Z}(s_0)\) you need to calculate \(\lambda\). To do this, fit a variogram by plotting semivariance \((\gamma)\) against distance \((d)\) and get the function that predicts \(\gamma\).
Imagine you did this and found the variogram model is:
\(\hat{\gamma}=0+13.5(d)\)
How will we get the weights \((\lambda)\)?
\(\lambda=g\)\(\Gamma^{-1}\)
\(\Gamma\) = matrix of semivariance of all sampled point pairs predicted as a function of euclidian distance using the fitted variogram model
\(g\) = a vector of predicted semivariance for unknown points using euclidian distances from known points
…and once we have \(\lambda\) we can interpolate (predict) over a surface.
Calculate \(g\) for the point you want to predict.
\(g=0+13.5(d_i)\):
#euclidian distance from each point to our unknown point:
d_i <- sqrt((2-foo$x)^2+(4-foo$y)^2)
#now use your variogram model equation (given to you by Andy) and plug in your d_i values to solve for g
g <- d_i*13.5
g
## [1] 19.09188 13.50000 19.09188 30.18692 57.27565
\[g=\]
\[\begin{bmatrix}
1 & 19.09 \\
2 & 13.50 \\
3 & 19.09 \\
4 & 30.19 \\
5 & 57.28 \\
\end{bmatrix}\]
Find \(\Gamma\).
Here is the distance matrix between all measured points (using the
proxy library):
euclid <- dist(foo[1:2], method="euclidean")
euclid
## 1 2 3 4
## 2 2.236068
## 3 2.000000 2.236068
## 4 3.000000 1.414214 3.605551
## 5 5.656854 3.605551 4.472136 4.123106
Now multiply this by the variogram model to get \(\Gamma\):
Gamma <- 13.5*euclid
Gamma
## 1 2 3 4
## 2 30.18692
## 3 27.00000 30.18692
## 4 40.50000 19.09188 48.67494
## 5 76.36753 48.67494 60.37384 55.66193
Now take the inverse of the matrix for \(\Gamma\) using solve
function:
inverse <- solve(Gamma)
inverse
## 1 2 3 4 5
## 1 -0.0229376496 0.005173891 0.016563227 0.008980417 0.0004308396
## 2 0.0051738909 -0.044601070 0.009873698 0.021194456 0.0028991841
## 3 0.0165632271 0.009873698 -0.025695317 -0.003488374 0.0070317344
## 4 0.0089804170 0.021194456 -0.003488374 -0.027071371 0.0072122450
## 5 0.0004308396 0.002899184 0.007031734 0.007212245 -0.0074569672
Find \(\lambda\) where \(\lambda=g\)\(\Gamma^{-1}\):
baby_lambda <- g %*% inverse
baby_lambda
## 1 2 3 4 5
## [1,] 0.2439155 0.4910203 0.25639 -0.01313665 -0.02777361
Use \(\lambda_i\) to create the final prediction equation:
\(\hat{Z}(s_0)=0.244(100)+0.491(105)+0.256(105)-0.013(100)-0.028(115)=98.3\)
meuse.grid
## Simple feature collection with 3103 features and 5 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 178460 ymin: 329620 xmax: 181540 ymax: 333740
## Projected CRS: Amersfoort / RD New
## First 10 features:
## part.a part.b dist soil ffreq geometry
## 1 1 0 0.00000000 1 1 POINT (181180 333740)
## 2 1 0 0.00000000 1 1 POINT (181140 333700)
## 3 1 0 0.01222430 1 1 POINT (181180 333700)
## 4 1 0 0.04346780 1 1 POINT (181220 333700)
## 5 1 0 0.00000000 1 1 POINT (181100 333660)
## 6 1 0 0.01222430 1 1 POINT (181140 333660)
## 7 1 0 0.03733950 1 1 POINT (181180 333660)
## 8 1 0 0.05936620 1 1 POINT (181220 333660)
## 9 1 0 0.00135803 1 1 POINT (181060 333620)
## 10 1 0 0.01222430 1 1 POINT (181100 333620)
meuse_grid_sf <- st_as_sf(meuse.grid)
leadVar <- variogram(logLead~1, meuse_sf)
#use the plot to determine the parameters for vgm()
plot(leadVar)
leadModel <- vgm(psill=0.6, model="Sph", range=750, nugget=0.05)
#there are lots of different models. We are using spherical based on the variogram look
leadFit <- fit.variogram(object = leadVar, model = leadModel)
leadGstat <- gstat(formula = logLead~1, locations = meuse_sf,
model = leadFit)
leadKrige_sf <- predict(leadGstat,newdata = meuse_grid_sf)
## [using ordinary kriging]
leadKrige_sf
## Simple feature collection with 3103 features and 2 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 178460 ymin: 329620 xmax: 181540 ymax: 333740
## Projected CRS: Amersfoort / RD New
## First 10 features:
## var1.pred var1.var geometry
## 1 5.336713 0.3183947 POINT (181180 333740)
## 2 5.406841 0.2724001 POINT (181140 333700)
## 3 5.343480 0.2854597 POINT (181180 333700)
## 4 5.279198 0.3003649 POINT (181220 333700)
## 5 5.486278 0.2238744 POINT (181100 333660)
## 6 5.415401 0.2377063 POINT (181140 333660)
## 7 5.339935 0.2536512 POINT (181180 333660)
## 8 5.266665 0.2701756 POINT (181220 333660)
## 9 5.562294 0.1801611 POINT (181060 333620)
## 10 5.496279 0.1894106 POINT (181100 333620)
Above you can see that the leadKrige is an
sf object that has both predicted surface
(var1.pred) and the variance around those predictions
(var1.var).
Convert sf into rast:
# use the homemade function:
sf_2_rast <-function(sfObject,variableIndex = 1){
# coerce sf to a data.frame
dfObject <- data.frame(st_coordinates(sfObject),
z=as.data.frame(sfObject)[,variableIndex])
# coerce data.frame to SpatRaster
rastObject <- rast(dfObject,crs=crs(sfObject))
names(rastObject) <- names(sfObject)[variableIndex]
return(rastObject)
}
leadKrige_rast <- sf_2_rast(leadKrige_sf)
leadKrige_rast
## class : SpatRaster
## dimensions : 104, 78, 1 (nrow, ncol, nlyr)
## resolution : 40, 40 (x, y)
## extent : 178440, 181560, 329600, 333760 (xmin, xmax, ymin, ymax)
## coord. ref. : Amersfoort / RD New (EPSG:28992)
## source(s) : memory
## name : var1.pred
## min value : 3.749342
## max value : 6.174216
Nice! Now, let’s plot the prediction surface:
# and plot
ggplot() +
geom_spatraster(data=leadKrige_rast, mapping = aes(fill=var1.pred),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="log(ppm)",na.value = "transparent") +
labs(title="Lead concentrations") +
theme_minimal()
Now, to look at the variance around those predictions, you need to
take the square root of var1.var so that the units match
the var1.pred units.
leadKrige_sf$var1.var.sqrt <- sqrt(leadKrige_sf$var1.var)
leadKrige_rast <- sf_2_rast(leadKrige_sf,variableIndex = 4)
leadKrige_rast
## class : SpatRaster
## dimensions : 104, 78, 1 (nrow, ncol, nlyr)
## resolution : 40, 40 (x, y)
## extent : 178440, 181560, 329600, 333760 (xmin, xmax, ymin, ymax)
## coord. ref. : Amersfoort / RD New (EPSG:28992)
## source(s) : memory
## name : var1.var.sqrt
## min value : 0.3881239
## max value : 0.6503917
And plot it:
# and plot
ggplot() +
geom_spatraster(data=leadKrige_rast, mapping = aes(fill=var1.var.sqrt ),alpha=0.8) +
scale_fill_continuous(type = "viridis",name="log(ppm)",na.value = "transparent") +
labs(title="Variance of lead concentrations") +
theme_minimal()
Using krige.cv we can cross validate. The default is to
leave-one-out cross validate (LOOCV), but you can tell it to k-fold
too:
leadKrigeLOOCV_sf <- krige.cv(formula = logLead~1,
locations = meuse_sf,
model = leadFit, verbose = FALSE)
leadKrigeLOOCV_sf
## Simple feature collection with 164 features and 6 fields
## Geometry type: POINT
## Dimension: XY
## Bounding box: xmin: 178605 ymin: 329714 xmax: 181390 ymax: 333611
## Projected CRS: Amersfoort / RD New
## First 10 features:
## var1.pred var1.var observed residual zscore fold
## 1 5.402571 0.2277973 5.700444 0.29787270 0.624103170 1
## 2 5.473044 0.2199374 5.624018 0.15097361 0.321922586 2
## 3 5.233697 0.2180910 5.293305 0.05960821 0.127640125 3
## 4 5.077189 0.2520550 4.753590 -0.32359847 -0.644553277 4
## 5 4.794539 0.2134651 4.762174 -0.03236554 -0.070051822 5
## 6 4.584552 0.2571416 4.919981 0.33542860 0.661475718 6
## 7 4.970576 0.2088842 4.882802 -0.08777385 -0.192049137 7
## 8 5.253444 0.2072102 5.010635 -0.24280892 -0.533407471 8
## 9 4.886148 0.1786532 4.890349 0.00420153 0.009940359 9
## 10 4.589513 0.1984181 4.382027 -0.20748612 -0.465798826 10
## geometry
## 1 POINT (181072 333611)
## 2 POINT (181025 333558)
## 3 POINT (181165 333537)
## 4 POINT (181298 333484)
## 5 POINT (181307 333330)
## 6 POINT (181390 333260)
## 7 POINT (181165 333370)
## 8 POINT (181027 333363)
## 9 POINT (181060 333231)
## 10 POINT (181232 333168)
Now calculate the \(R^2\):
cor(leadKrigeLOOCV_sf$observed, leadKrigeLOOCV_sf$var1.pred)^2
## [1] 0.6192935
The LOOCV \(R^2\) is 0.6193 which is
obtained from the observed vs. predicted values. Also note, all the
other cool values you get from the krige.cv funcion.
You can easily get the nugget, sill, and range using this code:
CAVarAuto <- autofitVariogram(formula = ANNUAL~1,input_data = prcpCAsf)
summary(CAVarAuto)
## Experimental variogram:
## np dist gamma dir.hor dir.ver id
## 1 113 7117.10 9085.183 0 0 var1
## 2 291 15356.15 20118.936 0 0 var1
## 3 482 24216.82 38977.831 0 0 var1
## 4 971 36289.97 54933.344 0 0 var1
## 5 1268 50835.99 68790.951 0 0 var1
## 6 1476 65163.37 87346.782 0 0 var1
## 7 6010 97592.63 110476.975 0 0 var1
## 8 6996 144845.61 123661.014 0 0 var1
## 9 11649 204160.38 129492.937 0 0 var1
## 10 10504 275857.80 150605.694 0 0 var1
## 11 8883 347876.03 163634.401 0 0 var1
## 12 10027 432324.39 177899.128 0 0 var1
##
## Fitted variogram model:
## model psill range kappa
## 1 Nug 0.0 0.0 0.0
## 2 Ste 150642.5 104783.9 0.6
## Sums of squares betw. var. model and sample var.[1] 132.477
plot(CAVarAuto)
You can also quickly get cross-validation:
CAAutoKrigeLOOCV <- autoKrige.cv(formula = ANNUAL~1, input_data = prcpCAsf,
verbose = c(FALSE,FALSE))
summary(CAAutoKrigeLOOCV)
## [,1]
## mean_error -1.153
## me_mean -0.001881
## MAE 92.21
## MSE 18840
## MSNE 0.7799
## cor_obspred 0.9502
## cor_predres 0.08809
## RMSE 137.3
## RMSE_sd 0.3124
## URMSE 137.3
## iqr 108.5
cor(CAAutoKrigeLOOCV$krige.cv_output$observed, CAAutoKrigeLOOCV$krige.cv_output$var1.pred)^2
## [1] 0.902961
Here we get a LOOCV \(R^2\) of 0.9030 which is very (very) close to what we got with the cross validation we did manually above.