Kriging

library(sp)
library(rgdal)

## rgdal: version: 1.1-3, (SVN revision 594)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.0.1, released 2015/09/15
##  Path to GDAL shared files: C:/Program Files/R/R-3.2.3/library/rgdal/gdal
##  GDAL does not use iconv for recoding strings.
##  Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
##  Path to PROJ.4 shared files: C:/Program Files/R/R-3.2.3/library/rgdal/proj
##  Linking to sp version: 1.2-2

require(gstat)

## Loading required package: gstat

#install.packages("rgdal")
#install.packages("gstat")
setwd("C:/Users/ywang/Dropbox/R/Krigging/CapeRomanzof")

#import data, the columns of data are SampleID,X,Y,PCB
data<-read.csv("LF003_SelectedArea.csv")
head(data)

##     Name Depth       X       Y PCB Qualifier
## 1 SD-013 0-0.4 1643734 2846956  40          
## 2 SD-014 0-0.4 1643725 2846957  92          
## 3 SD-015 0-0.4 1643705 2846955 230          
## 4 SD-016 0-0.4 1643656 2846945 190          
## 5 SD-017 0-0.4 1643613 2846936  58          
## 6 SD-018 0-0.4 1643557 2846902 110

#generate histogram of data
hist(data$PCB, breaks = 10, xlab="PCB (mg/kg)", main="Histogram of PCB")

summary(data$PCB)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0052   0.0150   0.0955  20.8900  17.7500 230.0000

#the histogram shows the data is highly skewed, using log-transformed value is recommended.
data$PCB.l <- log10(data$PCB)
hist(data$PCB.l, breaks = 10,xlab="Log10 PCB (mg/kg)",main="Histogram of Log-transformed PCB")

summary(data$PCB.l)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.2840 -1.8240 -1.0250 -0.4077  1.2480  2.3620

#convert the dataframe to have geological feature which can be plotted.
coordinates(data) <- c("X", "Y")
#assign a coordinate system.
proj4string(data) <- CRS("+init=epsg:26938")
#NAD83 Alaska State Plane Zone8
#http://www.spatialreference.org/ref/epsg/26938/
#projection(data)<-CRS("+init=epsg:4326")
#data.UTM <- spTransform(data, CRS("+init=epsg:4326"))
#writeOGR(data.ll, "data.kml", "data", "KML")
#show data point
plot(data,asp=1,pch=1)

#plot raw data, the size of the dot corresponds to the concentration
plot(data,asp=1,cex=4*data$PCB/max(data$PCB),pch = 1)

Ordinary Kriging for Log-transformed Data

#the fundamental concept is spatial auto-correlation: a variable can be correlated to itself, with the strength of correlation depending on seperation distance (and possibly direction); this should be evident as a relation between seperation distance and correlation (also expresssed as semi-variance);
#each pair of observation has a semivariance,defined as gamma(xi,xj)=1/2[z(xi)-z(xj)]^2, where x is location, z is concentration.
#compute number of point-pairs
n<-length(data$PCB.l)
n*(n-1)/2

## [1] 946

#how the semivariances related to the seperation distance: using empirical variogram defined as average semivariance within some seperation ranges (distances)
#get variogram (semivariance vs distance) of log-tranformed data
v<-variogram(PCB.l~1,data) 
# the 1 here represents the spatial mean of the dependent variable - that is PCB.l is only depednent on itself.
print(plot(v,plot.numbers=T))

#plot.numbers show how many point-pairs in each bin (distance interval); more point-pairs are more reliable. The bin interval can be specified using width in the variogram function. If there are not enough points (at least 100 to 150) with a good distribution of distance intervals, it is not possible to model a variogram,and kriging should not be used.
#fitting the curve using vgm function for different models
#sph model
vm<-vgm(psill=1.5, model="Sph", range=40, nugget=1)
print(plot(v,pl=T,model=vm))

#range means at what distance between point-pairs that there is no more spatial dependence. Nugget means the semivariance at zero seperation. Total sill means the semivariance at the range and beyond. kappa is smoothness parameter for the Matern class of variogram models
#there are many fit models to choose from, using vgm() or print(show.vgms()) to view the avaiable models. Selecting a model form based on knowledge of assumed spatial process. A common model for soil attributes is the spherical model.
#ajusting the fit
vmf<-fit.variogram(v,vm)
print(plot(v,pl=T,model=vmf))

vmf

##   model    psill    range
## 1   Nug 1.040550  0.00000
## 2   Sph 1.773307 58.35068

#the range is 58.35; the total sill is 1.04+1.77

#create a grid matrix with new locations
x1<-min(data$X)
x2<-max(data$X)
y1<-min(data$Y)
y2<-max(data$Y)
r=1
mat<-matrix(0,nrow=((x2-x1)/1+1)*((y2-y1)/1+1),ncol=2)
for (i in seq(x1,x2,by=1))
{for (j in seq(y1,y2,by=1))
{mat[r,1]<-i
mat[r,2]<-j
r=r+1}}
#write.csv(mat,file="grid.csv")


#import grid and ordinary kriging 
#predicts at any point as the weighted average of the values at sampled points. Weights given to each sample point are optimal, given the spatial covariance structure as revealed by the variogram model. The kriging variance at each point is automatically generated as part of the process of computing weights.
setwd("C:/Users/ywang/Dropbox/R/Krigging/CapeRomanzof")
data.grid<-read.csv("grid.csv")
coordinates(data.grid) <- c("x", "y")
proj4string(data.grid) <- CRS("+init=epsg:26938")
gridded(data.grid)<-T
k<-krige(PCB.l~1,locations=data,newdata=data.grid,model=vmf)

## [using ordinary kriging]

write.csv(k,file="OK_sph.csv")
#display maps of concentrations
spplot(k,"var1.pred",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction by Sph Model, log-ppm PCB")

summary(k$var1.pred)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.2020 -1.2250 -0.9074 -0.7726 -0.3089  1.3840

#display maps of uncertainty
spplot(k,"var1.var",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction Variance by Sph Model, log-ppm PCB2")

#save.image(file="OK.RData")

#Matern model
mtn<-fit.variogram(v,vgm(psill=1.5, model="Mat", range=40, nugget=1, kappa = 0.7))
print(plot(v,pl=T,model=mtn))

m<-krige(PCB.l~1,locations=data,newdata=data.grid,model=mtn)

## [using ordinary kriging]

write.csv(m,file="OK_Mat.csv")
#display maps of concentrations
spplot(m,"var1.pred",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction by Matern Model, log-ppm PCB")

summary(m$var1.pred)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.0310 -1.2700 -0.9068 -0.7651 -0.2523  1.3600

#display maps of uncertainty
spplot(m,"var1.var",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction Variance by Matern Model, log-ppm PCB2")

#exponential model
exp<-fit.variogram(v,vgm(psill=1.5, model="Exp", range=40, nugget=1))
print(plot(v,pl=T,model=exp))

e<-krige(PCB.l~1,locations=data,newdata=data.grid,model=exp)

## [using ordinary kriging]

write.csv(e,file="OK_Exp.csv")
#display maps of concentrations
spplot(e,"var1.pred",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction by Exp Model, log-ppm PCB")

summary(e$var1.pred)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.0320 -1.2700 -0.8927 -0.7632 -0.2428  1.4110

#display maps of uncertainty
spplot(e,"var1.var",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction Variance by Exp Model, log-ppm PCB2")

Leave-One-Out Cross-Validation

#cross-validation for ordinary kriging using log-transformed data
#sph model
kcv.ok<-krige.cv(PCB.l~1,locations=data,model=vmf)
summary(kcv.ok)

## Object of class SpatialPointsDataFrame
## Coordinates:
##       min     max
## X 1643510 1643766
## Y 2846858 2846998
## Is projected: NA 
## proj4string : [NA]
## Number of points: 44
## Data attributes:
##    var1.pred          var1.var        observed          residual       
##  Min.   :-1.9496   Min.   :1.449   Min.   :-2.2840   Min.   :-2.28639  
##  1st Qu.:-0.8828   1st Qu.:1.806   1st Qu.:-1.8239   1st Qu.:-1.20492  
##  Median :-0.3410   Median :2.077   Median :-1.0251   Median :-0.28841  
##  Mean   :-0.3440   Mean   :2.105   Mean   :-0.4077   Mean   :-0.06368  
##  3rd Qu.: 0.1453   3rd Qu.:2.368   3rd Qu.: 1.2481   3rd Qu.: 0.92417  
##  Max.   : 1.1674   Max.   :2.891   Max.   : 2.3617   Max.   : 2.96305  
##      zscore              fold      
##  Min.   :-1.84391   Min.   : 1.00  
##  1st Qu.:-0.77479   1st Qu.:11.75  
##  Median :-0.18823   Median :22.50  
##  Mean   :-0.02122   Mean   :22.50  
##  3rd Qu.: 0.67341   3rd Qu.:33.25  
##  Max.   : 1.96945   Max.   :44.00

#look for field of prediction (var1.pred) vs.field observation (observed), and their difference (residual). A positive residual is an under-prediction (predicted less than observed); 
#compare the mean of residual to zero to evaluate whether the prediction is biased. 
summary(kcv.ok$residual)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -2.28600 -1.20500 -0.28840 -0.06368  0.92420  2.96300

#using root of the mean squared error (RMSE) to compare models and evaluate which model is the best; the lower RMSE, the better of the model. 
sqrt_sph<-sqrt(sum(kcv.ok$residual^2)/length(kcv.ok$residual)) 
sqrt_sph

## [1] 1.395828

#RMSE is in the unit of log(mg/kg)
#plot redidual plot
par(mfrow=c(3,1),mar=c(5,5,2,1))
p1<-bubble(kcv.ok,zcol="residual",main="Cross Validation Residual LogPCB by Sph Model",pch=1)
plot(p1)

#cross validation for the other models
#Matern Model
mtn.ok<-krige.cv(PCB.l~1,locations=data,model=mtn)
summary(mtn.ok)

## Object of class SpatialPointsDataFrame
## Coordinates:
##       min     max
## X 1643510 1643766
## Y 2846858 2846998
## Is projected: NA 
## proj4string : [NA]
## Number of points: 44
## Data attributes:
##    var1.pred          var1.var        observed          residual       
##  Min.   :-1.8471   Min.   :1.433   Min.   :-2.2840   Min.   :-2.31456  
##  1st Qu.:-0.8330   1st Qu.:1.836   1st Qu.:-1.8239   1st Qu.:-1.17714  
##  Median :-0.3783   Median :2.114   Median :-1.0251   Median :-0.32902  
##  Mean   :-0.3415   Mean   :2.141   Mean   :-0.4077   Mean   :-0.06618  
##  3rd Qu.: 0.1645   3rd Qu.:2.407   3rd Qu.: 1.2481   3rd Qu.: 0.98438  
##  Max.   : 1.1506   Max.   :2.954   Max.   : 2.3617   Max.   : 2.89342  
##      zscore              fold      
##  Min.   :-1.86051   Min.   : 1.00  
##  1st Qu.:-0.72365   1st Qu.:11.75  
##  Median :-0.21281   Median :22.50  
##  Mean   :-0.02197   Mean   :22.50  
##  3rd Qu.: 0.70842   3rd Qu.:33.25  
##  Max.   : 1.90915   Max.   :44.00

sqrt_mat<-sqrt(sum(mtn.ok$residual^2)/length(mtn.ok$residual)) 
sqrt_mat

## [1] 1.393451

#plot redidual plot
par(mfrow=c(3,1),mar=c(5,5,2,1))
p2<-bubble(mtn.ok,zcol="residual",main="Cross Validation Residual LogPCB by Matern Model",pch=1)
plot(p2)

#Exponential Model
exp.ok<-krige.cv(PCB.l~1,locations=data,model=exp)
summary(exp.ok)

## Object of class SpatialPointsDataFrame
## Coordinates:
##       min     max
## X 1643510 1643766
## Y 2846858 2846998
## Is projected: NA 
## proj4string : [NA]
## Number of points: 44
## Data attributes:
##    var1.pred          var1.var        observed          residual       
##  Min.   :-1.8147   Min.   :1.403   Min.   :-2.2840   Min.   :-2.33566  
##  1st Qu.:-0.8214   1st Qu.:1.839   1st Qu.:-1.8239   1st Qu.:-1.17088  
##  Median :-0.3713   Median :2.125   Median :-1.0251   Median :-0.34258  
##  Mean   :-0.3411   Mean   :2.147   Mean   :-0.4077   Mean   :-0.06655  
##  3rd Qu.: 0.1809   3rd Qu.:2.407   3rd Qu.: 1.2481   3rd Qu.: 1.00351  
##  Max.   : 1.1575   Max.   :2.965   Max.   : 2.3617   Max.   : 2.88990  
##      zscore              fold      
##  Min.   :-1.86893   Min.   : 1.00  
##  1st Qu.:-0.72643   1st Qu.:11.75  
##  Median :-0.22146   Median :22.50  
##  Mean   :-0.02211   Mean   :22.50  
##  3rd Qu.: 0.71627   3rd Qu.:33.25  
##  Max.   : 1.95683   Max.   :44.00

sqrt_exp<-sqrt(sum(exp.ok$residual^2)/length(exp.ok$residual)) 
sqrt_exp

## [1] 1.396971

p3<-bubble(exp.ok,zcol="residual",main="Cross Validation Residual LogPCB by Exponential Model",pch=1)
plot(p3)

#select model with the lowest sqrt value
c(sqrt_sph,sqrt_mat,sqrt_exp)

## [1] 1.395828 1.393451 1.396971