library(sp)
library(rgdal)
## rgdal: version: 1.1-3, (SVN revision 594)
## Geospatial Data Abstraction Library extensions to R successfully loaded
## Loaded GDAL runtime: GDAL 2.0.1, released 2015/09/15
## Path to GDAL shared files: C:/Program Files/R/R-3.2.3/library/rgdal/gdal
## GDAL does not use iconv for recoding strings.
## Loaded PROJ.4 runtime: Rel. 4.9.1, 04 March 2015, [PJ_VERSION: 491]
## Path to PROJ.4 shared files: C:/Program Files/R/R-3.2.3/library/rgdal/proj
## Linking to sp version: 1.2-2
require(gstat)
## Loading required package: gstat
#install.packages("rgdal")
#install.packages("gstat")
setwd("C:/Users/ywang/Dropbox/R/Krigging/CapeRomanzof")
#import data, the columns of data are SampleID,X,Y,PCB
data<-read.csv("LF003_SelectedArea.csv")
head(data)
## Name Depth X Y PCB Qualifier
## 1 SD-013 0-0.4 1643734 2846956 40
## 2 SD-014 0-0.4 1643725 2846957 92
## 3 SD-015 0-0.4 1643705 2846955 230
## 4 SD-016 0-0.4 1643656 2846945 190
## 5 SD-017 0-0.4 1643613 2846936 58
## 6 SD-018 0-0.4 1643557 2846902 110
#generate histogram of data
hist(data$PCB, breaks = 10, xlab="PCB (mg/kg)", main="Histogram of PCB")
summary(data$PCB)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0052 0.0150 0.0955 20.8900 17.7500 230.0000
#the histogram shows the data is highly skewed, using log-transformed value is recommended.
data$PCB.l <- log10(data$PCB)
hist(data$PCB.l, breaks = 10,xlab="Log10 PCB (mg/kg)",main="Histogram of Log-transformed PCB")
summary(data$PCB.l)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.2840 -1.8240 -1.0250 -0.4077 1.2480 2.3620
#convert the dataframe to have geological feature which can be plotted.
coordinates(data) <- c("X", "Y")
#assign a coordinate system.
proj4string(data) <- CRS("+init=epsg:26938")
#NAD83 Alaska State Plane Zone8
#http://www.spatialreference.org/ref/epsg/26938/
#projection(data)<-CRS("+init=epsg:4326")
#data.UTM <- spTransform(data, CRS("+init=epsg:4326"))
#writeOGR(data.ll, "data.kml", "data", "KML")
#show data point
plot(data,asp=1,pch=1)
#plot raw data, the size of the dot corresponds to the concentration
plot(data,asp=1,cex=4*data$PCB/max(data$PCB),pch = 1)
Ordinary Kriging for Log-transformed Data
#the fundamental concept is spatial auto-correlation: a variable can be correlated to itself, with the strength of correlation depending on seperation distance (and possibly direction); this should be evident as a relation between seperation distance and correlation (also expresssed as semi-variance);
#each pair of observation has a semivariance,defined as gamma(xi,xj)=1/2[z(xi)-z(xj)]^2, where x is location, z is concentration.
#compute number of point-pairs
n<-length(data$PCB.l)
n*(n-1)/2
## [1] 946
#how the semivariances related to the seperation distance: using empirical variogram defined as average semivariance within some seperation ranges (distances)
#get variogram (semivariance vs distance) of log-tranformed data
v<-variogram(PCB.l~1,data)
# the 1 here represents the spatial mean of the dependent variable - that is PCB.l is only depednent on itself.
print(plot(v,plot.numbers=T))
#plot.numbers show how many point-pairs in each bin (distance interval); more point-pairs are more reliable. The bin interval can be specified using width in the variogram function. If there are not enough points (at least 100 to 150) with a good distribution of distance intervals, it is not possible to model a variogram,and kriging should not be used.
#fitting the curve using vgm function for different models
#sph model
vm<-vgm(psill=1.5, model="Sph", range=40, nugget=1)
print(plot(v,pl=T,model=vm))
#range means at what distance between point-pairs that there is no more spatial dependence. Nugget means the semivariance at zero seperation. Total sill means the semivariance at the range and beyond. kappa is smoothness parameter for the Matern class of variogram models
#there are many fit models to choose from, using vgm() or print(show.vgms()) to view the avaiable models. Selecting a model form based on knowledge of assumed spatial process. A common model for soil attributes is the spherical model.
#ajusting the fit
vmf<-fit.variogram(v,vm)
print(plot(v,pl=T,model=vmf))
vmf
## model psill range
## 1 Nug 1.040550 0.00000
## 2 Sph 1.773307 58.35068
#the range is 58.35; the total sill is 1.04+1.77
#create a grid matrix with new locations
x1<-min(data$X)
x2<-max(data$X)
y1<-min(data$Y)
y2<-max(data$Y)
r=1
mat<-matrix(0,nrow=((x2-x1)/1+1)*((y2-y1)/1+1),ncol=2)
for (i in seq(x1,x2,by=1))
{for (j in seq(y1,y2,by=1))
{mat[r,1]<-i
mat[r,2]<-j
r=r+1}}
#write.csv(mat,file="grid.csv")
#import grid and ordinary kriging
#predicts at any point as the weighted average of the values at sampled points. Weights given to each sample point are optimal, given the spatial covariance structure as revealed by the variogram model. The kriging variance at each point is automatically generated as part of the process of computing weights.
setwd("C:/Users/ywang/Dropbox/R/Krigging/CapeRomanzof")
data.grid<-read.csv("grid.csv")
coordinates(data.grid) <- c("x", "y")
proj4string(data.grid) <- CRS("+init=epsg:26938")
gridded(data.grid)<-T
k<-krige(PCB.l~1,locations=data,newdata=data.grid,model=vmf)
## [using ordinary kriging]
write.csv(k,file="OK_sph.csv")
#display maps of concentrations
spplot(k,"var1.pred",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction by Sph Model, log-ppm PCB")
summary(k$var1.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.2020 -1.2250 -0.9074 -0.7726 -0.3089 1.3840
#display maps of uncertainty
spplot(k,"var1.var",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction Variance by Sph Model, log-ppm PCB2")
#save.image(file="OK.RData")
#Matern model
mtn<-fit.variogram(v,vgm(psill=1.5, model="Mat", range=40, nugget=1, kappa = 0.7))
print(plot(v,pl=T,model=mtn))
m<-krige(PCB.l~1,locations=data,newdata=data.grid,model=mtn)
## [using ordinary kriging]
write.csv(m,file="OK_Mat.csv")
#display maps of concentrations
spplot(m,"var1.pred",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction by Matern Model, log-ppm PCB")
summary(m$var1.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.0310 -1.2700 -0.9068 -0.7651 -0.2523 1.3600
#display maps of uncertainty
spplot(m,"var1.var",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction Variance by Matern Model, log-ppm PCB2")
#exponential model
exp<-fit.variogram(v,vgm(psill=1.5, model="Exp", range=40, nugget=1))
print(plot(v,pl=T,model=exp))
e<-krige(PCB.l~1,locations=data,newdata=data.grid,model=exp)
## [using ordinary kriging]
write.csv(e,file="OK_Exp.csv")
#display maps of concentrations
spplot(e,"var1.pred",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction by Exp Model, log-ppm PCB")
summary(e$var1.pred)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.0320 -1.2700 -0.8927 -0.7632 -0.2428 1.4110
#display maps of uncertainty
spplot(e,"var1.var",asp=1,col.regions=bpy.colors(64),xlim=c(x1,x2),ylim=c(y1,y2),main="Ordinary Kriging Prediction Variance by Exp Model, log-ppm PCB2")
Leave-One-Out Cross-Validation
#cross-validation for ordinary kriging using log-transformed data
#sph model
kcv.ok<-krige.cv(PCB.l~1,locations=data,model=vmf)
summary(kcv.ok)
## Object of class SpatialPointsDataFrame
## Coordinates:
## min max
## X 1643510 1643766
## Y 2846858 2846998
## Is projected: NA
## proj4string : [NA]
## Number of points: 44
## Data attributes:
## var1.pred var1.var observed residual
## Min. :-1.9496 Min. :1.449 Min. :-2.2840 Min. :-2.28639
## 1st Qu.:-0.8828 1st Qu.:1.806 1st Qu.:-1.8239 1st Qu.:-1.20492
## Median :-0.3410 Median :2.077 Median :-1.0251 Median :-0.28841
## Mean :-0.3440 Mean :2.105 Mean :-0.4077 Mean :-0.06368
## 3rd Qu.: 0.1453 3rd Qu.:2.368 3rd Qu.: 1.2481 3rd Qu.: 0.92417
## Max. : 1.1674 Max. :2.891 Max. : 2.3617 Max. : 2.96305
## zscore fold
## Min. :-1.84391 Min. : 1.00
## 1st Qu.:-0.77479 1st Qu.:11.75
## Median :-0.18823 Median :22.50
## Mean :-0.02122 Mean :22.50
## 3rd Qu.: 0.67341 3rd Qu.:33.25
## Max. : 1.96945 Max. :44.00
#look for field of prediction (var1.pred) vs.field observation (observed), and their difference (residual). A positive residual is an under-prediction (predicted less than observed);
#compare the mean of residual to zero to evaluate whether the prediction is biased.
summary(kcv.ok$residual)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.28600 -1.20500 -0.28840 -0.06368 0.92420 2.96300
#using root of the mean squared error (RMSE) to compare models and evaluate which model is the best; the lower RMSE, the better of the model.
sqrt_sph<-sqrt(sum(kcv.ok$residual^2)/length(kcv.ok$residual))
sqrt_sph
## [1] 1.395828
#RMSE is in the unit of log(mg/kg)
#plot redidual plot
par(mfrow=c(3,1),mar=c(5,5,2,1))
p1<-bubble(kcv.ok,zcol="residual",main="Cross Validation Residual LogPCB by Sph Model",pch=1)
plot(p1)
#cross validation for the other models
#Matern Model
mtn.ok<-krige.cv(PCB.l~1,locations=data,model=mtn)
summary(mtn.ok)
## Object of class SpatialPointsDataFrame
## Coordinates:
## min max
## X 1643510 1643766
## Y 2846858 2846998
## Is projected: NA
## proj4string : [NA]
## Number of points: 44
## Data attributes:
## var1.pred var1.var observed residual
## Min. :-1.8471 Min. :1.433 Min. :-2.2840 Min. :-2.31456
## 1st Qu.:-0.8330 1st Qu.:1.836 1st Qu.:-1.8239 1st Qu.:-1.17714
## Median :-0.3783 Median :2.114 Median :-1.0251 Median :-0.32902
## Mean :-0.3415 Mean :2.141 Mean :-0.4077 Mean :-0.06618
## 3rd Qu.: 0.1645 3rd Qu.:2.407 3rd Qu.: 1.2481 3rd Qu.: 0.98438
## Max. : 1.1506 Max. :2.954 Max. : 2.3617 Max. : 2.89342
## zscore fold
## Min. :-1.86051 Min. : 1.00
## 1st Qu.:-0.72365 1st Qu.:11.75
## Median :-0.21281 Median :22.50
## Mean :-0.02197 Mean :22.50
## 3rd Qu.: 0.70842 3rd Qu.:33.25
## Max. : 1.90915 Max. :44.00
sqrt_mat<-sqrt(sum(mtn.ok$residual^2)/length(mtn.ok$residual))
sqrt_mat
## [1] 1.393451
#plot redidual plot
par(mfrow=c(3,1),mar=c(5,5,2,1))
p2<-bubble(mtn.ok,zcol="residual",main="Cross Validation Residual LogPCB by Matern Model",pch=1)
plot(p2)
#Exponential Model
exp.ok<-krige.cv(PCB.l~1,locations=data,model=exp)
summary(exp.ok)
## Object of class SpatialPointsDataFrame
## Coordinates:
## min max
## X 1643510 1643766
## Y 2846858 2846998
## Is projected: NA
## proj4string : [NA]
## Number of points: 44
## Data attributes:
## var1.pred var1.var observed residual
## Min. :-1.8147 Min. :1.403 Min. :-2.2840 Min. :-2.33566
## 1st Qu.:-0.8214 1st Qu.:1.839 1st Qu.:-1.8239 1st Qu.:-1.17088
## Median :-0.3713 Median :2.125 Median :-1.0251 Median :-0.34258
## Mean :-0.3411 Mean :2.147 Mean :-0.4077 Mean :-0.06655
## 3rd Qu.: 0.1809 3rd Qu.:2.407 3rd Qu.: 1.2481 3rd Qu.: 1.00351
## Max. : 1.1575 Max. :2.965 Max. : 2.3617 Max. : 2.88990
## zscore fold
## Min. :-1.86893 Min. : 1.00
## 1st Qu.:-0.72643 1st Qu.:11.75
## Median :-0.22146 Median :22.50
## Mean :-0.02211 Mean :22.50
## 3rd Qu.: 0.71627 3rd Qu.:33.25
## Max. : 1.95683 Max. :44.00
sqrt_exp<-sqrt(sum(exp.ok$residual^2)/length(exp.ok$residual))
sqrt_exp
## [1] 1.396971
p3<-bubble(exp.ok,zcol="residual",main="Cross Validation Residual LogPCB by Exponential Model",pch=1)
plot(p3)
#select model with the lowest sqrt value
c(sqrt_sph,sqrt_mat,sqrt_exp)
## [1] 1.395828 1.393451 1.396971