Section 1: Importing Covariates

# empty memory and workspace
gc()
rm(list=ls())
# check directory
getwd()
### to install all required R packages
ls <- c("sp", "raster", "rgdal", "mapview", "corrplot",
        "caret", "rpart", "rpart.plot", "randomForest")
new.packages <- ls[!(ls %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# load library
library(sp)
library(raster)
library(rgdal)
library(mapview)
library(corrplot)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
# import raster layers of covariates (Remote Sensing data)
blue = raster("./RS/Blue.tif")
green = raster("./RS/Green.tif")
red = raster("./RS/Red.tif")
nir = raster("./RS/Nir.tif")
# calculate some Remote Sensing indices such as NDVI
ndvi = (nir - red) / (nir + red)
# plot raster layers of covariates (Remote Sensing data)
plot(blue, main = "blue band of landsat", xlab = "Easting (m)", ylab = "Northing (m)")
plot(red, main = "red band of landsat", xlab = "Easting (m)", ylab = "Northing (m)")
plot(ndvi, main = "NDVI of landsat", xlab = "Easting (m)", ylab = "Northing (m)")
# make a stack layer from Remote Sensing data
landsat = stack(blue,green,red,nir,ndvi)
# inspect the structure of stack layer
landsat
## class      : RasterStack 
## dimensions : 733, 716, 524828, 5  (nrow, ncol, ncell, nlayers)
## resolution : 500, 500  (x, y)
## extent     : 498000, 856000, 5235500, 5602000  (xmin, xmax, ymin, ymax)
## crs        : +proj=utm +zone=32 +datum=WGS84 +units=m +no_defs 
## names      :      layer.1,      layer.2,      layer.3,      layer.4,      layer.5 
## min values :   47.5000000,   92.0000000,   19.5000000,    6.0000000,   -0.6666667 
## max values : 9.383500e+03, 1.003950e+04, 1.027300e+04, 8.635500e+03, 8.870056e-01
# set the names to the rasters in the stack file
names(landsat) <- c("blue","green","red","nir","ndvi")
# plot the stack layers of Remote Sensing data
plot(landsat)

# import the terrain data using list.files command
dem_lst <- list.files("./Terrain/", pattern="\\.sdat$", full.names = TRUE)
# inspect the structure of list
dem_lst
## [1] "./Terrain/CNBL.sdat"  "./Terrain/DEM.sdat"   "./Terrain/MCA.sdat"  
## [4] "./Terrain/Slope.sdat" "./Terrain/TWI.sdat"
# make a stack layer from terrain data 
terrain = stack(dem_lst)
# plot the stack layers of terrain data
plot(terrain)

# Re-sampling Remote Sensing data based on terrain data
RS_terrain = resample(landsat,terrain, method="bilinear")
# make a stack file from re-sampled remote sensing data and terrain data
covariates_stack = stack(terrain, RS_terrain)
# inspect the structure of stack layer
names(covariates_stack)
##  [1] "CNBL"  "DEM"   "MCA"   "Slope" "TWI"   "blue"  "green" "red"   "nir"  
## [10] "ndvi"

Section 2: Importing Soil Data

# import shape file of point data
point = readOGR("./Shapefile","points")
# import shape file of Bavaria boundary
Bavaria = readOGR("./Shapefile","Bavaria")
# plots of the shape files
plot(Bavaria)
plot(point, add=T)
# plot an interactive map
mapview::mapview(point, zcol = "OC", at = c(0,5,10,15,20,25,30,35,200), legend = TRUE)
# plot the point on the raster
plot(covariates_stack$DEM, main = "DEM", xlab = "Easting (m)", ylab = "Northing (m)")
plot(point, add =T,pch = 19)
plot(Bavaria, add =T)

# inspect the point shape files
# types of the spatial data
class(point)
## [1] "SpatialPointsDataFrame"
## attr(,"package")
## [1] "sp"
# summary statistics of data
summary(point)
## Object of class SpatialPointsDataFrame
## Coordinates:
##                 min       max
## coords.x1  508615.3  848381.1
## coords.x2 5271503.2 5591741.8
## Is projected: TRUE 
## proj4string :
## [+proj=utm +zone=32 +datum=WGS84 +units=m +no_defs]
## Number of points: 346
## Data attributes:
##        x                y                 pH              OC        
##  Min.   :508615   Min.   :5271503   Min.   :3.810   Min.   :  6.00  
##  1st Qu.:617164   1st Qu.:5352240   1st Qu.:6.162   1st Qu.: 14.50  
##  Median :672285   Median :5406629   Median :6.850   Median : 21.00  
##  Mean   :676277   Mean   :5416121   Mean   :6.738   Mean   : 28.35  
##  3rd Qu.:733674   3rd Qu.:5485267   3rd Qu.:7.380   3rd Qu.: 33.05  
##  Max.   :848381   Max.   :5591742   Max.   :8.140   Max.   :251.30
# histogram of data
hist(point$OC,col ="blue", xlab= "SOC (g/kg)", main="Histogram")

Section 3: Making Regression Matrix

# extract covariate values at each point of observation 
cov = extract(covariates_stack,point,method='bilinear', df=TRUE)
# inspect the data.frame of cov
str(cov)
## 'data.frame':    346 obs. of  11 variables:
##  $ ID   : num  1 2 3 4 5 6 7 8 9 10 ...
##  $ CNBL : num  398 431 587 408 535 ...
##  $ DEM  : num  426 493 600 474 544 ...
##  $ MCA  : num  6744760 14292214 4667933 5777121 14213043 ...
##  $ Slope: num  0.0465 0.0149 0.026 0.0228 0.016 ...
##  $ TWI  : num  10.5 11.1 13.2 11.3 12 ...
##  $ blue : num  236 401 374 375 411 ...
##  $ green: num  403 698 599 670 680 ...
##  $ red  : num  273 680 537 586 544 ...
##  $ nir  : num  2561 3008 2808 3300 3390 ...
##  $ ndvi : num  0.811 0.632 0.663 0.699 0.725 ...
# combining covariates and soil properties
cov_soil = cbind(cov[,-1], OC=point$OC)
# inspect the data.frame of cov_soil
str(cov_soil)
## 'data.frame':    346 obs. of  11 variables:
##  $ CNBL : num  398 431 587 408 535 ...
##  $ DEM  : num  426 493 600 474 544 ...
##  $ MCA  : num  6744760 14292214 4667933 5777121 14213043 ...
##  $ Slope: num  0.0465 0.0149 0.026 0.0228 0.016 ...
##  $ TWI  : num  10.5 11.1 13.2 11.3 12 ...
##  $ blue : num  236 401 374 375 411 ...
##  $ green: num  403 698 599 670 680 ...
##  $ red  : num  273 680 537 586 544 ...
##  $ nir  : num  2561 3008 2808 3300 3390 ...
##  $ ndvi : num  0.811 0.632 0.663 0.699 0.725 ...
##  $ OC   : num  17.8 18 27.7 12.5 31.5 ...
# check the correlation covariates and OC
corrplot.mixed(cor(cov_soil), lower.col = "black", number.cex = .7)

Section 4: Regression Model

# remove na values
cov_soil <- cov_soil[complete.cases(cov_soil),]
# remove high values of oc
cov_soil <- cov_soil[cov_soil$OC<78,]
# check number of rows
nrow(cov_soil)
# check number of column 
ncol(cov_soil)
# split the data to training (70%) and testing (30%) sets
trainIndex <- createDataPartition(cov_soil$OC, 
                                  p = 0.7, list = FALSE, times = 1)
# subset the datasets
cov_soil_Train <- cov_soil[ trainIndex,]
cov_soil_Test  <- cov_soil[-trainIndex,]
# inspect the two datasets
str(cov_soil_Train)
## 'data.frame':    237 obs. of  11 variables:
##  $ CNBL : num  398 431 587 408 414 ...
##  $ DEM  : num  426 493 600 474 466 ...
##  $ MCA  : num  6744760 14292214 4667933 5777121 18620790 ...
##  $ Slope: num  0.04652 0.01492 0.02598 0.02276 0.00466 ...
##  $ TWI  : num  10.5 11.1 13.2 11.3 13 ...
##  $ blue : num  236 401 374 375 472 ...
##  $ green: num  403 698 599 670 810 ...
##  $ red  : num  273 680 537 586 783 ...
##  $ nir  : num  2561 3008 2808 3300 3573 ...
##  $ ndvi : num  0.811 0.632 0.663 0.699 0.64 ...
##  $ OC   : num  17.8 18 27.7 12.5 12.4 36.1 19.1 15.2 17.4 20.2 ...
str(cov_soil_Test)  
## 'data.frame':    99 obs. of  11 variables:
##  $ CNBL : num  535 403 524 522 467 ...
##  $ DEM  : num  544 408 520 536 463 ...
##  $ MCA  : num  14213043 22964809 14094020 9650510 16280441 ...
##  $ Slope: num  0.01596 0.00203 0.00305 0.01002 0.00766 ...
##  $ TWI  : num  12 14.1 17.6 11.8 12.4 ...
##  $ blue : num  411 489 366 387 423 ...
##  $ green: num  680 806 643 650 671 ...
##  $ red  : num  544 797 476 523 560 ...
##  $ nir  : num  3390 3300 3717 3716 3284 ...
##  $ ndvi : num  0.725 0.611 0.773 0.752 0.706 ...
##  $ OC   : num  31.5 17 72.8 30.7 38 40.4 15.8 41.1 11.2 28.3 ...
# fit models on training set: regression model
linear_fit <- lm(OC ~ CNBL+DEM+MCA+Slope+TWI+blue+green+red+nir+ndvi, 
                 data=cov_soil_Train)
# look at the summary of linear model
summary(linear_fit)
## 
## Call:
## lm(formula = OC ~ CNBL + DEM + MCA + Slope + TWI + blue + green + 
##     red + nir + ndvi, data = cov_soil_Train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -21.342  -7.693  -2.049   5.490  49.928 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.711e+02  5.801e+01   2.949  0.00352 **
## CNBL        -1.842e-02  2.057e-02  -0.896  0.37141   
## DEM          4.068e-02  1.915e-02   2.124  0.03472 * 
## MCA          2.479e-07  8.585e-08   2.888  0.00425 **
## Slope       -4.913e+01  4.789e+01  -1.026  0.30605   
## TWI          9.887e-01  4.612e-01   2.144  0.03311 * 
## blue         9.189e-02  4.795e-02   1.916  0.05659 . 
## green       -6.998e-02  6.055e-02  -1.156  0.24907   
## red         -1.285e-01  3.969e-02  -3.236  0.00139 **
## nir          2.624e-02  8.448e-03   3.106  0.00214 **
## ndvi        -2.396e+02  8.114e+01  -2.953  0.00348 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 11.69 on 226 degrees of freedom
## Multiple R-squared:  0.3947, Adjusted R-squared:  0.368 
## F-statistic: 14.74 on 10 and 226 DF,  p-value: < 2.2e-16
# apply the linear model on testing data
OC_linear_Pred <- predict(linear_fit, cov_soil_Test)  
# check the plot actual and predicted OC values
plot(cov_soil_Test$OC, OC_linear_Pred, main="Linear model", 
     col="blue",xlab="Actual OC", ylab="Predicted OC", 
     xlim=c(0,100),ylim=c(0,100))
abline(coef = c(0,1),  col="red" )

# calculate correlation
cor_linear <- cor(cov_soil_Test$OC, OC_linear_Pred)
cor_linear
## [1] 0.6386474
# calculate RMSE
RMSE_linear <- sqrt(mean((cov_soil_Test$OC - OC_linear_Pred)^2))
RMSE_linear
## [1] 10.88015

Section 5: Decision Tree Model

# fit models on training set: decision tree model
tree_fit <- rpart(OC ~ CNBL+DEM+MCA+Slope+TWI+blue+green+red+nir+ndvi,
                  method="anova", data=cov_soil_Train,
                  control = rpart.control(minsplit = 15, cp = 0.0001))
# display the results
printcp(tree_fit)
# visualize cross-validation results
plotcp(tree_fit) 

# detailed summary of splits
summary(tree_fit) 
# visualize the tree
rpart.plot(tree_fit) 

# apply the tree model on testing data
OC_tree_Pred <- predict(tree_fit, cov_soil_Test)
# check the plot actual and predicted OC values
plot(cov_soil_Test$OC, OC_tree_Pred, main="Tree model", 
     col="blue",xlab="Actual OC", ylab="Predicted OC", xlim=c(0,100),ylim=c(0,100))
abline(coef = c(0,1),  col="red" )

# calculate correlation
cor_tree <- cor(cov_soil_Test$OC, OC_tree_Pred)
cor_tree
## [1] 0.545078
# calculate RMSE
RMSE_tree <- sqrt(mean((cov_soil_Test$OC - OC_tree_Pred)^2))
RMSE_tree
## [1] 12.13934

Section 6: Random Forest

# fit models on training set: random forest model
rf_fit <- randomForest(OC ~ CNBL+DEM+MCA+Slope+TWI+blue+green+red+nir+ndvi, 
                       data=cov_soil_Train,ntree=1000, do.trace = 25)
# detailed summary of random forest
summary(rf_fit) 
# visualize the importance of random forest
varImpPlot(rf_fit) 

# apply the random forest model on testing data
OC_rf_Pred <- predict(rf_fit, cov_soil_Test)
# check the plot actual and predicted OC values
plot(cov_soil_Test$OC, OC_rf_Pred, main="Tree model", 
     col="blue",xlab="Actual OC", ylab="Predicted OC", xlim=c(0,100),ylim=c(0,100))
abline(coef = c(0,1),  col="red" )

# calculate correlation
cor_rf <- cor(cov_soil_Test$OC, OC_rf_Pred)
cor_rf
## [1] 0.674113
# calculate RMSE
RMSE_rf <- sqrt(mean((cov_soil_Test$OC - OC_rf_Pred)^2))
RMSE_rf
## [1] 10.51041

Section 7: Comparing the results

# check the accuracy of three models
RMSE_models <- c(Linear=RMSE_linear,Tree=RMSE_tree,RF=RMSE_rf)
cor_models <- c(Linear=cor_linear,Tree=cor_tree,RF=cor_rf)
# plot the final results
par(mfrow=c(1,2))
barplot(RMSE_models, main="RMSE",col=c("red","blue","green"))
barplot(cor_models, main="Correlation",col=c("red","blue","green"))

Section 8: Preparing soil map

# apply the best model on the stack layer
map_rf <- raster::predict( covariates_stack,rf_fit)
# plot the final maps
spplot(map_rf, main="SOC map based on RF model")

# plot an interactive map
mapview::mapview(map_rf)