Section 1: Importing Covariates
# empty memory and workspace
gc()
rm(list=ls())
# check directory
getwd()
### to install all required R packages
ls <- c("sp", "raster", "rgdal", "mapview", "corrplot",
"caret", "rpart", "rpart.plot", "randomForest")
new.packages <- ls[!(ls %in% installed.packages()[,"Package"])]
if(length(new.packages)) install.packages(new.packages)
# load library
library(sp)
library(raster)
library(rgdal)
library(mapview)
library(corrplot)
library(caret)
library(rpart)
library(rpart.plot)
library(randomForest)
# import raster layers of covariates (Remote Sensing data)
blue = raster("./RS/Blue.tif")
green = raster("./RS/Green.tif")
red = raster("./RS/Red.tif")
nir = raster("./RS/Nir.tif")
# calculate some Remote Sensing indices such as NDVI
ndvi = (nir - red) / (nir + red)
# plot raster layers of covariates (Remote Sensing data)
plot(blue, main = "blue band of landsat", xlab = "Easting (m)", ylab = "Northing (m)")
plot(red, main = "red band of landsat", xlab = "Easting (m)", ylab = "Northing (m)")
plot(ndvi, main = "NDVI of landsat", xlab = "Easting (m)", ylab = "Northing (m)")
# make a stack layer from Remote Sensing data
landsat = stack(blue,green,red,nir,ndvi)
# inspect the structure of stack layer
landsat
## class : RasterStack
## dimensions : 733, 716, 524828, 5 (nrow, ncol, ncell, nlayers)
## resolution : 500, 500 (x, y)
## extent : 498000, 856000, 5235500, 5602000 (xmin, xmax, ymin, ymax)
## crs : +proj=utm +zone=32 +datum=WGS84 +units=m +no_defs
## names : layer.1, layer.2, layer.3, layer.4, layer.5
## min values : 47.5000000, 92.0000000, 19.5000000, 6.0000000, -0.6666667
## max values : 9.383500e+03, 1.003950e+04, 1.027300e+04, 8.635500e+03, 8.870056e-01
# set the names to the rasters in the stack file
names(landsat) <- c("blue","green","red","nir","ndvi")
# plot the stack layers of Remote Sensing data
plot(landsat)

# import the terrain data using list.files command
dem_lst <- list.files("./Terrain/", pattern="\\.sdat$", full.names = TRUE)
# inspect the structure of list
dem_lst
## [1] "./Terrain/CNBL.sdat" "./Terrain/DEM.sdat" "./Terrain/MCA.sdat"
## [4] "./Terrain/Slope.sdat" "./Terrain/TWI.sdat"
# make a stack layer from terrain data
terrain = stack(dem_lst)
# plot the stack layers of terrain data
plot(terrain)

# Re-sampling Remote Sensing data based on terrain data
RS_terrain = resample(landsat,terrain, method="bilinear")
# make a stack file from re-sampled remote sensing data and terrain data
covariates_stack = stack(terrain, RS_terrain)
# inspect the structure of stack layer
names(covariates_stack)
## [1] "CNBL" "DEM" "MCA" "Slope" "TWI" "blue" "green" "red" "nir"
## [10] "ndvi"
Section 2: Importing Soil Data
# import shape file of point data
point = readOGR("./Shapefile","points")
# import shape file of Bavaria boundary
Bavaria = readOGR("./Shapefile","Bavaria")
# plots of the shape files
plot(Bavaria)
plot(point, add=T)
# plot an interactive map
mapview::mapview(point, zcol = "OC", at = c(0,5,10,15,20,25,30,35,200), legend = TRUE)
# plot the point on the raster
plot(covariates_stack$DEM, main = "DEM", xlab = "Easting (m)", ylab = "Northing (m)")
plot(point, add =T,pch = 19)
plot(Bavaria, add =T)

# inspect the point shape files
# types of the spatial data
class(point)
## [1] "SpatialPointsDataFrame"
## attr(,"package")
## [1] "sp"
# summary statistics of data
summary(point)
## Object of class SpatialPointsDataFrame
## Coordinates:
## min max
## coords.x1 508615.3 848381.1
## coords.x2 5271503.2 5591741.8
## Is projected: TRUE
## proj4string :
## [+proj=utm +zone=32 +datum=WGS84 +units=m +no_defs]
## Number of points: 346
## Data attributes:
## x y pH OC
## Min. :508615 Min. :5271503 Min. :3.810 Min. : 6.00
## 1st Qu.:617164 1st Qu.:5352240 1st Qu.:6.162 1st Qu.: 14.50
## Median :672285 Median :5406629 Median :6.850 Median : 21.00
## Mean :676277 Mean :5416121 Mean :6.738 Mean : 28.35
## 3rd Qu.:733674 3rd Qu.:5485267 3rd Qu.:7.380 3rd Qu.: 33.05
## Max. :848381 Max. :5591742 Max. :8.140 Max. :251.30
# histogram of data
hist(point$OC,col ="blue", xlab= "SOC (g/kg)", main="Histogram")

Section 3: Making Regression Matrix
# extract covariate values at each point of observation
cov = extract(covariates_stack,point,method='bilinear', df=TRUE)
# inspect the data.frame of cov
str(cov)
## 'data.frame': 346 obs. of 11 variables:
## $ ID : num 1 2 3 4 5 6 7 8 9 10 ...
## $ CNBL : num 398 431 587 408 535 ...
## $ DEM : num 426 493 600 474 544 ...
## $ MCA : num 6744760 14292214 4667933 5777121 14213043 ...
## $ Slope: num 0.0465 0.0149 0.026 0.0228 0.016 ...
## $ TWI : num 10.5 11.1 13.2 11.3 12 ...
## $ blue : num 236 401 374 375 411 ...
## $ green: num 403 698 599 670 680 ...
## $ red : num 273 680 537 586 544 ...
## $ nir : num 2561 3008 2808 3300 3390 ...
## $ ndvi : num 0.811 0.632 0.663 0.699 0.725 ...
# combining covariates and soil properties
cov_soil = cbind(cov[,-1], OC=point$OC)
# inspect the data.frame of cov_soil
str(cov_soil)
## 'data.frame': 346 obs. of 11 variables:
## $ CNBL : num 398 431 587 408 535 ...
## $ DEM : num 426 493 600 474 544 ...
## $ MCA : num 6744760 14292214 4667933 5777121 14213043 ...
## $ Slope: num 0.0465 0.0149 0.026 0.0228 0.016 ...
## $ TWI : num 10.5 11.1 13.2 11.3 12 ...
## $ blue : num 236 401 374 375 411 ...
## $ green: num 403 698 599 670 680 ...
## $ red : num 273 680 537 586 544 ...
## $ nir : num 2561 3008 2808 3300 3390 ...
## $ ndvi : num 0.811 0.632 0.663 0.699 0.725 ...
## $ OC : num 17.8 18 27.7 12.5 31.5 ...
# check the correlation covariates and OC
corrplot.mixed(cor(cov_soil), lower.col = "black", number.cex = .7)

Section 4: Regression Model
# remove na values
cov_soil <- cov_soil[complete.cases(cov_soil),]
# remove high values of oc
cov_soil <- cov_soil[cov_soil$OC<78,]
# check number of rows
nrow(cov_soil)
# check number of column
ncol(cov_soil)
# split the data to training (70%) and testing (30%) sets
trainIndex <- createDataPartition(cov_soil$OC,
p = 0.7, list = FALSE, times = 1)
# subset the datasets
cov_soil_Train <- cov_soil[ trainIndex,]
cov_soil_Test <- cov_soil[-trainIndex,]
# inspect the two datasets
str(cov_soil_Train)
## 'data.frame': 237 obs. of 11 variables:
## $ CNBL : num 398 431 587 408 414 ...
## $ DEM : num 426 493 600 474 466 ...
## $ MCA : num 6744760 14292214 4667933 5777121 18620790 ...
## $ Slope: num 0.04652 0.01492 0.02598 0.02276 0.00466 ...
## $ TWI : num 10.5 11.1 13.2 11.3 13 ...
## $ blue : num 236 401 374 375 472 ...
## $ green: num 403 698 599 670 810 ...
## $ red : num 273 680 537 586 783 ...
## $ nir : num 2561 3008 2808 3300 3573 ...
## $ ndvi : num 0.811 0.632 0.663 0.699 0.64 ...
## $ OC : num 17.8 18 27.7 12.5 12.4 36.1 19.1 15.2 17.4 20.2 ...
## 'data.frame': 99 obs. of 11 variables:
## $ CNBL : num 535 403 524 522 467 ...
## $ DEM : num 544 408 520 536 463 ...
## $ MCA : num 14213043 22964809 14094020 9650510 16280441 ...
## $ Slope: num 0.01596 0.00203 0.00305 0.01002 0.00766 ...
## $ TWI : num 12 14.1 17.6 11.8 12.4 ...
## $ blue : num 411 489 366 387 423 ...
## $ green: num 680 806 643 650 671 ...
## $ red : num 544 797 476 523 560 ...
## $ nir : num 3390 3300 3717 3716 3284 ...
## $ ndvi : num 0.725 0.611 0.773 0.752 0.706 ...
## $ OC : num 31.5 17 72.8 30.7 38 40.4 15.8 41.1 11.2 28.3 ...
# fit models on training set: regression model
linear_fit <- lm(OC ~ CNBL+DEM+MCA+Slope+TWI+blue+green+red+nir+ndvi,
data=cov_soil_Train)
# look at the summary of linear model
summary(linear_fit)
##
## Call:
## lm(formula = OC ~ CNBL + DEM + MCA + Slope + TWI + blue + green +
## red + nir + ndvi, data = cov_soil_Train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21.342 -7.693 -2.049 5.490 49.928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.711e+02 5.801e+01 2.949 0.00352 **
## CNBL -1.842e-02 2.057e-02 -0.896 0.37141
## DEM 4.068e-02 1.915e-02 2.124 0.03472 *
## MCA 2.479e-07 8.585e-08 2.888 0.00425 **
## Slope -4.913e+01 4.789e+01 -1.026 0.30605
## TWI 9.887e-01 4.612e-01 2.144 0.03311 *
## blue 9.189e-02 4.795e-02 1.916 0.05659 .
## green -6.998e-02 6.055e-02 -1.156 0.24907
## red -1.285e-01 3.969e-02 -3.236 0.00139 **
## nir 2.624e-02 8.448e-03 3.106 0.00214 **
## ndvi -2.396e+02 8.114e+01 -2.953 0.00348 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 11.69 on 226 degrees of freedom
## Multiple R-squared: 0.3947, Adjusted R-squared: 0.368
## F-statistic: 14.74 on 10 and 226 DF, p-value: < 2.2e-16
# apply the linear model on testing data
OC_linear_Pred <- predict(linear_fit, cov_soil_Test)
# check the plot actual and predicted OC values
plot(cov_soil_Test$OC, OC_linear_Pred, main="Linear model",
col="blue",xlab="Actual OC", ylab="Predicted OC",
xlim=c(0,100),ylim=c(0,100))
abline(coef = c(0,1), col="red" )

# calculate correlation
cor_linear <- cor(cov_soil_Test$OC, OC_linear_Pred)
cor_linear
## [1] 0.6386474
# calculate RMSE
RMSE_linear <- sqrt(mean((cov_soil_Test$OC - OC_linear_Pred)^2))
RMSE_linear
## [1] 10.88015
Section 5: Decision Tree Model
# fit models on training set: decision tree model
tree_fit <- rpart(OC ~ CNBL+DEM+MCA+Slope+TWI+blue+green+red+nir+ndvi,
method="anova", data=cov_soil_Train,
control = rpart.control(minsplit = 15, cp = 0.0001))
# display the results
printcp(tree_fit)
# visualize cross-validation results
plotcp(tree_fit)

# detailed summary of splits
summary(tree_fit)
# visualize the tree
rpart.plot(tree_fit)

# apply the tree model on testing data
OC_tree_Pred <- predict(tree_fit, cov_soil_Test)
# check the plot actual and predicted OC values
plot(cov_soil_Test$OC, OC_tree_Pred, main="Tree model",
col="blue",xlab="Actual OC", ylab="Predicted OC", xlim=c(0,100),ylim=c(0,100))
abline(coef = c(0,1), col="red" )

# calculate correlation
cor_tree <- cor(cov_soil_Test$OC, OC_tree_Pred)
cor_tree
## [1] 0.545078
# calculate RMSE
RMSE_tree <- sqrt(mean((cov_soil_Test$OC - OC_tree_Pred)^2))
RMSE_tree
## [1] 12.13934
Section 6: Random Forest
# fit models on training set: random forest model
rf_fit <- randomForest(OC ~ CNBL+DEM+MCA+Slope+TWI+blue+green+red+nir+ndvi,
data=cov_soil_Train,ntree=1000, do.trace = 25)
# detailed summary of random forest
summary(rf_fit)
# visualize the importance of random forest
varImpPlot(rf_fit)

# apply the random forest model on testing data
OC_rf_Pred <- predict(rf_fit, cov_soil_Test)
# check the plot actual and predicted OC values
plot(cov_soil_Test$OC, OC_rf_Pred, main="Tree model",
col="blue",xlab="Actual OC", ylab="Predicted OC", xlim=c(0,100),ylim=c(0,100))
abline(coef = c(0,1), col="red" )

# calculate correlation
cor_rf <- cor(cov_soil_Test$OC, OC_rf_Pred)
cor_rf
## [1] 0.674113
# calculate RMSE
RMSE_rf <- sqrt(mean((cov_soil_Test$OC - OC_rf_Pred)^2))
RMSE_rf
## [1] 10.51041