In this week's practical we will look at options for bridging to other open-source GIS applications, specifically QGIS. The advantage of doing this is that, although it is useful and satisfying to write all your analysis in R, there will often be an existing implementation of a spatial operation in another application. A good example is the range of hydrology and terrain algorithms accessible through QGIS and SAGA. The analysis that we will run through today uses some of those tools to assist with a floodplain delineation relative to the estimated distribution of beaver dams in the Lausanne region of Switzerland.

First let's load the packages needed for today. We need to install RQGIS3 from the GitHub source as it is not available through CRAN at the time of writing. After loading the "devtools" library, use the function install_github() and provide the repository of the source package (hosted by Jannes Muenchow).

install.packages(c("sf","raster","RSAGA","rgdal","ggplot2","gdistance","devtools"))
library(devtools)
install_github("jannes-m/RQGIS3", force = TRUE)
library(sf) #for working with vector data
## Linking to GEOS 3.6.1, GDAL 2.2.3, PROJ 4.9.3
library(raster)#for working with raster data
## Loading required package: sp
library(RSAGA)#for hydrology algorithms
## Loading required package: gstat
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## Loading required package: shapefiles
## Loading required package: foreign
## 
## Attaching package: 'shapefiles'
## The following objects are masked from 'package:foreign':
## 
##     read.dbf, write.dbf
## Loading required package: plyr
library(rgdal)#for spatial transformations/projections
## rgdal: version: 1.4-8, (SVN revision 845)
##  Geospatial Data Abstraction Library extensions to R successfully loaded
##  Loaded GDAL runtime: GDAL 2.2.3, released 2017/11/20
##  Path to GDAL shared files: C:/Users/mlibxmda/Documents/R/win-library/3.6/rgdal/gdal
##  GDAL binary built with GEOS: TRUE 
##  Loaded PROJ.4 runtime: Rel. 4.9.3, 15 August 2016, [PJ_VERSION: 493]
##  Path to PROJ.4 shared files: C:/Users/mlibxmda/Documents/R/win-library/3.6/rgdal/proj
##  Linking to sp version: 1.4-1
library(ggplot2)#for visualizing xy plots
library(gdistance)#for calculating distance
## Loading required package: igraph
## 
## Attaching package: 'igraph'
## The following object is masked from 'package:raster':
## 
##     union
## The following objects are masked from 'package:stats':
## 
##     decompose, spectrum
## The following object is masked from 'package:base':
## 
##     union
## Loading required package: Matrix
## 
## Attaching package: 'gdistance'
## The following object is masked from 'package:igraph':
## 
##     normalize
library(RQGIS3)#bridging to QGIS
## Loading required package: reticulate
###Tell RQGIS the location of the QGIS application on your machine (where you downloaded and installed the app)

set_env("C:/Program Files/QGIS 3.6")
## $root
## [1] "C:/Program Files/QGIS 3.6"
## 
## $qgis_prefix_path
## [1] "C:/Program Files/QGIS 3.6/apps/qgis"
## 
## $python_plugins
## [1] "C:/Program Files/QGIS 3.6/apps/qgis/python/plugins"
## 
## $platform
## [1] "Windows"
open_app()  # you will get a warning if QGIS is already open on your computer
setwd("C:/Castor") #set your directory (in the "Castor" folder of my directory - you can choose your own) and copy the files from this week from the course dropbox


#Load DEM needed for the analysis

CastorDem <- raster("CastorDEM_Small.tif")

#plot it
plot(CastorDem)

Now we can start using some of the options in RQGIS. To find the algorithms we want to use, the most straightforward way is to search for a keyword using the find_algorithms() function. The first thing we need to do is fill any depressions (sinks) in our DEM, so we will search for the word "sink" in the list of algorithms.

find_algorithms("sink", name_only = TRUE)
## Warning in check_for_server(): Hey there! According to our internal checks, you are trying to run RQGIS3 on a Windows server.
## Please note that this is only possible if you imitate a x-display.
## QGIS needs this in the background to be able to execute its processing modules.
## Note that you need to start the x-display with admin rights
## [1] "saga:fillsinks"                  "saga:fillsinksqmofesp"          
## [3] "saga:fillsinkswangliu"           "saga:fillsinksxxlwangliu"       
## [5] "saga:sinkdrainageroutedetection" "saga:sinkremoval"
#create an object from the saga:fillsinks option
sinkAlg<-"saga:fillsinks"

The get_usage() function can be used to list the parameters, input and outputs required by the tool.

get_usage(sinkAlg)
## Fill sinks (saga:fillsinks)
## 
## 
## ----------------
## Input parameters
## ----------------
## 
## DEM: DEM
## 
##  Parameter type: QgsProcessingParameterRasterLayer
## 
##  Accepted data types:
##      - str: layer ID
##      - str: layer name
##      - str: layer source
##      - QgsProperty
##      - QgsRasterLayer
## 
## MINSLOPE: Minimum Slope [Degree]
## 
##  Parameter type: QgsProcessingParameterNumber
## 
##  Accepted data types:
##      - int
##      - float
##      - QgsProperty
## 
## RESULT: Filled DEM
## 
##  Parameter type: QgsProcessingParameterRasterDestination
## 
##  Accepted data types:
##      - str
##      - QgsProperty
##      - QgsProcessingOutputLayerDefinition
## 
## ----------------
## Outputs
## ----------------
## 
## RESULT:  <QgsProcessingOutputRasterLayer>
##  Filled DEM
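#Aside: RQGIS also provides get_args_man(), which returns an algorithm's parameter list pre-filled with default values - a handy template to edit rather than typing every argument by hand (a sketch; I am assuming this carries over to RQGIS3):
#params <- get_args_man(alg = sinkAlg)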
#The tool requires a DEM as the input, a value for minimum slope (0.01 is usually the default, so we'll stick with that) and an output. SAGA algorithms save outputs as .sdat files, so we specify that here and ask R to load the resulting file as an object.

DEM_filled = run_qgis(sinkAlg,  
                       DEM=CastorDem,
                       MINSLOPE=0.01,
                       RESULT= "FillDEM.sdat",
                       load_output = TRUE)
## $RESULT
## [1] "C:/Castor/FillDEM.sdat"
#plot the results
plot(DEM_filled)

Next let's read in the polyline file representing the locations of our beaver dam predictions (streams estimated to contain dams). Here we use the st_read() function from the "sf" package. sf stands for Simple Features and is another way of working with vector data.

plot(DEM_filled)

Prob<-st_read("CastorPred_Small.shp")
## Reading layer `CastorPred_Small' from data source `C:\Castor\CastorPred_Small.shp' using driver `ESRI Shapefile'
## Simple feature collection with 265 features and 4 fields
## geometry type:  LINESTRING
## dimension:      XY
## bbox:           xmin: 4095174 ymin: 2647388 xmax: 4125174 ymax: 2676888
## proj4string:    +proj=laea +lat_0=52 +lon_0=10 +x_0=4321000 +y_0=3210000 +ellps=GRS80 +units=m +no_defs
plot(Prob,add=TRUE)

The key measurement that we need to calculate for the floodplain delineation is the difference in height between each cell in the DEM and the nearest stream. This value will eventually be used to determine whether each cell sits above or below the threshold for inclusion in the floodplain.

For each cell in our DEM we will use Pythagorean (trigonometric) calculations to get the coordinates of the nearest stream cell. So our next job is to calculate, for each cell, the distance to the nearest stream. You may remember from previous weeks that the distance() function calculates, for every NA cell, the distance to the nearest cell that is not NA.

That means we should create a raster from our Prob object to provide the non-NA cells; the distance() function will then calculate, for every other cell in the spatial extent of Prob, the distance to the nearest stream cell.
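If the behaviour of distance() seems abstract, here is a tiny self-contained sketch (the raster and its values are invented purely for illustration):

toy <- raster(nrows = 3, ncols = 3, xmn = 0, xmx = 3, ymn = 0, ymx = 3, crs = "+proj=utm +zone=32 +datum=WGS84")#a small projected raster
values(toy) <- NA #all cells NA...
toy[5] <- 1       #...except one "stream" cell in the centre
toyDist <- distance(toy)#each NA cell now holds the distance to that centre cell
#inspect with toyDist[] or plot(toyDist)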

To create a raster from the Prob object we use the mask function to extract underlying cells from the DEM:

r<-mask(DEM_filled, Prob)#create a mask of the DEM for the streams layer

#Now use the distance() function to calculate, for every NA cell (i.e. the non-stream cells), the distance to the nearest stream cell.
dist<-distance(r)

plot(dist)

#For our calculations we also need the direction (in radians) so we can calculate x and y coordinates

direct<-direction(r,from=FALSE)#specify from=FALSE so that, for each NA cell, the direction TOWARDS the nearest non-NA (stream) cell is calculated


#Next create a binary raster with value 1 for NA cells in r and 0 for all other cells.

rna <- is.na(r) # 1 where r is NA, 0 where r has a value

#Create two rasters whose values are coordinates of x and y for rna

# store coordinates in new raster: https://stackoverflow.com/a/35592230/3752258 
na.x <- init(rna, 'x')
na.y <- init(rna, 'y')

# calculate coordinates of the nearest non-NA pixel: direction() gives the angle clockwise from north, so the x offset is dist*sin(angle) and the y offset is dist*cos(angle)
# assume that we have an orthogonal, projected CRS, so these planar (Pythagorean) calculations are valid
co.x <- na.x + dist * sin(direct)
co.y <- na.y + dist * cos(direct)

# create matrix of point coordinates of nearest non-NA pixel with which to extract values.
co <- cbind(co.x[], co.y[]) 

head(co)
##         [,1]    [,2]
## [1,] 4101063 2672563
## [2,] 4101063 2672563
## [3,] 4101188 2672738
## [4,] 4101188 2672738
## [5,] 4101188 2672738
## [6,] 4101188 2672738
summary(co)
##        V1                V2         
##  Min.   :4095162   Min.   :2647413  
##  1st Qu.:4102238   1st Qu.:2654913  
##  Median :4109662   Median :2662063  
##  Mean   :4110215   Mean   :2662070  
##  3rd Qu.:4117812   3rd Qu.:2669688  
##  Max.   :4125138   Max.   :2676888  
##  NA's   :13076     NA's   :13076
# extract values from original stream mask (remember the values are elevation) using the coordinates co
NAVals <- extract(r, co, method='simple') 
r.NAVals <- rna # initiate a new raster (use same dimensions as rna) 
r.NAVals[] <- NAVals # store the extracted stream elevation values in this new raster

plot(r.NAVals)

# use the cover() function to assign the NA cells in r the values just extracted. The cover() function takes the NA values of the first raster 'x' and replaces them with the corresponding cell values of the second raster 'y'.

r.filled <- cover(x=r, y= r.NAVals)#we now have a raster where the value of each cell is the elevation of the nearest stream.

#plot it
plot(r.filled)

#Now we just have to subtract the r.filled object from our original DEM to get a raster in which the value of each cell is the difference in elevation between that cell and the nearest stream!


elevDiff<-DEM_filled-r.filled

plot(elevDiff)# et voila!
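#A quick sanity check (a sketch): on the stream cells themselves the nearest stream is the cell itself, so elevDiff there should be at (or very near) zero
#cellStats(mask(elevDiff, r), 'max')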

Another parameter that we need to compute is slope. Again, we use the find_algorithms() function and from the list returned select the native QGIS ‘slope’ function (“qgis:slope”).

find_algorithms("slope", name_only = TRUE)
##  [1] "gdal:slope"                           
##  [2] "grass7:r.slope.aspect"                
##  [3] "qgis:slope"                           
##  [4] "saga:aspectslopegrid"                 
##  [5] "saga:diffusivehillslopeevolutionadi"  
##  [6] "saga:diffusivehillslopeevolutionftcs" 
##  [7] "saga:downslopedistancegradient"       
##  [8] "saga:dtmfilterslopebased"             
##  [9] "saga:relativeheightsandslopepositions"
## [10] "saga:slopeaspectcurvature"            
## [11] "saga:slopelength"                     
## [12] "saga:slopelimitedflowaccumulation"    
## [13] "saga:upslopeanddownslopecurvature"    
## [14] "saga:upslopearea"                     
## [15] "saga:vegetationindexslopebased"
algSlope<-"qgis:slope"

#check the usage parameters (need a DEM INPUT and an OUTPUT name)
get_usage(algSlope)
## Slope (qgis:slope)
## 
## 
## ----------------
## Input parameters
## ----------------
## 
## INPUT: Elevation layer
## 
##  Parameter type: QgsProcessingParameterRasterLayer
## 
##  Accepted data types:
##      - str: layer ID
##      - str: layer name
##      - str: layer source
##      - QgsProperty
##      - QgsRasterLayer
## 
## Z_FACTOR: Z factor
## 
##  Parameter type: QgsProcessingParameterNumber
## 
##  Accepted data types:
##      - int
##      - float
##      - QgsProperty
## 
## OUTPUT: Slope
## 
##  Parameter type: QgsProcessingParameterRasterDestination
## 
##  Accepted data types:
##      - str
##      - QgsProperty
##      - QgsProcessingOutputLayerDefinition
## 
## ----------------
## Outputs
## ----------------
## 
## OUTPUT:  <QgsProcessingOutputRasterLayer>
##  Slope
slope_run = run_qgis(algSlope,  
                 INPUT = DEM_filled,
                 OUTPUT = file.path("SlopeFilled.tif"),
                 load_output = TRUE)
## $OUTPUT
## [1] "C:/Castor/SlopeFilled.tif"
class(slope_run)
## [1] "RasterLayer"
## attr(,"package")
## [1] "raster"
plot(slope_run)
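#Aside: the raster package can also compute slope natively with terrain() - a sketch; worth checking that the units match the QGIS output (degrees here):
#slope_r <- terrain(DEM_filled, opt = 'slope', unit = 'degrees')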

# We also need a cumulative cost layer, built here from the slope layer we just computed, to help us stop unconnected cells that fall under the floodplain threshold values from sneaking into our results. We will do this in the same way as for the connectivity work in Week 8 - using the transition() function.




#Create the transition layer - remember this calculates conductance or ease of movement between cells
Tr<-transition(slope_run, function(x) 1/mean(x), 8)


#create a matrix of coordinates using st_coordinates (for sf objects) as the starting points of the cost layer
XY<-st_coordinates(Prob) 

XY<-XY[,1:2]

#inspect
head(XY)
##            X       Y
## [1,] 4115199 2675638
## [2,] 4115674 2675613
## [3,] 4115999 2676063
## [4,] 4116649 2676038
## [5,] 4116974 2676513
## [6,] 4118249 2676463
#create a cumulative cost layer using the accCost() function
Cost<-accCost(Tr,XY)

#inspect
plot(Cost)
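#Aside (a sketch): for cost distances that correct for cell size and diagonal moves, gdistance workflows often apply geoCorrection() to the transition layer before accCost(), e.g.:
#TrC <- geoCorrection(Tr, type = "c")
#Cost <- accCost(TrC, XY)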

# Now that we have the layers required for the rest of our analysis, we turn to the statistical part of the delineation. Recall from the lecture that we need to ascertain the lowest value in each raster (elevDiff and Cost) at which the distribution differs from a normal distribution by less than 1%. The way to do this is to plot quantile-quantile (Q-Q) plots of the data against those of a normally distributed sample. We can produce the Q-Q plots by applying the qqnorm() function to the values of our raster layers and then plotting these against a hypothetical distribution of the same data generated with the rnorm() function.
# 
# First we get the Q-Q plots for our elevation and cost raster layer values



Elevqq<-qqnorm(values(elevDiff))

Costqq<-qqnorm(values(Cost))

# Next we simulate the same data range as a normal distribution using the rnorm() function, based on the number of cells, mean and standard deviation of the original.



###Simulate normal distribution for elevation data

#get cell number
En<-ncell(elevDiff)


#get data mean
Em<-cellStats(elevDiff, stat='mean', na.rm=TRUE, asSample=FALSE)

Em
## [1] 49.94172
#get SD
ESD<- cellStats(elevDiff, stat='sd', na.rm=TRUE, asSample=TRUE)

ESD
## [1] 108.0439
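#Note: rnorm() draws random numbers, so the thresholds derived below will vary slightly between runs. For reproducible results set a seed first (the value 42 is arbitrary):
#set.seed(42)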
#rnorm: the normal distribution

#create distribution with rnorm()
elevDiffNorm <- rnorm(En, mean = Em, sd = ESD)

#inspect with histogram
hist(elevDiffNorm, breaks = 51, col = "orange", main = "rnorm")

#now create a Q-Q plot of the normally distributed version of the data.

elevDiffNormQQ<- qqnorm(elevDiffNorm)

# In order to calculate our threshold (beyond just reading the values from our plots - I tried, it's impossible!) we'll need to create a data frame of all the values we just computed. The results of the qqnorm() function are returned as a list. To get these values into a data frame we first need to convert them to a vector using the unlist() function.
# 
# Note that the qqnorm() function returns values as 'x' and 'y', where the x values are quantiles calculated from a theoretical (normal) distribution and the y values are the actual data values. For our simulated values the (near) perfect correlation (a straight line) therefore denotes a normal distribution.




elevDiffnorm_x<-as.vector(unlist(elevDiffNormQQ$x))
elevDiffnorm_y<-as.vector(unlist(elevDiffNormQQ$y))

#use the sort function to make sure the values are returned in ascending order (we'll do the same with the original raster values below so that they match)

elevDiffnorm_x<-sort(elevDiffnorm_x,decreasing = FALSE)
elevDiffnorm_y<-sort(elevDiffnorm_y,decreasing = FALSE)

#Repeat for the original raster values
elevDiff_x<-as.vector(unlist(Elevqq$x)) 
 
elevDiff_y<-as.vector(unlist(Elevqq$y))
 
elevDiff_x <-sort(elevDiff_x, decreasing = FALSE)
 
elevDiff_y<-sort(elevDiff_y, decreasing=FALSE)
# 
# 
#Combine the original values into a single data frame
elevDiffActual<- data.frame(elevDiff_x,elevDiff_y)


#do the same for the "simulated" normal distribution

elevDiffNorm<-data.frame(elevDiffnorm_x,elevDiffnorm_y)

#combine all data together
elevDataFin<-data.frame(elevDiffNorm,elevDiffActual)

head(elevDataFin)
##   elevDiffnorm_x elevDiffnorm_y elevDiff_x elevDiff_y
## 1      -4.959647      -475.5747  -4.959647  -240.4039
## 2      -4.741765      -456.0369  -4.741765  -240.3724
## 3      -4.637231      -433.5118  -4.637231  -240.2743
## 4      -4.567171      -426.3313  -4.567171  -240.1640
## 5      -4.514190      -425.1957  -4.514190  -240.0834
## 6      -4.471470      -422.7780  -4.471470  -239.8625
#Now we calculate, for each value in the original data, the difference from the simulated normal distribution. This is the symmetric percent difference, abs(a - b)/((a + b)/2)*100, so the difference is expressed relative to the magnitude of the values being compared.

elevDataFin$elevY_diff=abs((elevDataFin$elevDiffnorm_y-elevDataFin$elevDiff_y)/((elevDataFin$elevDiffnorm_y+elevDataFin$elevDiff_y)/2)*100)   


#check the final dataframe
head(elevDataFin)
##   elevDiffnorm_x elevDiffnorm_y elevDiff_x elevDiff_y elevY_diff
## 1      -4.959647      -475.5747  -4.959647  -240.4039   65.69214
## 2      -4.741765      -456.0369  -4.741765  -240.3724   61.93612
## 3      -4.637231      -433.5118  -4.637231  -240.2743   57.35871
## 4      -4.567171      -426.3313  -4.567171  -240.1640   55.86456
## 5      -4.514190      -425.1957  -4.514190  -240.0834   55.64952
## 6      -4.471470      -422.7780  -4.471470  -239.8625   55.20804
#We can plot the Q-Q plots of the actual and normal distributions using ggplot, simply by calling ggplot() and adding two lines (geom_line) to the plot. There are many values to plot, so this may take some time to render in the plot panel.



 Diff = ggplot() + 
   geom_line(data = elevDataFin, aes(x = elevDiffnorm_x, y = elevDiffnorm_y), color = "blue") +
   geom_line(data = elevDataFin, aes(x = elevDiff_x, y = elevDiff_y), color = "red") +
   xlab('Quantile') +
   ylab('elevDiff')
 
 print(Diff)

#Now we turn to the selection of the threshold for 'elevDiff'. We select all values from the original data for which there is less than 1% difference from the simulated normal data.


elevTselect<-elevDataFin[which(elevDataFin$elevY_diff <1),]

 
head(elevTselect)
##        elevDiffnorm_x elevDiffnorm_y elevDiff_x elevDiff_y elevY_diff
## 457867     -0.4583457      0.2315283 -0.4583457  0.2333679  0.7913995
## 457868     -0.4583437      0.2319390 -0.4583437  0.2333679  0.6141851
## 457869     -0.4583418      0.2322377 -0.4583418  0.2333679  0.4855031
## 457870     -0.4583398      0.2324416 -0.4583398  0.2333984  0.4107979
## 457871     -0.4583378      0.2331029 -0.4583378  0.2333984  0.1267144
## 457872     -0.4583359      0.2337212 -0.4583359  0.2333984  0.1381833
#select minimum value as threshold
eT<-min(elevTselect$elevDiff_y)
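#Since we are about to repeat exactly these steps for the cost layer, you could wrap the threshold computation in a small helper function. A sketch (qq_threshold is a made-up name; it works on the sorted non-NA cell values directly, so results may differ slightly from the step-by-step version above because of the random draw):

qq_threshold <- function(x, cutoff = 1) {
  v <- sort(na.omit(values(x)))#actual raster values, ascending
  simNorm <- sort(rnorm(length(v), mean = mean(v), sd = sd(v)))#matched normal sample
  pdiff <- abs((simNorm - v) / ((simNorm + v) / 2) * 100)#symmetric percent difference
  min(v[pdiff < cutoff], na.rm = TRUE)#lowest actual value within the tolerance
}

#e.g. eT <- qq_threshold(elevDiff); cT <- qq_threshold(Cost)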

#######################################



#Next we repeat the same process for our cost layer.





################################################
#finally for cost layer - repeat################
################################################

#get number of cells in the cost layer
Cn<-ncell(Cost)

Cn
## [1] 1416000
#get the mean of the cost layer
Cm<-cellStats(Cost, stat='mean', na.rm=TRUE, asSample=FALSE)

Cm
## [1] 139.3894
#Standard Deviation
CSD<- cellStats(Cost, stat='sd', na.rm=TRUE, asSample=TRUE)

CSD
## [1] 278.8988
#rnorm: the normal distribution



CostNorm <- rnorm(Cn, mean = Cm, sd = CSD)


hist(CostNorm, breaks = 51, col = "orange", main = "Costnorm")

CostNormQQ<- qqnorm(CostNorm)

#arrange the real and normal data in ascending order so we can combine them

Costnorm_x<-as.vector(unlist(CostNormQQ$x))
Costnorm_y<-as.vector(unlist(CostNormQQ$y))

Costnorm_x<-sort(Costnorm_x,decreasing = FALSE)
Costnorm_y<-sort(Costnorm_y,decreasing = FALSE)


Cost_x<-as.vector(unlist(Costqq$x)) 

Cost_y<-as.vector(unlist(Costqq$y))

Cost_x <-sort(Cost_x, decreasing = FALSE)

Cost_y<-sort(Cost_y, decreasing=FALSE)
 

CostActual<- data.frame(Cost_x,Cost_y)



Costnorm<-data.frame(Costnorm_x,Costnorm_y)

#combine the real and normal data together 

CostDataFin<-data.frame(Costnorm,CostActual)


head(CostDataFin)
##   Costnorm_x Costnorm_y    Cost_x Cost_y
## 1  -4.959647  -1230.121 -4.959647      0
## 2  -4.741765  -1204.602 -4.741765      0
## 3  -4.637231  -1176.955 -4.637231      0
## 4  -4.567171  -1161.047 -4.567171      0
## 5  -4.514190  -1129.802 -4.514190      0
## 6  -4.471470  -1128.951 -4.471470      0
#calculate the difference between each data point of the real data and the normal distribution

CostDataFin$cost_diff=abs((CostDataFin$Costnorm_y-CostDataFin$Cost_y)/((CostDataFin$Costnorm_y+CostDataFin$Cost_y)/2)*100)   


head(CostDataFin)
##   Costnorm_x Costnorm_y    Cost_x Cost_y cost_diff
## 1  -4.959647  -1230.121 -4.959647      0       200
## 2  -4.741765  -1204.602 -4.741765      0       200
## 3  -4.637231  -1176.955 -4.637231      0       200
## 4  -4.567171  -1161.047 -4.567171      0       200
## 5  -4.514190  -1129.802 -4.514190      0       200
## 6  -4.471470  -1128.951 -4.471470      0       200
#plot the Q-Q plot for the data against a normal distribution

 diffCost = ggplot() + 
   geom_line(data = CostDataFin, aes(x = Costnorm_x, y = Costnorm_y), color = "blue") +
   geom_line(data = CostDataFin, aes(x = Cost_x, y = Cost_y), color = "red") +
   xlab('NV') +
   ylab('CostDiff')+
   ylim(0,500)
 
print(diffCost)

#select all data points for which there is less than one percent difference between the real and normal distributions
selectTCost<-CostDataFin[which(CostDataFin$cost_diff <1 ),]

head(selectTCost)
##        Costnorm_x Costnorm_y     Cost_x   Cost_y cost_diff
## 463532 -0.4472348   14.52758 -0.4472348 14.67338 0.9986493
## 463533 -0.4472329   14.52930 -0.4472329 14.67340 0.9868844
## 463534 -0.4472309   14.52985 -0.4472309 14.67342 0.9832369
## 463535 -0.4472290   14.53042 -0.4472290 14.67342 0.9793125
## 463536 -0.4472270   14.53071 -0.4472270 14.67342 0.9773129
## 463537 -0.4472251   14.53089 -0.4472251 14.67342 0.9760544
cT<-min(selectTCost$Cost_y)
print(cT)
## [1] 14.67338
#get the square root (a mathematical convenience - this just works better!)

ct<-sqrt(cT)

ct
## [1] 3.830585
#Next create a layer from each raster that assigns 1 to areas below the threshold and 0 to those above it. reclassify() takes its rules as (from, to, becomes) triplets.

#elevation data
rcElev <- reclassify(elevDiff, c(-Inf,eT,1, eT,Inf,0))

plot(rcElev)
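#The reclassification rules can equally be supplied as a three-column (from, to, becomes) matrix, which some find easier to read (a sketch):
#rclmat <- matrix(c(-Inf, eT, 1, eT, Inf, 0), ncol = 3, byrow = TRUE)
#rcElev <- reclassify(elevDiff, rclmat)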

#cost layer
rcCost <- reclassify(Cost, c(-Inf,ct,1, ct,Inf,0))



plot(rcCost)

###Now combine scores from both our rasters  
##
elevCost<-rcElev+rcCost


plot(elevCost)

#The final floodplain model is those areas where the combined score is 2 (i.e. within both thresholds tested).
#We create a binary raster, setting all values in the floodplain to 1 and all other values to 0, with the reclassify() function.

flood<-reclassify(elevCost, c(-Inf,1,0,1,Inf,1))
#writeRaster(flood,'C:/Castor/FloodP.tif',datatype='INT2s',overwrite=TRUE)
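#An equivalent one-liner (a sketch): comparison operators on rasters return a 1/0 layer, so the reclassify() call above could also be written as flood <- (elevCost == 2)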

summary(flood)
##         layer
## Min.        0
## 1st Qu.     0
## Median      0
## 3rd Qu.     0
## Max.        1
## NA's        0
plot(flood)
# values(flood)
plot(Prob,col='black', add=TRUE)